Skip to content

Commit b53eef1

Browse files
authored
Merge pull request #78 from VariantEffect/mavedb-dev
Release 2026.1.1
2 parents db9a54c + 60d2a1b commit b53eef1

12 files changed

Lines changed: 121 additions & 71 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,14 @@ requires-python = ">=3.11"
3333

3434
dependencies = [
3535
"agct~=0.1.0",
36-
"requests",
3736
"biopython",
3837
"tqdm",
3938
"cdot",
4039
"click",
4140
"cool-seq-tool==0.4.0.dev3",
4241
"ga4gh.vrs==2.0.0-a6",
4342
"gene_normalizer[etl,pg]==0.3.0-dev2",
43+
"httpx~=0.28",
4444
"pydantic>=2",
4545
"python-dotenv",
4646
"setuptools>=68.0", # tmp -- ensure 3.12 compatibility
@@ -61,7 +61,7 @@ tests = [
6161
"pytest-mock",
6262
"pytest-cov",
6363
"pytest-asyncio",
64-
"requests-mock"
64+
"respx"
6565
]
6666
dev = [
6767
"ruff==0.2.0",

src/api/routers/map.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from cool_seq_tool.schemas import AnnotationLayer
66
from fastapi import APIRouter, HTTPException
77
from fastapi.responses import JSONResponse
8-
from requests import HTTPError
8+
from httpx import HTTPStatusError
99

1010
from dcd_mapping.align import build_alignment_result
1111
from dcd_mapping.annotate import (
@@ -64,6 +64,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
6464
records = get_scoreset_records(metadata, True, store_path)
6565
metadata = patch_target_sequence_type(metadata, records, force=False)
6666
except ScoresetNotSupportedError as e:
67+
_logger.error("Scoreset not supported for %s: %s", urn, e)
6768
return JSONResponse(
6869
content=ScoresetMapping(
6970
metadata=None,
@@ -72,6 +73,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
7273
)
7374
except ResourceAcquisitionError as e:
7475
msg = f"Unable to acquire resource from MaveDB: {e}"
76+
_logger.error(msg)
7577
raise HTTPException(status_code=500, detail=msg) from e
7678

7779
if not records:
@@ -87,17 +89,21 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
8789
alignment_results = build_alignment_result(metadata, True)
8890
except BlatNotFoundError as e:
8991
msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more."
92+
_logger.error("BLAT not found for %s: %s", urn, e)
9093
raise HTTPException(status_code=500, detail=msg) from e
9194
except ResourceAcquisitionError as e:
9295
msg = f"BLAT resource could not be acquired: {e}"
96+
_logger.error(msg)
9397
raise HTTPException(status_code=500, detail=msg) from e
9498
except AlignmentError as e:
99+
_logger.error("Alignment error for %s: %s", urn, e)
95100
return JSONResponse(
96101
content=ScoresetMapping(
97102
metadata=metadata, error_message=str(e).strip("'")
98103
).model_dump(exclude_none=True)
99104
)
100105
except ScoresetNotSupportedError as e:
106+
_logger.error("Scoreset not supported during alignment for %s: %s", urn, e)
101107
return JSONResponse(
102108
content=ScoresetMapping(
103109
metadata=metadata, error_message=str(e).strip("'")
@@ -111,11 +117,13 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
111117
# on the target level and on the variant level for variants relative to that target
112118
# HTTPErrors and DataLookupErrors cause the mapping process to exit because these indicate
113119
# underlying issues with data providers.
114-
except HTTPError as e:
120+
except HTTPStatusError as e:
115121
msg = f"HTTP error occurred during transcript selection: {e}"
122+
_logger.error(msg)
116123
raise HTTPException(status_code=500, detail=msg) from e
117124
except DataLookupError as e:
118125
msg = f"Data lookup error occurred during transcript selection: {e}"
126+
_logger.error(msg)
119127
raise HTTPException(status_code=500, detail=msg) from e
120128

121129
vrs_results = {}
@@ -134,6 +142,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
134142
UnsupportedReferenceSequencePrefixError,
135143
MissingSequenceIdError,
136144
) as e:
145+
_logger.error("VRS mapping error for %s: %s", urn, e)
137146
return JSONResponse(
138147
content=ScoresetMapping(
139148
metadata=metadata, error_message=str(e).strip("'")
@@ -172,6 +181,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
172181
VrsVersion.V_2,
173182
)
174183
except Exception as e:
184+
_logger.error("Unexpected error during annotation for %s: %s", urn, e)
175185
return JSONResponse(
176186
content=ScoresetMapping(
177187
metadata=metadata, error_message=str(e).strip("'")
@@ -287,6 +297,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
287297
del reference_sequences[target_gene].layers[layer]
288298

289299
except Exception as e:
300+
_logger.error("Unexpected error during result assembly for %s: %s", urn, e)
290301
return JSONResponse(
291302
content=ScoresetMapping(
292303
metadata=metadata, error_message=str(e).strip("'")

src/dcd_mapping/align.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pathlib import Path
88
from urllib.parse import urlparse
99

10-
import requests
10+
import httpx
1111
from Bio.SearchIO import HSP
1212
from Bio.SearchIO import parse as parse_blat
1313
from Bio.SearchIO._model import Hit, QueryResult
@@ -84,7 +84,7 @@ def get_ref_genome_file(
8484
if not genome_file.exists():
8585
try:
8686
http_download(url, genome_file, silent)
87-
except requests.HTTPError as e:
87+
except httpx.HTTPStatusError as e:
8888
msg = f"HTTPError when fetching reference genome file from {url}"
8989
_logger.error(msg)
9090
raise ResourceAcquisitionError(msg) from e
@@ -378,11 +378,11 @@ def fetch_alignment(
378378
alignment_results[accession_id] = None
379379
else:
380380
url = f"{CDOT_URL}/transcript/{accession_id}"
381-
r = requests.get(url, timeout=30)
381+
r = httpx.get(url, timeout=30)
382382

383383
try:
384384
r.raise_for_status()
385-
except requests.HTTPError as e:
385+
except httpx.HTTPStatusError as e:
386386
msg = f"Received HTTPError from {url} for scoreset {metadata.urn}"
387387
_logger.error(msg)
388388
raise ResourceAcquisitionError(msg) from e

src/dcd_mapping/lookup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
from typing import Any
1515

1616
import hgvs
17+
import httpx
1718
import polars as pl
18-
import requests
1919
from biocommons.seqrepo import SeqRepo
2020
from biocommons.seqrepo.seqaliasdb.seqaliasdb import sqlite3
2121
from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider
@@ -682,7 +682,7 @@ def get_overlapping_features_for_region(
682682
url, headers={"Content-Type": "application/json"}
683683
)
684684
response.raise_for_status()
685-
except requests.RequestException as e:
685+
except httpx.HTTPError as e:
686686
_logger.error(
687687
"Failed to fetch overlapping features for region %s-%s on chromosome %s: %s",
688688
start,
@@ -715,7 +715,7 @@ def get_uniprot_sequence(uniprot_id: str) -> str | None:
715715
:raise HTTPError: if response comes with an HTTP error code
716716
"""
717717
url = f"https://www.ebi.ac.uk/proteins/api/proteins?accession={uniprot_id.split(':')[1]}&format=json"
718-
response = requests.get(url, timeout=30)
718+
response = httpx.get(url, timeout=30)
719719
response.raise_for_status()
720720
json = response.json()
721721
return json[0]["sequence"]["sequence"]

src/dcd_mapping/main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pathlib import Path
77

88
import click
9-
from requests import HTTPError
9+
from httpx import HTTPStatusError
1010

1111
from dcd_mapping.align import build_alignment_result
1212
from dcd_mapping.annotate import (
@@ -205,7 +205,7 @@ async def map_scoreset(
205205
# on the target level and on the variant level for variants relative to that target
206206
# HTTPErrors and DataLookupErrors cause the mapping process to exit because these indicate
207207
# underlying issues with data providers.
208-
except HTTPError as e:
208+
except HTTPStatusError as e:
209209
_emit_info(
210210
f"HTTP error occurred during transcript selection: {e}",
211211
silent,

src/dcd_mapping/mavedb_data.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from pathlib import Path
1414
from typing import Any
1515

16-
import requests
16+
import httpx
1717
from fastapi import HTTPException
1818
from pydantic import ValidationError
1919

@@ -27,6 +27,7 @@
2727
MAVEDB_BASE_URL,
2828
authentication_header,
2929
http_download,
30+
is_missing_value,
3031
)
3132
from dcd_mapping.schemas import (
3233
ScoreRow,
@@ -56,7 +57,7 @@ def get_scoreset_urns() -> set[str]:
5657
5758
:return: set of URN strings
5859
"""
59-
r = requests.get(
60+
r = httpx.get(
6061
f"{MAVEDB_BASE_URL}/api/v1/experiments/",
6162
timeout=30,
6263
headers=authentication_header(),
@@ -100,14 +101,14 @@ def get_human_urns() -> list[str]:
100101
scoreset_urns = get_scoreset_urns()
101102
human_scoresets: list[str] = []
102103
for urn in scoreset_urns:
103-
r = requests.get(
104+
r = httpx.get(
104105
f"{MAVEDB_BASE_URL}/api/v1/score-sets/{urn}",
105106
timeout=30,
106107
headers=authentication_header(),
107108
)
108109
try:
109110
r.raise_for_status()
110-
except requests.exceptions.HTTPError:
111+
except httpx.HTTPStatusError:
111112
_logger.info("Unable to retrieve scoreset data for URN %s", urn)
112113
continue
113114
data = r.json()
@@ -155,10 +156,10 @@ def get_raw_scoreset_metadata(
155156
metadata_file = dcd_mapping_dir / f"{scoreset_urn}_metadata.json"
156157
if not metadata_file.exists():
157158
url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{scoreset_urn}"
158-
r = requests.get(url, timeout=30, headers=authentication_header())
159+
r = httpx.get(url, timeout=30, headers=authentication_header())
159160
try:
160161
r.raise_for_status()
161-
except requests.HTTPError as e:
162+
except httpx.HTTPStatusError as e:
162163
msg = f"Received HTTPError from {url} for scoreset {scoreset_urn}"
163164
_logger.error(msg)
164165
raise ResourceAcquisitionError(msg) from e
@@ -246,13 +247,13 @@ def _load_scoreset_records(
246247
with path.open() as csvfile:
247248
reader = csv.DictReader(csvfile)
248249
for row in reader:
249-
if row["score"] == "NA":
250+
if is_missing_value(row["score"]):
250251
row["score"] = None
251252
else:
252253
row["score"] = row["score"]
253-
if row["hgvs_nt"] != "NA":
254+
if not is_missing_value(row["hgvs_nt"]):
254255
prefix = row["hgvs_nt"].split(":")[0] if ":" in row["hgvs_nt"] else None
255-
elif row["hgvs_pro"] != "NA":
256+
elif not is_missing_value(row["hgvs_pro"]):
256257
prefix = (
257258
row["hgvs_pro"].split(":")[0] if ":" in row["hgvs_pro"] else None
258259
)
@@ -317,7 +318,7 @@ def get_scoreset_records(
317318
url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{metadata.urn}/scores"
318319
try:
319320
http_download(url, scores_csv, silent)
320-
except requests.HTTPError as e:
321+
except httpx.HTTPStatusError as e:
321322
msg = f"HTTPError when fetching scores CSV from {url}"
322323
_logger.error(msg)
323324
raise ResourceAcquisitionError(msg) from e

src/dcd_mapping/resource_utils.py

Lines changed: 44 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,30 @@
55
from pathlib import Path
66

77
import click
8-
import requests
8+
import httpx
99
from tqdm import tqdm
1010

1111
_logger = logging.getLogger(__name__)
1212

13+
# Common representations of missing/null data in CSV files
14+
MISSING_VALUE_REPRESENTATIONS = frozenset(
15+
{
16+
"NA",
17+
"N/A",
18+
"na",
19+
"n/a",
20+
"NaN",
21+
"nan",
22+
"null",
23+
"NULL",
24+
"None",
25+
"none",
26+
"",
27+
"-",
28+
".",
29+
}
30+
)
31+
1332
MAVEDB_API_KEY = os.environ.get("MAVEDB_API_KEY")
1433
MAVEDB_BASE_URL = os.environ.get("MAVEDB_BASE_URL")
1534
ENSEMBL_API_URL = os.environ.get("ENSEMBL_API_URL", "https://rest.ensembl.org") # TODO
@@ -24,6 +43,22 @@
2443
LOCAL_STORE_PATH.mkdir(exist_ok=True, parents=True)
2544

2645

46+
def is_missing_value(value: str | None) -> bool:
47+
"""Check if a value represents missing/null data.
48+
49+
This function recognizes multiple common representations of missing data
50+
that may appear in CSV files from external sources, making the codebase
51+
more resilient to upstream changes in NA representation.
52+
53+
:param value: The value to check
54+
:return: True if the value represents missing data, False otherwise
55+
"""
56+
if value is None:
57+
return True
58+
# Strip whitespace and check against known missing value representations
59+
return value.strip() in MISSING_VALUE_REPRESENTATIONS
60+
61+
2762
def authentication_header() -> dict | None:
2863
"""Fetch with api key envvar, if available."""
2964
return {"X-API-key": MAVEDB_API_KEY} if MAVEDB_API_KEY is not None else None
@@ -36,13 +71,11 @@ def http_download(url: str, out_path: Path, silent: bool = True) -> Path:
3671
:param out_path: location to save file to
3772
:param silent: show TQDM progress bar if true
3873
:return: Path if download successful
39-
:raise requests.HTTPError: if request is unsuccessful
74+
:raise httpx.HTTPStatusError: if request is unsuccessful
4075
"""
4176
if not silent:
4277
click.echo(f"Downloading {out_path.name} to {out_path.parents[0].absolute()}")
43-
with requests.get(
44-
url, stream=True, timeout=60, headers=authentication_header()
45-
) as r:
78+
with httpx.stream("GET", url, timeout=60, headers=authentication_header()) as r:
4679
r.raise_for_status()
4780
total_size = int(r.headers.get("content-length", 0))
4881
with out_path.open("wb") as h:
@@ -54,20 +87,20 @@ def http_download(url: str, out_path: Path, silent: bool = True) -> Path:
5487
desc=out_path.name,
5588
ncols=80,
5689
) as progress_bar:
57-
for chunk in r.iter_content(chunk_size=8192):
90+
for chunk in r.iter_bytes(chunk_size=8192):
5891
if chunk:
5992
h.write(chunk)
6093
progress_bar.update(len(chunk))
6194
else:
62-
for chunk in r.iter_content(chunk_size=8192):
95+
for chunk in r.iter_bytes(chunk_size=8192):
6396
if chunk:
6497
h.write(chunk)
6598
return out_path
6699

67100

68101
def request_with_backoff(
69102
url: str, max_retries: int = 5, backoff_factor: float = 0.3, **kwargs
70-
) -> requests.Response:
103+
) -> httpx.Response:
71104
"""HTTP GET with exponential backoff only for retryable errors.
72105
73106
Retries on:
@@ -80,9 +113,9 @@ def request_with_backoff(
80113
attempt = 0
81114
while attempt < max_retries:
82115
try:
83-
kwargs.setdefault("timeout", 60) # Default timeout of 10 seconds
84-
response = requests.get(url, **kwargs) # noqa: S113
85-
except (requests.Timeout, requests.ConnectionError):
116+
kwargs.setdefault("timeout", 60)
117+
response = httpx.get(url, **kwargs)
118+
except (httpx.TimeoutException, httpx.ConnectError):
86119
# Retry on transient network failures
87120
if attempt == max_retries - 1:
88121
raise

src/dcd_mapping/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""Provide dcd mapping version"""
22

3-
dcd_mapping_version = "2026.1.0"
3+
dcd_mapping_version = "2026.1.1"

0 commit comments

Comments (0)