Skip to content

Commit b53eef1

Browse files
authored
Merge pull request #78 from VariantEffect/mavedb-dev
Release 2026.1.1
2 parents db9a54c + 60d2a1b commit b53eef1

12 files changed

Lines changed: 121 additions & 71 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,14 @@ requires-python = ">=3.11"
3333

3434
dependencies = [
3535
"agct~=0.1.0",
36-
"requests",
3736
"biopython",
3837
"tqdm",
3938
"cdot",
4039
"click",
4140
"cool-seq-tool==0.4.0.dev3",
4241
"ga4gh.vrs==2.0.0-a6",
4342
"gene_normalizer[etl,pg]==0.3.0-dev2",
43+
"httpx~=0.28",
4444
"pydantic>=2",
4545
"python-dotenv",
4646
"setuptools>=68.0", # tmp -- ensure 3.12 compatibility
@@ -61,7 +61,7 @@ tests = [
6161
"pytest-mock",
6262
"pytest-cov",
6363
"pytest-asyncio",
64-
"requests-mock"
64+
"respx"
6565
]
6666
dev = [
6767
"ruff==0.2.0",

src/api/routers/map.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from cool_seq_tool.schemas import AnnotationLayer
66
from fastapi import APIRouter, HTTPException
77
from fastapi.responses import JSONResponse
8-
from requests import HTTPError
8+
from httpx import HTTPStatusError
99

1010
from dcd_mapping.align import build_alignment_result
1111
from dcd_mapping.annotate import (
@@ -64,6 +64,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
6464
records = get_scoreset_records(metadata, True, store_path)
6565
metadata = patch_target_sequence_type(metadata, records, force=False)
6666
except ScoresetNotSupportedError as e:
67+
_logger.error("Scoreset not supported for %s: %s", urn, e)
6768
return JSONResponse(
6869
content=ScoresetMapping(
6970
metadata=None,
@@ -72,6 +73,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
7273
)
7374
except ResourceAcquisitionError as e:
7475
msg = f"Unable to acquire resource from MaveDB: {e}"
76+
_logger.error(msg)
7577
raise HTTPException(status_code=500, detail=msg) from e
7678

7779
if not records:
@@ -87,17 +89,21 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
8789
alignment_results = build_alignment_result(metadata, True)
8890
except BlatNotFoundError as e:
8991
msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more."
92+
_logger.error("BLAT not found for %s: %s", urn, e)
9093
raise HTTPException(status_code=500, detail=msg) from e
9194
except ResourceAcquisitionError as e:
9295
msg = f"BLAT resource could not be acquired: {e}"
96+
_logger.error(msg)
9397
raise HTTPException(status_code=500, detail=msg) from e
9498
except AlignmentError as e:
99+
_logger.error("Alignment error for %s: %s", urn, e)
95100
return JSONResponse(
96101
content=ScoresetMapping(
97102
metadata=metadata, error_message=str(e).strip("'")
98103
).model_dump(exclude_none=True)
99104
)
100105
except ScoresetNotSupportedError as e:
106+
_logger.error("Scoreset not supported during alignment for %s: %s", urn, e)
101107
return JSONResponse(
102108
content=ScoresetMapping(
103109
metadata=metadata, error_message=str(e).strip("'")
@@ -111,11 +117,13 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
111117
# on the target level and on the variant level for variants relative to that target
112118
# HTTPErrors and DataLookupErrors cause the mapping process to exit because these indicate
113119
# underlying issues with data providers.
114-
except HTTPError as e:
120+
except HTTPStatusError as e:
115121
msg = f"HTTP error occurred during transcript selection: {e}"
122+
_logger.error(msg)
116123
raise HTTPException(status_code=500, detail=msg) from e
117124
except DataLookupError as e:
118125
msg = f"Data lookup error occurred during transcript selection: {e}"
126+
_logger.error(msg)
119127
raise HTTPException(status_code=500, detail=msg) from e
120128

121129
vrs_results = {}
@@ -134,6 +142,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
134142
UnsupportedReferenceSequencePrefixError,
135143
MissingSequenceIdError,
136144
) as e:
145+
_logger.error("VRS mapping error for %s: %s", urn, e)
137146
return JSONResponse(
138147
content=ScoresetMapping(
139148
metadata=metadata, error_message=str(e).strip("'")
@@ -172,6 +181,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
172181
VrsVersion.V_2,
173182
)
174183
except Exception as e:
184+
_logger.error("Unexpected error during annotation for %s: %s", urn, e)
175185
return JSONResponse(
176186
content=ScoresetMapping(
177187
metadata=metadata, error_message=str(e).strip("'")
@@ -287,6 +297,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
287297
del reference_sequences[target_gene].layers[layer]
288298

289299
except Exception as e:
300+
_logger.error("Unexpected error during result assembly for %s: %s", urn, e)
290301
return JSONResponse(
291302
content=ScoresetMapping(
292303
metadata=metadata, error_message=str(e).strip("'")

src/dcd_mapping/align.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pathlib import Path
88
from urllib.parse import urlparse
99

10-
import requests
10+
import httpx
1111
from Bio.SearchIO import HSP
1212
from Bio.SearchIO import parse as parse_blat
1313
from Bio.SearchIO._model import Hit, QueryResult
@@ -84,7 +84,7 @@ def get_ref_genome_file(
8484
if not genome_file.exists():
8585
try:
8686
http_download(url, genome_file, silent)
87-
except requests.HTTPError as e:
87+
except httpx.HTTPStatusError as e:
8888
msg = f"HTTPError when fetching reference genome file from {url}"
8989
_logger.error(msg)
9090
raise ResourceAcquisitionError(msg) from e
@@ -378,11 +378,11 @@ def fetch_alignment(
378378
alignment_results[accession_id] = None
379379
else:
380380
url = f"{CDOT_URL}/transcript/{accession_id}"
381-
r = requests.get(url, timeout=30)
381+
r = httpx.get(url, timeout=30)
382382

383383
try:
384384
r.raise_for_status()
385-
except requests.HTTPError as e:
385+
except httpx.HTTPStatusError as e:
386386
msg = f"Received HTTPError from {url} for scoreset {metadata.urn}"
387387
_logger.error(msg)
388388
raise ResourceAcquisitionError(msg) from e

src/dcd_mapping/lookup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
from typing import Any
1515

1616
import hgvs
17+
import httpx
1718
import polars as pl
18-
import requests
1919
from biocommons.seqrepo import SeqRepo
2020
from biocommons.seqrepo.seqaliasdb.seqaliasdb import sqlite3
2121
from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider
@@ -682,7 +682,7 @@ def get_overlapping_features_for_region(
682682
url, headers={"Content-Type": "application/json"}
683683
)
684684
response.raise_for_status()
685-
except requests.RequestException as e:
685+
except httpx.HTTPError as e:
686686
_logger.error(
687687
"Failed to fetch overlapping features for region %s-%s on chromosome %s: %s",
688688
start,
@@ -715,7 +715,7 @@ def get_uniprot_sequence(uniprot_id: str) -> str | None:
715715
:raise HTTPError: if response comes with an HTTP error code
716716
"""
717717
url = f"https://www.ebi.ac.uk/proteins/api/proteins?accession={uniprot_id.split(':')[1]}&format=json"
718-
response = requests.get(url, timeout=30)
718+
response = httpx.get(url, timeout=30)
719719
response.raise_for_status()
720720
json = response.json()
721721
return json[0]["sequence"]["sequence"]

src/dcd_mapping/main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pathlib import Path
77

88
import click
9-
from requests import HTTPError
9+
from httpx import HTTPStatusError
1010

1111
from dcd_mapping.align import build_alignment_result
1212
from dcd_mapping.annotate import (
@@ -205,7 +205,7 @@ async def map_scoreset(
205205
# on the target level and on the variant level for variants relative to that target
206206
# HTTPErrors and DataLookupErrors cause the mapping process to exit because these indicate
207207
# underlying issues with data providers.
208-
except HTTPError as e:
208+
except HTTPStatusError as e:
209209
_emit_info(
210210
f"HTTP error occurred during transcript selection: {e}",
211211
silent,

src/dcd_mapping/mavedb_data.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from pathlib import Path
1414
from typing import Any
1515

16-
import requests
16+
import httpx
1717
from fastapi import HTTPException
1818
from pydantic import ValidationError
1919

@@ -27,6 +27,7 @@
2727
MAVEDB_BASE_URL,
2828
authentication_header,
2929
http_download,
30+
is_missing_value,
3031
)
3132
from dcd_mapping.schemas import (
3233
ScoreRow,
@@ -56,7 +57,7 @@ def get_scoreset_urns() -> set[str]:
5657
5758
:return: set of URN strings
5859
"""
59-
r = requests.get(
60+
r = httpx.get(
6061
f"{MAVEDB_BASE_URL}/api/v1/experiments/",
6162
timeout=30,
6263
headers=authentication_header(),
@@ -100,14 +101,14 @@ def get_human_urns() -> list[str]:
100101
scoreset_urns = get_scoreset_urns()
101102
human_scoresets: list[str] = []
102103
for urn in scoreset_urns:
103-
r = requests.get(
104+
r = httpx.get(
104105
f"{MAVEDB_BASE_URL}/api/v1/score-sets/{urn}",
105106
timeout=30,
106107
headers=authentication_header(),
107108
)
108109
try:
109110
r.raise_for_status()
110-
except requests.exceptions.HTTPError:
111+
except httpx.HTTPStatusError:
111112
_logger.info("Unable to retrieve scoreset data for URN %s", urn)
112113
continue
113114
data = r.json()
@@ -155,10 +156,10 @@ def get_raw_scoreset_metadata(
155156
metadata_file = dcd_mapping_dir / f"{scoreset_urn}_metadata.json"
156157
if not metadata_file.exists():
157158
url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{scoreset_urn}"
158-
r = requests.get(url, timeout=30, headers=authentication_header())
159+
r = httpx.get(url, timeout=30, headers=authentication_header())
159160
try:
160161
r.raise_for_status()
161-
except requests.HTTPError as e:
162+
except httpx.HTTPStatusError as e:
162163
msg = f"Received HTTPError from {url} for scoreset {scoreset_urn}"
163164
_logger.error(msg)
164165
raise ResourceAcquisitionError(msg) from e
@@ -246,13 +247,13 @@ def _load_scoreset_records(
246247
with path.open() as csvfile:
247248
reader = csv.DictReader(csvfile)
248249
for row in reader:
249-
if row["score"] == "NA":
250+
if is_missing_value(row["score"]):
250251
row["score"] = None
251252
else:
252253
row["score"] = row["score"]
253-
if row["hgvs_nt"] != "NA":
254+
if not is_missing_value(row["hgvs_nt"]):
254255
prefix = row["hgvs_nt"].split(":")[0] if ":" in row["hgvs_nt"] else None
255-
elif row["hgvs_pro"] != "NA":
256+
elif not is_missing_value(row["hgvs_pro"]):
256257
prefix = (
257258
row["hgvs_pro"].split(":")[0] if ":" in row["hgvs_pro"] else None
258259
)
@@ -317,7 +318,7 @@ def get_scoreset_records(
317318
url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{metadata.urn}/scores"
318319
try:
319320
http_download(url, scores_csv, silent)
320-
except requests.HTTPError as e:
321+
except httpx.HTTPStatusError as e:
321322
msg = f"HTTPError when fetching scores CSV from {url}"
322323
_logger.error(msg)
323324
raise ResourceAcquisitionError(msg) from e

src/dcd_mapping/resource_utils.py

Lines changed: 44 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,30 @@
55
from pathlib import Path
66

77
import click
8-
import requests
8+
import httpx
99
from tqdm import tqdm
1010

1111
_logger = logging.getLogger(__name__)
1212

13+
# Common representations of missing/null data in CSV files
14+
MISSING_VALUE_REPRESENTATIONS = frozenset(
15+
{
16+
"NA",
17+
"N/A",
18+
"na",
19+
"n/a",
20+
"NaN",
21+
"nan",
22+
"null",
23+
"NULL",
24+
"None",
25+
"none",
26+
"",
27+
"-",
28+
".",
29+
}
30+
)
31+
1332
MAVEDB_API_KEY = os.environ.get("MAVEDB_API_KEY")
1433
MAVEDB_BASE_URL = os.environ.get("MAVEDB_BASE_URL")
1534
ENSEMBL_API_URL = os.environ.get("ENSEMBL_API_URL", "https://rest.ensembl.org") # TODO
@@ -24,6 +43,22 @@
2443
LOCAL_STORE_PATH.mkdir(exist_ok=True, parents=True)
2544

2645

46+
def is_missing_value(value: str | None) -> bool:
47+
"""Check if a value represents missing/null data.
48+
49+
This function recognizes multiple common representations of missing data
50+
that may appear in CSV files from external sources, making the codebase
51+
more resilient to upstream changes in NA representation.
52+
53+
:param value: The value to check
54+
:return: True if the value represents missing data, False otherwise
55+
"""
56+
if value is None:
57+
return True
58+
# Strip whitespace and check against known missing value representations
59+
return value.strip() in MISSING_VALUE_REPRESENTATIONS
60+
61+
2762
def authentication_header() -> dict | None:
2863
"""Fetch with api key envvar, if available."""
2964
return {"X-API-key": MAVEDB_API_KEY} if MAVEDB_API_KEY is not None else None
@@ -36,13 +71,11 @@ def http_download(url: str, out_path: Path, silent: bool = True) -> Path:
3671
:param out_path: location to save file to
3772
:param silent: show TQDM progress bar if true
3873
:return: Path if download successful
39-
:raise requests.HTTPError: if request is unsuccessful
74+
:raise httpx.HTTPStatusError: if request is unsuccessful
4075
"""
4176
if not silent:
4277
click.echo(f"Downloading {out_path.name} to {out_path.parents[0].absolute()}")
43-
with requests.get(
44-
url, stream=True, timeout=60, headers=authentication_header()
45-
) as r:
78+
with httpx.stream("GET", url, timeout=60, headers=authentication_header()) as r:
4679
r.raise_for_status()
4780
total_size = int(r.headers.get("content-length", 0))
4881
with out_path.open("wb") as h:
@@ -54,20 +87,20 @@ def http_download(url: str, out_path: Path, silent: bool = True) -> Path:
5487
desc=out_path.name,
5588
ncols=80,
5689
) as progress_bar:
57-
for chunk in r.iter_content(chunk_size=8192):
90+
for chunk in r.iter_bytes(chunk_size=8192):
5891
if chunk:
5992
h.write(chunk)
6093
progress_bar.update(len(chunk))
6194
else:
62-
for chunk in r.iter_content(chunk_size=8192):
95+
for chunk in r.iter_bytes(chunk_size=8192):
6396
if chunk:
6497
h.write(chunk)
6598
return out_path
6699

67100

68101
def request_with_backoff(
69102
url: str, max_retries: int = 5, backoff_factor: float = 0.3, **kwargs
70-
) -> requests.Response:
103+
) -> httpx.Response:
71104
"""HTTP GET with exponential backoff only for retryable errors.
72105
73106
Retries on:
@@ -80,9 +113,9 @@ def request_with_backoff(
80113
attempt = 0
81114
while attempt < max_retries:
82115
try:
83-
kwargs.setdefault("timeout", 60) # Default timeout of 10 seconds
84-
response = requests.get(url, **kwargs) # noqa: S113
85-
except (requests.Timeout, requests.ConnectionError):
116+
kwargs.setdefault("timeout", 60)
117+
response = httpx.get(url, **kwargs)
118+
except (httpx.TimeoutException, httpx.ConnectError):
86119
# Retry on transient network failures
87120
if attempt == max_retries - 1:
88121
raise

src/dcd_mapping/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""Provide dcd mapping version"""
22

3-
dcd_mapping_version = "2026.1.0"
3+
dcd_mapping_version = "2026.1.1"

0 commit comments

Comments (0)