Skip to content

Commit 1aa5409

Browse files
committed
feat: add Redis caching for ClinGen API requests to reduce redundant calls
Implements a 24-hour Redis cache for ClinGen Allele Registry API responses, significantly reducing API load when processing multiple ClinVar control versions that query the same alleles. Converts three ClinGen functions to async with the @cached decorator, implements a memory backend for testing, and handles 404 responses as cacheable "no data" results while raising exceptions for other API failures. Includes comprehensive test coverage and type stubs for the untyped aiocache library.

- Add aiocache optional dependency with Redis backend support
- Create cache configuration module with environment-based backend selection
- Convert get_canonical_pa_ids, get_matching_registered_ca_ids, and get_associated_clinvar_allele_id to async cached functions
- Return empty string/list for "no data" cases to enable caching of modal outcomes
- Implement 404-specific error handling: cache permanent absences, raise for transient failures
- Add memory cache backend for testing without Redis dependency
- Create type stubs for aiocache.Cache and aiocache.cached decorator
- Add 43 new tests covering caching behavior, configuration, and network interactions
1 parent 93e8519 commit 1aa5409

18 files changed

Lines changed: 1111 additions & 133 deletions

File tree

mypy_stubs/aiocache/__init__.pyi

Lines changed: 53 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,53 @@
1+
"""Type stubs for aiocache library.
2+
3+
Provides type hints for the aiocache caching library functionality used in MaveDB.
4+
"""
5+
6+
from typing import Any, Awaitable, Callable, Optional, Type, TypeVar, Union
7+
8+
from .base import BaseCache
9+
10+
# Type variables for decorator
11+
F = TypeVar("F", bound=Callable[..., Awaitable[Any]])
12+
T = TypeVar("T")
13+
14+
class Cache:
15+
"""Cache factory class for creating cache instances."""
16+
17+
# Cache backend constants
18+
REDIS: Type[BaseCache]
19+
MEMORY: Type[BaseCache]
20+
21+
def __init__(
22+
self,
23+
cache_class: Type[BaseCache],
24+
*,
25+
endpoint: Optional[str] = None,
26+
port: Optional[int] = None,
27+
ssl: bool = False,
28+
namespace: Optional[str] = None,
29+
serializer: Optional[Any] = None,
30+
plugins: Optional[Any] = None,
31+
**kwargs: Any,
32+
) -> None: ...
33+
async def get(self, key: str) -> Any: ...
34+
async def set(self, key: str, value: Any, ttl: Optional[int] = None) -> bool: ...
35+
async def delete(self, key: str) -> bool: ...
36+
async def clear(self, namespace: Optional[str] = None) -> bool: ...
37+
async def close(self) -> None: ...
38+
39+
def cached(
40+
ttl: Optional[int] = None,
41+
key: Optional[str] = None,
42+
key_builder: Optional[Callable[..., str]] = None,
43+
cache: Union[Type[BaseCache], BaseCache, None] = None,
44+
serializer: Optional[Any] = None,
45+
plugins: Optional[Any] = None,
46+
alias: Optional[str] = None,
47+
namespace: Optional[str] = None,
48+
noself: bool = False,
49+
skip_cache_func: Optional[Callable[[Any], bool]] = None,
50+
**kwargs: Any,
51+
) -> Callable[[F], F]: ...
52+
53+
__all__ = ["Cache", "cached"]

mypy_stubs/aiocache/base.pyi

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""Type stubs for aiocache.base module.
2+
3+
Provides type hints for the base cache class used by aiocache backends.
4+
"""
5+
6+
from typing import Any, Optional
7+
8+
class BaseCache:
9+
"""Base class for cache backends."""
10+
11+
def __init__(
12+
self,
13+
*,
14+
namespace: Optional[str] = None,
15+
serializer: Optional[Any] = None,
16+
plugins: Optional[Any] = None,
17+
**kwargs: Any,
18+
) -> None: ...
19+
async def get(self, key: str) -> Any: ...
20+
async def set(self, key: str, value: Any, ttl: Optional[int] = None) -> bool: ...
21+
async def delete(self, key: str) -> bool: ...
22+
async def clear(self, namespace: Optional[str] = None) -> bool: ...
23+
async def close(self) -> None: ...
24+
25+
__all__ = ["BaseCache"]

poetry.lock

Lines changed: 23 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ SQLAlchemy = "~2.0.29"
4141
ga4gh-va-spec = "~0.4.2"
4242

4343
# Optional dependencies for running this application as a server
44+
aiocache = { extras = ["redis"], version = "~0.12.2", optional = true }
4445
alembic = { version = "~1.14.0", optional = true }
4546
alembic-utils = { version = "0.8.1", optional = true }
4647
arq = { version = "~0.25.0", optional = true }
@@ -89,7 +90,7 @@ SQLAlchemy = { extras = ["mypy"], version = "~2.0.0" }
8990

9091

9192
[tool.poetry.extras]
92-
server = ["alembic", "alembic-utils", "arq", "authlib", "biocommons", "boto3", "cdot", "cryptography", "fastapi", "hgvs", "orcid", "psycopg2", "python-jose", "python-multipart", "pyathena", "requests", "starlette", "starlette-context", "slack-sdk", "uvicorn", "watchtower"]
93+
server = ["aiocache", "alembic", "alembic-utils", "arq", "authlib", "biocommons", "boto3", "cdot", "cryptography", "fastapi", "hgvs", "orcid", "psycopg2", "python-jose", "python-multipart", "pyathena", "requests", "starlette", "starlette-context", "slack-sdk", "uvicorn", "watchtower"]
9394

9495

9596
[tool.mypy]

settings/.env.template

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,4 +106,19 @@ GNOMAD_DATA_VERSION=v4.1
106106
AWS_ACCESS_KEY_ID=test
107107
AWS_SECRET_ACCESS_KEY=test
108108
S3_ENDPOINT_URL=http://localstack:4566
109-
UPLOAD_S3_BUCKET_NAME=score-set-csv-uploads-dev
109+
UPLOAD_S3_BUCKET_NAME=score-set-csv-uploads-dev
110+
111+
####################################################################################################
112+
# Environment variables for ClinGen cache settings
113+
####################################################################################################
114+
115+
CLINGEN_CACHE_BACKEND=redis
116+
CLINGEN_REDIS_HOST=localhost
117+
CLINGEN_REDIS_PORT=6379
118+
CLINGEN_REDIS_SSL=false
119+
120+
####################################################################################################
121+
# Environment variables for ClinVar cache settings
122+
####################################################################################################
123+
124+
CLINVAR_CACHE_DIR=/data/clinvar_cache
Lines changed: 103 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,48 @@
1+
import asyncio
12
import logging
23

34
import requests
5+
from aiocache import cached
6+
7+
from mavedb.lib.clingen.cache import CACHE_CLASS, CACHE_CONFIG, CACHE_TTL_SECONDS, clingen_cache_key_builder
48

59
logger = logging.getLogger(__name__)
610
logger.setLevel(logging.DEBUG)
711

812
CLINGEN_API_URL = "https://reg.genome.network/allele"
913

1014

11-
def get_canonical_pa_ids(clingen_allele_id: str) -> list[str]:
12-
""" "Retrieve any canonical PA IDs from the ClinGen API for a given clingen allele ID."""
13-
response = requests.get(f"{CLINGEN_API_URL}/{clingen_allele_id}")
14-
if response.status_code != 200:
15-
logger.error(f"Failed to query ClinGen API for {clingen_allele_id}: {response.status_code}")
15+
@cached(ttl=CACHE_TTL_SECONDS, key_builder=clingen_cache_key_builder, cache=CACHE_CLASS, **CACHE_CONFIG)
16+
async def get_canonical_pa_ids(clingen_allele_id: str) -> list[str]:
17+
"""Retrieve canonical PA IDs from the ClinGen API for a given ClinGen allele ID.
18+
19+
Results are automatically cached for 24 hours using aiocache with configurable backend.
20+
This significantly reduces repeated API calls when processing multiple ClinVar control
21+
versions or running jobs that query the same alleles. Cache backend can be switched
22+
between Redis (production) and in-memory (testing) via CLINGEN_CACHE_BACKEND env var.
23+
24+
Args:
25+
clingen_allele_id: ClinGen allele ID to query (e.g., CA123456)
26+
27+
Returns:
28+
List of canonical PA IDs associated with the allele. Returns empty list if
29+
the allele has no MANE transcripts or if the allele doesn't exist (404).
30+
31+
Raises:
32+
requests.exceptions.HTTPError: If the API request fails with non-2xx status code
33+
(excluding 404, which returns empty list).
34+
"""
35+
loop = asyncio.get_running_loop()
36+
response = await loop.run_in_executor(None, requests.get, f"{CLINGEN_API_URL}/{clingen_allele_id}")
37+
38+
# 404 means the allele doesn't exist in ClinGen's registry - treat as "no data" (cacheable)
39+
if response.status_code == 404:
1640
return []
1741

42+
# All other non-2xx status codes raise exceptions (400, 429, 5xx, etc.)
43+
if response.status_code != 200:
44+
response.raise_for_status()
45+
1846
data = response.json()
1947

2048
pa_ids = []
@@ -27,35 +55,92 @@ def get_canonical_pa_ids(clingen_allele_id: str) -> list[str]:
2755
return pa_ids
2856

2957

30-
def get_matching_registered_ca_ids(clingen_pa_id: str) -> list[str]:
31-
"""Retrieve all matching registered transcript CA IDs for a given PA ID from the ClinGen API."""
32-
response = requests.get(f"{CLINGEN_API_URL}/{clingen_pa_id}")
33-
if response.status_code != 200:
34-
logger.error(f"Failed to query ClinGen API for {clingen_pa_id}: {response.status_code}")
58+
@cached(ttl=CACHE_TTL_SECONDS, key_builder=clingen_cache_key_builder, cache=CACHE_CLASS, **CACHE_CONFIG)
59+
async def get_matching_registered_ca_ids(clingen_pa_id: str) -> list[str]:
60+
"""Retrieve matching registered transcript CA IDs for a given PA ID from the ClinGen API.
61+
62+
Results are automatically cached for 24 hours using aiocache with configurable backend.
63+
This significantly reduces repeated API calls when processing variant translations or
64+
running jobs that query the same protein alleles. Cache backend can be switched
65+
between Redis (production) and in-memory (testing) via CLINGEN_CACHE_BACKEND env var.
66+
67+
Args:
68+
clingen_pa_id: ClinGen protein allele ID to query (e.g., PA123456)
69+
70+
Returns:
71+
List of matching registered transcript CA IDs. Returns empty list if no
72+
matching transcripts are found or if the allele doesn't exist (404).
73+
74+
Raises:
75+
requests.exceptions.HTTPError: If the API request fails with non-2xx status code
76+
(excluding 404, which returns empty list).
77+
"""
78+
loop = asyncio.get_running_loop()
79+
response = await loop.run_in_executor(None, requests.get, f"{CLINGEN_API_URL}/{clingen_pa_id}")
80+
81+
# 404 means the allele doesn't exist in ClinGen's registry - treat as "no data" (cacheable)
82+
if response.status_code == 404:
3583
return []
3684

85+
# All other non-2xx status codes raise exceptions (400, 429, 5xx, etc.)
86+
if response.status_code != 200:
87+
response.raise_for_status()
88+
3789
data = response.json()
3890

3991
ca_ids = []
4092
if data.get("aminoAcidAlleles"):
4193
for allele in data["aminoAcidAlleles"]:
4294
if allele.get("matchingRegisteredTranscripts"):
43-
# @id field returns url; the last component is the PA ID
44-
ca_ids.extend([allele["@id"].split("/")[-1] for allele in allele["matchingRegisteredTranscripts"]])
95+
# @id field returns URL; the last component is the transcript CA ID
96+
ca_ids.extend(
97+
[transcript["@id"].split("/")[-1] for transcript in allele["matchingRegisteredTranscripts"]]
98+
)
4599

46100
return ca_ids
47101

48102

49-
def get_associated_clinvar_allele_id(clingen_allele_id: str) -> str | None:
50-
"""Retrieve the associated ClinVar Allele ID for a given ClinGen Allele ID from the ClinGen API."""
51-
response = requests.get(f"{CLINGEN_API_URL}/{clingen_allele_id}")
103+
@cached(ttl=CACHE_TTL_SECONDS, key_builder=clingen_cache_key_builder, cache=CACHE_CLASS, **CACHE_CONFIG)
104+
async def get_associated_clinvar_allele_id(clingen_allele_id: str) -> str:
105+
"""Retrieve the associated ClinVar Allele ID for a given ClinGen Allele ID.
106+
107+
Results are automatically cached for 24 hours using aiocache with configurable backend.
108+
This significantly reduces repeated API calls when refreshing ClinVar controls across
109+
multiple months/years, as each job queries the same ClinGen allele IDs. Cache backend
110+
can be switched between Redis (production) and in-memory (testing) via the
111+
CLINGEN_CACHE_BACKEND environment variable.
112+
113+
Note: Returns empty string when the API call succeeds but no ClinVar association exists,
114+
or when the allele doesn't exist in ClinGen's registry (404). This ensures successful
115+
negative results are cached, which is important since most ClinGen alleles don't have
116+
ClinVar associations. Other API errors (400, 429, 5xx) raise HTTPError, which prevents
117+
caching and allows retries for transient failures or surfaces issues like rate limiting.
118+
119+
Args:
120+
clingen_allele_id: ClinGen allele ID to query (e.g., CA123456)
121+
122+
Returns:
123+
Associated ClinVar allele ID as a string, or empty string if no association exists
124+
or if the allele doesn't exist (404).
125+
126+
Raises:
127+
requests.exceptions.HTTPError: If the API request fails with non-2xx status code
128+
(excluding 404, which returns empty string).
129+
"""
130+
loop = asyncio.get_running_loop()
131+
response = await loop.run_in_executor(None, requests.get, f"{CLINGEN_API_URL}/{clingen_allele_id}")
132+
133+
# 404 means the allele doesn't exist in ClinGen's registry - treat as "no data" (cacheable)
134+
if response.status_code == 404:
135+
return ""
136+
137+
# All other non-2xx status codes raise exceptions (400, 429, 5xx, etc.)
52138
if response.status_code != 200:
53-
logger.error(f"Failed to query ClinGen API for {clingen_allele_id}: {response.status_code}")
54-
return None
139+
response.raise_for_status()
55140

56141
data = response.json()
57142
clinvar_allele_id = data.get("externalRecords", {}).get("ClinVarAlleles", [{}])[0].get("alleleId")
58143
if clinvar_allele_id:
59144
return str(clinvar_allele_id)
60145

61-
return None
146+
return ""

0 commit comments

Comments
 (0)