Skip to content
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,31 @@ opensyndrome convert --language "Português do Brasil" --edit
opensyndrome convert --validate
```

### Enrich ontology IDs on a JSON definition

The `enrich` command populates `ontology_id` fields on criteria nodes and sets the `@context` to the OpenSyndrome JSON-LD context URL. By default it looks terms up in [EBI OLS4](https://www.ebi.ac.uk/ols4); pass `--mapper text2term` to use [text2term](https://ccb-hms.github.io/ontology-mapper/) as the mapper instead.

```bash
# enrich an existing JSON definition (uses OLS4 by default)
opensyndrome enrich definition.json

# use text2term instead (requires: pip install opensyndrome[text2term])
opensyndrome enrich definition.json --mapper text2term

# review and adjust the result in an editor before printing
opensyndrome enrich definition.json --edit

# enrich and validate in one step
opensyndrome enrich definition.json --validate
```

You can also enrich directly after conversion:

```bash
opensyndrome convert -hr "Any person with fever and rash" --enrich-ontology
opensyndrome convert -hr "Any person with fever and rash" --enrich-ontology --mapper text2term
```

### Convert a machine-readable JSON syndrome definition to a human-readable format

```bash
Expand All @@ -118,6 +143,12 @@ To get started with development, you need to have [uv](https://docs.astral.sh/uv
uv sync
```

To include the optional `text2term` mapper (and its `bioregistry` dependency) so the full test suite runs without skips:

```bash
uv sync --all-extras
```

### Generate Ollama-compatible JSON

> You only need to do this if you are a maintainer adding a new OSI schema or updating an existing one.
Expand Down
66 changes: 66 additions & 0 deletions opensyndrome/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
generate_machine_readable_format,
generate_human_readable_format,
)
from opensyndrome.ontology import enrich_definition, MAPPERS
from opensyndrome.artifacts import get_schema_filepath, get_definition_dir
from opensyndrome.validators import validate_machine_readable_format
from opensyndrome.providers import (
Expand Down Expand Up @@ -106,6 +107,18 @@ def wrapper(*args, **kwargs):
is_flag=True,
help="Open editor after generation.",
)
@click.option(
"--enrich-ontology / --no-enrich-ontology",
default=False,
help="Post-process output to populate ontology IDs.",
)
@click.option(
"--mapper",
type=click.Choice(MAPPERS),
default="ols",
show_default=True,
help="Ontology mapper to use with --enrich-ontology.",
)
@click.option(
"-hr",
"--human-readable-definition",
Expand All @@ -131,6 +144,8 @@ def convert_to_json(
model,
language,
edit,
enrich_ontology,
mapper,
human_readable_definition,
human_readable_definition_file,
provider,
Expand All @@ -156,6 +171,21 @@ def convert_to_json(
_show_llm_error(exception, provider, model)
return

if enrich_ontology:
click.echo(click.style("Enriching ontology IDs...", fg="cyan"), err=True)

def _progress(name, curie):
click.echo(
click.style(f" {name} → {curie}"),
err=True,
)

machine_readable_definition = enrich_definition(
machine_readable_definition,
mapper=mapper,
verbose_callback=_progress,
)

if edit:
machine_readable_definition_edited = click.edit(
text=json.dumps(machine_readable_definition, indent=4), extension=".json"
Expand All @@ -169,6 +199,42 @@ def convert_to_json(
validate_machine_readable_format_with_style(machine_readable_definition)


@cli.command("enrich")
@click.argument("json_file", type=click.Path(exists=True))
@click.option("--edit", is_flag=True, help="Open editor after enrichment.")
@click.option(
    "--validate", is_flag=True, help="Validate the JSON file against the schema."
)
@click.option(
    "--mapper",
    type=click.Choice(MAPPERS),
    default="ols",
    show_default=True,
    help="Ontology mapper to use.",
)
def enrich_json(json_file, edit, validate, mapper):
    """Populate ontology IDs on an existing JSON definition."""
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes live in comments instead.
    #
    # Flow: load the file, enrich it with the chosen mapper, optionally let the
    # user edit the result, print it, then optionally validate it.

    # Read raw bytes and let json.loads detect the encoding (UTF-8/16/32, with
    # or without BOM). Path.read_text() would decode with the locale's default
    # encoding, which breaks non-ASCII definitions on non-UTF-8 systems.
    definition = json.loads(Path(json_file).read_bytes())

    # Progress goes to stderr so stdout carries only the resulting JSON.
    click.echo(click.style("Enriching ontology IDs...", fg="cyan"), err=True)

    def _progress(name, curie):
        # Called once per criterion that receives an ontology ID.
        click.echo(click.style(f" {name} → {curie}"), err=True)

    definition = enrich_definition(
        definition, mapper=mapper, verbose_callback=_progress
    )

    if edit:
        edited = click.edit(text=json.dumps(definition, indent=4), extension=".json")
        # A falsy result from click.edit keeps the enriched definition as is.
        if edited:
            definition = json.loads(edited)

    click.echo(color_json(definition))

    if validate:
        validate_machine_readable_format_with_style(definition)


@cli.command("humanize")
@click.argument("json_file", type=click.Path(exists=True))
@click.option(
Expand Down
196 changes: 196 additions & 0 deletions opensyndrome/ontology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
import logging
from typing import Literal, get_args

import requests
from ols_client import EBIClient

# Module-level logger; the search helpers below use it to report failed lookups.
logger = logging.getLogger(__name__)

# Value written to the definition's "@context" key by enrich_definition().
OPENSYNDROME_CONTEXT_URL = "https://opensyndrome.org/schema/v1/context.jsonld"
# Criterion types that are never enriched: skipped both when collecting
# candidates and when applying the resulting mapping.
SKIP_TYPES = {"criterion", "demographic_criteria"}
# Default `min_score` that _search_text2term() passes to text2term.map_terms().
TEXT2TERM_MIN_SCORE = 0.5

# Shared OLS client, created once at import time and used by _search_ols().
ols = EBIClient()

# Ontologies to search for each criterion type. _search_ols() sends the list as
# one comma-separated filter; _search_text2term() tries the entries in order and
# keeps the first match. Types missing from this table fall back to
# ["hp", "efo", "mondo"] in enrich_definition().
CRITERION_TYPE_ONTOLOGIES = {
"symptom": ["mondo", "hp"],
"diagnosis": ["mondo", "efo"],
"syndrome": ["mondo", "efo"],
"diagnostic_test": ["loinc", "obi"],
"epidemiological_history": ["hp", "efo"],
"professional_judgment": ["hp", "efo"],
}

# Names of the supported mappers. MAPPERS is derived from the alias's Literal
# arguments, so the runtime tuple and the type annotation cannot drift apart.
type Mapper = Literal["ols", "text2term"]
MAPPERS: tuple[Mapper, ...] = get_args(Mapper.__value__)


def _pick_best(docs: list[dict], name: str) -> str | None:
for doc in docs:
if doc.get("label", "").lower() == name.lower():
return doc["obo_id"]
return None


def _search_ols(queries: dict[str, list[str]], timeout: int = 10) -> dict[str, str]:
    """Resolve each name to a CURIE through the OLS4 ``/search`` endpoint.

    Every name is searched on its own. A request failure for one name is
    logged as a warning and does not affect the others.

    Args:
        queries: Maps each term name to the ontologies to restrict the search to.
        timeout: Passed through to every ``ols.get_json`` call.

    Returns:
        A dict from name to CURIE containing only the names that matched.
    """
    found: dict[str, str] = {}
    for term, ontology_ids in queries.items():
        default_params = {
            "q": term,
            "ontology": ",".join(ontology_ids),
            "rows": 5,
            "type": "class",
        }
        # First try the plain search; if no hit has a matching label, retry
        # once with queryFields set to "label,synonym".
        attempts = (
            default_params,
            {**default_params, "queryFields": "label,synonym"},
        )
        try:
            for params in attempts:
                response = ols.get_json("/search", params=params, timeout=timeout)
                hits = response.get("response", {}).get("docs", [])
                match = _pick_best(hits, term)
                if match:
                    found[term] = match
                    break
        except requests.RequestException as exc:
            logger.warning("OLS4 search failed for %r: %s", term, exc)
    return found


def _iri_to_curie(iri: str) -> str | None:
    """Turn an ontology IRI into a CURIE, or return ``None`` if it cannot be parsed.

    Example: http://purl.obolibrary.org/obo/HP_0001945 → HP:0001945
    """
    # Imported lazily: bioregistry is only installed with the text2term extra.
    import bioregistry

    try:
        prefix, local_id = bioregistry.parse_iri(iri, use_preferred=True)
    except TypeError:
        # bioregistry v0.11.35 raises on unparseable IRIs instead of
        # returning (None, None).
        return None

    # bioregistry falls back to the generic "obo" prefix for OBO PURLs it does
    # not recognize (e.g. LOINC_2345-7, UNKNOWN_42); those would become
    # meaningless OBO:* CURIEs, so they are rejected together with empty prefixes.
    generic_obo_fallback = bool(prefix) and prefix.lower() == "obo" and "_" in local_id
    if not prefix or generic_obo_fallback:
        return None
    return f"{prefix.upper()}:{local_id}"


def _search_text2term(
    queries: dict[str, list[str]],
    min_score: float = TEXT2TERM_MIN_SCORE,
) -> dict[str, str]:
    """Batch-map names to CURIEs via text2term, loading each ontology at most once.

    Args:
        queries: Maps each term name to the ontologies to try, in priority order.
        min_score: Passed to ``text2term.map_terms`` as ``min_score``.

    Returns:
        A dict from name to CURIE. For each name, the first ontology in its
        list that produced a convertible match wins; unmatched names are absent.

    Raises:
        ImportError: If the optional ``text2term`` package is not installed.
    """
    try:
        import text2term
    except ImportError:
        # Point at the project's extra rather than the bare package: the extra
        # is the documented install path and also pulls in the bioregistry
        # version that _iri_to_curie() depends on.
        raise ImportError(
            "text2term is not installed. Run: pip install opensyndrome[text2term]"
        ) from None

    # Invert the queries so each ontology is mapped once with every name that
    # wants it. Ontology names are upper-cased here and again on lookup below.
    names_per_ontology: dict[str, set[str]] = {}
    for name, ontologies in queries.items():
        for ontology in ontologies:
            names_per_ontology.setdefault(ontology.upper(), set()).add(name)

    matches_per_ontology: dict[str, dict[str, str]] = {}
    for ontology, names in names_per_ontology.items():
        try:
            # Only ask text2term to use its cache when one exists for this ontology.
            use_cache = text2term.cache_exists(ontology)
            df = text2term.map_terms(
                source_terms=sorted(names),
                target_ontology=ontology,
                max_mappings=1,
                min_score=min_score,
                use_cache=use_cache,
                excl_deprecated=True,
            )
            ontology_matches: dict[str, str] = {}
            for _, row in df.iterrows():
                curie = _iri_to_curie(row["Mapped Term IRI"])
                if curie:
                    ontology_matches[row["Source Term"]] = curie
            matches_per_ontology[ontology] = ontology_matches
        except (requests.RequestException, OSError, RuntimeError, ValueError) as exc:
            # A failing ontology is logged and skipped so the remaining
            # ontologies can still contribute matches.
            logger.warning(
                "text2term batch failed for %s (%d terms): %s",
                ontology,
                len(names),
                exc,
            )

    # Resolve each name against its ontologies in the order given by the caller.
    result: dict[str, str] = {}
    for name, ontologies in queries.items():
        for ontology in ontologies:
            curie = matches_per_ontology.get(ontology.upper(), {}).get(name)
            if curie:
                result[name] = curie
                break
    return result


def _collect_enrichable(criteria: list[dict]) -> list[dict]:
result = []
for criterion in criteria:
result.extend(_collect_enrichable(criterion.get("values") or []))
type_ = criterion.get("type")
if not type_ or type_ in SKIP_TYPES:
continue
if criterion.get("ontology_id"):
continue
if not (criterion.get("name") or "").strip():
continue
result.append(criterion)
return result


def _apply_mapping(
criteria: list[dict],
mapping: dict[str, str],
verbose_callback=None,
) -> None:
for criterion in criteria:
_apply_mapping(criterion.get("values", []), mapping, verbose_callback)
if criterion.get("type") in SKIP_TYPES:
continue
name = criterion.get("name", "")
if name in mapping:
criterion["ontology_id"] = mapping[name]
if verbose_callback:
verbose_callback(name, mapping[name])


def enrich_definition(
    definition: dict,
    *,
    mapper: Mapper = "ols",
    verbose_callback=None,
) -> dict:
    """Set ``@context`` on a definition and fill in missing ontology IDs.

    The definition is modified in place and the same object is returned.

    Args:
        definition: Syndrome definition; its ``inclusion_criteria`` and
            ``exclusion_criteria`` lists are searched for criteria to enrich.
        mapper: Which lookup backend to use; must be one of ``MAPPERS``.
        verbose_callback: Optional ``callback(name, curie)`` invoked for each
            criterion that receives an ID.

    Returns:
        The (mutated) ``definition``.

    Raises:
        ValueError: If ``mapper`` is not one of ``MAPPERS``.
    """
    if mapper not in MAPPERS:
        raise ValueError(
            f"Unknown mapper {mapper!r}. Choose from: {', '.join(MAPPERS)}"
        )

    if mapper == "ols":
        lookup = _search_ols
    else:
        lookup = _search_text2term

    included = definition.get("inclusion_criteria") or []
    excluded = definition.get("exclusion_criteria") or []
    # The context is set even when nothing ends up being enriched.
    definition["@context"] = OPENSYNDROME_CONTEXT_URL

    pending = _collect_enrichable(included + excluded)
    if pending:
        # One query per distinct name; if a name occurs with several types,
        # the ontologies of the last occurrence are used.
        queries: dict[str, list[str]] = {}
        for criterion in pending:
            queries[criterion["name"]] = CRITERION_TYPE_ONTOLOGIES.get(
                criterion["type"], ["hp", "efo", "mondo"]
            )
        mapping = lookup(queries)

        _apply_mapping(included, mapping, verbose_callback)
        _apply_mapping(excluded, mapping, verbose_callback)

    return definition
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,22 @@ dependencies = [
"pygments>=2.19.1,<3",
"python-dotenv>=1.0.1,<2",
"pydantic>=2.10.6,<3",
"numpy>=2.3.1,<3",
"pyarrow>=20.0.0,<21",
"ollama>=0.5.1,<0.6",
"requests>=2.32.5",
"polars>=1.38.1",
"datamodel-code-generator>=0.55.0",
"litellm>=1.82.2",
"instructor>=1.14.5",
"ols-client>=0.2.1",
]

[project.scripts]
opensyndrome = "opensyndrome.cli:main"

[project.optional-dependencies]
text2term = ["text2term>=4.5.0", "bioregistry>=0.11.35"]

[dependency-groups]
dev = [
"pre-commit>=4.1.0,<5",
Expand Down
Loading
Loading