Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,31 @@ opensyndrome convert --language "Português do Brasil" --edit
opensyndrome convert --validate
```

### Enrich ontology IDs on a JSON definition

The `enrich` command populates `ontology_id` fields on criteria nodes and sets the `@context` to the OpenSyndrome JSON-LD context URL. It queries [EBI OLS4](https://www.ebi.ac.uk/ols4) by default, or uses [text2term](https://ccb-hms.github.io/ontology-mapper/) as an alternative mapper.

```bash
# enrich an existing JSON definition (uses OLS4 by default)
opensyndrome enrich definition.json

# use text2term instead (requires: pip install opensyndrome[text2term])
opensyndrome enrich definition.json --mapper text2term

# review and adjust the result in an editor before printing
opensyndrome enrich definition.json --edit

# enrich and validate in one step
opensyndrome enrich definition.json --validate
```

You can also enrich directly after conversion:

```bash
opensyndrome convert -hr "Any person with fever and rash" --enrich-ontology
opensyndrome convert -hr "Any person with fever and rash" --enrich-ontology --mapper text2term
```

### Convert a machine-readable JSON syndrome definition to a human-readable format

```bash
Expand Down
66 changes: 66 additions & 0 deletions opensyndrome/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
generate_machine_readable_format,
generate_human_readable_format,
)
from opensyndrome.ontology import enrich_definition, MAPPERS
from opensyndrome.artifacts import get_schema_filepath, get_definition_dir
from opensyndrome.validators import validate_machine_readable_format
from opensyndrome.providers import (
Expand Down Expand Up @@ -106,6 +107,18 @@ def wrapper(*args, **kwargs):
is_flag=True,
help="Open editor after generation.",
)
@click.option(
"--enrich-ontology / --no-enrich-ontology",
default=False,
help="Post-process output to populate ontology IDs via EBI OLS4.",
)
@click.option(
"--mapper",
type=click.Choice(MAPPERS),
default="ols",
show_default=True,
help="Ontology mapper to use with --enrich-ontology.",
)
@click.option(
"-hr",
"--human-readable-definition",
Expand All @@ -131,6 +144,8 @@ def convert_to_json(
model,
language,
edit,
enrich_ontology,
mapper,
human_readable_definition,
human_readable_definition_file,
provider,
Expand All @@ -156,6 +171,21 @@ def convert_to_json(
_show_llm_error(exception, provider, model)
return

if enrich_ontology:
click.echo(click.style("Enriching ontology IDs...", fg="cyan"), err=True)

def _progress(name, curie):
click.echo(
click.style(f" {name} → {curie}"),
err=True,
)

machine_readable_definition = enrich_definition(
machine_readable_definition,
mapper=mapper,
verbose_callback=_progress,
)

if edit:
machine_readable_definition_edited = click.edit(
text=json.dumps(machine_readable_definition, indent=4), extension=".json"
Expand All @@ -169,6 +199,42 @@ def convert_to_json(
validate_machine_readable_format_with_style(machine_readable_definition)


@cli.command("enrich")
@click.argument("json_file", type=click.Path(exists=True))
@click.option("--edit", is_flag=True, help="Open editor after enrichment.")
@click.option(
    "--validate", is_flag=True, help="Validate the JSON file against the schema."
)
@click.option(
    "--mapper",
    type=click.Choice(MAPPERS),
    default="ols",
    show_default=True,
    help="Ontology mapper to use.",
)
def enrich_json(json_file, edit, validate, mapper):
    """Populate ontology IDs on an existing JSON definition via EBI OLS4."""
    definition = json.loads(Path(json_file).read_text())
    click.echo(click.style("Enriching ontology IDs...", fg="cyan"), err=True)

    def _report_match(criterion_name, curie):
        # Progress lines go to stderr so stdout stays pure JSON.
        click.echo(click.style(f"  {criterion_name} → {curie}"), err=True)

    definition = enrich_definition(
        definition, mapper=mapper, verbose_callback=_report_match
    )

    if edit:
        edited_text = click.edit(
            text=json.dumps(definition, indent=4), extension=".json"
        )
        # A falsy return means the editor was closed without saving changes.
        if edited_text:
            definition = json.loads(edited_text)

    click.echo(color_json(definition))

    if validate:
        validate_machine_readable_format_with_style(definition)


@cli.command("humanize")
@click.argument("json_file", type=click.Path(exists=True))
@click.option(
Expand Down
161 changes: 161 additions & 0 deletions opensyndrome/ontology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import logging

import requests

logger = logging.getLogger(__name__)

OLS4_SEARCH_URL = "https://www.ebi.ac.uk/ols4/api/search"
OPENSYNDROME_CONTEXT_URL = "https://opensyndrome.org/schema/v1/context.jsonld"
SKIP_TYPES = {"criterion", "demographic_criteria"}
MIN_SCORE = 5.0
TEXT2TERM_MIN_SCORE = 0.5

CRITERION_TYPE_ONTOLOGIES = {
"symptom": ["mondo", "hp"],
"diagnosis": ["mondo", "efo"],
"syndrome": ["mondo", "efo"],
"diagnostic_test": ["loinc", "obi"],
"epidemiological_history": ["hp", "efo"],
"professional_judgment": ["hp", "efo"],
}

MAPPERS = ("ols", "text2term")


def _pick_best(docs: list[dict], name: str) -> str | None:
for doc in docs:
if doc.get("label", "").lower() == name.lower():
return doc["short_form"].replace("_", ":", 1)
for doc in docs:
score = doc.get("score")
if score is not None and float(score) >= MIN_SCORE:
return doc["short_form"].replace("_", ":", 1)
return None


def _search_ols(name: str, ontologies: list[str], timeout: int = 10) -> str | None:
    """Search EBI OLS4 for *name* and return the best-matching CURIE.

    Performs a plain label search first and, if that yields nothing
    acceptable, retries with synonyms included in the queried fields.

    Args:
        name: Criterion name to look up.
        ontologies: Ontology IDs (e.g. ``["mondo", "hp"]``) to restrict to.
        timeout: Per-request timeout in seconds.

    Returns:
        A CURIE string, or ``None`` when nothing matched or a request failed.
    """
    base_params = {"ontology": ",".join(ontologies), "rows": 5, "type": "class"}
    # First pass: default fields; second pass: include synonyms as a fallback.
    attempts = (
        {"q": name, **base_params},
        {"q": name, "queryFields": "label,synonym", **base_params},
    )
    try:
        for params in attempts:
            response = requests.get(OLS4_SEARCH_URL, params=params, timeout=timeout)
            response.raise_for_status()
            docs = response.json().get("response", {}).get("docs", [])
            result = _pick_best(docs, name)
            if result:
                return result
        return None
    except requests.RequestException as exc:
        # Network/HTTP failures are non-fatal: log and report "no match".
        logger.warning("OLS4 search failed for %r: %s", name, exc)
        return None


def _iri_to_curie(iri: str) -> str:
"""Convert an OBO IRI to a CURIE. e.g. http://purl.obolibrary.org/obo/HP_0001945 → HP:0001945"""
local = iri.rstrip("/").rsplit("/", 1)[-1]
return local.replace("_", ":", 1)


def _search_text2term(
    name: str, ontologies: list[str], min_score: float = TEXT2TERM_MIN_SCORE
) -> str | None:
    """Map *name* to a CURIE using text2term, trying each ontology in order.

    Args:
        name: Criterion name to map.
        ontologies: Ontology IDs to try, in priority order.
        min_score: Minimum text2term mapping score to accept.

    Returns:
        The first acceptable CURIE, or ``None`` if no ontology produced one.

    Raises:
        ImportError: If the optional ``text2term`` dependency is missing.
    """
    try:
        import text2term
    except ImportError as exc:
        # Keep the install hint consistent with the README's extras syntax.
        raise ImportError(
            "text2term is not installed. Run: pip install opensyndrome[text2term]"
        ) from exc

    for ontology in ontologies:
        try:
            # Reuse a locally cached ontology when available to avoid re-downloading.
            use_cache = text2term.cache.is_ontology_in_cache(ontology.upper())
            df = text2term.map_terms(
                source_terms=[name],
                target_ontology=ontology.upper(),
                max_mappings=1,
                min_score=min_score,
                use_cache=use_cache,
                excl_deprecated=True,
            )
            if not df.empty:
                return _iri_to_curie(df.iloc[0]["Mapped Term IRI"])
        except Exception as exc:  # text2term raises a variety of exception types
            logger.warning(
                "text2term search failed for %r with %s: %s", name, ontology, exc
            )
    return None


def _collect_enrichable(criteria: list[dict]) -> list[dict]:
    """Recursively gather criteria nodes that still need an ontology ID."""
    found: list[dict] = []
    for node in criteria:
        # Depth-first: children (nested under "values") are collected before
        # the node itself; a null/missing "values" means no children.
        found.extend(_collect_enrichable(node.get("values") or []))
        node_type = node.get("type")
        if (
            node_type
            and node_type not in SKIP_TYPES
            and not node.get("ontology_id")
            and (node.get("name") or "").strip()
        ):
            found.append(node)
    return found


def _apply_mapping(
    criteria: list[dict],
    mapping: dict[str, str],
    verbose_callback=None,
) -> None:
    """Recursively write resolved CURIEs onto criteria nodes, in place.

    Args:
        criteria: Criteria nodes; children are nested under ``"values"``.
        mapping: Criterion name -> CURIE, as produced by the search step.
        verbose_callback: Optional ``fn(name, curie)`` invoked per assignment.
    """
    for criterion in criteria:
        # "values" may be absent or explicitly null; treat both as "no
        # children" (mirrors _collect_enrichable's handling).
        _apply_mapping(criterion.get("values") or [], mapping, verbose_callback)
        if criterion.get("type") in SKIP_TYPES:
            continue
        # Never clobber an ontology_id that is already present (e.g. set
        # manually) — _collect_enrichable deliberately skipped those nodes.
        if criterion.get("ontology_id"):
            continue
        name = criterion.get("name", "")
        if name in mapping:
            criterion["ontology_id"] = mapping[name]
            if verbose_callback:
                verbose_callback(name, mapping[name])


def enrich_definition(
    definition: dict,
    *,
    mapper: str = "ols",
    verbose_callback=None,
) -> dict:
    """Populate ``ontology_id`` fields on a definition's criteria, in place.

    Sets the JSON-LD ``@context`` URL and, for every enrichable criterion
    under the inclusion/exclusion trees, resolves a CURIE via the selected
    mapper ("ols" or "text2term").

    Args:
        definition: Machine-readable syndrome definition (mutated in place).
        mapper: Which mapper backend to use; one of ``MAPPERS``.
        verbose_callback: Optional ``fn(name, curie)`` for progress reporting.

    Returns:
        The same definition dict, after enrichment.

    Raises:
        ValueError: If *mapper* is not a known mapper name.
    """
    if mapper not in MAPPERS:
        raise ValueError(
            f"Unknown mapper {mapper!r}. Choose from: {', '.join(MAPPERS)}"
        )

    lookup = {"ols": _search_ols, "text2term": _search_text2term}[mapper]

    inclusion = definition.get("inclusion_criteria") or []
    exclusion = definition.get("exclusion_criteria") or []
    definition["@context"] = OPENSYNDROME_CONTEXT_URL

    targets = _collect_enrichable(inclusion + exclusion)
    if not targets:
        return definition

    # Resolve each distinct criterion once; ontology shortlist depends on type.
    name_to_curie: dict[str, str] = {}
    for node in targets:
        ontologies = CRITERION_TYPE_ONTOLOGIES.get(
            node["type"], ["hp", "efo", "mondo"]
        )
        curie = lookup(node["name"], ontologies)
        if curie:
            name_to_curie[node["name"]] = curie

    _apply_mapping(inclusion, name_to_curie, verbose_callback)
    _apply_mapping(exclusion, name_to_curie, verbose_callback)

    return definition
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ dependencies = [
"pygments>=2.19.1,<3",
"python-dotenv>=1.0.1,<2",
"pydantic>=2.10.6,<3",
"numpy>=2.3.1,<3",
"pyarrow>=20.0.0,<21",
"ollama>=0.5.1,<0.6",
"requests>=2.32.5",
Expand All @@ -24,6 +23,9 @@ dependencies = [
[project.scripts]
opensyndrome = "opensyndrome.cli:main"

[project.optional-dependencies]
text2term = ["text2term>=4.5.0"]

[dependency-groups]
dev = [
"pre-commit>=4.1.0,<5",
Expand Down
Loading
Loading