Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,37 @@ def get_instance_by_path(self, instance: dict, path_list: list) -> dict:
def get_parent_path(self, path_list: list):
return list(path_list)[0 : (-1 - int(isinstance(path_list[-1], int)))]

def _get_ancestor_instance_type(self, path_list: list) -> str:
"""Walk up the path to find the nearest ancestor's instanceType."""
current_path = list(path_list)
while current_path:
current_path = current_path[:-1]
try:
ancestor = self.get_instance_by_path(
self.data_service.json, current_path
)
if isinstance(ancestor, dict) and "instanceType" in ancestor:
return ancestor["instanceType"]
except (KeyError, IndexError, TypeError):
pass
return ""

def _get_schema_class_name(self, error: exceptions.ValidationError) -> str:
"""Extract class name from error's schema title or $ref when instanceType is unavailable.

Only uses the schema ``title`` field or the last segment of a ``$ref`` —
avoids inspecting ``absolute_schema_path`` which can return JSON-Schema
keywords (e.g. ``"type"``, ``"items"``) as false positives.
"""
schema = error.schema if error.schema else {}
# Prefer explicit title (e.g. "AliasCode", "StudyVersion")
if title := schema.get("title", ""):
return title
# Fall back to the class name embedded in a $ref
if ref := schema.get("$ref", ""):
return ref.split("/")[-1]
return ""

def parse_error(
self,
error: exceptions.ValidationError,
Expand All @@ -95,8 +126,9 @@ def parse_error(
if error.validator in ["required", "additionalProperties"]
else (
"{}[{}]".format(error.absolute_path[-2], error.absolute_path[-1])
if isinstance(error.absolute_path[-1], int)
else error.absolute_path[-1]
if len(error.absolute_path) >= 2
and isinstance(error.absolute_path[-1], int)
else (error.absolute_path[-1] if error.absolute_path else "")
)
)
errlist["json_path"].append(error.json_path)
Expand All @@ -110,7 +142,17 @@ def parse_error(
and str(error.instance) in error.message
else error.message
)
errlist["dataset"].append(errctx.get("instanceType", "") if errctx else "")
if errctx:
instance_type = (
errctx.get("instanceType")
or self._get_schema_class_name(error)
or self._get_ancestor_instance_type(errpath)
)
else:
instance_type = self._get_schema_class_name(
error
) or self._get_ancestor_instance_type(errpath)
errlist["dataset"].append(instance_type)
errlist["id"].append(errctx.get("id", "") if errctx else "")
errlist["_path"].append("/" + "/".join(map(str, errpath)))

Expand Down
4 changes: 3 additions & 1 deletion cdisc_rules_engine/rules_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,9 @@ def validate_single_rule(self, rule: dict):
)
break
if dataset_metadata.unsplit_name in results and "domains" in rule:
include_split = rule["domains"].get("include_split_datasets", False)
include_split = (rule["domains"] or {}).get(
"include_split_datasets", False
)
if not include_split:
continue # handling split datasets
dataset_results = self.validate_single_dataset(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,127 @@ def test_json_schema_check_dataset_builder_invalid():

# Verify that the cache_service.add method was called
cache_service.add.assert_called_once()


def test_json_schema_missing_instance_type_is_reported():
"""
When an entity in a discriminated anyOf union is missing its instanceType,
the error must be captured and reported rather than silently dropped
(regression for GitHub issue #1546).

The dataset assignment fallback chain is:
1. instanceType from data → 2. schema title/ref → 3. nearest ancestor
DDF00125 checks for validator == "required" | "additionalProperties", so the
inner `required` context error (not the top-level anyOf) must be surfaced with
a meaningful dataset assignment.
"""
schema = {
"type": "object",
"properties": {
"study": {"$ref": "#/$defs/Study"},
},
"$defs": {
"Study": {
"title": "Study",
"type": "object",
"properties": {
"instanceType": {"const": "Study"},
"id": {"type": "string"},
"items": {
"type": "array",
"items": {
"anyOf": [
{"$ref": "#/$defs/TypeA"},
{"$ref": "#/$defs/TypeB"},
]
},
},
},
"required": ["instanceType", "id"],
},
"TypeA": {
# title is intentionally present so the dataset assignment falls
# back to "TypeA" (schema title) rather than "Study" (ancestor).
"title": "TypeA",
"type": "object",
"properties": {
"instanceType": {"const": "TypeA"},
"value": {"type": "string"},
},
"required": ["instanceType", "value"],
},
"TypeB": {
"title": "TypeB",
"type": "object",
"properties": {
"instanceType": {"const": "TypeB"},
"data": {"type": "integer"},
},
"required": ["instanceType", "data"],
},
},
}

# instanceType is intentionally missing from the nested item
instance = {
"study": {
"instanceType": "Study",
"id": "study-1",
"items": [
{"value": "some-value"}, # missing instanceType
],
}
}

data_service = MagicMock()
data_service.json = instance
data_service.dataset_implementation = PandasDataset
data_service.dataset_path = "dummy.json"

cache_service = MagicMock()
cache_service.get.return_value = None

# Filter to the entity that owns the problematic object.
# Because TypeA.title == "TypeA", the dataset column will be set to "TypeA"
# (schema title wins over ancestor "Study"). This is the correct value for
# DDF00125 to report the issue under the right entity.
dataset_metadata = MagicMock()
dataset_metadata.name = "TypeA"

builder = JsonSchemaCheckDatasetBuilder(
rule={},
data_service=data_service,
cache_service=cache_service,
rule_processor=MagicMock(),
data_processor=MagicMock(),
dataset_metadata=dataset_metadata,
define_xml_path=None,
standard="USDM",
standard_version="4.0",
standard_substandard=None,
library_metadata=LibraryMetadataContainer(standard_schema_definition=schema),
)

ds = builder.get_dataset()
rows = ds.data.to_dict(orient="records")

# 1. Error must not be silently dropped
assert len(rows) >= 1, "Missing instanceType error must produce at least one row"

# 2. dataset must be set to the schema class name ("TypeA"), not empty string.
# This ensures DDF00125 associates the error with the correct entity.
assert any(row.get("dataset") == "TypeA" for row in rows), (
"Error with missing instanceType must be associated with the schema class "
"('TypeA' via schema title), not silently dropped with dataset=''"
)

# 3. The error must have validator == "required" so DDF00125's check condition
# (validator is_contained_by ['required', 'additionalProperties']) fires.
assert any(
row.get("validator") == "required" for row in rows
), "The surfaced error must have validator='required' so DDF00125 can match it"

# 4. The path should identify where in the document the problem is
assert any(
"/study/items/0" in (row.get("_path") or "") for row in rows
), "Path should point to the entity missing instanceType"
Loading