cdisc-org · SFJohnson24 · May 17, 2026 · May 14, 2026 · May 15, 2026
diff --git a/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py b/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py
@@ -83,6 +83,37 @@ def get_instance_by_path(self, instance: dict, path_list: list) -> dict:
     def get_parent_path(self, path_list: list):
         return list(path_list)[0 : (-1 - int(isinstance(path_list[-1], int)))]
 
+    def _get_ancestor_instance_type(self, path_list: list) -> str:
+        """Walk up the path to find the nearest ancestor's instanceType."""
+        current_path = list(path_list)
+        while current_path:
+            current_path = current_path[:-1]
+            try:
+                ancestor = self.get_instance_by_path(
+                    self.data_service.json, current_path
+                )
+                if isinstance(ancestor, dict) and "instanceType" in ancestor:
+                    return ancestor["instanceType"]
+            except (KeyError, IndexError, TypeError):
+                pass
+        return ""
+
+    def _get_schema_class_name(self, error: exceptions.ValidationError) -> str:
+        """Extract class name from error's schema title or $ref when instanceType is unavailable.
+
+        Only uses the schema ``title`` field or the last segment of a ``$ref`` —
+        avoids inspecting ``absolute_schema_path`` which can return JSON-Schema
+        keywords (e.g. ``"type"``, ``"items"``) as false positives.
+        """
+        schema = error.schema if error.schema else {}
+        # Prefer explicit title (e.g. "AliasCode", "StudyVersion")
+        if title := schema.get("title", ""):
+            return title
+        # Fall back to the class name embedded in a $ref
+        if ref := schema.get("$ref", ""):
+            return ref.split("/")[-1]
+        return ""
+
     def parse_error(
         self,
         error: exceptions.ValidationError,
@@ -95,8 +126,9 @@ def parse_error(
             if error.validator in ["required", "additionalProperties"]
             else (
                 "{}[{}]".format(error.absolute_path[-2], error.absolute_path[-1])
-                if isinstance(error.absolute_path[-1], int)
-                else error.absolute_path[-1]
+                if len(error.absolute_path) >= 2
+                and isinstance(error.absolute_path[-1], int)
+                else (error.absolute_path[-1] if error.absolute_path else "")
             )
         )
         errlist["json_path"].append(error.json_path)
@@ -110,7 +142,17 @@ def parse_error(
             and str(error.instance) in error.message
             else error.message
         )
-        errlist["dataset"].append(errctx.get("instanceType", "") if errctx else "")
+        if errctx:
+            instance_type = (
+                errctx.get("instanceType")
+                or self._get_schema_class_name(error)
+                or self._get_ancestor_instance_type(errpath)
+            )
+        else:
+            instance_type = self._get_schema_class_name(
+                error
+            ) or self._get_ancestor_instance_type(errpath)
+        errlist["dataset"].append(instance_type)
         errlist["id"].append(errctx.get("id", "") if errctx else "")
         errlist["_path"].append("/" + "/".join(map(str, errpath)))
 

diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py
@@ -149,7 +149,9 @@ def validate_single_rule(self, rule: dict):
                     )
                     break
                 if dataset_metadata.unsplit_name in results and "domains" in rule:
-                    include_split = rule["domains"].get("include_split_datasets", False)
+                    include_split = (rule["domains"] or {}).get(
+                        "include_split_datasets", False
+                    )
                     if not include_split:
                         continue  # handling split datasets
                 dataset_results = self.validate_single_dataset(

diff --git a/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py b/tests/unit/test_dataset_builders/test_json_schema_check_dataset_builder.py
@@ -172,3 +172,127 @@ def test_json_schema_check_dataset_builder_invalid():
 
     # Verify that the cache_service.add method was called
     cache_service.add.assert_called_once()
+
+
+def test_json_schema_missing_instance_type_is_reported():
+    """
+    When an entity in a discriminated anyOf union is missing its instanceType,
+    the error must be captured and reported rather than silently dropped
+    (regression for GitHub issue #1546).
+
+    The dataset assignment fallback chain is:
+      1. instanceType from data  →  2. schema title/ref  →  3. nearest ancestor
+    DDF00125 checks for validator == "required" | "additionalProperties", so the
+    inner `required` context error (not the top-level anyOf) must be surfaced with
+    a meaningful dataset assignment.
+    """
+    schema = {
+        "type": "object",
+        "properties": {
+            "study": {"$ref": "#/$defs/Study"},
+        },
+        "$defs": {
+            "Study": {
+                "title": "Study",
+                "type": "object",
+                "properties": {
+                    "instanceType": {"const": "Study"},
+                    "id": {"type": "string"},
+                    "items": {
+                        "type": "array",
+                        "items": {
+                            "anyOf": [
+                                {"$ref": "#/$defs/TypeA"},
+                                {"$ref": "#/$defs/TypeB"},
+                            ]
+                        },
+                    },
+                },
+                "required": ["instanceType", "id"],
+            },
+            "TypeA": {
+                # title is intentionally present so the dataset assignment falls
+                # back to "TypeA" (schema title) rather than "Study" (ancestor).
+                "title": "TypeA",
+                "type": "object",
+                "properties": {
+                    "instanceType": {"const": "TypeA"},
+                    "value": {"type": "string"},
+                },
+                "required": ["instanceType", "value"],
+            },
+            "TypeB": {
+                "title": "TypeB",
+                "type": "object",
+                "properties": {
+                    "instanceType": {"const": "TypeB"},
+                    "data": {"type": "integer"},
+                },
+                "required": ["instanceType", "data"],
+            },
+        },
+    }
+
+    # instanceType is intentionally missing from the nested item
+    instance = {
+        "study": {
+            "instanceType": "Study",
+            "id": "study-1",
+            "items": [
+                {"value": "some-value"},  # missing instanceType
+            ],
+        }
+    }
+
+    data_service = MagicMock()
+    data_service.json = instance
+    data_service.dataset_implementation = PandasDataset
+    data_service.dataset_path = "dummy.json"
+
+    cache_service = MagicMock()
+    cache_service.get.return_value = None
+
+    # Filter to the entity that owns the problematic object.
+    # Because TypeA.title == "TypeA", the dataset column will be set to "TypeA"
+    # (schema title wins over ancestor "Study").  This is the correct value for
+    # DDF00125 to report the issue under the right entity.
+    dataset_metadata = MagicMock()
+    dataset_metadata.name = "TypeA"
+
+    builder = JsonSchemaCheckDatasetBuilder(
+        rule={},
+        data_service=data_service,
+        cache_service=cache_service,
+        rule_processor=MagicMock(),
+        data_processor=MagicMock(),
+        dataset_metadata=dataset_metadata,
+        define_xml_path=None,
+        standard="USDM",
+        standard_version="4.0",
+        standard_substandard=None,
+        library_metadata=LibraryMetadataContainer(standard_schema_definition=schema),
+    )
+
+    ds = builder.get_dataset()
+    rows = ds.data.to_dict(orient="records")
+
+    # 1. Error must not be silently dropped
+    assert len(rows) >= 1, "Missing instanceType error must produce at least one row"
+
+    # 2. dataset must be set to the schema class name ("TypeA"), not empty string.
+    #    This ensures DDF00125 associates the error with the correct entity.
+    assert any(row.get("dataset") == "TypeA" for row in rows), (
+        "Error with missing instanceType must be associated with the schema class "
+        "('TypeA' via schema title), not silently dropped with dataset=''"
+    )
+
+    # 3. The error must have validator == "required" so DDF00125's check condition
+    #    (validator is_contained_by ['required', 'additionalProperties']) fires.
+    assert any(
+        row.get("validator") == "required" for row in rows
+    ), "The surfaced error must have validator='required' so DDF00125 can match it"
+
+    # 4. The path should identify where in the document the problem is
+    assert any(
+        "/study/items/0" in (row.get("_path") or "") for row in rows
+    ), "Path should point to the entity missing instanceType"