observatory: json_report for accuracy + per_layer_accuracy lenses

quic-boyuc · claude · quic-boyuc · commit 60b93f8fd08c · 2026-05-18T18:22:58.000+08:00
Adds Frontend.json_report() overrides to the two demo lenses, producing
machine-readable Report (JSON) content suited for CI, LLM triage, and
regression detection.

accuracy (_AccuracyFrontend.json_report):
  Aggregates primary metrics (psnr, mse, cosine_sim, top_k, ...) across
  session_records: mean, min, max, and worst_record. Internal _* keys,
  per-sample _min/_max stats, and _worst_idx indices are excluded.
  worst_record semantics: for quality metrics (psnr, cosine_sim, top_k)
  worst_record = argmin (lower value = worse quality); for error metrics
  (mse, abs_err) worst_record = argmax (higher value = worse quality).

per_layer_accuracy (_PerLayerAccuracyFrontend.json_report):
  Reports anchor/target record names, n_layers, sample_source,
  metric_ranges (from analyze() global_data), and worst_layers: top-N
  rows per metric sorted worst-first.
  - psnr / cosine_sim: ascending sort (lower = worse).
  - mse / abs_err: descending sort (higher = worse).
  - Layer identity: uses from_node_root, falling back to target_node,
    matching the real row schema from observe().
  - top_n: read from analysis.global_data["json_report_top_n"] (stored
    by analyze() from config["per_layer_accuracy"]["json_report_top_n"],
    default 10). No live config-stack access at call time.

Tests (test_json_report.py, 17 tests):
  - Framework: invocation count, payload shape, no-ghost-keys for None
    returns, compare-mode archive grouping, export indentation, NaN survival.
  - AccuracyLens: mean/min/max aggregation, internal-key exclusion,
    mse worst_record = argmax, records_measured count.
  - PerLayerAccuracyLens: None when no data, sort direction for psnr
    (ascending) and mse (descending), metric_ranges propagation,
    top_n config knob via analyze().

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
diff --git a/devtools/observatory/lenses/accuracy.py b/devtools/observatory/lenses/accuracy.py
@@ -743,6 +743,68 @@ def dashboard(self, session, session_records, analysis) -> Optional[ViewList]:
             ]
         )
 
+    def json_report(self, session, session_records, analysis) -> Optional[Dict[str, Any]]:
+        """Aggregate accuracy metrics across the session's records.
+
+        Every numeric primary metric found in a record's ``accuracy`` digest
+        yields ``mean``, ``min``, ``max`` (rounded to 4 places) and
+        ``worst_record``: the record holding the maximum for error metrics
+        (``mse``, ``abs_err``) and the minimum for every other metric.
+        Keys starting with ``_`` or ending in ``_min``/``_max``/``_worst_idx``
+        are skipped. Returns ``None`` when no record carries a dict digest.
+        """
+        sums: Dict[str, float] = {}
+        counts: Dict[str, int] = {}
+        mins: Dict[str, float] = {}
+        maxs: Dict[str, float] = {}
+        worst: Dict[str, str] = {}
+        measured = 0
+        # Error metrics: higher value = worse quality → worst_record = argmax.
+        # Quality metrics (psnr, cosine_sim, top_k, ...): lower = worse → argmin.
+        _ERROR_METRICS = {"mse", "abs_err"}
+
+        for rec in session_records or []:
+            digest = rec.data.get("accuracy")
+            if not isinstance(digest, dict):
+                continue
+            measured += 1
+            for k, v in digest.items():
+                if not isinstance(v, (int, float)) or isinstance(v, bool):
+                    continue
+                if k.startswith("_") or k.endswith(("_min", "_max", "_worst_idx")):
+                    continue
+                x = float(v)
+                sums[k] = sums.get(k, 0.0) + x
+                counts[k] = counts.get(k, 0) + 1
+                # Seed each extremum from the first sample (replacing a NaN seed with
+                # a real value) rather than an inf sentinel: an all-inf or all-NaN
+                # metric never beat the sentinel, so the report raised KeyError.
+                lo, hi = mins.get(k), maxs.get(k)
+                if lo is None or x < lo or (lo != lo and x == x):
+                    mins[k] = x
+                    if k not in _ERROR_METRICS:
+                        worst[k] = rec.name
+                if hi is None or x > hi or (hi != hi and x == x):
+                    maxs[k] = x
+                    if k in _ERROR_METRICS:
+                        worst[k] = rec.name
+
+        if measured == 0:
+            return None
+
+        return {
+            "records_measured": measured,
+            "metrics": {
+                k: {
+                    "mean": round(sums[k] / counts[k], 4),
+                    "min": round(mins[k], 4),
+                    "max": round(maxs[k], 4),
+                    "worst_record": worst[k],
+                }
+                for k in sorted(sums)
+            },
+        }
+
     def record(
         self, digest: Any, analysis: Dict[str, Any], context: Dict[str, Any]
     ) -> Optional[ViewList]:
diff --git a/devtools/observatory/lenses/per_layer_accuracy.py b/devtools/observatory/lenses/per_layer_accuracy.py
@@ -566,6 +566,43 @@ def _metric_specs() -> Dict[str, Dict[str, Any]]:
             },
         }
 
+    @classmethod
+    def _build_context_extension(
+        cls,
+        rows: List[Dict[str, Any]],
+    ) -> GraphExtension:
+        """Build the ``ctx`` graph extension holding per-node identity fields.
+
+        Adds one node entry per row, keyed by the stringified ``target_node``.
+        Identity/context values (node names, topo indices, shapes, key kind,
+        root name, compared-element count, match key) are stored here once
+        so the individual metric extensions do not have to repeat them.
+        """
+        # (stored key, row key, fallback when the row lacks that key)
+        fields = (
+            ("sparse_match_key", "match_key", ""),
+            ("key_kind", "key_kind", ""),
+            ("from_node_root", "from_node_root", ""),
+            ("anchor_node", "anchor_node", ""),
+            ("target_node", "target_node", ""),
+            ("anchor_topo_index", "anchor_topo_index", -1),
+            ("target_topo_index", "target_topo_index", -1),
+            ("numel_compared", "numel_compared", 0),
+            ("anchor_shape", "anchor_shape", "n/a"),
+            ("target_shape", "target_shape", "n/a"),
+        )
+        context_ext = GraphExtension(id="ctx", name="Per-Layer Accuracy Context")
+        for entry in rows:
+            # Indexing (not .get) keeps the KeyError for a row without a target.
+            target_id = str(entry["target_node"])
+            context_ext.add_node_data(
+                target_id,
+                {out: entry.get(src, default) for out, src, default in fields},
+            )
+
+        context_ext.set_sync_key("sparse_match_key")
+        return context_ext
+
     @classmethod
     def _build_metric_extension(
         cls,
@@ -581,23 +618,13 @@ def _build_metric_extension(
         ext = GraphExtension(id=metric_name, name=str(spec["name"]))
         for row in rows:
             node_id = str(row["target_node"])
-            info = {
-                "sparse_match_key": row.get("match_key", ""),
-                "key_kind": row.get("key_kind", ""),
-                "from_node_root": row.get("from_node_root", ""),
-                "anchor_node": row.get("anchor_node", ""),
-                "target_node": row.get("target_node", ""),
-                "anchor_topo_index": row.get("anchor_topo_index", -1),
-                "target_topo_index": row.get("target_topo_index", -1),
-                "numel_compared": row.get("numel_compared", 0),
-                "anchor_shape": row.get("anchor_shape", "n/a"),
-                "target_shape": row.get("target_shape", "n/a"),
-                "psnr": cls._safe_float(row.get("psnr", 0.0)),
-                "cosine_sim": cls._safe_float(row.get("cosine_sim", 0.0)),
-                "mse": cls._safe_float(row.get("mse", 0.0)),
-                "abs_err": cls._safe_float(row.get("abs_err", 0.0)),
-            }
-            ext.add_node_data(node_id, info)
+            ext.add_node_data(
+                node_id,
+                {
+                    "sparse_match_key": row.get("match_key", ""),
+                    metric_name: cls._safe_float(row.get(metric_name, 0.0)),
+                },
+            )
 
         ext.set_sync_key("sparse_match_key")
 
@@ -611,13 +638,16 @@ def _label_formatter(d: Dict[str, Any]) -> List[str]:
             primary_label = str(spec["label"])
             return [f"{primary_label}={_format_metric_value(primary)}"]
 
+        # All metric extensions opt into label formatters; the run-time LRU
+        # in the JS canvas (cap = MAX_LABEL_EXTENSIONS = 2) keeps at most two
+        # active at once, and the build-time bbox reservation in
+        # FXGraphExporter sizes nodes for two label rows.
         ext.set_label_formatter(_label_formatter)
 
         def _tooltip_formatter(d: Dict[str, Any]) -> List[str]:
             primary = cls._safe_float(d.get(metric_name, 0.0))
             primary_label = str(spec["label"])
             return [
-                f"target_node={d.get('target_node', 'n/a')}",
                 f"match_key={d.get('sparse_match_key', '')}",
                 f"{primary_label}={_format_metric_value(primary, tooltip=True)}",
             ]
@@ -674,6 +704,12 @@ def analyze(records: List[RecordDigest], config: Dict[str, Any]) -> AnalysisResu
         metric_ranges = PerLayerAccuracyLens._aggregate_metric_ranges(records)
         if metric_ranges:
             result.global_data["metric_ranges"] = metric_ranges
+        # Stash json_report_top_n so json_report can read it from analysis.global_data
+        # without touching the live config stack (which may be empty at render time).
+        top_n = int(
+            (config.get("per_layer_accuracy") or {}).get("json_report_top_n", 10)
+        )
+        result.global_data["json_report_top_n"] = top_n
 
         for record in records:
             digest = record.data.get("per_layer_accuracy")
@@ -691,11 +727,18 @@ def analyze(records: List[RecordDigest], config: Dict[str, Any]) -> AnalysisResu
                 }
             )
 
+            # Identity context, contributed once per node, regardless of which
+            # metric layers the user has active. Replaces the per-metric-ext
+            # duplication of identity fields.
+            analysis.add_graph_layer(
+                "ctx", PerLayerAccuracyLens._build_context_extension(rows)
+            )
+
             for metric_name in ("cosine_sim", "psnr", "mse", "abs_err"):
                 r = metric_ranges.get(metric_name)
                 fixed_range = (r[0], r[1]) if r else None
                 metric_ext = PerLayerAccuracyLens._build_metric_extension(
-                    rows, metric_name, fixed_range=fixed_range
+                    rows, metric_name, fixed_range=fixed_range,
                 )
                 analysis.add_graph_layer(metric_name, metric_ext)
 
@@ -924,7 +967,10 @@ def record(
                     id="per_layer_accuracy_graph",
                     title="Per-layer Accuracy Graph",
                     graph_ref=graph_ref,
-                    default_layers=[f"{lens_name}/cosine_sim"],
+                    default_layers=[
+                        f"{lens_name}/ctx",
+                        f"{lens_name}/cosine_sim",
+                    ],
                     default_color_by=f"{lens_name}/cosine_sim",
                     compare=GraphCompareSpec(
                         default_sync={
@@ -955,6 +1001,73 @@ def check_badges(
                 ]
             return []
 
+        def json_report(
+            self, session, session_records, analysis
+        ) -> Optional[Dict[str, Any]]:
+            """Machine-readable per-layer accuracy summary for Report (JSON).
+
+            Uses the first record in ``session_records`` whose
+            ``per_layer_accuracy`` digest has rows (``None`` if there is none):
+
+            - ``anchor``/``target``: record names
+            - ``n_layers``: number of rows in that digest
+            - ``sample_source``: copied from the digest
+            - ``metric_ranges``: taken from ``analysis.global_data``
+            - ``worst_layers``: {metric: [top-N rows sorted worst-first]};
+              rows whose metric is NaN sort ahead of every numeric value
+
+            Top-N depth is ``analysis.global_data["json_report_top_n"]``
+            (default 10), which ``analyze()`` copies from the lens config.
+            """
+            target_record = next(
+                (
+                    r
+                    for r in session_records or []
+                    if isinstance(r.data.get("per_layer_accuracy"), dict)
+                    and r.data["per_layer_accuracy"].get("rows")
+                ),
+                None,
+            )
+            if target_record is None:
+                return None
+
+            digest = target_record.data["per_layer_accuracy"]
+            rows = digest.get("rows") or []
+            global_data = analysis.global_data or {}
+            metric_ranges = global_data.get("metric_ranges") or {}
+            top_n = int(global_data.get("json_report_top_n", 10))
+
+            worst: Dict[str, Any] = {}
+            for metric in ("psnr", "cosine_sim", "mse", "abs_err"):
+                ranked = [r for r in rows if isinstance(r.get(metric), (int, float))]
+                # psnr / cosine_sim: lower == worse (ascending = worst first).
+                # mse / abs_err: higher == worse, so rank on the negated value.
+                # NaN never orders against other floats and would scramble a
+                # plain sort, so NaN rows are ranked first (v == v is False).
+                sign = -1 if metric in ("mse", "abs_err") else 1
+                ranked.sort(key=lambda r: (r[metric] == r[metric], sign * r[metric]))
+                entry = []
+                for r in ranked[:top_n]:
+                    # Layer identity: prefer the human-readable from_node_root;
+                    # fall back to the target_node id for unrooted nodes.
+                    layer_id = r.get("from_node_root") or r.get("target_node", "?")
+                    row: Dict[str, Any] = {"layer": layer_id}
+                    for m in ("psnr", "cosine_sim", "mse", "abs_err"):
+                        v = r.get(m)
+                        if isinstance(v, (int, float)):
+                            row[m] = round(float(v), 4)
+                    entry.append(row)
+                worst[metric] = entry
+
+            return {
+                "anchor": digest.get("anchor_record"),
+                "target": target_record.name,
+                "n_layers": len(rows),
+                "sample_source": digest.get("sample_source"),
+                "metric_ranges": metric_ranges,
+                "worst_layers": worst,
+            }
+
     @staticmethod
     def get_frontend_spec() -> Frontend:
         return PerLayerAccuracyLens._PerLayerAccuracyFrontend()
diff --git a/devtools/observatory/tests/test_json_report.py b/devtools/observatory/tests/test_json_report.py