evaleval · yananlong · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,8 @@
 # Local data (generated by running adapters)
 # data/
+plan/
+misc/
+*.tmp
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/.tmp/audit_after.json b/.tmp/audit_after.json
@@ -0,0 +1,262 @@
+{
+  "files_scanned": 6448,
+  "results_scanned": 49659,
+  "missing": {
+    "metric_id": 1021,
+    "metric_name": 1021,
+    "metric_kind": 1021,
+    "metric_unit": 1021
+  },
+  "malformed": {},
+  "top_missing_by_benchmark": {
+    "evaluation_result_id": [],
+    "metric_id": [
+      [
+        "fibble_arena",
+        336
+      ],
+      [
+        "helm_classic",
+        201
+      ],
+      [
+        "helm_lite",
+        182
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "helm_capabilities",
+        68
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ],
+      [
+        "tau-bench-2_telecom",
+        15
+      ],
+      [
+        "la_leaderboard",
+        5
+      ],
+      [
+        "theory_of_mind",
+        1
+      ]
+    ],
+    "metric_name": [
+      [
+        "fibble_arena",
+        336
+      ],
+      [
+        "helm_classic",
+        201
+      ],
+      [
+        "helm_lite",
+        182
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "helm_capabilities",
+        68
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ],
+      [
+        "tau-bench-2_telecom",
+        15
+      ],
+      [
+        "la_leaderboard",
+        5
+      ],
+      [
+        "theory_of_mind",
+        1
+      ]
+    ],
+    "metric_kind": [
+      [
+        "fibble_arena",
+        336
+      ],
+      [
+        "helm_classic",
+        201
+      ],
+      [
+        "helm_lite",
+        182
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "helm_capabilities",
+        68
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ],
+      [
+        "tau-bench-2_telecom",
+        15
+      ],
+      [
+        "la_leaderboard",
+        5
+      ],
+      [
+        "theory_of_mind",
+        1
+      ]
+    ],
+    "metric_unit": [
+      [
+        "fibble_arena",
+        336
+      ],
+      [
+        "helm_classic",
+        201
+      ],
+      [
+        "helm_lite",
+        182
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "helm_capabilities",
+        68
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ],
+      [
+        "tau-bench-2_telecom",
+        15
+      ],
+      [
+        "la_leaderboard",
+        5
+      ],
+      [
+        "theory_of_mind",
+        1
+      ]
+    ]
+  }
+}