Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
69215e4
Add optional dedupe_identity block to aggregate schema
yananlong Feb 23, 2026
386fc29
Merge pull request #1 from yananlong/schema-metric-linkage
yananlong Feb 23, 2026
81c73f5
Initial plan
Copilot Feb 23, 2026
4343201
Initial plan
Copilot Feb 23, 2026
41b9310
Remove unsafe hash algorithms (sha1, md5) from dedupe_identity block
Copilot Feb 23, 2026
5786d94
Add if/then constraint requiring hash_algorithm when run_fingerprint …
Copilot Feb 23, 2026
b240a1a
Merge pull request #3 from yananlong/copilot/sub-pr-2
yananlong Feb 23, 2026
fc5e626
Merge pull request #4 from yananlong/copilot/sub-pr-2-again
yananlong Feb 23, 2026
0666632
Update eval.schema.json
yananlong Feb 24, 2026
1db3b90
Merge pull request #2 from yananlong/schema-dedupe-identity
yananlong Feb 24, 2026
855d866
feat: add augment-canonical-identity command for backfilling metric i…
yananlong Apr 18, 2026
0d8cdab
feat: add audit JSON files for benchmark metrics analysis
yananlong Apr 18, 2026
5c1e5d9
Improve canonical identity backfill coverage
yananlong Apr 18, 2026
53cc2aa
Ignore non-aggregate JSON in identity checker
yananlong Apr 18, 2026
c5eb8c5
Merge remote-tracking branch 'origin/main' into copilot/worktree-2026…
yananlong Apr 18, 2026
84ab649
Merge fork/main into copilot branch
yananlong Apr 18, 2026
454e8b9
Enforce strict 0.2.2 schemas and add schema upgrader CLI
yananlong Apr 24, 2026
556e185
Extend canonical backfill and path normalization coverage
yananlong Apr 24, 2026
fcd5a7d
Merge pull request #6 from yananlong/pr1-strict-schema-upgrader
yananlong Apr 24, 2026
5319be0
Merge pull request #7 from yananlong/pr2-canonical-path-normalization
yananlong Apr 24, 2026
fde863c
Merge branch 'evaleval:main' into main
yananlong Apr 24, 2026
b3725e5
Remove tracked temp datastore
yananlong Apr 24, 2026
e0374d0
Merge remote-tracking branch 'fork/main'
yananlong Apr 24, 2026
81d29c4
Merge remote-tracking branch 'fork/pr1-strict-schema-upgrader'
yananlong Apr 28, 2026
fea87e2
Merge remote-tracking branch 'origin/main'
yananlong Apr 28, 2026
568c0a3
Merge branch 'evaleval:main' into main
yananlong Apr 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Local data (generated by running adapters)
# data/
plan/
misc/
*.tmp

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
262 changes: 262 additions & 0 deletions .tmp/audit_after.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
{
"files_scanned": 6448,
"results_scanned": 49659,
"missing": {
"metric_id": 1021,
"metric_name": 1021,
"metric_kind": 1021,
"metric_unit": 1021
},
"malformed": {},
"top_missing_by_benchmark": {
"evaluation_result_id": [],
"metric_id": [
[
"fibble_arena",
336
],
[
"helm_classic",
201
],
[
"helm_lite",
182
],
[
"livecodebenchpro",
87
],
[
"helm_capabilities",
68
],
[
"ace",
32
],
[
"apex-v1",
19
],
[
"appworld_test_normal",
15
],
[
"browsecompplus",
15
],
[
"swe-bench",
15
],
[
"tau-bench-2_airline",
15
],
[
"tau-bench-2_retail",
15
],
[
"tau-bench-2_telecom",
15
],
[
"la_leaderboard",
5
],
[
"theory_of_mind",
1
]
],
"metric_name": [
[
"fibble_arena",
336
],
[
"helm_classic",
201
],
[
"helm_lite",
182
],
[
"livecodebenchpro",
87
],
[
"helm_capabilities",
68
],
[
"ace",
32
],
[
"apex-v1",
19
],
[
"appworld_test_normal",
15
],
[
"browsecompplus",
15
],
[
"swe-bench",
15
],
[
"tau-bench-2_airline",
15
],
[
"tau-bench-2_retail",
15
],
[
"tau-bench-2_telecom",
15
],
[
"la_leaderboard",
5
],
[
"theory_of_mind",
1
]
],
"metric_kind": [
[
"fibble_arena",
336
],
[
"helm_classic",
201
],
[
"helm_lite",
182
],
[
"livecodebenchpro",
87
],
[
"helm_capabilities",
68
],
[
"ace",
32
],
[
"apex-v1",
19
],
[
"appworld_test_normal",
15
],
[
"browsecompplus",
15
],
[
"swe-bench",
15
],
[
"tau-bench-2_airline",
15
],
[
"tau-bench-2_retail",
15
],
[
"tau-bench-2_telecom",
15
],
[
"la_leaderboard",
5
],
[
"theory_of_mind",
1
]
],
"metric_unit": [
[
"fibble_arena",
336
],
[
"helm_classic",
201
],
[
"helm_lite",
182
],
[
"livecodebenchpro",
87
],
[
"helm_capabilities",
68
],
[
"ace",
32
],
[
"apex-v1",
19
],
[
"appworld_test_normal",
15
],
[
"browsecompplus",
15
],
[
"swe-bench",
15
],
[
"tau-bench-2_airline",
15
],
[
"tau-bench-2_retail",
15
],
[
"tau-bench-2_telecom",
15
],
[
"la_leaderboard",
5
],
[
"theory_of_mind",
1
]
]
}
}
Loading