|
50 | 50 | {"id":"CodeContextBench-6yj","title":"US-001: Create sec-cve-001 curl CVE task","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T15:56:28.996905534Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T16:01:20.995454609Z","closed_at":"2026-02-16T16:01:20.995454609Z","close_reason":"US-001 complete: sec-cve-001 curl CVE task created with all files, registered in config"} |
51 | 51 | {"id":"CodeContextBench-6z8","title":"US-002: Create sec-cve-002 Envoy CVE triage task","status":"in_progress","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T16:02:22.818276011Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T16:02:31.759622574Z"} |
52 | 52 | {"id":"CodeContextBench-7ao","title":"Redesign 9 over-hinted task instructions to remove file/method/line hints","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T20:25:30.198399183Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T20:29:45.648638308Z","closed_at":"2026-02-15T20:29:45.648638308Z","close_reason":"Redesigned 9 over-hinted task instructions to remove file/method/line hints"} |
53 | | -{"id":"CodeContextBench-7dg","title":"Epic: Design experiments to realistically demonstrate MCP code search value","description":"The current benchmark mounts full repos at /workspace/, making SG tools redundant (0.001 avg delta across 70 tasks). Need new experiment designs where SG tools provide genuine, measurable value. Three tiers: (1) Cross-repo investigation tasks exercising SG-unique capabilities, (2) Mono-repo package isolation removing full-repo local access, (3) Blind-bug task variants requiring discovery. This is the project's core mission.","status":"closed","priority":0,"issue_type":"feature","owner":"locobench@anthropic.com","created_at":"2026-02-07T13:00:03.057412295Z","created_by":"LoCoBench Bot","updated_at":"2026-02-12T10:29:00.209019489Z","closed_at":"2026-02-12T10:29:00.209019489Z","close_reason":"All 3 design experiments completed: (1) sourcegraph_isolated — docs/DESIGN_sourcegraph_isolated.md, (2) blind-bug variants — docs/DESIGN_blind_bug_variants.md, (3) investigation tasks already run and analyzed (a06). Implementation is next phase.","dependencies":[{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-fph","type":"blocks","created_at":"2026-02-07T13:00:38.013992901Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-a06","type":"blocks","created_at":"2026-02-07T13:00:37.902203528Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-8hb","type":"blocks","created_at":"2026-02-07T13:00:37.957011767Z","created_by":"LoCoBench Bot"}]} |
| 53 | +{"id":"CodeContextBench-7dg","title":"Epic: Design experiments to realistically demonstrate MCP code search value","description":"The current benchmark mounts full repos at /workspace/, making SG tools redundant (0.001 avg delta across 70 tasks). Need new experiment designs where SG tools provide genuine, measurable value. Three tiers: (1) Cross-repo investigation tasks exercising SG-unique capabilities, (2) Mono-repo package isolation removing full-repo local access, (3) Blind-bug task variants requiring discovery. This is the project's core mission.","status":"closed","priority":0,"issue_type":"feature","owner":"locobench@anthropic.com","created_at":"2026-02-07T13:00:03.057412295Z","created_by":"LoCoBench Bot","updated_at":"2026-02-12T10:29:00.209019489Z","closed_at":"2026-02-12T10:29:00.209019489Z","close_reason":"All 3 design experiments completed: (1) sourcegraph_isolated — docs/DESIGN_sourcegraph_isolated.md, (2) blind-bug variants — docs/DESIGN_blind_bug_variants.md, (3) investigation tasks already run and analyzed (a06). Implementation is next phase.","dependencies":[{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-a06","type":"blocks","created_at":"2026-02-07T13:00:37.902203528Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-8hb","type":"blocks","created_at":"2026-02-07T13:00:37.957011767Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-fph","type":"blocks","created_at":"2026-02-07T13:00:38.013992901Z","created_by":"LoCoBench Bot"}]} |
54 | 54 | {"id":"CodeContextBench-7dv","title":"US-005: Run 3x SWE-bench Pro variance trials","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-12T08:39:08.439026641Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:39:42.106430509Z","closed_at":"2026-02-16T01:39:42.106430509Z","close_reason":"Stale - SWE-bench Pro variance trials not aligned with current enterprise largerepo focus"} |
55 | 55 | {"id":"CodeContextBench-7k8","title":"US-009: Implement governance evaluator and register governance tasks","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T14:51:10.627929455Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T14:54:25.91681249Z","closed_at":"2026-02-15T14:54:25.91681249Z","close_reason":"US-009 implemented: governance_evaluator.py + 6 tasks registered"} |
56 | 56 | {"id":"CodeContextBench-7r2","title":"US-004: Archive saturated ccb_repoqa benchmark","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T15:18:02.671684683Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T15:21:09.337735187Z","closed_at":"2026-02-16T15:21:09.337735187Z","close_reason":"Archived ccb_repoqa: moved to benchmarks/archive/, removed from task selection, updated README"} |
|
131 | 131 | {"id":"CodeContextBench-qtf","title":"US-007 Add transcript discovery abstraction for non-Claude artifacts","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:33:11.290586345Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T04:00:12.261541882Z","closed_at":"2026-02-17T04:00:12.261541882Z","close_reason":"done"} |
132 | 132 | {"id":"CodeContextBench-r71","title":"CrossRepo: all runs invalid due to verifier path bug","description":"All 8 CrossRepo runs (4 tasks × 2 configs) crashed because test.sh referenced /task/tests/expected_changes.json instead of /tests/expected_changes.json. Verifier is now fixed locally but all existing runs predate the fix. Agents produced meaningful output (261-line patch, 224-line analysis, 497-line reasoning). All 4 tasks need reruns.","status":"closed","priority":1,"issue_type":"bug","owner":"locobench@anthropic.com","created_at":"2026-02-06T22:03:15.909834308Z","created_by":"LoCoBench Bot","updated_at":"2026-02-07T18:39:18.97810564Z","closed_at":"2026-02-07T18:39:18.97810564Z","close_reason":"CrossRepo all 3 configs rerun complete: baseline avg=0.571, SG_base avg=0.587, SG_full avg=0.387"} |
133 | 133 | {"id":"CodeContextBench-rch","title":"US-019: Scaffold enterprise multi-team and conflicting-docs tasks","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T15:00:15.881711075Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T15:08:20.111033768Z","closed_at":"2026-02-15T15:08:20.111033768Z","close_reason":"US-019 complete: 3 enterprise tasks scaffolded"} |
134 | | -{"id":"CodeContextBench-rej","title":"Generate aggregate CCB evaluation report (updated: 12 benchmarks, no LoCoBench)","description":"After all benchmark runs complete and MANIFEST is clean, generate the aggregate evaluation report using python3 scripts/generate_report.py. Should cover all 13 benchmarks with 3-config comparison (baseline vs SG_base vs SG_full), MCP impact analysis, and per-benchmark breakdowns.","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:50:31.544649793Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:39:42.151139942Z","closed_at":"2026-02-16T01:39:42.151139942Z","close_reason":"Stale - aggregate report design predates enterprise task expansion and IR pipeline work. Needs redesign.","dependencies":[{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-aot","type":"blocks","created_at":"2026-02-06T14:50:47.565065613Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-kph","type":"blocks","created_at":"2026-02-06T14:50:47.632620141Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-yk3","type":"blocks","created_at":"2026-02-06T14:50:47.689660185Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-99h","type":"blocks","created_at":"2026-02-06T14:50:47.854278452Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-8t7","type":"blocks","created_at":"2026-02-08T01:13:11.737288558Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-nj3","type":"blocks","created_at":"2026-02-08T02:54:34.482946883Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-36d","type":"blocks","created_at":"2026-02-08T02:54:34.538759931Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-1up","type":"blocks","created_at":"2026-02-08T02:54:34.594203831Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-0y8","type":"blocks","created_at":"2026-02-08T02:54:35.06846015Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-24z","type":"blocks","created_at":"2026-02-08T02:54:35.124147115Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-dfp","type":"blocks","created_at":"2026-02-06T14:50:47.909843823Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-9r9","type":"blocks","created_at":"2026-02-06T14:50:47.744576933Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-05n","type":"blocks","created_at":"2026-02-06T14:50:47.799295655Z","created_by":"LoCoBench Bot"}]} |
| 134 | +{"id":"CodeContextBench-rej","title":"Generate aggregate CCB evaluation report (updated: 12 benchmarks, no LoCoBench)","description":"After all benchmark runs complete and MANIFEST is clean, generate the aggregate evaluation report using python3 scripts/generate_report.py. Should cover all 13 benchmarks with 3-config comparison (baseline vs SG_base vs SG_full), MCP impact analysis, and per-benchmark breakdowns.","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:50:31.544649793Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:39:42.151139942Z","closed_at":"2026-02-16T01:39:42.151139942Z","close_reason":"Stale - aggregate report design predates enterprise task expansion and IR pipeline work. Needs redesign.","dependencies":[{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-aot","type":"blocks","created_at":"2026-02-06T14:50:47.565065613Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-kph","type":"blocks","created_at":"2026-02-06T14:50:47.632620141Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-yk3","type":"blocks","created_at":"2026-02-06T14:50:47.689660185Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-9r9","type":"blocks","created_at":"2026-02-06T14:50:47.744576933Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-05n","type":"blocks","created_at":"2026-02-06T14:50:47.799295655Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-99h","type":"blocks","created_at":"2026-02-06T14:50:47.854278452Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-dfp","type":"blocks","created_at":"2026-02-06T14:50:47.909843823Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-8t7","type":"blocks","created_at":"2026-02-08T01:13:11.737288558Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-nj3","type":"blocks","created_at":"2026-02-08T02:54:34.482946883Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-36d","type":"blocks","created_at":"2026-02-08T02:54:34.538759931Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-1up","type":"blocks","created_at":"2026-02-08T02:54:34.594203831Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-0y8","type":"blocks","created_at":"2026-02-08T02:54:35.06846015Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-24z","type":"blocks","created_at":"2026-02-08T02:54:35.124147115Z","created_by":"LoCoBench Bot"}]} |
135 | 135 | {"id":"CodeContextBench-rf3","title":"US-002: Fix protonmail Docker environment","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-11T23:31:45.49023811Z","created_by":"LoCoBench Bot","updated_at":"2026-02-11T23:39:17.748141376Z","closed_at":"2026-02-11T23:39:17.748141376Z","close_reason":"Fixed protonmail Docker Node.js v16→v18 in local + cached Dockerfiles"} |
136 | 136 | {"id":"CodeContextBench-rxg","title":"Rerun 7 LoCoBench SG_base zero-token gap-fill tasks","description":"7 LoCoBench tasks in locobench_gapfill_opus_20260209_010036/sourcegraph_base have zero tokens (auth failure). Tasks: c_api_graphql_expert_079 (arch+cross_file), rust_microservice_expert_008, csharp_warehouse_expert_012 (2), python_streaming_expert_085, python_desktop_expert. Current SG_base mean=0.504 (18 valid) but MANIFEST shows 0.363 including errored. Fix errored classification is done but these need actual reruns for complete data.","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-10T11:28:20.889991278Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T19:31:57.593499773Z","closed_at":"2026-02-15T19:31:57.593499773Z","close_reason":"SG_base config dropped from official runs"} |
137 | 137 | {"id":"CodeContextBench-s00t","title":"US-007 - Add transcript discovery abstraction for non-Claude artifacts","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:57:12.383536394Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T03:57:29.113635367Z","closed_at":"2026-02-17T03:57:29.113635367Z","close_reason":"duplicate"} |
|
0 commit comments