Skip to content

Commit 19c8814

Browse files
sjarmak and cursoragent committed
chore: commit pending sg_only configs, scripts, and beads sync
Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent e544c7f commit 19c8814

File tree

39 files changed

+2613
-1137
lines changed

39 files changed

+2613
-1137
lines changed

.beads/issues.jsonl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
{"id":"CodeContextBench-6yj","title":"US-001: Create sec-cve-001 curl CVE task","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T15:56:28.996905534Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T16:01:20.995454609Z","closed_at":"2026-02-16T16:01:20.995454609Z","close_reason":"US-001 complete: sec-cve-001 curl CVE task created with all files, registered in config"}
5151
{"id":"CodeContextBench-6z8","title":"US-002: Create sec-cve-002 Envoy CVE triage task","status":"in_progress","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T16:02:22.818276011Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T16:02:31.759622574Z"}
5252
{"id":"CodeContextBench-7ao","title":"Redesign 9 over-hinted task instructions to remove file/method/line hints","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T20:25:30.198399183Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T20:29:45.648638308Z","closed_at":"2026-02-15T20:29:45.648638308Z","close_reason":"Redesigned 9 over-hinted task instructions to remove file/method/line hints"}
53-
{"id":"CodeContextBench-7dg","title":"Epic: Design experiments to realistically demonstrate MCP code search value","description":"The current benchmark mounts full repos at /workspace/, making SG tools redundant (0.001 avg delta across 70 tasks). Need new experiment designs where SG tools provide genuine, measurable value. Three tiers: (1) Cross-repo investigation tasks exercising SG-unique capabilities, (2) Mono-repo package isolation removing full-repo local access, (3) Blind-bug task variants requiring discovery. This is the project's core mission.","status":"closed","priority":0,"issue_type":"feature","owner":"locobench@anthropic.com","created_at":"2026-02-07T13:00:03.057412295Z","created_by":"LoCoBench Bot","updated_at":"2026-02-12T10:29:00.209019489Z","closed_at":"2026-02-12T10:29:00.209019489Z","close_reason":"All 3 design experiments completed: (1) sourcegraph_isolated — docs/DESIGN_sourcegraph_isolated.md, (2) blind-bug variants — docs/DESIGN_blind_bug_variants.md, (3) investigation tasks already run and analyzed (a06). Implementation is next phase.","dependencies":[{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-fph","type":"blocks","created_at":"2026-02-07T13:00:38.013992901Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-a06","type":"blocks","created_at":"2026-02-07T13:00:37.902203528Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-8hb","type":"blocks","created_at":"2026-02-07T13:00:37.957011767Z","created_by":"LoCoBench Bot"}]}
53+
{"id":"CodeContextBench-7dg","title":"Epic: Design experiments to realistically demonstrate MCP code search value","description":"The current benchmark mounts full repos at /workspace/, making SG tools redundant (0.001 avg delta across 70 tasks). Need new experiment designs where SG tools provide genuine, measurable value. Three tiers: (1) Cross-repo investigation tasks exercising SG-unique capabilities, (2) Mono-repo package isolation removing full-repo local access, (3) Blind-bug task variants requiring discovery. This is the project's core mission.","status":"closed","priority":0,"issue_type":"feature","owner":"locobench@anthropic.com","created_at":"2026-02-07T13:00:03.057412295Z","created_by":"LoCoBench Bot","updated_at":"2026-02-12T10:29:00.209019489Z","closed_at":"2026-02-12T10:29:00.209019489Z","close_reason":"All 3 design experiments completed: (1) sourcegraph_isolated — docs/DESIGN_sourcegraph_isolated.md, (2) blind-bug variants — docs/DESIGN_blind_bug_variants.md, (3) investigation tasks already run and analyzed (a06). Implementation is next phase.","dependencies":[{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-a06","type":"blocks","created_at":"2026-02-07T13:00:37.902203528Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-8hb","type":"blocks","created_at":"2026-02-07T13:00:37.957011767Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-fph","type":"blocks","created_at":"2026-02-07T13:00:38.013992901Z","created_by":"LoCoBench Bot"}]}
5454
{"id":"CodeContextBench-7dv","title":"US-005: Run 3x SWE-bench Pro variance trials","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-12T08:39:08.439026641Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:39:42.106430509Z","closed_at":"2026-02-16T01:39:42.106430509Z","close_reason":"Stale - SWE-bench Pro variance trials not aligned with current enterprise largerepo focus"}
5555
{"id":"CodeContextBench-7k8","title":"US-009: Implement governance evaluator and register governance tasks","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T14:51:10.627929455Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T14:54:25.91681249Z","closed_at":"2026-02-15T14:54:25.91681249Z","close_reason":"US-009 implemented: governance_evaluator.py + 6 tasks registered"}
5656
{"id":"CodeContextBench-7r2","title":"US-004: Archive saturated ccb_repoqa benchmark","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T15:18:02.671684683Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T15:21:09.337735187Z","closed_at":"2026-02-16T15:21:09.337735187Z","close_reason":"Archived ccb_repoqa: moved to benchmarks/archive/, removed from task selection, updated README"}
@@ -131,7 +131,7 @@
131131
{"id":"CodeContextBench-qtf","title":"US-007 Add transcript discovery abstraction for non-Claude artifacts","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:33:11.290586345Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T04:00:12.261541882Z","closed_at":"2026-02-17T04:00:12.261541882Z","close_reason":"done"}
132132
{"id":"CodeContextBench-r71","title":"CrossRepo: all runs invalid due to verifier path bug","description":"All 8 CrossRepo runs (4 tasks × 2 configs) crashed because test.sh referenced /task/tests/expected_changes.json instead of /tests/expected_changes.json. Verifier is now fixed locally but all existing runs predate the fix. Agents produced meaningful output (261-line patch, 224-line analysis, 497-line reasoning). All 4 tasks need reruns.","status":"closed","priority":1,"issue_type":"bug","owner":"locobench@anthropic.com","created_at":"2026-02-06T22:03:15.909834308Z","created_by":"LoCoBench Bot","updated_at":"2026-02-07T18:39:18.97810564Z","closed_at":"2026-02-07T18:39:18.97810564Z","close_reason":"CrossRepo all 3 configs rerun complete: baseline avg=0.571, SG_base avg=0.587, SG_full avg=0.387"}
133133
{"id":"CodeContextBench-rch","title":"US-019: Scaffold enterprise multi-team and conflicting-docs tasks","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T15:00:15.881711075Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T15:08:20.111033768Z","closed_at":"2026-02-15T15:08:20.111033768Z","close_reason":"US-019 complete: 3 enterprise tasks scaffolded"}
134-
{"id":"CodeContextBench-rej","title":"Generate aggregate CCB evaluation report (updated: 12 benchmarks, no LoCoBench)","description":"After all benchmark runs complete and MANIFEST is clean, generate the aggregate evaluation report using python3 scripts/generate_report.py. Should cover all 13 benchmarks with 3-config comparison (baseline vs SG_base vs SG_full), MCP impact analysis, and per-benchmark breakdowns.","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:50:31.544649793Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:39:42.151139942Z","closed_at":"2026-02-16T01:39:42.151139942Z","close_reason":"Stale - aggregate report design predates enterprise task expansion and IR pipeline work. Needs redesign.","dependencies":[{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-aot","type":"blocks","created_at":"2026-02-06T14:50:47.565065613Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-kph","type":"blocks","created_at":"2026-02-06T14:50:47.632620141Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-yk3","type":"blocks","created_at":"2026-02-06T14:50:47.689660185Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-99h","type":"blocks","created_at":"2026-02-06T14:50:47.854278452Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-8t7","type":"blocks","created_at":"2026-02-08T01:13:11.737288558Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-nj3","type":"blocks","created_at":"2026-02-08T02:54:34.482946883Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-36d","type":"blocks","created_at":"2026-02-08T02:54:34.538759931Z","created_by":"LoCoBench 
Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-1up","type":"blocks","created_at":"2026-02-08T02:54:34.594203831Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-0y8","type":"blocks","created_at":"2026-02-08T02:54:35.06846015Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-24z","type":"blocks","created_at":"2026-02-08T02:54:35.124147115Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-dfp","type":"blocks","created_at":"2026-02-06T14:50:47.909843823Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-9r9","type":"blocks","created_at":"2026-02-06T14:50:47.744576933Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-05n","type":"blocks","created_at":"2026-02-06T14:50:47.799295655Z","created_by":"LoCoBench Bot"}]}
134+
{"id":"CodeContextBench-rej","title":"Generate aggregate CCB evaluation report (updated: 12 benchmarks, no LoCoBench)","description":"After all benchmark runs complete and MANIFEST is clean, generate the aggregate evaluation report using python3 scripts/generate_report.py. Should cover all 13 benchmarks with 3-config comparison (baseline vs SG_base vs SG_full), MCP impact analysis, and per-benchmark breakdowns.","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:50:31.544649793Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:39:42.151139942Z","closed_at":"2026-02-16T01:39:42.151139942Z","close_reason":"Stale - aggregate report design predates enterprise task expansion and IR pipeline work. Needs redesign.","dependencies":[{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-aot","type":"blocks","created_at":"2026-02-06T14:50:47.565065613Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-kph","type":"blocks","created_at":"2026-02-06T14:50:47.632620141Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-yk3","type":"blocks","created_at":"2026-02-06T14:50:47.689660185Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-9r9","type":"blocks","created_at":"2026-02-06T14:50:47.744576933Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-05n","type":"blocks","created_at":"2026-02-06T14:50:47.799295655Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-99h","type":"blocks","created_at":"2026-02-06T14:50:47.854278452Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-dfp","type":"blocks","created_at":"2026-02-06T14:50:47.909843823Z","created_by":"LoCoBench 
Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-8t7","type":"blocks","created_at":"2026-02-08T01:13:11.737288558Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-nj3","type":"blocks","created_at":"2026-02-08T02:54:34.482946883Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-36d","type":"blocks","created_at":"2026-02-08T02:54:34.538759931Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-1up","type":"blocks","created_at":"2026-02-08T02:54:34.594203831Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-0y8","type":"blocks","created_at":"2026-02-08T02:54:35.06846015Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-24z","type":"blocks","created_at":"2026-02-08T02:54:35.124147115Z","created_by":"LoCoBench Bot"}]}
135135
{"id":"CodeContextBench-rf3","title":"US-002: Fix protonmail Docker environment","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-11T23:31:45.49023811Z","created_by":"LoCoBench Bot","updated_at":"2026-02-11T23:39:17.748141376Z","closed_at":"2026-02-11T23:39:17.748141376Z","close_reason":"Fixed protonmail Docker Node.js v16→v18 in local + cached Dockerfiles"}
136136
{"id":"CodeContextBench-rxg","title":"Rerun 7 LoCoBench SG_base zero-token gap-fill tasks","description":"7 LoCoBench tasks in locobench_gapfill_opus_20260209_010036/sourcegraph_base have zero tokens (auth failure). Tasks: c_api_graphql_expert_079 (arch+cross_file), rust_microservice_expert_008, csharp_warehouse_expert_012 (2), python_streaming_expert_085, python_desktop_expert. Current SG_base mean=0.504 (18 valid) but MANIFEST shows 0.363 including errored. Fix errored classification is done but these need actual reruns for complete data.","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-10T11:28:20.889991278Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T19:31:57.593499773Z","closed_at":"2026-02-15T19:31:57.593499773Z","close_reason":"SG_base config dropped from official runs"}
137137
{"id":"CodeContextBench-s00t","title":"US-007 - Add transcript discovery abstraction for non-Claude artifacts","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:57:12.383536394Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T03:57:29.113635367Z","closed_at":"2026-02-17T03:57:29.113635367Z","close_reason":"duplicate"}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Ubuntu 22.04 image with Python 3, pip, curl and unzip, plus pytest.
FROM ubuntu:22.04

# DEBIAN_FRONTEND is set for this one command only (not persisted in the image)
# so that a package whose post-install step asks a debconf question cannot
# stall a non-interactive build. The apt package lists are removed in the same
# layer so they do not add to the image size.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        python3 \
        python3-pip \
        curl \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the layer.
RUN pip install --no-cache-dir pytest

# Pre-create the working directory, /logs/verifier and /app.
RUN mkdir -p /workspace /logs/verifier /app
WORKDIR /workspace

# Empty entrypoint: the container runs exactly the command it is given.
ENTRYPOINT []
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Ubuntu 22.04 image with Python 3, pip, curl and unzip, plus pytest.
FROM ubuntu:22.04

# DEBIAN_FRONTEND is set for this one command only (not persisted in the image)
# so that a package whose post-install step asks a debconf question cannot
# stall a non-interactive build. The apt package lists are removed in the same
# layer so they do not add to the image size.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        python3 \
        python3-pip \
        curl \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the layer.
RUN pip install --no-cache-dir pytest

# Pre-create the working directory, /logs/verifier and /app.
RUN mkdir -p /workspace /logs/verifier /app
WORKDIR /workspace

# Empty entrypoint: the container runs exactly the command it is given.
ENTRYPOINT []
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Largerepo Task Environment — sg_only_env variant
# Source: kubernetes/kubernetes (v1.30.0)
#
# Same toolchain as the standard Dockerfile but source files are truncated
# so the agent must use Sourcegraph MCP for all code access.
# The verifier wrapper restores the full repo before running tests.

FROM golang:1.23-bookworm

WORKDIR /workspace

# Install dependencies.
# DEBIAN_FRONTEND is set for this one command only so a debconf prompt cannot
# stall the non-interactive build; rsync is needed by the verifier wrapper's
# restore step (rsync -a --delete /repo_full/ ...).
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        git \
        curl \
        python3 \
        python3-pip \
        npm \
        rsync \
    && rm -rf /var/lib/apt/lists/*

# Install Claude Code CLI
RUN npm install -g @anthropic-ai/claude-code

# Clone the Kubernetes repository at pinned commit (v1.30.0).
# --filter=blob:none --no-checkout defers blob download until the checkout of
# the pinned commit, so only the blobs that commit needs are fetched.
RUN git clone --filter=blob:none --no-checkout https://github.com/kubernetes/kubernetes.git . && \
    git checkout 11602f083ca275dcfd4341641ae7fe338b7f6f69 && \
    git config user.email "agent@example.com" && \
    git config user.name "Agent"

# --- sg_only_env: back up full repo, then truncate source ---
# The backup must be taken BEFORE truncation; the verifier wrapper restores
# from /repo_full.
# NOTE(review): .git is left untouched and /repo_full stays in the image, so
# file contents may still be recoverable locally — confirm this is acceptable
# for the experiment.
RUN cp -a /workspace /repo_full

# Truncate every source file (by extension) outside .git to zero bytes.
# "-exec ... {} +" passes many files to each truncate invocation instead of
# spawning one process per file, which matters on a tree this large.
RUN find /workspace -type f \( \
        -name "*.go" -o -name "*.py" -o -name "*.js" -o -name "*.ts" \
        -o -name "*.java" -o -name "*.c" -o -name "*.cpp" -o -name "*.h" \
        -o -name "*.rs" -o -name "*.rb" -o -name "*.cs" -o -name "*.kt" \
        -o -name "*.swift" -o -name "*.jsx" -o -name "*.tsx" \
        -o -name "*.vue" -o -name "*.scala" \
    \) ! -path "*/.git/*" -exec truncate -s 0 {} +

# Marker file and workdir record read by the verifier wrapper.
RUN touch /tmp/.sg_only_mode && echo '/workspace' > /tmp/.sg_only_workdir
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/bin/bash
# SG-only verifier wrapper: restore full repo + overlay agent changes
#
# Source this at the TOP of test.sh for build-requiring tasks that use
# sg_only_env mode. It detects /tmp/.sg_only_mode and:
#   1. Identifies files the agent wrote (non-empty, non-git, non-test)
#   2. Backs up those files to /tmp/agent_work/
#   3. Restores the full repo from /repo_full/
#   4. Overlays agent's changes on top
#
# For non-sg_only runs, this script is a no-op.
#
# Usage in test.sh:
#   #!/bin/bash
#   # Source the sg_only wrapper (no-op if not in sg_only mode)
#   if [ -f /tests/sgonly_verifier_wrapper.sh ]; then
#       source /tests/sgonly_verifier_wrapper.sh
#   fi
#   # ... rest of test.sh as normal ...
#
# Side effects when sg_only mode is active: sets the globals WORKDIR and
# AGENT_FILES, recreates /tmp/agent_work, and leaves the shell's current
# directory at $WORKDIR.

#######################################
# Copy every non-empty regular file under a tree into a backup directory,
# keeping relative paths. Files under .git/, tests/ and .claude/ at the top
# of the tree are skipped.
# Globals:   AGENT_FILES (written) - number of files copied
# Arguments: $1 - source tree, $2 - backup directory (created if missing)
# Returns:   0 on success, 1 if a directory or file could not be written
#######################################
_sg_only_backup_agent_files() {
  local src=$1
  local dest=$2
  local rel

  AGENT_FILES=0
  mkdir -p "$dest" || return 1

  # The loop reads from a process substitution rather than a pipe so that it
  # runs in the current shell and the AGENT_FILES count survives the loop.
  # The "cd" happens inside the substitution's subshell, so the caller's
  # working directory is untouched. Paths keep their "./" prefix so a file
  # name starting with "-" is never mistaken for an option.
  while IFS= read -r -d '' rel; do
    mkdir -p "$dest/$(dirname "$rel")" || return 1
    cp "$src/$rel" "$dest/$rel" || return 1
    AGENT_FILES=$((AGENT_FILES + 1))
  done < <(cd "$src" && find . -type f -size +0 \
    ! -path './.git/*' ! -path './tests/*' ! -path './.claude/*' -print0)

  return 0
}

#######################################
# Copy every regular file from a backup directory onto a target tree,
# overwriting files that already exist and creating missing directories.
# Arguments: $1 - backup directory, $2 - target tree
# Returns:   0 on success, 1 if a directory or file could not be written
#######################################
_sg_only_overlay() {
  local src=$1
  local dest=$2
  local rel target

  while IFS= read -r -d '' rel; do
    target="${dest}/${rel#./}"
    mkdir -p "$(dirname "$target")" || return 1
    cp "$src/$rel" "$target" || return 1
  done < <(cd "$src" && find . -type f -print0)

  return 0
}

#######################################
# Entry point: if /tmp/.sg_only_mode exists, back up the agent's files,
# restore the full repo from /repo_full and overlay the agent's files.
# Globals:   WORKDIR (written), AGENT_FILES (written)
# Returns:   0 when not in sg_only mode, when /repo_full is missing
#            (warning only), or on success; 1 if any step fails
#######################################
_sg_only_restore() {
  if [ ! -f /tmp/.sg_only_mode ]; then
    # Not in sg_only mode — nothing to do
    return 0
  fi

  echo "[sg_only_verifier] Detected sg_only mode, restoring full repo..."

  # Read the working directory; fall back to /app when the file is missing
  # OR empty. An empty value would otherwise make the rsync --delete below
  # target "/".
  WORKDIR="$(cat /tmp/.sg_only_workdir 2>/dev/null || echo '/app')"
  WORKDIR="${WORKDIR:-/app}"
  echo "[sg_only_verifier] Working directory: $WORKDIR"

  if [ ! -d /repo_full ]; then
    echo "[sg_only_verifier] WARNING: /repo_full not found, cannot restore"
    return 0
  fi

  if [ "$WORKDIR" = "/" ] || [ ! -d "$WORKDIR" ]; then
    echo "[sg_only_verifier] ERROR: invalid working directory: $WORKDIR" >&2
    return 1
  fi

  # 1. Find files the agent wrote (non-empty, non-git, non-test files).
  # Start from an empty backup dir so leftovers from an earlier run are not
  # overlaid onto this one.
  rm -rf /tmp/agent_work
  if ! _sg_only_backup_agent_files "$WORKDIR" /tmp/agent_work; then
    echo "[sg_only_verifier] ERROR: failed to back up agent files" >&2
    return 1
  fi
  echo "[sg_only_verifier] Backed up $AGENT_FILES agent-written files"

  # 2. Restore full repo from backup
  if ! rsync -a --delete /repo_full/ "$WORKDIR/"; then
    echo "[sg_only_verifier] ERROR: rsync restore from /repo_full/ failed" >&2
    return 1
  fi
  echo "[sg_only_verifier] Restored full repo from /repo_full/"

  # 3. Overlay agent's changes
  if ! _sg_only_overlay /tmp/agent_work "$WORKDIR"; then
    echo "[sg_only_verifier] ERROR: failed to overlay agent changes" >&2
    return 1
  fi
  echo "[sg_only_verifier] Overlaid agent changes"

  # Return to working directory
  cd "$WORKDIR" || return 1
  echo "[sg_only_verifier] Restore complete, proceeding with tests"
}

_sg_only_restore

benchmarks/ccb_largerepo/big-code-k8s-001/tests/test.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313

1414
set -e
1515

16+
# sg_only_env: restore full repo before verification (no-op for regular runs)
17+
[ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ] && source /tests/sgonly_verifier_wrapper.sh
18+
1619
cd /workspace
1720

1821
# Create log directories

0 commit comments

Comments
 (0)