sourcegraph
diff --git a/‎.beads/issues.jsonl‎
Lines changed: 2 additions & 2 deletions b/‎.beads/issues.jsonl‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎benchmarks/ccb_docgen/docgen-api-003/environment/Dockerfile.sg_only‎
Lines changed: 15 additions & 0 deletions b/‎benchmarks/ccb_docgen/docgen-api-003/environment/Dockerfile.sg_only‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎benchmarks/ccb_docgen/docgen-arch-003/environment/Dockerfile.sg_only‎
Lines changed: 15 additions & 0 deletions b/‎benchmarks/ccb_docgen/docgen-arch-003/environment/Dockerfile.sg_only‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎benchmarks/ccb_largerepo/big-code-k8s-001/environment/Dockerfile.sg_only‎
Lines changed: 42 additions & 0 deletions b/‎benchmarks/ccb_largerepo/big-code-k8s-001/environment/Dockerfile.sg_only‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎benchmarks/ccb_largerepo/big-code-k8s-001/tests/sgonly_verifier_wrapper.sh‎
Lines changed: 64 additions & 0 deletions b/‎benchmarks/ccb_largerepo/big-code-k8s-001/tests/sgonly_verifier_wrapper.sh‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎benchmarks/ccb_largerepo/big-code-k8s-001/tests/test.sh‎
Lines changed: 3 additions & 0 deletions b/‎benchmarks/ccb_largerepo/big-code-k8s-001/tests/test.sh‎
Lines changed: 3 additions & 0 deletions
@@ -50,7 +50,7 @@
 {"id":"CodeContextBench-6yj","title":"US-001: Create sec-cve-001 curl CVE task","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T15:56:28.996905534Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T16:01:20.995454609Z","closed_at":"2026-02-16T16:01:20.995454609Z","close_reason":"US-001 complete: sec-cve-001 curl CVE task created with all files, registered in config"}
 {"id":"CodeContextBench-6z8","title":"US-002: Create sec-cve-002 Envoy CVE triage task","status":"in_progress","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T16:02:22.818276011Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T16:02:31.759622574Z"}
 {"id":"CodeContextBench-7ao","title":"Redesign 9 over-hinted task instructions to remove file/method/line hints","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T20:25:30.198399183Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T20:29:45.648638308Z","closed_at":"2026-02-15T20:29:45.648638308Z","close_reason":"Redesigned 9 over-hinted task instructions to remove file/method/line hints"}
-{"id":"CodeContextBench-7dg","title":"Epic: Design experiments to realistically demonstrate MCP code search value","description":"The current benchmark mounts full repos at /workspace/, making SG tools redundant (0.001 avg delta across 70 tasks). Need new experiment designs where SG tools provide genuine, measurable value. Three tiers: (1) Cross-repo investigation tasks exercising SG-unique capabilities, (2) Mono-repo package isolation removing full-repo local access, (3) Blind-bug task variants requiring discovery. This is the project's core mission.","status":"closed","priority":0,"issue_type":"feature","owner":"locobench@anthropic.com","created_at":"2026-02-07T13:00:03.057412295Z","created_by":"LoCoBench Bot","updated_at":"2026-02-12T10:29:00.209019489Z","closed_at":"2026-02-12T10:29:00.209019489Z","close_reason":"All 3 design experiments completed: (1) sourcegraph_isolated — docs/DESIGN_sourcegraph_isolated.md, (2) blind-bug variants — docs/DESIGN_blind_bug_variants.md, (3) investigation tasks already run and analyzed (a06). Implementation is next phase.","dependencies":[{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-fph","type":"blocks","created_at":"2026-02-07T13:00:38.013992901Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-a06","type":"blocks","created_at":"2026-02-07T13:00:37.902203528Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-8hb","type":"blocks","created_at":"2026-02-07T13:00:37.957011767Z","created_by":"LoCoBench Bot"}]}
+{"id":"CodeContextBench-7dg","title":"Epic: Design experiments to realistically demonstrate MCP code search value","description":"The current benchmark mounts full repos at /workspace/, making SG tools redundant (0.001 avg delta across 70 tasks). Need new experiment designs where SG tools provide genuine, measurable value. Three tiers: (1) Cross-repo investigation tasks exercising SG-unique capabilities, (2) Mono-repo package isolation removing full-repo local access, (3) Blind-bug task variants requiring discovery. This is the project's core mission.","status":"closed","priority":0,"issue_type":"feature","owner":"locobench@anthropic.com","created_at":"2026-02-07T13:00:03.057412295Z","created_by":"LoCoBench Bot","updated_at":"2026-02-12T10:29:00.209019489Z","closed_at":"2026-02-12T10:29:00.209019489Z","close_reason":"All 3 design experiments completed: (1) sourcegraph_isolated — docs/DESIGN_sourcegraph_isolated.md, (2) blind-bug variants — docs/DESIGN_blind_bug_variants.md, (3) investigation tasks already run and analyzed (a06). Implementation is next phase.","dependencies":[{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-a06","type":"blocks","created_at":"2026-02-07T13:00:37.902203528Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-8hb","type":"blocks","created_at":"2026-02-07T13:00:37.957011767Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-7dg","depends_on_id":"CodeContextBench-fph","type":"blocks","created_at":"2026-02-07T13:00:38.013992901Z","created_by":"LoCoBench Bot"}]}
 {"id":"CodeContextBench-7dv","title":"US-005: Run 3x SWE-bench Pro variance trials","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-12T08:39:08.439026641Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:39:42.106430509Z","closed_at":"2026-02-16T01:39:42.106430509Z","close_reason":"Stale - SWE-bench Pro variance trials not aligned with current enterprise largerepo focus"}
 {"id":"CodeContextBench-7k8","title":"US-009: Implement governance evaluator and register governance tasks","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T14:51:10.627929455Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T14:54:25.91681249Z","closed_at":"2026-02-15T14:54:25.91681249Z","close_reason":"US-009 implemented: governance_evaluator.py + 6 tasks registered"}
 {"id":"CodeContextBench-7r2","title":"US-004: Archive saturated ccb_repoqa benchmark","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T15:18:02.671684683Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T15:21:09.337735187Z","closed_at":"2026-02-16T15:21:09.337735187Z","close_reason":"Archived ccb_repoqa: moved to benchmarks/archive/, removed from task selection, updated README"}
@@ -131,7 +131,7 @@
 {"id":"CodeContextBench-qtf","title":"US-007 Add transcript discovery abstraction for non-Claude artifacts","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:33:11.290586345Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T04:00:12.261541882Z","closed_at":"2026-02-17T04:00:12.261541882Z","close_reason":"done"}
 {"id":"CodeContextBench-r71","title":"CrossRepo: all runs invalid due to verifier path bug","description":"All 8 CrossRepo runs (4 tasks × 2 configs) crashed because test.sh referenced /task/tests/expected_changes.json instead of /tests/expected_changes.json. Verifier is now fixed locally but all existing runs predate the fix. Agents produced meaningful output (261-line patch, 224-line analysis, 497-line reasoning). All 4 tasks need reruns.","status":"closed","priority":1,"issue_type":"bug","owner":"locobench@anthropic.com","created_at":"2026-02-06T22:03:15.909834308Z","created_by":"LoCoBench Bot","updated_at":"2026-02-07T18:39:18.97810564Z","closed_at":"2026-02-07T18:39:18.97810564Z","close_reason":"CrossRepo all 3 configs rerun complete: baseline avg=0.571, SG_base avg=0.587, SG_full avg=0.387"}
 {"id":"CodeContextBench-rch","title":"US-019: Scaffold enterprise multi-team and conflicting-docs tasks","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T15:00:15.881711075Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T15:08:20.111033768Z","closed_at":"2026-02-15T15:08:20.111033768Z","close_reason":"US-019 complete: 3 enterprise tasks scaffolded"}
-{"id":"CodeContextBench-rej","title":"Generate aggregate CCB evaluation report (updated: 12 benchmarks, no LoCoBench)","description":"After all benchmark runs complete and MANIFEST is clean, generate the aggregate evaluation report using python3 scripts/generate_report.py. Should cover all 13 benchmarks with 3-config comparison (baseline vs SG_base vs SG_full), MCP impact analysis, and per-benchmark breakdowns.","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:50:31.544649793Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:39:42.151139942Z","closed_at":"2026-02-16T01:39:42.151139942Z","close_reason":"Stale - aggregate report design predates enterprise task expansion and IR pipeline work. Needs redesign.","dependencies":[{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-aot","type":"blocks","created_at":"2026-02-06T14:50:47.565065613Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-kph","type":"blocks","created_at":"2026-02-06T14:50:47.632620141Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-yk3","type":"blocks","created_at":"2026-02-06T14:50:47.689660185Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-99h","type":"blocks","created_at":"2026-02-06T14:50:47.854278452Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-8t7","type":"blocks","created_at":"2026-02-08T01:13:11.737288558Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-nj3","type":"blocks","created_at":"2026-02-08T02:54:34.482946883Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-36d","type":"blocks","created_at":"2026-02-08T02:54:34.538759931Z","created_by":"LoCoBench 
Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-1up","type":"blocks","created_at":"2026-02-08T02:54:34.594203831Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-0y8","type":"blocks","created_at":"2026-02-08T02:54:35.06846015Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-24z","type":"blocks","created_at":"2026-02-08T02:54:35.124147115Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-dfp","type":"blocks","created_at":"2026-02-06T14:50:47.909843823Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-9r9","type":"blocks","created_at":"2026-02-06T14:50:47.744576933Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-05n","type":"blocks","created_at":"2026-02-06T14:50:47.799295655Z","created_by":"LoCoBench Bot"}]}
+{"id":"CodeContextBench-rej","title":"Generate aggregate CCB evaluation report (updated: 12 benchmarks, no LoCoBench)","description":"After all benchmark runs complete and MANIFEST is clean, generate the aggregate evaluation report using python3 scripts/generate_report.py. Should cover all 13 benchmarks with 3-config comparison (baseline vs SG_base vs SG_full), MCP impact analysis, and per-benchmark breakdowns.","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:50:31.544649793Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:39:42.151139942Z","closed_at":"2026-02-16T01:39:42.151139942Z","close_reason":"Stale - aggregate report design predates enterprise task expansion and IR pipeline work. Needs redesign.","dependencies":[{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-aot","type":"blocks","created_at":"2026-02-06T14:50:47.565065613Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-kph","type":"blocks","created_at":"2026-02-06T14:50:47.632620141Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-yk3","type":"blocks","created_at":"2026-02-06T14:50:47.689660185Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-9r9","type":"blocks","created_at":"2026-02-06T14:50:47.744576933Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-05n","type":"blocks","created_at":"2026-02-06T14:50:47.799295655Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-99h","type":"blocks","created_at":"2026-02-06T14:50:47.854278452Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-dfp","type":"blocks","created_at":"2026-02-06T14:50:47.909843823Z","created_by":"LoCoBench 
Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-8t7","type":"blocks","created_at":"2026-02-08T01:13:11.737288558Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-nj3","type":"blocks","created_at":"2026-02-08T02:54:34.482946883Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-36d","type":"blocks","created_at":"2026-02-08T02:54:34.538759931Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-1up","type":"blocks","created_at":"2026-02-08T02:54:34.594203831Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-0y8","type":"blocks","created_at":"2026-02-08T02:54:35.06846015Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-24z","type":"blocks","created_at":"2026-02-08T02:54:35.124147115Z","created_by":"LoCoBench Bot"}]}
 {"id":"CodeContextBench-rf3","title":"US-002: Fix protonmail Docker environment","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-11T23:31:45.49023811Z","created_by":"LoCoBench Bot","updated_at":"2026-02-11T23:39:17.748141376Z","closed_at":"2026-02-11T23:39:17.748141376Z","close_reason":"Fixed protonmail Docker Node.js v16→v18 in local + cached Dockerfiles"}
 {"id":"CodeContextBench-rxg","title":"Rerun 7 LoCoBench SG_base zero-token gap-fill tasks","description":"7 LoCoBench tasks in locobench_gapfill_opus_20260209_010036/sourcegraph_base have zero tokens (auth failure). Tasks: c_api_graphql_expert_079 (arch+cross_file), rust_microservice_expert_008, csharp_warehouse_expert_012 (2), python_streaming_expert_085, python_desktop_expert. Current SG_base mean=0.504 (18 valid) but MANIFEST shows 0.363 including errored. Fix errored classification is done but these need actual reruns for complete data.","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-10T11:28:20.889991278Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T19:31:57.593499773Z","closed_at":"2026-02-15T19:31:57.593499773Z","close_reason":"SG_base config dropped from official runs"}
 {"id":"CodeContextBench-s00t","title":"US-007 - Add transcript discovery abstraction for non-Claude artifacts","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:57:12.383536394Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T03:57:29.113635367Z","closed_at":"2026-02-17T03:57:29.113635367Z","close_reason":"duplicate"}
 
@@ -0,0 +1,15 @@
+FROM ubuntu:22.04
+# sg_only variant: no source tree is added to this image, only Python/pytest plus curl and unzip.
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    curl \
+    unzip \
+    && rm -rf /var/lib/apt/lists/*
+# pytest is installed at build time; --no-cache-dir keeps pip's download cache out of the layer.
+RUN pip install --no-cache-dir pytest
+# Pre-create the workspace, the verifier log directory and /app.
+RUN mkdir -p /workspace /logs/verifier /app
+WORKDIR /workspace
+# Empty exec-form ENTRYPOINT: no entrypoint is set (presumably the harness supplies the command — confirm).
+ENTRYPOINT []
@@ -0,0 +1,15 @@
+FROM ubuntu:22.04
+# sg_only variant: no source tree is added to this image, only Python/pytest plus curl and unzip.
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    curl \
+    unzip \
+    && rm -rf /var/lib/apt/lists/*
+# pytest is installed at build time; --no-cache-dir keeps pip's download cache out of the layer.
+RUN pip install --no-cache-dir pytest
+# Pre-create the workspace, the verifier log directory and /app.
+RUN mkdir -p /workspace /logs/verifier /app
+WORKDIR /workspace
+# Empty exec-form ENTRYPOINT: no entrypoint is set (presumably the harness supplies the command — confirm).
+ENTRYPOINT []
@@ -0,0 +1,42 @@
+# Largerepo Task Environment — sg_only_env variant
+# Source: kubernetes/kubernetes (v1.30.0)
+#
+# Same toolchain as the standard Dockerfile but source files are truncated
+# so the agent must use Sourcegraph MCP for all code access.
+# The verifier wrapper restores the full repo before running tests.
+# NOTE(review): /repo_full and /workspace/.git stay readable in the image — confirm that is intended.
+FROM golang:1.23-bookworm
+
+WORKDIR /workspace
+
+# Install dependencies (rsync is needed by the verifier wrapper's restore step)
+RUN apt-get update && apt-get install -y \
+    git \
+    curl \
+    python3 \
+    python3-pip \
+    npm \
+    rsync \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Claude Code CLI
+RUN npm install -g @anthropic-ai/claude-code
+
+# Clone the Kubernetes repository at pinned commit (v1.30.0)
+RUN git clone --filter=blob:none --no-checkout https://github.com/kubernetes/kubernetes.git . && \
+    git checkout 11602f083ca275dcfd4341641ae7fe338b7f6f69 && \
+    git config user.email "agent@example.com" && \
+    git config user.name "Agent"
+
+# --- sg_only_env: back up full repo, then truncate source ---
+RUN cp -a /workspace /repo_full
+# Empty every matching source file outside .git; '{} +' batches many files per truncate call.
+RUN find /workspace -type f \( \
+    -name "*.go" -o -name "*.py" -o -name "*.js" -o -name "*.ts" \
+    -o -name "*.java" -o -name "*.c" -o -name "*.cpp" -o -name "*.h" \
+    -o -name "*.rs" -o -name "*.rb" -o -name "*.cs" -o -name "*.kt" \
+    -o -name "*.swift" -o -name "*.jsx" -o -name "*.tsx" \
+    -o -name "*.vue" -o -name "*.scala" \
+    \) ! -path "*/.git/*" -exec truncate -s 0 {} +
+# Mode marker and workdir path that tests/sgonly_verifier_wrapper.sh reads at verification time.
+RUN touch /tmp/.sg_only_mode && echo '/workspace' > /tmp/.sg_only_workdir
@@ -0,0 +1,64 @@
+#!/bin/bash
+# SG-only verifier wrapper: restore full repo + overlay agent changes
+#
+# Source this at the TOP of test.sh for build-requiring tasks that use
+# sg_only_env mode. It detects /tmp/.sg_only_mode and:
+#   1. Identifies files the agent wrote (non-empty, non-git, non-test)
+#   2. Backs up those files to /tmp/agent_work/
+#   3. Restores the full repo from /repo_full/
+#   4. Overlays agent's changes on top
+#
+# For non-sg_only runs, this script is a no-op. If a restore step fails it
+# returns (or exits) 1 rather than letting tests run on a half-restored tree.
+# Usage in test.sh:
+#   #!/bin/bash
+#   # Source the sg_only wrapper (no-op if not in sg_only mode)
+#   if [ -f /tests/sgonly_verifier_wrapper.sh ]; then
+#       source /tests/sgonly_verifier_wrapper.sh
+#   fi
+#   # ... rest of test.sh as normal ...
+
+if [ ! -f /tmp/.sg_only_mode ]; then
+    # Not in sg_only mode — nothing to do ('return' when sourced, 'exit' when executed)
+    return 0 2>/dev/null || exit 0
+fi
+
+echo "[sg_only_verifier] Detected sg_only mode, restoring full repo..."
+
+# Read the working directory written at image build time; fall back to /app
+WORKDIR="$(cat /tmp/.sg_only_workdir 2>/dev/null || echo '/app')"
+echo "[sg_only_verifier] Working directory: $WORKDIR"
+
+if [ ! -d /repo_full ]; then
+    echo "[sg_only_verifier] WARNING: /repo_full not found, cannot restore"
+    return 0 2>/dev/null || exit 0
+fi
+
+# 1. Back up non-empty, non-git, non-test files (agent edits plus never-truncated files).
+#    Start from a clean backup dir; the loop runs in this shell so AGENT_FILES is kept.
+cd "$WORKDIR" || { echo "[sg_only_verifier] ERROR: cannot cd to $WORKDIR" >&2; return 1 2>/dev/null || exit 1; }
+rm -rf /tmp/agent_work && mkdir -p /tmp/agent_work
+AGENT_FILES=0
+while IFS= read -r -d '' f; do
+    mkdir -p "/tmp/agent_work/$(dirname "$f")"
+    cp "$f" "/tmp/agent_work/$f"
+    AGENT_FILES=$((AGENT_FILES + 1))
+done < <(find . -type f -size +0 ! -path './.git/*' ! -path './tests/*' ! -path './.claude/*' -print0)
+echo "[sg_only_verifier] Backed up $AGENT_FILES non-empty files (agent-written candidates)"
+
+# 2. Restore full repo from backup (--delete also removes files absent from /repo_full)
+rsync -a --delete /repo_full/ "$WORKDIR/" || { echo "[sg_only_verifier] ERROR: rsync restore failed" >&2; return 1 2>/dev/null || exit 1; }
+echo "[sg_only_verifier] Restored full repo from /repo_full/"
+
+# 3. Overlay agent's changes
+cd /tmp/agent_work || { echo "[sg_only_verifier] ERROR: cannot cd to /tmp/agent_work" >&2; return 1 2>/dev/null || exit 1; }
+while IFS= read -r -d '' f; do
+    target="${WORKDIR}/${f#./}"
+    mkdir -p "$(dirname "$target")"
+    cp "$f" "$target"
+done < <(find . -type f -print0)
+echo "[sg_only_verifier] Overlaid agent changes"
+
+# Return to working directory
+cd "$WORKDIR" || { echo "[sg_only_verifier] ERROR: cannot cd to $WORKDIR" >&2; return 1 2>/dev/null || exit 1; }
+echo "[sg_only_verifier] Restore complete, proceeding with tests"
@@ -13,6 +13,9 @@
 
 set -e
 
+# sg_only_env: restore full repo before verification (no-op for regular runs)
+[ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ] && source /tests/sgonly_verifier_wrapper.sh
+
 cd /workspace
 
 # Create log directories