Merged
5 changes: 1 addition & 4 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -31,9 +31,6 @@ initial-data/
# Generated underspecified variants
task_pairs_agentcompany/underspecified/

# SWEBench repo + evaluation + user simulator code
swebenchpro/SWE-bench_Pro-os

# MCP Atlas repo (clone separately, see experiments/mcpatlas/README.md)
experiments/mcpatlas/mcp-atlas/

@@ -56,4 +53,4 @@ experiments/mcpatlas/reports/

hf_variants/
__pycache__
.env
.env
4 changes: 2 additions & 2 deletions .gitmodules
@@ -1,3 +1,3 @@
[submodule "research/lhaw/swebenchpro/SWE-bench_Pro-os"]
[submodule "swebenchpro/SWE-bench_Pro-os"]
path = swebenchpro/SWE-bench_Pro-os
url = https://github.com/scaleapi/SWE-bench_Pro-os.git
url = https://github.com/scaleapi/SWE-bench_Pro-os.git
31 changes: 13 additions & 18 deletions experiments/swebench/README.md
@@ -4,36 +4,28 @@ End-to-end pipeline for generating, running, evaluating, and classifying undersp

## Setup

```bash
cd lhaw

# Python 3.11+ environment (conda or venv)
conda create -n lhaw311 python=3.11 -y && conda activate lhaw311
# OR: python3.11 -m venv .venv311 && source .venv311/bin/activate
Follow the environment setup in the [root README](../../README.md#setup), then activate the environment and install the SWE-bench-specific dependencies:

python -m pip install -r requirements.txt
```bash
# In LHAW root
source .venv/bin/activate

# SWE-bench Pro + SWE-agent (submodules)
git submodule sync
git submodule update --init swebenchpro/SWE-bench_Pro-os
cd swebenchpro/SWE-bench_Pro-os && git submodule update --init SWE-agent

# Switch SWE-agent to ask_user fork branch
cd SWE-agent
git remote add fork https://github.com/yash-scaleai/SWE-agent.git
git fetch fork yash/ask-user-host-interception
git checkout -b yash/ask-user-host-interception fork/yash/ask-user-host-interception
git fetch origin
git checkout -b lhaw/ask-user-tool origin/lhaw/ask-user-tool
cd ../../..

# Install SWE-agent (requires Python >=3.11)
# Use python -m pip to ensure the conda env's pip is used
python -m pip install -e swebenchpro/SWE-bench_Pro-os/SWE-agent
# Install SWE-agent
uv pip install -e swebenchpro/SWE-bench_Pro-os/SWE-agent

# Modal auth (for container deployment)
modal token new

# Environment variables (add to .env — see .env.example)
export LLM_API_KEY="your-api-key"
# export LLM_BASE_URL="https://your-litellm-proxy-url" # optional, for LiteLLM proxy
```
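Before running anything, the prerequisites above (Python 3.11+ and an API key, with `OPENAI_API_KEY` as a fallback) can be sanity-checked with a short helper. This is an illustrative sketch, not part of the repository:

```python
import os
import sys


def missing_prereqs(env=None, version=None):
    """Return a list of unmet setup prerequisites (illustrative sketch)."""
    env = os.environ if env is None else env
    version = sys.version_info if version is None else version
    problems = []
    if version < (3, 11):
        problems.append("Python >= 3.11 required")
    if not env.get("LLM_API_KEY") and not env.get("OPENAI_API_KEY"):
        problems.append("LLM_API_KEY (or OPENAI_API_KEY fallback) not set")
    return problems


if __name__ == "__main__":
    for problem in missing_prereqs():
        print(f"WARNING: {problem}")
```

Running it after `source .env` should print nothing if the environment is ready.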

Source `.env` before every session:
Expand All @@ -49,7 +41,7 @@ Run baseline SWE-agent on original tasks and export `.traj` files for grounded s
Mirrors TAC step 1 (`tac.sh` + `export_tac_golden_trajectories.py`).
Results are written to `baseline_N/` directories (not `exp_N/`) so they coexist with Stage 3's underspec trials in the same directory — no copying needed.

See `run_swebench_example.sh` step 1 for the full commands.
See `bash run_swebench_example.sh` step 1 for the full commands.
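Later stages discover these coexisting directories by name. A rough sketch of that matching, assuming a flat listing of directory names (the names below are illustrative, and only exact `exp_N` / `baseline_N` forms count):

```python
import re


def count_runs(names):
    """Count exp_N trial dirs and collect baseline numbers from a
    flat listing of directory names (illustrative sketch)."""
    trials = sum(1 for n in names if re.fullmatch(r"exp_\d+", n))
    baselines = sorted(
        int(m.group(1))
        for n in names
        if (m := re.fullmatch(r"baseline_(\d+)", n))
    )
    return trials, baselines
```

Using `re.fullmatch` keeps near-misses such as `exp_1_old` from being counted as trials.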

**Produces:**
- `baseline_1/`, `baseline_2/`, `baseline_3/` — baseline trial results (preds.json, trajectories)
@@ -95,6 +87,7 @@ python task_completion_swebench.py --run \
### Stage 4: Evaluate predictions

Runs SWE-bench Pro Docker evaluation on all patches. Handles both variant (`exp_N/`) and baseline (`baseline_N/`) predictions.
The source dataset file `swe_bench_pro_full.csv` is downloaded automatically on first evaluation if it is missing.

```bash
# Evaluate only (no classification)
@@ -109,6 +102,8 @@ python scripts/process_swebench_underspec.py \
--run-eval --dockerhub-username jefzda --judge
```

If `--eval-only` fails for any trial or baseline, the command now exits non-zero so downstream summary steps do not continue with missing eval outputs.
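Downstream tooling can lean on that exit code to gate later steps. A minimal driver-side sketch (the command in the comment is abbreviated and hypothetical):

```python
import subprocess
import sys


def run_step(cmd):
    """Run a pipeline step and report a non-zero exit instead of continuing."""
    result = subprocess.run(cmd)
    if result.returncode != 0:
        print(
            f"step failed (exit {result.returncode}); skipping summaries",
            file=sys.stderr,
        )
    return result.returncode


# e.g. run_step([sys.executable, "scripts/process_swebench_underspec.py", "--eval-only"])
```

A shell caller gets the same guarantee for free via `set -e` or `... || exit 1`.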

**Produces:** `exp_N/eval_results/` and `baseline_N/eval_results/` directories with per-instance `*_output.json` files.

### Stage 5: Classify variants
7 changes: 1 addition & 6 deletions run_swebench_example.sh
@@ -5,12 +5,7 @@
#
# Mirrors run_tac_example.sh for the SWE-Bench Pro pipeline.
#
# Prerequisites:
# - conda activate lhaw311 (Python 3.11+)
# - python -m pip install -e swebenchpro/SWE-bench_Pro-os/SWE-agent
# - modal token new (Modal auth for container orchestration)
# - LLM_API_KEY and LLM_BASE_URL set (or OPENAI_API_KEY fallback)
# - source .env
# Prerequisites: Refer to the setup instructions in experiments/swebench/README.md
#
# Task selection:
# BASELINE_MODELS controls which models run baselines. The paper required
21 changes: 19 additions & 2 deletions scripts/process_swebench_underspec.py
Expand Up @@ -377,6 +377,7 @@ def run_evaluation(
redo: bool = False,
) -> bool:
"""Run SWE-bench Pro evaluation on a trial's predictions."""
ensure_swebench_csv()
preds_path = exp_dir / f"exp_{trial_num}" / "preds.json"
eval_output_dir = exp_dir / f"exp_{trial_num}" / "eval_results"

@@ -456,6 +457,7 @@ def run_baseline_evaluation(
Baselines use original instance IDs (no variant suffix stripping needed).
Output goes to baseline_N/eval_results/ with prefix 'baselineN'.
"""
ensure_swebench_csv()
baseline_dir = exp_dir / f"baseline_{baseline_num}"
preds_path = baseline_dir / "preds.json"
eval_output_dir = baseline_dir / "eval_results"
@@ -1065,6 +1067,7 @@ def main():
if not args.dockerhub_username:
print("Error: --dockerhub-username required with --run-eval")
sys.exit(1)
ensure_swebench_csv()
# Detect num trials from exp_N dirs (only match exp_1, exp_2, etc.)
num_trials = len(
[
@@ -1080,15 +1083,29 @@
print(f"Error: No exp_* or baseline_* directories found in {exp_dir}")
sys.exit(1)

eval_failures = []
if num_trials > 0:
print(f"Running variant evaluation ({num_trials} trials)")
for trial_num in range(1, num_trials + 1):
run_evaluation(exp_dir, trial_num, args.dockerhub_username, args.num_workers)
if not run_evaluation(
exp_dir, trial_num, args.dockerhub_username, args.num_workers
):
eval_failures.append(f"exp_{trial_num}")

if baseline_nums:
print(f"Running baseline evaluation ({len(baseline_nums)} baselines)")
for bnum in baseline_nums:
run_baseline_evaluation(exp_dir, bnum, args.dockerhub_username, args.num_workers)
if not run_baseline_evaluation(
exp_dir, bnum, args.dockerhub_username, args.num_workers
):
eval_failures.append(f"baseline_{bnum}")

if eval_failures:
print(
f"\nEvaluation failed for: {', '.join(eval_failures)}",
file=sys.stderr,
)
sys.exit(1)

print("\nEvaluation complete.")
sys.exit(0)
1 change: 1 addition & 0 deletions swebenchpro/SWE-bench_Pro-os
Submodule SWE-bench_Pro-os added at 66a831