Skip to content

Commit 1548025

Browse files
author
LoCoBench Bot
committed
Merge branch 'ralph/multi-agent-harnesses'
2 parents 453ff87 + e14e9fe commit 1548025

16 files changed

+2081
-28
lines changed

.beads/issues.jsonl

Lines changed: 14 additions & 11 deletions
Large diffs are not rendered by default.

configs/codex_2config.sh

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
#!/bin/bash
2+
# Codex Harness 2-Config Runner
3+
#
4+
# Runs selected tasks across 2 configurations:
5+
# 1. Baseline (BASELINE_MCP_TYPE=none)
6+
# 2. MCP-Full (BASELINE_MCP_TYPE=sourcegraph_full)
7+
#
8+
# Usage:
9+
# ./configs/codex_2config.sh [OPTIONS]
10+
#
11+
# Options:
12+
# --baseline-only Run only baseline (no MCP)
13+
# --full-only Run only MCP-Full (sourcegraph_full)
14+
# --model MODEL Override model (default: gpt-5.3-codex)
15+
# --agent-path PATH Override Harbor agent import path
16+
# --parallel N Max parallel task subshells (default: 1)
17+
# --category CATEGORY Run category label for jobs dir (default: staging)
18+
# --benchmark BENCH Optional benchmark filter (e.g. ccb_crossrepo)
19+
20+
set -o errexit

# Locate this script's directory and run from its parent, so the relative
# benchmarks/ and runs/ paths used later resolve against that directory.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd "${SCRIPT_DIR}/.."

# Put the agent directory and the working directory ahead of any
# pre-existing PYTHONPATH entries.
: "${AGENT_DIR:=$HOME/evals/custom_agents/agents/claudecode}"
export PYTHONPATH="${AGENT_DIR}:$(pwd):${PYTHONPATH}"

# Shared helpers (validation/reporting and run helpers)
source "${SCRIPT_DIR}/_common.sh"

# Run settings. AGENT_PATH, MODEL and CATEGORY keep a non-empty value
# inherited from the environment; otherwise the defaults below apply.
SELECTION_FILE="${SCRIPT_DIR}/selected_benchmark_tasks.json"
: "${AGENT_PATH:=agents.codex_driver_agent:CodexDriverAgent}"
: "${MODEL:=gpt-5.3-codex}"
: "${CATEGORY:=staging}"
BENCHMARK_FILTER=""
CONCURRENCY=2          # passed to `harbor run -n`
TIMEOUT_MULTIPLIER=10  # passed to `harbor run --timeout-multiplier`
RUN_BASELINE=true
RUN_FULL=true
40+
41+
# Parse command-line options into the global run settings.
#
# Globals written: RUN_BASELINE, RUN_FULL, MODEL, AGENT_PATH, PARALLEL_JOBS,
#                  CATEGORY, BENCHMARK_FILTER
# Arguments:       the script's command-line arguments
# Exits with status 1 (after printing a message) on an unknown option or
# when an option that takes a value is the last argument. Without the
# explicit check, `shift 2` would fail and `set -e` would end the script
# without any explanation.
_parse_cli_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --baseline-only)
                RUN_FULL=false
                shift
                ;;
            --full-only)
                RUN_BASELINE=false
                shift
                ;;
            --model|--agent-path|--parallel|--category|--benchmark)
                if [[ $# -lt 2 ]]; then
                    echo "ERROR: option $1 requires a value"
                    exit 1
                fi
                case "$1" in
                    --model)      MODEL="$2" ;;
                    --agent-path) AGENT_PATH="$2" ;;
                    --parallel)   PARALLEL_JOBS="$2" ;;
                    --category)   CATEGORY="$2" ;;
                    --benchmark)  BENCHMARK_FILTER="$2" ;;
                esac
                shift 2
                ;;
            *)
                echo "Unknown option: $1"
                exit 1
                ;;
        esac
    done
}

_parse_cli_args "$@"
# Leave the script's positional parameters empty, exactly as a top-level
# shift loop over the arguments would.
set --
77+
78+
if [ ! -f "$SELECTION_FILE" ]; then
    echo "ERROR: selected_benchmark_tasks.json not found at $SELECTION_FILE"
    exit 1
fi

# Ask Python for one "<task_id>\tbenchmarks/<task_dir>\t<benchmark>" row per
# selected task. Tasks flagged "excluded" are skipped, as are tasks whose
# benchmark differs from BENCHMARK_FILTER when that filter is non-empty.
#
# The output is captured with command substitution rather than process
# substitution so a Python failure (unreadable file, invalid JSON, missing
# key) is reported as such instead of looking like an empty selection.
if ! _task_rows_raw=$(python3 - "$SELECTION_FILE" "$BENCHMARK_FILTER" <<'PYEOF'
import json
import sys

selection_file = sys.argv[1]
benchmark_filter = sys.argv[2]

with open(selection_file) as handle:
    data = json.load(handle)

for task in data.get("tasks", []):
    if task.get("excluded", False):
        continue
    if benchmark_filter and task.get("benchmark") != benchmark_filter:
        continue
    task_id = task["task_id"]
    task_dir = task["task_dir"]
    benchmark = task.get("benchmark", "")
    print(f"{task_id}\tbenchmarks/{task_dir}\t{benchmark}")
PYEOF
); then
    echo "ERROR: failed to read task selection from $SELECTION_FILE"
    exit 1
fi

# A here-string always supplies one line, so only split non-empty output;
# otherwise an empty selection would become a single empty row.
TASK_ROWS=()
if [ -n "$_task_rows_raw" ]; then
    readarray -t TASK_ROWS <<<"$_task_rows_raw"
fi
unset _task_rows_raw

if [ ${#TASK_ROWS[@]} -eq 0 ]; then
    echo "ERROR: no tasks selected after filters"
    exit 1
fi
107+
108+
# Index the selected rows: TASK_IDS keeps the row order, TASK_PATH_BY_ID maps
# each task id to its task path.
declare -A TASK_PATH_BY_ID
TASK_IDS=()
for row in "${TASK_ROWS[@]}"; do
    # Rows are tab-separated: field 1 is the task id, field 2 the task path.
    task_id=${row%%$'\t'*}
    _row_rest=${row#*$'\t'}
    task_path=${_row_rest%%$'\t'*}
    TASK_PATH_BY_ID["$task_id"]="$task_path"
    TASK_IDS+=("$task_id")
done
116+
117+
# Print a usable parallel-job count for the requested value.
#
# Arguments: $1 - requested job count (may be empty or missing)
# Outputs:   the count in decimal when $1 is a whole number >= 1, else 1.
# Anything that is not a plain non-negative integer (empty, negative,
# "abc", "2x") falls back to 1 instead of being passed through unchanged.
_normalize_parallel_jobs() {
    local requested=${1:-}
    # 10# forces base 10 so a value such as "08" is not read as octal.
    if [[ "$requested" =~ ^[0-9]+$ ]] && (( 10#$requested >= 1 )); then
        printf '%s\n' "$(( 10#$requested ))"
    else
        printf '1\n'
    fi
}

PARALLEL_JOBS=$(_normalize_parallel_jobs "${PARALLEL_JOBS:-}")

# run_tasks_parallel expects CLAUDE_HOMES; use current HOME for Codex harness runs.
CLAUDE_HOMES=("$HOME")
REAL_HOME="$HOME"
124+
125+
# Derive MODEL_SHORT, the compact model tag used in the jobs directory name.
# Only the part of MODEL after the last "/" is considered, lower-cased.
_model_lower=${MODEL##*/}
_model_lower=${_model_lower,,}

# Most specific names are checked first: "gpt-5.3-codex" would otherwise be
# caught by the broader "gpt-5" test, and "gpt-4o" by the "gpt-4" test.
if [[ "$_model_lower" == *gpt-5.3-codex* || "$_model_lower" == *gpt53codex* ]]; then
    MODEL_SHORT="gpt53codex"
elif [[ "$_model_lower" == *gpt-5* || "$_model_lower" == *gpt5* ]]; then
    MODEL_SHORT="gpt5"
elif [[ "$_model_lower" == *gpt-4o* || "$_model_lower" == *gpt4o* ]]; then
    MODEL_SHORT="gpt4o"
elif [[ "$_model_lower" == *gpt-4* || "$_model_lower" == *gpt4* ]]; then
    MODEL_SHORT="gpt4"
else
    # Unrecognized model: drop "-" and "_" and keep the first 12 characters.
    MODEL_SHORT=${_model_lower//[-_]/}
    MODEL_SHORT=${MODEL_SHORT:0:12}
fi
133+
134+
# Each invocation writes into its own timestamped directory under runs/<category>/.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
JOBS_BASE="runs/${CATEGORY}/codex_${MODEL_SHORT}_${TIMESTAMP}"
mkdir -p "$JOBS_BASE"

# Summary of the effective settings, followed by one blank line.
cat <<EOF
==============================================
Codex 2-Config Runner
==============================================
Model: $MODEL
Agent path: $AGENT_PATH
Benchmark filter: ${BENCHMARK_FILTER:-<all selected benchmarks>}
Task count: ${#TASK_IDS[@]}
Parallel jobs: $PARALLEL_JOBS
Jobs directory: $JOBS_BASE
Run baseline: $RUN_BASELINE
Run MCP-Full: $RUN_FULL

EOF
151+
# Run one task through `harbor run` for a single configuration.
#
# Globals read: TASK_PATH_BY_ID, JOBS_BASE, AGENT_PATH, MODEL, CONCURRENCY,
#               TIMEOUT_MULTIPLIER
# Arguments:
#   $1 - task id (key into TASK_PATH_BY_ID)
#   $2 - home directory supplied by the dispatcher; not used by this function
#   $3 - config name, used as the jobs subdirectory (default: baseline)
#   $4 - MCP mode exported to harbor as BASELINE_MCP_TYPE; must be
#        "none" or "sourcegraph_full" (default: none)
#   $5 - jobs base directory (default: $JOBS_BASE)
# Outputs: harbor's combined stdout/stderr, also written to
#          <jobs base>/<config>/<task id>.log
# Returns: 1 for an unsupported MCP mode or a missing task directory.
#          A failed harbor run only prints a warning and returns 0, so one
#          failing task does not abort the remaining tasks.
_codex_run_single() {
    local task_id=$1
    local _task_home=${2:-}
    local config=${3:-baseline}
    local mcp_type=${4:-none}
    local jobs_base=${5:-$JOBS_BASE}
    local jobs_subdir="${jobs_base}/${config}"
    local task_path="${TASK_PATH_BY_ID[$task_id]:-}"

    case "$mcp_type" in
        none|sourcegraph_full)
            ;;
        *)
            echo "ERROR: unsupported MCP mode for codex rollout: $mcp_type"
            return 1
            ;;
    esac

    mkdir -p "$jobs_subdir"

    if [ ! -d "$task_path" ]; then
        echo "ERROR: Task directory not found: $task_path"
        return 1
    fi

    echo "Running task: $task_id ($config)"
    # Without pipefail the pipeline's status is tee's, so a failing harbor
    # run would never reach the warning. pipefail is enabled only inside
    # this subshell so the rest of the script keeps its current settings.
    if ! (
        set -o pipefail
        BASELINE_MCP_TYPE="$mcp_type" harbor run \
            --path "$task_path" \
            --agent-import-path "$AGENT_PATH" \
            --model "$MODEL" \
            --jobs-dir "$jobs_subdir" \
            -n "$CONCURRENCY" \
            --timeout-multiplier "$TIMEOUT_MULTIPLIER" \
            2>&1 | tee "${jobs_subdir}/${task_id}.log"
    ); then
        echo "WARNING: Task $task_id ($config) failed"
    fi
}
187+
188+
# Run every selected task for one configuration, then validate its output.
#
# Globals read: JOBS_BASE, TASK_IDS
# Arguments:
#   $1 - mode name; also the jobs subdirectory under JOBS_BASE
#   $2 - MCP mode forwarded to _codex_run_single
# Side effects: creates <JOBS_BASE>/<mode> and (re)defines _mode_dispatch.
run_mode() {
    local mode=$1
    local mcp_type=$2
    # Kept local so repeated calls do not overwrite a global jobs_subdir.
    local jobs_subdir="${JOBS_BASE}/${mode}"

    mkdir -p "$jobs_subdir"

    # Callback handed to run_tasks_parallel, which passes a task id and a
    # home directory. It reads mode and mcp_type from this function's scope,
    # so it must be invoked while run_mode is still executing.
    _mode_dispatch() {
        _codex_run_single "$1" "$2" "$mode" "$mcp_type" "$JOBS_BASE"
    }

    # Best effort: a non-zero status from the task runner must not stop the
    # validation report for whatever tasks did produce output.
    run_tasks_parallel TASK_IDS _mode_dispatch || true
    validate_and_report "$jobs_subdir" "$mode"
}
202+
203+
# Run each enabled configuration in turn: baseline first, then MCP-Full.
case "$RUN_BASELINE" in
    true) run_mode "baseline" "none" ;;
esac

case "$RUN_FULL" in
    true) run_mode "sourcegraph_full" "sourcegraph_full" ;;
esac

print_validation_summary "$JOBS_BASE"

# Blank separator line, then the location of the results.
printf '\n%s\n' "Done. Results: $JOBS_BASE"

0 commit comments

Comments
 (0)