#!/bin/bash
# Codex Harness 2-Config Runner
#
# Runs selected tasks across 2 configurations:
#   1. Baseline (BASELINE_MCP_TYPE=none)
#   2. MCP-Full (BASELINE_MCP_TYPE=sourcegraph_full)
#
# Usage:
#   ./configs/codex_2config.sh [OPTIONS]
#
# Options:
#   --baseline-only      Run only baseline (no MCP)
#   --full-only          Run only MCP-Full (sourcegraph_full)
#   --model MODEL        Override model (default: gpt-5.3-codex)
#   --agent-path PATH    Override Harbor agent import path
#   --parallel N         Max parallel task subshells (default: 1)
#   --category CATEGORY  Run category label for jobs dir (default: staging)
#   --benchmark BENCH    Optional benchmark filter (e.g. ccb_crossrepo)
# Exit on error; fail a pipeline if any stage fails (without pipefail the
# 'harbor ... | tee' pipelines below would only ever report tee's status).
set -eo pipefail

# Resolve the configs/ dir and run from the repo root so relative paths
# (benchmarks/, runs/) resolve consistently.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR/.."

# Agent code lives outside the repo by default; callers may override AGENT_DIR.
AGENT_DIR="${AGENT_DIR:-$HOME/evals/custom_agents/agents/claudecode}"
# ${PYTHONPATH:+:$PYTHONPATH} avoids a trailing ':' (which would implicitly
# add the current directory to Python's module search path) when PYTHONPATH
# is unset or empty.
export PYTHONPATH="${AGENT_DIR}:$(pwd)${PYTHONPATH:+:$PYTHONPATH}"

# Shared helpers (validation/reporting and run helpers)
source "$SCRIPT_DIR/_common.sh"

# Defaults; all overridable via environment or the CLI flags parsed below.
SELECTION_FILE="$SCRIPT_DIR/selected_benchmark_tasks.json"
AGENT_PATH="${AGENT_PATH:-agents.codex_driver_agent:CodexDriverAgent}"
MODEL="${MODEL:-gpt-5.3-codex}"
CATEGORY="${CATEGORY:-staging}"
BENCHMARK_FILTER=""
CONCURRENCY=2          # harbor-internal rollout concurrency per task
TIMEOUT_MULTIPLIER=10
RUN_BASELINE=true
RUN_FULL=true

# parse_args FLAGS...
# Mutates the global config variables (MODEL, AGENT_PATH, PARALLEL_JOBS,
# CATEGORY, BENCHMARK_FILTER, RUN_BASELINE, RUN_FULL) from CLI flags.
# Exits non-zero on an unknown flag or on a value flag missing its argument.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case $1 in
      --baseline-only)
        RUN_FULL=false
        shift
        ;;
      --full-only)
        RUN_BASELINE=false
        shift
        ;;
      --model|--agent-path|--parallel|--category|--benchmark)
        # These all take a value; fail loudly if it is missing instead of
        # letting 'shift 2' fail silently under set -e.
        if [[ $# -lt 2 ]]; then
          echo "ERROR: $1 requires an argument" >&2
          exit 1
        fi
        case $1 in
          --model) MODEL="$2" ;;
          --agent-path) AGENT_PATH="$2" ;;
          --parallel) PARALLEL_JOBS="$2" ;;
          --category) CATEGORY="$2" ;;
          --benchmark) BENCHMARK_FILTER="$2" ;;
        esac
        shift 2
        ;;
      *)
        echo "Unknown option: $1"
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# Abort early when the curated task-selection manifest is missing.
if [[ ! -f "$SELECTION_FILE" ]]; then
  echo "ERROR: selected_benchmark_tasks.json not found at $SELECTION_FILE"
  exit 1
fi

# select_tasks SELECTION_FILE BENCHMARK_FILTER
# Prints one TSV row per runnable task:
#   task_id<TAB>benchmarks/<task_dir><TAB>benchmark
# Tasks flagged "excluded" are skipped; a non-empty filter keeps only tasks
# whose "benchmark" field matches it exactly.
select_tasks() {
  python3 - "$1" "$2" <<'PYEOF'
import json
import sys

selection_file = sys.argv[1]
benchmark_filter = sys.argv[2]

with open(selection_file) as fh:
    data = json.load(fh)

for task in data.get("tasks", []):
    if task.get("excluded", False):
        continue
    if benchmark_filter and task.get("benchmark") != benchmark_filter:
        continue
    task_id = task["task_id"]
    task_dir = task["task_dir"]
    benchmark = task.get("benchmark", "")
    print(f"{task_id}\tbenchmarks/{task_dir}\t{benchmark}")
PYEOF
}

readarray -t TASK_ROWS < <(select_tasks "$SELECTION_FILE" "$BENCHMARK_FILTER")

# A filter that matches nothing is almost certainly a typo — fail fast.
if (( ${#TASK_ROWS[@]} == 0 )); then
  echo "ERROR: no tasks selected after filters"
  exit 1
fi

declare -A TASK_PATH_BY_ID
TASK_IDS=()

# index_tasks: (re)build TASK_IDS / TASK_PATH_BY_ID from the TSV rows in
# TASK_ROWS, preserving selection order. Uses 'read' instead of the original
# two 'cut' subshells per row.
index_tasks() {
  local row task_id task_path
  TASK_IDS=()
  TASK_PATH_BY_ID=()
  for row in "${TASK_ROWS[@]}"; do
    # Rows are TAB-separated: task_id, task_path, benchmark (unused here).
    IFS=$'\t' read -r task_id task_path _ <<<"$row"
    TASK_IDS+=("$task_id")
    TASK_PATH_BY_ID["$task_id"]="$task_path"
  done
}
index_tasks

# _normalize_parallel VALUE
# Echoes VALUE if it is a positive integer, otherwise 1. The original numeric
# test ('[ "$PARALLEL_JOBS" -lt 1 ] 2>/dev/null') returned status 2 for a
# non-numeric value, so e.g. '--parallel abc' silently survived; this rejects
# anything that is not a positive integer.
_normalize_parallel() {
  local value=${1:-}
  if [[ "$value" =~ ^[0-9]+$ ]] && [ "$value" -ge 1 ]; then
    printf '%s\n' "$value"
  else
    printf '1\n'
  fi
}
PARALLEL_JOBS=$(_normalize_parallel "${PARALLEL_JOBS:-}")

# run_tasks_parallel (from _common.sh) expects a CLAUDE_HOMES array of HOME
# directories to rotate across task subshells; Codex harness runs need no
# per-task isolation, so every slot reuses the caller's HOME.
CLAUDE_HOMES=("$HOME")
# NOTE(review): REAL_HOME is not read in this file — presumably consumed by
# the helpers sourced from _common.sh; confirm before removing.
REAL_HOME="$HOME"

# _model_short MODEL
# Echoes a short, filesystem-friendly label for MODEL: strips any provider
# prefix (text before the last '/'), lowercases, then maps known model
# families to fixed labels; unknown models are squashed (dashes/underscores
# removed) and truncated to 12 chars.
_model_short() {
  local lower
  lower=$(printf '%s\n' "$1" | awk -F/ '{print $NF}' | tr '[:upper:]' '[:lower:]')
  case "$lower" in
    *gpt-5.3-codex*|*gpt53codex*) printf 'gpt53codex\n' ;;
    *gpt-5*|*gpt5*) printf 'gpt5\n' ;;
    *gpt-4o*|*gpt4o*) printf 'gpt4o\n' ;;
    *gpt-4*|*gpt4*) printf 'gpt4\n' ;;
    *) printf '%s\n' "$lower" | tr -d '_-' | cut -c1-12 ;;
  esac
}
MODEL_SHORT=$(_model_short "$MODEL")

# One timestamped jobs root per invocation; per-config subdirs live below it.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
JOBS_BASE="runs/${CATEGORY}/codex_${MODEL_SHORT}_${TIMESTAMP}"
mkdir -p "$JOBS_BASE"

# Echo the effective configuration before any work starts.
cat <<EOF
==============================================
Codex 2-Config Runner
==============================================
Model: $MODEL
Agent path: $AGENT_PATH
Benchmark filter: ${BENCHMARK_FILTER:-<all selected benchmarks>}
Task count: ${#TASK_IDS[@]}
Parallel jobs: $PARALLEL_JOBS
Jobs directory: $JOBS_BASE
Run baseline: $RUN_BASELINE
Run MCP-Full: $RUN_FULL

EOF

# _codex_run_single TASK_ID TASK_HOME [CONFIG] [MCP_TYPE] [JOBS_BASE]
# Runs one harbor rollout for TASK_ID under the given config label, teeing
# harbor output to <jobs_base>/<config>/<task_id>.log. Returns non-zero for
# an unknown MCP type or a missing task dir; a harbor failure only prints a
# WARNING (return 0) so sibling tasks keep running.
_codex_run_single() {
  local task_id=$1
  local _task_home=$2   # unused; kept to match run_tasks_parallel's callback contract
  local config=${3:-baseline}
  local mcp_type=${4:-none}
  local jobs_base=${5:-$JOBS_BASE}
  local jobs_subdir="${jobs_base}/${config}"
  local task_path="${TASK_PATH_BY_ID[$task_id]}"

  case "$mcp_type" in
    none|sourcegraph_full)
      ;;
    *)
      echo "ERROR: unsupported MCP mode for codex rollout: $mcp_type"
      return 1
      ;;
  esac

  mkdir -p "$jobs_subdir"

  if [ ! -d "$task_path" ]; then
    echo "ERROR: Task directory not found: $task_path"
    return 1
  fi

  echo "Running task: $task_id ($config)"
  # Run harbor in a subshell with pipefail so a harbor failure is not masked
  # by tee's (almost always zero) exit status — previously '|| echo WARNING'
  # could only ever see tee's status, so the warning never fired.
  if ! (
    set -o pipefail
    BASELINE_MCP_TYPE="$mcp_type" harbor run \
      --path "$task_path" \
      --agent-import-path "$AGENT_PATH" \
      --model "$MODEL" \
      --jobs-dir "$jobs_subdir" \
      -n "$CONCURRENCY" \
      --timeout-multiplier "$TIMEOUT_MULTIPLIER" \
      2>&1 | tee "${jobs_subdir}/${task_id}.log"
  ); then
    echo "WARNING: Task $task_id ($config) failed"
  fi
}

# run_mode MODE MCP_TYPE
# Fans all selected tasks out through run_tasks_parallel with the given
# BASELINE_MCP_TYPE, then validates/reports the mode's jobs subdirectory.
run_mode() {
  local mode=$1
  local mcp_type=$2
  # 'local' added: the original leaked jobs_subdir into the global scope.
  local jobs_subdir="${JOBS_BASE}/${mode}"
  mkdir -p "$jobs_subdir"

  # Bash's dynamic scoping lets this nested definition read run_mode's
  # locals (mode, mcp_type) for as long as it is invoked during this call —
  # which run_tasks_parallel does.
  _mode_dispatch() {
    _codex_run_single "$1" "$2" "$mode" "$mcp_type" "$JOBS_BASE"
  }

  # '|| true': individual task failures must not abort the whole sweep.
  run_tasks_parallel TASK_IDS _mode_dispatch || true
  validate_and_report "$jobs_subdir" "$mode"
}

# Run whichever configurations were requested, baseline first so its results
# land before the MCP-enabled run. ('if' kept — a bare '[[ ]] && cmd' would
# trip set -e when the condition is false.)
if [[ "$RUN_BASELINE" == true ]]; then
  run_mode "baseline" "none"
fi

if [[ "$RUN_FULL" == true ]]; then
  run_mode "sourcegraph_full" "sourcegraph_full"
fi

# Cross-config summary over the shared jobs root.
print_validation_summary "$JOBS_BASE"

printf '\nDone. Results: %s\n' "$JOBS_BASE"