@@ -151,22 +151,20 @@ if [ ${#TASK_ROWS[@]} -eq 0 ]; then
151151fi
152152
153153declare -A TASK_PATH_BY_ID
154- declare -A TASK_SUITE_BY_ID
155154TASK_IDS=()
156155for row in " ${TASK_ROWS[@]} " ; do
157156 task_id=$( echo " $row " | cut -f1)
158157 task_path=$( echo " $row " | cut -f2)
159- benchmark=$( echo " $row " | cut -f3)
160158 TASK_IDS+=(" $task_id " )
161159 TASK_PATH_BY_ID[" $task_id " ]=" $task_path "
162- TASK_SUITE_BY_ID[" $task_id " ]=" $benchmark "
163160done
164161
165162if [ -z " ${PARALLEL_JOBS:- } " ] || [ " $PARALLEL_JOBS " -lt 1 ] 2> /dev/null; then
166163 PARALLEL_JOBS=0 # sentinel; setup_multi_accounts will auto-set
167164fi
168165
169166# Multi-account support: rotate OAuth tokens across accounts.
167+ # REAL_HOME must be set before setup_multi_accounts.
170168REAL_HOME=" $HOME "
171169setup_multi_accounts
172170
@@ -183,17 +181,6 @@ case "$_model_lower" in
183181 * ) MODEL_SHORT=$( echo " $_model_lower " | tr -d ' -' | tr -d ' _' | cut -c1-12) ;;
184182esac
185183
186- # Dotted model version for official directory structure (e.g. sonnet-4.6)
187- case " $_model_lower " in
188- * sonnet-4-6* |* sonnet46* ) MODEL_DIR=" sonnet-4.6" ;;
189- * sonnet-4-5* |* sonnet45* ) MODEL_DIR=" sonnet-4.5" ;;
190- * opus-4-6* |* opus46* ) MODEL_DIR=" opus-4.6" ;;
191- * haiku-4-5* |* haiku45* ) MODEL_DIR=" haiku-4.5" ;;
192- * gpt-5* |* gpt5* ) MODEL_DIR=" gpt-5" ;;
193- * gpt-4o* |* gpt4o* ) MODEL_DIR=" gpt-4o" ;;
194- * ) MODEL_DIR=" $MODEL_SHORT " ;;
195- esac
196-
197184TIMESTAMP=$( date +%Y%m%d_%H%M%S)
198185JOBS_BASE=" runs/${CATEGORY} /openhands_${MODEL_SHORT} _${TIMESTAMP} "
199186mkdir -p " $JOBS_BASE "
@@ -216,54 +203,17 @@ echo "ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:+set (${#ANTHROPIC_API_KEY} chars)}
216203echo " Storage override: ${DAYTONA_OVERRIDE_STORAGE:- <none>} MB"
217204echo " "
218205
219- if [ " ${HARBOR_ENV:- } " = " daytona" ]; then
220- clear_daytona_cost_guard_ready
221- _cost_guard_cmd=(
222- python3 " $REPO_ROOT /scripts/daytona_cost_guard.py" preflight
223- --selection-file " $SELECTION_FILE "
224- --parallel-tasks " $PARALLEL_JOBS "
225- --concurrency " $CONCURRENCY "
226- --policy " $DAYTONA_COST_POLICY "
227- )
228- [ -n " $BENCHMARK_FILTER " ] && _cost_guard_cmd+=(--benchmark " $BENCHMARK_FILTER " )
229- for task_id in " ${TASK_IDS[@]} " ; do
230- _cost_guard_cmd+=(--task-id " $task_id " )
231- done
232- [ " $RUN_BASELINE " = true ] && _cost_guard_cmd+=(--config " baseline-local-direct" )
233- [ " $RUN_FULL " = true ] && _cost_guard_cmd+=(--config " mcp-remote-direct" )
234- " ${_cost_guard_cmd[@]} " || exit 1
235- mark_daytona_cost_guard_ready
236- fi
237-
238206_openhands_run_single () {
239207 local task_id=$1
240208 local _task_home=$2
241209 local config=${3:- baseline-local-direct}
242210 local mcp_type=${4:- none}
243211 local jobs_base=${5:- $JOBS_BASE }
212+ local jobs_subdir=" ${jobs_base} /${config} "
244213 local task_path=" ${TASK_PATH_BY_ID[$task_id]} "
245214
246- # Map harness config name to official config dir name
247- local official_config
248- case " $config " in
249- baseline-local-direct) official_config=" baseline" ;;
250- mcp-remote-direct) official_config=" sourcegraph_full" ;;
251- * ) official_config=" $config " ;;
252- esac
253-
254- # Build official-structure jobs dir:
255- # {jobs_base}/openhands/{csb_sdlc|csb_org}/{model_dir}/{suite}/{official_config}
256- local suite=" ${TASK_SUITE_BY_ID[$task_id]} "
257- local top_level
258- if [[ " $suite " == csb_sdlc_* ]]; then
259- top_level=" csb_sdlc"
260- else
261- top_level=" csb_org"
262- fi
263- local jobs_subdir=" ${jobs_base} /openhands/${top_level} /${MODEL_DIR} /${suite} /${official_config} "
264-
265215 # Extract ANTHROPIC_API_KEY from this account's OAuth credentials.
266- # run_tasks_parallel sets HOME=$_task_home for account rotation .
216+ # run_tasks_parallel sets HOME=$_task_home, so we read that account's token .
267217 if [ " $USE_SUBSCRIPTION " = " true" ]; then
268218 local _acct_token
269219 _acct_token=$( python3 -c "
@@ -312,19 +262,14 @@ if os.path.exists(creds_file):
312262 fi
313263
314264 echo " Running task: $task_id ($config )"
315- DAYTONA_LABEL_RUN_ID=" $( basename " $JOBS_BASE " ) " \
316- DAYTONA_LABEL_BENCHMARK=" ${TASK_SUITE_BY_ID[$task_id]} " \
317- DAYTONA_LABEL_TASK_ID=" $task_id " \
318- DAYTONA_LABEL_CONFIG=" $config " \
319- DAYTONA_LABEL_CATEGORY=" $CATEGORY " \
320- TASK_SOURCE_DIR=" $task_path " \
321- BASELINE_MCP_TYPE=" $mcp_type " harbor_run_guarded \
265+ BASELINE_MCP_TYPE=" $mcp_type " harbor run \
322266 --path " $_run_path " \
323267 --agent-import-path " $AGENT_PATH " \
324268 --model " $MODEL " \
325269 --jobs-dir " $jobs_subdir " \
326270 -n " $CONCURRENCY " \
327271 --timeout-multiplier " $TIMEOUT_MULTIPLIER " \
272+ ${HARBOR_ENV: +--env " $HARBOR_ENV " } \
328273 ${DAYTONA_OVERRIDE_STORAGE: +--override-storage-mb " $DAYTONA_OVERRIDE_STORAGE " } \
329274 2>&1 | tee " ${jobs_subdir} /${task_id} .log" \
330275 || echo " WARNING: Task $task_id ($config ) failed"
@@ -334,20 +279,23 @@ run_mode() {
334279 local mode=$1
335280 local mcp_type=$2
336281
282+ jobs_subdir=" ${JOBS_BASE} /${mode} "
283+ mkdir -p " $jobs_subdir "
284+
337285 _mode_dispatch () {
338286 _openhands_run_single " $1 " " $2 " " $mode " " $mcp_type " " $JOBS_BASE "
339287 }
340288
341289 run_tasks_parallel TASK_IDS _mode_dispatch || true
342- validate_and_report " $JOBS_BASE " " $mode "
290+ validate_and_report " $jobs_subdir " " $mode "
343291}
344292
345293if [ " $PAIRED_MODE " = true ] && [ " $RUN_BASELINE " = true ] && [ " $RUN_FULL " = true ]; then
346294 # Run baseline + MCP simultaneously per task (interleaved, not sequential)
347295 export FULL_CONFIG=" mcp-remote-direct"
348296 run_paired_configs TASK_IDS _openhands_run_single " $JOBS_BASE "
349- validate_and_report " $JOBS_BASE " " baseline"
350- validate_and_report " $JOBS_BASE " " sourcegraph_full "
297+ validate_and_report " ${ JOBS_BASE} /baseline-local-direct " " baseline-local-direct "
298+ validate_and_report " ${ JOBS_BASE} /mcp-remote-direct " " mcp-remote-direct "
351299else
352300 # Sequential mode (--baseline-only, --full-only, or --sequential)
353301 if [ " $RUN_BASELINE " = true ]; then
0 commit comments