Skip to content

Commit 2e085d2

Browse files
sjarmak and claude committed
fix: resolve merge conflicts in Dockerfiles and openhands config
Conflict resolution from concurrent upstream changes:
- Dockerfile base image tags simplified (golang:1.23-bookworm → golang:1.23)
- openhands_2config.sh simplified to flat directory structure
- .gitignore: add sourcegraph_tasks/ exclusion

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1c9a6bd commit 2e085d2

File tree

5 files changed

+21
-70
lines changed

5 files changed

+21
-70
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ vendor/dependeval_repos/
7474
# Oracle curation logs (auto-generated, not needed for benchmark)
7575
oracle_curation_log.json
7676

77+
# Experimental / internal task mining (not for public benchmark)
78+
sourcegraph_tasks/
79+
7780
# Credentials
7881
*.key
7982
*.pem

benchmarks/backups/csb_sdlc_design/envoy-routeconfig-dep-chain-001/environment/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM golang:1.23-bookworm AS base
1+
FROM golang:1.23 AS base
22

33
# Install git, bash, and python3 (needed by verifier test.sh scoring)
44
RUN apt-get update && apt-get install -y --no-install-recommends git bash python3 && rm -rf /var/lib/apt/lists/*

benchmarks/backups/csb_sdlc_design/envoy-routeconfig-dep-chain-001/environment/Dockerfile.artifact_only

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
# MCP agent deletes source files at runtime via agent startup script.
44
# Verifier applies patches from review.json to /repo_full copy for scoring.
55

6-
FROM golang:1.23-alpine AS base
6+
FROM golang:1.23 AS base
77

88
# Install git, bash, and python3 (needed by verifier test.sh scoring)
9-
RUN apk add --no-cache git bash python3
9+
RUN apt-get update && apt-get install -y --no-install-recommends git bash python3 && rm -rf /var/lib/apt/lists/*
1010

1111
# Clone repos at pinned commits
1212
WORKDIR /workspace

benchmarks/backups/csb_sdlc_design/envoy-routeconfig-dep-chain-001/environment/Dockerfile.sg_only

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,16 @@
22
# Empty workspace — agent uses Sourcegraph MCP for code access.
33
# Verifier clones mirror(s) at verification time via clone manifest.
44

5-
FROM golang:1.23-bookworm
5+
FROM golang:1.23
66

7-
ENV DEBIAN_FRONTEND=noninteractive
8-
ENV SOURCEGRAPH_REPOS="sg-evals/data-plane-api--84e84367,sg-evals/go-control-plane--71637ad6,sg-evals/istio--4c1f845d"
7+
ENV SOURCEGRAPH_REPOS=sg-evals/istio--4c1f845d,sg-evals/go-control-plane--71637ad6,sg-evals/data-plane-api--84e84367
98

9+
ENV DEBIAN_FRONTEND=noninteractive
1010
RUN apt-get update && apt-get install -y --no-install-recommends \
1111
git \
12-
ca-certificates \
1312
python3 \
1413
curl \
14+
bash \
1515
&& rm -rf /var/lib/apt/lists/*
1616

1717
WORKDIR /workspace

configs/openhands_2config.sh

Lines changed: 11 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -151,22 +151,20 @@ if [ ${#TASK_ROWS[@]} -eq 0 ]; then
151151
fi
152152

153153
declare -A TASK_PATH_BY_ID
154-
declare -A TASK_SUITE_BY_ID
155154
TASK_IDS=()
156155
for row in "${TASK_ROWS[@]}"; do
157156
task_id=$(echo "$row" | cut -f1)
158157
task_path=$(echo "$row" | cut -f2)
159-
benchmark=$(echo "$row" | cut -f3)
160158
TASK_IDS+=("$task_id")
161159
TASK_PATH_BY_ID["$task_id"]="$task_path"
162-
TASK_SUITE_BY_ID["$task_id"]="$benchmark"
163160
done
164161

165162
if [ -z "${PARALLEL_JOBS:-}" ] || [ "$PARALLEL_JOBS" -lt 1 ] 2>/dev/null; then
166163
PARALLEL_JOBS=0 # sentinel; setup_multi_accounts will auto-set
167164
fi
168165

169166
# Multi-account support: rotate OAuth tokens across accounts.
167+
# REAL_HOME must be set before setup_multi_accounts.
170168
REAL_HOME="$HOME"
171169
setup_multi_accounts
172170

@@ -183,17 +181,6 @@ case "$_model_lower" in
183181
*) MODEL_SHORT=$(echo "$_model_lower" | tr -d '-' | tr -d '_' | cut -c1-12) ;;
184182
esac
185183

186-
# Dotted model version for official directory structure (e.g. sonnet-4.6)
187-
case "$_model_lower" in
188-
*sonnet-4-6*|*sonnet46*) MODEL_DIR="sonnet-4.6" ;;
189-
*sonnet-4-5*|*sonnet45*) MODEL_DIR="sonnet-4.5" ;;
190-
*opus-4-6*|*opus46*) MODEL_DIR="opus-4.6" ;;
191-
*haiku-4-5*|*haiku45*) MODEL_DIR="haiku-4.5" ;;
192-
*gpt-5*|*gpt5*) MODEL_DIR="gpt-5" ;;
193-
*gpt-4o*|*gpt4o*) MODEL_DIR="gpt-4o" ;;
194-
*) MODEL_DIR="$MODEL_SHORT" ;;
195-
esac
196-
197184
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
198185
JOBS_BASE="runs/${CATEGORY}/openhands_${MODEL_SHORT}_${TIMESTAMP}"
199186
mkdir -p "$JOBS_BASE"
@@ -216,54 +203,17 @@ echo "ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:+set (${#ANTHROPIC_API_KEY} chars)}
216203
echo "Storage override: ${DAYTONA_OVERRIDE_STORAGE:-<none>} MB"
217204
echo ""
218205

219-
if [ "${HARBOR_ENV:-}" = "daytona" ]; then
220-
clear_daytona_cost_guard_ready
221-
_cost_guard_cmd=(
222-
python3 "$REPO_ROOT/scripts/daytona_cost_guard.py" preflight
223-
--selection-file "$SELECTION_FILE"
224-
--parallel-tasks "$PARALLEL_JOBS"
225-
--concurrency "$CONCURRENCY"
226-
--policy "$DAYTONA_COST_POLICY"
227-
)
228-
[ -n "$BENCHMARK_FILTER" ] && _cost_guard_cmd+=(--benchmark "$BENCHMARK_FILTER")
229-
for task_id in "${TASK_IDS[@]}"; do
230-
_cost_guard_cmd+=(--task-id "$task_id")
231-
done
232-
[ "$RUN_BASELINE" = true ] && _cost_guard_cmd+=(--config "baseline-local-direct")
233-
[ "$RUN_FULL" = true ] && _cost_guard_cmd+=(--config "mcp-remote-direct")
234-
"${_cost_guard_cmd[@]}" || exit 1
235-
mark_daytona_cost_guard_ready
236-
fi
237-
238206
_openhands_run_single() {
239207
local task_id=$1
240208
local _task_home=$2
241209
local config=${3:-baseline-local-direct}
242210
local mcp_type=${4:-none}
243211
local jobs_base=${5:-$JOBS_BASE}
212+
local jobs_subdir="${jobs_base}/${config}"
244213
local task_path="${TASK_PATH_BY_ID[$task_id]}"
245214

246-
# Map harness config name to official config dir name
247-
local official_config
248-
case "$config" in
249-
baseline-local-direct) official_config="baseline" ;;
250-
mcp-remote-direct) official_config="sourcegraph_full" ;;
251-
*) official_config="$config" ;;
252-
esac
253-
254-
# Build official-structure jobs dir:
255-
# {jobs_base}/openhands/{csb_sdlc|csb_org}/{model_dir}/{suite}/{official_config}
256-
local suite="${TASK_SUITE_BY_ID[$task_id]}"
257-
local top_level
258-
if [[ "$suite" == csb_sdlc_* ]]; then
259-
top_level="csb_sdlc"
260-
else
261-
top_level="csb_org"
262-
fi
263-
local jobs_subdir="${jobs_base}/openhands/${top_level}/${MODEL_DIR}/${suite}/${official_config}"
264-
265215
# Extract ANTHROPIC_API_KEY from this account's OAuth credentials.
266-
# run_tasks_parallel sets HOME=$_task_home for account rotation.
216+
# run_tasks_parallel sets HOME=$_task_home, so we read that account's token.
267217
if [ "$USE_SUBSCRIPTION" = "true" ]; then
268218
local _acct_token
269219
_acct_token=$(python3 -c "
@@ -312,19 +262,14 @@ if os.path.exists(creds_file):
312262
fi
313263

314264
echo "Running task: $task_id ($config)"
315-
DAYTONA_LABEL_RUN_ID="$(basename "$JOBS_BASE")" \
316-
DAYTONA_LABEL_BENCHMARK="${TASK_SUITE_BY_ID[$task_id]}" \
317-
DAYTONA_LABEL_TASK_ID="$task_id" \
318-
DAYTONA_LABEL_CONFIG="$config" \
319-
DAYTONA_LABEL_CATEGORY="$CATEGORY" \
320-
TASK_SOURCE_DIR="$task_path" \
321-
BASELINE_MCP_TYPE="$mcp_type" harbor_run_guarded \
265+
BASELINE_MCP_TYPE="$mcp_type" harbor run \
322266
--path "$_run_path" \
323267
--agent-import-path "$AGENT_PATH" \
324268
--model "$MODEL" \
325269
--jobs-dir "$jobs_subdir" \
326270
-n "$CONCURRENCY" \
327271
--timeout-multiplier "$TIMEOUT_MULTIPLIER" \
272+
${HARBOR_ENV:+--env "$HARBOR_ENV"} \
328273
${DAYTONA_OVERRIDE_STORAGE:+--override-storage-mb "$DAYTONA_OVERRIDE_STORAGE"} \
329274
2>&1 | tee "${jobs_subdir}/${task_id}.log" \
330275
|| echo "WARNING: Task $task_id ($config) failed"
@@ -334,20 +279,23 @@ run_mode() {
334279
local mode=$1
335280
local mcp_type=$2
336281

282+
jobs_subdir="${JOBS_BASE}/${mode}"
283+
mkdir -p "$jobs_subdir"
284+
337285
_mode_dispatch() {
338286
_openhands_run_single "$1" "$2" "$mode" "$mcp_type" "$JOBS_BASE"
339287
}
340288

341289
run_tasks_parallel TASK_IDS _mode_dispatch || true
342-
validate_and_report "$JOBS_BASE" "$mode"
290+
validate_and_report "$jobs_subdir" "$mode"
343291
}
344292

345293
if [ "$PAIRED_MODE" = true ] && [ "$RUN_BASELINE" = true ] && [ "$RUN_FULL" = true ]; then
346294
# Run baseline + MCP simultaneously per task (interleaved, not sequential)
347295
export FULL_CONFIG="mcp-remote-direct"
348296
run_paired_configs TASK_IDS _openhands_run_single "$JOBS_BASE"
349-
validate_and_report "$JOBS_BASE" "baseline"
350-
validate_and_report "$JOBS_BASE" "sourcegraph_full"
297+
validate_and_report "${JOBS_BASE}/baseline-local-direct" "baseline-local-direct"
298+
validate_and_report "${JOBS_BASE}/mcp-remote-direct" "mcp-remote-direct"
351299
else
352300
# Sequential mode (--baseline-only, --full-only, or --sequential)
353301
if [ "$RUN_BASELINE" = true ]; then

0 commit comments

Comments
 (0)