-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathcontextbench-five-lane-score.yml
More file actions
134 lines (131 loc) · 6.41 KB
/
contextbench-five-lane-score.yml
File metadata and controls
134 lines (131 loc) · 6.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
name: ContextBench Five Lane Score

# Triggers: pushes to master that touch one of the listed files, plus manual
# runs started through workflow_dispatch.
on:
  push:
    branches: [master]
    # NOTE(review): the job also runs scripts/contextbench-runner.mjs and
    # scripts/contextbench-select-slice.mjs, which are not listed here, so a
    # change to either one alone will not start this workflow — confirm that
    # is intended.
    paths:
      - .github/workflows/contextbench-five-lane-score.yml
      - scripts/contextbench-score-five-lane-selections.mjs
      - scripts/contextbench-score-five-lane-artifact-selections.mjs
      - scripts/contextbench-build-publishable-report.mjs
      - scripts/contextbench-print-publishable-report.mjs
      - scripts/contextbench-five-lane-selections.json
      - tests/fixtures/contextbench-benchmark-protocol.json
      - tests/fixtures/contextbench-lanes.json
      - tests/fixtures/contextbench-task-manifest.json
  workflow_dispatch:
# Read-only job token. `contents: read` covers the repository checkout;
# `actions: read` is requested for the steps that download artifacts produced
# by other workflow runs.
permissions:
  contents: read
  actions: read
jobs:
  # Materializes one ContextBench task, runs the five-lane scoring script
  # against it, builds and prints the publishable report, and uploads the
  # outputs as a workflow artifact.
  five-lane-score:
    runs-on: ubuntu-latest
    timeout-minutes: 45
    env:
      # Scratch directory that receives logs, payloads and reports.
      ROOT: /tmp/contextbench-five-lane-score
      # Single-task payload file; the unfiltered payloads are written next to
      # it with an extra ".all" suffix.
      TASK_PAYLOADS: /tmp/contextbench-five-lane-score/task-payloads.json
      CHECKOUT_ROOT: /tmp/contextbench-five-lane-score-checkouts
      # Destination of the ContextBench clone made in the install step.
      OFFICIAL_CONTEXTBENCH: /tmp/contextbench-five-lane-score/ContextBench-official
      # instance_id of the one task kept from the generated payloads.
      TARGET_TASK_ID: SWE-Bench-Pro__go__maintenance__bugfix__4df06349
      # The next value and REQUIRED_LANES are not referenced by any step in
      # this file; presumably the scoring script reads them — verify there.
      SOURCE_SELECTIONS_PATH: scripts/contextbench-five-lane-selections.json
      # Directory the readiness artifacts are downloaded into.
      EXTERNAL_READINESS_ROOT: /tmp/contextbench-five-lane-score/external-readiness
      REQUIRED_LANES: raw-native,codebase-context,codebase-memory-mcp,grepai,ripgrep-lexical
    steps:
      - uses: actions/checkout@v4
      # NOTE(review): v2 is an older major of this action. Before upgrading,
      # check that an explicit `version` does not conflict with a
      # `packageManager` field in package.json.
      - uses: pnpm/action-setup@v2
        with:
          version: '10'
      - uses: actions/setup-node@v4
        with:
          node-version: '24'
          cache: pnpm
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install and materialize selected Go task
        shell: bash
        run: |
          set -euo pipefail
          mkdir -p "$ROOT" "$CHECKOUT_ROOT" "$ROOT/logs" "$EXTERNAL_READINESS_ROOT"

          # Every command below sends its output to a file under $ROOT/logs;
          # that directory is uploaded by the final step even when this one
          # fails.
          pnpm install --frozen-lockfile > "$ROOT/logs/pnpm-install.log" 2>&1
          # NOTE(review): datasets and pyarrow are unpinned, and the clone
          # below takes whatever the default branch currently points at.
          python -m pip install \
            "tree-sitter==0.20.4" "tree-sitter-languages==1.10.2" datasets pyarrow \
            > "$ROOT/logs/pip-install.log" 2>&1
          git clone --depth 1 https://github.com/EuniAI/ContextBench.git \
            "$OFFICIAL_CONTEXTBENCH" \
            > "$ROOT/logs/contextbench-clone.log" 2>&1
          node scripts/contextbench-runner.mjs --validate-fixtures \
            > "$ROOT/logs/validate-fixtures.log" 2>&1

          # Write payloads for all tasks: up to three attempts, five seconds
          # apart, one log file per attempt. The step fails after the third
          # failed attempt.
          for attempt in 1 2 3; do
            node scripts/contextbench-select-slice.mjs \
              --write-task-payloads \
              --out "$TASK_PAYLOADS.all" \
              --checkout-root "$CHECKOUT_ROOT" \
              > "$ROOT/logs/write-payloads-$attempt.log" 2>&1 && break
            if [ "$attempt" = 3 ]; then exit 1; fi
            sleep 5
          done

          # Reduce the payload file to the single task named by TARGET_TASK_ID.
          node - <<'NODE'
          const fs = require('node:fs');
          const payloadPath = process.env.TASK_PAYLOADS;
          const target = process.env.TARGET_TASK_ID;
          const payload = JSON.parse(fs.readFileSync(`${payloadPath}.all`, 'utf8'));
          const task = payload.tasks.find((candidate) => candidate.instance_id === target);
          // Abort instead of writing a payload file that lacks the target task.
          if (!task) throw new Error(`target task ${target} not found`);
          fs.writeFileSync(payloadPath, `${JSON.stringify({ ...payload, task_count: 1, tasks: [task] }, null, 2)}\n`);
          NODE

          # Materialize the checkout for that task, with the same retry policy.
          for attempt in 1 2 3; do
            node scripts/contextbench-select-slice.mjs \
              --materialize-checkouts \
              --payloads "$TASK_PAYLOADS" \
              --max-tasks 1 \
              > "$ROOT/logs/materialize-$attempt.log" 2>&1 && break
            if [ "$attempt" = 3 ]; then exit 1; fi
            sleep 5
          done
      # NOTE(review): both downloads are pinned to one specific earlier run.
      # They will presumably start failing once those artifacts are no longer
      # retained — confirm the retention or refresh the ids.
      - name: Download GrepAI readiness artifact
        uses: actions/download-artifact@v4
        with:
          name: contextbench-grepai-readiness
          # Quoted so the id stays a string instead of a YAML integer.
          run-id: '25643757046'
          github-token: ${{ github.token }}
          path: ${{ env.EXTERNAL_READINESS_ROOT }}/grepai
      - name: Download ripgrep readiness artifact
        uses: actions/download-artifact@v4
        with:
          name: contextbench-ripgrep-readiness
          run-id: '25644197513'
          github-token: ${{ github.token }}
          path: ${{ env.EXTERNAL_READINESS_ROOT }}/ripgrep
      - name: Score five ready lane selections
        shell: bash
        run: |
          # Errexit is switched off so the log tail is still printed when the
          # scorer fails; the scorer's exit status is then returned as the
          # step result.
          set +e
          node scripts/contextbench-score-five-lane-artifact-selections.mjs \
            > "$ROOT/logs/score-five-lane.log" 2>&1
          status=$?
          tail -n 120 "$ROOT/logs/score-five-lane.log"
          exit "$status"
      - name: Build publishable pilot report
        shell: bash
        run: |
          node scripts/contextbench-build-publishable-report.mjs \
            --summary "$ROOT/summary.json" \
            --protocol tests/fixtures/contextbench-benchmark-protocol.json \
            --lanes tests/fixtures/contextbench-lanes.json \
            --task-manifest tests/fixtures/contextbench-task-manifest.json \
            --out "$ROOT/publishable-summary.json" \
            --validation-out "$ROOT/publishable-validation.json" \
            --humanized-out "$ROOT/humanized-summary.md"
      - name: Print compact publishable pilot report
        shell: bash
        run: node scripts/contextbench-print-publishable-report.mjs "$ROOT/publishable-summary.json"
      # Runs even after an earlier failure so logs and partial outputs are
      # kept. Paths are built from the job-level env values rather than
      # repeated literals, so they cannot drift from ROOT.
      - name: Upload five-lane score artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: contextbench-five-lane-score
          path: |
            ${{ env.ROOT }}/summary.json
            ${{ env.ROOT }}/publishable-summary.json
            ${{ env.ROOT }}/publishable-validation.json
            ${{ env.ROOT }}/humanized-summary.md
            ${{ env.TASK_PAYLOADS }}
            ${{ env.ROOT }}/logs/**
            ${{ env.ROOT }}/lane-score/summary.json
            ${{ env.ROOT }}/lane-score/gold.json
            ${{ env.ROOT }}/lane-score/gold-command.json
            ${{ env.ROOT }}/lane-score/selections.json
            ${{ env.ROOT }}/lane-score/*/selection.json
            ${{ env.ROOT }}/lane-score/*/prediction.json
            ${{ env.ROOT }}/lane-score/*/official-score.jsonl
            ${{ env.ROOT }}/lane-score/*/evaluator-command.json
            ${{ env.EXTERNAL_READINESS_ROOT }}/**/*.json
            ${{ env.EXTERNAL_READINESS_ROOT }}/**/*.jsonl
            ${{ env.EXTERNAL_READINESS_ROOT }}/**/*.md
          retention-days: 14