Skip to content

Run real ContextBench gpt-5.4-mini matrix #1

Run real ContextBench gpt-5.4-mini matrix

Run real ContextBench gpt-5.4-mini matrix #1

# Benchmarks several retrieval "lanes" on a frozen ContextBench slice, asking
# gpt-5.4-mini to pick edit locations from each lane's candidates.
# Triggers: a push to master that touches this workflow file, or a manual
# dispatch whose inputs override the slice size, repeat count and lane list.
name: ContextBench Real GPT54 Mini
on:
  push:
    branches: [master]
    paths:
      - .github/workflows/contextbench-real-gpt54mini.yml
  workflow_dispatch:
    inputs:
      max_tasks:
        description: 'Frozen ContextBench tasks to run from the manifest prefix'
        required: true
        default: '3'
      repeats:
        description: 'Repeats per lane/task'
        required: true
        default: '1'
      lanes:
        description: 'Comma-separated lanes'
        required: true
        default: 'raw-native,codebase-context,jcodemunch-repomapper,grepai,codebase-memory-mcp,codegraphcontext'
permissions:
  contents: read
jobs:
  real-matrix:
    runs-on: ubuntu-latest
    timeout-minutes: 360
    env:
      ROOT: /tmp/contextbench-real-gpt54mini
      TASK_PAYLOADS: /tmp/contextbench-real-gpt54mini/task-payloads.json
      CHECKOUT_ROOT: /tmp/contextbench-checkouts
      # Push-triggered runs have no dispatch inputs, hence the `||` fallbacks.
      MAX_TASKS: ${{ github.event.inputs.max_tasks || '3' }}
      REPEATS: ${{ github.event.inputs.repeats || '1' }}
      LANES: ${{ github.event.inputs.lanes || 'raw-native,codebase-context,jcodemunch-repomapper,grepai,codebase-memory-mcp,codegraphcontext' }}
      OPENAI_MODEL: gpt-5.4-mini
      OPENAI_REASONING_EFFORT: high
      CBM_VERSION: v0.6.1
    steps:
      - uses: actions/checkout@v4
      - uses: pnpm/action-setup@v2
        with:
          version: 10
      - uses: actions/setup-node@v4
        with:
          node-version: '24'
          cache: pnpm
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install repo, evaluator, and lane tooling
        run: |
          set -euxo pipefail
          mkdir -p "$ROOT" "$CHECKOUT_ROOT" "$ROOT/tool"
          pnpm install --frozen-lockfile
          pnpm run build
          python -m pip install "tree-sitter==0.20.4" "tree-sitter-languages==1.10.2" datasets pyarrow uv codegraphcontext kuzu
          # Best-effort: a failed grepai install must not abort the other lanes.
          curl -sSL https://raw.githubusercontent.com/yoanbernabeu/grepai/main/install.sh | sh || true
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
          echo "$HOME/bin" >> "$GITHUB_PATH"
          curl -fsSL "https://github.com/DeusData/codebase-memory-mcp/releases/download/${CBM_VERSION}/codebase-memory-mcp-linux-amd64.tar.gz" -o "$ROOT/tool/cbm.tar.gz"
          tar -xzf "$ROOT/tool/cbm.tar.gz" -C "$ROOT/tool"
          chmod +x "$ROOT/tool/codebase-memory-mcp" || true
          # Up to three clone attempts; a partial clone is removed before retrying.
          for i in 1 2 3; do git clone --depth 1 https://github.com/EuniAI/ContextBench.git "$ROOT/ContextBench-official" && break || { rm -rf "$ROOT/ContextBench-official"; sleep 5; }; done
      - name: Materialize frozen ContextBench task checkouts
        run: |
          set -euxo pipefail
          node scripts/contextbench-runner.mjs --validate-fixtures
          node scripts/contextbench-select-slice.mjs --write-task-payloads --out "$TASK_PAYLOADS" --checkout-root "$CHECKOUT_ROOT"
          node scripts/contextbench-select-slice.mjs --materialize-checkouts --payloads "$TASK_PAYLOADS" --max-tasks "$MAX_TASKS"
      - name: Run gpt-5.4-mini-high scoreable matrix
        env:
          # The API key is scoped to this step only, so the third-party install
          # scripts executed by the install step above never see it.
          # NOTE(review): assumes the materialize step's scripts do not call
          # OpenAI themselves -- confirm.
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          CBM_BIN: /tmp/contextbench-real-gpt54mini/tool/codebase-memory-mcp
          OFFICIAL_CONTEXTBENCH: /tmp/contextbench-real-gpt54mini/ContextBench-official
        run: |
          cat > "$ROOT/run-real-matrix.mjs" <<'NODE'
import { spawn, spawnSync } from 'node:child_process';
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
import { basename, join } from 'node:path';
/**
 * Reads a positive-integer setting from the environment.
 *
 * @param {string} name - Environment variable to read.
 * @param {string} fallback - Value used when the variable is unset or empty.
 * @returns {number} The parsed integer (>= 1).
 * @throws {Error} When the value is not a positive integer. Without this check
 *   a typo in a dispatch input becomes NaN and silently yields zero runs.
 */
function positiveIntFromEnv(name, fallback) {
  const raw = String(process.env[name] || fallback).trim();
  const value = Number(raw);
  if (!Number.isInteger(value) || value < 1) {
    throw new Error(`${name} must be a positive integer, got ${JSON.stringify(raw)}`);
  }
  return value;
}
// Matrix configuration; every value is supplied through environment variables.
const root = process.env.ROOT;
if (!root) throw new Error('ROOT is not set');
const outRoot = join(root, 'matrix');
const payloads = JSON.parse(readFileSync(process.env.TASK_PAYLOADS, 'utf8'));
if (!Array.isArray(payloads?.tasks)) throw new Error('TASK_PAYLOADS must contain a "tasks" array');
const maxTasks = positiveIntFromEnv('MAX_TASKS', '3');
const repeats = positiveIntFromEnv('REPEATS', '1');
const lanes = String(process.env.LANES || '').split(',').map((lane) => lane.trim()).filter(Boolean);
if (lanes.length === 0) throw new Error('LANES must name at least one lane');
// Only the first maxTasks entries of the payload list are run.
const tasks = payloads.tasks.slice(0, maxTasks);
mkdirSync(outRoot, { recursive: true });
/**
 * Runs a command synchronously and returns a JSON-serializable record of it.
 *
 * @param {string} cmd - Executable to spawn.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {{cwd?: string, env?: object, timeoutMs?: number}} [opts] - Working
 *   directory (default: current), environment (default: process.env) and
 *   timeout (default: 20 minutes).
 * @returns {{command: string, cwd: string, status: (number|null), signal: (string|null), error: (string|null), durationMs: number, stdout: string, stderr: string}}
 *   Never throws for spawn failures; they are reported through `error`.
 */
function run(cmd, args, opts = {}) {
  const workingDir = opts.cwd || process.cwd();
  const t0 = Date.now();
  const child = spawnSync(cmd, args, {
    cwd: workingDir,
    env: opts.env || process.env,
    encoding: 'utf8',
    timeout: opts.timeoutMs || 20 * 60 * 1000,
    maxBuffer: 96 * 1024 * 1024
  });
  const elapsed = Date.now() - t0;
  // Key order is kept stable because these records are written out as JSON.
  return {
    command: [cmd, ...args].join(' '),
    cwd: workingDir,
    status: child.status,
    signal: child.signal,
    error: child.error?.message || null,
    durationMs: elapsed,
    stdout: child.stdout || '',
    stderr: child.stderr || ''
  };
}
/**
 * Turns an arbitrary value into a path-safe identifier.
 *
 * Runs of characters outside [a-zA-Z0-9._-] become a single dash, leading and
 * trailing dashes are removed, and the result is capped at 180 characters.
 *
 * @param {*} value - Value to convert (stringified first).
 * @returns {string} The sanitized identifier.
 */
function sanitize(value) {
  const dashed = String(value).replace(/[^a-zA-Z0-9._-]+/g, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  return trimmed.slice(0, 180);
}
/**
 * Derives a short keyword query from a task's problem statement.
 *
 * Markdown/punctuation characters are replaced by spaces; words shorter than
 * four characters, listed stop words and bare "http"/"https" are dropped; the
 * first ten surviving words are joined with single spaces.
 *
 * @param {*} problem - Problem statement text (falsy values give '').
 * @returns {string} Space-separated query terms.
 */
function queryOf(problem) {
  const stopWords = new Set(['that', 'this', 'with', 'from', 'when', 'then', 'into', 'should', 'would', 'could', 'there', 'where', 'which', 'about', 'after', 'before']);
  const words = String(problem || '').replace(/[`*_#>\[\](){},.;:!?/\\]/g, ' ').split(/\s+/);
  const kept = [];
  for (const word of words) {
    if (word.length < 4) continue;
    if (stopWords.has(word.toLowerCase())) continue;
    if (/^https?$/.test(word)) continue;
    kept.push(word);
    if (kept.length === 10) break;
  }
  return kept.join(' ');
}
/**
 * Appends one normalized location to `locations`, ignoring unusable paths.
 *
 * Skipped: non-string or empty paths, anything containing "://", and paths
 * that are empty or contain ".." once leading slashes and a leading "./" are
 * removed. Line numbers are coerced so that 1 <= start <= end.
 *
 * @param {Array<object>} locations - Accumulator, mutated in place.
 * @param {*} file - Candidate file path.
 * @param {*} [start=1] - First line.
 * @param {*} [end=start] - Last line.
 * @param {string} [source='tool'] - Label of the producer.
 * @returns {void}
 */
function add(locations, file, start = 1, end = start, source = 'tool') {
  const usable = typeof file === 'string' && file.length > 0 && !file.includes('://');
  if (!usable) return;
  const relative = file.replace(/^\/+/, '').replace(/^\.\//, '');
  if (relative === '' || relative.includes('..')) return;
  const firstLine = Math.max(1, Number(start) || 1);
  const lastLine = Math.max(firstLine, Number(end) || firstLine);
  locations.push({ file: relative, start: firstLine, end: lastLine, source });
}
/**
 * Recursively scans parsed JSON for objects that look like file locations.
 *
 * For every plain object, the first truthy path-like key and line-like keys
 * are handed to `add` (which decides whether the entry is usable); then every
 * property value is visited in turn. Arrays are visited element by element.
 *
 * @param {*} value - Parsed JSON value; non-objects are ignored.
 * @param {Array<object>} locations - Accumulator passed through to `add`.
 * @param {string} [source='tool'] - Label of the producer.
 * @returns {void}
 */
function walk(value, locations, source = 'tool') {
  if (!value || typeof value !== 'object') return;
  if (Array.isArray(value)) {
    value.forEach((element) => walk(element, locations, source));
    return;
  }
  const pathLike = value.file || value.path || value.file_path || value.relative_path || value.filename || value.source_path || value.uri;
  const firstLine = value.start_line || value.startLine || value.line || value.line_number || value.start || 1;
  const lastLine = value.end_line || value.endLine || value.end || firstLine;
  add(locations, pathLike, firstLine, lastLine, source);
  Object.values(value).forEach((child) => walk(child, locations, source));
}
/**
 * Best-effort JSON extraction from tool output.
 *
 * Tries the trimmed text as a whole, then the span from the first "{" to the
 * last "}", then the span from the first "[" to the last "]".
 *
 * @param {*} text - Raw output (falsy values are treated as empty).
 * @returns {*} The first successfully parsed value, or null when nothing parses.
 */
function jsonish(text) {
  const raw = String(text || '').trim();
  if (raw.length === 0) return null;
  const attempt = (candidate) => {
    try {
      return { ok: true, value: JSON.parse(candidate) };
    } catch {
      return { ok: false };
    }
  };
  const whole = attempt(raw);
  if (whole.ok) return whole.value;
  // Each two-character string destructures into its opening and closing delimiter.
  for (const [open, close] of ['{}', '[]']) {
    const from = raw.indexOf(open);
    const to = raw.lastIndexOf(close);
    if (from < 0 || to <= from) continue;
    const inner = attempt(raw.slice(from, to + 1));
    if (inner.ok) return inner.value;
  }
  return null;
}
/**
 * Extracts "path.ext[:line]" style mentions from free text and records them.
 *
 * A mention is a run of path characters ending in one of the known source
 * extensions, optionally followed by ":", "#L" or " line " and a line number.
 * The extension must not be followed by another word character; otherwise the
 * alternation stops at a shorter extension and "App.tsx" is recorded as
 * "App.ts", "config.json" as "config.js", and "main.css" as "main.c".
 *
 * @param {*} text - Text to scan (falsy values are treated as empty).
 * @param {Array<object>} locations - Accumulator passed through to `add`.
 * @param {string} [source='tool'] - Label of the producer.
 * @returns {void}
 */
function collectFromText(text, locations, source = 'tool') {
  const re = /([A-Za-z0-9_.\/-]+\.(?:js|jsx|ts|tsx|py|go|rs|java|c|cc|cpp|h|hpp|rb|php|cs|kt|swift|vue|svelte|json|yml|yaml|md))(?![A-Za-z0-9_])(?::|#L|\s+line\s+)?(\d+)?/g;
  const haystack = String(text || '');
  let match;
  while ((match = re.exec(haystack)) !== null) {
    // A mention without a line number is recorded as line 1.
    add(locations, match[1], match[2] || 1, match[2] || 1, source);
  }
}
/**
 * Harvests candidate locations from a command record's stdout and stderr.
 *
 * Each stream is scanned twice: structurally (when `jsonish` can parse it) and
 * as free text, so the same location may be recorded more than once.
 *
 * @param {{stdout?: string, stderr?: string}} result - Command record.
 * @param {Array<object>} locations - Accumulator, mutated in place.
 * @param {string} source - Label of the producer.
 * @returns {void}
 */
function collect(result, locations, source) {
  const streams = [result?.stdout, result?.stderr];
  streams.forEach((stream) => {
    const structured = jsonish(stream);
    if (structured) walk(structured, locations, source);
    collectFromText(stream, locations, source);
  });
}
/**
 * De-duplicates locations by (file, start, end), keeping first occurrences in
 * their original order and returning at most 80 entries.
 *
 * @param {Array<{file: string, start: number, end: number}>} locations
 * @returns {Array<object>} The first 80 distinct locations.
 */
function uniqueLocations(locations) {
  const limit = 80;
  const firstByKey = new Map();
  for (const loc of locations) {
    if (firstByKey.size >= limit) break;
    const key = `${loc.file}:${loc.start}:${loc.end}`;
    if (!firstByKey.has(key)) firstByKey.set(key, loc);
  }
  // A Map iterates in insertion order, so first-seen order is preserved.
  return [...firstByKey.values()];
}
/**
 * Normalizes a predicted path the same way for lookups and for storage:
 * leading slashes and a leading "./" are removed.
 *
 * @param {string} file - Path as returned by the model.
 * @returns {string} Normalized path (may be empty).
 */
function normalizeSpanPath(file) {
  return file.replace(/^\/+/, '').replace(/^\.\//, '');
}
/**
 * Builds the prediction record written to prediction.json for one run.
 *
 * Model-selected spans are grouped per file. A file listed only in `files`
 * gets a single placeholder span 1-1 so that it still appears in the
 * prediction; files that already have spans are left alone, because adding
 * the placeholder there would inject an unrelated line into their spans.
 * At most 20 files are kept.
 *
 * @param {{instance_id: string, repo_checkout_path: string, base_commit: string}} task
 * @param {{files?: string[], spans?: Array<{file: string, start: number, end: number}>}} modelSelection
 * @returns {object} Record with instance_id, repo_url, commit, traj_data and an empty model_patch.
 */
function buildPrediction(task, modelSelection) {
  const spans = new Map();
  for (const span of modelSelection?.spans || []) {
    addSpan(spans, span.file, span.start, span.end);
  }
  for (const file of modelSelection?.files || []) {
    if (typeof file !== 'string' || spans.has(normalizeSpanPath(file))) continue;
    addSpan(spans, file, 1, 1);
  }
  const predFiles = [...spans.keys()].slice(0, 20);
  const predSpans = Object.fromEntries([...spans.entries()].slice(0, 20));
  return {
    instance_id: task.instance_id,
    repo_url: task.repo_checkout_path,
    commit: task.base_commit,
    traj_data: { pred_steps: [{ files: predFiles, spans: predSpans }], pred_files: predFiles, pred_spans: predSpans },
    model_patch: ''
  };
}
/**
 * Appends a {start, end} span to the list kept for `file` in `spans`.
 *
 * Non-string, empty, or normalizes-to-empty paths are ignored. Line numbers
 * are coerced so that 1 <= start <= end.
 *
 * @param {Map<string, Array<{start: number, end: number}>>} spans - Mutated in place.
 * @param {*} file - File path.
 * @param {*} [start=1] - First line.
 * @param {*} [end=start] - Last line.
 * @returns {void}
 */
function addSpan(spans, file, start = 1, end = start) {
  if (typeof file !== 'string' || !file) return;
  const clean = normalizeSpanPath(file);
  if (!clean) return;
  const s = Math.max(1, Number(start) || 1);
  const e = Math.max(s, Number(end) || s);
  const list = spans.get(clean) || [];
  list.push({ start: s, end: e });
  spans.set(clean, list);
}
/**
 * Reads the evaluator's JSONL output and returns its last record.
 *
 * @param {string} scorePath - Path of the score file.
 * @returns {*} The parsed last non-empty line, or null when the file is
 *   missing, empty, or its last line is not valid JSON.
 */
function parseScores(scorePath) {
  if (!existsSync(scorePath)) return null;
  const records = readFileSync(scorePath, 'utf8').trim().split(/\n+/).filter(Boolean);
  const lastRecord = records.at(-1);
  if (lastRecord === undefined) return null;
  try {
    return JSON.parse(lastRecord);
  } catch {
    return null;
  }
}
/**
 * Asks the OpenAI Responses API to choose edit locations among lane candidates.
 *
 * The request and raw response are written into `runDir`. Files and spans the
 * model returns are kept only if their file is one of the candidates shown.
 *
 * Environment: OPENAI_API_KEY (required), OPENAI_MODEL, OPENAI_REASONING_EFFORT,
 * OPENAI_MAX_OUTPUT_TOKENS (default 1800), OPENAI_TIMEOUT_MS (default 10 min).
 *
 * @param {string} runDir - Directory receiving the request/response files.
 * @param {{instance_id: string, repo: string, problem_statement: string}} task
 * @param {string} lane - Lane that produced the candidates.
 * @param {string} query - Query that was given to the lane.
 * @param {Array<{file: string}>} candidates - Candidate locations; the first 60 are sent.
 * @returns {Promise<object>} `{ ok: true, status: 'completed', parsed, ... }` on
 *   success, otherwise `{ ok: false, status, ... }` with status one of
 *   model_unavailable, model_error, model_parse_error, model_empty_after_lane_filter.
 *   Model and network failures are reported through the result, not thrown.
 */
async function callOpenAI(runDir, task, lane, query, candidates) {
  const started = Date.now();
  if (!process.env.OPENAI_API_KEY) {
    return { ok: false, status: 'model_unavailable', durationMs: 0, error: 'missing_OPENAI_API_KEY_secret' };
  }
  const shown = candidates.slice(0, 60);
  // With nothing to choose from, every answer would be filtered out below, so
  // skip the paid request and report the same status it would have produced.
  if (shown.length === 0) {
    return { ok: false, status: 'model_empty_after_lane_filter', durationMs: 0, error: 'no_lane_candidates' };
  }
  // Accept only files the model was actually shown.
  const candidateFiles = new Set(shown.map((c) => c.file));
  // NOTE(review): per the Responses API reference this cap also counts reasoning
  // tokens, so high reasoning effort may exhaust it -- confirm 1800 is enough.
  const maxOutputTokens = Number(process.env.OPENAI_MAX_OUTPUT_TOKENS) || 1800;
  const timeoutMs = Number(process.env.OPENAI_TIMEOUT_MS) || 10 * 60_000;
  const request = {
    model: process.env.OPENAI_MODEL || 'gpt-5.4-mini',
    reasoning: { effort: process.env.OPENAI_REASONING_EFFORT || 'high' },
    max_output_tokens: maxOutputTokens,
    instructions: 'You are selecting likely edit locations for ContextBench. Use only the provided candidate locations from the lane tool. Return JSON only.',
    input: JSON.stringify({
      taskId: task.instance_id,
      repo: task.repo,
      lane,
      query,
      problemStatement: task.problem_statement,
      candidateLocations: shown
    }),
    text: {
      format: {
        type: 'json_schema',
        name: 'contextbench_location_selection',
        strict: true,
        schema: {
          type: 'object',
          additionalProperties: false,
          required: ['files', 'spans', 'notes'],
          properties: {
            files: { type: 'array', maxItems: 20, items: { type: 'string' } },
            spans: {
              type: 'array',
              maxItems: 40,
              items: {
                type: 'object',
                additionalProperties: false,
                required: ['file', 'start', 'end'],
                properties: {
                  file: { type: 'string' },
                  start: { type: 'integer', minimum: 1 },
                  end: { type: 'integer', minimum: 1 }
                }
              }
            },
            notes: { type: 'string' }
          }
        }
      }
    }
  };
  // The saved copy holds `input` as an object for readability; it never contains the API key.
  writeFileSync(join(runDir, 'openai-request.redacted.json'), JSON.stringify({ ...request, input: JSON.parse(request.input) }, null, 2));
  let responseText = '';
  let responseJson = null;
  try {
    const response = await fetch('https://api.openai.com/v1/responses', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${process.env.OPENAI_API_KEY}` },
      body: JSON.stringify(request),
      // Without a deadline a stalled connection would block until the job timeout.
      signal: AbortSignal.timeout(timeoutMs)
    });
    responseText = await response.text();
    writeFileSync(join(runDir, 'openai-response.json'), responseText);
    if (!response.ok) return { ok: false, status: 'model_error', durationMs: Date.now() - started, httpStatus: response.status, error: responseText.slice(0, 2000) };
    responseJson = JSON.parse(responseText);
  } catch (error) {
    return { ok: false, status: 'model_error', durationMs: Date.now() - started, error: String(error?.message || error) };
  }
  if (!responseJson || typeof responseJson !== 'object') {
    return { ok: false, status: 'model_error', durationMs: Date.now() - started, error: 'response_not_an_object' };
  }
  const outputText = responseJson.output_text || (responseJson.output || []).flatMap((item) => item.content || []).filter((item) => item.type === 'output_text').map((item) => item.text).join('\n');
  // Reported with parse failures so a truncated response is distinguishable.
  const incompleteReason = responseJson.incomplete_details?.reason || null;
  let parsed;
  try {
    parsed = JSON.parse(outputText);
  } catch (error) {
    return { ok: false, status: 'model_parse_error', durationMs: Date.now() - started, error: String(error?.message || error), outputText, incompleteReason };
  }
  if (!parsed || typeof parsed !== 'object') {
    return { ok: false, status: 'model_parse_error', durationMs: Date.now() - started, error: 'model_output_not_an_object', outputText, incompleteReason };
  }
  const rawFiles = Array.isArray(parsed.files) ? parsed.files : [];
  const rawSpans = Array.isArray(parsed.spans) ? parsed.spans : [];
  const files = [...new Set(rawFiles.filter((file) => candidateFiles.has(file)))].slice(0, 20);
  const spans = rawSpans.filter((span) => span && candidateFiles.has(span.file)).slice(0, 40);
  if (files.length === 0 && spans.length === 0) return { ok: false, status: 'model_empty_after_lane_filter', durationMs: Date.now() - started, parsed };
  return { ok: true, status: 'completed', durationMs: Date.now() - started, parsed: { files, spans, notes: parsed.notes || '' }, responseId: responseJson.id || null, usage: responseJson.usage || null };
}
/**
 * Starts an MCP server over stdio, lists its tools, runs the given tool calls
 * in order, and returns a record shaped like the ones produced by `run`.
 *
 * The first failure (connect, list, or any call) stops the sequence; outputs
 * gathered up to that point are still returned in `stdout`.
 *
 * @param {string} command - Server executable.
 * @param {string[]} args - Server arguments.
 * @param {string} cwd - Working directory for the server process.
 * @param {Array<{name: string, arguments: object}>} calls - Tool calls to make.
 * @param {object} [env=process.env] - Environment for the server process.
 * @returns {Promise<object>} Record with status 0 on success and 1 on failure;
 *   `stdout` is the JSON-encoded list of tool outputs.
 */
async function callMcp(command, args, cwd, calls, env = process.env) {
  const { Client } = await import('@modelcontextprotocol/sdk/client/index.js');
  const { StdioClientTransport } = await import('@modelcontextprotocol/sdk/client/stdio.js');
  const transport = new StdioClientTransport({ command, args, cwd, env });
  const client = new Client({ name: 'contextbench-ci', version: '1.0.0' }, { capabilities: {} });
  const began = Date.now();
  const transcript = [];
  // Builds the record for both outcomes so the two stay structurally identical.
  const report = (status, error, stderr) => ({
    status,
    signal: null,
    error,
    durationMs: Date.now() - began,
    stdout: JSON.stringify(transcript),
    stderr,
    command: [command, ...args].join(' '),
    cwd
  });
  try {
    await client.connect(transport);
    const tools = await client.listTools();
    transcript.push({ tool: 'tools/list', result: tools });
    for (const { name, arguments: toolArgs } of calls) {
      const result = await client.callTool({ name, arguments: toolArgs });
      transcript.push({ tool: name, arguments: toolArgs, result });
    }
    await client.close();
    return report(0, null, '');
  } catch (failure) {
    // Closing is best-effort here; the original failure is what gets reported.
    try { await client.close(); } catch {}
    return report(1, String(failure?.message || failure), String(failure?.stack || failure));
  }
}
/**
 * Runs one retrieval lane against a task checkout and gathers candidate
 * edit locations from whatever the lane's tools print.
 *
 * Every command record is written to `runDir` as command-N.json and the
 * de-duplicated candidates as candidate-locations.json.
 *
 * Candidate paths are made relative to the checkout root: tools invoked with
 * the checkout path (e.g. `rg <term> <repo>`) print paths that include it, and
 * those would otherwise reach the model and the prediction with that prefix.
 *
 * @param {string} lane - Lane identifier; unknown lanes give setup_failed.
 * @param {{repo_checkout_path: string}} task - Task whose checkout is searched.
 * @param {string} runDir - Existing directory for this run's artifacts.
 * @param {string} query - Space-separated query terms.
 * @returns {Promise<{lane: string, setupStatus: string, indexStatus: string, toolCallable: boolean, candidates: Array<object>, costs: {setupDurationMs: number, indexDurationMs: number, queryDurationMs: number}, commands: Array<object>}>}
 */
async function retrieve(lane, task, runDir, query) {
  const repo = task.repo_checkout_path;
  const firstTerm = query.split(/\s+/).find(Boolean) || 'main';
  const locations = [];
  const commands = [];
  let setupStatus = 'completed';
  let indexStatus = 'completed';
  const startedSetup = Date.now();
  let setupDurationMs = 0;
  let indexDurationMs = 0;
  let queryDurationMs = 0;
  if (lane === 'raw-native') {
    setupDurationMs = Date.now() - startedSetup;
    // Empty terms are dropped: an empty pattern makes rg match every line.
    const qTerms = query.split(/\s+/).filter(Boolean).slice(0, 5);
    const started = Date.now();
    for (const term of qTerms) {
      // `-e` keeps a term that starts with "-" from being parsed as an rg flag.
      const result = run('rg', ['-n', '-i', '--glob', '!.git', '--glob', '!vendor/**', '--glob', '!node_modules/**', '-e', term, repo], { timeoutMs: 60_000 });
      commands.push(result);
      collect(result, locations, 'raw-native');
    }
    queryDurationMs = Date.now() - started;
  } else if (lane === 'codebase-context') {
    const env = { ...process.env, CODEBASE_ROOT: repo, CODEBASE_CONTEXT_ASCII: '1' };
    const version = run('node', ['dist/index.js', '--version'], { env, timeoutMs: 60_000 });
    commands.push(version);
    setupDurationMs = version.durationMs;
    const reindex = run('node', ['dist/index.js', 'reindex'], { env, timeoutMs: 20 * 60_000 });
    commands.push(reindex);
    indexDurationMs = reindex.durationMs;
    if (reindex.status !== 0) indexStatus = 'index_failed';
    const search = run('node', ['dist/index.js', 'search', '--query', query, '--intent', 'edit', '--limit', '25', '--json'], { env, timeoutMs: 5 * 60_000 });
    commands.push(search);
    queryDurationMs = search.durationMs;
    collect(search, locations, 'codebase-context');
  } else if (lane === 'codebase-memory-mcp') {
    const env = { ...process.env, CBM_CACHE_DIR: join(runDir, 'cbm-cache'), CBM_DIAGNOSTICS: '1' };
    const version = run(process.env.CBM_BIN, ['--version'], { env, timeoutMs: 60_000 });
    commands.push(version);
    setupDurationMs = version.durationMs;
    const index = run(process.env.CBM_BIN, ['cli', 'index_repository', JSON.stringify({ repo_path: repo })], { cwd: repo, env, timeoutMs: 45 * 60_000 });
    commands.push(index);
    indexDurationMs = index.durationMs;
    if (index.status !== 0) indexStatus = 'index_failed';
    // The project name comes from the indexer's output when it reports one.
    const projectObj = jsonish(index.stdout) || jsonish(index.stderr) || {};
    const project = projectObj.project || basename(repo);
    const started = Date.now();
    const graph = run(process.env.CBM_BIN, ['cli', 'search_graph', JSON.stringify({ project, query, limit: 25 })], { cwd: repo, env, timeoutMs: 120_000 });
    const code = run(process.env.CBM_BIN, ['cli', 'search_code', JSON.stringify({ project, pattern: firstTerm, mode: 'compact', limit: 25 })], { cwd: repo, env, timeoutMs: 120_000 });
    commands.push(graph, code);
    queryDurationMs = Date.now() - started;
    collect(graph, locations, 'codebase-memory-mcp');
    collect(code, locations, 'codebase-memory-mcp');
  } else if (lane === 'grepai') {
    const grepai = 'grepai';
    const version = run(grepai, ['version'], { timeoutMs: 60_000 });
    commands.push(version);
    const initArgs = process.env.OPENAI_API_KEY ? ['init', '--yes', '--provider', 'openai', '--backend', 'gob', '--model', 'text-embedding-3-small'] : ['init', '--yes', '--provider', 'synthetic', '--backend', 'gob'];
    const init = run(grepai, initArgs, { cwd: repo, timeoutMs: 120_000 });
    commands.push(init);
    setupDurationMs = version.durationMs + init.durationMs;
    if (init.status !== 0) setupStatus = 'setup_failed';
    const startedIndex = Date.now();
    const watch = run(grepai, ['watch', '--background'], { cwd: repo, timeoutMs: 120_000 });
    commands.push(watch);
    // Fixed 25 s wait after starting the background watcher, before querying its status.
    await new Promise((resolve) => setTimeout(resolve, 25_000));
    const status = run(grepai, ['watch', '--status'], { cwd: repo, timeoutMs: 60_000 });
    commands.push(status);
    indexDurationMs = Date.now() - startedIndex;
    if (watch.status !== 0 && status.status !== 0) indexStatus = 'index_failed';
    const search = run(grepai, ['search', query, '--json', '--compact'], { cwd: repo, timeoutMs: 180_000 });
    commands.push(search);
    queryDurationMs = search.durationMs;
    collect(search, locations, 'grepai');
    commands.push(run(grepai, ['watch', '--stop'], { cwd: repo, timeoutMs: 60_000 }));
  } else if (lane === 'jcodemunch-repomapper') {
    const version = run('uvx', ['jcodemunch-mcp', '--help'], { timeoutMs: 180_000 });
    commands.push(version);
    setupDurationMs = version.durationMs;
    // search_symbols is issued twice, once with the full path and once with the
    // directory name as `repo`.
    const mcp = await callMcp('uvx', ['jcodemunch-mcp'], repo, [
      { name: 'index_folder', arguments: { path: repo, incremental: false, use_ai_summaries: false, follow_symlinks: false } },
      { name: 'search_symbols', arguments: { repo, query, max_results: 25, semantic: false } },
      { name: 'search_symbols', arguments: { repo: basename(repo), query, max_results: 25, semantic: false } }
    ]);
    commands.push(mcp);
    // Index and query share one MCP session, so the whole session is booked as index time.
    indexDurationMs = mcp.durationMs;
    queryDurationMs = 0;
    if (mcp.status !== 0) indexStatus = 'index_or_query_failed';
    collect(mcp, locations, 'jcodemunch-repomapper');
  } else if (lane === 'codegraphcontext') {
    const version = run('cgc', ['--help'], { timeoutMs: 60_000 });
    commands.push(version);
    setupDurationMs = version.durationMs;
    const index = run('cgc', ['index', repo], { cwd: repo, timeoutMs: 30 * 60_000 });
    commands.push(index);
    indexDurationMs = index.durationMs;
    if (index.status !== 0) indexStatus = 'index_failed';
    const started = Date.now();
    const complexity = run('cgc', ['analyze', 'complexity', '--limit', '25'], { cwd: repo, timeoutMs: 180_000 });
    const callers = run('cgc', ['analyze', 'callers', firstTerm], { cwd: repo, timeoutMs: 120_000 });
    const dead = run('cgc', ['analyze', 'dead-code', '--limit', '25'], { cwd: repo, timeoutMs: 180_000 });
    commands.push(complexity, callers, dead);
    queryDurationMs = Date.now() - started;
    collect(complexity, locations, 'codegraphcontext');
    collect(callers, locations, 'codegraphcontext');
    collect(dead, locations, 'codegraphcontext');
  } else {
    setupStatus = 'setup_failed';
    commands.push({ command: lane, cwd: repo, status: 1, signal: null, error: 'unknown_lane', durationMs: 0, stdout: '', stderr: '' });
  }
  for (const [i, command] of commands.entries()) writeFileSync(join(runDir, `command-${i + 1}.json`), JSON.stringify(command, null, 2));
  // Collected paths have already lost their leading slashes, so the checkout
  // prefix is normalized the same way before it is stripped.
  const repoPrefix = String(repo || '').replace(/^\/+/, '').replace(/^\.\//, '').replace(/\/+$/, '');
  const relativized = repoPrefix
    ? locations
      .map((loc) => (loc.file.startsWith(`${repoPrefix}/`) ? { ...loc, file: loc.file.slice(repoPrefix.length + 1) } : loc))
      .filter((loc) => loc.file)
    : locations;
  const unique = uniqueLocations(relativized);
  writeFileSync(join(runDir, 'candidate-locations.json'), JSON.stringify(unique, null, 2));
  return { lane, setupStatus, indexStatus, toolCallable: commands.some((command) => command.status === 0), candidates: unique, costs: { setupDurationMs, indexDurationMs, queryDurationMs }, commands };
}
// Matrix driver: each (task, lane, repeat) combination gets its own run
// directory and exactly one row. Rows the official evaluator scored are also
// collected in scoreableRows.
const rows = [];
const scoreableRows = [];
for (const task of tasks) {
  const query = queryOf(task.problem_statement);
  for (const lane of lanes) {
    for (let repeat = 1; repeat <= repeats; repeat += 1) {
      const runId = sanitize(`${lane}-${task.instance_id}-r${repeat}`);
      const runDir = join(outRoot, 'runs', runId);
      mkdirSync(runDir, { recursive: true });
      const startedAt = new Date().toISOString();
      // Fields shared by success and error rows; spread first to keep key order.
      const identity = { run_id: runId, lane_id: lane, task_id: task.instance_id, repeat_index: repeat };
      let row;
      try {
        const retrieval = await retrieve(lane, task, runDir, query);
        const model = await callOpenAI(runDir, task, lane, query, retrieval.candidates);
        // A failed model call still produces an (empty) prediction file.
        const selection = model.ok ? model.parsed : { files: [], spans: [] };
        const prediction = buildPrediction(task, selection);
        const predictionPath = join(runDir, 'prediction.json');
        writeFileSync(predictionPath, JSON.stringify(prediction, null, 2));
        const { pred_files: predFiles, pred_spans: predSpans } = prediction.traj_data;
        const goldPath = join(runDir, 'gold.json');
        const gold = run('node', ['scripts/contextbench-select-slice.mjs', '--write-gold', '--task-id', task.instance_id, '--out', goldPath, '--payloads', process.env.TASK_PAYLOADS], { timeoutMs: 10 * 60_000 });
        const scorePath = join(runDir, 'official-score.jsonl');
        let evaluator;
        if (model.ok && predFiles.length > 0) {
          evaluator = run('python', ['-m', 'contextbench.evaluate', '--gold', goldPath, '--pred', predictionPath, '--cache', join(runDir, 'repo-cache'), '--out', scorePath], { cwd: process.env.OFFICIAL_CONTEXTBENCH, timeoutMs: 20 * 60_000 });
        } else {
          // The evaluator is not invoked without a usable prediction; record a stand-in.
          evaluator = { command: 'python -m contextbench.evaluate', cwd: process.env.OFFICIAL_CONTEXTBENCH, status: null, signal: null, error: 'skipped_no_model_prediction', durationMs: 0, stdout: '', stderr: '' };
        }
        writeFileSync(join(runDir, 'gold-command.json'), JSON.stringify(gold, null, 2));
        writeFileSync(join(runDir, 'evaluator-command.json'), JSON.stringify(evaluator, null, 2));
        const score = parseScores(scorePath);
        const officialEvaluatorScoreable = evaluator.status === 0 && Boolean(score);
        let status;
        if (officialEvaluatorScoreable) {
          status = 'completed';
        } else if (model.ok) {
          status = 'judge_failed';
        } else {
          status = model.status;
        }
        row = {
          ...identity,
          status,
          model: `${process.env.OPENAI_MODEL || 'gpt-5.4-mini'}-${process.env.OPENAI_REASONING_EFFORT || 'high'}`,
          started_at: startedAt,
          completed_at: new Date().toISOString(),
          setupIndex: retrieval.costs,
          setupStatus: retrieval.setupStatus,
          indexStatus: retrieval.indexStatus,
          toolCallable: retrieval.toolCallable,
          candidateCount: retrieval.candidates.length,
          nonEmptyPrediction: predFiles.length > 0 && Object.keys(predSpans).length > 0,
          predFiles: predFiles.length,
          officialEvaluatorScoreable,
          modelStatus: model.status,
          modelDurationMs: model.durationMs,
          modelUsage: model.usage || null,
          score,
          paths: { runDir, predictionPath, scorePath }
        };
        if (officialEvaluatorScoreable) scoreableRows.push(row);
      } catch (error) {
        // Any unexpected exception becomes a tool_error row so the matrix keeps going.
        row = {
          ...identity,
          status: 'tool_error',
          error: String(error?.stack || error),
          started_at: startedAt,
          completed_at: new Date().toISOString(),
          officialEvaluatorScoreable: false
        };
        writeFileSync(join(runDir, 'row-error.txt'), row.error);
      }
      rows.push(row);
      writeFileSync(join(runDir, 'row.json'), JSON.stringify(row, null, 2));
      // One compact progress line per run in the job log.
      console.log(JSON.stringify({ run_id: row.run_id, lane_id: row.lane_id, task_id: row.task_id, status: row.status, scoreable: row.officialEvaluatorScoreable, predFiles: row.predFiles || 0, candidateCount: row.candidateCount || 0 }));
    }
  }
}
// Group scoreable rows by lane, keeping one series of values per metric.
// A metric missing from a score is stored as null.
const byLane = {};
for (const { lane_id: laneId, score } of scoreableRows) {
  byLane[laneId] ||= { lane: laneId, scoreableRows: 0, fileCoverage: [], filePrecision: [], symbolCoverage: [], spanCoverage: [], lineCoverage: [], editlocRecall: [] };
  const bucket = byLane[laneId];
  const final = score?.final;
  bucket.scoreableRows += 1;
  bucket.fileCoverage.push(final?.file?.coverage ?? null);
  bucket.filePrecision.push(final?.file?.precision ?? null);
  bucket.symbolCoverage.push(final?.symbol?.coverage ?? null);
  bucket.spanCoverage.push(final?.span?.coverage ?? null);
  bucket.lineCoverage.push(final?.line?.coverage ?? null);
  bucket.editlocRecall.push(score?.editloc?.recall ?? null);
}
/**
 * Arithmetic mean of the finite numbers in `values`.
 *
 * @param {Array<*>} values - Mixed values; anything that is not a finite number is skipped.
 * @returns {number|null} The mean, or null when no finite number is present.
 */
function mean(values) {
  let total = 0;
  let count = 0;
  for (const value of values) {
    if (!Number.isFinite(value)) continue;
    total += value;
    count += 1;
  }
  return count > 0 ? total / count : null;
}
// One row per lane that produced at least one scoreable run, best file coverage
// first; a lane without a file-coverage mean sorts as -1.
const resultsTable = Object.values(byLane)
  .map((bucket) => ({
    lane: bucket.lane,
    scoreableRows: bucket.scoreableRows,
    fileCoverage: mean(bucket.fileCoverage),
    filePrecision: mean(bucket.filePrecision),
    symbolCoverage: mean(bucket.symbolCoverage),
    spanCoverage: mean(bucket.spanCoverage),
    lineCoverage: mean(bucket.lineCoverage),
    editlocRecall: mean(bucket.editlocRecall)
  }))
  .sort((left, right) => (right.fileCoverage ?? -1) - (left.fileCoverage ?? -1));
// Tally of rows per final status.
const statusCounts = {};
for (const row of rows) {
  statusCounts[row.status] = (statusCounts[row.status] || 0) + 1;
}
const summary = {
  createdAt: new Date().toISOString(),
  model: `${process.env.OPENAI_MODEL || 'gpt-5.4-mini'}-${process.env.OPENAI_REASONING_EFFORT || 'high'}`,
  maxTasks,
  repeats,
  lanes,
  attemptedRows: rows.length,
  scoreableRows: scoreableRows.length,
  nonScoreableRows: rows.length - scoreableRows.length,
  statusCounts,
  setupIndexCostReportedSeparately: true,
  resultsTable,
  rows
};
writeFileSync(join(outRoot, 'summary.json'), JSON.stringify(summary, null, 2));
writeFileSync(join(outRoot, 'results-table.json'), JSON.stringify(resultsTable, null, 2));
console.log(JSON.stringify({ attemptedRows: summary.attemptedRows, scoreableRows: summary.scoreableRows, statusCounts: summary.statusCounts, resultsTable }, null, 2));
// The step fails when no run at all could be scored by the official evaluator.
if (scoreableRows.length === 0) process.exitCode = 1;
NODE
node "$ROOT/run-real-matrix.mjs"
# Upload the whole benchmark root (the same directory as ROOT) as an artifact.
# `if: always()` runs this step even when an earlier step failed, so whatever
# was written before the failure is still uploaded.
- name: Upload real benchmark artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: contextbench-real-gpt54mini
path: /tmp/contextbench-real-gpt54mini
# Artifacts are kept for 14 days.
retention-days: 14