Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions apps/cli/src/commands/results/eval-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,31 @@ interface RunEvalRequest {
threshold?: number;
workers?: number;
dry_run?: boolean;
/** Resume an interrupted run: skip already-completed tests and append results to `output`. */
resume?: boolean;
/** Re-run failed/errored tests while keeping passing results. */
rerun_failed?: boolean;
/** Path to a previous run dir or index.jsonl — re-run only execution_error cases. */
retry_errors?: string;
/** Artifact directory for run output. Required when resume/rerun_failed are set without auto-detect. */
output?: string;
}

/**
 * Check that at most one resume-style mode is requested on an eval run.
 *
 * `resume`, `rerun_failed`, and `retry_errors` each describe a different way
 * of reusing a previous run, so they are mutually exclusive. `retry_errors`
 * counts as active only when it is a non-blank string.
 *
 * @param req - The incoming eval-run request body.
 * @returns A human-readable error message when more than one mode is active,
 *          otherwise `undefined`.
 */
function validateResumeOptions(req: RunEvalRequest): string | undefined {
  const activeModes = [
    req.resume ? 'resume' : undefined,
    req.rerun_failed ? 'rerun_failed' : undefined,
    req.retry_errors?.trim() ? 'retry_errors' : undefined,
  ].filter((mode): mode is string => mode !== undefined);
  return activeModes.length > 1
    ? `resume, rerun_failed, and retry_errors are mutually exclusive (got: ${activeModes.join(', ')})`
    : undefined;
}

function buildCliArgs(req: RunEvalRequest): string[] {
Expand Down Expand Up @@ -148,6 +173,20 @@ function buildCliArgs(req: RunEvalRequest): string[] {
args.push('--dry-run');
}

// Resume / rerun-failed / retry-errors / output
if (req.output?.trim()) {
args.push('--output', req.output.trim());
}
if (req.resume) {
args.push('--resume');
}
if (req.rerun_failed) {
args.push('--rerun-failed');
}
if (req.retry_errors?.trim()) {
args.push('--retry-errors', req.retry_errors.trim());
}

return args;
}

Expand Down Expand Up @@ -255,6 +294,11 @@ export function registerEvalRoutes(
return c.json({ error: 'Provide suite_filter or test_ids' }, 400);
}

const resumeError = validateResumeOptions(body);
if (resumeError) {
return c.json({ error: resumeError }, 400);
}

const cliPaths = resolveCliPath(cwd);
if (!cliPaths) {
return c.json({ error: 'Cannot locate agentv CLI entry point' }, 500);
Expand Down Expand Up @@ -405,6 +449,9 @@ export function registerEvalRoutes(
});

app.post('/api/benchmarks/:benchmarkId/eval/run', async (c) => {
if (readOnly) {
return c.json({ error: 'Studio is running in read-only mode' }, 403);
}
const cwd = getCwd(c);

let body: RunEvalRequest;
Expand All @@ -418,6 +465,11 @@ export function registerEvalRoutes(
return c.json({ error: 'Provide suite_filter or test_ids' }, 400);
}

const resumeError = validateResumeOptions(body);
if (resumeError) {
return c.json({ error: resumeError }, 400);
}

const cliPaths = resolveCliPath(cwd);
if (!cliPaths) {
return c.json({ error: 'Cannot locate agentv CLI entry point' }, 500);
Expand Down
39 changes: 39 additions & 0 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -319,16 +319,55 @@ async function handleRunDetail(c: C, { searchDir }: DataContext) {
if (!meta) return c.json({ error: 'Run not found' }, 404);
try {
const loaded = loadManifestResults(meta.path);
// Surface run_dir + suite_filter for local runs so the UI can launch a
// Studio-side resume against this exact run. Remote runs live in the
// results-repo cache and cannot be resumed in place, so omit both fields.
const resumeMeta = meta.source === 'local' ? deriveResumeMeta(searchDir, meta.path) : {};
return c.json({
results: stripHeavyFields(loaded),
source: meta.source,
source_label: meta.displayName,
...resumeMeta,
});
} catch {
return c.json({ error: 'Failed to load run' }, 500);
}
}

/**
 * Best-effort derivation of resume metadata for a local run manifest.
 *
 * Produces `run_dir` (the manifest's directory, expressed relative to `cwd`
 * when it sits inside it, otherwise the original path unchanged) and
 * `suite_filter` (the eval file recorded in the run's benchmark.json
 * metadata). Either field may be absent; both are consumed only by the
 * Studio "Resume run" / "Rerun failed" actions.
 *
 * @param cwd - Working directory the CLI would run from.
 * @param manifestPath - Path to the run's manifest file.
 * @returns Whichever of `run_dir` / `suite_filter` could be resolved.
 */
function deriveResumeMeta(
  cwd: string,
  manifestPath: string,
): { run_dir?: string; suite_filter?: string } {
  const runDir = path.dirname(manifestPath);
  const rel = path.relative(cwd, runDir);
  // A '..'-prefixed result means runDir lives outside cwd, and '' means
  // runDir *is* cwd; in both cases hand the CLI the original path unchanged
  // rather than a confusing relative one.
  const useRelative = rel !== '' && !rel.startsWith('..');
  const result: { run_dir?: string; suite_filter?: string } = {
    run_dir: useRelative ? rel : runDir,
  };

  const benchmarkPath = path.join(runDir, 'benchmark.json');
  try {
    if (existsSync(benchmarkPath)) {
      const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as {
        metadata?: { eval_file?: string };
      };
      const evalFile = parsed.metadata?.eval_file;
      if (typeof evalFile === 'string' && evalFile.trim()) {
        result.suite_filter = evalFile.trim();
      }
    }
  } catch {
    // Missing, unreadable, or malformed benchmark.json — omit suite_filter.
  }
  return result;
}

async function handleRunSuites(c: C, { searchDir, agentvDir }: DataContext) {
const filename = c.req.param('filename') ?? '';
const meta = await findRunById(searchDir, filename);
Expand Down
Loading
Loading