Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions apps/cli/src/commands/results/eval-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,31 @@ interface RunEvalRequest {
threshold?: number;
workers?: number;
dry_run?: boolean;
/** Resume an interrupted run: skip already-completed tests and append results to `output`. */
resume?: boolean;
/** Re-run failed/errored tests while keeping passing results. */
rerun_failed?: boolean;
/** Path to a previous run dir or index.jsonl — re-run only execution_error cases. */
retry_errors?: string;
/** Artifact directory for run output. Required when resume/rerun_failed are set without auto-detect. */
output?: string;
}

/**
 * Check that at most one resume-style mode is requested on an eval run.
 *
 * `resume`, `rerun_failed`, and `retry_errors` each describe a different way
 * of reusing a previous run, so they are mutually exclusive. `retry_errors`
 * counts as active only when it is a non-blank string.
 *
 * @param req - The incoming eval-run request body.
 * @returns A human-readable error message when more than one mode is active,
 *          otherwise `undefined`.
 */
function validateResumeOptions(req: RunEvalRequest): string | undefined {
  const activeModes = [
    req.resume ? 'resume' : undefined,
    req.rerun_failed ? 'rerun_failed' : undefined,
    req.retry_errors?.trim() ? 'retry_errors' : undefined,
  ].filter((mode): mode is string => mode !== undefined);
  return activeModes.length > 1
    ? `resume, rerun_failed, and retry_errors are mutually exclusive (got: ${activeModes.join(', ')})`
    : undefined;
}

function buildCliArgs(req: RunEvalRequest): string[] {
Expand Down Expand Up @@ -148,6 +173,20 @@ function buildCliArgs(req: RunEvalRequest): string[] {
args.push('--dry-run');
}

// Resume / rerun-failed / retry-errors / output
if (req.output?.trim()) {
args.push('--output', req.output.trim());
}
if (req.resume) {
args.push('--resume');
}
if (req.rerun_failed) {
args.push('--rerun-failed');
}
if (req.retry_errors?.trim()) {
args.push('--retry-errors', req.retry_errors.trim());
}

return args;
}

Expand Down Expand Up @@ -255,6 +294,11 @@ export function registerEvalRoutes(
return c.json({ error: 'Provide suite_filter or test_ids' }, 400);
}

const resumeError = validateResumeOptions(body);
if (resumeError) {
return c.json({ error: resumeError }, 400);
}

const cliPaths = resolveCliPath(cwd);
if (!cliPaths) {
return c.json({ error: 'Cannot locate agentv CLI entry point' }, 500);
Expand Down Expand Up @@ -405,6 +449,9 @@ export function registerEvalRoutes(
});

app.post('/api/benchmarks/:benchmarkId/eval/run', async (c) => {
if (readOnly) {
return c.json({ error: 'Studio is running in read-only mode' }, 403);
}
const cwd = getCwd(c);

let body: RunEvalRequest;
Expand All @@ -418,6 +465,11 @@ export function registerEvalRoutes(
return c.json({ error: 'Provide suite_filter or test_ids' }, 400);
}

const resumeError = validateResumeOptions(body);
if (resumeError) {
return c.json({ error: resumeError }, 400);
}

const cliPaths = resolveCliPath(cwd);
if (!cliPaths) {
return c.json({ error: 'Cannot locate agentv CLI entry point' }, 500);
Expand Down
39 changes: 39 additions & 0 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -319,16 +319,55 @@ async function handleRunDetail(c: C, { searchDir }: DataContext) {
if (!meta) return c.json({ error: 'Run not found' }, 404);
try {
const loaded = loadManifestResults(meta.path);
// Surface run_dir + suite_filter for local runs so the UI can launch a
// Studio-side resume against this exact run. Remote runs live in the
// results-repo cache and cannot be resumed in place, so omit both fields.
const resumeMeta = meta.source === 'local' ? deriveResumeMeta(searchDir, meta.path) : {};
return c.json({
results: stripHeavyFields(loaded),
source: meta.source,
source_label: meta.displayName,
...resumeMeta,
});
} catch {
return c.json({ error: 'Failed to load run' }, 500);
}
}

/**
 * Best-effort derivation of resume metadata for a local run manifest.
 *
 * Produces `run_dir` (the manifest's directory, expressed relative to `cwd`
 * when it sits inside it, otherwise the original path unchanged) and
 * `suite_filter` (the eval file recorded in the run's benchmark.json
 * metadata). Either field may be absent; both are consumed only by the
 * Studio "Resume run" / "Rerun failed" actions.
 *
 * @param cwd - Working directory the CLI would run from.
 * @param manifestPath - Path to the run's manifest file.
 * @returns Whichever of `run_dir` / `suite_filter` could be resolved.
 */
function deriveResumeMeta(
  cwd: string,
  manifestPath: string,
): { run_dir?: string; suite_filter?: string } {
  const runDir = path.dirname(manifestPath);
  const rel = path.relative(cwd, runDir);
  // A '..'-prefixed result means runDir lives outside cwd, and '' means
  // runDir *is* cwd; in both cases hand the CLI the original path unchanged
  // rather than a confusing relative one.
  const useRelative = rel !== '' && !rel.startsWith('..');
  const result: { run_dir?: string; suite_filter?: string } = {
    run_dir: useRelative ? rel : runDir,
  };

  const benchmarkPath = path.join(runDir, 'benchmark.json');
  try {
    if (existsSync(benchmarkPath)) {
      const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as {
        metadata?: { eval_file?: string };
      };
      const evalFile = parsed.metadata?.eval_file;
      if (typeof evalFile === 'string' && evalFile.trim()) {
        result.suite_filter = evalFile.trim();
      }
    }
  } catch {
    // Missing, unreadable, or malformed benchmark.json — omit suite_filter.
  }
  return result;
}

async function handleRunSuites(c: C, { searchDir, agentvDir }: DataContext) {
const filename = c.req.param('filename') ?? '';
const meta = await findRunById(searchDir, filename);
Expand Down
Loading
Loading