Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions apps/cli/src/commands/results/eval-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import type { Hono } from 'hono';

import { TARGET_FILE_CANDIDATES, discoverTargetsFile } from '../../utils/targets.js';
import { discoverEvalFiles } from '../eval/discover.js';
import { buildDefaultRunDir } from '../eval/result-layout.js';
import { findRepoRoot } from '../eval/shared.js';

// ── In-memory run tracker ────────────────────────────────────────────────
Expand All @@ -32,6 +33,10 @@ interface StudioRun {
id: string;
status: 'starting' | 'running' | 'finished' | 'failed';
command: string;
/** Target name passed via --target (if any). Stored so the run list can show it before the first result is written. */
target?: string;
/** Absolute path to the run directory (e.g. .agentv/results/runs/default/<timestamp>). Used to correlate this in-memory run with the filesystem run while its index.jsonl still has no records. */
outputDir?: string;
startedAt: string;
finishedAt?: string;
exitCode?: number | null;
Expand Down Expand Up @@ -62,6 +67,19 @@ function pruneFinishedRuns() {
}
}

/**
 * Resolve the target of a Studio-launched run from the path of its index.jsonl.
 *
 * handleRuns in serve.ts uses this as a fallback when the JSONL contains no
 * records yet (the run has only just started), so there is no record to read
 * the target from.
 *
 * @param indexJsonlPath Path to the run's index.jsonl. It is compared by exact
 *   string equality against `<outputDir>/index.jsonl` for each tracked run.
 * @returns The `target` of the first tracked run whose output directory
 *   matches (which may itself be undefined), or undefined when no run matches.
 */
export function getActiveRunTarget(indexJsonlPath: string): string | undefined {
  const owner = Array.from(activeRuns.values()).find((candidate) => {
    const dir = candidate.outputDir;
    // Runs without a recorded output directory can never be correlated.
    if (!dir) {
      return false;
    }
    return path.join(dir, 'index.jsonl') === indexJsonlPath;
  });
  return owner?.target;
}

// ── Discover targets file from project root ──────────────────────────────

async function discoverTargetsInProject(cwd: string): Promise<readonly string[]> {
Expand Down Expand Up @@ -310,13 +328,26 @@ export function registerEvalRoutes(
}

const args = buildCliArgs(body);
// Determine the output directory for this run. When the caller provides
// an explicit --output (resume/rerun), use that path. Otherwise generate
// the default path now so we can pass it via --output and later correlate
// the filesystem run with this in-memory StudioRun (needed to show the
// target in the sidebar before any results have been written).
const outputDir = body.output?.trim()
? path.resolve(cwd, body.output.trim())
: buildDefaultRunDir(cwd);
if (!body.output?.trim()) {
args.push('--output', outputDir);
}
const command = buildCliPreview(args);
const runId = generateRunId();

const run: StudioRun = {
id: runId,
status: 'starting',
command,
target: body.target?.trim() || undefined,
outputDir,
startedAt: new Date().toISOString(),
stdout: '',
stderr: '',
Expand Down Expand Up @@ -405,6 +436,7 @@ export function registerEvalRoutes(
id: r.id,
status: r.status,
command: r.command,
target: r.target,
started_at: r.startedAt,
finished_at: r.finishedAt ?? null,
exit_code: r.exitCode ?? null,
Expand Down Expand Up @@ -481,13 +513,21 @@ export function registerEvalRoutes(
}

const args = buildCliArgs(body);
const outputDir = body.output?.trim()
? path.resolve(cwd, body.output.trim())
: buildDefaultRunDir(cwd);
if (!body.output?.trim()) {
args.push('--output', outputDir);
}
const command = buildCliPreview(args);
const runId = generateRunId();

const run: StudioRun = {
id: runId,
status: 'starting',
command,
target: body.target?.trim() || undefined,
outputDir,
startedAt: new Date().toISOString(),
stdout: '',
stderr: '',
Expand Down Expand Up @@ -557,6 +597,7 @@ export function registerEvalRoutes(
id: r.id,
status: r.status,
command: r.command,
target: r.target,
started_at: r.startedAt,
finished_at: r.finishedAt ?? null,
exit_code: r.exitCode ?? null,
Expand Down
6 changes: 5 additions & 1 deletion apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ import { resolveRunManifestPath } from '../eval/result-layout.js';
import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js';
import { findRepoRoot } from '../eval/shared.js';
import { listResultFiles } from '../inspect/utils.js';
import { registerEvalRoutes } from './eval-runner.js';
import { getActiveRunTarget, registerEvalRoutes } from './eval-runner.js';
import {
loadLightweightResults,
loadManifestResults,
Expand Down Expand Up @@ -290,6 +290,10 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
target = records[0].target;
experiment = records[0].experiment ?? experiment;
passRate = records.filter((r) => r.score >= passThreshold).length / records.length;
} else {
// Run is in-progress with 0 results written yet — fall back to the
// in-memory target stored when the Studio launched this run.
target = getActiveRunTarget(m.path);
}
} catch {
// ignore enrichment errors
Expand Down
10 changes: 10 additions & 0 deletions apps/studio/src/lib/run-label.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,14 @@ describe('formatRunLabel', () => {
}),
).toBe('29/04 09:17 · 0%');
});

// An active/in-progress run reports a pass rate of 0; the label must still
// include the target between the timestamp and the percentage.
// NOTE(review): the expected '07/05 10:56' matches the UTC fields of the ISO
// timestamp — presumably formatRunLabel renders in UTC; confirm the test is
// not sensitive to the machine's local timezone.
it('shows target even when pass rate is 0 (active/in-progress run)', () => {
expect(
formatRunLabel({
target: 'wtalms-stg',
timestamp: '2026-05-07T10:56:00.000Z',
pass_rate: 0,
}),
).toBe('07/05 10:56 · wtalms-stg · 0%');
});
});
1 change: 1 addition & 0 deletions apps/studio/src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,7 @@ export interface EvalRunListResponse {
id: string;
status: string;
command: string;
target?: string;
started_at: string;
finished_at: string | null;
exit_code: number | null;
Expand Down
Loading