Skip to content

Commit 4390e39

Browse files
committed
Rearrange some printing
1 parent f6d89b4 commit 4390e39

File tree

2 files changed

+92
-40
lines changed

2 files changed

+92
-40
lines changed

evals/buffbench/format-output.ts

Lines changed: 71 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,77 @@
11
import type { JudgingResult } from './judge'
22
import type { EvalCommitV2 } from './types'
33

4+
/**
 * Per-agent data needed to print one agent's section of a task report.
 *
 * Every field is forwarded unchanged to `formatAgentResult` by
 * `formatTaskResults`.
 */
interface AgentResultData {
  /** Identifier of the agent this result belongs to. */
  agentId: string
  /** Judging result for this agent, as typed by the `./judge` module. */
  judging: JudgingResult
  /** Cost of the run. NOTE(review): units are not visible here — confirm. */
  cost: number
  /** Duration of the run, in milliseconds (per the field name). */
  durationMs: number
  /** Error text for the run, when there is one. */
  error?: string
  /** Location of the trace file for the run, when there is one. */
  traceFilePath?: string
}
12+
13+
/**
 * Trace-analysis payload that `formatTaskResults` forwards to
 * `formatTraceAnalysis` when it is supplied.
 */
interface TraceAnalysisData {
  /** Free-text analysis covering the task as a whole. */
  overallAnalysis: string
  /** One feedback entry per analysed agent. */
  agentFeedback: Array<{
    /** Identifier of the agent this feedback refers to. */
    agentId: string
    strengths: string[]
    weaknesses: string[]
    recommendations: string[]
  }>
}
22+
23+
/**
 * Builds the complete printable report for one task as a single string.
 *
 * Layout, in order:
 *  1. a leading blank line and a banner (80 `=` characters above and below)
 *     naming the task position, the commit id and the first 7 characters of
 *     the commit sha;
 *  2. a `TASK:` section containing `commit.prompt`, printed once for the
 *     whole task;
 *  3. one `formatAgentResult` section per entry of `agentResults`, numbered
 *     from 1 in array order;
 *  4. a `formatTraceAnalysis` section, only when `traceAnalysis` is given.
 *
 * @param commit - The commit under evaluation; `id`, `sha` and `prompt` are read here.
 * @param taskNumber - Position of this task, printed as-is in the banner.
 * @param totalTasks - Total number of tasks, printed as-is in the banner.
 * @param agentResults - Per-agent data, rendered in the order supplied.
 * @param traceAnalysis - Optional analysis appended after the agent sections.
 * @returns All sections joined with `\n`.
 */
export function formatTaskResults({
  commit,
  taskNumber,
  totalTasks,
  agentResults,
  traceAnalysis,
}: {
  commit: EvalCommitV2
  taskNumber: number
  totalTasks: number
  agentResults: AgentResultData[]
  traceAnalysis?: TraceAnalysisData
}): string {
  const separator = '='.repeat(80)
  const minorSeparator = '-'.repeat(80)
  const lines: string[] = [
    '',
    separator,
    `RESULTS FOR TASK ${taskNumber}/${totalTasks}: ${commit.id} (${commit.sha.slice(0, 7)})`,
    separator,
    '',
    'TASK:',
    minorSeparator,
    commit.prompt,
    '',
  ]

  // Print each agent's results
  agentResults.forEach((result, index) => {
    lines.push(
      formatAgentResult({
        ...result,
        commit,
        agentNumber: index + 1,
        totalAgents: agentResults.length,
      }),
    )
  })

  // Add trace analysis if provided
  if (traceAnalysis) {
    lines.push(
      formatTraceAnalysis({
        commit,
        ...traceAnalysis,
      }),
    )
  }

  return lines.join('\n')
}
74+
475
export function formatAgentResult(params: {
576
agentId: string
677
commit: EvalCommitV2
@@ -33,11 +104,6 @@ export function formatAgentResult(params: {
33104
lines.push(minorSeparator)
34105
lines.push('')
35106

36-
lines.push('TASK:')
37-
lines.push(minorSeparator)
38-
lines.push(commit.prompt)
39-
lines.push('')
40-
41107
if (error) {
42108
lines.push('❌ ERROR:')
43109
lines.push(minorSeparator)

evals/buffbench/run-buffbench.ts

Lines changed: 21 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import { getUserCredentials } from '@codebuff/npm-app/credentials'
66
import pLimit from 'p-limit'
77

88
import { runAgentOnCommit } from './agent-runner'
9-
import { formatAgentResult, formatTraceAnalysis } from './format-output'
9+
import { formatTaskResults } from './format-output'
1010
import { judgeCommitResult } from './judge'
1111
import { analyzeAgentTraces, type AgentTraceData } from './trace-analyzer'
1212
import { CodebuffClient } from '../../sdk/src/client'
@@ -170,9 +170,7 @@ export async function runBuffBench(options: {
170170
}
171171
})
172172

173-
const agentResults = await Promise.all(agentPromises)
174-
175-
// After all agents complete for this commit, run trace analysis
173+
const agentResults = await Promise.all(agentPromises) // After all agents complete for this commit, run trace analysis
176174
if (commitTraces.length > 1) {
177175
try {
178176
const analysis = await analyzeAgentTraces({
@@ -205,36 +203,25 @@ export async function runBuffBench(options: {
205203
fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
206204

207205
// Print all agent results with their judging, then trace analysis together
208-
console.log('\n' + '='.repeat(80))
209206
console.log(
210-
`RESULTS FOR TASK ${index + 1}/${commitsToRun.length}: ${commit.id} (${commit.sha.slice(0, 7)})`,
211-
)
212-
console.log('='.repeat(80))
213-
214-
commitTraces.forEach((trace, traceIndex) => {
215-
const formattedOutput = formatAgentResult({
216-
agentId: trace.agentId,
207+
formatTaskResults({
217208
commit,
218-
judging: trace.judgeResult,
219-
cost: trace.cost,
220-
durationMs: trace.durationMs,
221-
error: trace.error,
222-
traceFilePath: path.join(
223-
logsDir,
224-
`${commit.id.replace(/[^a-zA-Z0-9-]/g, '_')}-${trace.agentId.replace(/[^a-zA-Z0-9-]/g, '_')}-${commit.sha.slice(0, 7)}.json`,
225-
),
226-
agentNumber: traceIndex + 1,
227-
totalAgents: commitTraces.length,
228-
})
229-
console.log(formattedOutput)
230-
})
231-
232-
const formattedAnalysis = formatTraceAnalysis({
233-
commit,
234-
overallAnalysis,
235-
agentFeedback,
236-
})
237-
console.log(formattedAnalysis)
209+
taskNumber: index + 1,
210+
totalTasks: commitsToRun.length,
211+
agentResults: commitTraces.map((trace) => ({
212+
agentId: trace.agentId,
213+
judging: trace.judgeResult,
214+
cost: trace.cost,
215+
durationMs: trace.durationMs,
216+
error: trace.error,
217+
traceFilePath: path.join(
218+
logsDir,
219+
`${commit.id.replace(/[^a-zA-Z0-9-]/g, '_')}-${trace.agentId.replace(/[^a-zA-Z0-9-]/g, '_')}-${commit.sha.slice(0, 7)}.json`,
220+
),
221+
})),
222+
traceAnalysis: { overallAnalysis, agentFeedback },
223+
}),
224+
)
238225
} catch (error) {
239226
console.error(
240227
`Failed to analyze traces for commit ${commit.sha}:`,
@@ -255,7 +242,7 @@ export async function runBuffBench(options: {
255242
for (const result of commitResults) {
256243
if (result.status === 'fulfilled') {
257244
const { commit, agentResults } = result.value
258-
245+
259246
// Check if any agent had an error for this commit
260247
const hasAnyError = agentResults.some(({ evalRun }) => evalRun.error)
261248
if (hasAnyError) {
@@ -289,8 +276,7 @@ export async function runBuffBench(options: {
289276

290277
agentData.averageDuration =
291278
validRuns.length > 0
292-
? validRuns.reduce((sum, r) => sum + r.durationMs, 0) /
293-
validRuns.length
279+
? validRuns.reduce((sum, r) => sum + r.durationMs, 0) / validRuns.length
294280
: 0
295281
}
296282

0 commit comments

Comments
 (0)