@@ -6,7 +6,7 @@ import { getUserCredentials } from '@codebuff/npm-app/credentials'
66import pLimit from 'p-limit'
77
88import { runAgentOnCommit } from './agent-runner'
9- import { formatAgentResult , formatTraceAnalysis } from './format-output'
9+ import { formatTaskResults } from './format-output'
1010import { judgeCommitResult } from './judge'
1111import { analyzeAgentTraces , type AgentTraceData } from './trace-analyzer'
1212import { CodebuffClient } from '../../sdk/src/client'
@@ -170,9 +170,7 @@ export async function runBuffBench(options: {
170170 }
171171 } )
172172
173- const agentResults = await Promise . all ( agentPromises )
174-
175- // After all agents complete for this commit, run trace analysis
173+ const agentResults = await Promise . all ( agentPromises ) // After all agents complete for this commit, run trace analysis
176174 if ( commitTraces . length > 1 ) {
177175 try {
178176 const analysis = await analyzeAgentTraces ( {
@@ -205,36 +203,25 @@ export async function runBuffBench(options: {
205203 fs . writeFileSync ( analysisPath , JSON . stringify ( analysisData , null , 2 ) )
206204
207205 // Print all agent results with their judging, then trace analysis together
208- console . log ( '\n' + '=' . repeat ( 80 ) )
209206 console . log (
210- `RESULTS FOR TASK ${ index + 1 } /${ commitsToRun . length } : ${ commit . id } (${ commit . sha . slice ( 0 , 7 ) } )` ,
211- )
212- console . log ( '=' . repeat ( 80 ) )
213-
214- commitTraces . forEach ( ( trace , traceIndex ) => {
215- const formattedOutput = formatAgentResult ( {
216- agentId : trace . agentId ,
207+ formatTaskResults ( {
217208 commit,
218- judging : trace . judgeResult ,
219- cost : trace . cost ,
220- durationMs : trace . durationMs ,
221- error : trace . error ,
222- traceFilePath : path . join (
223- logsDir ,
224- `${ commit . id . replace ( / [ ^ a - z A - Z 0 - 9 - ] / g, '_' ) } -${ trace . agentId . replace ( / [ ^ a - z A - Z 0 - 9 - ] / g, '_' ) } -${ commit . sha . slice ( 0 , 7 ) } .json` ,
225- ) ,
226- agentNumber : traceIndex + 1 ,
227- totalAgents : commitTraces . length ,
228- } )
229- console . log ( formattedOutput )
230- } )
231-
232- const formattedAnalysis = formatTraceAnalysis ( {
233- commit,
234- overallAnalysis,
235- agentFeedback,
236- } )
237- console . log ( formattedAnalysis )
209+ taskNumber : index + 1 ,
210+ totalTasks : commitsToRun . length ,
211+ agentResults : commitTraces . map ( ( trace ) => ( {
212+ agentId : trace . agentId ,
213+ judging : trace . judgeResult ,
214+ cost : trace . cost ,
215+ durationMs : trace . durationMs ,
216+ error : trace . error ,
217+ traceFilePath : path . join (
218+ logsDir ,
219+ `${ commit . id . replace ( / [ ^ a - z A - Z 0 - 9 - ] / g, '_' ) } -${ trace . agentId . replace ( / [ ^ a - z A - Z 0 - 9 - ] / g, '_' ) } -${ commit . sha . slice ( 0 , 7 ) } .json` ,
220+ ) ,
221+ } ) ) ,
222+ traceAnalysis : { overallAnalysis, agentFeedback } ,
223+ } ) ,
224+ )
238225 } catch ( error ) {
239226 console . error (
240227 `Failed to analyze traces for commit ${ commit . sha } :` ,
@@ -255,7 +242,7 @@ export async function runBuffBench(options: {
255242 for ( const result of commitResults ) {
256243 if ( result . status === 'fulfilled' ) {
257244 const { commit, agentResults } = result . value
258-
245+
259246 // Check if any agent had an error for this commit
260247 const hasAnyError = agentResults . some ( ( { evalRun } ) => evalRun . error )
261248 if ( hasAnyError ) {
@@ -289,8 +276,7 @@ export async function runBuffBench(options: {
289276
290277 agentData . averageDuration =
291278 validRuns . length > 0
292- ? validRuns . reduce ( ( sum , r ) => sum + r . durationMs , 0 ) /
293- validRuns . length
279+ ? validRuns . reduce ( ( sum , r ) => sum + r . durationMs , 0 ) / validRuns . length
294280 : 0
295281 }
296282
0 commit comments