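Rearrange some printing: move per-task result formatting into formatTaskResults and print the task prompt once per task instead of once per agent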

jahooma · jahooma · commit 4390e39f511d · 2025-10-13T00:37:46.000-07:00
diff --git a/evals/buffbench/format-output.ts b/evals/buffbench/format-output.ts
@@ -1,6 +1,77 @@
 import type { JudgingResult } from './judge'
 import type { EvalCommitV2 } from './types'
 
+interface AgentResultData { // one agent's outcome for a task; formatTaskResults spreads each entry into formatAgentResult
+  agentId: string
+  judging: JudgingResult
+  cost: number // NOTE(review): unit/currency is not visible in this file — confirm with the caller
+  durationMs: number // presumably elapsed milliseconds, going by the name — confirm
+  error?: string // when set, formatAgentResult emits an ERROR section for this agent
+  traceFilePath?: string // the run-buffbench caller passes a .json path under its logs directory
+}
+
+interface TraceAnalysisData { // optional analysis payload; formatTaskResults forwards these fields to formatTraceAnalysis
+  overallAnalysis: string
+  agentFeedback: Array<{ // one feedback record per agent
+    agentId: string // presumably matches AgentResultData.agentId — confirm against the trace analyzer
+    strengths: string[]
+    weaknesses: string[]
+    recommendations: string[]
+  }>
+}
+
+/**
+ * Builds the complete printable report for a single task.
+ * The report opens with a banner naming the task and commit, prints the task
+ * prompt once, then appends one `formatAgentResult` section per agent and,
+ * when `traceAnalysis` is supplied, a closing `formatTraceAnalysis` section.
+ *
+ * @param params.commit - Commit under evaluation; `id`, `sha` and `prompt` are read here.
+ * @param params.taskNumber - Position of this task, shown as-is in the banner.
+ * @param params.totalTasks - Total number of tasks, shown as-is in the banner.
+ * @param params.agentResults - Per-agent outcomes, rendered in array order.
+ * @param params.traceAnalysis - Optional analysis section appended last.
+ * @returns All sections joined with newlines.
+ */
+export function formatTaskResults(params: {
+  commit: EvalCommitV2
+  taskNumber: number
+  totalTasks: number
+  agentResults: AgentResultData[]
+  traceAnalysis?: TraceAnalysisData
+}): string {
+  const { commit, taskNumber, totalTasks, agentResults, traceAnalysis } = params
+  const majorRule = '='.repeat(80)
+  const minorRule = '-'.repeat(80)
+  const shortSha = commit.sha.slice(0, 7)
+  const banner = `RESULTS FOR TASK ${taskNumber}/${totalTasks}: ${commit.id} (${shortSha})`
+
+  // Banner and prompt appear once per task, ahead of any agent section.
+  const report: string[] = [
+    '',
+    majorRule,
+    banner,
+    majorRule,
+    '',
+    'TASK:',
+    minorRule,
+    commit.prompt,
+    '',
+  ]
+
+  // One section per agent, numbered from 1 in input order.
+  const totalAgents = agentResults.length
+  agentResults.forEach((agentResult, position) => {
+    const agentNumber = position + 1
+    report.push(formatAgentResult({ ...agentResult, commit, agentNumber, totalAgents }))
+  })
+
+  // The trace analysis, when present, always closes the report.
+  if (traceAnalysis) report.push(formatTraceAnalysis({ commit, ...traceAnalysis }))
+
+  return report.join('\n')
+}
+
 export function formatAgentResult(params: {
   agentId: string
   commit: EvalCommitV2
@@ -33,11 +104,6 @@ export function formatAgentResult(params: {
   lines.push(minorSeparator)
   lines.push('')
 
-  lines.push('TASK:')
-  lines.push(minorSeparator)
-  lines.push(commit.prompt)
-  lines.push('')
-
   if (error) {
     lines.push('❌ ERROR:')
     lines.push(minorSeparator)
diff --git a/evals/buffbench/run-buffbench.ts b/evals/buffbench/run-buffbench.ts
@@ -6,7 +6,7 @@ import { getUserCredentials } from '@codebuff/npm-app/credentials'
 import pLimit from 'p-limit'
 
 import { runAgentOnCommit } from './agent-runner'
-import { formatAgentResult, formatTraceAnalysis } from './format-output'
+import { formatTaskResults } from './format-output'
 import { judgeCommitResult } from './judge'
 import { analyzeAgentTraces, type AgentTraceData } from './trace-analyzer'
 import { CodebuffClient } from '../../sdk/src/client'
@@ -170,9 +170,7 @@ export async function runBuffBench(options: {
         }
       })
 
-      const agentResults = await Promise.all(agentPromises)
-
-      // After all agents complete for this commit, run trace analysis
+      const agentResults = await Promise.all(agentPromises) // After all agents complete for this commit, run trace analysis
       if (commitTraces.length > 1) {
         try {
           const analysis = await analyzeAgentTraces({
@@ -205,36 +203,25 @@ export async function runBuffBench(options: {
           fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
 
           // Print all agent results with their judging, then trace analysis together
-          console.log('\n' + '='.repeat(80))
           console.log(
-            `RESULTS FOR TASK ${index + 1}/${commitsToRun.length}: ${commit.id} (${commit.sha.slice(0, 7)})`,
-          )
-          console.log('='.repeat(80))
-
-          commitTraces.forEach((trace, traceIndex) => {
-            const formattedOutput = formatAgentResult({
-              agentId: trace.agentId,
+            formatTaskResults({
               commit,
-              judging: trace.judgeResult,
-              cost: trace.cost,
-              durationMs: trace.durationMs,
-              error: trace.error,
-              traceFilePath: path.join(
-                logsDir,
-                `${commit.id.replace(/[^a-zA-Z0-9-]/g, '_')}-${trace.agentId.replace(/[^a-zA-Z0-9-]/g, '_')}-${commit.sha.slice(0, 7)}.json`,
-              ),
-              agentNumber: traceIndex + 1,
-              totalAgents: commitTraces.length,
-            })
-            console.log(formattedOutput)
-          })
-
-          const formattedAnalysis = formatTraceAnalysis({
-            commit,
-            overallAnalysis,
-            agentFeedback,
-          })
-          console.log(formattedAnalysis)
+              taskNumber: index + 1,
+              totalTasks: commitsToRun.length,
+              agentResults: commitTraces.map((trace) => ({
+                agentId: trace.agentId,
+                judging: trace.judgeResult,
+                cost: trace.cost,
+                durationMs: trace.durationMs,
+                error: trace.error,
+                traceFilePath: path.join(
+                  logsDir,
+                  `${commit.id.replace(/[^a-zA-Z0-9-]/g, '_')}-${trace.agentId.replace(/[^a-zA-Z0-9-]/g, '_')}-${commit.sha.slice(0, 7)}.json`,
+                ),
+              })),
+              traceAnalysis: { overallAnalysis, agentFeedback },
+            }),
+          )
         } catch (error) {
           console.error(
             `Failed to analyze traces for commit ${commit.sha}:`,
@@ -255,7 +242,7 @@ export async function runBuffBench(options: {
   for (const result of commitResults) {
     if (result.status === 'fulfilled') {
       const { commit, agentResults } = result.value
-      
+
       // Check if any agent had an error for this commit
       const hasAnyError = agentResults.some(({ evalRun }) => evalRun.error)
       if (hasAnyError) {
@@ -289,8 +276,7 @@ export async function runBuffBench(options: {
 
     agentData.averageDuration =
       validRuns.length > 0
-        ? validRuns.reduce((sum, r) => sum + r.durationMs, 0) /
-          validRuns.length
+        ? validRuns.reduce((sum, r) => sum + r.durationMs, 0) / validRuns.length
         : 0
   }