Agent-specific recommendations

jahooma · jahooma · commit c93f96e7c889 · 2025-10-12T15:12:36.000-07:00
diff --git a/evals/buffbench/judge.ts b/evals/buffbench/judge.ts
@@ -178,7 +178,7 @@ ${error ? `\n## Error Encountered\n${error}` : ''}`
         agentOutput.push(JSON.stringify(event, null, 2))
       }
       else if (event.type === 'error') {
-        console.error('[Judge] Error event:', event.message)
+        console.warn('[Judge] Error event:', event.message)
       }
     },
   })
diff --git a/evals/buffbench/run-buffbench.ts b/evals/buffbench/run-buffbench.ts
@@ -227,7 +227,7 @@ export async function runBuffBench(options: {
             spec: commit.spec,
           }
 
-          const { overallAnalysis, agentFeedback, recommendations } = analysis
+          const { overallAnalysis, agentFeedback } = analysis
           fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
           console.log(`Analysis saved to ${analysisPath}`)
           console.log(`\n=== Trace Analysis ===`)
@@ -237,18 +237,22 @@ export async function runBuffBench(options: {
             agentFeedback.forEach((feedback: any) => {
               console.log(`\n  [${feedback.agentId}]`)
               if (feedback.strengths.length > 0) {
-                console.log(`    Strengths: ${feedback.strengths.join(', ')}`)
+                console.log(
+                  `    Strengths:\n    - ${feedback.strengths.join('\n    - ')}`,
+                )
               }
               if (feedback.weaknesses.length > 0) {
-                console.log(`    Weaknesses: ${feedback.weaknesses.join(', ')}`)
+                console.log(
+                  `    Weaknesses:\n    - ${feedback.weaknesses.join('\n    - ')}`,
+                )
+              }
+              if (feedback.recommendations.length > 0) {
+                console.log(
+                  `    Recommendations:\n    - ${feedback.recommendations.join('\n    - ')}`,
+                )
               }
-              console.log(`    Performance: ${feedback.relativePerformance}`)
             })
           }
-          if (recommendations.length > 0) {
-            console.log(`\nRecommendations:`)
-            recommendations.forEach((r: string) => console.log(`  - ${r}`))
-          }
         } catch (error) {
           console.error(
             `Failed to analyze traces for commit ${commit.sha}:`,
diff --git a/evals/buffbench/trace-analyzer.ts b/evals/buffbench/trace-analyzer.ts
@@ -137,26 +137,17 @@ const traceAnalyzerAgent: AgentDefinition = {
               type: 'array',
               items: { type: 'string' },
             },
-            relativePerformance: {
-              type: 'string',
-              description: 'How this agent performed relative to others',
+            recommendations: {
+              type: 'array',
+              items: { type: 'string' },
+              description: 'Recommendations for improving this agent',
             },
           },
-          required: [
-            'agentId',
-            'strengths',
-            'weaknesses',
-            'relativePerformance',
-          ],
+          required: ['agentId', 'strengths', 'weaknesses', 'recommendations'],
         },
       },
-      recommendations: {
-        type: 'array',
-        items: { type: 'string' },
-        description: 'Recommendations for improving agents',
-      },
     },
-    required: ['overallAnalysis', 'agentFeedback', 'recommendations'],
+    required: ['overallAnalysis', 'agentFeedback'],
   },
   systemPrompt: `You are an expert AI agent evaluator analyzing how different coding agents approach problems and make decisions.
 
@@ -208,9 +199,8 @@ export async function analyzeAgentTraces({
     agentId: string
     strengths: string[]
     weaknesses: string[]
-    relativePerformance: string
+    recommendations: string[]
   }>
-  recommendations: string[]
 }> {
   const truncatedTraces = traces.map((t) => ({
     agentId: t.agentId,
@@ -258,7 +248,7 @@ Focus on the HOW, not the WHAT: We want to understand and improve how agents wor
       } else if (event.type === 'tool_call') {
         agentOutput.push(JSON.stringify(event, null, 2))
       } else if (event.type === 'error') {
-        console.error('[Trace Analyzer] Error event:', event.message)
+        console.warn('[Trace Analyzer] Error event:', event.message)
       }
     },
   })
@@ -274,7 +264,6 @@ Focus on the HOW, not the WHAT: We want to understand and improve how agents wor
     return {
       overallAnalysis: 'Error running trace analyzer - not structured output',
       agentFeedback: [],
-      recommendations: ['Trace analyzer failed to provide structured output'],
     }
   }
 

Original file line number	Diff line number	Diff line change
@@ -178,7 +178,7 @@ ${error ? `\n## Error Encountered\n${error}` : ''}`
`178`	`178`	`agentOutput.push(JSON.stringify(event, null, 2))`
`179`	`179`	`}`
`180`	`180`	`else if (event.type === 'error') {`
`181`		`- console.error('[Judge] Error event:', event.message)`
	`181`	`+ console.warn('[Judge] Error event:', event.message)`
`182`	`182`	`}`
`183`	`183`	`},`
`184`	`184`	`})`