Skip to content

Commit c93f96e

Browse files
committed
Agent-specific recommendations
1 parent b15c36a commit c93f96e

File tree

3 files changed

+21
-28
lines changed

3 files changed

+21
-28
lines changed

evals/buffbench/judge.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -178,7 +178,7 @@ ${error ? `\n## Error Encountered\n${error}` : ''}`
178178
agentOutput.push(JSON.stringify(event, null, 2))
179179
}
180180
else if (event.type === 'error') {
181-
console.error('[Judge] Error event:', event.message)
181+
console.warn('[Judge] Error event:', event.message)
182182
}
183183
},
184184
})

evals/buffbench/run-buffbench.ts

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -227,7 +227,7 @@ export async function runBuffBench(options: {
227227
spec: commit.spec,
228228
}
229229

230-
const { overallAnalysis, agentFeedback, recommendations } = analysis
230+
const { overallAnalysis, agentFeedback } = analysis
231231
fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
232232
console.log(`Analysis saved to ${analysisPath}`)
233233
console.log(`\n=== Trace Analysis ===`)
@@ -237,18 +237,22 @@ export async function runBuffBench(options: {
237237
agentFeedback.forEach((feedback: any) => {
238238
console.log(`\n [${feedback.agentId}]`)
239239
if (feedback.strengths.length > 0) {
240-
console.log(` Strengths: ${feedback.strengths.join(', ')}`)
240+
console.log(
241+
` Strengths:\n${feedback.strengths.join('\n - ')}}`,
242+
)
241243
}
242244
if (feedback.weaknesses.length > 0) {
243-
console.log(` Weaknesses: ${feedback.weaknesses.join(', ')}`)
245+
console.log(
246+
` Weaknesses:\n${feedback.weaknesses.join('\n - ')}`,
247+
)
248+
}
249+
if (feedback.recommendations.length > 0) {
250+
console.log(
251+
` Recommendations:\n${feedback.recommendations.join('\n - ')}`,
252+
)
244253
}
245-
console.log(` Performance: ${feedback.relativePerformance}`)
246254
})
247255
}
248-
if (recommendations.length > 0) {
249-
console.log(`\nRecommendations:`)
250-
recommendations.forEach((r: string) => console.log(` - ${r}`))
251-
}
252256
} catch (error) {
253257
console.error(
254258
`Failed to analyze traces for commit ${commit.sha}:`,

evals/buffbench/trace-analyzer.ts

Lines changed: 8 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -137,26 +137,17 @@ const traceAnalyzerAgent: AgentDefinition = {
137137
type: 'array',
138138
items: { type: 'string' },
139139
},
140-
relativePerformance: {
141-
type: 'string',
142-
description: 'How this agent performed relative to others',
140+
recommendations: {
141+
type: 'array',
142+
items: { type: 'string' },
143+
description: 'Recommendations for improving this agent',
143144
},
144145
},
145-
required: [
146-
'agentId',
147-
'strengths',
148-
'weaknesses',
149-
'relativePerformance',
150-
],
146+
required: ['agentId', 'strengths', 'weaknesses', 'recommendations'],
151147
},
152148
},
153-
recommendations: {
154-
type: 'array',
155-
items: { type: 'string' },
156-
description: 'Recommendations for improving agents',
157-
},
158149
},
159-
required: ['overallAnalysis', 'agentFeedback', 'recommendations'],
150+
required: ['overallAnalysis', 'agentFeedback'],
160151
},
161152
systemPrompt: `You are an expert AI agent evaluator analyzing how different coding agents approach problems and make decisions.
162153
@@ -208,9 +199,8 @@ export async function analyzeAgentTraces({
208199
agentId: string
209200
strengths: string[]
210201
weaknesses: string[]
211-
relativePerformance: string
202+
recommendations: string[]
212203
}>
213-
recommendations: string[]
214204
}> {
215205
const truncatedTraces = traces.map((t) => ({
216206
agentId: t.agentId,
@@ -258,7 +248,7 @@ Focus on the HOW, not the WHAT: We want to understand and improve how agents wor
258248
} else if (event.type === 'tool_call') {
259249
agentOutput.push(JSON.stringify(event, null, 2))
260250
} else if (event.type === 'error') {
261-
console.error('[Trace Analyzer] Error event:', event.message)
251+
console.warn('[Trace Analyzer] Error event:', event.message)
262252
}
263253
},
264254
})
@@ -274,7 +264,6 @@ Focus on the HOW, not the WHAT: We want to understand and improve how agents wor
274264
return {
275265
overallAnalysis: 'Error running trace analyzer - not structured output',
276266
agentFeedback: [],
277-
recommendations: ['Trace analyzer failed to provide structured output'],
278267
}
279268
}
280269

0 commit comments

Comments
 (0)