Skip to content

Commit a7abaf5

Browse files
committed
Script to compare buffbench runs on tasks they both completed
1 parent df1e76a commit a7abaf5

File tree

1 file changed

+309
-0
lines changed

1 file changed

+309
-0
lines changed

scripts/compare-buffbench-runs.ts

Lines changed: 309 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,309 @@
1+
#!/usr/bin/env bun
2+
3+
/**
4+
* Compare buffbench runs on the tasks that they **BOTH** completed successfully.
5+
*/
6+
7+
import { readdirSync, readFileSync } from 'fs'
8+
import { join } from 'path'
9+
10+
/**
 * One agent's evaluation entry inside an ANALYSIS file.
 *
 * Only the numeric fields and `error` are read by this script; the textual
 * fields are carried along as part of the parsed JSON.
 */
interface AgentResult {
  /** Identifier of the agent; used (with a run suffix) as the grouping key. */
  agentId: string
  analysis: string
  strengths: string[]
  weaknesses: string[]
  completionScore: number
  codeQualityScore: number
  /** A task is excluded from the comparison when this is missing or <= 1.0 for any agent. */
  overallScore: number
  /** Reported in the table under "Cost ($)". */
  cost: number
  /** Duration in milliseconds; divided by 1000 for display in seconds. */
  durationMs: number
  /** Any value other than `null`/`undefined` marks the task as errored. */
  error?: string | null
}
22+
23+
/**
 * Parsed contents of a single ANALYSIS file: the evaluation of one task
 * (identified by its commit SHA) for every agent that ran on it.
 */
interface AnalysisFile {
  /** Commit SHA of the task; used as the key when matching tasks across runs. */
  commitSha: string
  timestamp: string
  /** One entry per agent evaluated on this task. */
  results: AgentResult[]
}
28+
29+
/** Metric samples collected for one agent in one run across the common tasks. */
interface AgentRunSamples {
  run: 1 | 2
  scores: number[]
  completionScores: number[]
  qualityScores: number[]
  costs: number[]
  durations: number[]
}

/** Parses every file in `dir` whose name contains "ANALYSIS", keyed by commit SHA. */
function loadAnalysisFiles(dir: string): Map<string, AnalysisFile> {
  const bySha = new Map<string, AnalysisFile>()
  for (const file of readdirSync(dir)) {
    if (!file.includes('ANALYSIS')) continue
    const data: AnalysisFile = JSON.parse(readFileSync(join(dir, file), 'utf-8'))
    // A later file with the same commit SHA replaces an earlier one.
    bySha.set(data.commitSha, data)
  }
  return bySha
}

/** True when at least one agent result carries a non-null `error`. */
function hasAgentError(results: AgentResult[]): boolean {
  return results.some((r) => r.error !== undefined && r.error !== null)
}

/** True when at least one agent result has a missing overall score or one <= 1.0. */
function hasLowScore(results: AgentResult[]): boolean {
  return results.some((r) => r.overallScore == null || r.overallScore <= 1.0)
}

/** Arithmetic mean; callers only pass non-empty arrays. */
function meanOf(values: number[]): number {
  return values.reduce((a, b) => a + b, 0) / values.length
}

/** Formats a difference with an explicit leading "+" for non-negative values. */
function formatSigned(value: number, digits: number): string {
  return `${value >= 0 ? '+' : ''}${value.toFixed(digits)}`
}

/** Prints the per-run task counts (total, errored, low-scoring, excluded, valid). */
function printRunSummary(
  runNumber: 1 | 2,
  runData: Map<string, AnalysisFile>,
): void {
  const files = Array.from(runData.values())
  const withErrors = files.filter((f) => hasAgentError(f.results)).length
  const withLowScores = files.filter((f) => hasLowScore(f.results)).length
  // A task counts once even if it has both an error and a low score.
  const excluded = files.filter(
    (f) => hasAgentError(f.results) || hasLowScore(f.results),
  ).length

  console.log(`\nTotal tasks in Run ${runNumber}: ${runData.size}`)
  console.log(` - With errors: ${withErrors}`)
  console.log(` - With scores ≤1.0: ${withLowScores}`)
  console.log(` - Excluded (errors OR low scores): ${excluded}`)
  console.log(` - Valid: ${runData.size - excluded}`)
}

/** Appends each agent's metrics for the given tasks of one run into `samples`. */
function collectRunSamples(
  samples: Map<string, AgentRunSamples>,
  runNumber: 1 | 2,
  runData: Map<string, AnalysisFile>,
  shas: string[],
): void {
  for (const sha of shas) {
    const file = runData.get(sha)
    if (file === undefined) continue
    for (const result of file.results) {
      const key = `${result.agentId} (Run ${runNumber})`
      let bucket = samples.get(key)
      if (bucket === undefined) {
        bucket = {
          run: runNumber,
          scores: [],
          completionScores: [],
          qualityScores: [],
          costs: [],
          durations: [],
        }
        samples.set(key, bucket)
      }
      bucket.scores.push(result.overallScore)
      bucket.completionScores.push(result.completionScore)
      bucket.qualityScores.push(result.codeQualityScore)
      bucket.costs.push(result.cost)
      bucket.durations.push(result.durationMs)
    }
  }
}

/**
 * Compares two buffbench runs on the tasks that both completed successfully
 * and prints the result to stdout.
 *
 * A task (commit SHA) is compared only when it appears in both directories
 * and, in both runs, no agent result has an `error` and every agent has an
 * overall score above 1.0. The output consists of per-run task counts, a
 * per-agent statistics table, and a head-to-head section for agents that
 * appear in both runs.
 *
 * @param dir1 Directory holding the ANALYSIS files of run 1.
 * @param dir2 Directory holding the ANALYSIS files of run 2.
 * @throws Propagates errors from reading the directories/files and from
 *   `JSON.parse` on a malformed ANALYSIS file.
 */
function compareBuffbenchRuns(dir1: string, dir2: string): void {
  const run1Data = loadAnalysisFiles(dir1)
  const run2Data = loadAnalysisFiles(dir2)

  // Tasks present in both runs where neither run has errors or low scores.
  const commonCommitShas: string[] = []
  for (const [sha, run1File] of run1Data) {
    const run2File = run2Data.get(sha)
    if (run2File === undefined) continue
    if (hasAgentError(run1File.results) || hasAgentError(run2File.results)) continue
    if (hasLowScore(run1File.results) || hasLowScore(run2File.results)) continue
    commonCommitShas.push(sha)
  }

  console.log(`\nRun 1: ${dir1}`)
  console.log(`Run 2: ${dir2}`)
  printRunSummary(1, run1Data)
  printRunSummary(2, run2Data)
  console.log(
    `\nCommon tasks (both completed successfully with scores >1.0): ${commonCommitShas.length}\n`,
  )

  if (commonCommitShas.length === 0) {
    console.log(
      'No common successfully-completed tasks with scores >1.0 found between the two runs!',
    )
    return
  }

  // Run 1 agents are inserted before Run 2 agents; keys carry the run suffix
  // so the same agent in different runs stays separate.
  const agentSamples = new Map<string, AgentRunSamples>()
  collectRunSamples(agentSamples, 1, run1Data, commonCommitShas)
  collectRunSamples(agentSamples, 2, run2Data, commonCommitShas)

  const results = Array.from(agentSamples.entries()).map(([agentKey, data]) => {
    const avgOverall = meanOf(data.scores)
    // Population variance (divides by N, not N - 1).
    const variance =
      data.scores.reduce(
        (sum, score) => sum + Math.pow(score - avgOverall, 2),
        0,
      ) / data.scores.length

    return {
      agentKey,
      run: data.run,
      count: data.scores.length,
      averageOverallScore: avgOverall,
      averageCompletionScore: meanOf(data.completionScores),
      averageQualityScore: meanOf(data.qualityScores),
      minOverallScore: Math.min(...data.scores),
      maxOverallScore: Math.max(...data.scores),
      stdDevOverall: Math.sqrt(variance),
      averageCost: meanOf(data.costs),
      averageDurationMs: meanOf(data.durations),
    }
  })

  // Sort by run, then by average overall score descending.
  results.sort((a, b) => {
    if (a.run !== b.run) return a.run - b.run
    return b.averageOverallScore - a.averageOverallScore
  })

  // Comparison table.
  console.log('Comparison on Common Tasks (N=' + commonCommitShas.length + ')')
  console.log('='.repeat(140))
  console.log(
    'Agent'.padEnd(45),
    'Count'.padEnd(8),
    'Overall'.padEnd(10),
    'Min'.padEnd(8),
    'Max'.padEnd(8),
    'StdDev'.padEnd(10),
    'Completion'.padEnd(12),
    'Quality'.padEnd(10),
    'Cost ($)'.padEnd(10),
    'Duration (s)',
  )
  console.log('='.repeat(140))

  for (const result of results) {
    console.log(
      result.agentKey.padEnd(45),
      result.count.toString().padEnd(8),
      result.averageOverallScore.toFixed(2).padEnd(10),
      result.minOverallScore.toFixed(2).padEnd(8),
      result.maxOverallScore.toFixed(2).padEnd(8),
      result.stdDevOverall.toFixed(2).padEnd(10),
      result.averageCompletionScore.toFixed(2).padEnd(12),
      result.averageQualityScore.toFixed(2).padEnd(10),
      result.averageCost.toFixed(2).padEnd(10),
      (result.averageDurationMs / 1000).toFixed(1),
    )
  }

  console.log('='.repeat(140))

  // Head-to-head: pair up the Run 1 and Run 2 rows of the same agent.
  console.log('\n=== Head-to-Head Comparison ===\n')

  const agentGroups = new Map<string, typeof results>()
  for (const result of results) {
    // Strip the " (Run N)" suffix added when collecting samples.
    const agentName = result.agentKey.replace(/ \(Run [12]\)$/, '')
    const group = agentGroups.get(agentName)
    if (group === undefined) {
      agentGroups.set(agentName, [result])
    } else {
      group.push(result)
    }
  }

  for (const [agentName, agentResults] of agentGroups.entries()) {
    if (agentResults.length !== 2) continue
    const run1 = agentResults.find((r) => r.run === 1)
    const run2 = agentResults.find((r) => r.run === 2)
    if (!run1 || !run2) continue

    const scoreDiff = run2.averageOverallScore - run1.averageOverallScore
    const costDiff = run2.averageCost - run1.averageCost
    const durationDiff =
      (run2.averageDurationMs - run1.averageDurationMs) / 1000

    console.log(`${agentName}:`)
    console.log(
      ` Overall Score: ${run1.averageOverallScore.toFixed(2)} → ${run2.averageOverallScore.toFixed(2)} (${formatSigned(scoreDiff, 2)})`,
    )
    console.log(
      ` Cost: $${run1.averageCost.toFixed(2)} → $${run2.averageCost.toFixed(2)} (${formatSigned(costDiff, 2)})`,
    )
    console.log(
      ` Duration: ${(run1.averageDurationMs / 1000).toFixed(1)}s → ${(run2.averageDurationMs / 1000).toFixed(1)}s (${formatSigned(durationDiff, 1)}s)`,
    )
    console.log()
  }
}
295+
296+
// Main execution
297+
// Positional arguments select the two log directories. A missing (or empty)
// argument falls back to the hard-coded default for that position.
const logDir1 =
  process.argv[2] || 'evals/buffbench/logs/2025-10-20T06-29'
const logDir2 =
  process.argv[3] || 'evals/buffbench/logs/2025-10-20T21-26'

// Print usage and the directories actually used whenever either argument was
// omitted; the comparison still runs afterwards.
if (!process.argv[2] || !process.argv[3]) {
  console.log('Usage: bun run scripts/compare-buffbench-runs.ts <log-dir-1> <log-dir-2>')
  console.log('\nUsing default directories:')
  console.log(` Dir 1: ${logDir1}`)
  console.log(` Dir 2: ${logDir2}\n`)
}

compareBuffbenchRuns(logDir1, logDir2)

0 commit comments

Comments
 (0)