|
| 1 | +#!/usr/bin/env bun |
| 2 | + |
| 3 | +/** |
| 4 | + * Compare buffbench runs on the tasks that they **BOTH** completed successfully. |
| 5 | + */ |
| 6 | + |
| 7 | +import { readdirSync, readFileSync } from 'fs' |
| 8 | +import { join } from 'path' |
| 9 | + |
/**
 * One agent's judged outcome for a single buffbench task, as stored in the
 * `results` array of an ANALYSIS file.
 */
interface AgentResult {
  /** Identifier of the agent; results are grouped by this value when aggregating. */
  agentId: string
  /** Free-form analysis text (not read by the comparison script). */
  analysis: string
  /** Listed strengths (not read by the comparison script). */
  strengths: string[]
  /** Listed weaknesses (not read by the comparison script). */
  weaknesses: string[]
  /** Completion score; averaged into the "Completion" column. */
  completionScore: number
  /** Code-quality score; averaged into the "Quality" column. */
  codeQualityScore: number
  /**
   * Overall score. The comparison treats a missing value or a value <= 1.0
   * as a failed task and excludes that task.
   */
  overallScore: number
  /** Cost of the attempt; reported under the "Cost ($)" column. */
  cost: number
  /** Duration in milliseconds; divided by 1000 for the "Duration (s)" column. */
  durationMs: number
  /** Any value other than `null`/`undefined` marks the attempt as errored. */
  error?: string | null
}
| 22 | + |
/**
 * Parsed contents of one ANALYSIS file: the judged results of every agent
 * for a single task.
 */
interface AnalysisFile {
  /** Commit SHA of the task; used as the key for matching tasks between runs. */
  commitSha: string
  /** Timestamp string as stored in the file (not read by the comparison script). */
  timestamp: string
  /** One entry per agent that attempted the task. */
  results: AgentResult[]
}
| 28 | + |
/** Raw per-task samples collected for one agent within one run. */
interface AgentSamples {
  run: 1 | 2
  scores: number[]
  completionScores: number[]
  qualityScores: number[]
  costs: number[]
  durations: number[]
}

/** Aggregated statistics for one agent within one run. */
interface AgentSummary {
  agentId: string
  agentKey: string
  run: 1 | 2
  count: number
  averageOverallScore: number
  averageCompletionScore: number
  averageQualityScore: number
  minOverallScore: number
  maxOverallScore: number
  stdDevOverall: number
  averageCost: number
  averageDurationMs: number
}

/** True when the result carries an error (anything other than null/undefined). */
function resultHasError(result: AgentResult): boolean {
  return result.error !== undefined && result.error !== null
}

/** True when the overall score is missing or <= 1.0. */
function resultHasLowScore(result: AgentResult): boolean {
  return result.overallScore == null || result.overallScore <= 1.0
}

/** True when no agent in the task errored or scored low. */
function isUsableTask(task: AnalysisFile): boolean {
  return !task.results.some(resultHasError) && !task.results.some(resultHasLowScore)
}

/** Minimal structural check that parsed JSON has the fields this script reads. */
function isAnalysisFile(value: unknown): value is AnalysisFile {
  if (typeof value !== 'object' || value === null) return false
  const candidate = value as { commitSha?: unknown; results?: unknown }
  return (
    typeof candidate.commitSha === 'string' &&
    Array.isArray(candidate.results) &&
    candidate.results.every((r) => typeof r === 'object' && r !== null)
  )
}

/**
 * Reads every entry of `dir` whose name contains "ANALYSIS" and indexes the
 * parsed files by commit SHA. Entries that cannot be read or parsed, or that
 * lack the expected shape, are skipped with a warning instead of aborting the
 * whole comparison. A later file with the same SHA replaces an earlier one.
 */
function loadAnalysisFiles(dir: string): Map<string, AnalysisFile> {
  const bySha = new Map<string, AnalysisFile>()
  for (const file of readdirSync(dir)) {
    if (!file.includes('ANALYSIS')) continue
    const path = join(dir, file)
    let parsed: unknown
    try {
      parsed = JSON.parse(readFileSync(path, 'utf-8'))
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err)
      console.warn(`Skipping unreadable analysis file ${path}: ${reason}`)
      continue
    }
    if (!isAnalysisFile(parsed)) {
      console.warn(`Skipping analysis file with unexpected shape: ${path}`)
      continue
    }
    bySha.set(parsed.commitSha, parsed)
  }
  return bySha
}

/** Prints the per-run task counts (total, errored, low-scoring, excluded, valid). */
function printRunCounts(runNumber: 1 | 2, runData: Map<string, AnalysisFile>): void {
  const tasks = Array.from(runData.values())
  const withErrors = tasks.filter((task) => task.results.some(resultHasError))
  const withLowScores = tasks.filter((task) => task.results.some(resultHasLowScore))
  // A task may be both errored and low-scoring; the set counts it once.
  const excluded = new Set([...withErrors, ...withLowScores])

  console.log(`\nTotal tasks in Run ${runNumber}: ${runData.size}`)
  console.log(` - With errors: ${withErrors.length}`)
  console.log(` - With scores ≤1.0: ${withLowScores.length}`)
  console.log(` - Excluded (errors OR low scores): ${excluded.size}`)
  console.log(` - Valid: ${runData.size - excluded.size}`)
}

/** Gathers each agent's samples over the given tasks of one run, keyed by agent id. */
function collectSamples(
  runData: Map<string, AnalysisFile>,
  shas: readonly string[],
  run: 1 | 2,
): Map<string, AgentSamples> {
  const byAgent = new Map<string, AgentSamples>()
  for (const sha of shas) {
    const task = runData.get(sha)
    if (!task) continue
    for (const result of task.results) {
      let samples = byAgent.get(result.agentId)
      if (!samples) {
        samples = {
          run,
          scores: [],
          completionScores: [],
          qualityScores: [],
          costs: [],
          durations: [],
        }
        byAgent.set(result.agentId, samples)
      }
      samples.scores.push(result.overallScore)
      samples.completionScores.push(result.completionScore)
      samples.qualityScores.push(result.codeQualityScore)
      samples.costs.push(result.cost)
      samples.durations.push(result.durationMs)
    }
  }
  return byAgent
}

/**
 * Arithmetic mean over the finite entries only, so a metric missing from a
 * single task does not turn the whole average into NaN. Returns NaN when no
 * finite entry exists.
 */
function meanOfFinite(values: readonly number[]): number {
  const finite = values.filter((v) => Number.isFinite(v))
  if (finite.length === 0) return NaN
  return finite.reduce((sum, v) => sum + v, 0) / finite.length
}

/** Reduces one agent's samples to the statistics shown in the report. */
function summarizeAgent(agentId: string, samples: AgentSamples): AgentSummary {
  const averageOverallScore = meanOfFinite(samples.scores)
  // Population variance (divides by N), matching the reported StdDev column.
  const variance =
    samples.scores.reduce(
      (sum, score) => sum + Math.pow(score - averageOverallScore, 2),
      0,
    ) / samples.scores.length

  return {
    agentId,
    agentKey: `${agentId} (Run ${samples.run})`,
    run: samples.run,
    count: samples.scores.length,
    averageOverallScore,
    averageCompletionScore: meanOfFinite(samples.completionScores),
    averageQualityScore: meanOfFinite(samples.qualityScores),
    minOverallScore: Math.min(...samples.scores),
    maxOverallScore: Math.max(...samples.scores),
    stdDevOverall: Math.sqrt(variance),
    averageCost: meanOfFinite(samples.costs),
    averageDurationMs: meanOfFinite(samples.durations),
  }
}

/** Formats a delta with an explicit "+" for non-negative values. */
function formatSigned(value: number, digits: number): string {
  return `${value >= 0 ? '+' : ''}${value.toFixed(digits)}`
}

/**
 * Compares two buffbench runs on the tasks both of them completed
 * successfully, and prints the report to stdout.
 *
 * A task (identified by commit SHA) is compared only when it exists in both
 * directories and, in both runs, no agent result has an error or an overall
 * score that is missing or <= 1.0. The report contains per-run task counts,
 * a per-agent statistics table, and a head-to-head delta for every agent
 * present in both runs.
 *
 * @param dir1 Directory holding the ANALYSIS files of run 1.
 * @param dir2 Directory holding the ANALYSIS files of run 2.
 * @throws If either directory cannot be listed (error from `readdirSync`).
 */
function compareBuffbenchRuns(dir1: string, dir2: string): void {
  const run1Data = loadAnalysisFiles(dir1)
  const run2Data = loadAnalysisFiles(dir2)

  // Tasks present in both runs and usable (no errors, no low scores) in both.
  const commonCommitShas = Array.from(run1Data)
    .filter(([sha, task]) => {
      const counterpart = run2Data.get(sha)
      return counterpart !== undefined && isUsableTask(task) && isUsableTask(counterpart)
    })
    .map(([sha]) => sha)

  console.log(`\nRun 1: ${dir1}`)
  console.log(`Run 2: ${dir2}`)
  printRunCounts(1, run1Data)
  printRunCounts(2, run2Data)
  console.log(`\nCommon tasks (both completed successfully with scores >1.0): ${commonCommitShas.length}\n`)

  if (commonCommitShas.length === 0) {
    console.log('No common successfully-completed tasks with scores >1.0 found between the two runs!')
    return
  }

  const summaries: AgentSummary[] = []
  for (const [agentId, samples] of collectSamples(run1Data, commonCommitShas, 1)) {
    summaries.push(summarizeAgent(agentId, samples))
  }
  for (const [agentId, samples] of collectSamples(run2Data, commonCommitShas, 2)) {
    summaries.push(summarizeAgent(agentId, samples))
  }

  // Sort by run, then by average overall score descending.
  summaries.sort((a, b) =>
    a.run !== b.run ? a.run - b.run : b.averageOverallScore - a.averageOverallScore,
  )

  // Comparison table
  const rule = '='.repeat(140)
  console.log('Comparison on Common Tasks (N=' + commonCommitShas.length + ')')
  console.log(rule)
  console.log(
    'Agent'.padEnd(45),
    'Count'.padEnd(8),
    'Overall'.padEnd(10),
    'Min'.padEnd(8),
    'Max'.padEnd(8),
    'StdDev'.padEnd(10),
    'Completion'.padEnd(12),
    'Quality'.padEnd(10),
    'Cost ($)'.padEnd(10),
    'Duration (s)',
  )
  console.log(rule)

  for (const summary of summaries) {
    console.log(
      summary.agentKey.padEnd(45),
      summary.count.toString().padEnd(8),
      summary.averageOverallScore.toFixed(2).padEnd(10),
      summary.minOverallScore.toFixed(2).padEnd(8),
      summary.maxOverallScore.toFixed(2).padEnd(8),
      summary.stdDevOverall.toFixed(2).padEnd(10),
      summary.averageCompletionScore.toFixed(2).padEnd(12),
      summary.averageQualityScore.toFixed(2).padEnd(10),
      summary.averageCost.toFixed(2).padEnd(10),
      (summary.averageDurationMs / 1000).toFixed(1),
    )
  }

  console.log(rule)

  // Head-to-head deltas for agents that appear in both runs.
  console.log('\n=== Head-to-Head Comparison ===\n')

  const pairs = new Map<string, { run1?: AgentSummary; run2?: AgentSummary }>()
  for (const summary of summaries) {
    let pair = pairs.get(summary.agentId)
    if (!pair) {
      pair = {}
      pairs.set(summary.agentId, pair)
    }
    if (summary.run === 1) {
      pair.run1 = summary
    } else {
      pair.run2 = summary
    }
  }

  for (const [agentName, { run1, run2 }] of pairs) {
    if (!run1 || !run2) continue

    const scoreDiff = run2.averageOverallScore - run1.averageOverallScore
    const costDiff = run2.averageCost - run1.averageCost
    const durationDiff = (run2.averageDurationMs - run1.averageDurationMs) / 1000

    console.log(`${agentName}:`)
    console.log(
      ` Overall Score: ${run1.averageOverallScore.toFixed(2)} → ${run2.averageOverallScore.toFixed(2)} (${formatSigned(scoreDiff, 2)})`,
    )
    console.log(
      ` Cost: $${run1.averageCost.toFixed(2)} → $${run2.averageCost.toFixed(2)} (${formatSigned(costDiff, 2)})`,
    )
    console.log(
      ` Duration: ${(run1.averageDurationMs / 1000).toFixed(1)}s → ${(run2.averageDurationMs / 1000).toFixed(1)}s (${formatSigned(durationDiff, 1)}s)`,
    )
    console.log()
  }
}
| 295 | + |
// Script entry point: take both log directories from the command line,
// falling back to the built-in defaults for any argument that is missing.
const [, , firstDirArg, secondDirArg] = process.argv

const logDir1 = firstDirArg || 'evals/buffbench/logs/2025-10-20T06-29'
const logDir2 = secondDirArg || 'evals/buffbench/logs/2025-10-20T21-26'

// Show the usage hint whenever at least one directory came from a default.
const bothDirsProvided = Boolean(firstDirArg && secondDirArg)
if (!bothDirsProvided) {
  const notice = [
    'Usage: bun run scripts/compare-buffbench-runs.ts <log-dir-1> <log-dir-2>',
    '\nUsing default directories:',
    ` Dir 1: ${logDir1}`,
    ` Dir 2: ${logDir2}\n`,
  ]
  for (const line of notice) {
    console.log(line)
  }
}

compareBuffbenchRuns(logDir1, logDir2)
0 commit comments