Skip to content

Commit be0808c

Browse files
committed
update script to analyze buffbench logs
1 parent e63debc · commit be0808c

File tree

1 file changed

+111
-35
lines changed

1 file changed

+111
-35
lines changed

scripts/analyze-buffbench-logs.ts

Lines changed: 111 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -2,15 +2,6 @@
22
import { readdirSync, readFileSync } from 'fs'
33
import { join } from 'path'
44

5-
interface JudgingResult {
6-
analysis: string
7-
strengths: string[]
8-
weaknesses: string[]
9-
completionScore: number
10-
codeQualityScore: number
11-
overallScore: number
12-
}
13-
145
interface AgentResult {
156
agentId: string
167
analysis: string
@@ -33,6 +24,32 @@ function analyzeBuffbenchLogs(logDirectory: string, filterBottom25 = false) {
3324
const files = readdirSync(logDirectory)
3425
const analysisFiles = files.filter((f) => f.includes('ANALYSIS'))
3526

27+
// First pass: collect all data and identify tasks to exclude
28+
const taskData: Record<string, AnalysisFile> = {}
29+
const tasksToExclude = new Set<string>()
30+
31+
for (const file of analysisFiles) {
32+
const filePath = join(logDirectory, file)
33+
const content = readFileSync(filePath, 'utf-8')
34+
const data: AnalysisFile = JSON.parse(content)
35+
const taskKey = data.commitSha
36+
37+
taskData[taskKey] = data
38+
39+
// Check if any agent in this task has zero or null overall score
40+
const hasZeroOrNullScore = data.results.some(
41+
(result) =>
42+
result.overallScore === 0 ||
43+
result.overallScore === null ||
44+
result.overallScore === undefined,
45+
)
46+
47+
if (hasZeroOrNullScore) {
48+
tasksToExclude.add(taskKey)
49+
}
50+
}
51+
52+
// Second pass: build agent scores excluding problematic tasks
3653
const agentScores: Record<
3754
string,
3855
{
@@ -44,10 +61,11 @@ function analyzeBuffbenchLogs(logDirectory: string, filterBottom25 = false) {
4461
}
4562
> = {}
4663

47-
for (const file of analysisFiles) {
48-
const filePath = join(logDirectory, file)
49-
const content = readFileSync(filePath, 'utf-8')
50-
const data: AnalysisFile = JSON.parse(content)
64+
for (const [taskKey, data] of Object.entries(taskData)) {
65+
// Skip tasks where any agent had zero/null score
66+
if (tasksToExclude.has(taskKey)) {
67+
continue
68+
}
5169

5270
for (const result of data.results) {
5371
if (!agentScores[result.agentId]) {
@@ -68,30 +86,77 @@ function analyzeBuffbenchLogs(logDirectory: string, filterBottom25 = false) {
6886
}
6987
}
7088

89+
if (tasksToExclude.size > 0) {
90+
console.log(
91+
`\nNote: Excluded ${tasksToExclude.size} task(s) where at least one agent had zero/null overall score\n`,
92+
)
93+
}
94+
7195
// Filter bottom 25% if requested
7296
if (filterBottom25) {
73-
for (const agentId in agentScores) {
74-
const data = agentScores[agentId]
75-
// Sort scores to find the 25th percentile
76-
const sortedScores = [...data.scores].sort((a, b) => a - b)
77-
const cutoffIndex = Math.floor(sortedScores.length * 0.25)
78-
const cutoffScore = sortedScores[cutoffIndex]
79-
80-
// Filter out tasks below the cutoff
81-
const filteredIndices = data.scores
82-
.map((score, idx) => (score >= cutoffScore ? idx : -1))
83-
.filter((idx) => idx !== -1)
84-
85-
agentScores[agentId] = {
86-
scores: filteredIndices.map((idx) => data.scores[idx]),
87-
completionScores: filteredIndices.map(
88-
(idx) => data.completionScores[idx],
89-
),
90-
qualityScores: filteredIndices.map((idx) => data.qualityScores[idx]),
91-
costs: filteredIndices.map((idx) => data.costs[idx]),
92-
durations: filteredIndices.map((idx) => data.durations[idx]),
97+
// Calculate a global cutoff based on all agents' scores combined
98+
const allScores: Array<{ score: number; taskKey: string }> = []
99+
100+
// Collect all scores with their task identifiers
101+
for (const [taskKey, data] of Object.entries(taskData)) {
102+
if (tasksToExclude.has(taskKey)) continue
103+
104+
for (const result of data.results) {
105+
allScores.push({ score: result.overallScore, taskKey })
106+
}
107+
}
108+
109+
// Sort by score and find the 25th percentile cutoff
110+
allScores.sort((a, b) => a.score - b.score)
111+
const cutoffIndex = Math.floor(allScores.length * 0.25)
112+
const cutoffScore = allScores[cutoffIndex]?.score ?? 0
113+
114+
// Identify tasks where ANY agent scored below the cutoff
115+
const tasksToExcludeForBottom25 = new Set<string>()
116+
for (const [taskKey, data] of Object.entries(taskData)) {
117+
if (tasksToExclude.has(taskKey)) continue
118+
119+
const hasLowScore = data.results.some(
120+
(result) => result.overallScore < cutoffScore,
121+
)
122+
if (hasLowScore) {
123+
tasksToExcludeForBottom25.add(taskKey)
93124
}
94125
}
126+
127+
// Rebuild agentScores excluding bottom 25% tasks
128+
const newAgentScores: typeof agentScores = {}
129+
130+
for (const [taskKey, data] of Object.entries(taskData)) {
131+
if (tasksToExclude.has(taskKey)) continue
132+
if (tasksToExcludeForBottom25.has(taskKey)) continue
133+
134+
for (const result of data.results) {
135+
if (!newAgentScores[result.agentId]) {
136+
newAgentScores[result.agentId] = {
137+
scores: [],
138+
completionScores: [],
139+
qualityScores: [],
140+
costs: [],
141+
durations: [],
142+
}
143+
}
144+
145+
newAgentScores[result.agentId].scores.push(result.overallScore)
146+
newAgentScores[result.agentId].completionScores.push(
147+
result.completionScore,
148+
)
149+
newAgentScores[result.agentId].qualityScores.push(
150+
result.codeQualityScore,
151+
)
152+
newAgentScores[result.agentId].costs.push(result.cost)
153+
newAgentScores[result.agentId].durations.push(result.durationMs)
154+
}
155+
}
156+
157+
// Replace agentScores with filtered version
158+
Object.keys(agentScores).forEach((key) => delete agentScores[key])
159+
Object.assign(agentScores, newAgentScores)
95160
}
96161

97162
// Calculate averages and stats
@@ -134,7 +199,7 @@ function analyzeBuffbenchLogs(logDirectory: string, filterBottom25 = false) {
134199
// Sort by average overall score descending
135200
results.sort((a, b) => b.averageOverallScore - a.averageOverallScore)
136201

137-
return results
202+
return { results, agentScores }
138203
}
139204

140205
// Main execution
@@ -143,9 +208,10 @@ const logDirectory = process.argv[2] || 'evals/buffbench/logs/2025-10-13T20-07'
143208
console.log(`Analyzing logs from: ${logDirectory}\n`)
144209

145210
function printTable(
146-
results: ReturnType<typeof analyzeBuffbenchLogs>,
211+
data: ReturnType<typeof analyzeBuffbenchLogs>,
147212
title: string,
148213
) {
214+
const { results, agentScores } = data
149215
console.log(title)
150216
console.log('='.repeat(130))
151217
console.log(
@@ -177,6 +243,16 @@ function printTable(
177243

178244
console.log('='.repeat(130))
179245
console.log(`Total agents analyzed: ${results.length}`)
246+
247+
// Print raw scores grouped by agent
248+
console.log('\n=== Raw Overall Scores by Agent ===')
249+
for (const result of results) {
250+
const scores = agentScores[result.agentId].scores
251+
.map((s) => s.toFixed(1))
252+
.join(', ')
253+
console.log(`\n${result.agentId}:`)
254+
console.log(` ${scores}`)
255+
}
180256
}
181257

182258
const allResults = analyzeBuffbenchLogs(logDirectory, false)

0 commit comments

Comments
 (0)