22import { readdirSync , readFileSync } from 'fs'
33import { join } from 'path'
44
5- interface JudgingResult {
6- analysis : string
7- strengths : string [ ]
8- weaknesses : string [ ]
9- completionScore : number
10- codeQualityScore : number
11- overallScore : number
12- }
13-
145interface AgentResult {
156 agentId : string
167 analysis : string
// analyzeBuffbenchLogs: aggregate per-agent scores from the ANALYSIS files found in logDirectory.
// Visible steps: (1) parse every ANALYSIS file and flag tasks to exclude, (2) accumulate
// per-agent score arrays from the remaining tasks, (3) when filterBottom25 is true, drop
// low-scoring tasks and rebuild the per-agent arrays, (4) sort the summary rows and return
// them together with the raw agentScores map.
// Parameters: logDirectory is the directory to scan; filterBottom25 (default false) enables step 3.
// NOTE(review): this view is a diff with elided hunks, so parts of the body are not shown here.
@@ -33,6 +24,32 @@ function analyzeBuffbenchLogs(logDirectory: string, filterBottom25 = false) {
3324 const files = readdirSync ( logDirectory )
// Only directory entries whose name contains the substring ANALYSIS are processed.
3425 const analysisFiles = files . filter ( ( f ) => f . includes ( 'ANALYSIS' ) )
3526
27+ // First pass: collect all data and identify tasks to exclude
28+ const taskData : Record < string , AnalysisFile > = { }
29+ const tasksToExclude = new Set < string > ( )
30+
31+ for ( const file of analysisFiles ) {
32+ const filePath = join ( logDirectory , file )
33+ const content = readFileSync ( filePath , 'utf-8' )
// NOTE(review): the parsed JSON is typed as AnalysisFile with no runtime validation.
34+ const data : AnalysisFile = JSON . parse ( content )
// Tasks are keyed by commitSha; a later file with the same commitSha overwrites the
// earlier entry in taskData.
35+ const taskKey = data . commitSha
36+
37+ taskData [ taskKey ] = data
38+
39+ // Check if any agent in this task has zero or null overall score
// Strict comparisons: an undefined (missing) score is treated the same as 0 or null.
40+ const hasZeroOrNullScore = data . results . some (
41+ ( result ) =>
42+ result . overallScore === 0 ||
43+ result . overallScore === null ||
44+ result . overallScore === undefined ,
45+ )
46+
47+ if ( hasZeroOrNullScore ) {
48+ tasksToExclude . add ( taskKey )
49+ }
50+ }
51+
52+ // Second pass: build agent scores excluding problematic tasks
3653 const agentScores : Record <
3754 string ,
3855 {
@@ -44,10 +61,11 @@ function analyzeBuffbenchLogs(logDirectory: string, filterBottom25 = false) {
4461 }
4562 > = { }
4663
// The minus-marked lines below are the old loop header that re-read every file; the
// plus-marked lines replace it with iteration over the already parsed taskData.
47- for ( const file of analysisFiles ) {
48- const filePath = join ( logDirectory , file )
49- const content = readFileSync ( filePath , 'utf-8' )
50- const data : AnalysisFile = JSON . parse ( content )
64+ for ( const [ taskKey , data ] of Object . entries ( taskData ) ) {
65+ // Skip tasks where any agent had zero/null score
66+ if ( tasksToExclude . has ( taskKey ) ) {
67+ continue
68+ }
5169
5270 for ( const result of data . results ) {
5371 if ( ! agentScores [ result . agentId ] ) {
// NOTE(review): the statements that initialise and fill agentScores for this loop fall in
// lines elided from this view.
@@ -68,30 +86,77 @@ function analyzeBuffbenchLogs(logDirectory: string, filterBottom25 = false) {
6886 }
6987 }
7088
// Report how many tasks were excluded by the first pass, only when there is at least one.
89+ if ( tasksToExclude . size > 0 ) {
90+ console . log (
91+ `\nNote: Excluded ${ tasksToExclude . size } task(s) where at least one agent had zero/null overall score\n` ,
92+ )
93+ }
94+
7195 // Filter bottom 25% if requested
7296 if ( filterBottom25 ) {
// The minus-marked lines below are the previous filter, which computed a separate cutoff
// per agent; this change removes them in favour of one cutoff shared by all agents.
73- for ( const agentId in agentScores ) {
74- const data = agentScores [ agentId ]
75- // Sort scores to find the 25th percentile
76- const sortedScores = [ ...data . scores ] . sort ( ( a , b ) => a - b )
77- const cutoffIndex = Math . floor ( sortedScores . length * 0.25 )
78- const cutoffScore = sortedScores [ cutoffIndex ]
79-
80- // Filter out tasks below the cutoff
81- const filteredIndices = data . scores
82- . map ( ( score , idx ) => ( score >= cutoffScore ? idx : - 1 ) )
83- . filter ( ( idx ) => idx !== - 1 )
84-
85- agentScores [ agentId ] = {
86- scores : filteredIndices . map ( ( idx ) => data . scores [ idx ] ) ,
87- completionScores : filteredIndices . map (
88- ( idx ) => data . completionScores [ idx ] ,
89- ) ,
90- qualityScores : filteredIndices . map ( ( idx ) => data . qualityScores [ idx ] ) ,
91- costs : filteredIndices . map ( ( idx ) => data . costs [ idx ] ) ,
92- durations : filteredIndices . map ( ( idx ) => data . durations [ idx ] ) ,
97+ // Calculate a global cutoff based on all agents' scores combined
98+ const allScores : Array < { score : number ; taskKey : string } > = [ ]
99+
100+ // Collect all scores with their task identifiers
101+ for ( const [ taskKey , data ] of Object . entries ( taskData ) ) {
102+ if ( tasksToExclude . has ( taskKey ) ) continue
103+
104+ for ( const result of data . results ) {
105+ allScores . push ( { score : result . overallScore , taskKey } )
106+ }
107+ }
108+
109+ // Sort by score and find the 25th percentile cutoff
110+ allScores . sort ( ( a , b ) => a . score - b . score )
// cutoffIndex is floor(count * 0.25) into the ascending list. If the list is empty the
// lookup yields undefined and the cutoff falls back to 0.
111+ const cutoffIndex = Math . floor ( allScores . length * 0.25 )
112+ const cutoffScore = allScores [ cutoffIndex ] ?. score ?? 0
113+
114+ // Identify tasks where ANY agent scored below the cutoff
115+ const tasksToExcludeForBottom25 = new Set < string > ( )
116+ for ( const [ taskKey , data ] of Object . entries ( taskData ) ) {
117+ if ( tasksToExclude . has ( taskKey ) ) continue
118+
// Strictly-less-than: a result equal to the cutoff is kept.
// NOTE(review): one low result removes the whole task for every agent, so the share of
// tasks removed is not necessarily 25% - confirm this is the intended semantics.
119+ const hasLowScore = data . results . some (
120+ ( result ) => result . overallScore < cutoffScore ,
121+ )
122+ if ( hasLowScore ) {
123+ tasksToExcludeForBottom25 . add ( taskKey )
93124 }
94125 }
126+
127+ // Rebuild agentScores excluding bottom 25% tasks
// newAgentScores is declared with the same type as agentScores (typeof agentScores).
128+ const newAgentScores : typeof agentScores = { }
129+
130+ for ( const [ taskKey , data ] of Object . entries ( taskData ) ) {
131+ if ( tasksToExclude . has ( taskKey ) ) continue
132+ if ( tasksToExcludeForBottom25 . has ( taskKey ) ) continue
133+
134+ for ( const result of data . results ) {
// Lazily create the five parallel arrays the first time an agentId is seen.
135+ if ( ! newAgentScores [ result . agentId ] ) {
136+ newAgentScores [ result . agentId ] = {
137+ scores : [ ] ,
138+ completionScores : [ ] ,
139+ qualityScores : [ ] ,
140+ costs : [ ] ,
141+ durations : [ ] ,
142+ }
143+ }
144+
145+ newAgentScores [ result . agentId ] . scores . push ( result . overallScore )
146+ newAgentScores [ result . agentId ] . completionScores . push (
147+ result . completionScore ,
148+ )
149+ newAgentScores [ result . agentId ] . qualityScores . push (
150+ result . codeQualityScore ,
151+ )
152+ newAgentScores [ result . agentId ] . costs . push ( result . cost )
153+ newAgentScores [ result . agentId ] . durations . push ( result . durationMs )
154+ }
155+ }
156+
157+ // Replace agentScores with filtered version
// agentScores is a const binding, so it is emptied key by key and refilled in place
// instead of being reassigned.
158+ Object . keys ( agentScores ) . forEach ( ( key ) => delete agentScores [ key ] )
159+ Object . assign ( agentScores , newAgentScores )
95160 }
96161
97162 // Calculate averages and stats
// NOTE(review): the code that builds results from agentScores is elided from this view.
@@ -134,7 +199,7 @@ function analyzeBuffbenchLogs(logDirectory: string, filterBottom25 = false) {
134199 // Sort by average overall score descending
135200 results . sort ( ( a , b ) => b . averageOverallScore - a . averageOverallScore )
136201
// The return value changes from the bare results value to an object holding both the
// sorted results and the raw per-agent score arrays.
137- return results
202+ return { results, agentScores }
138203}
139204
140205// Main execution
// The log directory is taken from the first CLI argument (process.argv[2]) with a
// hard-coded dated path as fallback. Because the fallback uses ||, an empty-string
// argument also selects the default path.
@@ -143,9 +208,10 @@ const logDirectory = process.argv[2] || 'evals/buffbench/logs/2025-10-13T20-07'
143208console . log ( `Analyzing logs from: ${ logDirectory } \n` )
144209
// printTable: write a report to the console. Visible output: the title, a rule of 130
// equals signs, the table rows (elided from this view), a closing rule, the number of
// agents, and finally the raw overall scores of each agent.
// Parameters: data is the { results, agentScores } object returned by
// analyzeBuffbenchLogs; title is the heading printed first.
145210function printTable (
146- results : ReturnType < typeof analyzeBuffbenchLogs > ,
211+ data : ReturnType < typeof analyzeBuffbenchLogs > ,
147212 title : string ,
148213) {
214+ const { results, agentScores } = data
149215 console . log ( title )
150216 console . log ( '=' . repeat ( 130 ) )
151217 console . log (
@@ -177,6 +243,16 @@ function printTable(
177243
178244 console . log ( '=' . repeat ( 130 ) )
179245 console . log ( `Total agents analyzed: ${ results . length } ` )
246+
247+ // Print raw scores grouped by agent
248+ console . log ( '\n=== Raw Overall Scores by Agent ===' )
// Agents are listed in the order of results; each score is formatted with one decimal
// place and the scores are joined with a comma and a space.
// NOTE(review): assumes agentScores has an entry for every result.agentId; a missing
// entry would throw when .scores is read - confirm against how results is built.
249+ for ( const result of results ) {
250+ const scores = agentScores [ result . agentId ] . scores
251+ . map ( ( s ) => s . toFixed ( 1 ) )
252+ . join ( ', ' )
253+ console . log ( `\n${ result . agentId } :` )
254+ console . log ( ` ${ scores } ` )
255+ }
180256}
181257
182258const allResults = analyzeBuffbenchLogs ( logDirectory , false )
0 commit comments