update script to analyze buffbench logs

jahooma · jahooma · commit be0808c3ee5f · 2025-10-30T16:34:34.000-07:00
diff --git a/scripts/analyze-buffbench-logs.ts b/scripts/analyze-buffbench-logs.ts
@@ -2,15 +2,6 @@
 import { readdirSync, readFileSync } from 'fs'
 import { join } from 'path'
 
-interface JudgingResult {
-  analysis: string
-  strengths: string[]
-  weaknesses: string[]
-  completionScore: number
-  codeQualityScore: number
-  overallScore: number
-}
-
 interface AgentResult {
   agentId: string
   analysis: string
@@ -33,6 +24,32 @@ function analyzeBuffbenchLogs(logDirectory: string, filterBottom25 = false) {
   const files = readdirSync(logDirectory)
   const analysisFiles = files.filter((f) => f.includes('ANALYSIS'))
 
+  // First pass: collect all data and identify tasks to exclude
+  const taskData: Record<string, AnalysisFile> = {}
+  const tasksToExclude = new Set<string>()
+
+  for (const file of analysisFiles) {
+    const filePath = join(logDirectory, file)
+    const content = readFileSync(filePath, 'utf-8')
+    const data: AnalysisFile = JSON.parse(content)
+    const taskKey = data.commitSha
+
+    taskData[taskKey] = data
+
+    // Check if any agent in this task has zero or null overall score
+    const hasZeroOrNullScore = data.results.some(
+      (result) =>
+        result.overallScore === 0 ||
+        result.overallScore === null ||
+        result.overallScore === undefined,
+    )
+
+    if (hasZeroOrNullScore) {
+      tasksToExclude.add(taskKey)
+    }
+  }
+
+  // Second pass: build agent scores excluding problematic tasks
   const agentScores: Record<
     string,
     {
@@ -44,10 +61,11 @@ function analyzeBuffbenchLogs(logDirectory: string, filterBottom25 = false) {
     }
   > = {}
 
-  for (const file of analysisFiles) {
-    const filePath = join(logDirectory, file)
-    const content = readFileSync(filePath, 'utf-8')
-    const data: AnalysisFile = JSON.parse(content)
+  for (const [taskKey, data] of Object.entries(taskData)) {
+    // Skip tasks where any agent had zero/null score
+    if (tasksToExclude.has(taskKey)) {
+      continue
+    }
 
     for (const result of data.results) {
       if (!agentScores[result.agentId]) {
@@ -68,30 +86,77 @@ function analyzeBuffbenchLogs(logDirectory: string, filterBottom25 = false) {
     }
   }
 
+  if (tasksToExclude.size > 0) {
+    console.log(
+      `\nNote: Excluded ${tasksToExclude.size} task(s) where at least one agent had zero/null overall score\n`,
+    )
+  }
+
   // Filter bottom 25% if requested
   if (filterBottom25) {
-    for (const agentId in agentScores) {
-      const data = agentScores[agentId]
-      // Sort scores to find the 25th percentile
-      const sortedScores = [...data.scores].sort((a, b) => a - b)
-      const cutoffIndex = Math.floor(sortedScores.length * 0.25)
-      const cutoffScore = sortedScores[cutoffIndex]
-
-      // Filter out tasks below the cutoff
-      const filteredIndices = data.scores
-        .map((score, idx) => (score >= cutoffScore ? idx : -1))
-        .filter((idx) => idx !== -1)
-
-      agentScores[agentId] = {
-        scores: filteredIndices.map((idx) => data.scores[idx]),
-        completionScores: filteredIndices.map(
-          (idx) => data.completionScores[idx],
-        ),
-        qualityScores: filteredIndices.map((idx) => data.qualityScores[idx]),
-        costs: filteredIndices.map((idx) => data.costs[idx]),
-        durations: filteredIndices.map((idx) => data.durations[idx]),
+    // Calculate a global cutoff based on all agents' scores combined
+    const allScores: Array<{ score: number; taskKey: string }> = []
+
+    // Collect all scores with their task identifiers
+    for (const [taskKey, data] of Object.entries(taskData)) {
+      if (tasksToExclude.has(taskKey)) continue
+
+      for (const result of data.results) {
+        allScores.push({ score: result.overallScore, taskKey })
+      }
+    }
+
+    // Sort by score and find the 25th percentile cutoff
+    allScores.sort((a, b) => a.score - b.score)
+    const cutoffIndex = Math.floor(allScores.length * 0.25)
+    const cutoffScore = allScores[cutoffIndex]?.score ?? 0
+
+    // Identify tasks where ANY agent scored below the cutoff
+    const tasksToExcludeForBottom25 = new Set<string>()
+    for (const [taskKey, data] of Object.entries(taskData)) {
+      if (tasksToExclude.has(taskKey)) continue
+
+      const hasLowScore = data.results.some(
+        (result) => result.overallScore < cutoffScore,
+      )
+      if (hasLowScore) {
+        tasksToExcludeForBottom25.add(taskKey)
       }
     }
+
+    // Rebuild agentScores excluding bottom 25% tasks
+    const newAgentScores: typeof agentScores = {}
+
+    for (const [taskKey, data] of Object.entries(taskData)) {
+      if (tasksToExclude.has(taskKey)) continue
+      if (tasksToExcludeForBottom25.has(taskKey)) continue
+
+      for (const result of data.results) {
+        if (!newAgentScores[result.agentId]) {
+          newAgentScores[result.agentId] = {
+            scores: [],
+            completionScores: [],
+            qualityScores: [],
+            costs: [],
+            durations: [],
+          }
+        }
+
+        newAgentScores[result.agentId].scores.push(result.overallScore)
+        newAgentScores[result.agentId].completionScores.push(
+          result.completionScore,
+        )
+        newAgentScores[result.agentId].qualityScores.push(
+          result.codeQualityScore,
+        )
+        newAgentScores[result.agentId].costs.push(result.cost)
+        newAgentScores[result.agentId].durations.push(result.durationMs)
+      }
+    }
+
+    // Replace agentScores with filtered version
+    Object.keys(agentScores).forEach((key) => delete agentScores[key])
+    Object.assign(agentScores, newAgentScores)
   }
 
   // Calculate averages and stats
@@ -134,7 +199,7 @@ function analyzeBuffbenchLogs(logDirectory: string, filterBottom25 = false) {
   // Sort by average overall score descending
   results.sort((a, b) => b.averageOverallScore - a.averageOverallScore)
 
-  return results
+  return { results, agentScores }
 }
 
 // Main execution
@@ -143,9 +208,10 @@ const logDirectory = process.argv[2] || 'evals/buffbench/logs/2025-10-13T20-07'
 console.log(`Analyzing logs from: ${logDirectory}\n`)
 
 function printTable(
-  results: ReturnType<typeof analyzeBuffbenchLogs>,
+  data: ReturnType<typeof analyzeBuffbenchLogs>,
   title: string,
 ) {
+  const { results, agentScores } = data
   console.log(title)
   console.log('='.repeat(130))
   console.log(
@@ -177,6 +243,16 @@ function printTable(
 
   console.log('='.repeat(130))
   console.log(`Total agents analyzed: ${results.length}`)
+
+  // Print raw scores grouped by agent
+  console.log('\n=== Raw Overall Scores by Agent ===')
+  for (const result of results) {
+    const scores = agentScores[result.agentId].scores
+      .map((s) => s.toFixed(1))
+      .join(', ')
+    console.log(`\n${result.agentId}:`)
+    console.log(`  ${scores}`)
+  }
 }
 
 const allResults = analyzeBuffbenchLogs(logDirectory, false)