evals: hook up credits used

charleslien · charleslien · commit 76d0c582c703 · 2025-09-02T16:10:13.000-07:00
diff --git a/evals/git-evals/run-git-evals.ts b/evals/git-evals/run-git-evals.ts
@@ -45,6 +45,7 @@ export async function runSingleEval(
   const startTime = new Date()
   const trace: CodebuffTrace[] = []
   let error: string | undefined
+  let totalCostUsd = 0
 
   // Add process-level error handlers for this eval
   const originalUncaughtHandler = process.listeners('uncaughtException')
@@ -173,6 +174,8 @@ Explain your reasoning in detail.`,
           60_000 * 30,
         )
 
+        // Add this result's cost (in USD) to the running total for the eval
+        totalCostUsd += codebuffResult.totalCostUsd
         trace.push({ prompt, steps: codebuffResult.steps })
       }
 
@@ -223,19 +226,26 @@ Explain your reasoning in detail.`,
     error,
     gitDiff: fileStates,
     durationMs,
+    costUsd: totalCostUsd,
   }
 
   // Add judging results even for failed runs
   try {
     const judgingResults = await judgeEvalRun(evalRun)
     console.log('Judging results:', judgingResults)
+
     return {
       ...evalRun,
       judging_results: judgingResults,
+      computed_metrics: {
+        runtime_sec: durationMs / 1000,
+        cost_usd: totalCostUsd,
+      },
     }
   } catch (judgingError) {
     console.error('Error in judging:', judgingError)
     // Return without judging results if judging fails
+
     return {
       ...evalRun,
       judging_results: {
@@ -249,6 +259,10 @@ Explain your reasoning in detail.`,
           overallScore: 0,
         },
       },
+      computed_metrics: {
+        runtime_sec: durationMs / 1000,
+        cost_usd: totalCostUsd,
+      },
     }
   }
 }
@@ -591,6 +605,16 @@ export async function runGitEvals(
 
 function calculateOverallMetrics(evalRuns: EvalRunJudged[]) {
   return {
+    average_runtime_sec:
+      evalRuns.reduce(
+        (sum, run) => sum + (run.computed_metrics?.runtime_sec || 0),
+        0,
+      ) / evalRuns.length,
+    average_cost_usd:
+      evalRuns.reduce(
+        (sum, run) => sum + (run.computed_metrics?.cost_usd || 0),
+        0,
+      ) / evalRuns.length,
     average_completion:
       evalRuns.reduce(
         (sum, run) => sum + (run.judging_results.metrics.completionScore || 0),
diff --git a/evals/git-evals/runners/claude.ts b/evals/git-evals/runners/claude.ts
@@ -13,7 +13,7 @@ export class ClaudeRunner implements Runner {
     this.cwd = cwd
   }
 
-  async run(prompt: string): Promise<{ steps: AgentStep[] }> {
+  async run(prompt: string): ReturnType<Runner['run']> {
     const response: Query = query({
       prompt,
       options: {
@@ -27,6 +27,7 @@ export class ClaudeRunner implements Runner {
     let responseText = ''
     let toolCalls: AgentStep['toolCalls'] = []
     let toolResults: AgentStep['toolResults'] = []
+    let totalCostUsd = 0
     function flushStep() {
       steps.push({ response: responseText, toolCalls, toolResults })
       responseText = ''
@@ -77,6 +78,7 @@ export class ClaudeRunner implements Runner {
         console.log(`\n\nSystem: ${JSON.stringify(chunk, null, 2)}`)
       } else if (chunk.type === 'result') {
         console.log(`\n\nResult: ${JSON.stringify(chunk, null, 2)}`)
+        totalCostUsd += chunk.total_cost_usd
       } else {
         chunk satisfies never
         const chunkAny = chunk as any
@@ -88,6 +90,6 @@ export class ClaudeRunner implements Runner {
 
     flushStep()
 
-    return { steps }
+    return { steps, totalCostUsd }
   }
 }
diff --git a/evals/git-evals/runners/codebuff.ts b/evals/git-evals/runners/codebuff.ts
@@ -1,6 +1,7 @@
 import path from 'path'
 
 import { API_KEY_ENV_VAR } from '@codebuff/common/constants'
+import { MAX_AGENT_STEPS_DEFAULT } from '@codebuff/common/constants/agents'
 import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
 import { getUserCredentials } from '@codebuff/npm-app/credentials'
 
@@ -9,7 +10,6 @@ import { CodebuffClient } from '../../../sdk/src/index'
 import type { Runner } from './runner'
 import type { RunState } from '../../../sdk/src/index'
 import type { AgentStep } from '../../scaffolding'
-import { MAX_AGENT_STEPS_DEFAULT } from '@codebuff/common/constants/agents'
 
 const getLocalAuthToken = () => {
   return getUserCredentials()?.authToken
@@ -24,7 +24,7 @@ export class CodebuffRunner implements Runner {
     this.agent = agent ?? 'base'
   }
 
-  async run(prompt: string): Promise<{ steps: AgentStep[] }> {
+  async run(prompt: string): ReturnType<Runner['run']> {
     const steps: AgentStep[] = []
     let responseText = ''
     let toolCalls: AgentStep['toolCalls'] = []
@@ -92,6 +92,9 @@ export class CodebuffRunner implements Runner {
 
     client.closeConnection()
 
-    return { steps }
+    return {
+      steps,
+      totalCostUsd: this.runState.sessionState.mainAgentState.creditsUsed / 100,
+    }
   }
 }
diff --git a/evals/git-evals/runners/runner.ts b/evals/git-evals/runners/runner.ts
@@ -1,5 +1,5 @@
 import type { AgentStep } from 'scaffolding'
 
 export type Runner = {
-  run: (prompt: string) => Promise<{ steps: AgentStep[] }>
+  run: (prompt: string) => Promise<{ steps: AgentStep[]; totalCostUsd: number }>
 }
diff --git a/evals/git-evals/types.ts b/evals/git-evals/types.ts
@@ -45,17 +45,24 @@ export interface EvalRunLog {
   error?: string
   gitDiff: string
   durationMs: number
+  costUsd: number
 }
 
 export interface EvalRunJudged extends EvalRunLog {
   judging_results: z.infer<typeof JudgingAnalysisSchema>
+  computed_metrics: {
+    runtime_sec: number
+    cost_usd: number
+  }
 }
 
 export interface FullEvalLog {
   test_repo_name: string
   generation_date: string
   eval_runs: EvalRunJudged[]
   overall_metrics: {
+    average_runtime_sec: number
+    average_cost_usd: number
     average_completion: number
     average_efficiency: number
     average_code_quality: number

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`import type { AgentStep } from 'scaffolding'`
`2`	`2`
`3`	`3`	`export type Runner = {`
`4`		`- run: (prompt: string) => Promise<{ steps: AgentStep[] }>`
	`4`	`+ run: (prompt: string) => Promise<{ steps: AgentStep[]; totalCostUsd: number }>`
`5`	`5`	`}`