Skip to content

Commit 76d0c58

Browse files
committed
evals: hook up credits used
1 parent ea22b75 commit 76d0c58

File tree

5 files changed

+42
-6
lines changed

5 files changed

+42
-6
lines changed

evals/git-evals/run-git-evals.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ export async function runSingleEval(
4545
const startTime = new Date()
4646
const trace: CodebuffTrace[] = []
4747
let error: string | undefined
48+
let totalCostUsd = 0
4849

4950
// Add process-level error handlers for this eval
5051
const originalUncaughtHandler = process.listeners('uncaughtException')
@@ -173,6 +174,8 @@ Explain your reasoning in detail.`,
173174
60_000 * 30,
174175
)
175176

177+
// Accumulate the cost (in USD) reported by this run
178+
totalCostUsd += codebuffResult.totalCostUsd
176179
trace.push({ prompt, steps: codebuffResult.steps })
177180
}
178181

@@ -223,19 +226,26 @@ Explain your reasoning in detail.`,
223226
error,
224227
gitDiff: fileStates,
225228
durationMs,
229+
costUsd: totalCostUsd,
226230
}
227231

228232
// Add judging results even for failed runs
229233
try {
230234
const judgingResults = await judgeEvalRun(evalRun)
231235
console.log('Judging results:', judgingResults)
236+
232237
return {
233238
...evalRun,
234239
judging_results: judgingResults,
240+
computed_metrics: {
241+
runtime_sec: durationMs / 1000,
242+
cost_usd: totalCostUsd,
243+
},
235244
}
236245
} catch (judgingError) {
237246
console.error('Error in judging:', judgingError)
238247
// If judging fails, return the run with fallback judging_results instead
248+
239249
return {
240250
...evalRun,
241251
judging_results: {
@@ -249,6 +259,10 @@ Explain your reasoning in detail.`,
249259
overallScore: 0,
250260
},
251261
},
262+
computed_metrics: {
263+
runtime_sec: durationMs / 1000,
264+
cost_usd: totalCostUsd,
265+
},
252266
}
253267
}
254268
}
@@ -591,6 +605,16 @@ export async function runGitEvals(
591605

592606
function calculateOverallMetrics(evalRuns: EvalRunJudged[]) {
593607
return {
608+
average_runtime_sec:
609+
evalRuns.reduce(
610+
(sum, run) => sum + (run.computed_metrics?.runtime_sec || 0),
611+
0,
612+
) / evalRuns.length,
613+
average_cost_usd:
614+
evalRuns.reduce(
615+
(sum, run) => sum + (run.computed_metrics?.cost_usd || 0),
616+
0,
617+
) / evalRuns.length,
594618
average_completion:
595619
evalRuns.reduce(
596620
(sum, run) => sum + (run.judging_results.metrics.completionScore || 0),

evals/git-evals/runners/claude.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export class ClaudeRunner implements Runner {
1313
this.cwd = cwd
1414
}
1515

16-
async run(prompt: string): Promise<{ steps: AgentStep[] }> {
16+
async run(prompt: string): ReturnType<Runner['run']> {
1717
const response: Query = query({
1818
prompt,
1919
options: {
@@ -27,6 +27,7 @@ export class ClaudeRunner implements Runner {
2727
let responseText = ''
2828
let toolCalls: AgentStep['toolCalls'] = []
2929
let toolResults: AgentStep['toolResults'] = []
30+
let totalCostUsd = 0
3031
function flushStep() {
3132
steps.push({ response: responseText, toolCalls, toolResults })
3233
responseText = ''
@@ -77,6 +78,7 @@ export class ClaudeRunner implements Runner {
7778
console.log(`\n\nSystem: ${JSON.stringify(chunk, null, 2)}`)
7879
} else if (chunk.type === 'result') {
7980
console.log(`\n\nResult: ${JSON.stringify(chunk, null, 2)}`)
81+
totalCostUsd += chunk.total_cost_usd
8082
} else {
8183
chunk satisfies never
8284
const chunkAny = chunk as any
@@ -88,6 +90,6 @@ export class ClaudeRunner implements Runner {
8890

8991
flushStep()
9092

91-
return { steps }
93+
return { steps, totalCostUsd }
9294
}
9395
}

evals/git-evals/runners/codebuff.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import path from 'path'
22

33
import { API_KEY_ENV_VAR } from '@codebuff/common/constants'
4+
import { MAX_AGENT_STEPS_DEFAULT } from '@codebuff/common/constants/agents'
45
import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
56
import { getUserCredentials } from '@codebuff/npm-app/credentials'
67

@@ -9,7 +10,6 @@ import { CodebuffClient } from '../../../sdk/src/index'
910
import type { Runner } from './runner'
1011
import type { RunState } from '../../../sdk/src/index'
1112
import type { AgentStep } from '../../scaffolding'
12-
import { MAX_AGENT_STEPS_DEFAULT } from '@codebuff/common/constants/agents'
1313

1414
const getLocalAuthToken = () => {
1515
return getUserCredentials()?.authToken
@@ -24,7 +24,7 @@ export class CodebuffRunner implements Runner {
2424
this.agent = agent ?? 'base'
2525
}
2626

27-
async run(prompt: string): Promise<{ steps: AgentStep[] }> {
27+
async run(prompt: string): ReturnType<Runner['run']> {
2828
const steps: AgentStep[] = []
2929
let responseText = ''
3030
let toolCalls: AgentStep['toolCalls'] = []
@@ -92,6 +92,9 @@ export class CodebuffRunner implements Runner {
9292

9393
client.closeConnection()
9494

95-
return { steps }
95+
return {
96+
steps,
97+
totalCostUsd: this.runState.sessionState.mainAgentState.creditsUsed / 100,
98+
}
9699
}
97100
}

evals/git-evals/runners/runner.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import type { AgentStep } from 'scaffolding'
22

33
export type Runner = {
4-
run: (prompt: string) => Promise<{ steps: AgentStep[] }>
4+
run: (prompt: string) => Promise<{ steps: AgentStep[]; totalCostUsd: number }>
55
}

evals/git-evals/types.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,17 +45,24 @@ export interface EvalRunLog {
4545
error?: string
4646
gitDiff: string
4747
durationMs: number
48+
costUsd: number
4849
}
4950

5051
export interface EvalRunJudged extends EvalRunLog {
5152
judging_results: z.infer<typeof JudgingAnalysisSchema>
53+
computed_metrics: {
54+
runtime_sec: number
55+
cost_usd: number
56+
}
5257
}
5358

5459
export interface FullEvalLog {
5560
test_repo_name: string
5661
generation_date: string
5762
eval_runs: EvalRunJudged[]
5863
overall_metrics: {
64+
average_runtime_sec: number
65+
average_cost_usd: number
5966
average_completion: number
6067
average_efficiency: number
6168
average_code_quality: number

0 commit comments

Comments
 (0)