Skip to content

Commit 6d4dcb8

Browse files
committed
evals: Add typecheck / tests for judge
1 parent 789550a commit 6d4dcb8

File tree

6 files changed

+91
-2
lines changed

6 files changed

+91
-2
lines changed

evals/buffbench/agent-runner.ts

Lines changed: 60 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,17 @@
11
import fs from 'fs'
22
import path from 'path'
33
import { execSync } from 'child_process'
4+
import { promisify } from 'util'
5+
import { exec } from 'child_process'
6+
7+
const execAsync = promisify(exec)
48

59
import { withTimeout } from '@codebuff/common/util/promise'
610
import { CodebuffClient } from '../../sdk/src/client'
711
import { withTestRepo } from '../subagents/test-repo-utils'
812

913
import type { PrintModeEvent } from '@codebuff/common/types/print-mode'
10-
import type { EvalCommitV2 } from './types'
14+
import type { EvalCommitV2, FinalCheckOutput } from './types'
1115

1216
export type AgentStep = PrintModeEvent
1317

@@ -22,6 +26,7 @@ export async function runAgentOnCommit({
2226
env,
2327
localAgentDefinitions,
2428
printEvents,
29+
finalCheckCommands,
2530
}: {
2631
client: CodebuffClient
2732
agentId: string
@@ -31,13 +36,15 @@ export async function runAgentOnCommit({
3136
env?: Record<string, string>
3237
localAgentDefinitions: any[]
3338
printEvents: boolean
39+
finalCheckCommands?: string[]
3440
}): Promise<{
3541
diff: string
3642
contextFiles: Record<string, string>
3743
durationMs: number
3844
cost: number
3945
error?: string
4046
trace: AgentStep[]
47+
finalCheckOutputs?: FinalCheckOutput[]
4148
}> {
4249
console.log(`[${commit.id}] Running agent ${agentId}...`)
4350
const startTime = Date.now()
@@ -46,6 +53,7 @@ export async function runAgentOnCommit({
4653
let error: string | undefined
4754
let cost = 0
4855
const trace: AgentStep[] = []
56+
let finalCheckOutputs: FinalCheckOutput[] | undefined
4957

5058
try {
5159
await withTestRepo(
@@ -140,6 +148,18 @@ export async function runAgentOnCommit({
140148
contextFiles[filePath] = ''
141149
}
142150
}
151+
152+
// Run final check commands if specified
153+
if (finalCheckCommands && finalCheckCommands.length > 0) {
154+
console.log(
155+
`[${commit.id}] Running ${finalCheckCommands.length} final check commands...`,
156+
)
157+
finalCheckOutputs = await runFinalCheckCommands(
158+
finalCheckCommands,
159+
repoDir,
160+
env,
161+
)
162+
}
143163
},
144164
)
145165
} catch (e) {
@@ -155,5 +175,44 @@ export async function runAgentOnCommit({
155175
cost,
156176
error,
157177
trace,
178+
finalCheckOutputs,
158179
}
159180
}
181+
182+
/**
 * Runs each final check command in order inside `cwd` and records its outcome.
 *
 * A failing command never aborts the loop: its output is captured and the
 * next command still runs, so the returned array has one entry per command,
 * in the same order as `commands`.
 *
 * @param commands Shell command strings to execute sequentially.
 * @param cwd Working directory the commands are executed in.
 * @param env Extra environment variables layered on top of `process.env`.
 * @returns One `FinalCheckOutput` per command; `exitCode` is always a number
 *   (0 on success, the process exit code on failure, or 1 when the failure
 *   did not come with a numeric exit code).
 */
async function runFinalCheckCommands(
  commands: string[],
  cwd: string,
  env?: Record<string, string>,
): Promise<FinalCheckOutput[]> {
  const results: FinalCheckOutput[] = []

  // Only string fields are forwarded; anything else collapses to ''.
  const asText = (value: unknown): string =>
    typeof value === 'string' ? value : ''

  for (const command of commands) {
    console.log(` Running: ${command}`)
    try {
      const { stdout, stderr } = await execAsync(command, {
        cwd,
        encoding: 'utf-8',
        maxBuffer: 10 * 1024 * 1024, // 10MB buffer
        env: { ...process.env, ...env },
      })
      results.push({
        command,
        exitCode: 0,
        stdout,
        stderr,
      })
      console.log(` ✓ Command succeeded: ${command}`)
    } catch (error: unknown) {
      // Command failed, but we still capture the output
      const details: Record<string, unknown> =
        typeof error === 'object' && error !== null
          ? (error as Record<string, unknown>)
          : {}

      // `code` is the numeric exit code for a normal non-zero exit, but it is
      // a string for spawn errors / maxBuffer overflow and null when the
      // process was killed by a signal. Only a real non-zero number is kept
      // so that `exitCode` honours its `number` type; everything else is 1.
      const reportedCode = details.code
      const exitCode =
        typeof reportedCode === 'number' && reportedCode !== 0
          ? reportedCode
          : 1

      results.push({
        command,
        exitCode,
        stdout: asText(details.stdout),
        // Fall back to the error message when the process produced no stderr.
        stderr: asText(details.stderr) || asText(details.message),
      })
      console.log(` ✗ Command failed (exit ${exitCode}): ${command}`)
    }
  }

  return results
}

evals/buffbench/eval-codebuff.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
}
1010
],
1111
"initCommand": "bun install",
12+
"finalCheckCommands": ["bun run typecheck", "bun run test"],
1213
"env": {
1314
"ANTHROPIC_API_KEY": "test-key",
1415
"ANTHROPIC_API_KEY2": "test-key-2",

evals/buffbench/judge.ts

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -125,6 +125,7 @@ interface JudgeCommitResultInput {
125125
contextFiles: Record<string, string>
126126
agentDiff: string
127127
error?: string
128+
finalCheckOutputs?: string
128129
}
129130

130131
export async function judgeCommitResult(
@@ -137,6 +138,7 @@ export async function judgeCommitResult(
137138
contextFiles,
138139
agentDiff,
139140
error,
141+
finalCheckOutputs,
140142
} = input
141143

142144
const groundTruthDiffs = groundTruthFileDiffs
@@ -164,7 +166,8 @@ ${groundTruthDiffs}
164166
\`\`\`diff
165167
${agentDiff || '(No changes made)'}
166168
\`\`\`
167-
${error ? `\n## Error Encountered\n${error}` : ''}`
169+
${error ? `\n## Error Encountered\n${error}` : ''}
170+
${finalCheckOutputs ? `\n## Final Check Command Outputs\n${finalCheckOutputs}` : ''}`
168171

169172
const agentOutput: string[] = []
170173
const judgeResult = await withTimeout(

evals/buffbench/run-buffbench.ts

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ async function runTask(options: {
3636
localAgentDefinitions: any[]
3737
extractLessons: boolean
3838
printEvents: boolean
39+
finalCheckCommands?: string[]
3940
}) {
4041
const {
4142
client,
@@ -51,6 +52,7 @@ async function runTask(options: {
5152
localAgentDefinitions,
5253
extractLessons,
5354
printEvents,
55+
finalCheckCommands,
5456
} = options
5557

5658
console.log(
@@ -70,6 +72,7 @@ async function runTask(options: {
7072
env,
7173
localAgentDefinitions,
7274
printEvents,
75+
finalCheckCommands,
7376
})
7477

7578
const judgeResult = await judgeCommitResult({
@@ -79,6 +82,14 @@ async function runTask(options: {
7982
contextFiles: agentResult.contextFiles,
8083
agentDiff: agentResult.diff,
8184
error: agentResult.error,
85+
finalCheckOutputs: agentResult.finalCheckOutputs
86+
? agentResult.finalCheckOutputs
87+
.map(
88+
(output) =>
89+
`### ${output.command}\n\`\`\`\n${output.stdout}${output.stderr ? '\nSTDERR:\n' + output.stderr : ''}\n\`\`\``,
90+
)
91+
.join('\n\n')
92+
: undefined,
8293
})
8394

8495
// Extract and append agent lessons
@@ -114,6 +125,7 @@ async function runTask(options: {
114125
cost: agentResult.cost,
115126
durationMs: agentResult.durationMs,
116127
error: agentResult.error,
128+
finalCheckOutputs: agentResult.finalCheckOutputs,
117129
}
118130

119131
// Save trace to logs directory
@@ -135,6 +147,7 @@ async function runTask(options: {
135147
durationMs: agentResult.durationMs,
136148
error: agentResult.error,
137149
timestamp: new Date().toISOString(),
150+
finalCheckOutputs: agentResult.finalCheckOutputs,
138151
})
139152

140153
fs.writeFileSync(
@@ -369,6 +382,7 @@ export async function runBuffBench(options: {
369382
localAgentDefinitions: analyzerContext.agentDefinitions,
370383
extractLessons,
371384
printEvents: agents.length === 1 && taskConcurrency === 1,
385+
finalCheckCommands: evalData.finalCheckCommands,
372386
}),
373387
),
374388
)

evals/buffbench/trace-analyzer.ts

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,8 @@ import { withTimeout } from '@codebuff/common/util/promise'
66
import { getErrorObject } from '@codebuff/common/util/error'
77
import { truncateTrace } from './trace-utils'
88

9+
import type { FinalCheckOutput } from './types'
10+
911
export interface AgentTraceData {
1012
agentId: string
1113
commitSha: string
@@ -17,6 +19,7 @@ export interface AgentTraceData {
1719
durationMs: number
1820
error?: string
1921
timestamp: string
22+
finalCheckOutputs?: FinalCheckOutput[]
2023
}
2124

2225
const traceAnalyzerAgent: AgentDefinition = {

evals/buffbench/types.ts

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,9 +51,17 @@ export interface EvalDataV2 {
5151
initCommand?: string
5252
binInstalls?: BinInstall[]
5353
env?: Record<string, string>
54+
finalCheckCommands?: string[]
5455
evalCommits: EvalCommitV2[]
5556
}
5657

58+
/** Captured result of running a single entry of `finalCheckCommands`. */
export interface FinalCheckOutput {
59+
/** The command string that was executed. */
command: string
60+
/** 0 when the command succeeded; on failure the reported error code, or 1 when none is available. */
exitCode: number
61+
/** Standard output captured from the command ('' when a failure carried none). */
stdout: string
62+
/** Standard error captured from the command; on failure it may hold the error message instead. */
stderr: string
63+
}
64+
5765
export interface EvalRun {
5866
commitSha: string
5967
prompt: string
@@ -62,6 +70,7 @@ export interface EvalRun {
6270
cost: number
6371
durationMs: number
6472
error?: string
73+
finalCheckOutputs?: FinalCheckOutput[]
6574
}
6675

6776
export interface AgentEvalResults {

0 commit comments

Comments
 (0)