Skip to content

Commit 7f02fc8

Browse files
committed
Factor codebuff out into an agent runner
1 parent 60da5fb commit 7f02fc8

File tree

6 files changed

+146
-80
lines changed

6 files changed

+146
-80
lines changed

evals/buffbench/agent-runner.ts

Lines changed: 25 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import fs from 'fs'
2-
import path from 'path'
31
import { execSync } from 'child_process'
42
import { promisify } from 'util'
53
import { exec } from 'child_process'
@@ -11,17 +9,15 @@ import { CodebuffClient } from '@codebuff/sdk'
119
import { withTestRepo } from '../subagents/test-repo-utils'
1210
import { ClaudeRunner } from './runners/claude'
1311
import { CodexRunner } from './runners/codex'
12+
import { CodebuffRunner } from './runners/codebuff'
1413

15-
import type { PrintModeEvent } from '@codebuff/common/types/print-mode'
1614
import type { EvalCommitV2, FinalCheckOutput } from './types'
17-
import type { Runner } from './runners/runner'
15+
import type { Runner, AgentStep } from './runners/runner'
1816

19-
export type AgentStep = PrintModeEvent
17+
export type { AgentStep }
2018

2119
export type ExternalAgentType = 'claude' | 'codex'
2220

23-
const DEBUG_ERROR = true
24-
2521
export async function runAgentOnCommit({
2622
client,
2723
agentId,
@@ -73,78 +69,34 @@ export async function runAgentOnCommit({
7369
env,
7470
},
7571
async (repoDir) => {
76-
// Use external CLI runner if specified
77-
if (externalAgentType) {
78-
const runner: Runner =
79-
externalAgentType === 'claude'
80-
? new ClaudeRunner(repoDir, env)
81-
: new CodexRunner(repoDir, env)
82-
83-
console.log(
84-
`[${commit.id}] Running external agent: ${externalAgentType}`,
85-
)
86-
87-
const result = await runner.run(commit.prompt)
88-
trace.push(...result.steps)
89-
cost = result.totalCostUsd
90-
diff = result.diff
72+
// Select the appropriate runner
73+
let runner: Runner
74+
if (externalAgentType === 'claude') {
75+
runner = new ClaudeRunner(repoDir, env)
76+
} else if (externalAgentType === 'codex') {
77+
runner = new CodexRunner(repoDir, env)
9178
} else {
92-
// Use Codebuff client
93-
const maxAgentSteps = 40
94-
const result = await client.run({
95-
agent: agentId,
96-
prompt: commit.prompt,
97-
agentDefinitions: localAgentDefinitions,
79+
runner = new CodebuffRunner({
9880
cwd: repoDir,
9981
env,
100-
maxAgentSteps,
101-
handleEvent: (event) => {
102-
if (
103-
(event.type === 'tool_call' || event.type === 'tool_result') &&
104-
event.toolName === 'set_messages'
105-
) {
106-
return
107-
}
108-
if (event.type === 'error') {
109-
console.error(
110-
`[${commit.id}:${agentId}] Error event:`,
111-
event.message,
112-
)
113-
if (DEBUG_ERROR && !event.message.startsWith('Invalid JSON')) {
114-
// Save errors in a file, but not tool calls with invalid json.
115-
fs.writeFileSync(
116-
path.join(
117-
__dirname,
118-
`${commit.id}-${agentId}-error-${Math.random().toString(36).substring(2, 6)}.json`,
119-
),
120-
JSON.stringify(
121-
{
122-
error: event.message,
123-
trace: trace,
124-
},
125-
null,
126-
2,
127-
),
128-
)
129-
}
130-
} else if (printEvents) {
131-
console.log(
132-
`[${commit.id}:${agentId}]`,
133-
JSON.stringify(event, null, 2),
134-
)
135-
}
136-
trace.push(event)
137-
},
138-
})
139-
cost = (result.sessionState?.mainAgentState.creditsUsed ?? 0) / 100
140-
141-
execSync('git add .', { cwd: repoDir, stdio: 'ignore' })
142-
diff = execSync(`git diff ${commit.parentSha}`, {
143-
cwd: repoDir,
144-
encoding: 'utf-8',
82+
client,
83+
agentId,
84+
localAgentDefinitions,
85+
printEvents,
86+
commitId: commit.id,
87+
parentSha: commit.parentSha,
14588
})
14689
}
14790

91+
console.log(
92+
`[${commit.id}] Running agent: ${externalAgentType || 'codebuff'}`,
93+
)
94+
95+
const result = await runner.run(commit.prompt)
96+
trace.push(...result.steps)
97+
cost = result.totalCostUsd
98+
diff = result.diff
99+
148100
const contextFilePaths = new Set<string>([
149101
...commit.supplementalFiles,
150102
...commit.fileDiffs.map((fd) => fd.path),

evals/buffbench/main-single-eval.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import { runBuffBench } from './run-buffbench'
55
async function main() {
66
await runBuffBench({
77
evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
8-
agents: ['external:claude'],
8+
agents: ['base2'],
99
taskIds: ['filter-system-history'],
1010
})
1111

evals/buffbench/runners/claude.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import { execSync, spawn } from 'child_process'
22

3-
import type { Runner, RunnerResult } from './runner'
4-
import type { AgentStep } from '../agent-runner'
3+
import type { Runner, RunnerResult, AgentStep } from './runner'
54
import type {
65
PrintModeToolCall,
76
PrintModeToolResult,

evals/buffbench/runners/codebuff.ts

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import fs from 'fs'
2+
import path from 'path'
3+
import { execSync } from 'child_process'
4+
5+
import { CodebuffClient } from '@codebuff/sdk'
6+
7+
import type { Runner, RunnerResult, AgentStep } from './runner'
8+
9+
// When true, non-"Invalid JSON" error events are dumped to a JSON file for inspection.
const DEBUG_ERROR = true

/**
 * Runner that executes a Codebuff agent through the SDK client and reports
 * the outcome in the common `RunnerResult` shape (trace steps, cost, diff).
 */
export class CodebuffRunner implements Runner {
  private readonly cwd: string
  private readonly env?: Record<string, string>
  private readonly client: CodebuffClient
  private readonly agentId: string
  private readonly localAgentDefinitions: any[]
  private readonly printEvents: boolean
  private readonly commitId: string
  private readonly parentSha: string

  /**
   * @param options.cwd Directory the agent runs in and where git commands are executed.
   * @param options.env Optional environment passed through to the client run.
   * @param options.client Codebuff SDK client used to run the agent.
   * @param options.agentId Agent to run; also used in log prefixes and error-dump file names.
   * @param options.localAgentDefinitions Forwarded to the client as `agentDefinitions`.
   * @param options.printEvents When true, every non-error event is pretty-printed to stdout.
   * @param options.commitId Used in log prefixes and error-dump file names.
   * @param options.parentSha Revision the final diff is computed against.
   */
  constructor(options: {
    cwd: string
    env?: Record<string, string>
    client: CodebuffClient
    agentId: string
    localAgentDefinitions: any[]
    printEvents: boolean
    commitId: string
    parentSha: string
  }) {
    this.cwd = options.cwd
    this.env = options.env
    this.client = options.client
    this.agentId = options.agentId
    this.localAgentDefinitions = options.localAgentDefinitions
    this.printEvents = options.printEvents
    this.commitId = options.commitId
    this.parentSha = options.parentSha
  }

  /**
   * Runs the agent on `prompt`, recording every emitted event (except
   * `set_messages` tool calls/results) as a trace step.
   *
   * @param prompt Prompt forwarded to the agent.
   * @returns The recorded steps, the cost derived from the credits reported in
   *   the session state (credits / 100), and the git diff against `parentSha`
   *   (empty string if the diff could not be collected).
   */
  async run(prompt: string): Promise<RunnerResult> {
    const steps: AgentStep[] = []
    const maxAgentSteps = 40
    const logPrefix = `[${this.commitId}:${this.agentId}]`

    const result = await this.client.run({
      agent: this.agentId,
      prompt,
      agentDefinitions: this.localAgentDefinitions,
      cwd: this.cwd,
      env: this.env,
      maxAgentSteps,
      handleEvent: (event) => {
        // set_messages tool calls/results are deliberately left out of the trace.
        if (
          (event.type === 'tool_call' || event.type === 'tool_result') &&
          event.toolName === 'set_messages'
        ) {
          return
        }
        if (event.type === 'error') {
          console.error(`${logPrefix} Error event:`, event.message)
          if (DEBUG_ERROR && !event.message.startsWith('Invalid JSON')) {
            // Save errors in a file, but not tool calls with invalid json.
            this.dumpError(event.message, steps)
          }
        } else if (this.printEvents) {
          console.log(logPrefix, JSON.stringify(event, null, 2))
        }
        steps.push(event)
      },
    })

    const totalCostUsd =
      (result.sessionState?.mainAgentState.creditsUsed ?? 0) / 100

    return {
      steps,
      totalCostUsd,
      diff: this.collectDiff(),
    }
  }

  /**
   * Writes the error message and the trace recorded so far to a randomly
   * suffixed JSON file in the parent of this module's directory.
   * The dump is diagnostic only, so a failed write is logged rather than thrown.
   */
  private dumpError(message: string, trace: AgentStep[]): void {
    const suffix = Math.random().toString(36).substring(2, 6)
    const filePath = path.join(
      __dirname,
      '..',
      `${this.commitId}-${this.agentId}-error-${suffix}.json`,
    )
    try {
      fs.writeFileSync(
        filePath,
        JSON.stringify({ error: message, trace }, null, 2),
      )
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error)
      console.warn(
        `[${this.commitId}:${this.agentId}] Could not write error dump ${filePath}:`,
        reason,
      )
    }
  }

  /**
   * Stages everything in `cwd` and returns `git diff <parentSha>`.
   * Staging first makes newly created (previously untracked) files show up in
   * the diff. Collection is best-effort: on any git failure a warning is logged
   * and an empty diff is returned.
   */
  private collectDiff(): string {
    try {
      execSync('git add .', { cwd: this.cwd, stdio: 'ignore' })
      return execSync(`git diff ${this.parentSha}`, {
        cwd: this.cwd,
        encoding: 'utf-8',
        maxBuffer: 10 * 1024 * 1024,
      })
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error)
      console.warn(
        `[${this.commitId}:${this.agentId}] Failed to collect git diff:`,
        reason,
      )
      return ''
    }
  }
}

evals/buffbench/runners/codex.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import { execSync, spawn } from 'child_process'
22

3-
import type { Runner, RunnerResult } from './runner'
4-
import type { AgentStep } from '../agent-runner'
3+
import type { Runner, RunnerResult, AgentStep } from './runner'
54

65
export class CodexRunner implements Runner {
76
private cwd: string

evals/buffbench/runners/runner.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1-
import type { AgentStep } from '../agent-runner'
1+
import type { PrintModeEvent } from '@codebuff/common/types/print-mode'
2+
3+
export type AgentStep = PrintModeEvent
24

35
export type RunnerResult = {
46
steps: AgentStep[]
57
totalCostUsd: number
68
diff: string
79
}
810

9-
export type Runner = {
11+
export interface Runner {
1012
run: (prompt: string) => Promise<RunnerResult>
1113
}

0 commit comments

Comments
 (0)