Skip to content

Commit 033c02d

Browse files
committed
eval-planner: refactor
1 parent adb75a8 commit 033c02d

File tree

2 files changed

+170
-112
lines changed

2 files changed

+170
-112
lines changed

evals/subagents/eval-planner.ts

Lines changed: 99 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
import { execSync } from 'child_process'
21
import * as fs from 'fs'
3-
import * as os from 'os'
42
import * as path from 'path'
53
import { createTwoFilesPatch } from 'diff'
64

@@ -9,76 +7,12 @@ import { AgentDefinition } from '../../sdk/src'
97
import { getUserCredentials } from '@codebuff/npm-app/credentials'
108
import { API_KEY_ENV_VAR } from '@codebuff/common/old-constants'
119
import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
12-
13-
/**
14-
* Helper function to manage test repository lifecycle
15-
* Sets up a test repo, runs a function with the repo cwd, then cleans up
16-
*/
17-
export const withTestRepo = async <T>(
18-
repoConfig: {
19-
repoUrl: string
20-
commitSha: string
21-
initCommand?: string
22-
checkoutPrevious?: boolean
23-
},
24-
fn: (cwd: string) => Promise<T>,
25-
): Promise<T> => {
26-
const { repoUrl, commitSha, initCommand, checkoutPrevious } = repoConfig
27-
28-
// Create a temporary directory for the test repo
29-
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codebuff-eval-'))
30-
const repoDir = path.join(tempDir, 'repo')
31-
32-
try {
33-
// Clone the repository
34-
console.log(`Cloning repository ${repoUrl} to ${repoDir}...`)
35-
execSync(`git clone ${repoUrl} ${repoDir}`, { stdio: 'ignore' })
36-
37-
// Checkout the specific commit or the previous commit
38-
if (checkoutPrevious) {
39-
const previousCommitSha = getPreviousCommitSha(commitSha, repoDir)
40-
console.log(`Checking out previous commit ${previousCommitSha}...`)
41-
execSync(`git checkout ${previousCommitSha}`, {
42-
cwd: repoDir,
43-
stdio: 'ignore',
44-
})
45-
} else {
46-
console.log(`Checking out commit ${commitSha}...`)
47-
execSync(`git checkout ${commitSha}`, { cwd: repoDir, stdio: 'ignore' })
48-
}
49-
50-
// Run initialization command if provided
51-
if (initCommand) {
52-
console.log(`Running init command: ${initCommand}...`)
53-
execSync(initCommand, { cwd: repoDir, stdio: 'ignore' })
54-
}
55-
56-
// Run the provided function with the repo directory
57-
return await fn(repoDir)
58-
} finally {
59-
// Clean up the temporary directory
60-
console.log(`Cleaning up temporary directory ${tempDir}...`)
61-
try {
62-
fs.rmSync(tempDir, { recursive: true, force: true })
63-
} catch (error) {
64-
console.warn(`Failed to clean up temporary directory: ${error}`)
65-
}
66-
}
67-
}
68-
69-
/**
70-
* Gets the previous commit SHA (parent) of a given commit
71-
*/
72-
const getPreviousCommitSha = (commitSha: string, repoDir: string): string => {
73-
const previousSha = execSync(`git rev-parse ${commitSha}^`, {
74-
cwd: repoDir,
75-
encoding: 'utf-8',
76-
}).trim()
77-
return previousSha
78-
}
10+
import { withTestRepo } from './test-repo-utils'
7911

8012
export const evalPlannerAgent = async (params: {
13+
client: CodebuffClient
8114
agentId: string
15+
agentDefinitions: Array<AgentDefinition>
8216
spec: string
8317
repoUrl: string
8418
commitSha: string
@@ -89,21 +23,16 @@ export const evalPlannerAgent = async (params: {
8923
postContent: string
9024
}>
9125
}) => {
92-
const { agentId, spec, repoUrl, commitSha, initCommand, fileStates } = params
93-
const getLocalAuthToken = () => {
94-
return getUserCredentials()?.authToken
95-
}
96-
const client = new CodebuffClient({
97-
apiKey: process.env[API_KEY_ENV_VAR] || getLocalAuthToken(),
98-
})
99-
100-
const agentsPath = path.join(__dirname, '../../.agents')
101-
const localAgentDefinitions = Object.values(
102-
await loadLocalAgents({
103-
agentsPath,
104-
}),
105-
)
106-
26+
const {
27+
client,
28+
agentId,
29+
agentDefinitions,
30+
spec,
31+
repoUrl,
32+
commitSha,
33+
initCommand,
34+
fileStates,
35+
} = params
10736
const result = await withTestRepo(
10837
{ repoUrl, commitSha, initCommand, checkoutPrevious: true },
10938
async (cwd) => {
@@ -113,7 +42,7 @@ export const evalPlannerAgent = async (params: {
11342
agent: agentId,
11443
prompt: `Please plan a full implementation of the following spec: ${spec}`,
11544
cwd,
116-
agentDefinitions: localAgentDefinitions,
45+
agentDefinitions,
11746
handleEvent: (event) => {
11847
console.log(agentId, JSON.stringify(event, null, 2))
11948
},
@@ -178,11 +107,13 @@ ${outputString}
178107
## Your Task
179108
180109
Evaluate how well the implementation plan matches the real commit changes. Consider:
181-
- Coverage of key changes from the commit
110+
- Coverage of changes from the commit
182111
- Appropriateness and correctness of proposed code changes
183112
- Whether following the plan would achieve the same (or better) behavior
184-
- Any missing critical changes
185-
- Any unnecessary proposed changes`
113+
- Any unnecessary proposed changes
114+
- Simplicity and clarity of the plan
115+
- Efficiency of the plan: reuse existing code, touch as few files as possible
116+
`
186117

187118
const judgeResult = await client.run({
188119
agent: 'eval-judge',
@@ -196,7 +127,12 @@ Evaluate how well the implementation plan matches the real commit changes. Consi
196127
throw new Error('Error running judge agent')
197128
}
198129
const { output: judgeOutput } = judgeResult
199-
const judgingResults = judgeOutput.value ?? {}
130+
const judgingResults = (judgeOutput.value ?? {}) as {
131+
reasoning: string
132+
pros: string
133+
cons: string
134+
overallScore: number
135+
}
200136

201137
return { judgingResults, agentOutput: outputString }
202138
}
@@ -242,7 +178,11 @@ Grade how well the implementation plan matches the actual implementation. The pl
242178
- **Correctness**: Are the proposed code changes appropriate and accurate?
243179
- **Behavioral equivalence**: Would following the plan achieve the same outcome?
244180
- **Completeness**: Are any critical changes missing?
245-
- **Efficiency**: Does it avoid unnecessary changes?`,
181+
- **Efficiency**: Does it avoid unnecessary changes?
182+
- **Simplicity**: Is the plan simple and easy to understand?
183+
184+
You should be harsh if the plan makes superfluous changes, fails to reuse existing code, or is otherwise not as simple as it could be.
185+
`,
246186
}
247187

248188
type EvalData = {
@@ -271,6 +211,25 @@ async function main() {
271211

272212
const { repoUrl, initCommand, evalCommits } = evalData
273213

214+
const client = new CodebuffClient({
215+
apiKey: process.env[API_KEY_ENV_VAR] || getUserCredentials()?.authToken,
216+
})
217+
218+
const agentsPath = path.join(__dirname, '../../.agents')
219+
const localAgentDefinitions = Object.values(
220+
await loadLocalAgents({
221+
agentsPath,
222+
}),
223+
)
224+
225+
// Track statistics
226+
const stats = {
227+
total: evalCommits.length,
228+
completed: 0,
229+
failed: 0,
230+
scores: [] as number[],
231+
}
232+
274233
// Loop through each eval task
275234
for (const evalCommit of evalCommits) {
276235
const { sha, spec, fileStates } = evalCommit
@@ -280,7 +239,9 @@ async function main() {
280239

281240
try {
282241
const result = await evalPlannerAgent({
242+
client,
283243
agentId: 'implementation-planner',
244+
agentDefinitions: localAgentDefinitions,
284245
spec,
285246
repoUrl,
286247
commitSha: sha,
@@ -298,42 +259,68 @@ async function main() {
298259
console.log('📊 EVALUATION RESULTS')
299260
console.log('─'.repeat(80))
300261

301-
if (reasoning) {
302-
console.log('\n🧠 REASONING:')
303-
console.log(reasoning)
304-
}
305-
306-
if (pros) {
307-
console.log('\n✅ PROS:')
308-
console.log(pros)
309-
}
262+
console.log('\n🧠 REASONING:')
263+
console.log(reasoning)
310264

311-
if (cons) {
312-
console.log('\n❌ CONS:')
313-
console.log(cons)
314-
}
265+
console.log('\n❌ CONS:')
266+
console.log(cons)
315267

316-
if (typeof overallScore === 'number') {
317-
console.log('\n📈 OVERALL SCORE:')
318-
const scoreBar = '█'.repeat(Math.floor(overallScore / 10))
319-
const emptyBar = '░'.repeat(10 - Math.floor(overallScore / 10))
320-
console.log(`${scoreBar}${emptyBar} ${overallScore}/100`)
321-
}
268+
console.log('\n📈 OVERALL SCORE:')
269+
const scoreBar = '█'.repeat(Math.floor(overallScore / 10))
270+
const emptyBar = '░'.repeat(10 - Math.floor(overallScore / 10))
271+
console.log(`${scoreBar}${emptyBar} ${overallScore}/100`)
322272

323273
console.log('\n' + '='.repeat(80) + '\n')
274+
275+
stats.completed++
276+
stats.scores.push(overallScore)
324277
} catch (error) {
325278
console.log(`\n${'='.repeat(80)}`)
326279
console.error(`✗ Failed eval for commit ${sha}`)
327280
console.log(`${'='.repeat(80)}\n`)
328281
console.error('Error details:', error)
329282
console.log('\n' + '='.repeat(80) + '\n')
283+
284+
stats.failed++
330285
}
286+
}
287+
288+
// Display summary statistics
289+
console.log('\n' + '='.repeat(80))
290+
console.log('📊 SUMMARY STATISTICS')
291+
console.log('='.repeat(80) + '\n')
292+
293+
console.log(`Total Evals: ${stats.total}`)
294+
console.log(
295+
`Completed: ${stats.completed} (${((stats.completed / stats.total) * 100).toFixed(1)}%)`,
296+
)
297+
console.log(
298+
`Failed: ${stats.failed} (${((stats.failed / stats.total) * 100).toFixed(1)}%)\n`,
299+
)
331300

332-
console.log('breaking for now')
333-
break
301+
if (stats.scores.length > 0) {
302+
const avgScore =
303+
stats.scores.reduce((a, b) => a + b, 0) / stats.scores.length
304+
const minScore = Math.min(...stats.scores)
305+
const maxScore = Math.max(...stats.scores)
306+
const medianScore = stats.scores.sort((a, b) => a - b)[
307+
Math.floor(stats.scores.length / 2)
308+
]
309+
310+
console.log('Score Statistics:')
311+
console.log(` Average: ${avgScore.toFixed(1)}/100`)
312+
console.log(` Median: ${medianScore}/100`)
313+
console.log(` Min: ${minScore}/100`)
314+
console.log(` Max: ${maxScore}/100\n`)
315+
316+
const scoreBar = '█'.repeat(Math.floor(avgScore / 10))
317+
const emptyBar = '░'.repeat(10 - Math.floor(avgScore / 10))
318+
console.log(
319+
`Average Score: ${scoreBar}${emptyBar} ${avgScore.toFixed(1)}/100\n`,
320+
)
334321
}
335322

336-
console.log('\n=== All evals completed ===')
323+
console.log('='.repeat(80))
337324
}
338325

339326
// Run main if this file is executed directly

evals/subagents/test-repo-utils.ts

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import fs from 'fs'
2+
import path from 'path'
3+
import * as os from 'os'
4+
import { execFileSync, execSync } from 'child_process'
5+
6+
/**
 * Runs `fn` against a throwaway checkout of a git repository.
 *
 * Clones `repoUrl` into a fresh temporary directory, checks out `commitSha`
 * (or its first parent when `checkoutPrevious` is set), optionally runs
 * `initCommand` inside the checkout, and then calls `fn` with the checkout
 * path. The temporary directory is removed afterwards whether `fn` resolves
 * or rejects; a failed removal is only logged as a warning.
 *
 * @param repoConfig.repoUrl - Repository location passed to `git clone`.
 * @param repoConfig.commitSha - Commit to check out.
 * @param repoConfig.initCommand - Optional shell command run inside the
 *   checkout before `fn` is called.
 * @param repoConfig.checkoutPrevious - When true, check out the first parent
 *   of `commitSha` instead of `commitSha` itself.
 * @param fn - Callback that receives the absolute path of the checkout.
 * @returns Whatever `fn` resolves to.
 * @throws If cloning, checkout, the init command, or `fn` itself fails.
 */
export const withTestRepo = async <T>(
  repoConfig: {
    repoUrl: string
    commitSha: string
    initCommand?: string
    checkoutPrevious?: boolean
  },
  fn: (cwd: string) => Promise<T>,
): Promise<T> => {
  const { repoUrl, commitSha, initCommand, checkoutPrevious } = repoConfig

  // Create a temporary directory for the test repo
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codebuff-eval-'))
  const repoDir = path.join(tempDir, 'repo')

  try {
    // Clone the repository. git is invoked with an argument array so no shell
    // parses the URL or the temp path: spaces in the path no longer split the
    // command, and shell metacharacters in the URL are not executed. The `--`
    // stops a URL that begins with `-` from being read as an option.
    console.log(`Cloning repository ${repoUrl} to ${repoDir}...`)
    execFileSync('git', ['clone', '--', repoUrl, repoDir], { stdio: 'ignore' })

    // Checkout the specific commit or the previous commit
    if (checkoutPrevious) {
      const previousCommitSha = getPreviousCommitSha(commitSha, repoDir)
      console.log(`Checking out previous commit ${previousCommitSha}...`)
      execFileSync('git', ['checkout', previousCommitSha], {
        cwd: repoDir,
        stdio: 'ignore',
      })
    } else {
      console.log(`Checking out commit ${commitSha}...`)
      execFileSync('git', ['checkout', commitSha], {
        cwd: repoDir,
        stdio: 'ignore',
      })
    }

    // Run initialization command if provided. This is intentionally a shell
    // command line supplied by the eval config, so it still goes through
    // execSync.
    if (initCommand) {
      console.log(`Running init command: ${initCommand}...`)
      execSync(initCommand, { cwd: repoDir, stdio: 'ignore' })
    }

    // Run the provided function with the repo directory
    return await fn(repoDir)
  } finally {
    // Clean up the temporary directory; best effort, never masks the result
    // or error of the work above.
    console.log(`Cleaning up temporary directory ${tempDir}...`)
    try {
      fs.rmSync(tempDir, { recursive: true, force: true })
    } catch (error) {
      console.warn(`Failed to clean up temporary directory: ${error}`)
    }
  }
}
61+
62+
/**
 * Gets the previous commit SHA (first parent) of a given commit.
 *
 * git is invoked with an argument array rather than a shell string. The `^`
 * parent suffix is an escape character under `cmd.exe`, and an interpolated
 * SHA would otherwise be parsed by the shell.
 *
 * @param commitSha - Commit whose first parent is wanted.
 * @param repoDir - Working directory of the cloned repository.
 * @returns The parent commit's SHA with surrounding whitespace trimmed.
 * @throws If git fails, e.g. `repoDir` is not a repository or the commit has
 *   no parent.
 */
const getPreviousCommitSha = (commitSha: string, repoDir: string): string => {
  const previousSha = execFileSync('git', ['rev-parse', `${commitSha}^`], {
    cwd: repoDir,
    encoding: 'utf-8',
  }).trim()
  return previousSha
}

0 commit comments

Comments
 (0)