1- import { execSync } from 'child_process'
21import * as fs from 'fs'
3- import * as os from 'os'
42import * as path from 'path'
53import { createTwoFilesPatch } from 'diff'
64
@@ -9,76 +7,12 @@ import { AgentDefinition } from '../../sdk/src'
97import { getUserCredentials } from '@codebuff/npm-app/credentials'
108import { API_KEY_ENV_VAR } from '@codebuff/common/old-constants'
119import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
12-
13- /**
14- * Helper function to manage test repository lifecycle
15- * Sets up a test repo, runs a function with the repo cwd, then cleans up
16- */
17- export const withTestRepo = async < T > (
18- repoConfig : {
19- repoUrl : string
20- commitSha : string
21- initCommand ?: string
22- checkoutPrevious ?: boolean
23- } ,
24- fn : ( cwd : string ) => Promise < T > ,
25- ) : Promise < T > => {
26- const { repoUrl, commitSha, initCommand, checkoutPrevious } = repoConfig
27-
28- // Create a temporary directory for the test repo
29- const tempDir = fs . mkdtempSync ( path . join ( os . tmpdir ( ) , 'codebuff-eval-' ) )
30- const repoDir = path . join ( tempDir , 'repo' )
31-
32- try {
33- // Clone the repository
34- console . log ( `Cloning repository ${ repoUrl } to ${ repoDir } ...` )
35- execSync ( `git clone ${ repoUrl } ${ repoDir } ` , { stdio : 'ignore' } )
36-
37- // Checkout the specific commit or the previous commit
38- if ( checkoutPrevious ) {
39- const previousCommitSha = getPreviousCommitSha ( commitSha , repoDir )
40- console . log ( `Checking out previous commit ${ previousCommitSha } ...` )
41- execSync ( `git checkout ${ previousCommitSha } ` , {
42- cwd : repoDir ,
43- stdio : 'ignore' ,
44- } )
45- } else {
46- console . log ( `Checking out commit ${ commitSha } ...` )
47- execSync ( `git checkout ${ commitSha } ` , { cwd : repoDir , stdio : 'ignore' } )
48- }
49-
50- // Run initialization command if provided
51- if ( initCommand ) {
52- console . log ( `Running init command: ${ initCommand } ...` )
53- execSync ( initCommand , { cwd : repoDir , stdio : 'ignore' } )
54- }
55-
56- // Run the provided function with the repo directory
57- return await fn ( repoDir )
58- } finally {
59- // Clean up the temporary directory
60- console . log ( `Cleaning up temporary directory ${ tempDir } ...` )
61- try {
62- fs . rmSync ( tempDir , { recursive : true , force : true } )
63- } catch ( error ) {
64- console . warn ( `Failed to clean up temporary directory: ${ error } ` )
65- }
66- }
67- }
68-
69- /**
70- * Gets the previous commit SHA (parent) of a given commit
71- */
72- const getPreviousCommitSha = ( commitSha : string , repoDir : string ) : string => {
73- const previousSha = execSync ( `git rev-parse ${ commitSha } ^` , {
74- cwd : repoDir ,
75- encoding : 'utf-8' ,
76- } ) . trim ( )
77- return previousSha
78- }
10+ import { withTestRepo } from './test-repo-utils'
7911
8012export const evalPlannerAgent = async ( params : {
13+ client : CodebuffClient
8114 agentId : string
15+ agentDefinitions : Array < AgentDefinition >
8216 spec : string
8317 repoUrl : string
8418 commitSha : string
@@ -89,21 +23,16 @@ export const evalPlannerAgent = async (params: {
8923 postContent : string
9024 } >
9125} ) => {
92- const { agentId, spec, repoUrl, commitSha, initCommand, fileStates } = params
93- const getLocalAuthToken = ( ) => {
94- return getUserCredentials ( ) ?. authToken
95- }
96- const client = new CodebuffClient ( {
97- apiKey : process . env [ API_KEY_ENV_VAR ] || getLocalAuthToken ( ) ,
98- } )
99-
100- const agentsPath = path . join ( __dirname , '../../.agents' )
101- const localAgentDefinitions = Object . values (
102- await loadLocalAgents ( {
103- agentsPath,
104- } ) ,
105- )
106-
26+ const {
27+ client,
28+ agentId,
29+ agentDefinitions,
30+ spec,
31+ repoUrl,
32+ commitSha,
33+ initCommand,
34+ fileStates,
35+ } = params
10736 const result = await withTestRepo (
10837 { repoUrl, commitSha, initCommand, checkoutPrevious : true } ,
10938 async ( cwd ) => {
@@ -113,7 +42,7 @@ export const evalPlannerAgent = async (params: {
11342 agent : agentId ,
11443 prompt : `Please plan a full implementation of the following spec: ${ spec } ` ,
11544 cwd,
116- agentDefinitions : localAgentDefinitions ,
45+ agentDefinitions,
11746 handleEvent : ( event ) => {
11847 console . log ( agentId , JSON . stringify ( event , null , 2 ) )
11948 } ,
@@ -178,11 +107,13 @@ ${outputString}
178107## Your Task
179108
180109Evaluate how well the implementation plan matches the real commit changes. Consider:
181- - Coverage of key changes from the commit
110+ - Coverage of changes from the commit
182111- Appropriateness and correctness of proposed code changes
183112- Whether following the plan would achieve the same (or better) behavior
184- - Any missing critical changes
185- - Any unnecessary proposed changes`
113+ - Any unnecessary proposed changes
114+ - Simplicity and clarity of the plan
115+ - Efficiency of the plan: reuse existing code, touch as few files as possible
116+ `
186117
187118 const judgeResult = await client . run ( {
188119 agent : 'eval-judge' ,
@@ -196,7 +127,12 @@ Evaluate how well the implementation plan matches the real commit changes. Consi
196127 throw new Error ( 'Error running judge agent' )
197128 }
198129 const { output : judgeOutput } = judgeResult
199- const judgingResults = judgeOutput . value ?? { }
130+ const judgingResults = ( judgeOutput . value ?? { } ) as {
131+ reasoning : string
132+ pros : string
133+ cons : string
134+ overallScore : number
135+ }
200136
201137 return { judgingResults, agentOutput : outputString }
202138}
@@ -242,7 +178,11 @@ Grade how well the implementation plan matches the actual implementation. The pl
242178- **Correctness**: Are the proposed code changes appropriate and accurate?
243179- **Behavioral equivalence**: Would following the plan achieve the same outcome?
244180- **Completeness**: Are any critical changes missing?
245- - **Efficiency**: Does it avoid unnecessary changes?` ,
181+ - **Efficiency**: Does it avoid unnecessary changes?
182+ - **Simplicity**: Is the plan simple and easy to understand?
183+
184+ You should be harsh if the plan makes superflous changes, fails to reuse existing code, or is otherwise not as simple as it could be.
185+ ` ,
246186}
247187
248188type EvalData = {
@@ -271,6 +211,25 @@ async function main() {
271211
272212 const { repoUrl, initCommand, evalCommits } = evalData
273213
214+ const client = new CodebuffClient ( {
215+ apiKey : process . env [ API_KEY_ENV_VAR ] || getUserCredentials ( ) ?. authToken ,
216+ } )
217+
218+ const agentsPath = path . join ( __dirname , '../../.agents' )
219+ const localAgentDefinitions = Object . values (
220+ await loadLocalAgents ( {
221+ agentsPath,
222+ } ) ,
223+ )
224+
225+ // Track statistics
226+ const stats = {
227+ total : evalCommits . length ,
228+ completed : 0 ,
229+ failed : 0 ,
230+ scores : [ ] as number [ ] ,
231+ }
232+
274233 // Loop through each eval task
275234 for ( const evalCommit of evalCommits ) {
276235 const { sha, spec, fileStates } = evalCommit
@@ -280,7 +239,9 @@ async function main() {
280239
281240 try {
282241 const result = await evalPlannerAgent ( {
242+ client,
283243 agentId : 'implementation-planner' ,
244+ agentDefinitions : localAgentDefinitions ,
284245 spec,
285246 repoUrl,
286247 commitSha : sha ,
@@ -298,42 +259,68 @@ async function main() {
298259 console . log ( '📊 EVALUATION RESULTS' )
299260 console . log ( '─' . repeat ( 80 ) )
300261
301- if ( reasoning ) {
302- console . log ( '\n🧠 REASONING:' )
303- console . log ( reasoning )
304- }
305-
306- if ( pros ) {
307- console . log ( '\n✅ PROS:' )
308- console . log ( pros )
309- }
262+ console . log ( '\n🧠 REASONING:' )
263+ console . log ( reasoning )
310264
311- if ( cons ) {
312- console . log ( '\n❌ CONS:' )
313- console . log ( cons )
314- }
265+ console . log ( '\n❌ CONS:' )
266+ console . log ( cons )
315267
316- if ( typeof overallScore === 'number' ) {
317- console . log ( '\n📈 OVERALL SCORE:' )
318- const scoreBar = '█' . repeat ( Math . floor ( overallScore / 10 ) )
319- const emptyBar = '░' . repeat ( 10 - Math . floor ( overallScore / 10 ) )
320- console . log ( `${ scoreBar } ${ emptyBar } ${ overallScore } /100` )
321- }
268+ console . log ( '\n📈 OVERALL SCORE:' )
269+ const scoreBar = '█' . repeat ( Math . floor ( overallScore / 10 ) )
270+ const emptyBar = '░' . repeat ( 10 - Math . floor ( overallScore / 10 ) )
271+ console . log ( `${ scoreBar } ${ emptyBar } ${ overallScore } /100` )
322272
323273 console . log ( '\n' + '=' . repeat ( 80 ) + '\n' )
274+
275+ stats . completed ++
276+ stats . scores . push ( overallScore )
324277 } catch ( error ) {
325278 console . log ( `\n${ '=' . repeat ( 80 ) } ` )
326279 console . error ( `✗ Failed eval for commit ${ sha } ` )
327280 console . log ( `${ '=' . repeat ( 80 ) } \n` )
328281 console . error ( 'Error details:' , error )
329282 console . log ( '\n' + '=' . repeat ( 80 ) + '\n' )
283+
284+ stats . failed ++
330285 }
286+ }
287+
288+ // Display summary statistics
289+ console . log ( '\n' + '=' . repeat ( 80 ) )
290+ console . log ( '📊 SUMMARY STATISTICS' )
291+ console . log ( '=' . repeat ( 80 ) + '\n' )
292+
293+ console . log ( `Total Evals: ${ stats . total } ` )
294+ console . log (
295+ `Completed: ${ stats . completed } (${ ( ( stats . completed / stats . total ) * 100 ) . toFixed ( 1 ) } %)` ,
296+ )
297+ console . log (
298+ `Failed: ${ stats . failed } (${ ( ( stats . failed / stats . total ) * 100 ) . toFixed ( 1 ) } %)\n` ,
299+ )
331300
332- console . log ( 'breaking for now' )
333- break
301+ if ( stats . scores . length > 0 ) {
302+ const avgScore =
303+ stats . scores . reduce ( ( a , b ) => a + b , 0 ) / stats . scores . length
304+ const minScore = Math . min ( ...stats . scores )
305+ const maxScore = Math . max ( ...stats . scores )
306+ const medianScore = stats . scores . sort ( ( a , b ) => a - b ) [
307+ Math . floor ( stats . scores . length / 2 )
308+ ]
309+
310+ console . log ( 'Score Statistics:' )
311+ console . log ( ` Average: ${ avgScore . toFixed ( 1 ) } /100` )
312+ console . log ( ` Median: ${ medianScore } /100` )
313+ console . log ( ` Min: ${ minScore } /100` )
314+ console . log ( ` Max: ${ maxScore } /100\n` )
315+
316+ const scoreBar = '█' . repeat ( Math . floor ( avgScore / 10 ) )
317+ const emptyBar = '░' . repeat ( 10 - Math . floor ( avgScore / 10 ) )
318+ console . log (
319+ `Average Score: ${ scoreBar } ${ emptyBar } ${ avgScore . toFixed ( 1 ) } /100\n` ,
320+ )
334321 }
335322
336- console . log ( '\n=== All evals completed ===' )
323+ console . log ( '=' . repeat ( 80 ) )
337324}
338325
339326// Run main if this file is executed directly
0 commit comments