11#!/usr/bin/env bun
22
3+ import { execFileSync } from 'child_process'
34import path from 'path'
45
6+ import { generateCompactId } from '@codebuff/common/util/string'
57import { Command , Flags } from '@oclif/core'
68
79import { sendEvalResultsEmail } from './email-eval-results'
@@ -92,6 +94,70 @@ class RunEvalSetCommand extends Command {
9294 }
9395}
9496
/**
 * Creates a git worktree checked out at the current HEAD commit so that an
 * eval run keeps using one fixed version of the code.
 *
 * The worktree is placed in a sibling directory of the project root named
 * `codebuff-eval-worktree-<id>`, and `bun install` is run inside it.
 *
 * If a step fails after `git worktree add` has succeeded (e.g. the dependency
 * install), the half-prepared worktree is removed again before the error is
 * rethrown. The caller never receives the path in that case, so nothing else
 * could clean it up.
 *
 * @returns The absolute path of the newly created worktree.
 * @throws Rethrows the error raised by the failing `git` or `bun` command.
 */
function createEvalWorktree(): string {
  const currentCommit = execFileSync('git', ['rev-parse', 'HEAD'], {
    encoding: 'utf-8',
  }).trim()

  const worktreeId = generateCompactId()
  // Get project root by going up from the evals/git-evals directory
  const projectRoot = path.resolve(__dirname, '../..')
  const worktreePath = path.resolve(
    projectRoot,
    '..',
    `codebuff-eval-worktree-${worktreeId}`,
  )

  console.log(`Creating eval worktree at ${worktreePath}...`)
  console.log(`Commit: ${currentCommit}`)

  // Tracks whether `git worktree add` completed, so the failure path knows
  // whether there is anything on disk / in git metadata to undo.
  let worktreeAdded = false

  try {
    execFileSync('git', ['worktree', 'add', worktreePath, currentCommit], {
      stdio: 'inherit',
    })
    worktreeAdded = true
    console.log('✅ Worktree created successfully')

    // Install dependencies in worktree to ensure node_modules are in sync
    console.log('Installing dependencies in worktree...')
    execFileSync('bun', ['install'], {
      cwd: worktreePath,
      stdio: 'inherit',
    })
    console.log('✅ Dependencies installed successfully')

    return worktreePath
  } catch (error: unknown) {
    if (worktreeAdded) {
      // The worktree exists but is unusable; report the step that actually
      // failed and remove the checkout so it is not leaked.
      console.error('Failed to install dependencies in worktree:', error)
      cleanupEvalWorktree(worktreePath)
    } else {
      console.error('Failed to create worktree:', error)
    }
    throw error
  }
}
137+
/**
 * Best-effort removal of an eval worktree.
 *
 * Runs `git worktree remove --force` on the given path. If that command
 * fails, the failure is logged and `git worktree prune` is attempted instead.
 * Errors are only logged, never thrown.
 *
 * @param worktreePath Path of the worktree to remove.
 */
function cleanupEvalWorktree(worktreePath: string): void {
  console.log(`\nCleaning up eval worktree at ${worktreePath}...`)

  // Both git invocations stream their output straight to this process.
  const inheritOutput = { stdio: 'inherit' } as const

  try {
    execFileSync(
      'git',
      ['worktree', 'remove', worktreePath, '--force'],
      inheritOutput,
    )
    console.log('✅ Worktree removed successfully')
    return
  } catch (removeFailure) {
    console.error('Failed to remove worktree:', removeFailure)
  }

  // Removal failed: fall back to pruning worktree entries.
  try {
    execFileSync('git', ['worktree', 'prune'], inheritOutput)
  } catch (pruneFailure) {
    console.error('Failed to prune worktrees:', pruneFailure)
  }
}
160+
95161async function runEvalSet ( options : {
96162 sets : string
97163 'output-dir' : string
@@ -124,10 +190,14 @@ async function runEvalSet(options: {
124190 console . log ( 'Starting eval set run...' )
125191 console . log ( `Output directory: ${ outputDir } ` )
126192
127- // Set up signal handlers to clean up child processes
193+ // Create worktree to freeze code version for this eval run
194+ const worktreePath = createEvalWorktree ( )
195+
196+ // Set up signal handlers to clean up child processes and worktree
128197 const signalHandler = async ( signal : string ) => {
129198 console . log ( `\nReceived ${ signal } , cleaning up evaluation processes...` )
130199 await terminateAllEvalChildren ( )
200+ cleanupEvalWorktree ( worktreePath )
131201 console . log ( 'Cleanup complete.' )
132202 process . exit ( signal === 'SIGINT' ? 130 : 143 )
133203 }
@@ -151,25 +221,28 @@ async function runEvalSet(options: {
151221 )
152222 }
153223
224+ // Resolve paths relative to worktree if using one
225+ const baseDir = path . join ( worktreePath , 'evals' , 'git-evals' )
226+
154227 const allEvalConfigs : EvalConfig [ ] = [
155228 {
156229 name : 'codebuff' ,
157- evalDataPath : path . join ( __dirname , 'eval-codebuff2.json' ) ,
230+ evalDataPath : path . join ( baseDir , 'eval-codebuff2.json' ) ,
158231 outputDir,
159232 } ,
160233 {
161234 name : 'manifold' ,
162- evalDataPath : path . join ( __dirname , 'eval-manifold2.json' ) ,
235+ evalDataPath : path . join ( baseDir , 'eval-manifold2.json' ) ,
163236 outputDir,
164237 } ,
165238 {
166239 name : 'plane' ,
167- evalDataPath : path . join ( __dirname , 'eval-plane.json' ) ,
240+ evalDataPath : path . join ( baseDir , 'eval-plane.json' ) ,
168241 outputDir,
169242 } ,
170243 {
171244 name : 'saleor' ,
172- evalDataPath : path . join ( __dirname , 'eval-saleor.json' ) ,
245+ evalDataPath : path . join ( baseDir , 'eval-saleor.json' ) ,
173246 outputDir,
174247 } ,
175248 ]
@@ -204,6 +277,7 @@ async function runEvalSet(options: {
204277 config . limit ,
205278 options . concurrency === 1 ,
206279 agent ,
280+ worktreePath ,
207281 )
208282 } catch ( error ) {
209283 const evalDuration = Date . now ( ) - evalStartTime
@@ -447,6 +521,9 @@ async function runEvalSet(options: {
447521 }
448522 }
449523
524+ // Clean up worktree before exiting
525+ cleanupEvalWorktree ( worktreePath )
526+
450527 if ( failureCount > 0 ) {
451528 console . log (
452529 '\n⚠️ Some evaluations failed. Check the logs above for details.' ,
0 commit comments