Skip to content

Commit 06c48b3

Browse files
committed
evals: allow env field in eval config .json which is passed to sdk run()
1 parent 4532582 commit 06c48b3

File tree

6 files changed

+43
-5
lines changed

6 files changed

+43
-5
lines changed

evals/buffbench/agent-runner.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ export async function runAgentOnCommit({
1919
commit,
2020
repoUrl,
2121
initCommand,
22+
env,
2223
localAgentDefinitions,
2324
printEvents,
2425
}: {
@@ -27,6 +28,7 @@ export async function runAgentOnCommit({
2728
commit: EvalCommitV2
2829
repoUrl: string
2930
initCommand?: string
31+
env?: Record<string, string>
3032
localAgentDefinitions: any[]
3133
printEvents: boolean
3234
}): Promise<{
@@ -51,6 +53,7 @@ export async function runAgentOnCommit({
5153
repoUrl,
5254
parentSha: commit.parentSha,
5355
initCommand,
56+
env,
5457
},
5558
async (repoDir) => {
5659
const timeoutMs = 30 * 60 * 1000 // 30 minutes
@@ -60,6 +63,7 @@ export async function runAgentOnCommit({
6063
prompt: commit.prompt,
6164
agentDefinitions: localAgentDefinitions,
6265
cwd: repoDir,
66+
env,
6367
handleEvent: (event) => {
6468
if (
6569
(event.type === 'tool_call' || event.type === 'tool_result') &&

evals/buffbench/eval-codebuff.json

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,33 @@
22
"repoUrl": "https://github.com/CodebuffAI/codebuff",
33
"generationDate": "2025-10-12T05:55:40.855Z",
44
"initCommand": "bun install",
5+
"env": {
6+
"ANTHROPIC_API_KEY": "test-key",
7+
"ANTHROPIC_API_KEY2": "test-key-2",
8+
"HELICONE_API_KEY": "test-helicone",
9+
"OPEN_AI_KEY": "test-openai",
10+
"GOOGLE_GENERATIVE_AI_API_KEY": "test-google",
11+
"DEEPSEEK_API_KEY": "test-deepseek",
12+
"GEMINI_API_KEY": "test-gemini",
13+
"OPEN_ROUTER_API_KEY": "test-openrouter",
14+
"RELACE_API_KEY": "test-relace",
15+
"LINKUP_API_KEY": "test-linkup",
16+
"GOOGLE_CLOUD_PROJECT_ID": "test-project",
17+
"PORT": 3003,
18+
"DATABASE_URL": "postgresql://test",
19+
"CODEBUFF_GITHUB_ID": "test-github-id",
20+
"CODEBUFF_GITHUB_SECRET": "test-github-secret",
21+
"NEXTAUTH_SECRET": "test-nextauth-secret",
22+
"STRIPE_SECRET_KEY": "test-stripe-key",
23+
"STRIPE_WEBHOOK_SECRET_KEY": "test-stripe-webhook",
24+
"STRIPE_USAGE_PRICE_ID": "test-price-id",
25+
"STRIPE_TEAM_FEE_PRICE_ID": "test-team-price-id",
26+
"LOOPS_API_KEY": "test-loops",
27+
"DISCORD_PUBLIC_KEY": "test-discord-public",
28+
"DISCORD_BOT_TOKEN": "test-discord-bot",
29+
"DISCORD_APPLICATION_ID": "test-discord-app",
30+
"API_KEY_ENCRYPTION_SECRET": "12345678901234567890123456789012"
31+
},
532
"evalCommits": [
633
{
734
"id": "filter-system-history",

evals/buffbench/run-buffbench.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ async function runTask(options: {
2222
agents: string[]
2323
repoUrl: string
2424
initCommand?: string
25+
env?: Record<string, string>
2526
logsDir: string
2627
index: number
2728
totalTasks: number
@@ -40,6 +41,7 @@ async function runTask(options: {
4041
agents,
4142
repoUrl,
4243
initCommand,
44+
env,
4345
logsDir,
4446
index,
4547
totalTasks,
@@ -63,6 +65,7 @@ async function runTask(options: {
6365
commit,
6466
repoUrl,
6567
initCommand,
68+
env,
6669
localAgentDefinitions,
6770
printEvents,
6871
})
@@ -300,6 +303,7 @@ export async function runBuffBench(options: {
300303
agents,
301304
repoUrl: evalData.repoUrl,
302305
initCommand: evalData.initCommand,
306+
env: evalData.env,
303307
logsDir,
304308
index,
305309
totalTasks: commitsToRun.length,

evals/buffbench/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ export interface EvalDataV2 {
4343
testRepoName?: string
4444
generationDate: string
4545
initCommand?: string
46+
env?: Record<string, string>
4647
evalCommits: EvalCommitV2[]
4748
}
4849

evals/subagents/test-repo-utils.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,11 @@ export const withTestRepo = async <T>(
1414
// The sha of the commit to checkout. If you have a commit with changes to replicate, you would check out the parent commit.
1515
parentSha: string
1616
initCommand?: string
17+
env?: Record<string, string>
1718
},
1819
fn: (cwd: string) => Promise<T>,
1920
): Promise<T> => {
20-
const { repoUrl, parentSha, initCommand } = repoConfig
21+
const { repoUrl, parentSha, initCommand, env } = repoConfig
2122

2223
// Create a temporary directory for the test repo
2324
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codebuff-eval-'))
@@ -35,7 +36,11 @@ export const withTestRepo = async <T>(
3536
if (initCommand) {
3637
console.log(`Running init command: ${initCommand}...`)
3738
try {
38-
execSync(initCommand, { cwd: repoDir })
39+
execSync(initCommand, {
40+
cwd: repoDir,
41+
stdio: 'ignore',
42+
env: { ...process.env, ...env },
43+
})
3944
} catch (error) {
4045
console.error(
4146
`Error running init command: ${getErrorObject(error).message}`,

sdk/src/tools/run-terminal-command.ts

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,6 @@ export function runTerminalCommand({
4040
env: {
4141
...process.env,
4242
...(env ?? {}),
43-
FORCE_COLOR: '1',
44-
CLICOLOR: '1',
45-
CLICOLOR_FORCE: '1',
4643
},
4744
stdio: 'pipe',
4845
})

0 commit comments

Comments
 (0)