Skip to content

Commit 974edf3

Browse files
committed
Also generate a spec
1 parent 59665e4 commit 974edf3

File tree

3 files changed

+73
-121
lines changed

3 files changed

+73
-121
lines changed
Lines changed: 48 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,16 @@ import fileExplorerDef from '../../.agents/file-explorer/file-explorer'
44
import findAllReferencerDef from '../../.agents/file-explorer/find-all-referencer'
55
import { PLACEHOLDER } from '../../.agents/types/secret-agent-definition'
66

7-
const promptGeneratorAgentDef: AgentDefinition = {
8-
id: 'git-evals2-prompt-generator',
9-
displayName: 'Git Evals2 Prompt Generator',
7+
const evalTaskGeneratorAgentDef: AgentDefinition = {
8+
id: 'git-evals2-eval-task-generator',
9+
displayName: 'Git Evals2 Eval Task Generator',
1010
model: 'openai/gpt-5',
1111
toolNames: ['spawn_agents', 'read_files', 'set_output'],
1212
spawnableAgents: ['file-explorer', 'find-all-referencer'],
1313
inputSchema: {
1414
prompt: {
1515
type: 'string',
16-
description: 'Instructions to generate the prompt',
16+
description: 'Instructions to generate the task spec and prompt',
1717
},
1818
},
1919
outputMode: 'structured_output',
@@ -27,7 +27,12 @@ const promptGeneratorAgentDef: AgentDefinition = {
2727
},
2828
reasoning: {
2929
type: 'string',
30-
description: 'Your thoughts about what should be in the prompt',
30+
description: 'Your thoughts about the task, spec, and prompt',
31+
},
32+
spec: {
33+
type: 'string',
34+
description:
35+
'Clear specification describing WHAT needs to be implemented (observable behavior/structure, not HOW)',
3136
},
3237
prompt: {
3338
type: 'string',
@@ -38,47 +43,56 @@ const promptGeneratorAgentDef: AgentDefinition = {
3843
items: { type: 'string' },
3944
description: 'List of supplemental file paths',
4045
},
41-
confidence: {
42-
type: 'number',
43-
description: 'Confidence score 0-1 in the quality of the prompt',
44-
},
4546
},
46-
required: ['id', 'prompt', 'supplementalFiles', 'reasoning', 'confidence'],
47+
required: ['id', 'reasoning', 'spec', 'prompt', 'supplementalFiles'],
4748
},
48-
systemPrompt: `You are an expert at analyzing git commits and generating high-level user prompts.
49+
systemPrompt: `You are an expert at analyzing git commits and generating evaluation tasks for AI coding assistants.
4950
5051
You will receive:
5152
- A git diff showing the changes made
5253
- The list of files that were edited
5354
- An optional commit message
5455
- The repository directory where you can explore the codebase
5556
57+
You must generate both a specification (spec) and a user prompt for the task.
58+
5659
${PLACEHOLDER.FILE_TREE_PROMPT}
5760
${PLACEHOLDER.KNOWLEDGE_FILES_CONTENTS}`,
5861

5962
instructionsPrompt: `Your task:
6063
1. Analyze the git diff to understand what changed
61-
2. Use your tools (read_files, spawn_agents) to explore the codebase and understand context
62-
3. Generate a short, descriptive task ID (2-3 hyphenated words like "fix-auth-bug" or "refactor-login-flow")
63-
4. Identify supplemental files that would help a judge understand the change (exclude directly edited files)
64-
5. Generate a high-level user prompt that describes WHAT needs to be done (not HOW)
64+
2. Spawn the file-explorer and find-all-referencer to explore the codebase and understand context.
65+
3. Read as many files relevant to the changes as possible.
66+
4. Generate the output, including:
67+
- a short, descriptive task ID (2-3 hyphenated words like "fix-auth-bug" or "refactor-login-flow")
68+
- a clear specification describing exactly what needs to be implemented
69+
- a high-level user prompt that describes what needs to be done, leaving out details that should be reconstructed by the agent
70+
- supplemental files that would help a judge understand the change (exclude directly edited files)
6571
6672
Key principles for the task ID:
6773
- 2-3 words maximum, hyphenated (e.g., "fix-memory-leak", "add-user-profile", "refactor-auth-flow")
6874
- Descriptive but concise
6975
- Use action verbs when appropriate (fix, add, remove, refactor, update, implement)
7076
- Lowercase with hyphens
7177
78+
Key principles for the spec:
79+
- Prescribe exactly how to make the change with references to the files that need to be changed
80+
- Do not include code
81+
- Focus on the observable behavior or structure that needs to be implemented
82+
- Be clear enough that a skilled developer or AI could implement it from scratch
83+
- Be phrased as what needs to be done, not what was already done
84+
- Cover all the changes shown across multiple files
85+
7286
Key principles for the prompt:
73-
- Focus on the functional requirement, not implementation details
87+
- Focus on the high-level functional requirements, not implementation details
7488
- Use natural language: "add user authentication" not "implement authenticateUser function"
7589
- Omit details that should be reconstructed by the agent
7690
- Be clear enough that a skilled developer could implement from scratch
7791
- Consider the commit message as a hint but don't just copy it
7892
`,
7993
}
8094

81-
export async function generatePromptFromCommit({
95+
export async function generateEvalTask({
8296
client,
8397
input,
8498
agentDefinitions,
@@ -95,45 +109,54 @@ export async function generatePromptFromCommit({
95109
agentDefinitions?: any[]
96110
}): Promise<{
97111
id: string
112+
reasoning: string
113+
spec: string
98114
prompt: string
99115
supplementalFiles: string[]
100-
confidence: number
101-
reasoning: string
102116
}> {
103117
const { diff, editedFilePaths, commitMessage, repoPath } = input
104118

105119
const allAgentDefinitions = [
106-
promptGeneratorAgentDef,
120+
evalTaskGeneratorAgentDef,
107121
fileExplorerDef,
108122
findAllReferencerDef,
109123
...(agentDefinitions || []),
110124
]
111125

112126
const generatorResult = await client.run({
113-
agent: 'git-evals2-prompt-generator',
127+
agent: 'git-evals2-eval-task-generator',
114128
prompt:
115-
'Generate a high-level user prompt based on the git diff and codebase exploration',
129+
'Generate a task specification and user prompt based on the git diff and codebase exploration',
116130
params: {
117131
diff,
118132
editedFilePaths,
119133
commitMessage,
120134
},
121135
cwd: repoPath,
122136
agentDefinitions: allAgentDefinitions,
137+
handleEvent: (event) => {
138+
if (event.type === 'subagent_start') {
139+
console.log(`[Agent] Starting: ${event.displayName}`)
140+
} else if (event.type === 'tool_call') {
141+
console.log(`[Tool] ${event.toolName}`)
142+
} else if (event.type === 'text') {
143+
console.log(`[Text] ${event.text}...`)
144+
}
145+
},
123146
})
124147

125148
if (
126149
generatorResult.output.type !== 'structuredOutput' ||
127150
!generatorResult.output.value
128151
) {
129-
throw new Error('Failed to generate structured prompt output')
152+
throw new Error('Failed to generate structured task output')
130153
}
131154

132155
return generatorResult.output.value as {
133156
id: string
157+
reasoning: string
158+
spec: string
134159
prompt: string
135160
supplementalFiles: string[]
136-
reasoning: string
137-
confidence: number
138161
}
139162
}

evals/git-evals2/gen-evals.ts

Lines changed: 12 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -2,41 +2,18 @@ import { execSync } from 'child_process'
22
import { createTwoFilesPatch } from 'diff'
33
import fs from 'fs'
44
import path from 'path'
5+
import { mapLimit } from 'async'
56

6-
import { disableLiveUserInputCheck } from '@codebuff/backend/live-user-inputs'
7-
import { promptAiSdk } from '@codebuff/backend/llm-apis/vercel-ai-sdk/ai-sdk'
8-
import { models } from '@codebuff/common/old-constants'
97
import { API_KEY_ENV_VAR } from '@codebuff/common/old-constants'
108
import { getUserCredentials } from '@codebuff/npm-app/credentials'
11-
import { mapLimit } from 'async'
129

1310
import { CodebuffClient } from '../../sdk/src/client'
1411
import { extractRepoNameFromUrl } from '../git-evals/setup-test-repo'
1512
import { withTestRepoAndParent } from '../subagents/test-repo-utils'
16-
import { generatePromptFromCommit } from './prompt-generator'
13+
import { generateEvalTask } from './eval-task-generator'
1714

1815
import type { EvalDataV2, EvalCommitV2, FileDiff } from './types'
1916

20-
const SPEC_GENERATION_PROMPT = `Given a set of file changes and an optional description, write a clear specification describing WHAT needs to be implemented.
21-
First, use <thinking> tags to analyze the changes and determine what should go into the spec.
22-
23-
Then, generate the spec.
24-
25-
The spec should:
26-
1. Focus on the observable behavior or structure that needs to be implemented
27-
2. Not include implementation details or specific code
28-
3. Not prescribe HOW to make the change
29-
4. Be clear enough that a skilled developer or AI could implement it from scratch
30-
5. Be phrased as what needs to be done, not what was already done
31-
6. Cover all the changes shown across multiple files
32-
33-
The spec will be used to test an AI coding assistant's ability to implement the described functionality.
34-
35-
Please wrap your final specification in <spec></spec> tags.`
36-
37-
const fingerprintId = 'evals-v2'
38-
const userInputId = 'evals-v2'
39-
4017
function getFileContentAtCommit(
4118
repoPath: string,
4219
commitSha: string,
@@ -127,52 +104,6 @@ function getCommitMessage(repoPath: string, commitSha: string): string {
127104
}).trim()
128105
}
129106

130-
async function generateSpecForFileDiffs(
131-
fileDiffs: FileDiff[],
132-
clientSessionId: string,
133-
): Promise<string> {
134-
const fileContext = fileDiffs
135-
.map(({ path, status, diff }) => {
136-
let diffDescription = `File: ${path}\n`
137-
138-
if (status === 'added') {
139-
diffDescription += `New file created\n${diff}\n`
140-
} else if (status === 'deleted') {
141-
diffDescription += `File deleted\n${diff}\n`
142-
} else if (status === 'renamed') {
143-
diffDescription += `File renamed\n${diff}\n`
144-
} else {
145-
diffDescription += `${diff}\n`
146-
}
147-
148-
return diffDescription
149-
})
150-
.join('\n---\n')
151-
152-
const prompt = `${SPEC_GENERATION_PROMPT}\n\nFile Changes:\n${fileContext}`
153-
154-
try {
155-
disableLiveUserInputCheck()
156-
const response = await promptAiSdk({
157-
messages: [{ role: 'user', content: prompt }],
158-
model: models.openrouter_claude_sonnet_4,
159-
clientSessionId,
160-
fingerprintId,
161-
userInputId,
162-
userId: undefined,
163-
logger: console,
164-
})
165-
166-
const specMatch = response.match(/<spec>(.*?)<\/spec>/s)
167-
const spec = specMatch ? specMatch[1].trim() : response.trim()
168-
169-
return spec || 'Failed to generate specification'
170-
} catch (error) {
171-
console.error('Error generating spec:', error)
172-
return 'Failed to generate specification due to error'
173-
}
174-
}
175-
176107
export async function generateEvalFileV2({
177108
repoUrl,
178109
commitShas,
@@ -188,8 +119,6 @@ export async function generateEvalFileV2({
188119
apiKey: process.env[API_KEY_ENV_VAR] || getUserCredentials()?.authToken,
189120
})
190121

191-
const clientSessionId = `gen-evals-v2-${Math.random().toString(36).substring(2)}`
192-
193122
console.log(`Processing ${commitShas.length} commits in parallel...`)
194123

195124
const BATCH_SIZE = 5
@@ -212,18 +141,13 @@ export async function generateEvalFileV2({
212141
commitSha,
213142
parentSha,
214143
)
215-
const spec = await generateSpecForFileDiffs(fileDiffs, clientSessionId)
216-
217-
console.log(
218-
`Generated spec for ${commitSha.slice(0, 8)}: ${spec.substring(0, 100)}...`,
219-
)
220144

221145
const fullDiff = getFullDiff(repoPath, commitSha, parentSha)
222146
const commitMessage = getCommitMessage(repoPath, commitSha)
223147
const editedFilePaths = fileDiffs.map((f) => f.path)
224148

225-
console.log(`Generating prompt for ${commitSha.slice(0, 8)}...`)
226-
const promptResult = await generatePromptFromCommit({
149+
console.log(`Generating eval task for ${commitSha.slice(0, 8)}...`)
150+
const taskResult = await generateEvalTask({
227151
client,
228152
input: {
229153
commitSha,
@@ -235,20 +159,22 @@ export async function generateEvalFileV2({
235159
},
236160
})
237161

162+
console.log(`Task ID: ${taskResult.id}`)
163+
console.log(`Generated spec: ${taskResult.spec.substring(0, 100)}...`)
238164
console.log(
239-
`Generated prompt: ${promptResult.prompt.substring(0, 100)}...`,
165+
`Generated prompt: ${taskResult.prompt.substring(0, 100)}...`,
240166
)
241167
console.log(
242-
`Supplemental files: ${promptResult.supplementalFiles.length} files`,
168+
`Supplemental files: ${taskResult.supplementalFiles.length} files`,
243169
)
244170

245171
return {
172+
id: taskResult.id,
246173
sha: commitSha,
247174
parentSha,
248-
spec,
249-
id: promptResult.id,
250-
prompt: promptResult.prompt,
251-
supplementalFiles: promptResult.supplementalFiles,
175+
spec: taskResult.spec,
176+
prompt: taskResult.prompt,
177+
supplementalFiles: taskResult.supplementalFiles,
252178
fileDiffs,
253179
}
254180
},

evals/git-evals2/migrate-evals-to-v2.ts

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,10 @@ async function migrateCommit(
8282
encoding: 'utf-8',
8383
}).trim()
8484

85-
console.log(`Generating prompt for ${commitSha.slice(0, 8)}...`)
85+
console.log(`Generating task for ${commitSha.slice(0, 8)}...`)
8686

87-
const { generatePromptFromCommit } = await import('./prompt-generator')
88-
const promptResult = await generatePromptFromCommit({
87+
const { generateEvalTask } = await import('./eval-task-generator')
88+
const taskResult = await generateEvalTask({
8989
client,
9090
input: {
9191
commitSha,
@@ -98,21 +98,24 @@ async function migrateCommit(
9898
agentDefinitions,
9999
})
100100

101+
console.log(`Task ID: ${taskResult.id}`)
101102
console.log(
102-
`Generated prompt: ${promptResult.prompt.substring(0, 100)}...`,
103+
`Generated spec: ${taskResult.spec.substring(0, 100)}...`,
103104
)
104105
console.log(
105-
`Supplemental files: ${promptResult.supplementalFiles.length} files`,
106+
`Generated prompt: ${taskResult.prompt.substring(0, 100)}...`,
107+
)
108+
console.log(
109+
`Supplemental files: ${taskResult.supplementalFiles.length} files`,
106110
)
107-
console.log(`Task ID: ${promptResult.id}`)
108111

109112
return {
110-
id: promptResult.id,
113+
id: taskResult.id,
111114
sha: commitSha,
112115
parentSha,
113-
spec: oldCommit.spec,
114-
prompt: promptResult.prompt,
115-
supplementalFiles: promptResult.supplementalFiles,
116+
spec: taskResult.spec || oldCommit.spec,
117+
prompt: taskResult.prompt,
118+
supplementalFiles: taskResult.supplementalFiles,
116119
fileDiffs,
117120
}
118121
},

0 commit comments

Comments
 (0)