Also generate a spec

jahooma · jahooma · commit 974edf34ac67 · 2025-10-11T19:58:52.000-07:00
diff --git a/evals/git-evals2/eval-task-generator.ts b/evals/git-evals2/eval-task-generator.ts
@@ -4,16 +4,16 @@ import fileExplorerDef from '../../.agents/file-explorer/file-explorer'
 import findAllReferencerDef from '../../.agents/file-explorer/find-all-referencer'
 import { PLACEHOLDER } from '../../.agents/types/secret-agent-definition'
 
-const promptGeneratorAgentDef: AgentDefinition = {
-  id: 'git-evals2-prompt-generator',
-  displayName: 'Git Evals2 Prompt Generator',
+const evalTaskGeneratorAgentDef: AgentDefinition = {
+  id: 'git-evals2-eval-task-generator',
+  displayName: 'Git Evals2 Eval Task Generator',
   model: 'openai/gpt-5',
   toolNames: ['spawn_agents', 'read_files', 'set_output'],
   spawnableAgents: ['file-explorer', 'find-all-referencer'],
   inputSchema: {
     prompt: {
       type: 'string',
-      description: 'Instructions to generate the prompt',
+      description: 'Instructions to generate the task spec and prompt',
     },
   },
   outputMode: 'structured_output',
@@ -27,7 +27,12 @@ const promptGeneratorAgentDef: AgentDefinition = {
       },
       reasoning: {
         type: 'string',
-        description: 'Your thoughts about what should be in the prompt',
+        description: 'Your thoughts about the task, spec, and prompt',
+      },
+      spec: {
+        type: 'string',
+        description:
+          'Clear specification describing WHAT needs to be implemented (observable behavior/structure, not HOW)',
       },
       prompt: {
         type: 'string',
@@ -38,47 +43,56 @@ const promptGeneratorAgentDef: AgentDefinition = {
         items: { type: 'string' },
         description: 'List of supplemental file paths',
       },
-      confidence: {
-        type: 'number',
-        description: 'Confidence score 0-1 in the quality of the prompt',
-      },
     },
-    required: ['id', 'prompt', 'supplementalFiles', 'reasoning', 'confidence'],
+    required: ['id', 'reasoning', 'spec', 'prompt', 'supplementalFiles'],
   },
-  systemPrompt: `You are an expert at analyzing git commits and generating high-level user prompts.
+  systemPrompt: `You are an expert at analyzing git commits and generating evaluation tasks for AI coding assistants.
 
 You will receive:
 - A git diff showing the changes made
 - The list of files that were edited
 - An optional commit message
 - The repository directory where you can explore the codebase
 
+You must generate both a specification (spec) and a user prompt for the task.
+
 ${PLACEHOLDER.FILE_TREE_PROMPT}
 ${PLACEHOLDER.KNOWLEDGE_FILES_CONTENTS}`,
 
   instructionsPrompt: `Your task:
 1. Analyze the git diff to understand what changed
-2. Use your tools (read_files, spawn_agents) to explore the codebase and understand context
-3. Generate a short, descriptive task ID (2-3 hyphenated words like "fix-auth-bug" or "refactor-login-flow")
-4. Identify supplemental files that would help a judge understand the change (exclude directly edited files)
-5. Generate a high-level user prompt that describes WHAT needs to be done (not HOW)
+2. Spawn the file-explorer and find-all-referencer to explore the codebase and understand context.
+3. Read as many files relevant to the changes as possible.
+4. Generate the output, including:
+- a short, descriptive task ID (2-3 hyphenated words like "fix-auth-bug" or "refactor-login-flow")
+- a clear specification describing exactly what needs to be implemented
+- a high-level user prompt that describes what needs to be done, leaving out details that should be reconstructed by the agent
+- supplemental files that would help a judge understand the change (exclude directly edited files)
 
 Key principles for the task ID:
 - 2-3 words maximum, hyphenated (e.g., "fix-memory-leak", "add-user-profile", "refactor-auth-flow")
 - Descriptive but concise
 - Use action verbs when appropriate (fix, add, remove, refactor, update, implement)
 - Lowercase with hyphens
 
+Key principles for the spec:
+- Prescribe exactly how to make the change with references to the files that need to be changed
+- Do not include code
+- Focus on the observable behavior or structure that needs to be implemented
+- Be clear enough that a skilled developer or AI could implement it from scratch
+- Be phrased as what needs to be done, not what was already done
+- Cover all the changes shown across multiple files
+
 Key principles for the prompt:
-- Focus on the functional requirement, not implementation details
+- Focus on the high-level functional requirements, not implementation details
 - Use natural language: "add user authentication" not "implement authenticateUser function"
 - Omit details that should be reconstructed by the agent
 - Be clear enough that a skilled developer could implement from scratch
 - Consider the commit message as a hint but don't just copy it
 `,
 }
 
-export async function generatePromptFromCommit({
+export async function generateEvalTask({
   client,
   input,
   agentDefinitions,
@@ -95,45 +109,54 @@ export async function generatePromptFromCommit({
   agentDefinitions?: any[]
 }): Promise<{
   id: string
+  reasoning: string
+  spec: string
   prompt: string
   supplementalFiles: string[]
-  confidence: number
-  reasoning: string
 }> {
   const { diff, editedFilePaths, commitMessage, repoPath } = input
 
   const allAgentDefinitions = [
-    promptGeneratorAgentDef,
+    evalTaskGeneratorAgentDef,
     fileExplorerDef,
     findAllReferencerDef,
     ...(agentDefinitions || []),
   ]
 
   const generatorResult = await client.run({
-    agent: 'git-evals2-prompt-generator',
+    agent: 'git-evals2-eval-task-generator',
     prompt:
-      'Generate a high-level user prompt based on the git diff and codebase exploration',
+      'Generate a task specification and user prompt based on the git diff and codebase exploration',
     params: {
       diff,
       editedFilePaths,
       commitMessage,
     },
     cwd: repoPath,
     agentDefinitions: allAgentDefinitions,
+    handleEvent: (event) => {
+      if (event.type === 'subagent_start') {
+        console.log(`[Agent] Starting: ${event.displayName}`)
+      } else if (event.type === 'tool_call') {
+        console.log(`[Tool] ${event.toolName}`)
+      } else if (event.type === 'text') {
+        console.log(`[Text] ${event.text}...`)
+      }
+    },
   })
 
   if (
     generatorResult.output.type !== 'structuredOutput' ||
     !generatorResult.output.value
   ) {
-    throw new Error('Failed to generate structured prompt output')
+    throw new Error('Failed to generate structured task output')
   }
 
   return generatorResult.output.value as {
     id: string
+    reasoning: string
+    spec: string
     prompt: string
     supplementalFiles: string[]
-    reasoning: string
-    confidence: number
   }
 }
diff --git a/evals/git-evals2/gen-evals.ts b/evals/git-evals2/gen-evals.ts
@@ -2,41 +2,18 @@ import { execSync } from 'child_process'
 import { createTwoFilesPatch } from 'diff'
 import fs from 'fs'
 import path from 'path'
+import { mapLimit } from 'async'
 
-import { disableLiveUserInputCheck } from '@codebuff/backend/live-user-inputs'
-import { promptAiSdk } from '@codebuff/backend/llm-apis/vercel-ai-sdk/ai-sdk'
-import { models } from '@codebuff/common/old-constants'
 import { API_KEY_ENV_VAR } from '@codebuff/common/old-constants'
 import { getUserCredentials } from '@codebuff/npm-app/credentials'
-import { mapLimit } from 'async'
 
 import { CodebuffClient } from '../../sdk/src/client'
 import { extractRepoNameFromUrl } from '../git-evals/setup-test-repo'
 import { withTestRepoAndParent } from '../subagents/test-repo-utils'
-import { generatePromptFromCommit } from './prompt-generator'
+import { generateEvalTask } from './eval-task-generator'
 
 import type { EvalDataV2, EvalCommitV2, FileDiff } from './types'
 
-const SPEC_GENERATION_PROMPT = `Given a set of file changes and an optional description, write a clear specification describing WHAT needs to be implemented.
-First, use <thinking> tags to analyze the changes and determine what should go into the spec.
-
-Then, generate the spec.
-
-The spec should:
-1. Focus on the observable behavior or structure that needs to be implemented
-2. Not include implementation details or specific code
-3. Not prescribe HOW to make the change
-4. Be clear enough that a skilled developer or AI could implement it from scratch
-5. Be phrased as what needs to be done, not what was already done
-6. Cover all the changes shown across multiple files
-
-The spec will be used to test an AI coding assistant's ability to implement the described functionality.
-
-Please wrap your final specification in <spec></spec> tags.`
-
-const fingerprintId = 'evals-v2'
-const userInputId = 'evals-v2'
-
 function getFileContentAtCommit(
   repoPath: string,
   commitSha: string,
@@ -127,52 +104,6 @@ function getCommitMessage(repoPath: string, commitSha: string): string {
   }).trim()
 }
 
-async function generateSpecForFileDiffs(
-  fileDiffs: FileDiff[],
-  clientSessionId: string,
-): Promise<string> {
-  const fileContext = fileDiffs
-    .map(({ path, status, diff }) => {
-      let diffDescription = `File: ${path}\n`
-
-      if (status === 'added') {
-        diffDescription += `New file created\n${diff}\n`
-      } else if (status === 'deleted') {
-        diffDescription += `File deleted\n${diff}\n`
-      } else if (status === 'renamed') {
-        diffDescription += `File renamed\n${diff}\n`
-      } else {
-        diffDescription += `${diff}\n`
-      }
-
-      return diffDescription
-    })
-    .join('\n---\n')
-
-  const prompt = `${SPEC_GENERATION_PROMPT}\n\nFile Changes:\n${fileContext}`
-
-  try {
-    disableLiveUserInputCheck()
-    const response = await promptAiSdk({
-      messages: [{ role: 'user', content: prompt }],
-      model: models.openrouter_claude_sonnet_4,
-      clientSessionId,
-      fingerprintId,
-      userInputId,
-      userId: undefined,
-      logger: console,
-    })
-
-    const specMatch = response.match(/<spec>(.*?)<\/spec>/s)
-    const spec = specMatch ? specMatch[1].trim() : response.trim()
-
-    return spec || 'Failed to generate specification'
-  } catch (error) {
-    console.error('Error generating spec:', error)
-    return 'Failed to generate specification due to error'
-  }
-}
-
 export async function generateEvalFileV2({
   repoUrl,
   commitShas,
@@ -188,8 +119,6 @@ export async function generateEvalFileV2({
     apiKey: process.env[API_KEY_ENV_VAR] || getUserCredentials()?.authToken,
   })
 
-  const clientSessionId = `gen-evals-v2-${Math.random().toString(36).substring(2)}`
-
   console.log(`Processing ${commitShas.length} commits in parallel...`)
 
   const BATCH_SIZE = 5
@@ -212,18 +141,13 @@ export async function generateEvalFileV2({
           commitSha,
           parentSha,
         )
-        const spec = await generateSpecForFileDiffs(fileDiffs, clientSessionId)
-
-        console.log(
-          `Generated spec for ${commitSha.slice(0, 8)}: ${spec.substring(0, 100)}...`,
-        )
 
         const fullDiff = getFullDiff(repoPath, commitSha, parentSha)
         const commitMessage = getCommitMessage(repoPath, commitSha)
         const editedFilePaths = fileDiffs.map((f) => f.path)
 
-        console.log(`Generating prompt for ${commitSha.slice(0, 8)}...`)
-        const promptResult = await generatePromptFromCommit({
+        console.log(`Generating eval task for ${commitSha.slice(0, 8)}...`)
+        const taskResult = await generateEvalTask({
           client,
           input: {
             commitSha,
@@ -235,20 +159,22 @@ export async function generateEvalFileV2({
           },
         })
 
+        console.log(`Task ID: ${taskResult.id}`)
+        console.log(`Generated spec: ${taskResult.spec.substring(0, 100)}...`)
         console.log(
-          `Generated prompt: ${promptResult.prompt.substring(0, 100)}...`,
+          `Generated prompt: ${taskResult.prompt.substring(0, 100)}...`,
         )
         console.log(
-          `Supplemental files: ${promptResult.supplementalFiles.length} files`,
+          `Supplemental files: ${taskResult.supplementalFiles.length} files`,
         )
 
         return {
+          id: taskResult.id,
           sha: commitSha,
           parentSha,
-          spec,
-          id: promptResult.id,
-          prompt: promptResult.prompt,
-          supplementalFiles: promptResult.supplementalFiles,
+          spec: taskResult.spec,
+          prompt: taskResult.prompt,
+          supplementalFiles: taskResult.supplementalFiles,
           fileDiffs,
         }
       },
diff --git a/evals/git-evals2/migrate-evals-to-v2.ts b/evals/git-evals2/migrate-evals-to-v2.ts
@@ -82,10 +82,10 @@ async function migrateCommit(
         encoding: 'utf-8',
       }).trim()
 
-      console.log(`Generating prompt for ${commitSha.slice(0, 8)}...`)
+      console.log(`Generating task for ${commitSha.slice(0, 8)}...`)
 
-      const { generatePromptFromCommit } = await import('./prompt-generator')
-      const promptResult = await generatePromptFromCommit({
+      const { generateEvalTask } = await import('./eval-task-generator')
+      const taskResult = await generateEvalTask({
         client,
         input: {
           commitSha,
@@ -98,21 +98,24 @@ async function migrateCommit(
         agentDefinitions,
       })
 
+      console.log(`Task ID: ${taskResult.id}`)
       console.log(
-        `Generated prompt: ${promptResult.prompt.substring(0, 100)}...`,
+        `Generated spec: ${taskResult.spec.substring(0, 100)}...`,
       )
       console.log(
-        `Supplemental files: ${promptResult.supplementalFiles.length} files`,
+        `Generated prompt: ${taskResult.prompt.substring(0, 100)}...`,
+      )
+      console.log(
+        `Supplemental files: ${taskResult.supplementalFiles.length} files`,
       )
-      console.log(`Task ID: ${promptResult.id}`)
 
       return {
-        id: promptResult.id,
+        id: taskResult.id,
         sha: commitSha,
         parentSha,
-        spec: oldCommit.spec,
-        prompt: promptResult.prompt,
-        supplementalFiles: promptResult.supplementalFiles,
+        spec: taskResult.spec || oldCommit.spec,
+        prompt: taskResult.prompt,
+        supplementalFiles: taskResult.supplementalFiles,
         fileDiffs,
       }
     },