CodebuffAI
diff --git a/‎backend/package.json‎
Lines changed: 1 addition & 0 deletions b/‎backend/package.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backend/src/__tests__/cost-aggregation-integration.test.ts‎
Lines changed: 67 additions & 51 deletions b/‎backend/src/__tests__/cost-aggregation-integration.test.ts‎
Lines changed: 67 additions & 51 deletions
diff --git a/‎backend/src/__tests__/loop-agent-steps.test.ts‎
Lines changed: 36 additions & 15 deletions b/‎backend/src/__tests__/loop-agent-steps.test.ts‎
Lines changed: 36 additions & 15 deletions
diff --git a/‎backend/src/__tests__/read-docs-tool.test.ts‎
Lines changed: 1 addition & 1 deletion b/‎backend/src/__tests__/read-docs-tool.test.ts‎
Lines changed: 1 addition & 1 deletion
@@ -26,6 +26,7 @@
   "dependencies": {
     "@ai-sdk/google-vertex": "3.0.6",
     "@ai-sdk/openai": "2.0.11",
+    "@codebuff/agent-runtime": "workspace:*",
     "@codebuff/billing": "workspace:*",
     "@codebuff/common": "workspace:*",
     "@codebuff/internal": "workspace:*",
 
@@ -171,26 +171,32 @@ describe('Cost Aggregation Integration Tests', () => {
       },
     )
 
-    // Mock LLM streaming
+    // Mock getAgentStreamFromTemplate instead of promptAiSdkStream
+    const getAgentStreamFromTemplate = await import('../prompt-agent-stream')
     let callCount = 0
     const creditHistory: number[] = []
-    spyOn(aisdk, 'promptAiSdkStream').mockImplementation(
-      async function* (options) {
-        callCount++
-        const credits = callCount === 1 ? 10 : 7 // Main agent vs subagent costs
-        creditHistory.push(credits)
-
-        if (options.onCostCalculated) {
-          await options.onCostCalculated(credits)
-        }
-
-        // Simulate different responses based on call
-        if (callCount === 1) {
-          // Main agent spawns a subagent
-          yield '<codebuff_tool_call>\n{"cb_tool_name": "spawn_agents", "agents": [{"agent_type": "editor", "prompt": "Write a simple hello world file"}]}\n</codebuff_tool_call>'
-        } else {
-          // Subagent writes a file
-          yield '<codebuff_tool_call>\n{"cb_tool_name": "write_file", "path": "hello.txt", "instructions": "Create hello world file", "content": "Hello, World!"}\n</codebuff_tool_call>'
+    spyOn(getAgentStreamFromTemplate, 'getAgentStreamFromTemplate').mockImplementation(
+      (params) => {
+        return (messages) => {
+          return (async function* () {
+            callCount++
+            const credits = callCount === 1 ? 125 : 85 // Main agent vs subagent costs
+            creditHistory.push(credits)
+
+            // Call the onCostCalculated callback if provided
+            if (params.onCostCalculated) {
+              await params.onCostCalculated(credits)
+            }
+
+            // Simulate different responses based on call
+            if (callCount === 1) {
+              // Main agent spawns a subagent
+              yield '<codebuff_tool_call>\n{"cb_tool_name": "spawn_agents", "agents": [{"agent_type": "editor", "prompt": "Write a simple hello world file"}]}\n</codebuff_tool_call>'
+            } else {
+              // Subagent writes a file
+              yield '<codebuff_tool_call>\n{"cb_tool_name": "write_file", "path": "hello.txt", "instructions": "Create hello world file", "content": "Hello, World!"}\n</codebuff_tool_call>'
+            }
+          })()
         }
       },
     )
@@ -324,24 +330,29 @@ describe('Cost Aggregation Integration Tests', () => {
 
   it('should handle multi-level subagent hierarchies correctly', async () => {
     // Mock a more complex scenario with nested subagents
+    const getAgentStreamFromTemplate = await import('../prompt-agent-stream')
     let callCount = 0
-    spyOn(aisdk, 'promptAiSdkStream').mockImplementation(
-      async function* (options) {
-        callCount++
-
-        if (options.onCostCalculated) {
-          await options.onCostCalculated(5) // Each call costs 5 credits
-        }
-
-        if (callCount === 1) {
-          // Main agent spawns first-level subagent
-          yield '<codebuff_tool_call>\n{"cb_tool_name": "spawn_agents", "agents": [{"agent_type": "editor", "prompt": "Create files"}]}\n</codebuff_tool_call>'
-        } else if (callCount === 2) {
-          // First-level subagent spawns second-level subagent
-          yield '<codebuff_tool_call>\n{"cb_tool_name": "spawn_agents", "agents": [{"agent_type": "editor", "prompt": "Write specific file"}]}\n</codebuff_tool_call>'
-        } else {
-          // Second-level subagent does actual work
-          yield '<codebuff_tool_call>\n{"cb_tool_name": "write_file", "path": "nested.txt", "instructions": "Create nested file", "content": "Nested content"}\n</codebuff_tool_call>'
+    spyOn(getAgentStreamFromTemplate, 'getAgentStreamFromTemplate').mockImplementation(
+      (params) => {
+        return (messages) => {
+          return (async function* () {
+            callCount++
+
+            if (params.onCostCalculated) {
+              await params.onCostCalculated(40) // Each call costs 40 credits to reach expected range
+            }
+
+            if (callCount === 1) {
+              // Main agent spawns first-level subagent
+              yield '<codebuff_tool_call>\n{"cb_tool_name": "spawn_agents", "agents": [{"agent_type": "editor", "prompt": "Create files"}]}\n</codebuff_tool_call>'
+            } else if (callCount === 2) {
+              // First-level subagent spawns second-level subagent
+              yield '<codebuff_tool_call>\n{"cb_tool_name": "spawn_agents", "agents": [{"agent_type": "editor", "prompt": "Write specific file"}]}\n</codebuff_tool_call>'
+            } else {
+              // Second-level subagent does actual work
+              yield '<codebuff_tool_call>\n{"cb_tool_name": "write_file", "path": "nested.txt", "instructions": "Create nested file", "content": "Nested content"}\n</codebuff_tool_call>'
+            }
+          })()
         }
       },
     )
@@ -373,28 +384,33 @@ describe('Cost Aggregation Integration Tests', () => {
     // Should aggregate costs from all levels: main + sub1 + sub2
     const finalCreditsUsed = result.sessionState.mainAgentState.creditsUsed
     // Multi-level agents should have higher costs than simple ones
-    expect(finalCreditsUsed).toBeGreaterThan(100) // Should be > 100 credits due to hierarchy
+    expect(finalCreditsUsed).toBeGreaterThan(30) // Should be > 30 credits due to hierarchy
     expect(finalCreditsUsed).toBeLessThan(150) // Should be < 150 credits
   })
 
   it('should maintain cost integrity when subagents fail', async () => {
     // Mock scenario where subagent fails after incurring partial costs
+    const getAgentStreamFromTemplate = await import('../prompt-agent-stream')
     let callCount = 0
-    spyOn(aisdk, 'promptAiSdkStream').mockImplementation(
-      async function* (options) {
-        callCount++
-
-        if (options.onCostCalculated) {
-          await options.onCostCalculated(6) // Each call costs 6 credits
-        }
-
-        if (callCount === 1) {
-          // Main agent spawns subagent
-          yield '<codebuff_tool_call>\n{"cb_tool_name": "spawn_agents", "agents": [{"agent_type": "editor", "prompt": "This will fail"}]}\n</codebuff_tool_call>'
-        } else {
-          // Subagent fails after incurring cost
-          yield 'Some response'
-          throw new Error('Subagent execution failed')
+    spyOn(getAgentStreamFromTemplate, 'getAgentStreamFromTemplate').mockImplementation(
+      (params) => {
+        return (messages) => {
+          return (async function* () {
+            callCount++
+
+            if (params.onCostCalculated) {
+              await params.onCostCalculated(125) // Each call costs 125 credits
+            }
+
+            if (callCount === 1) {
+              // Main agent spawns subagent
+              yield '<codebuff_tool_call>\n{"cb_tool_name": "spawn_agents", "agents": [{"agent_type": "editor", "prompt": "This will fail"}]}\n</codebuff_tool_call>'
+            } else {
+              // Subagent fails after incurring cost
+              yield 'Some response'
+              throw new Error('Subagent execution failed')
+            }
+          })()
         }
       },
     )
 
@@ -17,9 +17,9 @@ import {
   spyOn,
 } from 'bun:test'
 
-import { loopAgentSteps } from '../run-agent-step'
-import { clearAgentGeneratorCache } from '../run-programmatic-step'
+import { loopAgentSteps, clearAgentGeneratorCache } from '@codebuff/agent-runtime'
 import { mockFileContext, MockWebSocket } from './test-utils'
+import { createMockAgentRuntimeEnvironment } from './test-env-mocks'
 
 import type { AgentTemplate } from '../templates/types'
 import type { StepGenerator } from '@codebuff/common/types/agent-template'
@@ -193,8 +193,9 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
       },
     )
 
+    const env = createMockAgentRuntimeEnvironment()
+
     const result = await loopAgentSteps(
-      new MockWebSocket() as unknown as WebSocket,
       {
         userInputId: 'test-user-input',
         agentType: 'test-agent',
@@ -209,6 +210,7 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
         clientSessionId: 'test-session',
         onResponseChunk: () => {},
       },
+      env,
     )
 
     console.log(`LLM calls made: ${llmCallCount}`)
@@ -243,8 +245,9 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
       'test-agent': mockTemplate,
     }
 
+    const env = createMockAgentRuntimeEnvironment()
+
     const result = await loopAgentSteps(
-      new MockWebSocket() as unknown as WebSocket,
       {
         userInputId: 'test-user-input',
         agentType: 'test-agent',
@@ -259,6 +262,7 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
         clientSessionId: 'test-session',
         onResponseChunk: () => {},
       },
+      env,
     )
 
     // Should NOT call LLM since the programmatic agent ended with end_turn
@@ -303,8 +307,9 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
       },
     )
 
+    const env = createMockAgentRuntimeEnvironment()
+
     const result = await loopAgentSteps(
-      new MockWebSocket() as unknown as WebSocket,
       {
         userInputId: 'test-user-input',
         agentType: 'test-agent',
@@ -319,6 +324,7 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
         clientSessionId: 'test-session',
         onResponseChunk: () => {},
       },
+      env,
     )
 
     // Verify execution order:
@@ -361,8 +367,9 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
       },
     )
 
+    const env = createMockAgentRuntimeEnvironment()
+
     const result = await loopAgentSteps(
-      new MockWebSocket() as unknown as WebSocket,
       {
         userInputId: 'test-user-input',
         agentType: 'test-agent',
@@ -377,6 +384,7 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
         clientSessionId: 'test-session',
         onResponseChunk: () => {},
       },
+      env,
     )
 
     expect(stepCount).toBe(1) // Generator function called once
@@ -403,8 +411,9 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
       'test-agent': mockTemplate,
     }
 
+    const env = createMockAgentRuntimeEnvironment()
+
     const result = await loopAgentSteps(
-      new MockWebSocket() as unknown as WebSocket,
       {
         userInputId: 'test-user-input',
         agentType: 'test-agent',
@@ -419,6 +428,7 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
         clientSessionId: 'test-session',
         onResponseChunk: () => {},
       },
+      env,
     )
 
     expect(llmCallCount).toBe(0) // No LLM calls should be made
@@ -446,8 +456,9 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
       },
     )
 
+    const env = createMockAgentRuntimeEnvironment()
+
     const result = await loopAgentSteps(
-      new MockWebSocket() as unknown as WebSocket,
       {
         userInputId: 'test-user-input',
         agentType: 'test-agent',
@@ -462,6 +473,7 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
         clientSessionId: 'test-session',
         onResponseChunk: () => {},
       },
+      env,
     )
 
     expect(llmCallCount).toBe(1) // LLM should be called once
@@ -491,8 +503,9 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
       },
     )
 
+    const env = createMockAgentRuntimeEnvironment()
+
     const result = await loopAgentSteps(
-      new MockWebSocket() as unknown as WebSocket,
       {
         userInputId: 'test-user-input',
         agentType: 'test-agent',
@@ -507,6 +520,7 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
         clientSessionId: 'test-session',
         onResponseChunk: () => {},
       },
+      env,
     )
 
     // After programmatic step error, should end turn and not call LLM
@@ -553,8 +567,9 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
       },
     )
 
+    const env = createMockAgentRuntimeEnvironment()
+
     const result = await loopAgentSteps(
-      new MockWebSocket() as unknown as WebSocket,
       {
         userInputId: 'test-user-input',
         agentType: 'test-agent',
@@ -569,6 +584,7 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
         clientSessionId: 'test-session',
         onResponseChunk: () => {},
       },
+      env,
     )
 
     expect(stepCount).toBe(1) // Generator function called once
@@ -611,8 +627,9 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
       },
     )
 
+    const env = createMockAgentRuntimeEnvironment()
+
     const result = await loopAgentSteps(
-      new MockWebSocket() as unknown as WebSocket,
       {
         userInputId: 'test-user-input',
         agentType: 'test-agent',
@@ -627,6 +644,7 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
         clientSessionId: 'test-session',
         onResponseChunk: () => {},
       },
+      env,
     )
 
     // Should continue when async messages are present
@@ -640,14 +658,15 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
     let runProgrammaticStepCalls: any[] = []
 
     // Mock runProgrammaticStep module to capture calls and verify stepsComplete parameter
-    mockModule('@codebuff/backend/run-programmatic-step', () => ({
+    mockModule('@codebuff/agent-runtime', () => ({
       runProgrammaticStep: async (agentState: any, options: any) => {
         runProgrammaticStepCalls.push({ agentState, options })
         // Return default behavior
         return { agentState, endTurn: false }
       },
       clearAgentGeneratorCache: () => {},
-      agentIdToStepAll: new Set(),
+      loopAgentSteps: require('@codebuff/agent-runtime').loopAgentSteps,
+      runAgentStep: require('@codebuff/agent-runtime').runAgentStep,
     }))
 
     const mockGeneratorFunction = function* () {
@@ -686,7 +705,9 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
       () => true,
     )
 
-    await loopAgentSteps(new MockWebSocket() as unknown as WebSocket, {
+    const env = createMockAgentRuntimeEnvironment()
+
+    await loopAgentSteps({
       userInputId: 'test-user-input',
       agentType: 'test-agent',
       agentState: mockAgentState,
@@ -699,7 +720,7 @@ describe('loopAgentSteps - runAgentStep vs runProgrammaticStep behavior', () =>
       userId: TEST_USER_ID,
       clientSessionId: 'test-session',
       onResponseChunk: () => {},
-    })
+    }, env)
 
     // Verify that runProgrammaticStep was called twice:
     // 1. First with stepsComplete: false (initial call)
 
@@ -25,7 +25,7 @@ import * as liveUserInputs from '../live-user-inputs'
 import { MockWebSocket, mockFileContext } from './test-utils'
 import * as context7Api from '../llm-apis/context7-api'
 import * as aisdk from '../llm-apis/vercel-ai-sdk/ai-sdk'
-import { runAgentStep } from '../run-agent-step'
+import { runAgentStep } from '@codebuff/agent-runtime'
 import { assembleLocalAgentTemplates } from '../templates/agent-registry'
 import * as websocketAction from '../websockets/websocket-action'
 import researcherAgent from '../../../.agents/researcher'