Commit 185ef39

feat: new etl agents
1 parent c8256bd commit 185ef39

File tree

4 files changed: +893 -0 lines changed


.agents/registry/etl-manager.ts

Lines changed: 172 additions & 0 deletions
@@ -0,0 +1,172 @@
import type { AgentDefinition, ToolCall } from '../types/agent-definition'

/**
 * ETL Manager Agent
 *
 * Coordinates the ETL pipeline using handleSteps for sequential execution.
 * A lightweight shim that spawns extract → transform → load in sequence.
 */

const agent: AgentDefinition = {
  id: 'etl-manager',
  displayName: 'ETL Pipeline Manager',
  model: 'openai/gpt-5',
  version: '1.0.0',
  publisher: 'web-demo',

  toolNames: ['spawn_agents', 'think_deeply', 'add_message'],

  outputMode: 'last_message',
  stepPrompt: '',
  includeMessageHistory: true,

  spawnableAgents: ['extract-agent', 'transform-agent', 'load-agent'],

  handleSteps: function* ({ prompt, params }) {
    // Step 1: Generate a context-aware prompt for the extract agent
    const extractPrompt = `Analyzing user request "${prompt}" to generate optimal extraction strategy. Consider: data domain (${params?.domain || 'unknown'}), specific search terms needed, target sources, and query refinement for maximum relevance.`

    const { toolResult: extractResults } = yield {
      toolName: 'spawn_agents',
      input: {
        agents: [
          {
            agent_type: 'extract-agent',
            prompt: extractPrompt,
            params: params?.extractParams || {},
          },
        ],
      },
    } satisfies ToolCall
    if (!extractResults || extractResults.length === 0) {
      yield {
        toolName: 'add_message',
        input: {
          role: 'user',
          content: 'Extract step failed.',
        },
      } satisfies ToolCall
      return
    }
    const extractResult =
      extractResults[0]?.type === 'json'
        ? extractResults[0].value
        : extractResults[0]

    // Step 2: Generate a context-aware prompt for the transform agent
    const transformPrompt = `Processing extracted data from previous step. Need to transform raw data into canonical schema. Consider: data quality, normalization needs, deduplication strategy, and enrichment opportunities based on extracted content.`

    const { toolResult: transformResults } = yield {
      toolName: 'spawn_agents',
      input: {
        agents: [
          {
            agent_type: 'transform-agent',
            prompt: transformPrompt,
            params: {
              ...params?.transformParams,
              extractResult: extractResult,
            },
          },
        ],
      },
    } satisfies ToolCall
    if (!transformResults || transformResults.length === 0) {
      yield {
        toolName: 'add_message',
        input: {
          role: 'user',
          content: 'Transform step failed.',
        },
      } satisfies ToolCall
      return
    }
    const transformResult =
      transformResults[0]?.type === 'json'
        ? transformResults[0].value
        : transformResults[0]

    // Step 3: Generate a context-aware prompt for the load agent
    const loadPrompt = `Final filtering and ranking phase for user request "${prompt}". Need to apply user constraints, score relevance, and rank results. Consider: user preferences, contextual relevance, quality metrics, and practical constraints.`

    const { toolResult: loadResults } = yield {
      toolName: 'spawn_agents',
      input: {
        agents: [
          {
            agent_type: 'load-agent',
            prompt: loadPrompt,
            params: {
              ...params?.loadParams,
              transformResult: transformResult,
            },
          },
        ],
      },
    } satisfies ToolCall
    if (!loadResults || loadResults.length === 0) {
      yield {
        toolName: 'add_message',
        input: {
          role: 'user',
          content: 'Load step failed.',
        },
      } satisfies ToolCall
      return
    }
    const loadResult =
      loadResults[0]?.type === 'json' ? loadResults[0].value : loadResults[0]

    // Return final ETL results
    yield {
      toolName: 'add_message',
      input: {
        role: 'user',
        content:
          typeof loadResult === 'string'
            ? loadResult
            : JSON.stringify(loadResult),
      },
    } satisfies ToolCall
  },

  inputSchema: {
    prompt: {
      type: 'string',
      description:
        'The data processing request to execute through the ETL pipeline',
    },
    params: {
      type: 'object',
      properties: {
        domain: {
          type: 'string',
          description:
            'Data domain for ETL processing, e.g. places, events, projects',
        },
        extractParams: {
          type: 'object',
          description: 'Any special parameters for the extract agent',
        },
        transformParams: {
          type: 'object',
          description: 'Any special parameters for the transform agent',
        },
        loadParams: {
          type: 'object',
          description: 'Any special parameters for the load agent',
        },
      },
    },
  },

  systemPrompt:
    'You are an ETL pipeline manager that coordinates sequential data processing through extract, transform, and load stages.',

  spawnerPrompt:
    'Use this agent to execute a complete ETL pipeline for data processing requests',

  instructionsPrompt: '',
}

export default agent
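The three stages above repeat an identical spawn → unwrap → bail-out sequence. As a design note, the pattern could be factored into a shared generator helper. The sketch below is a minimal, hypothetical refactor; the name runStage, its signature, and the loose typing of the yielded results are assumptions based on how this file reads toolResult, not part of the commit:

// Hypothetical helper: spawn a single agent, unwrap its result,
// or emit a failure message and signal the caller to stop.
function* runStage(
  agentType: string,
  stagePrompt: string,
  stageParams: Record<string, unknown>,
  failureMessage: string,
): Generator<ToolCall, unknown, any> {
  const { toolResult: results } = yield {
    toolName: 'spawn_agents',
    input: {
      agents: [{ agent_type: agentType, prompt: stagePrompt, params: stageParams }],
    },
  }
  if (!results || results.length === 0) {
    yield {
      toolName: 'add_message',
      input: { role: 'user', content: failureMessage },
    }
    return undefined
  }
  // Same unwrap rule the manager uses: prefer the JSON payload when present.
  return results[0]?.type === 'json' ? results[0].value : results[0]
}

// Each stage in handleSteps would then collapse to a single delegation:
//   const extractResult = yield* runStage(
//     'extract-agent', extractPrompt, params?.extractParams || {}, 'Extract step failed.')
//   if (extractResult === undefined) return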

.agents/registry/extract-agent.ts

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
import type { AgentDefinition } from '../types/agent-definition'

/**
 * Extract Agent
 *
 * Handles data extraction from web sources using web_search.
 * First stage of the ETL pipeline - pulls raw/semi-structured content.
 */

const agent: AgentDefinition = {
  id: 'extract-agent',
  displayName: 'Extract Agent',
  model: 'anthropic/claude-4-sonnet-20250522',
  version: '1.0.0',
  outputMode: 'last_message',
  includeMessageHistory: false,

  toolNames: ['web_search', 'end_turn'],

  spawnableAgents: [],

  instructionsPrompt: `You are the Extract Agent - the first stage of the ETL pipeline.

Your role:
1. Use web_search to fetch raw data from multiple sources
2. Handle pagination, rate limits, and retries
3. Output raw artifacts with rich metadata
4. Support incremental extraction with caching

Extraction Strategies by Domain:

Places (cafés, venues):
- Query patterns: "[location] coffee shops", "[location] coworking spaces"
- Sources: Yelp, Google Maps, Foursquare
- Extract: name, address, hours, ratings, amenities

Events (meetups, conferences):
- Query patterns: "[location] tech meetups", "[date] conferences [location]"
- Sources: Meetup.com, Eventbrite, Facebook Events
- Extract: title, date/time, venue, capacity, cost, organizer

Projects (startups, opportunities):
- Query patterns: "[location] startups", "[industry] companies [location]"
- Sources: AngelList, Crunchbase, TechCrunch
- Extract: name, stage, funding, team, industry, description

Caching & Incrementality:
- Cache key: hash(domain, location, timeWindow, sources)
- TTL: 1 hour for real-time data, 24 hours for static data
- Merge strategy: append new results, dedupe by URL/ID

Error Handling:
- Retry with exponential backoff (2^n seconds, max 60s)
- Graceful degradation: partial results OK if >50% coverage
- Source rotation: if one source fails, try alternatives
- Rate limit respect: pause when limits hit

Don't worry about the output format - just make sure all the data is well-represented.
`,

  spawnerPrompt: `Use this agent to extract raw data from web sources`,

  inputSchema: {
    prompt: {
      type: 'string',
      description: 'The user request for data extraction',
    },
    params: {
      type: 'object',
      properties: {
        domain: {
          type: 'string',
          description: 'Data domain for schema selection',
        },
      },
    },
  },

  systemPrompt: `You are the Extract Agent - web data harvesting specialist.

Extract data systematically:
1. Build comprehensive search queries for the domain
2. Execute web_search with retry/backoff logic
3. Collect raw results with full provenance tracking
4. Handle pagination and rate limits gracefully
5. Output structured artifacts for downstream processing

Speak like a data extraction system:
"[EXTRACT] Harvesting places data from 3 sources..."
"[QUERY] SF coffee shops SOMA - 47 results found"
"[CACHE] Artifact saved: /data/etl/extract/abc123.json"`,

  stepPrompt: `Extract raw data from web sources using web_search tool.`,
}

export default agent
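The instructionsPrompt above specifies the caching and retry policies only as natural-language guidance to the model; the agent itself ships no code for them. For reference, here is a minimal sketch of what those two policies could look like in TypeScript. It assumes Node's built-in crypto module, and the names cacheKey, withBackoff, and maxAttempts are illustrative, not part of this commit:

import { createHash } from 'node:crypto'

// Cache key: hash(domain, location, timeWindow, sources), per the prompt.
// Sources are sorted so the key is order-independent.
function cacheKey(
  domain: string,
  location: string,
  timeWindow: string,
  sources: string[],
): string {
  return createHash('sha256')
    .update(JSON.stringify([domain, location, timeWindow, [...sources].sort()]))
    .digest('hex')
}

// Retry with exponential backoff: 2^n seconds per attempt, capped at 60s.
async function withBackoff<T>(fn: () => Promise<T>, maxAttempts = 5): Promise<T> {
  for (let attempt = 0; ; attempt++) {
    try {
      return await fn()
    } catch (err) {
      if (attempt + 1 >= maxAttempts) throw err
      const delaySeconds = Math.min(2 ** attempt, 60)
      await new Promise((resolve) => setTimeout(resolve, delaySeconds * 1000))
    }
  }
}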
