Commit 185ef39

feat: new etl agents
1 parent c8256bd commit 185ef39

File tree

4 files changed: +893 -0 lines changed


.agents/registry/etl-manager.ts

Lines changed: 172 additions & 0 deletions
@@ -0,0 +1,172 @@
import type { AgentDefinition, ToolCall } from '../types/agent-definition'

/**
 * ETL Manager Agent
 *
 * Coordinates the ETL pipeline using handleSteps for sequential execution.
 * A lightweight shim that spawns extract → transform → load in sequence.
 */

const agent: AgentDefinition = {
  id: 'etl-manager',
  displayName: 'ETL Pipeline Manager',
  model: 'openai/gpt-5',
  version: '1.0.0',
  publisher: 'web-demo',

  toolNames: ['spawn_agents', 'think_deeply', 'add_message'],

  outputMode: 'last_message',
  stepPrompt: '',
  includeMessageHistory: true,

  spawnableAgents: ['extract-agent', 'transform-agent', 'load-agent'],

  handleSteps: function* ({ prompt, params }) {
    // Step 1: Generate a context-aware prompt for the extract agent
    const extractPrompt = `Analyzing user request "${prompt}" to generate optimal extraction strategy. Consider: data domain (${params?.domain || 'unknown'}), specific search terms needed, target sources, and query refinement for maximum relevance.`

    const { toolResult: extractResults } = yield {
      toolName: 'spawn_agents',
      input: {
        agents: [
          {
            agent_type: 'extract-agent',
            prompt: extractPrompt,
            params: params?.extractParams || {},
          },
        ],
      },
    } satisfies ToolCall
    if (!extractResults || extractResults.length === 0) {
      yield {
        toolName: 'add_message',
        input: {
          role: 'user',
          content: 'Extract step failed.',
        },
      } satisfies ToolCall
      return
    }
    const extractResult =
      extractResults[0]?.type === 'json'
        ? extractResults[0].value
        : extractResults[0]

    // Step 2: Generate a context-aware prompt for the transform agent
    const transformPrompt = `Processing extracted data from previous step. Need to transform raw data into canonical schema. Consider: data quality, normalization needs, deduplication strategy, and enrichment opportunities based on extracted content.`

    const { toolResult: transformResults } = yield {
      toolName: 'spawn_agents',
      input: {
        agents: [
          {
            agent_type: 'transform-agent',
            prompt: transformPrompt,
            params: {
              ...params?.transformParams,
              extractResult: extractResult,
            },
          },
        ],
      },
    } satisfies ToolCall
    if (!transformResults || transformResults.length === 0) {
      yield {
        toolName: 'add_message',
        input: {
          role: 'user',
          content: 'Transform step failed.',
        },
      } satisfies ToolCall
      return
    }
    const transformResult =
      transformResults[0]?.type === 'json'
        ? transformResults[0].value
        : transformResults[0]

    // Step 3: Generate a context-aware prompt for the load agent
    const loadPrompt = `Final filtering and ranking phase for user request "${prompt}". Need to apply user constraints, score relevance, and rank results. Consider: user preferences, contextual relevance, quality metrics, and practical constraints.`

    const { toolResult: loadResults } = yield {
      toolName: 'spawn_agents',
      input: {
        agents: [
          {
            agent_type: 'load-agent',
            prompt: loadPrompt,
            params: {
              ...params?.loadParams,
              transformResult: transformResult,
            },
          },
        ],
      },
    } satisfies ToolCall
    if (!loadResults || loadResults.length === 0) {
      yield {
        toolName: 'add_message',
        input: {
          role: 'user',
          content: 'Load step failed.',
        },
      } satisfies ToolCall
      return
    }
    const loadResult =
      loadResults[0]?.type === 'json' ? loadResults[0].value : loadResults[0]

    // Return final ETL results
    yield {
      toolName: 'add_message',
      input: {
        role: 'user',
        content:
          typeof loadResult === 'string'
            ? loadResult
            : JSON.stringify(loadResult),
      },
    } satisfies ToolCall
  },

  inputSchema: {
    prompt: {
      type: 'string',
      description:
        'The data processing request to execute through the ETL pipeline',
    },
    params: {
      type: 'object',
      properties: {
        domain: {
          type: 'string',
          description:
            'Data domain for ETL processing, e.g. places, events, projects',
        },
        extractParams: {
          type: 'object',
          description: 'Any special parameters for the extract agent',
        },
        transformParams: {
          type: 'object',
          description: 'Any special parameters for the transform agent',
        },
        loadParams: {
          type: 'object',
          description: 'Any special parameters for the load agent',
        },
      },
    },
  },

  systemPrompt:
    'You are an ETL pipeline manager that coordinates sequential data processing through extract, transform, and load stages.',

  spawnerPrompt:
    'Use this agent to execute a complete ETL pipeline for data processing requests',

  instructionsPrompt: '',
}

export default agent
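The three stages above repeat an identical spawn → unwrap → bail-out sequence. As a design note, the pattern could be factored into a shared generator helper. The sketch below is a minimal, hypothetical refactor; the name runStage, its signature, and the loose typing of the yielded results are assumptions based on how this file reads toolResult, not part of the commit:

// Hypothetical helper: spawn a single agent, unwrap its result,
// or emit a failure message and signal the caller to stop.
function* runStage(
  agentType: string,
  stagePrompt: string,
  stageParams: Record<string, unknown>,
  failureMessage: string,
): Generator<ToolCall, unknown, any> {
  const { toolResult: results } = yield {
    toolName: 'spawn_agents',
    input: {
      agents: [{ agent_type: agentType, prompt: stagePrompt, params: stageParams }],
    },
  }
  if (!results || results.length === 0) {
    yield {
      toolName: 'add_message',
      input: { role: 'user', content: failureMessage },
    }
    return undefined
  }
  // Same unwrap rule the manager uses: prefer the JSON payload when present.
  return results[0]?.type === 'json' ? results[0].value : results[0]
}

// Each stage in handleSteps would then collapse to a single delegation:
//   const extractResult = yield* runStage(
//     'extract-agent', extractPrompt, params?.extractParams || {}, 'Extract step failed.')
//   if (extractResult === undefined) return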

.agents/registry/extract-agent.ts

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
import type { AgentDefinition } from '../types/agent-definition'

/**
 * Extract Agent
 *
 * Handles data extraction from web sources using web_search.
 * First stage of the ETL pipeline - pulls raw/semi-structured content.
 */

const agent: AgentDefinition = {
  id: 'extract-agent',
  displayName: 'Extract Agent',
  model: 'anthropic/claude-4-sonnet-20250522',
  version: '1.0.0',
  outputMode: 'last_message',
  includeMessageHistory: false,

  toolNames: ['web_search', 'end_turn'],

  spawnableAgents: [],

  instructionsPrompt: `You are the Extract Agent - the first stage of the ETL pipeline.

Your role:
1. Use web_search to fetch raw data from multiple sources
2. Handle pagination, rate limits, and retries
3. Output raw artifacts with rich metadata
4. Support incremental extraction with caching

Extraction Strategies by Domain:

Places (cafés, venues):
- Query patterns: "[location] coffee shops", "[location] coworking spaces"
- Sources: Yelp, Google Maps, Foursquare
- Extract: name, address, hours, ratings, amenities

Events (meetups, conferences):
- Query patterns: "[location] tech meetups", "[date] conferences [location]"
- Sources: Meetup.com, Eventbrite, Facebook Events
- Extract: title, date/time, venue, capacity, cost, organizer

Projects (startups, opportunities):
- Query patterns: "[location] startups", "[industry] companies [location]"
- Sources: AngelList, Crunchbase, TechCrunch
- Extract: name, stage, funding, team, industry, description

Caching & Incrementality:
- Cache key: hash(domain, location, timeWindow, sources)
- TTL: 1 hour for real-time data, 24 hours for static data
- Merge strategy: append new results, dedupe by URL/ID

Error Handling:
- Retry with exponential backoff (2^n seconds, max 60s)
- Graceful degradation: partial results OK if >50% coverage
- Source rotation: if one source fails, try alternatives
- Rate limit respect: pause when limits hit

Don't worry about the output format - just make sure all the data is well-represented.
`,

  spawnerPrompt: `Use this agent to extract raw data from web sources`,

  inputSchema: {
    prompt: {
      type: 'string',
      description: 'The user request for data extraction',
    },
    params: {
      type: 'object',
      properties: {
        domain: {
          type: 'string',
          description: 'Data domain for schema selection',
        },
      },
    },
  },

  systemPrompt: `You are the Extract Agent - web data harvesting specialist.

Extract data systematically:
1. Build comprehensive search queries for the domain
2. Execute web_search with retry/backoff logic
3. Collect raw results with full provenance tracking
4. Handle pagination and rate limits gracefully
5. Output structured artifacts for downstream processing

Speak like a data extraction system:
"[EXTRACT] Harvesting places data from 3 sources..."
"[QUERY] SF coffee shops SOMA - 47 results found"
"[CACHE] Artifact saved: /data/etl/extract/abc123.json"`,

  stepPrompt: `Extract raw data from web sources using web_search tool.`,
}

export default agent
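The instructionsPrompt above specifies the caching and retry policies only as natural-language guidance to the model; the agent itself ships no code for them. For reference, here is a minimal sketch of what those two policies could look like in TypeScript. It assumes Node's built-in crypto module, and the names cacheKey, withBackoff, and maxAttempts are illustrative, not part of this commit:

import { createHash } from 'node:crypto'

// Cache key: hash(domain, location, timeWindow, sources), per the prompt.
// Sources are sorted so the key is order-independent.
function cacheKey(
  domain: string,
  location: string,
  timeWindow: string,
  sources: string[],
): string {
  return createHash('sha256')
    .update(JSON.stringify([domain, location, timeWindow, [...sources].sort()]))
    .digest('hex')
}

// Retry with exponential backoff: 2^n seconds per attempt, capped at 60s.
async function withBackoff<T>(fn: () => Promise<T>, maxAttempts = 5): Promise<T> {
  for (let attempt = 0; ; attempt++) {
    try {
      return await fn()
    } catch (err) {
      if (attempt + 1 >= maxAttempts) throw err
      const delaySeconds = Math.min(2 ** attempt, 60)
      await new Promise((resolve) => setTimeout(resolve, delaySeconds * 1000))
    }
  }
}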
