-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcodeagent_retry.go
More file actions
256 lines (224 loc) · 6.47 KB
/
codeagent_retry.go
File metadata and controls
256 lines (224 loc) · 6.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
package eyrie
import (
"context"
"fmt"
"strings"
"sync"
"time"
)
// CodeAgentRetry provides intelligent retry and fallback strategies
// specifically for code agent workloads. Unlike generic retry, this
// understands code-specific failures and adapts accordingly.
//
// Use NewCodeAgentRetry to obtain an instance: it allocates the strategy map
// and registers the default strategies.
type CodeAgentRetry struct {
	// mu is held by DecideRetry and Stats while they read or modify the
	// fields below.
	mu sync.Mutex
	// strategies maps an error type key, as produced by classifyError
	// (for example "rate_limit"), to the strategy used for that failure.
	strategies map[string]*RetryStrategy
	// history holds recorded retry attempts in the order they were appended.
	// recordRetry trims it so that it does not grow without bound.
	history []RetryRecord
}
// RetryStrategy describes how one class of failure is retried: how many
// attempts are allowed, how the wait between attempts grows, and which
// model/provider to switch to, if any.
type RetryStrategy struct {
	// Name is a human-readable label for the strategy.
	Name string
	// MaxRetries is the number of recent attempts allowed before the
	// strategy gives up or switches to its fallback.
	MaxRetries int
	// BaseDelay is the wait before the first retry; MaxDelay is the upper
	// bound applied to the computed wait.
	BaseDelay, MaxDelay time.Duration
	// Backoff is the factor the delay is multiplied by for each earlier
	// attempt.
	Backoff float64
	// FallbackModel and FallbackProvider name the model and provider to
	// switch to on failure. An empty FallbackModel means there is no
	// fallback.
	FallbackModel, FallbackProvider string
}
// RetryRecord is one entry in the retry history: which provider and model
// failed, how the failure was classified, and what became of the attempt.
type RetryRecord struct {
	// Timestamp is when the attempt was recorded.
	Timestamp time.Time
	// Provider and Model identify the backend that produced the error.
	Provider, Model string
	// ErrorType is the classification key (for example "rate_limit");
	// ErrorMessage is the original error text.
	ErrorType, ErrorMessage string
	// RetryCount is the attempt number associated with this record.
	RetryCount int
	// Recovered reports whether the retry eventually succeeded;
	// FallbackUsed reports whether a fallback model was involved.
	Recovered, FallbackUsed bool
}
// NewCodeAgentRetry returns a ready-to-use retry system: the strategy map is
// allocated and filled with the default code-agent strategies, and the
// history slice is pre-sized.
func NewCodeAgentRetry() *CodeAgentRetry {
	// Initial capacity of the history slice.
	const historyCap = 1000

	retry := new(CodeAgentRetry)
	retry.strategies = map[string]*RetryStrategy{}
	retry.history = make([]RetryRecord, 0, historyCap)
	retry.registerDefaults()
	return retry
}
// registerDefaults installs the built-in strategies for the failure classes
// a code agent commonly hits. Existing entries under the same keys are
// overwritten. The strategies map must already be allocated.
func (cr *CodeAgentRetry) registerDefaults() {
	defaults := map[string]*RetryStrategy{
		// Rate limiting: wait and retry.
		"rate_limit": {
			Name:       "Rate Limit",
			MaxRetries: 5,
			BaseDelay:  5 * time.Second,
			MaxDelay:   60 * time.Second,
			Backoff:    2.0,
		},
		// Context length exceeded: move to a model with a larger context.
		"context_length": {
			Name:             "Context Length",
			MaxRetries:       2,
			BaseDelay:        1 * time.Second,
			FallbackModel:    "claude-3-5-sonnet",
			FallbackProvider: "anthropic",
		},
		// Tool execution failure: retry with a mild backoff.
		"tool_failure": {
			Name:       "Tool Failure",
			MaxRetries: 3,
			BaseDelay:  2 * time.Second,
			Backoff:    1.5,
		},
		// Token budget exceeded: move to a cheaper model.
		"budget_exceeded": {
			Name:             "Budget Exceeded",
			MaxRetries:       1,
			FallbackModel:    "gpt-4o-mini",
			FallbackProvider: "openai",
		},
		// Server error: retry with backoff.
		"server_error": {
			Name:       "Server Error",
			MaxRetries: 3,
			BaseDelay:  3 * time.Second,
			MaxDelay:   30 * time.Second,
			Backoff:    2.0,
		},
		// Timeout: retry with backoff.
		"timeout": {
			Name:       "Timeout",
			MaxRetries: 2,
			BaseDelay:  5 * time.Second,
			Backoff:    2.0,
		},
	}

	for key, strategy := range defaults {
		cr.strategies[key] = strategy
	}
}
// DecideRetry determines how to handle a failure.
//
// The error is classified with classifyError and matched to a registered
// strategy; error types without a strategy of their own use the
// "server_error" strategy. Attempts recorded during the last five minutes
// for the same error type, provider and model count against the strategy's
// MaxRetries:
//
//   - Below the limit, the attempt is recorded and the decision asks for a
//     retry after BaseDelay multiplied by Backoff once per earlier attempt,
//     capped at MaxDelay.
//   - At the limit, the decision switches to the strategy's fallback model
//     when one is configured and the caller is not already using it;
//     otherwise ShouldRetry is false.
//
// A zero MaxDelay is treated as "no cap" and a non-positive Backoff as
// "constant delay", so strategies that leave those fields unset still wait
// BaseDelay instead of retrying immediately.
//
// A nil err yields ShouldRetry false. ctx is accepted for API compatibility
// and is not consulted. The returned decision is never nil.
func (cr *CodeAgentRetry) DecideRetry(ctx context.Context, err error, provider, model string) *RetryDecision {
	// Nothing failed, so there is nothing to classify or retry.
	if err == nil {
		return &RetryDecision{
			ShouldRetry: false,
			Reason:      "no error to retry",
		}
	}

	cr.mu.Lock()
	defer cr.mu.Unlock()

	errorType := classifyError(err)
	strategy, exists := cr.strategies[errorType]
	if !exists || strategy == nil {
		strategy = cr.strategies["server_error"] // default
	}
	if strategy == nil {
		// Only reachable when the default strategies were never registered.
		return &RetryDecision{
			ShouldRetry: false,
			Reason:      fmt.Sprintf("no retry strategy for %s", errorType),
		}
	}

	// Check if we've exceeded max retries for this error type.
	recentRetries := cr.countRecentRetries(errorType, provider, model)
	if recentRetries >= strategy.MaxRetries {
		if strategy.FallbackModel != "" {
			// NOTE(review): an empty FallbackProvider is treated as "keep the
			// current provider" — confirm against callers.
			sameProvider := strategy.FallbackProvider == "" || strategy.FallbackProvider == provider
			// Switching to the target that just failed would make the caller
			// retry the same model with no delay, over and over.
			if strategy.FallbackModel != model || !sameProvider {
				return &RetryDecision{
					ShouldRetry:      true,
					Delay:            0,
					Reason:           fmt.Sprintf("max retries exceeded for %s, switching to fallback", errorType),
					FallbackModel:    strategy.FallbackModel,
					FallbackProvider: strategy.FallbackProvider,
				}
			}
		}
		return &RetryDecision{
			ShouldRetry: false,
			Reason:      fmt.Sprintf("max retries exceeded for %s", errorType),
		}
	}

	// An unset Backoff would otherwise zero the delay after the first retry.
	backoff := strategy.Backoff
	if backoff <= 0 {
		backoff = 1
	}

	// Calculate delay with exponential backoff.
	delay := strategy.BaseDelay
	for i := 0; i < recentRetries; i++ {
		delay = time.Duration(float64(delay) * backoff)
	}
	// Only clamp when a cap is configured; an unset MaxDelay must not force
	// the delay down to zero.
	if strategy.MaxDelay > 0 && delay > strategy.MaxDelay {
		delay = strategy.MaxDelay
	}

	// Record the retry attempt so it counts against MaxRetries next time.
	cr.recordRetry(provider, model, errorType, err.Error())

	return &RetryDecision{
		ShouldRetry: true,
		Delay:       delay,
		Reason:      fmt.Sprintf("retrying %s (attempt %d/%d)", errorType, recentRetries+1, strategy.MaxRetries),
	}
}
// RetryDecision is the outcome of evaluating a failure: whether to try
// again, how long to wait first, and whether to change model/provider.
type RetryDecision struct {
	// ShouldRetry reports whether the caller should make another attempt.
	ShouldRetry bool
	// Delay is how long to wait before that attempt.
	Delay time.Duration
	// Reason is a human-readable explanation of the decision.
	Reason string
	// FallbackModel and FallbackProvider are set when the caller should
	// switch to a different model and provider for the next attempt; they
	// are empty otherwise.
	FallbackModel, FallbackProvider string
}
// recordRetry appends a new, not-yet-recovered attempt to the history,
// stamped with the current time. Once the history grows past 1000 entries
// the oldest 500 are dropped. It does no locking of its own; DecideRetry
// calls it with cr.mu held.
func (cr *CodeAgentRetry) recordRetry(provider, model, errorType, errorMsg string) {
	entry := RetryRecord{
		Timestamp:    time.Now(),
		Provider:     provider,
		Model:        model,
		ErrorType:    errorType,
		ErrorMessage: errorMsg,
	}
	cr.history = append(cr.history, entry)

	// Keep the history bounded.
	if n := len(cr.history); n > 1000 {
		cr.history = cr.history[500:n]
	}
}
// countRecentRetries returns how many history entries from the last five
// minutes match the given error type, provider and model. It does no
// locking of its own; DecideRetry calls it with cr.mu held.
func (cr *CodeAgentRetry) countRecentRetries(errorType, provider, model string) int {
	const window = 5 * time.Minute
	threshold := time.Now().Add(-window)

	matches := 0
	for i := range cr.history {
		rec := &cr.history[i]
		if !rec.Timestamp.After(threshold) {
			continue // outside the look-back window
		}
		if rec.ErrorType != errorType || rec.Provider != provider || rec.Model != model {
			continue
		}
		matches++
	}
	return matches
}
// classifyError maps an error to one of the strategy keys by looking for
// well-known substrings in its lower-cased message.
//
// The checks run in a fixed order and the first match wins: "rate_limit",
// "context_length", "budget_exceeded", "timeout", "server_error",
// "tool_failure". A nil error, or a message that matches nothing, yields
// "unknown".
func classifyError(err error) string {
	// Calling Error on a nil error would panic.
	if err == nil {
		return "unknown"
	}

	lower := strings.ToLower(err.Error())
	// hasAny reports whether the message contains at least one of the needles.
	hasAny := func(needles ...string) bool {
		for _, needle := range needles {
			if strings.Contains(lower, needle) {
				return true
			}
		}
		return false
	}

	switch {
	case hasAny("rate limit", "429"):
		return "rate_limit"
	case hasAny("context length", "too long"):
		return "context_length"
	case hasAny("budget", "cost"):
		return "budget_exceeded"
	case hasAny("timeout", "deadline"):
		return "timeout"
	// 502 and 504 are gateway-side failures and belong with 500 and 503.
	case hasAny("500", "502", "503", "504", "server"):
		return "server_error"
	case hasAny("tool", "function"):
		return "tool_failure"
	default:
		return "unknown"
	}
}
// Stats summarizes the recorded history under cr.mu. The result has four
// keys: "total_retries" (number of records), "recovered" and
// "fallbacks_used" (records with the matching flag set), and
// "recovery_rate" (recovered divided by total, with an empty history
// counted as one record so the rate is 0 rather than NaN).
func (cr *CodeAgentRetry) Stats() map[string]interface{} {
	cr.mu.Lock()
	defer cr.mu.Unlock()

	var recovered, fallbacks int
	for i := range cr.history {
		rec := &cr.history[i]
		if rec.Recovered {
			recovered++
		}
		if rec.FallbackUsed {
			fallbacks++
		}
	}

	total := len(cr.history)
	// Guard the division against an empty history.
	rate := float64(recovered) / float64(imax(1, total))

	summary := make(map[string]interface{}, 4)
	summary["total_retries"] = total
	summary["recovered"] = recovered
	summary["fallbacks_used"] = fallbacks
	summary["recovery_rate"] = rate
	return summary
}
// imax returns the larger of a and b.
func imax(a, b int) int {
	if b >= a {
		return b
	}
	return a
}