Skip to content

Commit 92aeeeb

Browse files
committed
feat(db): improve database error handling with expanded retry logic
- Add comprehensive PostgreSQL error code handling for retryable errors: - Class 40 (transaction rollback): serialization failures, deadlocks - Class 08 (connection exceptions): connection failures, timeouts - Class 57 (operator intervention): query canceled, admin shutdown - Class 53 (insufficient resources): too many connections, out of memory - Add class-level fallback matching for unlisted error codes - Add exponential backoff with ±20% jitter to prevent thundering herd - Improve error logging in consumeCreditsAndAddAgentStep with: - PostgreSQL error details (code, constraint, detail) - Transaction phase tracking - Grant snapshot for debugging - Add isRetryablePostgresError and getRetryableErrorDescription helpers - Add comprehensive unit tests for retry logic and error handling
1 parent 5fa399b commit 92aeeeb

File tree

5 files changed

+1156
-84
lines changed

5 files changed

+1156
-84
lines changed
Lines changed: 325 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,325 @@
1+
import { afterEach, beforeEach, describe, expect, it, mock, spyOn } from 'bun:test'
2+
3+
import { INITIAL_RETRY_DELAY, withRetry } from '../promise'
4+
5+
describe('withRetry', () => {
6+
describe('basic functionality', () => {
7+
it('should return result on successful first attempt', async () => {
8+
const operation = mock(() => Promise.resolve('success'))
9+
10+
const result = await withRetry(operation)
11+
12+
expect(result).toBe('success')
13+
expect(operation).toHaveBeenCalledTimes(1)
14+
})
15+
16+
it('should retry on retryable error and succeed', async () => {
17+
let attempts = 0
18+
const operation = mock(() => {
19+
attempts++
20+
if (attempts === 1) {
21+
const error = { type: 'APIConnectionError' }
22+
return Promise.reject(error)
23+
}
24+
return Promise.resolve('success after retry')
25+
})
26+
27+
// Mock setTimeout to avoid delays
28+
const setTimeoutSpy = spyOn(globalThis, 'setTimeout').mockImplementation(
29+
((callback: () => void) => {
30+
callback()
31+
return 0 as unknown as NodeJS.Timeout
32+
}) as typeof setTimeout,
33+
)
34+
35+
const result = await withRetry(operation)
36+
37+
expect(result).toBe('success after retry')
38+
expect(attempts).toBe(2)
39+
40+
setTimeoutSpy.mockRestore()
41+
})
42+
43+
it('should throw immediately on non-retryable error', async () => {
44+
const error = new Error('non-retryable')
45+
const operation = mock(() => Promise.reject(error))
46+
47+
await expect(withRetry(operation)).rejects.toThrow('non-retryable')
48+
expect(operation).toHaveBeenCalledTimes(1)
49+
})
50+
51+
it('should throw after max retries exceeded', async () => {
52+
const error = { type: 'APIConnectionError' }
53+
const operation = mock(() => Promise.reject(error))
54+
55+
// Mock setTimeout to avoid delays
56+
const setTimeoutSpy = spyOn(globalThis, 'setTimeout').mockImplementation(
57+
((callback: () => void) => {
58+
callback()
59+
return 0 as unknown as NodeJS.Timeout
60+
}) as typeof setTimeout,
61+
)
62+
63+
await expect(
64+
withRetry(operation, { maxRetries: 3 }),
65+
).rejects.toMatchObject({ type: 'APIConnectionError' })
66+
67+
expect(operation).toHaveBeenCalledTimes(3)
68+
69+
setTimeoutSpy.mockRestore()
70+
})
71+
})
72+
73+
describe('jitter behavior', () => {
74+
let setTimeoutSpy: ReturnType<typeof spyOn>
75+
let mathRandomSpy: ReturnType<typeof spyOn>
76+
let capturedDelays: number[]
77+
78+
beforeEach(() => {
79+
capturedDelays = []
80+
81+
// Capture the delay values passed to setTimeout
82+
setTimeoutSpy = spyOn(globalThis, 'setTimeout').mockImplementation(
83+
((callback: () => void, delay: number) => {
84+
capturedDelays.push(delay)
85+
callback()
86+
return 0 as unknown as NodeJS.Timeout
87+
}) as typeof setTimeout,
88+
)
89+
})
90+
91+
afterEach(() => {
92+
setTimeoutSpy.mockRestore()
93+
if (mathRandomSpy) {
94+
mathRandomSpy.mockRestore()
95+
}
96+
})
97+
98+
it('should apply minimum jitter (0.8x) when Math.random returns 0', async () => {
99+
mathRandomSpy = spyOn(Math, 'random').mockReturnValue(0)
100+
101+
const error = { type: 'APIConnectionError' }
102+
let attempts = 0
103+
const operation = mock(() => {
104+
attempts++
105+
if (attempts < 3) {
106+
return Promise.reject(error)
107+
}
108+
return Promise.resolve('success')
109+
})
110+
111+
await withRetry(operation, {
112+
maxRetries: 3,
113+
retryDelayMs: INITIAL_RETRY_DELAY,
114+
})
115+
116+
// With Math.random() = 0, jitter = 0.8
117+
// Attempt 0 (first retry): baseDelay = 1000 * 2^0 = 1000, delay = 1000 * 0.8 = 800
118+
// Attempt 1 (second retry): baseDelay = 1000 * 2^1 = 2000, delay = 2000 * 0.8 = 1600
119+
expect(capturedDelays).toEqual([800, 1600])
120+
})
121+
122+
it('should apply maximum jitter (1.2x) when Math.random returns 1', async () => {
123+
mathRandomSpy = spyOn(Math, 'random').mockReturnValue(1)
124+
125+
const error = { type: 'APIConnectionError' }
126+
let attempts = 0
127+
const operation = mock(() => {
128+
attempts++
129+
if (attempts < 3) {
130+
return Promise.reject(error)
131+
}
132+
return Promise.resolve('success')
133+
})
134+
135+
await withRetry(operation, {
136+
maxRetries: 3,
137+
retryDelayMs: INITIAL_RETRY_DELAY,
138+
})
139+
140+
// With Math.random() = 1, jitter = 1.2
141+
// Attempt 0: baseDelay = 1000 * 2^0 = 1000, delay = 1000 * 1.2 = 1200
142+
// Attempt 1: baseDelay = 1000 * 2^1 = 2000, delay = 2000 * 1.2 = 2400
143+
expect(capturedDelays).toEqual([1200, 2400])
144+
})
145+
146+
it('should apply no jitter (1.0x) when Math.random returns 0.5', async () => {
147+
mathRandomSpy = spyOn(Math, 'random').mockReturnValue(0.5)
148+
149+
const error = { type: 'APIConnectionError' }
150+
let attempts = 0
151+
const operation = mock(() => {
152+
attempts++
153+
if (attempts < 3) {
154+
return Promise.reject(error)
155+
}
156+
return Promise.resolve('success')
157+
})
158+
159+
await withRetry(operation, {
160+
maxRetries: 3,
161+
retryDelayMs: INITIAL_RETRY_DELAY,
162+
})
163+
164+
// With Math.random() = 0.5, jitter = 0.8 + 0.5 * 0.4 = 1.0
165+
// Attempt 0: baseDelay = 1000, delay = 1000 * 1.0 = 1000
166+
// Attempt 1: baseDelay = 2000, delay = 2000 * 1.0 = 2000
167+
expect(capturedDelays).toEqual([1000, 2000])
168+
})
169+
170+
it('should apply exponential backoff with jitter correctly', async () => {
171+
// Use a specific random value to verify the jitter calculation
172+
mathRandomSpy = spyOn(Math, 'random').mockReturnValue(0.25)
173+
174+
const error = { type: 'APIConnectionError' }
175+
let attempts = 0
176+
const operation = mock(() => {
177+
attempts++
178+
if (attempts < 4) {
179+
return Promise.reject(error)
180+
}
181+
return Promise.resolve('success')
182+
})
183+
184+
await withRetry(operation, {
185+
maxRetries: 4,
186+
retryDelayMs: 1000,
187+
})
188+
189+
// With Math.random() = 0.25, jitter = 0.8 + 0.25 * 0.4 = 0.9
190+
// Attempt 0: 1000 * 2^0 * 0.9 = 900
191+
// Attempt 1: 1000 * 2^1 * 0.9 = 1800
192+
// Attempt 2: 1000 * 2^2 * 0.9 = 3600
193+
expect(capturedDelays).toEqual([900, 1800, 3600])
194+
})
195+
196+
it('should produce delays within ±20% of base delay', async () => {
197+
// Don't mock Math.random - let it use real random values
198+
mathRandomSpy?.mockRestore()
199+
200+
const error = { type: 'APIConnectionError' }
201+
let attempts = 0
202+
const operation = mock(() => {
203+
attempts++
204+
if (attempts < 3) {
205+
return Promise.reject(error)
206+
}
207+
return Promise.resolve('success')
208+
})
209+
210+
await withRetry(operation, {
211+
maxRetries: 3,
212+
retryDelayMs: 1000,
213+
})
214+
215+
// Verify delays are within expected ranges
216+
// Attempt 0: base = 1000, range = [800, 1200]
217+
expect(capturedDelays[0]).toBeGreaterThanOrEqual(800)
218+
expect(capturedDelays[0]).toBeLessThanOrEqual(1200)
219+
220+
// Attempt 1: base = 2000, range = [1600, 2400]
221+
expect(capturedDelays[1]).toBeGreaterThanOrEqual(1600)
222+
expect(capturedDelays[1]).toBeLessThanOrEqual(2400)
223+
})
224+
225+
it('should use custom retryDelayMs for base delay calculation', async () => {
226+
mathRandomSpy = spyOn(Math, 'random').mockReturnValue(0.5) // jitter = 1.0
227+
228+
const error = { type: 'APIConnectionError' }
229+
let attempts = 0
230+
const operation = mock(() => {
231+
attempts++
232+
if (attempts < 2) {
233+
return Promise.reject(error)
234+
}
235+
return Promise.resolve('success')
236+
})
237+
238+
await withRetry(operation, {
239+
maxRetries: 2,
240+
retryDelayMs: 500, // Custom base delay
241+
})
242+
243+
// With jitter = 1.0 and retryDelayMs = 500
244+
// Attempt 0: 500 * 2^0 * 1.0 = 500
245+
expect(capturedDelays).toEqual([500])
246+
})
247+
})
248+
249+
describe('onRetry callback', () => {
250+
it('should call onRetry with error and attempt number', async () => {
251+
const setTimeoutSpy = spyOn(globalThis, 'setTimeout').mockImplementation(
252+
((callback: () => void) => {
253+
callback()
254+
return 0 as unknown as NodeJS.Timeout
255+
}) as typeof setTimeout,
256+
)
257+
258+
const onRetry = mock(() => {})
259+
const error = { type: 'APIConnectionError' }
260+
let attempts = 0
261+
const operation = mock(() => {
262+
attempts++
263+
if (attempts < 3) {
264+
return Promise.reject(error)
265+
}
266+
return Promise.resolve('success')
267+
})
268+
269+
await withRetry(operation, {
270+
maxRetries: 3,
271+
onRetry,
272+
})
273+
274+
expect(onRetry).toHaveBeenCalledTimes(2)
275+
expect(onRetry).toHaveBeenNthCalledWith(1, error, 1)
276+
expect(onRetry).toHaveBeenNthCalledWith(2, error, 2)
277+
278+
setTimeoutSpy.mockRestore()
279+
})
280+
})
281+
282+
describe('retryIf callback', () => {
283+
it('should use custom retryIf to determine retryability', async () => {
284+
const setTimeoutSpy = spyOn(globalThis, 'setTimeout').mockImplementation(
285+
((callback: () => void) => {
286+
callback()
287+
return 0 as unknown as NodeJS.Timeout
288+
}) as typeof setTimeout,
289+
)
290+
291+
let attempts = 0
292+
const operation = mock(() => {
293+
attempts++
294+
if (attempts === 1) {
295+
return Promise.reject({ code: 'RETRY_ME' })
296+
}
297+
return Promise.resolve('success')
298+
})
299+
300+
const result = await withRetry(operation, {
301+
maxRetries: 3,
302+
retryIf: (error) => error?.code === 'RETRY_ME',
303+
})
304+
305+
expect(result).toBe('success')
306+
expect(attempts).toBe(2)
307+
308+
setTimeoutSpy.mockRestore()
309+
})
310+
311+
it('should not retry when retryIf returns false', async () => {
312+
const error = { code: 'DO_NOT_RETRY' }
313+
const operation = mock(() => Promise.reject(error))
314+
315+
await expect(
316+
withRetry(operation, {
317+
maxRetries: 3,
318+
retryIf: (err) => err?.code === 'RETRY_ME',
319+
}),
320+
).rejects.toMatchObject({ code: 'DO_NOT_RETRY' })
321+
322+
expect(operation).toHaveBeenCalledTimes(1)
323+
})
324+
})
325+
})

common/src/util/promise.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,10 @@ export async function withRetry<T>(
3030

3131
onRetry(error, attempt + 1)
3232

33-
// Exponential backoff
34-
const delayMs = retryDelayMs * Math.pow(2, attempt)
33+
// Exponential backoff with jitter (±20%) to prevent thundering herd
34+
const baseDelayMs = retryDelayMs * Math.pow(2, attempt)
35+
const jitter = 0.8 + Math.random() * 0.4 // Random multiplier between 0.8 and 1.2
36+
const delayMs = Math.round(baseDelayMs * jitter)
3537
await new Promise((resolve) => setTimeout(resolve, delayMs))
3638
}
3739
}

0 commit comments

Comments
 (0)