Skip to content

Commit 82cee7e

Browse files
committed
handle special graceful shutdown code
1 parent 1672a66 commit 82cee7e

File tree

2 files changed

+122
-2
lines changed

2 files changed

+122
-2
lines changed

apps/supervisor/src/services/failedPodHandler.test.ts

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,102 @@ describe("FailedPodHandler Integration Tests", () => {
314314
await handler.stop();
315315
}
316316
}, 60000);
317+
318+
it("should handle graceful shutdown pods differently", async () => {
319+
const handler = new FailedPodHandler({ namespace, k8s, register });
320+
321+
try {
322+
// Create first batch of pods before starting handler
323+
const firstBatchPodNames = await createTestPods({
324+
k8sApi: k8s,
325+
namespace,
326+
count: 2,
327+
exitCode: FailedPodHandler.GRACEFUL_SHUTDOWN_EXIT_CODE,
328+
});
329+
330+
// Wait for pods to reach Failed state
331+
await waitForPodsPhase({
332+
k8sApi: k8s,
333+
namespace,
334+
podNames: firstBatchPodNames,
335+
phase: "Failed",
336+
});
337+
338+
// Start the handler
339+
await handler.start();
340+
341+
// Wait for first batch to be deleted
342+
await waitForPodsDeletion({
343+
k8sApi: k8s,
344+
namespace,
345+
podNames: firstBatchPodNames,
346+
});
347+
348+
// Create second batch of pods after handler is running
349+
const secondBatchPodNames = await createTestPods({
350+
k8sApi: k8s,
351+
namespace,
352+
count: 3,
353+
exitCode: FailedPodHandler.GRACEFUL_SHUTDOWN_EXIT_CODE,
354+
});
355+
356+
// Wait for second batch to be deleted
357+
await waitForPodsDeletion({
358+
k8sApi: k8s,
359+
namespace,
360+
podNames: secondBatchPodNames,
361+
});
362+
363+
// Verify metrics
364+
const metrics = handler.getMetrics();
365+
366+
// Check informer events were recorded for both batches
367+
const informerEvents = await metrics.informerEventsTotal.get();
368+
expect(informerEvents.values).toContainEqual(
369+
expect.objectContaining({
370+
labels: expect.objectContaining({
371+
namespace,
372+
verb: "add",
373+
}),
374+
value: 5, // 2 from first batch + 3 from second batch
375+
})
376+
);
377+
378+
// Check pods were processed as graceful shutdowns
379+
const processedPods = await metrics.processedPodsTotal.get();
380+
381+
// Should not be marked as Failed
382+
const failedPods = processedPods.values.find(
383+
(v) => v.labels.namespace === namespace && v.labels.status === "Failed"
384+
);
385+
expect(failedPods).toBeUndefined();
386+
387+
// Should be marked as GracefulShutdown
388+
const gracefulShutdowns = processedPods.values.find(
389+
(v) => v.labels.namespace === namespace && v.labels.status === "GracefulShutdown"
390+
);
391+
expect(gracefulShutdowns).toBeDefined();
392+
expect(gracefulShutdowns?.value).toBe(5); // Total from both batches
393+
394+
// Check pods were still deleted. Note the deletion metric asserted below is labelled with status "Failed", not "GracefulShutdown".
395+
const deletedPods = await metrics.deletedPodsTotal.get();
396+
expect(deletedPods.values).toContainEqual(
397+
expect.objectContaining({
398+
labels: expect.objectContaining({
399+
namespace,
400+
status: "Failed",
401+
}),
402+
value: 5, // Total from both batches
403+
})
404+
);
405+
406+
// Check no deletion errors were recorded
407+
const deletionErrors = await metrics.deletionErrorsTotal.get();
408+
expect(deletionErrors.values).toHaveLength(0);
409+
} finally {
410+
await handler.stop();
411+
}
412+
}, 30000);
317413
});
318414

319415
async function createTestPods({
@@ -325,6 +421,7 @@ async function createTestPods({
325421
namePrefix = "test-pod",
326422
command = ["/bin/sh", "-c", shouldFail ? "exit 1" : "exit 0"],
327423
randomizeName = true,
424+
exitCode,
328425
}: {
329426
k8sApi: K8sApi;
330427
namespace: string;
@@ -334,9 +431,15 @@ async function createTestPods({
334431
namePrefix?: string;
335432
command?: string[];
336433
randomizeName?: boolean;
434+
exitCode?: number;
337435
}) {
338436
const createdPods: string[] = [];
339437

438+
// If exitCode is specified, override the command
439+
if (exitCode !== undefined) {
440+
command = ["/bin/sh", "-c", `exit ${exitCode}`];
441+
}
442+
340443
for (let i = 0; i < count; i++) {
341444
const podName = randomizeName
342445
? `${namePrefix}-${i}-${Math.random().toString(36).substring(2, 15)}`
@@ -352,7 +455,7 @@ async function createTestPods({
352455
restartPolicy: "Never",
353456
containers: [
354457
{
355-
name: "test",
458+
name: "run-controller", // Changed to match the name we check in failedPodHandler
356459
image: "busybox:1.37.0",
357460
command,
358461
},

apps/supervisor/src/services/failedPodHandler.ts

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import { Counter, Registry, Histogram } from "prom-client";
66
import { register } from "../metrics.js";
77
import { setTimeout } from "timers/promises";
88

9-
type PodStatus = "Pending" | "Running" | "Succeeded" | "Failed" | "Unknown";
9+
type PodStatus = "Pending" | "Running" | "Succeeded" | "Failed" | "Unknown" | "GracefulShutdown";
1010

1111
export type FailedPodHandlerOptions = {
1212
namespace: string;
@@ -34,6 +34,8 @@ export class FailedPodHandler {
3434
private readonly processingDurationSeconds: Histogram<string>;
3535
private readonly informerEventsTotal: Counter;
3636

37+
static readonly GRACEFUL_SHUTDOWN_EXIT_CODE = 200;
38+
3739
constructor(opts: FailedPodHandlerOptions) {
3840
this.id = Math.random().toString(36).substring(2, 15);
3941
this.logger = new SimpleStructuredLogger("failed-pod-handler", LogLevel.debug, {
@@ -206,6 +208,21 @@ export class FailedPodHandler {
206208

207209
private async processFailedPod(pod: V1Pod) {
208210
this.logger.info("pod-failed: processing pod", this.podSummary(pod));
211+
212+
const mainContainer = pod.status?.containerStatuses?.find((c) => c.name === "run-controller");
213+
214+
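// If the main container exited with our special "graceful shutdown" exit code, count it as a GracefulShutdown and skip the rest of the failure processing in this method.
// NOTE(review): this branch returns without deleting the pod itself; deletion is presumably performed by the caller after this method returns. Confirm.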
215+
if (
216+
mainContainer?.state?.terminated?.exitCode === FailedPodHandler.GRACEFUL_SHUTDOWN_EXIT_CODE
217+
) {
218+
this.logger.debug("pod-failed: graceful shutdown detected", this.podSummary(pod));
219+
this.processedPodsTotal.inc({
220+
namespace: this.namespace,
221+
status: "GracefulShutdown",
222+
});
223+
return;
224+
}
225+
209226
this.processedPodsTotal.inc({
210227
namespace: this.namespace,
211228
status: this.podStatus(pod),

0 commit comments

Comments
 (0)