Skip to content

Commit 83dff56

Browse files
committed
Handle exit for the case where we already retried after an OOM error
1 parent e3bcb4f commit 83dff56

File tree

1 file changed

+21
-10
lines changed

1 file changed

+21
-10
lines changed

apps/webapp/app/v3/services/completeAttempt.server.ts

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -254,9 +254,11 @@ export class CompleteAttemptService extends BaseService {
254254

255255
let retriableError = shouldRetryError(taskRunErrorEnhancer(completion.error));
256256
let isOOMRetry = false;
257+
let isOOMAttempt = isOOMError(completion.error);
258+
let isOnMaxOOMMachine = false;
257259

258-
//OOM errors should retry (if an OOM machine is specified)
259-
if (isOOMError(completion.error)) {
260+
//OOM errors should retry (if an OOM machine is specified, and we're not already on it)
261+
if (isOOMAttempt) {
260262
const retryConfig = FailedTaskRunRetryHelper.getRetryConfig({
261263
run: {
262264
...taskRunAttempt.taskRun,
@@ -266,10 +268,10 @@ export class CompleteAttemptService extends BaseService {
266268
execution,
267269
});
268270

269-
if (
270-
retryConfig?.outOfMemory?.machine &&
271-
retryConfig.outOfMemory.machine !== taskRunAttempt.taskRun.machinePreset
272-
) {
271+
isOnMaxOOMMachine =
272+
retryConfig?.outOfMemory?.machine === taskRunAttempt.taskRun.machinePreset;
273+
274+
if (retryConfig?.outOfMemory?.machine && !isOnMaxOOMMachine) {
273275
//we will retry
274276
isOOMRetry = true;
275277
retriableError = true;
@@ -312,6 +314,11 @@ export class CompleteAttemptService extends BaseService {
312314

313315
// The attempt has failed and we won't retry
314316

317+
if (isOOMAttempt && isOnMaxOOMMachine) {
318+
// The attempt failed due to an OOM error but we're already on the machine we should retry on
319+
exitRun(taskRunAttempt.taskRunId);
320+
}
321+
315322
// Now we need to "complete" the task run event/span
316323
await eventRepository.completeEvent(
317324
getTaskEventStoreTableForRun(taskRunAttempt.taskRun),
@@ -508,10 +515,7 @@ export class CompleteAttemptService extends BaseService {
508515

509516
// The run won't know it should shut down as we make the decision to force requeue here
510517
// This also ensures that this change is backwards compatible with older workers
511-
socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", {
512-
version: "v1",
513-
runId: run.id,
514-
});
518+
exitRun(run.id);
515519

516520
await retryViaQueue();
517521
return;
@@ -759,3 +763,10 @@ function isOOMError(error: TaskRunError) {
759763

760764
return false;
761765
}
766+
767+
/**
 * Emits a "REQUEST_RUN_CANCELLATION" message (version "v1") carrying the given
 * run ID on `socketIo.coordinatorNamespace`.
 *
 * NOTE(review): presumably this is what tells the worker executing the run to
 * shut down — confirm against the coordinator's handler for this event.
 *
 * @param runId - ID of the run; sent as the `runId` field of the payload.
 */
function exitRun(runId: string) {
768+
socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", {
769+
version: "v1",
770+
runId,
771+
});
772+
}

0 commit comments

Comments
 (0)