@@ -254,9 +254,11 @@ export class CompleteAttemptService extends BaseService {
254254
255255 let retriableError = shouldRetryError ( taskRunErrorEnhancer ( completion . error ) ) ;
256256 let isOOMRetry = false ;
257+ let isOOMAttempt = isOOMError ( completion . error ) ;
258+ let isOnMaxOOMMachine = false ;
257259
258- //OOM errors should retry (if an OOM machine is specified)
259- if ( isOOMError ( completion . error ) ) {
260+ //OOM errors should retry (if an OOM machine is specified, and we're not already on it )
261+ if ( isOOMAttempt ) {
260262 const retryConfig = FailedTaskRunRetryHelper . getRetryConfig ( {
261263 run : {
262264 ...taskRunAttempt . taskRun ,
@@ -266,10 +268,10 @@ export class CompleteAttemptService extends BaseService {
266268 execution,
267269 } ) ;
268270
269- if (
270- retryConfig ?. outOfMemory ?. machine &&
271- retryConfig . outOfMemory . machine !== taskRunAttempt . taskRun . machinePreset
272- ) {
271+ isOnMaxOOMMachine =
272+ retryConfig ?. outOfMemory ?. machine === taskRunAttempt . taskRun . machinePreset ;
273+
274+ if ( retryConfig ?. outOfMemory ?. machine && ! isOnMaxOOMMachine ) {
273275 //we will retry
274276 isOOMRetry = true ;
275277 retriableError = true ;
@@ -312,6 +314,11 @@ export class CompleteAttemptService extends BaseService {
312314
313315 // The attempt has failed and we won't retry
314316
317+ if ( isOOMAttempt && isOnMaxOOMMachine ) {
318+ // The attempt failed due to an OOM error but we're already on the machine we should retry on
319+ exitRun ( taskRunAttempt . taskRunId ) ;
320+ }
321+
315322 // Now we need to "complete" the task run event/span
316323 await eventRepository . completeEvent (
317324 getTaskEventStoreTableForRun ( taskRunAttempt . taskRun ) ,
@@ -508,10 +515,7 @@ export class CompleteAttemptService extends BaseService {
508515
509516 // The run won't know it should shut down as we make the decision to force requeue here
510517 // This also ensures that this change is backwards compatible with older workers
511- socketIo . coordinatorNamespace . emit ( "REQUEST_RUN_CANCELLATION" , {
512- version : "v1" ,
513- runId : run . id ,
514- } ) ;
518+ exitRun ( run . id ) ;
515519
516520 await retryViaQueue ( ) ;
517521 return ;
@@ -759,3 +763,10 @@ function isOOMError(error: TaskRunError) {
759763
760764 return false ;
761765}
766+
/**
 * Emits a "REQUEST_RUN_CANCELLATION" message (version "v1") for the given
 * run on the socket.io coordinator namespace.
 *
 * @param runId - ID of the run, sent as the `runId` field of the payload.
 */
function exitRun(runId: string) {
  // Build the payload up front so the emit call reads as a single action.
  const cancellationRequest = {
    version: "v1" as const,
    runId,
  };

  socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", cancellationRequest);
}
0 commit comments