@@ -63,6 +63,7 @@ type ControllableTask struct {
6363 taskBase
6464 rpc * executorcmd.RpcClient
6565 pendingFinalTaskStateCh chan mesos.TaskState
66+ taskDoneCh chan error
6667 knownPid int
6768}
6869
@@ -159,6 +160,12 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time.
159160
160161 t .initTaskStdLogging (stdoutIn , stderrIn )
161162
163+ // We start to Wait() for the result already, so we have access to ProcessState on an early failure
164+ t .taskDoneCh = make (chan error , 1 )
165+ go func () {
166+ t .taskDoneCh <- taskCmd .Wait ()
167+ }()
168+
162169 log .WithFields (defaultLogFields ).
163170 WithFields (logrus.Fields {
164171 "controlPort" : t .Tci .ControlPort ,
@@ -186,12 +193,20 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time.
186193 WithFields (defaultLogFields ),
187194 )
188195 if t .rpc == nil {
189- err = errors .New ("rpc client is nil" )
190- log .WithFields (defaultLogFields ).
191- WithField ("command" , truncatedCmd ).
192- WithError (err ).
193- WithField (infologger .Level , infologger .IL_Devel ).
194- Error ("could not start gRPC client" )
196+ // Check if the task is still running by checking ProcessState
197+ if taskCmd .ProcessState != nil {
198+ err = errors .New ("AliECS executor could not connect to task, likely crashed on startup" )
199+ } else {
200+ err = errors .New ("AliECS executor could not connect to task, likely took too long to start" )
201+ }
202+
203+ taskClassName , _ := utils .ExtractTaskClassName (t .ti .Name )
204+ log .WithFields (logrus.Fields {
205+ "task" : utils .TrimJitPrefix (taskClassName ),
206+ "partition" : t .knownEnvironmentId .String (),
207+ "detector" : t .knownDetector ,
208+ infologger .Level : infologger .IL_Ops ,
209+ }).Error (err .Error ())
195210
196211 t .sendStatus (t .knownEnvironmentId , mesos .TASK_FAILED , err .Error ())
197212
@@ -262,7 +277,7 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time.
262277 t .processEventsFromTask (esc )
263278 }()
264279
265- err = taskCmd . Wait ()
280+ err = <- t . taskDoneCh
266281 // ^ when this unblocks, the task is done
267282 log .WithFields (defaultLogFields ).
268283 WithField ("command" , truncatedCmd ).
@@ -330,7 +345,8 @@ func (t *ControllableTask) cleanupFailedTask(taskCmd *exec.Cmd) {
330345
331346 _ = t .doTermIntKill (- taskCmd .Process .Pid )
332347
333- err := taskCmd .Wait ()
348+ // Wait for task to finish and report the error
349+ err := <- t .taskDoneCh
334350 if err != nil {
335351 log .WithFields (defaultLogFields ).
336352 WithField (infologger .Level , infologger .IL_Support ).
0 commit comments