Skip to content

Commit a10f236

Browse files
committed
Distinguish between task too slow to start and crashing on start
Fixes OCTRL-1075
1 parent e724c0c commit a10f236

File tree

2 files changed

+25
-13
lines changed

2 files changed

+25
-13
lines changed

executor/executable/controllabletask.go

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ type ControllableTask struct {
6363
taskBase
6464
rpc *executorcmd.RpcClient
6565
pendingFinalTaskStateCh chan mesos.TaskState
66+
taskDoneCh chan error
6667
knownPid int
6768
}
6869

@@ -159,6 +160,12 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time.
159160

160161
t.initTaskStdLogging(stdoutIn, stderrIn)
161162

163+
// Start waiting for the process right away, so that ProcessState is available if the task fails early
164+
t.taskDoneCh = make(chan error, 1)
165+
go func() {
166+
t.taskDoneCh <- taskCmd.Wait()
167+
}()
168+
162169
log.WithFields(defaultLogFields).
163170
WithFields(logrus.Fields{
164171
"controlPort": t.Tci.ControlPort,
@@ -186,12 +193,20 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time.
186193
WithFields(defaultLogFields),
187194
)
188195
if t.rpc == nil {
189-
err = errors.New("rpc client is nil")
190-
log.WithFields(defaultLogFields).
191-
WithField("command", truncatedCmd).
192-
WithError(err).
193-
WithField(infologger.Level, infologger.IL_Devel).
194-
Error("could not start gRPC client")
196+
// ProcessState is only populated once Wait() has returned, so a non-nil value means the task has already exited
197+
if taskCmd.ProcessState != nil {
198+
err = errors.New("AliECS executor could not connect to task, likely crashed on startup")
199+
} else {
200+
err = errors.New("AliECS executor could not connect to task, likely took too long to start")
201+
}
202+
203+
taskClassName, _ := utils.ExtractTaskClassName(t.ti.Name)
204+
log.WithFields(logrus.Fields{
205+
"task": utils.TrimJitPrefix(taskClassName),
206+
"partition": t.knownEnvironmentId.String(),
207+
"detector": t.knownDetector,
208+
infologger.Level: infologger.IL_Ops,
209+
}).Error(err.Error())
195210

196211
t.sendStatus(t.knownEnvironmentId, mesos.TASK_FAILED, err.Error())
197212

@@ -262,7 +277,7 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time.
262277
t.processEventsFromTask(esc)
263278
}()
264279

265-
err = taskCmd.Wait()
280+
err = <-t.taskDoneCh
266281
// ^ when this unblocks, the task is done
267282
log.WithFields(defaultLogFields).
268283
WithField("command", truncatedCmd).
@@ -330,7 +345,8 @@ func (t *ControllableTask) cleanupFailedTask(taskCmd *exec.Cmd) {
330345

331346
_ = t.doTermIntKill(-taskCmd.Process.Pid)
332347

333-
err := taskCmd.Wait()
348+
// Wait for task to finish and report the error
349+
err := <-t.taskDoneCh
334350
if err != nil {
335351
log.WithFields(defaultLogFields).
336352
WithField(infologger.Level, infologger.IL_Support).

executor/executorcmd/client.go

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,12 +80,8 @@ func NewClient(
8080
log.WithField("error", err.Error()).
8181
WithField("endpoint", endpoint).
8282
WithField("transport", controlTransportS).
83-
WithField("level", infologger.IL_Trace).
83+
WithField("level", infologger.IL_Devel).
8484
Error("gRPC client can't dial")
85-
log.WithField("error", err.Error()).
86-
WithField("endpoint", endpoint).
87-
WithField("level", infologger.IL_Ops).
88-
Error("AliECS executor could not connect to task, possible crash on startup")
8985

9086
cancel()
9187
if conn != nil {

0 commit comments

Comments
 (0)