Skip to content

Commit c64a594

Browse files
committed
Rework handling failed GO_ERROR
In this commit we unify the way that a failed GO_ERROR is handled. We recognize invalid event errors as harmless (ERROR->ERROR is pointless, DONE->ERROR is too late). Any other case is very much unexpected and we print a visible error and comply with the previous behaviour - setting EROR state manually. Closes OCTRL-1064.
1 parent 94c397f commit c64a594

File tree

4 files changed

+83
-39
lines changed

4 files changed

+83
-39
lines changed

core/environment/environment.go

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,19 +1232,8 @@ func (env *Environment) subscribeToWfState(taskman *task.Manager) {
12321232
NewEnvGoErrorEvent(env, newCriticalTasksErrorMessage(env)),
12331233
)
12341234
err := env.TryTransition(NewGoErrorTransition(taskman))
1235-
12361235
if err != nil {
1237-
if env.Sm.Current() == "ERROR" {
1238-
log.WithField("partition", env.id).
1239-
WithField("level", infologger.IL_Devel).
1240-
Info("skipped requested transition to ERROR: environment already in ERROR state")
1241-
} else {
1242-
log.WithField("partition", env.id).
1243-
WithError(err).
1244-
WithField("level", infologger.IL_Devel).
1245-
Warn("could not transition gently to ERROR, forcing it")
1246-
env.setState(wfState.String())
1247-
}
1236+
handleFailedGoError(err, env)
12481237
}
12491238
})
12501239
break WORKFLOW_STATE_LOOP
@@ -1472,10 +1461,7 @@ func (env *Environment) scheduleAutoStopTransition() (scheduled bool, expected t
14721461

14731462
err = env.TryTransition(NewGoErrorTransition(ManagerInstance().taskman))
14741463
if err != nil {
1475-
log.WithField("partition", env.id).
1476-
WithField("run", env.currentRunNumber).
1477-
Errorf("Forced transition to ERROR failed: %s", err.Error())
1478-
env.setState("ERROR")
1464+
handleFailedGoError(err, env)
14791465
}
14801466
return
14811467
}

core/environment/manager.go

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -496,10 +496,7 @@ func (envs *Manager) CreateEnvironment(workflowPath string, userVars map[string]
496496
envs.taskman),
497497
)
498498
if err != nil {
499-
log.WithField("partition", env.Id().String()).
500-
WithField("state", envState).
501-
Debug("could not transition failed auto-transitioning environment to ERROR, cleanup in progress")
502-
env.setState("ERROR")
499+
handleFailedGoError(err, env)
503500
}
504501

505502
envTasks := env.Workflow().GetTasks()
@@ -603,10 +600,7 @@ func (envs *Manager) CreateEnvironment(workflowPath string, userVars map[string]
603600
envs.taskman),
604601
)
605602
if errTxErr != nil {
606-
log.WithField("partition", env.Id().String()).
607-
WithField("state", envState).
608-
WithError(errTxErr).
609-
Debug("could not transition to ERROR after failed deployment/configuration, cleanup in progress")
603+
handleFailedGoError(errTxErr, env)
610604
}
611605
envTasks := env.Workflow().GetTasks()
612606
// TeardownEnvironment manages the envs.mu internally
@@ -1064,11 +1058,7 @@ func (envs *Manager) handleIntegratedServiceEvent(evt event.IntegratedServiceEve
10641058
)
10651059
err = env.TryTransition(NewGoErrorTransition(envs.taskman))
10661060
if err != nil {
1067-
log.WithPrefix("scheduler").
1068-
WithField("partition", envId.String()).
1069-
WithError(err).
1070-
Error("environment GO_ERROR transition failed after ODC_PARTITION_STATE_CHANGE ERROR event")
1071-
env.setState("ERROR")
1061+
handleFailedGoError(err, env)
10721062
}
10731063
}
10741064
}()
@@ -1124,12 +1114,7 @@ func (envs *Manager) handleLhcEvents(evt event.IntegratedServiceEvent) {
11241114
if env.CurrentState() != "ERROR" {
11251115
err = env.TryTransition(NewGoErrorTransition(envs.taskman))
11261116
if err != nil {
1127-
log.WithPrefix("scheduler").
1128-
WithField("partition", envId.String()).
1129-
WithField("run", env.currentRunNumber).
1130-
WithError(err).
1131-
Error("environment GO_ERROR transition failed after a beam dump event, forcing")
1132-
env.setState("ERROR")
1117+
handleFailedGoError(err, env)
11331118
}
11341119
}
11351120
}
@@ -1483,11 +1468,8 @@ func (envs *Manager) CreateAutoEnvironment(workflowPath string, userVars map[str
14831468
envs.taskman),
14841469
)
14851470
if err != nil {
1486-
log.WithField("partition", env.Id().String()).
1487-
WithField("state", envState).
1488-
Debug("could not transition failed auto-transitioning environment to ERROR, cleanup in progress")
1471+
handleFailedGoError(err, env)
14891472
env.sendEnvironmentEvent(&event.EnvironmentEvent{Message: "transition ERROR failed, forcing", EnvironmentID: env.Id().String(), Error: err})
1490-
env.setState("ERROR")
14911473
}
14921474

14931475
envTasks := env.Workflow().GetTasks()

core/environment/utils.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,14 @@ package environment
2727
import (
2828
"bytes"
2929
"encoding/json"
30+
"errors"
3031
"fmt"
3132
"github.com/AliceO2Group/Control/common/logger/infologger"
3233
pb "github.com/AliceO2Group/Control/common/protos"
3334
"github.com/AliceO2Group/Control/core/task"
3435
"github.com/AliceO2Group/Control/core/task/sm"
3536
"github.com/AliceO2Group/Control/core/workflow"
37+
"github.com/looplab/fsm"
3638
"os"
3739
"sort"
3840

@@ -139,3 +141,27 @@ func newCriticalTasksErrorMessage(env *Environment) string {
139141
return fmt.Sprintf("%d critical tasks transitioned to ERROR, could not determine the first one to fail", len(criticalTasksInError))
140142
}
141143
}
144+
145+
func handleFailedGoError(err error, env *Environment) {
146+
var invalidEventErr *fsm.InvalidEventError
147+
if errors.As(err, &invalidEventErr) {
148+
// this case can occur if the environment is in either:
149+
// - ERROR (env already transitioned to ERROR for another reason)
150+
// - DONE (an error might have occurred during teardown, but it's already over, no point in spreading panic)
151+
log.WithError(invalidEventErr).
152+
WithField("partition", env.Id().String()).
153+
WithField("run", env.currentRunNumber).
154+
WithField("state", env.CurrentState()).
155+
WithField(infologger.Level, infologger.IL_Support).
156+
Warn("did not perform GO_ERROR transition")
157+
} else {
158+
// in principle this should never happen, so we log it accordingly and force the ERROR state just in case
159+
log.WithError(err).
160+
WithField("partition", env.Id().String()).
161+
WithField("run", env.currentRunNumber).
162+
WithField("state", env.CurrentState()).
163+
WithField(infologger.Level, infologger.IL_Ops).
164+
Error("could not perform GO_ERROR transition due to unexpected error, forcing...")
165+
env.setState("ERROR")
166+
}
167+
}

core/environment/utils_test.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/*
2+
* === This file is part of ALICE O² ===
3+
*
4+
* Copyright 2025 CERN and copyright holders of ALICE O².
5+
* Author: Piotr Konopka <piotr.konopka@cern.ch>
6+
*
7+
* This program is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*
20+
* In applying this license CERN does not waive the privileges and
21+
* immunities granted to it by virtue of its status as an
22+
* Intergovernmental Organization or submit itself to any jurisdiction.
23+
*/
24+
25+
package environment
26+
27+
import (
28+
"github.com/looplab/fsm"
29+
. "github.com/onsi/ginkgo/v2"
30+
. "github.com/onsi/gomega"
31+
)
32+
33+
var _ = Describe("handleFailedGoError", func() {
34+
It("does not overwrite state for InvalidEventError", func() {
35+
env := &Environment{}
36+
env.Sm = fsm.NewFSM("DONE", fsm.Events{}, fsm.Callbacks{})
37+
Expect(env.Sm.Current()).To(Equal("DONE"))
38+
39+
handleFailedGoError(&fsm.InvalidEventError{Event: "GO_ERROR", State: "DONE"}, env)
40+
Expect(env.Sm.Current()).To(Equal("DONE"))
41+
})
42+
43+
It("overwrites state to ERROR for other errors", func() {
44+
env := &Environment{}
45+
env.Sm = fsm.NewFSM("CONFIGURED", fsm.Events{}, fsm.Callbacks{})
46+
47+
handleFailedGoError(fsm.UnknownEventError{Event: "BOOM"}, env)
48+
Expect(env.Sm.Current()).To(Equal("ERROR"))
49+
})
50+
})

0 commit comments

Comments
 (0)