Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 2 additions & 16 deletions core/environment/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -1232,19 +1232,8 @@ func (env *Environment) subscribeToWfState(taskman *task.Manager) {
NewEnvGoErrorEvent(env, newCriticalTasksErrorMessage(env)),
)
err := env.TryTransition(NewGoErrorTransition(taskman))

if err != nil {
if env.Sm.Current() == "ERROR" {
log.WithField("partition", env.id).
WithField("level", infologger.IL_Devel).
Info("skipped requested transition to ERROR: environment already in ERROR state")
} else {
log.WithField("partition", env.id).
WithError(err).
WithField("level", infologger.IL_Devel).
Warn("could not transition gently to ERROR, forcing it")
env.setState(wfState.String())
}
handleFailedGoError(err, env)
}
})
break WORKFLOW_STATE_LOOP
Expand Down Expand Up @@ -1472,10 +1461,7 @@ func (env *Environment) scheduleAutoStopTransition() (scheduled bool, expected t

err = env.TryTransition(NewGoErrorTransition(ManagerInstance().taskman))
if err != nil {
log.WithField("partition", env.id).
WithField("run", env.currentRunNumber).
Errorf("Forced transition to ERROR failed: %s", err.Error())
env.setState("ERROR")
handleFailedGoError(err, env)
}
return
}
Expand Down
28 changes: 5 additions & 23 deletions core/environment/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -496,10 +496,7 @@ func (envs *Manager) CreateEnvironment(workflowPath string, userVars map[string]
envs.taskman),
)
if err != nil {
log.WithField("partition", env.Id().String()).
WithField("state", envState).
Debug("could not transition failed auto-transitioning environment to ERROR, cleanup in progress")
env.setState("ERROR")
handleFailedGoError(err, env)
}

envTasks := env.Workflow().GetTasks()
Expand Down Expand Up @@ -603,10 +600,7 @@ func (envs *Manager) CreateEnvironment(workflowPath string, userVars map[string]
envs.taskman),
)
if errTxErr != nil {
log.WithField("partition", env.Id().String()).
WithField("state", envState).
WithError(errTxErr).
Debug("could not transition to ERROR after failed deployment/configuration, cleanup in progress")
handleFailedGoError(errTxErr, env)
}
envTasks := env.Workflow().GetTasks()
// TeardownEnvironment manages the envs.mu internally
Expand Down Expand Up @@ -1064,11 +1058,7 @@ func (envs *Manager) handleIntegratedServiceEvent(evt event.IntegratedServiceEve
)
err = env.TryTransition(NewGoErrorTransition(envs.taskman))
if err != nil {
log.WithPrefix("scheduler").
WithField("partition", envId.String()).
WithError(err).
Error("environment GO_ERROR transition failed after ODC_PARTITION_STATE_CHANGE ERROR event")
env.setState("ERROR")
handleFailedGoError(err, env)
}
}
}()
Expand Down Expand Up @@ -1124,12 +1114,7 @@ func (envs *Manager) handleLhcEvents(evt event.IntegratedServiceEvent) {
if env.CurrentState() != "ERROR" {
err = env.TryTransition(NewGoErrorTransition(envs.taskman))
if err != nil {
log.WithPrefix("scheduler").
WithField("partition", envId.String()).
WithField("run", env.currentRunNumber).
WithError(err).
Error("environment GO_ERROR transition failed after a beam dump event, forcing")
env.setState("ERROR")
handleFailedGoError(err, env)
}
}
}
Expand Down Expand Up @@ -1483,11 +1468,8 @@ func (envs *Manager) CreateAutoEnvironment(workflowPath string, userVars map[str
envs.taskman),
)
if err != nil {
log.WithField("partition", env.Id().String()).
WithField("state", envState).
Debug("could not transition failed auto-transitioning environment to ERROR, cleanup in progress")
handleFailedGoError(err, env)
env.sendEnvironmentEvent(&event.EnvironmentEvent{Message: "transition ERROR failed, forcing", EnvironmentID: env.Id().String(), Error: err})
env.setState("ERROR")
}

envTasks := env.Workflow().GetTasks()
Expand Down
26 changes: 26 additions & 0 deletions core/environment/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,14 @@ package environment
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"github.com/AliceO2Group/Control/common/logger/infologger"
pb "github.com/AliceO2Group/Control/common/protos"
"github.com/AliceO2Group/Control/core/task"
"github.com/AliceO2Group/Control/core/task/sm"
"github.com/AliceO2Group/Control/core/workflow"
"github.com/looplab/fsm"
"os"
"sort"

Expand Down Expand Up @@ -139,3 +141,27 @@ func newCriticalTasksErrorMessage(env *Environment) string {
return fmt.Sprintf("%d critical tasks transitioned to ERROR, could not determine the first one to fail", len(criticalTasksInError))
}
}

func handleFailedGoError(err error, env *Environment) {
var invalidEventErr *fsm.InvalidEventError
if errors.As(err, &invalidEventErr) {
// this case can occur if the environment is in either:
// - ERROR (env already transitioned to ERROR for another reason)
// - DONE (an error might have occurred during teardown, but it's already over, no point in spreading panic)
log.WithError(invalidEventErr).
WithField("partition", env.Id().String()).
WithField("run", env.currentRunNumber).
WithField("state", env.CurrentState()).
WithField(infologger.Level, infologger.IL_Support).
Warn("did not perform GO_ERROR transition")
} else {
// in principle this should never happen, so we log it accordingly and force the ERROR state just in case
log.WithError(err).
WithField("partition", env.Id().String()).
WithField("run", env.currentRunNumber).
WithField("state", env.CurrentState()).
WithField(infologger.Level, infologger.IL_Ops).
Error("could not perform GO_ERROR transition due to unexpected error, forcing...")
env.setState("ERROR")
}
}
50 changes: 50 additions & 0 deletions core/environment/utils_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* === This file is part of ALICE O² ===
*
* Copyright 2025 CERN and copyright holders of ALICE O².
* Author: Piotr Konopka <piotr.konopka@cern.ch>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* In applying this license CERN does not waive the privileges and
* immunities granted to it by virtue of its status as an
* Intergovernmental Organization or submit itself to any jurisdiction.
*/

package environment

import (
"github.com/looplab/fsm"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

var _ = Describe("handleFailedGoError", func() {
It("does not overwrite state for InvalidEventError", func() {
env := &Environment{}
env.Sm = fsm.NewFSM("DONE", fsm.Events{}, fsm.Callbacks{})
Expect(env.Sm.Current()).To(Equal("DONE"))

handleFailedGoError(&fsm.InvalidEventError{Event: "GO_ERROR", State: "DONE"}, env)
Expect(env.Sm.Current()).To(Equal("DONE"))
})

It("overwrites state to ERROR for other errors", func() {
env := &Environment{}
env.Sm = fsm.NewFSM("CONFIGURED", fsm.Events{}, fsm.Callbacks{})

handleFailedGoError(fsm.UnknownEventError{Event: "BOOM"}, env)
Expect(env.Sm.Current()).To(Equal("ERROR"))
})
})