Skip to content

Commit 7d426e7

Browse files
committed
Perform GO_ERROR after failed STOP_ACTIVITY and RESET during env destruction
Even if we destroy an environment, it is good to recognize that it was not a healthy shutdown, e.g. for bookkeeping reasons. Also, we might miss out on some necessary calls to integrated services by not going to ERROR. Fixes OCTRL-1063.
1 parent a3fd1f2 commit 7d426e7

File tree

5 files changed

+33
-17
lines changed

5 files changed

+33
-17
lines changed

core/environment/environment.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,7 +1233,7 @@ func (env *Environment) subscribeToWfState(taskman *task.Manager) {
12331233
)
12341234
err := env.TryTransition(NewGoErrorTransition(taskman))
12351235
if err != nil {
1236-
handleFailedGoError(err, env)
1236+
HandleFailedGoError(err, env)
12371237
}
12381238
})
12391239
break WORKFLOW_STATE_LOOP
@@ -1461,7 +1461,7 @@ func (env *Environment) scheduleAutoStopTransition() (scheduled bool, expected t
14611461

14621462
err = env.TryTransition(NewGoErrorTransition(ManagerInstance().taskman))
14631463
if err != nil {
1464-
handleFailedGoError(err, env)
1464+
HandleFailedGoError(err, env)
14651465
}
14661466
return
14671467
}

core/environment/manager.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -496,7 +496,7 @@ func (envs *Manager) CreateEnvironment(workflowPath string, userVars map[string]
496496
envs.taskman),
497497
)
498498
if err != nil {
499-
handleFailedGoError(err, env)
499+
HandleFailedGoError(err, env)
500500
}
501501

502502
envTasks := env.Workflow().GetTasks()
@@ -600,7 +600,7 @@ func (envs *Manager) CreateEnvironment(workflowPath string, userVars map[string]
600600
envs.taskman),
601601
)
602602
if errTxErr != nil {
603-
handleFailedGoError(errTxErr, env)
603+
HandleFailedGoError(errTxErr, env)
604604
}
605605
envTasks := env.Workflow().GetTasks()
606606
// TeardownEnvironment manages the envs.mu internally
@@ -1058,7 +1058,7 @@ func (envs *Manager) handleIntegratedServiceEvent(evt event.IntegratedServiceEve
10581058
)
10591059
err = env.TryTransition(NewGoErrorTransition(envs.taskman))
10601060
if err != nil {
1061-
handleFailedGoError(err, env)
1061+
HandleFailedGoError(err, env)
10621062
}
10631063
}
10641064
}()
@@ -1114,7 +1114,7 @@ func (envs *Manager) handleLhcEvents(evt event.IntegratedServiceEvent) {
11141114
if env.CurrentState() != "ERROR" {
11151115
err = env.TryTransition(NewGoErrorTransition(envs.taskman))
11161116
if err != nil {
1117-
handleFailedGoError(err, env)
1117+
HandleFailedGoError(err, env)
11181118
}
11191119
}
11201120
}
@@ -1468,7 +1468,7 @@ func (envs *Manager) CreateAutoEnvironment(workflowPath string, userVars map[str
14681468
envs.taskman),
14691469
)
14701470
if err != nil {
1471-
handleFailedGoError(err, env)
1471+
HandleFailedGoError(err, env)
14721472
env.sendEnvironmentEvent(&event.EnvironmentEvent{Message: "transition ERROR failed, forcing", EnvironmentID: env.Id().String(), Error: err})
14731473
}
14741474

core/environment/utils.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ func newCriticalTasksErrorMessage(env *Environment) string {
143143
}
144144
}
145145

146-
func handleFailedGoError(err error, env *Environment) {
146+
func HandleFailedGoError(err error, env *Environment) {
147147
var invalidEventErr fsm.InvalidEventError
148148
if errors.As(err, &invalidEventErr) {
149149
// this case can occur if the environment is in either:

core/environment/utils_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,21 +30,21 @@ import (
3030
. "github.com/onsi/gomega"
3131
)
3232

33-
var _ = Describe("handleFailedGoError", func() {
33+
var _ = Describe("HandleFailedGoError", func() {
3434
It("does not overwrite state for InvalidEventError", func() {
3535
env := &Environment{}
3636
env.Sm = fsm.NewFSM("DONE", fsm.Events{}, fsm.Callbacks{})
3737
Expect(env.Sm.Current()).To(Equal("DONE"))
3838

39-
handleFailedGoError(fsm.InvalidEventError{Event: "GO_ERROR", State: "DONE"}, env)
39+
HandleFailedGoError(fsm.InvalidEventError{Event: "GO_ERROR", State: "DONE"}, env)
4040
Expect(env.Sm.Current()).To(Equal("DONE"))
4141
})
4242

4343
It("overwrites state to ERROR for other errors", func() {
4444
env := &Environment{}
4545
env.Sm = fsm.NewFSM("CONFIGURED", fsm.Events{}, fsm.Callbacks{})
4646

47-
handleFailedGoError(fsm.UnknownEventError{Event: "BOOM"}, env)
47+
HandleFailedGoError(fsm.UnknownEventError{Event: "BOOM"}, env)
4848
Expect(env.Sm.Current()).To(Equal("ERROR"))
4949
})
5050
})

core/server.go

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -718,10 +718,18 @@ func (m *RpcServer) DestroyEnvironment(cxt context.Context, req *pb.DestroyEnvir
718718
}
719719

720720
if req.AllowInRunningState && env.CurrentState() == "RUNNING" {
721-
err = env.TryTransition(environment.MakeTransition(m.state.taskman, pb.ControlEnvironmentRequest_STOP_ACTIVITY))
721+
err = env.TryTransition(environment.NewStopActivityTransition(m.state.taskman))
722722
if err != nil {
723-
log.WithField("partition", env.Id().String()).
724-
Warn("could not perform STOP transition for environment teardown, forcing")
723+
log.WithError(err).
724+
WithField("partition", env.Id().String()).
725+
Warn("could not perform STOP transition for environment teardown, going to ERROR, then forcing")
726+
the.EventWriterWithTopic(topic.Environment).WriteEvent(
727+
environment.NewEnvGoErrorEvent(env, "STOP_ACTIVITY during environment destruction failed"),
728+
)
729+
err = env.TryTransition(environment.NewGoErrorTransition(m.state.taskman))
730+
if err != nil {
731+
environment.HandleFailedGoError(err, env)
732+
}
725733
reply, err = m.doTeardownAndCleanup(env, true /*force*/, false /*keepTasks*/)
726734
return
727735
}
@@ -746,10 +754,18 @@ func (m *RpcServer) DestroyEnvironment(cxt context.Context, req *pb.DestroyEnvir
746754

747755
// This might transition to STANDBY if needed, or do nothing if we're already there
748756
if env.CurrentState() == "CONFIGURED" {
749-
err = env.TryTransition(environment.MakeTransition(m.state.taskman, pb.ControlEnvironmentRequest_RESET))
757+
err = env.TryTransition(environment.NewResetTransition(m.state.taskman))
750758
if err != nil {
751-
log.WithField("partition", env.Id().String()).
752-
Warnf("cannot teardown environment in state %s, forcing", env.CurrentState())
759+
log.WithError(err).
760+
WithField("partition", env.Id().String()).
761+
Warnf("cannot teardown environment in state %s, going to ERROR, then forcing", env.CurrentState())
762+
the.EventWriterWithTopic(topic.Environment).WriteEvent(
763+
environment.NewEnvGoErrorEvent(env, "RESET during environment destruction failed"),
764+
)
765+
err = env.TryTransition(environment.NewGoErrorTransition(m.state.taskman))
766+
if err != nil {
767+
environment.HandleFailedGoError(err, env)
768+
}
753769
reply, err = m.doTeardownAndCleanup(env, true /*force*/, false /*keepTasks*/)
754770
return
755771
}

0 commit comments

Comments
 (0)