Skip to content

Commit 5126853

Browse files
committed
[core][executor] fine-tuning operator logs
A bunch of OPS logs was moved to SUPPORT or DEVEL, while some of the OPS logs were made simpler. OCTRL-978
1 parent a699776 commit 5126853

File tree

9 files changed

+71
-20
lines changed

9 files changed

+71
-20
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ INSTALL_WHAT:=$(patsubst %, install_%, $(WHAT))
7070

7171
GENERATE_DIRS := ./apricot ./coconut/cmd ./common ./common/runtype ./common/system ./core ./core/integration/ccdb ./core/integration/dcs ./core/integration/ddsched ./core/integration/kafka ./core/integration/odc ./executor ./walnut ./core/integration/trg ./core/integration/bookkeeping
7272
SRC_DIRS := ./apricot ./cmd/* ./core ./coconut ./executor ./common ./configuration ./occ/peanut ./walnut
73-
TEST_DIRS := ./apricot/local ./common/gera ./common/utils/safeacks ./configuration/cfgbackend ./configuration/componentcfg ./configuration/template ./core/task/sm ./core/workflow ./core/integration/odc/fairmq ./core/integration ./core/environment
73+
TEST_DIRS := ./apricot/local ./common/gera ./common/utils ./common/utils/safeacks ./configuration/cfgbackend ./configuration/componentcfg ./configuration/template ./core/task/sm ./core/workflow ./core/integration/odc/fairmq ./core/integration ./core/environment
7474
GO_TEST_DIRS := ./core/repos ./core/integration/dcs ./common/monitoring
7575

7676
coverage:COVERAGE_PREFIX := ./coverage_results

common/utils/utils.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,3 +179,31 @@ func TruncateString(str string, length int) string {
179179

180180
return string([]rune(str)[:length])
181181
}
182+
183+
var taskClassNameRgx = regexp.MustCompile(`(?:.*/)?([^/@]+)@`) // Captures the segment between the last '/' and '@'
184+
185+
// ExtractTaskClassName extracts the task class name from the provided task name string
186+
// For example, we extract "readout" from the following string:
187+
// "alio2-cr1-hv-gw01.cern.ch:/opt/git/ControlWorkflows/tasks/readout@12b11ac4bb652e1835e3e94806a688c951691d5f#2sP21PjpfCQ"
188+
func ExtractTaskClassName(taskName string) (string, error) {
189+
190+
matches := taskClassNameRgx.FindStringSubmatch(taskName)
191+
if len(matches) < 2 {
192+
return "", fmt.Errorf("failed to extract task class name from '%s'", taskName)
193+
}
194+
195+
return matches[1], nil
196+
}
197+
198+
// TrimJitPrefix removes the JIT prefix from task class names.
199+
// For example, "jit-ad6f2b64b7502198430d7d7f93f15bf94c088cab-qc-pp-TPC-CalibQC_long" becomes "qc-pp-TPC-CalibQC_long".
200+
// If input does not contain a JIT prefix, it is returned as it is.
201+
func TrimJitPrefix(taskClassName string) string {
202+
if strings.HasPrefix(taskClassName, "jit-") {
203+
parts := strings.SplitN(taskClassName, "-", 3)
204+
if len(parts) > 2 {
205+
return parts[2]
206+
}
207+
}
208+
return taskClassName
209+
}

core/environment/environment.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1201,7 +1201,7 @@ func (env *Environment) subscribeToWfState(taskman *task.Manager) {
12011201
time.AfterFunc(500*time.Millisecond, func() { // wait 0.5s for any other tasks to go to ERROR/INACTIVE
12021202
log.WithField("partition", env.id).
12031203
WithField("level", infologger.IL_Ops).
1204-
Warn("one of the critical tasks went into ERROR state, transitioning the environment into ERROR")
1204+
Error("one of the critical tasks went into ERROR state, transitioning the environment into ERROR")
12051205
err := env.TryTransition(NewGoErrorTransition(taskman))
12061206
if err != nil {
12071207
if env.Sm.Current() == "ERROR" {

core/environment/manager.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -892,6 +892,10 @@ func (envs *Manager) TeardownEnvironment(environmentId uid.ID, force bool) error
892892
WorkflowTemplateInfo: env.GetWorkflowInfo(),
893893
})
894894

895+
log.WithFields(logrus.Fields{
896+
"partition": environmentId.String(),
897+
infologger.Level: infologger.IL_Ops,
898+
}).Info("environment teardown complete")
895899
return err
896900
}
897901

@@ -935,7 +939,7 @@ func (envs *Manager) Environment(environmentId uid.ID) (env *Environment, err er
935939

936940
func (envs *Manager) environment(environmentId uid.ID) (env *Environment, err error) {
937941
if len(environmentId) == 0 { // invalid id
938-
return nil, fmt.Errorf("invalid id: %s", environmentId)
942+
return nil, fmt.Errorf("empty env ID")
939943
}
940944
envs.mu.RLock()
941945
defer envs.mu.RUnlock()
@@ -1050,6 +1054,7 @@ func (envs *Manager) handleDeviceEvent(evt event.DeviceEvent) {
10501054
env, err := envs.environment(t.GetEnvironmentId())
10511055
if err != nil {
10521056
log.WithPrefix("scheduler").
1057+
WithField(infologger.Level, infologger.IL_Devel).
10531058
WithError(err).
10541059
Error("cannot find environment for DeviceEvent")
10551060
}
@@ -1060,11 +1065,13 @@ func (envs *Manager) handleDeviceEvent(evt event.DeviceEvent) {
10601065
} else {
10611066
log.WithPrefix("scheduler").
10621067
WithField("partition", envId.String()).
1068+
WithField(infologger.Level, infologger.IL_Devel).
10631069
Error("DeviceEvent BASIC_TASK_TERMINATED received for task with no parent role")
10641070
}
10651071
} else {
10661072
log.WithPrefix("scheduler").
10671073
WithField("partition", envId.String()).
1074+
WithField(infologger.Level, infologger.IL_Devel).
10681075
Debug("cannot find task for DeviceEvent BASIC_TASK_TERMINATED")
10691076
}
10701077

@@ -1084,6 +1091,7 @@ func (envs *Manager) handleDeviceEvent(evt event.DeviceEvent) {
10841091
log.WithPrefix("scheduler").
10851092
WithField("partition", envId.String()).
10861093
WithField("taskId", taskId.Value).
1094+
WithField(infologger.Level, infologger.IL_Devel).
10871095
Debug("cannot find task for DeviceEvent END_OF_STREAM")
10881096
return
10891097
}
@@ -1092,12 +1100,14 @@ func (envs *Manager) handleDeviceEvent(evt event.DeviceEvent) {
10921100
log.WithPrefix("scheduler").
10931101
WithField("partition", envId.String()).
10941102
WithField("taskId", taskId.Value).
1103+
WithField(infologger.Level, infologger.IL_Devel).
10951104
WithError(err).
10961105
Error("cannot find environment for DeviceEvent")
10971106
} else {
10981107
log.WithPrefix("scheduler").
10991108
WithField("partition", envId.String()).
11001109
WithField("taskId", taskId.Value).
1110+
WithField("role", t.GetParent().GetName()).
11011111
WithField("envState", env.CurrentState()).
11021112
Debug("received END_OF_STREAM event from task, trying to stop the run")
11031113
if env.CurrentState() == "RUNNING" {
@@ -1132,6 +1142,7 @@ func (envs *Manager) handleDeviceEvent(evt event.DeviceEvent) {
11321142
log.WithPrefix("scheduler").
11331143
WithField("partition", envId.String()).
11341144
WithField("taskId", taskId.Value).
1145+
WithField(infologger.Level, infologger.IL_Devel).
11351146
WithError(err).
11361147
Error("cannot find environment for DeviceEvent")
11371148
} else {
@@ -1140,6 +1151,7 @@ func (envs *Manager) handleDeviceEvent(evt event.DeviceEvent) {
11401151
WithField("taskId", taskId.Value).
11411152
WithField("taskRole", t.GetParentRolePath()).
11421153
WithField("envState", env.CurrentState()).
1154+
WithField(infologger.Level, infologger.IL_Support).
11431155
Debug("received TASK_INTERNAL_ERROR event from task, trying to stop the run")
11441156
if env.CurrentState() == "RUNNING" {
11451157
go func() {

core/environment/transition_startactivity.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ func (t StartActivityTransition) do(env *Environment) (err error) {
6060

6161
log.WithField(infologger.Run, runNumber).
6262
WithField("partition", env.Id().String()).
63+
WithField(infologger.Level, infologger.IL_Support).
6364
Info("starting new run")
6465

6566
cleanupCount := 0
@@ -123,6 +124,7 @@ func (t StartActivityTransition) do(env *Environment) (err error) {
123124

124125
log.WithField(infologger.Run, env.currentRunNumber).
125126
WithField("partition", env.Id().String()).
127+
WithField(infologger.Level, infologger.IL_Support).
126128
Info("run started")
127129
env.sendEnvironmentEvent(&event.EnvironmentEvent{
128130
EnvironmentID: env.Id().String(),

core/environment/transition_stopactivity.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ func (t StopActivityTransition) do(env *Environment) (err error) {
5656

5757
log.WithField(infologger.Run, env.currentRunNumber).
5858
WithField("partition", env.Id().String()).
59+
WithField(infologger.Level, infologger.IL_Support).
5960
Info("stopping run")
6061

6162
args := controlcommands.PropertyMap{}
@@ -89,5 +90,10 @@ func (t StopActivityTransition) do(env *Environment) (err error) {
8990
}
9091
env.sendEnvironmentEvent(&event.EnvironmentEvent{EnvironmentID: env.Id().String(), State: "CONFIGURED"})
9192

93+
log.WithField(infologger.Run, env.currentRunNumber).
94+
WithField("partition", env.Id().String()).
95+
WithField(infologger.Level, infologger.IL_Support).
96+
Info("run stopped")
97+
9298
return
9399
}

core/integration/odc/plugin.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,6 +1157,7 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
11571157
} else {
11581158
log.WithField("partition", envId).
11591159
WithField("call", "PartitionInitialize").
1160+
WithField(infologger.Level, infologger.IL_Support).
11601161
Info("odc_extract_topology_resources is set to true, plugin and resources will not be included in the ODC Run request")
11611162
}
11621163

@@ -1264,6 +1265,7 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
12641265
log.WithField("partition", envId).
12651266
WithField("call", "Configure").
12661267
WithField("runType", runType).
1268+
WithField(infologger.Level, infologger.IL_Support).
12671269
Infof("overriding run start time (orbit-reset-time) to %s for SYNTHETIC run", pdpOverrideRunStartTime)
12681270
} else {
12691271
log.WithField("partition", envId).

executor/executable/basictaskcommon.go

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"bytes"
2929
"encoding/json"
3030
"errors"
31+
"github.com/AliceO2Group/Control/common/utils"
3132
"io"
3233
"os/exec"
3334
"syscall"
@@ -178,19 +179,20 @@ func (t *basicTaskBase) startBasicTask() (err error) {
178179
}
179180

180181
if err != nil {
182+
taskClassName, _ := utils.ExtractTaskClassName(t.ti.Name)
183+
log.WithField("partition", t.knownEnvironmentId.String()).
184+
WithField("detector", t.knownDetector).
185+
WithField("level", infologger.IL_Ops).
186+
Errorf("task '%s' terminated with error: %s", taskClassName, err.Error())
181187
log.WithField("partition", t.knownEnvironmentId.String()).
182188
WithFields(logrus.Fields{
183-
"id": t.ti.TaskID.Value,
184-
"task": t.ti.Name,
185-
"error": err.Error(),
186-
"level": infologger.IL_Devel,
189+
"id": t.ti.TaskID.Value,
190+
"task": t.ti.Name,
191+
"command": tciCommandStr,
192+
"error": err.Error(),
193+
"level": infologger.IL_Devel,
187194
}).
188-
Error("task terminated with error")
189-
log.WithField("partition", t.knownEnvironmentId.String()).
190-
WithField("level", infologger.IL_Ops).
191-
Errorf("task terminated with error: %s %s",
192-
tciCommandStr,
193-
err.Error())
195+
Error("task terminated with error (details)")
194196
pendingState = mesos.TASK_FAILED
195197
}
196198

executor/executable/controllabletask.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,11 @@ func (t *ControllableTask) Launch() error {
573573

574574
pendingState := mesos.TASK_FINISHED
575575
if err != nil {
576+
taskClassName, _ := utils.ExtractTaskClassName(t.ti.Name)
577+
log.WithField("partition", t.knownEnvironmentId.String()).
578+
WithField("detector", t.knownDetector).
579+
WithField("level", infologger.IL_Ops).
580+
Errorf("task '%s' terminated with error: %s", utils.TrimJitPrefix(taskClassName), err.Error())
576581
log.WithField("partition", t.knownEnvironmentId.String()).
577582
WithField("detector", t.knownDetector).
578583
WithFields(logrus.Fields{
@@ -582,13 +587,7 @@ func (t *ControllableTask) Launch() error {
582587
"error": err.Error(),
583588
"level": infologger.IL_Devel,
584589
}).
585-
Error("task terminated with error")
586-
log.WithField("partition", t.knownEnvironmentId.String()).
587-
WithField("detector", t.knownDetector).
588-
WithField("level", infologger.IL_Ops).
589-
Errorf("task terminated with error: %s %s",
590-
truncatedCmd,
591-
err.Error())
590+
Error("task terminated with error (details):")
592591
pendingState = mesos.TASK_FAILED
593592
}
594593

0 commit comments

Comments
 (0)