Skip to content

Commit 46a932c

Browse files
committed
Improve exit handling of O2HitMerger
So far, the exit status of O2HitMerger was not analysed. This could lead to situations where O2HitMerger was killed by the OS due to an out-of-memory condition, yet the o2-sim simulator still exited as "successful". This commit improves the handling: problems in O2HitMerger now lead to exit code 1 of o2-sim.
1 parent ad4cc98 commit 46a932c

File tree

2 files changed

+26
-13
lines changed

2 files changed

+26
-13
lines changed

run/O2HitMerger.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -87,15 +87,6 @@ namespace o2
8787
namespace devices
8888
{
8989

90-
// signal handler
91-
void sighandler(int signal)
92-
{
93-
if (signal == SIGSEGV) {
94-
LOG(warn) << "segmentation violation ... just exit without coredump in order not to hang";
95-
raise(SIGKILL);
96-
}
97-
}
98-
9990
class O2HitMerger : public fair::mq::Device
10091
{
10192

@@ -130,7 +121,6 @@ class O2HitMerger : public fair::mq::Device
130121
void InitTask() final
131122
{
132123
LOG(info) << "INIT HIT MERGER";
133-
// signal(SIGSEGV, sighandler);
134124
ROOT::EnableThreadSafety();
135125

136126
std::string outfilename("o2sim_merged_hits.root"); // default name
@@ -764,7 +754,6 @@ class O2HitMerger : public fair::mq::Device
764754
eventheader->putInfo("prims_eta_0.8_pi", eta0Point8CounterPi);
765755
eventheader->putInfo("prims_total", prims);
766756
};
767-
768757
reorderAndMergeMCTracks(flusheventID, mOutTree, nprimaries, subevOrdered, mcheaderhook, eventheader);
769758

770759
if (mOutTree) {

run/o2sim_parallel.cxx

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -732,7 +732,12 @@ int main(int argc, char* argv[])
732732
int status, cpid;
733733
// wait just blocks and waits until any child returns; but we make sure to wait until merger is here
734734
bool errored = false;
735-
while ((cpid = wait(&status)) != mergerpid) {
735+
// wait at least until mergerpid is reaped
736+
while ((cpid = wait(&status)) != -1) {
737+
if (cpid == mergerpid) {
738+
break; // Defer handling of mergerpid exit status until after the loop
739+
}
740+
736741
if (WEXITSTATUS(status) || WIFSIGNALED(status)) {
737742
if (!shutdown_initiated) {
738743
LOG(info) << "Process " << cpid << " EXITED WITH CODE " << WEXITSTATUS(status) << " SIGNALED "
@@ -748,11 +753,30 @@ int main(int argc, char* argv[])
748753
}
749754
}
750755
LOG(error) << "SHUTTING DOWN DUE TO SIGNALED EXIT IN COMPONENT " << cpid;
751-
o2::simpubsub::publishMessage(externalpublishchannel, o2::simpubsub::simStatusString("O2SIM", "STATE", "FAILURE"));
756+
o2::simpubsub::publishMessage(externalpublishchannel,o2::simpubsub::simStatusString("O2SIM", "STATE", "FAILURE"));
757+
errored = true;
758+
}
759+
}
760+
}
761+
// Handle mergerpid status separately
762+
if (cpid == mergerpid) {
763+
if (WIFEXITED(status)) {
764+
// anything other than 128 is indicative of error
765+
if (WEXITSTATUS(status) != 128) {
766+
LOG(error) << "Merger process exited with abnormal code " << WEXITSTATUS(status);
767+
errored = true;
768+
}
769+
} else if (WIFSIGNALED(status)) {
770+
auto sig = WTERMSIG(status);
771+
if (sig == SIGKILL || sig == SIGBUS || sig == SIGSEGV || sig == SIGABRT) {
772+
LOG(error) << "Merger process terminated through abnormal signal " << WTERMSIG(status);
752773
errored = true;
753774
}
775+
} else {
776+
LOG(warning) << "Merger process exited with unexpected status.";
754777
}
755778
}
779+
756780
// This marks the actual end of the computation (since results are available)
757781
LOG(info) << "Merger process " << mergerpid << " returned";
758782
LOG(info) << "Simulation process took " << timer.RealTime() << " s";

0 commit comments

Comments
 (0)