Skip to content

Commit 23fa86b

Browse files
committed
Improve exit handling of O2HitMerger
So far, the exit status of O2HitMerger was not analysed. This could lead to situations where O2HitMerger was killed by the OS due to an out-of-memory condition, yet the o2-sim simulator still exited as "successful". This commit improves the handling. Problems in O2HitMerger will lead to exit code 1 of o2-sim.
1 parent ad4cc98 commit 23fa86b

File tree

2 files changed

+30
-15
lines changed

2 files changed

+30
-15
lines changed

run/O2HitMerger.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -87,15 +87,6 @@ namespace o2
8787
namespace devices
8888
{
8989

90-
// signal handler
91-
void sighandler(int signal)
92-
{
93-
if (signal == SIGSEGV) {
94-
LOG(warn) << "segmentation violation ... just exit without coredump in order not to hang";
95-
raise(SIGKILL);
96-
}
97-
}
98-
9990
class O2HitMerger : public fair::mq::Device
10091
{
10192

@@ -130,7 +121,6 @@ class O2HitMerger : public fair::mq::Device
130121
void InitTask() final
131122
{
132123
LOG(info) << "INIT HIT MERGER";
133-
// signal(SIGSEGV, sighandler);
134124
ROOT::EnableThreadSafety();
135125

136126
std::string outfilename("o2sim_merged_hits.root"); // default name
@@ -764,7 +754,6 @@ class O2HitMerger : public fair::mq::Device
764754
eventheader->putInfo("prims_eta_0.8_pi", eta0Point8CounterPi);
765755
eventheader->putInfo("prims_total", prims);
766756
};
767-
768757
reorderAndMergeMCTracks(flusheventID, mOutTree, nprimaries, subevOrdered, mcheaderhook, eventheader);
769758

770759
if (mOutTree) {

run/o2sim_parallel.cxx

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -732,11 +732,16 @@ int main(int argc, char* argv[])
732732
int status, cpid;
733733
// wait just blocks and waits until any child returns; but we make sure to wait until merger is here
734734
bool errored = false;
735-
while ((cpid = wait(&status)) != mergerpid) {
735+
// wait at least until mergerpid is reaped
736+
while ((cpid = wait(&status)) != -1) {
737+
if (cpid == mergerpid) {
738+
break; // Defer handling of mergerpid exit status until after the loop
739+
}
740+
736741
if (WEXITSTATUS(status) || WIFSIGNALED(status)) {
737742
if (!shutdown_initiated) {
738-
LOG(info) << "Process " << cpid << " EXITED WITH CODE " << WEXITSTATUS(status) << " SIGNALED "
739-
<< WIFSIGNALED(status) << " SIGNAL " << WTERMSIG(status);
743+
LOG(info) << "Process " << cpid << " EXITED WITH CODE " << WEXITSTATUS(status)
744+
<< " SIGNALED " << WIFSIGNALED(status) << " SIGNAL " << WTERMSIG(status);
740745

741746
// we bring down all processes if one of them had problems or got a termination signal
742747
// if (WTERMSIG(status) == SIGABRT || WTERMSIG(status) == SIGSEGV || WTERMSIG(status) == SIGBUS || WTERMSIG(status) == SIGTERM) {
@@ -748,11 +753,32 @@ int main(int argc, char* argv[])
748753
}
749754
}
750755
LOG(error) << "SHUTTING DOWN DUE TO SIGNALED EXIT IN COMPONENT " << cpid;
751-
o2::simpubsub::publishMessage(externalpublishchannel, o2::simpubsub::simStatusString("O2SIM", "STATE", "FAILURE"));
756+
o2::simpubsub::publishMessage(
757+
externalpublishchannel,
758+
o2::simpubsub::simStatusString("O2SIM", "STATE", "FAILURE"));
759+
errored = true;
760+
}
761+
}
762+
}
763+
// Handle mergerpid status separately
764+
if (cpid == mergerpid) {
765+
if (WIFEXITED(status)) {
766+
// anything other than 128 is indicative of error
767+
if (WEXITSTATUS(status) != 128) {
768+
LOG(error) << "Merger process exited with abnormal code " << WEXITSTATUS(status);
769+
errored = true;
770+
}
771+
} else if (WIFSIGNALED(status)) {
772+
auto sig = WTERMSIG(status);
773+
if (sig == SIGKILL || sig == SIGBUS || sig == SIGSEGV || sig == SIGABRT) {
774+
LOG(error) << "Merger process terminated through abnormal signal " << WTERMSIG(status);
752775
errored = true;
753776
}
777+
} else {
778+
LOG(warning) << "Merger process exited with unexpected status.";
754779
}
755780
}
781+
756782
// This marks the actual end of the computation (since results are available)
757783
LOG(info) << "Merger process " << mergerpid << " returned";
758784
LOG(info) << "Simulation process took " << timer.RealTime() << " s";

0 commit comments

Comments
 (0)