Skip to content

Commit 0a97792

Browse files
committed
Improve exit handling of O2HitMerger
So far, the exit status of O2HitMerger was not analysed. This could lead to situations where O2HitMerger was killed by the OS due to an out-of-memory condition, yet the o2-sim simulator still exited as "successful". This commit improves the handling. Problems in O2HitMerger will lead to exit code 1 of o2-sim.
1 parent ad4cc98 commit 0a97792

File tree

2 files changed

+31
-18
lines changed

2 files changed

+31
-18
lines changed

run/O2HitMerger.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -87,15 +87,6 @@ namespace o2
8787
namespace devices
8888
{
8989

90-
// signal handler
91-
void sighandler(int signal)
92-
{
93-
if (signal == SIGSEGV) {
94-
LOG(warn) << "segmentation violation ... just exit without coredump in order not to hang";
95-
raise(SIGKILL);
96-
}
97-
}
98-
9990
class O2HitMerger : public fair::mq::Device
10091
{
10192

@@ -130,7 +121,6 @@ class O2HitMerger : public fair::mq::Device
130121
void InitTask() final
131122
{
132123
LOG(info) << "INIT HIT MERGER";
133-
// signal(SIGSEGV, sighandler);
134124
ROOT::EnableThreadSafety();
135125

136126
std::string outfilename("o2sim_merged_hits.root"); // default name
@@ -764,7 +754,6 @@ class O2HitMerger : public fair::mq::Device
764754
eventheader->putInfo("prims_eta_0.8_pi", eta0Point8CounterPi);
765755
eventheader->putInfo("prims_total", prims);
766756
};
767-
768757
reorderAndMergeMCTracks(flusheventID, mOutTree, nprimaries, subevOrdered, mcheaderhook, eventheader);
769758

770759
if (mOutTree) {

run/o2sim_parallel.cxx

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -732,27 +732,51 @@ int main(int argc, char* argv[])
732732
int status, cpid;
733733
// wait just blocks and waits until any child returns; but we make sure to wait until merger is here
734734
bool errored = false;
735-
while ((cpid = wait(&status)) != mergerpid) {
735+
// wait at least until mergerpid is reaped
736+
while ((cpid = wait(&status)) != -1) {
737+
if (cpid == mergerpid) {
738+
break; // Defer handling of mergerpid exit status until after the loop
739+
}
740+
736741
if (WEXITSTATUS(status) || WIFSIGNALED(status)) {
737742
if (!shutdown_initiated) {
738-
LOG(info) << "Process " << cpid << " EXITED WITH CODE " << WEXITSTATUS(status) << " SIGNALED "
739-
<< WIFSIGNALED(status) << " SIGNAL " << WTERMSIG(status);
743+
LOG(info) << "Process " << cpid << " EXITED WITH CODE " << WEXITSTATUS(status)
744+
<< " SIGNALED " << WIFSIGNALED(status) << " SIGNAL " << WTERMSIG(status);
740745

741-
// we bring down all processes if one of them had problems or got a termination signal
742-
// if (WTERMSIG(status) == SIGABRT || WTERMSIG(status) == SIGSEGV || WTERMSIG(status) == SIGBUS || WTERMSIG(status) == SIGTERM) {
743746
LOG(info) << "Problem detected (or child received termination signal) ... shutting down whole system ";
744747
for (auto p : gChildProcesses) {
745748
LOG(info) << "TERMINATING " << p;
746749
if (killpg(p, 0) == 0) {
747-
killpg(p, SIGTERM); // <--- makes sure to shutdown "unknown" child pids via the group property
750+
killpg(p, SIGTERM);
748751
}
749752
}
750753
LOG(error) << "SHUTTING DOWN DUE TO SIGNALED EXIT IN COMPONENT " << cpid;
751-
o2::simpubsub::publishMessage(externalpublishchannel, o2::simpubsub::simStatusString("O2SIM", "STATE", "FAILURE"));
754+
o2::simpubsub::publishMessage(
755+
externalpublishchannel,
756+
o2::simpubsub::simStatusString("O2SIM", "STATE", "FAILURE"));
757+
errored = true;
758+
}
759+
}
760+
}
761+
// Handle mergerpid status separately
762+
if (cpid == mergerpid) {
763+
if (WIFEXITED(status)) {
764+
// anything other than 128 is indicative of error
765+
if (WEXITSTATUS(status) != 128) {
766+
LOG(error) << "Merger process exited with abnormal code " << WEXITSTATUS(status);
767+
errored = true;
768+
}
769+
} else if (WIFSIGNALED(status)) {
770+
auto sig = WTERMSIG(status);
771+
if (sig == SIGKILL || sig == SIGBUS || sig == SIGSEGV || sig == SIGABRT) {
772+
LOG(error) << "Merger process terminated through abnormal signal " << WTERMSIG(status);
752773
errored = true;
753774
}
775+
} else {
776+
LOG(warning) << "Merger process exited with unexpected status.";
754777
}
755778
}
779+
756780
// This marks the actual end of the computation (since results are available)
757781
LOG(info) << "Merger process " << mergerpid << " returned";
758782
LOG(info) << "Simulation process took " << timer.RealTime() << " s";

0 commit comments

Comments
 (0)