Skip to content

Commit c4bca14

Browse files
committed
Improve exit handling of O2HitMerger
So far, the exit status of O2HitMerger was not analysed. This could lead to situations where O2HitMerger was killed by the OS due to out-of-memory, yet the o2-sim simulator still exited as "successful". This commit improves the handling. Problems in O2HitMerger will lead to exit code 1 of o2-sim.
1 parent dbacf7b commit c4bca14

File tree

2 files changed

+26
-12
lines changed

2 files changed

+26
-12
lines changed

run/O2HitMerger.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -87,15 +87,6 @@ namespace o2
8787
namespace devices
8888
{
8989

90-
// signal handler
91-
void sighandler(int signal)
92-
{
93-
if (signal == SIGSEGV) {
94-
LOG(warn) << "segmentation violation ... just exit without coredump in order not to hang";
95-
raise(SIGKILL);
96-
}
97-
}
98-
9990
class O2HitMerger : public fair::mq::Device
10091
{
10192

@@ -130,7 +121,6 @@ class O2HitMerger : public fair::mq::Device
130121
void InitTask() final
131122
{
132123
LOG(info) << "INIT HIT MERGER";
133-
// signal(SIGSEGV, sighandler);
134124
ROOT::EnableThreadSafety();
135125

136126
std::string outfilename("o2sim_merged_hits.root"); // default name
@@ -764,7 +754,6 @@ class O2HitMerger : public fair::mq::Device
764754
eventheader->putInfo("prims_eta_0.8_pi", eta0Point8CounterPi);
765755
eventheader->putInfo("prims_total", prims);
766756
};
767-
768757
reorderAndMergeMCTracks(flusheventID, mOutTree, nprimaries, subevOrdered, mcheaderhook, eventheader);
769758

770759
if (mOutTree) {

run/o2sim_parallel.cxx

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -732,7 +732,12 @@ int main(int argc, char* argv[])
732732
int status, cpid;
733733
// wait just blocks and waits until any child returns; but we make sure to wait until merger is here
734734
bool errored = false;
735-
while ((cpid = wait(&status)) != mergerpid) {
735+
// wait at least until mergerpid is reaped
736+
while ((cpid = wait(&status)) != -1) {
737+
if (cpid == mergerpid) {
738+
break; // Defer handling of mergerpid exit status until after the loop
739+
}
740+
736741
if (WEXITSTATUS(status) || WIFSIGNALED(status)) {
737742
if (!shutdown_initiated) {
738743
LOG(info) << "Process " << cpid << " EXITED WITH CODE " << WEXITSTATUS(status) << " SIGNALED "
@@ -753,6 +758,26 @@ int main(int argc, char* argv[])
753758
}
754759
}
755760
}
761+
762+
// Handle mergerpid status separately
763+
if (cpid == mergerpid) {
764+
if (WIFEXITED(status)) {
765+
// anything other than 128 is indicative of error
766+
if (WEXITSTATUS(status) != 128) {
767+
LOG(error) << "Merger process exited with abnormal code " << WEXITSTATUS(status);
768+
errored = true;
769+
}
770+
} else if (WIFSIGNALED(status)) {
771+
auto sig = WTERMSIG(status);
772+
if (sig == SIGKILL || sig == SIGBUS || sig == SIGSEGV || sig == SIGABRT) {
773+
LOG(error) << "Merger process terminated through abnormal signal " << WTERMSIG(status);
774+
errored = true;
775+
}
776+
} else {
777+
LOG(warning) << "Merger process exited with unexpected status.";
778+
}
779+
}
780+
756781
// This marks the actual end of the computation (since results are available)
757782
LOG(info) << "Merger process " << mergerpid << " returned";
758783
LOG(info) << "Simulation process took " << timer.RealTime() << " s";

0 commit comments

Comments
 (0)