@@ -732,27 +732,51 @@ int main(int argc, char* argv[])
732732 int status, cpid;
733733 // wait() blocks until any child process terminates; we keep waiting until the merger process has been reaped
734734 bool errored = false ;
735- while ((cpid = wait (&status)) != mergerpid) {
735+ // wait at least until mergerpid is reaped
736+ while ((cpid = wait (&status)) != -1 ) {
737+ if (cpid == mergerpid) {
738+ break ; // Defer handling of mergerpid exit status until after the loop
739+ }
740+
736741 if (WEXITSTATUS (status) || WIFSIGNALED (status)) {
737742 if (!shutdown_initiated) {
738- LOG (info) << " Process " << cpid << " EXITED WITH CODE " << WEXITSTATUS (status) << " SIGNALED "
739- << WIFSIGNALED (status) << " SIGNAL " << WTERMSIG (status);
743+ LOG (info) << " Process " << cpid << " EXITED WITH CODE " << WEXITSTATUS (status)
744+ << " SIGNALED " << WIFSIGNALED (status) << " SIGNAL " << WTERMSIG (status);
740745
741- // we bring down all processes if one of them had problems or got a termination signal
742- // if (WTERMSIG(status) == SIGABRT || WTERMSIG(status) == SIGSEGV || WTERMSIG(status) == SIGBUS || WTERMSIG(status) == SIGTERM) {
743746 LOG (info) << " Problem detected (or child received termination signal) ... shutting down whole system " ;
744747 for (auto p : gChildProcesses ) {
745748 LOG (info) << " TERMINATING " << p;
746749 if (killpg (p, 0 ) == 0 ) {
747- killpg (p, SIGTERM); // <--- makes sure to shutdown "unknown" child pids via the group property
750+ killpg (p, SIGTERM);
748751 }
749752 }
750753 LOG (error) << " SHUTTING DOWN DUE TO SIGNALED EXIT IN COMPONENT " << cpid;
751- o2::simpubsub::publishMessage (externalpublishchannel, o2::simpubsub::simStatusString (" O2SIM" , " STATE" , " FAILURE" ));
754+ o2::simpubsub::publishMessage (
755+ externalpublishchannel,
756+ o2::simpubsub::simStatusString (" O2SIM" , " STATE" , " FAILURE" ));
757+ errored = true ;
758+ }
759+ }
760+ }
761+ // Handle mergerpid status separately
762+ if (cpid == mergerpid) {
763+ if (WIFEXITED (status)) {
764+ // anything other than 128 is indicative of error
765+ if (WEXITSTATUS (status) != 128 ) {
766+ LOG (error) << " Merger process exited with abnormal code " << WEXITSTATUS (status);
767+ errored = true ;
768+ }
769+ } else if (WIFSIGNALED (status)) {
770+ auto sig = WTERMSIG (status);
771+ if (sig == SIGKILL || sig == SIGBUS || sig == SIGSEGV || sig == SIGABRT) {
772+ LOG (error) << " Merger process terminated through abnormal signal " << WTERMSIG (status);
752773 errored = true ;
753774 }
775+ } else {
776+ LOG (warning) << " Merger process exited with unexpected status." ;
754777 }
755778 }
779+
756780 // This marks the actual end of the computation (since results are available)
757781 LOG (info) << " Merger process " << mergerpid << " returned" ;
758782 LOG (info) << " Simulation process took " << timer.RealTime () << " s" ;
0 commit comments