@@ -732,11 +732,16 @@ int main(int argc, char* argv[])
732732 int status, cpid;
733733 // wait just blocks and waits until any child returns; but we make sure to wait until merger is here
734734 bool errored = false ;
735- while ((cpid = wait (&status)) != mergerpid) {
735+ // wait at least until mergerpid is reaped
736+ while ((cpid = wait (&status)) != -1 ) {
737+ if (cpid == mergerpid) {
738+ break ; // Defer handling of mergerpid exit status until after the loop
739+ }
740+
736741 if (WEXITSTATUS (status) || WIFSIGNALED (status)) {
737742 if (!shutdown_initiated) {
738- LOG (info) << " Process " << cpid << " EXITED WITH CODE " << WEXITSTATUS (status) << " SIGNALED "
739- << WIFSIGNALED (status) << " SIGNAL " << WTERMSIG (status);
743+ LOG (info) << " Process " << cpid << " EXITED WITH CODE " << WEXITSTATUS (status)
744+ << " SIGNALED " << WIFSIGNALED (status) << " SIGNAL " << WTERMSIG (status);
740745
741746 // we bring down all processes if one of them had problems or got a termination signal
742747 // if (WTERMSIG(status) == SIGABRT || WTERMSIG(status) == SIGSEGV || WTERMSIG(status) == SIGBUS || WTERMSIG(status) == SIGTERM) {
@@ -748,11 +753,32 @@ int main(int argc, char* argv[])
748753 }
749754 }
750755 LOG (error) << " SHUTTING DOWN DUE TO SIGNALED EXIT IN COMPONENT " << cpid;
751- o2::simpubsub::publishMessage (externalpublishchannel, o2::simpubsub::simStatusString (" O2SIM" , " STATE" , " FAILURE" ));
756+ o2::simpubsub::publishMessage (
757+ externalpublishchannel,
758+ o2::simpubsub::simStatusString (" O2SIM" , " STATE" , " FAILURE" ));
759+ errored = true ;
760+ }
761+ }
762+ }
763+ // Handle mergerpid status separately
764+ if (cpid == mergerpid) {
765+ if (WIFEXITED (status)) {
766+ // anything other than 128 is indicative of error
767+ if (WEXITSTATUS (status) != 128 ) {
768+ LOG (error) << " Merger process exited with abnormal code " << WEXITSTATUS (status);
769+ errored = true ;
770+ }
771+ } else if (WIFSIGNALED (status)) {
772+ auto sig = WTERMSIG (status);
773+ if (sig == SIGKILL || sig == SIGBUS || sig == SIGSEGV || sig == SIGABRT) {
774+ LOG (error) << " Merger process terminated through abnormal signal " << WTERMSIG (status);
752775 errored = true ;
753776 }
777+ } else {
778+ LOG (warning) << " Merger process exited with unexpected status." ;
754779 }
755780 }
781+
756782 // This marks the actual end of the computation (since results are available)
757783 LOG (info) << " Merger process " << mergerpid << " returned" ;
758784 LOG (info) << " Simulation process took " << timer.RealTime () << " s" ;
0 commit comments