From 46a932c81ab7fd6b314abf00cdc9e84c61fa42ba Mon Sep 17 00:00:00 2001 From: swenzel Date: Thu, 12 Jun 2025 11:36:20 +0200 Subject: [PATCH 1/2] Improve exit handling of O2HitMerger So far, the exit status of O2HitMerger was not analysed. This could lead to situations where O2HitMerger was killed by the OS due to out-of-memory, yet the o2-sim simulator still exited as "successful". This commit improves the handling. Problems in O2HitMerger will lead to exit code 1 of o2-sim. --- run/O2HitMerger.h | 11 ----------- run/o2sim_parallel.cxx | 28 ++++++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/run/O2HitMerger.h b/run/O2HitMerger.h index c2a094bfc9e54..520873e7aaafe 100644 --- a/run/O2HitMerger.h +++ b/run/O2HitMerger.h @@ -87,15 +87,6 @@ namespace o2 namespace devices { -// signal handler -void sighandler(int signal) -{ - if (signal == SIGSEGV) { - LOG(warn) << "segmentation violation ... just exit without coredump in order not to hang"; - raise(SIGKILL); - } -} - class O2HitMerger : public fair::mq::Device { @@ -130,7 +121,6 @@ class O2HitMerger : public fair::mq::Device void InitTask() final { LOG(info) << "INIT HIT MERGER"; - // signal(SIGSEGV, sighandler); ROOT::EnableThreadSafety(); std::string outfilename("o2sim_merged_hits.root"); // default name @@ -764,7 +754,6 @@ class O2HitMerger : public fair::mq::Device eventheader->putInfo("prims_eta_0.8_pi", eta0Point8CounterPi); eventheader->putInfo("prims_total", prims); }; - reorderAndMergeMCTracks(flusheventID, mOutTree, nprimaries, subevOrdered, mcheaderhook, eventheader); if (mOutTree) { diff --git a/run/o2sim_parallel.cxx b/run/o2sim_parallel.cxx index 3e28428938b20..fbb9937ecbcef 100644 --- a/run/o2sim_parallel.cxx +++ b/run/o2sim_parallel.cxx @@ -732,7 +732,12 @@ int main(int argc, char* argv[]) int status, cpid; // wait just blocks and waits until any child returns; but we make sure to wait until merger is here bool errored = false; - while ((cpid = wait(&status)) != 
mergerpid) { + // wait at least until mergerpid is reaped + while ((cpid = wait(&status)) != -1) { + if (cpid == mergerpid) { + break; // Defer handling of mergerpid exit status until after the loop + } + if (WEXITSTATUS(status) || WIFSIGNALED(status)) { if (!shutdown_initiated) { LOG(info) << "Process " << cpid << " EXITED WITH CODE " << WEXITSTATUS(status) << " SIGNALED " @@ -748,11 +753,30 @@ int main(int argc, char* argv[]) } } LOG(error) << "SHUTTING DOWN DUE TO SIGNALED EXIT IN COMPONENT " << cpid; - o2::simpubsub::publishMessage(externalpublishchannel, o2::simpubsub::simStatusString("O2SIM", "STATE", "FAILURE")); + o2::simpubsub::publishMessage(externalpublishchannel,o2::simpubsub::simStatusString("O2SIM", "STATE", "FAILURE")); + errored = true; + } + } + } + // Handle mergerpid status separately + if (cpid == mergerpid) { + if (WIFEXITED(status)) { + // anything other than 128 is indicative of error + if (WEXITSTATUS(status) != 128) { + LOG(error) << "Merger process exited with abnormal code " << WEXITSTATUS(status); + errored = true; + } + } else if (WIFSIGNALED(status)) { + auto sig = WTERMSIG(status); + if (sig == SIGKILL || sig == SIGBUS || sig == SIGSEGV || sig == SIGABRT) { + LOG(error) << "Merger process terminated through abnormal signal " << WTERMSIG(status); errored = true; } + } else { + LOG(warning) << "Merger process exited with unexpected status."; } } + // This marks the actual end of the computation (since results are available) LOG(info) << "Merger process " << mergerpid << " returned"; LOG(info) << "Simulation process took " << timer.RealTime() << " s"; From 0e1662d5c8f69ae178e24c068ce3e3aad429d434 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Thu, 12 Jun 2025 12:11:09 +0000 Subject: [PATCH 2/2] Please consider the following formatting changes --- run/o2sim_parallel.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run/o2sim_parallel.cxx b/run/o2sim_parallel.cxx index fbb9937ecbcef..0dabafaf40c67 100644 --- 
a/run/o2sim_parallel.cxx +++ b/run/o2sim_parallel.cxx @@ -753,7 +753,7 @@ int main(int argc, char* argv[]) } } LOG(error) << "SHUTTING DOWN DUE TO SIGNALED EXIT IN COMPONENT " << cpid; - o2::simpubsub::publishMessage(externalpublishchannel,o2::simpubsub::simStatusString("O2SIM", "STATE", "FAILURE")); + o2::simpubsub::publishMessage(externalpublishchannel, o2::simpubsub::simStatusString("O2SIM", "STATE", "FAILURE")); errored = true; } }