19 #include <sys/types.h>
21 #include <sys/sysinfo.h>
23 #include <mach/task.h>
24 #include <mach/mach_init.h>
35 #include "Gaudi/Property.h"
36 #include "GaudiKernel/IAlgorithm.h"
37 #include "GaudiKernel/IIncidentSvc.h"
38 #include "GaudiKernel/IAlgContextSvc.h"
39 #include "GaudiKernel/IAlgExecStateSvc.h"
40 #include "GaudiKernel/IAlgManager.h"
41 #include "GaudiKernel/ServiceHandle.h"
42 #include "GaudiKernel/System.h"
43 #include "GaudiKernel/ConcurrencyFlags.h"
44 #include "GaudiKernel/EventContext.h"
// Horizontal separator line (terminated by a newline). Further down it is
// written directly to file descriptor 1 around the fast stack-trace banner and
// prepended to the handler's log() messages.
57 const char*
const horizLine =
"-------------------------------------------------------------------------------------\n";
// Fragment (enclosing function header not visible in this view).
// For SIGINT: emit a newline (and flush) on stdout, then report the user
// interrupt on stderr.
60 if (
sig == SIGINT ) {
62 std::cout << std::endl;
63 std::cerr <<
"Athena CRITICAL stopped by user interrupt\n";
// Output stream pointer, initialised to point at std::cout.
// NOTE(review): presumably the same object as CoreDumpSvcHandler::ostr, which
// CoreDumpSvc::propertyHandler re-points to std::cout / std::cerr — confirm.
88 std::ostream* ostr(&std::cout);
// Fragment (enclosing function header not visible in this view).
// Hard limit on the number of times this code may be entered: once the atomic
// counter reaches maxcalls the process exits immediately with status 98.
103 const int maxcalls = 64;
104 static std::atomic<int> ncalls (0);
105 if (++ncalls >= maxcalls) _exit (98);
// Identify the calling thread and, while holding tidlist_mutex, look it up in
// the first ntids entries of tids[].
111 pthread_t
self = pthread_self();
112 std::lock_guard<std::mutex>
lock (tidlist_mutex);
113 for (
size_t i = 0;
i < ntids;
i++) {
// Thread already recorded: nothing more to do here.
114 if (pthread_equal (
self, tids[
i]))
return;
// Thread not yet recorded: exit with status 98 if the table already holds
// maxcalls entries, otherwise append this thread.
116 if (ntids == maxcalls) _exit (98);
117 tids[ntids++] =
self;
// Fragment of the signal-handling routine (header and several statements are
// not visible in this view).
// Atomic counter; the wait loop near the end of this fragment polls it until it
// drops to zero or the timeout elapses.
121 static std::atomic<int> inThreads = 0;
// SIGALRM handling: two different messages are logged ("Aborting" vs.
// "Terminating"); the condition selecting between them is not visible here.
126 if (
sig == SIGALRM) {
128 log() <<
"Received SIGALRM. Aborting job..." << std::endl;
134 log() <<
"Received SIGALRM. Terminating job..." << std::endl;
// Acquire threadMutex by polling try_lock() once per second; give up and exit
// with status 97 once the number of waits exceeds timeoutSeconds.
146 const timespec one_second { 1, 0 };
148 unsigned int waits = 0;
149 while (!threadMutex.try_lock()) {
150 nanosleep (&one_second,
nullptr);
151 if (++waits > timeoutSeconds) _exit (97);
// For SIGSEGV / SIGBUS / SIGABRT with a positive timeout, schedule a SIGALRM
// after timeoutSeconds seconds.
156 if ( timeoutSeconds > 0 && (
sig == SIGSEGV ||
sig == SIGBUS ||
sig == SIGABRT) ) {
158 alarm(timeoutSeconds);
// Banner for the fast stack trace: separator lines go straight to file
// descriptor 1 via write().
164 write (1, horizLine, strlen(horizLine));
165 const char*
msg =
"Producing (fast) stack trace...\n";
167 write (1, horizLine, strlen(horizLine));
// Second stack trace, produced through gSystem->StackTrace().
185 log() << horizLine <<
"Producing stack trace (can be slow, check gdb process)...\n"
187 gSystem->StackTrace();
// Chain to the previously installed handler described by oact: one branch for
// SA_SIGINFO-style handlers, one for a plain sa_handler that is neither
// SIG_DFL nor SIG_IGN; otherwise log that it could not be invoked.
195 log() << horizLine <<
"Invoking previous signal handler (can be slow, check gdb process)...\n"
197 if ( oact.sa_flags & SA_SIGINFO ) {
200 else if (oact.sa_handler != SIG_DFL && oact.sa_handler != SIG_IGN ) {
201 oact.sa_handler(
sig);
204 log() <<
"Could not invoke previous signal handler" << std::endl;
// Release the mutex acquired by the try_lock() loop above.
209 threadMutex.unlock();
// Wait in one-second steps while inThreads is still positive, bounded by
// timeoutSeconds (the increment of `waits` is not visible in this view).
215 unsigned int waits = 0;
216 while (inThreads > 0 && waits < timeoutSeconds) {
217 nanosleep (&one_second,
nullptr);
221 log() <<
"Aborting job... " << std::endl;
// Constructor fragment (signature not visible in this view): forwards name and
// pSvcLocator to base_class, then registers CoreDumpSvc::propertyHandler as the
// update handler of the m_dumpCoreFile, m_stackTrace and m_killOnSigInt
// properties.
238 base_class(
name, pSvcLocator )
244 m_dumpCoreFile.declareUpdateHandler(&CoreDumpSvc::propertyHandler,
this);
245 m_stackTrace.declareUpdateHandler(&CoreDumpSvc::propertyHandler,
this);
249 m_killOnSigInt.declareUpdateHandler(&CoreDumpSvc::propertyHandler,
this);
// Property update callback (registered via declareUpdateHandler).
// @param p  the property that changed.
// Only part of the body is visible in this view.
260 void CoreDumpSvc::propertyHandler(Gaudi::Details::PropertyBase&
p)
// Interpret the property's string value: "stdout" and "stderr" re-point
// CoreDumpSvcHandler::ostr at std::cout / std::cerr respectively.
268 const std::string
val =
p.toString();
269 if (
val==
"stdout" ) {
270 CoreDumpSvcHandler::ostr = &std::cout;
272 else if (
val==
"stderr" ) {
273 CoreDumpSvcHandler::ostr = &std::cerr;
// INFO message for a value that could not be converted to an integer (the
// conversion attempt itself is not visible in this view).
285 ATH_MSG_INFO(
"could not convert [" <<
p.toString() <<
"] to integer");
// Fragment (enclosing function header not visible in this view).
// Fail if the signal handler cannot be installed.
317 if ( installSignalHandler().isFailure() ) {
319 return StatusCode::FAILURE;
// Retrieve incSvc and subscribe this object to the BeginRun, BeginEvent,
// EndRun, EndEvent and "StoreCleared" incidents.
324 if ( !incSvc.retrieve().isSuccess() ) {
328 incSvc->addListener(
this, IncidentType::BeginRun);
329 incSvc->addListener(
this, IncidentType::BeginEvent);
330 incSvc->addListener(
this, IncidentType::EndRun);
331 incSvc->addListener(
this, IncidentType::EndEvent);
332 incSvc->addListener(
this,
"StoreCleared");
335 return StatusCode::SUCCESS;
// Fragment (enclosing function header not visible in this view).
// Number of slots: the configured number of concurrent events, but never less
// than one.
340 auto numSlots = std::max<size_t>(1, Gaudi::Concurrency::ConcurrencyFlags::numConcurrentEvents());
343 return StatusCode::SUCCESS;
// Fragment (enclosing function header not visible in this view).
// Report failure if the signal handlers cannot be uninstalled.
350 if ( uninstallSignalHandler().isFailure() ) {
352 return StatusCode::FAILURE;
355 return StatusCode::SUCCESS;
// Fragments (enclosing function headers not visible in this view).
// Use the slot of ctx when the context is valid, otherwise fall back to slot 0.
372 auto slot = ctx.valid() ? ctx.slot() : 0;
// Tail of a message: prints the value of m_coreDumpStream and, when
// m_dumpCoreFile is set, announces that a core dump file will be attempted on
// exit.
383 << m_coreDumpStream.value()
384 << (m_dumpCoreFile ?
". Will try to produce a core dump file on exit." :
".")
// Fragment of the routine that assembles the core-dump report into the string
// stream `os` (function header and several statements are not visible in this
// view).
395 std::ostringstream
os;
// Header: separator line, service name, host name and current time.
397 const time_t
now =
time(
nullptr);
399 os <<
"-------------------------------------------------------------------------------------" <<
"\n";
400 os <<
"Core dump from " <<
name() <<
" on " << System::hostName()
401 <<
" at " << ctime_r(&
now, buf) ;
// Signal number and its strsignal() description.
408 os <<
"Caught signal " << signo
409 <<
"(" << strsignal(signo) <<
"). Details: "
// Tail of a statement whose beginning is not visible: prints
// m_siginfo->si_ptr in hexadecimal, then switches back to decimal.
424 << std::hex <<
m_siginfo->si_ptr <<
")" << std::dec <<
"\n";
// Process memory: page counts multiplied by the page size and scaled to MB.
430 const long pagesz = sysconf(_SC_PAGESIZE);
431 os <<
" vmem = " <<
s.vm_pages*pagesz/1024./1024. <<
" MB\n"
432 <<
" rss = " <<
s.rss_pages*pagesz/1024./1024. <<
" MB\n";
// System-wide memory from sysinfo(): each field is multiplied by mem_units,
// i.e. sys.mem_unit scaled to MB.
440 if ( 0 == sysinfo(&
sys) ) {
442 const float mem_units =
sys.mem_unit/(1024.*1024.);
443 os <<
" total-ram = " <<
sys.totalram * mem_units <<
" MB\n"
444 <<
" free-ram = " <<
sys.freeram * mem_units <<
" MB\n"
445 <<
" buffer-ram= " <<
sys.bufferram* mem_units <<
" MB\n"
446 <<
" total-swap= " <<
sys.totalswap* mem_units <<
" MB\n"
447 <<
" free-swap = " <<
sys.freeswap * mem_units <<
" MB\n";
// For SIGILL / SIGFPE / SIGSEGV / SIGBUS also print si_addr in hexadecimal.
453 if (signo == SIGILL || signo == SIGFPE || signo == SIGSEGV || signo == SIGBUS)
454 os <<
" addr = " << std::hex <<
m_siginfo->si_addr << std::dec <<
"\n";
// Services used to find the currently running algorithm(s). algContextSvc is
// only looked up when the number of concurrent events is zero.
461 SmartIF<IAlgManager> algMgr{serviceLocator()->as<IAlgManager>()};
462 SmartIF<IAlgContextSvc> algContextSvc;
466 if (Gaudi::Concurrency::ConcurrencyFlags::numConcurrentEvents() == 0) {
467 algContextSvc = service(
"AlgContextSvc",
false);
// Per-slot report (`t` is the slot index; its loop header is not visible).
474 std::string currentAlg;
// Concurrent case: build an EventContext for slot t and collect the names of
// all algorithms whose execution state for that context is Executing.
477 if (Gaudi::Concurrency::ConcurrencyFlags::numConcurrentEvents() > 0) {
478 const EventContext ctx(0,
t);
479 ATH_MSG_DEBUG(
"Using AlgExecStateSvc to determine current algorithm(s)");
481 for (
const IAlgorithm*
alg : algMgr->getAlgorithms()) {
482 auto aes =
alg->execState(ctx);
483 if (aes.state()==AlgExecState::State::Executing)
484 currentAlg += (
alg->name() +
" ");
// A GaudiException here is reported as "no algorithm was executed yet".
487 catch (
const GaudiException&) {
488 ATH_MSG_INFO(
"No information from AlgExecStateSvc because no algorithm was executed yet.");
// Otherwise, if algContextSvc was retrieved, ask it for the current algorithm.
491 else if (algContextSvc) {
492 ATH_MSG_DEBUG(
"Using AlgContextSvc to determine current algorithm");
493 IAlgorithm*
alg = algContextSvc->currentAlg();
494 if (
alg) currentAlg =
alg->name();
497 ATH_MSG_WARNING(
"AlgExecStateSvc or AlgContextSvc not available. Cannot determine current algorithm.");
// Fall back to "<NONE>" when no algorithm name was found.
500 if (currentAlg.empty()) currentAlg =
"<NONE>";
501 os <<
"Slot " << std::setw(3) <<
t <<
" : Current algorithm = " << currentAlg << std::endl;
// Last incident and event ID recorded for the slot, if any.
// NOTE(review): `sys` here has LastInc/EvId members, so it is a different
// object from the sysinfo structure used above — its declaration is not
// visible in this view.
505 if (!
sys.LastInc.empty()) {
506 os <<
" : Last Incident = " <<
sys.LastInc << std::endl
507 <<
" : Event ID = " <<
sys.EvId << std::endl;
// User-supplied key/value records.
513 for (
auto &
s : usr) {
514 os <<
" : (usr) " <<
s.first <<
" = " <<
s.second << std::endl;
// Algorithm stack as reported by algContextSvc: "<EMPTY>" or one name per line.
// NOTE(review): algContextSvc is only assigned above when
// numConcurrentEvents() == 0; any null check guarding these dereferences is
// not visible in this view — confirm it exists.
520 os <<
"Algorithm stack: ";
521 if ( algContextSvc->algorithms().empty() )
os <<
"<EMPTY>" <<
"\n";
524 for (
auto alg : algContextSvc->algorithms()) {
525 if (
alg)
os <<
" " <<
alg->name() <<
"\n";
531 os <<
"| AtlasBaseDir : " << std::setw(66) <<
getenv(
"AtlasBaseDir") <<
" |\n";
532 os <<
"| AtlasVersion : " << std::setw(66) <<
getenv(
"AtlasVersion") <<
" |\n";
533 os <<
"| BINARY_TAG : " << std::setw(66) <<
getenv(
"BINARY_TAG") <<
" |\n";
// Fragment (tail of the report-building routine; surrounding statements are
// not visible in this view).
// Hint for the reader of the log on how to resolve line numbers.
535 os <<
" Note: to see line numbers in below stacktrace you might consider running following :\n";
536 os <<
" atlasAddress2Line --file <logfile>\n";
// Hand the assembled report to AthenaSummarySvc under the key "CoreDumpSvc"
// and ask it to create the summary; the returned status is deliberately
// ignored.
538 SmartIF<IAthenaSummarySvc> iass{service(
"AthenaSummarySvc",
false)};
540 iass->addSummary(
"CoreDumpSvc",
os.str());
542 iass->createSummary().ignore();
// Fragment of the incident callback (header not visible in this view).
// Use the slot of the incident's context when valid, otherwise slot 0.
555 auto slot = incident.context().valid() ? incident.context().slot() : 0;
// Record "<source>:<type>" of the incident and the streamed event ID in
// currRec.
558 currRec.LastInc = incident.source() +
":" + incident.type();
560 std::ostringstream oss;
561 oss << incident.context().eventID();
562 currRec.EvId = oss.str();
564 if (incident.type()==IncidentType::BeginEvent) {
568 }
else if (incident.type() ==
"StoreCleared") {
// On "StoreCleared" the event-ID string is copied, element 0 is assigned to
// itself, and the copy is moved back into currRec.EvId.
// NOTE(review): the self-assignment leaves the value unchanged; presumably it
// is intentional (e.g. to force a non-const access to the copy) — confirm
// before removing.
570 auto newstr = currRec.EvId;
573 newstr[0] = newstr[0];
574 currRec.EvId = std::move(newstr);
// Fragment of the signal-handler installation routine (header and several
// statements are not visible in this view).
// Walk over the configured signals, rejecting numbers outside [1, SIGRTMAX]
// and appending "<number>(<description>) " for each to oss.
589 std::ostringstream oss;
591 for (
auto sig : m_signals) {
593 if (sig<1 || sig>SIGRTMAX) {
598 oss <<
sig <<
"(" << strsignal(
sig) <<
") ";
// Zero-initialised sigaction requesting the three-argument (SA_SIGINFO)
// handler form, run on the alternate signal stack (SA_ONSTACK).
604 struct sigaction sigact;
605 memset (&sigact, 0,
sizeof(sigact));
608 sigact.sa_flags = SA_SIGINFO + SA_ONSTACK;
// Tail of an error message that appends strerror(errno); the routine then
// returns FAILURE.
612 <<
": " << strerror(errno));
613 return StatusCode::FAILURE;
618 return StatusCode::SUCCESS;
// Fragment (enclosing function header and loop not visible in this view).
// Re-install the sigaction stored in kv.second for signal kv.first; on error
// the status is set to FAILURE and a warning with strerror(errno) is logged.
631 int ret = sigaction(kv.first, &(kv.second),
nullptr);
633 sc = StatusCode::FAILURE;
634 ATH_MSG_WARNING(
"Error on uninstalling handler for signal " << kv.first
635 <<
": " << strerror(errno));
// Fragment (enclosing function header not visible in this view).
// Size the s_stack buffer to max(SIGSTKSZ, MINSIGSTKSZ) plus 2 MiB and register
// it as the alternate signal stack via sigaltstack().
649 std::vector<uint8_t>& stack =
s_stack;
651 stack.resize (
std::max (SIGSTKSZ, MINSIGSTKSZ) + 2*1024*1024);
653 ss.ss_sp = stack.data();
655 ss.ss_size = stack.size();
656 int ret = sigaltstack (&
ss,
nullptr);