21#include <sys/sysinfo.h>
24#include <mach/mach_init.h>
35#include "Gaudi/Property.h"
36#include "GaudiKernel/IAlgorithm.h"
37#include "GaudiKernel/IIncidentSvc.h"
38#include "GaudiKernel/IAlgContextSvc.h"
39#include "GaudiKernel/IAlgExecStateSvc.h"
40#include "GaudiKernel/IAlgManager.h"
41#include "GaudiKernel/ServiceHandle.h"
42#include "GaudiKernel/System.h"
43#include "GaudiKernel/ConcurrencyFlags.h"
44#include "GaudiKernel/EventContext.h"
// Horizontal rule (a line of dashes ending in a newline). The handler writes it
// immediately before and after its stack-trace banner messages.
57 const char*
const horizLine =
"-------------------------------------------------------------------------------------\n";
// Signal handler with the three-argument (int, siginfo_t*, void*) signature;
// the second and third arguments are unnamed and therefore unused.
// Only the SIGINT branch is visible in this excerpt: it ends the current line
// on stdout and reports the user interrupt on stderr. Whatever follows the
// message is not shown here.
59 void ExitOnInt(
int sig,
siginfo_t*,
void* ) {
60 if ( sig == SIGINT ) {
62 std::cout << std::endl;
63 std::cerr <<
"Athena CRITICAL stopped by user interrupt\n";
// Output stream pointer, initialised to std::cout.
// NOTE(review): presumably this is the CoreDumpSvcHandler::ostr that
// CoreDumpSvc::propertyHandler repoints for "stdout"/"stderr" - confirm.
88 std::ostream* ostr(&std::cout);
// Guard against runaway re-entry: every call bumps an atomic counter and the
// process exits immediately with status 98 once it reaches maxcalls.
103 const int maxcalls = 64;
104 static std::atomic<int> ncalls (0);
105 if (++ncalls >= maxcalls) _exit (98);
// Protects the list of recorded thread ids used below.
107 static std::mutex tidlist_mutex;
111 pthread_t self = pthread_self();
112 std::lock_guard<std::mutex> lock (tidlist_mutex);
// A thread whose id is already recorded returns straight away.
113 for (
size_t i = 0; i < ntids; i++) {
114 if (pthread_equal (self, tids[i]))
return;
// Otherwise the calling thread is appended; a full list ends the process with
// status 98.
// NOTE(review): tids and ntids are declared outside this excerpt; the check
// below suggests tids has room for maxcalls entries - confirm.
116 if (ntids == maxcalls) _exit (98);
117 tids[ntids++] = self;
// Atomic counter consulted by the wait loop near the end of this fragment.
// NOTE(review): presumably the number of threads currently inside the handler;
// its increments/decrements are outside this excerpt - confirm.
121 static std::atomic<int> inThreads = 0;
// Timeout in whole seconds, obtained by scaling m_timeout by 1e-9 and rounding.
// NOTE(review): the scale factor suggests m_timeout is in nanoseconds - confirm.
// NOTE(review): coreDumpSvc is dereferenced here but null-checked further down
// in this same fragment - confirm it cannot be null when the handler runs.
124 const unsigned int timeoutSeconds =
static_cast<unsigned int>(round(
coreDumpSvc->m_timeout * 1e-9));
// SIGALRM handling. Two alternative log messages are visible; the condition
// that selects between them and the actions taken are not shown here.
126 if ( sig == SIGALRM) {
128 log() <<
"Received SIGALRM. Aborting job..." << std::endl;
134 log() <<
"Received SIGALRM. Terminating job..." << std::endl;
// Only the thread that obtains threadMutex continues. Others retry once per
// second and exit with status 97 after more than timeoutSeconds attempts.
145 static std::mutex threadMutex;
146 const timespec one_second { 1, 0 };
148 unsigned int waits = 0;
149 while (!threadMutex.try_lock()) {
150 nanosleep (&one_second,
nullptr);
151 if (++waits > timeoutSeconds) _exit (97);
// For SIGSEGV/SIGBUS/SIGABRT with a non-zero timeout, schedule a SIGALRM after
// timeoutSeconds seconds.
156 if ( timeoutSeconds > 0 && (sig == SIGSEGV || sig == SIGBUS || sig == SIGABRT) ) {
158 alarm(timeoutSeconds);
// Fast stack trace: the banner is written directly to file descriptor 1 with
// write(), then Athena::Signal::fatalDump is invoked (its remaining arguments
// are cut off in this excerpt).
164 write (1, horizLine, strlen(horizLine));
165 const char*
msg =
"Producing (fast) stack trace...\n";
166 write (1,
msg, strlen (
msg));
167 write (1, horizLine, strlen(horizLine));
168 Athena::Signal::fatalDump (sig, info, extra,
// Alternative stack trace via gSystem->StackTrace(), announced on the log stream.
185 log() << horizLine <<
"Producing stack trace (can be slow, check gdb process)...\n"
186 << horizLine << std::flush;
187 gSystem->StackTrace();
// Forward the signal to the previously installed handler (oact): the
// three-argument form when SA_SIGINFO is set, otherwise the one-argument form
// unless it is SIG_DFL or SIG_IGN. If neither applies, a message is logged.
195 log() << horizLine <<
"Invoking previous signal handler (can be slow, check gdb process)...\n"
196 << horizLine << std::flush;
197 if ( oact.sa_flags & SA_SIGINFO ) {
198 oact.sa_sigaction(sig, info, extra);
200 else if (oact.sa_handler != SIG_DFL && oact.sa_handler != SIG_IGN ) {
201 oact.sa_handler(sig);
204 log() <<
"Could not invoke previous signal handler" << std::endl;
// Let the next waiting thread proceed.
209 threadMutex.unlock();
// For SIGSEGV/SIGBUS/SIGABRT: sleep in one-second steps while inThreads is
// still positive and waits is below timeoutSeconds, then log the abort.
// (The update of waits inside the loop is not visible in this excerpt.)
212 if (
coreDumpSvc && (sig == SIGSEGV || sig == SIGBUS || sig == SIGABRT) ) {
215 unsigned int waits = 0;
216 while (inThreads > 0 && waits < timeoutSeconds) {
217 nanosleep (&one_second,
nullptr);
221 log() <<
"Aborting job... " << std::endl;
// Constructor fragment: forwards name and pSvcLocator to the base class, then
// registers CoreDumpSvc::propertyHandler as the update handler of the
// properties listed below so that changes to them are passed to that method.
238 base_class( name, pSvcLocator )
244 m_dumpCoreFile.declareUpdateHandler(&CoreDumpSvc::propertyHandler,
this);
245 m_stackTrace.declareUpdateHandler(&CoreDumpSvc::propertyHandler,
this);
249 m_killOnSigInt.declareUpdateHandler(&CoreDumpSvc::propertyHandler,
this);
// Update handler for the service's properties; p is the property that changed.
// Visible parts: the property's string value selects the handler's output
// stream ("stdout" -> std::cout, "stderr" -> std::cerr), and an INFO message is
// emitted when a property value cannot be converted to an integer. The
// dispatch on which property changed is not shown in this excerpt.
260void CoreDumpSvc::propertyHandler(Gaudi::Details::PropertyBase& p)
268 const std::string val = p.toString();
269 if ( val==
"stdout" ) {
270 CoreDumpSvcHandler::ostr = &std::cout;
272 else if ( val==
"stderr" ) {
273 CoreDumpSvcHandler::ostr = &std::cerr;
285 ATH_MSG_INFO(
"could not convert [" <<
p.toString() <<
"] to integer");
// Service initialisation (visible parts): install the signal handlers, then
// subscribe this service to the incidents it records.
// Returns FAILURE if the signal handlers cannot be installed, SUCCESS otherwise
// (the handling of a failed IncidentSvc retrieval is not shown here).
305StatusCode CoreDumpSvc::initialize()
317 if ( installSignalHandler().isFailure() ) {
319 return StatusCode::FAILURE;
// Retrieve the incident service and listen for run/event begin and end as
// well as the "StoreCleared" incident.
323 ServiceHandle<IIncidentSvc> incSvc(
"IncidentSvc",
name());
324 if ( !incSvc.retrieve().isSuccess() ) {
328 incSvc->addListener(
this, IncidentType::BeginRun);
329 incSvc->addListener(
this, IncidentType::BeginEvent);
330 incSvc->addListener(
this, IncidentType::EndRun);
331 incSvc->addListener(
this, IncidentType::EndEvent);
332 incSvc->addListener(
this,
"StoreCleared");
335 return StatusCode::SUCCESS;
// Number of slots: the configured number of concurrent events, but never less
// than 1. How numSlots is used is not visible in this excerpt.
340 auto numSlots = std::max<size_t>(1, Gaudi::Concurrency::ConcurrencyFlags::numConcurrentEvents());
343 return StatusCode::SUCCESS;
// Service finalisation: remove the signal handlers installed by this service.
// Returns FAILURE if uninstalling fails, SUCCESS otherwise.
346StatusCode CoreDumpSvc::finalize()
350 if ( uninstallSignalHandler().isFailure() ) {
352 return StatusCode::FAILURE;
355 return StatusCode::SUCCESS;
// Use the slot of ctx when the context is valid, otherwise fall back to slot 0.
372 auto slot = ctx.valid() ? ctx.slot() : 0;
// Announce the fatal signal on the handler's log stream, naming the configured
// core dump stream and stating whether a core file will be attempted
// (m_dumpCoreFile). The end of this statement is cut off in the excerpt.
382 CoreDumpSvcHandler::log() << name() <<
" FATAL Caught fatal signal. Printing details to "
383 << m_coreDumpStream.value()
384 << (m_dumpCoreFile ?
". Will try to produce a core dump file on exit." :
".")
// Write the full report produced by dump() and flush the stream.
387 CoreDumpSvcHandler::log() <<
dump() << std::flush;
// The report text is accumulated in a string stream.
395 std::ostringstream os;
397 const time_t now = time(
nullptr);
// Header: rule, service name, host name and the current time as formatted by
// ctime_r (buf is declared outside this excerpt).
399 os <<
"-------------------------------------------------------------------------------------" <<
"\n";
400 os <<
"Core dump from " << name() <<
" on " << System::hostName()
401 <<
" at " << ctime_r(&now, buf) ;
// Signal number together with its strsignal() description (signo is declared
// outside this excerpt).
408 os <<
"Caught signal " << signo
409 <<
"(" << strsignal(signo) <<
"). Details: "
// si_int and si_ptr of the stored siginfo_t; the pointer is printed in hex and
// the stream is switched back to decimal afterwards.
423 os <<
" value = (" <<
m_siginfo->si_int <<
", "
424 << std::hex <<
m_siginfo->si_ptr <<
")" << std::dec <<
"\n";
// Process memory: page counts are converted to MB using the system page size.
// NOTE(review): s is declared outside this excerpt - presumably the result of
// read_athena_statm(); confirm.
430 const long pagesz = sysconf(_SC_PAGESIZE);
431 os <<
" vmem = " << s.vm_pages*pagesz/1024./1024. <<
" MB\n"
432 <<
" rss = " << s.rss_pages*pagesz/1024./1024. <<
" MB\n";
// System-wide memory figures from sysinfo(), printed only if the call succeeds.
// mem_unit is folded into a single factor that converts the raw counts to MB.
440 if ( 0 == sysinfo(&sys) ) {
442 const float mem_units = sys.mem_unit/(1024.*1024.);
443 os <<
" total-ram = " << sys.totalram * mem_units <<
" MB\n"
444 <<
" free-ram = " << sys.freeram * mem_units <<
" MB\n"
445 <<
" buffer-ram= " << sys.bufferram* mem_units <<
" MB\n"
446 <<
" total-swap= " << sys.totalswap* mem_units <<
" MB\n"
447 <<
" free-swap = " << sys.freeswap * mem_units <<
" MB\n";
// The faulting address (si_addr) is printed only for SIGILL, SIGFPE, SIGSEGV
// and SIGBUS.
453 if (signo == SIGILL || signo == SIGFPE || signo == SIGSEGV || signo == SIGBUS)
454 os <<
" addr = " << std::hex <<
m_siginfo->si_addr << std::dec <<
"\n";
// Algorithm manager from the service locator; it supplies the list of
// algorithms inspected below.
461 SmartIF<IAlgManager> algMgr{serviceLocator()->as<IAlgManager>()};
462 SmartIF<IAlgContextSvc> algContextSvc;
// AlgContextSvc is only looked up when no concurrent events are configured.
466 if (Gaudi::Concurrency::ConcurrencyFlags::numConcurrentEvents() == 0) {
467 algContextSvc = service(
"AlgContextSvc",
false);
474 std::string currentAlg;
// With concurrent events: build an EventContext from (0, t), ask each algorithm
// for its execution state in that context and collect the names of those in
// state Executing, separated by spaces.
// NOTE(review): t is declared outside this excerpt - presumably the slot index
// of an enclosing loop; confirm.
477 if (Gaudi::Concurrency::ConcurrencyFlags::numConcurrentEvents() > 0) {
478 const EventContext ctx(0,t);
479 ATH_MSG_DEBUG(
"Using AlgExecStateSvc to determine current algorithm(s)");
481 for (
const IAlgorithm* alg : algMgr->getAlgorithms()) {
482 auto aes = alg->execState(ctx);
483 if (aes.state()==AlgExecState::State::Executing)
484 currentAlg += (alg->name() +
" ");
// A GaudiException from the lookup above is reported as "no algorithm executed
// yet" and does not abort the dump.
487 catch (
const GaudiException&) {
488 ATH_MSG_INFO(
"No information from AlgExecStateSvc because no algorithm was executed yet.");
// Without concurrent events: take the current algorithm from AlgContextSvc.
491 else if (algContextSvc) {
492 ATH_MSG_DEBUG(
"Using AlgContextSvc to determine current algorithm");
493 IAlgorithm* alg = algContextSvc->currentAlg();
494 if (alg) currentAlg = alg->name();
497 ATH_MSG_WARNING(
"AlgExecStateSvc or AlgContextSvc not available. Cannot determine current algorithm.");
// An empty result is shown as "<NONE>".
500 if (currentAlg.empty()) currentAlg =
"<NONE>";
501 os <<
"Slot " << std::setw(3) << t <<
" : Current algorithm = " << currentAlg << std::endl;
// Last incident and event id are printed only when an incident has been
// recorded (sys here is a record declared outside this excerpt).
505 if (!sys.LastInc.empty()) {
506 os <<
" : Last Incident = " << sys.LastInc << std::endl
507 <<
" : Event ID = " << sys.EvId << std::endl;
// User-supplied name/value pairs (usr is declared outside this excerpt).
513 for (
auto &s : usr) {
514 os <<
" : (usr) " << s.first <<
" = " << s.second << std::endl;
// Algorithm stack as reported by AlgContextSvc; null entries are skipped.
// NOTE(review): algContextSvc is dereferenced here, but it is only assigned
// when no concurrent events are configured. The guard that makes this safe is
// not visible in this excerpt - confirm.
520 os <<
"Algorithm stack: ";
521 if ( algContextSvc->algorithms().empty() ) os <<
"<EMPTY>" <<
"\n";
524 for (
auto alg : algContextSvc->algorithms()) {
525 if (alg) os <<
" " << alg->name() <<
"\n";
// Release information read from the environment, each value padded to a field
// width of 66.
// NOTE(review): getenv() returns a null pointer when a variable is unset, and
// inserting a null const char* into an ostream violates the inserter's
// precondition (undefined behaviour; common implementations set badbit, which
// silently drops everything written to os afterwards). Confirm these variables
// are always set, or substitute a fallback string.
531 os <<
"| AtlasBaseDir : " << std::setw(66) << getenv(
"AtlasBaseDir") <<
" |\n";
532 os <<
"| AtlasVersion : " << std::setw(66) << getenv(
"AtlasVersion") <<
" |\n";
533 os <<
"| BINARY_TAG : " << std::setw(66) << getenv(
"BINARY_TAG") <<
" |\n";
// Hint on how to resolve line numbers in the stack trace that follows.
535 os <<
" Note: to see line numbers in below stacktrace you might consider running following :\n";
536 os <<
" atlasAddress2Line --file <logfile>\n";
// Hand the report text to AthenaSummarySvc and ask it to create its summary;
// the returned StatusCode is explicitly ignored.
// NOTE(review): the check that iass is valid is not visible in this excerpt.
538 SmartIF<IAthenaSummarySvc> iass{service(
"AthenaSummarySvc",
false)};
540 iass->addSummary(
"CoreDumpSvc",os.str());
542 iass->createSummary().ignore();
// Slot of the incident's event context, or 0 when that context is not valid.
555 auto slot = incident.context().valid() ? incident.context().slot() : 0;
// Remember the incident as "<source>:<type>" (currRec is declared outside this
// excerpt).
558 currRec.LastInc = incident.source() +
":" + incident.type();
// Store the event id in its streamed text form.
560 std::ostringstream oss;
561 oss << incident.context().eventID();
562 currRec.EvId = oss.str();
// The body of the BeginEvent branch is not visible in this excerpt.
564 if (incident.type()==IncidentType::BeginEvent) {
568 }
else if (incident.type() ==
"StoreCleared") {
// On "StoreCleared": copy the stored event id, write its first character back
// onto itself, and move the copy into place. The value is unchanged.
// NOTE(review): the self-assignment looks deliberate (e.g. to make the copy own
// a fresh buffer) - confirm the intent before simplifying.
570 auto newstr = currRec.EvId;
573 newstr[0] = newstr[0];
574 currRec.EvId = std::move(newstr);
// Collects a printable "<number>(<description>) " list of the handled signals.
589 std::ostringstream oss;
591 for (
auto sig : m_signals) {
// Signal numbers outside 1..SIGRTMAX are treated specially; what happens to
// them is not visible in this excerpt.
593 if (sig<1 || sig>SIGRTMAX) {
598 oss <<
sig <<
"(" << strsignal(sig) <<
") ";
// Zero-initialised sigaction that routes the signal to
// CoreDumpSvcHandler::action using the three-argument handler form
// (SA_SIGINFO) and requests the alternate signal stack (SA_ONSTACK).
// NOTE(review): the flags are combined with '+'; '|' is the conventional
// operator for bit flags and yields the same value for two distinct bits.
604 struct sigaction sigact;
605 memset (&sigact, 0,
sizeof(sigact));
606 sigact.sa_sigaction = CoreDumpSvcHandler::action;
608 sigact.sa_flags = SA_SIGINFO + SA_ONSTACK;
// A failed installation is reported with the errno text and returns FAILURE.
611 ATH_MSG_ERROR (
"Error on installing handler for signal " << sig
612 <<
": " << strerror(errno));
613 return StatusCode::FAILURE;
618 return StatusCode::SUCCESS;
// Overall result; starts as SUCCESS and is set to FAILURE on any error below.
628 StatusCode
sc = StatusCode::SUCCESS;
// Re-install the saved sigaction (kv.second) for signal kv.first. kv comes from
// a loop whose header is not visible in this excerpt.
631 int ret = sigaction(kv.first, &(kv.second),
nullptr);
// A failure is recorded in sc and reported as a warning with the errno text.
633 sc = StatusCode::FAILURE;
634 ATH_MSG_WARNING(
"Error on uninstalling handler for signal " << kv.first
635 <<
": " << strerror(errno));
// s_stack is the byte buffer used as the alternate signal stack.
649 std::vector<uint8_t>& stack =
s_stack;
// Size it to the larger of SIGSTKSZ and MINSIGSTKSZ plus an extra 2 MiB.
651 stack.resize (std::max (SIGSTKSZ, MINSIGSTKSZ) + 2*1024*1024);
// Describe the buffer in ss (declared outside this excerpt) and register it
// with sigaltstack(); the handling of ret is not visible here.
653 ss.ss_sp = stack.data();
655 ss.ss_size = stack.size();
656 int ret = sigaltstack (&
ss,
nullptr);
#define ATH_MSG_WARNING(x)
void CoreDumpSvc::print ATLAS_NOT_THREAD_SAFE()
Install fatal handler with default options.
Collecting a few shared bits and pieces from SEAL headers.
#define IOFD_INVALID
Invalid channel descriptor constant.
These are the SEAL debug aids, adapted to build in Atlas, after the drop of that project.
This is the signal handler from SEAL, adapted to build in Atlas, after the drop of that project.
Define macros for attributes used to control the static checker.
#define ATLAS_THREAD_SAFE
static IOFD stacktraceFd(IOFD fd=IOFD_INVALID)
Set and return the file descriptor for stack trace output.
static const int FATAL_DUMP_CONTEXT
Option to make fataldump(int, siginfo_t *, void *) (invoked by fatal(int, siginfo_t *,...
static const char * describe(int sig, int code)
Return the description of the signal info code given by code for signal number sig.
static void revert(int sig)
Revert the signal number sig back to its default behaviour.
static const int FATAL_DUMP_SIG
Option to make fataldump(int, siginfo_t *, void *) (invoked by fatal(int, siginfo_t *,...
static HandlerType handle(int sig, HandlerType handler, const sigset_t *blockMask=0)
Install a new signal handler, handler, for signal number sig and return the old handler.
static const int FATAL_DUMP_STACK
Option to make fataldump(int, siginfo_t *, void *) (invoked by fatal(int, siginfo_t *,...
Service to print additional information before a crash.
siginfo_t * m_siginfo
Pointer to siginfo_t struct (set by signal handler)
void setAltStack()
Set up an alternate stack for the current thread.
virtual StatusCode start() override
Gaudi::Property< int > m_fatalHandlerFlags
std::vector< UserCore_t > m_usrCoreDumps
User defined core dump info.
static thread_local std::vector< uint8_t > s_stack
Gaudi::Property< bool > m_callOldHandler
Gaudi::Property< bool > m_stackTrace
virtual std::string dump() const override
Print all core dump records.
virtual void handle(const Incident &incident) override
Incident listener.
virtual ~CoreDumpSvc() ATLAS_CTORDTOR_NOT_THREAD_SAFE
Destructor.
virtual void setCoreDumpInfo(const std::string &name, const std::string &value) override
Set a name/value pair in the core dump record.
Gaudi::Property< bool > m_dumpCoreFile
Gaudi::Property< bool > m_killOnSigInt
CoreDumpSvc()
Default constructor (do not use)
Gaudi::Property< std::string > m_coreDumpStream
std::vector< sysDumpRec > m_sysCoreDumps
Core dump info collected by this service.
std::atomic< EventID::event_number_t > m_eventCounter
Event counter.
Gaudi::Property< bool > m_fastStackTrace
Signal handler for CoreDumpSvc.
CoreDumpSvc * coreDumpSvc(nullptr)
pointer to CoreDumpSvc
SigHandler_t oldSigHandler
old signal handlers
std::map< int, struct sigaction > SigHandler_t
std::ostream &log ATLAS_NOT_THREAD_SAFE()
convenience method for logging
bool dumpCoreFile(false)
dump core file on exit?
bool callOldHandler(true)
forward calls to old handlers?
bool stackTrace(false)
produce stack trace?
bool fastStackTrace(false)
produce fast stack trace using CxxUtils/Seal
struct athena_statm read_athena_statm()