d2/d59/TimeoutAlg_8cxx_source.html

/*

 * Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration

 */


#include "TimeoutAlg.h"


#include "AthenaKernel/ICoreDumpSvc.h"

#include "GaudiKernel/IScheduler.h"

#include "GaudiKernel/ServiceHandle.h"


#include <format>


/**
 * Convert the configured timeout into a duration and register this
 * algorithm as a listener for "EndAlgorithms" incidents.
 *
 * @return StatusCode::SUCCESS, or the failure propagated by ATH_CHECK if
 *         the incident service cannot be retrieved.
 */
StatusCode TimeoutAlg::initialize()
{
  // The property holds a plain count; it is interpreted as nanoseconds here
  m_timeout = std::chrono::nanoseconds(m_timeoutProp);

  // The EndAlgorithms incident is used by handle() to reset the per-slot
  // start time (the incident also covers the output sequence).
  constexpr int listenerPriority = 0;
  ServiceHandle<IIncidentSvc> incidentSvc("IncidentSvc/IncidentSvc", name());
  ATH_CHECK(incidentSvc.retrieve());
  incidentSvc->addListener(this, "EndAlgorithms", listenerPriority);

  return StatusCode::SUCCESS;
}


/**
 * Record the start time of the event processed in the slot of @p ctx and,
 * on the very first call, launch the watchdog thread.
 *
 * @param ctx  Event context; its slot selects which start time is updated.
 * @return Always StatusCode::SUCCESS.
 */
StatusCode TimeoutAlg::execute (const EventContext& ctx) const
{
  // The watchdog is launched lazily on the first event rather than in
  // initialize(), so that it also works in athenaMP (threads usually do not
  // survive forking). The function-local static guarantees the initializer
  // runs exactly once.
  [[maybe_unused]] static const bool watchdogLaunched = [this]() {
    if (m_timeoutProp > 0) {
      // timeoutThread() is a non-const member, hence the const_cast
      TimeoutAlg* const self ATLAS_THREAD_SAFE = const_cast<TimeoutAlg*>(this);
      m_thread = std::thread(&TimeoutAlg::timeoutThread, self);
    }
    return true;
  }();

  // Stamp the current slot with the time this event started
  *m_eventStartTime.get(ctx) = clock_t::now();

  return StatusCode::SUCCESS;
}


/**
 * Incident handler: on "EndAlgorithms" clear the start time of the slot the
 * incident belongs to. All other incident types are ignored.
 *
 * @param inc  The incident delivered by the incident service.
 */
void TimeoutAlg::handle(const Incident& inc)
{
  if (inc.type() != "EndAlgorithms") {
    return;
  }

  ATH_MSG_DEBUG("Resetting event timeout for slot " << inc.context().slot());

  // A value-initialized (zero) time point marks the slot as having no event
  // in flight; timeoutThread() skips slots whose start time is zero.
  *m_eventStartTime.get(inc.context()) = {};
}


/**
 * Signal the watchdog thread to terminate and wait for it to finish.
 *
 * The stop signal is sent at most once (guarded by m_stopped), and only if
 * the watchdog thread was actually started. This method may also be invoked
 * from the watchdog thread itself (handleTimeout() calls it before aborting
 * the job); in that case the thread is only signalled, not joined.
 *
 * @return Always StatusCode::SUCCESS.
 */
StatusCode TimeoutAlg::stop()
{
  if (m_thread.joinable() && !m_stopped.exchange(true)) {
    // Signal timeout thread to stop
    ATH_MSG_DEBUG("Stopping timeout thread");
    m_stop_thread.set_value();

    // A thread must never join itself: std::thread::join() would throw
    // std::system_error (resource_deadlock_would_occur), which would escape
    // the thread entry point and call std::terminate before the caller can
    // finish its own abort handling.
    if (m_thread.get_id() != std::this_thread::get_id()) {
      m_thread.join();
    }
  }

  return StatusCode::SUCCESS;
}


/**
 * Body of the watchdog thread.
 *
 * Wakes up periodically, scans the start time of every slot and calls
 * handleTimeout() for each slot whose event has been running longer than
 * m_timeout. Returns once the stop promise (m_stop_thread) has been
 * fulfilled.
 */
void TimeoutAlg::timeoutThread()
{
  ATH_MSG_INFO(std::format("Setting per-event timeout of {}",
                           std::chrono::duration<double>(m_timeout)));

  // Polling period: never longer than the timeout itself, and capped by the
  // configured check interval so that long timeouts are still checked at a
  // minimum frequency.
  const std::chrono::nanoseconds checkPeriod(m_checkInterval);
  const std::chrono::nanoseconds pollPeriod = std::min(m_timeout, checkPeriod);

  // Waiting on the future doubles as the sleep between two scans: any result
  // other than a timeout of the wait ends the loop.
  auto stopRequested = m_stop_thread.get_future();
  while (true) {
    if (stopRequested.wait_for(pollPeriod) != std::future_status::timeout) {
      break;
    }

    // Scan all slots; the slot number is the position in the iteration
    const auto currentTime = clock_t::now();
    EventContext::ContextID_t slotId = 0;
    for (const auto& eventStart : m_eventStartTime) {
      // A zero start time means no event is being timed in this slot
      const bool eventInFlight = eventStart.time_since_epoch().count() > 0;
      if (eventInFlight && currentTime > eventStart + m_timeout) {
        handleTimeout(slotId);
      }
      ++slotId;
    }
  }
}


/**
 * React to an event timeout in the given slot.
 *
 * In order: print an ERROR message, set the Athena::Timeout flag of the
 * slot, optionally dump the scheduler state (m_dumpState) and optionally
 * abort the job (m_abort). Does nothing if the timeout flag of the slot is
 * already set.
 *
 * @param slot  Slot number in which the timeout was detected.
 */
void TimeoutAlg::handleTimeout(EventContext::ContextID_t slot)
{
  // Serialize timeout handling so that a second timeout cannot be processed
  // while this one is still being handled
  std::scoped_lock guard(m_handleMutex);

  // Minimal context: only the slot number is filled in (first argument is 0)
  const EventContext slotCtx(0, slot);

  // Nothing to do if the timeout was already flagged for this slot
  if (Athena::Timeout::instance(slotCtx).reached()) {
    return;
  }

  // Report the timeout; the same text is reused as abort reason below
  const std::string reason = std::format("Event timeout ({}) in slot {} reached",
                                         std::chrono::duration<double>(m_timeout), slot);
  ATH_MSG_ERROR(reason);

  // Raise the timeout flag of this slot
  setTimeout(Athena::Timeout::instance(slotCtx));

  // Optional diagnostics: dump the scheduler state (silently skipped if the
  // scheduler service cannot be retrieved)
  if (m_dumpState) {
    ServiceHandle<IScheduler> scheduler("AvalancheSchedulerSvc", name());
    if (scheduler.retrieve().isSuccess()) {
      scheduler->dumpState();
    }
  }

  // Everything below only applies when the job should be aborted
  const bool abortRequested = m_abort;
  if (!abortRequested) {
    return;
  }

  // Stop the timeout thread to avoid additional triggers
  stop().ignore();

  // Pass the reason on to CoreDumpSvc; fall back to stderr if the service is
  // not available
  ServiceHandle<ICoreDumpSvc> coreDump("CoreDumpSvc", name());
  const bool haveCoreDumpSvc = coreDump.retrieve().isSuccess();
  if (!haveCoreDumpSvc) {
    std::cerr << reason << std::endl;
  }
  else {
    coreDump->setCoreDumpInfo(slotCtx, "Reason", reason);
  }

  // Abort job (and let CoreDumpSvc handle SIGABRT)
  std::abort();
}


ATH_CHECK
#define ATH_CHECK
Evaluate an expression and check for errors.
Definition AthCheckMacros.h:40

ATH_MSG_ERROR
#define ATH_MSG_ERROR(x)
Definition AthMsgStreamMacros.h:33

ATH_MSG_INFO
#define ATH_MSG_INFO(x)
Definition AthMsgStreamMacros.h:31

ATH_MSG_DEBUG
#define ATH_MSG_DEBUG(x)
Definition AthMsgStreamMacros.h:29

ICoreDumpSvc.h
Interface of a core dump service.

TimeoutAlg.h

Athena::TimeoutMaster::setTimeout
void setTimeout(Timeout &instance)
Set timeout.
Definition Timeout.h:80

Athena::Timeout::instance
static Timeout & instance()
Get reference to Timeout singleton.
Definition Timeout.h:64

CoreDumpSvc::setCoreDumpInfo
virtual void setCoreDumpInfo(const std::string &name, const std::string &value) override
Set a name/value pair in the core dump record.
Definition CoreDumpSvc.cxx:365

ServiceHandle
Definition ClusterMakerTool.h:36

TimeoutAlg
Algorithm to monitor event timeouts.
Definition TimeoutAlg.h:37

TimeoutAlg::m_stop_thread
std::promise< void > m_stop_thread
Signal to stop watchdog thread.
Definition TimeoutAlg.h:82

TimeoutAlg::handleTimeout
void handleTimeout(EventContext::ContextID_t slot)
Handle timeout.
Definition TimeoutAlg.cxx:103

TimeoutAlg::m_dumpState
Gaudi::Property< bool > m_dumpState
Definition TimeoutAlg.h:64

TimeoutAlg::handle
virtual void handle(const Incident &inc) override
Definition TimeoutAlg.cxx:52

TimeoutAlg::m_handleMutex
std::mutex m_handleMutex
Mutex for handleTimeout.
Definition TimeoutAlg.h:88

TimeoutAlg::stop
virtual StatusCode stop() override
Definition TimeoutAlg.cxx:62

TimeoutAlg::m_abort
Gaudi::Property< bool > m_abort
Definition TimeoutAlg.h:67

TimeoutAlg::m_stopped
std::atomic< bool > m_stopped
Has watchdog thread already been stopped? (to avoid setting future twice)
Definition TimeoutAlg.h:85

TimeoutAlg::m_timeoutProp
Gaudi::Property< unsigned long long > m_timeoutProp
Definition TimeoutAlg.h:58

TimeoutAlg::timeoutThread
void timeoutThread()
Watchdog thread.
Definition TimeoutAlg.cxx:75

TimeoutAlg::execute
virtual StatusCode execute(const EventContext &ctx) const override
Definition TimeoutAlg.cxx:33

TimeoutAlg::m_timeout
std::chrono::nanoseconds m_timeout
Timeout property as duration.
Definition TimeoutAlg.h:73

TimeoutAlg::m_eventStartTime
SG::SlotSpecificObj< clock_t::time_point > m_eventStartTime ATLAS_THREAD_SAFE
Start time of each event per slot.
Definition TimeoutAlg.h:76

TimeoutAlg::m_checkInterval
Gaudi::Property< unsigned long long > m_checkInterval
Definition TimeoutAlg.h:61

TimeoutAlg::initialize
virtual StatusCode initialize() override
Algorithm to monitor event timeouts.
Definition TimeoutAlg.cxx:20

msg
MsgStream & msg
Definition testRead.cxx:32