13#include "GaudiKernel/IIncidentSvc.h"
27#include "tbb/task_arena.h"
54 const long highestPriority =
static_cast<long>(-1);
55 const long lowestPriority = 0;
56 incSvc->addListener(
this, IncidentType::BeginEvent, highestPriority);
57 incSvc->addListener(
this,
"EndAlgorithms", lowestPriority);
58 incSvc->addListener(
this,
"EndEvtLoop", highestPriority);
59 incSvc->addListener(
this, IncidentType::SvcPostFinalize);
64 ATH_MSG_INFO(
"The system doesn't support /proc. Therefore, memory measurements are not available");
69 "analyzing [" <<
m_numberOfSlots.toString() <<
"] events concurrently");
72 ATH_MSG_INFO(
" >> Component-level memory monitoring in the event-loop is disabled in jobs with more than 1 thread");
85 ATH_CHECK(auditorSvc()->addAuditor(
"PerfMonMTAuditor"));
87 return StatusCode::SUCCESS;
97 return StatusCode::SUCCESS;
105 if (inc.type() == IncidentType::BeginEvent) {
160 else if (inc.type() ==
"EndEvtLoop") {
166 else if (inc.type() == IncidentType::SvcPostFinalize) {
180 const EventContext& getCurrentContext() {
181 return Gaudi::Hive::currentContext();
198 startCompAud(stepName, compName, ctx.valid() ? ctx : getCurrentContext());
212 stopCompAud(stepName, compName, ctx.valid() ? ctx : getCurrentContext());
221 if (compName ==
"AthOutSeq" && stepName ==
"Start") {
228 if (compName ==
"AthMasterSeq" && stepName ==
"Finalize") {
239 if (compName ==
"AthMasterSeq" && stepName ==
"Initialize") {
258 const unsigned int ithread = (ctx.valid() && tbb::this_task_arena::current_thread_index() > -1) ? tbb::this_task_arena::current_thread_index() : 0;
269 if(compLevelDataMap.find(currentState) == compLevelDataMap.end()) {
270 compLevelDataMap.insert({currentState, std::make_unique<PMonMT::ComponentData>()});
281 compLevelDataMap[currentState]->addPointStart(meas, doMem);
284 ATH_MSG_DEBUG(
"Start Audit - Component " << compName <<
" , "
285 "Step " << stepName <<
" , "
286 "Event " << ctx.evt() <<
" , "
287 "Slot " << ctx.slot() <<
" , "
288 "Context " << ctx.valid() <<
" , "
289 "Thread " << ithread <<
" , "
292 "Vmem " << meas.
vmem <<
" kb, "
293 "Malloc " << meas.
malloc <<
" kb");
301 const unsigned int ithread = (ctx.valid() && tbb::this_task_arena::current_thread_index() > -1) ? tbb::this_task_arena::current_thread_index() : 0;
319 auto itr = compLevelDataMap.find(currentState);
323 if (itr==compLevelDataMap.end())
return;
325 itr->second->addPointStop(meas, doMem);
331 if (
m_isFirstEvent && compName ==
"IncidentProcAlg3" && stepName ==
"Execute") {
337 "Step " << stepName <<
" , "
338 "Event " << ctx.evt() <<
" , "
339 "Slot " << ctx.slot() <<
" , "
340 "Context " << ctx.valid() <<
" , "
341 "Thread " << ithread <<
" , "
342 "Cpu (" << compLevelDataMap[currentState]->m_tmp_cpu <<
":"
344 << (meas.
cpu_time - compLevelDataMap[currentState]->m_tmp_cpu) <<
":"
345 << compLevelDataMap[currentState]->m_delta_cpu <<
") ms, "
346 "Wall (" << compLevelDataMap[currentState]->m_tmp_wall <<
":"
348 << (meas.
wall_time - compLevelDataMap[currentState]->m_tmp_wall) <<
":"
349 << compLevelDataMap[currentState]->m_delta_wall <<
") ms, "
350 "Vmem (" << compLevelDataMap[currentState]->m_tmp_vmem <<
":"
352 << (meas.
vmem - compLevelDataMap[currentState]->m_tmp_vmem) <<
":"
353 << compLevelDataMap[currentState]->m_delta_vmem <<
") kb, "
354 "Malloc (" << compLevelDataMap[currentState]->m_tmp_malloc <<
":"
356 << (meas.
malloc - compLevelDataMap[currentState]->m_tmp_malloc) <<
":"
357 << compLevelDataMap[currentState]->m_delta_malloc <<
") kb");
371 const double totalCpuTime =
378 const double scaledWallTime =
385 return ( scaledWallTime > 0 ? totalCpuTime / scaledWallTime * 100. : 0 );
429 ATH_MSG_INFO(
"=======================================================================================");
431 ATH_MSG_INFO(
"=======================================================================================");
434 ATH_MSG_INFO(
"*** In order to make plots using the results run the following commands:");
436 ATH_MSG_INFO(
"*** In order to print tables using the results run the following commands:");
438 ATH_MSG_INFO(
"=======================================================================================");
447 ATH_MSG_INFO(
"=======================================================================================");
449 ATH_MSG_INFO(
"=======================================================================================");
451 ATH_MSG_INFO(std::format(
"{:<10} {:<15} {:<25} {:<40} {:<55} {:<75}",
"Step",
"Count",
"CPU Time [ms]",
452 "Vmem [kB]",
"Malloc [kB]",
"Component"));
454 ATH_MSG_INFO(
"---------------------------------------------------------------------------------------");
461 std::vector<std::pair<PMonMT::StepComp, PMonMT::ComponentData*>> pairs;
462 for (
auto itr = vec_itr.begin(); itr != vec_itr.end(); ++itr) pairs.push_back(*itr);
464 sort(pairs.begin(), pairs.end(),
465 [=](std::pair<PMonMT::StepComp, PMonMT::ComponentData*>&
a,
466 std::pair<PMonMT::StepComp, PMonMT::ComponentData*>& b) {
467 return a.second->getDeltaCPU() > b.second->getDeltaCPU();
471 for (
auto it : pairs) {
478 ATH_MSG_INFO(std::format(
"{:<10} {:<15} {:<25.2f} {:<40.0f} {:<55.0f} {:<75}",it.first.stepName,
479 it.second->getCallCount(),it.second->getDeltaCPU(),it.second->getDeltaVmem(),
480 it.second->getDeltaMalloc(),it.first.compName));
483 ATH_MSG_INFO(
"=======================================================================================");
512 " and the last measurements are explicitly printed)");
513 ATH_MSG_INFO(
"=======================================================================================");
515 ATH_MSG_INFO(std::format(
"{:<16} {:<12} {:<12} {:<12} {:<12} {:<12} {:<12}",
"Event",
"CPU [s]",
516 "Wall [s]",
"Vmem [kB]",
"Rss [kB]",
"Pss [kB]",
"Swap [kB]"));
518 ATH_MSG_INFO(
"---------------------------------------------------------------------------------------");
529 ATH_MSG_INFO(std::format(
"{:<16} {:>12.2f} {:>12.2f} {:>12} {:>12} {:>12} {:>12}", it.first,
530 it.second.cpu_time * 0.001,it.second.wall_time * 0.001,it.second.mem_stats.at(
"vmem"),
531 it.second.mem_stats.at(
"rss"),it.second.mem_stats.at(
"pss"),it.second.mem_stats.at(
"swap")));
536 m_fit_vmem.addPoint(it.first, it.second.mem_stats.at(
"vmem"));
537 m_fit_pss.addPoint(it.first, it.second.mem_stats.at(
"pss"));
540 ATH_MSG_INFO(
"=======================================================================================");
549 ATH_MSG_INFO(
"=======================================================================================");
551 ATH_MSG_INFO(std::format(
"{:<13} {:<12} {:<12} {:<7} {:<11} {:<11} {:<11} {:<11}",
"Step",
552 "dCPU [s]",
"dWall [s]",
"<CPU>",
"dVmem [kB]",
"dRss [kB]",
"dPss [kB]",
"dSwap [kB]"));
554 ATH_MSG_INFO(
"---------------------------------------------------------------------------------------");
556 for (
unsigned int idx = 0; idx <
NSNAPSHOTS; idx++) {
557 ATH_MSG_INFO(std::format(
"{:<13} {:<12.2f} {:<12.2f} {:<7.2f} {:<11} {:<11} {:<11} {:<11}",
565 ATH_MSG_INFO(
"***************************************************************************************");
570 ATH_MSG_INFO(std::format(
"{:<35} {:.0f}",
"CPU usage per event [ms]:",
572 ATH_MSG_INFO(std::format(
"{:<35} {:.3f}",
"Events per second:",
573 (wall_exec_total > 0 ?
m_eventCounter / wall_exec_total * 1000. : 0)));
576 ATH_MSG_INFO(
"***************************************************************************************");
581 ATH_MSG_INFO(
"***************************************************************************************");
585 <<
" measurements from the Event Level Monitoring");
589 ATH_MSG_INFO(
"=======================================================================================");
598 ATH_MSG_INFO(
"=======================================================================================");
603 ATH_MSG_INFO(
"=======================================================================================");
612 ATH_MSG_INFO(
"=======================================================================================");
617 ATH_MSG_INFO(
"=======================================================================================");
640 o << std::setw(4) << j << std::endl;
645 int rc = std::system(cmd.c_str());
670 const double cpuUtil = dCPU / dWall;
671 const int64_t dVmem =
m_snapshotData[i].getMemMonDeltaMap(
"vmem");
674 const int64_t dSwap =
m_snapshotData[i].getMemMonDeltaMap(
"swap");
676 j[
"summary"][
"snapshotLevel"][step] = {{
"dCPU", dCPU},
678 {
"cpuUtil", cpuUtil},
688 j[
"summary"][
"nEvents"] =
nEvents;
696 j[
"summary"][
"peaks"] = {{
"vmemPeak", vmemPeak},
697 {
"rssPeak", rssPeak},
698 {
"pssPeak", pssPeak},
699 {
"swapPeak", swapPeak}};
703 const int64_t pssLeak =
m_fit_pss.slope();
706 j[
"summary"][
"leakEstimates"] = {{
"vmemLeak", vmemLeak},
707 {
"pssLeak", pssLeak},
708 {
"nPoints", nPoints}};
715 j[
"summary"][
"sysInfo"] = {{
"cpuModel", cpuModel},
716 {
"coreNum", coreNum},
720 const std::string mallocLib = std::filesystem::path(
PMonMT::symb2lib(
"malloc")).filename().string();
721 const std::string mathLib = std::filesystem::path(
PMonMT::symb2lib(
"atan2")).filename().string();
723 j[
"summary"][
"envInfo"] = {{
"mallocLib", mallocLib},
724 {
"mathLib", mathLib}};
728 j[
"summary"][
"misc"] = {{
"cpuUtilEff", cpuUtilEff}};
736 for(
const auto& meas : dataMapPerStep){
738 const std::string step = meas.first.stepName;
739 const std::string component = meas.first.compName;
740 const uint64_t
count = meas.second->getCallCount();
741 const double cpuTime = meas.second->getDeltaCPU();
742 const double wallTime = meas.second->getDeltaWall();
743 const int64_t vmem = meas.second->getDeltaVmem();
744 const int64_t mall = meas.second->getDeltaMalloc();
746 j[
"componentLevel"][step][component] = {{
"count",
count},
747 {
"cpuTime", cpuTime},
748 {
"wallTime", wallTime},
761 const uint64_t
event = it.first;
762 const double cpuTime = it.second.cpu_time;
763 const double wallTime = it.second.wall_time;
764 const int64_t vmem = it.second.mem_stats.at(
"vmem");
765 const int64_t rss = it.second.mem_stats.at(
"rss");
766 const int64_t pss = it.second.mem_stats.at(
"pss");
767 const int64_t
swap = it.second.mem_stats.at(
"swap");
769 j[
"eventLevel"][std::to_string(event)] = {{
"cpuTime", cpuTime},
770 {
"wallTime", wallTime},
796 for (
const auto& it : slotData) {
809 if(it.second->getDeltaCPU() < 0) {
810 ATH_MSG_WARNING(
"Negative CPU-time measurement of " << it.second->getDeltaCPU() <<
811 " ms for component " << it.first.compName <<
812 " in step " << it.first.stepName);
814 if(it.second->getDeltaWall() < 0) {
815 ATH_MSG_WARNING(
"Negative Wall-time measurement of " << it.second->getDeltaWall() <<
816 " ms for component " << it.first.compName <<
817 " in step " << it.first.stepName);
828 if (it.first.stepName ==
"Initialize")
830 else if (it.first.stepName ==
"FirstEvent")
832 else if (it.first.stepName ==
"Execute")
834 else if (it.first.stepName ==
"Finalize")
836 else if (it.first.stepName ==
"preLoadProxy")
838 else if (it.first.stepName ==
"Callback")
852 auto ms =
static_cast<int64_t
>(timeMeas);
855 auto hrs = ms / 3600000;
858 auto mins = ms / 60000;
861 auto secs = ms / 1000;
865 std::stringstream
ss;
867 ss << std::setw(2) << hrs <<
"h" <<
868 std::setw(2) << mins <<
"m" <<
869 std::setw(2) << secs <<
"s";
881 std::ostringstream
ss;
883 ss << std::setprecision(2);
886 std::vector<std::string> significance = {
"KB",
"MB",
"GB",
"TB"};
889 int64_t absMemMeas = std::abs(memMeas);
891 int64_t order = std::log(absMemMeas)/std::log(1024);
893 double value = memMeas/std::pow(1024, order);
897 return ss.str() +
" " + significance[order];
904 const std::string& fieldName)
const {
910 std::ifstream
file{fileName};
911 std::string line{
""};
913 while (std::getline(
file, line)) {
914 if (line.empty())
continue;
915 size_t splitIdx = line.find(
':');
916 if (splitIdx != std::string::npos) {
917 std::string val = line.substr(splitIdx + 1);
918 if (val.empty())
continue;
919 if (line.size() >= fieldName.size() &&
920 line.compare(0, fieldName.size(), fieldName) == 0) {
938 if (val.empty())
return 0;
939 return std::stoi(val) + 1;
944 if (val.empty())
return 0;
945 val.resize(val.size() - 3);
946 return std::stoull(val);
#define ATH_CHECK
Evaluate an expression and check for errors.
#define ATH_MSG_WARNING(x)
void swap(DataVector< T > &a, DataVector< T > &b)
See DataVector<T, BASE>::swap().
void sort(typename DataModel_detail::iterator< DVL > beg, typename DataModel_detail::iterator< DVL > end)
Specialization of sort for DataVector/List.
Define macros for attributes used to control the static checker.
#define ATLAS_THREAD_SAFE
PMonMT::EventLevelData m_eventLevelData
std::vector< data_map_t > m_stdoutVec_serial
data_map_t m_compLevelDataMap_cbk
virtual void stopAud(const std::string &stepName, const std::string &compName, const EventContext &ctx) override
Stop Auditing.
Gaudi::Property< int > m_numberOfSlots
Get the number of slots.
Gaudi::Property< std::string > m_jsonFileName
Name of the JSON file.
data_map_t m_compLevelDataMap_plp
std::atomic< bool > m_isFirstEvent
Gaudi::Property< bool > m_doComponentLevelMonitoring
Do component level monitoring.
void aggregateSlotData()
A few helper functions.
PerfMon::LinFitSglPass m_fit_pss
void stopSnapshotAud(const std::string &stepName, const std::string &compName)
void report2JsonFile()
Report to the JSON File.
data_map_t m_compLevelDataMap_fin
const std::set< std::string > m_exclusionSet
Exclude some common components from monitoring In the future this might be converted to a inclusion s...
Gaudi::Property< uint64_t > m_checkPointThreshold
Frequency of event level monitoring.
std::atomic< double > m_checkPointTime
std::string get_info_from_file(const std::string &fileName, const std::string &fieldName) const
A few helper methods to get system information These should be carried to PerfMonMTUtils at some poin...
void report2Log_EventLevel()
void report2Log_Description() const
std::atomic< uint64_t > m_eventCounter
std::vector< std::string > m_snapshotStepNames
virtual void handle(const Incident &incident) override
Incident service handle for post-finalize.
std::map< PMonMT::StepComp, std::unique_ptr< PMonMT::ComponentData > > data_map_unique_t
PMonMT::StepComp generate_state(const std::string &stepName, const std::string &compName) const
std::string scaleTime(double timeMeas) const
virtual StatusCode initialize() override
Standard Gaudi Service initialization.
PMonMT::SnapshotMeasurement m_measurementSnapshots
Measurement to capture snapshots.
Gaudi::Property< bool > m_doEventLoopMonitoring
Do event loop monitoring.
PMonMT::SnapshotMeasurement m_measurementEvents
Measurement to capture events.
Gaudi::Property< uint64_t > m_memFitLowerLimit
Lower limit (in number of events) for the memory fit.
std::vector< PMonMT::SnapshotData > m_snapshotData
virtual StatusCode finalize() override
Standard Gaudi Service finalization.
void report2Log_EventLevel_instant() const
void stopCompAud(const std::string &stepName, const std::string &compName, const EventContext &ctx)
std::atomic< bool > m_isEvtLoopStopped
std::atomic< uint64_t > m_eventLoopMsgCounter
int getCpuEfficiency() const
data_map_t m_compLevelDataMap_ini
int get_cpu_core_info() const
data_map_t m_compLevelDataMap
void report2Log_Summary()
void report2Log()
Report to log.
virtual void startAud(const std::string &stepName, const std::string &compName, const EventContext &ctx) override
Start Auditing.
data_map_t m_compLevelDataMap_evt
void report2Log_CpuInfo() const
Gaudi::Property< double > m_wallTimeOffset
Offset for the wall-time, comes from configuration.
void report2JsonFile_Summary(nlohmann::json &j) const
int m_motherPID
Snapshots data.
void report()
Report the results.
void report2JsonFile_EventLevel(nlohmann::json &j) const
void report2Log_ComponentLevel()
std::string scaleMem(int64_t memMeas) const
std::string get_cpu_model_info() const
Gaudi::Property< bool > m_printDetailedTables
Print detailed tables.
std::mutex m_mutex_capture
Gaudi::Property< int > m_numberOfThreads
Get the number of threads.
void startSnapshotAud(const std::string &stepName, const std::string &compName)
Snapshot Auditing: Take snapshots at the beginning and at the end of each step.
uint64_t get_memory_info() const
void report2JsonFile_ComponentLevel(nlohmann::json &j) const
Gaudi::Property< int > m_printNComps
Print the top N components.
Gaudi::Property< bool > m_reportResultsToJSON
Report results to JSON.
std::vector< data_map_unique_t > m_compLevelDataMapVec
PerfMonMTSvc(const std::string &name, ISvcLocator *pSvcLocator)
Standard Gaudi Service constructor.
void report2Log_EnvInfo() const
void startCompAud(const std::string &stepName, const std::string &compName, const EventContext &ctx)
Component Level Auditing: Take measurements at the beginning and at the end of each component call.
data_map_t m_compLevelDataMap_1stevt
Gaudi::Property< uint64_t > m_eventLoopMsgLimit
Set the number of messages for the event-level report.
PerfMon::LinFitSglPass m_fit_vmem
int count(std::string s, const std::string ®x)
count how many occurances of a regx are in a string
bool doesDirectoryExist(const std::string &dir)
const char * symb2lib(const char *symbol, const char *failstr)
DataModel_detail::iterator< DVL > remove(typename DataModel_detail::iterator< DVL > beg, typename DataModel_detail::iterator< DVL > end, const T &value)
Specialization of remove for DataVector/List.