|
ATLAS Offline Software
|
Go to the documentation of this file.
13 #include "GaudiKernel/IIncidentSvc.h"
14 #include "GaudiKernel/ThreadLocalContext.h"
29 #include "boost/format.hpp"
32 #include "tbb/task_arena.h"
38 :
AthService(
name, pSvcLocator), m_isFirstEvent{
false}, m_eventCounter{0}, m_eventLoopMsgCounter{0}, m_checkPointTime{0}, m_isEvtLoopStopped{
false} {
40 m_motherPID = getpid();
41 m_snapshotData.resize(NSNAPSHOTS);
44 m_measurementSnapshots.capture();
45 m_snapshotData[CONFIGURE].addPointStop(m_measurementSnapshots);
46 m_snapshotData[INITIALIZE].addPointStart(m_measurementSnapshots);
54 return StatusCode::FAILURE;
59 return StatusCode::SUCCESS;
62 return AthService::queryInterface(riid, ppvInterface);
75 const long highestPriority =
static_cast<long>(-1);
76 const long lowestPriority = 0;
77 incSvc->addListener(
this, IncidentType::BeginEvent, highestPriority);
78 incSvc->addListener(
this,
"EndAlgorithms", lowestPriority);
79 incSvc->addListener(
this,
"EndEvtLoop", highestPriority);
80 incSvc->addListener(
this, IncidentType::SvcPostFinalize);
85 ATH_MSG_INFO(
"The system doesn't support /proc. Therefore, memory measurements are not available");
90 "analyzing [" <<
m_numberOfSlots.toString() <<
"] events concurrently");
93 ATH_MSG_INFO(
" >> Component-level memory monitoring in the event-loop is disabled in jobs with more than 1 thread");
107 ATH_MSG_ERROR(
"Could not register auditor [PerfMonMTAuditor]!");
108 return StatusCode::FAILURE;
110 return StatusCode::SUCCESS;
120 return StatusCode::SUCCESS;
128 if (inc.type() == IncidentType::BeginEvent) {
183 else if (inc.type() ==
"EndEvtLoop") {
189 else if (inc.type() == IncidentType::SvcPostFinalize) {
213 auto const &ctx = Gaudi::Hive::currentContext();
228 auto const &ctx = Gaudi::Hive::currentContext();
238 if (
compName ==
"AthOutSeq" && stepName ==
"Start") {
245 if (
compName ==
"AthMasterSeq" && stepName ==
"Finalize") {
256 if (
compName ==
"AthMasterSeq" && stepName ==
"Initialize") {
275 const unsigned int ithread = (ctx.valid() && tbb::this_task_arena::current_thread_index() > -1) ? tbb::this_task_arena::current_thread_index() : 0;
286 if(compLevelDataMap.find(currentState) == compLevelDataMap.end()) {
287 compLevelDataMap.insert({currentState, std::make_unique<PMonMT::ComponentData>()});
298 compLevelDataMap[currentState]->addPointStart(meas, doMem);
302 "Step " << stepName <<
" , "
303 "Event " << ctx.evt() <<
" , "
304 "Slot " << ctx.slot() <<
" , "
305 "Context " << ctx.valid() <<
" , "
306 "Thread " << ithread <<
" , "
309 "Vmem " << meas.
vmem <<
" kb, "
310 "Malloc " << meas.
malloc <<
" kb");
318 const unsigned int ithread = (ctx.valid() && tbb::this_task_arena::current_thread_index() > -1) ? tbb::this_task_arena::current_thread_index() : 0;
336 compLevelDataMap[currentState]->addPointStop(meas, doMem);
348 "Step " << stepName <<
" , "
349 "Event " << ctx.evt() <<
" , "
350 "Slot " << ctx.slot() <<
" , "
351 "Context " << ctx.valid() <<
" , "
352 "Thread " << ithread <<
" , "
353 "Cpu (" << compLevelDataMap[currentState]->m_tmp_cpu <<
":"
355 << (meas.
cpu_time - compLevelDataMap[currentState]->m_tmp_cpu) <<
":"
356 << compLevelDataMap[currentState]->m_delta_cpu <<
") ms, "
357 "Wall (" << compLevelDataMap[currentState]->m_tmp_wall <<
":"
359 << (meas.
wall_time - compLevelDataMap[currentState]->m_tmp_wall) <<
":"
360 << compLevelDataMap[currentState]->m_delta_wall <<
") ms, "
361 "Vmem (" << compLevelDataMap[currentState]->m_tmp_vmem <<
":"
363 << (meas.
vmem - compLevelDataMap[currentState]->m_tmp_vmem) <<
":"
364 << compLevelDataMap[currentState]->m_delta_vmem <<
") kb, "
365 "Malloc (" << compLevelDataMap[currentState]->m_tmp_malloc <<
":"
367 << (meas.
malloc - compLevelDataMap[currentState]->m_tmp_malloc) <<
":"
368 << compLevelDataMap[currentState]->m_delta_malloc <<
") kb");
382 const double totalCpuTime =
389 const double scaledWallTime =
396 return ( scaledWallTime > 0 ? totalCpuTime / scaledWallTime * 100. : 0 );
440 ATH_MSG_INFO(
"=======================================================================================");
442 ATH_MSG_INFO(
"=======================================================================================");
445 ATH_MSG_INFO(
"*** In order to make plots using the results run the following commands:");
447 ATH_MSG_INFO(
"*** In order to print tables using the results run the following commands:");
449 ATH_MSG_INFO(
"=======================================================================================");
459 ATH_MSG_INFO(
"=======================================================================================");
461 ATH_MSG_INFO(
"=======================================================================================");
463 ATH_MSG_INFO(
format(
"%1% %|15t|%2% %|25t|%3% %|40t|%4% %|55t|%5% %|75t|%6%") %
"Step" %
"Count" %
"CPU Time [ms]" %
464 "Vmem [kB]" %
"Malloc [kB]" %
"Component");
466 ATH_MSG_INFO(
"---------------------------------------------------------------------------------------");
473 std::vector<std::pair<PMonMT::StepComp, PMonMT::ComponentData*>>
pairs;
474 for (
auto itr = vec_itr.begin(); itr != vec_itr.end(); ++itr)
pairs.push_back(*itr);
477 [=](std::pair<PMonMT::StepComp, PMonMT::ComponentData*>&
a,
478 std::pair<PMonMT::StepComp, PMonMT::ComponentData*>&
b) {
479 return a.second->getDeltaCPU() > b.second->getDeltaCPU();
490 ATH_MSG_INFO(
format(
"%1% %|15t|%2% %|25t|%3$.2f %|40t|%4$.0f %|55t|%5$.0f %|75t|%6%") %
it.first.stepName %
491 it.second->getCallCount() %
it.second->getDeltaCPU() %
it.second->getDeltaVmem() %
492 it.second->getDeltaMalloc() %
it.first.compName);
495 ATH_MSG_INFO(
"=======================================================================================");
525 " and the last measurements are explicitly printed)");
526 ATH_MSG_INFO(
"=======================================================================================");
528 ATH_MSG_INFO(
format(
"%1% %|16t|%2% %|28t|%3% %|40t|%4% %|52t|%5% %|64t|%6% %|76t|%7%") %
"Event" %
"CPU [s]" %
529 "Wall [s]" %
"Vmem [kB]" %
"Rss [kB]" %
"Pss [kB]" %
"Swap [kB]");
531 ATH_MSG_INFO(
"---------------------------------------------------------------------------------------");
542 ATH_MSG_INFO(
format(
"%1% %|16t|%2$.2f %|28t|%3$.2f %|40t|%4% %|52t|%5% %|64t|%6% %|76t|%7%") %
it.first %
543 (
it.second.cpu_time * 0.001) % (
it.second.wall_time * 0.001) %
it.second.mem_stats.at(
"vmem") %
544 it.second.mem_stats.at(
"rss") %
it.second.mem_stats.at(
"pss") %
it.second.mem_stats.at(
"swap"));
553 ATH_MSG_INFO(
"=======================================================================================");
563 ATH_MSG_INFO(
"=======================================================================================");
565 ATH_MSG_INFO(
format(
"%1% %|13t|%2% %|25t|%3% %|37t|%4% %|44t|%5% %|55t|%6% %|66t|%7% %|77t|%8%") %
"Step" %
566 "dCPU [s]" %
"dWall [s]" %
"<CPU>" %
"dVmem [kB]" %
"dRss [kB]" %
"dPss [kB]" %
"dSwap [kB]");
568 ATH_MSG_INFO(
"---------------------------------------------------------------------------------------");
571 ATH_MSG_INFO(
format(
"%1% %|13t|%2% %|25t|%3% %|37t|%4$.2f %|44t|%5% %|55t|%6% %|66t|%7% %|77t|%8%") %
579 ATH_MSG_INFO(
"***************************************************************************************");
587 (wall_exec_total > 0 ?
m_eventCounter / wall_exec_total * 1000. : 0));
591 ATH_MSG_INFO(
"***************************************************************************************");
596 ATH_MSG_INFO(
"***************************************************************************************");
600 <<
" measurements from the Event Level Monitoring");
604 ATH_MSG_INFO(
"=======================================================================================");
614 ATH_MSG_INFO(
"=======================================================================================");
620 ATH_MSG_INFO(
"=======================================================================================");
631 ATH_MSG_INFO(
"=======================================================================================");
636 ATH_MSG_INFO(
"=======================================================================================");
659 o << std::setw(4) << j << std::endl;
664 int rc = std::system(
cmd.c_str());
689 const double cpuUtil = dCPU / dWall;
695 j[
"summary"][
"snapshotLevel"][
step] = {{
"dCPU", dCPU},
697 {
"cpuUtil", cpuUtil},
707 j[
"summary"][
"nEvents"] =
nEvents;
715 j[
"summary"][
"peaks"] = {{
"vmemPeak", vmemPeak},
716 {
"rssPeak", rssPeak},
717 {
"pssPeak", pssPeak},
718 {
"swapPeak", swapPeak}};
725 j[
"summary"][
"leakEstimates"] = {{
"vmemLeak", vmemLeak},
726 {
"pssLeak", pssLeak},
727 {
"nPoints", nPoints}};
734 j[
"summary"][
"sysInfo"] = {{
"cpuModel", cpuModel},
735 {
"coreNum", coreNum},
742 j[
"summary"][
"envInfo"] = {{
"mallocLib", mallocLib},
743 {
"mathLib", mathLib}};
747 j[
"summary"][
"misc"] = {{
"cpuUtilEff", cpuUtilEff}};
755 for(
const auto& meas : dataMapPerStep){
757 const std::string
step = meas.first.stepName;
758 const std::string component = meas.first.compName;
760 const double cpuTime = meas.second->getDeltaCPU();
761 const double wallTime = meas.second->getDeltaWall();
762 const int64_t vmem = meas.second->getDeltaVmem();
763 const int64_t mall = meas.second->getDeltaMalloc();
765 j[
"componentLevel"][
step][component] = {{
"count",
count},
766 {
"cpuTime", cpuTime},
767 {
"wallTime", wallTime},
781 const double cpuTime =
it.second.cpu_time;
782 const double wallTime =
it.second.wall_time;
783 const int64_t vmem =
it.second.mem_stats.at(
"vmem");
784 const int64_t rss =
it.second.mem_stats.at(
"rss");
785 const int64_t pss =
it.second.mem_stats.at(
"pss");
786 const int64_t
swap =
it.second.mem_stats.at(
"swap");
789 {
"wallTime", wallTime},
815 for (
const auto&
it : slotData) {
828 if(
it.second->getDeltaCPU() < 0) {
829 ATH_MSG_WARNING(
"Negative CPU-time measurement of " <<
it.second->getDeltaCPU() <<
830 " ms for component " <<
it.first.compName <<
831 " in step " <<
it.first.stepName);
833 if(
it.second->getDeltaWall() < 0) {
834 ATH_MSG_WARNING(
"Negative Wall-time measurement of " <<
it.second->getDeltaWall() <<
835 " ms for component " <<
it.first.compName <<
836 " in step " <<
it.first.stepName);
847 if (
it.first.stepName ==
"Initialize")
849 else if (
it.first.stepName ==
"FirstEvent")
851 else if (
it.first.stepName ==
"Execute")
853 else if (
it.first.stepName ==
"Finalize")
855 else if (
it.first.stepName ==
"preLoadProxy")
857 else if (
it.first.stepName ==
"Callback")
871 auto ms =
static_cast<int64_t
>(timeMeas);
874 auto hrs =
ms / 3600000;
877 auto mins =
ms / 60000;
880 auto secs =
ms / 1000;
884 std::stringstream
ss;
886 ss << std::setw(2) << hrs <<
"h" <<
887 std::setw(2) << mins <<
"m" <<
888 std::setw(2) << secs <<
"s";
900 std::ostringstream
ss;
902 ss << std::setprecision(2);
905 std::vector<std::string> significance = {
"KB",
"MB",
"GB",
"TB"};
908 int64_t absMemMeas = std::abs(memMeas);
916 return ss.str() +
" " + significance[
order];
930 std::string
line{
""};
933 if (
line.empty())
continue;
934 size_t splitIdx =
line.find(
':');
935 if (splitIdx != std::string::npos) {
936 std::string
val =
line.substr(splitIdx + 1);
937 if (
val.empty())
continue;
957 if (
val.empty())
return 0;
958 return std::stoi(
val) + 1;
963 if (
val.empty())
return 0;
964 val.resize(
val.size() - 3);
965 return std::stoull(
val);
Gaudi::Property< bool > m_doComponentLevelMonitoring
Do component level monitoring.
void report2JsonFile()
Report to the JSON File.
std::atomic< bool > m_isFirstEvent
void stopSnapshotAud(const std::string &stepName, const std::string &compName)
void report2Log_Description() const
int getCpuEfficiency() const
void report2Log_ComponentLevel()
data_map_t m_compLevelDataMap_fin
path
python interpreter configuration
Gaudi::Property< int > m_numberOfThreads
Get the number of threads.
PMonMT::SnapshotMeasurement m_measurementSnapshots
Measurement to capture snapshots.
data_map_t m_compLevelDataMap
void addPoint(const double &, const double &)
StatusCode makeAuditor(const std::string &audName, IAuditorSvc *audSvc, MsgStream &msg)
simple function to factorize boring things such as asking the AuditorSvc if an auditor is there (and ...
int m_motherPID
Snapshots data.
void report2Log_CpuInfo() const
void report2Log_EnvInfo() const
virtual StatusCode initialize() override
Standard Gaudi Service initialization.
std::atomic< uint64_t > m_eventCounter
int get_cpu_core_info() const
std::atomic< double > m_checkPointTime
Gaudi::Property< double > m_wallTimeOffset
Offset for the wall-time, comes from configuration.
void report2JsonFile_ComponentLevel(nlohmann::json &j) const
PerfMon::LinFitSglPass m_fit_vmem
virtual void handle(const Incident &incident) override
Incident service handle for the BeginEvent, EndEvtLoop and post-finalize (SvcPostFinalize) incidents.
std::string get_cpu_model_info() const
void stopCompAud(const std::string &stepName, const std::string &compName, const EventContext &ctx)
PMonMT::SnapshotMeasurement m_measurementEvents
Measurement to capture events.
Gaudi::Property< uint64_t > m_eventLoopMsgLimit
Set the number of messages for the event-level report.
EventMeasMap_t getEventLevelData() const
void report2Log_EventLevel()
void startSnapshotAud(const std::string &stepName, const std::string &compName)
Snapshot Auditing: Take snapshots at the beginning and at the end of each step.
std::atomic< bool > m_isEvtLoopStopped
void report2Log_Summary()
data_map_t m_compLevelDataMap_ini
data_map_t m_compLevelDataMap_evt
std::vector< PMonMT::SnapshotData > m_snapshotData
void report2Log_EventLevel_instant() const
virtual void startAud(const std::string &stepName, const std::string &compName) override
Start Auditing.
PerfMon::LinFitSglPass m_fit_pss
void report2Log()
Report to log.
POOL::TEvent event(POOL::TEvent::kClassAccess)
virtual StatusCode queryInterface(const InterfaceID &riid, void **ppvInterface) override
Function declaring the interface(s) implemented by the service.
int64_t getEventLevelMemoryMax(const std::string &stat) const
Gaudi::Property< bool > m_reportResultsToJSON
Report results to JSON.
::StatusCode StatusCode
StatusCode definition for legacy code.
Gaudi::Property< std::string > m_jsonFileName
Name of the JSON file.
virtual StatusCode finalize() override
Standard Gaudi Service finalization.
void report2JsonFile_EventLevel(nlohmann::json &j) const
std::atomic< uint64_t > m_eventLoopMsgCounter
std::string scaleMem(int64_t memMeas) const
std::string scaleTime(double timeMeas) const
uint64_t get_memory_info() const
virtual void stopAud(const std::string &stepName, const std::string &compName) override
Stop Auditing.
void report2JsonFile_Summary(nlohmann::json &j) const
std::vector< std::string > m_snapshotStepNames
const std::set< std::string > m_exclusionSet
Exclude some common components from monitoring. In the future this might be converted to an inclusion s...
std::string to_string(const DetectorType &type)
const char * symb2lib(const char *symbol, const char *failstr="unknown")
int64_t getEventLevelMemory(const uint64_t event_count, const std::string &stat) const
std::mutex m_mutex_capture
Gaudi::Property< int > m_numberOfSlots
Get the number of slots.
Gaudi::Property< int > m_printNComps
Print the top N components.
Gaudi::Property< bool > m_printDetailedTables
Print detailed tables.
void aggregateSlotData()
A few helper functions.
void set_wall_time_offset(const double wall_time_offset)
std::vector< data_map_t > m_stdoutVec_serial
PMonMT::EventLevelData m_eventLevelData
PerfMonMTSvc(const std::string &name, ISvcLocator *pSvcLocator)
Standard Gaudi Service constructor.
uint64_t getNMeasurements() const
Gaudi::Property< bool > m_doEventLoopMonitoring
Do event loop monitoring.
Gaudi::Property< uint64_t > m_checkPointThreshold
Frequency of event level monitoring.
static const InterfaceID & interfaceID()
Framework - Service InterfaceID.
#define ATH_MSG_WARNING(x)
data_map_t m_compLevelDataMap_1stevt
Gaudi::Property< uint64_t > m_memFitLowerLimit
Lower limit (in number of events) for the memory fit.
#define ATLAS_THREAD_SAFE
Define macros for attributes used to control the static checker.
std::string get_info_from_file(const std::string &fileName, const std::string &fieldName) const
A few helper methods to get system information. These should be carried to PerfMonMTUtils at some poin...
bool doesDirectoryExist(const std::string &dir)
std::vector< data_map_unique_t > m_compLevelDataMapVec
data_map_t m_compLevelDataMap_cbk
std::map< PMonMT::StepComp, std::unique_ptr< PMonMT::ComponentData > > data_map_unique_t
PMonMT::StepComp generate_state(const std::string &stepName, const std::string &compName) const
void report()
Report the results.
void startCompAud(const std::string &stepName, const std::string &compName, const EventContext &ctx)
Component Level Auditing: Take measurements at the beginning and at the end of each component call.
double getEventLevelWallTime(const uint64_t event_count) const
data_map_t m_compLevelDataMap_plp
void recordEvent(const SnapshotMeasurement &meas, const int eventCount)
double getEventLevelCpuTime(const uint64_t event_count) const