|
ATLAS Offline Software
|
Go to the documentation of this file.
13 #include "GaudiKernel/IIncidentSvc.h"
14 #include "GaudiKernel/ThreadLocalContext.h"
29 #include "tbb/task_arena.h"
35 : base_class(
name, pSvcLocator), m_isFirstEvent{
false}, m_eventCounter{0}, m_eventLoopMsgCounter{0}, m_checkPointTime{0}, m_isEvtLoopStopped{
false} {
37 m_motherPID = getpid();
38 m_snapshotData.resize(NSNAPSHOTS);
41 m_measurementSnapshots.capture();
42 m_snapshotData[CONFIGURE].addPointStop(m_measurementSnapshots);
43 m_snapshotData[INITIALIZE].addPointStart(m_measurementSnapshots);
56 const long highestPriority =
static_cast<long>(-1);
57 const long lowestPriority = 0;
58 incSvc->addListener(
this, IncidentType::BeginEvent, highestPriority);
59 incSvc->addListener(
this,
"EndAlgorithms", lowestPriority);
60 incSvc->addListener(
this,
"EndEvtLoop", highestPriority);
61 incSvc->addListener(
this, IncidentType::SvcPostFinalize);
66 ATH_MSG_INFO(
"The system doesn't support /proc. Therefore, memory measurements are not available");
71 "analyzing [" <<
m_numberOfSlots.toString() <<
"] events concurrently");
74 ATH_MSG_INFO(
" >> Component-level memory monitoring in the event-loop is disabled in jobs with more than 1 thread");
88 ATH_MSG_ERROR(
"Could not register auditor [PerfMonMTAuditor]!");
89 return StatusCode::FAILURE;
91 return StatusCode::SUCCESS;
101 return StatusCode::SUCCESS;
109 if (inc.type() == IncidentType::BeginEvent) {
164 else if (inc.type() ==
"EndEvtLoop") {
170 else if (inc.type() == IncidentType::SvcPostFinalize) {
194 auto const &ctx = Gaudi::Hive::currentContext();
209 auto const &ctx = Gaudi::Hive::currentContext();
219 if (
compName ==
"AthOutSeq" && stepName ==
"Start") {
226 if (
compName ==
"AthMasterSeq" && stepName ==
"Finalize") {
237 if (
compName ==
"AthMasterSeq" && stepName ==
"Initialize") {
256 const unsigned int ithread = (ctx.valid() && tbb::this_task_arena::current_thread_index() > -1) ? tbb::this_task_arena::current_thread_index() : 0;
267 if(compLevelDataMap.find(currentState) == compLevelDataMap.end()) {
268 compLevelDataMap.insert({currentState, std::make_unique<PMonMT::ComponentData>()});
279 compLevelDataMap[currentState]->addPointStart(meas, doMem);
283 "Step " << stepName <<
" , "
284 "Event " << ctx.evt() <<
" , "
285 "Slot " << ctx.slot() <<
" , "
286 "Context " << ctx.valid() <<
" , "
287 "Thread " << ithread <<
" , "
290 "Vmem " << meas.
vmem <<
" kb, "
291 "Malloc " << meas.
malloc <<
" kb");
299 const unsigned int ithread = (ctx.valid() && tbb::this_task_arena::current_thread_index() > -1) ? tbb::this_task_arena::current_thread_index() : 0;
317 compLevelDataMap[currentState]->addPointStop(meas, doMem);
329 "Step " << stepName <<
" , "
330 "Event " << ctx.evt() <<
" , "
331 "Slot " << ctx.slot() <<
" , "
332 "Context " << ctx.valid() <<
" , "
333 "Thread " << ithread <<
" , "
334 "Cpu (" << compLevelDataMap[currentState]->m_tmp_cpu <<
":"
336 << (meas.
cpu_time - compLevelDataMap[currentState]->m_tmp_cpu) <<
":"
337 << compLevelDataMap[currentState]->m_delta_cpu <<
") ms, "
338 "Wall (" << compLevelDataMap[currentState]->m_tmp_wall <<
":"
340 << (meas.
wall_time - compLevelDataMap[currentState]->m_tmp_wall) <<
":"
341 << compLevelDataMap[currentState]->m_delta_wall <<
") ms, "
342 "Vmem (" << compLevelDataMap[currentState]->m_tmp_vmem <<
":"
344 << (meas.
vmem - compLevelDataMap[currentState]->m_tmp_vmem) <<
":"
345 << compLevelDataMap[currentState]->m_delta_vmem <<
") kb, "
346 "Malloc (" << compLevelDataMap[currentState]->m_tmp_malloc <<
":"
348 << (meas.
malloc - compLevelDataMap[currentState]->m_tmp_malloc) <<
":"
349 << compLevelDataMap[currentState]->m_delta_malloc <<
") kb");
363 const double totalCpuTime =
370 const double scaledWallTime =
377 return ( scaledWallTime > 0 ? totalCpuTime / scaledWallTime * 100. : 0 );
421 ATH_MSG_INFO(
"=======================================================================================");
423 ATH_MSG_INFO(
"=======================================================================================");
426 ATH_MSG_INFO(
"*** In order to make plots using the results run the following commands:");
428 ATH_MSG_INFO(
"*** In order to print tables using the results run the following commands:");
430 ATH_MSG_INFO(
"=======================================================================================");
439 ATH_MSG_INFO(
"=======================================================================================");
441 ATH_MSG_INFO(
"=======================================================================================");
444 "Vmem [kB]",
"Malloc [kB]",
"Component"));
446 ATH_MSG_INFO(
"---------------------------------------------------------------------------------------");
453 std::vector<std::pair<PMonMT::StepComp, PMonMT::ComponentData*>>
pairs;
454 for (
auto itr = vec_itr.begin(); itr != vec_itr.end(); ++itr)
pairs.push_back(*itr);
457 [=](std::pair<PMonMT::StepComp, PMonMT::ComponentData*>&
a,
458 std::pair<PMonMT::StepComp, PMonMT::ComponentData*>&
b) {
459 return a.second->getDeltaCPU() > b.second->getDeltaCPU();
471 it.second->getCallCount(),
it.second->getDeltaCPU(),
it.second->getDeltaVmem(),
472 it.second->getDeltaMalloc(),
it.first.compName));
475 ATH_MSG_INFO(
"=======================================================================================");
504 " and the last measurements are explicitly printed)");
505 ATH_MSG_INFO(
"=======================================================================================");
508 "Wall [s]",
"Vmem [kB]",
"Rss [kB]",
"Pss [kB]",
"Swap [kB]"));
510 ATH_MSG_INFO(
"---------------------------------------------------------------------------------------");
522 it.second.cpu_time * 0.001,
it.second.wall_time * 0.001,
it.second.mem_stats.at(
"vmem"),
523 it.second.mem_stats.at(
"rss"),
it.second.mem_stats.at(
"pss"),
it.second.mem_stats.at(
"swap")));
532 ATH_MSG_INFO(
"=======================================================================================");
541 ATH_MSG_INFO(
"=======================================================================================");
544 "dCPU [s]",
"dWall [s]",
"<CPU>",
"dVmem [kB]",
"dRss [kB]",
"dPss [kB]",
"dSwap [kB]"));
546 ATH_MSG_INFO(
"---------------------------------------------------------------------------------------");
557 ATH_MSG_INFO(
"***************************************************************************************");
565 (wall_exec_total > 0 ?
m_eventCounter / wall_exec_total * 1000. : 0)));
568 ATH_MSG_INFO(
"***************************************************************************************");
573 ATH_MSG_INFO(
"***************************************************************************************");
577 <<
" measurements from the Event Level Monitoring");
581 ATH_MSG_INFO(
"=======================================================================================");
590 ATH_MSG_INFO(
"=======================================================================================");
595 ATH_MSG_INFO(
"=======================================================================================");
604 ATH_MSG_INFO(
"=======================================================================================");
609 ATH_MSG_INFO(
"=======================================================================================");
632 o << std::setw(4) << j << std::endl;
637 int rc = std::system(
cmd.c_str());
662 const double cpuUtil = dCPU / dWall;
668 j[
"summary"][
"snapshotLevel"][
step] = {{
"dCPU", dCPU},
670 {
"cpuUtil", cpuUtil},
680 j[
"summary"][
"nEvents"] =
nEvents;
688 j[
"summary"][
"peaks"] = {{
"vmemPeak", vmemPeak},
689 {
"rssPeak", rssPeak},
690 {
"pssPeak", pssPeak},
691 {
"swapPeak", swapPeak}};
698 j[
"summary"][
"leakEstimates"] = {{
"vmemLeak", vmemLeak},
699 {
"pssLeak", pssLeak},
700 {
"nPoints", nPoints}};
707 j[
"summary"][
"sysInfo"] = {{
"cpuModel", cpuModel},
708 {
"coreNum", coreNum},
715 j[
"summary"][
"envInfo"] = {{
"mallocLib", mallocLib},
716 {
"mathLib", mathLib}};
720 j[
"summary"][
"misc"] = {{
"cpuUtilEff", cpuUtilEff}};
728 for(
const auto& meas : dataMapPerStep){
730 const std::string
step = meas.first.stepName;
731 const std::string component = meas.first.compName;
733 const double cpuTime = meas.second->getDeltaCPU();
734 const double wallTime = meas.second->getDeltaWall();
735 const int64_t vmem = meas.second->getDeltaVmem();
736 const int64_t mall = meas.second->getDeltaMalloc();
738 j[
"componentLevel"][
step][component] = {{
"count",
count},
739 {
"cpuTime", cpuTime},
740 {
"wallTime", wallTime},
754 const double cpuTime =
it.second.cpu_time;
755 const double wallTime =
it.second.wall_time;
756 const int64_t vmem =
it.second.mem_stats.at(
"vmem");
757 const int64_t rss =
it.second.mem_stats.at(
"rss");
758 const int64_t pss =
it.second.mem_stats.at(
"pss");
759 const int64_t
swap =
it.second.mem_stats.at(
"swap");
762 {
"wallTime", wallTime},
788 for (
const auto&
it : slotData) {
801 if(
it.second->getDeltaCPU() < 0) {
802 ATH_MSG_WARNING(
"Negative CPU-time measurement of " <<
it.second->getDeltaCPU() <<
803 " ms for component " <<
it.first.compName <<
804 " in step " <<
it.first.stepName);
806 if(
it.second->getDeltaWall() < 0) {
807 ATH_MSG_WARNING(
"Negative Wall-time measurement of " <<
it.second->getDeltaWall() <<
808 " ms for component " <<
it.first.compName <<
809 " in step " <<
it.first.stepName);
820 if (
it.first.stepName ==
"Initialize")
822 else if (
it.first.stepName ==
"FirstEvent")
824 else if (
it.first.stepName ==
"Execute")
826 else if (
it.first.stepName ==
"Finalize")
828 else if (
it.first.stepName ==
"preLoadProxy")
830 else if (
it.first.stepName ==
"Callback")
844 auto ms =
static_cast<int64_t
>(timeMeas);
847 auto hrs =
ms / 3600000;
850 auto mins =
ms / 60000;
853 auto secs =
ms / 1000;
857 std::stringstream
ss;
859 ss << std::setw(2) << hrs <<
"h" <<
860 std::setw(2) << mins <<
"m" <<
861 std::setw(2) << secs <<
"s";
873 std::ostringstream
ss;
875 ss << std::setprecision(2);
878 std::vector<std::string> significance = {
"KB",
"MB",
"GB",
"TB"};
881 int64_t absMemMeas = std::abs(memMeas);
889 return ss.str() +
" " + significance[
order];
903 std::string
line{
""};
906 if (
line.empty())
continue;
907 size_t splitIdx =
line.find(
':');
908 if (splitIdx != std::string::npos) {
909 std::string
val =
line.substr(splitIdx + 1);
910 if (
val.empty())
continue;
930 if (
val.empty())
return 0;
931 return std::stoi(
val) + 1;
936 if (
val.empty())
return 0;
937 val.resize(
val.size() - 3);
938 return std::stoull(
val);
Gaudi::Property< bool > m_doComponentLevelMonitoring
Do component level monitoring.
void report2JsonFile()
Report to the JSON File.
std::atomic< bool > m_isFirstEvent
void stopSnapshotAud(const std::string &stepName, const std::string &compName)
void report2Log_Description() const
int getCpuEfficiency() const
void report2Log_ComponentLevel()
const char * symb2lib(const char *symbol, const char *failstr)
data_map_t m_compLevelDataMap_fin
path
python interpreter configuration --------------------------------------—
Gaudi::Property< int > m_numberOfThreads
Get the number of threads.
PMonMT::SnapshotMeasurement m_measurementSnapshots
Measurement to capture snapshots.
data_map_t m_compLevelDataMap
void addPoint(const double &, const double &)
StatusCode makeAuditor(const std::string &audName, IAuditorSvc *audSvc, MsgStream &msg)
simple function to factorize boring things such as asking the AuditorSvc if an auditor is there (and ...
int m_motherPID
Snapshots data.
void report2Log_CpuInfo() const
void report2Log_EnvInfo() const
virtual StatusCode initialize() override
Standard Gaudi Service initialization.
std::atomic< uint64_t > m_eventCounter
int get_cpu_core_info() const
std::atomic< double > m_checkPointTime
Gaudi::Property< double > m_wallTimeOffset
Offset for the wall-time, comes from configuration.
void report2JsonFile_ComponentLevel(nlohmann::json &j) const
PerfMon::LinFitSglPass m_fit_vmem
virtual void handle(const Incident &incident) override
Incident service handle for post-finalize.
std::string get_cpu_model_info() const
void stopCompAud(const std::string &stepName, const std::string &compName, const EventContext &ctx)
PMonMT::SnapshotMeasurement m_measurementEvents
Measurement to capture events.
Gaudi::Property< uint64_t > m_eventLoopMsgLimit
Set the number of messages for the event-level report.
void report2Log_EventLevel()
void startSnapshotAud(const std::string &stepName, const std::string &compName)
Snapshot Auditing: Take snapshots at the beginning and at the end of each step.
std::atomic< bool > m_isEvtLoopStopped
void report2Log_Summary()
data_map_t m_compLevelDataMap_ini
data_map_t m_compLevelDataMap_evt
std::vector< PMonMT::SnapshotData > m_snapshotData
void report2Log_EventLevel_instant() const
const EventMeasMap_t & getEventLevelData() const
virtual void startAud(const std::string &stepName, const std::string &compName) override
Start Auditing.
PerfMon::LinFitSglPass m_fit_pss
void report2Log()
Report to log.
POOL::TEvent event(POOL::TEvent::kClassAccess)
int64_t getEventLevelMemoryMax(const std::string &stat) const
Gaudi::Property< bool > m_reportResultsToJSON
Report results to JSON.
::StatusCode StatusCode
StatusCode definition for legacy code.
Gaudi::Property< std::string > m_jsonFileName
Name of the JSON file.
virtual StatusCode finalize() override
Standard Gaudi Service finalization.
void report2JsonFile_EventLevel(nlohmann::json &j) const
std::atomic< uint64_t > m_eventLoopMsgCounter
std::string scaleMem(int64_t memMeas) const
std::string scaleTime(double timeMeas) const
uint64_t get_memory_info() const
virtual void stopAud(const std::string &stepName, const std::string &compName) override
Stop Auditing.
void report2JsonFile_Summary(nlohmann::json &j) const
std::vector< std::string > m_snapshotStepNames
const std::set< std::string > m_exclusionSet
Exclude some common components from monitoring In the future this might be converted to a inclusion s...
std::string to_string(const DetectorType &type)
int64_t getEventLevelMemory(const uint64_t event_count, const std::string &stat) const
std::mutex m_mutex_capture
Gaudi::Property< int > m_numberOfSlots
Get the number of slots.
Gaudi::Property< int > m_printNComps
Print the top N components.
Gaudi::Property< bool > m_printDetailedTables
Print detailed tables.
void aggregateSlotData()
A few helper functions.
void set_wall_time_offset(const double wall_time_offset)
std::vector< data_map_t > m_stdoutVec_serial
PMonMT::EventLevelData m_eventLevelData
PerfMonMTSvc(const std::string &name, ISvcLocator *pSvcLocator)
Standard Gaudi Service constructor.
uint64_t getNMeasurements() const
Gaudi::Property< bool > m_doEventLoopMonitoring
Do event loop monitoring.
Gaudi::Property< uint64_t > m_checkPointThreshold
Frequency of event level monitoring.
#define ATH_MSG_WARNING(x)
data_map_t m_compLevelDataMap_1stevt
Gaudi::Property< uint64_t > m_memFitLowerLimit
Lower limit (in number of events) for the memory fit.
#define ATLAS_THREAD_SAFE
Define macros for attributes used to control the static checker.
std::string get_info_from_file(const std::string &fileName, const std::string &fieldName) const
A few helper methods to get system information These should be carried to PerfMonMTUtils at some poin...
bool doesDirectoryExist(const std::string &dir)
std::vector< data_map_unique_t > m_compLevelDataMapVec
data_map_t m_compLevelDataMap_cbk
std::map< PMonMT::StepComp, std::unique_ptr< PMonMT::ComponentData > > data_map_unique_t
PMonMT::StepComp generate_state(const std::string &stepName, const std::string &compName) const
void report()
Report the results.
void startCompAud(const std::string &stepName, const std::string &compName, const EventContext &ctx)
Component Level Auditing: Take measurements at the beginning and at the end of each component call.
double getEventLevelWallTime(const uint64_t event_count) const
data_map_t m_compLevelDataMap_plp
void recordEvent(const SnapshotMeasurement &meas, const int eventCount)
double getEventLevelCpuTime(const uint64_t event_count) const