ATLAS Offline Software
Loading...
Searching...
No Matches
PerfMonMTSvc.cxx
Go to the documentation of this file.
1/*
2 Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
3*/
4
5/*
6 * @authors: Alaettin Serhan Mete, Hasan Ozturk - alaettin.serhan.mete@cern.ch, haozturk@cern.ch
7 */
8
9// Thread-safety-checker
11
12// Framework includes
13#include "GaudiKernel/IIncidentSvc.h"
14
15// PerfMonComps includes
16#include "PerfMonMTSvc.h"
17
18// STD includes
19#include <algorithm>
20#include <cmath>
21#include <fstream>
22#include <iomanip>
23#include <filesystem>
24#include <format>
25
26// TBB
27#include "tbb/task_arena.h"
28
29/*
30 * Constructor
31 */
32PerfMonMTSvc::PerfMonMTSvc(const std::string& name, ISvcLocator* pSvcLocator)
33 : base_class(name, pSvcLocator), m_isFirstEvent{false}, m_eventCounter{0}, m_eventLoopMsgCounter{0}, m_checkPointTime{0}, m_isEvtLoopStopped{false} {
34 // Five main snapshots : Configure, Initialize, FirstEvent, Execute, and Finalize
35 m_motherPID = getpid();
36 m_snapshotData.resize(NSNAPSHOTS); // Default construct
37
38 // Initial capture upon construction
39 m_measurementSnapshots.capture();
42}
43
44/*
45 * Initialize the Service
46 */
48 // Print where we are
49 ATH_MSG_INFO("Initializing " << name());
50
51 // Set to be listener to SvcPostFinalize
52 ServiceHandle<IIncidentSvc> incSvc("IncidentSvc/IncidentSvc", name());
53 ATH_CHECK(incSvc.retrieve());
54 const long highestPriority = static_cast<long>(-1);
55 const long lowestPriority = 0;
56 incSvc->addListener(this, IncidentType::BeginEvent, highestPriority);
57 incSvc->addListener(this, "EndAlgorithms", lowestPriority);
58 incSvc->addListener(this, "EndEvtLoop", highestPriority);
59 incSvc->addListener(this, IncidentType::SvcPostFinalize);
60
61 // Check if /proc exists, if not memory statistics are not available
62 const bool procExists = PMonMT::doesDirectoryExist("/proc");
63 if(!procExists) {
64 ATH_MSG_INFO("The system doesn't support /proc. Therefore, memory measurements are not available");
65 }
66
67 // Print some information minimal information about our configuration
68 ATH_MSG_INFO("Service is configured for [" << m_numberOfThreads.toString() << "] threads " <<
69 "analyzing [" << m_numberOfSlots.toString() << "] events concurrently");
70 ATH_MSG_INFO("Component-level measurements are [" << (m_doComponentLevelMonitoring ? "Enabled" : "Disabled") << "]");
72 ATH_MSG_INFO(" >> Component-level memory monitoring in the event-loop is disabled in jobs with more than 1 thread");
73 }
74
75 // Thread specific component-level data map
76 m_compLevelDataMapVec.resize(m_numberOfThreads+1); // Default construct
77
78 // Set wall time offset
79 m_eventLevelData.set_wall_time_offset(m_wallTimeOffset);
80 if (m_wallTimeOffset > 0) {
82 }
83
85 ATH_CHECK(auditorSvc()->addAuditor("PerfMonMTAuditor"));
86
87 return StatusCode::SUCCESS;
88}
89
90/*
91 * Finalize the Service
92 */
94 // Print where we are
95 ATH_MSG_INFO("Finalizing " << name());
96
97 return StatusCode::SUCCESS;
98}
99
100/*
101 * Handle relevant incidents
102 */
103void PerfMonMTSvc::handle(const Incident& inc) {
104 // Begin event processing
105 if (inc.type() == IncidentType::BeginEvent) {
106 // Lock for data integrity
107 std::lock_guard<std::mutex> lock(m_mutex_capture);
108
109 // Increment the internal counter
111
112 // Get current time in seconds
113 double currentTime = PMonMT::get_wall_time()*0.001;
114
115 // Monitor
117 // Overwrite the last measurement time
118 m_checkPointTime = currentTime;
119
120 // Capture
121 m_measurementEvents.capture();
123 // Report instantly - no more than m_eventLoopMsgLimit times
127 }
128 }
129 }
130 // End event processing (as signaled by SG clean-up)
131 // By convention the first event is executed serially
132 // Therefore, we treat it a little bit differently
133 else if (m_eventCounter == 1 && inc.type() == "EndAlgorithms") {
134 // In AthenaMP w/ fork-after-initialize, the loop starts
135 // in the mother process but the first event is actually
136 // executed in the worker. Here, we try to work around this
137 // by resetting the first event measurement if we think
138 // we're in AthenaMP. This is not an ideal approach but
139 // gets the job done for the fork-after-initialize case.
140 if (m_motherPID != getpid()) {
141 m_snapshotData[FIRSTEVENT].m_tmp_cpu = 0;
142 m_snapshotData[FIRSTEVENT].m_memMonTmpMap["vmem"] = 0;
143 m_snapshotData[FIRSTEVENT].m_memMonTmpMap["pss"] = 0;
144 m_snapshotData[FIRSTEVENT].m_memMonTmpMap["rss"] = 0;
145 m_snapshotData[FIRSTEVENT].m_memMonTmpMap["swap"] = 0;
146 }
147 m_measurementSnapshots.capture();
150 // Normally this flag is set in stopCompAud but we don't
151 // go in there unless m_doComponentLevelMonitoring is true.
152 // If it's false, we toggle it here but
153 // this is mostly for completeness since in that mode
154 // this flag is not really used at the moment.
156 m_isFirstEvent = false;
157 }
158 }
159 // This incident is fired by only some loop managers to signal the end of event processing
160 else if (inc.type() == "EndEvtLoop") {
161 m_isEvtLoopStopped = true;
162 m_measurementSnapshots.capture();
164 }
165 // Finalize ourself and print the metrics in SvcPostFinalize
166 else if (inc.type() == IncidentType::SvcPostFinalize) {
167 // Final capture upon post-finalization
168 m_measurementSnapshots.capture();
170
171 // Report everything
172 report();
173 }
174 return;
175}
176
177namespace {
178 /* Workaround to avoid gcchecker warning about calling currentContext in
179 functions that have an EventContext argument. */
180 const EventContext& getCurrentContext() {
181 return Gaudi::Hive::currentContext();
182 }
183}
184
185/*
186 * Start Auditing
187 */
188void PerfMonMTSvc::startAud(const std::string& stepName, const std::string& compName, const EventContext& ctx) {
189 // Snapshots, i.e. Initialize, Event Loop, etc.
190 startSnapshotAud(stepName, compName);
191
192 /*
193 * Perform component monitoring only if the user asked for it.
194 * By default we don't monitor a set of common components.
195 */
196 if (m_doComponentLevelMonitoring && !m_exclusionSet.contains(compName)) {
197 // Start component auditing (for tools/services we have to resort to thread-local context)
198 startCompAud(stepName, compName, ctx.valid() ? ctx : getCurrentContext());
199 }
200}
201
202/*
203 * Stop Auditing
204 */
205void PerfMonMTSvc::stopAud(const std::string& stepName, const std::string& compName, const EventContext& ctx) {
206 // Snapshots, i.e. Initialize, Event Loop, etc.
207 stopSnapshotAud(stepName, compName);
208
209 // Check if we should monitor this component
210 if (m_doComponentLevelMonitoring && !m_exclusionSet.contains(compName)) {
211 // Stop component auditing (for tools/services we have to resort to thread-local context)
212 stopCompAud(stepName, compName, ctx.valid() ? ctx : getCurrentContext());
213 }
214}
215
216/*
217 * Start Snapshot Auditing
218 */
219void PerfMonMTSvc::startSnapshotAud(const std::string& stepName, const std::string& compName) {
220 // Last thing to be called before the event loop begins
221 if (compName == "AthOutSeq" && stepName == "Start") {
222 m_measurementSnapshots.capture();
224 m_isFirstEvent = true;
225 }
226
227 // Last thing to be called before finalize step begins
228 if (compName == "AthMasterSeq" && stepName == "Finalize") {
229 m_measurementSnapshots.capture();
231 }
232}
233
234/*
235 * Stop Snapshot Auditing
236 */
237void PerfMonMTSvc::stopSnapshotAud(const std::string& stepName, const std::string& compName) {
238 // First thing to be called after the initialize step ends
239 if (compName == "AthMasterSeq" && stepName == "Initialize") {
240 m_measurementSnapshots.capture();
242 }
243
244 // First thing to be called after the event loop ends
245 // Some loop managers fire a dedicated incident to signal the end of the event loop
246 // That preceeds the AthMasterSeq Stop and if it is already handled we don't do anything here
247 if (compName == "AthMasterSeq" && stepName == "Stop" && m_eventCounter > 0 && !m_isEvtLoopStopped) {
248 m_measurementSnapshots.capture();
250 }
251}
252
253/*
254 * Start Component Auditing
255 */
256void PerfMonMTSvc::startCompAud(const std::string& stepName, const std::string& compName, const EventContext& ctx) {
257 // Get the thread index
258 const unsigned int ithread = (ctx.valid() && tbb::this_task_arena::current_thread_index() > -1) ? tbb::this_task_arena::current_thread_index() : 0;
259
260 // Memory measurement is only done outside the loop except when there is only a single thread
261 const bool doMem = !ctx.valid() || (m_numberOfThreads == 1);
262
263 // Generate State
264 PMonMT::StepComp currentState = generate_state(stepName, compName);
265
266 // Check if this is the first time calling if so create the mesurement data if not use the existing one.
267 // Metrics are collected per thread then aggregated before reporting
268 data_map_unique_t& compLevelDataMap = m_compLevelDataMapVec[ithread];
269 if(compLevelDataMap.find(currentState) == compLevelDataMap.end()) {
270 compLevelDataMap.insert({currentState, std::make_unique<PMonMT::ComponentData>()});
271 }
272
273 // Capture and store
275 meas.capture(); // No memory in the event-loop
276 if (doMem) {
277 // we made sure this is only run outside event loop or single-threaded
278 [[maybe_unused]] bool dummy ATLAS_THREAD_SAFE = meas.capture_memory();
279 }
280
281 compLevelDataMap[currentState]->addPointStart(meas, doMem);
282
283 // Debug
284 ATH_MSG_DEBUG("Start Audit - Component " << compName << " , "
285 "Step " << stepName << " , "
286 "Event " << ctx.evt() << " , "
287 "Slot " << ctx.slot() << " , "
288 "Context " << ctx.valid() << " , "
289 "Thread " << ithread << " , "
290 "Cpu " << meas.cpu_time << " ms, "
291 "Wall " << meas.wall_time << " ms, "
292 "Vmem " << meas.vmem << " kb, "
293 "Malloc " << meas.malloc << " kb");
294}
295
296/*
297 * Stop Component Auditing
298 */
299void PerfMonMTSvc::stopCompAud(const std::string& stepName, const std::string& compName, const EventContext& ctx) {
300 // Get the thread index
301 const unsigned int ithread = (ctx.valid() && tbb::this_task_arena::current_thread_index() > -1) ? tbb::this_task_arena::current_thread_index() : 0;
302
303 // Memory measurement is only done outside the loop except when there is only a single thread
304 const bool doMem = !ctx.valid() || (m_numberOfThreads == 1);
305
306 // Capture
308 meas.capture(); // No memory in the event-loop
309 if (doMem) {
310 // we made sure this is only run outside event loop or single-threaded
311 [[maybe_unused]] bool dummy ATLAS_THREAD_SAFE = meas.capture_memory();
312 }
313
314 // Generate State
315 PMonMT::StepComp currentState = generate_state(stepName, compName);
316
317 // Store
318 data_map_unique_t& compLevelDataMap = m_compLevelDataMapVec[ithread];
319 auto itr = compLevelDataMap.find(currentState);
320
321 // This can happen if we never got the startCompAud call.
322 // Usually because Gaudi's AuditorSvc was not fully initialized yet.
323 if (itr==compLevelDataMap.end()) return;
324
325 itr->second->addPointStop(meas, doMem);
326
327 // Once the first time IncidentProcAlg3 is excuted, toggle m_isFirstEvent to false.
328 // Doing it this way, instead of at EndAlgorithms incident, makes sure there is no
329 // mismatch in start-stop calls to IncidentProcAlg3.
330 // It's a little ad-hoc but I don't think this workflow will change much anytime soon.
331 if ( m_isFirstEvent && compName == "IncidentProcAlg3" && stepName == "Execute") {
332 m_isFirstEvent = false;
333 }
334
335 // Debug
336 ATH_MSG_DEBUG("Stop Audit - Component " << compName << " , "
337 "Step " << stepName << " , "
338 "Event " << ctx.evt() << " , "
339 "Slot " << ctx.slot() << " , "
340 "Context " << ctx.valid() << " , "
341 "Thread " << ithread << " , "
342 "Cpu (" << compLevelDataMap[currentState]->m_tmp_cpu << ":"
343 << meas.cpu_time << ":"
344 << (meas.cpu_time - compLevelDataMap[currentState]->m_tmp_cpu) << ":"
345 << compLevelDataMap[currentState]->m_delta_cpu << ") ms, "
346 "Wall (" << compLevelDataMap[currentState]->m_tmp_wall << ":"
347 << meas.wall_time << ":"
348 << (meas.wall_time - compLevelDataMap[currentState]->m_tmp_wall) << ":"
349 << compLevelDataMap[currentState]->m_delta_wall << ") ms, "
350 "Vmem (" << compLevelDataMap[currentState]->m_tmp_vmem << ":"
351 << meas.vmem << ":"
352 << (meas.vmem - compLevelDataMap[currentState]->m_tmp_vmem) << ":"
353 << compLevelDataMap[currentState]->m_delta_vmem << ") kb, "
354 "Malloc (" << compLevelDataMap[currentState]->m_tmp_malloc << ":"
355 << meas.malloc << ":"
356 << (meas.malloc - compLevelDataMap[currentState]->m_tmp_malloc) << ":"
357 << compLevelDataMap[currentState]->m_delta_malloc << ") kb");
358}
359
360/*
361 * Helper finction to estimate CPU efficiency
362 */
364
365 // In AthenaMT only the event-loop is executed concurrently
366 // In this metric, we scale the event-loop wall-time by
367 // the number of slots to take the concurrency into account
368 // Then we divide the total cpu-time by this number
369 // It's A metric not THE metric...
370
371 const double totalCpuTime =
372 m_snapshotData[CONFIGURE].getDeltaCPU() +
373 m_snapshotData[INITIALIZE].getDeltaCPU() +
374 m_snapshotData[FIRSTEVENT].getDeltaCPU() +
375 m_snapshotData[EXECUTE].getDeltaCPU() +
376 m_snapshotData[FINALIZE].getDeltaCPU();
377
378 const double scaledWallTime =
379 m_snapshotData[CONFIGURE].getDeltaWall() * 1. +
380 m_snapshotData[INITIALIZE].getDeltaWall() * 1. +
381 m_snapshotData[FIRSTEVENT].getDeltaWall() * 1. +
382 m_snapshotData[EXECUTE].getDeltaWall() * m_numberOfSlots +
383 m_snapshotData[FINALIZE].getDeltaWall() * 1.;
384
385 return ( scaledWallTime > 0 ? totalCpuTime / scaledWallTime * 100. : 0 );
386
387}
388
389/*
390 * Report the results to the log and the JSON file
391 */
393 // Write into log file
394 report2Log();
395
396 // Write into JSON
399 }
400}
401
402/*
403 * Log reporting
404 */
406 // Header
408
409 // Component-level
412 }
413
414 // Event-level
417 }
418
419 // Summary and system information
423}
424
425/*
426 * Report header to log
427 */
429 ATH_MSG_INFO("=======================================================================================");
430 ATH_MSG_INFO(" PerfMonMTSvc Report ");
431 ATH_MSG_INFO("=======================================================================================");
433 ATH_MSG_INFO("*** Full set of information can also be found in: " << m_jsonFileName.toString());
434 ATH_MSG_INFO("*** In order to make plots using the results run the following commands:");
435 ATH_MSG_INFO("*** $ perfmonmt-plotter.py -i " << m_jsonFileName.toString());
436 ATH_MSG_INFO("*** In order to print tables using the results run the following commands:");
437 ATH_MSG_INFO("*** $ perfmonmt-printer.py -i " << m_jsonFileName.toString());
438 ATH_MSG_INFO("=======================================================================================");
439 }
440}
441
442/*
443 * Report component-level information to log
444 */
446
447 ATH_MSG_INFO("=======================================================================================");
448 ATH_MSG_INFO(" Component Level Monitoring ");
449 ATH_MSG_INFO("=======================================================================================");
450
451 ATH_MSG_INFO(std::format("{:<10} {:<15} {:<25} {:<40} {:<55} {:<75}","Step", "Count", "CPU Time [ms]",
452 "Vmem [kB]", "Malloc [kB]", "Component"));
453
454 ATH_MSG_INFO("---------------------------------------------------------------------------------------");
455
456 aggregateSlotData(); // aggregate data from slots
457 divideData2Steps(); // divive data into steps for ordered printing
458
459 for (auto vec_itr : m_stdoutVec_serial) {
460 // Sort the results by CPU time for the time being
461 std::vector<std::pair<PMonMT::StepComp, PMonMT::ComponentData*>> pairs;
462 for (auto itr = vec_itr.begin(); itr != vec_itr.end(); ++itr) pairs.push_back(*itr);
463
464 sort(pairs.begin(), pairs.end(),
465 [=](std::pair<PMonMT::StepComp, PMonMT::ComponentData*>& a,
466 std::pair<PMonMT::StepComp, PMonMT::ComponentData*>& b) {
467 return a.second->getDeltaCPU() > b.second->getDeltaCPU();
468 });
469
470 int counter = 0;
471 for (auto it : pairs) {
472 // Only write out a certian number of components
473 if (counter >= m_printNComps) {
474 break;
475 }
476 counter++;
477
478 ATH_MSG_INFO(std::format("{:<10} {:<15} {:<25.2f} {:<40.0f} {:<55.0f} {:<75}",it.first.stepName,
479 it.second->getCallCount(),it.second->getDeltaCPU(),it.second->getDeltaVmem(),
480 it.second->getDeltaMalloc(),it.first.compName));
481 }
482 if(counter>0) {
483 ATH_MSG_INFO("=======================================================================================");
484 }
485 }
486}
487
488/*
489 * Report event-level information to log as we capture it
490 */
492 double cpu_time = m_eventLevelData.getEventLevelCpuTime(m_eventCounter);
493 double wall_time = m_eventLevelData.getEventLevelWallTime(m_eventCounter);
494
495 int64_t vmem = m_eventLevelData.getEventLevelMemory(m_eventCounter, "vmem");
496 int64_t rss = m_eventLevelData.getEventLevelMemory(m_eventCounter, "rss");
497 int64_t pss = m_eventLevelData.getEventLevelMemory(m_eventCounter, "pss");
498 int64_t swap = m_eventLevelData.getEventLevelMemory(m_eventCounter, "swap");
499
500 ATH_MSG_INFO("Event [" << std::setw(5) << m_eventCounter << "] CPU Time: " << scaleTime(cpu_time) <<
501 ", Wall Time: " << scaleTime(wall_time) << ", Vmem: " << scaleMem(vmem) <<
502 ", Rss: " << scaleMem(rss) << ", Pss: " << scaleMem(pss) << ", Swap: " << scaleMem(swap));
503}
504
505/*
506 * Report event-level information to log
507 */
509
510 ATH_MSG_INFO(" Event Level Monitoring ");
511 ATH_MSG_INFO(" (Only the first " << m_eventLoopMsgLimit.toString() <<
512 " and the last measurements are explicitly printed)");
513 ATH_MSG_INFO("=======================================================================================");
514
515 ATH_MSG_INFO(std::format("{:<16} {:<12} {:<12} {:<12} {:<12} {:<12} {:<12}","Event", "CPU [s]",
516 "Wall [s]", "Vmem [kB]", "Rss [kB]", "Pss [kB]", "Swap [kB]"));
517
518 ATH_MSG_INFO("---------------------------------------------------------------------------------------");
519
520 m_eventLoopMsgCounter = 0; // reset counter
521 uint64_t nMeasurements = m_eventLevelData.getNMeasurements();
522
523 for (const auto& it : m_eventLevelData.getEventLevelData()) {
524 // Print
527 ATH_MSG_INFO(std::format("{:=<87}", "..."));
528 }
529 ATH_MSG_INFO(std::format("{:<16} {:>12.2f} {:>12.2f} {:>12} {:>12} {:>12} {:>12}", it.first,
530 it.second.cpu_time * 0.001,it.second.wall_time * 0.001,it.second.mem_stats.at("vmem"),
531 it.second.mem_stats.at("rss"),it.second.mem_stats.at("pss"),it.second.mem_stats.at("swap")));
532 }
534 // Add to leak estimate
535 if (it.first >= m_memFitLowerLimit) {
536 m_fit_vmem.addPoint(it.first, it.second.mem_stats.at("vmem"));
537 m_fit_pss.addPoint(it.first, it.second.mem_stats.at("pss"));
538 }
539 }
540 ATH_MSG_INFO("=======================================================================================");
541}
542
543/*
544 * Report summary information to log
545 */
547
548 ATH_MSG_INFO(" Snapshots Summary ");
549 ATH_MSG_INFO("=======================================================================================");
550
551 ATH_MSG_INFO(std::format("{:<13} {:<12} {:<12} {:<7} {:<11} {:<11} {:<11} {:<11}","Step",
552 "dCPU [s]","dWall [s]","<CPU>","dVmem [kB]","dRss [kB]","dPss [kB]","dSwap [kB]"));
553
554 ATH_MSG_INFO("---------------------------------------------------------------------------------------");
555
556 for (unsigned int idx = 0; idx < NSNAPSHOTS; idx++) {
557 ATH_MSG_INFO(std::format("{:<13} {:<12.2f} {:<12.2f} {:<7.2f} {:<11} {:<11} {:<11} {:<11}",
558 m_snapshotStepNames[idx], m_snapshotData[idx].getDeltaCPU() * 0.001,
559 m_snapshotData[idx].getDeltaWall() * 0.001,
560 m_snapshotData[idx].getDeltaCPU() / m_snapshotData[idx].getDeltaWall(),
561 m_snapshotData[idx].getMemMonDeltaMap("vmem"),m_snapshotData[idx].getMemMonDeltaMap("rss"),
562 m_snapshotData[idx].getMemMonDeltaMap("pss"),m_snapshotData[idx].getMemMonDeltaMap("swap")));
563 }
564
565 ATH_MSG_INFO("***************************************************************************************");
566 const double cpu_exec_total = m_snapshotData[FIRSTEVENT].getDeltaCPU() + m_snapshotData[EXECUTE].getDeltaCPU();
567 const double wall_exec_total = m_snapshotData[FIRSTEVENT].getDeltaWall() + m_snapshotData[EXECUTE].getDeltaWall();
568
569 ATH_MSG_INFO(std::format("{:<35} {}", "Number of events processed:", static_cast<int>(m_eventCounter)));
570 ATH_MSG_INFO(std::format("{:<35} {:.0f}", "CPU usage per event [ms]:",
571 (m_eventCounter > 0 ? cpu_exec_total / m_eventCounter : 0)));
572 ATH_MSG_INFO(std::format("{:<35} {:.3f}", "Events per second:",
573 (wall_exec_total > 0 ? m_eventCounter / wall_exec_total * 1000. : 0)));
574 ATH_MSG_INFO(std::format("{:<35} {}", "CPU utilization efficiency [%]:", getCpuEfficiency()));
576 ATH_MSG_INFO("***************************************************************************************");
577 ATH_MSG_INFO(std::format("{:<35} {}", "Max Vmem:", scaleMem(m_eventLevelData.getEventLevelMemoryMax("vmem"))));
578 ATH_MSG_INFO(std::format("{:<35} {}", "Max Rss:", scaleMem(m_eventLevelData.getEventLevelMemoryMax("rss"))));
579 ATH_MSG_INFO(std::format("{:<35} {}", "Max Pss:", scaleMem(m_eventLevelData.getEventLevelMemoryMax("pss"))));
580 ATH_MSG_INFO(std::format("{:<35} {}", "Max Swap:", scaleMem(m_eventLevelData.getEventLevelMemoryMax("swap"))));
581 ATH_MSG_INFO("***************************************************************************************");
582 ATH_MSG_INFO(std::format("{:<35} {}", "Leak estimate per event Vmem:", scaleMem(m_fit_vmem.slope())));
583 ATH_MSG_INFO(std::format("{:<35} {}", "Leak estimate per event Pss:", scaleMem(m_fit_pss.slope())));
584 ATH_MSG_INFO(" >> Estimated using the last " << m_fit_vmem.nPoints()
585 << " measurements from the Event Level Monitoring");
586 ATH_MSG_INFO(" >> Events prior to the first " << m_memFitLowerLimit.toString() << " are omitted...");
587 }
588
589 ATH_MSG_INFO("=======================================================================================");
590}
591
592/*
593 * Report CPU information to log
594 */
596
597 ATH_MSG_INFO(" System Information ");
598 ATH_MSG_INFO("=======================================================================================");
599
600 ATH_MSG_INFO(std::format("{:<34} {}", "CPU Model:", get_cpu_model_info()));
601 ATH_MSG_INFO(std::format("{:<35} {}", "Number of Available Cores:", get_cpu_core_info()));
602 ATH_MSG_INFO(std::format("{:<35} {}", "Total Memory:", scaleMem(get_memory_info())));
603 ATH_MSG_INFO("=======================================================================================");
604}
605
606/*
607 * Report run-time enviroment information
608 */
610
611 ATH_MSG_INFO(" Environment Information ");
612 ATH_MSG_INFO("=======================================================================================");
613
614 ATH_MSG_INFO(std::format("{:<35} {}","Malloc Library:", std::filesystem::path(PMonMT::symb2lib("malloc")).filename().string()));
615 ATH_MSG_INFO(std::format("{:<35} {}","Math Library:", std::filesystem::path(PMonMT::symb2lib("atan2")).filename().string()));
616
617 ATH_MSG_INFO("=======================================================================================");
618
619}
620
621/*
622 * Report data to JSON
623 */
625 nlohmann::json j;
626
627 // CPU and Wall-time
628 report2JsonFile_Summary(j); // Snapshots
629
630 // Memory
632 report2JsonFile_ComponentLevel(j); // Component-level
633 }
635 report2JsonFile_EventLevel(j); // Event-level
636 }
637
638 // Write and close the JSON file
639 std::ofstream o(m_jsonFileName);
640 o << std::setw(4) << j << std::endl;
641 o.close();
642
643 // Compress the JSON file into tar.gz
644 auto cmd = "tar -czf " + m_jsonFileName + ".tar.gz " + m_jsonFileName + ";";
645 int rc = std::system(cmd.c_str());
646 if(rc!=0) {
647 ATH_MSG_WARNING("Couldn't compress the JSON file...");
648 return;
649 }
650
651 // Remove the uncompressed JSON file to save disk-space
652 rc = std::remove(m_jsonFileName.toString().c_str());
653 if(rc!=0) {
654 ATH_MSG_WARNING("Couldn't remove the uncompressed JSON file...");
655 return;
656 }
657}
658
659/*
660 * Report summary data to JSON
661 */
662void PerfMonMTSvc::report2JsonFile_Summary(nlohmann::json& j) const {
663
664 // Report snapshot level results
665 for(int i=0; i < NSNAPSHOTS; i++){
666
667 const std::string step = m_snapshotStepNames[i];
668 const double dCPU = m_snapshotData[i].getDeltaCPU();
669 const double dWall = m_snapshotData[i].getDeltaWall();
670 const double cpuUtil = dCPU / dWall;
671 const int64_t dVmem = m_snapshotData[i].getMemMonDeltaMap("vmem");
672 const int64_t dRss = m_snapshotData[i].getMemMonDeltaMap("rss");
673 const int64_t dPss = m_snapshotData[i].getMemMonDeltaMap("pss");
674 const int64_t dSwap = m_snapshotData[i].getMemMonDeltaMap("swap");
675
676 j["summary"]["snapshotLevel"][step] = {{"dCPU", dCPU},
677 {"dWall", dWall},
678 {"cpuUtil", cpuUtil},
679 {"dVmem", dVmem},
680 {"dRss", dRss},
681 {"dPss", dPss},
682 {"dSwap", dSwap}};
683
684 }
685
686 // Report the total number of events
687 const int64_t nEvents = m_eventCounter;
688 j["summary"]["nEvents"] = nEvents;
689
690 // Report Peaks
691 const int64_t vmemPeak = m_eventLevelData.getEventLevelMemoryMax("vmem");
692 const int64_t rssPeak = m_eventLevelData.getEventLevelMemoryMax("rss");
693 const int64_t pssPeak = m_eventLevelData.getEventLevelMemoryMax("pss");
694 const int64_t swapPeak = m_eventLevelData.getEventLevelMemoryMax("swap");
695
696 j["summary"]["peaks"] = {{"vmemPeak", vmemPeak},
697 {"rssPeak", rssPeak},
698 {"pssPeak", pssPeak},
699 {"swapPeak", swapPeak}};
700
701 // Report leak estimates
702 const int64_t vmemLeak = m_fit_vmem.slope();
703 const int64_t pssLeak = m_fit_pss.slope();
704 const int64_t nPoints = m_fit_vmem.nPoints();
705
706 j["summary"]["leakEstimates"] = {{"vmemLeak", vmemLeak},
707 {"pssLeak", pssLeak},
708 {"nPoints", nPoints}};
709
710 // Report Sys info
711 const std::string cpuModel = get_cpu_model_info();
712 const int coreNum = get_cpu_core_info();
713 const int64_t totMem = get_memory_info();
714
715 j["summary"]["sysInfo"] = {{"cpuModel", cpuModel},
716 {"coreNum", coreNum},
717 {"totMem", totMem}};
718
719 // Report Enviroment info
720 const std::string mallocLib = std::filesystem::path(PMonMT::symb2lib("malloc")).filename().string();
721 const std::string mathLib = std::filesystem::path(PMonMT::symb2lib("atan2")).filename().string();
722
723 j["summary"]["envInfo"] = {{"mallocLib", mallocLib},
724 {"mathLib", mathLib}};
725
726 // Report CPU utilization efficiency;
727 const int cpuUtilEff = getCpuEfficiency();
728 j["summary"]["misc"] = {{"cpuUtilEff", cpuUtilEff}};
729
730}
731
732void PerfMonMTSvc::report2JsonFile_ComponentLevel(nlohmann::json& j) const {
733
734 for (const auto& dataMapPerStep : m_stdoutVec_serial) {
735
736 for(const auto& meas : dataMapPerStep){
737
738 const std::string step = meas.first.stepName;
739 const std::string component = meas.first.compName;
740 const uint64_t count = meas.second->getCallCount();
741 const double cpuTime = meas.second->getDeltaCPU();
742 const double wallTime = meas.second->getDeltaWall();
743 const int64_t vmem = meas.second->getDeltaVmem();
744 const int64_t mall = meas.second->getDeltaMalloc();
745
746 j["componentLevel"][step][component] = {{"count", count},
747 {"cpuTime", cpuTime},
748 {"wallTime", wallTime},
749 {"vmem", vmem},
750 {"malloc", mall}};
751 }
752
753 }
754
755}
756
757void PerfMonMTSvc::report2JsonFile_EventLevel(nlohmann::json& j) const {
758
759 for (const auto& it : m_eventLevelData.getEventLevelData()) {
760
761 const uint64_t event = it.first;
762 const double cpuTime = it.second.cpu_time;
763 const double wallTime = it.second.wall_time;
764 const int64_t vmem = it.second.mem_stats.at("vmem");
765 const int64_t rss = it.second.mem_stats.at("rss");
766 const int64_t pss = it.second.mem_stats.at("pss");
767 const int64_t swap = it.second.mem_stats.at("swap");
768
769 j["eventLevel"][std::to_string(event)] = {{"cpuTime", cpuTime},
770 {"wallTime", wallTime},
771 {"vmem", vmem},
772 {"rss", rss},
773 {"pss", pss},
774 {"swap", swap}};
775
776
777 }
778}
779
780/*
781 * Generate a "state" that is use as a key for the component-level data
782 */
783PMonMT::StepComp PerfMonMTSvc::generate_state(const std::string& stepName, const std::string& compName) const {
784 PMonMT::StepComp currentState;
785 currentState.stepName = (m_isFirstEvent && stepName == "Execute") ? "FirstEvent" : stepName;
786 currentState.compName = compName;
787 return currentState;
788}
789
790/*
791 * Aggregate component-level data from all slots
792 */
794 // Loop over data from all slots
795 for (const auto& slotData : m_compLevelDataMapVec) {
796 for (const auto& it : slotData) {
797 // Copy the first slot data and sum the rest
798 if(m_compLevelDataMap.find(it.first) == m_compLevelDataMap.end()) {
799 m_compLevelDataMap.insert({it.first, it.second.get()});
800 } else {
801 m_compLevelDataMap[it.first]->add2CallCount(it.second->getCallCount());
802 m_compLevelDataMap[it.first]->add2DeltaCPU(it.second->getDeltaCPU());
803 m_compLevelDataMap[it.first]->add2DeltaWall(it.second->getDeltaWall());
804 m_compLevelDataMap[it.first]->add2DeltaVmem(it.second->getDeltaVmem());
805 m_compLevelDataMap[it.first]->add2DeltaMalloc(it.second->getDeltaMalloc());
806 }
807 // Do a quick consistency check here and print any suspicious measurements.
808 // Timing measurements should always be positive definite
809 if(it.second->getDeltaCPU() < 0) {
810 ATH_MSG_WARNING("Negative CPU-time measurement of " << it.second->getDeltaCPU() <<
811 " ms for component " << it.first.compName <<
812 " in step " << it.first.stepName);
813 }
814 if(it.second->getDeltaWall() < 0) {
815 ATH_MSG_WARNING("Negative Wall-time measurement of " << it.second->getDeltaWall() <<
816 " ms for component " << it.first.compName <<
817 " in step " << it.first.stepName);
818 }
819 }
820 }
821}
822
823/*
824 * Divide component-level data into steps, for printing
825 */
827 for (const auto &it : m_compLevelDataMap) {
828 if (it.first.stepName == "Initialize")
829 m_compLevelDataMap_ini[it.first] = it.second;
830 else if (it.first.stepName == "FirstEvent")
831 m_compLevelDataMap_1stevt[it.first] = it.second;
832 else if (it.first.stepName == "Execute")
833 m_compLevelDataMap_evt[it.first] = it.second;
834 else if (it.first.stepName == "Finalize")
835 m_compLevelDataMap_fin[it.first] = it.second;
836 else if (it.first.stepName == "preLoadProxy")
837 m_compLevelDataMap_plp[it.first] = it.second;
838 else if (it.first.stepName == "Callback")
839 m_compLevelDataMap_cbk[it.first] = it.second;
840 }
847}
848
849std::string PerfMonMTSvc::scaleTime(double timeMeas) const {
850 // Not a huge fan of this, we should eventually unify the types
851 // Just want to be explicit about what's happening
852 auto ms = static_cast<int64_t>(timeMeas);
853
854 // Compute hrs and offset
855 auto hrs = ms / 3600000;
856 ms -= hrs * 3600000;
857 // Compute mins and offset
858 auto mins = ms / 60000;
859 ms -= mins * 60000;
860 // Compute secs and offset
861 auto secs = ms / 1000;
862 ms -= secs * 1000;
863
864 // Primarily care about H:M:S
865 std::stringstream ss;
866 ss.fill('0');
867 ss << std::setw(2) << hrs << "h" <<
868 std::setw(2) << mins << "m" <<
869 std::setw(2) << secs << "s";
870 return ss.str();
871}
872
873std::string PerfMonMTSvc::scaleMem(int64_t memMeas) const {
874
875 // Check if there is anything to be done
876 if (memMeas == 0) {
877 return "0.00 KB" ;
878 }
879
880 // Prepare for the result
881 std::ostringstream ss;
882 ss << std::fixed;
883 ss << std::setprecision(2);
884
885 // The input is in KB
886 std::vector<std::string> significance = {"KB", "MB", "GB", "TB"};
887
888 // Get the absolute value
889 int64_t absMemMeas = std::abs(memMeas);
890 // Find the order, note that this is an int operation
891 int64_t order = std::log(absMemMeas)/std::log(1024);
892 // Compute the final value preserving the sign
893 double value = memMeas/std::pow(1024, order);
894 // Convert the result to a string
895 ss << value;
896
897 return ss.str() + " " + significance[order];
898}
899
900/*
901 * Collect some hardware information
902 */
903std::string PerfMonMTSvc::get_info_from_file(const std::string& fileName,
904 const std::string& fieldName) const {
905 // Helper function to read files of type Key : Value
906 // Returns the last instance if there are multiple matches
907 // This is because we use this method to get the processor count
908 std::string result{""};
909
910 std::ifstream file{fileName};
911 std::string line{""};
912
913 while (std::getline(file, line)) {
914 if (line.empty()) continue;
915 size_t splitIdx = line.find(':');
916 if (splitIdx != std::string::npos) {
917 std::string val = line.substr(splitIdx + 1);
918 if (val.empty()) continue;
919 if (line.size() >= fieldName.size() &&
920 line.compare(0, fieldName.size(), fieldName) == 0) {
921 result = std::move(val);
922 }
923 }
924 }
925
926 file.close();
927
928 return result;
929}
930
932 return get_info_from_file("/proc/cpuinfo","model name") +
933 get_info_from_file("/proc/cpuinfo","cache size");
934}
935
937 std::string val = get_info_from_file("/proc/cpuinfo","processor");
938 if (val.empty()) return 0;
939 return std::stoi(val) + 1;
940}
941
943 std::string val = get_info_from_file("/proc/meminfo","MemTotal");
944 if (val.empty()) return 0;
945 val.resize(val.size() - 3); // strip the trailing kB
946 return std::stoull(val);
947}
#define ATH_CHECK
Evaluate an expression and check for errors.
#define ATH_MSG_INFO(x)
#define ATH_MSG_WARNING(x)
#define ATH_MSG_DEBUG(x)
void swap(DataVector< T > &a, DataVector< T > &b)
See DataVector<T, BASE>::swap().
static Double_t a
static Double_t ss
static Double_t rc
void sort(typename DataModel_detail::iterator< DVL > beg, typename DataModel_detail::iterator< DVL > end)
Specialization of sort for DataVector/List.
Define macros for attributes used to control the static checker.
#define ATLAS_THREAD_SAFE
PMonMT::EventLevelData m_eventLevelData
std::vector< data_map_t > m_stdoutVec_serial
data_map_t m_compLevelDataMap_cbk
virtual void stopAud(const std::string &stepName, const std::string &compName, const EventContext &ctx) override
Stop Auditing.
Gaudi::Property< int > m_numberOfSlots
Get the number of slots.
Gaudi::Property< std::string > m_jsonFileName
Name of the JSON file.
data_map_t m_compLevelDataMap_plp
std::atomic< bool > m_isFirstEvent
Gaudi::Property< bool > m_doComponentLevelMonitoring
Do component level monitoring.
void aggregateSlotData()
A few helper functions.
PerfMon::LinFitSglPass m_fit_pss
void stopSnapshotAud(const std::string &stepName, const std::string &compName)
void report2JsonFile()
Report to the JSON File.
data_map_t m_compLevelDataMap_fin
const std::set< std::string > m_exclusionSet
Exclude some common components from monitoring In the future this might be converted to a inclusion s...
Gaudi::Property< uint64_t > m_checkPointThreshold
Frequency of event level monitoring.
std::atomic< double > m_checkPointTime
std::string get_info_from_file(const std::string &fileName, const std::string &fieldName) const
A few helper methods to get system information These should be carried to PerfMonMTUtils at some poin...
void report2Log_EventLevel()
void report2Log_Description() const
std::atomic< uint64_t > m_eventCounter
std::vector< std::string > m_snapshotStepNames
virtual void handle(const Incident &incident) override
Incident service handle for post-finalize.
std::map< PMonMT::StepComp, std::unique_ptr< PMonMT::ComponentData > > data_map_unique_t
PMonMT::StepComp generate_state(const std::string &stepName, const std::string &compName) const
std::string scaleTime(double timeMeas) const
virtual StatusCode initialize() override
Standard Gaudi Service initialization.
PMonMT::SnapshotMeasurement m_measurementSnapshots
Measurement to capture snapshots.
Gaudi::Property< bool > m_doEventLoopMonitoring
Do event loop monitoring.
PMonMT::SnapshotMeasurement m_measurementEvents
Measurement to capture events.
Gaudi::Property< uint64_t > m_memFitLowerLimit
Lower limit (in number of events) for the memory fit.
std::vector< PMonMT::SnapshotData > m_snapshotData
virtual StatusCode finalize() override
Standard Gaudi Service finalization.
void report2Log_EventLevel_instant() const
void stopCompAud(const std::string &stepName, const std::string &compName, const EventContext &ctx)
std::atomic< bool > m_isEvtLoopStopped
std::atomic< uint64_t > m_eventLoopMsgCounter
int getCpuEfficiency() const
data_map_t m_compLevelDataMap_ini
int get_cpu_core_info() const
data_map_t m_compLevelDataMap
void report2Log_Summary()
void report2Log()
Report to log.
virtual void startAud(const std::string &stepName, const std::string &compName, const EventContext &ctx) override
Start Auditing.
data_map_t m_compLevelDataMap_evt
void report2Log_CpuInfo() const
Gaudi::Property< double > m_wallTimeOffset
Offset for the wall-time, comes from configuration.
void report2JsonFile_Summary(nlohmann::json &j) const
int m_motherPID
Snapshots data.
void divideData2Steps()
void report()
Report the results.
void report2JsonFile_EventLevel(nlohmann::json &j) const
void report2Log_ComponentLevel()
std::string scaleMem(int64_t memMeas) const
std::string get_cpu_model_info() const
Gaudi::Property< bool > m_printDetailedTables
Print detailed tables.
std::mutex m_mutex_capture
Gaudi::Property< int > m_numberOfThreads
Get the number of threads.
void startSnapshotAud(const std::string &stepName, const std::string &compName)
Snapshot Auditing: Take snapshots at the beginning and at the end of each step.
uint64_t get_memory_info() const
void report2JsonFile_ComponentLevel(nlohmann::json &j) const
Gaudi::Property< int > m_printNComps
Print the top N components.
Gaudi::Property< bool > m_reportResultsToJSON
Report results to JSON.
std::vector< data_map_unique_t > m_compLevelDataMapVec
PerfMonMTSvc(const std::string &name, ISvcLocator *pSvcLocator)
Standard Gaudi Service constructor.
void report2Log_EnvInfo() const
void startCompAud(const std::string &stepName, const std::string &compName, const EventContext &ctx)
Component Level Auditing: Take measurements at the beginning and at the end of each component call.
data_map_t m_compLevelDataMap_1stevt
Gaudi::Property< uint64_t > m_eventLoopMsgLimit
Set the number of messages for the event-level report.
PerfMon::LinFitSglPass m_fit_vmem
const int nEvents
int count(std::string s, const std::string &regx)
count how many occurances of a regx are in a string
Definition hcg.cxx:146
double get_wall_time()
bool doesDirectoryExist(const std::string &dir)
const char * symb2lib(const char *symbol, const char *failstr)
DataModel_detail::iterator< DVL > remove(typename DataModel_detail::iterator< DVL > beg, typename DataModel_detail::iterator< DVL > end, const T &value)
Specialization of remove for DataVector/List.
std::string compName
std::string stepName
TFile * file