ATLAS Offline Software
Loading...
Searching...
No Matches
PerfMonMTSvc.h
Go to the documentation of this file.
1/*
2 Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
3*/
4
5/*
6 * @authors: Alaettin Serhan Mete, Hasan Ozturk - alaettin.serhan.mete@cern.ch, haozturk@cern.ch
7 */
8
9#ifndef PERFMONCOMPS_PERFMONMTSVC_H
10#define PERFMONCOMPS_PERFMONMTSVC_H
11
12// Thread-safety-checker
14
15// Framework includes
17#include "GaudiKernel/IIncidentListener.h"
18
19// PerfMonKernel includes
21
22// PerfMonComps includes
23#include "LinFitSglPass.h"
24#include "PerfMonMTUtils.h"
25
26// Containers
27#include <set>
28#include <map>
29#include <vector>
30
31
32// Input/Output includes
33#include <nlohmann/json.hpp>
34
35// Other Libraries
36
37
38#include <memory>
39#include <mutex>
40
// PerfMonMTSvc: a service deriving from AthService that implements the
// IPerfMonMTSvc and IIncidentListener interfaces. It declares the auditing
// entry points (startAud/stopAud), snapshot-level and component-level
// measurement hooks, and the routines that report results to the log and to
// a JSON file.
// NOTE(review): the leading numbers on each line are listing line numbers,
// not part of the code. Some numbered lines carry no declaration here
// (e.g. 110, 113, 164, 167, 195, 205, 211), so this listing is not a
// complete view of the class - check against the original header.
41class PerfMonMTSvc : public extends<AthService, IPerfMonMTSvc, IIncidentListener> {
42 public:
// Standard Gaudi Service constructor.
44 PerfMonMTSvc(const std::string& name, ISvcLocator* pSvcLocator);
45
46 // Destructor (compiler-generated).
47 virtual ~PerfMonMTSvc() = default;
48
// Incident service handle for post-finalize.
50 virtual void handle( const Incident& incident ) override;
51
// Standard Gaudi Service initialization.
53 virtual StatusCode initialize() override;
54
// Standard Gaudi Service finalization.
56 virtual StatusCode finalize() override;
57
// Start Auditing: called with the step name, the component name and the
// event context.
59 virtual void startAud(const std::string& stepName, const std::string& compName, const EventContext& ctx) override;
60
// Stop Auditing: counterpart of startAud, taking the same arguments.
62 virtual void stopAud(const std::string& stepName, const std::string& compName, const EventContext& ctx) override;
63
// Snapshot Auditing: take snapshots at the beginning and at the end of each step.
65 void startSnapshotAud(const std::string& stepName, const std::string& compName);
66 void stopSnapshotAud(const std::string& stepName, const std::string& compName);
67
// Component Level Auditing: take measurements at the beginning and at the
// end of each component call.
69 void startCompAud(const std::string& stepName, const std::string& compName, const EventContext& ctx);
70 void stopCompAud(const std::string& stepName, const std::string& compName, const EventContext& ctx);
71
// Report the results.
73 void report();
74
// Report to log. The report2Log_* methods are the individual sections.
76 void report2Log();
77 void report2Log_Description() const;
81 void report2Log_Summary(); // TODO: make this method const
82 void report2Log_CpuInfo() const;
83 void report2Log_EnvInfo() const;
84
// Report to the JSON file. Each report2JsonFile_* method receives the JSON
// object by non-const reference.
86 void report2JsonFile();
87 void report2JsonFile_Summary(nlohmann::json& j) const;
88 void report2JsonFile_ComponentLevel(nlohmann::json& j) const;
89 void report2JsonFile_EventLevel(nlohmann::json& j) const;
90
// A few helper functions.
92 void aggregateSlotData();
93 void divideData2Steps();
94
// Format a time / memory measurement as a string.
// NOTE(review): the input units and the scaling rules are not visible in
// this header - confirm in the implementation.
95 std::string scaleTime(double timeMeas) const;
96 std::string scaleMem(int64_t memMeas) const;
97
// A few helper methods to get system information. These should be carried
// to PerfMonMTUtils at some point.
100 std::string get_info_from_file(const std::string& fileName, const std::string& fieldName) const;
101 std::string get_cpu_model_info() const;
102 int get_cpu_core_info() const;
103 uint64_t get_memory_info() const;
104
// Build a PMonMT::StepComp from a step name and a component name.
105 PMonMT::StepComp generate_state(const std::string& stepName, const std::string& compName) const;
106
107 private:
110
113
// Do event loop monitoring.
115 Gaudi::Property<bool> m_doEventLoopMonitoring{
116 this, "doEventLoopMonitoring", true,
117 "True if event loop monitoring is enabled, false o/w. Event loop monitoring may cause a decrease in the "
118 "performance due to the usage of locks."};
119
// Do component level monitoring (disabled by default).
120 Gaudi::Property<bool> m_doComponentLevelMonitoring{
121 this, "doComponentLevelMonitoring", false,
122 "True if component level monitoring is enabled, false o/w. Component monitoring may cause a decrease in the "
123 "performance due to the usage of locks."};
124
// Report results to JSON.
125 Gaudi::Property<bool> m_reportResultsToJSON{this, "reportResultsToJSON", true, "Report results into the json file."};
// Name of the JSON file.
127 Gaudi::Property<std::string> m_jsonFileName{this, "jsonFileName", "PerfMonMTSvc_result.json",
128 "Name of the JSON file that contains the results."};
129
// Print detailed tables.
130 Gaudi::Property<bool> m_printDetailedTables{this, "printDetailedTables", true,
131 "Print detailed component-level metrics."};
132
// Lower limit (in number of events) for the memory fit.
133 Gaudi::Property<uint64_t> m_memFitLowerLimit{
134 this, "memFitLowerLimit", 25,
135 "Lower limit (in number of events) for the memory fit."};
136
// Frequency of event level monitoring, expressed as the minimum time
// (in seconds) between two event-level checks.
137 Gaudi::Property<uint64_t> m_checkPointThreshold{
138 this, "checkPointThreshold", 30,
139 "Least amount of time (in seconds) between event-level checks."};
140
// Offset for the wall-time, comes from configuration.
141 Gaudi::Property<double> m_wallTimeOffset{this, "wallTimeOffset", 0, "Job start wall time in miliseconds."};
// Print the top N components.
143 Gaudi::Property<int> m_printNComps{
144 this, "printNComps", 50, "Maximum number of components to be printed."};
145
// Number of threads in the job.
146 Gaudi::Property<int> m_numberOfThreads{this, "numberOfThreads", 1, "Number of threads in the job."};
// Number of slots in the job.
148 Gaudi::Property<int> m_numberOfSlots{this, "numberOfSlots", 1, "Number of slots in the job."};
// Maximum number of messages for the event-level report.
150 Gaudi::Property<uint64_t> m_eventLoopMsgLimit{this, "eventLoopMsgLimit", 10, "Maximum number of event-level messages."};
151
// Exclude some common components (sequences, event loop managers and this
// service itself) from monitoring.
155 const std::set<std::string> m_exclusionSet = {"AthMasterSeq", "AthAlgEvtSeq", "AthAllAlgSeq", "AthAlgSeq", "AthOutSeq",
156 "AthCondSeq", "AthBeginSeq", "AthEndSeq", "AthenaEventLoopMgr", "AthenaHiveEventLoopMgr", "AthMpEvtLoopMgr", "AthenaMtesEventLoopMgr",
157 "PerfMonMTSvc"};
158
// Snapshot data and the names of the snapshot steps.
// NOTE(review): presumably m_snapshotData is indexed in the same order as
// m_snapshotStepNames - confirm in the implementation.
161 std::vector<PMonMT::SnapshotData> m_snapshotData;
162 std::vector<std::string> m_snapshotStepNames = {"Configure", "Initialize", "FirstEvent", "Execute", "Finalize"};
164
165 // Store event level measurements
// NOTE(review): the declaration this comment refers to is not present in
// this listing.
167
168 // Lock for capturing event loop measurements
169 std::mutex m_mutex_capture;
170
171 // Are we processing the first event?
172 std::atomic<bool> m_isFirstEvent;
173
174 // Count the number of events processed
175 std::atomic<uint64_t> m_eventCounter;
176
177 // Instant event-loop report counter
178 std::atomic<uint64_t> m_eventLoopMsgCounter;
179
180 // The last event-level measurement time in seconds
181 std::atomic<double> m_checkPointTime;
182
183 // This flag protects against double stopping the event loop measurement
184 std::atomic<bool> m_isEvtLoopStopped;
185
186 /*
187 * Data structures to store component level measurements:
 *   data_map_t        maps a StepComp key to a raw ComponentData pointer;
 *   data_map_unique_t maps a StepComp key to a std::unique_ptr<ComponentData>.
188 */
189 typedef std::map<PMonMT::StepComp, PMonMT::ComponentData*> data_map_t;
190 typedef std::map<PMonMT::StepComp, std::unique_ptr<PMonMT::ComponentData>> data_map_unique_t;
191 // Here I'd prefer to use SG::SlotSpecificObj<data_map_t>
192 // However, w/ invalid context it seems to segfault
193 // Can investigate in the future, for now std::vector should be OK
195
196 // m_compLevelDataMap is divided into several maps, which are stored in m_stdoutVec_serial.
197 // There should be a more clever way!
198 std::vector<data_map_unique_t> m_compLevelDataMapVec; // all
205
206 std::vector<data_map_t> m_stdoutVec_serial;
207
208 // Leak estimates
// NOTE(review): the declarations this comment refers to are not present in
// this listing.
211
212 // Estimate CPU efficiency
213 int getCpuEfficiency() const;
214
215}; // class PerfMonMTSvc
216
217#endif // PERFMONCOMPS_PERFMONMTSVC_H
Define macros for attributes used to control the static checker.
PMonMT::EventLevelData m_eventLevelData
std::vector< data_map_t > m_stdoutVec_serial
data_map_t m_compLevelDataMap_cbk
virtual void stopAud(const std::string &stepName, const std::string &compName, const EventContext &ctx) override
Stop Auditing.
Gaudi::Property< int > m_numberOfSlots
Get the number of slots.
Gaudi::Property< std::string > m_jsonFileName
Name of the JSON file.
data_map_t m_compLevelDataMap_plp
std::atomic< bool > m_isFirstEvent
Gaudi::Property< bool > m_doComponentLevelMonitoring
Do component level monitoring.
void aggregateSlotData()
A few helper functions.
PerfMon::LinFitSglPass m_fit_pss
void stopSnapshotAud(const std::string &stepName, const std::string &compName)
void report2JsonFile()
Report to the JSON File.
data_map_t m_compLevelDataMap_fin
const std::set< std::string > m_exclusionSet
Exclude some common components from monitoring. In the future this might be converted to an inclusion s...
Gaudi::Property< uint64_t > m_checkPointThreshold
Frequency of event level monitoring.
std::atomic< double > m_checkPointTime
std::string get_info_from_file(const std::string &fileName, const std::string &fieldName) const
A few helper methods to get system information. These should be carried to PerfMonMTUtils at some poin...
void report2Log_EventLevel()
void report2Log_Description() const
std::atomic< uint64_t > m_eventCounter
std::vector< std::string > m_snapshotStepNames
virtual void handle(const Incident &incident) override
Incident service handle for post-finalize.
std::map< PMonMT::StepComp, std::unique_ptr< PMonMT::ComponentData > > data_map_unique_t
PMonMT::StepComp generate_state(const std::string &stepName, const std::string &compName) const
std::string scaleTime(double timeMeas) const
virtual StatusCode initialize() override
Standard Gaudi Service initialization.
PMonMT::SnapshotMeasurement m_measurementSnapshots
Measurement to capture snapshots.
Gaudi::Property< bool > m_doEventLoopMonitoring
Do event loop monitoring.
PMonMT::SnapshotMeasurement m_measurementEvents
Measurement to capture events.
Gaudi::Property< uint64_t > m_memFitLowerLimit
Lower limit (in number of events) for the memory fit.
std::vector< PMonMT::SnapshotData > m_snapshotData
virtual StatusCode finalize() override
Standard Gaudi Service finalization.
void report2Log_EventLevel_instant() const
void stopCompAud(const std::string &stepName, const std::string &compName, const EventContext &ctx)
std::atomic< bool > m_isEvtLoopStopped
std::atomic< uint64_t > m_eventLoopMsgCounter
int getCpuEfficiency() const
std::map< PMonMT::StepComp, PMonMT::ComponentData * > data_map_t
data_map_t m_compLevelDataMap_ini
int get_cpu_core_info() const
data_map_t m_compLevelDataMap
void report2Log_Summary()
void report2Log()
Report to log.
virtual void startAud(const std::string &stepName, const std::string &compName, const EventContext &ctx) override
Start Auditing.
data_map_t m_compLevelDataMap_evt
void report2Log_CpuInfo() const
Gaudi::Property< double > m_wallTimeOffset
Offset for the wall-time, comes from configuration.
void report2JsonFile_Summary(nlohmann::json &j) const
int m_motherPID
Snapshots data.
void divideData2Steps()
void report()
Report the results.
void report2JsonFile_EventLevel(nlohmann::json &j) const
void report2Log_ComponentLevel()
std::string scaleMem(int64_t memMeas) const
virtual ~PerfMonMTSvc()=default
std::string get_cpu_model_info() const
Gaudi::Property< bool > m_printDetailedTables
Print detailed tables.
std::mutex m_mutex_capture
Gaudi::Property< int > m_numberOfThreads
Get the number of threads.
void startSnapshotAud(const std::string &stepName, const std::string &compName)
Snapshot Auditing: Take snapshots at the beginning and at the end of each step.
uint64_t get_memory_info() const
void report2JsonFile_ComponentLevel(nlohmann::json &j) const
Gaudi::Property< int > m_printNComps
Print the top N components.
Gaudi::Property< bool > m_reportResultsToJSON
Report results to JSON.
std::vector< data_map_unique_t > m_compLevelDataMapVec
PerfMonMTSvc(const std::string &name, ISvcLocator *pSvcLocator)
Standard Gaudi Service constructor.
void report2Log_EnvInfo() const
void startCompAud(const std::string &stepName, const std::string &compName, const EventContext &ctx)
Component Level Auditing: Take measurements at the beginning and at the end of each component call.
data_map_t m_compLevelDataMap_1stevt
Gaudi::Property< uint64_t > m_eventLoopMsgLimit
Set the number of messages for the event-level report.
PerfMon::LinFitSglPass m_fit_vmem