ATLAS Offline Software
PerfMonMTSvc.h
Go to the documentation of this file.
1 /*
2  Copyright (C) 2002-2022 CERN for the benefit of the ATLAS collaboration
3 */
4 
5 /*
6  * @authors: Alaettin Serhan Mete, Hasan Ozturk - alaettin.serhan.mete@cern.ch, haozturk@cern.ch
7  */
8 
9 #ifndef PERFMONCOMPS_PERFMONMTSVC_H
10 #define PERFMONCOMPS_PERFMONMTSVC_H
11 
12 // Thread-safety-checker
14 
15 // Framework includes
17 #include "GaudiKernel/IIncidentListener.h"
18 
19 // PerfMonKernel includes
21 
22 // PerfMonComps includes
23 #include "LinFitSglPass.h"
24 #include "PerfMonMTUtils.h"
25 
26 // Containers
27 #include <set>
28 #include <map>
29 #include <vector>
30 
31 
32 // Input/Output includes
33 #include <nlohmann/json.hpp>
34 
35 // Other Libraries
36 
37 
38 #include <memory>
39 #include <mutex>
40 
// PerfMonMTSvc: service class deriving from IPerfMonMTSvc, IIncidentListener and AthService.
// It declares the auditing entry points (startAud/stopAud), snapshot-level and
// component-level auditing helpers, and reporting to the log and to a JSON file.
// NOTE(review): this listing is incomplete. The cross-reference index of this page names
// members whose declaration lines do not appear below (e.g. the Snapshots enum,
// m_motherPID, m_measurementSnapshots, m_measurementEvents, m_eventLevelData,
// m_mutex_capture, m_compLevelDataMap and its per-step variants, m_fit_vmem, m_fit_pss,
// report2Log_ComponentLevel and the report2JsonFile_* helpers).
// NOTE(review): std::atomic is used below but no #include <atomic> is visible in this
// listing; confirm it is provided by one of the included headers.
41 class PerfMonMTSvc : virtual public IPerfMonMTSvc, virtual public IIncidentListener, public AthService {
42  public:
    /// Standard Gaudi Service constructor.
44  PerfMonMTSvc(const std::string& name, ISvcLocator* pSvcLocator);
45 
46  // Destructor
47  virtual ~PerfMonMTSvc() = default;
48 
    /// Function declaring the interface(s) implemented by the service.
50  virtual StatusCode queryInterface(const InterfaceID& riid, void** ppvInterface) override;
51 
    /// Incident service handle for post-finalize.
53  virtual void handle( const Incident& incident ) override;
54 
    /// Standard Gaudi Service initialization.
56  virtual StatusCode initialize() override;
57 
    /// Standard Gaudi Service finalization.
59  virtual StatusCode finalize() override;
60 
    /// Start Auditing.
62  virtual void startAud(const std::string& stepName, const std::string& compName) override;
63 
    /// Stop Auditing.
65  virtual void stopAud(const std::string& stepName, const std::string& compName) override;
66 
    /// Snapshot Auditing: Take snapshots at the beginning and at the end of each step.
68  void startSnapshotAud(const std::string& stepName, const std::string& compName);
69  void stopSnapshotAud(const std::string& stepName, const std::string& compName);
70 
    /// Component Level Auditing: Take measurements at the beginning and at the end of each component call.
    /// Unlike the snapshot variants, these also take the EventContext of the call.
72  void startCompAud(const std::string& stepName, const std::string& compName, const EventContext& ctx);
73  void stopCompAud(const std::string& stepName, const std::string& compName, const EventContext& ctx);
74 
    /// Report the results.
76  void report();
77 
    /// Report to log.
79  void report2Log();
80  void report2Log_Description() const;
82  void report2Log_EventLevel_instant() const;
83  void report2Log_EventLevel();
84  void report2Log_Summary(); // make it const
85  void report2Log_CpuInfo() const;
86  void report2Log_EnvInfo() const;
87 
    /// Report to the JSON File.
89  void report2JsonFile();
93 
    /// A few helper functions.
95  void aggregateSlotData();
96  void divideData2Steps();
97 
    // Both return a std::string for a time (double) / memory (int64_t) measurement.
    // NOTE(review): units and scaling rules are defined in the .cxx, not visible here.
98  std::string scaleTime(double timeMeas) const;
99  std::string scaleMem(int64_t memMeas) const;
100 
     /// A few helper methods to get system information.
     /// These should be carried to PerfMonMTUtils at some point (per the original brief).
103  std::string get_info_from_file(const std::string& fileName, const std::string& fieldName) const;
104  std::string get_cpu_model_info() const;
105  int get_cpu_core_info() const;
106  uint64_t get_memory_info() const;
107 
     // Builds a PMonMT::StepComp from a step name and a component name.
     // StepComp is the key type of the component-level data maps declared further down.
108  PMonMT::StepComp generate_state(const std::string& stepName, const std::string& compName) const;
109 
110  private:
113 
116 
     /// Do event loop monitoring.
118  Gaudi::Property<bool> m_doEventLoopMonitoring{
119  this, "doEventLoopMonitoring", true,
120  "True if event loop monitoring is enabled, false o/w. Event loop monitoring may cause a decrease in the "
121  "performance due to the usage of locks."};
     /// Do component level monitoring.
123  Gaudi::Property<bool> m_doComponentLevelMonitoring{
124  this, "doComponentLevelMonitoring", false,
125  "True if component level monitoring is enabled, false o/w. Component monitoring may cause a decrease in the "
126  "performance due to the usage of locks."};
     /// Report results to JSON.
128  Gaudi::Property<bool> m_reportResultsToJSON{this, "reportResultsToJSON", true, "Report results into the json file."};
     /// Name of the JSON file.
130  Gaudi::Property<std::string> m_jsonFileName{this, "jsonFileName", "PerfMonMTSvc_result.json",
131  "Name of the JSON file that contains the results."};
     /// Print detailed tables.
133  Gaudi::Property<bool> m_printDetailedTables{this, "printDetailedTables", true,
134  "Print detailed component-level metrics."};
     /// Lower limit (in number of events) for the memory fit.
136  Gaudi::Property<uint64_t> m_memFitLowerLimit{
137  this, "memFitLowerLimit", 25,
138  "Lower limit (in number of events) for the memory fit."};
     /// Frequency of event level monitoring.
140  Gaudi::Property<uint64_t> m_checkPointThreshold{
141  this, "checkPointThreshold", 30,
142  "Least amount of time (in seconds) between event-level checks."};
     /// Offset for the wall-time, comes from configuration.
     // NOTE(review): the description string below spells "miliseconds"; it is runtime-visible
     // property text, so it is left as is here.
144  Gaudi::Property<double> m_wallTimeOffset{this, "wallTimeOffset", 0, "Job start wall time in miliseconds."};
     /// Print the top N components.
146  Gaudi::Property<int> m_printNComps{
147  this, "printNComps", 50, "Maximum number of components to be printed."};
     /// Get the number of threads.
149  Gaudi::Property<int> m_numberOfThreads{this, "numberOfThreads", 1, "Number of threads in the job."};
     /// Get the number of slots.
151  Gaudi::Property<int> m_numberOfSlots{this, "numberOfSlots", 1, "Number of slots in the job."};
     /// Set the number of messages for the event-level report.
153  Gaudi::Property<uint64_t> m_eventLoopMsgLimit{this, "eventLoopMsgLimit", 10, "Maximum number of event-level messages."};
154 
     /// Exclude some common components from monitoring.
     /// Fixed set of component names (sequences, event loop managers and this service itself).
158  const std::set<std::string> m_exclusionSet = {"AthMasterSeq", "AthAlgEvtSeq", "AthAllAlgSeq", "AthAlgSeq", "AthOutSeq",
159  "AthCondSeq", "AthBeginSeq", "AthEndSeq", "AthenaEventLoopMgr", "AthenaHiveEventLoopMgr", "AthMpEvtLoopMgr", "AthenaMtesEventLoopMgr",
160  "PerfMonMTSvc"};
161 
     // Snapshot measurements and the names of the five snapshot steps.
     // NOTE(review): presumably m_snapshotData holds one entry per step name, indexed by the
     // Snapshots enum listed in the index (CONFIGURE .. FINALIZE, NSNAPSHOTS); confirm in the .cxx.
164  std::vector<PMonMT::SnapshotData> m_snapshotData;
165  std::vector<std::string> m_snapshotStepNames = {"Configure", "Initialize", "FirstEvent", "Execute", "Finalize"};
167 
168  // Store event level measurements
170 
171  // Lock for capturing event loop measurements
173 
174  // Are we processing the first event?
175  std::atomic<bool> m_isFirstEvent;
176 
177  // Count the number of events processed
178  std::atomic<uint64_t> m_eventCounter;
179 
180  // Instant event-loop report counter
181  std::atomic<uint64_t> m_eventLoopMsgCounter;
182 
183  // The last event-level measurement time in seconds
184  std::atomic<double> m_checkPointTime;
185 
186  // This flag protects against double stopping the event loop measurement
187  std::atomic<bool> m_isEvtLoopStopped;
188 
189  /*
190  * Data structure to store component level measurements
191  */
     // data_map_t stores raw ComponentData pointers; data_map_unique_t stores unique_ptr
     // values for the same key type.
     // NOTE(review): presumably the raw-pointer maps are non-owning views into the
     // unique_ptr maps; confirm in the .cxx.
192  typedef std::map<PMonMT::StepComp, PMonMT::ComponentData*> data_map_t;
193  typedef std::map<PMonMT::StepComp, std::unique_ptr<PMonMT::ComponentData>> data_map_unique_t;
194  // Here I'd prefer to use SG::SlotSpecificObj<data_map_t>
195  // However, w/ invalid context it seems to segfault
196  // Can investigate in the future, for now std::vector should be OK
198 
199  // m_compLevelDataMap is divided into following maps and these are stored in the m_stdoutVec_serial.
200  // There should be a more clever way!
201  std::vector<data_map_unique_t> m_compLevelDataMapVec; // all
208 
209  std::vector<data_map_t> m_stdoutVec_serial;
210 
211  // Leak estimates
214 
215  // Estimate CPU efficiency
216  int getCpuEfficiency() const;
217 
218 }; // class PerfMonMTSvc
219 
220 #endif // PERFMONCOMPS_PERFMONMTSVC_H
PerfMonMTSvc::m_doComponentLevelMonitoring
Gaudi::Property< bool > m_doComponentLevelMonitoring
Do component level monitoring.
Definition: PerfMonMTSvc.h:123
PerfMonMTSvc::report2JsonFile
void report2JsonFile()
Report to the JSON File.
Definition: PerfMonMTSvc.cxx:643
PerfMonMTSvc::m_isFirstEvent
std::atomic< bool > m_isFirstEvent
Definition: PerfMonMTSvc.h:175
PerfMonMTSvc::stopSnapshotAud
void stopSnapshotAud(const std::string &stepName, const std::string &compName)
Definition: PerfMonMTSvc.cxx:254
LinFitSglPass.h
PerfMonMTSvc::report2Log_Description
void report2Log_Description() const
Definition: PerfMonMTSvc.cxx:439
PerfMonMTSvc::divideData2Steps
void divideData2Steps()
Definition: PerfMonMTSvc.cxx:845
PerfMonMTSvc::getCpuEfficiency
int getCpuEfficiency() const
Definition: PerfMonMTSvc.cxx:374
AddEmptyComponent.compName
compName
Definition: AddEmptyComponent.py:32
PerfMon::LinFitSglPass
Definition: LinFitSglPass.h:22
PerfMonMTSvc::report2Log_ComponentLevel
void report2Log_ComponentLevel()
Definition: PerfMonMTSvc.cxx:456
PerfMonMTSvc::Snapshots
Snapshots
Definition: PerfMonMTSvc.h:166
PerfMonMTSvc::m_compLevelDataMap_fin
data_map_t m_compLevelDataMap_fin
Definition: PerfMonMTSvc.h:205
PerfMonMTSvc::m_numberOfThreads
Gaudi::Property< int > m_numberOfThreads
Get the number of threads.
Definition: PerfMonMTSvc.h:149
PerfMonMTSvc::m_measurementSnapshots
PMonMT::SnapshotMeasurement m_measurementSnapshots
Measurement to capture snapshots.
Definition: PerfMonMTSvc.h:112
PerfMonMTSvc::m_compLevelDataMap
data_map_t m_compLevelDataMap
Definition: PerfMonMTSvc.h:197
PerfMonMTSvc::m_motherPID
int m_motherPID
Snapshots data.
Definition: PerfMonMTSvc.h:163
PerfMonMTSvc::report2Log_CpuInfo
void report2Log_CpuInfo() const
Definition: PerfMonMTSvc.cxx:610
PerfMonMTSvc::report2Log_EnvInfo
void report2Log_EnvInfo() const
Definition: PerfMonMTSvc.cxx:626
PMonMT::EventLevelData
Definition: PerfMonMTUtils.h:191
json
nlohmann::json json
Definition: HistogramDef.cxx:9
PerfMonMTSvc::initialize
virtual StatusCode initialize() override
Standard Gaudi Service initialization.
Definition: PerfMonMTSvc.cxx:68
PerfMonMTSvc::m_eventCounter
std::atomic< uint64_t > m_eventCounter
Definition: PerfMonMTSvc.h:178
PerfMonMTSvc::get_cpu_core_info
int get_cpu_core_info() const
Definition: PerfMonMTSvc.cxx:955
BeamSpot::mutex
std::mutex mutex
Definition: InDetBeamSpotVertex.cxx:18
PerfMonMTSvc::~PerfMonMTSvc
virtual ~PerfMonMTSvc()=default
PerfMonMTSvc::m_checkPointTime
std::atomic< double > m_checkPointTime
Definition: PerfMonMTSvc.h:184
PerfMonMTSvc::m_wallTimeOffset
Gaudi::Property< double > m_wallTimeOffset
Offset for the wall-time, comes from configuration.
Definition: PerfMonMTSvc.h:144
PerfMonMTSvc::report2JsonFile_ComponentLevel
void report2JsonFile_ComponentLevel(nlohmann::json &j) const
Definition: PerfMonMTSvc.cxx:751
PMonMT::SnapshotMeasurement
Definition: PerfMonMTUtils.h:165
PerfMonMTSvc::m_fit_vmem
PerfMon::LinFitSglPass m_fit_vmem
Definition: PerfMonMTSvc.h:212
PerfMonMTSvc::handle
virtual void handle(const Incident &incident) override
Incident service handle for post-finalize.
Definition: PerfMonMTSvc.cxx:126
PMonMT::StepComp
Definition: PerfMonMTUtils.h:58
PerfMonMTSvc::get_cpu_model_info
std::string get_cpu_model_info() const
Definition: PerfMonMTSvc.cxx:950
PerfMonMTSvc::stopCompAud
void stopCompAud(const std::string &stepName, const std::string &compName, const EventContext &ctx)
Definition: PerfMonMTSvc.cxx:316
PerfMonMTSvc::CONFIGURE
@ CONFIGURE
Definition: PerfMonMTSvc.h:166
PerfMonMTSvc::m_measurementEvents
PMonMT::SnapshotMeasurement m_measurementEvents
Measurement to capture events.
Definition: PerfMonMTSvc.h:115
PerfMonMTSvc::m_eventLoopMsgLimit
Gaudi::Property< uint64_t > m_eventLoopMsgLimit
Set the number of messages for the event-level report.
Definition: PerfMonMTSvc.h:153
PerfMonMTSvc::FINALIZE
@ FINALIZE
Definition: PerfMonMTSvc.h:166
PerfMonMTSvc::report2Log_EventLevel
void report2Log_EventLevel()
Definition: PerfMonMTSvc.cxx:520
PerfMonMTSvc::startSnapshotAud
void startSnapshotAud(const std::string &stepName, const std::string &compName)
Snapshot Auditing: Take snapshots at the beginning and at the end of each step.
Definition: PerfMonMTSvc.cxx:236
PerfMonMTSvc::m_isEvtLoopStopped
std::atomic< bool > m_isEvtLoopStopped
Definition: PerfMonMTSvc.h:187
PerfMonMTSvc::report2Log_Summary
void report2Log_Summary()
Definition: PerfMonMTSvc.cxx:559
PerfMonMTSvc::m_compLevelDataMap_ini
data_map_t m_compLevelDataMap_ini
Definition: PerfMonMTSvc.h:202
IPerfMonMTSvc.h
PerfMonMTSvc::m_compLevelDataMap_evt
data_map_t m_compLevelDataMap_evt
Definition: PerfMonMTSvc.h:204
PerfMonMTSvc::m_snapshotData
std::vector< PMonMT::SnapshotData > m_snapshotData
Definition: PerfMonMTSvc.h:164
PerfMonMTSvc::INITIALIZE
@ INITIALIZE
Definition: PerfMonMTSvc.h:166
PerfMonMTSvc::report2Log_EventLevel_instant
void report2Log_EventLevel_instant() const
Definition: PerfMonMTSvc.cxx:503
FortranAlgorithmOptions.fileName
fileName
Definition: FortranAlgorithmOptions.py:13
PerfMonMTSvc::startAud
virtual void startAud(const std::string &stepName, const std::string &compName) override
Start Auditing.
Definition: PerfMonMTSvc.cxx:202
PerfMonMTSvc::m_fit_pss
PerfMon::LinFitSglPass m_fit_pss
Definition: PerfMonMTSvc.h:213
PerfMonMTSvc::report2Log
void report2Log()
Report to log.
Definition: PerfMonMTSvc.cxx:416
PerfMonMTSvc::queryInterface
virtual StatusCode queryInterface(const InterfaceID &riid, void **ppvInterface) override
Function declaring the interface(s) implemented by the service.
Definition: PerfMonMTSvc.cxx:52
PerfMonMTSvc::m_reportResultsToJSON
Gaudi::Property< bool > m_reportResultsToJSON
Report results to JSON.
Definition: PerfMonMTSvc.h:128
EL::StatusCode
::StatusCode StatusCode
StatusCode definition for legacy code.
Definition: PhysicsAnalysis/D3PDTools/EventLoop/EventLoop/StatusCode.h:22
PerfMonMTUtils.h
AthService
Definition: AthService.h:32
PerfMonMTSvc::m_jsonFileName
Gaudi::Property< std::string > m_jsonFileName
Name of the JSON file.
Definition: PerfMonMTSvc.h:130
taskman.fieldName
fieldName
Definition: taskman.py:492
PerfMonMTSvc::finalize
virtual StatusCode finalize() override
Standard Gaudi Service finalization.
Definition: PerfMonMTSvc.cxx:116
PerfMonMTSvc::report2JsonFile_EventLevel
void report2JsonFile_EventLevel(nlohmann::json &j) const
Definition: PerfMonMTSvc.cxx:776
PerfMonMTSvc::m_eventLoopMsgCounter
std::atomic< uint64_t > m_eventLoopMsgCounter
Definition: PerfMonMTSvc.h:181
PerfMonMTSvc
Definition: PerfMonMTSvc.h:41
xAOD::uint64_t
uint64_t
Definition: EventInfo_v1.cxx:123
PerfMonMTSvc::scaleMem
std::string scaleMem(int64_t memMeas) const
Definition: PerfMonMTSvc.cxx:892
PerfMonMTSvc::scaleTime
std::string scaleTime(double timeMeas) const
Definition: PerfMonMTSvc.cxx:868
PerfMonMTSvc::get_memory_info
uint64_t get_memory_info() const
Definition: PerfMonMTSvc.cxx:961
PerfMonMTSvc::NSNAPSHOTS
@ NSNAPSHOTS
Definition: PerfMonMTSvc.h:166
PerfMonMTSvc::stopAud
virtual void stopAud(const std::string &stepName, const std::string &compName) override
Stop Auditing.
Definition: PerfMonMTSvc.cxx:221
PerfMonMTSvc::report2JsonFile_Summary
void report2JsonFile_Summary(nlohmann::json &j) const
Definition: PerfMonMTSvc.cxx:681
PerfMonMTSvc::m_snapshotStepNames
std::vector< std::string > m_snapshotStepNames
Definition: PerfMonMTSvc.h:165
PerfMonMTSvc::m_exclusionSet
const std::set< std::string > m_exclusionSet
Exclude some common components from monitoring. In the future this might be converted to an inclusion s...
Definition: PerfMonMTSvc.h:158
name
std::string name
Definition: Control/AthContainers/Root/debug.cxx:195
PerfMonMTSvc::m_mutex_capture
std::mutex m_mutex_capture
Definition: PerfMonMTSvc.h:172
PerfMonMTSvc::m_numberOfSlots
Gaudi::Property< int > m_numberOfSlots
Get the number of slots.
Definition: PerfMonMTSvc.h:151
PerfMonMTSvc::m_printNComps
Gaudi::Property< int > m_printNComps
Print the top N components.
Definition: PerfMonMTSvc.h:146
PerfMonMTSvc::m_printDetailedTables
Gaudi::Property< bool > m_printDetailedTables
Print detailed tables.
Definition: PerfMonMTSvc.h:133
PerfMonMTSvc::aggregateSlotData
void aggregateSlotData()
A few helper functions.
Definition: PerfMonMTSvc.cxx:812
PerfMonMTSvc::m_stdoutVec_serial
std::vector< data_map_t > m_stdoutVec_serial
Definition: PerfMonMTSvc.h:209
PerfMonMTSvc::m_eventLevelData
PMonMT::EventLevelData m_eventLevelData
Definition: PerfMonMTSvc.h:169
IPerfMonMTSvc
STL includes.
Definition: IPerfMonMTSvc.h:16
PerfMonMTSvc::PerfMonMTSvc
PerfMonMTSvc(const std::string &name, ISvcLocator *pSvcLocator)
Standard Gaudi Service constructor.
Definition: PerfMonMTSvc.cxx:37
PerfMonMTSvc::m_doEventLoopMonitoring
Gaudi::Property< bool > m_doEventLoopMonitoring
Do event loop monitoring.
Definition: PerfMonMTSvc.h:118
PerfMonMTSvc::EXECUTE
@ EXECUTE
Definition: PerfMonMTSvc.h:166
PerfMonMTSvc::m_checkPointThreshold
Gaudi::Property< uint64_t > m_checkPointThreshold
Frequency of event level monitoring.
Definition: PerfMonMTSvc.h:140
PerfMonMTSvc::m_compLevelDataMap_1stevt
data_map_t m_compLevelDataMap_1stevt
Definition: PerfMonMTSvc.h:203
PerfMonMTSvc::m_memFitLowerLimit
Gaudi::Property< uint64_t > m_memFitLowerLimit
Lower limit (in number of events) for the memory fit.
Definition: PerfMonMTSvc.h:136
AthService.h
PerfMonMTSvc::FIRSTEVENT
@ FIRSTEVENT
Definition: PerfMonMTSvc.h:166
checker_macros.h
Define macros for attributes used to control the static checker.
PerfMonMTSvc::get_info_from_file
std::string get_info_from_file(const std::string &fileName, const std::string &fieldName) const
A few helper methods to get system information. These should be carried to PerfMonMTUtils at some poin...
Definition: PerfMonMTSvc.cxx:922
PerfMonMTSvc::m_compLevelDataMapVec
std::vector< data_map_unique_t > m_compLevelDataMapVec
Definition: PerfMonMTSvc.h:201
PerfMonMTSvc::m_compLevelDataMap_cbk
data_map_t m_compLevelDataMap_cbk
Definition: PerfMonMTSvc.h:207
PerfMonMTSvc::data_map_unique_t
std::map< PMonMT::StepComp, std::unique_ptr< PMonMT::ComponentData > > data_map_unique_t
Definition: PerfMonMTSvc.h:193
PerfMonMTSvc::data_map_t
std::map< PMonMT::StepComp, PMonMT::ComponentData * > data_map_t
Definition: PerfMonMTSvc.h:192
PerfMonMTSvc::generate_state
PMonMT::StepComp generate_state(const std::string &stepName, const std::string &compName) const
Definition: PerfMonMTSvc.cxx:802
PerfMonMTSvc::report
void report()
Report the results.
Definition: PerfMonMTSvc.cxx:403
PerfMonMTSvc::startCompAud
void startCompAud(const std::string &stepName, const std::string &compName, const EventContext &ctx)
Component Level Auditing: Take measurements at the beginning and at the end of each component call.
Definition: PerfMonMTSvc.cxx:273
PerfMonMTSvc::m_compLevelDataMap_plp
data_map_t m_compLevelDataMap_plp
Definition: PerfMonMTSvc.h:206