ATLAS Offline Software
GPUKernelSizeOptimizerSvc.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2023 CERN for the benefit of the ATLAS collaboration
3 //
4 // Dear emacs, this is -*- c++ -*-
5 //
6 
7 #ifndef CALORECGPU_GPUKERNELSIZEOPTIMIZERSVC_H
8 #define CALORECGPU_GPUKERNELSIZEOPTIMIZERSVC_H
9 
10 #include <string>
11 #include <vector>
12 #include <unordered_map>
13 #include <cstdint>
14 
16 
19 
20 #include <nlohmann/json.hpp>
21 
29 class GPUKernelSizeOptimizerSvc : public extends <AthService, IGPUKernelSizeOptimizerSvc>, public CaloGPUCUDAInitialization
30 {
31  public:
32 
33  GPUKernelSizeOptimizerSvc(const std::string & name, ISvcLocator * svc);
34 
41  virtual void register_kernels(const std::string & tool_name,
42  const int number,
43  void ** kernels,
44  const int * blocksize_hints,
45  const int * gridsize_hints,
46  const int * max_total_threads,
47  const int offset = 0) override;
48 
51  const int number = 0,
52  const int dynamic_memory = 0) const override;
53 
55  virtual bool can_use_cooperative_groups() const override
56  {
57  return m_coopgroup_support;
58  }
59 
61  virtual bool can_use_dynamic_parallelism() const override
62  {
63  return m_dynpar_support;
64  }
65 
67  virtual bool should_use_minimal_kernel_sizes() const
68  {
69  //Testing shows that, at least on the devices we use,
70  //we only lose performance by dyn-par'ing our way to do this.
71  return false;
72  }
73 
74  virtual StatusCode initialize() override
75  {
77  }
78 
79  virtual StatusCode initialize_CUDA() override;
80 
81  virtual StatusCode finalize() override;
82 
83  private:
84 
85  bool m_dynpar_support = false;
86  bool m_coopgroup_support = false;
87 
88  struct KernelRecord
89  {
92  const int usage_start = 0,
93  const int usage_end = 100,
94  const bool overwrite = false)
95  {
96  for (int u = usage_start; u <= usage_end && u <= 100; ++u)
97  {
99  if (overwrite || cfg.grid_x <= 0)
100  {
101  cfg = config;
102  }
103  }
104  }
105  };
106 
107  std::unordered_map<std::string, std::vector<KernelRecord>> m_kernel_map;
108 
113  int get_GPU_usage() const
114  {
115  return 0;
116  }
117 
120  Gaudi::Property<std::vector<std::string>> m_kernelFiles {this, "KernelSizeInput", {}, "Kernel size input JSON files"};
121 
125  Gaudi::Property<bool> m_outputSizes {this, "OutputSizes", true, "Write out last used kernel sizes"};
126 
129  Gaudi::Property<std::string> m_outputFile {this, "OutputFile", "sizes.json", "Kernel size output file"};
130 
132  {
133  struct KernelInfo
134  {
135  int usage_start = 0;
136  int usage_end = 100;
137  int grid_x = 0, grid_y = 0, grid_z = 0, block_x = 0, block_y = 0, block_z = 0;
138 
140  grid_x, grid_y, grid_z,
142  };
143 
144  std::string device;
145  std::string name;
146  std::vector< std::vector<KernelInfo> > kernels;
147 
148 
149  NLOHMANN_DEFINE_TYPE_INTRUSIVE(KernelsEntry, device, name, kernels)
150  };
151 
152 };
153 
154 #endif
GPUKernelSizeOptimizerSvc::KernelsEntry::name
std::string name
Definition: GPUKernelSizeOptimizerSvc.h:145
GPUKernelSizeOptimizerSvc::register_kernels
virtual void register_kernels(const std::string &tool_name, const int number, void **kernels, const int *blocksize_hints, const int *gridsize_hints, const int *max_total_threads, const int offset=0) override
Register a set of kernels that can be referred back to with a name and a number.
Definition: GPUKernelSizeOptimizerSvc.cxx:17
GPUKernelSizeOptimizerSvc::can_use_cooperative_groups
virtual bool can_use_cooperative_groups() const override
Whether the device + environment in use support cooperative groups.
Definition: GPUKernelSizeOptimizerSvc.h:55
GPUKernelSizeOptimizerSvc::KernelsEntry::KernelInfo::grid_x
int grid_x
Definition: GPUKernelSizeOptimizerSvc.h:137
GPUKernelSizeOptimizerSvc::m_kernelFiles
Gaudi::Property< std::vector< std::string > > m_kernelFiles
List of JSON files from where to read (hopefully optimized) kernel sizes for different GPUs.
Definition: GPUKernelSizeOptimizerSvc.h:120
GPUKernelSizeOptimizerSvc::m_outputSizes
Gaudi::Property< bool > m_outputSizes
If true, writes the (last used) kernel sizes to an output JSON file.
Definition: GPUKernelSizeOptimizerSvc.h:125
GPUKernelSizeOptimizerSvc::KernelsEntry::KernelInfo::usage_start
int usage_start
Definition: GPUKernelSizeOptimizerSvc.h:135
GPUKernelSizeOptimizerSvc::should_use_minimal_kernel_sizes
virtual bool should_use_minimal_kernel_sizes() const
Whether to avoid oversizing kernels and instead (if possible) launch kernels with the exact number of...
Definition: GPUKernelSizeOptimizerSvc.h:67
GPUKernelSizeOptimizerSvc::KernelsEntry
Definition: GPUKernelSizeOptimizerSvc.h:132
GPUKernelSizeOptimizerSvc
Definition: GPUKernelSizeOptimizerSvc.h:30
GPUKernelSizeOptimizerSvc::KernelsEntry::KernelInfo::NLOHMANN_DEFINE_TYPE_INTRUSIVE
NLOHMANN_DEFINE_TYPE_INTRUSIVE(KernelInfo, usage_start, usage_end, grid_x, grid_y, grid_z, block_x, block_y, block_z)
GPUKernelSizeOptimizerSvc::KernelRecord::add_configuration
void add_configuration(const CUDAKernelLaunchConfiguration &config, const int usage_start=0, const int usage_end=100, const bool overwrite=false)
Definition: GPUKernelSizeOptimizerSvc.h:91
Trk::u
@ u
Enums for curvilinear frames.
Definition: ParamDefs.h:83
GPUKernelSizeOptimizerSvc::finalize
virtual StatusCode finalize() override
Definition: GPUKernelSizeOptimizerSvc.cxx:119
config
Definition: PhysicsAnalysis/AnalysisCommon/AssociationUtils/python/config.py:1
GPUKernelSizeOptimizerSvc::KernelsEntry::KernelInfo::grid_z
int grid_z
Definition: GPUKernelSizeOptimizerSvc.h:137
CaloGPUCUDAInitialization::initialize
virtual StatusCode initialize()
Definition: CaloGPUCUDAInitialization.h:44
IGPUKernelSizeOptimizerSvc.h
GPUKernelSizeOptimizerSvc::KernelsEntry::KernelInfo::grid_y
int grid_y
Definition: GPUKernelSizeOptimizerSvc.h:137
EL::StatusCode
::StatusCode StatusCode
StatusCode definition for legacy code.
Definition: PhysicsAnalysis/D3PDTools/EventLoop/EventLoop/StatusCode.h:22
GPUKernelSizeOptimizerSvc::KernelsEntry::KernelInfo::block_z
int block_z
Definition: GPUKernelSizeOptimizerSvc.h:137
GPUKernelSizeOptimizerSvc::m_kernel_map
std::unordered_map< std::string, std::vector< KernelRecord > > m_kernel_map
Definition: GPUKernelSizeOptimizerSvc.h:107
CaloGPUCUDAInitialization
Base class to provide some basic common infrastructure for initializing CUDA only at the right place ...
Definition: CaloGPUCUDAInitialization.h:28
Handler::svc
AthROOTErrorHandlerSvc * svc
Definition: AthROOTErrorHandlerSvc.cxx:10
GPUKernelSizeOptimizerSvc::GPUKernelSizeOptimizerSvc
GPUKernelSizeOptimizerSvc(const std::string &name, ISvcLocator *svc)
Definition: GPUKernelSizeOptimizerSvc.cxx:12
GPUKernelSizeOptimizerSvc::KernelsEntry::device
std::string device
Definition: GPUKernelSizeOptimizerSvc.h:144
python.selection.number
number
Definition: selection.py:20
name
std::string name
Definition: Control/AthContainers/Root/debug.cxx:195
GPUKernelSizeOptimizerSvc::KernelsEntry::KernelInfo::block_x
int block_x
Definition: GPUKernelSizeOptimizerSvc.h:137
GPUKernelSizeOptimizerSvc::KernelRecord::configs
CUDAKernelLaunchConfiguration configs[101]
Definition: GPUKernelSizeOptimizerSvc.h:90
WriteCaloSwCorrections.cfg
cfg
Definition: WriteCaloSwCorrections.py:23
GPUKernelSizeOptimizerSvc::m_outputFile
Gaudi::Property< std::string > m_outputFile
If m_outputSizes is true, the file to which the kernel sizes should be output.
Definition: GPUKernelSizeOptimizerSvc.h:129
GPUKernelSizeOptimizerSvc::get_launch_configuration
virtual CUDAKernelLaunchConfiguration get_launch_configuration(const std::string &name, const int number=0, const int dynamic_memory=0) const override
Retrieve the (hopefully optimal) kernel launch configuration.
Definition: GPUKernelSizeOptimizerSvc.cxx:51
GPUKernelSizeOptimizerSvc::get_GPU_usage
int get_GPU_usage() const
Get the GPU usage, in percentage, rounded to the nearest integer.
Definition: GPUKernelSizeOptimizerSvc.h:113
GPUKernelSizeOptimizerSvc::initialize_CUDA
virtual StatusCode initialize_CUDA() override
Initialization that invokes CUDA functions.
Definition: GPUKernelSizeOptimizerSvc.cxx:66
GPUKernelSizeOptimizerSvc::m_dynpar_support
bool m_dynpar_support
Definition: GPUKernelSizeOptimizerSvc.h:85
config
std::vector< std::string > config
Definition: fbtTestBasics.cxx:72
GPUKernelSizeOptimizerSvc::KernelRecord
Definition: GPUKernelSizeOptimizerSvc.h:89
convertTimingResiduals.offset
offset
Definition: convertTimingResiduals.py:71
GPUKernelSizeOptimizerSvc::KernelsEntry::KernelInfo::usage_end
int usage_end
Definition: GPUKernelSizeOptimizerSvc.h:136
AthService.h
GPUKernelSizeOptimizerSvc::KernelsEntry::KernelInfo
Definition: GPUKernelSizeOptimizerSvc.h:134
GPUKernelSizeOptimizerSvc::can_use_dynamic_parallelism
virtual bool can_use_dynamic_parallelism() const override
Whether the device + environment in use support dynamic parallelism.
Definition: GPUKernelSizeOptimizerSvc.h:61
CUDAKernelLaunchConfiguration
Definition: IGPUKernelSizeOptimizer.h:13
GPUKernelSizeOptimizerSvc::KernelsEntry::kernels
std::vector< std::vector< KernelInfo > > kernels
Definition: GPUKernelSizeOptimizerSvc.h:146
GPUKernelSizeOptimizerSvc::KernelsEntry::KernelInfo::block_y
int block_y
Definition: GPUKernelSizeOptimizerSvc.h:137
GPUKernelSizeOptimizerSvc::m_coopgroup_support
bool m_coopgroup_support
Definition: GPUKernelSizeOptimizerSvc.h:86
CaloGPUCUDAInitialization.h
GPUKernelSizeOptimizerSvc::initialize
virtual StatusCode initialize() override
Definition: GPUKernelSizeOptimizerSvc.h:74