ATLAS Offline Software
GPUKernelSizeOptimizerSvc.cxx
//
// Copyright (C) 2002-2023 CERN for the benefit of the ATLAS collaboration
//
// Dear emacs, this is -*- c++ -*-
//

#include "GPUKernelSizeOptimizerSvc.h"
#include "CaloRecGPU/Helpers.h"

#include <fstream>

GPUKernelSizeOptimizerSvc::GPUKernelSizeOptimizerSvc(const std::string & name, ISvcLocator * svc):
  base_class(name, svc)
{
}

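// Register the kernels of a tool under `tool_name` and derive an occupancy-based
// launch configuration for each of them (via the CUDA occupancy helpers), capping
// the grid size so that no more than `max_total_threads` threads are launched.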
void GPUKernelSizeOptimizerSvc::register_kernels(const std::string & tool_name,
                                                 const int number,
                                                 void ** kernels,
                                                 const int * /*blocksize_hints*/,
                                                 const int * gridsize_hints,
                                                 const int * max_total_threads,
                                                 const int offset)
{
  ATH_MSG_INFO("Registering " << number << " kernels under: " << tool_name);

  std::vector<KernelRecord> & vect = m_kernel_map[tool_name];

  if (int(vect.size()) < number + offset)
    {
      vect.resize(number + offset);
    }

  for (int i = 0; i < number; ++i)
    {
      CUDAKernelLaunchConfiguration cfg{1, 1, 1, 1, 1, 1};
      if (gridsize_hints[i] == IGPUKernelSizeOptimizer::SpecialSizeHints::CooperativeLaunch)
        {
          CaloRecGPU::CUDA_Helpers::optimize_block_and_grid_size_for_cooperative_launch(kernels[i], cfg.block_x, cfg.grid_x);
        }
      else
        {
          CaloRecGPU::CUDA_Helpers::optimize_block_and_grid_size(kernels[i], cfg.block_x, cfg.grid_x);
        }
      cfg.grid_x = std::min(cfg.grid_x, CaloRecGPU::Helpers::int_ceil_div(max_total_threads[i], cfg.block_x));
      vect[i + offset].add_configuration(cfg);
    }
}

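// Retrieve the launch configuration stored for kernel `number` of tool `name`,
// selecting among the per-usage entries according to the current GPU usage estimate.
// A default-constructed configuration is returned if the kernel is unknown.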
CUDAKernelLaunchConfiguration GPUKernelSizeOptimizerSvc::get_launch_configuration(const std::string & name, int number, const int /*dynamic_memory*/) const
{
  auto it = m_kernel_map.find(name);
  if (it != m_kernel_map.end() && int(it->second.size()) > number)
    {
      const int usage = get_GPU_usage();
      return it->second[number].configs[usage];
    }
  else
    {
      return {};
    }
}

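// Record the device capabilities and pre-load kernel launch configurations from the
// configured JSON files, keeping only the entries that match the current GPU.
//
// Illustrative sketch of the expected file layout (an assumption based on the fields
// of KernelsEntry and KernelsEntry::KernelInfo; the actual keys are fixed by the JSON
// (de)serialization declared in GPUKernelSizeOptimizerSvc.h, with usage_start and
// usage_end being the GPU usage percentages over which a configuration applies):
//
// [
//   {
//     "device": "<GPU name as given by CaloRecGPU::CUDA_Helpers::GPU_name()>",
//     "name": "<tool name used in register_kernels>",
//     "kernels": [ [ { "grid_x": 64, "grid_y": 1, "grid_z": 1,
//                      "block_x": 256, "block_y": 1, "block_z": 1,
//                      "usage_start": 0, "usage_end": 100 } ] ]
//   }
// ]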
StatusCode GPUKernelSizeOptimizerSvc::initialize_CUDA()
{
  m_dynpar_support = CaloRecGPU::CUDA_Helpers::supports_dynamic_parallelism();
  m_coopgroup_support = CaloRecGPU::CUDA_Helpers::supports_cooperative_launches();

  const std::string device_name = CaloRecGPU::CUDA_Helpers::GPU_name();

  for (const auto & file : m_kernelFiles)
    {
      std::ifstream in(file);

      if (!in.is_open())
        {
          ATH_MSG_WARNING("Cannot open '" << file << "' for kernel size input.");
          continue;
        }

      nlohmann::json j;
      in >> j;

      for (const auto & entry : j)
        {
          if (entry.at("device") != device_name)
            {
              continue;
            }
          const KernelsEntry ke = entry.template get<KernelsEntry>();

          std::vector<KernelRecord> & vect = m_kernel_map[ke.name];
          vect.resize(ke.kernels.size());

          for (size_t i = 0; i < vect.size(); ++i)
            {
              for (const auto & ki : ke.kernels[i])
                {
                  CUDAKernelLaunchConfiguration config;
                  config.grid_x = ki.grid_x;
                  config.grid_y = ki.grid_y;
                  config.grid_z = ki.grid_z;
                  config.block_x = ki.block_x;
                  config.block_y = ki.block_y;
                  config.block_z = ki.block_z;

                  vect[i].add_configuration(config, ki.usage_start, ki.usage_end, true);
                }
            }
        }

    }

  return StatusCode::SUCCESS;
}

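// If m_outputSizes is set, dump the currently held launch configurations to the
// output JSON file, merging consecutive GPU usage values that share the same
// configuration into single [usage_start, usage_end] ranges.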
StatusCode GPUKernelSizeOptimizerSvc::finalize()
{
  if (m_outputSizes && m_kernel_map.size() > 0)
    {
      std::ofstream output(m_outputFile);

      auto delta_configs = [](const CUDAKernelLaunchConfiguration & a, const KernelsEntry::KernelInfo & b) -> bool
      {
        return ( a.grid_x != b.grid_x ) ||
               ( a.grid_y != b.grid_y ) ||
               ( a.grid_z != b.grid_z ) ||
               ( a.block_x != b.block_x ) ||
               ( a.block_y != b.block_y ) ||
               ( a.block_z != b.block_z );
      };

      if (output.is_open())
        {
          output << "[\n";
          const std::string device_name = CaloRecGPU::CUDA_Helpers::GPU_name();
          bool first = true;
          for (const auto & pair : m_kernel_map)
            {
              if (first)
                {
                  first = false;
                }
              else
                {
                  output << ",\n";
                }

              KernelsEntry ke;
              ke.device = device_name;
              ke.name = pair.first;
              ke.kernels.resize(pair.second.size());

              for (size_t i = 0; i < ke.kernels.size(); ++i)
                {
                  const KernelRecord & kr = pair.second[i];

                  KernelsEntry::KernelInfo ki;
                  for (int u = 0; u <= 100; ++u)
                    {
                      const CUDAKernelLaunchConfiguration & cfg = kr.configs[u];
                      if (delta_configs(cfg, ki))
                        {
                          if (ki.grid_x > 0)
                            {
                              ki.usage_end = u - 1;
                              ke.kernels[i].push_back(ki);
                            }
                          ki.usage_start = u;
                          ki.grid_x = cfg.grid_x;
                          ki.grid_y = cfg.grid_y;
                          ki.grid_z = cfg.grid_z;
                          ki.block_x = cfg.block_x;
                          ki.block_y = cfg.block_y;
                          ki.block_z = cfg.block_z;
                        }
                    }
                  if (ki.grid_x > 0)
                    {
                      ki.usage_end = 100;
                      ke.kernels[i].push_back(ki);
                    }

                }

              nlohmann::json j = ke;

              output << j.dump(2);

            }
          output << "\n]" << std::endl;
        }
      else
        {
          ATH_MSG_WARNING("Cannot open '" << m_outputFile << "' for kernel size output.");
        }
    }
  return StatusCode::SUCCESS;
}
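For reference, the following is a minimal sketch of how a client could use this service through the IGPUKernelSizeOptimizer interface, assuming the interface exposes register_kernels and get_launch_configuration as the override declarations above suggest. The tool name "ExampleTool", the hint values, the example_register_and_query helper, and the include path of the interface header are purely illustrative assumptions, not part of this file.

#include "CaloRecGPU/IGPUKernelSizeOptimizer.h"   // assumed header location for the interface

// Hypothetical helper showing the calling sequence; in real tools 'optimizer' would
// come from a service handle and 'kernel' from a __global__ function pointer cast to void *.
CUDAKernelLaunchConfiguration example_register_and_query(IGPUKernelSizeOptimizer & optimizer,
                                                         void * kernel)
{
  void * kernels[]              = { kernel };
  const int blocksize_hints[]   = { 256 };       // illustrative hints
  const int gridsize_hints[]    = { 64 };
  const int max_total_threads[] = { 1000000 };   // cap on the total number of threads

  optimizer.register_kernels("ExampleTool", 1, kernels,
                             blocksize_hints, gridsize_hints, max_total_threads);

  // Returns the configuration matching the current GPU usage (empty if unknown).
  return optimizer.get_launch_configuration("ExampleTool", 0);
}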