ATLAS Offline Software — source listing of GPUKernelSizeOptimizerSvc.cxx
(Doxygen-generated page; navigation chrome removed.)
1//
2// Copyright (C) 2002-2023 CERN for the benefit of the ATLAS collaboration
3//
4// Dear emacs, this is -*- c++ -*-
5//
6
9
10#include <fstream>
11
12GPUKernelSizeOptimizerSvc::GPUKernelSizeOptimizerSvc(const std::string & name, ISvcLocator * svc):
13 base_class(name, svc)
14{
15}
16
17void GPUKernelSizeOptimizerSvc::register_kernels(const std::string & tool_name,
18 const int number,
19 void ** kernels,
20 const int * /*blocksize_hints*/,
21 const int * gridsize_hints,
22 const int * max_total_threads,
23 const int offset)
24{
25 ATH_MSG_INFO("Registering " << number << " kernels under: " << tool_name);
26
27 std::vector<KernelRecord> & vect = m_kernel_map[tool_name];
28
29 if (int(vect.size()) < number + offset)
30 {
31 vect.resize(number + offset);
32 }
33
34 for (int i = 0; i < number; ++i)
35 {
36 CUDAKernelLaunchConfiguration cfg{1, 1, 1, 1, 1, 1};
38 {
40 }
41 else
42 {
43 CaloRecGPU::CUDA_Helpers::optimize_block_and_grid_size(kernels[i], cfg.block_x, cfg.grid_x);
44 }
45 cfg.grid_x = std::min(cfg.grid_x, CaloRecGPU::Helpers::int_ceil_div(max_total_threads[i], cfg.block_x));
46 vect[i + offset].add_configuration(cfg);
47 }
48}
49
50
51CUDAKernelLaunchConfiguration GPUKernelSizeOptimizerSvc::get_launch_configuration(const std::string & name, int number, const int /*dynamic_memory*/) const
52{
53 auto it = m_kernel_map.find(name);
54 if (it != m_kernel_map.end() && int(it->second.size()) > number)
55 {
56 const int usage = get_GPU_usage();
57 return it->second[number].configs[usage];
58 }
59 else
60 {
61 return {};
62 }
63}
64
65
67{
70
71 const std::string device_name = CaloRecGPU::CUDA_Helpers::GPU_name();
72
73 for (const auto & file : m_kernelFiles)
74 {
75 std::ifstream in(file);
76
77 if (!in.is_open())
78 {
79 ATH_MSG_WARNING("Cannot open '" << m_outputFile << "' for kernel size input.");
80 continue;
81 }
82
83 nlohmann::json j;
84 in >> j;
85
86 for (const auto & entry : j)
87 {
88 if (entry.at("device") != device_name)
89 {
90 continue;
91 }
92 const KernelsEntry ke = entry.template get<KernelsEntry>();
93
94 std::vector<KernelRecord> & vect = m_kernel_map[ke.name];
95 vect.resize(ke.kernels.size());
96
97 for (size_t i = 0; i < vect.size(); ++i)
98 {
99 for (const auto & ki : ke.kernels[i])
100 {
102 config.grid_x = ki.grid_x;
103 config.grid_y = ki.grid_y;
104 config.grid_z = ki.grid_z;
105 config.block_x = ki.block_x;
106 config.block_y = ki.block_y;
107 config.block_z = ki.block_z;
108
109 vect[i].add_configuration(config, ki.usage_start, ki.usage_end, true);
110 }
111 }
112 }
113
114 }
115
116 return StatusCode::SUCCESS;
117}
118
120{
121 if (m_outputSizes && m_kernel_map.size() > 0)
122 {
123 std::ofstream output(m_outputFile);
124
125 auto delta_configs = [](const CUDAKernelLaunchConfiguration & a, const KernelsEntry::KernelInfo & b) -> bool
126 {
127 return ( a.grid_x != b.grid_x ) ||
128 ( a.grid_y != b.grid_y ) ||
129 ( a.grid_z != b.grid_z ) ||
130 ( a.block_x != b.block_x ) ||
131 ( a.block_y != b.block_y ) ||
132 ( a.block_z != b.block_z );
133 };
134
135 if (output.is_open())
136 {
137 output << "[\n";
138 const std::string device_name = CaloRecGPU::CUDA_Helpers::GPU_name();
139 bool first = true;
140 for (const auto & pair : m_kernel_map)
141 {
142 if (first)
143 {
144 first = false;
145 }
146 else
147 {
148 output << ",\n";
149 }
150
151 KernelsEntry ke;
152 ke.device = device_name;
153 ke.name = pair.first;
154 ke.kernels.resize(pair.second.size());
155
156 for (size_t i = 0; i < ke.kernels.size(); ++i)
157 {
158 const KernelRecord & kr = pair.second[i];
159
161 for (int u = 0; u <= 100; ++u)
162 {
163 const CUDAKernelLaunchConfiguration & cfg = kr.configs[u];
164 if (delta_configs(cfg, ki))
165 {
166 if (ki.grid_x > 0)
167 {
168 ki.usage_end = u - 1;
169 ke.kernels[i].push_back(ki);
170 }
171 ki.usage_start = u;
172 ki.grid_x = cfg.grid_x;
173 ki.grid_y = cfg.grid_y;
174 ki.grid_z = cfg.grid_z;
175 ki.block_x = cfg.block_x;
176 ki.block_y = cfg.block_y;
177 ki.block_z = cfg.block_z;
178 }
179 }
180 if (ki.grid_x > 0)
181 {
182 ki.usage_end = 100;
183 ke.kernels[i].push_back(ki);
184 }
185
186 }
187
188 nlohmann::json j = ke;
189
190 output << j.dump(2);
191
192 }
193 output << "\n]" << std::endl;
194 }
195 else
196 {
197 ATH_MSG_WARNING("Cannot open '" << m_outputFile << "' for kernel size output.");
198 }
199 }
200 return StatusCode::SUCCESS;
201}
#define ATH_MSG_INFO(x)
#define ATH_MSG_WARNING(x)
static Double_t a
virtual StatusCode initialize_CUDA() override
Initialization that invokes CUDA functions.
GPUKernelSizeOptimizerSvc(const std::string &name, ISvcLocator *svc)
int get_GPU_usage() const
Get the GPU usage, in percentage, rounded to the nearest integer.
virtual StatusCode finalize() override
virtual void register_kernels(const std::string &tool_name, const int number, void **kernels, const int *blocksize_hints, const int *gridsize_hints, const int *max_total_threads, const int offset=0) override
Register a set of kernels that can be referred back to with a name and a number.
Gaudi::Property< std::vector< std::string > > m_kernelFiles
List of JSON files from where to read (hopefully optimized) kernel sizes for different GPUs.
virtual CUDAKernelLaunchConfiguration get_launch_configuration(const std::string &name, const int number=0, const int dynamic_memory=0) const override
Retrieve the (hopefully optimal) kernel launch configuration.
Gaudi::Property< std::string > m_outputFile
If m_outputSizes is true, the file to which the kernel sizes should be output.
Gaudi::Property< bool > m_outputSizes
If true, writes the (last used) kernel sizes to an output JSON file.
std::unordered_map< std::string, std::vector< KernelRecord > > m_kernel_map
STL class.
T * get(TKey *tobj)
get a TObject* from a TKey* (why can't a TObject be a TKey?)
Definition hcg.cxx:130
int usage(std::ostream &s, int, char **argv, int status=-1)
Definition hcg.cxx:1035
void optimize_block_and_grid_size_for_cooperative_launch(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size for a cooperative launch.
void optimize_block_and_grid_size(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size according to cudaOccupancyMaxPotentialBlockSize.
constexpr auto int_ceil_div(const T1 num, const T2 denom)
Returns the ceiling of num/denom, with proper rounding.
CUDAKernelLaunchConfiguration configs[101]
std::vector< std::vector< KernelInfo > > kernels
TFile * file
std::string number(const double &d, const std::string &s)
Definition utils.cxx:186