#include <GPUKernelSizeOptimizerSvc.h>
| | GPUKernelSizeOptimizerSvc (const std::string &name, ISvcLocator *svc) |
| virtual void | register_kernels (const std::string &tool_name, const int number, void **kernels, const int *blocksize_hints, const int *gridsize_hints, const int *max_total_threads, const int offset=0) override |
| | Register a set of kernels that can be referred back to with a name and a number. |
| virtual CUDAKernelLaunchConfiguration | get_launch_configuration (const std::string &name, const int number=0, const int dynamic_memory=0) const override |
| | Retrieve the (hopefully optimal) kernel launch configuration. |
| virtual bool | can_use_cooperative_groups () const override |
| | Whether the device + environment in use support cooperative groups. |
| virtual bool | can_use_dynamic_parallelism () const override |
| | Whether the device + environment in use support dynamic parallelism. |
| virtual bool | should_use_minimal_kernel_sizes () const |
| | Whether to avoid oversizing kernels and instead (if possible) launch kernels with the exact number of threads... |
| virtual StatusCode | initialize () override |
| virtual StatusCode | initialize_CUDA () override |
| | Initialization that invokes CUDA functions. |
| virtual StatusCode | finalize () override |
| void | handle (const Incident &incident) override |
| int | get_GPU_usage () const |
| | Get the GPU usage, in percentage, rounded to the nearest integer. |

| bool | m_dynpar_support = false |
| bool | m_coopgroup_support = false |
| std::unordered_map< std::string, std::vector< KernelRecord > > | m_kernel_map |
| Gaudi::Property< std::vector< std::string > > | m_kernelFiles {this, "KernelSizeInput", {}, "Kernel size input JSON files"} |
| | List of JSON files from where to read (hopefully optimized) kernel sizes for different GPUs. |
| Gaudi::Property< bool > | m_outputSizes {this, "OutputSizes", true, "Write out last used kernel sizes"} |
| | If true, writes the (last used) kernel sizes to an output JSON file. |
| Gaudi::Property< std::string > | m_outputFile {this, "OutputFile", "sizes.json", "Kernel size output file"} |
| | If m_outputSizes is true, the file to which the kernel sizes should be output. |
| Gaudi::Property< bool > | m_overrideCooperativeGroups {this, "OverrideCooperativeGroups", false, "Disable cooperative group support (to force fallback to alternative iteration method)."} |
| | If true, forces can_use_cooperative_groups to always return false. |
◆ GPUKernelSizeOptimizerSvc()
GPUKernelSizeOptimizerSvc::GPUKernelSizeOptimizerSvc (const std::string & name, ISvcLocator * svc)
◆ can_use_cooperative_groups()
virtual bool GPUKernelSizeOptimizerSvc::can_use_cooperative_groups () const
inline override virtual

Whether the device + environment in use support cooperative groups.

Definition at line 55 of file GPUKernelSizeOptimizerSvc.h.
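The inline body is not preserved above. A minimal sketch of what it plausibly does, assuming it simply combines the support flag detected at initialization with the override property (both documented among the member data below), would be:

    virtual bool can_use_cooperative_groups () const override
    {
      // Hedged sketch, not the verbatim implementation: cooperative groups are
      // usable only if the device supports them and the override is disabled.
      return m_coopgroup_support && !m_overrideCooperativeGroups;
    }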
◆ can_use_dynamic_parallelism()
virtual bool GPUKernelSizeOptimizerSvc::can_use_dynamic_parallelism () const
inline override virtual

Whether the device + environment in use support dynamic parallelism.
◆ finalize()
StatusCode GPUKernelSizeOptimizerSvc::finalize ()
override virtual

Definition at line 119 of file GPUKernelSizeOptimizerSvc.cxx.
{
    // (only when the m_outputSizes property is set)
    {
        // delta_configs: lambda that checks whether two launch configurations differ
        // (its declaration is not preserved here; "a" and "b" are the two configurations being compared)
        {
            return ( a.grid_x  != b.grid_x  ) ||
                   ( a.grid_y  != b.grid_y  ) ||
                   ( a.grid_z  != b.grid_z  ) ||
                   ( a.block_x != b.block_x ) ||
                   ( a.block_y != b.block_y ) ||
                   ( a.block_z != b.block_z );
        };

        // (open the output stream "output" on m_outputFile; if it opened successfully:)
        {
            // (loop over the entries of m_kernel_map; "pair" is the current entry and
            //  "first" flags the first entry so the JSON array is opened and separated correctly)
            {
                if (first)
                {
                    // (write the opening of the JSON array)
                }
                else
                {
                    // (write a separator between entries)
                }

                // Build a KernelsEntry "ke" for this tool (its declaration is elided).
                ke.name = pair.first;
                ke.kernels.resize(pair.second.size());

                for (size_t i = 0; i < ke.kernels.size(); ++i)
                {
                    // (the KernelRecord "kr" for this kernel and the accumulating
                    //  KernelInfo "ki" are set up here; their declarations are elided)

                    // Scan the 0-100% usage range and emit one KernelInfo per distinct configuration.
                    for (int u = 0; u <= 100; ++u)
                    {
                        const CUDAKernelLaunchConfiguration & cfg = kr.configs[u];
                        if (delta_configs(cfg, ki))
                        {
                            if (ki.grid_x > 0)
                            {
                                ki.usage_end = u - 1;
                                ke.kernels[i].push_back(ki);
                            }
                            ki.grid_x  = cfg.grid_x;
                            ki.grid_y  = cfg.grid_y;
                            ki.grid_z  = cfg.grid_z;
                            ki.block_x = cfg.block_x;
                            ki.block_y = cfg.block_y;
                            ki.block_z = cfg.block_z;
                        }
                    }
                    if (ki.grid_x > 0)
                    {
                        ki.usage_end = 100;
                        ke.kernels[i].push_back(ki);
                    }
                }

                nlohmann::json j = ke;
                // (write the serialized entry to the output stream)
            }
            output << "\n]" << std::endl;
        }
        else
        {
            // (ATH_MSG_WARNING that the output file could not be opened)
        }
    }
    return StatusCode::SUCCESS;
}
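For orientation, here is an illustrative sketch of what a single entry of the resulting JSON array could look like. Only the "device" key is confirmed by the code (initialize_CUDA checks it); the remaining keys merely mirror the KernelsEntry and KernelInfo members used in the loop above, and all values are made up. Presumably the same shape is expected of the input files listed under KernelSizeInput, which initialize_CUDA parses with matching fields.

    [
      {
        "device": "<GPU device name>",
        "name": "ExampleTool",
        "kernels": [
          [
            { "grid_x": 64, "grid_y": 1, "grid_z": 1,
              "block_x": 256, "block_y": 1, "block_z": 1,
              "usage_start": 0, "usage_end": 100 }
          ]
        ]
      }
    ]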
◆ get_GPU_usage()
int GPUKernelSizeOptimizerSvc::get_GPU_usage () const
inline private

Get the GPU usage, in percentage, rounded to the nearest integer.

Warning
    Getting the GPU usage is not yet supported in the current version of the code; it will default to considering the GPU 100% available (hence the fixed return value of 0% usage below).

Definition at line 113 of file GPUKernelSizeOptimizerSvc.h.

{
    return 0;
}
◆ get_launch_configuration()
CUDAKernelLaunchConfiguration GPUKernelSizeOptimizerSvc::get_launch_configuration (const std::string & name, const int number = 0, const int dynamic_memory = 0) const
override virtual

Retrieve the (hopefully optimal) kernel launch configuration.

Definition at line 51 of file GPUKernelSizeOptimizerSvc.cxx.

{
    // (if the requested name/number pair is registered in m_kernel_map:)
    {
        // (return the configuration recorded for the current GPU usage,
        //  as reported by get_GPU_usage(); this branch is not preserved here)
    }
    else
    {
        return {};
    }
}
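A hedged usage sketch on the caller side (the service handle m_optimizer, the tool name "ExampleTool" and example_kernel are illustrative; only get_launch_configuration and the configuration fields come from this class):

    // Hedged sketch: ask the service for the stored configuration and launch with it.
    const CUDAKernelLaunchConfiguration cfg =
          m_optimizer->get_launch_configuration("ExampleTool", 0 /* kernel number */);

    // An empty (default-constructed) configuration is returned when the kernel is
    // unknown, so a caller may want to check the sizes before launching.
    const dim3 grid (cfg.grid_x,  cfg.grid_y,  cfg.grid_z );
    const dim3 block(cfg.block_x, cfg.block_y, cfg.block_z);

    example_kernel<<<grid, block>>>( /* kernel arguments */ );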
◆ handle()
void CaloGPUCUDAInitialization::handle (const Incident & incident)
inline override inherited

Definition at line 66 of file CaloGPUCUDAInitialization.h.

{
    const bool is_multiprocess = (Gaudi::Concurrency::ConcurrencyFlags::numProcs() > 0);
    // (when the incident type and concurrency mode call for it:)
    {
        // (if initialize_CUDA() fails:)
        {
            throw GaudiException("Failed to perform the CUDA initialization!",
                                 "CaloGPUCUDAInitialization::handle",
                                 StatusCode::FAILURE);
        }
    }
}
◆ initialize()
virtual StatusCode GPUKernelSizeOptimizerSvc::initialize ()
inline override virtual
◆ initialize_CUDA()
StatusCode GPUKernelSizeOptimizerSvc::initialize_CUDA ()
override virtual

Initialization that invokes CUDA functions.

Reimplemented from CaloGPUCUDAInitialization.

Definition at line 66 of file GPUKernelSizeOptimizerSvc.cxx.
{
    // (query the current device name and detect support via supports_cooperative_launches()
    //  and supports_dynamic_parallelism(); these assignments are not preserved here)

    // (loop over the JSON files listed in m_kernelFiles; "file" is the current file name)
    {
        std::ifstream in(file);

        if (!in.is_open())
        {
            // (warn and skip this file)
            continue;
        }

        nlohmann::json j;
        in >> j;

        for (const auto & entry : j)
        {
            if (entry.at("device") != device_name)
            {
                continue;
            }
            // (deserialize the entry into a KernelsEntry "ke")

            std::vector<KernelRecord> & vect = m_kernel_map[ke.name];
            vect.resize(ke.kernels.size());

            for (size_t i = 0; i < vect.size(); ++i)
            {
                for (const auto & ki : ke.kernels[i])
                {
                    CUDAKernelLaunchConfiguration config;
                    config.grid_x  = ki.grid_x;
                    config.grid_y  = ki.grid_y;
                    config.grid_z  = ki.grid_z;
                    config.block_x = ki.block_x;
                    config.block_y = ki.block_y;
                    config.block_z = ki.block_z;

                    vect[i].add_configuration(config, ki.usage_start, ki.usage_end, true);
                }
            }
        }
    }

    return StatusCode::SUCCESS;
}
◆ initialize_non_CUDA()
virtual StatusCode CaloGPUCUDAInitialization::initialize_non_CUDA ()
inline protected virtual inherited
◆ register_kernels()
void GPUKernelSizeOptimizerSvc::register_kernels (const std::string & tool_name, const int number, void ** kernels, const int * blocksize_hints, const int * gridsize_hints, const int * max_total_threads, const int offset = 0)
override virtual

Register a set of kernels that can be referred back to with a name and a number.

Uses C-style arrays for more immediate CUDA compatibility; assumes the size of kernels, blocksize_hints and gridsize_hints is number, and starts the numbering at an optional offset.

Definition at line 17 of file GPUKernelSizeOptimizerSvc.cxx.
{
    std::vector<KernelRecord> & vect = m_kernel_map[tool_name];

    if (int(vect.size()) < number + offset)
    {
        vect.resize(number + offset);
    }

    // (loop over the "number" kernels being registered; "i" is the current index)
    {
        CUDAKernelLaunchConfiguration cfg{1, 1, 1, 1, 1, 1};
        // (the condition selecting between the two optimization helpers is not preserved here)
        {
            // (optimize_block_and_grid_size_for_cooperative_launch(...) fills cfg)
        }
        else
        {
            // (optimize_block_and_grid_size(...) fills cfg)
        }
        vect[i + offset].add_configuration(cfg);
    }
}
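A hedged usage sketch of the registration side (the service handle m_optimizer, the tool name, the kernel symbols and the hint values are illustrative; only the register_kernels signature itself comes from this class):

    // Hedged sketch: a GPU tool registering two kernels under its own name.
    void * kernels[2]           = {(void *) exampleKernelA, (void *) exampleKernelB};
    int    blocksize_hints[2]   = {256, 128};
    int    gridsize_hints[2]    = { 64,  32};
    int    max_total_threads[2] = {16384, 4096};

    m_optimizer->register_kernels("ExampleTool", 2, kernels,
                                  blocksize_hints, gridsize_hints,
                                  max_total_threads);

The kernels can then be retrieved as "ExampleTool" numbers 0 and 1; judging by the resize and indexing logic above, a later call could pass a non-zero offset to append further kernels for the same tool without overwriting these indices.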
◆ should_use_minimal_kernel_sizes()
virtual bool GPUKernelSizeOptimizerSvc::should_use_minimal_kernel_sizes () const
inline virtual

Whether to avoid oversizing kernels and instead (if possible) launch kernels with the exact number of threads...

Definition at line 67 of file GPUKernelSizeOptimizerSvc.h.

{
    return false;
}
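A hedged sketch of how a caller might act on this hint when sizing a launch (threads_needed, cfg and optimizer are illustrative; int_ceil_div stands for an integer ceiling-division helper):

    // Hedged sketch: choose between an exactly-sized grid and the stored one.
    const int block = cfg.block_x;
    const int grid  = optimizer->should_use_minimal_kernel_sizes()
                          ? int_ceil_div(threads_needed, block)  // just enough blocks for the work
                          : cfg.grid_x;                          // pre-optimized (possibly larger) grid

Since the current implementation always returns false, callers following this pattern keep using the pre-optimized grid sizes.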
◆ m_coopgroup_support
bool GPUKernelSizeOptimizerSvc::m_coopgroup_support = false
private
◆ m_dynpar_support
bool GPUKernelSizeOptimizerSvc::m_dynpar_support = false
private
◆ m_kernel_map
std::unordered_map<std::string, std::vector<KernelRecord> > GPUKernelSizeOptimizerSvc::m_kernel_map
private
◆ m_kernelFiles
Gaudi::Property<std::vector<std::string> > GPUKernelSizeOptimizerSvc::m_kernelFiles {this, "KernelSizeInput", {}, "Kernel size input JSON files"}
private

List of JSON files from where to read (hopefully optimized) kernel sizes for different GPUs.

Definition at line 120 of file GPUKernelSizeOptimizerSvc.h.
◆ m_outputFile
Gaudi::Property<std::string> GPUKernelSizeOptimizerSvc::m_outputFile {this, "OutputFile", "sizes.json", "Kernel size output file"}
private

If m_outputSizes is true, the file to which the kernel sizes should be output.

Definition at line 129 of file GPUKernelSizeOptimizerSvc.h.
◆ m_outputSizes
Gaudi::Property<bool> GPUKernelSizeOptimizerSvc::m_outputSizes {this, "OutputSizes", true, "Write out last used kernel sizes"}
private

If true, writes the (last used) kernel sizes to an output JSON file.

Defaults to true.

Definition at line 125 of file GPUKernelSizeOptimizerSvc.h.
◆ m_overrideCooperativeGroups
Gaudi::Property<bool> GPUKernelSizeOptimizerSvc::m_overrideCooperativeGroups {this, "OverrideCooperativeGroups", false, "Disable cooperative group support (to force fallback to alternative iteration method)."}
private

If true, forces can_use_cooperative_groups to always return false.

Defaults to false.

GPU algorithms may implement alternative code paths when cooperative groups are not supported. This allows easier testing of the performance impact of such a fallback.

Definition at line 137 of file GPUKernelSizeOptimizerSvc.h.
The documentation for this class was generated from the following files:
- GPUKernelSizeOptimizerSvc.h
- GPUKernelSizeOptimizerSvc.cxx