ATLAS Offline Software
Loading...
Searching...
No Matches
GPUKernelSizeOptimizerSvc Class Reference

More...

#include <GPUKernelSizeOptimizerSvc.h>

Inheritance diagram for GPUKernelSizeOptimizerSvc:
Collaboration diagram for GPUKernelSizeOptimizerSvc:

Classes

struct  KernelRecord
struct  KernelsEntry

Public Member Functions

 GPUKernelSizeOptimizerSvc (const std::string &name, ISvcLocator *svc)
virtual void register_kernels (const std::string &tool_name, const int number, void **kernels, const int *blocksize_hints, const int *gridsize_hints, const int *max_total_threads, const int offset=0) override
 Register a set of kernels that can be referred back to with a name and a number.
virtual CUDAKernelLaunchConfiguration get_launch_configuration (const std::string &name, const int number=0, const int dynamic_memory=0) const override
 Retrieve the (hopefully optimal) kernel launch configuration.
virtual bool can_use_cooperative_groups () const override
 Whether the device + environment in use support cooperative groups.
virtual bool can_use_dynamic_parallelism () const override
 Whether the device + environment in use support dynamic parallelism.
virtual bool should_use_minimal_kernel_sizes () const
 Whether to avoid oversizing kernels and instead (if possible) launch kernels with the exact number of threads required.
virtual StatusCode initialize () override
virtual StatusCode initialize_CUDA () override
 Initialization that invokes CUDA functions.
virtual StatusCode finalize () override
void handle (const Incident &incident) override

Protected Member Functions

virtual StatusCode initialize_non_CUDA ()
 Initialization that does not invoke CUDA functions.

Private Member Functions

int get_GPU_usage () const
 Get the GPU usage, in percentage, rounded to the nearest integer.

Private Attributes

bool m_dynpar_support = false
bool m_coopgroup_support = false
std::unordered_map< std::string, std::vector< KernelRecord > > m_kernel_map
Gaudi::Property< std::vector< std::string > > m_kernelFiles {this, "KernelSizeInput", {}, "Kernel size input JSON files"}
 List of JSON files from where to read (hopefully optimized) kernel sizes for different GPUs.
Gaudi::Property< bool > m_outputSizes {this, "OutputSizes", true, "Write out last used kernel sizes"}
 If true, writes the (last used) kernel sizes to an output JSON file.
Gaudi::Property< std::string > m_outputFile {this, "OutputFile", "sizes.json", "Kernel size output file"}
 If m_outputSizes is true, the file to which the kernel sizes should be output.
Gaudi::Property< bool > m_overrideCooperativeGroups {this, "OverrideCooperativeGroups", false, "Disable cooperative group support (to force fallback to alternative iteration method)."}
 If true, forces can_use_cooperative_groups to always return false.

Detailed Description

Constructor & Destructor Documentation

◆ GPUKernelSizeOptimizerSvc()

GPUKernelSizeOptimizerSvc::GPUKernelSizeOptimizerSvc ( const std::string & name,
ISvcLocator * svc )

Definition at line 12 of file GPUKernelSizeOptimizerSvc.cxx.

12 :
13 base_class(name, svc)
14{
15}

Member Function Documentation

◆ can_use_cooperative_groups()

virtual bool GPUKernelSizeOptimizerSvc::can_use_cooperative_groups ( ) const
inlineoverridevirtual

Whether the device + environment in use support cooperative groups.

Definition at line 55 of file GPUKernelSizeOptimizerSvc.h.

56 {
58 }
Gaudi::Property< bool > m_overrideCooperativeGroups
If true, forces can_use_cooperative_groups to always return false.

◆ can_use_dynamic_parallelism()

virtual bool GPUKernelSizeOptimizerSvc::can_use_dynamic_parallelism ( ) const
inlineoverridevirtual

Whether the device + environment in use support dynamic parallelism.

Definition at line 61 of file GPUKernelSizeOptimizerSvc.h.

◆ finalize()

StatusCode GPUKernelSizeOptimizerSvc::finalize ( )
overridevirtual

Definition at line 119 of file GPUKernelSizeOptimizerSvc.cxx.

120{
121 if (m_outputSizes && m_kernel_map.size() > 0)
122 {
123 std::ofstream output(m_outputFile);
124
125 auto delta_configs = [](const CUDAKernelLaunchConfiguration & a, const KernelsEntry::KernelInfo & b) -> bool
126 {
127 return ( a.grid_x != b.grid_x ) ||
128 ( a.grid_y != b.grid_y ) ||
129 ( a.grid_z != b.grid_z ) ||
130 ( a.block_x != b.block_x ) ||
131 ( a.block_y != b.block_y ) ||
132 ( a.block_z != b.block_z );
133 };
134
135 if (output.is_open())
136 {
137 output << "[\n";
138 const std::string device_name = CaloRecGPU::CUDA_Helpers::GPU_name();
139 bool first = true;
140 for (const auto & pair : m_kernel_map)
141 {
142 if (first)
143 {
144 first = false;
145 }
146 else
147 {
148 output << ",\n";
149 }
150
151 KernelsEntry ke;
152 ke.device = device_name;
153 ke.name = pair.first;
154 ke.kernels.resize(pair.second.size());
155
156 for (size_t i = 0; i < ke.kernels.size(); ++i)
157 {
158 const KernelRecord & kr = pair.second[i];
159
161 for (int u = 0; u <= 100; ++u)
162 {
163 const CUDAKernelLaunchConfiguration & cfg = kr.configs[u];
164 if (delta_configs(cfg, ki))
165 {
166 if (ki.grid_x > 0)
167 {
168 ki.usage_end = u - 1;
169 ke.kernels[i].push_back(ki);
170 }
171 ki.usage_start = u;
172 ki.grid_x = cfg.grid_x;
173 ki.grid_y = cfg.grid_y;
174 ki.grid_z = cfg.grid_z;
175 ki.block_x = cfg.block_x;
176 ki.block_y = cfg.block_y;
177 ki.block_z = cfg.block_z;
178 }
179 }
180 if (ki.grid_x > 0)
181 {
182 ki.usage_end = 100;
183 ke.kernels[i].push_back(ki);
184 }
185
186 }
187
188 nlohmann::json j = ke;
189
190 output << j.dump(2);
191
192 }
193 output << "\n]" << std::endl;
194 }
195 else
196 {
197 ATH_MSG_WARNING("Cannot open '" << m_outputFile << "' for kernel size output.");
198 }
199 }
200 return StatusCode::SUCCESS;
201}
#define ATH_MSG_WARNING(x)
static Double_t a
Gaudi::Property< std::string > m_outputFile
If m_outputSizes is true, the file to which the kernel sizes should be output.
Gaudi::Property< bool > m_outputSizes
If true, writes the (last used) kernel sizes to an output JSON file.
std::unordered_map< std::string, std::vector< KernelRecord > > m_kernel_map
bool first
Definition DeMoScan.py:534
@ u
Enums for curvilinear frames.
Definition ParamDefs.h:77
output
Definition merge.py:16

◆ get_GPU_usage()

int GPUKernelSizeOptimizerSvc::get_GPU_usage ( ) const
inlineprivate

Get the GPU usage, in percentage, rounded to the nearest integer.

Warning
Getting GPU usage not yet supported in the current version of the code, it will default to considering the GPU 100% available.

Definition at line 113 of file GPUKernelSizeOptimizerSvc.h.

114 {
115 return 0;
116 }

◆ get_launch_configuration()

CUDAKernelLaunchConfiguration GPUKernelSizeOptimizerSvc::get_launch_configuration ( const std::string & name,
const int number = 0,
const int dynamic_memory = 0 ) const
overridevirtual

Retrieve the (hopefully optimal) kernel launch configuration.

Definition at line 51 of file GPUKernelSizeOptimizerSvc.cxx.

52{
53 auto it = m_kernel_map.find(name);
54 if (it != m_kernel_map.end() && int(it->second.size()) > number)
55 {
56 const int usage = get_GPU_usage();
57 return it->second[number].configs[usage];
58 }
59 else
60 {
61 return {};
62 }
63}
int get_GPU_usage() const
Get the GPU usage, in percentage, rounded to the nearest integer.
StatusCode usage()
std::string number(const double &d, const std::string &s)
Definition utils.cxx:186

◆ handle()

void CaloGPUCUDAInitialization::handle ( const Incident & incident)
inlineoverrideinherited

Definition at line 66 of file CaloGPUCUDAInitialization.h.

67 {
68 const bool is_multiprocess = (Gaudi::Concurrency::ConcurrencyFlags::numProcs() > 0);
69 if (is_multiprocess && incident.type() == AthenaInterprocess::UpdateAfterFork::type())
70 {
71 if (!this->initialize_CUDA().isSuccess())
72 {
73 throw GaudiException("Failed to perform the CUDA initialization!",
74 "CaloGPUCUDAInitialization::handle",
75 StatusCode::FAILURE);
76 }
77 }
78 }
static const std::string & type()
Incident type.
Definition Incidents.h:49
virtual StatusCode initialize_CUDA()
Initialization that invokes CUDA functions.

◆ initialize()

virtual StatusCode GPUKernelSizeOptimizerSvc::initialize ( )
inlineoverridevirtual

Reimplemented from CaloGPUCUDAInitialization.

Definition at line 74 of file GPUKernelSizeOptimizerSvc.h.

◆ initialize_CUDA()

StatusCode GPUKernelSizeOptimizerSvc::initialize_CUDA ( )
overridevirtual

Initialization that invokes CUDA functions.

Reimplemented from CaloGPUCUDAInitialization.

Definition at line 66 of file GPUKernelSizeOptimizerSvc.cxx.

67{
70
71 const std::string device_name = CaloRecGPU::CUDA_Helpers::GPU_name();
72
73 for (const auto & file : m_kernelFiles)
74 {
75 std::ifstream in(file);
76
77 if (!in.is_open())
78 {
79 ATH_MSG_WARNING("Cannot open '" << m_outputFile << "' for kernel size input.");
80 continue;
81 }
82
83 nlohmann::json j;
84 in >> j;
85
86 for (const auto & entry : j)
87 {
88 if (entry.at("device") != device_name)
89 {
90 continue;
91 }
92 const KernelsEntry ke = entry.template get<KernelsEntry>();
93
94 std::vector<KernelRecord> & vect = m_kernel_map[ke.name];
95 vect.resize(ke.kernels.size());
96
97 for (size_t i = 0; i < vect.size(); ++i)
98 {
99 for (const auto & ki : ke.kernels[i])
100 {
101 CUDAKernelLaunchConfiguration config;
102 config.grid_x = ki.grid_x;
103 config.grid_y = ki.grid_y;
104 config.grid_z = ki.grid_z;
105 config.block_x = ki.block_x;
106 config.block_y = ki.block_y;
107 config.block_z = ki.block_z;
108
109 vect[i].add_configuration(config, ki.usage_start, ki.usage_end, true);
110 }
111 }
112 }
113
114 }
115
116 return StatusCode::SUCCESS;
117}
Gaudi::Property< std::vector< std::string > > m_kernelFiles
List of JSON files from where to read (hopefully optimized) kernel sizes for different GPUs.
T * get(TKey *tobj)
get a TObject* from a TKey* (why can't a TObject be a TKey?)
Definition hcg.cxx:130
TFile * file

◆ initialize_non_CUDA()

virtual StatusCode CaloGPUCUDAInitialization::initialize_non_CUDA ( )
inlineprotectedvirtualinherited

Initialization that does not invoke CUDA functions.

Reimplemented in BasicGPUClusterInfoCalculator, CaloGPUHybridClusterProcessor, GPUClusterInfoAndMomentsCalculator, GPUClusterSorter, TopoAutomatonClustering, and TopoAutomatonSplitting.

Definition at line 33 of file CaloGPUCUDAInitialization.h.

34 {
35 return StatusCode::SUCCESS;
36 }

◆ register_kernels()

void GPUKernelSizeOptimizerSvc::register_kernels ( const std::string & tool_name,
const int number,
void ** kernels,
const int * blocksize_hints,
const int * gridsize_hints,
const int * max_total_threads,
const int offset = 0 )
overridevirtual

Register a set of kernels that can be referred back to with a name and a number.

Uses C-style arrays for more immediate CUDA compatibility, assumes the size of kernels, blocksize_hints and gridsize_hints is number and starts the numbering with an optional offset.

Definition at line 17 of file GPUKernelSizeOptimizerSvc.cxx.

24{
25 ATH_MSG_INFO("Registering " << number << " kernels under: " << tool_name);
26
27 std::vector<KernelRecord> & vect = m_kernel_map[tool_name];
28
29 if (int(vect.size()) < number + offset)
30 {
31 vect.resize(number + offset);
32 }
33
34 for (int i = 0; i < number; ++i)
35 {
36 CUDAKernelLaunchConfiguration cfg{1, 1, 1, 1, 1, 1};
38 {
40 }
41 else
42 {
44 }
45 cfg.grid_x = std::min(cfg.grid_x, CaloRecGPU::Helpers::int_ceil_div(max_total_threads[i], cfg.block_x));
46 vect[i + offset].add_configuration(cfg);
47 }
48}
#define ATH_MSG_INFO(x)
void optimize_block_and_grid_size_for_cooperative_launch(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size for a cooperative launch.
void optimize_block_and_grid_size(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size according to cudaOccupancyMaxPotentialBlockSize.
constexpr auto int_ceil_div(const T1 num, const T2 denom)
Returns the ceiling of num/denom, with proper rounding.

◆ should_use_minimal_kernel_sizes()

virtual bool GPUKernelSizeOptimizerSvc::should_use_minimal_kernel_sizes ( ) const
inlinevirtual

Whether to avoid oversizing kernels and instead (if possible) launch kernels with the exact number of threads required.

Definition at line 67 of file GPUKernelSizeOptimizerSvc.h.

68 {
69 //Testing shows that, at least on the devices we use,
70 //we only lose performance by dyn-par'ing our way to do this.
71 return false;
72 }

Member Data Documentation

◆ m_coopgroup_support

bool GPUKernelSizeOptimizerSvc::m_coopgroup_support = false
private

Definition at line 86 of file GPUKernelSizeOptimizerSvc.h.

◆ m_dynpar_support

bool GPUKernelSizeOptimizerSvc::m_dynpar_support = false
private

Definition at line 85 of file GPUKernelSizeOptimizerSvc.h.

◆ m_kernel_map

std::unordered_map<std::string, std::vector<KernelRecord> > GPUKernelSizeOptimizerSvc::m_kernel_map
private

Definition at line 107 of file GPUKernelSizeOptimizerSvc.h.

◆ m_kernelFiles

Gaudi::Property<std::vector<std::string> > GPUKernelSizeOptimizerSvc::m_kernelFiles {this, "KernelSizeInput", {}, "Kernel size input JSON files"}
private

List of JSON files from where to read (hopefully optimized) kernel sizes for different GPUs.

Definition at line 120 of file GPUKernelSizeOptimizerSvc.h.

120{this, "KernelSizeInput", {}, "Kernel size input JSON files"};

◆ m_outputFile

Gaudi::Property<std::string> GPUKernelSizeOptimizerSvc::m_outputFile {this, "OutputFile", "sizes.json", "Kernel size output file"}
private

If m_outputSizes is true, the file to which the kernel sizes should be output.

Definition at line 129 of file GPUKernelSizeOptimizerSvc.h.

129{this, "OutputFile", "sizes.json", "Kernel size output file"};

◆ m_outputSizes

Gaudi::Property<bool> GPUKernelSizeOptimizerSvc::m_outputSizes {this, "OutputSizes", true, "Write out last used kernel sizes"}
private

If true, writes the (last used) kernel sizes to an output JSON file.

Defaults to true.

Definition at line 125 of file GPUKernelSizeOptimizerSvc.h.

125{this, "OutputSizes", true, "Write out last used kernel sizes"};

◆ m_overrideCooperativeGroups

Gaudi::Property<bool> GPUKernelSizeOptimizerSvc::m_overrideCooperativeGroups {this, "OverrideCooperativeGroups", false, "Disable cooperative group support (to force fallback to alternative iteration method)."}
private

If true, forces can_use_cooperative_groups to always return false.

Defaults to false.

GPU algorithms may implement alternative code paths when cooperative groups are not supported. This allows easier testing of the performance impact of such a fallback.

Definition at line 137 of file GPUKernelSizeOptimizerSvc.h.

137{this, "OverrideCooperativeGroups", false, "Disable cooperative group support (to force fallback to alternative iteration method)."};

The documentation for this class was generated from the following files: