21 const int * gridsize_hints,
22 const int * max_total_threads,
27 std::vector<KernelRecord> & vect =
m_kernel_map[tool_name];
29 if (
int(vect.size()) <
number + offset)
31 vect.resize(
number + offset);
34 for (
int i = 0; i <
number; ++i)
46 vect[i + offset].add_configuration(cfg);
75 std::ifstream in(
file);
86 for (
const auto & entry : j)
88 if (entry.at(
"device") != device_name)
97 for (
size_t i = 0; i < vect.size(); ++i)
99 for (
const auto & ki : ke.
kernels[i])
102 config.grid_x = ki.grid_x;
103 config.grid_y = ki.grid_y;
104 config.grid_z = ki.grid_z;
105 config.block_x = ki.block_x;
106 config.block_y = ki.block_y;
107 config.block_z = ki.block_z;
109 vect[i].add_configuration(
config, ki.usage_start, ki.usage_end,
true);
116 return StatusCode::SUCCESS;
127 return (
a.grid_x != b.grid_x ) ||
128 (
a.grid_y != b.grid_y ) ||
129 (
a.grid_z != b.grid_z ) ||
130 (
a.block_x != b.block_x ) ||
131 (
a.block_y != b.block_y ) ||
132 (
a.block_z != b.block_z );
135 if (output.is_open())
156 for (
size_t i = 0; i < ke.
kernels.size(); ++i)
161 for (
int u = 0; u <= 100; ++u)
164 if (delta_configs(cfg, ki))
188 nlohmann::json j = ke;
193 output <<
"\n]" << std::endl;
200 return StatusCode::SUCCESS;
#define ATH_MSG_WARNING(x)
virtual StatusCode initialize_CUDA() override
Initialization that invokes CUDA functions.
GPUKernelSizeOptimizerSvc(const std::string &name, ISvcLocator *svc)
int get_GPU_usage() const
Get the GPU usage, in percentage, rounded to the nearest integer.
virtual StatusCode finalize() override
virtual void register_kernels(const std::string &tool_name, const int number, void **kernels, const int *blocksize_hints, const int *gridsize_hints, const int *max_total_threads, const int offset=0) override
Register a set of kernels that can be referred back to with a name and a number.
Gaudi::Property< std::vector< std::string > > m_kernelFiles
List of JSON files from which to read (hopefully optimized) kernel sizes for different GPUs.
virtual CUDAKernelLaunchConfiguration get_launch_configuration(const std::string &name, const int number=0, const int dynamic_memory=0) const override
Retrieve the (hopefully optimal) kernel launch configuration.
Gaudi::Property< std::string > m_outputFile
If m_outputSizes is true, the file to which the kernel sizes should be output.
Gaudi::Property< bool > m_outputSizes
If true, writes the (last used) kernel sizes to an output JSON file.
std::unordered_map< std::string, std::vector< KernelRecord > > m_kernel_map
T * get(TKey *tobj)
get a TObject* from a TKey* (why can't a TObject be a TKey?)
int usage(std::ostream &s, int, char **argv, int status=-1)
void optimize_block_and_grid_size_for_cooperative_launch(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size for a cooperative launch.
bool supports_cooperative_launches()
bool supports_dynamic_parallelism()
void optimize_block_and_grid_size(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size according to cudaOccupancyMaxPotentialBlockSize.
constexpr auto int_ceil_div(const T1 num, const T2 denom)
Returns the ceiling of num/denom, i.e. integer division rounded up.
CUDAKernelLaunchConfiguration configs[101]
std::vector< std::vector< KernelInfo > > kernels
std::string number(const double &d, const std::string &s)