ATLAS Offline Software
GPUKernelSizeOptimizerSvc.h
//
// Copyright (C) 2002-2023 CERN for the benefit of the ATLAS collaboration
//
// Dear emacs, this is -*- c++ -*-
//

#ifndef CALORECGPU_GPUKERNELSIZEOPTIMIZERSVC_H
#define CALORECGPU_GPUKERNELSIZEOPTIMIZERSVC_H

#include <string>
#include <vector>
#include <unordered_map>
#include <cstdint>

#include "AthenaBaseComps/AthService.h"

#include "CaloRecGPU/IGPUKernelSizeOptimizerSvc.h"
#include "CaloGPUCUDAInitialization.h"

#include <nlohmann/json.hpp>

/** @class GPUKernelSizeOptimizerSvc
 *  @brief Service that provides (hopefully optimal) kernel launch configurations
 *         for the GPU kernels that are registered with it.
 */
class GPUKernelSizeOptimizerSvc : public extends <AthService, IGPUKernelSizeOptimizerSvc>, public CaloGPUCUDAInitialization
{
  public:

    GPUKernelSizeOptimizerSvc(const std::string & name, ISvcLocator * svc);

    /** @brief Register a set of kernels that can be referred back to with a name and a number.
     */
    virtual void register_kernels(const std::string & tool_name,
                                  const int number,
                                  void ** kernels,
                                  const int * blocksize_hints,
                                  const int * gridsize_hints,
                                  const int * max_total_threads,
                                  const int offset = 0) override;
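
    // A minimal usage sketch (the tool name, kernel symbols and size hints below
    // are hypothetical, not taken from the actual CaloRecGPU tools): a caller
    // holding a handle to this service could register two kernels under one name:
    //
    //   void * kernels[2] = { (void *) my_first_kernel, (void *) my_second_kernel };
    //   const int block_hints[2] = { 256, 128 };
    //   const int grid_hints[2]  = {  64, 256 };
    //   const int max_threads[2] = { 16384, 32768 };
    //   optimizer->register_kernels("MyHypotheticalTool", 2, kernels,
    //                               block_hints, grid_hints, max_threads);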

    /** @brief Retrieve the (hopefully optimal) kernel launch configuration for a registered kernel.
     */
    virtual CUDAKernelLaunchConfiguration get_launch_configuration(const std::string & name,
                                                                   const int number = 0,
                                                                   const int dynamic_memory = 0) const override;
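
    // Illustrative sketch of retrieving and using a configuration, assuming
    // CUDAKernelLaunchConfiguration exposes the same grid_x .. block_z fields as
    // KernelInfo further down (grid_x is indeed checked in add_configuration);
    // the kernel and its arguments are hypothetical:
    //
    //   const CUDAKernelLaunchConfiguration cfg =
    //     optimizer->get_launch_configuration("MyHypotheticalTool", 0);
    //   const dim3 grid(cfg.grid_x, cfg.grid_y, cfg.grid_z);
    //   const dim3 block(cfg.block_x, cfg.block_y, cfg.block_z);
    //   my_first_kernel<<<grid, block>>>(/* kernel arguments */);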

    /** @brief Whether the device + environment in use support cooperative groups.
     */
    virtual bool can_use_cooperative_groups() const override
    {
      return m_coopgroup_support && !m_overrideCooperativeGroups;
    }

    /** @brief Whether the device + environment in use support dynamic parallelism.
     */
    virtual bool can_use_dynamic_parallelism() const override
    {
      return m_dynpar_support;
    }

    /** @brief Whether to avoid oversizing kernels and instead (if possible) launch kernels with the exact number of threads needed.
     */
    virtual bool should_use_minimal_kernel_sizes() const
    {
      //Testing shows that, at least on the devices we use,
      //we only lose performance by dyn-par'ing our way to do this.
      return false;
    }

    virtual StatusCode initialize() override
    {
      return CaloGPUCUDAInitialization::initialize();
    }

    /** @brief Initialization that invokes CUDA functions.
     */
    virtual StatusCode initialize_CUDA() override;

    virtual StatusCode finalize() override;

  private:

    bool m_dynpar_support = false;
    bool m_coopgroup_support = false;

    /** @brief Holds the launch configuration to use for a kernel, per GPU usage percentage (0 to 100).
     */
    struct KernelRecord
    {
      CUDAKernelLaunchConfiguration configs[101];

      void add_configuration(const CUDAKernelLaunchConfiguration & config,
                             const int usage_start = 0,
                             const int usage_end = 100,
                             const bool overwrite = false)
      {
        for (int u = usage_start; u <= usage_end && u <= 100; ++u)
          {
            CUDAKernelLaunchConfiguration & cfg = configs[u];
            if (overwrite || cfg.grid_x <= 0)
              {
                cfg = config;
              }
          }
      }
    };
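
    // Sketch of how add_configuration populates the per-usage table (the
    // configuration names and ranges are made up for illustration): a broad
    // entry only fills slots that are still empty, while a later, narrower
    // entry with overwrite = true replaces whatever is already there.
    //
    //   KernelRecord record;
    //   record.add_configuration(generic_cfg, 0, 100);         // slots 0..100, only where still empty
    //   record.add_configuration(busy_gpu_cfg, 75, 100, true); // slots 75..100, replacing generic_cfg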

    std::unordered_map<std::string, std::vector<KernelRecord>> m_kernel_map;

    /** @brief Get the GPU usage, in percentage, rounded to the nearest integer.
     */
    int get_GPU_usage() const
    {
      return 0;
    }

    /** @brief List of JSON files from where to read (hopefully optimized) kernel sizes for different GPUs.
     */
    Gaudi::Property<std::vector<std::string>> m_kernelFiles {this, "KernelSizeInput", {}, "Kernel size input JSON files"};

    /** @brief If true, writes the (last used) kernel sizes to an output JSON file.
     */
    Gaudi::Property<bool> m_outputSizes {this, "OutputSizes", true, "Write out last used kernel sizes"};

    /** @brief If m_outputSizes is true, the file to which the kernel sizes should be output.
     */
    Gaudi::Property<std::string> m_outputFile {this, "OutputFile", "sizes.json", "Kernel size output file"};

    /** @brief If true, forces can_use_cooperative_groups to always return false.
     */
    Gaudi::Property<bool> m_overrideCooperativeGroups {this, "OverrideCooperativeGroups", false, "Disable cooperative group support (to force fallback to alternative iteration method)."};

    /** @brief Helper structure mirroring one entry of the kernel size JSON files.
     */
    struct KernelsEntry
    {
      struct KernelInfo
      {
        int usage_start = 0;
        int usage_end = 100;
        int grid_x = 0, grid_y = 0, grid_z = 0, block_x = 0, block_y = 0, block_z = 0;

        NLOHMANN_DEFINE_TYPE_INTRUSIVE(KernelInfo, usage_start, usage_end,
                                       grid_x, grid_y, grid_z,
                                       block_x, block_y, block_z)
      };

      std::string device;
      std::string name;
      std::vector< std::vector<KernelInfo> > kernels;

      NLOHMANN_DEFINE_TYPE_INTRUSIVE(KernelsEntry, device, name, kernels)
    };
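
    // Given the members serialized by NLOHMANN_DEFINE_TYPE_INTRUSIVE above, an
    // entry in the input/output JSON files would plausibly look like the
    // following (the device string and the numbers are made-up placeholders):
    //
    //   {
    //     "device": "<GPU device name>",
    //     "name": "MyHypotheticalTool",
    //     "kernels": [ [ { "usage_start": 0, "usage_end": 100,
    //                      "grid_x": 64, "grid_y": 1, "grid_z": 1,
    //                      "block_x": 256, "block_y": 1, "block_z": 1 } ] ]
    //   }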

};

#endif