ATLAS Offline Software
Classes | Public Member Functions | Protected Member Functions | Private Member Functions | Private Attributes | List of all members
GPUKernelSizeOptimizerSvc Class Reference

More...

#include <GPUKernelSizeOptimizerSvc.h>

Inheritance diagram for GPUKernelSizeOptimizerSvc:
Collaboration diagram for GPUKernelSizeOptimizerSvc:

Classes

struct  KernelRecord
 
struct  KernelsEntry
 

Public Member Functions

 GPUKernelSizeOptimizerSvc (const std::string &name, ISvcLocator *svc)
 
virtual void register_kernels (const std::string &tool_name, const int number, void **kernels, const int *blocksize_hints, const int *gridsize_hints, const int *max_total_threads, const int offset=0) override
 Register a set of kernels that can be referred back to with a name and a number. More...
 
virtual CUDAKernelLaunchConfiguration get_launch_configuration (const std::string &name, const int number=0, const int dynamic_memory=0) const override
 Retrieve the (hopefully optimal) kernel launch configuration. More...
 
virtual bool can_use_cooperative_groups () const override
 Whether the device + environment in use support cooperative groups. More...
 
virtual bool can_use_dynamic_parallelism () const override
 Whether the device + environment in use support dynamic parallelism. More...
 
virtual bool should_use_minimal_kernel_sizes () const
 Whether to avoid oversizing kernels and instead (if possible) launch kernels with the exact number of threads... More...
 
virtual StatusCode initialize () override
 
virtual StatusCode initialize_CUDA () override
 Initialization that invokes CUDA functions. More...
 
virtual StatusCode finalize () override
 
void handle (const Incident &incident) override
 

Protected Member Functions

virtual StatusCode initialize_non_CUDA ()
 Initialization that does not invoke CUDA functions. More...
 

Private Member Functions

int get_GPU_usage () const
 Get the GPU usage, in percentage, rounded to the nearest integer. More...
 

Private Attributes

bool m_dynpar_support = false
 
bool m_coopgroup_support = false
 
std::unordered_map< std::string, std::vector< KernelRecord > > m_kernel_map
 
Gaudi::Property< std::vector< std::string > > m_kernelFiles {this, "KernelSizeInput", {}, "Kernel size input JSON files"}
 List of JSON files from where to read (hopefully optimized) kernel sizes for different GPUs. More...
 
Gaudi::Property< bool > m_outputSizes {this, "OutputSizes", true, "Write out last used kernel sizes"}
 If true, writes the (last used) kernel sizes to an output JSON file. More...
 
Gaudi::Property< std::string > m_outputFile {this, "OutputFile", "sizes.json", "Kernel size output file"}
 If m_outputSizes is true, the file to which the kernel sizes should be output. More...
 

Detailed Description

Author
Nuno Fernandes <nuno.dos.santos.fernandes@cern.ch>
Date
06 August 2023

Definition at line 29 of file GPUKernelSizeOptimizerSvc.h.

Constructor & Destructor Documentation

◆ GPUKernelSizeOptimizerSvc()

GPUKernelSizeOptimizerSvc::GPUKernelSizeOptimizerSvc ( const std::string &  name,
ISvcLocator *  svc 
)

Definition at line 12 of file GPUKernelSizeOptimizerSvc.cxx.

12  :
13  base_class(name, svc)
14 {
15 }

Member Function Documentation

◆ can_use_cooperative_groups()

virtual bool GPUKernelSizeOptimizerSvc::can_use_cooperative_groups ( ) const
[inline], [override], [virtual]

Whether the device + environment in use support cooperative groups.

Definition at line 55 of file GPUKernelSizeOptimizerSvc.h.

56  {
57  return m_coopgroup_support;
58  }

◆ can_use_dynamic_parallelism()

virtual bool GPUKernelSizeOptimizerSvc::can_use_dynamic_parallelism ( ) const
[inline], [override], [virtual]

Whether the device + environment in use support dynamic parallelism.

Definition at line 61 of file GPUKernelSizeOptimizerSvc.h.

62  {
63  return m_dynpar_support;
64  }

◆ finalize()

StatusCode GPUKernelSizeOptimizerSvc::finalize ( )
overridevirtual

Definition at line 119 of file GPUKernelSizeOptimizerSvc.cxx.

120 {
121  if (m_outputSizes && m_kernel_map.size() > 0)
122  {
123  std::ofstream output(m_outputFile);
124 
125  auto delta_configs = [](const CUDAKernelLaunchConfiguration & a, const KernelsEntry::KernelInfo & b) -> bool
126  {
127  return ( a.grid_x != b.grid_x ) ||
128  ( a.grid_y != b.grid_y ) ||
129  ( a.grid_z != b.grid_z ) ||
130  ( a.block_x != b.block_x ) ||
131  ( a.block_y != b.block_y ) ||
132  ( a.block_z != b.block_z );
133  };
134 
135  if (output.is_open())
136  {
137  output << "[\n";
138  const std::string device_name = CaloRecGPU::CUDA_Helpers::GPU_name();
139  bool first = true;
140  for (const auto & pair : m_kernel_map)
141  {
142  if (first)
143  {
144  first = false;
145  }
146  else
147  {
148  output << ",\n";
149  }
150 
151  KernelsEntry ke;
152  ke.device = device_name;
153  ke.name = pair.first;
154  ke.kernels.resize(pair.second.size());
155 
156  for (size_t i = 0; i < ke.kernels.size(); ++i)
157  {
158  const KernelRecord & kr = pair.second[i];
159 
160  KernelsEntry::KernelInfo ki;
161  for (int u = 0; u <= 100; ++u)
162  {
163  const CUDAKernelLaunchConfiguration & cfg = kr.configs[u];
164  if (delta_configs(cfg, ki))
165  {
166  if (ki.grid_x > 0)
167  {
168  ki.usage_end = u - 1;
169  ke.kernels[i].push_back(ki);
170  }
171  ki.usage_start = u;
172  ki.grid_x = cfg.grid_x;
173  ki.grid_y = cfg.grid_y;
174  ki.grid_z = cfg.grid_z;
175  ki.block_x = cfg.block_x;
176  ki.block_y = cfg.block_y;
177  ki.block_z = cfg.block_z;
178  }
179  }
180  if (ki.grid_x > 0)
181  {
182  ki.usage_end = 100;
183  ke.kernels[i].push_back(ki);
184  }
185 
186  }
187 
188  nlohmann::json j = ke;
189 
190  output << j.dump(2);
191 
192  }
193  output << "\n]" << std::endl;
194  }
195  else
196  {
197  ATH_MSG_WARNING("Cannot open '" << m_outputFile << "' for kernel size output.");
198  }
199  }
200  return StatusCode::SUCCESS;
201 }

◆ get_GPU_usage()

int GPUKernelSizeOptimizerSvc::get_GPU_usage ( ) const
[inline], [private]

Get the GPU usage, in percentage, rounded to the nearest integer.

Warning
Getting GPU usage not yet supported in the current version of the code, it will default to considering the GPU 100% available.

Definition at line 113 of file GPUKernelSizeOptimizerSvc.h.

114  {
115  return 0;
116  }

◆ get_launch_configuration()

CUDAKernelLaunchConfiguration GPUKernelSizeOptimizerSvc::get_launch_configuration ( const std::string &  name,
const int  number = 0,
const int  dynamic_memory = 0 
) const
overridevirtual

Retrieve the (hopefully optimal) kernel launch configuration.

Definition at line 51 of file GPUKernelSizeOptimizerSvc.cxx.

52 {
53  auto it = m_kernel_map.find(name);
54  if (it != m_kernel_map.end() && int(it->second.size()) > number)
55  {
56  const int usage = get_GPU_usage();
57  return it->second[number].configs[usage];
58  }
59  else
60  {
61  return {};
62  }
63 }

◆ handle()

void CaloGPUCUDAInitialization::handle ( const Incident &  incident)
[inline], [override], [inherited]

Definition at line 66 of file CaloGPUCUDAInitialization.h.

67  {
68  const bool is_multiprocess = (Gaudi::Concurrency::ConcurrencyFlags::numProcs() > 0);
69  if (is_multiprocess && incident.type() == AthenaInterprocess::UpdateAfterFork::type())
70  {
71  if (!this->initialize_CUDA().isSuccess())
72  {
73  throw GaudiException("Failed to perform the CUDA initialization!",
74  "CaloGPUCUDAInitialization::handle",
75  StatusCode::FAILURE);
76  }
77  }
78  }

◆ initialize()

virtual StatusCode GPUKernelSizeOptimizerSvc::initialize ( )
[inline], [override], [virtual]

Reimplemented from CaloGPUCUDAInitialization.

Definition at line 74 of file GPUKernelSizeOptimizerSvc.h.

75  {
77  }

◆ initialize_CUDA()

StatusCode GPUKernelSizeOptimizerSvc::initialize_CUDA ( )
overridevirtual

Initialization that invokes CUDA functions.

Reimplemented from CaloGPUCUDAInitialization.

Definition at line 66 of file GPUKernelSizeOptimizerSvc.cxx.

67 {
70 
71  const std::string device_name = CaloRecGPU::CUDA_Helpers::GPU_name();
72 
73  for (const auto & file : m_kernelFiles)
74  {
75  std::ifstream in(file);
76 
77  if (!in.is_open())
78  {
79  ATH_MSG_WARNING("Cannot open '" << m_outputFile << "' for kernel size input.");
80  continue;
81  }
82 
84  in >> j;
85 
86  for (const auto & entry : j)
87  {
88  if (entry.at("device") != device_name)
89  {
90  continue;
91  }
92  const KernelsEntry ke = entry.template get<KernelsEntry>();
93 
94  std::vector<KernelRecord> & vect = m_kernel_map[ke.name];
95  vect.resize(ke.kernels.size());
96 
97  for (size_t i = 0; i < vect.size(); ++i)
98  {
99  for (const auto & ki : ke.kernels[i])
100  {
102  config.grid_x = ki.grid_x;
103  config.grid_y = ki.grid_y;
104  config.grid_z = ki.grid_z;
105  config.block_x = ki.block_x;
106  config.block_y = ki.block_y;
107  config.block_z = ki.block_z;
108 
109  vect[i].add_configuration(config, ki.usage_start, ki.usage_end, true);
110  }
111  }
112  }
113 
114  }
115 
116  return StatusCode::SUCCESS;
117 }

◆ initialize_non_CUDA()

virtual StatusCode CaloGPUCUDAInitialization::initialize_non_CUDA ( )
[inline], [protected], [virtual], [inherited]

Initialization that does not invoke CUDA functions.

Reimplemented in CaloGPUHybridClusterProcessor, TopoAutomatonClustering, BasicGPUClusterInfoCalculator, GPUClusterInfoAndMomentsCalculator, and TopoAutomatonSplitting.

Definition at line 33 of file CaloGPUCUDAInitialization.h.

34  {
35  return StatusCode::SUCCESS;
36  }

◆ register_kernels()

void GPUKernelSizeOptimizerSvc::register_kernels ( const std::string &  tool_name,
const int  number,
void **  kernels,
const int *  blocksize_hints,
const int *  gridsize_hints,
const int *  max_total_threads,
const int  offset = 0 
)
overridevirtual

Register a set of kernels that can be referred back to with a name and a number.

Uses C-style arrays for more immediate CUDA compatibility, assumes the size of kernels, blocksize_hints and gridsize_hints is number and starts the numbering with an optional offset.

Definition at line 17 of file GPUKernelSizeOptimizerSvc.cxx.

24 {
25  ATH_MSG_INFO("Registering " << number << " kernels under: " << tool_name);
26 
27  std::vector<KernelRecord> & vect = m_kernel_map[tool_name];
28 
29  if (int(vect.size()) < number + offset)
30  {
31  vect.resize(number + offset);
32  }
33 
34  for (int i = 0; i < number; ++i)
35  {
36  CUDAKernelLaunchConfiguration cfg{1, 1, 1, 1, 1, 1};
37  if (gridsize_hints[i] == IGPUKernelSizeOptimizer::SpecialSizeHints::CooperativeLaunch)
38  {
40  }
41  else
42  {
44  }
45  cfg.grid_x = std::min(cfg.grid_x, CaloRecGPU::Helpers::int_ceil_div(max_total_threads[i], cfg.block_x));
46  vect[i + offset].add_configuration(cfg);
47  }
48 }

◆ should_use_minimal_kernel_sizes()

virtual bool GPUKernelSizeOptimizerSvc::should_use_minimal_kernel_sizes ( ) const
[inline], [virtual]

Whether to avoid oversizing kernels and instead (if possible) launch kernels with the exact number of threads...

Definition at line 67 of file GPUKernelSizeOptimizerSvc.h.

68  {
69  //Testing shows that, at least on the devices we use,
70  //we only lose performance by dyn-par'ing our way to do this.
71  return false;
72  }

Member Data Documentation

◆ m_coopgroup_support

bool GPUKernelSizeOptimizerSvc::m_coopgroup_support = false
private

Definition at line 86 of file GPUKernelSizeOptimizerSvc.h.

◆ m_dynpar_support

bool GPUKernelSizeOptimizerSvc::m_dynpar_support = false
private

Definition at line 85 of file GPUKernelSizeOptimizerSvc.h.

◆ m_kernel_map

std::unordered_map<std::string, std::vector<KernelRecord> > GPUKernelSizeOptimizerSvc::m_kernel_map
private

Definition at line 107 of file GPUKernelSizeOptimizerSvc.h.

◆ m_kernelFiles

Gaudi::Property<std::vector<std::string> > GPUKernelSizeOptimizerSvc::m_kernelFiles {this, "KernelSizeInput", {}, "Kernel size input JSON files"}
private

List of JSON files from where to read (hopefully optimized) kernel sizes for different GPUs.

Definition at line 120 of file GPUKernelSizeOptimizerSvc.h.

◆ m_outputFile

Gaudi::Property<std::string> GPUKernelSizeOptimizerSvc::m_outputFile {this, "OutputFile", "sizes.json", "Kernel size output file"}
private

If m_outputSizes is true, the file to which the kernel sizes should be output.

Definition at line 129 of file GPUKernelSizeOptimizerSvc.h.

◆ m_outputSizes

Gaudi::Property<bool> GPUKernelSizeOptimizerSvc::m_outputSizes {this, "OutputSizes", true, "Write out last used kernel sizes"}
private

If true, writes the (last used) kernel sizes to an output JSON file.

Defaults to true.

Definition at line 125 of file GPUKernelSizeOptimizerSvc.h.


The documentation for this class was generated from the following files:
AtlCoolConsole.usage
tuple usage
Definition: AtlCoolConsole.py:443
CaloRecGPU::CUDA_Helpers::optimize_block_and_grid_size
void optimize_block_and_grid_size(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size according to cudaOccupancyMaxPotentialBlockSize.
CaloRecGPU::CUDA_Helpers::supports_cooperative_launches
bool supports_cooperative_launches()
ATH_MSG_INFO
#define ATH_MSG_INFO(x)
Definition: AthMsgStreamMacros.h:31
CaloRecGPU::CUDA_Helpers::optimize_block_and_grid_size_for_cooperative_launch
void optimize_block_and_grid_size_for_cooperative_launch(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size for a cooperative launch.
json
nlohmann::json json
Definition: HistogramDef.cxx:9
GPUKernelSizeOptimizerSvc::m_kernelFiles
Gaudi::Property< std::vector< std::string > > m_kernelFiles
List of JSON files from where to read (hopefully optimized) kernel sizes for different GPUs.
Definition: GPUKernelSizeOptimizerSvc.h:120
GPUKernelSizeOptimizerSvc::m_outputSizes
Gaudi::Property< bool > m_outputSizes
If true, writes the (last used) kernel sizes to an output JSON file.
Definition: GPUKernelSizeOptimizerSvc.h:125
CaloRecGPU::Helpers::int_ceil_div
constexpr int int_ceil_div(const int num, const int denom)
Returns the ceiling of num/denom, with proper rounding.
Definition: Calorimeter/CaloRecGPU/CaloRecGPU/Helpers.h:213
skel.it
it
Definition: skel.GENtoEVGEN.py:423
Trk::u
@ u
Enums for curvilinear frames.
Definition: ParamDefs.h:83
config
Definition: PhysicsAnalysis/AnalysisCommon/AssociationUtils/python/config.py:1
CaloGPUCUDAInitialization::initialize
virtual StatusCode initialize()
Definition: CaloGPUCUDAInitialization.h:44
CaloRecGPU::CUDA_Helpers::GPU_name
std::string GPU_name()
lumiFormat.i
int i
Definition: lumiFormat.py:92
file
TFile * file
Definition: tile_monitor.h:29
GPUKernelSizeOptimizerSvc::m_kernel_map
std::unordered_map< std::string, std::vector< KernelRecord > > m_kernel_map
Definition: GPUKernelSizeOptimizerSvc.h:107
CaloRecGPU::CUDA_Helpers::supports_dynamic_parallelism
bool supports_dynamic_parallelism()
Handler::svc
AthROOTErrorHandlerSvc * svc
Definition: AthROOTErrorHandlerSvc.cxx:10
min
#define min(a, b)
Definition: cfImp.cxx:40
merge.output
output
Definition: merge.py:17
GetAllXsec.entry
list entry
Definition: GetAllXsec.py:132
python.selection.number
number
Definition: selection.py:20
name
std::string name
Definition: Control/AthContainers/Root/debug.cxx:192
plotBeamSpotMon.b
b
Definition: plotBeamSpotMon.py:77
WriteCaloSwCorrections.cfg
cfg
Definition: WriteCaloSwCorrections.py:23
GPUKernelSizeOptimizerSvc::m_outputFile
Gaudi::Property< std::string > m_outputFile
If m_outputSizes is true, the file to which the kernel sizes should be output.
Definition: GPUKernelSizeOptimizerSvc.h:129
GPUKernelSizeOptimizerSvc::get_GPU_usage
int get_GPU_usage() const
Get the GPU usage, in percentage, rounded to the nearest integer.
Definition: GPUKernelSizeOptimizerSvc.h:113
AthenaInterprocess::UpdateAfterFork::type
static const std::string & type()
Incident type.
Definition: Incidents.h:49
CaloGPUCUDAInitialization::initialize_CUDA
virtual StatusCode initialize_CUDA()
Initialization that invokes CUDA functions.
Definition: CaloGPUCUDAInitialization.h:39
a
TList * a
Definition: liststreamerinfos.cxx:10
GPUKernelSizeOptimizerSvc::m_dynpar_support
bool m_dynpar_support
Definition: GPUKernelSizeOptimizerSvc.h:85
ATH_MSG_WARNING
#define ATH_MSG_WARNING(x)
Definition: AthMsgStreamMacros.h:32
DeMoScan.first
bool first
Definition: DeMoScan.py:534
config
std::vector< std::string > config
Definition: fbtTestBasics.cxx:72
convertTimingResiduals.offset
offset
Definition: convertTimingResiduals.py:71
CUDAKernelLaunchConfiguration
Definition: IGPUKernelSizeOptimizer.h:13
GPUKernelSizeOptimizerSvc::m_coopgroup_support
bool m_coopgroup_support
Definition: GPUKernelSizeOptimizerSvc.h:86