da/d7e/GPUKernelSizeOptimizerSvc_8h_source.html

//

// Copyright (C) 2002-2023 CERN for the benefit of the ATLAS collaboration

//

// Dear emacs, this is -*- c++ -*-

//


#ifndef CALORECGPU_GPUKERNELSIZEOPTIMIZERSVC_H

#define CALORECGPU_GPUKERNELSIZEOPTIMIZERSVC_H


#include <string>

#include <vector>

#include <unordered_map>

#include <cstdint>


#include "AthenaBaseComps/AthService.h"


#include "CaloRecGPU/IGPUKernelSizeOptimizerSvc.h"

#include "CaloRecGPU/CaloGPUCUDAInitialization.h"


#include <nlohmann/json.hpp>


class GPUKernelSizeOptimizerSvc : public extends <AthService, IGPUKernelSizeOptimizerSvc>, public CaloGPUCUDAInitialization

{

 public:


  GPUKernelSizeOptimizerSvc(const std::string & name, ISvcLocator * svc);


  virtual void register_kernels(const std::string & tool_name,

                                const int number,

                                void ** kernels,

                                const int * blocksize_hints,

                                const int * gridsize_hints,

                                const int * max_total_threads,

                                const int offset = 0) override;


  virtual CUDAKernelLaunchConfiguration get_launch_configuration(const std::string & name,

                                                                 const int number = 0,

                                                                 const int dynamic_memory = 0) const override;


  virtual bool can_use_cooperative_groups() const override

  {

    return m_coopgroup_support;

  }


  virtual bool can_use_dynamic_parallelism() const override

  {

    return m_dynpar_support;

  }


  virtual bool should_use_minimal_kernel_sizes() const

  {

    //Testing shows that, at least on the devices we use,

    //we only lose performance by dyn-par'ing our way to do this.

    return false;

  }


  virtual StatusCode initialize() override

  {

    return CaloGPUCUDAInitialization::initialize();

  }


  virtual StatusCode initialize_CUDA() override;


  virtual StatusCode finalize() override;


 private:


  bool m_dynpar_support = false;

  bool m_coopgroup_support = false;


  struct KernelRecord

  {

    CUDAKernelLaunchConfiguration configs[101];

    void add_configuration(const CUDAKernelLaunchConfiguration & config,

                           const int usage_start = 0,

                           const int usage_end = 100,

                           const bool overwrite = false)

    {

      for (int u = usage_start; u <= usage_end && u <= 100; ++u)

        {

          CUDAKernelLaunchConfiguration & cfg = configs[u];

          if (overwrite || cfg.grid_x <= 0)

            {

              cfg = config;

            }

        }

    }

  };


  std::unordered_map<std::string, std::vector<KernelRecord>> m_kernel_map;


  int get_GPU_usage() const

  {

    return 0;

  }


  Gaudi::Property<std::vector<std::string>> m_kernelFiles {this, "KernelSizeInput", {}, "Kernel size input JSON files"};


  Gaudi::Property<bool> m_outputSizes {this, "OutputSizes", true, "Write out last used kernel sizes"};


  Gaudi::Property<std::string> m_outputFile {this, "OutputFile", "sizes.json", "Kernel size output file"};


  struct KernelsEntry

  {

    struct KernelInfo

    {

      int usage_start = 0;

      int usage_end = 100;

      int grid_x = 0, grid_y = 0, grid_z = 0, block_x = 0, block_y = 0, block_z = 0;


      NLOHMANN_DEFINE_TYPE_INTRUSIVE(KernelInfo, usage_start, usage_end,

                                     grid_x, grid_y, grid_z,

                                     block_x, block_y, block_z);

    };


    std::string device;

    std::string name;

    std::vector< std::vector<KernelInfo> > kernels;


    NLOHMANN_DEFINE_TYPE_INTRUSIVE(KernelsEntry, device, name, kernels)

  };


};


#endif