ATLAS Offline Software
IterateUntilCondition.h

//
// Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
//
// Dear emacs, this is -*- c++ -*-
//

#ifndef CALORECGPU_ITERATEUNTILCONDITION_H
#define CALORECGPU_ITERATEUNTILCONDITION_H

#include <cooperative_groups.h>
#include <utility>   //std::forward
#include <cassert>   //assert (only used when the asserts macro below is enabled)

#ifndef CALORECGPU_ITERATE_UNTIL_CONDITION_DEBUG
  #define CALORECGPU_ITERATE_UNTIL_CONDITION_DEBUG 0
#endif

#ifndef CALORECGPU_ITERATE_UNTIL_CONDITION_INCLUDE_ASSERTS
  #define CALORECGPU_ITERATE_UNTIL_CONDITION_INCLUDE_ASSERTS 0
#endif

namespace //(namespace name not preserved in this extracted listing)
{
  template <class Condition, class Before, class After, class ... Funcs>
  struct Holder;

  template <class Condition, class Before, class After, class ... Funcs, class ... Args>
  __device__ void cooperative_kernel_impl(const Holder<Condition, Before, After, Funcs...> &, Args && ... args)
  {
    cooperative_groups::grid_group grid = cooperative_groups::this_grid();

    Condition checker;

    Before{}(gridDim.x, blockIdx.x, checker, std::forward<Args>(args)...);

    while (!checker(gridDim.x, blockIdx.x, std::forward<Args>(args)...))
    {
      auto helper = [&](auto func)
      {
        func(gridDim.x, blockIdx.x, checker, std::forward<Args>(args)...);
        grid.sync();
      };

      (helper(Funcs{}), ...);
    }

    After{}(gridDim.x, blockIdx.x, checker, std::forward<Args>(args)...);
  }

  template <class HolderLike, class ... Args>
  __global__ void cooperative_kernel(Args ... args)
  {
    cooperative_kernel_impl(HolderLike{}, args...);
  }
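
  //Illustrative sketch (added; not part of the original file): the interface that the
  //Condition / Before / After / Funcs template parameters are expected to provide.
  //'Condition' is default-constructed in each thread and queried as
  //checker(num_blocks, block_index, args...), returning true once the iteration should stop;
  //'Before', 'After' and every element of 'Funcs' are default-constructed and invoked as
  //f(num_blocks, block_index, checker, args...), with a grid-wide synchronisation after each
  //of the 'Funcs' calls. The functor and argument names below are hypothetical.
  /*
  struct ExampleCondition
  {
    unsigned int iteration = 0;

    __device__ bool operator() (const unsigned int num_blocks, const unsigned int block_index,
                                float * data, const int num_iterations)
    {
      return iteration >= static_cast<unsigned int>(num_iterations);
    }
  };

  struct ExampleStep
  {
    __device__ void operator() (const unsigned int num_blocks, const unsigned int block_index,
                                ExampleCondition & checker, float * data, const int num_iterations)
    {
      //(The actual per-block work on 'data' would go here.)
      //'checker' is local to each thread, so every thread updates its own copy
      //to keep the exit condition uniform across the grid.
      ++checker.iteration;
    }
  };

  struct ExampleNoOp
  {
    __device__ void operator() (const unsigned int, const unsigned int,
                                ExampleCondition &, float *, const int) const
    {
    }
  };
  */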

  //Synchronisation state for the software ("non-native") grid barrier.
  //This is the part that must be zeroed before every launch that does not
  //use cooperative groups (see Holder::execute below).
  struct BasicStorage
  {
    static constexpr unsigned int NumMaxBlocks = 1024;

    unsigned int mutex_check;
    unsigned int mutex_ticket;
    unsigned int count;
    unsigned int poll_closed;
    unsigned int wait_flags[NumMaxBlocks];
  };

  struct Storage : BasicStorage
  {
    unsigned int block_indices[NumMaxBlocks];
  };

  inline __device__ bool try_lock_mutex(Storage * store)
  {
    const unsigned int ticket = atomicAdd(&store->mutex_ticket, 1U);

    unsigned int last_check = 0;

    bool was_once_valid = false;

    volatile unsigned int * to_check = static_cast<volatile unsigned int *>(&store->mutex_check);

    do
    {
      last_check = *to_check;
      was_once_valid = !(last_check & 0x80000000U);
    }
    while (last_check < ticket && !(last_check & 0x80000000U));

    return was_once_valid;
  }

  inline __device__ void unlock_mutex(Storage * store)
  {
    volatile unsigned int * ptr = static_cast<volatile unsigned int *>(&store->mutex_check);
    *ptr = *ptr + 1;
  }

  inline __device__ void disable_mutex(Storage * store)
  {
    volatile unsigned int * ptr = static_cast<volatile unsigned int *>(&store->mutex_check);
    *ptr = *ptr | 0x80000000U;
  }
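
  //Note (added): the three functions above implement a ticket lock with a "kill switch":
  //mutex_ticket hands out tickets via atomicAdd, mutex_check holds the ticket currently
  //being served (advanced by unlock_mutex), and disable_mutex sets bit 31 of mutex_check
  //so that every present and future waiter gives up, with try_lock_mutex returning false.
  //A host-side analogue using std::atomic, for exposition only (not used by this header):
  /*
  #include <atomic>

  struct TicketLock
  {
    std::atomic<unsigned int> ticket {0};
    std::atomic<unsigned int> check {0};

    bool try_lock()
    {
      const unsigned int my_ticket = ticket.fetch_add(1U);
      unsigned int seen = 0;
      do
      {
        seen = check.load();
      }
      while (seen < my_ticket && !(seen & 0x80000000U));
      return !(seen & 0x80000000U);
    }

    void unlock()  { check.fetch_add(1U); }
    void disable() { check.fetch_or(0x80000000U); }
  };
  */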

  inline __device__ bool check_if_participating(Storage * store)
  {
    const bool locked = try_lock_mutex(store);

    unsigned int old_count = Storage::NumMaxBlocks;

    volatile unsigned int * poll_closed_ptr = static_cast<volatile unsigned int *>(&store->poll_closed);

    if (*poll_closed_ptr == 0)
    {
      old_count = atomicAdd(&store->count, 1);
      store->block_indices[blockIdx.x] = old_count;
      unlock_mutex(store);
    }
    else
    {
      if (locked)
      {
        unlock_mutex(store);
      }
      return false;
    }

    if (*poll_closed_ptr == 0)
    {
      try_lock_mutex(store);
      *poll_closed_ptr = 1;
      disable_mutex(store);
      unlock_mutex(store);
    }

    return (old_count < Storage::NumMaxBlocks);
  }
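
  //Summary (added): each block's reference thread registers itself here once per launch.
  //While the poll is still open (poll_closed == 0), a block takes the ticket lock, claims
  //the next participation index via atomicAdd on count and records it in
  //block_indices[blockIdx.x]. The first block(s) to find the poll still open afterwards
  //close it and disable the mutex, so blocks that only get scheduled later return false
  //and exit the kernel immediately. The software barrier below therefore only waits on
  //blocks that are actually co-resident on the device, which is what lets it avoid
  //deadlocking without a cooperative launch.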

  template <class Condition, class Before, class After, class ... Funcs, class ... Args>
  __device__ void normal_kernel_impl(const Holder<Condition, Before, After, Funcs...> &, Storage * store, Args && ... args)
  {
    __shared__ bool is_participating;

    const bool is_reference_thread = (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);

    if (is_reference_thread)
    {
      is_participating = check_if_participating(store);
    }

    __syncthreads();

    const unsigned int this_block_index = store->block_indices[blockIdx.x];
    const unsigned int total_blocks = min(*static_cast<volatile unsigned int *>(&store->count), Storage::NumMaxBlocks);

    if (is_participating)
    {
      const bool is_reference_block = (this_block_index == 0);

      const unsigned int this_thread_index = threadIdx.z * blockDim.y * blockDim.x +
                                             threadIdx.y * blockDim.x +
                                             threadIdx.x;

      const unsigned int num_threads_per_block = blockDim.x * blockDim.y * blockDim.z;

      Condition checker;

      Before{}(total_blocks, this_block_index, checker, std::forward<Args>(args)...);

      while (!checker(total_blocks, this_block_index, std::forward<Args>(args)...))
      {
        auto helper = [&](auto func)
        {
          func(total_blocks, this_block_index, checker, std::forward<Args>(args)...);

          //Technically, for the foreseeable future,
          //this loop could simply be an 'if',
          //as the maximum number of concurrent blocks
          //on all devices is smaller than 1024...
          if (is_reference_block)
          {
            for (unsigned int block_to_check = this_thread_index + 1; block_to_check < total_blocks; block_to_check += num_threads_per_block)
            {
              unsigned int last_check = 0;

              volatile unsigned int * to_check = static_cast<volatile unsigned int *>(&store->wait_flags[block_to_check]);

              do
              {
                last_check = *to_check;
              }
              while (last_check == 0);
              //When porting to non-CUDA, this may need to be some form of atomic load.
            }

            __syncthreads();

            for (unsigned int block_to_check = this_thread_index + 1; block_to_check < total_blocks; block_to_check += num_threads_per_block)
            {
              __threadfence();
              //*static_cast<volatile unsigned int *>(&store->wait_flags[block_to_check]) = 0;
              atomicAnd(&(store->wait_flags[block_to_check]), 0U);
            }

            __syncthreads();
          }
          else
          {
            __syncthreads();

            if (is_reference_thread)
            {
              atomicOr(&(store->wait_flags[this_block_index]), 1U);

              unsigned int last_check = 1;

              volatile unsigned int * to_check = static_cast<volatile unsigned int *>(&store->wait_flags[this_block_index]);

              __threadfence();

              do
              {
                last_check = *to_check;
              }
              while (last_check != 0);
              //When porting to non-CUDA, this may need to be some form of atomic load.
            }

            __syncthreads();
          }
        };

        (helper(Funcs{}), ...);
      }

      After{}(total_blocks, this_block_index, checker, std::forward<Args>(args)...);
    }

#if CALORECGPU_ITERATE_UNTIL_CONDITION_DEBUG
    if (is_reference_thread)
    {
      printf("%u | %d | %u %u\n", blockIdx.x, static_cast<int>(is_participating), total_blocks, this_block_index);
    }
#endif
  }
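
  //Note (added): inside the loop above, the grid-wide grid.sync() of the cooperative
  //version is replaced by a handshake through wait_flags: every non-reference block
  //raises its flag and its reference thread spins until the flag is cleared again,
  //while block 0 (the reference block) waits until all participating blocks have raised
  //their flags and then clears them, releasing everyone for the next iteration.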

  template <class HolderLike, class ... Args>
  __global__ void normal_kernel(Storage * store, Args ... args)
  {
    normal_kernel_impl(HolderLike{}, store, args...);
  }

  ///Condition, Before, After and Funcs must all be functor classes.
  template <class Condition, class Before, class After, class ... Funcs>
  struct Holder
  {
    template <class ... Args>
    static void execute(const bool use_native_sync,
                        const dim3 & grid_size,
                        const dim3 & block_size,
                        size_t shared_memory,
                        cudaStream_t stream,
                        Storage * gpu_ptr,
                        Args ... args)
    {
#if CALORECGPU_ITERATE_UNTIL_CONDITION_INCLUDE_ASSERTS
      assert(grid_size.x <= Storage::NumMaxBlocks);
      assert(grid_size.y == 1);
      assert(grid_size.z == 1);
#endif

      if (use_native_sync)
      {
        void * arg_ptrs[] = { static_cast<void *>(&args)... };

        cudaLaunchCooperativeKernel((void *) cooperative_kernel<Holder, Args...>,
                                    grid_size,
                                    block_size,
                                    arg_ptrs,
                                    shared_memory,
                                    stream);
      }
      else
      {
        cudaMemsetAsync(static_cast<BasicStorage *>(gpu_ptr), 0, sizeof(BasicStorage), stream);

        normal_kernel<Holder, Args...> <<< grid_size, block_size, shared_memory, stream>>>(gpu_ptr, args...);
      }
    }
  };
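
  //Illustrative sketch (added): the header leaves the choice of use_native_sync to the
  //caller. One possible policy, shown here only as an example, is to require both device
  //support for cooperative launches and a grid small enough to be fully resident, which is
  //what cudaLaunchCooperativeKernel demands. The helper name and the policy itself are
  //assumptions, not part of this file.
  /*
  template <class HolderLike, class ... Args>
  bool can_use_native_sync(const dim3 & grid_size, const dim3 & block_size,
                           const size_t shared_memory, const int device = 0)
  {
    int supports_coop = 0, num_sms = 0, blocks_per_sm = 0;

    cudaDeviceGetAttribute(&supports_coop, cudaDevAttrCooperativeLaunch, device);
    cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device);

    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm,
                                                  cooperative_kernel<HolderLike, Args...>,
                                                  block_size.x * block_size.y * block_size.z,
                                                  shared_memory);

    return supports_coop != 0 &&
           grid_size.x * grid_size.y * grid_size.z <= static_cast<unsigned int>(blocks_per_sm * num_sms);
  }
  */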

  ///Must pass functors!
  template <class Condition, class Before, class After, class ... Funcs>
  auto make_holder(Condition c, Before b, After a, Funcs ... fs)
  {
    return Holder<Condition, Before, After, Funcs...> {};
  }
}

#endif
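
//Usage sketch (added; illustrative only). It reuses the hypothetical ExampleCondition /
//ExampleStep / ExampleNoOp functors sketched earlier, omits error checking and leaves out
//the namespace qualification that the real header provides. make_holder takes the condition,
//the before-functor, the after-functor and then the per-iteration steps, in that order.
//use_native_sync is set to false here, so Holder::execute zeroes the BasicStorage part of
//the buffer itself and relies on the software grid barrier.
/*
void run_iteration(float * device_data, const int num_iterations, cudaStream_t stream)
{
  using MyHolder = decltype(make_holder(ExampleCondition{}, ExampleNoOp{},
                                        ExampleNoOp{}, ExampleStep{}));

  Storage * store = nullptr;
  cudaMalloc((void **) &store, sizeof(Storage));

  const dim3 grid_size(64, 1, 1), block_size(256, 1, 1);

  MyHolder::execute(false, grid_size, block_size, 0, stream, store,
                    device_data, num_iterations);

  cudaStreamSynchronize(stream);
  cudaFree(store);
}
*/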