ATLAS Offline Software
IterateUntilCondition.h
//
// Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
//
// Dear emacs, this is -*- c++ -*-
//

#ifndef CALORECGPU_ITERATEUNTILCONDITION_H

#define CALORECGPU_ITERATEUNTILCONDITION_H

#include <cooperative_groups.h>

#include <utility> //for std::forward
#include <cassert> //for the optional asserts below

#ifndef CALORECGPU_ITERATE_UNTIL_CONDITION_DEBUG

  #define CALORECGPU_ITERATE_UNTIL_CONDITION_DEBUG 0

#endif


#ifndef CALORECGPU_ITERATE_UNTIL_CONDITION_INCLUDE_ASSERTS

  #define CALORECGPU_ITERATE_UNTIL_CONDITION_INCLUDE_ASSERTS 0

#endif

{

  template <class Condition, class Before, class After, class ... Funcs>
  struct Holder;

  template <class Condition, class Before, class After, class ... Funcs, class ... Args>
  __device__ void cooperative_kernel_impl(const Holder<Condition, Before, After, Funcs...> &, Args && ... args)
  {
    cooperative_groups::grid_group grid = cooperative_groups::this_grid();

    Condition checker;

    Before{}(gridDim.x, blockIdx.x, checker, std::forward<Args>(args)...);

    while (!checker(gridDim.x, blockIdx.x, std::forward<Args>(args)...))
    {
      auto helper = [&](auto func)
      {
        func(gridDim.x, blockIdx.x, checker, std::forward<Args>(args)...);
        grid.sync();
      };

      (helper(Funcs{}), ...);
    }

    After{}(gridDim.x, blockIdx.x, checker, std::forward<Args>(args)...);
  }
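
  //Note: grid.sync() is only valid when the kernel is started through a cooperative launch
  //(cudaLaunchCooperativeKernel), which Holder::execute below uses when use_native_sync is true.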

  template <class HolderLike, class ... Args>
  __global__ void cooperative_kernel(Args ... args)
  {
    cooperative_kernel_impl(HolderLike{}, args...);
  }

  struct BasicStorage
  {
    static constexpr unsigned int NumMaxBlocks = 1024;

    unsigned int mutex_check;
    unsigned int mutex_ticket;
    unsigned int count;
    unsigned int poll_closed;
    unsigned int wait_flags[NumMaxBlocks];
  };

  struct Storage : public BasicStorage
  {
    unsigned int block_indices[NumMaxBlocks];
  };
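
  //The three helpers below implement a simple device-wide ticket lock on top of BasicStorage:
  //try_lock_mutex draws a ticket from mutex_ticket and spins on mutex_check until its turn comes
  //up or the lock has been disabled (top bit of mutex_check set), reporting failure if the lock
  //was disabled or the wait went on for too long; unlock_mutex advances mutex_check to admit the
  //next ticket holder; disable_mutex sets the top bit so that all waiters give up.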

  inline __device__ bool try_lock_mutex(Storage * store)
  {
    const unsigned int ticket = atomicAdd(&store->mutex_ticket, 1U);

    unsigned int last_check = 0;

    bool was_once_valid = false;

    int count = 0;

    do
    {
      last_check = atomicOr(&store->mutex_check, 0U);
      was_once_valid = !(last_check & 0x80000000U);
      ++count;
    }
    while (last_check < ticket && !(last_check & 0x80000000U));

    return was_once_valid && count < 10000;
  }

  inline __device__ void unlock_mutex(Storage * store)
  {
    atomicAdd(&store->mutex_check, 1U);
  }

  inline __device__ void disable_mutex(Storage * store)
  {
    atomicOr(&store->mutex_check, 0x80000000U);
  }

  inline __device__ bool check_if_participating(Storage * store)
  {
    const bool locked = try_lock_mutex(store);

    unsigned int old_count = Storage::NumMaxBlocks;

    if (atomicOr(&store->poll_closed, 0U) == 0)
    {
      old_count = atomicAdd(&store->count, 1);
      store->block_indices[blockIdx.x] = old_count;
      unlock_mutex(store);
    }
    else
    {
      if (locked)
      {
        unlock_mutex(store);
      }
      return false;
    }

    if (atomicOr(&store->poll_closed, 0U))
    {
      try_lock_mutex(store);
      atomicOr(&store->poll_closed, 1U);
      //disable_mutex(store);
      unlock_mutex(store);
    }

    return (old_count < Storage::NumMaxBlocks);
  }
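
  //In short: while the poll (poll_closed) is open, each arriving block takes the ticket lock,
  //reserves the next sequential value of count as its entry in block_indices, and participates
  //only if that value is below Storage::NumMaxBlocks; blocks that find the poll already closed
  //return false and sit the iteration loop out.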

  //Possible TO-DO:
  //Some/all of these atomic operations
  //probably just require volatile semantics.
  //To investigate at some other point...

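  //One possible direction for the TO-DO above (a commented-out sketch, not used by this header,
  //and assuming libcu++'s <cuda/atomic> is available): express the atomicOr(..., 0U) read-backs
  //as explicit atomic loads, e.g.
  //
  //  #include <cuda/atomic>
  //
  //  inline __device__ unsigned int atomic_read(unsigned int & x)
  //  {
  //    cuda::atomic_ref<unsigned int, cuda::thread_scope_device> ref(x);
  //    return ref.load(cuda::std::memory_order_acquire);
  //  }
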
  template <class Condition, class Before, class After, class ... Funcs, class ... Args>
  __device__ void normal_kernel_impl(const Holder<Condition, Before, After, Funcs...> &, Storage * store, Args && ... args)
  {
    __shared__ bool is_participating;

    const bool is_reference_thread = (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);

    if (is_reference_thread)
    {
      is_participating = check_if_participating(store);
    }

    __syncthreads();

    const unsigned int this_block_index = store->block_indices[blockIdx.x];
    const unsigned int total_blocks = min(store->count, Storage::NumMaxBlocks);

    if (is_participating)
    {
      const bool is_reference_block = (this_block_index == 0);

      const unsigned int this_thread_index = threadIdx.z * blockDim.y * blockDim.x +
                                             threadIdx.y * blockDim.x +
                                             threadIdx.x;

      const unsigned int num_threads_per_block = blockDim.x * blockDim.y * blockDim.z;

      Condition checker;

      Before{}(total_blocks, this_block_index, checker, std::forward<Args>(args)...);

      while (!checker(total_blocks, this_block_index, std::forward<Args>(args)...))
      {
        auto helper = [&](auto func)
        {
          func(total_blocks, this_block_index, checker, std::forward<Args>(args)...);

          return;
          //The early return above means the wait_flags hand-shake below is currently not executed.

          //Technically, for the foreseeable future,
          //this could be simply the if,
          //as the maximum number of concurrent blocks
          //in all devices is smaller than 1024...
          if (is_reference_block)
          {
            for (unsigned int block_to_check = this_thread_index + 1; block_to_check < total_blocks; block_to_check += num_threads_per_block)
            {
              while (store->wait_flags[block_to_check] == 0);
              //When porting to non-CUDA, this may need to be some form of atomic load.
            }

            __syncthreads();

            for (unsigned int block_to_check = this_thread_index + 1; block_to_check < total_blocks; block_to_check += num_threads_per_block)
            {
              atomicAnd(&(store->wait_flags[block_to_check]), 0U);
            }
          }
          else
          {
            __syncthreads();

            if (is_reference_thread)
            {
              atomicOr(&(store->wait_flags[this_block_index]), 1U);

              while (store->wait_flags[this_block_index] != 0);
              //When porting to non-CUDA, this may need to be some form of atomic load.
            }

            __syncthreads();
          }
        };

        (helper(Funcs{}), ...);
      }

      After{}(total_blocks, this_block_index, checker, std::forward<Args>(args)...);
    }

#if CALORECGPU_ITERATE_UNTIL_CONDITION_DEBUG
    if (is_reference_thread)
    {
      printf("%u | %d | %u %u\n", blockIdx.x, static_cast<int>(is_participating), total_blocks, this_block_index);
    }
#endif
  }

  template <class HolderLike, class ... Args>
  __global__ void normal_kernel(Storage * store, Args ... args)
  {
    normal_kernel_impl(HolderLike{}, store, args...);
  }

  //Condition, Before, After and Funcs must all be functor classes.
  template <class Condition, class Before, class After, class ... Funcs>
  struct Holder
  {
    template <class ... Args>
    static void execute(const bool use_native_sync,
                        const dim3 & grid_size,
                        const dim3 & block_size,
                        size_t shared_memory,
                        cudaStream_t stream,
                        Storage * gpu_ptr,
                        Args ... args)
    {
#if CALORECGPU_ITERATE_UNTIL_CONDITION_INCLUDE_ASSERTS
      assert(grid_size.x <= Storage::NumMaxBlocks);
      assert(grid_size.y == 1);
      assert(grid_size.z == 1);
#endif

      if (use_native_sync)
      {
        void * arg_ptrs[] = { static_cast<void *>(&args)... };

        cudaLaunchCooperativeKernel((void *) cooperative_kernel<Holder, Args...>,
                                    grid_size,
                                    block_size,
                                    arg_ptrs,
                                    shared_memory,
                                    stream);
      }
      else
      {
        cudaMemsetAsync(static_cast<BasicStorage *>(gpu_ptr), 0, sizeof(BasicStorage), stream);

        normal_kernel<Holder, Args...> <<< grid_size, block_size, shared_memory, stream >>> (gpu_ptr, args...);
      }
    }
  };

  //Must pass functors!
  template <class Condition, class Before, class After, class ... Funcs>
  auto make_holder(Condition c, Before b, After a, Funcs ... fs)
  {
    return Holder<Condition, Before, After, Funcs...> {};
  }
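
  //Hypothetical usage sketch (MyCondition, MyBefore, MyAfter, MyStep and the host-side
  //variables below are illustrative only and not part of this header):
  //
  //  struct MyCondition
  //  {
  //    __device__ bool operator() (const unsigned int num_blocks, const unsigned int block_index,
  //                                int * converged_flag) const
  //    {
  //      return *converged_flag != 0;
  //    }
  //  };
  //
  //  struct MyBefore
  //  {
  //    __device__ void operator() (const unsigned int num_blocks, const unsigned int block_index,
  //                                MyCondition &, int * converged_flag) const
  //    { /* set-up work */ }
  //  };
  //
  //  struct MyAfter : MyBefore {};     //clean-up work
  //  struct MyStep  : MyBefore {};     //one iteration of the actual work
  //
  //  using MyLoop = decltype(make_holder(MyCondition{}, MyBefore{}, MyAfter{}, MyStep{}));
  //
  //  //storage_ptr is a device-side Storage *; it is only touched in the non-cooperative fall-back:
  //  MyLoop::execute(use_native_sync, grid_size, block_size, 0, stream, storage_ptr, converged_flag);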
}

#endif