ATLAS Offline Software
IterateUntilCondition.h

//
// Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
//
// Dear emacs, this is -*- c++ -*-
//

#ifndef CALORECGPU_ITERATEUNTILCONDITION_H
#define CALORECGPU_ITERATEUNTILCONDITION_H

#include <cooperative_groups.h>
#include <utility>   //std::forward
#include <cassert>   //assert (only used when the asserts macro below is enabled)

#ifndef CALORECGPU_ITERATE_UNTIL_CONDITION_DEBUG
  #define CALORECGPU_ITERATE_UNTIL_CONDITION_DEBUG 0
#endif

#ifndef CALORECGPU_ITERATE_UNTIL_CONDITION_INCLUDE_ASSERTS
  #define CALORECGPU_ITERATE_UNTIL_CONDITION_INCLUDE_ASSERTS 0
#endif

namespace //(namespace name not preserved in this extracted listing)
{
  template <class Condition, class Before, class After, class ... Funcs>
  struct Holder;

  template <class Condition, class Before, class After, class ... Funcs, class ... Args>
  __device__ void cooperative_kernel_impl(const Holder<Condition, Before, After, Funcs...> &, Args && ... args)
  {
    cooperative_groups::grid_group grid = cooperative_groups::this_grid();

    Condition checker;

    Before{}(gridDim.x, blockIdx.x, checker, std::forward<Args>(args)...);

    while (!checker(gridDim.x, blockIdx.x, std::forward<Args>(args)...))
    {
      auto helper = [&](auto func)
      {
        func(gridDim.x, blockIdx.x, checker, std::forward<Args>(args)...);
        grid.sync();
      };

      (helper(Funcs{}), ...);
    }

    After{}(gridDim.x, blockIdx.x, checker, std::forward<Args>(args)...);
  }

  template <class HolderLike, class ... Args>
  __global__ void cooperative_kernel(Args ... args)
  {
    cooperative_kernel_impl(HolderLike{}, args...);
  }
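
  //Illustrative sketch (added; not part of the original file): the interface that the
  //Condition / Before / After / Funcs template parameters are expected to provide.
  //'Condition' is default-constructed in each thread and queried as
  //checker(num_blocks, block_index, args...), returning true once the iteration should stop;
  //'Before', 'After' and every element of 'Funcs' are default-constructed and invoked as
  //f(num_blocks, block_index, checker, args...), with a grid-wide synchronisation after each
  //of the 'Funcs' calls. The functor and argument names below are hypothetical.
  /*
  struct ExampleCondition
  {
    unsigned int iteration = 0;

    __device__ bool operator() (const unsigned int num_blocks, const unsigned int block_index,
                                float * data, const int num_iterations)
    {
      return iteration >= static_cast<unsigned int>(num_iterations);
    }
  };

  struct ExampleStep
  {
    __device__ void operator() (const unsigned int num_blocks, const unsigned int block_index,
                                ExampleCondition & checker, float * data, const int num_iterations)
    {
      //(The actual per-block work on 'data' would go here.)
      //'checker' is local to each thread, so every thread updates its own copy
      //to keep the exit condition uniform across the grid.
      ++checker.iteration;
    }
  };

  struct ExampleNoOp
  {
    __device__ void operator() (const unsigned int, const unsigned int,
                                ExampleCondition &, float *, const int) const
    {
    }
  };
  */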

  //Synchronisation state for the software ("non-native") grid barrier.
  //This is the part that must be zeroed before every launch that does not
  //use cooperative groups (see Holder::execute below).
  struct BasicStorage
  {
    static constexpr unsigned int NumMaxBlocks = 1024;

    unsigned int mutex_check;
    unsigned int mutex_ticket;
    unsigned int count;
    unsigned int poll_closed;
    unsigned int wait_flags[NumMaxBlocks];
  };

  struct Storage : BasicStorage
  {
    unsigned int block_indices[NumMaxBlocks];
  };

  inline __device__ bool try_lock_mutex(Storage * store)
  {
    const unsigned int ticket = atomicAdd(&store->mutex_ticket, 1U);

    unsigned int last_check = 0;

    bool was_once_valid = false;

    volatile unsigned int * to_check = static_cast<volatile unsigned int *>(&store->mutex_check);

    do
    {
      last_check = *to_check;
      was_once_valid = !(last_check & 0x80000000U);
    }
    while (last_check < ticket && !(last_check & 0x80000000U));

    return was_once_valid;
  }

  inline __device__ void unlock_mutex(Storage * store)
  {
    volatile unsigned int * ptr = static_cast<volatile unsigned int *>(&store->mutex_check);
    *ptr = *ptr + 1;
  }

  inline __device__ void disable_mutex(Storage * store)
  {
    volatile unsigned int * ptr = static_cast<volatile unsigned int *>(&store->mutex_check);
    *ptr = *ptr | 0x80000000U;
  }
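
  //Note (added): the three functions above implement a ticket lock with a "kill switch":
  //mutex_ticket hands out tickets via atomicAdd, mutex_check holds the ticket currently
  //being served (advanced by unlock_mutex), and disable_mutex sets bit 31 of mutex_check
  //so that every present and future waiter gives up, with try_lock_mutex returning false.
  //A host-side analogue using std::atomic, for exposition only (not used by this header):
  /*
  #include <atomic>

  struct TicketLock
  {
    std::atomic<unsigned int> ticket {0};
    std::atomic<unsigned int> check {0};

    bool try_lock()
    {
      const unsigned int my_ticket = ticket.fetch_add(1U);
      unsigned int seen = 0;
      do
      {
        seen = check.load();
      }
      while (seen < my_ticket && !(seen & 0x80000000U));
      return !(seen & 0x80000000U);
    }

    void unlock()  { check.fetch_add(1U); }
    void disable() { check.fetch_or(0x80000000U); }
  };
  */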

  inline __device__ bool check_if_participating(Storage * store)
  {
    const bool locked = try_lock_mutex(store);

    unsigned int old_count = Storage::NumMaxBlocks;

    volatile unsigned int * poll_closed_ptr = static_cast<volatile unsigned int *>(&store->poll_closed);

    if (*poll_closed_ptr == 0)
    {
      old_count = atomicAdd(&store->count, 1);
      store->block_indices[blockIdx.x] = old_count;
      unlock_mutex(store);
    }
    else
    {
      if (locked)
      {
        unlock_mutex(store);
      }
      return false;
    }

    if (*poll_closed_ptr == 0)
    {
      try_lock_mutex(store);
      *poll_closed_ptr = 1;
      disable_mutex(store);
      unlock_mutex(store);
    }

    return (old_count < Storage::NumMaxBlocks);
  }
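
  //Summary (added): each block's reference thread registers itself here once per launch.
  //While the poll is still open (poll_closed == 0), a block takes the ticket lock, claims
  //the next participation index via atomicAdd on count and records it in
  //block_indices[blockIdx.x]. The first block(s) to find the poll still open afterwards
  //close it and disable the mutex, so blocks that only get scheduled later return false
  //and exit the kernel immediately. The software barrier below therefore only waits on
  //blocks that are actually co-resident on the device, which is what lets it avoid
  //deadlocking without a cooperative launch.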

  template <class Condition, class Before, class After, class ... Funcs, class ... Args>
  __device__ void normal_kernel_impl(const Holder<Condition, Before, After, Funcs...> &, Storage * store, Args && ... args)
  {
    __shared__ bool is_participating;

    const bool is_reference_thread = (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);

    if (is_reference_thread)
    {
      is_participating = check_if_participating(store);
    }

    __syncthreads();

    const unsigned int this_block_index = store->block_indices[blockIdx.x];
    const unsigned int total_blocks = min(*static_cast<volatile unsigned int *>(&store->count), Storage::NumMaxBlocks);

    if (is_participating)
    {
      const bool is_reference_block = (this_block_index == 0);

      const unsigned int this_thread_index = threadIdx.z * blockDim.y * blockDim.x +
                                             threadIdx.y * blockDim.x +
                                             threadIdx.x;

      const unsigned int num_threads_per_block = blockDim.x * blockDim.y * blockDim.z;

      Condition checker;

      Before{}(total_blocks, this_block_index, checker, std::forward<Args>(args)...);

      while (!checker(total_blocks, this_block_index, std::forward<Args>(args)...))
      {
        auto helper = [&](auto func)
        {
          func(total_blocks, this_block_index, checker, std::forward<Args>(args)...);

          //Technically, for the foreseeable future,
          //this loop could simply be an 'if',
          //as the maximum number of concurrent blocks
          //on all devices is smaller than 1024...
          if (is_reference_block)
          {
            for (unsigned int block_to_check = this_thread_index + 1; block_to_check < total_blocks; block_to_check += num_threads_per_block)
            {
              unsigned int last_check = 0;

              volatile unsigned int * to_check = static_cast<volatile unsigned int *>(&store->wait_flags[block_to_check]);

              do
              {
                last_check = *to_check;
              }
              while (last_check == 0);
              //When porting to non-CUDA, this may need to be some form of atomic load.
            }

            __syncthreads();

            for (unsigned int block_to_check = this_thread_index + 1; block_to_check < total_blocks; block_to_check += num_threads_per_block)
            {
              __threadfence();
              //*static_cast<volatile unsigned int *>(&store->wait_flags[block_to_check]) = 0;
              atomicAnd(&(store->wait_flags[block_to_check]), 0U);
            }

            __syncthreads();
          }
          else
          {
            __syncthreads();

            if (is_reference_thread)
            {
              atomicOr(&(store->wait_flags[this_block_index]), 1U);

              unsigned int last_check = 1;

              volatile unsigned int * to_check = static_cast<volatile unsigned int *>(&store->wait_flags[this_block_index]);

              __threadfence();

              do
              {
                last_check = *to_check;
              }
              while (last_check != 0);
              //When porting to non-CUDA, this may need to be some form of atomic load.
            }

            __syncthreads();
          }
        };

        (helper(Funcs{}), ...);
      }

      After{}(total_blocks, this_block_index, checker, std::forward<Args>(args)...);
    }

#if CALORECGPU_ITERATE_UNTIL_CONDITION_DEBUG
    if (is_reference_thread)
    {
      printf("%u | %d | %u %u\n", blockIdx.x, static_cast<int>(is_participating), total_blocks, this_block_index);
    }
#endif
  }
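
  //Note (added): inside the loop above, the grid-wide grid.sync() of the cooperative
  //version is replaced by a handshake through wait_flags: every non-reference block
  //raises its flag and its reference thread spins until the flag is cleared again,
  //while block 0 (the reference block) waits until all participating blocks have raised
  //their flags and then clears them, releasing everyone for the next iteration.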

  template <class HolderLike, class ... Args>
  __global__ void normal_kernel(Storage * store, Args ... args)
  {
    normal_kernel_impl(HolderLike{}, store, args...);
  }

  ///Condition, Before, After and Funcs must all be functor classes.
  template <class Condition, class Before, class After, class ... Funcs>
  struct Holder
  {
    template <class ... Args>
    static void execute(const bool use_native_sync,
                        const dim3 & grid_size,
                        const dim3 & block_size,
                        size_t shared_memory,
                        cudaStream_t stream,
                        Storage * gpu_ptr,
                        Args ... args)
    {
#if CALORECGPU_ITERATE_UNTIL_CONDITION_INCLUDE_ASSERTS
      assert(grid_size.x <= Storage::NumMaxBlocks);
      assert(grid_size.y == 1);
      assert(grid_size.z == 1);
#endif

      if (use_native_sync)
      {
        void * arg_ptrs[] = { static_cast<void *>(&args)... };

        cudaLaunchCooperativeKernel((void *) cooperative_kernel<Holder, Args...>,
                                    grid_size,
                                    block_size,
                                    arg_ptrs,
                                    shared_memory,
                                    stream);
      }
      else
      {
        cudaMemsetAsync(static_cast<BasicStorage *>(gpu_ptr), 0, sizeof(BasicStorage), stream);

        normal_kernel<Holder, Args...> <<< grid_size, block_size, shared_memory, stream>>>(gpu_ptr, args...);
      }
    }
  };
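
  //Illustrative sketch (added): the header leaves the choice of use_native_sync to the
  //caller. One possible policy, shown here only as an example, is to require both device
  //support for cooperative launches and a grid small enough to be fully resident, which is
  //what cudaLaunchCooperativeKernel demands. The helper name and the policy itself are
  //assumptions, not part of this file.
  /*
  template <class HolderLike, class ... Args>
  bool can_use_native_sync(const dim3 & grid_size, const dim3 & block_size,
                           const size_t shared_memory, const int device = 0)
  {
    int supports_coop = 0, num_sms = 0, blocks_per_sm = 0;

    cudaDeviceGetAttribute(&supports_coop, cudaDevAttrCooperativeLaunch, device);
    cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device);

    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm,
                                                  cooperative_kernel<HolderLike, Args...>,
                                                  block_size.x * block_size.y * block_size.z,
                                                  shared_memory);

    return supports_coop != 0 &&
           grid_size.x * grid_size.y * grid_size.z <= static_cast<unsigned int>(blocks_per_sm * num_sms);
  }
  */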

  ///Must pass functors!
  template <class Condition, class Before, class After, class ... Funcs>
  auto make_holder(Condition c, Before b, After a, Funcs ... fs)
  {
    return Holder<Condition, Before, After, Funcs...> {};
  }
}

#endif
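
//Usage sketch (added; illustrative only). It reuses the hypothetical ExampleCondition /
//ExampleStep / ExampleNoOp functors sketched earlier, omits error checking and leaves out
//the namespace qualification that the real header provides. make_holder takes the condition,
//the before-functor, the after-functor and then the per-iteration steps, in that order.
//use_native_sync is set to false here, so Holder::execute zeroes the BasicStorage part of
//the buffer itself and relies on the software grid barrier.
/*
void run_iteration(float * device_data, const int num_iterations, cudaStream_t stream)
{
  using MyHolder = decltype(make_holder(ExampleCondition{}, ExampleNoOp{},
                                        ExampleNoOp{}, ExampleStep{}));

  Storage * store = nullptr;
  cudaMalloc((void **) &store, sizeof(Storage));

  const dim3 grid_size(64, 1, 1), block_size(256, 1, 1);

  MyHolder::execute(false, grid_size, block_size, 0, stream, store,
                    device_data, num_iterations);

  cudaStreamSynchronize(stream);
  cudaFree(store);
}
*/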