ATLAS Offline Software
Loading...
Searching...
No Matches
Calorimeter/CaloRecGPU/CaloRecGPU/Helpers.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
3//
4// Dear emacs, this is -*- c++ -*-
5//
6
7#ifndef CALORECGPU_HELPERS_H
8#define CALORECGPU_HELPERS_H
9
10#include <utility>
11#include <type_traits>
12#include <cstring>
13//For memcpy, of all things...
14#include <string>
15#include <cstdio>
16#include <iostream>
17#include <thread>
18#include <mutex>
19#include <shared_mutex>
20#include <memory>
21#include <vector>
22#include <climits>
23#include <new>
24#include <cmath>
25
26#if __cpp_lib_math_constants
27 #include <numbers>
28#endif
29//This is the best way to have pi,
30//but we provide a more manual alternative.
31//Of course, there's also M_PI,
32//but we wanted to ensure the type matched
33//to prevent any GPU-based casting shenanigans.
34
35#if __cpp_lib_bitops
36 #include <bit>
37#endif
38
39namespace CaloRecGPU
40{
41
42#ifndef CUDA_AVAILABLE
43
44 #ifdef __CUDA_ARCH__
45 #define CUDA_AVAILABLE 1
46 #elif __CUDA__
47 #define CUDA_AVAILABLE 1
48 #elif __CUDACC__
49 #define CUDA_AVAILABLE 1
50 #else
51 #define CUDA_AVAILABLE 0
52 #endif
53
54#endif
55
56#if CUDA_AVAILABLE
57
58#define CUDA_HOS_DEV __host__ __device__
59
60
66 CUDA_HOS_DEV inline void CUDA_gpu_assert(cudaError_t code, const char * file, int line, bool abort = true)
67 {
68 if (code != cudaSuccess)
69 {
70 printf("CUDA error: %s (%s %d)\n", cudaGetErrorString(code), file, line);
71 if (abort)
72 {
73#ifdef __CUDA_ARCH__
74 asm("trap;");
75#else
76 exit(code);
77#endif
78 }
79 }
80 }
81
86#define CUDA_ERRCHECK(...) CUDA_ERRCHECK_HELPER(__VA_ARGS__, true)
87
88#define CUDA_ERRCHECK_HELPER(ans, ...) do { ::CaloRecGPU::CUDA_gpu_assert((ans), __FILE__, __LINE__, CUDA_ERRCHECK_GET_FIRST(__VA_ARGS__, true) ); } while(0)
89#define CUDA_ERRCHECK_GET_FIRST(x, ...) x
90
91
92#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 350
93 #if CUDART_VERSION >= 12000
94 #define CUDA_CAN_USE_TAIL_LAUNCH 1
95 #else
96 #define CUDA_CAN_USE_TAIL_LAUNCH 0
97 #endif
98#elif defined(__CUDA_ARCH__)
99 #error "CUDA compute capability at least 3.5 is needed so we can have dynamic parallelism!"
100#endif
101
102
103#else
104
105#define CUDA_HOS_DEV
106#define CUDA_ERRCHECK(...)
107
108#endif
109
110 namespace CUDA_Helpers
111 {
112
114 {
115 void * ptr = nullptr;
116
117 template <class T = const void>
118 constexpr operator T * () const
119 {
120 return (T *) ptr;
121 }
122
123 constexpr operator bool() const
124 {
125 return ptr != nullptr;
126 }
127
128 template <class T>
130 {
131 }
132
134 };
135 //Can't do much more than this
136 //since cudaStream_t is a typedef...
137 //Though not typesafe, it is still
138 //semantically more safe than a naked void *...
139
143 void * allocate(const size_t num);
144
148 void deallocate(void * address);
149
153 void * allocate_pinned(const size_t num);
154
158 void deallocate_pinned(void * address);
159
160
164 void GPU_to_CPU(void * dest, const void * const source, const size_t num);
165
169 void CPU_to_GPU(void * dest, const void * const source, const size_t num);
170
174 void GPU_to_GPU(void * dest, const void * const source, const size_t num);
175
176
180 void GPU_to_CPU_async(void * dest, const void * const source, const size_t num, CUDAStreamPtrHolder stream = {});
181
185 void CPU_to_GPU_async(void * dest, const void * const source, const size_t num, CUDAStreamPtrHolder stream = {});
186
190 void GPU_to_GPU_async(void * dest, const void * const source, const size_t num, CUDAStreamPtrHolder stream = {});
191
196
200 void optimize_block_and_grid_size(void * func, int & block_size, int & grid_size, const int dynamic_memory = 0, const int block_size_limit = 0);
201
205 void optimize_block_and_grid_size_for_cooperative_launch(void * func, int & block_size, int & grid_size, const int dynamic_memory = 0, const int block_size_limit = 0);
206
208
210
211 std::string GPU_name();
212 }
213
214 namespace Helpers
215 {
216
218 template <class T1, class T2>
219 inline constexpr auto int_ceil_div(const T1 num, const T2 denom)
220 {
221 return num / denom + (num % denom != 0);
222 }
223
225 template <class T1, class T2>
226 inline constexpr auto int_floor_div(const T1 num, const T2 denom)
227 {
228 return num / denom;
229 }
230
232 template <class Base = float, class Exp = int>
233 inline constexpr Base compile_time_pow2(const Exp exp)
234 {
235 Base ret = 1;
236 if (exp < 0)
237 {
238 for (Exp i = 0; i < -exp; ++i)
239 {
240 ret /= Base(2);
241 }
242 }
243 else
244 {
245 for (Exp i = 0; i < exp; ++i)
246 {
247 ret *= Base(2);
248 }
249 }
250 return ret;
251 }
252 //Though we could possibly bit-hack stuff due to IEEE-754 reliance elsewhere,
253 //it's not valid and type-safe C++...
254 //Since it's compile-time, this being a trifle slower is meaningless.
255
  /*! @brief Returns the number of bits needed to represent @p num:
   *         0 for @p num == 0, and the 1-based position of the highest
   *         set bit otherwise (i.e. floor(log2(num)) + 1 for num > 0).
   *
   *  NOTE(review): despite the name, for exact powers of two this returns
   *  log2(num) + 1 rather than log2(num) — both branches below compute the
   *  bit width, not a mathematical ceiling. Confirm callers expect this.
   *
   *  @p num should be an unsigned integer type (std::countl_zero requires one).
   */
  template <class T>
  inline constexpr unsigned int int_ceil_log_2(T num)
  {
#if __cpp_lib_bitops
    return sizeof(T) * CHAR_BIT - std::countl_zero(num);
#else
    //Portable fallback: scan a 64-bit mask from the most significant bit
    //downwards; ret tracks the (1-based) position of the current bit.
    unsigned int ret = 64;

    for (unsigned long long int mask = 0x8000000000000000U; mask > 0; mask >>= 1U)
      {
        if (num & mask)
          {
            return ret;
          }
        --ret;
      }

    //Only reached for num == 0, where ret has decayed to 0.
    return ret;
#endif
  }
278
281 template <class T>
282 inline constexpr unsigned char Pearson_hash(const T number)
283 {
284 constexpr unsigned char initial_value = 42;
285 //The answer.
286
287 constexpr unsigned char c_mult = 7;
288 constexpr unsigned char c_add = 1;
289 //For our "look up table": table[i] = c_mult * i + c_add
290 //For an appropriate choice of constants (such as this),
291 //this will be bijective (modulo 255), as required.
292
293 unsigned char ret = initial_value;
294
295 for (unsigned int i = 0; i < sizeof(T); i += sizeof(unsigned char))
296 {
297 const unsigned char to_hash = number >> (i * CHAR_BIT);
298 const unsigned char operand = ret ^ to_hash;
299 ret = c_mult * operand + c_add;
300 }
301
302 return ret;
303 }
304
305
308 template <class T>
309 inline constexpr unsigned short Pearson_hash_16_bit(const T number)
310 {
311 constexpr unsigned short initial_value = 42754;
312 //The answer and the standard.
313
314 constexpr unsigned short c_mult = 7;
315 constexpr unsigned short c_add = 1;
316 //For our "look up table": table[i] = c_mult * i + c_add
317 //For an appropriate choice of constants (such as this),
318 //this will be bijective (modulo 255), as required.
319
320 unsigned short ret = initial_value;
321
322 for (unsigned int i = 0; i < sizeof(T); i += sizeof(unsigned short))
323 {
324 const unsigned short to_hash = number >> (i * CHAR_BIT);
325 const unsigned short operand = ret ^ to_hash;
326 ret = c_mult * operand + c_add;
327 }
328
329 return ret;
330 }
331
332
  /*! @brief Mathematical constants, templated on the floating-point type so
   *         that host and device code get exactly the requested precision
   *         (avoiding implicit double -> float conversion shenanigans on GPU).
   */
  namespace Constants
  {
#ifdef __cpp_lib_math_constants
    //Prefer the standard constants when <numbers> is available (C++20).
    template <class T>
    inline constexpr T pi = std::numbers::pi_v<T>;

    template <class T>
    inline constexpr T sqrt2 = std::numbers::sqrt2_v<T>;
#else
    //Manual fallback: long-double literals with far more digits than any
    //supported floating-point type holds; excess digits are discarded
    //on conversion to T.
    template <class T>
    inline constexpr T pi = T(3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679821480865132823066470938446095505822317253594081284811174502841027019385211055596446229489549303819644288109756659334461284756482337867831652712019091456485669234603486104543266482133936072602491412737245870066063155881748815209209628292540917153643678925903600113305305488204665213841469519415116094330572703657595919530921861173819326117931051185480744623799627495673518857527248912279381830119491298336733624L);

    template <class T>
    inline constexpr T sqrt2 = T(1.4142135623730950488016887242096980785696718753769480731766797379907324784621070388503875343276415727350138462309122970249248360558507372126441214970999358314132226659275055927557999505011527820605714701095599716059702745345968620147285174186408891986095523292304843087143214508397626036279952514079896872533965463318088296406206152583523950547457502877599617298355752203375318570113543746034084988471603868999706990048150305440277903164542478230684929369186215805784631115966687130130156185689872372352885092649L);
#endif

    //1/sqrt(2) is not provided by std::numbers, so it is always spelled out.
    template <class T>
    inline constexpr T inv_sqrt2 = T(0.70710678118654752440084436210484903928483593768847403658833986899536623923105351942519376716382078636750692311545614851246241802792536860632206074854996791570661133296375279637789997525057639103028573505477998580298513726729843100736425870932044459930477616461524215435716072541988130181399762570399484362669827316590441482031030762917619752737287514387998086491778761016876592850567718730170424942358019344998534950240751527201389515822712391153424646845931079028923155579833435650650780928449361861764425463243L);
    //Why is this not in the C++ constants?!

  }
355
  /*! @brief Inverse error function: on the device, defers to CUDA's
   *         @c erfinvf; on the host, Newton-iterates on erf, with the
   *         implementation copied from ROOT (TMath::ErfInverse), in
   *         single precision.
   *
   *  Returns 0 when |x| >= 1 or when the iteration does not converge.
   *
   *  NOTE(review): kEps = 1e-14f is far below float resolution (~1.2e-7),
   *  so the convergence tests below can effectively only be satisfied by
   *  exact equality; in practice the loop may often run all kMaxit
   *  iterations — confirm this matches the intended accuracy/cost.
   */
  CUDA_HOS_DEV static inline
  float erf_inv_wrapper (const float x)
  {
    using namespace std;
#ifdef __CUDA_ARCH__
    return erfinvf(x);
#else
    //Copied directly from ROOT...

    int kMaxit = 50;
    float kEps = 1e-14f;
    float kConst = 0.8862269254527579f; // sqrt(pi)/2.0

    //For tiny arguments, erf_inv(x) ~ (sqrt(pi)/2) * x to first order.
    if (abs(x) <= kEps)
      {
        return kConst * x;
      }

    // Newton iterations
    float erfi, derfi, y0, y1, dy0, dy1;
    if (fabsf(x) < 1.0f)
      {
        //Initial guess and initial step for the secant-like update below.
        erfi = kConst * fabsf(x);
        y0 = erff(0.9f * erfi);
        derfi = 0.1f * erfi;
        for (int iter = 0; iter < kMaxit; iter++)
          {
            y1 = 1.f - erfc(erfi);
            dy1 = fabsf(x) - y1;
            //Converged on the function value: return with the sign of x.
            if (fabsf(dy1) < kEps)
              {
                if (x < 0)
                  {
                    return -erfi;
                  }
                else
                  {
                    return erfi;
                  }
              }
            //Secant update of the step, then advance the estimate.
            dy0 = y1 - y0;
            derfi *= dy1 / dy0;
            y0 = y1;
            erfi += derfi;
            //Converged on the relative step size.
            if (fabsf(derfi / erfi) < kEps)
              {
                if (x < 0.f)
                  {
                    return -erfi;
                  }
                else
                  {
                    return erfi;
                  }
              }
          }
      }
    return 0; //did not converge
#endif
  }
416
417 //Food for thought: any sort of proper argument reduction here?
418 //(E. g. Cody-Waite or Payne-Hanek algorithm?)
419
420 CUDA_HOS_DEV static inline
421 float regularize_angle(const float b, const float a = 0.f)
422 //a. k. a. proxim in Athena code.
423 {
424 using namespace std;
425 constexpr float pi = Helpers::Constants::pi<float>;
426 constexpr float two_pi = 2 * pi;
427 const float ret = remainderf(b, two_pi);
428 return ret + ((ret < a - pi) - (ret > a + pi)) * two_pi;
429 }
430
431 CUDA_HOS_DEV static inline
432 double regularize_angle(const double b, const double a = 0.)
433 //a. k. a. proxim in Athena code.
434 {
435 using namespace std;
436 constexpr double pi = Helpers::Constants::pi<double>;
437 constexpr double two_pi = 2 * pi;
438 const double ret = remainderf(b, two_pi);
439 return ret + ((ret < a - pi) - (ret > a + pi)) * two_pi;
440 }
441
442 template <class T>
443 CUDA_HOS_DEV static inline
444 T angular_difference(const T x, const T y)
445 {
447 }
448
449 CUDA_HOS_DEV static inline
450 float eta_from_coordinates(const float x, const float y, const float z)
451 {
452 using namespace std;
453
454 if (x != 0 || y != 0)
455 {
456#ifdef __CUDA_ARCH__
457 const float m = norm3df(x, y, z);
458#else
459 const float m = hypot(x, y, z);
460#endif
461 return 0.5f * logf((m + z) / (m - z));
462 }
463 else
464 {
465 constexpr float s_etaMax = 22756.0f;
466 return z + ((z > 0) - (z < 0)) * s_etaMax;
467 }
468 }
469
470 CUDA_HOS_DEV static inline
471 double eta_from_coordinates(const double x, const double y, const double z)
472 {
473 using namespace std;
474 if (x != 0 || y != 0)
475 {
476#ifdef __CUDA_ARCH__
477 const float m = norm3d(x, y, z);
478#else
479 const float m = hypot(x, y, z);
480#endif
481 return 0.5 * log((m + z) / (m - z));
482 }
483 else
484 {
485 constexpr double s_etaMax = 22756.0;
486 return z + ((z > 0) - (z < 0)) * s_etaMax;
487 }
488 }
489
  /*! @brief One step of a Neumaier-style compensated summation: adds
   *         @p to_add to @p sum, accumulating the rounding error of the
   *         addition into @p corr.
   *
   *  Both candidate error terms are computed and selected branchlessly via
   *  the comparison result — presumably to avoid divergence when this runs
   *  on the GPU (NOTE(review): confirm that is the intent).
   */
  CUDA_HOS_DEV static inline
  void partial_kahan_babushka_neumaier_sum(const float & to_add, float & sum, float & corr)
  {
    const float t = sum + to_add;

    //If sum dominates, the low-order bits of to_add were lost in t,
    //and vice versa; pick the matching error reconstruction.
    const bool test = fabsf(sum) >= fabsf(to_add);

    const float opt_1 = (sum - t) + to_add;
    const float opt_2 = (to_add - t) + sum;

    corr += (test) * opt_1 + (!test) * opt_2;

    sum = t;
  }
506
  /*! @brief Sums the given floats left to right with Neumaier compensation,
   *         returning the sum plus the accumulated correction term.
   *
   *  Only participates in overload resolution when every argument decays
   *  to @c float (enforced by the @c enable_if_t fold over the pack).
   */
  template < class ... Floats, class disabler = std::enable_if_t < (std::is_same_v<std::decay_t<Floats>, float> && ...) > >
  CUDA_HOS_DEV inline
  float sum_kahan_babushka_neumaier(const Floats & ... fs)
  {
    float ret = 0.f;
    float corr = 0.f;

    //Fold expression: apply the compensated partial sum to each argument in order.
    (partial_kahan_babushka_neumaier_sum(fs, ret, corr), ...);

    return ret + corr;
  }
520
521
522#if CUDA_AVAILABLE
  /*! @brief Device-side, atomic Neumaier compensated addition of @p v into
   *         @c sum_arr[idx], with the rounding error accumulated into
   *         @c corr_arr[idx].
   *
   *  Relies on atomicAdd returning the pre-update value to reconstruct the
   *  rounding error of this particular addition. The two atomicAdd calls are
   *  separate operations, not one transaction, so the sum and its correction
   *  can transiently disagree while other threads are updating.
   */
  __device__ static inline
  void device_kahan_babushka_neumaier(float * sum_arr, float * corr_arr, const float v, const int idx = 0)
  {
    //atomicAdd returns the value held before this thread's update.
    const float old_sum = atomicAdd(sum_arr + idx, v);
    const float new_sum = old_sum + v;
    if (fabsf(old_sum) >= fabsf(v))
      {
        //The previous sum dominates: recover the bits of v that were lost.
        atomicAdd(corr_arr + idx, (old_sum - new_sum) + v);
      }
    else
      {
        //v dominates: recover the bits of the previous sum that were lost.
        atomicAdd(corr_arr + idx, (v - new_sum) + old_sum);
      }
  }
543#endif
544
545 //Algorithm that calculates a * b + c * d with better precision using FMA,
546 //following "Error bounds on complex floating-point multiplication with an FMA"
547 //by Jeannerod et. al.
548 CUDA_HOS_DEV static inline
549 float product_sum_cornea_harrison_tang(const float a, const float b, const float c, const float d)
550 {
551 using namespace std;
552
553 const float w_1 = a * b;
554 const float w_2 = c * d;
555
556 const float e_1 = fmaf(a, b, -w_1);
557 const float e_2 = fmaf(c, d, -w_2);
558
559 return sum_kahan_babushka_neumaier(w_1, w_2, e_1, e_2);
560 }
561
562 //Generalization of the Cornea-Harrison-Tang algorithm for dot products.
563 CUDA_HOS_DEV inline static
564 float corrected_dot_product(const float a_1, const float a_2, const float a_3,
565 const float b_1, const float b_2, const float b_3)
566 {
567 using namespace std;
568
569 const float w_1 = a_1 * b_1;
570 const float w_2 = a_2 * b_2;
571 const float w_3 = a_3 * b_3;
572
573 const float e_1 = fmaf(a_1, b_1, -w_1);
574 const float e_2 = fmaf(a_2, b_2, -w_2);
575 const float e_3 = fmaf(a_3, b_3, -w_3);
576
577 return sum_kahan_babushka_neumaier(w_1, w_2, w_3, e_1, e_2, e_3);
578 }
579
580 //Generalization of the Cornea-Harrison-Tang algorithm for dot products.
581 inline CUDA_HOS_DEV
582 float corrected_dot_product(const float (&a)[3], const float (&b)[3])
583 {
584 return corrected_dot_product(a[0], a[1], a[2], b[0], b[1], b[2]);
585 }
586
587 //Cross product using the Cornea-Harrison-Tang algorithm.
588 CUDA_HOS_DEV inline static
589 void corrected_cross_product(float (&res)[3], const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
590 {
591 res[0] = product_sum_cornea_harrison_tang(a2, b3, -a3, b2);
592 res[1] = product_sum_cornea_harrison_tang(a3, b1, -a1, b3);
593 res[2] = product_sum_cornea_harrison_tang(a1, b2, -a2, b1);
594 }
595
596 //Cross product using the Cornea-Harrison-Tang algorithm.
597 CUDA_HOS_DEV inline static
598 void corrected_cross_product(float (&res)[3], const float (&x)[3], const float (&y)[3])
599 {
600 corrected_cross_product(res, x[0], x[1], x[2], y[0], y[1], y[2]);
601 }
602
603 //Magnitude of a cross product using the generalization of the Cornea-Harrison-Tang algorithm.
604 CUDA_HOS_DEV inline static
605 float corrected_magn_cross_product(const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
606 {
607 using namespace std;
608
609 const float r_1 = product_sum_cornea_harrison_tang(a2, b3, -a3, b2);
610 const float r_2 = product_sum_cornea_harrison_tang(a3, b1, -a1, b3);
611 const float r_3 = product_sum_cornea_harrison_tang(a1, b2, -a2, b1);
612
613#ifdef __CUDA_ARCH__
614 return norm3df(r_1, r_2, r_3);
615#else
616 return hypot(r_1, r_2, r_3);
617#endif
618
619 }
620
621 //Magnitude of a cross product using the generalization of the Cornea-Harrison-Tang algorithm.
622 CUDA_HOS_DEV inline static
623 float corrected_magn_cross_product(const float (&x)[3], const float (&y)[3])
624 {
625 return corrected_magn_cross_product(x[0], x[1], x[2], y[0], y[1], y[2]);
626 }
627
630 {
631 struct CPU
632 {
633 constexpr static char const * name = "CPU";
634 };
635 struct CUDAGPU
636 {
637 constexpr static char const * name = "CUDA GPU";
638 };
640 {
641 constexpr static char const * name = "CUDA Pinned CPU";
642 };
643 }
644
646 template <class T, class indexer>
648 {
649 private:
650 template <class C, class dummy = void> struct unary_helper;
651
652 template <class dummy> struct unary_helper<MemoryContext::CPU, dummy>
653 {
654 static inline T * allocate(const indexer size)
655 {
656 return new T[size];
657 }
658
659 static inline void deallocate(T *& arr)
660 {
661 delete[] arr;
662 }
663
664 };
665
666 template <class dummy> struct unary_helper<MemoryContext::CUDAGPU, dummy>
667 {
668 static inline T * allocate(const indexer size)
669 {
670 return static_cast<T *>(CUDA_Helpers::allocate(sizeof(T) * size));
671 }
672
673 static inline void deallocate(T *& arr)
674 {
676 }
677 };
678
679
680 template <class dummy> struct unary_helper<MemoryContext::CUDAPinnedCPU, dummy>
681 {
682 static inline T * allocate(const indexer size)
683 {
684 return static_cast<T *>(CUDA_Helpers::allocate_pinned(sizeof(T) * size));
685 }
686
687 static inline void deallocate(T *& arr)
688 {
690 }
691 };
692
693 template <class C1, class C2, class dummy = void> struct copy_helper;
694
695 template <class dummy> struct copy_helper<MemoryContext::CPU, MemoryContext::CPU, dummy>
696 {
697 static inline void copy (T * dest, const T * const source, const indexer sz)
698 {
699 std::memcpy(dest, source, sizeof(T) * sz);
700 }
701 };
702
703 template <class dummy> struct copy_helper<MemoryContext::CPU, MemoryContext::CUDAGPU, dummy>
704 {
705 static inline void copy (T * dest, const T * const source, const indexer sz)
706 {
707 CUDA_Helpers::GPU_to_CPU(dest, source, sizeof(T) * sz);
708 }
709 };
710
711 template <class dummy> struct copy_helper<MemoryContext::CUDAGPU, MemoryContext::CUDAGPU, dummy>
712 {
713 static inline void copy (T * dest, const T * const source, const indexer sz)
714 {
715 CUDA_Helpers::GPU_to_GPU(dest, source, sizeof(T) * sz);
716 }
717 };
718
719 template <class dummy> struct copy_helper<MemoryContext::CUDAGPU, MemoryContext::CPU, dummy>
720 {
721 static inline void copy (T * dest, const T * const source, const indexer sz)
722 {
723 CUDA_Helpers::CPU_to_GPU(dest, source, sizeof(T) * sz);
724 }
725 };
726
727 template <class dummy> struct copy_helper<MemoryContext::CUDAPinnedCPU, MemoryContext::CPU, dummy>
728 {
729 static inline void copy (T * dest, const T * const source, const indexer sz)
730 {
731 std::memcpy(dest, source, sizeof(T) * sz);
732 }
733 };
734
735 template <class dummy> struct copy_helper<MemoryContext::CPU, MemoryContext::CUDAPinnedCPU, dummy>
736 {
737 static inline void copy (T * dest, const T * const source, const indexer sz)
738 {
739 std::memcpy(dest, source, sizeof(T) * sz);
740 }
741 };
742
743 template <class dummy> struct copy_helper<MemoryContext::CUDAPinnedCPU, MemoryContext::CUDAPinnedCPU, dummy>
744 {
745 static inline void copy (T * dest, const T * const source, const indexer sz)
746 {
747 std::memcpy(dest, source, sizeof(T) * sz);
748 }
749 };
750
751 template <class dummy> struct copy_helper<MemoryContext::CUDAPinnedCPU, MemoryContext::CUDAGPU, dummy>
752 {
753 static inline void copy (T * dest, const T * const source, const indexer sz)
754 {
755 CUDA_Helpers::GPU_to_CPU(dest, source, sizeof(T) * sz);
756 }
757 };
758
759 template <class dummy> struct copy_helper<MemoryContext::CUDAGPU, MemoryContext::CUDAPinnedCPU, dummy>
760 {
761 static inline void copy (T * dest, const T * const source, const indexer sz)
762 {
763 CUDA_Helpers::CPU_to_GPU(dest, source, sizeof(T) * sz);
764 }
765 };
766
767
768 template <class C1, class C2, class dummy = void> struct move_helper;
769
770 template <class C1, class C2, class dummy> struct move_helper
771 {
772 inline static void move(T *& dest, T *& source, const indexer sz)
773 {
777 }
778 };
779
780 template <class C, class dummy> struct move_helper<C, C, dummy>
781 {
782 inline static void move(T *& dest, T *& source, const indexer)
783 {
784 dest = source;
785 source = nullptr;
786 }
787 };
788
789 public:
791 template <class Context> static inline T * allocate(const indexer size)
792 {
793 T * ret = nullptr;
794 if (size > 0)
795 {
797 }
798#if CALORECGPU_HELPERS_DEBUG
799 std::cerr << "ALLOCATED " << size << " x " << sizeof(T) << " in " << Context::name << ": " << ret << std::endl;
800#endif
801 return ret;
802 }
803
805 template <class Context> static inline void deallocate(T *& arr)
806 {
807 if (arr == nullptr)
808 //This check is to ensure the code behaves on non-CUDA enabled platforms
809 //where some destructors might still be called with nullptr.
810 {
811 return;
812 }
814#if CALORECGPU_HELPERS_DEBUG
815 std::cerr << "DEALLOCATED in " << Context::name << ": " << arr << std::endl;
816#endif
817 arr = nullptr;
818 }
819
820
822 template <class DestContext, class SourceContext>
823 static inline void copy(T * dest, const T * const source, const indexer sz)
824 {
825 if (sz > 0 && source != nullptr)
826 {
828 }
829#if CALORECGPU_HELPERS_DEBUG
830 std::cerr << "COPIED " << sz << " from " << SourceContext::name << " to " << DestContext::name << ": " << source << " to " << dest << std::endl;
831#endif
832 }
833
834
840 template <class DestContext, class SourceContext>
841 static inline void move(T *& dest, T *& source, const indexer sz)
842 {
843#if CALORECGPU_HELPERS_DEBUG
844 std::cerr << "MOVED " << sz << " from " << SourceContext::name << " to " << DestContext::name << ": " << source << " to " << dest;
845#endif
846 if (sz > 0 && source != nullptr)
847 {
849 }
850 else
851 {
852 dest = nullptr;
854 }
855#if CALORECGPU_HELPERS_DEBUG
856 std::cerr << " | " << source << " to " << dest << std::endl;
857#endif
858 }
859
860 };
861
868 template <class T, class indexer, class Context, bool hold_arrays = true>
870
871 template <class T, class indexer, class Context>
872 class SimpleContainer<T, indexer, Context, true>
873 {
874 static_assert(std::is_trivially_copyable<T>::value, "SimpleContainer only works with a trivially copyable type.");
876 indexer m_size;
877
878 template <class a, class b, class c, bool d> friend class SimpleContainer;
879
881
882 public:
883
884 CUDA_HOS_DEV inline indexer size() const
885 {
886 return m_size;
887 }
888
889 CUDA_HOS_DEV inline T & operator[] (const indexer i)
890 {
891 return m_array[i];
892 }
893
894 CUDA_HOS_DEV inline const T & operator[] (const indexer i) const
895 {
896 return m_array[i];
897 }
898
899 inline void clear()
900 {
902 m_size = 0;
903 }
904
905 inline void resize(const indexer new_size)
906 {
907 if (new_size == 0)
908 {
909 clear();
910 }
911 else if (new_size != m_size)
912 {
913 T * temp = m_array;
914 m_array = Manager::template allocate<Context>(new_size);
915 Manager::template copy<Context, Context>(m_array, temp, (m_size < new_size ? m_size : new_size));
916 Manager::template deallocate<Context>(temp);
917 m_size = new_size;
918 }
919 }
920
922 {
923 }
924
925 SimpleContainer(const indexer sz) : m_size(sz)
926 {
927 m_array = Manager::template allocate<Context>(sz);
928 }
929
933 SimpleContainer(T * other_array, const indexer sz) : m_size(sz)
934 {
935 m_array = Manager::template allocate<Context>(sz);
936 Manager::template copy<Context, Context>(m_array, other_array, sz);
937 }
938
940 {
941 m_array = Manager::template allocate<Context>(m_size);
942 Manager::template copy<Context, Context>(m_array, other.m_array, m_size);
943 }
944
946 {
947 m_array = nullptr;
948 Manager::template move<Context, Context>(m_array, other.m_array, m_size);
949 other.m_size = 0;
950 }
951
952 template <class other_indexer, class other_context, bool other_hold>
954 m_size(other.m_size)
955 {
956
957 m_array = Manager::template allocate<Context>(m_size);
958 Manager::template copy<Context, other_context>(m_array, other.m_array, m_size);
959 }
960
961 template <class other_indexer, class other_context>
963 m_size(other.m_size)
964 {
965 m_array = nullptr;
966 Manager::template move<Context, other_context>(m_array, other.m_array, m_size);
967 other.m_size = 0;
968 }
969
970 SimpleContainer & operator= (const SimpleContainer & other)
971 {
972 if (this == &other)
973 {
974 return (*this);
975 }
976 else
977 {
978 resize(other.size());
979 Manager::template copy<Context, Context>(m_array, other.m_array, m_size);
980 return (*this);
981 }
982 }
983
984 SimpleContainer & operator= (SimpleContainer && other)
985 {
986 if (this == &other)
987 {
988 return (*this);
989 }
990 else
991 {
992 clear();
993 Manager::template move<Context, Context>(m_array, other.m_array, other.size());
994 m_size = other.m_size;
995 other.m_size = 0;
996 return (*this);
997 }
998 }
999
1000
1001 template <class other_indexer, class other_context, bool other_hold>
1003 {
1004 resize(other.m_size);
1005 Manager::template copy<Context, other_context>(m_array, other.m_array, m_size);
1006 return (*this);
1007 }
1008
1009 template <class other_indexer, class other_context>
1011 {
1012 clear();
1013 Manager::template move<Context, other_context>(m_array, other.m_array, other.m_size);
1014 m_size = other.m_size;
1015 other.m_size = 0;
1016 return (*this);
1017 }
1018
1020 {
1021 Manager::template deallocate<Context>(m_array);
1022 m_size = 0;
1023 }
1024
1025 CUDA_HOS_DEV operator const T * () const
1026 {
1027 return m_array;
1028 }
1029
1030 CUDA_HOS_DEV operator T * ()
1031 {
1032 return m_array;
1033 }
1034
1035 CUDA_HOS_DEV operator const void * () const
1036 {
1037 return m_array;
1038 }
1039
1040 CUDA_HOS_DEV operator void * ()
1041 {
1042 return m_array;
1043 }
1044
1045 template <class stream, class str = std::basic_string<typename stream::char_type> >
1046 void textual_output(stream & s, const str & separator = " ") const
1047 {
1048 if (std::is_same<Context, MemoryContext::CPU>::value)
1049 {
1050 s << m_size << separator;
1051 for (indexer i = 0; i < m_size - 1; ++i)
1052 {
1053 s << m_array[i] << separator;
1054 }
1055 s << m_array[m_size - 1];
1056 }
1057 else
1058 {
1060 other.textual_output(s, separator);
1061 }
1062 }
1063
1064 template <class stream>
1065 void textual_input(stream & s)
1066 {
1067 if (std::is_same<Context, MemoryContext::CPU>::value)
1068 {
1069 indexer new_size;
1070 s >> new_size >> std::ws;
1071 if (s.fail())
1072 {
1073 //Throw errors, perhaps? Don't know if we can/should use exceptions...
1074 std::cerr << "FAILED READING " << this << "!" << std::endl;
1075 new_size = 0;
1076 }
1077 resize(new_size);
1078 for (indexer i = 0; i < m_size - 1; ++i)
1079 {
1080 s >> m_array[i];
1081 s >> std::ws;
1082 }
1083 s >> m_array[m_size - 1];
1084 }
1085 else
1086 {
1088 other.textual_input(s);
1089 (*this) = other;
1090 }
1091 }
1092
1093 template <class stream>
1094 void binary_output(stream & s) const
1095 {
1096 if (std::is_same<Context, MemoryContext::CPU>::value)
1097 {
1098 s.write(reinterpret_cast<const char *>(&m_size), sizeof(indexer));
1099 for (indexer i = 0; i < m_size; ++i)
1100 {
1101 s.write(reinterpret_cast<const char *>(m_array + i), sizeof(T));
1102 }
1103 }
1104 else
1105 {
1107 other.binary_output(s);
1108 }
1109 }
1110
1111 template <class stream>
1112 void binary_input(stream & s)
1113 {
1114 if (std::is_same<Context, MemoryContext::CPU>::value)
1115 {
1116 indexer new_size;
1117 s.read(reinterpret_cast<char *>(&new_size), sizeof(indexer));
1118 if (s.fail())
1119 {
1120 //Throw errors, perhaps? Don't know if we can/should use exceptions...
1121 std::cerr << "FAILED READING " << this << "!" << std::endl;
1122 new_size = 0;
1123 }
1124 resize(new_size);
1125 for (indexer i = 0; i < m_size; ++i)
1126 {
1127 s.read(reinterpret_cast<char *>(m_array + i), sizeof(T));
1128 }
1129 }
1130 else
1131 {
1133 other.binary_input(s);
1134 (*this) = other;
1135 }
1136 }
1137
1138 };
1139
1140 template <class T, class indexer, class Context>
1141 class SimpleContainer<T, indexer, Context, false>
1142 {
1143 static_assert(std::is_trivially_copyable<T>::value, "SimpleContainer only works with a trivially copyable type.");
1145 indexer m_size;
1146
1148
1149 template <class a, class b, class c, bool d> friend class SimpleContainer;
1150
1151 public:
1152
1153 CUDA_HOS_DEV inline indexer size() const
1154 {
1155 return m_size;
1156 }
1157
1158 CUDA_HOS_DEV inline T & operator[] (const indexer i)
1159 {
1160 return m_array[i];
1161 }
1162
1163 CUDA_HOS_DEV inline const T & operator[] (const indexer i) const
1164 {
1165 return m_array[i];
1166 }
1167
1168 // cppcheck-suppress uninitMemberVar
1169 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1171 {
1172 }
1173
1177 // cppcheck-suppress uninitMemberVar
1178 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1179 CUDA_HOS_DEV SimpleContainer(T * other_array, const indexer sz) : m_array(other_array), m_size(sz)
1180 {
1181 }
1182
1183 template <class other_indexer, bool other_hold>
1184 // cppcheck-suppress uninitMemberVar
1185 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1191
1192 // cppcheck-suppress operatorEqVarError
1193 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1195 {
1196 if (this == &other)
1197 {
1198 return (*this);
1199 }
1200 else
1201 {
1202 m_array = other.m_array;
1203 m_size = other.m_size;
1204 return (*this);
1205 }
1206 }
1207
1208 template <class other_indexer, bool other_hold>
1209 // cppcheck-suppress operatorEqVarError
1210 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1212 {
1213 m_size = other.m_size;
1214 m_array = other.m_array;
1215 return (*this);
1216 }
1217
1218 CUDA_HOS_DEV operator const T * () const
1219 {
1220 return m_array;
1221 }
1222
1223 CUDA_HOS_DEV operator T * ()
1224 {
1225 return m_array;
1226 }
1227
1228 CUDA_HOS_DEV operator const void * () const
1229 {
1230 return m_array;
1231 }
1232
1233 CUDA_HOS_DEV operator void * ()
1234 {
1235 return m_array;
1236 }
1237 };
1238
1240 template <class T, class indexer = unsigned int>
1242
1244 template <class T, class indexer = unsigned int>
1246
1248 template <class T, class indexer = unsigned int>
1250
1257 template <class T, class Context, bool hold_object = true>
1259
1260 template <class T, class Context>
1261 class SimpleHolder<T, Context, true>
1262 {
1263 static_assert(std::is_trivially_copyable<T>::value, "SimpleHolder only works with a trivially copyable type.");
1264
1265 using indexer = unsigned int;
1266
1268
1270
1271 template <class a, class b, bool c> friend class SimpleHolder;
1272
1273 public:
1274
1275 CUDA_HOS_DEV const T & operator *() const
1276 {
1277 return *m_object;
1278 }
1279
1281 {
1282 return *m_object;
1283 }
1284
1285 CUDA_HOS_DEV const T * operator ->() const
1286 {
1287 return m_object;
1288 }
1289
1290 CUDA_HOS_DEV T * operator ->()
1291 {
1292 return m_object;
1293 }
1294
1295 CUDA_HOS_DEV inline bool valid() const
1296 {
1297 return m_object != nullptr;
1298 }
1299
1300 inline void clear()
1301 {
1302 Manager::template deallocate<Context>(m_object);
1303 }
1304
1305 inline void allocate()
1306 {
1307 if (m_object == nullptr)
1308 {
1309 m_object = Manager::template allocate<Context>(1);
1310 }
1311 }
1312
1314 {
1315 }
1316
1317 SimpleHolder(const bool really_allocate)
1318 {
1319 if (really_allocate)
1320 {
1321 m_object = Manager::template allocate<Context>(1);
1322 }
1323 else
1324 {
1325 m_object = nullptr;
1326 }
1327 }
1328
1332 template < class X, class disabler = typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >::type >
1333 explicit SimpleHolder(X * other_p)
1334 {
1335 m_object = Manager::template allocate<Context>(1);
1336 Manager::template copy<Context, Context>(m_object, other_p, 1);
1337 }
1338
1342 template < class X, class disabler = typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >::type >
1343 SimpleHolder(const X & other_v) : SimpleHolder(&other_v)
1344 {
1345
1346
1347 }
1348
1350 {
1351 if (other.valid())
1352 {
1353 m_object = Manager::template allocate<Context>(1);
1354 Manager::template copy<Context, Context>(m_object, other.m_object, other.valid());
1355 }
1356 else
1357 {
1358 m_object = nullptr;
1359 }
1360 }
1361
1362 template < class X, class other_context, bool other_hold,
1363 class disabler = typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >::type >
1365 {
1366 if (other.valid())
1367 {
1368 m_object = Manager::template allocate<Context>(1);
1369 Manager::template copy<Context, other_context>(m_object, other.m_object, other.valid());
1370 }
1371 else
1372 {
1373 m_object = nullptr;
1374 }
1375 }
1376
1378 {
1379 m_object = nullptr;
1380 Manager::template move<Context, Context>(m_object, other.m_object, other.valid());
1381 }
1382
1383 template < class X, class other_context,
1384 class disabler = typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >::type >
1386 {
1387 m_object = nullptr;
1388 Manager::template move<Context, other_context>(m_object, other.m_object, other.valid());
1389 }
1390
1391 // cppcheck-suppress operatorEqVarError
1392 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1393 SimpleHolder & operator= (const SimpleHolder & other)
1394 {
1395 if (!valid() && other.valid())
1396 {
1397 allocate();
1398 }
1399 if (&other != this)
1400 {
1401 Manager::template copy<Context, Context>(m_object, other.m_object, other.valid());
1402 }
1403 return (*this);
1404 }
1405
1406 template < class X, class other_context, bool other_hold,
1407 class disabler = typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >::type >
1408 // cppcheck-suppress operatorEqVarError
1409 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1411 {
1412 if (!valid() && other.valid())
1413 {
1414 allocate();
1415 }
1416 Manager::template copy<Context, other_context>(m_object, other.m_object, other.valid());
1417 return (*this);
1418 }
1419
1420 // cppcheck-suppress operatorEqVarError
1421 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1422 SimpleHolder & operator= (SimpleHolder && other)
1423 {
1424 if (&other != this)
1425 {
1426 clear();
1427 Manager::template move<Context, Context>(m_object, other.m_object, other.valid());
1428 }
1429 return (*this);
1430 }
1431
1432 template < class X, class other_context,
1433 class disabler = typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >::type >
1434 // cppcheck-suppress operatorEqVarError
1435 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1437 {
1438 clear();
1439 Manager::template move<Context, other_context>(m_object, other.m_object, other.valid());
1440 return (*this);
1441 }
1442
1444 {
1445 Manager::template deallocate<Context>(m_object);
1446 }
1447
1448 template < class X, class disabler = typename std::enable_if < std::is_base_of<X, T>::value || std::is_same<T, X>::value >::type >
1449 CUDA_HOS_DEV operator const X * () const
1450 {
1451 return m_object;
1452 }
1453
1454 template < class X, class disabler = typename std::enable_if < std::is_base_of<X, T>::value || std::is_same<T, X>::value >::type >
1455 CUDA_HOS_DEV operator X * ()
1456 {
1457 return m_object;
1458 }
1459
1460 CUDA_HOS_DEV operator const void * () const
1461 {
1462 return m_object;
1463 }
1464
1465 CUDA_HOS_DEV operator void * ()
1466 {
1467 return m_object;
1468 }
1469
1470 template <class stream, class str = std::basic_string<typename stream::char_type> >
1471 void textual_output(stream & s, const str & separator = " ") const
1472 {
1473 if (std::is_same<Context, MemoryContext::CPU>::value)
1474 {
1475 if (m_object == nullptr)
1476 {
1477 s << 0;
1478 }
1479 else
1480 {
1481 s << 1 << separator << (*m_object);
1482 }
1483 }
1484 else
1485 {
1487 other.textual_output(s, separator);
1488 }
1489 }
1490
1491 template <class stream>
1492 void textual_input(stream & s)
1493 {
1494 if (std::is_same<Context, MemoryContext::CPU>::value)
1495 {
1496 bool is_valid;
1497 s >> is_valid >> std::ws;
1498 if (s.fail())
1499 {
1500 //Throw errors, perhaps? Don't know if we can/should use exceptions...
1501 std::cerr << "FAILED READING " << this << "!" << std::endl;
1502 is_valid = false;
1503 }
1504 if (is_valid)
1505 {
1506 allocate();
1507 s >> (*m_object);
1508 }
1509 else
1510 {
1511 clear();
1512 }
1513 }
1514 else
1515 {
1517 other.textual_input(s);
1518 (*this) = other;
1519 }
1520 }
1521
1522 template <class stream>
1523 void binary_output(stream & s) const
1524 {
1525 if (m_object == nullptr)
1526 {
1527 return;
1528 }
1529 if (std::is_same<Context, MemoryContext::CPU>::value)
1530 {
1531 s.write(reinterpret_cast<const char *>(m_object), sizeof(T));
1532 }
1533 else
1534 {
1536 other.binary_output(s);
1537 }
1538 }
1539
1540 template <class stream>
1541 void binary_input(stream & s)
1542 {
1543 if (std::is_same<Context, MemoryContext::CPU>::value)
1544 {
1545 allocate();
1546 s.read(reinterpret_cast<char *>(m_object), sizeof(T));
1547 }
1548 else
1549 {
1551 other.binary_input(s);
1552 (*this) = other;
1553 }
1554 }
1555
1556 };
1557
1558 template <class T, class Context>
1559 class SimpleHolder<T, Context, false>
1560 {
1561 static_assert(std::is_trivially_copyable<T>::value, "SimpleHolder only works with a trivially copyable type.");
1562
1563 using indexer = unsigned int;
1564
1566
1568
1569 template <class a, class b, bool c> friend class SimpleHolder;
1570
1571 public:
1572
1573 CUDA_HOS_DEV const T & operator *() const
1574 {
1575 return *m_object;
1576 }
1577
1579 {
1580 return *m_object;
1581 }
1582
1583 CUDA_HOS_DEV const T * operator ->() const
1584 {
1585 return m_object;
1586 }
1587
1588 CUDA_HOS_DEV T * operator ->()
1589 {
1590 return m_object;
1591 }
1592
1593 CUDA_HOS_DEV inline bool valid() const
1594 {
1595 return m_object != nullptr;
1596 }
1597
1598 // cppcheck-suppress uninitMemberVar
1599 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1601 {
1602 }
1603
1607 template < class X, class disabler = typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >::type >
1608 // cppcheck-suppress uninitMemberVar
1609 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1611 {
1612 m_object = other_p;
1613 }
1614
1615 template < class X, bool other_hold,
1616 class disabler = typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >::type >
1617 // cppcheck-suppress uninitMemberVar
1618 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1620 {
1621 m_object = other.m_object;
1622 }
1623
1624 template < class X, bool other_hold,
1625 class disabler = typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >::type >
1626 // cppcheck-suppress operatorEqVarError
1627 //Try to suppress the uninitialized member thing that is probably being thrown off by the CUDA_HOS_DEV macro...
1629 {
1630 m_object = other.m_object;
1631 return (*this);
1632 }
1633
1634 template < class X, class disabler = typename std::enable_if < std::is_base_of<X, T>::value || std::is_same<T, X>::value >::type >
1635 CUDA_HOS_DEV operator const X * () const
1636 {
1637 return m_object;
1638 }
1639
1640 template < class X, class disabler = typename std::enable_if < std::is_base_of<X, T>::value || std::is_same<T, X>::value >::type >
1641 CUDA_HOS_DEV operator X * ()
1642 {
1643 return m_object;
1644 }
1645
1646 CUDA_HOS_DEV operator const void * () const
1647 {
1648 return m_object;
1649 }
1650
1651 CUDA_HOS_DEV operator void * ()
1652 {
1653 return m_object;
1654 }
1655 };
1656
1658 template <class T>
1660
1662 template <class T>
1664
1666 template <class T>
1668
1670 template <class T>
1672
1682 template <class T>
1684 {
1685 private:
1686 std::vector< std::unique_ptr<T> > m_held;
1687 std::vector< typename std::thread::id > m_thread_equivs;
1688 //For a sufficiently small number of threads
1689 //(not much more than 100 or so?)
1690 //it's faster to have linear search+insert
1691 //than any other addressing mode
1692 //(e. g. unordered_map)
1693 //We could still consider a more sophisticated solution...
1694
1695 //Simple alternative: some sort of stack for non-assigned objects,
1696 //pushing and popping instead of linear searching.
1697 //(But with constant memory -> no (de)allocations.)
1698
1699 mutable std::shared_mutex m_mutex;
1700
1702 {
1703 std::unique_lock<std::shared_mutex> lock(m_mutex);
1704 m_held.emplace_back(std::make_unique<T>());
1705 m_thread_equivs.emplace_back(std::this_thread::get_id());
1706 return *(m_held.back());
1707 }
1708
1709 public:
1711 {
1712 {
1713 std::shared_lock<std::shared_mutex> lock(m_mutex);
1714 std::thread::id this_id = std::this_thread::get_id();
1715 const std::thread::id invalid_id{};
1716 for (size_t i = 0; i < m_thread_equivs.size(); ++i)
1717 {
1718 if (m_thread_equivs[i] == invalid_id)
1719 {
1720 m_thread_equivs[i] = this_id;
1721 return *(m_held[i]);
1722 }
1723 }
1724 }
1725 return add_one_and_return();
1726 }
1727
1729 T & get_for_thread() const
1730 {
1731 std::shared_lock<std::shared_mutex> lock(m_mutex);
1732 std::thread::id this_id = std::this_thread::get_id();
1733 for (size_t i = 0; i < m_thread_equivs.size(); ++i)
1734 {
1735 if (m_thread_equivs[i] == this_id)
1736 {
1737 return *(m_held[i]);
1738 }
1739 }
1740 //Here would be a good place for an unreachable.
1741 //C++23?
1742 return *(m_held.back());
1743 }
1744
1746 {
1747 std::unique_lock<std::shared_mutex> lock(m_mutex);
1748 std::thread::id this_id = std::this_thread::get_id();
1749 const std::thread::id invalid_id{};
1750 for (size_t i = 0; i < m_thread_equivs.size(); ++i)
1751 {
1752 if (m_thread_equivs[i] == this_id)
1753 {
1754 m_thread_equivs[i] = invalid_id;
1755 }
1756 }
1757 }
1758
1759 void resize(const size_t new_size)
1760 {
1761 std::unique_lock<std::shared_mutex> lock(m_mutex);
1762 if (new_size < m_held.size())
1763 {
1764 m_held.resize(new_size);
1765 m_thread_equivs.resize(new_size);
1766 }
1767 else if (new_size > m_held.size())
1768 {
1769 const size_t to_add = new_size - m_held.size();
1770 const std::thread::id invalid_id{};
1771 for (size_t i = 0; i < to_add; ++i)
1772 {
1773 m_held.emplace_back(std::make_unique<T>());
1774 m_thread_equivs.emplace_back(invalid_id);
1775 }
1776 }
1777 }
1778
1779 template <class F, class ... Args>
1780 void operate_on_all(F && f, Args && ... args)
1781 {
1782 std::unique_lock<std::shared_mutex> lock(m_mutex);
1783 for (std::unique_ptr<T> & obj : m_held)
1784 {
1785 f(*obj, std::forward<Args>(args)...);
1786 }
1787 }
1788
1789 size_t held_size() const
1790 {
1791 std::shared_lock<std::shared_mutex> lock(m_mutex);
1792 return m_held.size();
1793 }
1794
1795 size_t available_size() const
1796 {
1797 std::shared_lock<std::shared_mutex> lock(m_mutex);
1798 size_t count = 0;
1799 const std::thread::id invalid_id{};
1800 for (const auto & id : m_thread_equivs)
1801 {
1802 if (id == invalid_id)
1803 {
1804 ++count;
1805 }
1806 }
1807 return count;
1808 }
1809
1810 size_t filled_size() const
1811 {
1812 std::shared_lock<std::shared_mutex> lock(m_mutex);
1813 size_t count = 0;
1814 const std::thread::id invalid_id{};
1815 for (const auto & id : m_thread_equivs)
1816 {
1817 if (id == invalid_id)
1818 {
1819 ++count;
1820 }
1821 }
1822 return m_held.size() - count;
1823 }
1824 };
1825
1828 template <class T>
1830 {
1831 private:
1834 public:
1840 {
1841 if (m_held == nullptr)
1842 {
1843 m_held = &(m_sth.get_one());
1844 }
1845 return *m_held;
1846 }
1848 {
1849 if (m_held != nullptr)
1850 {
1851 m_sth.release_one();
1852 m_held = nullptr;
1853 }
1854 }
1856 {
1857 if (m_held != nullptr)
1858 {
1859 m_sth.release_one();
1860 }
1861 }
1864 {
1865 get_one();
1866 ptr = m_held;
1867 }
1868 };
1869
1874 template <class T>
1876 {
1877 private:
1878
1879 alignas(T) char m_buf[sizeof(T)];
1880 T * m_object = nullptr;
1881
1882 public:
1883
1884 maybe_allocate(const bool allocate, const T & t)
1885 {
1886 if (allocate)
1887 {
1888 m_object = new (m_buf) T(t);
1889 }
1890 }
1891
1892 maybe_allocate(const bool allocate, T && t)
1893 {
1894 if (allocate)
1895 {
1896 m_object = new (m_buf) T(t);
1897 }
1898 }
1899
1900 template <class ... Args>
1901 maybe_allocate(const bool allocate, Args && ... args)
1902 {
1903 if (allocate)
1904 {
1905 m_object = new (m_buf) T(std::forward<Args>(args)...);
1906 }
1907 }
1908
1910 {
1911 }
1912
1913
1915 {
1916 }
1917
1919 {
1920 if (&other != this)
1921 {
1922 if (m_object != nullptr)
1923 {
1924 (*m_object) = other.get();
1925 }
1926 else
1927 {
1928 m_object = new (m_buf) T(other.get());
1929 }
1930 }
1931 return (*this);
1932 }
1933
1934
1936 {
1937 if (&other != this)
1938 {
1939 if (m_object != nullptr)
1940 {
1941 (*m_object) = other.get();
1942 }
1943 else
1944 {
1945 m_object = new (m_buf) T(other.get());
1946 }
1947 }
1948 return (*this);
1949 }
1950
1952 {
1953 if (m_object != nullptr)
1954 {
1955 m_object->~T();
1956 }
1957 }
1958
1959 bool valid() const
1960 {
1961 return m_object != nullptr;
1962 }
1963
1964 T && get() &&
1965 {
1966 return *m_object;
1967 }
1968
1969 T & get() &
1970 {
1971 return *m_object;
1972 }
1973
1974 const T & get() const &
1975 {
1976 return *m_object;
1977 }
1978
1979 const T * operator ->() const
1980 {
1981 return m_object;
1982 }
1983
1985 {
1986 return m_object;
1987 }
1988
1989 operator T & ()
1990 {
1991 return *m_object;
1992 }
1993
1994 operator T && () &&
1995 {
1996 return *m_object;
1997 }
1998
1999 operator const T & () const
2000 {
2001 return *m_object;
2002 }
2003 };
2004 }
2005
2006}
2007
2008#endif // CALORECGPU_HELPERS_H
std::pair< std::vector< unsigned int >, bool > res
static Double_t sz
static Double_t a
static Double_t fs
#define F(x, y, z)
Definition MD5.cxx:112
xAOD::MissingET_v1 operator*(const xAOD::MissingET_v1 &met, float scale)
Create new MET object from source with scaled (weighted) kinematics.
#define pi
#define y
#define x
#define z
H5Mergers.
! Handles allocation of a type T, using indexer as the integer type to indicate sizes.
static void move(T *&dest, T *&source, const indexer sz)
!
static void copy(T *dest, const T *const source, const indexer sz)
!
CUDA_HOS_DEV SimpleContainer(const SimpleContainer< T, other_indexer, Context, other_hold > &other)
SimpleContainer(SimpleContainer< T, other_indexer, other_context, true > &&other)
SimpleContainer(const SimpleContainer< T, other_indexer, other_context, other_hold > &other)
Holds a run-time amount of objects of type \T, measuring sizes with indexer, in memory context Contex...
CUDA_HOS_DEV SimpleHolder(const SimpleHolder< X, Context, other_hold > &other)
SimpleHolder(const SimpleHolder< X, other_context, other_hold > &other)
Holds one objects of type \T in memory context Context.
Manages objects of type T in a thread-safe way, ensuring that there's an object available for each se...
int count(std::string s, const std::string &regx)
count how many occurances of a regx are in a string
Definition hcg.cxx:146
struct color C
void CPU_to_GPU(void *dest, const void *const source, const size_t num)
Copies num bytes from source in CPU memory to dest in GPU memory.
void optimize_block_and_grid_size_for_cooperative_launch(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size for a cooperative launch.
void GPU_to_CPU(void *dest, const void *const source, const size_t num)
Copies num bytse from source in GPU memory to dest in CPU memory.
void deallocate(void *address)
Deallocates address in GPU memory.
void * allocate(const size_t num)
Allocates and returns the address of num bytes from GPU memory.
void * allocate_pinned(const size_t num)
Allocates and returns the address of num bytes from CPU pinned memory.
void GPU_to_GPU_async(void *dest, const void *const source, const size_t num, CUDAStreamPtrHolder stream={})
Copies num bytes from source to dest, both in GPU memory, asynchronously.
void GPU_to_GPU(void *dest, const void *const source, const size_t num)
Copies num bytes from source to dest, both in GPU memory.
void CPU_to_GPU_async(void *dest, const void *const source, const size_t num, CUDAStreamPtrHolder stream={})
Copies num bytes from source in CPU memory to dest in GPU memory, asynchronously.
void deallocate_pinned(void *address)
Deallocates address in CPU pinned memory.
void GPU_synchronize(CUDAStreamPtrHolder stream={})
Synchronizes the stream.
void GPU_to_CPU_async(void *dest, const void *const source, const size_t num, CUDAStreamPtrHolder stream={})
Copies num bytes from source in GPU memory to dest in CPU memory, asynchronously.
void optimize_block_and_grid_size(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size according to cudaOccupancyMaxPotentialBlockSize.
! Holds dummy classes just to identify the place in which memory lives.
static CUDA_HOS_DEV float regularize_angle(const float b, const float a=0.f)
static CUDA_HOS_DEV float corrected_magn_cross_product(const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
constexpr auto int_ceil_div(const T1 num, const T2 denom)
Returns the ceiling of num/denom, with proper rounding.
SimpleHolder< T, MemoryContext::CUDAGPU, true > CUDA_object
Holds an object of type T in CUDA GPU memory.
static CUDA_HOS_DEV float eta_from_coordinates(const float x, const float y, const float z)
constexpr unsigned short Pearson_hash_16_bit(const T number)
Calculates a 16-bit Pearson hash from @ number.
static CUDA_HOS_DEV float product_sum_cornea_harrison_tang(const float a, const float b, const float c, const float d)
static CUDA_HOS_DEV float erf_inv_wrapper(const float x)
static CUDA_HOS_DEV T angular_difference(const T x, const T y)
SimpleHolder< T, MemoryContext::CUDAPinnedCPU, true > CUDA_pinned_CPU_object
Holds an object of type T in CUDA GPU memory.
static CUDA_HOS_DEV void partial_kahan_babushka_neumaier_sum(const float &to_add, float &sum, float &corr)
Implements one step of a Kahan-Babushka-Neumaier sum by adding to_add to sum with the correction term...
SimpleContainer< T, indexer, MemoryContext::CPU, true > CPU_array
Holds a run-time specified amount of objects of type T in CPU memory.
static CUDA_HOS_DEV float corrected_dot_product(const float a_1, const float a_2, const float a_3, const float b_1, const float b_2, const float b_3)
constexpr Base compile_time_pow2(const Exp exp)
Returns 2 to the power of exp.
SimpleHolder< T, MemoryContext::CPU, true > CPU_object
Holds an object of type T in CPU memory.
static CUDA_HOS_DEV void corrected_cross_product(float(&res)[3], const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
constexpr unsigned char Pearson_hash(const T number)
Calculates a Pearson hash from @ number.
CUDA_HOS_DEV float sum_kahan_babushka_neumaier(const Floats &... fs)
Adds a list of floats using the Kahan-Babushka-Neumaier algorithm for greater precision (at the cost ...
SimpleContainer< T, indexer, MemoryContext::CUDAGPU, false > CUDA_kernel_array
Non-owning pointer to an array of T in CUDA GPU memory.
SimpleContainer< T, indexer, MemoryContext::CUDAGPU, true > CUDA_array
Holds a run-time specified amount of objects of type T in CUDA GPU memory.
constexpr auto int_floor_div(const T1 num, const T2 denom)
Returns the floor of num/denom, with proper rounding.
constexpr unsigned int int_ceil_log_2(T num)
Returns the ceiling of the base-2 logarithm of a number (i.
SimpleHolder< T, MemoryContext::CUDAGPU, false > CUDA_kernel_object
Non-owning pointer to an object of type T in CUDA GPU memory.
Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration.
STL namespace.
maybe_allocate & operator=(const maybe_allocate &other)
TFile * file
std::string number(const double &d, const std::string &s)
Definition utils.cxx:186