7#ifndef CALORECGPU_HELPERS_H
8#define CALORECGPU_HELPERS_H
19#include <shared_mutex>
26#if __cpp_lib_math_constants
45 #define CUDA_AVAILABLE 1
47 #define CUDA_AVAILABLE 1
49 #define CUDA_AVAILABLE 1
51 #define CUDA_AVAILABLE 0
58#define CUDA_HOS_DEV __host__ __device__
66 CUDA_HOS_DEV inline void CUDA_gpu_assert(cudaError_t code,
const char *
file,
int line,
bool abort =
true)
68 if (code != cudaSuccess)
70 printf(
"CUDA error: %s (%s %d)\n", cudaGetErrorString(code),
file, line);
86#define CUDA_ERRCHECK(...) CUDA_ERRCHECK_HELPER(__VA_ARGS__, true)
88#define CUDA_ERRCHECK_HELPER(ans, ...) do { ::CaloRecGPU::CUDA_gpu_assert((ans), __FILE__, __LINE__, CUDA_ERRCHECK_GET_FIRST(__VA_ARGS__, true) ); } while(0)
89#define CUDA_ERRCHECK_GET_FIRST(x, ...) x
92#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 350
93 #if CUDART_VERSION >= 12000
94 #define CUDA_CAN_USE_TAIL_LAUNCH 1
96 #define CUDA_CAN_USE_TAIL_LAUNCH 0
98#elif defined(__CUDA_ARCH__)
99 #error "CUDA compute capability at least 3.5 is needed so we can have dynamic parallelism!"
106#define CUDA_ERRCHECK(...)
117 template <
class T = const
void>
118 constexpr operator T * ()
const
123 constexpr operator bool()
const
125 return ptr !=
nullptr;
164 void GPU_to_CPU(
void * dest,
const void *
const source,
const size_t num);
169 void CPU_to_GPU(
void * dest,
const void *
const source,
const size_t num);
174 void GPU_to_GPU(
void * dest,
const void *
const source,
const size_t num);
218 template <
class T1,
class T2>
221 return num / denom + (num % denom != 0);
225 template <
class T1,
class T2>
232 template <
class Base =
float,
class Exp =
int>
238 for (Exp i = 0; i < -exp; ++i)
245 for (Exp i = 0; i < exp; ++i)
262 return sizeof(T) * CHAR_BIT - std::countl_zero(num);
264 unsigned int ret = 64;
266 for (
unsigned long long int mask = 0x8000000000000000U; mask > 0; mask >>= 1U)
284 constexpr unsigned char initial_value = 42;
287 constexpr unsigned char c_mult = 7;
288 constexpr unsigned char c_add = 1;
293 unsigned char ret = initial_value;
295 for (
unsigned int i = 0; i <
sizeof(T); i +=
sizeof(
unsigned char))
297 const unsigned char to_hash =
number >> (i * CHAR_BIT);
298 const unsigned char operand = ret ^ to_hash;
299 ret = c_mult * operand + c_add;
311 constexpr unsigned short initial_value = 42754;
314 constexpr unsigned short c_mult = 7;
315 constexpr unsigned short c_add = 1;
320 unsigned short ret = initial_value;
322 for (
unsigned int i = 0; i <
sizeof(T); i +=
sizeof(
unsigned short))
324 const unsigned short to_hash =
number >> (i * CHAR_BIT);
325 const unsigned short operand = ret ^ to_hash;
326 ret = c_mult * operand + c_add;
336#ifdef __cpp_lib_math_constants
338 inline constexpr T
pi = std::numbers::pi_v<T>;
341 inline constexpr T
sqrt2 = std::numbers::sqrt2_v<T>;
344 inline constexpr T
pi = T(3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679821480865132823066470938446095505822317253594081284811174502841027019385211055596446229489549303819644288109756659334461284756482337867831652712019091456485669234603486104543266482133936072602491412737245870066063155881748815209209628292540917153643678925903600113305305488204665213841469519415116094330572703657595919530921861173819326117931051185480744623799627495673518857527248912279381830119491298336733624L);
347 inline constexpr T
sqrt2 = T(1.4142135623730950488016887242096980785696718753769480731766797379907324784621070388503875343276415727350138462309122970249248360558507372126441214970999358314132226659275055927557999505011527820605714701095599716059702745345968620147285174186408891986095523292304843087143214508397626036279952514079896872533965463318088296406206152583523950547457502877599617298355752203375318570113543746034084988471603868999706990048150305440277903164542478230684929369186215805784631115966687130130156185689872372352885092649L);
351 inline constexpr T
inv_sqrt2 = T(0.70710678118654752440084436210484903928483593768847403658833986899536623923105351942519376716382078636750692311545614851246241802792536860632206074854996791570661133296375279637789997525057639103028573505477998580298513726729843100736425870932044459930477616461524215435716072541988130181399762570399484362669827316590441482031030762917619752737287514387998086491778761016876592850567718730170424942358019344998534950240751527201389515822712391153424646845931079028923155579833435650650780928449361861764425463243L);
367 float kConst = 0.8862269254527579f;
375 float erfi, derfi, y0, y1, dy0, dy1;
378 erfi = kConst * fabsf(
x);
379 y0 = erff(0.9f * erfi);
381 for (
int iter = 0; iter < kMaxit; iter++)
383 y1 = 1.f - erfc(erfi);
385 if (fabsf(dy1) < kEps)
400 if (fabsf(derfi / erfi) < kEps)
426 constexpr float two_pi = 2 *
pi;
427 const float ret = remainderf(b, two_pi);
428 return ret + ((ret <
a -
pi) - (ret >
a +
pi)) * two_pi;
437 constexpr double two_pi = 2 *
pi;
438 const double ret = remainderf(b, two_pi);
439 return ret + ((ret <
a -
pi) - (ret >
a +
pi)) * two_pi;
454 if (
x != 0 ||
y != 0)
457 const float m = norm3df(
x,
y,
z);
459 const float m = hypot(
x,
y,
z);
461 return 0.5f * logf((m +
z) / (m -
z));
465 constexpr float s_etaMax = 22756.0f;
466 return z + ((
z > 0) - (
z < 0)) * s_etaMax;
474 if (
x != 0 ||
y != 0)
477 const float m = norm3d(
x,
y,
z);
479 const float m = hypot(
x,
y,
z);
481 return 0.5 * log((m +
z) / (m -
z));
485 constexpr double s_etaMax = 22756.0;
486 return z + ((
z > 0) - (
z < 0)) * s_etaMax;
495 const float t = sum + to_add;
497 const bool test = fabsf(sum) >= fabsf(to_add);
499 const float opt_1 = (sum - t) + to_add;
500 const float opt_2 = (to_add - t) + sum;
502 corr += (test) * opt_1 + (!test) * opt_2;
509 template <
class ... Floats,
class disabler = std::enable_if_t < (std::is_same_v<std::decay_t<Floats>,
float> && ...) > >
529 __device__
static inline
530 void device_kahan_babushka_neumaier(
float * sum_arr,
float * corr_arr,
const float v,
const int idx = 0)
532 const float old_sum = atomicAdd(sum_arr + idx, v);
533 const float new_sum = old_sum + v;
534 if (fabsf(old_sum) >= fabsf(v))
536 atomicAdd(corr_arr + idx, (old_sum - new_sum) + v);
540 atomicAdd(corr_arr + idx, (v - new_sum) + old_sum);
553 const float w_1 =
a * b;
554 const float w_2 = c * d;
556 const float e_1 = fmaf(
a, b, -w_1);
557 const float e_2 = fmaf(c, d, -w_2);
565 const float b_1,
const float b_2,
const float b_3)
569 const float w_1 = a_1 * b_1;
570 const float w_2 = a_2 * b_2;
571 const float w_3 = a_3 * b_3;
573 const float e_1 = fmaf(a_1, b_1, -w_1);
574 const float e_2 = fmaf(a_2, b_2, -w_2);
575 const float e_3 = fmaf(a_3, b_3, -w_3);
589 void corrected_cross_product(
float (&
res)[3],
const float a1,
const float a2,
const float a3,
const float b1,
const float b2,
const float b3)
614 return norm3df(r_1, r_2, r_3);
616 return hypot(r_1, r_2, r_3);
633 constexpr static char const *
name =
"CPU";
637 constexpr static char const *
name =
"CUDA GPU";
641 constexpr static char const *
name =
"CUDA Pinned CPU";
646 template <
class T,
class indexer>
693 template <
class C1,
class C2,
class dummy =
void>
struct copy_helper;
697 static inline void copy (T * dest,
const T *
const source,
const indexer
sz)
699 std::memcpy(dest, source,
sizeof(T) *
sz);
705 static inline void copy (T * dest,
const T *
const source,
const indexer
sz)
713 static inline void copy (T * dest,
const T *
const source,
const indexer
sz)
721 static inline void copy (T * dest,
const T *
const source,
const indexer
sz)
729 static inline void copy (T * dest,
const T *
const source,
const indexer
sz)
731 std::memcpy(dest, source,
sizeof(T) *
sz);
737 static inline void copy (T * dest,
const T *
const source,
const indexer
sz)
739 std::memcpy(dest, source,
sizeof(T) *
sz);
745 static inline void copy (T * dest,
const T *
const source,
const indexer
sz)
747 std::memcpy(dest, source,
sizeof(T) *
sz);
753 static inline void copy (T * dest,
const T *
const source,
const indexer
sz)
761 static inline void copy (T * dest,
const T *
const source,
const indexer
sz)
768 template <
class C1,
class C2,
class dummy =
void>
struct move_helper;
772 inline static void move(T *& dest, T *& source,
const indexer
sz)
782 inline static void move(T *& dest, T *& source,
const indexer)
791 template <
class Context>
static inline T *
allocate(
const indexer size)
798#if CALORECGPU_HELPERS_DEBUG
799 std::cerr <<
"ALLOCATED " << size <<
" x " <<
sizeof(T) <<
" in " << Context::name <<
": " << ret << std::endl;
805 template <
class Context>
static inline void deallocate(T *& arr)
814#if CALORECGPU_HELPERS_DEBUG
815 std::cerr <<
"DEALLOCATED in " << Context::name <<
": " << arr << std::endl;
822 template <
class DestContext,
class SourceContext>
823 static inline void copy(T * dest,
const T *
const source,
const indexer
sz)
825 if (
sz > 0 && source !=
nullptr)
829#if CALORECGPU_HELPERS_DEBUG
830 std::cerr <<
"COPIED " <<
sz <<
" from " << SourceContext::name <<
" to " << DestContext::name <<
": " << source <<
" to " << dest << std::endl;
840 template <
class DestContext,
class SourceContext>
841 static inline void move(T *& dest, T *& source,
const indexer
sz)
843#if CALORECGPU_HELPERS_DEBUG
844 std::cerr <<
"MOVED " <<
sz <<
" from " << SourceContext::name <<
" to " << DestContext::name <<
": " << source <<
" to " << dest;
846 if (
sz > 0 && source !=
nullptr)
855#if CALORECGPU_HELPERS_DEBUG
856 std::cerr <<
" | " << source <<
" to " << dest << std::endl;
868 template <
class T,
class indexer,
class Context,
bool hold_arrays = true>
871 template <
class T,
class indexer,
class Context>
874 static_assert(std::is_trivially_copyable<T>::value,
"SimpleContainer only works with a trivially copyable type.");
905 inline void resize(
const indexer new_size)
911 else if (new_size !=
m_size)
914 m_array = Manager::template allocate<Context>(new_size);
915 Manager::template copy<Context, Context>(
m_array, temp, (
m_size < new_size ?
m_size : new_size));
916 Manager::template deallocate<Context>(temp);
927 m_array = Manager::template allocate<Context>(
sz);
935 m_array = Manager::template allocate<Context>(
sz);
936 Manager::template copy<Context, Context>(
m_array, other_array,
sz);
942 Manager::template copy<Context, Context>(
m_array, other.m_array,
m_size);
948 Manager::template move<Context, Context>(
m_array, other.m_array,
m_size);
952 template <
class other_indexer,
class other_context,
bool other_hold>
958 Manager::template copy<Context, other_context>(
m_array, other.m_array,
m_size);
961 template <
class other_indexer,
class other_context>
966 Manager::template move<Context, other_context>(
m_array, other.m_array,
m_size);
979 Manager::template copy<Context, Context>(
m_array, other.m_array,
m_size);
993 Manager::template move<Context, Context>(
m_array, other.m_array, other.size());
1001 template <
class other_indexer,
class other_context,
bool other_hold>
1005 Manager::template copy<Context, other_context>(
m_array, other.m_array,
m_size);
1009 template <
class other_indexer,
class other_context>
1013 Manager::template move<Context, other_context>(
m_array, other.m_array, other.m_size);
1021 Manager::template deallocate<Context>(
m_array);
1045 template <
class stream,
class str = std::basic_
string<
typename stream::
char_type> >
1048 if (std::is_same<Context, MemoryContext::CPU>::value)
1050 s <<
m_size << separator;
1051 for (indexer i = 0; i <
m_size - 1; ++i)
1060 other.textual_output(s, separator);
1064 template <
class stream>
1067 if (std::is_same<Context, MemoryContext::CPU>::value)
1070 s >> new_size >> std::ws;
1074 std::cerr <<
"FAILED READING " <<
this <<
"!" << std::endl;
1078 for (indexer i = 0; i <
m_size - 1; ++i)
1088 other.textual_input(s);
1093 template <
class stream>
1096 if (std::is_same<Context, MemoryContext::CPU>::value)
1098 s.write(
reinterpret_cast<const char *
>(&
m_size),
sizeof(indexer));
1099 for (indexer i = 0; i <
m_size; ++i)
1101 s.write(
reinterpret_cast<const char *
>(
m_array + i),
sizeof(T));
1107 other.binary_output(s);
1111 template <
class stream>
1114 if (std::is_same<Context, MemoryContext::CPU>::value)
1117 s.read(
reinterpret_cast<char *
>(&new_size),
sizeof(indexer));
1121 std::cerr <<
"FAILED READING " <<
this <<
"!" << std::endl;
1125 for (indexer i = 0; i <
m_size; ++i)
1127 s.read(
reinterpret_cast<char *
>(
m_array + i),
sizeof(T));
1133 other.binary_input(s);
1140 template <
class T,
class indexer,
class Context>
1143 static_assert(std::is_trivially_copyable<T>::value,
"SimpleContainer only works with a trivially copyable type.");
1183 template <
class other_indexer,
bool other_hold>
1208 template <
class other_indexer,
bool other_hold>
1240 template <
class T,
class indexer =
unsigned int>
1244 template <
class T,
class indexer =
unsigned int>
1248 template <
class T,
class indexer =
unsigned int>
1257 template <
class T,
class Context,
bool hold_
object = true>
1260 template <
class T,
class Context>
1263 static_assert(std::is_trivially_copyable<T>::value,
"SimpleHolder only works with a trivially copyable type.");
1302 Manager::template deallocate<Context>(
m_object);
1319 if (really_allocate)
1332 template < class X, class disabler = typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >
::type >
1336 Manager::template copy<Context, Context>(
m_object, other_p, 1);
1342 template < class X, class disabler = typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >
::type >
1354 Manager::template copy<Context, Context>(
m_object, other.m_object, other.valid());
1362 template <
class X,
class other_context,
bool other_hold,
1363 class disabler =
typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >
::type >
1369 Manager::template copy<Context, other_context>(
m_object, other.m_object, other.valid());
1380 Manager::template move<Context, Context>(
m_object, other.m_object, other.valid());
1383 template <
class X,
class other_context,
1384 class disabler =
typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >
::type >
1388 Manager::template move<Context, other_context>(
m_object, other.m_object, other.valid());
1395 if (!
valid() && other.valid())
1401 Manager::template copy<Context, Context>(
m_object, other.m_object, other.valid());
1406 template <
class X,
class other_context,
bool other_hold,
1407 class disabler =
typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >
::type >
1412 if (!
valid() && other.valid())
1416 Manager::template copy<Context, other_context>(
m_object, other.m_object, other.valid());
1427 Manager::template move<Context, Context>(
m_object, other.m_object, other.valid());
1432 template <
class X,
class other_context,
1433 class disabler =
typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >
::type >
1439 Manager::template move<Context, other_context>(
m_object, other.m_object, other.valid());
1445 Manager::template deallocate<Context>(
m_object);
1448 template < class X, class disabler = typename std::enable_if < std::is_base_of<X, T>::value || std::is_same<T, X>::value >
::type >
1454 template < class X, class disabler = typename std::enable_if < std::is_base_of<X, T>::value || std::is_same<T, X>::value >
::type >
1470 template <
class stream,
class str = std::basic_
string<
typename stream::
char_type> >
1473 if (std::is_same<Context, MemoryContext::CPU>::value)
1481 s << 1 << separator << (*m_object);
1487 other.textual_output(s, separator);
1491 template <
class stream>
1494 if (std::is_same<Context, MemoryContext::CPU>::value)
1497 s >> is_valid >> std::ws;
1501 std::cerr <<
"FAILED READING " <<
this <<
"!" << std::endl;
1517 other.textual_input(s);
1522 template <
class stream>
1529 if (std::is_same<Context, MemoryContext::CPU>::value)
1531 s.write(
reinterpret_cast<const char *
>(
m_object),
sizeof(T));
1536 other.binary_output(s);
1540 template <
class stream>
1543 if (std::is_same<Context, MemoryContext::CPU>::value)
1546 s.read(
reinterpret_cast<char *
>(
m_object),
sizeof(T));
1551 other.binary_input(s);
1558 template <
class T,
class Context>
1561 static_assert(std::is_trivially_copyable<T>::value,
"SimpleHolder only works with a trivially copyable type.");
1607 template < class X, class disabler = typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >
::type >
1615 template <
class X,
bool other_hold,
1616 class disabler =
typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >
::type >
1624 template <
class X,
bool other_hold,
1625 class disabler =
typename std::enable_if < std::is_base_of<T, X>::value || std::is_same<T, X>::value >
::type >
1634 template < class X, class disabler = typename std::enable_if < std::is_base_of<X, T>::value || std::is_same<T, X>::value >
::type >
1640 template < class X, class disabler = typename std::enable_if < std::is_base_of<X, T>::value || std::is_same<T, X>::value >
::type >
1703 std::unique_lock<std::shared_mutex> lock(
m_mutex);
1704 m_held.emplace_back(std::make_unique<T>());
1713 std::shared_lock<std::shared_mutex> lock(
m_mutex);
1714 std::thread::id this_id = std::this_thread::get_id();
1715 const std::thread::id invalid_id{};
1731 std::shared_lock<std::shared_mutex> lock(
m_mutex);
1732 std::thread::id this_id = std::this_thread::get_id();
1747 std::unique_lock<std::shared_mutex> lock(
m_mutex);
1748 std::thread::id this_id = std::this_thread::get_id();
1749 const std::thread::id invalid_id{};
1761 std::unique_lock<std::shared_mutex> lock(
m_mutex);
1762 if (new_size <
m_held.size())
1767 else if (new_size >
m_held.size())
1769 const size_t to_add = new_size -
m_held.size();
1770 const std::thread::id invalid_id{};
1771 for (
size_t i = 0; i < to_add; ++i)
1773 m_held.emplace_back(std::make_unique<T>());
1779 template <
class F,
class ...
Args>
1782 std::unique_lock<std::shared_mutex> lock(
m_mutex);
1783 for (std::unique_ptr<T> & obj :
m_held)
1785 f(*obj, std::forward<Args>(args)...);
1791 std::shared_lock<std::shared_mutex> lock(
m_mutex);
1797 std::shared_lock<std::shared_mutex> lock(
m_mutex);
1799 const std::thread::id invalid_id{};
1802 if (
id == invalid_id)
1812 std::shared_lock<std::shared_mutex> lock(
m_mutex);
1814 const std::thread::id invalid_id{};
1817 if (
id == invalid_id)
1851 m_sth.release_one();
1859 m_sth.release_one();
1900 template <
class ...
Args>
1924 (*m_object) = other.get();
1941 (*m_object) = other.get();
1999 operator const T & ()
const
std::pair< std::vector< unsigned int >, bool > res
xAOD::MissingET_v1 operator*(const xAOD::MissingET_v1 &met, float scale)
Create new MET object from source with scaled (weighted) kinematics.
! Handles allocation of a type T, using indexer as the integer type to indicate sizes.
static void move(T *&dest, T *&source, const indexer sz)
!
static void deallocate(T *&arr)
!
static T * allocate(const indexer size)
!
static void copy(T *dest, const T *const source, const indexer sz)
!
CUDA_HOS_DEV SimpleContainer(T *other_array, const indexer sz)
friend class SimpleContainer
MemoryManagement< T, indexer > Manager
CUDA_HOS_DEV indexer size() const
CUDA_HOS_DEV SimpleContainer()
CUDA_HOS_DEV SimpleContainer(const SimpleContainer< T, other_indexer, Context, other_hold > &other)
SimpleContainer(SimpleContainer &&other)
SimpleContainer(const indexer sz)
friend class SimpleContainer
CUDA_HOS_DEV indexer size() const
MemoryManagement< T, indexer > Manager
SimpleContainer(SimpleContainer< T, other_indexer, other_context, true > &&other)
SimpleContainer(const SimpleContainer &other)
SimpleContainer(const SimpleContainer< T, other_indexer, other_context, other_hold > &other)
SimpleContainer(T *other_array, const indexer sz)
void textual_output(stream &s, const str &separator=" ") const
void resize(const indexer new_size)
void textual_input(stream &s)
void binary_input(stream &s)
void binary_output(stream &s) const
Holds a run-time amount of objects of type \T, measuring sizes with indexer, in memory context Contex...
CUDA_HOS_DEV bool valid() const
friend class SimpleHolder
CUDA_HOS_DEV SimpleHolder(const SimpleHolder< X, Context, other_hold > &other)
CUDA_HOS_DEV SimpleHolder()
CUDA_HOS_DEV SimpleHolder(X *other_p)
MemoryManagement< T, indexer > Manager
void textual_output(stream &s, const str &separator=" ") const
SimpleHolder(const X &other_v)
SimpleHolder(SimpleHolder &&other)
CUDA_HOS_DEV bool valid() const
void binary_output(stream &s) const
SimpleHolder(const SimpleHolder &other)
friend class SimpleHolder
SimpleHolder(const bool really_allocate)
SimpleHolder(SimpleHolder< X, other_context, true > &&other)
SimpleHolder(const SimpleHolder< X, other_context, other_hold > &other)
void binary_input(stream &s)
void textual_input(stream &s)
MemoryManagement< T, indexer > Manager
Holds one objects of type \T in memory context Context.
Manages objects of type T in a thread-safe way, ensuring that there's an object available for each se...
std::shared_mutex m_mutex
size_t available_size() const
void operate_on_all(F &&f, Args &&... args)
size_t filled_size() const
void resize(const size_t new_size)
std::vector< std::unique_ptr< T > > m_held
T & get_for_thread() const
std::vector< typename std::thread::id > m_thread_equivs
int count(std::string s, const std::string ®x)
count how many occurances of a regx are in a string
void CPU_to_GPU(void *dest, const void *const source, const size_t num)
Copies num bytes from source in CPU memory to dest in GPU memory.
void optimize_block_and_grid_size_for_cooperative_launch(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size for a cooperative launch.
void GPU_to_CPU(void *dest, const void *const source, const size_t num)
Copies num bytse from source in GPU memory to dest in CPU memory.
void deallocate(void *address)
Deallocates address in GPU memory.
void * allocate(const size_t num)
Allocates and returns the address of num bytes from GPU memory.
void * allocate_pinned(const size_t num)
Allocates and returns the address of num bytes from CPU pinned memory.
void GPU_to_GPU_async(void *dest, const void *const source, const size_t num, CUDAStreamPtrHolder stream={})
Copies num bytes from source to dest, both in GPU memory, asynchronously.
bool supports_cooperative_launches()
void GPU_to_GPU(void *dest, const void *const source, const size_t num)
Copies num bytes from source to dest, both in GPU memory.
void CPU_to_GPU_async(void *dest, const void *const source, const size_t num, CUDAStreamPtrHolder stream={})
Copies num bytes from source in CPU memory to dest in GPU memory, asynchronously.
void deallocate_pinned(void *address)
Deallocates address in CPU pinned memory.
bool supports_dynamic_parallelism()
void GPU_synchronize(CUDAStreamPtrHolder stream={})
Synchronizes the stream.
void GPU_to_CPU_async(void *dest, const void *const source, const size_t num, CUDAStreamPtrHolder stream={})
Copies num bytes from source in GPU memory to dest in CPU memory, asynchronously.
void optimize_block_and_grid_size(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size according to cudaOccupancyMaxPotentialBlockSize.
! Holds dummy classes just to identify the place in which memory lives.
static CUDA_HOS_DEV float regularize_angle(const float b, const float a=0.f)
static CUDA_HOS_DEV float corrected_magn_cross_product(const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
constexpr auto int_ceil_div(const T1 num, const T2 denom)
Returns the ceiling of num/denom, with proper rounding.
SimpleHolder< T, MemoryContext::CUDAGPU, true > CUDA_object
Holds an object of type T in CUDA GPU memory.
static CUDA_HOS_DEV float eta_from_coordinates(const float x, const float y, const float z)
constexpr unsigned short Pearson_hash_16_bit(const T number)
Calculates a 16-bit Pearson hash from @ number.
static CUDA_HOS_DEV float product_sum_cornea_harrison_tang(const float a, const float b, const float c, const float d)
static CUDA_HOS_DEV float erf_inv_wrapper(const float x)
static CUDA_HOS_DEV T angular_difference(const T x, const T y)
SimpleHolder< T, MemoryContext::CUDAPinnedCPU, true > CUDA_pinned_CPU_object
Holds an object of type T in CUDA GPU memory.
static CUDA_HOS_DEV void partial_kahan_babushka_neumaier_sum(const float &to_add, float &sum, float &corr)
Implements one step of a Kahan-Babushka-Neumaier sum by adding to_add to sum with the correction term...
SimpleContainer< T, indexer, MemoryContext::CPU, true > CPU_array
Holds a run-time specified amount of objects of type T in CPU memory.
static CUDA_HOS_DEV float corrected_dot_product(const float a_1, const float a_2, const float a_3, const float b_1, const float b_2, const float b_3)
constexpr Base compile_time_pow2(const Exp exp)
Returns 2 to the power of exp.
SimpleHolder< T, MemoryContext::CPU, true > CPU_object
Holds an object of type T in CPU memory.
static CUDA_HOS_DEV void corrected_cross_product(float(&res)[3], const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
constexpr unsigned char Pearson_hash(const T number)
Calculates a Pearson hash from @ number.
CUDA_HOS_DEV float sum_kahan_babushka_neumaier(const Floats &... fs)
Adds a list of floats using the Kahan-Babushka-Neumaier algorithm for greater precision (at the cost ...
SimpleContainer< T, indexer, MemoryContext::CUDAGPU, false > CUDA_kernel_array
Non-owning pointer to an array of T in CUDA GPU memory.
SimpleContainer< T, indexer, MemoryContext::CUDAGPU, true > CUDA_array
Holds a run-time specified amount of objects of type T in CUDA GPU memory.
constexpr auto int_floor_div(const T1 num, const T2 denom)
Returns the floor of num/denom, with proper rounding.
constexpr unsigned int int_ceil_log_2(T num)
Returns the ceiling of the base-2 logarithm of a number (i.
SimpleHolder< T, MemoryContext::CUDAGPU, false > CUDA_kernel_object
Non-owning pointer to an object of type T in CUDA GPU memory.
Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration.
CUDAStreamPtrHolder(T *p)
CUDAStreamPtrHolder()=default
static constexpr char const * name
static constexpr char const * name
static constexpr char const * name
static void copy(T *dest, const T *const source, const indexer sz)
static void copy(T *dest, const T *const source, const indexer sz)
static void copy(T *dest, const T *const source, const indexer sz)
static void copy(T *dest, const T *const source, const indexer sz)
static void copy(T *dest, const T *const source, const indexer sz)
static void copy(T *dest, const T *const source, const indexer sz)
static void copy(T *dest, const T *const source, const indexer sz)
static void copy(T *dest, const T *const source, const indexer sz)
static void copy(T *dest, const T *const source, const indexer sz)
static void move(T *&dest, T *&source, const indexer)
static void move(T *&dest, T *&source, const indexer sz)
static void deallocate(T *&arr)
static T * allocate(const indexer size)
static T * allocate(const indexer size)
static void deallocate(T *&arr)
static void deallocate(T *&arr)
static T * allocate(const indexer size)
maybe_allocate & operator=(const maybe_allocate &other)
maybe_allocate(maybe_allocate &&other)
maybe_allocate(const bool allocate, const T &t)
maybe_allocate(const bool allocate, T &&t)
maybe_allocate(const bool allocate, Args &&... args)
const T * operator->() const
maybe_allocate(const maybe_allocate &other)
separate_thread_accessor(separate_thread_holder< T > &s)
separate_thread_holder< T > & m_sth
separate_thread_accessor(separate_thread_holder< T > &s, T *&ptr)
~separate_thread_accessor()
std::string number(const double &d, const std::string &s)