ATLAS Offline Software
CaloRecGPU/Helpers.h
#ifndef CALORECGPU_HELPERS_H
#define CALORECGPU_HELPERS_H

// ... (further standard library includes elided in this listing)
#include <type_traits>
#include <shared_mutex>

#if __cpp_lib_math_constants
  #include <numbers>
#endif

// Detect whether we are being compiled with CUDA support.
#ifndef CUDA_AVAILABLE
  #ifdef __CUDA_ARCH__
    #define CUDA_AVAILABLE 1
  #elif defined(__CUDA__)
    #define CUDA_AVAILABLE 1
  #elif defined(__CUDACC__)
    #define CUDA_AVAILABLE 1
  #else
    #define CUDA_AVAILABLE 0
  #endif
#endif
#if CUDA_AVAILABLE

#define CUDA_HOS_DEV __host__ __device__

namespace CaloRecGPU
{
  // Prints a diagnostic (and optionally aborts) whenever a CUDA call fails.
  inline void CUDA_gpu_assert(cudaError_t code, const char * file, int line, bool abort = true)
  {
    if (code != cudaSuccess)
      {
        printf("CUDA error: %s (%s %d)\n", cudaGetErrorString(code), file, line);
        // ... (abort handling elided in this listing)
      }
  }
}

// CUDA_ERRCHECK(call) checks the cudaError_t returned by call;
// an optional second argument controls whether failures abort.
#define CUDA_ERRCHECK(...) CUDA_ERRCHECK_HELPER(__VA_ARGS__, true)

#define CUDA_ERRCHECK_HELPER(ans, ...) do { ::CaloRecGPU::CUDA_gpu_assert((ans), __FILE__, __LINE__, CUDA_ERRCHECK_GET_FIRST(__VA_ARGS__, true) ); } while(0)
#define CUDA_ERRCHECK_GET_FIRST(x, ...) x
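// Illustrative usage sketch (not part of the header): CUDA_ERRCHECK wraps any
// expression yielding a cudaError_t; by default a failure prints and aborts,
// while passing false as the optional second macro argument only prints.
// d_buf and num_bytes are hypothetical names.
inline void example_errcheck(void * d_buf, const size_t num_bytes)
{
  CUDA_ERRCHECK(cudaMemset(d_buf, 0, num_bytes)); // abort on failure (default)
  CUDA_ERRCHECK(cudaDeviceSynchronize(), false);  // report the error, keep going
}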
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  #if CUDART_VERSION >= 12000
    #define CUDA_CAN_USE_TAIL_LAUNCH 1
  #else
    #define CUDA_CAN_USE_TAIL_LAUNCH 0
  #endif
#elif defined(__CUDA_ARCH__)
  #error "CUDA compute capability at least 3.5 is needed so we can have dynamic parallelism!"
#endif

#else

#define CUDA_HOS_DEV

#define CUDA_ERRCHECK(...)

#endif
namespace CaloRecGPU
{
  namespace CUDA_Helpers
  {
    // Type-erased holder for a CUDA stream pointer, so that code that is not
    // compiled with CUDA can still pass streams around.
    struct CUDAStreamPtrHolder
    {
      void * ptr = nullptr;

      template <class T = const void>
      constexpr operator T * () const
      {
        return (T *) ptr;
      }

      constexpr operator bool() const
      {
        return ptr != nullptr;
      }

      template <class T>
      CUDAStreamPtrHolder(T * p) : ptr(p)
      {
      }

      CUDAStreamPtrHolder() = default;
    };

    // ... (GPU memory and transfer helpers: allocate, deallocate, CPU_to_GPU,
    //      GPU_to_CPU, GPU_to_GPU and their _async variants; declarations
    //      elided here, signatures in the member reference at the end)
  }
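  // Illustrative usage sketch of the CUDA_Helpers transfer API listed in the
  // member reference below; buffer contents and sizes are hypothetical
  // (assumes <vector> is available).
  inline void example_transfers()
  {
    constexpr size_t N = 1024;
    std::vector<float> host_in(N, 1.f), host_out(N);

    void * device_buf = CaloRecGPU::CUDA_Helpers::allocate(N * sizeof(float));

    CaloRecGPU::CUDA_Helpers::CPU_to_GPU(device_buf, host_in.data(), N * sizeof(float));
    // ... (run kernels on device_buf) ...
    CaloRecGPU::CUDA_Helpers::GPU_to_CPU(host_out.data(), device_buf, N * sizeof(float));

    CaloRecGPU::CUDA_Helpers::deallocate(device_buf);
  }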
  namespace Helpers
  {
    // ...

    /// Returns 2 to the power of exp.
    template <class Base = float, class Exp = int>
    inline constexpr Base compile_time_pow2(const Exp exp)
    {
      Base ret = 1;
      if (exp < 0)
        {
          for (Exp i = 0; i < -exp; ++i)
            {
              ret /= Base(2);
            }
        }
      else
        {
          for (Exp i = 0; i < exp; ++i)
            {
              ret *= Base(2);
            }
        }
      return ret;
    }
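    // Illustrative sketch: being constexpr, compile_time_pow2 can feed
    // static_asserts and array sizes; a negative exponent divides instead.
    static_assert(compile_time_pow2<int>(10) == 1024, "2 to the 10th");
    static_assert(compile_time_pow2<float>(-1) == 0.5f, "2 to the -1st");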
    /// Calculates a Pearson hash from the given number.
    template <class T>
    inline constexpr unsigned char Pearson_hash(const T number)
    {
      constexpr unsigned char initial_value = 42;

      // Instead of a look-up table, use table[i] = c_mult * i + c_add,
      // which is bijective modulo 256 for this choice of constants.
      constexpr unsigned char c_mult = 7;
      constexpr unsigned char c_add = 1;

      unsigned char ret = initial_value;

      for (unsigned int i = 0; i < sizeof(T); i += sizeof(unsigned char))
        {
          const unsigned char to_hash = number >> (i * CHAR_BIT);
          const unsigned char operand = ret ^ to_hash;
          ret = c_mult * operand + c_add;
        }

      return ret;
    }
    /// Calculates a 16-bit Pearson hash from the given number.
    template <class T>
    inline constexpr unsigned short Pearson_hash_16_bit(const T number)
    {
      constexpr unsigned short initial_value = 42754;

      constexpr unsigned short c_mult = 7;
      constexpr unsigned short c_add = 1;

      unsigned short ret = initial_value;

      for (unsigned int i = 0; i < sizeof(T); i += sizeof(unsigned short))
        {
          const unsigned short to_hash = number >> (i * CHAR_BIT);
          const unsigned short operand = ret ^ to_hash;
          ret = c_mult * operand + c_add;
        }

      return ret;
    }
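    // Illustrative sketch: both hashes are usable in constant expressions;
    // the 16-bit variant consumes the input two bytes per iteration.
    // example_h8 and example_h16 are hypothetical names.
    constexpr unsigned char  example_h8  = Pearson_hash(123456u);
    constexpr unsigned short example_h16 = Pearson_hash_16_bit(123456u);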
    namespace Constants
    {
#ifdef __cpp_lib_math_constants
      template <class T>
      inline constexpr T pi = std::numbers::pi_v<T>;

      template <class T>
      inline constexpr T sqrt2 = std::numbers::sqrt2_v<T>;
#else
      template <class T>
      inline constexpr T pi = T(3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679821480865132823066470938446095505822317253594081284811174502841027019385211055596446229489549303819644288109756659334461284756482337867831652712019091456485669234603486104543266482133936072602491412737245870066063155881748815209209628292540917153643678925903600113305305488204665213841469519415116094330572703657595919530921861173819326117931051185480744623799627495673518857527248912279381830119491298336733624L);

      template <class T>
      inline constexpr T sqrt2 = T(1.4142135623730950488016887242096980785696718753769480731766797379907324784621070388503875343276415727350138462309122970249248360558507372126441214970999358314132226659275055927557999505011527820605714701095599716059702745345968620147285174186408891986095523292304843087143214508397626036279952514079896872533965463318088296406206152583523950547457502877599617298355752203375318570113543746034084988471603868999706990048150305440277903164542478230684929369186215805784631115966687130130156185689872372352885092649L);
#endif

      // std::numbers does not provide 1/sqrt(2), so define it in all cases.
      template <class T>
      inline constexpr T inv_sqrt2 = T(0.70710678118654752440084436210484903928483593768847403658833986899536623923105351942519376716382078636750692311545614851246241802792536860632206074854996791570661133296375279637789997525057639103028573505477998580298513726729843100736425870932044459930477616461524215435716072541988130181399762570399484362669827316590441482031030762917619752737287514387998086491778761016876592850567718730170424942358019344998534950240751527201389515822712391153424646845931079028923155579833435650650780928449361861764425463243L);
    }
    /// Inverse of the error function: uses erfinvf on the device and a
    /// Newton-iteration fallback (cf. ROOT's TMath::ErfInverse) on the host.
    CUDA_HOS_DEV inline float erf_inv_wrapper(const float x)
    {
#ifdef __CUDA_ARCH__
      return erfinvf(x);
#else
      const int kMaxit = 50;
      const float kEps = 1e-14;
      const float kConst = 0.8862269254527579; // sqrt(pi)/2

      if (fabsf(x) <= kEps)
        {
          return kConst * x;
        }

      // Newton iterations
      float erfi, derfi, y0, y1, dy0, dy1;

      if (fabsf(x) < 1.0f)
        {
          erfi = kConst * fabsf(x);
          y0 = erff(0.9f * erfi);
          for (int iter = 0; iter < kMaxit; iter++)
            {
              y1 = 1. - erfc(erfi);
              dy1 = fabsf(x) - y1;
              if (fabsf(dy1) < kEps)
                {
                  return (x < 0.f) ? -erfi : erfi;
                }
              // ... (secant-style update of erfi via dy0, derfi; elided)
              if (fabsf(derfi / erfi) < kEps)
                {
                  return (x < 0.f) ? -erfi : erfi;
                }
            }
        }
      return 0.f; // did not converge
#endif
    }
    /// Shifts b by multiples of 2 pi so that it ends up within pi of a.
    CUDA_HOS_DEV inline float regularize_angle(const float b, const float a = 0.f)
    {
      const float diff = b - a;
      const float divi = (fabsf(diff) - Helpers::Constants::pi<float>) / (2 * Helpers::Constants::pi<float>);
      return b - ceilf(divi) * ((b > a + Helpers::Constants::pi<float>) - (b < a - Helpers::Constants::pi<float>)) * 2 * Helpers::Constants::pi<float>;
    }

    CUDA_HOS_DEV inline double regularize_angle(const double b, const double a = 0.)
    {
      const double diff = b - a;
      const double divi = (fabs(diff) - Helpers::Constants::pi<double>) / (2 * Helpers::Constants::pi<double>);
      return b - ceil(divi) * ((b > a + Helpers::Constants::pi<double>) - (b < a - Helpers::Constants::pi<double>)) * 2 * Helpers::Constants::pi<double>;
    }
    /// Returns the difference x - y, wrapped to lie within pi of zero.
    template <class T>
    CUDA_HOS_DEV inline T angular_difference(const T x, const T y)
    {
      return regularize_angle(x - y, T(0));
    }
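    // Illustrative sketch: the wrap-around at the +/- pi discontinuity is
    // handled, so two nearly opposite phases differ by a small angle rather
    // than by ~2 pi. example_angles is a hypothetical name.
    inline void example_angles()
    {
      const float d = angular_difference(3.1f, -3.1f);
      // d is approximately -0.083f (i.e. 6.2 - 2 * pi), not 6.2f
      (void) d;
    }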
    /// Returns the pseudo-rapidity of the point with coordinates (x, y, z).
    CUDA_HOS_DEV inline float eta_from_coordinates(const float x, const float y, const float z)
    {
      const float rho2 = x * x + y * y;
      if (rho2 > 0)
        {
          const float m = sqrtf(rho2 + z * z);
          return 0.5 * logf((m + z) / (m - z));
        }
      else
        {
          // Degenerate case on the beam axis: saturate at a large value.
          constexpr float s_etaMax = 22756.0;
          return z + ((z > 0) - (z < 0)) * s_etaMax;
        }
    }

    CUDA_HOS_DEV inline double eta_from_coordinates(const double x, const double y, const double z)
    {
      const double rho2 = x * x + y * y;
      if (rho2 > 0)
        {
          const double m = sqrt(rho2 + z * z);
          return 0.5 * log((m + z) / (m - z));
        }
      else
        {
          constexpr double s_etaMax = 22756.0;
          return z + ((z > 0) - (z < 0)) * s_etaMax;
        }
    }
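    // Illustrative sketch: eta = 0.5 * ln((|r| + z) / (|r| - z)) = asinh(z / rho),
    // so a point at 45 degrees to the beam axis has eta = asinh(1).
    // example_eta is a hypothetical name.
    inline void example_eta()
    {
      const double eta = eta_from_coordinates(1., 0., 1.);
      // eta is approximately 0.88137
      (void) eta;
    }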
    namespace MemoryContext
    {
      struct CPU
      {
        constexpr static char const * name = "CPU";
      };

      struct CUDAGPU
      {
        constexpr static char const * name = "CUDA GPU";
      };

      struct CUDAPinnedCPU
      {
        constexpr static char const * name = "CUDA Pinned CPU";
      };
    }
    /// Handles allocation of a type T, using indexer as the integer type to
    /// indicate sizes.
    template <class T, class indexer>
    class MemoryManagement
    {
      private:

        template <class C1, class C2, class dummy = void> struct copy_helper;

        // ... (copy_helper specializations for the several context pairs elided)

        template <class C1, class C2, class dummy = void> struct move_helper;

        template <class C1, class C2, class dummy> struct move_helper
        {
          // ... (generic move: copy across contexts, then free the source)
        };

      public:
        /// Allocates size objects of type T in the given memory Context.
        template <class Context>
        static inline T * allocate(const indexer size)
        {
          T * ret;
          // ... (context-specific allocation elided in this listing)
#if CALORECGPU_HELPERS_DEBUG
          std::cerr << "ALLOCATED " << size << " x " << sizeof(T) << " in " << Context::name << ": " << ret << std::endl;
#endif
          return ret;
        }

        template <class Context>
        static inline void deallocate(T *& arr)
        {
          // ... (context-specific deallocation elided)
#if CALORECGPU_HELPERS_DEBUG
          std::cerr << "DEALLOCATED in " << Context::name << ": " << arr << std::endl;
#endif
          arr = nullptr;
        }

        /// Copies sz objects from source (in SourceContext) to dest (in DestContext).
        template <class DestContext, class SourceContext>
        static inline void copy(T * dest, const T * const source, const indexer sz)
        {
          // ... (delegates to the appropriate copy_helper)
#if CALORECGPU_HELPERS_DEBUG
          // ... (debug printout elided)
#endif
        }

        /// Moves sz objects from source (in SourceContext) to dest (in
        /// DestContext), invalidating the source.
        template <class DestContext, class SourceContext>
        static inline void move(T *& dest, T *& source, const indexer sz)
        {
#if CALORECGPU_HELPERS_DEBUG
          // ... (debug printout elided)
#endif
          // ... (delegates to the appropriate move_helper)
          deallocate<SourceContext>(source);
#if CALORECGPU_HELPERS_DEBUG
          std::cerr << " | " << source << " to " << dest << std::endl;
#endif
        }
    };
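    // Illustrative usage sketch (hypothetical sizes; template spelling taken
    // from the member reference below):
    //   using MM = MemoryManagement<float, unsigned int>;
    //   float * gpu_arr = MM::allocate<MemoryContext::CUDAGPU>(256);
    //   float * cpu_arr = MM::allocate<MemoryContext::CPU>(256);
    //   MM::copy<MemoryContext::CUDAGPU, MemoryContext::CPU>(gpu_arr, cpu_arr, 256);
    //   MM::deallocate<MemoryContext::CUDAGPU>(gpu_arr);
    //   MM::deallocate<MemoryContext::CPU>(cpu_arr);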
    /// Holds a run-time amount of objects of type T, measuring sizes with
    /// indexer, in memory context Context; hold_arrays selects ownership.
    template <class T, class indexer, class Context, bool hold_arrays = true>
    class SimpleContainer;

    // Owning specialization.
    template <class T, class indexer, class Context>
    class SimpleContainer<T, indexer, Context, true>
    {
      // ... (m_array, m_size, basic constructors and accessors elided;
      //      the full list of members is in the reference below)

      inline void resize(const indexer new_size)
      {
        if (new_size == 0)
          {
            // ... (deallocate and reset)
          }
        else if (new_size != m_size)
          {
            // ... (reallocate, moving the previous contents over)
          }
      }

      // Converting copy/move constructors and assignment operators across
      // indexers, contexts and ownership (several overloads, one pair shown):

      template <class other_indexer, class other_context, bool other_hold>
      SimpleContainer(const SimpleContainer<T, other_indexer, other_context, other_hold> & other)
      {
        // ... (allocate in this context and copy across)
      }

      template <class other_indexer, class other_context>
      SimpleContainer & operator=(const SimpleContainer<T, other_indexer, other_context, true> & other)
      {
        resize(other.size());
        // ... (copy the contents across contexts)
        return *this;
      }

      // ... (the move overloads additionally set m_size = other.m_size and
      //      take over the source array where the contexts allow it)
      /// Writes the size and contents to the stream, separated by separator.
      template <class stream, class str = std::basic_string<typename stream::char_type> >
      void textual_output(stream & s, const str & separator = " ") const
      {
        if constexpr (std::is_same_v<Context, MemoryContext::CPU>)
          {
            s << m_size << separator;
            for (indexer i = 0; i < m_size - 1; ++i)
              {
                s << m_array[i] << separator;
              }
            s << m_array[m_size - 1];
          }
        else
          {
            // Not in CPU-addressable memory: round-trip through a CPU copy.
            SimpleContainer<T, indexer, MemoryContext::CPU, true> other(*this);
            other.textual_output(s, separator);
          }
      }

      /// Reads the size and contents back from the stream.
      template <class stream>
      void textual_input(stream & s)
      {
        if constexpr (std::is_same_v<Context, MemoryContext::CPU>)
          {
            indexer new_size;
            s >> new_size >> std::ws;
            if (s.fail())
              {
                std::cerr << "FAILED READING " << this << "!" << std::endl;
                // ...
              }
            resize(new_size);
            for (indexer i = 0; i < m_size - 1; ++i)
              {
                s >> m_array[i] >> std::ws;
              }
            s >> m_array[m_size - 1];
          }
        else
          {
            // ... (read into a CPU container, then copy into this context)
          }
      }
      /// Writes the size and contents to the stream in binary form.
      template <class stream>
      void binary_output(stream & s) const
      {
        if constexpr (std::is_same_v<Context, MemoryContext::CPU>)
          {
            s.write((char *) &m_size, sizeof(indexer));
            for (indexer i = 0; i < m_size; ++i)
              {
                s.write((char *) (m_array + i), sizeof(T));
              }
          }
        else
          {
            // ... (round-trip through a CPU copy, as in textual_output)
          }
      }

      /// Reads the size and contents back from a binary stream.
      template <class stream>
      void binary_input(stream & s)
      {
        if constexpr (std::is_same_v<Context, MemoryContext::CPU>)
          {
            indexer new_size;
            s.read((char *) &new_size, sizeof(indexer));
            if (s.fail())
              {
                std::cerr << "FAILED READING " << this << "!" << std::endl;
                // ...
              }
            resize(new_size);
            for (indexer i = 0; i < m_size; ++i)
              {
                s.read((char *) (m_array + i), sizeof(T));
              }
          }
        else
          {
            // ...
          }
      }
    };
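    // Illustrative round-trip sketch (CPU context, hypothetical element count):
    //   SimpleContainer<float, unsigned int, MemoryContext::CPU, true> cont(4);
    //   std::stringstream ss;
    //   cont.textual_output(ss);
    //   SimpleContainer<float, unsigned int, MemoryContext::CPU, true> back;
    //   back.textual_input(ss);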
    // Non-owning specialization: just views an array held elsewhere.
    template <class T, class indexer, class Context>
    class SimpleContainer<T, indexer, Context, false>
    {
      // ...

      template <class other_indexer, bool other_hold>
      CUDA_HOS_DEV SimpleContainer(SimpleContainer<T, other_indexer, Context, other_hold> & other) :
        m_size(other.m_size),
        m_array(other.m_array)
      {
      }

      // ...

      SimpleContainer & operator=(const SimpleContainer & other)
      {
        m_array = other.m_array;
        m_size = other.m_size;
        return *this;
      }

      template <class other_indexer, bool other_hold>
      CUDA_HOS_DEV SimpleContainer & operator=(SimpleContainer<T, other_indexer, Context, other_hold> & other)
      {
        m_size = other.m_size;
        m_array = other.m_array;
        return *this;
      }

      // ...
    };
    // Convenience aliases (indexer defaults to unsigned int):
    template <class T, class indexer = unsigned int>
    using CPU_array = SimpleContainer<T, indexer, MemoryContext::CPU, true>;

    template <class T, class indexer = unsigned int>
    using CUDA_array = SimpleContainer<T, indexer, MemoryContext::CUDAGPU, true>;

    template <class T, class indexer = unsigned int>
    using CUDA_kernel_array = SimpleContainer<T, indexer, MemoryContext::CUDAGPU, false>;
    /// Holds one object of type T in memory context Context;
    /// hold_object selects ownership.
    template <class T, class Context, bool hold_object = true>
    class SimpleHolder;

    // Owning specialization.
    template <class T, class Context>
    class SimpleHolder<T, Context, true>
    {
      // ... (m_object and basic constructors elided)

      CUDA_HOS_DEV bool valid() const
      {
        return m_object != nullptr;
      }

      // ... (internal allocation helper: only allocates when m_object == nullptr)

      /// Constructs, allocating the held object only if really_allocate is true.
      SimpleHolder(const bool really_allocate)
      {
        if (really_allocate)
          {
            // ...
          }
      }
      // Converting constructors and assignments from holders of compatible
      // types in other contexts (constrained via elided enable_if machinery):

      template <class X, class other_context, bool other_hold, class /* constraints elided */ = void>
      SimpleHolder(const SimpleHolder<X, other_context, other_hold> & other)
      {
        // ...
      }

      template <class X, class other_context, class /* constraints elided */ = void>
      SimpleHolder(SimpleHolder<X, other_context, true> && other)
      {
        // ...
      }

      // ... (the corresponding operator= overloads follow the same pattern)

      /// Writes a validity flag (0 or 1) and, if valid, the object itself.
      template <class stream, class str = std::basic_string<typename stream::char_type> >
      void textual_output(stream & s, const str & separator = " ") const
      {
        if constexpr (std::is_same_v<Context, MemoryContext::CPU>)
          {
            if (m_object == nullptr)
              {
                s << 0;
              }
            else
              {
                s << 1 << separator << (*m_object);
              }
          }
        else
          {
            SimpleHolder<T, MemoryContext::CPU, true> other(*this);
            other.textual_output(s, separator);
          }
      }

      template <class stream>
      void textual_input(stream & s)
      {
        bool is_valid;
        s >> is_valid >> std::ws;
        if (s.fail())
          {
            std::cerr << "FAILED READING " << this << "!" << std::endl;
            // ...
          }
        if (is_valid)
          {
            // ... (allocate if needed and read the object)
          }
      }

      /// Writes the object in binary form (no-op when invalid).
      template <class stream>
      void binary_output(stream & s) const
      {
        if (m_object == nullptr)
          {
            return;
          }
        s.write((char *) m_object, sizeof(T));
      }

      template <class stream>
      void binary_input(stream & s)
      {
        // ... (allocate if needed)
        s.read((char *) m_object, sizeof(T));
      }
    };
    // Non-owning specialization: views an object held elsewhere.
    template <class T, class Context>
    class SimpleHolder<T, Context, false>
    {
      // ...

      CUDA_HOS_DEV bool valid() const
      {
        return m_object != nullptr;
      }

      // ...

      template <class X, bool other_hold, class /* constraints elided */ = void>
      CUDA_HOS_DEV SimpleHolder(SimpleHolder<X, Context, other_hold> & other)
      {
        m_object = other.m_object;
      }

      template <class X, bool other_hold, class /* constraints elided */ = void>
      CUDA_HOS_DEV SimpleHolder & operator=(SimpleHolder<X, Context, other_hold> & other)
      {
        m_object = other.m_object;
        return *this;
      }
    };
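    // Illustrative sketch: an owning holder allocates (or not) up front and can
    // be checked for validity before use; the non-owning specialization can
    // then alias it, e.g. to hand the object to device code.
    //   SimpleHolder<int, MemoryContext::CPU, true> owner(true);
    //   if (owner.valid()) { /* ... use the held int ... */ }
    //   SimpleHolder<int, MemoryContext::CPU, false> view(owner);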
    /// Manages objects of type T in a thread-safe way, ensuring that there is
    /// an object available for each separate thread.
    template <class T>
    class separate_thread_holder
    {
      // ... (m_held, m_thread_equivs, m_mutex; see the member reference below)

      // (adding an object: under a std::unique_lock on m_mutex,
      //  m_held.emplace_back(std::make_unique<T>()); details elided)

      // (thread-to-object lookups take a std::shared_lock on m_mutex)

      /// Grows or shrinks the pool of held objects.
      void resize(const size_t new_size)
      {
        std::unique_lock<std::shared_mutex> lock(m_mutex);
        if (new_size < m_held.size())
          {
            // ... (drop the excess objects)
          }
        else if (new_size > m_held.size())
          {
            const size_t to_add = new_size - m_held.size();
            for (size_t i = 0; i < to_add; ++i)
              {
                m_held.emplace_back(std::make_unique<T>());
              }
          }
      }

      /// Calls f(object, args...) on every held object.
      template <class F, class ... Args>
      void operate_on_all(F && f, Args && ... args)
      {
        std::unique_lock<std::shared_mutex> lock(m_mutex);
        for (auto & obj : m_held)
          {
            f(*obj, std::forward<Args>(args)...);
          }
      }

      // ... (get_for_thread, available_size and filled_size read under a
      //      std::shared_lock, checking whether the thread's slot is still
      //      invalid_id; elided)
    };
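    // Illustrative usage sketch (Workspace and n_threads are hypothetical; the
    // exact thread-assignment semantics live in the members above):
    //   separate_thread_holder<Workspace> holder;
    //   holder.resize(n_threads);             // pre-allocate one object per thread
    //   Workspace & ws = holder.get_for_thread();
    //   holder.operate_on_all([](Workspace & w) { /* e.g. reset w */ });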
    /// Accessor that reserves one of a separate_thread_holder's objects for the
    /// current thread and releases it again on destruction.
    template <class T>
    class separate_thread_accessor
    {
      // ... (holds separate_thread_holder<T> & m_sth; acquires on construction)

      public:

        ~separate_thread_accessor()
        {
          m_sth.release_one();
        }

        // ... (an explicit early-release path also ends in m_sth.release_one())
    };

    /// Possibly holds an object of type T in its internal buffer.
    template <class T>
    class maybe_allocate
    {
      // ... (internal buffer and m_object pointer elided)

      public:

        template <class ... Args>
        maybe_allocate(const bool allocate, Args && ... args)
        {
          // ... (construct in the internal buffer only if allocate is true)
        }

        maybe_allocate & operator=(const maybe_allocate & other)
        {
          (*m_object) = other.get();
          return *this;
        }

        // ... (the move counterpart likewise assigns (*m_object) = other.get())

        operator const T & () const
        {
          return *m_object;
        }

        // ...
    };
  }
}

#endif // CALORECGPU_HELPERS_H
Member reference (tooltips from the generated documentation):

maybe_allocate(maybe_allocate &&other)
void CPU_to_GPU_async(void *dest, const void *const source, const size_t num, CUDAStreamPtrHolder stream={})
Copies num bytes from source in CPU memory to dest in GPU memory, asynchronously.
maybe_allocate(const bool allocate, T &&t)
maybe_allocate & operator=(const maybe_allocate &other)
Handles allocation of a type T, using indexer as the integer type to indicate sizes.
void optimize_block_and_grid_size(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size according to cudaOccupancyMaxPotentialBlockSize.
bool supports_cooperative_launches()
static void copy(T *dest, const T *const source, const indexer sz)
SimpleHolder(SimpleHolder &&other)
void GPU_to_CPU_async(void *dest, const void *const source, const size_t num, CUDAStreamPtrHolder stream={})
Copies num bytes from source in GPU memory to dest in CPU memory, asynchronously.
SimpleContainer(const SimpleContainer< T, other_indexer, other_context, other_hold > &other)
static T * allocate(const indexer size)
void optimize_block_and_grid_size_for_cooperative_launch(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size for a cooperative launch.
static void deallocate(T *&arr)
void CPU_to_GPU(void *dest, const void *const source, const size_t num)
Copies num bytes from source in CPU memory to dest in GPU memory.
CUDA_HOS_DEV bool valid() const
maybe_allocate(const maybe_allocate &other)
void binary_output(stream &s) const
void resize(const size_t new_size)
void textual_output(stream &s, const str &separator=" ") const
SimpleHolder(const bool really_allocate)
Holds one object of type T in memory context Context.
constexpr unsigned short Pearson_hash_16_bit(const T number)
Calculates a 16-bit Pearson hash from the given number.
constexpr int int_ceil_div(const int num, const int denom)
Returns the ceiling of num/denom, with proper rounding.
separate_thread_accessor(separate_thread_holder< T > &s)
void GPU_synchronize(CUDAStreamPtrHolder stream={})
Synchronizes the stream.
void deallocate_pinned(void *address)
Deallocates address in CPU pinned memory.
SimpleHolder(const X &other_v)
std::vector< std::unique_ptr< T > > m_held
SimpleContainer(SimpleContainer &&other)
const T * operator->() const
void operate_on_all(F &&f, Args &&... args)
CUDA_HOS_DEV SimpleContainer(T *other_array, const indexer sz)
void textual_input(stream &s)
SimpleContainer(SimpleContainer< T, other_indexer, other_context, true > &&other)
maybe_allocate(const bool allocate, Args &&... args)
SimpleContainer(const indexer sz)
SimpleHolder(SimpleHolder< X, other_context, true > &&other)
void binary_input(stream &s)
constexpr int int_floor_div(const int num, const int denom)
Returns the floor of num/denom, with proper rounding.
void deallocate(void *address)
Deallocates address in GPU memory.
constexpr unsigned char Pearson_hash(const T number)
Calculates a Pearson hash from the given number.
constexpr static char const * name
size_t filled_size() const
std::shared_mutex m_mutex
CUDA_HOS_DEV indexer size() const
std::vector< typename std::thread::id > m_thread_equivs
void GPU_to_GPU(void *dest, const void *const source, const size_t num)
Copies num bytes from source to dest, both in GPU memory.
void GPU_to_CPU(void *dest, const void *const source, const size_t num)
Copies num bytes from source in GPU memory to dest in CPU memory.
SimpleHolder(const SimpleHolder< X, other_context, other_hold > &other)
bool supports_dynamic_parallelism()
CUDA_HOS_DEV SimpleHolder()
CUDA_HOS_DEV SimpleContainer(const SimpleContainer< T, other_indexer, Context, other_hold > &other)
Holds a run-time amount of objects of type T, measuring sizes with indexer, in memory context Context.
SimpleContainer(T *other_array, const indexer sz)
void * allocate(const size_t num)
Allocates and returns the address of num bytes from GPU memory.
void * allocate_pinned(const size_t num)
Allocates and returns the address of num bytes from CPU pinned memory.
Manages objects of type T in a thread-safe way, ensuring that there's an object available for each separate thread.
T & get_for_thread() const
CUDA_HOS_DEV SimpleContainer()
static void move(T *&dest, T *&source, const indexer sz)
separate_thread_accessor(separate_thread_holder< T > &s, T *&ptr)
void GPU_to_GPU_async(void *dest, const void *const source, const size_t num, CUDAStreamPtrHolder stream={})
Copies num bytes from source to dest, both in GPU memory, asynchronously.
CUDA_HOS_DEV SimpleHolder(const SimpleHolder< X, Context, other_hold > &other)
SimpleContainer(const SimpleContainer &other)
SimpleHolder(const SimpleHolder &other)
maybe_allocate(const bool allocate, const T &t)
Possibly holds an object in its internal buffer.
CUDAStreamPtrHolder()=default
constexpr Base compile_time_pow2(const Exp exp)
Returns 2 to the power of exp.
CUDAStreamPtrHolder(T *p)
size_t available_size() const
separate_thread_holder< T > & m_sth
~separate_thread_accessor()
CUDA_HOS_DEV SimpleHolder(X *other_p)
void resize(const indexer new_size)