|
ATLAS Offline Software
|
Go to the documentation of this file.
7 #ifndef CALORECGPU_HELPERS_H
8 #define CALORECGPU_HELPERS_H
11 #include <type_traits>
25 #if __cpp_lib_math_constants
37 #ifndef CUDA_AVAILABLE
40 #define CUDA_AVAILABLE 1
42 #define CUDA_AVAILABLE 1
44 #define CUDA_AVAILABLE 1
46 #define CUDA_AVAILABLE 0
53 #define CUDA_HOS_DEV __host__ __device__
63 if (
code != cudaSuccess)
65 printf(
"CUDA error: %s (%s %d)\n", cudaGetErrorString(
code),
file,
line);
81 #define CUDA_ERRCHECK(...) CUDA_ERRCHECK_HELPER(__VA_ARGS__, true)
83 #define CUDA_ERRCHECK_HELPER(ans, ...) do { ::CaloRecGPU::CUDA_gpu_assert((ans), __FILE__, __LINE__, CUDA_ERRCHECK_GET_FIRST(__VA_ARGS__, true) ); } while(0)
84 #define CUDA_ERRCHECK_GET_FIRST(x, ...) x
87 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 350
88 #if CUDART_VERSION >= 12000
89 #define CUDA_CAN_USE_TAIL_LAUNCH 1
91 #define CUDA_CAN_USE_TAIL_LAUNCH 0
93 #elif defined(__CUDA_ARCH__)
94 #error "CUDA compute capability at least 3.5 is needed so we can have dynamic parallelism!"
101 #define CUDA_ERRCHECK(...)
105 namespace CUDA_Helpers
112 template <
class T = const
void>
113 constexpr
operator T * ()
const
118 constexpr
operator bool()
const
120 return ptr !=
nullptr;
225 template <
class Base =
float,
class Exp =
int>
231 for (Exp
i = 0;
i < -
exp; ++
i)
238 for (Exp
i = 0;
i <
exp; ++
i)
255 constexpr
unsigned char initial_value = 42;
258 constexpr
unsigned char c_mult = 7;
259 constexpr
unsigned char c_add = 1;
264 unsigned char ret = initial_value;
266 for (
unsigned int i = 0;
i <
sizeof(T);
i +=
sizeof(
unsigned char))
268 const unsigned char to_hash =
number >> (
i * CHAR_BIT);
282 constexpr
unsigned short initial_value = 42754;
285 constexpr
unsigned short c_mult = 7;
286 constexpr
unsigned short c_add = 1;
291 unsigned short ret = initial_value;
293 for (
unsigned int i = 0;
i <
sizeof(T);
i +=
sizeof(
unsigned short))
295 const unsigned short to_hash =
number >> (
i * CHAR_BIT);
307 #ifdef __cpp_lib_math_constants
309 inline constexpr T
pi = std::numbers::pi_v<T>;
312 inline constexpr T
sqrt2 = std::numbers::sqrt2_v<T>;
315 inline constexpr T
pi = T(3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679821480865132823066470938446095505822317253594081284811174502841027019385211055596446229489549303819644288109756659334461284756482337867831652712019091456485669234603486104543266482133936072602491412737245870066063155881748815209209628292540917153643678925903600113305305488204665213841469519415116094330572703657595919530921861173819326117931051185480744623799627495673518857527248912279381830119491298336733624L);
318 inline constexpr T
sqrt2 = T(1.4142135623730950488016887242096980785696718753769480731766797379907324784621070388503875343276415727350138462309122970249248360558507372126441214970999358314132226659275055927557999505011527820605714701095599716059702745345968620147285174186408891986095523292304843087143214508397626036279952514079896872533965463318088296406206152583523950547457502877599617298355752203375318570113543746034084988471603868999706990048150305440277903164542478230684929369186215805784631115966687130130156185689872372352885092649L);
322 inline constexpr T
inv_sqrt2 = T(0.70710678118654752440084436210484903928483593768847403658833986899536623923105351942519376716382078636750692311545614851246241802792536860632206074854996791570661133296375279637789997525057639103028573505477998580298513726729843100736425870932044459930477616461524215435716072541988130181399762570399484362669827316590441482031030762917619752737287514387998086491778761016876592850567718730170424942358019344998534950240751527201389515822712391153424646845931079028923155579833435650650780928449361861764425463243L);
328 float erf_inv_wrapper (
const float x)
338 float kConst = 0.8862269254527579;
346 float erfi, derfi, y0,
y1, dy0, dy1;
349 erfi = kConst * fabsf(
x);
350 y0 = erff(0.9
f * erfi);
352 for (
int iter = 0; iter < kMaxit; iter++)
354 y1 = 1. - erfc(erfi);
356 if (fabsf(dy1) < kEps)
371 if (fabsf(derfi / erfi) < kEps)
389 float regularize_angle(
const float b,
const float a = 0.
f)
394 const float divi = (fabsf(
diff) - Helpers::Constants::pi<float>) / (2 * Helpers::Constants::pi<float>);
395 return b - ceilf(divi) * ((
b >
a + Helpers::Constants::pi<float>) - (
b <
a - Helpers::Constants::pi<float>)) * 2 * Helpers::Constants::pi<float>;
399 double regularize_angle(
const double b,
const double a = 0.)
404 const float divi = (fabs(
diff) - Helpers::Constants::pi<double>) / (2 * Helpers::Constants::pi<double>);
405 return b - ceil(divi) * ((
b >
a + Helpers::Constants::pi<double>) - (
b <
a - Helpers::Constants::pi<double>)) * 2 * Helpers::Constants::pi<double>;
410 T angular_difference(
const T
x,
const T
y)
412 return regularize_angle(
x -
y,
T(0));
423 float eta_from_coordinates(
const float x,
const float y,
const float z)
429 const float m = sqrtf(
rho2 +
z *
z);
430 return 0.5 * logf((
m +
z) / (
m -
z));
434 constexpr
float s_etaMax = 22756.0;
435 return z + ((
z > 0) - (
z < 0)) * s_etaMax;
440 double eta_from_coordinates(
const double x,
const double y,
const double z)
446 const double m = sqrt(
rho2 +
z *
z);
447 return 0.5 *
log((
m +
z) / (
m -
z));
451 constexpr
double s_etaMax = 22756.0;
452 return z + ((
z > 0) - (
z < 0)) * s_etaMax;
457 namespace MemoryContext
461 constexpr
static char const *
name =
"CPU";
465 constexpr
static char const *
name =
"CUDA GPU";
469 constexpr
static char const *
name =
"CUDA Pinned CPU";
474 template <
class T,
class indexer>
521 template <
class C1,
class C2,
class dummy =
void>
struct copy_helper;
525 static inline void copy (T *
dest,
const T *
const source,
const indexer
sz)
527 std::memcpy(
dest, source,
sizeof(T) *
sz);
533 static inline void copy (T *
dest,
const T *
const source,
const indexer
sz)
541 static inline void copy (T *
dest,
const T *
const source,
const indexer
sz)
549 static inline void copy (T *
dest,
const T *
const source,
const indexer
sz)
557 static inline void copy (T *
dest,
const T *
const source,
const indexer
sz)
559 std::memcpy(
dest, source,
sizeof(T) *
sz);
565 static inline void copy (T *
dest,
const T *
const source,
const indexer
sz)
567 std::memcpy(
dest, source,
sizeof(T) *
sz);
573 static inline void copy (T *
dest,
const T *
const source,
const indexer
sz)
575 std::memcpy(
dest, source,
sizeof(T) *
sz);
581 static inline void copy (T *
dest,
const T *
const source,
const indexer
sz)
589 static inline void copy (T *
dest,
const T *
const source,
const indexer
sz)
596 template <
class C1,
class C2,
class dummy =
void>
struct move_helper;
598 template <
class C1,
class C2,
class dummy>
struct move_helper
600 inline static void move(T *&
dest, T *& source,
const indexer
sz)
610 inline static void move(T *&
dest, T *& source,
const indexer)
619 template <
class Context>
static inline T *
allocate(
const indexer
size)
626 #if CALORECGPU_HELPERS_DEBUG
627 std::cerr <<
"ALLOCATED " <<
size <<
" x " <<
sizeof(T) <<
" in " <<
Context::name <<
": " <<
ret << std::endl;
633 template <
class Context>
static inline void deallocate(T *& arr)
642 #if CALORECGPU_HELPERS_DEBUG
643 std::cerr <<
"DEALLOCATED in " <<
Context::name <<
": " << arr << std::endl;
650 template <
class DestContext,
class SourceContext>
651 static inline void copy(T *
dest,
const T *
const source,
const indexer
sz)
653 if (
sz > 0 && source !=
nullptr)
657 #if CALORECGPU_HELPERS_DEBUG
668 template <
class DestContext,
class SourceContext>
669 static inline void move(T *&
dest, T *& source,
const indexer
sz)
671 #if CALORECGPU_HELPERS_DEBUG
674 if (
sz > 0 && source !=
nullptr)
681 deallocate<SourceContext>(source);
683 #if CALORECGPU_HELPERS_DEBUG
684 std::cerr <<
" | " << source <<
" to " <<
dest << std::endl;
696 template <
class T,
class indexer,
class Context,
bool hold_arrays = true>
699 template <
class T,
class indexer,
class Context>
733 inline void resize(
const indexer new_size)
739 else if (new_size != m_size)
771 m_size =
other.m_size;
778 m_size =
other.m_size;
784 template <
class other_indexer,
class other_context,
bool other_hold>
787 m_size =
other.m_size;
792 template <
class other_indexer,
class other_context>
795 m_size =
other.m_size;
809 resize(
other.size());
825 m_size =
other.m_size;
832 template <
class other_indexer,
class other_context,
bool other_hold>
835 resize(
other.m_size);
840 template <
class other_indexer,
class other_context>
845 m_size =
other.m_size;
866 template <
class stream,
class str = std::basic_
string<
typename stream::
char_type> >
871 s << m_size << separator;
872 for (indexer
i = 0;
i < m_size - 1; ++
i)
874 s << m_array[
i] << separator;
876 s << m_array[m_size - 1];
881 other.textual_output(
s, separator);
885 template <
class stream>
891 s >> new_size >> std::ws;
895 std::cerr <<
"FAILED READING " <<
this <<
"!" << std::endl;
899 for (indexer
i = 0;
i < m_size - 1; ++
i)
904 s >> m_array[m_size - 1];
914 template <
class stream>
919 s.write((
char *) &m_size,
sizeof(indexer));
920 for (indexer
i = 0;
i < m_size; ++
i)
922 s.write((
char *) (m_array +
i),
sizeof(
T));
932 template <
class stream>
938 s.read((
char *) &new_size,
sizeof(indexer));
942 std::cerr <<
"FAILED READING " <<
this <<
"!" << std::endl;
946 for (indexer
i = 0;
i < m_size; ++
i)
948 s.read((
char *) (m_array +
i),
sizeof(
T));
961 template <
class T,
class indexer,
class Context>
1004 template <
class other_indexer,
bool other_hold>
1009 m_size =
other.m_size;
1010 m_array =
other.m_array;
1023 m_array =
other.m_array;
1024 m_size =
other.m_size;
1029 template <
class other_indexer,
bool other_hold>
1034 m_size =
other.m_size;
1035 m_array =
other.m_array;
1051 template <
class T,
class indexer =
unsigned int>
1055 template <
class T,
class indexer =
unsigned int>
1059 template <
class T,
class indexer =
unsigned int>
1068 template <
class T,
class Context,
bool hold_
object = true>
1071 template <
class T,
class Context>
1108 return m_object !=
nullptr;
1118 if (m_object ==
nullptr)
1130 if (really_allocate)
1173 template <
class X,
class other_context,
bool other_hold,
1194 template <
class X,
class other_context,
1217 template <
class X,
class other_context,
bool other_hold,
1243 template <
class X,
class other_context,
1271 template <
class stream,
class str = std::basic_
string<
typename stream::
char_type> >
1276 if (m_object ==
nullptr)
1282 s << 1 << separator << (*m_object);
1288 other.textual_output(
s, separator);
1292 template <
class stream>
1298 s >> is_valid >> std::ws;
1302 std::cerr <<
"FAILED READING " <<
this <<
"!" << std::endl;
1323 template <
class stream>
1326 if (m_object ==
nullptr)
1332 s.write((
char *) m_object,
sizeof(
T));
1341 template <
class stream>
1347 s.read((
char *) m_object,
sizeof(
T));
1359 template <
class T,
class Context>
1396 return m_object !=
nullptr;
1416 template <
class X,
bool other_hold,
1422 m_object =
other.m_object;
1425 template <
class X,
bool other_hold,
1431 m_object =
other.m_object;
1495 std::lock_guard<std::mutex> lock_guard(
m_mutex);
1506 m_held.emplace_back(std::make_unique<T>());
1529 std::lock_guard<std::mutex> lock_guard(
m_mutex);
1543 std::lock_guard<std::mutex> lock_guard(
m_mutex);
1544 if (new_size <
m_held.size())
1549 else if (new_size >
m_held.size())
1551 const size_t to_add = new_size -
m_held.size();
1553 for (
size_t i = 0;
i < to_add; ++
i)
1555 m_held.emplace_back(std::make_unique<T>());
1561 template <
class F,
class ...
Args>
1564 std::lock_guard<std::mutex> lock_guard(
m_mutex);
1567 f(*
obj, std::forward<Args>(
args)...);
1582 if (
id == invalid_id)
1621 m_sth.release_one();
1629 m_sth.release_one();
1670 template <
class ...
Args>
1694 (*m_object) =
other.get();
1711 (*m_object) =
other.get();
1769 operator const T & ()
const
1778 #endif // CALORECGPU_HELPERS_H
maybe_allocate(maybe_allocate &&other)
void CPU_to_GPU_async(void *dest, const void *const source, const size_t num, CUDAStreamPtrHolder stream={})
Copies num bytes from source in CPU memory to dest in GPU memory, asynchronously.
maybe_allocate(const bool allocate, T &&t)
maybe_allocate & operator=(const maybe_allocate &other)
! Handles allocation of a type T, using indexer as the integer type to indicate sizes.
void optimize_block_and_grid_size(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size according to cudaOccupancyMaxPotentialBlockSize.
bool supports_cooperative_launches()
static void copy(T *dest, const T *const source, const indexer sz)
static void copy(T *dest, const T *const source, const indexer sz)
SimpleHolder(SimpleHolder &&other)
void GPU_to_CPU_async(void *dest, const void *const source, const size_t num, CUDAStreamPtrHolder stream={})
Copies num bytes from source in GPU memory to dest in CPU memory, asynchronously.
SimpleContainer(const SimpleContainer< T, other_indexer, other_context, other_hold > &other)
static T * allocate(const indexer size)
void optimize_block_and_grid_size_for_cooperative_launch(void *func, int &block_size, int &grid_size, const int dynamic_memory=0, const int block_size_limit=0)
Optimizes block and grid size for a cooperative launch.
static void deallocate(T *&arr)
!
void CPU_to_GPU(void *dest, const void *const source, const size_t num)
Copies num bytes from source in CPU memory to dest in GPU memory.
CUDA_HOS_DEV bool valid() const
maybe_allocate(const maybe_allocate &other)
void binary_output(stream &s) const
void resize(const size_t new_size)
void textual_output(stream &s, const str &separator=" ") const
SimpleHolder(const bool really_allocate)
static T * allocate(const indexer size)
!
Holds one objects of type \T in memory context Context.
constexpr unsigned short Pearson_hash_16_bit(const T number)
Calculates a 16-bit Pearson hash from @ number.
static T * allocate(const indexer size)
constexpr int int_ceil_div(const int num, const int denom)
Returns the ceiling of num/denom, with proper rounding.
separate_thread_accessor(separate_thread_holder< T > &s)
CUDA_HOS_DEV bool valid() const
void GPU_synchronize(CUDAStreamPtrHolder stream={})
Synchronizes the stream.
void deallocate_pinned(void *address)
Deallocates address in CPU pinned memory.
SimpleHolder(const X &other_v)
std::vector< std::unique_ptr< T > > m_held
SimpleContainer(SimpleContainer &&other)
const T * operator->() const
void operate_on_all(F &&f, Args &&... args)
CUDA_HOS_DEV SimpleContainer(T *other_array, const indexer sz)
void textual_input(stream &s)
SimpleContainer(SimpleContainer< T, other_indexer, other_context, true > &&other)
static void deallocate(T *&arr)
static void deallocate(T *&arr)
static T * allocate(const indexer size)
bool const RAWDATA *ch2 const
static void deallocate(T *&arr)
maybe_allocate(const bool allocate, Args &&... args)
SimpleContainer(const indexer sz)
static void copy(T *dest, const T *const source, const indexer sz)
SimpleHolder(SimpleHolder< X, other_context, true > &&other)
void binary_input(stream &s)
void binary_output(stream &s) const
constexpr int int_floor_div(const int num, const int denom)
Returns the floor of num/denom, with proper rounding.
void deallocate(void *address)
Deallocates address in GPU memory.
static void move(T *&dest, T *&source, const indexer)
void textual_input(stream &s)
constexpr unsigned char Pearson_hash(const T number)
Calculates a Pearson hash from @ number.
constexpr static char const * name
size_t filled_size() const
SiLocalPosition operator*(const SiLocalPosition &position, const double factor)
static void copy(T *dest, const T *const source, const indexer sz)
constexpr static char const * name
CUDA_HOS_DEV indexer size() const
void textual_output(stream &s, const str &separator=" ") const
void binary_input(stream &s)
std::vector< typename std::thread::id > m_thread_equivs
void GPU_to_GPU(void *dest, const void *const source, const size_t num)
Copies num bytes from source to dest, both in GPU memory.
constexpr static char const * name
void GPU_to_CPU(void *dest, const void *const source, const size_t num)
Copies num bytse from source in GPU memory to dest in CPU memory.
SimpleHolder(const SimpleHolder< X, other_context, other_hold > &other)
bool supports_dynamic_parallelism()
CUDA_HOS_DEV SimpleHolder()
CUDA_HOS_DEV SimpleContainer(const SimpleContainer< T, other_indexer, Context, other_hold > &other)
static void copy(T *dest, const T *const source, const indexer sz)
Holds a run-time amount of objects of type \T, measuring sizes with indexer, in memory context Contex...
static void copy(T *dest, const T *const source, const indexer sz)
SimpleContainer(T *other_array, const indexer sz)
static void copy(T *dest, const T *const source, const indexer sz)
CUDA_HOS_DEV indexer size() const
void * allocate(const size_t num)
Allocates and returns the address of num bytes from GPU memory.
void * allocate_pinned(const size_t num)
Allocates and returns the address of num bytes from CPU pinned memory.
static void copy(T *dest, const T *const source, const indexer sz)
Manages objects of type T in a thread-safe way, ensuring that there's an object available for each se...
T & get_for_thread() const
CUDA_HOS_DEV SimpleContainer()
static void move(T *&dest, T *&source, const indexer sz)
!
separate_thread_accessor(separate_thread_holder< T > &s, T *&ptr)
void GPU_to_GPU_async(void *dest, const void *const source, const size_t num, CUDAStreamPtrHolder stream={})
Copies num bytes from source to dest, both in GPU memory, asynchronously.
CUDA_HOS_DEV SimpleHolder(const SimpleHolder< X, Context, other_hold > &other)
SimpleContainer(const SimpleContainer &other)
SimpleHolder(const SimpleHolder &other)
static void copy(T *dest, const T *const source, const indexer sz)
!
maybe_allocate(const bool allocate, const T &t)
Possibly holds an object in its internal buffer.
CUDAStreamPtrHolder()=default
boost::variant< nil, double, unsigned int, bool, std::string, boost::recursive_wrapper< unaryexpr_ >, boost::recursive_wrapper< expression > > operand
constexpr Base compile_time_pow2(const Exp exp)
Returns 2 to the power of exp.
static void copy(T *dest, const T *const source, const indexer sz)
static void move(T *&dest, T *&source, const indexer sz)
CUDAStreamPtrHolder(T *p)
size_t available_size() const
setBGCode setTAP setLVL2ErrorBits bool
separate_thread_holder< T > & m_sth
~separate_thread_accessor()
CUDA_HOS_DEV SimpleHolder(X *other_p)
void resize(const indexer new_size)