7#ifndef CALORECGPU_GPUCLUSTERINFOANDMOMENTSCALCULATOR_CUDA_H
8#define CALORECGPU_GPUCLUSTERINFOANDMOMENTSCALCULATOR_CUDA_H
35 const float max_ab =
max( fabsf(a_orig), fabsf(b_orig) );
36 const float max_cd =
max( fabsf(c_orig), fabsf(d_orig) );
37 const float max_ef =
max( fabsf(e_orig), fabsf(f_orig) );
43 const float inv_scale = 1.0f /
scale;
44 a = a_orig * inv_scale;
45 b = b_orig * inv_scale;
46 c = c_orig * inv_scale;
47 d = d_orig * inv_scale;
48 e = e_orig * inv_scale;
49 f = f_orig * inv_scale;
83 const float beta = hypot(
d,
f);
85 const float inv_beta = 1.f / beta;
87 const float em_0_1 =
d * inv_beta;
88 const float em_0_2 =
f * inv_beta;
90 const float q_w_1 = 2 * em_0_1 *
e;
91 const float q_c_1 = fmaf(2 * em_0_1,
e, -q_w_1);
92 const float q_w_2 = em_0_2 *
c;
93 const float q_c_2 = fmaf(em_0_2,
c, -q_w_2);
94 const float q_w_3 = em_0_2 *
b;
95 const float q_c_3 = fmaf(em_0_2,
b, -q_w_3);
99 temp_diag[1] = fmaf( em_0_2, q,
b);
100 temp_diag[2] = fmaf(-em_0_2, q,
c);
102 temp_subdiag[0] = beta;
103 temp_subdiag[1] = fmaf(-em_0_1, q,
e);
105 temp_mat[0][0] = 1.f;
106 temp_mat[0][1] = 0.f;
107 temp_mat[0][2] = 0.f;
109 temp_mat[1][0] = 0.f;
110 temp_mat[1][1] = em_0_1;
111 temp_mat[1][2] = em_0_2;
113 temp_mat[2][0] = 0.f;
114 temp_mat[2][1] = em_0_2;
115 temp_mat[2][2] = -em_0_1;
121 float (&temp_diag)[3],
122 float (&temp_subdiag)[2],
123 float (&temp_mat)[3][3])
125 const float td = (temp_diag[end - 1] - temp_diag[end]) * 0.5f;
127 const float ee = temp_subdiag[end - 1];
129 float mu = temp_diag[end];
137 const float ee_2 = ee * ee;
139 const float h = hypot(td, ee);
141 const float factor = td +
h * ((td >= 0.f) - (td < 0.f));
145 mu -= ee / (factor / ee);
153 float x = temp_diag[start] - mu;
154 float z = temp_subdiag[start];
156 for (
int k = start; k < end &&
z != 0.f; ++k)
158 float givens_c, givens_s;
168 givens_s = (
z < 0.f) - (
z >= 0.f);
170 else if (fabsf(
x) >= fabsf(
z))
172 const float t =
z /
x;
173 const float u = hypot(1.f, fabsf(t)) * ((
x >= 0.f) - (
x < 0.f));
176 givens_s = -t * givens_c;
180 const float t =
x /
z;
181 const float u = hypot(1.f, fabsf(t)) * ((
z >= 0.f) - (
z < 0.f));
184 givens_c = -t * givens_s;
223 z = -givens_s * temp_subdiag[k + 1];
225 temp_subdiag[k + 1] *= givens_c;
231 for (
int i = 0; i < 3; ++i)
233 float & c_1 = temp_mat[k] [i];
234 float & c_2 = temp_mat[k + 1][i];
236 const float c_1_old = c_1;
237 const float c_2_old = c_2;
251 float (&temp_subdiag)[2],
252 float (&temp_mat)[3][3],
258 int start = 0, end = 2;
260 const float precision_inv = 1.f / epsilon;
264 for (
int i = start; i < end; ++i)
266 if (fabsf(temp_subdiag[i]) < near_zero)
268 temp_subdiag[i] = 0.f;
272 const float scaled_subdiag = precision_inv * temp_subdiag[i];
273 if (scaled_subdiag * scaled_subdiag <= fabsf(temp_diag[i]) + fabsf(temp_diag[i + 1]))
275 temp_subdiag[i] = 0.f;
280 while (end > 0 && temp_subdiag[end - 1] == 0.f)
292 if (iter_count > max_iter)
300 while (start > 0 && temp_subdiag[start - 1] != 0.f)
317 float (&eigenvectors)[3][3],
323 float temp_subdiag[2];
328 compute(eigenvalues, temp_subdiag, eigenvectors, near_zero, epsilon, max_iter);
330 eigenvalues[0] *=
scale;
331 eigenvalues[1] *=
scale;
332 eigenvalues[2] *=
scale;
354 const float max_ab =
max( fabsf(
a), fabsf(
b) );
355 const float max_cd =
max( fabsf(
c), fabsf(d_orig) );
356 const float max_ef =
max( fabsf(e_orig), fabsf(f_orig) );
362 const float inv_scale = 1.0f /
scale;
366 d = d_orig * inv_scale;
367 e = e_orig * inv_scale;
368 f = f_orig * inv_scale;
378 const float ab =
a *
b;
379 const float corr_ab = fmaf(
a,
b, -ab);
380 const float dd =
d *
d;
381 const float corr_dd = fmaf(
d,
d, -dd);
382 const float ac =
a *
c;
383 const float corr_ac = fmaf(
a,
c, -ac);
384 const float ff =
f *
f;
385 const float corr_ff = fmaf(
f,
f, -ff);
386 const float bc =
b *
c;
387 const float corr_bc = fmaf(
b,
c, -bc);
388 const float ee =
e *
e;
389 const float corr_ee = fmaf(
e,
e, -ee);
391 const float c_0 = fmaf(2 *
d,
f *
e,
397 const float c_1 =
CaloRecGPU::Helpers::sum_kahan_babushka_neumaier(ab, -dd, ac, -ff, bc, -ee, corr_ab, -corr_dd, corr_ac, -corr_ff, corr_bc, corr_ee);
401 constexpr float inv_3 = 1.f / 3.f;
403 const float c_2_over_3 = c_2 * inv_3;
405 const float a_over_3 =
max(fma(c_2, c_2_over_3, -c_1) * inv_3, 0.f);
407 const float half_b = 0.5f * (fma(c_2_over_3, fma(2.f * c_2_over_3, c_2_over_3, -c_1), c_0));
415 const float rho = sqrtf(a_over_3);
418 const float theta = atan2f(1.0f, rsqrtf(q) * half_b) * inv_3;
420 const float theta = atan2f(sqrtf(q), half_b) * inv_3;
424 float sin_theta, cos_theta;
425 sincosf(
theta, &sin_theta, &cos_theta);
427 const float sin_theta = sinf(
theta);
428 const float cos_theta = cosf(
theta);
431 const float sqrt_3 = sqrtf(3.f);
433 e_1 = fma(-rho, fma( sqrt_3, sin_theta, cos_theta), c_2_over_3);
434 e_2 = fma(-rho, fma(-sqrt_3, sin_theta, cos_theta), c_2_over_3);
435 e_3 = fma( rho, 2.0f * cos_theta, c_2_over_3);
442 const float diag[3] = {
a - eigenvalue,
447 float vec_1[3], vec_2[3];
450 float max_value = -1.f;
452 for (
int i = 0; i < 3; ++i)
454 const float this_diag = fabsf(diag[i]);
455 if (this_diag >= max_value)
458 max_value = this_diag;
464 representative[0] = diag[0];
465 representative[1] =
d;
466 representative[2] =
f;
476 else if (max_diag == 1)
478 representative[0] =
d;
479 representative[1] = diag[1];
480 representative[2] =
e;
493 representative[0] =
f;
494 representative[1] =
e;
495 representative[2] = diag[2];
511 const float norm_1 = rnorm3df(
res[0],
res[1],
res[2]);
512 const float norm_2 = rnorm3df(vec_1[0], vec_1[1], vec_1[2]);
514 const float norm_1 = 1.f / hypot(
res[0],
res[1],
res[2]);
515 const float norm_2 = 1.f / hypot(vec_1[0], vec_1[1], vec_1[2]);
518 if (norm_1 <= norm_2)
527 res[0] = vec_1[0] * norm_2;
528 res[1] = vec_1[1] * norm_2;
529 res[2] = vec_1[2] * norm_2;
542 if (e_3 - e_1 <= epsilon)
558 const float d_0 = e_3 - e_2;
559 const float d_1 = e_2 - e_1;
561 const float d_min =
min(d_0, d_1);
562 const float d_max =
max(d_0, d_1);
565 float first_e, second_e;
585#if USE_ORIGINAL_EIGEN
586 if (d_min <= 2 * epsilon * d1)
589 const float base_norm = rnorm3df(
res[j][0],
res[j][1],
res[j][2]);
591 const float base_norm = 1.f / hypot(
res[j][0],
res[j][1],
res[j][2]);
595 const float norm = base_norm / extra_factor;
602 if (d_min <= 2 * epsilon * d_max)
611 res[j][0] -=
res[k][0] * prod;
612 res[j][1] -=
res[k][1] * prod;
613 res[j][2] -=
res[k][2] * prod;
616 const float norm = rnorm3df(
res[j][0],
res[j][1],
res[j][2]);
618 const float norm = 1.f / hypot(
res[j][0],
res[j][1],
res[j][2]);
627 float extra_vector[3];
635 const float norm = rnorm3df(
res[1][0],
res[1][1],
res[1][2]);
637 const float norm = 1.f / hypot(
res[1][0],
res[1][1],
res[1][2]);
653 get_eigenvectors(eigenvectors, eigenvalues[0], eigenvalues[1], eigenvalues[2], epsilon);
655 if (rescale_and_reshift_values)
657 eigenvalues[0] = fmaf(eigenvalues[0],
scale,
shift);
658 eigenvalues[1] = fmaf(eigenvalues[1],
scale,
shift);
659 eigenvalues[2] = fmaf(eigenvalues[2],
scale,
shift);
699 const bool synchronize =
false,
Scalar theta() const
theta method
std::pair< std::vector< unsigned int >, bool > res
Header file for AthHistogramAlgorithm.
Holds CPU and GPU versions of the geometry and cell noise information, which are assumed to be consta...
Holds the mutable per-event information (clusters and cells) and provides utilities to convert betwee...
Interface for GPU kernel size optimization (allowing adjustment of kernel sizes to the properties of ...
SimpleHolder< T, MemoryContext::CUDAGPU, true > CUDA_object
Holds an object of type T in CUDA GPU memory.
static CUDA_HOS_DEV float product_sum_cornea_harrison_tang(const float a, const float b, const float c, const float d)
static CUDA_HOS_DEV float corrected_dot_product(const float a_1, const float a_2, const float a_3, const float b_1, const float b_2, const float b_3)
SimpleHolder< T, MemoryContext::CPU, true > CPU_object
Holds an object of type T in CPU memory.
static CUDA_HOS_DEV void corrected_cross_product(float(&res)[3], const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
CUDA_HOS_DEV float sum_kahan_babushka_neumaier(const Floats &... fs)
Adds a list of floats using the Kahan-Babushka-Neumaier algorithm for greater precision (at the cost ...
void register_kernels(IGPUKernelSizeOptimizer &optimizer)
constexpr unsigned int num_time_measurements
void calculateClusterPropertiesAndMoments(CaloRecGPU::EventDataHolder &holder, const CaloRecGPU::ConstantDataHolder &instance_data, const CMCOptionsHolder &options, const IGPUKernelSizeOptimizer &optimizer, size_t(×)[num_time_measurements], const bool synchronize=false, CaloRecGPU::CUDA_Helpers::CUDAStreamPtrHolder stream={})
void sendToGPU(const bool clear_CPU=false)
CaloRecGPU::Helpers::CUDA_object< ClusterMomentCalculationOptions > m_options_dev
CaloRecGPU::Helpers::CPU_object< ClusterMomentCalculationOptions > m_options
bool use_two_gaussian_noise
bool skip_invalid_clusters
CUDA_HOS_DEV RealSymmetricMatrixSolverIterative(const float a_orig, const float b_orig, const float c_orig, const float d_orig, const float e_orig, const float f_orig)
static constexpr float s_typical_near_zero
static constexpr float s_typical_tolerance
static constexpr float s_typical_epsilon
CUDA_HOS_DEV void get_solution(float(&eigenvalues)[3], float(&eigenvectors)[3][3], const float tolerance=s_typical_tolerance, const float near_zero=s_typical_near_zero, const float epsilon=s_typical_epsilon, const int max_iter=s_typical_max_iterations)
Get the full eigenvalues and eigenvectors for this matrix.
CUDA_HOS_DEV void tridiagonalize(float(&temp_diag)[3], float(&temp_subdiag)[2], float(&temp_mat)[3][3], const float tolerance=s_typical_tolerance)
static constexpr int s_typical_max_iterations
CUDA_HOS_DEV void compute(float(&temp_diag)[3], float(&temp_subdiag)[2], float(&temp_mat)[3][3], const float near_zero=s_typical_near_zero, const float epsilon=s_typical_epsilon, const int max_iter=s_typical_max_iterations)
CUDA_HOS_DEV void compute_iteration(const int start, const int end, float(&temp_diag)[3], float(&temp_subdiag)[2], float(&temp_mat)[3][3])
CUDA_HOS_DEV void get_eigenvalues(float &e_1, float &e_2, float &e_3) const
Calculate shifted and scaled eigenvalues of the matrix, in ascending value.
CUDA_HOS_DEV void get_solution(float(&eigenvalues)[3], float(&eigenvectors)[3][3], bool rescale_and_reshift_values=true, const float epsilon=s_typical_epsilon)
Get the full eigenvalues and eigenvectors for this matrix.
CUDA_HOS_DEV void get_eigenvectors(float(&res)[3][3], const float e_1, const float e_2, const float e_3, const float epsilon=s_typical_epsilon) const
Calculate the eigenvectors of the matrix, using the (possibly unscaled) eigenvalues e_1,...
CUDA_HOS_DEV void extract_one(const float eigenvalue, float(&res)[3], float(&representative)[3]) const
static constexpr float s_typical_epsilon
CUDA_HOS_DEV RealSymmetricMatrixSolver(const float a_orig, const float b_orig, const float c_orig, const float d_orig, const float e_orig, const float f_orig)