7 #ifndef CALORECGPU_GPUCLUSTERINFOANDMOMENTSCALCULATOR_CUDA_H 
    8 #define CALORECGPU_GPUCLUSTERINFOANDMOMENTSCALCULATOR_CUDA_H 
   17 #include <type_traits> 
   23     const float t = 
sum + to_add;
 
   25     const bool test = fabsf(
sum) >= fabsf(to_add);
 
   27     const float opt_1 = (
sum - 
t) + to_add;
 
   28     const float opt_2 = (to_add - 
t) + 
sum;
 
   30     corr += (
test) * opt_1 + (!
test) * opt_2;
 
   39   template < 
class ... Floats, 
class disabler = std::enable_if_t < (std::is_same_v<std::decay_t<Floats>, 
float> && ...) >>
 
   58     const float w_1 = 
a * 
b;
 
   59     const float w_2 = 
c * 
d;
 
   61     const float e_1 = fmaf(
a, 
b, -w_1);
 
   62     const float e_2 = fmaf(
c, 
d, -w_2);
 
   70                               const float b_1, 
const float b_2, 
const float b_3)
 
   74     const float w_1 = a_1 * b_1;
 
   75     const float w_2 = a_2 * b_2;
 
   76     const float w_3 = a_3 * b_3;
 
   78     const float e_1 = fmaf(a_1, b_1, -w_1);
 
   79     const float e_2 = fmaf(a_2, b_2, -w_2);
 
   80     const float e_3 = fmaf(a_3, b_3, -w_3);
 
   93   void corrected_cross_product(
float (&
res)[3], 
const float a1, 
const float a2, 
const float a3, 
const float b1, 
const float b2, 
const float b3)
 
  117     return norm3df(r_1, r_2, r_3);
 
  119     return hypot(r_1, r_2, r_3);
 
  144       const float max_ab = 
max( fabsf(a_orig),  fabsf(b_orig) );
 
  145       const float max_cd = 
max( fabsf(c_orig),  fabsf(d_orig) );
 
  146       const float max_ef = 
max( fabsf(e_orig),  fabsf(f_orig) );
 
  152       const float inv_scale = 1.0f / 
scale;
 
  153       a = a_orig * inv_scale;
 
  154       b = b_orig * inv_scale;
 
  155       c = c_orig * inv_scale;
 
  156       d = d_orig * inv_scale;
 
  157       e = e_orig * inv_scale;
 
  158       f = f_orig * inv_scale;
 
  178           temp_mat[0][0] =    1.f;
 
  179           temp_mat[0][1] =    0.f;
 
  180           temp_mat[0][2] =    0.f;
 
  182           temp_mat[1][0] =    0.f;
 
  183           temp_mat[1][1] =    1.f;
 
  184           temp_mat[1][2] =    0.f;
 
  186           temp_mat[2][0] =    0.f;
 
  187           temp_mat[2][1] =    0.f;
 
  188           temp_mat[2][2] =    1.f;
 
  192           const float beta = hypot(
d, 
f);
 
  194           const float inv_beta = 1.f / 
beta;
 
  196           const float em_0_1 = 
d * inv_beta;
 
  197           const float em_0_2 = 
f * inv_beta;
 
  199           const float q_w_1 = 2 * em_0_1 * 
e;
 
  200           const float q_c_1 = fmaf(2 * em_0_1, 
e, -q_w_1);
 
  201           const float q_w_2 = em_0_2 * 
c;
 
  202           const float q_c_2 = fmaf(em_0_2, 
c, -q_w_2);
 
  203           const float q_w_3 = em_0_2 * 
b;
 
  204           const float q_c_3 = fmaf(em_0_2, 
b, -q_w_3);
 
  208           temp_diag[1] = fmaf( em_0_2, 
q, 
b);
 
  209           temp_diag[2] = fmaf(-em_0_2, 
q, 
c);
 
  211           temp_subdiag[0] = 
beta;
 
  212           temp_subdiag[1] = fmaf(-em_0_1, 
q, 
e);
 
  214           temp_mat[0][0] =    1.f;
 
  215           temp_mat[0][1] =    0.f;
 
  216           temp_mat[0][2] =    0.f;
 
  218           temp_mat[1][0] =    0.f;
 
  219           temp_mat[1][1] =  em_0_1;
 
  220           temp_mat[1][2] =  em_0_2;
 
  222           temp_mat[2][0] =    0.f;
 
  223           temp_mat[2][1] =  em_0_2;
 
  224           temp_mat[2][2] = -em_0_1;
 
  230                                         float (&temp_diag)[3],
 
  231                                         float (&temp_subdiag)[2],
 
  232                                         float (&temp_mat)[3][3])
 
  234       const float td = (temp_diag[
end - 1] - temp_diag[
end]) * 0.5
f;
 
  236       const float ee = temp_subdiag[
end - 1];
 
  238       float mu = temp_diag[
end];
 
  246           const float ee_2 = ee * ee;
 
  248           const float h = hypot(td, ee);
 
  250           const float factor = td + 
h * ((td >= 0.f) - (td < 0.
f));
 
  254               mu -= ee / (factor / ee);
 
  263       float z = temp_subdiag[
start];
 
  267           float givens_c, givens_s;
 
  277               givens_s = (
z < 0.f) - (
z >= 0.
f);
 
  279           else if (fabsf(
x) >= fabsf(
z))
 
  281               const float t = 
z / 
x;
 
  282               const float u = hypot(1.
f, fabsf(
t)) * ((
x >= 0.f) - (
x < 0.
f));
 
  285               givens_s = -
t * givens_c;
 
  289               const float t = 
x / 
z;
 
  290               const float u = hypot(1.
f, fabsf(
t)) * ((
z >= 0.f) - (
z < 0.
f));
 
  293               givens_c = -
t * givens_s;
 
  332               z = -givens_s * temp_subdiag[
k + 1];
 
  334               temp_subdiag[
k + 1] *= givens_c;
 
  340           for (
int i = 0; 
i < 3; ++
i)
 
  342               float & c_1 = temp_mat[
k]    [
i];
 
  343               float & c_2 = temp_mat[
k + 1][
i];
 
  345               const float c_1_old = c_1;
 
  346               const float c_2_old = c_2;
 
  360                               float (&temp_subdiag)[2],
 
  361                               float (&temp_mat)[3][3],
 
  369       const float precision_inv = 1.f / 
epsilon;
 
  375               if (fabsf(temp_subdiag[
i]) < near_zero)
 
  377                   temp_subdiag[
i] = 0.f;
 
  381                   const float scaled_subdiag = precision_inv * temp_subdiag[
i];
 
  382                   if (scaled_subdiag * scaled_subdiag <= fabsf(temp_diag[
i]) + fabsf(temp_diag[
i + 1]))
 
  384                       temp_subdiag[
i] = 0.f;
 
  389           while (
end > 0 && temp_subdiag[
end - 1] == 0.
f)
 
  401           if (iter_count > max_iter)
 
  403               printf(
"OUT OF ITERS! %d %d\n", 
start, 
end);
 
  409           while (
start > 0 && temp_subdiag[
start - 1] != 0.
f)
 
  426                                    float (&eigenvectors)[3][3],
 
  432       float temp_subdiag[2];
 
  437       compute(eigenvalues, temp_subdiag, eigenvectors, near_zero, 
epsilon, max_iter);
 
  439       eigenvalues[0] *= 
scale;
 
  440       eigenvalues[1] *= 
scale;
 
  441       eigenvalues[2] *= 
scale;
 
  463       const float max_ab = 
max( fabsf(
a),       fabsf(
b)      );
 
  464       const float max_cd = 
max( fabsf(
c),       fabsf(d_orig) );
 
  465       const float max_ef = 
max( fabsf(e_orig),  fabsf(f_orig) );
 
  471       const float inv_scale = 1.0f / 
scale;
 
  475       d = d_orig * inv_scale;
 
  476       e = e_orig * inv_scale;
 
  477       f = f_orig * inv_scale;
 
  487       const float ab      = 
a * 
b;
 
  488       const float corr_ab = fmaf(
a, 
b, -ab);
 
  489       const float dd      = 
d * 
d;
 
  490       const float corr_dd = fmaf(
d, 
d, -dd);
 
  491       const float ac      = 
a * 
c;
 
  492       const float corr_ac = fmaf(
a, 
c, -ac);
 
  493       const float ff      = 
f * 
f;
 
  494       const float corr_ff = fmaf(
f, 
f, -
ff);
 
  495       const float bc      = 
b * 
c;
 
  496       const float corr_bc = fmaf(
b, 
c, -bc);
 
  497       const float ee      = 
e * 
e;
 
  498       const float corr_ee = fmaf(
e, 
e, -ee);
 
  500       const float c_0 = fmaf(2 * 
d, 
f * 
e,
 
  506       const float c_1 = 
sum_kahan_babushka_neumaier(ab, -dd, ac, -
ff, bc, -ee, corr_ab, -corr_dd, corr_ac, -corr_ff, corr_bc, corr_ee);
 
  510       constexpr 
float inv_3 = 1.f / 3.f;
 
  512       const float c_2_over_3 = c_2 * inv_3;
 
  514       const float a_over_3 = 
max(fma(c_2, c_2_over_3, -c_1) * inv_3, 0.
f);
 
  516       const float half_b = 0.5f * (fma(c_2_over_3, fma(2.
f * c_2_over_3, c_2_over_3, -c_1), c_0));
 
  524       const float rho = sqrtf(a_over_3);
 
  527       const float theta = atan2f(1.0
f, rsqrtf(
q) * half_b) * inv_3;
 
  529       const float theta = atan2f(sqrtf(
q), half_b) * inv_3;
 
  533       float sin_theta, cos_theta;
 
  534       sincosf(
theta, &sin_theta, &cos_theta);
 
  536       const float sin_theta = sinf(
theta);
 
  537       const float cos_theta = cosf(
theta);
 
  540       const float sqrt_3 = sqrtf(3.
f);
 
  542       e_1 = fma(-
rho, fma( sqrt_3, sin_theta, cos_theta), c_2_over_3);
 
  543       e_2 = fma(-
rho, fma(-sqrt_3, sin_theta, cos_theta), c_2_over_3);
 
  544       e_3 = fma( 
rho, 2.0
f * cos_theta,                   c_2_over_3);
 
  551       const float diag[3] = { 
a - eigenvalue,
 
  556       float vec_1[3], vec_2[3];
 
  559       float max_value = -1.f;
 
  561       for (
int i = 0; 
i < 3; ++
i)
 
  563           const float this_diag = fabsf(diag[
i]);
 
  564           if (this_diag >= max_value)
 
  567               max_value = this_diag;
 
  573           representative[0] = diag[0];
 
  574           representative[1] = 
d;
 
  575           representative[2] = 
f;
 
  585       else if (max_diag == 1)
 
  587           representative[0] = 
d;
 
  588           representative[1] = diag[1];
 
  589           representative[2] = 
e;
 
  602           representative[0] = 
f;
 
  603           representative[1] = 
e;
 
  604           representative[2] = diag[2];
 
  620       const float norm_1 = rnorm3df(
res[0], 
res[1], 
res[2]);
 
  621       const float norm_2 = rnorm3df(vec_1[0], vec_1[1], vec_1[2]);
 
  623       const float norm_1 = 1.f / hypot(
res[0], 
res[1], 
res[2]);
 
  624       const float norm_2 = 1.f / hypot(vec_1[0], vec_1[1], vec_1[2]);
 
  627       if (norm_1 <= norm_2)
 
  636           res[0] = vec_1[0] * norm_2;
 
  637           res[1] = vec_1[1] * norm_2;
 
  638           res[2] = vec_1[2] * norm_2;
 
  667           const float d_0 = e_3 - e_2;
 
  668           const float d_1 = e_2 - e_1;
 
  670           const float d_min = 
min(d_0, d_1);
 
  671           const float d_max = 
max(d_0, d_1);
 
  674           float first_e, second_e;
 
  693           extract_one(first_e, 
res[
k], 
res[j]);
 
  694 #if USE_ORIGINAL_EIGEN 
  698               const float base_norm = rnorm3df(
res[j][0], 
res[j][1], 
res[j][2]);
 
  700               const float base_norm = 1.f / hypot(
res[j][0], 
res[j][1], 
res[j][2]);
 
  704               const float norm = base_norm / extra_factor;
 
  711           if (d_min <= 2 * 
epsilon * d_max)
 
  725               const float norm = rnorm3df(
res[j][0], 
res[j][1], 
res[j][2]);
 
  727               const float norm = 1.f / hypot(
res[j][0], 
res[j][1], 
res[j][2]);
 
  736               float extra_vector[3];
 
  738               extract_one(second_e, 
res[j], extra_vector);
 
  744           const float norm = rnorm3df(
res[1][0], 
res[1][1], 
res[1][2]);
 
  746           const float norm = 1.f / hypot(
res[1][0], 
res[1][1], 
res[1][2]);
 
  761       get_eigenvalues(eigenvalues[0], eigenvalues[1], eigenvalues[2]);
 
  762       get_eigenvectors(eigenvectors, eigenvalues[0], eigenvalues[1], eigenvalues[2], 
epsilon);
 
  764       if (rescale_and_reshift_values)
 
  766           eigenvalues[0] = fmaf(eigenvalues[0], 
scale, shift);
 
  767           eigenvalues[1] = fmaf(eigenvalues[1], 
scale, shift);
 
  768           eigenvalues[2] = fmaf(eigenvalues[2], 
scale, shift);
 
  793       m_options.allocate();
 
  808                                             const bool synchronize = 
false,
 
  810                                             const bool defer_instead_of_oversize = 
false);