7 #ifndef CALORECGPU_GPUCLUSTERINFOANDMOMENTSCALCULATOR_CUDA_H
8 #define CALORECGPU_GPUCLUSTERINFOANDMOMENTSCALCULATOR_CUDA_H
17 #include <type_traits>
23 const float t =
sum + to_add;
25 const bool test = fabsf(
sum) >= fabsf(to_add);
27 const float opt_1 = (
sum -
t) + to_add;
28 const float opt_2 = (to_add -
t) +
sum;
30 corr += (
test) * opt_1 + (!
test) * opt_2;
39 template <
class ... Floats,
class disabler = std::enable_if_t < (std::is_same_v<std::decay_t<Floats>,
float> && ...) >>
58 const float w_1 =
a *
b;
59 const float w_2 =
c *
d;
61 const float e_1 = fmaf(
a,
b, -w_1);
62 const float e_2 = fmaf(
c,
d, -w_2);
70 const float b_1,
const float b_2,
const float b_3)
74 const float w_1 = a_1 * b_1;
75 const float w_2 = a_2 * b_2;
76 const float w_3 = a_3 * b_3;
78 const float e_1 = fmaf(a_1, b_1, -w_1);
79 const float e_2 = fmaf(a_2, b_2, -w_2);
80 const float e_3 = fmaf(a_3, b_3, -w_3);
93 void corrected_cross_product(
float (&
res)[3],
const float a1,
const float a2,
const float a3,
const float b1,
const float b2,
const float b3)
117 return norm3df(r_1, r_2, r_3);
119 return hypot(r_1, r_2, r_3);
144 const float max_ab =
max( fabsf(a_orig), fabsf(b_orig) );
145 const float max_cd =
max( fabsf(c_orig), fabsf(d_orig) );
146 const float max_ef =
max( fabsf(e_orig), fabsf(f_orig) );
152 const float inv_scale = 1.0f /
scale;
153 a = a_orig * inv_scale;
154 b = b_orig * inv_scale;
155 c = c_orig * inv_scale;
156 d = d_orig * inv_scale;
157 e = e_orig * inv_scale;
158 f = f_orig * inv_scale;
178 temp_mat[0][0] = 1.f;
179 temp_mat[0][1] = 0.f;
180 temp_mat[0][2] = 0.f;
182 temp_mat[1][0] = 0.f;
183 temp_mat[1][1] = 1.f;
184 temp_mat[1][2] = 0.f;
186 temp_mat[2][0] = 0.f;
187 temp_mat[2][1] = 0.f;
188 temp_mat[2][2] = 1.f;
192 const float beta = hypot(
d,
f);
194 const float inv_beta = 1.f /
beta;
196 const float em_0_1 =
d * inv_beta;
197 const float em_0_2 =
f * inv_beta;
199 const float q_w_1 = 2 * em_0_1 *
e;
200 const float q_c_1 = fmaf(2 * em_0_1,
e, -q_w_1);
201 const float q_w_2 = em_0_2 *
c;
202 const float q_c_2 = fmaf(em_0_2,
c, -q_w_2);
203 const float q_w_3 = em_0_2 *
b;
204 const float q_c_3 = fmaf(em_0_2,
b, -q_w_3);
208 temp_diag[1] = fmaf( em_0_2,
q,
b);
209 temp_diag[2] = fmaf(-em_0_2,
q,
c);
211 temp_subdiag[0] =
beta;
212 temp_subdiag[1] = fmaf(-em_0_1,
q,
e);
214 temp_mat[0][0] = 1.f;
215 temp_mat[0][1] = 0.f;
216 temp_mat[0][2] = 0.f;
218 temp_mat[1][0] = 0.f;
219 temp_mat[1][1] = em_0_1;
220 temp_mat[1][2] = em_0_2;
222 temp_mat[2][0] = 0.f;
223 temp_mat[2][1] = em_0_2;
224 temp_mat[2][2] = -em_0_1;
230 float (&temp_diag)[3],
231 float (&temp_subdiag)[2],
232 float (&temp_mat)[3][3])
234 const float td = (temp_diag[
end - 1] - temp_diag[
end]) * 0.5
f;
236 const float ee = temp_subdiag[
end - 1];
238 float mu = temp_diag[
end];
246 const float ee_2 = ee * ee;
248 const float h = hypot(td, ee);
250 const float factor = td +
h * ((td >= 0.f) - (td < 0.
f));
254 mu -= ee / (factor / ee);
263 float z = temp_subdiag[
start];
267 float givens_c, givens_s;
277 givens_s = (
z < 0.f) - (
z >= 0.
f);
279 else if (fabsf(
x) >= fabsf(
z))
281 const float t =
z /
x;
282 const float u = hypot(1.
f, fabsf(
t)) * ((
x >= 0.f) - (
x < 0.
f));
285 givens_s = -
t * givens_c;
289 const float t =
x /
z;
290 const float u = hypot(1.
f, fabsf(
t)) * ((
z >= 0.f) - (
z < 0.
f));
293 givens_c = -
t * givens_s;
332 z = -givens_s * temp_subdiag[
k + 1];
334 temp_subdiag[
k + 1] *= givens_c;
340 for (
int i = 0;
i < 3; ++
i)
342 float & c_1 = temp_mat[
k] [
i];
343 float & c_2 = temp_mat[
k + 1][
i];
345 const float c_1_old = c_1;
346 const float c_2_old = c_2;
360 float (&temp_subdiag)[2],
361 float (&temp_mat)[3][3],
369 const float precision_inv = 1.f / epsilon;
375 if (fabsf(temp_subdiag[
i]) < near_zero)
377 temp_subdiag[
i] = 0.f;
381 const float scaled_subdiag = precision_inv * temp_subdiag[
i];
382 if (scaled_subdiag * scaled_subdiag <= fabsf(temp_diag[
i]) + fabsf(temp_diag[
i + 1]))
384 temp_subdiag[
i] = 0.f;
389 while (
end > 0 && temp_subdiag[
end - 1] == 0.
f)
401 if (iter_count > max_iter)
403 printf(
"OUT OF ITERS! %d %d\n",
start,
end);
409 while (
start > 0 && temp_subdiag[
start - 1] != 0.
f)
426 float (&eigenvectors)[3][3],
432 float temp_subdiag[2];
437 compute(eigenvalues, temp_subdiag, eigenvectors, near_zero, epsilon, max_iter);
439 eigenvalues[0] *=
scale;
440 eigenvalues[1] *=
scale;
441 eigenvalues[2] *=
scale;
463 const float max_ab =
max( fabsf(
a), fabsf(
b) );
464 const float max_cd =
max( fabsf(
c), fabsf(d_orig) );
465 const float max_ef =
max( fabsf(e_orig), fabsf(f_orig) );
471 const float inv_scale = 1.0f /
scale;
475 d = d_orig * inv_scale;
476 e = e_orig * inv_scale;
477 f = f_orig * inv_scale;
487 const float ab =
a *
b;
488 const float corr_ab = fmaf(
a,
b, -ab);
489 const float dd =
d *
d;
490 const float corr_dd = fmaf(
d,
d, -dd);
491 const float ac =
a *
c;
492 const float corr_ac = fmaf(
a,
c, -ac);
493 const float ff =
f *
f;
494 const float corr_ff = fmaf(
f,
f, -
ff);
495 const float bc =
b *
c;
496 const float corr_bc = fmaf(
b,
c, -bc);
497 const float ee =
e *
e;
498 const float corr_ee = fmaf(
e,
e, -ee);
500 const float c_0 = fmaf(2 *
d,
f *
e,
506 const float c_1 =
sum_kahan_babushka_neumaier(ab, -dd, ac, -
ff, bc, -ee, corr_ab, -corr_dd, corr_ac, -corr_ff, corr_bc, corr_ee);
510 constexpr
float inv_3 = 1.f / 3.f;
512 const float c_2_over_3 = c_2 * inv_3;
514 const float a_over_3 =
max(fma(c_2, c_2_over_3, -c_1) * inv_3, 0.
f);
516 const float half_b = 0.5f * (fma(c_2_over_3, fma(2.
f * c_2_over_3, c_2_over_3, -c_1), c_0));
524 const float rho = sqrtf(a_over_3);
527 const float theta = atan2f(1.0
f, rsqrtf(
q) * half_b) * inv_3;
529 const float theta = atan2f(sqrtf(
q), half_b) * inv_3;
533 float sin_theta, cos_theta;
534 sincosf(theta, &sin_theta, &cos_theta);
536 const float sin_theta = sinf(theta);
537 const float cos_theta = cosf(theta);
540 const float sqrt_3 = sqrtf(3.
f);
542 e_1 = fma(-
rho, fma( sqrt_3, sin_theta, cos_theta), c_2_over_3);
543 e_2 = fma(-
rho, fma(-sqrt_3, sin_theta, cos_theta), c_2_over_3);
544 e_3 = fma(
rho, 2.0
f * cos_theta, c_2_over_3);
551 const float diag[3] = {
a - eigenvalue,
556 float vec_1[3], vec_2[3];
559 float max_value = -1.f;
561 for (
int i = 0;
i < 3; ++
i)
563 const float this_diag = fabsf(diag[
i]);
564 if (this_diag >= max_value)
567 max_value = this_diag;
573 representative[0] = diag[0];
574 representative[1] =
d;
575 representative[2] =
f;
585 else if (max_diag == 1)
587 representative[0] =
d;
588 representative[1] = diag[1];
589 representative[2] =
e;
602 representative[0] =
f;
603 representative[1] =
e;
604 representative[2] = diag[2];
620 const float norm_1 = rnorm3df(
res[0],
res[1],
res[2]);
621 const float norm_2 = rnorm3df(vec_1[0], vec_1[1], vec_1[2]);
623 const float norm_1 = 1.f / hypot(
res[0],
res[1],
res[2]);
624 const float norm_2 = 1.f / hypot(vec_1[0], vec_1[1], vec_1[2]);
627 if (norm_1 <= norm_2)
636 res[0] = vec_1[0] * norm_2;
637 res[1] = vec_1[1] * norm_2;
638 res[2] = vec_1[2] * norm_2;
651 if (e_3 - e_1 <= epsilon)
667 const float d_0 = e_3 - e_2;
668 const float d_1 = e_2 - e_1;
670 const float d_min =
min(d_0, d_1);
671 const float d_max =
max(d_0, d_1);
674 float first_e, second_e;
693 extract_one(first_e,
res[
k],
res[j]);
694 #if USE_ORIGINAL_EIGEN
695 if (d_min <= 2 * epsilon *
d1)
698 const float base_norm = rnorm3df(
res[j][0],
res[j][1],
res[j][2]);
700 const float base_norm = 1.f / hypot(
res[j][0],
res[j][1],
res[j][2]);
704 const float norm = base_norm / extra_factor;
711 if (d_min <= 2 * epsilon * d_max)
725 const float norm = rnorm3df(
res[j][0],
res[j][1],
res[j][2]);
727 const float norm = 1.f / hypot(
res[j][0],
res[j][1],
res[j][2]);
736 float extra_vector[3];
738 extract_one(second_e,
res[j], extra_vector);
744 const float norm = rnorm3df(
res[1][0],
res[1][1],
res[1][2]);
746 const float norm = 1.f / hypot(
res[1][0],
res[1][1],
res[1][2]);
761 get_eigenvalues(eigenvalues[0], eigenvalues[1], eigenvalues[2]);
762 get_eigenvectors(eigenvectors, eigenvalues[0], eigenvalues[1], eigenvalues[2], epsilon);
764 if (rescale_and_reshift_values)
766 eigenvalues[0] = fmaf(eigenvalues[0],
scale, shift);
767 eigenvalues[1] = fmaf(eigenvalues[1],
scale, shift);
768 eigenvalues[2] = fmaf(eigenvalues[2],
scale, shift);
793 m_options.allocate();
808 const bool synchronize =
false,
810 const bool defer_instead_of_oversize =
false);