ATLAS Offline Software
Loading...
Searching...
No Matches
GPUClusterInfoAndMomentsCalculatorImpl.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
3//
4// Dear emacs, this is -*- c++ -*-
5//
6
7#ifndef CALORECGPU_GPUCLUSTERINFOANDMOMENTSCALCULATOR_CUDA_H
8#define CALORECGPU_GPUCLUSTERINFOANDMOMENTSCALCULATOR_CUDA_H
9
12#include "CaloRecGPU/Helpers.h"
13
15
16#include <cmath>
17#include <type_traits>
18
20{
21 inline CUDA_HOS_DEV void partial_kahan_babushka_neumaier_sum(const float & to_add, float & sum, float & corr)
22 {
23 const float t = sum + to_add;
24
25 const bool test = fabsf(sum) >= fabsf(to_add);
26
27 const float opt_1 = (sum - t) + to_add;
28 const float opt_2 = (to_add - t) + sum;
29
30 corr += (test) * opt_1 + (!test) * opt_2;
31
32 sum = t;
33 }
34
35
36 //There are some extra operations, yes,
37 //but we benefit from added precision
38 //that can compensate for not using doubles...
39 template < class ... Floats, class disabler = std::enable_if_t < (std::is_same_v<std::decay_t<Floats>, float> && ...) >>
40 CUDA_HOS_DEV float sum_kahan_babushka_neumaier(const Floats & ... fs)
41 {
42 float ret = 0.f;
43 float corr = 0.f;
44
45 (partial_kahan_babushka_neumaier_sum(fs, ret, corr), ...);
46
47 return ret + corr;
48 }
49
50 //Algorithm that calculates a * b + c * d with better precision using FMA,
51 //following "Error bounds on complex floating-point multiplication with an FMA"
52 //by Jeannerod et. al.
53 inline CUDA_HOS_DEV
54 float product_sum_cornea_harrison_tang(const float a, const float b, const float c, const float d)
55 {
56 using namespace std;
57
58 const float w_1 = a * b;
59 const float w_2 = c * d;
60
61 const float e_1 = fmaf(a, b, -w_1);
62 const float e_2 = fmaf(c, d, -w_2);
63
64 return sum_kahan_babushka_neumaier(w_1, w_2, e_1, e_2);
65 }
66
67 //Generalization of the Cornea-Harrison-Tang algorithm for dot products.
68 inline CUDA_HOS_DEV
69 float corrected_dot_product(const float a_1, const float a_2, const float a_3,
70 const float b_1, const float b_2, const float b_3)
71 {
72 using namespace std;
73
74 const float w_1 = a_1 * b_1;
75 const float w_2 = a_2 * b_2;
76 const float w_3 = a_3 * b_3;
77
78 const float e_1 = fmaf(a_1, b_1, -w_1);
79 const float e_2 = fmaf(a_2, b_2, -w_2);
80 const float e_3 = fmaf(a_3, b_3, -w_3);
81
82 return sum_kahan_babushka_neumaier(w_1, w_2, w_3, e_1, e_2, e_3);
83 }
84
85 inline CUDA_HOS_DEV
86 float corrected_dot_product(const float (&a)[3], const float (&b)[3])
87 {
88 return corrected_dot_product(a[0], a[1], a[2], b[0], b[1], b[2]);
89 }
90
91 //Cross product using the Cornea-Harrison-Tang algorithm
92 inline CUDA_HOS_DEV
93 void corrected_cross_product(float (&res)[3], const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
94 {
95 res[0] = product_sum_cornea_harrison_tang(a2, b3, -a3, b2);
96 res[1] = product_sum_cornea_harrison_tang(a3, b1, -a1, b3);
97 res[2] = product_sum_cornea_harrison_tang(a1, b2, -a2, b1);
98 }
99
100 inline CUDA_HOS_DEV
101 void corrected_cross_product(float (&res)[3], const float (&x)[3], const float (&y)[3])
102 {
103 corrected_cross_product(res, x[0], x[1], x[2], y[0], y[1], y[2]);
104 }
105
106 //Magnitude of a cross product using the generalization of the Cornea-Harrison-Tang algorithm
107 inline CUDA_HOS_DEV
108 float corrected_magn_cross_product(const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
109 {
110 using namespace std;
111
112 const float r_1 = product_sum_cornea_harrison_tang(a2, b3, -a3, b2);
113 const float r_2 = product_sum_cornea_harrison_tang(a3, b1, -a1, b3);
114 const float r_3 = product_sum_cornea_harrison_tang(a1, b2, -a2, b1);
115
116#ifdef __CUDA_ARCH__
117 return norm3df(r_1, r_2, r_3);
118#else
119 return hypot(r_1, r_2, r_3);
120#endif
121
122 }
123
124 inline CUDA_HOS_DEV
125 float corrected_magn_cross_product(const float (&x)[3], const float (&y)[3])
126 {
127 return corrected_magn_cross_product(x[0], x[1], x[2], y[0], y[1], y[2]);
128 }
129
131 //Following the Eigen implementation too...
132 {
133 float a, b, c, d, e, f, scale;
134 // +-- --+
135 // | a d f |
136 // | d b e |
137 // | f e c |
138 // +-- --+
139
140 CUDA_HOS_DEV RealSymmetricMatrixSolverIterative(const float a_orig, const float b_orig, const float c_orig, const float d_orig, const float e_orig, const float f_orig)
141 {
142 using namespace std;
143
144 const float max_ab = max( fabsf(a_orig), fabsf(b_orig) );
145 const float max_cd = max( fabsf(c_orig), fabsf(d_orig) );
146 const float max_ef = max( fabsf(e_orig), fabsf(f_orig) );
147 scale = max(max_ab, max(max_cd, max_ef) );
148 if (scale == 0.f)
149 {
150 scale = 1.f;
151 }
152 const float inv_scale = 1.0f / scale;
153 a = a_orig * inv_scale;
154 b = b_orig * inv_scale;
155 c = c_orig * inv_scale;
156 d = d_orig * inv_scale;
157 e = e_orig * inv_scale;
158 f = f_orig * inv_scale;
159 }
160
161 static constexpr float s_typical_tolerance = std::numeric_limits<float>::min();
162
163 //Uses temp_diag to store the diagonal
164 //and temp_mat to store the tridiagonalized form of the matrix
165 CUDA_HOS_DEV void tridiagonalize(float (&temp_diag)[3], float (&temp_subdiag)[2], float (&temp_mat)[3][3], const float tolerance = s_typical_tolerance)
166 {
167 using namespace std;
168
169 temp_diag[0] = a;
170 if (f * f <= tolerance)
171 {
172 temp_diag[1] = b;
173 temp_diag[2] = c;
174
175 temp_subdiag[0] = d;
176 temp_subdiag[1] = e;
177
178 temp_mat[0][0] = 1.f;
179 temp_mat[0][1] = 0.f;
180 temp_mat[0][2] = 0.f;
181
182 temp_mat[1][0] = 0.f;
183 temp_mat[1][1] = 1.f;
184 temp_mat[1][2] = 0.f;
185
186 temp_mat[2][0] = 0.f;
187 temp_mat[2][1] = 0.f;
188 temp_mat[2][2] = 1.f;
189 }
190 else
191 {
192 const float beta = hypot(d, f);
193
194 const float inv_beta = 1.f / beta;
195
196 const float em_0_1 = d * inv_beta;
197 const float em_0_2 = f * inv_beta;
198
199 const float q_w_1 = 2 * em_0_1 * e;
200 const float q_c_1 = fmaf(2 * em_0_1, e, -q_w_1);
201 const float q_w_2 = em_0_2 * c;
202 const float q_c_2 = fmaf(em_0_2, c, -q_w_2);
203 const float q_w_3 = em_0_2 * b;
204 const float q_c_3 = fmaf(em_0_2, b, -q_w_3);
205
206 const float q = sum_kahan_babushka_neumaier(q_w_1, q_w_2, -q_w_3, q_c_1, q_c_2, -q_c_3);
207
208 temp_diag[1] = fmaf( em_0_2, q, b);
209 temp_diag[2] = fmaf(-em_0_2, q, c);
210
211 temp_subdiag[0] = beta;
212 temp_subdiag[1] = fmaf(-em_0_1, q, e);
213
214 temp_mat[0][0] = 1.f;
215 temp_mat[0][1] = 0.f;
216 temp_mat[0][2] = 0.f;
217
218 temp_mat[1][0] = 0.f;
219 temp_mat[1][1] = em_0_1;
220 temp_mat[1][2] = em_0_2;
221
222 temp_mat[2][0] = 0.f;
223 temp_mat[2][1] = em_0_2;
224 temp_mat[2][2] = -em_0_1;
225 }
226 }
227
228 CUDA_HOS_DEV void compute_iteration(const int start,
229 const int end,
230 float (&temp_diag)[3],
231 float (&temp_subdiag)[2],
232 float (&temp_mat)[3][3])
233 {
234 const float td = (temp_diag[end - 1] - temp_diag[end]) * 0.5f;
235
236 const float ee = temp_subdiag[end - 1];
237
238 float mu = temp_diag[end];
239
240 if (td == 0.f)
241 {
242 mu -= fabsf(ee);
243 }
244 else if (ee != 0.f)
245 {
246 const float ee_2 = ee * ee;
247
248 const float h = hypot(td, ee);
249
250 const float factor = td + h * ((td >= 0.f) - (td < 0.f));
251
252 if (ee_2 == 0.f)
253 {
254 mu -= ee / (factor / ee);
255 }
256 else
257 {
258 mu -= ee_2 / factor;
259 }
260 }
261
262 float x = temp_diag[start] - mu;
263 float z = temp_subdiag[start];
264
265 for (int k = start; k < end && z != 0.f; ++k)
266 {
267 float givens_c, givens_s;
268
269 /*if (z == 0.f)
270 {
271 givens_c = ((x >= 0.f) - (x < 0.f));
272 givens_s = 0.f;
273 }
274 else */if (x == 0.f)
275 {
276 givens_c = 0.f;
277 givens_s = (z < 0.f) - (z >= 0.f);
278 }
279 else if (fabsf(x) >= fabsf(z))
280 {
281 const float t = z / x;
282 const float u = hypot(1.f, fabsf(t)) * ((x >= 0.f) - (x < 0.f));
283
284 givens_c = 1.f / u;
285 givens_s = -t * givens_c;
286 }
287 else
288 {
289 const float t = x / z;
290 const float u = hypot(1.f, fabsf(t)) * ((z >= 0.f) - (z < 0.f));
291
292 givens_s = -1.f / u;
293 givens_c = -t * givens_s;
294 }
295
296 const float sdk = product_sum_cornea_harrison_tang(givens_s,
297 temp_diag[k],
298 givens_c,
299 temp_subdiag[k]);
300
301 const float dkp1 = product_sum_cornea_harrison_tang(givens_s,
302 temp_subdiag[k],
303 givens_c,
304 temp_diag[k + 1]);
305
306 temp_diag[k] = product_sum_cornea_harrison_tang(givens_c,
308 temp_diag[k],
309 -givens_s,
310 temp_subdiag[k]),
311 -givens_s,
313 temp_subdiag[k],
314 -givens_s,
315 temp_diag[k + 1])
316 );
317 temp_diag[k + 1] = product_sum_cornea_harrison_tang(givens_s, sdk, givens_c, dkp1);
318 temp_subdiag[k] = product_sum_cornea_harrison_tang(givens_c, sdk, -givens_s, dkp1);
319
320 if (k > start)
321 {
322 temp_subdiag[k - 1] = product_sum_cornea_harrison_tang(givens_c,
323 temp_subdiag[k - 1],
324 -givens_s,
325 z);
326 }
327
328 x = temp_subdiag[k];
329
330 if (k < end - 1)
331 {
332 z = -givens_s * temp_subdiag[k + 1];
333
334 temp_subdiag[k + 1] *= givens_c;
335 }
336
337 //We could skip if (c, s) == (1, 0)
338 //Also, apply on the right means
339 //we have to consider -s instead of s.
340 for (int i = 0; i < 3; ++i)
341 {
342 float & c_1 = temp_mat[k] [i];
343 float & c_2 = temp_mat[k + 1][i];
344
345 const float c_1_old = c_1;
346 const float c_2_old = c_2;
347
348 c_1 = product_sum_cornea_harrison_tang(givens_c, c_1_old, -givens_s, c_2_old);
349 c_2 = product_sum_cornea_harrison_tang(givens_s, c_1_old, givens_c, c_2_old);
350 }
351 }
352 }
353
354
355 static constexpr int s_typical_max_iterations = 90;
356 static constexpr float s_typical_near_zero = std::numeric_limits<float>::min();
357 static constexpr float s_typical_epsilon = std::numeric_limits<float>::epsilon();
358
359 CUDA_HOS_DEV void compute(float (&temp_diag)[3],
360 float (&temp_subdiag)[2],
361 float (&temp_mat)[3][3],
362 const float near_zero = s_typical_near_zero,
363 const float epsilon = s_typical_epsilon,
364 const int max_iter = s_typical_max_iterations)
365 {
366 int iter_count = 0;
367 int start = 0, end = 2;
368
369 const float precision_inv = 1.f / epsilon;
370
371 while (end > 0)
372 {
373 for (int i = start; i < end; ++i)
374 {
375 if (fabsf(temp_subdiag[i]) < near_zero)
376 {
377 temp_subdiag[i] = 0.f;
378 }
379 else
380 {
381 const float scaled_subdiag = precision_inv * temp_subdiag[i];
382 if (scaled_subdiag * scaled_subdiag <= fabsf(temp_diag[i]) + fabsf(temp_diag[i + 1]))
383 {
384 temp_subdiag[i] = 0.f;
385 }
386 }
387 }
388
389 while (end > 0 && temp_subdiag[end - 1] == 0.f)
390 {
391 --end;
392 }
393
394 if (end <= 0)
395 {
396 break;
397 }
398
399 ++iter_count;
400
401 if (iter_count > max_iter)
402 {
403 printf("OUT OF ITERS! %d %d\n", start, end);
404 break;
405 }
406
407 start = end - 1;
408
409 while (start > 0 && temp_subdiag[start - 1] != 0.f)
410 {
411 --start;
412 }
413
414 compute_iteration(start, end, temp_diag, temp_subdiag, temp_mat);
415 }
416
417 //No need to sort eigenvalues and eigenvectors:
418 //we are going to check them all anyway.
419 }
420
425 CUDA_HOS_DEV void get_solution(float (&eigenvalues)[3],
426 float (&eigenvectors)[3][3],
427 const float tolerance = s_typical_tolerance,
428 const float near_zero = s_typical_near_zero,
429 const float epsilon = s_typical_epsilon,
430 const int max_iter = s_typical_max_iterations )
431 {
432 float temp_subdiag[2];
433
434 tridiagonalize(eigenvalues, temp_subdiag, eigenvectors, tolerance);
435 //eigenvalues and eigenvectors are used temporarily to store the diagonal and the matrix.
436
437 compute(eigenvalues, temp_subdiag, eigenvectors, near_zero, epsilon, max_iter);
438
439 eigenvalues[0] *= scale;
440 eigenvalues[1] *= scale;
441 eigenvalues[2] *= scale;
442 }
443 };
444
446//Taken from the Eigen implementation of direct_selfadjoint_eigenvalues
447 {
448 float a, b, c, d, e, f, shift, scale;
449 // +-- --+
450 // | a d f |
451 // | d b e |
452 // | f e c |
453 // +-- --+
454
455 CUDA_HOS_DEV RealSymmetricMatrixSolver(const float a_orig, const float b_orig, const float c_orig, const float d_orig, const float e_orig, const float f_orig)
456 {
457 using namespace std;
458
459 shift = sum_kahan_babushka_neumaier(a_orig, b_orig, c_orig) / 3.f;
460 a = a_orig - shift;
461 b = b_orig - shift;
462 c = c_orig - shift;
463 const float max_ab = max( fabsf(a), fabsf(b) );
464 const float max_cd = max( fabsf(c), fabsf(d_orig) );
465 const float max_ef = max( fabsf(e_orig), fabsf(f_orig) );
466 scale = max(max_ab, max(max_cd, max_ef) );
467 if (scale == 0.f)
468 {
469 scale = 1.f;
470 }
471 const float inv_scale = 1.0f / scale;
472 a *= inv_scale;
473 b *= inv_scale;
474 c *= inv_scale;
475 d = d_orig * inv_scale;
476 e = e_orig * inv_scale;
477 f = f_orig * inv_scale;
478 }
479
483 CUDA_HOS_DEV void get_eigenvalues(float & e_1, float & e_2, float & e_3) const
484 {
485 using namespace std;
486
487 const float ab = a * b;
488 const float corr_ab = fmaf(a, b, -ab);
489 const float dd = d * d;
490 const float corr_dd = fmaf(d, d, -dd);
491 const float ac = a * c;
492 const float corr_ac = fmaf(a, c, -ac);
493 const float ff = f * f;
494 const float corr_ff = fmaf(f, f, -ff);
495 const float bc = b * c;
496 const float corr_bc = fmaf(b, c, -bc);
497 const float ee = e * e;
498 const float corr_ee = fmaf(e, e, -ee);
499
500 const float c_0 = fmaf(2 * d, f * e,
503 //Consider using some other strategy to select the best pairs
504 //to minimize error here?
505
506 const float c_1 = sum_kahan_babushka_neumaier(ab, -dd, ac, -ff, bc, -ee, corr_ab, -corr_dd, corr_ac, -corr_ff, corr_bc, corr_ee);
507
508 const float c_2 = sum_kahan_babushka_neumaier(a, b, c);
509
510 constexpr float inv_3 = 1.f / 3.f;
511
512 const float c_2_over_3 = c_2 * inv_3;
513
514 const float a_over_3 = max(fma(c_2, c_2_over_3, -c_1) * inv_3, 0.f);
515
516 const float half_b = 0.5f * (fma(c_2_over_3, fma(2.f * c_2_over_3, c_2_over_3, -c_1), c_0));
517
518 const float q = max(product_sum_cornea_harrison_tang(a_over_3, a_over_3 * a_over_3, -half_b, half_b), 0.f);
519
520 //None of what we are doing here is the best choice.
521 //We would need to perform a proper, formal, numerical analysis
522 //to figure out all possible errors and so on.
523
524 const float rho = sqrtf(a_over_3);
525
526#ifdef __CUDA_ARCH__
527 const float theta = atan2f(1.0f, rsqrtf(q) * half_b) * inv_3;
528#else
529 const float theta = atan2f(sqrtf(q), half_b) * inv_3;
530#endif
531
532#ifdef __CUDA_ARCH__
533 float sin_theta, cos_theta;
534 sincosf(theta, &sin_theta, &cos_theta);
535#else
536 const float sin_theta = sinf(theta);
537 const float cos_theta = cosf(theta);
538#endif
539
540 const float sqrt_3 = sqrtf(3.f);
541
542 e_1 = fma(-rho, fma( sqrt_3, sin_theta, cos_theta), c_2_over_3);
543 e_2 = fma(-rho, fma(-sqrt_3, sin_theta, cos_theta), c_2_over_3);
544 e_3 = fma( rho, 2.0f * cos_theta, c_2_over_3);
545 }
546
547 CUDA_HOS_DEV void extract_one(const float eigenvalue, float (&res)[3], float (&representative)[3]) const
548 {
549 using namespace std;
550
551 const float diag[3] = { a - eigenvalue,
552 b - eigenvalue,
553 c - eigenvalue
554 };
555
556 float vec_1[3], vec_2[3];
557
558 int max_diag = -1;
559 float max_value = -1.f;
560
561 for (int i = 0; i < 3; ++i)
562 {
563 const float this_diag = fabsf(diag[i]);
564 if (this_diag >= max_value)
565 {
566 max_diag = i;
567 max_value = this_diag;
568 }
569 }
570
571 if (max_diag == 0)
572 {
573 representative[0] = diag[0];
574 representative[1] = d;
575 representative[2] = f;
576
577 vec_1[0] = d;
578 vec_1[1] = diag[1];
579 vec_1[2] = e;
580
581 vec_2[0] = f;
582 vec_2[1] = e;
583 vec_2[2] = diag[2];
584 }
585 else if (max_diag == 1)
586 {
587 representative[0] = d;
588 representative[1] = diag[1];
589 representative[2] = e;
590
591 vec_1[0] = f;
592 vec_1[1] = e;
593 vec_1[2] = diag[2];
594
595 vec_2[0] = diag[0];
596 vec_2[1] = d;
597 vec_2[2] = f;
598
599 }
600 else /*if (max_diag == 2)*/
601 {
602 representative[0] = f;
603 representative[1] = e;
604 representative[2] = diag[2];
605
606 vec_1[0] = diag[0];
607 vec_1[1] = d;
608 vec_1[2] = f;
609
610 vec_2[0] = d;
611 vec_2[1] = diag[1];
612 vec_2[2] = e;
613 }
614
615 corrected_cross_product(res, representative, vec_1);
616 corrected_cross_product(vec_1, representative, vec_2);
617 //Can safely override previous value...
618
619#ifdef __CUDA_ARCH__
620 const float norm_1 = rnorm3df(res[0], res[1], res[2]);
621 const float norm_2 = rnorm3df(vec_1[0], vec_1[1], vec_1[2]);
622#else
623 const float norm_1 = 1.f / hypot(res[0], res[1], res[2]);
624 const float norm_2 = 1.f / hypot(vec_1[0], vec_1[1], vec_1[2]);
625#endif
626
627 if (norm_1 <= norm_2)
628 //Greater magnitude -> multiply by a smaller value
629 {
630 res[0] *= norm_1;
631 res[1] *= norm_1;
632 res[2] *= norm_1;
633 }
634 else
635 {
636 res[0] = vec_1[0] * norm_2;
637 res[1] = vec_1[1] * norm_2;
638 res[2] = vec_1[2] * norm_2;
639 }
640 }
641
642 static constexpr float s_typical_epsilon = std::numeric_limits<float>::epsilon();
643
647 CUDA_HOS_DEV void get_eigenvectors(float (&res)[3][3], const float e_1, const float e_2, const float e_3, const float epsilon = s_typical_epsilon) const
648 {
649 using namespace std;
650
651 if (e_3 - e_1 <= epsilon)
652 {
653 res[0][0] = 1.f;
654 res[0][1] = 0.f;
655 res[0][2] = 0.f;
656
657 res[1][0] = 0.f;
658 res[1][1] = 1.f;
659 res[1][2] = 0.f;
660
661 res[2][0] = 0.f;
662 res[2][1] = 0.f;
663 res[2][2] = 1.f;
664 }
665 else
666 {
667 const float d_0 = e_3 - e_2;
668 const float d_1 = e_2 - e_1;
669
670 const float d_min = min(d_0, d_1);
671 const float d_max = max(d_0, d_1);
672
673 int k, j;
674 float first_e, second_e;
675
676 if (d_0 > d_1)
677 {
678 k = 2;
679 j = 0;
680
681 first_e = e_3;
682 second_e = e_1;
683 }
684 else
685 {
686 k = 0;
687 j = 2;
688
689 first_e = e_1;
690 second_e = e_3;
691 }
692
693 extract_one(first_e, res[k], res[j]);
694#if USE_ORIGINAL_EIGEN
695 if (d_min <= 2 * epsilon * d1)
696 {
697#ifdef __CUDA_ARCH__
698 const float base_norm = rnorm3df(res[j][0], res[j][1], res[j][2]);
699#else
700 const float base_norm = 1.f / hypot(res[j][0], res[j][1], res[j][2]);
701#endif
702 const float extra_factor = 1.f - corrected_dot_product(res[k], res[j]);
703
704 const float norm = base_norm / extra_factor;
705
706 res[j][0] *= norm;
707 res[j][1] *= norm;
708 res[j][2] *= norm;
709 }
710#else
711 if (d_min <= 2 * epsilon * d_max)
712 //Eigen has d0 <= 2 * eps <= d1, but d0 is overwritten while d1 isn't...
713 {
714 //Eigen speaks about ortho-normalization
715 //but does eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l))*eivecs.col(l)
716 //which... does not ortho-normalize anything.
717
718 const float prod = corrected_dot_product(res[k], res[j]);
719
720 res[j][0] -= res[k][0] * prod;
721 res[j][1] -= res[k][1] * prod;
722 res[j][2] -= res[k][2] * prod;
723
724#ifdef __CUDA_ARCH__
725 const float norm = rnorm3df(res[j][0], res[j][1], res[j][2]);
726#else
727 const float norm = 1.f / hypot(res[j][0], res[j][1], res[j][2]);
728#endif
729 res[j][0] *= norm;
730 res[j][1] *= norm;
731 res[j][2] *= norm;
732 }
733#endif
734 else
735 {
736 float extra_vector[3];
737
738 extract_one(second_e, res[j], extra_vector);
739 }
740
742
743#ifdef __CUDA_ARCH__
744 const float norm = rnorm3df(res[1][0], res[1][1], res[1][2]);
745#else
746 const float norm = 1.f / hypot(res[1][0], res[1][1], res[1][2]);
747#endif
748
749 res[1][0] *= norm;
750 res[1][1] *= norm;
751 res[1][2] *= norm;
752 }
753 }
754
759 CUDA_HOS_DEV void get_solution(float (&eigenvalues)[3], float (&eigenvectors)[3][3], bool rescale_and_reshift_values = true, const float epsilon = s_typical_epsilon)
760 {
761 get_eigenvalues(eigenvalues[0], eigenvalues[1], eigenvalues[2]);
762 get_eigenvectors(eigenvectors, eigenvalues[0], eigenvalues[1], eigenvalues[2], epsilon);
763
764 if (rescale_and_reshift_values)
765 {
766 eigenvalues[0] = fmaf(eigenvalues[0], scale, shift);
767 eigenvalues[1] = fmaf(eigenvalues[1], scale, shift);
768 eigenvalues[2] = fmaf(eigenvalues[2], scale, shift);
769 }
770 }
771 };
772
784
798
800
801 constexpr unsigned int num_time_measurements = 11;
802
804 const CaloRecGPU::ConstantDataHolder & instance_data,
805 const CMCOptionsHolder & options,
806 const IGPUKernelSizeOptimizer & optimizer,
807 size_t (&times)[num_time_measurements],
808 const bool synchronize = false,
810 const bool defer_instead_of_oversize = false);
811}
812
813#endif
Scalar theta() const
theta method
std::pair< std::vector< unsigned int >, bool > res
#define y
#define x
#define z
#define min(a, b)
Definition cfImp.cxx:40
#define max(a, b)
Definition cfImp.cxx:41
Header file for AthHistogramAlgorithm.
Interface for GPU kernel size optimization (allowing adjustment of kernel sizes to the properties of ...
SimpleHolder< T, MemoryContext::CUDAGPU, true > CUDA_object
Holds an object of type T in CUDA GPU memory.
SimpleHolder< T, MemoryContext::CPU, true > CPU_object
Holds an object of type T in CPU memory.
void calculateClusterPropertiesAndMoments(CaloRecGPU::EventDataHolder &holder, const CaloRecGPU::ConstantDataHolder &instance_data, const CMCOptionsHolder &options, const IGPUKernelSizeOptimizer &optimizer, size_t(&times)[num_time_measurements], const bool synchronize=false, CaloRecGPU::CUDA_Helpers::CUDAStreamPtrHolder stream={}, const bool defer_instead_of_oversize=false)
void register_kernels(IGPUKernelSizeOptimizer &optimizer)
CUDA_HOS_DEV void corrected_cross_product(float(&res)[3], const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
CUDA_HOS_DEV float sum_kahan_babushka_neumaier(const Floats &... fs)
CUDA_HOS_DEV float corrected_dot_product(const float a_1, const float a_2, const float a_3, const float b_1, const float b_2, const float b_3)
CUDA_HOS_DEV void partial_kahan_babushka_neumaier_sum(const float &to_add, float &sum, float &corr)
CUDA_HOS_DEV float product_sum_cornea_harrison_tang(const float a, const float b, const float c, const float d)
CUDA_HOS_DEV float corrected_magn_cross_product(const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
STL namespace.
void sendToGPU(const bool clear_CPU=false)
CaloRecGPU::Helpers::CUDA_object< ClusterMomentCalculationOptions > m_options_dev
CaloRecGPU::Helpers::CPU_object< ClusterMomentCalculationOptions > m_options
CUDA_HOS_DEV RealSymmetricMatrixSolverIterative(const float a_orig, const float b_orig, const float c_orig, const float d_orig, const float e_orig, const float f_orig)
CUDA_HOS_DEV void get_solution(float(&eigenvalues)[3], float(&eigenvectors)[3][3], const float tolerance=s_typical_tolerance, const float near_zero=s_typical_near_zero, const float epsilon=s_typical_epsilon, const int max_iter=s_typical_max_iterations)
Get the full eigenvalues and eigenvectors for this matrix.
CUDA_HOS_DEV void tridiagonalize(float(&temp_diag)[3], float(&temp_subdiag)[2], float(&temp_mat)[3][3], const float tolerance=s_typical_tolerance)
CUDA_HOS_DEV void compute(float(&temp_diag)[3], float(&temp_subdiag)[2], float(&temp_mat)[3][3], const float near_zero=s_typical_near_zero, const float epsilon=s_typical_epsilon, const int max_iter=s_typical_max_iterations)
CUDA_HOS_DEV void compute_iteration(const int start, const int end, float(&temp_diag)[3], float(&temp_subdiag)[2], float(&temp_mat)[3][3])
CUDA_HOS_DEV void get_eigenvalues(float &e_1, float &e_2, float &e_3) const
Calculate shifted and scaled eigenvalues of the matrix, in ascending value.
CUDA_HOS_DEV void get_solution(float(&eigenvalues)[3], float(&eigenvectors)[3][3], bool rescale_and_reshift_values=true, const float epsilon=s_typical_epsilon)
Get the full eigenvalues and eigenvectors for this matrix.
CUDA_HOS_DEV void get_eigenvectors(float(&res)[3][3], const float e_1, const float e_2, const float e_3, const float epsilon=s_typical_epsilon) const
Calculate the eigenvectors of the matrix, using the (possibly unscaled) eigenvalues e_1,...
CUDA_HOS_DEV void extract_one(const float eigenvalue, float(&res)[3], float(&representative)[3]) const
CUDA_HOS_DEV RealSymmetricMatrixSolver(const float a_orig, const float b_orig, const float c_orig, const float d_orig, const float e_orig, const float f_orig)