ATLAS Offline Software
GPUClusterInfoAndMomentsCalculatorImpl.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2025 CERN for the benefit of the ATLAS collaboration
3 //
4 // Dear emacs, this is -*- c++ -*-
5 //
6 
7 #ifndef CALORECGPU_GPUCLUSTERINFOANDMOMENTSCALCULATOR_CUDA_H
8 #define CALORECGPU_GPUCLUSTERINFOANDMOMENTSCALCULATOR_CUDA_H
9 
11 #include "CaloRecGPU/DataHolders.h"
12 #include "CaloRecGPU/Helpers.h"
13 
15 
16 #include <cmath>
17 #include <type_traits>
18 
20 {
21  inline CUDA_HOS_DEV void partial_kahan_babushka_neumaier_sum(const float & to_add, float & sum, float & corr)
22  {
23  const float t = sum + to_add;
24 
25  const bool test = fabsf(sum) >= fabsf(to_add);
26 
27  const float opt_1 = (sum - t) + to_add;
28  const float opt_2 = (to_add - t) + sum;
29 
30  corr += (test) * opt_1 + (!test) * opt_2;
31 
32  sum = t;
33  }
34 
35 
36  //There are some extra operations, yes,
37  //but we benefit from added precision
38  //that can compensate for not using doubles...
39  template < class ... Floats, class disabler = std::enable_if_t < (std::is_same_v<std::decay_t<Floats>, float> && ...) >>
40  CUDA_HOS_DEV float sum_kahan_babushka_neumaier(const Floats & ... fs)
41  {
42  float ret = 0.f;
43  float corr = 0.f;
44 
45  (partial_kahan_babushka_neumaier_sum(fs, ret, corr), ...);
46 
47  return ret + corr;
48  }
49 
50  //Algorithm that calculates a * b + c * d with better precision using FMA,
51  //following "Error bounds on complex floating-point multiplication with an FMA"
52  //by Jeannerod et. al.
53  inline CUDA_HOS_DEV
54  float product_sum_cornea_harrison_tang(const float a, const float b, const float c, const float d)
55  {
56  using namespace std;
57 
58  const float w_1 = a * b;
59  const float w_2 = c * d;
60 
61  const float e_1 = fmaf(a, b, -w_1);
62  const float e_2 = fmaf(c, d, -w_2);
63 
64  return sum_kahan_babushka_neumaier(w_1, w_2, e_1, e_2);
65  }
66 
67  //Generalization of the Cornea-Harrison-Tang algorithm for dot products.
68  inline CUDA_HOS_DEV
69  float corrected_dot_product(const float a_1, const float a_2, const float a_3,
70  const float b_1, const float b_2, const float b_3)
71  {
72  using namespace std;
73 
74  const float w_1 = a_1 * b_1;
75  const float w_2 = a_2 * b_2;
76  const float w_3 = a_3 * b_3;
77 
78  const float e_1 = fmaf(a_1, b_1, -w_1);
79  const float e_2 = fmaf(a_2, b_2, -w_2);
80  const float e_3 = fmaf(a_3, b_3, -w_3);
81 
82  return sum_kahan_babushka_neumaier(w_1, w_2, w_3, e_1, e_2, e_3);
83  }
84 
85  inline CUDA_HOS_DEV
86  float corrected_dot_product(const float (&a)[3], const float (&b)[3])
87  {
88  return corrected_dot_product(a[0], a[1], a[2], b[0], b[1], b[2]);
89  }
90 
91  //Cross product using the Cornea-Harrison-Tang algorithm
92  inline CUDA_HOS_DEV
93  void corrected_cross_product(float (&res)[3], const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
94  {
95  res[0] = product_sum_cornea_harrison_tang(a2, b3, -a3, b2);
96  res[1] = product_sum_cornea_harrison_tang(a3, b1, -a1, b3);
97  res[2] = product_sum_cornea_harrison_tang(a1, b2, -a2, b1);
98  }
99 
100  inline CUDA_HOS_DEV
101  void corrected_cross_product(float (&res)[3], const float (&x)[3], const float (&y)[3])
102  {
103  corrected_cross_product(res, x[0], x[1], x[2], y[0], y[1], y[2]);
104  }
105 
106  //Magnitude of a cross product using the generalization of the Cornea-Harrison-Tang algorithm
107  inline CUDA_HOS_DEV
108  float corrected_magn_cross_product(const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
109  {
110  using namespace std;
111 
112  const float r_1 = product_sum_cornea_harrison_tang(a2, b3, -a3, b2);
113  const float r_2 = product_sum_cornea_harrison_tang(a3, b1, -a1, b3);
114  const float r_3 = product_sum_cornea_harrison_tang(a1, b2, -a2, b1);
115 
116 #ifdef __CUDA_ARCH__
117  return norm3df(r_1, r_2, r_3);
118 #else
119  return hypot(r_1, r_2, r_3);
120 #endif
121 
122  }
123 
124  inline CUDA_HOS_DEV
125  float corrected_magn_cross_product(const float (&x)[3], const float (&y)[3])
126  {
127  return corrected_magn_cross_product(x[0], x[1], x[2], y[0], y[1], y[2]);
128  }
129 
131  //Following the Eigen implementation too...
132  {
133  float a, b, c, d, e, f, scale;
134  // +-- --+
135  // | a d f |
136  // | d b e |
137  // | f e c |
138  // +-- --+
139 
140  CUDA_HOS_DEV RealSymmetricMatrixSolverIterative(const float a_orig, const float b_orig, const float c_orig, const float d_orig, const float e_orig, const float f_orig)
141  {
142  using namespace std;
143 
144  const float max_ab = max( fabsf(a_orig), fabsf(b_orig) );
145  const float max_cd = max( fabsf(c_orig), fabsf(d_orig) );
146  const float max_ef = max( fabsf(e_orig), fabsf(f_orig) );
147  scale = max(max_ab, max(max_cd, max_ef) );
148  if (scale == 0.f)
149  {
150  scale = 1.f;
151  }
152  const float inv_scale = 1.0f / scale;
153  a = a_orig * inv_scale;
154  b = b_orig * inv_scale;
155  c = c_orig * inv_scale;
156  d = d_orig * inv_scale;
157  e = e_orig * inv_scale;
158  f = f_orig * inv_scale;
159  }
160 
162 
163  //Uses temp_diag to store the diagonal
164  //and temp_mat to store the tridiagonalized form of the matrix
165  CUDA_HOS_DEV void tridiagonalize(float (&temp_diag)[3], float (&temp_subdiag)[2], float (&temp_mat)[3][3], const float tolerance = s_typical_tolerance)
166  {
167  using namespace std;
168 
169  temp_diag[0] = a;
170  if (f * f <= tolerance)
171  {
172  temp_diag[1] = b;
173  temp_diag[2] = c;
174 
175  temp_subdiag[0] = d;
176  temp_subdiag[1] = e;
177 
178  temp_mat[0][0] = 1.f;
179  temp_mat[0][1] = 0.f;
180  temp_mat[0][2] = 0.f;
181 
182  temp_mat[1][0] = 0.f;
183  temp_mat[1][1] = 1.f;
184  temp_mat[1][2] = 0.f;
185 
186  temp_mat[2][0] = 0.f;
187  temp_mat[2][1] = 0.f;
188  temp_mat[2][2] = 1.f;
189  }
190  else
191  {
192  const float beta = hypot(d, f);
193 
194  const float inv_beta = 1.f / beta;
195 
196  const float em_0_1 = d * inv_beta;
197  const float em_0_2 = f * inv_beta;
198 
199  const float q_w_1 = 2 * em_0_1 * e;
200  const float q_c_1 = fmaf(2 * em_0_1, e, -q_w_1);
201  const float q_w_2 = em_0_2 * c;
202  const float q_c_2 = fmaf(em_0_2, c, -q_w_2);
203  const float q_w_3 = em_0_2 * b;
204  const float q_c_3 = fmaf(em_0_2, b, -q_w_3);
205 
206  const float q = sum_kahan_babushka_neumaier(q_w_1, q_w_2, -q_w_3, q_c_1, q_c_2, -q_c_3);
207 
208  temp_diag[1] = fmaf( em_0_2, q, b);
209  temp_diag[2] = fmaf(-em_0_2, q, c);
210 
211  temp_subdiag[0] = beta;
212  temp_subdiag[1] = fmaf(-em_0_1, q, e);
213 
214  temp_mat[0][0] = 1.f;
215  temp_mat[0][1] = 0.f;
216  temp_mat[0][2] = 0.f;
217 
218  temp_mat[1][0] = 0.f;
219  temp_mat[1][1] = em_0_1;
220  temp_mat[1][2] = em_0_2;
221 
222  temp_mat[2][0] = 0.f;
223  temp_mat[2][1] = em_0_2;
224  temp_mat[2][2] = -em_0_1;
225  }
226  }
227 
229  const int end,
230  float (&temp_diag)[3],
231  float (&temp_subdiag)[2],
232  float (&temp_mat)[3][3])
233  {
234  const float td = (temp_diag[end - 1] - temp_diag[end]) * 0.5f;
235 
236  const float ee = temp_subdiag[end - 1];
237 
238  float mu = temp_diag[end];
239 
240  if (td == 0.f)
241  {
242  mu -= fabsf(ee);
243  }
244  else if (ee != 0.f)
245  {
246  const float ee_2 = ee * ee;
247 
248  const float h = hypot(td, ee);
249 
250  const float factor = td + h * ((td >= 0.f) - (td < 0.f));
251 
252  if (ee_2 == 0.f)
253  {
254  mu -= ee / (factor / ee);
255  }
256  else
257  {
258  mu -= ee_2 / factor;
259  }
260  }
261 
262  float x = temp_diag[start] - mu;
263  float z = temp_subdiag[start];
264 
265  for (int k = start; k < end && z != 0.f; ++k)
266  {
267  float givens_c, givens_s;
268 
269  /*if (z == 0.f)
270  {
271  givens_c = ((x >= 0.f) - (x < 0.f));
272  givens_s = 0.f;
273  }
274  else */if (x == 0.f)
275  {
276  givens_c = 0.f;
277  givens_s = (z < 0.f) - (z >= 0.f);
278  }
279  else if (fabsf(x) >= fabsf(z))
280  {
281  const float t = z / x;
282  const float u = hypot(1.f, fabsf(t)) * ((x >= 0.f) - (x < 0.f));
283 
284  givens_c = 1.f / u;
285  givens_s = -t * givens_c;
286  }
287  else
288  {
289  const float t = x / z;
290  const float u = hypot(1.f, fabsf(t)) * ((z >= 0.f) - (z < 0.f));
291 
292  givens_s = -1.f / u;
293  givens_c = -t * givens_s;
294  }
295 
296  const float sdk = product_sum_cornea_harrison_tang(givens_s,
297  temp_diag[k],
298  givens_c,
299  temp_subdiag[k]);
300 
301  const float dkp1 = product_sum_cornea_harrison_tang(givens_s,
302  temp_subdiag[k],
303  givens_c,
304  temp_diag[k + 1]);
305 
306  temp_diag[k] = product_sum_cornea_harrison_tang(givens_c,
308  temp_diag[k],
309  -givens_s,
310  temp_subdiag[k]),
311  -givens_s,
313  temp_subdiag[k],
314  -givens_s,
315  temp_diag[k + 1])
316  );
317  temp_diag[k + 1] = product_sum_cornea_harrison_tang(givens_s, sdk, givens_c, dkp1);
318  temp_subdiag[k] = product_sum_cornea_harrison_tang(givens_c, sdk, -givens_s, dkp1);
319 
320  if (k > start)
321  {
322  temp_subdiag[k - 1] = product_sum_cornea_harrison_tang(givens_c,
323  temp_subdiag[k - 1],
324  -givens_s,
325  z);
326  }
327 
328  x = temp_subdiag[k];
329 
330  if (k < end - 1)
331  {
332  z = -givens_s * temp_subdiag[k + 1];
333 
334  temp_subdiag[k + 1] *= givens_c;
335  }
336 
337  //We could skip if (c, s) == (1, 0)
338  //Also, apply on the right means
339  //we have to consider -s instead of s.
340  for (int i = 0; i < 3; ++i)
341  {
342  float & c_1 = temp_mat[k] [i];
343  float & c_2 = temp_mat[k + 1][i];
344 
345  const float c_1_old = c_1;
346  const float c_2_old = c_2;
347 
348  c_1 = product_sum_cornea_harrison_tang(givens_c, c_1_old, -givens_s, c_2_old);
349  c_2 = product_sum_cornea_harrison_tang(givens_s, c_1_old, givens_c, c_2_old);
350  }
351  }
352  }
353 
354 
355  static constexpr int s_typical_max_iterations = 90;
357  static constexpr float s_typical_epsilon = std::numeric_limits<float>::epsilon();
358 
359  CUDA_HOS_DEV void compute(float (&temp_diag)[3],
360  float (&temp_subdiag)[2],
361  float (&temp_mat)[3][3],
362  const float near_zero = s_typical_near_zero,
363  const float epsilon = s_typical_epsilon,
364  const int max_iter = s_typical_max_iterations)
365  {
366  int iter_count = 0;
367  int start = 0, end = 2;
368 
369  const float precision_inv = 1.f / epsilon;
370 
371  while (end > 0)
372  {
373  for (int i = start; i < end; ++i)
374  {
375  if (fabsf(temp_subdiag[i]) < near_zero)
376  {
377  temp_subdiag[i] = 0.f;
378  }
379  else
380  {
381  const float scaled_subdiag = precision_inv * temp_subdiag[i];
382  if (scaled_subdiag * scaled_subdiag <= fabsf(temp_diag[i]) + fabsf(temp_diag[i + 1]))
383  {
384  temp_subdiag[i] = 0.f;
385  }
386  }
387  }
388 
389  while (end > 0 && temp_subdiag[end - 1] == 0.f)
390  {
391  --end;
392  }
393 
394  if (end <= 0)
395  {
396  break;
397  }
398 
399  ++iter_count;
400 
401  if (iter_count > max_iter)
402  {
403  printf("OUT OF ITERS! %d %d\n", start, end);
404  break;
405  }
406 
407  start = end - 1;
408 
409  while (start > 0 && temp_subdiag[start - 1] != 0.f)
410  {
411  --start;
412  }
413 
414  compute_iteration(start, end, temp_diag, temp_subdiag, temp_mat);
415  }
416 
417  //No need to sort eigenvalues and eigenvectors:
418  //we are going to check them all anyway.
419  }
420 
425  CUDA_HOS_DEV void get_solution(float (&eigenvalues)[3],
426  float (&eigenvectors)[3][3],
427  const float tolerance = s_typical_tolerance,
428  const float near_zero = s_typical_near_zero,
429  const float epsilon = s_typical_epsilon,
430  const int max_iter = s_typical_max_iterations )
431  {
432  float temp_subdiag[2];
433 
434  tridiagonalize(eigenvalues, temp_subdiag, eigenvectors, tolerance);
435  //eigenvalues and eigenvectors are used temporarily to store the diagonal and the matrix.
436 
437  compute(eigenvalues, temp_subdiag, eigenvectors, near_zero, epsilon, max_iter);
438 
439  eigenvalues[0] *= scale;
440  eigenvalues[1] *= scale;
441  eigenvalues[2] *= scale;
442  }
443  };
444 
446 //Taken from the Eigen implementation of direct_selfadjoint_eigenvalues
447  {
448  float a, b, c, d, e, f, shift, scale;
449  // +-- --+
450  // | a d f |
451  // | d b e |
452  // | f e c |
453  // +-- --+
454 
455  CUDA_HOS_DEV RealSymmetricMatrixSolver(const float a_orig, const float b_orig, const float c_orig, const float d_orig, const float e_orig, const float f_orig)
456  {
457  using namespace std;
458 
459  shift = sum_kahan_babushka_neumaier(a_orig, b_orig, c_orig) / 3.f;
460  a = a_orig - shift;
461  b = b_orig - shift;
462  c = c_orig - shift;
463  const float max_ab = max( fabsf(a), fabsf(b) );
464  const float max_cd = max( fabsf(c), fabsf(d_orig) );
465  const float max_ef = max( fabsf(e_orig), fabsf(f_orig) );
466  scale = max(max_ab, max(max_cd, max_ef) );
467  if (scale == 0.f)
468  {
469  scale = 1.f;
470  }
471  const float inv_scale = 1.0f / scale;
472  a *= inv_scale;
473  b *= inv_scale;
474  c *= inv_scale;
475  d = d_orig * inv_scale;
476  e = e_orig * inv_scale;
477  f = f_orig * inv_scale;
478  }
479 
483  CUDA_HOS_DEV void get_eigenvalues(float & e_1, float & e_2, float & e_3) const
484  {
485  using namespace std;
486 
487  const float ab = a * b;
488  const float corr_ab = fmaf(a, b, -ab);
489  const float dd = d * d;
490  const float corr_dd = fmaf(d, d, -dd);
491  const float ac = a * c;
492  const float corr_ac = fmaf(a, c, -ac);
493  const float ff = f * f;
494  const float corr_ff = fmaf(f, f, -ff);
495  const float bc = b * c;
496  const float corr_bc = fmaf(b, c, -bc);
497  const float ee = e * e;
498  const float corr_ee = fmaf(e, e, -ee);
499 
500  const float c_0 = fmaf(2 * d, f * e,
503  //Consider using some other strategy to select the best pairs
504  //to minimize error here?
505 
506  const float c_1 = sum_kahan_babushka_neumaier(ab, -dd, ac, -ff, bc, -ee, corr_ab, -corr_dd, corr_ac, -corr_ff, corr_bc, corr_ee);
507 
508  const float c_2 = sum_kahan_babushka_neumaier(a, b, c);
509 
510  constexpr float inv_3 = 1.f / 3.f;
511 
512  const float c_2_over_3 = c_2 * inv_3;
513 
514  const float a_over_3 = max(fma(c_2, c_2_over_3, -c_1) * inv_3, 0.f);
515 
516  const float half_b = 0.5f * (fma(c_2_over_3, fma(2.f * c_2_over_3, c_2_over_3, -c_1), c_0));
517 
518  const float q = max(product_sum_cornea_harrison_tang(a_over_3, a_over_3 * a_over_3, -half_b, half_b), 0.f);
519 
520  //None of what we are doing here is the best choice.
521  //We would need to perform a proper, formal, numerical analysis
522  //to figure out all possible errors and so on.
523 
524  const float rho = sqrtf(a_over_3);
525 
526 #ifdef __CUDA_ARCH__
527  const float theta = atan2f(1.0f, rsqrtf(q) * half_b) * inv_3;
528 #else
529  const float theta = atan2f(sqrtf(q), half_b) * inv_3;
530 #endif
531 
532 #ifdef __CUDA_ARCH__
533  float sin_theta, cos_theta;
534  sincosf(theta, &sin_theta, &cos_theta);
535 #else
536  const float sin_theta = sinf(theta);
537  const float cos_theta = cosf(theta);
538 #endif
539 
540  const float sqrt_3 = sqrtf(3.f);
541 
542  e_1 = fma(-rho, fma( sqrt_3, sin_theta, cos_theta), c_2_over_3);
543  e_2 = fma(-rho, fma(-sqrt_3, sin_theta, cos_theta), c_2_over_3);
544  e_3 = fma( rho, 2.0f * cos_theta, c_2_over_3);
545  }
546 
547  CUDA_HOS_DEV void extract_one(const float eigenvalue, float (&res)[3], float (&representative)[3]) const
548  {
549  using namespace std;
550 
551  const float diag[3] = { a - eigenvalue,
552  b - eigenvalue,
553  c - eigenvalue
554  };
555 
556  float vec_1[3], vec_2[3];
557 
558  int max_diag = -1;
559  float max_value = -1.f;
560 
561  for (int i = 0; i < 3; ++i)
562  {
563  const float this_diag = fabsf(diag[i]);
564  if (this_diag >= max_value)
565  {
566  max_diag = i;
567  max_value = this_diag;
568  }
569  }
570 
571  if (max_diag == 0)
572  {
573  representative[0] = diag[0];
574  representative[1] = d;
575  representative[2] = f;
576 
577  vec_1[0] = d;
578  vec_1[1] = diag[1];
579  vec_1[2] = e;
580 
581  vec_2[0] = f;
582  vec_2[1] = e;
583  vec_2[2] = diag[2];
584  }
585  else if (max_diag == 1)
586  {
587  representative[0] = d;
588  representative[1] = diag[1];
589  representative[2] = e;
590 
591  vec_1[0] = f;
592  vec_1[1] = e;
593  vec_1[2] = diag[2];
594 
595  vec_2[0] = diag[0];
596  vec_2[1] = d;
597  vec_2[2] = f;
598 
599  }
600  else /*if (max_diag == 2)*/
601  {
602  representative[0] = f;
603  representative[1] = e;
604  representative[2] = diag[2];
605 
606  vec_1[0] = diag[0];
607  vec_1[1] = d;
608  vec_1[2] = f;
609 
610  vec_2[0] = d;
611  vec_2[1] = diag[1];
612  vec_2[2] = e;
613  }
614 
615  corrected_cross_product(res, representative, vec_1);
616  corrected_cross_product(vec_1, representative, vec_2);
617  //Can safely override previous value...
618 
619 #ifdef __CUDA_ARCH__
620  const float norm_1 = rnorm3df(res[0], res[1], res[2]);
621  const float norm_2 = rnorm3df(vec_1[0], vec_1[1], vec_1[2]);
622 #else
623  const float norm_1 = 1.f / hypot(res[0], res[1], res[2]);
624  const float norm_2 = 1.f / hypot(vec_1[0], vec_1[1], vec_1[2]);
625 #endif
626 
627  if (norm_1 <= norm_2)
628  //Greater magnitude -> multiply by a smaller value
629  {
630  res[0] *= norm_1;
631  res[1] *= norm_1;
632  res[2] *= norm_1;
633  }
634  else
635  {
636  res[0] = vec_1[0] * norm_2;
637  res[1] = vec_1[1] * norm_2;
638  res[2] = vec_1[2] * norm_2;
639  }
640  }
641 
642  static constexpr float s_typical_epsilon = std::numeric_limits<float>::epsilon();
643 
647  CUDA_HOS_DEV void get_eigenvectors(float (&res)[3][3], const float e_1, const float e_2, const float e_3, const float epsilon = s_typical_epsilon) const
648  {
649  using namespace std;
650 
651  if (e_3 - e_1 <= epsilon)
652  {
653  res[0][0] = 1.f;
654  res[0][1] = 0.f;
655  res[0][2] = 0.f;
656 
657  res[1][0] = 0.f;
658  res[1][1] = 1.f;
659  res[1][2] = 0.f;
660 
661  res[2][0] = 0.f;
662  res[2][1] = 0.f;
663  res[2][2] = 1.f;
664  }
665  else
666  {
667  const float d_0 = e_3 - e_2;
668  const float d_1 = e_2 - e_1;
669 
670  const float d_min = min(d_0, d_1);
671  const float d_max = max(d_0, d_1);
672 
673  int k, j;
674  float first_e, second_e;
675 
676  if (d_0 > d_1)
677  {
678  k = 2;
679  j = 0;
680 
681  first_e = e_3;
682  second_e = e_1;
683  }
684  else
685  {
686  k = 0;
687  j = 2;
688 
689  first_e = e_1;
690  second_e = e_3;
691  }
692 
693  extract_one(first_e, res[k], res[j]);
694 #if USE_ORIGINAL_EIGEN
695  if (d_min <= 2 * epsilon * d1)
696  {
697 #ifdef __CUDA_ARCH__
698  const float base_norm = rnorm3df(res[j][0], res[j][1], res[j][2]);
699 #else
700  const float base_norm = 1.f / hypot(res[j][0], res[j][1], res[j][2]);
701 #endif
702  const float extra_factor = 1.f - corrected_dot_product(res[k], res[j]);
703 
704  const float norm = base_norm / extra_factor;
705 
706  res[j][0] *= norm;
707  res[j][1] *= norm;
708  res[j][2] *= norm;
709  }
710 #else
711  if (d_min <= 2 * epsilon * d_max)
712  //Eigen has d0 <= 2 * eps <= d1, but d0 is overwritten while d1 isn't...
713  {
714  //Eigen speaks about ortho-normalization
715  //but does eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l))*eivecs.col(l)
716  //which... does not ortho-normalize anything.
717 
718  const float prod = corrected_dot_product(res[k], res[j]);
719 
720  res[j][0] -= res[k][0] * prod;
721  res[j][1] -= res[k][1] * prod;
722  res[j][2] -= res[k][2] * prod;
723 
724 #ifdef __CUDA_ARCH__
725  const float norm = rnorm3df(res[j][0], res[j][1], res[j][2]);
726 #else
727  const float norm = 1.f / hypot(res[j][0], res[j][1], res[j][2]);
728 #endif
729  res[j][0] *= norm;
730  res[j][1] *= norm;
731  res[j][2] *= norm;
732  }
733 #endif
734  else
735  {
736  float extra_vector[3];
737 
738  extract_one(second_e, res[j], extra_vector);
739  }
740 
741  corrected_cross_product(res[1], res[2], res[0]);
742 
743 #ifdef __CUDA_ARCH__
744  const float norm = rnorm3df(res[1][0], res[1][1], res[1][2]);
745 #else
746  const float norm = 1.f / hypot(res[1][0], res[1][1], res[1][2]);
747 #endif
748 
749  res[1][0] *= norm;
750  res[1][1] *= norm;
751  res[1][2] *= norm;
752  }
753  }
754 
759  CUDA_HOS_DEV void get_solution(float (&eigenvalues)[3], float (&eigenvectors)[3][3], bool rescale_and_reshift_values = true, const float epsilon = s_typical_epsilon)
760  {
761  get_eigenvalues(eigenvalues[0], eigenvalues[1], eigenvalues[2]);
762  get_eigenvectors(eigenvectors, eigenvalues[0], eigenvalues[1], eigenvalues[2], epsilon);
763 
764  if (rescale_and_reshift_values)
765  {
766  eigenvalues[0] = fmaf(eigenvalues[0], scale, shift);
767  eigenvalues[1] = fmaf(eigenvalues[1], scale, shift);
768  eigenvalues[2] = fmaf(eigenvalues[2], scale, shift);
769  }
770  }
771  };
772 
774  {
783  };
784 
786  {
788 
790 
791  void allocate()
792  {
793  m_options.allocate();
794  }
795 
796  void sendToGPU(const bool clear_CPU = false);
797  };
798 
800 
801  constexpr unsigned int num_time_measurements = 11;
802 
804  const CaloRecGPU::ConstantDataHolder & instance_data,
805  const CMCOptionsHolder & options,
806  const IGPUKernelSizeOptimizer & optimizer,
807  size_t (&times)[num_time_measurements],
808  const bool synchronize = false,
810  const bool defer_instead_of_oversize = false);
811 }
812 
813 #endif
ClusterMomentsCalculator::ClusterMomentCalculationOptions::min_l_longitudinal
float min_l_longitudinal
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:781
PlotCalibFromCool.norm
norm
Definition: PlotCalibFromCool.py:100
IGPUKernelSizeOptimizer.h
hist_file_dump.d
d
Definition: hist_file_dump.py:142
max
constexpr double max()
Definition: ap_fixedTest.cxx:33
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::s_typical_max_iterations
static constexpr int s_typical_max_iterations
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:355
min
constexpr double min()
Definition: ap_fixedTest.cxx:26
ClusterMomentsCalculator::RealSymmetricMatrixSolver::get_solution
CUDA_HOS_DEV void get_solution(float(&eigenvalues)[3], float(&eigenvectors)[3][3], bool rescale_and_reshift_values=true, const float epsilon=s_typical_epsilon)
Get the full eigenvalues and eigenvectors for this matrix.
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:759
mergePhysValFiles.start
start
Definition: DataQuality/DataQualityUtils/scripts/mergePhysValFiles.py:13
CaloRecGPU::Helpers::SimpleHolder
Holds one objects of type \T in memory context Context.
Definition: Calorimeter/CaloRecGPU/CaloRecGPU/Helpers.h:1074
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::s_typical_near_zero
static constexpr float s_typical_near_zero
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:356
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::a
float a
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:133
ClusterMomentsCalculator::num_time_measurements
constexpr unsigned int num_time_measurements
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:801
dq_defect_virtual_defect_validation.d1
d1
Definition: dq_defect_virtual_defect_validation.py:79
ClusterMomentsCalculator::CMCOptionsHolder::sendToGPU
void sendToGPU(const bool clear_CPU=false)
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::get_solution
CUDA_HOS_DEV void get_solution(float(&eigenvalues)[3], float(&eigenvectors)[3][3], const float tolerance=s_typical_tolerance, const float near_zero=s_typical_near_zero, const float epsilon=s_typical_epsilon, const int max_iter=s_typical_max_iterations)
Get the full eigenvalues and eigenvectors for this matrix.
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:425
TrigInDetValidation_Base.test
test
Definition: TrigInDetValidation_Base.py:142
read_hist_ntuple.t
t
Definition: read_hist_ntuple.py:5
ClusterMomentsCalculator::ClusterMomentCalculationOptions::use_abs_energy
bool use_abs_energy
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:775
ClusterMomentsCalculator::corrected_cross_product
CUDA_HOS_DEV void corrected_cross_product(float(&res)[3], const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:93
x
#define x
AthenaPoolTestWrite.stream
string stream
Definition: AthenaPoolTestWrite.py:12
Trk::u
@ u
Enums for curvilinear frames.
Definition: ParamDefs.h:77
ClusterMomentsCalculator
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:20
ClusterMomentsCalculator::calculateClusterPropertiesAndMoments
void calculateClusterPropertiesAndMoments(CaloRecGPU::EventDataHolder &holder, const CaloRecGPU::ConstantDataHolder &instance_data, const CMCOptionsHolder &options, const IGPUKernelSizeOptimizer &optimizer, size_t(&times)[num_time_measurements], const bool synchronize=false, CaloRecGPU::CUDA_Helpers::CUDAStreamPtrHolder stream={}, const bool defer_instead_of_oversize=false)
mergePhysValFiles.end
end
Definition: DataQuality/DataQualityUtils/scripts/mergePhysValFiles.py:92
ClusterMomentsCalculator::RealSymmetricMatrixSolver::extract_one
CUDA_HOS_DEV void extract_one(const float eigenvalue, float(&res)[3], float(&representative)[3]) const
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:547
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::s_typical_epsilon
static constexpr float s_typical_epsilon
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:357
ClusterMomentsCalculator::CMCOptionsHolder::allocate
void allocate()
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:791
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::s_typical_tolerance
static constexpr float s_typical_tolerance
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:161
CaloRecGPU::EventDataHolder
Definition: DataHolders.h:35
CUDA_HOS_DEV
#define CUDA_HOS_DEV
Definition: Calorimeter/CaloRecGPU/CaloRecGPU/Helpers.h:101
convertTimingResiduals.sum
sum
Definition: convertTimingResiduals.py:55
ClusterMomentsCalculator::product_sum_cornea_harrison_tang
CUDA_HOS_DEV float product_sum_cornea_harrison_tang(const float a, const float b, const float c, const float d)
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:54
lumiFormat.i
int i
Definition: lumiFormat.py:85
z
#define z
Helpers.h
D3PDSizeSummary.ff
ff
Definition: D3PDSizeSummary.py:305
h
ClusterMomentsCalculator::ClusterMomentCalculationOptions::min_r_lateral
float min_r_lateral
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:782
res
std::pair< std::vector< unsigned int >, bool > res
Definition: JetGroupProductTest.cxx:11
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::b
float b
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:133
python.AtlRunQueryLib.options
options
Definition: AtlRunQueryLib.py:378
ClusterMomentsCalculator::corrected_dot_product
CUDA_HOS_DEV float corrected_dot_product(const float a_1, const float a_2, const float a_3, const float b_1, const float b_2, const float b_3)
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:69
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::c
float c
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:133
ClusterMomentsCalculator::RealSymmetricMatrixSolver
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:447
ClusterMomentsCalculator::partial_kahan_babushka_neumaier_sum
CUDA_HOS_DEV void partial_kahan_babushka_neumaier_sum(const float &to_add, float &sum, float &corr)
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:21
ClusterMomentsCalculator::ClusterMomentCalculationOptions::max_axis_angle
float max_axis_angle
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:779
tolerance
Definition: suep_shower.h:17
plotBeamSpotMon.b
b
Definition: plotBeamSpotMon.py:76
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::compute_iteration
CUDA_HOS_DEV void compute_iteration(const int start, const int end, float(&temp_diag)[3], float(&temp_subdiag)[2], float(&temp_mat)[3][3])
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:228
ClusterMomentsCalculator::sum_kahan_babushka_neumaier
CUDA_HOS_DEV float sum_kahan_babushka_neumaier(const Floats &... fs)
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:40
CaloRecGPU::CUDA_Helpers::CUDAStreamPtrHolder
Definition: Calorimeter/CaloRecGPU/CaloRecGPU/Helpers.h:110
ClusterMomentsCalculator::CMCOptionsHolder::m_options_dev
CaloRecGPU::Helpers::CUDA_object< ClusterMomentCalculationOptions > m_options_dev
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:789
ClusterMomentsCalculator::ClusterMomentCalculationOptions
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:774
a
TList * a
Definition: liststreamerinfos.cxx:10
y
#define y
ClusterMomentsCalculator::RealSymmetricMatrixSolver::get_eigenvectors
CUDA_HOS_DEV void get_eigenvectors(float(&res)[3][3], const float e_1, const float e_2, const float e_3, const float epsilon=s_typical_epsilon) const
Calculate the eigenvectors of the matrix, using the (possibly unscaled) eigenvalues e_1,...
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:647
ClusterMomentsCalculator::RealSymmetricMatrixSolver::get_eigenvalues
CUDA_HOS_DEV void get_eigenvalues(float &e_1, float &e_2, float &e_3) const
Calculate shifted and scaled eigenvalues of the matrix, in ascending value.
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:483
ClusterMomentsCalculator::RealSymmetricMatrixSolver::shift
float shift
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:448
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::RealSymmetricMatrixSolverIterative
CUDA_HOS_DEV RealSymmetricMatrixSolverIterative(const float a_orig, const float b_orig, const float c_orig, const float d_orig, const float e_orig, const float f_orig)
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:140
CUDAFriendlyClasses.h
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::scale
float scale
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:133
Herwig7_QED_EvtGen_ll.fs
dictionary fs
Definition: Herwig7_QED_EvtGen_ll.py:17
ClusterMomentsCalculator::ClusterMomentCalculationOptions::min_LAr_quality
float min_LAr_quality
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:778
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::compute
CUDA_HOS_DEV void compute(float(&temp_diag)[3], float(&temp_subdiag)[2], float(&temp_mat)[3][3], const float near_zero=s_typical_near_zero, const float epsilon=s_typical_epsilon, const int max_iter=s_typical_max_iterations)
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:359
CaloRecGPU::ConstantDataHolder
Definition: DataHolders.h:19
extractSporadic.q
list q
Definition: extractSporadic.py:97
ClusterMomentsCalculator::register_kernels
void register_kernels(IGPUKernelSizeOptimizer &optimizer)
ClusterMomentsCalculator::ClusterMomentCalculationOptions::use_two_gaussian_noise
bool use_two_gaussian_noise
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:776
ClusterMomentsCalculator::corrected_magn_cross_product
CUDA_HOS_DEV float corrected_magn_cross_product(const float a1, const float a2, const float a3, const float b1, const float b2, const float b3)
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:108
MuonParameters::beta
@ beta
Definition: MuonParamDefs.h:144
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::f
float f
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:133
CaloNoise_fillDB.mu
mu
Definition: CaloNoise_fillDB.py:51
python.compressB64.c
def c
Definition: compressB64.py:93
IGPUKernelSizeOptimizer
Interface for GPU kernel size optimization (allowing adjustment of kernel sizes to the properties of ...
Definition: IGPUKernelSizeOptimizer.h:29
DataHolders.h
fitman.rho
rho
Definition: fitman.py:532
plot_times.times
def times(fn)
Definition: plot_times.py:10
ClusterMomentsCalculator::CMCOptionsHolder::m_options
CaloRecGPU::Helpers::CPU_object< ClusterMomentCalculationOptions > m_options
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:787
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::tridiagonalize
CUDA_HOS_DEV void tridiagonalize(float(&temp_diag)[3], float(&temp_subdiag)[2], float(&temp_mat)[3][3], const float tolerance=s_typical_tolerance)
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:165
fitman.k
k
Definition: fitman.py:528
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::e
float e
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:133
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative::d
float d
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:133
ClusterMomentsCalculator::CMCOptionsHolder
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:786
ClusterMomentsCalculator::RealSymmetricMatrixSolver::RealSymmetricMatrixSolver
CUDA_HOS_DEV RealSymmetricMatrixSolver(const float a_orig, const float b_orig, const float c_orig, const float d_orig, const float e_orig, const float f_orig)
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:455
ClusterMomentsCalculator::ClusterMomentCalculationOptions::skip_invalid_clusters
bool skip_invalid_clusters
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:777
ClusterMomentsCalculator::RealSymmetricMatrixSolverIterative
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:132
ClusterMomentsCalculator::ClusterMomentCalculationOptions::eta_inner_wheel
float eta_inner_wheel
Definition: GPUClusterInfoAndMomentsCalculatorImpl.h:780