7 #ifndef CALORECGPU_GPUCLUSTERINFOANDMOMENTSCALCULATOR_CUDA_H
8 #define CALORECGPU_GPUCLUSTERINFOANDMOMENTSCALCULATOR_CUDA_H
35 shift = (a_orig + b_orig + c_orig) / 3.
f;
59 const float c_0 =
a *
b *
c + 2.f *
d *
f *
e -
a *
e *
e -
b *
f *
f -
c *
d *
d;
60 const float c_1 =
a *
b -
d *
d +
a *
c -
f *
f +
b *
c -
e *
e;
61 const float c_2 =
a +
b +
c;
63 constexpr
float inv_3 = 1.f / 3.f;
65 const float c_2_over_3 = c_2 * inv_3;
67 const float a_over_3 =
max((c_2 * c_2_over_3 - c_1) * inv_3, 0.
f);
69 const float half_b = 0.5f * (c_0 + c_2_over_3 * (2.f * c_2_over_3 * c_2_over_3 - c_1));
71 const float q =
max(a_over_3 * a_over_3 * a_over_3 - half_b * half_b, 0.
f);
73 const float rho = sqrtf(a_over_3);
76 const float theta = atan2f(1.0
f, rsqrtf(
q) * half_b) * inv_3;
78 const float theta = atan2f(sqrtf(
q), half_b) * inv_3;
82 float sin_theta, cos_theta;
83 sincosf(theta, &sin_theta, &cos_theta);
85 const float sin_theta = sinf(theta);
86 const float cos_theta = cosf(theta);
89 const float sqrt_3 = sqrtf(3.
f);
91 e_1 = c_2_over_3 -
rho * (cos_theta + sqrt_3 * sin_theta);
92 e_2 = c_2_over_3 -
rho * (cos_theta - sqrt_3 * sin_theta);
93 e_3 = c_2_over_3 + 2.f *
rho * cos_theta;
96 CUDA_HOS_DEV static void cross_prod(
float (&
res)[3],
const float a1,
const float a2,
const float a3,
const float b1,
const float b2,
const float b3)
98 res[0] = a2 * b3 - a3 * b2;
99 res[1] = a3 * b1 - a1 * b3;
100 res[2] = a1 * b2 - a2 * b1;
108 CUDA_HOS_DEV static float dot_prod(
const float a1,
const float a2,
const float a3,
const float b1,
const float b2,
const float b3)
110 return a1 * b1 + a2 * b2 + a3 * b3;
122 const float diag_0 =
a - eigenvalue;
123 const float diag_1 =
b - eigenvalue;
124 const float diag_2 =
c - eigenvalue;
126 float vec_1[3], vec_2[3];
128 if (fabsf(diag_0) > fabsf(diag_1) && fabsf(diag_0) > fabsf(diag_2))
130 representative[0] = diag_0;
131 representative[1] =
d;
132 representative[2] =
f;
142 else if ( fabsf(diag_1) > fabsf(diag_2))
144 representative[0] =
d;
145 representative[1] = diag_1;
146 representative[2] =
e;
159 representative[0] =
f;
160 representative[1] =
e;
161 representative[2] = diag_2;
177 const float norm_1 = rnorm3df(
res[0],
res[1],
res[2]);
178 const float norm_2 = rnorm3df(vec_1[0], vec_1[1], vec_1[2]);
180 const float norm_1 = 1.f / hypot(
res[0],
res[1],
res[2]);
181 const float norm_2 = 1.f / hypot(vec_1[0], vec_1[1], vec_1[2]);
184 if (norm_1 <= norm_2)
193 res[0] = vec_1[0] * norm_2;
194 res[1] = vec_1[1] * norm_2;
195 res[2] = vec_1[2] * norm_2;
208 if (e_3 - e_1 <= epsilon)
224 const float d_0 = e_3 - e_2;
225 const float d_1 = e_2 - e_1;
227 const float d_min =
min(d_0, d_1);
230 float first_e, second_e;
251 if (d_min <= 2 * epsilon * d_1)
254 const float base_norm = rnorm3df(
res[j][0],
res[j][1],
res[j][2]);
256 const float base_norm = 1.f / hypot(
res[j][0],
res[j][1],
res[j][2]);
260 const float norm = base_norm / extra_factor;
268 float extra_vector[3];
276 const float norm = rnorm3df(
res[1][0],
res[1][1],
res[1][2]);
278 const float norm = 1.f / hypot(
res[1][0],
res[1][1],
res[1][2]);
294 get_eigenvectors(eigenvectors, eigenvalues[0], eigenvalues[1], eigenvalues[2], epsilon);
296 if (rescale_and_reshift_values)
298 eigenvalues[0] = eigenvalues[0] *
scale +
shift;
299 eigenvalues[1] = eigenvalues[1] *
scale +
shift;
300 eigenvalues[2] = eigenvalues[2] *
scale +
shift;
325 m_options.allocate();
337 const bool synchronize =
false,
339 const bool defer_instead_of_oversize =
false);