ATLAS Offline Software
Loading...
Searching...
No Matches
vAlgs Namespace Reference

Functions

template<size_t ISA_WIDTH>
constexpr size_t alignmentForArray ()
 In the following ISA_WIDTH is the ISA width in bits e.g 128 for SSE 256 for AVX2 etc For the cases of interest/tested doing 4 simd vectors at a time seemed best.
template<size_t ISA_WIDTH, typename T>
constexpr size_t strideOfNumSIMDVec (size_t NumSIMDVec)
 returns the STRIDE in units of elements covered by NumSIMDVec simd vectors of type T for the specific ISA For example for a 256 bit, a SIMD vector is 8 floats So 4 simd vectors correspond to 32 elements.
template<size_t STRIDE>
constexpr int numPadded (const int n)
 Given a number n returns a new n >= n that is padded to the required STRIDE.
template<size_t ISA_WIDTH, typename T>
ATH_ALWAYS_INLINEvFindMinimum (const T *distancesIn, int n)
 Find the minimum element in the array of distances processing four simd vectors at a time.
template<size_t ISA_WIDTH, typename T>
ATH_ALWAYS_INLINE int vIdxOfValue (const T value, const T *distancesIn, int n)
 Find the index of an element in the array of distances processing four simd vectors at a time.
template<int ISA_WIDTH, typename T>
ATH_ALWAYS_INLINE int vIdxOfMin (const T *distancesIn, int n)
 Find the index of the minimum in the array of distances.

Function Documentation

◆ alignmentForArray()

template<size_t ISA_WIDTH>
size_t vAlgs::alignmentForArray ( )
constexpr

In the following ISA_WIDTH is the ISA width in bits e.g 128 for SSE 256 for AVX2 etc For the cases of interest/tested doing 4 simd vectors at a time seemed best.

Alignment needed for arrays of elements when using an ISA of specific width e.g 32 for AVX2 16 for SSE4 etc

Definition at line 68 of file GsfFindIndexOfMinimum.h.

68 {
69 return ISA_WIDTH / CHAR_BIT;
70}

◆ numPadded()

template<size_t STRIDE>
int vAlgs::numPadded ( const int n)
constexpr

Given a number n returns a new n >= n that is padded to the required STRIDE.

Definition at line 86 of file GsfFindIndexOfMinimum.h.

86 {
87 // This always return a padded number dividable
88 // with STRIDE , eg if STRIDE = 16
89 // e.g ((33+15)&~15) = 48
90 constexpr size_t STRIDEMINUS1 = STRIDE - 1;
91 return ((n + STRIDEMINUS1) & ~STRIDEMINUS1);
92}

◆ strideOfNumSIMDVec()

template<size_t ISA_WIDTH, typename T>
size_t vAlgs::strideOfNumSIMDVec ( size_t NumSIMDVec)
constexpr

returns the STRIDE in units of elements covered by NumSIMDVec simd vectors of type T for the specific ISA For example for a 256 bit, a SIMD vector is 8 floats So 4 simd vectors correspond to 32 elements.

Definition at line 78 of file GsfFindIndexOfMinimum.h.

78 {
79 return NumSIMDVec * (ISA_WIDTH / (sizeof(T) * CHAR_BIT));
80}

◆ vFindMinimum()

template<size_t ISA_WIDTH, typename T>
ATH_ALWAYS_INLINE T vAlgs::vFindMinimum ( const T * distancesIn,
int n )

Find the minimum element in the array of distances processing four simd vectors at a time.

Definition at line 99 of file GsfFindIndexOfMinimum.h.

99 {
100
101 using namespace CxxUtils;
102 static_assert(std::is_floating_point_v<T>, "T not a floating point type");
103 constexpr size_t VEC_WIDTH = ISA_WIDTH / (sizeof(T) * CHAR_BIT);
104 const T* array = std::assume_aligned<alignmentForArray<ISA_WIDTH>()>(distancesIn);
105 using vec_t = vec<T, VEC_WIDTH>;
106 vec_t minValues1;
107 vec_t minValues2;
108 vec_t minValues3;
109 vec_t minValues4;
110 vload(minValues1, array);
111 vload(minValues2, array + VEC_WIDTH);
112 vload(minValues3, array + VEC_WIDTH * 2);
113 vload(minValues4, array + VEC_WIDTH * 3);
114 vec_t values1;
115 vec_t values2;
116 vec_t values3;
117 vec_t values4;
118 for (int i = 4 * VEC_WIDTH; i < n; i += 4 * VEC_WIDTH) {
119 // 1
120 vload(values1, array + i);
121 vmin(minValues1, values1, minValues1);
122 // 2
123 vload(values2, array + i + VEC_WIDTH);
124 vmin(minValues2, values2, minValues2);
125 // 3
126 vload(values3, array + i + 2 * VEC_WIDTH);
127 vmin(minValues3, values3, minValues3);
128 // 4
129 vload(values4, array + i + 3 * VEC_WIDTH);
130 vmin(minValues4, values4, minValues4);
131 }
132 // Compare //1 with //2
133 vmin(minValues1, minValues1, minValues2);
134 // compare //3 with //4
135 vmin(minValues3, minValues3, minValues4);
136 // Compare //1 with //3
137 vmin(minValues1, minValues1, minValues3);
138 // Do the final calculation scalar way
139 T finalMinValues[VEC_WIDTH];
140 vstore(finalMinValues, minValues1);
141
142 // Do the final calculation scalar way
143 return std::reduce(std::begin(finalMinValues), std::end(finalMinValues),
144 finalMinValues[0],
145 [](T a, T b) { return a < b ? a : b; });
146}
std::vector< size_t > vec
static Double_t a
ATH_ALWAYS_INLINE void vstore(vec_type_t< VEC > *dst, const VEC &src)
Definition vec.h:290
ATH_ALWAYS_INLINE void vmin(VEC &dst, const VEC &a, const VEC &b)
Definition vec.h:324
ATH_ALWAYS_INLINE void vload(VEC &dst, vec_type_t< VEC > const *src)
Definition vec.h:272

◆ vIdxOfMin()

template<int ISA_WIDTH, typename T>
ATH_ALWAYS_INLINE int vAlgs::vIdxOfMin ( const T * distancesIn,
int n )

Find the index of the minimum in the array of distances.

Inputs are a ptr to the data and the number of elements to examine. The input needs to be properly padded (see numpadded method)

Definition at line 205 of file GsfFindIndexOfMinimum.h.

205 {
206 using namespace CxxUtils;
207 const T* array = std::assume_aligned<vAlgs::alignmentForArray<ISA_WIDTH>()>(distancesIn);
208 static_assert(std::is_floating_point_v<T>, "T not a floating point type");
209 //We process elements in blocks.
210 //When we find the minimum we also
211 //keep track in which block it was.
212 //The blocksize of 512 seemed to be a good enough
213 //compromise in tests.
214 constexpr int blockSize = 512;
215 // case for n less than blockSize
216 if (n <= blockSize) {
217 T min = vFindMinimum<ISA_WIDTH>(array, n);
218 return vIdxOfValue<ISA_WIDTH>(min, array, n);
219 }
220 int idx = 0;
221 T min = array[0];
222 // We might have a remainder, elements after an integral
223 // number of blockes that we need to handle
224 const int remainder = n & (blockSize - 1);
225 // Process elements up to the remainder in blocks
226 // For example for blockSize 512 and 1056 elements
227 // (if we opt for padding/multiple of 32)
228 // The loop will run two times and then we need
229 // to handle the 32 remaining elements.
230 for (int i = 0; i < (n - remainder); i += blockSize) {
231 T mintmp = vFindMinimum<ISA_WIDTH>(array + i, blockSize);
232 if (mintmp < min) {
233 min = mintmp;
234 idx = i;
235 }
236 }
237 //Process the remaining elements if any
238 if (remainder != 0) {
239 int index = n - remainder;
240 T mintmp = vFindMinimum<ISA_WIDTH>(array + index, remainder);
241 if (mintmp < min) {
242 min = mintmp;
243 return index + vIdxOfValue<ISA_WIDTH>(min, array + index, remainder);
244 }
245 }
246 // Return the idx of the minimum just looping over a single block
247 return idx + vIdxOfValue<ISA_WIDTH>(min, array + idx, blockSize);
248}
#define min(a, b)
Definition cfImp.cxx:40
std::vector< std::string > remainder(const std::vector< std::string > &v1, const std::vector< std::string > &v2)
str index
Definition DeMoScan.py:362
unsigned long long T
ATH_ALWAYS_INLINE T vFindMinimum(const T *distancesIn, int n)
Find the minimum element in the array of distances processing four simd vectors at a time.
ATH_ALWAYS_INLINE int vIdxOfValue(const T value, const T *distancesIn, int n)
Find the index of an element in the array of distances processing four simd vectors at a time.

◆ vIdxOfValue()

template<size_t ISA_WIDTH, typename T>
ATH_ALWAYS_INLINE int vAlgs::vIdxOfValue ( const T value,
const T * distancesIn,
int n )

Find the index of an element in the array of distances processing four simd vectors at a time.

Returns -1 if the element is not found.

Definition at line 154 of file GsfFindIndexOfMinimum.h.

155 {
156 using namespace CxxUtils;
157
158 static_assert(std::is_floating_point_v<T>, "T not a floating point type");
159 constexpr int VEC_WIDTH = ISA_WIDTH / (sizeof(T) * CHAR_BIT);
160 const T* array = std::assume_aligned<alignmentForArray<ISA_WIDTH>()>(distancesIn);
161 using vec_t = vec<T, VEC_WIDTH>;
162 using vec_mask = vec_mask_type_t<vec_t>;
163 vec_t values1;
164 vec_t values2;
165 vec_t values3;
166 vec_t values4;
167 vec_t target;
168 vbroadcast(target, value);
169 for (int i = 0; i < n; i += 4 * VEC_WIDTH) {
170 // 1
171 vload(values1, array + i);
172 vec_mask eq1 = values1 == target;
173 // 2
174 vload(values2, array + i + VEC_WIDTH);
175 vec_mask eq2 = values2 == target;
176 // 3
177 vload(values3, array + i + VEC_WIDTH * 2);
178 vec_mask eq3 = values3 == target;
179 // 4
180 vload(values4, array + i + VEC_WIDTH * 3);
181 vec_mask eq4 = values4 == target;
182
183 vec_mask eq12 = eq1 || eq2;
184 vec_mask eq34 = eq3 || eq4;
185 vec_mask eqAny = eq12 || eq34;
186 if (vany(eqAny)) {
187 for (int idx = i; idx < i + 4 * VEC_WIDTH; ++idx) {
188 if (distancesIn[idx] == value) {
189 return idx;
190 }
191 }
192 }
193 }
194 return -1;
195}
ATH_ALWAYS_INLINE bool vany(const VEC &mask)
Definition vec.h:362
typename vecDetail::vec_mask_type< VEC >::type vec_mask_type_t
Define a nice alias for the mask type for a vectorized type.
Definition vec.h:219
ATH_ALWAYS_INLINE void vbroadcast(VEC &v, T x)
Copy a scalar to each element of a vectorized type.
Definition vec.h:251