Functions
template<size_t ISA_WIDTH>
constexpr size_t	alignmentForArray ()
	In the following, ISA_WIDTH is the ISA width in bits, e.g. 128 for SSE, 256 for AVX2, etc. For the cases of interest/tested, processing 4 SIMD vectors at a time seemed best. More...

template<size_t ISA_WIDTH, typename T >
constexpr size_t	strideOfNumSIMDVec (size_t NumSIMDVec)
	Returns the STRIDE, in units of elements, covered by NumSIMDVec SIMD vectors of type T for the specific ISA. For example, for a 256-bit ISA a SIMD vector holds 8 floats, so 4 SIMD vectors correspond to 32 elements. More...

template<size_t STRIDE>
constexpr int	numPadded (const int n)
	Given a number n, returns n rounded up to a multiple of the required STRIDE (the result is always >= n). More...

template<size_t ISA_WIDTH, typename T >
ATH_ALWAYS_INLINE T	vFindMinimum (const T *distancesIn, int n)
	Find the minimum element in the array of distances processing four simd vectors at a time. More...

template<size_t ISA_WIDTH, typename T >
ATH_ALWAYS_INLINE int	vIdxOfValue (const T value, const T *distancesIn, int n)
	Find the index of an element in the array of distances processing four simd vectors at a time. More...

template<int ISA_WIDTH, typename T >
ATH_ALWAYS_INLINE int	vIdxOfMin (const T *distancesIn, int n)
	Find the index of the minimum in the array of distances. More...

Function Documentation

◆ alignmentForArray()

template<size_t ISA_WIDTH>

constexpr size_t vAlgs::alignmentForArray ( )

constexpr

In the following, ISA_WIDTH is the ISA width in bits, e.g. 128 for SSE, 256 for AVX2, etc. For the cases of interest/tested, processing 4 SIMD vectors at a time seemed best.

Alignment (in bytes) needed for arrays of elements when using an ISA of a specific width, e.g. 32 for AVX2, 16 for SSE4, etc.

Definition at line 76 of file GSFFindIndexOfMinimum.h.

                                     {
   return ISA_WIDTH / CHAR_BIT;
 }

◆ numPadded()

template<size_t STRIDE>

constexpr int vAlgs::numPadded ( const int n )

constexpr

Given a number n, returns n rounded up to a multiple of the required STRIDE (the result is always >= n).

Definition at line 94 of file GSFFindIndexOfMinimum.h.

                                      {
   // This always return a padded number dividable
   // with STRIDE , eg if STRIDE = 16
   // e.g ((33+15)&~15) = 48
   constexpr size_t STRIDEMINUS1 = STRIDE - 1;
   return ((n + STRIDEMINUS1) & ~STRIDEMINUS1);
 }

◆ strideOfNumSIMDVec()

template<size_t ISA_WIDTH, typename T >

constexpr size_t vAlgs::strideOfNumSIMDVec ( size_t NumSIMDVec )

constexpr

Returns the STRIDE, in units of elements, covered by NumSIMDVec SIMD vectors of type T for the specific ISA. For example, for a 256-bit ISA a SIMD vector holds 8 floats, so 4 SIMD vectors correspond to 32 elements.

Definition at line 86 of file GSFFindIndexOfMinimum.h.

                                                       {
   return NumSIMDVec * (ISA_WIDTH / (sizeof(T) * CHAR_BIT));
 }

◆ vFindMinimum()

template<size_t ISA_WIDTH, typename T >

ATH_ALWAYS_INLINE T vAlgs::vFindMinimum	(	const T *	distancesIn,
		int	n
	)

Find the minimum element in the array of distances processing four simd vectors at a time.

Definition at line 107 of file GSFFindIndexOfMinimum.h.

                                             {
   // Returns the smallest value found in distancesIn, scanning the array in
   // chunks of four SIMD vectors (4 * VEC_WIDTH elements) and keeping four
   // independent running-minimum vectors that are merged at the end.
   //
   // NOTE(review): the first four vectors are loaded unconditionally and the
   // loop always consumes whole 4-vector chunks, so the caller presumably
   // has to pass an array that is aligned to alignmentForArray<ISA_WIDTH>()
   // and padded to a multiple of 4 * VEC_WIDTH elements -- confirm against
   // the callers.
  
   using namespace CxxUtils;
   static_assert(std::is_floating_point_v<T>, "T not a floating point type");
   // Number of elements of type T held by one SIMD vector.
   constexpr size_t VEC_WIDTH = ISA_WIDTH / (sizeof(T) * CHAR_BIT);
   // Promise the compiler that the input honours the ISA alignment.
   const T* array =
       std::assume_aligned<alignmentForArray<ISA_WIDTH>()>(distancesIn);
   using vec_t = vec<T, VEC_WIDTH>;
   // Seed the four running minima with the first four vectors of the array.
   vec_t minValues1;
   vec_t minValues2;
   vec_t minValues3;
   vec_t minValues4;
   vload(minValues1, array);
   vload(minValues2, array + VEC_WIDTH);
   vload(minValues3, array + VEC_WIDTH * 2);
   vload(minValues4, array + VEC_WIDTH * 3);
   vec_t values1;
   vec_t values2;
   vec_t values3;
   vec_t values4;
   // Start after the seed chunk and advance four vectors per iteration.
   // vmin presumably writes the element-wise minimum of its last two
   // arguments into the first one (CxxUtils helper, not shown here).
   for (int i = 4 * VEC_WIDTH; i < n; i += 4 * VEC_WIDTH) {
     // 1
     vload(values1, array + i);
     vmin(minValues1, values1, minValues1);
     // 2
     vload(values2, array + i + VEC_WIDTH);
     vmin(minValues2, values2, minValues2);
     // 3
     vload(values3, array + i + 2 * VEC_WIDTH);
     vmin(minValues3, values3, minValues3);
     // 4
     vload(values4, array + i + 3 * VEC_WIDTH);
     vmin(minValues4, values4, minValues4);
   }
   // Compare //1 with //2
   vmin(minValues1, minValues1, minValues2);
   // compare //3 with //4
   vmin(minValues3, minValues3, minValues4);
   // Compare //1 with //3
   vmin(minValues1, minValues1, minValues3);
   // Spill the merged vector of minima into a plain array
   T finalMinValues[VEC_WIDTH];
   vstore(finalMinValues, minValues1);
  
   // Do the final calculation scalar way: reduce the VEC_WIDTH lane minima
   // to a single value.
   return std::reduce(std::begin(finalMinValues), std::end(finalMinValues),
                      finalMinValues[0],
                      [](T a, T b) { return a < b ? a : b; });
 }

◆ vIdxOfMin()

template<int ISA_WIDTH, typename T >

ATH_ALWAYS_INLINE int vAlgs::vIdxOfMin	(	const T *	distancesIn,
		int	n
	)

Find the index of the minimum in the array of distances.

Definition at line 210 of file GSFFindIndexOfMinimum.h.

                                            {
   using namespace CxxUtils;
   const T* array =
       std::assume_aligned<vAlgs::alignmentForArray<ISA_WIDTH>()>(distancesIn);
   static_assert(std::is_floating_point_v<T>, "T not a floating point type");
   //We process elements in blocks. When we find the minimum we also
   //keep track in which block it was
   constexpr int blockSize = 512;
   // case for n less than blockSize
   if (n <= blockSize) {
     T min = vFindMinimum<ISA_WIDTH>(array, n);
     return vIdxOfValue<ISA_WIDTH>(min, array, n);
   }
   int idx = 0;
   T min = array[0];
   // We might have a remainder that we need to handle
   const int remainder = n & (blockSize - 1);
   // process elements up to the remainder in blocks
   for (int i = 0; i < (n - remainder); i += blockSize) {
     T mintmp = vFindMinimum<ISA_WIDTH>(array + i, blockSize);
     if (mintmp < min) {
       min = mintmp;
       idx = i;
     }
   }
  
   //Process the remaining elements if any
   if (remainder != 0) {
     int index = n - remainder;
     T mintmp = vFindMinimum<ISA_WIDTH>(array + index, remainder);
     // if the minimu is here
     if (mintmp < min) {
       min = mintmp;
       return index + vIdxOfValue<ISA_WIDTH>(min, array + index, remainder);
     }
   }
   // Return the idx of the minimum just looping over a single block
   return idx + vIdxOfValue<ISA_WIDTH>(min, array + idx, blockSize);
 }

◆ vIdxOfValue()

template<size_t ISA_WIDTH, typename T >

ATH_ALWAYS_INLINE int vAlgs::vIdxOfValue	(	const T	value,
		const T *	distancesIn,
		int	n
	)

Find the index of an element in the array of distances processing four simd vectors at a time.

Definition at line 162 of file GSFFindIndexOfMinimum.h.

                                              {
   // Returns the index of the first element of distancesIn that compares
   // equal to value, or -1 when no chunk contains a match. The array is
   // scanned in chunks of four SIMD vectors (4 * VEC_WIDTH elements).
   //
   // NOTE(review): every iteration loads a full 4-vector chunk, so the
   // caller presumably has to pass an array aligned to
   // alignmentForArray<ISA_WIDTH>() and padded to a multiple of
   // 4 * VEC_WIDTH elements -- confirm against the callers.
   using namespace CxxUtils;
  
   static_assert(std::is_floating_point_v<T>, "T not a floating point type");
   // Number of elements of type T held by one SIMD vector.
   constexpr int VEC_WIDTH = ISA_WIDTH / (sizeof(T) * CHAR_BIT);
   // Promise the compiler that the input honours the ISA alignment.
   const T* array =
       std::assume_aligned<alignmentForArray<ISA_WIDTH>()>(distancesIn);
   using vec_t = vec<T, VEC_WIDTH>;
   using vec_mask = vec_mask_type_t<vec_t>;
   vec_t values1;
   vec_t values2;
   vec_t values3;
   vec_t values4;
   // Vector with the searched value in every lane (vbroadcast is a CxxUtils
   // helper not shown here; presumably it fills all lanes with value).
   vec_t target;
   vbroadcast(target, value);
   for (int i = 0; i < n; i += 4 * VEC_WIDTH) {
     // 1
     vload(values1, array + i);
     vec_mask eq1 = values1 == target;
     // 2
     vload(values2, array + i + VEC_WIDTH);
     vec_mask eq2 = values2 == target;
     // 3
     vload(values3, array + i + VEC_WIDTH * 2);
     vec_mask eq3 = values3 == target;
     // 4
     vload(values4, array + i + VEC_WIDTH * 3);
     vec_mask eq4 = values4 == target;
  
     // Merge the four comparison masks so a single test covers the chunk.
     vec_mask eq12 = eq1 || eq2;
     vec_mask eq34 = eq3 || eq4;
     vec_mask eqAny = eq12 || eq34;
     if (vany(eqAny)) {
       // A lane matched somewhere in this chunk: rescan it element by
       // element to find the exact index. distancesIn and array hold the
       // same address, so reading through either pointer is equivalent.
       for (int idx = i; idx < i + 4 * VEC_WIDTH; ++idx) {
         if (distancesIn[idx] == value) {
           return idx;
         }
       }
     }
   }
   // No chunk contained the value.
   return -1;
 }

Functions

Function Documentation

◆ alignmentForArray()

◆ numPadded()

◆ strideOfNumSIMDVec()

◆ vFindMinimum()

◆ vIdxOfMin()

◆ vIdxOfValue()