// da/dba/crc64_8cxx_source.html

/*

  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration

*/

/*

 */

#include "CxxUtils/crc64.h"

#include "CxxUtils/AthUnlikelyMacros.h"

#include <stdio.h>


#if ATH_CRC64_VEC


// 128-bit vector holding two signed 64-bit integers (GCC vector extension).

typedef long long int  v2di __attribute__ ((vector_size (16)));


// 128-bit vector holding two unsigned 64-bit integers.

typedef uint64_t v2du __attribute__ ((vector_size (16)));


// 128-bit vector holding 16 characters (plain char).

typedef char v16qi __attribute__ ((vector_size (16)));


#endif


namespace {


//***************************************************************************

// Primitive functions and utilities.

//


#if ATH_CRC64_VEC


// Load 16 bytes starting at x into a vector.
// No alignment requirement is placed on x.
inline
v2di load_unaligned (const char* x)
{
  v16qi bytes = __builtin_ia32_loaddqu (x);
  return reinterpret_cast<v2di>(bytes);
}


// Load 16 bytes starting at x into a vector by dereferencing x as a
// vector pointer.  x is expected to be suitably aligned for v2di.
inline
v2di load_aligned (const char* x)
{
  const v2di* block = reinterpret_cast<const v2di*>(x);
  return *block;
}


// Shift the 128-bit vector X left by N bytes.
// N is given in bytes and is multiplied by 8 before being handed to
// the builtin.

// A macro, not a function, because the second argument is constrained

// to be an 8-bit constant.  If this were a function, compilation might fail

// if it is not inlined.

#define byteshift_l(X, N) (__builtin_ia32_pslldqi128 ((X), (N)*8))


// Shift the 128-bit vector X right by N bytes.
// A macro, not a function; see above.

#define byteshift_r(X, N) (__builtin_ia32_psrldqi128 ((X), (N)*8))


// Carry-less multiply of one 64-bit half of A by one 64-bit half of B.
// WHICH is the immediate selector passed through to the builtin and
// must be a compile-time constant.
// A macro, not a function; see above.

#define clmul(A, B, WHICH) (__builtin_ia32_pclmulqdq128 ((A), (B), (WHICH)))


// Shift the 128-bit value IN left by N bytes, producing a 256-bit result.
//
//   in      : value to shift.
//   n       : shift distance in bytes; the mask table below covers 0..16.
//   outHigh : receives the bytes shifted out of the top of IN
//             (in[16-n..15] placed in bytes 0..n-1, remaining bytes zero).
//   outLow  : receives IN shifted left by N bytes within 128 bits
//             (bytes 0..n-1 zero, byte i >= n holds in[i-n]).
//
// Unlike byteshift_l, N may be a run-time value: the shift is done with
// a byte shuffle whose control mask is picked out of a table.

__attribute__ ((target ("sse4")))

inline

void byteshift_l256 (v2di in, size_t n, v2di& outHigh, v2di& outLow)

{

  // First row: identity byte indices.  Second row: entries with the high
  // bit set, which make the byte shuffle write a zero byte; their
  // complements are the byte indices 0, 1, 2, ...

  static const uint8_t shuffleMasks[] = {

    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,

    0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,

  };


  // Take a 16-byte window starting 16-n bytes into the table: its first
  // n entries are the indices 16-n..15, the rest are zeroing entries.

  const v16qi mask = (v16qi)load_unaligned ((const char*)shuffleMasks + (16-n));

  // Complementing the mask swaps the roles: the first n entries become
  // zeroing entries and the rest select in[0], in[1], ...

  outLow  = (v2di)__builtin_ia32_pshufb128 ((v16qi)in, ~mask);

  outHigh = (v2di)__builtin_ia32_pshufb128 ((v16qi)in,  mask);

}


// Return y if the most significant bit of x is set, and 0 otherwise.
//
// Implemented without a branch: (x >> 63) is 0 or 1, and subtracting it
// from zero in unsigned arithmetic gives an all-zeros or all-ones mask.
// Unsigned wraparound is well-defined, so this does not depend on how
// the implementation right-shifts negative signed values.
inline
uint64_t hightest (uint64_t x, uint64_t y)
{
  const uint64_t mask = static_cast<uint64_t>(0) - (x >> 63);
  return y & mask;
}


// Starting from p, repeat (exp - 64) times: shift the value left by one
// bit and, if the bit shifted out was set, xor p back in.
// This is binary long division without carry (subtraction is xor),
// yielding the 64-bit remainder.
//
// exp must be at least 64; the step count is computed in unsigned
// arithmetic and would wrap around for smaller values.
uint64_t exp_mod (unsigned exp, uint64_t p)
{
  uint64_t rem = p;
  for (unsigned steps = exp - 64; steps != 0; --steps) {
    const uint64_t feedback = hightest (rem, p);
    rem = (rem << 1) ^ feedback;
  }
  return rem;
}


// Binary long division without carry, collecting quotient bits.
// Runs 64 steps starting from p; at each step the top bit of the
// running remainder becomes the next quotient bit (most significant
// first), after which the remainder is shifted left by one and reduced
// by p if that top bit was set.
uint64_t exp129_div (uint64_t p)
{
  const uint64_t top = 1ull << 63;
  uint64_t quotient = 0;
  uint64_t rem = p;
  for (uint64_t bit = top; bit != 0; bit >>= 1) {
    if (rem & top) {
      quotient |= bit;
    }
    rem = (rem << 1) ^ hightest (rem, p);
  }
  return quotient;
}


#endif // ATH_CRC64_VEC


/*
 * @brief Reverse the order of the bits in a 64-bit word.
 * @param v Value to reflect.
 * @return The word with bit i moved to bit 63-i.
 *
 * For example,  0x0120034005600780 becomes
 *               0x01e006a002c00480
 *
 * Works by exchanging neighbouring groups of 1, 2, 4, 8, 16 and finally
 * 32 bits.
 *
 * Reference: https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
 * Originally credited to E. Freed, Dr. Dobb's Journal 8 no. 4, p. 24 (1983).
 */
uint64_t bit_reflect (uint64_t v)
{
  // Exchange each pair of adjacent width-bit groups; mask selects the
  // lower group of every pair.
  const auto swap_groups = [] (uint64_t x, unsigned width, uint64_t mask) -> uint64_t {
    return ((x >> width) & mask) | ((x & mask) << width);
  };

  v = swap_groups (v,  1, 0x5555555555555555ull);
  v = swap_groups (v,  2, 0x3333333333333333ull);
  v = swap_groups (v,  4, 0x0F0F0F0F0F0F0F0Full);
  v = swap_groups (v,  8, 0x00FF00FF00FF00FFull);
  v = swap_groups (v, 16, 0x0000FFFF0000FFFFull);
  v = swap_groups (v, 32, 0x00000000FFFFFFFFull);
  return v;
}


//****************************************************************************

// CRC helpers.

//


#if ATH_CRC64_VEC


// One folding step: carry-less multiply the two 64-bit halves of FOLD
// by the corresponding halves of K (selector 0x00 pairs the low halves,
// 0x11 the high halves) and xor both products into DATA.
__attribute__ ((target ("pclmul")))
inline
v2di folding_round (v2di fold, v2di data, v2di k)
{
  const v2di lowProduct  = clmul (fold, k, 0x00);
  const v2di highProduct = clmul (fold, k, 0x11);
  return data ^ lowProduct ^ highProduct;
}


// Reduce DATA from 128 towards 64 significant bits: carry-less multiply
// the low half of DATA by the high half of K (selector 0x10) and xor
// the product with DATA shifted right by 8 bytes.
__attribute__ ((target ("pclmul")))
inline
v2di fold_trailing_zeros (v2di data, v2di k)
{
  const v2di product  = clmul (data, k, 0x10);
  const v2di upperHalf = byteshift_r (data, 8);
  return product ^ upperHalf;
}


// Final reduction step.
// T1 is the carry-less product of the low halves of R and K.  The result
// is R xor (low half of T1 times high half of K) xor (T1 shifted left by
// 8 bytes).
__attribute__ ((target ("pclmul")))
inline
v2di barrett_reduce (v2di R, v2di k)
{
  const v2di T1 = clmul (R, k, 0x00);
  const v2di T2 = clmul (T1, k, 0x10);
  const v2di T1shifted = byteshift_l (T1, 8);
  return R ^ T2 ^ T1shifted;
}

#endif


} // anonymous namespace


namespace CxxUtils {


//***************************************************************************

// Public entry points.

//


// Precomputed constants for a CRC-64 calculation with a given polynomial.
// All members are filled in by the constructor and are public so that
// the functions in this file can read them directly.

class CRCTable

{

public:

  // p is the polynomial; initial is the starting CRC value.

  CRCTable (uint64_t p, uint64_t initial = 0xffffffffffffffff);


  // Value the CRC accumulator starts from.

  uint64_t m_initial;


  // Lookup table indexed by byte value, used by the bytewise algorithm.

  uint64_t m_table[256];


#if ATH_CRC64_VEC

  // Pair of constants used by the folding steps of the vectorized
  // algorithm.

  v2di m_fold_constants;


  // Pair of constants used by the final reduction of the vectorized
  // algorithm.

  v2di m_barrett_constants;

#endif

};


// Build the tables for polynomial p, starting CRC values from initial.
//
// The bytewise table is generated with the bit-reflected polynomial:
// entry i is the result of pushing the byte value i through eight
// shift-and-conditionally-xor steps.
// When the vectorized code is enabled, the folding and reduction
// constants are derived from p as well.
CRCTable::CRCTable (uint64_t p, uint64_t initial /* = 0xffffffffffffffff*/)
  : m_initial (initial)
{
  const uint64_t reflected = bit_reflect (p);

  for (unsigned byte = 0; byte < 256; ++byte) {
    uint64_t r = byte;
    for (int bit = 0; bit < 8; ++bit) {
      const uint64_t shifted = r >> 1;
      r = (r & 1) ? (shifted ^ reflected) : shifted;
    }
    m_table[byte] = r;
  }

#if ATH_CRC64_VEC
  // Folding constants {k1, k2}.
  v2du foldPair = { bit_reflect (exp_mod (128+64, p)) << 1,
                    bit_reflect (exp_mod (128, p)) << 1 };
  m_fold_constants = reinterpret_cast<v2di>(foldPair);

  // Reduction constants {mu, reflected polynomial}, each shifted up by
  // one bit with the low bit set.
  v2du barrettPair = { (bit_reflect (exp129_div (p)) << 1) | 1,
                       (reflected << 1) | 1 };
  m_barrett_constants = reinterpret_cast<v2di>(barrettPair);
#endif
}


// Table used by the entry points that do not take an explicit CRCTable.
// It is constructed with the default initial value.
//
// Polynomial taken from code from David T. Jones (dtj@cs.ucl.ac.uk).

// (The form given here is reversed.)

// http://www0.cs.ucl.ac.uk/staff/D.Jones/crcnote.pdf

const CRCTable defaultCRCTable (0xad93d23594c935a9);


// Destroy a CRCTable with delete.
// Passing a null pointer is harmless, since delete on null does nothing.

void deleteCRCTable (CxxUtils::CRCTable* table)

{

  delete table;

}


// Create a new heap-allocated CRCTable for polynomial p and starting
// value initial, owned by the returned unique_ptr.

std::unique_ptr<CRCTable> makeCRCTable (uint64_t p,

                                        uint64_t initial /*= 0xffffffffffffffff*/)

{

  return std::make_unique<CRCTable> (p, initial);

}


// Compute the CRC of data_len bytes starting at data, one byte at a
// time, using the lookup table and initial value held in table.
// Returns the table's initial value unchanged when data_len is zero.
uint64_t crc64_bytewise (const CRCTable& table,
                         const char* data,
                         size_t data_len)
{
  uint64_t crc = table.m_initial;
  for (size_t i = 0; i < data_len; ++i) {
    // Only the low 8 bits of the xor survive the mask, so the
    // signedness of char does not matter here.
    const uint64_t index = (crc ^ data[i]) & 0xff;
    crc = table.m_table[index] ^ (crc >> 8);
  }
  return crc;
}


// Bytewise CRC of data_len bytes at data, using defaultCRCTable.

uint64_t crc64_bytewise (const char* data,

                         size_t data_len)

{

  return crc64_bytewise (defaultCRCTable, data, data_len);

}


// Bytewise CRC of the contents of s (all s.size() bytes),
// using defaultCRCTable.

uint64_t crc64_bytewise (const std::string& s)

{

  return crc64_bytewise (defaultCRCTable, s.data(), s.size());

}


#if ATH_CRC64_VEC


// Compute the CRC of data_len bytes at data using carry-less multiply
// instructions, with the constants held in table.
//
// Outline:
//  - any bytes before the first 16-byte boundary go through
//    crc64_bytewise;
//  - the rest is folded 16 bytes at a time into a 128-bit accumulator,
//    with special handling for a short final block;
//  - the accumulator is reduced and the result is taken from its
//    upper 64-bit element.
//
// This version is compiled for the "pclmul" target; a definition with the
// same signature for the "default" target also exists in this file.

__attribute__ ((target ("pclmul")))

uint64_t crc64 (const CRCTable& table,

                const char* data,

                size_t data_len)

{

  uint64_t crc = table.m_initial;


  // Early exit if the string is empty.

  if (ATH_UNLIKELY(!data_len)) return crc;


  // The main body assumes that the data are aligned to 128 bits.

  // This should almost always be the case.  But just in case the input

  // string is not, consume the initial unaligned portion byte-by-byte

  // until it is.

  if (ATH_UNLIKELY (reinterpret_cast<unsigned long>(data) & 15)) {

    // Number of unaligned bytes we need to read from the start of the string.
    // Capped at data_len for strings that end before the next boundary.

    size_t leadin = std::min (16 - (reinterpret_cast<unsigned long>(data) & 15), data_len);

    crc = crc64_bytewise (table, data, leadin);

    data += leadin;

    data_len -= leadin;


    // Nothing left after the lead-in: the bytewise result is the answer.

    if (ATH_UNLIKELY(!data_len)) return crc;

  }


  // Accumulator for CRC value.
  // The CRC so far goes in the low element; the high element is zero.

  v2di fold = {static_cast<int64_t>(crc), 0};


  // Constants for the folding step.

  v2di k = table.m_fold_constants;


  if (ATH_UNLIKELY (data_len < 16)) {

    // Special case for less than 128 bits.
    // NOTE: this still loads a full 16-byte block, although fewer than
    // 16 bytes are valid.  data is 16-byte aligned at this point, so the
    // load stays within a single aligned block.

    v2di temp2 = load_aligned (data);

    // Shift both the running CRC and the data left by the number of
    // missing bytes.  For the data this pushes the bytes beyond data_len
    // out into B, which is not used afterwards.

    v2di crc0, crc1;

    byteshift_l256 (fold, 16-data_len, crc1, crc0);

    v2di A, B;

    byteshift_l256 (temp2, 16-data_len, B, A);


    fold = A ^ crc0;

    fold = fold_trailing_zeros (fold, k);

    // Bring back the part of the CRC that was shifted out above.

    fold ^= byteshift_l (crc1, 8);

  }

  else {

    // We have 128 bits or more.


    // Load the first 128 bits.

    fold ^= load_aligned (data);


    // Main folding loop.  Fold in 128 bits at a time until there

    // are fewer than 128 left.

    size_t n = 16;

    for (; n+16 <= data_len; n += 16) {

      v2di temp2 = load_aligned (data + n);

      fold = folding_round (fold, temp2, k);

    }


    // Handle a partial block at the end of less than 128 bits.

    if (ATH_LIKELY (n < data_len)) {

      // As above, this loads a full 16-byte block from an aligned address.

      v2di remainder = load_aligned (data + n);

      // Number of remaining bytes.

      size_t nrem = data_len - n;

      // Shift the accumulator and the final block left by the number of
      // missing bytes.  B holds what was shifted out of the accumulator,
      // C the shifted data; D (shifted-out data bytes) is not used.

      v2di A, B, C, D;

      byteshift_l256 (fold, 16-nrem, B, A);

      byteshift_l256 (remainder, 16-nrem, D, C);

      fold = folding_round (A, B|C, k);

    }

    fold = fold_trailing_zeros (fold, k);

  }


  fold = barrett_reduce (fold, table.m_barrett_constants);


  // The final CRC is the upper 64-bit element of the accumulator.

  return fold[1];

}


#endif // ATH_CRC64_VEC


// Compute the CRC of data_len bytes at data with the bytewise algorithm.
// When the vectorized code is enabled, this definition is tagged as the
// "default" target version, alongside the "pclmul" version of the same
// function defined earlier in this file.

#if ATH_CRC64_VEC

__attribute__ ((target ("default")))

#endif

uint64_t crc64 (const CRCTable& table,

                const char* data,

                size_t data_len)

{

  return crc64_bytewise (table, data, data_len);

}


// CRC of data_len bytes at data, using defaultCRCTable.

uint64_t crc64 (const char* data,

                size_t data_len)

{

  return crc64 (defaultCRCTable, data, data_len);

}


// CRC of the contents of s (all s.size() bytes), using defaultCRCTable.

uint64_t crc64 (const std::string& s)

{

  return crc64 (defaultCRCTable, s.data(), s.size());

}


// Extend an existing CRC with the bytes of the integer x, using
// defaultCRCTable.
// Bytes are consumed starting from the least significant one, and
// processing stops as soon as the remaining value is zero.  Leading zero
// bytes are therefore not included, and x == 0 leaves crc unchanged.
uint64_t crc64addint (uint64_t crc, uint64_t x)
{
  for (; x != 0; x >>= 8) {
    const uint64_t index = (crc ^ x) & 0xff;
    crc = defaultCRCTable.m_table[index] ^ (crc >> 8);
  }
  return crc;
}


// Format a CRC as a string of exactly 16 upper-case hexadecimal digits,
// most significant digit first, zero-padded.
//
// The value is printed as two 32-bit halves so that the format string
// does not depend on the width of long / long long.
std::string crc64format (uint64_t crc)
{
  const unsigned high = static_cast<unsigned>((crc >> 32) & 0xffffffff);
  const unsigned low  = static_cast<unsigned>(crc & 0xffffffff);

  // 16 digits plus the terminating NUL; snprintf bounds the write.
  char buf[17];
  snprintf (buf, sizeof (buf), "%08X%08X", high, low);
  return buf;
}


// Compute the CRC of str with crc64 and return it formatted by
// crc64format.

std::string crc64digest (const std::string& str)

{

  return crc64format (crc64 (str));

}


} // namespace CxxUtils