// 128-bit SIMD value viewed as two 64-bit signed integers
// (GCC/Clang vector extension).  Modernized from `typedef` to a
// `using` alias; the name and type are unchanged for all users.
using v2di = long long int __attribute__ ((vector_size (16)));
233 v2di load_unaligned (
const char*
x)
235 return (v2di)__builtin_ia32_loaddqu (
x);
// Load 16 bytes from x; the name and the alignment fix-up done by the
// vector CRC path suggest x must be 16-byte aligned -- TODO confirm.
// NOTE(review): the function body is not visible in this excerpt.
v2di load_aligned (const char* x)
// Shift the 128-bit value X left by N bytes.  The GCC builtin takes its
// count in bits, hence the *8.  These stay macros (not inline functions)
// because the underlying instructions require an immediate shift count /
// selector, so the argument must be a compile-time constant.
#define byteshift_l(X, N) (__builtin_ia32_pslldqi128 ((X), (N)*8))
// Shift the 128-bit value X right by N bytes.
#define byteshift_r(X, N) (__builtin_ia32_psrldqi128 ((X), (N)*8))
// Carry-less multiply of one 64-bit lane of A with one 64-bit lane of B
// (PCLMULQDQ).  Per the instruction encoding, bit 0 of WHICH selects the
// lane of A and bit 4 the lane of B: 0x00 = low*low, 0x11 = high*high,
// 0x10 = low(A)*high(B).
#define clmul(A, B, WHICH) (__builtin_ia32_pclmulqdq128 ((A), (B), (WHICH)))
299 void byteshift_l256 (v2di in,
size_t n, v2di& outHigh, v2di& outLow)
301 static const uint8_t shuffleMasks[] = {
302 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
303 0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
306 const v16qi
mask = (v16qi)load_unaligned ((
const char*)shuffleMasks + (16-
n));
307 outLow = (v2di)__builtin_ia32_pshufb128 ((v16qi)in, ~
mask);
308 outHigh = (v2di)__builtin_ia32_pshufb128 ((v16qi)in,
mask);
  // Arithmetic right shift broadcasts the sign (top) bit of x across all
  // 64 bits, so this returns y when x's MSB is set and 0 otherwise -- a
  // branch-free conditional mask used by the polynomial loops below.
  return y & (static_cast<int64_t>(x)>>63);
  // Multiply the working value d by x once per iteration (shift left one
  // bit), XOR-reducing by the polynomial p whenever the bit shifted out
  // is set (hightest yields p exactly when d's top bit is 1).
  for (unsigned i=0; i < exp-64; i++) {
    d = (d<<1) ^ hightest (d, p);
  // Bitwise polynomial long division: each iteration records the current
  // top bit of h as the next quotient bit (packed into q from the MSB
  // down), then shifts h left and XOR-subtracts p when the shifted-out
  // bit was set.
  for (unsigned i=0; i < 64; i++) {
    q |= (h & (1ull << 63)) >> i;
    h = (h << 1) ^ hightest (h, p);
372 #endif // ATH_CRC64_VEC
  // Classic O(log n) bit reversal of a 64-bit word: swap neighboring
  // bits, then 2-bit pairs, nibbles, bytes, 16-bit halves, and finally
  // the two 32-bit words.
  v = ((v >> 1) & 0x5555555555555555) | ((v & 0x5555555555555555) << 1);
  v = ((v >> 2) & 0x3333333333333333) | ((v & 0x3333333333333333) << 2);
  v = ((v >> 4) & 0x0F0F0F0F0F0F0F0F) | ((v & 0x0F0F0F0F0F0F0F0F) << 4);
  v = ((v >> 8) & 0x00FF00FF00FF00FF) | ((v & 0x00FF00FF00FF00FF) << 8);
  v = ((v >> 16) & 0x0000FFFF0000FFFF) | ((v & 0x0000FFFF0000FFFF) << 16);
  v = (v >> 32) | (v << 32);
413 v2di folding_round (v2di fold, v2di
data, v2di
k)
416 ^ clmul (fold,
k, 0x00)
417 ^ clmul (fold,
k, 0x11);
428 v2di fold_trailing_zeros (v2di
data, v2di
k)
430 return clmul (
data,
k, 0x10) ^ byteshift_r (
data, 8);
441 v2di barrett_reduce (v2di
R, v2di
k)
443 v2di T1 = clmul (
R,
k, 0x00);
445 ^ clmul (T1,
k, 0x10)
446 ^ byteshift_l (T1, 8);
  // Packed 64-bit constants for the CLMUL folding rounds (see the k1/k2
  // setup in the constructor; the exact packing of `a` is not visible in
  // this excerpt -- TODO confirm it is { k1, k2 }).
  v2di m_fold_constants;
  // Packed constants for the final Barrett reduction: mu and the
  // reflected 65-bit polynomial (see `b` in the constructor).
  v2di m_barrett_constants;
  // Build the 256-entry byte-at-a-time CRC lookup table.  NOTE(review):
  // the loop bodies are not visible in this excerpt; from the structure,
  // each byte value i presumably undergoes eight shift/conditional-XOR
  // rounds of the polynomial -- confirm against the original source.
  for (int i = 0; i < 256; i++)
    for (int j = 0; j < 8; j++)
  // Constants for the vectorized CRC (folding + Barrett reduction),
  // derived from the polynomial p.  The <<1 / |1 adjustments re-align the
  // bit-reflected 64-bit values; exp_mod/exp129_div presumably compute
  // x^n mod p and the quotient of x^129 / p -- names suggest this, TODO
  // confirm against their definitions.
  const uint64_t k1 = bit_reflect (exp_mod (128+64, p)) << 1;
  const uint64_t k2 = bit_reflect (exp_mod (128, p)) << 1;
  const uint64_t mu = (bit_reflect (exp129_div (p)) << 1) | 1;
  const uint64_t prev65 = (bit_reflect (p) << 1) | 1;
  // NOTE(review): the declaration of `a` is not visible in this excerpt;
  // presumably a v2du holding { k1, k2 } -- confirm.
  m_fold_constants = reinterpret_cast<v2di> (a);
  // Barrett constants: mu in the low lane, reflected polynomial high.
  v2du b = { mu, prev65 };
  m_barrett_constants = reinterpret_cast<v2di> (b);
  // Build the lookup table and folding/Barrett constants for polynomial
  // p with the given initial CRC value; caller owns the result.
  return std::make_unique<CRCTable> (p, initial);
  const char* end = seq + data_len;
  // Reflected (LSB-first) table-driven update step: index the table with
  // the low byte of crc XOR the next input byte, then shift crc down one
  // byte.  NOTE(review): the enclosing loop header over [seq, end) is not
  // visible in this excerpt.
  crc = table.m_table[(crc ^ *seq++) & 0xff] ^ (crc >> 8);
  // NOTE(review): this excerpt of the vectorized CRC path is missing the
  // surrounding declarations and control flow (crc0/crc1, A..D,
  // remainder, n, and the branch structure); the comments below annotate
  // only the visible statements.
  // Distance to the next 16-byte boundary, capped by the total length.
  size_t leadin = std::min (16 - (reinterpret_cast<unsigned long>(data) & 15), data_len);
  // Seed the 128-bit remainder with the incoming CRC in the low lane.
  v2di fold = { static_cast<int64_t>(crc), 0};
  v2di k = table.m_fold_constants;
  v2di temp2 = load_aligned (data);
  // Position the CRC seed and the (short) input against the data length.
  byteshift_l256 (fold, 16-data_len, crc1, crc0);
  byteshift_l256 (temp2, 16-data_len, B, A);
  fold = fold_trailing_zeros (fold, k);
  fold ^= byteshift_l (crc1, 8);
  fold ^= load_aligned (data);
  // Main loop: fold one aligned 16-byte chunk into the remainder per
  // iteration.
  for (; n+16 <= data_len; n += 16) {
    v2di temp2 = load_aligned (data + n);
    fold = folding_round (fold, temp2, k);
  // Tail: shift the final partial chunk into position and fold it in.
  size_t nrem = data_len - n;
  byteshift_l256 (fold, 16-nrem, B, A);
  byteshift_l256 (remainder, 16-nrem, D, C);
  fold = folding_round (A, B|C, k);
  fold = fold_trailing_zeros (fold, k);
  // Final reduction from the 128-bit remainder to the 64-bit CRC.
  fold = barrett_reduce (fold, table.m_barrett_constants);
682 #endif // ATH_CRC64_VEC
  // Render the 64-bit CRC as 16 uppercase hex digits: high 32-bit word
  // followed by the low word.
  // NOTE(review): prefer snprintf with an explicit buffer size; buf's
  // declaration is not visible in this excerpt.
  sprintf (buf, "%08X%08X",
           (unsigned)((crc>>32)&0xffffffff), (unsigned)(crc&0xffffffff));