MDEV-19935 Create unified CRC-32 interface
Add CRC32C code to mysys. The x86-64 implementation uses PCLMULQDQ in addition to the CRC32 instruction, following an Intel white paper, and is ported from the RocksDB code. Optimized ARM and POWER CRC32 implementations were already present in mysys.
24 changed files with 2087 additions and 1147 deletions
config.h.cmake | 9
extra/CMakeLists.txt | 12
extra/innochecksum.cc | 1
extra/mariabackup/backup_copy.cc | 1
extra/mariabackup/xbstream.cc | 2
extra/mariabackup/xtrabackup.cc | 4
include/my_sys.h | 14
mysys/CMakeLists.txt | 44
mysys/crc32/crc32_arm64.c | 19
mysys/crc32/crc32_ppc64.c | 678
mysys/crc32/crc32c.cc | 1254
mysys/crc32/crc32c_ppc.c | 5
mysys/crc32/crc32c_ppc.h | 19
mysys/crc32/crc_ppc64.h | 664
mysys/crc32ieee.cc | 44
mysys/my_init.c | 3
storage/innobase/CMakeLists.txt | 2
storage/innobase/include/ut0crc32.h | 27
storage/innobase/innodb.cmake | 10
storage/innobase/srv/srv0srv.cc | 1
storage/innobase/srv/srv0start.cc | 2
storage/innobase/ut/ut0crc32.cc | 346
unittest/mysys/CMakeLists.txt | 4
unittest/mysys/crc32-t.c | 69
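The commit message above describes a single mysys entry point that binds to the best CRC-32C kernel the CPU supports. As a rough sketch only (the function and helper names below are placeholders, not the actual my_sys.h API; crc32c_aarch64 and crc32c_vpmsum are kernel names that do appear in this diff, with signatures simplified here), such a unified interface amounts to a function pointer selected once at startup:

/*
  Illustrative sketch of a unified CRC-32C interface.  Names here are
  placeholders; the real interface lives in include/my_sys.h and the real
  kernels under mysys/crc32/.
*/
#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*crc32c_func)(uint32_t crc, const void *data, size_t len);

/* Per-platform kernels, implemented elsewhere (signatures simplified). */
extern uint32_t crc32c_sw(uint32_t, const void *, size_t);           /* portable fallback */
extern uint32_t crc32c_sse42_pclmul(uint32_t, const void *, size_t); /* x86-64: CRC32 + PCLMULQDQ */
extern uint32_t crc32c_aarch64(uint32_t, const void *, size_t);      /* ARMv8 CRC32 extension */
extern uint32_t crc32c_vpmsum(uint32_t, const void *, size_t);       /* POWER8 vpmsum */
extern int cpu_has_sse42_and_pclmul(void);                           /* hypothetical CPUID probe */

static crc32c_func crc32c_impl = crc32c_sw;

/* Bind the function pointer once, e.g. from process initialization. */
void select_crc32c_impl(void)
{
#if defined __x86_64__ || defined _M_X64
  if (cpu_has_sse42_and_pclmul())
    crc32c_impl = crc32c_sse42_pclmul;
#elif defined __aarch64__
  crc32c_impl = crc32c_aarch64;
#elif defined __powerpc64__
  crc32c_impl = crc32c_vpmsum;
#endif
}

/* Single entry point for all callers; CRC-32C (Castagnoli) polynomial. */
uint32_t compute_crc32c(uint32_t crc, const void *data, size_t len)
{
  return crc32c_impl(crc, data, len);
}

This is consistent with the file list above: InnoDB, mariabackup, xbstream, innochecksum and a new unit test are switched to one interface, while the per-architecture kernels stay isolated under mysys/crc32/.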
mysys/crc32/crc32_ppc64.c: @@ -1,675 +1,5 @@
|||
/* |
|||
* Calculate the checksum of data that is 16 byte aligned and a multiple of |
|||
* 16 bytes. |
|||
* |
|||
* The first step is to reduce it to 1024 bits. We do this in 8 parallel |
|||
* chunks in order to mask the latency of the vpmsum instructions. If we |
|||
* have more than 32 kB of data to checksum we repeat this step multiple |
|||
* times, passing in the previous 1024 bits. |
|||
* |
|||
* The next step is to reduce the 1024 bits to 64 bits. This step adds |
|||
* 32 bits of 0s to the end - this matches what a CRC does. We just |
|||
* calculate constants that land the data in this 32 bits. |
|||
* |
|||
* We then use fixed point Barrett reduction to compute a mod n over GF(2) |
|||
* for n = CRC using POWER8 instructions. We use x = 32. |
|||
* |
|||
* http://en.wikipedia.org/wiki/Barrett_reduction |
|||
* |
|||
 * This code uses gcc vector builtins instead of using assembly directly.
|||
* |
|||
* Copyright (C) 2017 Rogerio Alves <rogealve@br.ibm.com>, IBM |
|||
* |
|||
* This program is free software; you can redistribute it and/or |
|||
* modify it under the terms of either: |
|||
* |
|||
* a) the GNU General Public License as published by the Free Software |
|||
* Foundation; either version 2 of the License, or (at your option) |
|||
* any later version, or |
|||
* b) the Apache License, Version 2.0 |
|||
*/ |
|||
|
|||
#include <altivec.h> |
|||
|
|||
#define POWER8_INTRINSICS |
|||
#define CRC32_FUNCTION my_checksum |
|||
#define CRC_TABLE |
|||
|
|||
#ifdef CRC32_CONSTANTS_HEADER |
|||
#include CRC32_CONSTANTS_HEADER |
|||
#else |
|||
#include "crc32_constants.h" |
|||
#endif |
|||
|
|||
#define VMX_ALIGN 16 |
|||
#define VMX_ALIGN_MASK (VMX_ALIGN-1) |
|||
|
|||
#ifdef REFLECT |
|||
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, |
|||
unsigned long len) |
|||
{ |
|||
while (len--) |
|||
crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); |
|||
return crc; |
|||
} |
|||
#else |
|||
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, |
|||
unsigned long len) |
|||
{ |
|||
while (len--) |
|||
crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8); |
|||
return crc; |
|||
} |
|||
#endif |
|||
|
|||
static unsigned int __attribute__ ((aligned (32))) |
|||
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len); |
|||
|
|||
#ifndef CRC32_FUNCTION |
|||
#define CRC32_FUNCTION crc32_vpmsum |
|||
#endif |
|||
|
|||
unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p, |
|||
unsigned long len) |
|||
{ |
|||
unsigned int prealign; |
|||
unsigned int tail; |
|||
|
|||
#ifdef CRC_XOR |
|||
crc ^= 0xffffffff; |
|||
#endif |
|||
|
|||
if (len < VMX_ALIGN + VMX_ALIGN_MASK) { |
|||
crc = crc32_align(crc, p, len); |
|||
goto out; |
|||
} |
|||
|
|||
if ((unsigned long)p & VMX_ALIGN_MASK) { |
|||
prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); |
|||
crc = crc32_align(crc, p, prealign); |
|||
len -= prealign; |
|||
p += prealign; |
|||
} |
|||
|
|||
crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); |
|||
|
|||
tail = len & VMX_ALIGN_MASK; |
|||
if (tail) { |
|||
p += len & ~VMX_ALIGN_MASK; |
|||
crc = crc32_align(crc, p, tail); |
|||
} |
|||
|
|||
out: |
|||
#ifdef CRC_XOR |
|||
crc ^= 0xffffffff; |
|||
#endif |
|||
|
|||
return crc; |
|||
} |
|||
|
|||
#if defined (__clang__) |
|||
#include "clang_workaround.h" |
|||
#else |
|||
#define __builtin_pack_vector(a, b) __builtin_pack_vector_int128 ((a), (b)) |
|||
#define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 0) |
|||
#define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 1) |
|||
#endif |
|||
|
|||
/* When we have a load-store in a single-dispatch group and address overlap |
|||
 * such that forwarding is not allowed (load-hit-store) the group must be flushed.
|||
* A group ending NOP prevents the flush. |
|||
*/ |
|||
#define GROUP_ENDING_NOP asm("ori 2,2,0" ::: "memory") |
|||
|
|||
#if defined(__BIG_ENDIAN__) && defined (REFLECT) |
|||
#define BYTESWAP_DATA |
|||
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) |
|||
#define BYTESWAP_DATA |
|||
#endif |
|||
|
|||
#ifdef BYTESWAP_DATA |
|||
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb,\ |
|||
(__vector unsigned char) vc) |
|||
#if defined(__LITTLE_ENDIAN__) |
|||
/* Byte reverse permute constant LE. */ |
|||
static const __vector unsigned long long vperm_const |
|||
__attribute__ ((aligned(16))) = { 0x08090A0B0C0D0E0FUL, |
|||
0x0001020304050607UL }; |
|||
#else |
|||
static const __vector unsigned long long vperm_const |
|||
__attribute__ ((aligned(16))) = { 0x0F0E0D0C0B0A0908UL, |
|||
0X0706050403020100UL }; |
|||
#endif |
|||
#else |
|||
#define VEC_PERM(vr, va, vb, vc) |
|||
#endif |
|||
|
|||
static unsigned int __attribute__ ((aligned (32))) |
|||
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) { |
|||
|
|||
const __vector unsigned long long vzero = {0,0}; |
|||
const __vector unsigned long long vones = {0xffffffffffffffffUL, |
|||
0xffffffffffffffffUL}; |
|||
|
|||
#ifdef REFLECT |
|||
__vector unsigned char vsht_splat; |
|||
const __vector unsigned long long vmask_32bit = |
|||
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, |
|||
(__vector unsigned char)vones, 4); |
|||
#endif |
|||
|
|||
const __vector unsigned long long vmask_64bit = |
|||
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, |
|||
(__vector unsigned char)vones, 8); |
|||
|
|||
__vector unsigned long long vcrc; |
|||
|
|||
__vector unsigned long long vconst1, vconst2; |
|||
|
|||
/* vdata0-vdata7 will contain our data (p). */ |
|||
__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, |
|||
vdata5, vdata6, vdata7; |
|||
|
|||
/* v0-v7 will contain our checksums */ |
|||
__vector unsigned long long v0 = {0,0}; |
|||
__vector unsigned long long v1 = {0,0}; |
|||
__vector unsigned long long v2 = {0,0}; |
|||
__vector unsigned long long v3 = {0,0}; |
|||
__vector unsigned long long v4 = {0,0}; |
|||
__vector unsigned long long v5 = {0,0}; |
|||
__vector unsigned long long v6 = {0,0}; |
|||
__vector unsigned long long v7 = {0,0}; |
|||
|
|||
|
|||
/* Vector auxiliary variables. */ |
|||
__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7; |
|||
|
|||
unsigned int result = 0; |
|||
unsigned int offset; /* Constant table offset. */ |
|||
|
|||
unsigned long i; /* Counter. */ |
|||
unsigned long chunks; |
|||
|
|||
unsigned long block_size; |
|||
int next_block = 0; |
|||
|
|||
/* Align by 128 bits. The last 128 bit block will be processed at end. */ |
|||
unsigned long length = len & 0xFFFFFFFFFFFFFF80UL; |
|||
|
|||
#ifdef REFLECT |
|||
vcrc = (__vector unsigned long long)__builtin_pack_vector(0UL, crc); |
|||
#else |
|||
vcrc = (__vector unsigned long long)__builtin_pack_vector(crc, 0UL); |
|||
|
|||
/* Shift into top 32 bits */ |
|||
vcrc = (__vector unsigned long long)vec_sld((__vector unsigned char)vcrc, |
|||
(__vector unsigned char)vzero, 4); |
|||
#endif |
|||
|
|||
/* Short version. */ |
|||
if (len < 256) { |
|||
/* Calculate where in the constant table we need to start. */ |
|||
offset = 256 - len; |
|||
|
|||
vconst1 = vec_ld(offset, vcrc_short_const); |
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vconst1, vperm_const); |
|||
|
|||
/* xor initial value*/ |
|||
vdata0 = vec_xor(vdata0, vcrc); |
|||
|
|||
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw |
|||
((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); |
|||
v0 = vec_xor(v0, vdata0); |
|||
|
|||
for (i = 16; i < len; i += 16) { |
|||
vconst1 = vec_ld(offset + i, vcrc_short_const); |
|||
vdata0 = vec_ld(i, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vconst1, vperm_const); |
|||
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw |
|||
((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); |
|||
v0 = vec_xor(v0, vdata0); |
|||
} |
|||
} else { |
|||
|
|||
/* Load initial values. */ |
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
vdata1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const); |
|||
|
|||
vdata2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
vdata3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const); |
|||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const); |
|||
|
|||
vdata4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
vdata5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const); |
|||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const); |
|||
|
|||
vdata6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
vdata7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const); |
|||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const); |
|||
|
|||
/* xor in initial value */ |
|||
vdata0 = vec_xor(vdata0, vcrc); |
|||
|
|||
p = (char *)p + 128; |
|||
|
|||
do { |
|||
/* Checksum in blocks of MAX_SIZE. */ |
|||
block_size = length; |
|||
if (block_size > MAX_SIZE) { |
|||
block_size = MAX_SIZE; |
|||
} |
|||
|
|||
length = length - block_size; |
|||
|
|||
/* |
|||
* Work out the offset into the constants table to start at. Each |
|||
* constant is 16 bytes, and it is used against 128 bytes of input |
|||
* data - 128 / 16 = 8 |
|||
*/ |
|||
offset = (MAX_SIZE/8) - (block_size/8); |
|||
/* We reduce our final 128 bytes in a separate step */ |
|||
chunks = (block_size/128)-1; |
|||
|
|||
vconst1 = vec_ld(offset, vcrc_const); |
|||
|
|||
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0, |
|||
(__vector unsigned long long)vconst1); |
|||
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1, |
|||
(__vector unsigned long long)vconst1); |
|||
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2, |
|||
(__vector unsigned long long)vconst1); |
|||
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3, |
|||
(__vector unsigned long long)vconst1); |
|||
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4, |
|||
(__vector unsigned long long)vconst1); |
|||
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5, |
|||
(__vector unsigned long long)vconst1); |
|||
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6, |
|||
(__vector unsigned long long)vconst1); |
|||
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7, |
|||
(__vector unsigned long long)vconst1); |
|||
|
|||
if (chunks > 1) { |
|||
offset += 16; |
|||
vconst2 = vec_ld(offset, vcrc_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
|
|||
vdata1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const); |
|||
|
|||
vdata2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const); |
|||
|
|||
vdata3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const); |
|||
|
|||
vdata4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const); |
|||
|
|||
vdata5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const); |
|||
|
|||
vdata6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const); |
|||
|
|||
vdata7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const); |
|||
|
|||
p = (char *)p + 128; |
|||
|
|||
/* |
|||
* main loop. We modulo schedule it such that it takes three |
|||
* iterations to complete - first iteration load, second |
|||
* iteration vpmsum, third iteration xor. |
|||
*/ |
|||
for (i = 0; i < chunks-2; i++) { |
|||
vconst1 = vec_ld(offset, vcrc_const); |
|||
offset += 16; |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v0 = vec_xor(v0, va0); |
|||
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata0, (__vector unsigned long long)vconst2); |
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v1 = vec_xor(v1, va1); |
|||
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata1, (__vector unsigned long long)vconst2); |
|||
vdata1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v2 = vec_xor(v2, va2); |
|||
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata2, (__vector unsigned long long)vconst2); |
|||
vdata2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v3 = vec_xor(v3, va3); |
|||
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata3, (__vector unsigned long long)vconst2); |
|||
vdata3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const); |
|||
|
|||
vconst2 = vec_ld(offset, vcrc_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v4 = vec_xor(v4, va4); |
|||
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata4, (__vector unsigned long long)vconst1); |
|||
vdata4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v5 = vec_xor(v5, va5); |
|||
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata5, (__vector unsigned long long)vconst1); |
|||
vdata5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v6 = vec_xor(v6, va6); |
|||
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata6, (__vector unsigned long long)vconst1); |
|||
vdata6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v7 = vec_xor(v7, va7); |
|||
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata7, (__vector unsigned long long)vconst1); |
|||
vdata7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const); |
|||
|
|||
p = (char *)p + 128; |
|||
} |
|||
|
|||
/* First cool down*/ |
|||
vconst1 = vec_ld(offset, vcrc_const); |
|||
offset += 16; |
|||
|
|||
v0 = vec_xor(v0, va0); |
|||
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata0, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v1 = vec_xor(v1, va1); |
|||
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata1, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v2 = vec_xor(v2, va2); |
|||
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata2, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v3 = vec_xor(v3, va3); |
|||
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata3, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v4 = vec_xor(v4, va4); |
|||
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata4, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v5 = vec_xor(v5, va5); |
|||
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata5, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v6 = vec_xor(v6, va6); |
|||
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata6, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v7 = vec_xor(v7, va7); |
|||
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata7, (__vector unsigned long long)vconst1); |
|||
}/* else */ |
|||
|
|||
/* Second cool down. */ |
|||
v0 = vec_xor(v0, va0); |
|||
v1 = vec_xor(v1, va1); |
|||
v2 = vec_xor(v2, va2); |
|||
v3 = vec_xor(v3, va3); |
|||
v4 = vec_xor(v4, va4); |
|||
v5 = vec_xor(v5, va5); |
|||
v6 = vec_xor(v6, va6); |
|||
v7 = vec_xor(v7, va7); |
|||
|
|||
#ifdef REFLECT |
|||
/* |
|||
* vpmsumd produces a 96 bit result in the least significant bits |
|||
* of the register. Since we are bit reflected we have to shift it |
|||
* left 32 bits so it occupies the least significant bits in the |
|||
* bit reflected domain. |
|||
*/ |
|||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
|||
(__vector unsigned char)vzero, 4); |
|||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1, |
|||
(__vector unsigned char)vzero, 4); |
|||
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2, |
|||
(__vector unsigned char)vzero, 4); |
|||
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3, |
|||
(__vector unsigned char)vzero, 4); |
|||
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4, |
|||
(__vector unsigned char)vzero, 4); |
|||
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5, |
|||
(__vector unsigned char)vzero, 4); |
|||
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6, |
|||
(__vector unsigned char)vzero, 4); |
|||
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7, |
|||
(__vector unsigned char)vzero, 4); |
|||
#endif |
|||
|
|||
/* xor with the last 1024 bits. */ |
|||
va0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(va0, va0, va0, vperm_const); |
|||
|
|||
va1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
VEC_PERM(va1, va1, va1, vperm_const); |
|||
|
|||
va2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
VEC_PERM(va2, va2, va2, vperm_const); |
|||
|
|||
va3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
VEC_PERM(va3, va3, va3, vperm_const); |
|||
|
|||
va4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
VEC_PERM(va4, va4, va4, vperm_const); |
|||
|
|||
va5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
VEC_PERM(va5, va5, va5, vperm_const); |
|||
|
|||
va6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
VEC_PERM(va6, va6, va6, vperm_const); |
|||
|
|||
va7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
VEC_PERM(va7, va7, va7, vperm_const); |
|||
|
|||
p = (char *)p + 128; |
|||
|
|||
vdata0 = vec_xor(v0, va0); |
|||
vdata1 = vec_xor(v1, va1); |
|||
vdata2 = vec_xor(v2, va2); |
|||
vdata3 = vec_xor(v3, va3); |
|||
vdata4 = vec_xor(v4, va4); |
|||
vdata5 = vec_xor(v5, va5); |
|||
vdata6 = vec_xor(v6, va6); |
|||
vdata7 = vec_xor(v7, va7); |
|||
|
|||
/* Check if we have more blocks to process */ |
|||
next_block = 0; |
|||
if (length != 0) { |
|||
next_block = 1; |
|||
|
|||
/* zero v0-v7 */ |
|||
v0 = vec_xor(v0, v0); |
|||
v1 = vec_xor(v1, v1); |
|||
v2 = vec_xor(v2, v2); |
|||
v3 = vec_xor(v3, v3); |
|||
v4 = vec_xor(v4, v4); |
|||
v5 = vec_xor(v5, v5); |
|||
v6 = vec_xor(v6, v6); |
|||
v7 = vec_xor(v7, v7); |
|||
} |
|||
length = length + 128; |
|||
|
|||
} while (next_block); |
|||
|
|||
/* Calculate how many bytes we have left. */ |
|||
length = (len & 127); |
|||
|
|||
/* Calculate where in (short) constant table we need to start. */ |
|||
offset = 128 - length; |
|||
|
|||
v0 = vec_ld(offset, vcrc_short_const); |
|||
v1 = vec_ld(offset + 16, vcrc_short_const); |
|||
v2 = vec_ld(offset + 32, vcrc_short_const); |
|||
v3 = vec_ld(offset + 48, vcrc_short_const); |
|||
v4 = vec_ld(offset + 64, vcrc_short_const); |
|||
v5 = vec_ld(offset + 80, vcrc_short_const); |
|||
v6 = vec_ld(offset + 96, vcrc_short_const); |
|||
v7 = vec_ld(offset + 112, vcrc_short_const); |
|||
|
|||
offset += 128; |
|||
|
|||
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata0,(__vector unsigned int)v0); |
|||
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata1,(__vector unsigned int)v1); |
|||
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata2,(__vector unsigned int)v2); |
|||
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata3,(__vector unsigned int)v3); |
|||
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata4,(__vector unsigned int)v4); |
|||
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata5,(__vector unsigned int)v5); |
|||
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata6,(__vector unsigned int)v6); |
|||
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata7,(__vector unsigned int)v7); |
|||
|
|||
/* Now reduce the tail (0-112 bytes). */ |
|||
for (i = 0; i < length; i+=16) { |
|||
vdata0 = vec_ld(i,(__vector unsigned long long*)p); |
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
va0 = vec_ld(offset + i,vcrc_short_const); |
|||
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata0,(__vector unsigned int)va0); |
|||
v0 = vec_xor(v0, va0); |
|||
} |
|||
|
|||
/* xor all parallel chunks together. */ |
|||
v0 = vec_xor(v0, v1); |
|||
v2 = vec_xor(v2, v3); |
|||
v4 = vec_xor(v4, v5); |
|||
v6 = vec_xor(v6, v7); |
|||
|
|||
v0 = vec_xor(v0, v2); |
|||
v4 = vec_xor(v4, v6); |
|||
|
|||
v0 = vec_xor(v0, v4); |
|||
} |
|||
|
|||
/* Barrett Reduction */ |
|||
vconst1 = vec_ld(0, v_Barrett_const); |
|||
vconst2 = vec_ld(16, v_Barrett_const); |
|||
|
|||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
|||
(__vector unsigned char)v0, 8); |
|||
v0 = vec_xor(v1,v0); |
|||
|
|||
#ifdef REFLECT |
|||
/* shift left one bit */ |
|||
vsht_splat = vec_splat_u8 (1); |
|||
v0 = (__vector unsigned long long)vec_sll ((__vector unsigned char)v0, |
|||
vsht_splat); |
|||
#endif |
|||
|
|||
v0 = vec_and(v0, vmask_64bit); |
|||
|
|||
#ifndef REFLECT |
|||
|
|||
/* |
|||
* Now for the actual algorithm. The idea is to calculate q, |
|||
* the multiple of our polynomial that we need to subtract. By |
|||
* doing the computation 2x bits higher (ie 64 bits) and shifting the |
|||
* result back down 2x bits, we round down to the nearest multiple. |
|||
*/ |
|||
|
|||
/* ma */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0, |
|||
(__vector unsigned long long)vconst1); |
|||
/* q = floor(ma/(2^64)) */ |
|||
v1 = (__vector unsigned long long)vec_sld ((__vector unsigned char)vzero, |
|||
(__vector unsigned char)v1, 8); |
|||
/* qn */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, |
|||
(__vector unsigned long long)vconst2); |
|||
/* a - qn, subtraction is xor in GF(2) */ |
|||
v0 = vec_xor (v0, v1); |
|||
/* |
|||
* Get the result into r3. We need to shift it left 8 bytes: |
|||
* V0 [ 0 1 2 X ] |
|||
* V0 [ 0 X 2 3 ] |
|||
*/ |
|||
result = __builtin_unpack_vector_1 (v0); |
|||
#else |
|||
|
|||
/* |
|||
* The reflected version of Barrett reduction. Instead of bit |
|||
* reflecting our data (which is expensive to do), we bit reflect our |
|||
* constants and our algorithm, which means the intermediate data in |
|||
* our vector registers goes from 0-63 instead of 63-0. We can reflect |
|||
* the algorithm because we don't carry in mod 2 arithmetic. |
|||
*/ |
|||
|
|||
/* bottom 32 bits of a */ |
|||
v1 = vec_and(v0, vmask_32bit); |
|||
|
|||
/* ma */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, |
|||
(__vector unsigned long long)vconst1); |
|||
|
|||
/* bottom 32bits of ma */ |
|||
v1 = vec_and(v1, vmask_32bit); |
|||
/* qn */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, |
|||
(__vector unsigned long long)vconst2); |
|||
/* a - qn, subtraction is xor in GF(2) */ |
|||
v0 = vec_xor (v0, v1); |
|||
|
|||
/* |
|||
* Since we are bit reflected, the result (ie the low 32 bits) is in |
|||
* the high 32 bits. We just need to shift it left 4 bytes |
|||
* V0 [ 0 1 X 3 ] |
|||
* V0 [ 0 X 2 3 ] |
|||
*/ |
|||
|
|||
/* shift result into top 64 bits of */ |
|||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
|||
(__vector unsigned char)vzero, 4); |
|||
|
|||
result = __builtin_unpack_vector_0 (v0); |
|||
#endif |
|||
|
|||
return result; |
|||
} |
#define POWER8_INTRINSICS
#include "pcc_crc32_constants.h"
#include "crc_ppc64.h"

mysys/crc32/crc32c.cc: file diff suppressed because it is too large (1254 lines changed).
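Since the crc32c.cc diff is suppressed, here is a minimal sketch of the basic x86-64 idea only: CRC-32C accumulated 8 bytes at a time with the SSE4.2 CRC32 instruction. The actual file is the RocksDB-derived version, which additionally interleaves several CRC32 streams and recombines them with PCLMULQDQ; this sketch does not show that part.

/* Minimal illustration of the plain SSE4.2 path; not the RocksDB port. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <nmmintrin.h>

__attribute__((target("sse4.2")))
static uint32_t crc32c_sse42_simple(uint32_t crc, const unsigned char *p, size_t len)
{
  uint64_t c= (uint32_t) ~crc;          /* CRC-32C works on the inverted value */
  for (; len >= 8; len-= 8, p+= 8)      /* 8 bytes per CRC32 instruction */
  {
    uint64_t word;
    memcpy(&word, p, 8);                /* unaligned-safe load */
    c= _mm_crc32_u64(c, word);
  }
  while (len--)                         /* byte-at-a-time tail */
    c= _mm_crc32_u8((uint32_t) c, *p++);
  return ~(uint32_t) c;
}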
mysys/crc32/crc32c_ppc.c: @@ -0,0 +1,5 @@
#define CRC32_FUNCTION crc32c_ppc
#define CRC_TABLE
#define POWER8_INTRINSICS
#include "pcc_crc32c_constants.h"
#include "crc_ppc64.h"
mysys/crc32/crc32c_ppc.h: @@ -0,0 +1,19 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// Copyright (c) 2017 International Business Machines Corp.
// All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#ifdef __cplusplus
extern "C" {
#endif

extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer,
                           unsigned len);

#ifdef __cplusplus
}
#endif
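The header above fixes the exact signature of the POWER CRC-32C kernel. A hypothetical direct caller (buffer name and seed are illustrative; in the tree this kernel is reached through the unified mysys interface rather than called directly) would look like:

/* Hypothetical direct use of the kernel declared above. */
#include <stdint.h>

extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer,
                           unsigned len);

uint32_t page_crc32c(const unsigned char *page, unsigned page_size)
{
  uint32_t crc= 0;                       /* seed value; 0 is illustrative */
  crc= crc32c_ppc(crc, page, page_size); /* single-shot CRC over the page */
  return crc;
}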
mysys/crc32/crc_ppc64.h: @@ -0,0 +1,664 @@
|||
/* |
|||
* Calculate the checksum of data that is 16 byte aligned and a multiple of |
|||
* 16 bytes. |
|||
* |
|||
* The first step is to reduce it to 1024 bits. We do this in 8 parallel |
|||
* chunks in order to mask the latency of the vpmsum instructions. If we |
|||
* have more than 32 kB of data to checksum we repeat this step multiple |
|||
* times, passing in the previous 1024 bits. |
|||
* |
|||
* The next step is to reduce the 1024 bits to 64 bits. This step adds |
|||
* 32 bits of 0s to the end - this matches what a CRC does. We just |
|||
* calculate constants that land the data in this 32 bits. |
|||
* |
|||
* We then use fixed point Barrett reduction to compute a mod n over GF(2) |
|||
* for n = CRC using POWER8 instructions. We use x = 32. |
|||
* |
|||
* http://en.wikipedia.org/wiki/Barrett_reduction |
|||
* |
|||
 * This code uses gcc vector builtins instead of using assembly directly.
|||
* |
|||
* Copyright (C) 2017 Rogerio Alves <rogealve@br.ibm.com>, IBM |
|||
* |
|||
* This program is free software; you can redistribute it and/or |
|||
* modify it under the terms of either: |
|||
* |
|||
* a) the GNU General Public License as published by the Free Software |
|||
* Foundation; either version 2 of the License, or (at your option) |
|||
* any later version, or |
|||
* b) the Apache License, Version 2.0 |
|||
*/ |
|||
|
|||
#include <altivec.h> |
|||
|
|||
|
|||
#define VMX_ALIGN 16 |
|||
#define VMX_ALIGN_MASK (VMX_ALIGN-1) |
|||
|
|||
#ifdef REFLECT |
|||
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, |
|||
unsigned long len) |
|||
{ |
|||
while (len--) |
|||
crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); |
|||
return crc; |
|||
} |
|||
#else |
|||
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, |
|||
unsigned long len) |
|||
{ |
|||
while (len--) |
|||
crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8); |
|||
return crc; |
|||
} |
|||
#endif |
|||
|
|||
static unsigned int __attribute__ ((aligned (32))) |
|||
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len); |
|||
|
|||
|
|||
unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p, |
|||
unsigned long len) |
|||
{ |
|||
unsigned int prealign; |
|||
unsigned int tail; |
|||
|
|||
#ifdef CRC_XOR |
|||
crc ^= 0xffffffff; |
|||
#endif |
|||
|
|||
if (len < VMX_ALIGN + VMX_ALIGN_MASK) { |
|||
crc = crc32_align(crc, p, len); |
|||
goto out; |
|||
} |
|||
|
|||
if ((unsigned long)p & VMX_ALIGN_MASK) { |
|||
prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); |
|||
crc = crc32_align(crc, p, prealign); |
|||
len -= prealign; |
|||
p += prealign; |
|||
} |
|||
|
|||
crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); |
|||
|
|||
tail = len & VMX_ALIGN_MASK; |
|||
if (tail) { |
|||
p += len & ~VMX_ALIGN_MASK; |
|||
crc = crc32_align(crc, p, tail); |
|||
} |
|||
|
|||
out: |
|||
#ifdef CRC_XOR |
|||
crc ^= 0xffffffff; |
|||
#endif |
|||
|
|||
return crc; |
|||
} |
|||
|
|||
#if defined (__clang__) |
|||
#include "clang_workaround.h" |
|||
#else |
|||
#define __builtin_pack_vector(a, b) __builtin_pack_vector_int128 ((a), (b)) |
|||
#define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 0) |
|||
#define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 1) |
|||
#endif |
|||
|
|||
/* When we have a load-store in a single-dispatch group and address overlap |
|||
 * such that forwarding is not allowed (load-hit-store) the group must be flushed.
|||
* A group ending NOP prevents the flush. |
|||
*/ |
|||
#define GROUP_ENDING_NOP asm("ori 2,2,0" ::: "memory") |
|||
|
|||
#if defined(__BIG_ENDIAN__) && defined (REFLECT) |
|||
#define BYTESWAP_DATA |
|||
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) |
|||
#define BYTESWAP_DATA |
|||
#endif |
|||
|
|||
#ifdef BYTESWAP_DATA |
|||
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb,\ |
|||
(__vector unsigned char) vc) |
|||
#if defined(__LITTLE_ENDIAN__) |
|||
/* Byte reverse permute constant LE. */ |
|||
static const __vector unsigned long long vperm_const |
|||
__attribute__ ((aligned(16))) = { 0x08090A0B0C0D0E0FUL, |
|||
0x0001020304050607UL }; |
|||
#else |
|||
static const __vector unsigned long long vperm_const |
|||
__attribute__ ((aligned(16))) = { 0x0F0E0D0C0B0A0908UL, |
|||
0X0706050403020100UL }; |
|||
#endif |
|||
#else |
|||
#define VEC_PERM(vr, va, vb, vc) |
|||
#endif |
|||
|
|||
static unsigned int __attribute__ ((aligned (32))) |
|||
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) { |
|||
|
|||
const __vector unsigned long long vzero = {0,0}; |
|||
const __vector unsigned long long vones = {0xffffffffffffffffUL, |
|||
0xffffffffffffffffUL}; |
|||
|
|||
#ifdef REFLECT |
|||
__vector unsigned char vsht_splat; |
|||
const __vector unsigned long long vmask_32bit = |
|||
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, |
|||
(__vector unsigned char)vones, 4); |
|||
#endif |
|||
|
|||
const __vector unsigned long long vmask_64bit = |
|||
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, |
|||
(__vector unsigned char)vones, 8); |
|||
|
|||
__vector unsigned long long vcrc; |
|||
|
|||
__vector unsigned long long vconst1, vconst2; |
|||
|
|||
/* vdata0-vdata7 will contain our data (p). */ |
|||
__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, |
|||
vdata5, vdata6, vdata7; |
|||
|
|||
/* v0-v7 will contain our checksums */ |
|||
__vector unsigned long long v0 = {0,0}; |
|||
__vector unsigned long long v1 = {0,0}; |
|||
__vector unsigned long long v2 = {0,0}; |
|||
__vector unsigned long long v3 = {0,0}; |
|||
__vector unsigned long long v4 = {0,0}; |
|||
__vector unsigned long long v5 = {0,0}; |
|||
__vector unsigned long long v6 = {0,0}; |
|||
__vector unsigned long long v7 = {0,0}; |
|||
|
|||
|
|||
/* Vector auxiliary variables. */ |
|||
__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7; |
|||
|
|||
unsigned int result = 0; |
|||
unsigned int offset; /* Constant table offset. */ |
|||
|
|||
unsigned long i; /* Counter. */ |
|||
unsigned long chunks; |
|||
|
|||
unsigned long block_size; |
|||
int next_block = 0; |
|||
|
|||
/* Align by 128 bits. The last 128 bit block will be processed at end. */ |
|||
unsigned long length = len & 0xFFFFFFFFFFFFFF80UL; |
|||
|
|||
#ifdef REFLECT |
|||
vcrc = (__vector unsigned long long)__builtin_pack_vector(0UL, crc); |
|||
#else |
|||
vcrc = (__vector unsigned long long)__builtin_pack_vector(crc, 0UL); |
|||
|
|||
/* Shift into top 32 bits */ |
|||
vcrc = (__vector unsigned long long)vec_sld((__vector unsigned char)vcrc, |
|||
(__vector unsigned char)vzero, 4); |
|||
#endif |
|||
|
|||
/* Short version. */ |
|||
if (len < 256) { |
|||
/* Calculate where in the constant table we need to start. */ |
|||
offset = 256 - len; |
|||
|
|||
vconst1 = vec_ld(offset, vcrc_short_const); |
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vconst1, vperm_const); |
|||
|
|||
/* xor initial value*/ |
|||
vdata0 = vec_xor(vdata0, vcrc); |
|||
|
|||
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw |
|||
((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); |
|||
v0 = vec_xor(v0, vdata0); |
|||
|
|||
for (i = 16; i < len; i += 16) { |
|||
vconst1 = vec_ld(offset + i, vcrc_short_const); |
|||
vdata0 = vec_ld(i, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vconst1, vperm_const); |
|||
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw |
|||
((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); |
|||
v0 = vec_xor(v0, vdata0); |
|||
} |
|||
} else { |
|||
|
|||
/* Load initial values. */ |
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
vdata1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const); |
|||
|
|||
vdata2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
vdata3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const); |
|||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const); |
|||
|
|||
vdata4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
vdata5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const); |
|||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const); |
|||
|
|||
vdata6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
vdata7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const); |
|||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const); |
|||
|
|||
/* xor in initial value */ |
|||
vdata0 = vec_xor(vdata0, vcrc); |
|||
|
|||
p = (char *)p + 128; |
|||
|
|||
do { |
|||
/* Checksum in blocks of MAX_SIZE. */ |
|||
block_size = length; |
|||
if (block_size > MAX_SIZE) { |
|||
block_size = MAX_SIZE; |
|||
} |
|||
|
|||
length = length - block_size; |
|||
|
|||
/* |
|||
* Work out the offset into the constants table to start at. Each |
|||
* constant is 16 bytes, and it is used against 128 bytes of input |
|||
* data - 128 / 16 = 8 |
|||
*/ |
|||
offset = (MAX_SIZE/8) - (block_size/8); |
|||
/* We reduce our final 128 bytes in a separate step */ |
|||
chunks = (block_size/128)-1; |
|||
|
|||
vconst1 = vec_ld(offset, vcrc_const); |
|||
|
|||
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0, |
|||
(__vector unsigned long long)vconst1); |
|||
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1, |
|||
(__vector unsigned long long)vconst1); |
|||
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2, |
|||
(__vector unsigned long long)vconst1); |
|||
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3, |
|||
(__vector unsigned long long)vconst1); |
|||
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4, |
|||
(__vector unsigned long long)vconst1); |
|||
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5, |
|||
(__vector unsigned long long)vconst1); |
|||
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6, |
|||
(__vector unsigned long long)vconst1); |
|||
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7, |
|||
(__vector unsigned long long)vconst1); |
|||
|
|||
if (chunks > 1) { |
|||
offset += 16; |
|||
vconst2 = vec_ld(offset, vcrc_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
|
|||
vdata1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const); |
|||
|
|||
vdata2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const); |
|||
|
|||
vdata3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const); |
|||
|
|||
vdata4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const); |
|||
|
|||
vdata5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const); |
|||
|
|||
vdata6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const); |
|||
|
|||
vdata7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const); |
|||
|
|||
p = (char *)p + 128; |
|||
|
|||
/* |
|||
* main loop. We modulo schedule it such that it takes three |
|||
* iterations to complete - first iteration load, second |
|||
* iteration vpmsum, third iteration xor. |
|||
*/ |
|||
for (i = 0; i < chunks-2; i++) { |
|||
vconst1 = vec_ld(offset, vcrc_const); |
|||
offset += 16; |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v0 = vec_xor(v0, va0); |
|||
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata0, (__vector unsigned long long)vconst2); |
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v1 = vec_xor(v1, va1); |
|||
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata1, (__vector unsigned long long)vconst2); |
|||
vdata1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v2 = vec_xor(v2, va2); |
|||
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata2, (__vector unsigned long long)vconst2); |
|||
vdata2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v3 = vec_xor(v3, va3); |
|||
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata3, (__vector unsigned long long)vconst2); |
|||
vdata3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const); |
|||
|
|||
vconst2 = vec_ld(offset, vcrc_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v4 = vec_xor(v4, va4); |
|||
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata4, (__vector unsigned long long)vconst1); |
|||
vdata4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v5 = vec_xor(v5, va5); |
|||
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata5, (__vector unsigned long long)vconst1); |
|||
vdata5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v6 = vec_xor(v6, va6); |
|||
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata6, (__vector unsigned long long)vconst1); |
|||
vdata6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v7 = vec_xor(v7, va7); |
|||
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata7, (__vector unsigned long long)vconst1); |
|||
vdata7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const); |
|||
|
|||
p = (char *)p + 128; |
|||
} |
|||
|
|||
/* First cool down*/ |
|||
vconst1 = vec_ld(offset, vcrc_const); |
|||
offset += 16; |
|||
|
|||
v0 = vec_xor(v0, va0); |
|||
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata0, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v1 = vec_xor(v1, va1); |
|||
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata1, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v2 = vec_xor(v2, va2); |
|||
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata2, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v3 = vec_xor(v3, va3); |
|||
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata3, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v4 = vec_xor(v4, va4); |
|||
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata4, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v5 = vec_xor(v5, va5); |
|||
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata5, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v6 = vec_xor(v6, va6); |
|||
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata6, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v7 = vec_xor(v7, va7); |
|||
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata7, (__vector unsigned long long)vconst1); |
|||
}/* else */ |
|||
|
|||
/* Second cool down. */ |
|||
v0 = vec_xor(v0, va0); |
|||
v1 = vec_xor(v1, va1); |
|||
v2 = vec_xor(v2, va2); |
|||
v3 = vec_xor(v3, va3); |
|||
v4 = vec_xor(v4, va4); |
|||
v5 = vec_xor(v5, va5); |
|||
v6 = vec_xor(v6, va6); |
|||
v7 = vec_xor(v7, va7); |
|||
|
|||
#ifdef REFLECT |
|||
/* |
|||
* vpmsumd produces a 96 bit result in the least significant bits |
|||
* of the register. Since we are bit reflected we have to shift it |
|||
* left 32 bits so it occupies the least significant bits in the |
|||
* bit reflected domain. |
|||
*/ |
|||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
|||
(__vector unsigned char)vzero, 4); |
|||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1, |
|||
(__vector unsigned char)vzero, 4); |
|||
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2, |
|||
(__vector unsigned char)vzero, 4); |
|||
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3, |
|||
(__vector unsigned char)vzero, 4); |
|||
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4, |
|||
(__vector unsigned char)vzero, 4); |
|||
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5, |
|||
(__vector unsigned char)vzero, 4); |
|||
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6, |
|||
(__vector unsigned char)vzero, 4); |
|||
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7, |
|||
(__vector unsigned char)vzero, 4); |
|||
#endif |
|||
|
|||
/* xor with the last 1024 bits. */ |
|||
va0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(va0, va0, va0, vperm_const); |
|||
|
|||
va1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
VEC_PERM(va1, va1, va1, vperm_const); |
|||
|
|||
va2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
VEC_PERM(va2, va2, va2, vperm_const); |
|||
|
|||
va3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
VEC_PERM(va3, va3, va3, vperm_const); |
|||
|
|||
va4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
VEC_PERM(va4, va4, va4, vperm_const); |
|||
|
|||
va5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
VEC_PERM(va5, va5, va5, vperm_const); |
|||
|
|||
va6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
VEC_PERM(va6, va6, va6, vperm_const); |
|||
|
|||
va7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
VEC_PERM(va7, va7, va7, vperm_const); |
|||
|
|||
p = (char *)p + 128; |
|||
|
|||
vdata0 = vec_xor(v0, va0); |
|||
vdata1 = vec_xor(v1, va1); |
|||
vdata2 = vec_xor(v2, va2); |
|||
vdata3 = vec_xor(v3, va3); |
|||
vdata4 = vec_xor(v4, va4); |
|||
vdata5 = vec_xor(v5, va5); |
|||
vdata6 = vec_xor(v6, va6); |
|||
vdata7 = vec_xor(v7, va7); |
|||
|
|||
/* Check if we have more blocks to process */ |
|||
next_block = 0; |
|||
if (length != 0) { |
|||
next_block = 1; |
|||
|
|||
/* zero v0-v7 */ |
|||
v0 = vec_xor(v0, v0); |
|||
v1 = vec_xor(v1, v1); |
|||
v2 = vec_xor(v2, v2); |
|||
v3 = vec_xor(v3, v3); |
|||
v4 = vec_xor(v4, v4); |
|||
v5 = vec_xor(v5, v5); |
|||
v6 = vec_xor(v6, v6); |
|||
v7 = vec_xor(v7, v7); |
|||
} |
|||
length = length + 128; |
|||
|
|||
} while (next_block); |
|||
|
|||
/* Calculate how many bytes we have left. */ |
|||
length = (len & 127); |
|||
|
|||
/* Calculate where in (short) constant table we need to start. */ |
|||
offset = 128 - length; |
|||
|
|||
v0 = vec_ld(offset, vcrc_short_const); |
|||
v1 = vec_ld(offset + 16, vcrc_short_const); |
|||
v2 = vec_ld(offset + 32, vcrc_short_const); |
|||
v3 = vec_ld(offset + 48, vcrc_short_const); |
|||
v4 = vec_ld(offset + 64, vcrc_short_const); |
|||
v5 = vec_ld(offset + 80, vcrc_short_const); |
|||
v6 = vec_ld(offset + 96, vcrc_short_const); |
|||
v7 = vec_ld(offset + 112, vcrc_short_const); |
|||
|
|||
offset += 128; |
|||
|
|||
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata0,(__vector unsigned int)v0); |
|||
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata1,(__vector unsigned int)v1); |
|||
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata2,(__vector unsigned int)v2); |
|||
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata3,(__vector unsigned int)v3); |
|||
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata4,(__vector unsigned int)v4); |
|||
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata5,(__vector unsigned int)v5); |
|||
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata6,(__vector unsigned int)v6); |
|||
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata7,(__vector unsigned int)v7); |
|||
|
|||
/* Now reduce the tail (0-112 bytes). */ |
|||
for (i = 0; i < length; i+=16) { |
|||
vdata0 = vec_ld(i,(__vector unsigned long long*)p); |
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
va0 = vec_ld(offset + i,vcrc_short_const); |
|||
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata0,(__vector unsigned int)va0); |
|||
v0 = vec_xor(v0, va0); |
|||
} |
|||
|
|||
/* xor all parallel chunks together. */ |
|||
v0 = vec_xor(v0, v1); |
|||
v2 = vec_xor(v2, v3); |
|||
v4 = vec_xor(v4, v5); |
|||
v6 = vec_xor(v6, v7); |
|||
|
|||
v0 = vec_xor(v0, v2); |
|||
v4 = vec_xor(v4, v6); |
|||
|
|||
v0 = vec_xor(v0, v4); |
|||
} |
|||
|
|||
/* Barrett Reduction */ |
|||
vconst1 = vec_ld(0, v_Barrett_const); |
|||
vconst2 = vec_ld(16, v_Barrett_const); |
|||
|
|||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
|||
(__vector unsigned char)v0, 8); |
|||
v0 = vec_xor(v1,v0); |
|||
|
|||
#ifdef REFLECT |
|||
/* shift left one bit */ |
|||
vsht_splat = vec_splat_u8 (1); |
|||
v0 = (__vector unsigned long long)vec_sll ((__vector unsigned char)v0, |
|||
vsht_splat); |
|||
#endif |
|||
|
|||
v0 = vec_and(v0, vmask_64bit); |
|||
|
|||
#ifndef REFLECT |
|||
|
|||
/* |
|||
* Now for the actual algorithm. The idea is to calculate q, |
|||
* the multiple of our polynomial that we need to subtract. By |
|||
* doing the computation 2x bits higher (ie 64 bits) and shifting the |
|||
* result back down 2x bits, we round down to the nearest multiple. |
|||
*/ |
|||
|
|||
/* ma */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0, |
|||
(__vector unsigned long long)vconst1); |
|||
/* q = floor(ma/(2^64)) */ |
|||
v1 = (__vector unsigned long long)vec_sld ((__vector unsigned char)vzero, |
|||
(__vector unsigned char)v1, 8); |
|||
/* qn */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, |
|||
(__vector unsigned long long)vconst2); |
|||
/* a - qn, subtraction is xor in GF(2) */ |
|||
v0 = vec_xor (v0, v1); |
|||
/* |
|||
* Get the result into r3. We need to shift it left 8 bytes: |
|||
* V0 [ 0 1 2 X ] |
|||
* V0 [ 0 X 2 3 ] |
|||
*/ |
|||
result = __builtin_unpack_vector_1 (v0); |
|||
#else |
|||
|
|||
/* |
|||
* The reflected version of Barrett reduction. Instead of bit |
|||
* reflecting our data (which is expensive to do), we bit reflect our |
|||
* constants and our algorithm, which means the intermediate data in |
|||
* our vector registers goes from 0-63 instead of 63-0. We can reflect |
|||
* the algorithm because we don't carry in mod 2 arithmetic. |
|||
*/ |
|||
|
|||
/* bottom 32 bits of a */ |
|||
v1 = vec_and(v0, vmask_32bit); |
|||
|
|||
/* ma */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, |
|||
(__vector unsigned long long)vconst1); |
|||
|
|||
/* bottom 32bits of ma */ |
|||
v1 = vec_and(v1, vmask_32bit); |
|||
/* qn */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, |
|||
(__vector unsigned long long)vconst2); |
|||
/* a - qn, subtraction is xor in GF(2) */ |
|||
v0 = vec_xor (v0, v1); |
|||
|
|||
/* |
|||
* Since we are bit reflected, the result (ie the low 32 bits) is in |
|||
* the high 32 bits. We just need to shift it left 4 bytes |
|||
* V0 [ 0 1 X 3 ] |
|||
* V0 [ 0 X 2 3 ] |
|||
*/ |
|||
|
|||
/* shift result into top 64 bits of */ |
|||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
|||
(__vector unsigned char)vzero, 4); |
|||
|
|||
result = __builtin_unpack_vector_0 (v0); |
|||
#endif |
|||
|
|||
return result; |
|||
} |
@ -1,346 +0,0 @@ |
|||
/*****************************************************************************
|
|||
|
|||
Copyright (c) 2009, 2010 Facebook, Inc. All Rights Reserved. |
|||
Copyright (c) 2011, 2015, Oracle and/or its affiliates. All Rights Reserved. |
|||
Copyright (c) 2016, 2020, MariaDB Corporation. |
|||
|
|||
This program is free software; you can redistribute it and/or modify it under |
|||
the terms of the GNU General Public License as published by the Free Software |
|||
Foundation; version 2 of the License. |
|||
|
|||
This program is distributed in the hope that it will be useful, but WITHOUT |
|||
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
|||
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU General Public License along with |
|||
this program; if not, write to the Free Software Foundation, Inc., |
|||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA |
|||
|
|||
*****************************************************************************/ |
|||
|
|||
/***************************************************************//**
|
|||
@file ut/ut0crc32.cc |
|||
CRC32 implementation from Facebook, based on the zlib implementation. |
|||
|
|||
Created Aug 8, 2011, Vasil Dimov, based on mysys/my_crc32.c and |
|||
mysys/my_perf.c, contributed by Facebook under the following license. |
|||
********************************************************************/ |
|||
|
|||
/* Copyright (C) 2009-2010 Facebook, Inc. All Rights Reserved.
|
|||
|
|||
Dual licensed under BSD license and GPLv2. |
|||
|
|||
Redistribution and use in source and binary forms, with or without |
|||
modification, are permitted provided that the following conditions are met: |
|||
1. Redistributions of source code must retain the above copyright notice, |
|||
this list of conditions and the following disclaimer. |
|||
2. Redistributions in binary form must reproduce the above copyright notice, |
|||
this list of conditions and the following disclaimer in the documentation |
|||
and/or other materials provided with the distribution. |
|||
|
|||
THIS SOFTWARE IS PROVIDED BY FACEBOOK, INC. ``AS IS'' AND ANY EXPRESS OR |
|||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
|||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO |
|||
EVENT SHALL FACEBOOK, INC. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; |
|||
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
|||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR |
|||
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF |
|||
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|||
|
|||
This program is free software; you can redistribute it and/or modify it |
|||
under the terms of the GNU General Public License as published by the Free |
|||
Software Foundation; version 2 of the License. |
|||
|
|||
This program is distributed in the hope that it will be useful, but WITHOUT |
|||
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|||
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
|||
more details. |
|||
|
|||
You should have received a copy of the GNU General Public License along with |
|||
this program; if not, write to the Free Software Foundation, Inc., |
|||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ |
|||
|
|||
/* The below CRC32 implementation is based on the implementation included with
|
|||
* zlib with modifications to process 8 bytes at a time and using SSE 4.2 |
|||
* extensions when available. The polynomial constant has been changed to |
|||
* match the one used by SSE 4.2 and does not return the same value as the |
|||
* version used by zlib. The original zlib copyright notice follows. */ |
|||
|
|||
/* crc32.c -- compute the CRC-32 of a buf stream
|
|||
* Copyright (C) 1995-2005 Mark Adler |
|||
* For conditions of distribution and use, see copyright notice in zlib.h |
|||
* |
|||
* Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster |
|||
* CRC methods: exclusive-oring 32 bits of buf at a time, and pre-computing |
|||
* tables for updating the shift register in one step with three exclusive-ors |
|||
* instead of four steps with four exclusive-ors. This results in about a |
|||
* factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. |
|||
*/ |
|||
|
|||
// First include (the generated) my_config.h, to get correct platform defines.
|
|||
#include "my_config.h"
|
|||
#include <string.h>
|
|||
|
|||
#include "ut0crc32.h"
|
|||
#include "my_valgrind.h"
|
|||
|
|||
#ifdef HAVE_CPUID_INSTRUCTION
|
|||
# ifdef _MSC_VER
|
|||
# include <intrin.h>
|
|||
# else
|
|||
# include <cpuid.h>
|
|||
# if defined __GNUC__ && !defined __clang__ && __GNUC__ < 5
|
|||
/* <nmmintrin.h> does not really work in GCC before version 5 */ |
|||
# define _mm_crc32_u8(crc,data) __builtin_ia32_crc32qi(crc,data)
|
|||
# define _mm_crc32_u32(crc,data) __builtin_ia32_crc32si(crc,data)
|
|||
# define _mm_crc32_u64(crc,data) __builtin_ia32_crc32di(crc,data)
|
|||
# else
|
|||
# include <nmmintrin.h>
|
|||
# endif
|
|||
# endif
|
|||
#endif
|
|||
|
|||
/* CRC32 hardware implementation. */

#ifdef HAVE_CRC32_VPMSUM
extern "C"
unsigned int crc32c_vpmsum(unsigned int crc, const unsigned char *p, unsigned long len);
ut_crc32_func_t ut_crc32_low= crc32c_vpmsum;
const char* ut_crc32_implementation = "Using POWER8 crc32 instructions";
#else
# if defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
extern "C" {
uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len);
};
# elif defined HAVE_CPUID_INSTRUCTION
/** return whether SSE4.2 instructions are available */
static inline bool has_sse4_2()
{
  /* We assume that the CPUID instruction and its parameter 1 are available.
  We do not support any precursors of the Intel 80486. */
#  ifdef _MSC_VER
  int data[4];
  __cpuid(data, 1);
  return !!(data[2] & 1 << 20);
#  else
  uint32_t reax = 0, rebx = 0, recx = 0, redx = 0;
  __cpuid(1, reax, rebx, recx, redx);
  return !!(recx & 1 << 20);
#  endif
}

/** Append 8 bits (1 byte) to a CRC-32C checksum.
@param crc   CRC-32C checksum so far
@param data  data to be checksummed
@return the updated CRC-32C */
__attribute__((target("sse4.2")))
static inline ulint ut_crc32c_8(ulint crc, byte data)
{
  return _mm_crc32_u8(static_cast<uint32_t>(crc), data);
}

/** Append 64 bits (8 aligned bytes) to a CRC-32C checksum
@param[in] crc   CRC-32C checksum so far
@param[in] data  8 bytes of aligned data
@return the updated CRC-32C */
__attribute__((target("sse4.2")))
static inline ulint ut_crc32c_64(ulint crc, uint64_t data)
{
# if SIZEOF_SIZE_T > 4
  return _mm_crc32_u64(crc, data);
# else
  crc= _mm_crc32_u32(crc, static_cast<uint32_t>(data));
  crc= _mm_crc32_u32(crc, static_cast<uint32_t>(data >> 32));
  return crc;
# endif
}

/** Calculate CRC-32C using dedicated IA-32 or AMD64 instructions
@param crc  current checksum
@param buf  data to append to the checksum
@param len  data length in bytes
@return CRC-32C (polynomial 0x11EDC6F41) */
uint32_t ut_crc32_hw(uint32_t crc, const byte *buf, size_t len)
{
  ulint c= static_cast<uint32_t>(~crc);

  /* Calculate byte-by-byte up to an 8-byte aligned address. After
  this consume the input 8-bytes at a time. */
  while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0)
  {
    c= ut_crc32c_8(c, *buf++);
    len--;
  }

  const uint64_t* b64= reinterpret_cast<const uint64_t*>(buf);

  for (; len >= 128; len-= 128)
  {
    /* This call is repeated 16 times. 16 * 8 = 128. */
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
  }

  for (; len >= 8; len-= 8)
    c= ut_crc32c_64(c, *b64++);

  buf= reinterpret_cast<const byte*>(b64);

  while (len--)
    c= ut_crc32c_8(c, *buf++);

  return ~static_cast<uint32_t>(c);
}
# endif /* (defined(__GNUC__) && defined(__i386__)) || _MSC_VER */

/* CRC32 software implementation. */

/* Precalculated table used to generate the CRC32 if the CPU does not
have support for it */
static uint32_t ut_crc32_slice8_table[8][256];

/********************************************************************//**
Initializes the table that is used to generate the CRC32 if the CPU does
not have support for it. */
static
void
ut_crc32_slice8_table_init()
/*========================*/
{
  /* bit-reversed poly 0x1EDC6F41 (from SSE42 crc32 instruction) */
  static const uint32_t poly = 0x82f63b78;
  uint32_t n;
  uint32_t k;
  uint32_t c;

  for (n = 0; n < 256; n++) {
    c = n;
    for (k = 0; k < 8; k++) {
      c = (c & 1) ? (poly ^ (c >> 1)) : (c >> 1);
    }
    ut_crc32_slice8_table[0][n] = c;
  }

  for (n = 0; n < 256; n++) {
    c = ut_crc32_slice8_table[0][n];
    for (k = 1; k < 8; k++) {
      c = ut_crc32_slice8_table[0][c & 0xFF] ^ (c >> 8);
      ut_crc32_slice8_table[k][n] = c;
    }
  }
}

/** Append 8 bits (1 byte) to a CRC-32C checksum.
@param crc   CRC-32C checksum so far
@param data  data to be checksummed
@return the updated CRC-32C */
static inline uint32_t ut_crc32c_8_sw(uint32_t crc, byte data)
{
  const uint8_t i= (crc ^ data) & 0xFF;

  return (crc >> 8) ^ ut_crc32_slice8_table[0][i];
}

/** Append 64 bits (8 aligned bytes) to a CRC-32C checksum
@param[in] crc   CRC-32C checksum so far
@param[in] data  8 bytes of aligned data
@return the updated CRC-32C */
static inline uint32_t ut_crc32c_64_sw(uint32_t crc, uint64_t data)
{
# ifdef WORDS_BIGENDIAN
  data= data << 56 |
    (data & 0x000000000000FF00ULL) << 40 |
    (data & 0x0000000000FF0000ULL) << 24 |
    (data & 0x00000000FF000000ULL) << 8 |
    (data & 0x000000FF00000000ULL) >> 8 |
    (data & 0x0000FF0000000000ULL) >> 24 |
    (data & 0x00FF000000000000ULL) >> 40 |
    data >> 56;
# endif /* WORDS_BIGENDIAN */

  data^= crc;
  return
    ut_crc32_slice8_table[7][(data) & 0xFF] ^
    ut_crc32_slice8_table[6][(data >> 8) & 0xFF] ^
    ut_crc32_slice8_table[5][(data >> 16) & 0xFF] ^
    ut_crc32_slice8_table[4][(data >> 24) & 0xFF] ^
    ut_crc32_slice8_table[3][(data >> 32) & 0xFF] ^
    ut_crc32_slice8_table[2][(data >> 40) & 0xFF] ^
    ut_crc32_slice8_table[1][(data >> 48) & 0xFF] ^
    ut_crc32_slice8_table[0][(data >> 56)];
}

/** Calculate CRC-32C using a look-up table.
@param crc  current checksum
@param buf  data to append to the checksum
@param len  data length in bytes
@return CRC-32C (polynomial 0x11EDC6F41) */
uint32_t ut_crc32_sw(uint32_t crc, const byte *buf, size_t len)
{
  crc= ~crc;

  /* Calculate byte-by-byte up to an 8-byte aligned address. After
  this consume the input 8-bytes at a time. */
  while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0)
  {
    crc= ut_crc32c_8_sw(crc, *buf++);
    len--;
  }

  const uint64_t* b64= reinterpret_cast<const uint64_t*>(buf);

  for (; len >= 8; len-= 8)
    crc= ut_crc32c_64_sw(crc, *b64++);

  buf= reinterpret_cast<const byte*>(b64);

  while (len--)
    crc= ut_crc32c_8_sw(crc, *buf++);

  return ~crc;
}

ut_crc32_func_t ut_crc32_low= ut_crc32_sw;
const char *ut_crc32_implementation= "Using generic crc32 instructions";
#endif

/********************************************************************//**
Initializes the data structures used by ut_crc32*(). Does not do any
allocations, would not hurt if called twice, but would be pointless. */
void ut_crc32_init()
{
#ifndef HAVE_CRC32_VPMSUM
# if defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
  if (const char *crc32c_implementation= crc32c_aarch64_available())
  {
    ut_crc32_low= crc32c_aarch64;
    ut_crc32_implementation= crc32c_implementation;
    return;
  }
# elif defined HAVE_CPUID_INSTRUCTION
  if (has_sse4_2())
  {
    ut_crc32_low= ut_crc32_hw;
    ut_crc32_implementation= "Using SSE4.2 crc32 instructions";
    return;
  }
# endif
  ut_crc32_slice8_table_init();
#endif /* !HAVE_CRC32_VPMSUM */
}
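
For readers checking the table-driven code against a textbook definition: the slice-by-8 tables above encode the reflected CRC-32C polynomial 0x82F63B78, so a plain bit-at-a-time loop must produce identical results. The sketch below is illustrative only; it is not part of this commit, and the name crc32c_reference is made up for the example. It computes the same checksum as ut_crc32_sw(), one bit per iteration instead of eight bytes per iteration.

#include <stdint.h>
#include <stddef.h>

/* Reference CRC-32C (Castagnoli): reflected polynomial 0x82F63B78,
   initial value and final result both inverted, processed bit by bit. */
static uint32_t crc32c_reference(uint32_t crc, const unsigned char *buf,
                                 size_t len)
{
  crc= ~crc;
  while (len--)
  {
    crc^= *buf++;
    for (int bit= 0; bit < 8; bit++)
      crc= (crc & 1) ? (crc >> 1) ^ 0x82f63b78 : crc >> 1;
  }
  return ~crc;
}
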
@ -0,0 +1,69 @@
/* Copyright (c) MariaDB 2020

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */

#include <my_global.h>
#include <my_sys.h>
#include <my_crypt.h>
#include <tap.h>
#include <string.h>
#include <ctype.h>
#include <zlib.h>

/*
  Check that the optimized crc32 (IEEE, or Ethernet, polynomial) returns the
  same result as zlib (not so well optimized yet, but trustworthy)
*/
#define DO_TEST_CRC32(crc,str) \
  ok(crc32(crc,(const Bytef *)str,(uint)(sizeof(str)-1)) == my_checksum(crc, str, sizeof(str)-1), "crc32 '%s'",str)

/* Check that the CRC-32C calculation returns the correct result */
#define DO_TEST_CRC32C(crc,str,expected) \
  do { \
    unsigned int v = my_crc32c(crc, str, sizeof(str)-1); \
    printf("crc32c(%u,'%s',%zu)=%u\n",crc,str,sizeof(str)-1,v); \
    ok(expected == v,"crc32c '%s'",str); \
  }while(0)


#define LONG_STR "1234567890234568900212345678901231213123321212123123123123123"\
                 "............................................................................." \
                 "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \
                 "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy" \
                 "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"

int main(int argc __attribute__((unused)),char *argv[]) |
|||
{ |
|||
MY_INIT(argv[0]); |
|||
plan(14); |
|||
printf("%s\n",my_crc32c_implementation()); |
|||
DO_TEST_CRC32(0,""); |
|||
DO_TEST_CRC32(1,""); |
|||
DO_TEST_CRC32(0,"12345"); |
|||
DO_TEST_CRC32(1,"12345"); |
|||
DO_TEST_CRC32(0,"1234567890123456789"); |
|||
DO_TEST_CRC32(0, LONG_STR); |
|||
ok(0 == my_checksum(0, NULL, 0) , "crc32 data = NULL, length = 0"); |
|||
|
|||
DO_TEST_CRC32C(0,"", 0); |
|||
DO_TEST_CRC32C(1,"", 1); |
|||
DO_TEST_CRC32C(0, "12345", 416359221); |
|||
DO_TEST_CRC32C(1, "12345", 549473433); |
|||
DO_TEST_CRC32C(0, "1234567890123456789", 2366987449); |
|||
DO_TEST_CRC32C(0, LONG_STR, 3009234172); |
|||
ok(0 == my_crc32c(0, NULL, 0), "crc32c data = NULL, length = 0"); |
|||
|
|||
my_end(0); |
|||
return exit_status(); |
|||
} |
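
For completeness, a hypothetical caller of the unified interface exercised by this test might look like the sketch below. It is illustrative only (not part of the commit) and assumes nothing beyond the headers and mysys initialization already used above; my_checksum() computes the IEEE CRC-32 and my_crc32c() the CRC-32C.

#include <my_global.h>
#include <my_sys.h>
#include <stdio.h>

int main(int argc __attribute__((unused)), char *argv[])
{
  const char buf[]= "hello, world";
  MY_INIT(argv[0]);   /* same mysys initialization as the unit test above */
  printf("%s\n", my_crc32c_implementation());
  printf("crc32  = %lu\n", (unsigned long) my_checksum(0, buf, sizeof(buf) - 1));
  printf("crc32c = %lu\n", (unsigned long) my_crc32c(0, buf, sizeof(buf) - 1));
  my_end(0);
  return 0;
}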