MDEV-19935 Create unified CRC-32 interface
Add CRC32C code to mysys. The x86-64 implementation uses PCLMULQDQ in addition to the CRC32 instruction, following an Intel white paper, and is ported from the RocksDB code. Optimized ARM and POWER CRC32 implementations were already present in mysys.
24 changed files with 2087 additions and 1147 deletions
config.h.cmake | 9
extra/CMakeLists.txt | 12
extra/innochecksum.cc | 1
extra/mariabackup/backup_copy.cc | 1
extra/mariabackup/xbstream.cc | 2
extra/mariabackup/xtrabackup.cc | 4
include/my_sys.h | 14
mysys/CMakeLists.txt | 44
mysys/crc32/crc32_arm64.c | 19
mysys/crc32/crc32_ppc64.c | 678
mysys/crc32/crc32c.cc | 1254
mysys/crc32/crc32c_ppc.c | 5
mysys/crc32/crc32c_ppc.h | 19
mysys/crc32/crc_ppc64.h | 664
mysys/crc32ieee.cc | 44
mysys/my_init.c | 3
storage/innobase/CMakeLists.txt | 2
storage/innobase/include/ut0crc32.h | 27
storage/innobase/innodb.cmake | 10
storage/innobase/srv/srv0srv.cc | 1
storage/innobase/srv/srv0start.cc | 2
storage/innobase/ut/ut0crc32.cc | 346
unittest/mysys/CMakeLists.txt | 4
unittest/mysys/crc32-t.c | 69
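The commit message above describes a single mysys entry point that binds to the best CRC-32C kernel the CPU supports. As a rough sketch only (the function and helper names below are placeholders, not the actual my_sys.h API; crc32c_aarch64 and crc32c_vpmsum are kernel names that do appear in this diff, with signatures simplified here), such a unified interface amounts to a function pointer selected once at startup:

/*
  Illustrative sketch of a unified CRC-32C interface.  Names here are
  placeholders; the real interface lives in include/my_sys.h and the real
  kernels under mysys/crc32/.
*/
#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*crc32c_func)(uint32_t crc, const void *data, size_t len);

/* Per-platform kernels, implemented elsewhere (signatures simplified). */
extern uint32_t crc32c_sw(uint32_t, const void *, size_t);           /* portable fallback */
extern uint32_t crc32c_sse42_pclmul(uint32_t, const void *, size_t); /* x86-64: CRC32 + PCLMULQDQ */
extern uint32_t crc32c_aarch64(uint32_t, const void *, size_t);      /* ARMv8 CRC32 extension */
extern uint32_t crc32c_vpmsum(uint32_t, const void *, size_t);       /* POWER8 vpmsum */
extern int cpu_has_sse42_and_pclmul(void);                           /* hypothetical CPUID probe */

static crc32c_func crc32c_impl = crc32c_sw;

/* Bind the function pointer once, e.g. from process initialization. */
void select_crc32c_impl(void)
{
#if defined __x86_64__ || defined _M_X64
  if (cpu_has_sse42_and_pclmul())
    crc32c_impl = crc32c_sse42_pclmul;
#elif defined __aarch64__
  crc32c_impl = crc32c_aarch64;
#elif defined __powerpc64__
  crc32c_impl = crc32c_vpmsum;
#endif
}

/* Single entry point for all callers; CRC-32C (Castagnoli) polynomial. */
uint32_t compute_crc32c(uint32_t crc, const void *data, size_t len)
{
  return crc32c_impl(crc, data, len);
}

This is consistent with the file list above: InnoDB, mariabackup, xbstream, innochecksum and a new unit test are switched to one interface, while the per-architecture kernels stay isolated under mysys/crc32/.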
mysys/crc32/crc32_ppc64.c: @@ -1,675 +1,5 @@
|||
/* |
|||
* Calculate the checksum of data that is 16 byte aligned and a multiple of |
|||
* 16 bytes. |
|||
* |
|||
* The first step is to reduce it to 1024 bits. We do this in 8 parallel |
|||
* chunks in order to mask the latency of the vpmsum instructions. If we |
|||
* have more than 32 kB of data to checksum we repeat this step multiple |
|||
* times, passing in the previous 1024 bits. |
|||
* |
|||
* The next step is to reduce the 1024 bits to 64 bits. This step adds |
|||
* 32 bits of 0s to the end - this matches what a CRC does. We just |
|||
* calculate constants that land the data in this 32 bits. |
|||
* |
|||
* We then use fixed point Barrett reduction to compute a mod n over GF(2) |
|||
* for n = CRC using POWER8 instructions. We use x = 32. |
|||
* |
|||
* http://en.wikipedia.org/wiki/Barrett_reduction |
|||
* |
|||
 * This code uses gcc vector builtins instead of using assembly directly.
|||
* |
|||
* Copyright (C) 2017 Rogerio Alves <rogealve@br.ibm.com>, IBM |
|||
* |
|||
* This program is free software; you can redistribute it and/or |
|||
* modify it under the terms of either: |
|||
* |
|||
* a) the GNU General Public License as published by the Free Software |
|||
* Foundation; either version 2 of the License, or (at your option) |
|||
* any later version, or |
|||
* b) the Apache License, Version 2.0 |
|||
*/ |
|||
|
|||
#include <altivec.h> |
|||
|
|||
#define POWER8_INTRINSICS |
|||
#define CRC32_FUNCTION my_checksum |
|||
#define CRC_TABLE |
|||
|
|||
#ifdef CRC32_CONSTANTS_HEADER |
|||
#include CRC32_CONSTANTS_HEADER |
|||
#else |
|||
#include "crc32_constants.h" |
|||
#endif |
|||
|
|||
#define VMX_ALIGN 16 |
|||
#define VMX_ALIGN_MASK (VMX_ALIGN-1) |
|||
|
|||
#ifdef REFLECT |
|||
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, |
|||
unsigned long len) |
|||
{ |
|||
while (len--) |
|||
crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); |
|||
return crc; |
|||
} |
|||
#else |
|||
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, |
|||
unsigned long len) |
|||
{ |
|||
while (len--) |
|||
crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8); |
|||
return crc; |
|||
} |
|||
#endif |
|||
|
|||
static unsigned int __attribute__ ((aligned (32))) |
|||
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len); |
|||
|
|||
#ifndef CRC32_FUNCTION |
|||
#define CRC32_FUNCTION crc32_vpmsum |
|||
#endif |
|||
|
|||
unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p, |
|||
unsigned long len) |
|||
{ |
|||
unsigned int prealign; |
|||
unsigned int tail; |
|||
|
|||
#ifdef CRC_XOR |
|||
crc ^= 0xffffffff; |
|||
#endif |
|||
|
|||
if (len < VMX_ALIGN + VMX_ALIGN_MASK) { |
|||
crc = crc32_align(crc, p, len); |
|||
goto out; |
|||
} |
|||
|
|||
if ((unsigned long)p & VMX_ALIGN_MASK) { |
|||
prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); |
|||
crc = crc32_align(crc, p, prealign); |
|||
len -= prealign; |
|||
p += prealign; |
|||
} |
|||
|
|||
crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); |
|||
|
|||
tail = len & VMX_ALIGN_MASK; |
|||
if (tail) { |
|||
p += len & ~VMX_ALIGN_MASK; |
|||
crc = crc32_align(crc, p, tail); |
|||
} |
|||
|
|||
out: |
|||
#ifdef CRC_XOR |
|||
crc ^= 0xffffffff; |
|||
#endif |
|||
|
|||
return crc; |
|||
} |
|||
|
|||
#if defined (__clang__) |
|||
#include "clang_workaround.h" |
|||
#else |
|||
#define __builtin_pack_vector(a, b) __builtin_pack_vector_int128 ((a), (b)) |
|||
#define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 0) |
|||
#define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 1) |
|||
#endif |
|||
|
|||
/* When we have a load-store in a single-dispatch group and address overlap |
|||
 * such that forwarding is not allowed (load-hit-store) the group must be flushed.
|||
* A group ending NOP prevents the flush. |
|||
*/ |
|||
#define GROUP_ENDING_NOP asm("ori 2,2,0" ::: "memory") |
|||
|
|||
#if defined(__BIG_ENDIAN__) && defined (REFLECT) |
|||
#define BYTESWAP_DATA |
|||
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) |
|||
#define BYTESWAP_DATA |
|||
#endif |
|||
|
|||
#ifdef BYTESWAP_DATA |
|||
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb,\ |
|||
(__vector unsigned char) vc) |
|||
#if defined(__LITTLE_ENDIAN__) |
|||
/* Byte reverse permute constant LE. */ |
|||
static const __vector unsigned long long vperm_const |
|||
__attribute__ ((aligned(16))) = { 0x08090A0B0C0D0E0FUL, |
|||
0x0001020304050607UL }; |
|||
#else |
|||
static const __vector unsigned long long vperm_const |
|||
__attribute__ ((aligned(16))) = { 0x0F0E0D0C0B0A0908UL, |
|||
0X0706050403020100UL }; |
|||
#endif |
|||
#else |
|||
#define VEC_PERM(vr, va, vb, vc) |
|||
#endif |
|||
|
|||
static unsigned int __attribute__ ((aligned (32))) |
|||
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) { |
|||
|
|||
const __vector unsigned long long vzero = {0,0}; |
|||
const __vector unsigned long long vones = {0xffffffffffffffffUL, |
|||
0xffffffffffffffffUL}; |
|||
|
|||
#ifdef REFLECT |
|||
__vector unsigned char vsht_splat; |
|||
const __vector unsigned long long vmask_32bit = |
|||
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, |
|||
(__vector unsigned char)vones, 4); |
|||
#endif |
|||
|
|||
const __vector unsigned long long vmask_64bit = |
|||
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, |
|||
(__vector unsigned char)vones, 8); |
|||
|
|||
__vector unsigned long long vcrc; |
|||
|
|||
__vector unsigned long long vconst1, vconst2; |
|||
|
|||
/* vdata0-vdata7 will contain our data (p). */ |
|||
__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, |
|||
vdata5, vdata6, vdata7; |
|||
|
|||
/* v0-v7 will contain our checksums */ |
|||
__vector unsigned long long v0 = {0,0}; |
|||
__vector unsigned long long v1 = {0,0}; |
|||
__vector unsigned long long v2 = {0,0}; |
|||
__vector unsigned long long v3 = {0,0}; |
|||
__vector unsigned long long v4 = {0,0}; |
|||
__vector unsigned long long v5 = {0,0}; |
|||
__vector unsigned long long v6 = {0,0}; |
|||
__vector unsigned long long v7 = {0,0}; |
|||
|
|||
|
|||
/* Vector auxiliary variables. */ |
|||
__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7; |
|||
|
|||
unsigned int result = 0; |
|||
unsigned int offset; /* Constant table offset. */ |
|||
|
|||
unsigned long i; /* Counter. */ |
|||
unsigned long chunks; |
|||
|
|||
unsigned long block_size; |
|||
int next_block = 0; |
|||
|
|||
/* Align by 128 bits. The last 128 bit block will be processed at end. */ |
|||
unsigned long length = len & 0xFFFFFFFFFFFFFF80UL; |
|||
|
|||
#ifdef REFLECT |
|||
vcrc = (__vector unsigned long long)__builtin_pack_vector(0UL, crc); |
|||
#else |
|||
vcrc = (__vector unsigned long long)__builtin_pack_vector(crc, 0UL); |
|||
|
|||
/* Shift into top 32 bits */ |
|||
vcrc = (__vector unsigned long long)vec_sld((__vector unsigned char)vcrc, |
|||
(__vector unsigned char)vzero, 4); |
|||
#endif |
|||
|
|||
/* Short version. */ |
|||
if (len < 256) { |
|||
/* Calculate where in the constant table we need to start. */ |
|||
offset = 256 - len; |
|||
|
|||
vconst1 = vec_ld(offset, vcrc_short_const); |
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vconst1, vperm_const); |
|||
|
|||
/* xor initial value*/ |
|||
vdata0 = vec_xor(vdata0, vcrc); |
|||
|
|||
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw |
|||
((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); |
|||
v0 = vec_xor(v0, vdata0); |
|||
|
|||
for (i = 16; i < len; i += 16) { |
|||
vconst1 = vec_ld(offset + i, vcrc_short_const); |
|||
vdata0 = vec_ld(i, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vconst1, vperm_const); |
|||
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw |
|||
((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); |
|||
v0 = vec_xor(v0, vdata0); |
|||
} |
|||
} else { |
|||
|
|||
/* Load initial values. */ |
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
vdata1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const); |
|||
|
|||
vdata2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
vdata3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const); |
|||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const); |
|||
|
|||
vdata4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
vdata5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const); |
|||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const); |
|||
|
|||
vdata6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
vdata7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const); |
|||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const); |
|||
|
|||
/* xor in initial value */ |
|||
vdata0 = vec_xor(vdata0, vcrc); |
|||
|
|||
p = (char *)p + 128; |
|||
|
|||
do { |
|||
/* Checksum in blocks of MAX_SIZE. */ |
|||
block_size = length; |
|||
if (block_size > MAX_SIZE) { |
|||
block_size = MAX_SIZE; |
|||
} |
|||
|
|||
length = length - block_size; |
|||
|
|||
/* |
|||
* Work out the offset into the constants table to start at. Each |
|||
* constant is 16 bytes, and it is used against 128 bytes of input |
|||
* data - 128 / 16 = 8 |
|||
*/ |
|||
offset = (MAX_SIZE/8) - (block_size/8); |
|||
/* We reduce our final 128 bytes in a separate step */ |
|||
chunks = (block_size/128)-1; |
|||
|
|||
vconst1 = vec_ld(offset, vcrc_const); |
|||
|
|||
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0, |
|||
(__vector unsigned long long)vconst1); |
|||
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1, |
|||
(__vector unsigned long long)vconst1); |
|||
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2, |
|||
(__vector unsigned long long)vconst1); |
|||
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3, |
|||
(__vector unsigned long long)vconst1); |
|||
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4, |
|||
(__vector unsigned long long)vconst1); |
|||
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5, |
|||
(__vector unsigned long long)vconst1); |
|||
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6, |
|||
(__vector unsigned long long)vconst1); |
|||
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7, |
|||
(__vector unsigned long long)vconst1); |
|||
|
|||
if (chunks > 1) { |
|||
offset += 16; |
|||
vconst2 = vec_ld(offset, vcrc_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
|
|||
vdata1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const); |
|||
|
|||
vdata2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const); |
|||
|
|||
vdata3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const); |
|||
|
|||
vdata4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const); |
|||
|
|||
vdata5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const); |
|||
|
|||
vdata6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const); |
|||
|
|||
vdata7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const); |
|||
|
|||
p = (char *)p + 128; |
|||
|
|||
/* |
|||
* main loop. We modulo schedule it such that it takes three |
|||
* iterations to complete - first iteration load, second |
|||
* iteration vpmsum, third iteration xor. |
|||
*/ |
|||
for (i = 0; i < chunks-2; i++) { |
|||
vconst1 = vec_ld(offset, vcrc_const); |
|||
offset += 16; |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v0 = vec_xor(v0, va0); |
|||
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata0, (__vector unsigned long long)vconst2); |
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v1 = vec_xor(v1, va1); |
|||
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata1, (__vector unsigned long long)vconst2); |
|||
vdata1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v2 = vec_xor(v2, va2); |
|||
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata2, (__vector unsigned long long)vconst2); |
|||
vdata2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v3 = vec_xor(v3, va3); |
|||
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata3, (__vector unsigned long long)vconst2); |
|||
vdata3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const); |
|||
|
|||
vconst2 = vec_ld(offset, vcrc_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v4 = vec_xor(v4, va4); |
|||
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata4, (__vector unsigned long long)vconst1); |
|||
vdata4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v5 = vec_xor(v5, va5); |
|||
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata5, (__vector unsigned long long)vconst1); |
|||
vdata5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v6 = vec_xor(v6, va6); |
|||
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata6, (__vector unsigned long long)vconst1); |
|||
vdata6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v7 = vec_xor(v7, va7); |
|||
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata7, (__vector unsigned long long)vconst1); |
|||
vdata7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const); |
|||
|
|||
p = (char *)p + 128; |
|||
} |
|||
|
|||
/* First cool down*/ |
|||
vconst1 = vec_ld(offset, vcrc_const); |
|||
offset += 16; |
|||
|
|||
v0 = vec_xor(v0, va0); |
|||
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata0, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v1 = vec_xor(v1, va1); |
|||
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata1, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v2 = vec_xor(v2, va2); |
|||
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata2, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v3 = vec_xor(v3, va3); |
|||
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata3, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v4 = vec_xor(v4, va4); |
|||
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata4, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v5 = vec_xor(v5, va5); |
|||
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata5, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v6 = vec_xor(v6, va6); |
|||
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata6, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v7 = vec_xor(v7, va7); |
|||
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata7, (__vector unsigned long long)vconst1); |
|||
}/* else */ |
|||
|
|||
/* Second cool down. */ |
|||
v0 = vec_xor(v0, va0); |
|||
v1 = vec_xor(v1, va1); |
|||
v2 = vec_xor(v2, va2); |
|||
v3 = vec_xor(v3, va3); |
|||
v4 = vec_xor(v4, va4); |
|||
v5 = vec_xor(v5, va5); |
|||
v6 = vec_xor(v6, va6); |
|||
v7 = vec_xor(v7, va7); |
|||
|
|||
#ifdef REFLECT |
|||
/* |
|||
* vpmsumd produces a 96 bit result in the least significant bits |
|||
* of the register. Since we are bit reflected we have to shift it |
|||
* left 32 bits so it occupies the least significant bits in the |
|||
* bit reflected domain. |
|||
*/ |
|||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
|||
(__vector unsigned char)vzero, 4); |
|||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1, |
|||
(__vector unsigned char)vzero, 4); |
|||
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2, |
|||
(__vector unsigned char)vzero, 4); |
|||
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3, |
|||
(__vector unsigned char)vzero, 4); |
|||
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4, |
|||
(__vector unsigned char)vzero, 4); |
|||
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5, |
|||
(__vector unsigned char)vzero, 4); |
|||
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6, |
|||
(__vector unsigned char)vzero, 4); |
|||
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7, |
|||
(__vector unsigned char)vzero, 4); |
|||
#endif |
|||
|
|||
/* xor with the last 1024 bits. */ |
|||
va0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(va0, va0, va0, vperm_const); |
|||
|
|||
va1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
VEC_PERM(va1, va1, va1, vperm_const); |
|||
|
|||
va2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
VEC_PERM(va2, va2, va2, vperm_const); |
|||
|
|||
va3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
VEC_PERM(va3, va3, va3, vperm_const); |
|||
|
|||
va4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
VEC_PERM(va4, va4, va4, vperm_const); |
|||
|
|||
va5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
VEC_PERM(va5, va5, va5, vperm_const); |
|||
|
|||
va6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
VEC_PERM(va6, va6, va6, vperm_const); |
|||
|
|||
va7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
VEC_PERM(va7, va7, va7, vperm_const); |
|||
|
|||
p = (char *)p + 128; |
|||
|
|||
vdata0 = vec_xor(v0, va0); |
|||
vdata1 = vec_xor(v1, va1); |
|||
vdata2 = vec_xor(v2, va2); |
|||
vdata3 = vec_xor(v3, va3); |
|||
vdata4 = vec_xor(v4, va4); |
|||
vdata5 = vec_xor(v5, va5); |
|||
vdata6 = vec_xor(v6, va6); |
|||
vdata7 = vec_xor(v7, va7); |
|||
|
|||
/* Check if we have more blocks to process */ |
|||
next_block = 0; |
|||
if (length != 0) { |
|||
next_block = 1; |
|||
|
|||
/* zero v0-v7 */ |
|||
v0 = vec_xor(v0, v0); |
|||
v1 = vec_xor(v1, v1); |
|||
v2 = vec_xor(v2, v2); |
|||
v3 = vec_xor(v3, v3); |
|||
v4 = vec_xor(v4, v4); |
|||
v5 = vec_xor(v5, v5); |
|||
v6 = vec_xor(v6, v6); |
|||
v7 = vec_xor(v7, v7); |
|||
} |
|||
length = length + 128; |
|||
|
|||
} while (next_block); |
|||
|
|||
/* Calculate how many bytes we have left. */ |
|||
length = (len & 127); |
|||
|
|||
/* Calculate where in (short) constant table we need to start. */ |
|||
offset = 128 - length; |
|||
|
|||
v0 = vec_ld(offset, vcrc_short_const); |
|||
v1 = vec_ld(offset + 16, vcrc_short_const); |
|||
v2 = vec_ld(offset + 32, vcrc_short_const); |
|||
v3 = vec_ld(offset + 48, vcrc_short_const); |
|||
v4 = vec_ld(offset + 64, vcrc_short_const); |
|||
v5 = vec_ld(offset + 80, vcrc_short_const); |
|||
v6 = vec_ld(offset + 96, vcrc_short_const); |
|||
v7 = vec_ld(offset + 112, vcrc_short_const); |
|||
|
|||
offset += 128; |
|||
|
|||
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata0,(__vector unsigned int)v0); |
|||
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata1,(__vector unsigned int)v1); |
|||
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata2,(__vector unsigned int)v2); |
|||
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata3,(__vector unsigned int)v3); |
|||
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata4,(__vector unsigned int)v4); |
|||
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata5,(__vector unsigned int)v5); |
|||
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata6,(__vector unsigned int)v6); |
|||
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata7,(__vector unsigned int)v7); |
|||
|
|||
/* Now reduce the tail (0-112 bytes). */ |
|||
for (i = 0; i < length; i+=16) { |
|||
vdata0 = vec_ld(i,(__vector unsigned long long*)p); |
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
va0 = vec_ld(offset + i,vcrc_short_const); |
|||
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata0,(__vector unsigned int)va0); |
|||
v0 = vec_xor(v0, va0); |
|||
} |
|||
|
|||
/* xor all parallel chunks together. */ |
|||
v0 = vec_xor(v0, v1); |
|||
v2 = vec_xor(v2, v3); |
|||
v4 = vec_xor(v4, v5); |
|||
v6 = vec_xor(v6, v7); |
|||
|
|||
v0 = vec_xor(v0, v2); |
|||
v4 = vec_xor(v4, v6); |
|||
|
|||
v0 = vec_xor(v0, v4); |
|||
} |
|||
|
|||
/* Barrett Reduction */ |
|||
vconst1 = vec_ld(0, v_Barrett_const); |
|||
vconst2 = vec_ld(16, v_Barrett_const); |
|||
|
|||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
|||
(__vector unsigned char)v0, 8); |
|||
v0 = vec_xor(v1,v0); |
|||
|
|||
#ifdef REFLECT |
|||
/* shift left one bit */ |
|||
vsht_splat = vec_splat_u8 (1); |
|||
v0 = (__vector unsigned long long)vec_sll ((__vector unsigned char)v0, |
|||
vsht_splat); |
|||
#endif |
|||
|
|||
v0 = vec_and(v0, vmask_64bit); |
|||
|
|||
#ifndef REFLECT |
|||
|
|||
/* |
|||
* Now for the actual algorithm. The idea is to calculate q, |
|||
* the multiple of our polynomial that we need to subtract. By |
|||
* doing the computation 2x bits higher (ie 64 bits) and shifting the |
|||
* result back down 2x bits, we round down to the nearest multiple. |
|||
*/ |
|||
|
|||
/* ma */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0, |
|||
(__vector unsigned long long)vconst1); |
|||
/* q = floor(ma/(2^64)) */ |
|||
v1 = (__vector unsigned long long)vec_sld ((__vector unsigned char)vzero, |
|||
(__vector unsigned char)v1, 8); |
|||
/* qn */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, |
|||
(__vector unsigned long long)vconst2); |
|||
/* a - qn, subtraction is xor in GF(2) */ |
|||
v0 = vec_xor (v0, v1); |
|||
/* |
|||
* Get the result into r3. We need to shift it left 8 bytes: |
|||
* V0 [ 0 1 2 X ] |
|||
* V0 [ 0 X 2 3 ] |
|||
*/ |
|||
result = __builtin_unpack_vector_1 (v0); |
|||
#else |
|||
|
|||
/* |
|||
* The reflected version of Barrett reduction. Instead of bit |
|||
* reflecting our data (which is expensive to do), we bit reflect our |
|||
* constants and our algorithm, which means the intermediate data in |
|||
* our vector registers goes from 0-63 instead of 63-0. We can reflect |
|||
* the algorithm because we don't carry in mod 2 arithmetic. |
|||
*/ |
|||
|
|||
/* bottom 32 bits of a */ |
|||
v1 = vec_and(v0, vmask_32bit); |
|||
|
|||
/* ma */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, |
|||
(__vector unsigned long long)vconst1); |
|||
|
|||
/* bottom 32bits of ma */ |
|||
v1 = vec_and(v1, vmask_32bit); |
|||
/* qn */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, |
|||
(__vector unsigned long long)vconst2); |
|||
/* a - qn, subtraction is xor in GF(2) */ |
|||
v0 = vec_xor (v0, v1); |
|||
|
|||
/* |
|||
* Since we are bit reflected, the result (ie the low 32 bits) is in |
|||
* the high 32 bits. We just need to shift it left 4 bytes |
|||
* V0 [ 0 1 X 3 ] |
|||
* V0 [ 0 X 2 3 ] |
|||
*/ |
|||
|
|||
/* shift result into top 64 bits of */ |
|||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
|||
(__vector unsigned char)vzero, 4); |
|||
|
|||
result = __builtin_unpack_vector_0 (v0); |
|||
#endif |
|||
|
|||
return result; |
|||
} |
#define POWER8_INTRINSICS
#include "pcc_crc32_constants.h"
#include "crc_ppc64.h"

mysys/crc32/crc32c.cc: file diff suppressed because it is too large (1254 lines changed).
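Since the crc32c.cc diff is suppressed, here is a minimal sketch of the basic x86-64 idea only: CRC-32C accumulated 8 bytes at a time with the SSE4.2 CRC32 instruction. The actual file is the RocksDB-derived version, which additionally interleaves several CRC32 streams and recombines them with PCLMULQDQ; this sketch does not show that part.

/* Minimal illustration of the plain SSE4.2 path; not the RocksDB port. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <nmmintrin.h>

__attribute__((target("sse4.2")))
static uint32_t crc32c_sse42_simple(uint32_t crc, const unsigned char *p, size_t len)
{
  uint64_t c= (uint32_t) ~crc;          /* CRC-32C works on the inverted value */
  for (; len >= 8; len-= 8, p+= 8)      /* 8 bytes per CRC32 instruction */
  {
    uint64_t word;
    memcpy(&word, p, 8);                /* unaligned-safe load */
    c= _mm_crc32_u64(c, word);
  }
  while (len--)                         /* byte-at-a-time tail */
    c= _mm_crc32_u8((uint32_t) c, *p++);
  return ~(uint32_t) c;
}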
mysys/crc32/crc32c_ppc.c: @@ -0,0 +1,5 @@
#define CRC32_FUNCTION crc32c_ppc
#define CRC_TABLE
#define POWER8_INTRINSICS
#include "pcc_crc32c_constants.h"
#include "crc_ppc64.h"
mysys/crc32/crc32c_ppc.h: @@ -0,0 +1,19 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// Copyright (c) 2017 International Business Machines Corp.
// All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#ifdef __cplusplus
extern "C" {
#endif

extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer,
                           unsigned len);

#ifdef __cplusplus
}
#endif
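The header above fixes the exact signature of the POWER CRC-32C kernel. A hypothetical direct caller (buffer name and seed are illustrative; in the tree this kernel is reached through the unified mysys interface rather than called directly) would look like:

/* Hypothetical direct use of the kernel declared above. */
#include <stdint.h>

extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer,
                           unsigned len);

uint32_t page_crc32c(const unsigned char *page, unsigned page_size)
{
  uint32_t crc= 0;                       /* seed value; 0 is illustrative */
  crc= crc32c_ppc(crc, page, page_size); /* single-shot CRC over the page */
  return crc;
}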
mysys/crc32/crc_ppc64.h: @@ -0,0 +1,664 @@
|||
/* |
|||
* Calculate the checksum of data that is 16 byte aligned and a multiple of |
|||
* 16 bytes. |
|||
* |
|||
* The first step is to reduce it to 1024 bits. We do this in 8 parallel |
|||
* chunks in order to mask the latency of the vpmsum instructions. If we |
|||
* have more than 32 kB of data to checksum we repeat this step multiple |
|||
* times, passing in the previous 1024 bits. |
|||
* |
|||
* The next step is to reduce the 1024 bits to 64 bits. This step adds |
|||
* 32 bits of 0s to the end - this matches what a CRC does. We just |
|||
* calculate constants that land the data in this 32 bits. |
|||
* |
|||
* We then use fixed point Barrett reduction to compute a mod n over GF(2) |
|||
* for n = CRC using POWER8 instructions. We use x = 32. |
|||
* |
|||
* http://en.wikipedia.org/wiki/Barrett_reduction |
|||
* |
|||
 * This code uses gcc vector builtins instead of using assembly directly.
|||
* |
|||
* Copyright (C) 2017 Rogerio Alves <rogealve@br.ibm.com>, IBM |
|||
* |
|||
* This program is free software; you can redistribute it and/or |
|||
* modify it under the terms of either: |
|||
* |
|||
* a) the GNU General Public License as published by the Free Software |
|||
* Foundation; either version 2 of the License, or (at your option) |
|||
* any later version, or |
|||
* b) the Apache License, Version 2.0 |
|||
*/ |
|||
|
|||
#include <altivec.h> |
|||
|
|||
|
|||
#define VMX_ALIGN 16 |
|||
#define VMX_ALIGN_MASK (VMX_ALIGN-1) |
|||
|
|||
#ifdef REFLECT |
|||
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, |
|||
unsigned long len) |
|||
{ |
|||
while (len--) |
|||
crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); |
|||
return crc; |
|||
} |
|||
#else |
|||
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, |
|||
unsigned long len) |
|||
{ |
|||
while (len--) |
|||
crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8); |
|||
return crc; |
|||
} |
|||
#endif |
|||
|
|||
static unsigned int __attribute__ ((aligned (32))) |
|||
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len); |
|||
|
|||
|
|||
unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p, |
|||
unsigned long len) |
|||
{ |
|||
unsigned int prealign; |
|||
unsigned int tail; |
|||
|
|||
#ifdef CRC_XOR |
|||
crc ^= 0xffffffff; |
|||
#endif |
|||
|
|||
if (len < VMX_ALIGN + VMX_ALIGN_MASK) { |
|||
crc = crc32_align(crc, p, len); |
|||
goto out; |
|||
} |
|||
|
|||
if ((unsigned long)p & VMX_ALIGN_MASK) { |
|||
prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); |
|||
crc = crc32_align(crc, p, prealign); |
|||
len -= prealign; |
|||
p += prealign; |
|||
} |
|||
|
|||
crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); |
|||
|
|||
tail = len & VMX_ALIGN_MASK; |
|||
if (tail) { |
|||
p += len & ~VMX_ALIGN_MASK; |
|||
crc = crc32_align(crc, p, tail); |
|||
} |
|||
|
|||
out: |
|||
#ifdef CRC_XOR |
|||
crc ^= 0xffffffff; |
|||
#endif |
|||
|
|||
return crc; |
|||
} |
|||
|
|||
#if defined (__clang__) |
|||
#include "clang_workaround.h" |
|||
#else |
|||
#define __builtin_pack_vector(a, b) __builtin_pack_vector_int128 ((a), (b)) |
|||
#define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 0) |
|||
#define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 1) |
|||
#endif |
|||
|
|||
/* When we have a load-store in a single-dispatch group and address overlap |
|||
 * such that forwarding is not allowed (load-hit-store) the group must be flushed.
|||
* A group ending NOP prevents the flush. |
|||
*/ |
|||
#define GROUP_ENDING_NOP asm("ori 2,2,0" ::: "memory") |
|||
|
|||
#if defined(__BIG_ENDIAN__) && defined (REFLECT) |
|||
#define BYTESWAP_DATA |
|||
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) |
|||
#define BYTESWAP_DATA |
|||
#endif |
|||
|
|||
#ifdef BYTESWAP_DATA |
|||
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb,\ |
|||
(__vector unsigned char) vc) |
|||
#if defined(__LITTLE_ENDIAN__) |
|||
/* Byte reverse permute constant LE. */ |
|||
static const __vector unsigned long long vperm_const |
|||
__attribute__ ((aligned(16))) = { 0x08090A0B0C0D0E0FUL, |
|||
0x0001020304050607UL }; |
|||
#else |
|||
static const __vector unsigned long long vperm_const |
|||
__attribute__ ((aligned(16))) = { 0x0F0E0D0C0B0A0908UL, |
|||
0X0706050403020100UL }; |
|||
#endif |
|||
#else |
|||
#define VEC_PERM(vr, va, vb, vc) |
|||
#endif |
|||
|
|||
static unsigned int __attribute__ ((aligned (32))) |
|||
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) { |
|||
|
|||
const __vector unsigned long long vzero = {0,0}; |
|||
const __vector unsigned long long vones = {0xffffffffffffffffUL, |
|||
0xffffffffffffffffUL}; |
|||
|
|||
#ifdef REFLECT |
|||
__vector unsigned char vsht_splat; |
|||
const __vector unsigned long long vmask_32bit = |
|||
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, |
|||
(__vector unsigned char)vones, 4); |
|||
#endif |
|||
|
|||
const __vector unsigned long long vmask_64bit = |
|||
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, |
|||
(__vector unsigned char)vones, 8); |
|||
|
|||
__vector unsigned long long vcrc; |
|||
|
|||
__vector unsigned long long vconst1, vconst2; |
|||
|
|||
/* vdata0-vdata7 will contain our data (p). */ |
|||
__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, |
|||
vdata5, vdata6, vdata7; |
|||
|
|||
/* v0-v7 will contain our checksums */ |
|||
__vector unsigned long long v0 = {0,0}; |
|||
__vector unsigned long long v1 = {0,0}; |
|||
__vector unsigned long long v2 = {0,0}; |
|||
__vector unsigned long long v3 = {0,0}; |
|||
__vector unsigned long long v4 = {0,0}; |
|||
__vector unsigned long long v5 = {0,0}; |
|||
__vector unsigned long long v6 = {0,0}; |
|||
__vector unsigned long long v7 = {0,0}; |
|||
|
|||
|
|||
/* Vector auxiliary variables. */ |
|||
__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7; |
|||
|
|||
unsigned int result = 0; |
|||
unsigned int offset; /* Constant table offset. */ |
|||
|
|||
unsigned long i; /* Counter. */ |
|||
unsigned long chunks; |
|||
|
|||
unsigned long block_size; |
|||
int next_block = 0; |
|||
|
|||
/* Align by 128 bits. The last 128 bit block will be processed at end. */ |
|||
unsigned long length = len & 0xFFFFFFFFFFFFFF80UL; |
|||
|
|||
#ifdef REFLECT |
|||
vcrc = (__vector unsigned long long)__builtin_pack_vector(0UL, crc); |
|||
#else |
|||
vcrc = (__vector unsigned long long)__builtin_pack_vector(crc, 0UL); |
|||
|
|||
/* Shift into top 32 bits */ |
|||
vcrc = (__vector unsigned long long)vec_sld((__vector unsigned char)vcrc, |
|||
(__vector unsigned char)vzero, 4); |
|||
#endif |
|||
|
|||
/* Short version. */ |
|||
if (len < 256) { |
|||
/* Calculate where in the constant table we need to start. */ |
|||
offset = 256 - len; |
|||
|
|||
vconst1 = vec_ld(offset, vcrc_short_const); |
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vconst1, vperm_const); |
|||
|
|||
/* xor initial value*/ |
|||
vdata0 = vec_xor(vdata0, vcrc); |
|||
|
|||
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw |
|||
((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); |
|||
v0 = vec_xor(v0, vdata0); |
|||
|
|||
for (i = 16; i < len; i += 16) { |
|||
vconst1 = vec_ld(offset + i, vcrc_short_const); |
|||
vdata0 = vec_ld(i, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vconst1, vperm_const); |
|||
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw |
|||
((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); |
|||
v0 = vec_xor(v0, vdata0); |
|||
} |
|||
} else { |
|||
|
|||
/* Load initial values. */ |
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
vdata1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const); |
|||
|
|||
vdata2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
vdata3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const); |
|||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const); |
|||
|
|||
vdata4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
vdata5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const); |
|||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const); |
|||
|
|||
vdata6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
vdata7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
|
|||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const); |
|||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const); |
|||
|
|||
/* xor in initial value */ |
|||
vdata0 = vec_xor(vdata0, vcrc); |
|||
|
|||
p = (char *)p + 128; |
|||
|
|||
do { |
|||
/* Checksum in blocks of MAX_SIZE. */ |
|||
block_size = length; |
|||
if (block_size > MAX_SIZE) { |
|||
block_size = MAX_SIZE; |
|||
} |
|||
|
|||
length = length - block_size; |
|||
|
|||
/* |
|||
* Work out the offset into the constants table to start at. Each |
|||
* constant is 16 bytes, and it is used against 128 bytes of input |
|||
* data - 128 / 16 = 8 |
|||
*/ |
|||
offset = (MAX_SIZE/8) - (block_size/8); |
|||
/* We reduce our final 128 bytes in a separate step */ |
|||
chunks = (block_size/128)-1; |
|||
|
|||
vconst1 = vec_ld(offset, vcrc_const); |
|||
|
|||
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0, |
|||
(__vector unsigned long long)vconst1); |
|||
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1, |
|||
(__vector unsigned long long)vconst1); |
|||
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2, |
|||
(__vector unsigned long long)vconst1); |
|||
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3, |
|||
(__vector unsigned long long)vconst1); |
|||
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4, |
|||
(__vector unsigned long long)vconst1); |
|||
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5, |
|||
(__vector unsigned long long)vconst1); |
|||
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6, |
|||
(__vector unsigned long long)vconst1); |
|||
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7, |
|||
(__vector unsigned long long)vconst1); |
|||
|
|||
if (chunks > 1) { |
|||
offset += 16; |
|||
vconst2 = vec_ld(offset, vcrc_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
|
|||
vdata1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const); |
|||
|
|||
vdata2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const); |
|||
|
|||
vdata3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const); |
|||
|
|||
vdata4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const); |
|||
|
|||
vdata5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const); |
|||
|
|||
vdata6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const); |
|||
|
|||
vdata7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const); |
|||
|
|||
p = (char *)p + 128; |
|||
|
|||
/* |
|||
* main loop. We modulo schedule it such that it takes three |
|||
* iterations to complete - first iteration load, second |
|||
* iteration vpmsum, third iteration xor. |
|||
*/ |
|||
for (i = 0; i < chunks-2; i++) { |
|||
vconst1 = vec_ld(offset, vcrc_const); |
|||
offset += 16; |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v0 = vec_xor(v0, va0); |
|||
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata0, (__vector unsigned long long)vconst2); |
|||
vdata0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v1 = vec_xor(v1, va1); |
|||
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata1, (__vector unsigned long long)vconst2); |
|||
vdata1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v2 = vec_xor(v2, va2); |
|||
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata2, (__vector unsigned long long)vconst2); |
|||
vdata2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v3 = vec_xor(v3, va3); |
|||
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata3, (__vector unsigned long long)vconst2); |
|||
vdata3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const); |
|||
|
|||
vconst2 = vec_ld(offset, vcrc_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v4 = vec_xor(v4, va4); |
|||
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata4, (__vector unsigned long long)vconst1); |
|||
vdata4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v5 = vec_xor(v5, va5); |
|||
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata5, (__vector unsigned long long)vconst1); |
|||
vdata5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v6 = vec_xor(v6, va6); |
|||
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata6, (__vector unsigned long long)vconst1); |
|||
vdata6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v7 = vec_xor(v7, va7); |
|||
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata7, (__vector unsigned long long)vconst1); |
|||
vdata7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const); |
|||
|
|||
p = (char *)p + 128; |
|||
} |
|||
|
|||
/* First cool down*/ |
|||
vconst1 = vec_ld(offset, vcrc_const); |
|||
offset += 16; |
|||
|
|||
v0 = vec_xor(v0, va0); |
|||
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata0, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v1 = vec_xor(v1, va1); |
|||
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata1, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v2 = vec_xor(v2, va2); |
|||
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata2, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v3 = vec_xor(v3, va3); |
|||
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata3, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v4 = vec_xor(v4, va4); |
|||
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata4, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v5 = vec_xor(v5, va5); |
|||
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata5, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v6 = vec_xor(v6, va6); |
|||
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata6, (__vector unsigned long long)vconst1); |
|||
GROUP_ENDING_NOP; |
|||
|
|||
v7 = vec_xor(v7, va7); |
|||
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long |
|||
long)vdata7, (__vector unsigned long long)vconst1); |
|||
}/* else */ |
|||
|
|||
/* Second cool down. */ |
|||
v0 = vec_xor(v0, va0); |
|||
v1 = vec_xor(v1, va1); |
|||
v2 = vec_xor(v2, va2); |
|||
v3 = vec_xor(v3, va3); |
|||
v4 = vec_xor(v4, va4); |
|||
v5 = vec_xor(v5, va5); |
|||
v6 = vec_xor(v6, va6); |
|||
v7 = vec_xor(v7, va7); |
|||
|
|||
#ifdef REFLECT |
|||
/* |
|||
* vpmsumd produces a 96 bit result in the least significant bits |
|||
* of the register. Since we are bit reflected we have to shift it |
|||
* left 32 bits so it occupies the least significant bits in the |
|||
* bit reflected domain. |
|||
*/ |
|||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
|||
(__vector unsigned char)vzero, 4); |
|||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1, |
|||
(__vector unsigned char)vzero, 4); |
|||
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2, |
|||
(__vector unsigned char)vzero, 4); |
|||
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3, |
|||
(__vector unsigned char)vzero, 4); |
|||
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4, |
|||
(__vector unsigned char)vzero, 4); |
|||
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5, |
|||
(__vector unsigned char)vzero, 4); |
|||
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6, |
|||
(__vector unsigned char)vzero, 4); |
|||
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7, |
|||
(__vector unsigned char)vzero, 4); |
|||
#endif |
|||
|
|||
/* xor with the last 1024 bits. */ |
|||
va0 = vec_ld(0, (__vector unsigned long long*) p); |
|||
VEC_PERM(va0, va0, va0, vperm_const); |
|||
|
|||
va1 = vec_ld(16, (__vector unsigned long long*) p); |
|||
VEC_PERM(va1, va1, va1, vperm_const); |
|||
|
|||
va2 = vec_ld(32, (__vector unsigned long long*) p); |
|||
VEC_PERM(va2, va2, va2, vperm_const); |
|||
|
|||
va3 = vec_ld(48, (__vector unsigned long long*) p); |
|||
VEC_PERM(va3, va3, va3, vperm_const); |
|||
|
|||
va4 = vec_ld(64, (__vector unsigned long long*) p); |
|||
VEC_PERM(va4, va4, va4, vperm_const); |
|||
|
|||
va5 = vec_ld(80, (__vector unsigned long long*) p); |
|||
VEC_PERM(va5, va5, va5, vperm_const); |
|||
|
|||
va6 = vec_ld(96, (__vector unsigned long long*) p); |
|||
VEC_PERM(va6, va6, va6, vperm_const); |
|||
|
|||
va7 = vec_ld(112, (__vector unsigned long long*) p); |
|||
VEC_PERM(va7, va7, va7, vperm_const); |
|||
|
|||
p = (char *)p + 128; |
|||
|
|||
vdata0 = vec_xor(v0, va0); |
|||
vdata1 = vec_xor(v1, va1); |
|||
vdata2 = vec_xor(v2, va2); |
|||
vdata3 = vec_xor(v3, va3); |
|||
vdata4 = vec_xor(v4, va4); |
|||
vdata5 = vec_xor(v5, va5); |
|||
vdata6 = vec_xor(v6, va6); |
|||
vdata7 = vec_xor(v7, va7); |
|||
|
|||
/* Check if we have more blocks to process */ |
|||
next_block = 0; |
|||
if (length != 0) { |
|||
next_block = 1; |
|||
|
|||
/* zero v0-v7 */ |
|||
v0 = vec_xor(v0, v0); |
|||
v1 = vec_xor(v1, v1); |
|||
v2 = vec_xor(v2, v2); |
|||
v3 = vec_xor(v3, v3); |
|||
v4 = vec_xor(v4, v4); |
|||
v5 = vec_xor(v5, v5); |
|||
v6 = vec_xor(v6, v6); |
|||
v7 = vec_xor(v7, v7); |
|||
} |
|||
length = length + 128; |
|||
|
|||
} while (next_block); |
|||
|
|||
/* Calculate how many bytes we have left. */ |
|||
length = (len & 127); |
|||
|
|||
/* Calculate where in (short) constant table we need to start. */ |
|||
offset = 128 - length; |
|||
|
|||
v0 = vec_ld(offset, vcrc_short_const); |
|||
v1 = vec_ld(offset + 16, vcrc_short_const); |
|||
v2 = vec_ld(offset + 32, vcrc_short_const); |
|||
v3 = vec_ld(offset + 48, vcrc_short_const); |
|||
v4 = vec_ld(offset + 64, vcrc_short_const); |
|||
v5 = vec_ld(offset + 80, vcrc_short_const); |
|||
v6 = vec_ld(offset + 96, vcrc_short_const); |
|||
v7 = vec_ld(offset + 112, vcrc_short_const); |
|||
|
|||
offset += 128; |
|||
|
|||
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata0,(__vector unsigned int)v0); |
|||
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata1,(__vector unsigned int)v1); |
|||
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata2,(__vector unsigned int)v2); |
|||
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata3,(__vector unsigned int)v3); |
|||
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata4,(__vector unsigned int)v4); |
|||
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata5,(__vector unsigned int)v5); |
|||
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata6,(__vector unsigned int)v6); |
|||
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata7,(__vector unsigned int)v7); |
|||
|
|||
/* Now reduce the tail (0-112 bytes). */ |
|||
for (i = 0; i < length; i+=16) { |
|||
vdata0 = vec_ld(i,(__vector unsigned long long*)p); |
|||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
|||
va0 = vec_ld(offset + i,vcrc_short_const); |
|||
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( |
|||
(__vector unsigned int)vdata0,(__vector unsigned int)va0); |
|||
v0 = vec_xor(v0, va0); |
|||
} |
|||
|
|||
/* xor all parallel chunks together. */ |
|||
v0 = vec_xor(v0, v1); |
|||
v2 = vec_xor(v2, v3); |
|||
v4 = vec_xor(v4, v5); |
|||
v6 = vec_xor(v6, v7); |
|||
|
|||
v0 = vec_xor(v0, v2); |
|||
v4 = vec_xor(v4, v6); |
|||
|
|||
v0 = vec_xor(v0, v4); |
|||
} |
|||
|
|||
/* Barrett Reduction */ |
|||
vconst1 = vec_ld(0, v_Barrett_const); |
|||
vconst2 = vec_ld(16, v_Barrett_const); |
|||
|
|||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
|||
(__vector unsigned char)v0, 8); |
|||
v0 = vec_xor(v1,v0); |
|||
|
|||
#ifdef REFLECT |
|||
/* shift left one bit */ |
|||
vsht_splat = vec_splat_u8 (1); |
|||
v0 = (__vector unsigned long long)vec_sll ((__vector unsigned char)v0, |
|||
vsht_splat); |
|||
#endif |
|||
|
|||
v0 = vec_and(v0, vmask_64bit); |
|||
|
|||
#ifndef REFLECT |
|||
|
|||
/* |
|||
* Now for the actual algorithm. The idea is to calculate q, |
|||
* the multiple of our polynomial that we need to subtract. By |
|||
* doing the computation 2x bits higher (ie 64 bits) and shifting the |
|||
* result back down 2x bits, we round down to the nearest multiple. |
|||
*/ |
|||
|
|||
/* ma */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0, |
|||
(__vector unsigned long long)vconst1); |
|||
/* q = floor(ma/(2^64)) */ |
|||
v1 = (__vector unsigned long long)vec_sld ((__vector unsigned char)vzero, |
|||
(__vector unsigned char)v1, 8); |
|||
/* qn */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, |
|||
(__vector unsigned long long)vconst2); |
|||
/* a - qn, subtraction is xor in GF(2) */ |
|||
v0 = vec_xor (v0, v1); |
|||
/* |
|||
* Get the result into r3. We need to shift it left 8 bytes: |
|||
* V0 [ 0 1 2 X ] |
|||
* V0 [ 0 X 2 3 ] |
|||
*/ |
|||
result = __builtin_unpack_vector_1 (v0); |
|||
#else |
|||
|
|||
/* |
|||
* The reflected version of Barrett reduction. Instead of bit |
|||
* reflecting our data (which is expensive to do), we bit reflect our |
|||
* constants and our algorithm, which means the intermediate data in |
|||
* our vector registers goes from 0-63 instead of 63-0. We can reflect |
|||
* the algorithm because we don't carry in mod 2 arithmetic. |
|||
*/ |
|||
|
|||
/* bottom 32 bits of a */ |
|||
v1 = vec_and(v0, vmask_32bit); |
|||
|
|||
/* ma */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, |
|||
(__vector unsigned long long)vconst1); |
|||
|
|||
/* bottom 32bits of ma */ |
|||
v1 = vec_and(v1, vmask_32bit); |
|||
/* qn */ |
|||
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, |
|||
(__vector unsigned long long)vconst2); |
|||
/* a - qn, subtraction is xor in GF(2) */ |
|||
v0 = vec_xor (v0, v1); |
|||
|
|||
/* |
|||
* Since we are bit reflected, the result (ie the low 32 bits) is in |
|||
* the high 32 bits. We just need to shift it left 4 bytes |
|||
* V0 [ 0 1 X 3 ] |
|||
* V0 [ 0 X 2 3 ] |
|||
*/ |
|||
|
|||
/* shift result into top 64 bits of */ |
|||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
|||
(__vector unsigned char)vzero, 4); |
|||
|
|||
result = __builtin_unpack_vector_0 (v0); |
|||
#endif |
|||
|
|||
return result; |
|||
} |
@ -1,346 +0,0 @@ |
|||
/*****************************************************************************
|
|||
|
|||
Copyright (c) 2009, 2010 Facebook, Inc. All Rights Reserved. |
|||
Copyright (c) 2011, 2015, Oracle and/or its affiliates. All Rights Reserved. |
|||
Copyright (c) 2016, 2020, MariaDB Corporation. |
|||
|
|||
This program is free software; you can redistribute it and/or modify it under |
|||
the terms of the GNU General Public License as published by the Free Software |
|||
Foundation; version 2 of the License. |
|||
|
|||
This program is distributed in the hope that it will be useful, but WITHOUT |
|||
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
|||
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU General Public License along with |
|||
this program; if not, write to the Free Software Foundation, Inc., |
|||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA |
|||
|
|||
*****************************************************************************/ |
|||
|
|||
/***************************************************************//**
|
|||
@file ut/ut0crc32.cc |
|||
CRC32 implementation from Facebook, based on the zlib implementation. |
|||
|
|||
Created Aug 8, 2011, Vasil Dimov, based on mysys/my_crc32.c and |
|||
mysys/my_perf.c, contributed by Facebook under the following license. |
|||
********************************************************************/ |
|||
|
|||
/* Copyright (C) 2009-2010 Facebook, Inc. All Rights Reserved.
|
|||
|
|||
Dual licensed under BSD license and GPLv2. |
|||
|
|||
Redistribution and use in source and binary forms, with or without |
|||
modification, are permitted provided that the following conditions are met: |
|||
1. Redistributions of source code must retain the above copyright notice, |
|||
this list of conditions and the following disclaimer. |
|||
2. Redistributions in binary form must reproduce the above copyright notice, |
|||
this list of conditions and the following disclaimer in the documentation |
|||
and/or other materials provided with the distribution. |
|||
|
|||
THIS SOFTWARE IS PROVIDED BY FACEBOOK, INC. ``AS IS'' AND ANY EXPRESS OR |
|||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
|||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO |
|||
EVENT SHALL FACEBOOK, INC. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; |
|||
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
|||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR |
|||
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF |
|||
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|||
|
|||
This program is free software; you can redistribute it and/or modify it |
|||
under the terms of the GNU General Public License as published by the Free |
|||
Software Foundation; version 2 of the License. |
|||
|
|||
This program is distributed in the hope that it will be useful, but WITHOUT |
|||
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|||
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
|||
more details. |
|||
|
|||
You should have received a copy of the GNU General Public License along with |
|||
this program; if not, write to the Free Software Foundation, Inc., |
|||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ |
|||
|
|||
/* The below CRC32 implementation is based on the implementation included with
|
|||
* zlib with modifications to process 8 bytes at a time and using SSE 4.2 |
|||
* extensions when available. The polynomial constant has been changed to |
|||
* match the one used by SSE 4.2 and does not return the same value as the |
|||
* version used by zlib. The original zlib copyright notice follows. */ |
|||
|
|||
/* crc32.c -- compute the CRC-32 of a buf stream
|
|||
* Copyright (C) 1995-2005 Mark Adler |
|||
* For conditions of distribution and use, see copyright notice in zlib.h |
|||
* |
|||
* Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster |
|||
* CRC methods: exclusive-oring 32 bits of buf at a time, and pre-computing |
|||
* tables for updating the shift register in one step with three exclusive-ors |
|||
* instead of four steps with four exclusive-ors. This results in about a |
|||
* factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. |
|||
*/ |
|||
|
|||
// First include (the generated) my_config.h, to get correct platform defines.
|
|||
#include "my_config.h"
|
|||
#include <string.h>
|
|||
|
|||
#include "ut0crc32.h"
|
|||
#include "my_valgrind.h"
|
|||
|
|||
#ifdef HAVE_CPUID_INSTRUCTION
|
|||
# ifdef _MSC_VER
|
|||
# include <intrin.h>
|
|||
# else
|
|||
# include <cpuid.h>
|
|||
# if defined __GNUC__ && !defined __clang__ && __GNUC__ < 5
|
|||
/* <nmmintrin.h> does not really work in GCC before version 5 */ |
|||
# define _mm_crc32_u8(crc,data) __builtin_ia32_crc32qi(crc,data)
|
|||
# define _mm_crc32_u32(crc,data) __builtin_ia32_crc32si(crc,data)
|
|||
# define _mm_crc32_u64(crc,data) __builtin_ia32_crc32di(crc,data)
|
|||
# else
|
|||
# include <nmmintrin.h>
|
|||
# endif
|
|||
# endif
|
|||
#endif
|
|||
|
|||
/* CRC32 hardware implementation. */

#ifdef HAVE_CRC32_VPMSUM
extern "C"
unsigned int crc32c_vpmsum(unsigned int crc, const unsigned char *p, unsigned long len);
ut_crc32_func_t ut_crc32_low= crc32c_vpmsum;
const char* ut_crc32_implementation = "Using POWER8 crc32 instructions";
#else
# if defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
extern "C" {
uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len);
};
# elif defined HAVE_CPUID_INSTRUCTION
/** return whether SSE4.2 instructions are available */
static inline bool has_sse4_2()
{
  /* We assume that the CPUID instruction and its parameter 1 are available.
  We do not support any precursors of the Intel 80486. */
#  ifdef _MSC_VER
  int data[4];
  __cpuid(data, 1);
  return !!(data[2] & 1 << 20);
#  else
  uint32_t reax = 0, rebx = 0, recx = 0, redx = 0;
  __cpuid(1, reax, rebx, recx, redx);
  return !!(recx & 1 << 20);
#  endif
}

/** Append 8 bits (1 byte) to a CRC-32C checksum.
@param crc   CRC-32C checksum so far
@param data  data to be checksummed
@return the updated CRC-32C */
__attribute__((target("sse4.2")))
static inline ulint ut_crc32c_8(ulint crc, byte data)
{
  return _mm_crc32_u8(static_cast<uint32_t>(crc), data);
}

/** Append 64 bits (8 aligned bytes) to a CRC-32C checksum
@param[in] crc   CRC-32C checksum so far
@param[in] data  8 bytes of aligned data
@return the updated CRC-32C */
__attribute__((target("sse4.2")))
static inline ulint ut_crc32c_64(ulint crc, uint64_t data)
{
# if SIZEOF_SIZE_T > 4
  return _mm_crc32_u64(crc, data);
# else
  crc= _mm_crc32_u32(crc, static_cast<uint32_t>(data));
  crc= _mm_crc32_u32(crc, static_cast<uint32_t>(data >> 32));
  return crc;
# endif
}

/** Calculate CRC-32C using dedicated IA-32 or AMD64 instructions
@param crc  current checksum
@param buf  data to append to the checksum
@param len  data length in bytes
@return CRC-32C (polynomial 0x11EDC6F41) */
uint32_t ut_crc32_hw(uint32_t crc, const byte *buf, size_t len)
{
  ulint c= static_cast<uint32_t>(~crc);

  /* Calculate byte-by-byte up to an 8-byte aligned address. After
  this consume the input 8-bytes at a time. */
  while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0)
  {
    c= ut_crc32c_8(c, *buf++);
    len--;
  }

  const uint64_t* b64= reinterpret_cast<const uint64_t*>(buf);

  for (; len >= 128; len-= 128)
  {
    /* This call is repeated 16 times. 16 * 8 = 128. */
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
    c= ut_crc32c_64(c, *b64++);
  }

  for (; len >= 8; len-= 8)
    c= ut_crc32c_64(c, *b64++);

  buf= reinterpret_cast<const byte*>(b64);

  while (len--)
    c= ut_crc32c_8(c, *buf++);

  return ~static_cast<uint32_t>(c);
}
# endif /* (defined(__GNUC__) && defined(__i386__)) || _MSC_VER */

/* CRC32 software implementation. */

/* Precalculated table used to generate the CRC32 if the CPU does not
have support for it */
static uint32_t ut_crc32_slice8_table[8][256];

/********************************************************************//**
Initializes the table that is used to generate the CRC32 if the CPU does
not have support for it. */
static
void
ut_crc32_slice8_table_init()
/*========================*/
{
  /* bit-reversed poly 0x1EDC6F41 (from SSE42 crc32 instruction) */
  static const uint32_t poly = 0x82f63b78;
  uint32_t n;
  uint32_t k;
  uint32_t c;

  for (n = 0; n < 256; n++) {
    c = n;
    for (k = 0; k < 8; k++) {
      c = (c & 1) ? (poly ^ (c >> 1)) : (c >> 1);
    }
    ut_crc32_slice8_table[0][n] = c;
  }

  for (n = 0; n < 256; n++) {
    c = ut_crc32_slice8_table[0][n];
    for (k = 1; k < 8; k++) {
      c = ut_crc32_slice8_table[0][c & 0xFF] ^ (c >> 8);
      ut_crc32_slice8_table[k][n] = c;
    }
  }
}

/** Append 8 bits (1 byte) to a CRC-32C checksum.
@param crc   CRC-32C checksum so far
@param data  data to be checksummed
@return the updated CRC-32C */
static inline uint32_t ut_crc32c_8_sw(uint32_t crc, byte data)
{
  const uint8_t i= (crc ^ data) & 0xFF;

  return (crc >> 8) ^ ut_crc32_slice8_table[0][i];
}

/** Append 64 bits (8 aligned bytes) to a CRC-32C checksum
@param[in] crc   CRC-32C checksum so far
@param[in] data  8 bytes of aligned data
@return the updated CRC-32C */
static inline uint32_t ut_crc32c_64_sw(uint32_t crc, uint64_t data)
{
# ifdef WORDS_BIGENDIAN
  data= data << 56 |
    (data & 0x000000000000FF00ULL) << 40 |
    (data & 0x0000000000FF0000ULL) << 24 |
    (data & 0x00000000FF000000ULL) << 8 |
    (data & 0x000000FF00000000ULL) >> 8 |
    (data & 0x0000FF0000000000ULL) >> 24 |
    (data & 0x00FF000000000000ULL) >> 40 |
    data >> 56;
# endif /* WORDS_BIGENDIAN */

  data^= crc;
  return
    ut_crc32_slice8_table[7][(data) & 0xFF] ^
    ut_crc32_slice8_table[6][(data >> 8) & 0xFF] ^
    ut_crc32_slice8_table[5][(data >> 16) & 0xFF] ^
    ut_crc32_slice8_table[4][(data >> 24) & 0xFF] ^
    ut_crc32_slice8_table[3][(data >> 32) & 0xFF] ^
    ut_crc32_slice8_table[2][(data >> 40) & 0xFF] ^
    ut_crc32_slice8_table[1][(data >> 48) & 0xFF] ^
    ut_crc32_slice8_table[0][(data >> 56)];
}

/** Calculate CRC-32C using a look-up table.
@param crc  current checksum
@param buf  data to append to the checksum
@param len  data length in bytes
@return CRC-32C (polynomial 0x11EDC6F41) */
uint32_t ut_crc32_sw(uint32_t crc, const byte *buf, size_t len)
{
  crc= ~crc;

  /* Calculate byte-by-byte up to an 8-byte aligned address. After
  this consume the input 8-bytes at a time. */
  while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0)
  {
    crc= ut_crc32c_8_sw(crc, *buf++);
    len--;
  }

  const uint64_t* b64= reinterpret_cast<const uint64_t*>(buf);

  for (; len >= 8; len-= 8)
    crc= ut_crc32c_64_sw(crc, *b64++);

  buf= reinterpret_cast<const byte*>(b64);

  while (len--)
    crc= ut_crc32c_8_sw(crc, *buf++);

  return ~crc;
}

ut_crc32_func_t ut_crc32_low= ut_crc32_sw;
const char *ut_crc32_implementation= "Using generic crc32 instructions";
#endif

/********************************************************************//**
Initializes the data structures used by ut_crc32*(). Does not do any
allocations, would not hurt if called twice, but would be pointless. */
void ut_crc32_init()
{
#ifndef HAVE_CRC32_VPMSUM
# if defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
  if (const char *crc32c_implementation= crc32c_aarch64_available())
  {
    ut_crc32_low= crc32c_aarch64;
    ut_crc32_implementation= crc32c_implementation;
    return;
  }
# elif defined HAVE_CPUID_INSTRUCTION
  if (has_sse4_2())
  {
    ut_crc32_low= ut_crc32_hw;
    ut_crc32_implementation= "Using SSE4.2 crc32 instructions";
    return;
  }
# endif
  ut_crc32_slice8_table_init();
#endif /* !HAVE_CRC32_VPMSUM */
}
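
For readers checking the table-driven code against a textbook definition: the slice-by-8 tables above encode the reflected CRC-32C polynomial 0x82F63B78, so a plain bit-at-a-time loop must produce identical results. The sketch below is illustrative only; it is not part of this commit, and the name crc32c_reference is made up for the example. It computes the same checksum as ut_crc32_sw(), one bit per iteration instead of eight bytes per iteration.

#include <stdint.h>
#include <stddef.h>

/* Reference CRC-32C (Castagnoli): reflected polynomial 0x82F63B78,
   initial value and final result both inverted, processed bit by bit. */
static uint32_t crc32c_reference(uint32_t crc, const unsigned char *buf,
                                 size_t len)
{
  crc= ~crc;
  while (len--)
  {
    crc^= *buf++;
    for (int bit= 0; bit < 8; bit++)
      crc= (crc & 1) ? (crc >> 1) ^ 0x82f63b78 : crc >> 1;
  }
  return ~crc;
}
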
@ -0,0 +1,69 @@
/* Copyright (c) MariaDB 2020

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */

#include <my_global.h>
#include <my_sys.h>
#include <my_crypt.h>
#include <tap.h>
#include <string.h>
#include <ctype.h>
#include <zlib.h>

/*
  Check that the optimized crc32 (IEEE, or Ethernet, polynomial) returns the
  same result as zlib (not so well optimized yet, but trustworthy)
*/
#define DO_TEST_CRC32(crc,str) \
  ok(crc32(crc,(const Bytef *)str,(uint)(sizeof(str)-1)) == my_checksum(crc, str, sizeof(str)-1), "crc32 '%s'",str)

/* Check that the CRC-32C calculation returns the correct result */
#define DO_TEST_CRC32C(crc,str,expected) \
  do { \
    unsigned int v = my_crc32c(crc, str, sizeof(str)-1); \
    printf("crc32c(%u,'%s',%zu)=%u\n",crc,str,sizeof(str)-1,v); \
    ok(expected == v,"crc32c '%s'",str); \
  }while(0)


#define LONG_STR "1234567890234568900212345678901231213123321212123123123123123"\
                 "............................................................................." \
                 "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \
                 "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy" \
                 "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"

int main(int argc __attribute__((unused)),char *argv[]) |
|||
{ |
|||
MY_INIT(argv[0]); |
|||
plan(14); |
|||
printf("%s\n",my_crc32c_implementation()); |
|||
DO_TEST_CRC32(0,""); |
|||
DO_TEST_CRC32(1,""); |
|||
DO_TEST_CRC32(0,"12345"); |
|||
DO_TEST_CRC32(1,"12345"); |
|||
DO_TEST_CRC32(0,"1234567890123456789"); |
|||
DO_TEST_CRC32(0, LONG_STR); |
|||
ok(0 == my_checksum(0, NULL, 0) , "crc32 data = NULL, length = 0"); |
|||
|
|||
DO_TEST_CRC32C(0,"", 0); |
|||
DO_TEST_CRC32C(1,"", 1); |
|||
DO_TEST_CRC32C(0, "12345", 416359221); |
|||
DO_TEST_CRC32C(1, "12345", 549473433); |
|||
DO_TEST_CRC32C(0, "1234567890123456789", 2366987449); |
|||
DO_TEST_CRC32C(0, LONG_STR, 3009234172); |
|||
ok(0 == my_crc32c(0, NULL, 0), "crc32c data = NULL, length = 0"); |
|||
|
|||
my_end(0); |
|||
return exit_status(); |
|||
} |
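
For completeness, a hypothetical caller of the unified interface exercised by this test might look like the sketch below. It is illustrative only (not part of the commit) and assumes nothing beyond the headers and mysys initialization already used above; my_checksum() computes the IEEE CRC-32 and my_crc32c() the CRC-32C.

#include <my_global.h>
#include <my_sys.h>
#include <stdio.h>

int main(int argc __attribute__((unused)), char *argv[])
{
  const char buf[]= "hello, world";
  MY_INIT(argv[0]);   /* same mysys initialization as the unit test above */
  printf("%s\n", my_crc32c_implementation());
  printf("crc32  = %lu\n", (unsigned long) my_checksum(0, buf, sizeof(buf) - 1));
  printf("crc32c = %lu\n", (unsigned long) my_crc32c(0, buf, sizeof(buf) - 1));
  my_end(0);
  return 0;
}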