From bd4a90d339e21cd6ac517d077fe3a76abb5ef37d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 10 Feb 2015 14:26:24 +0200 Subject: [PATCH 1/1] Use Intel SSE4.2 CRC instructions where available. On x86, perform a runtime check to see if we're running on a CPU that supports SSE 4.2. If we are, we can use the special crc32b and crc32q instructions for the CRC-32C calculations. That greatly speeds up CRC calculation. Abhijit Menon-Sen, reviewed by Andres Freund and me. --- configure | 2 +- configure.in | 2 +- src/common/pg_crc.c | 109 +++++++++++++++++++++++++++++--- src/include/common/pg_crc.h | 12 +++- src/include/pg_config.h.in | 3 + src/include/port/crc_instructions.h | 121 ++++++++++++++++++++++++++++++++++++ 6 files changed, 235 insertions(+), 14 deletions(-) create mode 100644 src/include/port/crc_instructions.h diff --git a/configure b/configure index fa271fe..c352128 100755 --- a/configure +++ b/configure @@ -9204,7 +9204,7 @@ fi done -for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h +for ac_header in atomic.h cpuid.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h do : as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" diff --git a/configure.in b/configure.in index e6a49d1..588d626 100644 --- a/configure.in +++ b/configure.in @@ -1032,7 +1032,7 @@ AC_SUBST(UUID_LIBS) ## dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES -AC_CHECK_HEADERS([atomic.h crypt.h dld.h 
fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h]) +AC_CHECK_HEADERS([atomic.h cpuid.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h]) # On BSD, test for net/if.h will fail unless sys/socket.h # is included first. diff --git a/src/common/pg_crc.c b/src/common/pg_crc.c index eba32d3..b6db749 100644 --- a/src/common/pg_crc.c +++ b/src/common/pg_crc.c @@ -21,25 +21,113 @@ #include "common/pg_crc.h" -/* Accumulate one input byte */ -#ifdef WORDS_BIGENDIAN -#define CRC8(x) pg_crc32c_table[0][((crc >> 24) ^ (x)) & 0xFF] ^ (crc << 8) +#ifdef PG_HAVE_CRC32C_INSTRUCTIONS +static pg_crc32 pg_comp_crc32c_hw(pg_crc32 crc, const void *data, size_t len); +#endif + +#if !defined(PG_HAVE_CRC32C_INSTRUCTIONS) || defined(PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK) +static pg_crc32 pg_comp_crc32c_sb8(pg_crc32 crc, const void *data, size_t len); +static const uint32 pg_crc32c_table[8][256]; +#endif + +#ifdef PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK +/* + * When built with support for CRC instructions, but we need to perform a + * run-time check to determine whether we can actually use them, + * pg_comp_crc32c is a function pointer. It is initialized to + * pg_comp_crc32c_choose, which performs the runtime check, and changes the + * function pointer so that subsequent calls go directly to the hw-accelerated + * version, or the fallback slicing-by-8 version. 
+ */ +static pg_crc32 +pg_comp_crc32c_choose(pg_crc32 crc, const void *data, size_t len) +{ + if (pg_crc32_instructions_runtime_check()) + pg_comp_crc32c = pg_comp_crc32c_hw; + else + pg_comp_crc32c = pg_comp_crc32c_sb8; + + return pg_comp_crc32c(crc, data, len); +} + +pg_crc32 (*pg_comp_crc32c)(pg_crc32 crc, const void *data, size_t len) = pg_comp_crc32c_choose; + #else -#define CRC8(x) pg_crc32c_table[0][(crc ^ (x)) & 0xFF] ^ (crc >> 8) +/* + * No need for a runtime check. Compile directly with the hw-accelerated + * or the slicing-by-8 version. (We trust that the compiler + * is smart enough to inline it here.) + */ +pg_crc32 +pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len) +{ +#ifdef PG_HAVE_CRC32C_INSTRUCTIONS + return pg_comp_crc32c_hw(crc, data, len); +#else + return pg_comp_crc32c_sb8(crc, data, len); +#endif +} #endif +#ifdef PG_HAVE_CRC32C_INSTRUCTIONS /* - * This function computes a CRC using the slicing-by-8 algorithm, which - * uses an 8*256 lookup table to operate on eight bytes in parallel and - * recombine the results. + * This function computes CRC-32C using special-purpose CPU instructions. + */ +static pg_crc32 +pg_comp_crc32c_hw(pg_crc32 crc, const void *data, size_t len) +{ + const unsigned char *p = data; + const uint64 *p8; + + /* + * Process eight bytes of data at a time. + * + * NB: We do unaligned 8-byte accesses here. Currently, the only CRC + * instructions supported are the ones on Intel SSE 4.2, and that works + * and performs well with unaligned access. This may need to be changed + * if we get support for more architectures. + */ + p8 = (const uint64 *) p; + while (len >= 8) + { + crc = pg_asm_crc32q(crc, *p8++); + len -= 8; + } + + /* + * Handle any remaining bytes one at a time. 
+ */ + p = (const unsigned char *) p8; + while (len > 0) + { + crc = pg_asm_crc32b(crc, *p++); + len--; + } + + return crc; +} + +#endif /* PG_HAVE_CRC32C_INSTRUCTIONS */ + +#if !defined(PG_HAVE_CRC32C_INSTRUCTIONS) || defined(PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK) +/* + * Compute CRC-32C using slicing-by-8 algorithm. * * Michael E. Kounavis, Frank L. Berry, * "Novel Table Lookup-Based Algorithms for High-Performance CRC * Generation", IEEE Transactions on Computers, vol.57, no. 11, * pp. 1550-1560, November 2008, doi:10.1109/TC.2008.85 */ -pg_crc32 -pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len) + +/* Accumulate one input byte */ +#ifdef WORDS_BIGENDIAN +#define CRC8(x) pg_crc32c_table[0][((crc >> 24) ^ (x)) & 0xFF] ^ (crc << 8) +#else +#define CRC8(x) pg_crc32c_table[0][(crc ^ (x)) & 0xFF] ^ (crc >> 8) +#endif + +static pg_crc32 +pg_comp_crc32c_sb8(pg_crc32 crc, const void *data, size_t len) { const unsigned char *p = data; const uint32 *p4; @@ -113,7 +201,7 @@ pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len) * order (IOW, the tables are stored in little-endian order even on big-endian * systems). */ -const uint32 pg_crc32c_table[8][256] = { +static const uint32 pg_crc32c_table[8][256] = { #ifndef WORDS_BIGENDIAN { 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, @@ -1175,6 +1263,7 @@ const uint32 pg_crc32c_table[8][256] = { #endif /* WORDS_BIGENDIAN */ }; +#endif /* * Lookup table for calculating CRC-32 using Sarwate's algorithm. 
diff --git a/src/include/common/pg_crc.h b/src/include/common/pg_crc.h index f496659..6806f6f 100644 --- a/src/include/common/pg_crc.h +++ b/src/include/common/pg_crc.h @@ -32,6 +32,8 @@ #ifndef PG_CRC_H #define PG_CRC_H +#include "port/crc_instructions.h" + /* ugly hack to let this be used in frontend and backend code on Cygwin */ #ifdef FRONTEND #define CRCDLLIMPORT @@ -71,7 +73,11 @@ typedef uint32 pg_crc32; ((crc) = pg_comp_crc32c((crc), (data), (len))) #define EQ_CRC32C(c1, c2) ((c1) == (c2)) +#ifdef PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK +extern pg_crc32 (*pg_comp_crc32c)(pg_crc32 crc, const void *data, size_t len); +#else extern pg_crc32 pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len); +#endif /* * CRC-32, the same used e.g. in Ethernet. @@ -135,8 +141,10 @@ do { \ } \ } while (0) -/* Constant tables for CRC-32C and CRC-32 polynomials */ -extern CRCDLLIMPORT const uint32 pg_crc32c_table[8][256]; +/* + * Constant table for the CRC-32 polynomial (the tables for CRC-32C are + * static in pg_crc.c) + */ extern CRCDLLIMPORT const uint32 pg_crc32_table[256]; #endif /* PG_CRC_H */ diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index ece57c8..685ff81 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -96,6 +96,9 @@ /* Define to 1 if you have the `class' function. */ #undef HAVE_CLASS +/* Define to 1 if you have the <cpuid.h> header file. */ +#undef HAVE_CPUID_H + /* Define to 1 if you have the <crtdefs.h> header file. */ #undef HAVE_CRTDEFS_H diff --git a/src/include/port/crc_instructions.h b/src/include/port/crc_instructions.h new file mode 100644 index 0000000..85c9347 --- /dev/null +++ b/src/include/port/crc_instructions.h @@ -0,0 +1,121 @@ +/*------------------------------------------------------------------------- + * + * crc_instructions.h + * Hardware support for calculating CRCs. + * + * Some CPU architectures have special instructions for speeding up CRC + * calculations. 
This header file provides support for them in a + * reasonably platform- and compiler-independent way. + * + * This header file may define the following two preprocessor symbols, depending on + * the current platform: + * + * PG_HAVE_CRC32C_INSTRUCTIONS + * Defined if this architecture has accelerated support for CRC32C + * calculation. + * + * PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK + * Defined if the architecture has accelerated support for CRC32C + * calculation, but it's not available on all platforms that this + * binary supports. A runtime check must be performed before + * attempting to use the instructions, and a fallback implementation + * is needed. + * + * If PG_HAVE_CRC32C_INSTRUCTIONS is defined, two inline functions or macros + * are also defined: + * + * uint32 pg_asm_crc32b(uint32 crc, unsigned char data) + * Add one byte to the current crc value. + * + * uint32 pg_asm_crc32q(uint32 crc, uint64 data) + * Add eight bytes of data to the current crc value. + * + * If PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK is defined, the following + * inline function or macro is also defined: + * + * bool pg_crc32_instructions_runtime_check(void) + * Returns 'true' if the CRC instructions can be used, 'false' + * otherwise. 
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/crc_instructions.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CRC_INSTRUCTIONS_H
+#define CRC_INSTRUCTIONS_H
+
+#ifdef HAVE_CPUID_H
+#include <cpuid.h>
+#endif
+#ifdef _MSC_VER
+#include <intrin.h>
+#include <nmmintrin.h>
+#endif
+
+#if (defined(__GNUC__) && defined(__x86_64__) && defined(HAVE_CPUID_H))
+
+#define PG_HAVE_CRC32C_INSTRUCTIONS
+#define PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+
+/*
+ * Add one byte of data to the running crc value with the crc32b
+ * instruction, and return the updated crc.
+ */
+static inline uint32
+pg_asm_crc32b(uint32 crc, unsigned char data)
+{
+	__asm__ (
+		"crc32b %[data], %[crc]\n"
+: [crc] "+r" (crc)
+: [data] "rm" (data));
+	return crc;
+}
+
+/*
+ * Add eight bytes of data to the running crc value with the crc32q
+ * instruction, and return the updated crc.
+ */
+static inline uint32
+pg_asm_crc32q(uint32 crc, uint64 data)
+{
+	/*
+	 * For some strange reason, the crc32q instruction accepts and returns
+	 * the current crc value as a 64-bit integer, even though the upper 32
+	 * bits are always zeros.
+	 */
+	uint64		_crc = crc;
+	__asm__ (
+		"crc32q %[data], %[crc]\n"
+: [crc] "+r" (_crc)
+: [data] "rm" (data));
+	return (uint32) _crc;
+}
+
+/*
+ * Returns 'true' if the CRC instructions can be used on the CPU we are
+ * running on, 'false' otherwise.
+ *
+ * The answer must come from the CPUID query itself: callers use it to
+ * decide whether to execute crc32b/crc32q at all, so reporting 'true'
+ * unconditionally would make them run those instructions on CPUs that
+ * lack them.
+ */
+static inline bool
+pg_crc32_instructions_runtime_check(void)
+{
+	/*
+	 * Start from all-zeros, so that the feature bit reads as "absent" if
+	 * __get_cpuid() does not fill in the registers.
+	 */
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+
+	/*
+	 * Compare against zero explicitly: the masked value has only bit 20
+	 * set, which would be lost if it were converted to a 'bool' that is
+	 * narrower than int.
+	 */
+	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
+}
+
+#elif defined(_MSC_VER)
+
+#define PG_HAVE_CRC32C_INSTRUCTIONS
+#define PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+
+#define pg_asm_crc32b(crc, data) _mm_crc32_u8(crc, data)
+#define pg_asm_crc32q(crc, data) ((uint32) _mm_crc32_u64(crc, data))
+
+/*
+ * Returns 'true' if the CRC instructions can be used on the CPU we are
+ * running on, 'false' otherwise.
+ */
+static inline bool
+pg_crc32_instructions_runtime_check(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+	__cpuid(exx, 1);
+
+	/* Explicit comparison keeps bit 20 from being truncated away. */
+	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
+}
+
+#endif
+
+#endif   /* CRC_INSTRUCTIONS_H */
-- 
2.1.4