From ff6226aba084830887ea71d6c377b9e94ee79106 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Fri, 8 Nov 2024 14:19:59 +0900
Subject: [PATCH v10 1/2] Optimize pg_memory_is_all_zeros()

pg_memory_is_all_zeros() currently compares the memory region byte by
byte, which can become a performance penalty when multi-byte
comparisons could be done instead.

Let's provide an optimized version that divides the checks into four
phases for efficiency:
- Initial alignment (byte-by-byte comparison)
- Compare 8 size_t chunks at once using bitwise OR (candidate for SIMD
  optimization)
- Compare remaining size_t-aligned chunks
- Compare remaining bytes (byte-by-byte comparison)

Code mainly suggested by David Rowley.
---
 src/include/utils/memutils.h | 61 ++++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 3 deletions(-)
 100.0% src/include/utils/

diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 3590c8bad9..9637370838 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -190,19 +190,74 @@ extern MemoryContext BumpContextCreate(MemoryContext parent,
 #define SLAB_LARGE_BLOCK_SIZE	(8 * 1024 * 1024)
 
 /*
+ * pg_memory_is_all_zeros
+ *
  * Test if a memory region starting at "ptr" and of size "len" is full of
  * zeroes.
+ *
+ * The test is divided into multiple phases, so that it is efficient for
+ * various length values:
+ * - Byte-by-byte comparison, until the pointer is aligned.
+ * - 8 * sizeof(size_t) comparisons using bitwise OR, to encourage compilers
+ *   to use SIMD instructions if available, up to the last aligned location
+ *   possible.
+ * - size_t comparisons, with aligned pointers, up to the last aligned
+ *   location possible.
+ * - Byte-by-byte comparison, until the end location.
+ *
+ * Caller must ensure that "ptr" is not NULL.
  */
 static inline bool
 pg_memory_is_all_zeros(const void *ptr, size_t len)
 {
-	const char *p = (const char *) ptr;
+	const unsigned char *p = (const unsigned char *) ptr;
+	const unsigned char *end = &p[len];
+	const unsigned char *aligned_end = (const unsigned char *)
+		((uintptr_t) end & (~(sizeof(size_t) - 1)));
 
-	for (size_t i = 0; i < len; i++)
+	/* Compare bytes until the pointer "p" is aligned */
+	while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
 	{
-		if (p[i] != 0)
+		if (p == end)
+			return true;
+
+		if (*p++ != 0)
 			return false;
 	}
+
+	/*
+	 * Compare 8 * sizeof(size_t) chunks at once.
+	 *
+	 * For performance reasons, we manually unroll this loop and purposefully
+	 * use bitwise-ORs to combine each comparison.  This prevents boolean
+	 * short-circuiting and lets the compiler know that it's safe to access
+	 * all 8 elements regardless of the result of the other comparisons.  This
+	 * seems to be enough to coax a few compilers into using SIMD
+	 * instructions.
+	 */
+	for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
+	{
+		if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
+			(((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
+			(((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
+			(((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
+			return false;
+	}
+
+	/* Compare remaining size_t-aligned chunks */
+	for (; p < aligned_end; p += sizeof(size_t))
+	{
+		if (*(size_t *) p != 0)
+			return false;
+	}
+
+	/* Compare remaining bytes until the end */
+	while (p < end)
+	{
+		if (*p++ != 0)
+			return false;
+	}
+
 	return true;
 }
 
-- 
2.34.1
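
For reference, here is a minimal standalone sketch that exercises the new
code path outside of a PostgreSQL build.  The function body is copied, with
shortened comments, from the patched src/include/utils/memutils.h above; the
includes, main(), the "buf" array and the printed labels are illustrative
additions and not part of the patch.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Copied (comments abridged) from the patched memutils.h above. */
static inline bool
pg_memory_is_all_zeros(const void *ptr, size_t len)
{
	const unsigned char *p = (const unsigned char *) ptr;
	const unsigned char *end = &p[len];
	const unsigned char *aligned_end = (const unsigned char *)
		((uintptr_t) end & (~(sizeof(size_t) - 1)));

	/* Compare bytes until the pointer "p" is aligned */
	while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
	{
		if (p == end)
			return true;

		if (*p++ != 0)
			return false;
	}

	/* 8 * sizeof(size_t) chunks at once, combined with bitwise OR */
	for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
	{
		if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
			(((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
			(((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
			(((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
			return false;
	}

	/* Remaining size_t-aligned chunks */
	for (; p < aligned_end; p += sizeof(size_t))
	{
		if (*(size_t *) p != 0)
			return false;
	}

	/* Remaining bytes until the end */
	while (p < end)
	{
		if (*p++ != 0)
			return false;
	}

	return true;
}

int
main(void)
{
	/* Illustrative buffer, large enough to exercise all four phases. */
	static unsigned char buf[8192];

	memset(buf, 0, sizeof(buf));

	/*
	 * All-zero region starting a few bytes into the buffer, so the initial
	 * byte-wise alignment phase is typically exercised as well.
	 */
	printf("unaligned all-zero: %d\n", pg_memory_is_all_zeros(buf + 3, 4000));

	/* A non-zero byte near the end of the region must be caught. */
	buf[4000] = 1;
	printf("non-zero tail:      %d\n", pg_memory_is_all_zeros(buf + 3, 4000));

	/* Short region handled entirely by the byte-wise phases. */
	printf("short region:       %d\n", pg_memory_is_all_zeros(buf + 1, 5));

	return 0;
}

Building with optimizations enabled (for instance -O2 or higher) is what
typically lets compilers turn the manually unrolled, bitwise-OR loop into
SIMD instructions.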