From 59bbfd7ece0deb785f73a94cd53d330d86b58875 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Fri, 8 Nov 2024 14:19:59 +0900
Subject: [PATCH v13 1/3] Optimize pg_memory_is_all_zeros()

pg_memory_is_all_zeros() currently does byte-by-byte comparisons, which
can lead to performance penalties when multi-byte comparisons could be
done instead.

Let's provide an optimized version that divides the checks into multiple
cases for safety reasons and multiple phases for efficiency:

Case 1: len < 8 bytes, then byte-by-byte comparison.
Case 2: len in the 8-63 byte range:
        - Phase 1: Byte-by-byte comparison, until the pointer is aligned.
        - Phase 2: size_t comparisons, with aligned pointers, up to the last
          location possible.
        - Phase 3: Byte-by-byte comparison, until the end location.
Case 3: len >= 64 bytes, same as Case 2 except that an additional phase
        is placed between Phase 1 and Phase 2, with 8 * sizeof(size_t)
        comparisons using bitwise OR, to encourage compilers to use SIMD
        instructions if available, up to the last aligned location possible.

Case 1 and Case 2 are mandatory to ensure that we won't read beyond the
memory area.

Code mainly suggested by David Rowley.
---
 src/include/utils/memutils.h | 123 ++++++++++++++++++++++++++++++++++-
 1 file changed, 120 insertions(+), 3 deletions(-)

diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 3590c8bad9..0808e91b77 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -190,19 +190,136 @@ extern MemoryContext BumpContextCreate(MemoryContext parent,
 #define SLAB_LARGE_BLOCK_SIZE (8 * 1024 * 1024)
 
 /*
+ * pg_memory_is_all_zeros
+ *
  * Test if a memory region starting at "ptr" and of size "len" is full of
  * zeroes.
+ *
+ * The test is divided into multiple cases for safety reasons and multiple
+ * phases for efficiency.
+ *
+ * Case 1: len < 8 bytes, then byte-by-byte comparison.
+ * Case 2: len in the 8-63 byte range:
+ *         - Phase 1: Byte-by-byte comparison, until the pointer is aligned.
+ *         - Phase 2: size_t comparisons, with aligned pointers, up to the
+ *           last location possible.
+ *         - Phase 3: Byte-by-byte comparison, until the end location.
+ * Case 3: len >= 64 bytes, same as Case 2 except that an additional phase
+ *         is placed between Phase 1 and Phase 2, with 8 * sizeof(size_t)
+ *         comparisons using bitwise OR, to encourage compilers to use SIMD
+ *         instructions if available, up to the last aligned location
+ *         possible.
+ *
+ * Case 1 and Case 2 are mandatory to ensure that we won't read beyond the
+ * memory area.
+ *
+ * Caller must ensure that "ptr" is not NULL.
  */
 static inline bool
 pg_memory_is_all_zeros(const void *ptr, size_t len)
 {
-	const char *p = (const char *) ptr;
+	const unsigned char *p = (const unsigned char *) ptr;
+	const unsigned char *end = &p[len];
+	const unsigned char *aligned_end = (const unsigned char *)
+		((uintptr_t) end & (~(sizeof(size_t) - 1)));
 
-	for (size_t i = 0; i < len; i++)
+	/* len < 8 bytes case */
+	if (len < sizeof(size_t))
 	{
-		if (p[i] != 0)
+		while (p < end)
+		{
+			if (*p++ != 0)
+				return false;
+		}
+		return true;
+	}
+
+	/* len in the 8-63 byte range case */
+	if (len < sizeof(size_t) * 8)
+	{
+		/* Compare bytes until the pointer "p" is aligned */
+		while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
+		{
+			if (p == end)
+				return true;
+			if (*p++ != 0)
+				return false;
+		}
+
+		/*
+		 * Compare remaining size_t-aligned chunks.
+		 *
+		 * There is no risk of reading beyond the memory area, as aligned_end
+		 * can't be > end since we are in the 8-63 byte range case (so len
+		 * >= 8).
+		 */
+		for (; p < aligned_end; p += sizeof(size_t))
+		{
+			if (*(size_t *) p != 0)
+				return false;
+		}
+
+		/* Compare remaining bytes until the end */
+		while (p < end)
+		{
+			if (*p++ != 0)
+				return false;
+		}
+		return true;
+	}
+
+	/* len >= 64 bytes case */
+
+	/* Compare bytes until the pointer "p" is aligned */
+	while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
+	{
+		if (p == end)
+			return true;
+
+		if (*p++ != 0)
 			return false;
 	}
+
+	/*
+	 * Compare 8 * sizeof(size_t) chunks at once.
+	 *
+	 * For performance reasons, we manually unroll this loop and purposefully
+	 * use bitwise-ORs to combine each comparison. This prevents boolean
+	 * short-circuiting and lets the compiler know that it's safe to access
+	 * all 8 elements regardless of the result of the other comparisons. This
+	 * seems to be enough to coax a few compilers into using SIMD
+	 * instructions.
+	 *
+	 * There is no risk of reading beyond the memory area, as we are in the
+	 * len >= 64 bytes case.
+	 */
+	for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
+	{
+		if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
+			(((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
+			(((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
+			(((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
+			return false;
+	}
+
+	/*
+	 * Compare remaining size_t-aligned chunks.
+	 *
+	 * There is no risk of reading beyond the memory area, as aligned_end
+	 * can't be > end since we are in the len >= 64 bytes case (so len >= 8).
+	 */
+	for (; p < aligned_end; p += sizeof(size_t))
+	{
+		if (*(size_t *) p != 0)
+			return false;
+	}
+
+	/* Compare remaining bytes until the end */
+	while (p < end)
+	{
+		if (*p++ != 0)
+			return false;
+	}
+
 	return true;
 }
-- 
2.45.2
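
Not part of the patch: as a usage illustration, a hypothetical caller could
rely on the new helper to decide whether a BLCKSZ-sized buffer is entirely
zeroed. The helper name page_looks_new below is illustrative only and does
not exist in PostgreSQL; it merely sketches how the function would be called
from code built inside the PostgreSQL tree.

/*
 * Illustrative, hypothetical caller (not part of the patch): returns true
 * when a BLCKSZ-sized page buffer contains only zero bytes.
 */
#include "postgres.h"
#include "utils/memutils.h"

static bool
page_looks_new(const char *page)
{
	/* pg_memory_is_all_zeros() requires a non-NULL pointer */
	return pg_memory_is_all_zeros(page, BLCKSZ);
}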
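
Also not part of the patch: a minimal sanity-check sketch that exercises the
three length cases described above, assuming it is compiled within the
PostgreSQL tree with assertions enabled. check_all_zeros_cases is a
hypothetical name used only for this illustration.

#include "postgres.h"
#include "utils/memutils.h"

static void
check_all_zeros_cases(void)
{
	unsigned char buf[256] = {0};

	/* Case 1: len < sizeof(size_t), byte-by-byte path */
	Assert(pg_memory_is_all_zeros(buf, 4));

	/* Case 2: len in the 8-63 byte range, with an unaligned start */
	Assert(pg_memory_is_all_zeros(buf + 1, 40));

	/* Case 3: len >= 64 bytes, exercising the unrolled bitwise-OR loop */
	Assert(pg_memory_is_all_zeros(buf, sizeof(buf)));

	/* Any non-zero byte must be detected */
	buf[200] = 1;
	Assert(!pg_memory_is_all_zeros(buf, sizeof(buf)));
}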