From 2de219babe0b9010a309152a2b629edce17278b0 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Fri, 8 Nov 2024 14:19:59 +0900
Subject: [PATCH v13 1/2] Optimize pg_memory_is_all_zeros()

pg_memory_is_all_zeros() is currently doing byte per byte comparison and so
could lead to performance regression or penalties when multi bytes comparison
could be done instead.

Let's provide an optimized version that divides the checks into multiple cases
for safety reason and multiple phases for efficiency:

Case 1: len < 8 bytes, then byte per byte comparison.

Case 2: len in the 8-63 bytes range:
      - Phase 1: Byte by byte comparison, until the pointer is aligned.
      - Phase 2: size_t comparisons, with aligned pointers, up to the last
                 location possible.
      - Phase 3: Byte by byte comparison, until the end location.

Case 3: len >= 64 bytes, same as case 2 except that an additional phase is
        is placed between Phase 1 and Phase 2, with 8 * sizeof(size_t)
        comparisons using bitwise OR, to encourage compilers to use SIMD
        instructions if available, up to the last aligned location possible.

Case 1 and Case 2 are mandatory to ensure that we won't read beyond the
memory area.

Code mainly suggested by David Rowley.
---
 src/include/utils/memutils.h | 119 ++++++++++++++++++++++++++++++++++-
 1 file changed, 116 insertions(+), 3 deletions(-)
 100.0% src/include/utils/

diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 3590c8bad9..c437232162 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -190,19 +190,132 @@ extern MemoryContext BumpContextCreate(MemoryContext parent,
 #define SLAB_LARGE_BLOCK_SIZE		(8 * 1024 * 1024)
 
 /*
+ * pg_memory_is_all_zeros
+ *
  * Test if a memory region starting at "ptr" and of size "len" is full of
  * zeroes.
+ *
+ * The test is divided into multiple cases for safety reason and multiple phases
+ * for efficiency.
+ *
+ * Case 1: len < sizeof(size_t) bytes, then byte per byte comparison.
+ * Case 2: len < (sizeof(size_t) * 8 - 1) bytes:
+ *       - Phase 1: Byte by byte comparison, until the pointer is aligned.
+ *       - Phase 2: size_t comparisons, with aligned pointers, up to the last
+ *                  location possible.
+ *       - Phase 3: Byte by byte comparison, until the end location.
+ * Case 3: len >= (sizeof(size_t) * 8) bytes, same as case 2 except that an
+ *         additional phase is placed between Phase 1 and Phase 2, with
+ *         (8 * sizeof(size_t)) comparisons using bitwise OR, to encourage
+ *         compilers to use SIMD instructions if available, up to the last
+ *         aligned location possible.
+ *
+ * Case 1 and Case 2 are mandatory to ensure that we won't read beyond the
+ * memory area.  This is portable for 32-bit and 64-bit architectures.
+ *
+ * Caller must ensure that "ptr" is not NULL.
  */
 static inline bool
 pg_memory_is_all_zeros(const void *ptr, size_t len)
 {
-	const char *p = (const char *) ptr;
+	const unsigned char *p = (const unsigned char *) ptr;
+	const unsigned char *end = &p[len];
+	const unsigned char *aligned_end = (const unsigned char *)
+		((uintptr_t) end & (~(sizeof(size_t) - 1)));
+
+	if (len < sizeof(size_t))
+	{
+		while (p < end)
+		{
+			if (*p++ != 0)
+				return false;
+		}
+		return true;
+	}
+
+	/* "len" in the [sizeof(size_t), sizeof(size_t) * 8 - 1] range */
+	if (len < sizeof(size_t) * 8)
+	{
+		/* Compare bytes until the pointer "p" is aligned */
+		while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
+		{
+			if (p == end)
+				return true;
+			if (*p++ != 0)
+				return false;
+		}
+
+		/*
+		 * Compare remaining size_t-aligned chunks.
+		 *
+		 * There is no risk to read beyond the memory area, as "aligned_end"
+		 * cannot be higher than "end".
+		 */
+		for (; p < aligned_end; p += sizeof(size_t))
+		{
+			if (*(size_t *) p != 0)
+				return false;
+		}
+
+		/* Compare remaining bytes until the end */
+		while (p < end)
+		{
+			if (*p++ != 0)
+				return false;
+		}
+		return true;
+	}
+
+	/* "len" in the [sizeof(size_t) * 8, inf[ range */
 
-	for (size_t i = 0; i < len; i++)
+	/* Compare bytes until the pointer "p" is aligned */
+	while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
 	{
-		if (p[i] != 0)
+		if (p == end)
+			return true;
+
+		if (*p++ != 0)
+			return false;
+	}
+
+	/*
+	 * Compare 8 * sizeof(size_t) chunks at once.
+	 *
+	 * For performance reasons, we manually unroll this loop and purposefully
+	 * use bitwise-ORs to combine each comparison.  This prevents boolean
+	 * short-circuiting and lets the compiler know that it's safe to access
+	 * all 8 elements regardless of the result of the other comparisons.  This
+	 * seems to be enough to coax a few compilers into using SIMD
+	 * instructions.
+	 */
+	for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
+	{
+		if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
+			(((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
+			(((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
+			(((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
+			return false;
+	}
+
+	/*
+	 * Compare remaining size_t-aligned chunks.
+	 *
+	 * There is no risk to read beyond the memory area, as "aligned_end"
+	 * cannot be higher than "end".
+	 */
+	for (; p < aligned_end; p += sizeof(size_t))
+	{
+		if (*(size_t *) p != 0)
 			return false;
 	}
+
+	/* Compare remaining bytes until the end */
+	while (p < end)
+	{
+		if (*p++ != 0)
+			return false;
+	}
+
 	return true;
 }
 
-- 
2.34.1

