From 200e7752335d0f537f82a27ce8425fad8309a9fb Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Fri, 8 Nov 2024 14:19:59 +0900
Subject: [PATCH v11 1/2] Optimize pg_memory_is_all_zeros()

pg_memory_is_all_zeros() currently compares the memory area byte by byte,
which can cause performance regressions or penalties in cases where
multi-byte comparisons could be done instead.

Let's provide an optimized version that divides the checks into four phases for
efficiency:

- Initial alignment (byte per byte comparison)
- Compare 8 size_t chunks at once using bitwise OR (candidate for SIMD optimization)
- Compare remaining size_t aligned chunks
- Compare remaining bytes (byte per byte comparison)

If the memory area size is < 8 * sizeof(size_t) bytes (64 bytes on 64-bit
platforms) then only byte by byte comparison is used, to ensure that no data
beyond the memory area can be read (that's likely to be good enough for such
sizes).

Code mainly suggested by David Rowley.
---
 src/include/utils/memutils.h | 85 ++++++++++++++++++++++++++++++++++--
 1 file changed, 82 insertions(+), 3 deletions(-)
 100.0% src/include/utils/

diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 3590c8bad9..f10b0ea05e 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -190,19 +190,98 @@ extern MemoryContext BumpContextCreate(MemoryContext parent,
 #define SLAB_LARGE_BLOCK_SIZE		(8 * 1024 * 1024)
 
 /*
+ * pg_memory_is_all_zeros
+ *
  * Test if a memory region starting at "ptr" and of size "len" is full of
  * zeroes.
+ *
+ * The test is divided into multiple phases, to be efficient for various
+ * length values:
+ * - Byte by byte comparison if len < 64 to ensure that we won't read beyond the
+ *   memory area.
+ * - Byte by byte comparison, until the pointer is aligned.
+ * - 8 * sizeof(size_t) comparisons using bitwise OR, to encourage compilers
+ *   to use SIMD instructions if available, up to the last aligned location
+ *   possible.
+ * - size_t comparisons, with aligned pointers, up to the last location
+ *   possible.
+ * - Byte by byte comparison, until the end location.
+ *
+ * Caller must ensure that "ptr" is not NULL.
  */
 static inline bool
 pg_memory_is_all_zeros(const void *ptr, size_t len)
 {
-	const char *p = (const char *) ptr;
+	const unsigned char *p = (const unsigned char *) ptr;
+	const unsigned char *end = &p[len];
+	const unsigned char *aligned_end = (const unsigned char *)
+		((uintptr_t) end & (~(sizeof(size_t) - 1)));
+
+	/*
+	 * For len < 64, compare byte per byte to ensure we'll not read beyond the
+	 * memory area.
+	 */
+	if (len < sizeof(size_t) * 8)
+	{
+		while (p < end)
+		{
+			if (*p++ != 0)
+				return false;
+		}
+		return true;
+	}
+
+	/* Compare bytes until the pointer "p" is aligned */
+	while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
+	{
+		if (p == end)
+			return true;
+
+		if (*p++ != 0)
+			return false;
+	}
+
+	/*
+	 * Compare 8 * sizeof(size_t) chunks at once.
+	 *
+	 * For performance reasons, we manually unroll this loop and purposefully
+	 * use bitwise-ORs to combine each comparison.  This prevents boolean
+	 * short-circuiting and lets the compiler know that it's safe to access
+	 * all 8 elements regardless of the result of the other comparisons.  This
+	 * seems to be enough to coax a few compilers into using SIMD
+	 * instructions.
+	 *
+	 * There is no risk of reading beyond the memory area thanks to the
+	 * len < 64 check done above.
+	 */
+	for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
+	{
+		if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
+			(((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
+			(((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
+			(((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
+			return false;
+	}
 
-	for (size_t i = 0; i < len; i++)
+	/*
+	 * Compare remaining size_t-aligned chunks.
+	 *
+	 * aligned_end can't be > end as we already took care of len < 8 (in
+	 * the len < 64 check above).  So, no risk to read beyond the memory area.
+	 */
+	for (; p < aligned_end; p += sizeof(size_t))
 	{
-		if (p[i] != 0)
+		if (*(size_t *) p != 0)
 			return false;
 	}
+
+	/* Compare remaining bytes until the end */
+	while (p < end)
+	{
+		if (*p++ != 0)
+			return false;
+	}
+
 	return true;
 }
 
-- 
2.34.1

