From c609eeb1ee333018a107b15f9c3a1a699a9f661d Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Fri, 1 Nov 2024 11:46:29 +0000
Subject: [PATCH v7] Optimize pg_memory_is_all_zeros()

pg_memory_is_all_zeros() is currently doing byte per byte comparison and so
could lead to performance regression or penalties when multi bytes comparison
could be done instead.

Let's provide an optimized version that divides the checks into three phases for
efficiency:

- Initial alignment (byte per byte comparison)
- Multiple bytes comparison at once (candidate for SIMD optimization)
- Remaining bytes (byte per byte comparison)

Code mainly provided by David Rowley.

In passing, let's make use of this new version in PageIsVerifiedExtended() now
that multi bytes comparison is handled.
---
 src/backend/storage/page/bufpage.c | 13 +--------
 src/include/utils/memutils.h       | 43 +++++++++++++++++++++++++++---
 2 files changed, 41 insertions(+), 15 deletions(-)
  15.1% src/backend/storage/page/
  84.8% src/include/utils/

diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index be6f1f62d2..aa264f61b9 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -89,10 +89,8 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags)
 {
 	PageHeader	p = (PageHeader) page;
 	size_t	   *pagebytes;
-	int			i;
 	bool		checksum_failure = false;
 	bool		header_sane = false;
-	bool		all_zeroes = false;
 	uint16		checksum = 0;
 
 	/*
@@ -126,18 +124,9 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags)
 	}
 
 	/* Check all-zeroes case */
-	all_zeroes = true;
 	pagebytes = (size_t *) page;
-	for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
-	{
-		if (pagebytes[i] != 0)
-		{
-			all_zeroes = false;
-			break;
-		}
-	}
 
-	if (all_zeroes)
+	if (pg_memory_is_all_zeros(pagebytes, BLCKSZ))
 		return true;
 
 	/*
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 3590c8bad9..8bce3c6141 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -192,17 +192,54 @@ extern MemoryContext BumpContextCreate(MemoryContext parent,
 /*
  * Test if a memory region starting at "ptr" and of size "len" is full of
  * zeroes.
+ *
+ * The test is divided into three phases for efficiency:
+ *  - Initial alignment (byte per byte comparison)
+ *  - Multiple bytes comparison at once
+ *  - Remaining bytes (byte per byte comparison)
+ *
+ * Caller must ensure that "ptr" is non-null.
  */
 static inline bool
 pg_memory_is_all_zeros(const void *ptr, size_t len)
 {
-	const char *p = (const char *) ptr;
+	const unsigned char *p = (const unsigned char *) ptr;
+	const unsigned char *end = &p[len];
+	const unsigned char *aligned_end = (const unsigned char *) ((uintptr_t) end & (~(sizeof(size_t) - 1)));
+
+	/* Compare bytes, byte by byte, until the pointer "p" is aligned */
+	while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
+	{
+		if (p == end)
+			return true;
+
+		if (*p++ != 0)
+			return false;
+	}
+
+	/*
+	 * Multiple bytes comparison(s) at once.
+	 *
+	 * The logic here is that all comparisons can be done in parallel and
+	 * combined with a single OR operation, making it a good candidate for
+	 * SIMD optimization.
+	 */
+	for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
+	{
+		if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
+			(((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
+			(((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
+			(((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
+			return false;
+	}
 
-	for (size_t i = 0; i < len; i++)
+	/* Compare remaining bytes, byte by byte */
+	while (p < end)
 	{
-		if (p[i] != 0)
+		if (*p++ != 0)
 			return false;
 	}
+
 	return true;
 }
 
-- 
2.34.1

