From 2118edc30758b2e23d11c5492699c17642a00902 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 28 Oct 2024 18:03:53 -0400
Subject: [PATCH v2 14/15] WIP: bufmgr: Detect some bad buffer accesses

I wrote this mainly to ensure that I did not miss converting any hint bit sets
to BufferPrepareToSetHintBits(), but it seems like it might be more generally
useful.

If we do want to include it, it needs a bit more polishing.

On my workstation, the performance effect of this test infrastructure is as
follows:

base:
real	1m4.613s
user	4m31.409s
sys	3m20.445s

ENFORCE_BUFFER_PROT:
real	1m11.912s
user	4m27.332s
sys	3m28.063s

ENFORCE_BUFFER_PROT + ENFORCE_BUFFER_PROT_READ:
real	1m33.455s
user	4m32.188s
sys	3m41.275s
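
For illustration, the calling pattern this is meant to enforce looks roughly
like the following (a minimal sketch; the caller and the PD_SOME_HINT flag are
made up, the Prepare/Finish functions are the ones from this patch series):

    if (BufferPrepareToSetHintBits(buffer))
    {
        /* page is temporarily writable without an exclusive content lock */
        ((PageHeader) BufferGetPage(buffer))->pd_flags |= PD_SOME_HINT;
        BufferFinishSetHintBits(buffer, true, true);
    }

With ENFORCE_BUFFER_PROT defined, modifying the page without holding an
exclusive content lock or having called BufferPrepareToSetHintBits() first
faults immediately instead of going unnoticed; ENFORCE_BUFFER_PROT_READ
additionally makes unpinned buffers inaccessible.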

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/pg_config_manual.h        |  24 ++++
 src/include/storage/buf_internals.h   |   9 ++
 src/include/storage/pg_shmem.h        |   2 +
 src/backend/storage/buffer/buf_init.c |  10 ++
 src/backend/storage/buffer/bufmgr.c   | 173 ++++++++++++++++++++++++--
 src/backend/utils/misc/guc_tables.c   |   2 +-
 6 files changed, 208 insertions(+), 12 deletions(-)

diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index e49eb13e43c..8ea96bd1f1b 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -361,3 +361,27 @@
  * Enable tracing of syncscan operations (see also the trace_syncscan GUC var).
  */
 /* #define TRACE_SYNCSCAN */
+
+
+/*
+ * On some operating systems we know how to change whether pages are
+ * readable/writable. We can use that to verify that we are following buffer
+ * locking rules, by making pages inaccessible or read-only when we don't
+ * hold sufficient locks etc. This obviously is fairly expensive, so by
+ * default we only do so in assert-enabled builds.
+ */
+#if defined(USE_ASSERT_CHECKING) && !defined(WIN32)
+#define ENFORCE_BUFFER_PROT
+#endif
+
+/*
+ * Protecting pages against being modified without an exclusive lock /
+ * BufferPrepareToSetHintBits() is reasonably cheap, as neither happens *that*
+ * often. Pinning/unpinning buffers is a lot more common, making it more
+ * expensive to call mprotect() that often.
+ *
+ * Therefore, disable this by default, even in assert-enabled builds.
+ */
+#ifdef ENFORCE_BUFFER_PROT
+/* #define ENFORCE_BUFFER_PROT_READ */
+#endif
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index a8dbcec666c..2fd451e0329 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -488,6 +488,15 @@ extern Size StrategyShmemSize(void);
 extern void StrategyInitialize(bool init);
 extern bool have_free_buffer(void);
 
+#ifdef ENFORCE_BUFFER_PROT
+extern void SetBufferProtection(Buffer buf, bool allow_reads, bool allow_writes);
+#else
+static inline void
+SetBufferProtection(Buffer buf, bool allow_reads, bool allow_writes)
+{
+}
+#endif
+
 /* buf_table.c */
 extern Size BufTableShmemSize(int size);
 extern void InitBufTable(int size);
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 3065ff5be71..42a12688b9a 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -63,6 +63,8 @@ typedef enum
 	SHMEM_TYPE_MMAP,
 }			PGShmemType;
 
+extern PGDLLIMPORT int huge_pages_status;
+
 #ifndef WIN32
 extern PGDLLIMPORT unsigned long UsedShmemSegID;
 #else
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 56761a8eedc..0e02d8e1e9b 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -135,6 +135,16 @@ BufferManagerShmemInit(void)
 							 LWTRANCHE_BUFFER_CONTENT);
 
 			ConditionVariableInit(BufferDescriptorGetIOCV(buf));
+
+			/*
+			 * Mark unused buffers as inaccessible. But if we're not enforcing
+			 * that buffers are inaccessible while unpinned, we won't mark
+			 * buffers as accessible during pinning, so we'd better not make
+			 * them initially inaccessible either.
+			 */
+#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ)
+			SetBufferProtection(i + 1, false, false);
+#endif
 		}
 
 		/* Correct last entry of linked list */
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index a91e4a7eff5..ee8078fc06d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -37,6 +37,10 @@
 #include <sys/file.h>
 #include <unistd.h>
 
+#ifdef ENFORCE_BUFFER_PROT
+#include <sys/mman.h>			/* for mprotect() */
+#endif							/* ENFORCE_BUFFER_PROT */
+
 #include "access/tableam.h"
 #include "access/xloginsert.h"
 #include "access/xlogutils.h"
@@ -53,6 +57,7 @@
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
+#include "storage/pg_shmem.h"
 #include "storage/proc.h"
 #include "storage/read_stream.h"
 #include "storage/smgr.h"
@@ -1065,8 +1070,6 @@ ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
 
 	if (need_to_zero)
 	{
-		memset(BufferGetPage(buffer), 0, BLCKSZ);
-
 		/*
 		 * Grab the buffer content lock before marking the page as valid, to
 		 * make sure that no other backend sees the zeroed page before the
@@ -1079,7 +1082,12 @@ ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
 		 * already valid.)
 		 */
 		if (!isLocalBuf)
+		{
 			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
+			SetBufferProtection(buffer, true, true);
+		}
+
+		memset(BufferGetPage(buffer), 0, BLCKSZ);
 
 		if (isLocalBuf)
 		{
@@ -1505,6 +1513,9 @@ WaitReadBuffers(ReadBuffersOperation *operation)
 		io_first_block = blocknum + i;
 		io_buffers_len = 1;
 
+		if (persistence != RELPERSISTENCE_TEMP)
+			SetBufferProtection(io_buffers[0], true, true);
+
 		/*
 		 * How many neighboring-on-disk blocks can we scatter-read into other
 		 * buffers at the same time?  In this case we don't wait if we see an
@@ -1520,7 +1531,13 @@ WaitReadBuffers(ReadBuffersOperation *operation)
 				   BufferGetBlockNumber(buffers[i]) + 1);
 
 			io_buffers[io_buffers_len] = buffers[++i];
-			io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
+			io_pages[io_buffers_len] = BufferGetBlock(buffers[i]);
+
+			/* smgrreadv() needs to modify the buffer */
+			if (persistence != RELPERSISTENCE_TEMP)
+				SetBufferProtection(io_buffers[io_buffers_len], true, true);
+
+			io_buffers_len++;
 		}
 
 		io_start = pgstat_prepare_io_time(track_io_timing);
@@ -1580,6 +1597,9 @@ WaitReadBuffers(ReadBuffersOperation *operation)
 				TerminateBufferIO(bufHdr, false, BM_VALID, true);
 			}
 
+			if (persistence != RELPERSISTENCE_TEMP)
+				SetBufferProtection(io_buffers[j], true, false);
+
 			/* Report I/Os as completing individually. */
 			TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, io_first_block + j,
 											  operation->smgr->smgr_rlocator.locator.spcOid,
@@ -2233,7 +2253,9 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
 		buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
 
 		/* new buffers are zero-filled */
+		SetBufferProtection(buffers[i], true, true);
 		MemSet((char *) buf_block, 0, BLCKSZ);
+		SetBufferProtection(buffers[i], true, false);
 	}
 
 	/*
@@ -2460,7 +2482,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
 		}
 
 		if (lock)
-			LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
+			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
 		TerminateBufferIO(buf_hdr, false, BM_VALID, true);
 	}
@@ -2719,6 +2741,10 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
 				 * non-accessible in any case.
 				 */
 				VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
+
+#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ)
+				SetBufferProtection(BufferDescriptorGetBuffer(buf), true, false);
+#endif
 				break;
 			}
 		}
@@ -2791,6 +2817,10 @@ PinBuffer_Locked(BufferDesc *buf)
 	 */
 	VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
 
+#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ)
+	SetBufferProtection(BufferDescriptorGetBuffer(buf), true, false);
+#endif
+
 	/*
 	 * Since we hold the buffer spinlock, we can update the buffer state and
 	 * release the lock in one operation.
@@ -2850,6 +2880,10 @@ UnpinBufferNoOwner(BufferDesc *buf)
 		 */
 		VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
 
+#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ)
+		SetBufferProtection(BufferDescriptorGetBuffer(buf), false, false);
+#endif
+
 		/* I'd better not still hold the buffer content lock */
 		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
 
@@ -3670,6 +3704,69 @@ CheckForBufferLeaks(void)
 #endif
 }
 
+/*
+ * To verify that we are following buffer locking rules, we can make pages
+ * inaccessible or read-only when we don't have sufficient locks etc.
+ *
+ * XXX: It might be possible to fold the VALGRIND_MAKE_MEM_NOACCESS() /
+ * VALGRIND_MAKE_MEM_DEFINED() calls into this.
+ */
+#ifdef ENFORCE_BUFFER_PROT
+void
+SetBufferProtection(Buffer buf, bool allow_reads, bool allow_writes)
+{
+	static long	pagesz = 0;
+	int			prot = PROT_NONE;
+	int			rc;
+
+	Assert(huge_pages_status != HUGE_PAGES_UNKNOWN &&
+		   huge_pages_status != HUGE_PAGES_TRY);
+	StaticAssertStmt(PROT_NONE == 0, "that can't be right");
+
+	if (unlikely(pagesz == 0))
+	{
+		pagesz = sysconf(_SC_PAGESIZE);
+
+		elog(DEBUG1, "sysconf(_SC_PAGESIZE) = %ld", pagesz);
+		if (pagesz == -1)
+		{
+			elog(ERROR, "sysconf(_SC_PAGESIZE) failed: %m");
+		}
+		else if (pagesz > BLCKSZ)
+		{
+			elog(DEBUG1, "pagesz > BLCKSZ, disabling buffer protection mode");
+			pagesz = -1;
+		}
+		else if (BLCKSZ % pagesz != 0)
+		{
+			elog(DEBUG1, "BLCKSZ %% pagesz != 0, disabling buffer protection mode");
+			pagesz = -1;
+		}
+		else if (huge_pages_status == HUGE_PAGES_ON)
+		{
+			/* can't set status in a granular enough way */
+			elog(DEBUG1, "huge pages enabled, disabling buffer protection mode");
+			pagesz = -1;
+		}
+	}
+
+	/* disabled */
+	if (pagesz == -1)
+		return;
+
+	if (allow_reads)
+		prot |= PROT_READ;
+
+	if (allow_writes)
+		prot |= PROT_WRITE;
+
+	rc = mprotect(BufferGetBlock(buf), BLCKSZ, prot);
+
+	if (rc != 0)
+		elog(ERROR, "mprotect(%d, %d) failed: %m", buf, prot);
+}
+#endif							/* ENFORCE_BUFFER_PROT */
+
 /*
  * Helper routine to issue warnings when a buffer is unexpectedly pinned
  */
@@ -3865,7 +3962,10 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	bufBlock = BufHdrGetBlock(buf);
 
 	/* Update page checksum if desired. */
+	SetBufferProtection(BufferDescriptorGetBuffer(buf), true, true);
 	PageSetChecksum((Page) bufBlock, buf->tag.blockNum);
+	/* FIXME: could theoretically be exclusively locked */
+	SetBufferProtection(BufferDescriptorGetBuffer(buf), true, false);
 
 	io_start = pgstat_prepare_io_time(track_io_timing);
 
@@ -5193,19 +5293,38 @@ void
 LockBuffer(Buffer buffer, int mode)
 {
 	BufferDesc *buf;
+	LWLock	   *content_lock;
 
 	Assert(BufferIsPinned(buffer));
 	if (BufferIsLocal(buffer))
 		return;					/* local buffers need no lock */
 
 	buf = GetBufferDescriptor(buffer - 1);
+	content_lock = BufferDescriptorGetContentLock(buf);
 
 	if (mode == BUFFER_LOCK_UNLOCK)
-		LWLockRelease(BufferDescriptorGetContentLock(buf));
+	{
+#ifdef ENFORCE_BUFFER_PROT
+		bool		was_exclusive;
+
+		was_exclusive = LWLockHeldByMeInMode(content_lock, LW_EXCLUSIVE);
+#endif							/* ENFORCE_BUFFER_PROT */
+
+		LWLockRelease(content_lock);
+
+#ifdef ENFORCE_BUFFER_PROT
+		if (was_exclusive)
+			SetBufferProtection(buffer, true, false);
+#endif							/* ENFORCE_BUFFER_PROT */
+	}
 	else if (mode == BUFFER_LOCK_SHARE)
-		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
+		LWLockAcquire(content_lock, LW_SHARED);
 	else if (mode == BUFFER_LOCK_EXCLUSIVE)
-		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
+	{
+		LWLockAcquire(content_lock, LW_EXCLUSIVE);
+
+		SetBufferProtection(buffer, true, true);
+	}
 	else
 		elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
@@ -5219,6 +5338,7 @@ bool
 ConditionalLockBuffer(Buffer buffer)
 {
 	BufferDesc *buf;
+	bool		ret;
 
 	Assert(BufferIsPinned(buffer));
 	if (BufferIsLocal(buffer))
@@ -5226,8 +5346,13 @@ ConditionalLockBuffer(Buffer buffer)
 
 	buf = GetBufferDescriptor(buffer - 1);
 
-	return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
-									LW_EXCLUSIVE);
+	ret = LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+								   LW_EXCLUSIVE);
+
+	if (ret)
+		SetBufferProtection(buffer, true, true);
+
+	return ret;
 }
 
 /*
@@ -6336,6 +6461,11 @@ BufferPrepareToSetHintBits(Buffer buffer)
 		{
 			ResourceOwnerRememberBufferSettingHints(CurrentResourceOwner, buffer);
 
+#ifdef ENFORCE_BUFFER_PROT
+			if (!LWLockHeldByMeInMode(BufferDescriptorGetContentLock(desc), LW_EXCLUSIVE))
+				SetBufferProtection(buffer, true, true);
+#endif							/* ENFORCE_BUFFER_PROT */
+
 			return true;
 		}
 	}
@@ -6411,6 +6541,11 @@ BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std)
 	 * know if there is somebody waiting.
 	 */
 	ConditionVariableBroadcast(BufferDescriptorGetIOCV(desc));
+
+#ifdef ENFORCE_BUFFER_PROT
+	if (!LWLockHeldByMeInMode(BufferDescriptorGetContentLock(desc), LW_EXCLUSIVE))
+		SetBufferProtection(buffer, true, false);
+#endif							/* ENFORCE_BUFFER_PROT */
 }
 
 /*
@@ -6457,14 +6592,25 @@ BufferSetHintBits16(Buffer buffer, uint16 *ptr, uint16 val)
 	}
 	else
 	{
+#ifdef ENFORCE_BUFFER_PROT
+		bool exclusive;
+#endif
+
 		desc = GetBufferDescriptor(buffer - 1);
+
+#ifdef ENFORCE_BUFFER_PROT
+		exclusive = LWLockHeldByMeInMode(BufferDescriptorGetContentLock(desc),
+										 LW_EXCLUSIVE);
+
+		if (!exclusive)
+			SetBufferProtection(buffer, true, true);
+#endif
+
 		buf_state = LockBufHdr(desc);
 
 		if (likely((buf_state & BM_IO_IN_PROGRESS) == 0))
 		{
-
 			*ptr = val;
-
 			did_set = true;
 		}
 
@@ -6473,5 +6619,10 @@ BufferSetHintBits16(Buffer buffer, uint16 *ptr, uint16 val)
 		is_dirty = (buf_state & (BM_DIRTY | BM_JUST_DIRTIED)) == (BM_DIRTY | BM_JUST_DIRTIED);
 		if (did_set && !is_dirty)
 			MarkBufferDirtyHintImpl(buffer, true, true);
+
+#ifdef ENFORCE_BUFFER_PROT
+		if (!exclusive)
+			SetBufferProtection(buffer, true, false);
+#endif
 	}
 }
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 8a67f01200c..4c94c20f0cb 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -561,7 +561,7 @@ static int	ssl_renegotiation_limit;
  */
 int			huge_pages = HUGE_PAGES_TRY;
 int			huge_page_size;
-static int	huge_pages_status = HUGE_PAGES_UNKNOWN;
+int			huge_pages_status = HUGE_PAGES_UNKNOWN;
 
 /*
  * These variables are all dummies that don't do anything, except in some
-- 
2.45.2.746.g06e570c0df.dirty

