From ab2a7ea025a32053f6f201d14ee764a235c480ce Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Thu, 17 Oct 2024 13:59:13 -0400
Subject: [PATCH v1 08/14] bufmgr: Add interface to acquire right to set hint
 bits

At the moment hint bits can be set with just a share lock on a page (and in
one place even without any lock). Because of this we need to copy pages while
writing them out, as otherwise the checksum could be corrupted.

The need to copy the page is problematic for the AIO patchset:

1) Instead of just needing a single buffer for a copied page we need one for
   each page that's potentially undergoing IO
2) To be able to use the "worker" AIO implementation the copied page needs to
   reside in shared memory.

Even without AIO copying the page isn't free...

This commit starts to address that by adding BufferPrepareToSetHintBits(),
which needs to be called before setting hint bits on a
buffer. BufferPrepareToSetHintBits() only allows hint bit writes if there's no
ongoing IO and while hint bits are being set no IO is allowed to be started.

To know that a buffer is undergoing IO a new BufferDesc state flag is used,
BM_SETTING_HINTS. Theoretically it'd be possible to reuse BM_IO_IN_PROGRESS,
but that'd make it harder to debug the system.

The new interface is not yet used, that will happen in subsequent commits, to
make review a bit easier. Therefore we cannot yet rely on not needing a copy
of the buffer during IO.

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/storage/buf_internals.h           |  22 ++-
 src/include/storage/bufmgr.h                  |   3 +
 src/backend/storage/buffer/README             |   6 +-
 src/backend/storage/buffer/bufmgr.c           | 177 +++++++++++++++++-
 .../utils/activity/wait_event_names.txt       |   1 +
 5 files changed, 198 insertions(+), 11 deletions(-)

diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index d8444651736..1f8a0482e90 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -31,8 +31,8 @@
  * Buffer state is a single 32-bit variable where following data is combined.
  *
  * - 18 bits refcount
- * - 4 bits usage count
- * - 10 bits of flags
+ * - 3 bits usage count
+ * - 11 bits of flags
  *
  * Combining these values allows to perform some operations without locking
  * the buffer header, by modifying them together with a CAS loop.
@@ -40,8 +40,8 @@
  * The definition of buffer state components is below.
  */
 #define BUF_REFCOUNT_BITS 18
-#define BUF_USAGECOUNT_BITS 4
-#define BUF_FLAG_BITS 10
+#define BUF_USAGECOUNT_BITS 3
+#define BUF_FLAG_BITS 11
 
 StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32,
 				 "buffer bit counts need to equal 32");
@@ -63,6 +63,7 @@ StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32,
  * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
  * entry associated with the buffer's tag.
  */
+#define BM_SETTING_HINTS		(1U << 21)	/* hint bits are being set */
 #define BM_LOCKED				(1U << 22)	/* buffer header is locked */
 #define BM_DIRTY				(1U << 23)	/* data needs writing */
 #define BM_VALID				(1U << 24)	/* data is valid */
@@ -396,6 +397,7 @@ extern PGDLLIMPORT CkptSortItem *CkptBufferIds;
 /* ResourceOwner callbacks to hold buffer I/Os and pins */
 extern PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc;
 extern PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc;
+extern PGDLLIMPORT const ResourceOwnerDesc buffer_setting_hints_resowner_desc;
 
 /* Convenience wrappers over ResourceOwnerRemember/Forget */
 static inline void
@@ -418,6 +420,18 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
 {
 	ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
 }
+static inline void
+ResourceOwnerRememberBufferSettingHints(ResourceOwner owner, Buffer buffer)
+{
+	ResourceOwnerRemember(owner, Int32GetDatum(buffer),
+						  &buffer_setting_hints_resowner_desc);
+}
+static inline void
+ResourceOwnerForgetBufferSettingHints(ResourceOwner owner, Buffer buffer)
+{
+	ResourceOwnerForget(owner, Int32GetDatum(buffer),
+						&buffer_setting_hints_resowner_desc);
+}
 
 /*
  * Internal buffer management routines
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ded46a57889..a37872377bc 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -300,6 +300,9 @@ extern void LimitAdditionalLocalPins(uint32 *additional_pins);
 
 extern bool EvictUnpinnedBuffer(Buffer buf);
 
+extern bool BufferPrepareToSetHintBits(Buffer buf);
+extern void BufferFinishSetHintBits(Buffer buf);
+
 /* in buf_init.c */
 extern void BufferManagerShmemInit(void);
 extern Size BufferManagerShmemSize(void);
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index 011af7aff3e..756ad542910 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -58,7 +58,8 @@ tuple while they are doing visibility checks.
 4. It is considered OK to update tuple commit status bits (ie, OR the
 values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, HEAP_XMAX_COMMITTED, or
 HEAP_XMAX_INVALID into t_infomask) while holding only a shared lock and
-pin on a buffer.  This is OK because another backend looking at the tuple
+pin on a buffer after calling BufferPrepareToSetHintBits().
+This is OK because another backend looking at the tuple
 at about the same time would OR the same bits into the field, so there
 is little or no risk of conflicting update; what's more, if there did
 manage to be a conflict it would merely mean that one bit-update would
@@ -68,6 +69,9 @@ great harm is done if they get reset to zero by conflicting updates.
 Note, however, that a tuple is frozen by setting both HEAP_XMIN_INVALID
 and HEAP_XMIN_COMMITTED; this is a critical update and accordingly requires
 an exclusive buffer lock (and it must also be WAL-logged).
+Calling BufferPrepareToSetHintBits() ensures that write IO cannot happen at
+the same time as modifying hint bits, which can lead the checksum computed at
+the start of the write to not match anymore.
 
 5. To physically remove a tuple or compact free space on a page, one
 must hold a pin and an exclusive lock, *and* observe while holding the
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f56f7f9f441..ee8b898121c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -222,6 +222,8 @@ static void ResOwnerReleaseBufferIO(Datum res);
 static char *ResOwnerPrintBufferIO(Datum res);
 static void ResOwnerReleaseBufferPin(Datum res);
 static char *ResOwnerPrintBufferPin(Datum res);
+static void ResOwnerReleaseBufferSettingHints(Datum res);
+static char *ResOwnerPrintBufferSettingHints(Datum res);
 
 const ResourceOwnerDesc buffer_io_resowner_desc =
 {
@@ -241,6 +243,15 @@ const ResourceOwnerDesc buffer_pin_resowner_desc =
 	.DebugPrint = ResOwnerPrintBufferPin
 };
 
+const ResourceOwnerDesc buffer_setting_hints_resowner_desc =
+{
+	.name = "buffer setting hints",
+	.release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
+	.release_priority = RELEASE_PRIO_BUFFER_IOS,
+	.ReleaseResource = ResOwnerReleaseBufferSettingHints,
+	.DebugPrint = ResOwnerPrintBufferSettingHints
+};
+
 /*
  * Ensure that the PrivateRefCountArray has sufficient space to store one more
  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
@@ -1737,7 +1748,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
 	/* some sanity checks while we hold the buffer header lock */
 	Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
-	Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
+	Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY
+								 | BM_IO_IN_PROGRESS | BM_SETTING_HINTS)));
 
 	victim_buf_hdr->tag = newTag;
 
@@ -2092,7 +2104,8 @@ again:
 	buf_state = pg_atomic_read_u32(&buf_hdr->state);
 
 	Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
-	Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
+	Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY
+						  | BM_IO_IN_PROGRESS | BM_SETTING_HINTS)));
 
 	CheckBufferIsPinnedOnce(buf);
 #endif
@@ -5544,7 +5557,8 @@ BufferLockHeldByMe(Buffer buffer, int mode)
  */
 
 /*
- * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
+ * WaitIO -- Block until the IO_IN_PROGRESS and SETTING_HINTS flags on 'buf'
+ * are cleared.
  */
 static void
 WaitIO(BufferDesc *buf)
@@ -5555,6 +5569,7 @@ WaitIO(BufferDesc *buf)
 	for (;;)
 	{
 		uint32		buf_state;
+		uint32		wait_event;
 
 		/*
 		 * It may not be necessary to acquire the spinlock to check the flag
@@ -5564,9 +5579,21 @@ WaitIO(BufferDesc *buf)
 		buf_state = LockBufHdr(buf);
 		UnlockBufHdr(buf, buf_state);
 
-		if (!(buf_state & BM_IO_IN_PROGRESS))
+		if (likely((buf_state & (BM_IO_IN_PROGRESS | BM_SETTING_HINTS)) == 0))
 			break;
-		ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
+		else if (buf_state & BM_IO_IN_PROGRESS)
+		{
+			Assert(!(buf_state & BM_SETTING_HINTS));
+			wait_event = WAIT_EVENT_BUFFER_IO;
+		}
+		else
+		{
+			Assert(buf_state & BM_SETTING_HINTS);
+			Assert(!(buf_state & BM_IO_IN_PROGRESS));
+			wait_event = WAIT_EVENT_BUFFER_SETTING_HINTS;
+		}
+
+		ConditionVariableSleep(cv, wait_event);
 	}
 	ConditionVariableCancelSleep();
 }
@@ -5606,7 +5633,7 @@ StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
 	{
 		buf_state = LockBufHdr(buf);
 
-		if (!(buf_state & BM_IO_IN_PROGRESS))
+		if (!(buf_state & (BM_IO_IN_PROGRESS | BM_SETTING_HINTS)))
 			break;
 		UnlockBufHdr(buf, buf_state);
 		if (nowait)
@@ -5623,6 +5650,8 @@ StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
 		return false;
 	}
 
+	Assert(!(buf_state & BM_SETTING_HINTS));
+
 	buf_state |= BM_IO_IN_PROGRESS;
 	UnlockBufHdr(buf, buf_state);
 
@@ -5661,6 +5690,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
 	buf_state = LockBufHdr(buf);
 
 	Assert(buf_state & BM_IO_IN_PROGRESS);
+	Assert(!(buf_state & BM_SETTING_HINTS));
 
 	buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
 	if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
@@ -5697,6 +5727,7 @@ AbortBufferIO(Buffer buffer)
 
 	buf_state = LockBufHdr(buf_hdr);
 	Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
+	Assert(!(buf_state & BM_SETTING_HINTS));
 
 	if (!(buf_state & BM_VALID))
 	{
@@ -6122,6 +6153,23 @@ ResOwnerPrintBufferPin(Datum res)
 	return DebugPrintBufferRefcount(DatumGetInt32(res));
 }
 
+
+static void
+ResOwnerReleaseBufferSettingHints(Datum res)
+{
+	Buffer		buffer = DatumGetInt32(res);
+
+	BufferFinishSetHintBits(buffer);
+}
+
+static char *
+ResOwnerPrintBufferSettingHints(Datum res)
+{
+	Buffer		buffer = DatumGetInt32(res);
+
+	return psprintf("lost track of setting hint bits on buffer %d", buffer);
+}
+
 /*
  * Try to evict the current block in a shared buffer.
  *
@@ -6184,3 +6232,120 @@ EvictUnpinnedBuffer(Buffer buf)
 
 	return result;
 }
+
+/*
+ * Try to acquire the right to set hint bits for buf.
+ *
+ * It's only permitted to set hint bits in a buffer if the buffer is not
+ * undergoing IO. Otherwise the page level checksum could be corrupted. This
+ * could happen both for PG's checksum and on the OS/filesystem
+ * level. E.g. btrfs with direct-io relies on the page to not change while IO
+ * is going on.
+ *
+ * We can't just check if IO going on at the time BufferPrepareToSetHintBits()
+ * is called, we also need to block IO from starting before we're done setting
+ * hints. This is achieved by setting BM_SETTING_HINTS for the buffer and
+ * having StartBufferIO()/WaitIO() wait for that.  We could combine
+ * BM_SETTING_HINTS and BM_IO_IN_PROGRESS into one, as they can never be set
+ * at the same time, but that seems unnecessarily confusing.
+ *
+ * Because we use only a single bit (BM_SETTING_HINTS) to track whether hint
+ * bits are currently being set, we cannot allow multiple backends to set
+ * hints at the same time - it'd be unknown whether BM_SETTING_HINTS would
+ * need to be remain set when a backend finishes setting hint bits.  In almost
+ * all situations the two backends would just set the same hint bits anyway,
+ * so this is unlikely to be a problem.
+ *
+ * If allowed to set hint bits, the caller needs to call
+ * BufferFinishSetHintBits() once done setting hint bits. In case of an error
+ * occuring before BufferFinishSetHintBits() is reached, the cleanup will be
+ * done via resowner.c.
+ *
+ * It can make sense to amortize the cost of BufferPrepareToSetHintBits() +
+ * BufferFinishSetHintBits() over multiple hint bit sets on a page.
+ *
+ * Returns whether caller is allowed to set hint bits.
+ */
+bool
+BufferPrepareToSetHintBits(Buffer buf)
+{
+	BufferDesc *desc;
+	uint32		old_buf_state;
+
+	Assert(BufferIsPinned(buf));
+
+	if (BufferIsLocal(buf))
+		return true;
+
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+
+	desc = GetBufferDescriptor(buf - 1);
+
+	/*
+	 * We could use LockBufHdr() instead, but a CAS loop turns out to be
+	 * slightly faster and has better concurrency behaviour due to not
+	 * blocking other backends.
+	 */
+	old_buf_state = pg_atomic_read_u32(&desc->state);
+
+	for (;;)
+	{
+		uint32		new_buf_state;
+
+		if (unlikely(old_buf_state & BM_LOCKED))
+			old_buf_state = WaitBufHdrUnlocked(desc);
+
+		if (unlikely((old_buf_state & (BM_SETTING_HINTS | BM_IO_IN_PROGRESS)) != 0))
+			return false;
+
+		new_buf_state = old_buf_state;
+
+		new_buf_state |= BM_SETTING_HINTS;
+
+		if (pg_atomic_compare_exchange_u32(&desc->state, &old_buf_state,
+										   new_buf_state))
+		{
+			ResourceOwnerRememberBufferSettingHints(CurrentResourceOwner, buf);
+			return true;
+		}
+	}
+}
+
+/*
+ * Release the permission to set hint bits on the current page.
+ */
+void
+BufferFinishSetHintBits(Buffer buf)
+{
+	BufferDesc *desc;
+	uint32		buf_state;
+	uint32		new_buf_state;
+
+	if (BufferIsLocal(buf))
+		return;
+
+	desc = GetBufferDescriptor(buf - 1);
+
+	buf_state = LockBufHdr(desc);
+	new_buf_state = buf_state;
+	new_buf_state &= ~BM_SETTING_HINTS;
+	UnlockBufHdr(desc, new_buf_state);
+
+	Assert(buf_state & (BM_TAG_VALID));
+	Assert(buf_state & (BM_VALID));
+	Assert(buf_state & (BM_SETTING_HINTS));
+	Assert(!(buf_state & (BM_IO_IN_PROGRESS)));
+
+	ResourceOwnerForgetBufferSettingHints(CurrentResourceOwner, buf);
+
+	/*
+	 * Wake everyone that might be waiting for a chance to perform IO (i.e.
+	 * WaitIO()).
+	 *
+	 * XXX: Right now this is somewhat expensive, due to
+	 * ConditionVariableBroadcast() acquiring its spinlock unconditionally. I
+	 * don't see a good way to avoid that from the bufmgr.c side, we don't
+	 * know if there is somebody waiting.
+	 */
+	ConditionVariableBroadcast(BufferDescriptorGetIOCV(desc));
+}
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 8efb4044d6f..a1222a1cfbb 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -111,6 +111,7 @@ BGWORKER_SHUTDOWN	"Waiting for background worker to shut down."
 BGWORKER_STARTUP	"Waiting for background worker to start up."
 BTREE_PAGE	"Waiting for the page number needed to continue a parallel B-tree scan to become available."
 BUFFER_IO	"Waiting for buffer I/O to complete."
+BUFFER_SETTING_HINTS	"Waiting for hint bit setting to complete."
 CHECKPOINT_DELAY_COMPLETE	"Waiting for a backend that blocks a checkpoint from completing."
 CHECKPOINT_DELAY_START	"Waiting for a backend that blocks a checkpoint from starting."
 CHECKPOINT_DONE	"Waiting for a checkpoint to complete."
-- 
2.46.0.519.g2e7b89e038

