From 8b73c9143118487ee3e7663b4adb050db99b86d8 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Thu, 17 Oct 2024 14:14:35 -0400
Subject: [PATCH v6 14/14] WIP: bufmgr: Don't copy pages while writing out

After the series of preceding commits introducing and using
BufferBeginSetHintBits()/BufferSetHintBits16() hint bits are not set
anymore while IO is going on. Therefore we do not need to copy pages while
they are being written out anymore.

TODO: Update comments

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/storage/bufpage.h           |  3 +-
 src/backend/access/hash/hashpage.c      |  2 +-
 src/backend/access/transam/xloginsert.c | 43 ++++++----------------
 src/backend/storage/buffer/bufmgr.c     | 21 +++++------
 src/backend/storage/buffer/localbuf.c   |  2 +-
 src/backend/storage/page/bufpage.c      | 48 ++++---------------------
 src/backend/storage/smgr/bulk_write.c   |  2 +-
 src/test/modules/test_aio/test_aio.c    |  2 +-
 8 files changed, 33 insertions(+), 90 deletions(-)

diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index abc2cf2a020..f8f621446c4 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -504,7 +504,6 @@ extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
 extern void PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum);
 extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
 									const void *newtup, Size newsize);
-extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
-extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
+extern void PageSetChecksum(Page page, BlockNumber blkno);
 
 #endif							/* BUFPAGE_H */
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index b8e5bd005e5..dd17eff59d1 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -1029,7 +1029,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 					zerobuf.data,
 					true);
 
-	PageSetChecksumInplace(page, lastblock);
+	PageSetChecksum(page, lastblock);
 	smgrextend(RelationGetSmgr(rel), MAIN_FORKNUM, lastblock, zerobuf.data,
 			   false);
 
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index a56d5a55282..0af148e9496 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -261,8 +261,11 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
 	 */
 #ifdef USE_ASSERT_CHECKING
 	if (!(flags & REGBUF_NO_CHANGE))
-		Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE) &&
-			   BufferIsDirty(buffer));
+	{
+		Assert(BufferIsDirty(buffer));
+		Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE) ||
+			   BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE));
+	}
 #endif
 
 	if (block_id >= max_registered_block_id)
@@ -1066,7 +1069,7 @@ XLogCheckBufferNeedsBackup(Buffer buffer)
  * Write a backup block if needed when we are setting a hint. Note that
  * this may be called for a variety of page types, not just heaps.
  *
- * Callable while holding just share lock on the buffer content.
+ * Callable while holding just share-exclusive lock on the buffer content.
  *
  * We can't use the plain backup block mechanism since that relies on the
  * Buffer being exclusively locked. Since some modifications (setting LSN, hint
@@ -1074,6 +1077,8 @@ XLogCheckBufferNeedsBackup(Buffer buffer)
  * failures. So instead we copy the page and insert the copied data as normal
  * record data.
  *
+ * FIXME: outdated
+ *
  * We only need to do something if page has not yet been full page written in
  * this checkpoint round. The LSN of the inserted wal record is returned if we
  * had to write, InvalidXLogRecPtr otherwise.
@@ -1102,46 +1107,20 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
 
 	/*
 	 * We assume page LSN is first data on *every* page that can be passed to
-	 * XLogInsert, whether it has the standard page layout or not. Since we're
-	 * only holding a share-lock on the page, we must take the buffer header
-	 * lock when we look at the LSN.
+	 * XLogInsert, whether it has the standard page layout or not.
 	 */
 	lsn = BufferGetLSNAtomic(buffer);
 
 	if (lsn <= RedoRecPtr)
 	{
-		int			flags = 0;
-		PGAlignedBlock copied_buffer;
-		char	   *origdata = (char *) BufferGetBlock(buffer);
-		RelFileLocator rlocator;
-		ForkNumber	forkno;
-		BlockNumber blkno;
-
-		/*
-		 * Copy buffer so we don't have to worry about concurrent hint bit or
-		 * lsn updates. We assume pd_lower/upper cannot be changed without an
-		 * exclusive lock, so the contents bkp are not racy.
-		 */
-		if (buffer_std)
-		{
-			/* Assume we can omit data between pd_lower and pd_upper */
-			Page		page = BufferGetPage(buffer);
-			uint16		lower = ((PageHeader) page)->pd_lower;
-			uint16		upper = ((PageHeader) page)->pd_upper;
-
-			memcpy(copied_buffer.data, origdata, lower);
-			memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
-		}
-		else
-			memcpy(copied_buffer.data, origdata, BLCKSZ);
+		int			flags = REGBUF_NO_CHANGE;
 
 		XLogBeginInsert();
 
 		if (buffer_std)
 			flags |= REGBUF_STANDARD;
 
-		BufferGetTag(buffer, &rlocator, &forkno, &blkno);
-		XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data, flags);
+		XLogRegisterBuffer(0, buffer, flags);
 
 		recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT);
 	}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 584c3b2ee75..d6a638613ae 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -4342,7 +4342,6 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	ErrorContextCallback errcallback;
 	instr_time	io_start;
 	Block		bufBlock;
-	char	   *bufToWrite;
 	uint64		buf_state;
 
 	Assert(BufferLockHeldByMeInMode(buf, BUFFER_LOCK_EXCLUSIVE) ||
@@ -4413,12 +4412,8 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 */
 	bufBlock = BufHdrGetBlock(buf);
 
-	/*
-	 * Update page checksum if desired.  Since we have only shared lock on the
-	 * buffer, other processes might be updating hint bits in it, so we must
-	 * copy the page to private storage if we do checksumming.
-	 */
-	bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
+	/* Update page checksum if desired. */
+	PageSetChecksum((Page) bufBlock, buf->tag.blockNum);
 
 	io_start = pgstat_prepare_io_time(track_io_timing);
 
@@ -4428,7 +4423,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	smgrwrite(reln,
 			  BufTagGetForkNum(&buf->tag),
 			  buf->tag.blockNum,
-			  bufToWrite,
+			  bufBlock,
 			  false);
 
 	/*
@@ -4552,8 +4547,8 @@ BufferIsPermanent(Buffer buffer)
 /*
  * BufferGetLSNAtomic
  *		Retrieves the LSN of the buffer atomically using a buffer header lock.
- *		This is necessary for some callers who may not have an exclusive lock
- *		on the buffer.
+ *		This is necessary for some callers who may not have a (share-)exclusive
+ *		lock on the buffer.
  */
 XLogRecPtr
 BufferGetLSNAtomic(Buffer buffer)
@@ -5606,6 +5601,12 @@ MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, b
 			 * It's possible we may enter here without an xid, so it is
 			 * essential that CreateCheckPoint waits for virtual transactions
 			 * rather than full transactionids.
+			 *
+			 * FIXME: I think we now should simply mark the page dirty before
+			 * WAL logging the hint bit - afaikt it then should work just like
+			 * any other buffer write (due to SyncBuffers()/SyncOneBuffer()
+			 * seeing the dirty bit and trying to lock the page
+			 * share-exclusive, and thus having to wait).
 			 */
 			Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
 			MyProc->delayChkptFlags |= DELAY_CHKPT_START;
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index a41a5facd3a..5826d4b54c6 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -199,7 +199,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
 		reln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag),
 						MyProcNumber);
 
-	PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
+	PageSetChecksum(localpage, bufHdr->tag.blockNum);
 
 	io_start = pgstat_prepare_io_time(track_io_timing);
 
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index aac6e695954..c8cbdd1f7a6 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -1494,51 +1494,15 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
 /*
  * Set checksum for a page in shared buffers.
  *
- * If checksums are disabled, or if the page is not initialized, just return
- * the input.  Otherwise, we must make a copy of the page before calculating
- * the checksum, to prevent concurrent modifications (e.g. setting hint bits)
- * from making the final checksum invalid.  It doesn't matter if we include or
- * exclude hints during the copy, as long as we write a valid page and
- * associated checksum.
+ * If checksums are disabled, or if the page is not initialized, just
+ * return. Otherwise compute and set the checksum.
  *
- * Returns a pointer to the block-sized data that needs to be written. Uses
- * statically-allocated memory, so the caller must immediately write the
- * returned page and not refer to it again.
- */
-char *
-PageSetChecksumCopy(Page page, BlockNumber blkno)
-{
-	static char *pageCopy = NULL;
-
-	/* If we don't need a checksum, just return the passed-in data */
-	if (PageIsNew(page) || !DataChecksumsEnabled())
-		return page;
-
-	/*
-	 * We allocate the copy space once and use it over on each subsequent
-	 * call.  The point of palloc'ing here, rather than having a static char
-	 * array, is first to ensure adequate alignment for the checksumming code
-	 * and second to avoid wasting space in processes that never call this.
-	 */
-	if (pageCopy == NULL)
-		pageCopy = MemoryContextAllocAligned(TopMemoryContext,
-											 BLCKSZ,
-											 PG_IO_ALIGN_SIZE,
-											 0);
-
-	memcpy(pageCopy, page, BLCKSZ);
-	((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);
-	return pageCopy;
-}
-
-/*
- * Set checksum for a page in private memory.
- *
- * This must only be used when we know that no other process can be modifying
- * the page buffer.
+ * In the past this needed to be done on a copy of the page, due to the
+ * possibility of e.g. hint bits being set concurrently. However, this is not
+ * necessary anymore as hint bits won't be set while IO is going on.
  */
 void
-PageSetChecksumInplace(Page page, BlockNumber blkno)
+PageSetChecksum(Page page, BlockNumber blkno)
 {
 	/* If we don't need a checksum, just return */
 	if (PageIsNew(page) || !DataChecksumsEnabled())
diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c
index b958be15716..f4d07543365 100644
--- a/src/backend/storage/smgr/bulk_write.c
+++ b/src/backend/storage/smgr/bulk_write.c
@@ -279,7 +279,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 		BlockNumber blkno = pending_writes[i].blkno;
 		Page		page = pending_writes[i].buf->data;
 
-		PageSetChecksumInplace(page, blkno);
+		PageSetChecksum(page, blkno);
 
 		if (blkno >= bulkstate->relsize)
 		{
diff --git a/src/test/modules/test_aio/test_aio.c b/src/test/modules/test_aio/test_aio.c
index 488d98e7e66..e5fc7642dc2 100644
--- a/src/test/modules/test_aio/test_aio.c
+++ b/src/test/modules/test_aio/test_aio.c
@@ -288,7 +288,7 @@ modify_rel_block(PG_FUNCTION_ARGS)
 	}
 	else
 	{
-		PageSetChecksumInplace(page, blkno);
+		PageSetChecksum(page, blkno);
 	}
 
 	smgrwrite(RelationGetSmgr(rel),
-- 
2.48.1.76.g4e746b1a31.dirty

