From 0392adb59be052bb5d04a4b2e0d151cefdd810e4 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 1 Aug 2020 09:30:41 +1200
Subject: [PATCH] WIP: Cache smgrnblocks() in more cases.

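This is just an early sketch and may be completely wrong...

Add a small array of atomic counters in shared memory, selected by
relNode modulo the array size.  smgrextend() and smgrtruncate() bump the
relation's counter.  smgrnblocks() takes a new flags argument: a caller
passing SMGRNBLOCKS_RELAXED may be given the backend-local cached size as
long as the counter has not moved since that value was cached.  Callers
passing 0 still ask the kernel outside recovery.  For now only the
single-fork path of table_block_relation_size() asks for a relaxed value.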

Discussion: https://postgr.es/m/CAEepm%3D3SSw-Ty1DFcK%3D1rU-K6GSzYzfdD4d%2BZwapdN7dTa6%3DnQ%40mail.gmail.com
---
 src/backend/access/gist/gistbuild.c       |  2 +-
 src/backend/access/heap/visibilitymap.c   |  6 +-
 src/backend/access/table/tableam.c        |  4 +-
 src/backend/access/transam/xlogutils.c    |  2 +-
 src/backend/catalog/storage.c             |  4 +-
 src/backend/storage/buffer/bufmgr.c       |  4 +-
 src/backend/storage/freespace/freespace.c |  6 +-
 src/backend/storage/ipc/ipci.c            |  2 +
 src/backend/storage/smgr/smgr.c           | 85 +++++++++++++++++++----
 src/include/storage/smgr.h                | 11 ++-
 10 files changed, 97 insertions(+), 29 deletions(-)

diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 671b5e9186..d5baa8d21a 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -495,7 +495,7 @@ gistBuildCallback(Relation index,
 	 */
 	if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
 		 buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
-		 effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
+		 effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM, 0)) ||
 		(buildstate->bufferingMode == GIST_BUFFERING_STATS &&
 		 buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
 	{
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index b1072183bc..242b4a183f 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -528,7 +528,7 @@ visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks)
 	else
 		newnblocks = truncBlock;
 
-	if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks)
+	if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM, 0) <= newnblocks)
 	{
 		/* nothing to do, the file was already smaller than requested size */
 		return InvalidBlockNumber;
@@ -564,7 +564,7 @@ vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
 	if (rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == InvalidBlockNumber)
 	{
 		if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
-			smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
+			smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM, 0);
 		else
 			rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = 0;
 	}
@@ -647,7 +647,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
 
 	/* Invalidate cache so that smgrnblocks() asks the kernel. */
 	rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
-	vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
+	vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM, 0);
 
 	/* Now extend the file */
 	while (vm_nblocks_now < vm_nblocks)
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index 3afb63b1fe..e8ead8f019 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -628,10 +628,10 @@ table_block_relation_size(Relation rel, ForkNumber forkNumber)
 	if (forkNumber == InvalidForkNumber)
 	{
 		for (int i = 0; i < MAX_FORKNUM; i++)
-			nblocks += smgrnblocks(rel->rd_smgr, i);
+			nblocks += smgrnblocks(rel->rd_smgr, i, 0);
 	}
 	else
-		nblocks = smgrnblocks(rel->rd_smgr, forkNumber);
+		nblocks = smgrnblocks(rel->rd_smgr, forkNumber, SMGRNBLOCKS_RELAXED);
 
 	return nblocks * BLCKSZ;
 }
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index b2ca0cd4cf..c7768a7595 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -461,7 +461,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 	 */
 	smgrcreate(smgr, forknum, true);
 
-	lastblock = smgrnblocks(smgr, forknum);
+	lastblock = smgrnblocks(smgr, forknum, 0);
 
 	if (blkno < lastblock)
 	{
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 9e6e6c42d3..5e359ff5f2 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -434,7 +434,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 	use_wal = XLogIsNeeded() &&
 		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
 
-	nblocks = smgrnblocks(src, forkNum);
+	nblocks = smgrnblocks(src, forkNum, 0);
 
 	for (blkno = 0; blkno < nblocks; blkno++)
 	{
@@ -721,7 +721,7 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
 			{
 				if (smgrexists(srel, fork))
 				{
-					BlockNumber n = smgrnblocks(srel, fork);
+					BlockNumber n = smgrnblocks(srel, fork, 0);
 
 					/* we shouldn't come here for unlogged relations */
 					Assert(fork != INIT_FORKNUM);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f1ae6f9f84..35806e60ea 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -739,7 +739,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
 	/* Substitute proper block number if caller asked for P_NEW */
 	if (isExtend)
-		blockNum = smgrnblocks(smgr, forkNum);
+		blockNum = smgrnblocks(smgr, forkNum, 0);
 
 	if (isLocalBuf)
 	{
@@ -2862,7 +2862,7 @@ RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
 			/* Open it at the smgr level if not already done */
 			RelationOpenSmgr(relation);
 
-			return smgrnblocks(relation->rd_smgr, forkNum);
+			return smgrnblocks(relation->rd_smgr, forkNum, 0);
 
 		case RELKIND_RELATION:
 		case RELKIND_TOASTVALUE:
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index 6a96126b0c..e4d93b4794 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -317,7 +317,7 @@ FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
 	else
 	{
 		new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
-		if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
+		if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM, 0) <= new_nfsmblocks)
 			return InvalidBlockNumber;	/* nothing to do; the FSM was already
 										 * smaller */
 	}
@@ -547,7 +547,7 @@ fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
 		/* Invalidate the cache so smgrnblocks asks the kernel. */
 		rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
 		if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
-			smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
+			smgrnblocks(rel->rd_smgr, FSM_FORKNUM, 0);
 		else
 			rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = 0;
 	}
@@ -633,7 +633,7 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks)
 
 	/* Invalidate cache so that smgrnblocks() asks the kernel. */
 	rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
-	fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
+	fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM, 0);
 
 	while (fsm_nblocks_now < fsm_nblocks)
 	{
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 96c2aaabbd..1182d3f0df 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -120,6 +120,7 @@ CreateSharedMemoryAndSemaphores(void)
 		size = add_size(size, SpinlockSemaSize());
 		size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
 												 sizeof(ShmemIndexEnt)));
+		size = add_size(size, smgr_shmem_estimate_size());
 		size = add_size(size, dsm_estimate_size());
 		size = add_size(size, BufferShmemSize());
 		size = add_size(size, LockShmemSize());
@@ -220,6 +221,7 @@ CreateSharedMemoryAndSemaphores(void)
 	CommitTsShmemInit();
 	SUBTRANSShmemInit();
 	MultiXactShmemInit();
+	smgr_shmem_init();
 	InitBufferPool();
 
 	/*
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index dcc09df0c7..218826e2b1 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -22,10 +22,17 @@
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
 #include "storage/md.h"
+#include "storage/shmem.h"
 #include "storage/smgr.h"
 #include "utils/hsearch.h"
 #include "utils/inval.h"
 
+/* Shared control object: invalidation counters, slot = relNode % length. */
+typedef struct SMgrShared
+{
+	pg_atomic_uint64 nblocks_inval[1024];
+} SMgrShared;
+
 
 /*
  * This struct of function pointers defines the API between smgr.c and
@@ -98,6 +105,50 @@ static dlist_head unowned_relns;
 /* local function prototypes */
 static void smgrshutdown(int code, Datum arg);
 
+static SMgrShared *smgr_shared;
+
+size_t
+smgr_shmem_estimate_size(void)
+{
+	return sizeof(*smgr_shared);
+}
+
+void
+smgr_shmem_init(void)
+{
+	bool		found;
+
+	smgr_shared = ShmemInitStruct("SMgr", sizeof(*smgr_shared), &found);
+	if (!found)
+	{
+		for (size_t i = 0; i < lengthof(smgr_shared->nblocks_inval); ++i)
+			pg_atomic_init_u64(&smgr_shared->nblocks_inval[i], 0);
+	}
+}
+
+/*
+ * Read the shared counter for reln's slot; distinct relations may share one.
+ */
+static uint64
+smgrnblocks_get_inval(SMgrRelation reln)
+{
+	size_t		slot;
+
+	slot = reln->smgr_rnode.node.relNode % lengthof(smgr_shared->nblocks_inval);
+	return pg_atomic_read_u64(&smgr_shared->nblocks_inval[slot]);
+}
+
+/*
+ * Bump the shared counter for reln's slot and return the incremented value.
+ */
+static uint64
+smgrnblocks_inc_inval(SMgrRelation reln)
+{
+	size_t		slot;
+
+	slot = reln->smgr_rnode.node.relNode % lengthof(smgr_shared->nblocks_inval);
+	return pg_atomic_fetch_add_u64(&smgr_shared->nblocks_inval[slot], 1) + 1;
+}
 
 /*
  *	smgrinit(), smgrshutdown() -- Initialize or shut down storage
@@ -466,15 +517,8 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
 										 buffer, skipFsync);
 
-	/*
-	 * Normally we expect this to increase nblocks by one, but if the cached
-	 * value isn't as expected, just invalidate it so the next call asks the
-	 * kernel.
-	 */
-	if (reln->smgr_cached_nblocks[forknum] == blocknum)
-		reln->smgr_cached_nblocks[forknum] = blocknum + 1;
-	else
-		reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
+	reln->smgr_cached_nblocks_inval[forknum] = smgrnblocks_inc_inval(reln);
+	reln->smgr_cached_nblocks[forknum] = blocknum + 1;
 }
 
 /*
@@ -543,23 +587,35 @@ smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 /*
  *	smgrnblocks() -- Calculate the number of blocks in the
- *					 supplied relation.
+ *					 supplied relation, optionally returning a relaxed value
+ *					 that may already be out of date, but is not older than
+ *					 the most recent read barrier that pairs with the most
+ *					 recent write barrier in any process that earlier changed
+ *					 the true value.
  */
 BlockNumber
-smgrnblocks(SMgrRelation reln, ForkNumber forknum)
+smgrnblocks(SMgrRelation reln, ForkNumber forknum, int flags)
 {
 	BlockNumber result;
+	uint64		inval;
 
 	/*
-	 * For now, we only use cached values in recovery due to lack of a shared
-	 * invalidation mechanism for changes in file size.
+	 * We can use cached values in recovery since no other process can change
+	 * the size of a relation.  Otherwise, we only use a cached value if the
+	 * caller said that a relaxed value is OK, and the shared invalidation
+	 * counter hasn't moved.
 	 */
-	if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
+	inval = smgrnblocks_get_inval(reln);
+	if ((InRecovery ||
+		 ((flags & SMGRNBLOCKS_RELAXED) &&
+		  inval == reln->smgr_cached_nblocks_inval[forknum])) &&
+		reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
 		return reln->smgr_cached_nblocks[forknum];
 
 	result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
 
 	reln->smgr_cached_nblocks[forknum] = result;
+	reln->smgr_cached_nblocks_inval[forknum] = inval;
 
 	return result;
 }
@@ -614,6 +670,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 		 */
 		reln->smgr_cached_nblocks[forknum[i]] = nblocks[i];
 	}
+	smgrnblocks_inc_inval(reln);
 }
 
 /*
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index f28a842401..af8a5cb874 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -14,10 +14,14 @@
 #ifndef SMGR_H
 #define SMGR_H
 
+#include "port/atomics.h"
 #include "lib/ilist.h"
 #include "storage/block.h"
 #include "storage/relfilenode.h"
 
+/* Flags for smgrnblocks(). */
+#define SMGRNBLOCKS_RELAXED 0x01
+
 /*
  * smgr.c maintains a table of SMgrRelation objects, which are essentially
  * cached file handles.  An SMgrRelation is created (if not already present)
@@ -52,6 +56,7 @@ typedef struct SMgrRelationData
 	 */
 	BlockNumber smgr_targblock; /* current insertion target block */
 	BlockNumber smgr_cached_nblocks[MAX_FORKNUM + 1];	/* last known size */
+	uint64		smgr_cached_nblocks_inval[MAX_FORKNUM + 1];
 
 	/* additional public fields may someday exist here */
 
@@ -77,6 +82,9 @@ typedef SMgrRelationData *SMgrRelation;
 #define SmgrIsTemp(smgr) \
 	RelFileNodeBackendIsTemp((smgr)->smgr_rnode)
 
+extern size_t smgr_shmem_estimate_size(void);
+extern void smgr_shmem_init(void);
+
 extern void smgrinit(void);
 extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend);
 extern bool smgrexists(SMgrRelation reln, ForkNumber forknum);
@@ -98,7 +106,8 @@ extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
 					  BlockNumber blocknum, char *buffer, bool skipFsync);
 extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum, BlockNumber nblocks);
-extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
+extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum,
+							   int flags);
 extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
 						 int nforks, BlockNumber *nblocks);
 extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
-- 
2.20.1

