From 45a2111c51016386a31d766700b23d9d88ff6c0b Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Thu, 12 Sep 2024 23:09:41 +0200
Subject: [PATCH v20240913 1/2] Increase the number of fast-path lock slots

The fast-path locking introduced in 9.2 allowed each backend to acquire
up to 16 relation locks cheaply, provided the lock level allows that.
If a backend needs to hold more locks, it has to insert them into the
regular lock table in shared memory. This is considerably more
expensive, and on many-core systems may be subject to contention.

The limit of 16 entries was always rather low, even with simple queries
and schemas with only a few tables. We have to lock all relations - not
just tables, but also indexes, views, etc. Moreover, for planning we
need to lock all relations that might be used in the plan, not just
those that actually get used in the final plan. It only takes a couple
of tables with multiple indexes to need more than 16 locks. It was
quite common to fill all fast-path slots.

As partitioning gets used more widely, with more and more partitions,
this limit is trivial to hit, with complex queries easily using hundreds
or even thousands of locks. For workloads doing a lot of I/O this is not
noticeable, but on large machines with enough RAM to keep the data in
memory, the access to the shared lock table may be a serious issue.

This patch improves this by increasing the number of fast-path slots
from 16 to 1024. The slots remain in PGPROC, and are organized as an
array of 16-slot groups (each group being effectively a clone of the
original fast-path approach). Instead of accessing this as a big hash
table with open addressing, we treat this as a 16-way set associative
cache. Each relation (identified by a "relid" OID) is mapped to a
particular 16-slot group by calculating a hash

    h(relid) = ((relid * P) mod N)

where P is a hard-coded prime, and N is the number of groups. This is
not a great hash function, but it works well enough - the main purpose
is to prevent "hot groups" with runs of consecutive OIDs, which might
fill some of the fast-path groups. The multiplication by P ensures that
consecutive OIDs land in different groups. If the OIDs are already
spread out, the hash should not group them.

The groups are processed by linear search. With only 16 entries this is
cheap, and the groups have very good locality.

Treating this as a simple hash table with open addressing would not be
efficient, especially once the hash table is getting almost full. The
usual solution is to grow the table, but for hash tables in shared
memory that's not trivial. It would also have worse locality, due to
more random access.

Luckily, fast-path locking already has a simple solution to deal with a
full hash table. The lock can be simply inserted into the shared lock
table, just like before. Of course, if this happens too often, that
reduces the benefit of fast-path locking.

This patch hard-codes the number of groups to 64, which means 1024
fast-path locks. As all the information is still stored in PGPROC, this
grows PGPROC by about 4.5kB (from ~840B to ~5kB). This is a trade-off,
exchanging memory for cheaper locking.

Ultimately, the number of fast-path slots should not be hard coded, but
adjustable based on what the workload does, perhaps using a GUC. That
however means it can't be stored in PGPROC directly.
---
 src/backend/storage/lmgr/lock.c | 118 ++++++++++++++++++++++++++------
 src/include/storage/proc.h      |   8 ++-
 2 files changed, 102 insertions(+), 24 deletions(-)

diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 83b99a98f08..d053ae0c409 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -167,7 +167,7 @@ typedef struct TwoPhaseLockRecord
  * our locks to the primary lock table, but it can never be lower than the
  * real value, since only we can acquire locks on our own behalf.
  */
-static int	FastPathLocalUseCount = 0;
+static int	FastPathLocalUseCounts[FP_LOCK_GROUPS_PER_BACKEND];
 
 /*
  * Flag to indicate if the relation extension lock is held by this backend.
@@ -184,23 +184,53 @@ static int	FastPathLocalUseCount = 0;
  */
 static bool IsRelationExtensionLockHeld PG_USED_FOR_ASSERTS_ONLY = false;
 
+/*
+ * Macros to calculate the group and index for a relation.
+ *
+ * The formula is a simple hash function, designed to spread the OIDs a bit,
+ * so that even contiguous values end up in different groups. In most cases
+ * there will be gaps anyway, but the multiplication should help a bit.
+ *
+ * The selected value (49157) is a prime not too close to 2^k, and it's
+ * small enough to not cause overflows (in 64-bit).
+ */
+#define FAST_PATH_LOCK_REL_GROUP(rel) \
+	(((uint64) (rel) * 49157) % FP_LOCK_GROUPS_PER_BACKEND)
+
+/* Calculate index in the whole per-backend array of lock slots. */
+#define FP_LOCK_SLOT_INDEX(group, index) \
+	(AssertMacro(((group) >= 0) && ((group) < FP_LOCK_GROUPS_PER_BACKEND)), \
+	 AssertMacro(((index) >= 0) && ((index) < FP_LOCK_SLOTS_PER_GROUP)), \
+	 ((group) * FP_LOCK_SLOTS_PER_GROUP + (index)))
+
+/*
+ * Given a lock index (into the per-backend array), calculated using the
+ * FP_LOCK_SLOT_INDEX macro, calculate group and index (within the group).
+ */
+#define FAST_PATH_LOCK_GROUP(index)	\
+	(AssertMacro(((index) >= 0) && ((index) < FP_LOCK_SLOTS_PER_BACKEND)), \
+	 ((index) / FP_LOCK_SLOTS_PER_GROUP))
+#define FAST_PATH_LOCK_INDEX(index)	\
+	(AssertMacro(((index) >= 0) && ((index) < FP_LOCK_SLOTS_PER_BACKEND)), \
+	 ((index) % FP_LOCK_SLOTS_PER_GROUP))
+
 /* Macros for manipulating proc->fpLockBits */
 #define FAST_PATH_BITS_PER_SLOT			3
 #define FAST_PATH_LOCKNUMBER_OFFSET		1
 #define FAST_PATH_MASK					((1 << FAST_PATH_BITS_PER_SLOT) - 1)
 #define FAST_PATH_GET_BITS(proc, n) \
-	(((proc)->fpLockBits >> (FAST_PATH_BITS_PER_SLOT * n)) & FAST_PATH_MASK)
+	(((proc)->fpLockBits[(n)/16] >> (FAST_PATH_BITS_PER_SLOT * FAST_PATH_LOCK_INDEX(n))) & FAST_PATH_MASK)
 #define FAST_PATH_BIT_POSITION(n, l) \
 	(AssertMacro((l) >= FAST_PATH_LOCKNUMBER_OFFSET), \
 	 AssertMacro((l) < FAST_PATH_BITS_PER_SLOT+FAST_PATH_LOCKNUMBER_OFFSET), \
 	 AssertMacro((n) < FP_LOCK_SLOTS_PER_BACKEND), \
-	 ((l) - FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT * (n)))
+	 ((l) - FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT * (FAST_PATH_LOCK_INDEX(n))))
 #define FAST_PATH_SET_LOCKMODE(proc, n, l) \
-	 (proc)->fpLockBits |= UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)
+	 (proc)->fpLockBits[FAST_PATH_LOCK_GROUP(n)] |= UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)
 #define FAST_PATH_CLEAR_LOCKMODE(proc, n, l) \
-	 (proc)->fpLockBits &= ~(UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l))
+	 (proc)->fpLockBits[FAST_PATH_LOCK_GROUP(n)] &= ~(UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l))
 #define FAST_PATH_CHECK_LOCKMODE(proc, n, l) \
-	 ((proc)->fpLockBits & (UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)))
+	 ((proc)->fpLockBits[FAST_PATH_LOCK_GROUP(n)] & (UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)))
 
 /*
  * The fast-path lock mechanism is concerned only with relation locks on
@@ -926,7 +956,7 @@ LockAcquireExtended(const LOCKTAG *locktag,
 	 * for now we don't worry about that case either.
 	 */
 	if (EligibleForRelationFastPath(locktag, lockmode) &&
-		FastPathLocalUseCount < FP_LOCK_SLOTS_PER_BACKEND)
+		FastPathLocalUseCounts[FAST_PATH_LOCK_REL_GROUP(locktag->locktag_field2)] < FP_LOCK_SLOTS_PER_GROUP)
 	{
 		uint32		fasthashcode = FastPathStrongLockHashPartition(hashcode);
 		bool		acquired;
@@ -1970,6 +2000,7 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
 	PROCLOCK   *proclock;
 	LWLock	   *partitionLock;
 	bool		wakeupNeeded;
+	int			group;
 
 	if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
 		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
@@ -2063,9 +2094,12 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
 	 */
 	locallock->lockCleared = false;
 
+	/* fast-path group the lock belongs to */
+	group = FAST_PATH_LOCK_REL_GROUP(locktag->locktag_field2);
+
 	/* Attempt fast release of any lock eligible for the fast path. */
 	if (EligibleForRelationFastPath(locktag, lockmode) &&
-		FastPathLocalUseCount > 0)
+		FastPathLocalUseCounts[group] > 0)
 	{
 		bool		released;
 
@@ -2633,12 +2667,21 @@ LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent)
 static bool
 FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode)
 {
-	uint32		f;
 	uint32		unused_slot = FP_LOCK_SLOTS_PER_BACKEND;
+	uint32		i,
+				group;
+
+	/* fast-path group the lock belongs to */
+	group = FAST_PATH_LOCK_REL_GROUP(relid);
 
 	/* Scan for existing entry for this relid, remembering empty slot. */
-	for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+	for (i = 0; i < FP_LOCK_SLOTS_PER_GROUP; i++)
 	{
+		uint32		f;
+
+		/* index into the whole per-backend array */
+		f = FP_LOCK_SLOT_INDEX(group, i);
+
 		if (FAST_PATH_GET_BITS(MyProc, f) == 0)
 			unused_slot = f;
 		else if (MyProc->fpRelId[f] == relid)
@@ -2654,7 +2697,7 @@ FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode)
 	{
 		MyProc->fpRelId[unused_slot] = relid;
 		FAST_PATH_SET_LOCKMODE(MyProc, unused_slot, lockmode);
-		++FastPathLocalUseCount;
+		++FastPathLocalUseCounts[group];
 		return true;
 	}
 
@@ -2670,12 +2713,21 @@ FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode)
 static bool
 FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode)
 {
-	uint32		f;
 	bool		result = false;
+	uint32		i,
+				group;
 
-	FastPathLocalUseCount = 0;
-	for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+	/* fast-path group the lock belongs to */
+	group = FAST_PATH_LOCK_REL_GROUP(relid);
+
+	FastPathLocalUseCounts[group] = 0;
+	for (i = 0; i < FP_LOCK_SLOTS_PER_GROUP; i++)
 	{
+		uint32		f;
+
+		/* index into the whole per-backend array */
+		f = FP_LOCK_SLOT_INDEX(group, i);
+
 		if (MyProc->fpRelId[f] == relid
 			&& FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode))
 		{
@@ -2685,7 +2737,7 @@ FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode)
 			/* we continue iterating so as to update FastPathLocalUseCount */
 		}
 		if (FAST_PATH_GET_BITS(MyProc, f) != 0)
-			++FastPathLocalUseCount;
+			++FastPathLocalUseCounts[group];
 	}
 	return result;
 }
@@ -2714,7 +2766,8 @@ FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag
 	for (i = 0; i < ProcGlobal->allProcCount; i++)
 	{
 		PGPROC	   *proc = &ProcGlobal->allProcs[i];
-		uint32		f;
+		uint32		j,
+					group;
 
 		LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE);
 
@@ -2739,9 +2792,16 @@ FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag
 			continue;
 		}
 
-		for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+		/* fast-path group the lock belongs to */
+		group = FAST_PATH_LOCK_REL_GROUP(relid);
+
+		for (j = 0; j < FP_LOCK_SLOTS_PER_GROUP; j++)
 		{
 			uint32		lockmode;
+			uint32		f;
+
+			/* index into the whole per-backend array */
+			f = FP_LOCK_SLOT_INDEX(group, j);
 
 			/* Look for an allocated slot matching the given relid. */
 			if (relid != proc->fpRelId[f] || FAST_PATH_GET_BITS(proc, f) == 0)
@@ -2793,13 +2853,21 @@ FastPathGetRelationLockEntry(LOCALLOCK *locallock)
 	PROCLOCK   *proclock = NULL;
 	LWLock	   *partitionLock = LockHashPartitionLock(locallock->hashcode);
 	Oid			relid = locktag->locktag_field2;
-	uint32		f;
+	uint32		i,
+				group;
+
+	/* fast-path group the lock belongs to */
+	group = FAST_PATH_LOCK_REL_GROUP(relid);
 
 	LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
 
-	for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+	for (i = 0; i < FP_LOCK_SLOTS_PER_GROUP; i++)
 	{
 		uint32		lockmode;
+		uint32		f;
+
+		/* index into the whole per-backend array */
+		f = FP_LOCK_SLOT_INDEX(group, i);
 
 		/* Look for an allocated slot matching the given relid. */
 		if (relid != MyProc->fpRelId[f] || FAST_PATH_GET_BITS(MyProc, f) == 0)
@@ -2903,6 +2971,10 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp)
 	LWLock	   *partitionLock;
 	int			count = 0;
 	int			fast_count = 0;
+	uint32		group;
+
+	/* fast-path group the lock belongs to */
+	group = FAST_PATH_LOCK_REL_GROUP(locktag->locktag_field2);
 
 	if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
 		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
@@ -2957,7 +3029,7 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp)
 		for (i = 0; i < ProcGlobal->allProcCount; i++)
 		{
 			PGPROC	   *proc = &ProcGlobal->allProcs[i];
-			uint32		f;
+			uint32		j;
 
 			/* A backend never blocks itself */
 			if (proc == MyProc)
@@ -2979,9 +3051,13 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp)
 				continue;
 			}
 
-			for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+			for (j = 0; j < FP_LOCK_SLOTS_PER_GROUP; j++)
 			{
 				uint32		lockmask;
+				uint32		f;
+
+				/* index into the whole per-backend array */
+				f = FP_LOCK_SLOT_INDEX(group, j);
 
 				/* Look for an allocated slot matching the given relid. */
 				if (relid != proc->fpRelId[f])
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index deeb06c9e01..845058da9fa 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -83,8 +83,9 @@ struct XidCache
  * rather than the main lock table.  This eases contention on the lock
  * manager LWLocks.  See storage/lmgr/README for additional details.
  */
-#define		FP_LOCK_SLOTS_PER_BACKEND 16
-
+#define		FP_LOCK_GROUPS_PER_BACKEND	64
+#define		FP_LOCK_SLOTS_PER_GROUP		16	/* don't change */
+#define		FP_LOCK_SLOTS_PER_BACKEND	(FP_LOCK_SLOTS_PER_GROUP * FP_LOCK_GROUPS_PER_BACKEND)
 /*
  * Flags for PGPROC.delayChkptFlags
  *
@@ -292,7 +293,8 @@ struct PGPROC
 
 	/* Lock manager data, recording fast-path locks taken by this backend. */
 	LWLock		fpInfoLock;		/* protects per-backend fast-path state */
-	uint64		fpLockBits;		/* lock modes held for each fast-path slot */
+	uint64		fpLockBits[FP_LOCK_GROUPS_PER_BACKEND]; /* lock modes held for
+														 * each fast-path slot */
 	Oid			fpRelId[FP_LOCK_SLOTS_PER_BACKEND]; /* slots for rel oids */
 	bool		fpVXIDLock;		/* are we holding a fast-path VXID lock? */
 	LocalTransactionId fpLocalTransactionId;	/* lxid for fast-path VXID
-- 
2.46.0

