From 2e6473d293e43d78cc3552c685f6c53e6d47213e Mon Sep 17 00:00:00 2001 From: Zhiguo Zhou Date: Fri, 27 Jun 2025 11:16:28 +0800 Subject: [v1 1/2] Introduce ReadBiasedLWLock for high-concurrency read workloads This new lock type addresses contention bottlenecks in high-core-count systems by partitioning lock state across multiple cache lines. Readers are distributed across 16 underlying LWLocks using PID-based indexing, while writers acquire all locks to maintain exclusivity. Key components: - PID-modulo indexing for reader lock distribution - Bulk acquisition/release for writer paths - New wait events and tranche type - Shared memory initialization hooks - Conditional and blocking acquire APIs Designed for read-heavy locks like ProcArrayLock where profiling showed ~93% shared access patterns. Reduces cache-line ping-pong while preserving exclusive access semantics. Patch includes generation script updates, memory allocation logic, and extended wait event visibility via pg_wait_events. --- .../storage/lmgr/generate-lwlocknames.pl | 167 ++++++++++++------ src/backend/storage/lmgr/lwlock.c | 165 +++++++++++++++++ .../activity/generate-wait_event_types.pl | 1 + .../utils/activity/wait_event_names.txt | 5 + src/include/storage/lwlock.h | 77 ++++++++ src/test/regress/expected/sysviews.out | 25 +-- 6 files changed, 375 insertions(+), 65 deletions(-) diff --git a/src/backend/storage/lmgr/generate-lwlocknames.pl b/src/backend/storage/lmgr/generate-lwlocknames.pl index 4441b7cba0c..5bb9e6d3fea 100644 --- a/src/backend/storage/lmgr/generate-lwlocknames.pl +++ b/src/backend/storage/lmgr/generate-lwlocknames.pl @@ -9,7 +9,6 @@ use Getopt::Long; my $output_path = '.'; -my $lastlockidx = -1; my $continue = "\n"; GetOptions('outdir:s' => \$output_path); @@ -28,18 +27,30 @@ print $h "/* there is deliberately not an #ifndef LWLOCKNAMES_H here */\n\n"; # -# First, record the predefined LWLocks listed in wait_event_names.txt. 
We'll +# First, record the predefined ReadBiasedLWLocks/LWLocks listed in wait_event_names.txt. We'll # cross-check those with the ones in lwlocklist.h. # my @wait_event_lwlocks; my $record_lwlocks = 0; +my @wait_event_read_biased_lwlocks; +my $record_read_biased_lwlocks = 0; while (<$wait_event_names>) { chomp; - # Check for end marker. - last if /^# END OF PREDEFINED LWLOCKS/; + # Check for end marker of LWLocks. + if (/^# END OF PREDEFINED LWLOCKS/) + { + $record_lwlocks = 0; + next; + } + # Check for end marker of ReadBiasedLWLocks. + if (/^# END OF PREDEFINED READ_BIASED_LWLOCKS/) + { + $record_read_biased_lwlocks = 0; + next; + } # Skip comments and empty lines. next if /^#/; @@ -51,74 +62,124 @@ while (<$wait_event_names>) $record_lwlocks = 1; next; } + # Start recording ReadBiased LWLocks when we find the WaitEventReadBiasedLWLock section. + if (/^Section: ClassName - WaitEventReadBiasedLWLock$/) + { + $record_read_biased_lwlocks = 1; + next; + } - # Go to the next line if we are not yet recording LWLocks. - next if not $record_lwlocks; + # Go to the next line if we are not yet recording LWLocks/ReadBiasedLWLocks. + next unless $record_lwlocks || $record_read_biased_lwlocks; - # Record the LWLock. + # Record the LWLock/ReadBiasedLWLocks. 
(my $waiteventname, my $waitevendocsentence) = split(/\t/, $_); - push(@wait_event_lwlocks, $waiteventname); -} - -my $in_comment = 0; -my $i = 0; -while (<$lwlocklist>) -{ - chomp; - - # Skip single-line C comments and empty lines - next if m{^\s*/\*.*\*/$}; - next if /^\s*$/; - - # skip multiline C comments - if ($in_comment == 1) + if ($record_lwlocks == 1) { - $in_comment = 0 if m{\*/}; - next; + push(@wait_event_lwlocks, $waiteventname); } - elsif (m{^\s*/\*}) + elsif ($record_read_biased_lwlocks == 1) { - $in_comment = 1; - next; + push(@wait_event_read_biased_lwlocks, $waiteventname); } +} - die "unable to parse lwlocklist.h line \"$_\"" - unless /^PG_LWLOCK\((\d+),\s+(\w+)\)$/; +use constant { + PG_LWLOCK => 1, + PG_READ_BIASED_LWLOCK => 2, +}; - (my $lockidx, my $lockname) = ($1, $2); +sub generate_lwlock_names +{ + my ($type, @locks) = @_; - die "lwlocklist.h not in order" if $lockidx < $lastlockidx; - die "lwlocklist.h has duplicates" if $lockidx == $lastlockidx; + die "Invalid lock type: $type. Expected PG_LWLOCK or PG_READ_BIASED_LWLOCK" + if $type != PG_LWLOCK && $type != PG_READ_BIASED_LWLOCK; - die "$lockname defined in lwlocklist.h but missing from " - . "wait_event_names.txt" - if $i >= scalar @wait_event_lwlocks; - die "lists of predefined LWLocks do not match (first mismatch at " - . "$wait_event_lwlocks[$i] in wait_event_names.txt and $lockname in " - . "lwlocklist.h)" - if $wait_event_lwlocks[$i] ne $lockname; - $i++; + my $in_comment = 0; + my $i = 0; + my $lastlockidx = -1; + my $pattern = $type == PG_LWLOCK ? 
qr/^PG_LWLOCK\((\d+),\s+(\w+)\)$/ : qr/^PG_READ_BIASED_LWLOCK\((\d+),\s+(\w+)\)$/; - while ($lastlockidx < $lockidx - 1) + while (<$lwlocklist>) { - ++$lastlockidx; + chomp; + + # Skip single-line C comments and empty lines + next if m{^\s*/\*.*\*/$}; + next if /^\s*$/; + + # skip multiline C comments + if ($in_comment == 1) + { + $in_comment = 0 if m{\*/}; + next; + } + elsif (m{^\s*/\*}) + { + $in_comment = 1; + next; + } + + next unless $_ =~ $pattern; + + (my $lockidx, my $lockname) = ($1, $2); + + die "lwlocklist.h not in order" if $lockidx < $lastlockidx; + die "lwlocklist.h has duplicates" if $lockidx == $lastlockidx; + + die "$lockname defined in lwlocklist.h but missing from " + . "wait_event_names.txt" + if $i >= scalar @locks; + die "lists of predefined LWLocks do not match (first mismatch at " + . "$locks[$i] in wait_event_names.txt and $lockname in " + . "lwlocklist.h)" + if $locks[$i] ne $lockname; + $i++; + + while ($lastlockidx < $lockidx - 1) + { + ++$lastlockidx; + $continue = ",\n"; + } + $lastlockidx = $lockidx; $continue = ",\n"; + + # Add a "Lock" suffix to each lock name, as the C code depends on that + if ($type == PG_LWLOCK) + { + printf $h "#define %-32s (&MainLWLockArray[$lockidx].lock)\n", + $lockname . "Lock"; + } + else + { + printf $h "#define %-32s (&MainReadBiasedLWLockArray[$lockidx])\n", + $lockname . "Lock"; + } } - $lastlockidx = $lockidx; - $continue = ",\n"; - # Add a "Lock" suffix to each lock name, as the C code depends on that - printf $h "#define %-32s (&MainLWLockArray[$lockidx].lock)\n", - $lockname . "Lock"; + die + "$locks[$i] defined in wait_event_names.txt but missing from " + . 
"lwlocklist.h" + if $i < scalar @locks; + + print $h "\n"; + if ($type == PG_LWLOCK) + { + printf $h "#define NUM_INDIVIDUAL_LWLOCKS %s\n", $lastlockidx + 1; + } + else + { + printf $h "#define NUM_INDIVIDUAL_READ_BIASED_LWLOCKS %s\n", + $lastlockidx + 1; + } } -die - "$wait_event_lwlocks[$i] defined in wait_event_names.txt but missing from " - . "lwlocklist.h" - if $i < scalar @wait_event_lwlocks; +generate_lwlock_names(PG_LWLOCK, @wait_event_lwlocks); + +seek($lwlocklist, 0, 0) or die "Cannot seek to the beginning of lwlocklist.h"; -print $h "\n"; -printf $h "#define NUM_INDIVIDUAL_LWLOCKS %s\n", $lastlockidx + 1; +generate_lwlock_names(PG_READ_BIASED_LWLOCK, @wait_event_read_biased_lwlocks); close $h; diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 4c29016ce35..572802b46d8 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -156,6 +156,7 @@ StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0, */ static const char *const BuiltinTrancheNames[] = { #define PG_LWLOCK(id, lockname) [id] = CppAsString(lockname), +#define PG_READ_BIASED_LWLOCK(id, lockname) /* ignored */ #include "storage/lwlocklist.h" #undef PG_LWLOCK [LWTRANCHE_XACT_BUFFER] = "XactBuffer", @@ -199,6 +200,7 @@ static const char *const BuiltinTrancheNames[] = { [LWTRANCHE_XACT_SLRU] = "XactSLRU", [LWTRANCHE_PARALLEL_VACUUM_DSA] = "ParallelVacuumDSA", [LWTRANCHE_AIO_URING_COMPLETION] = "AioUringCompletion", + [LWTRANCHE_READ_BIASED] = "ReadBiased", }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == @@ -219,6 +221,7 @@ static int LWLockTrancheNamesAllocated = 0; * where we have special measures to pass it down). */ LWLockPadded *MainLWLockArray = NULL; +ReadBiasedLWLock *MainReadBiasedLWLockArray = NULL; /* * We use this structure to keep track of locked LWLocks for release @@ -464,6 +467,8 @@ LWLockShmemSize(void) /* Space for the LWLock array. 
*/ size = mul_size(numLocks, sizeof(LWLockPadded)); + size = add_size(size, mul_size(NUM_INDIVIDUAL_READ_BIASED_LWLOCKS, sizeof(ReadBiasedLWLock))); + /* Space for dynamic allocation counter, plus room for alignment. */ size = add_size(size, sizeof(int) + LWLOCK_PADDED_SIZE); @@ -489,7 +494,11 @@ CreateLWLocks(void) Size spaceLocks = LWLockShmemSize(); int *LWLockCounter; char *ptr; + int numLocks = NUM_FIXED_LWLOCKS; + int i; + ReadBiasedLWLock *readBiasedLock; + numLocks += NumLWLocksForNamedTranches(); /* Allocate space */ ptr = (char *) ShmemAlloc(spaceLocks); @@ -508,8 +517,16 @@ CreateLWLocks(void) LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED; + ptr += mul_size(numLocks, sizeof(LWLockPadded)); + + MainReadBiasedLWLockArray = (ReadBiasedLWLock *) ptr; /* Initialize all LWLocks */ InitializeLWLocks(); + /* Initialize all ReadBiasedLWLocks*/ + for (i = 0, readBiasedLock = MainReadBiasedLWLockArray; i < NUM_INDIVIDUAL_READ_BIASED_LWLOCKS; i++, readBiasedLock++) + { + ReadBiasedLWLockInitialize(readBiasedLock); + } } /* Register named extension LWLock tranches in the current process. */ @@ -2076,3 +2093,151 @@ LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode) } return false; } + +/* + * ReadBiasedLWLockInitialize - initialize a ReadBiasedLWLock; it's initially unlocked + */ +void +ReadBiasedLWLockInitialize(ReadBiasedLWLock *lock) +{ + for (int i = 0; i < READ_BIASED_LOCK_STATE_COUNT; i++) + { + LWLockInitialize(&lock->lwlocks[i].lock, LWTRANCHE_READ_BIASED); + } +} + +/* + * ReadBiasedLWLockAcquire - acquire a ReadBiasedLWLock in the specified mode + * + * If the lock is not available, sleep until it is. Returns true if the lock + * was available immediately, false if we had to sleep. 
+ */ +bool +ReadBiasedLWLockAcquire(ReadBiasedLWLock *lock, LWLockMode mode) +{ + PGPROC *proc = MyProc; + int i; + bool result = true; + + Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + if (proc == NULL) + elog(PANIC, "cannot acquire ReadBiasedLWLock without a PGPROC structure"); + + if (mode == LW_SHARED) + { + /* Acquire the shared lock */ + return LWLockAcquire(&lock->lwlocks[proc->pid % READ_BIASED_LOCK_STATE_COUNT].lock, mode); + } + + for (i = 0; i < READ_BIASED_LOCK_STATE_COUNT; i++) + { + result = LWLockAcquire(&lock->lwlocks[i].lock, mode) && result; + } + + return result; +} + +/* + * ReadBiasedLWLockConditionalAcquire - acquire a ReadBiasedLWLock in the specified mode + * + * If the lock is not available, return false with no side-effects. + */ +bool +ReadBiasedLWLockConditionalAcquire(ReadBiasedLWLock *lock, LWLockMode mode) +{ + PGPROC *proc = MyProc; + int i; + + Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + if (proc == NULL) + elog(PANIC, "cannot acquire ReadBiasedLWLock without a PGPROC structure"); + + if (mode == LW_SHARED) + { + /* Acquire the shared lock */ + return LWLockConditionalAcquire(&lock->lwlocks[proc->pid % READ_BIASED_LOCK_STATE_COUNT].lock, mode); + } + + for (i = 0; i < READ_BIASED_LOCK_STATE_COUNT; i++) + { + if (!LWLockConditionalAcquire(&lock->lwlocks[i].lock, mode)) + break; + } + + if (i == READ_BIASED_LOCK_STATE_COUNT) + return true; + + for (i = i - 1; i >= 0; i--) + { + LWLockRelease(&lock->lwlocks[i].lock); + } + + return false; +} + +/* + * ReadBiasedLWLockRelease - release a ReadBiasedLWLock + * + * If the lock was acquired in shared mode, it releases the shared lock. + * If it was acquired in exclusive mode, it releases all exclusive locks. 
+ */
+void
+ReadBiasedLWLockRelease(ReadBiasedLWLock *lock)
+{
+	PGPROC	   *proc = MyProc;
+	LWLockMode	mode;
+	uint32		lockstate;
+	int			i;
+
+	if (proc == NULL)
+		elog(PANIC, "cannot release ReadBiasedLWLock without a PGPROC structure");
+
+	lockstate = pg_atomic_read_u32(&lock->lwlocks[proc->pid % READ_BIASED_LOCK_STATE_COUNT].lock.state);
+
+	Assert((lockstate & LW_LOCK_MASK) != 0);
+	mode = (lockstate & LW_VAL_EXCLUSIVE) ? LW_EXCLUSIVE : LW_SHARED;
+	if (mode == LW_SHARED)
+	{
+		LWLockRelease(&lock->lwlocks[proc->pid % READ_BIASED_LOCK_STATE_COUNT].lock);
+		return;
+	}
+	else
+	{
+		for (i = READ_BIASED_LOCK_STATE_COUNT - 1; i >= 0; i--)
+		{
+			LWLockRelease(&lock->lwlocks[i].lock);
+		}
+	}
+}
+
+/*
+ * ReadBiasedLWLockHeldByMe - test whether my process holds a ReadBiasedLWLock
+ *
+ * This is meant as debug support only.
+ */
+bool ReadBiasedLWLockHeldByMe(ReadBiasedLWLock *lock)
+{
+	PGPROC	   *proc = MyProc;
+
+	if (proc == NULL)
+		elog(PANIC, "cannot check holder of ReadBiasedLWLock without a PGPROC structure");
+
+	return LWLockHeldByMe(&lock->lwlocks[proc->pid % READ_BIASED_LOCK_STATE_COUNT].lock);
+}
+
+/*
+ * ReadBiasedLWLockHeldByMeInMode - test whether my process holds a ReadBiasedLWLock in given mode
+ *
+ * This is meant as debug support only.
+ */ +bool ReadBiasedLWLockHeldByMeInMode(ReadBiasedLWLock *lock, LWLockMode mode) +{ + PGPROC *proc = MyProc; + + if (proc == NULL) + elog(PANIC, "cannot check holder of ReadBiasedLWLock without a PGPROC structure"); + + return LWLockHeldByMeInMode(&lock->lwlocks[proc->pid % READ_BIASED_LOCK_STATE_COUNT].lock, mode); +} diff --git a/src/backend/utils/activity/generate-wait_event_types.pl b/src/backend/utils/activity/generate-wait_event_types.pl index 424ad9f115d..5454e3ad1aa 100644 --- a/src/backend/utils/activity/generate-wait_event_types.pl +++ b/src/backend/utils/activity/generate-wait_event_types.pl @@ -184,6 +184,7 @@ if ($gen_code) if ( $waitclass eq 'WaitEventExtension' || $waitclass eq 'WaitEventInjectionPoint' || $waitclass eq 'WaitEventLWLock' + || $waitclass eq 'WaitEventReadBiasedLWLock' || $waitclass eq 'WaitEventLock'); my $last = $waitclass; diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 5d9e04d6823..d4c6bd5d0c9 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -427,3 +427,8 @@ advisory "Waiting to acquire an advisory user lock." applytransaction "Waiting to acquire a lock on a remote transaction being applied by a logical replication subscriber." # No "ABI_compatibility" region here as WaitEventLock has its own C code. + +Section: ClassName - WaitEventReadBiasedLWLock + +# END OF PREDEFINED READ_BIASED_LWLOCKS +# No "ABI_compatibility" region here as WaitEventReadBiasedLWLock has its own C code. 
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 08a72569ae5..e0c4e610fd0 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -172,6 +172,82 @@ extern int LWLockNewTrancheId(void); extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name); extern void LWLockInitialize(LWLock *lock, int tranche_id); +/* + * ReadBiasedLWLock - A scalable read-biased locking mechanism + * + * Motivation: + * On high-core-count systems, traditional LWLocks show significant contention + * for frequently accessed shared locks like ProcArrayLock. Profiling of HammerDB + * TPCC benchmark revealed: + * 1. 93.5% of ProcArrayLock acquires were in LW_SHARED mode + * 2. Cumulative shared lock latency was 37x higher than exclusive mode + * 3. LWLockAttemptLock/Release consumed ~25% of total CPU cycles on 384-vCPU systems + * + * The bottleneck occurs because: + * - All readers contend on a single lock state cache line + * - Atomic operations on shared locks cause true sharing overhead + * - High reader/writer ratio creates severe cache line ping-pong + * + * Solution: + * ReadBiasedLWLock partitions lock state across multiple cache lines to: + * 1. Distribute reader contention across READ_BIASED_LOCK_STATE_COUNT (16) LWLocks + * 2. Maintain writer exclusivity while optimizing for read-heavy workloads + * 3. Leverage process-local locking semantics (PID-based partitioning) + * + * Implementation Details: + * + * Structure: + * Comprises an array of READ_BIASED_LOCK_STATE_COUNT LWLockPadded elements. Each + * element occupies a separate cache line. 
+ * + * Reader Protocol (LW_SHARED mode): + * - Lock acquisition uses a single atomic operation per reader + * - Each reader acquires/releases only ONE specific lock: + * lock_index = MyProc->pid % READ_BIASED_LOCK_STATE_COUNT + * - Readers with different indices operate on separate cache lines + * - Readers sharing an index contend ONLY with same-index peers + * - Writers are blocked if ANY underlying lock is held (shared/exclusive) + * - New readers can acquire available indices while writer waits + * + * Writer Protocol (LW_EXCLUSIVE mode): + * - Writers must acquire ALL underlying LWLocks in exact order (0..15) + * - If any LWLock is held shared: + * a) ReadBiasedLWLockAcquire waits for ALL shared holders to release + * b) ReadBiasedLWLockConditionalAcquire releases acquired locks and fails + * - Release happens in reverse acquisition order (15..0) to prevent deadlocks + * + * Design Trade-offs: + * - Memory Overhead: 16x larger than standard LWLock + * - Writer Penalty: Exclusive acquisition requires 16 atomic operations + * - Process Binding: Requires valid MyProc (PID-based partitioning) + * - Ideal For: Read-dominated locks + * + * Usage Notes: + * 1. Initialize with ReadBiasedLWLockInitialize() + * 2. Acquire with ReadBiasedLWLockAcquire()/ReadBiasedLWLockConditionalAcquire() + * 3. Release with ReadBiasedLWLockRelease() + * 4. 
Check ownership with ReadBiasedLWLockHeldByMe() variants + * + * Critical Implementation Constraints: + * - Writers MUST release in reverse acquisition order + * - Conditional acquisition MUST clean up partially acquired locks + * - PID modulo operation assumes stable MyProc->pid during lock hold + */ +#define READ_BIASED_LOCK_STATE_COUNT 16 +typedef struct ReadBiasedLWLock +{ + LWLockPadded lwlocks[READ_BIASED_LOCK_STATE_COUNT]; +} ReadBiasedLWLock; + +extern PGDLLIMPORT ReadBiasedLWLock *MainReadBiasedLWLockArray; + +extern void ReadBiasedLWLockInitialize(ReadBiasedLWLock *lock); +extern bool ReadBiasedLWLockAcquire(ReadBiasedLWLock *lock, LWLockMode mode); +extern bool ReadBiasedLWLockConditionalAcquire(ReadBiasedLWLock *lock, LWLockMode mode); +extern void ReadBiasedLWLockRelease(ReadBiasedLWLock *lock); +extern bool ReadBiasedLWLockHeldByMe(ReadBiasedLWLock *lock); +extern bool ReadBiasedLWLockHeldByMeInMode(ReadBiasedLWLock *lock, LWLockMode mode); + /* * Every tranche ID less than NUM_INDIVIDUAL_LWLOCKS is reserved; also, * we reserve additional tranche IDs for builtin tranches not included in @@ -221,6 +297,7 @@ typedef enum BuiltinTrancheIds LWTRANCHE_XACT_SLRU, LWTRANCHE_PARALLEL_VACUUM_DSA, LWTRANCHE_AIO_URING_COMPLETION, + LWTRANCHE_READ_BIASED, LWTRANCHE_FIRST_USER_DEFINED, } BuiltinTrancheIds; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 83228cfca29..dcbfcb299d8 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -178,18 +178,19 @@ select name, setting from pg_settings where name like 'enable%'; -- may be present or absent, depending on history since last postmaster start. 
select type, count(*) > 0 as ok FROM pg_wait_events where type <> 'InjectionPoint' group by type order by type COLLATE "C"; - type | ok ------------+---- - Activity | t - BufferPin | t - Client | t - Extension | t - IO | t - IPC | t - LWLock | t - Lock | t - Timeout | t -(9 rows) + type | ok +------------------+---- + Activity | t + BufferPin | t + Client | t + Extension | t + IO | t + IPC | t + LWLock | t + Lock | t + ReadBiasedLWLock | t + Timeout | t +(10 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail -- 2.43.0