From 4fb02b26be463b9686bcef4f7bc2caeba2d48220 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Tue, 12 Aug 2025 17:20:55 -0400 Subject: [PATCH v13 4/4] Optimize modulo and division used in clock-sweep algorithm Improve the performance of the buffer manager by replacing the modulo and division operations with more pipeline friendly implementations. When the size of the clock (NBuffers) is a power-of-two we can simply bitwise and with a pre-computed mask (NBuffers - 1) to get the modulo (4 instructions, ~3-4 cycles) or bitshift (log2(NBuffres)) for division. When it isn't we can replace modulo and division using a 64-bit multiplication by the inverse of NBuffers and a right shift as described in the paper "Division by Invariant Integers using Multiplication" (4 instructions, ~8-12 cycles) and then do a bit of accounting for the remainder. In either case the branch prediction should be nearly 100% given that NBuffers never changes at runtime. In comparison a modulo operation translates into IDIV and the code would require ~26-90 cycles. Switching to these methods should use common bitshift and ALU operations that don't block the pipeline and have better instruction level parallelism. [1] https://gmplib.org/~tege/divcnst-pldi94.pdf --- src/backend/storage/buffer/freelist.c | 114 ++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 7 deletions(-) diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 7d68f2227b3..65f3a6eb5a5 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -15,6 +15,8 @@ */ #include "postgres.h" +#include + #include "pgstat.h" #include "port/atomics.h" #include "storage/buf_internals.h" @@ -31,12 +33,24 @@ typedef struct { /* * The clock-sweep counter is atomically updated by 1 at every tick. Use - * the macro CLOCKSWEEP_HAND() to find the location of the hand on the - * clock. Use CLOCKSWEEP_PASSES() to calculate the number of times the + * the function ClockSweepHand() to find the location of the hand on the + * clock. Use ClockSweepPasses() to calculate the number of times the * clock-sweep hand has made a complete pass around the clock. */ pg_atomic_uint64 clockSweepCounter; + /* + * Modulo can be expensive to calculate repeatedly, so we implement two + * strategies to avoid it. When NBuffers is a power-of-2 we can replace + * modulo with a bit shift, when it is not we implement a more pipeline + * friendly modulo using "Division by invariant Integers using + * Multiplication" (https://gmplib.org/~tege/divcnst-pldi94.pdf). + */ + uint64 mask; + uint64 multiplier; + uint32 shift; + bool pow2; + /* * Statistics. These counters should be wide enough that they can't * overflow during a single bgwriter cycle. @@ -86,17 +100,81 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy, static void AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf); +static inline uint32 +InvariantDivision(uint64 n) +{ + uint32 divisor = NBuffers; + + /* Compute quotient using multiplication */ + uint64 product = n * StrategyControl->multiplier; + uint32 quotient = (uint32) (product >> 32); + + /* + * The invariant multiplication gives us an approximation that may be off + * by 1. Check if we need to adjust upward. + */ + uint32 remainder = n - quotient * divisor; + + if (remainder >= NBuffers) + quotient++; + + return quotient; +} + +static inline uint32 +InvariantModulo(uint64 n) +{ + uint32 quotient = (uint32) ((n * StrategyControl->multiplier) >> 32); + uint32 remainder = (uint32) (n - (uint64) quotient * NBuffers); + + if (remainder >= NBuffers) + remainder -= NBuffers; + + return remainder; +} + /* * The clock-sweep counter is a uint64 but the clock hand can never be larger * than a uint32. Enforce that contract uniformly using this macro. */ -#define CLOCKSWEEP_HAND(counter) \ ((uint32) (counter)) % NBuffers +static inline uint32 +ClockSweepHand(uint64 counter) +{ + uint32 result; + + if (StrategyControl->pow2) + /* Power of 2: use mask */ + result = counter & StrategyControl->mask; + else + /* Non-power of 2: use magic modulo */ + result = InvariantModulo(counter); + + Assert(result < NBuffers); + Assert(result == (uint32) (counter % NBuffers)); + + return result; +} /* * The number of times the clock hand has made a complete pass around the clock * visiting all the available buffers is the counter divided by NBuffers. */ -#define CLOCKSWEEP_PASSES(counter) \ (uint32) ((counter) / NBuffers) +static inline uint32 +ClockSweepPasses(uint64 counter) +{ + uint32 result; + + if (StrategyControl->pow2) + /* Power of 2: use shift */ + result = counter >> StrategyControl->shift; + else + /* Non-power of 2: use magic modulo */ + result = InvariantDivision(counter); + + Assert(result == (uint32) (counter / NBuffers)); + + return result; +} /* * ClockSweepTick - Helper routine for StrategyGetBuffer() @@ -117,7 +195,7 @@ ClockSweepTick(void) */ counter = pg_atomic_fetch_add_u64(&StrategyControl->clockSweepCounter, 1); - hand = CLOCKSWEEP_HAND(counter); + hand = ClockSweepHand(counter); Assert(hand < NBuffers); return hand; @@ -251,10 +329,10 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) uint32 result; counter = pg_atomic_read_u64(&StrategyControl->clockSweepCounter); - result = CLOCKSWEEP_HAND(counter); + result = ClockSweepHand(counter); if (complete_passes) - *complete_passes = CLOCKSWEEP_PASSES(counter); + *complete_passes = ClockSweepPasses(counter); if (num_buf_alloc) *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0); @@ -341,6 +419,27 @@ StrategyInitialize(bool init) /* Initialize combined clock-sweep pointer/complete passes counter */ pg_atomic_init_u64(&StrategyControl->clockSweepCounter, 0); + if ((NBuffers & (NBuffers - 1)) == 0) + { + /* NBuffers is a power-of-2. */ + StrategyControl->pow2 = true; + StrategyControl->mask = NBuffers - 1; + StrategyControl->shift = log2(NBuffers); + StrategyControl->multiplier = 0; + } + else + { + StrategyControl->pow2 = false; + StrategyControl->mask = 0; + StrategyControl->shift = 0; + + /* + * Calculate the invariant constant for later using ceil() + * division. + */ + StrategyControl->multiplier = ((1ULL << 32) + NBuffers - 1) / NBuffers; + } + /* Clear statistics */ pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0); -- 2.49.0