From d59e12b13cc06807ff622673acbfd681dc244afd Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Mon, 27 Apr 2026 16:54:36 +0200 Subject: [PATCH v2 3/3] Randomize nbtree split location to avoid oscillating patterns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The way nbtree page split works can lead to the same split location chosen over and over under certain workloads. To simplify it, as long as the data to be ingested follows the same distribution as already existing data, in particular it's true for an empty tree. According to [1] (and some one-off experiments) this could lead to the number of splits following an oscillating pattern, meaning some intrinsic variability in performance. A possible workaround is to introduce a random shift to the interval for searching the best split location as suggested in [1]. This brings some randomization, while keeping the suffix truncation benefits. The whitepaper mentioned above recommends range of 20%, so we stick with this value for the shift. [1]: Glombiewski N., Seeger B., Graefe G. (2019). Waves of Misery After Index Creation. BTW 2019. Gesellschaft für Informatik. doi:10.18420/btw2019-06 --- src/backend/access/nbtree/nbtsplitloc.c | 50 ++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index c64fef5ab27..f61d55346fc 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -18,6 +18,7 @@ #include "access/tableam.h" #include "common/int.h" #include "pg_trace.h" +#include "common/pg_prng.h" typedef enum { @@ -778,7 +779,8 @@ _bt_adjacenthtid(const ItemPointerData *lowhtid, const ItemPointerData *highhtid * points that fall within current/final split interval. Penalty is an * abstract score, with a definition that varies depending on whether we're * splitting a leaf page or an internal page. See _bt_split_penalty() for - * details. + * details. The range of candidate split points is shifted by a random delta to + * the right to spread the choice of split point among equally good candidates. * * "perfectpenalty" is assumed to be the lowest possible penalty among * candidate split points. This allows us to return early without wasting @@ -797,11 +799,35 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, int bestpenalty, lowsplit; int highsplit = Min(state->interval, state->nsplits); + int rand_offset = 0; SplitPoint *final; bestpenalty = INT_MAX; lowsplit = 0; - for (int i = lowsplit; i < highsplit; i++) + + /* + * There are workloads, where we would find the same best split location + * over and over, even with the suffix truncation introducing some + * variability. According to [1] this leads to the number of splits + * following oscillating pattern, and the easiest workaround is to + * introduce some randomness in chosing split location. + * + * To achieve that add a random shift to the search interval, corresponding + * to the 20% of the all possible split locations. For the sake of not + * degrading suffix truncation performance, search [0, rand shift) interval + * first, in case if it contains the only available best split point. Then + * search the main interval [rand shift, highsplit + rand shift), and pick + * up a split point from it if found. In the best case (a single column + * unique index) this leads to a single iteration of the first loop, then a + * random jump and a single iteration of the second loop. + * + * [1]: Glombiewski N., Seeger B., Graefe G. (2019). Waves of Misery After + * Index Creation. BTW 2019. Gesellschaft für Informatik. doi:10.18420/btw2019-06 + */ + rand_offset = pg_prng_uint64_range( + &pg_global_prng_state, 0, state->nsplits * 0.2); + + for (int i = lowsplit; i < rand_offset; i++) { int penalty; @@ -817,6 +843,26 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, break; } + for (int i = lowsplit + rand_offset; i < highsplit + rand_offset; i++) + { + int penalty; + + penalty = _bt_split_penalty(state, state->splits + i); + + /* + * We have already updated best penalty in the previous loop, so it's + * important to search for lower or equal penalties here. + */ + if (penalty <= bestpenalty) + { + bestpenalty = penalty; + lowsplit = i; + } + + if (penalty <= perfectpenalty) + break; + } + final = &state->splits[lowsplit]; /* -- 2.52.0