From 013cda0a1edb18f4315b6ce16f1cd372188ba399 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Wed, 25 Sep 2019 10:08:53 -0700
Subject: [PATCH v28 1/3] Add deduplication to nbtree.

Deduplication reduces the storage overhead of duplicates in indexes that use the standard nbtree index access method. The deduplication process is applied lazily, after the point where opportunistic deletion of LP_DEAD-marked index tuples occurs. Deduplication is applied only at the point where a leaf page split will be required unless deduplication can free up enough space. New "posting list tuples" are formed by merging together existing duplicate tuples. The physical representation of the items on an nbtree leaf page is made more space efficient by deduplication, but the logical contents of the page are not changed.

Testing has shown that nbtree deduplication can generally make indexes with about 10 or 15 tuples for each distinct key value about 2.5X - 4X smaller, even with single-column integer indexes (e.g., an index on a referencing column that accompanies a foreign key). Much larger reductions in index size are possible in less common cases, where individual index tuple keys happen to be large. The final size of single-column nbtree indexes comes close to the final size of a similar contrib/btree_gin index, at least in cases where GIN's posting list compression isn't very effective. HEIKKI: seems obvious that the gain can be even better. No need to sell the feature in the commit message.

The lazy approach taken by nbtree has significant advantages over a GIN-style eager approach. Most individual inserts of index tuples have exactly the same overhead as before. The extra overhead of deduplication is amortized across insertions, just like the overhead of page splits. The "key space" of indexes works in the same way as it has since commit dd299df8 (the commit which made heap TID a tiebreaker column), since only the physical representation of tuples is changed. Furthermore, deduplication can be turned on or off as needed, or applied HEIKKI: When would it be needed? selectively when required.

The split point choice logic doesn't need to be changed, since posting list tuples are just tuples with payload, much like tuples with non-key columns in INCLUDE indexes. (nbtsplitloc.c is still optimized to make intelligent choices in the presence of posting list tuples, though only because suffix truncation will routinely make new high keys far smaller than the non-pivot tuples they're derived from).

Unique indexes can also make use of deduplication, though the strategy used has significant differences. The high-level goal is to entirely HEIKKI: why "entirely"? It's good to avoid pages, even if you can't eliminate them entirely, right? prevent "unnecessary" page splits -- splits caused only by a short-term burst of index tuple versions. This is often a concern with frequently updated tables where UPDATEs always modify at least one indexed column (making it impossible for the table AM to use an optimization such as heapam's heap-only tuples (HOT)). Deduplication in unique indexes effectively "buys time" for existing nbtree garbage collection mechanisms to run and prevent these page splits (the LP_DEAD bit setting performed during the uniqueness check is the most important mechanism for controlling bloat with affected workloads). HEIKKI: Mention that even a unique index can have duplicates, as long as they're not visible to same snapshot.
That's not immediately obvious, and if you don't realize that, deduplicating a unique index seems like an oxymoron. HEIKKI: How do LP_DEAD work on posting list tuples? Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed. Author: Anastasia Lubennikova, Peter Geoghegan Reviewed-By: Peter Geoghegan Discussion: https://postgr.es/m/55E4051B.7020209@postgrespro.ru https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru --- src/include/access/itup.h | 5 + src/include/access/nbtree.h | 413 ++++++++-- src/include/access/nbtxlog.h | 96 ++- src/include/access/rmgrlist.h | 2 +- src/backend/access/common/reloptions.c | 9 + src/backend/access/index/genam.c | 4 + src/backend/access/nbtree/Makefile | 1 + src/backend/access/nbtree/README | 151 +++- src/backend/access/nbtree/nbtdedup.c | 738 ++++++++++++++++++ src/backend/access/nbtree/nbtinsert.c | 342 +++++++- src/backend/access/nbtree/nbtpage.c | 227 +++++- src/backend/access/nbtree/nbtree.c | 180 ++++- src/backend/access/nbtree/nbtsearch.c | 271 ++++++- src/backend/access/nbtree/nbtsort.c | 202 ++++- src/backend/access/nbtree/nbtsplitloc.c | 39 +- src/backend/access/nbtree/nbtutils.c | 224 +++++- src/backend/access/nbtree/nbtxlog.c | 201 ++++- src/backend/access/rmgrdesc/nbtdesc.c | 23 +- src/backend/storage/page/bufpage.c | 9 +- src/backend/utils/misc/guc.c | 10 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/bin/psql/tab-complete.c | 4 +- contrib/amcheck/verify_nbtree.c | 234 +++++- doc/src/sgml/btree.sgml | 115 ++- doc/src/sgml/charset.sgml | 9 +- doc/src/sgml/config.sgml | 25 + doc/src/sgml/ref/create_index.sgml | 37 +- src/test/regress/expected/btree_index.out | 16 + src/test/regress/sql/btree_index.sql | 17 + 29 files changed, 3306 insertions(+), 299 deletions(-) create mode 100644 src/backend/access/nbtree/nbtdedup.c diff --git a/src/include/access/itup.h b/src/include/access/itup.h index b9c41d3455..223f2e7eff 100644 --- a/src/include/access/itup.h +++ b/src/include/access/itup.h @@ -141,6 +141,11 @@ typedef IndexAttributeBitMapData * IndexAttributeBitMap; * On such a page, N tuples could take one MAXALIGN quantum less space than * estimated here, seemingly allowing one more tuple than estimated here. * But such a page always has at least MAXALIGN special space, so we're safe. + * + * Note: MaxIndexTuplesPerPage is a limit on the number of tuples on a page + * that consume a line pointer -- "physical" tuples. Some index AMs can store + * a greater number of "logical" tuples, though (e.g., btree leaf pages with + * posting list tuples). */ #define MaxIndexTuplesPerPage \ ((int) ((BLCKSZ - SizeOfPageHeaderData) / \ HEIKKI: I'm not a fan of the name "logical tuples". Maybe call them TIDs or heap tuple pointers or something? diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index f90ee3a0e0..6fae1ec079 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -24,6 +24,9 @@ #include "storage/bufmgr.h" #include "storage/shm_toc.h" +/* GUC parameter */ +extern bool btree_deduplication; + HEIKKI: Not a fan of this name. deduplicate_btree_items? /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ typedef uint16 BTCycleId; @@ -108,4 +111,5 @@ typedef struct BTMetaPageData * pages */ float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples * during last cleanup */ + bool btm_safededup; /* deduplication known to be safe? */ } BTMetaPageData; HEIKKI: When is it not safe? 
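For context on the question above: merging equal tuples discards all but one copy of the key, so deduplication is only safe when any two keys that compare as equal are also interchangeable byte for byte. The patch exposes that per-index property via _bt_opclasses_support_dedup() (declared further down in nbtree.h) and caches the result in btm_safededup. The sketch below only illustrates the kind of per-column test such a check has to make; the concrete cases shown (numeric display scale, nondeterministic collations) are assumptions about likely problem types, not rules taken from this patch.

/*
 * Illustrative sketch only -- not code from the patch.  Assumes
 * catalog/pg_type_d.h and utils/lsyscache.h are included.
 *
 * Equality has to imply byte-for-byte interchangeability for deduplication
 * to be safe.  That fails for e.g. numeric ('1.0' = '1.00' despite different
 * display scale) and for text under a nondeterministic collation
 * ('foo' = 'FOO' case-insensitively).
 */
static bool
column_allows_dedup_sketch(Relation index, AttrNumber attno)
{
	Form_pg_attribute att = TupleDescAttr(RelationGetDescr(index), attno - 1);

	if (att->atttypid == NUMERICOID)
		return false;			/* equal values, different bytes */

	if (OidIsValid(att->attcollation) &&
		!get_collation_isdeterministic(att->attcollation))
		return false;			/* equal strings, different bytes */

	return true;				/* assume byte-image equality otherwise */
}

A real check along the lines of _bt_opclasses_support_dedup() would presumably loop over IndexRelationGetNumberOfKeyAttributes(index) columns and require every one of them to pass.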
#define BTPageGetMeta(p) \ @@ -115,7 +119,8 @@ typedef struct BTMetaPageData /* * The current Btree version is 4. That's what you'll get when you create - * a new index. + * a new index. The btm_safededup field can only be set if this happened + * on Postgres 13, but it's safe to read with version 3 indexes. * HEIKKI: Why is it safe to read on version 3 indexes? Because unused space is set to zeros? HEIKKI: Do we need it as a separate flag, isn't it always safe with version 4 indexes, and never with version 3? * Btree version 3 was used in PostgreSQL v11. It is mostly the same as * version 4, but heap TIDs were not part of the keyspace. Index tuples @@ -132,5 +137,5 @@ typedef struct BTMetaPageData #define BTREE_METAPAGE 0 /* first page is meta */ #define BTREE_MAGIC 0x053162 /* magic number in metapage */ #define BTREE_VERSION 4 /* current version number */ -#define BTREE_MIN_VERSION 2 /* minimal supported version number */ -#define BTREE_NOVAC_VERSION 3 /* minimal version with all meta fields */ +#define BTREE_MIN_VERSION 2 /* minimum supported version */ +#define BTREE_NOVAC_VERSION 3 /* version with all meta fields set */ HEIKKI: I like these comments tweaks, regardless of the rest of the patch. Commit separately? /* * Maximum size of a btree index entry, including its tuple header. @@ -156,9 +161,30 @@ typedef struct BTMetaPageData MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ MAXALIGN(sizeof(BTPageOpaqueData))) / 3) +/* + * MaxBTreeIndexTuplesPerPage is an upper bound on the number of "logical" + * tuples that may be stored on a btree leaf page. This is comparable to + * the generic/physical MaxIndexTuplesPerPage upper bound. A separate + * upper bound is needed in certain contexts due to posting list tuples, + * which only use a single physical page entry to store many logical + * tuples. (MaxBTreeIndexTuplesPerPage is used to size the per-page + * temporary buffers used by index scans.) + * + * Note: we don't bother considering per-physical-tuple overheads here to + * keep things simple (value is based on how many elements a single array + * of heap TIDs must have to fill the space between the page header and + * special area). The value is slightly higher (i.e. more conservative) + * than necessary as a result, which is considered acceptable. There will + * only be three (very large) physical posting list tuples in leaf pages + * that have the largest possible number of heap TIDs/logical tuples. + */ +#define MaxBTreeIndexTuplesPerPage \ + (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \ + sizeof(ItemPointerData)) + HEIKKI: I find this name confusing. There's "IndexTuples" in the name, which makes me think of the IndexTuple struct. But this is explicitly *not* about the number of IndexTuples that fit on a page. HEIKKI: Maybe "MaxTIDsPerBtreePage" or something ? /* * The leaf-page fillfactor defaults to 90% but is user-adjustable. * For pages above the leaf level, we use a fixed 70% fillfactor. @@ -230,16 +256,15 @@ typedef struct BTMetaPageData * tuples (non-pivot tuples). _bt_check_natts() enforces the rules * described here. * - * Non-pivot tuple format: + * Non-pivot tuple format (plain/non-posting variant): * * t_tid | t_info | key values | INCLUDE columns, if any * * t_tid points to the heap TID, which is a tiebreaker key column as of - * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never - * set for non-pivot tuples. + * BTREE_VERSION 4. 
* - * All other types of index tuples ("pivot" tuples) only have key columns, - * since pivot tuples only exist to represent how the key space is + * Non-pivot tuples complement pivot tuples, which only have key columns. HEIKKI: What does it mean that they complement pivot tuples? + * The sole purpose of pivot tuples is to represent how the key space is * separated. In general, any B-Tree index that has more than one level * (i.e. any index that does not just consist of a metapage and a single * leaf root page) must have some number of pivot tuples, since pivot @@ -263,9 +288,9 @@ typedef struct BTMetaPageData * offset field only stores the number of columns/attributes when the * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap * TID column sometimes stored in pivot tuples -- that's represented by - * the presence of BT_HEAP_TID_ATTR. The INDEX_ALT_TID_MASK bit in t_info - * is always set on BTREE_VERSION 4. BT_HEAP_TID_ATTR can only be set on - * BTREE_VERSION 4. + * the presence of BT_PIVOT_HEAP_TID_ATTR. The INDEX_ALT_TID_MASK bit in HEIKKI: That renaming seems useful, regardless of the rest of the patch + * t_info is always set on BTREE_VERSION 4. BT_PIVOT_HEAP_TID_ATTR can + * only be set on BTREE_VERSION 4. * * In version 3 indexes, the INDEX_ALT_TID_MASK flag might not be set in * pivot tuples. In that case, the number of key columns is implicitly @@ -283,12 +308,37 @@ typedef struct BTMetaPageData * future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any * number of columns/attributes <= INDEX_MAX_KEYS. * + * Sometimes non-pivot tuples also use a representation that repurposes + * t_tid to store metadata rather than a TID. Postgres 13 introduced a new + * non-pivot tuple format to support deduplication: posting list tuples. + * Deduplication merges together multiple equal non-pivot tuples into a + * logically equivalent, space efficient representation. A posting list is + * an array of ItemPointerData elements. Regular non-pivot tuples are + * merged together to form posting list tuples lazily, at the point where + * we'd otherwise have to split a leaf page. + * + * Posting tuple format (alternative non-pivot tuple representation): + * + * t_tid | t_info | key values | posting list (TID array) + * + * Posting list tuples are recognized as such by having the + * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status + * bit set in t_tid. These flags redefine the content of the posting + * tuple's t_tid to store an offset to the posting list, as well as the + * total number of posting list array elements. + * + * The 12 least significant offset bits from t_tid are used to represent + * the number of posting items present in the tuple, leaving 4 status + * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for + * future use. Like any non-pivot tuple, the number of columns stored is + * always implicitly the total number in the index (in practice there can + * never be non-key columns stored, since deduplication is not supported + * with INCLUDE indexes). + * * Note well: The macros that deal with the number of attributes in tuples - * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple, - * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot - * tuple (or must have the same number of attributes as the index has - * generally in the case of !heapkeyspace indexes). They will need to be - * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK - * for something else. 
+ * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or + * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set + * must be a non-pivot tuple (or must have the same number of attributes as + * the index has generally in the case of !heapkeyspace indexes). */ #define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT HEIKKI: I must say, this B-tree index tuple format has become really complicated. I don't like it, but I'm not sure what to do about it. HEIKKI: I know there are backwards-compatibility reasons for why it's the way it is, but still.. /* Item pointer offset bits */ #define BT_RESERVED_OFFSET_MASK 0xF000 #define BT_N_KEYS_OFFSET_MASK 0x0FFF -#define BT_HEAP_TID_ATTR 0x1000 - -/* Get/set downlink block number in pivot tuple */ -#define BTreeTupleGetDownLink(itup) \ - ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) -#define BTreeTupleSetDownLink(itup, blkno) \ - ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)) +#define BT_N_POSTING_OFFSET_MASK 0x0FFF +#define BT_PIVOT_HEAP_TID_ATTR 0x1000 +#define BT_IS_POSTING 0x2000 /* - * Get/set leaf page highkey's link. During the second phase of deletion, the - * target leaf page's high key may point to an ancestor page (at all other - * times, the leaf level high key's link is not used). See the nbtree README - * for full details. + * Note: BTreeTupleIsPivot() can have false negatives (but not false + * positives) when used with !heapkeyspace indexes */ -#define BTreeTupleGetTopParent(itup) \ - ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) -#define BTreeTupleSetTopParent(itup, blkno) \ - do { \ - ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)); \ - BTreeTupleSetNAtts((itup), 0); \ - } while(0) +static inline bool +BTreeTupleIsPivot(IndexTuple itup) +{ + if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + return false; + /* absence of BT_IS_POSTING in offset number indicates pivot tuple */ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0) + return false; + + return true; +} + +static inline bool +BTreeTupleIsPosting(IndexTuple itup) +{ + if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + return false; + /* presence of BT_IS_POSTING in offset number indicates posting tuple */ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0) + return false; + + return true; +} + +static inline void +BTreeTupleSetPosting(IndexTuple itup, int nhtids, int postingoffset) +{ + Assert(nhtids > 1 && (nhtids & BT_N_POSTING_OFFSET_MASK) == nhtids); + Assert(postingoffset == MAXALIGN(postingoffset)); + Assert(postingoffset < INDEX_SIZE_MASK); + + itup->t_info |= INDEX_ALT_TID_MASK; + ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING)); + ItemPointerSetBlockNumber(&itup->t_tid, postingoffset); +} + +static inline uint16 +BTreeTupleGetNPosting(IndexTuple posting) +{ + OffsetNumber existing; + + Assert(BTreeTupleIsPosting(posting)); + + existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid); + return (existing & BT_N_POSTING_OFFSET_MASK); +} + +static inline uint32 +BTreeTupleGetPostingOffset(IndexTuple posting) +{ + Assert(BTreeTupleIsPosting(posting)); + + return ItemPointerGetBlockNumberNoCheck(&posting->t_tid); +} + +static inline ItemPointer +BTreeTupleGetPosting(IndexTuple posting) +{ + return (ItemPointer) ((char *) posting + + BTreeTupleGetPostingOffset(posting)); +} + +static inline ItemPointer +BTreeTupleGetPostingN(IndexTuple posting, int n) +{ + return BTreeTupleGetPosting(posting) + n; +} /* - * Get/set number of attributes within B-tree 
index tuple. + * Get/set downlink block number in pivot tuple. + * + * Note: Cannot assert that tuple is a pivot tuple. If we did so then + * !heapkeyspace indexes would exhibit false positive assertion failures. + */ +static inline BlockNumber +BTreeTupleGetDownLink(IndexTuple pivot) +{ + return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid); +} + +static inline void +BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno) +{ + ItemPointerSetBlockNumber(&pivot->t_tid, blkno); +} + +/* + * Get number of attributes within tuple. * * Note that this does not include an implicit tiebreaker heap TID * attribute, if any. Note also that the number of key attributes must be * explicitly represented in all heapkeyspace pivot tuples. + * + * Note: This is defined at a macro rather than an inline function to HEIKKI: typo, should be "as a macro" + * avoid including rel.h. */ #define BTreeTupleGetNAtts(itup, rel) \ ( \ - (itup)->t_info & INDEX_ALT_TID_MASK ? \ + (BTreeTupleIsPivot(itup)) ? \ ( \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \ ) \ : \ IndexRelationGetNumberOfAttributes(rel) \ ) -#define BTreeTupleSetNAtts(itup, n) \ - do { \ - (itup)->t_info |= INDEX_ALT_TID_MASK; \ - ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \ - } while(0) /* - * Get tiebreaker heap TID attribute, if any. Macro works with both pivot - * and non-pivot tuples, despite differences in how heap TID is represented. + * Set number of attributes in tuple, making it into a pivot tuple */ -#define BTreeTupleGetHeapTID(itup) \ - ( \ - (itup)->t_info & INDEX_ALT_TID_MASK && \ - (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \ - ( \ - (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \ - sizeof(ItemPointerData)) \ - ) \ - : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \ - ) +static inline void +BTreeTupleSetNAtts(IndexTuple itup, int natts) +{ + Assert(natts <= INDEX_MAX_KEYS); + + itup->t_info |= INDEX_ALT_TID_MASK; + /* BT_IS_POSTING bit may be unset -- tuple always becomes a pivot tuple */ + ItemPointerSetOffsetNumber(&itup->t_tid, natts); + Assert(BTreeTupleIsPivot(itup)); +} + /* - * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK - * representation (currently limited to pivot tuples) + * Set the bit indicating heap TID attribute present in pivot tuple */ -#define BTreeTupleSetAltHeapTID(itup) \ - do { \ - Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ - ItemPointerSetOffsetNumber(&(itup)->t_tid, \ - ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \ - } while(0) +static inline void +BTreeTupleSetAltHeapTID(IndexTuple pivot) +{ + OffsetNumber existing; + + Assert(BTreeTupleIsPivot(pivot)); + + existing = ItemPointerGetOffsetNumberNoCheck(&pivot->t_tid); + ItemPointerSetOffsetNumber(&pivot->t_tid, + existing | BT_PIVOT_HEAP_TID_ATTR); +} + +/* + * Get/set leaf page's "top parent" link from its high key. Used during page + * deletion. + * + * Note: Cannot assert that tuple is a pivot tuple. If we did so then + * !heapkeyspace indexes would exhibit false positive assertion failures. 
+ */ +static inline BlockNumber +BTreeTupleGetTopParent(IndexTuple leafhikey) +{ + return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid); +} + +static inline void +BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno) +{ + ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno); + BTreeTupleSetNAtts(leafhikey, 0); +} + +/* + * Get tiebreaker heap TID attribute, if any. + * + * This returns the first/lowest heap TID in the case of a posting list tuple. + */ +static inline ItemPointer +BTreeTupleGetHeapTID(IndexTuple itup) +{ + if (BTreeTupleIsPivot(itup)) + { + /* Pivot tuple heap TID representation? */ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_PIVOT_HEAP_TID_ATTR) != 0) + return (ItemPointer) ((char *) itup + IndexTupleSize(itup) - + sizeof(ItemPointerData)); + + /* Heap TID attribute was truncated */ + return NULL; + } + else if (BTreeTupleIsPosting(itup)) + return BTreeTupleGetPosting(itup); + + return &itup->t_tid; +} + +/* + * Get maximum heap TID attribute, which could be the only TID in the case of + * a non-pivot tuple that does not have a posting list tuple. + * + * Works with non-pivot tuples only. + */ +static inline ItemPointer +BTreeTupleGetMaxHeapTID(IndexTuple itup) +{ + Assert(!BTreeTupleIsPivot(itup)); + + if (BTreeTupleIsPosting(itup)) + { + uint16 nposting = BTreeTupleGetNPosting(itup); + + return BTreeTupleGetPostingN(itup, nposting - 1); + } + + return &itup->t_tid; +} /* * Operator strategy numbers for B-tree have been moved to access/stratnum.h, @@ -435,6 +625,11 @@ typedef BTStackData *BTStack; * indexes whose version is >= version 4. It's convenient to keep this close * by, rather than accessing the metapage repeatedly. * + * safededup is set to indicate that index may use dynamic deduplication + * safely (index storage parameter separately indicates if deduplication is HEIKKI: Is there really an "index storage parameter" for that? What is that, something in the WITH clause? + * currently in use). This is also a property of the index relation rather + * than an indexscan that is kept around for convenience. + * * anynullkeys indicates if any of the keys had NULL value when scankey was * built from index tuple (note that already-truncated tuple key attributes * set NULL as a placeholder key value, which also affects value of @@ -470,6 +665,7 @@ typedef BTStackData *BTStack; typedef struct BTScanInsertData { bool heapkeyspace; + bool safededup; bool anynullkeys; bool nextkey; bool pivotsearch; @@ -508,10 +704,60 @@ typedef struct BTInsertStateData bool bounds_valid; OffsetNumber low; OffsetNumber stricthigh; + + /* + * if _bt_binsrch_insert found the location inside existing posting list, + * save the position inside the list. -1 sentinel value indicates overlap + * with an existing posting list tuple that has its LP_DEAD bit set. + */ + int postingoff; } BTInsertStateData; typedef BTInsertStateData *BTInsertState; +/* + * BTDedupStateData is a working area used during deduplication. + * + * The status info fields track the state of a whole-page deduplication pass. + * State about the current pending posting list is also tracked. + * + * A pending posting list is comprised of a contiguous group of equal physical + * items from the page, starting from page offset number 'baseoff'. This is + * the offset number of the "base" tuple for new posting list. 'nitems' is + * the current total number of existing items from the page that will be + * merged to make a new posting list tuple, including the base tuple item. 
+ * (Existing physical items may themselves be posting list tuples, or regular + * non-pivot tuples.) + * + * Note that when deduplication merges together existing physical tuples, the + * page is modified eagerly. This makes tracking the details of more than a + * single pending posting list at a time unnecessary. The total size of the + * existing tuples to be freed when pending posting list is processed gets + * tracked by 'phystupsize'. This information allows deduplication to + * calculate the space saving for each new posting list tuple, and for the + * entire pass over the page as a whole. + */ +typedef struct BTDedupStateData +{ + /* Deduplication status info for entire pass over page */ + Size maxitemsize; /* Limit on size of final tuple */ + bool checkingunique; /* Use unique index strategy? */ + OffsetNumber skippedbase; /* First offset skipped by checkingunique */ + + /* Metadata about base tuple of current pending posting list */ + IndexTuple base; /* Use to form new posting list */ + OffsetNumber baseoff; /* page offset of base */ + Size basetupsize; /* base size without posting list */ + + /* Other metadata about pending posting list */ + ItemPointer htids; /* Heap TIDs in pending posting list */ + int nhtids; /* Number of heap TIDs in nhtids array */ + int nitems; /* Number of existing physical tuples */ + Size phystupsize; /* Includes line pointer overhead */ +} BTDedupStateData; + +typedef BTDedupStateData *BTDedupState; + /* * BTScanOpaqueData is the btree-private state needed for an indexscan. * This consists of preprocessed scan keys (see _bt_preprocess_keys() for @@ -535,7 +781,10 @@ typedef BTInsertStateData *BTInsertState; * If we are doing an index-only scan, we save the entire IndexTuple for each * matched item, otherwise only its heap TID and offset. The IndexTuples go * into a separate workspace array; each BTScanPosItem stores its tuple's - * offset within that array. + * offset within that array. Posting list tuples store a "base" tuple once, + * allowing the same key to be returned for each logical tuple associated + * with the physical posting list tuple (i.e. for each TID from the posting + * list). */ typedef struct BTScanPosItem /* what we remember about each match */ @@ -579,5 +828,5 @@ typedef struct BTScanPosData int lastItem; /* last valid index in items[] */ int itemIndex; /* current index in items[] */ - BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ + BTScanPosItem items[MaxBTreeIndexTuplesPerPage]; /* MUST BE LAST */ } BTScanPosData; HEIKKI: How much memory does this need now? Should we consider pallocing this separately? typedef BTScanPosData *BTScanPos; @@ -687,6 +936,7 @@ typedef struct BTOptions int fillfactor; /* page fill factor in percent (0..100) */ /* fraction of newly inserted tuples prior to trigger index cleanup */ float8 vacuum_cleanup_index_scale_factor; + bool deduplication; /* Use deduplication where safe? */ } BTOptions; #define BTGetFillFactor(relation) \ @@ -695,8 +945,16 @@ typedef struct BTOptions (relation)->rd_options ? \ ((BTOptions *) (relation)->rd_options)->fillfactor : \ BTREE_DEFAULT_FILLFACTOR) +#define BTGetUseDedup(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == BTREE_AM_OID), \ + ((relation)->rd_options ? 
\ + ((BTOptions *) (relation)->rd_options)->deduplication : \ + BTGetUseDedupGUC(relation))) #define BTGetTargetPageFreeSpace(relation) \ (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100) +#define BTGetUseDedupGUC(relation) \ + (relation->rd_index->indisunique || btree_deduplication) /* * Constant definition for progress reporting. Phase numbers must match @@ -743,6 +1001,22 @@ extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page); extern void _bt_parallel_done(IndexScanDesc scan); extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); +/* + * prototypes for functions in nbtdedup.c + */ +extern void _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, + IndexTuple newitem, Size newitemsz, + bool checkingunique); +extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, + OffsetNumber base_off); +extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup); +extern Size _bt_dedup_finish_pending(Buffer buf, BTDedupState state, + bool logged); +extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, + int nhtids); +extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, + int postingoff); + /* * prototypes for functions in nbtinsert.c */ @@ -761,14 +1035,16 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page page, /* * prototypes for functions in nbtpage.c */ -extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level); +extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool safededup); extern void _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, float8 numHeapTuples); extern void _bt_upgrademetapage(Page page); extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); extern int _bt_getrootheight(Relation rel); -extern bool _bt_heapkeyspace(Relation rel); +extern void _bt_metaversion(Relation rel, bool *heapkeyspace, + bool *safededup); extern void _bt_checkpage(Relation rel, Buffer buf); extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, @@ -777,7 +1053,9 @@ extern void _bt_relbuf(Relation rel, Buffer buf); extern void _bt_pageinit(Page page, Size size); extern bool _bt_page_recyclable(Page page); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *deletable, int ndeletable); + OffsetNumber *deletable, int ndeletable, + OffsetNumber *updatable, IndexTuple *updated, + int nupdatable); extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, Relation heapRel); @@ -830,6 +1108,7 @@ extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup); +extern bool _bt_opclasses_support_dedup(Relation index); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 776a9bd723..de855efbba 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -28,7 +28,8 @@ #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ -/* 0x50 and 0x60 are unused */ +#define XLOG_BTREE_DEDUP_PAGE 0x50 /* deduplicate tuples on leaf page */ +#define XLOG_BTREE_INSERT_POST 0x60 /* add index tuple 
with posting split */ #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ #define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */ #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */ @@ -53,19 +54,32 @@ typedef struct xl_btree_metadata uint32 fastlevel; TransactionId oldest_btpo_xact; float8 last_cleanup_num_heap_tuples; + bool safededup; } xl_btree_metadata; /* * This is what we need to know about simple (without split) insert. * - * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. - * Note that INSERT_META implies it's not a leaf page. + * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and + * INSERT_POST. Note that INSERT_META and INSERT_UPPER implies it's not a + * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf + * page. * - * Backup Blk 0: original page (data contains the inserted tuple) + * Backup Blk 0: original page * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META * Backup Blk 2: xl_btree_metadata, if INSERT_META + * + * Note: The new tuple is actually the "original" new item in the posting + * list split insert case (i.e. the INSERT_POST case). A split offset for + * the posting list is logged before the original new item. Recovery needs + * both, since it must do an in-place update of the existing posting list + * that was split as an extra step. Also, recovery generates a "final" + * newitem. See _bt_swap_posting() for details on posting list splits. */ typedef struct xl_btree_insert { OffsetNumber offnum; + + /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */ + /* NEW TUPLE ALWAYS FOLLOWS AT THE END */ } xl_btree_insert; HEIKKI: Would it be more clear to have a separate struct for the posting list split case? #define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) @@ -92,8 +106,37 @@ typedef struct xl_btree_insert * Backup Blk 0: original page / new left page * * The left page's data portion contains the new item, if it's the _L variant. - * An IndexTuple representing the high key of the left page must follow with - * either variant. + * _R variant split records generally do not have a newitem (_R variant leaf + * page split records that must deal with a posting list split will include an + * explicit newitem, though it is never used on the right page -- it is + * actually an orignewitem needed to update existing posting list). The new + * high key of the left/original page appears last of all (and must always be + * present). + * + * Page split records that need the REDO routine to deal with a posting list + * split directly will have an explicit newitem, which is actually an + * orignewitem (the newitem as it was before the posting list split, not + * after). A posting list split always has a newitem that comes immediately + * after the posting list being split (which would have overlapped with + * orignewitem prior to split). Usually REDO must deal with posting list + * splits with an _L variant page split record, and usually both the new + * posting list and the final newitem go on the left page (the existing + * posting list will be inserted instead of the old, and the final newitem + * will be inserted next to that). However, _R variant split records will + * include an orignewitem when the split point for the page happens to have a + * lastleft tuple that is also the posting list being split (leaving newitem + * as the page split's firstright tuple). 
The existence of this corner case + * does not change the basic fact about newitem/orignewitem for the REDO + * routine: it is always state used for the left page alone. (This is why the + * record's postingoff field isn't a reliable indicator of whether or not a + * posting list split occurred during the page split; a non-zero value merely + * indicates that the REDO routine must reconstruct a new posting list tuple + * that is needed for the left page.) + * + * This posting list split handling is equivalent to the xl_btree_insert REDO + * routine's INSERT_POST handling. While the details are more complicated + * here, the concept and goals are exactly the same. See _bt_swap_posting() + * for details on posting list splits. * * Backup Blk 1: new right page * @@ -111,7 +154,23 @@ typedef struct xl_btree_split { uint32 level; /* tree level of page being split */ OffsetNumber firstright; /* first item moved to right page */ - OffsetNumber newitemoff; /* new item's offset (useful for _L variant) */ + OffsetNumber newitemoff; /* new item's offset */ + uint16 postingoff; /* offset inside orig posting tuple */ } xl_btree_split; -#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber)) +#define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16)) + +/* + * When page is deduplicated, consecutive groups of tuples with equal keys are + * merged together into posting list tuples. + * + * The WAL record represents the interval that describes the posing tuple HEIKKI: typo: "posing tuple" + * that should be added to the page. + */ +typedef struct xl_btree_dedup +{ + OffsetNumber baseoff; + uint16 nitems; +} xl_btree_dedup; + +#define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nitems) + sizeof(uint16)) HEIKKI: Do we only generate one posting list in one WAL record? I would assume it's better to deduplicate everything on the page, since we're modifying it anyway. /* * This is what we need to know about delete of individual leaf index tuples. * The WAL record can represent deletion of any number of index tuples on a - * single index page when *not* executed by VACUUM. + * single index page when *not* executed by VACUUM. Deletion of a subset of + * "logical" tuples within a posting list tuple is not supported. * * Backup Blk 0: index page */ @@ -152,14 +212,18 @@ typedef struct xl_btree_reuse_page /* * This is what we need to know about vacuum of individual leaf index tuples. * The WAL record can represent deletion of any number of index tuples on a - * single index page when executed by VACUUM. - * - * Note that the WAL record in any vacuum of an index must have at least one - * item to delete. + * single index page when executed by VACUUM. It can also support "updates" + * of index tuples, which are how deletes of "logical" tuples contained in an + * existing posting list tuple are implemented. (Updates are only used when + * there will be some remaining logical tuples once VACUUM finishes; otherwise + * the physical posting list tuple can just be deleted). */ typedef struct xl_btree_vacuum { - uint32 ndeleted; + uint16 ndeleted; + uint16 nupdated; /* DELETED TARGET OFFSET NUMBERS FOLLOW */ + /* UPDATED TARGET OFFSET NUMBERS FOLLOW */ + /* UPDATED TUPLES FOR OVERWRITES FOLLOW */ } xl_btree_vacuum; HEIKKI: Does this store a whole copy of the remaining posting list on an updated tuple? Wouldn't it be simpler and more space-efficient to store just the deleted TIDs? 
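To make the question above concrete, here is a rough sketch (not code from the patch) of how VACUUM-side code could produce the "updated" tuple for a posting list whose TIDs are partly dead: filter the surviving TIDs through the standard IndexBulkDeleteCallback from access/genam.h and rebuild a smaller posting tuple with _bt_form_posting(), which the nbtree.h changes above declare. The function name and the split of responsibilities with the caller are hypothetical.

/*
 * Hypothetical sketch -- not code from the patch.  'callback' returns true
 * for TIDs whose heap tuples are dead.  Assumes _bt_form_posting() also
 * handles the case where only a single TID survives.
 */
static IndexTuple
vacuum_posting_sketch(IndexTuple posting,
					  IndexBulkDeleteCallback callback,
					  void *callback_state,
					  int *nremaining)
{
	int			nitem = BTreeTupleGetNPosting(posting);
	ItemPointer live = palloc(sizeof(ItemPointerData) * nitem);
	int			nlive = 0;
	int			i;
	IndexTuple	updated = NULL;

	for (i = 0; i < nitem; i++)
	{
		ItemPointer htid = BTreeTupleGetPostingN(posting, i);

		if (!callback(htid, callback_state))
			live[nlive++] = *htid;	/* heap tuple is still needed */
	}

	*nremaining = nlive;
	if (nlive > 0 && nlive < nitem)
	{
		/* some, but not all, TIDs are dead: build a smaller posting list */
		updated = _bt_form_posting(posting, live, nlive);
	}
	/* nlive == 0: whole tuple is deletable; nlive == nitem: leave it alone */

	pfree(live);
	return updated;
}

Under this scheme the "UPDATED TUPLES FOR OVERWRITES FOLLOW" payload of xl_btree_vacuum would carry such rebuilt tuples, which is exactly what prompts the question of whether logging only the deleted TIDs would be leaner.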
-#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(uint32)) +#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16)) /* * This is what we need to know about marking an empty branch for deletion. @@ -245,6 +309,8 @@ typedef struct xl_btree_newroot extern void btree_redo(XLogReaderState *record); extern void btree_desc(StringInfo buf, XLogReaderState *record); extern const char *btree_identify(uint8 info); +extern void btree_xlog_startup(void); +extern void btree_xlog_cleanup(void); extern void btree_mask(char *pagedata, BlockNumber blkno); #endif /* NBTXLOG_H */ diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index c88dccfb8d..6c15df7e70 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL) PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask) PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask) -PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask) PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask) PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask) PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask) diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 79430d2b7b..18b1bf5e20 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -158,6 +158,15 @@ static relopt_bool boolRelOpts[] = }, true }, + { + { + "deduplication", + "Enables deduplication on btree index leaf pages", + RELOPT_KIND_BTREE, + ShareUpdateExclusiveLock + }, + true + }, /* list terminator */ {{NULL}} }; diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index c16eb05416..dfba5ae39a 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -276,6 +276,10 @@ BuildIndexValueDescription(Relation indexRelation, /* * Get the latestRemovedXid from the table entries pointed at by the index * tuples being deleted. + * + * Note: index access methods that don't consistently use the standard + * IndexTuple + heap TID item pointer representation will need to provide + * their own version of this function. */ TransactionId index_compute_xid_horizon_for_tuples(Relation irel, diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile index bf245f5dab..d69808e78c 100644 --- a/src/backend/access/nbtree/Makefile +++ b/src/backend/access/nbtree/Makefile @@ -14,6 +14,7 @@ include $(top_builddir)/src/Makefile.global OBJS = \ nbtcompare.o \ + nbtdedup.o \ nbtinsert.o \ nbtpage.o \ nbtree.o \ diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index c60a4d0d9e..e235750597 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -432,4 +432,7 @@ because we allow LP_DEAD to be set with only a share lock (it's exactly like a hint bit for a heap tuple), but physically removing tuples requires exclusive lock. 
In the current code we try to remove LP_DEAD tuples when we are otherwise faced with having to split a page to do an insertion (and -hence have exclusive lock on it already). +hence have exclusive lock on it already). Deduplication can also prevent +a page split, but removing LP_DEAD tuples is the preferred approach. +(Note that posting list tuples can only have their LP_DEAD bit set when +every "logical" tuple represented within the posting list is known dead.) HEIKKI: Do we ever do that? Do we ever set the LP_DEAD bit on a posting list tuple? This leaves the index in a state where it has no entry for a dead tuple that still exists in the heap. This is not a problem for the current @@ -726,9 +729,155 @@ if it must. When a page that's already full of duplicates must be split, the fallback strategy assumes that duplicates are mostly inserted in ascending heap TID order. The page is split in a way that leaves the left half of the page mostly full, and the right half of the page mostly empty. +The overall effect is that leaf page splits gracefully adapt to inserts of +large groups of duplicates, maximizing space utilization. Note also that +"trapping" large groups of duplicates on the same leaf page like this makes +deduplication more efficient. Deduplication can be performed infrequently, +while freeing just as much space. HEIKKI: I don't understand what the last sentence means. Just as much space as what? + +Notes about deduplication +------------------------- + +We deduplicate non-pivot tuples in non-unique indexes to reduce storage +overhead, and to avoid (or at least delay) page splits. Note that the +goals for deduplication in unique indexes are rather different; see later +section for details. Deduplication alters the physical representation of +tuples without changing the logical contents of the index, and without +adding overhead to read queries. Non-pivot tuples are merged together +into a single physical tuple with a posting list (a simple array of heap +TIDs with the standard item pointer format). Deduplication is always +applied lazily, at the point where it would otherwise be necessary to +perform a page split. It occurs only when LP_DEAD items have been +removed, as our last line of defense against splitting a leaf page. We +can set the LP_DEAD bit with posting list tuples, though only when all +TIDs are known dead. + +Our lazy approach to deduplication allows the page space accounting used +during page splits to have absolutely minimal special case logic for +posting lists. Posting lists can be thought of as extra payload that +suffix truncation will reliably truncate away as needed during page +splits, just like non-key columns from an INCLUDE index tuple. +Incoming/new tuples can generally be treated as non-overlapping plain +items (though see section on posting list splits for information about how +overlapping new/incoming items are really handled). + +The representation of posting lists is almost identical to the posting +lists used by GIN, so it would be straightforward to apply GIN's varbyte +encoding compression scheme to individual posting lists. Posting list +compression would break the assumptions made by posting list splits about +page space accounting (see later section), so it's not clear how +compression could be integrated with nbtree. Besides, posting list +compression does not offer a compelling trade-off for nbtree, since in +general nbtree is optimized for consistent performance with many +concurrent readers and writers. 
HEIKKI: Well, it's optimized for that today, but if it was compressed, a btree would be useful in more situations... + +A major goal of our lazy approach to deduplication is to limit the +performance impact of deduplication with random updates. Even concurrent +append-only inserts of the same key value will tend to have inserts of +individual index tuples in an order that doesn't quite match heap TID +order. Delaying deduplication minimizes page level fragmentation. + +Deduplication in unique indexes +------------------------------- + +Very often, the range of values that can be placed on a given leaf page in +a unique index is fixed and permanent. For example, a primary key on an +identity column will usually only have page splits caused by the insertion +of new logical rows within the rightmost leaf page. If there is a split +of a non-rightmost leaf page, then the split must have been triggered by +inserts associated with an UPDATE of an existing logical row. Splitting a +leaf page purely to store multiple versions should be considered +pathological, since it permanently degrades the index structure in order +to absorb a temporary burst of duplicates. Deduplication in unique +indexes helps to prevent these pathological page splits. + +Like all index access methods, nbtree does not have direct knowledge of +versioning or of MVCC; it deals only with physical tuples. However, unique +indexes implicitly give nbtree basic information about tuple versioning, +since by definition zero or one tuples of any given key value can be +visible to any possible MVCC snapshot (excluding index entries with NULL +values). When optimizations such as heapam's Heap-only tuples (HOT) happen +to be ineffective, nbtree's on-the-fly deletion of tuples in unique indexes +can be very important with UPDATE-heavy workloads. Unique checking's +LP_DEAD bit setting reliably attempts to kill old, equal index tuple +versions. This prevents (or at least delays) page splits that are +necessary only because a leaf page must contain multiple physical tuples +for the same logical row. Deduplication in unique indexes must cooperate +with this mechanism. Deleting items on the page is always preferable to +deduplication. + +The strategy used during a deduplication pass has significant differences +to the strategy used for indexes that can have multiple logical rows with +the same key value. We're not really trying to store duplicates in a +space efficient manner, since in the long run there won't be any +duplicates anyway. Rather, we're buying time for garbage collection +mechanisms to run before a page split is needed. + +Unique index leaf pages only get a deduplication pass when an insertion +(that might have to split the page) observed an existing duplicate on the +page in passing. This is based on the assumption that deduplication will +only work out when _all_ new insertions are duplicates from UPDATEs. This +may mean that we miss an opportunity to delay a page split, but that's +okay because our ultimate goal is to delay leaf page splits _indefinitely_ +(i.e. to prevent them altogether). There is little point in trying to +delay a split that is probably inevitable anyway. This allows us to avoid +the overhead of attempting to deduplicate with unique indexes that always +have few or no duplicates. + +Posting list splits +------------------- + +When the incoming tuple happens to overlap with an existing posting list, +a posting list split is performed. 
Like a page split, a posting list +split resolves the situation where a new/incoming item "won't fit", while +inserting the incoming item in passing (i.e. as part of the same atomic +action). It's possible (though not particularly likely) that an insert of +a new item on to an almost-full page will overlap with a posting list, +resulting in both a posting list split and a page split. Even then, the +atomic action that splits the posting list also inserts the new item +(since page splits always insert the new item in passing). Including the +posting list split in the same atomic action as the insert avoids problems +caused by concurrent inserts into the same posting list -- the exact +details of how we change the posting list depend upon the new item, and +vice-versa. A single atomic action also minimizes the volume of extra +WAL required for a posting list split, since we don't have to explicitly +WAL-log the original posting list tuple. + +Despite piggy-backing on the same atomic action that inserts a new tuple, +posting list splits can be thought of as a separate, extra action to the +insert itself (or to the page split itself). Posting list splits +conceptually "rewrite" an insert that overlaps with an existing posting +list into an insert that adds its final new item just to the right of +posting list instead. The size of the posting list won't change, and so +page space accounting code does not need to care about posting list splits +at all. This is an important upside of our design; the page split point +choice logic is very subtle even without it needing to deal with posting +list splits. + +Only a few isolated extra steps are required to preserve the illusion that +the new item never overlapped with an existing posting list in the first +place: the heap TID of the incoming tuple is swapped with the rightmost +heap TID from the existing/originally overlapping posting list. Also, the +posting-split-with-page-split case must generate a new high key based on +an imaginary version of the original page that has both the final new item +and the after-list-split posting tuple (page splits usually just operate +against an imaginary version that contains the new item/item that won't +fit). + +This approach avoids inventing an "eager" atomic posting split operation +that splits the posting list without simultaneously finishing the insert +of the incoming item. This alternative design might seem cleaner, but it +creates subtle problems for page space accounting. In general, there +might not be enough free space on the page to split a posting list such +that the incoming/new item no longer overlaps with either posting list +half --- the operation could fail before the actual retail insert of the +new item even begins. We'd end up having to handle posting list splits +that need a page split anyway. Besides, supporting variable "split points" +while splitting posting lists won't actually improve overall space +utilization. Notes About Data Representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c new file mode 100644 index 0000000000..d9f4e9db38 --- /dev/null +++ b/src/backend/access/nbtree/nbtdedup.c @@ -0,0 +1,738 @@ +/*------------------------------------------------------------------------- + * + * nbtdedup.c + * Deduplicate items in Postgres btrees. 
+ * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtdedup.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "miscadmin.h" +#include "utils/rel.h" + + +/* + * Try to deduplicate items to free at least enough space to avoid a page + * split. This function should be called during insertion, only after LP_DEAD + * items were removed by _bt_vacuum_one_page() to prevent a page split. + * (We'll have to kill LP_DEAD items here when the page's BTP_HAS_GARBAGE hint + * was not set, but that should be rare.) + * + * The strategy for !checkingunique callers is to perform as much + * deduplication as possible to free as much space as possible now, since + * making it harder to set LP_DEAD bits is considered an acceptable price for + * not having to deduplicate the same page many times. It is unlikely that + * the items on the page will have their LP_DEAD bit set in the future, since + * that hasn't happened before now (besides, entire posting lists can still + * have their LP_DEAD bit set). + * + * The strategy for checkingunique callers is completely different. + * Deduplication works in tandem with garbage collection, especially the + * LP_DEAD bit setting that takes place in _bt_check_unique(). We give up as + * soon as it becomes clear that enough space has been made available to + * insert newitem without needing to split the page. Also, we merge together + * larger groups of duplicate tuples first (merging together two index tuples + * usually saves very little space), and avoid merging together existing + * posting list tuples. The goal is to generate posting lists with TIDs that + * are "close together in time", in order to maximize the chances of an + * LP_DEAD bit being set opportunistically. See nbtree/README for more + * information on deduplication within unique indexes. + * + * nbtinsert.c caller should call _bt_vacuum_one_page() before calling here. + * Note that this routine will delete all items on the page that have their + * LP_DEAD bit set, even when page's BTP_HAS_GARBAGE bit is not set (a rare + * edge case). Caller can rely on that to avoid inserting a new tuple that + * happens to overlap with an existing posting list tuple with its LP_DEAD bit + * set. (Calling here with a newitemsz of 0 will reliably delete the existing + * item, making it possible to avoid unsetting the LP_DEAD bit just to insert + * the new item. In general, posting list splits should never have to deal + * with a posting list tuple with its LP_DEAD bit set.) + * + * Note: If newitem contains NULL values in key attributes, caller will be + * !checkingunique even when rel is a unique index. The page in question will + * usually have many existing items with NULLs. 
+ */ +void +_bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, + IndexTuple newitem, Size newitemsz, bool checkingunique) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buf); + BTPageOpaque opaque; + OffsetNumber deletable[MaxIndexTuplesPerPage]; + BTDedupState state; + int natts = IndexRelationGetNumberOfAttributes(rel); + bool minimal = checkingunique; + int ndeletable = 0; + Size pagesaving = 0; + int count = 0; + bool singlevalue = false; + + /* + * Caller should call _bt_vacuum_one_page() before calling here when it + * looked like there were LP_DEAD items on the page. However, we can't + * assume that there are no LP_DEAD items (for one thing, VACUUM will + * clear the BTP_HAS_GARBAGE hint without reliably removing items that are + * marked LP_DEAD). We must be careful to clear all LP_DEAD items because + * posting list splits cannot go ahead if an existing posting list item + * has its LP_DEAD bit set. (Also, we don't want to unnecessarily unset + * LP_DEAD bits when deduplicating items on the page below, though that + * should be harmless.) + * + * The opposite problem is also possible: _bt_vacuum_one_page() won't + * clear the BTP_HAS_GARBAGE bit when it is falsely set (i.e. when there + * are no LP_DEAD bits). This probably doesn't matter in practice, since + * it's only a hint, and VACUUM will clear it at some point anyway. Even + * still, we clear the BTP_HAS_GARBAGE hint reliably here. (Seems like a + * good idea for deduplication to only begin when we unambiguously have no + * LP_DEAD items.) + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemid)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel); + + /* + * Return when a split will be avoided. This is equivalent to + * avoiding a split by following the usual _bt_vacuum_one_page() path. + */ + if (PageGetFreeSpace(page) >= newitemsz) + return; + + /* + * Reconsider number of items on page, in case _bt_delitems_delete() + * managed to delete an item or two + */ + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + } + else if (P_HAS_GARBAGE(opaque)) + { + opaque->btpo_flags &= ~BTP_HAS_GARBAGE; + MarkBufferDirtyHint(buf, true); + } + + /* + * Return early in case where caller just wants us to kill an existing + * LP_DEAD posting list tuple + */ + Assert(!P_HAS_GARBAGE(opaque)); + if (newitemsz == 0) + return; + + /* + * By here, it's clear that deduplication will definitely be attempted. + * Initialize deduplication state. 
+ */ + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->maxitemsize = Min(BTMaxItemSize(page), INDEX_SIZE_MASK); + state->checkingunique = checkingunique; + state->skippedbase = InvalidOffsetNumber; + /* Metadata about base tuple of current pending posting list */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + /* Metadata about current pending posting list TIDs */ + state->htids = NULL; + state->nhtids = 0; + state->nitems = 0; + /* Size of all physical tuples to be replaced by pending posting list */ + state->phystupsize = 0; + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + /* Conservatively size array */ + state->htids = palloc(state->maxitemsize); + + /* + * Determine if a "single value" strategy page split is likely to occur + * shortly after deduplication finishes. It should be possible for the + * single value split to find a split point that packs the left half of + * the split BTREE_SINGLEVAL_FILLFACTOR% full. + */ + if (!checkingunique) + { + ItemId itemid; + IndexTuple itup; + + itemid = PageGetItemId(page, minoff); + itup = (IndexTuple) PageGetItem(page, itemid); + + if (_bt_keep_natts_fast(rel, newitem, itup) > natts) + { + itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); + itup = (IndexTuple) PageGetItem(page, itemid); + + /* + * Use different strategy if future page split likely to need to + * use "single value" strategy + */ + if (_bt_keep_natts_fast(rel, newitem, itup) > natts) + singlevalue = true; + } + } + + /* + * Iterate over tuples on the page, try to deduplicate them into posting + * lists and insert into new page. NOTE: We must reassess the max offset + * on each iteration, since the number of items on the page goes down as + * existing items are deduplicated. + */ + offnum = minoff; +retry: + while (offnum <= PageGetMaxOffsetNumber(page)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (state->nitems == 0) + { + /* + * No previous/base tuple for the data item -- use the data item + * as base tuple of pending posting list + */ + _bt_dedup_start_pending(state, itup, offnum); + } + else if (_bt_keep_natts_fast(rel, state->base, itup) > natts && + _bt_dedup_save_htid(state, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list. Heap + * TID(s) for itup have been saved in state. The next iteration + * will also end up here if it's possible to merge the next tuple + * into the same pending posting list. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * _bt_dedup_save_htid() opted to not merge current item into + * pending posting list for some other reason (e.g., adding more + * TIDs would have caused posting list to exceed BTMaxItemSize() + * limit). + * + * If state contains pending posting list with more than one item, + * form new posting tuple, and update the page. Otherwise, reset + * the state and move on. + */ + pagesaving += _bt_dedup_finish_pending(buf, state, + RelationNeedsWAL(rel)); + count++; + + /* + * When caller is a checkingunique caller and we have deduplicated + * enough to avoid a page split, do minimal deduplication in case + * the remaining items are about to be marked dead within + * _bt_check_unique(). 
+ */ + if (minimal && pagesaving >= newitemsz) + break; + + /* + * Consider special steps when a future page split of the leaf + * page is likely to occur using nbtsplitloc.c's "single value" + * strategy + */ + if (singlevalue) + { + /* + * Adjust maxitemsize so that there isn't a third and final + * 1/3 of a page width tuple that fills the page to capacity. + * The third tuple produced should be smaller than the first + * two by an amount equal to the free space that nbtsplitloc.c + * is likely to want to leave behind when the page it split. + * When there are 3 posting lists on the page, then we end + * deduplication. Remaining tuples on the page can be + * deduplicated later, when they're on the new right sibling + * of this page, and the new sibling page needs to be split in + * turn. + * + * Note that it doesn't matter if there are items on the page + * that were already 1/3 of a page during current pass; + * they'll still count as the first two posting list tuples. + */ + if (count == 2) + { + Size leftfree; + + /* This calculation needs to match nbtsplitloc.c */ + leftfree = PageGetPageSize(page) - SizeOfPageHeaderData - + MAXALIGN(sizeof(BTPageOpaqueData)); + /* Subtract predicted size of new high key */ + leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData)); + + /* + * Reduce maxitemsize by an amount equal to target free + * space on left half of page + */ + state->maxitemsize -= leftfree * + ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0); + } + else if (count == 3) + break; + } + + /* + * Next iteration starts immediately after base tuple offset (this + * will be the next offset on the page when we didn't modify the + * page) + */ + offnum = state->baseoff; + } + + offnum = OffsetNumberNext(offnum); + } + + /* Handle the last item when pending posting list is not empty */ + if (state->nitems != 0) + { + pagesaving += _bt_dedup_finish_pending(buf, state, + RelationNeedsWAL(rel)); + count++; + } + + if (pagesaving < newitemsz && state->skippedbase != InvalidOffsetNumber) + { + /* + * Didn't free enough space for new item in first checkingunique pass. + * Try making a second pass over the page, this time starting from the + * first candidate posting list base offset that was skipped over in + * the first pass (only do a second pass when this actually happened). + * + * The second pass over the page may deduplicate items that were + * initially passed over due to concerns about limiting the + * effectiveness of LP_DEAD bit setting within _bt_check_unique(). + * Note that the second pass will still stop deduplicating as soon as + * enough space has been freed to avoid an immediate page split. + */ + Assert(state->checkingunique); + offnum = state->skippedbase; + + state->checkingunique = false; + state->skippedbase = InvalidOffsetNumber; + state->phystupsize = 0; + state->nitems = 0; + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + goto retry; + } + + /* Local space accounting should agree with page accounting */ + Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz); + + /* cannot leak memory here */ + pfree(state->htids); + pfree(state); +} + +/* + * Create a new pending posting list tuple based on caller's tuple. + * + * Every tuple processed by the deduplication routines either becomes the base + * tuple for a posting list, or gets its heap TID(s) accepted into a pending + * posting list. 
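The three helper routines defined below (_bt_dedup_start_pending(), _bt_dedup_save_htid() and _bt_dedup_finish_pending()) amount to a single accumulator that walks the key-ordered leaf items. As a rough standalone illustration of that shape only (ToyItem and TOY_MAXTIDS are made-up stand-ins for IndexTuple, BTDedupState and the maxitemsize cap; this is not code from the patch):

#include <stdio.h>

/* Toy stand-ins for IndexTuple/BTDedupState (not the real nbtree types) */
typedef struct ToyItem
{
    int     key;        /* indexed value */
    int     ntids;      /* number of heap TIDs carried */
    int     tids[8];    /* the TIDs themselves */
} ToyItem;

#define TOY_MAXTIDS 4   /* stand-in for the maxitemsize limit */

/* Analogous to _bt_dedup_finish_pending(): emit the pending item */
static void
toy_finish_pending(const ToyItem *pending)
{
    if (pending->ntids > 1)
        printf("posting list: key=%d with %d TIDs\n",
               pending->key, pending->ntids);
    else
        printf("plain tuple : key=%d\n", pending->key);
}

int
main(void)
{
    /* key-ordered leaf items, the way _bt_dedup_one_page() walks them */
    ToyItem     items[] = {
        {1, 1, {10}}, {1, 1, {11}}, {1, 1, {12}},
        {2, 1, {20}},
        {3, 1, {30}}, {3, 1, {31}}, {3, 1, {32}},
        {3, 1, {33}}, {3, 1, {34}},
    };
    int         nitems = (int) (sizeof(items) / sizeof(items[0]));
    ToyItem     pending = items[0];     /* _bt_dedup_start_pending() analog */

    for (int i = 1; i < nitems; i++)
    {
        ToyItem    *cur = &items[i];

        if (cur->key == pending.key &&
            pending.ntids + cur->ntids <= TOY_MAXTIDS)
        {
            /* _bt_dedup_save_htid() analog: absorb TIDs into pending item */
            for (int j = 0; j < cur->ntids; j++)
                pending.tids[pending.ntids++] = cur->tids[j];
        }
        else
        {
            /* key changed, or size cap reached: flush and start over */
            toy_finish_pending(&pending);
            pending = *cur;
        }
    }
    toy_finish_pending(&pending);       /* don't forget the last pending item */

    return 0;
}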
A tuple that starts out as the base tuple for a posting list + * will only actually be rewritten within _bt_dedup_finish_pending() when + * there was at least one successful call to _bt_dedup_save_htid(). + */ +void +_bt_dedup_start_pending(BTDedupState state, IndexTuple base, + OffsetNumber baseoff) +{ + Assert(state->nhtids == 0); + Assert(state->nitems == 0); + Assert(!BTreeTupleIsPivot(base)); + + /* + * Copy heap TIDs from new base tuple for new candidate posting list into + * ipd array. Assume that we'll eventually create a new posting tuple by + * merging later tuples with this existing one, though we may not. + */ + if (!BTreeTupleIsPosting(base)) + { + memcpy(state->htids, base, sizeof(ItemPointerData)); + state->nhtids = 1; + /* Save size of tuple without any posting list */ + state->basetupsize = IndexTupleSize(base); + } + else + { + int nposting; + + nposting = BTreeTupleGetNPosting(base); + memcpy(state->htids, BTreeTupleGetPosting(base), + sizeof(ItemPointerData) * nposting); + state->nhtids = nposting; + /* Save size of tuple without any posting list */ + state->basetupsize = BTreeTupleGetPostingOffset(base); + } + + /* + * Save new base tuple itself -- it'll be needed if we actually create a + * new posting list from new pending posting list. + * + * Must maintain size of all tuples (including line pointer overhead) to + * calculate space savings on page within _bt_dedup_finish_pending(). + * Also, save number of base tuple logical tuples so that we can save + * cycles in the common case where an existing posting list can't or won't + * be merged with other tuples on the page. + */ + state->nitems = 1; + state->base = base; + state->baseoff = baseoff; + state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData); +} + +/* + * Save itup heap TID(s) into pending posting list where possible. + * + * Returns bool indicating if the pending posting list managed by state has + * itup's heap TID(s) saved. When this is false, enlarging the pending + * posting list by the required amount would exceed the maxitemsize limit, so + * caller must finish the pending posting list tuple. (Generally itup becomes + * the base tuple of caller's new pending posting list). + */ +bool +_bt_dedup_save_htid(BTDedupState state, IndexTuple itup) +{ + int nhtids; + ItemPointer htids; + Size mergedtupsz; + + Assert(!BTreeTupleIsPivot(itup)); + + if (!BTreeTupleIsPosting(itup)) + { + nhtids = 1; + htids = &itup->t_tid; + } + else + { + nhtids = BTreeTupleGetNPosting(itup); + htids = BTreeTupleGetPosting(itup); + } + + /* + * Don't append (have caller finish pending posting list as-is) if + * appending heap TID(s) from itup would put us over limit. + * + * This calculation needs to match the code used within _bt_form_posting() + * for new posting list tuples. 
+ */ + mergedtupsz = MAXALIGN(state->basetupsize + + (state->nhtids + nhtids) * sizeof(ItemPointerData)); + + if (mergedtupsz > state->maxitemsize) + return false; + + /* Don't merge existing posting lists in first checkingunique pass */ + if (state->checkingunique && + (BTreeTupleIsPosting(state->base) || nhtids > 1)) + { + /* May begin here if second pass over page is required */ + if (state->skippedbase == InvalidOffsetNumber) + state->skippedbase = state->baseoff; + return false; + } + + /* + * Save heap TIDs to pending posting list tuple -- itup can be merged into + * pending posting list + */ + state->nitems++; + memcpy(state->htids + state->nhtids, htids, + sizeof(ItemPointerData) * nhtids); + state->nhtids += nhtids; + state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + + return true; +} + +/* + * Finalize pending posting list tuple, and add it to the page. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * Returns space saving from deduplicating to make a new posting list tuple. + * Note that this includes line pointer overhead. This is zero in the case + * where no deduplication was possible. + */ +Size +_bt_dedup_finish_pending(Buffer buf, BTDedupState state, bool logged) +{ + Size spacesaving = 0; + Page page = BufferGetPage(buf); + int minimum = 2; + + Assert(state->nitems > 0); + Assert(state->nitems <= state->nhtids); + + /* + * Only create a posting list when at least 3 heap TIDs will appear in the + * checkingunique case (checkingunique strategy won't merge existing + * posting list tuples, so we know that the number of items here must also + * be the total number of heap TIDs). Creating a new posting lists with + * only two heap TIDs won't even save enough space to fit another + * duplicate with the same key as the posting list. This is a bad + * trade-off if there is a chance that the LP_DEAD bit can be set for + * either existing tuple by putting off deduplication. + * + * (Note that a second pass over the page can deduplicate the item if that + * is truly the only way to avoid a page split for checkingunique caller.) 
+ */ + Assert(!state->checkingunique || state->nitems == 1 || + state->nhtids == state->nitems); + if (state->checkingunique) + { + minimum = 3; + /* May begin here if second pass over page is required */ + if (state->nitems == 2 && state->skippedbase == InvalidOffsetNumber) + state->skippedbase = state->baseoff; + } + + if (state->nitems >= minimum) + { + IndexTuple final; + Size finalsz; + OffsetNumber offnum; + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + + /* find all tuples that will be replaced with this new posting tuple */ + for (offnum = state->baseoff; + offnum < state->baseoff + state->nitems; + offnum = OffsetNumberNext(offnum)) + deletable[ndeletable++] = offnum; + + /* Form a tuple with a posting list */ + final = _bt_form_posting(state->base, state->htids, state->nhtids); + finalsz = IndexTupleSize(final); + spacesaving = state->phystupsize - (finalsz + sizeof(ItemIdData)); + /* Must save some space, and must not exceed tuple limits */ + Assert(spacesaving > 0 && spacesaving < BLCKSZ); + Assert(finalsz <= state->maxitemsize); + Assert(finalsz == MAXALIGN(IndexTupleSize(final))); + + START_CRIT_SECTION(); + + /* Delete original items */ + PageIndexMultiDelete(page, deletable, ndeletable); + /* Insert posting tuple, replacing original items */ + if (PageAddItem(page, (Item) final, finalsz, state->baseoff, false, + false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add tuple to page"); + + MarkBufferDirty(buf); + + /* Log deduplicated items */ + if (logged) + { + XLogRecPtr recptr; + xl_btree_dedup xlrec_dedup; + + xlrec_dedup.baseoff = state->baseoff; + xlrec_dedup.nitems = state->nitems; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP_PAGE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + pfree(final); + } + + /* Reset state for next pending posting list */ + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + + return spacesaving; +} + +/* + * Build a posting list tuple based on caller's "base" index tuple and list of + * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a + * posting list. (Posting list tuples can never have a single heap TID, partly + * because that ensures that deduplication always reduces final MAXALIGN()'d + * size of entire tuple.) + * + * Convention is that posting list starts at a MAXALIGN()'d offset (rather + * than a SHORTALIGN()'d offset), in line with the approach taken when + * appending a heap TID to new pivot tuple/high key during suffix truncation. + * This sometimes wastes a little space that was only needed as alignment + * padding in the original tuple. Following this convention simplifies the + * space accounting used when deduplicating a page (the same convention + * simplifies the accounting for choosing a point to split a page at). + * + * Note: Caller's "htids" array must be unique and already in ascending TID + * order. Any existing heap TIDs from "base" won't automatically appear in + * returned posting list tuple (they must be included in item pointer array as + * required.) 
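The MAXALIGN()'d-size convention described above is easier to see with concrete numbers. A minimal standalone sketch of the same arithmetic, assuming typical 64-bit build sizes (8-byte MAXALIGN, 6-byte ItemPointerData, 4-byte ItemIdData line pointers; the TOY_* macros below are stand-ins, not the real postgres.h definitions):

#include <stdio.h>

/* Stand-ins for the real definitions (typical 64-bit build values) */
#define TOY_MAXALIGN(x)     (((x) + 7) & ~((unsigned) 7))
#define TOY_ITEMPOINTER_SZ  6   /* sizeof(ItemPointerData) */
#define TOY_ITEMID_SZ       4   /* sizeof(ItemIdData), the line pointer */

int
main(void)
{
    unsigned    keysize = 16;   /* MAXALIGN'd header + key, e.g. one int8 */
    unsigned    nhtids = 10;    /* duplicates to merge */

    /* n separate tuples: each repeats the key and needs a line pointer */
    unsigned    separate = nhtids * (TOY_MAXALIGN(keysize) + TOY_ITEMID_SZ);

    /* one posting tuple: key stored once, TID array appended at keysize */
    unsigned    posting = TOY_MAXALIGN(keysize +
                                       nhtids * TOY_ITEMPOINTER_SZ) +
        TOY_ITEMID_SZ;

    printf("10 separate tuples: %u bytes\n", separate);
    printf("1 posting tuple   : %u bytes\n", posting);
    printf("space saved       : %u bytes\n", separate - posting);
    return 0;
}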
+ */ +IndexTuple +_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids) +{ + uint32 keysize, + newsize; + IndexTuple itup; + + /* We only use the key from the base tuple */ + if (BTreeTupleIsPosting(base)) + keysize = BTreeTupleGetPostingOffset(base); + else + keysize = IndexTupleSize(base); + + Assert(!BTreeTupleIsPivot(base)); + Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX); + Assert(keysize == MAXALIGN(keysize)); + + /* + * Determine final size of new tuple. + * + * The calculation used when new tuple has a posting list needs to match + * the code used within _bt_dedup_save_htid(). + */ + if (nhtids > 1) + newsize = MAXALIGN(keysize + + nhtids * sizeof(ItemPointerData)); + else + newsize = keysize; + + Assert(newsize <= INDEX_SIZE_MASK); + Assert(newsize == MAXALIGN(newsize)); + + /* Allocate memory using palloc0() to match index_form_tuple() */ + itup = palloc0(newsize); + memcpy(itup, base, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + if (nhtids > 1) + { + /* Form posting list tuple */ + BTreeTupleSetPosting(itup, nhtids, keysize); + memcpy(BTreeTupleGetPosting(itup), htids, + sizeof(ItemPointerData) * nhtids); + Assert(BTreeTupleIsPosting(itup)); + +#ifdef USE_ASSERT_CHECKING + { + /* Verify posting list invariants with assertions */ + ItemPointerData last; + ItemPointer htid; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + htid = BTreeTupleGetPostingN(itup, i); + + Assert(ItemPointerIsValid(htid)); + Assert(ItemPointerCompare(htid, &last) > 0); + ItemPointerCopy(htid, &last); + } + } +#endif + } + else + { + /* + * Copy only TID in htids array to header field (i.e. create standard + * non-pivot representation) + */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + Assert(ItemPointerIsValid(&itup->t_tid)); + ItemPointerCopy(htids, &itup->t_tid); + } + + return itup; +} + +/* + * Prepare for a posting list split by swapping heap TID in newitem with heap + * TID from original posting list (the 'oposting' heap TID located at offset + * 'postingoff'). Modifies newitem, so caller should probably pass their own + * private copy that can safely be modified. + * + * Returns new posting list tuple, which is palloc()'d in caller's context. + * This is guaranteed to be the same size as 'oposting'. Modified newitem is + * what caller actually inserts. (This generally happens inside the same + * critical section that performs an in-place update of old posting list using + * new posting list returned here). + * + * Caller should avoid assuming that the IndexTuple-wise key representation in + * newitem is bitwise equal to the representation used within oposting. Note, + * in particular, that one may even be larger than the other. This could + * occur due to differences in TOAST input state, for example. + * + * See nbtree/README for details on the design of posting list splits. 
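The TID shuffle that _bt_swap_posting() performs can be modeled on a plain array. A standalone sketch (plain ints stand in for ItemPointerData, and the palloc()/CopyIndexTuple() bookkeeping is omitted):

#include <stdio.h>
#include <string.h>

int
main(void)
{
    /*
     * Existing posting list TIDs (ascending), and a new duplicate whose TID
     * falls inside the posting list's range: 35 sorts between 30 and 40, so
     * the binary search reports postingoff = 3.
     */
    int         posting[5] = {10, 20, 30, 40, 50};
    int         nhtids = 5;
    int         newitem_tid = 35;
    int         postingoff = 3;

    /* newitem takes over the posting list's original rightmost TID */
    int         swapped_newitem_tid = posting[nhtids - 1];

    /* shift TIDs at >= postingoff one slot right, dropping the rightmost */
    memmove(&posting[postingoff + 1], &posting[postingoff],
            (nhtids - postingoff - 1) * sizeof(int));

    /* and splice the incoming TID into the gap */
    posting[postingoff] = newitem_tid;

    for (int i = 0; i < nhtids; i++)
        printf("posting[%d] = %d\n", i, posting[i]);
    printf("newitem now carries TID %d\n", swapped_newitem_tid);

    /*
     * Result: the posting list is {10, 20, 30, 35, 40}, still sorted and
     * still the same physical size, while the tuple actually inserted on
     * the page carries TID 50.
     */
    return 0;
}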
+ */ +IndexTuple +_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff) +{ + int nhtids; + char *replacepos; + char *rightpos; + Size nbytes; + IndexTuple nposting; + + Assert(!BTreeTupleIsPivot(newitem)); + + nhtids = BTreeTupleGetNPosting(oposting); + Assert(postingoff > 0 && postingoff < nhtids); + + nposting = CopyIndexTuple(oposting); + replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff); + rightpos = replacepos + sizeof(ItemPointerData); + nbytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData); + + /* + * Move item pointers in posting list to make a gap for the new item's + * heap TID (shift TIDs one place to the right, losing original rightmost + * TID) + */ + memmove(rightpos, replacepos, nbytes); + + /* Fill the gap with the TID of the new item */ + ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos); + + /* Copy original posting list's rightmost TID into new item */ + ItemPointerCopy(BTreeTupleGetPostingN(oposting, nhtids - 1), + &newitem->t_tid); + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting), + BTreeTupleGetHeapTID(newitem)) < 0); + Assert(BTreeTupleGetNPosting(oposting) == BTreeTupleGetNPosting(nposting)); + + return nposting; +} diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 144d339e8d..51468e0455 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -28,6 +28,8 @@ /* Minimum tree height for application of fastpath optimization */ #define BTREE_FASTPATH_MIN_LEVEL 2 +/* GUC parameter */ +bool btree_deduplication = true; static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); @@ -47,10 +49,12 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int postingoff, bool split_only_page); static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, - IndexTuple newitem); + IndexTuple newitem, IndexTuple orignewitem, + IndexTuple nposting, uint16 postingoff); static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only); static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, @@ -125,6 +129,7 @@ _bt_doinsert(Relation rel, IndexTuple itup, insertstate.itup_key = itup_key; insertstate.bounds_valid = false; insertstate.buf = InvalidBuffer; + insertstate.postingoff = 0; /* * It's very common to have an index on an auto-incremented or @@ -300,7 +305,7 @@ top: newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, stack, heapRel); _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack, - itup, newitemoff, false); + itup, newitemoff, insertstate.postingoff, false); } else { @@ -353,6 +358,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, BTPageOpaque opaque; Buffer nbuf = InvalidBuffer; bool found = false; + bool inposting = false; + bool prevalldead = true; + int curposti = 0; /* Assume unique until we find a duplicate */ *is_unique = true; @@ -374,6 +382,11 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, /* * Scan over all equal tuples, looking for live conflicts. + * + * Note that each iteration of the loop processes one heap TID, not one + * index tuple. 
The page offset number won't be advanced for iterations + * which process heap TIDs from posting list tuples until the last such + * heap TID for the posting list (curposti will be advanced instead). */ Assert(!insertstate->bounds_valid || insertstate->low == offset); Assert(!itup_key->anynullkeys); @@ -435,7 +448,28 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, /* okay, we gotta fetch the heap tuple ... */ curitup = (IndexTuple) PageGetItem(page, curitemid); - htid = curitup->t_tid; + + /* + * decide if this is the first heap TID in tuple we'll + * process, or if we should continue to process current + * posting list + */ + Assert(!BTreeTupleIsPivot(curitup)); + if (!BTreeTupleIsPosting(curitup)) + { + htid = curitup->t_tid; + inposting = false; + } + else if (!inposting) + { + /* First heap TID in posting list */ + inposting = true; + prevalldead = true; + curposti = 0; + } + + if (inposting) + htid = *BTreeTupleGetPostingN(curitup, curposti); /* * If we are doing a recheck, we expect to find the tuple we @@ -511,8 +545,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * not part of this chain because it had a different index * entry. */ - htid = itup->t_tid; - if (table_index_fetch_tuple_check(heapRel, &htid, + if (table_index_fetch_tuple_check(heapRel, &itup->t_tid, SnapshotSelf, NULL)) { /* Normal case --- it's still live */ @@ -570,12 +603,14 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, RelationGetRelationName(rel)))); } } - else if (all_dead) + else if (all_dead && (!inposting || + (prevalldead && + curposti == BTreeTupleGetNPosting(curitup) - 1))) { /* - * The conflicting tuple (or whole HOT chain) is dead to - * everyone, so we may as well mark the index entry - * killed. + * The conflicting tuple (or all HOT chains pointed to by + * all posting list TIDs) is dead to everyone, so mark the + * index entry killed. */ ItemIdMarkDead(curitemid); opaque->btpo_flags |= BTP_HAS_GARBAGE; @@ -589,14 +624,29 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, else MarkBufferDirtyHint(insertstate->buf, true); } + + /* + * Remember if posting list tuple has even a single HOT chain + * whose members are not all dead + */ + if (!all_dead && inposting) + prevalldead = false; } } - /* - * Advance to next tuple to continue checking. 
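The new control flow in _bt_check_unique() is easiest to see as a bare loop skeleton: the scan advances one heap TID per iteration, and only moves to the next line pointer once every TID in a posting list has been visited. A simplified standalone sketch of that loop shape (ToyTuple is a made-up stand-in; all locking, visibility and LP_DEAD logic is omitted):

#include <stdio.h>
#include <stdbool.h>

/* Toy stand-in for a leaf item: 1 TID for plain tuples, >1 for postings */
typedef struct ToyTuple
{
    int     ntids;
    int     tids[4];
} ToyTuple;

int
main(void)
{
    ToyTuple    page[] = {{1, {100}}, {3, {200, 201, 202}}, {1, {300}}};
    int         maxoff = 3;
    int         offset = 1;     /* 1-based, like OffsetNumber */
    int         curposti = 0;
    bool        inposting = false;

    while (offset <= maxoff)
    {
        ToyTuple   *cur = &page[offset - 1];

        inposting = (cur->ntids > 1);

        /* "check uniqueness" against exactly one heap TID per iteration */
        printf("offset %d, TID %d\n", offset, cur->tids[curposti]);

        if (inposting && curposti < cur->ntids - 1)
        {
            curposti++;         /* next TID in the same posting list */
            continue;
        }
        /* done with this line pointer; move on to the next item */
        curposti = 0;
        inposting = false;
        offset++;
    }
    return 0;
}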
- */ - if (offset < maxoff) + if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1) + { + /* Advance to next TID in same posting list */ + curposti++; + continue; + } + else if (offset < maxoff) + { + /* Advance to next tuple */ + curposti = 0; + inposting = false; offset = OffsetNumberNext(offset); + } else { int highkeycmp; @@ -621,6 +671,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); } + curposti = 0; + inposting = false; maxoff = PageGetMaxOffsetNumber(page); offset = P_FIRSTDATAKEY(opaque); /* Don't invalidate binary search bounds */ @@ -689,6 +741,7 @@ _bt_findinsertloc(Relation rel, BTScanInsert itup_key = insertstate->itup_key; Page page = BufferGetPage(insertstate->buf); BTPageOpaque lpageop; + OffsetNumber newitemoff; lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -704,6 +757,8 @@ _bt_findinsertloc(Relation rel, if (itup_key->heapkeyspace) { + bool dedupunique = false; + /* * If we're inserting into a unique index, we may have to walk right * through leaf pages to find the one leaf page that we must insert on @@ -717,9 +772,25 @@ _bt_findinsertloc(Relation rel, * tuple belongs on. The heap TID attribute for new tuple (scantid) * could force us to insert on a sibling page, though that should be * very rare in practice. + * + * checkingunique inserters that encounter a duplicate will apply + * deduplication when it looks like there will be a page split, but + * there is no LP_DEAD garbage on the leaf page to vacuum away (or + * there wasn't enough space freed by LP_DEAD cleanup). This + * complements the opportunistic LP_DEAD vacuuming mechanism. The + * high level goal is to avoid page splits caused by new, unchanged + * versions of existing logical rows altogether. See nbtree/README + * for full details. */ if (checkingunique) { + if (insertstate->low < insertstate->stricthigh) + { + /* Encountered a duplicate in _bt_check_unique() */ + Assert(insertstate->bounds_valid); + dedupunique = true; + } + for (;;) { /* @@ -746,18 +817,37 @@ _bt_findinsertloc(Relation rel, /* Update local state after stepping right */ page = BufferGetPage(insertstate->buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + /* Assume duplicates (helpful when initial page is empty) */ + dedupunique = true; } } /* * If the target page is full, see if we can obtain enough space by - * erasing LP_DEAD items + * erasing LP_DEAD items. If that doesn't work out, try to obtain + * enough free space to avoid a page split by deduplicating existing + * items (if deduplication is safe). 
*/ - if (PageGetFreeSpace(page) < insertstate->itemsz && - P_HAS_GARBAGE(lpageop)) + if (PageGetFreeSpace(page) < insertstate->itemsz) { - _bt_vacuum_one_page(rel, insertstate->buf, heapRel); - insertstate->bounds_valid = false; + if (P_HAS_GARBAGE(lpageop)) + { + _bt_vacuum_one_page(rel, insertstate->buf, heapRel); + insertstate->bounds_valid = false; + + /* Might as well assume duplicates if checkingunique */ + dedupunique = true; + } + + if (itup_key->safededup && BTGetUseDedup(rel) && + PageGetFreeSpace(page) < insertstate->itemsz && + (!checkingunique || dedupunique)) + { + _bt_dedup_one_page(rel, insertstate->buf, heapRel, + insertstate->itup, insertstate->itemsz, + checkingunique); + insertstate->bounds_valid = false; + } } } else @@ -839,7 +929,36 @@ _bt_findinsertloc(Relation rel, Assert(P_RIGHTMOST(lpageop) || _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); - return _bt_binsrch_insert(rel, insertstate); + newitemoff = _bt_binsrch_insert(rel, insertstate); + + if (insertstate->postingoff == -1) + { + /* + * There is an overlapping posting list tuple with its LP_DEAD bit + * set. _bt_insertonpg() cannot handle this, so delete all LP_DEAD + * items early. This is the only case where LP_DEAD deletes happen + * even though a page split wouldn't take place if we went straight to + * the _bt_insertonpg() call. + * + * Call _bt_dedup_one_page() instead of _bt_vacuum_one_page() to force + * deletes (this avoids relying on the BTP_HAS_GARBAGE hint flag, + * which might be falsely unset). Call can't actually dedup items, + * since we pass a newitemsz of 0. + */ + _bt_dedup_one_page(rel, insertstate->buf, heapRel, + insertstate->itup, 0, true); + + /* + * Do new binary search, having killed LP_DEAD items. New insert + * location cannot overlap with any posting list now. + */ + insertstate->bounds_valid = false; + insertstate->postingoff = 0; + newitemoff = _bt_binsrch_insert(rel, insertstate); + Assert(insertstate->postingoff == 0); + } + + return newitemoff; } /* @@ -905,10 +1024,12 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) * * This recursive procedure does the following things: * + * + if postingoff != 0, splits existing posting list tuple + * (since it overlaps with new 'itup' tuple). * + if necessary, splits the target page, using 'itup_key' for * suffix truncation on leaf pages (caller passes NULL for * non-leaf pages). - * + inserts the tuple. + * + inserts the new tuple (might be split from posting list). * + if the page was split, pops the parent stack, and finds the * right place to insert the new child pointer (by walking * right using information stored in the parent stack). @@ -936,11 +1057,15 @@ _bt_insertonpg(Relation rel, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int postingoff, bool split_only_page) { Page page; BTPageOpaque lpageop; Size itemsz; + IndexTuple oposting; + IndexTuple origitup = NULL; + IndexTuple nposting = NULL; page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -954,6 +1079,7 @@ _bt_insertonpg(Relation rel, Assert(P_ISLEAF(lpageop) || BTreeTupleGetNAtts(itup, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); + Assert(!BTreeTupleIsPosting(itup)); /* The caller should've finished any incomplete splits already. */ if (P_INCOMPLETE_SPLIT(lpageop)) @@ -964,6 +1090,34 @@ _bt_insertonpg(Relation rel, itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ + /* + * Do we need to split an existing posting list item? 
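For reference, the values that insertstate->postingoff can take after the binary search, as the code in this patch interprets them (toy_postingoff_meaning() below is purely illustrative, not a function from the patch):

#include <stdio.h>

/*
 * Hypothetical helper, only to summarize how callers in this patch treat
 * insertstate->postingoff after _bt_binsrch_insert().
 */
static const char *
toy_postingoff_meaning(int postingoff)
{
    if (postingoff == 0)
        return "no overlap: plain insert at newitemoff";
    if (postingoff > 0)
        return "new TID falls inside an existing posting list: split it "
            "via _bt_swap_posting() as part of the insert";
    return "-1: overlapping posting list is LP_DEAD; kill LP_DEAD items "
        "first, then repeat the binary search";
}

int
main(void)
{
    int         cases[] = {0, 3, -1};

    for (int i = 0; i < 3; i++)
        printf("postingoff %d => %s\n", cases[i],
               toy_postingoff_meaning(cases[i]));
    return 0;
}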
+ */ + if (postingoff != 0) + { + ItemId itemid = PageGetItemId(page, newitemoff); + + /* + * The new tuple is a duplicate with a heap TID that falls inside the + * range of an existing posting list tuple on a leaf page. Prepare to + * split an existing posting list. Overwriting the posting list with + * its post-split version is treated as an extra step in either the + * insert or page split critical section. + */ + Assert(P_ISLEAF(lpageop) && !ItemIdIsDead(itemid)); + Assert(itup_key->heapkeyspace && itup_key->safededup); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* use a mutable copy of itup as our itup from here on */ + origitup = itup; + itup = CopyIndexTuple(origitup); + nposting = _bt_swap_posting(itup, oposting, postingoff); + /* itup now contains rightmost TID from oposting */ + + /* Alter offset so that newitem goes after posting list */ + newitemoff = OffsetNumberNext(newitemoff); + } + /* * Do we need to split the page to fit the item on it? * @@ -996,7 +1150,8 @@ _bt_insertonpg(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* split the buffer into left and right halves */ - rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup); + rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup, + origitup, nposting, postingoff); PredicateLockPageSplit(rel, BufferGetBlockNumber(buf), BufferGetBlockNumber(rbuf)); @@ -1071,6 +1226,9 @@ _bt_insertonpg(Relation rel, /* Do the update. No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); + if (postingoff != 0) + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + if (!_bt_pgaddtup(page, itemsz, itup, newitemoff)) elog(PANIC, "failed to add new item to block %u in index \"%s\"", itup_blkno, RelationGetRelationName(rel)); @@ -1120,8 +1278,19 @@ _bt_insertonpg(Relation rel, XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); - if (P_ISLEAF(lpageop)) + if (P_ISLEAF(lpageop) && postingoff == 0) + { + /* Simple leaf insert */ xlinfo = XLOG_BTREE_INSERT_LEAF; + } + else if (postingoff != 0) + { + /* + * Leaf insert with posting list split. Must include + * postingoff field before newitem/orignewitem. + */ + xlinfo = XLOG_BTREE_INSERT_POST; + } else { /* @@ -1144,6 +1313,7 @@ _bt_insertonpg(Relation rel, xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + xlmeta.safededup = metad->btm_safededup; XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); @@ -1152,7 +1322,27 @@ _bt_insertonpg(Relation rel, } XLogRegisterBuffer(0, buf, REGBUF_STANDARD); - XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + if (postingoff == 0) + { + /* Simple, common case -- log itup from caller */ + XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + } + else + { + /* + * Insert with posting list split (XLOG_BTREE_INSERT_POST + * record) case. + * + * Log postingoff. Also log origitup, not itup. REDO routine + * must reconstruct final itup (as well as nposting) using + * _bt_swap_posting(). 
+ */ + uint16 upostingoff = postingoff; + + XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16)); + XLogRegisterBufData(0, (char *) origitup, + IndexTupleSize(origitup)); + } recptr = XLogInsert(RM_BTREE_ID, xlinfo); @@ -1194,6 +1384,14 @@ _bt_insertonpg(Relation rel, _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) RelationSetTargetBlock(rel, cachedBlock); } + + /* be tidy */ + if (postingoff != 0) + { + /* itup is actually a modified copy of caller's original */ + pfree(nposting); + pfree(itup); + } } /* @@ -1209,12 +1407,24 @@ _bt_insertonpg(Relation rel, * This function will clear the INCOMPLETE_SPLIT flag on it, and * release the buffer. * + * orignewitem, nposting, and postingoff are needed when an insert of + * orignewitem results in both a posting list split and a page split. + * These extra posting list split details are used here in the same + * way as they are used in the more common case where a posting list + * split does not coincide with a page split. We need to deal with + * posting list splits directly in order to ensure that everything + * that follows from the insert of orignewitem is handled as a single + * atomic operation (though caller's insert of a new pivot/downlink + * into parent page will still be a separate operation). See + * nbtree/README for details on the design of posting list splits. + * * Returns the new right sibling of buf, pinned and write-locked. * The pin and lock on buf are maintained. */ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, - OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem) + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff) { Buffer rbuf; Page origpage; @@ -1234,6 +1444,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber leftoff, rightoff; OffsetNumber firstright; + OffsetNumber origpagepostingoff; OffsetNumber maxoff; OffsetNumber i; bool newitemonleft, @@ -1303,6 +1514,34 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, PageSetLSN(leftpage, PageGetLSN(origpage)); isleaf = P_ISLEAF(oopaque); + /* + * Determine page offset number of existing overlapped-with-orignewitem + * posting list when it is necessary to perform a posting list split in + * passing. Note that newitem was already changed by caller (newitem no + * longer has the orignewitem TID). + * + * This page offset number (origpagepostingoff) will be used to pretend + * that the posting split has already taken place, even though the + * required modifications to origpage won't occur until we reach the + * critical section. The lastleft and firstright tuples of our page split + * point should, in effect, come from an imaginary version of origpage + * that has the nposting tuple instead of the original posting list tuple. + * + * Note: _bt_findsplitloc() should have compensated for coinciding posting + * list splits in just the same way, at least in theory. It doesn't + * bother with that, though. In practice it won't affect its choice of + * split point. 
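The "pretend the posting split already happened" trick described above boils down to substituting nposting at origpagepostingoff, and slotting newitem in at newitemoff, while walking origpage. A toy sketch of that substitution (strings stand in for index tuples; the left/right page assignment and high key handling are ignored):

#include <stdio.h>

int
main(void)
{
    /* original page items, identified by offset number (1-based) */
    const char *origpage[] = {NULL, "A", "B-posting(old)", "C", "D"};
    int         maxoff = 4;

    /* posting split already decided: offset 2 must appear as nposting */
    int         origpagepostingoff = 2;
    const char *nposting = "B-posting(new)";

    /* newitem (carrying the swapped-in rightmost TID) goes at offset 3 */
    int         newitemoff = 3;
    const char *newitem = "B2 (new item, swapped TID)";

    for (int i = 1; i <= maxoff; i++)
    {
        const char *item = origpage[i];

        if (i == origpagepostingoff)
            item = nposting;    /* pretend the posting split already happened */
        else if (i == newitemoff)
            printf("add: %s\n", newitem);   /* newitem belongs before this one */

        printf("add: %s\n", item);
    }
    return 0;
}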
+ */ + origpagepostingoff = InvalidOffsetNumber; + if (postingoff != 0) + { + Assert(isleaf); + Assert(ItemPointerCompare(&orignewitem->t_tid, + &newitem->t_tid) < 0); + Assert(BTreeTupleIsPosting(nposting)); + origpagepostingoff = OffsetNumberPrev(newitemoff); + } + /* * The "high key" for the new left page will be the first key that's going * to go into the new right page, or a truncated version if this is a leaf @@ -1340,6 +1579,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemid = PageGetItemId(origpage, firstright); itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + if (firstright == origpagepostingoff) + item = nposting; } /* @@ -1373,6 +1614,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); itemid = PageGetItemId(origpage, lastleftoff); lastleft = (IndexTuple) PageGetItem(origpage, itemid); + if (lastleftoff == origpagepostingoff) + lastleft = nposting; } Assert(lastleft != item); @@ -1388,6 +1631,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, */ leftoff = P_HIKEY; + Assert(BTreeTupleIsPivot(lefthikey) || !itup_key->heapkeyspace); Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0); Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts); if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff, @@ -1452,6 +1696,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemid = PageGetItemId(origpage, P_HIKEY); itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + Assert(BTreeTupleIsPivot(item) || !itup_key->heapkeyspace); Assert(BTreeTupleGetNAtts(item, rel) > 0); Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts); if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, @@ -1480,8 +1725,16 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + /* replace original item with nposting due to posting split? */ + if (i == origpagepostingoff) + { + Assert(BTreeTupleIsPosting(item)); + Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); + item = nposting; + } + /* does new item belong before this one? */ - if (i == newitemoff) + else if (i == newitemoff) { if (newitemonleft) { @@ -1650,8 +1903,12 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, XLogRecPtr recptr; xlrec.level = ropaque->btpo.level; + /* See comments below on newitem, orignewitem, and posting lists */ xlrec.firstright = firstright; xlrec.newitemoff = newitemoff; + xlrec.postingoff = 0; + if (postingoff != 0 && origpagepostingoff < firstright) + xlrec.postingoff = postingoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); @@ -1670,11 +1927,35 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * because it's included with all the other items on the right page.) * Show the new item as belonging to the left page buffer, so that it * is not stored if XLogInsert decides it needs a full-page image of - * the left page. We store the offset anyway, though, to support - * archive compression of these records. + * the left page. We always store newitemoff in the record, though. + * + * The details are sometimes slightly different for page splits that + * coincide with a posting list split. 
If both the replacement + * posting list and newitem go on the right page, then we don't need + * to log anything extra, just like the simple !newitemonleft + * no-posting-split case (postingoff is set to zero in the WAL record, + * so recovery doesn't need to process a posting list split at all). + * Otherwise, we set postingoff and log orignewitem instead of + * newitem, despite having actually inserted newitem. REDO routine + * must reconstruct nposting and newitem using _bt_swap_posting(). + * + * Note: It's possible that our page split point is the point that + * makes the posting list lastleft and newitem firstright. This is + * the only case where we log orignewitem/newitem despite newitem + * going on the right page. If XLogInsert decides that it can omit + * orignewitem due to logging a full-page image of the left page, + * everything still works out, since recovery only needs to log + * orignewitem for items on the left page (just like the regular + * newitem-logged case). */ - if (newitemonleft) + if (newitemonleft && xlrec.postingoff == 0) XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + else if (xlrec.postingoff != 0) + { + Assert(newitemonleft || firstright == newitemoff); + Assert(MAXALIGN(newitemsz) == IndexTupleSize(orignewitem)); + XLogRegisterBufData(0, (char *) orignewitem, MAXALIGN(newitemsz)); + } /* Log the left page's new high key */ itemid = PageGetItemId(origpage, P_HIKEY); @@ -1834,7 +2115,7 @@ _bt_insert_parent(Relation rel, /* Recursively insert into the parent */ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent, - new_item, stack->bts_offset + 1, + new_item, stack->bts_offset + 1, 0, is_only); /* be tidy */ @@ -2190,6 +2471,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) md.fastlevel = metad->btm_level; md.oldest_btpo_xact = metad->btm_oldest_btpo_xact; md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + md.safededup = metad->btm_safededup; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -2303,6 +2585,6 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) * Note: if we didn't find any LP_DEAD items, then the page's * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a * separate write to clear it, however. We will clear it when we split - * the page. + * the page, or when deduplication runs. 
*/ } diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index f05cbe7467..23ab30fa9b 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -24,6 +24,7 @@ #include "access/nbtree.h" #include "access/nbtxlog.h" +#include "access/tableam.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xloginsert.h" @@ -37,6 +38,8 @@ static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack); static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty); +static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page, + OffsetNumber *deletable, int ndeletable); static bool _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, Buffer *topparent, OffsetNumber *topoff, BlockNumber *target, BlockNumber *rightsib); @@ -47,7 +50,8 @@ static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, * _bt_initmetapage() -- Fill a page buffer with a correct metapage image */ void -_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) +_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool safededup) { BTMetaPageData *metad; BTPageOpaque metaopaque; @@ -63,6 +67,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) metad->btm_fastlevel = level; metad->btm_oldest_btpo_xact = InvalidTransactionId; metad->btm_last_cleanup_num_heap_tuples = -1.0; + metad->btm_safededup = safededup; metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); metaopaque->btpo_flags = BTP_META; @@ -102,6 +107,9 @@ _bt_upgrademetapage(Page page) metad->btm_version = BTREE_NOVAC_VERSION; metad->btm_oldest_btpo_xact = InvalidTransactionId; metad->btm_last_cleanup_num_heap_tuples = -1.0; + /* Only a REINDEX can set this field */ + Assert(!metad->btm_safededup); + metad->btm_safededup = false; /* Adjust pd_lower (see _bt_initmetapage() for details) */ ((PageHeader) page)->pd_lower = @@ -213,6 +221,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, md.fastlevel = metad->btm_fastlevel; md.oldest_btpo_xact = oldestBtpoXact; md.last_cleanup_num_heap_tuples = numHeapTuples; + md.safededup = metad->btm_safededup; XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata)); @@ -274,6 +283,8 @@ _bt_getroot(Relation rel, int access) Assert(metad->btm_magic == BTREE_MAGIC); Assert(metad->btm_version >= BTREE_MIN_VERSION); Assert(metad->btm_version <= BTREE_VERSION); + Assert(!metad->btm_safededup || + metad->btm_version > BTREE_NOVAC_VERSION); Assert(metad->btm_root != P_NONE); rootblkno = metad->btm_fastroot; @@ -394,6 +405,7 @@ _bt_getroot(Relation rel, int access) md.fastlevel = 0; md.oldest_btpo_xact = InvalidTransactionId; md.last_cleanup_num_heap_tuples = -1.0; + md.safededup = metad->btm_safededup; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -618,22 +630,33 @@ _bt_getrootheight(Relation rel) Assert(metad->btm_magic == BTREE_MAGIC); Assert(metad->btm_version >= BTREE_MIN_VERSION); Assert(metad->btm_version <= BTREE_VERSION); + Assert(!metad->btm_safededup || metad->btm_version > BTREE_NOVAC_VERSION); Assert(metad->btm_fastroot != P_NONE); return metad->btm_fastlevel; } /* - * _bt_heapkeyspace() -- is heap TID being treated as a key? + * _bt_metaversion() -- Get version/status info from metapage. 
+ * + * Sets caller's *heapkeyspace and *safededup arguments using data from + * the B-Tree metapage (could be locally-cached version). This + * information needs to be stashed in insertion scankey, so we provide a + * single function that fetches both at once. * * This is used to determine the rules that must be used to descend a * btree. Version 4 indexes treat heap TID as a tiebreaker attribute. * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable * performance when inserting a new BTScanInsert-wise duplicate tuple * among many leaf pages already full of such duplicates. + * + * Also sets field that indicates to caller whether or not it is safe to + * apply deduplication within index. Note that we rely on the assumption + * that btm_safededup will be zero'ed on heapkeyspace indexes that were + * pg_upgrade'd from Postgres 12. */ -bool -_bt_heapkeyspace(Relation rel) +void +_bt_metaversion(Relation rel, bool *heapkeyspace, bool *safededup) { BTMetaPageData *metad; @@ -651,10 +674,11 @@ _bt_heapkeyspace(Relation rel) */ if (metad->btm_root == P_NONE) { - uint32 btm_version = metad->btm_version; + *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION; + *safededup = metad->btm_safededup; _bt_relbuf(rel, metabuf); - return btm_version > BTREE_NOVAC_VERSION; + return; } /* @@ -678,9 +702,11 @@ _bt_heapkeyspace(Relation rel) Assert(metad->btm_magic == BTREE_MAGIC); Assert(metad->btm_version >= BTREE_MIN_VERSION); Assert(metad->btm_version <= BTREE_VERSION); + Assert(!metad->btm_safededup || metad->btm_version > BTREE_NOVAC_VERSION); Assert(metad->btm_fastroot != P_NONE); - return metad->btm_version > BTREE_NOVAC_VERSION; + *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION; + *safededup = metad->btm_safededup; } /* @@ -964,28 +990,90 @@ _bt_page_recyclable(Page page) * Delete item(s) from a btree leaf page during VACUUM. * * This routine assumes that the caller has a super-exclusive write lock on - * the buffer. Also, the given deletable array *must* be sorted in ascending - * order. + * the buffer. Also, the given deletable and updatable arrays *must* be + * sorted in ascending order. + * + * Routine deals with "deleting logical tuples" when some (but not all) of the + * heap TIDs in an existing posting list item are to be removed by VACUUM. + * This works by updating/overwriting an existing item with caller's new + * version of the item (a version that lacks the TIDs that are to be deleted). * * We record VACUUMs and b-tree deletes differently in WAL. Deletes must * generate their own latestRemovedXid by accessing the heap directly, whereas - * VACUUMs rely on the initial heap scan taking care of it indirectly. + * VACUUMs rely on the initial heap scan taking care of it indirectly. Also, + * only VACUUM can perform granular deletes of individual TIDs in posting list + * tuples. 
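From the caller's side, the new signature splits VACUUM's work on a leaf page into three buckets per item: leave alone, update with a smaller posting list, or delete outright. A standalone sketch of that classification (ToyTuple and tid_is_dead() are made-up stand-ins for the real tuples and the VACUUM callback):

#include <stdio.h>

/* Toy leaf item: ntids > 1 models a posting list tuple */
typedef struct ToyTuple
{
    int     ntids;
    int     tids[4];
} ToyTuple;

/* stand-in for the VACUUM callback: even TIDs are dead in this example */
static int
tid_is_dead(int tid)
{
    return (tid % 2) == 0;
}

int
main(void)
{
    ToyTuple    page[] = {
        {1, {7}},               /* plain tuple, live  -> keep      */
        {1, {8}},               /* plain tuple, dead  -> deletable */
        {3, {9, 10, 11}},       /* posting, some dead -> updatable */
        {2, {12, 14}},          /* posting, all dead  -> deletable */
    };
    int         deletable[4], ndeletable = 0;
    int         updatable[4], nupdatable = 0;

    for (int off = 0; off < 4; off++)
    {
        ToyTuple   *tup = &page[off];
        int         nlive = 0;

        for (int i = 0; i < tup->ntids; i++)
            if (!tid_is_dead(tup->tids[i]))
                nlive++;

        if (nlive == tup->ntids)
            continue;           /* nothing to do for this item */
        else if (nlive == 0)
            deletable[ndeletable++] = off;  /* whole physical tuple goes */
        else
            updatable[nupdatable++] = off;  /* overwrite with a smaller
                                             * posting list (the "updated"
                                             * tuple is built separately) */
    }

    printf("%d deletable, %d updatable\n", ndeletable, nupdatable);
    return 0;
}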
*/ void _bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *deletable, int ndeletable) + OffsetNumber *deletable, int ndeletable, + OffsetNumber *updatable, IndexTuple *updated, + int nupdatable) { Page page = BufferGetPage(buf); BTPageOpaque opaque; + IndexTuple itup; + Size itemsz; + char *updatedbuf = NULL; + Size updatedbuflen; /* Shouldn't be called unless there's something to do */ - Assert(ndeletable > 0); + Assert(ndeletable > 0 || nupdatable > 0); + + /* XLOG stuff -- allocate and fill buffer before critical section */ + if (nupdatable > 0 && RelationNeedsWAL(rel)) + { + Size offset; + + updatedbuflen = 0; + for (int i = 0; i < nupdatable; i++) + { + itup = updated[i]; + itemsz = MAXALIGN(IndexTupleSize(itup)); + updatedbuflen += itemsz; + } + + updatedbuf = palloc(updatedbuflen); + offset = 0; + for (int i = 0; i < nupdatable; i++) + { + itup = updated[i]; + itemsz = MAXALIGN(IndexTupleSize(itup)); + memcpy(updatedbuf + offset, itup, itemsz); + offset += itemsz; + } + } /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); - /* Fix the page */ - PageIndexMultiDelete(page, deletable, ndeletable); + /* + * Handle posting tuple updates. + * + * Deliberately do this before handling simple deletes. If we did it the + * other way around (i.e. WAL record order -- simple deletes before + * updates) then we'd have to make compensating changes to the 'updatable' + * array of offset numbers. + * + * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it + * happens to already be set. Although we unset the BTP_HAS_GARBAGE page + * level flag, unsetting individual LP_DEAD bits should still be avoided. + */ + for (int i = 0; i < nupdatable; i++) + { + OffsetNumber offnum = updatable[i]; + + itup = updated[i]; + itemsz = MAXALIGN(IndexTupleSize(itup)); + + if (!PageIndexTupleOverwrite(page, offnum, (Item) itup, itemsz)) + elog(PANIC, "could not update partially dead item in block %u of index \"%s\"", + BufferGetBlockNumber(buf), RelationGetRelationName(rel)); + } + + /* Now handle simple deletes of entire physical tuples */ + if (ndeletable > 0) + PageIndexMultiDelete(page, deletable, ndeletable); /* * We can clear the vacuum cycle ID since this page has certainly been @@ -1006,7 +1094,9 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * limited, since we never falsely unset an LP_DEAD bit. Workloads that * are particularly dependent on LP_DEAD bits being set quickly will * usually manage to set the BTP_HAS_GARBAGE flag before the page fills up - * again anyway. + * again anyway. Furthermore, attempting a deduplication pass will remove + * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit + * is set or not. */ opaque->btpo_flags &= ~BTP_HAS_GARBAGE; @@ -1019,18 +1109,22 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, xl_btree_vacuum xlrec_vacuum; xlrec_vacuum.ndeleted = ndeletable; + xlrec_vacuum.nupdated = nupdatable; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum); - /* - * The deletable array is not in the buffer, but pretend that it is. - * When XLogInsert stores the whole buffer, the array need not be - * stored too. 
- */ - XLogRegisterBufData(0, (char *) deletable, - ndeletable * sizeof(OffsetNumber)); + if (ndeletable > 0) + XLogRegisterBufData(0, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + if (nupdatable > 0) + { + XLogRegisterBufData(0, (char *) updatable, + nupdatable * sizeof(OffsetNumber)); + XLogRegisterBufData(0, updatedbuf, updatedbuflen); + } recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); @@ -1038,6 +1132,10 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, } END_CRIT_SECTION(); + + /* can't leak memory here */ + if (updatedbuf != NULL) + pfree(updatedbuf); } /* @@ -1050,6 +1148,9 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * This is nearly the same as _bt_delitems_vacuum as far as what it does to * the page, but it needs to generate its own latestRemovedXid by accessing * the heap. This is used by the REDO routine to generate recovery conflicts. + * Also, it doesn't handle posting list tuples unless the entire physical + * tuple can be deleted as a whole (since there is only one LP_DEAD bit per + * line pointer). */ void _bt_delitems_delete(Relation rel, Buffer buf, @@ -1065,8 +1166,7 @@ _bt_delitems_delete(Relation rel, Buffer buf, if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) latestRemovedXid = - index_compute_xid_horizon_for_tuples(rel, heapRel, buf, - deletable, ndeletable); + _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); @@ -1113,6 +1213,84 @@ _bt_delitems_delete(Relation rel, Buffer buf, END_CRIT_SECTION(); } +/* + * Get the latestRemovedXid from the table entries pointed to by the non-pivot + * tuples being deleted. + * + * This is a specialized version of index_compute_xid_horizon_for_tuples(). + * It's needed because btree tuples don't always store table TID using the + * standard index tuple header field. + */ +static TransactionId +_bt_xid_horizon(Relation rel, Relation heapRel, Page page, + OffsetNumber *deletable, int ndeletable) +{ + TransactionId latestRemovedXid = InvalidTransactionId; + int spacenhtids; + int nhtids; + ItemPointer htids; + + /* Array will grow iff there are posting list tuples to consider */ + spacenhtids = ndeletable; + nhtids = 0; + htids = (ItemPointer) palloc(sizeof(ItemPointerData) * spacenhtids); + for (int i = 0; i < ndeletable; i++) + { + ItemId itemid; + IndexTuple itup; + + itemid = PageGetItemId(page, deletable[i]); + itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(ItemIdIsDead(itemid)); + Assert(!BTreeTupleIsPivot(itup)); + + if (!BTreeTupleIsPosting(itup)) + { + if (nhtids + 1 > spacenhtids) + { + spacenhtids *= 2; + htids = (ItemPointer) + repalloc(htids, sizeof(ItemPointerData) * spacenhtids); + } + + Assert(ItemPointerIsValid(&itup->t_tid)); + ItemPointerCopy(&itup->t_tid, &htids[nhtids]); + nhtids++; + } + else + { + int nposting = BTreeTupleGetNPosting(itup); + + if (nhtids + nposting > spacenhtids) + { + spacenhtids = Max(spacenhtids * 2, nhtids + nposting); + htids = (ItemPointer) + repalloc(htids, sizeof(ItemPointerData) * spacenhtids); + } + + for (int j = 0; j < nposting; j++) + { + ItemPointer htid = BTreeTupleGetPostingN(itup, j); + + Assert(ItemPointerIsValid(htid)); + ItemPointerCopy(htid, &htids[nhtids]); + nhtids++; + } + } + } + + Assert(nhtids >= ndeletable); + + latestRemovedXid = + table_compute_xid_horizon_for_tuples(heapRel, htids, nhtids); + + /* be tidy */ + pfree(htids); + + return latestRemovedXid; +} + /* * Returns true, if the given block has the half-dead flag set. 
*/ @@ -2058,6 +2236,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) xlmeta.fastlevel = metad->btm_fastlevel; xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + xlmeta.safededup = metad->btm_safededup; XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); xlinfo = XLOG_BTREE_UNLINK_PAGE_META; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 8376a5e6b7..eabb839f43 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -95,6 +95,8 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTCycleId cycleid, TransactionId *oldestBtpoXact); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); +static ItemPointer btreevacuumposting(BTVacState *vstate, IndexTuple posting, + int *nremaining); /* @@ -158,7 +160,7 @@ btbuildempty(Relation index) /* Construct metapage. */ metapage = (Page) palloc(BLCKSZ); - _bt_initmetapage(metapage, P_NONE, 0); + _bt_initmetapage(metapage, P_NONE, 0, _bt_opclasses_support_dedup(index)); /* * Write the page and log it. It might seem that an immediate sync would @@ -261,8 +263,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) */ if (so->killedItems == NULL) so->killedItems = (int *) - palloc(MaxIndexTuplesPerPage * sizeof(int)); - if (so->numKilled < MaxIndexTuplesPerPage) + palloc(MaxBTreeIndexTuplesPerPage * sizeof(int)); + if (so->numKilled < MaxBTreeIndexTuplesPerPage) so->killedItems[so->numKilled++] = so->currPos.itemIndex; } @@ -1151,11 +1153,16 @@ restart: } else if (P_ISLEAF(opaque)) { - OffsetNumber deletable[MaxOffsetNumber]; + OffsetNumber deletable[MaxIndexTuplesPerPage]; int ndeletable; + IndexTuple updated[MaxIndexTuplesPerPage]; + OffsetNumber updatable[MaxIndexTuplesPerPage]; + int nupdatable; OffsetNumber offnum, minoff, maxoff; + int nhtidsdead, + nhtidslive; /* * Trade in the initial read lock for a super-exclusive write lock on @@ -1187,8 +1194,11 @@ restart: * point using callback. */ ndeletable = 0; + nupdatable = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); + nhtidsdead = 0; + nhtidslive = 0; if (callback) { for (offnum = minoff; @@ -1196,11 +1206,9 @@ restart: offnum = OffsetNumberNext(offnum)) { IndexTuple itup; - ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - htup = &(itup->t_tid); /* * Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM @@ -1223,22 +1231,90 @@ restart: * simple, and allows us to always avoid generating our own * conflicts. 
*/ - if (callback(htup, callback_state)) - deletable[ndeletable++] = offnum; + Assert(!BTreeTupleIsPivot(itup)); + if (!BTreeTupleIsPosting(itup)) + { + /* Regular tuple, standard table TID representation */ + if (callback(&itup->t_tid, callback_state)) + { + deletable[ndeletable++] = offnum; + nhtidsdead++; + } + else + nhtidslive++; + } + else + { + ItemPointer newhtids; + int nremaining; + + /* + * Posting list tuple, a physical tuple that represents + * two or more logical tuples, any of which could be an + * index row version that must be removed + */ + newhtids = btreevacuumposting(vstate, itup, &nremaining); + if (newhtids == NULL) + { + /* + * All table TIDs/logical tuples from the posting + * tuple remain, so no delete or update required + */ + Assert(nremaining == BTreeTupleGetNPosting(itup)); + } + else if (nremaining > 0) + { + IndexTuple updatedtuple; + + /* + * Form new tuple that contains only remaining TIDs. + * Remember this new tuple and the offset of the tuple + * to be updated for the page's _bt_delitems_vacuum() + * call. + */ + Assert(nremaining < BTreeTupleGetNPosting(itup)); + updatedtuple = _bt_form_posting(itup, newhtids, + nremaining); + updated[nupdatable] = updatedtuple; + updatable[nupdatable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining; + pfree(newhtids); + } + else + { + /* + * All table TIDs/logical tuples from the posting list + * must be deleted. We'll delete the physical index + * tuple completely (no update). + */ + Assert(nremaining == 0); + deletable[ndeletable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup); + pfree(newhtids); + } + + nhtidslive += nremaining; + } } } /* - * Apply any needed deletes. We issue just one _bt_delitems_vacuum() - * call per page, so as to minimize WAL traffic. + * Apply any needed deletes or updates. We issue just one + * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic. */ - if (ndeletable > 0) + if (ndeletable > 0 || nupdatable > 0) { - _bt_delitems_vacuum(rel, buf, deletable, ndeletable); + Assert(nhtidsdead >= Max(ndeletable, 1)); + _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable, + updated, nupdatable); - stats->tuples_removed += ndeletable; + stats->tuples_removed += nhtidsdead; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); + + /* can't leak memory here */ + for (int i = 0; i < nupdatable; i++) + pfree(updated[i]); } else { @@ -1251,6 +1327,7 @@ restart: * We treat this like a hint-bit update because there's no need to * WAL-log it. */ + Assert(nhtidsdead == 0); if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid) { @@ -1260,15 +1337,18 @@ restart: } /* - * If it's now empty, try to delete; else count the live tuples. We - * don't delete when recursing, though, to avoid putting entries into + * If it's now empty, try to delete; else count the live tuples (live + * table TIDs in posting lists are counted as live tuples). We don't + * delete when recursing, though, to avoid putting entries into * freePages out-of-order (doesn't seem worth any extra code to handle * the case). 
*/ if (minoff > maxoff) delete_now = (blkno == orig_blkno); else - stats->num_index_tuples += maxoff - minoff + 1; + stats->num_index_tuples += nhtidslive; + + Assert(!delete_now || nhtidslive == 0); } if (delete_now) @@ -1300,9 +1380,10 @@ restart: /* * This is really tail recursion, but if the compiler is too stupid to * optimize it as such, we'd eat an uncomfortably large amount of stack - * space per recursion level (due to the deletable[] array). A failure is - * improbable since the number of levels isn't likely to be large ... but - * just in case, let's hand-optimize into a loop. + * space per recursion level (due to the arrays used to track details of + * deletable/updatable items). A failure is improbable since the number + * of levels isn't likely to be large ... but just in case, let's + * hand-optimize into a loop. */ if (recurse_to != P_NONE) { @@ -1311,6 +1392,67 @@ restart: } } +/* + * btreevacuumposting --- determine TIDs still needed in posting list + * + * Returns new palloc'd array of item pointers needed to build + * replacement posting list tuple without the TIDs that VACUUM needs to + * delete. Returned value is NULL in the common case no changes are + * needed in caller's posting list tuple (we avoid allocating memory + * here as an optimization). + * + * The number of TIDs that should remain in the posting list tuple is + * set for caller in *nremaining. This indicates the number of elements + * in the returned array (assuming that return value isn't just NULL). + */ +static ItemPointer +btreevacuumposting(BTVacState *vstate, IndexTuple posting, int *nremaining) +{ + int live = 0; + int nitem = BTreeTupleGetNPosting(posting); + ItemPointer tmpitems = NULL, + items = BTreeTupleGetPosting(posting); + + for (int i = 0; i < nitem; i++) + { + if (!vstate->callback(items + i, vstate->callback_state)) + { + /* + * Live table TID. + * + * Only save live TID when we already know that we're going to + * have to kill at least one TID, and have already allocated + * memory. + */ + if (tmpitems) + tmpitems[live] = items[i]; + live++; + } + else if (tmpitems == NULL) + { + /* + * First dead table TID encountered. + * + * It's now clear that we need to delete one or more dead table + * TIDs, so start maintaining an array of live TIDs for caller to + * reconstruct smaller replacement posting list tuple + */ + tmpitems = palloc(sizeof(ItemPointerData) * nitem); + + /* Copy live TIDs skipped in previous iterations, if any */ + if (live > 0) + memcpy(tmpitems, items, sizeof(ItemPointerData) * live); + } + else + { + /* Second or subsequent dead table TID */ + } + } + + *nremaining = live; + return tmpitems; +} + /* * btcanreturn() -- Check whether btree indexes support index-only scans. 
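A standalone illustration of the contract that btreevacuumposting() (above) and its caller follow, with a SimpleTid struct and a plain callback standing in for ItemPointerData and the bulk-delete callback (sketch only, not part of the patch). NULL means every TID survives and the physical tuple is left alone; a shorter array means the tuple gets overwritten with a smaller posting list; zero survivors means the whole line pointer is deleted.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* stand-in for ItemPointerData */
typedef struct SimpleTid { unsigned int block; unsigned short offset; } SimpleTid;

/* stand-in for the bulk-delete callback: returns true if the TID is dead */
typedef bool (*DeadFn) (const SimpleTid *tid, void *state);

/*
 * Returns NULL when no TID is dead (common case, no allocation), otherwise
 * an allocated array holding only the live TIDs.  *nremaining is always
 * set; zero means "delete the whole posting list tuple".
 */
static SimpleTid *
vacuum_posting(const SimpleTid *tids, int ntids, DeadFn dead, void *state,
               int *nremaining)
{
    SimpleTid  *live = NULL;
    int         nlive = 0;

    for (int i = 0; i < ntids; i++)
    {
        if (!dead(&tids[i], state))
        {
            if (live)
                live[nlive] = tids[i];
            nlive++;
        }
        else if (live == NULL)
        {
            /* first dead TID: start keeping the survivors seen so far */
            live = malloc(sizeof(SimpleTid) * ntids);
            memcpy(live, tids, sizeof(SimpleTid) * nlive);
        }
        /* second or subsequent dead TID: nothing to do */
    }

    *nremaining = nlive;
    return live;
}

/* toy callback: every TID on block 42 is dead */
static bool
dead_on_block_42(const SimpleTid *tid, void *state)
{
    (void) state;
    return tid->block == 42;
}

int
main(void)
{
    SimpleTid   posting[] = {{41, 1}, {42, 1}, {42, 2}, {43, 1}};
    int         nremaining;
    SimpleTid  *live = vacuum_posting(posting, 4, dead_on_block_42, NULL,
                                      &nremaining);

    printf("%d TIDs remain\n", nremaining);     /* prints "2 TIDs remain" */
    free(live);
    return 0;
}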
* diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index c573814f01..362e9d9efa 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -26,10 +26,18 @@ static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +static int _bt_binsrch_posting(BTScanInsert key, Page page, + OffsetNumber offnum); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum); static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); +static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, ItemPointer heapTid, + IndexTuple itup); +static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, @@ -142,6 +150,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, offnum = _bt_binsrch(rel, key, *bufP); itemid = PageGetItemId(page, offnum); itup = (IndexTuple) PageGetItem(page, itemid); + Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace); blkno = BTreeTupleGetDownLink(itup); par_blkno = BufferGetBlockNumber(*bufP); @@ -434,7 +443,10 @@ _bt_binsrch(Relation rel, * low) makes bounds invalid. * * Caller is responsible for invalidating bounds when it modifies the page - * before calling here a second time. + * before calling here a second time, and for dealing with posting list + * tuple matches (callers can use insertstate's postingoff field to + * determine which existing heap TID will need to be replaced by a posting + * list split). */ OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate) @@ -453,6 +465,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) Assert(P_ISLEAF(opaque)); Assert(!key->nextkey); + Assert(insertstate->postingoff == 0); if (!insertstate->bounds_valid) { @@ -509,6 +522,16 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) if (result != 0) stricthigh = high; } + + /* + * If tuple at offset located by binary search is a posting list whose + * TID range overlaps with caller's scantid, perform posting list + * binary search to set postingoff for caller. Caller must split the + * posting list when postingoff is set. This should happen + * infrequently. + */ + if (unlikely(result == 0 && key->scantid != NULL)) + insertstate->postingoff = _bt_binsrch_posting(key, page, mid); } /* @@ -528,6 +551,73 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) return low; } +/*---------- + * _bt_binsrch_posting() -- posting list binary search. + * + * Helper routine for _bt_binsrch_insert(). + * + * Returns offset into posting list where caller's scantid belongs. + *---------- + */ +static int +_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum) +{ + IndexTuple itup; + ItemId itemid; + int low, + high, + mid, + res; + + /* + * If this isn't a posting tuple, then the index must be corrupt (if it is + * an ordinary non-pivot tuple then there must be an existing tuple with a + * heap TID that equals inserter's new heap TID/scantid). Defensively + * check that tuple is a posting list tuple whose posting list range + * includes caller's scantid. 
+ * + * (This is also needed because contrib/amcheck's rootdescend option needs + * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().) + */ + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + if (!BTreeTupleIsPosting(itup)) + return 0; + + Assert(key->heapkeyspace && key->safededup); + + /* + * In the event that posting list tuple has LP_DEAD bit set, indicate this + * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A + * second call to _bt_binsrch_insert() can take place when its caller has + * removed the dead item. + */ + if (ItemIdIsDead(itemid)) + return -1; + + /* "high" is past end of posting list for loop invariant */ + low = 0; + high = BTreeTupleGetNPosting(itup); + Assert(high >= 2); + + while (high > low) + { + mid = low + ((high - low) / 2); + res = ItemPointerCompare(key->scantid, + BTreeTupleGetPostingN(itup, mid)); + + if (res > 0) + low = mid + 1; + else if (res < 0) + high = mid; + else + return mid; + } + + /* Exact match not found */ + return low; +} + /*---------- * _bt_compare() -- Compare insertion-type scankey to tuple on a page. * @@ -537,9 +627,14 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) * <0 if scankey < tuple at offnum; * 0 if scankey == tuple at offnum; * >0 if scankey > tuple at offnum. - * NULLs in the keys are treated as sortable values. Therefore - * "equality" does not necessarily mean that the item should be - * returned to the caller as a matching key! + * + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be returned + * to the caller as a matching key. Similarly, an insertion scankey + * with its scantid set is treated as equal to a posting tuple whose TID + * range overlaps with their scantid. There generally won't be a + * matching TID in the posting tuple, which caller must handle + * themselves (e.g., by splitting the posting list tuple). * * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the @@ -563,6 +658,7 @@ _bt_compare(Relation rel, ScanKey scankey; int ncmpkey; int ntupatts; + int32 result; Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); @@ -597,7 +693,6 @@ _bt_compare(Relation rel, { Datum datum; bool isNull; - int32 result; datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); @@ -713,8 +808,25 @@ _bt_compare(Relation rel, if (heapTid == NULL) return 1; + /* + * scankey must be treated as equal to a posting list tuple if its scantid + * value falls within the range of the posting list. In all other cases + * there can only be a single heap TID value, which is compared directly + * as a simple scalar value. 
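The same lower-bound loop as a standalone program, with a SimpleTid comparator standing in for ItemPointerCompare() (illustration only, not part of the patch):

#include <stdio.h>

typedef struct SimpleTid { unsigned int block; unsigned short offset; } SimpleTid;

static int
tid_cmp(const SimpleTid *a, const SimpleTid *b)
{
    if (a->block != b->block)
        return a->block < b->block ? -1 : 1;
    if (a->offset != b->offset)
        return a->offset < b->offset ? -1 : 1;
    return 0;
}

/*
 * Lower-bound search: returns the slot in the (ascending) posting list where
 * scantid belongs, using the same "high is one past the end" loop invariant.
 */
static int
posting_binsrch(const SimpleTid *posting, int nposting, const SimpleTid *scantid)
{
    int         low = 0;
    int         high = nposting;    /* one past the last element */

    while (high > low)
    {
        int         mid = low + (high - low) / 2;
        int         res = tid_cmp(scantid, &posting[mid]);

        if (res > 0)
            low = mid + 1;
        else if (res < 0)
            high = mid;
        else
            return mid;             /* exact match (unexpected for inserts) */
    }
    return low;                     /* insertion point */
}

int
main(void)
{
    SimpleTid   list[] = {{10, 1}, {10, 7}, {12, 3}, {19, 2}};
    SimpleTid   incoming = {12, 1};

    printf("postingoff = %d\n", posting_binsrch(list, 4, &incoming));   /* 2 */
    return 0;
}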
+ */ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); - return ItemPointerCompare(key->scantid, heapTid); + result = ItemPointerCompare(key->scantid, heapTid); + if (result <= 0 || !BTreeTupleIsPosting(itup)) + return result; + else + { + result = ItemPointerCompare(key->scantid, + BTreeTupleGetMaxHeapTID(itup)); + if (result > 0) + return 1; + } + + return 0; } /* @@ -1229,7 +1341,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* Initialize remaining insertion scan key fields */ - inskey.heapkeyspace = _bt_heapkeyspace(rel); + _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.safededup); inskey.anynullkeys = false; /* unused */ inskey.nextkey = nextkey; inskey.pivotsearch = false; @@ -1484,9 +1596,35 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) { - /* tuple passes all scan key conditions, so remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); - itemIndex++; + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * "logical" tuple + */ + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Remember additional logical tuples */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + itemIndex++; + } + } } /* When !continuescan, there can't be any more matches, so stop */ if (!continuescan) @@ -1519,7 +1657,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (!continuescan) so->currPos.moreRight = false; - Assert(itemIndex <= MaxIndexTuplesPerPage); + Assert(itemIndex <= MaxBTreeIndexTuplesPerPage); so->currPos.firstItem = 0; so->currPos.lastItem = itemIndex - 1; so->currPos.itemIndex = 0; @@ -1527,7 +1665,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) else { /* load items[] in descending order */ - itemIndex = MaxIndexTuplesPerPage; + itemIndex = MaxBTreeIndexTuplesPerPage; offnum = Min(offnum, maxoff); @@ -1568,9 +1706,41 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) &continuescan); if (passes_quals && tuple_alive) { - /* tuple passes all scan key conditions, so remember it */ - itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * "logical" tuple. + * + * Note that we deliberately save/return items from + * posting lists in ascending heap TID order for backwards + * scans. This allows _bt_killitems() to make a + * consistent assumption about the order of items + * associated with the same posting list tuple. 
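A sketch of how one posting tuple fans out into several items[] slots that share a single stored base tuple (simplified types, not the real BTScanPosItem). The posting list is walked in ascending TID order in both scan directions; a backward scan fills slots downward and later consumes items[] from high slot to low slot, so the TIDs still come back ascending, which is the ordering assumption noted above.

#include <stdbool.h>
#include <stdio.h>

typedef struct SimpleTid { unsigned int block; unsigned short offset; } SimpleTid;

typedef struct ScanItem
{
    SimpleTid   heapTid;        /* one "logical tuple" */
    int         tupleOffset;    /* storage offset of the shared base tuple */
} ScanItem;

/*
 * Fan a posting list out into consecutive item slots.  Forward scans fill
 * upward from *itemIndex; backward scans fill downward, but the posting
 * list itself is always walked in ascending TID order.
 */
static void
save_posting(ScanItem *items, int *itemIndex, bool forward,
             const SimpleTid *posting, int nposting, int tupleOffset)
{
    for (int i = 0; i < nposting; i++)
    {
        int         slot = forward ? (*itemIndex)++ : --(*itemIndex);

        items[slot].heapTid = posting[i];
        items[slot].tupleOffset = tupleOffset;
    }
}

int
main(void)
{
    SimpleTid   posting[] = {{7, 1}, {7, 2}, {7, 3}};
    ScanItem    items[8];
    int         itemIndex = 8;  /* backward scans start at the top */

    save_posting(items, &itemIndex, false, posting, 3, 0);

    /* slots 7,6,5 now hold offsets 1,2,3: reading 7 down to 5 yields 1,2,3 */
    for (int slot = 7; slot >= itemIndex; slot--)
        printf("(%u,%u)\n", items[slot].heapTid.block,
               items[slot].heapTid.offset);
    return 0;
}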
+ */ + itemIndex--; + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + /* Remember additional logical tuples */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + } + } } if (!continuescan) { @@ -1584,8 +1754,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) Assert(itemIndex >= 0); so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxIndexTuplesPerPage - 1; - so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + so->currPos.lastItem = MaxBTreeIndexTuplesPerPage - 1; + so->currPos.itemIndex = MaxBTreeIndexTuplesPerPage - 1; } return (so->currPos.firstItem <= so->currPos.lastItem); @@ -1598,6 +1768,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, { BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup)); + currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; if (so->currTuples) @@ -1610,6 +1782,71 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, } } +/* + * Setup state to save posting items from a single posting list tuple. Saves + * the logical tuple that will be returned to scan first in passing. + * + * Saves an index item into so->currPos.items[itemIndex] for logical tuple + * that is returned to scan first. Second or subsequent heap TID for posting + * list should be saved by calling _bt_savepostingitem(). + * + * Returns an offset into tuple storage space that main tuple is stored at if + * needed. + */ +static int +_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + Assert(BTreeTupleIsPosting(itup)); + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + if (so->currTuples) + { + /* Save base IndexTuple (truncate posting list) */ + IndexTuple base; + Size itupsz = BTreeTupleGetPostingOffset(itup); + + itupsz = MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + memcpy(base, itup, itupsz); + /* Defensively reduce work area index tuple header size */ + base->t_info &= ~INDEX_SIZE_MASK; + base->t_info |= itupsz; + so->currPos.nextTupleOffset += itupsz; + + return currItem->tupleOffset; + } + + return 0; +} + +/* + * Save an index item into so->currPos.items[itemIndex] for posting tuple. + * + * Assumes that _bt_setuppostingitems() has already been called for current + * posting list tuple. 
+ */ +static inline void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + /* + * Have index-only scans return the same base IndexTuple for every logical + * tuple that originates from the same posting list + */ + if (so->currTuples) + currItem->tupleOffset = tupleOffset; +} + /* * _bt_steppage() -- Step to next page containing valid data for scan * diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index f163491d60..129fe8668a 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -243,6 +243,7 @@ typedef struct BTPageState BlockNumber btps_blkno; /* block # to write this page at */ IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */ OffsetNumber btps_lastoff; /* last item offset loaded */ + Size btps_lastextra; /* last item's extra posting list space */ uint32 btps_level; /* tree level (0 = leaf) */ Size btps_full; /* "full" if less than this much free space */ struct BTPageState *btps_next; /* link to parent level, if any */ @@ -277,7 +278,10 @@ static void _bt_slideleft(Page page); static void _bt_sortaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, - IndexTuple itup); + IndexTuple itup, Size truncextra); +static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, + BTPageState *state, + BTDedupState dstate); static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2); @@ -711,6 +715,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level) state->btps_lowkey = NULL; /* initialize lastoff so first item goes into P_FIRSTKEY */ state->btps_lastoff = P_HIKEY; + state->btps_lastextra = 0; state->btps_level = level; /* set "full" threshold based on level. See notes at head of file. */ if (level > 0) @@ -789,7 +794,8 @@ _bt_sortaddtup(Page page, } /*---------- - * Add an item to a disk page from the sort output. + * Add an item to a disk page from the sort output (or add a posting list + * item formed from the sort output). * * We must be careful to observe the page layout conventions of nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. @@ -821,14 +827,27 @@ _bt_sortaddtup(Page page, * the truncated high key at offset 1. * * 'last' pointer indicates the last offset added to the page. + * + * 'truncextra' is the size of the posting list in itup, if any. This + * information is stashed for the next call here, when we may benefit + * from considering the impact of truncating away the posting list on + * the page before deciding to finish the page off. Posting lists are + * often relatively large, so it is worth going to the trouble of + * accounting for the saving from truncating away the posting list of + * the tuple that becomes the high key (that may be the only way to + * get close to target free space on the page). Note that this is + * only used for the soft fillfactor-wise limit, not the critical hard + * limit. 
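A worked sketch of the soft limit test described above, with invented byte counts (not part of the patch): the page is finished early only if its free space, plus the posting list bytes that the eventual high key will shed, still falls short of the fillfactor target.

#include <stdbool.h>
#include <stdio.h>

/*
 * Soft "page is full enough" test used while loading leaf pages, including
 * the allowance for the posting list that suffix truncation is expected to
 * strip from the would-be high key.  All sizes are in bytes; the numbers in
 * main() are made up purely for illustration.
 */
static bool
finish_page_soft(int pgspc,             /* free space left on current page */
                 int last_truncextra,   /* posting list bytes in last item */
                 int btps_full)         /* fillfactor-derived target */
{
    return pgspc + last_truncextra < btps_full;
}

int
main(void)
{
    /* 700 bytes free looks "full" against a 760-byte target ... */
    printf("%d\n", finish_page_soft(700, 0, 760));      /* 1: finish the page */
    /* ... but not if truncating a 200-byte posting list buys the space back */
    printf("%d\n", finish_page_soft(700, 200, 760));    /* 0: keep adding items */
    return 0;
}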
*---------- */ static void -_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) +_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, + Size truncextra) { Page npage; BlockNumber nblkno; OffsetNumber last_off; + Size last_truncextra; Size pgspc; Size itupsz; bool isleaf; @@ -842,6 +861,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) npage = state->btps_page; nblkno = state->btps_blkno; last_off = state->btps_lastoff; + last_truncextra = state->btps_lastextra; + state->btps_lastextra = truncextra; pgspc = PageGetFreeSpace(npage); itupsz = IndexTupleSize(itup); @@ -883,10 +904,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * page. Disregard fillfactor and insert on "full" current page if we * don't have the minimum number of items yet. (Note that we deliberately * assume that suffix truncation neither enlarges nor shrinks new high key - * when applying soft limit.) + * when applying soft limit, except when last tuple had a posting list.) */ if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) || - (pgspc < state->btps_full && last_off > P_FIRSTKEY)) + (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY)) { /* * Finish off the page and write it out. @@ -944,11 +965,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * We don't try to bias our choice of split point to make it more * likely that _bt_truncate() can truncate away more attributes, * whereas the split point used within _bt_split() is chosen much - * more delicately. Suffix truncation is mostly useful because it - * improves space utilization for workloads with random - * insertions. It doesn't seem worthwhile to add logic for - * choosing a split point here for a benefit that is bound to be - * much smaller. + * more delicately. On the other hand, non-unique index builds + * usually deduplicate, which often results in every "physical" + * tuple on the page having distinct key values. When that + * happens, _bt_truncate() will never need to include a heap TID + * in the new high key. * * Overwrite the old item with new truncated high key directly. * oitup is already located at the physical beginning of tuple @@ -983,7 +1004,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 || !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); BTreeTupleSetDownLink(state->btps_lowkey, oblkno); - _bt_buildadd(wstate, state->btps_next, state->btps_lowkey); + _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0); pfree(state->btps_lowkey); /* @@ -1045,6 +1066,47 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) state->btps_lastoff = last_off; } +/* + * Finalize pending posting list tuple, and add it to the index. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple + * using _bt_buildadd(). 
+ */ +static void +_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, + BTDedupState dstate) +{ + IndexTuple final; + Size truncextra; + + Assert(dstate->nitems > 0); + truncextra = 0; + if (dstate->nitems == 1) + final = dstate->base; + else + { + IndexTuple postingtuple; + + /* form a tuple with a posting list */ + postingtuple = _bt_form_posting(dstate->base, + dstate->htids, + dstate->nhtids); + final = postingtuple; + /* Determine size of posting list */ + truncextra = IndexTupleSize(final) - + BTreeTupleGetPostingOffset(final); + } + + _bt_buildadd(wstate, state, final, truncextra); + + if (dstate->nitems > 1) + pfree(final); + dstate->nhtids = 0; + dstate->nitems = 0; + dstate->phystupsize = 0; +} + /* * Finish writing out the completed btree. */ @@ -1090,7 +1152,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 || !P_LEFTMOST(opaque)); BTreeTupleSetDownLink(s->btps_lowkey, blkno); - _bt_buildadd(wstate, s->btps_next, s->btps_lowkey); + _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0); pfree(s->btps_lowkey); s->btps_lowkey = NULL; } @@ -1111,7 +1173,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * by filling in a valid magic number in the metapage. */ metapage = (Page) palloc(BLCKSZ); - _bt_initmetapage(metapage, rootblkno, rootlevel); + _bt_initmetapage(metapage, rootblkno, rootlevel, + wstate->inskey->safededup); _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); } @@ -1132,6 +1195,9 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); SortSupport sortKeys; int64 tuples_done = 0; + bool deduplicate; + + deduplicate = wstate->inskey->safededup && BTGetUseDedup(wstate->index); if (merge) { @@ -1228,12 +1294,12 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) if (load1) { - _bt_buildadd(wstate, state, itup); + _bt_buildadd(wstate, state, itup, 0); itup = tuplesort_getindextuple(btspool->sortstate, true); } else { - _bt_buildadd(wstate, state, itup2); + _bt_buildadd(wstate, state, itup2, 0); itup2 = tuplesort_getindextuple(btspool2->sortstate, true); } @@ -1243,9 +1309,111 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) } pfree(sortKeys); } + else if (deduplicate) + { + /* merge is unnecessary, deduplicate into posting lists */ + BTDedupState dstate; + IndexTuple newbase; + + dstate = (BTDedupState) palloc(sizeof(BTDedupStateData)); + dstate->maxitemsize = 0; /* set later */ + dstate->checkingunique = false; /* unused */ + dstate->skippedbase = InvalidOffsetNumber; + /* Metadata about base tuple of current pending posting list */ + dstate->base = NULL; + dstate->baseoff = InvalidOffsetNumber; /* unused */ + dstate->basetupsize = 0; + /* Metadata about current pending posting list TIDs */ + dstate->htids = NULL; + dstate->nhtids = 0; + dstate->nitems = 0; + dstate->phystupsize = 0; /* unused */ + + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + { + state = _bt_pagestate(wstate, 0); + + /* + * Limit size of posting list tuples to the size of the free + * space we want to leave behind on the page, plus space for + * final item's line pointer (but make sure that posting list + * tuple size won't exceed the generic 1/3 of a page limit). 
+ * + * This is more conservative than the approach taken in the + * retail insert path, but it allows us to get most of the + * space savings deduplication provides without noticeably + * impacting how much free space is left behind on each leaf + * page. + */ + dstate->maxitemsize = + Min(Min(BTMaxItemSize(state->btps_page), INDEX_SIZE_MASK), + MAXALIGN_DOWN(state->btps_full) - sizeof(ItemIdData)); + /* Minimum posting tuple size used here is arbitrary: */ + dstate->maxitemsize = Max(dstate->maxitemsize, 100); + dstate->htids = palloc(dstate->maxitemsize); + + /* + * No previous/base tuple, since itup is the first item + * returned by the tuplesort -- use itup as base tuple of + * first pending posting list for entire index build + */ + newbase = CopyIndexTuple(itup); + _bt_dedup_start_pending(dstate, newbase, InvalidOffsetNumber); + } + else if (_bt_keep_natts_fast(wstate->index, dstate->base, + itup) > keysz && + _bt_dedup_save_htid(dstate, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list, and + * merging itup into pending posting list won't exceed the + * maxitemsize limit. Heap TID(s) for itup have been saved in + * state. The next iteration will also end up here if it's + * possible to merge the next tuple into the same pending + * posting list. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * maxitemsize limit was reached + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + /* Base tuple is always a copy */ + pfree(dstate->base); + + /* itup starts new pending posting list */ + newbase = CopyIndexTuple(itup); + _bt_dedup_start_pending(dstate, newbase, InvalidOffsetNumber); + } + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + + /* + * Handle the last item (there must be a last item when the tuplesort + * returned one or more tuples) + */ + if (state) + { + _bt_sort_dedup_finish_pending(wstate, state, dstate); + /* Base tuple is always a copy */ + pfree(dstate->base); + pfree(dstate->htids); + } + + pfree(dstate); + } else { - /* merge is unnecessary */ + /* merging and deduplication are both unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) { @@ -1253,7 +1421,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) if (state == NULL) state = _bt_pagestate(wstate, 0); - _bt_buildadd(wstate, state, itup); + _bt_buildadd(wstate, state, itup, 0); /* Report progress */ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 76c2d945c8..8ba055be9e 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -183,6 +183,9 @@ _bt_findsplitloc(Relation rel, state.minfirstrightsz = SIZE_MAX; state.newitemoff = newitemoff; + /* newitem cannot be a posting list item */ + Assert(!BTreeTupleIsPosting(newitem)); + /* * maxsplits should never exceed maxoff because there will be at most as * many candidate split points as there are points _between_ tuples, once @@ -459,6 +462,7 @@ _bt_recsplitloc(FindSplitData *state, int16 leftfree, rightfree; Size firstrightitemsz; + Size postingsz = 0; bool newitemisfirstonright; /* Is the new item going to be the first item on the right page? 
*/ @@ -468,8 +472,30 @@ _bt_recsplitloc(FindSplitData *state, if (newitemisfirstonright) firstrightitemsz = state->newitemsz; else + { firstrightitemsz = firstoldonrightsz; + /* + * Calculate suffix truncation space saving when firstright is a + * posting list tuple, though only when the firstright is over 64 + * bytes including line pointer overhead (arbitrary). This avoids + * accessing the tuple in cases where its posting list must be very + * small (if firstright has one at all). + */ + if (state->is_leaf && firstrightitemsz > 64) + { + ItemId itemid; + IndexTuple newhighkey; + + itemid = PageGetItemId(state->page, firstoldonright); + newhighkey = (IndexTuple) PageGetItem(state->page, itemid); + + if (BTreeTupleIsPosting(newhighkey)) + postingsz = IndexTupleSize(newhighkey) - + BTreeTupleGetPostingOffset(newhighkey); + } + } + /* Account for all the old tuples */ leftfree = state->leftspace - olddataitemstoleft; rightfree = state->rightspace - @@ -491,11 +517,17 @@ _bt_recsplitloc(FindSplitData *state, * If we are on the leaf level, assume that suffix truncation cannot avoid * adding a heap TID to the left half's new high key when splitting at the * leaf level. In practice the new high key will often be smaller and - * will rarely be larger, but conservatively assume the worst case. + * will rarely be larger, but conservatively assume the worst case. We do + * go to the trouble of subtracting away posting list overhead, though + * only when it looks like it will make an appreciable difference. + * (Posting lists are the only case where truncation will typically make + * the final high key far smaller than firstright, so being a bit more + * precise there noticeably improves the balance of free space.) */ if (state->is_leaf) leftfree -= (int16) (firstrightitemsz + - MAXALIGN(sizeof(ItemPointerData))); + MAXALIGN(sizeof(ItemPointerData)) - + postingsz); else leftfree -= (int16) firstrightitemsz; @@ -691,7 +723,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff)); tup = (IndexTuple) PageGetItem(state->page, itemid); /* Do cheaper test first */ - if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) + if (BTreeTupleIsPosting(tup) || + !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 5ab4e712f1..27299c3f75 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -20,6 +20,7 @@ #include "access/nbtree.h" #include "access/reloptions.h" #include "access/relscan.h" +#include "catalog/catalog.h" #include "commands/progress.h" #include "lib/qunique.h" #include "miscadmin.h" @@ -107,7 +108,13 @@ _bt_mkscankey(Relation rel, IndexTuple itup) */ key = palloc(offsetof(BTScanInsertData, scankeys) + sizeof(ScanKeyData) * indnkeyatts); - key->heapkeyspace = itup == NULL || _bt_heapkeyspace(rel); + if (itup) + _bt_metaversion(rel, &key->heapkeyspace, &key->safededup); + else + { + key->heapkeyspace = true; + key->safededup = _bt_opclasses_support_dedup(rel); + } key->anynullkeys = false; /* initial assumption */ key->nextkey = false; key->pivotsearch = false; @@ -1373,6 +1380,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * attribute passes the qual. 
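For the _bt_recsplitloc() change earlier in this hunk, a worked sketch of the leftfree arithmetic with invented byte counts (illustration only): the left half is charged for a worst-case high key, firstright plus a MAXALIGN'd heap TID taken as 8 bytes here, but refunded the posting list bytes that truncation is certain to strip.

#include <stdio.h>

/*
 * Leaf-level free-space estimate for the left half of a candidate split
 * point.  The numbers in main() are invented for illustration.
 */
static int
left_free_after_split(int leftspace, int olddataitemstoleft,
                      int firstrightitemsz, int postingsz)
{
    int         leftfree = leftspace - olddataitemstoleft;

    /* worst-case new high key, minus the posting list truncation refund */
    leftfree -= firstrightitemsz + 8 - postingsz;
    return leftfree;
}

int
main(void)
{
    /* a 180-byte firstright tuple, 120 bytes of which are posting list */
    printf("no refund:   %d\n", left_free_after_split(4000, 3000, 180, 0));
    printf("with refund: %d\n", left_free_after_split(4000, 3000, 180, 120));
    return 0;
}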
*/ Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); continue; } @@ -1534,6 +1542,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); cmpresult = 0; if (subkey->sk_flags & SK_ROW_END) break; @@ -1773,10 +1782,65 @@ _bt_killitems(IndexScanDesc scan) { ItemId iid = PageGetItemId(page, offnum); IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + bool killtuple = false; - if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + if (BTreeTupleIsPosting(ituple)) { - /* found the item */ + int pi = i + 1; + int nposting = BTreeTupleGetNPosting(ituple); + int j; + + /* + * Note that we rely on the assumption that heap TIDs in the + * scanpos items array are always in ascending heap TID order + * within a posting list + */ + for (j = 0; j < nposting; j++) + { + ItemPointer item = BTreeTupleGetPostingN(ituple, j); + + if (!ItemPointerEquals(item, &kitem->heapTid)) + break; /* out of posting list loop */ + + /* kitem must have matching offnum when heap TIDs match */ + Assert(kitem->indexOffset == offnum); + + /* + * Read-ahead to later kitems here. + * + * We rely on the assumption that not advancing kitem here + * will prevent us from considering the posting list tuple + * fully dead by not matching its next heap TID in next + * loop iteration. + * + * If, on the other hand, this is the final heap TID in + * the posting list tuple, then tuple gets killed + * regardless (i.e. we handle the case where the last + * kitem is also the last heap TID in the last index tuple + * correctly -- posting tuple still gets killed). + */ + if (pi < numKilled) + kitem = &so->currPos.items[so->killedItems[pi++]]; + } + + /* + * Don't bother advancing the outermost loop's int iterator to + * avoid processing killed items that relate to the same + * offnum/posting list tuple. This micro-optimization hardly + * seems worth it. (Further iterations of the outermost loop + * will fail to match on this same posting list's first heap + * TID instead, so we'll advance to the next offnum/index + * tuple pretty quickly.) + */ + if (j == nposting) + killtuple = true; + } + else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + killtuple = true; + + if (killtuple) + { + /* found the item/all posting list items */ ItemIdMarkDead(iid); killedsomething = true; break; /* out of inner search loop */ @@ -2017,7 +2081,9 @@ btoptions(Datum reloptions, bool validate) static const relopt_parse_elt tab[] = { {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)}, {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, - offsetof(BTOptions, vacuum_cleanup_index_scale_factor)} + offsetof(BTOptions, vacuum_cleanup_index_scale_factor)}, + {"deduplication", RELOPT_TYPE_BOOL, + offsetof(BTOptions, deduplication)} }; @@ -2118,11 +2184,10 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, Size newsize; /* - * We should only ever truncate leaf index tuples. It's never okay to - * truncate a second time. + * We should only ever truncate non-pivot tuples from leaf pages. It's + * never okay to truncate when splitting an internal page. 
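A simplified model of the rule _bt_killitems() applies above: there is one LP_DEAD bit per line pointer, so a posting list tuple may only be marked dead once every TID it contains has been killed. SimpleTid and the flat kill[] array are stand-ins for the real kitem bookkeeping (sketch only, not part of the patch).

#include <stdbool.h>
#include <stdio.h>

typedef struct SimpleTid { unsigned int block; unsigned short offset; } SimpleTid;

static int
tid_cmp(const SimpleTid *a, const SimpleTid *b)
{
    if (a->block != b->block)
        return a->block < b->block ? -1 : 1;
    if (a->offset != b->offset)
        return a->offset < b->offset ? -1 : 1;
    return 0;
}

/*
 * kill[] holds the killed TIDs that belong to this posting tuple, in
 * ascending order (the order the scan saved them in).  Return true only
 * when every TID in the posting list was killed.
 */
static bool
posting_fully_dead(const SimpleTid *posting, int nposting,
                   const SimpleTid *kill, int nkill)
{
    int         k = 0;

    for (int j = 0; j < nposting; j++)
    {
        if (k >= nkill || tid_cmp(&kill[k], &posting[j]) != 0)
            return false;       /* at least one TID may still be visible */
        k++;
    }
    return true;                /* safe to set LP_DEAD on the line pointer */
}

int
main(void)
{
    SimpleTid   posting[] = {{5, 1}, {5, 2}, {5, 3}};
    SimpleTid   killed[] = {{5, 1}, {5, 3}};

    printf("%d\n", posting_fully_dead(posting, 3, killed, 2));  /* 0 */
    printf("%d\n", posting_fully_dead(posting, 3, posting, 3)); /* 1 */
    return 0;
}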
*/ - Assert(BTreeTupleGetNAtts(lastleft, rel) == natts); - Assert(BTreeTupleGetNAtts(firstright, rel) == natts); + Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); /* Determine how many attributes must be kept in truncated tuple */ keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); @@ -2138,6 +2203,19 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pivot = index_truncate_tuple(itupdesc, firstright, keepnatts); + if (BTreeTupleIsPosting(pivot)) + { + /* + * index_truncate_tuple() just returns a straight copy of + * firstright when it has no key attributes to truncate. We need + * to truncate away the posting list ourselves. + */ + Assert(keepnatts == nkeyatts); + Assert(natts == nkeyatts); + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright)); + } + /* * If there is a distinguishing key attribute within new pivot tuple, * there is no need to add an explicit heap TID attribute @@ -2154,6 +2232,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute to the new pivot tuple. */ Assert(natts != nkeyatts); + Assert(!BTreeTupleIsPosting(lastleft) && + !BTreeTupleIsPosting(firstright)); newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData)); tidpivot = palloc0(newsize); memcpy(tidpivot, pivot, IndexTupleSize(pivot)); @@ -2171,6 +2251,18 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, newsize = IndexTupleSize(firstright) + MAXALIGN(sizeof(ItemPointerData)); pivot = palloc0(newsize); memcpy(pivot, firstright, IndexTupleSize(firstright)); + + if (BTreeTupleIsPosting(pivot)) + { + /* + * New pivot tuple was copied from firstright, which happens to be + * a posting list tuple. We will have to include a lastleft heap + * TID in the final pivot, but we can remove the posting list now. + * (Pivot tuples should never contain a posting list.) + */ + newsize = MAXALIGN(BTreeTupleGetPostingOffset(firstright)) + + MAXALIGN(sizeof(ItemPointerData)); + } } /* @@ -2198,7 +2290,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ pivotheaptid = (ItemPointer) ((char *) pivot + newsize - sizeof(ItemPointerData)); - ItemPointerCopy(&lastleft->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid); /* * Lehman and Yao require that the downlink to the right page, which is to @@ -2209,9 +2301,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * tiebreaker. */ #ifndef DEBUG_NO_TRUNCATE - Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0); - Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft), + BTreeTupleGetHeapTID(firstright)) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(lastleft)) >= 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #else /* @@ -2224,7 +2319,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute values along with lastleft's heap TID value when lastleft's * TID happens to be greater than firstright's TID. */ - ItemPointerCopy(&firstright->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); /* * Pivot heap TID should never be fully equal to firstright. 
Note that @@ -2233,7 +2328,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ ItemPointerSetOffsetNumber(pivotheaptid, OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #endif BTreeTupleSetNAtts(pivot, nkeyatts); @@ -2314,13 +2410,16 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the * majority of btree opclasses can never indicate that two datums are equal - * unless they're bitwise equal after detoasting. + * unless they're bitwise equal after detoasting. When an index is considered + * deduplication-safe by _bt_opclasses_support_dedup, routine is guaranteed to + * give the same result as _bt_keep_natts would. * - * These issues must be acceptable to callers, typically because they're only - * concerned about making suffix truncation as effective as possible without - * leaving excessive amounts of free space on either side of page split. - * Callers can rely on the fact that attributes considered equal here are - * definitely also equal according to _bt_keep_natts. + * Suffix truncation callers can rely on the fact that attributes considered + * equal here are definitely also equal according to _bt_keep_natts, even when + * the index uses an opclass or collation that is not deduplication-safe. + * This weaker guarantee is good enough for these callers, since false + * negatives generally only have the effect of making leaf page splits use a + * more balanced split point. */ int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) @@ -2398,22 +2497,36 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); tupnatts = BTreeTupleGetNAtts(itup, rel); + /* !heapkeyspace indexes do not support deduplication */ + if (!heapkeyspace && BTreeTupleIsPosting(itup)) + return false; + + /* Posting list tuples should never have "pivot heap TID" bit set */ + if (BTreeTupleIsPosting(itup) && + (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_PIVOT_HEAP_TID_ATTR) != 0) + return false; + + /* INCLUDE indexes do not support deduplication */ + if (natts != nkeyatts && BTreeTupleIsPosting(itup)) + return false; + if (P_ISLEAF(opaque)) { if (offnum >= P_FIRSTDATAKEY(opaque)) { /* - * Non-pivot tuples currently never use alternative heap TID - * representation -- even those within heapkeyspace indexes + * Non-pivot tuple should never be explicitly marked as a pivot + * tuple */ - if ((itup->t_info & INDEX_ALT_TID_MASK) != 0) + if (BTreeTupleIsPivot(itup)) return false; /* * Leaf tuples that are not the page high key (non-pivot tuples) * should never be truncated. (Note that tupnatts must have been - * inferred, rather than coming from an explicit on-disk - * representation.) + * inferred, even with a posting list tuple, because only pivot + * tuples store tupnatts directly.) */ return tupnatts == natts; } @@ -2457,12 +2570,12 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * non-zero, or when there is no explicit representation and the * tuple is evidently not a pre-pg_upgrade tuple. * - * Prior to v11, downlinks always had P_HIKEY as their offset. 
Use - * that to decide if the tuple is a pre-v11 tuple. + * Prior to v11, downlinks always had P_HIKEY as their offset. + * Accept that as an alternative indication of a valid + * !heapkeyspace negative infinity tuple. */ return tupnatts == 0 || - ((itup->t_info & INDEX_ALT_TID_MASK) == 0 && - ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); + ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY; } else { @@ -2488,7 +2601,11 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * heapkeyspace index pivot tuples, regardless of whether or not there are * non-key attributes. */ - if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + if (!BTreeTupleIsPivot(itup)) + return false; + + /* Pivot tuple should not use posting list representation (redundant) */ + if (BTreeTupleIsPosting(itup)) return false; /* @@ -2558,11 +2675,54 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, BTMaxItemSizeNoHeapTid(page), RelationGetRelationName(rel)), errdetail("Index row references tuple (%u,%u) in relation \"%s\".", - ItemPointerGetBlockNumber(&newtup->t_tid), - ItemPointerGetOffsetNumber(&newtup->t_tid), + ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)), + ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)), RelationGetRelationName(heap)), errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" "Consider a function index of an MD5 hash of the value, " "or use full text indexing."), errtableconstraint(heap, RelationGetRelationName(rel)))); } + +/* + * Is it safe to perform deduplication for an index, given the opclasses and + * collations used? + * + * Returned value is stored in index metapage during index builds. Function + * does not account for incompatibilities caused by index being on an earlier + * nbtree version. + */ +bool +_bt_opclasses_support_dedup(Relation index) +{ + /* INCLUDE indexes don't support deduplication */ + if (IndexRelationGetNumberOfAttributes(index) != + IndexRelationGetNumberOfKeyAttributes(index)) + return false; + + /* + * There is no reason why deduplication cannot be used with system catalog + * indexes. However, we deem it generally unsafe because it's not clear + * how it could be disabled. (ALTER INDEX is not supported with system + * catalog indexes, so users have no way to set the "deduplicate" storage + * parameter.) 
+ */ + if (IsCatalogRelation(index)) + return false; + + for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(index); i++) + { + Oid opfamily = index->rd_opfamily[i]; + Oid collation = index->rd_indcollation[i]; + + /* TODO add adequate check of opclasses and collations */ + elog(DEBUG4, "index %s column i %d opfamilyOid %u collationOid %u", + RelationGetRelationName(index), i, opfamily, collation); + + /* NUMERIC btree opfamily OID is 1988 */ + if (opfamily == 1988) + return false; + } + + return true; +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 2e5202c2d6..9a4b522950 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -22,6 +22,9 @@ #include "access/xlogutils.h" #include "miscadmin.h" #include "storage/procarray.h" +#include "utils/memutils.h" + +static MemoryContext opCtx; /* working memory for operations */ /* * _bt_restore_page -- re-enter all the index tuples on a page @@ -111,6 +114,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) Assert(md->btm_version >= BTREE_NOVAC_VERSION); md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact; md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples; + md->btm_safededup = xlrec->safededup; pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); pageop->btpo_flags = BTP_META; @@ -156,7 +160,8 @@ _bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id) } static void -btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) +btree_xlog_insert(bool isleaf, bool ismeta, bool posting, + XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record); @@ -181,9 +186,52 @@ btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) page = BufferGetPage(buffer); - if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, - false, false) == InvalidOffsetNumber) - elog(PANIC, "btree_xlog_insert: failed to add item"); + if (!posting) + { + /* Simple retail insertion */ + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_insert: failed to add item"); + } + else + { + ItemId itemid; + IndexTuple oposting, + newitem, + nposting; + uint16 postingoff; + + /* + * A posting list split occurred during leaf page insertion. WAL + * record data will start with an offset number representing the + * point in an existing posting list that a split occurs at. + * + * Use _bt_swap_posting() to repeat posting list split steps from + * primary. Note that newitem from WAL record is 'orignewitem', + * not the final version of newitem that is actually inserted on + * page. 
+ */ + postingoff = *((uint16 *) datapos); + datapos += sizeof(uint16); + datalen -= sizeof(uint16); + + itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum)); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* Use mutable, aligned newitem copy in _bt_swap_posting() */ + Assert(isleaf && postingoff > 0); + newitem = CopyIndexTuple((IndexTuple) datapos); + nposting = _bt_swap_posting(newitem, oposting, postingoff); + + /* Replace existing posting list with post-split version */ + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + + /* Insert "final" new item (not orignewitem from WAL stream) */ + Assert(IndexTupleSize(newitem) == datalen); + if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_insert: failed to add posting split new item"); + } PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -265,20 +313,38 @@ btree_xlog_split(bool onleft, XLogReaderState *record) BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); OffsetNumber off; IndexTuple newitem = NULL, - left_hikey = NULL; + left_hikey = NULL, + nposting = NULL; Size newitemsz = 0, left_hikeysz = 0; Page newlpage; - OffsetNumber leftoff; + OffsetNumber leftoff, + replacepostingoff = InvalidOffsetNumber; datapos = XLogRecGetBlockData(record, 0, &datalen); - if (onleft) + if (onleft || xlrec->postingoff != 0) { newitem = (IndexTuple) datapos; newitemsz = MAXALIGN(IndexTupleSize(newitem)); datapos += newitemsz; datalen -= newitemsz; + + if (xlrec->postingoff != 0) + { + ItemId itemid; + IndexTuple oposting; + + /* Posting list must be at offset number before new item's */ + replacepostingoff = OffsetNumberPrev(xlrec->newitemoff); + + /* Use mutable, aligned newitem copy in _bt_swap_posting() */ + newitem = CopyIndexTuple(newitem); + itemid = PageGetItemId(lpage, replacepostingoff); + oposting = (IndexTuple) PageGetItem(lpage, itemid); + nposting = _bt_swap_posting(newitem, oposting, + xlrec->postingoff); + } } /* @@ -308,8 +374,20 @@ btree_xlog_split(bool onleft, XLogReaderState *record) Size itemsz; IndexTuple item; + /* Add replacement posting list when required */ + if (off == replacepostingoff) + { + Assert(onleft || xlrec->firstright == xlrec->newitemoff); + if (PageAddItem(newlpage, (Item) nposting, + MAXALIGN(IndexTupleSize(nposting)), leftoff, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add new posting list item to left page after split"); + leftoff = OffsetNumberNext(leftoff); + continue; /* don't insert oposting */ + } + /* add the new item if it was inserted on left page */ - if (onleft && off == xlrec->newitemoff) + else if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) @@ -383,6 +461,56 @@ btree_xlog_split(bool onleft, XLogReaderState *record) } } +static void +btree_xlog_dedup(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record); + Buffer buf; + + if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(buf); + OffsetNumber offnum; + BTDedupState state; + + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->maxitemsize = Min(BTMaxItemSize(page), INDEX_SIZE_MASK); + state->checkingunique = false; /* unused */ + state->skippedbase = InvalidOffsetNumber; + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + 
state->htids = NULL; + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + state->htids = palloc(state->maxitemsize); + + for (offnum = xlrec->baseoff; + offnum < xlrec->baseoff + xlrec->nitems; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + if (offnum == xlrec->baseoff) + _bt_dedup_start_pending(state, itup, offnum); + else if (!_bt_dedup_save_htid(state, itup)) + elog(ERROR, "could not add heap tid to pending posting list"); + } + + Assert(state->nitems == xlrec->nitems); + _bt_dedup_finish_pending(buf, state, false); + + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + } + + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); +} + static void btree_xlog_vacuum(XLogReaderState *record) { @@ -405,7 +533,31 @@ btree_xlog_vacuum(XLogReaderState *record) page = (Page) BufferGetPage(buffer); - PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); + if (xlrec->nupdated > 0) + { + OffsetNumber *updatedoffsets; + IndexTuple updated; + Size itemsz; + + updatedoffsets = (OffsetNumber *) + (ptr + xlrec->ndeleted * sizeof(OffsetNumber)); + updated = (IndexTuple) ((char *) updatedoffsets + + xlrec->nupdated * sizeof(OffsetNumber)); + + for (int i = 0; i < xlrec->nupdated; i++) + { + itemsz = MAXALIGN(IndexTupleSize(updated)); + + if (!PageIndexTupleOverwrite(page, updatedoffsets[i], + (Item) updated, itemsz)) + elog(PANIC, "could not update partially dead item"); + + updated = (IndexTuple) ((char *) updated + itemsz); + } + } + + if (xlrec->ndeleted > 0) + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); /* * Mark the page as not containing any LP_DEAD items --- see comments @@ -724,17 +876,22 @@ void btree_redo(XLogReaderState *record) { uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + MemoryContext oldCtx; + oldCtx = MemoryContextSwitchTo(opCtx); switch (info) { case XLOG_BTREE_INSERT_LEAF: - btree_xlog_insert(true, false, record); + btree_xlog_insert(true, false, false, record); break; case XLOG_BTREE_INSERT_UPPER: - btree_xlog_insert(false, false, record); + btree_xlog_insert(false, false, false, record); break; case XLOG_BTREE_INSERT_META: - btree_xlog_insert(false, true, record); + btree_xlog_insert(false, true, false, record); + break; + case XLOG_BTREE_INSERT_POST: + btree_xlog_insert(true, false, true, record); break; case XLOG_BTREE_SPLIT_L: btree_xlog_split(true, record); @@ -742,6 +899,9 @@ btree_redo(XLogReaderState *record) case XLOG_BTREE_SPLIT_R: btree_xlog_split(false, record); break; + case XLOG_BTREE_DEDUP_PAGE: + btree_xlog_dedup(record); + break; case XLOG_BTREE_VACUUM: btree_xlog_vacuum(record); break; @@ -767,6 +927,23 @@ btree_redo(XLogReaderState *record) default: elog(PANIC, "btree_redo: unknown op code %u", info); } + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(opCtx); +} + +void +btree_xlog_startup(void) +{ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "Btree recovery temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +void +btree_xlog_cleanup(void) +{ + MemoryContextDelete(opCtx); + opCtx = NULL; } /* diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 7d63a7124e..68fad1c91f 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -27,6 +27,7 @@ btree_desc(StringInfo buf, XLogReaderState *record) case XLOG_BTREE_INSERT_LEAF: case XLOG_BTREE_INSERT_UPPER: case XLOG_BTREE_INSERT_META: + case XLOG_BTREE_INSERT_POST: 
{ xl_btree_insert *xlrec = (xl_btree_insert *) rec; @@ -38,15 +39,25 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_split *xlrec = (xl_btree_split *) rec; - appendStringInfo(buf, "level %u, firstright %d, newitemoff %d", - xlrec->level, xlrec->firstright, xlrec->newitemoff); + appendStringInfo(buf, "level %u, firstright %d, newitemoff %d, postingoff %d", + xlrec->level, xlrec->firstright, + xlrec->newitemoff, xlrec->postingoff); + break; + } + case XLOG_BTREE_DEDUP_PAGE: + { + xl_btree_dedup *xlrec = (xl_btree_dedup *) rec; + + appendStringInfo(buf, "baseoff %u; nitems %u", + xlrec->baseoff, xlrec->nitems); break; } case XLOG_BTREE_VACUUM: { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "ndeleted %u", xlrec->ndeleted); + appendStringInfo(buf, "ndeleted %u; nupdated %u", + xlrec->ndeleted, xlrec->nupdated); break; } case XLOG_BTREE_DELETE: @@ -130,6 +141,12 @@ btree_identify(uint8 info) case XLOG_BTREE_SPLIT_R: id = "SPLIT_R"; break; + case XLOG_BTREE_DEDUP_PAGE: + id = "DEDUPLICATE"; + break; + case XLOG_BTREE_INSERT_POST: + id = "INSERT_POST"; + break; case XLOG_BTREE_VACUUM: id = "VACUUM"; break; diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index f47176753d..32ff03b3e4 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -1055,8 +1055,10 @@ PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum) * This is better than deleting and reinserting the tuple, because it * avoids any data shifting when the tuple size doesn't change; and * even when it does, we avoid moving the line pointers around. - * Conceivably this could also be of use to an index AM that cares about - * the physical order of tuples as well as their ItemId order. + * This could be used by an index AM that doesn't want to unset the + * LP_DEAD bit when it happens to be set. It could conceivably also be + * used by an index AM that cares about the physical order of tuples as + * well as their logical/ItemId order. * * If there's insufficient space for the new tuple, return false. Other * errors represent data-corruption problems, so we just elog. 
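To exercise the redo paths and desc output added above, a simple duplicate-heavy workload is enough. The sketch below uses hypothetical object names and assumes the default btree_deduplication setting: inserting many equal keys fills leaf pages and triggers deduplication (DEDUPLICATE / INSERT_POST records), while deleting part of the table and vacuuming should exercise the posting-list-aware VACUUM records (nupdated > 0).

    -- Hypothetical workload expected to generate the new WAL record types
    CREATE TABLE dedup_wal_test (dup int);
    CREATE INDEX dedup_wal_idx ON dedup_wal_test (dup);
    INSERT INTO dedup_wal_test SELECT 1 FROM generate_series(1, 10000);
    -- Delete some (but not all) of the heap TIDs behind each posting list,
    -- so that VACUUM has to rewrite posting list tuples in place
    DELETE FROM dedup_wal_test
        WHERE ctid IN (SELECT ctid FROM dedup_wal_test LIMIT 5000);
    VACUUM dedup_wal_test;

The resulting records can then be inspected with pg_waldump, which prints the strings produced by btree_desc() above ("baseoff ...; nitems ..." for DEDUPLICATE, "ndeleted ...; nupdated ..." for VACUUM).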
@@ -1142,7 +1144,8 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum, } /* Update the item's tuple length (other fields shouldn't change) */ - ItemIdSetNormal(tupid, offset + size_diff, newsize); + tupid->lp_off = offset + size_diff; + tupid->lp_len = newsize; /* Copy new tuple data onto page */ memcpy(PageGetItem(page, tupid), newtup, newsize); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index a4e5d0886a..f44f2ce93f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -28,6 +28,7 @@ #include "access/commit_ts.h" #include "access/gin.h" +#include "access/nbtree.h" #include "access/rmgr.h" #include "access/tableam.h" #include "access/transam.h" @@ -1091,6 +1092,15 @@ static struct config_bool ConfigureNamesBool[] = false, check_bonjour, NULL, NULL }, + { + {"btree_deduplication", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Enables B-tree index deduplication optimization."), + NULL + }, + &btree_deduplication, + true, + NULL, NULL, NULL + }, { {"track_commit_timestamp", PGC_POSTMASTER, REPLICATION, gettext_noop("Collects transaction commit time."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 087190ce63..739676b9d0 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -651,6 +651,7 @@ #vacuum_cleanup_index_scale_factor = 0.1 # fraction of total number of tuples # before index cleanup, 0 always performs # index cleanup +#btree_deduplication = on #bytea_output = 'hex' # hex, escape #xmlbinary = 'base64' #xmloption = 'content' diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 2fd88866c9..c374c64e2a 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -1685,14 +1685,14 @@ psql_completion(const char *text, int start, int end) /* ALTER INDEX SET|RESET ( */ else if (Matches("ALTER", "INDEX", MatchAny, "RESET", "(")) COMPLETE_WITH("fillfactor", - "vacuum_cleanup_index_scale_factor", /* BTREE */ + "vacuum_cleanup_index_scale_factor", "deduplication", /* BTREE */ "fastupdate", "gin_pending_list_limit", /* GIN */ "buffering", /* GiST */ "pages_per_range", "autosummarize" /* BRIN */ ); else if (Matches("ALTER", "INDEX", MatchAny, "SET", "(")) COMPLETE_WITH("fillfactor =", - "vacuum_cleanup_index_scale_factor =", /* BTREE */ + "vacuum_cleanup_index_scale_factor =", "deduplication =", /* BTREE */ "fastupdate =", "gin_pending_list_limit =", /* GIN */ "buffering =", /* GiST */ "pages_per_range =", "autosummarize =" /* BRIN */ diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 6a058ccdac..b533a99300 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -145,6 +145,7 @@ static void bt_tuple_present_callback(Relation index, ItemPointer tid, bool tupleIsAlive, void *checkstate); static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup); +static inline IndexTuple bt_posting_logical_tuple(IndexTuple itup, int n); static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup); static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); @@ -167,6 +168,7 @@ static ItemId PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, OffsetNumber offset); static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, bool nonpivot); +static inline ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup); /* * 
bt_index_check(index regclass, heapallindexed boolean) @@ -278,7 +280,8 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, if (btree_index_mainfork_expected(indrel)) { - bool heapkeyspace; + bool heapkeyspace, + safededup; RelationOpenSmgr(indrel); if (!smgrexists(indrel->rd_smgr, MAIN_FORKNUM)) @@ -288,7 +291,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, RelationGetRelationName(indrel)))); /* Check index, possibly against table it is an index on */ - heapkeyspace = _bt_heapkeyspace(indrel); + _bt_metaversion(indrel, &heapkeyspace, &safededup); bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, heapallindexed, rootdescend); } @@ -419,12 +422,13 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * Size Bloom filter based on estimated number of tuples in index, * while conservatively assuming that each block must contain at least - * MaxIndexTuplesPerPage / 5 non-pivot tuples. (Non-leaf pages cannot - * contain non-pivot tuples. That's okay because they generally make - * up no more than about 1% of all pages in the index.) + * MaxBTreeIndexTuplesPerPage / 3 "logical" tuples. heapallindexed + * verification fingerprints posting list heap TIDs as plain non-pivot + * tuples, complete with index keys. This allows its heap scan to + * behave as if posting lists do not exist. */ total_pages = RelationGetNumberOfBlocks(rel); - total_elems = Max(total_pages * (MaxIndexTuplesPerPage / 5), + total_elems = Max(total_pages * (MaxBTreeIndexTuplesPerPage / 3), (int64) state->rel->rd_rel->reltuples); /* Random seed relies on backend srandom() call to avoid repetition */ seed = random(); @@ -924,6 +928,7 @@ bt_target_page_check(BtreeCheckState *state) size_t tupsize; BTScanInsert skey; bool lowersizelimit; + ItemPointer scantid; CHECK_FOR_INTERRUPTS(); @@ -954,13 +959,15 @@ bt_target_page_check(BtreeCheckState *state) if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target, offset)) { + ItemPointer tid; char *itid, *htid; itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetPointsToTID(itup); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -994,18 +1001,21 @@ bt_target_page_check(BtreeCheckState *state) /* * Readonly callers may optionally verify that non-pivot tuples can - * each be found by an independent search that starts from the root + * each be found by an independent search that starts from the root. + * Note that we deliberately don't do individual searches for each + * "logical" posting list tuple, since the posting list itself is + * validated by other checks. */ if (state->rootdescend && P_ISLEAF(topaque) && !bt_rootdescend(state, itup)) { + ItemPointer tid = BTreeTupleGetPointsToTID(itup); char *itid, *htid; itid = psprintf("(%u,%u)", state->targetblock, offset); - htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumber(&(itup->t_tid)), - ItemPointerGetOffsetNumber(&(itup->t_tid))); + htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1017,6 +1027,40 @@ bt_target_page_check(BtreeCheckState *state) (uint32) state->targetlsn))); } + /* + * If tuple is actually a posting list, make sure posting list TIDs + * are in order. 
+ */ + if (BTreeTupleIsPosting(itup)) + { + ItemPointerData last; + ItemPointer current; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + + current = BTreeTupleGetPostingN(itup, i); + + if (ItemPointerCompare(current, &last) <= 0) + { + char *itid = psprintf("(%u,%u)", state->targetblock, offset); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("posting list heap TIDs out of order in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%X.", + itid, i, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + } + + ItemPointerCopy(current, &last); + } + } + /* Build insertion scankey for current page offset */ skey = bt_mkscankey_pivotsearch(state->rel, itup); @@ -1049,13 +1093,14 @@ bt_target_page_check(BtreeCheckState *state) if (tupsize > (lowersizelimit ? BTMaxItemSize(state->target) : BTMaxItemSizeNoHeapTid(state->target))) { + ItemPointer tid = BTreeTupleGetPointsToTID(itup); char *itid, *htid; itid = psprintf("(%u,%u)", state->targetblock, offset); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1074,12 +1119,32 @@ bt_target_page_check(BtreeCheckState *state) { IndexTuple norm; - norm = bt_normalize_tuple(state, itup); - bloom_add_element(state->filter, (unsigned char *) norm, - IndexTupleSize(norm)); - /* Be tidy */ - if (norm != itup) - pfree(norm); + if (BTreeTupleIsPosting(itup)) + { + /* Fingerprint all elements as distinct "logical" tuples */ + for (int i = 0; i < BTreeTupleGetNPosting(itup); i++) + { + IndexTuple logtuple; + + logtuple = bt_posting_logical_tuple(itup, i); + norm = bt_normalize_tuple(state, logtuple); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != logtuple) + pfree(norm); + pfree(logtuple); + } + } + else + { + norm = bt_normalize_tuple(state, itup); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != itup) + pfree(norm); + } } /* @@ -1087,7 +1152,8 @@ bt_target_page_check(BtreeCheckState *state) * * If there is a high key (if this is not the rightmost page on its * entire level), check that high key actually is upper bound on all - * page items. + * page items. If this is a posting list tuple, we'll need to set + * scantid to be highest TID in posting list. * * We prefer to check all items against high key rather than checking * just the last and trusting that the operator class obeys the @@ -1127,17 +1193,22 @@ bt_target_page_check(BtreeCheckState *state) * tuple. (See also: "Notes About Data Representation" in the nbtree * README.) */ + scantid = skey->scantid; + if (state->heapkeyspace && !BTreeTupleIsPivot(itup)) + skey->scantid = BTreeTupleGetMaxHeapTID(itup); + if (!P_RIGHTMOST(topaque) && !(P_ISLEAF(topaque) ? 
invariant_leq_offset(state, skey, P_HIKEY) : invariant_l_offset(state, skey, P_HIKEY))) { + ItemPointer tid = BTreeTupleGetPointsToTID(itup); char *itid, *htid; itid = psprintf("(%u,%u)", state->targetblock, offset); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1150,6 +1221,7 @@ bt_target_page_check(BtreeCheckState *state) (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + skey->scantid = scantid; /* * * Item order check * @@ -1160,15 +1232,17 @@ bt_target_page_check(BtreeCheckState *state) if (OffsetNumberNext(offset) <= max && !invariant_l_offset(state, skey, OffsetNumberNext(offset))) { + ItemPointer tid; char *itid, *htid, *nitid, *nhtid; itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetPointsToTID(itup); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); nitid = psprintf("(%u,%u)", state->targetblock, OffsetNumberNext(offset)); @@ -1177,9 +1251,10 @@ bt_target_page_check(BtreeCheckState *state) state->target, OffsetNumberNext(offset)); itup = (IndexTuple) PageGetItem(state->target, itemid); + tid = BTreeTupleGetPointsToTID(itup); nhtid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1953,10 +2028,10 @@ bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values, * verification. In particular, it won't try to normalize opclass-equal * datums with potentially distinct representations (e.g., btree/numeric_ops * index datums will not get their display scale normalized-away here). - * Normalization may need to be expanded to handle more cases in the future, - * though. For example, it's possible that non-pivot tuples could in the - * future have alternative logically equivalent representations due to using - * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication. + * Caller does normalization for non-pivot tuples that have a posting list, + * since dummy CREATE INDEX callback code generates new tuples with the same + * normalized representation. Deduplication is performed opportunistically, + * and in general there is no guarantee about how or when it will be applied. */ static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) @@ -1969,6 +2044,9 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) IndexTuple reformed; int i; + /* Caller should only pass "logical" non-pivot tuples here */ + Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup)); + /* Easy case: It's immediately clear that tuple has no varlena datums */ if (!IndexTupleHasVarwidths(itup)) return itup; @@ -2031,6 +2109,30 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) return reformed; } +/* + * Produce palloc()'d "logical" tuple for nth posting list entry. + * + * In general, deduplication is not supposed to change the logical contents of + * an index. Multiple logical index tuples are merged together into one + * physical posting list index tuple when convenient. 
+ * + * heapallindexed verification must normalize-away this variation in + * representation by converting posting list tuples into two or more "logical" + * tuples. Each logical tuple must be fingerprinted separately -- there must + * be one logical tuple for each corresponding Bloom filter probe during the + * heap scan. + * + * Note: Caller needs to call bt_normalize_tuple() with returned tuple. + */ +static inline IndexTuple +bt_posting_logical_tuple(IndexTuple itup, int n) +{ + Assert(BTreeTupleIsPosting(itup)); + + /* Returns non-posting-list tuple */ + return _bt_form_posting(itup, BTreeTupleGetPostingN(itup, n), 1); +} + /* * Search for itup in index, starting from fast root page. itup must be a * non-pivot tuple. This is only supported with heapkeyspace indexes, since @@ -2087,6 +2189,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) insertstate.itup = itup; insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); insertstate.itup_key = key; + insertstate.postingoff = 0; insertstate.bounds_valid = false; insertstate.buf = lbuf; @@ -2094,7 +2197,9 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) offnum = _bt_binsrch_insert(state->rel, &insertstate); /* Compare first >= matching item on leaf page, if any */ page = BufferGetPage(lbuf); + /* Should match on first heap TID when tuple has a posting list */ if (offnum <= PageGetMaxOffsetNumber(page) && + insertstate.postingoff <= 0 && _bt_compare(state->rel, key, page, offnum) == 0) exists = true; _bt_relbuf(state->rel, lbuf); @@ -2548,26 +2653,69 @@ PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, } /* - * BTreeTupleGetHeapTID() wrapper that lets caller enforce that a heap TID must - * be present in cases where that is mandatory. - * - * This doesn't add much as of BTREE_VERSION 4, since the INDEX_ALT_TID_MASK - * bit is effectively a proxy for whether or not the tuple is a pivot tuple. - * It may become more useful in the future, when non-pivot tuples support their - * own alternative INDEX_ALT_TID_MASK representation. + * BTreeTupleGetHeapTID() wrapper that enforces that a heap TID is present in + * cases where that is mandatory (i.e. for non-pivot tuples) */ static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, bool nonpivot) { - ItemPointer result = BTreeTupleGetHeapTID(itup); - BlockNumber targetblock = state->targetblock; + ItemPointer htid; - if (result == NULL && nonpivot) + /* + * Caller determines whether this is supposed to be a pivot or non-pivot + * tuple using page type and item offset number. Verify that tuple + * metadata agrees with this. 
+ */ + Assert(state->heapkeyspace); + if (BTreeTupleIsPivot(itup) && nonpivot) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected pivot tuple", + state->targetblock, + RelationGetRelationName(state->rel)))); + + if (!BTreeTupleIsPivot(itup) && !nonpivot) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected non-pivot tuple", + state->targetblock, + RelationGetRelationName(state->rel)))); + + htid = BTreeTupleGetHeapTID(itup); + if (!ItemPointerIsValid(htid) && nonpivot) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID", - targetblock, RelationGetRelationName(state->rel)))); + state->targetblock, + RelationGetRelationName(state->rel)))); - return result; + return htid; +} + +/* + * Return the "pointed to" TID for itup, which is used to generate a + * descriptive error message. itup must be a "data item" tuple (it wouldn't + * make much sense to call here with a high key tuple, since there won't be a + * valid downlink/block number to display). + * + * Returns either a heap TID (which will be the first heap TID in posting list + * if itup is posting list tuple), or a TID that contains downlink block + * number, plus some encoded metadata (e.g., the number of attributes present + * in itup). + */ +static inline ItemPointer +BTreeTupleGetPointsToTID(IndexTuple itup) +{ + /* + * Rely on the assumption that !heapkeyspace internal page data items will + * correctly return TID with downlink here -- BTreeTupleGetHeapTID() won't + * recognize it as a pivot tuple, but everything still works out because + * the t_tid field is still returned + */ + if (!BTreeTupleIsPivot(itup)) + return BTreeTupleGetHeapTID(itup); + + /* Pivot tuple returns TID with downlink block (heapkeyspace variant) */ + return &itup->t_tid; } diff --git a/doc/src/sgml/btree.sgml b/doc/src/sgml/btree.sgml index 5881ea5dd6..059477be1e 100644 --- a/doc/src/sgml/btree.sgml +++ b/doc/src/sgml/btree.sgml @@ -433,11 +433,122 @@ returns bool Implementation + + Internally, a B-tree index consists of a tree structure with leaf + pages. Each leaf page contains tuples that point to table entries + using a heap item pointer. Each tuple's key is considered unique + internally, since the item pointer is treated as part of the key. + + + An introduction to the btree index implementation can be found in + src/backend/access/nbtree/README. + + + + Deduplication - An introduction to the btree index implementation can be found in - src/backend/access/nbtree/README. + B-Tree supports deduplication. Existing + leaf page tuples with fully equal keys (equal prior to the heap + item pointer) are merged together into a single posting + list tuple. The keys appear only once in this + representation. A simple array of heap item pointers follows. + Posting lists are formed lazily, when a new item is + inserted that cannot fit on an existing leaf page. The immediate + goal of the deduplication process is to at least free enough space + to fit the new item; otherwise a leaf page split occurs, which + allocates a new leaf page. The key space + covered by the original leaf page is shared among the original page, + and its new right sibling page. 
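As a rough illustration of the behavior documented here, the following sketch (hypothetical names; it assumes the deduplication storage parameter added by this patch) builds the same index with and without deduplication on a low-cardinality integer column and compares the on-disk sizes:

    CREATE TABLE dedup_size_test AS
        SELECT i % 100 AS val FROM generate_series(1, 1000000) AS i;
    CREATE INDEX dedup_on_idx  ON dedup_size_test (val) WITH (deduplication = on);
    CREATE INDEX dedup_off_idx ON dedup_size_test (val) WITH (deduplication = off);
    SELECT pg_size_pretty(pg_relation_size('dedup_on_idx'))  AS with_dedup,
           pg_size_pretty(pg_relation_size('dedup_off_idx')) AS without_dedup;

With roughly 10,000 duplicates per distinct value, the deduplicated index should come out several times smaller, consistent with the estimates given below.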
+ + + Deduplication can greatly increase index space efficiency with data + sets where each distinct key appears at least a few times on + average. It can also reduce the cost of subsequent index scans, + especially when many leaf pages must be accessed. For example, an + index on a simple integer column that uses + deduplication will have a storage size that is only about 65% of an + equivalent unoptimized index when each distinct + integer value appears three times. If each distinct + integer value appears six times, the storage overhead + can be as low as 50% of baseline. With hundreds of duplicates per + distinct value (or with larger base key values) a + storage size of about one third of the + unoptimized case is expected. There is often a direct benefit for + queries, as well as an indirect benefit due to reduced I/O during + routine vacuuming. + + + Cases that don't benefit due to having no duplicate values will + incur a small performance penalty with mixed read-write workloads. + There is no performance penalty with read-only workloads, since + reading from posting lists is at least as efficient as reading the + standard index tuple representation. + + + + + Configuring Deduplication + + + The configuration + parameter controls deduplication. By default, deduplication is + enabled. The deduplication storage parameter + can be used to override the configuration parameter for individual + indexes. See + from the CREATE INDEX documentation for details. + + + + + Restrictions + + + Deduplication can only be used with indexes that use B-Tree + operator classes that were declared BITWISE. In + practice almost all datatypes support deduplication, though + numeric is a notable exception (the display + scale feature makes it impossible to enable deduplication + without losing useful information about equal numeric + datums). Deduplication is not supported with nondeterministic + collations, nor is it supported with INCLUDE + indexes. + + + Note that a multicolumn index is only considered to have duplicates + when there are index entries that repeat entire + combinations of values (the values stored in + each and every column must be equal). + + + + Internal use of Deduplication in unique indexes + + + Page splits that occur due to inserting multiple physical versions + (rather than inserting new logical rows) tend to degrade the + structure of indexes, especially in the case of unique indexes. + Unique indexes use deduplication internally + and selectively to delay (and ideally to + prevent) these unnecessary page splits. + VACUUM or autovacuum will eventually remove dead + versions of tuples from every index in any case, but usually cannot + reverse page splits (in general, the page must be completely empty + before VACUUM can delete it). + + + The configuration + parameter does not affect whether or not deduplication is used + within unique indexes. The internal use of deduplication for + unique indexes is subject to all of the same restrictions as + deduplication in general. The deduplication + storage parameter can be set to OFF to disable + deduplication in unique indexes, but this is intended only as a + debugging option for developers. 
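The interaction between the GUC and the storage parameter described above can be summarized with a short sketch (hypothetical names; assumes the btree_deduplication GUC and deduplication storage parameter added by this patch):

    SET btree_deduplication = off;          -- disable the global default
    CREATE TABLE dedup_conf_test (id int, grp int);
    CREATE INDEX dedup_conf_grp_idx ON dedup_conf_test (grp)
        WITH (deduplication = on);          -- per-index setting overrides the GUC
    -- Unique indexes deduplicate internally regardless of the GUC; only the
    -- storage parameter (intended for debugging) can turn that off:
    CREATE UNIQUE INDEX dedup_conf_id_idx ON dedup_conf_test (id);
    RESET btree_deduplication;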
+ + + diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 55669b5cad..9f371d3e3a 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -928,10 +928,11 @@ CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-tr nondeterministic collations give a more correct behavior, especially when considering the full power of Unicode and its many special cases, they also have some drawbacks. Foremost, their use leads - to a performance penalty. Also, certain operations are not possible with - nondeterministic collations, such as pattern matching operations. - Therefore, they should be used only in cases where they are specifically - wanted. + to a performance penalty. Note, in particular, that B-tree cannot use + deduplication with indexes that use a nondeterministic collation. Also, + certain operations are not possible with nondeterministic collations, + such as pattern matching operations. Therefore, they should be used + only in cases where they are specifically wanted. diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 5d1c90282f..05f442d57a 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -8021,6 +8021,31 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; + + btree_deduplication (boolean) + + btree_deduplication + configuration parameter + + + + + Controls whether deduplication should be used within B-Tree + indexes. Deduplication is an optimization that reduces the + storage size of indexes by storing equal index keys only once. + See for more + information. + + + + This setting can be overridden for individual B-Tree indexes + by changing index storage parameters. See from the + CREATE INDEX documentation for details. + + + + bytea_output (enum) diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index 629a31ef79..e6cdba4c29 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -166,6 +166,8 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] @@ -388,10 +390,39 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] - B-tree indexes additionally accept this parameter: + B-tree indexes also accept these parameters: + + deduplication + + deduplication + storage parameter + + + + + Per-index value for . + Controls usage of the B-tree deduplication technique described + in . Set to + ON or OFF to override GUC. + (Alternative spellings of ON and + OFF are allowed as described in .) + + + + + Turning deduplication off via ALTER + INDEX prevents future insertions from triggering + deduplication, but does not in itself make existing posting list + tuples use the standard tuple representation. + + + + + vacuum_cleanup_index_scale_factor @@ -446,9 +477,7 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] . It is a Boolean parameter: ON enables fast update, OFF disables it. - (Alternative spellings of ON and OFF are - allowed as described in .) The - default is ON. + The default is ON. 
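A small sketch of the ALTER INDEX behavior noted above (index name reused from the earlier sizing example; the REINDEX step is an assumption about the simplest way to rewrite the index so that existing posting lists go away):

    ALTER INDEX dedup_on_idx SET (deduplication = off);
    -- Future insertions no longer trigger deduplication; existing posting
    -- list tuples remain until the index is rewritten, e.g. with:
    REINDEX INDEX dedup_on_idx;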
diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index f567117a46..e32c8fa826 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -266,6 +266,22 @@ select * from btree_bpchar where f1::bpchar like 'foo%'; fool (2 rows) +-- +-- Perform unique checking, with and without the use of deduplication +-- +CREATE TABLE dedup_unique_test_table (a int) WITH (autovacuum_enabled=false); +CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplication=on); +CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplication=off); +-- Generate enough garbage tuples in index to ensure that even the unique index +-- with deduplication enabled has to check multiple leaf pages during unique +-- checking (at least with a BLCKSZ of 8192 or less) +DO $$ +BEGIN + FOR r IN 1..1350 LOOP + DELETE FROM dedup_unique_test_table; + INSERT INTO dedup_unique_test_table SELECT 1; + END LOOP; +END$$; -- -- Test B-tree fast path (cache rightmost leaf page) optimization. -- diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index 558dcae0ec..627ba80bc1 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -103,6 +103,23 @@ explain (costs off) select * from btree_bpchar where f1::bpchar like 'foo%'; select * from btree_bpchar where f1::bpchar like 'foo%'; +-- +-- Perform unique checking, with and without the use of deduplication +-- +CREATE TABLE dedup_unique_test_table (a int) WITH (autovacuum_enabled=false); +CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplication=on); +CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplication=off); +-- Generate enough garbage tuples in index to ensure that even the unique index +-- with deduplication enabled has to check multiple leaf pages during unique +-- checking (at least with a BLCKSZ of 8192 or less) +DO $$ +BEGIN + FOR r IN 1..1350 LOOP + DELETE FROM dedup_unique_test_table; + INSERT INTO dedup_unique_test_table SELECT 1; + END LOOP; +END$$; + -- -- Test B-tree fast path (cache rightmost leaf page) optimization. -- -- 2.17.1
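The regression test added above exercises unique checking but, at least in the hunks shown, does not verify the resulting index structure. Given the amcheck changes earlier in this patch, a natural follow-up (not part of the patch itself) would be heapallindexed verification of both test indexes, which fingerprints each posting list entry as a separate logical tuple:

    CREATE EXTENSION IF NOT EXISTS amcheck;
    SELECT bt_index_check('dedup_unique', true);
    SELECT bt_index_check('plain_unique', true);
    -- bt_index_parent_check with rootdescend enabled can also be used; per
    -- the comments in verify_nbtree.c it performs one root descent per
    -- posting list tuple rather than one per heap TID.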