From ec58bc56548045d34bd92d2042432f7d5eaee5d4 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Wed, 8 Mar 2017 13:49:43 -0300 Subject: [PATCH 5/6] warm updates v16 --- contrib/bloom/blutils.c | 1 + src/backend/access/brin/brin.c | 1 + src/backend/access/gist/gist.c | 1 + src/backend/access/hash/hash.c | 5 +- src/backend/access/hash/hashsearch.c | 5 + src/backend/access/hash/hashutil.c | 110 +++++++++ src/backend/access/heap/README.WARM | 306 +++++++++++++++++++++++++ src/backend/access/heap/heapam.c | 256 +++++++++++++++++++-- src/backend/access/heap/pruneheap.c | 7 + src/backend/access/index/indexam.c | 89 ++++++-- src/backend/access/nbtree/nbtinsert.c | 229 +++++++++++-------- src/backend/access/nbtree/nbtree.c | 5 +- src/backend/access/nbtree/nbtutils.c | 104 +++++++++ src/backend/access/spgist/spgutils.c | 1 + src/backend/catalog/index.c | 15 ++ src/backend/catalog/indexing.c | 57 ++++- src/backend/catalog/system_views.sql | 4 +- src/backend/commands/constraint.c | 4 +- src/backend/commands/copy.c | 3 + src/backend/commands/indexcmds.c | 17 +- src/backend/commands/vacuumlazy.c | 25 ++ src/backend/executor/execIndexing.c | 18 +- src/backend/executor/execReplication.c | 25 +- src/backend/executor/nodeBitmapHeapscan.c | 21 +- src/backend/executor/nodeIndexscan.c | 6 +- src/backend/executor/nodeModifyTable.c | 27 ++- src/backend/postmaster/pgstat.c | 7 +- src/backend/utils/adt/pgstatfuncs.c | 31 +++ src/backend/utils/cache/relcache.c | 61 ++++- src/include/access/amapi.h | 8 + src/include/access/hash.h | 4 + src/include/access/heapam.h | 12 +- src/include/access/heapam_xlog.h | 1 + src/include/access/htup_details.h | 29 ++- src/include/access/nbtree.h | 2 + src/include/access/relscan.h | 3 +- src/include/catalog/pg_proc.h | 4 + src/include/executor/executor.h | 1 + src/include/executor/nodeIndexscan.h | 1 - src/include/nodes/execnodes.h | 1 + src/include/pgstat.h | 4 +- src/include/utils/rel.h | 5 + src/include/utils/relcache.h | 4 +- src/test/regress/expected/rules.out | 12 +- src/test/regress/expected/warm.out | 367 ++++++++++++++++++++++++++++++ src/test/regress/parallel_schedule | 2 + src/test/regress/sql/warm.sql | 171 ++++++++++++++ 47 files changed, 1905 insertions(+), 167 deletions(-) create mode 100644 src/backend/access/heap/README.WARM create mode 100644 src/test/regress/expected/warm.out create mode 100644 src/test/regress/sql/warm.sql diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index f2eda67..b356e2b 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -142,6 +142,7 @@ blhandler(PG_FUNCTION_ARGS) amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index b22563b..b4a1465 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -116,6 +116,7 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 6593771..843389b 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -94,6 +94,7 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amrecheck = 
NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 1f8a7f6..9b20ae6 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -90,6 +90,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amrecheck = hashrecheck; PG_RETURN_POINTER(amroutine); } @@ -271,6 +272,8 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) OffsetNumber offnum; ItemPointer current; bool res; + IndexTuple itup; + /* Hash indexes are always lossy since we store only the hash code */ scan->xs_recheck = true; @@ -308,8 +311,6 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) { - IndexTuple itup; - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid))) break; diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 9e5d7e4..60e941d 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -59,6 +59,8 @@ _hash_next(IndexScanDesc scan, ScanDirection dir) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); so->hashso_heappos = itup->t_tid; + if (scan->xs_want_itup) + scan->xs_itup = itup; return true; } @@ -363,6 +365,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); so->hashso_heappos = itup->t_tid; + if (scan->xs_want_itup) + scan->xs_itup = itup; + return true; } diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index c705531..dcba734 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -17,8 +17,12 @@ #include "access/hash.h" #include "access/reloptions.h" #include "access/relscan.h" +#include "catalog/index.h" +#include "executor/executor.h" +#include "nodes/execnodes.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/datum.h" #define CALC_NEW_BUCKET(old_bucket, lowmask) \ old_bucket | (lowmask + 1) @@ -446,3 +450,109 @@ _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, return new_bucket; } + +/* + * Recheck if the heap tuple satisfies the key stored in the index tuple + */ +bool +hashrecheck(Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple) +{ + IndexInfo *indexInfo; + EState *estate; + ExprContext *econtext; + TupleTableSlot *slot; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + Datum values2[INDEX_MAX_KEYS]; + bool isnull2[INDEX_MAX_KEYS]; + int i; + bool equal; + int natts = indexRel->rd_rel->relnatts; + Form_pg_attribute att; + + indexInfo = BuildIndexInfo(indexRel); + + /* + * The heap tuple must be put into a slot for FormIndexDatum. + */ + slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel)); + + ExecStoreTuple(heapTuple, slot, InvalidBuffer, false); + + /* + * Typically the index won't have expressions, but if it does we need an + * EState to evaluate them. We need it for exclusion constraints too, + * even if they are just on simple columns. 
+	 */
+	if (indexInfo->ii_Expressions != NIL ||
+		indexInfo->ii_ExclusionOps != NULL)
+	{
+		estate = CreateExecutorState();
+		econtext = GetPerTupleExprContext(estate);
+		econtext->ecxt_scantuple = slot;
+	}
+	else
+		estate = NULL;
+
+	/*
+	 * Form the index values and isnull flags for the index entry that we need
+	 * to check.
+	 *
+	 * Note: if the index uses functions that are not as immutable as they are
+	 * supposed to be, this could produce an index tuple different from the
+	 * original.  The index AM can catch such errors by verifying that it
+	 * finds a matching index entry with the tuple's TID.  For exclusion
+	 * constraints we check this in check_exclusion_constraint().
+	 */
+	FormIndexDatum(indexInfo, slot, estate, values, isnull);
+
+	/*
+	 * HASH indexes compute a hash value of the key and store that in the
+	 * index.  So we must first compute the hash of the value fetched from
+	 * the heap and only then do the comparison.
+	 */
+	_hash_convert_tuple(indexRel, values, isnull, values2, isnull2);
+
+	equal = true;
+	for (i = 1; i <= natts; i++)
+	{
+		Datum		indxvalue;
+		bool		indxisnull;
+
+		indxvalue = index_getattr(indexTuple, i, indexRel->rd_att, &indxisnull);
+
+		/*
+		 * If both are NULL, then they are equal
+		 */
+		if (isnull2[i - 1] && indxisnull)
+			continue;
+
+		/*
+		 * If only one of them is NULL, then they are not equal
+		 */
+		if (isnull2[i - 1] || indxisnull)
+		{
+			equal = false;
+			break;
+		}
+
+		/*
+		 * Now do a raw memory comparison
+		 */
+		att = indexRel->rd_att->attrs[i - 1];
+		if (!datumIsEqual(values2[i - 1], indxvalue, att->attbyval,
+						  att->attlen))
+		{
+			equal = false;
+			break;
+		}
+	}
+
+	if (estate != NULL)
+		FreeExecutorState(estate);
+
+	ExecDropSingleTupleTableSlot(slot);
+
+	return equal;
+}
diff --git a/src/backend/access/heap/README.WARM b/src/backend/access/heap/README.WARM
new file mode 100644
index 0000000..7b9a712
--- /dev/null
+++ b/src/backend/access/heap/README.WARM
@@ -0,0 +1,306 @@
+src/backend/access/heap/README.WARM
+
+Write Amplification Reduction Method (WARM)
+===========================================
+
+The Heap Only Tuple (HOT) feature largely eliminated redundant index
+entries and allowed re-use of the dead space occupied by previously
+updated or deleted tuples (see src/backend/access/heap/README.HOT).
+
+One of the necessary conditions for a HOT update is that the update
+must not change a column used in any of the indexes on the table.
+That condition is sometimes hard to meet, especially for complex
+workloads with several indexes on large yet frequently updated tables.
+Worse, sometimes only one or two index columns may be updated, but a
+regular non-HOT update will still insert a new index entry in every
+index on the table, irrespective of whether the key pertaining to a
+particular index changed or not.
+
+WARM is a technique devised to address these problems.
+
+
+Update Chains With Multiple Index Entries Pointing to the Root
+--------------------------------------------------------------
+
+When a non-HOT update is caused by an index key change, a new index
+entry must be inserted for the changed index. But if the index key
+hasn't changed for the other indexes, we don't really need to insert a
+new entry in them. Even though the existing index entry is pointing to
+the old tuple, the new tuple is reachable via the t_ctid chain. To keep
+things simple, a WARM update requires that the heap block have enough
+space to store the new version of the tuple. This is the same as for
+HOT updates.
+
+In WARM, we ensure that every index entry always points to the root of
+the WARM chain. In fact, a WARM chain looks exactly like a HOT chain
+except for the fact that there could be multiple index entries pointing
+to the root of the chain. So when a new entry is inserted in an index
+for an updated tuple, and we are doing a WARM update, the new entry is
+made to point to the root of the WARM chain.
+
+For example, say we have a table with two columns and two indexes, one
+on each column. When a tuple is first inserted into the table, we have
+exactly one index entry pointing to the tuple from each index.
+
+	lp [1]
+	[1111, aaaa]
+
+	Index1's entry (1111) points to 1
+	Index2's entry (aaaa) also points to 1
+
+Now if the tuple's second column is updated and there is room on the
+page, we perform a WARM update. To do so, Index1 does not get any new
+entry, and Index2's new entry will still point to the root tuple of the
+chain.
+
+	lp [1] [2]
+	[1111, aaaa]->[1111, bbbb]
+
+	Index1's entry (1111) points to 1
+	Index2's old entry (aaaa) points to 1
+	Index2's new entry (bbbb) also points to 1
+
+"An update chain which has more than one index entry pointing to its
+root line pointer is called a WARM chain, and the action that creates a
+WARM chain is called a WARM update."
+
+Since all indexes always point to the root of the WARM chain, even when
+there is more than one index entry, WARM chains can be pruned and dead
+tuples can be removed without any need for corresponding index cleanup.
+
+While this solves the problem of pruning dead tuples from a HOT/WARM
+chain, it also opens up a new technical challenge, because now we have
+a situation where a heap tuple is reachable from multiple index
+entries, each having a different index key. While MVCC still ensures
+that only valid tuples are returned, a tuple with a wrong index key may
+be returned because of such stale index entries. In the above example,
+tuple [1111, bbbb] is reachable from both key (aaaa) and key (bbbb).
+For this reason, tuples returned from a WARM chain must always be
+rechecked for an index key match.
+
+Recheck Index Key Against Heap Tuple
+------------------------------------
+
+Since every index AM has its own notion of index tuples, each index AM
+must implement its own method to recheck heap tuples. For example, a
+hash index stores the hash value of the column, and hence the recheck
+routine for the hash AM must first compute the hash value of the heap
+attribute and then compare it against the value stored in the index
+tuple.
+
+The patch currently implements recheck routines for hash and btree
+indexes. If a table has an index whose AM doesn't provide a recheck
+routine, WARM updates are disabled on that table.
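+
+The shape of such a recheck routine is sketched below. This is a
+condensed, illustrative version of the btrecheck()/hashrecheck()
+routines added elsewhere in this patch; expression-index handling,
+slot cleanup and error paths are omitted for brevity, and for hash the
+computed values must additionally be converted to hash codes with
+_hash_convert_tuple() before the comparison:
+
+	static bool
+	recheck_sketch(Relation indexRel, IndexTuple indexTuple,
+				   Relation heapRel, HeapTuple heapTuple)
+	{
+		IndexInfo  *indexInfo = BuildIndexInfo(indexRel);
+		TupleTableSlot *slot;
+		Datum		values[INDEX_MAX_KEYS];
+		bool		isnull[INDEX_MAX_KEYS];
+		int			natts = indexRel->rd_rel->relnatts;
+		int			i;
+
+		/* Put the heap tuple into a slot so FormIndexDatum can use it */
+		slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel));
+		ExecStoreTuple(heapTuple, slot, InvalidBuffer, false);
+
+		/* Recompute what the index entry for this heap tuple would be */
+		FormIndexDatum(indexInfo, slot, NULL, values, isnull);
+
+		/* Compare it, column by column, against the actual index tuple */
+		for (i = 1; i <= natts; i++)
+		{
+			bool		indxisnull;
+			Datum		indxvalue = index_getattr(indexTuple, i,
+												  indexRel->rd_att,
+												  &indxisnull);
+			Form_pg_attribute att = indexRel->rd_att->attrs[i - 1];
+
+			if (isnull[i - 1] != indxisnull)
+				return false;	/* one NULL, one not: no match */
+			if (!indxisnull &&
+				!datumIsEqual(values[i - 1], indxvalue,
+							  att->attbyval, att->attlen))
+				return false;	/* raw datum comparison failed */
+		}
+		return true;
+	}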
+
+Problem With Duplicate (key, ctid) Index Entries
+------------------------------------------------
+
+The index-key recheck logic works only as long as there are no
+duplicate index keys pointing to the same WARM chain. Otherwise, the
+same valid tuple would be reachable via multiple index keys, each of
+them satisfying the index key check. In the above example, if the tuple
+[1111, bbbb] is again updated to [1111, aaaa] and we insert a new index
+entry (aaaa) pointing to the root line pointer, we will end up with the
+following structure:
+
+	lp [1] [2] [3]
+	[1111, aaaa]->[1111, bbbb]->[1111, aaaa]
+
+	Index1's entry (1111) points to 1
+	Index2's oldest entry (aaaa) points to 1
+	Index2's old entry (bbbb) also points to 1
+	Index2's new entry (aaaa) also points to 1
+
+We must solve this problem to ensure that the same tuple is not
+reachable via multiple index pointers. There are a couple of ways to
+address the issue:
+
+1. Do not allow a WARM update to a tuple from a WARM chain. This
+guarantees that there can never be duplicate index entries for the same
+root line pointer, because we must have checked the old and new index
+keys while doing the first WARM update.
+
+2. Do not allow duplicate (key, ctid) index pointers. In the above
+example, since (aaaa, 1) already exists in the index, we must not
+insert a duplicate index entry.
+
+The patch currently implements option 1, i.e. it does not WARM-update
+tuples that already belong to a WARM chain. HOT updates are still fine
+because they do not add new index entries.
+
+Even with this restriction, WARM is a significant improvement, because
+the number of regular UPDATEs is cut in half.
+
+Expression and Partial Indexes
+------------------------------
+
+Expressions may evaluate to the same value even if the underlying
+column values have changed. A simple example is an index on
+"lower(col)", which will return the same value if the new heap value
+differs only in case. So we cannot rely solely on the heap column check
+to decide whether or not to insert a new index entry for expression
+indexes. Similarly, for partial indexes, the predicate expression must
+be evaluated to decide whether or not to create a new index entry when
+columns referred to in the predicate change.
+
+(None of this is currently implemented; we simply disallow a WARM
+update if a column used in an expression index or in an index predicate
+has changed.)
+
+
+Efficiently Finding the Root Line Pointer
+-----------------------------------------
+
+During a WARM update, we must be able to find the root line pointer of
+the tuple being updated. Note that the t_ctid field in the heap tuple
+header is normally used to find the next tuple in the update chain, but
+the tuple that we are updating must be the last tuple in the chain, and
+in that case t_ctid usually points to the tuple itself. So in theory we
+could use t_ctid to store additional information in the last tuple of
+the update chain, as long as the fact that the tuple is the last one in
+its chain is recorded elsewhere.
+
+We now utilize another bit from t_infomask2 to explicitly identify the
+last tuple in the update chain:
+
+HEAP_LATEST_TUPLE - When this bit is set, the tuple is the last tuple
+in the update chain, and the OffsetNumber part of its t_ctid points to
+the root line pointer of the chain.
+
+If the UPDATE operation is aborted, the last tuple in the update chain
+becomes dead. The root line pointer information stored in the tuple
+which remains the last valid tuple in the chain is then lost. In such
+rare cases, the root line pointer must be found the hard way, by
+scanning the entire heap page.
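+
+In code form, and using the names introduced by this patch
+(HeapTupleHeaderHasRootOffset, HeapTupleHeaderGetRootOffset and
+heap_get_root_tuple), the lookup reduces to the following sketch; the
+wrapper function itself is hypothetical:
+
+	static OffsetNumber
+	warm_get_root_offset(Page page, HeapTupleHeader htup,
+						 OffsetNumber offnum)
+	{
+		/*
+		 * Fast path: the last tuple in the chain carries the root line
+		 * pointer in the OffsetNumber part of its t_ctid.
+		 */
+		if (HeapTupleHeaderHasRootOffset(htup))
+			return HeapTupleHeaderGetRootOffset(htup);
+
+		/*
+		 * Slow path, e.g. after an aborted update destroyed the
+		 * HEAP_LATEST_TUPLE information: scan the whole page.
+		 */
+		return heap_get_root_tuple(page, offnum);
+	}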
+
+Tracking WARM Chains
+--------------------
+
+The old tuple and every subsequent tuple in the chain are marked with
+the special HEAP_WARM_TUPLE flag. We use the last remaining bit in
+t_infomask2 to store this information.
+
+When a tuple is returned from a WARM chain, the caller must do
+additional checks to ensure that the tuple matches the index key. Even
+if the tuple precedes the WARM update in the chain, it must still be
+rechecked for an index key match (this covers the case when an old
+tuple is returned via the new index key). So we must always follow the
+update chain to the end to check whether it is a WARM chain.
+
+When the old updated tuple is retired and the root line pointer is
+converted into a redirect line pointer, we can copy the information
+about the WARM chain to the redirect line pointer by storing a special
+value in the lp_len field of the line pointer. This handles the most
+common case, where a WARM chain is replaced by a redirect line pointer
+and a single tuple in the chain.
+
+Converting WARM chains back to HOT chains (VACUUM ?)
+----------------------------------------------------
+
+The current implementation of WARM allows only one WARM update per
+chain. This simplifies the design and addresses certain issues around
+duplicate scans. But it also implies that the benefit of WARM can be no
+more than 50%. That is still significant, but if we could return WARM
+chains back to normal status, we could do far more WARM updates.
+
+A distinct property of a WARM chain is that at least one index has more
+than one live index entry pointing to the root of the chain. In other
+words, if we can remove the duplicate entry from every index, or
+conclusively prove that there are no duplicate index entries for the
+root line pointer, the chain can again be marked as HOT.
+
+Here is one idea:
+
+A WARM chain has two parts, separated by the tuple that caused the WARM
+update. All tuples in each part have matching index keys, but certain
+index keys may not match between the two parts. Let's say we mark heap
+tuples in each part with a special Red-Blue flag. The same flag is
+replicated in the index tuples. For example, when new rows are inserted
+into a table, they are marked with the Blue flag, and the index entries
+associated with those rows are also marked with the Blue flag. When a
+row is WARM updated, the new version is marked with the Red flag, and
+the new index entry created by the update is also marked with the Red
+flag.
+
+
+Heap chain: [1] [2] [3] [4]
+	[aaaa, 1111]B -> [aaaa, 1111]B -> [bbbb, 1111]R -> [bbbb, 1111]R
+
+Index1: (aaaa)B points to 1 (satisfies only tuples marked with B)
+	(bbbb)R points to 1 (satisfies only tuples marked with R)
+
+Index2: (1111)B points to 1 (satisfies both B and R tuples)
+
+
+It's clear that for indexes with both Red and Blue pointers, a heap
+tuple with the Blue flag will be reachable from the Blue pointer, and
+one with the Red flag will be reachable from the Red pointer. But for
+indexes which did not create a new entry, both Blue and Red tuples will
+be reachable from the Blue pointer (there is no Red pointer in such
+indexes). So, as a side note, matching Red and Blue flags is not enough
+from an index scan perspective.
+
+During the first heap scan of VACUUM, we look for tuples with
+HEAP_WARM_TUPLE set. If all live tuples in the chain are marked either
+with the Blue flag or with the Red flag (but not a mix of Red and
+Blue), then the chain is a candidate for HOT conversion. We remember
+the root line pointer and the Red-Blue flag of the WARM chain in a
+separate array.
+
+If we have a Red WARM chain, then our goal is to remove the Blue
+pointers, and vice versa. But there is a catch. For Index2 above, there
+is only a Blue pointer, and that must not be removed. IOW, we should
+remove a Blue pointer iff a Red pointer exists. Since index vacuum may
+visit Red and Blue pointers in any order, I think we will need another
+index pass to remove the dead index pointers. So in the first index
+pass we check which WARM candidates have two index pointers. In the
+second pass, we remove the dead pointer and reset the Red flag if the
+surviving index pointer is Red.
+
+During the second heap scan, we fix the WARM chain by clearing the
+HEAP_WARM_TUPLE flags and also resetting the Red flags to Blue.
+
+There are some more problems around aborted vacuums. For example, if
+vacuum aborts after changing a Red index flag to Blue but before
+removing the other Blue pointer, we will end up with two Blue pointers
+to a Red WARM chain. But since the HEAP_WARM_TUPLE flag on the heap
+tuple is still set, further WARM updates to the chain will be blocked.
+I guess we will need some special handling for the case of multiple
+Blue pointers. We can either leave such WARM chains alone and let them
+die with a subsequent non-WARM update, or apply the heap-recheck logic
+during index vacuum to find the dead pointer. Given that vacuum aborts
+are not common, I am inclined to leave this case unhandled. We must
+still check for the presence of multiple Blue pointers and ensure that
+we don't accidentally remove either of the Blue pointers, and that we
+don't clear such WARM chains either.
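+
+The per-chain bookkeeping that this idea needs during the first heap
+scan could look roughly like the sketch below; the struct and its
+field names are hypothetical, nothing like it exists in the patch yet:
+
+	/* One entry per WARM chain that is a candidate for HOT conversion */
+	typedef struct WarmChainCandidate
+	{
+		BlockNumber	blkno;			/* heap page of the chain */
+		OffsetNumber root_offnum;	/* root line pointer of the chain */
+		bool		chain_is_red;	/* all live tuples carry the Red flag */
+		int			nindex_ptrs;	/* index pointers counted in pass 1 */
+	} WarmChainCandidate;
+
+During the first index pass we would bump nindex_ptrs for every index
+pointer found for the chain; only candidates that end up with two
+pointers need the second index pass described above.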
+
+CREATE INDEX CONCURRENTLY
+-------------------------
+
+Currently CREATE INDEX CONCURRENTLY (CIC) is implemented as a 3-phase
+process. In the first phase, we create the catalog entry for the new
+index so that the index is visible to all other backends, but we don't
+yet use it for either reads or writes. We do, however, ensure that no
+new broken HOT chains are created by new transactions. In the second
+phase, we build the new index using an MVCC snapshot and then make the
+index available for inserts. We then do another pass over the index and
+insert any missing tuples, each time indexing only the root line
+pointer. See README.HOT for details about how HOT impacts CIC and how
+the various challenges are tackled.
+
+WARM poses another challenge because it allows the creation of HOT
+chains even when an index key has changed. Since the index is not ready
+for insertion until the second phase is over, we might end up with a
+situation where a HOT chain has tuples with different index column
+values, yet only one of those values is indexed by the new index. Note
+that during the third phase, we only index tuples whose root line
+pointer is missing from the index. But we can't easily check whether an
+existing index tuple actually indexes the heap tuple visible to the new
+MVCC snapshot. Finding that out would require querying the index again
+for every tuple in the chain, especially if it's a WARM tuple, which
+would mean repeated index accesses. Another option would be to return
+the index keys along with the heap TIDs when the index is scanned for
+collecting all indexed TIDs during the third phase. We could then
+compare the heap tuple against the already-indexed key and decide
+whether or not to index the new tuple.
+
+We solve this problem more simply by disallowing WARM updates until the
+index is ready for insertion. We don't need to disallow WARM wholesale:
+only those updates that change the columns of the new index are
+disallowed from being WARM updates.
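+
+Putting the restrictions described in this README together, the
+eligibility test added to heap_update() later in this patch boils down
+to the following (condensed from the heapam.c hunk below, which is the
+authoritative version):
+
+	/* Attempt a WARM update only when all of the following hold */
+	if (relation->rd_supportswarm &&		/* every index AM has amrecheck */
+		!bms_overlap(modified_attrs, exprindx_attrs) && /* no expression or
+														 * predicate columns */
+		!bms_is_subset(hot_attrs, modified_attrs) &&	/* some indexed column
+														 * is unchanged */
+		!IsSystemRelation(relation) &&		/* no catalogs for now */
+		!bms_overlap(notready_attrs, modified_attrs) && /* CIC: index not
+														 * ready for inserts */
+		!HeapTupleIsHeapWarmTuple(&oldtup))	/* one WARM update per chain */
+		use_warm_update = true;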
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 93cde9a..b9ff94d 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1958,6 +1958,78 @@ heap_fetch(Relation relation,
 }
 
 /*
+ * Check if the HOT chain containing this tid is actually a WARM chain.
+ * Note that even if the WARM update ultimately aborted, we must still do a
+ * recheck, because the failing UPDATE may have inserted index entries
+ * which are now stale, but still reference this chain.
+ */
+static bool
+hot_check_warm_chain(Page dp, ItemPointer tid)
+{
+	TransactionId prev_xmax = InvalidTransactionId;
+	OffsetNumber offnum;
+	HeapTupleData heapTuple;
+
+	offnum = ItemPointerGetOffsetNumber(tid);
+	heapTuple.t_self = *tid;
+	/* Scan through possible multiple members of HOT-chain */
+	for (;;)
+	{
+		ItemId		lp;
+
+		/* check for bogus TID */
+		if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
+			break;
+
+		lp = PageGetItemId(dp, offnum);
+
+		/* check for unused, dead, or redirected items */
+		if (!ItemIdIsNormal(lp))
+			break;
+
+		heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
+		ItemPointerSetOffsetNumber(&heapTuple.t_self, offnum);
+
+		/*
+		 * The xmin should match the previous xmax value, else chain is
+		 * broken.
+		 */
+		if (TransactionIdIsValid(prev_xmax) &&
+			!TransactionIdEquals(prev_xmax,
+								 HeapTupleHeaderGetXmin(heapTuple.t_data)))
+			break;
+
+
+		/*
+		 * The presence of either a WARM tuple or a WARM-updated tuple
+		 * signals possible breakage, and the caller must recheck tuples
+		 * returned from this chain for index satisfaction
+		 */
+		if (HeapTupleHeaderIsHeapWarmTuple(heapTuple.t_data))
+			return true;
+
+		/*
+		 * Check to see if HOT chain continues past this tuple; if so fetch
+		 * the next offnum and loop around.
+		 */
+		if (!HeapTupleIsHotUpdated(&heapTuple))
+			break;
+
+		/*
+		 * It can't be a HOT chain if the tuple contains a root line pointer
+		 */
+		if (HeapTupleHeaderHasRootOffset(heapTuple.t_data))
+			break;
+
+		offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid);
+		prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple.t_data);
+	}
+
+	/* All OK. No need to recheck */
+	return false;
+}
+
+/*
  * heap_hot_search_buffer	- search HOT chain for tuple satisfying snapshot
  *
  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
@@ -1977,11 +2049,14 @@ heap_fetch(Relation relation,
  * Unlike heap_fetch, the caller must already have pin and (at least) share
  * lock on the buffer; it is still pinned/locked at exit.  Also unlike
  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
+ *
+ * recheck should be set to false on entry by the caller; it will be set to
+ * true on exit if a WARM tuple is encountered.
  */
 bool
 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 					   Snapshot snapshot, HeapTuple heapTuple,
-					   bool *all_dead, bool first_call)
+					   bool *all_dead, bool first_call, bool *recheck)
 {
 	Page		dp = (Page) BufferGetPage(buffer);
 	TransactionId prev_xmax = InvalidTransactionId;
@@ -2035,9 +2110,12 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 		ItemPointerSetOffsetNumber(&heapTuple->t_self, offnum);
 
 		/*
-		 * Shouldn't see a HEAP_ONLY tuple at chain start.
+		 * Shouldn't see a HEAP_ONLY tuple at chain start, unless we are
+		 * dealing with a WARM updated tuple, in which case deferred triggers
+		 * may request to fetch a WARM tuple from the middle of a chain.
*/ - if (at_chain_start && HeapTupleIsHeapOnly(heapTuple)) + if (at_chain_start && HeapTupleIsHeapOnly(heapTuple) && + !HeapTupleIsHeapWarmTuple(heapTuple)) break; /* @@ -2050,6 +2128,16 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, break; /* + * Check if there exists a WARM tuple somewhere down the chain and set + * recheck to TRUE. + * + * XXX This is not very efficient right now, and we should look for + * possible improvements here + */ + if (recheck && *recheck == false) + *recheck = hot_check_warm_chain(dp, &heapTuple->t_self); + + /* * When first_call is true (and thus, skip is initially false) we'll * return the first tuple we find. But on later passes, heapTuple * will initially be pointing to the tuple we returned last time. @@ -2098,7 +2186,8 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, * Check to see if HOT chain continues past this tuple; if so fetch * the next offnum and loop around. */ - if (HeapTupleIsHotUpdated(heapTuple)) + if (HeapTupleIsHotUpdated(heapTuple) && + !HeapTupleHeaderHasRootOffset(heapTuple->t_data)) { Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) == ItemPointerGetBlockNumber(tid)); @@ -2122,18 +2211,41 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, */ bool heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, - bool *all_dead) + bool *all_dead, bool *recheck, Buffer *cbuffer, + HeapTuple heapTuple) { bool result; Buffer buffer; - HeapTupleData heapTuple; + ItemPointerData ret_tid = *tid; buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); LockBuffer(buffer, BUFFER_LOCK_SHARE); - result = heap_hot_search_buffer(tid, relation, buffer, snapshot, - &heapTuple, all_dead, true); - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buffer); + result = heap_hot_search_buffer(&ret_tid, relation, buffer, snapshot, + heapTuple, all_dead, true, recheck); + + /* + * If we are returning a potential candidate tuple from this chain and the + * caller has requested for "recheck" hint, keep the buffer locked and + * pinned. 
The caller must release the lock and pin on the buffer in all + * such cases + */ + if (!result || !recheck || !(*recheck)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } + + /* + * Set the caller supplied tid with the actual location of the tuple being + * returned + */ + if (result) + { + *tid = ret_tid; + if (cbuffer) + *cbuffer = buffer; + } + return result; } @@ -3492,15 +3604,18 @@ simple_heap_delete(Relation relation, ItemPointer tid) HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd, LockTupleMode *lockmode) + HeapUpdateFailureData *hufd, LockTupleMode *lockmode, + Bitmapset **modified_attrsp, bool *warm_update) { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); Bitmapset *hot_attrs; Bitmapset *key_attrs; Bitmapset *id_attrs; + Bitmapset *exprindx_attrs; Bitmapset *interesting_attrs; Bitmapset *modified_attrs; + Bitmapset *notready_attrs; ItemId lp; HeapTupleData oldtup; HeapTuple heaptup; @@ -3521,6 +3636,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, bool have_tuple_lock = false; bool iscombo; bool use_hot_update = false; + bool use_warm_update = false; bool key_intact; bool all_visible_cleared = false; bool all_visible_cleared_new = false; @@ -3545,6 +3661,10 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("cannot update tuples during a parallel operation"))); + /* Assume no-warm update */ + if (warm_update) + *warm_update = false; + /* * Fetch the list of attributes to be checked for various operations. * @@ -3566,10 +3686,17 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY); id_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_IDENTITY_KEY); + exprindx_attrs = RelationGetIndexAttrBitmap(relation, + INDEX_ATTR_BITMAP_EXPR_PREDICATE); + notready_attrs = RelationGetIndexAttrBitmap(relation, + INDEX_ATTR_BITMAP_NOTREADY); + + interesting_attrs = bms_add_members(NULL, hot_attrs); interesting_attrs = bms_add_members(interesting_attrs, key_attrs); interesting_attrs = bms_add_members(interesting_attrs, id_attrs); - + interesting_attrs = bms_add_members(interesting_attrs, exprindx_attrs); + interesting_attrs = bms_add_members(interesting_attrs, notready_attrs); block = ItemPointerGetBlockNumber(otid); offnum = ItemPointerGetOffsetNumber(otid); @@ -3621,6 +3748,9 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs, &oldtup, newtup); + if (modified_attrsp) + *modified_attrsp = bms_copy(modified_attrs); + /* * If we're not updating any "key" column, we can grab a weaker lock type. * This allows for more concurrency when we are running simultaneously @@ -3876,6 +4006,7 @@ l2: bms_free(hot_attrs); bms_free(key_attrs); bms_free(id_attrs); + bms_free(exprindx_attrs); bms_free(modified_attrs); bms_free(interesting_attrs); return result; @@ -4194,6 +4325,37 @@ l2: */ if (!bms_overlap(modified_attrs, hot_attrs)) use_hot_update = true; + else + { + /* + * If no WARM updates yet on this chain, let this update be a WARM + * update. + * + * We check for both warm and warm updated tuples since if the + * previous WARM update aborted, we may still have added + * another index entry for this HOT chain. 
In such situations, we
+			 * must not attempt a WARM update until the duplicate (key, CTID)
+			 * index entry issue is sorted out.
+			 *
+			 * XXX Later we'll add more checks to ensure WARM chains can
+			 * further be WARM updated.  This is probably good to do after a
+			 * first round of tests of the remaining functionality.
+			 *
+			 * XXX Disable WARM updates on system tables.  There is nothing
+			 * in principle that stops us from supporting this.  But it would
+			 * require an API change to propagate the changed columns back to
+			 * the caller so that CatalogUpdateIndexes() can avoid adding new
+			 * entries to indexes that are not changed by the update.  This
+			 * will be fixed once the basic patch is tested. !!FIXME
+			 */
+			if (relation->rd_supportswarm &&
+				!bms_overlap(modified_attrs, exprindx_attrs) &&
+				!bms_is_subset(hot_attrs, modified_attrs) &&
+				!IsSystemRelation(relation) &&
+				!bms_overlap(notready_attrs, modified_attrs) &&
+				!HeapTupleIsHeapWarmTuple(&oldtup))
+				use_warm_update = true;
+		}
 	}
 	else
 	{
@@ -4240,6 +4402,22 @@ l2:
 		HeapTupleSetHeapOnly(heaptup);
 		/* Mark the caller's copy too, in case different from heaptup */
 		HeapTupleSetHeapOnly(newtup);
+
+		/*
+		 * Even if we are doing a HOT update, we must carry forward the WARM
+		 * flag, because we may have already inserted another index entry
+		 * pointing to our root, and a third entry could create duplicates.
+		 *
+		 * Note: If we ever have a mechanism to avoid duplicates in
+		 * indexes, we could look at relaxing this restriction and allow
+		 * even more WARM updates.
+		 */
+		if (HeapTupleIsHeapWarmTuple(&oldtup))
+		{
+			HeapTupleSetHeapWarmTuple(heaptup);
+			HeapTupleSetHeapWarmTuple(newtup);
+		}
+
 		/*
 		 * For HOT (or WARM) updated tuples, we store the offset of the root
 		 * line pointer of this chain in the ip_posid field of the new tuple.
@@ -4252,12 +4430,35 @@ l2:
 		if (HeapTupleHeaderHasRootOffset(oldtup.t_data))
 			root_offnum = HeapTupleHeaderGetRootOffset(oldtup.t_data);
 	}
+	else if (use_warm_update)
+	{
+		/* Mark the old tuple as HOT-updated */
+		HeapTupleSetHotUpdated(&oldtup);
+		HeapTupleSetHeapWarmTuple(&oldtup);
+		/* And mark the new tuple as heap-only */
+		HeapTupleSetHeapOnly(heaptup);
+		HeapTupleSetHeapWarmTuple(heaptup);
+		/* Mark the caller's copy too, in case different from heaptup */
+		HeapTupleSetHeapOnly(newtup);
+		HeapTupleSetHeapWarmTuple(newtup);
+		if (HeapTupleHeaderHasRootOffset(oldtup.t_data))
+			root_offnum = HeapTupleHeaderGetRootOffset(oldtup.t_data);
+		else
+			root_offnum = heap_get_root_tuple(page,
+							ItemPointerGetOffsetNumber(&(oldtup.t_self)));
+
+		/* Let the caller know we did a WARM update */
+		if (warm_update)
+			*warm_update = true;
+	}
 	else
 	{
 		/* Make sure tuples are correctly marked as not-HOT */
 		HeapTupleClearHotUpdated(&oldtup);
 		HeapTupleClearHeapOnly(heaptup);
 		HeapTupleClearHeapOnly(newtup);
+		HeapTupleClearHeapWarmTuple(heaptup);
+		HeapTupleClearHeapWarmTuple(newtup);
 		root_offnum = InvalidOffsetNumber;
 	}
 
@@ -4367,7 +4568,10 @@ l2:
 	if (have_tuple_lock)
 		UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
 
-	pgstat_count_heap_update(relation, use_hot_update);
+	/*
+	 * Count HOT and WARM updates separately
+	 */
+	pgstat_count_heap_update(relation, use_hot_update, use_warm_update);
 
 	/*
 	 * If heaptup is a private copy, release it.  Don't forget to copy t_self
@@ -4507,7 +4711,8 @@ HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
 * via ereport().
 */
 void
-simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
+simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup,
+				   Bitmapset **modified_attrs, bool *warm_update)
 {
 	HTSU_Result result;
 	HeapUpdateFailureData hufd;
@@ -4516,7 +4721,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
 	result = heap_update(relation, otid, tup,
 						 GetCurrentCommandId(true), InvalidSnapshot,
 						 true /* wait for commit */ ,
-						 &hufd, &lockmode);
+						 &hufd, &lockmode, modified_attrs, warm_update);
 	switch (result)
 	{
 		case HeapTupleSelfUpdated:
@@ -7568,6 +7773,7 @@ log_heap_update(Relation reln, Buffer oldbuf,
 	bool		need_tuple_data = RelationIsLogicallyLogged(reln);
 	bool		init;
 	int			bufflags;
+	bool		warm_update = false;
 
 	/* Caller should not call me on a non-WAL-logged relation */
 	Assert(RelationNeedsWAL(reln));
@@ -7579,6 +7785,9 @@ log_heap_update(Relation reln, Buffer oldbuf,
 	else
 		info = XLOG_HEAP_UPDATE;
 
+	if (HeapTupleIsHeapWarmTuple(newtup))
+		warm_update = true;
+
 	/*
 	 * If the old and new tuple are on the same page, we only need to log the
 	 * parts of the new tuple that were changed.  That saves on the amount of
@@ -7652,6 +7861,8 @@ log_heap_update(Relation reln, Buffer oldbuf,
 			xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
 		}
 	}
+	if (warm_update)
+		xlrec.flags |= XLH_UPDATE_WARM_UPDATE;
 
 	/* If new tuple is the single and first tuple on page... */
 	if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
@@ -8629,16 +8840,22 @@ heap_xlog_update(XLogReaderState *record, bool hot_update)
 	Size		freespace = 0;
 	XLogRedoAction oldaction;
 	XLogRedoAction newaction;
+	bool		warm_update = false;
 
 	/* initialize to keep the compiler quiet */
 	oldtup.t_data = NULL;
 	oldtup.t_len = 0;
 
+	if (xlrec->flags & XLH_UPDATE_WARM_UPDATE)
+		warm_update = true;
+
 	XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk);
 	if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk))
 	{
 		/* HOT updates are never done across pages */
 		Assert(!hot_update);
+		/* WARM updates are never done across pages */
+		Assert(!warm_update);
 	}
 	else
 		oldblk = newblk;
@@ -8698,6 +8915,11 @@ heap_xlog_update(XLogReaderState *record, bool hot_update)
 						   &htup->t_infomask2);
 	HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
 	HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
+
+	/* Mark the old tuple as a WARM tuple */
+	if (warm_update)
+		HeapTupleHeaderSetHeapWarmTuple(htup);
+
 	/* Set forward chain link in t_ctid */
 	HeapTupleHeaderSetNextTid(htup, &newtid);
 
@@ -8833,6 +9055,10 @@ heap_xlog_update(XLogReaderState *record, bool hot_update)
 		HeapTupleHeaderSetCmin(htup, FirstCommandId);
 		HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
 
+		/* Mark the new tuple as a WARM tuple */
+		if (warm_update)
+			HeapTupleHeaderSetHeapWarmTuple(htup);
+
 		offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
 		if (offnum == InvalidOffsetNumber)
 			elog(PANIC, "failed to add tuple");
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index f54337c..4e8ed79 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -834,6 +834,13 @@ heap_get_root_tuples_internal(Page page, OffsetNumber target_offnum,
 		if (!HeapTupleHeaderIsHotUpdated(htup))
 			continue;
 
+		/*
+		 * If the tuple has a root line pointer, it must be the end of the
+		 * chain
+		 */
+		if (HeapTupleHeaderHasRootOffset(htup))
+			break;
+
 		/* Set up to scan the HOT-chain */
 		nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
 		priorXmax = HeapTupleHeaderGetUpdateXid(htup);
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index cc5ac8b..da6c252 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -75,10 +75,12 @@
 #include "access/xlog.h"
 #include "catalog/catalog.h"
 #include "catalog/index.h"
+#include "executor/executor.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "utils/datum.h"
 #include "utils/snapmgr.h"
 #include "utils/tqual.h"
 
@@ -234,6 +236,21 @@ index_beginscan(Relation heapRelation,
 	scan->heapRelation = heapRelation;
 	scan->xs_snapshot = snapshot;
 
+	/*
+	 * If the index supports recheck, make sure that the index tuple is saved
+	 * during index scans.
+	 *
+	 * XXX Ideally, we should look at all indexes on the table and check if
+	 * WARM is at all supported on the base table.  If WARM is not supported,
+	 * then we don't need to do any recheck.  RelationGetIndexAttrBitmap()
+	 * does do that and sets rd_supportswarm after looking at all indexes.
+	 * But we don't know if that function was called earlier in the session
+	 * when we're here.  We can't call it now because of the risk of causing
+	 * a deadlock.
+	 */
+	if (indexRelation->rd_amroutine->amrecheck)
+		scan->xs_want_itup = true;
+
 	return scan;
 }
 
@@ -535,8 +552,8 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 	/*
 	 * The AM's amgettuple proc finds the next index entry matching the scan
	 * keys, and puts the TID into scan->xs_ctup.t_self.  It should also set
-	 * scan->xs_recheck and possibly scan->xs_itup/scan->xs_hitup, though we
-	 * pay no attention to those fields here.
+	 * scan->xs_tuple_recheck and possibly scan->xs_itup/scan->xs_hitup,
+	 * though we pay no attention to those fields here.
 	 */
 	found = scan->indexRelation->rd_amroutine->amgettuple(scan, direction);
 
@@ -574,7 +591,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan
 * call).
 *
- * Note: caller must check scan->xs_recheck, and perform rechecking of the
+ * Note: caller must check scan->xs_tuple_recheck, and perform rechecking of the
 * scan keys if required.  We do not do that here because we don't have
 * enough information to do it efficiently in the general case.
 * ----------------
@@ -601,6 +618,12 @@ index_fetch_heap(IndexScanDesc scan)
 		 */
 		if (prev_buf != scan->xs_cbuf)
 			heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf);
+
+		/*
+		 * If we're not always re-checking, reset recheck for this tuple.
+		 * Otherwise we must recheck every tuple.
+		 */
+		scan->xs_tuple_recheck = scan->xs_recheck;
 	}
 
 	/* Obtain share-lock on the buffer so we can examine visibility */
@@ -610,32 +633,64 @@ index_fetch_heap(IndexScanDesc scan)
 											 scan->xs_snapshot,
 											 &scan->xs_ctup,
 											 &all_dead,
-											 !scan->xs_continue_hot);
+											 !scan->xs_continue_hot,
+											 &scan->xs_tuple_recheck);
 	LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
 
 	if (got_heap_tuple)
 	{
+		bool		res = true;
+
+		/*
+		 * OK, we got a tuple which satisfies the snapshot, but if it's part
+		 * of a WARM chain, we must do additional checks to ensure that we
+		 * are indeed returning a correct tuple.  Note that if the index AM
+		 * does not implement the amrecheck method, we don't do any
+		 * additional checks, since WARM must have been disabled on such
+		 * tables.
+		 *
+		 * XXX What happens when a new index which does not support amrecheck
+		 * is added to the table?  Do we need to handle this case, or are
+		 * CREATE INDEX and CREATE INDEX CONCURRENTLY smart enough to handle
+		 * this issue?
+		 */
+		if (scan->xs_tuple_recheck &&
+			scan->xs_itup &&
+			scan->indexRelation->rd_amroutine->amrecheck)
+		{
+			LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
+			res = scan->indexRelation->rd_amroutine->amrecheck(
+					scan->indexRelation,
+					scan->xs_itup,
+					scan->heapRelation,
+					&scan->xs_ctup);
+			LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
+		}
+
 		/*
 		 * Only in a non-MVCC snapshot can more than one member of the HOT
 		 * chain be visible.
 		 */
 		scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot);
 		pgstat_count_heap_fetch(scan->indexRelation);
-		return &scan->xs_ctup;
+
+		if (res)
+			return &scan->xs_ctup;
 	}
+	else
+	{
+		/* We've reached the end of the HOT chain. */
+		scan->xs_continue_hot = false;
 
-	/* We've reached the end of the HOT chain. */
-	scan->xs_continue_hot = false;
-
-	/*
-	 * If we scanned a whole HOT chain and found only dead tuples, tell index
-	 * AM to kill its entry for that TID (this will take effect in the next
-	 * amgettuple call, in index_getnext_tid).  We do not do this when in
-	 * recovery because it may violate MVCC to do so.  See comments in
-	 * RelationGetIndexScan().
-	 */
-	if (!scan->xactStartedInRecovery)
-		scan->kill_prior_tuple = all_dead;
+		/*
+		 * If we scanned a whole HOT chain and found only dead tuples, tell
+		 * index AM to kill its entry for that TID (this will take effect in
+		 * the next amgettuple call, in index_getnext_tid).  We do not do
+		 * this when in recovery because it may violate MVCC to do so.  See
+		 * comments in RelationGetIndexScan().
+		 */
+		if (!scan->xactStartedInRecovery)
+			scan->kill_prior_tuple = all_dead;
+	}
 
 	return NULL;
 }
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 6dca810..b5cb619 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -20,11 +20,14 @@
 #include "access/nbtxlog.h"
 #include "access/transam.h"
 #include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "executor/executor.h"
 #include "miscadmin.h"
+#include "nodes/execnodes.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
 #include "utils/tqual.h"
-
+#include "utils/datum.h"
 
 typedef struct
 {
@@ -250,6 +253,9 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
 	BTPageOpaque opaque;
 	Buffer		nbuf = InvalidBuffer;
 	bool		found = false;
+	Buffer		buffer;
+	HeapTupleData heapTuple;
+	bool		recheck = false;
 
 	/* Assume unique until we find a duplicate */
 	*is_unique = true;
@@ -309,6 +315,8 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
 				curitup = (IndexTuple) PageGetItem(page, curitemid);
 				htid = curitup->t_tid;
 
+				recheck = false;
+
 				/*
 				 * If we are doing a recheck, we expect to find the tuple we
 				 * are rechecking.  It's not a duplicate, but we have to keep
 				 * have just a single index entry for the entire chain.
 				 */
 				else if (heap_hot_search(&htid, heapRel, &SnapshotDirty,
-										 &all_dead))
+										 &all_dead, &recheck, &buffer,
+										 &heapTuple))
 				{
 					TransactionId xwait;
+					bool		result = true;
 
 					/*
-					 * It is a duplicate. If we are only doing a partial
-					 * check, then don't bother checking if the tuple is being
-					 * updated in another transaction. Just return the fact
-					 * that it is a potential conflict and leave the full
-					 * check till later.
+					 * If the tuple was WARM updated, we may again see our own
+					 * tuple.
Since WARM updates don't create new index + * entries, our own tuple is only reachable via the old + * index pointer */ - if (checkUnique == UNIQUE_CHECK_PARTIAL) + if (checkUnique == UNIQUE_CHECK_EXISTING && + ItemPointerCompare(&htid, &itup->t_tid) == 0) { - if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf); - *is_unique = false; - return InvalidTransactionId; + found = true; + result = false; + if (recheck) + UnlockReleaseBuffer(buffer); + } + else if (recheck) + { + result = btrecheck(rel, curitup, heapRel, &heapTuple); + UnlockReleaseBuffer(buffer); } - /* - * If this tuple is being updated by other transaction - * then we have to wait for its commit/abort. - */ - xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? - SnapshotDirty.xmin : SnapshotDirty.xmax; - - if (TransactionIdIsValid(xwait)) - { - if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf); - /* Tell _bt_doinsert to wait... */ - *speculativeToken = SnapshotDirty.speculativeToken; - return xwait; - } - - /* - * Otherwise we have a definite conflict. But before - * complaining, look to see if the tuple we want to insert - * is itself now committed dead --- if so, don't complain. - * This is a waste of time in normal scenarios but we must - * do it to support CREATE INDEX CONCURRENTLY. - * - * We must follow HOT-chains here because during - * concurrent index build, we insert the root TID though - * the actual tuple may be somewhere in the HOT-chain. - * While following the chain we might not stop at the - * exact tuple which triggered the insert, but that's OK - * because if we find a live tuple anywhere in this chain, - * we have a unique key conflict. The other live tuple is - * not part of this chain because it had a different index - * entry. - */ - htid = itup->t_tid; - if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL)) - { - /* Normal case --- it's still live */ - } - else + if (result) { /* - * It's been deleted, so no error, and no need to - * continue searching + * It is a duplicate. If we are only doing a partial + * check, then don't bother checking if the tuple is being + * updated in another transaction. Just return the fact + * that it is a potential conflict and leave the full + * check till later. */ - break; - } + if (checkUnique == UNIQUE_CHECK_PARTIAL) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + *is_unique = false; + return InvalidTransactionId; + } - /* - * Check for a conflict-in as we would if we were going to - * write to this page. We aren't actually going to write, - * but we want a chance to report SSI conflicts that would - * otherwise be masked by this unique constraint - * violation. - */ - CheckForSerializableConflictIn(rel, NULL, buf); + /* + * If this tuple is being updated by other transaction + * then we have to wait for its commit/abort. + */ + xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? + SnapshotDirty.xmin : SnapshotDirty.xmax; - /* - * This is a definite conflict. Break the tuple down into - * datums and report the error. But first, make sure we - * release the buffer locks we're holding --- - * BuildIndexValueDescription could make catalog accesses, - * which in the worst case might touch this same index and - * cause deadlocks. - */ - if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf); - _bt_relbuf(rel, buf); + if (TransactionIdIsValid(xwait)) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + /* Tell _bt_doinsert to wait... 
*/ + *speculativeToken = SnapshotDirty.speculativeToken; + return xwait; + } - { - Datum values[INDEX_MAX_KEYS]; - bool isnull[INDEX_MAX_KEYS]; - char *key_desc; + /* + * Otherwise we have a definite conflict. But before + * complaining, look to see if the tuple we want to insert + * is itself now committed dead --- if so, don't complain. + * This is a waste of time in normal scenarios but we must + * do it to support CREATE INDEX CONCURRENTLY. + * + * We must follow HOT-chains here because during + * concurrent index build, we insert the root TID though + * the actual tuple may be somewhere in the HOT-chain. + * While following the chain we might not stop at the + * exact tuple which triggered the insert, but that's OK + * because if we find a live tuple anywhere in this chain, + * we have a unique key conflict. The other live tuple is + * not part of this chain because it had a different index + * entry. + */ + recheck = false; + ItemPointerCopy(&itup->t_tid, &htid); + if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL, + &recheck, &buffer, &heapTuple)) + { + bool result = true; + if (recheck) + { + /* + * Recheck if the tuple actually satisfies the + * index key. Otherwise, we might be following + * a wrong index pointer and mustn't entertain + * this tuple + */ + result = btrecheck(rel, itup, heapRel, &heapTuple); + UnlockReleaseBuffer(buffer); + } + if (!result) + break; + /* Normal case --- it's still live */ + } + else + { + /* + * It's been deleted, so no error, and no need to + * continue searching + */ + break; + } - index_deform_tuple(itup, RelationGetDescr(rel), - values, isnull); + /* + * Check for a conflict-in as we would if we were going to + * write to this page. We aren't actually going to write, + * but we want a chance to report SSI conflicts that would + * otherwise be masked by this unique constraint + * violation. + */ + CheckForSerializableConflictIn(rel, NULL, buf); - key_desc = BuildIndexValueDescription(rel, values, - isnull); + /* + * This is a definite conflict. Break the tuple down into + * datums and report the error. But first, make sure we + * release the buffer locks we're holding --- + * BuildIndexValueDescription could make catalog accesses, + * which in the worst case might touch this same index and + * cause deadlocks. + */ + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + _bt_relbuf(rel, buf); - ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("duplicate key value violates unique constraint \"%s\"", - RelationGetRelationName(rel)), - key_desc ? errdetail("Key %s already exists.", - key_desc) : 0, - errtableconstraint(heapRel, - RelationGetRelationName(rel)))); + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + index_deform_tuple(itup, RelationGetDescr(rel), + values, isnull); + + key_desc = BuildIndexValueDescription(rel, values, + isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("duplicate key value violates unique constraint \"%s\"", + RelationGetRelationName(rel)), + key_desc ? 
errdetail("Key %s already exists.", + key_desc) : 0, + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + } } } else if (all_dead) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 775f2ff..952ed8f 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -23,6 +23,7 @@ #include "access/xlog.h" #include "catalog/index.h" #include "commands/vacuum.h" +#include "executor/nodeIndexscan.h" #include "pgstat.h" #include "storage/condition_variable.h" #include "storage/indexfsm.h" @@ -163,6 +164,7 @@ bthandler(PG_FUNCTION_ARGS) amroutine->amestimateparallelscan = btestimateparallelscan; amroutine->aminitparallelscan = btinitparallelscan; amroutine->amparallelrescan = btparallelrescan; + amroutine->amrecheck = btrecheck; PG_RETURN_POINTER(amroutine); } @@ -344,8 +346,9 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) BTScanOpaque so = (BTScanOpaque) scan->opaque; bool res; - /* btree indexes are never lossy */ + /* btree indexes are never lossy, except for WARM tuples */ scan->xs_recheck = false; + scan->xs_tuple_recheck = false; /* * If we have any array keys, initialize them during first call for a diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 5b259a3..c376c1b 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -20,11 +20,15 @@ #include "access/nbtree.h" #include "access/reloptions.h" #include "access/relscan.h" +#include "catalog/index.h" +#include "executor/executor.h" #include "miscadmin.h" +#include "nodes/execnodes.h" #include "utils/array.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/datum.h" typedef struct BTSortArrayContext @@ -2069,3 +2073,103 @@ btproperty(Oid index_oid, int attno, return false; /* punt to generic code */ } } + +/* + * Check if the index tuple's key matches the one computed from the given heap + * tuple's attribute + */ +bool +btrecheck(Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple) +{ + IndexInfo *indexInfo; + EState *estate; + ExprContext *econtext; + TupleTableSlot *slot; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + int i; + bool equal; + int natts = indexRel->rd_rel->relnatts; + Form_pg_attribute att; + + /* Get IndexInfo for this index */ + indexInfo = BuildIndexInfo(indexRel); + + /* + * The heap tuple must be put into a slot for FormIndexDatum. + */ + slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel)); + + ExecStoreTuple(heapTuple, slot, InvalidBuffer, false); + + /* + * Typically the index won't have expressions, but if it does we need an + * EState to evaluate them. We need it for exclusion constraints too, + * even if they are just on simple columns. + */ + if (indexInfo->ii_Expressions != NIL || + indexInfo->ii_ExclusionOps != NULL) + { + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = slot; + } + else + estate = NULL; + + /* + * Form the index values and isnull flags for the index entry that we need + * to check. + * + * Note: if the index uses functions that are not as immutable as they are + * supposed to be, this could produce an index tuple different from the + * original. The index AM can catch such errors by verifying that it + * finds a matching index entry with the tuple's TID. For exclusion + * constraints we check this in check_exclusion_constraint(). 
+ */ + FormIndexDatum(indexInfo, slot, estate, values, isnull); + + equal = true; + for (i = 1; i <= natts; i++) + { + Datum indxvalue; + bool indxisnull; + + indxvalue = index_getattr(indexTuple, i, indexRel->rd_att, &indxisnull); + + /* + * If both are NULL, then they are equal + */ + if (isnull[i - 1] && indxisnull) + continue; + + /* + * If just one is NULL, then they are not equal + */ + if (isnull[i - 1] || indxisnull) + { + equal = false; + break; + } + + /* + * Now just do a raw memory comparison. If the index tuple was formed + * using this heap tuple, the computed index values must match + */ + att = indexRel->rd_att->attrs[i - 1]; + if (!datumIsEqual(values[i - 1], indxvalue, att->attbyval, + att->attlen)) + { + equal = false; + break; + } + } + + if (estate != NULL) + FreeExecutorState(estate); + + ExecDropSingleTupleTableSlot(slot); + + return equal; +} diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index e57ac49..59ef7f3 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -72,6 +72,7 @@ spghandler(PG_FUNCTION_ARGS) amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 8d42a34..049eb28 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -54,6 +54,7 @@ #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" +#include "optimizer/var.h" #include "parser/parser.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" @@ -1691,6 +1692,20 @@ BuildIndexInfo(Relation index) ii->ii_AmCache = NULL; ii->ii_Context = CurrentMemoryContext; + /* build a bitmap of all table attributes referred by this index */ + for (i = 0; i < ii->ii_NumIndexAttrs; i++) + { + AttrNumber attr = ii->ii_KeyAttrNumbers[i]; + ii->ii_indxattrs = bms_add_member(ii->ii_indxattrs, attr - + FirstLowInvalidHeapAttributeNumber); + } + + /* Collect all attributes used in expressions, too */ + pull_varattnos((Node *) ii->ii_Expressions, 1, &ii->ii_indxattrs); + + /* Collect all attributes in the index predicate, too */ + pull_varattnos((Node *) ii->ii_Predicate, 1, &ii->ii_indxattrs); + return ii; } diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index abc344a..970254f 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -66,10 +66,15 @@ CatalogCloseIndexes(CatalogIndexState indstate) * * This should be called for each inserted or updated catalog tuple. * + * If the tuple was WARM updated, the modified_attrs contains the list of + * columns updated by the update. We must not insert new index entries for + * indexes which do not refer to any of the modified columns. + * * This is effectively a cut-down version of ExecInsertIndexTuples. 
*/ static void -CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) +CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple, + Bitmapset *modified_attrs, bool warm_update) { int i; int numIndexes; @@ -79,12 +84,28 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) IndexInfo **indexInfoArray; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; + ItemPointerData root_tid; - /* HOT update does not require index inserts */ - if (HeapTupleIsHeapOnly(heapTuple)) + /* + * A HOT update does not require index inserts, but a WARM update may + * still require inserts into some of the indexes. + */ + if (HeapTupleIsHeapOnly(heapTuple) && !warm_update) return; /* + * If we've done a WARM update, then we must index the TID of the root line + * pointer and not the actual TID of the new tuple. + */ + if (warm_update) + ItemPointerSet(&root_tid, + ItemPointerGetBlockNumber(&(heapTuple->t_self)), + HeapTupleHeaderGetRootOffset(heapTuple->t_data)); + else + ItemPointerCopy(&heapTuple->t_self, &root_tid); + + + /* * Get information from the state structure. Fall out if nothing to do. */ numIndexes = indstate->ri_NumIndices; @@ -112,6 +133,17 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) continue; /* + * If we've done a WARM update, then we must not insert a new index tuple + * if none of the index keys have changed. This is not just an + * optimization, but a requirement for WARM to work correctly. + */ + if (warm_update) + { + if (!bms_overlap(modified_attrs, indexInfo->ii_indxattrs)) + continue; + } + + /* * Expressional and partial indexes on system catalogs are not * supported, nor exclusion constraints, nor deferred uniqueness */ @@ -136,7 +168,7 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) index_insert(relationDescs[i], /* index relation */ values, /* array of index Datums */ isnull, /* is-null flags */ - &(heapTuple->t_self), /* tid of heap tuple */ + &root_tid, heapRelation, relationDescs[i]->rd_index->indisunique ?
UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, @@ -168,7 +200,7 @@ CatalogTupleInsert(Relation heapRel, HeapTuple tup) oid = simple_heap_insert(heapRel, tup); - CatalogIndexInsert(indstate, tup); + CatalogIndexInsert(indstate, tup, NULL, false); CatalogCloseIndexes(indstate); return oid; @@ -190,7 +222,7 @@ CatalogTupleInsertWithInfo(Relation heapRel, HeapTuple tup, oid = simple_heap_insert(heapRel, tup); - CatalogIndexInsert(indstate, tup); + CatalogIndexInsert(indstate, tup, NULL, false); return oid; } @@ -210,12 +242,14 @@ void CatalogTupleUpdate(Relation heapRel, ItemPointer otid, HeapTuple tup) { CatalogIndexState indstate; + bool warm_update; + Bitmapset *modified_attrs; indstate = CatalogOpenIndexes(heapRel); - simple_heap_update(heapRel, otid, tup); + simple_heap_update(heapRel, otid, tup, &modified_attrs, &warm_update); - CatalogIndexInsert(indstate, tup); + CatalogIndexInsert(indstate, tup, modified_attrs, warm_update); CatalogCloseIndexes(indstate); } @@ -231,9 +265,12 @@ void CatalogTupleUpdateWithInfo(Relation heapRel, ItemPointer otid, HeapTuple tup, CatalogIndexState indstate) { - simple_heap_update(heapRel, otid, tup); + Bitmapset *modified_attrs; + bool warm_update; - CatalogIndexInsert(indstate, tup); + simple_heap_update(heapRel, otid, tup, &modified_attrs, &warm_update); + + CatalogIndexInsert(indstate, tup, modified_attrs, warm_update); } /* diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index ba980de..410ccd3 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -498,6 +498,7 @@ CREATE VIEW pg_stat_all_tables AS pg_stat_get_tuples_updated(C.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(C.oid) AS n_tup_del, pg_stat_get_tuples_hot_updated(C.oid) AS n_tup_hot_upd, + pg_stat_get_tuples_warm_updated(C.oid) AS n_tup_warm_upd, pg_stat_get_live_tuples(C.oid) AS n_live_tup, pg_stat_get_dead_tuples(C.oid) AS n_dead_tup, pg_stat_get_mod_since_analyze(C.oid) AS n_mod_since_analyze, @@ -528,7 +529,8 @@ CREATE VIEW pg_stat_xact_all_tables AS pg_stat_get_xact_tuples_inserted(C.oid) AS n_tup_ins, pg_stat_get_xact_tuples_updated(C.oid) AS n_tup_upd, pg_stat_get_xact_tuples_deleted(C.oid) AS n_tup_del, - pg_stat_get_xact_tuples_hot_updated(C.oid) AS n_tup_hot_upd + pg_stat_get_xact_tuples_hot_updated(C.oid) AS n_tup_hot_upd, + pg_stat_get_xact_tuples_warm_updated(C.oid) AS n_tup_warm_upd FROM pg_class C LEFT JOIN pg_index I ON C.oid = I.indrelid LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index e2544e5..d9c0fe7 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -40,6 +40,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) TriggerData *trigdata = castNode(TriggerData, fcinfo->context); const char *funcname = "unique_key_recheck"; HeapTuple new_row; + HeapTupleData heapTuple; ItemPointerData tmptid; Relation indexRel; IndexInfo *indexInfo; @@ -102,7 +103,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) * removed. */ tmptid = new_row->t_self; - if (!heap_hot_search(&tmptid, trigdata->tg_relation, SnapshotSelf, NULL)) + if (!heap_hot_search(&tmptid, trigdata->tg_relation, SnapshotSelf, NULL, + NULL, NULL, &heapTuple)) { /* * All rows in the HOT chain are dead, so skip the check.
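The WARM-aware catalog update sequence, condensed into one place for illustration. This is a sketch only, not part of the patch; it mirrors CatalogTupleUpdate above and assumes the caller has built "newtup" and located "otid":

/*
 * Illustration only: simple_heap_update reports which columns changed and
 * whether the update went WARM; CatalogIndexInsert then skips indexes whose
 * columns do not overlap modified_attrs and points new entries at the root
 * line pointer of the chain instead of at the new tuple.
 */
static void
catalog_update_example(Relation rel, ItemPointer otid, HeapTuple newtup)
{
	CatalogIndexState indstate;
	Bitmapset  *modified_attrs;
	bool		warm_update;

	indstate = CatalogOpenIndexes(rel);

	simple_heap_update(rel, otid, newtup, &modified_attrs, &warm_update);

	CatalogIndexInsert(indstate, newtup, modified_attrs, warm_update);

	CatalogCloseIndexes(indstate);
}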
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 3102ab1..428fc65 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2681,6 +2681,8 @@ CopyFrom(CopyState cstate) if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), + NULL, estate, false, NULL, @@ -2835,6 +2837,7 @@ CopyFromInsertBatch(CopyState cstate, EState *estate, CommandId mycid, ExecStoreTuple(bufferedTuples[i], myslot, InvalidBuffer, false); recheckIndexes = ExecInsertIndexTuples(myslot, &(bufferedTuples[i]->t_self), + &(bufferedTuples[i]->t_self), NULL, estate, false, NULL, NIL); ExecARInsertTriggers(estate, resultRelInfo, bufferedTuples[i], diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 72bb06c..d8f033d 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -699,7 +699,14 @@ DefineIndex(Oid relationId, * visible to other transactions before we start to build the index. That * will prevent them from making incompatible HOT updates. The new index * will be marked not indisready and not indisvalid, so that no one else - * tries to either insert into it or use it for queries. + * tries to either insert into it or use it for queries. In addition to + * that, WARM updates will be disallowed if an update is modifying one of + * the columns used by this new index. This is necessary to ensure that we + * don't create WARM tuples which do not have a corresponding entry in this + * index. It must be noted that during the second phase, we will index only + * those heap tuples whose root line pointer is not already in the index, + * hence it's important that all tuples in a given chain have the same + * value for any indexed column (including this new index). * * We must commit our current transaction so that the index becomes * visible; then start another. Note that all the data structures we just @@ -747,7 +754,10 @@ DefineIndex(Oid relationId, * marked as "not-ready-for-inserts". The index is consulted while * deciding HOT-safety though. This arrangement ensures that no new HOT * chains can be created where the new tuple and the old tuple in the - * chain have different index keys. + * chain have different index keys. Also, the new index is consulted for + * deciding whether a WARM update is possible, and a WARM update is not + * done if a column used by this index is being updated. This ensures + * that we don't create WARM tuples which are not indexed by this index. * * We now take a new snapshot, and build the index using all tuples that * are visible in this snapshot. We can be sure that any HOT updates to @@ -782,7 +792,8 @@ DefineIndex(Oid relationId, /* * Update the pg_index row to mark the index as ready for inserts. Once we * commit this transaction, any new transactions that open the table must - * insert new entries into the index for insertions and non-HOT updates. + * insert new entries into the index for insertions and non-HOT updates or + * WARM updates where this index needs a new entry.
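 + * (Until then, this index's columns are reported in the relcache + * INDEX_ATTR_BITMAP_NOTREADY set, which keeps updates that touch them + * off the WARM path.)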
*/ index_set_state_flags(indexRelationId, INDEX_CREATE_SET_READY); diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 5d47f16..7376099 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -1033,6 +1033,19 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, break; } + /* + * If this tuple was ever WARM updated or is a WARM + * tuple, there could be multiple index entries + * pointing to the root of this chain. We can't do + * index-only scans for such tuples without rechecking + * the index keys, so mark the page as !all_visible. + */ + if (HeapTupleHeaderIsHeapWarmTuple(tuple.t_data)) + { + all_visible = false; + break; + } + /* Track newest xmin on page. */ if (TransactionIdFollows(xmin, visibility_cutoff_xid)) visibility_cutoff_xid = xmin; @@ -2159,6 +2172,18 @@ heap_page_is_all_visible(Relation rel, Buffer buf, break; } + /* + * If this tuple was ever WARM updated or is a WARM tuple, + * there could be multiple index entries pointing to the + * root of this chain. We can't do index-only scans for + * such tuples without rechecking the index keys, so mark + * the page as !all_visible. + */ + if (HeapTupleHeaderIsHeapWarmTuple(tuple.t_data)) + { + all_visible = false; + } + /* Track newest xmin on page. */ if (TransactionIdFollows(xmin, *visibility_cutoff_xid)) *visibility_cutoff_xid = xmin; diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 2142273..d62d2de 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -270,6 +270,8 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) List * ExecInsertIndexTuples(TupleTableSlot *slot, ItemPointer tupleid, + ItemPointer root_tid, + Bitmapset *modified_attrs, EState *estate, bool noDupErr, bool *specConflict, @@ -324,6 +326,17 @@ ExecInsertIndexTuples(TupleTableSlot *slot, if (!indexInfo->ii_ReadyForInserts) continue; + /* + * If modified_attrs is set, we only insert index entries for those + * indexes whose columns have changed. All other indexes can use their + * existing index pointers to look up the new tuple. + */ + if (modified_attrs) + { + if (!bms_overlap(modified_attrs, indexInfo->ii_indxattrs)) + continue; + } + /* Check for partial index */ if (indexInfo->ii_Predicate != NIL) { @@ -389,7 +402,7 @@ ExecInsertIndexTuples(TupleTableSlot *slot, index_insert(indexRelation, /* index relation */ values, /* array of index Datums */ isnull, /* null flags */ - tupleid, /* tid of heap tuple */ + root_tid, /* tid of heap or root tuple */ heapRelation, /* heap relation */ checkUnique, /* type of uniqueness check to do */ indexInfo); /* index AM may need this */ @@ -791,6 +804,9 @@ retry: { if (!HeapTupleHeaderIsHeapLatest(tup->t_data, &tup->t_self)) HeapTupleHeaderGetNextTid(tup->t_data, &ctid_wait); + else + ItemPointerCopy(&tup->t_self, &ctid_wait); + reason_wait = indexInfo->ii_ExclusionOps ?
XLTW_RecheckExclusionConstr : XLTW_InsertIndex; index_endscan(index_scan); diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index f20d728..943a30c 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -399,6 +399,8 @@ ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot) if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), + NULL, estate, false, NULL, NIL); @@ -445,6 +447,8 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate, if (!skip_tuple) { List *recheckIndexes = NIL; + bool warm_update; + Bitmapset *modified_attrs; /* Check the constraints of the tuple */ if (rel->rd_att->constr) @@ -455,13 +459,30 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate, /* OK, update the tuple and index entries for it */ simple_heap_update(rel, &searchslot->tts_tuple->t_self, - slot->tts_tuple); + slot->tts_tuple, &modified_attrs, &warm_update); if (resultRelInfo->ri_NumIndices > 0 && - !HeapTupleIsHeapOnly(slot->tts_tuple)) + (!HeapTupleIsHeapOnly(slot->tts_tuple) || warm_update)) + { + ItemPointerData root_tid; + if (warm_update) + ItemPointerSet(&root_tid, + ItemPointerGetBlockNumber(&(tuple->t_self)), + HeapTupleHeaderGetRootOffset(tuple->t_data)); + else + { + ItemPointerCopy(&tuple->t_self, + &root_tid); + bms_free(modified_attrs); + modified_attrs = NULL; + } + recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &root_tid, + modified_attrs, estate, false, NULL, NIL); + } /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(estate, resultRelInfo, diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index c1aa9f1..35b0b83 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -39,6 +39,7 @@ #include "access/relscan.h" #include "access/transam.h" +#include "access/valid.h" #include "executor/execdebug.h" #include "executor/nodeBitmapHeapscan.h" #include "pgstat.h" @@ -314,11 +315,27 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) OffsetNumber offnum = tbmres->offsets[curslot]; ItemPointerData tid; HeapTupleData heapTuple; + bool recheck = false; ItemPointerSet(&tid, page, offnum); if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, - &heapTuple, NULL, true)) - scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + &heapTuple, NULL, true, &recheck)) + { + bool valid = true; + + if (scan->rs_key) + HeapKeyTest(&heapTuple, RelationGetDescr(scan->rs_rd), + scan->rs_nkeys, scan->rs_key, valid); + if (valid) + scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + + /* + * If the heap tuple needs a recheck because of a WARM update, + * it's a lossy case + */ + if (recheck) + tbmres->recheck = true; + } } } else diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index cb6aff9..355a2d8 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -142,10 +142,10 @@ IndexNext(IndexScanState *node) false); /* don't pfree */ /* - * If the index was lossy, we have to recheck the index quals using - * the fetched tuple. + * If the index was lossy or the tuple was WARM, we have to recheck + * the index quals using the fetched tuple. 
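 + * xs_recheck applies to every tuple the scan returns; xs_tuple_recheck + * is set only for the tuple just fetched, e.g. one reached through a + * WARM chain.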
*/ - if (scandesc->xs_recheck) + if (scandesc->xs_recheck || scandesc->xs_tuple_recheck) { econtext->ecxt_scantuple = slot; ResetExprContext(econtext); diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 95e1589..a1f3440 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -512,6 +512,7 @@ ExecInsert(ModifyTableState *mtstate, /* insert index entries for tuple */ recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), NULL, estate, true, &specConflict, arbiterIndexes); @@ -558,6 +559,7 @@ ExecInsert(ModifyTableState *mtstate, /* insert index entries for tuple */ if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), NULL, estate, false, NULL, arbiterIndexes); } @@ -891,6 +893,9 @@ ExecUpdate(ItemPointer tupleid, HTSU_Result result; HeapUpdateFailureData hufd; List *recheckIndexes = NIL; + Bitmapset *modified_attrs = NULL; + ItemPointerData root_tid; + bool warm_update; /* * abort the operation if not running transactions @@ -1007,7 +1012,7 @@ lreplace:; estate->es_output_cid, estate->es_crosscheck_snapshot, true /* wait for commit */ , - &hufd, &lockmode); + &hufd, &lockmode, &modified_attrs, &warm_update); switch (result) { case HeapTupleSelfUpdated: @@ -1094,10 +1099,28 @@ lreplace:; * the t_self field. * * If it's a HOT update, we mustn't insert new index entries. + * + * If it's a WARM update, then we must insert new entries with TID + * pointing to the root of the WARM chain. */ - if (resultRelInfo->ri_NumIndices > 0 && !HeapTupleIsHeapOnly(tuple)) + if (resultRelInfo->ri_NumIndices > 0 && + (!HeapTupleIsHeapOnly(tuple) || warm_update)) + { + if (warm_update) + ItemPointerSet(&root_tid, + ItemPointerGetBlockNumber(&(tuple->t_self)), + HeapTupleHeaderGetRootOffset(tuple->t_data)); + else + { + ItemPointerCopy(&tuple->t_self, &root_tid); + bms_free(modified_attrs); + modified_attrs = NULL; + } recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &root_tid, + modified_attrs, estate, false, NULL, NIL); + } } if (canSetTag) diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 2fb9a8b..35cc6c5 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -1823,7 +1823,7 @@ pgstat_count_heap_insert(Relation rel, int n) * pgstat_count_heap_update - count a tuple update */ void -pgstat_count_heap_update(Relation rel, bool hot) +pgstat_count_heap_update(Relation rel, bool hot, bool warm) { PgStat_TableStatus *pgstat_info = rel->pgstat_info; @@ -1841,6 +1841,8 @@ pgstat_count_heap_update(Relation rel, bool hot) /* t_tuples_hot_updated is nontransactional, so just advance it */ if (hot) pgstat_info->t_counts.t_tuples_hot_updated++; + else if (warm) + pgstat_info->t_counts.t_tuples_warm_updated++; } } @@ -4088,6 +4090,7 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create) result->tuples_updated = 0; result->tuples_deleted = 0; result->tuples_hot_updated = 0; + result->tuples_warm_updated = 0; result->n_live_tuples = 0; result->n_dead_tuples = 0; result->changes_since_analyze = 0; @@ -5197,6 +5200,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->tuples_updated = tabmsg->t_counts.t_tuples_updated; tabentry->tuples_deleted = tabmsg->t_counts.t_tuples_deleted; tabentry->tuples_hot_updated = tabmsg->t_counts.t_tuples_hot_updated; + tabentry->tuples_warm_updated = tabmsg->t_counts.t_tuples_warm_updated; 
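+ /* note: pgstat_count_heap_update counts an update as HOT or WARM, never both */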
tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples; tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples; tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples; @@ -5224,6 +5228,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->tuples_updated += tabmsg->t_counts.t_tuples_updated; tabentry->tuples_deleted += tabmsg->t_counts.t_tuples_deleted; tabentry->tuples_hot_updated += tabmsg->t_counts.t_tuples_hot_updated; + tabentry->tuples_warm_updated += tabmsg->t_counts.t_tuples_warm_updated; /* If table was truncated, first reset the live/dead counters */ if (tabmsg->t_counts.t_truncated) { diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index a987d0d..b8677f3 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -145,6 +145,22 @@ pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS) Datum +pg_stat_get_tuples_warm_updated(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int64 result; + PgStat_StatTabEntry *tabentry; + + if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL) + result = 0; + else + result = (int64) (tabentry->tuples_warm_updated); + + PG_RETURN_INT64(result); +} + + +Datum pg_stat_get_live_tuples(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); @@ -1644,6 +1660,21 @@ pg_stat_get_xact_tuples_hot_updated(PG_FUNCTION_ARGS) } Datum +pg_stat_get_xact_tuples_warm_updated(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int64 result; + PgStat_TableStatus *tabentry; + + if ((tabentry = find_tabstat_entry(relid)) == NULL) + result = 0; + else + result = (int64) (tabentry->t_counts.t_tuples_warm_updated); + + PG_RETURN_INT64(result); +} + +Datum pg_stat_get_xact_blocks_fetched(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 9001e20..c85898c 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -2338,6 +2338,7 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) list_free_deep(relation->rd_fkeylist); list_free(relation->rd_indexlist); bms_free(relation->rd_indexattr); + bms_free(relation->rd_exprindexattr); bms_free(relation->rd_keyattr); bms_free(relation->rd_pkattr); bms_free(relation->rd_idattr); @@ -4352,6 +4353,13 @@ RelationGetIndexList(Relation relation) return list_copy(relation->rd_indexlist); /* + * If the index list was invalidated, we better also invalidate the index + * attribute list (which should automatically invalidate other attributes + * such as primary key and replica identity) + */ + relation->rd_indexattr = NULL; + + /* * We build the list we intend to return (in the caller's context) while * doing the scan. After successfully completing the scan, we copy that * list into the relcache entry. 
This avoids cache-context memory leakage @@ -4759,15 +4767,19 @@ Bitmapset * RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) { Bitmapset *indexattrs; /* indexed columns */ + Bitmapset *exprindexattrs; /* indexed columns in expression/predicate + indexes */ Bitmapset *uindexattrs; /* columns in unique indexes */ Bitmapset *pkindexattrs; /* columns in the primary index */ Bitmapset *idindexattrs; /* columns in the replica identity */ + Bitmapset *indxnotreadyattrs; /* columns in not-yet-ready indexes */ List *indexoidlist; List *newindexoidlist; Oid relpkindex; Oid relreplindex; ListCell *l; MemoryContext oldcxt; + bool supportswarm = true; /* true if the table can be WARM updated */ /* Quick exit if we already computed the result. */ if (relation->rd_indexattr != NULL) @@ -4782,6 +4794,10 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) return bms_copy(relation->rd_pkattr); case INDEX_ATTR_BITMAP_IDENTITY_KEY: return bms_copy(relation->rd_idattr); + case INDEX_ATTR_BITMAP_EXPR_PREDICATE: + return bms_copy(relation->rd_exprindexattr); + case INDEX_ATTR_BITMAP_NOTREADY: + return bms_copy(relation->rd_indxnotreadyattr); default: elog(ERROR, "unknown attrKind %u", attrKind); } @@ -4822,9 +4838,11 @@ restart: * won't be returned at all by RelationGetIndexList. */ indexattrs = NULL; + exprindexattrs = NULL; uindexattrs = NULL; pkindexattrs = NULL; idindexattrs = NULL; + indxnotreadyattrs = NULL; foreach(l, indexoidlist) { Oid indexOid = lfirst_oid(l); @@ -4861,6 +4879,10 @@ restart: indexattrs = bms_add_member(indexattrs, attrnum - FirstLowInvalidHeapAttributeNumber); + if (!indexInfo->ii_ReadyForInserts) + indxnotreadyattrs = bms_add_member(indxnotreadyattrs, + attrnum - FirstLowInvalidHeapAttributeNumber); + if (isKey) uindexattrs = bms_add_member(uindexattrs, attrnum - FirstLowInvalidHeapAttributeNumber); @@ -4876,10 +4898,29 @@ restart: } /* Collect all attributes used in expressions, too */ - pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &exprindexattrs); /* Collect all attributes in the index predicate, too */ - pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &exprindexattrs); + + /* + * indexattrs should include attributes referenced in index expressions + * and predicates too + */ + indexattrs = bms_add_members(indexattrs, exprindexattrs); + + if (!indexInfo->ii_ReadyForInserts) + indxnotreadyattrs = bms_add_members(indxnotreadyattrs, + exprindexattrs); + + /* + * Check if the index has the amrecheck method defined. If the method is + * not defined, the index does not support WARM updates.
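(Only btree and hash provide amrecheck in this patch.)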
Completely + * disable WARM updates on such tables + */ + if (!indexDesc->rd_amroutine->amrecheck) + supportswarm = false; + index_close(indexDesc, AccessShareLock); } @@ -4912,15 +4953,22 @@ restart: goto restart; } + /* Remember if the table can do WARM updates */ + relation->rd_supportswarm = supportswarm; + /* Don't leak the old values of these bitmaps, if any */ bms_free(relation->rd_indexattr); relation->rd_indexattr = NULL; + bms_free(relation->rd_exprindexattr); + relation->rd_exprindexattr = NULL; bms_free(relation->rd_keyattr); relation->rd_keyattr = NULL; bms_free(relation->rd_pkattr); relation->rd_pkattr = NULL; bms_free(relation->rd_idattr); relation->rd_idattr = NULL; + bms_free(relation->rd_indxnotreadyattr); + relation->rd_indxnotreadyattr = NULL; /* * Now save copies of the bitmaps in the relcache entry. We intentionally @@ -4933,7 +4981,9 @@ restart: relation->rd_keyattr = bms_copy(uindexattrs); relation->rd_pkattr = bms_copy(pkindexattrs); relation->rd_idattr = bms_copy(idindexattrs); - relation->rd_indexattr = bms_copy(indexattrs); + relation->rd_exprindexattr = bms_copy(exprindexattrs); + relation->rd_indexattr = bms_copy(bms_union(indexattrs, exprindexattrs)); + relation->rd_indxnotreadyattr = bms_copy(indxnotreadyattrs); MemoryContextSwitchTo(oldcxt); /* We return our original working copy for caller to play with */ @@ -4947,6 +4997,10 @@ restart: return bms_copy(relation->rd_pkattr); case INDEX_ATTR_BITMAP_IDENTITY_KEY: return idindexattrs; + case INDEX_ATTR_BITMAP_EXPR_PREDICATE: + return exprindexattrs; + case INDEX_ATTR_BITMAP_NOTREADY: + return indxnotreadyattrs; default: elog(ERROR, "unknown attrKind %u", attrKind); return NULL; @@ -5559,6 +5613,7 @@ load_relcache_init_file(bool shared) rel->rd_keyattr = NULL; rel->rd_pkattr = NULL; rel->rd_idattr = NULL; + rel->rd_indxnotreadyattr = NULL; rel->rd_pubactions = NULL; rel->rd_createSubid = InvalidSubTransactionId; rel->rd_newRelfilenodeSubid = InvalidSubTransactionId; diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index f919cf8..d7702e5 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -13,6 +13,7 @@ #define AMAPI_H #include "access/genam.h" +#include "access/itup.h" /* * We don't wish to include planner header files here, since most of an index @@ -152,6 +153,10 @@ typedef void (*aminitparallelscan_function) (void *target); /* (re)start parallel index scan */ typedef void (*amparallelrescan_function) (IndexScanDesc scan); +/* recheck index tuple and heap tuple match */ +typedef bool (*amrecheck_function) (Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple); + /* * API struct for an index AM. Note this must be stored in a single palloc'd * chunk of memory. 
@@ -217,6 +222,9 @@ typedef struct IndexAmRoutine amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ aminitparallelscan_function aminitparallelscan; /* can be NULL */ amparallelrescan_function amparallelrescan; /* can be NULL */ + + /* interface function to support WARM */ + amrecheck_function amrecheck; /* can be NULL */ } IndexAmRoutine; diff --git a/src/include/access/hash.h b/src/include/access/hash.h index bfdfed8..0af6b4e 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -391,4 +391,8 @@ extern void hashbucketcleanup(Relation rel, Bucket cur_bucket, bool bucket_has_garbage, IndexBulkDeleteCallback callback, void *callback_state); +/* hash.c */ +extern bool hashrecheck(Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple); + #endif /* HASH_H */ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 95aa976..9412c3a 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -137,9 +137,10 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot, Relation stats_relation); extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, - bool *all_dead, bool first_call); + bool *all_dead, bool first_call, bool *recheck); extern bool heap_hot_search(ItemPointer tid, Relation relation, - Snapshot snapshot, bool *all_dead); + Snapshot snapshot, bool *all_dead, + bool *recheck, Buffer *buffer, HeapTuple heapTuple); extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, ItemPointer tid); @@ -161,7 +162,8 @@ extern void heap_abort_speculative(Relation relation, HeapTuple tuple); extern HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd, LockTupleMode *lockmode); + HeapUpdateFailureData *hufd, LockTupleMode *lockmode, + Bitmapset **modified_attrsp, bool *warm_update); extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_update, @@ -176,7 +178,9 @@ extern bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple); extern Oid simple_heap_insert(Relation relation, HeapTuple tup); extern void simple_heap_delete(Relation relation, ItemPointer tid); extern void simple_heap_update(Relation relation, ItemPointer otid, - HeapTuple tup); + HeapTuple tup, + Bitmapset **modified_attrs, + bool *warm_update); extern void heap_sync(Relation relation); diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index e6019d5..9b081bf 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -80,6 +80,7 @@ #define XLH_UPDATE_CONTAINS_NEW_TUPLE (1<<4) #define XLH_UPDATE_PREFIX_FROM_OLD (1<<5) #define XLH_UPDATE_SUFFIX_FROM_OLD (1<<6) +#define XLH_UPDATE_WARM_UPDATE (1<<7) /* convenience macro for checking whether any form of old tuple was logged */ #define XLH_UPDATE_CONTAINS_OLD \ diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index 4d614b7..b5891ca 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -260,7 +260,8 @@ struct HeapTupleHeaderData * information stored in t_infomask2: */ #define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */ -/* bits 0x0800 are available */ +#define HEAP_WARM_TUPLE 0x0800 /* This tuple is a part of a WARM chain + */ #define HEAP_LATEST_TUPLE 0x1000 /* * 
This is the last tuple in chain and * ip_posid points to the root line @@ -271,7 +272,7 @@ struct HeapTupleHeaderData #define HEAP_HOT_UPDATED 0x4000 /* tuple was HOT-updated */ #define HEAP_ONLY_TUPLE 0x8000 /* this is heap-only tuple */ -#define HEAP2_XACT_MASK 0xF000 /* visibility-related bits */ +#define HEAP2_XACT_MASK 0xF800 /* visibility-related bits */ /* @@ -510,6 +511,21 @@ do { \ ((tup)->t_infomask2 & HEAP_ONLY_TUPLE) != 0 \ ) +#define HeapTupleHeaderSetHeapWarmTuple(tup) \ +do { \ + (tup)->t_infomask2 |= HEAP_WARM_TUPLE; \ +} while (0) + +#define HeapTupleHeaderClearHeapWarmTuple(tup) \ +do { \ + (tup)->t_infomask2 &= ~HEAP_WARM_TUPLE; \ +} while (0) + +#define HeapTupleHeaderIsHeapWarmTuple(tup) \ +( \ + ((tup)->t_infomask2 & HEAP_WARM_TUPLE) != 0 \ +) + /* * Mark this as the last tuple in the HOT chain. Before PG v10 we used to store * the TID of the tuple itself in t_ctid field to mark the end of the chain. @@ -785,6 +801,15 @@ struct MinimalTupleData #define HeapTupleClearHeapOnly(tuple) \ HeapTupleHeaderClearHeapOnly((tuple)->t_data) +#define HeapTupleIsHeapWarmTuple(tuple) \ + HeapTupleHeaderIsHeapWarmTuple((tuple)->t_data) + +#define HeapTupleSetHeapWarmTuple(tuple) \ + HeapTupleHeaderSetHeapWarmTuple((tuple)->t_data) + +#define HeapTupleClearHeapWarmTuple(tuple) \ + HeapTupleHeaderClearHeapWarmTuple((tuple)->t_data) + #define HeapTupleGetOid(tuple) \ HeapTupleHeaderGetOid((tuple)->t_data) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index f9304db..d4b35ca 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -537,6 +537,8 @@ extern bytea *btoptions(Datum reloptions, bool validate); extern bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull); +extern bool btrecheck(Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 3fc726d..f971b43 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -119,7 +119,8 @@ typedef struct IndexScanDescData HeapTupleData xs_ctup; /* current heap tuple, if any */ Buffer xs_cbuf; /* current heap buffer in scan, if any */ /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ - bool xs_recheck; /* T means scan keys must be rechecked */ + bool xs_recheck; /* T means scan keys must be rechecked for each tuple */ + bool xs_tuple_recheck; /* T means scan keys must be rechecked for current tuple */ /* * When fetching with an ordering operator, the values of the ORDER BY diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index ec4aedb..ec42c30 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2740,6 +2740,8 @@ DATA(insert OID = 1933 ( pg_stat_get_tuples_deleted PGNSP PGUID 12 1 0 0 0 f f DESCR("statistics: number of tuples deleted"); DATA(insert OID = 1972 ( pg_stat_get_tuples_hot_updated PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_tuples_hot_updated _null_ _null_ _null_ )); DESCR("statistics: number of tuples hot updated"); +DATA(insert OID = 3353 ( pg_stat_get_tuples_warm_updated PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_tuples_warm_updated _null_ _null_ _null_ )); +DESCR("statistics: number of tuples warm updated"); DATA(insert OID = 2878 ( pg_stat_get_live_tuples PGNSP PGUID 12 1 0 
0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_live_tuples _null_ _null_ _null_ )); DESCR("statistics: number of live tuples"); DATA(insert OID = 2879 ( pg_stat_get_dead_tuples PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_dead_tuples _null_ _null_ _null_ )); @@ -2892,6 +2894,8 @@ DATA(insert OID = 3042 ( pg_stat_get_xact_tuples_deleted PGNSP PGUID 12 1 0 0 DESCR("statistics: number of tuples deleted in current transaction"); DATA(insert OID = 3043 ( pg_stat_get_xact_tuples_hot_updated PGNSP PGUID 12 1 0 0 0 f f f f t f v r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_xact_tuples_hot_updated _null_ _null_ _null_ )); DESCR("statistics: number of tuples hot updated in current transaction"); +DATA(insert OID = 3354 ( pg_stat_get_xact_tuples_warm_updated PGNSP PGUID 12 1 0 0 0 f f f f t f v r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_xact_tuples_warm_updated _null_ _null_ _null_ )); +DESCR("statistics: number of tuples warm updated in current transaction"); DATA(insert OID = 3044 ( pg_stat_get_xact_blocks_fetched PGNSP PGUID 12 1 0 0 0 f f f f t f v r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_xact_blocks_fetched _null_ _null_ _null_ )); DESCR("statistics: number of blocks fetched in current transaction"); DATA(insert OID = 3045 ( pg_stat_get_xact_blocks_hit PGNSP PGUID 12 1 0 0 0 f f f f t f v r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_xact_blocks_hit _null_ _null_ _null_ )); diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 02dbe7b..c4495a3 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -382,6 +382,7 @@ extern void UnregisterExprContextCallback(ExprContext *econtext, extern void ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative); extern void ExecCloseIndices(ResultRelInfo *resultRelInfo); extern List *ExecInsertIndexTuples(TupleTableSlot *slot, ItemPointer tupleid, + ItemPointer root_tid, Bitmapset *modified_attrs, EState *estate, bool noDupErr, bool *specConflict, List *arbiterIndexes); extern bool ExecCheckIndexConstraints(TupleTableSlot *slot, EState *estate, diff --git a/src/include/executor/nodeIndexscan.h b/src/include/executor/nodeIndexscan.h index ea3f3a5..ebeec74 100644 --- a/src/include/executor/nodeIndexscan.h +++ b/src/include/executor/nodeIndexscan.h @@ -41,5 +41,4 @@ extern void ExecIndexEvalRuntimeKeys(ExprContext *econtext, extern bool ExecIndexEvalArrayKeys(ExprContext *econtext, IndexArrayKeyInfo *arrayKeys, int numArrayKeys); extern bool ExecIndexAdvanceArrayKeys(IndexArrayKeyInfo *arrayKeys, int numArrayKeys); - #endif /* NODEINDEXSCAN_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 2fde67a..0b16157 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -64,6 +64,7 @@ typedef struct IndexInfo NodeTag type; int ii_NumIndexAttrs; AttrNumber ii_KeyAttrNumbers[INDEX_MAX_KEYS]; + Bitmapset *ii_indxattrs; /* bitmap of all columns used in this index */ List *ii_Expressions; /* list of Expr */ List *ii_ExpressionsState; /* list of ExprState */ List *ii_Predicate; /* list of Expr */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 0062fb8..70a7c8d 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -105,6 +105,7 @@ typedef struct PgStat_TableCounts PgStat_Counter t_tuples_updated; PgStat_Counter t_tuples_deleted; PgStat_Counter t_tuples_hot_updated; + 
PgStat_Counter t_tuples_warm_updated; bool t_truncated; PgStat_Counter t_delta_live_tuples; @@ -625,6 +626,7 @@ typedef struct PgStat_StatTabEntry PgStat_Counter tuples_updated; PgStat_Counter tuples_deleted; PgStat_Counter tuples_hot_updated; + PgStat_Counter tuples_warm_updated; PgStat_Counter n_live_tuples; PgStat_Counter n_dead_tuples; @@ -1178,7 +1180,7 @@ pgstat_report_wait_end(void) (pgStatBlockWriteTime += (n)) extern void pgstat_count_heap_insert(Relation rel, int n); -extern void pgstat_count_heap_update(Relation rel, bool hot); +extern void pgstat_count_heap_update(Relation rel, bool hot, bool warm); extern void pgstat_count_heap_delete(Relation rel); extern void pgstat_count_truncate(Relation rel); extern void pgstat_update_heap_dead_tuples(Relation rel, int delta); diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index a617a7c..fbac7c0 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -138,9 +138,14 @@ typedef struct RelationData /* data managed by RelationGetIndexAttrBitmap: */ Bitmapset *rd_indexattr; /* identifies columns used in indexes */ + Bitmapset *rd_exprindexattr; /* identifies columns used in expression or + predicate indexes */ + Bitmapset *rd_indxnotreadyattr; /* columns used by indexes not yet + ready */ Bitmapset *rd_keyattr; /* cols that can be ref'd by foreign keys */ Bitmapset *rd_pkattr; /* cols included in primary key */ Bitmapset *rd_idattr; /* included in replica identity index */ + bool rd_supportswarm; /* true if the table can be WARM updated */ PublicationActions *rd_pubactions; /* publication actions */ diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index da36b67..d18bd09 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -50,7 +50,9 @@ typedef enum IndexAttrBitmapKind INDEX_ATTR_BITMAP_ALL, INDEX_ATTR_BITMAP_KEY, INDEX_ATTR_BITMAP_PRIMARY_KEY, - INDEX_ATTR_BITMAP_IDENTITY_KEY + INDEX_ATTR_BITMAP_IDENTITY_KEY, + INDEX_ATTR_BITMAP_EXPR_PREDICATE, + INDEX_ATTR_BITMAP_NOTREADY } IndexAttrBitmapKind; extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation, diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index c661f1d..561d9579 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1732,6 +1732,7 @@ pg_stat_all_tables| SELECT c.oid AS relid, pg_stat_get_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(c.oid) AS n_tup_del, pg_stat_get_tuples_hot_updated(c.oid) AS n_tup_hot_upd, + pg_stat_get_tuples_warm_updated(c.oid) AS n_tup_warm_upd, pg_stat_get_live_tuples(c.oid) AS n_live_tup, pg_stat_get_dead_tuples(c.oid) AS n_dead_tup, pg_stat_get_mod_since_analyze(c.oid) AS n_mod_since_analyze, @@ -1875,6 +1876,7 @@ pg_stat_sys_tables| SELECT pg_stat_all_tables.relid, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, + pg_stat_all_tables.n_tup_warm_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.n_mod_since_analyze, @@ -1918,6 +1920,7 @@ pg_stat_user_tables| SELECT pg_stat_all_tables.relid, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, + pg_stat_all_tables.n_tup_warm_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.n_mod_since_analyze, @@ -1955,7 +1958,8 @@ pg_stat_xact_all_tables| SELECT c.oid AS relid, pg_stat_get_xact_tuples_inserted(c.oid) AS n_tup_ins, pg_stat_get_xact_tuples_updated(c.oid) AS n_tup_upd,
pg_stat_get_xact_tuples_deleted(c.oid) AS n_tup_del, - pg_stat_get_xact_tuples_hot_updated(c.oid) AS n_tup_hot_upd + pg_stat_get_xact_tuples_hot_updated(c.oid) AS n_tup_hot_upd, + pg_stat_get_xact_tuples_warm_updated(c.oid) AS n_tup_warm_upd FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) @@ -1971,7 +1975,8 @@ pg_stat_xact_sys_tables| SELECT pg_stat_xact_all_tables.relid, pg_stat_xact_all_tables.n_tup_ins, pg_stat_xact_all_tables.n_tup_upd, pg_stat_xact_all_tables.n_tup_del, - pg_stat_xact_all_tables.n_tup_hot_upd + pg_stat_xact_all_tables.n_tup_hot_upd, + pg_stat_xact_all_tables.n_tup_warm_upd FROM pg_stat_xact_all_tables WHERE ((pg_stat_xact_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_xact_all_tables.schemaname ~ '^pg_toast'::text)); pg_stat_xact_user_functions| SELECT p.oid AS funcid, @@ -1993,7 +1998,8 @@ pg_stat_xact_user_tables| SELECT pg_stat_xact_all_tables.relid, pg_stat_xact_all_tables.n_tup_ins, pg_stat_xact_all_tables.n_tup_upd, pg_stat_xact_all_tables.n_tup_del, - pg_stat_xact_all_tables.n_tup_hot_upd + pg_stat_xact_all_tables.n_tup_hot_upd, + pg_stat_xact_all_tables.n_tup_warm_upd FROM pg_stat_xact_all_tables WHERE ((pg_stat_xact_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_xact_all_tables.schemaname !~ '^pg_toast'::text)); pg_statio_all_indexes| SELECT c.oid AS relid, diff --git a/src/test/regress/expected/warm.out b/src/test/regress/expected/warm.out new file mode 100644 index 0000000..6391891 --- /dev/null +++ b/src/test/regress/expected/warm.out @@ -0,0 +1,367 @@ +CREATE TABLE updtst_tab1 (a integer unique, b int, c text, d text); +CREATE INDEX updtst_indx1 ON updtst_tab1 (b); +INSERT INTO updtst_tab1 + SELECT generate_series(1,10000), generate_series(70001, 80000), 'foo', 'bar'; +-- This should be a HOT update as non-index key is updated, but the +-- page won't have any free space, so probably a non-HOT update +UPDATE updtst_tab1 SET c = 'foo1' WHERE a = 1; +-- Next update should be a HOT update as dead space is recycled +UPDATE updtst_tab1 SET c = 'foo2' WHERE a = 1; +-- And next too +UPDATE updtst_tab1 SET c = 'foo3' WHERE a = 1; +-- Now update one of the index key columns +UPDATE updtst_tab1 SET b = b + 70000 WHERE a = 1; +-- Ensure that the correct row is fetched +SELECT * FROM updtst_tab1 WHERE a = 1; + a | b | c | d +---+--------+------+----- + 1 | 140001 | foo3 | bar +(1 row) + +SELECT * FROM updtst_tab1 WHERE b = 70001 + 70000; + a | b | c | d +---+--------+------+----- + 1 | 140001 | foo3 | bar +(1 row) + +-- Even when seqscan is disabled and indexscan is forced +SET enable_seqscan = false; +EXPLAIN (costs off) SELECT * FROM updtst_tab1 WHERE b = 70001 + 70000; + QUERY PLAN +----------------------------------------- + Bitmap Heap Scan on updtst_tab1 + Recheck Cond: (b = 140001) + -> Bitmap Index Scan on updtst_indx1 + Index Cond: (b = 140001) +(4 rows) + +SELECT * FROM updtst_tab1 WHERE b = 70001 + 70000; + a | b | c | d +---+--------+------+----- + 1 | 140001 | foo3 | bar +(1 row) + +-- Check if index only scan works correctly +EXPLAIN (costs off) SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000; + QUERY PLAN +----------------------------------------- + Bitmap Heap Scan on updtst_tab1 + Recheck Cond: (b = 140001) + -> Bitmap Index Scan on updtst_indx1 + Index Cond: (b = 140001) +(4 rows) + +SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000; + b +-------- + 140001 +(1 row) + +-- 
Table must be vacuumed to force index-only scan +VACUUM updtst_tab1; +EXPLAIN (costs off) SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000; + QUERY PLAN +--------------------------------------------------- + Index Only Scan using updtst_indx1 on updtst_tab1 + Index Cond: (b = 140001) +(2 rows) + +SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000; + b +-------- + 140001 +(1 row) + +SET enable_seqscan = true; +DROP TABLE updtst_tab1; +------------------ +CREATE TABLE updtst_tab2 (a integer unique, b int, c text, d text) WITH (fillfactor = 80); +CREATE INDEX updtst_indx2 ON updtst_tab2 (b); +INSERT INTO updtst_tab2 + SELECT generate_series(1,100), generate_series(701, 800), 'foo', 'bar'; +UPDATE updtst_tab2 SET b = b + 700 WHERE a = 1; +UPDATE updtst_tab2 SET c = 'foo1' WHERE a = 1; +UPDATE updtst_tab2 SET c = 'foo2' WHERE a = 1; +UPDATE updtst_tab2 SET c = 'foo3' WHERE a = 1; +UPDATE updtst_tab2 SET b = b - 700 WHERE a = 1; +UPDATE updtst_tab2 SET c = 'foo4' WHERE a = 1; +UPDATE updtst_tab2 SET c = 'foo5' WHERE a = 1; +UPDATE updtst_tab2 SET c = 'foo6' WHERE a = 1; +SELECT count(*) FROM updtst_tab2 WHERE c = 'foo'; + count +------- + 99 +(1 row) + +SELECT * FROM updtst_tab2 WHERE c = 'foo6'; + a | b | c | d +---+-----+------+----- + 1 | 701 | foo6 | bar +(1 row) + +EXPLAIN (costs off) SELECT * FROM updtst_tab2 WHERE b = 701; + QUERY PLAN +----------------------------------------- + Bitmap Heap Scan on updtst_tab2 + Recheck Cond: (b = 701) + -> Bitmap Index Scan on updtst_indx2 + Index Cond: (b = 701) +(4 rows) + +SELECT * FROM updtst_tab2 WHERE a = 1; + a | b | c | d +---+-----+------+----- + 1 | 701 | foo6 | bar +(1 row) + +SET enable_seqscan = false; +EXPLAIN (costs off) SELECT * FROM updtst_tab2 WHERE b = 701; + QUERY PLAN +----------------------------------------- + Bitmap Heap Scan on updtst_tab2 + Recheck Cond: (b = 701) + -> Bitmap Index Scan on updtst_indx2 + Index Cond: (b = 701) +(4 rows) + +SELECT * FROM updtst_tab2 WHERE b = 701; + a | b | c | d +---+-----+------+----- + 1 | 701 | foo6 | bar +(1 row) + +VACUUM updtst_tab2; +EXPLAIN (costs off) SELECT b FROM updtst_tab2 WHERE b = 701; + QUERY PLAN +--------------------------------------------------- + Index Only Scan using updtst_indx2 on updtst_tab2 + Index Cond: (b = 701) +(2 rows) + +SELECT b FROM updtst_tab2 WHERE b = 701; + b +----- + 701 +(1 row) + +SET enable_seqscan = true; +DROP TABLE updtst_tab2; +------------------ +CREATE TABLE updtst_tab3 (a integer unique, b int, c text, d text) WITH (fillfactor = 80); +CREATE INDEX updtst_indx3 ON updtst_tab3 (b); +INSERT INTO updtst_tab3 + SELECT generate_series(1,100), generate_series(701, 800), 'foo', 'bar'; +BEGIN; +UPDATE updtst_tab3 SET c = 'foo1', b = b + 700 WHERE a = 1; +UPDATE updtst_tab3 SET c = 'foo2' WHERE a = 1; +UPDATE updtst_tab3 SET c = 'foo3' WHERE a = 1; +UPDATE updtst_tab3 SET b = b - 700 WHERE a = 1; +UPDATE updtst_tab3 SET c = 'foo4' WHERE a = 1; +UPDATE updtst_tab3 SET c = 'foo5' WHERE a = 1; +UPDATE updtst_tab3 SET c = 'foo6' WHERE a = 1; +-- Abort the transaction and ensure the original tuple is visible correctly +ROLLBACK; +BEGIN; +UPDATE updtst_tab3 SET c = 'foo11', b = b + 750 WHERE b = 701; +UPDATE updtst_tab3 SET c = 'foo12' WHERE a = 1; +UPDATE updtst_tab3 SET b = b - 30 WHERE a = 1; +COMMIT; +SELECT count(*) FROM updtst_tab3 WHERE c = 'foo'; + count +------- + 99 +(1 row) + +SELECT * FROM updtst_tab3 WHERE c = 'foo6'; + a | b | c | d +---+---+---+--- +(0 rows) + +SELECT * FROM updtst_tab3 WHERE c = 'foo12'; + a | b | c | d +---+------+-------+----- + 
1 | 1421 | foo12 | bar +(1 row) + +SELECT * FROM updtst_tab3 WHERE b = 701; + a | b | c | d +---+---+---+--- +(0 rows) + +SELECT * FROM updtst_tab3 WHERE b = 1421; + a | b | c | d +---+------+-------+----- + 1 | 1421 | foo12 | bar +(1 row) + +SELECT * FROM updtst_tab3 WHERE a = 1; + a | b | c | d +---+------+-------+----- + 1 | 1421 | foo12 | bar +(1 row) + +SELECT * FROM updtst_tab3 WHERE b = 701; + a | b | c | d +---+---+---+--- +(0 rows) + +SELECT * FROM updtst_tab3 WHERE b = 1421; + a | b | c | d +---+------+-------+----- + 1 | 1421 | foo12 | bar +(1 row) + +VACUUM updtst_tab3; +EXPLAIN (costs off) SELECT b FROM updtst_tab3 WHERE b = 701; + QUERY PLAN +------------------------- + Seq Scan on updtst_tab3 + Filter: (b = 701) +(2 rows) + +SELECT b FROM updtst_tab3 WHERE b = 701; + b +--- +(0 rows) + +SELECT b FROM updtst_tab3 WHERE b = 1421; + b +------ + 1421 +(1 row) + +BEGIN; +UPDATE updtst_tab3 SET c = 'foo21', b = b + 700 WHERE a = 2; +UPDATE updtst_tab3 SET c = 'foo22' WHERE a = 2; +UPDATE updtst_tab3 SET c = 'foo23' WHERE a = 2; +UPDATE updtst_tab3 SET b = b - 700 WHERE a = 2; +UPDATE updtst_tab3 SET c = 'foo24' WHERE a = 2; +UPDATE updtst_tab3 SET c = 'foo25' WHERE a = 2; +UPDATE updtst_tab3 SET c = 'foo26' WHERE a = 2; +-- Abort the transaction and ensure the original tuple is visible correctly +ROLLBACK; +SET enable_seqscan = false; +BEGIN; +UPDATE updtst_tab3 SET c = 'foo21', b = b + 750 WHERE b = 702; +UPDATE updtst_tab3 SET c = 'foo22' WHERE a = 2; +UPDATE updtst_tab3 SET b = b - 30 WHERE a = 2; +COMMIT; +SELECT count(*) FROM updtst_tab3 WHERE c = 'foo'; + count +------- + 98 +(1 row) + +SELECT * FROM updtst_tab3 WHERE c = 'foo26'; + a | b | c | d +---+---+---+--- +(0 rows) + +SELECT * FROM updtst_tab3 WHERE c = 'foo22'; + a | b | c | d +---+------+-------+----- + 2 | 1422 | foo22 | bar +(1 row) + +SELECT * FROM updtst_tab3 WHERE b = 702; + a | b | c | d +---+---+---+--- +(0 rows) + +SELECT * FROM updtst_tab3 WHERE b = 1422; + a | b | c | d +---+------+-------+----- + 2 | 1422 | foo22 | bar +(1 row) + +SELECT * FROM updtst_tab3 WHERE a = 2; + a | b | c | d +---+------+-------+----- + 2 | 1422 | foo22 | bar +(1 row) + +-- Try fetching both old and new value using updtst_indx3 +SELECT * FROM updtst_tab3 WHERE b = 702; + a | b | c | d +---+---+---+--- +(0 rows) + +SELECT * FROM updtst_tab3 WHERE b = 1422; + a | b | c | d +---+------+-------+----- + 2 | 1422 | foo22 | bar +(1 row) + +VACUUM updtst_tab3; +EXPLAIN (costs off) SELECT b FROM updtst_tab3 WHERE b = 702; + QUERY PLAN +--------------------------------------------------- + Index Only Scan using updtst_indx3 on updtst_tab3 + Index Cond: (b = 702) +(2 rows) + +SELECT b FROM updtst_tab3 WHERE b = 702; + b +--- +(0 rows) + +SELECT b FROM updtst_tab3 WHERE b = 1422; + b +------ + 1422 +(1 row) + +SET enable_seqscan = true; +DROP TABLE updtst_tab3; +------------------ +CREATE TABLE test_warm (a text unique, b text); +CREATE INDEX test_warmindx ON test_warm (lower(a)); +INSERT INTO test_warm values ('test', 'foo'); +UPDATE test_warm SET a = 'TEST'; +select *, ctid from test_warm where lower(a) = 'test'; + a | b | ctid +------+-----+------- + TEST | foo | (0,2) +(1 row) + +explain select * from test_warm where lower(a) = 'test'; + QUERY PLAN +---------------------------------------------------------------------------- + Bitmap Heap Scan on test_warm (cost=4.18..12.65 rows=4 width=64) + Recheck Cond: (lower(a) = 'test'::text) + -> Bitmap Index Scan on test_warmindx (cost=0.00..4.18 rows=4 width=0) + Index Cond: (lower(a) = 
'test'::text) +(4 rows) + +select *, ctid from test_warm where lower(a) = 'test'; + a | b | ctid +------+-----+------- + TEST | foo | (0,2) +(1 row) + +select *, ctid from test_warm where a = 'test'; + a | b | ctid +---+---+------ +(0 rows) + +select *, ctid from test_warm where a = 'TEST'; + a | b | ctid +------+-----+------- + TEST | foo | (0,2) +(1 row) + +set enable_bitmapscan TO false; +explain select * from test_warm where lower(a) = 'test'; + QUERY PLAN +--------------------------------------------------------------------------------- + Index Scan using test_warmindx on test_warm (cost=0.15..20.22 rows=4 width=64) + Index Cond: (lower(a) = 'test'::text) +(2 rows) + +select *, ctid from test_warm where lower(a) = 'test'; + a | b | ctid +------+-----+------- + TEST | foo | (0,2) +(1 row) + +DROP TABLE test_warm; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 13bf494..0b6193b 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -42,6 +42,8 @@ test: create_type test: create_table test: create_function_2 +test: warm + # ---------- # Load huge amounts of data # We should split the data files into single files and then diff --git a/src/test/regress/sql/warm.sql b/src/test/regress/sql/warm.sql new file mode 100644 index 0000000..c025087 --- /dev/null +++ b/src/test/regress/sql/warm.sql @@ -0,0 +1,171 @@ +-- WARM update tests + +CREATE TABLE updtst_tab1 (a integer unique, b int, c text, d text); +CREATE INDEX updtst_indx1 ON updtst_tab1 (b); +INSERT INTO updtst_tab1 + SELECT generate_series(1,10000), generate_series(70001, 80000), 'foo', 'bar'; + +-- This should be a HOT update as non-index key is updated, but the +-- page won't have any free space, so probably a non-HOT update +UPDATE updtst_tab1 SET c = 'foo1' WHERE a = 1; + +-- Next update should be a HOT update as dead space is recycled +UPDATE updtst_tab1 SET c = 'foo2' WHERE a = 1; + +-- And next too +UPDATE updtst_tab1 SET c = 'foo3' WHERE a = 1; + +-- Now update one of the index key columns +UPDATE updtst_tab1 SET b = b + 70000 WHERE a = 1; + +-- Ensure that the correct row is fetched +SELECT * FROM updtst_tab1 WHERE a = 1; +SELECT * FROM updtst_tab1 WHERE b = 70001 + 70000; + +-- Even when seqscan is disabled and indexscan is forced +SET enable_seqscan = false; +EXPLAIN (costs off) SELECT * FROM updtst_tab1 WHERE b = 70001 + 70000; +SELECT * FROM updtst_tab1 WHERE b = 70001 + 70000; + +-- Check if index only scan works correctly +EXPLAIN (costs off) SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000; +SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000; + +-- Table must be vacuumed to force index-only scan +VACUUM updtst_tab1; +EXPLAIN (costs off) SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000; +SELECT b FROM updtst_tab1 WHERE b = 70001 + 70000; + +SET enable_seqscan = true; + +DROP TABLE updtst_tab1; + +------------------ + +CREATE TABLE updtst_tab2 (a integer unique, b int, c text, d text) WITH (fillfactor = 80); +CREATE INDEX updtst_indx2 ON updtst_tab2 (b); +INSERT INTO updtst_tab2 + SELECT generate_series(1,100), generate_series(701, 800), 'foo', 'bar'; + +UPDATE updtst_tab2 SET b = b + 700 WHERE a = 1; +UPDATE updtst_tab2 SET c = 'foo1' WHERE a = 1; +UPDATE updtst_tab2 SET c = 'foo2' WHERE a = 1; +UPDATE updtst_tab2 SET c = 'foo3' WHERE a = 1; +UPDATE updtst_tab2 SET b = b - 700 WHERE a = 1; +UPDATE updtst_tab2 SET c = 'foo4' WHERE a = 1; +UPDATE updtst_tab2 SET c = 'foo5' WHERE a = 1; +UPDATE updtst_tab2 SET c = 'foo6' WHERE a = 1; + +SELECT 
count(*) FROM updtst_tab2 WHERE c = 'foo'; +SELECT * FROM updtst_tab2 WHERE c = 'foo6'; + +EXPLAIN (costs off) SELECT * FROM updtst_tab2 WHERE b = 701; +SELECT * FROM updtst_tab2 WHERE a = 1; + +SET enable_seqscan = false; +EXPLAIN (costs off) SELECT * FROM updtst_tab2 WHERE b = 701; +SELECT * FROM updtst_tab2 WHERE b = 701; + +VACUUM updtst_tab2; +EXPLAIN (costs off) SELECT b FROM updtst_tab2 WHERE b = 701; +SELECT b FROM updtst_tab2 WHERE b = 701; + +SET enable_seqscan = true; + +DROP TABLE updtst_tab2; +------------------ + +CREATE TABLE updtst_tab3 (a integer unique, b int, c text, d text) WITH (fillfactor = 80); +CREATE INDEX updtst_indx3 ON updtst_tab3 (b); +INSERT INTO updtst_tab3 + SELECT generate_series(1,100), generate_series(701, 800), 'foo', 'bar'; + +BEGIN; +UPDATE updtst_tab3 SET c = 'foo1', b = b + 700 WHERE a = 1; +UPDATE updtst_tab3 SET c = 'foo2' WHERE a = 1; +UPDATE updtst_tab3 SET c = 'foo3' WHERE a = 1; +UPDATE updtst_tab3 SET b = b - 700 WHERE a = 1; +UPDATE updtst_tab3 SET c = 'foo4' WHERE a = 1; +UPDATE updtst_tab3 SET c = 'foo5' WHERE a = 1; +UPDATE updtst_tab3 SET c = 'foo6' WHERE a = 1; + +-- Abort the transaction and ensure the original tuple is visible correctly +ROLLBACK; + +BEGIN; +UPDATE updtst_tab3 SET c = 'foo11', b = b + 750 WHERE b = 701; +UPDATE updtst_tab3 SET c = 'foo12' WHERE a = 1; +UPDATE updtst_tab3 SET b = b - 30 WHERE a = 1; +COMMIT; + +SELECT count(*) FROM updtst_tab3 WHERE c = 'foo'; +SELECT * FROM updtst_tab3 WHERE c = 'foo6'; +SELECT * FROM updtst_tab3 WHERE c = 'foo12'; + +SELECT * FROM updtst_tab3 WHERE b = 701; +SELECT * FROM updtst_tab3 WHERE b = 1421; +SELECT * FROM updtst_tab3 WHERE a = 1; + +SELECT * FROM updtst_tab3 WHERE b = 701; +SELECT * FROM updtst_tab3 WHERE b = 1421; + +VACUUM updtst_tab3; +EXPLAIN (costs off) SELECT b FROM updtst_tab3 WHERE b = 701; +SELECT b FROM updtst_tab3 WHERE b = 701; +SELECT b FROM updtst_tab3 WHERE b = 1421; + +BEGIN; +UPDATE updtst_tab3 SET c = 'foo21', b = b + 700 WHERE a = 2; +UPDATE updtst_tab3 SET c = 'foo22' WHERE a = 2; +UPDATE updtst_tab3 SET c = 'foo23' WHERE a = 2; +UPDATE updtst_tab3 SET b = b - 700 WHERE a = 2; +UPDATE updtst_tab3 SET c = 'foo24' WHERE a = 2; +UPDATE updtst_tab3 SET c = 'foo25' WHERE a = 2; +UPDATE updtst_tab3 SET c = 'foo26' WHERE a = 2; + +-- Abort the transaction and ensure the original tuple is visible correctly +ROLLBACK; + +SET enable_seqscan = false; + +BEGIN; +UPDATE updtst_tab3 SET c = 'foo21', b = b + 750 WHERE b = 702; +UPDATE updtst_tab3 SET c = 'foo22' WHERE a = 2; +UPDATE updtst_tab3 SET b = b - 30 WHERE a = 2; +COMMIT; + +SELECT count(*) FROM updtst_tab3 WHERE c = 'foo'; +SELECT * FROM updtst_tab3 WHERE c = 'foo26'; +SELECT * FROM updtst_tab3 WHERE c = 'foo22'; + +SELECT * FROM updtst_tab3 WHERE b = 702; +SELECT * FROM updtst_tab3 WHERE b = 1422; +SELECT * FROM updtst_tab3 WHERE a = 2; + +-- Try fetching both old and new value using updtst_indx3 +SELECT * FROM updtst_tab3 WHERE b = 702; +SELECT * FROM updtst_tab3 WHERE b = 1422; + +VACUUM updtst_tab3; +EXPLAIN (costs off) SELECT b FROM updtst_tab3 WHERE b = 702; +SELECT b FROM updtst_tab3 WHERE b = 702; +SELECT b FROM updtst_tab3 WHERE b = 1422; + +SET enable_seqscan = true; + +DROP TABLE updtst_tab3; +------------------ + +CREATE TABLE test_warm (a text unique, b text); +CREATE INDEX test_warmindx ON test_warm (lower(a)); +INSERT INTO test_warm values ('test', 'foo'); +UPDATE test_warm SET a = 'TEST'; +select *, ctid from test_warm where lower(a) = 'test'; +explain select * from test_warm where lower(a) = 
'test'; +select *, ctid from test_warm where lower(a) = 'test'; +select *, ctid from test_warm where a = 'test'; +select *, ctid from test_warm where a = 'TEST'; +set enable_bitmapscan TO false; +explain select * from test_warm where lower(a) = 'test'; +select *, ctid from test_warm where lower(a) = 'test'; +DROP TABLE test_warm; -- 2.1.4
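A quick standalone check of the new t_infomask2 bit layout (constants copied from the htup_details.h hunk above; this tiny program is illustrative, not part of the patch, and mirrors the Set/Is/Clear macros):

#include <assert.h>
#include <stdint.h>

/* t_infomask2 constants as defined by this patch in htup_details.h */
#define HEAP_NATTS_MASK   0x07FF
#define HEAP_WARM_TUPLE   0x0800
#define HEAP_LATEST_TUPLE 0x1000
#define HEAP2_XACT_MASK   0xF800

int
main(void)
{
	uint16_t	infomask2 = 231;	/* arbitrary attribute count */

	/* the WARM bit must not collide with the attribute-count bits ... */
	assert((HEAP_WARM_TUPLE & HEAP_NATTS_MASK) == 0);
	/* ... and must be covered by the widened HEAP2_XACT_MASK */
	assert((HEAP_WARM_TUPLE & HEAP2_XACT_MASK) == HEAP_WARM_TUPLE);

	/* HeapTupleHeaderSetHeapWarmTuple / ...IsHeapWarmTuple equivalents */
	infomask2 |= HEAP_WARM_TUPLE;
	assert((infomask2 & HEAP_WARM_TUPLE) != 0);
	assert((infomask2 & HEAP_NATTS_MASK) == 231);	/* natts untouched */

	/* HeapTupleHeaderClearHeapWarmTuple equivalent */
	infomask2 &= ~HEAP_WARM_TUPLE;
	assert((infomask2 & HEAP_WARM_TUPLE) == 0);

	return 0;
}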