From 2c901fe7c1829d21e3630070750c12d4415fb40c Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Wed, 8 Mar 2017 13:51:12 -0300 Subject: [PATCH 6/6] warm chain conversion v16 --- contrib/bloom/blvacuum.c | 2 +- src/backend/access/gin/ginvacuum.c | 3 +- src/backend/access/gist/gistvacuum.c | 3 +- src/backend/access/hash/hash.c | 82 ++++- src/backend/access/hash/hashpage.c | 14 + src/backend/access/heap/heapam.c | 323 +++++++++++++++-- src/backend/access/heap/tuptoaster.c | 3 +- src/backend/access/index/indexam.c | 9 +- src/backend/access/nbtree/nbtpage.c | 51 ++- src/backend/access/nbtree/nbtree.c | 75 +++- src/backend/access/nbtree/nbtxlog.c | 99 +---- src/backend/access/rmgrdesc/heapdesc.c | 26 +- src/backend/access/rmgrdesc/nbtdesc.c | 4 +- src/backend/access/spgist/spgvacuum.c | 12 +- src/backend/catalog/index.c | 11 +- src/backend/catalog/indexing.c | 5 +- src/backend/commands/constraint.c | 3 +- src/backend/commands/vacuumlazy.c | 602 +++++++++++++++++++++++++++++-- src/backend/executor/execIndexing.c | 3 +- src/backend/replication/logical/decode.c | 13 +- src/backend/utils/time/combocid.c | 4 +- src/backend/utils/time/tqual.c | 24 +- src/include/access/amapi.h | 9 + src/include/access/genam.h | 22 +- src/include/access/hash.h | 11 + src/include/access/heapam.h | 18 + src/include/access/heapam_xlog.h | 23 +- src/include/access/htup_details.h | 84 ++++- src/include/access/nbtree.h | 18 +- src/include/access/nbtxlog.h | 26 +- src/include/commands/progress.h | 1 + 31 files changed, 1321 insertions(+), 262 deletions(-) diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c index 04abd0f..ff50361 100644 --- a/contrib/bloom/blvacuum.c +++ b/contrib/bloom/blvacuum.c @@ -88,7 +88,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, while (itup < itupEnd) { /* Do we have to delete this tuple? 
*/ - if (callback(&itup->heapPtr, callback_state)) + if (callback(&itup->heapPtr, false, callback_state) == IBDCR_DELETE) { /* Yes; adjust count of tuples that will be left on page */ BloomPageGetOpaque(page)->maxoff--; diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index c9ccfee..8ed71c5 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -56,7 +56,8 @@ ginVacuumItemPointers(GinVacuumState *gvs, ItemPointerData *items, */ for (i = 0; i < nitem; i++) { - if (gvs->callback(items + i, gvs->callback_state)) + if (gvs->callback(items + i, false, gvs->callback_state) == + IBDCR_DELETE) { gvs->result->tuples_removed += 1; if (!tmpitems) diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 77d9d12..0955db6 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -202,7 +202,8 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); - if (callback(&(idxtuple->t_tid), callback_state)) + if (callback(&(idxtuple->t_tid), false, callback_state) == + IBDCR_DELETE) todelete[ntodelete++] = i; else stats->num_index_tuples += 1; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 9b20ae6..5310c67 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -73,6 +73,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->ambuild = hashbuild; amroutine->ambuildempty = hashbuildempty; amroutine->aminsert = hashinsert; + amroutine->amwarminsert = hashwarminsert; amroutine->ambulkdelete = hashbulkdelete; amroutine->amvacuumcleanup = hashvacuumcleanup; amroutine->amcanreturn = NULL; @@ -231,11 +232,11 @@ hashbuildCallback(Relation index, * Hash on the heap tuple's key, form an index tuple with hash code. * Find the appropriate location for the new tuple, and put it there. */ -bool -hashinsert(Relation rel, Datum *values, bool *isnull, +static bool +hashinsert_internal(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, - IndexInfo *indexInfo) + IndexInfo *indexInfo, bool warm_update) { Datum index_values[1]; bool index_isnull[1]; @@ -251,6 +252,11 @@ hashinsert(Relation rel, Datum *values, bool *isnull, itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull); itup->t_tid = *ht_ctid; + if (warm_update) + ItemPointerSetFlags(&itup->t_tid, HASH_INDEX_RED_POINTER); + else + ItemPointerClearFlags(&itup->t_tid); + _hash_doinsert(rel, itup); pfree(itup); @@ -258,6 +264,26 @@ hashinsert(Relation rel, Datum *values, bool *isnull, return false; } +bool +hashinsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) +{ + return hashinsert_internal(rel, values, isnull, ht_ctid, heapRel, + checkUnique, indexInfo, false); +} + +bool +hashwarminsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) +{ + return hashinsert_internal(rel, values, isnull, ht_ctid, heapRel, + checkUnique, indexInfo, true); + +} /* * hashgettuple() -- Get the next tuple in the scan. 
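A minimal sketch for illustration (not part of the patch): how a reader of a hash index tuple can tell a Red (WARM) pointer from a Blue one. The flag is carried on the index tuple's item pointer, exactly as hashinsert_internal() stores it above; the helper name is hypothetical.

static inline bool
is_red_pointer(IndexTuple itup)
{
	/* hypothetical helper; mirrors the flag check hashbucketcleanup() performs */
	return (ItemPointerGetFlags(&itup->t_tid) & HASH_INDEX_RED_POINTER) != 0;
}
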
@@ -738,6 +764,8 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, Page page; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; + OffsetNumber colorblue[MaxOffsetNumber]; + int ncolorblue = 0; bool retain_pin = false; vacuum_delay_point(); @@ -755,20 +783,35 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, IndexTuple itup; Bucket bucket; bool kill_tuple = false; + bool color_tuple = false; + int flags; + bool is_red; + IndexBulkDeleteCallbackResult result; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); + flags = ItemPointerGetFlags(&itup->t_tid); + is_red = ((flags & HASH_INDEX_RED_POINTER) != 0); + /* * To remove the dead tuples, we strictly want to rely on results * of callback function. refer btvacuumpage for detailed reason. */ - if (callback && callback(htup, callback_state)) + if (callback) { - kill_tuple = true; - if (tuples_removed) - *tuples_removed += 1; + result = callback(htup, is_red, callback_state); + if (result == IBDCR_DELETE) + { + kill_tuple = true; + if (tuples_removed) + *tuples_removed += 1; + } + else if (result == IBDCR_COLOR_BLUE) + { + color_tuple = true; + } } else if (split_cleanup) { @@ -791,6 +834,12 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, } } + if (color_tuple) + { + /* color the pointer blue */ + colorblue[ncolorblue++] = offno; + } + if (kill_tuple) { /* mark the item for deletion */ @@ -815,9 +864,24 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, /* * Apply deletions, advance to next page and write page if needed. */ - if (ndeletable > 0) + if (ndeletable > 0 || ncolorblue > 0) { - PageIndexMultiDelete(page, deletable, ndeletable); + /* + * Color the Red pointers Blue. + * + * We must do this before dealing with the dead items because + * PageIndexMultiDelete may move items around to compactify the + * array and hence offnums recorded earlier won't make any sense + * after PageIndexMultiDelete is called.. + */ + if (ncolorblue > 0) + _hash_color_items(page, colorblue, ncolorblue); + + /* + * And delete the deletable items + */ + if (ndeletable > 0) + PageIndexMultiDelete(page, deletable, ndeletable); bucket_dirty = true; MarkBufferDirty(buf); } diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index c73929c..7df3e12 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -1376,3 +1376,17 @@ _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access, return buf; } + +void _hash_color_items(Page page, OffsetNumber *coloritemnos, + uint16 ncoloritems) +{ + int i; + IndexTuple itup; + + for (i = 0; i < ncoloritems; i++) + { + itup = (IndexTuple) PageGetItem(page, + PageGetItemId(page, coloritemnos[i])); + ItemPointerClearFlags(&itup->t_tid); + } +} diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b9ff94d..0ffb9a9 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1958,17 +1958,32 @@ heap_fetch(Relation relation, } /* - * Check if the HOT chain containing this tid is actually a WARM chain. - * Note that even if the WARM update ultimately aborted, we still must do a - * recheck because the failing UPDATE when have inserted created index entries - * which are now stale, but still referencing this chain. + * Check status of a (possibly) WARM chain. 
+ * + * This function looks at a HOT/WARM chain starting at tid and return a bitmask + * of information. We only follow the chain as long as it's known to be valid + * HOT chain. Information returned by the function consists of: + * + * HCWC_WARM_TUPLE - a warm tuple is found somewhere in the chain. Note that + * when a tuple is WARM updated, both old and new versions + * of the tuple are treated as WARM tuple + * + * HCWC_RED_TUPLE - a warm tuple part of the Red chain is found somewhere in + * the chain. + * + * HCWC_BLUE_TUPLE - a warm tuple part of the Blue chain is found somewhere in + * the chain. + * + * If stop_at_warm is true, we stop when the first WARM tuple is found and + * return information collected so far. */ -static bool -hot_check_warm_chain(Page dp, ItemPointer tid) +HeapCheckWarmChainStatus +heap_check_warm_chain(Page dp, ItemPointer tid, bool stop_at_warm) { - TransactionId prev_xmax = InvalidTransactionId; - OffsetNumber offnum; - HeapTupleData heapTuple; + TransactionId prev_xmax = InvalidTransactionId; + OffsetNumber offnum; + HeapTupleData heapTuple; + HeapCheckWarmChainStatus status = 0; offnum = ItemPointerGetOffsetNumber(tid); heapTuple.t_self = *tid; @@ -1985,7 +2000,16 @@ hot_check_warm_chain(Page dp, ItemPointer tid) /* check for unused, dead, or redirected items */ if (!ItemIdIsNormal(lp)) + { + if (ItemIdIsRedirected(lp)) + { + /* Follow the redirect */ + offnum = ItemIdGetRedirect(lp); + continue; + } + /* else must be end of chain */ break; + } heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp); ItemPointerSetOffsetNumber(&heapTuple.t_self, offnum); @@ -2000,13 +2024,30 @@ hot_check_warm_chain(Page dp, ItemPointer tid) break; - /* - * Presence of either WARM or WARM updated tuple signals possible - * breakage and the caller must recheck tuple returned from this chain - * for index satisfaction - */ if (HeapTupleHeaderIsHeapWarmTuple(heapTuple.t_data)) - return true; + { + /* We found a WARM tuple */ + status |= HCWC_WARM_TUPLE; + + /* + * If we've been told to stop at the first WARM tuple, just return + * whatever information collected so far. + */ + if (stop_at_warm) + return status; + + /* + * If it's not a Red tuple, then it's definitely a Blue tuple. Set + * either of the bit correctly. + */ + if (HeapTupleHeaderIsWarmRed(heapTuple.t_data)) + status |= HCWC_RED_TUPLE; + else + status |= HCWC_BLUE_TUPLE; + } + else + /* Must be a tuple belonging to the Blue chain */ + status |= HCWC_BLUE_TUPLE; /* * Check to see if HOT chain continues past this tuple; if so fetch @@ -2026,7 +2067,94 @@ hot_check_warm_chain(Page dp, ItemPointer tid) } /* All OK. No need to recheck */ - return false; + return status; +} + +/* + * Scan through the WARM chain starting at tid and reset all WARM related + * flags. At the end, the chain will have all characteristics of a regular HOT + * chain. + * + * Return the number of cleared offnums. Cleared offnums are returned in the + * passed-in cleared_offnums array. The caller must ensure that the array is + * large enough to hold maximum offnums that can be cleared by this invokation + * of heap_clear_warm_chain(). 
+ */ +int +heap_clear_warm_chain(Page dp, ItemPointer tid, OffsetNumber *cleared_offnums) +{ + TransactionId prev_xmax = InvalidTransactionId; + OffsetNumber offnum; + HeapTupleData heapTuple; + int num_cleared = 0; + + offnum = ItemPointerGetOffsetNumber(tid); + heapTuple.t_self = *tid; + /* Scan through possible multiple members of HOT-chain */ + for (;;) + { + ItemId lp; + + /* check for bogus TID */ + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp)) + break; + + lp = PageGetItemId(dp, offnum); + + /* check for unused, dead, or redirected items */ + if (!ItemIdIsNormal(lp)) + { + if (ItemIdIsRedirected(lp)) + { + /* Follow the redirect */ + offnum = ItemIdGetRedirect(lp); + continue; + } + /* else must be end of chain */ + break; + } + + heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp); + ItemPointerSetOffsetNumber(&heapTuple.t_self, offnum); + + /* + * The xmin should match the previous xmax value, else chain is + * broken. + */ + if (TransactionIdIsValid(prev_xmax) && + !TransactionIdEquals(prev_xmax, + HeapTupleHeaderGetXmin(heapTuple.t_data))) + break; + + + /* + * Clear WARM and Red flags + */ + if (HeapTupleHeaderIsHeapWarmTuple(heapTuple.t_data)) + { + HeapTupleHeaderClearHeapWarmTuple(heapTuple.t_data); + HeapTupleHeaderClearWarmRed(heapTuple.t_data); + cleared_offnums[num_cleared++] = offnum; + } + + /* + * Check to see if HOT chain continues past this tuple; if so fetch + * the next offnum and loop around. + */ + if (!HeapTupleIsHotUpdated(&heapTuple)) + break; + + /* + * It can't be a HOT chain if the tuple contains root line pointer + */ + if (HeapTupleHeaderHasRootOffset(heapTuple.t_data)) + break; + + offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid); + prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple.t_data); + } + + return num_cleared; } /* @@ -2135,7 +2263,11 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, * possible improvements here */ if (recheck && *recheck == false) - *recheck = hot_check_warm_chain(dp, &heapTuple->t_self); + { + HeapCheckWarmChainStatus status; + status = heap_check_warm_chain(dp, &heapTuple->t_self, true); + *recheck = HCWC_IS_WARM(status); + } /* * When first_call is true (and thus, skip is initially false) we'll @@ -2888,7 +3020,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, { XLogRecPtr recptr; xl_heap_multi_insert *xlrec; - uint8 info = XLOG_HEAP2_MULTI_INSERT; + uint8 info = XLOG_HEAP_MULTI_INSERT; char *tupledata; int totaldatalen; char *scratchptr = scratch; @@ -2985,7 +3117,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP2_ID, info); + recptr = XLogInsert(RM_HEAP_ID, info); PageSetLSN(page, recptr); } @@ -3409,7 +3541,9 @@ l1: } /* store transaction information of xact deleting the tuple */ - tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + tp.t_data->t_infomask &= ~HEAP_XMAX_BITS; + if (HeapTupleHeaderIsMoved(tp.t_data)) + tp.t_data->t_infomask &= ~HEAP_MOVED; tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; tp.t_data->t_infomask |= new_infomask; tp.t_data->t_infomask2 |= new_infomask2; @@ -4172,7 +4306,9 @@ l2: START_CRIT_SECTION(); /* Clear obsolete visibility flags ... 
*/ - oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + oldtup.t_data->t_infomask &= ~HEAP_XMAX_BITS; + if (HeapTupleHeaderIsMoved(oldtup.t_data)) + oldtup.t_data->t_infomask &= ~HEAP_MOVED; oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; HeapTupleClearHotUpdated(&oldtup); /* ... and store info about transaction updating this tuple */ @@ -4419,6 +4555,16 @@ l2: } /* + * If the old tuple is already a member of the Red chain, mark the new + * tuple with the same flag + */ + if (HeapTupleIsHeapWarmTupleRed(&oldtup)) + { + HeapTupleSetHeapWarmTupleRed(heaptup); + HeapTupleSetHeapWarmTupleRed(newtup); + } + + /* * For HOT (or WARM) updated tuples, we store the offset of the root * line pointer of this chain in the ip_posid field of the new tuple. * Usually this information will be available in the corresponding @@ -4435,12 +4581,20 @@ l2: /* Mark the old tuple as HOT-updated */ HeapTupleSetHotUpdated(&oldtup); HeapTupleSetHeapWarmTuple(&oldtup); + /* And mark the new tuple as heap-only */ HeapTupleSetHeapOnly(heaptup); + /* Mark the new tuple as WARM tuple */ HeapTupleSetHeapWarmTuple(heaptup); + /* This update also starts a Red chain */ + HeapTupleSetHeapWarmTupleRed(heaptup); + Assert(!HeapTupleIsHeapWarmTupleRed(&oldtup)); + /* Mark the caller's copy too, in case different from heaptup */ HeapTupleSetHeapOnly(newtup); HeapTupleSetHeapWarmTuple(newtup); + HeapTupleSetHeapWarmTupleRed(newtup); + if (HeapTupleHeaderHasRootOffset(oldtup.t_data)) root_offnum = HeapTupleHeaderGetRootOffset(oldtup.t_data); else @@ -4459,6 +4613,8 @@ l2: HeapTupleClearHeapOnly(newtup); HeapTupleClearHeapWarmTuple(heaptup); HeapTupleClearHeapWarmTuple(newtup); + HeapTupleClearHeapWarmTupleRed(heaptup); + HeapTupleClearHeapWarmTupleRed(newtup); root_offnum = InvalidOffsetNumber; } @@ -4477,7 +4633,9 @@ l2: HeapTupleHeaderSetHeapLatest(newtup->t_data, root_offnum); /* Clear obsolete visibility flags, possibly set by ourselves above... */ - oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + oldtup.t_data->t_infomask &= ~HEAP_XMAX_BITS; + if (HeapTupleHeaderIsMoved(oldtup.t_data)) + oldtup.t_data->t_infomask &= ~HEAP_MOVED; oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; /* ... and store info about transaction updating this tuple */ Assert(TransactionIdIsValid(xmax_old_tuple)); @@ -6398,7 +6556,9 @@ heap_abort_speculative(Relation relation, HeapTuple tuple) PageSetPrunable(page, RecentGlobalXmin); /* store transaction information of xact deleting the tuple */ - tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + tp.t_data->t_infomask &= ~HEAP_XMAX_BITS; + if (HeapTupleHeaderIsMoved(tp.t_data)) + tp.t_data->t_infomask &= ~HEAP_MOVED; tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; /* @@ -6972,7 +7132,7 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, * Old-style VACUUM FULL is gone, but we have to keep this code as long as * we support having MOVED_OFF/MOVED_IN tuples in the database. */ - if (tuple->t_infomask & HEAP_MOVED) + if (HeapTupleHeaderIsMoved(tuple)) { xid = HeapTupleHeaderGetXvac(tuple); @@ -6991,7 +7151,7 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, * have failed; whereas a non-dead MOVED_IN tuple must mean the * xvac transaction succeeded. 
*/ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (HeapTupleHeaderIsMovedOff(tuple)) frz->frzflags |= XLH_INVALID_XVAC; else frz->frzflags |= XLH_FREEZE_XVAC; @@ -7461,7 +7621,7 @@ heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple) return true; } - if (tuple->t_infomask & HEAP_MOVED) + if (HeapTupleHeaderIsMoved(tuple)) { xid = HeapTupleHeaderGetXvac(tuple); if (TransactionIdIsNormal(xid)) @@ -7544,7 +7704,7 @@ heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, return true; } - if (tuple->t_infomask & HEAP_MOVED) + if (HeapTupleHeaderIsMoved(tuple)) { xid = HeapTupleHeaderGetXvac(tuple); if (TransactionIdIsNormal(xid) && @@ -7570,7 +7730,7 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple); TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - if (tuple->t_infomask & HEAP_MOVED) + if (HeapTupleHeaderIsMoved(tuple)) { if (TransactionIdPrecedes(*latestRemovedXid, xvac)) *latestRemovedXid = xvac; @@ -7619,6 +7779,36 @@ log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid) } /* + * Perform XLogInsert for a heap-warm-clear operation. Caller must already + * have modified the buffer and marked it dirty. + */ +XLogRecPtr +log_heap_warmclear(Relation reln, Buffer buffer, + OffsetNumber *cleared, int ncleared) +{ + xl_heap_warmclear xlrec; + XLogRecPtr recptr; + + /* Caller should not call me on a non-WAL-logged relation */ + Assert(RelationNeedsWAL(reln)); + + xlrec.ncleared = ncleared; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapWarmClear); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + if (ncleared > 0) + XLogRegisterBufData(0, (char *) cleared, + ncleared * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_WARMCLEAR); + + return recptr; +} + +/* * Perform XLogInsert for a heap-clean operation. Caller must already * have modified the buffer and marked it dirty. * @@ -8277,6 +8467,60 @@ heap_xlog_clean(XLogReaderState *record) XLogRecordPageWithFreeSpace(rnode, blkno, freespace); } + +/* + * Handles HEAP2_WARMCLEAR record type + */ +static void +heap_xlog_warmclear(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_warmclear *xlrec = (xl_heap_warmclear *) XLogRecGetData(record); + Buffer buffer; + RelFileNode rnode; + BlockNumber blkno; + XLogRedoAction action; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); + + /* + * If we have a full-page image, restore it (using a cleanup lock) and + * we're done. + */ + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, + &buffer); + if (action == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(buffer); + OffsetNumber *cleared; + int ncleared; + Size datalen; + int i; + + cleared = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); + + ncleared = xlrec->ncleared; + + for (i = 0; i < ncleared; i++) + { + ItemId lp; + OffsetNumber offnum = cleared[i]; + HeapTupleData heapTuple; + + lp = PageGetItemId(page, offnum); + heapTuple.t_data = (HeapTupleHeader) PageGetItem(page, lp); + + HeapTupleHeaderClearHeapWarmTuple(heapTuple.t_data); + HeapTupleHeaderClearWarmRed(heapTuple.t_data); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + /* * Replay XLOG_HEAP2_VISIBLE record. 
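A minimal sketch for illustration (not part of the patch): how the status bitmask returned by heap_check_warm_chain() is meant to be interpreted, mirroring the checks lazy_scan_heap() makes later in this patch; the helper name is hypothetical.

static bool
chain_is_conversion_candidate(Page page, ItemPointer root_tid)
{
	HeapCheckWarmChainStatus status;

	/* scan the whole chain; do not stop at the first WARM tuple */
	status = heap_check_warm_chain(page, root_tid, false);

	if (!HCWC_IS_WARM(status))
		return false;	/* plain HOT chain, nothing to convert */

	/* only chains that are entirely Red or entirely Blue can be converted */
	return HCWC_IS_ALL_RED(status) || HCWC_IS_ALL_BLUE(status);
}
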
* @@ -8523,7 +8767,9 @@ heap_xlog_delete(XLogReaderState *record) htup = (HeapTupleHeader) PageGetItem(page, lp); - htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask &= ~HEAP_XMAX_BITS; + if (HeapTupleHeaderIsMoved(htup)) + htup->t_infomask &= ~HEAP_MOVED; htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; HeapTupleHeaderClearHotUpdated(htup); fix_infomask_from_infobits(xlrec->infobits_set, @@ -9186,7 +9432,9 @@ heap_xlog_lock(XLogReaderState *record) htup = (HeapTupleHeader) PageGetItem(page, lp); - htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask &= ~HEAP_XMAX_BITS; + if (HeapTupleHeaderIsMoved(htup)) + htup->t_infomask &= ~HEAP_MOVED; htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, &htup->t_infomask2); @@ -9265,7 +9513,9 @@ heap_xlog_lock_updated(XLogReaderState *record) htup = (HeapTupleHeader) PageGetItem(page, lp); - htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask &= ~HEAP_XMAX_BITS; + if (HeapTupleHeaderIsMoved(htup)) + htup->t_infomask &= ~HEAP_MOVED; htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, &htup->t_infomask2); @@ -9334,6 +9584,9 @@ heap_redo(XLogReaderState *record) case XLOG_HEAP_INSERT: heap_xlog_insert(record); break; + case XLOG_HEAP_MULTI_INSERT: + heap_xlog_multi_insert(record); + break; case XLOG_HEAP_DELETE: heap_xlog_delete(record); break; @@ -9362,7 +9615,7 @@ heap2_redo(XLogReaderState *record) { uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; - switch (info & XLOG_HEAP_OPMASK) + switch (info & XLOG_HEAP2_OPMASK) { case XLOG_HEAP2_CLEAN: heap_xlog_clean(record); @@ -9376,9 +9629,6 @@ heap2_redo(XLogReaderState *record) case XLOG_HEAP2_VISIBLE: heap_xlog_visible(record); break; - case XLOG_HEAP2_MULTI_INSERT: - heap_xlog_multi_insert(record); - break; case XLOG_HEAP2_LOCK_UPDATED: heap_xlog_lock_updated(record); break; @@ -9392,6 +9642,9 @@ heap2_redo(XLogReaderState *record) case XLOG_HEAP2_REWRITE: heap_xlog_logical_rewrite(record); break; + case XLOG_HEAP2_WARMCLEAR: + heap_xlog_warmclear(record); + break; default: elog(PANIC, "heap2_redo: unknown op code %u", info); } diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c index 19e7048..47b01eb 100644 --- a/src/backend/access/heap/tuptoaster.c +++ b/src/backend/access/heap/tuptoaster.c @@ -1620,7 +1620,8 @@ toast_save_datum(Relation rel, Datum value, toastrel, toastidxs[i]->rd_index->indisunique ? 
UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, - NULL); + NULL, + false); } /* diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index da6c252..e0553d0 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -199,7 +199,8 @@ index_insert(Relation indexRelation, ItemPointer heap_t_ctid, Relation heapRelation, IndexUniqueCheck checkUnique, - IndexInfo *indexInfo) + IndexInfo *indexInfo, + bool warm_update) { RELATION_CHECKS; CHECK_REL_PROCEDURE(aminsert); @@ -209,6 +210,12 @@ index_insert(Relation indexRelation, (HeapTuple) NULL, InvalidBuffer); + if (warm_update) + { + Assert(indexRelation->rd_amroutine->amwarminsert != NULL); + return indexRelation->rd_amroutine->amwarminsert(indexRelation, values, + isnull, heap_t_ctid, heapRelation, checkUnique, indexInfo); + } return indexRelation->rd_amroutine->aminsert(indexRelation, values, isnull, heap_t_ctid, heapRelation, checkUnique, indexInfo); diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index f815fd4..7959155 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -766,11 +766,12 @@ _bt_page_recyclable(Page page) } /* - * Delete item(s) from a btree page during VACUUM. + * Delete item(s) and color item(s) blue on a btree page during VACUUM. * * This must only be used for deleting leaf items. Deleting an item on a * non-leaf page has to be done as part of an atomic action that includes - * deleting the page it points to. + * deleting the page it points to. We don't ever color pointers on a non-leaf + * page. * * This routine assumes that the caller has pinned and locked the buffer. * Also, the given itemnos *must* appear in increasing order in the array. @@ -786,9 +787,9 @@ _bt_page_recyclable(Page page) * ensure correct locking. */ void -_bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, - BlockNumber lastBlockVacuumed) +_bt_handleitems_vacuum(Relation rel, Buffer buf, + OffsetNumber *delitemnos, int ndelitems, + OffsetNumber *coloritemnos, int ncoloritems) { Page page = BufferGetPage(buf); BTPageOpaque opaque; @@ -796,9 +797,20 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); + /* + * Color the Red pointers Blue. + * + * We must do this before dealing with the dead items because + * PageIndexMultiDelete may move items around to compactify the array and + * hence offnums recorded earlier won't make any sense after + * PageIndexMultiDelete is called.. + */ + if (ncoloritems > 0) + _bt_color_items(page, coloritemnos, ncoloritems); + /* Fix the page */ - if (nitems > 0) - PageIndexMultiDelete(page, itemnos, nitems); + if (ndelitems > 0) + PageIndexMultiDelete(page, delitemnos, ndelitems); /* * We can clear the vacuum cycle ID since this page has certainly been @@ -824,7 +836,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, XLogRecPtr recptr; xl_btree_vacuum xlrec_vacuum; - xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; + xlrec_vacuum.ndelitems = ndelitems; + xlrec_vacuum.ncoloritems = ncoloritems; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); @@ -835,8 +848,11 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * is. When XLogInsert stores the whole buffer, the offsets array * need not be stored too. 
*/ - if (nitems > 0) - XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber)); + if (ndelitems > 0) + XLogRegisterBufData(0, (char *) delitemnos, ndelitems * sizeof(OffsetNumber)); + + if (ncoloritems > 0) + XLogRegisterBufData(0, (char *) coloritemnos, ncoloritems * sizeof(OffsetNumber)); recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); @@ -1882,3 +1898,18 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) return true; } + +void +_bt_color_items(Page page, OffsetNumber *coloritemnos, uint16 ncoloritems) +{ + int i; + ItemId itemid; + IndexTuple itup; + + for (i = 0; i < ncoloritems; i++) + { + itemid = PageGetItemId(page, coloritemnos[i]); + itup = (IndexTuple) PageGetItem(page, itemid); + ItemPointerClearFlags(&itup->t_tid); + } +} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 952ed8f..92f490e 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -147,6 +147,7 @@ bthandler(PG_FUNCTION_ARGS) amroutine->ambuild = btbuild; amroutine->ambuildempty = btbuildempty; amroutine->aminsert = btinsert; + amroutine->amwarminsert = btwarminsert; amroutine->ambulkdelete = btbulkdelete; amroutine->amvacuumcleanup = btvacuumcleanup; amroutine->amcanreturn = btcanreturn; @@ -317,11 +318,12 @@ btbuildempty(Relation index) * Descend the tree recursively, find the appropriate location for our * new tuple, and put it there. */ -bool -btinsert(Relation rel, Datum *values, bool *isnull, +static bool +btinsert_internal(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, - IndexInfo *indexInfo) + IndexInfo *indexInfo, + bool warm_update) { bool result; IndexTuple itup; @@ -330,6 +332,11 @@ btinsert(Relation rel, Datum *values, bool *isnull, itup = index_form_tuple(RelationGetDescr(rel), values, isnull); itup->t_tid = *ht_ctid; + if (warm_update) + ItemPointerSetFlags(&itup->t_tid, BTREE_INDEX_RED_POINTER); + else + ItemPointerClearFlags(&itup->t_tid); + result = _bt_doinsert(rel, itup, checkUnique, heapRel); pfree(itup); @@ -337,6 +344,26 @@ btinsert(Relation rel, Datum *values, bool *isnull, return result; } +bool +btinsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) +{ + return btinsert_internal(rel, values, isnull, ht_ctid, heapRel, + checkUnique, indexInfo, false); +} + +bool +btwarminsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) +{ + return btinsert_internal(rel, values, isnull, ht_ctid, heapRel, + checkUnique, indexInfo, true); +} + /* * btgettuple() -- Get the next tuple in the scan. */ @@ -1106,7 +1133,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _bt_checkpage(rel, buf); - _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); + _bt_handleitems_vacuum(rel, buf, NULL, 0, NULL, 0); _bt_relbuf(rel, buf); } @@ -1204,6 +1231,8 @@ restart: { OffsetNumber deletable[MaxOffsetNumber]; int ndeletable; + OffsetNumber colorblue[MaxOffsetNumber]; + int ncolorblue; OffsetNumber offnum, minoff, maxoff; @@ -1242,7 +1271,7 @@ restart: * Scan over all items to see which ones need deleted according to the * callback function. 
*/ - ndeletable = 0; + ndeletable = ncolorblue = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) @@ -1253,6 +1282,9 @@ restart: { IndexTuple itup; ItemPointer htup; + int flags; + bool is_red = false; + IndexBulkDeleteCallbackResult result; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); @@ -1279,16 +1311,36 @@ restart: * applies to *any* type of index that marks index tuples as * killed. */ - if (callback(htup, callback_state)) + flags = ItemPointerGetFlags(&itup->t_tid); + is_red = ((flags & BTREE_INDEX_RED_POINTER) != 0); + + if (is_red) + stats->num_red_pointers++; + else + stats->num_blue_pointers++; + + result = callback(htup, is_red, callback_state); + if (result == IBDCR_DELETE) + { + if (is_red) + stats->red_pointers_removed++; + else + stats->blue_pointers_removed++; deletable[ndeletable++] = offnum; + } + else if (result == IBDCR_COLOR_BLUE) + { + colorblue[ncolorblue++] = offnum; + } } } /* - * Apply any needed deletes. We issue just one _bt_delitems_vacuum() - * call per page, so as to minimize WAL traffic. + * Apply any needed deletes and coloring. We issue just one + * _bt_handleitems_vacuum() call per page, so as to minimize WAL + * traffic. */ - if (ndeletable > 0) + if (ndeletable > 0 || ncolorblue > 0) { /* * Notice that the issued XLOG_BTREE_VACUUM WAL record includes @@ -1304,8 +1356,8 @@ restart: * doesn't seem worth the amount of bookkeeping it'd take to avoid * that. */ - _bt_delitems_vacuum(rel, buf, deletable, ndeletable, - vstate->lastBlockVacuumed); + _bt_handleitems_vacuum(rel, buf, deletable, ndeletable, + colorblue, ncolorblue); /* * Remember highest leaf page number we've issued a @@ -1315,6 +1367,7 @@ restart: vstate->lastBlockVacuumed = blkno; stats->tuples_removed += ndeletable; + stats->pointers_colored += ncolorblue; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); } diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index ac60db0..916c76e 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -390,83 +390,9 @@ btree_xlog_vacuum(XLogReaderState *record) Buffer buffer; Page page; BTPageOpaque opaque; -#ifdef UNUSED xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); /* - * This section of code is thought to be no longer needed, after analysis - * of the calling paths. It is retained to allow the code to be reinstated - * if a flaw is revealed in that thinking. - * - * If we are running non-MVCC scans using this index we need to do some - * additional work to ensure correctness, which is known as a "pin scan" - * described in more detail in next paragraphs. We used to do the extra - * work in all cases, whereas we now avoid that work in most cases. If - * lastBlockVacuumed is set to InvalidBlockNumber then we skip the - * additional work required for the pin scan. - * - * Avoiding this extra work is important since it requires us to touch - * every page in the index, so is an O(N) operation. Worse, it is an - * operation performed in the foreground during redo, so it delays - * replication directly. - * - * If queries might be active then we need to ensure every leaf page is - * unpinned between the lastBlockVacuumed and the current block, if there - * are any. 
This prevents replay of the VACUUM from reaching the stage of - * removing heap tuples while there could still be indexscans "in flight" - * to those particular tuples for those scans which could be confused by - * finding new tuples at the old TID locations (see nbtree/README). - * - * It might be worth checking if there are actually any backends running; - * if not, we could just skip this. - * - * Since VACUUM can visit leaf pages out-of-order, it might issue records - * with lastBlockVacuumed >= block; that's not an error, it just means - * nothing to do now. - * - * Note: since we touch all pages in the range, we will lock non-leaf - * pages, and also any empty (all-zero) pages that may be in the index. It - * doesn't seem worth the complexity to avoid that. But it's important - * that HotStandbyActiveInReplay() will not return true if the database - * isn't yet consistent; so we need not fear reading still-corrupt blocks - * here during crash recovery. - */ - if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed)) - { - RelFileNode thisrnode; - BlockNumber thisblkno; - BlockNumber blkno; - - XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno); - - for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++) - { - /* - * We use RBM_NORMAL_NO_LOG mode because it's not an error - * condition to see all-zero pages. The original btvacuumpage - * scan would have skipped over all-zero pages, noting them in FSM - * but not bothering to initialize them just yet; so we mustn't - * throw an error here. (We could skip acquiring the cleanup lock - * if PageIsNew, but it's probably not worth the cycles to test.) - * - * XXX we don't actually need to read the block, we just need to - * confirm it is unpinned. If we had a special call into the - * buffer manager we could optimise this so that if the block is - * not in shared_buffers we confirm it as unpinned. Optimizing - * this is now moot, since in most cases we avoid the scan. - */ - buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno, - RBM_NORMAL_NO_LOG); - if (BufferIsValid(buffer)) - { - LockBufferForCleanup(buffer); - UnlockReleaseBuffer(buffer); - } - } - } -#endif - - /* * Like in btvacuumpage(), we need to take a cleanup lock on every leaf * page. See nbtree/README for details. */ @@ -482,19 +408,30 @@ btree_xlog_vacuum(XLogReaderState *record) if (len > 0) { - OffsetNumber *unused; - OffsetNumber *unend; + OffsetNumber *offnums = (OffsetNumber *) ptr; - unused = (OffsetNumber *) ptr; - unend = (OffsetNumber *) ((char *) ptr + len); + /* + * Color the Red pointers Blue. + * + * We must do this before dealing with the dead items because + * PageIndexMultiDelete may move items around to compactify the + * array and hence offnums recorded earlier won't make any sense + * after PageIndexMultiDelete is called.. + */ + if (xlrec->ncoloritems > 0) + _bt_color_items(page, offnums + xlrec->ndelitems, + xlrec->ncoloritems); - if ((unend - unused) > 0) - PageIndexMultiDelete(page, unused, unend - unused); + /* + * And handle the deleted items too + */ + if (xlrec->ndelitems > 0) + PageIndexMultiDelete(page, offnums, xlrec->ndelitems); } /* * Mark the page as not containing any LP_DEAD items --- see comments - * in _bt_delitems_vacuum(). + * in _bt_handleitems_vacuum(). 
*/ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HAS_GARBAGE; diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index 44d2d63..d373e61 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -44,6 +44,12 @@ heap_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, "off %u", xlrec->offnum); } + else if (info == XLOG_HEAP_MULTI_INSERT) + { + xl_heap_multi_insert *xlrec = (xl_heap_multi_insert *) rec; + + appendStringInfo(buf, "%d tuples", xlrec->ntuples); + } else if (info == XLOG_HEAP_DELETE) { xl_heap_delete *xlrec = (xl_heap_delete *) rec; @@ -102,7 +108,7 @@ heap2_desc(StringInfo buf, XLogReaderState *record) char *rec = XLogRecGetData(record); uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; - info &= XLOG_HEAP_OPMASK; + info &= XLOG_HEAP2_OPMASK; if (info == XLOG_HEAP2_CLEAN) { xl_heap_clean *xlrec = (xl_heap_clean *) rec; @@ -129,12 +135,6 @@ heap2_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, "cutoff xid %u flags %d", xlrec->cutoff_xid, xlrec->flags); } - else if (info == XLOG_HEAP2_MULTI_INSERT) - { - xl_heap_multi_insert *xlrec = (xl_heap_multi_insert *) rec; - - appendStringInfo(buf, "%d tuples", xlrec->ntuples); - } else if (info == XLOG_HEAP2_LOCK_UPDATED) { xl_heap_lock_updated *xlrec = (xl_heap_lock_updated *) rec; @@ -171,6 +171,12 @@ heap_identify(uint8 info) case XLOG_HEAP_INSERT | XLOG_HEAP_INIT_PAGE: id = "INSERT+INIT"; break; + case XLOG_HEAP_MULTI_INSERT: + id = "MULTI_INSERT"; + break; + case XLOG_HEAP_MULTI_INSERT | XLOG_HEAP_INIT_PAGE: + id = "MULTI_INSERT+INIT"; + break; case XLOG_HEAP_DELETE: id = "DELETE"; break; @@ -219,12 +225,6 @@ heap2_identify(uint8 info) case XLOG_HEAP2_VISIBLE: id = "VISIBLE"; break; - case XLOG_HEAP2_MULTI_INSERT: - id = "MULTI_INSERT"; - break; - case XLOG_HEAP2_MULTI_INSERT | XLOG_HEAP_INIT_PAGE: - id = "MULTI_INSERT+INIT"; - break; case XLOG_HEAP2_LOCK_UPDATED: id = "LOCK_UPDATED"; break; diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index fbde9d6..0e9a2eb 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -48,8 +48,8 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "lastBlockVacuumed %u", - xlrec->lastBlockVacuumed); + appendStringInfo(buf, "ndelitems %u, ncoloritems %u", + xlrec->ndelitems, xlrec->ncoloritems); break; } case XLOG_BTREE_DELETE: diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index cce9b3f..5343b10 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -155,7 +155,8 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, { Assert(ItemPointerIsValid(<->heapPtr)); - if (bds->callback(<->heapPtr, bds->callback_state)) + if (bds->callback(<->heapPtr, false, bds->callback_state) == + IBDCR_DELETE) { bds->stats->tuples_removed += 1; deletable[i] = true; @@ -425,7 +426,8 @@ vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer) { Assert(ItemPointerIsValid(<->heapPtr)); - if (bds->callback(<->heapPtr, bds->callback_state)) + if (bds->callback(<->heapPtr, false, bds->callback_state) == + IBDCR_DELETE) { bds->stats->tuples_removed += 1; toDelete[xlrec.nDelete] = i; @@ -902,10 +904,10 @@ spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, } /* Dummy callback to delete 
no tuples during spgvacuumcleanup */ -static bool -dummy_callback(ItemPointer itemptr, void *state) +static IndexBulkDeleteCallbackResult +dummy_callback(ItemPointer itemptr, bool is_red, void *state) { - return false; + return IBDCR_KEEP; } /* diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 049eb28..166efd8 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -115,7 +115,7 @@ static void IndexCheckExclusion(Relation heapRelation, IndexInfo *indexInfo); static inline int64 itemptr_encode(ItemPointer itemptr); static inline void itemptr_decode(ItemPointer itemptr, int64 encoded); -static bool validate_index_callback(ItemPointer itemptr, void *opaque); +static IndexBulkDeleteCallbackResult validate_index_callback(ItemPointer itemptr, bool is_red, void *opaque); static void validate_index_heapscan(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo, @@ -2949,15 +2949,15 @@ itemptr_decode(ItemPointer itemptr, int64 encoded) /* * validate_index_callback - bulkdelete callback to collect the index TIDs */ -static bool -validate_index_callback(ItemPointer itemptr, void *opaque) +static IndexBulkDeleteCallbackResult +validate_index_callback(ItemPointer itemptr, bool is_red, void *opaque) { v_i_state *state = (v_i_state *) opaque; int64 encoded = itemptr_encode(itemptr); tuplesort_putdatum(state->tuplesort, Int64GetDatum(encoded), false); state->itups += 1; - return false; /* never actually delete anything */ + return IBDCR_KEEP; /* never actually delete anything */ } /* @@ -3178,7 +3178,8 @@ validate_index_heapscan(Relation heapRelation, heapRelation, indexInfo->ii_Unique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, - indexInfo); + indexInfo, + false); state->tups_inserted += 1; } diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index 970254f..6392f33 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -172,7 +172,8 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple, heapRelation, relationDescs[i]->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, - indexInfo); + indexInfo, + warm_update); } ExecDropSingleTupleTableSlot(slot); @@ -222,7 +223,7 @@ CatalogTupleInsertWithInfo(Relation heapRel, HeapTuple tup, oid = simple_heap_insert(heapRel, tup); - CatalogIndexInsert(indstate, tup, false, NULL); + CatalogIndexInsert(indstate, tup, NULL, false); return oid; } diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index d9c0fe7..330b661 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -168,7 +168,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) */ index_insert(indexRel, values, isnull, &(new_row->t_self), trigdata->tg_relation, UNIQUE_CHECK_EXISTING, - indexInfo); + indexInfo, + false); } else { diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 7376099..deb76cb 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -104,6 +104,25 @@ */ #define PREFETCH_SIZE ((BlockNumber) 32) +/* + * Structure to track WARM chains that can be converted into HOT chains during + * this run. + * + * To reduce space requirement, we're using bitfields. But the way things are + * laid down, we're still wasting 1-byte per candidate chain. + */ +typedef struct LVRedBlueChain +{ + ItemPointerData chain_tid; /* root of the chain */ + uint8 is_red_chain:2; /* is the WARM chain complete red ? 
*/ + uint8 keep_warm_chain:2; /* this chain can't be cleared of WARM + * tuples */ + uint8 num_blue_pointers:2;/* number of blue pointers found so + * far */ + uint8 num_red_pointers:2; /* number of red pointers found so far + * in the current index */ +} LVRedBlueChain; + typedef struct LVRelStats { /* hasindex = true means two-pass strategy; false means one-pass */ @@ -121,6 +140,16 @@ typedef struct LVRelStats BlockNumber pages_removed; double tuples_deleted; BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ + + double num_warm_chains; /* number of warm chains seen so far */ + + /* List of WARM chains that can be converted into HOT chains */ + /* NB: this list is ordered by TID of the root pointers */ + int num_redblue_chains; /* current # of entries */ + int max_redblue_chains; /* # slots allocated in array */ + LVRedBlueChain *redblue_chains; /* array of LVRedBlueChain */ + double num_non_convertible_warm_chains; + /* List of TIDs of tuples we intend to delete */ /* NB: this list is ordered by TID address */ int num_dead_tuples; /* current # of entries */ @@ -149,6 +178,7 @@ static void lazy_scan_heap(Relation onerel, int options, static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats); static bool lazy_check_needs_freeze(Buffer buf, bool *hastup); static void lazy_vacuum_index(Relation indrel, + bool clear_warm, IndexBulkDeleteResult **stats, LVRelStats *vacrelstats); static void lazy_cleanup_index(Relation indrel, @@ -156,6 +186,10 @@ static void lazy_cleanup_index(Relation indrel, LVRelStats *vacrelstats); static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer); +static int lazy_warmclear_page(Relation onerel, BlockNumber blkno, + Buffer buffer, int chainindex, LVRelStats *vacrelstats, + Buffer *vmbuffer, bool check_all_visible); +static void lazy_reset_redblue_pointer_count(LVRelStats *vacrelstats); static bool should_attempt_truncation(LVRelStats *vacrelstats); static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats); static BlockNumber count_nondeletable_pages(Relation onerel, @@ -163,8 +197,15 @@ static BlockNumber count_nondeletable_pages(Relation onerel, static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks); static void lazy_record_dead_tuple(LVRelStats *vacrelstats, ItemPointer itemptr); -static bool lazy_tid_reaped(ItemPointer itemptr, void *state); +static void lazy_record_red_chain(LVRelStats *vacrelstats, + ItemPointer itemptr); +static void lazy_record_blue_chain(LVRelStats *vacrelstats, + ItemPointer itemptr); +static IndexBulkDeleteCallbackResult lazy_tid_reaped(ItemPointer itemptr, bool is_red, void *state); +static IndexBulkDeleteCallbackResult lazy_indexvac_phase1(ItemPointer itemptr, bool is_red, void *state); +static IndexBulkDeleteCallbackResult lazy_indexvac_phase2(ItemPointer itemptr, bool is_red, void *state); static int vac_cmp_itemptr(const void *left, const void *right); +static int vac_cmp_redblue_chain(const void *left, const void *right); static bool heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cutoff_xid, bool *all_frozen); @@ -684,8 +725,10 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. 
*/ - if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && - vacrelstats->num_dead_tuples > 0) + if (((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && + vacrelstats->num_dead_tuples > 0) || + ((vacrelstats->max_redblue_chains - vacrelstats->num_redblue_chains) < MaxHeapTuplesPerPage && + vacrelstats->num_redblue_chains > 0)) { const int hvp_index[] = { PROGRESS_VACUUM_PHASE, @@ -715,6 +758,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], + (vacrelstats->num_redblue_chains > 0), &indstats[i], vacrelstats); @@ -737,6 +781,9 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * valid. */ vacrelstats->num_dead_tuples = 0; + vacrelstats->num_redblue_chains = 0; + memset(vacrelstats->redblue_chains, 0, + vacrelstats->max_redblue_chains * sizeof (LVRedBlueChain)); vacrelstats->num_index_scans++; /* Report that we are once again scanning the heap */ @@ -940,15 +987,33 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, continue; } + ItemPointerSet(&(tuple.t_self), blkno, offnum); + /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { + HeapCheckWarmChainStatus status = heap_check_warm_chain(page, + &tuple.t_self, false); + if (HCWC_IS_WARM(status)) + { + vacrelstats->num_warm_chains++; + + /* + * A chain which is either complete Red or Blue is a + * candidate for chain conversion. Remember the chain and + * its color. + */ + if (HCWC_IS_ALL_RED(status)) + lazy_record_red_chain(vacrelstats, &tuple.t_self); + else if (HCWC_IS_ALL_BLUE(status)) + lazy_record_blue_chain(vacrelstats, &tuple.t_self); + else + vacrelstats->num_non_convertible_warm_chains++; + } hastup = true; /* this page won't be truncatable */ continue; } - ItemPointerSet(&(tuple.t_self), blkno, offnum); - /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at @@ -968,6 +1033,28 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = RelationGetRelid(onerel); + if (!HeapTupleIsHeapOnly(&tuple)) + { + HeapCheckWarmChainStatus status = heap_check_warm_chain(page, + &tuple.t_self, false); + if (HCWC_IS_WARM(status)) + { + vacrelstats->num_warm_chains++; + + /* + * A chain which is either complete Red or Blue is a + * candidate for chain conversion. Remember the chain and + * its color. + */ + if (HCWC_IS_ALL_RED(status)) + lazy_record_red_chain(vacrelstats, &tuple.t_self); + else if (HCWC_IS_ALL_BLUE(status)) + lazy_record_blue_chain(vacrelstats, &tuple.t_self); + else + vacrelstats->num_non_convertible_warm_chains++; + } + } + tupgone = false; switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) @@ -1288,7 +1375,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? 
*/ - if (vacrelstats->num_dead_tuples > 0) + if (vacrelstats->num_dead_tuples > 0 || vacrelstats->num_redblue_chains > 0) { const int hvp_index[] = { PROGRESS_VACUUM_PHASE, @@ -1306,6 +1393,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], + (vacrelstats->num_redblue_chains > 0), &indstats[i], vacrelstats); @@ -1373,7 +1461,10 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * * This routine marks dead tuples as unused and compacts out free * space on their pages. Pages not having dead tuples recorded from - * lazy_scan_heap are not visited at all. + * lazy_scan_heap are not visited at all. This routine also converts + * candidate WARM chains to HOT chains by clearing WARM related flags. The + * candidate chains are determined by the preceeding index scans after + * looking at the data collected by the first heap scan. * * Note: the reason for doing this as a second pass is we cannot remove * the tuples until we've removed their index entries, and we want to @@ -1382,7 +1473,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) { - int tupindex; + int tupindex, chainindex; int npages; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; @@ -1391,33 +1482,69 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) npages = 0; tupindex = 0; - while (tupindex < vacrelstats->num_dead_tuples) + chainindex = 0; + while (tupindex < vacrelstats->num_dead_tuples || + chainindex < vacrelstats->num_redblue_chains) { - BlockNumber tblk; + BlockNumber tblk, chainblk, vacblk; Buffer buf; Page page; Size freespace; vacuum_delay_point(); - tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); - buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL, + tblk = chainblk = InvalidBlockNumber; + if (chainindex < vacrelstats->num_redblue_chains) + chainblk = + ItemPointerGetBlockNumber(&(vacrelstats->redblue_chains[chainindex].chain_tid)); + + if (tupindex < vacrelstats->num_dead_tuples) + tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); + + if (tblk == InvalidBlockNumber) + vacblk = chainblk; + else if (chainblk == InvalidBlockNumber) + vacblk = tblk; + else + vacblk = Min(chainblk, tblk); + + Assert(vacblk != InvalidBlockNumber); + + buf = ReadBufferExtended(onerel, MAIN_FORKNUM, vacblk, RBM_NORMAL, vac_strategy); - if (!ConditionalLockBufferForCleanup(buf)) + + + if (vacblk == chainblk) + LockBufferForCleanup(buf); + else if (!ConditionalLockBufferForCleanup(buf)) { ReleaseBuffer(buf); ++tupindex; continue; } - tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats, - &vmbuffer); + + /* + * Convert WARM chains on this page. This should be done before + * vacuuming the page to ensure that we can correctly set visibility + * bits after clearing WARM chains. + * + * If we are going to vacuum this page then don't check for + * all-visibility just yet. 
+ */ + if (vacblk == chainblk) + chainindex = lazy_warmclear_page(onerel, chainblk, buf, chainindex, + vacrelstats, &vmbuffer, chainblk != tblk); + + if (vacblk == tblk) + tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats, + &vmbuffer); /* Now that we've compacted the page, record its available space */ page = BufferGetPage(buf); freespace = PageGetHeapFreeSpace(page); UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(onerel, tblk, freespace); + RecordPageWithFreeSpace(onerel, vacblk, freespace); npages++; } @@ -1436,6 +1563,107 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) } /* + * lazy_warmclear_page() -- clear WARM flag and mark chains blue when possible + * + * Caller must hold pin and buffer cleanup lock on the buffer. + * + * chainindex is the index in vacrelstats->redblue_chains of the first dead + * tuple for this page. We assume the rest follow sequentially. + * The return value is the first tupindex after the tuples of this page. + * + * If check_all_visible is set then we also check if the page has now become + * all visible and update visibility map. + */ +static int +lazy_warmclear_page(Relation onerel, BlockNumber blkno, Buffer buffer, + int chainindex, LVRelStats *vacrelstats, Buffer *vmbuffer, + bool check_all_visible) +{ + Page page = BufferGetPage(buffer); + OffsetNumber cleared_offnums[MaxHeapTuplesPerPage]; + int num_cleared = 0; + TransactionId visibility_cutoff_xid; + bool all_frozen; + + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_WARMCLEARED, blkno); + + START_CRIT_SECTION(); + + for (; chainindex < vacrelstats->num_redblue_chains ; chainindex++) + { + BlockNumber tblk; + LVRedBlueChain *chain; + + chain = &vacrelstats->redblue_chains[chainindex]; + + tblk = ItemPointerGetBlockNumber(&chain->chain_tid); + if (tblk != blkno) + break; /* past end of tuples for this block */ + + /* + * Since a heap page can have no more than MaxHeapTuplesPerPage + * offnums and we process each offnum only once, MaxHeapTuplesPerPage + * size array should be enough to hold all cleared tuples in this page. + */ + if (!chain->keep_warm_chain) + num_cleared += heap_clear_warm_chain(page, &chain->chain_tid, + cleared_offnums + num_cleared); + } + + /* + * Mark buffer dirty before we write WAL. + */ + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(onerel)) + { + XLogRecPtr recptr; + + recptr = log_heap_warmclear(onerel, buffer, + cleared_offnums, num_cleared); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* If not checking for all-visibility then we're done */ + if (!check_all_visible) + return chainindex; + + /* + * The following code should match the corresponding code in + * lazy_vacuum_page + **/ + if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid, + &all_frozen)) + PageSetAllVisible(page); + + /* + * All the changes to the heap page have been done. If the all-visible + * flag is now set, also set the VM all-visible bit (and, if possible, the + * all-frozen bit) unless this has already been done previously. 
+ */ + if (PageIsAllVisible(page)) + { + uint8 vm_status = visibilitymap_get_status(onerel, blkno, vmbuffer); + uint8 flags = 0; + + /* Set the VM all-frozen bit to flag, if needed */ + if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0) + flags |= VISIBILITYMAP_ALL_VISIBLE; + if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen) + flags |= VISIBILITYMAP_ALL_FROZEN; + + Assert(BufferIsValid(*vmbuffer)); + if (flags != 0) + visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, + *vmbuffer, visibility_cutoff_xid, flags); + } + return chainindex; +} + +/* * lazy_vacuum_page() -- free dead tuples on a page * and repair its fragmentation. * @@ -1588,6 +1816,16 @@ lazy_check_needs_freeze(Buffer buf, bool *hastup) return false; } +static void +lazy_reset_redblue_pointer_count(LVRelStats *vacrelstats) +{ + int i; + for (i = 0; i < vacrelstats->num_redblue_chains; i++) + { + LVRedBlueChain *chain = &vacrelstats->redblue_chains[i]; + chain->num_blue_pointers = chain->num_red_pointers = 0; + } +} /* * lazy_vacuum_index() -- vacuum one index relation. @@ -1597,6 +1835,7 @@ lazy_check_needs_freeze(Buffer buf, bool *hastup) */ static void lazy_vacuum_index(Relation indrel, + bool clear_warm, IndexBulkDeleteResult **stats, LVRelStats *vacrelstats) { @@ -1612,15 +1851,81 @@ lazy_vacuum_index(Relation indrel, ivinfo.num_heap_tuples = vacrelstats->old_rel_tuples; ivinfo.strategy = vac_strategy; - /* Do bulk deletion */ - *stats = index_bulk_delete(&ivinfo, *stats, - lazy_tid_reaped, (void *) vacrelstats); + /* + * If told, convert WARM chains into HOT chains. + * + * We must have already collected candidate WARM chains, i.e. chains which + * have either only Red or only Blue tuples, but not a mix of both. + * + * This works in two phases. In the first phase, we do a complete index + * scan and collect information about index pointers to the candidate + * chains, but we don't do conversion. To be precise, we count the number + * of Blue and Red index pointers to each candidate chain and use that + * knowledge to arrive at a decision and do the actual conversion during + * the second phase (we kill known dead pointers though in this phase). + * + * In the second phase, for each Red chain we check if we have seen a Red + * index pointer. For such chains, we kill the Blue pointer and color the + * Red pointer Blue. The heap tuples are marked Blue in the second heap + * scan. If we did not find any Red pointer to a Red chain, that means that + * the chain is reachable from the Blue pointer (because, say, the WARM update + * did not add a new entry for this index). In that case, we do nothing. + * There is a third case where we find more than one Blue pointer to a Red + * chain. This can happen because of aborted vacuums. We don't handle that + * case yet, but it should be possible to apply the same recheck logic and + * find which of the Blue pointers is redundant and should be removed. + * + * For Blue chains, we just kill the Red pointer, if it exists, and keep the + * Blue pointer.
+ */ + if (clear_warm) + { + lazy_reset_redblue_pointer_count(vacrelstats); + *stats = index_bulk_delete(&ivinfo, *stats, + lazy_indexvac_phase1, (void *) vacrelstats); + ereport(elevel, + (errmsg("scanned index \"%s\" to remove %d row version, found " + "%0.f red pointers, %0.f blue pointers, removed " + "%0.f red pointers, removed %0.f blue pointers", + RelationGetRelationName(indrel), + vacrelstats->num_dead_tuples, + (*stats)->num_red_pointers, + (*stats)->num_blue_pointers, + (*stats)->red_pointers_removed, + (*stats)->blue_pointers_removed))); + + (*stats)->num_red_pointers = 0; + (*stats)->num_blue_pointers = 0; + (*stats)->red_pointers_removed = 0; + (*stats)->blue_pointers_removed = 0; + (*stats)->pointers_colored = 0; + + *stats = index_bulk_delete(&ivinfo, *stats, + lazy_indexvac_phase2, (void *) vacrelstats); + ereport(elevel, + (errmsg("scanned index \"%s\" to convert red pointers, found " + "%0.f red pointers, %0.f blue pointers, removed " + "%0.f red pointers, removed %0.f blue pointers, " + "colored %0.f red pointers blue", + RelationGetRelationName(indrel), + (*stats)->num_red_pointers, + (*stats)->num_blue_pointers, + (*stats)->red_pointers_removed, + (*stats)->blue_pointers_removed, + (*stats)->pointers_colored))); + } + else + { + /* Do bulk deletion */ + *stats = index_bulk_delete(&ivinfo, *stats, + lazy_tid_reaped, (void *) vacrelstats); + ereport(elevel, + (errmsg("scanned index \"%s\" to remove %d row versions", + RelationGetRelationName(indrel), + vacrelstats->num_dead_tuples), + errdetail("%s.", pg_rusage_show(&ru0)))); + } - ereport(elevel, - (errmsg("scanned index \"%s\" to remove %d row versions", - RelationGetRelationName(indrel), - vacrelstats->num_dead_tuples), - errdetail("%s.", pg_rusage_show(&ru0)))); } /* @@ -1994,9 +2299,11 @@ lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks) if (vacrelstats->hasindex) { - maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData); + maxtuples = (vac_work_mem * 1024L) / (sizeof(ItemPointerData) + + sizeof(LVRedBlueChain)); maxtuples = Min(maxtuples, INT_MAX); - maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData)); + maxtuples = Min(maxtuples, MaxAllocSize / (sizeof(ItemPointerData) + + sizeof(LVRedBlueChain))); /* curious coding here to ensure the multiplication can't overflow */ if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks) @@ -2014,6 +2321,57 @@ lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks) vacrelstats->max_dead_tuples = (int) maxtuples; vacrelstats->dead_tuples = (ItemPointer) palloc(maxtuples * sizeof(ItemPointerData)); + + /* + * XXX Cheat for now and allocate the same size array for tracking blue and + * red chains. maxtuples must have been already adjusted above to ensure we + * don't cross vac_work_mem. + */ + vacrelstats->num_redblue_chains = 0; + vacrelstats->max_redblue_chains = (int) maxtuples; + vacrelstats->redblue_chains = (LVRedBlueChain *) + palloc0(maxtuples * sizeof(LVRedBlueChain)); + +} + +/* + * lazy_record_blue_chain - remember one blue chain + */ +static void +lazy_record_blue_chain(LVRelStats *vacrelstats, + ItemPointer itemptr) +{ + /* + * The array shouldn't overflow under normal behavior, but perhaps it + * could if we are given a really small maintenance_work_mem. In that + * case, just forget the last few tuples (we'll get 'em next time). 
+ */ + if (vacrelstats->num_redblue_chains < vacrelstats->max_redblue_chains) + { + vacrelstats->redblue_chains[vacrelstats->num_redblue_chains].chain_tid = *itemptr; + vacrelstats->redblue_chains[vacrelstats->num_redblue_chains].is_red_chain = 0; + vacrelstats->num_redblue_chains++; + } +} + +/* + * lazy_record_red_chain - remember one red chain + */ +static void +lazy_record_red_chain(LVRelStats *vacrelstats, + ItemPointer itemptr) +{ + /* + * The array shouldn't overflow under normal behavior, but perhaps it + * could if we are given a really small maintenance_work_mem. In that + * case, just forget the last few tuples (we'll get 'em next time). + */ + if (vacrelstats->num_redblue_chains < vacrelstats->max_redblue_chains) + { + vacrelstats->redblue_chains[vacrelstats->num_redblue_chains].chain_tid = *itemptr; + vacrelstats->redblue_chains[vacrelstats->num_redblue_chains].is_red_chain = 1; + vacrelstats->num_redblue_chains++; + } } /* @@ -2044,8 +2402,8 @@ lazy_record_dead_tuple(LVRelStats *vacrelstats, * * Assumes dead_tuples array is in sorted order. */ -static bool -lazy_tid_reaped(ItemPointer itemptr, void *state) +static IndexBulkDeleteCallbackResult +lazy_tid_reaped(ItemPointer itemptr, bool is_red, void *state) { LVRelStats *vacrelstats = (LVRelStats *) state; ItemPointer res; @@ -2056,7 +2414,193 @@ lazy_tid_reaped(ItemPointer itemptr, void *state) sizeof(ItemPointerData), vac_cmp_itemptr); - return (res != NULL); + return (res != NULL) ? IBDCR_DELETE : IBDCR_KEEP; +} + +/* + * lazy_indexvac_phase1() -- run first pass of index vacuum + * + * This has the right signature to be an IndexBulkDeleteCallback. + */ +static IndexBulkDeleteCallbackResult +lazy_indexvac_phase1(ItemPointer itemptr, bool is_red, void *state) +{ + LVRelStats *vacrelstats = (LVRelStats *) state; + ItemPointer res; + LVRedBlueChain *chain; + + res = (ItemPointer) bsearch((void *) itemptr, + (void *) vacrelstats->dead_tuples, + vacrelstats->num_dead_tuples, + sizeof(ItemPointerData), + vac_cmp_itemptr); + + if (res != NULL) + return IBDCR_DELETE; + + chain = (LVRedBlueChain *) bsearch((void *) itemptr, + (void *) vacrelstats->redblue_chains, + vacrelstats->num_redblue_chains, + sizeof(LVRedBlueChain), + vac_cmp_redblue_chain); + if (chain != NULL) + { + if (is_red) + chain->num_red_pointers++; + else + chain->num_blue_pointers++; + } + return IBDCR_KEEP; +} + +/* + * lazy_indexvac_phase2() -- run second pass of index vacuum + * + * This has the right signature to be an IndexBulkDeleteCallback. + */ +static IndexBulkDeleteCallbackResult +lazy_indexvac_phase2(ItemPointer itemptr, bool is_red, void *state) +{ + LVRelStats *vacrelstats = (LVRelStats *) state; + LVRedBlueChain *chain; + + chain = (LVRedBlueChain *) bsearch((void *) itemptr, + (void *) vacrelstats->redblue_chains, + vacrelstats->num_redblue_chains, + sizeof(LVRedBlueChain), + vac_cmp_redblue_chain); + + if (chain != NULL && (chain->keep_warm_chain != 1)) + { + /* + * At no point can we have more than one Red pointer to any chain, and + * no more than two Blue pointers. + */ + Assert(chain->num_red_pointers <= 1); + Assert(chain->num_blue_pointers <= 2); + + if (chain->is_red_chain == 1) + { + if (is_red) + { + /* + * A Red pointer pointing to a Red chain. + * + * Color the Red pointer Blue (and delete the Blue pointer). We + * may have already seen the Blue pointer in the scan and + * deleted it, or we may see it later in the scan.
It doesn't + * matter if we fail at any point because we won't clear up + * WARM bits on the heap tuples until we have dealt with the + * index pointers cleanly. + */ + return IBDCR_COLOR_BLUE; + } + else + { + /* + * Blue pointer to a Red chain. + */ + if (chain->num_red_pointers > 0) + { + /* + * If there exists a Red pointer to the chain, we can + * delete the Blue pointer and clear the WARM bits on the + * heap tuples. + */ + return IBDCR_DELETE; + } + else if (chain->num_blue_pointers == 1) + { + /* + * If this is the only pointer to a Red chain, we must keep the + * Blue pointer. + * + * The presence of a Red chain indicates that the WARM update + * must have committed. But during the update + * this index was probably not updated and hence it + * contains just the one original Blue pointer to the chain. + * We should be able to clear the WARM bits on the heap tuples + * unless we later find another index which prevents the + * cleanup. + */ + return IBDCR_KEEP; + } + } + } + else + { + /* + * This is a Blue chain. + */ + if (is_red) + { + /* + * A Red pointer to a Blue chain. + * + * This can happen when a WARM update is aborted. Later the HOT + * chain is pruned, leaving behind only Blue tuples in the + * chain. But the Red index pointer inserted in the index + * remains, and it must now be deleted before we clear WARM bits + * from the heap tuples. + */ + return IBDCR_DELETE; + } + + /* + * Blue pointer to a Blue chain. + * + * If this is the only surviving Blue pointer, keep it and clear + * the WARM bits from the heap tuples. + */ + if (chain->num_blue_pointers == 1) + return IBDCR_KEEP; + + /* + * If there is more than one Blue pointer to this chain, we could + * apply the recheck logic, kill the redundant Blue pointer and + * convert the chain. But that's not done yet. + */ + } + + /* + * For everything else, we must keep the WARM bits and also keep the + * index pointers. + */ + chain->keep_warm_chain = 1; + return IBDCR_KEEP; + } + return IBDCR_KEEP; +} + +/* + * Comparator routines for use with qsort() and bsearch(). Similar to + * vac_cmp_itemptr, but the right-hand argument is an LVRedBlueChain struct pointer. + */ +static int +vac_cmp_redblue_chain(const void *left, const void *right) +{ + BlockNumber lblk, + rblk; + OffsetNumber loff, + roff; + + lblk = ItemPointerGetBlockNumber((ItemPointer) left); + rblk = ItemPointerGetBlockNumber(&((LVRedBlueChain *) right)->chain_tid); + + if (lblk < rblk) + return -1; + if (lblk > rblk) + return 1; + + loff = ItemPointerGetOffsetNumber((ItemPointer) left); + roff = ItemPointerGetOffsetNumber(&((LVRedBlueChain *) right)->chain_tid); + + if (loff < roff) + return -1; + if (loff > roff) + return 1; + + return 0; } /* diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index d62d2de..3e49a8f 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -405,7 +405,8 @@ ExecInsertIndexTuples(TupleTableSlot *slot, root_tid, /* tid of heap or root tuple */ heapRelation, /* heap relation */ checkUnique, /* type of uniqueness check to do */ - indexInfo); /* index AM may need this */ + indexInfo, /* index AM may need this */ + (modified_attrs != NULL)); /* is this a WARM update? */ /* * If the index has an associated exclusion constraint, check that.
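[Illustrative aside, not part of the patch] Both index-vacuum callbacks above locate each index TID in the sorted redblue_chains array with bsearch() and a comparator shaped like vac_cmp_redblue_chain. The standalone C sketch below shows that lookup pattern with simplified stand-ins for the PostgreSQL types; TupleId, Chain and cmp_tid_chain are names local to the example.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Simplified stand-ins for ItemPointerData and LVRedBlueChain. */
typedef uint32_t BlockNumber;
typedef uint16_t OffsetNumber;

typedef struct
{
    BlockNumber blk;
    OffsetNumber off;
} TupleId;

typedef struct
{
    TupleId chain_tid;
    int num_red_pointers;
    int num_blue_pointers;
} Chain;

/* Same shape as vac_cmp_redblue_chain: the key is a TID, the element a chain. */
static int
cmp_tid_chain(const void *left, const void *right)
{
    const TupleId *tid = (const TupleId *) left;
    const Chain *chain = (const Chain *) right;

    if (tid->blk != chain->chain_tid.blk)
        return (tid->blk < chain->chain_tid.blk) ? -1 : 1;
    if (tid->off != chain->chain_tid.off)
        return (tid->off < chain->chain_tid.off) ? -1 : 1;
    return 0;
}

int
main(void)
{
    /* The chain array is sorted by (block, offset), as the heap scan builds it. */
    Chain chains[] = {
        {{1, 3}, 0, 0},
        {{4, 7}, 0, 0},
        {{9, 2}, 0, 0},
    };
    TupleId key = {4, 7};
    Chain *hit;

    hit = bsearch(&key, chains, sizeof(chains) / sizeof(chains[0]),
                  sizeof(Chain), cmp_tid_chain);
    if (hit)
        hit->num_red_pointers++;   /* phase 1 counts a red pointer to this chain */

    printf("found=%d red=%d\n", hit != NULL, hit ? hit->num_red_pointers : -1);
    return 0;
}

Running it prints found=1 red=1, which mirrors how lazy_indexvac_phase1 bumps num_red_pointers when a Red index pointer targets a candidate chain.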
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 5c13d26..7a9b48a 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -347,7 +347,7 @@ DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) static void DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { - uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP_OPMASK; + uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP2_OPMASK; TransactionId xid = XLogRecGetXid(buf->record); SnapBuild *builder = ctx->snapshot_builder; @@ -359,10 +359,6 @@ DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) switch (info) { - case XLOG_HEAP2_MULTI_INSERT: - if (SnapBuildProcessChange(builder, xid, buf->origptr)) - DecodeMultiInsert(ctx, buf); - break; case XLOG_HEAP2_NEW_CID: { xl_heap_new_cid *xlrec; @@ -390,6 +386,7 @@ DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_HEAP2_CLEANUP_INFO: case XLOG_HEAP2_VISIBLE: case XLOG_HEAP2_LOCK_UPDATED: + case XLOG_HEAP2_WARMCLEAR: break; default: elog(ERROR, "unexpected RM_HEAP2_ID record type: %u", info); @@ -418,6 +415,10 @@ DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) if (SnapBuildProcessChange(builder, xid, buf->origptr)) DecodeInsert(ctx, buf); break; + case XLOG_HEAP_MULTI_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeMultiInsert(ctx, buf); + break; /* * Treat HOT update as normal updates. There is no useful @@ -809,7 +810,7 @@ DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) } /* - * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs. + * Decode XLOG_HEAP_MULTI_INSERT_insert record into multiple tuplebufs. * * Currently MULTI_INSERT will always contain the full tuples. 
*/ diff --git a/src/backend/utils/time/combocid.c b/src/backend/utils/time/combocid.c index baff998..6a2e2f2 100644 --- a/src/backend/utils/time/combocid.c +++ b/src/backend/utils/time/combocid.c @@ -106,7 +106,7 @@ HeapTupleHeaderGetCmin(HeapTupleHeader tup) { CommandId cid = HeapTupleHeaderGetRawCommandId(tup); - Assert(!(tup->t_infomask & HEAP_MOVED)); + Assert(!(HeapTupleHeaderIsMoved(tup))); Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tup))); if (tup->t_infomask & HEAP_COMBOCID) @@ -120,7 +120,7 @@ HeapTupleHeaderGetCmax(HeapTupleHeader tup) { CommandId cid = HeapTupleHeaderGetRawCommandId(tup); - Assert(!(tup->t_infomask & HEAP_MOVED)); + Assert(!(HeapTupleHeaderIsMoved(tup))); /* * Because GetUpdateXid() performs memory allocations if xmax is a diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 703bdce..0df5a44 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -186,7 +186,7 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) return false; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (HeapTupleHeaderIsMovedOff(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -205,7 +205,7 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) } } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) + else if (HeapTupleHeaderIsMovedIn(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -377,7 +377,7 @@ HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, return false; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (HeapTupleHeaderIsMovedOff(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -396,7 +396,7 @@ HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, } } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) + else if (HeapTupleHeaderIsMovedIn(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -471,7 +471,7 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, return HeapTupleInvisible; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (HeapTupleHeaderIsMovedOff(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -490,7 +490,7 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, } } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) + else if (HeapTupleHeaderIsMovedIn(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -753,7 +753,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, return false; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (HeapTupleHeaderIsMovedOff(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -772,7 +772,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, } } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) + else if (HeapTupleHeaderIsMovedIn(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -974,7 +974,7 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, return false; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (HeapTupleHeaderIsMovedOff(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -993,7 +993,7 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, } } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & 
HEAP_MOVED_IN) + else if (HeapTupleHeaderIsMovedIn(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -1180,7 +1180,7 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, if (HeapTupleHeaderXminInvalid(tuple)) return HEAPTUPLE_DEAD; /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_OFF) + else if (HeapTupleHeaderIsMovedOff(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -1198,7 +1198,7 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, InvalidTransactionId); } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) + else if (HeapTupleHeaderIsMovedIn(tuple)) { TransactionId xvac = HeapTupleHeaderGetXvac(tuple); diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index d7702e5..68859f2 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -75,6 +75,14 @@ typedef bool (*aminsert_function) (Relation indexRelation, Relation heapRelation, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo); +/* insert this WARM tuple */ +typedef bool (*amwarminsert_function) (Relation indexRelation, + Datum *values, + bool *isnull, + ItemPointer heap_tid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); /* bulk delete */ typedef IndexBulkDeleteResult *(*ambulkdelete_function) (IndexVacuumInfo *info, @@ -203,6 +211,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; + amwarminsert_function amwarminsert; ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index f467b18..bf1e6bd 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -75,12 +75,29 @@ typedef struct IndexBulkDeleteResult bool estimated_count; /* num_index_tuples is an estimate */ double num_index_tuples; /* tuples remaining */ double tuples_removed; /* # removed during vacuum operation */ + double num_red_pointers; /* # red pointers found */ + double num_blue_pointers; /* # blue pointers found */ + double pointers_colored; /* # red pointers colored blue */ + double red_pointers_removed; /* # red pointers removed */ + double blue_pointers_removed; /* # blue pointers removed */ BlockNumber pages_deleted; /* # unused pages in index */ BlockNumber pages_free; /* # pages available for reuse */ } IndexBulkDeleteResult; +/* + * IndexBulkDeleteCallback should return one of the following + */ +typedef enum IndexBulkDeleteCallbackResult +{ + IBDCR_KEEP, /* index tuple should be preserved */ + IBDCR_DELETE, /* index tuple should be deleted */ + IBDCR_COLOR_BLUE /* index tuple should be colored blue */ +} IndexBulkDeleteCallbackResult; + /* Typedef for callback function to determine if a tuple is bulk-deletable */ -typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state); +typedef IndexBulkDeleteCallbackResult (*IndexBulkDeleteCallback) ( + ItemPointer itemptr, + bool is_red, void *state); /* struct definitions appear in relscan.h */ typedef struct IndexScanDescData *IndexScanDesc; @@ -135,7 +152,8 @@ extern bool index_insert(Relation indexRelation, ItemPointer heap_t_ctid, Relation heapRelation, IndexUniqueCheck checkUnique, - struct IndexInfo *indexInfo); + struct IndexInfo *indexInfo, + bool warm_update); extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, diff 
--git a/src/include/access/hash.h b/src/include/access/hash.h index 0af6b4e..97d9cfb 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -269,6 +269,11 @@ typedef HashMetaPageData *HashMetaPage; #define HASHPROC 1 #define HASHNProcs 1 +/* + * Flags overloaded on t_tid.ip_posid field. They are managed by + * ItemPointerSetFlags and corresponding routines. + */ +#define HASH_INDEX_RED_POINTER 0x01 /* public routines */ @@ -279,6 +284,10 @@ extern bool hashinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo); +extern bool hashwarminsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir); extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys); @@ -348,6 +357,8 @@ extern void _hash_expandtable(Relation rel, Buffer metabuf); extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, uint32 maxbucket, uint32 highmask, uint32 lowmask); +extern void _hash_color_items(Page page, OffsetNumber *coloritemsno, + uint16 ncoloritems); /* hashsearch.c */ extern bool _hash_next(IndexScanDesc scan, ScanDirection dir); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 9412c3a..719a725 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -72,6 +72,20 @@ typedef struct HeapUpdateFailureData CommandId cmax; } HeapUpdateFailureData; +typedef int HeapCheckWarmChainStatus; + +#define HCWC_BLUE_TUPLE 0x0001 +#define HCWC_RED_TUPLE 0x0002 +#define HCWC_WARM_TUPLE 0x0004 + +#define HCWC_IS_MIXED(status) \ + (((status) & (HCWC_BLUE_TUPLE | HCWC_RED_TUPLE)) == (HCWC_BLUE_TUPLE | HCWC_RED_TUPLE)) +#define HCWC_IS_ALL_RED(status) \ + (((status) & HCWC_BLUE_TUPLE) == 0) +#define HCWC_IS_ALL_BLUE(status) \ + (((status) & HCWC_RED_TUPLE) == 0) +#define HCWC_IS_WARM(status) \ + (((status) & HCWC_WARM_TUPLE) != 0) /* ---------------- * function prototypes for heap access method @@ -183,6 +197,10 @@ extern void simple_heap_update(Relation relation, ItemPointer otid, bool *warm_update); extern void heap_sync(Relation relation); +extern HeapCheckWarmChainStatus heap_check_warm_chain(Page dp, + ItemPointer tid, bool stop_at_warm); +extern int heap_clear_warm_chain(Page dp, ItemPointer tid, + OffsetNumber *cleared_offnums); /* in heap/pruneheap.c */ extern void heap_page_prune_opt(Relation relation, Buffer buffer); diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 9b081bf..66fd0ea 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -32,7 +32,7 @@ #define XLOG_HEAP_INSERT 0x00 #define XLOG_HEAP_DELETE 0x10 #define XLOG_HEAP_UPDATE 0x20 -/* 0x030 is free, was XLOG_HEAP_MOVE */ +#define XLOG_HEAP_MULTI_INSERT 0x30 #define XLOG_HEAP_HOT_UPDATE 0x40 #define XLOG_HEAP_CONFIRM 0x50 #define XLOG_HEAP_LOCK 0x60 @@ -47,18 +47,23 @@ /* * We ran out of opcodes, so heapam.c now has a second RmgrId. These opcodes * are associated with RM_HEAP2_ID, but are not logically different from - * the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to - * these, too. + * the ones above associated with RM_HEAP_ID. + * + * In PG 10, we moved XLOG_HEAP2_MULTI_INSERT to RM_HEAP_ID.
That allows us to + * use the 0x80 bit in RM_HEAP2_ID, thus potentially creating another 8 possible + * opcodes in RM_HEAP2_ID. */ #define XLOG_HEAP2_REWRITE 0x00 #define XLOG_HEAP2_CLEAN 0x10 #define XLOG_HEAP2_FREEZE_PAGE 0x20 #define XLOG_HEAP2_CLEANUP_INFO 0x30 #define XLOG_HEAP2_VISIBLE 0x40 -#define XLOG_HEAP2_MULTI_INSERT 0x50 +#define XLOG_HEAP2_WARMCLEAR 0x50 #define XLOG_HEAP2_LOCK_UPDATED 0x60 #define XLOG_HEAP2_NEW_CID 0x70 +#define XLOG_HEAP2_OPMASK 0x70 + /* * xl_heap_insert/xl_heap_multi_insert flag values, 8 bits are available. */ @@ -226,6 +231,14 @@ typedef struct xl_heap_clean #define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16)) +typedef struct xl_heap_warmclear +{ + uint16 ncleared; + /* OFFSET NUMBERS are in the block reference 0 */ +} xl_heap_warmclear; + +#define SizeOfHeapWarmClear (offsetof(xl_heap_warmclear, ncleared) + sizeof(uint16)) + /* * Cleanup_info is required in some cases during a lazy VACUUM. * Used for reporting the results of HeapTupleHeaderAdvanceLatestRemovedXid() @@ -389,6 +402,8 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, TransactionId latestRemovedXid); +extern XLogRecPtr log_heap_warmclear(Relation reln, Buffer buffer, + OffsetNumber *cleared, int ncleared); extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, xl_heap_freeze_tuple *tuples, int ntuples); diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index b5891ca..ba5e94d 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -201,6 +201,21 @@ struct HeapTupleHeaderData * upgrade support */ #define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN) +/* + * A WARM chain usually consists of two parts. Each of these parts is a HOT + * chain in itself, i.e. all indexed columns have the same value, but a WARM + * update separates these parts. We call these two parts the Blue chain and + * the Red chain. We need a mechanism to identify which part a tuple belongs + * to. We can't just check HeapTupleHeaderIsHeapWarmTuple() because during a + * WARM update, both old and new tuples are marked as WARM tuples. + * + * We need another infomask bit for this. Rather than consuming a new bit, we + * reuse the infomask bit that was earlier used by old-style VACUUM FULL. This + * is safe because the HEAP_WARM_RED flag is only ever set together with + * HEAP_WARM_TUPLE. So if HEAP_WARM_TUPLE and HEAP_WARM_RED are both set then + * we know that the tuple belongs to the Red part of the WARM chain. + */ +#define HEAP_WARM_RED 0x4000 #define HEAP_XACT_MASK 0xFFF0 /* visibility-related bits */ /* @@ -397,7 +412,7 @@ struct HeapTupleHeaderData /* SetCmin is reasonably simple since we never need a combo CID */ #define HeapTupleHeaderSetCmin(tup, cid) \ do { \ - Assert(!((tup)->t_infomask & HEAP_MOVED)); \ + Assert(!HeapTupleHeaderIsMoved(tup)); \ (tup)->t_choice.t_heap.t_field3.t_cid = (cid); \ (tup)->t_infomask &= ~HEAP_COMBOCID; \ } while (0) @@ -405,7 +420,7 @@ do { \ /* SetCmax must be used after HeapTupleHeaderAdjustCmax; see combocid.c */ #define HeapTupleHeaderSetCmax(tup, cid, iscombo) \ do { \ - Assert(!((tup)->t_infomask & HEAP_MOVED)); \ + Assert(!HeapTupleHeaderIsMoved(tup)); \ (tup)->t_choice.t_heap.t_field3.t_cid = (cid); \ if (iscombo) \ (tup)->t_infomask |= HEAP_COMBOCID; \ @@ -415,7 +430,7 @@ do { \ #define HeapTupleHeaderGetXvac(tup) \ ( \ - ((tup)->t_infomask & HEAP_MOVED) ? \ + HeapTupleHeaderIsMoved(tup) ?
\ (tup)->t_choice.t_heap.t_field3.t_xvac \ : \ InvalidTransactionId \ @@ -423,7 +438,7 @@ do { \ #define HeapTupleHeaderSetXvac(tup, xid) \ do { \ - Assert((tup)->t_infomask & HEAP_MOVED); \ + Assert(HeapTupleHeaderIsMoved(tup)); \ (tup)->t_choice.t_heap.t_field3.t_xvac = (xid); \ } while (0) @@ -651,6 +666,58 @@ do { \ ) /* + * Macros to check if tuple is a moved-off/in tuple by VACUUM FULL in from + * pre-9.0 era. Such tuple must not have HEAP_WARM_TUPLE flag set. + * + * Beware of multiple evaluations of the argument. + */ +#define HeapTupleHeaderIsMovedOff(tuple) \ +( \ + !HeapTupleHeaderIsHeapWarmTuple((tuple)) && \ + ((tuple)->t_infomask & HEAP_MOVED_OFF) \ +) + +#define HeapTupleHeaderIsMovedIn(tuple) \ +( \ + !HeapTupleHeaderIsHeapWarmTuple((tuple)) && \ + ((tuple)->t_infomask & HEAP_MOVED_IN) \ +) + +#define HeapTupleHeaderIsMoved(tuple) \ +( \ + !HeapTupleHeaderIsHeapWarmTuple((tuple)) && \ + ((tuple)->t_infomask & HEAP_MOVED) \ +) + +/* + * Check if tuple belongs to the Red part of the WARM chain. + * + * Beware of multiple evaluations of the argument. + */ +#define HeapTupleHeaderIsWarmRed(tuple) \ +( \ + HeapTupleHeaderIsHeapWarmTuple(tuple) && \ + (((tuple)->t_infomask & HEAP_WARM_RED) != 0) \ +) + +/* + * Mark tuple as a member of the Red chain. Must only be done on a tuple which + * is already marked a WARM-tuple. + * + * Beware of multiple evaluations of the argument. + */ +#define HeapTupleHeaderSetWarmRed(tuple) \ +( \ + AssertMacro(HeapTupleHeaderIsHeapWarmTuple(tuple)), \ + (tuple)->t_infomask |= HEAP_WARM_RED \ +) + +#define HeapTupleHeaderClearWarmRed(tuple) \ +( \ + (tuple)->t_infomask &= ~HEAP_WARM_RED \ +) + +/* * BITMAPLEN(NATTS) - * Computes size of null bitmap given number of data columns. */ @@ -810,6 +877,15 @@ struct MinimalTupleData #define HeapTupleClearHeapWarmTuple(tuple) \ HeapTupleHeaderClearHeapWarmTuple((tuple)->t_data) +#define HeapTupleIsHeapWarmTupleRed(tuple) \ + HeapTupleHeaderIsWarmRed((tuple)->t_data) + +#define HeapTupleSetHeapWarmTupleRed(tuple) \ + HeapTupleHeaderSetWarmRed((tuple)->t_data) + +#define HeapTupleClearHeapWarmTupleRed(tuple) \ + HeapTupleHeaderClearWarmRed((tuple)->t_data) + #define HeapTupleGetOid(tuple) \ HeapTupleHeaderGetOid((tuple)->t_data) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index d4b35ca..1f4f0bd 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -427,6 +427,12 @@ typedef BTScanOpaqueData *BTScanOpaque; #define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT) /* + * Flags overloaded on t_tid.ip_posid field. They are managed by + * ItemPointerSetFlags and corresponing routines. 
+ */ +#define BTREE_INDEX_RED_POINTER 0x01 + +/* * external entry points for btree, in nbtree.c */ extern IndexBuildResult *btbuild(Relation heap, Relation index, @@ -436,6 +442,10 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo); +extern bool btwarminsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); extern Size btestimateparallelscan(void); extern void btinitparallelscan(void *target); @@ -487,10 +497,12 @@ extern void _bt_pageinit(Page page, Size size); extern bool _bt_page_recyclable(Page page); extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel); -extern void _bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, - BlockNumber lastBlockVacuumed); +extern void _bt_handleitems_vacuum(Relation rel, Buffer buf, + OffsetNumber *delitemnos, int ndelitems, + OffsetNumber *coloritemnos, int ncoloritems); extern int _bt_pagedel(Relation rel, Buffer buf); +extern void _bt_color_items(Page page, OffsetNumber *coloritemnos, + uint16 ncoloritems); /* * prototypes for functions in nbtsearch.c diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index d6a3085..5555742 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -142,34 +142,20 @@ typedef struct xl_btree_reuse_page /* * This is what we need to know about vacuum of individual leaf index tuples. * The WAL record can represent deletion of any number of index tuples on a - * single index page when executed by VACUUM. - * - * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber. - * For a non-MVCC index scans there is an additional correctness requirement - * for applying these changes during recovery, which is that we must do one - * of these two things for every block in the index: - * * lock the block for cleanup and apply any required changes - * * EnsureBlockUnpinned() - * The purpose of this is to ensure that no index scans started before we - * finish scanning the index are still running by the time we begin to remove - * heap tuples. - * - * Any changes to any one block are registered on just one WAL record. All - * blocks that we need to run EnsureBlockUnpinned() are listed as a block range - * starting from the last block vacuumed through until this one. Individual - * block numbers aren't given. + * single index page when executed by VACUUM. It also includes tuples whose + * color is changed from red to blue by VACUUM. * * Note that the *last* WAL record in any vacuum of an index is allowed to * have a zero length array of offsets. Earlier records must have at least one. */ typedef struct xl_btree_vacuum { - BlockNumber lastBlockVacuumed; - - /* TARGET OFFSET NUMBERS FOLLOW */ + uint16 ndelitems; + uint16 ncoloritems; + /* ndelitems + ncoloritems TARGET OFFSET NUMBERS FOLLOW */ } xl_btree_vacuum; -#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber)) +#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ncoloritems) + sizeof(uint16)) /* * This is what we need to know about marking an empty branch for deletion. 
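[Illustrative aside, not part of the patch] The reworked xl_btree_vacuum record above stores two counts followed by ndelitems + ncoloritems offset numbers. The standalone sketch below packs and unpacks such a payload to make the intended layout concrete; the VacuumHeader struct and byte buffer are simplified assumptions for illustration, not the actual WAL record machinery.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

typedef uint16_t OffsetNumber;

/* Simplified mirror of the reworked xl_btree_vacuum header: two counts,
 * then ndelitems + ncoloritems offset numbers follow in the payload. */
typedef struct
{
    uint16_t ndelitems;
    uint16_t ncoloritems;
} VacuumHeader;

int
main(void)
{
    OffsetNumber delitems[] = {2, 5};
    OffsetNumber coloritems[] = {7};
    unsigned char payload[64];
    VacuumHeader hdr = {2, 1};
    size_t pos = 0;

    /* Pack: header first, delete offsets next, color offsets last. */
    memcpy(payload + pos, &hdr, sizeof(hdr));
    pos += sizeof(hdr);
    memcpy(payload + pos, delitems, sizeof(delitems));
    pos += sizeof(delitems);
    memcpy(payload + pos, coloritems, sizeof(coloritems));
    pos += sizeof(coloritems);

    /* Unpack the way a redo routine would walk the record. */
    VacuumHeader in;
    OffsetNumber offs[4];

    memcpy(&in, payload, sizeof(in));
    memcpy(offs, payload + sizeof(in),
           (in.ndelitems + in.ncoloritems) * sizeof(OffsetNumber));

    printf("delete offsets %u and %u; color offset %u\n",
           (unsigned) offs[0], (unsigned) offs[1], (unsigned) offs[in.ndelitems]);
    return 0;
}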
diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index 9472ecc..b355b61 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -25,6 +25,7 @@ #define PROGRESS_VACUUM_NUM_INDEX_VACUUMS 4 #define PROGRESS_VACUUM_MAX_DEAD_TUPLES 5 #define PROGRESS_VACUUM_NUM_DEAD_TUPLES 6 +#define PROGRESS_VACUUM_HEAP_BLKS_WARMCLEARED 7 /* Phases of vacuum (as advertised via PROGRESS_VACUUM_PHASE) */ #define PROGRESS_VACUUM_PHASE_SCAN_HEAP 1 -- 2.1.4
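[Illustrative aside, not part of the patch] To make the HeapCheckWarmChainStatus bits introduced in access/heapam.h concrete, here is a minimal standalone sketch of how a caller might test whether a chain is a conversion candidate, i.e. warm and not a mix of Red and Blue tuples. The macro copies mirror the header; the candidate test itself is an assumption about how heap_check_warm_chain() results are consumed, not code taken from the patch.

#include <stdio.h>

/* Copies of the status bits and macros declared in access/heapam.h. */
typedef int HeapCheckWarmChainStatus;

#define HCWC_BLUE_TUPLE 0x0001
#define HCWC_RED_TUPLE  0x0002
#define HCWC_WARM_TUPLE 0x0004

#define HCWC_IS_ALL_RED(status)  (((status) & HCWC_BLUE_TUPLE) == 0)
#define HCWC_IS_ALL_BLUE(status) (((status) & HCWC_RED_TUPLE) == 0)
#define HCWC_IS_WARM(status)     (((status) & HCWC_WARM_TUPLE) != 0)

int
main(void)
{
    /* Pretend the chain walk saw one warm Red tuple and no Blue ones. */
    HeapCheckWarmChainStatus status = HCWC_RED_TUPLE | HCWC_WARM_TUPLE;

    /* A chain qualifies as a conversion candidate only when it is warm and
     * not a mix of Red and Blue tuples. */
    int candidate = HCWC_IS_WARM(status) &&
                    (HCWC_IS_ALL_RED(status) || HCWC_IS_ALL_BLUE(status));

    printf("candidate=%d all_red=%d all_blue=%d\n",
           candidate, HCWC_IS_ALL_RED(status), HCWC_IS_ALL_BLUE(status));
    return 0;
}

Under these bits, a chain reporting both HCWC_BLUE_TUPLE and HCWC_RED_TUPLE fails the test and keeps its WARM flags until a later vacuum can deal with it.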