*** a/src/backend/access/heap/heapam.c --- b/src/backend/access/heap/heapam.c *************** *** 5553,5584 **** HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, } /* - * Perform XLogInsert to register a heap cleanup info message. These - * messages are sent once per VACUUM and are required because - * of the phasing of removal operations during a lazy VACUUM. - * see comments for vacuum_log_cleanup_info(). - */ - XLogRecPtr - log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid) - { - xl_heap_cleanup_info xlrec; - XLogRecPtr recptr; - XLogRecData rdata; - - xlrec.node = rnode; - xlrec.latestRemovedXid = latestRemovedXid; - - rdata.data = (char *) &xlrec; - rdata.len = SizeOfHeapCleanupInfo; - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO, &rdata); - - return recptr; - } - - /* * Perform XLogInsert for a heap-clean operation. Caller must already * have modified the buffer and marked it dirty. * --- 5553,5558 ---- *************** *** 5930,5956 **** log_newpage_buffer(Buffer buffer) } /* - * Handles CLEANUP_INFO - */ - static void - heap_xlog_cleanup_info(XLogRecPtr lsn, XLogRecord *record) - { - xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record); - - if (InHotStandby) - ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node); - - /* - * Actual operation is a no-op. Record type exists to provide a means for - * conflict processing to occur before we begin index vacuum actions. see - * vacuumlazy.c and also comments in btvacuumpage() - */ - - /* Backup blocks are not used in cleanup_info records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); - } - - /* * Handles HEAP2_CLEAN record type */ static void --- 5904,5909 ---- *************** *** 7057,7065 **** heap2_redo(XLogRecPtr lsn, XLogRecord *record) case XLOG_HEAP2_CLEAN: heap_xlog_clean(lsn, record); break; - case XLOG_HEAP2_CLEANUP_INFO: - heap_xlog_cleanup_info(lsn, record); - break; case XLOG_HEAP2_VISIBLE: heap_xlog_visible(lsn, record); break; --- 7010,7015 ---- *** a/src/backend/access/heap/pruneheap.c --- b/src/backend/access/heap/pruneheap.c *************** *** 121,133 **** heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) * have pruned while we hold pin.) */ if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) - { - TransactionId ignore = InvalidTransactionId; /* return value not - * needed */ - /* OK to prune */ ! (void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore); ! } /* And release buffer lock */ LockBuffer(buffer, BUFFER_LOCK_UNLOCK); --- 121,128 ---- * have pruned while we hold pin.) */ if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) /* OK to prune */ ! (void) heap_page_prune(relation, buffer, OldestXmin, true); /* And release buffer lock */ LockBuffer(buffer, BUFFER_LOCK_UNLOCK); *************** *** 148,159 **** heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) * send its own new total to pgstats, and we don't want this delta applied * on top of that.) * ! * Returns the number of tuples deleted from the page and sets ! * latestRemovedXid. */ int heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, ! bool report_stats, TransactionId *latestRemovedXid) { int ndeleted = 0; Page page = BufferGetPage(buffer); --- 143,153 ---- * send its own new total to pgstats, and we don't want this delta applied * on top of that.) * ! 
* Returns the number of tuples deleted from the page. */ int heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, ! bool report_stats) { int ndeleted = 0; Page page = BufferGetPage(buffer); *************** *** 173,179 **** heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, * initialize the rest of our working state. */ prstate.new_prune_xid = InvalidTransactionId; ! prstate.latestRemovedXid = *latestRemovedXid; prstate.nredirected = prstate.ndead = prstate.nunused = 0; memset(prstate.marked, 0, sizeof(prstate.marked)); --- 167,173 ---- * initialize the rest of our working state. */ prstate.new_prune_xid = InvalidTransactionId; ! prstate.latestRemovedXid = InvalidTransactionId; prstate.nredirected = prstate.ndead = prstate.nunused = 0; memset(prstate.marked, 0, sizeof(prstate.marked)); *************** *** 277,284 **** heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, if (report_stats && ndeleted > prstate.ndead) pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead); - *latestRemovedXid = prstate.latestRemovedXid; - /* * XXX Should we update the FSM information of this page ? * --- 271,276 ---- *** a/src/backend/access/nbtree/nbtxlog.c --- b/src/backend/access/nbtree/nbtxlog.c *************** *** 728,738 **** btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record) * update the page. * * Btree delete records can conflict with standby queries. You might ! * think that vacuum records would conflict as well, but we've handled ! * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid ! * cleaned by the vacuum of the heap and so we can resolve any conflicts ! * just once when that arrives. After that we know that no conflicts ! * exist from individual btree vacuum records on that index. */ if (InHotStandby) { --- 728,736 ---- * update the page. * * Btree delete records can conflict with standby queries. You might ! * think that vacuum records would conflict as well. However, VACUUM ! * always removes the heap tuple first, and the XLOG_HEAP2_CLEAN record ! * from doing so is enough to resolve any conflicts. */ if (InHotStandby) { *** a/src/backend/access/rmgrdesc/heapdesc.c --- b/src/backend/access/rmgrdesc/heapdesc.c *************** *** 149,161 **** heap2_desc(StringInfo buf, uint8 xl_info, char *rec) xlrec->node.relNode, xlrec->block, xlrec->latestRemovedXid); } - else if (info == XLOG_HEAP2_CLEANUP_INFO) - { - xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec; - - appendStringInfo(buf, "cleanup info: remxid %u", - xlrec->latestRemovedXid); - } else if (info == XLOG_HEAP2_VISIBLE) { xl_heap_visible *xlrec = (xl_heap_visible *) rec; --- 149,154 ---- *** a/src/backend/commands/vacuumlazy.c --- b/src/backend/commands/vacuumlazy.c *************** *** 11,17 **** * on the number of tuples and pages we will keep track of at once. * * We are willing to use at most maintenance_work_mem memory space to keep ! * track of dead tuples. We initially allocate an array of TIDs of that size, * with an upper limit that depends on table size (this limit ensures we don't * allocate a huge area uselessly for vacuuming small tables). If the array * threatens to overflow, we suspend the heap scan phase and perform a pass of --- 11,17 ---- * on the number of tuples and pages we will keep track of at once. * * We are willing to use at most maintenance_work_mem memory space to keep ! * track of dead TIDs. 
We initially allocate an array of TIDs of that size, * with an upper limit that depends on table size (this limit ensures we don't * allocate a huge area uselessly for vacuuming small tables). If the array * threatens to overflow, we suspend the heap scan phase and perform a pass of *************** *** 19,25 **** * TID array. * * If we're processing a table with no indexes, we can just vacuum each page ! * as we go; there's no need to save up multiple tuples to minimize the number * of index scans performed. So we don't use maintenance_work_mem memory for * the TID array, just enough to hold as many heap tuples as fit on one page. * --- 19,25 ---- * TID array. * * If we're processing a table with no indexes, we can just vacuum each page ! * as we go; there's no need to save up many TIDs to minimize the number * of index scans performed. So we don't use maintenance_work_mem memory for * the TID array, just enough to hold as many heap tuples as fit on one page. * *************** *** 109,121 **** typedef struct LVRelStats BlockNumber pages_removed; double tuples_deleted; BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ ! /* List of TIDs of tuples we intend to delete */ ! /* NB: this list is ordered by TID address */ int num_dead_tuples; /* current # of entries */ int max_dead_tuples; /* # slots allocated in array */ ItemPointer dead_tuples; /* array of ItemPointerData */ int num_index_scans; - TransactionId latestRemovedXid; bool lock_waiter_detected; } LVRelStats; --- 109,119 ---- BlockNumber pages_removed; double tuples_deleted; BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ ! /* Ordered list of TIDs we intend to delete */ int num_dead_tuples; /* current # of entries */ int max_dead_tuples; /* # slots allocated in array */ ItemPointer dead_tuples; /* array of ItemPointerData */ int num_index_scans; bool lock_waiter_detected; } LVRelStats; *************** *** 340,388 **** lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, } /* - * For Hot Standby we need to know the highest transaction id that will - * be removed by any change. VACUUM proceeds in a number of passes so - * we need to consider how each pass operates. The first phase runs - * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it - * progresses - these will have a latestRemovedXid on each record. - * In some cases this removes all of the tuples to be removed, though - * often we have dead tuples with index pointers so we must remember them - * for removal in phase 3. Index records for those rows are removed - * in phase 2 and index blocks do not have MVCC information attached. - * So before we can allow removal of any index tuples we need to issue - * a WAL record containing the latestRemovedXid of rows that will be - * removed in phase three. This allows recovery queries to block at the - * correct place, i.e. before phase two, rather than during phase three - * which would be after the rows have become inaccessible. - */ - static void - vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats) - { - /* - * Skip this for relations for which no WAL is to be written, or if we're - * not trying to support archive recovery. 
- */ - if (!RelationNeedsWAL(rel) || !XLogIsNeeded()) - return; - - /* - * No need to write the record at all unless it contains a valid value - */ - if (TransactionIdIsValid(vacrelstats->latestRemovedXid)) - (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid); - } - - /* * lazy_scan_heap() -- scan an open heap relation * * This routine prunes each page in the heap, which will among other * things truncate dead tuples to dead line pointers, defragment the ! * page, and set commit status bits (see heap_page_prune). It also builds ! * lists of dead tuples and pages with free space, calculates statistics ! * on the number of live tuples in the heap, and marks pages as ! * all-visible if appropriate. When done, or when we run low on space for ! * dead-tuple TIDs, invoke vacuuming of indexes and call lazy_vacuum_heap ! * to reclaim dead line pointers. * * If there are no indexes then we can reclaim line pointers on the fly; * dead line pointers need only be retained until all index pointers that --- 338,353 ---- } /* * lazy_scan_heap() -- scan an open heap relation * * This routine prunes each page in the heap, which will among other * things truncate dead tuples to dead line pointers, defragment the ! * page, and set commit status bits (see heap_page_prune). It then ! * builds lists of dead line pointers, tracks feasibility of truncating ! * the end of the heap, calculates statistics on the number of live ! * tuples, and marks pages as all-visible if appropriate. When done, or ! * when we run low on space for dead-tuple TIDs, invoke vacuuming of ! * indexes and call lazy_vacuum_heap to reclaim dead line pointers. * * If there are no indexes then we can reclaim line pointers on the fly; * dead line pointers need only be retained until all index pointers that *************** *** 398,407 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, char *relname; BlockNumber empty_pages, vacuumed_pages; ! double num_tuples, ! tups_vacuumed, ! nkeep, ! nunused; IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; --- 363,372 ---- char *relname; BlockNumber empty_pages, vacuumed_pages; ! double num_tuples, /* kept: any reason */ ! tups_vacuumed, /* truncated to LP_DEAD */ ! nkeep, /* kept: writer finished too recently */ ! nunused; /* already LP_UNUSED */ IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; *************** *** 427,433 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, vacrelstats->rel_pages = nblocks; vacrelstats->scanned_pages = 0; vacrelstats->nonempty_pages = 0; - vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); --- 392,397 ---- *************** *** 481,495 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Page page; OffsetNumber offnum, maxoff; ! bool tupgone, ! hastup; ! int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; - bool all_visible; - bool has_dead_tuples; TransactionId visibility_cutoff_xid = InvalidTransactionId; if (blkno == next_not_all_visible_block) --- 445,460 ---- Page page; OffsetNumber offnum, maxoff; ! /* Page has items !LP_UNUSED && !LP_DEAD? */ ! bool hastup, ! /* Page has items !LP_UNUSED && !(LP_NORMAL && HEAPTUPLE_LIVE)? */ ! has_nonlive, ! /* Page has live tuples too new to be visible to some snapshot? */ ! 
has_too_recent; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; TransactionId visibility_cutoff_xid = InvalidTransactionId; if (blkno == next_not_all_visible_block) *************** *** 545,566 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, vmbuffer = InvalidBuffer; } - /* Log cleanup info before we touch indexes */ - vacuum_log_cleanup_info(onerel, vacrelstats); - /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); ! /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); ! /* ! * Forget the now-vacuumed tuples, and press on, but be careful ! * not to reset latestRemovedXid since we want that value to be ! * valid. ! */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } --- 510,524 ---- vmbuffer = InvalidBuffer; } /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); ! /* Mark dead line pointers unused */ lazy_vacuum_heap(onerel, vacrelstats); ! /* Forget the now-vacuumed TIDs, and press on */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } *************** *** 578,584 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); ! /* We need buffer cleanup lock so that we can prune HOT chains. */ if (!ConditionalLockBufferForCleanup(buf)) { /* --- 536,542 ---- buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); ! /* We need buffer cleanup lock so that we can prune. */ if (!ConditionalLockBufferForCleanup(buf)) { /* *************** *** 680,707 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, } /* ! * Prune all HOT-update chains in this page. ! * ! * We count tuples removed by the pruning step as removed by VACUUM. */ ! tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, ! &vacrelstats->latestRemovedXid); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ - all_visible = true; - has_dead_tuples = false; nfrozen = 0; hastup = false; ! prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); --- 638,666 ---- } /* ! * Remove dead tuples in this page. Certain heap-only tuples will be ! * reduced straight to LP_UNUSED line pointers. Other tuples, ! * including all index-referenced tuples, will be reduced to LP_DEAD ! * with no storage. We will finish the job by removing the index ! * entries and changing them to LP_UNUSED. */ ! tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ nfrozen = 0; hastup = false; ! has_nonlive = false; ! has_too_recent = false; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; + bool try_freeze = true; itemid = PageGetItemId(page, offnum); *************** *** 725,746 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up ! * a non-HOT tuple). 
*/ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); ! all_visible = false; continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); - tupgone = false; - switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: --- 684,706 ---- * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up ! * a tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); ! has_nonlive = true; continue; } Assert(ItemIdIsNormal(itemid)); + num_tuples += 1; + hastup = true; + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: *************** *** 751,771 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this ! * cannot be considered an error condition. * ! * If the tuple is HOT-updated then it must only be ! * removed by a prune operation; so we keep it just as if ! * it were RECENTLY_DEAD. Also, if it's a heap-only ! * tuple, we choose to keep it, because it'll be a lot ! * cheaper to get rid of it in the next pruning pass than ! * to treat it like an indexed tuple. */ ! if (HeapTupleIsHotUpdated(&tuple) || ! HeapTupleIsHeapOnly(&tuple)) ! nkeep += 1; ! else ! tupgone = true; /* we can delete the tuple */ ! all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ --- 711,735 ---- * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this ! * cannot be considered an error condition, and it's not ! * worth the code to actually kill the tuple here. * ! * heap_freeze_tuple() can't cope with HEAPTUPLE_DEAD. ! * Since our OldestXmin predates the inserter's abort, ! * none of the tuple's XIDs should qualify for freezing. ! * For this rarely-tested branch, burn a few cycles ! * verifying that at runtime. ! * ! * In all other respects, treat this outcome just like ! * HEAPTUPLE_RECENTLY_DEAD. */ ! if (heap_tuple_needs_freeze(tuple.t_data, FreezeLimit, ! MultiXactFrzLimit, buf)) ! elog(ERROR, "just-dead tuple requires freezing"); ! try_freeze = false; ! ! nkeep += 1; ! has_nonlive = true; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ *************** *** 783,795 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * that the HEAP_XMIN_COMMITTED hint bit is set because of * that. */ ! if (all_visible) { TransactionId xmin; if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) { ! all_visible = false; break; } --- 747,759 ---- * that the HEAP_XMIN_COMMITTED hint bit is set because of * that. */ ! if (!has_too_recent) { TransactionId xmin; if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) { ! has_too_recent = true; break; } *************** *** 800,806 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, xmin = HeapTupleHeaderGetXmin(tuple.t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) { ! 
all_visible = false; break; } --- 764,770 ---- xmin = HeapTupleHeaderGetXmin(tuple.t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) { ! has_too_recent = true; break; } *************** *** 816,857 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * from relation. */ nkeep += 1; ! all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ ! all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ ! all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } ! if (tupgone) ! { ! lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); ! HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, ! &vacrelstats->latestRemovedXid); ! tups_vacuumed += 1; ! has_dead_tuples = true; ! } ! else ! { ! num_tuples += 1; ! hastup = true; ! ! /* ! * Each non-removable tuple must be checked to see if it needs ! * freezing. Note we already have exclusive buffer lock. ! */ ! if (heap_freeze_tuple(tuple.t_data, FreezeLimit, ! MultiXactFrzLimit)) ! frozen[nfrozen++] = offnum; ! } } /* scan along page */ /* --- 780,808 ---- * from relation. */ nkeep += 1; ! has_nonlive = true; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ ! has_nonlive = true; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ ! has_nonlive = true; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } ! /* ! * Apart from the case(s) handled specially above, each ! * non-removable tuple must be checked to see if it needs ! * freezing. Note we already have exclusive buffer lock. ! */ ! if (try_freeze && heap_freeze_tuple(tuple.t_data, FreezeLimit, ! MultiXactFrzLimit)) ! frozen[nfrozen++] = offnum; } /* scan along page */ /* *************** *** 880,893 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { ! /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); ! /* ! * Forget the now-vacuumed tuples, and press on, but be careful ! * not to reset latestRemovedXid since we want that value to be ! * valid. ! */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } --- 831,840 ---- if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { ! /* Mark dead line pointers unused */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); ! /* Forget the now-vacuumed TIDs, and press on */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } *************** *** 895,901 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, freespace = PageGetHeapFreeSpace(page); /* mark page all-visible, if appropriate */ ! if (all_visible) { if (!PageIsAllVisible(page)) { --- 842,848 ---- freespace = PageGetHeapFreeSpace(page); /* mark page all-visible, if appropriate */ ! if (!has_nonlive && !has_too_recent) { if (!PageIsAllVisible(page)) { *************** *** 942,951 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * means that the tuples that we think are not visible to everyone yet * actually are, and the PD_ALL_VISIBLE flag is correct. * ! * There should never be dead tuples on a page with PD_ALL_VISIBLE ! * set, however. */ ! 
else if (PageIsAllVisible(page) && has_dead_tuples) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", relname, blkno); --- 889,897 ---- * means that the tuples that we think are not visible to everyone yet * actually are, and the PD_ALL_VISIBLE flag is correct. * ! * A PD_ALL_VISIBLE page should, however, carry only live tuples. */ ! else if (PageIsAllVisible(page) && has_nonlive) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", relname, blkno); *************** *** 960,974 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, if (hastup) vacrelstats->nonempty_pages = blkno + 1; ! /* ! * If we remembered any tuples for deletion, then the page will be ! * visited again by lazy_vacuum_heap, which will compute and record ! * its post-compaction free space. If not, then we're done with this ! * page, so remember its free space as-is. (This path will always be ! * taken if there are no indexes.) ! */ ! if (vacrelstats->num_dead_tuples == prev_dead_count) ! RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ --- 906,912 ---- if (hastup) vacrelstats->nonempty_pages = blkno + 1; ! RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ *************** *** 990,1008 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, vmbuffer = InvalidBuffer; } ! /* If any tuples need to be deleted, perform final vacuum cycle */ ! /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { - /* Log cleanup info before we touch indexes */ - vacuum_log_cleanup_info(onerel, vacrelstats); - /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); ! /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } --- 928,943 ---- vmbuffer = InvalidBuffer; } ! /* If any line pointers need to be deleted, perform final vacuum cycle */ ! /* XXX put a threshold on minimum count here? */ if (vacrelstats->num_dead_tuples > 0) { /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); ! /* Mark dead line pointers unused */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } *************** *** 1037,1048 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, /* * lazy_vacuum_heap() -- second pass over the heap * ! * This routine marks dead tuples as unused and compacts out free ! * space on their pages. Pages not having dead tuples recorded from ! * lazy_scan_heap are not visited at all. * ! * Note: the reason for doing this as a second pass is we cannot remove ! * the tuples until we've removed their index entries, and we want to * process index entry removal in batches as large as possible. */ static void --- 972,982 ---- /* * lazy_vacuum_heap() -- second pass over the heap * ! * This routine marks dead line pointers as unused. Pages not recorded ! * during lazy_scan_heap() are not visited at all. * ! * Note: the reason for doing this as a second pass is we cannot reuse the ! * line pointers until we've removed their index entries, and we want to * process index entry removal in batches as large as possible. 
*/ static void *************** *** 1060,1087 **** lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) { BlockNumber tblk; Buffer buf; - Page page; - Size freespace; vacuum_delay_point(); tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL, vac_strategy); ! if (!ConditionalLockBufferForCleanup(buf)) ! { ! ReleaseBuffer(buf); ! ++tupindex; ! continue; ! } tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats); - - /* Now that we've compacted the page, record its available space */ - page = BufferGetPage(buf); - freespace = PageGetHeapFreeSpace(page); - UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(onerel, tblk, freespace); npages++; } --- 994,1008 ---- { BlockNumber tblk; Buffer buf; vacuum_delay_point(); tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL, vac_strategy); ! LockBuffer(buf, BUFFER_LOCK_SHARE); tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats); UnlockReleaseBuffer(buf); npages++; } *************** *** 1094,1117 **** lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) } /* ! * lazy_vacuum_page() -- free dead tuples on a page ! * and repair its fragmentation. * ! * Caller must hold pin and buffer cleanup lock on the buffer. * ! * tupindex is the index in vacrelstats->dead_tuples of the first dead ! * tuple for this page. We assume the rest follow sequentially. ! * The return value is the first tupindex after the tuples of this page. */ static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, int tupindex, LVRelStats *vacrelstats) { Page page = BufferGetPage(buffer); - OffsetNumber unused[MaxOffsetNumber]; - int uncnt = 0; - - START_CRIT_SECTION(); for (; tupindex < vacrelstats->num_dead_tuples; tupindex++) { --- 1015,1045 ---- } /* ! * lazy_vacuum_page() -- change LP_DEAD line pointers to LP_UNUSED. ! * ! * Caller must hold at least a shared lock on the buffer. Only VACUUM ! * modifies heap LP_DEAD line pointers, and no other process will be upset if ! * such a line pointer suddenly becomes LP_UNUSED. The primary task noticing ! * the difference is PageAddItem(), and it would only mind a switch in the ! * opposite direction, from LP_UNUSED to LP_DEAD. (Furthermore, it needs an ! * exclusive lock on the buffer.) * ! * Emit no WAL. The next VACUUM will redo lost changes. A mere LP_DEAD ! * pointer imposes no wraparound hazard; all XIDs are gone. However, WAL ! * replay of operations that repurpose LP_UNUSED pointers must be prepared to ! * find the rare LP_DEAD pointer until recovery reaches a consistent state. ! * That can happen when such a WAL record makes it to disk, but the crash ! * precedes any flush of this buffer. * ! * tupindex is the index in vacrelstats->dead_tuples of the first dead TID for ! * this page. We assume the rest follow sequentially. The return value is ! * the first tupindex beyond this page. */ static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, int tupindex, LVRelStats *vacrelstats) { Page page = BufferGetPage(buffer); for (; tupindex < vacrelstats->num_dead_tuples; tupindex++) { *************** *** 1124,1151 **** lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, break; /* past end of tuples for this block */ toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]); itemid = PageGetItemId(page, toff); ItemIdSetUnused(itemid); - unused[uncnt++] = toff; } ! 
PageRepairFragmentation(page); ! ! MarkBufferDirty(buffer); ! ! /* XLOG stuff */ ! if (RelationNeedsWAL(onerel)) ! { ! XLogRecPtr recptr; ! ! recptr = log_heap_clean(onerel, buffer, ! NULL, 0, NULL, 0, ! unused, uncnt, ! vacrelstats->latestRemovedXid); ! PageSetLSN(page, recptr); ! PageSetTLI(page, ThisTimeLineID); ! } ! END_CRIT_SECTION(); return tupindex; } --- 1052,1065 ---- break; /* past end of tuples for this block */ toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]); itemid = PageGetItemId(page, toff); + Assert(ItemIdIsDead(itemid)); /* stronger lock needed otherwise */ ItemIdSetUnused(itemid); } ! /* This is a pure hint; there's no harm if it arrives early or never. */ ! PageSetHasFreeLinePointers(page); ! SetBufferCommitInfoNeedsSave(buffer); return tupindex; } *************** *** 1198,1204 **** lazy_check_needs_freeze(Buffer buf) /* * lazy_vacuum_index() -- vacuum one index relation. * ! * Delete all the index entries pointing to tuples listed in * vacrelstats->dead_tuples, and update running statistics. */ static void --- 1112,1118 ---- /* * lazy_vacuum_index() -- vacuum one index relation. * ! * Delete all the index entries pointing to items listed in * vacrelstats->dead_tuples, and update running statistics. */ static void *************** *** 1503,1511 **** count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats) /* * Note: any non-unused item should be taken as a reason to keep ! * this page. We formerly thought that DEAD tuples could be ! * thrown away, but that's not so, because we'd not have cleaned ! * out their index entries. */ if (ItemIdIsUsed(itemid)) { --- 1417,1425 ---- /* * Note: any non-unused item should be taken as a reason to keep ! * this page. We formerly thought that LP_DEAD line pointers ! * could be thrown away, but that's not so, because we'd not have ! * cleaned out their index entries. */ if (ItemIdIsUsed(itemid)) { *************** *** 1564,1570 **** lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks) } /* ! * lazy_record_dead_tuple - remember one deletable tuple */ static void lazy_record_dead_tuple(LVRelStats *vacrelstats, --- 1478,1484 ---- } /* ! * lazy_record_dead_tuple - remember one reclaimable line pointer */ static void lazy_record_dead_tuple(LVRelStats *vacrelstats, *************** *** 1573,1579 **** lazy_record_dead_tuple(LVRelStats *vacrelstats, /* * The array shouldn't overflow under normal behavior, but perhaps it * could if we are given a really small maintenance_work_mem. In that ! * case, just forget the last few tuples (we'll get 'em next time). */ if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples) { --- 1487,1493 ---- /* * The array shouldn't overflow under normal behavior, but perhaps it * could if we are given a really small maintenance_work_mem. In that ! * case, just forget the last few entries (we'll get 'em next time). */ if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples) { *** a/src/backend/storage/page/bufpage.c --- b/src/backend/storage/page/bufpage.c *************** *** 15,20 **** --- 15,21 ---- #include "postgres.h" #include "access/htup_details.h" + #include "access/xlog.h" /* ---------------------------------------------------------------- *************** *** 156,163 **** PageAddItem(Page page, if (offsetNumber < limit) { itemId = PageGetItemId(phdr, offsetNumber); ! 
if (ItemIdIsUsed(itemId) || ItemIdHasStorage(itemId)) { elog(WARNING, "will not overwrite a used ItemId"); return InvalidOffsetNumber; } --- 157,175 ---- if (offsetNumber < limit) { itemId = PageGetItemId(phdr, offsetNumber); ! if (InRecovery && !reachedConsistency && ! ItemIdIsDead(itemId) && !ItemIdHasStorage(itemId)) { + /* + * Before recovering to a consistent state, it is possible + * to find LP_DEAD items where we expect LP_UNUSED. See + * comments at lazy_vacuum_page(). + */ + elog(WARNING, "overwriting dead ItemId"); + } + else if (ItemIdIsUsed(itemId) || ItemIdHasStorage(itemId)) + { + /* Be strict later on. */ elog(WARNING, "will not overwrite a used ItemId"); return InvalidOffsetNumber; } *** a/src/include/access/heapam.h --- b/src/include/access/heapam.h *************** *** 166,172 **** extern void heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin); extern int heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, ! bool report_stats, TransactionId *latestRemovedXid); extern void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, --- 166,172 ---- TransactionId OldestXmin); extern int heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, ! bool report_stats); extern void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, *** a/src/include/access/heapam_xlog.h --- b/src/include/access/heapam_xlog.h *************** *** 51,57 **** #define XLOG_HEAP2_FREEZE 0x00 #define XLOG_HEAP2_CLEAN 0x10 /* 0x20 is free, was XLOG_HEAP2_CLEAN_MOVE */ ! #define XLOG_HEAP2_CLEANUP_INFO 0x30 #define XLOG_HEAP2_VISIBLE 0x40 #define XLOG_HEAP2_MULTI_INSERT 0x50 #define XLOG_HEAP2_LOCK_UPDATED 0x60 --- 51,57 ---- #define XLOG_HEAP2_FREEZE 0x00 #define XLOG_HEAP2_CLEAN 0x10 /* 0x20 is free, was XLOG_HEAP2_CLEAN_MOVE */ ! /* 0x30 is free, was XLOG_HEAP2_CLEANUP_INFO */ #define XLOG_HEAP2_VISIBLE 0x40 #define XLOG_HEAP2_MULTI_INSERT 0x50 #define XLOG_HEAP2_LOCK_UPDATED 0x60 *************** *** 178,196 **** typedef struct xl_heap_clean #define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16)) - /* - * Cleanup_info is required in some cases during a lazy VACUUM. - * Used for reporting the results of HeapTupleHeaderAdvanceLatestRemovedXid() - * see vacuumlazy.c for full explanation - */ - typedef struct xl_heap_cleanup_info - { - RelFileNode node; - TransactionId latestRemovedXid; - } xl_heap_cleanup_info; - - #define SizeOfHeapCleanupInfo (sizeof(xl_heap_cleanup_info)) - /* This is for replacing a page's contents in toto */ /* NB: this is used for indexes as well as heaps */ typedef struct xl_heap_newpage --- 178,183 ---- *************** *** 269,276 **** extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec); extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec); - extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode, - TransactionId latestRemovedXid); extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, --- 256,261 ---- *** a/src/include/storage/bufpage.h --- b/src/include/storage/bufpage.h *************** *** 164,169 **** typedef PageHeaderData *PageHeader; --- 164,172 ---- * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the * page for its new tuple version; this suggests that a prune is needed. 
* Again, this is just a hint. + * + * Hold at least a shared lock to change the hints. Critical flags, currently + * PD_ALL_VISIBLE, require an exclusive lock. */ #define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ #define PD_PAGE_FULL 0x0002 /* not enough free space for new *** a/src/include/storage/itemid.h --- b/src/include/storage/itemid.h *************** *** 123,128 **** typedef uint16 ItemLength; --- 123,131 ---- * ItemIdSetUnused * Set the item identifier to be UNUSED, with no storage. * Beware of multiple evaluations of itemId! + * + * Note: VACUUM uses this on LP_DEAD heap items as though it were a hint-bit + * mechanism, holding only a shared lock. */ #define ItemIdSetUnused(itemId) \ ( \
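
For reviewers: a standalone model of the idempotence argument in the new
lazy_vacuum_page() header comment. This is NOT PostgreSQL code; the types and
function names below are invented for illustration. An LP_DEAD heap line
pointer carries no storage and no XIDs, so clearing it to LP_UNUSED is a
hint-style change made under a shared buffer lock: if the unlogged write is
lost in a crash, the next VACUUM simply collects the same TID and redoes it.

#include <assert.h>
#include <stdio.h>

typedef enum { LP_UNUSED, LP_NORMAL, LP_REDIRECT, LP_DEAD } LpFlags;

typedef struct
{
	LpFlags		lp_flags;
	int			lp_len;			/* 0 == no storage */
} LinePointerModel;

/* Mimics the loop in lazy_vacuum_page(): LP_DEAD -> LP_UNUSED, no WAL. */
static void
vacuum_page_model(LinePointerModel *lp, int nitems)
{
	int			i;

	for (i = 0; i < nitems; i++)
	{
		if (lp[i].lp_flags == LP_DEAD)
		{
			assert(lp[i].lp_len == 0);	/* pruning already freed the tuple */
			lp[i].lp_flags = LP_UNUSED;
		}
	}
}

int
main(void)
{
	LinePointerModel page[4] = {
		{LP_NORMAL, 60}, {LP_DEAD, 0}, {LP_UNUSED, 0}, {LP_DEAD, 0}
	};
	int			i;

	vacuum_page_model(page, 4);
	page[3].lp_flags = LP_DEAD;	/* "crash": one unlogged write never hit disk */
	vacuum_page_model(page, 4);	/* next VACUUM redoes the lost change */
	for (i = 1; i < 4; i++)
		assert(page[i].lp_flags == LP_UNUSED);
	printf("lost LP_DEAD -> LP_UNUSED hints are redone by the next pass\n");
	return 0;
}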
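
Similarly, a sketch of the recovery window that the PageAddItem() hunk
tolerates (again with invented names, not the patch's code): a WAL record
that reuses a line pointer can reach disk while the unlogged
LP_DEAD-to-LP_UNUSED change to the same page does not, so until recovery
reaches a consistent state, replay must accept a storage-less LP_DEAD item
where it expects LP_UNUSED.

#include <stdbool.h>

typedef enum { LP_UNUSED, LP_NORMAL, LP_REDIRECT, LP_DEAD } LpFlags;

bool
may_reuse_item(LpFlags flags, bool has_storage,
			   bool in_recovery, bool reached_consistency)
{
	if (flags == LP_UNUSED && !has_storage)
		return true;			/* the normal case */

	/*
	 * A storage-less LP_DEAD item is exactly what a lost LP_DEAD-to-
	 * LP_UNUSED hint leaves behind; tolerate it, but only before
	 * consistency is reached.
	 */
	if (in_recovery && !reached_consistency &&
		flags == LP_DEAD && !has_storage)
		return true;

	return false;
}

This mirrors the crash scenario described in the lazy_vacuum_page() comment,
and the "Be strict later on." branch keeps the usual check tight everywhere
else. Note also why the bufpage.h comment singles out PD_ALL_VISIBLE as
needing an exclusive lock: a spuriously set PD_ALL_VISIBLE changes query
results by letting scans skip visibility checks, whereas the hints touched
here are harmless whichever value a reader observes.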