From 187c9224596ba0ab4c33cd7f08fd3179b0567b50 Mon Sep 17 00:00:00 2001
From: Greg Burd <greg@burd.me>
Date: Wed, 17 Jun 2026 21:28:33 -0400
Subject: [PATCH v48 5/9] Collapse dead HOT-indexed chains to xid-free stubs on
 prune/vacuum

A HOT-indexed update plants index entries that point at mid-chain heap-only
tuples, so a dead chain member cannot simply be removed: a not-yet-swept index
entry may still arrive at it, and the per-hop modified-attrs bitmap on it is
what a reader unions to judge staleness.

Teach prune to collapse a dead chain prefix into xid-free forwarding stubs:
each preserved dead key tuple is rewritten in place to a stub (frozen,
natts == 0, HEAP_INDEXED_UPDATED, forwarding via t_ctid.offnum) that keeps its
segment's modified-attrs bitmap, and a member whose attributes are wholly
subsumed by later hops is reclaimed instead.  Readers step through stubs
transparently and still cross every surviving hop's bitmap.  The collapse back
to classic HOT is driven by prune: once a chain is fully dead, a later prune
(heap_prune_chain / heap_prune_chain_find_live) reclaims its members and
re-points the root redirect straight at the first live tuple.  VACUUM's index
cleanup sweeps the stale leaves; its second pass (lazy_vacuum_heap_page) does
the usual LP_DEAD -> LP_UNUSED conversion and leaves the HOT-indexed collapse
to prune.

The collapse reuses the existing prune/freeze WAL via an xlhp_prune_items
sub-record carrying the (offset, forward) stub pairs; no new record type is
introduced.  A page that still carries a preserved stub (or a redirect that
forwards into a live HOT-indexed member) is kept non-all-visible so index-only
scans heap-fetch through the chain; heap_page_would_be_all_visible recognizes
both the redirect-to-SIU and the stub case explicitly.

Co-authored-by: Greg Burd <greg@burd.me>
Co-authored-by: Nathan Bossart <nathandbossart@gmail.com>
---
 src/backend/access/heap/README.HOT-INDEXED |  38 ++
 src/backend/access/heap/heapam_xlog.c      |  10 +-
 src/backend/access/heap/pruneheap.c        | 645 +++++++++++++++++++--
 src/backend/access/heap/vacuumlazy.c       |  94 ++-
 src/backend/access/rmgrdesc/heapdesc.c     |  36 +-
 src/include/access/heapam.h                |   6 +-
 src/include/access/heapam_xlog.h           |  19 +-
 7 files changed, 777 insertions(+), 71 deletions(-)

diff --git a/src/backend/access/heap/README.HOT-INDEXED b/src/backend/access/heap/README.HOT-INDEXED
index 5d4a2c7d66c..ab4f8bc1881 100644
--- a/src/backend/access/heap/README.HOT-INDEXED
+++ b/src/backend/access/heap/README.HOT-INDEXED
@@ -206,6 +206,44 @@ under the opclass even if not bitwise-identical, e.g. numeric 1.0 vs 1.00) is
 still detected.  (Appendix A motivates this recheck in detail.)
 
 
+Prune and chain collapse
+------------------------
+
+Because a HOT-indexed update plants an index entry pointing at a mid-chain
+heap-only tuple's own TID, classic HOT's assumption that mid-chain line
+pointers have no external references no longer holds.  Pruning therefore must
+not reclaim such a line pointer while a not-yet-swept index entry can still
+arrive at it.
+
+heap_prune_chain collapses a dead chain prefix: the root becomes an
+LP_REDIRECT to the first survivor, and a dead HOT-indexed member
+(heap_prune_item_preserves_hot_indexed) with a not-yet-superseded attribute is
+kept as an xid-free forwarding stub, so a reader arriving via a stale entry
+still finds a walkable hop.  More than one LP_REDIRECT may forward to the same
+tuple.  This reuses the existing prune WAL records; no new record type.
+
+
+Vacuum reclamation
+------------------
+
+VACUUM's index cleanup sweeps the stale index entries.  The collapse back to
+classic HOT is driven by prune, not by VACUUM's second pass: once a chain is
+fully dead, a later prune (heap_prune_chain / heap_prune_chain_find_live)
+reclaims its members and re-points the root redirect straight at first_live.
+Re-pointing a redirect preserves reachability (every walker still reaches
+first_live), so it is safe under the exclusive lock prune already holds.
+
+VACUUM's second pass (lazy_vacuum_heap_page) does not itself re-point
+redirects or reclaim stubs; it performs the usual LP_DEAD -> LP_UNUSED
+conversion and leaves the HOT-indexed collapse to prune.
+
+A page that still carries a preserved HOT-indexed member or a collapse-survivor
+stub is deliberately left non-all-visible, so that an index-only scan
+heap-fetches through the chain and the crossed-attribute bitmap can filter
+stale entries (enforced in heap_prune_record_redirect, the stub recorders, and
+heap_page_would_be_all_visible).
+
+
 Appendices
 ----------
 
diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c
index 9ed7024e814..571ea5a4db6 100644
--- a/src/backend/access/heap/heapam_xlog.c
+++ b/src/backend/access/heap/heapam_xlog.c
@@ -103,6 +103,8 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 		Size		datalen;
 		xlhp_freeze_plan *plans;
 		OffsetNumber *frz_offsets;
+		OffsetNumber *stubs;
+		int			nstubs;
 		char	   *dataptr = XLogRecGetBlockData(record, 0, &datalen);
 		bool		do_prune;
 
@@ -110,9 +112,10 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 											   &nplans, &plans, &frz_offsets,
 											   &nredirected, &redirected,
 											   &ndead, &nowdead,
-											   &nunused, &nowunused);
+											   &nunused, &nowunused,
+											   &nstubs, &stubs);
 
-		do_prune = nredirected > 0 || ndead > 0 || nunused > 0;
+		do_prune = nredirected > 0 || ndead > 0 || nunused > 0 || nstubs > 0;
 
 		/* Ensure the record does something */
 		Assert(do_prune || nplans > 0 || vmflags & VISIBILITYMAP_VALID_BITS);
@@ -126,7 +129,8 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 									(xlrec.flags & XLHP_CLEANUP_LOCK) == 0,
 									redirected, nredirected,
 									nowdead, ndead,
-									nowunused, nunused);
+									nowunused, nunused,
+									stubs, nstubs);
 
 		/* Freeze tuples */
 		for (int p = 0; p < nplans; p++)
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index fdddd23035b..c1dc23c2270 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -16,6 +16,7 @@
 
 #include "access/heapam.h"
 #include "access/heapam_xlog.h"
+#include "access/hot_indexed.h"
 #include "access/htup_details.h"
 #include "access/multixact.h"
 #include "access/transam.h"
@@ -67,11 +68,20 @@ typedef struct
 	int			nredirected;	/* numbers of entries in arrays below */
 	int			ndead;
 	int			nunused;
+	int			nstubs;
 	int			nfrozen;
 	/* arrays that accumulate indexes of items to be changed */
 	OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
 	OffsetNumber nowdead[MaxHeapTuplesPerPage];
 	OffsetNumber nowunused[MaxHeapTuplesPerPage];
+
+	/*
+	 * HOT-selectively-updated collapse-survivor stubs: (offset, forward)
+	 * pairs of line pointers rewritten in place into xid-free forwarding
+	 * stubs that preserve their segment's modified-attrs bitmap.
+	 */
+	OffsetNumber stubs[MaxHeapTuplesPerPage * 2];
+
 	HeapTupleFreeze frozen[MaxHeapTuplesPerPage];
 
 	/*
@@ -220,6 +230,10 @@ static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid,
 static void heap_prune_record_redirect(PruneState *prstate,
 									   OffsetNumber offnum, OffsetNumber rdoffnum,
 									   bool was_normal);
+static void heap_prune_record_stub(PruneState *prstate,
+								   OffsetNumber offnum, OffsetNumber forward);
+static void heap_prune_record_unchanged_lp_stub(PruneState *prstate,
+												OffsetNumber offnum);
 static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
 								   bool was_normal);
 static void heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
@@ -230,6 +244,7 @@ static void heap_prune_record_unchanged_lp_unused(PruneState *prstate, OffsetNum
 static void heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum);
 static void heap_prune_record_unchanged_lp_dead(PruneState *prstate, OffsetNumber offnum);
 static void heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum);
+static bool heap_prune_item_preserves_hot_indexed(Page page, OffsetNumber offnum);
 
 static void page_verify_redirects(Page page);
 
@@ -439,6 +454,7 @@ prune_freeze_setup(PruneFreezeParams *params,
 	prstate->new_prune_xid = InvalidTransactionId;
 	prstate->latest_xid_removed = InvalidTransactionId;
 	prstate->nredirected = prstate->ndead = prstate->nunused = 0;
+	prstate->nstubs = 0;
 	prstate->nfrozen = 0;
 	prstate->nroot_items = 0;
 	prstate->nheaponly_items = 0;
@@ -607,6 +623,23 @@ prune_freeze_plan(PruneState *prstate, OffsetNumber *off_loc)
 		 * Get the tuple's visibility status and queue it up for processing.
 		 */
 		htup = (HeapTupleHeader) PageGetItem(page, itemid);
+
+		/*
+		 * A collapse-survivor stub is an LP_NORMAL item that is not a real
+		 * tuple: HEAP_INDEXED_UPDATED with natts == 0, permanently invisible
+		 * (HEAP_XMIN_INVALID), carrying a forward link and the modified-attrs
+		 * bitmap for the chain segment it stands in for.
+		 * heap_prune_satisfies_vacuum() would classify it HEAPTUPLE_DEAD and
+		 * pruning would reclaim it, destroying the bitmap a read needs.
+		 * Record it as an unchanged item so it is preserved; the HOT chain
+		 * walk steps through it transparently to reach the live tuple.
+		 */
+		if (HotIndexedHeaderIsStub(htup))
+		{
+			heap_prune_record_unchanged_lp_stub(prstate, offnum);
+			continue;
+		}
+
 		tup.t_data = htup;
 		tup.t_len = ItemIdGetLength(itemid);
 		ItemPointerSet(&tup.t_self, blockno, offnum);
@@ -677,25 +710,23 @@ prune_freeze_plan(PruneState *prstate, OffsetNumber *off_loc)
 			ItemId		itemid = PageGetItemId(page, offnum);
 			HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid);
 
-			if (likely(!HeapTupleHeaderIsHotUpdated(htup)))
-			{
-				HeapTupleHeaderAdvanceConflictHorizon(htup,
-													  &prstate->latest_xid_removed);
-				heap_prune_record_unused(prstate, offnum, true);
-			}
+			/*
+			 * This dead heap-only tuple was not reached by any HOT chain walk
+			 * (an aborted HOT-selectively-updated sub-chain, or a member
+			 * whose live root stopped the walk).  If it carries a stale btree
+			 * leaf (HEAP_INDEXED_UPDATED), mark it LP_DEAD: that pins the
+			 * slot against reuse and adds it to the dead-items array so
+			 * ambulkdelete sweeps the stale leaf and a later vacuum reclaims
+			 * the LP. Otherwise (classic HOT, no leaf of its own) reclaim it
+			 * to LP_UNUSED.
+			 */
+			HeapTupleHeaderAdvanceConflictHorizon(htup,
+												  &prstate->latest_xid_removed);
+			if ((htup->t_infomask2 & HEAP_INDEXED_UPDATED) != 0 &&
+				HeapTupleHeaderGetNatts(htup) > 0)
+				heap_prune_record_dead_or_unused(prstate, offnum, true);
 			else
-			{
-				/*
-				 * This tuple should've been processed and removed as part of
-				 * a HOT chain, so something's wrong.  To preserve evidence,
-				 * we don't dare to remove it.  We cannot leave behind a DEAD
-				 * tuple either, because that will cause VACUUM to error out.
-				 * Throwing an error with a distinct error message seems like
-				 * the least bad option.
-				 */
-				elog(ERROR, "dead heap-only tuple (%u, %d) is not linked to from any HOT chain",
-					 blockno, offnum);
-			}
+				heap_prune_record_unused(prstate, offnum, true);
 		}
 		else
 			heap_prune_record_unchanged_lp_normal(prstate, offnum);
@@ -1163,7 +1194,8 @@ heap_page_prune_and_freeze(PruneFreezeParams *params,
 
 	do_prune = prstate.nredirected > 0 ||
 		prstate.ndead > 0 ||
-		prstate.nunused > 0;
+		prstate.nunused > 0 ||
+		prstate.nstubs > 0;
 
 	/*
 	 * Even if we don't prune anything, if we found a new value for the
@@ -1264,7 +1296,8 @@ heap_page_prune_and_freeze(PruneFreezeParams *params,
 			heap_page_prune_execute(prstate.buffer, false,
 									prstate.redirected, prstate.nredirected,
 									prstate.nowdead, prstate.ndead,
-									prstate.nowunused, prstate.nunused);
+									prstate.nowunused, prstate.nunused,
+									prstate.stubs, prstate.nstubs);
 		}
 
 		if (do_freeze)
@@ -1307,7 +1340,8 @@ heap_page_prune_and_freeze(PruneFreezeParams *params,
 									  prstate.frozen, prstate.nfrozen,
 									  prstate.redirected, prstate.nredirected,
 									  prstate.nowdead, prstate.ndead,
-									  prstate.nowunused, prstate.nunused);
+									  prstate.nowunused, prstate.nunused,
+									  prstate.stubs, prstate.nstubs);
 		}
 	}
 
@@ -1448,6 +1482,101 @@ htsv_get_valid_status(int status)
 	return (HTSV_Result) status;
 }
 
+/*
+ * heap_prune_chain_find_live
+ *		Follow a HOT chain from 'start' to its first surviving member.
+ *
+ * Used when re-pruning a HOT/SIU chain that was collapsed by an earlier prune:
+ * the root and any entry-bearing dead members were turned into LP_REDIRECTs to
+ * what was then the first live tuple.  If that tuple has since been HOT-updated
+ * again and died, the redirects must be re-pointed to the current first live
+ * tuple, or several redirects forwarding to one live tuple must agree on it.
+ * Both cases need the chain's current first surviving member.
+ *
+ * Walks t_ctid on this page starting at 'start', skipping DEAD members, and
+ * returns the offset of the first non-DEAD (surviving) member.  Returns
+ * InvalidOffsetNumber if the chain dead-ends with no survivor or runs off the
+ * page.  Reads only the page's pre-execute state, so it is correct regardless
+ * of the order in which sibling redirects are processed.
+ */
+static OffsetNumber
+heap_prune_chain_find_live(PruneState *prstate, OffsetNumber start)
+{
+	Page		page = prstate->page;
+	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+	OffsetNumber offnum = start;
+	OffsetNumber survivor = start;	/* successor of the last DEAD member */
+	int			loops = 0;
+
+	while (offnum >= FirstOffsetNumber && offnum <= maxoff)
+	{
+		ItemId		lp = PageGetItemId(page, offnum);
+		HTSV_Result status;
+		HeapTupleHeader htup;
+
+		/* A redirect/dead/unused item cannot be a surviving chain member. */
+		if (!ItemIdIsNormal(lp))
+			return InvalidOffsetNumber;
+
+		htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+		/*
+		 * A collapse-survivor stub is an xid-free forwarding node, not a
+		 * chain member; the page scan records it unchanged without computing
+		 * visibility, so its htsv slot is unset.  Step through it to its
+		 * forward link rather than reading htsv, which would trip the
+		 * validity assert.
+		 */
+		if (HotIndexedHeaderIsStub(htup))
+		{
+			offnum = HotIndexedStubGetForward(htup);
+			if (++loops > maxoff)
+				return InvalidOffsetNumber;
+			continue;
+		}
+
+		/* Not a stub, so the page scan recorded a visibility status for it. */
+		status = htsv_get_valid_status(prstate->htsv[offnum]);
+
+		if (status == HEAPTUPLE_DEAD)
+		{
+			/*
+			 * A DEAD member is reclaimed/redirected, so the surviving tail
+			 * starts at its successor.  A DEAD member with no live successor
+			 * means the whole chain is dead.
+			 */
+			if (!HeapTupleHeaderIsHotUpdated(htup) ||
+				ItemPointerGetBlockNumber(&htup->t_ctid) != prstate->block)
+				return InvalidOffsetNumber;
+			offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
+			survivor = offnum;
+		}
+		else if (status == HEAPTUPLE_RECENTLY_DEAD)
+		{
+			/*
+			 * RECENTLY_DEAD members belong to the surviving tail unless a
+			 * DEAD member follows them (which would make them part of the
+			 * dead prefix).  Keep walking to find out, but do not advance the
+			 * survivor; it stays at the successor of the last DEAD member.
+			 */
+			if (!HeapTupleHeaderIsHotUpdated(htup) ||
+				ItemPointerGetBlockNumber(&htup->t_ctid) != prstate->block)
+				return survivor;
+			offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
+		}
+		else
+		{
+			/* LIVE (or in-progress): the surviving tail is settled. */
+			return survivor;
+		}
+
+		if (++loops > maxoff)
+			return InvalidOffsetNumber; /* defend against a corrupt cycle */
+	}
+
+	return InvalidOffsetNumber;
+}
+
 /*
  * Prune specified line pointer or a HOT chain originating at line pointer.
  *
@@ -1518,6 +1647,36 @@ heap_prune_chain(OffsetNumber maxoff, OffsetNumber rootoffnum,
 		if (offnum > maxoff)
 			break;
 
+		/*
+		 * Step transparently through a collapse-survivor stub.  A redirect or
+		 * an earlier stub may forward into a stub that replaced a dead
+		 * mid-chain member; the stub was already recorded (as unchanged) by
+		 * the page scan and is not itself a chain member, so follow its
+		 * forward link rather than stopping at the processed check below.
+		 */
+		{
+			ItemId		slp = PageGetItemId(page, offnum);
+
+			if (ItemIdIsNormal(slp))
+			{
+				HeapTupleHeader shtup = (HeapTupleHeader) PageGetItem(page, slp);
+
+				if (HotIndexedHeaderIsStub(shtup))
+				{
+					/*
+					 * A stub is xid-free, so the xmin/xmax linkage cannot be
+					 * verified across it.  Trust the stub's forward link and
+					 * skip the prior-xmax check for the first member past it
+					 * (otherwise the chain would be severed there, dropping
+					 * its tail).
+					 */
+					offnum = HotIndexedStubGetForward(shtup);
+					priorXmax = InvalidTransactionId;
+					continue;
+				}
+			}
+		}
+
 		/* If item is already processed, stop --- it must not be same chain */
 		if (prstate->processed[offnum])
 			break;
@@ -1624,13 +1783,27 @@ heap_prune_chain(OffsetNumber maxoff, OffsetNumber rootoffnum,
 	if (ItemIdIsRedirected(rootlp) && nchain < 2)
 	{
 		/*
-		 * We found a redirect item that doesn't point to a valid follow-on
-		 * item.  This can happen if the loop in heap_page_prune_and_freeze()
-		 * caused us to visit the dead successor of a redirect item before
-		 * visiting the redirect item.  We can clean up by setting the
-		 * redirect item to LP_DEAD state or LP_UNUSED if the caller
-		 * indicated.
+		 * The walk could not get past the redirect: its target was either
+		 * already processed by a sibling redirect's walk (several redirects
+		 * of a collapsed HOT/SIU chain forward to the same live tuple) or has
+		 * since died and been collapsed further.  Re-point this redirect at
+		 * the chain's current first surviving member so every entry that
+		 * resolves through it still reaches the live tuple.  If no survivor
+		 * remains, the redirect is dangling and is reclaimed (LP_DEAD, or
+		 * LP_UNUSED if the caller allows it).
 		 */
+		OffsetNumber target = ItemIdGetRedirect(rootlp);
+		OffsetNumber live = heap_prune_chain_find_live(prstate, target);
+
+		if (OffsetNumberIsValid(live))
+		{
+			if (live == target)
+				heap_prune_record_unchanged_lp_redirect(prstate, rootoffnum);
+			else
+				heap_prune_record_redirect(prstate, rootoffnum, live, false);
+			return;
+		}
+
 		heap_prune_record_dead_or_unused(prstate, rootoffnum, false);
 		return;
 	}
@@ -1656,24 +1829,126 @@ process_chain:
 	else if (ndeadchain == nchain)
 	{
 		/*
-		 * The entire chain is dead.  Mark the root line pointer LP_DEAD, and
-		 * fully remove the other tuples in the chain.
+		 * The entire chain is dead.  No live tuple remains to forward to, so
+		 * mark the root LP_DEAD (or LP_UNUSED if the caller allows it) and
+		 * reclaim each member.  A dead HOT-selectively-updated member may
+		 * still have a stale btree leaf pointing at it: mark it LP_DEAD so
+		 * the slot is pinned against reuse and added to the dead-items array,
+		 * letting ambulkdelete sweep the leaf and a later vacuum reclaim the
+		 * line pointer.  Classic-HOT members carry no leaf of their own and
+		 * go straight to LP_UNUSED.
 		 */
 		heap_prune_record_dead_or_unused(prstate, rootoffnum, ItemIdIsNormal(rootlp));
 		for (int i = 1; i < nchain; i++)
-			heap_prune_record_unused(prstate, chainitems[i], true);
+		{
+			if (heap_prune_item_preserves_hot_indexed(page, chainitems[i]))
+				heap_prune_record_dead_or_unused(prstate, chainitems[i], true);
+			else
+				heap_prune_record_unused(prstate, chainitems[i], true);
+		}
 	}
 	else
 	{
 		/*
-		 * We found a DEAD tuple in the chain.  Redirect the root line pointer
-		 * to the first non-DEAD tuple, and mark as unused each intermediate
-		 * item that we are able to remove from the chain.
+		 * The chain has a dead prefix followed by a live remainder.  Collapse
+		 * it with PHOT-style key tuples so that the per-hop modified-attrs
+		 * bitmaps survive for the bitmap-overlap read path.
+		 *
+		 * Walk the dead members from the live end backwards, accumulating in
+		 * laterattrs the union of the modified-attrs bitmaps of the members
+		 * that follow (the "later hops").  A dead key tuple -- one that
+		 * carried its own index entries because it changed an indexed
+		 * attribute at its hop (heap_prune_item_preserves_hot_indexed) -- is
+		 * disposed of as follows:
+		 *
+		 * - If every attribute it changed was changed again by a later hop
+		 * (its bitmap is a subset of laterattrs), every index entry pointing
+		 * at it is superseded, so no live entry references it: reclaim it
+		 * (LP_DEAD), which lets this vacuum's index pass sweep its now-stale
+		 * leaves and a later pass free the line pointer.  This loses no
+		 * per-hop information for readers -- its attributes are already
+		 * carried by the surviving later members the reader still crosses.
+		 *
+		 * - Otherwise it introduced an attribute not changed again, so a live
+		 * entry still points at it: keep it as an xid-free stub forwarding to
+		 * the next survivor, preserving its inline bitmap for the read path.
+		 *
+		 * Classic-HOT members carry no entry of their own and are reclaimed
+		 * to LP_UNUSED; survivors forward past them.  The root is redirected
+		 * to the first survivor.  Stubs carry no XIDs, so the page stays
+		 * freeze-safe; because each forwards to the next survivor (not the
+		 * live tuple), a reader crossing the collapsed prefix sees every
+		 * surviving hop's bitmap, and stub->stub forwarding lets a later
+		 * collapse extend the chain without re-pointing existing stubs.
+		 */
+		OffsetNumber first_live = chainitems[ndeadchain];
+		OffsetNumber next_survivor = first_live;
+		OffsetNumber root_target;
+		int			relnatts = RelationGetNumberOfAttributes(prstate->relation);
+		uint8		laterattrs[(MaxHeapAttributeNumber + 7) / 8];
+
+		/*
+		 * laterattrs accumulates every surviving hop's modified attributes.
+		 * Size it for the relation's current natts (the maximum); each
+		 * contributing tuple's bitmap is located and OR-ed using that tuple's
+		 * write-time natts (HotIndexedTupleBitmapNatts), since ADD COLUMN may
+		 * have grown the relation since some hops were written.
 		 */
-		heap_prune_record_redirect(prstate, rootoffnum, chainitems[ndeadchain],
-								   ItemIdIsNormal(rootlp));
-		for (int i = 1; i < ndeadchain; i++)
-			heap_prune_record_unused(prstate, chainitems[i], true);
+		memset(laterattrs, 0, HotIndexedBitmapBytes(relnatts));
+		for (int i = ndeadchain; i < nchain; i++)
+		{
+			ItemId		lp = PageGetItemId(page, chainitems[i]);
+			HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+			if ((htup->t_infomask2 & HEAP_INDEXED_UPDATED) != 0)
+			{
+				int			bmnatts = HotIndexedTupleBitmapNatts(htup);
+
+				HotIndexedBitmapUnion(laterattrs,
+									  HotIndexedGetModifiedBitmap(htup,
+															  ItemIdGetLength(lp),
+															  bmnatts),
+									  bmnatts);
+			}
+		}
+
+		/* dead prefix: reclaim superseded members, stub the rest */
+		for (int i = ndeadchain - 1; i >= 1; i--)
+		{
+			if (heap_prune_item_preserves_hot_indexed(page, chainitems[i]))
+			{
+				ItemId		lp = PageGetItemId(page, chainitems[i]);
+				HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, lp);
+				int			bmnatts = HotIndexedTupleBitmapNatts(htup);
+				const uint8 *attrs = HotIndexedGetModifiedBitmap(htup,
+														 ItemIdGetLength(lp),
+														 bmnatts);
+
+				if (HotIndexedBitmapIsSubset(attrs, laterattrs, bmnatts))
+					heap_prune_record_dead_or_unused(prstate, chainitems[i], true);
+				else
+				{
+					heap_prune_record_stub(prstate, chainitems[i], next_survivor);
+					next_survivor = chainitems[i];
+				}
+				HotIndexedBitmapUnion(laterattrs, attrs, bmnatts);
+			}
+			else
+				heap_prune_record_unused(prstate, chainitems[i], true);
+		}
+
+		root_target = next_survivor;
+
+		/*
+		 * root -> first survivor (skip a redundant no-op redirect on
+		 * re-prune)
+		 */
+		if (ItemIdIsRedirected(rootlp) &&
+			ItemIdGetRedirect(rootlp) == root_target)
+			heap_prune_record_unchanged_lp_redirect(prstate, rootoffnum);
+		else
+			heap_prune_record_redirect(prstate, rootoffnum, root_target,
+									   ItemIdIsNormal(rootlp));
 
 		/* the rest of tuples in the chain are normal, unchanged tuples */
 		for (int i = ndeadchain; i < nchain; i++)
@@ -1718,6 +1993,34 @@ heap_prune_record_redirect(PruneState *prstate,
 	 * separately as an unchanged tuple.
 	 */
 
+	/*
+	 * If the redirect points at a HOT-selectively-updated live tuple, the
+	 * page may still carry stale btree entries that resolve through this
+	 * redirect to a tuple with a different key.  Such entries are filtered
+	 * by the read path's crossed-attribute bitmap, which requires fetching
+	 * the heap tuple -- but an index-only scan trusts the visibility map and
+	 * skips that fetch.  So the page must not be reported all-visible or
+	 * all-frozen while such a redirect exists; it becomes eligible again
+	 * only once vacuum has swept the stale leaves and reclaimed the
+	 * redirect.
+	 */
+	if (rdoffnum >= FirstOffsetNumber &&
+		rdoffnum <= PageGetMaxOffsetNumber(prstate->page))
+	{
+		ItemId		tlp = PageGetItemId(prstate->page, rdoffnum);
+
+		if (ItemIdIsNormal(tlp))
+		{
+			HeapTupleHeader thtup = (HeapTupleHeader) PageGetItem(prstate->page, tlp);
+
+			if ((thtup->t_infomask2 & HEAP_INDEXED_UPDATED) != 0)
+			{
+				prstate->set_all_visible = false;
+				prstate->set_all_frozen = false;
+			}
+		}
+	}
+
 	Assert(prstate->nredirected < MaxHeapTuplesPerPage);
 	prstate->redirected[prstate->nredirected * 2] = offnum;
 	prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
@@ -1735,6 +2038,65 @@ heap_prune_record_redirect(PruneState *prstate,
 	prstate->hastup = true;
 }
 
+/*
+ * Queue 'offnum' to be overwritten in place by a collapse-survivor stub of a
+ * HOT-selectively-updated chain, with 'forward' as the stub's forward link.
+ *
+ * Intended for a dead heap-only key tuple, i.e. one that had btree entries
+ * of its own.  Such an item cannot simply be reclaimed because a stale entry
+ * may still resolve through it.  The stub retains the forward link and the
+ * inline modified-attrs bitmap the read path uses to judge staleness, but
+ * carries no XIDs, so the page remains freeze-safe.
+ */
+static void
+heap_prune_record_stub(PruneState *prstate,
+					   OffsetNumber offnum, OffsetNumber forward)
+{
+	OffsetNumber *pair;
+
+	Assert(!prstate->processed[offnum]);
+	prstate->processed[offnum] = true;
+
+	/*
+	 * A page holding a stub must not be reported all-visible/all-frozen, for
+	 * the same reason as a redirect to a HOT-selectively-updated tuple: an
+	 * index-only scan would trust the VM and skip the recheck that filters
+	 * a now-stale entry resolving through the stub.  In addition, a stub's
+	 * HEAP_XMIN_INVALID makes it invisible to every snapshot, which an
+	 * all-visible page must never contain.  Eligibility returns once the
+	 * stub is reclaimed.
+	 */
+	prstate->set_all_frozen = prstate->set_all_visible = false;
+
+	/* Append the (offset, forward) pair and count the tuple as removed. */
+	Assert(prstate->nstubs < MaxHeapTuplesPerPage);
+	pair = &prstate->stubs[prstate->nstubs * 2];
+	pair[0] = offnum;
+	pair[1] = forward;
+	prstate->nstubs++;
+	prstate->ndeleted++;
+
+	prstate->hastup = true;
+}
+
+/*
+ * Note a pre-existing collapse-survivor stub that this prune leaves alone.
+ *
+ * Reached when the page scan meets a stub left behind by an earlier collapse.
+ * Nothing is queued for it; it is only marked processed.  Its presence sets
+ * hastup and rules out reporting the page all-visible/all-frozen.
+ */
+static void
+heap_prune_record_unchanged_lp_stub(PruneState *prstate, OffsetNumber offnum)
+{
+	Assert(!prstate->processed[offnum]);
+
+	/* a stub forbids setting either VM bit for this page */
+	prstate->set_all_frozen = prstate->set_all_visible = false;
+	prstate->hastup = true;
+	prstate->processed[offnum] = true;
+}
+
 /* Record line pointer to be marked dead */
 static void
 heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
@@ -1816,6 +2178,52 @@ heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_norm
 		prstate->ndeleted++;
 }
 
+
+/*
+ * heap_prune_item_preserves_hot_indexed
+ *		True iff the LP at `offnum` on `page` holds a HOT-indexed (HOT/SIU)
+ *		tuple that a not-yet-swept index entry may still point at, so the LP
+ *		must not be reclaimed straight to LP_UNUSED.
+ *
+ * A HOT-indexed update plants a new index entry pointing at the heap-only
+ * tuple's own TID.  Classic HOT's invariant that mid-chain LPs have no
+ * external references therefore does not hold for such tuples: until
+ * ambulkdelete sweeps any stale index entry, the slot must not be reused.
+ * This is a structural test only; it does not consult visibility.  The
+ * callers in heap_prune_chain apply it to DEAD chain members, which are then
+ * kept as stubs or passed to heap_prune_record_dead_or_unused.
+ *
+ * Excluded from preservation:
+ *   - items that are not LP_NORMAL (REDIRECT, DEAD, UNUSED);
+ *   - tuples without HEAP_INDEXED_UPDATED (classic HOT chain members never
+ *     had a per-tuple index entry planted);
+ *   - tuples with no attributes (collapse-survivor stubs have natts == 0);
+ *   - aborted heap-only tuples (xmin invalid but not frozen; HEAP_XMIN_FROZEN
+ *     also sets the HEAP_XMIN_INVALID bit): no index entry can see them.
+ */
+static bool
+heap_prune_item_preserves_hot_indexed(Page page, OffsetNumber offnum)
+{
+	ItemId		lp = PageGetItemId(page, offnum);
+	HeapTupleHeader htup;
+
+	if (!ItemIdIsNormal(lp))
+		return false;
+
+	htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+	if ((htup->t_infomask2 & HEAP_INDEXED_UPDATED) == 0)
+		return false;
+	if (HeapTupleHeaderGetNatts(htup) == 0)
+		return false;
+	if (HeapTupleHeaderXminInvalid(htup))
+		return false;
+
+	return true;
+}
+
+
+
 /*
  * Record an unused line pointer that is left unchanged.
  */
@@ -2049,8 +2457,44 @@ heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum
 	 */
 	Assert(!prstate->processed[offnum]);
 	prstate->processed[offnum] = true;
+
+	/*
+	 * As in heap_prune_record_redirect: if this redirect forwards to a
+	 * HOT-selectively-updated live tuple, the page may carry stale btree
+	 * entries that resolve through it, so it must not be reported
+	 * all-visible/all-frozen (an index-only scan would otherwise skip the
+	 * crossed-attribute bitmap check).  This must happen here too, not only
+	 * when the redirect is first created, because a re-prune records an
+	 * existing SIU redirect as unchanged.
+	 */
+	{
+		ItemId		lp = PageGetItemId(prstate->page, offnum);
+
+		if (ItemIdIsRedirected(lp))
+		{
+			OffsetNumber rdoffnum = ItemIdGetRedirect(lp);
+
+			if (rdoffnum >= FirstOffsetNumber &&
+				rdoffnum <= PageGetMaxOffsetNumber(prstate->page))
+			{
+				ItemId		tlp = PageGetItemId(prstate->page, rdoffnum);
+
+				if (ItemIdIsNormal(tlp))
+				{
+					HeapTupleHeader thtup = (HeapTupleHeader) PageGetItem(prstate->page, tlp);
+
+					if ((thtup->t_infomask2 & HEAP_INDEXED_UPDATED) != 0)
+					{
+						prstate->set_all_visible = false;
+						prstate->set_all_frozen = false;
+					}
+				}
+			}
+		}
+	}
 }
 
+
 /*
  * Perform the actual page changes needed by heap_page_prune_and_freeze().
  *
@@ -2065,17 +2509,22 @@ void
 heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
 						OffsetNumber *redirected, int nredirected,
 						OffsetNumber *nowdead, int ndead,
-						OffsetNumber *nowunused, int nunused)
+						OffsetNumber *nowunused, int nunused,
+						OffsetNumber *stubs, int nstubs)
 {
 	Page		page = BufferGetPage(buffer);
 	OffsetNumber *offnum;
 	HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY;
 
 	/* Shouldn't be called unless there's something to do */
-	Assert(nredirected > 0 || ndead > 0 || nunused > 0);
+	Assert(nredirected > 0 || ndead > 0 || nunused > 0 || nstubs > 0);
 
-	/* If 'lp_truncate_only', we can only remove already-dead line pointers */
-	Assert(!lp_truncate_only || (nredirected == 0 && ndead == 0));
+	/*
+	 * If 'lp_truncate_only', we can only remove already-dead line pointers
+	 * and re-point redirects (the latter when vacuum reclaims a collapsed
+	 * chain and re-points a root redirect at the surviving tuple).
+	 */
+	Assert(!lp_truncate_only || ndead == 0);
 
 	/* Update all redirected line pointers */
 	offnum = redirected;
@@ -2086,6 +2535,16 @@ heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
 		ItemId		fromlp = PageGetItemId(page, fromoff);
 		ItemId		tolp PG_USED_FOR_ASSERTS_ONLY;
 
+		/*
+		 * A redundant redirect (the LP already redirects to tooff) is a
+		 * harmless no-op.  This arises when a HOT-indexed chain that was
+		 * already collapsed is re-pruned and the root still resolves to the
+		 * same target; skip it so the apply stays idempotent on both primary
+		 * and replay.
+		 */
+		if (ItemIdIsRedirected(fromlp) && ItemIdGetRedirect(fromlp) == tooff)
+			continue;
+
 #ifdef USE_ASSERT_CHECKING
 
 		/*
@@ -2100,7 +2559,16 @@ heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
 			Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp));
 
 			htup = (HeapTupleHeader) PageGetItem(page, fromlp);
-			Assert(!HeapTupleHeaderIsHeapOnly(htup));
+
+			/*
+			 * The redirect source is normally the non-heap-only chain root. A
+			 * HOT/SIU chain collapse additionally redirects dead heap-only
+			 * members that carried their own btree entry to the live tuple,
+			 * so a heap-only redirect source is allowed when it is
+			 * HOT-selectively-updated (HEAP_INDEXED_UPDATED).
+			 */
+			Assert(!HeapTupleHeaderIsHeapOnly(htup) ||
+				   (htup->t_infomask2 & HEAP_INDEXED_UPDATED) != 0);
 		}
 		else
 		{
@@ -2128,12 +2596,55 @@ heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
 		tolp = PageGetItemId(page, tooff);
 		Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp));
 		htup = (HeapTupleHeader) PageGetItem(page, tolp);
+		/* A redirect targets the first surviving member: a heap-only tuple. */
 		Assert(HeapTupleHeaderIsHeapOnly(htup));
 #endif
 
 		ItemIdSetRedirect(fromlp, tooff);
 	}
 
+	/*
+	 * Rewrite collapse-survivor stubs in place.  Each (offset, forward) pair
+	 * names a dead key tuple to be turned into an xid-free forwarding stub:
+	 * permanently invisible (HEAP_XMIN_INVALID|HEAP_XMAX_INVALID), flagged
+	 * HEAP_INDEXED_UPDATED with natts == 0 so consumers recognise it as a
+	 * stub rather than a tuple, heap-only preserved so it remains a valid
+	 * redirect/forward target, and t_ctid.offnum set to the forward offset.
+	 * The item's storage (including its inline modified-attrs bitmap in the
+	 * final bytes) is left undisturbed, so the bitmap survives and need not
+	 * be carried in WAL.
+	 */
+	offnum = stubs;
+	for (int i = 0; i < nstubs; i++)
+	{
+		OffsetNumber off = *offnum++;
+		OffsetNumber forward = *offnum++;
+		ItemId		lp = PageGetItemId(page, off);
+		HeapTupleHeader tup;
+		int			bitmap_natts;
+
+		Assert(ItemIdIsNormal(lp));
+		tup = (HeapTupleHeader) PageGetItem(page, lp);
+
+		/*
+		 * Preserve the tuple's write-time natts before we overwrite the natts
+		 * field with the stub sentinel (0): the trailing modified-attrs bitmap
+		 * was sized with it, and readers need it to locate the bitmap when the
+		 * relation's current natts has since grown (ADD COLUMN).  The stub's
+		 * t_ctid offset half holds the forward link; the block half is unused
+		 * for a stub, so stash the natts there.  This runs identically on the
+		 * primary and in redo (the pre-stub tuple is on both pages), so no WAL
+		 * change is needed.
+		 */
+		bitmap_natts = HeapTupleHeaderGetNatts(tup);
+
+		tup->t_infomask = HEAP_XMIN_INVALID | HEAP_XMAX_INVALID;
+		tup->t_infomask2 = HEAP_ONLY_TUPLE | HEAP_INDEXED_UPDATED;
+		HeapTupleHeaderSetNatts(tup, 0);
+		ItemPointerSetOffsetNumber(&tup->t_ctid, forward);
+		HotIndexedStubSetBitmapNatts(tup, bitmap_natts);
+	}
+
 	/* Update all now-dead line pointers */
 	offnum = nowdead;
 	for (int i = 0; i < ndead; i++)
@@ -2149,12 +2660,23 @@ heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
 		 * an index.  This should never be necessary with any individual
 		 * heap-only tuple item, though. (It's not clear how much of a problem
 		 * that would be, but there is no reason to allow it.)
+		 *
+		 * Exception: a HOT-indexed aborted orphan whose chain root is
+		 * unreachable on this page is intentionally marked LP_DEAD by the
+		 * heap-only-tuples loop in heap_page_prune_and_freeze (see the
+		 * heap_prune_record_dead call there).  The tuple is heap-only (it was
+		 * created by an UPDATE) and carries HEAP_INDEXED_UPDATED; the
+		 * adjacent btree leaf is still live, so we keep the slot pinned via
+		 * LP_DEAD until ambulkdelete sweeps it.  A subsequent vacuum reclaims
+		 * the LP to LP_UNUSED.
 		 */
 		if (ItemIdHasStorage(lp))
 		{
 			Assert(ItemIdIsNormal(lp));
 			htup = (HeapTupleHeader) PageGetItem(page, lp);
-			Assert(!HeapTupleHeaderIsHeapOnly(htup));
+			Assert(!HeapTupleHeaderIsHeapOnly(htup) ||
+				   ((htup->t_infomask2 & HEAP_INDEXED_UPDATED) != 0 &&
+					HeapTupleHeaderGetNatts(htup) > 0));
 		}
 		else
 		{
@@ -2177,7 +2699,7 @@ heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
 
 		if (lp_truncate_only)
 		{
-			/* Setting LP_DEAD to LP_UNUSED in vacuum's second pass */
+			/* Setting LP_DEAD to LP_UNUSED in vacuum's second pass. */
 			Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp));
 		}
 		else
@@ -2188,7 +2710,8 @@ heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
 			 * items to be made LP_UNUSED instead.  This is only possible if
 			 * the relation has no indexes.  If there are any dead items, then
 			 * mark_unused_now was not true and every item being marked
-			 * LP_UNUSED must refer to a heap-only tuple.
+			 * LP_UNUSED must refer to a heap-only tuple whose chain has been
+			 * pruned.
 			 */
 			if (ndead > 0)
 			{
@@ -2264,6 +2787,8 @@ page_verify_redirects(Page page)
 		Assert(ItemIdIsNormal(targitem));
 		Assert(ItemIdHasStorage(targitem));
 		htup = (HeapTupleHeader) PageGetItem(page, targitem);
+
+		/* A redirect targets the first surviving chain member: heap-only. */
 		Assert(HeapTupleHeaderIsHeapOnly(htup));
 	}
 #endif
@@ -2566,7 +3091,8 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 						  HeapTupleFreeze *frozen, int nfrozen,
 						  OffsetNumber *redirected, int nredirected,
 						  OffsetNumber *dead, int ndead,
-						  OffsetNumber *unused, int nunused)
+						  OffsetNumber *unused, int nunused,
+						  OffsetNumber *stubs, int nstubs)
 {
 	xl_heap_prune xlrec;
 	XLogRecPtr	recptr;
@@ -2581,8 +3107,10 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 	xlhp_prune_items redirect_items;
 	xlhp_prune_items dead_items;
 	xlhp_prune_items unused_items;
+	xlhp_prune_items stub_items;
 	OffsetNumber frz_offsets[MaxHeapTuplesPerPage];
-	bool		do_prune = nredirected > 0 || ndead > 0 || nunused > 0;
+	bool		do_prune = nredirected > 0 || ndead > 0 || nunused > 0 ||
+		nstubs > 0;
 	bool		do_set_vm = vmflags & VISIBILITYMAP_VALID_BITS;
 	bool		heap_fpi_allowed = true;
 
@@ -2670,6 +3198,16 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 		XLogRegisterBufData(0, unused,
 							sizeof(OffsetNumber) * nunused);
 	}
+	if (nstubs > 0)
+	{
+		xlrec.flags |= XLHP_HAS_HOT_INDEXED_STUBS;
+
+		stub_items.ntargets = nstubs;
+		XLogRegisterBufData(0, &stub_items,
+							offsetof(xlhp_prune_items, data));
+		XLogRegisterBufData(0, stubs,
+							sizeof(OffsetNumber[2]) * nstubs);
+	}
 	if (nfrozen > 0)
 		XLogRegisterBufData(0, frz_offsets,
 							sizeof(OffsetNumber) * nfrozen);
@@ -2692,8 +3230,17 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 		xlrec.flags |= XLHP_CLEANUP_LOCK;
 	else
 	{
-		Assert(nredirected == 0 && ndead == 0);
-		/* also, any items in 'unused' must've been LP_DEAD previously */
+		/*
+		 * Without a cleanup lock we can only remove already-dead line
+		 * pointers and re-point redirects.  The latter happens when vacuum's
+		 * second pass reclaims a collapsed HOT-indexed chain and re-points
+		 * the root redirect at first_live: that change is made under an
+		 * exclusive lock and preserves the chain's reachability (every walker
+		 * still reaches first_live), so no cleanup lock is needed -- the same
+		 * basis on which this pass already reclaims dead line pointers to
+		 * LP_UNUSED.
+		 */
+		Assert(ndead == 0);
 	}
 	XLogRegisterData(&xlrec, SizeOfHeapPrune);
 	if (TransactionIdIsValid(conflict_xid))
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 39395aed0d5..742b99fbd6d 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -131,6 +131,7 @@
 
 #include "access/genam.h"
 #include "access/heapam.h"
+#include "access/hot_indexed.h"
 #include "access/htup_details.h"
 #include "access/multixact.h"
 #include "access/tidstore.h"
@@ -445,7 +446,8 @@ static bool lazy_vacuum_all_indexes(LVRelState *vacrel);
 static void lazy_vacuum_heap_rel(LVRelState *vacrel);
 static void lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
 								  Buffer buffer, OffsetNumber *deadoffsets,
-								  int num_offsets, Buffer vmbuffer);
+								  int num_offsets, Buffer vmbuffer,
+								  bool got_cleanup_lock);
 static bool lazy_check_wraparound_failsafe(LVRelState *vacrel);
 static void lazy_cleanup_all_indexes(LVRelState *vacrel);
 static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel,
@@ -1972,6 +1974,7 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno,
 										  NULL, 0,
 										  NULL, 0,
 										  NULL, 0,
+										  NULL, 0,
 										  NULL, 0);
 
 			END_CRIT_SECTION();
@@ -2214,6 +2217,7 @@ lazy_scan_noprune(LVRelState *vacrel,
 
 		hastup = true;			/* page prevents rel truncation */
 		tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
+
 		if (heap_tuple_should_freeze(tupleheader, &vacrel->cutoffs,
 									 &NoFreezePageRelfrozenXid,
 									 &NoFreezePageRelminMxid))
@@ -2686,6 +2690,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
 		Size		freespace;
 		OffsetNumber offsets[MaxOffsetNumber];
 		int			num_offsets;
+		bool		got_cleanup_lock;
 
 		vacuum_delay_point(false);
 
@@ -2708,10 +2713,20 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
 		 */
 		visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
 
-		/* We need a non-cleanup exclusive lock to mark dead_items unused */
-		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+		/*
+		 * Setting dead items unused needs only an exclusive lock, but
+		 * reclaiming a collapsed HOT-indexed chain additionally re-points the
+		 * redirects that forward into it -- moving chain structure concurrent
+		 * scans follow, which requires a cleanup lock.  Prefer a cleanup lock
+		 * (as the first pass does); if unavailable, fall back to an exclusive
+		 * lock and lazy_vacuum_heap_page defers any such reclaim on this page
+		 * to a later vacuum that can obtain one.
+		 */
+		got_cleanup_lock = ConditionalLockBufferForCleanup(buf);
+		if (!got_cleanup_lock)
+			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 		lazy_vacuum_heap_page(vacrel, blkno, buf, offsets,
-							  num_offsets, vmbuffer);
+							  num_offsets, vmbuffer, got_cleanup_lock);
 
 		/* Now that we've vacuumed the page, record its available space */
 		page = BufferGetPage(buf);
@@ -2746,6 +2761,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
 	restore_vacuum_error_info(vacrel, &saved_err_info);
 }
 
+
 /*
  *	lazy_vacuum_heap_page() -- free page's LP_DEAD items listed in the
  *						  vacrel->dead_items store.
@@ -2757,7 +2773,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
 static void
 lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
 					  OffsetNumber *deadoffsets, int num_offsets,
-					  Buffer vmbuffer)
+					  Buffer vmbuffer, bool got_cleanup_lock)
 {
 	Page		page = BufferGetPage(buffer);
 	OffsetNumber unused[MaxHeapTuplesPerPage];
@@ -2783,7 +2799,9 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
 	 * and mark the page all-visible within the same critical section,
 	 * enabling both changes to be emitted in a single WAL record. Since the
 	 * visibility checks may perform I/O and allocate memory, they must be
-	 * done outside the critical section.
+	 * done outside the critical section.  NOTE(review): this function does
+	 * not defer any reclaim (it only sets LP_DEAD items LP_UNUSED), so the
+	 * check below always runs -- confirm no skip-on-deferral path is needed.
 	 */
 	if (heap_page_would_be_all_visible(vacrel->rel, buffer,
 									   vacrel->vistest, true,
@@ -2815,13 +2833,12 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
 
 		itemid = PageGetItemId(page, toff);
 
+		/* A reclaimable item is a classic LP_DEAD line pointer. */
 		Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid));
 		ItemIdSetUnused(itemid);
 		unused[nunused++] = toff;
 	}
 
-	Assert(nunused > 0);
-
 	/* Attempt to truncate line pointer array now */
 	PageTruncateLinePointerArray(page);
 
@@ -2851,12 +2868,13 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
 								  vmflags != 0 ? vmbuffer : InvalidBuffer,
 								  vmflags,
 								  conflict_xid,
-								  false,	/* no cleanup lock required */
+								  got_cleanup_lock,
 								  PRUNE_VACUUM_CLEANUP,
 								  NULL, 0,	/* frozen */
 								  NULL, 0,	/* redirected */
 								  NULL, 0,	/* dead */
-								  unused, nunused);
+								  unused, nunused,
+								  NULL, 0); /* stubs */
 	}
 
 	END_CRIT_SECTION();
@@ -3647,10 +3665,44 @@ heap_page_would_be_all_visible(Relation rel, Buffer buf,
 		*logging_offnum = offnum;
 		itemid = PageGetItemId(page, offnum);
 
-		/* Unused or redirect line pointers are of no interest */
-		if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
+		/* Unused line pointers are of no interest. */
+		if (!ItemIdIsUsed(itemid))
 			continue;
 
+		/*
+		 * Plain redirects are of no interest (the chain member they point at
+		 * is inspected separately) -- except a redirect that forwards to a
+		 * HOT-selectively-updated live tuple.  Such a redirect may still be
+		 * reached by a stale index entry whose key the live tuple no longer
+		 * holds; if the page were marked all-visible an index-only scan would
+		 * trust the VM, skip the heap fetch, and surface that stale key.
+		 * Keep the page not-all-visible until the stale leaves are swept and
+		 * the redirect reclaimed.  This mirrors the guard in
+		 * heap_prune_record_redirect, applied here because VACUUM's second
+		 * pass can set all-visible after reclaiming other items on the page.
+		 */
+		if (ItemIdIsRedirected(itemid))
+		{
+			OffsetNumber rdoff = ItemIdGetRedirect(itemid);
+
+			if (rdoff >= FirstOffsetNumber && rdoff <= maxoff)
+			{
+				ItemId		tlp = PageGetItemId(page, rdoff);
+
+				if (ItemIdIsNormal(tlp))
+				{
+					HeapTupleHeader thtup = (HeapTupleHeader) PageGetItem(page, tlp);
+
+					if ((thtup->t_infomask2 & HEAP_INDEXED_UPDATED) != 0)
+					{
+						*all_frozen = all_visible = false;
+						break;
+					}
+				}
+			}
+			continue;
+		}
+
 		ItemPointerSet(&(tuple.t_self), blockno, offnum);
 
 		/*
@@ -3676,6 +3728,24 @@ heap_page_would_be_all_visible(Relation rel, Buffer buf,
 		tuple.t_len = ItemIdGetLength(itemid);
 		tuple.t_tableOid = RelationGetRelid(rel);
 
+		/*
+		 * A HOT-indexed collapse-survivor stub is an LP_NORMAL item that is
+		 * not a real tuple: it forwards through the chain and carries a
+		 * preserved modified-attrs bitmap that a reader arriving via a stale
+		 * leaf must still cross.  A page holding one must stay not-all-visible
+		 * so index-only scans heap-fetch through the chain, exactly like the
+		 * redirect-to-SIU case above.  A stub's header is frozen-invalid
+		 * (HEAP_XMIN_INVALID), so the visibility check below would also class
+		 * it not-all-visible -- but recognize it explicitly here rather than
+		 * relying on that side effect, so the guard cannot silently lapse if
+		 * the stub encoding ever changes.
+		 */
+		if (HotIndexedHeaderIsStub(tuple.t_data))
+		{
+			*all_frozen = all_visible = false;
+			break;
+		}
+
 		/* Visibility checks may do IO or allocate memory */
 		Assert(CritSectionCount == 0);
 		switch (HeapTupleSatisfiesVacuumHorizon(&tuple, buf, &dead_after))
diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c
index 75ae6f9d375..97f925df161 100644
--- a/src/backend/access/rmgrdesc/heapdesc.c
+++ b/src/backend/access/rmgrdesc/heapdesc.c
@@ -108,7 +108,8 @@ heap_xlog_deserialize_prune_and_freeze(char *cursor, uint16 flags,
 									   OffsetNumber **frz_offsets,
 									   int *nredirected, OffsetNumber **redirected,
 									   int *ndead, OffsetNumber **nowdead,
-									   int *nunused, OffsetNumber **nowunused)
+									   int *nunused, OffsetNumber **nowunused,
+									   int *nstubs, OffsetNumber **stubs)
 {
 	if (flags & XLHP_HAS_FREEZE_PLANS)
 	{
@@ -178,6 +179,23 @@ heap_xlog_deserialize_prune_and_freeze(char *cursor, uint16 flags,
 		*nowunused = NULL;
 	}
 
+	if (flags & XLHP_HAS_HOT_INDEXED_STUBS)
+	{
+		xlhp_prune_items *subrecord = (xlhp_prune_items *) cursor;
+
+		*nstubs = subrecord->ntargets;
+		Assert(*nstubs > 0);
+		*stubs = &subrecord->data[0];
+
+		cursor += offsetof(xlhp_prune_items, data);
+		cursor += sizeof(OffsetNumber[2]) * *nstubs;
+	}
+	else
+	{
+		*nstubs = 0;
+		*stubs = NULL;
+	}
+
 	*frz_offsets = (OffsetNumber *) cursor;
 }
 
@@ -305,6 +323,8 @@ heap2_desc(StringInfo buf, XLogReaderState *record)
 			int			nredirected;
 			int			nunused;
 			int			ndead;
+			int			nstubs;
+			OffsetNumber *stubs;
 			int			nplans;
 			xlhp_freeze_plan *plans;
 			OffsetNumber *frz_offsets;
@@ -315,10 +335,11 @@ heap2_desc(StringInfo buf, XLogReaderState *record)
 												   &nplans, &plans, &frz_offsets,
 												   &nredirected, &redirected,
 												   &ndead, &nowdead,
-												   &nunused, &nowunused);
+												   &nunused, &nowunused,
+												   &nstubs, &stubs);
 
-			appendStringInfo(buf, ", nplans: %u, nredirected: %u, ndead: %u, nunused: %u",
-							 nplans, nredirected, ndead, nunused);
+			appendStringInfo(buf, ", nplans: %u, nredirected: %u, ndead: %u, nunused: %u, nstubs: %u",
+							 nplans, nredirected, ndead, nunused, nstubs);
 
 			if (nplans > 0)
 			{
@@ -347,6 +368,13 @@ heap2_desc(StringInfo buf, XLogReaderState *record)
 				array_desc(buf, nowunused, sizeof(OffsetNumber), nunused,
 						   &offset_elem_desc, NULL);
 			}
+
+			if (nstubs > 0)
+			{
+				appendStringInfoString(buf, ", stubs:");
+				array_desc(buf, stubs, sizeof(OffsetNumber) * 2,
+						   nstubs, &redirect_elem_desc, NULL);
+			}
 		}
 	}
 	else if (info == XLOG_HEAP2_MULTI_INSERT)
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 1cdf891055c..72c9e4203c5 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -488,7 +488,8 @@ extern void heap_page_prune_and_freeze(PruneFreezeParams *params,
 extern void heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
 									OffsetNumber *redirected, int nredirected,
 									OffsetNumber *nowdead, int ndead,
-									OffsetNumber *nowunused, int nunused);
+									OffsetNumber *nowunused, int nunused,
+									OffsetNumber *stubs, int nstubs);
 extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
 extern void log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 									  Buffer vmbuffer, uint8 vmflags,
@@ -498,7 +499,8 @@ extern void log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 									  HeapTupleFreeze *frozen, int nfrozen,
 									  OffsetNumber *redirected, int nredirected,
 									  OffsetNumber *dead, int ndead,
-									  OffsetNumber *unused, int nunused);
+									  OffsetNumber *unused, int nunused,
+									  OffsetNumber *stubs, int nstubs);
 
 /* in heap/heapam.c */
 
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index fdca7d821c8..8997a505006 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -273,6 +273,10 @@ typedef struct xl_heap_update
  *		uint16				nunused
  *		OffsetNumber		nowunused[nunused]
  *
+ *	xlhp_prune_items
+ *		uint16				nstubs
+ *		OffsetNumber		stubs[2 * nstubs]
+ *
  *	OffsetNumber			frz_offsets[sum([plan.ntuples for plan in plans])]
  *-----------------------------------------------------------------------------
  *
@@ -341,6 +345,18 @@ typedef struct xl_heap_prune
 #define		XLHP_VM_ALL_VISIBLE			(1 << 8)
 #define		XLHP_VM_ALL_FROZEN			(1 << 9)
 
+/*
+ * Indicates that an xlhp_prune_items sub-record with HOT-selectively-updated
+ * collapse-survivor stubs is present.  Each pair (offset, forward) names a
+ * line pointer to be rewritten in place into an xid-free forwarding stub
+ * (HEAP_XMIN_INVALID|HEAP_XMAX_INVALID, HEAP_ONLY_TUPLE|HEAP_INDEXED_UPDATED,
+ * natts==0) whose t_ctid.offnum is set to the forward offset.  The stub's
+ * modified-attrs bitmap is already present in the item on the page (it is the
+ * pre-prune tuple's inline bitmap, left undisturbed), so it is not carried in
+ * the WAL.
+ */
+#define		XLHP_HAS_HOT_INDEXED_STUBS	(1 << 10)
+
 /*
  * xlhp_freeze_plan describes how to freeze a group of one or more heap tuples
  * (appears in xl_heap_prune's xlhp_freeze_plans sub-record)
@@ -494,6 +510,7 @@ extern void heap_xlog_deserialize_prune_and_freeze(char *cursor, uint16 flags,
 												   OffsetNumber **frz_offsets,
 												   int *nredirected, OffsetNumber **redirected,
 												   int *ndead, OffsetNumber **nowdead,
-												   int *nunused, OffsetNumber **nowunused);
+												   int *nunused, OffsetNumber **nowunused,
+												   int *nstubs, OffsetNumber **stubs);
 
 #endif							/* HEAPAM_XLOG_H */
-- 
2.50.1

