From a183e46174ba0893df3e08347911f0f59108e446 Mon Sep 17 00:00:00 2001
From: Greg Burd <greg@burd.me>
Date: Wed, 17 Jun 2026 21:31:13 -0400
Subject: [PATCH v48 6/9] Teach amcheck to recognize HOT-indexed chains and
 collapse stubs

verify_heapam must not flag the HOT-indexed artifacts as corruption: a live
HEAP_INDEXED_UPDATED heap-only tuple whose mid-chain line pointer is preserved
because an index entry still points at it, an xid-free collapse-survivor stub,
and more than one LP_REDIRECT forwarding to the same live tuple are all
legitimate.  Recognize them and continue checking the rest of the chain.

Cover this with an amcheck regression test, and add a pg_upgrade test that
carries a relation with HOT-indexed chains, an ABA-cycled indexed column, an
out-of-line indexed column, and VACUUM-collapsed stubs across an upgrade,
verifying the data, verify_heapam, bt_index_check, and the chain scans on the
new cluster.

Authored-by: Greg Burd <greg@burd.me>
---
 contrib/amcheck/expected/check_heap.out    |  40 +++++++
 contrib/amcheck/sql/check_heap.sql         |  37 +++++++
 contrib/amcheck/verify_heapam.c            |  81 ++++++++++++--
 src/backend/access/heap/README.HOT-INDEXED |  10 ++
 src/bin/pg_upgrade/meson.build             |   1 +
 src/bin/pg_upgrade/t/009_hot_indexed.pl    | 118 +++++++++++++++++++++
 6 files changed, 277 insertions(+), 10 deletions(-)
 create mode 100644 src/bin/pg_upgrade/t/009_hot_indexed.pl

diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out
index 979e5e84e72..b8dee2bb71b 100644
--- a/contrib/amcheck/expected/check_heap.out
+++ b/contrib/amcheck/expected/check_heap.out
@@ -231,6 +231,46 @@ SELECT * FROM verify_heapam('test_foreign_table',
 							endblock := NULL);
 ERROR:  cannot check relation "test_foreign_table"
 DETAIL:  This operation is not supported for foreign tables.
+-- HOT-indexed (HOT/SIU) on-page artifacts:
+--
+-- A HOT-indexed UPDATE keeps the new tuple on the same page as a heap-only
+-- tuple marked HEAP_INDEXED_UPDATED and plants index entries pointing at its
+-- own TID.  Pruning a chain of such updates collapses dead members to
+-- LP_REDIRECT forwarders and preserves the LP of a live HOT-indexed member
+-- whose index entries may not yet be swept.  verify_heapam must treat all of
+-- these as legitimate.  This scenario exercises them and asserts that
+-- verify_heapam reports zero corruption against legitimate HOT-indexed
+-- activity.
+CREATE TABLE hot_indexed_check (id int PRIMARY KEY, c1 int, c2 int, c3 int)
+	WITH (fillfactor = 70);
+CREATE INDEX hot_indexed_check_c1 ON hot_indexed_check (c1);
+CREATE INDEX hot_indexed_check_c2 ON hot_indexed_check (c2);
+INSERT INTO hot_indexed_check
+	SELECT g, g, g, g FROM generate_series(1, 200) g;
+-- Single-step UPDATEs: each row gets one HOT-indexed update.  Each
+-- successful HOT-indexed update keeps its new tuple on-page and inserts an
+-- entry only into the index whose attribute changed.
+UPDATE hot_indexed_check SET c1 = c1 + 1000;
+-- Multi-step UPDATEs: drive several successive HOT-indexed updates against
+-- the same rows so prune sees a chain of dead intermediates and collapses
+-- them to LP_REDIRECT forwarders.  An explicit VACUUM runs the prune path
+-- and exercises chain collapse.
+UPDATE hot_indexed_check SET c2 = c2 + 1 WHERE id <= 50;
+UPDATE hot_indexed_check SET c2 = c2 + 1 WHERE id <= 50;
+UPDATE hot_indexed_check SET c2 = c2 + 1 WHERE id <= 50;
+VACUUM (INDEX_CLEANUP off) hot_indexed_check;
+-- verify_heapam must not report any corruption against legitimate HOT-
+-- indexed artifacts.  Selecting the corrupting message makes any
+-- regression unmistakable in the regress diff.
+SELECT blkno, offnum, attnum, msg
+	FROM verify_heapam('hot_indexed_check',
+					   startblock := NULL,
+					   endblock := NULL);
+ blkno | offnum | attnum | msg 
+-------+--------+--------+-----
+(0 rows)
+
+DROP TABLE hot_indexed_check;
 -- cleanup
 DROP TABLE heaptest;
 DROP TABLESPACE regress_test_stats_tblspc;
diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql
index 1745bae634e..c0ba2635180 100644
--- a/contrib/amcheck/sql/check_heap.sql
+++ b/contrib/amcheck/sql/check_heap.sql
@@ -138,6 +138,43 @@ SELECT * FROM verify_heapam('test_foreign_table',
 							startblock := NULL,
 							endblock := NULL);
 
+-- HOT-indexed (HOT/SIU) on-page artifacts:
+--
+-- A HOT-indexed UPDATE keeps the new tuple on the same page as a heap-only
+-- tuple marked HEAP_INDEXED_UPDATED and plants index entries pointing at its
+-- own TID.  Pruning a chain of such updates collapses dead members to
+-- LP_REDIRECT forwarders and preserves the LP of a live HOT-indexed member
+-- whose index entries may not yet be swept.  verify_heapam must treat all of
+-- these as legitimate.  This scenario exercises them and asserts that
+-- verify_heapam reports zero corruption against legitimate HOT-indexed
+-- activity.
+CREATE TABLE hot_indexed_check (id int PRIMARY KEY, c1 int, c2 int, c3 int)
+	WITH (fillfactor = 70);
+CREATE INDEX hot_indexed_check_c1 ON hot_indexed_check (c1);
+CREATE INDEX hot_indexed_check_c2 ON hot_indexed_check (c2);
+INSERT INTO hot_indexed_check
+	SELECT g, g, g, g FROM generate_series(1, 200) g;
+-- Single-step UPDATEs: each row gets one HOT-indexed update.  Each
+-- successful HOT-indexed update keeps its new tuple on-page and inserts an
+-- entry only into the index whose attribute changed.
+UPDATE hot_indexed_check SET c1 = c1 + 1000;
+-- Multi-step UPDATEs: drive several successive HOT-indexed updates against
+-- the same rows so prune sees a chain of dead intermediates and collapses
+-- them to LP_REDIRECT forwarders.  An explicit VACUUM runs the prune path
+-- and exercises chain collapse.
+UPDATE hot_indexed_check SET c2 = c2 + 1 WHERE id <= 50;
+UPDATE hot_indexed_check SET c2 = c2 + 1 WHERE id <= 50;
+UPDATE hot_indexed_check SET c2 = c2 + 1 WHERE id <= 50;
+VACUUM (INDEX_CLEANUP off) hot_indexed_check;
+-- verify_heapam must not report any corruption against legitimate HOT-
+-- indexed artifacts.  Selecting the corrupting message makes any
+-- regression unmistakable in the regress diff.
+SELECT blkno, offnum, attnum, msg
+	FROM verify_heapam('hot_indexed_check',
+					   startblock := NULL,
+					   endblock := NULL);
+DROP TABLE hot_indexed_check;
+
 -- cleanup
 DROP TABLE heaptest;
 DROP TABLESPACE regress_test_stats_tblspc;
diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c
index 20ff58aa782..73b20623381 100644
--- a/contrib/amcheck/verify_heapam.c
+++ b/contrib/amcheck/verify_heapam.c
@@ -13,8 +13,10 @@
 #include "access/detoast.h"
 #include "access/genam.h"
 #include "access/heaptoast.h"
+#include "access/hot_indexed.h"
 #include "access/multixact.h"
 #include "access/relation.h"
+
 #include "access/table.h"
 #include "access/toast_internals.h"
 #include "access/visibilitymap.h"
@@ -522,9 +524,12 @@ verify_heapam(PG_FUNCTION_ARGS)
 			 */
 			if (ItemIdIsRedirected(ctx.itemid))
 			{
-				OffsetNumber rdoffnum = ItemIdGetRedirect(ctx.itemid);
+				OffsetNumber rdoffnum;
 				ItemId		rditem;
 
+				/* Resolve the redirect's target offset. */
+				rdoffnum = ItemIdGetRedirect(ctx.itemid);
+
 				if (rdoffnum < FirstOffsetNumber)
 				{
 					report_corruption(&ctx,
@@ -615,18 +620,38 @@ verify_heapam(PG_FUNCTION_ARGS)
 			ctx.tuphdr = (HeapTupleHeader) PageGetItem(ctx.page, ctx.itemid);
 			ctx.natts = HeapTupleHeaderGetNatts(ctx.tuphdr);
 
-			/* Ok, ready to check this next tuple */
-			check_tuple(&ctx,
-						&xmin_commit_status_ok[ctx.offnum],
-						&xmin_commit_status[ctx.offnum]);
+			/*
+			 * A HOT-selectively-updated collapse-survivor stub is an
+			 * LP_NORMAL item that is not a real tuple: HEAP_INDEXED_UPDATED
+			 * with natts == 0, permanently invisible (HEAP_XMIN_INVALID),
+			 * carrying a forward link and a modified-attrs bitmap.  The
+			 * per-tuple checks assume a real tuple and would misreport it, so
+			 * skip them; the update-chain pass below still records its
+			 * forward edge and treats it like a redirect (a forwarding node).
+			 */
+			if (!HotIndexedHeaderIsStub(ctx.tuphdr))
+				check_tuple(&ctx,
+							&xmin_commit_status_ok[ctx.offnum],
+							&xmin_commit_status[ctx.offnum]);
 
 			/*
 			 * If the CTID field of this tuple seems to point to another tuple
 			 * on the same page, record that tuple as the successor of this
-			 * one.
+			 * one.  A collapse-survivor stub stores its forward link in the
+			 * t_ctid offset only (the block half is repurposed to hold the
+			 * stub's write-time natts), so resolve its successor via the stub
+			 * accessor; the forward target is always on the same page.
 			 */
-			nextblkno = ItemPointerGetBlockNumber(&(ctx.tuphdr)->t_ctid);
-			nextoffnum = ItemPointerGetOffsetNumber(&(ctx.tuphdr)->t_ctid);
+			if (HotIndexedHeaderIsStub(ctx.tuphdr))
+			{
+				nextblkno = ctx.blkno;
+				nextoffnum = HotIndexedStubGetForward(ctx.tuphdr);
+			}
+			else
+			{
+				nextblkno = ItemPointerGetBlockNumber(&(ctx.tuphdr)->t_ctid);
+				nextoffnum = ItemPointerGetOffsetNumber(&(ctx.tuphdr)->t_ctid);
+			}
 			if (nextblkno == ctx.blkno && nextoffnum != ctx.offnum &&
 				nextoffnum >= FirstOffsetNumber && nextoffnum <= maxoff)
 				successor[ctx.offnum] = nextoffnum;
@@ -675,7 +700,7 @@ verify_heapam(PG_FUNCTION_ARGS)
 				 */
 				Assert(ItemIdIsNormal(next_lp));
 
-				/* Can only redirect to a HOT tuple. */
+				/* A redirect targets the first surviving chain member. */
 				next_htup = (HeapTupleHeader) PageGetItem(ctx.page, next_lp);
 				if (!HeapTupleHeaderIsHeapOnly(next_htup))
 				{
@@ -687,6 +712,19 @@ verify_heapam(PG_FUNCTION_ARGS)
 				/* HOT chains should not intersect. */
 				if (predecessor[nextoffnum] != InvalidOffsetNumber)
 				{
+					/*
+					 * In the HOT/SIU model several redirects legitimately
+					 * forward to the same live tuple: when a chain collapses,
+					 * the root and each entry-bearing dead member become a
+					 * redirect to first_live so every stale btree entry still
+					 * resolves there (the read path then rechecks the leaf
+					 * key).  Multiple predecessors are therefore expected
+					 * when the target is HOT-selectively-updated; keep the
+					 * first predecessor and do not report it as corruption.
+					 */
+					if ((next_htup->t_infomask2 & HEAP_INDEXED_UPDATED) != 0)
+						continue;
+
 					report_corruption(&ctx,
 									  psprintf("redirect line pointer points to offset %d, but offset %d also points there",
 											   nextoffnum, predecessor[nextoffnum]));
@@ -701,6 +739,30 @@ verify_heapam(PG_FUNCTION_ARGS)
 				continue;
 			}
 
+			/*
+			 * A collapse-survivor stub forwards like a redirect: it is not a
+			 * real tuple, so don't apply the tuple-to-tuple update-chain
+			 * checks, but do record the predecessor edge to its target so the
+			 * live tuple it ultimately forwards to is not mistaken for a
+			 * chain root.  Its target must be heap-only (another stub or the
+			 * live heap-only tuple).
+			 */
+			curr_htup = (HeapTupleHeader) PageGetItem(ctx.page, curr_lp);
+			if (HotIndexedHeaderIsStub(curr_htup))
+			{
+				if (ItemIdIsNormal(next_lp))
+				{
+					next_htup = (HeapTupleHeader) PageGetItem(ctx.page, next_lp);
+					if (!HeapTupleHeaderIsHeapOnly(next_htup))
+						report_corruption(&ctx,
+										  psprintf("HOT-indexed stub forwards to a non-heap-only tuple at offset %d",
+												   nextoffnum));
+					else if (predecessor[nextoffnum] == InvalidOffsetNumber)
+						predecessor[nextoffnum] = ctx.offnum;
+				}
+				continue;
+			}
+
 			/*
 			 * If the next line pointer is a redirect, or if it's a tuple but
 			 * the XMAX of this tuple doesn't match the XMIN of the next
@@ -709,7 +771,6 @@ verify_heapam(PG_FUNCTION_ARGS)
 			 */
 			if (ItemIdIsRedirected(next_lp))
 				continue;
-			curr_htup = (HeapTupleHeader) PageGetItem(ctx.page, curr_lp);
 			curr_xmax = HeapTupleHeaderGetUpdateXid(curr_htup);
 			next_htup = (HeapTupleHeader) PageGetItem(ctx.page, next_lp);
 			next_xmin = HeapTupleHeaderGetXmin(next_htup);
diff --git a/src/backend/access/heap/README.HOT-INDEXED b/src/backend/access/heap/README.HOT-INDEXED
index ab4f8bc1881..1f41b0fffe8 100644
--- a/src/backend/access/heap/README.HOT-INDEXED
+++ b/src/backend/access/heap/README.HOT-INDEXED
@@ -244,6 +244,16 @@ stale entries (enforced in heap_prune_record_redirect, the stub recorders, and
 heap_page_would_be_all_visible).
 
 
+amcheck and statistics
+----------------------
+
+verify_heapam treats the HOT-indexed artifacts as legitimate: a live
+HEAP_INDEXED_UPDATED heap-only tuple whose line pointer is preserved, a
+collapse-survivor stub, and multiple LP_REDIRECTs forwarding to one live tuple.
+
+Statistics: pg_stat_all_tables.n_tup_hot_indexed_upd counts HOT-indexed updates.
+
+
 Appendices
 ----------
 
diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build
index ffbf6ae8d75..0a6fa0dcff2 100644
--- a/src/bin/pg_upgrade/meson.build
+++ b/src/bin/pg_upgrade/meson.build
@@ -69,6 +69,7 @@ tests += {
       't/006_transfer_modes.pl',
       't/007_multixact_conversion.pl',
       't/008_extension_control_path.pl',
+      't/009_hot_indexed.pl',
     ],
     'deps': [test_ext],
     'test_kwargs': {'priority': 40}, # pg_upgrade tests are slow
diff --git a/src/bin/pg_upgrade/t/009_hot_indexed.pl b/src/bin/pg_upgrade/t/009_hot_indexed.pl
new file mode 100644
index 00000000000..b71129ae03d
--- /dev/null
+++ b/src/bin/pg_upgrade/t/009_hot_indexed.pl
@@ -0,0 +1,118 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+# pg_upgrade must preserve HOT-indexed on-disk state.  A relation that has
+# accumulated HOT-indexed chains -- including a value cycled away and back
+# (ABA), an out-of-line (TOAST) indexed column, and chains collapsed to
+# xid-free forwarding stubs by VACUUM -- must come through an upgrade with its
+# data intact, its indexes structurally sound, and its chains still scanning
+# correctly.  pg_upgrade transfers heap and index files verbatim, so this is
+# really a check that the new-state bits (HEAP_INDEXED_UPDATED, collapse stubs)
+# are not rejected by pg_upgrade's checks and stay correct on the new cluster.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $mode = $ENV{PG_TEST_PG_UPGRADE_MODE} || '--copy';
+
+my $oldnode = PostgreSQL::Test::Cluster->new('old_node');
+$oldnode->init;
+$oldnode->start;
+
+# Build a relation with several secondary indexes so single-column updates
+# stay HOT-indexed, then exercise the cases that produce interesting on-disk
+# state.
+$oldnode->safe_psql('postgres', q{
+	CREATE EXTENSION amcheck;
+	CREATE TABLE hi (id int PRIMARY KEY, k int, v int, big text)
+	  WITH (fillfactor = 50);
+	ALTER TABLE hi ALTER COLUMN big SET STORAGE EXTERNAL;
+	CREATE INDEX hi_k ON hi (k);
+	CREATE INDEX hi_v ON hi (v);
+	CREATE INDEX hi_big ON hi (big);
+	INSERT INTO hi SELECT g, g, g * 10, repeat(chr(64 + g), 2000)
+	  FROM generate_series(1, 20) g;
+});
+
+# Interleave updates of different indexed columns on the same rows.  A member
+# that changed a column not changed again by a later hop survives VACUUM as a
+# collapse stub; the rest are reclaimed.  Rows 1-10 all cycle k away and back
+# (ABA); row 2 additionally rewrites its toasted indexed column.
+$oldnode->safe_psql('postgres', q{
+	UPDATE hi SET k = k + 100 WHERE id <= 10;   -- changes k
+	UPDATE hi SET v = v + 1   WHERE id <= 10;    -- changes v (survives as stub)
+	UPDATE hi SET k = k - 100 WHERE id <= 10;    -- k back to original (ABA)
+	UPDATE hi SET big = repeat('Z', 2000) WHERE id = 2;
+});
+# Collapse dead chain members to stubs.
+$oldnode->safe_psql('postgres', 'VACUUM (INDEX_CLEANUP off) hi');
+
+# The pre-upgrade state must already be self-consistent.
+is( $oldnode->safe_psql('postgres',
+		q{SELECT count(*) FROM verify_heapam('hi')}),
+	'0', 'pre-upgrade heap is consistent');
+
+# Snapshot the data we will compare after the upgrade.
+my $expect = $oldnode->safe_psql('postgres',
+	q{SELECT id, k, v, length(big) FROM hi ORDER BY id});
+
+$oldnode->stop;
+
+# New cluster, same version.
+my $newnode = PostgreSQL::Test::Cluster->new('new_node');
+$newnode->init;
+
+my $oldbindir = $oldnode->config_data('--bindir');
+my $newbindir = $newnode->config_data('--bindir');
+
+# Run pg_upgrade from a writable directory (matches 002_pg_upgrade).
+chdir ${PostgreSQL::Test::Utils::tmp_check};
+
+command_ok(
+	[
+		'pg_upgrade', '--no-sync',
+		'--old-datadir' => $oldnode->data_dir,
+		'--new-datadir' => $newnode->data_dir,
+		'--old-bindir' => $oldbindir,
+		'--new-bindir' => $newbindir,
+		'--socketdir' => $newnode->host,
+		'--old-port' => $oldnode->port,
+		'--new-port' => $newnode->port,
+		$mode,
+	],
+	'run of pg_upgrade for HOT-indexed relation');
+
+$newnode->start;
+
+# Data survived intact.
+my $got = $newnode->safe_psql('postgres',
+	q{SELECT id, k, v, length(big) FROM hi ORDER BY id});
+is($got, $expect, 'HOT-indexed table data preserved across pg_upgrade');
+
+# Heap and indexes are structurally sound on the new cluster.
+is( $newnode->safe_psql('postgres',
+		q{SELECT count(*) FROM verify_heapam('hi')}),
+	'0', 'post-upgrade heap is consistent (collapse stubs recognised)');
+is( $newnode->safe_psql('postgres', q{
+		SELECT count(*) FROM (
+			SELECT bt_index_check(c.oid)
+			FROM pg_class c JOIN pg_index i ON i.indexrelid = c.oid
+			WHERE i.indrelid = 'hi'::regclass) s}),
+	'4', 'post-upgrade indexes pass bt_index_check');
+
+# The ABA chain on row 1 still scans correctly through a forced index scan:
+# k=1 returns exactly the one live row, and its superseded value is gone.
+is( $newnode->safe_psql('postgres', q{
+		SET enable_seqscan = off; SET enable_bitmapscan = off;
+		SELECT count(*) FROM hi WHERE k = 1}),
+	'1', 'post-upgrade index scan returns the ABA row once');
+is( $newnode->safe_psql('postgres', q{
+		SET enable_seqscan = off; SET enable_bitmapscan = off;
+		SELECT count(*) FROM hi WHERE k = 101}),
+	'0', 'post-upgrade index scan drops the superseded value');
+
+$newnode->stop;
+
+done_testing();
-- 
2.50.1

