# Copyright (c) 2026, PostgreSQL Global Development Group
#
# Verify that data committed before a recovery target stays visible after a
# standby pauses at that target from an OVERFLOWED (STANDBY_SNAPSHOT_PENDING)
# snapshot and recovery is later continued past the target.
#
# This is a regression test for a hint-bit corruption introduced together with
# enabling hot-standby connections at an end-of-recovery pause taken from an
# overflowed snapshot.  A read-only query served during such a pause, over a
# tuple whose xmin is a still-running overflowed subtransaction, takes the
# "must have aborted or crashed" path in HeapTupleSatisfiesMVCC() and stamps a
# HEAP_XMIN_INVALID hint.  With data checksums and wal_log_hints off, that hint
# is dirtied during recovery and written to disk; continuing recovery past the
# transaction's COMMIT (the documented "shut down, raise the target, restart to
# continue recovery" workflow) never reconciles the on-disk hint with CLOG, so
# the committed rows become permanently invisible.
#
# The test pauses a standby at a target BEFORE the overflow transaction commits,
# runs a query there (which stamps the hints), then continues recovery to a
# target AFTER the commit and checks that the rows are visible.  On the buggy
# code the final visibility check sees 0 rows instead of 70.

use strict;
use warnings FATAL => 'all';
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;

# Set up the primary with streaming and WAL archiving enabled, and with both
# data checksums and wal_log_hints turned off.  With neither in effect,
# XLogHintBitIsNeeded() is false, so MarkBufferDirtyHint() does not take its
# recovery bail-out and a hint-bit update dirties the page even while in
# recovery -- that is the route by which an incorrect hint can reach disk.
my @primary_init_opts = (
	allows_streaming  => 1,
	has_archiving     => 1,
	no_data_checksums => 1);
my $node_primary = PostgreSQL::Test::Cluster->new('primary');
$node_primary->init(@primary_init_opts);
$node_primary->append_conf('postgresql.conf', 'wal_log_hints = off');
$node_primary->start;

# Table that receives the rows written by the overflowing transaction.
$node_primary->safe_psql('postgres', 'CREATE TABLE subxid_test (id int);');

# In a background session, hold open a single transaction that nests more
# subtransactions than PGPROC_MAX_CACHED_SUBXIDS (64), which forces a snapshot
# overflow.  A subtransaction only gets an XID once it writes, hence the INSERT
# issued at every savepoint level.
my $bg = $node_primary->background_psql('postgres');
$bg->query_safe('BEGIN');
foreach my $level (1 .. 70)
{
	# Open the savepoint first, then write inside it.
	my @level_statements = (
		"SAVEPOINT s$level",
		"INSERT INTO subxid_test VALUES ($level)");
	foreach my $level_sql (@level_statements)
	{
		$bg->query_safe($level_sql);
	}
}

# Take the base backup while the overflow transaction is still open.  The first
# XLOG_RUNNING_XACTS record the standby replays then has subxid_overflow set,
# which keeps the standby at STANDBY_SNAPSHOT_PENDING.
my $backup_name = 'my_backup';
$node_primary->backup($backup_name);

# committed_test is created and populated after the backup but before the
# first target.  Its visibility is checked later on; in addition, the XIDs it
# consumes push the recovery snapshot's xmax above the overflowed subxids, so
# that a query over subxid_test goes down the hint-stamping path instead of the
# in-progress path.
my $committed_test_sql = join(' ',
	'CREATE TABLE committed_test (id int);',
	'INSERT INTO committed_test VALUES (1), (2), (3);');
$node_primary->safe_psql('postgres', $committed_test_sql);

# The standby replays this checkpoint before it pauses, which gives its
# shutdown a restartpoint that flushes the dirtied hint page to disk.  Because
# the overflow transaction is still open, the accompanying running-xacts record
# is overflowed too and the standby stays in STANDBY_SNAPSHOT_PENDING.
$node_primary->safe_psql('postgres', 'CHECKPOINT;');

# Query used to capture both recovery targets on the primary.
my $current_lsn_sql = 'SELECT pg_current_wal_lsn()';

# Target 1 is taken while the overflow transaction is still uncommitted.
my $lsn1 = $node_primary->safe_psql('postgres', $current_lsn_sql);

# Emit a few filler records so that recovery_target_inclusive cannot carry
# replay from the first target into the COMMIT record.
$node_primary->safe_psql('postgres',
	'CREATE TABLE b1 (x int); INSERT INTO b1 VALUES (1);');

# Now commit the overflow transaction and end the background session.
# Target 2 is captured after the commit.
$bg->query_safe('COMMIT');
$bg->quit;
my $lsn2 = $node_primary->safe_psql('postgres', $current_lsn_sql);

# Switch WAL segments twice, writing a row before each switch, so that the
# segments holding both targets get archived and can be fetched through the
# standby's restore_command.
foreach my $filler_value (2, 3)
{
	$node_primary->safe_psql('postgres',
		"INSERT INTO b1 VALUES ($filler_value); SELECT pg_switch_wal();");
}

# Build the standby from the base backup, restoring WAL from the archive, and
# configure it to pause on reaching the first target (before the commit).
my $node_standby = PostgreSQL::Test::Cluster->new('standby');
$node_standby->init_from_backup($node_primary, $backup_name,
	has_restoring => 1);

my @standby_settings = (
	"recovery_target_lsn = '$lsn1'",
	'recovery_target_action = pause',
	# debug1 logging lets the test confirm later that the overflowed-snapshot
	# path was taken.
	'log_min_messages = debug1');
foreach my $standby_setting (@standby_settings)
{
	$node_standby->append_conf('postgresql.conf', $standby_setting);
}
$node_standby->start;

# Query reporting the standby's replay pause state.  The CASE guard makes it
# return 'not in recovery' instead of calling
# pg_get_wal_replay_pause_state() once the server has left recovery, so an
# unexpected promotion shows up as a distinct value rather than a query error.
my $state_query =
	  "SELECT CASE WHEN pg_is_in_recovery()"
	. " THEN pg_get_wal_replay_pause_state()"
	. " ELSE 'not in recovery' END";

# Wait for a settled state: either the pause has been confirmed ('paused') or
# the standby has unexpectedly left recovery.  Merely waiting for anything
# other than 'not paused' would also stop on the transient 'pause requested'
# state, making the 'paused' assertion below fail spuriously.
$node_standby->poll_query_until('postgres',
	"$state_query IN ('paused', 'not in recovery')")
  or die "Timed out while waiting for the standby to reach the first target";
is($node_standby->safe_psql('postgres', $state_query),
	'paused', 'standby pauses at the first recovery target');

# Use the server log to verify that the paused standby was serving queries
# from an overflowed (pending) snapshot, and that it never got as far as a
# ready (non-overflowed) one.
my $saw_pending_snapshot = $node_standby->log_contains(
	qr/recovery snapshot waiting for non-overflowed snapshot/);
ok($saw_pending_snapshot,
	'standby paused while still waiting for a non-overflowed snapshot');

my $saw_ready_snapshot =
  $node_standby->log_contains(qr/recovery snapshots are now enabled/);
ok(!$saw_ready_snapshot,
	'standby never reached a ready (non-overflowed) snapshot');

# This read-only query during the pause is the corrupting action.  As of this
# target the overflow transaction has not committed, so seeing none of its
# rows is the right answer -- yet serving the query stamps HEAP_XMIN_INVALID
# hints, and in this configuration those hints are written to disk.
my $rows_at_first_target =
  $node_standby->safe_psql('postgres', 'SELECT count(*) FROM subxid_test');
is($rows_at_first_target,
	'0',
	'overflowed-subxid rows are not visible at the first target (txn open)');

# Continue recovery past the commit, per the documented workflow: shut down
# (the restartpoint flushes the hint pages dirtied above), advance the target,
# restart.  The new recovery_target_lsn is appended after the earlier one, so
# this relies on the later entry in postgresql.conf taking precedence.
$node_standby->stop('fast');
$node_standby->append_conf('postgresql.conf', "recovery_target_lsn = '$lsn2'");
$node_standby->start;

# Wait for a settled state: either the pause has been confirmed ('paused') or
# the standby has unexpectedly left recovery.  Merely waiting for anything
# other than 'not paused' would also stop on the transient 'pause requested'
# state, making the 'paused' assertion below fail spuriously.
$node_standby->poll_query_until('postgres',
	"$state_query IN ('paused', 'not in recovery')")
  or die "Timed out while waiting for the standby to reach the second target";
is($node_standby->safe_psql('postgres', $state_query),
	'paused', 'standby pauses again at the advanced recovery target');

# Row-count expectations at the second target, as
# [ table, expected count, test name ]:
#
# - subxid_test: the overflow transaction committed before this target, so all
#   of its rows must be visible.  With the bug, the stale on-disk
#   HEAP_XMIN_INVALID hint stamped at the first pause overrides CLOG and the
#   rows stay invisible (count 0).
# - committed_test: sanity check that ordinary committed data is visible too.
my @visibility_checks = (
	[
		'subxid_test',
		'70',
		'rows committed before the recovery target are visible after continuing recovery'
	],
	[
		'committed_test', '3',
		'data committed before the recovery target is visible'
	]);

foreach my $visibility_check (@visibility_checks)
{
	my ($table, $expected_count, $test_name) = @$visibility_check;
	my $visible_rows =
	  $node_standby->safe_psql('postgres', "SELECT count(*) FROM $table");
	is($visible_rows, $expected_count, $test_name);
}

# Tear down the standby first, then the primary.
$node_standby->teardown_node;
$node_primary->teardown_node;

done_testing();