From 44eceb392066607a497e109eb5f0f8a42e4a658e Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 18:05:46 +0000 Subject: [PATCH v1 3/3] Auto-resume recovery once the logical slot conflict is resolved MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous behavior under recovery_pause_on_logical_slot_conflict required the operator to both drain (or drop / advance) the slot AND call pg_wal_replay_resume() to continue — two steps, even though the first step is the one that matters semantically. That split also meant the feature couldn't underpin a continuous-CDC service without external orchestration to issue the resume. Lift the scan predicate ("does any slot in `dboid` still block this conflict?") out of the initial check into a helper AnySlotStillBlocksConflict(). Call it again every 1s inside the existing wait loop. When it returns false, leave the wait loop, flipping the pause state back to NOT_PAUSED unless an operator pause (e.g. pg_wal_replay_pause()) was already pending when the conflict fired, in which case that pause is left in place; the existing post-wait advance then bumps catalog_xmin past the horizon on drained slots so the fall-through InvalidateObsoleteReplicationSlots() is a no-op. "No longer blocking" covers every unblock path, not just drain: * drained past the pause LSN (confirmed_flush >= captured conflict_lsn) — the main case * slot dropped (pg_drop_replication_slot) — removed from the scan * slot advanced (pg_replication_slot_advance) — catalog_xmin moves past the horizon * slot invalidated for another reason (e.g. RS_INVAL_WAL_REMOVED from max_slot_wal_keep_size, applied by the checkpointer, which runs even while the startup process is asleep in our wait loop) — data.invalidated != RS_INVAL_NONE, scan skips it Manual pg_wal_replay_resume() still works as the "give up on this slot and let it invalidate" escape hatch, and CheckForStandbyTrigger still breaks the loop for pg_promote(). 
Capture conflict_lsn once at pause time and reuse it for both the in-wait predicate and the post-wait advance, replacing the redundant second GetXLogReplayRecPtr() call. GUC long_desc, postgresql.conf.sample comment, and the xlogrecovery.c variable-decl comment updated to describe auto-resume. --- src/backend/access/transam/xlogrecovery.c | 5 +- src/backend/storage/ipc/standby.c | 216 +++++++++++------- src/backend/utils/misc/guc_parameters.dat | 2 +- src/backend/utils/misc/postgresql.conf.sample | 4 +- .../t/054_recovery_pause_on_slot_conflict.pl | 120 +++++++++- 5 files changed, 255 insertions(+), 92 deletions(-) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 508e718169c..a2b0fd4ef12 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -100,7 +100,10 @@ int recovery_min_apply_delay = 0; * If true, when WAL replay on a standby is about to invalidate an otherwise- * active logical replication slot because a catalog PRUNE_ON_ACCESS record's * snapshotConflictHorizon has overtaken the slot's catalog_xmin, pause replay - * instead and give an operator a chance to drain (or drop) the slot. + * instead. Replay auto-resumes once the consumer has drained the slot past + * the pause point (or the slot is dropped, advanced, or otherwise no longer + * blocking); pg_wal_replay_resume() also forces continuation. See + * MaybePauseOnLogicalSlotConflict() in standby.c. 
* * Motivated by blueprints/LOGICAL_DECODING_ARCHIVED_WALS.md §4.2.3 / US-4: * an archive-only logical-decoding standby cannot feed hot_standby_feedback diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 0659f9d2169..ce467a07486 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -514,51 +514,37 @@ ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon, } /* - * If recovery_pause_on_logical_slot_conflict is enabled, and replay is about - * to apply a catalog PRUNE_ON_ACCESS record whose snapshotConflictHorizon - * would cause the invalidation of at least one non-invalidated logical slot - * in the same database, request a recovery pause and wait on the recovery - * pause condition variable until an operator resumes. + * Returns true if at least one non-synced logical slot in `dboid` still + * blocks replay past snapshotConflictHorizon. * - * On resume the caller re-falls through to InvalidateObsoleteReplicationSlots: - * if the operator has drained / dropped / advanced the slot, invalidation is - * a no-op; if they chose to resume without acting, the slot is invalidated - * as usual. This matches the recovery_target_action=pause precedent. + * "Blocks" means: the slot is in use, not invalidated, snapbuild-consistent + * (effective_catalog_xmin is valid — skipping in-progress slots avoids a + * deadlock with DecodingContextFindStartpoint), and its catalog_xmin + * precedes-or-equals the horizon. * - * The two parameters identify which slots, if any, this prune record can - * conflict with: - * - dboid: logical slots are per-database, so only slots belonging to this - * database can be invalidated by a catalog prune happening here; slots in - * other databases are never affected and must be ignored. - * - snapshotConflictHorizon: the xid threshold carried by the - * PRUNE_ON_ACCESS record. A slot conflicts iff its catalog_xmin - * precedes-or-equals this horizon (i.e. 
it still needs catalog rows the - * prune is about to remove). + * Use PrecedesOrEquals (not Precedes) to match DetermineSlotInvalidationCause. + * Otherwise a slot whose catalog_xmin was just advanced to exactly horizon by + * a previous pause-and-advance cycle fails to re-pause on the next prune + * record with the same horizon, yet would still be invalidated by the + * fall-through InvalidateObsoleteReplicationSlots call. * - * Only invoked from ResolveRecoveryConflictWithSnapshot(), before any buffer - * locks are taken, so pausing here does not deadlock with anything. + * Synced slots are skipped: writing their fields from the startup process + * would race the slot-sync worker, and ALTER / DROP_REPLICATION_SLOT errors + * out on a synced slot so the operator-facing recipe does not apply. + * + * When conflict_lsn is valid (in-wait auto-resume check), slots whose + * confirmed_flush_lsn has reached conflict_lsn are treated as not blocking: + * the consumer has caught up to the pause point and the post-wait advance + * code will bump their catalog_xmin past the horizon. Pass InvalidXLogRecPtr + * for the initial pause-or-not decision (we don't yet have a pause point). */ -void -MaybePauseOnLogicalSlotConflict(Oid dboid, TransactionId snapshotConflictHorizon) +static bool +AnySlotStillBlocksConflict(Oid dboid, TransactionId snapshotConflictHorizon, + XLogRecPtr conflict_lsn) { int i; - bool would_invalidate = false; - - if (!recovery_pause_on_logical_slot_conflict) - return; - if (!TransactionIdIsValid(snapshotConflictHorizon)) - return; + bool blocking = false; - /* - * Scan for a would-be-invalidated slot in the conflicting database. - * - * Skip slots that have not yet reached snapshot-builder consistency - * (effective_catalog_xmin is still InvalidTransactionId). An in-progress - * slot has not produced any output to a consumer, so invalidating it is - * harmless — the caller can retry. 
Pausing for such a slot would - * deadlock: DecodingContextFindStartpoint would be waiting for replay - * to advance, while replay would be waiting for the slot to be drained. - */ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) { @@ -566,7 +552,8 @@ MaybePauseOnLogicalSlotConflict(Oid dboid, TransactionId snapshotConflictHorizon Oid slot_db; TransactionId slot_xmin; TransactionId slot_effective_xmin; - bool active_logical; + XLogRecPtr slot_confirmed; + bool is_candidate; if (!s->in_use) continue; @@ -575,52 +562,99 @@ MaybePauseOnLogicalSlotConflict(Oid dboid, TransactionId snapshotConflictHorizon slot_db = s->data.database; slot_xmin = s->data.catalog_xmin; slot_effective_xmin = s->effective_catalog_xmin; - /* - * Skip synced slots (managed by the slot-sync worker per - * sync_replication_slots). Writing their fields from the startup - * process would race with the slot-sync worker's own updates, and - * the operator-facing "drain or drop the slot" recipe in the - * errhint below cannot be applied to a synced slot (ALTER / - * DROP_REPLICATION_SLOT error on synced). - */ - active_logical = (s->data.invalidated == RS_INVAL_NONE && - slot_db != InvalidOid && - TransactionIdIsValid(slot_effective_xmin) && - !s->data.synced); + slot_confirmed = s->data.confirmed_flush; + is_candidate = (s->data.invalidated == RS_INVAL_NONE && + slot_db != InvalidOid && + TransactionIdIsValid(slot_effective_xmin) && + !s->data.synced); SpinLockRelease(&s->mutex); - if (!active_logical) + if (!is_candidate) continue; if (slot_db != dboid) continue; if (!TransactionIdIsValid(slot_xmin)) continue; - /* - * Use PrecedesOrEquals (not Precedes) to match the check in - * DetermineSlotInvalidationCause. 
Otherwise a slot whose - * catalog_xmin was just advanced to exactly conflict_horizon by - * a previous pause-and-advance cycle (our own resume code) will - * NOT trigger a pause here when the next prune record arrives - * with horizon == catalog_xmin, yet WILL still be invalidated - * by the fall-through InvalidateObsoleteReplicationSlots call. - */ - if (TransactionIdPrecedesOrEquals(slot_xmin, snapshotConflictHorizon)) - { - would_invalidate = true; - break; - } + if (!TransactionIdPrecedesOrEquals(slot_xmin, snapshotConflictHorizon)) + continue; + if (conflict_lsn != InvalidXLogRecPtr && + slot_confirmed >= conflict_lsn) + continue; + + blocking = true; + break; } LWLockRelease(ReplicationSlotControlLock); - if (!would_invalidate) + return blocking; +} + +/* + * If recovery_pause_on_logical_slot_conflict is enabled, and replay is about + * to apply a catalog PRUNE_ON_ACCESS record whose snapshotConflictHorizon + * would cause the invalidation of at least one non-invalidated logical slot + * in the same database, request a recovery pause and wait until the conflict + * is resolved. + * + * The wait exits in any of: + * - Auto-resume: a periodic re-scan finds no slot still blocking. Any of + * draining past the pause LSN, dropping the slot, pg_replication_slot_ + * advance(), or out-of-band invalidation (e.g. max_slot_wal_keep_size + * applied by the checkpointer, which runs even while startup is paused + * here) will satisfy this. The post-wait advance then bumps catalog_xmin + * on drained slots so the fall-through InvalidateObsoleteReplicationSlots() + * is a no-op. + * - Manual resume: pg_wal_replay_resume() flips the state to NOT_PAUSED. + * Any slot still blocking at that point is invalidated by the + * fall-through — the "give up on this slot" escape hatch. + * - Promote: CheckForStandbyTrigger() consumes PROMOTE_SIGNAL_FILE and we + * return early so the startup process can finish promotion. 
+ * + * The two parameters identify which slots, if any, this prune record can + * conflict with: + * - dboid: logical slots are per-database, so only slots belonging to this + * database can be invalidated by a catalog prune happening here; slots in + * other databases are never affected and must be ignored. + * - snapshotConflictHorizon: the xid threshold carried by the + * PRUNE_ON_ACCESS record. A slot conflicts iff its catalog_xmin + * precedes-or-equals this horizon (i.e. it still needs catalog rows the + * prune is about to remove). + * + * Only invoked from ResolveRecoveryConflictWithSnapshot(), before any buffer + * locks are taken, so pausing here does not deadlock with anything. + */ +void +MaybePauseOnLogicalSlotConflict(Oid dboid, TransactionId snapshotConflictHorizon) +{ + XLogRecPtr conflict_lsn; + bool user_requested_pause; + + if (!recovery_pause_on_logical_slot_conflict) + return; + if (!TransactionIdIsValid(snapshotConflictHorizon)) + return; + + if (!AnySlotStillBlocksConflict(dboid, snapshotConflictHorizon, + InvalidXLogRecPtr)) return; + /* + * Remember whether an operator had already paused recovery (e.g. via + * pg_wal_replay_pause()) before this conflict fired. If so, our + * auto-resume below must not clear that pause out from under them — the + * user's pause wins. 
+ */ + user_requested_pause = (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED); + + conflict_lsn = GetXLogReplayRecPtr(NULL); + ereport(LOG, (errmsg("recovery paused: WAL redo at %X/%X would invalidate a logical replication slot", - LSN_FORMAT_ARGS(GetXLogReplayRecPtr(NULL))), + LSN_FORMAT_ARGS(conflict_lsn)), errdetail("snapshotConflictHorizon %u exceeds catalog_xmin of at least one active logical slot in database %u.", snapshotConflictHorizon, dboid), - errhint("Drain, advance, or drop the slot, then execute pg_wal_replay_resume()."))); + errhint("Recovery will resume automatically once the slot is drained past %X/%X, dropped, advanced, or invalidated for another reason; pg_wal_replay_resume() forces continuation (invalidating any remaining blocking slot).", + LSN_FORMAT_ARGS(conflict_lsn)))); SetRecoveryPause(true); @@ -642,6 +676,29 @@ MaybePauseOnLogicalSlotConflict(Oid dboid, TransactionId snapshotConflictHorizon return; } + /* + * Auto-resume: if nothing is still blocking this conflict, clear + * the pause and let the loop condition exit. The post-wait advance + * will bump catalog_xmin on any slot that drained past conflict_lsn + * so the fall-through InvalidateObsoleteReplicationSlots() is a + * no-op. Slots invalidated out of band (dropped, WAL-removed, + * etc.) are simply not in the scan anymore. + */ + if (!AnySlotStillBlocksConflict(dboid, snapshotConflictHorizon, + conflict_lsn)) + { + /* + * Only clear the pause we set ourselves. If the operator had + * already paused recovery before the conflict fired, leave their + * pause in place — auto-resume must not silently override an + * explicit pg_wal_replay_pause(). Either way, exit the + * conflict-wait loop now that nothing is blocking. 
+ */ + if (!user_requested_pause) + SetRecoveryPause(false); + break; + } + /* * Promote RECOVERY_PAUSE_REQUESTED to RECOVERY_PAUSED so that * observers (pg_get_wal_replay_pause_state() / monitoring) see the @@ -654,21 +711,14 @@ MaybePauseOnLogicalSlotConflict(Oid dboid, TransactionId snapshotConflictHorizon ConditionVariableCancelSleep(); /* - * Operator has resumed. If they drained slot(s) up to (or past) the LSN - * of the about-to-be-replayed conflict record, we trust that the consumer - * downstream has captured everything that needed the pre-conflict catalog - * snapshot. Advance those slots' catalog_xmin past the horizon so the - * subsequent InvalidateObsoleteReplicationSlots() fall-through is a - * no-op. Slots that the operator did NOT drain are left alone and get - * invalidated normally — that is the "I didn't act, just let the slot - * die" path. - * - * "Drained past the conflict LSN" is defined as: the slot's - * confirmed_flush_lsn >= the LSN at which replay has paused, which is - * the current replay position reported by GetXLogReplayRecPtr. + * Wait is over. For any slot whose consumer drained up to (or past) + * conflict_lsn, advance catalog_xmin past the horizon so the subsequent + * InvalidateObsoleteReplicationSlots() fall-through is a no-op. Slots + * that did not drain are left alone and get invalidated normally — the + * "I didn't act, just let the slot die" path that runs when an operator + * manually resumed without draining. 
*/ { - XLogRecPtr conflict_lsn = GetXLogReplayRecPtr(NULL); int j; LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE); @@ -723,7 +773,7 @@ MaybePauseOnLogicalSlotConflict(Oid dboid, TransactionId snapshotConflictHorizon ereport(LOG, (errmsg("advanced catalog_xmin of logical slot \"%s\" past conflict horizon %u", NameStr(s->data.name), snapshotConflictHorizon), - errdetail("Slot's confirmed_flush_lsn %X/%X reached the conflict record at %X/%X; operator drained before resuming.", + errdetail("Slot's confirmed_flush_lsn %X/%X reached the conflict record at %X/%X; consumer drained past the pause point.", LSN_FORMAT_ARGS(s->data.confirmed_flush), LSN_FORMAT_ARGS(conflict_lsn)))); } diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 52b34443ec6..709079c399d 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -2444,7 +2444,7 @@ { name => 'recovery_pause_on_logical_slot_conflict', type => 'bool', context => 'PGC_SIGHUP', group => 'REPLICATION_STANDBY', short_desc => 'Pauses recovery instead of invalidating an active logical slot on catalog conflict.', - long_desc => 'When WAL replay on a standby is about to invalidate a logical replication slot because a catalog PRUNE_ON_ACCESS record has overtaken the slot\'s catalog_xmin, pause recovery instead. The operator can then drain or drop the slot and call pg_wal_replay_resume() to continue.', + long_desc => 'When WAL replay on a standby is about to invalidate a logical replication slot because a catalog PRUNE_ON_ACCESS record has overtaken the slot\'s catalog_xmin, pause recovery instead. Recovery resumes automatically once the slot has been drained past the pause point, dropped, advanced, or invalidated for another reason (e.g. max_slot_wal_keep_size). 
pg_wal_replay_resume() also forces continuation, invalidating any remaining blocking slot.', variable => 'recovery_pause_on_logical_slot_conflict', boot_val => 'false', }, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 17b2bcc4df8..414fed447cf 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -404,7 +404,9 @@ # retrieve WAL after a failed attempt #recovery_min_apply_delay = 0 # minimum delay for applying changes during recovery #recovery_pause_on_logical_slot_conflict = off # pause recovery instead of invalidating - # a logical slot on catalog conflict + # a logical slot on catalog conflict; + # auto-resumes once the slot is drained, + # dropped, or otherwise unblocks #sync_replication_slots = off # enables slot synchronization on the physical standby from the primary # - Subscribers - diff --git a/src/test/recovery/t/054_recovery_pause_on_slot_conflict.pl b/src/test/recovery/t/054_recovery_pause_on_slot_conflict.pl index d1a03475e95..b53bf97694e 100644 --- a/src/test/recovery/t/054_recovery_pause_on_slot_conflict.pl +++ b/src/test/recovery/t/054_recovery_pause_on_slot_conflict.pl @@ -217,6 +217,49 @@ sub wait_for_replay_paused return 0; } +# Models an operator who issued an explicit pg_wal_replay_pause() that +# must survive the GUC's auto-resume. On entry replay is parked at a +# pre-conflict LSN with the operator pause already in effect. Each tick we +# nudge replay forward (pg_wal_replay_resume()) and then immediately +# re-assert the operator pause (pg_wal_replay_pause()), so that when the +# startup process reaches the catalog-prune record the operator pause is +# already pending — i.e. GetRecoveryPauseState() != RECOVERY_NOT_PAUSED +# at the moment MaybePauseOnLogicalSlotConflict() captures it. We then +# drain the slot so the GUC's auto-resume re-scan finds nothing blocking. 
+# With the fix the operator's pause is preserved; without it the +# unconditional SetRecoveryPause(false) would clear it. +# +# Returns the total number of changes drained. +sub drain_holding_user_pause +{ + my ($standby, $slot_name, $deadline_seconds) = @_; + + my $total_drained = 0; + my $deadline = time() + $deadline_seconds; + + while (time() < $deadline) { + # Drain whatever the slot currently holds. + my $got = $standby->safe_psql('postgres', + "SELECT COUNT(*) FROM pg_logical_slot_get_changes('$slot_name', NULL, NULL)"); + $total_drained += $got; + + # Stop at the first empty drain that follows a non-empty one. This + # only shows nothing is left to decode up to the current replay + # position; the caller's assertions then check that the slot + # survived and that replay is still paused. + last if $got == 0 && $total_drained > 0; + + # Nudge replay forward, then immediately re-pause so the operator + # pause is pending again when the next conflict record is applied. + $standby->safe_psql('postgres', "SELECT pg_wal_replay_resume()"); + $standby->safe_psql('postgres', "SELECT pg_wal_replay_pause()"); + + usleep(500_000); + } + + return $total_drained; +} + # --------------------------------------------------------------------- # Main script # --------------------------------------------------------------------- @@ -228,16 +271,19 @@ my $guc = $node_primary->safe_psql('postgres', "SELECT COUNT(*) FROM pg_settings WHERE name = 'recovery_pause_on_logical_slot_conflict'"); is($guc, '1', 'recovery_pause_on_logical_slot_conflict GUC is registered'); -# 2. Phase 1: bring up BOTH standbys (GUC-on and GUC-off) while the -# archive still contains only the quiet-moment snapshot — no prune -# records yet. Slot creation reaches SNAPBUILD_CONSISTENT quickly on -# both. Later, when Phase 2 ships the prune records, the two standbys -# diverge: the GUC-on one pauses and drains; the GUC-off one -# invalidates. +# 2. 
Phase 1: bring up the standbys (GUC-on, GUC-off, and a second +# GUC-on "user-pause" standby) while the archive still contains only the +# quiet-moment snapshot — no prune records yet. Slot creation reaches +# SNAPBUILD_CONSISTENT quickly on all of them. Later, when Phase 2 ships +# the prune records, the standbys diverge: the GUC-on ones pause and +# drain; the GUC-off one invalidates. The user-pause standby additionally +# checks that an operator's explicit pause survives the GUC auto-resume. my $node_standby = create_archive_standby($node_primary, $backup_name, 'standby', 'on'); my $node_standby_off = create_archive_standby($node_primary, $backup_name, 'standby_off', 'off'); +my $node_standby_up = create_archive_standby($node_primary, $backup_name, + 'standby_userpause', 'on'); $node_standby->safe_psql('postgres', qq[ SELECT pg_create_logical_replication_slot('t_slot', 'test_decoding'); @@ -245,6 +291,9 @@ $node_standby->safe_psql('postgres', qq[ $node_standby_off->safe_psql('postgres', qq[ SELECT pg_create_logical_replication_slot('t_slot_off', 'test_decoding'); ]); +$node_standby_up->safe_psql('postgres', qq[ + SELECT pg_create_logical_replication_slot('up_slot', 'test_decoding'); +]); my $slot_ready = $node_standby->safe_psql('postgres', qq[ SELECT wal_status FROM pg_replication_slots WHERE slot_name = 't_slot' @@ -257,6 +306,21 @@ my $off_slot_ready = $node_standby_off->safe_psql('postgres', qq[ is($off_slot_ready, 'reserved', "baseline slot created cleanly in Phase 1 (state: $off_slot_ready)"); +my $up_slot_ready = $node_standby_up->safe_psql('postgres', qq[ + SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'up_slot' +]); +is($up_slot_ready, 'reserved', + "user-pause slot created cleanly in Phase 1 (state: $up_slot_ready)"); + +# Operator pauses recovery on the user-pause standby NOW, while the +# archive still only holds the clean Phase-1 snapshot and the catalog- +# prune conflict has not been replayed yet. 
This parks replay at a +# pre-conflict LSN with an explicit operator pause in effect — the exact +# precondition for the user-pause-clobber bug. +$node_standby_up->safe_psql('postgres', "SELECT pg_wal_replay_pause()"); +ok(wait_for_replay_paused($node_standby_up), + "user-pause standby parks on operator pg_wal_replay_pause() before conflict"); + # 3. Phase 2: catalog churn on primary, then wait for archive. run_catalog_churn($node_primary); @@ -322,6 +386,50 @@ cmp_ok($elapsed, '<', 10, "pg_promote completed in under 10s (actual: ${elapsed}s)"); $node_standby_p->stop; + +# 7. User-pause survives auto-resume. The operator paused recovery with +# pg_wal_replay_pause() before the conflict record was replayed (done in +# section 2). drain_holding_user_pause nudges replay into the conflict +# while keeping that operator pause pending, then drains the slot so the +# GUC's auto-resume re-scan finds nothing blocking. The fix in +# MaybePauseOnLogicalSlotConflict() must then leave the operator's pause +# in place rather than clearing it with an unconditional +# SetRecoveryPause(false), so: +# - with the fix: replay stays 'paused' after the conflict resolves; +# - without the fix: auto-resume clears the pause and replay proceeds. +my $up_drained = drain_holding_user_pause($node_standby_up, 'up_slot', 60); + +cmp_ok($up_drained, '>=', 2000, + "user-pause standby drained the slot under operator pause ($up_drained got)"); + +# The slot must have survived (drained, not invalidated) just like the +# plain GUC-on standby. +my $up_slot_state = $node_standby_up->safe_psql('postgres', qq[ + SELECT wal_status || '|' || COALESCE(invalidation_reason, '') + FROM pg_replication_slots WHERE slot_name = 'up_slot'; +]); +like($up_slot_state, qr/^reserved\|/, + "user-pause slot survived catalog prune (state: $up_slot_state)"); + +# The crux: recovery is STILL paused because the operator's pause was not +# cleared by the GUC's auto-resume. 
+my $up_pause_state = $node_standby_up->safe_psql('postgres', + "SELECT pg_get_wal_replay_pause_state()"); +is($up_pause_state, 'paused', + "operator pause survived GUC auto-resume (state: $up_pause_state)"); + +# Now the operator resumes and replay must proceed past the pause. +my $up_lsn_before = $node_standby_up->safe_psql('postgres', + "SELECT pg_last_wal_replay_lsn()"); +$node_standby_up->safe_psql('postgres', "SELECT pg_wal_replay_resume()"); +$node_standby_up->poll_query_until('postgres', + "SELECT pg_get_wal_replay_pause_state() = 'not paused'") + or die "replay did not leave paused state after operator resume"; +ok($node_standby_up->poll_query_until('postgres', + "SELECT pg_last_wal_replay_lsn() >= '$up_lsn_before'::pg_lsn"), + "replay proceeds after operator pg_wal_replay_resume()"); + +$node_standby_up->stop; $node_standby_off->stop; $node_standby->stop; $node_primary->stop; -- 2.50.1 (Apple Git-155)