From dc7b84386209c745427d98586b7f100b6a8e122c Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Fri, 26 Jun 2026 11:53:24 +0900 Subject: [PATCH v2] Fix unlogged sequence corruption after standby promotion Previously, if an unlogged sequence was created on the primary and replicated to a standby, reading the sequence after promoting the standby (for example, with nextval()) could trigger the following assertion failure: TRAP: failed Assert("((const PageHeaderData *) page)->pd_special >= SizeOfPageHeaderData") The problem was that seq_redo() updated the init fork page in shared buffers but did not flush it to disk. During promotion, ResetUnloggedRelations() recreates the main fork of unlogged relations by copying the init fork from disk, bypassing shared buffers. As a result, the main fork could be recreated from a stale init fork instead of the WAL-replayed page. Fix this by introducing a helper to flush init fork buffers immediately, and make seq_redo() use it. The hash index redo routines, which previously open-coded the same init-fork flush, are changed to use the helper as well. As a result, the main fork of an unlogged sequence is recreated from the up-to-date init fork on disk, allowing the unlogged sequence to be read successfully after standby promotion. Backpatch to v15, where unlogged sequences were introduced. 
--- src/backend/access/hash/hash_xlog.c | 29 ++-------------- src/backend/access/transam/xlogutils.c | 26 +++++++++++++- src/backend/commands/sequence.c | 1 + src/include/access/xlogutils.h | 2 ++ src/test/recovery/meson.build | 1 + .../t/054_unlogged_sequence_promotion.pl | 34 +++++++++++++++++++ 6 files changed, 66 insertions(+), 27 deletions(-) create mode 100644 src/test/recovery/t/054_unlogged_sequence_promotion.pl diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c index 8d97067fe54..d4cb6246b48 100644 --- a/src/backend/access/hash/hash_xlog.c +++ b/src/backend/access/hash/hash_xlog.c @@ -29,7 +29,6 @@ hash_xlog_init_meta_page(XLogReaderState *record) XLogRecPtr lsn = record->EndRecPtr; Page page; Buffer metabuf; - ForkNumber forknum; xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) XLogRecGetData(record); @@ -41,16 +40,7 @@ hash_xlog_init_meta_page(XLogReaderState *record) page = (Page) BufferGetPage(metabuf); PageSetLSN(page, lsn); MarkBufferDirty(metabuf); - - /* - * Force the on-disk state of init forks to always be in sync with the - * state in shared buffers. See XLogReadBufferForRedoExtended. We need - * special handling for init forks as create index operations don't log a - * full page image of the metapage. 
- */ - XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); - if (forknum == INIT_FORKNUM) - FlushOneBuffer(metabuf); + XLogFlushBufferForRedoIfInit(record, 0, metabuf); /* all done */ UnlockReleaseBuffer(metabuf); @@ -68,7 +58,6 @@ hash_xlog_init_bitmap_page(XLogReaderState *record) Page page; HashMetaPage metap; uint32 num_buckets; - ForkNumber forknum; xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) XLogRecGetData(record); @@ -79,16 +68,7 @@ hash_xlog_init_bitmap_page(XLogReaderState *record) _hash_initbitmapbuffer(bitmapbuf, xlrec->bmsize, true); PageSetLSN(BufferGetPage(bitmapbuf), lsn); MarkBufferDirty(bitmapbuf); - - /* - * Force the on-disk state of init forks to always be in sync with the - * state in shared buffers. See XLogReadBufferForRedoExtended. We need - * special handling for init forks as create index operations don't log a - * full page image of the metapage. - */ - XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); - if (forknum == INIT_FORKNUM) - FlushOneBuffer(bitmapbuf); + XLogFlushBufferForRedoIfInit(record, 0, bitmapbuf); UnlockReleaseBuffer(bitmapbuf); /* add the new bitmap page to the metapage's list of bitmaps */ @@ -109,10 +89,7 @@ hash_xlog_init_bitmap_page(XLogReaderState *record) PageSetLSN(page, lsn); MarkBufferDirty(metabuf); - - XLogRecGetBlockTag(record, 1, NULL, &forknum, NULL); - if (forknum == INIT_FORKNUM) - FlushOneBuffer(metabuf); + XLogFlushBufferForRedoIfInit(record, 1, metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index db5a314edf8..0d67f256afe 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -321,6 +321,28 @@ XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id) return buf; } +/* + * If a redo routine modified an init fork, flush the buffer immediately. 
+ * + * At the end of recovery (crash recovery or standby promotion) the init + * forks of unlogged relations are copied to the main fork directly from + * disk, without going through shared buffers. Therefore, redo routines + * that update init forks without restoring a full-page image must call + * this after setting the page LSN and marking the buffer dirty. + */ +void +XLogFlushBufferForRedoIfInit(XLogReaderState *record, uint8 block_id, + Buffer buffer) +{ + ForkNumber forknum; + + Assert(BufferIsValid(buffer)); + + XLogRecGetBlockTag(record, block_id, NULL, &forknum, NULL); + if (forknum == INIT_FORKNUM) + FlushOneBuffer(buffer); +} + /* * XLogReadBufferForRedoExtended * Like XLogReadBufferForRedo, but with extra options. @@ -398,7 +420,9 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, * At the end of crash recovery the init forks of unlogged relations * are copied, without going through shared buffers. So we need to * force the on-disk state of init forks to always be in sync with the - * state in shared buffers. + * state in shared buffers. Use XLogFlushBufferForRedoIfInit() for + * redo routines that dirty init-fork buffers without restoring a + * full-page image. 
*/ if (forknum == INIT_FORKNUM) FlushOneBuffer(*buf); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index a79ef0651a9..c1ad656397a 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -1933,6 +1933,7 @@ seq_redo(XLogReaderState *record) memcpy(page, localpage, BufferGetPageSize(buffer)); MarkBufferDirty(buffer); + XLogFlushBufferForRedoIfInit(record, 0, buffer); UnlockReleaseBuffer(buffer); pfree(localpage); diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index a1870d8e5aa..7639bd523e1 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -87,6 +87,8 @@ typedef struct ReadLocalXLogPageNoWaitPrivate extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, Buffer *buf); extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id); +extern void XLogFlushBufferForRedoIfInit(XLogReaderState *record, + uint8 block_id, Buffer buffer); extern XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, uint8 block_id, ReadBufferMode mode, bool get_cleanup_lock, diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build index 5245fdde43c..38e1e43e041 100644 --- a/src/test/recovery/meson.build +++ b/src/test/recovery/meson.build @@ -58,6 +58,7 @@ tests += { 't/047_checkpoint_physical_slot.pl', 't/048_vacuum_horizon_floor.pl', 't/053_standby_login_event_trigger.pl', + 't/054_unlogged_sequence_promotion.pl', ], }, } diff --git a/src/test/recovery/t/054_unlogged_sequence_promotion.pl b/src/test/recovery/t/054_unlogged_sequence_promotion.pl new file mode 100644 index 00000000000..96d1e4bf18b --- /dev/null +++ b/src/test/recovery/t/054_unlogged_sequence_promotion.pl @@ -0,0 +1,34 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test that unlogged sequences created on a primary can be read after +# promotion of a standby that replayed their init fork. 
+ +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1); +$node_primary->start; + +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); + +my $node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby->start; + +# Create the unlogged sequence after the standby has started, so its init fork +# is generated by WAL replay on the standby. +$node_primary->safe_psql('postgres', "CREATE UNLOGGED SEQUENCE ulseq"); +$node_primary->wait_for_replay_catchup($node_standby); + +$node_standby->promote; + +is($node_standby->safe_psql('postgres', "SELECT nextval('ulseq')"), + 1, 'unlogged sequence can be read after standby promotion'); + +done_testing(); -- 2.53.0