From 90b5a189df90c263648f66bbc3edc7ce5dc12f9a Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Fri, 26 Jun 2026 11:53:24 +0900 Subject: [PATCH v2] Fix unlogged sequence corruption after standby promotion Previously, if an unlogged sequence was created on the primary and replicated to a standby, reading the sequence after promoting the standby (for example, with nextval()) could trigger the following assertion failure: TRAP: failed Assert("((const PageHeaderData *) page)->pd_special >= SizeOfPageHeaderData") The problem was that seq_redo() updated the init fork page in shared buffers but did not flush it to disk. During promotion, ResetUnloggedRelations() recreates the main fork of unlogged relations by copying the init fork from disk, bypassing shared buffers. As a result, the main fork could be recreated from a stale init fork instead of the WAL-replayed page. Fix this by introducing a helper, XLogFlushBufferForRedoIfInit(), that flushes init fork buffers immediately, and make seq_redo() use it. With this change, the main fork of an unlogged sequence is recreated from the up-to-date init fork on disk, allowing the unlogged sequence to be read successfully after standby promotion. The hash index redo routines, which previously open-coded the same init fork flush, are converted to use the new helper as well; their behavior is unchanged. Backpatch to v15, where unlogged sequences were introduced. 
--- src/backend/access/hash/hash_xlog.c | 29 ++-------------- src/backend/access/transam/xlogutils.c | 26 +++++++++++++- src/backend/commands/sequence.c | 1 + src/include/access/xlogutils.h | 2 ++ src/test/recovery/meson.build | 1 + .../t/054_unlogged_sequence_promotion.pl | 34 +++++++++++++++++++ 6 files changed, 66 insertions(+), 27 deletions(-) create mode 100644 src/test/recovery/t/054_unlogged_sequence_promotion.pl diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c index e8e06c62a95..cd9617533c2 100644 --- a/src/backend/access/hash/hash_xlog.c +++ b/src/backend/access/hash/hash_xlog.c @@ -32,7 +32,6 @@ hash_xlog_init_meta_page(XLogReaderState *record) XLogRecPtr lsn = record->EndRecPtr; Page page; Buffer metabuf; - ForkNumber forknum; xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) XLogRecGetData(record); @@ -44,16 +43,7 @@ hash_xlog_init_meta_page(XLogReaderState *record) page = (Page) BufferGetPage(metabuf); PageSetLSN(page, lsn); MarkBufferDirty(metabuf); - - /* - * Force the on-disk state of init forks to always be in sync with the - * state in shared buffers. See XLogReadBufferForRedoExtended. We need - * special handling for init forks as create index operations don't log a - * full page image of the metapage. 
- */ - XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); - if (forknum == INIT_FORKNUM) - FlushOneBuffer(metabuf); + XLogFlushBufferForRedoIfInit(record, 0, metabuf); /* all done */ UnlockReleaseBuffer(metabuf); @@ -71,7 +61,6 @@ hash_xlog_init_bitmap_page(XLogReaderState *record) Page page; HashMetaPage metap; uint32 num_buckets; - ForkNumber forknum; xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) XLogRecGetData(record); @@ -82,16 +71,7 @@ hash_xlog_init_bitmap_page(XLogReaderState *record) _hash_initbitmapbuffer(bitmapbuf, xlrec->bmsize, true); PageSetLSN(BufferGetPage(bitmapbuf), lsn); MarkBufferDirty(bitmapbuf); - - /* - * Force the on-disk state of init forks to always be in sync with the - * state in shared buffers. See XLogReadBufferForRedoExtended. We need - * special handling for init forks as create index operations don't log a - * full page image of the metapage. - */ - XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); - if (forknum == INIT_FORKNUM) - FlushOneBuffer(bitmapbuf); + XLogFlushBufferForRedoIfInit(record, 0, bitmapbuf); UnlockReleaseBuffer(bitmapbuf); /* add the new bitmap page to the metapage's list of bitmaps */ @@ -112,10 +92,7 @@ hash_xlog_init_bitmap_page(XLogReaderState *record) PageSetLSN(page, lsn); MarkBufferDirty(metabuf); - - XLogRecGetBlockTag(record, 1, NULL, &forknum, NULL); - if (forknum == INIT_FORKNUM) - FlushOneBuffer(metabuf); + XLogFlushBufferForRedoIfInit(record, 1, metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index d63364fd506..2352dcd607e 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -335,6 +335,28 @@ XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id) return buf; } +/* + * If a redo routine modified an init fork, flush the buffer immediately. 
+ * + * At the end of recovery (crash recovery or standby promotion) the init + * forks of unlogged relations are copied to the main fork directly from + * disk, without going through shared buffers. Therefore, redo routines that + * update init forks without restoring a full-page image must call this after + * setting the page LSN and marking the buffer dirty. + */ +void +XLogFlushBufferForRedoIfInit(XLogReaderState *record, uint8 block_id, + Buffer buffer) +{ + ForkNumber forknum; + + Assert(BufferIsValid(buffer)); + + XLogRecGetBlockTag(record, block_id, NULL, &forknum, NULL); + if (forknum == INIT_FORKNUM) + FlushOneBuffer(buffer); +} + /* * XLogReadBufferForRedoExtended * Like XLogReadBufferForRedo, but with extra options. @@ -412,7 +434,9 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, * At the end of crash recovery the init forks of unlogged relations * are copied, without going through shared buffers. So we need to * force the on-disk state of init forks to always be in sync with the - * state in shared buffers. + * state in shared buffers. Use XLogFlushBufferForRedoIfInit() for + * redo routines that dirty init-fork buffers without restoring a + * full-page image. 
*/ if (forknum == INIT_FORKNUM) FlushOneBuffer(*buf); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index e0af32075d1..5eda26df455 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -1893,6 +1893,7 @@ seq_redo(XLogReaderState *record) memcpy(page, localpage, BufferGetPageSize(buffer)); MarkBufferDirty(buffer); + XLogFlushBufferForRedoIfInit(record, 0, buffer); UnlockReleaseBuffer(buffer); pfree(localpage); diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 5b77b11f508..d0545248abf 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -84,6 +84,8 @@ typedef struct ReadLocalXLogPageNoWaitPrivate extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, Buffer *buf); extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id); +extern void XLogFlushBufferForRedoIfInit(XLogReaderState *record, + uint8 block_id, Buffer buffer); extern XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, uint8 block_id, ReadBufferMode mode, bool get_cleanup_lock, diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build index 42059801ce2..5453e1aa6fa 100644 --- a/src/test/recovery/meson.build +++ b/src/test/recovery/meson.build @@ -47,6 +47,7 @@ tests += { 't/043_vacuum_horizon_floor.pl', 't/043_no_contrecord_switch.pl', 't/045_archive_restartpoint.pl', + 't/054_unlogged_sequence_promotion.pl', ], }, } diff --git a/src/test/recovery/t/054_unlogged_sequence_promotion.pl b/src/test/recovery/t/054_unlogged_sequence_promotion.pl new file mode 100644 index 00000000000..96d1e4bf18b --- /dev/null +++ b/src/test/recovery/t/054_unlogged_sequence_promotion.pl @@ -0,0 +1,34 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test that unlogged sequences created on a primary can be read after +# promotion of a standby that replayed their init fork. 
+ +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1); +$node_primary->start; + +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); + +my $node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby->start; + +# Create the unlogged sequence after the standby has started, so its init fork +# is generated by WAL replay on the standby. +$node_primary->safe_psql('postgres', "CREATE UNLOGGED SEQUENCE ulseq"); +$node_primary->wait_for_replay_catchup($node_standby); + +$node_standby->promote; + +is($node_standby->safe_psql('postgres', "SELECT nextval('ulseq')"), + 1, 'unlogged sequence can be read after standby promotion'); + +done_testing(); -- 2.53.0