From a9f04391473b3a88482433b881167d662027d968 Mon Sep 17 00:00:00 2001
From: Jingtang Zhang
Date: Sat, 18 Apr 2026 21:07:13 +0800
Subject: [PATCH v4] Optimize CPU usage of dropping buffers during recovery

Initialize the cached nblocks to 0 when replaying an smgr CREATE record.
---
Review notes (ignored by git am):
 - Line structure and leading whitespace were reconstructed (tabs assumed);
   run "git apply --check" before relying on this copy.
 - The TAP test was edited by hand (skip on Windows, verify that the
   checkpointer really was stopped), so the blob id on the new file's
   "index" line has not been regenerated.

 src/backend/access/transam/xlogutils.c    | 10 +++
 src/backend/catalog/storage.c             | 24 +++++++
 src/test/recovery/t/060_truncate_empty.pl | 85 +++++++++++++++++++++++
 3 files changed, 119 insertions(+)
 create mode 100644 src/test/recovery/t/060_truncate_empty.pl

diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 5fbe39133b8..090947ba5fe 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -489,6 +489,16 @@ XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum,
 	 */
 	smgrcreate(smgr, forknum, true);
 
+	/*
+	 * If the cached nblocks is 0, it was set by smgr_redo(CREATE) to
+	 * enable the optimized drop-buffer path. But the relation may
+	 * have been extended before the crash, so we must invalidate the
+	 * cache and let smgrnblocks() do an lseek to get the real size.
+	 * This extra lseek is acceptable here because we're about to do
+	 * I/O to read the block anyway.
+	 */
+	if (smgr->smgr_cached_nblocks[forknum] == 0)
+		smgr->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
 	lastblock = smgrnblocks(smgr, forknum);
 
 	if (blkno < lastblock)
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index e443a4993c5..b93a8b58753 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -993,6 +993,30 @@ smgr_redo(XLogReaderState *record)
 
 		reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
 		smgrcreate(reln, xlrec->forkNum, true);
+
+		/*
+		 * Initialize the cached nblocks to 0 for a newly created
+		 * relation, so that DropRelationsAllBuffers() can use the
+		 * optimized path (BufMapping lookup) instead of scanning
+		 * the entire buffer pool.
+		 *
+		 * This is safe because a CREATE record means the relation
+		 * has just been created with zero blocks. If the relation
+		 * was extended before the crash, XLogReadBufferExtended()
+		 * will invalidate this cached value and let smgrnblocks()
+		 * do a fresh lseek.
+		 *
+		 * We only do this for MAIN_FORKNUM CREATE records, which
+		 * correspond to new relation creation. FSM and VM forks
+		 * are also set to 0 because they cannot exist yet for a
+		 * newly created relation.
+		 */
+		if (xlrec->forkNum == MAIN_FORKNUM)
+		{
+			reln->smgr_cached_nblocks[MAIN_FORKNUM] = 0;
+			reln->smgr_cached_nblocks[FSM_FORKNUM] = 0;
+			reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = 0;
+		}
 	}
 	else if (info == XLOG_SMGR_TRUNCATE)
 	{
diff --git a/src/test/recovery/t/060_truncate_empty.pl b/src/test/recovery/t/060_truncate_empty.pl
new file mode 100644
index 00000000000..1956733f05f
--- /dev/null
+++ b/src/test/recovery/t/060_truncate_empty.pl
@@ -0,0 +1,85 @@
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use Time::HiRes qw(gettimeofday tv_interval);
+
+# The checkpointer is paused with SIGSTOP below; that signal cannot be sent
+# from Perl on Windows, so skip the whole test there.
+if ($windows_os)
+{
+	plan skip_all => 'SIGSTOP is not supported on Windows';
+}
+
+my $node = PostgreSQL::Test::Cluster->new('primary');
+$node->init();
+
+# NOTE(review): shared_buffers is presumably set this large so that a full
+# buffer-pool scan per dropped relation shows up in the redo timings; confirm
+# that test machines can actually allocate it.
+$node->append_conf('postgresql.conf', 'shared_buffers = 4GB');
+$node->append_conf('postgresql.conf', 'restart_after_crash = on');
+
+$node->start();
+
+$node->safe_psql('postgres',
+	q[CREATE TABLE test (id int);]);
+
+# SIGSTOP checkpointer and run some transactions
+my $checkpointer_pid = $node->safe_psql('postgres',
+	q[SELECT pid FROM pg_stat_activity WHERE backend_type = 'checkpointer';]);
+chomp($checkpointer_pid);
+
+# Fail loudly if the pid lookup or the signal did not work; otherwise the
+# test would silently run with an active checkpointer.
+die "could not determine checkpointer pid"
+  unless $checkpointer_pid =~ /\A[1-9][0-9]*\z/;
+kill('STOP', $checkpointer_pid) == 1
+  or die "could not send SIGSTOP to checkpointer (pid $checkpointer_pid): $!";
+note("Checkpointer stopped");
+
+$node->pgbench(
+	'--no-vacuum --client=10 --transactions=1000',
+	0,
+	[qr{actually processed}],
+	[qr{^$}],
+	'concurrent CREATE and DROP TABLE transactions',
+	{
+		'truncate_empty_script' => q(
+			BEGIN;
+			INSERT INTO test VALUES (:client_id);
+			DELETE FROM test WHERE id = :client_id;
+			CREATE TABLE test_empty_:client_id (id int);
+			DROP TABLE test_empty_:client_id;
+			COMMIT;
+		)
+	});
+
+# stop the node in immediate mode for crash recovery
+$node->stop('immediate');
+
+my $recovery_start = [gettimeofday];
+$node->start();
+my $recovery_end = [gettimeofday];
+my $recovery_time = tv_interval($recovery_start, $recovery_end);
+
+note("Crash recovery time: ${recovery_time} seconds");
+
+my $log_content = $node->log_content();
+if ($log_content =~ /redo done at .+? system usage: CPU: user: ([\d.]+) s, system: ([\d.]+) s, elapsed: ([\d.]+) s/m)
+{
+	my $cpu_user = $1;
+	my $cpu_system = $2;
+	my $redo_elapsed = $3;
+
+	note("Redo elapsed time: $redo_elapsed s");
+	note(" CPU user: $cpu_user s, system: $cpu_system s");
+}
+
+# consistency check
+my $result = $node->safe_psql('postgres', q[SELECT COUNT(*) FROM test;]);
+is($result, '0', 'test table is empty after recovery');
+
+$node->stop();
+done_testing();
-- 
2.39.5 (Apple Git-154)