From f184ace1b21c70c09b41bc54ebeb5f341098a96c Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 16 Nov 2023 14:42:31 +0900 Subject: [PATCH v6 4/4] Add basic test for promotion and restart race condition This test fails after 7863ee4def65 is reverted. test_injection_points is extended so as it is possible to add condition variables to wait for in the point callbacks, with a SQL function to broadcast condition variables that may be sleeping. I guess that this should be extended so as there is more than one condition variable stored in shmem for this module, controlling which variable to wait for directly in the callback itself, but that's not really necessary now. --- src/backend/access/transam/xlog.c | 7 + .../modules/test_injection_points/meson.build | 1 + .../t/002_invalid_checkpoint_after_promote.pl | 132 ++++++++++++++++++ .../test_injection_points--1.0.sql | 10 ++ .../test_injection_points.c | 73 ++++++++++ 5 files changed, 223 insertions(+) create mode 100644 src/test/modules/test_injection_points/t/002_invalid_checkpoint_after_promote.pl diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 01e0484584..ece31bb2a6 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -99,6 +99,7 @@ #include "storage/sync.h" #include "utils/guc_hooks.h" #include "utils/guc_tables.h" +#include "utils/injection_point.h" #include "utils/memutils.h" #include "utils/ps_status.h" #include "utils/relmapper.h" @@ -7345,6 +7346,12 @@ CreateRestartPoint(int flags) CheckPointGuts(lastCheckPoint.redo, flags); + /* + * This location is important to be after CheckPointGuts() to ensure + * that some work has happened. + */ + INJECTION_POINT("CreateRestartPoint"); + /* * Remember the prior checkpoint's redo ptr for * UpdateCheckPointDistanceEstimate() diff --git a/src/test/modules/test_injection_points/meson.build b/src/test/modules/test_injection_points/meson.build index 6006b38f3d..6ebdc728b7 100644 --- a/src/test/modules/test_injection_points/meson.build +++ b/src/test/modules/test_injection_points/meson.build @@ -37,6 +37,7 @@ tests += { 'tap': { 'tests': [ 't/001_snapshot_status.pl', + 't/002_invalid_checkpoint_after_promote.pl', ], } } diff --git a/src/test/modules/test_injection_points/t/002_invalid_checkpoint_after_promote.pl b/src/test/modules/test_injection_points/t/002_invalid_checkpoint_after_promote.pl new file mode 100644 index 0000000000..2da243e871 --- /dev/null +++ b/src/test/modules/test_injection_points/t/002_invalid_checkpoint_after_promote.pl @@ -0,0 +1,132 @@ +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Time::HiRes qw(usleep nanosleep); +use Test::More; + +# initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('master'); +$node_primary->init(allows_streaming => 1); +$node_primary->append_conf( + 'postgresql.conf', q[ +checkpoint_timeout = 30s +log_checkpoints = on +restart_after_crash = on +]); +$node_primary->start; +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); + +# setup a standby +my $node_standby = PostgreSQL::Test::Cluster->new('standby1'); +$node_standby->init_from_backup($node_primary, $backup_name, has_streaming => 1); +$node_standby->start; + +# dummy table for the upcoming tests. +$node_primary->safe_psql('postgres', 'checkpoint'); +$node_primary->safe_psql('postgres', 'CREATE TABLE prim_tab (a int);'); + +# Register a injection point on the standby so as the follow-up +# restart point running on it will wait. +$node_primary->safe_psql('postgres', 'CREATE EXTENSION test_injection_points;'); +# Wait until the extension has been created on the standby +$node_primary->wait_for_replay_catchup($node_standby); +# This causes a restartpoint to wait on a standby. +$node_standby->safe_psql('postgres', + "SELECT test_injection_points_attach('CreateRestartPoint', 'wait');"); + +# Execute a restart point on the standby, that will be waited on. +# This needs to be in the background as we'll wait on it. +my $logstart = -s $node_standby->logfile; +my $psql_session = + $node_standby->background_psql('postgres', on_error_stop => 0); +$psql_session->query_until(qr/starting_checkpoint/, q( + \echo starting_checkpoint + CHECKPOINT; +)); + +# Switch one WAL segment to make the restartpoint remove it. +$node_primary->safe_psql('postgres', 'INSERT INTO prim_tab VALUES (1);'); +$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal();'); +$node_primary->wait_for_replay_catchup($node_standby); + +# Wait until the checkpointer is in the middle of the restartpoint +# processing. +ok( $node_standby->poll_query_until( + 'postgres', + qq[SELECT count(*) FROM pg_stat_activity + WHERE backend_type = 'checkpointer' AND wait_event = 'test_injection_wait' ;], + '1'), + 'checkpointer is waiting at restart point' + ) or die "Timed out while waiting for checkpointer to run restartpoint"; + + +# Restartpoint should have started on standby. +my $log = slurp_file($node_standby->logfile, $logstart); +my $checkpoint_start = 0; +if ($log =~ m/restartpoint starting: immediate wait/) +{ + $checkpoint_start = 1; +} +is($checkpoint_start, 1, 'restartpoint has started'); + +# promote during restartpoint +$node_primary->stop; +$node_standby->promote; + +# Update the start position before waking up the checkpointer! +$logstart = -s $node_standby->logfile; + +# Now wake up the checkpointer +$node_standby->safe_psql('postgres', + "SELECT test_injection_points_wake();"); + +# wait until checkpoint completes on the newly-promoted standby. +my $checkpoint_complete = 0; +for (my $i = 0; $i < 3000; $i++) +{ + my $log = slurp_file($node_standby->logfile, $logstart); + if ($log =~ m/restartpoint complete/) + { + $checkpoint_complete = 1; + last; + } + usleep(100_000); +} +is($checkpoint_complete, 1, 'restartpoint has completed'); + +# kill SIGKILL a backend, and all backend will restart. Note that previous checkpoint has not completed. +my $psql_timeout = IPC::Run::timer(3600); +my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', ''); +my $killme = IPC::Run::start( + [ 'psql', '-XAtq', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d', $node_standby->connstr('postgres') ], + '<', + \$killme_stdin, + '>', + \$killme_stdout, + '2>', + \$killme_stderr, + $psql_timeout); +$killme_stdin .= q[ +SELECT pg_backend_pid(); +]; +$killme->pump until $killme_stdout =~ /[[:digit:]]+[\r\n]$/; +my $pid = $killme_stdout; +chomp($pid); +my $ret = PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'KILL', $pid); +is($ret, 0, 'killed process with KILL'); +my $stdout; +my $stderr; + +# after recovery, the server will not start, and log PANIC: could not locate a valid checkpoint record +for (my $i = 0; $i < 30; $i++) +{ + ($ret, $stdout, $stderr) = $node_standby->psql('postgres', 'select 1'); + last if $ret == 0; + sleep(1); +} +is($ret, 0, "psql connect success"); +is($stdout, 1, "psql select 1"); + +done_testing(); diff --git a/src/test/modules/test_injection_points/test_injection_points--1.0.sql b/src/test/modules/test_injection_points/test_injection_points--1.0.sql index 1c0a689ae2..05f97f0982 100644 --- a/src/test/modules/test_injection_points/test_injection_points--1.0.sql +++ b/src/test/modules/test_injection_points/test_injection_points--1.0.sql @@ -25,6 +25,16 @@ RETURNS void AS 'MODULE_PATHNAME', 'test_injection_points_run' LANGUAGE C STRICT PARALLEL UNSAFE; +-- +-- test_injection_points_wake() +-- +-- Wakes a condition variable executed in an injection point. +-- +CREATE FUNCTION test_injection_points_wake() +RETURNS void +AS 'MODULE_PATHNAME', 'test_injection_points_wake' +LANGUAGE C STRICT PARALLEL UNSAFE; + -- -- test_injection_points_detach() -- diff --git a/src/test/modules/test_injection_points/test_injection_points.c b/src/test/modules/test_injection_points/test_injection_points.c index efb2c74c47..8b837d85d3 100644 --- a/src/test/modules/test_injection_points/test_injection_points.c +++ b/src/test/modules/test_injection_points/test_injection_points.c @@ -18,13 +18,56 @@ #include "postgres.h" #include "fmgr.h" +#include "storage/condition_variable.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" #include "utils/builtins.h" #include "utils/injection_point.h" +#include "utils/wait_event.h" PG_MODULE_MAGIC; +/* Shared state information for injection points. */ +typedef struct TestInjectionPointSharedState +{ + /* + * Wait variable that can be registered at a given point, and that can be + * awakened via SQL. + */ + ConditionVariable wait_point; +} TestInjectionPointSharedState; + +/* Pointer to shared-memory state. */ +static TestInjectionPointSharedState *inj_state = NULL; + +/* Wait event when waiting on condition variable */ +static uint32 test_injection_wait_event = 0; + extern PGDLLEXPORT void test_injection_error(const char *name); extern PGDLLEXPORT void test_injection_notice(const char *name); +extern PGDLLEXPORT void test_injection_wait(const char *name); + + +static void +test_injection_init_shmem(void) +{ + bool found; + + if (inj_state != NULL) + return; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + inj_state = ShmemInitStruct("test_injection_points", + sizeof(TestInjectionPointSharedState), + &found); + if (!found) + { + /* First time through ... */ + MemSet(inj_state, 0, sizeof(TestInjectionPointSharedState)); + ConditionVariableInit(&inj_state->wait_point); + } + LWLockRelease(AddinShmemInitLock); +} /* Set of callbacks available at point creation */ void @@ -39,6 +82,20 @@ test_injection_notice(const char *name) elog(NOTICE, "notice triggered for injection point %s", name); } +void +test_injection_wait(const char *name) +{ + if (inj_state == NULL) + test_injection_init_shmem(); + if (test_injection_wait_event == 0) + test_injection_wait_event = WaitEventExtensionNew("test_injection_wait"); + + /* And sleep.. */ + ConditionVariablePrepareToSleep(&inj_state->wait_point); + ConditionVariableSleep(&inj_state->wait_point, test_injection_wait_event); + ConditionVariableCancelSleep(); +} + /* * SQL function for creating an injection point. */ @@ -54,6 +111,8 @@ test_injection_points_attach(PG_FUNCTION_ARGS) function = "test_injection_error"; else if (strcmp(mode, "notice") == 0) function = "test_injection_notice"; + else if (strcmp(mode, "wait") == 0) + function = "test_injection_wait"; else elog(ERROR, "incorrect mode \"%s\" for injection point creation", mode); @@ -76,6 +135,20 @@ test_injection_points_run(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +/* + * SQL function for waking a condition variable. + */ +PG_FUNCTION_INFO_V1(test_injection_points_wake); +Datum +test_injection_points_wake(PG_FUNCTION_ARGS) +{ + if (inj_state == NULL) + test_injection_init_shmem(); + + ConditionVariableBroadcast(&inj_state->wait_point); + PG_RETURN_VOID(); +} + /* * SQL function for dropping an injection point. */ -- 2.43.0