doc/src/sgml/config.sgml | 15 ++++++ src/backend/access/transam/xlog.c | 3 ++ src/backend/postmaster/pgstat.c | 31 +++++++++++ src/backend/storage/ipc/procsignal.c | 36 +++++++++++++ src/backend/storage/ipc/standby.c | 76 +++++++++++++++++++++++++++ src/backend/tcop/postgres.c | 54 +++++++++++++++++++ src/backend/utils/misc/guc.c | 10 +++- src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/access/xlog.h | 1 + src/include/pgstat.h | 1 + src/include/storage/procsignal.h | 1 + src/include/storage/standby.h | 1 + src/include/tcop/tcopprot.h | 1 + 13 files changed, 230 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index aca8f73a50..5580ea6978 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -6763,6 +6763,21 @@ log_line_prefix = '%m [%p] %q%u@%d/%a ' + + log_recovery_conflicts (boolean) + + log_recovery_conflicts configuration parameter + + + + + Controls whether detailed information is produced when a conflict + occurs during standby recovery. The default is on. + Only superusers can change this setting. + + + + log_parameter_max_length (integer) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 55cac186dc..d3f923f951 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -880,6 +880,8 @@ static bool holdingAllLocks = false; #ifdef WAL_DEBUG static MemoryContext walDebugCxt = NULL; #endif +/* For LogBlockedWalRecordInfo */ +XLogReaderState *current_replayed_xlog = NULL; static void readRecoverySignalFile(void); static void validateRecoveryParameters(void); @@ -7319,6 +7321,7 @@ StartupXLOG(void) TransactionIdIsValid(record->xl_xid)) RecordKnownAssignedTransactionIds(record->xl_xid); + current_replayed_xlog = xlogreader; /* Now apply the WAL record itself */ RmgrTable[record->xl_rmid].rm_redo(xlogreader); diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 309378ae54..513c9cd643 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -6633,6 +6633,37 @@ pgstat_db_requested(Oid databaseid) return false; } +/* ---------- + * pgstat_get_state_name() - + * + * Return a string representing the current backend state. + * ---------- + */ +const char * +pgstat_get_state_name(BackendState backend_state) +{ + switch (backend_state) + { + case STATE_IDLE: + return "idle"; + case STATE_RUNNING: + return "active"; + case STATE_IDLEINTRANSACTION: + return "idle in transaction"; + case STATE_FASTPATH: + return "fastpath function call"; + case STATE_IDLEINTRANSACTION_ABORTED: + return "idle in transaction (aborted)"; + case STATE_DISABLED: + return "disabled"; + case STATE_UNDEFINED: + return ""; + default: + Assert(0); + } + return ""; +} + /* * Convert a potentially unsafely truncated activity string (see * PgBackendStatus.st_activity_raw's documentation) into a correctly truncated diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index c809196d06..2d5886ff80 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -576,3 +576,39 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) errno = save_errno; } + +extern const char *get_procsignal_reason_desc(ProcSignalReason reason) + { + const char *reasonDesc = "unknown reason"; + + switch (reason) + { + case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + reasonDesc = "User was holding shared buffer pin for too long."; + break; + case PROCSIG_RECOVERY_CONFLICT_LOCK: + reasonDesc = "User was holding a relation lock for too long."; + break; + case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + reasonDesc = "User was or might have been using tablespace that must be dropped."; + break; + case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + reasonDesc = "User query might have needed to see row versions that must be removed."; + break; + case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + reasonDesc = "User transaction caused buffer deadlock with recovery."; + break; + case PROCSIG_RECOVERY_CONFLICT_DATABASE: + reasonDesc = "User was connected to a database that must be dropped."; + break; + case PROCSIG_CATCHUP_INTERRUPT: + case PROCSIG_NOTIFY_INTERRUPT: + case PROCSIG_PARALLEL_MESSAGE: + case PROCSIG_WALSND_INIT_STOPPING: + case NUM_PROCSIGNALS: + break; + default: + break; + } + return reasonDesc; +} diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 9e0d5ec257..2ec2b37fcf 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -16,6 +16,7 @@ *------------------------------------------------------------------------- */ #include "postgres.h" +#include "access/rmgr.h" #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" @@ -34,6 +35,7 @@ #include "utils/ps_status.h" #include "utils/timeout.h" #include "utils/timestamp.h" +#include "access/xlog_internal.h" /* User-settable GUC parameters */ int vacuum_defer_cleanup_age; @@ -215,6 +217,57 @@ WaitExceedsMaxStandbyDelay(uint32 wait_event_info) return false; } +/* + * Display information about the wal record + * apply being blocked + */ +static void +LogBlockedWalRecordInfo(int num_waitlist_entries, ProcSignalReason reason) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + uint64 wal_record_lsn; + TimestampTz rtime; + bool fromStream; + char *receipt_time_str; + RmgrId rmgrId = -1; + uint8 info = 0; + const char *rm_name; + const char *rm_info; + char *wal_record_info_str; + + if (num_waitlist_entries > 0) + ereport(LOG, (errmsg("wal record apply is blocked by %d connection(s), reason: %s", + num_waitlist_entries, get_procsignal_reason_desc(reason)))); + else + ereport(LOG, (errmsg("wal record apply is blocked, reason: %s", + get_procsignal_reason_desc(reason)))); + + wal_record_lsn = current_replayed_xlog->ReadRecPtr; + rmgrId = XLogRecGetRmid(current_replayed_xlog); + info = XLogRecGetInfo(current_replayed_xlog) & ~XLR_INFO_MASK; + + rm_name = RmgrTable[rmgrId].rm_name; + rm_info = RmgrTable[rmgrId].rm_identify(info); + + GetXLogReceiptTime(&rtime, &fromStream); + receipt_time_str = pstrdup(timestamptz_to_str(rtime)); + wal_record_info_str = psprintf("blocked wal record rmgr: %s, lsn: %X/%08X, received at: %s, desc: %s", rm_name, (uint32) (wal_record_lsn >> 32), (uint32) wal_record_lsn, receipt_time_str, rm_info); + + if (XLogRecGetBlockTag(current_replayed_xlog, 0, &rnode, &forknum, &blknum)) + ereport(LOG, (errmsg("%s, relation: rel %u/%u/%u fork %s blk %u", + wal_record_info_str, + rnode.spcNode, rnode.dbNode, rnode.relNode, + forkNames[forknum], + blknum))); + else + ereport(LOG, (errmsg("%s", wal_record_info_str))); + + pfree(wal_record_info_str); + pfree(receipt_time_str); +} + /* * This is the main executioner for any query backend that conflicts with * recovery processing. Judgement has already been passed on it within @@ -232,6 +285,8 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, { TimestampTz waitStart = 0; char *new_status; + VirtualTransactionId *tmpWaitlist; + int num_waitlist_entries; /* Fast exit, to avoid a kernel call if there's no work to be done. */ if (!VirtualTransactionIdIsValid(*waitlist)) @@ -241,6 +296,18 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, waitStart = GetCurrentTimestamp(); new_status = NULL; /* we haven't changed the ps display */ + tmpWaitlist = waitlist; + while (VirtualTransactionIdIsValid(*tmpWaitlist)) + { + tmpWaitlist++; + } + + num_waitlist_entries = (tmpWaitlist - waitlist); + + /* display wal record information */ + if (log_recovery_conflicts) + LogBlockedWalRecordInfo(num_waitlist_entries, reason); + while (VirtualTransactionIdIsValid(*waitlist)) { /* reset standbyWait_us for each xact we wait for */ @@ -370,6 +437,11 @@ ResolveRecoveryConflictWithDatabase(Oid dbid) * block during InitPostgres() and then disconnect when they see the * database has been removed. */ + + /* display wal record information */ + if (log_recovery_conflicts) + LogBlockedWalRecordInfo(CountDBBackends(dbid), PROCSIG_RECOVERY_CONFLICT_DATABASE); + while (CountDBBackends(dbid) > 0) { CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true); @@ -609,6 +681,10 @@ StandbyTimeoutHandler(void) /* forget any pending STANDBY_DEADLOCK_TIMEOUT request */ disable_timeout(STANDBY_DEADLOCK_TIMEOUT, false); + /* display wal record information */ + if (log_recovery_conflicts) + LogBlockedWalRecordInfo(-1, PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); } diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index c9424f167c..37f1abf802 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -98,6 +98,9 @@ int log_statement = LOGSTMT_NONE; /* GUC variable for maximum stack depth (measured in kilobytes) */ int max_stack_depth = 100; +/* GUC variable for standby recovery conflicts information */ +bool log_recovery_conflicts = true; + /* wait N seconds to allow attach from a debugger */ int PostAuthDelay = 0; @@ -126,6 +129,8 @@ char *stack_base_ptr = NULL; char *register_stack_base_ptr = NULL; #endif +#define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var)))) + /* * Flag to keep track of whether we have started a transaction. * For extended query protocol this has to be remembered across messages. @@ -191,6 +196,7 @@ static void drop_unnamed_stmt(void); static void log_disconnections(int code, Datum arg); static void enable_statement_timeout(void); static void disable_statement_timeout(void); +static void LogBlockingWalRecordApplyInfo(void); /* ---------------------------------------------------------------- @@ -2886,6 +2892,52 @@ FloatExceptionHandler(SIGNAL_ARGS) "invalid operation, such as division by zero."))); } +/* + * Display information about the backend + * about to be interrupted due to recovery conflict + */ +void +LogBlockingWalRecordApplyInfo(void) +{ + + PgBackendStatus *beentry; + int num_backends = pgstat_fetch_stat_numbackends(); + int curr_backend; + uint32 raw_wait_event; + const char *wait_event = NULL; + const char *activity_start; + const char *backend_state_name; + const char *backend_type; + + Assert(MyProc != NULL); + /* + * Make sure any stats snapshot is thrown away + */ + pgstat_clear_snapshot(); + + for (curr_backend = 1; curr_backend <= num_backends; curr_backend++) + { + beentry = pgstat_fetch_stat_beentry(curr_backend); + if (beentry->st_procpid != MyProc->pid) + continue; + + backend_state_name = pgstat_get_state_name(beentry->st_state); + backend_type = GetBackendTypeDesc(beentry->st_backendType); + activity_start = timestamptz_to_str(beentry->st_activity_start_timestamp); + + raw_wait_event = UINT32_ACCESS_ONCE(MyProc->wait_event_info); + wait_event = pgstat_get_wait_event(raw_wait_event); + + ereport(LOG, + (errmsg("about to be interrupted: backend_type: %s, state: %s, wait_event: %s, query_start: %s", + backend_type, backend_state_name, wait_event ? wait_event : "none", activity_start))); + } + /* + * Make sure any stats snapshot is thrown away + */ + pgstat_clear_snapshot(); +} + /* * RecoveryConflictInterrupt: out-of-line portion of recovery conflict * handling following receipt of SIGUSR1. Designed to be similar to die() @@ -2986,6 +3038,8 @@ RecoveryConflictInterrupt(ProcSignalReason reason) (int) reason); } + if (log_recovery_conflicts) + LogBlockingWalRecordApplyInfo(); Assert(RecoveryConflictPending && (QueryCancelPending || ProcDiePending)); /* diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 2f3e0a70e0..7aa55db5af 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1594,7 +1594,15 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, - + { + {"log_recovery_conflicts", PGC_SUSET, LOGGING_WHAT, + gettext_noop("Logs standby recovery conflicts."), + NULL + }, + &log_recovery_conflicts, + true, + NULL, NULL, NULL + }, { {"log_hostname", PGC_SIGHUP, LOGGING_WHAT, gettext_noop("Logs the host name in the connection logs."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index ac02bd0c00..1009506eff 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -548,6 +548,7 @@ # %% = '%' # e.g. '<%u%%%d> ' #log_lock_waits = off # log lock waits >= deadlock_timeout +#log_recovery_conflicts = on # log standby recovery conflicts #log_parameter_max_length = -1 # when logging statements, limit logged # bind-parameter values to N bytes; # -1 means print in full, 0 disables diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index e917dfe92d..eafd4201db 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -100,6 +100,7 @@ typedef enum extern XLogRecPtr ProcLastRecPtr; extern XLogRecPtr XactLastRecEnd; extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd; +extern XLogReaderState *current_replayed_xlog; extern bool reachedConsistency; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index c55dc1481c..2fca769bc9 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -1334,6 +1334,7 @@ extern PgStat_TableStatus *find_tabstat_entry(Oid rel_id); extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id); extern void pgstat_initstats(Relation rel); +extern const char *pgstat_get_state_name(BackendState backend_state); extern char *pgstat_clip_activity(const char *raw_activity); diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index a0c0bc3ce5..c57413b06d 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -70,5 +70,6 @@ extern void WaitForProcSignalBarrier(uint64 generation); extern void ProcessProcSignalBarrier(void); extern void procsignal_sigusr1_handler(SIGNAL_ARGS); +extern const char *get_procsignal_reason_desc(ProcSignalReason reason); #endif /* PROCSIGNAL_H */ diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index cfbe426e5a..0a99ad3f40 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -23,6 +23,7 @@ extern int vacuum_defer_cleanup_age; extern int max_standby_archive_delay; extern int max_standby_streaming_delay; +extern bool log_recovery_conflicts; extern void InitRecoveryTransactionEnvironment(void); extern void ShutdownRecoveryTransactionEnvironment(void); diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h index bd30607b07..e73602ca2e 100644 --- a/src/include/tcop/tcopprot.h +++ b/src/include/tcop/tcopprot.h @@ -29,6 +29,7 @@ extern CommandDest whereToSendOutput; extern PGDLLIMPORT const char *debug_query_string; extern int max_stack_depth; extern int PostAuthDelay; +extern bool log_recovery_conflicts; /* GUC-configurable parameters */