From 98465a3b80707ff02edede39f180dbf460f14cdd Mon Sep 17 00:00:00 2001 From: Seongjun Shin Date: Fri, 29 May 2026 14:45:23 +0900 Subject: [PATCH v5 1/2] Add wait events for server logging destination writes When a backend writes server log output, the underlying call can block: write(2) to the syslogger pipe or to stderr once the pipe buffer fills up or the output device is slow, and syslog(3) when the system logger is slow. These blocking calls were not instrumented, so pg_stat_activity reported wait_event IS NULL during that time. Many monitoring tools interpret NULL as on-CPU work, which made heavy-logging stalls hard to attribute. Add three new WaitEventIO events and report them around the relevant calls: IO / SysloggerWrite - write(2) to the syslogger pipe inside write_pipe_chunks(). IO / StderrWrite - write(2) to stderr inside write_console(). IO / SyslogWrite - syslog(3) inside write_syslog(). The instrumentation is limited to the leaf write/syslog call. It uses only the existing pgstat_report_wait_start()/end() inline helpers, which are allocation-free and safe to call before MyProc is set up, so this remains safe to invoke from within error reporting paths. The call that actually blocks on a slow log device is write_syslogger_file() in the syslogger process, which has no PGPROC and so never appears in pg_stat_activity. Backends surface the stall as the pipe fills and they block on the pipe write, so the backend-side SysloggerWrite event is where it becomes visible. Because wait events are single-slot (pgstat_report_wait_end() simply writes 0), a log message emitted while an outer wait event is set will briefly mask that event. This is a pre-existing property of the mechanism rather than something specific to these events; a comment at each wrapped site notes it. --- .../utils/activity/wait_event_names.txt | 3 +++ src/backend/utils/error/elog.c | 26 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 560659f9568..d4a0f6361ad 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -258,6 +258,9 @@ SLRU_WRITE "Waiting for a write of an SLRU page." SNAPBUILD_READ "Waiting for a read of a serialized historical catalog snapshot." SNAPBUILD_SYNC "Waiting for a serialized historical catalog snapshot to reach durable storage." SNAPBUILD_WRITE "Waiting for a write of a serialized historical catalog snapshot." +STDERR_WRITE "Waiting for a write to the server's standard error stream." +SYSLOGGER_WRITE "Waiting for a write to the syslogger pipe." +SYSLOG_WRITE "Waiting for a write to the system logger (syslog)." TIMELINE_HISTORY_FILE_SYNC "Waiting for a timeline history file received via streaming replication to reach durable storage." TIMELINE_HISTORY_FILE_WRITE "Waiting for a write of a timeline history file received via streaming replication." TIMELINE_HISTORY_READ "Waiting for a read of a timeline history file." diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index a6936a0c664..da17efd88c1 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -89,6 +89,7 @@ #include "utils/pg_locale.h" #include "utils/ps_status.h" #include "utils/varlena.h" +#include "utils/wait_event.h" /* In this module, access gettext() via err_gettext() */ @@ -2890,10 +2891,16 @@ write_syslog(int level, const char *line) chunk_nr++; + /* + * A nested log write briefly masks any outer wait event + * (wait events are single-slot). + */ + pgstat_report_wait_start(WAIT_EVENT_SYSLOG_WRITE); if (syslog_sequence_numbers) syslog(level, "[%lu-%d] %s", seq, chunk_nr, buf); else syslog(level, "[%d] %s", chunk_nr, buf); + pgstat_report_wait_end(); line += buflen; len -= buflen; @@ -2902,10 +2909,12 @@ write_syslog(int level, const char *line) else { /* message short enough */ + pgstat_report_wait_start(WAIT_EVENT_SYSLOG_WRITE); if (syslog_sequence_numbers) syslog(level, "[%lu] %s", seq, line); else syslog(level, "%s", line); + pgstat_report_wait_end(); } } #endif /* HAVE_SYSLOG */ @@ -3089,8 +3098,13 @@ write_console(const char *line, int len) /* * We ignore any error from write() here. We have no useful way to report * it ... certainly whining on stderr isn't likely to be productive. + * + * A nested log write briefly masks any outer wait event (wait events are + * single-slot). */ + pgstat_report_wait_start(WAIT_EVENT_STDERR_WRITE); rc = write(fileno(stderr), line, len); + pgstat_report_wait_end(); (void) rc; } @@ -3903,6 +3917,14 @@ send_message_to_server_log(ErrorData *edata) * that are no more than that length, and send one chunk per write() call. * The collector process knows how to reassemble the chunks. * + * The actual file write happens later in the syslogger process + * (write_syslogger_file()), which has no PGPROC and so never shows up in + * pg_stat_activity. A backend blocks here on the pipe write once the pipe + * fills, so this SysloggerWrite event on the backend side is where such a + * stall becomes visible. As with the other logging writes, the wait is + * reported only around the leaf write() and uses the single-slot wait-event + * mechanism, so a log emitted while an outer event is set briefly masks it. + * * Because of the atomic write requirement, there are only two possible * results from write() here: -1 for failure, or the requested number of * bytes. There is not really anything we can do about a failure; retry would @@ -3937,7 +3959,9 @@ write_pipe_chunks(char *data, int len, int dest) /* no need to set PIPE_PROTO_IS_LAST yet */ p.proto.len = PIPE_MAX_PAYLOAD; memcpy(p.proto.data, data, PIPE_MAX_PAYLOAD); + pgstat_report_wait_start(WAIT_EVENT_SYSLOGGER_WRITE); rc = write(fd, &p, PIPE_HEADER_SIZE + PIPE_MAX_PAYLOAD); + pgstat_report_wait_end(); (void) rc; data += PIPE_MAX_PAYLOAD; len -= PIPE_MAX_PAYLOAD; @@ -3947,7 +3971,9 @@ write_pipe_chunks(char *data, int len, int dest) p.proto.flags |= PIPE_PROTO_IS_LAST; p.proto.len = len; memcpy(p.proto.data, data, len); + pgstat_report_wait_start(WAIT_EVENT_SYSLOGGER_WRITE); rc = write(fd, &p, PIPE_HEADER_SIZE + len); + pgstat_report_wait_end(); (void) rc; } -- 2.50.1 (Apple Git-155)