From f1e91ae45a52b90973906bbd2e528688a890a560 Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot Date: Mon, 15 Jun 2026 05:44:08 +0000 Subject: [PATCH v2] Fix pgstat_count_io_op_time() calls passing error returns as bytes Several calls of pgstat_count_io_op_time() have been used as data to count negative values returned by pg_pread() or pg_pwrite(), leading to an incorrect count reported, casting them back to uint64. Most of the problematic calls updated here are adjusted so as we do not report buggy negative numbers anymore. In xlogrecovery.c, the spot updated still counts short reads. In xlog.c after a WAL segment initialization, we count a number only after checking that the operation succeeds. Reported-by: Peter Eisentraut Author: Bertrand Drouvot Discussion: https://postgr.es/m/0db864e6-4477-4eba-b2be-d3523cc86564@eisentraut.org Backpatch-through: 18 --- src/backend/access/transam/xlog.c | 22 +++++++++++----------- src/backend/access/transam/xlogreader.c | 8 +++++--- src/backend/access/transam/xlogrecovery.c | 6 ++++-- src/backend/replication/walreceiver.c | 6 +++--- 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6c2304fef33f..a81912b7441e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2455,9 +2455,6 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) written = pg_pwrite(openLogFile, from, nleft, startoffset); pgstat_report_wait_end(); - pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, - IOOP_WRITE, start, 1, written); - if (written <= 0) { char xlogfname[MAXFNAMELEN]; @@ -2475,6 +2472,9 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) errmsg("could not write to log file \"%s\" at offset %u, length %zu: %m", xlogfname, startoffset, nleft))); } + + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, + IOOP_WRITE, start, 1, written); nleft -= written; from += written; startoffset += written; @@ -3331,14 +3331,6 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, } pgstat_report_wait_end(); - /* - * A full segment worth of data is written when using wal_init_zero. One - * byte is written when not using it. - */ - pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE, - io_start, 1, - wal_init_zero ? wal_segment_size : 1); - if (save_errno) { /* @@ -3355,6 +3347,14 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, errmsg("could not write to file \"%s\": %m", tmppath))); } + /* + * A full segment worth of data is written when using wal_init_zero. One + * byte is written when not using it. + */ + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE, + io_start, 1, + wal_init_zero ? wal_segment_size : 1); + /* Measure I/O timing when flushing segment */ io_start = pgstat_prepare_io_time(track_wal_io_timing); diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 3145c58a9b19..9d64ae34932b 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1597,9 +1597,6 @@ WALRead(XLogReaderState *state, #ifndef FRONTEND pgstat_report_wait_end(); - - pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, - io_start, 1, readbytes); #endif if (readbytes <= 0) @@ -1612,6 +1609,11 @@ WALRead(XLogReaderState *state, return false; } +#ifndef FRONTEND + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, + io_start, 1, readbytes); +#endif + /* Update state for read */ recptr += readbytes; nbytes -= readbytes; diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 73b78a83fa74..4d61795b4836 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -3390,8 +3390,10 @@ retry: pgstat_report_wait_end(); - pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, - io_start, 1, r); + /* Count I/O stats only for successful short reads */ + if (r > 0) + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, + io_start, 1, r); XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size); if (r < 0) diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index d19317703c1f..05e2f690fa79 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -954,9 +954,6 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, TimeLineID tli) byteswritten = pg_pwrite(recvFile, buf, segbytes, (pgoff_t) startoff); pgstat_report_wait_end(); - pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, - IOOP_WRITE, start, 1, byteswritten); - if (byteswritten <= 0) { char xlogfname[MAXFNAMELEN]; @@ -976,6 +973,9 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, TimeLineID tli) xlogfname, startoff, segbytes))); } + pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, + IOOP_WRITE, start, 1, byteswritten); + /* Update state for write */ recptr += byteswritten; -- 2.54.0