From c4c91a8579364987038ffa6e4cf0ea6314266c42 Mon Sep 17 00:00:00 2001
From: Michael Zhilin <m.zhilin@postgrespro.ru>
Date: Thu, 10 Mar 2022 17:23:11 +0300
Subject: [PATCH v1] fix column wait_event of view pg_stat_activity

In case of high amount of connections, we have observed value 'ClientRead' of
column wait_event for active connections. Actually backend sets wait_event as
'ClientRead' only in case of idle or idle in transaction state.
The reason of discrepancy is different time moments of gathering information
from internal structures. At first it gathers state information for all backends.
Then it iterates over connections to gather wait information for each backend.

Before this patch the columns 'state' and 'wait_event' of view pg_stat_activity
may contain contadictive values.

The aim of change is to make wait information correct. To achieve it the state
and wait information should be gathered at same time.

Also the patch changes the way to allocate memory for temporary (aka local state)
structures. In case of huge amount of connections (1000 or more) it results in
increase of latency and waste of allocated memory. Instead of one-shot
allocation of huge portition it triggers memory allocation on demand by chunks.

Patch makes no change of system behaviour, but makes pg_stat_activity view more
consistent and a bit faster.

Tested on CentOS 7.9 / Debian 11 / FreeBSD 14 with 8-32 cores.

Co-authored-by: Yura Sokolov <y.sokolov@postgrespro.ru>
---
 src/backend/utils/activity/backend_status.c | 129 +++++++++++++++-----
 src/backend/utils/adt/pgstatfuncs.c         |  49 ++------
 src/include/utils/backend_status.h          |  13 +-
 3 files changed, 120 insertions(+), 71 deletions(-)

diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
index c7ed1e6d7ac..5cfe35398fb 100644
--- a/src/backend/utils/activity/backend_status.c
+++ b/src/backend/utils/activity/backend_status.c
@@ -25,6 +25,7 @@
 #include "utils/guc.h"			/* for application_name */
 #include "utils/memutils.h"
 
+#define UINT32_ACCESS_ONCE(var)           ((uint32)(*((volatile uint32 *)&(var))))
 
 /* ----------
  * Total number of backends including auxiliary
@@ -716,6 +717,24 @@ pgstat_report_xact_timestamp(TimestampTz tstamp)
 	PGSTAT_END_WRITE_ACTIVITY(beentry);
 }
 
+#define PGSTAT_LOCALDATA_CHUNK	128
+#ifdef HAVE_TYPEOF
+#define PGSTAT_LOCALDATA_ALLOC(x,cnt) \
+	if (x == NULL || x > x##_thr) {	\
+		x = (typeof(x)) \
+			MemoryContextAlloc(backendStatusSnapContext, \
+							   cnt * sizeof(x[0]) * PGSTAT_LOCALDATA_CHUNK); \
+		x##_thr = x + cnt * (PGSTAT_LOCALDATA_CHUNK - 1); \
+	}
+#else
+#define PGSTAT_LOCALDATA_ALLOC(x,cnt) \
+	if (x == NULL || x > x##_thr) {	\
+		x = MemoryContextAlloc(backendStatusSnapContext, \
+							   cnt * sizeof(x[0]) * PGSTAT_LOCALDATA_CHUNK); \
+		x##_thr = x + cnt * (PGSTAT_LOCALDATA_CHUNK - 1); \
+	}
+#endif
+
 /* ----------
  * pgstat_read_current_status() -
  *
@@ -729,16 +748,25 @@ pgstat_read_current_status(void)
 	volatile PgBackendStatus *beentry;
 	LocalPgBackendStatus *localtable;
 	LocalPgBackendStatus *localentry;
+	PgBackendStatus *localbeentry,
+			   *localbeentry_thr;
+	PGPROC	   *proc;
 	char	   *localappname,
+			   *localappname_thr,
 			   *localclienthostname,
-			   *localactivity;
+			   *localclienthostname_thr,
+			   *localactivity,
+			   *localactivity_thr;
 #ifdef USE_SSL
-	PgBackendSSLStatus *localsslstatus;
+	PgBackendSSLStatus *localsslstatus,
+			   *localsslstatus_thr;
 #endif
 #ifdef ENABLE_GSS
-	PgBackendGSSStatus *localgssstatus;
+	PgBackendGSSStatus *localgssstatus,
+			   *localgssstatus_thr;
 #endif
 	int			i;
+	int			pid;
 
 	if (localBackendStatusTable)
 		return;					/* already done */
@@ -756,32 +784,39 @@ pgstat_read_current_status(void)
 	localtable = (LocalPgBackendStatus *)
 		MemoryContextAlloc(backendStatusSnapContext,
 						   sizeof(LocalPgBackendStatus) * NumBackendStatSlots);
-	localappname = (char *)
-		MemoryContextAlloc(backendStatusSnapContext,
-						   NAMEDATALEN * NumBackendStatSlots);
-	localclienthostname = (char *)
-		MemoryContextAlloc(backendStatusSnapContext,
-						   NAMEDATALEN * NumBackendStatSlots);
-	localactivity = (char *)
-		MemoryContextAllocHuge(backendStatusSnapContext,
-							   pgstat_track_activity_query_size * NumBackendStatSlots);
+
 #ifdef USE_SSL
-	localsslstatus = (PgBackendSSLStatus *)
-		MemoryContextAlloc(backendStatusSnapContext,
-						   sizeof(PgBackendSSLStatus) * NumBackendStatSlots);
+	localsslstatus = NULL;
 #endif
 #ifdef ENABLE_GSS
-	localgssstatus = (PgBackendGSSStatus *)
-		MemoryContextAlloc(backendStatusSnapContext,
-						   sizeof(PgBackendGSSStatus) * NumBackendStatSlots);
+	localgssstatus = NULL;
 #endif
 
 	localNumBackends = 0;
+	localbeentry = NULL;
+	localappname = NULL;
+	localclienthostname = NULL;
+	localactivity = NULL;
 
 	beentry = BackendStatusArray;
 	localentry = localtable;
 	for (i = 1; i <= NumBackendStatSlots; i++)
 	{
+		/*
+		 * Check available space in buffers for local state data
+		 */
+		PGSTAT_LOCALDATA_ALLOC(localbeentry, 1);
+		PGSTAT_LOCALDATA_ALLOC(localappname, NAMEDATALEN);
+		PGSTAT_LOCALDATA_ALLOC(localclienthostname, NAMEDATALEN);
+		PGSTAT_LOCALDATA_ALLOC(localactivity, pgstat_track_activity_query_size);
+#ifdef USE_SSL
+		PGSTAT_LOCALDATA_ALLOC(localsslstatus, 1);
+#endif
+#ifdef ENABLE_GSS
+		PGSTAT_LOCALDATA_ALLOC(localgssstatus, 1);
+#endif
+
+		localentry->backendStatus = localbeentry;
 		/*
 		 * Follow the protocol of retrying if st_changecount changes while we
 		 * copy the entry, or if it's odd.  (The check for odd is needed to
@@ -796,11 +831,11 @@ pgstat_read_current_status(void)
 
 			pgstat_begin_read_activity(beentry, before_changecount);
 
-			localentry->backendStatus.st_procpid = beentry->st_procpid;
+			localentry->backendStatus->st_procpid = beentry->st_procpid;
 			/* Skip all the data-copying work if entry is not in use */
-			if (localentry->backendStatus.st_procpid > 0)
+			if (localentry->backendStatus->st_procpid > 0)
 			{
-				memcpy(&localentry->backendStatus, unvolatize(PgBackendStatus *, beentry), sizeof(PgBackendStatus));
+				memcpy(localentry->backendStatus, unvolatize(PgBackendStatus *, beentry), sizeof(PgBackendStatus));
 
 				/*
 				 * For each PgBackendStatus field that is a pointer, copy the
@@ -811,23 +846,23 @@ pgstat_read_current_status(void)
 				 * because there's always a \0 at the end of the buffer.
 				 */
 				strcpy(localappname, (char *) beentry->st_appname);
-				localentry->backendStatus.st_appname = localappname;
+				localentry->backendStatus->st_appname = localappname;
 				strcpy(localclienthostname, (char *) beentry->st_clienthostname);
-				localentry->backendStatus.st_clienthostname = localclienthostname;
+				localentry->backendStatus->st_clienthostname = localclienthostname;
 				strcpy(localactivity, (char *) beentry->st_activity_raw);
-				localentry->backendStatus.st_activity_raw = localactivity;
+				localentry->backendStatus->st_activity_raw = localactivity;
 #ifdef USE_SSL
 				if (beentry->st_ssl)
 				{
 					memcpy(localsslstatus, beentry->st_sslstatus, sizeof(PgBackendSSLStatus));
-					localentry->backendStatus.st_sslstatus = localsslstatus;
+					localentry->backendStatus->st_sslstatus = localsslstatus;
 				}
 #endif
 #ifdef ENABLE_GSS
 				if (beentry->st_gss)
 				{
 					memcpy(localgssstatus, beentry->st_gssstatus, sizeof(PgBackendGSSStatus));
-					localentry->backendStatus.st_gssstatus = localgssstatus;
+					localentry->backendStatus->st_gssstatus = localgssstatus;
 				}
 #endif
 			}
@@ -843,17 +878,49 @@ pgstat_read_current_status(void)
 		}
 
 		beentry++;
+		pid = localentry->backendStatus->st_procpid;
 		/* Only valid entries get included into the local array */
-		if (localentry->backendStatus.st_procpid > 0)
+		if (pid > 0)
 		{
 			BackendIdGetTransactionIds(i,
 									   &localentry->backend_xid,
 									   &localentry->backend_xmin);
+			proc = BackendIdGetProc(i);
+
+			/*
+			 * If a PGPROC entry was retrieved, display wait events and lock
+			 * group leader information if any.  To avoid extra overhead, no
+			 * extra lock is being held, so there is no guarantee of
+			 * consistency across multiple rows.
+			 */
+			if (proc != NULL)
+			{
+				PGPROC	   *leader;
+
+				localentry->wait_event_info = UINT32_ACCESS_ONCE(proc->wait_event_info);
+				leader = proc->lockGroupLeader;
+
+				/*
+				 * Show the leader only for active parallel workers.  This
+				 * leaves the field as NULL for the leader of a parallel
+				 * group.
+				 */
+				if (leader && leader->pid != pid)
+					localentry->leader_pid = leader->pid;
+				else
+					localentry->leader_pid = 0;
+			}
+			else
+			{
+				localentry->wait_event_info = 0;
+				localentry->leader_pid = 0;
+			}
 
 			localentry++;
-			localappname += NAMEDATALEN;
-			localclienthostname += NAMEDATALEN;
-			localactivity += pgstat_track_activity_query_size;
+			localbeentry++;
+			localappname += strlen(localappname) + 1;
+			localclienthostname += strlen(localclienthostname) + 1;
+			localactivity += strlen(localactivity) + 1;
 #ifdef USE_SSL
 			localsslstatus++;
 #endif
@@ -1064,7 +1131,7 @@ pgstat_fetch_stat_beentry(int beid)
 	if (beid < 1 || beid > localNumBackends)
 		return NULL;
 
-	return &localBackendStatusTable[beid - 1].backendStatus;
+	return localBackendStatusTable[beid - 1].backendStatus;
 }
 
 
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 893690dad52..c1595ec1a9c 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -500,7 +500,7 @@ pg_stat_get_progress_info(PG_FUNCTION_ARGS)
 		if (!local_beentry)
 			continue;
 
-		beentry = &local_beentry->backendStatus;
+		beentry = local_beentry->backendStatus;
 
 		/*
 		 * Report values for only those backends which are running the given
@@ -555,7 +555,6 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)
 		bool		nulls[PG_STAT_GET_ACTIVITY_COLS];
 		LocalPgBackendStatus *local_beentry;
 		PgBackendStatus *beentry;
-		PGPROC	   *proc;
 		const char *wait_event_type = NULL;
 		const char *wait_event = NULL;
 
@@ -582,7 +581,7 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)
 			continue;
 		}
 
-		beentry = &local_beentry->backendStatus;
+		beentry = local_beentry->backendStatus;
 
 		/* If looking for specific PID, ignore all the others */
 		if (pid != -1 && beentry->st_procpid != pid)
@@ -652,46 +651,18 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)
 			pfree(clipped_activity);
 
 			/* leader_pid */
-			nulls[28] = true;
-
-			proc = BackendPidGetProc(beentry->st_procpid);
-
-			if (proc == NULL && (beentry->st_backendType != B_BACKEND))
+			if (local_beentry->leader_pid)
 			{
-				/*
-				 * For an auxiliary process, retrieve process info from
-				 * AuxiliaryProcs stored in shared-memory.
-				 */
-				proc = AuxiliaryPidGetProc(beentry->st_procpid);
+				values[28] = Int32GetDatum(local_beentry->leader_pid);
+				nulls[28] = false;
 			}
+			else
+				nulls[28] = true;
 
-			/*
-			 * If a PGPROC entry was retrieved, display wait events and lock
-			 * group leader information if any.  To avoid extra overhead, no
-			 * extra lock is being held, so there is no guarantee of
-			 * consistency across multiple rows.
-			 */
-			if (proc != NULL)
+			if (local_beentry->wait_event_info)
 			{
-				uint32		raw_wait_event;
-				PGPROC	   *leader;
-
-				raw_wait_event = UINT32_ACCESS_ONCE(proc->wait_event_info);
-				wait_event_type = pgstat_get_wait_event_type(raw_wait_event);
-				wait_event = pgstat_get_wait_event(raw_wait_event);
-
-				leader = proc->lockGroupLeader;
-
-				/*
-				 * Show the leader only for active parallel workers.  This
-				 * leaves the field as NULL for the leader of a parallel
-				 * group.
-				 */
-				if (leader && leader->pid != beentry->st_procpid)
-				{
-					values[28] = Int32GetDatum(leader->pid);
-					nulls[28] = false;
-				}
+				wait_event_type = pgstat_get_wait_event_type(local_beentry->wait_event_info);
+				wait_event = pgstat_get_wait_event(local_beentry->wait_event_info);
 			}
 
 			if (wait_event_type)
diff --git a/src/include/utils/backend_status.h b/src/include/utils/backend_status.h
index 7403bca25ed..63eae799978 100644
--- a/src/include/utils/backend_status.h
+++ b/src/include/utils/backend_status.h
@@ -245,7 +245,7 @@ typedef struct LocalPgBackendStatus
 	/*
 	 * Local version of the backend status entry.
 	 */
-	PgBackendStatus backendStatus;
+	PgBackendStatus *backendStatus;
 
 	/*
 	 * The xid of the current transaction if available, InvalidTransactionId
@@ -258,6 +258,17 @@ typedef struct LocalPgBackendStatus
 	 * not.
 	 */
 	TransactionId backend_xmin;
+
+	/*
+	 * The process wait information
+	 */
+	uint32 wait_event_info;
+
+	/*
+	 * The leader process ID
+	 */
+	int leader_pid;
+
 } LocalPgBackendStatus;
 
 
-- 
2.34.1

