From 4c5b1aad892240e72d8e18786e7cc409fb5e8f85 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Tue, 1 Jul 2025 12:03:55 +0000
Subject: [PATCH v2 2/2] Add os_page_num to pg_buffercache

ba2a3c2302f added a way to check if a buffer is spread across multiple pages.

Adding the same information in pg_buffercache so that one does not need NUMA
support enabled to get this information.
---
 contrib/pg_buffercache/Makefile               |   2 +-
 .../expected/pg_buffercache.out               |   2 +-
 contrib/pg_buffercache/meson.build            |   1 +
 .../pg_buffercache--1.6--1.7.sql              |  11 ++
 contrib/pg_buffercache/pg_buffercache.control |   2 +-
 contrib/pg_buffercache/pg_buffercache_pages.c | 108 ++++++++++++++----
 contrib/pg_buffercache/sql/pg_buffercache.sql |   2 +-
 doc/src/sgml/pgbuffercache.sgml               |  34 ++++++
 8 files changed, 133 insertions(+), 29 deletions(-)
 create mode 100644 contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql

diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index 5f748543e2e..0e618f66aec 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -9,7 +9,7 @@ EXTENSION = pg_buffercache
 DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
 	pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
 	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
-	pg_buffercache--1.5--1.6.sql
+	pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql
 PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
 
 REGRESS = pg_buffercache pg_buffercache_numa
diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index 9a9216dc7b1..52e38dcc027 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -1,5 +1,5 @@
 CREATE EXTENSION pg_buffercache;
-select count(*) = (select setting::bigint
+select count(*) >= (select setting::bigint
                    from pg_settings
                    where name = 'shared_buffers')
 from pg_buffercache;
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 7cd039a1df9..7c31141881f 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -24,6 +24,7 @@ install_data(
   'pg_buffercache--1.3--1.4.sql',
   'pg_buffercache--1.4--1.5.sql',
   'pg_buffercache--1.5--1.6.sql',
+  'pg_buffercache--1.6--1.7.sql',
   'pg_buffercache.control',
   kwargs: contrib_data_args,
 )
diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
new file mode 100644
index 00000000000..f2ac5d79eb8
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
@@ -0,0 +1,11 @@
+/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit
+
+-- Upgrade view to 1.7. format
+CREATE OR REPLACE VIEW pg_buffercache AS
+	SELECT P.* FROM pg_buffercache_pages() AS P
+	(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+	 pinning_backends int4, os_page_num bigint);
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index b030ba3a6fa..11499550945 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
 # pg_buffercache extension
 comment = 'examine the shared buffer cache'
-default_version = '1.6'
+default_version = '1.7'
 module_pathname = '$libdir/pg_buffercache'
 relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 8ef13d74186..f46de190975 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -19,7 +19,7 @@
 
 
 #define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8
-#define NUM_BUFFERCACHE_PAGES_ELEM	9
+#define NUM_BUFFERCACHE_PAGES_ELEM	10
 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
 #define NUM_BUFFERCACHE_EVICT_ELEM 2
@@ -60,6 +60,7 @@ typedef struct
 	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
+	int64		page_num;
 } BufferCachePagesRec;
 
 
@@ -69,6 +70,9 @@ typedef struct
 typedef struct
 {
 	TupleDesc	tupdesc;
+	int			buffers_per_page;
+	int			pages_per_buffer;
+	int			os_page_size;
 	BufferCachePagesRec *record;
 } BufferCachePagesContext;
 
@@ -152,8 +156,24 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 
 	if (SRF_IS_FIRSTCALL())
 	{
-		int			i;
+		int			i,
+					idx;
+		Size		os_page_size;
+		char	   *startptr;
+		int			max_entries;
 
+		/*
+		 * Different database block sizes (4kB, 8kB, ..., 32kB) can be used,
+		 * while the OS may have different memory page sizes.
+		 *
+		 * To correctly map between them, we need to: 1. Determine the OS
+		 * memory page size 2. Calculate how many OS pages are used by all
+		 * buffer blocks 3. Calculate how many OS pages are contained within
+		 * each database block.
+		 */
+		os_page_size = pg_get_shmem_pagesize();
+
+		/* Initialize the multi-call context, load entries about buffers */
 		funcctx = SRF_FIRSTCALL_INIT();
 
 		/* Switch context when allocating stuff to be used in later calls */
@@ -196,24 +216,36 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
 						   INT2OID, -1, 0);
 
-		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+		if (expected_tupledesc->natts >= (NUM_BUFFERCACHE_PAGES_ELEM - 1))
 			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
 							   INT4OID, -1, 0);
 
+		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+			TupleDescInitEntry(tupledesc, (AttrNumber) 10, "os_page_num",
+							   INT8OID, -1, 0);
+
 		fctx->tupdesc = BlessTupleDesc(tupledesc);
 
+		/*
+		 * Each buffer needs at least one entry, but it might be offset in
+		 * some way, and use one extra entry. So we allocate space for the
+		 * maximum number of entries we might need, and then count the exact
+		 * number as we're walking buffers. That way we can do it in one pass,
+		 * without reallocating memory.
+		 */
+		max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
+
 		/* Allocate NBuffers worth of BufferCachePagesRec records. */
 		fctx->record = (BufferCachePagesRec *)
 			MemoryContextAllocHuge(CurrentMemoryContext,
-								   sizeof(BufferCachePagesRec) * NBuffers);
-
-		/* Set max calls and remember the user function context. */
-		funcctx->max_calls = NBuffers;
-		funcctx->user_fctx = fctx;
+								   sizeof(BufferCachePagesRec) * max_entries);
 
 		/* Return to original context when allocating transient memory */
 		MemoryContextSwitchTo(oldcontext);
 
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+		idx = 0;
+
 		/*
 		 * Scan through all the buffers, saving the relevant fields in the
 		 * fctx->record structure.
@@ -224,35 +256,58 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		 */
 		for (i = 0; i < NBuffers; i++)
 		{
+			char	   *buffptr = (char *) BufferGetBlock(i + 1);
 			BufferDesc *bufHdr;
 			uint32		buf_state;
+			int32		page_num;
+			char	   *startptr_buff,
+					   *endptr_buff;
 
 			bufHdr = GetBufferDescriptor(i);
 			/* Lock each buffer header before inspecting. */
 			buf_state = LockBufHdr(bufHdr);
 
-			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
-			fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
-			fctx->record[i].reltablespace = bufHdr->tag.spcOid;
-			fctx->record[i].reldatabase = bufHdr->tag.dbOid;
-			fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
-			fctx->record[i].blocknum = bufHdr->tag.blockNum;
-			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
-			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+			/* Get page boundaries for this buffer. */
+			get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+									   &startptr_buff, &endptr_buff, &page_num);
 
-			if (buf_state & BM_DIRTY)
-				fctx->record[i].isdirty = true;
-			else
-				fctx->record[i].isdirty = false;
+			/* Add an entry for each OS page overlapping with this buffer. */
+			for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+			{
+				fctx->record[idx].bufferid = BufferDescriptorGetBuffer(bufHdr);
+				fctx->record[idx].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
+				fctx->record[idx].reltablespace = bufHdr->tag.spcOid;
+				fctx->record[idx].reldatabase = bufHdr->tag.dbOid;
+				fctx->record[idx].forknum = BufTagGetForkNum(&bufHdr->tag);
+				fctx->record[idx].blocknum = bufHdr->tag.blockNum;
+				fctx->record[idx].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
+				fctx->record[idx].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+
+				if (buf_state & BM_DIRTY)
+					fctx->record[idx].isdirty = true;
+				else
+					fctx->record[idx].isdirty = false;
+
+				/* Note if the buffer is valid, and has storage created */
+				if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
+					fctx->record[idx].isvalid = true;
+				else
+					fctx->record[idx].isvalid = false;
 
-			/* Note if the buffer is valid, and has storage created */
-			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
-				fctx->record[i].isvalid = true;
-			else
-				fctx->record[i].isvalid = false;
+				fctx->record[idx].page_num = page_num;
+				/* advance to the next entry/page */
+				++idx;
+				++page_num;
+			}
 
 			UnlockBufHdr(bufHdr, buf_state);
 		}
+
+		Assert(idx <= max_entries);
+
+		/* Set max calls and remember the user function context. */
+		funcctx->max_calls = idx;
+		funcctx->user_fctx = fctx;
 	}
 
 	funcctx = SRF_PERCALL_SETUP();
@@ -285,6 +340,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			nulls[7] = true;
 			/* unused for v1.0 callers, but the array is always long enough */
 			nulls[8] = true;
+			nulls[9] = true;
 		}
 		else
 		{
@@ -305,6 +361,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			/* unused for v1.0 callers, but the array is always long enough */
 			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
 			nulls[8] = false;
+			values[9] = Int64GetDatum(fctx->record[i].page_num);
+			nulls[9] = false;
 		}
 
 		/* Build and return the tuple. */
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 47cca1907c7..78ec7a88ea0 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -1,6 +1,6 @@
 CREATE EXTENSION pg_buffercache;
 
-select count(*) = (select setting::bigint
+select count(*) >= (select setting::bigint
                    from pg_settings
                    where name = 'shared_buffers')
 from pg_buffercache;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 537d6014942..6183adf73db 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -205,6 +205,15 @@
        Number of backends pinning this buffer
       </para></entry>
      </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>os_page_num</structfield> <type>bigint</type>
+      </para>
+      <para>
+       number of OS memory page for this buffer
+      </para></entry>
+     </row>
     </tbody>
    </tgroup>
   </table>
@@ -550,6 +559,31 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
  public     | spgist_text_tbl        |     182
 (10 rows)
 
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+             FROM pg_buffercache b JOIN pg_class c
+             ON b.relfilenode = pg_relation_filenode(c.oid) AND
+                b.reldatabase IN (0, (SELECT oid FROM pg_database
+                                      WHERE datname = current_database()))
+             JOIN pg_namespace n ON n.oid = c.relnamespace
+             JOIN (SELECT bufferid FROM pg_buffercache
+                   GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+             GROUP BY n.nspname, c.relname
+             ORDER BY 3 DESC
+             LIMIT 10;
+
+  nspname   |             relname             | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public     | gin_test_tbl                    |                         4
+ public     | delete_test_table               |                         4
+ public     | tenk1                           |                         4
+ pg_catalog | pg_attribute_relid_attnum_index |                         4
+ pg_catalog | pg_class                        |                         2
+ pg_catalog | pg_depend_depender_index        |                         2
+ pg_catalog | pg_attribute                    |                         2
+ pg_catalog | pg_opfamily                     |                         2
+ pg_catalog | pg_opclass_oid_index            |                         2
+ pg_catalog | pg_description                  |                         2
+(10 rows)
 
 regression=# SELECT * FROM pg_buffercache_summary();
  buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
-- 
2.34.1

