From c762845c00049b418a326cc35c3fe4b51016333d Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Wed, 2 Jul 2025 04:33:03 +0000
Subject: [PATCH v4 2/2] Add pg_buffercache_os_pages function and view

ba2a3c2302f added a way to check if a buffer is spread across multiple pages.

Adding those new function and view so that one does not need NUMA support
enabled to get this information.
---
 contrib/pg_buffercache/Makefile               |   4 +-
 .../expected/pg_buffercache_os_pages.out      |  25 +++
 contrib/pg_buffercache/meson.build            |   2 +
 .../pg_buffercache--1.6--1.7.sql              |  21 +++
 contrib/pg_buffercache/pg_buffercache.control |   2 +-
 contrib/pg_buffercache/pg_buffercache_pages.c | 177 ++++++++++++++++++
 .../sql/pg_buffercache_os_pages.sql           |  16 ++
 doc/src/sgml/pgbuffercache.sgml               | 110 +++++++++++
 src/tools/pgindent/typedefs.list              |   2 +
 9 files changed, 356 insertions(+), 3 deletions(-)
 create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_os_pages.out
 create mode 100644 contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
 create mode 100644 contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql

diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index 5f748543e2e..a452b28e6d9 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -9,10 +9,10 @@ EXTENSION = pg_buffercache
 DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
 	pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
 	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
-	pg_buffercache--1.5--1.6.sql
+	pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql
 PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
 
-REGRESS = pg_buffercache pg_buffercache_numa
+REGRESS = pg_buffercache pg_buffercache_numa pg_buffercache_os_pages
 
 ifdef USE_PGXS
 PG_CONFIG = pg_config
diff --git a/contrib/pg_buffercache/expected/pg_buffercache_os_pages.out b/contrib/pg_buffercache/expected/pg_buffercache_os_pages.out
new file mode 100644
index 00000000000..2d3d3185885
--- /dev/null
+++ b/contrib/pg_buffercache/expected/pg_buffercache_os_pages.out
@@ -0,0 +1,25 @@
+-- We expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+                    from pg_settings
+                    where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ERROR:  permission denied for view pg_buffercache_os_pages
+RESET role;
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ ?column? 
+----------
+ t
+(1 row)
+
+RESET role;
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 7cd039a1df9..e2acd10c266 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -24,6 +24,7 @@ install_data(
   'pg_buffercache--1.3--1.4.sql',
   'pg_buffercache--1.4--1.5.sql',
   'pg_buffercache--1.5--1.6.sql',
+  'pg_buffercache--1.6--1.7.sql',
   'pg_buffercache.control',
   kwargs: contrib_data_args,
 )
@@ -36,6 +37,7 @@ tests += {
     'sql': [
       'pg_buffercache',
       'pg_buffercache_numa',
+      'pg_buffercache_os_pages',
     ],
   },
 }
diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
new file mode 100644
index 00000000000..28caf24688f
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
@@ -0,0 +1,21 @@
+/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit
+
+-- Register the new function.
+CREATE FUNCTION pg_buffercache_os_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_os_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_os_pages AS
+    SELECT P.* FROM pg_buffercache_os_pages() AS P
+    (bufferid integer, os_page_num bigint);
+
+REVOKE ALL ON FUNCTION pg_buffercache_os_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_os_pages FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_os_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache_os_pages TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index b030ba3a6fa..11499550945 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
 # pg_buffercache extension
 comment = 'examine the shared buffer cache'
-default_version = '1.6'
+default_version = '1.7'
 module_pathname = '$libdir/pg_buffercache'
 relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 8ef13d74186..df7287e1283 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -27,6 +27,7 @@
 #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
 
 #define NUM_BUFFERCACHE_NUMA_ELEM	3
+#define NUM_BUFFERCACHE_OS_PAGES_ELEM	2
 
 /*
  * Get the maximum buffer cache entries needed.
@@ -94,12 +95,30 @@ typedef struct
 	BufferCacheNumaRec *record;
 } BufferCacheNumaContext;
 
+/*
+ * Record structure holding the to be exposed cache data.
+ */
+typedef struct
+{
+	uint32		bufferid;
+	int64		page_num;
+} BufferCacheOsPagesRec;
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct
+{
+	TupleDesc	tupdesc;
+	BufferCacheOsPagesRec *record;
+} BufferCacheOsPagesContext;
 
 /*
  * Function returning data from the shared buffer cache - buffer number,
  * relation node/tablespace/database/blocknum and dirty indicator.
  */
 PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
 PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
 PG_FUNCTION_INFO_V1(pg_buffercache_summary);
 PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
@@ -139,6 +158,164 @@ get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
 	*endptr_buff = end_ptr;
 }
 
+/*
+ * Inquire about OS pages mappings for shared buffers.
+ *
+ * Returns each OS memory page used by the buffer. Buffers may
+ * be smaller or larger than OS memory pages. For each buffer we return one
+ * entry for each memory page used by the buffer (if the buffer is smaller,
+ * it only uses a part of one memory page).
+ *
+ * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
+ * one is always a multiple of the other.
+ */
+Datum
+pg_buffercache_os_pages(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+	MemoryContext oldcontext;
+	BufferCacheOsPagesContext *fctx;	/* User function context. */
+	TupleDesc	tupledesc;
+	TupleDesc	expected_tupledesc;
+	HeapTuple	tuple;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		int			i,
+					idx;
+		Size		os_page_size;
+		char	   *startptr;
+		int			max_entries;
+
+		/*
+		 * Different database block sizes (4kB, 8kB, ..., 32kB) can be used,
+		 * while the OS may have different memory page sizes.
+		 *
+		 * To correctly map between them, we need to: 1. Determine the OS
+		 * memory page size 2. Calculate how many OS pages are used by all
+		 * buffer blocks 3. Calculate how many OS pages are contained within
+		 * each database block.
+		 */
+		os_page_size = pg_get_shmem_pagesize();
+
+		/* Initialize the multi-call context, load entries about buffers */
+		funcctx = SRF_FIRSTCALL_INIT();
+
+		/* Switch context when allocating stuff to be used in later calls */
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		/* Create a user function context for cross-call persistence */
+		fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext));
+
+		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+			elog(ERROR, "return type must be a row type");
+
+		if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
+			elog(ERROR, "incorrect number of output arguments");
+
+		/* Construct a tuple descriptor for the result rows. */
+		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+						   INT4OID, -1, 0);
+
+		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
+						   INT8OID, -1, 0);
+
+		fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+		/*
+		 * Each buffer needs at least one entry, but it might be offset in
+		 * some way, and use one extra entry. So we allocate space for the
+		 * maximum number of entries we might need, and then count the exact
+		 * number as we're walking buffers. That way we can do it in one pass,
+		 * without reallocating memory.
+		 */
+		max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
+
+		/* Allocate NBuffers worth of BufferCacheOsPagesRec records. */
+		fctx->record = (BufferCacheOsPagesRec *)
+			MemoryContextAllocHuge(CurrentMemoryContext,
+								   sizeof(BufferCacheOsPagesRec) * max_entries);
+
+		/* Return to original context when allocating transient memory */
+		MemoryContextSwitchTo(oldcontext);
+
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+		idx = 0;
+
+		/*
+		 * Scan through all the buffers, saving the relevant fields in the
+		 * fctx->record structure.
+		 *
+		 * We don't hold the partition locks, so we don't get a consistent
+		 * snapshot across all buffers, but we do grab the buffer header
+		 * locks, so the information of each buffer is self-consistent.
+		 */
+		for (i = 0; i < NBuffers; i++)
+		{
+			char	   *buffptr = (char *) BufferGetBlock(i + 1);
+			BufferDesc *bufHdr;
+			uint32		buf_state;
+			uint32		bufferid;
+			int32		page_num;
+			char	   *startptr_buff,
+					   *endptr_buff;
+
+			bufHdr = GetBufferDescriptor(i);
+			/* Lock each buffer header before inspecting. */
+			buf_state = LockBufHdr(bufHdr);
+			bufferid = BufferDescriptorGetBuffer(bufHdr);
+			UnlockBufHdr(bufHdr, buf_state);
+
+			/* Get page boundaries for this buffer. */
+			get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+									   &startptr_buff, &endptr_buff, &page_num);
+
+			/* Add an entry for each OS page overlapping with this buffer. */
+			for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+			{
+				fctx->record[idx].bufferid = bufferid;
+				fctx->record[idx].page_num = page_num;
+				/* advance to the next entry/page */
+				++idx;
+				++page_num;
+			}
+		}
+
+		Assert(idx <= max_entries);
+
+		/* Set max calls and remember the user function context. */
+		funcctx->max_calls = idx;
+		funcctx->user_fctx = fctx;
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	/* Get the saved state */
+	fctx = funcctx->user_fctx;
+
+	if (funcctx->call_cntr < funcctx->max_calls)
+	{
+		uint32		i = funcctx->call_cntr;
+		Datum		values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+		bool		nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+
+		values[0] = Int32GetDatum(fctx->record[i].bufferid);
+		nulls[0] = false;
+		values[1] = Int64GetDatum(fctx->record[i].page_num);
+		nulls[1] = false;
+
+		/* Build and return the tuple. */
+		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+		result = HeapTupleGetDatum(tuple);
+
+		SRF_RETURN_NEXT(funcctx, result);
+	}
+	else
+		SRF_RETURN_DONE(funcctx);
+}
+
 Datum
 pg_buffercache_pages(PG_FUNCTION_ARGS)
 {
diff --git a/contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql b/contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql
new file mode 100644
index 00000000000..618b96e51b8
--- /dev/null
+++ b/contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql
@@ -0,0 +1,16 @@
+-- We expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+                    from pg_settings
+                    where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+RESET role;
+
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+RESET role;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 546ace8369e..d9a729f0c09 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -38,6 +38,8 @@
  <para>
   This module provides the <function>pg_buffercache_pages()</function>
   function (wrapped in the <structname>pg_buffercache</structname> view), the
+  <function>pg_buffercache_os_pages()</function> function (wrapped in the
+  <structname>pg_buffercache_os_pages</structname> view), the
   <function>pg_buffercache_numa_pages()</function> function (wrapped in the
   <structname>pg_buffercache_numa</structname> view), the
   <function>pg_buffercache_summary()</function> function, the
@@ -54,6 +56,13 @@
   convenient use.
  </para>
 
+ <para>
+  The <function>pg_buffercache_os_pages()</function> function provides OS
+  pages mappings for shared buffer entries. The
+  <structname>pg_buffercache_os_pages</structname> view wraps the function for
+  convenient use.
+ </para>
+
  <para>
   The <function>pg_buffercache_numa_pages()</function> function provides
   <acronym>NUMA</acronym> node mappings for shared buffer entries. This
@@ -234,6 +243,52 @@
   </para>
  </sect2>
 
+ <sect2 id="pgbuffercache-pg-buffercache-os-pages">
+  <title>The <structname>pg_buffercache_os_pages</structname> View</title>
+
+  <para>
+   The definitions of the columns exposed by the view are shown in <xref linkend="pgbuffercache-os-pages-columns"/>.
+  </para>
+
+  <table id="pgbuffercache-os-pages-columns">
+   <title><structname>pg_buffercache_os_pages</structname> Columns</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>bufferid</structfield> <type>integer</type>
+      </para>
+      <para>
+       ID, in the range 1..<varname>shared_buffers</varname>
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>os_page_num</structfield> <type>bigint</type>
+      </para>
+      <para>
+       number of OS memory page for this buffer
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
+ </sect2>
+
  <sect2 id="pgbuffercache-pg-buffercache-numa">
   <title>The <structname>pg_buffercache_numa</structname> View</title>
 
@@ -550,6 +605,61 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
  public     | spgist_text_tbl        |     182
 (10 rows)
 
+regression=# SELECT pages_per_buffer, COUNT(*) as buffer_count
+             FROM (
+                   SELECT bufferid, COUNT(*) as pages_per_buffer
+                   FROM pg_buffercache_os_pages
+                   GROUP BY bufferid
+                  )
+             GROUP BY pages_per_buffer
+             ORDER BY pages_per_buffer;
+
+ pages_per_buffer | buffer_count
+------------------+--------------
+                1 |       261120
+                2 |         1024
+(2 rows)
+
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+             FROM pg_buffercache b JOIN pg_class c
+             ON b.relfilenode = pg_relation_filenode(c.oid) AND
+                b.reldatabase IN (0, (SELECT oid FROM pg_database
+                                      WHERE datname = current_database()))
+             JOIN pg_namespace n ON n.oid = c.relnamespace
+             JOIN (SELECT bufferid FROM pg_buffercache_os_pages
+                   GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+             GROUP BY n.nspname, c.relname
+             ORDER BY 3 DESC
+             LIMIT 10;
+
+  nspname   |           relname            | buffers_on_multiple_pages
+------------+------------------------------+---------------------------
+ public     | delete_test_table            |                         3
+ public     | gin_test_idx                 |                         2
+ pg_catalog | pg_depend                    |                         2
+ public     | quad_poly_tbl                |                         2
+ pg_catalog | pg_depend_reference_index    |                         1
+ pg_catalog | pg_index_indexrelid_index    |                         1
+ pg_catalog | pg_constraint_contypid_index |                         1
+ pg_catalog | pg_statistic                 |                         1
+ pg_catalog | pg_depend_depender_index     |                         1
+ pg_catalog | pg_operator                  |                         1
+(10 rows)
+
+
+  nspname   |             relname             | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public     | gin_test_tbl                    |                         4
+ public     | delete_test_table               |                         4
+ public     | tenk1                           |                         4
+ pg_catalog | pg_attribute_relid_attnum_index |                         4
+ pg_catalog | pg_class                        |                         2
+ pg_catalog | pg_depend_depender_index        |                         2
+ pg_catalog | pg_attribute                    |                         2
+ pg_catalog | pg_opfamily                     |                         2
+ pg_catalog | pg_opclass_oid_index            |                         2
+ pg_catalog | pg_description                  |                         2
+(10 rows)
 
 regression=# SELECT * FROM pg_buffercache_summary();
  buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 220e5a4f6b3..44c937fd08d 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -341,6 +341,8 @@ BufferAccessStrategy
 BufferAccessStrategyType
 BufferCacheNumaContext
 BufferCacheNumaRec
+BufferCacheOsPagesContext
+BufferCacheOsPagesRec
 BufferCachePagesContext
 BufferCachePagesRec
 BufferDesc
-- 
2.34.1

