From 3e79a2dfab3d578f3d0b16ea743e6a1bb501561a Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Wed, 2 Jul 2025 04:33:03 +0000
Subject: [PATCH v6 2/2] Add pg_buffercache_os_pages function and view

ba2a3c2302f added a way to check if a buffer is spread across multiple pages.

Adding those new function and view so that one does not need NUMA support
enabled to get this information.

In passing, let's capitalyze "number" in the pg_buffercache_numa documentation
definition.
---
 contrib/pg_buffercache/Makefile               |   2 +-
 .../expected/pg_buffercache.out               |  18 ++
 contrib/pg_buffercache/meson.build            |   1 +
 .../pg_buffercache--1.6--1.7.sql              |  21 ++
 contrib/pg_buffercache/pg_buffercache.control |   2 +-
 contrib/pg_buffercache/pg_buffercache_pages.c | 181 ++++++++++++++++++
 contrib/pg_buffercache/sql/pg_buffercache.sql |   8 +
 doc/src/sgml/pgbuffercache.sgml               | 112 ++++++++++-
 src/tools/pgindent/typedefs.list              |   2 +
 9 files changed, 344 insertions(+), 3 deletions(-)
   3.7% contrib/pg_buffercache/expected/
  58.3% contrib/pg_buffercache/
  37.5% doc/src/sgml/

diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index 5f748543e2e..0e618f66aec 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -9,7 +9,7 @@ EXTENSION = pg_buffercache
 DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
 	pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
 	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
-	pg_buffercache--1.5--1.6.sql
+	pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql
 PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
 
 REGRESS = pg_buffercache pg_buffercache_numa
diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index 9a9216dc7b1..26c2d5f5710 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -8,6 +8,16 @@ from pg_buffercache;
  t
 (1 row)
 
+-- For pg_buffercache_os_pages, we expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+                    from pg_settings
+                    where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+ ?column? 
+----------
+ t
+(1 row)
+
 select buffers_used + buffers_unused > 0,
         buffers_dirty <= buffers_used,
         buffers_pinned <= buffers_used
@@ -28,6 +38,8 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
 SET ROLE pg_database_owner;
 SELECT * FROM pg_buffercache;
 ERROR:  permission denied for view pg_buffercache
+SELECT * FROM pg_buffercache_os_pages;
+ERROR:  permission denied for view pg_buffercache_os_pages
 SELECT * FROM pg_buffercache_pages() AS p (wrong int);
 ERROR:  permission denied for function pg_buffercache_pages
 SELECT * FROM pg_buffercache_summary();
@@ -43,6 +55,12 @@ SELECT count(*) > 0 FROM pg_buffercache;
  t
 (1 row)
 
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ ?column? 
+----------
+ t
+(1 row)
+
 SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
  ?column? 
 ----------
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 7cd039a1df9..7c31141881f 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -24,6 +24,7 @@ install_data(
   'pg_buffercache--1.3--1.4.sql',
   'pg_buffercache--1.4--1.5.sql',
   'pg_buffercache--1.5--1.6.sql',
+  'pg_buffercache--1.6--1.7.sql',
   'pg_buffercache.control',
   kwargs: contrib_data_args,
 )
diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
new file mode 100644
index 00000000000..28caf24688f
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
@@ -0,0 +1,21 @@
+/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit
+
+-- Register the new function.
+CREATE FUNCTION pg_buffercache_os_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_os_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_os_pages AS
+    SELECT P.* FROM pg_buffercache_os_pages() AS P
+    (bufferid integer, os_page_num bigint);
+
+REVOKE ALL ON FUNCTION pg_buffercache_os_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_os_pages FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_os_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache_os_pages TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index b030ba3a6fa..11499550945 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
 # pg_buffercache extension
 comment = 'examine the shared buffer cache'
-default_version = '1.6'
+default_version = '1.7'
 module_pathname = '$libdir/pg_buffercache'
 relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 8ef13d74186..946b6ff4077 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -27,6 +27,7 @@
 #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
 
 #define NUM_BUFFERCACHE_NUMA_ELEM	3
+#define NUM_BUFFERCACHE_OS_PAGES_ELEM	2
 
 /*
  * Get the maximum buffer cache entries needed.
@@ -94,12 +95,30 @@ typedef struct
 	BufferCacheNumaRec *record;
 } BufferCacheNumaContext;
 
+/*
+ * Record structure holding the to be exposed cache data.
+ */
+typedef struct
+{
+	uint32		bufferid;
+	int64		page_num;
+} BufferCacheOsPagesRec;
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct
+{
+	TupleDesc	tupdesc;
+	BufferCacheOsPagesRec *record;
+} BufferCacheOsPagesContext;
 
 /*
  * Function returning data from the shared buffer cache - buffer number,
  * relation node/tablespace/database/blocknum and dirty indicator.
  */
 PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
 PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
 PG_FUNCTION_INFO_V1(pg_buffercache_summary);
 PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
@@ -139,6 +158,168 @@ get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
 	*endptr_buff = end_ptr;
 }
 
+/*
+ * Inquire about OS pages mappings for shared buffers.
+ *
+ * Returns each OS memory page used by the buffer. Buffers may
+ * be smaller or larger than OS memory pages. For each buffer we return one
+ * entry for each memory page used by the buffer (if the buffer is smaller,
+ * it only uses a part of one memory page).
+ *
+ * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
+ * one is always a multiple of the other.
+ */
+Datum
+pg_buffercache_os_pages(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+	MemoryContext oldcontext;
+	BufferCacheOsPagesContext *fctx;	/* User function context. */
+	TupleDesc	tupledesc;
+	TupleDesc	expected_tupledesc;
+	HeapTuple	tuple;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		int			i,
+					idx;
+		Size		os_page_size;
+		char	   *startptr;
+		int			max_entries;
+
+		/*
+		 * The database block size and OS memory page size are unlikely to be
+		 * the same. The block size is 1-32KB, the memory page size depends on
+		 * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
+		 * there are also features like THP etc. Moreover, we don't quite know
+		 * how the pages and buffers "align" in memory - the buffers may be
+		 * shifted in some way, using more memory pages than necessary.
+		 *
+		 * So we need to be careful about mapping buffers to memory pages. We
+		 * calculate the maximum number of pages a buffer might use, so that
+		 * we allocate enough space for the entries. And then we count the
+		 * actual number of entries as we scan the buffers.
+		 */
+		os_page_size = pg_get_shmem_pagesize();
+
+		/* Initialize the multi-call context, load entries about buffers */
+		funcctx = SRF_FIRSTCALL_INIT();
+
+		/* Switch context when allocating stuff to be used in later calls */
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		/* Create a user function context for cross-call persistence */
+		fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext));
+
+		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+			elog(ERROR, "return type must be a row type");
+
+		if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
+			elog(ERROR, "incorrect number of output arguments");
+
+		/* Construct a tuple descriptor for the result rows. */
+		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+						   INT4OID, -1, 0);
+
+		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
+						   INT8OID, -1, 0);
+
+		fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+		/*
+		 * Each buffer needs at least one entry, but it might be offset in
+		 * some way, and use one extra entry. So we allocate space for the
+		 * maximum number of entries we might need, and then count the exact
+		 * number as we're walking buffers. That way we can do it in one pass,
+		 * without reallocating memory.
+		 */
+		max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
+
+		/* Allocate NBuffers worth of BufferCacheOsPagesRec records. */
+		fctx->record = (BufferCacheOsPagesRec *)
+			MemoryContextAllocHuge(CurrentMemoryContext,
+								   sizeof(BufferCacheOsPagesRec) * max_entries);
+
+		/* Return to original context when allocating transient memory */
+		MemoryContextSwitchTo(oldcontext);
+
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+		idx = 0;
+
+		/*
+		 * Scan through all the buffers, saving the relevant fields in the
+		 * fctx->record structure.
+		 *
+		 * We don't hold the partition locks, so we don't get a consistent
+		 * snapshot across all buffers, but we do grab the buffer header
+		 * locks, so the information of each buffer is self-consistent.
+		 */
+		for (i = 0; i < NBuffers; i++)
+		{
+			char	   *buffptr = (char *) BufferGetBlock(i + 1);
+			BufferDesc *bufHdr;
+			uint32		buf_state;
+			uint32		bufferid;
+			int32		page_num;
+			char	   *startptr_buff,
+					   *endptr_buff;
+
+			bufHdr = GetBufferDescriptor(i);
+			/* Lock each buffer header before inspecting. */
+			buf_state = LockBufHdr(bufHdr);
+			bufferid = BufferDescriptorGetBuffer(bufHdr);
+			UnlockBufHdr(bufHdr, buf_state);
+
+			/* Get page boundaries for this buffer. */
+			get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+									   &startptr_buff, &endptr_buff, &page_num);
+
+			/* Add an entry for each OS page overlapping with this buffer. */
+			for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+			{
+				fctx->record[idx].bufferid = bufferid;
+				fctx->record[idx].page_num = page_num;
+				/* advance to the next entry/page */
+				++idx;
+				++page_num;
+			}
+		}
+
+		Assert(idx <= max_entries);
+
+		/* Set max calls and remember the user function context. */
+		funcctx->max_calls = idx;
+		funcctx->user_fctx = fctx;
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	/* Get the saved state */
+	fctx = funcctx->user_fctx;
+
+	if (funcctx->call_cntr < funcctx->max_calls)
+	{
+		uint32		i = funcctx->call_cntr;
+		Datum		values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+		bool		nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+
+		values[0] = Int32GetDatum(fctx->record[i].bufferid);
+		nulls[0] = false;
+		values[1] = Int64GetDatum(fctx->record[i].page_num);
+		nulls[1] = false;
+
+		/* Build and return the tuple. */
+		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+		result = HeapTupleGetDatum(tuple);
+
+		SRF_RETURN_NEXT(funcctx, result);
+	}
+	else
+		SRF_RETURN_DONE(funcctx);
+}
+
 Datum
 pg_buffercache_pages(PG_FUNCTION_ARGS)
 {
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 47cca1907c7..3c70ee9ef4a 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -5,6 +5,12 @@ select count(*) = (select setting::bigint
                    where name = 'shared_buffers')
 from pg_buffercache;
 
+-- For pg_buffercache_os_pages, we expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+                    from pg_settings
+                    where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+
 select buffers_used + buffers_unused > 0,
         buffers_dirty <= buffers_used,
         buffers_pinned <= buffers_used
@@ -16,6 +22,7 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
 -- having to create a dedicated user, use the pg_database_owner pseudo-role.
 SET ROLE pg_database_owner;
 SELECT * FROM pg_buffercache;
+SELECT * FROM pg_buffercache_os_pages;
 SELECT * FROM pg_buffercache_pages() AS p (wrong int);
 SELECT * FROM pg_buffercache_summary();
 SELECT * FROM pg_buffercache_usage_counts();
@@ -24,6 +31,7 @@ RESET role;
 -- Check that pg_monitor is allowed to query view / function
 SET ROLE pg_monitor;
 SELECT count(*) > 0 FROM pg_buffercache;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
 SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
 SELECT count(*) > 0 FROM pg_buffercache_usage_counts();
 RESET role;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index eeb85a0e049..72f179286d6 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -46,6 +46,8 @@
  <para>
   This module provides the <function>pg_buffercache_pages()</function>
   function (wrapped in the <structname>pg_buffercache</structname> view), the
+  <function>pg_buffercache_os_pages()</function> function (wrapped in the
+  <structname>pg_buffercache_os_pages</structname> view), the
   <function>pg_buffercache_numa_pages()</function> function (wrapped in the
   <structname>pg_buffercache_numa</structname> view), the
   <function>pg_buffercache_summary()</function> function, the
@@ -62,6 +64,13 @@
   convenient use.
  </para>
 
+ <para>
+  The <function>pg_buffercache_os_pages()</function> function provides OS
+  pages mappings for shared buffer entries. The
+  <structname>pg_buffercache_os_pages</structname> view wraps the function for
+  convenient use.
+ </para>
+
  <para>
   The <function>pg_buffercache_numa_pages()</function> function provides
   <acronym>NUMA</acronym> node mappings for shared buffer entries. This
@@ -242,6 +251,52 @@
   </para>
  </sect2>
 
+ <sect2 id="pgbuffercache-pg-buffercache-os-pages">
+  <title>The <structname>pg_buffercache_os_pages</structname> View</title>
+
+  <para>
+   The definitions of the columns exposed by the view are shown in <xref linkend="pgbuffercache-os-pages-columns"/>.
+  </para>
+
+  <table id="pgbuffercache-os-pages-columns">
+   <title><structname>pg_buffercache_os_pages</structname> Columns</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>bufferid</structfield> <type>integer</type>
+      </para>
+      <para>
+       ID, in the range 1..<varname>shared_buffers</varname>
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>os_page_num</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of OS memory page for this buffer
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
+ </sect2>
+
  <sect2 id="pgbuffercache-pg-buffercache-numa">
   <title>The <structname>pg_buffercache_numa</structname> View</title>
 
@@ -278,7 +333,7 @@
        <structfield>os_page_num</structfield> <type>bigint</type>
       </para>
       <para>
-       number of OS memory page for this buffer
+       Number of OS memory page for this buffer
       </para></entry>
      </row>
 
@@ -558,6 +613,61 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
  public     | spgist_text_tbl        |     182
 (10 rows)
 
+regression=# SELECT pages_per_buffer, COUNT(*) as buffer_count
+             FROM (
+                   SELECT bufferid, COUNT(*) as pages_per_buffer
+                   FROM pg_buffercache_os_pages
+                   GROUP BY bufferid
+                  )
+             GROUP BY pages_per_buffer
+             ORDER BY pages_per_buffer;
+
+ pages_per_buffer | buffer_count
+------------------+--------------
+                1 |       261120
+                2 |         1024
+(2 rows)
+
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+             FROM pg_buffercache b JOIN pg_class c
+             ON b.relfilenode = pg_relation_filenode(c.oid) AND
+                b.reldatabase IN (0, (SELECT oid FROM pg_database
+                                      WHERE datname = current_database()))
+             JOIN pg_namespace n ON n.oid = c.relnamespace
+             JOIN (SELECT bufferid FROM pg_buffercache_os_pages
+                   GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+             GROUP BY n.nspname, c.relname
+             ORDER BY 3 DESC
+             LIMIT 10;
+
+  nspname   |           relname            | buffers_on_multiple_pages
+------------+------------------------------+---------------------------
+ public     | delete_test_table            |                         3
+ public     | gin_test_idx                 |                         2
+ pg_catalog | pg_depend                    |                         2
+ public     | quad_poly_tbl                |                         2
+ pg_catalog | pg_depend_reference_index    |                         1
+ pg_catalog | pg_index_indexrelid_index    |                         1
+ pg_catalog | pg_constraint_contypid_index |                         1
+ pg_catalog | pg_statistic                 |                         1
+ pg_catalog | pg_depend_depender_index     |                         1
+ pg_catalog | pg_operator                  |                         1
+(10 rows)
+
+
+  nspname   |             relname             | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public     | gin_test_tbl                    |                         4
+ public     | delete_test_table               |                         4
+ public     | tenk1                           |                         4
+ pg_catalog | pg_attribute_relid_attnum_index |                         4
+ pg_catalog | pg_class                        |                         2
+ pg_catalog | pg_depend_depender_index        |                         2
+ pg_catalog | pg_attribute                    |                         2
+ pg_catalog | pg_opfamily                     |                         2
+ pg_catalog | pg_opclass_oid_index            |                         2
+ pg_catalog | pg_description                  |                         2
+(10 rows)
 
 regression=# SELECT * FROM pg_buffercache_summary();
  buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 4353befab99..da444c156e6 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -339,6 +339,8 @@ BufferAccessStrategy
 BufferAccessStrategyType
 BufferCacheNumaContext
 BufferCacheNumaRec
+BufferCacheOsPagesContext
+BufferCacheOsPagesRec
 BufferCachePagesContext
 BufferCachePagesRec
 BufferDesc
-- 
2.34.1

