From 79919341336748ae058702f375ace04eb1d2f74a Mon Sep 17 00:00:00 2001
From: Sami Imseih <samimseih@gmail.com>
Date: Wed, 24 Jun 2026 15:01:26 +0000
Subject: [PATCH v4 4/4] [WIP] pg_stat_statements: modernize entry storage with
 pgstat kind

---
 contrib/pg_stat_statements/expected/dml.out   |    6 +
 .../expected/oldextversions.out               |   67 +
 .../pg_stat_statements/expected/select.out    |   36 +
 contrib/pg_stat_statements/meson.build        |    4 +-
 .../pg_stat_statements--1.13--1.14.sql        |  103 +
 .../pg_stat_statements/pg_stat_statements.c   | 2769 +++++++----------
 .../pg_stat_statements.conf                   |    1 +
 .../pg_stat_statements.control                |    2 +-
 contrib/pg_stat_statements/sql/dml.sql        |    1 +
 .../pg_stat_statements/sql/oldextversions.sql |    5 +
 contrib/pg_stat_statements/sql/select.sql     |   12 +
 .../t/{010_restart.pl => 001_restart.pl}      |    0
 .../t/002_query_text_memory.pl                |  124 +
 doc/src/sgml/pgstatstatements.sgml            |   39 +-
 src/tools/pgindent/typedefs.list              |    3 +
 15 files changed, 1427 insertions(+), 1745 deletions(-)
 create mode 100644 contrib/pg_stat_statements/pg_stat_statements--1.13--1.14.sql
 rename contrib/pg_stat_statements/t/{010_restart.pl => 001_restart.pl} (100%)
 create mode 100644 contrib/pg_stat_statements/t/002_query_text_memory.pl

diff --git a/contrib/pg_stat_statements/expected/dml.out b/contrib/pg_stat_statements/expected/dml.out
index 347cb8699e4..a48bcc4450f 100644
--- a/contrib/pg_stat_statements/expected/dml.out
+++ b/contrib/pg_stat_statements/expected/dml.out
@@ -1,6 +1,12 @@
 --
 -- DMLs on test table
 --
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t 
+---
+ t
+(1 row)
+
 SET pg_stat_statements.track_utility = FALSE;
 CREATE TEMP TABLE pgss_dml_tab (a int, b char(20));
 INSERT INTO pgss_dml_tab VALUES(generate_series(1, 10), 'aaa');
diff --git a/contrib/pg_stat_statements/expected/oldextversions.out b/contrib/pg_stat_statements/expected/oldextversions.out
index 726383a99d7..5d65eb6b521 100644
--- a/contrib/pg_stat_statements/expected/oldextversions.out
+++ b/contrib/pg_stat_statements/expected/oldextversions.out
@@ -474,4 +474,71 @@ SELECT count(*) > 0 AS has_data FROM pg_stat_statements;
  t
 (1 row)
 
+-- Functions marked PARALLEL RESTRICTED in 1.14
+AlTER EXTENSION pg_stat_statements UPDATE TO '1.14';
+\d pg_stat_statements
+                            View "public.pg_stat_statements"
+           Column           |           Type           | Collation | Nullable | Default 
+----------------------------+--------------------------+-----------+----------+---------
+ userid                     | oid                      |           |          | 
+ dbid                       | oid                      |           |          | 
+ toplevel                   | boolean                  |           |          | 
+ queryid                    | bigint                   |           |          | 
+ query                      | text                     |           |          | 
+ plans                      | bigint                   |           |          | 
+ total_plan_time            | double precision         |           |          | 
+ min_plan_time              | double precision         |           |          | 
+ max_plan_time              | double precision         |           |          | 
+ mean_plan_time             | double precision         |           |          | 
+ stddev_plan_time           | double precision         |           |          | 
+ calls                      | bigint                   |           |          | 
+ total_exec_time            | double precision         |           |          | 
+ min_exec_time              | double precision         |           |          | 
+ max_exec_time              | double precision         |           |          | 
+ mean_exec_time             | double precision         |           |          | 
+ stddev_exec_time           | double precision         |           |          | 
+ rows                       | bigint                   |           |          | 
+ shared_blks_hit            | bigint                   |           |          | 
+ shared_blks_read           | bigint                   |           |          | 
+ shared_blks_dirtied        | bigint                   |           |          | 
+ shared_blks_written        | bigint                   |           |          | 
+ local_blks_hit             | bigint                   |           |          | 
+ local_blks_read            | bigint                   |           |          | 
+ local_blks_dirtied         | bigint                   |           |          | 
+ local_blks_written         | bigint                   |           |          | 
+ temp_blks_read             | bigint                   |           |          | 
+ temp_blks_written          | bigint                   |           |          | 
+ shared_blk_read_time       | double precision         |           |          | 
+ shared_blk_write_time      | double precision         |           |          | 
+ local_blk_read_time        | double precision         |           |          | 
+ local_blk_write_time       | double precision         |           |          | 
+ temp_blk_read_time         | double precision         |           |          | 
+ temp_blk_write_time        | double precision         |           |          | 
+ wal_records                | bigint                   |           |          | 
+ wal_fpi                    | bigint                   |           |          | 
+ wal_bytes                  | numeric                  |           |          | 
+ wal_buffers_full           | bigint                   |           |          | 
+ jit_functions              | bigint                   |           |          | 
+ jit_generation_time        | double precision         |           |          | 
+ jit_inlining_count         | bigint                   |           |          | 
+ jit_inlining_time          | double precision         |           |          | 
+ jit_optimization_count     | bigint                   |           |          | 
+ jit_optimization_time      | double precision         |           |          | 
+ jit_emission_count         | bigint                   |           |          | 
+ jit_emission_time          | double precision         |           |          | 
+ jit_deform_count           | bigint                   |           |          | 
+ jit_deform_time            | double precision         |           |          | 
+ parallel_workers_to_launch | bigint                   |           |          | 
+ parallel_workers_launched  | bigint                   |           |          | 
+ generic_plan_calls         | bigint                   |           |          | 
+ custom_plan_calls          | bigint                   |           |          | 
+ stats_since                | timestamp with time zone |           |          | 
+ minmax_stats_since         | timestamp with time zone |           |          | 
+
+SELECT count(*) > 0 AS has_data FROM pg_stat_statements;
+ has_data 
+----------
+ t
+(1 row)
+
 DROP EXTENSION pg_stat_statements;
diff --git a/contrib/pg_stat_statements/expected/select.out b/contrib/pg_stat_statements/expected/select.out
index a069119c790..6abef76968c 100644
--- a/contrib/pg_stat_statements/expected/select.out
+++ b/contrib/pg_stat_statements/expected/select.out
@@ -889,3 +889,39 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t;
  t
 (1 row)
 
+--
+-- reset within a transaction: entries with unflushed pending data should
+-- still be removed and not reappear in the view
+--
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t 
+---
+ t
+(1 row)
+
+BEGIN;
+SELECT 1 AS "RESET_TXN_TEST";
+ RESET_TXN_TEST 
+----------------
+              1
+(1 row)
+
+SELECT count(*) FROM pg_stat_statements WHERE query LIKE '%RESET_TXN_TEST%';
+ count 
+-------
+     1
+(1 row)
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT count(*) FROM pg_stat_statements WHERE query LIKE '%RESET_TXN_TEST%';
+ count 
+-------
+     0
+(1 row)
+
+COMMIT;
diff --git a/contrib/pg_stat_statements/meson.build b/contrib/pg_stat_statements/meson.build
index 9d78cb88b7d..a3920669541 100644
--- a/contrib/pg_stat_statements/meson.build
+++ b/contrib/pg_stat_statements/meson.build
@@ -21,6 +21,7 @@ contrib_targets += pg_stat_statements
 install_data(
   'pg_stat_statements.control',
   'pg_stat_statements--1.4.sql',
+  'pg_stat_statements--1.13--1.14.sql',
   'pg_stat_statements--1.12--1.13.sql',
   'pg_stat_statements--1.11--1.12.sql',
   'pg_stat_statements--1.10--1.11.sql',
@@ -69,7 +70,8 @@ tests += {
   },
   'tap': {
     'tests': [
-      't/010_restart.pl',
+      't/001_restart.pl',
+      't/002_query_text_memory.pl',
     ],
   },
 }
diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.13--1.14.sql b/contrib/pg_stat_statements/pg_stat_statements--1.13--1.14.sql
new file mode 100644
index 00000000000..22c799007c3
--- /dev/null
+++ b/contrib/pg_stat_statements/pg_stat_statements--1.13--1.14.sql
@@ -0,0 +1,103 @@
+/* contrib/pg_stat_statements/pg_stat_statements--1.13--1.14.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_stat_statements UPDATE TO '1.14'" to load this file. \quit
+
+/* First we have to remove them from the extension */
+ALTER EXTENSION pg_stat_statements DROP VIEW pg_stat_statements;
+ALTER EXTENSION pg_stat_statements DROP FUNCTION pg_stat_statements(boolean);
+
+/* Then we can drop them */
+DROP VIEW pg_stat_statements;
+DROP FUNCTION pg_stat_statements(boolean);
+
+/* Now redefine with PARALLEL RESTRICTED */
+CREATE FUNCTION pg_stat_statements(IN showtext boolean,
+    OUT userid oid,
+    OUT dbid oid,
+    OUT toplevel bool,
+    OUT queryid bigint,
+    OUT query text,
+    OUT plans int8,
+    OUT total_plan_time float8,
+    OUT min_plan_time float8,
+    OUT max_plan_time float8,
+    OUT mean_plan_time float8,
+    OUT stddev_plan_time float8,
+    OUT calls int8,
+    OUT total_exec_time float8,
+    OUT min_exec_time float8,
+    OUT max_exec_time float8,
+    OUT mean_exec_time float8,
+    OUT stddev_exec_time float8,
+    OUT rows int8,
+    OUT shared_blks_hit int8,
+    OUT shared_blks_read int8,
+    OUT shared_blks_dirtied int8,
+    OUT shared_blks_written int8,
+    OUT local_blks_hit int8,
+    OUT local_blks_read int8,
+    OUT local_blks_dirtied int8,
+    OUT local_blks_written int8,
+    OUT temp_blks_read int8,
+    OUT temp_blks_written int8,
+    OUT shared_blk_read_time float8,
+    OUT shared_blk_write_time float8,
+    OUT local_blk_read_time float8,
+    OUT local_blk_write_time float8,
+    OUT temp_blk_read_time float8,
+    OUT temp_blk_write_time float8,
+    OUT wal_records int8,
+    OUT wal_fpi int8,
+    OUT wal_bytes numeric,
+    OUT wal_buffers_full int8,
+    OUT jit_functions int8,
+    OUT jit_generation_time float8,
+    OUT jit_inlining_count int8,
+    OUT jit_inlining_time float8,
+    OUT jit_optimization_count int8,
+    OUT jit_optimization_time float8,
+    OUT jit_emission_count int8,
+    OUT jit_emission_time float8,
+    OUT jit_deform_count int8,
+    OUT jit_deform_time float8,
+    OUT parallel_workers_to_launch int8,
+    OUT parallel_workers_launched int8,
+    OUT generic_plan_calls int8,
+    OUT custom_plan_calls int8,
+    OUT stats_since timestamp with time zone,
+    OUT minmax_stats_since timestamp with time zone
+)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pg_stat_statements_1_13'
+LANGUAGE C STRICT VOLATILE PARALLEL RESTRICTED;
+
+CREATE VIEW pg_stat_statements AS
+  SELECT * FROM pg_stat_statements(true);
+
+GRANT SELECT ON pg_stat_statements TO PUBLIC;
+
+/* Mark reset functions as PARALLEL RESTRICTED */
+ALTER FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) PARALLEL RESTRICTED;
+
+/* Recreate pg_stat_statements_info with new columns */
+ALTER EXTENSION pg_stat_statements DROP VIEW pg_stat_statements_info;
+ALTER EXTENSION pg_stat_statements DROP FUNCTION pg_stat_statements_info();
+DROP VIEW pg_stat_statements_info;
+DROP FUNCTION pg_stat_statements_info();
+
+CREATE FUNCTION pg_stat_statements_info(
+    OUT dealloc bigint,
+    OUT stats_reset timestamp with time zone,
+    OUT skipped_entries bigint,
+    OUT query_text_size bigint,
+    OUT num_entries bigint
+)
+RETURNS record
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT VOLATILE PARALLEL SAFE;
+
+CREATE VIEW pg_stat_statements_info AS
+  SELECT * FROM pg_stat_statements_info();
+
+GRANT SELECT ON pg_stat_statements_info TO PUBLIC;
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 92315627916..b9140212501 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -5,34 +5,61 @@
  *		usage across a whole database cluster.
  *
  * Execution costs are totaled for each distinct source query, and kept in
- * a shared hashtable.  (We track only as many distinct queries as will fit
- * in the designated amount of shared memory.)
+ * shared memory as a custom pgstat kind (own_hash=true), giving this module
+ * its own dedicated dshash table.  Query text is stored in a separate DSA
+ * area (via DSM registry).
  *
- * Starting in Postgres 9.2, this module normalized query entries.  As of
- * Postgres 14, the normalization is done by the core if compute_query_id is
- * enabled, or optionally by third-party modules.
+ * Normalization of query constants is performed by the core query jumble
+ * machinery (compute_query_id).  This module stores a "representative"
+ * query string with constants replaced by parameter symbols ($n).
  *
- * To facilitate presenting entries to users, we create "representative" query
- * strings in which constants are replaced with parameter symbols ($n), to
- * make it clearer what a normalized entry can represent.  To save on shared
- * memory, and to avoid having to truncate oversized query strings, we store
- * these strings in a temporary external query-texts file.  Offsets into this
- * file are kept in shared memory.
+ * Entry metadata (key, query text pointer, usage counter, timestamps) and
+ * counters live together in the PgStatShared_Pgss body.  Backends accumulate
+ * stats locally via the pending/flush infrastructure, flushing to shared
+ * memory without contending on entry-level locks on the hot path.  All
+ * locking (per-entry LWLock, dshash partition locks) is provided by the
+ * core pgstat infrastructure.
  *
- * Note about locking issues: to create or delete an entry in the shared
- * hashtable, one must hold pgss->lock exclusively.  Modifying any field
- * in an entry except the counters requires the same.  To look up an entry,
- * one must hold the lock shared.  To read or update the counters within
- * an entry, one must hold the lock shared or exclusive (so the entry doesn't
- * disappear!) and also take the entry's mutex spinlock.
- * The shared state variable pgss->extent (the next free spot in the external
- * query-text file) should be accessed only while holding either the
- * pgss->mutex spinlock, or exclusive lock on pgss->lock.  We use the mutex to
- * allow reserving file space while holding only shared lock on pgss->lock.
- * Rewriting the entire external query-text file, eg for garbage collection,
- * requires holding pgss->lock exclusively; this allows individual entries
- * in the file to be read or written while holding only shared lock.
+ * Locking strategy for entry creation:
  *
+ * The hot path (existing entries) is lock-free: the backend holds a
+ * cached local reference to the entry (or acquires one via a dshash
+ * lookup on first access), then pending stats are accumulated in
+ * backend-local memory without any external lock.
+ *
+ * New entries require an exclusive LWLock (pgss_shared->lock) to serialize
+ * creation against concurrent eviction.  This lock is only taken when the
+ * entry does not already exist (cache miss), which is rare in steady state.
+ *
+ * Eviction:
+ *
+ * When the live entry count reaches pgss_max, eviction is triggered
+ * under the exclusive lock.  At most one eviction pass runs per
+ * PGSS_EVICT_INTERVAL_MS (globally across all backends) to prevent
+ * lock convoy effects under
+ * high-churn workloads.  Between passes, backends that would need to
+ * create a new entry simply skip recording.
+ *
+ * Each eviction pass scans all entries, sorts by last_access timestamp
+ * (LRU, least recently used first), and drops the bottom 1/N (where
+ * N = PGSS_EVICT_RATIO).  A grace period (PGSS_EVICT_GRACE_MS)
+ * protects entries accessed recently from eviction, even if they rank among
+ * the coldest.  This prevents a churn cycle where the same active
+ * entries are evicted and immediately re-created, consuming eviction
+ * capacity without admitting genuinely new queries.  Only entries that
+ * are truly stale (not accessed recently) are evicted, which means they
+ * are unlikely to be pinned by backends and can be freed immediately.
+ *
+ * Eviction decisions use a live entry count (incremented on creation,
+ * decremented on eviction) rather than the physical dshash size, which
+ * includes dropped-but-pinned entries awaiting GC.  This ensures
+ * headroom is available immediately after eviction without waiting for
+ * backends to release their refs.
+ *
+ * To avoid runaway growth of the dshash when long-running transactions
+ * hold refs and prevent GC from freeing evicted entries, creation is
+ * skipped unconditionally once the physical entry count reaches 2x
+ * pgss_max.
  *
  * Copyright (c) 2008-2026, PostgreSQL Global Development Group
  *
@@ -44,30 +71,30 @@
 #include "postgres.h"
 
 #include <math.h>
-#include <sys/stat.h>
-#include <unistd.h>
 
 #include "access/htup_details.h"
 #include "access/parallel.h"
 #include "catalog/pg_authid.h"
+#include "common/hashfn.h"
 #include "executor/instrument.h"
 #include "funcapi.h"
 #include "jit/jit.h"
+#include "lib/dshash.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
+#include "pgstat.h"
 #include "nodes/queryjumble.h"
 #include "optimizer/planner.h"
 #include "parser/analyze.h"
-#include "pgstat.h"
-#include "storage/fd.h"
-#include "storage/ipc.h"
-#include "storage/lwlock.h"
-#include "storage/shmem.h"
-#include "storage/spin.h"
+#include "storage/dsm_registry.h"
 #include "tcop/utility.h"
+#include "access/xact.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
-#include "utils/memutils.h"
+#include "utils/dsa.h"
+#include "utils/guc.h"
+#include "utils/numeric.h"
+#include "utils/pgstat_internal.h"
 #include "utils/timestamp.h"
 #include "utils/tuplestore.h"
 
@@ -76,29 +103,26 @@ PG_MODULE_MAGIC_EXT(
 					.version = PG_VERSION
 );
 
-/* Location of permanent stats file (valid when database is shut down) */
-#define PGSS_DUMP_FILE	PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_stat_statements.stat"
+/* Custom pgstat kind ID */
+#define PGSTAT_KIND_PGSS	25
+
+/* Fraction of pgss_max to evict per pass (1/N) */
+#define PGSS_EVICT_RATIO		10
 
 /*
- * Location of external query text file.
+ * Minimum interval (ms) between eviction passes.  Limits lock hold time
+ * under high-churn workloads to at most one scan+sort per this interval.
  */
-#define PGSS_TEXT_FILE	PG_STAT_TMP_DIR "/pgss_query_texts.stat"
-
-/* Magic number identifying the stats file format */
-static const uint32 PGSS_FILE_HEADER = 0x20250731;
+#define PGSS_EVICT_INTERVAL_MS	1000
 
-/* PostgreSQL major version number, changes in which invalidate all entries */
-static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
-
-/* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
-#define USAGE_EXEC(duration)	(1.0)
-#define USAGE_INIT				(1.0)	/* including initial planning */
-#define ASSUMED_MEDIAN_INIT		(10.0)	/* initial assumed median usage */
-#define ASSUMED_LENGTH_INIT		1024	/* initial assumed mean query length */
-#define USAGE_DECREASE_FACTOR	(0.99)	/* decreased every entry_dealloc */
-#define STICKY_DECREASE_FACTOR	(0.50)	/* factor for sticky entries */
-#define USAGE_DEALLOC_PERCENT	5	/* free this % of entries at once */
-#define IS_STICKY(c)	((c.calls[PGSS_PLAN] + c.calls[PGSS_EXEC]) == 0)
+/*
+ * Grace period (ms) protecting recently-accessed entries from eviction.
+ * Entries whose last_access is within this window are skipped even if they
+ * rank among the coldest.  Without this, entries that are still being
+ * executed get evicted and immediately re-created, wasting eviction
+ * capacity without making room for genuinely new queries.
+ */
+#define PGSS_EVICT_GRACE_MS		10000
 
 /*
  * Extension version number, for supporting older extension versions' objects
@@ -116,7 +140,6 @@ typedef enum pgssVersion
 	PGSS_V1_12,
 	PGSS_V1_13,
 } pgssVersion;
-
 typedef enum pgssStoreKind
 {
 	PGSS_INVALID = -1,
@@ -133,12 +156,9 @@ typedef enum pgssStoreKind
 #define PGSS_NUMKIND (PGSS_EXEC + 1)
 
 /*
- * Hashtable key that defines the identity of a hashtable entry.  We separate
- * queries by user and by database even if they are otherwise identical.
- *
- * If you add a new key to this struct, make sure to teach pgss_store() to
- * zero the padding bytes.  Otherwise, things will break, because pgss_hash is
- * created using HASH_BLOBS, and thus tag_hash is used to hash this.
+ * Hashtable key that defines the identity of a tracked statement.
+ * We separate queries by user and by database even if they are otherwise
+ * identical.
  */
 typedef struct pgssHashKey
 {
@@ -149,9 +169,9 @@ typedef struct pgssHashKey
 } pgssHashKey;
 
 /*
- * The actual stats counters kept within pgssEntry.
+ * The actual stats counters kept within the custom pgstat kind.
  */
-typedef struct Counters
+typedef struct pgssCounters
 {
 	int64		calls[PGSS_NUMKIND];	/* # of times planned/executed */
 	double		total_time[PGSS_NUMKIND];	/* total planning/execution time,
@@ -212,65 +232,71 @@ typedef struct Counters
 											 * launched */
 	int64		generic_plan_calls; /* number of calls using a generic plan */
 	int64		custom_plan_calls;	/* number of calls using a custom plan */
-} Counters;
-
-/*
- * Global statistics for pg_stat_statements
- */
-typedef struct pgssGlobalStats
-{
-	int64		dealloc;		/* # of times entries were deallocated */
-	TimestampTz stats_reset;	/* timestamp with all stats reset */
-} pgssGlobalStats;
+} pgssCounters;
 
 /*
- * Statistics per statement
- *
- * Note: in event of a failure in garbage collection of the query text file,
- * we reset query_offset to zero and query_len to -1.  This will be seen as
- * an invalid state by qtext_fetch().
+ * Shared pgstat entry  - stored in the per-kind dshash (own_hash).
+ * Contains both the stats counters and the entry metadata that was
+ * previously stored in a separate dshash registry.
  */
-typedef struct pgssEntry
+typedef struct PgStatShared_Pgss
 {
-	pgssHashKey key;			/* hash key of entry - MUST BE FIRST */
-	Counters	counters;		/* the statistics for this query */
-	Size		query_offset;	/* query text offset in external file */
+	PgStatShared_Common header;
+	pgssHashKey key;
+	dsa_pointer query_text;		/* DSA pointer to query text */
 	int			query_len;		/* # of valid bytes in query string, or -1 */
 	int			encoding;		/* query text encoding */
 	TimestampTz stats_since;	/* timestamp of entry allocation */
 	TimestampTz minmax_stats_since; /* timestamp of last min/max values reset */
-	slock_t		mutex;			/* protects the counters only */
-} pgssEntry;
+	pg_atomic_uint32 usage;		/* hotness: decremented by sweep, incremented
+								 * on access */
+	pg_atomic_uint64 last_access;	/* statement start timestamp of last
+									 * access, for LRU eviction */
+	pgssCounters counters;
+} PgStatShared_Pgss;
 
 /*
- * Global shared state
+ * Global shared state stored in DSM segment
  */
 typedef struct pgssSharedState
 {
-	LWLockPadded lock;			/* protects hashtable search/modification */
-	double		cur_median_usage;	/* current median usage in hashtable */
-	Size		mean_query_len; /* current mean entry text length */
-	slock_t		mutex;			/* protects following fields only: */
-	Size		extent;			/* current extent of query file */
-	int			n_writers;		/* number of active writers to query file */
-	int			gc_count;		/* query file garbage collection cycle count */
-	pgssGlobalStats stats;		/* global statistics for pgss */
-} pgssSharedState;
+	LWLock		lock;			/* protects entry creation and eviction */
+	pg_atomic_uint64 dealloc;	/* total # of entries evicted */
+	pg_atomic_uint64 skipped_entries;	/* # of entries skipped due to
+										 * throttle */
+	pg_atomic_uint64 stats_reset;	/* timestamp with all stats reset */
+
+	/*
+	 * Live entry count: tracks non-dropped entries.  We need this separate
+	 * from pgstat_get_entry_count() because the core counter includes
+	 * dropped-but-pinned entries still held by backend references.
+	 */
+	pg_atomic_uint64 live_entries;
 
-/* Links to shared memory state */
-static pgssSharedState *pgss;
-static HTAB *pgss_hash;
+	/* Timestamp of last eviction pass; limits to once per second globally */
+	pg_atomic_uint64 last_eviction;
+} pgssSharedState;
 
-static void pgss_shmem_request(void *arg);
-static void pgss_shmem_init(void *arg);
+/* Backend-local pending entry */
+typedef struct PgStat_PgssPending
+{
+	pgssHashKey key;
+	bool		last_access_updated;
+	pgssCounters counters;
+} PgStat_PgssPending;
 
-static const ShmemCallbacks pgss_shmem_callbacks = {
-	.request_fn = pgss_shmem_request,
-	.init_fn = pgss_shmem_init,
-};
 
 /*---- Local variables ----*/
 
+/* Global shared state */
+static pgssSharedState *pgss_shared = NULL;
+
+/* Query text dsa area */
+static dsa_area *pgss_qtext_dsa = NULL;
+
+/* Per-kind pgstat dshash for this kind */
+static dshash_table *pgss_hash = NULL;
+
 /* Current nesting depth of planner/ExecutorRun/ProcessUtility calls */
 static int	nesting_level = 0;
 
@@ -305,20 +331,17 @@ static int	pgss_track = PGSS_TRACK_TOP;	/* tracking level */
 static bool pgss_track_utility = true;	/* whether to track utility commands */
 static bool pgss_track_planning = false;	/* whether to track planning
 											 * duration */
-static bool pgss_save = true;	/* whether to save stats across shutdown */
+static bool pgss_save = true;	/* whether to save stats across shutdown XXX:
+								 * this does not prevent stats being saved to
+								 * the core stats file, should it? */
+static int	pgss_query_text_memory = 4096;	/* in KB XXX: Should default be
+											 * lower? */
 
 #define pgss_enabled(level) \
 	(!IsParallelWorker() && \
 	(pgss_track == PGSS_TRACK_ALL || \
 	(pgss_track == PGSS_TRACK_TOP && (level) == 0)))
 
-#define record_gc_qtexts() \
-	do { \
-		SpinLockAcquire(&pgss->mutex); \
-		pgss->gc_count++; \
-		SpinLockRelease(&pgss->mutex); \
-	} while(0)
-
 /*---- Function declarations ----*/
 
 PG_FUNCTION_INFO_V1(pg_stat_statements_reset);
@@ -335,7 +358,6 @@ PG_FUNCTION_INFO_V1(pg_stat_statements_1_13);
 PG_FUNCTION_INFO_V1(pg_stat_statements);
 PG_FUNCTION_INFO_V1(pg_stat_statements_info);
 
-static void pgss_shmem_shutdown(int code, Datum arg);
 static void pgss_post_parse_analyze(ParseState *pstate, Query *query,
 									const JumbleState *jstate);
 static PlannedStmt *pgss_planner(Query *parse,
@@ -365,24 +387,102 @@ static void pgss_store(const char *query, int64 queryId,
 					   int parallel_workers_to_launch,
 					   int parallel_workers_launched,
 					   PlannedStmtOrigin planOrigin);
-static void pg_stat_statements_internal(FunctionCallInfo fcinfo,
-										pgssVersion api_version,
-										bool showtext);
-static pgssEntry *entry_alloc(pgssHashKey *key, Size query_offset, int query_len,
-							  int encoding, bool sticky);
-static void entry_dealloc(void);
-static bool qtext_store(const char *query, int query_len,
-						Size *query_offset, int *gc_count);
-static char *qtext_load_file(Size *buffer_size);
-static char *qtext_fetch(Size query_offset, int query_len,
-						 char *buffer, Size buffer_size);
-static bool need_gc_qtexts(void);
-static void gc_qtexts(void);
+static void pgss_assign_query_text_memory(int newval, void *extra);
+static void pgss_attach_shmem(void);
 static TimestampTz entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only);
+static inline uint64 pgss_hash_key(pgssHashKey *key);
 static char *generate_normalized_query(const JumbleState *jstate,
 									   const char *query,
 									   int query_loc, int *query_len_p);
 
+static void pg_stat_statements_internal(FunctionCallInfo fcinfo,
+										pgssVersion api_version,
+										bool showtext);
+
+static bool pgss_flush_pending_cb(PgStat_EntryRef *entry_ref, bool nowait);
+static void pgss_to_serialized_data(const PgStat_HashKey *key,
+									const PgStatShared_Common *header,
+									FILE *statfile);
+static bool pgss_from_serialized_data(const PgStat_HashKey *key,
+									  PgStatShared_Common *header,
+									  FILE *statfile);
+
+/*--------------------------------------------------------------------------
+ * Custom pgstat kind definition
+ *--------------------------------------------------------------------------
+ */
+
+static const PgStat_KindInfo pgss_kind_info = {
+	.name = "pg_stat_statements",
+	.fixed_amount = false,
+	.write_to_file = true,
+	.track_entry_count = true,
+	.accessed_across_databases = true,
+	.own_hash = true,
+	.shared_size = sizeof(PgStatShared_Pgss),
+	.shared_data_off = offsetof(PgStatShared_Pgss, counters),
+	.shared_data_len = sizeof(pgssCounters),
+	.pending_size = sizeof(PgStat_PgssPending),
+	.init_backend_cb = pgss_attach_shmem,
+	.flush_pending_cb = pgss_flush_pending_cb,
+	.to_serialized_data = pgss_to_serialized_data,
+	.from_serialized_data = pgss_from_serialized_data,
+};
+
+static inline uint64
+pgss_hash_key(pgssHashKey *key)
+{
+	return hash_bytes_extended((const unsigned char *) key,
+							   sizeof(pgssHashKey), 0);
+}
+
+static void
+pgss_init_shmem(void *ptr, void *arg)
+{
+	pgssSharedState *state = (pgssSharedState *) ptr;
+
+	LWLockInitialize(&state->lock, LWLockNewTrancheId("pg_stat_statements"));
+	pg_atomic_init_u64(&state->dealloc, 0);
+	pg_atomic_init_u64(&state->skipped_entries, 0);
+	pg_atomic_init_u64(&state->stats_reset, (uint64) GetCurrentTimestamp());
+	pg_atomic_init_u64(&state->live_entries, 0);
+	pg_atomic_init_u64(&state->last_eviction, 0);
+}
+
+static void
+pgss_assign_query_text_memory(int newval, void *extra)
+{
+	if (pgss_qtext_dsa)
+		dsa_set_size_limit(pgss_qtext_dsa, (size_t) newval * 1024);
+}
+
+static void
+pgss_attach_shmem(void)
+{
+	bool		found;
+
+	if (pgss_shared != NULL)
+		return;
+	if (!pgstat_get_kind_info(PGSTAT_KIND_PGSS))
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
+
+	pgss_shared = GetNamedDSMSegment("pg_stat_statements_state",
+									 sizeof(pgssSharedState),
+									 pgss_init_shmem,
+									 &found, NULL);
+
+	if (pgss_qtext_dsa == NULL)
+	{
+		pgss_qtext_dsa = GetNamedDSA("pg_stat_statements_qtext", &found);
+		dsa_set_size_limit(pgss_qtext_dsa, (size_t) pgss_query_text_memory * 1024);
+	}
+
+	if (pgss_hash == NULL)
+		pgss_hash = pgstat_get_hash_for_kind(PGSTAT_KIND_PGSS);
+}
+
 /*
  * Module load callback
  */
@@ -390,7 +490,7 @@ void
 _PG_init(void)
 {
 	/*
-	 * In order to create our shared memory area, we have to be loaded via
+	 * In order to register our custom pgstat kind, we have to be loaded via
 	 * shared_preload_libraries.  If not, fall out without hooking into any of
 	 * the main system.  (We don't throw error here because it seems useful to
 	 * allow the pg_stat_statements functions to be created even when the
@@ -406,9 +506,10 @@ _PG_init(void)
 	 */
 	EnableQueryId();
 
-	/*
-	 * Define (or redefine) custom GUC variables.
-	 */
+	/* Register custom pgstat kind */
+	pgstat_register_kind(PGSTAT_KIND_PGSS, &pgss_kind_info);
+
+	/* Define GUCs */
 	DefineCustomIntVariable("pg_stat_statements.max",
 							"Sets the maximum number of statements tracked by pg_stat_statements.",
 							NULL,
@@ -416,7 +517,7 @@ _PG_init(void)
 							5000,
 							100,
 							INT_MAX / 2,
-							PGC_POSTMASTER,
+							PGC_SIGHUP,
 							0,
 							NULL,
 							NULL,
@@ -467,12 +568,20 @@ _PG_init(void)
 							 NULL,
 							 NULL);
 
-	MarkGUCPrefixReserved("pg_stat_statements");
+	DefineCustomIntVariable("pg_stat_statements.query_text_memory",
+							"Sets the memory limit for query text storage.",
+							NULL,
+							&pgss_query_text_memory,
+							4096,
+							256,
+							MAX_KILOBYTES,
+							PGC_SIGHUP,
+							GUC_UNIT_KB,
+							NULL,
+							pgss_assign_query_text_memory,
+							NULL);
 
-	/*
-	 * Register our shared memory needs.
-	 */
-	RegisterShmemCallbacks(&pgss_shmem_callbacks);
+	MarkGUCPrefixReserved("pg_stat_statements");
 
 	/*
 	 * Install hooks.
@@ -493,399 +602,689 @@ _PG_init(void)
 	ProcessUtility_hook = pgss_ProcessUtility;
 }
 
-/*
- * shmem request callback: Request shared memory resources.
- *
- * This is called at postmaster startup.  Note that the shared memory isn't
- * allocated here yet, this merely register our needs.
- *
- * In EXEC_BACKEND mode, this is also called in each backend, to re-attach to
- * the shared memory area that was already initialized.
+/*--------------------------------------------------------------------------
+ * pgstat flush helpers: merge per-kind timing counters into shared memory
+ *--------------------------------------------------------------------------
  */
 static void
-pgss_shmem_request(void *arg)
+pgss_flush_kind(pgssCounters *shared, pgssCounters *pending, pgssStoreKind kind)
+{
+	int64		n_a,
+				n_b;
+	double		delta;
+
+	n_a = shared->calls[kind];
+	n_b = pending->calls[kind];
+
+	shared->calls[kind] += n_b;
+	shared->total_time[kind] += pending->total_time[kind];
+
+	if (n_a == 0)
+	{
+		shared->min_time[kind] = pending->min_time[kind];
+		shared->max_time[kind] = pending->max_time[kind];
+		shared->mean_time[kind] = pending->mean_time[kind];
+		shared->sum_var_time[kind] = pending->sum_var_time[kind];
+	}
+	else
+	{
+		if (pending->min_time[kind] < shared->min_time[kind])
+			shared->min_time[kind] = pending->min_time[kind];
+		if (pending->max_time[kind] > shared->max_time[kind])
+			shared->max_time[kind] = pending->max_time[kind];
+
+		/*
+		 * Chan's parallel variance algorithm: combine two sets of (count,
+		 * mean, sum_of_squared_deviations). See
+		 * <http://www.johndcook.com/blog/standard_deviation/>
+		 */
+		delta = pending->mean_time[kind] - shared->mean_time[kind];
+		shared->sum_var_time[kind] +=
+			pending->sum_var_time[kind] +
+			delta * delta * (double) n_a * (double) n_b / (double) (n_a + n_b);
+		shared->mean_time[kind] =
+			shared->total_time[kind] / shared->calls[kind];
+	}
+}
+
+static bool
+pgss_flush_pending_cb(PgStat_EntryRef *entry_ref, bool nowait)
 {
-	ShmemRequestHash(.name = "pg_stat_statements hash",
-					 .nelems = pgss_max,
-					 .hash_info.keysize = sizeof(pgssHashKey),
-					 .hash_info.entrysize = sizeof(pgssEntry),
-					 .hash_flags = HASH_ELEM | HASH_BLOBS,
-					 .ptr = &pgss_hash,
-		);
-	ShmemRequestStruct(.name = "pg_stat_statements",
-					   .size = sizeof(pgssSharedState),
-					   .ptr = (void **) &pgss,
-		);
+	PgStat_PgssPending *pending;
+	PgStatShared_Pgss *shared;
+
+	pending = (PgStat_PgssPending *) entry_ref->pending;
+	shared = (PgStatShared_Pgss *) entry_ref->shared_stats;
+
+	if (!pgstat_lock_entry(entry_ref, nowait))
+		return false;
+
+	shared->key = pending->key;
+
+	pgss_flush_kind(&shared->counters, &pending->counters, PGSS_EXEC);
+
+	if (pgss_track_planning && pending->counters.calls[PGSS_PLAN] > 0)
+		pgss_flush_kind(&shared->counters, &pending->counters, PGSS_PLAN);
+
+	shared->counters.rows += pending->counters.rows;
+	shared->counters.shared_blks_hit += pending->counters.shared_blks_hit;
+	shared->counters.shared_blks_read += pending->counters.shared_blks_read;
+	shared->counters.shared_blks_dirtied += pending->counters.shared_blks_dirtied;
+	shared->counters.shared_blks_written += pending->counters.shared_blks_written;
+	shared->counters.local_blks_hit += pending->counters.local_blks_hit;
+	shared->counters.local_blks_read += pending->counters.local_blks_read;
+	shared->counters.local_blks_dirtied += pending->counters.local_blks_dirtied;
+	shared->counters.local_blks_written += pending->counters.local_blks_written;
+	shared->counters.temp_blks_read += pending->counters.temp_blks_read;
+	shared->counters.temp_blks_written += pending->counters.temp_blks_written;
+	shared->counters.shared_blk_read_time += pending->counters.shared_blk_read_time;
+	shared->counters.shared_blk_write_time += pending->counters.shared_blk_write_time;
+	shared->counters.local_blk_read_time += pending->counters.local_blk_read_time;
+	shared->counters.local_blk_write_time += pending->counters.local_blk_write_time;
+	shared->counters.temp_blk_read_time += pending->counters.temp_blk_read_time;
+	shared->counters.temp_blk_write_time += pending->counters.temp_blk_write_time;
+	shared->counters.wal_records += pending->counters.wal_records;
+	shared->counters.wal_fpi += pending->counters.wal_fpi;
+	shared->counters.wal_bytes += pending->counters.wal_bytes;
+	shared->counters.wal_buffers_full += pending->counters.wal_buffers_full;
+	shared->counters.jit_functions += pending->counters.jit_functions;
+	shared->counters.jit_generation_time += pending->counters.jit_generation_time;
+	shared->counters.jit_inlining_count += pending->counters.jit_inlining_count;
+	shared->counters.jit_inlining_time += pending->counters.jit_inlining_time;
+	shared->counters.jit_optimization_count += pending->counters.jit_optimization_count;
+	shared->counters.jit_optimization_time += pending->counters.jit_optimization_time;
+	shared->counters.jit_emission_count += pending->counters.jit_emission_count;
+	shared->counters.jit_emission_time += pending->counters.jit_emission_time;
+	shared->counters.jit_deform_count += pending->counters.jit_deform_count;
+	shared->counters.jit_deform_time += pending->counters.jit_deform_time;
+	shared->counters.parallel_workers_to_launch += pending->counters.parallel_workers_to_launch;
+	shared->counters.parallel_workers_launched += pending->counters.parallel_workers_launched;
+	shared->counters.generic_plan_calls += pending->counters.generic_plan_calls;
+	shared->counters.custom_plan_calls += pending->counters.custom_plan_calls;
+
+	if (pending->last_access_updated)
+		pg_atomic_write_u64(&shared->last_access,
+							(uint64) GetCurrentStatementStartTimestamp());
+
+	pgstat_unlock_entry(entry_ref);
+
+	return true;
 }
 
 /*
- * shmem init callback: Initialize our shared memory data structures at
- * postmaster startup.
- *
- * Load any pre-existing statistics from file.  Also create and load the
- * query-texts file, which is expected to exist (even if empty) while the
- * module is enabled.
+ * Serialize entry metadata + query text alongside each pgstat entry.
+ * On restart, from_serialized_data reconstructs both.
  */
 static void
-pgss_shmem_init(void *arg)
+pgss_to_serialized_data(const PgStat_HashKey *key,
+						const PgStatShared_Common *header,
+						FILE *statfile)
 {
-	int			tranche_id;
-	FILE	   *file = NULL;
-	FILE	   *qfile = NULL;
-	uint32		header;
-	int32		num;
-	int32		pgver;
-	int32		i;
-	int			buffer_size;
-	char	   *buffer = NULL;
+	PgStatShared_Pgss *shpgss = (PgStatShared_Pgss *) header;
+	bool		found = pgss_save;
+	char	   *qtext = NULL;
+	int			qtext_len = 0;
 
-	/*
-	 * We already checked that we're loaded from shared_preload_libraries in
-	 * _PG_init(), so we should not get here after postmaster startup.
-	 */
-	Assert(!IsUnderPostmaster);
-
-	/*
-	 * Initialize the shmem area with no statistics.
-	 */
-	tranche_id = LWLockNewTrancheId("pg_stat_statements");
-	LWLockInitialize(&pgss->lock.lock, tranche_id);
-	pgss->cur_median_usage = ASSUMED_MEDIAN_INIT;
-	pgss->mean_query_len = ASSUMED_LENGTH_INIT;
-	SpinLockInit(&pgss->mutex);
-	pgss->extent = 0;
-	pgss->n_writers = 0;
-	pgss->gc_count = 0;
-	pgss->stats.dealloc = 0;
-	pgss->stats.stats_reset = GetCurrentTimestamp();
-
-	/* The hash table must've also been initialized by now */
-	Assert(pgss_hash != NULL);
+	pgstat_write_chunk_s(statfile, &found);
 
-	/*
-	 * Set up a shmem exit hook to dump the statistics to disk on postmaster
-	 * (or standalone backend) exit.
-	 */
-	on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);
+	if (!pgss_save)
+		return;
 
-	/*
-	 * Load any pre-existing statistics from file.
-	 *
-	 * Note: we don't bother with locks here, because there should be no other
-	 * processes running when this code is reached.
-	 */
+	pgss_attach_shmem();
 
-	/* Unlink query text file possibly left over from crash */
-	unlink(PGSS_TEXT_FILE);
+	pgstat_write_chunk_s(statfile, &shpgss->encoding);
+	pgstat_write_chunk_s(statfile, &shpgss->stats_since);
+	pgstat_write_chunk_s(statfile, &shpgss->minmax_stats_since);
 
-	/* Allocate new query text temp file */
-	qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
-	if (qfile == NULL)
-		goto write_error;
+	/* Write query text */
+	if (DsaPointerIsValid(shpgss->query_text) && shpgss->query_len >= 0)
+		qtext = dsa_get_address(pgss_qtext_dsa, shpgss->query_text);
 
-	/*
-	 * If we were told not to load old statistics, we're done.  (Note we do
-	 * not try to unlink any old dump file in this case.  This seems a bit
-	 * questionable but it's the historical behavior.)
-	 */
-	if (!pgss_save)
+	if (qtext)
 	{
-		FreeFile(qfile);
-		return;
+		qtext_len = shpgss->query_len;
+		pgstat_write_chunk_s(statfile, &qtext_len);
+		pgstat_write_chunk(statfile, qtext, qtext_len + 1);
 	}
-
-	/*
-	 * Attempt to load old statistics from the dump file.
-	 */
-	file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R);
-	if (file == NULL)
+	else
 	{
-		if (errno != ENOENT)
-			goto read_error;
-		/* No existing persisted stats file, so we're done */
-		FreeFile(qfile);
-		return;
+		qtext_len = -1;
+		pgstat_write_chunk_s(statfile, &qtext_len);
 	}
+}
 
-	buffer_size = 2048;
-	buffer = (char *) palloc(buffer_size);
-
-	if (fread(&header, sizeof(uint32), 1, file) != 1 ||
-		fread(&pgver, sizeof(uint32), 1, file) != 1 ||
-		fread(&num, sizeof(int32), 1, file) != 1)
-		goto read_error;
+/*
+ * Deserialize auxiliary data: restore metadata fields and query text.
+ */
+static bool
+pgss_from_serialized_data(const PgStat_HashKey *key,
+						  PgStatShared_Common *header,
+						  FILE *statfile)
+{
+	PgStatShared_Pgss *shpgss = (PgStatShared_Pgss *) header;
+	bool		had_entry;
+	int			qtext_len;
 
-	if (header != PGSS_FILE_HEADER ||
-		pgver != PGSS_PG_MAJOR_VERSION)
-		goto data_error;
+	if (!pgstat_read_chunk_s(statfile, &had_entry))
+		return false;
 
-	for (i = 0; i < num; i++)
+	if (!had_entry)
 	{
-		pgssEntry	temp;
-		pgssEntry  *entry;
-		Size		query_offset;
+		pgstat_drop_entry(PGSTAT_KIND_PGSS, key->dboid, key->objid, false);
+		pg_atomic_sub_fetch_u64(&pgss_shared->live_entries, 1);
+		return true;
+	}
+
+	if (!pgstat_read_chunk_s(statfile, &shpgss->encoding))
+		return false;
+	if (!pgstat_read_chunk_s(statfile, &shpgss->stats_since))
+		return false;
+	if (!pgstat_read_chunk_s(statfile, &shpgss->minmax_stats_since))
+		return false;
+	if (!pgstat_read_chunk_s(statfile, &qtext_len))
+		return false;
 
-		if (fread(&temp, sizeof(pgssEntry), 1, file) != 1)
-			goto read_error;
+	pgss_attach_shmem();
 
-		/* Encoding is the only field we can easily sanity-check */
-		if (!PG_VALID_BE_ENCODING(temp.encoding))
-			goto data_error;
+	if (qtext_len >= 0)
+	{
+		char	   *qtext;
+		dsa_pointer dp;
 
-		/* Resize buffer as needed */
-		if (temp.query_len >= buffer_size)
+		qtext = palloc(qtext_len + 1);
+		if (!pgstat_read_chunk(statfile, qtext, qtext_len + 1))
 		{
-			buffer_size = Max(buffer_size * 2, temp.query_len + 1);
-			buffer = repalloc(buffer, buffer_size);
+			pfree(qtext);
+			return false;
 		}
 
-		if (fread(buffer, 1, temp.query_len + 1, file) != temp.query_len + 1)
-			goto read_error;
-
-		/* Should have a trailing null, but let's make sure */
-		buffer[temp.query_len] = '\0';
-
-		/* Skip loading "sticky" entries */
-		if (IS_STICKY(temp.counters))
-			continue;
+		dp = dsa_allocate_extended(pgss_qtext_dsa, qtext_len + 1, DSA_ALLOC_NO_OOM);
+		if (DsaPointerIsValid(dp))
+		{
+			memcpy(dsa_get_address(pgss_qtext_dsa, dp), qtext, qtext_len + 1);
+			shpgss->query_text = dp;
+			shpgss->query_len = qtext_len;
+		}
+		else
+		{
+			shpgss->query_text = InvalidDsaPointer;
+			shpgss->query_len = -1;
+		}
 
-		/* Store the query text */
-		query_offset = pgss->extent;
-		if (fwrite(buffer, 1, temp.query_len + 1, qfile) != temp.query_len + 1)
-			goto write_error;
-		pgss->extent += temp.query_len + 1;
-
-		/* make the hashtable entry (discards old entries if too many) */
-		entry = entry_alloc(&temp.key, query_offset, temp.query_len,
-							temp.encoding,
-							false);
-
-		/* copy in the actual stats */
-		entry->counters = temp.counters;
-		entry->stats_since = temp.stats_since;
-		entry->minmax_stats_since = temp.minmax_stats_since;
+		pfree(qtext);
+	}
+	else
+	{
+		shpgss->query_text = InvalidDsaPointer;
+		shpgss->query_len = -1;
 	}
 
-	/* Read global statistics for pg_stat_statements */
-	if (fread(&pgss->stats, sizeof(pgssGlobalStats), 1, file) != 1)
-		goto read_error;
+	return true;
+}
 
-	pfree(buffer);
-	FreeFile(file);
-	FreeFile(qfile);
 
-	/*
-	 * Remove the persisted stats file so it's not included in
-	 * backups/replication standbys, etc.  A new file will be written on next
-	 * shutdown.
-	 *
-	 * Note: it's okay if the PGSS_TEXT_FILE is included in a basebackup,
-	 * because we remove that file on startup; it acts inversely to
-	 * PGSS_DUMP_FILE, in that it is only supposed to be around when the
-	 * server is running, whereas PGSS_DUMP_FILE is only supposed to be around
-	 * when the server is not running.  Leaving the file creates no danger of
-	 * a newly restored database having a spurious record of execution costs,
-	 * which is what we're really concerned about here.
-	 */
-	unlink(PGSS_DUMP_FILE);
-
-	return;
-
-read_error:
-	ereport(LOG,
-			(errcode_for_file_access(),
-			 errmsg("could not read file \"%s\": %m",
-					PGSS_DUMP_FILE)));
-	goto fail;
-data_error:
-	ereport(LOG,
-			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-			 errmsg("ignoring invalid data in file \"%s\"",
-					PGSS_DUMP_FILE)));
-	goto fail;
-write_error:
-	ereport(LOG,
-			(errcode_for_file_access(),
-			 errmsg("could not write file \"%s\": %m",
-					PGSS_TEXT_FILE)));
-fail:
-	if (buffer)
-		pfree(buffer);
-	if (file)
-		FreeFile(file);
-	if (qfile)
-		FreeFile(qfile);
-	/* If possible, throw away the bogus file; ignore any error */
-	unlink(PGSS_DUMP_FILE);
+/*
+ * Store query text in the DSA area and update the entry's text pointer.
+ * On allocation failure, sets query_text to InvalidDsaPointer.
+ */
+static void
+pgss_store_query_text(PgStatShared_Pgss *entry, const char *query, int query_len,
+					  int encoding)
+{
+	dsa_pointer dp;
 
-	/*
-	 * Don't unlink PGSS_TEXT_FILE here; it should always be around while the
-	 * server is running with pg_stat_statements enabled
-	 */
+	dp = dsa_allocate_extended(pgss_qtext_dsa, query_len + 1, DSA_ALLOC_NO_OOM);
+	if (DsaPointerIsValid(dp))
+	{
+		char	   *dst = dsa_get_address(pgss_qtext_dsa, dp);
+
+		memcpy(dst, query, query_len);
+		dst[query_len] = '\0';
+		entry->query_text = dp;
+		entry->query_len = query_len;
+		entry->encoding = encoding;
+	}
+	else
+	{
+		entry->query_text = InvalidDsaPointer;
+		entry->query_len = -1;
+		entry->encoding = encoding;
+	}
 }
 
-/*
- * shmem_shutdown hook: Dump statistics into file.
+/*--------------------------------------------------------------------------
+ * pgss_evict: Evict the least-recently-used entries.
  *
- * Note: we don't bother with acquiring lock, because there should be no
- * other processes running when this is called.
+ * Called with pgss_shared->lock held exclusively.  Scans all entries,
+ * sorts by last_access timestamp (oldest first), and drops the bottom 5%.
+ *--------------------------------------------------------------------------
  */
-static void
-pgss_shmem_shutdown(int code, Datum arg)
+typedef struct pgssEvictEntry
 {
-	FILE	   *file;
-	char	   *qbuffer = NULL;
-	Size		qbuffer_size = 0;
-	HASH_SEQ_STATUS hash_seq;
-	int32		num_entries;
-	pgssEntry  *entry;
-
-	/* Don't try to dump during a crash. */
-	if (code)
-		return;
+	Oid			dbid;
+	uint64		objid;
+	TimestampTz last_access;
+	dsa_pointer text_ptr;
+}			pgssEvictEntry;
 
-	/* Safety check ... shouldn't get here unless shmem is set up. */
-	if (!pgss || !pgss_hash)
-		return;
+static int
+pgss_evict_entry_cmp(const void *a, const void *b)
+{
+	TimestampTz ta = ((const pgssEvictEntry *) a)->last_access;
+	TimestampTz tb = ((const pgssEvictEntry *) b)->last_access;
 
-	/* Don't dump if told not to. */
-	if (!pgss_save)
-		return;
+	if (ta < tb)
+		return -1;
+	if (ta > tb)
+		return 1;
+	return 0;
+}
+
+static void
+pgss_evict(void)
+{
+	dshash_seq_status hstat;
+	PgStatShared_HashEntry *p;
+	pgssEvictEntry *entries;
+	int			nentries = 0;
+	int			live = (int) pg_atomic_read_u64(&pgss_shared->live_entries);
+	int			nevict;
+	int			not_freed = 0;
+	instr_time	start,
+				duration;
+
+	INSTR_TIME_SET_CURRENT(start);
+
+	entries = palloc(live * sizeof(pgssEvictEntry));
+
+	dshash_seq_init(&hstat, pgss_hash, false);
+	while ((p = dshash_seq_next(&hstat)) != NULL)
+	{
+		PgStatShared_Pgss *shared;
 
-	file = AllocateFile(PGSS_DUMP_FILE ".tmp", PG_BINARY_W);
-	if (file == NULL)
-		goto error;
+		if (p->dropped)
+			continue;
 
-	if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1)
-		goto error;
-	if (fwrite(&PGSS_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
-		goto error;
-	num_entries = hash_get_num_entries(pgss_hash);
-	if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
-		goto error;
+		shared = (PgStatShared_Pgss *) dsa_get_address(pgStatLocal.dsa, p->body);
 
-	qbuffer = qtext_load_file(&qbuffer_size);
-	if (qbuffer == NULL)
-		goto error;
+		entries[nentries].dbid = p->key.dboid;
+		entries[nentries].objid = p->key.objid;
+		entries[nentries].last_access = (TimestampTz) pg_atomic_read_u64(&shared->last_access);
+		entries[nentries].text_ptr = shared->query_text;
+		nentries++;
+	}
+	dshash_seq_term(&hstat);
 
-	/*
-	 * When serializing to disk, we store query texts immediately after their
-	 * entry data.  Any orphaned query texts are thereby excluded.
-	 */
-	hash_seq_init(&hash_seq, pgss_hash);
-	while ((entry = hash_seq_search(&hash_seq)) != NULL)
+	if (nentries == 0)
 	{
-		int			len = entry->query_len;
-		char	   *qstr = qtext_fetch(entry->query_offset, len,
-									   qbuffer, qbuffer_size);
+		pfree(entries);
+		return;
+	}
+
+	qsort(entries, nentries, sizeof(pgssEvictEntry), pgss_evict_entry_cmp);
 
-		if (qstr == NULL)
-			continue;			/* Ignore any entries with bogus texts */
+	nevict = Max(nentries / PGSS_EVICT_RATIO, 1);
 
-		if (fwrite(entry, sizeof(pgssEntry), 1, file) != 1 ||
-			fwrite(qstr, 1, len + 1, file) != len + 1)
+	/* Don't evict entries accessed within the grace period */
+	for (int i = 0; i < nevict; i++)
+	{
+		if (!TimestampDifferenceExceeds(entries[i].last_access,
+										GetCurrentTimestamp(),
+										PGSS_EVICT_GRACE_MS))
 		{
-			/* note: we assume hash_seq_term won't change errno */
-			hash_seq_term(&hash_seq);
-			goto error;
+			nevict = i;
+			break;
 		}
 	}
 
-	/* Dump global statistics for pg_stat_statements */
-	if (fwrite(&pgss->stats, sizeof(pgssGlobalStats), 1, file) != 1)
-		goto error;
+	for (int i = 0; i < nevict; i++)
+	{
+		PgStat_EntryRef *victim_ref;
+		dsa_pointer text_ptr = InvalidDsaPointer;
+		bool		freed;
 
-	pfree(qbuffer);
-	qbuffer = NULL;
+		/*
+		 * Look up the entry to safely clear its query_text pointer under the
+		 * entry-level lock before freeing.  This prevents concurrent readers
+		 * from following a dangling DSA pointer.
+		 */
+		victim_ref = pgstat_get_entry_ref(PGSTAT_KIND_PGSS, entries[i].dbid,
+										  entries[i].objid, false, NULL);
+		if (victim_ref != NULL)
+		{
+			PgStatShared_Pgss *shared;
 
-	if (FreeFile(file))
-	{
-		file = NULL;
-		goto error;
+			shared = (PgStatShared_Pgss *) victim_ref->shared_stats;
+			pgstat_lock_entry(victim_ref, false);
+			text_ptr = shared->query_text;
+			shared->query_text = InvalidDsaPointer;
+			pgstat_unlock_entry(victim_ref);
+		}
+
+		freed = pgstat_drop_entry(PGSTAT_KIND_PGSS, entries[i].dbid,
+								  entries[i].objid, true);
+
+		if (DsaPointerIsValid(text_ptr))
+			dsa_free(pgss_qtext_dsa, text_ptr);
+
+		if (!freed)
+			not_freed++;
+		pg_atomic_sub_fetch_u64(&pgss_shared->live_entries, 1);
 	}
 
-	/*
-	 * Rename file into place, so we atomically replace any old one.
-	 */
-	(void) durable_rename(PGSS_DUMP_FILE ".tmp", PGSS_DUMP_FILE, LOG);
-
-	/* Unlink query-texts file; it's not needed while shutdown */
-	unlink(PGSS_TEXT_FILE);
-
-	return;
-
-error:
-	ereport(LOG,
-			(errcode_for_file_access(),
-			 errmsg("could not write file \"%s\": %m",
-					PGSS_DUMP_FILE ".tmp")));
-	if (qbuffer)
-		pfree(qbuffer);
-	if (file)
-		FreeFile(file);
-	unlink(PGSS_DUMP_FILE ".tmp");
-	unlink(PGSS_TEXT_FILE);
+	pg_atomic_fetch_add_u64(&pgss_shared->dealloc, 1);
+
+	if (not_freed > 0)
+		pgstat_request_entry_refs_gc();
+
+	pfree(entries);
+
+	INSTR_TIME_SET_CURRENT(duration);
+	INSTR_TIME_SUBTRACT(duration, start);
 }
 
 /*
- * Post-parse-analysis hook: mark query with a queryId
+ * pgss_maybe_evict: Run eviction if needed.  Returns true if creation
+ * can proceed, false if the caller should skip recording.
+ *
+ * Must be called with pgss_shared->lock held exclusively.
  */
-static void
-pgss_post_parse_analyze(ParseState *pstate, Query *query, const JumbleState *jstate)
+static bool
+pgss_maybe_evict(void)
 {
-	if (prev_post_parse_analyze_hook)
-		prev_post_parse_analyze_hook(pstate, query, jstate);
+	int64		live = (int64) pg_atomic_read_u64(&pgss_shared->live_entries);
 
-	/* Safety check... */
-	if (!pgss || !pgss_hash || !pgss_enabled(nesting_level))
-		return;
+	/* Safety valve: dshash overflow from pinned entries (see above) */
+	if (pgstat_get_entry_count(PGSTAT_KIND_PGSS) >= pgss_max * 2)
+	{
+		pg_atomic_fetch_add_u64(&pgss_shared->skipped_entries, 1);
+		return false;
+	}
 
-	/*
-	 * If it's EXECUTE, clear the queryId so that stats will accumulate for
-	 * the underlying PREPARE.  But don't do this if we're not tracking
-	 * utility statements, to avoid messing up another extension that might be
-	 * tracking them.
-	 */
-	if (query->utilityStmt)
+	if (live >= pgss_max)
 	{
-		if (pgss_track_utility && IsA(query->utilityStmt, ExecuteStmt))
+		TimestampTz now = GetCurrentStatementStartTimestamp();
+		TimestampTz last = (TimestampTz) pg_atomic_read_u64(&pgss_shared->last_eviction);
+
+		if (last == 0 || TimestampDifferenceExceeds(last, now,
+													PGSS_EVICT_INTERVAL_MS))
 		{
-			query->queryId = INT64CONST(0);
-			return;
+			pg_atomic_write_u64(&pgss_shared->last_eviction, (uint64) now);
+			pgss_evict();
+		}
+		else
+		{
+			pg_atomic_fetch_add_u64(&pgss_shared->skipped_entries, 1);
+			return false;
 		}
 	}
 
-	/*
-	 * If query jumbling were able to identify any ignorable constants, we
-	 * immediately create a hash table entry for the query, so that we can
-	 * record the normalized form of the query string.  If there were no such
-	 * constants, the normalized string would be the same as the query text
-	 * anyway, so there's no need for an early entry.
-	 */
-	if (jstate && jstate->clocations_count > 0)
-		pgss_store(pstate->p_sourcetext,
-				   query->queryId,
-				   query->stmt_location,
-				   query->stmt_len,
-				   PGSS_INVALID,
-				   0,
-				   0,
-				   NULL,
-				   NULL,
-				   NULL,
-				   jstate,
-				   0,
-				   0,
-				   PLAN_STMT_UNKNOWN);
+	return true;
 }
 
-/*
- * Planner hook: forward to regular planner, but measure planning time
- * if needed.
+/*--------------------------------------------------------------------------
+ * pgss_store: Record statistics for one statement execution.
+ *
+ * Lock-free lookup for existing entries.  If the entry is new, takes an
+ * exclusive lock to serialize creation, evicting if at capacity.
+ *--------------------------------------------------------------------------
  */
-static PlannedStmt *
-pgss_planner(Query *parse,
-			 const char *query_string,
+static void
+pgss_store(const char *query, int64 queryId,
+		   int query_location, int query_len,
+		   pgssStoreKind kind,
+		   double total_time, uint64 rows,
+		   const BufferUsage *bufusage,
+		   const WalUsage *walusage,
+		   const struct JitInstrumentation *jitusage,
+		   const JumbleState *jstate,
+		   int parallel_workers_to_launch,
+		   int parallel_workers_launched,
+		   PlannedStmtOrigin planOrigin)
+{
+	pgssHashKey key;
+
+	uint64		objid;
+	PgStat_EntryRef *entry_ref;
+	PgStat_PgssPending *pending;
+	PgStatShared_Pgss *shared;
+	char	   *norm_query = NULL;
+	int			encoding = GetDatabaseEncoding();
+	bool		created_entry = false;
+
+	Assert(query != NULL);
+
+	if (queryId == INT64CONST(0))
+		return;
+
+	memset(&key, 0, sizeof(pgssHashKey));
+	key.userid = GetUserId();
+	key.dbid = MyDatabaseId;
+	key.queryid = queryId;
+	key.toplevel = (nesting_level == 0);
+
+	pgss_attach_shmem();
+
+	objid = pgss_hash_key(&key);
+
+	/* Fast path: check if the entry already exists (lock-free lookup) */
+	entry_ref = pgstat_get_entry_ref(PGSTAT_KIND_PGSS, key.dbid, objid,
+									 false, NULL);
+
+	if (entry_ref == NULL)
+	{
+		/*
+		 * Entry doesn't exist.  Serialize creation under exclusive lock,
+		 * evicting if needed.
+		 */
+		LWLockAcquire(&pgss_shared->lock, LW_EXCLUSIVE);
+
+		if (!pgss_maybe_evict())
+		{
+			LWLockRelease(&pgss_shared->lock);
+			return;
+		}
+
+		entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_PGSS, key.dbid,
+											  objid, &created_entry);
+
+		LWLockRelease(&pgss_shared->lock);
+	}
+	else
+		entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_PGSS, key.dbid,
+											  objid, &created_entry);
+
+	shared = (PgStatShared_Pgss *) entry_ref->shared_stats;
+
+	/* New entry, or existing entry whose text was freed during eviction */
+	if (created_entry || !DsaPointerIsValid(shared->query_text))
+	{
+		pgstat_lock_entry(entry_ref, false);
+
+		if (created_entry)
+		{
+			pg_atomic_fetch_add_u64(&pgss_shared->live_entries, 1);
+			shared->key = key;
+			shared->stats_since = GetCurrentTimestamp();
+			pg_atomic_init_u32(&shared->usage, 1);
+			pg_atomic_init_u64(&shared->last_access,
+							   (uint64) GetCurrentStatementStartTimestamp());
+			shared->minmax_stats_since = shared->stats_since;
+			shared->encoding = encoding;
+		}
+
+		if (!DsaPointerIsValid(shared->query_text))
+		{
+			query = CleanQuerytext(query, &query_location, &query_len);
+			if (jstate && jstate->clocations_count > 0)
+				norm_query = generate_normalized_query(jstate, query,
+													   query_location,
+													   &query_len);
+
+			pgss_store_query_text(shared, norm_query ? norm_query : query,
+								  query_len, encoding);
+		}
+		pgstat_unlock_entry(entry_ref);
+	}
+
+	if (norm_query)
+		pfree(norm_query);
+
+	if (!jstate)
+	{
+		Assert(kind == PGSS_PLAN || kind == PGSS_EXEC);
+
+		pending = (PgStat_PgssPending *) entry_ref->pending;
+		pending->key = key;
+
+		pending->counters.calls[kind]++;
+		pending->last_access_updated = true;
+		pending->counters.total_time[kind] += total_time;
+
+		if (pending->counters.calls[kind] == 1)
+		{
+			pending->counters.min_time[kind] = total_time;
+			pending->counters.max_time[kind] = total_time;
+			pending->counters.mean_time[kind] = total_time;
+		}
+		else
+		{
+			/*
+			 * Welford's online algorithm for accumulating mean and sum of
+			 * squared deviations. See
+			 * <http://www.johndcook.com/blog/standard_deviation/>
+			 */
+			double		old_mean = pending->counters.mean_time[kind];
+
+			pending->counters.mean_time[kind] +=
+				(total_time - old_mean) / pending->counters.calls[kind];
+			pending->counters.sum_var_time[kind] +=
+				(total_time - old_mean) * (total_time - pending->counters.mean_time[kind]);
+
+			if (pending->counters.min_time[kind] > total_time)
+				pending->counters.min_time[kind] = total_time;
+			if (pending->counters.max_time[kind] < total_time)
+				pending->counters.max_time[kind] = total_time;
+		}
+
+		pending->counters.rows += rows;
+
+		if (bufusage)
+		{
+			pending->counters.shared_blks_hit += bufusage->shared_blks_hit;
+			pending->counters.shared_blks_read += bufusage->shared_blks_read;
+			pending->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
+			pending->counters.shared_blks_written += bufusage->shared_blks_written;
+			pending->counters.local_blks_hit += bufusage->local_blks_hit;
+			pending->counters.local_blks_read += bufusage->local_blks_read;
+			pending->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
+			pending->counters.local_blks_written += bufusage->local_blks_written;
+			pending->counters.temp_blks_read += bufusage->temp_blks_read;
+			pending->counters.temp_blks_written += bufusage->temp_blks_written;
+			pending->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time);
+			pending->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time);
+			pending->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time);
+			pending->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time);
+			pending->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time);
+			pending->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time);
+		}
+
+		if (walusage)
+		{
+			pending->counters.wal_records += walusage->wal_records;
+			pending->counters.wal_fpi += walusage->wal_fpi;
+			pending->counters.wal_bytes += walusage->wal_bytes;
+			pending->counters.wal_buffers_full += walusage->wal_buffers_full;
+		}
+
+		if (jitusage)
+		{
+			pending->counters.jit_functions += jitusage->created_functions;
+			pending->counters.jit_generation_time += INSTR_TIME_GET_MILLISEC(jitusage->generation_counter);
+
+			if (INSTR_TIME_GET_MILLISEC(jitusage->deform_counter))
+				pending->counters.jit_deform_count++;
+			pending->counters.jit_deform_time += INSTR_TIME_GET_MILLISEC(jitusage->deform_counter);
+
+			if (INSTR_TIME_GET_MILLISEC(jitusage->inlining_counter))
+				pending->counters.jit_inlining_count++;
+			pending->counters.jit_inlining_time += INSTR_TIME_GET_MILLISEC(jitusage->inlining_counter);
+
+			if (INSTR_TIME_GET_MILLISEC(jitusage->optimization_counter))
+				pending->counters.jit_optimization_count++;
+			pending->counters.jit_optimization_time += INSTR_TIME_GET_MILLISEC(jitusage->optimization_counter);
+
+			if (INSTR_TIME_GET_MILLISEC(jitusage->emission_counter))
+				pending->counters.jit_emission_count++;
+			pending->counters.jit_emission_time += INSTR_TIME_GET_MILLISEC(jitusage->emission_counter);
+		}
+
+		pending->counters.parallel_workers_to_launch += parallel_workers_to_launch;
+		pending->counters.parallel_workers_launched += parallel_workers_launched;
+
+		if (planOrigin == PLAN_STMT_CACHE_GENERIC)
+			pending->counters.generic_plan_calls++;
+		else if (planOrigin == PLAN_STMT_CACHE_CUSTOM)
+			pending->counters.custom_plan_calls++;
+	}
+
+}
+
+/*--------------------------------------------------------------------------
+ * Hook implementations
+ *--------------------------------------------------------------------------
+ */
+
+static void
+pgss_post_parse_analyze(ParseState *pstate, Query *query,
+						const JumbleState *jstate)
+{
+	if (prev_post_parse_analyze_hook)
+		prev_post_parse_analyze_hook(pstate, query, jstate);
+
+	if (!pgss_enabled(nesting_level))
+		return;
+
+	/*
+	 * Clear queryId for EXECUTE so stats accumulate under the PREPARE's
+	 * queryId instead.
+	 */
+	if (query->utilityStmt)
+	{
+		if (pgss_track_utility && IsA(query->utilityStmt, ExecuteStmt))
+		{
+			query->queryId = INT64CONST(0);
+			return;
+		}
+	}
+
+	/*
+	 * If query jumbling were able to identify any ignorable constants, we
+	 * immediately create a hash table entry for the query, so that we can
+	 * record the normalized form of the query string.  If there were no such
+	 * constants, the normalized string would be the same as the query text
+	 * anyway, so there's no need for an early entry.
+	 */
+	if (jstate && jstate->clocations_count > 0)
+		pgss_store(pstate->p_sourcetext,
+				   query->queryId,
+				   query->stmt_location,
+				   query->stmt_len,
+				   PGSS_INVALID,
+				   0,
+				   0,
+				   NULL,
+				   NULL,
+				   NULL,
+				   jstate,
+				   0,
+				   0,
+				   PLAN_STMT_UNKNOWN);
+}
+
+static PlannedStmt *
+pgss_planner(Query *parse,
+			 const char *query_string,
 			 int cursorOptions,
 			 ParamListInfo boundParams,
 			 ExplainState *es)
@@ -908,13 +1307,7 @@ pgss_planner(Query *parse,
 		WalUsage	walusage_start,
 					walusage;
 
-		/* We need to track buffer usage as the planner can access them. */
 		bufusage_start = pgBufferUsage;
-
-		/*
-		 * Similarly the planner could write some WAL records in some cases
-		 * (e.g. setting a hint bit with those being WAL-logged)
-		 */
 		walusage_start = pgWalUsage;
 		INSTR_TIME_SET_CURRENT(start);
 
@@ -937,11 +1330,9 @@ pgss_planner(Query *parse,
 		INSTR_TIME_SET_CURRENT(duration);
 		INSTR_TIME_SUBTRACT(duration, start);
 
-		/* calc differences of buffer counters. */
 		memset(&bufusage, 0, sizeof(BufferUsage));
 		BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
 
-		/* calc differences of WAL counters. */
 		memset(&walusage, 0, sizeof(WalUsage));
 		WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
 
@@ -956,17 +1347,11 @@ pgss_planner(Query *parse,
 				   &walusage,
 				   NULL,
 				   NULL,
-				   0,
-				   0,
+				   0, 0,
 				   result->planOrigin);
 	}
 	else
 	{
-		/*
-		 * Even though we're not tracking plan time for this statement, we
-		 * must still increment the nesting level, to ensure that functions
-		 * evaluated during planning are not seen as top-level calls.
-		 */
 		nesting_level++;
 		PG_TRY();
 		{
@@ -987,20 +1372,12 @@ pgss_planner(Query *parse,
 	return result;
 }
 
-/*
- * ExecutorStart hook: start up tracking if needed
- */
 static void
 pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
 {
-	/*
-	 * If query has queryId zero, don't track it.  This prevents double
-	 * counting of optimizable statements that are directly contained in
-	 * utility statements.
-	 */
-	if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != INT64CONST(0))
+	if (pgss_enabled(nesting_level) &&
+		queryDesc->plannedstmt->queryId != INT64CONST(0))
 	{
-		/* Request all summary instrumentation, i.e. timing, buffers and WAL */
 		queryDesc->query_instr_options |= INSTRUMENT_ALL;
 	}
 
@@ -1010,9 +1387,6 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
 		standard_ExecutorStart(queryDesc, eflags);
 }
 
-/*
- * ExecutorRun hook: all we need do is track nesting depth
- */
 static void
 pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count)
 {
@@ -1031,9 +1405,6 @@ pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count)
 	PG_END_TRY();
 }
 
-/*
- * ExecutorFinish hook: all we need do is track nesting depth
- */
 static void
 pgss_ExecutorFinish(QueryDesc *queryDesc)
 {
@@ -1052,9 +1423,6 @@ pgss_ExecutorFinish(QueryDesc *queryDesc)
 	PG_END_TRY();
 }
 
-/*
- * ExecutorEnd hook: store results if needed
- */
 static void
 pgss_ExecutorEnd(QueryDesc *queryDesc)
 {
@@ -1085,9 +1453,6 @@ pgss_ExecutorEnd(QueryDesc *queryDesc)
 		standard_ExecutorEnd(queryDesc);
 }
 
-/*
- * ProcessUtility hook
- */
 static void
 pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
 					bool readOnlyTree,
@@ -1102,36 +1467,9 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
 	PlannedStmtOrigin saved_planOrigin = pstmt->planOrigin;
 	bool		enabled = pgss_track_utility && pgss_enabled(nesting_level);
 
-	/*
-	 * Force utility statements to get queryId zero.  We do this even in cases
-	 * where the statement contains an optimizable statement for which a
-	 * queryId could be derived (such as EXPLAIN or DECLARE CURSOR).  For such
-	 * cases, runtime control will first go through ProcessUtility and then
-	 * the executor, and we don't want the executor hooks to do anything,
-	 * since we are already measuring the statement's costs at the utility
-	 * level.
-	 *
-	 * Note that this is only done if pg_stat_statements is enabled and
-	 * configured to track utility statements, in the unlikely possibility
-	 * that user configured another extension to handle utility statements
-	 * only.
-	 */
 	if (enabled)
 		pstmt->queryId = INT64CONST(0);
 
-	/*
-	 * If it's an EXECUTE statement, we don't track it and don't increment the
-	 * nesting level.  This allows the cycles to be charged to the underlying
-	 * PREPARE instead (by the Executor hooks), which is much more useful.
-	 *
-	 * We also don't track execution of PREPARE.  If we did, we would get one
-	 * hash table entry for the PREPARE (with hash calculated from the query
-	 * string), and then a different one with the same query string (but hash
-	 * calculated from the query tree) would be used to accumulate costs of
-	 * ensuing EXECUTEs.  This would be confusing.  Since PREPARE doesn't
-	 * actually run the planner (only parse+rewrite), its costs are generally
-	 * pretty negligible and it seems okay to just ignore it.
-	 */
 	if (enabled &&
 		!IsA(parsetree, ExecuteStmt) &&
 		!IsA(parsetree, PrepareStmt))
@@ -1166,36 +1504,20 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
 		}
 		PG_END_TRY();
 
-		/*
-		 * CAUTION: do not access the *pstmt data structure again below here.
-		 * If it was a ROLLBACK or similar, that data structure may have been
-		 * freed.  We must copy everything we still need into local variables,
-		 * which we did above.
-		 *
-		 * For the same reason, we can't risk restoring pstmt->queryId to its
-		 * former value, which'd otherwise be a good idea.
-		 */
 		pstmt = NULL;
 
 		INSTR_TIME_SET_CURRENT(duration);
 		INSTR_TIME_SUBTRACT(duration, start);
 
-		/*
-		 * Track the total number of rows retrieved or affected by the utility
-		 * statements of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED
-		 * VIEW, REFRESH MATERIALIZED VIEW and SELECT INTO.
-		 */
 		rows = (qc && (qc->commandTag == CMDTAG_COPY ||
 					   qc->commandTag == CMDTAG_FETCH ||
 					   qc->commandTag == CMDTAG_SELECT ||
 					   qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ?
 			qc->nprocessed : 0;
 
-		/* calc differences of buffer counters. */
 		memset(&bufusage, 0, sizeof(BufferUsage));
 		BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
 
-		/* calc differences of WAL counters. */
 		memset(&walusage, 0, sizeof(WalUsage));
 		WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
 
@@ -1210,24 +1532,11 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
 				   &walusage,
 				   NULL,
 				   NULL,
-				   0,
-				   0,
+				   0, 0,
 				   saved_planOrigin);
 	}
 	else
 	{
-		/*
-		 * Even though we're not tracking execution time for this statement,
-		 * we must still increment the nesting level, to ensure that functions
-		 * evaluated within it are not seen as top-level calls.  But don't do
-		 * so for EXECUTE; that way, when control reaches pgss_planner or
-		 * pgss_ExecutorStart, we will treat the costs as top-level if
-		 * appropriate.  Likewise, don't bump for PREPARE, so that parse
-		 * analysis will treat the statement as top-level if appropriate.
-		 *
-		 * To be absolutely certain we don't mess up the nesting level,
-		 * evaluate the bump_level condition just once.
-		 */
 		bool		bump_level =
 			!IsA(parsetree, ExecuteStmt) &&
 			!IsA(parsetree, PrepareStmt);
@@ -1254,405 +1563,144 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
 	}
 }
 
+/*--------------------------------------------------------------------------
+ * SQL-callable functions
+ *--------------------------------------------------------------------------
+ */
+
+/* Number of output arguments (columns) for various API versions */
+#define PG_STAT_STATEMENTS_COLS_V1_0	14
+#define PG_STAT_STATEMENTS_COLS_V1_1	18
+#define PG_STAT_STATEMENTS_COLS_V1_2	19
+#define PG_STAT_STATEMENTS_COLS_V1_3	23
+#define PG_STAT_STATEMENTS_COLS_V1_8	32
+#define PG_STAT_STATEMENTS_COLS_V1_9	33
+#define PG_STAT_STATEMENTS_COLS_V1_10	43
+#define PG_STAT_STATEMENTS_COLS_V1_11	49
+#define PG_STAT_STATEMENTS_COLS_V1_12	52
+#define PG_STAT_STATEMENTS_COLS_V1_13	54
+#define PG_STAT_STATEMENTS_COLS			54	/* maximum of above */
+
 /*
- * Store some statistics for a statement.
- *
- * If jstate is not NULL then we're trying to create an entry for which
- * we have no statistics as yet; we just want to record the normalized
- * query string.  total_time, rows, bufusage and walusage are ignored in this
- * case.
- *
- * If kind is PGSS_PLAN or PGSS_EXEC, its value is used as the array position
- * for the arrays in the Counters field.
+ * Reset statement statistics.
  */
-static void
-pgss_store(const char *query, int64 queryId,
-		   int query_location, int query_len,
-		   pgssStoreKind kind,
-		   double total_time, uint64 rows,
-		   const BufferUsage *bufusage,
-		   const WalUsage *walusage,
-		   const struct JitInstrumentation *jitusage,
-		   const JumbleState *jstate,
-		   int parallel_workers_to_launch,
-		   int parallel_workers_launched,
-		   PlannedStmtOrigin planOrigin)
+Datum
+pg_stat_statements_reset(PG_FUNCTION_ARGS)
 {
-	pgssHashKey key;
-	pgssEntry  *entry;
-	char	   *norm_query = NULL;
-	int			encoding = GetDatabaseEncoding();
-
-	Assert(query != NULL);
+	entry_reset(0, 0, 0, false);
 
-	/* Safety check... */
-	if (!pgss || !pgss_hash)
-		return;
+	PG_RETURN_VOID();
+}
 
-	/*
-	 * Nothing to do if compute_query_id isn't enabled and no other module
-	 * computed a query identifier.
-	 */
-	if (queryId == INT64CONST(0))
-		return;
+/*
+ * Reset statement statistics corresponding to userid, dbid, and queryid.
+ */
+Datum
+pg_stat_statements_reset_1_7(PG_FUNCTION_ARGS)
+{
+	Oid			userid;
+	Oid			dbid;
+	int64		queryid;
 
-	/*
-	 * Confine our attention to the relevant part of the string, if the query
-	 * is a portion of a multi-statement source string, and update query
-	 * location and length if needed.
-	 */
-	query = CleanQuerytext(query, &query_location, &query_len);
+	userid = PG_GETARG_OID(0);
+	dbid = PG_GETARG_OID(1);
+	queryid = PG_GETARG_INT64(2);
 
-	/* Set up key for hashtable search */
+	entry_reset(userid, dbid, queryid, false);
 
-	/* clear padding */
-	memset(&key, 0, sizeof(pgssHashKey));
+	PG_RETURN_VOID();
+}
 
-	key.userid = GetUserId();
-	key.dbid = MyDatabaseId;
-	key.queryid = queryId;
-	key.toplevel = (nesting_level == 0);
+Datum
+pg_stat_statements_reset_1_11(PG_FUNCTION_ARGS)
+{
+	Oid			userid;
+	Oid			dbid;
+	int64		queryid;
+	bool		minmax_only;
 
-	/* Lookup the hash table entry with shared lock. */
-	LWLockAcquire(&pgss->lock.lock, LW_SHARED);
-
-	entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
-
-	/* Create new entry, if not present */
-	if (!entry)
-	{
-		Size		query_offset;
-		int			gc_count;
-		bool		stored;
-		bool		do_gc;
-
-		/*
-		 * Create a new, normalized query string if caller asked.  We don't
-		 * need to hold the lock while doing this work.  (Note: in any case,
-		 * it's possible that someone else creates a duplicate hashtable entry
-		 * in the interval where we don't hold the lock below.  That case is
-		 * handled by entry_alloc.)
-		 */
-		if (jstate)
-		{
-			LWLockRelease(&pgss->lock.lock);
-			norm_query = generate_normalized_query(jstate, query,
-												   query_location,
-												   &query_len);
-			LWLockAcquire(&pgss->lock.lock, LW_SHARED);
-		}
-
-		/* Append new query text to file with only shared lock held */
-		stored = qtext_store(norm_query ? norm_query : query, query_len,
-							 &query_offset, &gc_count);
-
-		/*
-		 * Determine whether we need to garbage collect external query texts
-		 * while the shared lock is still held.  This micro-optimization
-		 * avoids taking the time to decide this while holding exclusive lock.
-		 */
-		do_gc = need_gc_qtexts();
-
-		/* Need exclusive lock to make a new hashtable entry - promote */
-		LWLockRelease(&pgss->lock.lock);
-		LWLockAcquire(&pgss->lock.lock, LW_EXCLUSIVE);
-
-		/*
-		 * A garbage collection may have occurred while we weren't holding the
-		 * lock.  In the unlikely event that this happens, the query text we
-		 * stored above will have been garbage collected, so write it again.
-		 * This should be infrequent enough that doing it while holding
-		 * exclusive lock isn't a performance problem.
-		 */
-		if (!stored || pgss->gc_count != gc_count)
-			stored = qtext_store(norm_query ? norm_query : query, query_len,
-								 &query_offset, NULL);
-
-		/* If we failed to write to the text file, give up */
-		if (!stored)
-			goto done;
-
-		/* OK to create a new hashtable entry */
-		entry = entry_alloc(&key, query_offset, query_len, encoding,
-							jstate != NULL);
-
-		/* If needed, perform garbage collection while exclusive lock held */
-		if (do_gc)
-			gc_qtexts();
-	}
-
-	/* Increment the counts, except when jstate is not NULL */
-	if (!jstate)
-	{
-		Assert(kind == PGSS_PLAN || kind == PGSS_EXEC);
-
-		/*
-		 * Grab the spinlock while updating the counters (see comment about
-		 * locking rules at the head of the file)
-		 */
-		SpinLockAcquire(&entry->mutex);
-
-		/* "Unstick" entry if it was previously sticky */
-		if (IS_STICKY(entry->counters))
-			entry->counters.usage = USAGE_INIT;
-
-		entry->counters.calls[kind] += 1;
-		entry->counters.total_time[kind] += total_time;
-
-		if (entry->counters.calls[kind] == 1)
-		{
-			entry->counters.min_time[kind] = total_time;
-			entry->counters.max_time[kind] = total_time;
-			entry->counters.mean_time[kind] = total_time;
-		}
-		else
-		{
-			/*
-			 * Welford's method for accurately computing variance. See
-			 * <http://www.johndcook.com/blog/standard_deviation/>
-			 */
-			double		old_mean = entry->counters.mean_time[kind];
-
-			entry->counters.mean_time[kind] +=
-				(total_time - old_mean) / entry->counters.calls[kind];
-			entry->counters.sum_var_time[kind] +=
-				(total_time - old_mean) * (total_time - entry->counters.mean_time[kind]);
-
-			/*
-			 * Calculate min and max time. min = 0 and max = 0 means that the
-			 * min/max statistics were reset
-			 */
-			if (entry->counters.min_time[kind] == 0
-				&& entry->counters.max_time[kind] == 0)
-			{
-				entry->counters.min_time[kind] = total_time;
-				entry->counters.max_time[kind] = total_time;
-			}
-			else
-			{
-				if (entry->counters.min_time[kind] > total_time)
-					entry->counters.min_time[kind] = total_time;
-				if (entry->counters.max_time[kind] < total_time)
-					entry->counters.max_time[kind] = total_time;
-			}
-		}
-		entry->counters.rows += rows;
-		entry->counters.shared_blks_hit += bufusage->shared_blks_hit;
-		entry->counters.shared_blks_read += bufusage->shared_blks_read;
-		entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
-		entry->counters.shared_blks_written += bufusage->shared_blks_written;
-		entry->counters.local_blks_hit += bufusage->local_blks_hit;
-		entry->counters.local_blks_read += bufusage->local_blks_read;
-		entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
-		entry->counters.local_blks_written += bufusage->local_blks_written;
-		entry->counters.temp_blks_read += bufusage->temp_blks_read;
-		entry->counters.temp_blks_written += bufusage->temp_blks_written;
-		entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time);
-		entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time);
-		entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time);
-		entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time);
-		entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time);
-		entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time);
-		entry->counters.usage += USAGE_EXEC(total_time);
-		entry->counters.wal_records += walusage->wal_records;
-		entry->counters.wal_fpi += walusage->wal_fpi;
-		entry->counters.wal_bytes += walusage->wal_bytes;
-		entry->counters.wal_buffers_full += walusage->wal_buffers_full;
-		if (jitusage)
-		{
-			entry->counters.jit_functions += jitusage->created_functions;
-			entry->counters.jit_generation_time += INSTR_TIME_GET_MILLISEC(jitusage->generation_counter);
-
-			if (INSTR_TIME_GET_MILLISEC(jitusage->deform_counter))
-				entry->counters.jit_deform_count++;
-			entry->counters.jit_deform_time += INSTR_TIME_GET_MILLISEC(jitusage->deform_counter);
-
-			if (INSTR_TIME_GET_MILLISEC(jitusage->inlining_counter))
-				entry->counters.jit_inlining_count++;
-			entry->counters.jit_inlining_time += INSTR_TIME_GET_MILLISEC(jitusage->inlining_counter);
-
-			if (INSTR_TIME_GET_MILLISEC(jitusage->optimization_counter))
-				entry->counters.jit_optimization_count++;
-			entry->counters.jit_optimization_time += INSTR_TIME_GET_MILLISEC(jitusage->optimization_counter);
-
-			if (INSTR_TIME_GET_MILLISEC(jitusage->emission_counter))
-				entry->counters.jit_emission_count++;
-			entry->counters.jit_emission_time += INSTR_TIME_GET_MILLISEC(jitusage->emission_counter);
-		}
-
-		/* parallel worker counters */
-		entry->counters.parallel_workers_to_launch += parallel_workers_to_launch;
-		entry->counters.parallel_workers_launched += parallel_workers_launched;
-
-		/* plan cache counters */
-		if (planOrigin == PLAN_STMT_CACHE_GENERIC)
-			entry->counters.generic_plan_calls++;
-		else if (planOrigin == PLAN_STMT_CACHE_CUSTOM)
-			entry->counters.custom_plan_calls++;
-
-		SpinLockRelease(&entry->mutex);
-	}
-
-done:
-	LWLockRelease(&pgss->lock.lock);
-
-	/* We postpone this clean-up until we're out of the lock */
-	if (norm_query)
-		pfree(norm_query);
-}
-
-/*
- * Reset statement statistics corresponding to userid, dbid, and queryid.
- */
-Datum
-pg_stat_statements_reset_1_7(PG_FUNCTION_ARGS)
-{
-	Oid			userid;
-	Oid			dbid;
-	int64		queryid;
-
-	userid = PG_GETARG_OID(0);
-	dbid = PG_GETARG_OID(1);
-	queryid = PG_GETARG_INT64(2);
-
-	entry_reset(userid, dbid, queryid, false);
-
-	PG_RETURN_VOID();
-}
-
-Datum
-pg_stat_statements_reset_1_11(PG_FUNCTION_ARGS)
-{
-	Oid			userid;
-	Oid			dbid;
-	int64		queryid;
-	bool		minmax_only;
-
-	userid = PG_GETARG_OID(0);
-	dbid = PG_GETARG_OID(1);
-	queryid = PG_GETARG_INT64(2);
-	minmax_only = PG_GETARG_BOOL(3);
+	userid = PG_GETARG_OID(0);
+	dbid = PG_GETARG_OID(1);
+	queryid = PG_GETARG_INT64(2);
+	minmax_only = PG_GETARG_BOOL(3);
 
 	PG_RETURN_TIMESTAMPTZ(entry_reset(userid, dbid, queryid, minmax_only));
 }
 
-/*
- * Reset statement statistics.
- */
 Datum
-pg_stat_statements_reset(PG_FUNCTION_ARGS)
-{
-	entry_reset(0, 0, 0, false);
-
-	PG_RETURN_VOID();
-}
-
-/* Number of output arguments (columns) for various API versions */
-#define PG_STAT_STATEMENTS_COLS_V1_0	14
-#define PG_STAT_STATEMENTS_COLS_V1_1	18
-#define PG_STAT_STATEMENTS_COLS_V1_2	19
-#define PG_STAT_STATEMENTS_COLS_V1_3	23
-#define PG_STAT_STATEMENTS_COLS_V1_8	32
-#define PG_STAT_STATEMENTS_COLS_V1_9	33
-#define PG_STAT_STATEMENTS_COLS_V1_10	43
-#define PG_STAT_STATEMENTS_COLS_V1_11	49
-#define PG_STAT_STATEMENTS_COLS_V1_12	52
-#define PG_STAT_STATEMENTS_COLS_V1_13	54
-#define PG_STAT_STATEMENTS_COLS			54	/* maximum of above */
-
-/*
- * Retrieve statement statistics.
- *
- * The SQL API of this function has changed multiple times, and will likely
- * do so again in future.  To support the case where a newer version of this
- * loadable module is being used with an old SQL declaration of the function,
- * we continue to support the older API versions.  For 1.2 and later, the
- * expected API version is identified by embedding it in the C name of the
- * function.  Unfortunately we weren't bright enough to do that for 1.1.
- */
-Datum
-pg_stat_statements_1_13(PG_FUNCTION_ARGS)
+pg_stat_statements_1_2(PG_FUNCTION_ARGS)
 {
 	bool		showtext = PG_GETARG_BOOL(0);
 
-	pg_stat_statements_internal(fcinfo, PGSS_V1_13, showtext);
-
+	pg_stat_statements_internal(fcinfo, PGSS_V1_2, showtext);
 	return (Datum) 0;
 }
 
 Datum
-pg_stat_statements_1_12(PG_FUNCTION_ARGS)
+pg_stat_statements_1_3(PG_FUNCTION_ARGS)
 {
 	bool		showtext = PG_GETARG_BOOL(0);
 
-	pg_stat_statements_internal(fcinfo, PGSS_V1_12, showtext);
-
+	pg_stat_statements_internal(fcinfo, PGSS_V1_3, showtext);
 	return (Datum) 0;
 }
 
 Datum
-pg_stat_statements_1_11(PG_FUNCTION_ARGS)
+pg_stat_statements_1_8(PG_FUNCTION_ARGS)
 {
 	bool		showtext = PG_GETARG_BOOL(0);
 
-	pg_stat_statements_internal(fcinfo, PGSS_V1_11, showtext);
-
+	pg_stat_statements_internal(fcinfo, PGSS_V1_8, showtext);
 	return (Datum) 0;
 }
 
 Datum
-pg_stat_statements_1_10(PG_FUNCTION_ARGS)
+pg_stat_statements_1_9(PG_FUNCTION_ARGS)
 {
 	bool		showtext = PG_GETARG_BOOL(0);
 
-	pg_stat_statements_internal(fcinfo, PGSS_V1_10, showtext);
-
+	pg_stat_statements_internal(fcinfo, PGSS_V1_9, showtext);
 	return (Datum) 0;
 }
 
 Datum
-pg_stat_statements_1_9(PG_FUNCTION_ARGS)
+pg_stat_statements_1_10(PG_FUNCTION_ARGS)
 {
 	bool		showtext = PG_GETARG_BOOL(0);
 
-	pg_stat_statements_internal(fcinfo, PGSS_V1_9, showtext);
-
+	pg_stat_statements_internal(fcinfo, PGSS_V1_10, showtext);
 	return (Datum) 0;
 }
 
 Datum
-pg_stat_statements_1_8(PG_FUNCTION_ARGS)
+pg_stat_statements_1_11(PG_FUNCTION_ARGS)
 {
 	bool		showtext = PG_GETARG_BOOL(0);
 
-	pg_stat_statements_internal(fcinfo, PGSS_V1_8, showtext);
-
+	pg_stat_statements_internal(fcinfo, PGSS_V1_11, showtext);
 	return (Datum) 0;
 }
 
 Datum
-pg_stat_statements_1_3(PG_FUNCTION_ARGS)
+pg_stat_statements_1_12(PG_FUNCTION_ARGS)
 {
 	bool		showtext = PG_GETARG_BOOL(0);
 
-	pg_stat_statements_internal(fcinfo, PGSS_V1_3, showtext);
-
+	pg_stat_statements_internal(fcinfo, PGSS_V1_12, showtext);
 	return (Datum) 0;
 }
 
 Datum
-pg_stat_statements_1_2(PG_FUNCTION_ARGS)
+pg_stat_statements_1_13(PG_FUNCTION_ARGS)
 {
 	bool		showtext = PG_GETARG_BOOL(0);
 
-	pg_stat_statements_internal(fcinfo, PGSS_V1_2, showtext);
-
+	pg_stat_statements_internal(fcinfo, PGSS_V1_13, showtext);
 	return (Datum) 0;
 }
 
 /*
  * Legacy entry point for pg_stat_statements() API versions 1.0 and 1.1.
- * This can be removed someday, perhaps.
  */
 Datum
 pg_stat_statements(PG_FUNCTION_ARGS)
@@ -1663,40 +1711,34 @@ pg_stat_statements(PG_FUNCTION_ARGS)
 	return (Datum) 0;
 }
 
-/* Common code for all versions of pg_stat_statements() */
+/*
+ * pg_stat_statements_internal
+ *
+ * Scan the per-kind pgstat dshash for all entries, reading counters and
+ * metadata directly from the shared body.
+ */
 static void
 pg_stat_statements_internal(FunctionCallInfo fcinfo,
 							pgssVersion api_version,
 							bool showtext)
 {
 	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	dshash_seq_status hstat;
+	PgStatShared_HashEntry *p;
 	Oid			userid = GetUserId();
-	bool		is_allowed_role = false;
-	char	   *qbuffer = NULL;
-	Size		qbuffer_size = 0;
-	Size		extent = 0;
-	int			gc_count = 0;
-	HASH_SEQ_STATUS hash_seq;
-	pgssEntry  *entry;
+	bool		is_allowed_role;
 
-	/*
-	 * Superusers or roles with the privileges of pg_read_all_stats members
-	 * are allowed
-	 */
 	is_allowed_role = has_privs_of_role(userid, ROLE_PG_READ_ALL_STATS);
 
-	/* hash table must exist already */
-	if (!pgss || !pgss_hash)
-		ereport(ERROR,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("pg_stat_statements must be loaded via \"shared_preload_libraries\"")));
+	pgss_attach_shmem();
+
+	/* Flush pending stats so we can read up-to-date counters */
+	pgstat_report_stat(true);
 
 	InitMaterializedSRF(fcinfo, 0);
 
 	/*
-	 * Check we have the expected number of output arguments.  Aside from
-	 * being a good safety check, we need a kluge here to detect API version
-	 * 1.1, which was wedged into the code in an ill-considered way.
+	 * Check we have the expected number of output arguments.
 	 */
 	switch (rsinfo->setDesc->natts)
 	{
@@ -1746,157 +1788,70 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo,
 			elog(ERROR, "incorrect number of output arguments");
 	}
 
-	/*
-	 * We'd like to load the query text file (if needed) while not holding any
-	 * lock on pgss->lock.  In the worst case we'll have to do this again
-	 * after we have the lock, but it's unlikely enough to make this a win
-	 * despite occasional duplicated work.  We need to reload if anybody
-	 * writes to the file (either a retail qtext_store(), or a garbage
-	 * collection) between this point and where we've gotten shared lock.  If
-	 * a qtext_store is actually in progress when we look, we might as well
-	 * skip the speculative load entirely.
-	 */
-	if (showtext)
-	{
-		int			n_writers;
-
-		/* Take the mutex so we can examine variables */
-		SpinLockAcquire(&pgss->mutex);
-		extent = pgss->extent;
-		n_writers = pgss->n_writers;
-		gc_count = pgss->gc_count;
-		SpinLockRelease(&pgss->mutex);
-
-		/* No point in loading file now if there are active writers */
-		if (n_writers == 0)
-			qbuffer = qtext_load_file(&qbuffer_size);
-	}
-
-	/*
-	 * Get shared lock, load or reload the query text file if we must, and
-	 * iterate over the hashtable entries.
-	 *
-	 * With a large hash table, we might be holding the lock rather longer
-	 * than one could wish.  However, this only blocks creation of new hash
-	 * table entries, and the larger the hash table the less likely that is to
-	 * be needed.  So we can hope this is okay.  Perhaps someday we'll decide
-	 * we need to partition the hash table to limit the time spent holding any
-	 * one lock.
-	 */
-	LWLockAcquire(&pgss->lock.lock, LW_SHARED);
-
-	if (showtext)
-	{
-		/*
-		 * Here it is safe to examine extent and gc_count without taking the
-		 * mutex.  Note that although other processes might change
-		 * pgss->extent just after we look at it, the strings they then write
-		 * into the file cannot yet be referenced in the hashtable, so we
-		 * don't care whether we see them or not.
-		 *
-		 * If qtext_load_file fails, we just press on; we'll return NULL for
-		 * every query text.
-		 */
-		if (qbuffer == NULL ||
-			pgss->extent != extent ||
-			pgss->gc_count != gc_count)
-		{
-			if (qbuffer)
-				pfree(qbuffer);
-			qbuffer = qtext_load_file(&qbuffer_size);
-		}
-	}
-
-	hash_seq_init(&hash_seq, pgss_hash);
-	while ((entry = hash_seq_search(&hash_seq)) != NULL)
+	dshash_seq_init(&hstat, pgss_hash, false);
+	while ((p = dshash_seq_next(&hstat)) != NULL)
 	{
 		Datum		values[PG_STAT_STATEMENTS_COLS];
 		bool		nulls[PG_STAT_STATEMENTS_COLS];
 		int			i = 0;
-		Counters	tmp;
+		PgStatShared_Pgss *shared;
+		pgssCounters tmp;
 		double		stddev;
-		int64		queryid = entry->key.queryid;
-		TimestampTz stats_since;
-		TimestampTz minmax_stats_since;
+
+		if (p->dropped)
+			continue;
 
 		memset(values, 0, sizeof(values));
 		memset(nulls, 0, sizeof(nulls));
 
-		values[i++] = ObjectIdGetDatum(entry->key.userid);
-		values[i++] = ObjectIdGetDatum(entry->key.dbid);
+		shared = (PgStatShared_Pgss *) dsa_get_address(pgStatLocal.dsa, p->body);
+
+		LWLockAcquire(&shared->header.lock, LW_SHARED);
+		tmp = shared->counters;
+		LWLockRelease(&shared->header.lock);
+
+		if (tmp.calls[PGSS_EXEC] == 0 && tmp.calls[PGSS_PLAN] == 0)
+			continue;
+
+		values[i++] = ObjectIdGetDatum(shared->key.userid);
+		values[i++] = ObjectIdGetDatum(shared->key.dbid);
 		if (api_version >= PGSS_V1_9)
-			values[i++] = BoolGetDatum(entry->key.toplevel);
+			values[i++] = BoolGetDatum(shared->key.toplevel);
 
-		if (is_allowed_role || entry->key.userid == userid)
+		if (is_allowed_role || shared->key.userid == userid)
 		{
 			if (api_version >= PGSS_V1_2)
-				values[i++] = Int64GetDatumFast(queryid);
+				values[i++] = Int64GetDatumFast(shared->key.queryid);
 
 			if (showtext)
 			{
-				char	   *qstr = qtext_fetch(entry->query_offset,
-											   entry->query_len,
-											   qbuffer,
-											   qbuffer_size);
-
-				if (qstr)
+				if (DsaPointerIsValid(shared->query_text) && shared->query_len >= 0)
 				{
-					char	   *enc;
-
-					enc = pg_any_to_server(qstr,
-										   entry->query_len,
-										   entry->encoding);
+					char	   *qstr = dsa_get_address(pgss_qtext_dsa, shared->query_text);
+					char	   *enc = pg_any_to_server(qstr, shared->query_len, shared->encoding);
 
 					values[i++] = CStringGetTextDatum(enc);
-
 					if (enc != qstr)
 						pfree(enc);
 				}
 				else
-				{
-					/* Just return a null if we fail to find the text */
 					nulls[i++] = true;
-				}
 			}
 			else
-			{
-				/* Query text not requested */
 				nulls[i++] = true;
-			}
 		}
 		else
 		{
-			/* Don't show queryid */
 			if (api_version >= PGSS_V1_2)
 				nulls[i++] = true;
 
-			/*
-			 * Don't show query text, but hint as to the reason for not doing
-			 * so if it was requested
-			 */
 			if (showtext)
 				values[i++] = CStringGetTextDatum("<insufficient privilege>");
 			else
 				nulls[i++] = true;
 		}
 
-		/* copy counters to a local variable to keep locking time short */
-		SpinLockAcquire(&entry->mutex);
-		tmp = entry->counters;
-		SpinLockRelease(&entry->mutex);
-
-		/*
-		 * The spinlock is not required when reading these two as they are
-		 * always updated when holding pgss->lock exclusively.
-		 */
-		stats_since = entry->stats_since;
-		minmax_stats_since = entry->minmax_stats_since;
-
-		/* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
-		if (IS_STICKY(tmp))
-			continue;
-
-		/* Note that we rely on PGSS_PLAN being 0 and PGSS_EXEC being 1. */
+		/* Note: PGSS_PLAN is 0, PGSS_EXEC is 1 */
 		for (int kind = 0; kind < PGSS_NUMKIND; kind++)
 		{
 			if (kind == PGSS_EXEC || api_version >= PGSS_V1_8)
@@ -1912,12 +1867,6 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo,
 				values[i++] = Float8GetDatumFast(tmp.max_time[kind]);
 				values[i++] = Float8GetDatumFast(tmp.mean_time[kind]);
 
-				/*
-				 * Note we are calculating the population variance here, not
-				 * the sample variance, as we have data for the whole
-				 * population, so Bessel's correction is not used, and we
-				 * don't divide by tmp.calls - 1.
-				 */
 				if (tmp.calls[kind] > 1)
 					stddev = sqrt(tmp.sum_var_time[kind] / tmp.calls[kind]);
 				else
@@ -1925,6 +1874,7 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo,
 				values[i++] = Float8GetDatumFast(stddev);
 			}
 		}
+
 		values[i++] = Int64GetDatumFast(tmp.rows);
 		values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
 		values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
@@ -1962,8 +1912,6 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo,
 			values[i++] = Int64GetDatumFast(tmp.wal_fpi);
 
 			snprintf(buf, sizeof buf, UINT64_FORMAT, tmp.wal_bytes);
-
-			/* Convert to numeric. */
 			wal_bytes = DirectFunctionCall3(numeric_in,
 											CStringGetDatum(buf),
 											ObjectIdGetDatum(0),
@@ -1971,9 +1919,7 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo,
 			values[i++] = wal_bytes;
 		}
 		if (api_version >= PGSS_V1_12)
-		{
 			values[i++] = Int64GetDatumFast(tmp.wal_buffers_full);
-		}
 		if (api_version >= PGSS_V1_10)
 		{
 			values[i++] = Int64GetDatumFast(tmp.jit_functions);
@@ -2002,8 +1948,8 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo,
 		}
 		if (api_version >= PGSS_V1_11)
 		{
-			values[i++] = TimestampTzGetDatum(stats_since);
-			values[i++] = TimestampTzGetDatum(minmax_stats_since);
+			values[i++] = TimestampTzGetDatum(shared->stats_since);
+			values[i++] = TimestampTzGetDatum(shared->minmax_stats_since);
 		}
 
 		Assert(i == (api_version == PGSS_V1_0 ? PG_STAT_STATEMENTS_COLS_V1_0 :
@@ -2020,15 +1966,11 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo,
 
 		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
 	}
-
-	LWLockRelease(&pgss->lock.lock);
-
-	if (qbuffer)
-		pfree(qbuffer);
+	dshash_seq_term(&hstat);
 }
 
 /* Number of output arguments (columns) for pg_stat_statements_info */
-#define PG_STAT_STATEMENTS_INFO_COLS	2
+#define PG_STAT_STATEMENTS_INFO_COLS	5
 
 /*
  * Return statistics of pg_stat_statements.
@@ -2036,756 +1978,129 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo,
 Datum
 pg_stat_statements_info(PG_FUNCTION_ARGS)
 {
-	pgssGlobalStats stats;
 	TupleDesc	tupdesc;
 	Datum		values[PG_STAT_STATEMENTS_INFO_COLS] = {0};
 	bool		nulls[PG_STAT_STATEMENTS_INFO_COLS] = {0};
 
-	if (!pgss || !pgss_hash)
-		ereport(ERROR,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("pg_stat_statements must be loaded via \"shared_preload_libraries\"")));
-
-	/* Build a tuple descriptor for our result type */
 	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
 		elog(ERROR, "return type must be a row type");
 
-	/* Read global statistics for pg_stat_statements */
-	SpinLockAcquire(&pgss->mutex);
-	stats = pgss->stats;
-	SpinLockRelease(&pgss->mutex);
+	pgss_attach_shmem();
 
-	values[0] = Int64GetDatum(stats.dealloc);
-	values[1] = TimestampTzGetDatum(stats.stats_reset);
+	values[0] = Int64GetDatum((int64) pg_atomic_read_u64(&pgss_shared->dealloc));
+	values[1] = TimestampTzGetDatum((TimestampTz) pg_atomic_read_u64(&pgss_shared->stats_reset));
+	values[2] = Int64GetDatum((int64) pg_atomic_read_u64(&pgss_shared->skipped_entries));
+	values[3] = Int64GetDatum((int64) dsa_get_total_size(pgss_qtext_dsa));
+	values[4] = Int64GetDatum((int64) pgstat_get_entry_count(PGSTAT_KIND_PGSS));
 
 	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
 }
 
-/*
- * Allocate a new hashtable entry.
- * caller must hold an exclusive lock on pgss->lock
- *
- * "query" need not be null-terminated; we rely on query_len instead
- *
- * If "sticky" is true, make the new entry artificially sticky so that it will
- * probably still be there when the query finishes execution.  We do this by
- * giving it a median usage value rather than the normal value.  (Strictly
- * speaking, query strings are normalized on a best effort basis, though it
- * would be difficult to demonstrate this even under artificial conditions.)
- *
- * Note: despite needing exclusive lock, it's not an error for the target
- * entry to already exist.  This is because pgss_store releases and
- * reacquires lock after failing to find a match; so someone else could
- * have made the entry while we waited to get exclusive lock.
- */
-static pgssEntry *
-entry_alloc(pgssHashKey *key, Size query_offset, int query_len, int encoding,
-			bool sticky)
-{
-	pgssEntry  *entry;
-	bool		found;
-
-	/* Make space if needed */
-	while (hash_get_num_entries(pgss_hash) >= pgss_max)
-		entry_dealloc();
-
-	/* Find or create an entry with desired hash code */
-	entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found);
-
-	if (!found)
-	{
-		/* New entry, initialize it */
-
-		/* reset the statistics */
-		memset(&entry->counters, 0, sizeof(Counters));
-		/* set the appropriate initial usage count */
-		entry->counters.usage = sticky ? pgss->cur_median_usage : USAGE_INIT;
-		/* re-initialize the mutex each time ... we assume no one using it */
-		SpinLockInit(&entry->mutex);
-		/* ... and don't forget the query text metadata */
-		Assert(query_len >= 0);
-		entry->query_offset = query_offset;
-		entry->query_len = query_len;
-		entry->encoding = encoding;
-		entry->stats_since = GetCurrentTimestamp();
-		entry->minmax_stats_since = entry->stats_since;
-	}
-
-	return entry;
-}
-
-/*
- * qsort comparator for sorting into increasing usage order
- */
-static int
-entry_cmp(const void *lhs, const void *rhs)
-{
-	double		l_usage = (*(pgssEntry *const *) lhs)->counters.usage;
-	double		r_usage = (*(pgssEntry *const *) rhs)->counters.usage;
-
-	if (l_usage < r_usage)
-		return -1;
-	else if (l_usage > r_usage)
-		return +1;
-	else
-		return 0;
-}
-
-/*
- * Deallocate least-used entries.
- *
- * Caller must hold an exclusive lock on pgss->lock.
- */
-static void
-entry_dealloc(void)
-{
-	HASH_SEQ_STATUS hash_seq;
-	pgssEntry **entries;
-	pgssEntry  *entry;
-	int			nvictims;
-	int			i;
-	Size		tottextlen;
-	int			nvalidtexts;
-
-	/*
-	 * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
-	 * While we're scanning the table, apply the decay factor to the usage
-	 * values, and update the mean query length.
-	 *
-	 * Note that the mean query length is almost immediately obsolete, since
-	 * we compute it before not after discarding the least-used entries.
-	 * Hopefully, that doesn't affect the mean too much; it doesn't seem worth
-	 * making two passes to get a more current result.  Likewise, the new
-	 * cur_median_usage includes the entries we're about to zap.
-	 */
-
-	entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));
-
-	i = 0;
-	tottextlen = 0;
-	nvalidtexts = 0;
-
-	hash_seq_init(&hash_seq, pgss_hash);
-	while ((entry = hash_seq_search(&hash_seq)) != NULL)
-	{
-		entries[i++] = entry;
-		/* "Sticky" entries get a different usage decay rate. */
-		if (IS_STICKY(entry->counters))
-			entry->counters.usage *= STICKY_DECREASE_FACTOR;
-		else
-			entry->counters.usage *= USAGE_DECREASE_FACTOR;
-		/* In the mean length computation, ignore dropped texts. */
-		if (entry->query_len >= 0)
-		{
-			tottextlen += entry->query_len + 1;
-			nvalidtexts++;
-		}
-	}
-
-	/* Sort into increasing order by usage */
-	qsort(entries, i, sizeof(pgssEntry *), entry_cmp);
-
-	/* Record the (approximate) median usage */
-	if (i > 0)
-		pgss->cur_median_usage = entries[i / 2]->counters.usage;
-	/* Record the mean query length */
-	if (nvalidtexts > 0)
-		pgss->mean_query_len = tottextlen / nvalidtexts;
-	else
-		pgss->mean_query_len = ASSUMED_LENGTH_INIT;
-
-	/* Now zap an appropriate fraction of lowest-usage entries */
-	nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
-	nvictims = Min(nvictims, i);
-
-	for (i = 0; i < nvictims; i++)
-	{
-		hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL);
-	}
-
-	pfree(entries);
-
-	/* Increment the number of times entries are deallocated */
-	SpinLockAcquire(&pgss->mutex);
-	pgss->stats.dealloc += 1;
-	SpinLockRelease(&pgss->mutex);
-}
-
-/*
- * Given a query string (not necessarily null-terminated), allocate a new
- * entry in the external query text file and store the string there.
- *
- * If successful, returns true, and stores the new entry's offset in the file
- * into *query_offset.  Also, if gc_count isn't NULL, *gc_count is set to the
- * number of garbage collections that have occurred so far.
- *
- * On failure, returns false.
- *
- * At least a shared lock on pgss->lock must be held by the caller, so as
- * to prevent a concurrent garbage collection.  Share-lock-holding callers
- * should pass a gc_count pointer to obtain the number of garbage collections,
- * so that they can recheck the count after obtaining exclusive lock to
- * detect whether a garbage collection occurred (and removed this entry).
- */
-static bool
-qtext_store(const char *query, int query_len,
-			Size *query_offset, int *gc_count)
-{
-	Size		off;
-	int			fd;
-
-	/*
-	 * We use a spinlock to protect extent/n_writers/gc_count, so that
-	 * multiple processes may execute this function concurrently.
-	 */
-	SpinLockAcquire(&pgss->mutex);
-	off = pgss->extent;
-	pgss->extent += query_len + 1;
-	pgss->n_writers++;
-	if (gc_count)
-		*gc_count = pgss->gc_count;
-	SpinLockRelease(&pgss->mutex);
-
-	*query_offset = off;
-
-	/*
-	 * Don't allow the file to grow larger than what qtext_load_file can
-	 * (theoretically) handle.  This has been seen to be reachable on 32-bit
-	 * platforms.
-	 */
-	if (unlikely(query_len >= MaxAllocHugeSize - off))
-	{
-		errno = EFBIG;			/* not quite right, but it'll do */
-		fd = -1;
-		goto error;
-	}
-
-	/* Now write the data into the successfully-reserved part of the file */
-	fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
-	if (fd < 0)
-		goto error;
-
-	if (pg_pwrite(fd, query, query_len, off) != query_len)
-		goto error;
-	if (pg_pwrite(fd, "\0", 1, off + query_len) != 1)
-		goto error;
-
-	CloseTransientFile(fd);
-
-	/* Mark our write complete */
-	SpinLockAcquire(&pgss->mutex);
-	pgss->n_writers--;
-	SpinLockRelease(&pgss->mutex);
-
-	return true;
-
-error:
-	ereport(LOG,
-			(errcode_for_file_access(),
-			 errmsg("could not write file \"%s\": %m",
-					PGSS_TEXT_FILE)));
-
-	if (fd >= 0)
-		CloseTransientFile(fd);
-
-	/* Mark our write complete */
-	SpinLockAcquire(&pgss->mutex);
-	pgss->n_writers--;
-	SpinLockRelease(&pgss->mutex);
-
-	return false;
-}
-
-/*
- * Read the external query text file into a palloc'd buffer.
- *
- * Returns NULL (without throwing an error) if unable to read, eg
- * file not there or insufficient memory.
- *
- * On success, the buffer size is also returned into *buffer_size.
- *
- * This can be called without any lock on pgss->lock, but in that case
- * the caller is responsible for verifying that the result is sane.
- */
-static char *
-qtext_load_file(Size *buffer_size)
+static TimestampTz
+entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only)
 {
-	char	   *buf;
-	int			fd;
-	struct stat stat;
-	Size		nread;
+	dshash_seq_status hstat;
+	PgStatShared_HashEntry *p;
+	TimestampTz stats_reset;
 
-	fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDONLY | PG_BINARY);
-	if (fd < 0)
-	{
-		if (errno != ENOENT)
-			ereport(LOG,
-					(errcode_for_file_access(),
-					 errmsg("could not read file \"%s\": %m",
-							PGSS_TEXT_FILE)));
-		return NULL;
-	}
+	pgss_attach_shmem();
 
-	/* Get file length */
-	if (fstat(fd, &stat))
-	{
-		ereport(LOG,
-				(errcode_for_file_access(),
-				 errmsg("could not stat file \"%s\": %m",
-						PGSS_TEXT_FILE)));
-		CloseTransientFile(fd);
-		return NULL;
-	}
-
-	/* Allocate buffer; beware that off_t might be wider than size_t */
-	if (stat.st_size <= MaxAllocHugeSize)
-		buf = (char *) palloc_extended(stat.st_size, MCXT_ALLOC_HUGE | MCXT_ALLOC_NO_OOM);
-	else
-		buf = NULL;
-	if (buf == NULL)
-	{
-		ereport(LOG,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory"),
-				 errdetail("Could not allocate enough memory to read file \"%s\".",
-						   PGSS_TEXT_FILE)));
-		CloseTransientFile(fd);
-		return NULL;
-	}
+	stats_reset = GetCurrentTimestamp();
 
-	/*
-	 * OK, slurp in the file.  Windows fails if we try to read more than
-	 * INT_MAX bytes at once, and other platforms might not like that either,
-	 * so read a very large file in 1GB segments.
-	 */
-	nread = 0;
-	while (nread < stat.st_size)
+	if (minmax_only)
 	{
-		int			toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
-
-		/*
-		 * If we get a short read and errno doesn't get set, the reason is
-		 * probably that garbage collection truncated the file since we did
-		 * the fstat(), so we don't log a complaint --- but we don't return
-		 * the data, either, since it's most likely corrupt due to concurrent
-		 * writes from garbage collection.
-		 */
-		errno = 0;
-		if (read(fd, buf + nread, toread) != toread)
+		dshash_seq_init(&hstat, pgss_hash, false);
+		while ((p = dshash_seq_next(&hstat)) != NULL)
 		{
-			if (errno)
-				ereport(LOG,
-						(errcode_for_file_access(),
-						 errmsg("could not read file \"%s\": %m",
-								PGSS_TEXT_FILE)));
-			pfree(buf);
-			CloseTransientFile(fd);
-			return NULL;
-		}
-		nread += toread;
-	}
+			PgStatShared_Pgss *shared;
 
-	if (CloseTransientFile(fd) != 0)
-		ereport(LOG,
-				(errcode_for_file_access(),
-				 errmsg("could not close file \"%s\": %m", PGSS_TEXT_FILE)));
-
-	*buffer_size = nread;
-	return buf;
-}
+			if (p->dropped)
+				continue;
 
-/*
- * Locate a query text in the file image previously read by qtext_load_file().
- *
- * We validate the given offset/length, and return NULL if bogus.  Otherwise,
- * the result points to a null-terminated string within the buffer.
- */
-static char *
-qtext_fetch(Size query_offset, int query_len,
-			char *buffer, Size buffer_size)
-{
-	/* File read failed? */
-	if (buffer == NULL)
-		return NULL;
-	/* Bogus offset/length? */
-	if (query_len < 0 ||
-		query_offset + query_len >= buffer_size)
-		return NULL;
-	/* As a further sanity check, make sure there's a trailing null */
-	if (buffer[query_offset + query_len] != '\0')
-		return NULL;
-	/* Looks OK */
-	return buffer + query_offset;
-}
-
-/*
- * Do we need to garbage-collect the external query text file?
- *
- * Caller should hold at least a shared lock on pgss->lock.
- */
-static bool
-need_gc_qtexts(void)
-{
-	Size		extent;
-
-	/* Read shared extent pointer */
-	SpinLockAcquire(&pgss->mutex);
-	extent = pgss->extent;
-	SpinLockRelease(&pgss->mutex);
-
-	/*
-	 * Don't proceed if file does not exceed 512 bytes per possible entry.
-	 *
-	 * Here and in the next test, 32-bit machines have overflow hazards if
-	 * pgss_max and/or mean_query_len are large.  Force the multiplications
-	 * and comparisons to be done in uint64 arithmetic to forestall trouble.
-	 */
-	if ((uint64) extent < (uint64) 512 * pgss_max)
-		return false;
+			shared = (PgStatShared_Pgss *) dsa_get_address(pgStatLocal.dsa, p->body);
 
-	/*
-	 * Don't proceed if file is less than about 50% bloat.  Nothing can or
-	 * should be done in the event of unusually large query texts accounting
-	 * for file's large size.  We go to the trouble of maintaining the mean
-	 * query length in order to prevent garbage collection from thrashing
-	 * uselessly.
-	 */
-	if ((uint64) extent < (uint64) pgss->mean_query_len * pgss_max * 2)
-		return false;
-
-	return true;
-}
-
-/*
- * Garbage-collect orphaned query texts in external file.
- *
- * This won't be called often in the typical case, since it's likely that
- * there won't be too much churn, and besides, a similar compaction process
- * occurs when serializing to disk at shutdown or as part of resetting.
- * Despite this, it seems prudent to plan for the edge case where the file
- * becomes unreasonably large, with no other method of compaction likely to
- * occur in the foreseeable future.
- *
- * The caller must hold an exclusive lock on pgss->lock.
- *
- * At the first sign of trouble we unlink the query text file to get a clean
- * slate (although existing statistics are retained), rather than risk
- * thrashing by allowing the same problem case to recur indefinitely.
- */
-static void
-gc_qtexts(void)
-{
-	char	   *qbuffer;
-	Size		qbuffer_size;
-	FILE	   *qfile = NULL;
-	HASH_SEQ_STATUS hash_seq;
-	pgssEntry  *entry;
-	Size		extent;
-	int			nentries;
-
-	/*
-	 * When called from pgss_store, some other session might have proceeded
-	 * with garbage collection in the no-lock-held interim of lock strength
-	 * escalation.  Check once more that this is actually necessary.
-	 */
-	if (!need_gc_qtexts())
-		return;
-
-	/*
-	 * Load the old texts file.  If we fail (out of memory, for instance),
-	 * invalidate query texts.  Hopefully this is rare.  It might seem better
-	 * to leave things alone on an OOM failure, but the problem is that the
-	 * file is only going to get bigger; hoping for a future non-OOM result is
-	 * risky and can easily lead to complete denial of service.
-	 */
-	qbuffer = qtext_load_file(&qbuffer_size);
-	if (qbuffer == NULL)
-		goto gc_fail;
-
-	/*
-	 * We overwrite the query texts file in place, so as to reduce the risk of
-	 * an out-of-disk-space failure.  Since the file is guaranteed not to get
-	 * larger, this should always work on traditional filesystems; though we
-	 * could still lose on copy-on-write filesystems.
-	 */
-	qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
-	if (qfile == NULL)
-	{
-		ereport(LOG,
-				(errcode_for_file_access(),
-				 errmsg("could not write file \"%s\": %m",
-						PGSS_TEXT_FILE)));
-		goto gc_fail;
-	}
-
-	extent = 0;
-	nentries = 0;
+			if ((!userid || shared->key.userid == userid) &&
+				(!dbid || shared->key.dbid == dbid) &&
+				(!queryid || shared->key.queryid == queryid))
+			{
+				LWLockAcquire(&shared->header.lock, LW_EXCLUSIVE);
 
-	hash_seq_init(&hash_seq, pgss_hash);
-	while ((entry = hash_seq_search(&hash_seq)) != NULL)
-	{
-		int			query_len = entry->query_len;
-		char	   *qry = qtext_fetch(entry->query_offset,
-									  query_len,
-									  qbuffer,
-									  qbuffer_size);
+				shared->minmax_stats_since = stats_reset;
 
-		if (qry == NULL)
-		{
-			/* Trouble ... drop the text */
-			entry->query_offset = 0;
-			entry->query_len = -1;
-			/* entry will not be counted in mean query length computation */
-			continue;
-		}
+				for (int kind = 0; kind < PGSS_NUMKIND; kind++)
+				{
+					shared->counters.min_time[kind] = 0;
+					shared->counters.max_time[kind] = 0;
+					shared->counters.mean_time[kind] = 0;
+					shared->counters.sum_var_time[kind] = 0;
+				}
 
-		if (fwrite(qry, 1, query_len + 1, qfile) != query_len + 1)
-		{
-			ereport(LOG,
-					(errcode_for_file_access(),
-					 errmsg("could not write file \"%s\": %m",
-							PGSS_TEXT_FILE)));
-			hash_seq_term(&hash_seq);
-			goto gc_fail;
+				LWLockRelease(&shared->header.lock);
+			}
 		}
+		dshash_seq_term(&hstat);
 
-		entry->query_offset = extent;
-		extent += query_len + 1;
-		nentries++;
+		return stats_reset;
 	}
 
-	/*
-	 * Truncate away any now-unused space.  If this fails for some odd reason,
-	 * we log it, but there's no need to fail.
-	 */
-	if (ftruncate(fileno(qfile), extent) != 0)
-		ereport(LOG,
-				(errcode_for_file_access(),
-				 errmsg("could not truncate file \"%s\": %m",
-						PGSS_TEXT_FILE)));
-
-	if (FreeFile(qfile))
-	{
-		ereport(LOG,
-				(errcode_for_file_access(),
-				 errmsg("could not write file \"%s\": %m",
-						PGSS_TEXT_FILE)));
-		qfile = NULL;
-		goto gc_fail;
-	}
-
-	elog(DEBUG1, "pgss gc of queries file shrunk size from %zu to %zu",
-		 pgss->extent, extent);
-
-	/* Reset the shared extent pointer */
-	pgss->extent = extent;
-
-	/*
-	 * Also update the mean query length, to be sure that need_gc_qtexts()
-	 * won't still think we have a problem.
-	 */
-	if (nentries > 0)
-		pgss->mean_query_len = extent / nentries;
-	else
-		pgss->mean_query_len = ASSUMED_LENGTH_INIT;
-
-	pfree(qbuffer);
-
-	/*
-	 * OK, count a garbage collection cycle.  (Note: even though we have
-	 * exclusive lock on pgss->lock, we must take pgss->mutex for this, since
-	 * other processes may examine gc_count while holding only the mutex.
-	 * Also, we have to advance the count *after* we've rewritten the file,
-	 * else other processes might not realize they read a stale file.)
-	 */
-	record_gc_qtexts();
-
-	return;
-
-gc_fail:
-	/* clean up resources */
-	if (qfile)
-		FreeFile(qfile);
-	if (qbuffer)
-		pfree(qbuffer);
-
-	/*
-	 * Since the contents of the external file are now uncertain, mark all
-	 * hashtable entries as having invalid texts.
-	 */
-	hash_seq_init(&hash_seq, pgss_hash);
-	while ((entry = hash_seq_search(&hash_seq)) != NULL)
-	{
-		entry->query_offset = 0;
-		entry->query_len = -1;
-	}
-
-	/*
-	 * Destroy the query text file and create a new, empty one
-	 */
-	(void) unlink(PGSS_TEXT_FILE);
-	qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
-	if (qfile == NULL)
-		ereport(LOG,
-				(errcode_for_file_access(),
-				 errmsg("could not recreate file \"%s\": %m",
-						PGSS_TEXT_FILE)));
-	else
-		FreeFile(qfile);
-
-	/* Reset the shared extent pointer */
-	pgss->extent = 0;
-
-	/* Reset mean_query_len to match the new state */
-	pgss->mean_query_len = ASSUMED_LENGTH_INIT;
-
-	/*
-	 * Bump the GC count even though we failed.
-	 *
-	 * This is needed to make concurrent readers of file without any lock on
-	 * pgss->lock notice existence of new version of file.  Once readers
-	 * subsequently observe a change in GC count with pgss->lock held, that
-	 * forces a safe reopen of file.  Writers also require that we bump here,
-	 * of course.  (As required by locking protocol, readers and writers don't
-	 * trust earlier file contents until gc_count is found unchanged after
-	 * pgss->lock acquired in shared or exclusive mode respectively.)
-	 */
-	record_gc_qtexts();
-}
-
-#define SINGLE_ENTRY_RESET(e) \
-if (e) { \
-	if (minmax_only) { \
-		/* When requested reset only min/max statistics of an entry */ \
-		for (int kind = 0; kind < PGSS_NUMKIND; kind++) \
-		{ \
-			e->counters.max_time[kind] = 0; \
-			e->counters.min_time[kind] = 0; \
-		} \
-		e->minmax_stats_since = stats_reset; \
-	} \
-	else \
-	{ \
-		/* Remove the key otherwise  */ \
-		hash_search(pgss_hash, &e->key, HASH_REMOVE, NULL); \
-		num_remove++; \
-	} \
-}
-
-/*
- * Reset entries corresponding to parameters passed.
- */
-static TimestampTz
-entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only)
-{
-	HASH_SEQ_STATUS hash_seq;
-	pgssEntry  *entry;
-	FILE	   *qfile;
-	int64		num_entries;
-	int64		num_remove = 0;
-	pgssHashKey key;
-	TimestampTz stats_reset;
-
-	if (!pgss || !pgss_hash)
-		ereport(ERROR,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("pg_stat_statements must be loaded via \"shared_preload_libraries\"")));
-
-	LWLockAcquire(&pgss->lock.lock, LW_EXCLUSIVE);
-	num_entries = hash_get_num_entries(pgss_hash);
-
-	stats_reset = GetCurrentTimestamp();
-
 	if (userid != 0 && dbid != 0 && queryid != INT64CONST(0))
 	{
-		/* If all the parameters are available, use the fast path. */
+		pgssHashKey key;
+		uint64		objid;
+
 		memset(&key, 0, sizeof(pgssHashKey));
 		key.userid = userid;
 		key.dbid = dbid;
 		key.queryid = queryid;
 
-		/*
-		 * Reset the entry if it exists, starting with the non-top-level
-		 * entry.
-		 */
 		key.toplevel = false;
-		entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
-
-		SINGLE_ENTRY_RESET(entry);
+		objid = pgss_hash_key(&key);
+		pgstat_drop_entry(PGSTAT_KIND_PGSS, key.dbid, objid, true);
 
-		/* Also reset the top-level entry if it exists. */
 		key.toplevel = true;
-		entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
+		objid = pgss_hash_key(&key);
+		pgstat_drop_entry(PGSTAT_KIND_PGSS, key.dbid, objid, true);
 
-		SINGLE_ENTRY_RESET(entry);
-	}
-	else if (userid != 0 || dbid != 0 || queryid != INT64CONST(0))
-	{
-		/* Reset entries corresponding to valid parameters. */
-		hash_seq_init(&hash_seq, pgss_hash);
-		while ((entry = hash_seq_search(&hash_seq)) != NULL)
-		{
-			if ((!userid || entry->key.userid == userid) &&
-				(!dbid || entry->key.dbid == dbid) &&
-				(!queryid || entry->key.queryid == queryid))
-			{
-				SINGLE_ENTRY_RESET(entry);
-			}
-		}
+		pgstat_request_entry_refs_gc();
 	}
 	else
 	{
-		/* Reset all entries. */
-		hash_seq_init(&hash_seq, pgss_hash);
-		while ((entry = hash_seq_search(&hash_seq)) != NULL)
+		dshash_seq_init(&hstat, pgss_hash, true);
+		while ((p = dshash_seq_next(&hstat)) != NULL)
 		{
-			SINGLE_ENTRY_RESET(entry);
-		}
-	}
-
-	/* All entries are removed? */
-	if (num_entries != num_remove)
-		goto release_lock;
-
-	/*
-	 * Reset global statistics for pg_stat_statements since all entries are
-	 * removed.
-	 */
-	SpinLockAcquire(&pgss->mutex);
-	pgss->stats.dealloc = 0;
-	pgss->stats.stats_reset = stats_reset;
-	SpinLockRelease(&pgss->mutex);
+			PgStatShared_Pgss *shared;
 
-	/*
-	 * Write new empty query file, perhaps even creating a new one to recover
-	 * if the file was missing.
-	 */
-	qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
-	if (qfile == NULL)
-	{
-		ereport(LOG,
-				(errcode_for_file_access(),
-				 errmsg("could not create file \"%s\": %m",
-						PGSS_TEXT_FILE)));
-		goto done;
-	}
+			if (p->dropped)
+				continue;
 
-	/* If ftruncate fails, log it, but it's not a fatal problem */
-	if (ftruncate(fileno(qfile), 0) != 0)
-		ereport(LOG,
-				(errcode_for_file_access(),
-				 errmsg("could not truncate file \"%s\": %m",
-						PGSS_TEXT_FILE)));
+			shared = (PgStatShared_Pgss *) dsa_get_address(pgStatLocal.dsa, p->body);
 
-	FreeFile(qfile);
+			if ((!userid || shared->key.userid == userid) &&
+				(!dbid || shared->key.dbid == dbid) &&
+				(!queryid || shared->key.queryid == queryid))
+			{
+				if (DsaPointerIsValid(shared->query_text))
+				{
+					dsa_free(pgss_qtext_dsa, shared->query_text);
+					shared->query_text = InvalidDsaPointer;
+				}
 
-done:
-	pgss->extent = 0;
-	/* This counts as a query text garbage collection for our purposes */
-	record_gc_qtexts();
+				pgstat_drop_current(p, &hstat);
+				pg_atomic_sub_fetch_u64(&pgss_shared->live_entries, 1);
+			}
+		}
+		dshash_seq_term(&hstat);
 
-release_lock:
-	LWLockRelease(&pgss->lock.lock);
+		pgstat_request_entry_refs_gc();
+	}
+	/* If this was a full reset (no filters), reset global statistics */
+	if (!userid && !dbid && !queryid)
+	{
+		pg_atomic_write_u64(&pgss_shared->dealloc, 0);
+		pg_atomic_write_u64(&pgss_shared->skipped_entries, 0);
+		pg_atomic_write_u64(&pgss_shared->live_entries, 0);
+		pg_atomic_write_u64(&pgss_shared->stats_reset, (uint64) stats_reset);
+	}
 
 	return stats_reset;
 }
diff --git a/contrib/pg_stat_statements/pg_stat_statements.conf b/contrib/pg_stat_statements/pg_stat_statements.conf
index 0e900d7119b..21a10c41d09 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.conf
+++ b/contrib/pg_stat_statements/pg_stat_statements.conf
@@ -1,2 +1,3 @@
 shared_preload_libraries = 'pg_stat_statements'
 max_prepared_transactions = 5
+max_parallel_workers_per_gather = 0
diff --git a/contrib/pg_stat_statements/pg_stat_statements.control b/contrib/pg_stat_statements/pg_stat_statements.control
index 2eee0ceffa8..61ae41efc14 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.control
+++ b/contrib/pg_stat_statements/pg_stat_statements.control
@@ -1,5 +1,5 @@
 # pg_stat_statements extension
 comment = 'track planning and execution statistics of all SQL statements executed'
-default_version = '1.13'
+default_version = '1.14'
 module_pathname = '$libdir/pg_stat_statements'
 relocatable = true
diff --git a/contrib/pg_stat_statements/sql/dml.sql b/contrib/pg_stat_statements/sql/dml.sql
index 9986b0a22d3..8f31ac96153 100644
--- a/contrib/pg_stat_statements/sql/dml.sql
+++ b/contrib/pg_stat_statements/sql/dml.sql
@@ -2,6 +2,7 @@
 -- DMLs on test table
 --
 
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
 SET pg_stat_statements.track_utility = FALSE;
 
 CREATE TEMP TABLE pgss_dml_tab (a int, b char(20));
diff --git a/contrib/pg_stat_statements/sql/oldextversions.sql b/contrib/pg_stat_statements/sql/oldextversions.sql
index e416efe9ffb..b2090be6267 100644
--- a/contrib/pg_stat_statements/sql/oldextversions.sql
+++ b/contrib/pg_stat_statements/sql/oldextversions.sql
@@ -68,4 +68,9 @@ AlTER EXTENSION pg_stat_statements UPDATE TO '1.13';
 \d pg_stat_statements
 SELECT count(*) > 0 AS has_data FROM pg_stat_statements;
 
+-- Functions marked PARALLEL RESTRICTED in 1.14
+AlTER EXTENSION pg_stat_statements UPDATE TO '1.14';
+\d pg_stat_statements
+SELECT count(*) > 0 AS has_data FROM pg_stat_statements;
+
 DROP EXTENSION pg_stat_statements;
diff --git a/contrib/pg_stat_statements/sql/select.sql b/contrib/pg_stat_statements/sql/select.sql
index a10d618c034..f7e731bce1b 100644
--- a/contrib/pg_stat_statements/sql/select.sql
+++ b/contrib/pg_stat_statements/sql/select.sql
@@ -269,3 +269,15 @@ DROP SCHEMA pgss_schema_1 CASCADE;
 DROP SCHEMA pgss_schema_2 CASCADE;
 DROP TABLE tab_search_same, tab_search_diff_1, tab_search_diff_2;
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+--
+-- reset within a transaction: entries with unflushed pending data should
+-- still be removed and not reappear in the view
+--
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+BEGIN;
+SELECT 1 AS "RESET_TXN_TEST";
+SELECT count(*) FROM pg_stat_statements WHERE query LIKE '%RESET_TXN_TEST%';
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+SELECT count(*) FROM pg_stat_statements WHERE query LIKE '%RESET_TXN_TEST%';
+COMMIT;
diff --git a/contrib/pg_stat_statements/t/010_restart.pl b/contrib/pg_stat_statements/t/001_restart.pl
similarity index 100%
rename from contrib/pg_stat_statements/t/010_restart.pl
rename to contrib/pg_stat_statements/t/001_restart.pl
diff --git a/contrib/pg_stat_statements/t/002_query_text_memory.pl b/contrib/pg_stat_statements/t/002_query_text_memory.pl
new file mode 100644
index 00000000000..50b00153625
--- /dev/null
+++ b/contrib/pg_stat_statements/t/002_query_text_memory.pl
@@ -0,0 +1,124 @@
+# Copyright (c) 2008-2026, PostgreSQL Global Development Group
+
+# Tests for pg_stat_statements.query_text_memory behavior.
+# Verifies that when the query text DSA is exhausted:
+#   - entries are still tracked with counters accumulating
+#   - query text is NULL for entries that could not store text
+#   - both showtext=true and showtext=false return all entries
+#   - after raising the limit and re-executing, text is backfilled
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+$node->append_conf('postgresql.conf', qq{
+shared_preload_libraries = 'pg_stat_statements'
+pg_stat_statements.query_text_memory = 2MB
+});
+$node->start;
+
+$node->safe_psql('postgres', 'CREATE EXTENSION pg_stat_statements');
+$node->safe_psql('postgres', 'SELECT pg_stat_statements_reset()');
+
+my $mem = $node->safe_psql('postgres',
+	"SHOW pg_stat_statements.query_text_memory");
+is($mem, '2MB', 'query_text_memory is 2MB');
+
+# Generate unique queries to exhaust the text DSA (set to 2MB).
+# Each CTE query with 200 integer constants is ~1000 bytes of text.
+my $cols = join(', ', map { "$_" } (1 .. 200));
+my $sql = '';
+for my $i (1 .. 2500)
+{
+	$sql .= "WITH t${i} AS (SELECT $cols) SELECT FROM t${i};\n";
+}
+$node->safe_psql('postgres', $sql);
+
+my $null_count = $node->safe_psql('postgres', q{
+SELECT count(*)
+FROM pg_stat_statements
+WHERE query IS NULL AND queryid IS NOT NULL
+});
+
+diag("$null_count entries without text after 2500 queries");
+
+cmp_ok($null_count, '>', 0,
+	"some entries have NULL query text after DSA exhaustion ($null_count)");
+
+# Entries without text still have calls > 0
+my $tracked = $node->safe_psql('postgres', q{
+SELECT count(*)
+FROM pg_stat_statements
+WHERE query IS NULL AND queryid IS NOT NULL AND calls > 0
+});
+cmp_ok($tracked, '>', 0,
+	"entries without text still track counters ($tracked)");
+
+# Entries with text also exist (early entries got text before exhaustion)
+my $with_text = $node->safe_psql('postgres', q{
+SELECT count(*)
+FROM pg_stat_statements
+WHERE query IS NOT NULL AND query LIKE 'WITH t%'
+});
+cmp_ok($with_text, '>', 0,
+	"some entries still have query text ($with_text)");
+
+# Both showtext=true and showtext=false should return all entries
+my $count_true = $node->safe_psql('postgres', q{
+SELECT count(*) FROM pg_stat_statements(true)
+});
+my $count_false = $node->safe_psql('postgres', q{
+SELECT count(*) FROM pg_stat_statements(false)
+});
+cmp_ok($count_true, '>=', 2500,
+	"showtext=true returns all entries ($count_true)");
+cmp_ok($count_false, '>=', 2500,
+	"showtext=false returns all entries ($count_false)");
+
+# Run a probe query with constants while DSA is exhausted.
+# Since nentries < pg_stat_statements.max, entry_dealloc won't evict or
+# free any DSA space, so this entry should remain with NULL text.
+$node->safe_psql('postgres', 'SELECT 11111 + 22222 + 33333');
+
+my $probe_before = $node->safe_psql('postgres', q{
+SELECT count(*) FROM pg_stat_statements WHERE query = 'SELECT $1 + $2 + $3'
+});
+is($probe_before, '0',
+	'probe query text is NULL while DSA is exhausted');
+
+# Phase 2: Raise limit and verify backfill.
+$node->safe_psql('postgres',
+	"ALTER SYSTEM SET pg_stat_statements.query_text_memory = '100MB'");
+$node->safe_psql('postgres', "SELECT pg_reload_conf()");
+
+# Re-run the probe query in a new connection to trigger backfill.
+# Normalization is not guaranteed for all backfills (e.g. when triggered
+# from ExecutorEnd where jstate is unavailable), but when the backfill
+# occurs via post_parse_analyze, jstate is available and the text should
+# be stored in normalized form.
+$node->safe_psql('postgres', 'SELECT 11111 + 22222 + 33333');
+
+my $norm_after = $node->safe_psql('postgres', q{
+SELECT query FROM pg_stat_statements WHERE query = 'SELECT $1 + $2 + $3'
+});
+is($norm_after, 'SELECT $1 + $2 + $3',
+	'backfilled text is stored in normalized form');
+
+# Also re-run the bulk queries to trigger their backfill
+$node->safe_psql('postgres', $sql);
+
+my $null_after = $node->safe_psql('postgres', q{
+SELECT count(*)
+FROM pg_stat_statements
+WHERE query IS NULL AND queryid IS NOT NULL
+});
+diag("after backfill: $null_after entries without text");
+cmp_ok($null_after, '<', $null_count,
+	"backfill reduced NULL text entries ($null_count -> $null_after)");
+
+$node->stop;
+done_testing();
diff --git a/doc/src/sgml/pgstatstatements.sgml b/doc/src/sgml/pgstatstatements.sgml
index d753de5836e..8470250ff66 100644
--- a/doc/src/sgml/pgstatstatements.sgml
+++ b/doc/src/sgml/pgstatstatements.sgml
@@ -16,12 +16,13 @@
  <para>
   The module must be loaded by adding <literal>pg_stat_statements</literal> to
   <xref linkend="guc-shared-preload-libraries"/> in
-  <filename>postgresql.conf</filename>, because it requires additional shared memory.
-  This means that a server restart is needed to add or remove the module.
-  In addition, query identifier calculation must be enabled in order for the
-  module to be active, which is done automatically if <xref linkend="guc-compute-query-id"/>
-  is set to <literal>auto</literal> or <literal>on</literal>, or any third-party
-  module that calculates query identifiers is loaded.
+  <filename>postgresql.conf</filename>, because it must register hooks and a
+  custom statistics kind at server start. This means that a server restart is
+  needed to add or remove the module. In addition, query identifier calculation
+  must be enabled in order for the module to be active, which is done automatically
+  if <xref linkend="guc-compute-query-id"/> is set to <literal>auto</literal> or
+  <literal>on</literal>, or any third-party module that calculates query identifiers
+  is loaded.
  </para>
 
  <para>
@@ -794,10 +795,12 @@ calls | 2
        <structfield>dealloc</structfield> <type>bigint</type>
       </para>
       <para>
-       Total number of times <structname>pg_stat_statements</structname>
-       entries about the least-executed statements were deallocated
-       because more distinct statements than
-       <varname>pg_stat_statements.max</varname> were observed
+       Total number of <structname>pg_stat_statements</structname>
+       entries evicted because more distinct statements than
+       <varname>pg_stat_statements.max</varname> were observed.
+       A high value relative to <varname>pg_stat_statements.max</varname>
+       indicates significant query churn and that
+       <varname>pg_stat_statements.max</varname> should be increased
       </para></entry>
      </row>
      <row>
@@ -910,12 +913,15 @@ calls | 2
       <varname>pg_stat_statements.max</varname> is the maximum number of
       statements tracked by the module (i.e., the maximum number of rows
       in the <structname>pg_stat_statements</structname> view).  If more distinct
-      statements than that are observed, information about the least-executed
+      statements than that are observed, information about the least-recently-used
       statements is discarded.  The number of times such information was
       discarded can be seen in the
       <structname>pg_stat_statements_info</structname> view.
+      This is a soft limit; the actual number of tracked statements may
+      briefly exceed it until eviction reclaims space.
       The default value is 5000.
-      This parameter can only be set at server start.
+      This parameter can only be set in the <filename>postgresql.conf</filename>
+      file or on the server command line.
      </para>
     </listitem>
    </varlistentry>
@@ -1007,10 +1013,11 @@ calls | 2
   </variablelist>
 
   <para>
-   The module requires additional shared memory proportional to
-   <varname>pg_stat_statements.max</varname>.  Note that this
-   memory is consumed whenever the module is loaded, even if
-   <varname>pg_stat_statements.track</varname> is set to <literal>none</literal>.
+   The module uses dynamic shared memory that grows as statements are
+   tracked, up to the limit set by
+   <varname>pg_stat_statements.max</varname>.  Note that this memory is
+   not reclaimed when entries are deallocated; it is reused for new
+   entries but the overall shared memory footprint does not shrink.
   </para>
 
   <para>
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index c5db6ca6705..d0d9a290399 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -4132,6 +4132,9 @@ pgsa_stash_name_table_hash
 pgsa_writer_context
 pgsocket
 pgsql_thing_t
+PgStat_PgssPending
+PgStatShared_Pgss
+pgssCounters
 pgssEntry
 pgssGlobalStats
 pgssHashKey
-- 
2.50.1 (Apple Git-155)