From 37104ecad152def1263b39eaaa70c58c447e6978 Mon Sep 17 00:00:00 2001
From: Yugo Nagata <nagata@sraoss.co.jp>
Date: Fri, 18 Jul 2025 09:58:56 +0900
Subject: [PATCH v9] Allow creating extended statistics on virtual generated
 columns

This allows users to define extended statistics on virtual generated
columns. Expressions including such columns are stored in the catalog
as-is, and expanded at ANALYZE time. Extended statistics on a single
virtual generated column are also allowed, since it is treated as a
single expression.

To enable the optimizer to make use of these statistics, expressions
are also expanded at planning time.
---
 doc/src/sgml/ref/alter_table.sgml       |  6 +-
 doc/src/sgml/ref/create_statistics.sgml |  3 +-
 src/backend/commands/statscmds.c        | 99 +++++++++++++++----------
 src/backend/optimizer/util/plancat.c    |  5 ++
 src/backend/statistics/extended_stats.c | 15 +++-
 src/test/regress/expected/stats_ext.out | 62 +++++++++++-----
 src/test/regress/sql/stats_ext.sql      | 44 +++++++----
 7 files changed, 156 insertions(+), 78 deletions(-)

diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml
index 8591a6b5014..40cb7227477 100644
--- a/doc/src/sgml/ref/alter_table.sgml
+++ b/doc/src/sgml/ref/alter_table.sgml
@@ -217,7 +217,8 @@ WITH ( MODULUS <replaceable class="parameter">numeric_literal</replaceable>, REM
       so running <link linkend="sql-analyze"><command>ANALYZE</command></link>
       on the table afterwards is recommended.
       For a virtual generated column, <command>ANALYZE</command>
-      is not necessary because such columns never have statistics.
+      is not necessary unless extended statistics are defined on it,
+      since such columns never have per-column statistics of their own.
      </para>
     </listitem>
    </varlistentry>
@@ -289,7 +290,8 @@ WITH ( MODULUS <replaceable class="parameter">numeric_literal</replaceable>, REM
       <link linkend="sql-analyze"><command>ANALYZE</command></link>
       on the table afterwards is recommended.
       For a virtual generated column, <command>ANALYZE</command>
-      is not necessary because such columns never have statistics.
+      is not necessary unless extended statistics are defined on it,
+      since such columns never have per-column statistics of their own.
      </para>
     </listitem>
    </varlistentry>
diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml
index d6b25ed2c9b..c7852d017cd 100644
--- a/doc/src/sgml/ref/create_statistics.sgml
+++ b/doc/src/sgml/ref/create_statistics.sgml
@@ -123,7 +123,8 @@ CREATE STATISTICS [ [ IF NOT EXISTS ] <replaceable class="parameter">statistics_
       The name of a table column to be covered by the computed statistics.
       This is only allowed when building multivariate statistics.  At least
       two column names or expressions must be specified, and their order is
-      not significant.
+      not significant.  Note that a virtual generated column name can be
+      specified on its own, since it is effectively treated as an expression.
      </para>
     </listitem>
    </varlistentry>
diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c
index c1da79f36ba..5a88d3d95bd 100644
--- a/src/backend/commands/statscmds.c
+++ b/src/backend/commands/statscmds.c
@@ -28,6 +28,7 @@
 #include "commands/comment.h"
 #include "commands/defrem.h"
 #include "miscadmin.h"
+#include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
 #include "optimizer/optimizer.h"
 #include "statistics/statistics.h"
@@ -268,22 +269,39 @@ CreateStatistics(CreateStatsStmt *stmt, bool check_rights)
 						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 						 errmsg("statistics creation on system columns is not supported")));
 
-			/* Disallow use of virtual generated columns in extended stats */
-			if (attForm->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL)
-				ereport(ERROR,
-						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-						 errmsg("statistics creation on virtual generated columns is not supported")));
-
-			/* Disallow data types without a less-than operator */
-			type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
-			if (type->lt_opr == InvalidOid)
-				ereport(ERROR,
-						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-						 errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
-								attname, format_type_be(attForm->atttypid))));
+			/*
+			 * Disallow data types without a less-than operator.
+			 *
+			 * We ignore this for statistics on a single virtual generated column,
+			 * in which case we'll build the regular statistics only (and that
+			 * code can deal with such data types).
+			 */
+			if (list_length(stmt->exprs) > 1)
+			{
+				type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
+				if (type->lt_opr == InvalidOid)
+					ereport(ERROR,
+							(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+							 errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
+									attname, format_type_be(attForm->atttypid))));
+			}
 
-			attnums[nattnums] = attForm->attnum;
-			nattnums++;
+			/* Treat virtual generated columns as expressions */
+			if (attForm->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL)
+			{
+				Node *expr = (Node *) makeVar(1,
+											  attForm->attnum,
+											  attForm->atttypid,
+											  attForm->atttypmod,
+											  attForm->attcollation,
+											  0);
+				stxexprs = lappend(stxexprs, expr);
+			}
+			else
+			{
+				attnums[nattnums] = attForm->attnum;
+				nattnums++;
+			}
 			ReleaseSysCache(atttuple);
 		}
 		else if (IsA(selem->expr, Var)) /* column reference in parens */
@@ -297,22 +315,32 @@ CreateStatistics(CreateStatsStmt *stmt, bool check_rights)
 						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 						 errmsg("statistics creation on system columns is not supported")));
 
-			/* Disallow use of virtual generated columns in extended stats */
-			if (get_attgenerated(relid, var->varattno) == ATTRIBUTE_GENERATED_VIRTUAL)
-				ereport(ERROR,
-						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-						 errmsg("statistics creation on virtual generated columns is not supported")));
-
-			/* Disallow data types without a less-than operator */
-			type = lookup_type_cache(var->vartype, TYPECACHE_LT_OPR);
-			if (type->lt_opr == InvalidOid)
-				ereport(ERROR,
-						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-						 errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
-								get_attname(relid, var->varattno, false), format_type_be(var->vartype))));
+			/*
+			 * Disallow data types without a less-than operator.
+			 *
+			 * We ignore this for statistics on a single virtual generated column.
+			 * (See the comments above.)
+			 */
+			if (list_length(stmt->exprs) > 1)
+			{
+				type = lookup_type_cache(var->vartype, TYPECACHE_LT_OPR);
+				if (type->lt_opr == InvalidOid)
+					ereport(ERROR,
+							(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+							 errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
+									get_attname(relid, var->varattno, false), format_type_be(var->vartype))));
+			}
 
-			attnums[nattnums] = var->varattno;
-			nattnums++;
+			/* Treat virtual generated columns as expressions */
+			if (get_attgenerated(relid, var->varattno) == ATTRIBUTE_GENERATED_VIRTUAL)
+			{
+				stxexprs = lappend(stxexprs, (Node *) var);
+			}
+			else
+			{
+				attnums[nattnums] = var->varattno;
+				nattnums++;
+			}
 		}
 		else					/* expression */
 		{
@@ -336,20 +364,13 @@ CreateStatistics(CreateStatsStmt *stmt, bool check_rights)
 					ereport(ERROR,
 							(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 							 errmsg("statistics creation on system columns is not supported")));
-
-				/* Disallow use of virtual generated columns in extended stats */
-				if (get_attgenerated(relid, attnum) == ATTRIBUTE_GENERATED_VIRTUAL)
-					ereport(ERROR,
-							(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-							 errmsg("statistics creation on virtual generated columns is not supported")));
 			}
 
 			/*
 			 * Disallow data types without a less-than operator.
 			 *
-			 * We ignore this for statistics on a single expression, in which
-			 * case we'll build the regular statistics only (and that code can
-			 * deal with such data types).
+			 * We ignore this for statistics on a single expression.
+			 * (See the comments above.)
 			 */
 			if (list_length(stmt->exprs) > 1)
 			{
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index b2fbd6a082b..8b7c9a9f2e4 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -1792,6 +1792,11 @@ get_relation_statistics(PlannerInfo *root, RelOptInfo *rel,
 				exprs = (List *) stringToNode(exprsString);
 				pfree(exprsString);
 
+				/*
+				 * Expand virtual generated columns in the expressions.
+				 */
+				exprs = (List *) expand_generated_columns_in_expr((Node *) exprs, relation, 1);
+
 				/*
 				 * Modify the copies we obtain from the relcache to have the
 				 * correct varno for the parent relation, so that they match
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 334c6498581..9225a168a8a 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -32,6 +32,7 @@
 #include "parser/parsetree.h"
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
+#include "rewrite/rewriteHandler.h"
 #include "statistics/extended_stats_internal.h"
 #include "statistics/statistics.h"
 #include "utils/acl.h"
@@ -73,7 +74,7 @@ typedef struct StatExtEntry
 } StatExtEntry;
 
 
-static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
+static List *fetch_statentries_for_relation(Relation pg_statext, Relation rel);
 static VacAttrStats **lookup_var_attr_stats(Bitmapset *attrs, List *exprs,
 											int nvacatts, VacAttrStats **vacatts);
 static void statext_store(Oid statOid, bool inh,
@@ -125,7 +126,7 @@ BuildRelationExtStatistics(Relation onerel, bool inh, double totalrows,
 
 	/* the list of stats has to be allocated outside the memory context */
 	pg_stext = table_open(StatisticExtRelationId, RowExclusiveLock);
-	statslist = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
+	statslist = fetch_statentries_for_relation(pg_stext, onerel);
 
 	/* memory context for building each statistics object */
 	cxt = AllocSetContextCreate(CurrentMemoryContext,
@@ -279,7 +280,7 @@ ComputeExtStatisticsRows(Relation onerel,
 	oldcxt = MemoryContextSwitchTo(cxt);
 
 	pg_stext = table_open(StatisticExtRelationId, RowExclusiveLock);
-	lstats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
+	lstats = fetch_statentries_for_relation(pg_stext, onerel);
 
 	foreach(lc, lstats)
 	{
@@ -416,12 +417,13 @@ statext_is_kind_built(HeapTuple htup, char type)
  * Return a list (of StatExtEntry) of statistics objects for the given relation.
  */
 static List *
-fetch_statentries_for_relation(Relation pg_statext, Oid relid)
+fetch_statentries_for_relation(Relation pg_statext, Relation rel)
 {
 	SysScanDesc scan;
 	ScanKeyData skey;
 	HeapTuple	htup;
 	List	   *result = NIL;
+	Oid			relid = RelationGetRelid(rel);
 
 	/*
 	 * Prepare to scan pg_statistic_ext for entries having stxrelid = this
@@ -491,6 +493,11 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
 
 			pfree(exprsString);
 
+			/*
+			 * Expand virtual generated columns in the expressions.
+			 */
+			exprs = (List *) expand_generated_columns_in_expr((Node *) exprs, rel, 1);
+
 			/*
 			 * Run the expressions through eval_const_expressions. This is not
 			 * just an optimization, but is necessary, because the planner
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index b6431d1ee95..4c9c91ecd8f 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -25,7 +25,7 @@ begin
 end;
 $$;
 -- Verify failures
-CREATE TABLE ext_stats_test (x text, y int, z int);
+CREATE TABLE ext_stats_test (x text, y int, z int, w xid);
 CREATE STATISTICS tst;
 ERROR:  syntax error at or near ";"
 LINE 1: CREATE STATISTICS tst;
@@ -84,30 +84,19 @@ CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; -- tuple expression
 ERROR:  syntax error at or near ","
 LINE 1: CREATE STATISTICS tst ON (x, y) FROM ext_stats_test;
                                    ^
-DROP TABLE ext_stats_test;
--- statistics on virtual generated column not allowed
-CREATE TABLE ext_stats_test1 (x int, y int, z int GENERATED ALWAYS AS (x+y) VIRTUAL, w xid);
-CREATE STATISTICS tst on z from ext_stats_test1;
-ERROR:  statistics creation on virtual generated columns is not supported
-CREATE STATISTICS tst on (z) from ext_stats_test1;
-ERROR:  statistics creation on virtual generated columns is not supported
-CREATE STATISTICS tst on (z+1) from ext_stats_test1;
-ERROR:  statistics creation on virtual generated columns is not supported
-CREATE STATISTICS tst (ndistinct) ON z from ext_stats_test1;
-ERROR:  statistics creation on virtual generated columns is not supported
 -- statistics on system column not allowed
-CREATE STATISTICS tst on tableoid from ext_stats_test1;
+CREATE STATISTICS tst on tableoid from ext_stats_test;
 ERROR:  statistics creation on system columns is not supported
-CREATE STATISTICS tst on (tableoid) from ext_stats_test1;
+CREATE STATISTICS tst on (tableoid) from ext_stats_test;
 ERROR:  statistics creation on system columns is not supported
-CREATE STATISTICS tst on (tableoid::int+1) from ext_stats_test1;
+CREATE STATISTICS tst on (tableoid::int+1) from ext_stats_test;
 ERROR:  statistics creation on system columns is not supported
-CREATE STATISTICS tst (ndistinct) ON xmin from ext_stats_test1;
+CREATE STATISTICS tst (ndistinct) ON xmin from ext_stats_test;
 ERROR:  statistics creation on system columns is not supported
 -- statistics without a less-than operator not supported
-CREATE STATISTICS tst (ndistinct) ON w from ext_stats_test1;
+CREATE STATISTICS tst (ndistinct) ON x,w from ext_stats_test;
 ERROR:  column "w" cannot be used in statistics because its type xid has no default btree operator class
-DROP TABLE ext_stats_test1;
+DROP TABLE ext_stats_test;
 -- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it
 CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
 CREATE STATISTICS IF NOT EXISTS ab1_a_b_stats ON a, b FROM ab1;
@@ -3153,6 +3142,43 @@ SELECT c0 FROM ONLY expr_stats_incompatible_test WHERE
 (0 rows)
 
 DROP TABLE expr_stats_incompatible_test;
+-- statistics on virtual generated columns
+CREATE TABLE virtual_gen_stats (a int, b int, c int GENERATED ALWAYS AS (2*a), d int GENERATED ALWAYS AS (a+b), w xid GENERATED ALWAYS AS (a::text::xid));
+INSERT INTO virtual_gen_stats SELECT mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE virtual_gen_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM virtual_gen_stats WHERE c = 0 AND (3*b) = 0');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM virtual_gen_stats WHERE d = 0 AND (d-2*a) = 0');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+CREATE STATISTICS virtual_gen_stats_1 (mcv) ON c, (3*b), d, (d-2*a) FROM virtual_gen_stats;
+ANALYZE virtual_gen_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM virtual_gen_stats WHERE c = 0 AND (3*b) = 0');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM virtual_gen_stats WHERE d = 0 AND (d-2*a) = 0');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+-- statistics on a single virtual column are supported
+CREATE STATISTICS virtual_gen_stats_single ON c FROM virtual_gen_stats;
+CREATE STATISTICS virtual_gen_stats_single_without_less_than ON w FROM virtual_gen_stats;
+DROP STATISTICS virtual_gen_stats_1;
+DROP STATISTICS virtual_gen_stats_single;
+DROP STATISTICS virtual_gen_stats_single_without_less_than;
+DROP TABLE virtual_gen_stats;
 -- Permission tests. Users should not be able to see specific data values in
 -- the extended statistics, if they lack permission to see those values in
 -- the underlying table.
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 9dcce3440c8..e093269e32f 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -28,7 +28,7 @@ end;
 $$;
 
 -- Verify failures
-CREATE TABLE ext_stats_test (x text, y int, z int);
+CREATE TABLE ext_stats_test (x text, y int, z int, w xid);
 CREATE STATISTICS tst;
 CREATE STATISTICS tst ON a, b;
 CREATE STATISTICS tst FROM sometab;
@@ -56,21 +56,14 @@ DROP FUNCTION tftest;
 CREATE STATISTICS tst ON (y) FROM ext_stats_test; -- single column reference
 CREATE STATISTICS tst ON y + z FROM ext_stats_test; -- missing parentheses
 CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; -- tuple expression
-DROP TABLE ext_stats_test;
--- statistics on virtual generated column not allowed
-CREATE TABLE ext_stats_test1 (x int, y int, z int GENERATED ALWAYS AS (x+y) VIRTUAL, w xid);
-CREATE STATISTICS tst on z from ext_stats_test1;
-CREATE STATISTICS tst on (z) from ext_stats_test1;
-CREATE STATISTICS tst on (z+1) from ext_stats_test1;
-CREATE STATISTICS tst (ndistinct) ON z from ext_stats_test1;
 -- statistics on system column not allowed
-CREATE STATISTICS tst on tableoid from ext_stats_test1;
-CREATE STATISTICS tst on (tableoid) from ext_stats_test1;
-CREATE STATISTICS tst on (tableoid::int+1) from ext_stats_test1;
-CREATE STATISTICS tst (ndistinct) ON xmin from ext_stats_test1;
+CREATE STATISTICS tst on tableoid from ext_stats_test;
+CREATE STATISTICS tst on (tableoid) from ext_stats_test;
+CREATE STATISTICS tst on (tableoid::int+1) from ext_stats_test;
+CREATE STATISTICS tst (ndistinct) ON xmin from ext_stats_test;
 -- statistics without a less-than operator not supported
-CREATE STATISTICS tst (ndistinct) ON w from ext_stats_test1;
-DROP TABLE ext_stats_test1;
+CREATE STATISTICS tst (ndistinct) ON x,w from ext_stats_test;
+DROP TABLE ext_stats_test;
 
 -- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it
 CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
@@ -1584,6 +1577,29 @@ SELECT c0 FROM ONLY expr_stats_incompatible_test WHERE
 
 DROP TABLE expr_stats_incompatible_test;
 
+-- statistics on virtual generated columns
+CREATE TABLE virtual_gen_stats (a int, b int, c int GENERATED ALWAYS AS (2*a), d int GENERATED ALWAYS AS (a+b), w xid GENERATED ALWAYS AS (a::text::xid));
+INSERT INTO virtual_gen_stats SELECT mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE virtual_gen_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM virtual_gen_stats WHERE c = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM virtual_gen_stats WHERE d = 0 AND (d-2*a) = 0');
+
+CREATE STATISTICS virtual_gen_stats_1 (mcv) ON c, (3*b), d, (d-2*a) FROM virtual_gen_stats;
+ANALYZE virtual_gen_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM virtual_gen_stats WHERE c = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM virtual_gen_stats WHERE d = 0 AND (d-2*a) = 0');
+
+-- statistics on a single virtual column are supported
+CREATE STATISTICS virtual_gen_stats_single ON c FROM virtual_gen_stats;
+CREATE STATISTICS virtual_gen_stats_single_without_less_than ON w FROM virtual_gen_stats;
+
+DROP STATISTICS virtual_gen_stats_1;
+DROP STATISTICS virtual_gen_stats_single;
+DROP STATISTICS virtual_gen_stats_single_without_less_than;
+DROP TABLE virtual_gen_stats;
+
 -- Permission tests. Users should not be able to see specific data values in
 -- the extended statistics, if they lack permission to see those values in
 -- the underlying table.
-- 
2.43.0

