diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 2c2da2a..b5c4129 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -296,6 +296,11 @@ + pg_statistic_ext + extended planner statistics + + + pg_subscription logical replication subscriptions @@ -4223,6 +4228,98 @@ + + <structname>pg_statistic_ext</structname> + + + pg_statistic_ext + + + + The catalog pg_statistic_ext + holds extended planner statistics. + + + + <structname>pg_statistic_ext</> Columns + + + + + Name + Type + References + Description + + + + + + + starelid + oid + pg_class.oid + The table that the described columns belong to + + + + staname + name + + Name of the statistic. + + + + stanamespace + oid + pg_namespace.oid + + The OID of the namespace that contains this statistic + + + + + staowner + oid + pg_authid.oid + Owner of the statistic + + + + staenabled + char[] + + + An array with the modes of the enabled statistic types, encoded as + d for ndistinct coefficients. + + + + + stakeys + int2vector + pg_attribute.attnum + + This is an array of values that indicate which table columns this + statistic covers. For example a value of 1 3 would + mean that the first and the third table columns make up the statistic key. + + + + + standistinct + pg_ndistinct + + + Ndistinct coefficients, serialized as pg_ndistinct type. + + + + + +
+
+ <structname>pg_namespace</structname> diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml index b73c66b..76955e5 100644 --- a/doc/src/sgml/planstats.sgml +++ b/doc/src/sgml/planstats.sgml @@ -448,4 +448,145 @@ rows = (outer_cardinality * inner_cardinality) * selectivity + + Extended Statistics + + + extended statistics + planner + + + + The examples presented in used + statistics about individual columns to compute selectivity estimates. + When estimating conditions on multiple columns, the planner assumes + independence of the conditions and multiplies the selectivities. When the + columns are correlated, the independence assumption is violated, and the + estimates may be off by several orders of magnitude, resulting in poor + plan choices. + + + + The examples presented below demonstrate such estimation errors on simple + data sets, and also how to resolve them by creating extended statistics + using the CREATE STATISTICS command. + + + + Let's start with a very simple data set - a table with two columns, + containing exactly the same values: + + +CREATE TABLE t (a INT, b INT); +INSERT INTO t SELECT i % 100, i % 100 FROM generate_series(1, 10000) s(i); +ANALYZE t; + + + As explained in , the planner can determine + cardinality of t using the number of pages and + rows that is looked up in pg_class: + + +SELECT relpages, reltuples FROM pg_class WHERE relname = 't'; + + relpages | reltuples +----------+----------- + 45 | 10000 + + + The data distribution is very simple - there are only 100 distinct values + in each column, uniformly distributed. 
+ + + + The following example shows the result of estimating a WHERE + condition on the a column: + +EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Seq Scan on t (cost=0.00..170.00 rows=100 width=8) (actual time=0.031..2.870 rows=100 loops=1) + Filter: (a = 1) + Rows Removed by Filter: 9900 + Planning time: 0.092 ms + Execution time: 3.103 ms +(5 rows) + + + The planner examines the condition and computes the estimate using + eqsel, the selectivity function for =, and + statistics stored in the pg_stats table. In this case + the planner estimates the condition matches 1% of rows, and by comparing + the estimated and actual number of rows, we see that the estimate is + very accurate (in fact exact, as the table is very small). + + + + Adding a condition on the second column results in the following plan: + +EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Seq Scan on t (cost=0.00..195.00 rows=1 width=8) (actual time=0.033..3.006 rows=100 loops=1) + Filter: ((a = 1) AND (b = 1)) + Rows Removed by Filter: 9900 + Planning time: 0.121 ms + Execution time: 3.220 ms +(5 rows) + + + The planner estimates the selectivity for each condition individually, + arriving at the 1% estimates as above, and then multiplies them, getting + the final 0.01% estimate. The plan however shows that this results in + a significant underestimate, as the actual number of rows matching the + conditions is two orders of magnitude higher than estimated. + + + + Overestimates, i.e. errors in the opposite direction, are also possible. 
+ Consider for example the following combination of range conditions, each + matching about 50% of the table individually: + +EXPLAIN ANALYZE SELECT * FROM t WHERE a <= 49 AND b > 49; + QUERY PLAN +------------------------------------------------------------------------------------------------ + Seq Scan on t (cost=0.00..195.00 rows=2500 width=8) (actual time=1.607..1.607 rows=0 loops=1) + Filter: ((a <= 49) AND (b > 49)) + Rows Removed by Filter: 10000 + Planning time: 0.050 ms + Execution time: 1.623 ms +(5 rows) + + + The planner examines both WHERE clauses and estimates them + using the scalarltsel and scalargtsel functions, + specified as the selectivity functions matching the <= and + > operators. Both conditions match 50% of the + table, and assuming independence the planner multiplies them to compute + the total estimate of 25%. However, as the explain output shows, the actual + number of rows is 0, because the columns are correlated and the conditions + contradict each other. + + + + Both estimation errors are caused by violation of the independence + assumption, as the two columns contain exactly the same values, and are + therefore perfectly correlated. Providing additional information about + correlation between columns is the purpose of extended statistics, + and the rest of this section explains in more detail how the planner + leverages them to improve estimates. + + + + For additional details about extended statistics, see + src/backend/statistics/README. There are additional + READMEs for each type of statistics, mentioned in the following + sections. + + + + diff --git a/doc/src/sgml/ref/allfiles.sgml b/doc/src/sgml/ref/allfiles.sgml index 2bc4d9f..255e800 100644 --- a/doc/src/sgml/ref/allfiles.sgml +++ b/doc/src/sgml/ref/allfiles.sgml @@ -34,6 +34,7 @@ Complete list of usable sgml source files in this directory. + @@ -80,6 +81,7 @@ Complete list of usable sgml source files in this directory. + @@ -126,6 +128,7 @@ Complete list of usable sgml source files in this directory. 
+ diff --git a/doc/src/sgml/ref/alter_statistics.sgml b/doc/src/sgml/ref/alter_statistics.sgml new file mode 100644 index 0000000..35cbc09 --- /dev/null +++ b/doc/src/sgml/ref/alter_statistics.sgml @@ -0,0 +1,115 @@ + + + + + ALTER STATISTICS + + + + ALTER STATISTICS + 7 + SQL - Language Statements + + + + ALTER STATISTICS + + change the definition of an extended statistics + + + + + +ALTER STATISTICS name OWNER TO { new_owner | CURRENT_USER | SESSION_USER } +ALTER STATISTICS name RENAME TO new_name +ALTER STATISTICS name SET SCHEMA new_schema + + + + + Description + + + ALTER STATISTICS changes the parameters of an existing + extended statistics. Any parameters not specifically set in the + ALTER STATISTICS command retain their prior settings. + + + + You must own the statistics to use ALTER STATISTICS. + To change a statistics' schema, you must also have CREATE + privilege on the new schema. + To alter the owner, you must also be a direct or indirect member of the new + owning role, and that role must have CREATE privilege on + the statistics' schema. (These restrictions enforce that altering the owner + doesn't do anything you couldn't do by dropping and recreating the statistics. + However, a superuser can alter ownership of any statistics anyway.) + + + + + Parameters + + + + + name + + + The name (optionally schema-qualified) of a statistics to be altered. + + + + + + new_owner + + + The user name of the new owner of the statistics. + + + + + + new_name + + + The new name for the statistics. + + + + + + new_schema + + + The new schema for the statistics. + + + + + + + + + + Compatibility + + + There's no ALTER STATISTICS command in the SQL standard. 
+ + + + See Also + + + + + + + + diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml index 077c003..f3ad5ed 100644 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml @@ -119,9 +119,12 @@ ALTER TABLE [ IF EXISTS ] name This form drops a column from a table. Indexes and table constraints involving the column will be automatically - dropped as well. You will need to say CASCADE if - anything outside the table depends on the column, for example, - foreign key references or views. + dropped as well. + Multivariate statistics referencing the dropped column will also be + removed if the removal of the column would cause the statistics to + contain data for only a single column. + You will need to say CASCADE if anything outside the table + depends on the column, for example, foreign key references or views. If IF EXISTS is specified and the column does not exist, no error is thrown. In this case a notice is issued instead. diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml new file mode 100644 index 0000000..5919a25 --- /dev/null +++ b/doc/src/sgml/ref/create_statistics.sgml @@ -0,0 +1,152 @@ + + + + + CREATE STATISTICS + + + + CREATE STATISTICS + 7 + SQL - Language Statements + + + + CREATE STATISTICS + define extended statistics + + + + +CREATE STATISTICS [ IF NOT EXISTS ] statistics_name ON ( + column_name, column_name [, ...]) + FROM table_name + + + + + + Description + + + CREATE STATISTICS will create a new extended statistics + on the table. The statistics will be created in the current database and + will be owned by the user issuing the command. + + + + If a schema name is given (for example, CREATE STATISTICS + myschema.mystat ...) then the statistics is created in the specified + schema. Otherwise it is created in the current schema. The name of + the statistics must be distinct from the name of any other statistics in the + same schema. 
+ + + + To be able to create statistics, you must have USAGE + privilege on the types of all the included columns. + + + + + Parameters + + + + + IF NOT EXISTS + + + Do not throw an error if a statistics with the same name already exists. + A notice is issued in this case. Note that there is no guarantee that + the existing statistics is anything like the one that would have been + created. + + + + + + statistics_name + + + The name (optionally schema-qualified) of the statistics to be created. + + + + + + table_name + + + The name (optionally schema-qualified) of the table the statistics should + be created on. + + + + + + column_name + + + The name of a column to be included in the statistics. + + + + + + + + + + Examples + + + Create table t1 with two functionally dependent columns, i.e. + knowledge of a value in the first column is sufficient for determining the + value in the other column. Then functional dependencies are built on those + columns: + +CREATE TABLE t1 ( + a int, + b int +); + +INSERT INTO t1 SELECT i/100, i/500 + FROM generate_series(1,1000000) s(i); + +CREATE STATISTICS s1 ON (a, b) FROM t1; + +ANALYZE t1; + +-- valid combination of values +EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0); + +-- invalid combination of values +EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 1); + + + + + + + Compatibility + + + There's no CREATE STATISTICS command in the SQL standard. + + + + + See Also + + + + + + + diff --git a/doc/src/sgml/ref/drop_statistics.sgml b/doc/src/sgml/ref/drop_statistics.sgml new file mode 100644 index 0000000..d7c657f --- /dev/null +++ b/doc/src/sgml/ref/drop_statistics.sgml @@ -0,0 +1,91 @@ + + + + + DROP STATISTICS + + + + DROP STATISTICS + 7 + SQL - Language Statements + + + + DROP STATISTICS + remove extended statistics + + + + +DROP STATISTICS [ IF EXISTS ] name [, ...] + + + + + Description + + + DROP STATISTICS removes statistics from the database. 
+ Only the statistics owner, the schema owner, and superuser can drop a + statistics. + + + + + + Parameters + + + + IF EXISTS + + + Do not throw an error if the statistics do not exist. A notice is + issued in this case. + + + + + + name + + + The name (optionally schema-qualified) of the statistics to drop. + + + + + + + + + Examples + + + ... + + + + + + Compatibility + + + There's no DROP STATISTICS command in the SQL standard. + + + + + See Also + + + + + + + + diff --git a/doc/src/sgml/reference.sgml b/doc/src/sgml/reference.sgml index c8191de..aa8a157 100644 --- a/doc/src/sgml/reference.sgml +++ b/doc/src/sgml/reference.sgml @@ -60,6 +60,7 @@ &alterSchema; &alterSequence; &alterServer; + &alterStatistics; &alterSubscription; &alterSystem; &alterTable; @@ -108,6 +109,7 @@ &createSchema; &createSequence; &createServer; + &createStatistics; &createSubscription; &createTable; &createTableAs; @@ -154,6 +156,7 @@ &dropSchema; &dropSequence; &dropServer; + &dropStatistics; &dropSubscription; &dropTable; &dropTableSpace; diff --git a/src/backend/Makefile b/src/backend/Makefile index 7a0bbb2..426ef4f 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -19,7 +19,7 @@ include $(top_builddir)/src/Makefile.global SUBDIRS = access bootstrap catalog parser commands executor foreign lib libpq \ main nodes optimizer port postmaster regex replication rewrite \ - storage tcop tsearch utils $(top_builddir)/src/timezone + statistics storage tcop tsearch utils $(top_builddir)/src/timezone include $(srcdir)/common.mk diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 3136858..ff7cc79 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -33,6 +33,7 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\ pg_attrdef.h pg_constraint.h pg_inherits.h pg_index.h pg_operator.h \ pg_opfamily.h pg_opclass.h pg_am.h pg_amop.h pg_amproc.h \ pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h 
\ + pg_statistic_ext.h \ pg_statistic.h pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \ pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \ pg_database.h pg_db_role_setting.h pg_tablespace.h pg_pltemplate.h \ diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c index be86d76..1d71c7c 100644 --- a/src/backend/catalog/aclchk.c +++ b/src/backend/catalog/aclchk.c @@ -48,6 +48,7 @@ #include "catalog/pg_operator.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_proc.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" @@ -5104,6 +5105,32 @@ pg_subscription_ownercheck(Oid sub_oid, Oid roleid) } /* + * Ownership check for a extended statistics (specified by OID). + */ +bool +pg_statistics_ownercheck(Oid stat_oid, Oid roleid) +{ + HeapTuple tuple; + Oid ownerId; + + /* Superusers bypass all permission checking. */ + if (superuser_arg(roleid)) + return true; + + tuple = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stat_oid)); + if (!HeapTupleIsValid(tuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("statistics with OID %u do not exist", stat_oid))); + + ownerId = ((Form_pg_statistic_ext) GETSTRUCT(tuple))->staowner; + + ReleaseSysCache(tuple); + + return has_privs_of_role(roleid, ownerId); +} + +/* * Check whether specified role has CREATEROLE privilege (or is a superuser) * * Note: roles do not have owners per se; instead we use this test in diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index fc088b2..ee27cae 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -51,6 +51,7 @@ #include "catalog/pg_publication.h" #include "catalog/pg_publication_rel.h" #include "catalog/pg_rewrite.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_transform.h" @@ -154,6 +155,7 
@@ static const Oid object_classes[] = { RewriteRelationId, /* OCLASS_REWRITE */ TriggerRelationId, /* OCLASS_TRIGGER */ NamespaceRelationId, /* OCLASS_SCHEMA */ + StatisticExtRelationId, /* OCLASS_STATISTIC_EXT */ TSParserRelationId, /* OCLASS_TSPARSER */ TSDictionaryRelationId, /* OCLASS_TSDICT */ TSTemplateRelationId, /* OCLASS_TSTEMPLATE */ @@ -1263,6 +1265,10 @@ doDeletion(const ObjectAddress *object, int flags) DropTransformById(object->objectId); break; + case OCLASS_STATISTIC_EXT: + RemoveStatisticsById(object->objectId); + break; + default: elog(ERROR, "unrecognized object class: %u", object->classId); @@ -2377,6 +2383,9 @@ getObjectClass(const ObjectAddress *object) case NamespaceRelationId: return OCLASS_SCHEMA; + case StatisticExtRelationId: + return OCLASS_STATISTIC_EXT; + case TSParserRelationId: return OCLASS_TSPARSER; diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 41c0056..c944b57 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -52,6 +52,7 @@ #include "catalog/pg_opclass.h" #include "catalog/pg_partitioned_table.h" #include "catalog/pg_statistic.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" #include "catalog/pg_type_fn.h" @@ -1608,7 +1609,10 @@ RemoveAttributeById(Oid relid, AttrNumber attnum) heap_close(attr_rel, RowExclusiveLock); if (attnum > 0) + { RemoveStatistics(relid, attnum); + RemoveStatisticsExt(relid, attnum); + } relation_close(rel, NoLock); } @@ -1856,6 +1860,11 @@ heap_drop_with_catalog(Oid relid) RemoveStatistics(relid, 0); /* + * delete multi-variate statistics + */ + RemoveStatisticsExt(relid, 0); + + /* * delete attribute tuples */ DeleteAttributeTuples(relid); @@ -2766,6 +2775,98 @@ RemoveStatistics(Oid relid, AttrNumber attnum) /* + * RemoveStatisticsExt --- remove entries in pg_statistic_ext for a rel + * + * If attnum is zero, remove all entries for rel; else remove only the one(s) + * for that column. 
+ */ +void +RemoveStatisticsExt(Oid relid, AttrNumber attnum) +{ + Relation pgstatisticext; + TupleDesc tupdesc = NULL; + SysScanDesc scan; + ScanKeyData key; + HeapTuple tuple; + + /* + * When dropping a column, we'll drop statistics with a single remaining + * (undropped column). To do that, we need the tuple descriptor. + * + * We already have the relation locked (as we're running ALTER TABLE ... + * DROP COLUMN), so we'll just get the descriptor here. + */ + if (attnum != 0) + { + Relation rel = relation_open(relid, NoLock); + + /* extended stats are supported on tables and matviews */ + if (rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW) + tupdesc = RelationGetDescr(rel); + + relation_close(rel, NoLock); + } + + if (tupdesc == NULL) + return; + + pgstatisticext = heap_open(StatisticExtRelationId, RowExclusiveLock); + + ScanKeyInit(&key, + Anum_pg_statistic_ext_starelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + + scan = systable_beginscan(pgstatisticext, + StatisticExtRelidIndexId, + true, NULL, 1, &key); + + /* we must loop even when attnum != 0, in case of inherited stats */ + while (HeapTupleIsValid(tuple = systable_getnext(scan))) + { + bool delete = true; + + if (attnum != 0) + { + Datum adatum; + bool isnull; + int i; + int ncolumns = 0; + ArrayType *arr; + int16 *attnums; + + /* get the columns */ + adatum = SysCacheGetAttr(STATEXTOID, tuple, + Anum_pg_statistic_ext_stakeys, &isnull); + Assert(!isnull); + + arr = DatumGetArrayTypeP(adatum); + attnums = (int16 *) ARR_DATA_PTR(arr); + + for (i = 0; i < ARR_DIMS(arr)[0]; i++) + { + /* count the column unless it's has been / is being dropped */ + if ((!tupdesc->attrs[attnums[i] - 1]->attisdropped) && + (attnums[i] != attnum)) + ncolumns += 1; + } + + /* delete if there are less than two attributes */ + delete = (ncolumns < 2); + } + + if (delete) + simple_heap_delete(pgstatisticext, &tuple->t_self); + } + + systable_endscan(scan); + + 
heap_close(pgstatisticext, RowExclusiveLock); +} + + +/* * RelationTruncateIndexes - truncate all indexes associated * with the heap relation to zero tuples. * diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index a38da30..e521bd9 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -2086,6 +2086,62 @@ ConversionIsVisible(Oid conid) } /* + * get_statistics_oid - find a statistics by possibly qualified name + * + * If not found, returns InvalidOid if missing_ok, else throws error + */ +Oid +get_statistics_oid(List *names, bool missing_ok) +{ + char *schemaname; + char *stats_name; + Oid namespaceId; + Oid stats_oid = InvalidOid; + ListCell *l; + + /* deconstruct the name list */ + DeconstructQualifiedName(names, &schemaname, &stats_name); + + if (schemaname) + { + /* use exact schema given */ + namespaceId = LookupExplicitNamespace(schemaname, missing_ok); + if (missing_ok && !OidIsValid(namespaceId)) + stats_oid = InvalidOid; + else + stats_oid = GetSysCacheOid2(STATEXTNAMENSP, + PointerGetDatum(stats_name), + ObjectIdGetDatum(namespaceId)); + } + else + { + /* search for it in search path */ + recomputeNamespacePath(); + + foreach(l, activeSearchPath) + { + namespaceId = lfirst_oid(l); + + if (namespaceId == myTempNamespace) + continue; /* do not look in temp namespace */ + stats_oid = GetSysCacheOid2(STATEXTNAMENSP, + PointerGetDatum(stats_name), + ObjectIdGetDatum(namespaceId)); + if (OidIsValid(stats_oid)) + break; + } + } + + if (!OidIsValid(stats_oid) && !missing_ok) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("statistics \"%s\" do not exist", + NameListToString(names)))); + + return stats_oid; +} + +/* * get_ts_parser_oid - find a TS parser by possibly qualified name * * If not found, returns InvalidOid if missing_ok, else throws error diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index 3a7f049..a346215 100644 --- 
a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -48,6 +48,7 @@ #include "catalog/pg_publication.h" #include "catalog/pg_publication_rel.h" #include "catalog/pg_rewrite.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_transform.h" @@ -478,6 +479,18 @@ static const ObjectPropertyType ObjectProperty[] = InvalidAttrNumber, ACL_KIND_SUBSCRIPTION, true + }, + { + StatisticExtRelationId, + StatisticExtOidIndexId, + STATEXTOID, + STATEXTNAMENSP, + Anum_pg_statistic_ext_staname, + Anum_pg_statistic_ext_stanamespace, + Anum_pg_statistic_ext_staowner, + InvalidAttrNumber, /* no ACL (same as relation) */ + -1, /* no ACL */ + true } }; @@ -696,6 +709,10 @@ static const struct object_type_map /* OCLASS_TRANSFORM */ { "transform", OBJECT_TRANSFORM + }, + /* OBJECT_STATISTICS */ + { + "statistics", OBJECT_STATISTICS } }; @@ -974,6 +991,12 @@ get_object_address(ObjectType objtype, Node *object, address = get_object_address_defacl(castNode(List, object), missing_ok); break; + case OBJECT_STATISTICS: + address.classId = StatisticExtRelationId; + address.objectId = get_statistics_oid(castNode(List, object), + missing_ok); + address.objectSubId = 0; + break; default: elog(ERROR, "unrecognized objtype: %d", (int) objtype); /* placate compiler, in case it thinks elog might return */ @@ -2079,6 +2102,7 @@ pg_get_object_address(PG_FUNCTION_ARGS) case OBJECT_ATTRIBUTE: case OBJECT_COLLATION: case OBJECT_CONVERSION: + case OBJECT_STATISTICS: case OBJECT_TSPARSER: case OBJECT_TSDICTIONARY: case OBJECT_TSTEMPLATE: @@ -2366,6 +2390,10 @@ check_object_ownership(Oid roleid, ObjectType objtype, ObjectAddress address, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser"))); break; + case OBJECT_STATISTICS: + if (!pg_statistics_ownercheck(address.objectId, roleid)) + aclcheck_error_type(ACLCHECK_NOT_OWNER, address.objectId); + break; default: elog(ERROR, 
"unrecognized object type: %d", (int) objtype); @@ -3853,6 +3881,10 @@ getObjectTypeDescription(const ObjectAddress *object) appendStringInfoString(&buffer, "subscription"); break; + case OCLASS_STATISTIC_EXT: + appendStringInfoString(&buffer, "extended statistics"); + break; + default: appendStringInfo(&buffer, "unrecognized %u", object->classId); break; @@ -4876,6 +4908,29 @@ getObjectIdentityParts(const ObjectAddress *object, break; } + case OCLASS_STATISTIC_EXT: + { + HeapTuple tup; + Form_pg_statistic_ext formStatistic; + char *schema; + + tup = SearchSysCache1(STATEXTOID, + ObjectIdGetDatum(object->objectId)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for statistics %u", + object->objectId); + formStatistic = (Form_pg_statistic_ext) GETSTRUCT(tup); + schema = get_namespace_name_or_temp(formStatistic->stanamespace); + appendStringInfoString(&buffer, + quote_qualified_identifier(schema, + NameStr(formStatistic->staname))); + if (objname) + *objname = list_make2(schema, + pstrdup(NameStr(formStatistic->staname))); + ReleaseSysCache(tup); + } + break; + default: appendStringInfo(&buffer, "unrecognized object %u %u %d", object->classId, diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 0bce209..f3b3578 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -186,6 +186,16 @@ CREATE OR REPLACE VIEW pg_sequences AS WHERE NOT pg_is_other_temp_schema(N.oid) AND relkind = 'S'; +CREATE VIEW pg_stats_ext AS + SELECT + N.nspname AS schemaname, + C.relname AS tablename, + S.staname AS staname, + S.stakeys AS attnums, + length(s.standistinct) AS ndistbytes + FROM (pg_statistic_ext S JOIN pg_class C ON (C.oid = S.starelid)) + LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace); + CREATE VIEW pg_stats WITH (security_barrier) AS SELECT nspname AS schemaname, diff --git a/src/backend/commands/Makefile b/src/backend/commands/Makefile index e0fab38..4a6c99e 100644 --- 
a/src/backend/commands/Makefile +++ b/src/backend/commands/Makefile @@ -18,8 +18,8 @@ OBJS = amcmds.o aggregatecmds.o alter.o analyze.o async.o cluster.o comment.o \ event_trigger.o explain.o extension.o foreigncmds.o functioncmds.o \ indexcmds.o lockcmds.o matview.o operatorcmds.o opclasscmds.o \ policy.o portalcmds.o prepare.o proclang.o publicationcmds.o \ - schemacmds.o seclabel.o sequence.o subscriptioncmds.o tablecmds.o \ - tablespace.o trigger.o tsearchcmds.o typecmds.o user.o vacuum.o \ - vacuumlazy.o variable.o view.o + schemacmds.o seclabel.o sequence.o statscmds.o subscriptioncmds.o \ + tablecmds.o tablespace.o trigger.o tsearchcmds.o typecmds.o user.o \ + vacuum.o vacuumlazy.o variable.o view.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/commands/alter.c b/src/backend/commands/alter.c index cf1391c..bf1aba1 100644 --- a/src/backend/commands/alter.c +++ b/src/backend/commands/alter.c @@ -373,6 +373,7 @@ ExecRenameStmt(RenameStmt *stmt) case OBJECT_OPCLASS: case OBJECT_OPFAMILY: case OBJECT_LANGUAGE: + case OBJECT_STATISTICS: case OBJECT_TSCONFIGURATION: case OBJECT_TSDICTIONARY: case OBJECT_TSPARSER: @@ -489,6 +490,7 @@ ExecAlterObjectSchemaStmt(AlterObjectSchemaStmt *stmt, case OBJECT_OPERATOR: case OBJECT_OPCLASS: case OBJECT_OPFAMILY: + case OBJECT_STATISTICS: case OBJECT_TSCONFIGURATION: case OBJECT_TSDICTIONARY: case OBJECT_TSPARSER: @@ -803,6 +805,7 @@ ExecAlterOwnerStmt(AlterOwnerStmt *stmt) case OBJECT_OPERATOR: case OBJECT_OPCLASS: case OBJECT_OPFAMILY: + case OBJECT_STATISTICS: case OBJECT_TABLESPACE: case OBJECT_TSDICTIONARY: case OBJECT_TSCONFIGURATION: diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index b91df98..39d9bdb 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -17,6 +17,7 @@ #include #include "access/multixact.h" +#include "access/sysattr.h" #include "access/transam.h" #include "access/tupconvert.h" #include "access/tuptoaster.h" @@ -28,6 +29,7 
@@ #include "catalog/pg_collation.h" #include "catalog/pg_inherits_fn.h" #include "catalog/pg_namespace.h" +#include "catalog/pg_statistic_ext.h" #include "commands/dbcommands.h" #include "commands/tablecmds.h" #include "commands/vacuum.h" @@ -39,13 +41,17 @@ #include "parser/parse_relation.h" #include "pgstat.h" #include "postmaster/autovacuum.h" +#include "statistics/common.h" +#include "statistics/stats.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" #include "storage/proc.h" #include "storage/procarray.h" #include "utils/acl.h" #include "utils/attoptcache.h" +#include "utils/builtins.h" #include "utils/datum.h" +#include "utils/fmgroids.h" #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -566,6 +572,9 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, update_attstats(RelationGetRelid(Irel[ind]), false, thisdata->attr_cnt, thisdata->vacattrstats); } + + /* Build extended statistics (if there are any). */ + build_ext_stats(onerel, totalrows, numrows, rows, attr_cnt, vacattrstats); } /* @@ -1683,19 +1692,6 @@ ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull) */ typedef struct { - Oid eqopr; /* '=' operator for datatype, if any */ - Oid eqfunc; /* and associated function */ - Oid ltopr; /* '<' operator for datatype, if any */ -} StdAnalyzeData; - -typedef struct -{ - Datum value; /* a data value */ - int tupno; /* position index for tuple it came from */ -} ScalarItem; - -typedef struct -{ int count; /* # of duplicates */ int first; /* values[] index of first occurrence */ } ScalarMCVItem; diff --git a/src/backend/commands/dropcmds.c b/src/backend/commands/dropcmds.c index ab73fbf..e7ae4a5 100644 --- a/src/backend/commands/dropcmds.c +++ b/src/backend/commands/dropcmds.c @@ -286,6 +286,13 @@ does_not_exist_skipping(ObjectType objtype, Node *object) msg = gettext_noop("schema \"%s\" does not exist, skipping"); name = strVal((Value *) object); break; + case OBJECT_STATISTICS: + if 
(!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) + { + msg = gettext_noop("statistics \"%s\" do not exist, skipping"); + name = NameListToString(castNode(List, object)); + } + break; case OBJECT_TSPARSER: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { diff --git a/src/backend/commands/event_trigger.c b/src/backend/commands/event_trigger.c index 346b347..b84a10f 100644 --- a/src/backend/commands/event_trigger.c +++ b/src/backend/commands/event_trigger.c @@ -112,6 +112,7 @@ static event_trigger_support_data event_trigger_support[] = { {"SCHEMA", true}, {"SEQUENCE", true}, {"SERVER", true}, + {"STATISTICS", true}, {"SUBSCRIPTION", true}, {"TABLE", true}, {"TABLESPACE", false}, @@ -1108,6 +1109,7 @@ EventTriggerSupportsObjectType(ObjectType obtype) case OBJECT_SCHEMA: case OBJECT_SEQUENCE: case OBJECT_SUBSCRIPTION: + case OBJECT_STATISTICS: case OBJECT_TABCONSTRAINT: case OBJECT_TABLE: case OBJECT_TRANSFORM: @@ -1173,6 +1175,7 @@ EventTriggerSupportsObjectClass(ObjectClass objclass) case OCLASS_PUBLICATION: case OCLASS_PUBLICATION_REL: case OCLASS_SUBSCRIPTION: + case OCLASS_STATISTIC_EXT: return true; } diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c new file mode 100644 index 0000000..77d7a36 --- /dev/null +++ b/src/backend/commands/statscmds.c @@ -0,0 +1,270 @@ +/*------------------------------------------------------------------------- + * + * statscmds.c + * Commands for creating and altering extended statistics + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/commands/statscmds.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "catalog/dependency.h" +#include "catalog/indexing.h" +#include "catalog/namespace.h" +#include 
"catalog/pg_namespace.h" +#include "catalog/pg_statistic_ext.h" +#include "commands/defrem.h" +#include "miscadmin.h" +#include "statistics/stats.h" +#include "utils/builtins.h" +#include "utils/inval.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/syscache.h" + + +/* used for sorting the attnums in ExecCreateStatistics */ +static int +compare_int16(const void *a, const void *b) +{ + return memcmp(a, b, sizeof(int16)); +} + +/* + * Implements the CREATE STATISTICS name ON (columns) FROM table + * + * We do require that the types support sorting (ltopr), although some + * statistics might work with equality only. + */ +ObjectAddress +CreateStatistics(CreateStatsStmt *stmt) +{ + int i; + ListCell *l; + int16 attnums[STATS_MAX_DIMENSIONS]; + int numcols = 0; + ObjectAddress address = InvalidObjectAddress; + char *namestr; + NameData staname; + Oid statoid; + Oid namespaceId; + + HeapTuple htup; + Datum values[Natts_pg_statistic_ext]; + bool nulls[Natts_pg_statistic_ext]; + int2vector *stakeys; + Relation statrel; + Relation rel; + Oid relid; + ObjectAddress parentobject, + childobject; + + /* construction of the array of enabled statistic types */ + Datum types[1]; /* only ndistinct defined now */ + int ntypes; + ArrayType *staenabled; + + Assert(IsA(stmt, CreateStatsStmt)); + + /* resolve the pieces of the name (namespace etc.) */ + namespaceId = QualifiedNameGetCreationNamespace(stmt->defnames, &namestr); + namestrcpy(&staname, namestr); + + /* + * If if_not_exists was given and the statistics already exist, bail out. 
+ */ + if (SearchSysCacheExists2(STATEXTNAMENSP, + PointerGetDatum(&staname), + ObjectIdGetDatum(namespaceId))) + { + if (stmt->if_not_exists) + { + ereport(NOTICE, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("statistics \"%s\" already exist, skipping", + namestr))); + return InvalidObjectAddress; + } + + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("statistics \"%s\" already exist", namestr))); + } + + rel = heap_openrv(stmt->relation, AccessExclusiveLock); + relid = RelationGetRelid(rel); + + /* ndistinct coefficients is the only known type of extended statistics */ + ntypes = 1; + types[0] = CharGetDatum(STATS_EXT_NDISTINCT); + + /* + * Transform column names to array of attnums. While doing that, we also + * enforce the maximum number of keys. + */ + foreach(l, stmt->keys) + { + char *attname = strVal(lfirst(l)); + HeapTuple atttuple; + + atttuple = SearchSysCacheAttName(relid, attname); + + if (!HeapTupleIsValid(atttuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" referenced in statistics does not exist", + attname))); + + /* more than STATS_MAX_DIMENSIONS columns not allowed */ + if (numcols >= STATS_MAX_DIMENSIONS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("cannot have more than %d keys in statistics", + STATS_MAX_DIMENSIONS))); + + attnums[numcols] = ((Form_pg_attribute) GETSTRUCT(atttuple))->attnum; + ReleaseSysCache(atttuple); + numcols++; + } + + /* + * Check that at least two columns were specified in the statement. The + * upper bound was already checked in the loop above. + */ + if (numcols < 2) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("statistics require at least 2 columns"))); + + /* + * Sort the attnums, which makes detecting duplicates somewhat easier, and + * it does not hurt (it does not affect the efficiency, unlike for + * indexes, for example). 
+ */ + qsort(attnums, numcols, sizeof(int16), compare_int16); + + /* + * Look for duplicities in the list of columns. The attnums are sorted so + * just check consecutive elements. + */ + for (i = 1; i < numcols; i++) + if (attnums[i] == attnums[i - 1]) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("duplicate column name in statistics definition"))); + + stakeys = buildint2vector(attnums, numcols); + + /* construct the char array of enabled statistic types */ + staenabled = construct_array(types, ntypes, CHAROID, 1, true, 'c'); + + /* + * Everything seems fine, so let's build the pg_statistic_ext entry. At + * this point we obviously only have the keys and options. + */ + + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + + /* metadata */ + values[Anum_pg_statistic_ext_starelid - 1] = ObjectIdGetDatum(relid); + values[Anum_pg_statistic_ext_staname - 1] = NameGetDatum(&staname); + values[Anum_pg_statistic_ext_stanamespace - 1] = ObjectIdGetDatum(namespaceId); + values[Anum_pg_statistic_ext_staowner - 1] = ObjectIdGetDatum(GetUserId()); + + values[Anum_pg_statistic_ext_stakeys - 1] = PointerGetDatum(stakeys); + + /* enabled statistics */ + values[Anum_pg_statistic_ext_staenabled - 1] = PointerGetDatum(staenabled); + + /* no statistics build yet */ + nulls[Anum_pg_statistic_ext_standistinct - 1] = true; + + /* insert the tuple into pg_statistic_ext */ + statrel = heap_open(StatisticExtRelationId, RowExclusiveLock); + + htup = heap_form_tuple(statrel->rd_att, values, nulls); + + CatalogTupleInsert(statrel, htup); + + statoid = HeapTupleGetOid(htup); + + heap_freetuple(htup); + + /* + * Add a dependency on a table, so that stats get dropped on DROP TABLE. 
+ */ + ObjectAddressSet(parentobject, RelationRelationId, relid); + ObjectAddressSet(childobject, StatisticExtRelationId, statoid); + + recordDependencyOn(&childobject, &parentobject, DEPENDENCY_AUTO); + + /* + * Also add dependency on the schema (to drop statistics on DROP SCHEMA). + * This is not handled automatically by DROP TABLE because statistics have + * their own schema. + */ + ObjectAddressSet(parentobject, NamespaceRelationId, namespaceId); + + recordDependencyOn(&childobject, &parentobject, DEPENDENCY_AUTO); + + heap_close(statrel, RowExclusiveLock); + + relation_close(rel, NoLock); + + /* + * Invalidate relcache so that others see the new statistics. + */ + CacheInvalidateRelcache(rel); + + ObjectAddressSet(address, StatisticExtRelationId, statoid); + + return address; +} + + +/* + * Implements the DROP STATISTICS + * + * DROP STATISTICS stats_name + */ +void +RemoveStatisticsById(Oid statsOid) +{ + Relation relation; + Oid relid; + Relation rel; + HeapTuple tup; + Form_pg_statistic_ext statext; + + /* + * Delete the pg_statistic_ext tuple. 
+ */ + relation = heap_open(StatisticExtRelationId, RowExclusiveLock); + + tup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statsOid)); + + if (!HeapTupleIsValid(tup)) /* should not happen */ + elog(ERROR, "cache lookup failed for statistics %u", statsOid); + + statext = (Form_pg_statistic_ext) GETSTRUCT(tup); + relid = statext->starelid; + + rel = heap_open(relid, AccessExclusiveLock); + + simple_heap_delete(relation, &tup->t_self); + + CacheInvalidateRelcache(rel); + + ReleaseSysCache(tup); + + heap_close(relation, RowExclusiveLock); + heap_close(rel, NoLock); +} diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index bfc2ac1..9a34f94 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4447,6 +4447,19 @@ _copyDropSubscriptionStmt(const DropSubscriptionStmt *from) return newnode; } +static CreateStatsStmt * +_copyCreateStatsStmt(const CreateStatsStmt *from) +{ + CreateStatsStmt *newnode = makeNode(CreateStatsStmt); + + COPY_NODE_FIELD(defnames); + COPY_NODE_FIELD(relation); + COPY_NODE_FIELD(keys); + COPY_SCALAR_FIELD(if_not_exists); + + return newnode; +} + /* **************************************************************** * pg_list.h copy functions * **************************************************************** @@ -5385,6 +5398,9 @@ copyObject(const void *from) case T_CommonTableExpr: retval = _copyCommonTableExpr(from); break; + case T_CreateStatsStmt: + retval = _copyCreateStatsStmt(from); + break; case T_ObjectWithArgs: retval = _copyObjectWithArgs(from); break; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 7418fbe..953e6e2 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2266,6 +2266,18 @@ _outForeignKeyOptInfo(StringInfo str, const ForeignKeyOptInfo *node) } static void +_outStatisticExtInfo(StringInfo str, const StatisticExtInfo *node) +{ + WRITE_NODE_TYPE("STATISTICEXTINFO"); + + /* NB: this isn't a complete set of fields */ + 
WRITE_OID_FIELD(statOid); + + /* built/available statistics */ + WRITE_BOOL_FIELD(ndist_built); +} + +static void _outEquivalenceClass(StringInfo str, const EquivalenceClass *node) { /* @@ -3915,6 +3927,9 @@ outNode(StringInfo str, const void *obj) case T_PlannerParamItem: _outPlannerParamItem(str, obj); break; + case T_StatisticExtInfo: + _outStatisticExtInfo(str, obj); + break; case T_ExtensibleNode: _outExtensibleNode(str, obj); diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 463f806..d90f199 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -29,6 +29,7 @@ #include "catalog/heap.h" #include "catalog/partition.h" #include "catalog/pg_am.h" +#include "catalog/pg_statistic_ext.h" #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -40,8 +41,11 @@ #include "parser/parse_relation.h" #include "parser/parsetree.h" #include "rewrite/rewriteManip.h" +#include "statistics/stats.h" #include "storage/bufmgr.h" +#include "utils/builtins.h" #include "utils/lsyscache.h" +#include "utils/syscache.h" #include "utils/rel.h" #include "utils/snapmgr.h" @@ -63,7 +67,7 @@ static List *get_relation_constraints(PlannerInfo *root, bool include_notnull); static List *build_index_tlist(PlannerInfo *root, IndexOptInfo *index, Relation heapRelation); - +static List *get_relation_statistics(RelOptInfo *rel, Relation relation); /* * get_relation_info - @@ -398,6 +402,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, rel->indexlist = indexinfos; + rel->statlist = get_relation_statistics(rel, relation); + /* Grab foreign-table info using the relcache, while we have it */ if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE) { @@ -1251,6 +1257,64 @@ get_relation_constraints(PlannerInfo *root, return result; } +/* + * get_relation_statistics + * + * Retrieve extended statistics defined on the table. 
+ * + * Returns a List (possibly empty) of StatisticExtInfo objects describing + * the statistics. Only attributes needed for selecting statistics are + * retrieved (columns covered by the statistics, etc.). + */ +static List * +get_relation_statistics(RelOptInfo *rel, Relation relation) +{ + List *statoidlist; + ListCell *l; + List *stainfos = NIL; + + statoidlist = RelationGetStatExtList(relation); + + foreach(l, statoidlist) + { + ArrayType *arr; + Datum adatum; + bool isnull; + Oid statOid = lfirst_oid(l); + + HeapTuple htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); + + /* unavailable stats are not interesting for the planner */ + if (stats_are_built(htup, STATS_EXT_NDISTINCT)) + { + StatisticExtInfo *info = makeNode(StatisticExtInfo); + + info->statOid = statOid; + info->rel = rel; + + /* built/available statistics */ + info->ndist_built = true; + + /* decode the stakeys array */ + adatum = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_stakeys, &isnull); + Assert(!isnull); + + arr = DatumGetArrayTypeP(adatum); + + info->stakeys = buildint2vector((int16 *) ARR_DATA_PTR(arr), + ARR_DIMS(arr)[0]); + + stainfos = lcons(info, stainfos); + } + + ReleaseSysCache(htup); + } + + list_free(statoidlist); + + return stainfos; +} /* * relation_excluded_by_constraints diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index e7acc2d..a0801dc 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -257,7 +257,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); ConstraintsSetStmt CopyStmt CreateAsStmt CreateCastStmt CreateDomainStmt CreateExtensionStmt CreateGroupStmt CreateOpClassStmt CreateOpFamilyStmt AlterOpFamilyStmt CreatePLangStmt - CreateSchemaStmt CreateSeqStmt CreateStmt CreateTableSpaceStmt + CreateSchemaStmt CreateSeqStmt CreateStmt CreateStatsStmt CreateTableSpaceStmt CreateFdwStmt CreateForeignServerStmt CreateForeignTableStmt CreateAssertStmt CreateTransformStmt 
CreateTrigStmt CreateEventTrigStmt CreateUserStmt CreateUserMappingStmt CreateRoleStmt CreatePolicyStmt @@ -873,6 +873,7 @@ stmt : | CreateSeqStmt | CreateStmt | CreateSubscriptionStmt + | CreateStatsStmt | CreateTableSpaceStmt | CreateTransformStmt | CreateTrigStmt @@ -3746,6 +3747,34 @@ OptConsTableSpace: USING INDEX TABLESPACE name { $$ = $4; } ExistingIndex: USING INDEX index_name { $$ = $3; } ; +/***************************************************************************** + * + * QUERY : + * CREATE STATISTICS stats_name ON relname (columns) WITH (options) + * + *****************************************************************************/ + + +CreateStatsStmt: CREATE STATISTICS any_name ON '(' columnList ')' FROM qualified_name + { + CreateStatsStmt *n = makeNode(CreateStatsStmt); + n->defnames = $3; + n->relation = $9; + n->keys = $6; + n->if_not_exists = false; + $$ = (Node *)n; + } + | CREATE STATISTICS IF_P NOT EXISTS any_name ON '(' columnList ')' FROM qualified_name + { + CreateStatsStmt *n = makeNode(CreateStatsStmt); + n->defnames = $6; + n->relation = $12; + n->keys = $9; + n->if_not_exists = true; + $$ = (Node *)n; + } + ; + /***************************************************************************** * @@ -6033,6 +6062,7 @@ drop_type_name: | PUBLICATION { $$ = OBJECT_PUBLICATION; } | SCHEMA { $$ = OBJECT_SCHEMA; } | SERVER { $$ = OBJECT_FOREIGN_SERVER; } + | STATISTICS { $$ = OBJECT_STATISTICS; } ; /* object types attached to a table */ @@ -8377,6 +8407,15 @@ RenameStmt: ALTER AGGREGATE aggregate_with_argtypes RENAME TO name n->missing_ok = false; $$ = (Node *)n; } + | ALTER STATISTICS any_name RENAME TO name + { + RenameStmt *n = makeNode(RenameStmt); + n->renameType = OBJECT_STATISTICS; + n->object = (Node *) $3; + n->newname = $6; + n->missing_ok = false; + $$ = (Node *)n; + } | ALTER TEXT_P SEARCH PARSER any_name RENAME TO name { RenameStmt *n = makeNode(RenameStmt); @@ -8592,6 +8631,15 @@ AlterObjectSchemaStmt: n->missing_ok = true; $$ = 
(Node *)n; } + | ALTER STATISTICS any_name SET SCHEMA name + { + AlterObjectSchemaStmt *n = makeNode(AlterObjectSchemaStmt); + n->objectType = OBJECT_STATISTICS; + n->object = (Node *) $3; + n->newschema = $6; + n->missing_ok = false; + $$ = (Node *)n; + } | ALTER TEXT_P SEARCH PARSER any_name SET SCHEMA name { AlterObjectSchemaStmt *n = makeNode(AlterObjectSchemaStmt); @@ -8855,6 +8903,14 @@ AlterOwnerStmt: ALTER AGGREGATE aggregate_with_argtypes OWNER TO RoleSpec n->newowner = $6; $$ = (Node *)n; } + | ALTER STATISTICS name OWNER TO RoleSpec + { + AlterOwnerStmt *n = makeNode(AlterOwnerStmt); + n->objectType = OBJECT_STATISTICS; + n->object = (Node *) makeString($3); + n->newowner = $6; + $$ = (Node *)n; + } | ALTER TEXT_P SEARCH DICTIONARY any_name OWNER TO RoleSpec { AlterOwnerStmt *n = makeNode(AlterOwnerStmt); diff --git a/src/backend/statistics/Makefile b/src/backend/statistics/Makefile new file mode 100644 index 0000000..e77b350 --- /dev/null +++ b/src/backend/statistics/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for statistics +# +# IDENTIFICATION +# src/backend/statistics/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/statistics +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = common.o mvdist.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/statistics/README b/src/backend/statistics/README new file mode 100644 index 0000000..beb7c24 --- /dev/null +++ b/src/backend/statistics/README @@ -0,0 +1,34 @@ +Extended statistics +=================== + +When estimating various quantities (e.g. condition selectivities) the default +approach relies on the assumption of independence. In practice that's often +not true, resulting in estimation errors. 
+ +Extended statistics track different types of dependencies between the columns, +hopefully improving the estimates and producing better plans. + +Currently we only have one type of extended statistics - ndistinct +coefficients, and we use it to improve estimates of grouping queries. See +README.ndistinct for details. + + +Size of sample in ANALYZE +------------------------- +When performing ANALYZE, the number of rows to sample is determined as + + (300 * statistics_target) + +That works reasonably well for statistics on individual columns, but perhaps +it's not enough for extended statistics. Papers analyzing estimation errors +all use samples proportional to the table (usually finding that 1-3% of the +table is enough to build accurate stats). + +The requested accuracy (number of MCV items or histogram bins) should also +be considered when determining the sample size, and in extended statistics +those are not necessarily limited by statistics_target. + +This however merits further discussion, because collecting the sample is quite +expensive and increasing it further would make ANALYZE even more painful. +Judging by the experiments with the current implementation, the fixed size +seems to work reasonably well for now, so we leave this as a future work. diff --git a/src/backend/statistics/README.ndistinct b/src/backend/statistics/README.ndistinct new file mode 100644 index 0000000..9365b17 --- /dev/null +++ b/src/backend/statistics/README.ndistinct @@ -0,0 +1,22 @@ +ndistinct coefficients +====================== + +Estimating number of groups in a combination of columns (e.g. for GROUP BY) +is tricky, and the estimation error is often significant. + +The ndistinct coefficients address this by storing ndistinct estimates not +only for individual columns, but also for (all) combinations of columns. +So for example given three columns (a,b,c) the statistics will estimate +ndistinct for (a,b), (a,c), (b,c) and (a,b,c). 
The per-column estimates +are already available in pg_statistic. + + +GROUP BY estimation (estimate_num_groups) +----------------------------------------- + +Although ndistinct coefficient might be used for selectivity estimation +(of equality conditions in WHERE clause), that is not implemented at this +point. + +Instead, ndistinct coefficients are only used in estimate_num_groups() to +estimate grouped queries. diff --git a/src/backend/statistics/common.c b/src/backend/statistics/common.c new file mode 100644 index 0000000..f63d8cc --- /dev/null +++ b/src/backend/statistics/common.c @@ -0,0 +1,454 @@ +/*------------------------------------------------------------------------- + * + * common.c + * POSTGRES extended statistics + * + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/statistics/common.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "catalog/indexing.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_statistic_ext.h" +#include "nodes/relation.h" +#include "statistics/common.h" +#include "statistics/stats.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/rel.h" +#include "utils/syscache.h" + + +static VacAttrStats **lookup_var_attr_stats(int2vector *attrs, + int natts, VacAttrStats **vacattrstats); + +static List *list_ext_stats(Oid relid); + +static void update_ext_stats(Oid relid, MVNDistinct ndistinct, + int2vector *attrs, VacAttrStats **stats); + + +/* + * Compute requested extended stats, using the rows sampled for the plain + * (single-column) stats. + * + * This fetches a list of stats from pg_statistic_ext, computes the stats + * and serializes them back into the catalog (as bytea values). 
+ */ +void +build_ext_stats(Relation onerel, double totalrows, + int numrows, HeapTuple *rows, + int natts, VacAttrStats **vacattrstats) +{ + ListCell *lc; + List *stats; + + TupleDesc tupdesc = RelationGetDescr(onerel); + + /* Fetch defined statistics from pg_statistic_ext, and compute them. */ + stats = list_ext_stats(RelationGetRelid(onerel)); + + foreach(lc, stats) + { + int j; + StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(lc); + MVNDistinct ndistinct = NULL; + + VacAttrStats **stats = NULL; + int numatts = 0; + + /* int2 vector of attnums the stats should be computed on */ + int2vector *attrs = stat->stakeys; + + /* see how many of the columns are not dropped */ + for (j = 0; j < attrs->dim1; j++) + if (!tupdesc->attrs[attrs->values[j] - 1]->attisdropped) + numatts += 1; + + /* if there are dropped attributes, build a filtered int2vector */ + if (numatts != attrs->dim1) + { + int16 *tmp = palloc0(numatts * sizeof(int16)); + int attnum = 0; + + for (j = 0; j < attrs->dim1; j++) + if (!tupdesc->attrs[attrs->values[j] - 1]->attisdropped) + tmp[attnum++] = attrs->values[j]; + + pfree(attrs); + attrs = buildint2vector(tmp, numatts); + } + + /* filter only the interesting vacattrstats records */ + stats = lookup_var_attr_stats(attrs, natts, vacattrstats); + + /* check allowed number of dimensions */ + Assert((attrs->dim1 >= 2) && (attrs->dim1 <= STATS_MAX_DIMENSIONS)); + + /* compute ndistinct coefficients */ + if (stat->ndist_enabled) + ndistinct = build_ext_ndistinct(totalrows, numrows, rows, attrs, stats); + + /* store the statistics in the catalog */ + update_ext_stats(stat->statOid, ndistinct, attrs, stats); + } +} + +/* + * Lookup the VacAttrStats info for the selected columns, with indexes + * matching the attrs vector (to make it easy to work with when + * computing extended stats). 
+ */ +static VacAttrStats ** +lookup_var_attr_stats(int2vector *attrs, int natts, VacAttrStats **vacattrstats) +{ + int i, + j; + int numattrs = attrs->dim1; + VacAttrStats **stats = (VacAttrStats **) palloc0(numattrs * sizeof(VacAttrStats *)); + + /* lookup VacAttrStats info for the requested columns (same attnum) */ + for (i = 0; i < numattrs; i++) + { + stats[i] = NULL; + for (j = 0; j < natts; j++) + { + if (attrs->values[i] == vacattrstats[j]->tupattnum) + { + stats[i] = vacattrstats[j]; + break; + } + } + + /* + * Check that we found the info, that the attnum matches and that + * there's the requested 'lt' operator and that the type is + * 'passed-by-value'. + */ + Assert(stats[i] != NULL); + Assert(stats[i]->tupattnum == attrs->values[i]); + + /* + * FIXME This is rather ugly way to check for 'ltopr' (which is + * defined for 'scalar' attributes). + */ + Assert(((StdAnalyzeData *) stats[i]->extra_data)->ltopr != InvalidOid); + } + + return stats; +} + +/* + * Fetch list of MV stats defined on a table, without the actual data + * for histograms, MCV lists etc. + */ +static List * +list_ext_stats(Oid relid) +{ + Relation indrel; + SysScanDesc indscan; + ScanKeyData skey; + HeapTuple htup; + List *result = NIL; + + /* + * Prepare to scan pg_statistic_ext for entries having indrelid = this + * rel. 
+ */ + ScanKeyInit(&skey, + Anum_pg_statistic_ext_starelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + + indrel = heap_open(StatisticExtRelationId, AccessShareLock); + indscan = systable_beginscan(indrel, StatisticExtRelidIndexId, true, + NULL, 1, &skey); + + while (HeapTupleIsValid(htup = systable_getnext(indscan))) + { + StatisticExtInfo *info = makeNode(StatisticExtInfo); + Form_pg_statistic_ext stats = (Form_pg_statistic_ext) GETSTRUCT(htup); + + info->statOid = HeapTupleGetOid(htup); + info->stakeys = buildint2vector(stats->stakeys.values, stats->stakeys.dim1); + + info->ndist_enabled = stats_are_enabled(htup, STATS_EXT_NDISTINCT); + info->ndist_built = stats_are_built(htup, STATS_EXT_NDISTINCT); + + result = lappend(result, info); + } + + systable_endscan(indscan); + + heap_close(indrel, AccessShareLock); + + /* + * TODO maybe save the list into relcache, as in RelationGetIndexList + * (which was used as an inspiration of this one)?. + */ + + return result; +} + +/* + * update_ext_stats + * Serializes the statistics and stores them into the pg_statistic_ext tuple. + */ +static void +update_ext_stats(Oid statOid, MVNDistinct ndistinct, + int2vector *attrs, VacAttrStats **stats) +{ + HeapTuple stup, + oldtup; + Datum values[Natts_pg_statistic_ext]; + bool nulls[Natts_pg_statistic_ext]; + bool replaces[Natts_pg_statistic_ext]; + + Relation sd = heap_open(StatisticExtRelationId, RowExclusiveLock); + + memset(nulls, 1, Natts_pg_statistic_ext * sizeof(bool)); + memset(replaces, 0, Natts_pg_statistic_ext * sizeof(bool)); + memset(values, 0, Natts_pg_statistic_ext * sizeof(Datum)); + + /* + * Construct a new pg_statistic_ext tuple - replace only the histogram and + * MCV list, depending whether it actually was computed. 
+ */ + if (ndistinct != NULL) + { + bytea *data = serialize_ext_ndistinct(ndistinct); + + nulls[Anum_pg_statistic_ext_standistinct - 1] = (data == NULL); + values[Anum_pg_statistic_ext_standistinct - 1] = PointerGetDatum(data); + } + + /* always replace the value (either by bytea or NULL) */ + replaces[Anum_pg_statistic_ext_standistinct - 1] = true; + + /* always change the availability flags */ + nulls[Anum_pg_statistic_ext_stakeys - 1] = false; + + /* use the new attnums, in case we removed some dropped ones */ + replaces[Anum_pg_statistic_ext_stakeys - 1] = true; + + values[Anum_pg_statistic_ext_stakeys - 1] = PointerGetDatum(attrs); + + /* Is there already a pg_statistic_ext tuple for this attribute? */ + oldtup = SearchSysCache1(STATEXTOID, + ObjectIdGetDatum(statOid)); + + if (!HeapTupleIsValid(oldtup)) + elog(ERROR, "cache lookup failed for extended statistics %u", statOid); + + /* replace it */ + stup = heap_modify_tuple(oldtup, + RelationGetDescr(sd), + values, + nulls, + replaces); + ReleaseSysCache(oldtup); + CatalogTupleUpdate(sd, &stup->t_self, stup); + + heap_freetuple(stup); + heap_close(sd, RowExclusiveLock); +} + +/* multi-variate stats comparator */ + +/* + * qsort_arg comparator for sorting Datums (MV stats) + * + * This does not maintain the tupnoLink array. 
+ */ +int +compare_scalars_simple(const void *a, const void *b, void *arg) +{ + Datum da = *(Datum *) a; + Datum db = *(Datum *) b; + SortSupport ssup = (SortSupport) arg; + + return ApplySortComparator(da, false, db, false, ssup); +} + +/* + * qsort_arg comparator for sorting data when partitioning a MV bucket + */ +int +compare_scalars_partition(const void *a, const void *b, void *arg) +{ + Datum da = ((ScalarItem *) a)->value; + Datum db = ((ScalarItem *) b)->value; + SortSupport ssup = (SortSupport) arg; + + return ApplySortComparator(da, false, db, false, ssup); +} + +/* initialize multi-dimensional sort */ +MultiSortSupport +multi_sort_init(int ndims) +{ + MultiSortSupport mss; + + Assert(ndims >= 2); + + mss = (MultiSortSupport) palloc0(offsetof(MultiSortSupportData, ssup) + +sizeof(SortSupportData) * ndims); + + mss->ndims = ndims; + + return mss; +} + +/* + * Prepare sort support info for dimension 'dim' (index into vacattrstats) to + * 'mss', at the position 'sortdim' + */ +void +multi_sort_add_dimension(MultiSortSupport mss, int sortdim, + int dim, VacAttrStats **vacattrstats) +{ + /* first, lookup StdAnalyzeData for the dimension (attribute) */ + SortSupportData ssup; + StdAnalyzeData *tmp = (StdAnalyzeData *) vacattrstats[dim]->extra_data; + + Assert(mss != NULL); + Assert(sortdim < mss->ndims); + + /* initialize sort support, etc. 
*/ + memset(&ssup, 0, sizeof(ssup)); + ssup.ssup_cxt = CurrentMemoryContext; + + /* We always use the default collation for statistics */ + ssup.ssup_collation = DEFAULT_COLLATION_OID; + ssup.ssup_nulls_first = false; + + PrepareSortSupportFromOrderingOp(tmp->ltopr, &ssup); + + mss->ssup[sortdim] = ssup; +} + +/* compare all the dimensions in the selected order */ +int +multi_sort_compare(const void *a, const void *b, void *arg) +{ + int i; + SortItem *ia = (SortItem *) a; + SortItem *ib = (SortItem *) b; + + MultiSortSupport mss = (MultiSortSupport) arg; + + for (i = 0; i < mss->ndims; i++) + { + int compare; + + compare = ApplySortComparator(ia->values[i], ia->isnull[i], + ib->values[i], ib->isnull[i], + &mss->ssup[i]); + + if (compare != 0) + return compare; + } + + /* equal by default */ + return 0; +} + +/* compare selected dimension */ +int +multi_sort_compare_dim(int dim, const SortItem *a, const SortItem *b, + MultiSortSupport mss) +{ + return ApplySortComparator(a->values[dim], a->isnull[dim], + b->values[dim], b->isnull[dim], + &mss->ssup[dim]); +} + +int +multi_sort_compare_dims(int start, int end, + const SortItem *a, const SortItem *b, + MultiSortSupport mss) +{ + int dim; + + for (dim = start; dim <= end; dim++) + { + int r = ApplySortComparator(a->values[dim], a->isnull[dim], + b->values[dim], b->isnull[dim], + &mss->ssup[dim]); + + if (r != 0) + return r; + } + + return 0; +} + +bool +stats_are_enabled(HeapTuple htup, char type) +{ + Datum datum; + bool isnull; + int i, + nenabled; + char *enabled; + ArrayType *enabledArray; + + /* see which statistics are enabled */ + datum = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_staenabled, &isnull); + + /* if there are no values in staenabled field, everything is enabled */ + if (isnull || (datum == PointerGetDatum(NULL))) + return false; + + /* + * We expect the array to be a 1-D CHAR array; verify that. 
We don't need + * to use deconstruct_array() since the array data is just going to look + * like a C array of char values. + */ + enabledArray = DatumGetArrayTypeP(datum); + + if (ARR_NDIM(enabledArray) != 1 || + ARR_HASNULL(enabledArray) || + ARR_ELEMTYPE(enabledArray) != CHAROID) + elog(ERROR, "enabled statistics (staenabled) is not a 1-D char array"); + + nenabled = ARR_DIMS(enabledArray)[0]; + enabled = (char *) ARR_DATA_PTR(enabledArray); + + for (i = 0; i < nenabled; i++) + if (enabled[i] == type) + return true; + + return false; +} + +bool +stats_are_built(HeapTuple htup, char type) +{ + bool isnull; + + switch (type) + { + case STATS_EXT_NDISTINCT: + SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_standistinct, &isnull); + break; + + default: + elog(ERROR, "unexpected statistics type requested: %d", type); + } + + return !isnull; +} diff --git a/src/backend/statistics/mvdist.c b/src/backend/statistics/mvdist.c new file mode 100644 index 0000000..8f318da --- /dev/null +++ b/src/backend/statistics/mvdist.c @@ -0,0 +1,621 @@ +/*------------------------------------------------------------------------- + * + * mvdist.c + * POSTGRES multivariate ndistinct coefficients + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/statistics/mvdist.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/htup_details.h" +#include "catalog/pg_statistic_ext.h" +#include "utils/fmgrprotos.h" +#include "utils/lsyscache.h" +#include "lib/stringinfo.h" +#include "utils/syscache.h" +#include "statistics/common.h" +#include "statistics/stats.h" + + +static double estimate_ndistinct(double totalrows, int numrows, int d, int f1); + +/* internal state for generator of k-combinations of n elements */ +typedef struct CombinationGeneratorData +{ + + int k; /* 
size of the combination */ + int current; /* index of the next combination to return */ + + int ncombinations; /* number of combinations (size of array) */ + AttrNumber *combinations; /* array of pre-built combinations */ + +} CombinationGeneratorData; + +typedef CombinationGeneratorData *CombinationGenerator; + +/* generator API */ +static CombinationGenerator generator_init(int2vector *attrs, int k); +static void generator_free(CombinationGenerator state); +static AttrNumber *generator_next(CombinationGenerator state, int2vector *attrs); + +static int n_choose_k(int n, int k); +static int num_combinations(int n); +static double ndistinct_for_combination(double totalrows, int numrows, + HeapTuple *rows, int2vector *attrs, VacAttrStats **stats, + int k, AttrNumber *combination); + +/* + * Compute ndistinct coefficient for the combination of attributes. This + * computes the ndistinct estimate using the same estimator used in analyze.c + * and then computes the coefficient. + */ +MVNDistinct +build_ext_ndistinct(double totalrows, int numrows, HeapTuple *rows, + int2vector *attrs, VacAttrStats **stats) +{ + int i, + k; + int numattrs = attrs->dim1; + int numcombs = num_combinations(numattrs); + + MVNDistinct result; + + result = palloc0(offsetof(MVNDistinctData, items) + + numcombs * sizeof(MVNDistinctItem)); + + result->nitems = numcombs; + + i = 0; + for (k = 2; k <= numattrs; k++) + { + AttrNumber *combination; + CombinationGenerator generator; + + generator = generator_init(attrs, k); + + while ((combination = generator_next(generator, attrs))) + { + MVNDistinctItem *item = &result->items[i++]; + + item->nattrs = k; + item->ndistinct = ndistinct_for_combination(totalrows, numrows, rows, + attrs, stats, k, combination); + + item->attrs = palloc(k * sizeof(AttrNumber)); + memcpy(item->attrs, combination, k * sizeof(AttrNumber)); + + /* must not overflow the output array */ + Assert(i <= result->nitems); + } + + generator_free(generator); + } + + /* must consume 
exactly the whole output array */ + Assert(i == result->nitems); + + return result; +} + +/* + * ndistinct_for_combination + * Estimates number of distinct values in a combination of columns. + * + * This uses the same ndistinct estimator as compute_scalar_stats() in + * ANALYZE, i.e., + * n*d / (n - f1 + f1*n/N) + * + * except that instead of values in a single column we are dealing with + * combination of multiple columns. + */ +static double +ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows, + int2vector *attrs, VacAttrStats **stats, + int k, AttrNumber *combination) +{ + int i, + j; + int f1, + cnt, + d; + int nmultiple, + summultiple; + bool *isnull; + Datum *values; + SortItem *items; + MultiSortSupport mss; + + /* + * It's possible to sort the sample rows directly, but this seemed somehow + * simpler / less error prone. Another option would be to allocate the + * arrays for each SortItem separately, but that'd be significant overhead + * (not just CPU, but especially memory bloat). 
+ */ + mss = multi_sort_init(k); + items = (SortItem *) palloc0(numrows * sizeof(SortItem)); + values = (Datum *) palloc0(sizeof(Datum) * numrows * k); + isnull = (bool *) palloc0(sizeof(bool) * numrows * k); + + Assert((k >= 2) && (k <= attrs->dim1)); + + for (i = 0; i < numrows; i++) + { + items[i].values = &values[i * k]; + items[i].isnull = &isnull[i * k]; + } + + for (i = 0; i < k; i++) + { + /* prepare the sort function for the first dimension */ + multi_sort_add_dimension(mss, i, combination[i], stats); + + /* accumulate all the data into the array and sort it */ + for (j = 0; j < numrows; j++) + { + items[j].values[i] = + heap_getattr(rows[j], attrs->values[combination[i]], + stats[combination[i]]->tupDesc, + &items[j].isnull[i]); + } + } + + qsort_arg((void *) items, numrows, sizeof(SortItem), + multi_sort_compare, mss); + + /* count number of distinct combinations */ + + f1 = 0; + cnt = 1; + d = 1; + for (i = 1; i < numrows; i++) + { + if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0) + { + if (cnt == 1) + f1 += 1; + else + { + nmultiple += 1; + summultiple += cnt; + } + + d++; + cnt = 0; + } + + cnt += 1; + } + + if (cnt == 1) + f1 += 1; + else + { + nmultiple += 1; + summultiple += cnt; + } + + return estimate_ndistinct(totalrows, numrows, d, f1); +} + +MVNDistinct +load_ext_ndistinct(Oid mvoid) +{ + bool isnull = false; + Datum ndist; + + /* + * Prepare to scan pg_statistic_ext for entries having indrelid = this + * rel. + */ + HeapTuple htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(mvoid)); + + Assert(stats_are_enabled(htup, STATS_EXT_NDISTINCT)); + Assert(stats_are_built(htup, STATS_EXT_NDISTINCT)); + + ndist = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_standistinct, &isnull); + + Assert(!isnull); + + ReleaseSysCache(htup); + + return deserialize_ext_ndistinct(DatumGetByteaP(ndist)); +} + +/* The Duj1 estimator (already used in analyze.c). 
*/ +static double +estimate_ndistinct(double totalrows, int numrows, int d, int f1) +{ + double numer, + denom, + ndistinct; + + numer = (double) numrows *(double) d; + + denom = (double) (numrows - f1) + + (double) f1 *(double) numrows / totalrows; + + ndistinct = numer / denom; + + /* Clamp to sane range in case of roundoff error */ + if (ndistinct < (double) d) + ndistinct = (double) d; + + if (ndistinct > totalrows) + ndistinct = totalrows; + + return floor(ndistinct + 0.5); +} + +/* + * pg_ndistinct_in - input routine for type pg_ndistinct. + * + * pg_ndistinct is real enough to be a table column, but it has no operations + * of its own, and disallows input too + * + * XXX This is inspired by what pg_node_tree does. + */ +Datum +pg_ndistinct_in(PG_FUNCTION_ARGS) +{ + /* + * pg_node_list stores the data in binary form and parsing text input is + * not needed, so disallow this. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_ndistinct"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * pg_ndistinct - output routine for type pg_ndistinct. + * + * histograms are serialized into a bytea value, so we simply call byteaout() + * to serialize the value into text. But it'd be nice to serialize that into + * a meaningful representation (e.g. for inspection by people). 
 */
Datum
pg_ndistinct_out(PG_FUNCTION_ARGS)
{
	int			i,
				j;
	StringInfoData str;

	bytea	   *data = PG_GETARG_BYTEA_PP(0);

	MVNDistinct ndist = deserialize_ext_ndistinct(data);

	/*
	 * Produce a human-readable form: a list of items, each item being the
	 * attribute indexes followed by the ndistinct estimate, e.g.
	 * [{0, 1, 101.000000}, ...].
	 */
	initStringInfo(&str);
	appendStringInfoChar(&str, '[');

	for (i = 0; i < ndist->nitems; i++)
	{
		MVNDistinctItem item = ndist->items[i];

		if (i > 0)
			appendStringInfoString(&str, ", ");

		appendStringInfoChar(&str, '{');

		/* attribute indexes covered by this item */
		for (j = 0; j < item.nattrs; j++)
		{
			if (j > 0)
				appendStringInfoString(&str, ", ");

			appendStringInfo(&str, "%d", item.attrs[j]);
		}

		/* and finally the estimated ndistinct value */
		appendStringInfo(&str, ", %f", item.ndistinct);

		appendStringInfoChar(&str, '}');
	}

	appendStringInfoChar(&str, ']');

	PG_RETURN_CSTRING(str.data);
}

/*
 * pg_ndistinct_recv - binary input routine for type pg_ndistinct.
 *
 * Like the text input routine, binary input is not supported.
 */
Datum
pg_ndistinct_recv(PG_FUNCTION_ARGS)
{
	ereport(ERROR,
			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
			 errmsg("cannot accept a value of type %s", "pg_ndistinct")));

	PG_RETURN_VOID();			/* keep compiler quiet */
}

/*
 * pg_ndistinct_send - binary output routine for type pg_ndistinct.
 *
 * XXX The ndistinct coefficients are serialized into a bytea value, so
 * let's just send that.
+ */ +Datum +pg_ndistinct_send(PG_FUNCTION_ARGS) +{ + return byteasend(fcinfo); +} + +/* + * n_choose_k + * computes binomial coefficients using an algorithm that is both + * efficient and prevents overflows + */ +static int +n_choose_k(int n, int k) +{ + int d, + r; + + Assert((k > 0) && (n >= k)); + + /* use symmetry of the binomial coefficients */ + k = Min(k, n - k); + + r = 1; + for (d = 1; d <= k; ++d) + { + r *= n--; + r /= d; + } + + return r; +} + +/* + * num_combinations + * computes number of combinations, excluding single-value combinations + */ +static int +num_combinations(int n) +{ + int k; + int ncombs = 1; + + for (k = 1; k <= n; k++) + ncombs *= 2; + + ncombs -= (n + 1); + + return ncombs; +} + +/* + * generate all combinations (k elements from n) + */ +static void +generate_combinations_recurse(CombinationGenerator state, AttrNumber n, + int index, AttrNumber start, AttrNumber *current) +{ + /* If we haven't filled all the elements, simply recurse. */ + if (index < state->k) + { + AttrNumber i; + + /* + * The values have to be in ascending order, so make sure we start + * with the value passed by parameter. + */ + + for (i = start; i < n; i++) + { + current[index] = i; + generate_combinations_recurse(state, n, (index + 1), (i + 1), current); + } + + return; + } + else + { + /* we got a correct combination */ + state->combinations = (AttrNumber *) repalloc(state->combinations, + state->k * (state->current + 1) * sizeof(AttrNumber)); + memcpy(&state->combinations[(state->k * state->current)], + current, state->k * sizeof(AttrNumber)); + state->current++; + } +} + +/* generate all k-combinations of n elements */ +static void +generate_combinations(CombinationGenerator state, int n) +{ + AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k); + + generate_combinations_recurse(state, n, 0, 0, current); + + pfree(current); +} + +/* + * initialize the generator of combinations, and prebuild them. 
+ * + * This pre-builds all the combinations. We could also generate them in + * generator_next(), but this seems simpler. + */ +static CombinationGenerator +generator_init(int2vector *attrs, int k) +{ + int n = attrs->dim1; + CombinationGenerator state; + + Assert((n >= k) && (k > 0)); + + /* allocate the generator state as a single chunk of memory */ + state = (CombinationGenerator) palloc0(sizeof(CombinationGeneratorData)); + state->combinations = (AttrNumber *) palloc(k * sizeof(AttrNumber)); + + state->ncombinations = n_choose_k(n, k); + state->current = 0; + state->k = k; + + /* now actually pre-generate all the combinations */ + generate_combinations(state, n); + + /* make sure we got the expected number of combinations */ + Assert(state->current == state->ncombinations); + + /* reset the number, so we start with the first one */ + state->current = 0; + + return state; +} + +/* free the generator state */ +static void +generator_free(CombinationGenerator state) +{ + /* we've allocated a single chunk, so just free it */ + pfree(state); +} + +/* generate next combination */ +static AttrNumber * +generator_next(CombinationGenerator state, int2vector *attrs) +{ + if (state->current == state->ncombinations) + return NULL; + + return &state->combinations[state->k * state->current++]; +} + +/* + * serialize list of ndistinct items into a bytea + */ +bytea * +serialize_ext_ndistinct(MVNDistinct ndistinct) +{ + int i; + bytea *output; + char *tmp; + + /* we need to store nitems */ + Size len = VARHDRSZ + offsetof(MVNDistinctData, items) + + ndistinct->nitems * offsetof(MVNDistinctItem, attrs); + + /* and also include space for the actual attribute numbers */ + for (i = 0; i < ndistinct->nitems; i++) + len += (sizeof(AttrNumber) * ndistinct->items[i].nattrs); + + output = (bytea *) palloc0(len); + SET_VARSIZE(output, len); + + tmp = VARDATA(output); + + ndistinct->magic = STATS_NDISTINCT_MAGIC; + ndistinct->type = STATS_NDISTINCT_TYPE_BASIC; + + /* first, store the 
number of items */ + memcpy(tmp, ndistinct, offsetof(MVNDistinctData, items)); + tmp += offsetof(MVNDistinctData, items); + + /* + * store number of attributes and attribute numbers for each ndistinct + * entry + */ + for (i = 0; i < ndistinct->nitems; i++) + { + MVNDistinctItem item = ndistinct->items[i]; + + memcpy(tmp, &item, offsetof(MVNDistinctItem, attrs)); + tmp += offsetof(MVNDistinctItem, attrs); + + memcpy(tmp, item.attrs, sizeof(AttrNumber) * item.nattrs); + tmp += sizeof(AttrNumber) * item.nattrs; + + Assert(tmp <= ((char *) output + len)); + } + + return output; +} + +/* + * Reads serialized ndistinct into MVNDistinct structure. + */ +MVNDistinct +deserialize_ext_ndistinct(bytea *data) +{ + int i; + Size expected_size; + MVNDistinct ndistinct; + char *tmp; + + if (data == NULL) + return NULL; + + if (VARSIZE_ANY_EXHDR(data) < offsetof(MVNDistinctData, items)) + elog(ERROR, "invalid MVNDistinct size %ld (expected at least %ld)", + VARSIZE_ANY_EXHDR(data), offsetof(MVNDistinctData, items)); + + /* read the MVNDistinct header */ + ndistinct = (MVNDistinct) palloc0(sizeof(MVNDistinctData)); + + /* initialize pointer to the data part (skip the varlena header) */ + tmp = VARDATA_ANY(data); + + /* get the header and perform basic sanity checks */ + memcpy(ndistinct, tmp, offsetof(MVNDistinctData, items)); + tmp += offsetof(MVNDistinctData, items); + + if (ndistinct->magic != STATS_NDISTINCT_MAGIC) + elog(ERROR, "invalid ndistinct magic %d (expected %d)", + ndistinct->magic, STATS_NDISTINCT_MAGIC); + + if (ndistinct->type != STATS_NDISTINCT_TYPE_BASIC) + elog(ERROR, "invalid ndistinct type %d (expected %d)", + ndistinct->type, STATS_NDISTINCT_TYPE_BASIC); + + Assert(ndistinct->nitems > 0); + + /* what minimum bytea size do we expect for those parameters */ + expected_size = offsetof(MVNDistinctData, items) + + ndistinct->nitems * (offsetof(MVNDistinctItem, attrs) + + sizeof(AttrNumber) * 2); + + if (VARSIZE_ANY_EXHDR(data) < expected_size) + elog(ERROR, 
"invalid dependencies size %ld (expected at least %ld)", + VARSIZE_ANY_EXHDR(data), expected_size); + + /* allocate space for the ndistinct items */ + ndistinct = repalloc(ndistinct, offsetof(MVNDistinctData, items) + + (ndistinct->nitems * sizeof(MVNDistinctItem))); + + for (i = 0; i < ndistinct->nitems; i++) + { + MVNDistinctItem *item = &ndistinct->items[i]; + + /* number of attributes */ + memcpy(item, tmp, offsetof(MVNDistinctItem, attrs)); + tmp += offsetof(MVNDistinctItem, attrs); + + /* is the number of attributes valid? */ + Assert((item->nattrs >= 2) && (item->nattrs <= STATS_MAX_DIMENSIONS)); + + /* now that we know the number of attributes, allocate the attribute */ + item->attrs = (AttrNumber *) palloc0(item->nattrs * sizeof(AttrNumber)); + + /* copy attribute numbers */ + memcpy(item->attrs, tmp, sizeof(AttrNumber) * item->nattrs); + tmp += sizeof(AttrNumber) * item->nattrs; + + /* still within the bytea */ + Assert(tmp <= ((char *) data + VARSIZE_ANY(data))); + } + + /* we should have consumed the whole bytea exactly */ + Assert(tmp == ((char *) data + VARSIZE_ANY(data))); + + return ndistinct; +} diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 20b5273..0af8c34 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1623,6 +1623,10 @@ ProcessUtilitySlow(ParseState *pstate, commandCollected = true; break; + case T_CreateStatsStmt: /* CREATE STATISTICS */ + address = CreateStatistics((CreateStatsStmt *) parsetree); + break; + default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(parsetree)); @@ -1988,6 +1992,8 @@ AlterObjectTypeCommandTag(ObjectType objtype) break; case OBJECT_SUBSCRIPTION: tag = "ALTER SUBSCRIPTION"; + case OBJECT_STATISTICS: + tag = "ALTER STATISTICS"; break; default: tag = "???"; @@ -2282,6 +2288,8 @@ CreateCommandTag(Node *parsetree) break; case OBJECT_PUBLICATION: tag = "DROP PUBLICATION"; + case OBJECT_STATISTICS: + tag = "DROP STATISTICS"; break; default: tag = "???"; @@ 
-2681,6 +2689,10 @@ CreateCommandTag(Node *parsetree) tag = "EXECUTE"; break; + case T_CreateStatsStmt: + tag = "CREATE STATISTICS"; + break; + case T_DeallocateStmt: { DeallocateStmt *stmt = (DeallocateStmt *) parsetree; diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 04bd9b9..5ea9e5b 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -126,6 +126,7 @@ #include "parser/parse_clause.h" #include "parser/parse_coerce.h" #include "parser/parsetree.h" +#include "statistics/stats.h" #include "utils/builtins.h" #include "utils/bytea.h" #include "utils/date.h" @@ -208,6 +209,8 @@ static Const *string_to_const(const char *str, Oid datatype); static Const *string_to_bytea_const(const char *str, size_t str_len); static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals); +static double find_ndistinct(PlannerInfo *root, RelOptInfo *rel, List *varinfos, + bool *found); /* * eqsel - Selectivity of "=" for any data types. @@ -3437,12 +3440,26 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, * don't know by how much. We should never clamp to less than the * largest ndistinct value for any of the Vars, though, since * there will surely be at least that many groups. + * + * However we don't need to do this if we have ndistinct stats on + * the columns - in that case we can simply use the coefficient to + * get the (probably way more accurate) estimate. + * + * XXX Might benefit from some refactoring, mixing the ndistinct + * coefficients and clamp seems a bit unfortunate. 
*/ double clamp = rel->tuples; if (relvarcount > 1) { - clamp *= 0.1; + bool found; + double ndist = find_ndistinct(root, rel, varinfos, &found); + + if (found) + reldistinct = ndist; + else + clamp *= 0.1; + if (clamp < relmaxndistinct) { clamp = relmaxndistinct; @@ -3451,6 +3468,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, clamp = rel->tuples; } } + if (reldistinct > clamp) reldistinct = clamp; @@ -7592,3 +7610,155 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, /* XXX what about pages_per_range? */ } + +/* + * Find applicable ndistinct statistics and compute the coefficient to + * correct the estimate (simply a product of per-column ndistincts). + * + * XXX Currently we only look for a perfect match, i.e. a single ndistinct + * estimate exactly matching all the columns of the statistics. This may be + * a bit problematic as adding a column (not covered by the ndistinct stats) + * will prevent us from using the stats entirely. So instead this needs to + * estimate the covered attributes, and then combine that with the extra + * attributes somehow (probably the old way). 
+ */ +static double +find_ndistinct(PlannerInfo *root, RelOptInfo *rel, List *varinfos, bool *found) +{ + ListCell *lc; + Bitmapset *attnums = NULL; + VariableStatData vardata; + + /* assume we haven't found any suitable ndistinct statistics */ + *found = false; + + /* bail out immediately if the table has no extended statistics */ + if (!rel->statlist) + return 0.0; + + foreach(lc, varinfos) + { + GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc); + + if (varinfo->rel != rel) + continue; + + /* FIXME handle expressions in general only */ + + /* + * examine the variable (or expression) so that we know which + * attribute we're dealing with - we need this for matching the + * ndistinct coefficient + * + * FIXME probably might remember this from estimate_num_groups + */ + examine_variable(root, varinfo->var, 0, &vardata); + + if (HeapTupleIsValid(vardata.statsTuple)) + { + Form_pg_statistic stats + = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + + attnums = bms_add_member(attnums, stats->staattnum); + + ReleaseVariableStats(vardata); + } + } + + /* look for a matching ndistinct statistics */ + foreach(lc, rel->statlist) + { + int i, + k; + bool matches; + StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc); + + /* skip statistics without ndistinct coefficient built */ + if (!info->ndist_built) + continue; + + /* + * Only ndistinct stats covering all Vars are acceptable, which can't + * happen if the statistics has fewer attributes than we have Vars. 
+ */ + if (bms_num_members(attnums) > info->stakeys->dim1) + continue; + + /* check that all Vars are covered by the statistic */ + matches = true; /* assume match until we find unmatched + * attribute */ + k = -1; + while ((k = bms_next_member(attnums, k)) >= 0) + { + bool attr_found = false; + + for (i = 0; i < info->stakeys->dim1; i++) + { + if (info->stakeys->values[i] == k) + { + attr_found = true; + break; + } + } + + /* found attribute not covered by this ndistinct stats, skip */ + if (!attr_found) + { + matches = false; + break; + } + } + + if (!matches) + continue; + + /* hey, this statistics matches! great, let's extract the value */ + *found = true; + + { + int j; + MVNDistinct stat = load_ext_ndistinct(info->statOid); + + for (j = 0; j < stat->nitems; j++) + { + bool item_matches = true; + MVNDistinctItem *item = &stat->items[j]; + + /* not the right item (different number of attributes) */ + if (item->nattrs != bms_num_members(attnums)) + continue; + + /* check the attribute numbers */ + k = -1; + while ((k = bms_next_member(attnums, k)) >= 0) + { + bool attr_found = false; + + for (i = 0; i < item->nattrs; i++) + { + if (info->stakeys->values[item->attrs[i]] == k) + { + attr_found = true; + break; + } + } + + if (!attr_found) + { + item_matches = false; + break; + } + } + + if (!item_matches) + continue; + + return item->ndistinct; + } + } + } + + Assert(!(*found)); + + return 0.0; +} diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index ce55fc5..a6b60c6 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -56,6 +56,7 @@ #include "catalog/pg_publication.h" #include "catalog/pg_rewrite.h" #include "catalog/pg_shseclabel.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_trigger.h" @@ -4452,6 +4453,82 @@ RelationGetIndexList(Relation relation) } /* + * RelationGetStatExtList + * get a list of 
OIDs of extended statistics on this relation + * + * The statistics list is created only if someone requests it, in a way + * similar to RelationGetIndexList(). We scan pg_statistic_ext to find + * relevant statistics, and add the list to the relcache entry so that we + * won't have to compute it again. Note that shared cache inval of a + * relcache entry will delete the old list and set rd_statvalid to 0, + * so that we must recompute the statistics list on next request. This + * handles creation or deletion of a statistic. + * + * The returned list is guaranteed to be sorted in order by OID, although + * this is not currently needed. + * + * Since shared cache inval causes the relcache's copy of the list to go away, + * we return a copy of the list palloc'd in the caller's context. The caller + * may list_free() the returned list after scanning it. This is necessary + * since the caller will typically be doing syscache lookups on the relevant + * statistics, and syscache lookup could cause SI messages to be processed! + */ +List * +RelationGetStatExtList(Relation relation) +{ + Relation indrel; + SysScanDesc indscan; + ScanKeyData skey; + HeapTuple htup; + List *result; + List *oldlist; + MemoryContext oldcxt; + + /* Quick exit if we already computed the list. */ + if (relation->rd_statvalid != 0) + return list_copy(relation->rd_statlist); + + /* + * We build the list we intend to return (in the caller's context) while + * doing the scan. After successfully completing the scan, we copy that + * list into the relcache entry. This avoids cache-context memory leakage + * if we get some sort of error partway through. + */ + result = NIL; + + /* Prepare to scan pg_statistic_ext for entries having starelid = this rel. 
*/ + ScanKeyInit(&skey, + Anum_pg_statistic_ext_starelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationGetRelid(relation))); + + indrel = heap_open(StatisticExtRelationId, AccessShareLock); + indscan = systable_beginscan(indrel, StatisticExtRelidIndexId, true, + NULL, 1, &skey); + + while (HeapTupleIsValid(htup = systable_getnext(indscan))) + /* TODO maybe include only already built statistics? */ + result = insert_ordered_oid(result, HeapTupleGetOid(htup)); + + systable_endscan(indscan); + + heap_close(indrel, AccessShareLock); + + /* Now save a copy of the completed list in the relcache entry. */ + oldcxt = MemoryContextSwitchTo(CacheMemoryContext); + oldlist = relation->rd_statlist; + relation->rd_statlist = list_copy(result); + + relation->rd_statvalid = true; + MemoryContextSwitchTo(oldcxt); + + /* Don't leak the old list, if there is one */ + list_free(oldlist); + + return result; +} + +/* * insert_ordered_oid * Insert a new Oid into a sorted list of Oids, preserving ordering * @@ -5560,6 +5637,8 @@ load_relcache_init_file(bool shared) rel->rd_pkattr = NULL; rel->rd_idattr = NULL; rel->rd_pubactions = NULL; + rel->rd_statvalid = false; + rel->rd_statlist = NIL; rel->rd_createSubid = InvalidSubTransactionId; rel->rd_newRelfilenodeSubid = InvalidSubTransactionId; rel->rd_amcache = NULL; diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index b1c0b4b..4a9cb76 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -61,6 +61,7 @@ #include "catalog/pg_shseclabel.h" #include "catalog/pg_replication_origin.h" #include "catalog/pg_statistic.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_transform.h" @@ -725,6 +726,28 @@ static const struct cachedesc cacheinfo[] = { }, 32 }, + {StatisticExtRelationId, /* STATEXTNAMENSP */ + StatisticExtNameIndexId, + 2, + { + Anum_pg_statistic_ext_staname, + 
Anum_pg_statistic_ext_stanamespace, + 0, + 0 + }, + 4 + }, + {StatisticExtRelationId, /* STATEXTOID */ + StatisticExtOidIndexId, + 1, + { + ObjectIdAttributeNumber, + 0, + 0, + 0 + }, + 4 + }, {StatisticRelationId, /* STATRELATTINH */ StatisticRelidAttnumInhIndexId, 3, diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 61a3e2a..3001dee 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -2320,6 +2320,50 @@ describeOneTableDetails(const char *schemaname, PQclear(result); } + /* print any extended statistics */ + if (pset.sversion >= 100000) + { + printfPQExpBuffer(&buf, + "SELECT oid, stanamespace::regnamespace AS nsp, staname, stakeys,\n" + " (staenabled::char[] @> '{d}'::char[]) AS ndist_enabled,\n" + " (standistinct IS NOT NULL) AS ndist_built,\n" + " (SELECT string_agg(attname::text,', ')\n" + " FROM ((SELECT unnest(stakeys) AS attnum) s\n" + " JOIN pg_attribute a ON (starelid = a.attrelid and a.attnum = s.attnum))) AS attnums\n" + "FROM pg_statistic_ext stat WHERE starelid = '%s' ORDER BY 1;", + oid); + + result = PSQLexec(buf.data); + if (!result) + goto error_return; + else + tuples = PQntuples(result); + + if (tuples > 0) + { + printTableAddFooter(&cont, _("Statistics:")); + for (i = 0; i < tuples; i++) + { + printfPQExpBuffer(&buf, " "); + + /* statistics name (qualified with namespace) */ + appendPQExpBuffer(&buf, "\"%s.%s\" ", + PQgetvalue(result, i, 1), + PQgetvalue(result, i, 2)); + + /* options */ + if (!strcmp(PQgetvalue(result, i, 4), "t")) + appendPQExpBuffer(&buf, "(dependencies)"); + + appendPQExpBuffer(&buf, " ON (%s)", + PQgetvalue(result, i, 6)); + + printTableAddFooter(&cont, buf.data); + } + } + PQclear(result); + } + /* print rules */ if (tableinfo.hasrules && tableinfo.relkind != RELKIND_MATVIEW) { diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 10759c7..9effbce 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -147,6 +147,7 @@ 
typedef enum ObjectClass OCLASS_REWRITE, /* pg_rewrite */ OCLASS_TRIGGER, /* pg_trigger */ OCLASS_SCHEMA, /* pg_namespace */ + OCLASS_STATISTIC_EXT, /* pg_statistic_ext */ OCLASS_TSPARSER, /* pg_ts_parser */ OCLASS_TSDICT, /* pg_ts_dict */ OCLASS_TSTEMPLATE, /* pg_ts_template */ diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h index 1187797..473fe17 100644 --- a/src/include/catalog/heap.h +++ b/src/include/catalog/heap.h @@ -119,6 +119,7 @@ extern void RemoveAttrDefault(Oid relid, AttrNumber attnum, DropBehavior behavior, bool complain, bool internal); extern void RemoveAttrDefaultById(Oid attrdefId); extern void RemoveStatistics(Oid relid, AttrNumber attnum); +extern void RemoveStatisticsExt(Oid relid, AttrNumber attnum); extern Form_pg_attribute SystemAttributeDefinition(AttrNumber attno, bool relhasoids); diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h index 6bce732..8130581 100644 --- a/src/include/catalog/indexing.h +++ b/src/include/catalog/indexing.h @@ -182,6 +182,13 @@ DECLARE_UNIQUE_INDEX(pg_largeobject_loid_pn_index, 2683, on pg_largeobject using DECLARE_UNIQUE_INDEX(pg_largeobject_metadata_oid_index, 2996, on pg_largeobject_metadata using btree(oid oid_ops)); #define LargeObjectMetadataOidIndexId 2996 +DECLARE_UNIQUE_INDEX(pg_statistic_ext_oid_index, 3380, on pg_statistic_ext using btree(oid oid_ops)); +#define StatisticExtOidIndexId 3380 +DECLARE_UNIQUE_INDEX(pg_statistic_ext_name_index, 3997, on pg_statistic_ext using btree(staname name_ops, stanamespace oid_ops)); +#define StatisticExtNameIndexId 3997 +DECLARE_INDEX(pg_statistic_ext_relid_index, 3379, on pg_statistic_ext using btree(starelid oid_ops)); +#define StatisticExtRelidIndexId 3379 + DECLARE_UNIQUE_INDEX(pg_namespace_nspname_index, 2684, on pg_namespace using btree(nspname name_ops)); #define NamespaceNameIndexId 2684 DECLARE_UNIQUE_INDEX(pg_namespace_oid_index, 2685, on pg_namespace using btree(oid oid_ops)); diff --git 
a/src/include/catalog/namespace.h b/src/include/catalog/namespace.h index dbeb25b..35e0e2b 100644 --- a/src/include/catalog/namespace.h +++ b/src/include/catalog/namespace.h @@ -141,6 +141,8 @@ extern Oid get_collation_oid(List *collname, bool missing_ok); extern Oid get_conversion_oid(List *conname, bool missing_ok); extern Oid FindDefaultConversionProc(int32 for_encoding, int32 to_encoding); +extern Oid get_statistics_oid(List *names, bool missing_ok); + /* initialization & transaction cleanup code */ extern void InitializeSearchPath(void); extern void AtEOXact_Namespace(bool isCommit, bool parallel); diff --git a/src/include/catalog/pg_cast.h b/src/include/catalog/pg_cast.h index 80a40ab..5bcdce7 100644 --- a/src/include/catalog/pg_cast.h +++ b/src/include/catalog/pg_cast.h @@ -254,6 +254,10 @@ DATA(insert ( 23 18 78 e f )); /* pg_node_tree can be coerced to, but not from, text */ DATA(insert ( 194 25 0 i b )); +/* pg_ndistinct can be coerced to, but not from, bytea and text */ +DATA(insert ( 3353 17 0 i b )); +DATA(insert ( 3353 25 0 i i )); + /* * Datetime category */ diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index ec4aedb..05baa80 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2726,6 +2726,15 @@ DESCR("current user privilege on any column by rel name"); DATA(insert OID = 3029 ( has_any_column_privilege PGNSP PGUID 12 10 0 0 0 f f f f t f s s 2 0 16 "26 25" _null_ _null_ _null_ _null_ _null_ has_any_column_privilege_id _null_ _null_ _null_ )); DESCR("current user privilege on any column by rel oid"); +DATA(insert OID = 3354 ( pg_ndistinct_in PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3353 "2275" _null_ _null_ _null_ _null_ _null_ pg_ndistinct_in _null_ _null_ _null_ )); +DESCR("I/O"); +DATA(insert OID = 3355 ( pg_ndistinct_out PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2275 "3353" _null_ _null_ _null_ _null_ _null_ pg_ndistinct_out _null_ _null_ _null_ )); +DESCR("I/O"); +DATA(insert OID = 
3356 ( pg_ndistinct_recv PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 3353 "2281" _null_ _null_ _null_ _null_ _null_ pg_ndistinct_recv _null_ _null_ _null_ )); +DESCR("I/O"); +DATA(insert OID = 3357 ( pg_ndistinct_send PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 17 "3353" _null_ _null_ _null_ _null_ _null_ pg_ndistinct_send _null_ _null_ _null_ )); +DESCR("I/O"); + DATA(insert OID = 1928 ( pg_stat_get_numscans PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_numscans _null_ _null_ _null_ )); DESCR("statistics: number of scans done for table/index"); DATA(insert OID = 1929 ( pg_stat_get_tuples_returned PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_tuples_returned _null_ _null_ _null_ )); diff --git a/src/include/catalog/pg_statistic_ext.h b/src/include/catalog/pg_statistic_ext.h new file mode 100644 index 0000000..94b23a2 --- /dev/null +++ b/src/include/catalog/pg_statistic_ext.h @@ -0,0 +1,74 @@ +/*------------------------------------------------------------------------- + * + * pg_statistic_ext.h + * definition of the system "extended statistic" relation (pg_statistic_ext) + * along with the relation's initial contents. + * + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/catalog/pg_statistic_ext.h + * + * NOTES + * the genbki.pl script reads this file and generates .bki + * information from the DATA() statements. + * + *------------------------------------------------------------------------- + */ +#ifndef PG_STATISTIC_EXT_H +#define PG_STATISTIC_EXT_H + +#include "catalog/genbki.h" + +/* ---------------- + * pg_statistic_ext definition. 
cpp turns this into + * typedef struct FormData_pg_statistic_ext + * ---------------- + */ +#define StatisticExtRelationId 3381 + +CATALOG(pg_statistic_ext,3381) +{ + /* These fields form the unique key for the entry: */ + Oid starelid; /* relation containing attributes */ + NameData staname; /* statistics name */ + Oid stanamespace; /* OID of namespace containing this statistics */ + Oid staowner; /* statistics owner */ + + /* + * variable-length fields start here, but we allow direct access to + * stakeys + */ + int2vector stakeys; /* array of column keys */ + +#ifdef CATALOG_VARLEN + char staenabled[1]; /* statistics requested to build */ + pg_ndistinct standistinct; /* ndistinct coefficients (serialized) */ +#endif + +} FormData_pg_statistic_ext; + +/* ---------------- + * Form_pg_statistic_ext corresponds to a pointer to a tuple with + * the format of pg_statistic_ext relation. + * ---------------- + */ +typedef FormData_pg_statistic_ext *Form_pg_statistic_ext; + +/* ---------------- + * compiler constants for pg_statistic_ext + * ---------------- + */ +#define Natts_pg_statistic_ext 7 +#define Anum_pg_statistic_ext_starelid 1 +#define Anum_pg_statistic_ext_staname 2 +#define Anum_pg_statistic_ext_stanamespace 3 +#define Anum_pg_statistic_ext_staowner 4 +#define Anum_pg_statistic_ext_stakeys 5 +#define Anum_pg_statistic_ext_staenabled 6 +#define Anum_pg_statistic_ext_standistinct 7 + +#define STATS_EXT_NDISTINCT 'd' + +#endif /* PG_STATISTIC_EXT_H */ diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h index 6e4c65e..9c9caf3 100644 --- a/src/include/catalog/pg_type.h +++ b/src/include/catalog/pg_type.h @@ -364,6 +364,10 @@ DATA(insert OID = 194 ( pg_node_tree PGNSP PGUID -1 f b S f t \054 0 0 0 pg_node DESCR("string representing an internal node tree"); #define PGNODETREEOID 194 +DATA(insert OID = 3353 ( pg_ndistinct PGNSP PGUID -1 f b S f t \054 0 0 0 pg_ndistinct_in pg_ndistinct_out pg_ndistinct_recv pg_ndistinct_send - - - i x f 0 -1 0 
100 _null_ _null_ _null_ )); +DESCR("multivariate ndistinct coefficients"); +#define PGNDISTINCTOID 3353 + DATA(insert OID = 32 ( pg_ddl_command PGNSP PGUID SIZEOF_POINTER t p P f t \054 0 0 0 pg_ddl_command_in pg_ddl_command_out pg_ddl_command_recv pg_ddl_command_send - - - ALIGNOF_POINTER p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("internal type for passing CollectedCommand"); #define PGDDLCOMMANDOID 32 diff --git a/src/include/catalog/toasting.h b/src/include/catalog/toasting.h index db7f145..00d0a83 100644 --- a/src/include/catalog/toasting.h +++ b/src/include/catalog/toasting.h @@ -53,6 +53,7 @@ DECLARE_TOAST(pg_proc, 2836, 2837); DECLARE_TOAST(pg_rewrite, 2838, 2839); DECLARE_TOAST(pg_seclabel, 3598, 3599); DECLARE_TOAST(pg_statistic, 2840, 2841); +DECLARE_TOAST(pg_statistic_ext, 3439, 3440); DECLARE_TOAST(pg_trigger, 2336, 2337); /* shared catalogs */ diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 8740cee..c323e81 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -77,6 +77,10 @@ extern ObjectAddress DefineOperator(List *names, List *parameters); extern void RemoveOperatorById(Oid operOid); extern ObjectAddress AlterOperator(AlterOperatorStmt *stmt); +/* commands/statscmds.c */ +extern ObjectAddress CreateStatistics(CreateStatsStmt *stmt); +extern void RemoveStatisticsById(Oid statsOid); + /* commands/aggregatecmds.c */ extern ObjectAddress DefineAggregate(ParseState *pstate, List *name, List *args, bool oldstyle, List *parameters); diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 2bc7a5d..d269e77 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -278,6 +278,7 @@ typedef enum NodeTag T_PlaceHolderInfo, T_MinMaxAggInfo, T_PlannerParamItem, + T_StatisticExtInfo, /* * TAGS FOR MEMORY NODES (memnodes.h) @@ -423,6 +424,7 @@ typedef enum NodeTag T_CreateSubscriptionStmt, T_AlterSubscriptionStmt, T_DropSubscriptionStmt, + T_CreateStatsStmt, /* * TAGS 
FOR PARSE TREE NODES (parsenodes.h) diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index a44d217..0a7a8d5c 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -644,6 +644,16 @@ typedef struct ColumnDef int location; /* parse location, or -1 if none/unknown */ } ColumnDef; +typedef struct CreateStatsStmt +{ + NodeTag type; + List *defnames; /* qualified name (list of Value strings) */ + RangeVar *relation; /* relation to build statistics on */ + List *keys; /* String nodes naming referenced column(s) */ + bool if_not_exists; /* do nothing if statistics already exists */ +} CreateStatsStmt; + + /* * TableLikeClause - CREATE TABLE ( ... LIKE ... ) clause */ @@ -1593,6 +1603,7 @@ typedef enum ObjectType OBJECT_SCHEMA, OBJECT_SEQUENCE, OBJECT_SUBSCRIPTION, + OBJECT_STATISTICS, OBJECT_TABCONSTRAINT, OBJECT_TABLE, OBJECT_TABLESPACE, diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 05d6f07..5923b5f 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -525,6 +525,7 @@ typedef struct RelOptInfo List *lateral_vars; /* LATERAL Vars and PHVs referenced by rel */ Relids lateral_referencers; /* rels that reference me laterally */ List *indexlist; /* list of IndexOptInfo */ + List *statlist; /* list of StatisticExtInfo */ BlockNumber pages; /* size estimates derived from pg_class */ double tuples; double allvisfrac; @@ -664,6 +665,31 @@ typedef struct ForeignKeyOptInfo List *rinfos[INDEX_MAX_KEYS]; } ForeignKeyOptInfo; +/* + * StatisticExtInfo + * Information about extended statistics for planning/optimization + * + * This contains information about which columns are covered by the + * statistics (stakeys), which options were requested while adding the + * statistics (*_enabled), and which kinds of statistics were actually + * built and are available for the optimizer (*_built). 
+ */ +typedef struct StatisticExtInfo +{ + NodeTag type; + + Oid statOid; /* OID of the statistics row */ + RelOptInfo *rel; /* back-link to statistics' table */ + + /* enabled statistics */ + bool ndist_enabled; /* ndistinct coefficient enabled */ + + /* built/available statistics */ + bool ndist_built; /* ndistinct coefficient built */ + + /* columns in the statistics (attnums) */ + int2vector *stakeys; /* attnums of the columns covered */ +} StatisticExtInfo; /* * EquivalenceClasses diff --git a/src/include/statistics/common.h b/src/include/statistics/common.h new file mode 100644 index 0000000..39c62bd --- /dev/null +++ b/src/include/statistics/common.h @@ -0,0 +1,62 @@ +/*------------------------------------------------------------------------- + * + * common.h + * POSTGRES extended statistics internal declarations + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/statistics/common.h + * + *------------------------------------------------------------------------- + */ +#ifndef STATISTICS_COMMON_H +#define STATISTICS_COMMON_H + +#include "commands/vacuum.h" +#include "utils/sortsupport.h" + + +typedef struct +{ + Oid eqopr; /* '=' operator for datatype, if any */ + Oid eqfunc; /* and associated function */ + Oid ltopr; /* '<' operator for datatype, if any */ +} StdAnalyzeData; + +typedef struct +{ + Datum value; /* a data value */ + int tupno; /* position index for tuple it came from */ +} ScalarItem; + +/* multi-sort */ +typedef struct MultiSortSupportData +{ + int ndims; /* number of dimensions supported by the sort */ + SortSupportData ssup[1]; /* sort support data for each dimension */ +} MultiSortSupportData; + +typedef MultiSortSupportData *MultiSortSupport; + +typedef struct SortItem +{ + Datum *values; + bool *isnull; +} SortItem; + +extern MultiSortSupport multi_sort_init(int ndims); +extern void 
multi_sort_add_dimension(MultiSortSupport mss, int sortdim, + int dim, VacAttrStats **vacattrstats); +extern int multi_sort_compare(const void *a, const void *b, void *arg); +extern int multi_sort_compare_dim(int dim, const SortItem * a, + const SortItem * b, MultiSortSupport mss); +extern int multi_sort_compare_dims(int start, int end, const SortItem * a, + const SortItem * b, MultiSortSupport mss); + +/* comparators, used when constructing extended stats */ +extern int compare_scalars_simple(const void *a, const void *b, void *arg); +extern int compare_scalars_partition(const void *a, const void *b, void *arg); + +#endif /* STATISTICS_COMMON_H */ diff --git a/src/include/statistics/stats.h b/src/include/statistics/stats.h new file mode 100644 index 0000000..ed14459 --- /dev/null +++ b/src/include/statistics/stats.h @@ -0,0 +1,57 @@ +/*------------------------------------------------------------------------- + * + * stats.h + * Multivariate statistics and selectivity estimation functions. + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/statistics/stats.h + * + *------------------------------------------------------------------------- + */ +#ifndef STATS_H +#define STATS_H + +#include "commands/vacuum.h" + +#define STATS_MAX_DIMENSIONS 8 /* max number of attributes */ + +#define STATS_NDISTINCT_MAGIC 0xA352BFA4 /* marks serialized bytea */ +#define STATS_NDISTINCT_TYPE_BASIC 1 /* basic ndistinct type */ + +/* Multivariate distinct coefficients. 
*/ +typedef struct MVNDistinctItem +{ + double ndistinct; + AttrNumber nattrs; + AttrNumber *attrs; +} MVNDistinctItem; + +typedef struct MVNDistinctData +{ + uint32 magic; /* magic constant marker */ + uint32 type; /* type of ndistinct (BASIC) */ + uint32 nitems; /* number of items in the statistic */ + MVNDistinctItem items[FLEXIBLE_ARRAY_MEMBER]; +} MVNDistinctData; + +typedef MVNDistinctData *MVNDistinct; + +extern MVNDistinct load_ext_ndistinct(Oid mvoid); + +extern bytea *serialize_ext_ndistinct(MVNDistinct ndistinct); + +/* deserialization of stats (serialization is private to analyze) */ +extern MVNDistinct deserialize_ext_ndistinct(bytea *data); + +extern MVNDistinct build_ext_ndistinct(double totalrows, int numrows, HeapTuple *rows, + int2vector *attrs, VacAttrStats **stats); + +extern void build_ext_stats(Relation onerel, double totalrows, + int numrows, HeapTuple *rows, + int natts, VacAttrStats **vacattrstats); +extern bool stats_are_enabled(HeapTuple htup, char type); +extern bool stats_are_built(HeapTuple htup, char type); + +#endif /* STATS_H */ diff --git a/src/include/utils/acl.h b/src/include/utils/acl.h index 0d11852..90dac93 100644 --- a/src/include/utils/acl.h +++ b/src/include/utils/acl.h @@ -326,6 +326,7 @@ extern bool pg_event_trigger_ownercheck(Oid et_oid, Oid roleid); extern bool pg_extension_ownercheck(Oid ext_oid, Oid roleid); extern bool pg_publication_ownercheck(Oid pub_oid, Oid roleid); extern bool pg_subscription_ownercheck(Oid sub_oid, Oid roleid); +extern bool pg_statistics_ownercheck(Oid stat_oid, Oid roleid); extern bool has_createrole_privilege(Oid roleid); extern bool has_bypassrls_privilege(Oid roleid); diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index a617a7c..5772804 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -92,6 +92,7 @@ typedef struct RelationData bool rd_isvalid; /* relcache entry is valid */ char rd_indexvalid; /* state of rd_indexlist: 0 = not valid, 1 = * valid, 2 = 
temporarily forced */ + bool rd_statvalid; /* state of rd_statlist: true/false */ /* * rd_createSubid is the ID of the highest subtransaction the rel has @@ -136,6 +137,9 @@ typedef struct RelationData Oid rd_pkindex; /* OID of primary key, if any */ Oid rd_replidindex; /* OID of replica identity index, if any */ + /* data managed by RelationGetStatExtList: */ + List *rd_statlist; /* list of OIDs of extended stats */ + /* data managed by RelationGetIndexAttrBitmap: */ Bitmapset *rd_indexattr; /* identifies columns used in indexes */ Bitmapset *rd_keyattr; /* cols that can be ref'd by foreign keys */ diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index da36b67..81af3ae 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -39,6 +39,7 @@ extern void RelationClose(Relation relation); */ extern List *RelationGetFKeyList(Relation relation); extern List *RelationGetIndexList(Relation relation); +extern List *RelationGetStatExtList(Relation relation); extern Oid RelationGetOidIndex(Relation relation); extern Oid RelationGetPrimaryKeyIndex(Relation relation); extern Oid RelationGetReplicaIndex(Relation relation); diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h index 66f60d2..048541e 100644 --- a/src/include/utils/syscache.h +++ b/src/include/utils/syscache.h @@ -86,6 +86,8 @@ enum SysCacheIdentifier PUBLICATIONRELMAP, RULERELNAME, SEQRELID, + STATEXTNAMENSP, + STATEXTOID, STATRELATTINH, SUBSCRIPTIONOID, SUBSCRIPTIONNAME, diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out index 836773f..07b3701 100644 --- a/src/test/regress/expected/object_address.out +++ b/src/test/regress/expected/object_address.out @@ -38,6 +38,7 @@ CREATE TRANSFORM FOR int LANGUAGE SQL ( TO SQL WITH FUNCTION int4recv(internal)); CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable; CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT); 
+CREATE STATISTICS addr_nsp.gentable_stat ON (a,b) FROM addr_nsp.gentable; -- test some error cases SELECT pg_get_object_address('stone', '{}', '{}'); ERROR: unrecognized object type "stone" @@ -399,7 +400,8 @@ WITH objects (type, name, args) AS (VALUES ('access method', '{btree}', '{}'), ('publication', '{addr_pub}', '{}'), ('publication relation', '{addr_nsp, gentable}', '{addr_pub}'), - ('subscription', '{addr_sub}', '{}') + ('subscription', '{addr_sub}', '{}'), + ('statistics', '{addr_nsp, gentable_stat}', '{}') ) SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*, -- test roundtrip through pg_identify_object_as_address @@ -447,6 +449,7 @@ SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*, trigger | | | t on addr_nsp.gentable | t operator family | pg_catalog | integer_ops | pg_catalog.integer_ops USING btree | t policy | | | genpol on addr_nsp.gentable | t + statistics | addr_nsp | gentable_stat | addr_nsp.gentable_stat | t collation | pg_catalog | "default" | pg_catalog."default" | t transform | | | for integer on language sql | t text search dictionary | addr_nsp | addr_ts_dict | addr_nsp.addr_ts_dict | t @@ -456,7 +459,7 @@ SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*, subscription | | addr_sub | addr_sub | t publication | | addr_pub | addr_pub | t publication relation | | | gentable in publication addr_pub | t -(45 rows) +(46 rows) --- --- Cleanup resources diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out index 0bcec13..9a26205 100644 --- a/src/test/regress/expected/opr_sanity.out +++ b/src/test/regress/expected/opr_sanity.out @@ -817,11 +817,12 @@ WHERE c.castmethod = 'b' AND text | character | 0 | i character varying | character | 0 | i pg_node_tree | text | 0 | i + pg_ndistinct | bytea | 0 | i cidr | inet | 0 | i xml | text | 0 | a xml | character varying | 0 | a xml | character | 0 | a -(7 rows) +(8 rows) -- **************** 
pg_conversion **************** -- Look for illegal values in pg_conversion fields. diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index bd13ae6..d4b2158 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2160,6 +2160,14 @@ pg_stats| SELECT n.nspname AS schemaname, JOIN pg_attribute a ON (((c.oid = a.attrelid) AND (a.attnum = s.staattnum)))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE ((NOT a.attisdropped) AND has_column_privilege(c.oid, a.attnum, 'select'::text) AND ((c.relrowsecurity = false) OR (NOT row_security_active(c.oid)))); +pg_stats_ext| SELECT n.nspname AS schemaname, + c.relname AS tablename, + s.staname, + s.stakeys AS attnums, + length((s.standistinct)::text) AS ndistbytes + FROM ((pg_statistic_ext s + JOIN pg_class c ON ((c.oid = s.starelid))) + LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))); pg_tables| SELECT n.nspname AS schemaname, c.relname AS tablename, pg_get_userbyid(c.relowner) AS tableowner, diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index b5eff55..9edba4f 100644 --- a/src/test/regress/expected/sanity_check.out +++ b/src/test/regress/expected/sanity_check.out @@ -142,6 +142,7 @@ pg_shdepend|t pg_shdescription|t pg_shseclabel|t pg_statistic|t +pg_statistic_ext|t pg_subscription|t pg_tablespace|t pg_transform|t diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out new file mode 100644 index 0000000..77ce1f1 --- /dev/null +++ b/src/test/regress/expected/stats_ext.out @@ -0,0 +1,117 @@ +-- data type passed by value +CREATE TABLE ndistinct ( + a INT, + b INT, + c INT, + d INT +); +-- unknown column +CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct; +ERROR: column "unknown_column" referenced in statistics does not exist +-- single column +CREATE STATISTICS s10 ON (a) FROM ndistinct; +ERROR: statistics require at least 2 columns +-- 
single column, duplicated +CREATE STATISTICS s10 ON (a,a) FROM ndistinct; +ERROR: duplicate column name in statistics definition +-- two columns, one duplicated +CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct; +ERROR: duplicate column name in statistics definition +-- correct command +CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct; +-- perfectly correlated groups +INSERT INTO ndistinct + SELECT i/100, i/100, i/100 FROM generate_series(1,10000) s(i); +ANALYZE ndistinct; +SELECT staenabled, standistinct + FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; + staenabled | standistinct +------------+------------------------------------------------------------------------------------- + {d} | [{0, 1, 101.000000}, {0, 2, 101.000000}, {1, 2, 101.000000}, {0, 1, 2, 101.000000}] +(1 row) + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b; + QUERY PLAN +----------------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on ndistinct +(3 rows) + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; + QUERY PLAN +----------------------------- + HashAggregate + Group Key: a, b, c + -> Seq Scan on ndistinct +(3 rows) + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; + QUERY PLAN +----------------------------- + HashAggregate + Group Key: a, b, c, d + -> Seq Scan on ndistinct +(3 rows) + +TRUNCATE TABLE ndistinct; +-- partially correlated groups +INSERT INTO ndistinct + SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i); +ANALYZE ndistinct; +SELECT staenabled, standistinct + FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; + staenabled | standistinct +------------+------------------------------------------------------------------------------------- + {d} | [{0, 1, 201.000000}, {0, 2, 201.000000}, {1, 2, 101.000000}, {0, 1, 2, 201.000000}] +(1 row) + +EXPLAIN + SELECT COUNT(*) FROM ndistinct GROUP BY a, b; + QUERY PLAN 
+--------------------------------------------------------------------- + HashAggregate (cost=230.00..232.01 rows=201 width=16) + Group Key: a, b + -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=8) +(3 rows) + +EXPLAIN + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; + QUERY PLAN +---------------------------------------------------------------------- + HashAggregate (cost=255.00..257.01 rows=201 width=20) + Group Key: a, b, c + -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=12) +(3 rows) + +EXPLAIN + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; + QUERY PLAN +---------------------------------------------------------------------- + HashAggregate (cost=280.00..290.00 rows=1000 width=24) + Group Key: a, b, c, d + -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=16) +(3 rows) + +EXPLAIN + SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; + QUERY PLAN +---------------------------------------------------------------------- + HashAggregate (cost=255.00..265.00 rows=1000 width=20) + Group Key: b, c, d + -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=12) +(3 rows) + +EXPLAIN + SELECT COUNT(*) FROM ndistinct GROUP BY a, d; + QUERY PLAN +--------------------------------------------------------------------- + HashAggregate (cost=230.00..240.00 rows=1000 width=16) + Group Key: a, d + -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=8) +(3 rows) + +DROP TABLE ndistinct; diff --git a/src/test/regress/expected/type_sanity.out b/src/test/regress/expected/type_sanity.out index 8d75bbf..f6b799a 100644 --- a/src/test/regress/expected/type_sanity.out +++ b/src/test/regress/expected/type_sanity.out @@ -59,7 +59,7 @@ WHERE (p1.typtype = 'c' AND p1.typrelid = 0) OR -- Look for types that should have an array type according to their typtype, -- but don't. We exclude composites here because we have not bothered to -- make array types corresponding to the system catalogs' rowtypes. 
--- NOTE: as of v10, this check finds pg_node_tree and smgr. +-- NOTE: as of v10, this check finds pg_node_tree, pg_ndistinct, smgr. SELECT p1.oid, p1.typname FROM pg_type as p1 WHERE p1.typtype not in ('c','d','p') AND p1.typname NOT LIKE E'\\_%' @@ -67,11 +67,12 @@ WHERE p1.typtype not in ('c','d','p') AND p1.typname NOT LIKE E'\\_%' (SELECT 1 FROM pg_type as p2 WHERE p2.typname = ('_' || p1.typname)::name AND p2.typelem = p1.oid and p1.typarray = p2.oid); - oid | typname ------+-------------- - 194 | pg_node_tree - 210 | smgr -(2 rows) + oid | typname +------+-------------- + 194 | pg_node_tree + 3353 | pg_ndistinct + 210 | smgr +(3 rows) -- Make sure typarray points to a varlena array type of our own base SELECT p1.oid, p1.typname as basetype, p2.typname as arraytype, diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 9f38349..a8ebf93 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -89,7 +89,7 @@ test: brin gin gist spgist privileges init_privs security_label collate matview # ---------- # Another group of parallel tests # ---------- -test: alter_generic alter_operator misc psql async dbsize misc_functions sysviews tsrf +test: alter_generic alter_operator misc psql async dbsize misc_functions sysviews tsrf stats_ext # rules cannot run concurrently with any test that creates a view test: rules psql_crosstab amutils diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 2987b24..bff9432 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -128,6 +128,7 @@ test: dbsize test: misc_functions test: sysviews test: tsrf +test: stats_ext test: rules test: psql_crosstab test: select_parallel diff --git a/src/test/regress/sql/object_address.sql b/src/test/regress/sql/object_address.sql index 0ace4dd..4e34185 100644 --- a/src/test/regress/sql/object_address.sql +++ b/src/test/regress/sql/object_address.sql @@ -41,6 +41,7 @@ 
CREATE TRANSFORM FOR int LANGUAGE SQL ( TO SQL WITH FUNCTION int4recv(internal)); CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable; CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT); +CREATE STATISTICS addr_nsp.gentable_stat ON (a,b) FROM addr_nsp.gentable; -- test some error cases SELECT pg_get_object_address('stone', '{}', '{}'); @@ -179,7 +180,8 @@ WITH objects (type, name, args) AS (VALUES ('access method', '{btree}', '{}'), ('publication', '{addr_pub}', '{}'), ('publication relation', '{addr_nsp, gentable}', '{addr_pub}'), - ('subscription', '{addr_sub}', '{}') + ('subscription', '{addr_sub}', '{}'), + ('statistics', '{addr_nsp, gentable_stat}', '{}') ) SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*, -- test roundtrip through pg_identify_object_as_address diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql new file mode 100644 index 0000000..6381157 --- /dev/null +++ b/src/test/regress/sql/stats_ext.sql @@ -0,0 +1,75 @@ +-- Generic extended statistics support +CREATE TABLE ab1 (a int, b int); +CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1; +ALTER TABLE ab1 DROP COLUMN a; +DROP TABLE ab1; + + +-- data type passed by value +CREATE TABLE ndistinct ( + a INT, + b INT, + c INT, + d INT +); + +-- unknown column +CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct; + +-- single column +CREATE STATISTICS s10 ON (a) FROM ndistinct; + +-- single column, duplicated +CREATE STATISTICS s10 ON (a,a) FROM ndistinct; + +-- two columns, one duplicated +CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct; + +-- correct command +CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct; + +-- perfectly correlated groups +INSERT INTO ndistinct + SELECT i/100, i/100, i/100 FROM generate_series(1,10000) s(i); + +ANALYZE ndistinct; + +SELECT staenabled, standistinct + FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM 
ndistinct GROUP BY a, b; + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; + +TRUNCATE TABLE ndistinct; + +-- partially correlated groups +INSERT INTO ndistinct + SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i); + +ANALYZE ndistinct; + +SELECT staenabled, standistinct + FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; + +EXPLAIN + SELECT COUNT(*) FROM ndistinct GROUP BY a, b; + +EXPLAIN + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; + +EXPLAIN + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; + +EXPLAIN + SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; + +EXPLAIN + SELECT COUNT(*) FROM ndistinct GROUP BY a, d; + +DROP TABLE ndistinct; diff --git a/src/test/regress/sql/type_sanity.sql b/src/test/regress/sql/type_sanity.sql index 0a31249..4c65814 100644 --- a/src/test/regress/sql/type_sanity.sql +++ b/src/test/regress/sql/type_sanity.sql @@ -53,7 +53,7 @@ WHERE (p1.typtype = 'c' AND p1.typrelid = 0) OR -- Look for types that should have an array type according to their typtype, -- but don't. We exclude composites here because we have not bothered to -- make array types corresponding to the system catalogs' rowtypes. --- NOTE: as of v10, this check finds pg_node_tree and smgr. +-- NOTE: as of v10, this check finds pg_node_tree, pg_ndistinct, smgr. SELECT p1.oid, p1.typname FROM pg_type as p1