Re: pgsql: Extended statistics on expressions

From: David Rowley <dgrowleyml(at)gmail(dot)com>
To: Tomas Vondra <tomas(dot)vondra(at)postgresql(dot)org>, Tomas Vondra <tv(at)fuzzy(dot)cz>
Cc: pgsql-committers(at)lists(dot)postgresql(dot)org
Subject: Re: pgsql: Extended statistics on expressions
Date: 2021-03-31 08:05:43
Message-ID: CAApHDvpYT10-nkSp8xXe-nbO3jmoaRyRFHbzh-RWMfAJynqgpQ@mail.gmail.com
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-committers

Hi Tomas,

I'm debugging a crash after running sqllancer on current master. The
first bad commit seems to be this one.

The crash stack trace is:

Program received signal SIGSEGV, Segmentation fault.
pg_detoast_datum_packed (datum=0x5) at fmgr.c:1759
1759 if (VARATT_IS_COMPRESSED(datum) || VARATT_IS_EXTERNAL(datum))
(gdb) bt
#0 pg_detoast_datum_packed (datum=0x5) at fmgr.c:1759
#1 0x000055e460c99da2 in textlike (fcinfo=0x7fff1bbc39e0) at like.c:287
#2 0x000055e460d44dde in FunctionCall2Coll
(flinfo=flinfo(at)entry=0x7fff1bbc3ae0, collation=<optimized out>,
arg1=<optimized out>, arg2=<optimized out>) at fmgr.c:1163
#3 0x000055e460bd6dad in mcv_get_match_bitmap (clauses=<optimized
out>, keys=0x55e4614627e8, exprs=0x0, mcvlist=<optimized out>,
is_or=<optimized out>, root=<optimized out>) at mcv.c:1706
#4 0x000055e460bd965d in mcv_clauselist_selectivity
(root=root(at)entry=0x55e46145fca0, stat=stat(at)entry=0x55e461462790,
clauses=clauses(at)entry=0x55e46144cf08, varRelid=varRelid(at)entry=0,
jointype=jointype(at)entry=JOIN_INNER,
sjinfo=sjinfo(at)entry=0x0, rel=0x55e461461f18,
basesel=0x7fff1bbc3c88, totalsel=0x7fff1bbc3c90) at mcv.c:2043
#5 0x000055e460bd6364 in statext_mcv_clauselist_selectivity
(is_or=<optimized out>, estimatedclauses=<optimized out>,
rel=<optimized out>, sjinfo=<optimized out>, jointype=<optimized out>,
varRelid=<optimized out>,
clauses=<optimized out>, root=<optimized out>) at extended_stats.c:1916
#6 statext_clauselist_selectivity (root=root(at)entry=0x55e46145fca0,
clauses=clauses(at)entry=0x55e461462ad0, varRelid=varRelid(at)entry=0,
jointype=jointype(at)entry=JOIN_INNER, sjinfo=sjinfo(at)entry=0x0,
rel=0x55e461461f18,
estimatedclauses=0x7fff1bbc3d28, is_or=false) at extended_stats.c:1948
#7 0x000055e460b0e973 in clauselist_selectivity_ext
(root=root(at)entry=0x55e46145fca0, clauses=0x55e461462ad0,
varRelid=varRelid(at)entry=0, jointype=jointype(at)entry=JOIN_INNER,
sjinfo=0x0, use_extended_stats=true) at clausesel.c:155
#8 0x000055e460b0e346 in clause_selectivity_ext
(root=root(at)entry=0x55e46145fca0, clause=0x55e46144c8f8,
varRelid=varRelid(at)entry=0, jointype=jointype(at)entry=JOIN_INNER,
sjinfo=sjinfo(at)entry=0x0,
use_extended_stats=use_extended_stats(at)entry=true) at clausesel.c:838
#9 0x000055e460b0e1a4 in clauselist_selectivity_or
(use_extended_stats=<optimized out>, sjinfo=<optimized out>,
jointype=JOIN_INNER, varRelid=0, clauses=0x55e461462cc0,
root=0x55e46145fca0) at clausesel.c:414
#10 clause_selectivity_ext (root=0x55e46145fca0, clause=<optimized
out>, varRelid=0, jointype=JOIN_INNER, sjinfo=<optimized out>,
use_extended_stats=use_extended_stats(at)entry=true) at clausesel.c:851
#11 0x000055e460b0e863 in clauselist_selectivity_ext
(root=root(at)entry=0x55e46145fca0, clauses=0x55e46144cd28,
varRelid=varRelid(at)entry=0, jointype=jointype(at)entry=JOIN_INNER,
sjinfo=sjinfo(at)entry=0x0,
use_extended_stats=use_extended_stats(at)entry=true) at
../../../../src/include/nodes/pg_list.h:260
#12 0x000055e460b0eacf in clauselist_selectivity
(root=root(at)entry=0x55e46145fca0, clauses=<optimized out>,
varRelid=varRelid(at)entry=0, jointype=jointype(at)entry=JOIN_INNER,
sjinfo=sjinfo(at)entry=0x0) at clausesel.c:108
#13 0x000055e460b16992 in set_baserel_size_estimates
(root=root(at)entry=0x55e46145fca0, rel=rel(at)entry=0x55e461461f18) at
costsize.c:4753
#14 0x000055e460b0b971 in set_plain_rel_size (rte=<optimized out>,
rel=0x55e461461f18, root=0x55e46145fca0) at allpaths.c:583
#15 set_rel_size (root=0x55e46145fca0, rel=0x55e461461f18, rti=1,
rte=0x55e4614220a8) at allpaths.c:412
#16 0x000055e460b0d8c0 in set_base_rel_sizes (root=<optimized out>) at
allpaths.c:323

I'm struggling a bit to reproduce this consistently.

If you do:

psql -c "CREATE ROLE sqlancer;" postgres
created test;
psql -f database0.sql test

then run the following set of commands:

insert into t1 values(1234,false),(5678,true);
analyze;
delete from t1;
SELECT t1.c0 FROM ONLY t1 WHERE
((((((((upper('%|1j]<#^j\???u,???D'))LIKE((((('h???x')||(178227136)))||((('(-2073106655,58629343]'::int4range)*('(-1903439243,-1774128827)'::int4range)))))))AND((((t1.c0)
IN (-91026522, 0.6000845835151706))OR(t1.c1)))))OR(t1.c1)))OR(CAST(t1.c1
AS BOOLEAN)));

It seems to be a corrupt Datum in the 2nd argument to textlike().

(gdb) frame 3
#3 0x000055e460bd6dad in mcv_get_match_bitmap (clauses=<optimized
out>, keys=0x55e461480148, exprs=0x0, mcvlist=<optimized out>,
is_or=<optimized out>, root=<optimized out>) at mcv.c:1706
1706 match = DatumGetBool(FunctionCall2Coll(&opproc,
(gdb) p i
$7 = 0
(gdb) p idx
$8 = 2

I've not studied the code in great detail, but per above, idx == 2 and
you're doing item->values[idx]. Isn't the index of 2 out of bounds of
the most_common_vals column below?

test1=# select most_common_vals from pg_stats_ext;
most_common_vals
---------------------
{{1234,f},{5678,t}}
(1 row)

The code that sets idx to 2 (mcv_match_expression()) looks a bit
weird. These stats don't have any exprs and the "expr" past to
mcv_match_expression is an OpExpr. The function just goes and sets
idx = bms_num_members(keys); then does 0 loops due to the empty
"exprs" and returns 2. The Assert() does not help catch not finding
anything since it checks idx >= bms_num_members(keys), which is 2.

I'm not quite sure how this is all meant to work. Are you able to look further?

David

On Sat, 27 Mar 2021 at 13:14, Tomas Vondra <tomas(dot)vondra(at)postgresql(dot)org> wrote:
>
> Extended statistics on expressions
>
> Allow defining extended statistics on expressions, not just just on
> simple column references. With this commit, expressions are supported
> by all existing extended statistics kinds, improving the same types of
> estimates. A simple example may look like this:
>
> CREATE TABLE t (a int);
> CREATE STATISTICS s ON mod(a,10), mod(a,20) FROM t;
> ANALYZE t;
>
> The collected statistics are useful e.g. to estimate queries with those
> expressions in WHERE or GROUP BY clauses:
>
> SELECT * FROM t WHERE mod(a,10) = 0 AND mod(a,20) = 0;
>
> SELECT 1 FROM t GROUP BY mod(a,10), mod(a,20);
>
> This introduces new internal statistics kind 'e' (expressions) which is
> built automatically when the statistics object definition includes any
> expressions. This represents single-expression statistics, as if there
> was an expression index (but without the index maintenance overhead).
> The statistics is stored in pg_statistics_ext_data as an array of
> composite types, which is possible thanks to 79f6a942bd.
>
> CREATE STATISTICS allows building statistics on a single expression, in
> which case in which case it's not possible to specify statistics kinds.
>
> A new system view pg_stats_ext_exprs can be used to display expression
> statistics, similarly to pg_stats and pg_stats_ext views.
>
> ALTER TABLE ... ALTER COLUMN ... TYPE now treats indexes the same way it
> treats indexes, i.e. it drops and recreates the statistics. This means
> all statistics are reset, and we no longer try to preserve at least the
> functional dependencies. This should not be a major issue in practice,
> as the functional dependencies actually rely on per-column statistics,
> which were always reset anyway.
>
> Author: Tomas Vondra
> Reviewed-by: Justin Pryzby, Dean Rasheed, Zhihong Yu
> Discussion: https://postgr.es/m/ad7891d2-e90c-b446-9fe2-7419143847d7%40enterprisedb.com
>
> Branch
> ------
> master
>
> Details
> -------
> https://git.postgresql.org/pg/commitdiff/a4d75c86bf15220df22de0a92c819ecef9db3849
>
> Modified Files
> --------------
> doc/src/sgml/catalogs.sgml | 295 +++-
> doc/src/sgml/ref/create_statistics.sgml | 116 +-
> src/backend/catalog/Makefile | 8 +-
> src/backend/catalog/system_views.sql | 69 +
> src/backend/commands/statscmds.c | 437 +++---
> src/backend/commands/tablecmds.c | 104 +-
> src/backend/nodes/copyfuncs.c | 14 +
> src/backend/nodes/equalfuncs.c | 13 +
> src/backend/nodes/outfuncs.c | 12 +
> src/backend/optimizer/util/plancat.c | 62 +
> src/backend/parser/gram.y | 38 +-
> src/backend/parser/parse_agg.c | 10 +
> src/backend/parser/parse_expr.c | 6 +
> src/backend/parser/parse_func.c | 3 +
> src/backend/parser/parse_utilcmd.c | 125 +-
> src/backend/statistics/dependencies.c | 616 +++++++-
> src/backend/statistics/extended_stats.c | 1253 ++++++++++++++--
> src/backend/statistics/mcv.c | 369 +++--
> src/backend/statistics/mvdistinct.c | 96 +-
> src/backend/tcop/utility.c | 29 +-
> src/backend/utils/adt/ruleutils.c | 281 +++-
> src/backend/utils/adt/selfuncs.c | 413 +++++-
> src/bin/pg_dump/t/002_pg_dump.pl | 12 +
> src/bin/psql/describe.c | 130 +-
> src/include/catalog/catversion.h | 2 +-
> src/include/catalog/pg_proc.dat | 8 +
> src/include/catalog/pg_statistic_ext.h | 4 +
> src/include/catalog/pg_statistic_ext_data.h | 1 +
> src/include/commands/defrem.h | 4 +-
> src/include/nodes/nodes.h | 1 +
> src/include/nodes/parsenodes.h | 19 +-
> src/include/nodes/pathnodes.h | 1 +
> src/include/parser/parse_node.h | 1 +
> src/include/parser/parse_utilcmd.h | 2 +
> src/include/statistics/extended_stats_internal.h | 32 +-
> src/include/statistics/statistics.h | 5 +-
> src/include/utils/ruleutils.h | 2 +
> src/test/regress/expected/create_table_like.out | 20 +-
> src/test/regress/expected/oidjoins.out | 10 +-
> src/test/regress/expected/rules.out | 73 +
> src/test/regress/expected/stats_ext.out | 1658 +++++++++++++++++++---
> src/test/regress/sql/create_table_like.sql | 2 +
> src/test/regress/sql/stats_ext.sql | 589 +++++++-
> 43 files changed, 5982 insertions(+), 963 deletions(-)
>

Attachment Content-Type Size
database0.sql text/plain 1.4 KB

In response to

Responses

Browse pgsql-committers by date

  From Date Subject
Next Message Peter Eisentraut 2021-03-31 09:22:05 pgsql: Add p_names field to ParseNamespaceItem
Previous Message Peter Eisentraut 2021-03-31 07:21:43 pgsql: Add errhint_plural() function and make use of it