[PATCH] Never convert n_distinct < 2 values to a ratio when computing stats

From: Dan McGee <dan(at)archlinux(dot)org>
To: pgsql-hackers(at)postgresql(dot)org
Subject: [PATCH] Never convert n_distinct < 2 values to a ratio when computing stats
Date: 2012-03-24 04:17:18
Message-ID: 1332562638-11675-1-git-send-email-dan@archlinux.org
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-hackers

This is a bit of a corner case in all honesty, but if you have a short
table (under 20 rows), the 10% heuristic used that decides whether
distinct values scale with the row count will result in rather odd
values for stadistinct in pg_statistic, such as '-0.2' or '-0.666667',
rather than the expected '2'. Additionally, this can cause only one of
{t, f} to appear in the most common values array.

Does this actually affect query planning in any way? Probably not, but
it is extremely odd to look at pg_stats for these columns, and the
solution seems easy.
---

The only other minor changes included here were to make it clear when we were
comparing float values, so use 0.0 instead of 0.

Example stats output from the database I noticed this on:

archweb=# SELECT c.relname, a.attname, pg_stat_get_live_tuples(c.oid) AS n_live_tup, stadistinct, stanullfrac, stawidth, stavalues1, stanumbers1 FROM pg_statistic s JOIN pg_class c ON c.oid = s.starelid JOIN pg_attribute a ON c.oid = a.attrelid AND a.attnum = s.staattnum LEFT JOIN pg_namespace n ON n.oid = c.relnamespace JOIN pg_type t ON t.oid = a.atttypid WHERE NOT a.attisdropped AND nspname = 'public' AND t.typname = 'bool' ORDER BY stadistinct, n_live_tup;
relname | attname | n_live_tup | stadistinct | stanullfrac | stawidth | stavalues1 | stanumbers1
-------------------------------+---------------+------------+-------------+-------------+----------+------------+-----------------------
mirrors_mirrorprotocol | is_download | 3 | -0.666667 | 0 | 1 | {t} | {0.666667}
arches | agnostic | 3 | -0.666667 | 0 | 1 | {f} | {0.666667}
repos | staging | 10 | -0.2 | 0 | 1 | {f,t} | {0.7,0.3}
repos | testing | 10 | -0.2 | 0 | 1 | {f,t} | {0.7,0.3}
devel_pgpsignature | valid | 264 | 1 | 0 | 1 | {t} | {1}
packages_flagrequest | is_spam | 415 | 1 | 0 | 1 | {f} | {1}
donors | visible | 716 | 1 | 0 | 1 | {t} | {1}
auth_user | is_superuser | 95 | 2 | 0 | 1 | {f,t} | {0.957895,0.0421053}
user_profiles | notify | 95 | 2 | 0 | 1 | {t,f} | {0.957895,0.0421053}
auth_user | is_active | 95 | 2 | 0 | 1 | {t,f} | {0.621053,0.378947}
auth_user | is_staff | 95 | 2 | 0 | 1 | {f,t} | {0.873684,0.126316}
releng_iso | active | 158 | 2 | 0 | 1 | {f,t} | {0.893333,0.106667}
mirrors_mirror | isos | 180 | 2 | 0 | 1 | {t,f} | {0.972678,0.0273224}
mirrors_mirror | active | 180 | 2 | 0 | 1 | {t,f} | {0.672131,0.327869}
mirrors_mirror | public | 180 | 2 | 0 | 1 | {t,f} | {0.978142,0.0218579}
mirrors_mirrorurl | has_ipv6 | 379 | 2 | 0 | 1 | {f,t} | {0.709763,0.290237}
mirrors_mirrorurl | has_ipv4 | 379 | 2 | 0 | 1 | {t} | {0.997361}
packages_flagrequest | is_legitimate | 415 | 2 | 0 | 1 | {t,f} | {0.992754,0.00724638}
packages_signoffspecification | enabled | 1130 | 2 | 0 | 1 | {t,f} | {0.977578,0.0224215}
packages_signoffspecification | known_bad | 1130 | 2 | 0 | 1 | {f,t} | {0.993722,0.00627803}
mirrors_mirrorlog | is_success | 12715 | 2 | 0 | 1 | {t,f} | {0.953345,0.0466552}
package_depends | optional | 28592 | 2 | 0 | 1 | {f,t} | {0.880322,0.119678}
package_files | is_directory | 225084 | 2 | 0 | 1 | {f,t} | {0.829933,0.170067}
(23 rows)

src/backend/commands/analyze.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 9cd6e67..995ed9d 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2110,7 +2110,7 @@ compute_minimal_stats(VacAttrStatsP stats,
* least 2 instances in the sample.
*/
if (track_cnt < track_max && toowide_cnt == 0 &&
- stats->stadistinct > 0 &&
+ stats->stadistinct > 0.0 &&
track_cnt <= num_mcv)
{
/* Track list includes all values seen, and all will fit */
@@ -2122,7 +2122,7 @@ compute_minimal_stats(VacAttrStatsP stats,
double avgcount,
mincount;

- if (ndistinct < 0)
+ if (ndistinct < 0.0)
ndistinct = -ndistinct * totalrows;
/* estimate # of occurrences in sample of a typical value */
avgcount = (double) samplerows / ndistinct;
@@ -2434,12 +2434,12 @@ compute_scalar_stats(VacAttrStatsP stats,
}

/*
- * If we estimated the number of distinct values at more than 10% of
- * the total row count (a very arbitrary limit), then assume that
- * stadistinct should scale with the row count rather than be a fixed
- * value.
+ * If we estimated the number of distinct values at more than 2 total
+ * values (a boolean) and more than 10% of the total row count (a very
+ * arbitrary limit), then assume that stadistinct should scale with
+ * the row count rather than be a fixed value.
*/
- if (stats->stadistinct > 0.1 * totalrows)
+ if (stats->stadistinct > 2.0 && stats->stadistinct > 0.1 * totalrows)
stats->stadistinct = -(stats->stadistinct / totalrows);

/*
@@ -2457,7 +2457,7 @@ compute_scalar_stats(VacAttrStatsP stats,
* but we prefer to treat such values as MCVs if at all possible.)
*/
if (track_cnt == ndistinct && toowide_cnt == 0 &&
- stats->stadistinct > 0 &&
+ stats->stadistinct > 0.0 &&
track_cnt <= num_mcv)
{
/* Track list includes all values seen, and all will fit */
@@ -2470,7 +2470,7 @@ compute_scalar_stats(VacAttrStatsP stats,
mincount,
maxmincount;

- if (ndistinct < 0)
+ if (ndistinct < 0.0)
ndistinct = -ndistinct * totalrows;
/* estimate # of occurrences in sample of a typical value */
avgcount = (double) samplerows / ndistinct;
--
1.7.9.4

Responses

Browse pgsql-hackers by date

  From Date Subject
Next Message Qi Huang 2012-03-24 04:20:26 Re: Gsoc2012 Idea --- Social Network database schema
Previous Message Marko Kreen 2012-03-24 00:22:24 Re: Speed dblink using alternate libpq tuple storage