diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c index 9863e32748..3cdb839489 100644 --- a/contrib/file_fdw/file_fdw.c +++ b/contrib/file_fdw/file_fdw.c @@ -159,6 +159,7 @@ static void estimate_costs(PlannerInfo *root, RelOptInfo *baserel, FileFdwPlanState *fdw_private, Cost *startup_cost, Cost *total_cost); static int file_acquire_sample_rows(Relation onerel, int elevel, + int natts, VacAttrStats **stats, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows); @@ -1093,6 +1094,7 @@ estimate_costs(PlannerInfo *root, RelOptInfo *baserel, */ static int file_acquire_sample_rows(Relation onerel, int elevel, + int natts, VacAttrStats **stats, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows) { diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index b6c72e1d1e..471970601a 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -466,6 +466,7 @@ static void process_query_params(ExprContext *econtext, List *param_exprs, const char **param_values); static int postgresAcquireSampleRowsFunc(Relation relation, int elevel, + int natts, VacAttrStats **stats, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows); @@ -4483,6 +4484,7 @@ postgresAnalyzeForeignTable(Relation relation, */ static int postgresAcquireSampleRowsFunc(Relation relation, int elevel, + int natts, VacAttrStats **stats, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 3eea215b85..b2f31626b8 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -33,6 +33,7 @@ #include "catalog/storage.h" #include "catalog/storage_xlog.h" #include "commands/progress.h" +#include "commands/vacuum.h" #include "executor/executor.h" #include "miscadmin.h" #include "pgstat.h" @@ -44,6 +45,7 @@ #include "storage/smgr.h" #include "utils/builtins.h" #include "utils/rel.h" +#include "utils/sampling.h" static void reform_and_rewrite_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap, @@ -1154,6 +1156,151 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, return false; } +/* + * As of May 2004 we use a new two-stage method: Stage one selects up + * to targrows random blocks (or all blocks, if there aren't so many). + * Stage two scans these blocks and uses the Vitter algorithm to create + * a random sample of targrows rows (or less, if there are less in the + * sample of blocks). The two stages are executed simultaneously: each + * block is processed as soon as stage one returns its number and while + * the rows are read stage two controls which ones are to be inserted + * into the sample. + * + * Although every row has an equal chance of ending up in the final + * sample, this sampling method is not perfect: not every possible + * sample has an equal chance of being selected. For large relations + * the number of different blocks represented by the sample tends to be + * too small. We can live with that for now. Improvements are welcome. + * + * An important property of this sampling method is that because we do + * look at a statistically unbiased set of blocks, we should get + * unbiased estimates of the average numbers of live and dead rows per + * block. The previous sampling method put too much credence in the row + * density near the start of the table. 
+ */ +static int +heapam_acquire_sample_rows(TableScanDesc scan, int elevel, + int natts, VacAttrStats **stats, + BufferAccessStrategy bstrategy, + HeapTuple *rows, int targrows, + double *totalrows, double *totaldeadrows) +{ + int numrows = 0; /* # rows now in reservoir */ + double samplerows = 0; /* total # rows collected */ + double liverows = 0; /* # live rows seen */ + double deadrows = 0; /* # dead rows seen */ + double rowstoskip = -1; /* -1 means not set yet */ + BlockNumber totalblocks; + TransactionId OldestXmin; + BlockSamplerData bs; + ReservoirStateData rstate; + TupleTableSlot *slot; + BlockNumber nblocks; + + totalblocks = RelationGetNumberOfBlocks(scan->rs_rd); + + /* Need a cutoff xmin for HeapTupleSatisfiesVacuum */ + OldestXmin = GetOldestNonRemovableTransactionId(scan->rs_rd); + + /* Prepare for sampling block numbers */ + nblocks = BlockSampler_Init(&bs, totalblocks, targrows, random()); + + /* Prepare for sampling rows */ + reservoir_init_selection_state(&rstate, targrows); + + slot = table_slot_create(scan->rs_rd, NULL); + + /* Outer loop over blocks to sample */ + while (BlockSampler_HasMore(&bs)) + { + BlockNumber targblock = BlockSampler_Next(&bs); + + vacuum_delay_point(); + + if (!heapam_scan_analyze_next_block(scan, targblock, bstrategy)) + continue; + + while (heapam_scan_analyze_next_tuple(scan, OldestXmin, &liverows, &deadrows, slot)) + { + /* + * The first targrows sample rows are simply copied into the + * reservoir. Then we start replacing tuples in the sample until + * we reach the end of the relation. This algorithm is from Jeff + * Vitter's paper (see full citation in utils/misc/sampling.c). It + * works by repeatedly computing the number of tuples to skip + * before selecting a tuple, which replaces a randomly chosen + * element of the reservoir (current set of tuples). At all times + * the reservoir is a true random sample of the tuples we've + * passed over so far, so when we fall off the end of the relation + * we're done. + */ + if (numrows < targrows) + rows[numrows++] = ExecCopySlotHeapTuple(slot); + else + { + /* + * t in Vitter's paper is the number of records already + * processed. If we need to compute a new S value, we must + * use the not-yet-incremented value of samplerows as t. + */ + if (rowstoskip < 0) + rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows); + + if (rowstoskip <= 0) + { + /* + * Found a suitable tuple, so save it, replacing one old + * tuple at random + */ + int k = (int) (targrows * sampler_random_fract(rstate.randstate)); + + Assert(k >= 0 && k < targrows); + heap_freetuple(rows[k]); + rows[k] = ExecCopySlotHeapTuple(slot); + } + + rowstoskip -= 1; + } + + samplerows += 1; + } + } + + ExecDropSingleTupleTableSlot(slot); + + /* + * Estimate total numbers of live and dead rows in relation, extrapolating + * on the assumption that the average tuple density in pages we didn't + * scan is the same as in the pages we did scan. Since what we scanned is + * a random sample of the pages in the relation, this should be a good + * assumption. 
+ */ + if (bs.m > 0) + { + *totalrows = floor((liverows / bs.m) * totalblocks + 0.5); + *totaldeadrows = floor((deadrows / bs.m) * totalblocks + 0.5); + } + else + { + *totalrows = 0.0; + *totaldeadrows = 0.0; + } + + /* + * Emit some interesting relation info + */ + ereport(elevel, + (errmsg("\"%s\": scanned %d of %u pages, " + "containing %.0f live rows and %.0f dead rows; " + "%d rows in sample, %.0f estimated total rows", + RelationGetRelationName(scan->rs_rd), + bs.m, totalblocks, + liverows, deadrows, + numrows, *totalrows))); + + return numrows; +} + static double heapam_index_build_range_scan(Relation heapRelation, Relation indexRelation, @@ -1657,13 +1804,13 @@ heapam_index_build_range_scan(Relation heapRelation, offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self); /* - * If a HOT tuple points to a root that we don't know - * about, obtain root items afresh. If that still fails, - * report it as corruption. + * If a HOT tuple points to a root that we don't know about, + * obtain root items afresh. If that still fails, report it as + * corruption. */ if (root_offsets[offnum - 1] == InvalidOffsetNumber) { - Page page = BufferGetPage(hscan->rs_cbuf); + Page page = BufferGetPage(hscan->rs_cbuf); LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); heap_get_root_tuples(page, root_offsets); @@ -2569,8 +2716,7 @@ static const TableAmRoutine heapam_methods = { .relation_copy_data = heapam_relation_copy_data, .relation_copy_for_cluster = heapam_relation_copy_for_cluster, .relation_vacuum = heap_vacuum_rel, - .scan_analyze_next_block = heapam_scan_analyze_next_block, - .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple, + .acquire_sample_rows = heapam_acquire_sample_rows, .index_build_range_scan = heapam_index_build_range_scan, .index_validate_scan = heapam_index_validate_scan, diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index 58de0743ba..0a981e20e4 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -87,8 +87,7 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->relation_copy_data != NULL); Assert(routine->relation_copy_for_cluster != NULL); Assert(routine->relation_vacuum != NULL); - Assert(routine->scan_analyze_next_block != NULL); - Assert(routine->scan_analyze_next_tuple != NULL); + Assert(routine->acquire_sample_rows != NULL); Assert(routine->index_build_range_scan != NULL); Assert(routine->index_validate_scan != NULL); diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 8af12b5c6b..e6476d7047 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -37,7 +37,6 @@ #include "commands/dbcommands.h" #include "commands/progress.h" #include "commands/tablecmds.h" -#include "commands/vacuum.h" #include "executor/executor.h" #include "foreign/fdwapi.h" #include "miscadmin.h" @@ -61,7 +60,6 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_rusage.h" -#include "utils/sampling.h" #include "utils/sortsupport.h" #include "utils/syscache.h" #include "utils/timestamp.h" @@ -96,10 +94,12 @@ static void compute_index_stats(Relation onerel, double totalrows, static VacAttrStats *examine_attribute(Relation onerel, int attnum, Node *index_expr); static int acquire_sample_rows(Relation onerel, int elevel, + int natts, VacAttrStats **stats, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows); static int compare_rows(const void *a, const void *b); static int acquire_inherited_sample_rows(Relation onerel, 
int elevel, + int natts, VacAttrStats **stats, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows); static void update_attstats(Oid relid, bool inh, @@ -505,10 +505,12 @@ do_analyze_rel(Relation onerel, VacuumParams *params, PROGRESS_ANALYZE_PHASE_ACQUIRE_SAMPLE_ROWS); if (inh) numrows = acquire_inherited_sample_rows(onerel, elevel, + attr_cnt, vacattrstats, rows, targrows, &totalrows, &totaldeadrows); else numrows = (*acquirefunc) (onerel, elevel, + attr_cnt, vacattrstats, rows, targrows, &totalrows, &totaldeadrows); @@ -999,127 +1001,26 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr) * * The returned list of tuples is in order by physical position in the table. * (We will rely on this later to derive correlation estimates.) - * - * As of May 2004 we use a new two-stage method: Stage one selects up - * to targrows random blocks (or all blocks, if there aren't so many). - * Stage two scans these blocks and uses the Vitter algorithm to create - * a random sample of targrows rows (or less, if there are less in the - * sample of blocks). The two stages are executed simultaneously: each - * block is processed as soon as stage one returns its number and while - * the rows are read stage two controls which ones are to be inserted - * into the sample. - * - * Although every row has an equal chance of ending up in the final - * sample, this sampling method is not perfect: not every possible - * sample has an equal chance of being selected. For large relations - * the number of different blocks represented by the sample tends to be - * too small. We can live with that for now. Improvements are welcome. - * - * An important property of this sampling method is that because we do - * look at a statistically unbiased set of blocks, we should get - * unbiased estimates of the average numbers of live and dead rows per - * block. The previous sampling method put too much credence in the row - * density near the start of the table. 
*/ static int acquire_sample_rows(Relation onerel, int elevel, + int natts, VacAttrStats **stats, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows) { int numrows = 0; /* # rows now in reservoir */ - double samplerows = 0; /* total # rows collected */ - double liverows = 0; /* # live rows seen */ - double deadrows = 0; /* # dead rows seen */ - double rowstoskip = -1; /* -1 means not set yet */ - BlockNumber totalblocks; - TransactionId OldestXmin; - BlockSamplerData bs; - ReservoirStateData rstate; - TupleTableSlot *slot; TableScanDesc scan; - BlockNumber nblocks; - BlockNumber blksdone = 0; Assert(targrows > 0); - totalblocks = RelationGetNumberOfBlocks(onerel); - - /* Need a cutoff xmin for HeapTupleSatisfiesVacuum */ - OldestXmin = GetOldestNonRemovableTransactionId(onerel); - - /* Prepare for sampling block numbers */ - nblocks = BlockSampler_Init(&bs, totalblocks, targrows, random()); - - /* Report sampling block numbers */ - pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_TOTAL, - nblocks); - - /* Prepare for sampling rows */ - reservoir_init_selection_state(&rstate, targrows); - scan = table_beginscan_analyze(onerel); - slot = table_slot_create(onerel, NULL); - /* Outer loop over blocks to sample */ - while (BlockSampler_HasMore(&bs)) - { - BlockNumber targblock = BlockSampler_Next(&bs); + numrows = table_acquire_sample_rows(scan, elevel, + natts, stats, + vac_strategy, rows, + targrows, totalrows, + totaldeadrows); - vacuum_delay_point(); - - if (!table_scan_analyze_next_block(scan, targblock, vac_strategy)) - continue; - - while (table_scan_analyze_next_tuple(scan, OldestXmin, &liverows, &deadrows, slot)) - { - /* - * The first targrows sample rows are simply copied into the - * reservoir. Then we start replacing tuples in the sample until - * we reach the end of the relation. This algorithm is from Jeff - * Vitter's paper (see full citation in utils/misc/sampling.c). It - * works by repeatedly computing the number of tuples to skip - * before selecting a tuple, which replaces a randomly chosen - * element of the reservoir (current set of tuples). At all times - * the reservoir is a true random sample of the tuples we've - * passed over so far, so when we fall off the end of the relation - * we're done. - */ - if (numrows < targrows) - rows[numrows++] = ExecCopySlotHeapTuple(slot); - else - { - /* - * t in Vitter's paper is the number of records already - * processed. If we need to compute a new S value, we must - * use the not-yet-incremented value of samplerows as t. - */ - if (rowstoskip < 0) - rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows); - - if (rowstoskip <= 0) - { - /* - * Found a suitable tuple, so save it, replacing one old - * tuple at random - */ - int k = (int) (targrows * sampler_random_fract(rstate.randstate)); - - Assert(k >= 0 && k < targrows); - heap_freetuple(rows[k]); - rows[k] = ExecCopySlotHeapTuple(slot); - } - - rowstoskip -= 1; - } - - samplerows += 1; - } - - pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_DONE, - ++blksdone); - } - - ExecDropSingleTupleTableSlot(slot); table_endscan(scan); /* @@ -1133,36 +1034,6 @@ acquire_sample_rows(Relation onerel, int elevel, if (numrows == targrows) qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows); - /* - * Estimate total numbers of live and dead rows in relation, extrapolating - * on the assumption that the average tuple density in pages we didn't - * scan is the same as in the pages we did scan. 
Since what we scanned is - * a random sample of the pages in the relation, this should be a good - * assumption. - */ - if (bs.m > 0) - { - *totalrows = floor((liverows / bs.m) * totalblocks + 0.5); - *totaldeadrows = floor((deadrows / bs.m) * totalblocks + 0.5); - } - else - { - *totalrows = 0.0; - *totaldeadrows = 0.0; - } - - /* - * Emit some interesting relation info - */ - ereport(elevel, - (errmsg("\"%s\": scanned %d of %u pages, " - "containing %.0f live rows and %.0f dead rows; " - "%d rows in sample, %.0f estimated total rows", - RelationGetRelationName(onerel), - bs.m, totalblocks, - liverows, deadrows, - numrows, *totalrows))); - return numrows; } @@ -1201,6 +1072,7 @@ compare_rows(const void *a, const void *b) */ static int acquire_inherited_sample_rows(Relation onerel, int elevel, + int natts, VacAttrStats **stats, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows) { @@ -1374,6 +1246,7 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, /* Fetch a random sample of the child's rows */ childrows = (*acquirefunc) (childrel, elevel, + natts, stats, rows + numrows, childtargrows, &trows, &tdrows); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 387eb34a61..f9359e27ed 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -20,6 +20,7 @@ #include "access/relscan.h" #include "access/sdir.h" #include "access/xact.h" +#include "commands/vacuum.h" #include "utils/guc.h" #include "utils/rel.h" #include "utils/snapshot.h" @@ -502,39 +503,18 @@ typedef struct TableAmRoutine BufferAccessStrategy bstrategy); /* - * Prepare to analyze block `blockno` of `scan`. The scan has been started - * with table_beginscan_analyze(). See also - * table_scan_analyze_next_block(). - * - * The callback may acquire resources like locks that are held until - * table_scan_analyze_next_tuple() returns false. It e.g. can make sense - * to hold a lock until all tuples on a block have been analyzed by - * scan_analyze_next_tuple. - * - * The callback can return false if the block is not suitable for - * sampling, e.g. because it's a metapage that could never contain tuples. - * - * XXX: This obviously is primarily suited for block-based AMs. It's not - * clear what a good interface for non block based AMs would be, so there - * isn't one yet. + * This callback should fill the `rows` reservoir with sample rows during + * an ANALYZE scan, returning the number of rows actually collected and + * setting *totalrows and *totaldeadrows to estimated totals for the + * relation. */ - bool (*scan_analyze_next_block) (TableScanDesc scan, - BlockNumber blockno, - BufferAccessStrategy bstrategy); - - /* - * See table_scan_analyze_next_tuple(). - * - * Not every AM might have a meaningful concept of dead rows, in which - * case it's OK to not increment *deadrows - but note that that may - * influence autovacuum scheduling (see comment for relation_vacuum - * callback). - */ - bool (*scan_analyze_next_tuple) (TableScanDesc scan, - TransactionId OldestXmin, - double *liverows, - double *deadrows, - TupleTableSlot *slot); + int (*acquire_sample_rows) (TableScanDesc scan, + int elevel, + int natts, + VacAttrStats **stats, + BufferAccessStrategy bstrategy, + HeapTuple *rows, + int targrows, + double *totalrows, + double *totaldeadrows); /* see table_index_build_range_scan for reference about parameters */ double (*index_build_range_scan) (Relation table_rel, @@ -1485,40 +1465,18 @@ table_relation_vacuum(Relation rel, struct VacuumParams *params, rel->rd_tableam->relation_vacuum(rel, params, bstrategy); } -/* - * Prepare to analyze block `blockno` of `scan`.
The scan needs to have been - * started with table_beginscan_analyze(). Note that this routine might - * acquire resources like locks that are held until - * table_scan_analyze_next_tuple() returns false. - * - * Returns false if block is unsuitable for sampling, true otherwise. - */ -static inline bool -table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno, - BufferAccessStrategy bstrategy) +static inline int +table_acquire_sample_rows(TableScanDesc scan, int elevel, + int natts, VacAttrStats **stats, + BufferAccessStrategy bstrategy, + HeapTuple *rows, int targrows, + double *totalrows, double *totaldeadrows) { - return scan->rs_rd->rd_tableam->scan_analyze_next_block(scan, blockno, - bstrategy); -} - -/* - * Iterate over tuples in the block selected with - * table_scan_analyze_next_block() (which needs to have returned true, and - * this routine may not have returned false for the same block before). If a - * tuple that's suitable for sampling is found, true is returned and a tuple - * is stored in `slot`. - * - * *liverows and *deadrows are incremented according to the encountered - * tuples. - */ -static inline bool -table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, - double *liverows, double *deadrows, - TupleTableSlot *slot) -{ - return scan->rs_rd->rd_tableam->scan_analyze_next_tuple(scan, OldestXmin, - liverows, deadrows, - slot); + return scan->rs_rd->rd_tableam->acquire_sample_rows(scan, elevel, + natts, stats, + bstrategy, rows, + targrows, totalrows, + totaldeadrows); } /* diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index 36b073e677..397c8612d4 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -36,13 +36,11 @@ /* Progress parameters for analyze */ #define PROGRESS_ANALYZE_PHASE 0 -#define PROGRESS_ANALYZE_BLOCKS_TOTAL 1 -#define PROGRESS_ANALYZE_BLOCKS_DONE 2 -#define PROGRESS_ANALYZE_EXT_STATS_TOTAL 3 -#define PROGRESS_ANALYZE_EXT_STATS_COMPUTED 4 -#define PROGRESS_ANALYZE_CHILD_TABLES_TOTAL 5 -#define PROGRESS_ANALYZE_CHILD_TABLES_DONE 6 -#define PROGRESS_ANALYZE_CURRENT_CHILD_TABLE_RELID 7 +#define PROGRESS_ANALYZE_EXT_STATS_TOTAL 1 +#define PROGRESS_ANALYZE_EXT_STATS_COMPUTED 2 +#define PROGRESS_ANALYZE_CHILD_TABLES_TOTAL 3 +#define PROGRESS_ANALYZE_CHILD_TABLES_DONE 4 +#define PROGRESS_ANALYZE_CURRENT_CHILD_TABLE_RELID 5 /* Phases of analyze (as advertised via PROGRESS_ANALYZE_PHASE) */ #define PROGRESS_ANALYZE_PHASE_ACQUIRE_SAMPLE_ROWS 1 diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index 95556dfb15..f056d02b4a 100644 --- a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -13,6 +13,7 @@ #define FDWAPI_H #include "access/parallel.h" +#include "commands/vacuum.h" #include "nodes/execnodes.h" #include "nodes/pathnodes.h" @@ -140,6 +141,7 @@ typedef void (*ExplainDirectModify_function) (ForeignScanState *node, struct ExplainState *es); typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel, + int natts, VacAttrStats **stats, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows);
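
The comments moved into heapam_acquire_sample_rows() above describe the two-stage sampling the new callback performs: stage one selects up to targrows random blocks, stage two keeps a reservoir of rows that is at all times a uniform sample of the rows seen so far, and the per-block densities are then extrapolated to the whole relation. The standalone sketch below only illustrates that idea; it is not code from the patch. It replaces BlockSampler's without-replacement block selection with a plain Bernoulli draw, uses the simple Algorithm R acceptance test instead of the skip distances computed by reservoir_get_next_S(), and all constants and names in it (TOTAL_BLOCKS, ROWS_PER_BLOCK, TARGROWS) are made up.

/*
 * Toy, self-contained model of the two-stage sampling described in the
 * comments moved into heapam_acquire_sample_rows().  Stage one picks a
 * random subset of blocks; stage two keeps a reservoir of TARGROWS rows
 * that is, at every point, a uniform sample of the rows seen so far; at
 * the end the per-block row density is extrapolated to the whole relation.
 */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define TOTAL_BLOCKS	1000	/* pretend relation size, in blocks */
#define ROWS_PER_BLOCK	50		/* pretend uniform tuple density */
#define TARGROWS		300		/* size of the row reservoir */

int
main(void)
{
	int			rows[TARGROWS];	/* reservoir of sampled "row ids" */
	int			numrows = 0;	/* rows currently in the reservoir */
	double		samplerows = 0;	/* rows seen in the sampled blocks */
	int			blocks_scanned = 0;

	srand(42);

	for (int blk = 0; blk < TOTAL_BLOCKS; blk++)
	{
		/*
		 * Stage one, simplified: select each block with probability
		 * TARGROWS / TOTAL_BLOCKS.  (The patch's BlockSampler instead
		 * draws up to targrows distinct blocks without replacement.)
		 */
		if (rand() % TOTAL_BLOCKS >= TARGROWS)
			continue;
		blocks_scanned++;

		/* Stage two: reservoir-sample the rows of the selected block. */
		for (int off = 0; off < ROWS_PER_BLOCK; off++)
		{
			int			rowid = blk * ROWS_PER_BLOCK + off;

			if (numrows < TARGROWS)
				rows[numrows++] = rowid;	/* just fill the reservoir */
			else
			{
				/*
				 * Algorithm R acceptance test: the (samplerows + 1)-th row
				 * replaces a random reservoir slot with probability
				 * TARGROWS / (samplerows + 1).  The patch instead computes
				 * skip distances with reservoir_get_next_S(), which is
				 * equivalent but cheaper per row.
				 */
				int			k = rand() % (int) (samplerows + 1);

				if (k < TARGROWS)
					rows[k] = rowid;
			}
			samplerows += 1;
		}
	}

	/* Extrapolate, as the patch does: rows per scanned block times blocks. */
	if (blocks_scanned > 0)
		printf("scanned %d of %d blocks, %d rows in sample, ~%.0f total rows\n",
			   blocks_scanned, TOTAL_BLOCKS, numrows,
			   floor((samplerows / blocks_scanned) * TOTAL_BLOCKS + 0.5));
	if (numrows > 0)
		printf("example sampled row id: rows[0] = %d\n", rows[0]);

	return 0;
}

Built with something like `cc -Wall reservoir_demo.c -lm` (file name illustrative), it prints the number of blocks visited, the sample size, and the extrapolated row count, much like the ereport() message at the end of heapam_acquire_sample_rows().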
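
To make the extrapolation step concrete with made-up numbers: if bs.m = 300 of totalblocks = 4000 pages were scanned and they held liverows = 21000 and deadrows = 900, the function reports *totalrows = floor(21000 / 300 * 4000 + 0.5) = 280000 and *totaldeadrows = floor(900 / 300 * 4000 + 0.5) = 12000, i.e. the observed densities of 70 live and 3 dead rows per page scaled up to the whole relation.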