From a5104b15683524a375aab1beb9615f690de31882 Mon Sep 17 00:00:00 2001 From: Denis Smirnov Date: Sat, 5 Dec 2020 23:25:29 +1000 Subject: [PATCH v2] Refactor AM analyze (acquire rows method) Analyze AM functions were implemented for fixed-size block AMs (heap influence). Other AMs with variable-size blocks are not able to use the current two-stage block sampling (Knuth's algorithm S and Vitter's algorithm Z). This refactoring gives more freedom to AM developers to implement analyze for non-fixed-size AMs. Discussion: https://www.postgresql.org/message-id/flat/C7CFE16B-F192-4124-BEBB-7864285E0FF7%40arenadata.io --- src/backend/access/heap/heapam_handler.c | 199 ++++++++++++++++++++++- src/backend/access/table/tableamapi.c | 3 +- src/backend/commands/analyze.c | 189 +-------------------- src/include/access/tableam.h | 99 +++-------- 4 files changed, 227 insertions(+), 263 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 4a70e20a14..6bf0c887be 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -33,9 +33,11 @@ #include "catalog/storage.h" #include "catalog/storage_xlog.h" #include "commands/progress.h" +#include "commands/vacuum.h" #include "executor/executor.h" #include "miscadmin.h" #include "pgstat.h" +#include "port.h" #include "storage/bufmgr.h" #include "storage/bufpage.h" #include "storage/lmgr.h" @@ -44,6 +46,7 @@ #include "storage/smgr.h" #include "utils/builtins.h" #include "utils/rel.h" +#include "utils/sampling.h" static void reform_and_rewrite_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap, @@ -53,6 +56,8 @@ static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, HeapTuple tuple, OffsetNumber tupoffset); +static int compare_rows(const void *a, const void *b); + static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan); static const TableAmRoutine heapam_methods; @@ -1154,6 +1159,173 @@ 
heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, return false; } +/* + * As of May 2004 we use a new two-stage method: Stage one selects up + * to targrows random blocks (or all blocks, if there aren't so many). + * Stage two scans these blocks and uses the Vitter algorithm to create + * a random sample of targrows rows (or less, if there are less in the + * sample of blocks). The two stages are executed simultaneously: each + * block is processed as soon as stage one returns its number and while + * the rows are read stage two controls which ones are to be inserted + * into the sample. + * + * Although every row has an equal chance of ending up in the final + * sample, this sampling method is not perfect: not every possible + * sample has an equal chance of being selected. For large relations + * the number of different blocks represented by the sample tends to be + * too small. We can live with that for now. Improvements are welcome. + * + * An important property of this sampling method is that because we do + * look at a statistically unbiased set of blocks, we should get + * unbiased estimates of the average numbers of live and dead rows per + * block. The previous sampling method put too much credence in the row + * density near the start of the table. 
+ */ +static int +heapam_acquire_sample_rows(Relation rel, int elevel, + BufferAccessStrategy bstrategy, + HeapTuple *rows, int targrows, + double *totalrows, double *totaldeadrows) +{ + int numrows = 0; /* # rows now in reservoir */ + double samplerows = 0; /* total # rows collected */ + double liverows = 0; /* # live rows seen */ + double deadrows = 0; /* # dead rows seen */ + double rowstoskip = -1; /* -1 means not set yet */ + BlockNumber totalblocks; + TransactionId OldestXmin; + BlockSamplerData bs; + ReservoirStateData rstate; + TupleTableSlot *slot; + TableScanDesc scan; + BlockNumber nblocks; + BlockNumber blksdone = 0; + + scan = heap_beginscan(rel, NULL, 0, NULL, NULL, SO_TYPE_ANALYZE); + + totalblocks = RelationGetNumberOfBlocks(scan->rs_rd); + + /* Need a cutoff xmin for HeapTupleSatisfiesVacuum */ + OldestXmin = GetOldestNonRemovableTransactionId(scan->rs_rd); + + /* Prepare for sampling block numbers */ + nblocks = BlockSampler_Init(&bs, totalblocks, targrows, random()); + + /* Report sampling block numbers */ + pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_TOTAL, + nblocks); + + /* Prepare for sampling rows */ + reservoir_init_selection_state(&rstate, targrows); + + slot = table_slot_create(scan->rs_rd, NULL); + + /* Outer loop over blocks to sample */ + while (BlockSampler_HasMore(&bs)) + { + BlockNumber targblock = BlockSampler_Next(&bs); + + vacuum_delay_point(); + + if (!heapam_scan_analyze_next_block(scan, targblock, bstrategy)) + continue; + + while (heapam_scan_analyze_next_tuple(scan, OldestXmin, &liverows, &deadrows, slot)) + { + /* + * The first targrows sample rows are simply copied into the + * reservoir. Then we start replacing tuples in the sample until + * we reach the end of the relation. This algorithm is from Jeff + * Vitter's paper (see full citation in utils/misc/sampling.c). 
It + * works by repeatedly computing the number of tuples to skip + * before selecting a tuple, which replaces a randomly chosen + * element of the reservoir (current set of tuples). At all times + * the reservoir is a true random sample of the tuples we've + * passed over so far, so when we fall off the end of the relation + * we're done. + */ + if (numrows < targrows) + rows[numrows++] = ExecCopySlotHeapTuple(slot); + else + { + /* + * t in Vitter's paper is the number of records already + * processed. If we need to compute a new S value, we must + * use the not-yet-incremented value of samplerows as t. + */ + if (rowstoskip < 0) + rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows); + + if (rowstoskip <= 0) + { + /* + * Found a suitable tuple, so save it, replacing one old + * tuple at random + */ + int k = (int) (targrows * sampler_random_fract(rstate.randstate)); + + Assert(k >= 0 && k < targrows); + heap_freetuple(rows[k]); + rows[k] = ExecCopySlotHeapTuple(slot); + } + + rowstoskip -= 1; + } + + samplerows += 1; + } + + pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_DONE, + ++blksdone); + } + + ExecDropSingleTupleTableSlot(slot); + heap_endscan(scan); + + /* + * If we didn't find as many tuples as we wanted then we're done. No sort + * is needed, since they're already in order. + * + * Otherwise we need to sort the collected tuples by position + * (itempointer). It's not worth worrying about corner cases where the + * tuples are already sorted. + */ + if (numrows == targrows) + qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows); + + /* + * Estimate total numbers of live and dead rows in relation, extrapolating + * on the assumption that the average tuple density in pages we didn't + * scan is the same as in the pages we did scan. Since what we scanned is + * a random sample of the pages in the relation, this should be a good + * assumption. 
+ */ + if (bs.m > 0) + { + *totalrows = floor((liverows / bs.m) * totalblocks + 0.5); + *totaldeadrows = floor((deadrows / bs.m) * totalblocks + 0.5); + } + else + { + *totalrows = 0.0; + *totaldeadrows = 0.0; + } + + /* + * Emit some interesting relation info + */ + ereport(elevel, + (errmsg("\"%s\": scanned %d of %u pages, " + "containing %.0f live rows and %.0f dead rows; " + "%d rows in sample, %.0f estimated total rows", + RelationGetRelationName(scan->rs_rd), + bs.m, totalblocks, + liverows, deadrows, + numrows, *totalrows))); + + return numrows; +} + static double heapam_index_build_range_scan(Relation heapRelation, Relation indexRelation, @@ -2526,6 +2698,30 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, } } +/* + * qsort comparator for sorting rows[] array + */ +static int +compare_rows(const void *a, const void *b) +{ + HeapTuple ha = *(const HeapTuple *) a; + HeapTuple hb = *(const HeapTuple *) b; + BlockNumber ba = ItemPointerGetBlockNumber(&ha->t_self); + OffsetNumber oa = ItemPointerGetOffsetNumber(&ha->t_self); + BlockNumber bb = ItemPointerGetBlockNumber(&hb->t_self); + OffsetNumber ob = ItemPointerGetOffsetNumber(&hb->t_self); + + if (ba < bb) + return -1; + if (ba > bb) + return 1; + if (oa < ob) + return -1; + if (oa > ob) + return 1; + return 0; +} + /* ------------------------------------------------------------------------ * Definition of the heap table access method. 
@@ -2570,8 +2766,7 @@ static const TableAmRoutine heapam_methods = { .relation_copy_data = heapam_relation_copy_data, .relation_copy_for_cluster = heapam_relation_copy_for_cluster, .relation_vacuum = heap_vacuum_rel, - .scan_analyze_next_block = heapam_scan_analyze_next_block, - .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple, + .acquire_sample_rows = heapam_acquire_sample_rows, .index_build_range_scan = heapam_index_build_range_scan, .index_validate_scan = heapam_index_validate_scan, diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index 325ecdc122..91f3b04dc4 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -87,8 +87,7 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->relation_copy_data != NULL); Assert(routine->relation_copy_for_cluster != NULL); Assert(routine->relation_vacuum != NULL); - Assert(routine->scan_analyze_next_block != NULL); - Assert(routine->scan_analyze_next_tuple != NULL); + Assert(routine->acquire_sample_rows != NULL); Assert(routine->index_build_range_scan != NULL); Assert(routine->index_validate_scan != NULL); diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 7295cf0215..82b40e22dc 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -37,7 +37,6 @@ #include "commands/dbcommands.h" #include "commands/progress.h" #include "commands/tablecmds.h" -#include "commands/vacuum.h" #include "executor/executor.h" #include "foreign/fdwapi.h" #include "miscadmin.h" @@ -61,7 +60,6 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_rusage.h" -#include "utils/sampling.h" #include "utils/sortsupport.h" #include "utils/syscache.h" #include "utils/timestamp.h" @@ -98,7 +96,6 @@ static VacAttrStats *examine_attribute(Relation onerel, int attnum, static int acquire_sample_rows(Relation onerel, int elevel, HeapTuple *rows, int targrows, double *totalrows, double 
*totaldeadrows); -static int compare_rows(const void *a, const void *b); static int acquire_inherited_sample_rows(Relation onerel, int elevel, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows); @@ -997,29 +994,9 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr) * We also estimate the total numbers of live and dead rows in the table, * and return them into *totalrows and *totaldeadrows, respectively. * - * The returned list of tuples is in order by physical position in the table. - * (We will rely on this later to derive correlation estimates.) - * - * As of May 2004 we use a new two-stage method: Stage one selects up - * to targrows random blocks (or all blocks, if there aren't so many). - * Stage two scans these blocks and uses the Vitter algorithm to create - * a random sample of targrows rows (or less, if there are less in the - * sample of blocks). The two stages are executed simultaneously: each - * block is processed as soon as stage one returns its number and while - * the rows are read stage two controls which ones are to be inserted - * into the sample. - * - * Although every row has an equal chance of ending up in the final - * sample, this sampling method is not perfect: not every possible - * sample has an equal chance of being selected. For large relations - * the number of different blocks represented by the sample tends to be - * too small. We can live with that for now. Improvements are welcome. - * - * An important property of this sampling method is that because we do - * look at a statistically unbiased set of blocks, we should get - * unbiased estimates of the average numbers of live and dead rows per - * block. The previous sampling method put too much credence in the row - * density near the start of the table. + * AM acquire sample function MUST return tuples in the order by physical + * position in the table. (We will rely on this later to derive correlation + * estimates.) 
*/ static int acquire_sample_rows(Relation onerel, int elevel, @@ -1027,169 +1004,17 @@ acquire_sample_rows(Relation onerel, int elevel, double *totalrows, double *totaldeadrows) { int numrows = 0; /* # rows now in reservoir */ - double samplerows = 0; /* total # rows collected */ - double liverows = 0; /* # live rows seen */ - double deadrows = 0; /* # dead rows seen */ - double rowstoskip = -1; /* -1 means not set yet */ - BlockNumber totalblocks; - TransactionId OldestXmin; - BlockSamplerData bs; - ReservoirStateData rstate; - TupleTableSlot *slot; - TableScanDesc scan; - BlockNumber nblocks; - BlockNumber blksdone = 0; Assert(targrows > 0); - totalblocks = RelationGetNumberOfBlocks(onerel); - - /* Need a cutoff xmin for HeapTupleSatisfiesVacuum */ - OldestXmin = GetOldestNonRemovableTransactionId(onerel); - - /* Prepare for sampling block numbers */ - nblocks = BlockSampler_Init(&bs, totalblocks, targrows, random()); - - /* Report sampling block numbers */ - pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_TOTAL, - nblocks); - - /* Prepare for sampling rows */ - reservoir_init_selection_state(&rstate, targrows); - - scan = table_beginscan_analyze(onerel); - slot = table_slot_create(onerel, NULL); - - /* Outer loop over blocks to sample */ - while (BlockSampler_HasMore(&bs)) - { - BlockNumber targblock = BlockSampler_Next(&bs); - - vacuum_delay_point(); - - if (!table_scan_analyze_next_block(scan, targblock, vac_strategy)) - continue; - - while (table_scan_analyze_next_tuple(scan, OldestXmin, &liverows, &deadrows, slot)) - { - /* - * The first targrows sample rows are simply copied into the - * reservoir. Then we start replacing tuples in the sample until - * we reach the end of the relation. This algorithm is from Jeff - * Vitter's paper (see full citation in utils/misc/sampling.c). 
It - * works by repeatedly computing the number of tuples to skip - * before selecting a tuple, which replaces a randomly chosen - * element of the reservoir (current set of tuples). At all times - * the reservoir is a true random sample of the tuples we've - * passed over so far, so when we fall off the end of the relation - * we're done. - */ - if (numrows < targrows) - rows[numrows++] = ExecCopySlotHeapTuple(slot); - else - { - /* - * t in Vitter's paper is the number of records already - * processed. If we need to compute a new S value, we must - * use the not-yet-incremented value of samplerows as t. - */ - if (rowstoskip < 0) - rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows); - - if (rowstoskip <= 0) - { - /* - * Found a suitable tuple, so save it, replacing one old - * tuple at random - */ - int k = (int) (targrows * sampler_random_fract(rstate.randstate)); - - Assert(k >= 0 && k < targrows); - heap_freetuple(rows[k]); - rows[k] = ExecCopySlotHeapTuple(slot); - } - - rowstoskip -= 1; - } - - samplerows += 1; - } - - pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_DONE, - ++blksdone); - } - - ExecDropSingleTupleTableSlot(slot); - table_endscan(scan); - - /* - * If we didn't find as many tuples as we wanted then we're done. No sort - * is needed, since they're already in order. - * - * Otherwise we need to sort the collected tuples by position - * (itempointer). It's not worth worrying about corner cases where the - * tuples are already sorted. - */ - if (numrows == targrows) - qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows); - - /* - * Estimate total numbers of live and dead rows in relation, extrapolating - * on the assumption that the average tuple density in pages we didn't - * scan is the same as in the pages we did scan. Since what we scanned is - * a random sample of the pages in the relation, this should be a good - * assumption. 
- */ - if (bs.m > 0) - { - *totalrows = floor((liverows / bs.m) * totalblocks + 0.5); - *totaldeadrows = floor((deadrows / bs.m) * totalblocks + 0.5); - } - else - { - *totalrows = 0.0; - *totaldeadrows = 0.0; - } - - /* - * Emit some interesting relation info - */ - ereport(elevel, - (errmsg("\"%s\": scanned %d of %u pages, " - "containing %.0f live rows and %.0f dead rows; " - "%d rows in sample, %.0f estimated total rows", - RelationGetRelationName(onerel), - bs.m, totalblocks, - liverows, deadrows, - numrows, *totalrows))); + numrows = table_acquire_sample_rows(onerel, elevel, + vac_strategy, rows, + targrows, totalrows, + totaldeadrows); return numrows; } -/* - * qsort comparator for sorting rows[] array - */ -static int -compare_rows(const void *a, const void *b) -{ - HeapTuple ha = *(const HeapTuple *) a; - HeapTuple hb = *(const HeapTuple *) b; - BlockNumber ba = ItemPointerGetBlockNumber(&ha->t_self); - OffsetNumber oa = ItemPointerGetOffsetNumber(&ha->t_self); - BlockNumber bb = ItemPointerGetBlockNumber(&hb->t_self); - OffsetNumber ob = ItemPointerGetOffsetNumber(&hb->t_self); - - if (ba < bb) - return -1; - if (ba > bb) - return 1; - if (oa < ob) - return -1; - if (oa > ob) - return 1; - return 0; -} - /* * acquire_inherited_sample_rows -- acquire sample rows from inheritance tree diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 33bffb6815..9d38341098 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -601,39 +601,21 @@ typedef struct TableAmRoutine BufferAccessStrategy bstrategy); /* - * Prepare to analyze block `blockno` of `scan`. The scan has been started - * with table_beginscan_analyze(). See also - * table_scan_analyze_next_block(). + * This callback needs to acquire a sample of rows from the table (for + * ANALYZE). Note, that returned tuples in a sample MUST be in the order + * by physical position in the table, as we rely on this fact in + * acquire_sample_rows(). 
* - * The callback may acquire resources like locks that are held until - * table_scan_analyze_next_tuple() returns false. It e.g. can make sense - * to hold a lock until all tuples on a block have been analyzed by - * scan_analyze_next_tuple. - * - * The callback can return false if the block is not suitable for - * sampling, e.g. because it's a metapage that could never contain tuples. - * - * XXX: This obviously is primarily suited for block-based AMs. It's not - * clear what a good interface for non block based AMs would be, so there - * isn't one yet. + * Buffer ring access strategy is derived from analyze_rel() and should be + * used by AMs, using shared buffers for ANALYZE. */ - bool (*scan_analyze_next_block) (TableScanDesc scan, - BlockNumber blockno, - BufferAccessStrategy bstrategy); - - /* - * See table_scan_analyze_next_tuple(). - * - * Not every AM might have a meaningful concept of dead rows, in which - * case it's OK to not increment *deadrows - but note that that may - * influence autovacuum scheduling (see comment for relation_vacuum - * callback). - */ - bool (*scan_analyze_next_tuple) (TableScanDesc scan, - TransactionId OldestXmin, - double *liverows, - double *deadrows, - TupleTableSlot *slot); + int (*acquire_sample_rows) (Relation onerel, + int elevel, + BufferAccessStrategy bstrategy, + HeapTuple *rows, + int targrows, + double *totalrows, + double *totaldeadrows); /* see table_index_build_range_scan for reference about parameters */ double (*index_build_range_scan) (Relation table_rel, @@ -942,19 +924,6 @@ table_beginscan_tid(Relation rel, Snapshot snapshot) return rel->rd_tableam->scan_begin(rel, snapshot, 0, NULL, NULL, flags); } -/* - * table_beginscan_analyze is an alternative entry point for setting up a - * TableScanDesc for an ANALYZE scan. As with bitmap scans, it's worth using - * the same data structure although the behavior is rather different. 
- */ -static inline TableScanDesc -table_beginscan_analyze(Relation rel) -{ - uint32 flags = SO_TYPE_ANALYZE; - - return rel->rd_tableam->scan_begin(rel, NULL, 0, NULL, NULL, flags); -} - /* * End relation scan. */ @@ -1591,40 +1560,16 @@ table_relation_vacuum(Relation rel, struct VacuumParams *params, rel->rd_tableam->relation_vacuum(rel, params, bstrategy); } -/* - * Prepare to analyze block `blockno` of `scan`. The scan needs to have been - * started with table_beginscan_analyze(). Note that this routine might - * acquire resources like locks that are held until - * table_scan_analyze_next_tuple() returns false. - * - * Returns false if block is unsuitable for sampling, true otherwise. - */ -static inline bool -table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno, - BufferAccessStrategy bstrategy) -{ - return scan->rs_rd->rd_tableam->scan_analyze_next_block(scan, blockno, - bstrategy); -} - -/* - * Iterate over tuples in the block selected with - * table_scan_analyze_next_block() (which needs to have returned true, and - * this routine may not have returned false for the same block before). If a - * tuple that's suitable for sampling is found, true is returned and a tuple - * is stored in `slot`. - * - * *liverows and *deadrows are incremented according to the encountered - * tuples. - */ -static inline bool -table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, - double *liverows, double *deadrows, - TupleTableSlot *slot) +static inline int +table_acquire_sample_rows(Relation rel, int elevel, + BufferAccessStrategy bstrategy, + HeapTuple *rows, int targrows, + double *totalrows, double *totaldeadrows) { - return scan->rs_rd->rd_tableam->scan_analyze_next_tuple(scan, OldestXmin, - liverows, deadrows, - slot); + return rel->rd_tableam->acquire_sample_rows(rel, elevel, + bstrategy, rows, + targrows, totalrows, + totaldeadrows); } /* -- 2.24.3 (Apple Git-128)