diff -ruN ../base/src/backend/commands/analyze.c src/backend/commands/analyze.c --- ../base/src/backend/commands/analyze.c 2003-10-18 17:38:06.000000000 +0200 +++ src/backend/commands/analyze.c 2004-04-05 10:16:30.000000000 +0200 @@ -105,6 +105,21 @@ int first; /* values[] index of first occurrence */ } ScalarMCVItem; +/* +** data structure for Algorithm S from Knuth 3.4.2 +*/ +typedef struct +{ + BlockNumber N; /* number of blocks, known in advance */ + int n; /* sample size */ + BlockNumber t; /* current block number */ + int m; /* blocks selected so far */ +} BlockSamplerData; +typedef BlockSamplerData *BlockSampler; + +static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize); +static bool BlockSampler_HasMore(BlockSampler bs); +static BlockNumber BlockSampler_Next(BlockSampler bs); #define swapInt(a,b) do {int _tmp; _tmp=a; a=b; b=_tmp;} while(0) #define swapDatum(a,b) do {Datum _tmp; _tmp=a; a=b; b=_tmp;} while(0) @@ -113,6 +128,7 @@ /* Default statistics target (GUC parameter) */ int default_statistics_target = 10; +extern int sampling_method; /* debugging hack */ static int elevel = -1; @@ -310,7 +326,10 @@ * Acquire the sample rows */ rows = (HeapTuple *) palloc(targrows * sizeof(HeapTuple)); - numrows = acquire_sample_rows(onerel, rows, targrows, &totalrows); + if (sampling_method == 0) + numrows = old_acquire_sample_rows(onerel, rows, targrows, &totalrows); + else + numrows = acquire_sample_rows(onerel, rows, targrows, &totalrows); /* * If we are running a standalone ANALYZE, update pages/tuples stats @@ -488,6 +507,281 @@ } /* +** BlockSampler_Init -- prepare for random sampling of blocknumbers +** +** BlockSampler is used for stage one of our new two-stage tuple +** sampling mechanism as discussed on -hackers 2004-04-02 (subject +** "Large DB"). It selects a random sample of samplesize blocks out of +** the nblocks blocks in the table. If the table has less than +** samplesize blocks, all blocks are selected. 
+** +*/ +static void +BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize) +{ + bs->N = nblocks; /* table size */ + /* + ** If we decide to reduce samplesize for tables that have less or + ** not much more than samplesize blocks, here is the place to do + ** it. + */ + bs->n = samplesize; + bs->t = 0; /* blocks scanned so far */ + bs->m = 0; /* blocks selected so far */ +} + +static bool +BlockSampler_HasMore(BlockSampler bs) +{ + return (bs->t < bs->N) && (bs->m < bs->n); +} + +static BlockNumber +BlockSampler_Next(BlockSampler bs) +{ + BlockNumber K = bs->N - bs->t; /* remaining blocks */ + int k = bs->n - bs->m; /* blocks still to sample */ + double p; /* probability to skip the next block */ + double V; /* random */ + + Assert(BlockSampler_HasMore(bs)); + + if (k >= K) + { + /* need all the rest */ + bs->t += 1; + bs->m += 1; + return bs->t - 1; + } + +#ifdef NOT_USED + /* simple, robust, but wastes a lot of entropy */ + for (;;) { + double U = random_fract(); + + if (K * U < k) { + /* select */ + bs->t += 1; + bs->m += 1; + return bs->t - 1; + } + else { + /* skip */ + bs->t += 1; + K -= 1; + } + } +#endif + + p = 1.0 - (double) k / (double) K; + V = random_fract(); + while (V < p) + { + bs->t += 1; + K -= 1; + /* + ** Assert(K > 0) + ** because we started with K > k > 0, + ** and when K == k, the loop terminates + */ + p *= 1.0 - (double) k / (double) K; + } + + /* select */ + bs->t += 1; + bs->m += 1; + return bs->t - 1; +} + +/* + * acquire_sample_rows -- acquire a random sample of rows from the table + * + * As of April 2004 we use a new two-stage method: Stage one selects up + * to targrows random blocks (or all blocks, if there aren't so many). + * Stage two scans these blocks and uses the Vitter algorithm to create + * a random sample of targrows rows (or less, if there are less in the + * sample of blocks). 
The two stages are executed simultaneously: Each + * block is processed as soon as stage one returns its number and while the + * rows are read stage two controls which ones are to be kept in the sample. + * A truly random sample is collected: every row has an equal chance of + * ending up in the final sample. + * + * We also estimate the total number of rows in the table, and return that + * into *totalrows. + * + * The returned list of tuples is in order by physical position in the table. + * (We will rely on this later to derive correlation estimates.) + */ +static int +acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows, + double *totalrows) +{ + int numrows = 0; + int liverows = 0; + int deadrows = 0; + int skippedrows = 0; + int rowstoskip = -1; + HeapScanDesc scan; + BlockSamplerData bs; + double rstate; + + Assert(targrows > 1); + + /* + ** Hack for setting onerel->rd_nblocks: + */ + scan = heap_beginscan(onerel, SnapshotNow, 0, NULL); + heap_endscan(scan); + + /* + ** Prepare for sampling block numbers + */ + BlockSampler_Init(&bs, onerel->rd_nblocks, targrows); + /* + ** Prepare for sampling rows + */ + rstate = init_selection_state(targrows); + while (BlockSampler_HasMore(&bs)) { + BlockNumber currblock; + Buffer currbuffer; + Page currpage; + OffsetNumber targoffset, + maxoffset; + + CHECK_FOR_INTERRUPTS(); + + currblock = BlockSampler_Next(&bs); + + /* + * We must maintain a pin on the target page's buffer to ensure + * that the maxoffset value stays good (else concurrent VACUUM + * might delete tuples out from under us). Hence, pin the page + * until we are done looking at it. We don't maintain a lock on + * the page, so tuples could get added to it, but we ignore such + * tuples. 
+ */ + currbuffer = ReadBuffer(onerel, currblock); + if (!BufferIsValid(currbuffer)) + elog(ERROR, "ReadBuffer failed"); + LockBuffer(currbuffer, BUFFER_LOCK_SHARE); + currpage = BufferGetPage(currbuffer); + maxoffset = PageGetMaxOffsetNumber(currpage); + LockBuffer(currbuffer, BUFFER_LOCK_UNLOCK); + + for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; ++targoffset) + { + HeapTupleData tuple; + Buffer tupbuffer; + + if (sampling_method == 2) { + if (rowstoskip > 0) { + --rowstoskip; + ++skippedrows; + continue; + }/*if*/ + }/*if*/ + + ItemPointerSet(&tuple.t_self, currblock, targoffset); + if (heap_fetch(onerel, SnapshotNow, &tuple, &tupbuffer, + false, NULL)) + { + ++liverows; + if (numrows < targrows) + rows[numrows++] = heap_copytuple(&tuple); + else + { + if (rowstoskip < 0) + { + double t_o; + double t_n; + + /* + ** Use the old function for now. + ** This should be refactored into a function + ** that natively returns the number of records + ** to skip, as soon as we trust the new + ** sampling method. + */ + t_o = liverows + + skippedrows * (double) liverows + / ((double) liverows + deadrows); + t_n = select_next_random_record(t_o, + targrows, + &rstate); + rowstoskip = (int) (t_n - t_o); + } + if (rowstoskip == 0) + { + /* + * Found a suitable tuple, so save it, + * replacing one old tuple at random + */ + int k = (int) (targrows * random_fract()); + + Assert(k >= 0 && k < targrows); + heap_freetuple(rows[k]); + rows[k] = heap_copytuple(&tuple); + } + + rowstoskip -= 1; + } + + /* this releases the second pin acquired by heap_fetch: */ + ReleaseBuffer(tupbuffer); + } + else + { + /* + ** This information is currently not used. 
+ */ + ++deadrows; + } + } + + /* this releases the initial pin: */ + ReleaseBuffer(currbuffer); + } + + ereport(DEBUG2, + (errmsg("new: %d pages sampled, %d live rows and %d dead rows scanned", + bs.m, liverows, deadrows))); + if (sampling_method == 2) + ereport(DEBUG2, + (errmsg("new: %d rows skipped", skippedrows))); + /* + * If we ran out of tuples then we're done. No sort is needed, + * since they're already in order. + * + * Otherwise we need to sort the collected tuples by position + * (itempointer). I don't care for corner cases where the tuples + * are already sorted. + */ + if (numrows == targrows) + qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows); + + /* + * Estimate total number of valid rows in relation. + */ + if (bs.m > 0) + { + liverows += skippedrows * (double) liverows / ((double) liverows + deadrows); + *totalrows = floor((double) onerel->rd_nblocks * liverows / bs.m + 0.5); + } + else + *totalrows = 0.0; + + /* + * Emit some interesting relation info + */ + ereport(elevel, + (errmsg("\"%s\": %u pages, %d rows sampled, %.0f estimated total rows", + RelationGetRelationName(onerel), + onerel->rd_nblocks, numrows, *totalrows))); + + return numrows; +} + +/* * acquire_sample_rows -- acquire a random sample of rows from the table * * Up to targrows rows are collected (if there are fewer than that many @@ -502,7 +796,7 @@ * (We will rely on this later to derive correlation estimates.) 
*/ static int -acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows, +old_acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows, double *totalrows) { int numrows = 0; @@ -516,6 +810,10 @@ double tuplesperpage; double t; double rstate; + /* for debugging: */ + int blocksaccessed = 0; + int tuplesaccessed = 0; + int deadtuples = 0; Assert(targrows > 1); @@ -592,6 +890,8 @@ estblock = lastblock; tuplesperpage = (double) numest / (double) estblock; + blocksaccessed = lastblock + 1; + tuplesaccessed = numrows; t = (double) numrows; /* t is the # of records processed so far */ rstate = init_selection_state(targrows); for (;;) @@ -644,6 +944,9 @@ maxoffset = PageGetMaxOffsetNumber(targpage); LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK); + if (targblock != lastblock) + ++blocksaccessed; + for (;;) { HeapTupleData targtuple; @@ -657,6 +960,7 @@ targoffset = FirstOffsetNumber; goto pageloop; } + ++tuplesaccessed; ItemPointerSet(&targtuple.t_self, targblock, targoffset); if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer, false, NULL)) @@ -680,9 +984,14 @@ } /* this tuple is dead, so advance to next one on same page */ targoffset++; + ++deadtuples; } } + ereport(DEBUG2, + (errmsg("old: %d pages, %d rows accessed (%d dead rows)", + blocksaccessed, tuplesaccessed, deadtuples))); + /* * Now we need to sort the collected tuples by position (itempointer). 
*/ diff -ruN ../base/src/backend/utils/misc/guc.c src/backend/utils/misc/guc.c --- ../base/src/backend/utils/misc/guc.c 2004-03-13 07:26:11.000000000 +0100 +++ src/backend/utils/misc/guc.c 2004-04-04 10:53:26.000000000 +0200 @@ -131,6 +131,9 @@ int log_min_duration_statement = -1; +/* quick hack for tuple sampling tests */ +int sampling_method = 1; + /* * These variables are all dummies that don't do anything, except in some @@ -859,6 +862,15 @@ static struct config_int ConfigureNamesInt[] = { { + {"sampling_method", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Controls how to collect a tuple sample."), + gettext_noop("0 = old method; 1 = new, scan all tuples; " + "2 = new, skip tuples.") + }, + &sampling_method, + 1, 0, 2, NULL, NULL + }, + { {"default_statistics_target", PGC_USERSET, QUERY_TUNING_OTHER, gettext_noop("Sets the default statistics target."), gettext_noop("This applies to table columns that have not had a "