diff -rcN ../base/src/backend/commands/analyze.c src/backend/commands/analyze.c
*** ../base/src/backend/commands/analyze.c	Sun May  9 12:05:51 2004
--- src/backend/commands/analyze.c	Sun May  9 19:33:27 2004
***************
*** 38,43 ****
--- 38,66 ----
  #include "utils/syscache.h"
  #include "utils/tuplesort.h"
  
+ /*
+  * With two-stage sampling we process at most 300000 blocks, each of
+  * which contains less than 1200 tuples (at 32K block size, smallest
+  * possible tuple size).  So the number of tuples processed can be
+  * stored in a 32 bit int.
+  * Change this datatype to double if the number of rows in the table
+  * might exceed INT_MAX.  Then the algorithm should work as long as the
+  * row count does not become so large that it is not represented accurately
+  * in a double (on IEEE-math machines this would be around 2^52 rows).
+  */
+ typedef int TupleCount;
+ 
+ /*
+ ** data structure for Algorithm S from Knuth 3.4.2
+ */
+ typedef struct
+ {
+ 	BlockNumber	N;				/* number of blocks, known in advance */
+ 	int			n;				/* sample size */
+ 	BlockNumber	t;				/* current block number */
+ 	int			m;				/* blocks selected so far */
+ } BlockSamplerData;
+ typedef BlockSamplerData *BlockSampler;
  
  /* Per-index data for ANALYZE */
  typedef struct AnlIndexData
***************
*** 57,62 ****
--- 80,89 ----
  static MemoryContext anl_context = NULL;
  
  
+ static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize);
+ static bool BlockSampler_HasMore(BlockSampler bs);
+ static BlockNumber BlockSampler_Next(BlockSampler bs);
+ 
  static void compute_index_stats(Relation onerel, double totalrows,
  								AnlIndexData *indexdata, int nindexes,
  								HeapTuple *rows, int numrows,
***************
*** 66,72 ****
  					int targrows, double *totalrows);
  static double random_fract(void);
  static double init_selection_state(int n);
! static double select_next_random_record(double t, int n, double *stateptr);
  static int	compare_rows(const void *a, const void *b);
  static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats);
  static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
--- 93,99 ----
  					int targrows, double *totalrows);
  static double random_fract(void);
  static double init_selection_state(int n);
! static TupleCount get_next_S(TupleCount t, int n, double *stateptr);
  static int	compare_rows(const void *a, const void *b);
  static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats);
  static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
***************
*** 638,649 ****
  }
  
  /*
   * acquire_sample_rows -- acquire a random sample of rows from the table
   *
!  * Up to targrows rows are collected (if there are fewer than that many
!  * rows in the table, all rows are collected).	When the table is larger
!  * than targrows, a truly random sample is collected: every row has an
!  * equal chance of ending up in the final sample.
   *
   * We also estimate the total number of rows in the table, and return that
   * into *totalrows.
--- 665,754 ----
  }
  
  /*
+ ** BlockSampler_Init -- prepare for random sampling of blocknumbers
+ **
+ ** BlockSampler is used for stage one of our new two-stage tuple
+ ** sampling mechanism as discussed on -hackers 2004-04-02 (subject
+ ** "Large DB").  It selects a random sample of samplesize blocks out of
+ ** the nblocks blocks in the table.  If the table has less than
+ ** samplesize blocks, all blocks are selected.
+ **
+ */
+ static void
+ BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
+ {
+ 	bs->N = nblocks;			/* table size */
+ 	/*
+ 	** If we decide to reduce samplesize for tables that have less or
+ 	** not much more than samplesize blocks, here is the place to do
+ 	** it.
+ 	*/
+ 	bs->n = samplesize;
+ 	bs->t = 0;					/* blocks scanned so far */
+ 	bs->m = 0;					/* blocks selected so far */
+ }
+ 
+ static bool
+ BlockSampler_HasMore(BlockSampler bs)
+ {
+ 	return (bs->t < bs->N) && (bs->m < bs->n);
+ }
+ 
+ static BlockNumber
+ BlockSampler_Next(BlockSampler bs)
+ {
+ 	BlockNumber	K = bs->N - bs->t;		/* remaining blocks */
+ 	int			k = bs->n - bs->m;		/* blocks still to sample */
+ 	double		p;				/* probability to skip the next block */
+ 	double		V;				/* random */
+ 
+ 	Assert(BlockSampler_HasMore(bs));
+ 
+ 	if (k >= K)
+ 	{
+ 		/* need all the rest */
+ 		bs->t += 1;
+ 		bs->m += 1;
+ 		return bs->t - 1;
+ 	}
+ 	
+ 	p = 1.0 - (double) k / (double) K;
+ 	V = random_fract();
+ 	while (V < p)
+ 	{
+ 		bs->t += 1;
+ 		K -= 1;
+ 		/*
+ 		** Assert(K > 0)
+ 		** because we startet with K > k > 0,
+ 		** and when K == k, the loop terminates
+ 		*/
+ 		p *= 1.0 - (double) k / (double) K;
+ 	}
+ 
+ 	/* select */
+ 	bs->t += 1;
+ 	bs->m += 1;
+ 	return bs->t - 1;
+ }
+ 
+ /*
   * acquire_sample_rows -- acquire a random sample of rows from the table
   *
!  * As of May 2004 we use a new two-stage method:  Stage one selects up
!  * to targrows random blocks (or all blocks, if there aren't so many).
!  * Stage two scans these blocks and uses the Vitter algorithm to create
!  * a random sample of targrows rows (or less, if there are less in the
!  * sample of blocks).  The two stages are executed simultaneously:  Each
!  * block is processed as soon as stage one returns its number and while
!  * the rows are read stage two controls which ones are to be inserted
!  * into the sample.
!  *
!  * Although every row has an equal chance of ending up in the final
!  * sample, this sampling method is not perfect: not every possible
!  * sample has an equal chance of being selected.  For large relations
!  * the number of different blocks represented by the sample tends to be
!  * too small.  We can live with that for now.  Improvements are welcome.
   *
   * We also estimate the total number of rows in the table, and return that
   * into *totalrows.
***************
*** 656,754 ****
  					double *totalrows)
  {
  	int			numrows = 0;
! 	HeapScanDesc scan;
  	BlockNumber	totalblocks;
! 	HeapTuple	tuple;
! 	ItemPointer lasttuple;
! 	BlockNumber lastblock,
! 				estblock;
! 	OffsetNumber lastoffset;
! 	int			numest;
! 	double		tuplesperpage;
! 	double		t;
  	double		rstate;
  
  	Assert(targrows > 1);
  
! 	/*
! 	 * Do a simple linear scan until we reach the target number of rows.
! 	 */
! 	scan = heap_beginscan(onerel, SnapshotNow, 0, NULL);
! 	totalblocks = scan->rs_nblocks;	/* grab current relation size */
! 	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
! 	{
! 		rows[numrows++] = heap_copytuple(tuple);
! 		if (numrows >= targrows)
! 			break;
! 		vacuum_delay_point();
! 	}
! 	heap_endscan(scan);
  
  	/*
! 	 * If we ran out of tuples then we're done, no matter how few we
! 	 * collected.  No sort is needed, since they're already in order.
! 	 */
! 	if (!HeapTupleIsValid(tuple))
! 	{
! 		*totalrows = (double) numrows;
! 
! 		ereport(elevel,
! 				(errmsg("\"%s\": %u pages, %d rows sampled, %.0f estimated total rows",
! 						RelationGetRelationName(onerel),
! 						totalblocks, numrows, *totalrows)));
! 
! 		return numrows;
! 	}
! 
  	/*
! 	 * Otherwise, start replacing tuples in the sample until we reach the
! 	 * end of the relation.  This algorithm is from Jeff Vitter's paper
! 	 * (see full citation below).  It works by repeatedly computing the
! 	 * number of the next tuple we want to fetch, which will replace a
! 	 * randomly chosen element of the reservoir (current set of tuples).
! 	 * At all times the reservoir is a true random sample of the tuples
! 	 * we've passed over so far, so when we fall off the end of the
! 	 * relation we're done.
! 	 *
! 	 * A slight difficulty is that since we don't want to fetch tuples or
! 	 * even pages that we skip over, it's not possible to fetch *exactly*
! 	 * the N'th tuple at each step --- we don't know how many valid tuples
! 	 * are on the skipped pages.  We handle this by assuming that the
! 	 * average number of valid tuples/page on the pages already scanned
! 	 * over holds good for the rest of the relation as well; this lets us
! 	 * estimate which page the next tuple should be on and its position in
! 	 * the page.  Then we fetch the first valid tuple at or after that
! 	 * position, being careful not to use the same tuple twice.  This
! 	 * approach should still give a good random sample, although it's not
! 	 * perfect.
! 	 */
! 	lasttuple = &(rows[numrows - 1]->t_self);
! 	lastblock = ItemPointerGetBlockNumber(lasttuple);
! 	lastoffset = ItemPointerGetOffsetNumber(lasttuple);
! 
! 	/*
! 	 * If possible, estimate tuples/page using only completely-scanned
! 	 * pages.
! 	 */
! 	for (numest = numrows; numest > 0; numest--)
! 	{
! 		if (ItemPointerGetBlockNumber(&(rows[numest - 1]->t_self)) != lastblock)
! 			break;
! 	}
! 	if (numest == 0)
! 	{
! 		numest = numrows;		/* don't have a full page? */
! 		estblock = lastblock + 1;
! 	}
! 	else
! 		estblock = lastblock;
! 	tuplesperpage = (double) numest / (double) estblock;
! 
! 	t = (double) numrows;		/* t is the # of records processed so far */
  	rstate = init_selection_state(targrows);
! 	for (;;)
  	{
- 		double		targpos;
  		BlockNumber targblock;
  		Buffer		targbuffer;
  		Page		targpage;
--- 761,788 ----
  					double *totalrows)
  {
  	int			numrows = 0;
! 	TupleCount	liverows = 0;
! 	TupleCount	deadrows = 0;
! 	TupleCount	rowstoskip = -1;
  	BlockNumber	totalblocks;
! 	BlockSamplerData bs;
  	double		rstate;
  
  	Assert(targrows > 1);
  
! 	totalblocks = RelationGetNumberOfBlocks(onerel);
  
  	/*
! 	** Prepare for sampling block numbers
! 	*/
! 	BlockSampler_Init(&bs, totalblocks, targrows);
  	/*
! 	** Prepare for sampling rows
! 	*/
  	rstate = init_selection_state(targrows);
! 
! 	while (BlockSampler_HasMore(&bs))
  	{
  		BlockNumber targblock;
  		Buffer		targbuffer;
  		Page		targpage;
***************
*** 757,783 ****
  
  		vacuum_delay_point();
  
! 		t = select_next_random_record(t, targrows, &rstate);
! 		/* Try to read the t'th record in the table */
! 		targpos = t / tuplesperpage;
! 		targblock = (BlockNumber) targpos;
! 		targoffset = ((int) ((targpos - targblock) * tuplesperpage)) +
! 			FirstOffsetNumber;
! 		/* Make sure we are past the last selected record */
! 		if (targblock <= lastblock)
! 		{
! 			targblock = lastblock;
! 			if (targoffset <= lastoffset)
! 				targoffset = lastoffset + 1;
! 		}
! 		/* Loop to find first valid record at or after given position */
! pageloop:;
! 
! 		/*
! 		 * Have we fallen off the end of the relation?
! 		 */
! 		if (targblock >= totalblocks)
! 			break;
  
  		/*
  		 * We must maintain a pin on the target page's buffer to ensure
--- 791,797 ----
  
  		vacuum_delay_point();
  
! 		targblock = BlockSampler_Next(&bs);
  
  		/*
  		 * We must maintain a pin on the target page's buffer to ensure
***************
*** 795,848 ****
  		maxoffset = PageGetMaxOffsetNumber(targpage);
  		LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK);
  
! 		for (;;)
  		{
  			HeapTupleData targtuple;
  			Buffer		tupbuffer;
  
- 			if (targoffset > maxoffset)
- 			{
- 				/* Fell off end of this page, try next */
- 				ReleaseBuffer(targbuffer);
- 				targblock++;
- 				targoffset = FirstOffsetNumber;
- 				goto pageloop;
- 			}
  			ItemPointerSet(&targtuple.t_self, targblock, targoffset);
  			if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer,
  						   false, NULL))
  			{
  				/*
! 				 * Found a suitable tuple, so save it, replacing one old
! 				 * tuple at random
  				 */
! 				int			k = (int) (targrows * random_fract());
  
- 				Assert(k >= 0 && k < targrows);
- 				heap_freetuple(rows[k]);
- 				rows[k] = heap_copytuple(&targtuple);
  				/* this releases the second pin acquired by heap_fetch: */
  				ReleaseBuffer(tupbuffer);
- 				/* this releases the initial pin: */
- 				ReleaseBuffer(targbuffer);
- 				lastblock = targblock;
- 				lastoffset = targoffset;
- 				break;
  			}
! 			/* this tuple is dead, so advance to next one on same page */
! 			targoffset++;
  		}
  	}
  
  	/*
! 	 * Now we need to sort the collected tuples by position (itempointer).
  	 */
! 	qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
  
  	/*
  	 * Estimate total number of valid rows in relation.
  	 */
! 	*totalrows = floor((double) totalblocks * tuplesperpage + 0.5);
  
  	/*
  	 * Emit some interesting relation info 
--- 809,895 ----
  		maxoffset = PageGetMaxOffsetNumber(targpage);
  		LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK);
  
! 		for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; ++targoffset)
  		{
  			HeapTupleData targtuple;
  			Buffer		tupbuffer;
  
  			ItemPointerSet(&targtuple.t_self, targblock, targoffset);
  			if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer,
  						   false, NULL))
  			{
+ 				liverows += 1;
  				/*
! 				 * The first targrows rows are simply copied into the
! 				 * reservoir.
! 				 * Then we start replacing tuples in the sample until
! 				 * we reach the end of the relation.  This algorithm is
! 				 * from Jeff Vitter's paper (see full citation below).
! 				 * It works by repeatedly computing the number of the
! 				 * next tuple we want to fetch, which will replace a
! 				 * randomly chosen element of the reservoir (current
! 				 * set of tuples).  At all times the reservoir is a true
! 				 * random sample of the tuples we've passed over so far,
! 				 * so when we fall off the end of the relation we're done.
  				 */
! 				if (numrows < targrows)
! 					rows[numrows++] = heap_copytuple(&targtuple);
! 				else
! 				{
! 					if (rowstoskip < 0)
! 						rowstoskip = get_next_S(liverows, targrows, &rstate);
! 					if (rowstoskip == 0)
! 					{
! 						/*
! 						 * Found a suitable tuple, so save it,
! 						 * replacing one old tuple at random
! 						 */
! 						int	k = (int) (targrows * random_fract());
! 
! 						Assert(k >= 0 && k < targrows);
! 						heap_freetuple(rows[k]);
! 						rows[k] = heap_copytuple(&targtuple);
! 					}
! 
! 					rowstoskip -= 1;
! 				}
  
  				/* this releases the second pin acquired by heap_fetch: */
  				ReleaseBuffer(tupbuffer);
  			}
! 			else
! 			{
! 				/* This information is currently not used. */
! 				deadrows += 1;
! 			}
  		}
+ 
+ 		/* this releases the initial pin: */
+ 		ReleaseBuffer(targbuffer);
  	}
  
+ 	ereport(DEBUG2,
+ 			(errmsg("%d pages sampled, %.0f live rows and %.0f dead rows scanned",
+ 					bs.m, (double) liverows, (double) deadrows)));
+ 
  	/*
! 	 * If we ran out of tuples then we're done.  No sort is needed,
! 	 * since they're already in order.
! 	 *
! 	 * Otherwise we need to sort the collected tuples by position
! 	 * (itempointer).  I don't care for corner cases where the tuples
! 	 * are already sorted.
  	 */
! 	if (numrows == targrows)
! 		qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
  
  	/*
  	 * Estimate total number of valid rows in relation.
  	 */
! 	if (bs.m > 0)
! 		*totalrows = floor((double) liverows * totalblocks / bs.m + 0.5);
! 	else
! 		*totalrows = 0.0;
  
  	/*
  	 * Emit some interesting relation info 
***************
*** 872,894 ****
  /*
   * These two routines embody Algorithm Z from "Random sampling with a
   * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
!  * (Mar. 1985), Pages 37-57.  While Vitter describes his algorithm in terms
!  * of the count S of records to skip before processing another record,
!  * it is convenient to work primarily with t, the index (counting from 1)
!  * of the last record processed and next record to process.  The only extra
!  * state needed between calls is W, a random state variable.
!  *
!  * Note: the original algorithm defines t, S, numer, and denom as integers.
!  * Here we express them as doubles to avoid overflow if the number of rows
!  * in the table exceeds INT_MAX.  The algorithm should work as long as the
!  * row count does not become so large that it is not represented accurately
!  * in a double (on IEEE-math machines this would be around 2^52 rows).
   *
   * init_selection_state computes the initial W value.
   *
!  * Given that we've already processed t records (t >= n),
!  * select_next_random_record determines the number of the next record to
!  * process.
   */
  static double
  init_selection_state(int n)
--- 919,935 ----
  /*
   * These two routines embody Algorithm Z from "Random sampling with a
   * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
!  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
!  * of the count S of records to skip before processing another record.
!  * It is computed primarily based on t, the index (counting from 1)
!  * of the last record processed.  The only extra state needed between
!  * calls is W, a random state variable.
   *
   * init_selection_state computes the initial W value.
   *
!  * Given that we've already processed t records (t >= n), get_next_S
!  * determines the number of records to skip before the next record is
!  * processed.
   */
  static double
  init_selection_state(int n)
***************
*** 897,932 ****
  	return exp(-log(random_fract()) / n);
  }
  
! static double
! select_next_random_record(double t, int n, double *stateptr)
  {
  	/* The magic constant here is T from Vitter's paper */
! 	if (t <= (22.0 * n))
  	{
  		/* Process records using Algorithm X until t is large enough */
  		double		V,
  					quot;
  
  		V = random_fract();		/* Generate V */
  		t += 1;
! 		quot = (t - (double) n) / t;
  		/* Find min S satisfying (4.1) */
  		while (quot > V)
  		{
  			t += 1;
! 			quot *= (t - (double) n) / t;
  		}
  	}
  	else
  	{
  		/* Now apply Algorithm Z */
  		double		W = *stateptr;
! 		double		term = t - (double) n + 1;
! 		double		S;
  
  		for (;;)
  		{
! 			double		numer,
  						numer_lim,
  						denom;
  			double		U,
--- 938,976 ----
  	return exp(-log(random_fract()) / n);
  }
  
! static TupleCount
! get_next_S(TupleCount t, int n, double *stateptr)
  {
  	/* The magic constant here is T from Vitter's paper */
! 	if (t <= (22 * n))
  	{
  		/* Process records using Algorithm X until t is large enough */
+ 		TupleCount S = 0;
  		double		V,
  					quot;
  
  		V = random_fract();		/* Generate V */
  		t += 1;
! 		quot = 1.0 - (double) n / t;
  		/* Find min S satisfying (4.1) */
  		while (quot > V)
  		{
+ 			S += 1;
  			t += 1;
! 			quot *= 1.0 - (double) n / t;
  		}
+ 		return S;
  	}
  	else
  	{
  		/* Now apply Algorithm Z */
  		double		W = *stateptr;
! 		TupleCount	term = t - n + 1;
! 		TupleCount	S;
  
  		for (;;)
  		{
! 			TupleCount	numer,
  						numer_lim,
  						denom;
  			double		U,
***************
*** 941,947 ****
  			X = t * (W - 1.0);
  			S = floor(X);		/* S is tentatively set to floor(X) */
  			/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
! 			tmp = (t + 1) / term;
  			lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
  			rhs = (((t + X) / (term + S)) * term) / t;
  			if (lhs <= rhs)
--- 985,991 ----
  			X = t * (W - 1.0);
  			S = floor(X);		/* S is tentatively set to floor(X) */
  			/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
! 			tmp = (double) (t + 1) / term;
  			lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
  			rhs = (((t + X) / (term + S)) * term) / t;
  			if (lhs <= rhs)
***************
*** 951,979 ****
  			}
  			/* Test if U <= f(S)/cg(X) */
  			y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
! 			if ((double) n < S)
  			{
  				denom = t;
  				numer_lim = term + S;
  			}
  			else
  			{
! 				denom = t - (double) n + S;
  				numer_lim = t + 1;
  			}
  			for (numer = t + S; numer >= numer_lim; numer -= 1)
  			{
! 				y *= numer / denom;
  				denom -= 1;
  			}
  			W = exp(-log(random_fract()) / n);	/* Generate W in advance */
  			if (exp(log(y) / n) <= (t + X) / t)
  				break;
  		}
- 		t += S + 1;
  		*stateptr = W;
  	}
- 	return t;
  }
  
  /*
--- 995,1022 ----
  			}
  			/* Test if U <= f(S)/cg(X) */
  			y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
! 			if (n < S)
  			{
  				denom = t;
  				numer_lim = term + S;
  			}
  			else
  			{
! 				denom = t - n + S;
  				numer_lim = t + 1;
  			}
  			for (numer = t + S; numer >= numer_lim; numer -= 1)
  			{
! 				y *= (double) numer / denom;
  				denom -= 1;
  			}
  			W = exp(-log(random_fract()) / n);	/* Generate W in advance */
  			if (exp(log(y) / n) <= (t + X) / t)
  				break;
  		}
  		*stateptr = W;
+ 		return S;
  	}
  }
  
  /*