From d799c85e65346615f9fa102890e3c6c6156ce92f Mon Sep 17 00:00:00 2001
From: Petr Jelinek <pjmodos@pjmodos.net>
Date: Wed, 7 Jan 2015 23:36:56 +0100
Subject: [PATCH 1/3] separate block sampling functions

---
 src/backend/commands/analyze.c    | 123 +++---------------------------------
 src/backend/utils/misc/Makefile   |   2 +-
 src/backend/utils/misc/sampling.c | 130 ++++++++++++++++++++++++++++++++++++++
 src/include/utils/sampling.h      |  49 ++++++++++++++
 4 files changed, 188 insertions(+), 116 deletions(-)
 create mode 100644 src/backend/utils/misc/sampling.c
 create mode 100644 src/include/utils/sampling.h

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 5de2b39..0e770b7 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -50,23 +50,13 @@
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/pg_rusage.h"
+#include "utils/sampling.h"
 #include "utils/sortsupport.h"
 #include "utils/syscache.h"
 #include "utils/timestamp.h"
 #include "utils/tqual.h"
 
 
-/* Data structure for Algorithm S from Knuth 3.4.2 */
-typedef struct
-{
-	BlockNumber N;				/* number of blocks, known in advance */
-	int			n;				/* desired sample size */
-	BlockNumber t;				/* current block number */
-	int			m;				/* blocks selected so far */
-} BlockSamplerData;
-
-typedef BlockSamplerData *BlockSampler;
-
 /* Per-index data for ANALYZE */
 typedef struct AnlIndexData
 {
@@ -88,10 +78,6 @@ static BufferAccessStrategy vac_strategy;
 static void do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
 			   AcquireSampleRowsFunc acquirefunc, BlockNumber relpages,
 			   bool inh, bool in_outer_xact, int elevel);
-static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
-				  int samplesize);
-static bool BlockSampler_HasMore(BlockSampler bs);
-static BlockNumber BlockSampler_Next(BlockSampler bs);
 static void compute_index_stats(Relation onerel, double totalrows,
 					AnlIndexData *indexdata, int nindexes,
 					HeapTuple *rows, int numrows,
@@ -947,94 +933,6 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr)
 }
 
 /*
- * BlockSampler_Init -- prepare for random sampling of blocknumbers
- *
- * BlockSampler is used for stage one of our new two-stage tuple
- * sampling mechanism as discussed on pgsql-hackers 2004-04-02 (subject
- * "Large DB").  It selects a random sample of samplesize blocks out of
- * the nblocks blocks in the table.  If the table has less than
- * samplesize blocks, all blocks are selected.
- *
- * Since we know the total number of blocks in advance, we can use the
- * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
- * algorithm.
- */
-static void
-BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
-{
-	bs->N = nblocks;			/* measured table size */
-
-	/*
-	 * If we decide to reduce samplesize for tables that have less or not much
-	 * more than samplesize blocks, here is the place to do it.
-	 */
-	bs->n = samplesize;
-	bs->t = 0;					/* blocks scanned so far */
-	bs->m = 0;					/* blocks selected so far */
-}
-
-static bool
-BlockSampler_HasMore(BlockSampler bs)
-{
-	return (bs->t < bs->N) && (bs->m < bs->n);
-}
-
-static BlockNumber
-BlockSampler_Next(BlockSampler bs)
-{
-	BlockNumber K = bs->N - bs->t;		/* remaining blocks */
-	int			k = bs->n - bs->m;		/* blocks still to sample */
-	double		p;				/* probability to skip block */
-	double		V;				/* random */
-
-	Assert(BlockSampler_HasMore(bs));	/* hence K > 0 and k > 0 */
-
-	if ((BlockNumber) k >= K)
-	{
-		/* need all the rest */
-		bs->m++;
-		return bs->t++;
-	}
-
-	/*----------
-	 * It is not obvious that this code matches Knuth's Algorithm S.
-	 * Knuth says to skip the current block with probability 1 - k/K.
-	 * If we are to skip, we should advance t (hence decrease K), and
-	 * repeat the same probabilistic test for the next block.  The naive
-	 * implementation thus requires an anl_random_fract() call for each block
-	 * number.  But we can reduce this to one anl_random_fract() call per
-	 * selected block, by noting that each time the while-test succeeds,
-	 * we can reinterpret V as a uniform random number in the range 0 to p.
-	 * Therefore, instead of choosing a new V, we just adjust p to be
-	 * the appropriate fraction of its former value, and our next loop
-	 * makes the appropriate probabilistic test.
-	 *
-	 * We have initially K > k > 0.  If the loop reduces K to equal k,
-	 * the next while-test must fail since p will become exactly zero
-	 * (we assume there will not be roundoff error in the division).
-	 * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
-	 * to be doubly sure about roundoff error.)  Therefore K cannot become
-	 * less than k, which means that we cannot fail to select enough blocks.
-	 *----------
-	 */
-	V = anl_random_fract();
-	p = 1.0 - (double) k / (double) K;
-	while (V < p)
-	{
-		/* skip */
-		bs->t++;
-		K--;					/* keep K == N - t */
-
-		/* adjust p to be new cutoff point in reduced range */
-		p *= 1.0 - (double) k / (double) K;
-	}
-
-	/* select */
-	bs->m++;
-	return bs->t++;
-}
-
-/*
  * acquire_sample_rows -- acquire a random sample of rows from the table
  *
  * Selected rows are returned in the caller-allocated array rows[], which
@@ -1089,6 +987,8 @@ acquire_sample_rows(Relation onerel, int elevel,
 	/* Need a cutoff xmin for HeapTupleSatisfiesVacuum */
 	OldestXmin = GetOldestXmin(onerel, true);
 
+	/* Seed the sampler random number generator */
+	sampler_setseed(random());
 	/* Prepare for sampling block numbers */
 	BlockSampler_Init(&bs, totalblocks, targrows);
 	/* Prepare for sampling rows */
@@ -1249,7 +1149,7 @@ acquire_sample_rows(Relation onerel, int elevel,
 						 * Found a suitable tuple, so save it, replacing one
 						 * old tuple at random
 						 */
-						int			k = (int) (targrows * anl_random_fract());
+						int			k = (int) (targrows * sampler_random_fract());
 
 						Assert(k >= 0 && k < targrows);
 						heap_freetuple(rows[k]);
@@ -1308,13 +1208,6 @@ acquire_sample_rows(Relation onerel, int elevel,
 	return numrows;
 }
 
-/* Select a random value R uniformly distributed in (0 - 1) */
-double
-anl_random_fract(void)
-{
-	return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2);
-}
-
 /*
  * These two routines embody Algorithm Z from "Random sampling with a
  * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
@@ -1333,7 +1226,7 @@ double
 anl_init_selection_state(int n)
 {
 	/* Initial value of W (for use when Algorithm Z is first applied) */
-	return exp(-log(anl_random_fract()) / n);
+	return exp(-log(sampler_random_fract()) / n);
 }
 
 double
@@ -1348,7 +1241,7 @@ anl_get_next_S(double t, int n, double *stateptr)
 		double		V,
 					quot;
 
-		V = anl_random_fract(); /* Generate V */
+		V = sampler_random_fract(); /* Generate V */
 		S = 0;
 		t += 1;
 		/* Note: "num" in Vitter's code is always equal to t - n */
@@ -1380,7 +1273,7 @@ anl_get_next_S(double t, int n, double *stateptr)
 						tmp;
 
 			/* Generate U and X */
-			U = anl_random_fract();
+			U = sampler_random_fract();
 			X = t * (W - 1.0);
 			S = floor(X);		/* S is tentatively set to floor(X) */
 			/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
@@ -1409,7 +1302,7 @@ anl_get_next_S(double t, int n, double *stateptr)
 				y *= numer / denom;
 				denom -= 1;
 			}
-			W = exp(-log(anl_random_fract()) / n);		/* Generate W in advance */
+			W = exp(-log(sampler_random_fract()) / n);		/* Generate W in advance */
 			if (exp(log(y) / n) <= (t + X) / t)
 				break;
 		}
diff --git a/src/backend/utils/misc/Makefile b/src/backend/utils/misc/Makefile
index 449d5b4..848ba29 100644
--- a/src/backend/utils/misc/Makefile
+++ b/src/backend/utils/misc/Makefile
@@ -15,7 +15,7 @@ include $(top_builddir)/src/Makefile.global
 override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS)
 
 OBJS = guc.o help_config.o pg_rusage.o ps_status.o \
-       superuser.o timeout.o tzparser.o
+       sampling.o superuser.o timeout.o tzparser.o
 
 # This location might depend on the installation directories. Therefore
 # we can't subsitute it into pg_config.h.
diff --git a/src/backend/utils/misc/sampling.c b/src/backend/utils/misc/sampling.c
new file mode 100644
index 0000000..71a91f9
--- /dev/null
+++ b/src/backend/utils/misc/sampling.c
@@ -0,0 +1,130 @@
+/*-------------------------------------------------------------------------
+ *
+ * sampling.c
+ *	  Block sampling routines shared by ANALYZE and TABLESAMPLE.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/misc/sampling.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "utils/sampling.h"
+
+static unsigned short _sampler_seed[3] = { 0x330e, 0xabcd, 0x1234 };
+
+/*
+ * BlockSampler_Init -- prepare for random sampling of blocknumbers
+ *
+ * BlockSampler is used for stage one of our new two-stage tuple
+ * sampling mechanism as discussed on pgsql-hackers 2004-04-02 (subject
+ * "Large DB").  It selects a random sample of samplesize blocks out of
+ * the nblocks blocks in the table.  If the table has less than
+ * samplesize blocks, all blocks are selected.
+ *
+ * Since we know the total number of blocks in advance, we can use the
+ * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
+ * algorithm.
+ */
+void
+BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
+{
+	bs->N = nblocks;			/* measured table size */
+
+	/*
+	 * If we decide to reduce samplesize for tables that have less or not much
+	 * more than samplesize blocks, here is the place to do it.
+	 */
+	bs->n = samplesize;
+	bs->t = 0;					/* blocks scanned so far */
+	bs->m = 0;					/* blocks selected so far */
+}
+
+bool
+BlockSampler_HasMore(BlockSampler bs)
+{
+	return (bs->t < bs->N) && (bs->m < bs->n);
+}
+
+BlockNumber
+BlockSampler_Next(BlockSampler bs)
+{
+	BlockNumber K = bs->N - bs->t;		/* remaining blocks */
+	int			k = bs->n - bs->m;		/* blocks still to sample */
+	double		p;				/* probability to skip block */
+	double		V;				/* random */
+
+	Assert(BlockSampler_HasMore(bs));	/* hence K > 0 and k > 0 */
+
+	if ((BlockNumber) k >= K)
+	{
+		/* need all the rest */
+		bs->m++;
+		return bs->t++;
+	}
+
+	/*----------
+	 * It is not obvious that this code matches Knuth's Algorithm S.
+	 * Knuth says to skip the current block with probability 1 - k/K.
+	 * If we are to skip, we should advance t (hence decrease K), and
+	 * repeat the same probabilistic test for the next block.  The naive
+	 * implementation thus requires an sampler_random_fract() call for each
+	 * block number.  But we can reduce this to one sampler_random_fract()
+	 * call per selected block, by noting that each time the while-test
+	 * succeeds, we can reinterpret V as a uniform random number in the range
+	 * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
+	 * the appropriate fraction of its former value, and our next loop
+	 * makes the appropriate probabilistic test.
+	 *
+	 * We have initially K > k > 0.  If the loop reduces K to equal k,
+	 * the next while-test must fail since p will become exactly zero
+	 * (we assume there will not be roundoff error in the division).
+	 * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
+	 * to be doubly sure about roundoff error.)  Therefore K cannot become
+	 * less than k, which means that we cannot fail to select enough blocks.
+	 *----------
+	 */
+	V = sampler_random_fract();
+	p = 1.0 - (double) k / (double) K;
+	while (V < p)
+	{
+		/* skip */
+		bs->t++;
+		K--;					/* keep K == N - t */
+
+		/* adjust p to be new cutoff point in reduced range */
+		p *= 1.0 - (double) k / (double) K;
+	}
+
+	/* select */
+	bs->m++;
+	return bs->t++;
+}
+
+
+/*----------
+ * Random number generator used by sampling
+ *----------
+ */
+void
+sampler_setseed(long seed)
+{
+	_sampler_seed[0] = 0x330e;
+	_sampler_seed[1] = (unsigned short) seed;
+	_sampler_seed[2] = (unsigned short) (seed >> 16);
+}
+
+/* Select a random value R uniformly distributed in (0 - 1) */
+double
+sampler_random_fract(void)
+{
+	return pg_erand48(_sampler_seed);
+}
diff --git a/src/include/utils/sampling.h b/src/include/utils/sampling.h
new file mode 100644
index 0000000..734cdc0
--- /dev/null
+++ b/src/include/utils/sampling.h
@@ -0,0 +1,49 @@
+/*-------------------------------------------------------------------------
+ *
+ * sampling.h
+ *	  definitions for sampling functions
+ *
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/sampling.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SAMPLING_H
+#define SAMPLING_H
+
+#include "storage/bufmgr.h"
+
+typedef enum SamplerAccessStrategy
+{
+	SAS_RANDOM,
+	SAS_SEQUENTIAL
+} SamplerAccessStrategy;
+
+/* Data structure for Algorithm S from Knuth 3.4.2 */
+typedef struct
+{
+	BlockNumber N;				/* number of blocks, known in advance */
+	int			n;				/* desired sample size */
+	BlockNumber t;				/* current block number */
+	int			m;				/* blocks selected so far */
+} BlockSamplerData;
+
+typedef BlockSamplerData *BlockSampler;
+
+extern void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
+				  int samplesize);
+extern bool BlockSampler_HasMore(BlockSampler bs);
+extern BlockNumber BlockSampler_Next(BlockSampler bs);
+
+/* Vitter reservoir sampling functions */
+extern double vitter_init_selection_state(int n);
+extern double vitter_get_next_S(double t, int n, double *stateptr);
+
+/* Random generator */
+extern void sampler_setseed(long seed);
+extern double sampler_random_fract(void);
+
+#endif /* SAMPLING_H */
-- 
1.9.1