From f2486568558d4c2cd3ee59af024c3f450d6ba0fa Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <peter.geoghegan86@gmail.com>
Date: Thu, 13 Aug 2015 14:32:32 -0700
Subject: [PATCH 2/5] Further diminish role of replacement selection

Tuplesort callers now provide a total row estimate hint, typically the
optimizer's own estimate.  This is used to determine if replacement
selection will be viable even for the first run.  Testing shows that the
major benefit of replacement selection is only that it may enable a
"quicksort with spillover", which is the sole remaining justification
for going with replacement selection for the first run.  Even the cases
traditionally considered very sympathetic to replacement selection (e.g.
almost sorted input) do not appear to come out ahead on contemporary
hardware, so callers are not asked to provide a physical/logical
correlation hint.  There is surprisingly little reason to try
replacement selection even in the event of a strong correlation.

Some of the best cases for a simple hybrid sort-merge strategy can only
be seen when replacement selection isn't even attempted before being
abandoned; replacement selection's tendency to produce longer runs is a
liability here rather than a benefit.  This change significantly reduces
the frequency that replacement selection will even be attempted
(previously, it was always at least used for the first run).
---
 src/backend/access/hash/hash.c         |   2 +-
 src/backend/access/hash/hashsort.c     |   4 +-
 src/backend/access/nbtree/nbtree.c     |  11 +-
 src/backend/access/nbtree/nbtsort.c    |  10 +-
 src/backend/catalog/index.c            |   1 +
 src/backend/commands/cluster.c         |   4 +-
 src/backend/executor/nodeAgg.c         |  26 ++++-
 src/backend/executor/nodeSort.c        |   1 +
 src/backend/utils/adt/orderedsetaggs.c |  13 ++-
 src/backend/utils/sort/tuplesort.c     | 182 +++++++++++++++++++++++++--------
 src/include/access/hash.h              |   3 +-
 src/include/access/nbtree.h            |   2 +-
 src/include/executor/nodeAgg.h         |   2 +
 src/include/utils/tuplesort.h          |  15 ++-
 14 files changed, 214 insertions(+), 62 deletions(-)

diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 24b06a5..8f71980 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -86,7 +86,7 @@ hashbuild(PG_FUNCTION_ARGS)
 	 * one page.
 	 */
 	if (num_buckets >= (uint32) NBuffers)
-		buildstate.spool = _h_spoolinit(heap, index, num_buckets);
+		buildstate.spool = _h_spoolinit(heap, index, num_buckets, reltuples);
 	else
 		buildstate.spool = NULL;
 
diff --git a/src/backend/access/hash/hashsort.c b/src/backend/access/hash/hashsort.c
index c67c057..5c7e137 100644
--- a/src/backend/access/hash/hashsort.c
+++ b/src/backend/access/hash/hashsort.c
@@ -44,7 +44,8 @@ struct HSpool
  * create and initialize a spool structure
  */
 HSpool *
-_h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
+_h_spoolinit(Relation heap, Relation index, uint32 num_buckets,
+			 double reltuples)
 {
 	HSpool	   *hspool = (HSpool *) palloc0(sizeof(HSpool));
 	uint32		hash_mask;
@@ -71,6 +72,7 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
 												   index,
 												   hash_mask,
 												   maintenance_work_mem,
+												   reltuples,
 												   false);
 
 	return hspool;
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index cf4a6dc..0957e0f 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -23,6 +23,7 @@
 #include "access/xlog.h"
 #include "catalog/index.h"
 #include "commands/vacuum.h"
+#include "optimizer/plancat.h"
 #include "storage/indexfsm.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
@@ -85,7 +86,9 @@ btbuild(PG_FUNCTION_ARGS)
 	Relation	index = (Relation) PG_GETARG_POINTER(1);
 	IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
 	IndexBuildResult *result;
+	BlockNumber relpages;
 	double		reltuples;
+	double		allvisfrac;
 	BTBuildState buildstate;
 
 	buildstate.isUnique = indexInfo->ii_Unique;
@@ -100,6 +103,9 @@ btbuild(PG_FUNCTION_ARGS)
 		ResetUsage();
 #endif   /* BTREE_BUILD_STATS */
 
+	/* Estimate the number of rows currently present in the table */
+	estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
+
 	/*
 	 * We expect to be called exactly once for any index relation. If that's
 	 * not the case, big trouble's what we have.
@@ -108,14 +114,15 @@ btbuild(PG_FUNCTION_ARGS)
 		elog(ERROR, "index \"%s\" already contains data",
 			 RelationGetRelationName(index));
 
-	buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false);
+	buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false,
+									 reltuples);
 
 	/*
 	 * If building a unique index, put dead tuples in a second spool to keep
 	 * them out of the uniqueness check.
 	 */
 	if (indexInfo->ii_Unique)
-		buildstate.spool2 = _bt_spoolinit(heap, index, false, true);
+		buildstate.spool2 = _bt_spoolinit(heap, index, false, true, reltuples);
 
 	/* do the heap scan */
 	reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index f95f67a..0d4a5ea 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -149,7 +149,8 @@ static void _bt_load(BTWriteState *wstate,
  * create and initialize a spool structure
  */
 BTSpool *
-_bt_spoolinit(Relation heap, Relation index, bool isunique, bool isdead)
+_bt_spoolinit(Relation heap, Relation index, bool isunique, bool isdead,
+			  double reltuples)
 {
 	BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
 	int			btKbytes;
@@ -165,10 +166,15 @@ _bt_spoolinit(Relation heap, Relation index, bool isunique, bool isdead)
 	 * unique index actually requires two BTSpool objects.  We expect that the
 	 * second one (for dead tuples) won't get very full, so we give it only
 	 * work_mem.
+	 *
+	 * The reltuples hint does not account for factors like whether or not
+	 * this is a partial index, or whether this is the second BTSpool object,
+	 * because it seems more conservative to estimate high.
 	 */
 	btKbytes = isdead ? work_mem : maintenance_work_mem;
 	btspool->sortstate = tuplesort_begin_index_btree(heap, index, isunique,
-													 btKbytes, false);
+													 btKbytes, reltuples,
+													 false);
 
 	return btspool;
 }
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index e59b163..88ee81d 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -2835,6 +2835,7 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
 	state.tuplesort = tuplesort_begin_datum(TIDOID, TIDLessOperator,
 											InvalidOid, false,
 											maintenance_work_mem,
+											ivinfo.num_heap_tuples,
 											false);
 	state.htups = state.itups = state.tups_inserted = 0;
 
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index 7ab4874..23f6459 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -891,7 +891,9 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
 	/* Set up sorting if wanted */
 	if (use_sort)
 		tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
-											maintenance_work_mem, false);
+											maintenance_work_mem,
+											OldHeap->rd_rel->reltuples,
+											false);
 	else
 		tuplesort = NULL;
 
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 2e36855..f580cca 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -520,6 +520,7 @@ initialize_phase(AggState *aggstate, int newphase)
 												  sortnode->collations,
 												  sortnode->nullsFirst,
 												  work_mem,
+												  sortnode->plan.plan_rows,
 												  false);
 	}
 
@@ -588,7 +589,8 @@ initialize_aggregate(AggState *aggstate, AggStatePerTrans pertrans,
 									  pertrans->sortOperators[0],
 									  pertrans->sortCollations[0],
 									  pertrans->sortNullsFirst[0],
-									  work_mem, false);
+									  work_mem, agg_input_rows(aggstate),
+									  false);
 		else
 			pertrans->sortstates[aggstate->current_set] =
 				tuplesort_begin_heap(pertrans->evaldesc,
@@ -597,7 +599,8 @@ initialize_aggregate(AggState *aggstate, AggStatePerTrans pertrans,
 									 pertrans->sortOperators,
 									 pertrans->sortCollations,
 									 pertrans->sortNullsFirst,
-									 work_mem, false);
+									 work_mem, agg_input_rows(aggstate),
+									 false);
 	}
 
 	/*
@@ -1439,6 +1442,25 @@ find_hash_columns(AggState *aggstate)
 }
 
 /*
+ * Estimate the number of rows input to the sorter.
+ *
+ * Exported for use by ordered-set aggregates.
+ */
+double
+agg_input_rows(AggState *aggstate)
+{
+	Plan	   *outerNode;
+
+	/*
+	 * Get information about the size of the relation to be sorted (it's the
+	 * "outer" subtree of this node)
+	 */
+	outerNode = outerPlanState(aggstate)->plan;
+
+	return outerNode->plan_rows;
+}
+
+/*
  * Estimate per-hash-table-entry overhead for the planner.
  *
  * Note that the estimate does not include space for pass-by-reference
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
index af1dccf..e4b1104 100644
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -89,6 +89,7 @@ ExecSort(SortState *node)
 											  plannode->collations,
 											  plannode->nullsFirst,
 											  work_mem,
+											  plannode->plan.plan_rows,
 											  node->randomAccess);
 		if (node->bounded)
 			tuplesort_set_bound(tuplesortstate, node->bound);
diff --git a/src/backend/utils/adt/orderedsetaggs.c b/src/backend/utils/adt/orderedsetaggs.c
index 39ed85b..b51a945 100644
--- a/src/backend/utils/adt/orderedsetaggs.c
+++ b/src/backend/utils/adt/orderedsetaggs.c
@@ -20,6 +20,7 @@
 #include "catalog/pg_operator.h"
 #include "catalog/pg_type.h"
 #include "executor/executor.h"
+#include "executor/nodeAgg.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
 #include "optimizer/tlist.h"
@@ -103,6 +104,7 @@ ordered_set_startup(FunctionCallInfo fcinfo, bool use_tuples)
 {
 	OSAPerGroupState *osastate;
 	OSAPerQueryState *qstate;
+	AggState		 *aggstate;
 	MemoryContext gcontext;
 	MemoryContext oldcontext;
 
@@ -117,8 +119,11 @@ ordered_set_startup(FunctionCallInfo fcinfo, bool use_tuples)
 	/*
 	 * We keep a link to the per-query state in fn_extra; if it's not there,
 	 * create it, and do the per-query setup we need.
+	 *
+	 * aggstate is used to get a hint of the total number of tuples for tuplesort.
 	 */
 	qstate = (OSAPerQueryState *) fcinfo->flinfo->fn_extra;
+	aggstate = (AggState *) fcinfo->context;
 	if (qstate == NULL)
 	{
 		Aggref	   *aggref;
@@ -276,13 +281,17 @@ ordered_set_startup(FunctionCallInfo fcinfo, bool use_tuples)
 												   qstate->sortOperators,
 												   qstate->sortCollations,
 												   qstate->sortNullsFirsts,
-												   work_mem, false);
+												   work_mem,
+												   agg_input_rows(aggstate),
+												   false);
 	else
 		osastate->sortstate = tuplesort_begin_datum(qstate->sortColType,
 													qstate->sortOperator,
 													qstate->sortCollation,
 													qstate->sortNullsFirst,
-													work_mem, false);
+													work_mem,
+													agg_input_rows(aggstate),
+													false);
 
 	osastate->number_of_rows = 0;
 
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index fc4ac90..6d766d2 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -13,11 +13,13 @@
  * See Knuth, volume 3, for more than you want to know about the external
  * sorting algorithm.  We divide the input into sorted runs using replacement
  * selection, in the form of a priority tree implemented as a heap
- * (essentially his Algorithm 5.2.3H -- although that strategy can be
- * abandoned where it does not appear to help), then merge the runs using
- * polyphase merge, Knuth's Algorithm 5.4.2D.  The logical "tapes" used by
- * Algorithm D are implemented by logtape.c, which avoids space wastage by
- * recycling disk space as soon as each block is read from its "tape".
+ * (essentially his Algorithm 5.2.3H -- although that strategy is often
+ * avoided altogether), then merge the runs using polyphase merge, Knuth's
+ * Algorithm 5.4.2D.  The logical "tapes" used by Algorithm D are
+ * implemented by logtape.c, which avoids space wastage by recycling disk
+ * space as soon as each block is read from its "tape".  Note that a hybrid
+ * sort-merge strategy is usually used in practice, because maintaining a
+ * priority tree/heap is expensive.
  *
  * We do not form the initial runs using Knuth's recommended replacement
  * selection data structure (Algorithm 5.4.1R), because it uses a fixed
@@ -108,10 +110,13 @@
  * If, having maintained a replacement selection priority queue (heap) for
  * the first run it transpires that there will be multiple on-tape runs
  * anyway, we abandon treating memtuples as a heap, and quicksort and write
- * in memtuples-sized batches.  This gives us most of the advantages of
- * always quicksorting and batch dumping runs, which can perform much better
- * than heap sorting and incrementally spilling tuples, without giving up on
- * replacement selection in cases where it remains compelling.
+ * in memtuples-sized batches.  This allows a "quicksort with spillover" to
+ * occur, but that remains about the only truly compelling case for
+ * replacement selection.  Callers provide a hint for the total number of
+ * rows, used to avoid replacement selection when a "quicksort with
+ * spillover" is not anticipated -- see useselection().  A hybrid sort-merge
+ * strategy can be much faster for very large inputs when replacement
+ * selection is never attempted.
  *
  *
  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
@@ -245,6 +250,7 @@ struct Tuplesortstate
 {
 	TupSortStatus status;		/* enumerated value as shown above */
 	int			nKeys;			/* number of columns in sort key */
+	double		rowNumHint;		/* caller's hint of total # of rows */
 	bool		randomAccess;	/* did caller request random access? */
 	bool		bounded;		/* did caller specify a maximum number of
 								 * tuples to return? */
@@ -313,7 +319,9 @@ struct Tuplesortstate
 	/*
 	 * While building initial runs, this indicates if the replacement
 	 * selection strategy or simple hybrid sort-merge strategy is in use.
-	 * Replacement selection is abandoned after first run.
+	 * Replacement selection may be determined to not be effective ahead of
+	 * time, based on a caller-supplied hint.  Otherwise, it is abandoned
+	 * after first run.
 	 */
 	bool		replaceActive;
 
@@ -505,9 +513,11 @@ struct Tuplesortstate
 	} while(0)
 
 
-static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
+static Tuplesortstate *tuplesort_begin_common(int workMem, double rowNumHint,
+									  bool randomAccess);
 static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
 static bool consider_abort_common(Tuplesortstate *state);
+static bool useselection(Tuplesortstate *state);
 static void inittapes(Tuplesortstate *state);
 static void selectnewtape(Tuplesortstate *state);
 static void mergeruns(Tuplesortstate *state);
@@ -584,12 +594,14 @@ static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
  * Each variant of tuplesort_begin has a workMem parameter specifying the
  * maximum number of kilobytes of RAM to use before spilling data to disk.
  * (The normal value of this parameter is work_mem, but some callers use
- * other values.)  Each variant also has a randomAccess parameter specifying
- * whether the caller needs non-sequential access to the sort result.
+ * other values.)  Each variant also has a hint parameter giving the total
+ * number of rows to be sorted, and a randomAccess parameter specifying
+ * whether the caller needs non-sequential access to the sort result.  Since
+ * rowNumHint is just a hint, it's acceptable for it to be zero or negative.
  */
 
 static Tuplesortstate *
-tuplesort_begin_common(int workMem, bool randomAccess)
+tuplesort_begin_common(int workMem, double rowNumHint, bool randomAccess)
 {
 	Tuplesortstate *state;
 	MemoryContext sortcontext;
@@ -619,6 +631,7 @@ tuplesort_begin_common(int workMem, bool randomAccess)
 #endif
 
 	state->status = TSS_INITIAL;
+	state->rowNumHint = rowNumHint;
 	state->randomAccess = randomAccess;
 	state->bounded = false;
 	state->boundUsed = false;
@@ -664,9 +677,11 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 					 int nkeys, AttrNumber *attNums,
 					 Oid *sortOperators, Oid *sortCollations,
 					 bool *nullsFirstFlags,
-					 int workMem, bool randomAccess)
+					 int workMem, double rowNumHint,
+					 bool randomAccess)
 {
-	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+	Tuplesortstate *state = tuplesort_begin_common(workMem, rowNumHint,
+												   randomAccess);
 	MemoryContext oldcontext;
 	int			i;
 
@@ -734,9 +749,11 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 Tuplesortstate *
 tuplesort_begin_cluster(TupleDesc tupDesc,
 						Relation indexRel,
-						int workMem, bool randomAccess)
+						int workMem,
+						double rowNumHint, bool randomAccess)
 {
-	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+	Tuplesortstate *state = tuplesort_begin_common(workMem, rowNumHint,
+												   randomAccess);
 	ScanKey		indexScanKey;
 	MemoryContext oldcontext;
 	int			i;
@@ -827,9 +844,11 @@ Tuplesortstate *
 tuplesort_begin_index_btree(Relation heapRel,
 							Relation indexRel,
 							bool enforceUnique,
-							int workMem, bool randomAccess)
+							int workMem,
+							double rowNumHint, bool randomAccess)
 {
-	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+	Tuplesortstate *state = tuplesort_begin_common(workMem, rowNumHint,
+												   randomAccess);
 	ScanKey		indexScanKey;
 	MemoryContext oldcontext;
 	int			i;
@@ -902,9 +921,11 @@ Tuplesortstate *
 tuplesort_begin_index_hash(Relation heapRel,
 						   Relation indexRel,
 						   uint32 hash_mask,
-						   int workMem, bool randomAccess)
+						   int workMem,
+						   double rowNumHint, bool randomAccess)
 {
-	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+	Tuplesortstate *state = tuplesort_begin_common(workMem, rowNumHint,
+												   randomAccess);
 	MemoryContext oldcontext;
 
 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
@@ -937,9 +958,10 @@ tuplesort_begin_index_hash(Relation heapRel,
 Tuplesortstate *
 tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
 					  bool nullsFirstFlag,
-					  int workMem, bool randomAccess)
+					  int workMem, double rowNumHint, bool randomAccess)
 {
-	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+	Tuplesortstate *state = tuplesort_begin_common(workMem, rowNumHint,
+												   randomAccess);
 	MemoryContext oldcontext;
 	int16		typlen;
 	bool		typbyval;
@@ -2270,6 +2292,73 @@ tuplesort_merge_order(int64 allowedMem)
 }
 
 /*
+ * useselection - determine if one replacement selection run should be
+ * attempted.
+ *
+ * This is called when we just ran out of memory, and must consider costs
+ * and benefits of replacement selection for first run, which can result in
+ * a "quicksort with spillover".  Note that replacement selection is always
+ * abandoned after the first run.
+ */
+static bool
+useselection(Tuplesortstate *state)
+{
+	int64		memNowUsed = state->allowedMem - state->availMem;
+	double		avgTupleSize;
+	int			increments;
+	double		crossover;
+	bool		useSelection;
+
+	/* For randomAccess callers, "quicksort with spillover" is never used */
+	if (state->randomAccess)
+		return false;
+
+	/*
+	 * The crossover point lies somewhere between memtuples holding 40% of
+	 * the total tuples to sort and holding all but one.  This weighs
+	 * approximate savings in I/O against generic heap sorting cost.
+	 */
+	avgTupleSize = (double) memNowUsed / (double) state->memtupsize;
+
+	/*
+	 * Starting from a threshold of 90%, refund 7.5% per 32 byte
+	 * average-size-increment.
+	 */
+	increments = MAXALIGN_DOWN((int) avgTupleSize) / 32;
+	crossover = 0.90 - (increments * 0.075);
+
+	/*
+	 * Clamp, making either outcome possible regardless of average size.
+	 *
+	 * 40% is about the minimum point at which "quicksort with spillover"
+	 * can still occur without a logical/physical correlation.
+	 */
+	crossover = Max(0.40, Min(crossover, 0.85));
+
+	/*
+	 * The point where the overhead of maintaining the heap invariant is
+	 * likely to dominate over any saving in I/O is somewhat arbitrarily
+	 * assumed to be the point where memtuples' size exceeds MaxAllocSize
+	 * (note that overall memory consumption may be far greater).  Past this
+	 * point, only the most compelling cases use replacement selection for
+	 * their first run.
+	 */
+	if (sizeof(SortTuple) * state->memtupcount > MaxAllocSize)
+		crossover = avgTupleSize > 32 ? 0.90 : 0.95;
+
+	useSelection = state->memtupcount > state->rowNumHint * crossover;
+
+#ifdef TRACE_SORT
+	if (trace_sort)
+		elog(LOG, "%s in use from row %d with %.2f total rows %.3f crossover",
+			 useSelection? "replacement selection" : "hybrid sort-merge",
+			 state->memtupcount, state->rowNumHint, crossover);
+#endif
+
+	return useSelection;
+}
+
+/*
  * inittapes - initialize for tape sorting.
  *
  * This is called only if we have found we don't have room to sort in memory.
@@ -2278,7 +2367,6 @@ static void
 inittapes(Tuplesortstate *state)
 {
 	int			maxTapes,
-				ntuples,
 				j;
 	int64		tapeSpace;
 
@@ -2337,32 +2425,38 @@ inittapes(Tuplesortstate *state)
 	state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
 
 	/*
-	 * Give replacement selection a try.  There will be a switch to a simple
-	 * hybrid sort-merge strategy after the first run (iff there is to be a
-	 * second on-tape run).
+	 * Give replacement selection a try when number of tuples to be sorted
+	 * has a reasonable chance of enabling a "quicksort with spillover".
+	 * There will be a switch to a simple hybrid sort-merge strategy after
+	 * the first run (iff there is to be a second on-tape run).
 	 */
-	state->replaceActive = true;
+	state->replaceActive = useselection(state);
 	state->cached = false;
 	state->just_memtuples = false;
 
-	/*
-	 * Convert the unsorted contents of memtuples[] into a heap. Each tuple is
-	 * marked as belonging to run number zero.
-	 *
-	 * NOTE: we pass false for checkIndex since there's no point in comparing
-	 * indexes in this step, even though we do intend the indexes to be part
-	 * of the sort key...
-	 */
-	ntuples = state->memtupcount;
-	state->memtupcount = 0;		/* make the heap empty */
-	for (j = 0; j < ntuples; j++)
+	if (state->replaceActive)
 	{
-		/* Must copy source tuple to avoid possible overwrite */
-		SortTuple	stup = state->memtuples[j];
+		/*
+		 * Convert the unsorted contents of memtuples[] into a heap. Each
+		 * tuple is marked as belonging to run number zero.
+		 *
+		 * NOTE: we pass false for checkIndex since there's no point in
+		 * comparing indexes in this step, even though we do intend the
+		 * indexes to be part of the sort key...
+		 */
+		int			ntuples = state->memtupcount;
 
-		tuplesort_heap_insert(state, &stup, 0, false);
+		state->memtupcount = 0;		/* make the heap empty */
+
+		for (j = 0; j < ntuples; j++)
+		{
+			/* Must copy source tuple to avoid possible overwrite */
+			SortTuple	stup = state->memtuples[j];
+
+			tuplesort_heap_insert(state, &stup, 0, false);
+		}
+		Assert(state->memtupcount == ntuples);
 	}
-	Assert(state->memtupcount == ntuples);
 
 	state->currentRun = 0;
 
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 97cb859..95acc1d 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -335,7 +335,8 @@ extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
 /* hashsort.c */
 typedef struct HSpool HSpool;	/* opaque struct in hashsort.c */
 
-extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets);
+extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets,
+				 double reltuples);
 extern void _h_spooldestroy(HSpool *hspool);
 extern void _h_spool(HSpool *hspool, ItemPointer self,
 		 Datum *values, bool *isnull);
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 9e48efd..5504b7b 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -743,7 +743,7 @@ extern void BTreeShmemInit(void);
 typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */
 
 extern BTSpool *_bt_spoolinit(Relation heap, Relation index,
-			  bool isunique, bool isdead);
+			  bool isunique, bool isdead, double reltuples);
 extern void _bt_spooldestroy(BTSpool *btspool);
 extern void _bt_spool(BTSpool *btspool, ItemPointer self,
 		  Datum *values, bool *isnull);
diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h
index fe3b81a..e6144f2 100644
--- a/src/include/executor/nodeAgg.h
+++ b/src/include/executor/nodeAgg.h
@@ -21,6 +21,8 @@ extern TupleTableSlot *ExecAgg(AggState *node);
 extern void ExecEndAgg(AggState *node);
 extern void ExecReScanAgg(AggState *node);
 
+extern double agg_input_rows(AggState *aggstate);
+
 extern Size hash_agg_entry_size(int numAggs);
 
 extern Datum aggregate_dummy(PG_FUNCTION_ARGS);
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index 3679815..11a5fb7 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -62,22 +62,27 @@ extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
 					 int nkeys, AttrNumber *attNums,
 					 Oid *sortOperators, Oid *sortCollations,
 					 bool *nullsFirstFlags,
-					 int workMem, bool randomAccess);
+					 int workMem,
+					 double rowNumHint, bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
 						Relation indexRel,
-						int workMem, bool randomAccess);
+						int workMem,
+						double rowNumHint, bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
 							Relation indexRel,
 							bool enforceUnique,
-							int workMem, bool randomAccess);
+							int workMem,
+							double rowNumHint, bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
 						   Relation indexRel,
 						   uint32 hash_mask,
-						   int workMem, bool randomAccess);
+						   int workMem,
+						   double rowNumHint, bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
 					  Oid sortOperator, Oid sortCollation,
 					  bool nullsFirstFlag,
-					  int workMem, bool randomAccess);
+					  int workMem,
+					  double rowNumHint, bool randomAccess);
 
 extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
 
-- 
1.9.1

