From 865a1abeb6bfb601b1ec605afb1e339c0e444e10 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@heroku.com>
Date: Sun, 9 Nov 2014 14:38:44 -0800
Subject: [PATCH 2/2] Estimate total number of rows to be sorted

Sortsupport opclasses now accept a row hint, indicating the estimated
number of rows to be sorted.  This gives opclasses a sense of proportion
about how far along the copying of tuples is when considering aborting
abbreviation.

Estimates come from various sources.  The text opclass now always avoids
aborting abbreviation if the total number of rows to be sorted is high
enough, without considering cardinality at all.
---
 src/backend/access/nbtree/nbtree.c     |  5 ++-
 src/backend/access/nbtree/nbtsort.c    | 14 +++++-
 src/backend/commands/cluster.c         |  4 +-
 src/backend/executor/nodeAgg.c         |  5 ++-
 src/backend/executor/nodeSort.c        |  1 +
 src/backend/utils/adt/orderedsetaggs.c |  2 +-
 src/backend/utils/adt/varlena.c        | 80 ++++++++++++++++++++++++++++++++--
 src/backend/utils/sort/tuplesort.c     | 14 ++++--
 src/include/access/nbtree.h            |  2 +-
 src/include/utils/sortsupport.h        |  7 ++-
 src/include/utils/tuplesort.h          |  6 +--
 11 files changed, 121 insertions(+), 19 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index d881525..d26c60b 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -109,14 +109,15 @@ btbuild(PG_FUNCTION_ARGS)
 		elog(ERROR, "index \"%s\" already contains data",
 			 RelationGetRelationName(index));
 
-	buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false);
+	buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique,
+									 indexInfo->ii_Predicate != NIL, false);
 
 	/*
 	 * If building a unique index, put dead tuples in a second spool to keep
 	 * them out of the uniqueness check.
 	 */
 	if (indexInfo->ii_Unique)
-		buildstate.spool2 = _bt_spoolinit(heap, index, false, true);
+		buildstate.spool2 = _bt_spoolinit(heap, index, false, true, true);
 
 	/* do the heap scan */
 	reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 593571b..473ac54 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -73,6 +73,7 @@
 #include "storage/smgr.h"
 #include "tcop/tcopprot.h"
 #include "utils/rel.h"
+#include "utils/selfuncs.h"
 #include "utils/sortsupport.h"
 #include "utils/tuplesort.h"
 
@@ -149,10 +150,13 @@ static void _bt_load(BTWriteState *wstate,
  * create and initialize a spool structure
  */
 BTSpool *
-_bt_spoolinit(Relation heap, Relation index, bool isunique, bool isdead)
+_bt_spoolinit(Relation heap, Relation index, bool isunique, bool ispartial,
+			  bool isdead)
 {
 	BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
 	int			btKbytes;
+	double		estRows;
+	float4		relTuples;
 
 	btspool->heap = heap;
 	btspool->index = index;
@@ -165,10 +169,16 @@ _bt_spoolinit(Relation heap, Relation index, bool isunique, bool isdead)
 	 * unique index actually requires two BTSpool objects.  We expect that the
 	 * second one (for dead tuples) won't get very full, so we give it only
 	 * work_mem.
+	 *
+	 * Certain cases will always have a relTuples of 0, such as reindexing as
+	 * part of a CLUSTER operation, or when reindexing toast tables.  This is
+	 * interpreted as "no estimate available".
 	 */
 	btKbytes = isdead ? work_mem : maintenance_work_mem;
+	relTuples = RelationGetForm(heap)->reltuples;
+	estRows =  relTuples * (isdead || ispartial ?  DEFAULT_INEQ_SEL : 1);
 	btspool->sortstate = tuplesort_begin_index_btree(heap, index, isunique,
-													 btKbytes, false);
+													 btKbytes, estRows, false);
 
 	return btspool;
 }
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index bc5f33f..8e5f536 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -890,7 +890,9 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
 	/* Set up sorting if wanted */
 	if (use_sort)
 		tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
-											maintenance_work_mem, false);
+											maintenance_work_mem,
+											RelationGetForm(OldHeap)->reltuples,
+											false);
 	else
 		tuplesort = NULL;
 
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 89de755..95143c3 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -346,6 +346,7 @@ initialize_aggregates(AggState *aggstate,
 	{
 		AggStatePerAgg peraggstate = &peragg[aggno];
 		AggStatePerGroup pergroupstate = &pergroup[aggno];
+		Agg		   		*node = (Agg *) aggstate->ss.ps.plan;
 
 		/*
 		 * Start a fresh sort operation for each DISTINCT/ORDER BY aggregate.
@@ -381,7 +382,9 @@ initialize_aggregates(AggState *aggstate,
 									 peraggstate->sortOperators,
 									 peraggstate->sortCollations,
 									 peraggstate->sortNullsFirst,
-									 work_mem, false);
+									 work_mem,
+									 node->plan.plan_rows,
+									 false);
 		}
 
 		/*
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
index b88571b..31d3ead 100644
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -89,6 +89,7 @@ ExecSort(SortState *node)
 											  plannode->collations,
 											  plannode->nullsFirst,
 											  work_mem,
+											  plannode->plan.plan_rows,
 											  node->randomAccess);
 		if (node->bounded)
 			tuplesort_set_bound(tuplesortstate, node->bound);
diff --git a/src/backend/utils/adt/orderedsetaggs.c b/src/backend/utils/adt/orderedsetaggs.c
index 9d7c71f..4ecf48e 100644
--- a/src/backend/utils/adt/orderedsetaggs.c
+++ b/src/backend/utils/adt/orderedsetaggs.c
@@ -280,7 +280,7 @@ ordered_set_startup(FunctionCallInfo fcinfo, bool use_tuples)
 												   qstate->sortOperators,
 												   qstate->sortCollations,
 												   qstate->sortNullsFirsts,
-												   work_mem, false);
+												   work_mem, -1, false);
 	else
 		osastate->sortstate = tuplesort_begin_datum(qstate->sortColType,
 													qstate->sortOperator,
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 34f607d..741fb59 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -16,6 +16,7 @@
 
 #include <ctype.h>
 #include <limits.h>
+#include <math.h>
 
 #include "access/hash.h"
 #include "access/tuptoaster.h"
@@ -82,12 +83,18 @@ typedef struct
 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
 #define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)
 
+/*
+ * Used for calculating number of sort comparisons
+ */
+#define LOG2(x)  (log(x) / 0.693147180559945)
+
 static void btsortsupport_worker(SortSupport ssup, Oid collid);
 static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup);
 static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup);
 static int bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup);
 static Datum bttext_abbrev_convert(Datum original, SortSupport ssup);
-static bool bttext_abbrev_abort(int memtupcount, SortSupport ssup);
+static bool bttext_abbrev_abort(int memtupcount, double estrows,
+								SortSupport ssup);
 static int32 text_length(Datum str);
 static text *text_catenate(text *t1, text *t2);
 static text *text_substring(Datum str,
@@ -2114,17 +2121,84 @@ retry:
  * should be aborted, based on its projected effectiveness.
  */
 static bool
-bttext_abbrev_abort(int memtupcount, SortSupport ssup)
+bttext_abbrev_abort(int memtupcount, double estrows, SortSupport ssup)
 {
 	TextSortSupport	   *tss = (TextSortSupport *) ssup->ssup_extra;
 	double				abbrev_distinct, key_distinct;
 
 	Assert(ssup->abbreviate);
 
-	/* Have a little patience */
+	/* Have a little patience, even without estrows hint */
 	if (memtupcount < 20)
 		return false;
 
+	if (estrows > 0)
+	{
+		double		normalized_rows_to_process,
+					estimated_cmps;
+
+		normalized_rows_to_process = (estrows - memtupcount) / estrows;
+
+		if (normalized_rows_to_process > 0.95 && memtupcount < 200000)
+		{
+			/*
+			 * Be patient -- don't consider aborting until we've processed an
+			 * estimated 5% of all rows to be sorted, or 200,000 rows,
+			 * whichever is less.
+			 */
+#ifdef DEBUG_ABBREV_KEYS
+			elog(DEBUG_elog_output, "conversion patiently waited after %d tuples of %f",
+				 memtupcount, estrows);
+#endif
+			return false;
+		}
+		else if (normalized_rows_to_process < 0.65)
+		{
+			/*
+			 * Already too invested -- don't abort a marginal case.  Note that
+			 * clients will tend to stop calling here when it is established
+			 * that it is too late to abort anyway.
+			 */
+			return false;
+		}
+
+		/*
+		 * strxfrm() is strongly recommended for large lists of strings.  This
+		 * is because despite the memory overhead often implied by an approach
+		 * using string transformation, the number of comparisons that a
+		 * comparison sort algorithm requires increases at least in proportion
+		 * to O(n log n).  Linearithmic growth will result in a number of
+		 * comparisons that is considerably higher than the number of elements.
+		 * (Top-N heapsorts never use the abbreviation optimization, and so are
+		 * not considered here.)
+		 *
+		 * Unicode Technical Standard #10 states "Because binary comparison is
+		 * much faster than string comparison, it is faster to use sort keys
+		 * whenever there will be more than about 10 comparisons per string, if
+		 * the system can afford the storage".  That would amount to
+		 * approximately 1,000 list elements on average.  While our costs are
+		 * clearly different in several ways, this calculus cannot be ignored
+		 * entirely.  Past a certain point, we are probabilistically better off
+		 * holding out for some improvement even if there is an abbreviated key
+		 * cardinality of 1 thus far.  That point is somewhat arbitrarily
+		 * assumed to be 20 comparisons per string (approximately 1 million
+		 * estimated rows).  We may still lose, but not by terribly much, and
+		 * only in cases close to the most pessimal worst case.  Even in that
+		 * very worst case, as this tuple count threshold is crossed the
+		 * regression for internal sorts is at or under 5%.
+		 */
+		estimated_cmps = estrows * LOG2(estrows);
+
+		if (estimated_cmps > estrows * 20)
+		{
+#ifdef DEBUG_ABBREV_KEYS
+			elog(DEBUG_elog_output, "row estimate too high (%f, estimated cmps: %f) to ever abort",
+				 estrows, estimated_cmps);
+#endif
+			return false;
+		}
+	}
+
 	abbrev_distinct = estimateHyperLogLog(&tss->abbr_card);
 	key_distinct = estimateHyperLogLog(&tss->full_card);
 
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 4ccb766..a35cef0 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -356,6 +356,7 @@ struct Tuplesortstate
 	 * effectiveness is tested.
 	 */
 	int64		abbrevNext;		/* Tuple # at which to next check applicability */
+	double		abbrevEstRow;	/* Estimated # rows to be sorted, <= 0 if unknown */
 
 	/*
 	 * These variables are specific to the CLUSTER case; they are set by
@@ -600,7 +601,8 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 					 int nkeys, AttrNumber *attNums,
 					 Oid *sortOperators, Oid *sortCollations,
 					 bool *nullsFirstFlags,
-					 int workMem, bool randomAccess)
+					 int workMem, double estRows,
+					 bool randomAccess)
 {
 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 	MemoryContext oldcontext;
@@ -632,6 +634,7 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 
 	state->tupDesc = tupDesc;	/* assume we need not copy tupDesc */
 	state->abbrevNext = 10;
+	state->abbrevEstRow = estRows;
 
 	/* Prepare SortSupport data for each column */
 	state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
@@ -670,7 +673,8 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 Tuplesortstate *
 tuplesort_begin_cluster(TupleDesc tupDesc,
 						Relation indexRel,
-						int workMem, bool randomAccess)
+						int workMem,
+						double estRows, bool randomAccess)
 {
 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 	ScanKey			indexScanKey;
@@ -702,6 +706,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
 	state->writetup = writetup_cluster;
 	state->readtup = readtup_cluster;
 	state->abbrevNext = 10;
+	state->abbrevEstRow = estRows;
 
 	state->indexInfo = BuildIndexInfo(indexRel);
 
@@ -763,7 +768,8 @@ Tuplesortstate *
 tuplesort_begin_index_btree(Relation heapRel,
 							Relation indexRel,
 							bool enforceUnique,
-							int workMem, bool randomAccess)
+							int workMem,
+							double estRows, bool randomAccess)
 {
 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 	ScanKey			indexScanKey;
@@ -793,6 +799,7 @@ tuplesort_begin_index_btree(Relation heapRel,
 	state->writetup = writetup_index;
 	state->readtup = readtup_index;
 	state->abbrevNext = 10;
+	state->abbrevEstRow = estRows;
 
 	state->heapRel = heapRel;
 	state->indexRel = indexRel;
@@ -1475,6 +1482,7 @@ consider_abort_common(Tuplesortstate *state)
 		 * indicate that abbreviation should not proceed.
 		 */
 		if (!state->sortKeys->abbrev_abort(state->memtupcount,
+										   state->abbrevEstRow,
 										   state->sortKeys))
 			return false;
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index d3d258b..1143a33 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -711,7 +711,7 @@ extern void BTreeShmemInit(void);
 typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */
 
 extern BTSpool *_bt_spoolinit(Relation heap, Relation index,
-			  bool isunique, bool isdead);
+			  bool isunique, bool ispartial, bool isdead);
 extern void _bt_spooldestroy(BTSpool *btspool);
 extern void _bt_spool(BTSpool *btspool, ItemPointer self,
 		  Datum *values, bool *isnull);
diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h
index 4c99ed6..659233b 100644
--- a/src/include/utils/sortsupport.h
+++ b/src/include/utils/sortsupport.h
@@ -176,9 +176,12 @@ typedef struct SortSupportData
 	 * If there is a lot of duplicate abbreviated keys in practice, it's useful
 	 * to be able to abandon the strategy before paying too high a cost in
 	 * conversion (perhaps certain opclass-specific adaptations are useful
-	 * too).
+	 * too).  estrows is typically an estimate, originating from the planner,
+	 * of the total number of rows that will be sorted.  (By convention,
+	 * estrows <= 0 is interpreted as "no hint available".)
 	 */
-	bool			(*abbrev_abort) (int memtupcount, SortSupport ssup);
+	bool			(*abbrev_abort) (int memtupcount, double estrows,
+									 SortSupport ssup);
 
 	/*
 	 * Full, authoritative comparator for key that an abbreviated
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index 2537883..06e35d5 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -62,14 +62,14 @@ extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
 					 int nkeys, AttrNumber *attNums,
 					 Oid *sortOperators, Oid *sortCollations,
 					 bool *nullsFirstFlags,
-					 int workMem, bool randomAccess);
+					 int workMem, double estRows, bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
 						Relation indexRel,
-						int workMem, bool randomAccess);
+						int workMem, double estRows, bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
 							Relation indexRel,
 							bool enforceUnique,
-							int workMem, bool randomAccess);
+							int workMem, double estRows, bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
 						   Relation indexRel,
 						   uint32 hash_mask,
-- 
1.9.1

