From b4c2a801aa802e71b8a822f5e1cd163463e97e26 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Sat, 31 Mar 2018 19:49:38 +0200
Subject: [PATCH] comments

---
 src/backend/commands/explain.c             |   4 +
 src/backend/executor/nodeIncrementalSort.c | 118 ++++++++++++++++++++---------
 src/backend/optimizer/path/costsize.c      |  54 ++++++++-----
 src/backend/optimizer/path/pathkeys.c      |   7 +-
 src/backend/optimizer/plan/createplan.c    |  14 ++++
 src/backend/optimizer/plan/planagg.c       |   1 -
 src/backend/optimizer/plan/planner.c       |   6 +-
 src/backend/optimizer/util/pathnode.c      |   5 ++
 8 files changed, 155 insertions(+), 54 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index edd71ae..8533f36 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2016,6 +2016,10 @@ show_sort_keys(SortState *sortstate, List *ancestors, ExplainState *es)
 	Sort	   *plan = (Sort *) sortstate->ss.ps.plan;
 	int			presortedCols;
 
+	/*
+	 * XXX This seems unnecessary. In which case do we call show_sort_keys
+	 * for incremental sort? Even the comment says it's for Sort only.
+	 */
 	if (IsA(plan, IncrementalSort))
 		presortedCols = ((IncrementalSort *) plan)->presortedCols;
 	else
diff --git a/src/backend/executor/nodeIncrementalSort.c b/src/backend/executor/nodeIncrementalSort.c
index 1f5e41f..b95379b 100644
--- a/src/backend/executor/nodeIncrementalSort.c
+++ b/src/backend/executor/nodeIncrementalSort.c
@@ -5,45 +5,49 @@
  *
  * DESCRIPTION
  *
- *		Incremental sort is a specially optimized kind of multikey sort used
- *		when the input is already presorted by a prefix of the required keys
- *		list.  Thus, when it's required to sort by (key1, key2 ... keyN) and
- *		result is already sorted by (key1, key2 ... keyM), M < N, we sort groups
- *		where values of (key1, key2 ... keyM) are equal.
+ *		Incremental sort is an optimized variant of multikey sort for cases
+ *      when the input is already sorted by a prefix of the sort keys.  For
+ *		example when a sort by (key1, key2 ... keyN) is requested, and the
+ *		input is already sorted by (key1, key2 ... keyM), M < N, we can
+ *      divide the input into groups where keys (key1, ... keyM) are equal,
+ *      and only sort on the remaining columns.
  *
- *		Consider the following example.  We have input tuples consisting from
- *		two integers (x, y) already presorted by x, while it's required to
- *		sort them by x and y.  Let input tuples be following.
+ *		Consider the following example.  We have input tuples consisting of
+ *		two integers (X, Y) already presorted by X, while it's required to
+ *		sort them by both X and Y.  Let input tuples be following.
  *
  *		(1, 5)
  *		(1, 2)
- *		(2, 10)
+ *		(2, 9)
  *		(2, 1)
  *		(2, 5)
  *		(3, 3)
  *		(3, 7)
  *
- *		Incremental sort algorithm would sort by y following groups, which have
- *		equal x, individually:
+ *		Incremental sort algorithm would split the input into the following
+ *      groups, which have equal X, and then sort them by Y individually:
+ *
  *			(1, 5) (1, 2)
- *			(2, 10) (2, 1) (2, 5)
+ *			(2, 9) (2, 1) (2, 5)
  *			(3, 3) (3, 7)
  *
  *		After sorting these groups and putting them altogether, we would get
- *		following tuple set which is actually sorted by x and y.
+ *		the following result which is sorted by X and Y, as requested:
  *
  *		(1, 2)
  *		(1, 5)
  *		(2, 1)
  *		(2, 5)
- *		(2, 10)
+ *		(2, 9)
  *		(3, 3)
  *		(3, 7)
  *
- *		Incremental sort is faster than full sort on large datasets.  But
- *		the case of most huge benefit of incremental sort is queries with
- *		LIMIT because incremental sort can return first tuples without reading
- *		whole input dataset.
+ *		Incremental sort may be more efficient than plain sort, parcitularly
+ *      on large datasets, as it reduces the amount of data to sort at once,
+ *      making it more likely it fits into work_mem (eliminating the need to
+ *      spill to disk).  But the main advantage of incremental sort is that
+ *      it can start producing rows early, before sorting the whole dataset,
+ *      which is a significant benefit especially for queries with LIMIT.
  *
  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -112,18 +116,27 @@ preparePresortedCols(IncrementalSortState *node)
 
 /*
  * Check if first "presortedCols" sort values are equal.
+ *
+ * XXX I find the name somewhat confusing, because "cmp" is usually used
+ * for comparators, i.e. functions that return -1/0/1. I suggest something
+ * like "samePrefixGroup" or something like that.
  */
 static bool
 cmpSortPresortedCols(IncrementalSortState *node, TupleTableSlot *a,
 															TupleTableSlot *b)
 {
-	int n, i;
+	int presortedCols, i;
 
 	Assert(IsA(node->ss.ps.plan, IncrementalSort));
 
-	n = ((IncrementalSort *) node->ss.ps.plan)->presortedCols;
+	presortedCols = ((IncrementalSort *) node->ss.ps.plan)->presortedCols;
 
-	for (i = n - 1; i >= 0; i--)
+	/*
+	 * We do assume the input is sorted by keys (0, ... n), which means
+	 * the tail keys are more likely to change. So we do the comparison
+	 * from the end, to minimize the number of function calls.
+	 */
+	for (i = presortedCols - 1; i >= 0; i--)
 	{
 		Datum				datumA,
 							datumB,
@@ -171,6 +184,12 @@ cmpSortPresortedCols(IncrementalSortState *node, TupleTableSlot *a,
  * to cope this problem we don't copy pivot tuple before the group contains
  * at least MIN_GROUP_SIZE of tuples.  Surely, it might reduce efficiency of
  * incremental sort, but it reduces the probability of regression.
+ *
+ * XXX I suppose this is not just about copying the tuples into the slot, but
+ * about frequently sorting tiny amounts of data (tuplesort overhead)?
+ *
+ * XXX Fixed-size limit seems like a bad idea, for example when the subplan
+ * is expected to produce only very few tuples (less than 32) at high cost.
  */
 #define MIN_GROUP_SIZE 32
 
@@ -205,6 +224,9 @@ ExecIncrementalSort(PlanState *pstate)
 	TupleDesc			tupDesc;
 	int64				nTuples = 0;
 
+	/* XXX ExecSort does this, I don't see why ExecIncrementalSort shouldn't */
+	CHECK_FOR_INTERRUPTS();
+
 	/*
 	 * get state info from node
 	 */
@@ -216,7 +238,9 @@ ExecIncrementalSort(PlanState *pstate)
 	tuplesortstate = (Tuplesortstate *) node->tuplesortstate;
 
 	/*
-	 * Return next tuple from sorted set if any.
+	 * Return next tuple from the current sorted group set if available.
+	 * If there are no more tuples in the current group, we need to try
+	 * to fetch more tuples from the input and build another group.
 	 */
 	if (node->sort_Done)
 	{
@@ -228,8 +252,10 @@ ExecIncrementalSort(PlanState *pstate)
 	}
 
 	/*
-	 * If first time through, read all tuples from outer plan and pass them to
-	 * tuplesort.c. Subsequent calls just fetch tuples from tuplesort.
+	 * First time through or no tuples in the current group. Read next
+	 * batch of tuples from the outer plan and pass them to tuplesort.c.
+	 * Subsequent calls just fetch tuples from tuplesort, until the group
+	 * is exhausted, at which point we build the next group.
 	 */
 
 	SO1_printf("ExecIncrementalSort: %s\n",
@@ -241,15 +267,12 @@ ExecIncrementalSort(PlanState *pstate)
 	 */
 	estate->es_direction = ForwardScanDirection;
 
-	/*
-	 * Initialize tuplesort module.
-	 */
-	SO1_printf("ExecIncrementalSort: %s\n",
-			   "calling tuplesort_begin");
-
 	outerNode = outerPlanState(node);
 	tupDesc = ExecGetResultType(outerNode);
 
+	/*
+	 * Initialize tuplesort module (needed only before the first group).
+	 */
 	if (node->tuplesortstate == NULL)
 	{
 		/*
@@ -259,12 +282,17 @@ ExecIncrementalSort(PlanState *pstate)
 		 */
 		preparePresortedCols(node);
 
+		SO1_printf("ExecIncrementalSort: %s\n",
+				   "calling tuplesort_begin_heap");
+
 		/*
 		 * Pass all the columns to tuplesort.  We pass to tuple sort groups
 		 * of at least MIN_GROUP_SIZE size.  Thus, these groups doesn't
 		 * necessary have equal value of the first column.  We unlikely will
 		 * have huge groups with incremental sort.  Therefore usage of
 		 * abbreviated keys would be likely a waste of time.
+		 *
+		 * XXX The claim about abbreviated keys seems rather dubious, IMHO.
 		 */
 		tuplesortstate = tuplesort_begin_heap(
 									tupDesc,
@@ -290,7 +318,7 @@ ExecIncrementalSort(PlanState *pstate)
 	if (node->bounded)
 		tuplesort_set_bound(tuplesortstate, node->bound - node->bound_Done);
 
-	/* Put saved tuple to tuplesort if any */
+	/* If we got a left-over tuple from the last group, pass it to tuplesort. */
 	if (!TupIsNull(node->grpPivotSlot))
 	{
 		tuplesort_puttupleslot(tuplesortstate, node->grpPivotSlot);
@@ -312,19 +340,41 @@ ExecIncrementalSort(PlanState *pstate)
 			break;
 		}
 
-		/* Put next group of presorted data to the tuplesort */
+		/*
+		 * Accumulate the next group of presorted tuples for tuplesort.
+		 * We always accumulate at least MIN_GROUP_SIZE tuples, and only
+		 * then we start to compare the prefix keys.
+		 *
+		 * The last tuple is kept as a pivot, so that we can determine if
+		 * the subsequent tuples have the same prefix key (same group).
+		 */
 		if (nTuples < MIN_GROUP_SIZE)
 		{
 			tuplesort_puttupleslot(tuplesortstate, slot);
 
-			/* Save last tuple in minimal group */
+			/* Keep the last tuple in minimal group as a pivot. */
 			if (nTuples == MIN_GROUP_SIZE - 1)
 				ExecCopySlot(node->grpPivotSlot, slot);
 			nTuples++;
 		}
 		else
 		{
-			/* Iterate while presorted cols are the same as in saved tuple */
+			/*
+			 * Iterate while presorted cols are the same as in saved tuple
+			 *
+			 * After accumulating at least MIN_GROUP_SIZE tuples (we don't
+			 * know how many groups are there in that set), we need to keep
+			 * accumulating until we reach the end of the group. Only then
+			 * we can do the sort and output all the tuples.
+			 *
+			 * We compare the prefix keys to the pivot - if the prefix keys
+			 * are the same the tuple belongs to the same group, so we pass
+			 * it to the tuplesort.
+			 *
+			 * If the prefix differs, we've reached the end of the group. We
+			 * need to keep the last tuple, so we copy it into the pivot slot
+			 * (it does not serve as pivot, though).
+			 */
 			if (cmpSortPresortedCols(node, node->grpPivotSlot, slot))
 			{
 				tuplesort_puttupleslot(tuplesortstate, slot);
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index e8cfdd8..52956a8 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1616,13 +1616,6 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
  *	  Determines and returns the cost of sorting a relation, including
  *	  the cost of reading the input data.
  *
- * Sort could be either full sort of relation or incremental sort when we already
- * have data presorted by some of required pathkeys.  In the second case
- * we estimate number of groups which source data is divided to by presorted
- * pathkeys.  And then estimate cost of sorting each individual group assuming
- * data is divided into group uniformly.  Also, if LIMIT is specified then
- * we have to pull from source and sort only some of total groups.
- *
  * If the total volume of data to sort is less than sort_mem, we will do
  * an in-memory sort, which requires no I/O and about t*log2(t) tuple
  * comparisons for t tuples.
@@ -1648,6 +1641,16 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
  * specifying nonzero comparison_cost; typically that's used for any extra
  * work that has to be done to prepare the inputs to the comparison operators.
  *
+ * The sort may also be incremental, when the input data is already sorted
+ * by a prefix of the requested pathkeys.  In that case we estimate the
+ * number of groups the input data is divided into (by the prefix keys), and
+ * then apply the same costing criteria as for regular sort.  For example the
+ * sort_mem limit is applied on per-group size (assuming average group size),
+ * not the total volume of data.
+ *
+ * If LIMIT is specified, incremental sort only needs to pull and sort
+ * a subset of the input data, unlike the regular sort.
+ *
  * 'pathkeys' is a list of sort keys
  * 'presorted_keys' is a number of pathkeys already presorted in given path
  * 'input_startup_cost' is the startup cost for reading the input data
@@ -1687,6 +1690,7 @@ cost_sort(Path *path, PlannerInfo *root,
 
 	if (!enable_sort)
 		startup_cost += disable_cost;
+
 	if (!enable_incrementalsort)
 		presorted_keys = 0;
 
@@ -1749,6 +1753,19 @@ cost_sort(Path *path, PlannerInfo *root,
 		 */
 		group_input_bytes = 1.5 * input_bytes / num_groups;
 		group_tuples = 1.5 * tuples / num_groups;
+
+		/*
+		 * We want to be sure the cost of a sort is never estimated as zero, even
+		 * if passed-in tuple count is zero.  Besides, mustn't do log(0)...
+		 *
+		 * XXX Same protection as for tuples, at the beginning. And we don't need
+		 * to worry about LOG2() below.
+		 *
+		 * XXX Probably should re-evaluate group_input_bytes, but the difference
+		 * is going to be tiny.
+		 */
+		if (group_tuples < 2.0)
+			group_tuples = 2.0;
 	}
 	else
 	{
@@ -1757,6 +1774,10 @@ cost_sort(Path *path, PlannerInfo *root,
 		group_tuples = tuples;
 	}
 
+	/*
+	 * XXX Can it actually happen that the first condition is true,
+	 * while the second one is false? I don't think so.
+	 */
 	if (output_bytes > sort_mem_bytes && group_input_bytes > sort_mem_bytes)
 	{
 		/*
@@ -1799,15 +1820,8 @@ cost_sort(Path *path, PlannerInfo *root,
 	}
 	else
 	{
-		/*
-		 * We'll use plain quicksort on all the input tuples.  If it appears
-		 * that we expect less than two tuples per sort group then assume
-		 * logarithmic part of estimate to be 1.
-		 */
-		if (group_tuples >= 2.0)
-			group_cost = comparison_cost * group_tuples * LOG2(group_tuples);
-		else
-			group_cost = comparison_cost * group_tuples;
+		/* We'll use plain quicksort on all the input tuples. */
+		group_cost = comparison_cost * group_tuples * LOG2(group_tuples);
 	}
 
 	/* Add per group cost of fetching tuples from input */
@@ -1832,7 +1846,13 @@ cost_sort(Path *path, PlannerInfo *root,
 	 */
 	run_cost += cpu_operator_cost * tuples;
 
-	/* Extra costs of incremental sort */
+	/*
+	 * Extra costs of incremental sort
+	 *
+	 * XXX This pretty much implies there are cases where incremental sort
+	 * is costed as more expensive than plain sort. The difference may be
+	 * fairly small, but if the groups are tiny it may be noticeable.
+	 */
 	if (presorted_keys > 0)
 	{
 		/*
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 57fe52d..506d898 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -27,7 +27,6 @@
 #include "optimizer/paths.h"
 #include "optimizer/tlist.h"
 #include "utils/lsyscache.h"
-#include "utils/selfuncs.h"
 
 
 static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys);
@@ -1648,6 +1647,12 @@ pathkeys_useful_for_ordering(List *query_pathkeys, List *pathkeys)
 	}
 	else
 	{
+		/*
+		 * XXX This seems really strange. Why should we even consider the GUC
+		 * here? It's supposed to be a generic utility function. That belongs
+		 * to costsize.c only, I think. No other function here does something
+		 * like that, which is illustrated by having to include cost.h.
+		 */
 		if (enable_incrementalsort)
 		{
 			/*
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 9ca06c7..74230ec 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -1180,6 +1180,13 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path)
 					  numsortkeys * sizeof(bool)) == 0);
 
 		/* Now, insert a Sort node if subplan isn't sufficiently ordered */
+
+		/*
+		 * XXX This seems wrong, as it builds an incremental sort whenever
+		 * there's at least one prefix column, even when a regular sort
+		 * would be cheaper.
+		 */
+
 		if (!pathkeys_common_contained_in(pathkeys, subpath->pathkeys,
 										  &n_common_pathkeys))
 		{
@@ -1564,6 +1571,13 @@ create_gather_merge_plan(PlannerInfo *root, GatherMergePath *best_path)
 
 
 	/* Now, insert a Sort node if subplan isn't sufficiently ordered */
+
+	/*
+	 * XXX This seems wrong, as it builds an incremental sort whenever
+	 * there's at least one prefix column, even when a regular sort
+	 * would be cheaper.
+	 */
+
 	if (!pathkeys_common_contained_in(pathkeys, best_path->subpath->pathkeys,
 									  &n_common_pathkeys))
 	{
diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c
index 308f60b..95cbffb 100644
--- a/src/backend/optimizer/plan/planagg.c
+++ b/src/backend/optimizer/plan/planagg.c
@@ -44,7 +44,6 @@
 #include "parser/parse_clause.h"
 #include "rewrite/rewriteManip.h"
 #include "utils/lsyscache.h"
-#include "utils/selfuncs.h"
 #include "utils/syscache.h"
 
 
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 39dabce..fa312e9 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -6209,7 +6209,11 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 									root->group_pathkeys, path->pathkeys);
 			if (path == cheapest_path || n_useful_pathkeys > 0)
 			{
-				/* Sort the cheapest-total path if it isn't already sorted */
+				/* Sort the cheapest-total path if it isn't already sorted
+				 *
+				 * XXX This comment is obviously stale, as it's not about
+				 * cheapest-total path, but about paths sorted by prefix.
+				 */
 				if (n_useful_pathkeys < list_length(root->group_pathkeys))
 					path = (Path *) create_sort_path(root,
 													 grouped_rel,
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 2202e97..62f39c0 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1368,6 +1368,11 @@ create_merge_append_path(PlannerInfo *root,
 		pathnode->path.parallel_safe = pathnode->path.parallel_safe &&
 			subpath->parallel_safe;
 
+
+		/*
+		 * XXX Same issue as in createplan.c - we always create incremental
+		 * sort, even if plain sort would be cheaper.
+		 */
 		if (pathkeys_common_contained_in(pathkeys, subpath->pathkeys,
 										 &n_common_pathkeys))
 		{
-- 
2.9.5