diff git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index a2a7e0c520..521a7d3c9a 100644
 a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ 1612,6 +1612,250 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
rterm>pathtarget>width);
}
+/*
+ * is_fake_var
+ * Workaround for generate_append_tlist() which generates fake Vars with
+ * varno == 0, that will cause a fail of estimate_num_group() call
+ */
+static bool
+is_fake_var(Expr *expr)
+{
+ if (IsA(expr, RelabelType))
+ expr = (Expr *) ((RelabelType *) expr)>arg;
+
+ return (IsA(expr, Var) && ((Var *) expr)>varno == 0);
+}
+
+/*
+ * get_width_cost_multiplier
+ * Returns relative complexity of comparing two valyes based on it's width.
+ * The idea behind  long values have more expensive comparison. Return value is
+ * in cpu_operator_cost unit.
+ */
+static double
+get_width_cost_multiplier(PlannerInfo *root, Expr *expr)
+{
+ double width = 1.0; /* fake value */
+
+ if (IsA(expr, RelabelType))
+ expr = (Expr *) ((RelabelType *) expr)>arg;
+
+ /* Try to find actual stat in corresonding relation */
+ if (IsA(expr, Var))
+ {
+ Var *var = (Var *) expr;
+
+ if (var>varno > 0 && var>varno < root>simple_rel_array_size)
+ {
+ RelOptInfo *rel = root>simple_rel_array[var>varno];
+
+ if (rel != NULL &&
+ var>varattno >= rel>min_attr &&
+ var>varattno <= rel>max_attr)
+ {
+ int ndx = var>varattno  rel>min_attr;
+
+ if (rel>attr_widths[ndx] > 0)
+ width = rel>attr_widths[ndx];
+ }
+ }
+ }
+
+ /* Didn't find any actual stats, use estimation by type */
+ if (width < 0.0)
+ {
+ Node *node = (Node*) expr;
+
+ width = get_typavgwidth(exprType(node), exprTypmod(node));
+ }
+
+ /*
+ * Any value in pgsql is passed by Datum type, so any operation with value
+ * could not be cheaper than operation with Datum type
+ */
+ if (width <= sizeof(Datum))
+ return 1.0;
+
+ /*
+ * Seems, cost of comparision is not directly proportional to args width,
+ * because comparing args could be differ width (we known only average over
+ * column) and difference often could be defined only by looking on first
+ * bytes. So, use log16(width) as estimation.
+ */
+ return 1.0 + 0.125 * LOG2(width / sizeof(Datum));
+}
+
+/*
+ * compute_cpu_sort_cost
+ * compute CPU cost of sort (i.e. inmemory)
+ *
+ * NOTE: some callers currently pass NIL for pathkeys because they
+ * can't conveniently supply the sort keys. In this case, it will fallback to
+ * simple comparison cost estimate.
+ *
+ * Estimation algorithm is based on ideas from course Algorithms,
+ * Robert Sedgewick, Kevin Wayne, https://algs4.cs.princeton.edu/home/ and paper
+ * "Quicksort Is Optimal For Many Equal Keys", Sebastian Wild,
+ * arXiv:1608.04906v4 [cs.DS] 1 Nov 2017.
+ *
+ * In term of that papers, let N  number of tuples, Xi  number of tuples with
+ * key Ki, then estimation is:
+ * log(N! / (X1! * X2! * ..)) ~ sum(Xi * log(N/Xi))
+ * In our case all Xi are the same because noew we don't have an estimation of
+ * group sizes, we have only estimation of number of groups. In this case,
+ * formula becomes: N * log(NumberOfGroups). Next, to support correct estimation
+ * of multicolumn sort we need separately compute each column, so, let k is a
+ * column number, Gk  number of groups defined by k columns:
+ * N * sum( Fk * log(Gk) )
+ * Fk is sum of functions costs for k columns.
+ */
+
+static Cost
+compute_cpu_sort_cost(PlannerInfo *root, List *pathkeys,
+ Cost comparison_cost,
+ double tuples, double output_tuples, bool heapSort)
+{
+ Cost per_tuple_cost = 0.0;
+ ListCell *lc;
+ List *pathkeyExprs = NIL;
+ double tuplesPerPrevGroup = tuples;
+ bool has_fake_var = false;
+ double totalFuncCost = 0.0;
+
+ /* fallback if pathkeys is unknown */
+ if (list_length(pathkeys) == 0)
+ {
+ /*
+ * If we'll use a bounded heapsort keeping just K tuples in memory, for
+ * a total number of tuple comparisons of N log2 K; but the constant
+ * factor is a bit higher than for quicksort. Tweak it so that the
+ * cost curve is continuous at the crossover point.
+ */
+ output_tuples = (heapSort) ? 2.0 * output_tuples : tuples;
+ per_tuple_cost += 2.0 * cpu_operator_cost * LOG2(output_tuples);
+
+ return per_tuple_cost * tuples;
+ }
+
+ totalFuncCost = 1.0;
+
+ /*
+ * Computing total cost of sorting takes into account:
+ *  per column comparison function cost
+ *  we try to compute needed number of comparison per column
+ */
+
+ foreach(lc, pathkeys)
+ {
+ PathKey *pathkey = (PathKey*)lfirst(lc);
+ Cost funcCost = 1.0; /* fallback value */
+ EquivalenceMember *em;
+ double nGroups,
+ correctedNGroups;
+
+ /*
+ * We believe than equivalence members aren't very different, so, to
+ * estimate cost we take just first member
+ */
+ em = (EquivalenceMember *) linitial(pathkey>pk_eclass>ec_members);
+
+ if (em>em_datatype != InvalidOid)
+ {
+ Oid sortop;
+
+ sortop = get_opfamily_member(pathkey>pk_opfamily,
+ em>em_datatype, em>em_datatype,
+ pathkey>pk_strategy);
+
+ funcCost = get_func_cost(get_opcode(sortop));
+ }
+
+ /* Try to take into account actual width fee */
+ funcCost *= get_width_cost_multiplier(root, em>em_expr);
+
+ totalFuncCost += funcCost;
+
+ /* remeber if we have a fake var in pathkeys */
+ has_fake_var = is_fake_var(em>em_expr);
+ pathkeyExprs = lappend(pathkeyExprs, em>em_expr);
+
+ /*
+ * Prevent call estimate_num_groups() with fake Var. Note,
+ * pathkeyExprs contains only previous columns
+ */
+ if (has_fake_var == false)
+ /*
+ * Recursively defin number of group in group from previous step
+ */
+ nGroups = estimate_num_groups(root, pathkeyExprs,
+ tuplesPerPrevGroup, NULL);
+ else if (tuples > 4.0)
+ /*
+ * Use geometric mean as estimation if there is no any stats.
+ * Don't use DEFAULT_NUM_DISTINCT because it used for only one
+ * column while here we try to estimate number of groups over
+ * set of columns.
+ */
+ nGroups = ceil(2.0 + sqrt(tuples) *
+ list_length(pathkeyExprs) / list_length(pathkeys));
+ else
+ nGroups = tuples;
+
+ if (heapSort)
+ {
+ if (tuplesPerPrevGroup < output_tuples)
+ /* comparing only inside output_tuples */
+ correctedNGroups =
+ ceil(2.0 * output_tuples / (tuplesPerPrevGroup / nGroups));
+ else
+ /* two groups  in output and out */
+ correctedNGroups = 2.0;
+ }
+ else
+ correctedNGroups = nGroups;
+
+ per_tuple_cost += totalFuncCost * LOG2(correctedNGroups);
+
+ /*
+ * Realworld distribution isn't uniform but now we don't have a way to
+ * determine that, so, add multiplier to get closer to worst case.
+ */
+ tuplesPerPrevGroup = ceil(1.5 * tuples / nGroups);
+ if (tuplesPerPrevGroup > tuples)
+ tuplesPerPrevGroup = tuples;
+
+ /*
+ * We could skip all followed columns for cost estimation, because we
+ * believe that tuples are unique by set ot previous columns
+ */
+ if (tuples <= nGroups)
+ break;
+ }
+
+ list_free(pathkeyExprs);
+
+ /* per_tuple_cost is in cpu_operator_cost units */
+ per_tuple_cost *= cpu_operator_cost;
+
+ /*
+ * Accordingly to "Introduction to algorithms", Thomas H. Cormen, Charles E.
+ * Leiserson, Ronald L. Rivest, ISBN 0070131430, quicksort estimation
+ * formula has additional term proportional to number of tuples (See Chapter
+ * 8.2 and Theorem 4.1). It has meaning with low number of tuples,
+ * approximately less that 1e4. Of course, it could be inmplemented as
+ * additional multiplier under logarithm, but use more complicated formula
+ * which takes into account number of unique tuples and it isn't clear how
+ * to combine multiplier with groups. Estimate it as 10 in cpu_operator_cost
+ * unit.
+ */
+ per_tuple_cost += 10 * cpu_operator_cost;
+
+ /* add cost provided by caller */
+ per_tuple_cost += comparison_cost;
+
+ return tuples * per_tuple_cost;
+}
+
/*
* cost_sort
* Determines and returns the cost of sorting a relation, including
@@ 1628,7 +1872,7 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
* number of initial runs formed and M is the merge order used by tuplesort.c.
* Since the average initial run should be about sort_mem, we have
* disk traffic = 2 * relsize * ceil(logM(p / sort_mem))
 * cpu = comparison_cost * t * log2(t)
+ * and cpu cost computed by compute_cpu_sort_cost().
*
* If the sort is bounded (i.e., only the first k result tuples are needed)
* and k tuples can fit into sort_mem, we use a heap method that keeps only
@@ 1649,13 +1893,6 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
* 'comparison_cost' is the extra cost per comparison, if any
* 'sort_mem' is the number of kilobytes of work memory allowed for the sort
* 'limit_tuples' is the bound on the number of output tuples; 1 if no bound
 *
 * NOTE: some callers currently pass NIL for pathkeys because they
 * can't conveniently supply the sort keys. Since this routine doesn't
 * currently do anything with pathkeys anyway, that doesn't matter...
 * but if it ever does, it should react gracefully to lack of key data.
 * (Actually, the thing we'd most likely be interested in is just the number
 * of sort keys, which all callers *could* supply.)
*/
void
cost_sort(Path *path, PlannerInfo *root,
@@ 1682,9 +1919,6 @@ cost_sort(Path *path, PlannerInfo *root,
if (tuples < 2.0)
tuples = 2.0;
 /* Include the default costpercomparison */
 comparison_cost += 2.0 * cpu_operator_cost;

/* Do we have a useful LIMIT? */
if (limit_tuples > 0 && limit_tuples < tuples)
{
@@ 1713,7 +1947,9 @@ cost_sort(Path *path, PlannerInfo *root,
*
* Assume about N log2 N comparisons
*/
 startup_cost += comparison_cost * tuples * LOG2(tuples);
+ startup_cost += compute_cpu_sort_cost(root, pathkeys,
+ comparison_cost, tuples,
+ tuples, false);
/* Disk costs */
@@ 1729,18 +1965,17 @@ cost_sort(Path *path, PlannerInfo *root,
}
else if (tuples > 2 * output_tuples  input_bytes > sort_mem_bytes)
{
 /*
 * We'll use a bounded heapsort keeping just K tuples in memory, for
 * a total number of tuple comparisons of N log2 K; but the constant
 * factor is a bit higher than for quicksort. Tweak it so that the
 * cost curve is continuous at the crossover point.
 */
 startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);
+ /* We'll use a bounded heapsort keeping just K tuples in memory. */
+ startup_cost += compute_cpu_sort_cost(root, pathkeys,
+ comparison_cost, tuples,
+ output_tuples, true);
}
else
{
/* We'll use plain quicksort on all the input tuples */
 startup_cost += comparison_cost * tuples * LOG2(tuples);
+ startup_cost += compute_cpu_sort_cost(root, pathkeys,
+ comparison_cost, tuples,
+ tuples, false);
}
/*