diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index a2a7e0c520..3588cff802 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1612,6 +1612,252 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
 									rterm->pathtarget->width);
 }
 
+/*
+ * is_fake_var
+ *		Workaround for generate_append_tlist() which generates fake Vars with
+ *		varno == 0, that will cause a fail of estimate_num_group() call
+ */
+static bool
+is_fake_var(Expr *expr)
+{
+	if (IsA(expr, RelabelType))
+		expr = (Expr *) ((RelabelType *) expr)->arg;
+
+	return (IsA(expr, Var) && ((Var *) expr)->varno == 0);
+}
+
+/*
+ * get_width_cost_multiplier
+ *		Returns relative complexity of comparing two valyes based on it's width.
+ * The idea behind - long values have more expensive comparison. Return value is
+ * in cpu_operator_cost unit.
+ */
+static double
+get_width_cost_multiplier(PlannerInfo *root, Expr *expr)
+{
+	double	width = -1.0; /* fake value */
+
+	if (IsA(expr, RelabelType))
+		expr = (Expr *) ((RelabelType *) expr)->arg;
+
+	/* Try to find actual stat in corresonding relation */
+	if (IsA(expr, Var))
+	{
+		Var		*var = (Var *) expr;
+
+		if (var->varno > 0 && var->varno < root->simple_rel_array_size)
+		{
+			RelOptInfo	*rel = root->simple_rel_array[var->varno];
+
+			if (rel != NULL &&
+				var->varattno >= rel->min_attr &&
+				var->varattno <= rel->max_attr)
+			{
+				int	ndx = var->varattno - rel->min_attr;
+
+				if (rel->attr_widths[ndx] > 0)
+					width = rel->attr_widths[ndx];
+			}
+		}
+	}
+
+	/* Didn't find any actual stats, use estimation by type */
+	if (width < 0.0)
+	{
+		Node	*node = (Node*) expr;
+
+		width = get_typavgwidth(exprType(node), exprTypmod(node));
+	}
+
+	/*
+	 * Any value in pgsql is passed by Datum type, so any operation with value
+	 * could not be cheaper than operation with Datum type
+	 */
+	if (width <= sizeof(Datum))
+		return 1.0;
+
+	/*
+	 * Seems, cost of comparision is not directly proportional to args width,
+	 * because comparing args could be differ width (we known only average over
+	 * column) and difference often could be defined only by looking on first
+	 * bytes. So, use log16(width) as estimation.
+	 */
+	return 1.0 + 0.125 * LOG2(width / sizeof(Datum));
+}
+
+/*
+ * compute_cpu_sort_cost
+ *		compute CPU cost of sort (i.e. in-memory)
+ *
+ * NOTE: some callers currently pass NIL for pathkeys because they
+ * can't conveniently supply the sort keys.  In this case, it will fallback to
+ * simple comparison cost estimate.
+ *
+ * Estimation algorithm is based on ideas from course Algorithms,
+ * Robert Sedgewick, Kevin Wayne, https://algs4.cs.princeton.edu/home/ and paper
+ * "Quicksort Is Optimal For Many Equal Keys", Sebastian Wild,
+ * arXiv:1608.04906v4 [cs.DS] 1 Nov 2017.
+ *
+ * In term of that papers, let N - number of tuples, Xi - number of tuples with
+ * key Ki, then estimation is:
+ * log(N! / (X1! * X2! * ..))  ~  sum(Xi * log(N/Xi))
+ * In our case all Xi are the same because noew we don't have an estimation of
+ * group sizes, we have only estimation of number of groups. In this case,
+ * formula becomes: N * log(NumberOfGroups). Next, to support correct estimation
+ * of multicolumn sort we need separately compute each column, so, let k is a
+ * column number, Gk - number of groups  defined by k columns:
+ * N * sum( Fk * log(Gk) )
+ * Fk is sum of functions costs for k columns.
+ */
+
+static Cost
+compute_cpu_sort_cost(PlannerInfo *root, List *pathkeys,
+						   Cost comparison_cost,
+						   double tuples, double output_tuples, bool heapSort)
+{
+	Cost		per_tuple_cost = comparison_cost;
+	ListCell	*lc;
+	List		*pathkeyExprs = NIL;
+	double		tuplesPerPrevGroup = tuples;
+	bool		has_fake_var = false;
+	double		totalFuncCost = 0.0;
+
+	/* fallback if pathkeys is unknown */
+	if (list_length(pathkeys) == 0)
+	{
+		/*
+		 * If we'll use a bounded heap-sort keeping just K tuples in memory, for
+		 * a total number of tuple comparisons of N log2 K; but the constant
+		 * factor is a bit higher than for quicksort.  Tweak it so that the
+		 * cost curve is continuous at the crossover point.
+		 */
+		output_tuples = (heapSort) ? 2.0 * output_tuples : tuples;
+		per_tuple_cost += 2.0 * cpu_operator_cost * LOG2(output_tuples);
+
+		return per_tuple_cost * tuples;
+	}
+
+	totalFuncCost = 1.0;
+
+	/*
+	 * Computing total cost of sorting takes into account:
+	 * - per column comparison function cost
+	 * - we try to compute needed number of comparison per column
+	 */
+
+	foreach(lc, pathkeys)
+	{
+		PathKey				*pathkey = (PathKey*)lfirst(lc);
+		Cost				 funcCost = 1.0; /* fallback value */
+		EquivalenceMember	*em;
+		double				 nGroups,
+							 correctedNGroups;
+
+		/*
+		 * We believe than equivalence members aren't very  different, so, to
+		 * estimate cost we take just first member
+		 */
+		em = (EquivalenceMember *) linitial(pathkey->pk_eclass->ec_members);
+
+		if (em->em_datatype != InvalidOid)
+		{
+			Oid		sortop;
+
+			sortop = get_opfamily_member(pathkey->pk_opfamily,
+										 em->em_datatype, em->em_datatype,
+										 pathkey->pk_strategy);
+
+			if (sortop != InvalidOid)
+			{
+				Oid	funcOid = get_opcode(sortop);
+
+				funcCost = get_func_cost(funcOid);
+			}
+		}
+
+		/* Try to take into account actual width fee */
+		funcCost *= get_width_cost_multiplier(root, em->em_expr);
+
+		totalFuncCost += funcCost;
+
+		/* remeber if we have a fake var in pathkeys */
+		has_fake_var |= is_fake_var(em->em_expr);
+		pathkeyExprs = lappend(pathkeyExprs, em->em_expr);
+
+		/*
+		 * Prevent call estimate_num_groups() with fake Var. Note,
+		 * pathkeyExprs contains only previous columns
+		 */
+		if (has_fake_var == false)
+			/*
+			 * Recursively defin number of group in group from previous step
+			 */
+			nGroups = estimate_num_groups(root, pathkeyExprs,
+										  tuplesPerPrevGroup, NULL);
+		else if (tuples > 4.0)
+			/*
+			 * Use geometric mean as estimation if there is no any stats.
+			 * Don't use DEFAULT_NUM_DISTINCT because it used for only one
+			 * column while here we try to estimate number of groups over
+			 * set of columns.
+			 */
+			nGroups = ceil(2.0 + sqrt(tuples) *
+				list_length(pathkeyExprs) / list_length(pathkeys));
+		else
+			nGroups = tuples;
+
+		if (heapSort)
+		{
+			if (tuplesPerPrevGroup < output_tuples)
+				/* comparing only inside output_tuples */
+				correctedNGroups =
+					ceil(2.0 * output_tuples / (tuplesPerPrevGroup / nGroups));
+			else
+				/* two groups - in output and out */
+				correctedNGroups = 2.0;
+		}
+		else
+			correctedNGroups = nGroups;
+
+		per_tuple_cost += totalFuncCost * LOG2(correctedNGroups);
+
+		/*
+		 * Real-world distribution isn't uniform but now we don't have a way to
+		 * determine that, so, add multiplier to get closer to worst case.
+		 */
+		tuplesPerPrevGroup = ceil(1.5 * tuples / nGroups);
+		if (tuplesPerPrevGroup > tuples)
+			tuplesPerPrevGroup = tuples;
+
+		/*
+		 * We could skip all followed columns for cost estimation, because we
+		 * believe that tuples are unique by set ot previous columns
+		 */
+		if (tuples <= nGroups)
+			break;
+	}
+
+	list_free(pathkeyExprs);
+
+	/* per_tuple_cost is in cpu_operator_cost units */
+	per_tuple_cost *= cpu_operator_cost;
+
+	/*
+	 * Accordingly to "Introduction to algorithms", Thomas H. Cormen, Charles E.
+	 * Leiserson, Ronald L. Rivest, ISBN 0-07-013143-0, quicksort estimation
+	 * formula has additional term proportional to number of tuples (See Chapter
+	 * 8.2 and Theorem 4.1). It has meaning with low number of tuples,
+	 * approximately less that 1e4. Of course, it could be inmplemented as
+	 * additional multiplier under logarithm, but use more complicated formula
+	 * which takes into account number of unique tuples and it isn't clear how
+	 * to combine multiplier with groups. Estimate it as 10 in cpu_operator_cost
+	 * unit.
+	 */
+	per_tuple_cost += 10 * cpu_operator_cost;
+
+	return tuples * per_tuple_cost;
+}
+
 /*
  * cost_sort
  *	  Determines and returns the cost of sorting a relation, including
@@ -1628,7 +1874,7 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
  * number of initial runs formed and M is the merge order used by tuplesort.c.
  * Since the average initial run should be about sort_mem, we have
  *		disk traffic = 2 * relsize * ceil(logM(p / sort_mem))
- *		cpu = comparison_cost * t * log2(t)
+ * and cpu cost computed by compute_cpu_sort_cost().
  *
  * If the sort is bounded (i.e., only the first k result tuples are needed)
  * and k tuples can fit into sort_mem, we use a heap method that keeps only
@@ -1649,13 +1895,6 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
  * 'comparison_cost' is the extra cost per comparison, if any
  * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
  * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
- *
- * NOTE: some callers currently pass NIL for pathkeys because they
- * can't conveniently supply the sort keys.  Since this routine doesn't
- * currently do anything with pathkeys anyway, that doesn't matter...
- * but if it ever does, it should react gracefully to lack of key data.
- * (Actually, the thing we'd most likely be interested in is just the number
- * of sort keys, which all callers *could* supply.)
  */
 void
 cost_sort(Path *path, PlannerInfo *root,
@@ -1682,9 +1921,6 @@ cost_sort(Path *path, PlannerInfo *root,
 	if (tuples < 2.0)
 		tuples = 2.0;
 
-	/* Include the default cost-per-comparison */
-	comparison_cost += 2.0 * cpu_operator_cost;
-
 	/* Do we have a useful LIMIT? */
 	if (limit_tuples > 0 && limit_tuples < tuples)
 	{
@@ -1713,7 +1949,9 @@ cost_sort(Path *path, PlannerInfo *root,
 		 *
 		 * Assume about N log2 N comparisons
 		 */
-		startup_cost += comparison_cost * tuples * LOG2(tuples);
+		startup_cost += compute_cpu_sort_cost(root, pathkeys,
+											  comparison_cost, tuples,
+											  tuples, false);
 
 		/* Disk costs */
 
@@ -1729,18 +1967,17 @@ cost_sort(Path *path, PlannerInfo *root,
 	}
 	else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
 	{
-		/*
-		 * We'll use a bounded heap-sort keeping just K tuples in memory, for
-		 * a total number of tuple comparisons of N log2 K; but the constant
-		 * factor is a bit higher than for quicksort.  Tweak it so that the
-		 * cost curve is continuous at the crossover point.
-		 */
-		startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);
+		/* We'll use a bounded heap-sort keeping just K tuples in memory. */
+		startup_cost += compute_cpu_sort_cost(root, pathkeys,
+											  comparison_cost, tuples,
+											  output_tuples, true);
 	}
 	else
 	{
 		/* We'll use plain quicksort on all the input tuples */
-		startup_cost += comparison_cost * tuples * LOG2(tuples);
+		startup_cost += compute_cpu_sort_cost(root, pathkeys,
+											  comparison_cost, tuples,
+											  tuples, false);
 	}
 
 	/*