From b4af98013dddfa8f56c14b3d6a0b667b9faefece Mon Sep 17 00:00:00 2001
From: James Hunter <james.hunter.pg@gmail.com>
Date: Tue, 4 Mar 2025 23:03:19 +0000
Subject: [PATCH 2/4] Add "workmem" estimates to Path node and PlannedStmt

To allow future optimizers to make decisions at Path time, this commit
aggregates each Path's total working memory onto the Path's "workmem"
field, rounded up to the next whole KB and normalized to a minimum of
64 KB.

To allow future hooks to override ExecAssignWorkMem(), this commit then
breaks that total working memory down into per-data-structure estimates
and stores them, alongside the workMemLimits, on the PlannedStmt.
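
For example (illustrative numbers only): a sort estimated to need
150,000 bytes is recorded as ceil(150000 / 1024) = 147 KB, while a
10,000-byte hash table rounds up to 10 KB and is then raised to the
64 KB floor; both values are clamped to MAX_KILOBYTES.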
---
 src/backend/executor/execParallel.c     |   2 +
 src/backend/executor/nodeHash.c         |  32 +-
 src/backend/nodes/tidbitmap.c           |  18 ++
 src/backend/optimizer/path/costsize.c   | 406 ++++++++++++++++++++++--
 src/backend/optimizer/plan/createplan.c | 267 +++++++++++++---
 src/backend/optimizer/plan/planner.c    |   2 +
 src/backend/optimizer/prep/prepagg.c    |  12 +
 src/backend/optimizer/util/pathnode.c   |  53 +++-
 src/include/executor/nodeHash.h         |   3 +-
 src/include/nodes/execnodes.h           |  12 +
 src/include/nodes/pathnodes.h           |  10 +-
 src/include/nodes/plannodes.h           |   7 +-
 src/include/nodes/tidbitmap.h           |   1 +
 src/include/optimizer/cost.h            |  13 +-
 src/include/optimizer/planmain.h        |   3 +-
 15 files changed, 762 insertions(+), 79 deletions(-)

diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index a8cb631963e..5c90a29d7d1 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -217,6 +217,8 @@ ExecSerializePlan(Plan *plan, EState *estate)
 	pstmt->stmt_location = -1;
 	pstmt->stmt_len = -1;
 	pstmt->workMemCategories = estate->es_plannedstmt->workMemCategories;
+	pstmt->workMemEstimates = estate->es_plannedstmt->workMemEstimates;
+	pstmt->workMemCounts = estate->es_plannedstmt->workMemCounts;
 	pstmt->workMemLimits = estate->es_plannedstmt->workMemLimits;
 
 	/* Return serialized copy of our dummy PlannedStmt. */
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index bb9af08dc5d..7d09ac8b5a3 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -35,6 +35,7 @@
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
 #include "miscadmin.h"
+#include "optimizer/cost.h"
 #include "port/pg_bitutils.h"
 #include "utils/dynahash.h"
 #include "utils/lsyscache.h"
@@ -453,6 +454,7 @@ ExecHashTableCreate(HashState *state)
 	int			nbuckets;
 	int			nbatch;
 	double		rows;
+	int			workmem;		/* ignored */
 	int			num_skew_mcvs;
 	int			log2_nbuckets;
 	MemoryContext oldcxt;
@@ -482,7 +484,7 @@ ExecHashTableCreate(HashState *state)
 							state->parallel_state->nparticipants - 1 : 0,
 							worker_space_allowed,
 							&space_allowed,
-							&nbuckets, &nbatch, &num_skew_mcvs);
+							&nbuckets, &nbatch, &num_skew_mcvs, &workmem);
 
 	/* nbuckets must be a power of 2 */
 	log2_nbuckets = my_log2(nbuckets);
@@ -668,7 +670,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 						size_t *total_space_allowed,
 						int *numbuckets,
 						int *numbatches,
-						int *num_skew_mcvs)
+						int *num_skew_mcvs,
+						int *workmem)
 {
 	int			tupsize;
 	double		inner_rel_bytes;
@@ -769,6 +772,27 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		*num_skew_mcvs = 0;
 
 	/*
+	 * Set "workmem" to the amount of memory needed to hold the inner rel in a
+	 * single batch. So this calculation doesn't care about "max_pointers".
+	 */
+	dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
+	nbuckets = (int) dbuckets;
+	/* don't let nbuckets be really small, though ... */
+	nbuckets = Max(nbuckets, 1024);
+	/* ... and force it to be a power of 2. */
+	nbuckets = pg_nextpower2_32(nbuckets);
+	bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
+
+	/* Don't forget the 2% overhead reserved for skew buckets! */
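+	/*
+	 * Illustrative arithmetic, assuming SKEW_HASH_MEM_PERCENT is 2: a main
+	 * hash table needing 98 KB is reported as 98 * 100 / (100 - 2) = 100 KB,
+	 * so the extra 2 KB (2% of the total) is reserved for the skew buckets.
+	 */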
+	*workmem = useskew ?
+		normalize_work_bytes((inner_rel_bytes + bucket_bytes) *
+							 100.0 / (100.0 - SKEW_HASH_MEM_PERCENT)) :
+		normalize_work_bytes(inner_rel_bytes + bucket_bytes);
+
+	/*
+	 * Now redo the nbuckets and bucket_bytes calculations, taking memory
+	 * limits into account.
+	 *
 	 * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
 	 * memory is filled, assuming a single batch; but limit the value so that
 	 * the pointer arrays we'll try to allocate do not exceed hash_table_bytes
@@ -799,6 +823,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	 * the required bucket headers, we will need multiple batches.
 	 */
 	bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
+
 	if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
 	{
 		/* We'll need multiple batches */
@@ -819,7 +844,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 									total_space_allowed,
 									numbuckets,
 									numbatches,
-									num_skew_mcvs);
+									num_skew_mcvs,
+									workmem);
 			return;
 		}
 
diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c
index 41031aa8f2f..425333b0218 100644
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -1560,6 +1560,24 @@ tbm_calculate_entries(Size maxbytes)
 	return (int) nbuckets;
 }
 
+/*
+ * tbm_calculate_bytes
+ *
+ * Estimate number of bytes needed to store maxentries hashtable entries.
+ *
+ * This function is the inverse of tbm_calculate_entries(), and is used to
+ * estimate a work_mem limit, based on cardinality.
+ */
+double
+tbm_calculate_bytes(double maxentries)
+{
+	maxentries = Min(maxentries, INT_MAX - 1);	/* safety limit */
+	maxentries = Max(maxentries, 16);	/* sanity limit */
+
+	return maxentries * (sizeof(PagetableEntry) + sizeof(Pointer) +
+						 sizeof(Pointer));
+}
+
 /*
  * Create a shared or private bitmap iterator and start iteration.
  *
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 353f51fdff2..27daa1966c2 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -105,6 +105,7 @@
 #include "optimizer/planmain.h"
 #include "optimizer/restrictinfo.h"
 #include "parser/parsetree.h"
+#include "utils/guc.h"
 #include "utils/lsyscache.h"
 #include "utils/selfuncs.h"
 #include "utils/spccache.h"
@@ -201,9 +202,14 @@ static Cost append_nonpartial_cost(List *subpaths, int numpaths,
 								   int parallel_workers);
 static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
 static int32 get_expr_width(PlannerInfo *root, const Node *expr);
-static double relation_byte_size(double tuples, int width);
 static double page_size(double tuples, int width);
 static double get_parallel_divisor(Path *path);
+static void compute_sort_output_sizes(double input_tuples, int input_width,
+									  double limit_tuples,
+									  double *output_tuples,
+									  double *output_bytes);
+static double compute_bitmap_workmem(RelOptInfo *baserel, Path *bitmapqual,
+									 Cardinality max_ancestor_rows);
 
 
 /*
@@ -1113,6 +1119,18 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
 	path->disabled_nodes = enable_bitmapscan ? 0 : 1;
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
+
+	/*
+	 * Set an overall working-memory estimate for the entire BitmapHeapPath --
+	 * including all of the IndexPaths and BitmapOrPaths in its bitmapqual.
+	 *
+	 * (When we convert this path into a BitmapHeapScan plan, we'll break this
+	 * overall estimate down into per-node estimates, just as we do for
+	 * AggPaths.)
+	 */
+	path->workmem = compute_bitmap_workmem(baserel, bitmapqual,
+										   0.0 /* max_ancestor_rows */ );
 }
 
 /*
@@ -1588,6 +1606,16 @@ cost_functionscan(Path *path, PlannerInfo *root,
 	path->disabled_nodes = 0;
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
+
+	/*
+	 * Per "XXX" comment above, this workmem estimate is likely to be wrong,
+	 * because the "rows" estimate is pretty phony. Report the estimate
+	 * anyway, for completeness. (This is at least better than saying it won't
+	 * use *any* working memory.)
+	 */
+	path->workmem = list_length(rte->functions) *
+		normalize_work_bytes(relation_byte_size(path->rows,
+												path->pathtarget->width));
 }
 
 /*
@@ -1645,6 +1673,16 @@ cost_tablefuncscan(Path *path, PlannerInfo *root,
 	path->disabled_nodes = 0;
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
+
+	/*
+	 * Per "XXX" comment above, this workmem estimate is likely to be wrong,
+	 * because the "rows" estimate is pretty phony. Report the estimate
+	 * anyway, for completeness. (This is at least better than saying it won't
+	 * use *any* working memory.)
+	 */
+	path->workmem =
+		normalize_work_bytes(relation_byte_size(path->rows,
+												path->pathtarget->width));
 }
 
 /*
@@ -1741,6 +1779,9 @@ cost_ctescan(Path *path, PlannerInfo *root,
 	path->disabled_nodes = 0;
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
+	path->workmem =
+		normalize_work_bytes(relation_byte_size(path->rows,
+												path->pathtarget->width));
 }
 
 /*
@@ -1824,7 +1865,7 @@ cost_resultscan(Path *path, PlannerInfo *root,
  * We are given Paths for the nonrecursive and recursive terms.
  */
 void
-cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
+cost_recursive_union(RecursiveUnionPath *runion, Path *nrterm, Path *rterm)
 {
 	Cost		startup_cost;
 	Cost		total_cost;
@@ -1851,12 +1892,37 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
 	 */
 	total_cost += cpu_tuple_cost * total_rows;
 
-	runion->disabled_nodes = nrterm->disabled_nodes + rterm->disabled_nodes;
-	runion->startup_cost = startup_cost;
-	runion->total_cost = total_cost;
-	runion->rows = total_rows;
-	runion->pathtarget->width = Max(nrterm->pathtarget->width,
-									rterm->pathtarget->width);
+	runion->path.disabled_nodes = nrterm->disabled_nodes + rterm->disabled_nodes;
+	runion->path.startup_cost = startup_cost;
+	runion->path.total_cost = total_cost;
+	runion->path.rows = total_rows;
+	runion->path.pathtarget->width = Max(nrterm->pathtarget->width,
+										 rterm->pathtarget->width);
+
+	/*
+	 * Include memory for working and intermediate tables. Since we'll
+	 * repeatedly swap the two tables, use 2x whichever is larger as our
+	 * estimate.
+	 */
+	runion->path.workmem =
+		normalize_work_bytes(
+							 Max(relation_byte_size(nrterm->rows,
+													nrterm->pathtarget->width),
+								 relation_byte_size(rterm->rows,
+													rterm->pathtarget->width))
+							 * 2);
+
+	if (list_length(runion->distinctList) > 0)
+	{
+		/* Also include memory for hash table. */
+		Size		hashentrysize;
+
+		hashentrysize = MAXALIGN(runion->path.pathtarget->width) +
+			MAXALIGN(SizeofMinimalTupleHeader);
+
+		runion->path.workmem +=
+			normalize_work_bytes(runion->numGroups * hashentrysize);
+	}
 }
 
 /*
@@ -1896,7 +1962,7 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
  * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
  */
 static void
-cost_tuplesort(Cost *startup_cost, Cost *run_cost,
+cost_tuplesort(Cost *startup_cost, Cost *run_cost, Cost *nbytes,
 			   double tuples, int width,
 			   Cost comparison_cost, int sort_mem,
 			   double limit_tuples)
@@ -1916,17 +1982,8 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost,
 	/* Include the default cost-per-comparison */
 	comparison_cost += 2.0 * cpu_operator_cost;
 
-	/* Do we have a useful LIMIT? */
-	if (limit_tuples > 0 && limit_tuples < tuples)
-	{
-		output_tuples = limit_tuples;
-		output_bytes = relation_byte_size(output_tuples, width);
-	}
-	else
-	{
-		output_tuples = tuples;
-		output_bytes = input_bytes;
-	}
+	compute_sort_output_sizes(tuples, width, limit_tuples,
+							  &output_tuples, &output_bytes);
 
 	if (output_bytes > sort_mem_bytes)
 	{
@@ -1983,6 +2040,7 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost,
 	 * counting the LIMIT otherwise.
 	 */
 	*run_cost = cpu_operator_cost * tuples;
+	*nbytes = output_bytes;
 }
 
 /*
@@ -2012,6 +2070,7 @@ cost_incremental_sort(Path *path,
 				input_groups;
 	Cost		group_startup_cost,
 				group_run_cost,
+				group_nbytes,
 				group_input_run_cost;
 	List	   *presortedExprs = NIL;
 	ListCell   *l;
@@ -2086,7 +2145,7 @@ cost_incremental_sort(Path *path,
 	 * Estimate the average cost of sorting of one group where presorted keys
 	 * are equal.
 	 */
-	cost_tuplesort(&group_startup_cost, &group_run_cost,
+	cost_tuplesort(&group_startup_cost, &group_run_cost, &group_nbytes,
 				   group_tuples, width, comparison_cost, sort_mem,
 				   limit_tuples);
 
@@ -2127,6 +2186,14 @@ cost_incremental_sort(Path *path,
 
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
+
+	/*
+	 * Incremental sort switches between two Tuplesortstates: one that sorts
+	 * all columns ("full") and one that sorts only suffix columns ("prefix").
+	 * We'll assume they're both around the same size: large enough to hold
+	 * one sort group.
+	 */
+	path->workmem = normalize_work_bytes(group_nbytes * 2.0);
 }
 
 /*
@@ -2151,8 +2218,9 @@ cost_sort(Path *path, PlannerInfo *root,
 {
 	Cost		startup_cost;
 	Cost		run_cost;
+	Cost		nbytes;
 
-	cost_tuplesort(&startup_cost, &run_cost,
+	cost_tuplesort(&startup_cost, &run_cost, &nbytes,
 				   tuples, width,
 				   comparison_cost, sort_mem,
 				   limit_tuples);
@@ -2163,6 +2231,7 @@ cost_sort(Path *path, PlannerInfo *root,
 	path->disabled_nodes = input_disabled_nodes + (enable_sort ? 0 : 1);
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
+	path->workmem = normalize_work_bytes(nbytes);
 }
 
 /*
@@ -2549,6 +2618,7 @@ cost_material(Path *path,
 	path->disabled_nodes = input_disabled_nodes + (enable_material ? 0 : 1);
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
+	path->workmem = normalize_work_bytes(nbytes);
 }
 
 /*
@@ -2622,6 +2692,9 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath,
 	/* Remember the ndistinct estimate for EXPLAIN */
 	mpath->est_unique_keys = ndistinct;
 
+	/* How much working memory would we need, to store every distinct tuple? */
+	mpath->path.workmem = normalize_work_bytes(ndistinct * est_entry_bytes);
+
 	/*
 	 * Since we've already estimated the maximum number of entries we can
 	 * store at once and know the estimated number of distinct values we'll be
@@ -2899,6 +2972,19 @@ cost_agg(Path *path, PlannerInfo *root,
 	path->disabled_nodes = disabled_nodes;
 	path->startup_cost = startup_cost;
 	path->total_cost = total_cost;
+
+	/* Include memory needed to produce output. */
+	path->workmem =
+		compute_agg_output_workmem(root, aggstrategy, numGroups,
+								   aggcosts->transitionSpace, input_tuples,
+								   input_width, false /* cost_sort */ );
+
+	/* Also include memory needed to sort inputs (if needed): */
+	if (aggcosts->numSortBuffers > 0)
+	{
+		path->workmem += (double) aggcosts->numSortBuffers *
+			compute_agg_input_workmem(input_tuples, input_width);
+	}
 }
 
 /*
@@ -3133,7 +3219,7 @@ cost_windowagg(Path *path, PlannerInfo *root,
 			   List *windowFuncs, WindowClause *winclause,
 			   int input_disabled_nodes,
 			   Cost input_startup_cost, Cost input_total_cost,
-			   double input_tuples)
+			   double input_tuples, int width)
 {
 	Cost		startup_cost;
 	Cost		total_cost;
@@ -3215,6 +3301,11 @@ cost_windowagg(Path *path, PlannerInfo *root,
 	if (startup_tuples > 1.0)
 		path->startup_cost += (total_cost - startup_cost) / input_tuples *
 			(startup_tuples - 1.0);
+
+	/* We need to store a window of size "startup_tuples" in a Tuplestore. */
+	path->workmem =
+		normalize_work_bytes(relation_byte_size(startup_tuples, width));
 }
 
 /*
@@ -3369,6 +3460,7 @@ initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
 	workspace->total_cost = startup_cost + run_cost;
 	/* Save private data for final_cost_nestloop */
 	workspace->run_cost = run_cost;
+	workspace->workmem = 0;
 }
 
 /*
@@ -3833,6 +3925,14 @@ initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 	workspace->total_cost = startup_cost + run_cost + inner_run_cost;
 	/* Save private data for final_cost_mergejoin */
 	workspace->run_cost = run_cost;
+
+	/*
+	 * By itself, Merge Join requires no working memory. If it adds one or
+	 * more Sort or Material nodes, we'll track their working memory when we
+	 * create them, inside createplan.c.
+	 */
+	workspace->workmem = 0;
+
 	workspace->inner_run_cost = inner_run_cost;
 	workspace->outer_rows = outer_rows;
 	workspace->inner_rows = inner_rows;
@@ -4204,6 +4304,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 	double		outer_path_rows = outer_path->rows;
 	double		inner_path_rows = inner_path->rows;
 	double		inner_path_rows_total = inner_path_rows;
+	int			workmem;
 	int			num_hashclauses = list_length(hashclauses);
 	int			numbuckets;
 	int			numbatches;
@@ -4262,7 +4363,8 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 							&space_allowed,
 							&numbuckets,
 							&numbatches,
-							&num_skew_mcvs);
+							&num_skew_mcvs,
+							&workmem);
 
 	/*
 	 * If inner relation is too big then we will need to "batch" the join,
@@ -4293,6 +4395,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 	workspace->numbuckets = numbuckets;
 	workspace->numbatches = numbatches;
 	workspace->inner_rows_total = inner_path_rows_total;
+	workspace->workmem = workmem;
 }
 
 /*
@@ -4301,8 +4404,8 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
  *
  * Note: the numbatches estimate is also saved into 'path' for use later
  *
- * 'path' is already filled in except for the rows and cost fields and
- *		num_batches
+ * 'path' is already filled in except for the rows and cost fields,
+ *		num_batches, and workmem
  * 'workspace' is the result from initial_cost_hashjoin
  * 'extra' contains miscellaneous information about the join
  */
@@ -4319,6 +4422,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
 	List	   *hashclauses = path->path_hashclauses;
 	Cost		startup_cost = workspace->startup_cost;
 	Cost		run_cost = workspace->run_cost;
+	int			workmem = workspace->workmem;
 	int			numbuckets = workspace->numbuckets;
 	int			numbatches = workspace->numbatches;
 	Cost		cpu_per_tuple;
@@ -4555,6 +4659,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
 
 	path->jpath.path.startup_cost = startup_cost;
 	path->jpath.path.total_cost = startup_cost + run_cost;
+	path->jpath.path.workmem = workmem;
 }
 
 
@@ -4577,6 +4682,9 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
 
 	if (subplan->useHashTable)
 	{
+		long		nbuckets;
+		Size		hashentrysize;
+
 		/*
 		 * If we are using a hash table for the subquery outputs, then the
 		 * cost of evaluating the query is a one-time cost.  We charge one
@@ -4588,13 +4696,37 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
 
 		/*
 		 * Working memory needed for the hashtable (and hashnulls, if needed).
+		 * The logic below MUST match the logic in buildSubPlanHash() and
+		 * ExecInitSubPlan().
 		 */
-		subplan->hashtab_workmem_id = add_hash_workmem(root->glob);
+		nbuckets = clamp_cardinality_to_long(plan->plan_rows);
+		if (nbuckets < 1)
+			nbuckets = 1;
+
+		hashentrysize = MAXALIGN(plan->plan_width) +
+			MAXALIGN(SizeofMinimalTupleHeader);
+
+		subplan->hashtab_workmem_id =
+			add_hash_workmem(root->glob,
+							 normalize_work_bytes((double) nbuckets *
+												  hashentrysize));
 
 		if (!subplan->unknownEqFalse)
 		{
 			/* Also needs a hashnulls table.  */
-			subplan->hashnul_workmem_id = add_hash_workmem(root->glob);
+			if (IsA(subplan->testexpr, OpExpr))
+				nbuckets = 1;	/* there can be only one entry */
+			else
+			{
+				nbuckets /= 16;
+				if (nbuckets < 1)
+					nbuckets = 1;
+			}
+
+			subplan->hashnul_workmem_id =
+				add_hash_workmem(root->glob,
+								 normalize_work_bytes((double) nbuckets *
+													  hashentrysize));
 		}
 
 		/*
@@ -6481,7 +6613,7 @@ get_expr_width(PlannerInfo *root, const Node *expr)
  *	  Estimate the storage space in bytes for a given number of tuples
  *	  of a given width (size in bytes).
  */
-static double
+double
 relation_byte_size(double tuples, int width)
 {
 	return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader));
@@ -6660,3 +6792,219 @@ compute_gather_rows(Path *path)
 
 	return clamp_row_est(path->rows * get_parallel_divisor(path));
 }
+
+/*
+ * compute_sort_output_sizes
+ *	  Estimate amount of memory and rows needed to hold a Sort operator's output
+ */
+static void
+compute_sort_output_sizes(double input_tuples, int input_width,
+						  double limit_tuples,
+						  double *output_tuples, double *output_bytes)
+{
+	/*
+	 * We want to be sure the cost of a sort is never estimated as zero, even
+	 * if passed-in tuple count is zero.  Besides, mustn't do log(0)...
+	 */
+	if (input_tuples < 2.0)
+		input_tuples = 2.0;
+
+	/* Do we have a useful LIMIT? */
+	if (limit_tuples > 0 && limit_tuples < input_tuples)
+		*output_tuples = limit_tuples;
+	else
+		*output_tuples = input_tuples;
+
+	*output_bytes = relation_byte_size(*output_tuples, input_width);
+}
+
+/*
+ * compute_agg_input_workmem
+ *	  Estimate memory (in KB) needed to hold a sort buffer for an aggregate's input
+ *
+ * Some aggregates involve DISTINCT or ORDER BY, so they need to sort their
+ * input, before they can process it. We need one sort buffer per such
+ * aggregate, and this function returns that sort buffer's (estimated) size (in
+ * KB).
+ */
+int
+compute_agg_input_workmem(double input_tuples, double input_width)
+{
+	double		output_tuples;	/* ignored */
+	double		output_bytes;
+
+	/* Account for size of one buffer needed to sort the input. */
+	compute_sort_output_sizes(input_tuples, input_width,
+							  0.0 /* limit_tuples */ ,
+							  &output_tuples, &output_bytes);
+	return normalize_work_bytes(output_bytes);
+}
+
+/*
+ * compute_agg_output_workmem
+ *	  Estimate amount of memory needed (in KB) to hold an aggregate's output
+ *
+ * In a Hash aggregate, we need space for the hash table that holds the
+ * aggregated data.
+ *
+ * Sort aggregates require output space only if they are part of a Grouping
+ * Sets chain: the first aggregate writes to its "sort_out" buffer, which the
+ * second aggregate uses as its "sort_in" buffer, and sorts.
+ *
+ * In the latter case, the "Path" code already costs the sort by calling
+ * cost_sort(), so it passes "cost_sort = false" to this function, to avoid
+ * double-counting.
+ */
+int
+compute_agg_output_workmem(PlannerInfo *root, AggStrategy aggstrategy,
+						   double numGroups, uint64 transitionSpace,
+						   double input_tuples, double input_width,
+						   bool cost_sort)
+{
+	/* Account for size of hash table to hold the output. */
+	if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED)
+	{
+		double		hashentrysize;
+
+		hashentrysize = hash_agg_entry_size(list_length(root->aggtransinfos),
+											input_width, transitionSpace);
+		return normalize_work_bytes(numGroups * hashentrysize);
+	}
+
+	/* Account for the size of the "sort_out" buffer. */
+	if (cost_sort && aggstrategy == AGG_SORTED)
+	{
+		double		output_tuples;	/* ignored */
+		double		output_bytes;
+
+		Assert(aggstrategy == AGG_SORTED);
+
+		compute_sort_output_sizes(numGroups, input_width,
+								  0.0 /* limit_tuples */ ,
+								  &output_tuples, &output_bytes);
+		return normalize_work_bytes(output_bytes);
+	}
+
+	return 0;
+}
+
+/*
+ * compute_bitmap_workmem
+ *	  Estimate total working memory (in KB) needed by bitmapqual
+ *
+ * Although we don't fill in the workmem or rows fields on the bitmapqual's
+ * paths, we fill them in on the owning BitmapHeapPath. This function estimates
+ * the total work_mem needed by all BitmapOrPaths and IndexPaths inside
+ * bitmapqual.
+ */
+static double
+compute_bitmap_workmem(RelOptInfo *baserel, Path *bitmapqual,
+					   Cardinality max_ancestor_rows)
+{
+	double		workmem = 0.0;
+	Cost		cost;			/* not used */
+	Selectivity selec;
+	Cardinality plan_rows;
+
+	/* How many rows will this node output? */
+	cost_bitmap_tree_node(bitmapqual, &cost, &selec);
+	plan_rows = clamp_row_est(selec * baserel->tuples);
+
+	/*
+	 * At runtime, we'll reuse the left-most child's TID bitmap. Let that
+	 * child know to request enough working memory to hold all its ancestors'
+	 * results.
+	 */
+	max_ancestor_rows = Max(max_ancestor_rows, plan_rows);
+
+	if (IsA(bitmapqual, BitmapAndPath))
+	{
+		BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
+		ListCell   *l;
+
+		foreach(l, apath->bitmapquals)
+		{
+			workmem +=
+				compute_bitmap_workmem(baserel, (Path *) lfirst(l),
+									   foreach_current_index(l) == 0 ?
+									   max_ancestor_rows : 0.0);
+		}
+	}
+	else if (IsA(bitmapqual, BitmapOrPath))
+	{
+		BitmapOrPath *opath = (BitmapOrPath *) bitmapqual;
+		ListCell   *l;
+
+		foreach(l, opath->bitmapquals)
+		{
+			workmem +=
+				compute_bitmap_workmem(baserel, (Path *) lfirst(l),
+									   foreach_current_index(l) == 0 ?
+									   max_ancestor_rows : 0.0);
+		}
+	}
+	else if (IsA(bitmapqual, IndexPath))
+	{
+		/* Working memory needed for 1 TID bitmap. */
+		workmem +=
+			normalize_work_bytes(tbm_calculate_bytes(max_ancestor_rows));
+	}
+
+	return workmem;
+}
+
+/*
+ * normalize_work_kb
+ *	  Convert a double, "KB" working-memory estimate to an int, "KB" value
+ *
+ * Normalizes non-zero input to a minimum of 64 (KB), rounding up to the
+ * nearest whole KB.
+ */
+int
+normalize_work_kb(double nkb)
+{
+	double		workmem;
+
+	if (nkb == 0.0)
+		return 0;				/* caller apparently doesn't need any workmem */
+
+	/*
+	 * We'll assign working-memory to SQL operators in 1 KB increments, so
+	 * round up to the next whole KB.
+	 */
+	workmem = ceil(nkb);
+
+	/*
+	 * Although some components can probably work with < 64 KB of working
+	 * memory, PostgreSQL has imposed a hard minimum of 64 KB on the
+	 * "work_mem" GUC, for a long time; so, by now, some components probably
+	 * rely on this minimum, implicitly, and would fail if we tried to assign
+	 * them < 64 KB.
+	 *
+	 * Perhaps this minimum can be relaxed, in the future; but memory sizes
+	 * keep increasing, and right now the minimum of 64 KB = 1.6 percent of
+	 * the default "work_mem" of 4 MB.
+	 *
+	 * So, even with this (overly?) cautious normalization, with the default
+	 * GUC settings, we can still achieve a working-memory reduction of
+	 * 64-to-1.
+	 */
+	workmem = Max((double) 64, workmem);
+
+	/* And clamp to MAX_KILOBYTES. */
+	workmem = Min(workmem, (double) MAX_KILOBYTES);
+
+	return (int) workmem;
+}
+
+/*
+ * normalize_work_bytes
+ *	  Convert a double, "bytes" working-memory estimate to an int, "KB" value
+ *
+ * Same as above, but takes input in bytes rather than in KB.
+ */
+int
+normalize_work_bytes(double nbytes)
+{
+	return normalize_work_kb(nbytes / 1024.0);
+}
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 22834fe37f4..aba15d54fa1 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -130,6 +130,7 @@ static BitmapHeapScan *create_bitmap_scan_plan(PlannerInfo *root,
 											   BitmapHeapPath *best_path,
 											   List *tlist, List *scan_clauses);
 static Plan *create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
+								   Cardinality max_ancestor_rows,
 								   List **qual, List **indexqual, List **indexECs);
 static void bitmap_subplan_mark_shared(Plan *plan);
 static TidScan *create_tidscan_plan(PlannerInfo *root, TidPath *best_path,
@@ -319,6 +320,8 @@ static ModifyTable *make_modifytable(PlannerInfo *root, Plan *subplan,
 									 int epqParam);
 static GatherMerge *create_gather_merge_plan(PlannerInfo *root,
 											 GatherMergePath *best_path);
+static int	add_workmem(PlannerGlobal *glob, int estimate);
+static int	add_workmems(PlannerGlobal *glob, int estimate, int count);
 
 
 /*
@@ -1706,7 +1709,8 @@ create_material_plan(PlannerInfo *root, MaterialPath *best_path, int flags)
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
 
-	plan->plan.workmem_id = add_workmem(root->glob);
+	plan->plan.workmem_id =
+		add_workmem(root->glob, normalize_work_kb(best_path->path.workmem));
 
 	return plan;
 }
@@ -1763,7 +1767,9 @@ create_memoize_plan(PlannerInfo *root, MemoizePath *best_path, int flags)
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
 
-	plan->plan.workmem_id = add_hash_workmem(root->glob);
+	plan->plan.workmem_id =
+		add_hash_workmem(root->glob,
+						 normalize_work_kb(best_path->path.workmem));
 
 	return plan;
 }
@@ -1912,7 +1918,9 @@ create_unique_plan(PlannerInfo *root, UniquePath *best_path, int flags)
 								 0,
 								 subplan);
 
-		plan->workmem_id = add_hash_workmem(root->glob);
+		plan->workmem_id =
+			add_hash_workmem(root->glob,
+							 normalize_work_kb(best_path->path.workmem));
 	}
 	else
 	{
@@ -2259,7 +2267,9 @@ create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags)
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
 
-	plan->plan.workmem_id = add_workmem(root->glob);
+	plan->plan.workmem_id =
+		add_workmem(root->glob,
+					normalize_work_kb(best_path->path.workmem));
 
 	return plan;
 }
@@ -2287,7 +2297,13 @@ create_incrementalsort_plan(PlannerInfo *root, IncrementalSortPath *best_path,
 
 	copy_generic_path_info(&plan->sort.plan, (Path *) best_path);
 
-	plan->sort.plan.workmem_id = add_workmem(root->glob);
+	/*
+	 * IncrementalSort creates two sort buffers, which the Path's "workmem"
+	 * estimate combines into a single value. Split it back into two now.
+	 */
+	plan->sort.plan.workmem_id =
+		add_workmems(root->glob,
+					 normalize_work_kb(best_path->spath.path.workmem / 2), 2);
 
 	return plan;
 }
@@ -2400,11 +2416,32 @@ create_agg_plan(PlannerInfo *root, AggPath *best_path)
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
 
+	/*
+	 * Replace the AggPath's overall workmem estimate with finer-grained
+	 * estimates.
+	 */
 	if (plan->aggstrategy == AGG_HASHED)
-		plan->plan.workmem_id = add_hash_workmem(root->glob);
+	{
+		int			workmem =
+			compute_agg_output_workmem(root, AGG_HASHED,
+									   plan->numGroups,
+									   plan->transitionSpace,
+									   subplan->plan_rows,
+									   subplan->plan_width,
+									   false /* cost_sort */ );
 
-	/* Also include working memory needed to sort the input: */
-	plan->sortWorkMemId = add_workmem(root->glob);
+		plan->plan.workmem_id = add_hash_workmem(root->glob, workmem);
+	}
+
+	/* Also include estimated memory needed to sort the input: */
+	if (best_path->numSortBuffers > 0)
+	{
+		int			workmem = compute_agg_input_workmem(subplan->plan_rows,
+														subplan->plan_width);
+
+		plan->sortWorkMemId =
+			add_workmems(root->glob, workmem, best_path->numSortBuffers);
+	}
 
 	return plan;
 }
@@ -2466,6 +2503,9 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
 	int			maxref;
 	List	   *chain;
 	ListCell   *lc;
+	int			num_sort_aggs = 0;
+	int			max_sort_agg_workmem = 0;
+	double		sum_hash_agg_workmem = 0.0;
 
 	/* Shouldn't get here without grouping sets */
 	Assert(root->parse->groupingSets);
@@ -2527,6 +2567,8 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
 			Plan	   *sort_plan = NULL;
 			Agg		   *agg_plan;
 			AggStrategy strat;
+			bool		cost_sort;
+			int			workmem;
 
 			new_grpColIdx = remap_groupColIdx(root, rollup->groupClause);
 
@@ -2577,6 +2619,33 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
 				first_sort_agg = agg_plan;
 			}
 
+			/*
+			 * If we're an AGG_SORTED, but not the last one, we need to cost
+			 * the working memory needed to produce our "sort_out" buffer.
+			 */
+			cost_sort = foreach_current_index(lc) < list_length(rollups) - 1;
+
+			/* Estimated memory needed to hold the output: */
+			workmem =
+				compute_agg_output_workmem(root, agg_plan->aggstrategy,
+										   agg_plan->numGroups,
+										   agg_plan->transitionSpace,
+										   subplan->plan_rows,
+										   subplan->plan_width,
+										   cost_sort);
+
+			if (agg_plan->aggstrategy == AGG_HASHED)
+			{
+				/* All Hash Grouping Sets share the same workmem limit. */
+				sum_hash_agg_workmem += workmem;
+			}
+			else if (agg_plan->aggstrategy == AGG_SORTED)
+			{
+				/* Every Sort Grouping Set gets its own workmem limit. */
+				max_sort_agg_workmem = Max(max_sort_agg_workmem, workmem);
+				++num_sort_aggs;
+			}
+
 			chain = lappend(chain, agg_plan);
 		}
 	}
@@ -2588,6 +2657,8 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
 		RollupData *rollup = linitial(rollups);
 		AttrNumber *top_grpColIdx;
 		int			numGroupCols;
+		bool		cost_sort;
+		int			workmem;
 
 		top_grpColIdx = remap_groupColIdx(root, rollup->groupClause);
 
@@ -2610,6 +2681,27 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
 		/* Copy cost data from Path to Plan */
 		copy_generic_path_info(&plan->plan, &best_path->path);
 
+		/*
+		 * If we're an AGG_SORTED, but not the last one, we need to cost the
+		 * working memory needed to produce our "sort_out" buffer.
+		 */
+		cost_sort = list_length(rollups) > 1;
+
+		/*
+		 * Replace the overall workmem estimate that we copied from the Path
+		 * with finer-grained estimates.
+		 */
+
+		/* Estimated memory needed to hold the output: */
+		workmem =
+			compute_agg_output_workmem(root, plan->aggstrategy,
+									   plan->numGroups,
+									   plan->transitionSpace,
+									   subplan->plan_rows,
+									   subplan->plan_width,
+									   cost_sort);
+
 		/*
 		 * NOTE: We will place the workmem needed to sort the input (if any)
 		 * on the first agg, the Hash workmem on the first Hash agg, and the
@@ -2618,20 +2710,37 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
 		if (plan->aggstrategy == AGG_HASHED || plan->aggstrategy == AGG_MIXED)
 		{
 			/* All Hash Grouping Sets share the same workmem limit. */
-			plan->plan.workmem_id = add_hash_workmem(root->glob);
+			sum_hash_agg_workmem += workmem;
+			plan->plan.workmem_id = add_hash_workmem(root->glob,
+													 sum_hash_agg_workmem);
 		}
 		else if (plan->aggstrategy == AGG_SORTED)
 		{
 			/* Every Sort Grouping Set gets its own workmem limit. */
+			max_sort_agg_workmem = Max(max_sort_agg_workmem, workmem);
+			++num_sort_aggs;
+
 			first_sort_agg = plan;
 		}
 
 		/* Store the workmem limit, for all Sorts, on the first Sort. */
-		if (first_sort_agg)
-			first_sort_agg->plan.workmem_id = add_workmem(root->glob);
+		if (num_sort_aggs > 1)
+		{
+			first_sort_agg->plan.workmem_id =
+				add_workmems(root->glob, max_sort_agg_workmem,
+							 num_sort_aggs > 2 ? 2 : 1);
+		}
 
 		/* Also include working memory needed to sort the input: */
-		plan->sortWorkMemId = add_workmem(root->glob);
+		if (best_path->numSortBuffers > 0)
+		{
+			workmem = compute_agg_input_workmem(subplan->plan_rows,
+												subplan->plan_width);
+
+			plan->sortWorkMemId =
+				add_workmems(root->glob, workmem,
+							 best_path->numSortBuffers * list_length(rollups));
+		}
 	}
 
 	return (Plan *) plan;
@@ -2796,7 +2905,8 @@ create_windowagg_plan(PlannerInfo *root, WindowAggPath *best_path)
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
 
-	plan->plan.workmem_id = add_workmem(root->glob);
+	plan->plan.workmem_id =
+		add_workmem(root->glob, normalize_work_kb(best_path->path.workmem));
 
 	return plan;
 }
@@ -2838,7 +2948,9 @@ create_setop_plan(PlannerInfo *root, SetOpPath *best_path, int flags)
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
 
-	plan->plan.workmem_id = add_hash_workmem(root->glob);
+	plan->plan.workmem_id =
+		add_hash_workmem(root->glob,
+						 normalize_work_kb(best_path->path.workmem));
 
 	return plan;
 }
@@ -2876,11 +2988,38 @@ create_recursiveunion_plan(PlannerInfo *root, RecursiveUnionPath *best_path)
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
 
-	plan->plan.workmem_id = add_workmem(root->glob);
+	/*
+	 * Replace our overall "workmem" estimate with estimates at finer
+	 * granularity.
+	 */
+
+	/*
+	 * Include memory for working and intermediate tables.  Since we'll
+	 * repeatedly swap the two tables, use the larger of the two as our
+	 * working-memory estimate.
+	 *
+	 * NOTE: The Path's "workmem" estimate is for the whole Path, but the
+	 * Plan's "workmem" estimates are *per data structure*. So, this value is
+	 * half of the corresponding Path's value.
+	 */
+	plan->plan.workmem_id =
+		add_workmems(root->glob,
+					 normalize_work_bytes(Max(relation_byte_size(leftplan->plan_rows,
+																 leftplan->plan_width),
+											  relation_byte_size(rightplan->plan_rows,
+																 rightplan->plan_width))),
+					 2);
 
 	/* Also include working memory for hash table. */
 	if (plan->numCols > 0)
-		plan->hashWorkMemId = add_hash_workmem(root->glob);
+	{
+		Size		entrysize =
+			sizeof(TupleHashEntryData) + plan->plan.plan_width;
+
+		plan->hashWorkMemId =
+			add_hash_workmem(root->glob,
+							 normalize_work_bytes(plan->numGroups * entrysize));
+	}
 
 	return plan;
 }
@@ -3322,6 +3461,7 @@ create_bitmap_scan_plan(PlannerInfo *root,
 
 	/* Process the bitmapqual tree into a Plan tree and qual lists */
 	bitmapqualplan = create_bitmap_subplan(root, best_path->bitmapqual,
+										   0.0 /* max_ancestor_rows */ ,
 										   &bitmapqualorig, &indexquals,
 										   &indexECs);
 
@@ -3433,9 +3573,24 @@ create_bitmap_scan_plan(PlannerInfo *root,
  */
 static Plan *
 create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
+					  Cardinality max_ancestor_rows,
 					  List **qual, List **indexqual, List **indexECs)
 {
 	Plan	   *plan;
+	Cost		cost;			/* not used */
+	Selectivity selec;
+	Cardinality plan_rows;
+
+	/* How many rows will this node output? */
+	cost_bitmap_tree_node(bitmapqual, &cost, &selec);
+	plan_rows = clamp_row_est(selec * bitmapqual->parent->tuples);
+
+	/*
+	 * At runtime, we'll reuse the left-most child's TID bitmap. Let that
+	 * child know to request enough working memory to hold all its ancestors'
+	 * results.
+	 */
+	max_ancestor_rows = Max(max_ancestor_rows, plan_rows);
 
 	if (IsA(bitmapqual, BitmapAndPath))
 	{
@@ -3461,6 +3616,8 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
 			List	   *subindexEC;
 
 			subplan = create_bitmap_subplan(root, (Path *) lfirst(l),
+											foreach_current_index(l) == 0 ?
+											max_ancestor_rows : 0.0,
 											&subqual, &subindexqual,
 											&subindexEC);
 			subplans = lappend(subplans, subplan);
@@ -3472,8 +3629,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
 		plan = (Plan *) make_bitmap_and(subplans);
 		plan->startup_cost = apath->path.startup_cost;
 		plan->total_cost = apath->path.total_cost;
-		plan->plan_rows =
-			clamp_row_est(apath->bitmapselectivity * apath->path.parent->tuples);
+		plan->plan_rows = plan_rows;
 		plan->plan_width = 0;	/* meaningless */
 		plan->parallel_aware = false;
 		plan->parallel_safe = apath->path.parallel_safe;
@@ -3508,6 +3664,8 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
 			List	   *subindexEC;
 
 			subplan = create_bitmap_subplan(root, (Path *) lfirst(l),
+											foreach_current_index(l) == 0 ?
+											max_ancestor_rows : 0.0,
 											&subqual, &subindexqual,
 											&subindexEC);
 			subplans = lappend(subplans, subplan);
@@ -3536,8 +3694,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
 			plan = (Plan *) make_bitmap_or(subplans);
 			plan->startup_cost = opath->path.startup_cost;
 			plan->total_cost = opath->path.total_cost;
-			plan->plan_rows =
-				clamp_row_est(opath->bitmapselectivity * opath->path.parent->tuples);
+			plan->plan_rows = plan_rows;
 			plan->plan_width = 0;	/* meaningless */
 			plan->parallel_aware = false;
 			plan->parallel_safe = opath->path.parallel_safe;
@@ -3583,13 +3740,14 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
 		/* and set its cost/width fields appropriately */
 		plan->startup_cost = 0.0;
 		plan->total_cost = ipath->indextotalcost;
-		plan->plan_rows =
-			clamp_row_est(ipath->indexselectivity * ipath->path.parent->tuples);
+		plan->plan_rows = plan_rows;
 		plan->plan_width = 0;	/* meaningless */
 		plan->parallel_aware = false;
 		plan->parallel_safe = ipath->path.parallel_safe;
 
-		plan->workmem_id = add_workmem(root->glob);
+		plan->workmem_id =
+			add_workmem(root->glob,
+						normalize_work_bytes(tbm_calculate_bytes(max_ancestor_rows)));
 
 		/* Extract original index clauses, actual index quals, relevant ECs */
 		subquals = NIL;
@@ -3898,7 +4056,15 @@ create_functionscan_plan(PlannerInfo *root, Path *best_path,
 
 	copy_generic_path_info(&scan_plan->scan.plan, best_path);
 
-	scan_plan->scan.plan.workmem_id = add_workmem(root->glob);
+	/*
+	 * Replace the path's total working-memory estimate with a per-function
+	 * estimate.
+	 */
+	scan_plan->scan.plan.workmem_id =
+		add_workmems(root->glob,
+					 normalize_work_bytes(relation_byte_size(scan_plan->scan.plan.plan_rows,
+															 scan_plan->scan.plan.plan_width)),
+					 list_length(functions));
 
 	return scan_plan;
 }
@@ -3943,7 +4109,8 @@ create_tablefuncscan_plan(PlannerInfo *root, Path *best_path,
 
 	copy_generic_path_info(&scan_plan->scan.plan, best_path);
 
-	scan_plan->scan.plan.workmem_id = add_workmem(root->glob);
+	scan_plan->scan.plan.workmem_id =
+		add_workmem(root->glob, normalize_work_kb(best_path->workmem));
 
 	return scan_plan;
 }
@@ -4083,7 +4250,8 @@ create_ctescan_plan(PlannerInfo *root, Path *best_path,
 
 	copy_generic_path_info(&scan_plan->scan.plan, best_path);
 
-	scan_plan->scan.plan.workmem_id = add_workmem(root->glob);
+	scan_plan->scan.plan.workmem_id =
+		add_workmem(root->glob, normalize_work_kb(best_path->workmem));
 
 	return scan_plan;
 }
@@ -4786,8 +4954,10 @@ create_mergejoin_plan(PlannerInfo *root,
 		 */
 		copy_plan_costsize(matplan, inner_plan);
 		matplan->total_cost += cpu_operator_cost * matplan->plan_rows;
-
-		matplan->workmem_id = add_workmem(root->glob);
+		matplan->workmem_id =
+			add_workmem(root->glob,
+						normalize_work_bytes(relation_byte_size(matplan->plan_rows,
+																matplan->plan_width)));
 
 		inner_plan = matplan;
 	}
@@ -5135,7 +5305,9 @@ create_hashjoin_plan(PlannerInfo *root,
 	copy_generic_path_info(&join_plan->join.plan, &best_path->jpath.path);
 
 	/* Assign workmem to the Hash subnode, not its parent HashJoin node. */
-	hash_plan->plan.workmem_id = add_hash_workmem(root->glob);
+	hash_plan->plan.workmem_id =
+		add_hash_workmem(root->glob,
+						 normalize_work_kb(best_path->jpath.path.workmem));
 
 	return join_plan;
 }
@@ -5690,7 +5862,8 @@ label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples)
 	plan->plan.parallel_aware = false;
 	plan->plan.parallel_safe = lefttree->parallel_safe;
 
-	plan->plan.workmem_id = add_workmem(root->glob);
+	plan->plan.workmem_id =
+		add_workmem(root->glob, normalize_work_kb(sort_path.workmem));
 }
 
 /*
@@ -5723,7 +5896,8 @@ label_incrementalsort_with_costsize(PlannerInfo *root, IncrementalSort *plan,
 	plan->sort.plan.parallel_aware = false;
 	plan->sort.plan.parallel_safe = lefttree->parallel_safe;
 
-	plan->sort.plan.workmem_id = add_workmem(root->glob);
+	plan->sort.plan.workmem_id =
+		add_workmem(root->glob, normalize_work_kb(sort_path.workmem));
 }
 
 /*
@@ -6821,7 +6995,8 @@ materialize_finished_plan(PlannerGlobal *glob, Plan *subplan)
 	matplan->parallel_aware = false;
 	matplan->parallel_safe = subplan->parallel_safe;
 
-	matplan->workmem_id = add_workmem(glob);
+	matplan->workmem_id =
+		add_workmem(glob, normalize_work_kb(matpath.workmem));
 
 	return matplan;
 }
@@ -7590,12 +7765,22 @@ is_projection_capable_plan(Plan *plan)
 }
 
 static int
-add_workmem_internal(PlannerGlobal *glob, WorkMemCategory category)
+add_workmem_internal(PlannerGlobal *glob, WorkMemCategory category,
+					 int estimate, int count)
 {
+	if (estimate == 0 || count == 0)
+		return 0;
+
 	glob->workMemCategories = lappend_int(glob->workMemCategories, category);
+	glob->workMemEstimates = lappend_int(glob->workMemEstimates, estimate);
+	glob->workMemCounts = lappend_int(glob->workMemCounts, count);
 	/* the executor will fill this in later: */
 	glob->workMemLimits = lappend_int(glob->workMemLimits, 0);
 
+	Assert(list_length(glob->workMemCategories) ==
+		   list_length(glob->workMemEstimates));
+	Assert(list_length(glob->workMemCategories) ==
+		   list_length(glob->workMemCounts));
 	Assert(list_length(glob->workMemCategories) ==
 		   list_length(glob->workMemLimits));
 
@@ -7608,10 +7793,10 @@ add_workmem_internal(PlannerGlobal *glob, WorkMemCategory category)
  *
  * This data structure will have its working-memory limit set to work_mem.
  */
-int
-add_workmem(PlannerGlobal *glob)
+static int
+add_workmem(PlannerGlobal *glob, int estimate)
 {
-	return add_workmem_internal(glob, WORKMEM_NORMAL);
+	return add_workmem_internal(glob, WORKMEM_NORMAL, estimate, 1);
 }
 
 /*
@@ -7622,7 +7807,13 @@ add_workmem(PlannerGlobal *glob)
  * hash_mem_multiplier.
  */
 int
-add_hash_workmem(PlannerGlobal *glob)
+add_hash_workmem(PlannerGlobal *glob, int estimate)
 {
-	return add_workmem_internal(glob, WORKMEM_HASH);
+	return add_workmem_internal(glob, WORKMEM_HASH, estimate, 1);
+}
+
+static int
+add_workmems(PlannerGlobal *glob, int estimate, int count)
+{
+	return add_workmem_internal(glob, WORKMEM_NORMAL, estimate, count);
 }
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index a431808be96..007e298565a 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -585,6 +585,8 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
 	result->stmt_len = parse->stmt_len;
 
 	result->workMemCategories = glob->workMemCategories;
+	result->workMemEstimates = glob->workMemEstimates;
+	result->workMemCounts = glob->workMemCounts;
 	result->workMemLimits = glob->workMemLimits;
 
 	result->jitFlags = PGJIT_NONE;
diff --git a/src/backend/optimizer/prep/prepagg.c b/src/backend/optimizer/prep/prepagg.c
index c0a2f04a8c3..0d0fb5cf8ed 100644
--- a/src/backend/optimizer/prep/prepagg.c
+++ b/src/backend/optimizer/prep/prepagg.c
@@ -691,5 +691,17 @@ get_agg_clause_costs(PlannerInfo *root, AggSplit aggsplit, AggClauseCosts *costs
 			costs->finalCost.startup += argcosts.startup;
 			costs->finalCost.per_tuple += argcosts.per_tuple;
 		}
+
+		/*
+		 * How many aggrefs need to sort their input? (Each such aggref gets
+		 * its own sort buffer. The logic here MUST match the corresponding
+		 * logic in function build_pertrans_for_aggref().)
+		 */
+		if (!AGGKIND_IS_ORDERED_SET(aggref->aggkind) &&
+			!aggref->aggpresorted &&
+			(aggref->aggdistinct || aggref->aggorder))
+		{
+			++costs->numSortBuffers;
+		}
 	}
 }
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index a4c5867cdcb..070b86563b1 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1737,6 +1737,13 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 	pathnode->path.total_cost = subpath->total_cost + cpu_tuple_cost;
 	pathnode->path.rows = subpath->rows;
 
+	/*
+	 * For now, set workmem to the hash memory limit. Function
+	 * cost_memoize_rescan() will adjust this field, just as it does for the
+	 * "est_entries" field.
+	 */
+	pathnode->path.workmem = normalize_work_bytes(get_hash_memory_limit());
+
 	return pathnode;
 }
 
@@ -1965,12 +1972,14 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 		pathnode->path.disabled_nodes = agg_path.disabled_nodes;
 		pathnode->path.startup_cost = agg_path.startup_cost;
 		pathnode->path.total_cost = agg_path.total_cost;
+		pathnode->path.workmem = agg_path.workmem;
 	}
 	else
 	{
 		pathnode->path.disabled_nodes = sort_path.disabled_nodes;
 		pathnode->path.startup_cost = sort_path.startup_cost;
 		pathnode->path.total_cost = sort_path.total_cost;
+		pathnode->path.workmem = sort_path.workmem;
 	}
 
 	rel->cheapest_unique_path = (Path *) pathnode;
@@ -2317,6 +2326,13 @@ create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel,
 	/* Cost is the same as for a regular CTE scan */
 	cost_ctescan(pathnode, root, rel, pathnode->param_info);
 
+	/*
+	 * But working memory used is 0, since the worktable scan doesn't create a
+	 * tuplestore -- it just reuses a tuplestore already created by a
+	 * recursive union.
+	 */
+	pathnode->workmem = 0;
+
 	return pathnode;
 }
 
@@ -3314,6 +3330,7 @@ create_agg_path(PlannerInfo *root,
 
 	pathnode->aggstrategy = aggstrategy;
 	pathnode->aggsplit = aggsplit;
+	pathnode->numSortBuffers = aggcosts ? aggcosts->numSortBuffers : 0;
 	pathnode->numGroups = numGroups;
 	pathnode->transitionSpace = aggcosts ? aggcosts->transitionSpace : 0;
 	pathnode->groupClause = groupClause;
@@ -3364,6 +3381,8 @@ create_groupingsets_path(PlannerInfo *root,
 	ListCell   *lc;
 	bool		is_first = true;
 	bool		is_first_sort = true;
+	int			num_sort_nodes = 0;
+	double		max_sort_workmem = 0.0;
 
 	/* The topmost generated Plan node will be an Agg */
 	pathnode->path.pathtype = T_Agg;
@@ -3400,6 +3419,7 @@ create_groupingsets_path(PlannerInfo *root,
 		pathnode->path.pathkeys = NIL;
 
 	pathnode->aggstrategy = aggstrategy;
+	pathnode->numSortBuffers = agg_costs ? agg_costs->numSortBuffers : 0;
 	pathnode->rollups = rollups;
 	pathnode->qual = having_qual;
 	pathnode->transitionSpace = agg_costs ? agg_costs->transitionSpace : 0;
@@ -3463,6 +3483,8 @@ create_groupingsets_path(PlannerInfo *root,
 						 subpath->pathtarget->width);
 				if (!rollup->is_hashed)
 					is_first_sort = false;
+
+				pathnode->path.workmem += agg_path.workmem;
 			}
 			else
 			{
@@ -3475,6 +3497,12 @@ create_groupingsets_path(PlannerInfo *root,
 						  work_mem,
 						  -1.0);
 
+				/*
+				 * We costed sorting the previous "sort" rollup's "sort_out"
+				 * buffer. How much memory did it need?
+				 */
+				max_sort_workmem = Max(max_sort_workmem, sort_path.workmem);
+
 				/* Account for cost of aggregation */
 
 				cost_agg(&agg_path, root,
@@ -3488,12 +3516,17 @@ create_groupingsets_path(PlannerInfo *root,
 						 sort_path.total_cost,
 						 sort_path.rows,
 						 subpath->pathtarget->width);
+
+				pathnode->path.workmem += agg_path.workmem;
 			}
 
 			pathnode->path.disabled_nodes += agg_path.disabled_nodes;
 			pathnode->path.total_cost += agg_path.total_cost;
 			pathnode->path.rows += agg_path.rows;
 		}
+
+		if (!rollup->is_hashed)
+			++num_sort_nodes;
 	}
 
 	/* add tlist eval cost for each output row */
@@ -3501,6 +3534,17 @@ create_groupingsets_path(PlannerInfo *root,
 	pathnode->path.total_cost += target->cost.startup +
 		target->cost.per_tuple * pathnode->path.rows;
 
+	/*
+	 * Include working memory needed to sort agg output. If there's only 1
+	 * sort rollup, then we don't need any memory. If there are 2 sort
+	 * rollups, we need enough memory for 1 sort buffer. If there are >= 3
+	 * sort rollups, we need only 2 sort buffers, since we're
+	 * double-buffering.
+	 */
+	pathnode->path.workmem += num_sort_nodes > 2 ?
+		max_sort_workmem * 2.0 :
+		max_sort_workmem;
+
 	return pathnode;
 }
 
@@ -3650,7 +3694,8 @@ create_windowagg_path(PlannerInfo *root,
 				   subpath->disabled_nodes,
 				   subpath->startup_cost,
 				   subpath->total_cost,
-				   subpath->rows);
+				   subpath->rows,
+				   subpath->pathtarget->width);
 
 	/* add tlist eval cost for each output row */
 	pathnode->path.startup_cost += target->cost.startup;
@@ -3775,7 +3820,11 @@ create_setop_path(PlannerInfo *root,
 			MAXALIGN(SizeofMinimalTupleHeader);
 		if (hashentrysize * numGroups > get_hash_memory_limit())
 			pathnode->path.disabled_nodes++;
+
+		pathnode->path.workmem =
+			normalize_work_bytes(numGroups * hashentrysize);
 	}
+
 	pathnode->path.rows = outputRows;
 
 	return pathnode;
@@ -3826,7 +3875,7 @@ create_recursiveunion_path(PlannerInfo *root,
 	pathnode->wtParam = wtParam;
 	pathnode->numGroups = numGroups;
 
-	cost_recursive_union(&pathnode->path, leftpath, rightpath);
+	cost_recursive_union(pathnode, leftpath, rightpath);
 
 	return pathnode;
 }
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index e4e9e0d1de1..6cd9bffbee5 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -63,7 +63,8 @@ extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 									size_t *total_space_allowed,
 									int *numbuckets,
 									int *numbatches,
-									int *num_skew_mcvs);
+									int *num_skew_mcvs,
+									int *workmem);
 extern int	ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue);
 extern void ExecHashEstimate(HashState *node, ParallelContext *pcxt);
 extern void ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index d543011d92a..e15c37608d1 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1283,6 +1283,18 @@ typedef struct PlanState
 #define workMemField(node, field)   \
 	(workMemFieldFromId((node), field, ((PlanState *)(node))->plan->workmem_id))
 
+/* workmem estimate: */
+#define workMemEstimateFromId(node, id) \
+	(workMemFieldFromId(node, workMemEstimates, id))
+#define workMemEstimate(node) \
+	(workMemField(node, workMemEstimates))
+
+/* workmem count: */
+#define workMemCountFromId(node, id) \
+	(workMemFieldFromId(node, workMemCounts, id))
+#define workMemCount(node) \
+	(workMemField(node, workMemCounts))
+
 /* workmem limit: */
 #define workMemLimitFromId(node, id) \
 	(workMemFieldFromId(node, workMemLimits, id))
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 181437ac933..779a56ede1a 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -60,6 +60,7 @@ typedef struct AggClauseCosts
 	QualCost	transCost;		/* total per-input-row execution costs */
 	QualCost	finalCost;		/* total per-aggregated-row costs */
 	Size		transitionSpace;	/* space for pass-by-ref transition data */
+	int			numSortBuffers; /* # of required input-sort buffers */
 } AggClauseCosts;
 
 /*
@@ -188,9 +189,12 @@ typedef struct PlannerGlobal
 	 * needs working memory for a data structure maintains a "workmem_id"
 	 * index into the following lists (all kept in sync).
 	 */
-
 	/* - IntList (of WorkMemCategory): is this a Hash or "normal" limit? */
 	List	   *workMemCategories;
+	/* - IntList: estimate (in KB) of memory needed to avoid spilling */
+	List	   *workMemEstimates;
+	/* - IntList: how many data structures get a copy of this info */
+	List	   *workMemCounts;
 	/* - IntList: limit (in KB), after which data structure must spill */
 	List	   *workMemLimits;
 } PlannerGlobal;
@@ -1807,6 +1811,7 @@ typedef struct Path
 	int			disabled_nodes; /* count of disabled nodes */
 	Cost		startup_cost;	/* cost expended before fetching any tuples */
 	Cost		total_cost;		/* total cost (assuming all tuples fetched) */
+	Cost		workmem;		/* estimated work_mem (in KB) */
 
 	/* sort ordering of path's output; a List of PathKey nodes; see above */
 	List	   *pathkeys;
@@ -2411,6 +2416,7 @@ typedef struct AggPath
 	Path	   *subpath;		/* path representing input source */
 	AggStrategy aggstrategy;	/* basic strategy, see nodes.h */
 	AggSplit	aggsplit;		/* agg-splitting mode, see nodes.h */
+	int			numSortBuffers; /* number of inputs that require sorting */
 	Cardinality numGroups;		/* estimated number of groups in input */
 	uint64		transitionSpace;	/* for pass-by-ref transition data */
 	List	   *groupClause;	/* a list of SortGroupClause's */
@@ -2452,6 +2458,7 @@ typedef struct GroupingSetsPath
 	Path		path;
 	Path	   *subpath;		/* path representing input source */
 	AggStrategy aggstrategy;	/* basic strategy */
+	int			numSortBuffers; /* number of inputs that require sorting */
 	List	   *rollups;		/* list of RollupData */
 	List	   *qual;			/* quals (HAVING quals), if any */
 	uint64		transitionSpace;	/* for pass-by-ref transition data */
@@ -3495,6 +3502,7 @@ typedef struct JoinCostWorkspace
 
 	/* Fields below here should be treated as private to costsize.c */
 	Cost		run_cost;		/* non-startup cost components */
+	Cost		workmem;		/* estimated work_mem (in KB) */
 
 	/* private for cost_nestloop code */
 	Cost		inner_run_cost; /* also used by cost_mergejoin code */
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index ba8fdc2e6db..2134b15f95f 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -160,9 +160,12 @@ typedef struct PlannedStmt
 	 * needs working memory for a data structure maintains a "workmem_id"
 	 * index into the following lists (all kept in sync).
 	 */
-
 	/* - IntList (of WorkMemCategory): is this a Hash or "normal" limit? */
 	List	   *workMemCategories;
+	/* - IntList: estimate (in KB) of memory needed to avoid spilling */
+	List	   *workMemEstimates;
+	/* - IntList: how many data structures get a copy of this info */
+	List	   *workMemCounts;
 	/* - IntList: limit (in KB), after which data structure must spill */
 	List	   *workMemLimits;
 } PlannedStmt;
@@ -1191,6 +1194,8 @@ typedef struct Agg
 	Oid		   *grpOperators pg_node_attr(array_size(numCols));
 	Oid		   *grpCollations pg_node_attr(array_size(numCols));
 
+	/* number of inputs that require sorting */
+	int			numSorts;
 	/* 1-based id of workMem to use to sort inputs, or else zero */
 	int			sortWorkMemId;
 
diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h
index 99f795ceab5..d89a0f71a72 100644
--- a/src/include/nodes/tidbitmap.h
+++ b/src/include/nodes/tidbitmap.h
@@ -108,6 +108,7 @@ extern void tbm_end_shared_iterate(TBMSharedIterator *iterator);
 extern TBMSharedIterator *tbm_attach_shared_iterate(dsa_area *dsa,
 													dsa_pointer dp);
 extern int	tbm_calculate_entries(Size maxbytes);
+extern double tbm_calculate_bytes(double maxentries);
 
 extern TBMIterator tbm_begin_iterate(TIDBitmap *tbm,
 									 dsa_area *dsa, dsa_pointer dsp);
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index b523bcda8f3..ef80f6f9339 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -106,7 +106,7 @@ extern void cost_namedtuplestorescan(Path *path, PlannerInfo *root,
 									 RelOptInfo *baserel, ParamPathInfo *param_info);
 extern void cost_resultscan(Path *path, PlannerInfo *root,
 							RelOptInfo *baserel, ParamPathInfo *param_info);
-extern void cost_recursive_union(Path *runion, Path *nrterm, Path *rterm);
+extern void cost_recursive_union(RecursiveUnionPath *runion, Path *nrterm, Path *rterm);
 extern void cost_sort(Path *path, PlannerInfo *root,
 					  List *pathkeys, int input_disabled_nodes,
 					  Cost input_cost, double tuples, int width,
@@ -139,7 +139,7 @@ extern void cost_windowagg(Path *path, PlannerInfo *root,
 						   List *windowFuncs, WindowClause *winclause,
 						   int input_disabled_nodes,
 						   Cost input_startup_cost, Cost input_total_cost,
-						   double input_tuples);
+						   double input_tuples, int width);
 extern void cost_group(Path *path, PlannerInfo *root,
 					   int numGroupCols, double numGroups,
 					   List *quals,
@@ -218,9 +218,18 @@ extern void set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *re
 extern void set_result_size_estimates(PlannerInfo *root, RelOptInfo *rel);
 extern void set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel);
 extern PathTarget *set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target);
+extern double relation_byte_size(double tuples, int width);
 extern double compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel,
 								   Path *bitmapqual, double loop_count,
 								   Cost *cost_p, double *tuples_p);
 extern double compute_gather_rows(Path *path);
+extern int	compute_agg_input_workmem(double input_tuples, double input_width);
+extern int	compute_agg_output_workmem(PlannerInfo *root,
+									   AggStrategy aggstrategy,
+									   double numGroups, uint64 transitionSpace,
+									   double input_tuples, double input_width,
+									   bool cost_sort);
+extern int	normalize_work_kb(double nkb);
+extern int	normalize_work_bytes(double nbytes);
 
 #endif							/* COST_H */
diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h
index 8436136026b..21894adffcc 100644
--- a/src/include/optimizer/planmain.h
+++ b/src/include/optimizer/planmain.h
@@ -49,8 +49,7 @@ extern Plan *change_plan_targetlist(Plan *subplan, List *tlist,
 extern Plan *materialize_finished_plan(PlannerGlobal *glob, Plan *subplan);
 extern bool is_projection_capable_path(Path *path);
 extern bool is_projection_capable_plan(Plan *plan);
-extern int	add_workmem(PlannerGlobal *glob);
-extern int	add_hash_workmem(PlannerGlobal *glob);
+extern int	add_hash_workmem(PlannerGlobal *glob, int estimate);
 
 /* External use of these functions is deprecated: */
 extern Sort *make_sort_from_sortclauses(List *sortcls, Plan *lefttree);
-- 
2.39.5

