From ca15dc437b2ca0d736c89dfbef79316c218b90d2 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Tue, 19 Nov 2024 20:58:34 -0500
Subject: [PATCH v2 5/5] Teach generate_nonunion_paths to consider pre-sorted
 child paths.

Much of this patch is just re-ordering existing code so that we can
compute the desired pathkeys before we call build_setop_child_paths.

I don't think we need any new test cases: the existing cases that
change plan are enough to show that this works.

One test case in union.sql would switch from HashSetOp Except to
sorted SetOp Except if we left it alone, since the cheapest inputs
are already sorted.  But the point of that set of tests is to
exercise HashSetOp, so swing a bigger hammer.  (The same query is
re-tested with sorting further down, so this seems fine.)
---
 src/backend/optimizer/prep/prepunion.c | 144 ++++++++++++++++---------
 src/test/regress/expected/union.out    |  37 +++----
 src/test/regress/sql/union.sql         |   5 +
 3 files changed, 115 insertions(+), 71 deletions(-)

diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index e92a088334..b3a26738d0 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -1010,6 +1010,7 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 	bool		lpath_trivial_tlist,
 				rpath_trivial_tlist,
 				result_trivial_tlist;
+	List	   *nonunion_pathkeys = NIL;
 	double		dLeftGroups,
 				dRightGroups,
 				dNumGroups,
@@ -1030,11 +1031,6 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 								  refnames_tlist,
 								  &lpath_tlist,
 								  &lpath_trivial_tlist);
-	if (lrel->rtekind == RTE_SUBQUERY)
-		build_setop_child_paths(root, lrel, lpath_trivial_tlist, lpath_tlist,
-								NIL, &dLeftGroups);
-	else
-		dLeftGroups = lrel->rows;
 
 	rrel = recurse_set_operations(op->rarg, root,
 								  op,
@@ -1042,9 +1038,57 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 								  refnames_tlist,
 								  &rpath_tlist,
 								  &rpath_trivial_tlist);
+
+	/*
+	 * Generate tlist for SetOp plan node.
+	 *
+	 * The tlist for a SetOp plan isn't important so far as the SetOp is
+	 * concerned, but we must make it look real anyway for the benefit of the
+	 * next plan level up.
+	 */
+	tlist = generate_setop_tlist(op->colTypes, op->colCollations,
+								 0, false, lpath_tlist, refnames_tlist,
+								 &result_trivial_tlist);
+
+	/* We should not have needed any type coercions in the tlist */
+	Assert(result_trivial_tlist);
+
+	*pTargetList = tlist;
+
+	/* Identify the grouping semantics */
+	groupList = generate_setop_grouplist(op, tlist);
+
+	/* Check whether the operators support sorting or hashing */
+	can_sort = grouping_is_sortable(groupList);
+	can_hash = grouping_is_hashable(groupList);
+	if (!can_sort && !can_hash)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+		/* translator: %s is INTERSECT or EXCEPT */
+				 errmsg("could not implement %s",
+						(op->op == SETOP_INTERSECT) ? "INTERSECT" : "EXCEPT"),
+				 errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
+
+	if (can_sort)
+	{
+		/* Determine the pathkeys for sorting by the whole target list */
+		nonunion_pathkeys = make_pathkeys_for_sortclauses(root, groupList,
+														  tlist);
+
+		root->query_pathkeys = nonunion_pathkeys;
+	}
+
+	/*
+	 * Now that we've got all that info, we can build the child paths.
+	 */
+	if (lrel->rtekind == RTE_SUBQUERY)
+		build_setop_child_paths(root, lrel, lpath_trivial_tlist, lpath_tlist,
+								nonunion_pathkeys, &dLeftGroups);
+	else
+		dLeftGroups = lrel->rows;
 	if (rrel->rtekind == RTE_SUBQUERY)
 		build_setop_child_paths(root, rrel, rpath_trivial_tlist, rpath_tlist,
-								NIL, &dRightGroups);
+								nonunion_pathkeys, &dRightGroups);
 	else
 		dRightGroups = rrel->rows;
 
@@ -1080,30 +1124,11 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 	lpath = lrel->cheapest_total_path;
 	rpath = rrel->cheapest_total_path;
 
-	/*
-	 * Generate tlist for SetOp plan node.
-	 *
-	 * The tlist for a SetOp plan isn't important so far as the SetOp is
-	 * concerned, but we must make it look real anyway for the benefit of the
-	 * next plan level up.
-	 */
-	tlist = generate_setop_tlist(op->colTypes, op->colCollations,
-								 0, false, lpath_tlist, refnames_tlist,
-								 &result_trivial_tlist);
-
-	/* We should not have needed any type coercions in the tlist */
-	Assert(result_trivial_tlist);
-
-	*pTargetList = tlist;
-
 	/* Build result relation. */
 	result_rel = fetch_upper_rel(root, UPPERREL_SETOP,
 								 bms_union(lrel->relids, rrel->relids));
 	result_rel->reltarget = create_pathtarget(root, tlist);
 
-	/* Identify the grouping semantics */
-	groupList = generate_setop_grouplist(op, tlist);
-
 	/*
 	 * Estimate number of distinct groups that we'll need hashtable entries
 	 * for; this is the size of the left-hand input for EXCEPT, or the smaller
@@ -1139,17 +1164,6 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 			break;
 	}
 
-	/* Check whether the operators support sorting or hashing */
-	can_sort = grouping_is_sortable(groupList);
-	can_hash = grouping_is_hashable(groupList);
-	if (!can_sort && !can_hash)
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-		/* translator: %s is INTERSECT or EXCEPT */
-				 errmsg("could not implement %s",
-						(op->op == SETOP_INTERSECT) ? "INTERSECT" : "EXCEPT"),
-				 errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
-
 	/*
 	 * If we can hash, that just requires a SetOp atop the cheapest inputs.
 	 */
@@ -1189,35 +1203,63 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 	}
 
 	/*
-	 * If we can sort, sort the inputs if needed before adding SetOp.
+	 * If we can sort, generate the cheapest sorted input paths, and add a
+	 * SetOp atop those.
 	 */
 	if (can_sort)
 	{
 		List	   *pathkeys;
+		Path	   *slpath,
+				   *srpath;
 
+		/* First the left input ... */
 		pathkeys = make_pathkeys_for_sortclauses(root,
 												 groupList,
 												 lpath_tlist);
-		if (!pathkeys_contained_in(pathkeys, lpath->pathkeys))
-			lpath = (Path *) create_sort_path(root,
-											  lpath->parent,
-											  lpath,
-											  pathkeys,
-											  -1.0);
+		if (pathkeys_contained_in(pathkeys, lpath->pathkeys))
+			slpath = lpath;		/* cheapest path is already sorted */
+		else
+		{
+			slpath = get_cheapest_path_for_pathkeys(lrel->pathlist,
+													nonunion_pathkeys,
+													NULL,
+													TOTAL_COST,
+													false);
+			/* Subquery failed to produce any presorted paths? */
+			if (slpath == NULL)
+				slpath = (Path *) create_sort_path(root,
+												   lpath->parent,
+												   lpath,
+												   pathkeys,
+												   -1.0);
+		}
+
+		/* and now the same for the right. */
 		pathkeys = make_pathkeys_for_sortclauses(root,
 												 groupList,
 												 rpath_tlist);
-		if (!pathkeys_contained_in(pathkeys, rpath->pathkeys))
-			rpath = (Path *) create_sort_path(root,
-											  rpath->parent,
-											  rpath,
-											  pathkeys,
-											  -1.0);
+		if (pathkeys_contained_in(pathkeys, rpath->pathkeys))
+			srpath = rpath;		/* cheapest path is already sorted */
+		else
+		{
+			srpath = get_cheapest_path_for_pathkeys(rrel->pathlist,
+													nonunion_pathkeys,
+													NULL,
+													TOTAL_COST,
+													false);
+			/* Subquery failed to produce any presorted paths? */
+			if (srpath == NULL)
+				srpath = (Path *) create_sort_path(root,
+												   rpath->parent,
+												   rpath,
+												   pathkeys,
+												   -1.0);
+		}
 
 		path = (Path *) create_setop_path(root,
 										  result_rel,
-										  lpath,
-										  rpath,
+										  slpath,
+										  srpath,
 										  cmd,
 										  SETOP_SORTED,
 										  groupList,
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
index 3e1adbd992..aa73edeac5 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -385,13 +385,15 @@ select count(*) from
   5000
 (1 row)
 
+-- this query will prefer a sorted setop unless we force it.
+set enable_indexscan to off;
 explain (costs off)
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
-                         QUERY PLAN                         
-------------------------------------------------------------
+           QUERY PLAN            
+---------------------------------
  HashSetOp Except
-   ->  Index Only Scan using tenk1_unique1 on tenk1
-   ->  Index Only Scan using tenk1_unique2 on tenk1 tenk1_1
+   ->  Seq Scan on tenk1
+   ->  Seq Scan on tenk1 tenk1_1
          Filter: (unique2 <> 10)
 (4 rows)
 
@@ -401,6 +403,7 @@ select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
       10
 (1 row)
 
+reset enable_indexscan;
 set enable_hashagg to off;
 explain (costs off)
 select count(*) from
@@ -427,17 +430,15 @@ select count(*) from
 explain (costs off)
 select count(*) from
   ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss;
-                               QUERY PLAN                               
-------------------------------------------------------------------------
+                            QUERY PLAN                            
+------------------------------------------------------------------
  Aggregate
    ->  SetOp Intersect
          ->  Sort
                Sort Key: tenk1.fivethous
                ->  Seq Scan on tenk1
-         ->  Sort
-               Sort Key: tenk1_1.unique1
-               ->  Index Only Scan using tenk1_unique1 on tenk1 tenk1_1
-(8 rows)
+         ->  Index Only Scan using tenk1_unique1 on tenk1 tenk1_1
+(6 rows)
 
 select count(*) from
   ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss;
@@ -448,17 +449,13 @@ select count(*) from
 
 explain (costs off)
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
-                            QUERY PLAN                            
-------------------------------------------------------------------
+                         QUERY PLAN                         
+------------------------------------------------------------
  SetOp Except
-   ->  Sort
-         Sort Key: tenk1.unique1
-         ->  Index Only Scan using tenk1_unique1 on tenk1
-   ->  Sort
-         Sort Key: tenk1_1.unique2
-         ->  Index Only Scan using tenk1_unique2 on tenk1 tenk1_1
-               Filter: (unique2 <> 10)
-(8 rows)
+   ->  Index Only Scan using tenk1_unique1 on tenk1
+   ->  Index Only Scan using tenk1_unique2 on tenk1 tenk1_1
+         Filter: (unique2 <> 10)
+(4 rows)
 
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
  unique1 
diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql
index 10030ec1c5..0be2911b0b 100644
--- a/src/test/regress/sql/union.sql
+++ b/src/test/regress/sql/union.sql
@@ -134,10 +134,15 @@ select count(*) from
 select count(*) from
   ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss;
 
+-- this query will prefer a sorted setop unless we force it.
+set enable_indexscan to off;
+
 explain (costs off)
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
 
+reset enable_indexscan;
+
 set enable_hashagg to off;
 
 explain (costs off)
-- 
2.43.5

