From cfb5d2b8f2bef75beb686605b388443959e165c1 Mon Sep 17 00:00:00 2001
From: Ewan Young <kdbase.hack@gmail.com>
Date: Thu, 11 Jun 2026 01:43:22 +0800
Subject: [PATCH v2] Discard ORDER BY and DISTINCT in an IN sub-select pulled
 up to a semijoin

When convert_ANY_sublink_to_join() turns an IN/ANY sublink into a semijoin,
the sub-select's ORDER BY and DISTINCT clauses become irrelevant: a semijoin
only depends on whether a matching row exists, not on the order of the inner
rows or whether they contain duplicates.  Discard those clauses once we have
committed to building the semijoin.

This mirrors what simplify_EXISTS_query() already does for EXISTS, and has two
benefits: the executor no longer needlessly sorts or de-duplicates the inner
side, and removing the clauses can make the sub-select "simple" so that
pull_up_subqueries() is able to flatten it into the parent query instead of
leaving a separate subquery (often with a redundant HashAggregate); see
is_simple_subquery().  Dropping DISTINCT loses no plan: a semijoin is
algebraically equivalent to an inner join over a unique-ified inner
(A SEMI B == A JOIN unique(B)), which is what JOIN_UNIQUE_INNER implements, so
the planner can re-derive the de-duplication for itself whenever it is cheaper.

We deliberately do not do this for a NOT IN that has been proven null-safe and
converted to an antijoin.  ORDER BY and DISTINCT are equally irrelevant to an
antijoin's result, but an antijoin has no comparable inner-join equivalence --
"no matching row exists" cannot be rephrased as an inner join over any
transform of the inner side -- which is exactly why there is no JOIN_UNIQUE
path for it (joinrels.c calls create_unique_paths() only for JOIN_SEMI).  A
dropped DISTINCT therefore cannot be recovered, and removing it would force the
full, possibly heavily duplicated, inner relation through the join (for example
by defeating a cheaper parallel de-duplication), so the sub-select is left
untouched for the antijoin case.

The clauses are also kept when the sub-select has LIMIT/OFFSET, because then
ORDER BY and DISTINCT do affect which rows are returned, and when it uses
DISTINCT ON, which selects particular rows in conjunction with ORDER BY.
---
 src/backend/optimizer/plan/subselect.c  |  27 +++
 src/test/regress/expected/subselect.out | 209 ++++++++++++++++++++++++
 src/test/regress/sql/subselect.sql      |  49 ++++++
 3 files changed, 285 insertions(+)

diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index 6aa8971c95d..46ae1532af4 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -1405,6 +1405,33 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink,
 	if (contain_volatile_functions(sublink->testexpr))
 		return NULL;
 
+	/*
+	 * For a semijoin we can discard the sub-select's ORDER BY and DISTINCT: the
+	 * join only cares whether a matching row exists, not about the order or
+	 * multiplicity of the inner rows.  This mirrors simplify_EXISTS_query(), and
+	 * may also let the sub-select be flattened by pull_up_subqueries() (see
+	 * is_simple_subquery()).  Dropping DISTINCT loses no plan, because a
+	 * semijoin is equivalent to an inner join over a unique-ified inner, so the
+	 * planner can re-derive the de-duplication itself when worthwhile
+	 * (JOIN_UNIQUE_INNER).
+	 *
+	 * We don't do this for an antijoin (NOT IN), though it would be correct:
+	 * there is no such inner-join equivalence, hence no JOIN_UNIQUE path
+	 * (see joinrels.c), so a dropped DISTINCT could not be recovered, forcing
+	 * the full, possibly heavily duplicated, inner relation through the join.
+	 *
+	 * LIMIT/OFFSET, and DISTINCT ON, make ORDER BY and DISTINCT significant, so
+	 * leave the sub-select alone in those cases too.
+	 */
+	if (!under_not &&
+		subselect->limitOffset == NULL &&
+		subselect->limitCount == NULL &&
+		!subselect->hasDistinctOn)
+	{
+		subselect->distinctClause = NIL;
+		subselect->sortClause = NIL;
+	}
+
 	/* Create a dummy ParseState for addRangeTableEntryForSubquery */
 	pstate = make_parsestate(NULL);
 
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out
index a3778c23c34..9374ec386df 100644
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -3785,3 +3785,212 @@ WHERE id NOT IN (SELECT id FROM notnull_notvalid_tab);
 (0 rows)
 
 ROLLBACK;
+-- ORDER BY and DISTINCT in an IN sublink are dropped: same plan as without them
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select distinct hundred from tenk2);
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Merge Join
+   Merge Cond: (t.unique1 = tenk2.hundred)
+   ->  Index Scan using tenk1_unique1 on tenk1 t
+   ->  Unique
+         ->  Index Only Scan using tenk2_hundred on tenk2
+(5 rows)
+
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select hundred from tenk2);
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Merge Join
+   Merge Cond: (t.unique1 = tenk2.hundred)
+   ->  Index Scan using tenk1_unique1 on tenk1 t
+   ->  Unique
+         ->  Index Only Scan using tenk2_hundred on tenk2
+(5 rows)
+
+explain (costs off)
+select * from tenk1 t
+where t.hundred in (select b.ten from tenk2 b where b.unique2 = t.unique1 order by b.ten);
+                           QUERY PLAN                           
+----------------------------------------------------------------
+ Hash Semi Join
+   Hash Cond: ((t.unique1 = b.unique2) AND (t.hundred = b.ten))
+   ->  Seq Scan on tenk1 t
+   ->  Hash
+         ->  Seq Scan on tenk2 b
+(5 rows)
+
+-- but kept with LIMIT, and for DISTINCT ON
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select distinct hundred from tenk2 limit 5);
+                           QUERY PLAN                           
+----------------------------------------------------------------
+ Nested Loop
+   ->  Limit
+         ->  Unique
+               ->  Index Only Scan using tenk2_hundred on tenk2
+   ->  Index Scan using tenk1_unique1 on tenk1 t
+         Index Cond: (unique1 = tenk2.hundred)
+(6 rows)
+
+explain (costs off)
+select * from tenk1 t
+where t.unique1 in (select distinct on (thousand) hundred from tenk2 order by thousand);
+                     QUERY PLAN                     
+----------------------------------------------------
+ Hash Join
+   Hash Cond: (t.unique1 = tenk2.hundred)
+   ->  Seq Scan on tenk1 t
+   ->  Hash
+         ->  HashAggregate
+               Group Key: tenk2.hundred
+               ->  Unique
+                     ->  Sort
+                           Sort Key: tenk2.thousand
+                           ->  Seq Scan on tenk2
+(10 rows)
+
+select count(*) from tenk1 t where t.unique1 in (select distinct hundred from tenk2);
+ count 
+-------
+   100
+(1 row)
+
+-- safe with aggregates and window functions (sub-select is not flattened)
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select distinct count(*) from tenk2 group by hundred);
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ Hash Semi Join
+   Hash Cond: (t.unique1 = unnamed_subquery.count)
+   ->  Seq Scan on tenk1 t
+   ->  Hash
+         ->  Subquery Scan on unnamed_subquery
+               ->  GroupAggregate
+                     Group Key: tenk2.hundred
+                     ->  Index Only Scan using tenk2_hundred on tenk2
+(8 rows)
+
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select count(*) from tenk2 group by hundred);
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ Hash Semi Join
+   Hash Cond: (t.unique1 = unnamed_subquery.count)
+   ->  Seq Scan on tenk1 t
+   ->  Hash
+         ->  Subquery Scan on unnamed_subquery
+               ->  GroupAggregate
+                     Group Key: tenk2.hundred
+                     ->  Index Only Scan using tenk2_hundred on tenk2
+(8 rows)
+
+select count(*) from tenk1 t where t.unique1 in (select distinct count(*) from tenk2 group by hundred);
+ count 
+-------
+     1
+(1 row)
+
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select distinct rank() over (order by hundred) from tenk2);
+                                     QUERY PLAN                                      
+-------------------------------------------------------------------------------------
+ Hash Join
+   Hash Cond: (t.unique1 = (rank() OVER w1))
+   ->  Seq Scan on tenk1 t
+   ->  Hash
+         ->  HashAggregate
+               Group Key: rank() OVER w1
+               ->  WindowAgg
+                     Window: w1 AS (ORDER BY tenk2.hundred ROWS UNBOUNDED PRECEDING)
+                     ->  Index Only Scan using tenk2_hundred on tenk2
+(9 rows)
+
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select rank() over (order by hundred) from tenk2);
+                                     QUERY PLAN                                      
+-------------------------------------------------------------------------------------
+ Hash Join
+   Hash Cond: (t.unique1 = (rank() OVER w1))
+   ->  Seq Scan on tenk1 t
+   ->  Hash
+         ->  HashAggregate
+               Group Key: rank() OVER w1
+               ->  WindowAgg
+                     Window: w1 AS (ORDER BY tenk2.hundred ROWS UNBOUNDED PRECEDING)
+                     ->  Index Only Scan using tenk2_hundred on tenk2
+(9 rows)
+
+select count(*) from tenk1 t where t.unique1 in (select distinct rank() over (order by hundred) from tenk2);
+ count 
+-------
+   100
+(1 row)
+
+-- correlated IN with DISTINCT: dropping it allows pull-up to a semi join
+explain (costs off)
+select * from tenk1 t
+where t.hundred in (select distinct b.ten from tenk2 b where b.unique2 = t.unique1);
+                           QUERY PLAN                           
+----------------------------------------------------------------
+ Hash Semi Join
+   Hash Cond: ((t.unique1 = b.unique2) AND (t.hundred = b.ten))
+   ->  Seq Scan on tenk1 t
+   ->  Hash
+         ->  Seq Scan on tenk2 b
+(5 rows)
+
+select count(*) from tenk1 t
+where t.hundred in (select distinct b.ten from tenk2 b where b.unique2 = t.unique1);
+ count 
+-------
+   107
+(1 row)
+
+-- NOT IN/anti join: clauses are kept (no JOIN_UNIQUE path to recover them)
+create temp table anti_o (a int not null);
+create temp table anti_i (k int not null);
+insert into anti_o select unique1 from tenk1;
+insert into anti_i select hundred from tenk2;
+analyze anti_o;
+analyze anti_i;
+explain (costs off) select * from anti_o where a not in (select distinct k from anti_i);
+              QUERY PLAN              
+--------------------------------------
+ Hash Anti Join
+   Hash Cond: (anti_o.a = anti_i.k)
+   ->  Seq Scan on anti_o
+   ->  Hash
+         ->  HashAggregate
+               Group Key: anti_i.k
+               ->  Seq Scan on anti_i
+(7 rows)
+
+explain (costs off) select * from anti_o where a not in (select k from anti_i);
+             QUERY PLAN             
+------------------------------------
+ Hash Anti Join
+   Hash Cond: (anti_o.a = anti_i.k)
+   ->  Seq Scan on anti_o
+   ->  Hash
+         ->  Seq Scan on anti_i
+(5 rows)
+
+explain (costs off) select * from anti_o where a not in (select k from anti_i order by k);
+              QUERY PLAN              
+--------------------------------------
+ Hash Anti Join
+   Hash Cond: (anti_o.a = anti_i.k)
+   ->  Seq Scan on anti_o
+   ->  Hash
+         ->  Sort
+               Sort Key: anti_i.k
+               ->  Seq Scan on anti_i
+(7 rows)
+
+select count(*) from anti_o where a not in (select distinct k from anti_i);
+ count 
+-------
+  9900
+(1 row)
+
diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql
index 1a02c3f86c0..5b8f4914cbf 100644
--- a/src/test/regress/sql/subselect.sql
+++ b/src/test/regress/sql/subselect.sql
@@ -1647,3 +1647,52 @@ SELECT * FROM not_null_tab
 WHERE id NOT IN (SELECT id FROM notnull_notvalid_tab);
 
 ROLLBACK;
+
+-- ORDER BY and DISTINCT in an IN sublink are dropped: same plan as without them
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select distinct hundred from tenk2);
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select hundred from tenk2);
+explain (costs off)
+select * from tenk1 t
+where t.hundred in (select b.ten from tenk2 b where b.unique2 = t.unique1 order by b.ten);
+
+-- but kept with LIMIT, and for DISTINCT ON
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select distinct hundred from tenk2 limit 5);
+explain (costs off)
+select * from tenk1 t
+where t.unique1 in (select distinct on (thousand) hundred from tenk2 order by thousand);
+
+select count(*) from tenk1 t where t.unique1 in (select distinct hundred from tenk2);
+
+-- safe with aggregates and window functions (sub-select is not flattened)
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select distinct count(*) from tenk2 group by hundred);
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select count(*) from tenk2 group by hundred);
+select count(*) from tenk1 t where t.unique1 in (select distinct count(*) from tenk2 group by hundred);
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select distinct rank() over (order by hundred) from tenk2);
+explain (costs off)
+select * from tenk1 t where t.unique1 in (select rank() over (order by hundred) from tenk2);
+select count(*) from tenk1 t where t.unique1 in (select distinct rank() over (order by hundred) from tenk2);
+
+-- correlated IN with DISTINCT: dropping it allows pull-up to a semi join
+explain (costs off)
+select * from tenk1 t
+where t.hundred in (select distinct b.ten from tenk2 b where b.unique2 = t.unique1);
+select count(*) from tenk1 t
+where t.hundred in (select distinct b.ten from tenk2 b where b.unique2 = t.unique1);
+
+-- NOT IN/anti join: clauses are kept (no JOIN_UNIQUE path to recover them)
+create temp table anti_o (a int not null);
+create temp table anti_i (k int not null);
+insert into anti_o select unique1 from tenk1;
+insert into anti_i select hundred from tenk2;
+analyze anti_o;
+analyze anti_i;
+explain (costs off) select * from anti_o where a not in (select distinct k from anti_i);
+explain (costs off) select * from anti_o where a not in (select k from anti_i);
+explain (costs off) select * from anti_o where a not in (select k from anti_i order by k);
+select count(*) from anti_o where a not in (select distinct k from anti_i);
-- 
2.47.3