From e77dd86adbb372329743143b8c44e59ef06fbf0b Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdokimov@tantorlabs.com>
Date: Mon, 8 Jun 2026 14:10:03 +0300
Subject: [PATCH v1] Remove redundant DISTINCT when GROUP BY guarantees
 uniqueness

When a query contains both SELECT DISTINCT and GROUP BY, the DISTINCT
step is redundant if GROUP BY already guarantees that every output row
is unique. This is the case when every GROUP BY key appears in the
DISTINCT clause: since each group produces exactly one output row, no
two rows can be identical on all DISTINCT keys.

The optimization is skipped when GROUPING SETS are present (they can
produce extra NULL-filled rows across sets, making duplicates possible)
or when DISTINCT ON is used (different semantics).
---
 src/backend/optimizer/plan/planner.c          |  56 +++++++-
 src/test/regress/expected/select_distinct.out | 128 ++++++++++++++++++
 src/test/regress/sql/select_distinct.sql      |  54 ++++++++
 3 files changed, 237 insertions(+), 1 deletion(-)

diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index f4689e7c9f8..83373cb2fbe 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -1744,6 +1744,57 @@ preprocess_phv_expression(PlannerInfo *root, Expr *expr)
 	return (Expr *) preprocess_expression(root, (Node *) expr, EXPRKIND_PHV);
 }
 
+/*
+ * distinct_redundant_by_groupby
+ *
+ * Returns true if SELECT DISTINCT is redundant because GROUP BY already
+ * guarantees uniqueness of every output row.
+ *
+ * This is the case when:
+ *  - there is a non-empty GROUP BY (no GROUPING SETS, which can introduce
+ *    NULLs that create duplicates across grouping sets),
+ *  - it is plain DISTINCT, not DISTINCT ON (different semantics),
+ *  - the targetlist has no set-returning functions (they can return
+ *    duplicate rows despite the grouping done before tlist evaluation), and
+ *  - every GROUP BY key appears in the DISTINCT clause.
+ */
+static bool
+distinct_redundant_by_groupby(Query *parse)
+{
+	ListCell   *lc;
+	Bitmapset  *distinct_refs = NULL;
+
+	Assert(parse->distinctClause != NIL);
+
+	/* Need plain DISTINCT over a plain, non-empty GROUP BY, and no SRFs */
+	if (parse->groupClause == NIL ||
+		parse->groupingSets != NIL ||
+		parse->hasDistinctOn ||
+		parse->hasTargetSRFs)
+		return false;
+
+	/*
+	 * Every GROUP BY key must be a DISTINCT key, else two groups could yield
+	 * the same output row (e.g. SELECT DISTINCT 1 FROM t GROUP BY a).
+	 */
+	foreach(lc, parse->distinctClause)
+	{
+		SortGroupClause *sgc = lfirst_node(SortGroupClause, lc);
+
+		distinct_refs = bms_add_member(distinct_refs, sgc->tleSortGroupRef);
+	}
+
+	foreach(lc, parse->groupClause)
+	{
+		SortGroupClause *sgc = lfirst_node(SortGroupClause, lc);
+
+		if (!bms_is_member(sgc->tleSortGroupRef, distinct_refs))
+			return false;
+	}
+
+	return true;
+}
+
 /*--------------------
  * grouping_planner
  *	  Perform planning steps related to grouping, aggregation, etc.
@@ -2172,8 +2223,11 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
 		/*
 		 * If there is a DISTINCT clause, consider ways to implement that. We
 		 * build a new upperrel representing the output of this phase.
+		 *
+		 * Skip this step if DISTINCT is redundant because GROUP BY already
+		 * guarantees uniqueness of the output rows.
 		 */
-		if (parse->distinctClause)
+		if (parse->distinctClause && !distinct_redundant_by_groupby(parse))
 		{
 			current_rel = create_distinct_paths(root,
 												current_rel,
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index 379ba0bc9fa..875a876ee41 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -596,3 +596,131 @@ SELECT DISTINCT y, x FROM distinct_tbl ORDER BY y;
 
 RESET enable_hashagg;
 DROP TABLE distinct_tbl;
+--
+-- Test that DISTINCT is removed when GROUP BY already guarantees uniqueness
+-- of the output rows (distinct_redundant_by_groupby optimization).
+--
+CREATE TABLE distinct_groupby_tbl (a int, b int, c int);
+INSERT INTO distinct_groupby_tbl VALUES
+    (1, 1, 10), (1, 2, 20), (2, 1, 30), (2, 2, 40),
+    (1, 1, 50);
+ANALYZE distinct_groupby_tbl;
+-- GROUP BY (a, b) guarantees unique output rows, so DISTINCT is redundant.
+-- Expect a single aggregation node, no extra Unique node on top.
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a, b FROM distinct_groupby_tbl GROUP BY a, b;
+               QUERY PLAN               
+----------------------------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on distinct_groupby_tbl
+(3 rows)
+
+-- Verify correct results
+SELECT DISTINCT a, b FROM distinct_groupby_tbl GROUP BY a, b ORDER BY a, b;
+ a | b 
+---+---
+ 1 | 1
+ 1 | 2
+ 2 | 1
+ 2 | 2
+(4 rows)
+
+-- Different column order in DISTINCT vs GROUP BY -- still redundant.
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT b, a FROM distinct_groupby_tbl GROUP BY a, b;
+               QUERY PLAN               
+----------------------------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on distinct_groupby_tbl
+(3 rows)
+
+SELECT DISTINCT b, a FROM distinct_groupby_tbl GROUP BY a, b ORDER BY a, b;
+ b | a 
+---+---
+ 1 | 1
+ 2 | 1
+ 1 | 2
+ 2 | 2
+(4 rows)
+
+-- Aggregate in SELECT list does not prevent elimination.
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a, b, sum(c) FROM distinct_groupby_tbl GROUP BY a, b;
+               QUERY PLAN               
+----------------------------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on distinct_groupby_tbl
+(3 rows)
+
+SELECT DISTINCT a, b, sum(c) FROM distinct_groupby_tbl GROUP BY a, b ORDER BY a, b;
+ a | b | sum 
+---+---+-----
+ 1 | 1 |  60
+ 1 | 2 |  20
+ 2 | 1 |  30
+ 2 | 2 |  40
+(4 rows)
+
+-- DISTINCT is NOT redundant: GROUP BY key 'b' is absent from DISTINCT.
+-- Different (a, b) groups can produce the same 'a' output value.
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a FROM distinct_groupby_tbl GROUP BY a, b;
+                     QUERY PLAN                     
+----------------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: a
+         ->  HashAggregate
+               Group Key: a, b
+               ->  Seq Scan on distinct_groupby_tbl
+(6 rows)
+
+SELECT DISTINCT a FROM distinct_groupby_tbl GROUP BY a, b ORDER BY a;
+ a 
+---
+ 1
+ 2
+(2 rows)
+
+-- DISTINCT is NOT redundant: no GROUP BY clause.
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a, b FROM distinct_groupby_tbl;
+               QUERY PLAN               
+----------------------------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on distinct_groupby_tbl
+(3 rows)
+
+-- DISTINCT is NOT redundant: GROUPING SETS can introduce extra NULL rows,
+-- so two grouping sets could yield the same DISTINCT output.
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a FROM distinct_groupby_tbl
+GROUP BY GROUPING SETS ((a), ());
+                  QUERY PLAN                  
+----------------------------------------------
+ HashAggregate
+   Group Key: a
+   ->  MixedAggregate
+         Hash Key: a
+         Group Key: ()
+         ->  Seq Scan on distinct_groupby_tbl
+(6 rows)
+
+-- DISTINCT ON is unaffected (different semantics).
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a, b FROM distinct_groupby_tbl GROUP BY a, b ORDER BY a;
+                     QUERY PLAN                     
+----------------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: a
+         ->  HashAggregate
+               Group Key: a, b
+               ->  Seq Scan on distinct_groupby_tbl
+(6 rows)
+
+DROP TABLE distinct_groupby_tbl;
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index 50ac7dde396..e4b69900cda 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -274,3 +274,57 @@ SELECT DISTINCT y, x FROM distinct_tbl ORDER BY y;
 RESET enable_hashagg;
 
 DROP TABLE distinct_tbl;
+
+--
+-- Test that DISTINCT is removed when GROUP BY already guarantees uniqueness
+-- of the output rows (distinct_redundant_by_groupby optimization).
+--
+
+CREATE TABLE distinct_groupby_tbl (a int, b int, c int);
+INSERT INTO distinct_groupby_tbl VALUES
+    (1, 1, 10), (1, 2, 20), (2, 1, 30), (2, 2, 40),
+    (1, 1, 50);
+ANALYZE distinct_groupby_tbl;
+
+-- GROUP BY (a, b) guarantees unique output rows, so DISTINCT is redundant.
+-- Expect a single aggregation node, no extra Unique node on top.
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a, b FROM distinct_groupby_tbl GROUP BY a, b;
+
+-- Verify correct results
+SELECT DISTINCT a, b FROM distinct_groupby_tbl GROUP BY a, b ORDER BY a, b;
+
+-- Different column order in DISTINCT vs GROUP BY -- still redundant.
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT b, a FROM distinct_groupby_tbl GROUP BY a, b;
+
+SELECT DISTINCT b, a FROM distinct_groupby_tbl GROUP BY a, b ORDER BY a, b;
+
+-- Aggregate in SELECT list does not prevent elimination.
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a, b, sum(c) FROM distinct_groupby_tbl GROUP BY a, b;
+
+SELECT DISTINCT a, b, sum(c) FROM distinct_groupby_tbl GROUP BY a, b ORDER BY a, b;
+
+-- DISTINCT is NOT redundant: GROUP BY key 'b' is absent from DISTINCT.
+-- Different (a, b) groups can produce the same 'a' output value.
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a FROM distinct_groupby_tbl GROUP BY a, b;
+
+SELECT DISTINCT a FROM distinct_groupby_tbl GROUP BY a, b ORDER BY a;
+
+-- DISTINCT is NOT redundant: no GROUP BY clause.
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a, b FROM distinct_groupby_tbl;
+
+-- DISTINCT is NOT redundant: GROUPING SETS can introduce extra NULL rows,
+-- so two grouping sets could yield the same DISTINCT output.
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT a FROM distinct_groupby_tbl
+GROUP BY GROUPING SETS ((a), ());
+
+-- DISTINCT ON is unaffected (different semantics).
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a, b FROM distinct_groupby_tbl GROUP BY a, b ORDER BY a;
+
+DROP TABLE distinct_groupby_tbl;
-- 
2.34.1