From bc71aea5a0c1911c1f63567c21a870241b76231b Mon Sep 17 00:00:00 2001
From: Pierre Ducroquet
Date: Fri, 31 Jul 2020 10:28:25 +0200
Subject: [PATCH] Add a new remove_useless_distinct function in planner
This function removes useless distinct that are (sadly) often
added when developers use an ORM.
For instance, we may have the following queries:
SELECT DISTINCT * FROM tbl;
SELECT DISTINCT * FROM tbl ORDER BY b;
SELECT DISTINCT tbl1.* FROM tbl1 JOIN tbl2 ON tbl2.a_id = tbl1.id;
In the first two queries (if tbl has a PK of course), the distinct
is useless, it's at best an implicit order by.
In the third query, the distinct turns the JOIN into a SEMI-JOIN.
All these are thus optimized here.
---
src/backend/optimizer/plan/planner.c | 132 +++++++++++++++++++++++++++
1 file changed, 132 insertions(+)
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index b40a112c25..a88389acb5 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -140,6 +140,7 @@ static double preprocess_limit(PlannerInfo *root,
double tuple_fraction,
int64 *offset_est, int64 *count_est);
static void remove_useless_groupby_columns(PlannerInfo *root);
+static void remove_useless_distinct(PlannerInfo *root);
static List *preprocess_groupclause(PlannerInfo *root, List *force);
static List *extract_rollup_sets(List *groupingSets);
static List *reorder_grouping_sets(List *groupingSets, List *sortclause);
@@ -650,6 +651,12 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
*/
replace_empty_jointree(parse);
+ /*
+ * If there is a distinct clause, try to remove it
+ */
+ if (parse->distinctClause)
+ remove_useless_distinct(root);
+
/*
* Look for ANY and EXISTS SubLinks in WHERE and JOIN/ON clauses, and try
* to transform them into joins. Note that this step does not descend
@@ -3207,6 +3214,131 @@ remove_useless_groupby_columns(PlannerInfo *root)
}
}
+/*
+ * remove_useless_distinct
+ * Remove any distinct that is redundant due to being applied on the
+ * primary key of the table.
+ *
+ * Currently, we only make use of pkey constraints for this, however, we may
+ * want to take this further in the future and also use unique constraints
+ * with NOT NULL columns.
+ */
+static void
+remove_useless_distinct(PlannerInfo *root)
+{
+ Index distincted_table = 0;
+ if (list_length(root->parse->rtable) > 1)
+ {
+ /**
+ * There is more than one table.
+ * If no field is selected from another table, this is a 'typical'
+ * semi-join written using distinct instead of exists.
+ */
+ ListCell *targetCell;
+ Var *targetVar;
+ foreach (targetCell, root->parse->targetList)
+ {
+ /**
+ * This only accept Vars. Later, we could go lower and look for Vars
+ * inside function parameters for instance.
+ */
+ TargetEntry *target = (TargetEntry*) lfirst(targetCell);
+ if (!IsA(target->expr, Var))
+ {
+ return;
+ }
+ else
+ {
+ targetVar = (Var*) target->expr;
+ if (distincted_table == 0)
+ distincted_table = targetVar->varno;
+ else if (distincted_table != targetVar->varno)
+ return;
+ }
+ }
+ }
+ else
+ {
+ distincted_table = 1;
+ }
+
+ /**
+ * We will fetch the PK, check if we have a distinct on it.
+ */
+ ListCell *distinctItemCell;
+ TargetEntry *entry;
+ Oid pk_oid = InvalidOid;
+ Bitmapset *pk_attnos;
+ RangeTblEntry *tbl;
+ Var *var;
+ SortGroupClause *distinctItem;
+
+ // To get the pk, we need the table
+ tbl = (RangeTblEntry*) list_nth(root->parse->rtable, distincted_table - 1);
+ // Make sure it is a relkind we understand
+ if (tbl->relkind != RELKIND_RELATION)
+ {
+ return;
+ }
+
+ pk_attnos = get_primary_key_attnos(tbl->relid, false, &pk_oid);
+ if (!pk_attnos)
+ {
+ // This table has no primary key, so we can abort
+ return;
+ }
+
+ foreach (distinctItemCell, root->parse->distinctClause)
+ {
+ distinctItem = (SortGroupClause*) lfirst(distinctItemCell);
+ // Get the entry tleSortGroupRef in targetList
+ entry = list_nth(root->parse->targetList, distinctItem->tleSortGroupRef - 1);
+ if (entry->expr)
+ {
+ if (IsA(entry->expr, Var))
+ {
+ var = (Var*) entry->expr;
+ pk_attnos = bms_del_member(
+ pk_attnos, var->varattno - FirstLowInvalidHeapAttributeNumber);
+ }
+ else
+ {
+ // Ignoring that field
+ }
+ }
+ }
+
+ if (bms_is_empty(pk_attnos)) {
+ /**
+ * We saw all the fields of the PK, so we can optimize.
+ * If there is a join, turn it into a SEMI-JOIN.
+ */
+ ListCell *joinCell;
+ foreach (joinCell, root->parse->jointree->fromlist)
+ {
+ if (IsA(lfirst(joinCell), JoinExpr))
+ {
+ JoinExpr *expr = (JoinExpr*) lfirst(joinCell);
+ if (expr->jointype == JOIN_INNER)
+ expr->jointype = JOIN_SEMI;
+ else
+ return;
+ }
+ else
+ {
+ return;
+ }
+ }
+ /**
+ * If there was no sort clause, we change the distinct into a sort clause.
+ */
+ if (!root->parse->sortClause)
+ root->parse->sortClause = root->parse->distinctClause;
+
+ root->parse->distinctClause = NULL;
+ }
+}
+
/*
* preprocess_groupclause - do preparatory work on GROUP BY clause
*
--
2.28.0