From 5bae9fdf8b74e5996b606e78f8b2a5fb327e011b Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Mon, 17 May 2021 11:47:07 +0200
Subject: [PATCH v41 1/6] Unique expressions

Extend PlannerInfo and Path structures with the list of relevant unique
expressions. It specifies which keys must be unique on the query
level, and allows to leverage this into on the path level. At the moment
only distinctClause makes use of such mechanism, which enables potential
use of index skip scan.

Originally proposed by David Rowley, based on the UniqueKey patch
implementation from Andy Fan, contains few bits out of previous version
from Jesper Pedersen, Floris Van Nee.
---
 src/backend/nodes/list.c                | 31 +++++++++
 src/backend/optimizer/path/Makefile     |  3 +-
 src/backend/optimizer/path/pathkeys.c   | 62 +++++++++++++++++
 src/backend/optimizer/path/uniquekeys.c | 92 +++++++++++++++++++++++++
 src/backend/optimizer/plan/planner.c    | 36 +++++++++-
 src/backend/optimizer/util/pathnode.c   | 32 ++++++---
 src/include/nodes/pathnodes.h           |  5 ++
 src/include/nodes/pg_list.h             |  1 +
 src/include/optimizer/pathnode.h        |  1 +
 src/include/optimizer/paths.h           |  9 +++
 10 files changed, 261 insertions(+), 11 deletions(-)
 create mode 100644 src/backend/optimizer/path/uniquekeys.c

diff --git a/src/backend/nodes/list.c b/src/backend/nodes/list.c
index f843f861ef..a53a50f372 100644
--- a/src/backend/nodes/list.c
+++ b/src/backend/nodes/list.c
@@ -1653,3 +1653,34 @@ list_oid_cmp(const ListCell *p1, const ListCell *p2)
 		return 1;
 	return 0;
 }
+
+/*
+ * Return true iff every entry in "members" list is also present
+ * in the "target" list.
+ */
+bool
+list_is_subset(const List *members, const List *target)
+{
+	const ListCell	*lc1, *lc2;
+
+	Assert(IsPointerList(members));
+	Assert(IsPointerList(target));
+	check_list_invariants(members);
+	check_list_invariants(target);
+
+	foreach(lc1, members)
+	{
+		bool found = false;
+		foreach(lc2, target)
+		{
+			if (equal(lfirst(lc1), lfirst(lc2)))
+			{
+				found = true;
+				break;
+			}
+		}
+		if (!found)
+			return false;
+	}
+	return true;
+}
diff --git a/src/backend/optimizer/path/Makefile b/src/backend/optimizer/path/Makefile
index 1e199ff66f..7b9820c25f 100644
--- a/src/backend/optimizer/path/Makefile
+++ b/src/backend/optimizer/path/Makefile
@@ -21,6 +21,7 @@ OBJS = \
 	joinpath.o \
 	joinrels.o \
 	pathkeys.o \
-	tidpath.o
+	tidpath.o \
+	uniquekeys.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 86a35cdef1..e2be1fbf90 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -29,6 +29,7 @@
 #include "utils/lsyscache.h"
 
 
+static bool pathkey_is_unique(PathKey *new_pathkey, List *pathkeys);
 static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys);
 static bool matches_boolean_partition_clause(RestrictInfo *rinfo,
 											 RelOptInfo *partrel,
@@ -96,6 +97,29 @@ make_canonical_pathkey(PlannerInfo *root,
 	return pk;
 }
 
+/*
+ * pathkey_is_unique
+ *		Checks if the new pathkey's equivalence class is the same as that of
+ *		any existing member of the pathkey list.
+ */
+static bool
+pathkey_is_unique(PathKey *new_pathkey, List *pathkeys)
+{
+	EquivalenceClass *new_ec = new_pathkey->pk_eclass;
+	ListCell   *lc;
+
+	/* If same EC already is already in the list, then not unique */
+	foreach(lc, pathkeys)
+	{
+		PathKey    *old_pathkey = (PathKey *) lfirst(lc);
+
+		if (new_ec == old_pathkey->pk_eclass)
+			return false;
+	}
+
+	return true;
+}
+
 /*
  * pathkey_is_redundant
  *	   Is a pathkey redundant with one already in the given list?
@@ -1152,6 +1176,44 @@ make_pathkeys_for_sortclauses(PlannerInfo *root,
 	return pathkeys;
 }
 
+/*
+ * make_pathkeys_for_uniquekeyclauses
+ *		Generate a pathkeys list to be used for uniquekey clauses
+ */
+List *
+make_pathkeys_for_uniquekeys(PlannerInfo *root,
+							 List *sortclauses,
+							 List *tlist)
+{
+	List	   *pathkeys = NIL;
+	ListCell   *l;
+
+	foreach(l, sortclauses)
+	{
+		SortGroupClause *sortcl = (SortGroupClause *) lfirst(l);
+		Expr	   *sortkey;
+		PathKey    *pathkey;
+
+		sortkey = (Expr *) get_sortgroupclause_expr(sortcl, tlist);
+		Assert(OidIsValid(sortcl->sortop));
+		pathkey = make_pathkey_from_sortop(root,
+										   sortkey,
+										   root->nullable_baserels,
+										   sortcl->sortop,
+										   sortcl->nulls_first,
+										   sortcl->tleSortGroupRef,
+										   true);
+
+		if (EC_MUST_BE_REDUNDANT(pathkey->pk_eclass))
+			continue;
+
+		if (pathkey_is_unique(pathkey, pathkeys))
+			pathkeys = lappend(pathkeys, pathkey);
+	}
+
+	return pathkeys;
+}
+
 /****************************************************************************
  *		PATHKEYS AND MERGECLAUSES
  ****************************************************************************/
diff --git a/src/backend/optimizer/path/uniquekeys.c b/src/backend/optimizer/path/uniquekeys.c
new file mode 100644
index 0000000000..d2525771e3
--- /dev/null
+++ b/src/backend/optimizer/path/uniquekeys.c
@@ -0,0 +1,92 @@
+/*-------------------------------------------------------------------------
+ *
+ * uniquekeys.c
+ *	  Utilities for matching and building unique keys
+ *
+ * Portions Copyright (c) 2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/optimizer/path/uniquekeys.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "optimizer/paths.h"
+
+/*
+ * build_uniquekeys
+ * 		Preparing list of pathkeys keys which are considered to be unique for
+ * 		this query.
+ *
+ * For now used only for distinct clauses, where redundant keys	need to be
+ * preserved e.g. for skip scan. Justification for this function existence is
+ * future plans to make it produce actual UniqueKey list. 
+ */
+List*
+build_uniquekeys(PlannerInfo *root, List *sortclauses)
+{
+	List *result = NIL;
+	List *sortkeys;
+	ListCell *l;
+	List *exprs = NIL;
+
+	sortkeys = make_pathkeys_for_uniquekeys(root,
+											sortclauses,
+											root->processed_tlist);
+
+	/* Create a uniquekey and add it to the list */
+	foreach(l, sortkeys)
+	{
+		PathKey    *pathkey = (PathKey *) lfirst(l);
+		EquivalenceClass *ec = pathkey->pk_eclass;
+		EquivalenceMember *mem = (EquivalenceMember*) lfirst(list_head(ec->ec_members));
+		exprs = lappend(exprs, mem->em_expr);
+	}
+
+	result = lappend(result, exprs);
+
+	return result;
+}
+
+/*
+ * query_has_uniquekeys_for
+ * 		Check if the specified unique keys matching all query level unique
+ * 		keys.
+ *
+ * The main use is to verify that unique keys for some path are covering all
+ * requested query unique keys. Based on this information a path could be
+ * rejected if it satisfy uniqueness only partially.
+ */
+bool
+query_has_uniquekeys_for(PlannerInfo *root, List *path_uniquekeys,
+						 bool allow_multinulls)
+{
+	ListCell *lc;
+	ListCell *lc2;
+
+	/* root->query_uniquekeys are the requested DISTINCT clauses on query level
+	 * path_uniquekeys are the unique keys on current path. All requested
+	 * query_uniquekeys must be satisfied by the path_uniquekeys.
+	 */
+	foreach(lc, root->query_uniquekeys)
+	{
+		List *query_ukey = lfirst_node(List, lc);
+		bool satisfied = false;
+		foreach(lc2, path_uniquekeys)
+		{
+			List *ukey = lfirst_node(List, lc2);
+			if (list_length(ukey) == 0 &&
+				list_length(query_ukey) != 0)
+				continue;
+			if (list_is_subset(ukey, query_ukey))
+			{
+				satisfied = true;
+				break;
+			}
+		}
+		if (!satisfied)
+			return false;
+	}
+	return true;
+}
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index bd09f85aea..b67bff8ccc 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -3102,12 +3102,18 @@ standard_qp_callback(PlannerInfo *root, void *extra)
 
 	if (parse->distinctClause &&
 		grouping_is_sortable(parse->distinctClause))
+	{
 		root->distinct_pathkeys =
 			make_pathkeys_for_sortclauses(root,
 										  parse->distinctClause,
 										  tlist);
+		root->query_uniquekeys = build_uniquekeys(root, parse->distinctClause);
+	}
 	else
+	{
 		root->distinct_pathkeys = NIL;
+		root->query_uniquekeys = NIL;
+	}
 
 	root->sort_pathkeys =
 		make_pathkeys_for_sortclauses(root,
@@ -4493,13 +4499,19 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
 			Path	   *path = (Path *) lfirst(lc);
 
 			if (pathkeys_contained_in(needed_pathkeys, path->pathkeys))
-			{
 				add_path(distinct_rel, (Path *)
 						 create_upper_unique_path(root, distinct_rel,
 												  path,
 												  list_length(root->distinct_pathkeys),
 												  numDistinctRows));
-			}
+		}
+
+		foreach(lc, input_rel->unique_pathlist)
+		{
+			Path	   *path = (Path *) lfirst(lc);
+
+			if (query_has_uniquekeys_for(root, path->uniquekeys, false))
+				add_path(distinct_rel, path);
 		}
 
 		/* For explicit-sort case, always use the more rigorous clause */
@@ -7109,6 +7121,26 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
 		}
 	}
 
+	foreach(lc, rel->unique_pathlist)
+	{
+		Path	   *subpath = (Path *) lfirst(lc);
+
+		/* Shouldn't have any parameterized paths anymore */
+		Assert(subpath->param_info == NULL);
+
+		if (tlist_same_exprs)
+			subpath->pathtarget->sortgrouprefs =
+				scanjoin_target->sortgrouprefs;
+		else
+		{
+			Path	   *newpath;
+
+			newpath = (Path *) create_projection_path(root, rel, subpath,
+													  scanjoin_target);
+			lfirst(lc) = newpath;
+		}
+	}
+
 	/*
 	 * Now, if final scan/join target contains SRFs, insert ProjectSetPath(s)
 	 * atop each existing path.  (Note that this function doesn't look at the
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 5c32c96b71..abb77d867e 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -416,10 +416,10 @@ set_cheapest(RelOptInfo *parent_rel)
  * 'parent_rel' is the relation entry to which the path corresponds.
  * 'new_path' is a potential path for parent_rel.
  *
- * Returns nothing, but modifies parent_rel->pathlist.
+ * Returns modified pathlist.
  */
-void
-add_path(RelOptInfo *parent_rel, Path *new_path)
+static List *
+add_path_to(RelOptInfo *parent_rel, List *pathlist, Path *new_path)
 {
 	bool		accept_new = true;	/* unless we find a superior old path */
 	int			insert_at = 0;	/* where to insert new item */
@@ -440,7 +440,7 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 	 * for more than one old path to be tossed out because new_path dominates
 	 * it.
 	 */
-	foreach(p1, parent_rel->pathlist)
+	foreach(p1, pathlist)
 	{
 		Path	   *old_path = (Path *) lfirst(p1);
 		bool		remove_old = false; /* unless new proves superior */
@@ -584,8 +584,7 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 		 */
 		if (remove_old)
 		{
-			parent_rel->pathlist = foreach_delete_current(parent_rel->pathlist,
-														  p1);
+			pathlist = foreach_delete_current(pathlist, p1);
 
 			/*
 			 * Delete the data pointed-to by the deleted cell, if possible
@@ -612,8 +611,7 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 	if (accept_new)
 	{
 		/* Accept the new path: insert it at proper place in pathlist */
-		parent_rel->pathlist =
-			list_insert_nth(parent_rel->pathlist, insert_at, new_path);
+		pathlist = list_insert_nth(pathlist, insert_at, new_path);
 	}
 	else
 	{
@@ -621,6 +619,23 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 		if (!IsA(new_path, IndexPath))
 			pfree(new_path);
 	}
+
+	return pathlist;
+}
+
+void
+add_path(RelOptInfo *parent_rel, Path *new_path)
+{
+	parent_rel->pathlist = add_path_to(parent_rel,
+									   parent_rel->pathlist, new_path);
+}
+
+void
+add_unique_path(RelOptInfo *parent_rel, Path *new_path)
+{
+	parent_rel->unique_pathlist = add_path_to(parent_rel,
+											  parent_rel->unique_pathlist,
+											  new_path);
 }
 
 /*
@@ -2662,6 +2677,7 @@ create_projection_path(PlannerInfo *root,
 	pathnode->path.pathkeys = subpath->pathkeys;
 
 	pathnode->subpath = subpath;
+	pathnode->path.uniquekeys = subpath->uniquekeys;
 
 	/*
 	 * We might not need a separate Result node.  If the input plan node type
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 1f3845b3fe..056b13826a 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -293,6 +293,7 @@ struct PlannerInfo
 
 	List	   *query_pathkeys; /* desired pathkeys for query_planner() */
 
+	List	   *query_uniquekeys; /* unique keys required for the query */
 	List	   *group_pathkeys; /* groupClause pathkeys, if any */
 	List	   *window_pathkeys;	/* pathkeys of bottom window, if any */
 	List	   *distinct_pathkeys;	/* distinctClause pathkeys, if any */
@@ -695,6 +696,7 @@ typedef struct RelOptInfo
 	List	   *pathlist;		/* Path structures */
 	List	   *ppilist;		/* ParamPathInfos used in pathlist */
 	List	   *partial_pathlist;	/* partial Paths */
+	List       *unique_pathlist;    /* unique Paths */
 	struct Path *cheapest_startup_path;
 	struct Path *cheapest_total_path;
 	struct Path *cheapest_unique_path;
@@ -883,6 +885,7 @@ struct IndexOptInfo
 	bool		amsearchnulls;	/* can AM search for NULL/NOT NULL entries? */
 	bool		amhasgettuple;	/* does AM have amgettuple interface? */
 	bool		amhasgetbitmap; /* does AM have amgetbitmap interface? */
+	bool		amcanskip;		/* can AM skip duplicate values? */
 	bool		amcanparallel;	/* does AM support parallel scan? */
 	bool		amcanmarkpos;	/* does AM support mark/restore? */
 	/* Rather than include amapi.h here, we declare amcostestimate like this */
@@ -1196,6 +1199,8 @@ typedef struct Path
 
 	List	   *pathkeys;		/* sort ordering of path's output */
 	/* pathkeys is a List of PathKey nodes; see above */
+
+	List	   *uniquekeys;	/* the unique keys, or NIL if none */
 } Path;
 
 /* Macro for extracting a path's parameterization relids; beware double eval */
diff --git a/src/include/nodes/pg_list.h b/src/include/nodes/pg_list.h
index 2cb9d1371d..4ac871fd16 100644
--- a/src/include/nodes/pg_list.h
+++ b/src/include/nodes/pg_list.h
@@ -567,6 +567,7 @@ extern pg_nodiscard List *list_delete_last(List *list);
 extern pg_nodiscard List *list_delete_first_n(List *list, int n);
 extern pg_nodiscard List *list_delete_nth_cell(List *list, int n);
 extern pg_nodiscard List *list_delete_cell(List *list, ListCell *cell);
+extern bool list_is_subset(const List *members, const List *target);
 
 extern List *list_union(const List *list1, const List *list2);
 extern List *list_union_ptr(const List *list1, const List *list2);
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 620eeda2d6..bb6d730e93 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -27,6 +27,7 @@ extern int	compare_fractional_path_costs(Path *path1, Path *path2,
 										  double fraction);
 extern void set_cheapest(RelOptInfo *parent_rel);
 extern void add_path(RelOptInfo *parent_rel, Path *new_path);
+extern void add_unique_path(RelOptInfo *parent_rel, Path *new_path);
 extern bool add_path_precheck(RelOptInfo *parent_rel,
 							  Cost startup_cost, Cost total_cost,
 							  List *pathkeys, Relids required_outer);
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index 0c3a0b90c8..3dfa21adad 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -229,6 +229,9 @@ extern List *build_join_pathkeys(PlannerInfo *root,
 extern List *make_pathkeys_for_sortclauses(PlannerInfo *root,
 										   List *sortclauses,
 										   List *tlist);
+extern List *make_pathkeys_for_uniquekeys(PlannerInfo *root,
+										  List *sortclauses,
+										  List *tlist);
 extern void initialize_mergeclause_eclasses(PlannerInfo *root,
 											RestrictInfo *restrictinfo);
 extern void update_mergeclause_eclasses(PlannerInfo *root,
@@ -255,4 +258,10 @@ extern PathKey *make_canonical_pathkey(PlannerInfo *root,
 extern void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
 									List *live_childrels);
 
+extern bool query_has_uniquekeys_for(PlannerInfo *root,
+									 List *exprs,
+									 bool allow_multinulls);
+
+extern List *build_uniquekeys(PlannerInfo *root, List *sortclauses);
+
 #endif							/* PATHS_H */
-- 
2.32.0

