From a6a278ddeca2c3369b80ffe02f18494531011c5f Mon Sep 17 00:00:00 2001 From: "dgrowley@gmail.com" Date: Tue, 20 Oct 2020 13:36:48 +1300 Subject: [PATCH v9 3/3] Allow parameterized Nested Loops to cache tuples from inner scans Traditionally a parameterized nested loop would always perform another inner scan each time the parameter values for the scan changed. This was quite wasteful when we had repeat lookups for the same value again and again. Here we add support to allow nested loops to remember the resulting tuples from a scan and reuse those if we see the same parameter values on a subsequent scan. These results are stored within a hash table, the size of which is limited by hash_mem. When the cache becomes full, the least recently looked up entries are evicted from the cache to make way for new tuples. In the query plan, these appear as "Cached Nested Loops". --- .../postgres_fdw/expected/postgres_fdw.out | 71 +- doc/src/sgml/config.sgml | 19 + src/backend/commands/explain.c | 146 ++- src/backend/executor/Makefile | 1 + src/backend/executor/execExpr.c | 132 +++ src/backend/executor/execMRUTupleCache.c | 981 ++++++++++++++++++ src/backend/executor/execParallel.c | 17 + src/backend/executor/nodeNestloop.c | 213 +++- src/backend/nodes/copyfuncs.c | 8 +- src/backend/nodes/outfuncs.c | 7 + src/backend/nodes/readfuncs.c | 7 + src/backend/optimizer/path/costsize.c | 266 ++++- src/backend/optimizer/path/joinpath.c | 437 ++++++++ src/backend/optimizer/plan/createplan.c | 60 +- src/backend/optimizer/util/pathnode.c | 144 ++- src/backend/utils/adt/ruleutils.c | 7 +- src/backend/utils/misc/guc.c | 10 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/executor/execMRUTupleCache.h | 97 ++ src/include/executor/executor.h | 6 + src/include/executor/nodeNestloop.h | 5 + src/include/lib/ilist.h | 19 + src/include/lib/simplehash.h | 8 +- src/include/nodes/execnodes.h | 9 + src/include/nodes/pathnodes.h | 34 +- src/include/nodes/plannodes.h | 13 + src/include/optimizer/cost.h | 13 +- src/include/optimizer/pathnode.h | 14 + src/include/utils/selfuncs.h | 6 +- src/test/regress/expected/join.out | 131 ++- src/test/regress/expected/partition_prune.out | 33 +- src/test/regress/expected/subselect.out | 5 +- src/test/regress/expected/sysviews.out | 3 +- src/test/regress/sql/join.sql | 38 + src/test/regress/sql/partition_prune.sql | 3 + 35 files changed, 2828 insertions(+), 136 deletions(-) create mode 100644 src/backend/executor/execMRUTupleCache.c create mode 100644 src/include/executor/execMRUTupleCache.h diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 2d88d06358..dd72764b36 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -2114,8 +2114,9 @@ SELECT t1."C 1" FROM "S 1"."T 1" t1, LATERAL (SELECT DISTINCT t2.c1, t3.c1 FROM -------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit Output: t1."C 1" - -> Nested Loop + -> Cached Nested Loop Output: t1."C 1" + Cache Key: t1.c2 -> Index Scan using t1_pkey on "S 1"."T 1" t1 Output: t1."C 1", t1.c2, t1.c3, t1.c4, t1.c5, t1.c6, t1.c7, t1.c8 -> HashAggregate @@ -2125,7 +2126,7 @@ SELECT t1."C 1" FROM "S 1"."T 1" t1, LATERAL (SELECT DISTINCT t2.c1, t3.c1 FROM Output: t2.c1, t3.c1 Relations: (public.ft1 t2) INNER JOIN (public.ft2 t3) Remote SQL: SELECT r1."C 1", r2."C 1" FROM ("S 1"."T 1" r1 INNER JOIN "S 1"."T 1" r2 
ON (((r1."C 1" = r2."C 1")) AND ((r1.c2 = $1::integer)))) -(13 rows) +(14 rows) SELECT t1."C 1" FROM "S 1"."T 1" t1, LATERAL (SELECT DISTINCT t2.c1, t3.c1 FROM ft1 t2, ft2 t3 WHERE t2.c1 = t3.c1 AND t2.c2 = t1.c2) q ORDER BY t1."C 1" OFFSET 10 LIMIT 10; C 1 @@ -7360,26 +7361,27 @@ analyze loct1; -- inner join; expressions in the clauses appear in the equivalence class list explain (verbose, costs off) select foo.f1, loct1.f1 from foo join loct1 on (foo.f1 = loct1.f1) order by foo.f2 offset 10 limit 10; - QUERY PLAN --------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------- Limit Output: foo.f1, loct1.f1, foo.f2 - -> Sort + -> Cached Nested Loop Output: foo.f1, loct1.f1, foo.f2 - Sort Key: foo.f2 - -> Merge Join - Output: foo.f1, loct1.f1, foo.f2 - Merge Cond: (foo.f1 = loct1.f1) - -> Merge Append - Sort Key: foo.f1 - -> Index Scan using i_foo_f1 on public.foo foo_1 + Cache Key: foo.f1 + -> Merge Append + Sort Key: foo.f2 + -> Sort + Output: foo_1.f1, foo_1.f2 + Sort Key: foo_1.f2 + -> Seq Scan on public.foo foo_1 Output: foo_1.f1, foo_1.f2 - -> Foreign Scan on public.foo2 foo_2 - Output: foo_2.f1, foo_2.f2 - Remote SQL: SELECT f1, f2 FROM public.loct1 ORDER BY f1 ASC NULLS LAST - -> Index Only Scan using i_loct1_f1 on public.loct1 - Output: loct1.f1 -(17 rows) + -> Foreign Scan on public.foo2 foo_2 + Output: foo_2.f1, foo_2.f2 + Remote SQL: SELECT f1, f2 FROM public.loct1 ORDER BY f2 ASC NULLS LAST + -> Index Only Scan using i_loct1_f1 on public.loct1 + Output: loct1.f1 + Index Cond: (loct1.f1 = foo.f1) +(18 rows) select foo.f1, loct1.f1 from foo join loct1 on (foo.f1 = loct1.f1) order by foo.f2 offset 10 limit 10; f1 | f1 @@ -7400,26 +7402,27 @@ select foo.f1, loct1.f1 from foo join loct1 on (foo.f1 = loct1.f1) order by foo. 
-- list but no output change as compared to the previous query explain (verbose, costs off) select foo.f1, loct1.f1 from foo left join loct1 on (foo.f1 = loct1.f1) order by foo.f2 offset 10 limit 10; - QUERY PLAN --------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------- Limit Output: foo.f1, loct1.f1, foo.f2 - -> Sort + -> Cached Nested Loop Left Join Output: foo.f1, loct1.f1, foo.f2 - Sort Key: foo.f2 - -> Merge Left Join - Output: foo.f1, loct1.f1, foo.f2 - Merge Cond: (foo.f1 = loct1.f1) - -> Merge Append - Sort Key: foo.f1 - -> Index Scan using i_foo_f1 on public.foo foo_1 + Cache Key: foo.f1 + -> Merge Append + Sort Key: foo.f2 + -> Sort + Output: foo_1.f1, foo_1.f2 + Sort Key: foo_1.f2 + -> Seq Scan on public.foo foo_1 Output: foo_1.f1, foo_1.f2 - -> Foreign Scan on public.foo2 foo_2 - Output: foo_2.f1, foo_2.f2 - Remote SQL: SELECT f1, f2 FROM public.loct1 ORDER BY f1 ASC NULLS LAST - -> Index Only Scan using i_loct1_f1 on public.loct1 - Output: loct1.f1 -(17 rows) + -> Foreign Scan on public.foo2 foo_2 + Output: foo_2.f1, foo_2.f2 + Remote SQL: SELECT f1, f2 FROM public.loct1 ORDER BY f2 ASC NULLS LAST + -> Index Only Scan using i_loct1_f1 on public.loct1 + Output: loct1.f1 + Index Cond: (loct1.f1 = foo.f1) +(18 rows) select foo.f1, loct1.f1 from foo left join loct1 on (foo.f1 = loct1.f1) order by foo.f2 offset 10 limit 10; f1 | f1 diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index f043433e31..5a82e4e7ac 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -4718,6 +4718,25 @@ ANY num_sync ( + enable_cachednestloop (boolean) + + enable_cachednestloop configuration parameter + + + + + Enables or disables the query planner's ability to use a cached + parameterized nested loop joins. Such joins allow the inner + parameterized scan of a nested loop join to be cached so that repeat + lookups are likely to find the tuples already cached rather than have + to perform another inner scan. Less commonly looked up results may be + evicted from the cache when more space is required for new entries. + The default is on. 
+ + + + enable_gathermerge (boolean) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 41317f1837..a3387959f3 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -18,6 +18,7 @@ #include "commands/createas.h" #include "commands/defrem.h" #include "commands/prepare.h" +#include "executor/execMRUTupleCache.h" #include "executor/nodeHash.h" #include "foreign/fdwapi.h" #include "jit/jit.h" @@ -108,6 +109,8 @@ static void show_sort_info(SortState *sortstate, ExplainState *es); static void show_incremental_sort_info(IncrementalSortState *incrsortstate, ExplainState *es); static void show_hash_info(HashState *hashstate, ExplainState *es); +static void show_cachednestloop_info(NestLoopState *nlstate, List *ancestors, + ExplainState *es); static void show_hashagg_info(AggState *hashstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); @@ -1170,7 +1173,10 @@ ExplainNode(PlanState *planstate, List *ancestors, pname = sname = "BitmapOr"; break; case T_NestLoop: - pname = sname = "Nested Loop"; + if (((NestLoop *) plan)->mrucache) + pname = sname = "Cached Nested Loop"; + else + pname = sname = "Nested Loop"; break; case T_MergeJoin: pname = "Merge"; /* "Join" gets added by jointype switch */ @@ -1875,6 +1881,9 @@ ExplainNode(PlanState *planstate, List *ancestors, } break; case T_NestLoop: + if (((NestLoop *) plan)->mrucache) + show_cachednestloop_info((NestLoopState *) planstate, ancestors, es); + show_upper_qual(((NestLoop *) plan)->join.joinqual, "Join Filter", planstate, ancestors, es); if (((NestLoop *) plan)->join.joinqual) @@ -3028,6 +3037,141 @@ show_hash_info(HashState *hashstate, ExplainState *es) } } +static void +show_cachednestloop_info(NestLoopState *nlstate, List *ancestors, ExplainState *es) +{ + Plan *plan = ((PlanState *) nlstate)->plan; + MRUTupleCache *mrucache = nlstate->nl_mrucache; + + ListCell *lc; + List *context; + StringInfoData keystr; + char *seperator = ""; + bool useprefix; + int64 memPeakKb; + + if (mrucache == NULL) + return; + + initStringInfo(&keystr); + + useprefix = list_length(es->rtable) > 1; + + ancestors = lcons(plan, ancestors); + + /* Set up deparsing context */ + context = set_deparse_context_plan(es->deparse_cxt, + plan, + ancestors); + + foreach(lc, ((NestLoop *) plan)->param_exprs) + { + Node *expr = (Node *) lfirst(lc); + + appendStringInfoString(&keystr, seperator); + + appendStringInfoString(&keystr, deparse_expression(expr, context, + useprefix, false)); + seperator = ", "; + } + + if (es->format != EXPLAIN_FORMAT_TEXT) + { + ExplainPropertyText("Cache Key", keystr.data, es); + } + else + { + ExplainIndentText(es); + appendStringInfo(es->str, "Cache Key: %s\n", keystr.data); + } + + pfree(keystr.data); + + ancestors = list_delete_first(ancestors); + + if (!es->analyze) + return; + + if (mrucache->stats.mem_peak > 0) + memPeakKb = (mrucache->stats.mem_peak + 1023) / 1024; + else + memPeakKb = (mrucache->mem_used + 1023) / 1024; + + if (es->format != EXPLAIN_FORMAT_TEXT) + { + ExplainPropertyInteger("Cache Hits", NULL, mrucache->stats.cache_hits, es); + ExplainPropertyInteger("Cache Misses", NULL, mrucache->stats.cache_misses, es); + ExplainPropertyInteger("Cache Evictions", NULL, mrucache->stats.cache_evictions, es); + ExplainPropertyInteger("Cache Overflows", NULL, mrucache->stats.cache_overflows, es); + ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); + } + else + { + ExplainIndentText(es); + appendStringInfo(es->str, 
+ "Hits: " UINT64_FORMAT " Misses: " UINT64_FORMAT " Evictions: " UINT64_FORMAT " Overflows: " UINT64_FORMAT " Memory Usage: " INT64_FORMAT "kB\n", + mrucache->stats.cache_hits, + mrucache->stats.cache_misses, + mrucache->stats.cache_evictions, + mrucache->stats.cache_overflows, + memPeakKb); + } + + /* Show details from parallel workers, if any */ + if (nlstate->shared_info != NULL) + { + for (int n = 0; n < nlstate->shared_info->num_workers; n++) + { + MRUCacheInstrumentation *si; + + si = &nlstate->shared_info->sinstrument[n]; + + /* + * Skip workers that didn't do anything. We needn't consider + * cache hits as we'll always get a miss before a hit. + */ + if (si->cache_misses == 0) + continue; + + if (es->workers_state) + ExplainOpenWorker(n, es); + + /* + * Since the worker's MRUTupleCache.mem_used field is unavailable + * to us, ExecEndNestLoop will have set the + * MRUCacheInstrumentation.mem_peak field for us. No need to do + * the zero checks like we did for the serial case above. + */ + memPeakKb = (si->mem_peak + 1023) / 1024; + + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, + "Hits: " UINT64_FORMAT " Misses: " UINT64_FORMAT " Evictions: " UINT64_FORMAT " Overflows: " UINT64_FORMAT " Memory Usage: " INT64_FORMAT "kB\n", + si->cache_hits, si->cache_misses, si->cache_evictions, si->cache_overflows, memPeakKb); + } + else + { + ExplainPropertyInteger("Cache Hits", NULL, + si->cache_hits, es); + ExplainPropertyInteger("Cache Misses", NULL, + si->cache_misses, es); + ExplainPropertyInteger("Cache Evictions", NULL, + si->cache_evictions, es); + ExplainPropertyInteger("Cache Overflows", NULL, + si->cache_overflows, es); + ExplainPropertyInteger("Peak Memory Usage", "kB",memPeakKb, + es); + } + + if (es->workers_state) + ExplainCloseWorker(n, es); + } + } +} + /* * Show information on hash aggregate memory usage and batches. */ diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile index f990c6473a..e33e8f2f28 100644 --- a/src/backend/executor/Makefile +++ b/src/backend/executor/Makefile @@ -21,6 +21,7 @@ OBJS = \ execIndexing.o \ execJunk.o \ execMain.o \ + execMRUTupleCache.o \ execParallel.o \ execPartition.o \ execProcnode.o \ diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c index 868f8b0858..fdc94b9914 100644 --- a/src/backend/executor/execExpr.c +++ b/src/backend/executor/execExpr.c @@ -3470,3 +3470,135 @@ ExecBuildGroupingEqual(TupleDesc ldesc, TupleDesc rdesc, return state; } + +/* + * Build equality expression that can be evaluated using ExecQual(), returning + * true if the expression context's inner/outer tuples are equal. Datums in + * the inner/outer slots are assumed to be in the same order and quantity as + * the 'eqfunctions' parameter. + * + * desc: tuple descriptor of the to-be-compared tuples + * ops: the slot ops for the inner/outer tuple slots + * eqFunctions: array of function oids of the equality functions to use + * this must be the same length as the 'param_exprs' list. + * collations: collation Oids to use for equality comparison. Must be the + * same length as the 'param_exprs' list. 
+ * parent: parent executor node + */ +ExprState * +ExecBuildParamSetEqual(TupleDesc desc, + const TupleTableSlotOps *ops, + const Oid *eqfunctions, + const Oid *collations, + const List *param_exprs, + PlanState *parent) +{ + ExprState *state = makeNode(ExprState); + ExprEvalStep scratch = {0}; + int maxatt = list_length(param_exprs); + List *adjust_jumps = NIL; + ListCell *lc; + + state->expr = NULL; + state->flags = EEO_FLAG_IS_QUAL; + state->parent = parent; + + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + + /* push deform steps */ + scratch.opcode = EEOP_INNER_FETCHSOME; + scratch.d.fetch.last_var = maxatt; + scratch.d.fetch.fixed = false; + scratch.d.fetch.known_desc = desc; + scratch.d.fetch.kind = ops; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + + scratch.opcode = EEOP_OUTER_FETCHSOME; + scratch.d.fetch.last_var = maxatt; + scratch.d.fetch.fixed = false; + scratch.d.fetch.known_desc = desc; + scratch.d.fetch.kind = ops; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + + for (int attno = 0; attno < maxatt; attno++) + { + Form_pg_attribute att = TupleDescAttr(desc, attno); + Oid foid = eqfunctions[attno]; + Oid collid = collations[attno]; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + AclResult aclresult; + + /* Check permission to call function */ + aclresult = pg_proc_aclcheck(foid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(foid)); + + InvokeFunctionExecuteHook(foid); + + /* Set up the primary fmgr lookup information */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(foid, finfo); + fmgr_info_set_expr(NULL, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + collid, NULL, NULL); + + /* left arg */ + scratch.opcode = EEOP_INNER_VAR; + scratch.d.var.attnum = attno; + scratch.d.var.vartype = att->atttypid; + scratch.resvalue = &fcinfo->args[0].value; + scratch.resnull = &fcinfo->args[0].isnull; + ExprEvalPushStep(state, &scratch); + + /* right arg */ + scratch.opcode = EEOP_OUTER_VAR; + scratch.d.var.attnum = attno; + scratch.d.var.vartype = att->atttypid; + scratch.resvalue = &fcinfo->args[1].value; + scratch.resnull = &fcinfo->args[1].isnull; + ExprEvalPushStep(state, &scratch); + + scratch.opcode = finfo->fn_strict ? 
EEOP_FUNCEXPR_STRICT : + EEOP_FUNCEXPR; + scratch.d.func.finfo = finfo; + scratch.d.func.fcinfo_data = fcinfo; + scratch.d.func.fn_addr = finfo->fn_addr; + scratch.d.func.nargs = 2; + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + ExprEvalPushStep(state, &scratch); + + /* then emit EEOP_QUAL to detect if result is false (or null) */ + scratch.opcode = EEOP_QUAL; + scratch.d.qualexpr.jumpdone = -1; + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_QUAL); + Assert(as->d.qualexpr.jumpdone == -1); + as->d.qualexpr.jumpdone = state->steps_len; + } + + scratch.resvalue = NULL; + scratch.resnull = NULL; + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} diff --git a/src/backend/executor/execMRUTupleCache.c b/src/backend/executor/execMRUTupleCache.c new file mode 100644 index 0000000000..3553dc26cb --- /dev/null +++ b/src/backend/executor/execMRUTupleCache.c @@ -0,0 +1,981 @@ +/*------------------------------------------------------------------------- + * + * execMRUTupleCache.c + * Routines setting up and using a most-recently-used cache to store sets + * of tuples for a given cache key. + * + * Portions Copyright (c) 2020, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/executor/execMRUTupleCache.c + * + * A set of functions for setting up and using a most-recently-used tuple + * cache. Sets of tuples are stored by the cache key and are located in RAM. + * When we're asked cache tuples that would cause us to exceed the memory + * limits which are imposed by the caller, the least recently looked up cache + * entry is evicted from cache to make way for the new entry. + * + * Sometimes our callers won't run their scans to completion. For example a + * semi-join only needs to run until it finds a matching tuple, and once it + * does, the join operator skips to the next outer tuple and does not execute + * the inner side again on that scan. Because of this, we must keep track of + * when a cache entry is complete, and by default, we know it is when we run + * out of tuples to read during the scan. However, there are cases where we + * can mark the cache entry as complete without exhausting the scan of all + * tuples. One case is unique joins, where the join operator knows that there + * will only be at most one match for any given outer tuple. In order to + * support such cases we allow the "singlerow" option to be set for the cache. + * This option marks the cache entry as complete after we read the first tuple + * from the subnode. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/parallel.h" +#include "common/hashfn.h" +#include "executor/execMRUTupleCache.h" +#include "executor/executor.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" + +/* + * States of the MRUTupleCache's state machine + */ +#define MRUCACHE_LOOKUP 1 /* Attempt to find the first tuple for + * a given key */ +#define MRUCACHE_FETCH_NEXT_TUPLE 2 /* Get another tuple from the cache */ +#define MRUCACHE_FILLING 3 /* Read next tuple to fill cache */ +#define MRUCACHE_BYPASS_MODE 4 /* Bypass mode. 
Just read from our + * plan without caching anything */ +#define MRUCACHE_ENDOFSCAN 5 /* Ready for rescan */ + +/* Helper macros for memory accounting */ +#define EMPTY_ENTRY_MEMORY_BYTES(e) (sizeof(MRUCacheEntry) + \ + sizeof(MRUCacheKey) + \ + (e)->key->params->t_len); +#define CACHE_TUPLE_BYTES(t) (sizeof(MRUCacheTuple) + \ + (t)->mintuple->t_len) + + /* + * MRUCacheTuple Stores an individually cached tuple + */ +typedef struct MRUCacheTuple +{ + MinimalTuple mintuple; /* Cached tuple */ + struct MRUCacheTuple *next; /* The next tuple with the same parameter + * values or NULL if it's the last one */ +} MRUCacheTuple; + +/* + * MRUCacheKey + * The hash table key for cached entries plus the LRU list link + */ +typedef struct MRUCacheKey +{ + MinimalTuple params; + dlist_node lru_node; /* Pointer to next/prev key in LRU list */ +} MRUCacheKey; + +/* + * MRUCacheEntry + * The data struct that the cache hash table stores + */ +typedef struct MRUCacheEntry +{ + MRUCacheKey *key; /* Hash key for hash table lookups */ + MRUCacheTuple *tuplehead; /* Pointer to the first tuple or NULL if + * no tuples are cached for this entry */ + uint32 hash; /* Hash value (cached) */ + char status; /* Hash status */ + bool complete; /* Were all required tuples read from the + * plan? */ +} MRUCacheEntry; + + +#define SH_PREFIX mrucache +#define SH_ELEMENT_TYPE MRUCacheEntry +#define SH_KEY_TYPE MRUCacheKey * +#define SH_SCOPE static inline +#define SH_DECLARE +#include "lib/simplehash.h" + +static uint32 MRUCacheHash_hash(struct mrucache_hash *tb, + const MRUCacheKey *key); +static int MRUCacheHash_equal(struct mrucache_hash *tb, + const MRUCacheKey *params1, + const MRUCacheKey *params2); + +#define SH_PREFIX mrucache +#define SH_ELEMENT_TYPE MRUCacheEntry +#define SH_KEY_TYPE MRUCacheKey * +#define SH_KEY key +#define SH_HASH_KEY(tb, key) MRUCacheHash_hash(tb, key) +#define SH_EQUAL(tb, a, b) MRUCacheHash_equal(tb, a, b) == 0 +#define SH_SCOPE static inline +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) a->hash +#define SH_DEFINE +#include "lib/simplehash.h" + +/* + * MRUCacheHash_hash + * Hash function for simplehash hashtable. 'key' is unused here as we + * require that all table lookups first populate the MRUTupleCache's + * probeslot with the key values to be looked up. + */ +static uint32 +MRUCacheHash_hash(struct mrucache_hash *tb, const MRUCacheKey *key) +{ + MRUTupleCache *mrucache = (MRUTupleCache *) tb->private_data; + TupleTableSlot *pslot = mrucache->probeslot; + uint32 hashkey = 0; + int numkeys = mrucache->nkeys; + FmgrInfo *hashfunctions = mrucache->hashfunctions; + Oid *collations = mrucache->collations; + + for (int i = 0; i < numkeys; i++) + { + /* rotate hashkey left 1 bit at each step */ + hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); + + if (!pslot->tts_isnull[i]) /* treat nulls as having hash key 0 */ + { + uint32 hkey; + + hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i], + collations[i], pslot->tts_values[i])); + hashkey ^= hkey; + } + } + + return murmurhash32(hashkey); +} + +/* + * MRUCacheHash_equal + * Equality function for confirming hash value matches during a hash + * table lookup. 'key2' is never used, instead the MRUTupleCache's + * probeslot is always populated with details of what's being looked up. 
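+ *
+ *		In other words, every probe of the hash table is a two-step operation:
+ *		populate the probeslot, then call the simplehash function with a NULL
+ *		key, for example (see cache_reduce_memory() and cache_lookup()):
+ *
+ *			prepare_probe_slot(mrucache, key);	(a NULL key means: evaluate
+ *												 the param_exprs instead)
+ *			entry = mrucache_lookup(mrucache->hashtable, NULL);
+ *
+ *		MRUCacheHash_hash() hashes the probeslot's datums and this function
+ *		then compares the stored key1->params tuple against the probeslot
+ *		using cache_eq_expr.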
+ */ +static int +MRUCacheHash_equal(struct mrucache_hash *tb, const MRUCacheKey *key1, + const MRUCacheKey *key2) +{ + MRUTupleCache *mrucache = (MRUTupleCache *) tb->private_data; + ExprContext *econtext = mrucache->ps_ExprContext; + TupleTableSlot *tslot = mrucache->tableslot; + TupleTableSlot *pslot = mrucache->probeslot; + + /* probeslot should have already been prepared by prepare_probe_slot() */ + + ExecStoreMinimalTuple(key1->params, tslot, false); + + econtext->ecxt_innertuple = tslot; + econtext->ecxt_outertuple = pslot; + return !ExecQualAndReset(mrucache->cache_eq_expr, econtext); +} + +/* + * Initialize the hash table to empty. + */ +static void +build_hash_table(MRUTupleCache *mrucache, uint32 size) +{ + /* mrucache_create will convert the size to a power of 2 */ + mrucache->hashtable = mrucache_create(mrucache->tableContext, size, + mrucache); +} + +/* + * prepare_probe_slot + * Populate mrucache's probeslot with the values from the tuple stored + * in 'key'. If 'key' is NULL, then perform the population by + * evalulating mrucache's param_exprs. + */ +static inline void +prepare_probe_slot(MRUTupleCache *mrucache, MRUCacheKey *key) +{ + TupleTableSlot *pslot = mrucache->probeslot; + TupleTableSlot *tslot = mrucache->tableslot; + int numKeys = mrucache->nkeys; + + ExecClearTuple(pslot); + + if (key == NULL) + { + /* Set the probeslot's values based on the current parameter values */ + for (int i = 0; i < numKeys; i++) + pslot->tts_values[i] = ExecEvalExpr(mrucache->param_exprs[i], + mrucache->ps_ExprContext, + &pslot->tts_isnull[i]); + } + else + { + /* Process the key's MinimalTuple and store the values in probeslot */ + ExecStoreMinimalTuple(key->params, tslot, false); + slot_getallattrs(tslot); + memcpy(pslot->tts_values, tslot->tts_values, sizeof(Datum) * numKeys); + memcpy(pslot->tts_isnull, tslot->tts_isnull, sizeof(bool) * numKeys); + } + + ExecStoreVirtualTuple(pslot); +} + +/* + * entry_purge_tuples + * Remove all tuples from a cache entry, leaving an empty cache entry. + * Also update memory accounting to reflect the removal of the tuples. + */ +static void +entry_purge_tuples(MRUTupleCache *mrucache, MRUCacheEntry *entry) +{ + MRUCacheTuple *tuple = entry->tuplehead; + uint64 freed_mem = 0; + + while (tuple != NULL) + { + MRUCacheTuple *next = tuple->next; + + freed_mem += CACHE_TUPLE_BYTES(tuple); + + /* Free memory used for this tuple */ + pfree(tuple->mintuple); + pfree(tuple); + + tuple = next; + } + + entry->complete = false; + entry->tuplehead = NULL; + + /* Update the memory accounting */ + mrucache->mem_used -= freed_mem; +} + +/* + * remove_cache_entry + * Remove 'entry' from the cache and free memory used by it. + */ +static void +remove_cache_entry(MRUTupleCache *mrucache, MRUCacheEntry *entry) +{ + MRUCacheKey *key = entry->key; + + dlist_delete(&entry->key->lru_node); + + /* Remove all of the tuples from this entry */ + entry_purge_tuples(mrucache, entry); + + /* + * Update memory accounting. entry_purge_tuples should have already + * subtracted the memory used for each cached tuple. Here we just update + * the amount used by the entry itself. 
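+	 *
+	 * This maintains the invariant verified by the CACHE_VERIFY_TABLE block
+	 * below: mem_used equals the sum, over all live entries, of
+	 * EMPTY_ENTRY_MEMORY_BYTES(entry) plus CACHE_TUPLE_BYTES() for each
+	 * tuple in that entry's list.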
+ */ + mrucache->mem_used -= EMPTY_ENTRY_MEMORY_BYTES(entry); + + /* Ensure we didn't mess up the tracking somehow */ + Assert(mrucache->mem_used >= 0); + +#ifdef CACHE_VERIFY_TABLE + /* XXX I don't really plan on keeping this */ + { + int i, + count; + uint64 mem = 0; + + count = 0; + for (i = 0; i < mrucache->hashtable->size; i++) + { + MRUCacheEntry *entry = &mrucache->hashtable->data[i]; + + if (entry->status == mrucache_SH_IN_USE) + { + + MRUCacheTuple *tuple = entry->tuplehead; + + mem += EMPTY_ENTRY_MEMORY_BYTES(entry); + while (tuple != NULL) + { + mem += CACHE_TUPLE_BYTES(tuple); + tuple = tuple->next; + } + count++; + } + } + + Assert(count == mrucache->hashtable->members); + Assert(mem == mrucache->mem_used); + } +#endif + + /* Remove the entry from the cache */ + mrucache_delete_item(mrucache->hashtable, entry); + + pfree(key->params); + pfree(key); +} + +/* + * cache_reduce_memory + * Evict older and less recently used items from the cache in order to + * reduce the memory consumption back to something below the + * MRUTupleCache's mem_lowerlimit. + * + * 'specialkey', if not NULL, causes the function to return false if the entry + * which the key belongs to is removed from the cache. + */ +static bool +cache_reduce_memory(MRUTupleCache *mrucache, MRUCacheKey *specialkey) +{ + bool specialkey_intact = true; /* for now */ + dlist_mutable_iter iter; + + /* Update peak memory usage */ + if (mrucache->mem_used > mrucache->stats.mem_peak) + mrucache->stats.mem_peak = mrucache->mem_used; + + /* We expect only to be called when we've gone over budget on memory */ + Assert(mrucache->mem_used > mrucache->mem_upperlimit); + + /* Start the eviction process starting at the head of the LRU list. */ + dlist_foreach_modify(iter, &mrucache->lru_list) + { + MRUCacheKey *key = dlist_container(MRUCacheKey, lru_node, iter.cur); + MRUCacheEntry *entry; + + /* + * Populate the hash probe slot in preparation for looking up this LRU + * entry. + */ + prepare_probe_slot(mrucache, key); + + /* + * Ideally the LRU list pointers would be stored in the entry itself + * rather than in the key. Unfortunately, we can't do that as the + * simplehash.h code may resize the table and allocate new memory for + * entries which would result in those pointers pointing to the old + * buckets. However, it's fine to use the key to store this as that's + * only referenced by a pointer in the entry, which of course follows + * the entry whenever the hash table is resized. Since we only have a + * pointer to the key here, we must perform a hash table lookup to + * find the entry that the key belongs to. + */ + entry = mrucache_lookup(mrucache->hashtable, NULL); + + /* A good spot to check for corruption of the table and LRU list. */ + Assert(entry != NULL); + Assert(entry->key == key); + + /* + * If we're being called to free memory while the cache is being + * populated with new tuples, then we'd better take some care as we + * could end up freeing the entry which 'specialkey' belongs to. + * Generally callers will pass 'specialkeys' as the keys for the cache + * entry which is currently being populated, so we must set + * 'specialkey_intact' to false to inform the caller the specialkey + * entry has been removed. + */ + if (key == specialkey) + specialkey_intact = false; + + /* + * Finally remove the entry. This will remove from the LRU list too. 
+ */ + remove_cache_entry(mrucache, entry); + + mrucache->stats.cache_evictions += 1; /* Update Stats */ + + /* Exit if we've freed enough memory */ + if (mrucache->mem_used <= mrucache->mem_lowerlimit) + break; + } + + return specialkey_intact; +} + +/* + * cache_lookup + * Perform a lookup to see if we've already cached results based on the + * scan's current parameters. If we find an existing entry we move it to + * the end of the LRU list, set *found to true then return it. If we + * don't find an entry then we create a new one and add it to the end of + * the LRU list. We also update cache memory accounting and remove older + * entries if we go over the memory budget. If we managed to free enough + * memory we return the new entry, else we return NULL. + * + * Callers can assume we'll never return NULL when *found is true. + */ +static MRUCacheEntry * +cache_lookup(MRUTupleCache *mrucache, bool *found) +{ + MRUCacheKey *key; + MRUCacheEntry *entry; + MemoryContext oldcontext; + + /* prepare the probe slot with the current scan parameters */ + prepare_probe_slot(mrucache, NULL); + + /* + * Add the new entry to the cache. No need to pass a valid key since the + * hash function uses mrucache's probeslot, which we populated above. + */ + entry = mrucache_insert(mrucache->hashtable, NULL, found); + + if (*found) + { + /* + * Move existing entry to the tail of the LRU list to mark it as the + * most recently used item. + */ + dlist_move_tail(&mrucache->lru_list, &entry->key->lru_node); + + return entry; + } + + oldcontext = MemoryContextSwitchTo(mrucache->tableContext); + + /* Allocate a new key */ + entry->key = key = (MRUCacheKey *) palloc(sizeof(MRUCacheKey)); + key->params = ExecCopySlotMinimalTuple(mrucache->probeslot); + + /* Update the total cache memory utilization */ + mrucache->mem_used += EMPTY_ENTRY_MEMORY_BYTES(entry); + + /* Initialize this entry */ + entry->complete = false; + entry->tuplehead = NULL; + + /* + * Since this is the most recently used entry, push this entry onto the + * end of the LRU list. + */ + dlist_push_tail(&mrucache->lru_list, &entry->key->lru_node); + + mrucache->last_tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + /* + * If we've gone over our memory budget, then we'll free up some space in + * the cache. + */ + if (mrucache->mem_used > mrucache->mem_upperlimit) + { + /* + * Try to free up some memory. It's highly unlikely that we'll fail + * to do so here since the entry we've just added is yet to contain + * any tuples and we're able to remove any other entry to reduce the + * memory consumption. + */ + if (unlikely(!cache_reduce_memory(mrucache, key))) + return NULL; + + /* + * The process of removing entries from the cache may have caused the + * code in simplehash.h to shuffle elements to earlier buckets in the + * hash table. If it has, we'll need to find the entry again by + * performing a lookup. Fortunately, we can detect if this has + * happened by seeing if the entry is still in use and that the key + * pointer matches our expected key. + */ + if (entry->status != mrucache_SH_IN_USE || entry->key != key) + { + /* + * We need to repopulate the probeslot as lookups performed during + * the cache evictions above will have stored some other key. + */ + prepare_probe_slot(mrucache, key); + + /* Re-find the newly added entry */ + entry = mrucache_lookup(mrucache->hashtable, NULL); + Assert(entry != NULL); + } + } + + return entry; +} + +/* + * cache_store_tuple + * Add the tuple stored in 'slot' to the mrucache's current cache entry. 
+ * The cache entry must have already been made with cache_lookup(). + * mrucache's last_tuple field must point to the tail of + * mrucache->entry's list of tuples. + */ +static bool +cache_store_tuple(MRUTupleCache *mrucache, TupleTableSlot *slot) +{ + MRUCacheTuple *tuple; + MRUCacheEntry *entry = mrucache->entry; + MemoryContext oldcontext; + + Assert(slot != NULL); + Assert(entry != NULL); + + oldcontext = MemoryContextSwitchTo(mrucache->tableContext); + + tuple = (MRUCacheTuple *) palloc(sizeof(MRUCacheTuple)); + tuple->mintuple = ExecCopySlotMinimalTuple(slot); + tuple->next = NULL; + + /* Account for the memory we just consumed */ + mrucache->mem_used += CACHE_TUPLE_BYTES(tuple); + + if (entry->tuplehead == NULL) + { + /* + * This is the first tuple for this entry, so just point the list head + * to it. + */ + entry->tuplehead = tuple; + } + else + { + /* push this tuple onto the tail of the list */ + /* XXX use slist? */ + mrucache->last_tuple->next = tuple; + } + + mrucache->last_tuple = tuple; + MemoryContextSwitchTo(oldcontext); + + /* + * If we've gone over our memory budget then free up some space in the + * cache. + */ + if (mrucache->mem_used > mrucache->mem_upperlimit) + { + MRUCacheKey *key = entry->key; + + if (!cache_reduce_memory(mrucache, key)) + return false; + + /* + * The process of removing entries from the cache may have caused the + * code in simplehash.h to shuffle elements to earlier buckets in the + * hash table. If it has, we'll need to find the entry again by + * performing a lookup. Fortunately, we can detect if this has + * happened by seeing if the entry is still in use and that the key + * pointer matches our expected key. + */ + if (entry->status != mrucache_SH_IN_USE || entry->key != key) + { + /* + * We need to repopulate the probeslot as lookups performed during + * the cache evictions above will have stored some other key. + */ + prepare_probe_slot(mrucache, key); + + /* Re-find the entry */ + mrucache->entry = entry = mrucache_lookup(mrucache->hashtable, + NULL); + Assert(entry != NULL); + } + } + + return true; +} + +/* + * Caller to call this after it finishes a parameterized scan + */ +void +ExecMRUTupleCacheFinishScan(MRUTupleCache *mrucache) +{ + mrucache->state = MRUCACHE_LOOKUP; + + /* nullify pointers used for the last scan */ + mrucache->entry = NULL; + mrucache->last_tuple = NULL; +} + +TupleTableSlot * +ExecMRUTupleCacheFetch(MRUTupleCache *mrucache) +{ + PlanState *plan = mrucache->subplan; + TupleTableSlot *slot; + + switch (mrucache->state) + { + case MRUCACHE_LOOKUP: + { + MRUCacheEntry *entry; + bool found; + + Assert(mrucache->entry == NULL); + + /* + * We're only ever in this state for the first call of the + * scan. Here we have a look to see if we've already seen the + * current parameters before and if we have already cached a + * complete set of records that the plan will return for these + * parameters. + * + * When we find a valid cache entry, we'll return the first + * tuple from it. If not found, we'll create a cache entry and + * then try to fetch a tuple from the plan. If we find one + * there, we'll try to cache it. + */ + + /* see if we've got anything cached for the current parameters */ + entry = cache_lookup(mrucache, &found); + + if (found && entry->complete) + { + mrucache->stats.cache_hits += 1; /* stats update */ + + /* + * Set last_tuple and entry so that the state + * MRUCACHE_FETCH_NEXT_TUPLE can easily find the next + * tuple for these parameters. 
+ */ + mrucache->last_tuple = entry->tuplehead; + mrucache->entry = entry; + + /* Fetch the first cached tuple, if there is one */ + if (entry->tuplehead) + { + mrucache->state = MRUCACHE_FETCH_NEXT_TUPLE; + + ExecClearTuple(mrucache->cachefoundslot); + slot = mrucache->cachefoundslot; + ExecStoreMinimalTuple(mrucache->last_tuple->mintuple, slot, false); + return slot; + } + else + { + /* The cache entry is void of any tuples. */ + mrucache->state = MRUCACHE_ENDOFSCAN; + return NULL; + } + } + else + { + TupleTableSlot *slot; + + mrucache->stats.cache_misses += 1; /* stats update */ + + if (found) + { + /* + * A cache entry was found, but the scan for that + * entry did not run to completion. We'll just remove + * all tuples and start again. It might be tempting + * to continue where we left off, but there's no + * guarantee the subplan will produce the tuples in + * the same order as it did last time. + */ + entry_purge_tuples(mrucache, entry); + } + + /* Scan the subplan for a tuple to cache */ + slot = ExecProcNode(plan); + if (TupIsNull(slot)) + { + /* + * cache_lookup may have returned NULL due to failure + * to free enough cache space, so ensure we don't do + * anything here that assumes it worked. There's no + * need to go into bypass mode here as we're setting + * rc_status to end of scan. + */ + if (likely(entry)) + entry->complete = true; + + mrucache->state = MRUCACHE_ENDOFSCAN; + return NULL; + } + + mrucache->entry = entry; + + /* + * If we failed to create the entry or failed to store the + * tuple in the entry, then go into bypass mode. + */ + if (unlikely(entry == NULL || + !cache_store_tuple(mrucache, slot))) + { + mrucache->stats.cache_overflows += 1; /* stats update */ + + mrucache->state = MRUCACHE_BYPASS_MODE; + + /* + * No need to clear out last_tuple as we'll stay in + * bypass mode until the end of the scan. + */ + } + else + { + /* + * If we only expect a single row from this scan then + * we can mark that we're not expecting more. This + * allows cache lookups to work even when the scan has + * not been executed to completion. + */ + entry->complete = mrucache->singlerow; + mrucache->state = MRUCACHE_FILLING; + } + + return slot; + } + } + + case MRUCACHE_FETCH_NEXT_TUPLE: + { + /* We shouldn't be in this state if these are not set */ + Assert(mrucache->entry != NULL); + Assert(mrucache->last_tuple != NULL); + + /* Skip to the next tuple to output */ + mrucache->last_tuple = mrucache->last_tuple->next; + + /* No more tuples in the cache */ + if (mrucache->last_tuple == NULL) + { + mrucache->state = MRUCACHE_ENDOFSCAN; + return NULL; + } + + ExecClearTuple(mrucache->cachefoundslot); + slot = mrucache->cachefoundslot; + ExecStoreMinimalTuple(mrucache->last_tuple->mintuple, slot, false); + return slot; + } + + case MRUCACHE_FILLING: + { + TupleTableSlot *slot; + MRUCacheEntry *entry = mrucache->entry; + + /* + * entry should already have been set in the MRUCACHE_LOOKUP + * state. + */ + Assert(entry != NULL); + + /* + * When in the MRUCACHE_FILLING state, we've just had a cache + * miss and are populating the cache with the current scan + * tuples. + */ + slot = ExecProcNode(plan); + if (TupIsNull(slot)) + { + /* No more tuples. Mark it as complete */ + entry->complete = true; + mrucache->state = MRUCACHE_ENDOFSCAN; + return NULL; + } + else + { + /* + * Validate if the planner properly set the singlerow + * flag. It should only set that if each cache entry can, + * at most, return 1 row. XXX is this worth the check? 
+ */ + if (unlikely(entry->complete)) + elog(ERROR, "cache entry already complete"); + + /* Record the tuple in the current cache entry */ + if (unlikely(!cache_store_tuple(mrucache, slot))) + { + /* Couldn't store it? Handle overflow. */ + mrucache->stats.cache_overflows += 1; /* stats update */ + + mrucache->state = MRUCACHE_BYPASS_MODE; + + /* + * No need to clear out entry or last_tuple as we'll + * stay in bypass mode until the end of the scan. + */ + } + + return slot; + } + } + + case MRUCACHE_BYPASS_MODE: + { + TupleTableSlot *slot; + + /* + * We end up in bypass mode when we're unable to fit all of + * the tuples for a given key in the cache, despite evicting + * everything else from the cache. + * + * We just continue to read tuples without caching. We need + * to wait until the next rescan before we can come out of + * this mode. Perhaps the tuples for the next lookup key will + * fit. + */ + slot = ExecProcNode(plan); + if (TupIsNull(slot)) + { + mrucache->state = MRUCACHE_ENDOFSCAN; + return NULL; + } + + return slot; + } + + case MRUCACHE_ENDOFSCAN: + + /* + * We've already returned NULL for this scan, but just in case + * something call us again by mistake. + */ + return NULL; + + default: + elog(ERROR, "unrecognized mrucache state: %d", + (int) mrucache->state); + return NULL; + } /* switch */ +} + +/* + * ExecMRUTupleCacheInit + * Builds and returns a MRUTupleCache struct to allow caching of tuples + * from 'cache_planstate'. + * + * 'planstate' the parent plan node that we're performing caching for. + * 'cache_planstate' the sub-node of 'planstate' that we're to cache tuples + * from. + * 'param_exprs' the cache key parameters + * 'hashOperators' the operators for the hash functions to use to hash the + * cache key exprs. Must have list_length(param_exprs) elements. + * 'collations' collations for cache key exprs. Must have + * list_length(param_exprs) elements. + * 'memory_limit_bytes' the number of bytes to limit the size of the cache to. + * 'est_entries' the estimated number of entries we expect to cache. Or 0 if + * unknown. + * 'singlerow' if true, mark the cache entry as complete after fetching the + * first tuple. Some callers may wish to pass this as true if they only + * need to fetch 1 tuple and would like the cache entry for that 1 tuple + * to become valid after the first tuple is fetched. 
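+ *
+ * The expected calling pattern, as used by nodeNestloop.c, is roughly:
+ *
+ *		mrucache = ExecMRUTupleCacheInit(...);		(at executor startup)
+ *
+ *		for each set of parameter values:
+ *			slot = ExecMRUTupleCacheFetch(mrucache);	(repeat until NULL, or
+ *														 until the caller has
+ *														 seen enough tuples)
+ *			...
+ *			ExecMRUTupleCacheFinishScan(mrucache);	(before moving on to the
+ *													 next parameter values)
+ *
+ *		ExecMRUTupleCacheCleanup(mrucache);			(at executor shutdown)
+ *
+ * A returned slot equal to mrucache->cachefoundslot indicates that the
+ * tuple came from the cache rather than from the subplan.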
+ */ +MRUTupleCache * +ExecMRUTupleCacheInit(PlanState *planstate, PlanState *cache_planstate, + List *param_exprs, Oid *hashOperators, Oid *collations, + uint64 memory_limit_bytes, int est_entries, + bool singlerow) +{ + MRUTupleCache *mrucache = (MRUTupleCache *) palloc0(sizeof(MRUTupleCache)); + int i; + int nkeys; + Oid *eqfuncoids; + + mrucache->subplan = cache_planstate; + mrucache->ps_ExprContext = CreateExprContext(planstate->state); + mrucache->state = MRUCACHE_LOOKUP; + + mrucache->nkeys = nkeys = list_length(param_exprs); + mrucache->hashkeydesc = ExecTypeFromExprList(param_exprs); + mrucache->tableslot = MakeSingleTupleTableSlot(mrucache->hashkeydesc, + &TTSOpsMinimalTuple); + mrucache->cachefoundslot = MakeSingleTupleTableSlot(cache_planstate->ps_ResultTupleDesc, + &TTSOpsMinimalTuple); + mrucache->probeslot = MakeSingleTupleTableSlot(mrucache->hashkeydesc, + &TTSOpsVirtual); + + mrucache->param_exprs = (ExprState **) palloc(nkeys * sizeof(ExprState *)); + mrucache->collations = collations; + mrucache->hashfunctions = (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo)); + + eqfuncoids = (Oid *) palloc(nkeys * sizeof(Oid)); + + for (i = 0; i < nkeys; i++) + { + Oid hashop = hashOperators[i]; + Oid left_hashfn; + Oid right_hashfn; + Expr *param_expr = (Expr *) list_nth(param_exprs, i); + + if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn)) + elog(ERROR, "could not find hash function for hash operator %u", + hashop); + + fmgr_info(left_hashfn, &mrucache->hashfunctions[i]); + + mrucache->param_exprs[i] = ExecInitExpr(param_expr, (PlanState *) planstate); + eqfuncoids[i] = get_opcode(hashop); + } + + mrucache->cache_eq_expr = ExecBuildParamSetEqual(mrucache->hashkeydesc, + &TTSOpsMinimalTuple, + eqfuncoids, + collations, + param_exprs, + (PlanState *) planstate); + + pfree(eqfuncoids); + mrucache->mem_used = 0; + + /* Limit the total memory consumed by the cache to this */ + mrucache->mem_upperlimit = memory_limit_bytes; + + /* + * Set the lower limit to something a bit less than the upper limit so + * that we don't have to evict tuples every time we need to add a new one + * after the cache has filled. We don't make it too much smaller as we'd + * like to keep as much in the cache as possible. + */ + mrucache->mem_lowerlimit = mrucache->mem_upperlimit * 0.98; + + /* A memory context dedicated for the cache */ + mrucache->tableContext = AllocSetContextCreate(CurrentMemoryContext, + "MRUCacheHashTable", + ALLOCSET_DEFAULT_SIZES); + + dlist_init(&mrucache->lru_list); + mrucache->last_tuple = NULL; + mrucache->entry = NULL; + + /* + * Mark if we can assume the cache entry is completed after we get the + * first record for it. Some callers might not call us again after + * getting the first match. e.g. A join operator performing a unique join + * is able to skip to the next outer tuple after getting the first + * matching inner tuple. In this case, the cache entry is complete after + * getting the first tuple. This allows us to mark it as so. + */ + mrucache->singlerow = singlerow; + + /* Zero the statistics counters */ + memset(&mrucache->stats, 0, sizeof(MRUCacheInstrumentation)); + + /* + * Allocate and set up the actual cache. We'll just use 1024 buckets if + * the caller did not specify an estimate. + */ + build_hash_table(mrucache, est_entries > 0 ? 
est_entries : + 1024); + + return mrucache; +} + +void +ExecMRUTupleCacheCleanup(MRUTupleCache *mrucache) +{ + /* Remove the cache context */ + MemoryContextDelete(mrucache->tableContext); + + ExecClearTuple(mrucache->cachefoundslot); + FreeExprContext(mrucache->ps_ExprContext, false); +} + +/* + * ExecEstimateMRUCacheEntryOverheadBytes + * For use in the query planner to help it estimate the amount of memory + * required to store a single entry in the cache. + */ +double +ExecEstimateMRUCacheEntryOverheadBytes(double ntuples) +{ + return sizeof(MRUCacheEntry) + sizeof(MRUCacheKey) + + sizeof(MRUCacheTuple) * ntuples; +} diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index befde52691..38973b1591 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -35,6 +35,7 @@ #include "executor/nodeIncrementalSort.h" #include "executor/nodeIndexonlyscan.h" #include "executor/nodeIndexscan.h" +#include "executor/nodeNestloop.h" #include "executor/nodeSeqscan.h" #include "executor/nodeSort.h" #include "executor/nodeSubplan.h" @@ -276,6 +277,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) ExecHashJoinEstimate((HashJoinState *) planstate, e->pcxt); break; + case T_NestLoopState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecNestLoopEstimate((NestLoopState *) planstate, e->pcxt); + break; case T_HashState: /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecHashEstimate((HashState *) planstate, e->pcxt); @@ -496,6 +501,10 @@ ExecParallelInitializeDSM(PlanState *planstate, ExecHashJoinInitializeDSM((HashJoinState *) planstate, d->pcxt); break; + case T_NestLoopState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecNestLoopInitializeDSM((NestLoopState *) planstate, d->pcxt); + break; case T_HashState: /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecHashInitializeDSM((HashState *) planstate, d->pcxt); @@ -985,6 +994,7 @@ ExecParallelReInitializeDSM(PlanState *planstate, ExecHashJoinReInitializeDSM((HashJoinState *) planstate, pcxt); break; + case T_NestLoopState: case T_HashState: case T_SortState: case T_IncrementalSortState: @@ -1045,6 +1055,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, /* Perform any node-type-specific work that needs to be done. 
*/ switch (nodeTag(planstate)) { + case T_NestLoopState: + ExecNestLoopRetrieveInstrumentation((NestLoopState *) planstate); + break; case T_SortState: ExecSortRetrieveInstrumentation((SortState *) planstate); break; @@ -1332,6 +1345,10 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) ExecHashJoinInitializeWorker((HashJoinState *) planstate, pwcxt); break; + case T_NestLoopState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecNestLoopInitializeWorker((NestLoopState *) planstate, pwcxt); + break; case T_HashState: /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecHashInitializeWorker((HashState *) planstate, pwcxt); diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c index b07c2996d4..fbefc127b2 100644 --- a/src/backend/executor/nodeNestloop.c +++ b/src/backend/executor/nodeNestloop.c @@ -21,11 +21,43 @@ #include "postgres.h" +#include "executor/execMRUTupleCache.h" #include "executor/execdebug.h" +#include "executor/nodeHash.h" #include "executor/nodeNestloop.h" #include "miscadmin.h" #include "utils/memutils.h" +static inline TupleTableSlot * +FetchInnerTuple(NestLoopState *nlstate, PlanState *innerPlan) +{ + MRUTupleCache *mrucache = nlstate->nl_mrucache; + + /* No caching? Just exec the inner node */ + if (mrucache == NULL) + return ExecProcNode(innerPlan); + + /* Otherwise let the cache deal with it */ + else + { + TupleTableSlot *slot = ExecMRUTupleCacheFetch(mrucache); + + if (slot == mrucache->cachefoundslot) + { + nlstate->js.ps.ps_ProjInfo = nlstate->ps_CacheProjInfo; + nlstate->js.ps.qual = nlstate->ps_CacheQual; + nlstate->js.joinqual = nlstate->ps_CacheJoinqual; + } + else + { + nlstate->js.ps.ps_ProjInfo = nlstate->ps_ScanProjInfo; + nlstate->js.ps.qual = nlstate->ps_ScanQual; + nlstate->js.joinqual = nlstate->ps_ScanJoinqual; + } + return slot; + } +} + /* ---------------------------------------------------------------- * ExecNestLoop(node) @@ -66,8 +98,6 @@ ExecNestLoop(PlanState *pstate) PlanState *outerPlan; TupleTableSlot *outerTupleSlot; TupleTableSlot *innerTupleSlot; - ExprState *joinqual; - ExprState *otherqual; ExprContext *econtext; ListCell *lc; @@ -79,8 +109,6 @@ ExecNestLoop(PlanState *pstate) ENL1_printf("getting info from node"); nl = (NestLoop *) node->js.ps.plan; - joinqual = node->js.joinqual; - otherqual = node->js.ps.qual; outerPlan = outerPlanState(node); innerPlan = innerPlanState(node); econtext = node->js.ps.ps_ExprContext; @@ -150,6 +178,14 @@ ExecNestLoop(PlanState *pstate) */ ENL1_printf("rescanning inner plan"); ExecReScan(innerPlan); + + /* + * When using an MRU cache, reset the state ready for another + * lookup. 
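+			 * Note that this only resets the cache's per-scan state (the
+			 * current entry and position); the cached tuples themselves are
+			 * kept, which is what allows a later outer tuple with the same
+			 * parameter values to be answered from the cache without having
+			 * to execute the inner plan again.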
+ */ + if (node->nl_mrucache) + ExecMRUTupleCacheFinishScan(node->nl_mrucache); + } /* @@ -157,7 +193,7 @@ ExecNestLoop(PlanState *pstate) */ ENL1_printf("getting new inner tuple"); - innerTupleSlot = ExecProcNode(innerPlan); + innerTupleSlot = FetchInnerTuple(node, innerPlan); econtext->ecxt_innertuple = innerTupleSlot; if (TupIsNull(innerTupleSlot)) @@ -180,7 +216,7 @@ ExecNestLoop(PlanState *pstate) ENL1_printf("testing qualification for outer-join tuple"); - if (otherqual == NULL || ExecQual(otherqual, econtext)) + if (node->js.ps.qual == NULL || ExecQual(node->js.ps.qual, econtext)) { /* * qualification was satisfied so we project and return @@ -211,7 +247,7 @@ ExecNestLoop(PlanState *pstate) */ ENL1_printf("testing qualification"); - if (ExecQual(joinqual, econtext)) + if (ExecQual(node->js.joinqual, econtext)) { node->nl_MatchedOuter = true; @@ -230,7 +266,7 @@ ExecNestLoop(PlanState *pstate) if (node->js.single_match) node->nl_NeedNewOuter = true; - if (otherqual == NULL || ExecQual(otherqual, econtext)) + if (node->js.ps.qual == NULL || ExecQual(node->js.ps.qual, econtext)) { /* * qualification was satisfied so we project and return the @@ -306,15 +342,18 @@ ExecInitNestLoop(NestLoop *node, EState *estate, int eflags) */ ExecInitResultTupleSlotTL(&nlstate->js.ps, &TTSOpsVirtual); ExecAssignProjectionInfo(&nlstate->js.ps, NULL); + nlstate->ps_ScanProjInfo = nlstate->js.ps.ps_ProjInfo; /* * initialize child expressions */ nlstate->js.ps.qual = ExecInitQual(node->join.plan.qual, (PlanState *) nlstate); + nlstate->ps_ScanQual = nlstate->js.ps.qual; nlstate->js.jointype = node->join.jointype; nlstate->js.joinqual = ExecInitQual(node->join.joinqual, (PlanState *) nlstate); + nlstate->ps_ScanJoinqual = nlstate->js.joinqual; /* * detect whether we need only consider the first matching inner tuple @@ -346,12 +385,59 @@ ExecInitNestLoop(NestLoop *node, EState *estate, int eflags) nlstate->nl_NeedNewOuter = true; nlstate->nl_MatchedOuter = false; + /* Setup the MRU cache, if enabled */ + if (node->mrucache) + { + nlstate->nl_mrucache = ExecMRUTupleCacheInit((PlanState *) nlstate, + (PlanState *) innerPlanState(nlstate), + node->param_exprs, + node->hashOperators, + node->collations, + get_hash_mem() * 1024L, + node->est_entries, + node->singlerow); + + /* + * Create a seperate Projection info for projecting from the slots + * belonging to the result cache. 
+ */ + if (nlstate->js.ps.innerops != &TTSOpsMinimalTuple) + { + const TupleTableSlotOps *ttsops = nlstate->js.ps.innerops; + bool inneropsset = nlstate->js.ps.inneropsset; + + nlstate->js.ps.innerops = &TTSOpsMinimalTuple; + nlstate->js.ps.inneropsset = true; + + nlstate->ps_CacheProjInfo = ExecBuildProjectionInfo(nlstate->js.ps.plan->targetlist, + nlstate->js.ps.ps_ExprContext, + nlstate->js.ps.ps_ResultTupleSlot, + &nlstate->js.ps, + NULL); + + nlstate->ps_CacheQual = + ExecInitQual(node->join.plan.qual, (PlanState *) nlstate); + nlstate->ps_CacheJoinqual = + ExecInitQual(node->join.joinqual, (PlanState *) nlstate); + + /* Restore original values */ + nlstate->js.ps.innerops = ttsops; + nlstate->js.ps.inneropsset = inneropsset; + } + } + else + { + nlstate->nl_mrucache = NULL; + nlstate->ps_CacheProjInfo = NULL; + } + NL1_printf("ExecInitNestLoop: %s\n", "node initialized"); return nlstate; } + /* ---------------------------------------------------------------- * ExecEndNestLoop * @@ -380,6 +466,29 @@ ExecEndNestLoop(NestLoopState *node) ExecEndNode(outerPlanState(node)); ExecEndNode(innerPlanState(node)); + if (node->nl_mrucache != NULL) + { + /* + * When ending a parallel worker, copy the statistics gathered by the + * worker back into shared memory so that it can be picked up by the main + * process to report in EXPLAIN ANALYZE. + */ + if (node->shared_info && IsParallelWorker()) + { + MRUCacheInstrumentation *si; + + /* Make mem_peak available for EXPLAIN */ + if (node->nl_mrucache->stats.mem_peak == 0) + node->nl_mrucache->stats.mem_peak = node->nl_mrucache->mem_used; + + Assert(ParallelWorkerNumber <= node->shared_info->num_workers); + si = &node->shared_info->sinstrument[ParallelWorkerNumber]; + memcpy(si, &node->nl_mrucache->stats, sizeof(MRUCacheInstrumentation)); + } + + ExecMRUTupleCacheCleanup(node->nl_mrucache); + } + NL1_printf("ExecEndNestLoop: %s\n", "node processing ended"); } @@ -400,6 +509,8 @@ ExecReScanNestLoop(NestLoopState *node) if (outerPlan->chgParam == NULL) ExecReScan(outerPlan); + if (node->nl_mrucache != NULL) + ExecMRUTupleCacheFinishScan(node->nl_mrucache); /* * innerPlan is re-scanned for each new outer tuple and MUST NOT be * re-scanned from here or you'll get troubles from inner index scans when @@ -409,3 +520,89 @@ ExecReScanNestLoop(NestLoopState *node) node->nl_NeedNewOuter = true; node->nl_MatchedOuter = false; } + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + + /* ---------------------------------------------------------------- + * ExecNestLoopEstimate + * + * Estimate space required to propagate nested loop statistics. + * ---------------------------------------------------------------- + */ +void +ExecNestLoopEstimate(NestLoopState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->js.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(MRUCacheInstrumentation)); + size = add_size(size, offsetof(SharedMRUCacheInfo, sinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecNestLoopInitializeDSM + * + * Initialize DSM space for nested loop statistics. 
+ * ---------------------------------------------------------------- + */ +void +ExecNestLoopInitializeDSM(NestLoopState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->js.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedMRUCacheInfo, sinstrument) + + pcxt->nworkers * sizeof(MRUCacheInstrumentation); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->js.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecNestLoopInitializeWorker + * + * Attach worker to DSM space for nested loop statistics. + * ---------------------------------------------------------------- + */ +void +ExecNestLoopInitializeWorker(NestLoopState *node, ParallelWorkerContext *pwcxt) +{ + node->shared_info = + shm_toc_lookup(pwcxt->toc, node->js.ps.plan->plan_node_id, true); +} + +/* ---------------------------------------------------------------- + * ExecNestLoopRetrieveInstrumentation + * +* Transfer nested loop statistics from DSM to private memory. + * ---------------------------------------------------------------- + */ +void +ExecNestLoopRetrieveInstrumentation(NestLoopState *node) +{ + Size size; + SharedMRUCacheInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedMRUCacheInfo, sinstrument) + + node->shared_info->num_workers * sizeof(MRUCacheInstrumentation); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 2b4d7654cc..fa28ad7b1b 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -837,7 +837,6 @@ static NestLoop * _copyNestLoop(const NestLoop *from) { NestLoop *newnode = makeNode(NestLoop); - /* * copy node superclass fields */ @@ -847,6 +846,13 @@ _copyNestLoop(const NestLoop *from) * copy remainder of node */ COPY_NODE_FIELD(nestParams); + COPY_SCALAR_FIELD(numKeys); + COPY_POINTER_FIELD(hashOperators, from->numKeys * sizeof(Oid)); + COPY_POINTER_FIELD(collations, from->numKeys * sizeof(Oid)); + COPY_NODE_FIELD(param_exprs); + COPY_SCALAR_FIELD(mrucache); + COPY_SCALAR_FIELD(singlerow); + COPY_SCALAR_FIELD(est_entries); return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 08a049232e..e4bbc688b6 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -732,6 +732,13 @@ _outNestLoop(StringInfo str, const NestLoop *node) _outJoinPlanInfo(str, (const Join *) node); WRITE_NODE_FIELD(nestParams); + WRITE_INT_FIELD(numKeys); + WRITE_OID_ARRAY(hashOperators, node->numKeys); + WRITE_OID_ARRAY(collations, node->numKeys); + WRITE_NODE_FIELD(param_exprs); + WRITE_BOOL_FIELD(mrucache); + WRITE_BOOL_FIELD(singlerow); + WRITE_INT_FIELD(est_entries); } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index ab7b535caa..b62ac16cb4 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -2088,6 +2088,13 @@ _readNestLoop(void) ReadCommonJoin(&local_node->join); READ_NODE_FIELD(nestParams); + READ_INT_FIELD(numKeys); + READ_OID_ARRAY(hashOperators, local_node->numKeys); + READ_OID_ARRAY(collations, local_node->numKeys); + READ_NODE_FIELD(param_exprs); + READ_BOOL_FIELD(mrucache); + 
READ_BOOL_FIELD(singlerow); + READ_INT_FIELD(est_entries); READ_DONE(); } diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index a0877e2be4..6a29143575 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -76,6 +76,7 @@ #include "access/amapi.h" #include "access/htup_details.h" #include "access/tsmapi.h" +#include "executor/execMRUTupleCache.h" #include "executor/executor.h" #include "executor/nodeAgg.h" #include "executor/nodeHash.h" @@ -138,6 +139,7 @@ bool enable_sort = true; bool enable_incremental_sort = true; bool enable_hashagg = true; bool enable_nestloop = true; +bool enable_cachednestloop = true; bool enable_material = true; bool enable_mergejoin = true; bool enable_hashjoin = true; @@ -2736,10 +2738,11 @@ initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace, void final_cost_nestloop(PlannerInfo *root, NestPath *path, JoinCostWorkspace *workspace, - JoinPathExtraData *extra) + JoinPathExtraData *extra, + bool enabled) { - Path *outer_path = path->outerjoinpath; - Path *inner_path = path->innerjoinpath; + Path *outer_path = path->jpath.outerjoinpath; + Path *inner_path = path->jpath.innerjoinpath; double outer_path_rows = outer_path->rows; double inner_path_rows = inner_path->rows; Cost startup_cost = workspace->startup_cost; @@ -2754,18 +2757,18 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, if (inner_path_rows <= 0) inner_path_rows = 1; /* Mark the path with the correct row estimate */ - if (path->path.param_info) - path->path.rows = path->path.param_info->ppi_rows; + if (path->jpath.path.param_info) + path->jpath.path.rows = path->jpath.path.param_info->ppi_rows; else - path->path.rows = path->path.parent->rows; + path->jpath.path.rows = path->jpath.path.parent->rows; /* For partial paths, scale row estimate. */ - if (path->path.parallel_workers > 0) + if (path->jpath.path.parallel_workers > 0) { - double parallel_divisor = get_parallel_divisor(&path->path); + double parallel_divisor = get_parallel_divisor(&path->jpath.path); - path->path.rows = - clamp_row_est(path->path.rows / parallel_divisor); + path->jpath.path.rows = + clamp_row_est(path->jpath.path.rows / parallel_divisor); } /* @@ -2773,12 +2776,12 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, * would amount to optimizing for the case where the join method is * disabled, which doesn't seem like the way to bet. 
*/ - if (!enable_nestloop) + if (!enabled) startup_cost += disable_cost; /* cost of inner-relation source data (we already dealt with outer rel) */ - if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI || + if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI || extra->inner_unique) { /* @@ -2896,17 +2899,240 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, } /* CPU costs */ - cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root); + cost_qual_eval(&restrict_qual_cost, path->jpath.joinrestrictinfo, root); startup_cost += restrict_qual_cost.startup; cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple; run_cost += cpu_per_tuple * ntuples; /* tlist eval costs are paid per output row, not per tuple scanned */ - startup_cost += path->path.pathtarget->cost.startup; - run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows; + startup_cost += path->jpath.path.pathtarget->cost.startup; + run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows; - path->path.startup_cost = startup_cost; - path->path.total_cost = startup_cost + run_cost; + path->jpath.path.startup_cost = startup_cost; + path->jpath.path.total_cost = startup_cost + run_cost; +} + +/* + * initial_cost_cached_nestloop + * Preliminary estimate of the cost of a cached nestloop join path. + * + * This must quickly produce lower-bound estimates of the path's startup and + * total costs. If we are unable to eliminate the proposed path from + * consideration using the lower bounds, final_cost_cached_nestloop will be + * called to obtain the final estimates. + * + * The exact division of labor between this function and + * final_cost_cached_nestloop is private to them, and represents a tradeoff + * between speed of the initial estimate and getting a tight lower bound. We + * choose to not examine the join quals here, since that's by far the most + * expensive part of the calculations. The end result is that CPU-cost + * considerations must be left for the second phase; and for SEMI/ANTI joins, + * we must also postpone incorporation of the inner path's run cost. + * + * 'workspace' is to be filled with startup_cost, total_cost, and perhaps + * other data to be used by final_cost_nestloop + * 'jointype' is the type of join to be performed + * 'outer_path' is the outer input to the join + * 'inner_path' is the inner input to the join + * 'extra' contains miscellaneous information about the join + * 'param_exprs' contains the list of exprs that the inner_path is + * parameterized by. + * + * Returns the estimated number of entries which can be stored in the cache at + * a time. 
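+ *
+ * As a purely illustrative example (numbers invented rather than taken from
+ * any test): with outer_path_rows = 1000, ndistinct = 100 and
+ * est_cache_entries = 80, the formulas below give
+ *		evict_ratio = 1.0 - 80 / 100 = 0.2
+ *		hit_ratio = (1.0 / 100) * 80 - (100 / 1000) = 0.7
+ * i.e. roughly 70% of inner rescans are costed as cache hits, and the
+ * eviction overheads are charged against about 20% of rescans.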
+ */
+int
+initial_cost_cached_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
+							 JoinType jointype,
+							 Path *outer_path, Path *inner_path,
+							 JoinPathExtraData *extra, List *param_exprs)
+{
+	Cost		startup_cost = 0;
+	Cost		run_cost = 0;
+	double		outer_path_rows = outer_path->rows;
+	double		inner_path_rows = inner_path->rows;
+	Cost		inner_rescan_start_cost;
+	Cost		inner_rescan_total_cost;
+	Cost		input_total_cost = inner_path->total_cost;
+	Cost		input_startup_cost = inner_path->startup_cost;
+	Cost		inner_run_cost;
+	Cost		inner_rescan_run_cost;
+	int			width = inner_path->pathtarget->width;
+	int			flags;
+
+	double		hash_mem_bytes;
+	double		est_entry_bytes;
+	double		est_cache_entries;
+	double		ndistinct;
+	double		evict_ratio;
+	double		hit_ratio;
+	int			est_entries;
+
+	/* Protect some assumptions below that rowcounts aren't zero */
+	if (outer_path_rows <= 0)
+		outer_path_rows = 1;
+	if (inner_path_rows <= 0)
+		inner_path_rows = 1;
+
+	/* available cache space */
+	hash_mem_bytes = get_hash_mem() * 1024L;
+
+	/*
+	 * Set the number of bytes each cache entry should consume in the cache.
+	 * To give us better estimates of how many cache entries we can store at
+	 * once, we make a call to the executor here to ask it what memory
+	 * overheads there are for a single cache entry.
+	 *
+	 * XXX we also store the cache key, but that's not accounted for here.
+	 */
+	est_entry_bytes = relation_byte_size(inner_path_rows, width) +
+		ExecEstimateMRUCacheEntryOverheadBytes(inner_path_rows);
+
+	/* estimate of the upper limit of cache entries we can hold at once */
+	est_cache_entries = floor(hash_mem_bytes / est_entry_bytes);
+
+	/* estimate of the number of distinct parameter values */
+	ndistinct = estimate_num_groups(root, param_exprs, outer_path_rows, NULL,
+									&flags);
+
+	/*
+	 * When the estimation fell back on using a default value, it's a bit too
+	 * risky to assume that it's ok to use a cached nested loop.  The use of a
+	 * default could cause us to choose this plan type when it's really
+	 * inappropriate to do so.  If we see that this has been done then we'll
+	 * assume that every call will have unique parameters, which will almost
+	 * certainly mean the cached nested loop will never survive add_path().
+	 */
+	if ((flags & SELFLAG_USED_DEFAULT) != 0)
+		ndistinct = outer_path_rows;
+
+	/*
+	 * Since we've already estimated the maximum number of entries we can
+	 * store at once and know the estimated number of distinct values we'll
+	 * be called with, we'll take this opportunity to determine est_entries.
+	 * This will ultimately determine the initial size of the hash table that
+	 * the executor will use.  If we leave this at zero the executor will just
+	 * choose the size itself.  Really this is not the right place to do this,
+	 * but it's convenient since we already have the ndistinct estimate and an
+	 * estimate on the number of entries that will fit based on
+	 * hash_mem_bytes.
+	 */
+	est_entries = Min(Min(ndistinct, est_cache_entries),
+					  PG_UINT32_MAX);
+
+	/*
+	 * When the number of distinct parameter values is above the amount we can
+	 * store in the cache, we'll have to evict some entries from the cache.
+	 * This is not free, so here we estimate how often we'll incur the cost of
+	 * that eviction.
+	 */
+	evict_ratio = 1.0 - Min(est_cache_entries, ndistinct) / ndistinct;
+
+	/*
+	 * In order to estimate how costly a single scan will be, we need to
+	 * attempt to estimate what the cache hit ratio will be. 
To do that we
+	 * must look at how many scans are estimated in total for this node and
+	 * how many of those scans we expect to get a cache hit.
+	 */
+	hit_ratio = 1.0 / ndistinct * Min(est_cache_entries, ndistinct) -
+		(ndistinct / outer_path_rows);
+
+	/* Ensure we don't go negative */
+	hit_ratio = Max(hit_ratio, 0);
+
+	/*
+	 * Set the inner_rescan_total_cost accounting for the expected cache hit
+	 * ratio.  We also add on a cpu_operator_cost to account for a cache
+	 * lookup.  This happens regardless of whether the lookup is a cache hit.
+	 */
+	inner_rescan_total_cost = input_total_cost * (1.0 - hit_ratio) + cpu_operator_cost;
+
+	/* Now adjust the total cost to account for cache evictions */
+
+	/* Charge a cpu_tuple_cost for evicting the actual cache entry */
+	inner_rescan_total_cost += cpu_tuple_cost * evict_ratio;
+
+	/*
+	 * Charge a 10th of cpu_operator_cost to evict every tuple in that entry.
+	 * The per-tuple eviction is really just a pfree, so charging a whole
+	 * cpu_operator_cost seems a little excessive.
+	 */
+	inner_rescan_total_cost += cpu_operator_cost / 10.0 * evict_ratio * inner_path_rows;
+
+	/*
+	 * Getting the first row must also be proportioned according to the
+	 * expected cache hit ratio.
+	 */
+	inner_rescan_start_cost = input_startup_cost * (1.0 - hit_ratio);
+
+	/*
+	 * NOTE: clearly, we must pay both outer and inner paths' startup_cost
+	 * before we can start returning tuples, so the join's startup cost is
+	 * their sum.  We'll also pay the inner path's rescan startup cost
+	 * multiple times.
+	 */
+	startup_cost += outer_path->startup_cost + inner_path->startup_cost;
+	run_cost += outer_path->total_cost - outer_path->startup_cost;
+	if (outer_path_rows > 1)
+		run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
+
+	inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
+	inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
+
+	if (jointype == JOIN_SEMI || jointype == JOIN_ANTI ||
+		extra->inner_unique)
+	{
+		/*
+		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
+		 * executor will stop after the first match.
+		 *
+		 * Getting decent estimates requires inspection of the join quals,
+		 * which we choose to postpone to final_cost_cached_nestloop.
+		 */
+
+		/* Save private data for final_cost_cached_nestloop */
+		workspace->inner_run_cost = inner_run_cost;
+		workspace->inner_rescan_run_cost = inner_rescan_run_cost;
+	}
+	else
+	{
+		/* Normal case; we'll scan whole input rel for each outer row */
+		run_cost += inner_run_cost;
+		if (outer_path_rows > 1)
+			run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
+	}
+
+	/* CPU costs left for later */
+
+	/* Public result fields */
+	workspace->startup_cost = startup_cost;
+	workspace->total_cost = startup_cost + run_cost;
+	/* Save private data for final_cost_cached_nestloop */
+	workspace->run_cost = run_cost;
+
+	return est_entries;
+}
+
+/*
+ * final_cost_cached_nestloop
+ *	  Final estimate of the cost and result size of a cached nestloop join
+ *	  path.
+ *
+ * 'path' is already filled in except for the rows and cost fields
+ * 'workspace' is the result from initial_cost_cached_nestloop
+ * 'extra' contains miscellaneous information about the join
+ */
+void
+final_cost_cached_nestloop(PlannerInfo *root, NestPath *path,
+						   JoinCostWorkspace *workspace,
+						   JoinPathExtraData *extra)
+{
+	/*
+	 * The final costing is identical to final_cost_nestloop's.  We pass true
+	 * for 'enabled', as we wouldn't have got here if enable_cachednestloop
+	 * was false. 
+ */ + final_cost_nestloop(root, path, workspace, extra, true); } /* @@ -4502,14 +4728,14 @@ compute_semi_anti_join_factors(PlannerInfo *root, static bool has_indexed_join_quals(NestPath *joinpath) { - Relids joinrelids = joinpath->path.parent->relids; - Path *innerpath = joinpath->innerjoinpath; + Relids joinrelids = joinpath->jpath.path.parent->relids; + Path *innerpath = joinpath->jpath.innerjoinpath; List *indexclauses; bool found_one; ListCell *lc; /* If join still has quals to evaluate, it's not fast */ - if (joinpath->joinrestrictinfo != NIL) + if (joinpath->jpath.joinrestrictinfo != NIL) return false; /* Nor if the inner path isn't parameterized at all */ if (innerpath->param_info == NULL) diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index db54a6ba2e..62572ab050 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -18,10 +18,15 @@ #include "executor/executor.h" #include "foreign/fdwapi.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" #include "optimizer/cost.h" +#include "optimizer/optimizer.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/planmain.h" +#include "utils/selfuncs.h" +#include "utils/typcache.h" /* Hook for plugins to get control in add_paths_to_joinrel() */ set_join_pathlist_hook_type set_join_pathlist_hook = NULL; @@ -52,6 +57,9 @@ static void try_partial_mergejoin_path(PlannerInfo *root, static void sort_inner_and_outer(PlannerInfo *root, RelOptInfo *joinrel, RelOptInfo *outerrel, RelOptInfo *innerrel, JoinType jointype, JoinPathExtraData *extra); +static inline bool clause_sides_match_join(RestrictInfo *rinfo, + RelOptInfo *outerrel, + RelOptInfo *innerrel); static void match_unsorted_outer(PlannerInfo *root, RelOptInfo *joinrel, RelOptInfo *outerrel, RelOptInfo *innerrel, JoinType jointype, JoinPathExtraData *extra); @@ -163,6 +171,11 @@ add_paths_to_joinrel(PlannerInfo *root, { case JOIN_SEMI: case JOIN_ANTI: + + /* + * XXX it may be worth proving this to allow cached nested + * semi/anti join loops to be considered + */ extra.inner_unique = false; /* well, unproven */ break; case JOIN_UNIQUE_INNER: @@ -354,6 +367,188 @@ allow_star_schema_join(PlannerInfo *root, bms_nonempty_difference(inner_paramrels, outerrelids)); } +/* + * paraminfo_get_equal_hashops + * Determine if it's valid to use a cached nested loop join. + * + * Additionally we also fetch outer side exprs and check for valid hashable + * equality operator for each outer expr. Returns true and sets the + *'param_exprs' and 'operators' output parameters if the caching is possible. + */ +static bool +paraminfo_get_equal_hashops(ParamPathInfo *param_info, List **param_exprs, + List **operators, RelOptInfo *outerrel, + RelOptInfo *innerrel) +{ + TypeCacheEntry *typentry; + ListCell *lc; + + /* + * We can't use a result cache if there are volatile functions in the + * inner rel's target list or restrict list. A cache hit could reduce the + * number of calls to these functions. + * + * XXX Think about this harder. Any other restrictions to add here? 
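+	 *
+	 * For example (an invented qual, purely for illustration), an inner
+	 * restriction such as "somecol < random()" could qualify a different set
+	 * of rows on each rescan, so replaying the cached tuples would silently
+	 * change the join's results.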
+ */ + if (contain_volatile_functions((Node *) innerrel->reltarget->exprs)) + return false; + + foreach(lc, innerrel->baserestrictinfo) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + + if (contain_volatile_functions((Node *) rinfo->clause)) + return false; + } + + *param_exprs = NIL; + *operators = NIL; + + + if (param_info != NULL) + { + List *clauses = param_info->ppi_clauses; + + foreach(lc, clauses) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + OpExpr *opexpr; + Node *expr; + + opexpr = (OpExpr *) rinfo->clause; + + /* ppi_clauses should always meet this requirement */ + if (!IsA(opexpr, OpExpr) || list_length(opexpr->args) != 2 || + !clause_sides_match_join(rinfo, outerrel, innerrel)) + { + list_free(*operators); + list_free(*param_exprs); + return false; + } + + if (rinfo->outer_is_left) + expr = (Node *) list_nth(opexpr->args, 0); + else + expr = (Node *) list_nth(opexpr->args, 1); + + typentry = lookup_type_cache(exprType(expr), + TYPECACHE_HASH_PROC | TYPECACHE_EQ_OPR); + + /* XXX will eq_opr ever be invalid if hash_proc isn't? */ + if (!OidIsValid(typentry->hash_proc) || !OidIsValid(typentry->eq_opr)) + { + list_free(*operators); + list_free(*param_exprs); + return false; + } + + *operators = lappend_oid(*operators, typentry->eq_opr); + *param_exprs = lappend(*param_exprs, expr); + } + } + + /* Now add any lateral vars to the cache key too */ + foreach(lc, innerrel->lateral_vars) + { + Node *expr = (Node *) lfirst(lc); + Relids var_relids = NULL; + + if (IsA(expr, Var)) + var_relids = bms_make_singleton(((Var *) expr)->varno); + else if (IsA(expr, PlaceHolderVar)) + var_relids = pull_varnos((Node *) ((PlaceHolderVar *) expr)->phexpr); + else + Assert(false); + + /* No need for lateral vars that are from the innerrel itself */ + /* XXX can this actually happen? */ + if (bms_overlap(var_relids, innerrel->relids)) + { + bms_free(var_relids); + continue; + } + bms_free(var_relids); + + typentry = lookup_type_cache(exprType(expr), + TYPECACHE_HASH_PROC | TYPECACHE_EQ_OPR); + + /* XXX will eq_opr ever be invalid if hash_proc isn't? */ + if (!OidIsValid(typentry->hash_proc) || !OidIsValid(typentry->eq_opr)) + { + list_free(*operators); + list_free(*param_exprs); + return false; + } + + *operators = lappend_oid(*operators, typentry->eq_opr); + *param_exprs = lappend(*param_exprs, expr); + } + + /* We can hash, provided we found something to hash */ + return (*operators != NIL); +} + +/* + * can_cached_nestloop + * Returns true if it's possible to build a hash table inside a + * parameterized nested loop to cache most recently seen parameters. + * + * Sets param_exprs to the cache key parameters and hash_operators to the + * hash operators for the cache upon returning true. + */ +static bool +can_cached_nestloop(PlannerInfo *root, RelOptInfo *innerrel, + RelOptInfo *outerrel, Path *inner_path, + Path *outer_path, JoinType jointype, + JoinPathExtraData *extra, List **param_exprs, + List **hash_operators) +{ + /* Obviously not if it's disabled */ + if (!enable_cachednestloop) + return false; + + /* + * We can safely not bother with all this unless we expect to perform more + * than one inner scan. The first scan is always going to be a cache + * miss. This would likely fail later anyway based on costs, so this is + * really just to save some wasted effort. + */ + if (outer_path->parent->rows < 2) + return false; + + /* + * We can only have a result cache when there's some kind of cache key, + * either parameterized path clauses or lateral Vars. 
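+	 *
+	 * For instance, a parameterized inner index scan whose index condition
+	 * references the outer rel supplies ppi_clauses, while a LATERAL
+	 * function or subquery referring to outer columns supplies the
+	 * lateral_vars checked below.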
+ */ + if ((inner_path->param_info == NULL || + inner_path->param_info->ppi_clauses == NIL) && + innerrel->lateral_vars == NIL) + return false; + + /* + * Currently we don't do this for SEMI and ANTI joins unless they're + * marked as inner_unique. This is because nested loop SEMI/ANTI joins + * don't scan the inner node to completion, which will mean resultcache + * cannot mark the cache entry as complete. + * + * XXX Currently we don't attempt to mark SEMI/ANTI joins as inner_unique + * = true. Should we? See add_paths_to_joinrel() + */ + if (!extra->inner_unique && (jointype == JOIN_SEMI || + jointype == JOIN_ANTI)) + return false; + + /* Check if we have hash ops for each parameter to the path */ + if (paraminfo_get_equal_hashops(inner_path->param_info, + param_exprs, + hash_operators, + outerrel, + innerrel)) + return true; + + return false; +} + /* * try_nestloop_path * Consider a nestloop join path; if it appears useful, push it into @@ -548,6 +743,214 @@ try_partial_nestloop_path(PlannerInfo *root, NULL)); } +/* + * try_cached_nestloop_path + * Consider a cached nestloop join path; if it appears useful, push it into + * the joinrel's pathlist via add_path(). + */ +static void +try_cached_nestloop_path(PlannerInfo *root, + RelOptInfo *joinrel, + Path *outer_path, + Path *inner_path, + List *pathkeys, + JoinType jointype, + JoinPathExtraData *extra, + List *param_exprs, + List *hash_operators) +{ + Relids required_outer; + JoinCostWorkspace workspace; + RelOptInfo *innerrel = inner_path->parent; + RelOptInfo *outerrel = outer_path->parent; + Relids innerrelids; + Relids outerrelids; + Relids inner_paramrels = PATH_REQ_OUTER(inner_path); + Relids outer_paramrels = PATH_REQ_OUTER(outer_path); + int table_size; + + /* + * Paths are parameterized by top-level parents, so run parameterization + * tests on the parent relids. + */ + if (innerrel->top_parent_relids) + innerrelids = innerrel->top_parent_relids; + else + innerrelids = innerrel->relids; + + if (outerrel->top_parent_relids) + outerrelids = outerrel->top_parent_relids; + else + outerrelids = outerrel->relids; + + /* + * Check to see if proposed path is still parameterized, and reject if the + * parameterization wouldn't be sensible --- unless allow_star_schema_join + * says to allow it anyway. Also, we must reject if have_dangerous_phv + * doesn't like the look of it, which could only happen if the nestloop is + * still parameterized. + */ + required_outer = calc_nestloop_required_outer(outerrelids, outer_paramrels, + innerrelids, inner_paramrels); + if (required_outer && + ((!bms_overlap(required_outer, extra->param_source_rels) && + !allow_star_schema_join(root, outerrelids, inner_paramrels)) || + have_dangerous_phv(root, outerrelids, inner_paramrels))) + { + /* Waste no memory when we reject a path here */ + bms_free(required_outer); + return; + } + + /* + * Do a precheck to quickly eliminate obviously-inferior paths. We + * calculate a cheap lower bound on the path's cost and then use + * add_path_precheck() to see if the path is clearly going to be dominated + * by some existing path for the joinrel. If not, do the full pushup with + * creating a fully valid path structure and submitting it to add_path(). + * The latter two steps are expensive enough to make this two-phase + * methodology worthwhile. 
+ */ + table_size = initial_cost_cached_nestloop(root, &workspace, jointype, + outer_path, inner_path, extra, + param_exprs); + + if (add_path_precheck(joinrel, + workspace.startup_cost, workspace.total_cost, + pathkeys, required_outer)) + { + /* + * If the inner path is parameterized, it is parameterized by the + * topmost parent of the outer rel, not the outer rel itself. Fix + * that. + */ + if (PATH_PARAM_BY_PARENT(inner_path, outer_path->parent)) + { + inner_path = reparameterize_path_by_child(root, inner_path, + outer_path->parent); + + /* + * If we could not translate the path, we can't create nest loop + * path. + */ + if (!inner_path) + { + bms_free(required_outer); + return; + } + } + + add_path(joinrel, (Path *) + create_cached_nestloop_path(root, + joinrel, + jointype, + &workspace, + extra, + outer_path, + inner_path, + extra->restrictlist, + pathkeys, + required_outer, + table_size, + param_exprs, + hash_operators)); + } + else + { + /* Waste no memory when we reject a path here */ + bms_free(required_outer); + } +} + +/* + * try_partial_cached_nestloop_path + * Consider a partial cached nestloop join path; if it appears useful, push + * it into the joinrel's partial_pathlist via add_partial_path(). + */ +static void +try_partial_cached_nestloop_path(PlannerInfo *root, + RelOptInfo *joinrel, + Path *outer_path, + Path *inner_path, + List *pathkeys, + JoinType jointype, + JoinPathExtraData *extra, + List *param_exprs, + List *hash_operators) +{ + JoinCostWorkspace workspace; + int table_size; + + /* + * If the inner path is parameterized, the parameterization must be fully + * satisfied by the proposed outer path. Parameterized partial paths are + * not supported. The caller should already have verified that no + * extra_lateral_rels are required here. + */ + Assert(bms_is_empty(joinrel->lateral_relids)); + if (inner_path->param_info != NULL) + { + Relids inner_paramrels = inner_path->param_info->ppi_req_outer; + RelOptInfo *outerrel = outer_path->parent; + Relids outerrelids; + + /* + * The inner and outer paths are parameterized, if at all, by the top + * level parents, not the child relations, so we must use those relids + * for our parameterization tests. + */ + if (outerrel->top_parent_relids) + outerrelids = outerrel->top_parent_relids; + else + outerrelids = outerrel->relids; + + if (!bms_is_subset(inner_paramrels, outerrelids)) + return; + } + + /* + * Before creating a path, get a quick lower bound on what it is likely to + * cost. Bail out right away if it looks terrible. + */ + table_size = initial_cost_cached_nestloop(root, &workspace, jointype, + outer_path, inner_path, extra, + param_exprs); + if (!add_partial_path_precheck(joinrel, workspace.total_cost, pathkeys)) + return; + + /* + * If the inner path is parameterized, it is parameterized by the topmost + * parent of the outer rel, not the outer rel itself. Fix that. + */ + if (PATH_PARAM_BY_PARENT(inner_path, outer_path->parent)) + { + inner_path = reparameterize_path_by_child(root, inner_path, + outer_path->parent); + + /* + * If we could not translate the path, we can't create nest loop path. + */ + if (!inner_path) + return; + } + + /* Might be good enough to be worth trying, so let's try it. 
*/ + add_partial_path(joinrel, (Path *) + create_cached_nestloop_path(root, + joinrel, + jointype, + &workspace, + extra, + outer_path, + inner_path, + extra->restrictlist, + pathkeys, + NULL, + table_size, + param_exprs, + hash_operators)); +} + /* * try_mergejoin_path * Consider a merge join path; if it appears useful, push it into @@ -1471,6 +1874,8 @@ match_unsorted_outer(PlannerInfo *root, foreach(lc2, innerrel->cheapest_parameterized_paths) { Path *innerpath = (Path *) lfirst(lc2); + List *param_exprs; + List *hashoperators; try_nestloop_path(root, joinrel, @@ -1479,6 +1884,21 @@ match_unsorted_outer(PlannerInfo *root, merge_pathkeys, jointype, extra); + + if (can_cached_nestloop(root, innerrel, outerrel, + innerpath, outerpath, jointype, + extra, ¶m_exprs, &hashoperators)) + { + try_cached_nestloop_path(root, + joinrel, + outerpath, + innerpath, + merge_pathkeys, + jointype, + extra, + param_exprs, + hashoperators); + } } /* Also consider materialized form of the cheapest inner path */ @@ -1633,6 +2053,8 @@ consider_parallel_nestloop(PlannerInfo *root, foreach(lc2, innerrel->cheapest_parameterized_paths) { Path *innerpath = (Path *) lfirst(lc2); + List *param_exprs; + List *hashoperators; /* Can't join to an inner path that is not parallel-safe */ if (!innerpath->parallel_safe) @@ -1657,6 +2079,21 @@ consider_parallel_nestloop(PlannerInfo *root, try_partial_nestloop_path(root, joinrel, outerpath, innerpath, pathkeys, jointype, extra); + + if (can_cached_nestloop(root, innerrel, outerrel, + innerpath, outerpath, jointype, + extra, ¶m_exprs, &hashoperators)) + { + try_partial_cached_nestloop_path(root, + joinrel, + outerpath, + innerpath, + pathkeys, + jointype, + extra, + param_exprs, + hashoperators); + } } } } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 94280a730c..fc8daaded4 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -4077,22 +4077,26 @@ create_nestloop_plan(PlannerInfo *root, NestLoop *join_plan; Plan *outer_plan; Plan *inner_plan; - List *tlist = build_path_tlist(root, &best_path->path); - List *joinrestrictclauses = best_path->joinrestrictinfo; + List *tlist = build_path_tlist(root, &best_path->jpath.path); + List *joinrestrictclauses = best_path->jpath.joinrestrictinfo; List *joinclauses; List *otherclauses; Relids outerrelids; List *nestParams; Relids saveOuterRels = root->curOuterRels; + List *param_exprs = NIL; /* NestLoop can project, so no need to be picky about child tlists */ - outer_plan = create_plan_recurse(root, best_path->outerjoinpath, 0); + outer_plan = create_plan_recurse(root, best_path->jpath.outerjoinpath, 0); /* For a nestloop, include outer relids in curOuterRels for inner side */ root->curOuterRels = bms_union(root->curOuterRels, - best_path->outerjoinpath->parent->relids); + best_path->jpath.outerjoinpath->parent->relids); + + inner_plan = create_plan_recurse(root, best_path->jpath.innerjoinpath, 0); - inner_plan = create_plan_recurse(root, best_path->innerjoinpath, 0); + param_exprs = (List *) replace_nestloop_params(root, (Node *) + best_path->param_exprs); /* Restore curOuterRels */ bms_free(root->curOuterRels); @@ -4103,10 +4107,10 @@ create_nestloop_plan(PlannerInfo *root, /* Get the join qual clauses (in plain expression form) */ /* Any pseudoconstant clauses are ignored here */ - if (IS_OUTER_JOIN(best_path->jointype)) + if (IS_OUTER_JOIN(best_path->jpath.jointype)) { extract_actual_join_clauses(joinrestrictclauses, - 
best_path->path.parent->relids, + best_path->jpath.path.parent->relids, &joinclauses, &otherclauses); } else @@ -4117,7 +4121,7 @@ create_nestloop_plan(PlannerInfo *root, } /* Replace any outer-relation variables with nestloop params */ - if (best_path->path.param_info) + if (best_path->jpath.path.param_info) { joinclauses = (List *) replace_nestloop_params(root, (Node *) joinclauses); @@ -4129,7 +4133,7 @@ create_nestloop_plan(PlannerInfo *root, * Identify any nestloop parameters that should be supplied by this join * node, and remove them from root->curOuterParams. */ - outerrelids = best_path->outerjoinpath->parent->relids; + outerrelids = best_path->jpath.outerjoinpath->parent->relids; nestParams = identify_current_nestloop_params(root, outerrelids); join_plan = make_nestloop(tlist, @@ -4138,10 +4142,42 @@ create_nestloop_plan(PlannerInfo *root, nestParams, outer_plan, inner_plan, - best_path->jointype, - best_path->inner_unique); + best_path->jpath.jointype, + best_path->jpath.inner_unique); + + if (best_path->use_cache) + { + Oid *operators; + Oid *collations; + ListCell *lc; + ListCell *lc2; + int nkeys; + int i; + + join_plan->numKeys = nkeys = list_length(param_exprs); + Assert(nkeys > 0); + operators = palloc(nkeys * sizeof(Oid)); + collations = palloc(nkeys * sizeof(Oid)); + + i = 0; + forboth(lc, param_exprs, lc2, best_path->hash_operators) + { + Expr *param_expr = (Expr *) lfirst(lc); + Oid opno = lfirst_oid(lc2); + + operators[i] = opno; + collations[i] = exprCollation((Node *) param_expr); + i++; + } + join_plan->mrucache = true; + join_plan->param_exprs = param_exprs; + join_plan->hashOperators = operators; + join_plan->collations = collations; + join_plan->singlerow = best_path->singlerow; + join_plan->est_entries = best_path->est_entries; + } - copy_generic_path_info(&join_plan->join.plan, &best_path->path); + copy_generic_path_info(&join_plan->join.plan, &best_path->jpath.path); return join_plan; } diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 138a353f93..93a42a0521 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2371,10 +2371,10 @@ create_nestloop_path(PlannerInfo *root, restrict_clauses = jclauses; } - pathnode->path.pathtype = T_NestLoop; - pathnode->path.parent = joinrel; - pathnode->path.pathtarget = joinrel->reltarget; - pathnode->path.param_info = + pathnode->jpath.path.pathtype = T_NestLoop; + pathnode->jpath.path.parent = joinrel; + pathnode->jpath.path.pathtarget = joinrel->reltarget; + pathnode->jpath.path.param_info = get_joinrel_parampathinfo(root, joinrel, outer_path, @@ -2382,19 +2382,129 @@ create_nestloop_path(PlannerInfo *root, extra->sjinfo, required_outer, &restrict_clauses); - pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = joinrel->consider_parallel && + pathnode->jpath.path.parallel_aware = false; + pathnode->jpath.path.parallel_safe = joinrel->consider_parallel && outer_path->parallel_safe && inner_path->parallel_safe; /* This is a foolish way to estimate parallel_workers, but for now... 
*/ - pathnode->path.parallel_workers = outer_path->parallel_workers; - pathnode->path.pathkeys = pathkeys; - pathnode->jointype = jointype; - pathnode->inner_unique = extra->inner_unique; - pathnode->outerjoinpath = outer_path; - pathnode->innerjoinpath = inner_path; - pathnode->joinrestrictinfo = restrict_clauses; + pathnode->jpath.path.parallel_workers = outer_path->parallel_workers; + pathnode->jpath.path.pathkeys = pathkeys; + pathnode->jpath.jointype = jointype; + pathnode->jpath.inner_unique = extra->inner_unique; + pathnode->jpath.outerjoinpath = outer_path; + pathnode->jpath.innerjoinpath = inner_path; + pathnode->jpath.joinrestrictinfo = restrict_clauses; + + /* Zero out the fields specific to Cached Nested Loop */ + pathnode->use_cache = false; + pathnode->singlerow = false; + pathnode->est_entries = 0; + pathnode->hash_operators = NIL; + pathnode->param_exprs = NIL; + + final_cost_nestloop(root, pathnode, workspace, extra, enable_nestloop); + + return pathnode; +} + +/* + * create_cached_nestloop_path + * Creates a pathnode corresponding to a cached nestloop join between two + * relations. + * + * 'joinrel' is the join relation. + * 'jointype' is the type of join required + * 'workspace' is the result from initial_cost_nestloop + * 'extra' contains various information about the join + * 'outer_path' is the outer path + * 'inner_path' is the inner path + * 'restrict_clauses' are the RestrictInfo nodes to apply at the join + * 'pathkeys' are the path keys of the new join path + * 'required_outer' is the set of required outer rels + * 'table_size' number of initial buckets to make in cache hash table or 0 if + * the executor should just decide. + * 'param_exprs' Exprs from the outer side of the join to use as cache keys + * 'hash_operators' hash operator Oid for each 'param_expr' + * + * Returns the resulting path node. + */ +NestPath * +create_cached_nestloop_path(PlannerInfo *root, + RelOptInfo *joinrel, + JoinType jointype, + JoinCostWorkspace *workspace, + JoinPathExtraData *extra, + Path *outer_path, + Path *inner_path, + List *restrict_clauses, + List *pathkeys, + Relids required_outer, + int table_size, + List *param_exprs, + List *hash_operators) +{ + NestPath *pathnode = makeNode(NestPath); + Relids inner_req_outer = PATH_REQ_OUTER(inner_path); + + /* + * If the inner path is parameterized by the outer, we must drop any + * restrict_clauses that are due to be moved into the inner path. We have + * to do this now, rather than postpone the work till createplan time, + * because the restrict_clauses list can affect the size and cost + * estimates for this path. 
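+	 *
+	 * (For instance, a clause that is movable into the parameterized inner
+	 * scan will already be enforced there as a scan qual, so also leaving it
+	 * in the joinrestrictinfo would double-count its cost and selectivity.)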
+ */ + if (bms_overlap(inner_req_outer, outer_path->parent->relids)) + { + Relids inner_and_outer = bms_union(inner_path->parent->relids, + inner_req_outer); + List *jclauses = NIL; + ListCell *lc; + + foreach(lc, restrict_clauses) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + + if (!join_clause_is_movable_into(rinfo, + inner_path->parent->relids, + inner_and_outer)) + jclauses = lappend(jclauses, rinfo); + } + restrict_clauses = jclauses; + } + + pathnode->jpath.path.pathtype = T_NestLoop; + pathnode->jpath.path.parent = joinrel; + pathnode->jpath.path.pathtarget = joinrel->reltarget; + pathnode->jpath.path.param_info = + get_joinrel_parampathinfo(root, + joinrel, + outer_path, + inner_path, + extra->sjinfo, + required_outer, + &restrict_clauses); + pathnode->jpath.path.parallel_aware = false; + pathnode->jpath.path.parallel_safe = joinrel->consider_parallel && + outer_path->parallel_safe && inner_path->parallel_safe; + /* This is a foolish way to estimate parallel_workers, but for now... */ + pathnode->jpath.path.parallel_workers = outer_path->parallel_workers; + pathnode->jpath.path.pathkeys = pathkeys; + pathnode->jpath.jointype = jointype; + pathnode->jpath.inner_unique = extra->inner_unique; + pathnode->jpath.outerjoinpath = outer_path; + pathnode->jpath.innerjoinpath = inner_path; + pathnode->jpath.joinrestrictinfo = restrict_clauses; + + pathnode->use_cache = true; + pathnode->singlerow = extra->inner_unique; + pathnode->est_entries = table_size; + pathnode->param_exprs = param_exprs; + pathnode->hash_operators = hash_operators; + + /* initial_cost_cached_nestloop() already did the final costs */ + pathnode->jpath.path.startup_cost = workspace->startup_cost; + pathnode->jpath.path.total_cost = workspace->total_cost; - final_cost_nestloop(root, pathnode, workspace, extra); + final_cost_cached_nestloop(root, pathnode, workspace, extra); return pathnode; } @@ -4018,13 +4128,15 @@ do { \ case T_NestPath: { JoinPath *jpath; + NestPath *npath; - FLAT_COPY_PATH(jpath, path, NestPath); + FLAT_COPY_PATH(npath, path, NestPath); + jpath = (JoinPath *) npath; REPARAMETERIZE_CHILD_PATH(jpath->outerjoinpath); REPARAMETERIZE_CHILD_PATH(jpath->innerjoinpath); ADJUST_CHILD_ATTRS(jpath->joinrestrictinfo); - new_path = (Path *) jpath; + new_path = (Path *) npath; } break; diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 6c656586e8..7c8f412767 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -7391,12 +7391,11 @@ find_param_referent(Param *param, deparse_context *context, ListCell *lc2; /* - * NestLoops transmit params to their inner child only; also, once - * we've crawled up out of a subplan, this couldn't possibly be - * the right match. + * NestLoops transmit params to either side of the join; also, + * once we've crawled up out of a subplan, this couldn't possibly + * be the right match. 
*/ if (IsA(ancestor, NestLoop) && - child_plan == innerPlan(ancestor) && in_same_plan_level) { NestLoop *nl = (NestLoop *) ancestor; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index a62d64eaa4..b813cfac5e 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1016,6 +1016,16 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_cachednestloop", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of cached nested-loop join plans."), + NULL, + GUC_EXPLAIN + }, + &enable_cachednestloop, + true, + NULL, NULL, NULL + }, { {"enable_nestloop", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of nested-loop join plans."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 9cb571f7cc..f1b738c971 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -362,6 +362,7 @@ #enable_material = on #enable_mergejoin = on #enable_nestloop = on +#enable_cachednestloop = on #enable_parallel_append = on #enable_seqscan = on #enable_sort = on diff --git a/src/include/executor/execMRUTupleCache.h b/src/include/executor/execMRUTupleCache.h new file mode 100644 index 0000000000..e6ceef9086 --- /dev/null +++ b/src/include/executor/execMRUTupleCache.h @@ -0,0 +1,97 @@ +/*------------------------------------------------------------------------- + * + * execMRUTupleCache.h + * Routines setting up and using a most-recently-used cache to store sets + * of tuples for a given cache key. + * + * Portions Copyright (c) 2020, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/include/executor/execMRUTupleCache.c + * + *------------------------------------------------------------------------- + */ +#ifndef EXEC_MRUTUPLECACHE_H +#define EXEC_MRUTUPLECACHE_H + +#include "nodes/execnodes.h" + +typedef struct MRUCacheInstrumentation +{ + uint64 cache_hits; /* number of rescans where we've found the + * scan parameter values to be cached */ + uint64 cache_misses; /* number of rescans where we've not found the + * scan parameter values to be cached. */ + uint64 cache_evictions; /* number of cache entries removed due to + * the need to free memory */ + uint64 cache_overflows; /* number of times we've had to bypass the + * cache when filling it due to not being + * able to free enough space to store the + * current scan's tuples. */ + uint64 mem_peak; /* peak memory usage in bytes */ +} MRUCacheInstrumentation; + +/* ---------------- + * Shared memory container for per-worker MRU cache information + * ---------------- + */ +typedef struct SharedMRUCacheInfo +{ + int num_workers; + MRUCacheInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER]; +} SharedMRUCacheInfo; + + /* ---------------- + * MRUTupleCache information + * + * Main data structure for MRUTupleCache. 
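+ *
+ * Cache entries are kept on a least-recently-used list; when mem_used
+ * exceeds mem_upperlimit, the least recently looked up entries are evicted
+ * until usage falls below mem_lowerlimit again (see the fields below).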
+ * ---------------- + */ +typedef struct MRUTupleCache +{ + PlanState *subplan; /* subplan to read and cache tuples from */ + ExprContext *ps_ExprContext; /* node's expression-evaluation context */ + int state; /* value of MRUTupleCache's state machine */ + int nkeys; /* number of hash table keys */ + struct mrucache_hash *hashtable; /* hash table cache entries */ + TupleDesc hashkeydesc; /* tuple descriptor for hash keys */ + TupleTableSlot *tableslot; /* min tuple slot for existing cache entries */ + TupleTableSlot *cachefoundslot; /* Slot to return found cache entries */ + TupleTableSlot *probeslot; /* virtual slot used for hash lookups */ + ExprState *cache_eq_expr; /* Compare exec params to hash key */ + ExprState **param_exprs; /* exprs containing the parameters to this + * node */ + FmgrInfo *hashfunctions; /* lookup data for hash funcs nkeys in size */ + Oid *collations; /* collation for comparisons nkeys in size */ + uint64 mem_used; /* bytes of memory used by cache */ + uint64 mem_upperlimit; /* memory limit in bytes for the cache */ + uint64 mem_lowerlimit; /* reduce memory usage to below this when we + * free up space */ + MemoryContext tableContext; /* memory context to store cache data */ + dlist_head lru_list; /* least recently used entry list */ + struct MRUCacheTuple *last_tuple; /* Used to point to the last tuple + * returned during a cache hit and + * the tuple we last stored when + * populating the cache. */ + struct MRUCacheEntry *entry; /* the entry that 'last_tuple' belongs to + * or NULL if 'last_tuple' is NULL. */ + bool singlerow; /* true if the cache entry is to be marked as + * complete after caching the first tuple. */ + MRUCacheInstrumentation stats; /* execution statistics */ +} MRUTupleCache; + +extern void ExecMRUTupleCacheFinishScan(MRUTupleCache *mrucache); +extern TupleTableSlot *ExecMRUTupleCacheFetch(MRUTupleCache *mrucache); +extern MRUTupleCache *ExecMRUTupleCacheInit(PlanState *planstate, + PlanState *cache_planstate, + List *param_exprs, + Oid *hashOperators, + Oid *collations, + uint64 memory_limit_bytes, + int est_entries, + bool singlerow); +extern void ExecMRUTupleCacheCleanup(MRUTupleCache *mrucache); +extern double ExecEstimateMRUCacheEntryOverheadBytes(double ntuples); + +#endif /* EXEC_MRUTUPLECACHE_H */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index b7978cd22e..993919dbe2 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -262,6 +262,12 @@ extern ExprState *ExecBuildGroupingEqual(TupleDesc ldesc, TupleDesc rdesc, const Oid *eqfunctions, const Oid *collations, PlanState *parent); +extern ExprState *ExecBuildParamSetEqual(TupleDesc ldesc, + const TupleTableSlotOps *lops, + const Oid *eqfunctions, + const Oid *collations, + const List *param_exprs, + PlanState *parent); extern ProjectionInfo *ExecBuildProjectionInfo(List *targetList, ExprContext *econtext, TupleTableSlot *slot, diff --git a/src/include/executor/nodeNestloop.h b/src/include/executor/nodeNestloop.h index 5a048a799f..1e4c729bd7 100644 --- a/src/include/executor/nodeNestloop.h +++ b/src/include/executor/nodeNestloop.h @@ -20,4 +20,9 @@ extern NestLoopState *ExecInitNestLoop(NestLoop *node, EState *estate, int eflag extern void ExecEndNestLoop(NestLoopState *node); extern void ExecReScanNestLoop(NestLoopState *node); +extern void ExecNestLoopEstimate(NestLoopState *node, ParallelContext *pcxt); +extern void ExecNestLoopInitializeDSM(NestLoopState *node, ParallelContext *pcxt); +extern void 
ExecNestLoopInitializeWorker(NestLoopState *node, ParallelWorkerContext *pwcxt); +extern void ExecNestLoopRetrieveInstrumentation(NestLoopState *node); + #endif /* NODENESTLOOP_H */ diff --git a/src/include/lib/ilist.h b/src/include/lib/ilist.h index 98db885f6f..fcafc03725 100644 --- a/src/include/lib/ilist.h +++ b/src/include/lib/ilist.h @@ -394,6 +394,25 @@ dlist_move_head(dlist_head *head, dlist_node *node) dlist_check(head); } +/* + * Move element from its current position in the list to the tail position in + * the same list. + * + * Undefined behaviour if 'node' is not already part of the list. + */ +static inline void +dlist_move_tail(dlist_head *head, dlist_node *node) +{ + /* fast path if it's already at the tail */ + if (head->head.prev == node) + return; + + dlist_delete(node); + dlist_push_tail(head, node); + + dlist_check(head); +} + /* * Check whether 'node' has a following node. * Caution: unreliable if 'node' is not in the list. diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h index dc1f1df07e..da51781e98 100644 --- a/src/include/lib/simplehash.h +++ b/src/include/lib/simplehash.h @@ -921,11 +921,11 @@ SH_DELETE_ITEM(SH_TYPE * tb, SH_ELEMENT_TYPE * entry) tb->members--; /* - * Backward shift following elements till either an empty element - * or an element at its optimal position is encountered. + * Backward shift following elements till either an empty element or an + * element at its optimal position is encountered. * - * While that sounds expensive, the average chain length is short, - * and deletions would otherwise require tombstones. + * While that sounds expensive, the average chain length is short, and + * deletions would otherwise require tombstones. */ while (true) { diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 6c0a7d68d6..bfb9d979f8 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -17,6 +17,7 @@ #include "access/tupconvert.h" #include "executor/instrument.h" #include "fmgr.h" +#include "lib/ilist.h" #include "lib/pairingheap.h" #include "nodes/params.h" #include "nodes/plannodes.h" @@ -1855,6 +1856,14 @@ typedef struct NestLoopState bool nl_NeedNewOuter; bool nl_MatchedOuter; TupleTableSlot *nl_NullInnerTupleSlot; + struct MRUTupleCache *nl_mrucache; + ProjectionInfo *ps_CacheProjInfo; /* info for doing tuple projection */ + ExprState *ps_CacheQual; + ExprState *ps_CacheJoinqual; + ProjectionInfo *ps_ScanProjInfo; /* info for doing tuple projection */ + ExprState *ps_ScanQual; + ExprState *ps_ScanJoinqual; + struct SharedMRUCacheInfo *shared_info; /* statistics for parallel workers */ } NestLoopState; /* ---------------- diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 3dd16b9ad5..29da4f57fb 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1534,11 +1534,41 @@ typedef struct JoinPath } JoinPath; /* - * A nested-loop path needs no special fields. + * A standard non-cached nested loop only requires the fields supplied by + * JoinPath. Cached Nested Loops require the following additional fields: + * + * 'use_cache' to indicate if the parameterized inner results should be saved + * for a later executinon which uses the same parameter values. When false + * this is just a normal nested loop. + * + * 'singlerow' instructs the caching code to mark a cache entry as complete + * after we find the first row. This is useful for unique joins where we stop + * trying to read additional rows after getting the first match. 
Without this + * we'd leave the cache entry as uncomplete and be unable to use it next + * lookup. + * + * 'est_entries' the planners best guess at how large to make the hash table + * for the cache. 0 can be specified if the value is unknown. + * + * 'hash_operators' list of Oids for hash operators for each 'param_exprs'. + * + * 'param_exprs' vars/exprs from the outer side of the join which we use for + * the cache's key. */ -typedef JoinPath NestPath; +typedef struct NestPath +{ + JoinPath jpath; + bool use_cache; + bool singlerow; /* true if the cache entry is to be marked as + * complete after caching the first record. */ + uint32 est_entries; /* The maximum number of entries that the + * planner expects will fit in the cache, or 0 + * if unknown */ + List *hash_operators; /* hash operators for each key */ + List *param_exprs; /* cache keys */ +} NestPath; /* * A mergejoin path has these fields. * diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 7e6b10f86b..f2f31a6db6 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -701,6 +701,19 @@ typedef struct NestLoop { Join join; List *nestParams; /* list of NestLoopParam nodes */ + int numKeys; /* size of the two arrays below */ + + Oid *hashOperators; /* hash operators for each key */ + Oid *collations; /* cache keys */ + List *param_exprs; /* exprs containing parameters */ + bool mrucache; /* True if parameterized nested loop is to + * cache rows from repeat scans. */ + bool singlerow; /* true if the cache entry should be marked as + * complete after we store the first tuple in + * it. */ + uint32 est_entries; /* The maximum number of entries that the + * planner expects will fit in the cache, or 0 + * if unknown */ } NestLoop; typedef struct NestLoopParam diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 6141654e47..29ee866892 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -57,6 +57,7 @@ extern PGDLLIMPORT bool enable_incremental_sort; extern PGDLLIMPORT bool enable_hashagg; extern PGDLLIMPORT bool enable_nestloop; extern PGDLLIMPORT bool enable_material; +extern PGDLLIMPORT bool enable_cachednestloop; extern PGDLLIMPORT bool enable_mergejoin; extern PGDLLIMPORT bool enable_hashjoin; extern PGDLLIMPORT bool enable_gathermerge; @@ -137,7 +138,17 @@ extern void initial_cost_nestloop(PlannerInfo *root, JoinPathExtraData *extra); extern void final_cost_nestloop(PlannerInfo *root, NestPath *path, JoinCostWorkspace *workspace, - JoinPathExtraData *extra); + JoinPathExtraData *extra, + bool enabled); +extern int initial_cost_cached_nestloop(PlannerInfo *root, + JoinCostWorkspace *workspace, + JoinType jointype, + Path *outer_path, Path *inner_path, + JoinPathExtraData *extra, + List *param_exprs); +extern void final_cost_cached_nestloop(PlannerInfo *root, NestPath *path, + JoinCostWorkspace *workspace, + JoinPathExtraData *extra); extern void initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace, JoinType jointype, diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 715a24ad29..562096e6c1 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -146,6 +146,20 @@ extern NestPath *create_nestloop_path(PlannerInfo *root, List *pathkeys, Relids required_outer); +extern NestPath *create_cached_nestloop_path(PlannerInfo *root, + RelOptInfo *joinrel, + JoinType jointype, + JoinCostWorkspace *workspace, + JoinPathExtraData *extra, + Path 
*outer_path, + Path *inner_path, + List *restrict_clauses, + List *pathkeys, + Relids required_outer, + int table_size, + List *param_exprs, + List *hash_operators); + extern MergePath *create_mergejoin_path(PlannerInfo *root, RelOptInfo *joinrel, JoinType jointype, diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 455e1343ee..57ca9fda8d 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -70,9 +70,9 @@ * callers to provide further details about some assumptions which were made * during the estimation. */ -#define SELFLAG_USED_DEFAULT (1 << 0) /* Estimation fell back on one of - * the DEFAULTs as defined above. - */ +#define SELFLAG_USED_DEFAULT (1 << 0) /* Estimation fell back on one + * of the DEFAULTs as defined + * above. */ /* Return data from examine_variable and friends */ typedef struct VariableStatData diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index a46b1573bd..0453b0ba91 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -2484,6 +2484,7 @@ reset enable_nestloop; -- set work_mem to '64kB'; set enable_mergejoin to off; +set enable_cachednestloop to off; explain (costs off) select count(*) from tenk1 a, tenk1 b where a.hundred = b.thousand and (b.fivethous % 10) < 10; @@ -2507,6 +2508,7 @@ select count(*) from tenk1 a, tenk1 b reset work_mem; reset enable_mergejoin; +reset enable_cachednestloop; -- -- regression test for 8.2 bug with improper re-ordering of left joins -- @@ -4056,8 +4058,9 @@ select * from where t1.f1 = ss.f1; QUERY PLAN -------------------------------------------------- - Nested Loop + Cached Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1 + Cache Key: i8.q1 Join Filter: (t1.f1 = t2.f1) -> Nested Loop Left Join Output: t1.f1, i8.q1, i8.q2 @@ -4072,7 +4075,7 @@ where t1.f1 = ss.f1; Output: (i8.q1), t2.f1 -> Seq Scan on public.text_tbl t2 Output: i8.q1, t2.f1 -(16 rows) +(17 rows) select * from text_tbl t1 @@ -4095,11 +4098,13 @@ select * from where t1.f1 = ss2.f1; QUERY PLAN ------------------------------------------------------------------- - Nested Loop + Cached Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1, ((i8.q1)), (t2.f1) + Cache Key: (i8.q1), t2.f1 Join Filter: (t1.f1 = (t2.f1)) - -> Nested Loop + -> Cached Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1 + Cache Key: i8.q1 -> Nested Loop Left Join Output: t1.f1, i8.q1, i8.q2 -> Seq Scan on public.text_tbl t1 @@ -4117,7 +4122,7 @@ where t1.f1 = ss2.f1; Output: ((i8.q1)), (t2.f1) -> Seq Scan on public.text_tbl t3 Output: (i8.q1), t2.f1 -(22 rows) +(24 rows) select * from text_tbl t1 @@ -4141,8 +4146,9 @@ select 1 from where tt1.f1 = ss1.c0; QUERY PLAN ---------------------------------------------------------- - Nested Loop + Cached Nested Loop Output: 1 + Cache Key: tt4.f1 -> Nested Loop Left Join Output: tt1.f1, tt4.f1 -> Nested Loop @@ -4170,7 +4176,7 @@ where tt1.f1 = ss1.c0; Output: (tt4.f1) -> Seq Scan on public.text_tbl tt5 Output: tt4.f1 -(29 rows) +(30 rows) select 1 from text_tbl as tt1 @@ -4811,20 +4817,22 @@ explain (costs off) QUERY PLAN ------------------------------------------------ Aggregate - -> Nested Loop + -> Cached Nested Loop + Cache Key: a.two -> Seq Scan on tenk1 a -> Function Scan on generate_series g -(4 rows) +(5 rows) explain (costs off) select count(*) from tenk1 a cross join lateral generate_series(1,two) g; QUERY PLAN ------------------------------------------------ Aggregate - -> Nested Loop + -> Cached Nested Loop 
+ Cache Key: a.two -> Seq Scan on tenk1 a -> Function Scan on generate_series g -(4 rows) +(5 rows) -- don't need the explicit LATERAL keyword for functions explain (costs off) @@ -4832,10 +4840,11 @@ explain (costs off) QUERY PLAN ------------------------------------------------ Aggregate - -> Nested Loop + -> Cached Nested Loop + Cache Key: a.two -> Seq Scan on tenk1 a -> Function Scan on generate_series g -(4 rows) +(5 rows) -- lateral with UNION ALL subselect explain (costs off) @@ -4890,13 +4899,13 @@ explain (costs off) QUERY PLAN ------------------------------------------------------------------ Aggregate - -> Hash Join - Hash Cond: ("*VALUES*".column1 = b.unique2) + -> Cached Nested Loop + Cache Key: "*VALUES*".column1 -> Nested Loop -> Index Only Scan using tenk1_unique1 on tenk1 a -> Values Scan on "*VALUES*" - -> Hash - -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Index Only Scan using tenk1_unique2 on tenk1 b + Index Cond: (unique2 = "*VALUES*".column1) (8 rows) select count(*) from tenk1 a, @@ -6286,3 +6295,91 @@ where exists (select 1 from j3 (13 rows) drop table j3; +-- Tests for Cached Nested Loops +-- Ensure we get a cached nested loop plan +explain (analyze, costs off, timing off, summary off) +select count(*),avg(t1.unique1) from tenk1 t1 +inner join tenk1 t2 on t1.unique1 = t2.twenty +where t2.unique1 < 1000; + QUERY PLAN +-------------------------------------------------------------------------------------- + Aggregate (actual rows=1 loops=1) + -> Cached Nested Loop (actual rows=1000 loops=1) + Cache Key: t2.twenty + Hits: 980 Misses: 20 Evictions: 0 Overflows: 0 Memory Usage: 3kB + -> Bitmap Heap Scan on tenk1 t2 (actual rows=1000 loops=1) + Recheck Cond: (unique1 < 1000) + Heap Blocks: exact=333 + -> Bitmap Index Scan on tenk1_unique1 (actual rows=1000 loops=1) + Index Cond: (unique1 < 1000) + -> Index Only Scan using tenk1_unique1 on tenk1 t1 (actual rows=1 loops=20) + Index Cond: (unique1 = t2.twenty) + Heap Fetches: 0 +(12 rows) + +-- and check we get the expected results. +select count(*),avg(t1.unique1) from tenk1 t1 +inner join tenk1 t2 on t1.unique1 = t2.twenty +where t2.unique1 < 1000; + count | avg +-------+-------------------- + 1000 | 9.5000000000000000 +(1 row) + +-- try reducing work to test the cache eviction code. 
+set work_mem to 64; +set enable_hashjoin to off; +set enable_mergejoin to off; +explain (analyze, costs off, timing off, summary off) +select count(*),avg(t1.unique1) from tenk1 t1 +inner join tenk1 t2 on t1.unique1 = t2.thousand +where t2.unique1 < 1000; + QUERY PLAN +---------------------------------------------------------------------------------------- + Aggregate (actual rows=1 loops=1) + -> Cached Nested Loop (actual rows=1000 loops=1) + Cache Key: t2.thousand + Hits: 0 Misses: 1000 Evictions: 378 Overflows: 0 Memory Usage: 65kB + -> Bitmap Heap Scan on tenk1 t2 (actual rows=1000 loops=1) + Recheck Cond: (unique1 < 1000) + Heap Blocks: exact=333 + -> Bitmap Index Scan on tenk1_unique1 (actual rows=1000 loops=1) + Index Cond: (unique1 < 1000) + -> Index Only Scan using tenk1_unique1 on tenk1 t1 (actual rows=1 loops=1000) + Index Cond: (unique1 = t2.thousand) + Heap Fetches: 0 +(12 rows) + +reset enable_mergejoin; +reset enable_hashjoin; +reset work_mem; +-- Try with LATERAL joins +explain (analyze, costs off, timing off, summary off) +select count(*),avg(t2.unique1) from tenk1 t1, +lateral (select t2.unique1 from tenk1 t2 where t1.twenty = t2.unique1) t2 +where t1.unique1 < 1000; + QUERY PLAN +-------------------------------------------------------------------------------------- + Aggregate (actual rows=1 loops=1) + -> Cached Nested Loop (actual rows=1000 loops=1) + Cache Key: t1.twenty + Hits: 980 Misses: 20 Evictions: 0 Overflows: 0 Memory Usage: 3kB + -> Bitmap Heap Scan on tenk1 t1 (actual rows=1000 loops=1) + Recheck Cond: (unique1 < 1000) + Heap Blocks: exact=333 + -> Bitmap Index Scan on tenk1_unique1 (actual rows=1000 loops=1) + Index Cond: (unique1 < 1000) + -> Index Only Scan using tenk1_unique1 on tenk1 t2 (actual rows=1 loops=20) + Index Cond: (unique1 = t1.twenty) + Heap Fetches: 0 +(12 rows) + +-- and check we get the expected results. 
+select count(*),avg(t2.unique1) from tenk1 t1, +lateral (select t2.unique1 from tenk1 t2 where t1.twenty = t2.unique1) t2 +where t1.unique1 < 1000; + count | avg +-------+-------------------- + 1000 | 9.5000000000000000 +(1 row) + diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 50d2a7e4b9..97b200e482 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1930,6 +1930,9 @@ begin ln := regexp_replace(ln, 'Workers Launched: \d+', 'Workers Launched: N'); ln := regexp_replace(ln, 'actual rows=\d+ loops=\d+', 'actual rows=N loops=N'); ln := regexp_replace(ln, 'Rows Removed by Filter: \d+', 'Rows Removed by Filter: N'); + ln := regexp_replace(ln, 'Hits: \d+', 'Hits: N'); + ln := regexp_replace(ln, 'Misses: \d+', 'Misses: N'); + ln := regexp_replace(ln, 'Memory Usage: \d+', 'Memory Usage: N'); return next ln; end loop; end; @@ -2065,7 +2068,9 @@ select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on Workers Planned: 1 Workers Launched: N -> Partial Aggregate (actual rows=N loops=N) - -> Nested Loop (actual rows=N loops=N) + -> Cached Nested Loop (actual rows=N loops=N) + Cache Key: a.a + Hits: N Misses: N Evictions: 0 Overflows: 0 Memory Usage: NkB -> Parallel Seq Scan on lprt_a a (actual rows=N loops=N) Filter: (a = ANY ('{0,0,1}'::integer[])) -> Append (actual rows=N loops=N) @@ -2087,7 +2092,7 @@ select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on Index Cond: (a = a.a) -> Index Scan using ab_a3_b3_a_idx on ab_a3_b3 ab_9 (never executed) Index Cond: (a = a.a) -(27 rows) +(29 rows) -- Ensure the same partitions are pruned when we make the nested loop -- parameter an Expr rather than a plain Param. 
@@ -2099,7 +2104,9 @@ select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on Workers Planned: 1 Workers Launched: N -> Partial Aggregate (actual rows=N loops=N) - -> Nested Loop (actual rows=N loops=N) + -> Cached Nested Loop (actual rows=N loops=N) + Cache Key: (a.a + 0) + Hits: N Misses: N Evictions: 0 Overflows: 0 Memory Usage: NkB -> Parallel Seq Scan on lprt_a a (actual rows=N loops=N) Filter: (a = ANY ('{0,0,1}'::integer[])) -> Append (actual rows=N loops=N) @@ -2121,7 +2128,7 @@ select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on Index Cond: (a = (a.a + 0)) -> Index Scan using ab_a3_b3_a_idx on ab_a3_b3 ab_9 (never executed) Index Cond: (a = (a.a + 0)) -(27 rows) +(29 rows) insert into lprt_a values(3),(3); select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on ab.a = a.a where a.a in(1, 0, 3)'); @@ -2132,7 +2139,9 @@ select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on Workers Planned: 1 Workers Launched: N -> Partial Aggregate (actual rows=N loops=N) - -> Nested Loop (actual rows=N loops=N) + -> Cached Nested Loop (actual rows=N loops=N) + Cache Key: a.a + Hits: N Misses: N Evictions: 0 Overflows: 0 Memory Usage: NkB -> Parallel Seq Scan on lprt_a a (actual rows=N loops=N) Filter: (a = ANY ('{1,0,3}'::integer[])) -> Append (actual rows=N loops=N) @@ -2154,7 +2163,7 @@ select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on Index Cond: (a = a.a) -> Index Scan using ab_a3_b3_a_idx on ab_a3_b3 ab_9 (actual rows=N loops=N) Index Cond: (a = a.a) -(27 rows) +(29 rows) select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on ab.a = a.a where a.a in(1, 0, 0)'); explain_parallel_append @@ -2164,7 +2173,9 @@ select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on Workers Planned: 1 Workers Launched: N -> Partial Aggregate (actual rows=N loops=N) - -> Nested Loop (actual rows=N loops=N) + -> Cached Nested Loop (actual rows=N loops=N) + Cache Key: a.a + Hits: N Misses: N Evictions: 0 Overflows: 0 Memory Usage: NkB -> Parallel Seq Scan on lprt_a a (actual rows=N loops=N) Filter: (a = ANY ('{1,0,0}'::integer[])) Rows Removed by Filter: N @@ -2187,7 +2198,7 @@ select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on Index Cond: (a = a.a) -> Index Scan using ab_a3_b3_a_idx on ab_a3_b3 ab_9 (never executed) Index Cond: (a = a.a) -(28 rows) +(30 rows) delete from lprt_a where a = 1; select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on ab.a = a.a where a.a in(1, 0, 0)'); @@ -2198,7 +2209,9 @@ select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on Workers Planned: 1 Workers Launched: N -> Partial Aggregate (actual rows=N loops=N) - -> Nested Loop (actual rows=N loops=N) + -> Cached Nested Loop (actual rows=N loops=N) + Cache Key: a.a + Hits: N Misses: N Evictions: 0 Overflows: 0 Memory Usage: NkB -> Parallel Seq Scan on lprt_a a (actual rows=N loops=N) Filter: (a = ANY ('{1,0,0}'::integer[])) Rows Removed by Filter: N @@ -2221,7 +2234,7 @@ select explain_parallel_append('select avg(ab.a) from ab inner join lprt_a a on Index Cond: (a = a.a) -> Index Scan using ab_a3_b3_a_idx on ab_a3_b3 ab_9 (never executed) Index Cond: (a = a.a) -(28 rows) +(30 rows) reset enable_hashjoin; reset enable_mergejoin; diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 9d56cdacf3..cc4de0e8c3 100644 --- 
a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1094,7 +1094,8 @@ where o.ten = 1; QUERY PLAN --------------------------------------------------- Aggregate - -> Nested Loop + -> Cached Nested Loop + Cache Key: o.four -> Seq Scan on onek o Filter: (ten = 1) -> CTE Scan on x @@ -1103,7 +1104,7 @@ where o.ten = 1; -> Result -> WorkTable Scan on x x_1 Filter: (a < 10) -(10 rows) +(11 rows) select sum(o.four), sum(ss.a) from onek o cross join lateral ( diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 81bdacf59d..bf3eaaccf1 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -89,6 +89,7 @@ select name, setting from pg_settings where name like 'enable%'; name | setting --------------------------------+--------- enable_bitmapscan | on + enable_cachednestloop | on enable_gathermerge | on enable_hashagg | on enable_hashjoin | on @@ -106,7 +107,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(18 rows) +(19 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 1403e0ffe7..90ccba69de 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -539,6 +539,7 @@ reset enable_nestloop; set work_mem to '64kB'; set enable_mergejoin to off; +set enable_cachednestloop to off; explain (costs off) select count(*) from tenk1 a, tenk1 b @@ -548,6 +549,7 @@ select count(*) from tenk1 a, tenk1 b reset work_mem; reset enable_mergejoin; +reset enable_cachednestloop; -- -- regression test for 8.2 bug with improper re-ordering of left joins @@ -2171,3 +2173,39 @@ where exists (select 1 from j3 and t1.unique1 < 1; drop table j3; + +-- Tests for Cached Nested Loops +-- Ensure we get a cached nested loop plan +explain (analyze, costs off, timing off, summary off) +select count(*),avg(t1.unique1) from tenk1 t1 +inner join tenk1 t2 on t1.unique1 = t2.twenty +where t2.unique1 < 1000; + +-- and check we get the expected results. +select count(*),avg(t1.unique1) from tenk1 t1 +inner join tenk1 t2 on t1.unique1 = t2.twenty +where t2.unique1 < 1000; + +-- try reducing work to test the cache eviction code. +set work_mem to 64; +set enable_hashjoin to off; +set enable_mergejoin to off; +explain (analyze, costs off, timing off, summary off) +select count(*),avg(t1.unique1) from tenk1 t1 +inner join tenk1 t2 on t1.unique1 = t2.thousand +where t2.unique1 < 1000; + +reset enable_mergejoin; +reset enable_hashjoin; +reset work_mem; + +-- Try with LATERAL joins +explain (analyze, costs off, timing off, summary off) +select count(*),avg(t2.unique1) from tenk1 t1, +lateral (select t2.unique1 from tenk1 t2 where t1.twenty = t2.unique1) t2 +where t1.unique1 < 1000; + +-- and check we get the expected results. 
+select count(*),avg(t2.unique1) from tenk1 t1, +lateral (select t2.unique1 from tenk1 t2 where t1.twenty = t2.unique1) t2 +where t1.unique1 < 1000; diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 1e904a8c5b..7ee792506d 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -453,6 +453,9 @@ begin ln := regexp_replace(ln, 'Workers Launched: \d+', 'Workers Launched: N'); ln := regexp_replace(ln, 'actual rows=\d+ loops=\d+', 'actual rows=N loops=N'); ln := regexp_replace(ln, 'Rows Removed by Filter: \d+', 'Rows Removed by Filter: N'); + ln := regexp_replace(ln, 'Hits: \d+', 'Hits: N'); + ln := regexp_replace(ln, 'Misses: \d+', 'Misses: N'); + ln := regexp_replace(ln, 'Memory Usage: \d+', 'Memory Usage: N'); return next ln; end loop; end; -- 2.21.0.windows.1
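
Note for reviewers, not part of the patch: the regression tests above already exercise cache hits, misses and evictions, but a quick way to watch the feature interactively is a session along the lines of the sketch below. The table demo_t and its columns are illustrative only; the plan shape and the statistics line follow the join.sql tests added above.

-- illustrative setup: a join column with many duplicate values, so repeat
-- parameter lookups can be answered from the cache
create table demo_t as
  select g % 100 as k, g as v from generate_series(1, 10000) g;
create index on demo_t (v);
analyze demo_t;

-- enable_cachednestloop is on by default; the other settings steer the
-- planner towards a parameterized nested loop, as in the eviction test above
set enable_hashjoin to off;
set enable_mergejoin to off;
explain (analyze, costs off, timing off, summary off)
select count(*) from demo_t a inner join demo_t b on b.v = a.k;
-- expect a "Cached Nested Loop" node with a "Cache Key: a.k" line and a
-- "Hits: ... Misses: ... Evictions: ... Overflows: ... Memory Usage: ..." line

-- shrinking work_mem should start to show evictions, as in the join.sql test
set work_mem to 64;
explain (analyze, costs off, timing off, summary off)
select count(*) from demo_t a inner join demo_t b on b.v = a.k;

reset work_mem;
reset enable_mergejoin;
reset enable_hashjoin;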