diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 6d4b9eb3b9..42c6df549f 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -108,8 +108,7 @@ static void show_sort_info(SortState *sortstate, ExplainState *es); static void show_incremental_sort_info(IncrementalSortState *incrsortstate, ExplainState *es); static void show_hash_info(HashState *hashstate, ExplainState *es); -static void show_resultcache_info(ResultCacheState *rcstate, List *ancestors, - ExplainState *es); +static void show_resultcache_info(NestLoopState *nlstate, List *ancestors, ExplainState *es); static void show_hashagg_info(AggState *hashstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); @@ -1494,10 +1493,13 @@ ExplainNode(PlanState *planstate, List *ancestors, * For historical reasons, the join type is interpolated * into the node type name... */ - if (((Join *) plan)->jointype != JOIN_INNER) + if (((Join *)plan)->jointype != JOIN_INNER) appendStringInfo(es->str, " %s Join", jointype); else if (!IsA(plan, NestLoop)) appendStringInfoString(es->str, " Join"); + else if (castNode(NestLoop, plan)->paramcache) + appendStringInfoString(es->str, " Cached"); + } else ExplainPropertyText("Join Type", jointype, es); @@ -1883,6 +1885,7 @@ ExplainNode(PlanState *planstate, List *ancestors, } break; case T_NestLoop: + show_resultcache_info((NestLoopState *) planstate, ancestors, es); show_upper_qual(((NestLoop *) plan)->join.joinqual, "Join Filter", planstate, ancestors, es); if (((NestLoop *) plan)->join.joinqual) @@ -1963,10 +1966,10 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_Hash: show_hash_info(castNode(HashState, planstate), es); break; - case T_ResultCache: - show_resultcache_info(castNode(ResultCacheState, planstate), - ancestors, es); - break; + //case T_ResultCache: + // show_resultcache_info(castNode(ResultCacheState, planstate), + // ancestors, es); + // break; default: break; } @@ -3041,15 +3044,19 @@ show_hash_info(HashState *hashstate, ExplainState *es) } static void -show_resultcache_info(ResultCacheState *rcstate, List *ancestors, ExplainState *es) +show_resultcache_info(NestLoopState *nlstate, List *ancestors, ExplainState *es) { - Plan *plan = ((PlanState *) rcstate)->plan; + Plan *plan = ((PlanState *) nlstate)->plan; + ResultCacheState *rcstate; ListCell *lc; List *context; StringInfoData keystr; char *seperator = ""; bool useprefix; + if (nlstate->nl_pcache == NULL) + return; + initStringInfo(&keystr); /* XXX surely we'll always have more than one if we have a resultcache? 
*/ @@ -3060,7 +3067,7 @@ show_resultcache_info(ResultCacheState *rcstate, List *ancestors, ExplainState * plan, ancestors); - foreach(lc, ((ResultCache *) plan)->param_exprs) + foreach(lc, ((NestLoop *) plan)->param_exprs) { Node *expr = (Node *) lfirst(lc); @@ -3086,6 +3093,8 @@ show_resultcache_info(ResultCacheState *rcstate, List *ancestors, ExplainState * if (!es->analyze) return; + + rcstate = nlstate->nl_pcache; if (es->format != EXPLAIN_FORMAT_TEXT) { ExplainPropertyInteger("Cache Hits", NULL, rcstate->stats.cache_hits, es); diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 68920ecd89..f9c2f80c79 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -44,7 +44,7 @@ #include "executor/nodeProjectSet.h" #include "executor/nodeRecursiveunion.h" #include "executor/nodeResult.h" -#include "executor/nodeResultCache.h" +//#include "executor/nodeResultCache.h" #include "executor/nodeSamplescan.h" #include "executor/nodeSeqscan.h" #include "executor/nodeSetOp.h" @@ -250,9 +250,9 @@ ExecReScan(PlanState *node) ExecReScanMaterial((MaterialState *) node); break; - case T_ResultCacheState: - ExecReScanResultCache((ResultCacheState *) node); - break; + //case T_ResultCacheState: + // ExecReScanResultCache((ResultCacheState *) node); + // break; case T_SortState: ExecReScanSort((SortState *) node); diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 459e9dd3e9..37cfa36881 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -35,7 +35,7 @@ #include "executor/nodeIncrementalSort.h" #include "executor/nodeIndexonlyscan.h" #include "executor/nodeIndexscan.h" -#include "executor/nodeResultCache.h" +//#include "executor/nodeResultCache.h" #include "executor/nodeSeqscan.h" #include "executor/nodeSort.h" #include "executor/nodeSubplan.h" @@ -294,10 +294,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecAggEstimate((AggState *) planstate, e->pcxt); break; - case T_ResultCacheState: - /* even when not parallel-aware, for EXPLAIN ANALYZE */ - ExecResultCacheEstimate((ResultCacheState *) planstate, e->pcxt); - break; + //case T_ResultCacheState: + // /* even when not parallel-aware, for EXPLAIN ANALYZE */ + // ExecResultCacheEstimate((ResultCacheState *) planstate, e->pcxt); + // break; default: break; } @@ -518,10 +518,10 @@ ExecParallelInitializeDSM(PlanState *planstate, /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecAggInitializeDSM((AggState *) planstate, d->pcxt); break; - case T_ResultCacheState: - /* even when not parallel-aware, for EXPLAIN ANALYZE */ - ExecResultCacheInitializeDSM((ResultCacheState *) planstate, d->pcxt); - break; + //case T_ResultCacheState: + // /* even when not parallel-aware, for EXPLAIN ANALYZE */ + // ExecResultCacheInitializeDSM((ResultCacheState *) planstate, d->pcxt); + // break; default: break; } @@ -998,9 +998,9 @@ ExecParallelReInitializeDSM(PlanState *planstate, case T_HashState: case T_SortState: case T_IncrementalSortState: - case T_ResultCacheState: - /* these nodes have DSM state, but no reinitialization is required */ - break; + //case T_ResultCacheState: + // /* these nodes have DSM state, but no reinitialization is required */ + // break; default: break; @@ -1068,9 +1068,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, case T_AggState: ExecAggRetrieveInstrumentation((AggState *) planstate); break; - case 
T_ResultCacheState: - ExecResultCacheRetrieveInstrumentation((ResultCacheState *) planstate); - break; + //case T_ResultCacheState: + // ExecResultCacheRetrieveInstrumentation((ResultCacheState *) planstate); + // break; default: break; } @@ -1363,11 +1363,11 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecAggInitializeWorker((AggState *) planstate, pwcxt); break; - case T_ResultCacheState: - /* even when not parallel-aware, for EXPLAIN ANALYZE */ - ExecResultCacheInitializeWorker((ResultCacheState *) planstate, - pwcxt); - break; + //case T_ResultCacheState: + // /* even when not parallel-aware, for EXPLAIN ANALYZE */ + // ExecResultCacheInitializeWorker((ResultCacheState *) planstate, + // pwcxt); + // break; default: break; } diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index fbbe667cc1..e5b8c74da7 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -102,7 +102,7 @@ #include "executor/nodeProjectSet.h" #include "executor/nodeRecursiveunion.h" #include "executor/nodeResult.h" -#include "executor/nodeResultCache.h" +//#include "executor/nodeResultCache.h" #include "executor/nodeSamplescan.h" #include "executor/nodeSeqscan.h" #include "executor/nodeSetOp.h" @@ -320,10 +320,10 @@ ExecInitNode(Plan *node, EState *estate, int eflags) estate, eflags); break; - case T_ResultCache: - result = (PlanState *) ExecInitResultCache((ResultCache *) node, - estate, eflags); - break; + //case T_ResultCache: + // result = (PlanState *) ExecInitResultCache((ResultCache *) node, + // estate, eflags); + // break; case T_Group: result = (PlanState *) ExecInitGroup((Group *) node, @@ -709,9 +709,9 @@ ExecEndNode(PlanState *node) ExecEndIncrementalSort((IncrementalSortState *) node); break; - case T_ResultCacheState: - ExecEndResultCache((ResultCacheState *) node); - break; + //case T_ResultCacheState: + // ExecEndResultCache((ResultCacheState *) node); + // break; case T_GroupState: ExecEndGroup((GroupState *) node); diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c index b07c2996d4..97213071d5 100644 --- a/src/backend/executor/nodeNestloop.c +++ b/src/backend/executor/nodeNestloop.c @@ -23,9 +23,21 @@ #include "executor/execdebug.h" #include "executor/nodeNestloop.h" +#include "executor/nodeResultCache.h" #include "miscadmin.h" #include "utils/memutils.h" +static inline TupleTableSlot * +FetchInnerTuple(ResultCacheState *rcstate, PlanState *innerPlan) +{ + /* No caching? 
Just exec the inner node */ + if (rcstate == NULL) + return ExecProcNode(innerPlan); + /* Otherwise let the cache deal with it */ + else + return ExecResultCache(rcstate, innerPlan); +} + /* ---------------------------------------------------------------- * ExecNestLoop(node) @@ -150,6 +162,11 @@ ExecNestLoop(PlanState *pstate) */ ENL1_printf("rescanning inner plan"); ExecReScan(innerPlan); + + /* When using a result cache, reset the state ready for another lookup */ + if (node->nl_pcache) + ExecResultCacheFinishScan(node->nl_pcache); + } /* @@ -157,7 +174,7 @@ ExecNestLoop(PlanState *pstate) */ ENL1_printf("getting new inner tuple"); - innerTupleSlot = ExecProcNode(innerPlan); + innerTupleSlot = FetchInnerTuple(node->nl_pcache, innerPlan); econtext->ecxt_innertuple = innerTupleSlot; if (TupIsNull(innerTupleSlot)) @@ -345,6 +362,13 @@ ExecInitNestLoop(NestLoop *node, EState *estate, int eflags) */ nlstate->nl_NeedNewOuter = true; nlstate->nl_MatchedOuter = false; + nlstate->nl_ParamCache = node->paramcache; + + /* Setup the result cache if enabled */ + if (nlstate->nl_ParamCache) + nlstate->nl_pcache = ExecInitResultCache(node, (PlanState *) nlstate, (PlanState *) innerPlanState(nlstate)); + else + nlstate->nl_pcache = NULL; NL1_printf("ExecInitNestLoop: %s\n", "node initialized"); @@ -352,6 +376,7 @@ ExecInitNestLoop(NestLoop *node, EState *estate, int eflags) return nlstate; } + /* ---------------------------------------------------------------- * ExecEndNestLoop * @@ -380,6 +405,9 @@ ExecEndNestLoop(NestLoopState *node) ExecEndNode(outerPlanState(node)); ExecEndNode(innerPlanState(node)); + if (node->nl_pcache) + ExecEndResultCache(node->nl_pcache); + NL1_printf("ExecEndNestLoop: %s\n", "node processing ended"); } diff --git a/src/backend/executor/nodeResultCache.c b/src/backend/executor/nodeResultCache.c index 09b25ea184..da5edf9c06 100644 --- a/src/backend/executor/nodeResultCache.c +++ b/src/backend/executor/nodeResultCache.c @@ -66,7 +66,6 @@ * subplan without caching anything */ #define RC_END_OF_SCAN 5 /* Ready for rescan */ - /* Helper macros for memory accounting */ #define EMPTY_ENTRY_MEMORY_BYTES(e) (sizeof(ResultCacheEntry) + \ sizeof(ResultCacheKey) + \ @@ -179,7 +178,7 @@ ResultCacheHash_equal(struct resultcache_hash *tb, const ResultCacheKey *key1, const ResultCacheKey *key2) { ResultCacheState *rcstate = (ResultCacheState *) tb->private_data; - ExprContext *econtext = rcstate->ss.ps.ps_ExprContext; + ExprContext *econtext = rcstate->ps_ExprContext; TupleTableSlot *tslot = rcstate->tableslot; TupleTableSlot *pslot = rcstate->probeslot; @@ -223,7 +222,7 @@ prepare_probe_slot(ResultCacheState *rcstate, ResultCacheKey *key) /* Set the probeslot's values based on the current parameter values */ for (int i = 0; i < numKeys; i++) pslot->tts_values[i] = ExecEvalExpr(rcstate->param_exprs[i], - rcstate->ss.ps.ps_ExprContext, + rcstate->ps_ExprContext, &pslot->tts_isnull[i]); } else @@ -243,7 +242,7 @@ prepare_probe_slot(ResultCacheState *rcstate, ResultCacheKey *key) * Remove all tuples from a cache entry, leaving an empty cache entry. * Also update memory accounting to reflect the removal of the tuples. 
*/ -static inline void +static void entry_purge_tuples(ResultCacheState *rcstate, ResultCacheEntry *entry) { ResultCacheTuple *tuple = entry->tuplehead; @@ -590,21 +589,32 @@ cache_store_tuple(ResultCacheState *rcstate, TupleTableSlot *slot) return true; } -static TupleTableSlot * -ExecResultCache(PlanState *pstate) +/* + * Caller to call this after it finishes a parameterized scan + */ +void +ExecResultCacheFinishScan(ResultCacheState *rcstate) +{ + rcstate->rc_status = RC_CACHE_LOOKUP; + + /* nullify pointers used for the last scan */ + rcstate->entry = NULL; + rcstate->last_tuple = NULL; +} + +TupleTableSlot * +ExecResultCache(ResultCacheState *rcstate, PlanState *innerPlan) { - ResultCacheState *node = castNode(ResultCacheState, pstate); - PlanState *outerNode; TupleTableSlot *slot; - switch (node->rc_status) + switch (rcstate->rc_status) { case RC_CACHE_LOOKUP: { ResultCacheEntry *entry; bool found; - Assert(node->entry == NULL); + Assert(rcstate->entry == NULL); /* * We're only ever in this state for the first call of the @@ -619,44 +629,43 @@ ExecResultCache(PlanState *pstate) * one there, we'll try to cache it. */ - /* see if we've got anything cached for the current parameters */ - entry = cache_lookup(node, &found); + /* see if we've got anything cached for the current parameters */ + entry = cache_lookup(rcstate, &found); if (found && entry->complete) { - node->stats.cache_hits += 1; /* stats update */ + rcstate->stats.cache_hits += 1; /* stats update */ /* * Set last_tuple and entry so that the state * RC_CACHE_FETCH_NEXT_TUPLE can easily find the next * tuple for these parameters. */ - node->last_tuple = entry->tuplehead; - node->entry = entry; + rcstate->last_tuple = entry->tuplehead; + rcstate->entry = entry; /* Fetch the first cached tuple, if there is one */ if (entry->tuplehead) { - node->rc_status = RC_CACHE_FETCH_NEXT_TUPLE; + rcstate->rc_status = RC_CACHE_FETCH_NEXT_TUPLE; - slot = node->ss.ps.ps_ResultTupleSlot; - ExecStoreMinimalTuple(entry->tuplehead->mintuple, - slot, false); - - return slot; + ExecClearTuple(rcstate->cachefoundslot); + slot = rcstate->cachefoundslotmin; + ExecStoreMinimalTuple(rcstate->last_tuple->mintuple, slot, false); + return ExecCopySlot(rcstate->cachefoundslot, slot); } else { /* The cache entry is void of any tuples. */ - node->rc_status = RC_END_OF_SCAN; + rcstate->rc_status = RC_END_OF_SCAN; return NULL; } } else { - TupleTableSlot *outerslot; + TupleTableSlot *innerslot; - node->stats.cache_misses += 1; /* stats update */ + rcstate->stats.cache_misses += 1; /* stats update */ if (found) { @@ -668,13 +677,12 @@ ExecResultCache(PlanState *pstate) * guarantee the outer node will produce the tuples in * the same order as it did last time. */ - entry_purge_tuples(node, entry); + entry_purge_tuples(rcstate, entry); } /* Scan the outer node for a tuple to cache */ - outerNode = outerPlanState(node); - outerslot = ExecProcNode(outerNode); - if (TupIsNull(outerslot)) + innerslot = ExecProcNode(innerPlan); + if (TupIsNull(innerslot)) { /* * cache_lookup may have returned NULL due to failure @@ -686,22 +694,22 @@ ExecResultCache(PlanState *pstate) if (likely(entry)) entry->complete = true; - node->rc_status = RC_END_OF_SCAN; + rcstate->rc_status = RC_END_OF_SCAN; return NULL; } - node->entry = entry; + rcstate->entry = entry; /* * If we failed to create the entry or failed to store the * tuple in the entry, then go into bypass mode. 
*/ if (unlikely(entry == NULL || - !cache_store_tuple(node, outerslot))) + !cache_store_tuple(rcstate, innerslot))) { - node->stats.cache_overflows += 1; /* stats update */ + rcstate->stats.cache_overflows += 1; /* stats update */ - node->rc_status = RC_CACHE_BYPASS_MODE; + rcstate->rc_status = RC_CACHE_BYPASS_MODE; /* * No need to clear out last_tuple as we'll stay in @@ -716,43 +724,41 @@ ExecResultCache(PlanState *pstate) * allows cache lookups to work even when the scan has * not been executed to completion. */ - entry->complete = node->singlerow; - node->rc_status = RC_FILLING_CACHE; + entry->complete = rcstate->singlerow; + rcstate->rc_status = RC_FILLING_CACHE; } - slot = node->ss.ps.ps_ResultTupleSlot; - ExecCopySlot(slot, outerslot); - return slot; + return innerslot; } } case RC_CACHE_FETCH_NEXT_TUPLE: { /* We shouldn't be in this state if these are not set */ - Assert(node->entry != NULL); - Assert(node->last_tuple != NULL); + Assert(rcstate->entry != NULL); + Assert(rcstate->last_tuple != NULL); /* Skip to the next tuple to output */ - node->last_tuple = node->last_tuple->next; + rcstate->last_tuple = rcstate->last_tuple->next; /* No more tuples in the cache */ - if (node->last_tuple == NULL) + if (rcstate->last_tuple == NULL) { - node->rc_status = RC_END_OF_SCAN; + rcstate->rc_status = RC_END_OF_SCAN; return NULL; } - slot = node->ss.ps.ps_ResultTupleSlot; - ExecStoreMinimalTuple(node->last_tuple->mintuple, slot, - false); + ExecClearTuple(rcstate->cachefoundslot); + slot = rcstate->cachefoundslotmin; + ExecStoreMinimalTuple(rcstate->last_tuple->mintuple, slot, false); - return slot; + return ExecCopySlot(rcstate->cachefoundslot, slot); } case RC_FILLING_CACHE: { - TupleTableSlot *outerslot; - ResultCacheEntry *entry = node->entry; + TupleTableSlot *innerslot; + ResultCacheEntry *entry = rcstate->entry; /* entry should already have been set by RC_CACHE_LOOKUP */ Assert(entry != NULL); @@ -762,13 +768,12 @@ ExecResultCache(PlanState *pstate) * miss and are populating the cache with the current scan * tuples. */ - outerNode = outerPlanState(node); - outerslot = ExecProcNode(outerNode); - if (TupIsNull(outerslot)) + innerslot = ExecProcNode(innerPlan); + if (TupIsNull(innerslot)) { /* No more tuples. Mark it as complete */ entry->complete = true; - node->rc_status = RC_END_OF_SCAN; + rcstate->rc_status = RC_END_OF_SCAN; return NULL; } else @@ -782,12 +787,12 @@ ExecResultCache(PlanState *pstate) elog(ERROR, "cache entry already complete"); /* Record the tuple in the current cache entry */ - if (unlikely(!cache_store_tuple(node, outerslot))) + if (unlikely(!cache_store_tuple(rcstate, innerslot))) { /* Couldn't store it? Handle overflow */ - node->stats.cache_overflows += 1; /* stats update */ + rcstate->stats.cache_overflows += 1; /* stats update */ - node->rc_status = RC_CACHE_BYPASS_MODE; + rcstate->rc_status = RC_CACHE_BYPASS_MODE; /* * No need to clear out entry or last_tuple as we'll @@ -795,32 +800,27 @@ ExecResultCache(PlanState *pstate) */ } - slot = node->ss.ps.ps_ResultTupleSlot; - ExecCopySlot(slot, outerslot); - return slot; + return innerslot; } } case RC_CACHE_BYPASS_MODE: { - TupleTableSlot *outerslot; + TupleTableSlot *innerslot; /* * When in bypass mode we just continue to read tuples without * caching. We need to wait until the next rescan before we * can come out of this mode. 
*/ - outerNode = outerPlanState(node); - outerslot = ExecProcNode(outerNode); - if (TupIsNull(outerslot)) + innerslot = ExecProcNode(innerPlan); + if (TupIsNull(innerslot)) { - node->rc_status = RC_END_OF_SCAN; + rcstate->rc_status = RC_END_OF_SCAN; return NULL; } - slot = node->ss.ps.ps_ResultTupleSlot; - ExecCopySlot(slot, outerslot); - return slot; + return innerslot; } case RC_END_OF_SCAN: @@ -833,60 +833,34 @@ ExecResultCache(PlanState *pstate) default: elog(ERROR, "unrecognized resultcache state: %d", - (int) node->rc_status); + (int) rcstate->rc_status); return NULL; } /* switch */ } ResultCacheState * -ExecInitResultCache(ResultCache *node, EState *estate, int eflags) +ExecInitResultCache(NestLoop *node, PlanState *planstate, PlanState *cache_planstate) { ResultCacheState *rcstate = makeNode(ResultCacheState); - Plan *outerNode; int i; int nkeys; Oid *eqfuncoids; - /* check for unsupported flags */ - Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); - - rcstate->ss.ps.plan = (Plan *) node; - rcstate->ss.ps.state = estate; - rcstate->ss.ps.ExecProcNode = ExecResultCache; - - /* - * Miscellaneous initialization - * - * create expression context for node - */ - ExecAssignExprContext(estate, &rcstate->ss.ps); - - outerNode = outerPlan(node); - outerPlanState(rcstate) = ExecInitNode(outerNode, estate, eflags); - - /* - * Initialize return slot and type. No need to initialize projection info - * because this node doesn't do projections. - */ - ExecInitResultTupleSlotTL(&rcstate->ss.ps, &TTSOpsMinimalTuple); - rcstate->ss.ps.ps_ProjInfo = NULL; - - /* - * Initialize scan slot and type. - */ - ExecCreateScanSlotFromOuterPlan(estate, &rcstate->ss, &TTSOpsMinimalTuple); - - /* - * Set the state machine to lookup the cache. We won't find anything - * until we cache something, but this saves a special case to create the - * first entry. - */ + rcstate->ps_ExprContext = CreateExprContext(planstate->state); rcstate->rc_status = RC_CACHE_LOOKUP; rcstate->nkeys = nkeys = node->numKeys; rcstate->hashkeydesc = ExecTypeFromExprList(node->param_exprs); rcstate->tableslot = MakeSingleTupleTableSlot(rcstate->hashkeydesc, &TTSOpsMinimalTuple); + /* XXX this should make a slot the same type as cache_planstates result slot. For now + * that'll always be a nested loop, so just make a virtual slot, which is what nested loop + * uses. 
+ */ + rcstate->cachefoundslot = MakeSingleTupleTableSlot(cache_planstate->ps_ResultTupleDesc, + &TTSOpsVirtual); + rcstate->cachefoundslotmin = MakeSingleTupleTableSlot(cache_planstate->ps_ResultTupleDesc, + &TTSOpsMinimalTuple); rcstate->probeslot = MakeSingleTupleTableSlot(rcstate->hashkeydesc, &TTSOpsVirtual); @@ -910,7 +884,7 @@ ExecInitResultCache(ResultCache *node, EState *estate, int eflags) fmgr_info(left_hashfn, &rcstate->hashfunctions[i]); - rcstate->param_exprs[i] = ExecInitExpr(param_expr, (PlanState *) rcstate); + rcstate->param_exprs[i] = ExecInitExpr(param_expr, (PlanState *)planstate); eqfuncoids[i] = get_opcode(hashop); } @@ -919,7 +893,7 @@ ExecInitResultCache(ResultCache *node, EState *estate, int eflags) eqfuncoids, node->collations, node->param_exprs, - (PlanState *) rcstate); + (PlanState *) planstate); pfree(eqfuncoids); rcstate->mem_used = 0; @@ -970,57 +944,12 @@ ExecInitResultCache(ResultCache *node, EState *estate, int eflags) void ExecEndResultCache(ResultCacheState *node) { - /* - * When ending a parallel worker, copy the statistics gathered by the - * worker back into shared memory so that it can be picked up by the main - * process to report in EXPLAIN ANALYZE. - */ - if (node->shared_info && IsParallelWorker()) - { - ResultCacheInstrumentation *si; - - Assert(ParallelWorkerNumber <= node->shared_info->num_workers); - si = &node->shared_info->sinstrument[ParallelWorkerNumber]; - memcpy(si, &node->stats, sizeof(ResultCacheInstrumentation)); - } - /* Remove the cache context */ MemoryContextDelete(node->tableContext); - ExecClearTuple(node->ss.ss_ScanTupleSlot); - /* must drop pointer to cache result tuple */ - ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); - - /* - * free exprcontext - */ - ExecFreeExprContext(&node->ss.ps); - - /* - * shut down the subplan - */ - ExecEndNode(outerPlanState(node)); -} - -void -ExecReScanResultCache(ResultCacheState *node) -{ - PlanState *outerPlan = outerPlanState(node); - - /* Mark that we must lookup the cache for a new set of parameters */ - node->rc_status = RC_CACHE_LOOKUP; - - /* nullify pointers used for the last scan */ - node->entry = NULL; - node->last_tuple = NULL; - - /* - * if chgParam of subnode is not null then plan will be re-scanned by - * first ExecProcNode. - */ - if (outerPlan->chgParam == NULL) - ExecReScan(outerPlan); - + ExecClearTuple(node->cachefoundslot); + ExecClearTuple(node->cachefoundslotmin); + FreeExprContext(node->ps_ExprContext, false); } /* @@ -1035,88 +964,3 @@ ExecEstimateCacheEntryOverheadBytes(double ntuples) sizeof(ResultCacheTuple) * ntuples; } -/* ---------------------------------------------------------------- - * Parallel Query Support - * ---------------------------------------------------------------- - */ - - /* ---------------------------------------------------------------- - * ExecResultCacheEstimate - * - * Estimate space required to propagate result cache statistics. 
- * ---------------------------------------------------------------- - */ -void -ExecResultCacheEstimate(ResultCacheState *node, ParallelContext *pcxt) -{ - Size size; - - /* don't need this if not instrumenting or no workers */ - if (!node->ss.ps.instrument || pcxt->nworkers == 0) - return; - - size = mul_size(pcxt->nworkers, sizeof(ResultCacheInstrumentation)); - size = add_size(size, offsetof(SharedResultCacheInfo, sinstrument)); - shm_toc_estimate_chunk(&pcxt->estimator, size); - shm_toc_estimate_keys(&pcxt->estimator, 1); -} - -/* ---------------------------------------------------------------- - * ExecResultCacheInitializeDSM - * - * Initialize DSM space for result cache statistics. - * ---------------------------------------------------------------- - */ -void -ExecResultCacheInitializeDSM(ResultCacheState *node, ParallelContext *pcxt) -{ - Size size; - - /* don't need this if not instrumenting or no workers */ - if (!node->ss.ps.instrument || pcxt->nworkers == 0) - return; - - size = offsetof(SharedResultCacheInfo, sinstrument) - + pcxt->nworkers * sizeof(ResultCacheInstrumentation); - node->shared_info = shm_toc_allocate(pcxt->toc, size); - /* ensure any unfilled slots will contain zeroes */ - memset(node->shared_info, 0, size); - node->shared_info->num_workers = pcxt->nworkers; - shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, - node->shared_info); -} - -/* ---------------------------------------------------------------- - * ExecResultCacheInitializeWorker - * - * Attach worker to DSM space for result cache statistics. - * ---------------------------------------------------------------- - */ -void -ExecResultCacheInitializeWorker(ResultCacheState *node, ParallelWorkerContext *pwcxt) -{ - node->shared_info = - shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); -} - -/* ---------------------------------------------------------------- - * ExecResultCacheRetrieveInstrumentation - * - * Transfer result cache statistics from DSM to private memory. - * ---------------------------------------------------------------- - */ -void -ExecResultCacheRetrieveInstrumentation(ResultCacheState *node) -{ - Size size; - SharedResultCacheInfo *si; - - if (node->shared_info == NULL) - return; - - size = offsetof(SharedResultCacheInfo, sinstrument) - + node->shared_info->num_workers * sizeof(ResultCacheInstrumentation); - si = palloc(size); - memcpy(si, node->shared_info, size); - node->shared_info = si; -} diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index e50844df9b..0101d719c4 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -2298,148 +2298,6 @@ cost_material(Path *path, path->total_cost = startup_cost + run_cost; } -/* - * cost_resultcache_rescan - * Determines the estimated cost of rescanning a ResultCache node. - * - * In order to estimate this, we must gain knowledge of how often we expect to - * be called and how many distinct sets of parameters we are likely to be - * called with. If we expect a good cache hit ratio, then we can set our - * costs to account for that hit ratio, plus a little bit of cost for the - * caching itself. Caching will not work out well if we expect to be called - * with too many distinct parameter values. The worst-case here is that we - * never see the same parameter values twice, in which case we'd never get a - * cache hit and caching would be a complete waste of effort. 
- */ -static void -cost_resultcache_rescan(PlannerInfo *root, ResultCachePath *rcpath, - Cost *rescan_startup_cost, Cost *rescan_total_cost) -{ - Cost input_startup_cost = rcpath->subpath->startup_cost; - Cost input_total_cost = rcpath->subpath->total_cost; - double tuples = rcpath->subpath->rows; - double calls = rcpath->calls; - int width = rcpath->subpath->pathtarget->width; - int flags; - - double work_mem_bytes; - double est_entry_bytes; - double est_cache_entries; - double ndistinct; - double evict_ratio; - double hit_ratio; - Cost startup_cost; - Cost total_cost; - - /* available cache space */ - work_mem_bytes = work_mem * 1024L; - - /* - * Set the number of bytes each cache entry should consume in the cache. - * To provide us with better estimations on how many cache entries we can - * store at once we make a call to the excutor here to ask it what memory - * overheads there are for a single cache entry. - * - * XXX we also store the cache key, but that's not accounted for here. - */ - est_entry_bytes = relation_byte_size(tuples, width) + - ExecEstimateCacheEntryOverheadBytes(tuples); - - /* estimate on the upper limit of cache entries we can hold at once */ - est_cache_entries = floor(work_mem_bytes / est_entry_bytes); - - /* estimate on the distinct number of parameter values */ - ndistinct = estimate_num_groups(root, rcpath->param_exprs, calls, NULL, - &flags); - - /* - * When the estimation fell back on using a default value, it's a bit too - * risky to assume that it's ok to use a Result Cache. The use of a - * default could cause us to use a Result Cache when it's really - * inappropriate to do so. If we see that this has been done then we'll - * assume that every call will have unique parameters, which will almost - * certainly mean a ResultCachePath will never survive add_path(). - */ - if ((flags & SELFLAG_USED_DEFAULT) != 0) - ndistinct = calls; - - /* - * Since we've already estimated the maximum number of entries we can - * store at once and know the estimated number of distinct values we'll be - * called with, well take this opportunity to set the path's est_entries. - * This will ultimately determine the hash table size that the executor - * will use. If we leave this at zero the executor will just choose the - * size itself. Really this is not the right place to do this, but it's - * convenient since everything is already calculated. - */ - rcpath->est_entries = Min(Min(ndistinct, est_cache_entries), - PG_UINT32_MAX); - - - /* - * When the number of distinct parameter values is above the amount we can - * store in the cache, then we'll have to evict some entries from the - * cache. This is not free, so here we estimate how often we'll incur the - * cost of that eviction. - */ - evict_ratio = 1.0 - Min(est_cache_entries, ndistinct) / ndistinct; - - /* - * In order to estimate how costly a single scan will be, we need to - * attempt to estimate what the cache hit ratio will be. To do that we - * must look at how many scans are estimated in total of this node and how - * many of those scans we expect to get a cache hit. - */ - hit_ratio = 1.0 / ndistinct * Min(est_cache_entries, ndistinct) - - (ndistinct / calls); - - /* Ensure we don't go negative */ - hit_ratio = Max(hit_ratio, 0); - - /* - * Set the total_cost accounting for the expected cache hit ratio. We - * also add on a cpu_operator_cost to account for a cache lookup. This - * will happen regardless of if it's a cache hit or not. 
- */ - total_cost = input_total_cost * (1.0 - hit_ratio) + cpu_operator_cost; - - /* Now adjust the total cost to account for cache evictions */ - - /* Charge a cpu_tuple_cost for evicting the actual cache entry */ - total_cost += cpu_tuple_cost * evict_ratio; - - /* - * Charge a 10th of cpu_operator_cost to evict every tuple in that entry. - * The per-tuple eviction is really just a pfree, so charging a whole - * cpu_operator_cost seems a little excessive. - */ - total_cost += cpu_operator_cost / 10.0 * evict_ratio * tuples; - - /* - * Now adjust for storing things in the cache, since that's not free - * either. Everything must go in the cache, so we don't proportion this - * over any ratio, just apply it once for the scan. We charge a - * cpu_tuple_cost for the creation of the cache entry and also a - * cpu_operator_cost for each tuple we expect to cache. - */ - total_cost += cpu_tuple_cost + cpu_operator_cost * tuples; - - /* - * Getting the first row must be also be proportioned according to the - * expected cache hit ratio. - */ - startup_cost = input_startup_cost * (1.0 - hit_ratio); - - /* - * Additionally we charge a cpu_tuple_cost to account for cache lookups, - * which we'll do regardless of if it was a cache hit or not. - */ - startup_cost += cpu_tuple_cost; - - *rescan_startup_cost = startup_cost; - *rescan_total_cost = total_cost; -} - /* * cost_agg * Determines and returns the cost of performing an Agg plan node, @@ -4167,11 +4025,6 @@ cost_rescan(PlannerInfo *root, Path *path, *rescan_total_cost = run_cost; } break; - case T_ResultCache: - /* All the hard work is done by cost_resultcache_rescan */ - cost_resultcache_rescan(root, (ResultCachePath *) path, - rescan_startup_cost, rescan_total_cost); - break; default: *rescan_startup_cost = path->startup_cost; *rescan_total_cost = path->total_cost; diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index f4c76577ad..5918dd9a3a 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -17,13 +17,16 @@ #include #include "executor/executor.h" +#include "executor/nodeResultCache.h" #include "foreign/fdwapi.h" +#include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "optimizer/cost.h" #include "optimizer/optimizer.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/planmain.h" +#include "utils/selfuncs.h" #include "utils/typcache.h" /* Hook for plugins to get control in add_paths_to_joinrel() */ @@ -554,6 +557,152 @@ get_resultcache_path(PlannerInfo *root, RelOptInfo *innerrel, return NULL; } +static double +relation_byte_size(double tuples, int width) +{ + return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader)); +} + +/* + * cost_resultcache_rescan + * Determines the estimated cost of rescanning a ResultCache node. + * + * In order to estimate this, we must gain knowledge of how often we expect to + * be called and how many distinct sets of parameters we are likely to be + * called with. If we expect a good cache hit ratio, then we can set our + * costs to account for that hit ratio, plus a little bit of cost for the + * caching itself. Caching will not work out well if we expect to be called + * with too many distinct parameter values. The worst-case here is that we + * never see the same parameter values twice, in which case we'd never get a + * cache hit and caching would be a complete waste of effort. 
+ */ +static bool +use_nestedloop_cache(PlannerInfo *root, NestPath *nlpath) +{ + Cost input_startup_cost = nlpath->innerjoinpath->startup_cost; + Cost input_total_cost = nlpath->innerjoinpath->total_cost; + double tuples = nlpath->innerjoinpath->rows; + double calls = nlpath->outerjoinpath->rows; + int width = nlpath->innerjoinpath->pathtarget->width; + int flags; + + double work_mem_bytes; + double est_entry_bytes; + double est_cache_entries; + double ndistinct; + double evict_ratio; + double hit_ratio; + Cost startup_cost; + Cost total_cost; + + /* available cache space */ + work_mem_bytes = work_mem * 1024L; + + /* + * Set the number of bytes each cache entry should consume in the cache. + * To provide us with better estimations on how many cache entries we can + * store at once we make a call to the excutor here to ask it what memory + * overheads there are for a single cache entry. + * + * XXX we also store the cache key, but that's not accounted for here. + */ + est_entry_bytes = relation_byte_size(tuples, width) + + ExecEstimateCacheEntryOverheadBytes(tuples); + + /* estimate on the upper limit of cache entries we can hold at once */ + est_cache_entries = floor(work_mem_bytes / est_entry_bytes); + + /* estimate on the distinct number of parameter values */ + ndistinct = 1; // estimate_num_groups(root, nlpath->rcpath->param_exprs, calls, NULL, + //&flags); + + /* + * When the estimation fell back on using a default value, it's a bit too + * risky to assume that it's ok to use a Result Cache. The use of a + * default could cause us to use a Result Cache when it's really + * inappropriate to do so. If we see that this has been done then we'll + * assume that every call will have unique parameters, which will almost + * certainly mean a ResultCachePath will never survive add_path(). + */ + if ((flags & SELFLAG_USED_DEFAULT) != 0) + ndistinct = calls; + + /* + * Since we've already estimated the maximum number of entries we can + * store at once and know the estimated number of distinct values we'll be + * called with, well take this opportunity to set the path's est_entries. + * This will ultimately determine the hash table size that the executor + * will use. If we leave this at zero the executor will just choose the + * size itself. Really this is not the right place to do this, but it's + * convenient since everything is already calculated. + */ + //nlpath->est_entries = Min(Min(ndistinct, est_cache_entries), + // PG_UINT32_MAX); + + + /* + * When the number of distinct parameter values is above the amount we can + * store in the cache, then we'll have to evict some entries from the + * cache. This is not free, so here we estimate how often we'll incur the + * cost of that eviction. + */ + evict_ratio = 1.0 - Min(est_cache_entries, ndistinct) / ndistinct; + + /* + * In order to estimate how costly a single scan will be, we need to + * attempt to estimate what the cache hit ratio will be. To do that we + * must look at how many scans are estimated in total of this node and how + * many of those scans we expect to get a cache hit. + */ + hit_ratio = 1.0 / ndistinct * Min(est_cache_entries, ndistinct) - + (ndistinct / calls); + + /* Ensure we don't go negative */ + hit_ratio = Max(hit_ratio, 0); + + /* + * Set the total_cost accounting for the expected cache hit ratio. We + * also add on a cpu_operator_cost to account for a cache lookup. This + * will happen regardless of if it's a cache hit or not. 
+ */ + total_cost = input_total_cost * (1.0 - hit_ratio) + cpu_operator_cost; + + /* Now adjust the total cost to account for cache evictions */ + + /* Charge a cpu_tuple_cost for evicting the actual cache entry */ + total_cost += cpu_tuple_cost * evict_ratio; + + /* + * Charge a 10th of cpu_operator_cost to evict every tuple in that entry. + * The per-tuple eviction is really just a pfree, so charging a whole + * cpu_operator_cost seems a little excessive. + */ + total_cost += cpu_operator_cost / 10.0 * evict_ratio * tuples; + + /* + * Now adjust for storing things in the cache, since that's not free + * either. Everything must go in the cache, so we don't proportion this + * over any ratio, just apply it once for the scan. We charge a + * cpu_tuple_cost for the creation of the cache entry and also a + * cpu_operator_cost for each tuple we expect to cache. + */ + total_cost += cpu_tuple_cost + cpu_operator_cost * tuples; + + /* + * Getting the first row must be also be proportioned according to the + * expected cache hit ratio. + */ + startup_cost = input_startup_cost * (1.0 - hit_ratio); + + /* + * Additionally we charge a cpu_tuple_cost to account for cache lookups, + * which we'll do regardless of if it was a cache hit or not. + */ + startup_cost += cpu_tuple_cost; + + return total_cost < nlpath->innerjoinpath->total_cost; +} + /* * try_nestloop_path * Consider a nestloop join path; if it appears useful, push it into @@ -576,8 +725,7 @@ try_nestloop_path(PlannerInfo *root, Relids outerrelids; Relids inner_paramrels = PATH_REQ_OUTER(inner_path); Relids outer_paramrels = PATH_REQ_OUTER(outer_path); - Path *inner_cache_path; - bool added_path = false; + ResultCachePath *rcpath; /* * Paths are parameterized by top-level parents, so run parameterization @@ -628,6 +776,7 @@ try_nestloop_path(PlannerInfo *root, workspace.startup_cost, workspace.total_cost, pathkeys, required_outer)) { + NestPath *nlpath; /* * If the inner path is parameterized, it is parameterized by the * topmost parent of the outer rel, not the outer rel itself. Fix @@ -649,103 +798,37 @@ try_nestloop_path(PlannerInfo *root, } } - add_path(joinrel, (Path *) - create_nestloop_path(root, - joinrel, - jointype, - &workspace, - extra, - outer_path, - inner_path, - extra->restrictlist, - pathkeys, - required_outer)); - added_path = true; - } - - /* - * See if we can build a result cache path for this inner_path. That might - * make the nested loop cheaper. - */ - inner_cache_path = get_resultcache_path(root, innerrel, outerrel, - inner_path, outer_path, jointype, - extra); - - if (inner_cache_path == NULL) - { - if (!added_path) - bms_free(required_outer); - return; - } - - initial_cost_nestloop(root, &workspace, jointype, - outer_path, inner_cache_path, extra); - - if (add_path_precheck(joinrel, - workspace.startup_cost, workspace.total_cost, - pathkeys, required_outer)) - { /* - * If the inner path is parameterized, it is parameterized by the - * topmost parent of the outer rel, not the outer rel itself. Fix - * that. + * See if we can build a result cache path for this inner_path. That might + * make the nested loop cheaper. 
*/ - if (PATH_PARAM_BY_PARENT(inner_cache_path, outer_path->parent)) + rcpath = (ResultCachePath *) get_resultcache_path(root, innerrel, outerrel, + inner_path, outer_path, jointype, + extra); + + nlpath = create_nestloop_path(root, + joinrel, + jointype, + &workspace, + extra, + outer_path, + inner_path, + extra->restrictlist, + pathkeys, + required_outer); + + if (rcpath != NULL) { - Path *reparameterize_path; - - reparameterize_path = reparameterize_path_by_child(root, - inner_cache_path, - outer_path->parent); - - /* - * If we could not translate the path, we can't create nest loop - * path. - */ - if (!reparameterize_path) - { - ResultCachePath *rcpath = (ResultCachePath *) inner_cache_path; - - /* Waste no memory when we reject a path here */ - list_free(rcpath->hash_operators); - list_free(rcpath->param_exprs); - pfree(rcpath); - - if (!added_path) - bms_free(required_outer); - return; - } + nlpath->use_cache = true; + nlpath->hash_operators = rcpath->hash_operators; + nlpath->param_exprs = rcpath->param_exprs; + nlpath->singlerow = rcpath->singlerow; + nlpath->calls = rcpath->calls; + nlpath->est_entries = rcpath->est_entries; } - add_path(joinrel, (Path *) - create_nestloop_path(root, - joinrel, - jointype, - &workspace, - extra, - outer_path, - inner_cache_path, - extra->restrictlist, - pathkeys, - required_outer)); - added_path = true; + add_path(joinrel, (Path *)nlpath); } - else - { - ResultCachePath *rcpath = (ResultCachePath *) inner_cache_path; - - /* Waste no memory when we reject a path here */ - list_free(rcpath->hash_operators); - list_free(rcpath->param_exprs); - pfree(rcpath); - } - - if (!added_path) - { - /* Waste no memory when we reject a path here */ - bms_free(required_outer); - } - } /* diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 45e211262a..7afb7741d0 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -4147,6 +4147,7 @@ create_nestloop_plan(PlannerInfo *root, Relids outerrelids; List *nestParams; Relids saveOuterRels = root->curOuterRels; + List *param_exprs = NIL; /* NestLoop can project, so no need to be picky about child tlists */ outer_plan = create_plan_recurse(root, best_path->outerjoinpath, 0); @@ -4157,6 +4158,9 @@ create_nestloop_plan(PlannerInfo *root, inner_plan = create_plan_recurse(root, best_path->innerjoinpath, 0); + param_exprs = (List *) replace_nestloop_params(root, (Node *) + best_path->param_exprs); + /* Restore curOuterRels */ bms_free(root->curOuterRels); root->curOuterRels = saveOuterRels; @@ -4204,6 +4208,54 @@ create_nestloop_plan(PlannerInfo *root, best_path->jointype, best_path->inner_unique); + //bool paramcache; + //int numKeys; /* size of the two arrays below */ + + //Oid *hashOperators; /* hash operators for each key */ + //Oid *collations; /* cache keys */ + //List *param_exprs; /* exprs containing parameters */ + //bool singlerow; /* true if the cache entry should be marked as + // * complete after we store the first tuple in + // * it. 
*/ + //uint32 est_entries; /* The maximum number of entries that the + // * planner expects will fit in the cache, or 0 + // * if unknown */ + + if (best_path->use_cache) + { + Oid *operators; + Oid *collations; + ListCell *lc; + ListCell *lc2; + int nkeys; + int i; + + join_plan->numKeys = list_length(best_path->param_exprs); + + nkeys = list_length(param_exprs); + Assert(nkeys > 0); + operators = palloc(nkeys * sizeof(Oid)); + collations = palloc(nkeys * sizeof(Oid)); + + i = 0; + forboth(lc, param_exprs, lc2, best_path->hash_operators) + { + Expr *param_expr = (Expr *)lfirst(lc); + Oid opno = lfirst_oid(lc2); + + operators[i] = opno; + collations[i] = exprCollation((Node *)param_expr); + i++; + } + join_plan->paramcache = true; + join_plan->param_exprs = param_exprs; + join_plan->hashOperators = operators; + join_plan->collations = collations; + join_plan->singlerow = best_path->singlerow; + join_plan->est_entries = best_path->est_entries; + + } + copy_generic_path_info(&join_plan->join.plan, &best_path->path); return join_plan; diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 3e2c61b0a0..9da223139a 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -137,74 +137,6 @@ get_first_col_type(Plan *plan, Oid *coltype, int32 *coltypmod, *colcollation = InvalidOid; } - -/* - * outer_params_hashable - * Determine if it's valid to use a ResultCache node to cache already - * seen rows matching a given set of parameters instead of performing a - * rescan of the subplan pointed to by 'subroot'. If it's valid, check - * if all parameters required by this query level can be hashed. If so, - * return true and set 'operators' to the list of hash equality operators - * for the given parameters then populate 'param_exprs' with each - * PARAM_EXEC parameter that the subplan requires the outer query to pass - * it. When hashing is not possible, false is returned and the two - * output lists are unchanged. - */ -static bool -outer_params_hashable(PlannerInfo *subroot, List *plan_params, List **operators, - List **param_exprs) -{ - List *oplist = NIL; - List *exprlist = NIL; - ListCell *lc; - - /* Ensure we're not given a top-level query. */ - Assert(subroot->parent_root != NULL); - - /* - * It's not valid to use a Result Cache node if there are any volatile - * function in the subquery. Caching could cause fewer evaluations of - * volatile functions that have side-effects - */ - if (contain_volatile_functions((Node *) subroot->parse)) - return false; - - foreach(lc, plan_params) - { - PlannerParamItem *ppi = (PlannerParamItem *) lfirst(lc); - TypeCacheEntry *typentry; - Node *expr = ppi->item; - Param *param; - - param = makeNode(Param); - param->paramkind = PARAM_EXEC; - param->paramid = ppi->paramId; - param->paramtype = exprType(expr); - param->paramtypmod = exprTypmod(expr); - param->paramcollid = exprCollation(expr); - param->location = -1; - - typentry = lookup_type_cache(param->paramtype, - TYPECACHE_HASH_PROC | TYPECACHE_EQ_OPR); - - /* XXX will eq_opr ever be invalid if hash_proc isn't? */ - if (!OidIsValid(typentry->hash_proc) || !OidIsValid(typentry->eq_opr)) - { - list_free(oplist); - list_free(exprlist); - return false; - } - - oplist = lappend_oid(oplist, typentry->eq_opr); - exprlist = lappend(exprlist, param); - } - - *operators = oplist; - *param_exprs = exprlist; - - return true; /* all params can be hashed */ -} - /* * Convert a SubLink (as created by the parser) into a SubPlan. 
* @@ -311,30 +243,30 @@ make_subplan(PlannerInfo *root, Query *orig_subquery, * regardless. It may be useful if we can only do this when it seems * likely that we'll get some repeat lookups, i.e. cache hits. */ - if (enable_resultcache && plan_params != NIL && subLinkType == EXPR_SUBLINK) - { - List *operators; - List *param_exprs; - - /* Determine if all the subplan parameters can be hashed */ - if (outer_params_hashable(subroot, plan_params, &operators, ¶m_exprs)) - { - ResultCachePath *cache_path; - - /* - * Pass -1 for the number of calls since we don't have any ideas - * what that'll be. - */ - cache_path = create_resultcache_path(root, - best_path->parent, - best_path, - param_exprs, - operators, - false, - -1); - best_path = (Path *) cache_path; - } - } + //if (enable_resultcache && plan_params != NIL && subLinkType == EXPR_SUBLINK) + //{ + // List *operators; + // List *param_exprs; + + // /* Determine if all the subplan parameters can be hashed */ + // if (outer_params_hashable(subroot, plan_params, &operators, ¶m_exprs)) + // { + // ResultCachePath *cache_path; + + // /* + // * Pass -1 for the number of calls since we don't have any ideas + // * what that'll be. + // */ + // cache_path = create_resultcache_path(root, + // best_path->parent, + // best_path, + // param_exprs, + // operators, + // false, + // -1); + // best_path = (Path *) cache_path; + // } + //} plan = create_plan(subroot, best_path); diff --git a/src/include/executor/nodeResultCache.h b/src/include/executor/nodeResultCache.h index d2f3ed9a74..440019d141 100644 --- a/src/include/executor/nodeResultCache.h +++ b/src/include/executor/nodeResultCache.h @@ -15,16 +15,11 @@ #include "nodes/execnodes.h" -extern ResultCacheState *ExecInitResultCache(ResultCache *node, EState *estate, int eflags); +extern void ExecResultCacheFinishScan(ResultCacheState *rcstate); +extern TupleTableSlot *ExecResultCache(ResultCacheState *rcstate, PlanState *innerPlan); +extern ResultCacheState *ExecInitResultCache(NestLoop *node, PlanState *planstate, PlanState *cache_planstate); extern void ExecEndResultCache(ResultCacheState *node); extern void ExecReScanResultCache(ResultCacheState *node); extern double ExecEstimateCacheEntryOverheadBytes(double ntuples); -extern void ExecResultCacheEstimate(ResultCacheState *node, - ParallelContext *pcxt); -extern void ExecResultCacheInitializeDSM(ResultCacheState *node, - ParallelContext *pcxt); -extern void ExecResultCacheInitializeWorker(ResultCacheState *node, - ParallelWorkerContext *pwcxt); -extern void ExecResultCacheRetrieveInstrumentation(ResultCacheState *node); #endif /* NODERESULTCACHE_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 30f66d5058..a2a70151c9 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1855,12 +1855,15 @@ typedef struct JoinState * NullInnerTupleSlot prepared null tuple for left outer joins * ---------------- */ +struct ResultCacheState; typedef struct NestLoopState { JoinState js; /* its first field is NodeTag */ bool nl_NeedNewOuter; bool nl_MatchedOuter; + bool nl_ParamCache; TupleTableSlot *nl_NullInnerTupleSlot; + struct ResultCacheState *nl_pcache; } NestLoopState; /* ---------------- @@ -2022,12 +2025,15 @@ typedef struct SharedResultCacheInfo */ typedef struct ResultCacheState { - ScanState ss; /* its first field is NodeTag */ + ExprContext *ps_ExprContext; /* node's expression-evaluation context */ + //ScanState ss; /* its first field is NodeTag */ int rc_status; /* value of ExecResultCache's 
state machine */ int nkeys; /* number of hash table keys */ struct resultcache_hash *hashtable; /* hash table cache entries */ TupleDesc hashkeydesc; /* tuple descriptor for hash keys */ TupleTableSlot *tableslot; /* min tuple slot for existing cache entries */ + TupleTableSlot *cachefoundslot; /* Slot to return found cache entries */ + TupleTableSlot *cachefoundslotmin; /* Slot to return found cache entries */ TupleTableSlot *probeslot; /* virtual slot used for hash lookups */ ExprState *cache_eq_expr; /* Compare exec params to hash key */ ExprState **param_exprs; /* exprs containing the parameters to this diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 79a4ad20dd..31b158026c 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1546,6 +1546,16 @@ typedef struct JoinPath List *joinrestrictinfo; /* RestrictInfos to apply to join */ + bool use_cache; + List *hash_operators; /* hash operators for each key */ + List *param_exprs; /* cache keys */ + bool singlerow; /* true if the cache entry is to be marked as + * complete after caching the first record. */ + double calls; /* expected number of rescans */ + uint32 est_entries; /* The maximum number of entries that the + * planner expects will fit in the cache, or 0 + * if unknown */ + /* * See the notes for RelOptInfo and ParamPathInfo to understand why * joinrestrictinfo is needed in JoinPath, and can't be merged into the diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index ac5685da64..f989d31033 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -701,6 +701,18 @@ typedef struct NestLoop { Join join; List *nestParams; /* list of NestLoopParam nodes */ + bool paramcache; + int numKeys; /* size of the two arrays below */ + + Oid *hashOperators; /* hash operators for each key */ + Oid *collations; /* cache keys */ + List *param_exprs; /* exprs containing parameters */ + bool singlerow; /* true if the cache entry should be marked as + * complete after we store the first tuple in + * it. */ + uint32 est_entries; /* The maximum number of entries that the + * planner expects will fit in the cache, or 0 + * if unknown */ } NestLoop; typedef struct NestLoopParam
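
The hit-ratio arithmetic that cost_resultcache_rescan() / use_nestedloop_cache() rely on in the costing hunks above can be distilled into a few lines. The standalone C sketch below is not part of the patch: the function name, the hard-coded default values of cpu_operator_cost and cpu_tuple_cost, and the example figures in main() are illustrative assumptions. It mirrors the patch's arithmetic: the inner path's cost is discounted by the expected cache hit ratio, then charges are added for the hash lookup, for evictions, and for filling the cache. In the patch, caching is judged worthwhile when this figure comes out below the inner path's uncached total_cost.

#include <stdio.h>

#define Min(a, b) ((a) < (b) ? (a) : (b))
#define Max(a, b) ((a) > (b) ? (a) : (b))

/* PostgreSQL's default cost GUC values, hard-coded here for illustration */
static const double cpu_operator_cost = 0.0025;
static const double cpu_tuple_cost = 0.01;

/*
 * Illustrative sketch of the per-rescan total cost of a cached inner side,
 * following the arithmetic in cost_resultcache_rescan()/use_nestedloop_cache().
 */
static double
cached_rescan_total_cost(double input_total_cost,	/* inner path total cost */
						 double tuples,				/* inner path row estimate */
						 double calls,				/* expected number of rescans */
						 double ndistinct,			/* distinct parameter sets */
						 double est_cache_entries)	/* entries fitting in work_mem */
{
	double		evict_ratio;
	double		hit_ratio;
	double		total_cost;

	/* how often we expect to evict an entry to make room for a new one */
	evict_ratio = 1.0 - Min(est_cache_entries, ndistinct) / ndistinct;

	/* expected fraction of rescans that find a complete cached entry */
	hit_ratio = 1.0 / ndistinct * Min(est_cache_entries, ndistinct) -
		(ndistinct / calls);
	hit_ratio = Max(hit_ratio, 0);

	/* cache misses pay the full inner cost; every rescan pays one lookup */
	total_cost = input_total_cost * (1.0 - hit_ratio) + cpu_operator_cost;

	/* evicting an entry, plus a 10th of an operator cost per evicted tuple */
	total_cost += cpu_tuple_cost * evict_ratio;
	total_cost += cpu_operator_cost / 10.0 * evict_ratio * tuples;

	/* building the cache entry and storing each tuple in it */
	total_cost += cpu_tuple_cost + cpu_operator_cost * tuples;

	return total_cost;
}

int
main(void)
{
	/* e.g. inner path costing 10.0 for 5 rows, 1000 rescans over 10 keys */
	double		cost = cached_rescan_total_cost(10.0, 5.0, 1000.0, 10.0, 10.0);

	printf("estimated per-rescan cost with caching: %g\n", cost);
	return 0;
}

With the example figures above the cached rescan costs roughly 0.125 versus 10.0 for an uncached rescan, which is the situation in which the patch marks the join with use_cache / paramcache.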