From 162f58fe782e60d548a0598b0228c096438b9a44 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Thu, 13 Nov 2025 12:40:53 -0500
Subject: [PATCH v5 2/5] Factor out duplicative code in
 eqjoinsel_inner/eqjoinsel_semi.

These functions have essentially identical code for scanning the
two MCV lists and identifying which entries have matches in the
other list.  While it's not a huge amount of code, it's 50 or
so lines, and it will grow after an upcoming patch that uses a
hash table when there are many MCVs.  Let's reduce duplication by
moving that code into a common subroutine.

The one downside of doing this is that we must compute
sum(sslot1->numbers[i] * sslot2->numbers[j]) even though
eqjoinsel_semi won't need that.  But the cost of that appears
negligible, so I didn't trouble to invent a way of avoiding it.
---
 src/backend/utils/adt/selfuncs.c | 191 ++++++++++++++++---------------
 1 file changed, 99 insertions(+), 92 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index cb23ad52782..590b3a0c078 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -163,6 +163,11 @@ static double eqjoinsel_semi(Oid opfuncoid, Oid collation,
 							 Form_pg_statistic stats1, Form_pg_statistic stats2,
 							 bool have_mcvs1, bool have_mcvs2,
 							 RelOptInfo *inner_rel);
+static void eqjoinsel_find_matches(Oid opfuncoid, Oid collation,
+								   AttStatsSlot *sslot1, AttStatsSlot *sslot2,
+								   int nvalues1, int nvalues2,
+								   bool *hasmatch1, bool *hasmatch2,
+								   int *p_nmatches, double *p_matchprodfreq);
 static bool estimate_multivariate_ndistinct(PlannerInfo *root,
 											RelOptInfo *rel, List **varinfos, double *ndistinct);
 static bool convert_to_scalar(Datum value, Oid valuetypid, Oid collid,
@@ -2473,8 +2478,6 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
 		 * results", Technical Report 1018, Computer Science Dept., University
 		 * of Wisconsin, Madison, March 1991 (available from ftp.cs.wisc.edu).
 		 */
-		LOCAL_FCINFO(fcinfo, 2);
-		FmgrInfo	eqproc;
 		bool	   *hasmatch1;
 		bool	   *hasmatch2;
 		double		nullfrac1 = stats1->stanullfrac;
@@ -2491,55 +2494,17 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
 		int			i,
 					nmatches;
 
-		fmgr_info(opfuncoid, &eqproc);
-
-		/*
-		 * Save a few cycles by setting up the fcinfo struct just once. Using
-		 * FunctionCallInvoke directly also avoids failure if the eqproc
-		 * returns NULL, though really equality functions should never do
-		 * that.
-		 */
-		InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation,
-								 NULL, NULL);
-		fcinfo->args[0].isnull = false;
-		fcinfo->args[1].isnull = false;
-
+		/* Construct the match arrays */
 		hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool));
 		hasmatch2 = (bool *) palloc0(sslot2->nvalues * sizeof(bool));
 
-		/*
-		 * Note we assume that each MCV will match at most one member of the
-		 * other MCV list.  If the operator isn't really equality, there could
-		 * be multiple matches --- but we don't look for them, both for speed
-		 * and because the math wouldn't add up...
-		 */
-		matchprodfreq = 0.0;
-		nmatches = 0;
-		for (i = 0; i < sslot1->nvalues; i++)
-		{
-			int			j;
-
-			fcinfo->args[0].value = sslot1->values[i];
-
-			for (j = 0; j < sslot2->nvalues; j++)
-			{
-				Datum		fresult;
-
-				if (hasmatch2[j])
-					continue;
-				fcinfo->args[1].value = sslot2->values[j];
-				fcinfo->isnull = false;
-				fresult = FunctionCallInvoke(fcinfo);
-				if (!fcinfo->isnull && DatumGetBool(fresult))
-				{
-					hasmatch1[i] = hasmatch2[j] = true;
-					matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j];
-					nmatches++;
-					break;
-				}
-			}
-		}
+		eqjoinsel_find_matches(opfuncoid, collation,
+							   sslot1, sslot2,
+							   sslot1->nvalues, sslot2->nvalues,
+							   hasmatch1, hasmatch2,
+							   &nmatches, &matchprodfreq);
 		CLAMP_PROBABILITY(matchprodfreq);
+
 		/* Sum up frequencies of matched and unmatched MCVs */
 		matchfreq1 = unmatchfreq1 = 0.0;
 		for (i = 0; i < sslot1->nvalues; i++)
@@ -2700,12 +2665,11 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation,
 		 * lists.  We still have to estimate for the remaining population, but
 		 * in a skewed distribution this gives us a big leg up in accuracy.
 		 */
-		LOCAL_FCINFO(fcinfo, 2);
-		FmgrInfo	eqproc;
 		bool	   *hasmatch1;
 		bool	   *hasmatch2;
 		double		nullfrac1 = stats1->stanullfrac;
-		double		matchfreq1,
+		double		matchprodfreq,
+					matchfreq1,
 					uncertainfrac,
 					uncertain;
 		int			i,
@@ -2721,52 +2685,16 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation,
 		 */
 		clamped_nvalues2 = Min(sslot2->nvalues, nd2);
 
-		fmgr_info(opfuncoid, &eqproc);
-
-		/*
-		 * Save a few cycles by setting up the fcinfo struct just once. Using
-		 * FunctionCallInvoke directly also avoids failure if the eqproc
-		 * returns NULL, though really equality functions should never do
-		 * that.
-		 */
-		InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation,
-								 NULL, NULL);
-		fcinfo->args[0].isnull = false;
-		fcinfo->args[1].isnull = false;
-
+		/* Construct the match arrays */
 		hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool));
 		hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool));
 
-		/*
-		 * Note we assume that each MCV will match at most one member of the
-		 * other MCV list.  If the operator isn't really equality, there could
-		 * be multiple matches --- but we don't look for them, both for speed
-		 * and because the math wouldn't add up...
-		 */
-		nmatches = 0;
-		for (i = 0; i < sslot1->nvalues; i++)
-		{
-			int			j;
-
-			fcinfo->args[0].value = sslot1->values[i];
-
-			for (j = 0; j < clamped_nvalues2; j++)
-			{
-				Datum		fresult;
+		eqjoinsel_find_matches(opfuncoid, collation,
+							   sslot1, sslot2,
+							   sslot1->nvalues, clamped_nvalues2,
+							   hasmatch1, hasmatch2,
+							   &nmatches, &matchprodfreq);
 
-				if (hasmatch2[j])
-					continue;
-				fcinfo->args[1].value = sslot2->values[j];
-				fcinfo->isnull = false;
-				fresult = FunctionCallInvoke(fcinfo);
-				if (!fcinfo->isnull && DatumGetBool(fresult))
-				{
-					hasmatch1[i] = hasmatch2[j] = true;
-					nmatches++;
-					break;
-				}
-			}
-		}
 		/* Sum up frequencies of matched MCVs */
 		matchfreq1 = 0.0;
 		for (i = 0; i < sslot1->nvalues; i++)
@@ -2830,6 +2758,85 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation,
 	return selec;
 }
 
+/*
+ * Identify matching MCVs for eqjoinsel_inner or eqjoinsel_semi.
+ *
+ * Inputs:
+ *	opfuncoid: OID of equality function to use (might be cross-type)
+ *	collation: OID of collation to use
+ *	sslot1, sslot2: MCV values[] and numbers[] for the left and right inputs
+ *	nvalues1, nvalues2: number of values to be considered (can be less than
+ *		sslotN->nvalues, but not more)
+ * Outputs:
+ *	hasmatch1[], hasmatch2[]: pre-zeroed arrays of lengths nvalues1, nvalues2;
+ *		entries are set to true if that MCV has a match on the other side
+ *	*p_nmatches: receives number of MCV pairs that match
+ *	*p_matchprodfreq: receives sum(sslot1->numbers[i] * sslot2->numbers[j])
+ *		for matching MCVs; no clamping is applied here
+ *
+ * Note we assume that each MCV will match at most one member of the other
+ * MCV list: we stop at an MCV's first match and skip already-matched ones.
+ * If the operator isn't really equality, there could be multiple matches,
+ * but we don't look for them: it's faster, and the math wouldn't add up.
+ */
+static void
+eqjoinsel_find_matches(Oid opfuncoid, Oid collation,
+					   AttStatsSlot *sslot1, AttStatsSlot *sslot2,
+					   int nvalues1, int nvalues2,
+					   bool *hasmatch1, bool *hasmatch2,
+					   int *p_nmatches, double *p_matchprodfreq)
+{
+	LOCAL_FCINFO(fcinfo, 2);
+	FmgrInfo	eqproc;
+	double		matchprodfreq = 0.0;
+	int			nmatches = 0;
+
+	fmgr_info(opfuncoid, &eqproc);
+
+	/*
+	 * Save a few cycles by setting up the fcinfo struct just once.  Using
+	 * FunctionCallInvoke directly also avoids failure if the eqproc returns
+	 * NULL, though really equality functions should never do that.
+	 */
+	InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation,
+							 NULL, NULL);
+	fcinfo->args[0].isnull = false;
+	fcinfo->args[1].isnull = false;
+
+	/*
+	 * The reason for this extra level of braces will become apparent later.
+	 * For now, it just prevents having to re-indent this chunk of code moved
+	 * from eqjoinsel_inner.
+	 */
+	{
+		for (int i = 0; i < nvalues1; i++)
+		{
+			fcinfo->args[0].value = sslot1->values[i];
+
+			for (int j = 0; j < nvalues2; j++)
+			{
+				Datum		fresult;
+
+				if (hasmatch2[j])
+					continue;
+				fcinfo->args[1].value = sslot2->values[j];
+				fcinfo->isnull = false;	/* reset; it is tested after the call */
+				fresult = FunctionCallInvoke(fcinfo);
+				if (!fcinfo->isnull && DatumGetBool(fresult))
+				{
+					hasmatch1[i] = hasmatch2[j] = true;
+					matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j];
+					nmatches++;
+					break;
+				}
+			}
+		}
+	}
+
+	*p_nmatches = nmatches;
+	*p_matchprodfreq = matchprodfreq;
+}
+
 /*
  *		neqjoinsel		- Join selectivity of "!="
  */
-- 
2.43.7

