From 7a851a93abc8589d6b43b0e8ca4cbbd5030d68c2 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Tue, 3 Mar 2026 15:46:46 -0500
Subject: [PATCH v3 2/2] Fix tuple-counting issues in hash joins.

After the previous patch, inner tuples diverted into a null tuplestore
are not counted in EXPLAIN ANALYZE's output for a Hash node, whereas
they were before.  This seems undesirable, so let's fix it.

A pre-existing buglet that I noticed while analyzing the code's
behavior is that ExecHashRemoveNextSkewBucket fails to decrement
hashtable->skewTuples for tuples moved into the main hash table
from the skew hash table.  This invalidates ExecHashTableInsert's
calculation of the number of main-hash-table tuples, though probably
not by a lot since we expect the skew table to be small relative
to the main one.  Nonetheless, let's fix that too while we're here.
(I also moved the increment of skewTuples into ExecHashSkewTableInsert
to be more consistent with that.)

I thought for a while that there was a second and much worse bug in
ExecHashTableInsert's calculation, namely that it depends on
hashtable->totalTuples which is a running total not the number of
tuples currently present in the hash tables.  That's actually okay
because we only need the number to be accurate while we're still
considering a single batch, but let's restructure the code in
ExecHashTableInsert to make that clearer.

I also renamed "partialTuples" to "reportTuples" in hopes of making
it clearer what that's used for, and improved related comments.

Author: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/3061845.1746486714@sss.pgh.pa.us
---
 src/backend/executor/nodeHash.c | 31 ++++++++++++++++++++++---------
 src/include/executor/hashjoin.h | 13 ++++++++++---
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 392f2cc264c..a12c24eb279 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -114,7 +114,7 @@ MultiExecHash(HashState *node)
 
 	/* must provide our own instrumentation support */
 	if (node->ps.instrument)
-		InstrStopNode(node->ps.instrument, node->hashtable->partialTuples);
+		InstrStopNode(node->ps.instrument, node->hashtable->reportTuples);
 
 	/*
 	 * We do not return the hash table directly because it's not a subtype of
@@ -140,6 +140,7 @@ MultiExecPrivateHash(HashState *node)
 	HashJoinTable hashtable;
 	TupleTableSlot *slot;
 	ExprContext *econtext;
+	double		nullTuples = 0;
 
 	/*
 	 * get state info from node
@@ -187,7 +188,6 @@ MultiExecPrivateHash(HashState *node)
 				/* It's a skew tuple, so put it into that hash table */
 				ExecHashSkewTableInsert(hashtable, slot, hashvalue,
 										bucketNumber);
-				hashtable->skewTuples += 1;
 			}
 			else
 			{
@@ -202,6 +202,7 @@ MultiExecPrivateHash(HashState *node)
 			if (node->null_tuple_store == NULL)
 				node->null_tuple_store = ExecHashBuildNullTupleStore(hashtable);
 			tuplestore_puttupleslot(node->null_tuple_store, slot);
+			nullTuples += 1;
 		}
 		/* else we can discard the tuple immediately */
 	}
@@ -215,7 +216,8 @@ MultiExecPrivateHash(HashState *node)
 	if (hashtable->spaceUsed > hashtable->spacePeak)
 		hashtable->spacePeak = hashtable->spaceUsed;
 
-	hashtable->partialTuples = hashtable->totalTuples;
+	/* Report total number of tuples output (but not those discarded) */
+	hashtable->reportTuples = hashtable->totalTuples + nullTuples;
 }
 
 /* ----------------------------------------------------------------
@@ -310,7 +312,7 @@ MultiExecParallelHash(HashState *node)
 				{
 					/* normal case with a non-null join key */
 					ExecParallelHashTableInsert(hashtable, slot, hashvalue);
-					hashtable->partialTuples++;
+					hashtable->reportTuples++;
 				}
 				else if (node->keep_null_tuples)
 				{
@@ -318,6 +320,7 @@ MultiExecParallelHash(HashState *node)
 					if (node->null_tuple_store == NULL)
 						node->null_tuple_store = ExecHashBuildNullTupleStore(hashtable);
 					tuplestore_puttupleslot(node->null_tuple_store, slot);
+					hashtable->reportTuples++;
 				}
 				/* else we can discard the tuple immediately */
 			}
@@ -358,7 +361,9 @@ MultiExecParallelHash(HashState *node)
 
 	/*
 	 * We're not yet attached to a batch.  We all agree on the dimensions and
-	 * number of inner tuples (for the empty table optimization).
+	 * number of inner tuples.  (In parallel mode, totalTuples isn't used in
+	 * this module, but we must report it for nodeHashjoin.c's empty-table
+	 * optimization.)
 	 */
 	hashtable->curbatch = -1;
 	hashtable->nbuckets = pstate->nbuckets;
@@ -526,7 +531,7 @@ ExecHashTableCreate(HashState *state)
 	hashtable->nbatch_outstart = nbatch;
 	hashtable->growEnabled = true;
 	hashtable->totalTuples = 0;
-	hashtable->partialTuples = 0;
+	hashtable->reportTuples = 0;
 	hashtable->skewTuples = 0;
 	hashtable->innerBatchFile = NULL;
 	hashtable->outerBatchFile = NULL;
@@ -1786,7 +1791,6 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		 */
 		HashJoinTuple hashTuple;
 		int			hashTupleSize;
-		double		ntuples = (hashtable->totalTuples - hashtable->skewTuples);
 
 		/* Create the HashJoinTuple */
 		hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
@@ -1810,10 +1814,12 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		/*
 		 * Increase the (optimal) number of buckets if we just exceeded the
 		 * NTUP_PER_BUCKET threshold, but only when there's still a single
-		 * batch.
+		 * batch.  Note that totalTuples - skewTuples is a reliable indicator
+		 * of the hash table's size only as long as there's just one batch.
 		 */
 		if (hashtable->nbatch == 1 &&
-			ntuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET))
+			(hashtable->totalTuples - hashtable->skewTuples) >
+			(hashtable->nbuckets_optimal * NTUP_PER_BUCKET))
 		{
 			/* Guard against integer overflow and alloc size overflow */
 			if (hashtable->nbuckets_optimal <= INT_MAX / 2 &&
@@ -2640,6 +2646,7 @@ ExecHashSkewTableInsert(HashJoinTable hashtable,
 	Assert(hashTuple != hashTuple->next.unshared);
 
 	/* Account for space used, and back off if we've used too much */
+	hashtable->skewTuples += 1;
 	hashtable->spaceUsed += hashTupleSize;
 	hashtable->spaceUsedSkew += hashTupleSize;
 	if (hashtable->spaceUsed > hashtable->spacePeak)
@@ -2732,6 +2739,12 @@ ExecHashRemoveNextSkewBucket(HashJoinTable hashtable)
 			hashtable->spaceUsedSkew -= tupleSize;
 		}
 
+		/*
+		 * We must reduce skewTuples, but totalTuples doesn't change since it
+		 * counts both main and skew tuples.
+		 */
+		hashtable->skewTuples -= 1;
+
 		hashTuple = nextHashTuple;
 
 		/* allow this loop to be cancellable */
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 7c76e15a4a8..4d63e89abd3 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -336,9 +336,16 @@ typedef struct HashJoinTableData
 
 	bool		growEnabled;	/* flag to shut off nbatch increases */
 
-	double		totalTuples;	/* # tuples obtained from inner plan */
-	double		partialTuples;	/* # tuples obtained from inner plan by me */
-	double		skewTuples;		/* # tuples inserted into skew tuples */
+	/*
+	 * totalTuples is the running total of tuples inserted into either the
+	 * main or skew hash tables.  reportTuples is the number of tuples that we
+	 * want EXPLAIN to show as output from the Hash node (this includes saved
+	 * null-keyed tuples as well as those inserted into the hash tables).
+	 * skewTuples is the number of tuples present in the skew hash table.
+	 */
+	double		totalTuples;
+	double		reportTuples;
+	double		skewTuples;
 
 	/*
 	 * These arrays are allocated for the life of the hash join, but only if
-- 
2.43.7

