From 961e6e1b7da98eab10753aaf117bcf1102d0ebea Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Tue, 18 Feb 2025 15:10:28 +0100
Subject: [PATCH v20250218 2/3] Postpone hashtable growth instead of disabling
 it

After increasing the number of batches and splitting the current one, we
used to disable further growth if all tuples went into only one of the
two new batches. It's possible to construct cases when this leads to
disabling growth prematurely - maybe we can't split the batch now, but
that doesn't mean we could not split it later.

This generally requires underestimated size of the inner relation, so
that we need to increase the number of batches. And then also hash
values that are non-random in some way. There may be a "common" prefix,
or maybe the data is just correlated in some way.

So instead of hard-disabling the growth permanently, double the memory
limit so that we retry the split after processing more data. Doubling
the limit is somewhat arbitrary - it's the earliest when we could split
the batch in half even if all the current tuples have duplicate hashes.
But we could pick any other value, to retry sooner/later.
---
 src/backend/executor/nodeHash.c     | 50 ++++++++++++++++++++++++++++-
 src/backend/utils/misc/guc_tables.c | 11 +++++++
 src/include/executor/hashjoin.h     |  2 ++
 3 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 4748ee845db..df3e945af0e 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -81,6 +81,9 @@ static void ExecParallelHashMergeCounters(HashJoinTable hashtable);
 static void ExecParallelHashCloseBatchAccessors(HashJoinTable hashtable);
 
 
+/* GUC: retry hashtable growth (raise the limit) instead of disabling it */
+bool	enable_hashjoin_growth = false;
+
 /* ----------------------------------------------------------------
  *		ExecHash
  *
@@ -1172,10 +1175,55 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	 * Increasing nbatch will not fix it since there's no way to subdivide the
 	 * group any more finely. We have to just gut it out and hope the server
 	 * has enough RAM.
+	 *
+	 * XXX This logic for hard-disabling nbatch growth assumes that if we're
+	 * unable to split the batch now, we'll be unable to split it forever. That
+	 * works well for "nice" random data sets, but it's not difficult to come
+	 * up with cases where the assumption does not hold.
+	 *
+	 * 1) The hashes may share the same "prefix", but the next bit may be
+	 * random - and doubling the number of batches again would split the batch
+	 * about evenly. This is rather unlikely to happen by chance, of course.
+	 *
+	 * 2) The dataset may be correlated in some way, i.e. produce values with
+	 * one hash value first, before producing other values. This might happen
+	 * for data read from index, etc. If we underestimate the amount of data
+	 * we will need to add to the hash, this may disable nbatch growth while
+	 * still reading the first value, i.e. too soon.
+	 *
+	 * XXX With the "hard disable" this also handles the case when we run out
+	 * of hash bits, because the first doubling after that only has "0" in the
+	 * new bit, which means we get (nfreed == ninmemory) and stop adding more
+	 * batches. But with the soft disable that's no longer the case, and we'd
+	 * try adding more batches after a while. Not sure this is a big deal, as
+	 * memory limit would need to be pretty substantial in that case already
+	 * (assuming we already doubled the limit multiple times). But maybe it
+	 * could be the first time we're hitting this condition? Perhaps the right
+	 * solution is to detect running out of hash bits explicitly, and just do
+	 * a "hard disable" in this case?
+	 *
+	 * XXX This should probably also increase the number of buckets etc.
 	 */
 	if (nfreed == 0 || nfreed == ninmemory)
 	{
-		hashtable->growEnabled = false;
+		/*
+		 * If enable_hashjoin_growth=true, don't disable the growth permanently,
+		 * and instead just increase the limit in the hope that we'll be able
+		 * to split the batches in the future.
+		 *
+		 * XXX Not sure if 2.0 is the optimal growth factor, but it seems quite
+		 * reasonable because it aligns with the expectation that we'll be able
+		 * to split the memory usage in half. It's not perfect, because if we
+		 * add a random bit to the hash, we might split earlier than that.
+		 */
+		if (enable_hashjoin_growth)
+		{
+			/* XXX Do we need to worry about overflowing spaceAllowed? */
+			hashtable->spaceAllowed *= 2.0;
+		}
+		else
+			hashtable->growEnabled = false;
+
 #ifdef HJDEBUG
 		printf("Hashjoin %p: disabling further increase of nbatch\n",
 			   hashtable);
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index cce73314609..ac283f61ab1 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -46,6 +46,7 @@
 #include "commands/vacuum.h"
 #include "common/file_utils.h"
 #include "common/scram-common.h"
+#include "executor/hashjoin.h"
 #include "jit/jit.h"
 #include "libpq/auth.h"
 #include "libpq/libpq.h"
@@ -901,6 +902,16 @@ struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_hashjoin_growth", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables growing the hash table memory limit instead of disabling batch increases permanently."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_hashjoin_growth,
+		false,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_gathermerge", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of gather merge plans."),
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index ecff4842fd3..c73c587b120 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -71,6 +71,8 @@
  * ----------------------------------------------------------------
  */
 
+extern PGDLLIMPORT bool enable_hashjoin_growth;
+
 /* these are in nodes/execnodes.h: */
 /* typedef struct HashJoinTupleData *HashJoinTuple; */
 /* typedef struct HashJoinTableData *HashJoinTable; */
-- 
2.48.1

