From a5d2068e698d3d2e8f5168fa8b29e074d01f08bc Mon Sep 17 00:00:00 2001
From: test <test>
Date: Sun, 31 May 2026 10:20:50 +0200
Subject: [PATCH v2 1/2] Using Bloom filters for serial hash joins

Builds a Bloom filter on the inner side (on the hash values used for the
regular hash table), and probes it before lookups in the main hash
table. The expectation is that probing the filter is cheaper than hash
table lookup, and much cheaper than spilling the tuple to temporary
files (with batched joins). If a significant fraction of outer tuples
can be skipped based on the probe, it makes the join cheaper.

The patch is limited to serial (non-parallel) joins, within the scope
of a single join node (no pushdown of the Bloom filter).

The feature is gated behind a new GUC enable_hashjoin_bloom (=on).

It's possible the filter does not reject enough tuples to outweigh the
build/probe costs. To mitigate this risk, the patch implements two
adaptive behaviors based on lookup and probe statistics, driving the
filter build and probing.

The filter is built when:

* The join is using batching (nbatch>1). We expect the spilling to be
  expensive enough to justify the cost to build the filter, even if only
  a very small fraction of tuples gets eliminated.

* For single-batch joins (nbatch=1) the filter gets built based on
  lookup match rate. The filter is built if the rate drops below 90%,
  i.e. if at least 10% of outer tuples can get eliminated.

Furthermore, the probing is driven by similar statistics. If less than
10% of probes reject the tuple, the filter is considered ineffective,
and is temporarily disabled. It's probed only for 1% of the tuples,
until the reject fraction increases above 20%.

See comments in nodeHash.c and nodeHashjoin.c for more details.

The patch also adds a number of relevant stats to EXPLAIN (ANALYZE),
some of which require VERBOSE.
---
 src/backend/commands/explain.c                |  97 +++++
 src/backend/executor/nodeHash.c               | 390 ++++++++++++++++++
 src/backend/executor/nodeHashjoin.c           |  30 ++
 src/backend/lib/bloomfilter.c                 |  68 ++-
 src/backend/optimizer/path/costsize.c         |   1 +
 src/backend/utils/misc/guc_parameters.dat     |   7 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/executor/hashjoin.h               |  39 ++
 src/include/executor/instrument_node.h        |  12 +
 src/include/executor/nodeHash.h               |   5 +
 src/include/lib/bloomfilter.h                 |   5 +
 src/include/optimizer/cost.h                  |   1 +
 src/test/regress/expected/join_hash_bloom.out | 178 ++++++++
 src/test/regress/expected/sysviews.out        |   3 +-
 src/test/regress/parallel_schedule            |   2 +-
 src/test/regress/sql/join_hash_bloom.sql      |  56 +++
 16 files changed, 889 insertions(+), 6 deletions(-)
 create mode 100644 src/test/regress/expected/join_hash_bloom.out
 create mode 100644 src/test/regress/sql/join_hash_bloom.sql

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 112c17b0d64..1b3a3579df9 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3474,6 +3474,103 @@ show_hash_info(HashState *hashstate, ExplainState *es)
 							 spacePeakKb);
 		}
 	}
+
+	/*
+	 * Hash table runtime statistics - number of hash table lookups and matches.
+	 * This does not include tuples rejected by a Bloom filter (if there's one).
+	 */
+	if (es->analyze && es->verbose)
+	{
+		double	match_rate = 0.0;
+
+		/* fraction of lookups with a match */
+		if (hinstrument.hash_nmatches > 0)
+			match_rate = (double) hinstrument.hash_nmatches /
+				hinstrument.hash_nlookups;
+
+		if (es->format != EXPLAIN_FORMAT_TEXT)
+		{
+			ExplainPropertyInteger("Hash Lookups", NULL,
+								   hinstrument.hash_nlookups, es);
+			ExplainPropertyInteger("Hash Matches", NULL,
+								   hinstrument.hash_nmatches, es);
+			ExplainPropertyFloat("Hash Match Rate", NULL,
+								 (100.0 * match_rate), 3, es);
+		}
+		else
+		{
+			ExplainIndentText(es);
+			appendStringInfo(es->str,
+							 "Hash Lookups: " INT64_FORMAT "  Matches: " INT64_FORMAT "  Match Rate: %.3f%%\n",
+							 hinstrument.hash_nlookups,
+							 hinstrument.hash_nmatches,
+							 (100.0 * match_rate));
+		}
+	}
+
+	/*
+	 * Bloom filter statistics - similarly to hash tables we report number of
+	 * probes and number of matches, but we also report some basic properties
+	 * of the Bloom filter (size, number of hash functions and the estimated
+	 * false positive rate). The false positive rate is estimated from how
+	 * many bits are set in the filter at the end, not the rate the filter was
+	 * originally sized for.
+	 *
+	 * XXX This only really matters under EXPLAIN ANALYZE, probably. In most
+	 * cases we only decide to build the filter during execution (except for
+	 * the case when we know the hash join needs to be batched)?
+	 */
+	if (hinstrument.bloom_used)
+	{
+		uint64		bloomSizeKb = BYTES_TO_KILOBYTES(hinstrument.bloom_nbytes);
+		double		match_fraction = 0.0;
+
+		/* fraction of probes matching the filter */
+		if (hinstrument.bloom_nprobes > 0)
+			match_fraction = (double) hinstrument.bloom_nmatches /
+				hinstrument.bloom_nprobes;
+
+		if (es->format != EXPLAIN_FORMAT_TEXT)
+		{
+			ExplainOpenGroup("Bloom Filter", "Bloom Filter", true, es);
+			ExplainPropertyUInteger("Filter Size", "kB", bloomSizeKb, es);
+			ExplainPropertyInteger("Hash Functions", NULL,
+								   hinstrument.bloom_nhashfuncs, es);
+			ExplainPropertyFloat("False Positive Rate", NULL,
+								 100.0 * hinstrument.bloom_false_positive_rate, 3, es);
+
+			if (es->analyze)
+			{
+				ExplainPropertyInteger("Probes", NULL,
+									   hinstrument.bloom_nprobes, es);
+				ExplainPropertyInteger("Matches", NULL,
+									   hinstrument.bloom_nmatches, es);
+				ExplainPropertyFloat("Match Rate", NULL,
+									 (100.0 * match_fraction), 3, es);
+			}
+
+			ExplainCloseGroup("Bloom Filter", "Bloom Filter", true, es);
+		}
+		else
+		{
+			ExplainIndentText(es);
+			appendStringInfo(es->str,
+							 "Bloom Filter: Size: " UINT64_FORMAT "kB  Hash Functions: %d  False Positive Rate: %.3f%%\n",
+							 bloomSizeKb,
+							 hinstrument.bloom_nhashfuncs,
+							 100.0 * hinstrument.bloom_false_positive_rate);
+
+			if (es->analyze)
+			{
+				ExplainIndentText(es);
+				appendStringInfo(es->str,
+								 "Bloom Filter Probes: " INT64_FORMAT "  Matches: " INT64_FORMAT "  Match Rate: %.3f%%\n",
+								 hinstrument.bloom_nprobes,
+								 hinstrument.bloom_nmatches,
+								 (100.0 * match_fraction));
+			}
+		}
+	}
 }
 
 /*
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 8825bb6fa23..442beee7b70 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -35,7 +35,9 @@
 #include "executor/instrument.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
+#include "lib/bloomfilter.h"
 #include "miscadmin.h"
+#include "optimizer/cost.h"
 #include "port/pg_bitutils.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
@@ -81,6 +83,95 @@ static bool ExecParallelHashTuplePrealloc(HashJoinTable hashtable,
 static void ExecParallelHashMergeCounters(HashJoinTable hashtable);
 static void ExecParallelHashCloseBatchAccessors(HashJoinTable hashtable);
 
+/*
+ * Bloom filters
+ *
+ * A hashjoin may benefit from a Bloom filter on the inner side, allowing it to
+ * reject some of the outer tuples without having to perform a full hash table
+ * lookup, and/or spilling them to disk (for batched joins).
+ *
+ * Probing a filter is significantly cheaper than a hash table lookup (by 1-2
+ * orders of magnitude), and even cheaper than spilling it to disk. If a join
+ * is selective, a significant fraction of the outer tuples can be rejected
+ * after probing the filter. If a join is not selective, and finds a match for
+ * (almost) all outer tuples, there are no benefits of the Bloom filter.
+ *
+ * To make regressions less likely, we employ two adaptive strategies during
+ * building and probing, to limit the impact in case the join is not selective
+ * enough for the filter to pay for itself.
+ *
+ *
+ * 1) adaptive build
+ *
+ * goal: Build filters only when there's a good chance the filter will pay for
+ * itself, i.e. that it will eliminate enough lookups and/or tuples spilled to
+ * disk with (nbatch>1).
+ *
+ * If we expect the hash table to fit into memory (i.e. nbatch=1), we don't
+ * build the filter right away. Instead, we build just the hash table, and
+ * start executing the join as usual. After 1000 lookups (BLOOM_BUILD_WINDOW)
+ * we check how selective the join is, i.e. how many lookups found a match. If
+ * the fraction is below 90% (BLOOM_BUILD_THRESHOLD), we expect the filter to
+ * be worth it, and build it on the tuples in the hash table. We repeat this
+ * check every BLOOM_BUILD_WINDOW lookups, in case the data set is not uniform.
+ *
+ * If we expect the hash table to not fit into memory (i.e. nbatch>1), or if
+ * we find this while building the hash table, we start building the filter
+ * immediately. We can't delay the decision, because once we spill some tuples
+ * to disk, we won't be able to build a valid filter. We also expect the
+ * spilling to be expensive enough to "hide" the overhead, and if we can
+ * eliminate at least some outer tuples before spilling them to disk, it's
+ * likely a win overall.
+ *
+ *
+ * 2) adaptive probing
+ *
+ * goal: Stop probing filters that turn out to not be selective, and start
+ * probing them if that changes during the join. There's no point in probing
+ * a useless filter. But also we've already paid the price for building it,
+ * so if there's a chance it'll be useful, no harm to check again.
+ *
+ * To evaluate the efficiency of a filter, we track the number of matches
+ * for every 1000 probes (BLOOM_PROBE_WINDOW). If more than 90% probes have
+ * a possible match (and thus proceed to perform a hash table lookup), the
+ * filter is considered not effective.
+ *
+ * Instead of just disabling the filter entirely, we start sampling only a
+ * fraction (1%, per BLOOM_PROBE_SAMPLE_RATE) of the probes. Only those
+ * probes are evaluated using the filter, the remaining 99% go directly to
+ * the hash table lookup (as if the filter did not reject them). After
+ * about 100k values, we should have another "window" and we recheck the
+ * efficiency of the filter. If the fraction of matches is lower than 80%
+ * (per BLOOM_PROBE_THRESHOLD_LOW), we enable the filter again.
+ *
+ * This way we can enable/disable the filter for different parts of the
+ * data set, in case the distribution is not uniform in some way.
+ *
+ * XXX The gap between 80% and 90% is intentional. It adds hysteresis, so
+ * that the heuristics does not "flap" for datasets that oscillate right
+ * around ~90% matches.
+ *
+ * XXX Maybe 1000 and 1% is a bit too much, because we'll recheck after 100k
+ * lookups. Which seems like a lot, maybe we should recheck more often?
+ * Idea: Double the distance, i.e. cut the sample rate in half. We start
+ * with 1, so 100% is sampled. If disable, double sample to 2, so 50% is
+ * sampled, and the distance is 2. Then 4, 8, 16, 32, .... up to some upper
+ * limit (64k?). A change drops it to 1 again.
+ */
+
+/* minimum filter size, in bytes */
+#define BLOOM_MIN_FILTER_SIZE	(8 * 1024)
+
+/* adaptive filter build */
+#define BLOOM_BUILD_WINDOW		1000
+#define BLOOM_BUILD_THRESHOLD	0.9
+
+/* adaptive filter probing */
+#define BLOOM_PROBE_WINDOW				1000
+#define BLOOM_PROBE_THRESHOLD_HIGH		0.9
+#define BLOOM_PROBE_THRESHOLD_LOW		0.8
+#define BLOOM_PROBE_SAMPLE_RATE			100
+
 
 /* ----------------------------------------------------------------
  *		ExecHash
@@ -184,6 +275,12 @@ MultiExecPrivateHash(HashState *node)
 			uint32		hashvalue = DatumGetUInt32(hashdatum);
 			int			bucketNumber;
 
+			/* If a Bloom filter is already in use, record the hash in it. */
+			if (hashtable->bloomFilter != NULL)
+				bloom_add_element(hashtable->bloomFilter,
+								  (unsigned char *) &hashvalue,
+								  sizeof(uint32));
+
 			bucketNumber = ExecHashGetSkewBucket(hashtable, hashvalue);
 			if (bucketNumber != INVALID_SKEW_BUCKET_NO)
 			{
@@ -535,6 +632,16 @@ ExecHashTableCreate(HashState *state)
 	hashtable->totalTuples = 0;
 	hashtable->reportTuples = 0;
 	hashtable->skewTuples = 0;
+	hashtable->bloomFilter = NULL;
+	hashtable->bloomElements = rows;
+	hashtable->bloomSampling = false;
+	hashtable->bloomSampleCounter = 0;
+	hashtable->bloomSampleMatches = 0;
+	hashtable->bloomSampleProbes = 0;
+	hashtable->bloomProbes = 0;
+	hashtable->bloomMatches = 0;
+	hashtable->hashLookups = 0;
+	hashtable->hashMatches = 0;
 	hashtable->innerBatchFile = NULL;
 	hashtable->outerBatchFile = NULL;
 	hashtable->spaceUsed = 0;
@@ -663,11 +770,260 @@ ExecHashTableCreate(HashState *state)
 			ExecHashBuildSkewHash(state, hashtable, node, num_skew_mcvs);
 
 		MemoryContextSwitchTo(oldcxt);
+
+		/*
+		 * If we already expect to need more than one batch, start building a
+		 * Bloom filter right away so that it ends up containing every inner
+		 * tuple. (For nbatch=1 we start without a filter and may build one
+		 * later, either when we are forced to start batching or adaptively
+		 * while probing.)
+		 */
+		if (nbatch > 1)
+			ExecHashCreateBloomFilter(hashtable);
 	}
 
 	return hashtable;
 }
 
+/*
+ * ExecHashCreateBloomFilter
+ *		Allocate an empty Bloom filter for the inner-side hash table.
+ *
+ * The filter covers the hashes of the inner join keys. It is allocated in
+ * hashCxt, like the hash table itself, so that it survives between batches.
+ * If enable_hashjoin_bloom is off, nothing is allocated and bloomFilter
+ * stays NULL, so callers have to cope with that.
+ *
+ * The new filter is empty. When it is created after some tuples were already
+ * added to the hash table, their hashes have to be added separately, see
+ * ExecHashBuildBloomFilter.
+ *
+ * XXX Could the filter be destroyed after the first batch? By then all outer
+ * tuples went through the probe. Or is it still needed for rescans?
+ */
+void
+ExecHashCreateBloomFilter(HashJoinTable hashtable)
+{
+	MemoryContext prevcxt;
+	int64		expected;
+
+	Assert(hashtable->parallel_state == NULL);
+	Assert(hashtable->bloomFilter == NULL);
+
+	/* the GUC switches the feature off entirely */
+	if (!enable_hashjoin_bloom)
+		return;
+
+	/*
+	 * Size the filter for the number of inner tuples we expect: the larger of
+	 * the planner estimate and the tuples loaded so far, but at least 1000.
+	 * The Bloom filter code copes well with the estimate being somewhat off.
+	 *
+	 * XXX If the hash table is already complete we have seen every tuple, so
+	 * there is no need to look at bloomElements in that case.
+	 *
+	 * XXX Maybe use a multiple of the estimate while still reading the inner
+	 * relation, to be more robust against poor estimates? Once all tuples
+	 * were seen, the filter can be sized exactly.
+	 *
+	 * XXX Also consider what to do if the filter does not fit into the memory
+	 * budget. A worse false positive rate may still be acceptable, as long as
+	 * the final match rate is low enough.
+	 */
+	expected = (int64) Max(hashtable->bloomElements, hashtable->totalTuples);
+	expected = Max(expected, 1000);
+
+	prevcxt = MemoryContextSwitchTo(hashtable->hashCxt);
+	hashtable->bloomFilter = bloom_create_custom(expected, work_mem,
+												 BLOOM_MIN_FILTER_SIZE, 0);
+	MemoryContextSwitchTo(prevcxt);
+}
+
+/*
+ * ExecHashBuildBloomFilter
+ *		Creates a Bloom filter, and populates it with current hashes.
+ *
+ * Creates an empty filter and seeds it with the hashes of tuples already in
+ * the hash table (main and skew). Tuples loaded later are added as they are
+ * inserted. Does nothing if no filter gets created (enable_hashjoin_bloom off).
+ */
+void
+ExecHashBuildBloomFilter(HashJoinTable hashtable)
+{
+	/* create an empty filter; bail out if none was created (GUC is off) */
+	ExecHashCreateBloomFilter(hashtable);
+	if (hashtable->bloomFilter == NULL)
+		return;
+
+	/* add tuples already stored in the main hash table */
+	for (HashMemoryChunk chunk = hashtable->chunks;
+		 chunk != NULL;
+		 chunk = chunk->next.unshared)
+	{
+		size_t		idx = 0;
+
+		while (idx < chunk->used)
+		{
+			HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx);
+
+			bloom_add_element(hashtable->bloomFilter,
+							  (unsigned char *) &hashTuple->hashvalue,
+							  sizeof(uint32));
+			idx += MAXALIGN(HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(hashTuple)->t_len);
+		}
+	}
+
+	/* add tuples already stored in the skew hash table */
+	if (hashtable->skewEnabled)
+	{
+		for (int i = 0; i < hashtable->nSkewBuckets; i++)
+		{
+			int			j = hashtable->skewBucketNums[i];
+			HashJoinTuple skewTuple = hashtable->skewBucket[j]->tuples;
+
+			while (skewTuple != NULL)
+			{
+				bloom_add_element(hashtable->bloomFilter,
+								  (unsigned char *) &skewTuple->hashvalue,
+								  sizeof(uint32));
+				skewTuple = skewTuple->next.unshared;
+			}
+		}
+	}
+}
+
+/*
+ * ExecHashBloomReject
+ *		Can the outer tuple with this hash value be skipped?
+ *
+ * Returns true only if the Bloom filter was actually probed and it reported
+ * that the hash value is not present, so no inner tuple can match the outer
+ * tuple. Returns false if there is no filter, if we are past the first batch,
+ * or if the probe was skipped by sampling (the tuple then proceeds to the
+ * regular hash table lookup).
+ */
+bool
+ExecHashBloomReject(HashJoinTable hashtable, uint32 hashvalue)
+{
+	bool		rejected;
+
+	/*
+	 * No filter means every tuple passes. The same holds after the first
+	 * batch: tuples spilled to temporary files already went through the
+	 * check.
+	 */
+	if (hashtable->bloomFilter == NULL || hashtable->curbatch != 0)
+		return false;
+
+	/*
+	 * While sampling, only one tuple in BLOOM_PROBE_SAMPLE_RATE probes the
+	 * filter; the others go straight to the hash table. The counter is
+	 * advanced only while sampling is active.
+	 */
+	if (hashtable->bloomSampling)
+	{
+		uint64		seqno = hashtable->bloomSampleCounter++;
+
+		if ((seqno % BLOOM_PROBE_SAMPLE_RATE) != 0)
+			return false;
+	}
+
+	/* probe the filter and account for the result */
+	hashtable->bloomProbes++;
+	rejected = bloom_lacks_element(hashtable->bloomFilter,
+								   (unsigned char *) &hashvalue,
+								   sizeof(uint32));
+
+	if (rejected)
+		hashtable->bloomRejects++;
+	else
+		hashtable->bloomMatches++;
+
+	/* feed the result into the current window, maybe toggling sampling */
+	ExecHashBloomSamplingUpdate(hashtable, !rejected);
+
+	return rejected;
+}
+
+/*
+ * ExecHashBloomSamplingUpdate
+ *		Record the outcome of a filter probe and adjust the filter behavior.
+ *
+ * "match" is true if the probe found a possible match, i.e. the filter did
+ * not reject the hash value. Results are collected in consecutive windows of
+ * BLOOM_PROBE_WINDOW probes. At the end of a window, a match fraction above
+ * BLOOM_PROBE_THRESHOLD_HIGH enables sampling, one below
+ * BLOOM_PROBE_THRESHOLD_LOW disables it; the window counters are then reset.
+ */
+void
+ExecHashBloomSamplingUpdate(HashJoinTable hashtable, bool match)
+{
+	double		matched;
+
+	/* account for this probe in the current window */
+	hashtable->bloomSampleProbes += 1;
+	hashtable->bloomSampleMatches += (match ? 1 : 0);
+
+	/* nothing to decide until the window is full */
+	if (hashtable->bloomSampleProbes < BLOOM_PROBE_WINDOW)
+		return;
+
+	/* fraction of probes in the window with a (possible) match */
+	matched = (double) hashtable->bloomSampleMatches / hashtable->bloomSampleProbes;
+
+	/*
+	 * Too many matches: only sample the filter. Few enough: probe every
+	 * tuple again. Between the thresholds the current mode is kept.
+	 */
+	if (matched > BLOOM_PROBE_THRESHOLD_HIGH)
+		hashtable->bloomSampling = true;
+	else if (matched < BLOOM_PROBE_THRESHOLD_LOW)
+		hashtable->bloomSampling = false;
+
+	/* start a new window */
+	hashtable->bloomSampleCounter = 0;
+	hashtable->bloomSampleMatches = 0;
+	hashtable->bloomSampleProbes = 0;
+}
+
+/*
+ * ExecHashBloomAccountLookup
+ *		Count a hash table match, and maybe build the Bloom filter.
+ */
+void
+ExecHashBloomAccountLookup(HashJoinTable hashtable)
+{
+	bool		window_done;
+	bool		selective;
+
+	hashtable->hashMatches++;
+
+	/*
+	 * No adaptive build if Bloom filters are disabled, if the filter already
+	 * exists, or for parallel hash joins (which can't use the filter).
+	 */
+	if (!enable_hashjoin_bloom ||
+		hashtable->bloomFilter != NULL ||
+		hashtable->parallel_state != NULL)
+		return;
+
+	/* All serial batched runs should have a filter created automatically. */
+	Assert(hashtable->nbatch == 1);
+
+	/*
+	 * Build the filter if, at the end of a window of lookups, the fraction of
+	 * lookups that found a match is below BLOOM_BUILD_THRESHOLD. The counters
+	 * cover the whole history of lookups, they are not reset per window.
+	 *
+	 * XXX Maybe we should reset the counters, just like for filter probes?
+	 * Not sure if one of these is a "more right".
+	 */
+	window_done = ((hashtable->hashLookups % BLOOM_BUILD_WINDOW) == 0);
+	selective = (hashtable->hashMatches <
+				 hashtable->hashLookups * BLOOM_BUILD_THRESHOLD);
+	if (window_done && selective)
+		ExecHashBuildBloomFilter(hashtable);
+}
 
 /*
  * Compute appropriate size for hashtable given the estimated size of the
@@ -1103,6 +1459,15 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 
 	hashtable->nbatch = nbatch;
 
+	/*
+	 * Build the Bloom filter, if we're switching from a single batch to multiple
+	 * batches, so that it contains all inner tuples loaded so far. Remaining
+	 * tuples will be added as they are loaded from the inner plan, so the filter
+	 * will cover all batches.
+	 */
+	if (oldnbatch == 1)
+		ExecHashBuildBloomFilter(hashtable);
+
 	/*
 	 * Scan through the existing hash table entries and dump out any that are
 	 * no longer of the current batch.
@@ -2945,6 +3310,31 @@ ExecHashAccumInstrumentation(HashInstrumentation *instrument,
 									  hashtable->nbatch_original);
 	instrument->space_peak = Max(instrument->space_peak,
 								 hashtable->spacePeak);
+
+	/*
+	 * Record Bloom filter information, if a filter was built.
+	 *
+	 * XXX Shouldn't this use Max(), just like the block above?
+	 */
+	if (hashtable->bloomFilter != NULL)
+	{
+		instrument->bloom_used = true;
+		instrument->bloom_nhashfuncs =
+			bloom_num_hash_funcs(hashtable->bloomFilter);
+		instrument->bloom_nbytes = bloom_total_bits(hashtable->bloomFilter) / BITS_PER_BYTE;
+		instrument->bloom_false_positive_rate =
+			bloom_false_positive_rate(hashtable->bloomFilter);
+		instrument->bloom_nprobes = hashtable->bloomProbes;
+		instrument->bloom_nmatches = hashtable->bloomMatches;
+	}
+
+	/*
+	 * Record hash-table probe statistics.
+	 *
+	 * XXX Shouldn't this use Max(), just like the earlier block?
+	 */
+	instrument->hash_nlookups = hashtable->hashLookups;
+	instrument->hash_nmatches = hashtable->hashMatches;
 }
 
 /*
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 0b365d5b475..db14cf98f9b 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -170,6 +170,7 @@
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
 #include "miscadmin.h"
+#include "optimizer/cost.h"
 #include "utils/lsyscache.h"
 #include "utils/sharedtuplestore.h"
 #include "utils/tuplestore.h"
@@ -500,6 +501,20 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
 																 hashvalue);
 				node->hj_CurTuple = NULL;
 
+				/*
+				 * Consult the inner-relation Bloom filter, if any, before
+				 * probing the hash table. A negative answer means this outer
+				 * tuple cannot match in any batch: we can skip both the
+				 * hash-table lookup and any spilling to a later batch. Jumping to
+				 * HJ_FILL_OUTER_TUPLE emits a null-extended row for outer joins
+				 * and simply discards the tuple otherwise.
+				 */
+				if (!parallel && ExecHashBloomReject(hashtable, hashvalue))
+				{
+					node->hj_JoinState = HJ_FILL_OUTER_TUPLE;
+					continue;
+				}
+
 				/*
 				 * The tuple might not belong to the current batch (where
 				 * "current batch" includes the skew buckets if any).
@@ -531,6 +546,9 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
 				/* OK, let's scan the bucket for matches */
 				node->hj_JoinState = HJ_SCAN_BUCKET;
 
+				/* Count this as a lookup in the hash table. */
+				hashtable->hashLookups++;
+
 				pg_fallthrough;
 
 			case HJ_SCAN_BUCKET:
@@ -565,6 +583,18 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
 					HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)))
 					continue;
 
+				/*
+				 * Count the first match found for this outer tuple (may create
+				 * the Bloom filter, if sufficiently few matches).
+				 *
+				 * If an outer tuple has multiple matching inner tuples, we want
+				 * it to count as a single match, so that it's comparable to
+				 * counters for the Bloom filter (which also counts each outer
+				 * as a single probe).
+				 */
+				if (!node->hj_MatchedOuter)
+					ExecHashBloomAccountLookup(hashtable);
+
 				/*
 				 * We've got a match, but still need to test non-hashed quals.
 				 * ExecScanHashBucket already set up all the state needed to
diff --git a/src/backend/lib/bloomfilter.c b/src/backend/lib/bloomfilter.c
index 73b3768a172..bb04aa600e8 100644
--- a/src/backend/lib/bloomfilter.c
+++ b/src/backend/lib/bloomfilter.c
@@ -39,6 +39,13 @@
 #include "lib/bloomfilter.h"
 #include "port/pg_bitutils.h"
 
+/*
+ * Default minimum size of the bitset, in bytes. bloom_create() won't create
+ * a bitset smaller than this, even when the caller's total_elems estimate would
+ * suggest a smaller one.
+ */
+#define DEFAULT_MIN_BITSET_BYTES	(1024 * 1024)
+
 #define MAX_HASH_FUNCS		10
 
 struct bloom_filter
@@ -74,17 +81,26 @@ static inline uint32 mod_m(uint32 val, uint64 m);
  * bits, and the largest possible bitset is 512MB (2^32 bits).  The
  * implementation allocates only enough memory to target its standard false
  * positive rate, using a simple formula with caller's total_elems estimate as
- * an input.  The bitset might be as small as 1MB, even when bloom_work_mem is
- * much higher.
+ * an input.  The bitset might be as small as min_bitset_bytes, even when
+ * bloom_work_mem is much higher.
  *
  * The Bloom filter is seeded using a value provided by the caller.  Using a
  * distinct seed value on every call makes it unlikely that the same false
  * positives will reoccur when the same set is fingerprinted a second time.
  * Callers that don't care about this pass a constant as their seed, typically
  * 0.  Callers can also use a pseudo-random seed, eg from pg_prng_uint64().
+ *
+ * min_bitset_bytes is the minimum bitset size. The bitset might be as small
+ * as 1KiB, even when bloom_work_mem is much higher. This is useful for callers
+ * that want to allow filters smaller than the default DEFAULT_MIN_BITSET_BYTES
+ * (1MB), for example when fingerprinting small sets where the 1MB minimum
+ * would waste memory and would not fit into CPU caches. The bitset is still
+ * sized as a power of two number of bits, and is never smaller than this
+ * minimum (subject to that rounding).
  */
 bloom_filter *
-bloom_create(int64 total_elems, int bloom_work_mem, uint64 seed)
+bloom_create_custom(int64 total_elems, int bloom_work_mem,
+					uint64 min_bitset_bytes, uint64 seed)
 {
 	bloom_filter *filter;
 	int			bloom_power;
@@ -99,7 +115,7 @@ bloom_create(int64 total_elems, int bloom_work_mem, uint64 seed)
 	 * false positive rate still won't exceed 2% in almost all cases.
 	 */
 	bitset_bytes = Min(bloom_work_mem * UINT64CONST(1024), total_elems * 2);
-	bitset_bytes = Max(1024 * 1024, bitset_bytes);
+	bitset_bytes = Max(min_bitset_bytes, bitset_bytes);
 
 	/*
 	 * Size in bits should be the highest power of two <= target.  bitset_bits
@@ -119,6 +135,17 @@ bloom_create(int64 total_elems, int bloom_work_mem, uint64 seed)
 	return filter;
 }
 
+/*
+ * Create Bloom filter in caller's memory context: bloom_create_custom() with
+ * the minimum bitset size set to DEFAULT_MIN_BITSET_BYTES (i.e. 1MB).
+ */
+bloom_filter *
+bloom_create(int64 total_elems, int bloom_work_mem, uint64 seed)
+{
+	return bloom_create_custom(total_elems, bloom_work_mem,
+							   DEFAULT_MIN_BITSET_BYTES, seed);
+}
+
 /*
  * Free Bloom filter
  */
@@ -192,6 +219,39 @@ bloom_prop_bits_set(bloom_filter *filter)
 	return bits_set / (double) filter->m;
 }
 
+/* Report how many hash functions (k) this Bloom filter uses. */
+int
+bloom_num_hash_funcs(bloom_filter *filter)
+{
+	int			nfuncs = filter->k_hash_funcs;
+
+	return nfuncs;
+}
+
+/* Report the size of the Bloom filter's bitset (m), in bits. */
+uint64
+bloom_total_bits(bloom_filter *filter)
+{
+	uint64		nbits = filter->m;
+
+	return nbits;
+}
+
+/*
+ * Estimate the current false positive rate of the Bloom filter.
+ *
+ * With k hash functions and a proportion p of bits set, probing an element
+ * that was never added reports "possibly present" with probability ~p^k.
+ * This uses the actual bitset contents, not the rate targeted at creation.
+ */
+double
+bloom_false_positive_rate(bloom_filter *filter)
+{
+	double		prop_set = bloom_prop_bits_set(filter);
+
+	return pow(prop_set, filter->k_hash_funcs);
+}
+
 /*
  * Which element in the sequence of powers of two is less than or equal to
  * target_bitset_bits?
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 1c575e56ff6..c3072a29ccc 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -156,6 +156,7 @@ bool		enable_material = true;
 bool		enable_memoize = true;
 bool		enable_mergejoin = true;
 bool		enable_hashjoin = true;
+bool		enable_hashjoin_bloom = true;
 bool		enable_gathermerge = true;
 bool		enable_partitionwise_join = false;
 bool		enable_partitionwise_aggregate = false;
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index afaa058b046..eb75cf4c5a2 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -920,6 +920,13 @@
   boot_val => 'true',
 },
 
+{ name => 'enable_hashjoin_bloom', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
+  short_desc => 'Enables the use of a Bloom filter to prefilter hash join probes.',
+  flags => 'GUC_EXPLAIN',
+  variable => 'enable_hashjoin_bloom',
+  boot_val => 'true',
+},
+
 { name => 'enable_incremental_sort', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
   short_desc => 'Enables the planner\'s use of incremental sort steps.',
   flags => 'GUC_EXPLAIN',
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ac38cddaaf9..c598504fe25 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -428,6 +428,7 @@
 #enable_gathermerge = on
 #enable_hashagg = on
 #enable_hashjoin = on
+#enable_hashjoin_bloom = on
 #enable_incremental_sort = on
 #enable_indexscan = on
 #enable_indexonlyscan = on
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 4d342174b9a..62d43c7dab4 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -14,6 +14,7 @@
 #ifndef HASHJOIN_H
 #define HASHJOIN_H
 
+#include "lib/bloomfilter.h"
 #include "nodes/execnodes.h"
 #include "port/atomics.h"
 #include "storage/barrier.h"
@@ -338,6 +339,44 @@ typedef struct HashJoinTableData
 
 	bool		growEnabled;	/* flag to shut off nbatch increases */
 
+	/*
+	 * Optional Bloom filter built on the hashes of the inner relation's join
+	 * keys (the same hash values used by the hash table).  When present, it is
+	 * consulted before probing the hash table to discard outer tuples that
+	 * cannot have a match.  It always contains the hashes of every inner
+	 * tuple, so a negative answer is conclusive across all batches.  This is
+	 * only used by the non-parallel hash join.
+	 */
+	bloom_filter *bloomFilter;	/* Bloom filter, or NULL if not used */
+	double		bloomElements;	/* estimated number of inner tuples */
+	int64		bloomProbes;	/* hash-table probes in the current window */
+	int64		bloomMatches;	/* matches among those probes */
+	bool		bloomSampling;	/* only probe the filter for a sample? */
+	uint64		bloomSampleCounter; /* counter used while sampling */
+	uint64		bloomSampleProbes;  /* probes counted while sampling */
+	uint64		bloomSampleMatches; /* matches among those probes */
+
+	/*
+	 * Cumulative Bloom filter probe statistics, retained for the lifetime of
+	 * the join so EXPLAIN ANALYZE can report how effective the filter was.
+	 * bloomLookups counts how many outer tuples were actually checked against
+	 * the filter, and bloomRejects how many of those were discarded because
+	 * the filter proved they could not match.  Unlike bloomProbes/bloomMatches
+	 * above, these are never reset.
+	 */
+	int64		bloomLookups;	/* outer tuples tested against the filter */
+	int64		bloomRejects;	/* outer tuples rejected by the filter */
+
+	/*
+	 * Cumulative hash-table probe statistics, retained for the lifetime of the
+	 * join.  hashLookups counts how many outer tuples actually probed the hash
+	 * table, and hashMatches how many of those found at least one matching
+	 * inner tuple.  Outer tuples eliminated by the Bloom filter never probe the
+	 * hash table and so are not counted here.
+	 */
+	int64		hashLookups;	/* outer tuples that probed the hash table */
+	int64		hashMatches;	/* probes that found a matching inner tuple */
+
 	/*
 	 * totalTuples is the running total of tuples inserted into either the
 	 * main or skew hash tables.  reportTuples is the number of tuples that we
diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h
index 4076990408e..215e03d5529 100644
--- a/src/include/executor/instrument_node.h
+++ b/src/include/executor/instrument_node.h
@@ -227,6 +227,18 @@ typedef struct HashInstrumentation
 	int			nbatch;			/* number of batches at end of execution */
 	int			nbatch_original;	/* planned number of batches */
 	Size		space_peak;		/* peak memory usage in bytes */
+
+	/* Bloom filter statistics (only the non-parallel hash join builds one) */
+	bool		bloom_used;		/* was a Bloom filter built? */
+	int			bloom_nhashfuncs;	/* number of hash functions used */
+	uint64		bloom_nbytes;	/* size of the filter's bitset, in bytes */
+	double		bloom_false_positive_rate;	/* estimated false positive rate */
+	int64		bloom_nprobes;	/* number of filter probes */
+	int64		bloom_nmatches;	/* number of probes matching the filter */
+
+	/* Hash table probe statistics */
+	int64		hash_nlookups;	/* outer tuples that probed the hash table */
+	int64		hash_nmatches;	/* probes that found a matching inner tuple */
 } HashInstrumentation;
 
 /*
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index 9ff493b627a..a5f45e55875 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -36,6 +36,11 @@ extern void ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable,
 extern void ExecHashTableInsert(HashJoinTable hashtable,
 								TupleTableSlot *slot,
 								uint32 hashvalue);
+extern void ExecHashCreateBloomFilter(HashJoinTable hashtable);
+extern void ExecHashBuildBloomFilter(HashJoinTable hashtable);
+extern bool ExecHashBloomReject(HashJoinTable hashtable, uint32 hashvalue);
+extern void ExecHashBloomSamplingUpdate(HashJoinTable hashtable, bool matched);
+extern void ExecHashBloomAccountLookup(HashJoinTable hashtable);
 extern void ExecParallelHashTableInsert(HashJoinTable hashtable,
 										TupleTableSlot *slot,
 										uint32 hashvalue);
diff --git a/src/include/lib/bloomfilter.h b/src/include/lib/bloomfilter.h
index 860ee9bdc72..8b705319f82 100644
--- a/src/include/lib/bloomfilter.h
+++ b/src/include/lib/bloomfilter.h
@@ -17,11 +17,16 @@ typedef struct bloom_filter bloom_filter;
 
 extern bloom_filter *bloom_create(int64 total_elems, int bloom_work_mem,
 								  uint64 seed);
+extern bloom_filter *bloom_create_custom(int64 total_elems, int bloom_work_mem,
+										 uint64 min_bitset_bytes, uint64 seed);
 extern void bloom_free(bloom_filter *filter);
 extern void bloom_add_element(bloom_filter *filter, unsigned char *elem,
 							  size_t len);
 extern bool bloom_lacks_element(bloom_filter *filter, unsigned char *elem,
 								size_t len);
 extern double bloom_prop_bits_set(bloom_filter *filter);
+extern int bloom_num_hash_funcs(bloom_filter *filter);
+extern uint64 bloom_total_bits(bloom_filter *filter);
+extern double bloom_false_positive_rate(bloom_filter *filter);
 
 #endif							/* BLOOMFILTER_H */
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index f2fd5d31507..7339979c008 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -62,6 +62,7 @@ extern PGDLLIMPORT bool enable_material;
 extern PGDLLIMPORT bool enable_memoize;
 extern PGDLLIMPORT bool enable_mergejoin;
 extern PGDLLIMPORT bool enable_hashjoin;
+extern PGDLLIMPORT bool enable_hashjoin_bloom;
 extern PGDLLIMPORT bool enable_gathermerge;
 extern PGDLLIMPORT bool enable_partitionwise_join;
 extern PGDLLIMPORT bool enable_partitionwise_aggregate;
diff --git a/src/test/regress/expected/join_hash_bloom.out b/src/test/regress/expected/join_hash_bloom.out
new file mode 100644
index 00000000000..c9b5bdc66c9
--- /dev/null
+++ b/src/test/regress/expected/join_hash_bloom.out
@@ -0,0 +1,178 @@
+CREATE TABLE hash_bloom_fact (id int, did int, padding text);
+CREATE TABLE hash_bloom_dimension (id int, r float, padding text);
+-- fact is 10x the dimension size
+SELECT setseed(0); -- stabilize random() output
+ setseed 
+---------
+ 
+(1 row)
+
+INSERT INTO hash_bloom_fact SELECT i, 1 + mod(i, 10000), md5(i::text) FROM generate_series(1,100000) s(i);
+INSERT INTO hash_bloom_dimension SELECT i, random(), md5(i::text) FROM generate_series(1,10000) s(i);
+VACUUM ANALYZE hash_bloom_fact;
+VACUUM ANALYZE hash_bloom_dimension;
+-- no parallel queries for now, force hashjoins
+SET max_parallel_workers_per_gather = 0;
+SET enable_nestloop = off;
+SET enable_mergejoin = off;
+SET work_mem = '512kB';
+-- non-selective in-memory hash join does not use Bloom filters
+SET enable_hashjoin_bloom = off;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id);
+                                      QUERY PLAN                                      
+--------------------------------------------------------------------------------------
+ Hash Join (actual rows=100000.00 loops=1)
+   Output: f.id, f.did, f.padding, d.id, d.r, d.padding
+   Hash Cond: (f.did = d.id)
+   ->  Seq Scan on public.hash_bloom_fact f (actual rows=100000.00 loops=1)
+         Output: f.id, f.did, f.padding
+   ->  Hash (actual rows=10000.00 loops=1)
+         Output: d.id, d.r, d.padding
+         Buckets: 16384  Batches: 1  Memory Usage: 920kB
+         Hash Lookups: 100000  Matches: 100000  Match Rate: 100.000%
+         ->  Seq Scan on public.hash_bloom_dimension d (actual rows=10000.00 loops=1)
+               Output: d.id, d.r, d.padding
+(11 rows)
+
+SET enable_hashjoin_bloom = on;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id);
+                                      QUERY PLAN                                      
+--------------------------------------------------------------------------------------
+ Hash Join (actual rows=100000.00 loops=1)
+   Output: f.id, f.did, f.padding, d.id, d.r, d.padding
+   Hash Cond: (f.did = d.id)
+   ->  Seq Scan on public.hash_bloom_fact f (actual rows=100000.00 loops=1)
+         Output: f.id, f.did, f.padding
+   ->  Hash (actual rows=10000.00 loops=1)
+         Output: d.id, d.r, d.padding
+         Buckets: 16384  Batches: 1  Memory Usage: 920kB
+         Hash Lookups: 100000  Matches: 100000  Match Rate: 100.000%
+         ->  Seq Scan on public.hash_bloom_dimension d (actual rows=10000.00 loops=1)
+               Output: d.id, d.r, d.padding
+(11 rows)
+
+-- a selective in-memory join uses a filter (after 1000 lookups)
+SET enable_hashjoin_bloom = off;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id) WHERE d.r < 0.5;
+                                     QUERY PLAN                                      
+-------------------------------------------------------------------------------------
+ Hash Join (actual rows=50180.00 loops=1)
+   Output: f.id, f.did, f.padding, d.id, d.r, d.padding
+   Hash Cond: (f.did = d.id)
+   ->  Seq Scan on public.hash_bloom_fact f (actual rows=100000.00 loops=1)
+         Output: f.id, f.did, f.padding
+   ->  Hash (actual rows=5018.00 loops=1)
+         Output: d.id, d.r, d.padding
+         Buckets: 8192  Batches: 1  Memory Usage: 461kB
+         Hash Lookups: 100000  Matches: 50180  Match Rate: 50.180%
+         ->  Seq Scan on public.hash_bloom_dimension d (actual rows=5018.00 loops=1)
+               Output: d.id, d.r, d.padding
+               Filter: (d.r < '0.5'::double precision)
+               Rows Removed by Filter: 4982
+(13 rows)
+
+SET enable_hashjoin_bloom = on;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id) WHERE d.r < 0.5;
+                                     QUERY PLAN                                      
+-------------------------------------------------------------------------------------
+ Hash Join (actual rows=50180.00 loops=1)
+   Output: f.id, f.did, f.padding, d.id, d.r, d.padding
+   Hash Cond: (f.did = d.id)
+   ->  Seq Scan on public.hash_bloom_fact f (actual rows=100000.00 loops=1)
+         Output: f.id, f.did, f.padding
+   ->  Hash (actual rows=5018.00 loops=1)
+         Output: d.id, d.r, d.padding
+         Buckets: 8192  Batches: 1  Memory Usage: 461kB
+         Hash Lookups: 52754  Matches: 50180  Match Rate: 95.121%
+         Bloom Filter: Size: 8kB  Hash Functions: 9  False Positive Rate: 0.191%
+         Bloom Filter Probes: 95000  Matches: 47754  Match Rate: 50.267%
+         ->  Seq Scan on public.hash_bloom_dimension d (actual rows=5018.00 loops=1)
+               Output: d.id, d.r, d.padding
+               Filter: (d.r < '0.5'::double precision)
+               Rows Removed by Filter: 4982
+(15 rows)
+
+-- force batching
+SET work_mem = '128kB';
+-- batched join always creates a Bloom filter, but then disables it if
+-- not selective enough
+SET enable_hashjoin_bloom = off;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id);
+                                      QUERY PLAN                                      
+--------------------------------------------------------------------------------------
+ Hash Join (actual rows=100000.00 loops=1)
+   Output: f.id, f.did, f.padding, d.id, d.r, d.padding
+   Hash Cond: (f.did = d.id)
+   ->  Seq Scan on public.hash_bloom_fact f (actual rows=100000.00 loops=1)
+         Output: f.id, f.did, f.padding
+   ->  Hash (actual rows=10000.00 loops=1)
+         Output: d.id, d.r, d.padding
+         Buckets: 4096  Batches: 4  Memory Usage: 229kB
+         Hash Lookups: 100000  Matches: 100000  Match Rate: 100.000%
+         ->  Seq Scan on public.hash_bloom_dimension d (actual rows=10000.00 loops=1)
+               Output: d.id, d.r, d.padding
+(11 rows)
+
+SET enable_hashjoin_bloom = on;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id);
+                                      QUERY PLAN                                      
+--------------------------------------------------------------------------------------
+ Hash Join (actual rows=100000.00 loops=1)
+   Output: f.id, f.did, f.padding, d.id, d.r, d.padding
+   Hash Cond: (f.did = d.id)
+   ->  Seq Scan on public.hash_bloom_fact f (actual rows=100000.00 loops=1)
+         Output: f.id, f.did, f.padding
+   ->  Hash (actual rows=10000.00 loops=1)
+         Output: d.id, d.r, d.padding
+         Buckets: 4096  Batches: 4  Memory Usage: 229kB
+         Hash Lookups: 100000  Matches: 100000  Match Rate: 100.000%
+         Bloom Filter: Size: 16kB  Hash Functions: 9  False Positive Rate: 0.187%
+         Bloom Filter Probes: 1990  Matches: 1990  Match Rate: 100.000%
+         ->  Seq Scan on public.hash_bloom_dimension d (actual rows=10000.00 loops=1)
+               Output: d.id, d.r, d.padding
+(13 rows)
+
+-- batched join always creates a Bloom filter, and keeps using it if
+-- selective enough
+SET enable_hashjoin_bloom = off;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id) WHERE d.r < 0.5;
+                                     QUERY PLAN                                      
+-------------------------------------------------------------------------------------
+ Hash Join (actual rows=50180.00 loops=1)
+   Output: f.id, f.did, f.padding, d.id, d.r, d.padding
+   Hash Cond: (f.did = d.id)
+   ->  Seq Scan on public.hash_bloom_fact f (actual rows=100000.00 loops=1)
+         Output: f.id, f.did, f.padding
+   ->  Hash (actual rows=5018.00 loops=1)
+         Output: d.id, d.r, d.padding
+         Buckets: 4096  Batches: 2  Memory Usage: 228kB
+         Hash Lookups: 100000  Matches: 50180  Match Rate: 50.180%
+         ->  Seq Scan on public.hash_bloom_dimension d (actual rows=5018.00 loops=1)
+               Output: d.id, d.r, d.padding
+               Filter: (d.r < '0.5'::double precision)
+               Rows Removed by Filter: 4982
+(13 rows)
+
+SET enable_hashjoin_bloom = on;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id) WHERE d.r < 0.5;
+                                     QUERY PLAN                                      
+-------------------------------------------------------------------------------------
+ Hash Join (actual rows=50180.00 loops=1)
+   Output: f.id, f.did, f.padding, d.id, d.r, d.padding
+   Hash Cond: (f.did = d.id)
+   ->  Seq Scan on public.hash_bloom_fact f (actual rows=100000.00 loops=1)
+         Output: f.id, f.did, f.padding
+   ->  Hash (actual rows=5018.00 loops=1)
+         Output: d.id, d.r, d.padding
+         Buckets: 4096  Batches: 2  Memory Usage: 228kB
+         Hash Lookups: 50250  Matches: 50180  Match Rate: 99.861%
+         Bloom Filter: Size: 8kB  Hash Functions: 9  False Positive Rate: 0.191%
+         Bloom Filter Probes: 100000  Matches: 50250  Match Rate: 50.250%
+         ->  Seq Scan on public.hash_bloom_dimension d (actual rows=5018.00 loops=1)
+               Output: d.id, d.r, d.padding
+               Filter: (d.r < '0.5'::double precision)
+               Rows Removed by Filter: 4982
+(15 rows)
+
+DROP TABLE hash_bloom_fact;
+DROP TABLE hash_bloom_dimension;
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 132b56a5864..a796e431415 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -163,6 +163,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_group_by_reordering     | on
  enable_hashagg                 | on
  enable_hashjoin                | on
+ enable_hashjoin_bloom          | on
  enable_incremental_sort        | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
@@ -180,7 +181,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(25 rows)
+(26 rows)
 
 -- There are always wait event descriptions for various types.  InjectionPoint
 -- may be present or absent, depending on history since last postmaster start.
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 8fa0a6c47fb..095a3fea981 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -71,7 +71,7 @@ test: brin gin gist spgist privileges init_privs security_label collate matview
 # ----------
 # Additional BRIN tests
 # ----------
-test: brin_bloom brin_multi
+test: brin_bloom brin_multi join_hash_bloom
 
 # ----------
 # Another group of parallel tests
diff --git a/src/test/regress/sql/join_hash_bloom.sql b/src/test/regress/sql/join_hash_bloom.sql
new file mode 100644
index 00000000000..b62e0b2ed90
--- /dev/null
+++ b/src/test/regress/sql/join_hash_bloom.sql
@@ -0,0 +1,56 @@
+CREATE TABLE hash_bloom_fact (id int, did int, padding text);
+CREATE TABLE hash_bloom_dimension (id int, r float, padding text);
+
+-- fact is 10x the dimension size
+SELECT setseed(0); -- stabilize random() output
+INSERT INTO hash_bloom_fact SELECT i, 1 + mod(i, 10000), md5(i::text) FROM generate_series(1,100000) s(i);
+INSERT INTO hash_bloom_dimension SELECT i, random(), md5(i::text) FROM generate_series(1,10000) s(i);
+
+VACUUM ANALYZE hash_bloom_fact;
+VACUUM ANALYZE hash_bloom_dimension;
+
+-- no parallel queries for now, force hashjoins
+SET max_parallel_workers_per_gather = 0;
+SET enable_nestloop = off;
+SET enable_mergejoin = off;
+SET work_mem = '512kB';
+
+-- non-selective in-memory hash join does not use Bloom filters
+
+SET enable_hashjoin_bloom = off;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id);
+
+SET enable_hashjoin_bloom = on;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id);
+
+-- a selective in-memory join uses a filter (after 1000 lookups)
+
+SET enable_hashjoin_bloom = off;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id) WHERE d.r < 0.5;
+
+SET enable_hashjoin_bloom = on;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id) WHERE d.r < 0.5;
+
+-- force batching
+SET work_mem = '128kB';
+
+-- batched join always creates a Bloom filter, but then disables it if
+-- not selective enough
+
+SET enable_hashjoin_bloom = off;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id);
+
+SET enable_hashjoin_bloom = on;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id);
+
+-- batched join always creates a Bloom filter, and keeps using it if
+-- selective enough
+
+SET enable_hashjoin_bloom = off;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id) WHERE d.r < 0.5;
+
+SET enable_hashjoin_bloom = on;
+EXPLAIN (ANALYZE, VERBOSE, TIMING OFF, COSTS OFF, BUFFERS OFF, SUMMARY OFF) SELECT * FROM hash_bloom_fact f JOIN hash_bloom_dimension d ON (f.did = d.id) WHERE d.r < 0.5;
+
+DROP TABLE hash_bloom_fact;
+DROP TABLE hash_bloom_dimension;
-- 
2.54.0

