From 62040683b360efa790b42d0c3c4970bbdc439c52 Mon Sep 17 00:00:00 2001 From: Ben Mejia Date: Mon, 22 Jun 2026 13:56:45 -0700 Subject: [PATCH v1 1/2] Add enable_hashjoin_prefilter: drop non-matching outer tuples before spill Maintain a per-batch inner-occupancy bitmap (the prefilter), populated as inner tuples are written to their batch file during the build. During outer partitioning, consult the target batch's bitmap and drop the tuple if its bucket is empty. Restricted to serial multi-batch joins with no batch doubling. An adaptive guard samples the eviction rate over a window of routed probe tuples and stops consulting the bitmap once the rate falls below break-even, so dense (FK-like) joins do not keep paying the per-probe check for no benefit. Controlled by the enable_hashjoin_prefilter GUC (default off) and reported in EXPLAIN ANALYZE as "Outer Prefilter: N tuples dropped before spill". --- src/backend/commands/explain.c | 15 ++ src/backend/executor/nodeHash.c | 35 ++++ src/backend/executor/nodeHashjoin.c | 53 ++++- src/backend/utils/misc/guc_parameters.dat | 7 + src/backend/utils/misc/guc_tables.c | 1 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/executor/hashjoin.h | 17 ++ src/include/executor/instrument_node.h | 1 + src/include/miscadmin.h | 2 + src/test/regress/expected/join_hash.out | 195 ++++++++++++++++++ src/test/regress/sql/join_hash.sql | 154 ++++++++++++++ 11 files changed, 479 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index a40d03d35f3..cb4044c310d 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3473,6 +3473,21 @@ show_hash_info(HashState *hashstate, ExplainState *es) hinstrument.nbuckets, hinstrument.nbatch, spacePeakKb); } + + /* Show outer pre-storage filter stats if it dropped anything */ + if (hinstrument.outer_prefiltered > 0) + { + if (es->format != EXPLAIN_FORMAT_TEXT) + ExplainPropertyInteger("Outer Tuples Prefiltered", NULL, + hinstrument.outer_prefiltered, es); + else + { + ExplainIndentText(es); + appendStringInfo(es->str, + "Outer Prefilter: " UINT64_FORMAT " tuples dropped before spill\n", + hinstrument.outer_prefiltered); + } + } } } diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 8825bb6fa23..389c9b2367a 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -537,6 +537,11 @@ ExecHashTableCreate(HashState *state) hashtable->skewTuples = 0; hashtable->innerBatchFile = NULL; hashtable->outerBatchFile = NULL; + hashtable->batch_bitmap = NULL; + hashtable->prefilter_active = false; + hashtable->prefilter_win_checks = 0; + hashtable->prefilter_win_drops = 0; + hashtable->outer_prefiltered = 0; hashtable->spaceUsed = 0; hashtable->spacePeak = 0; hashtable->spaceAllowed = space_allowed; @@ -587,6 +592,21 @@ ExecHashTableCreate(HashState *state) hashtable->innerBatchFile = palloc0_array(BufFile *, nbatch); hashtable->outerBatchFile = palloc0_array(BufFile *, nbatch); + /* + * Allocate per-batch pre-filter bitmaps. + * Index 0 is left NULL, as batch 0 is never spilled. + */ + if (enable_hashjoin_prefilter) + { + size_t bitmap_bytes = (hashtable->nbuckets + 7) / 8; + int b; + + hashtable->batch_bitmap = palloc0_array(uint8 *, nbatch); + for (b = 1; b < nbatch; b++) + hashtable->batch_bitmap[b] = palloc0(bitmap_bytes); + hashtable->prefilter_active = true; + } + MemoryContextSwitchTo(oldctx); /* The files will not be opened until needed... */ @@ -1103,6 +1123,16 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) hashtable->nbatch = nbatch; + /* Batch doubling invalidates the outer pre-storage filter. */ + if (hashtable->batch_bitmap != NULL) + { + for (int b = 1; b < oldnbatch; b++) + if (hashtable->batch_bitmap[b]) + pfree(hashtable->batch_bitmap[b]); + pfree(hashtable->batch_bitmap); + hashtable->batch_bitmap = NULL; + } + /* * Scan through the existing hash table entries and dump out any that are * no longer of the current batch. @@ -1851,6 +1881,10 @@ ExecHashTableInsert(HashJoinTable hashtable, hashvalue, &hashtable->innerBatchFile[batchno], hashtable); + + /* Set bit in pre-filter to record this bucket as occupied */ + if (hashtable->batch_bitmap != NULL) + hashtable->batch_bitmap[batchno][bucketno >> 3] |= (1 << (bucketno & 7)); } if (shouldFree) @@ -2945,6 +2979,7 @@ ExecHashAccumInstrumentation(HashInstrumentation *instrument, hashtable->nbatch_original); instrument->space_peak = Max(instrument->space_peak, hashtable->spacePeak); + instrument->outer_prefiltered += hashtable->outer_prefiltered; } /* diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 0b365d5b475..44ce1857947 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -193,6 +193,15 @@ /* Returns true if doing null-fill on inner relation */ #define HJ_FILL_INNER(hjstate) ((hjstate)->hj_NullOuterTupleSlot != NULL) +/* + * Adaptive guard for the pre-filter. Sample the eviction rate + * over a window of routed probe tuples; if it falls below the threshold, stop + * consulting the bitmap for the rest of the outer-partitioning pass so dense + * (FK-like) joins do not pay the per-probe check for no benefit. + */ +#define PREFILTER_WINDOW 4096 +#define PREFILTER_MIN_DROP_PCT 5 + static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode, HashJoinState *hjstate, uint32 *hashvalue); @@ -508,8 +517,48 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) { bool shouldFree; - MinimalTuple mintuple = ExecFetchSlotMinimalTuple(outerTupleSlot, - &shouldFree); + MinimalTuple mintuple; + + /* + * Pre-filter: if inner table has nothing in this bucket, the + * outer tuple cannot match, so drop it. Not valid for outer/anti + * joins, which must still emit unmatched outer rows. + */ + if (hashtable->batch_bitmap != NULL && hashtable->prefilter_active && + (node->js.jointype == JOIN_INNER || + node->js.jointype == JOIN_SEMI)) + { + int bucketno = node->hj_CurBucketNo; + uint8 *bitmap = hashtable->batch_bitmap[batchno]; + bool empty; + + empty = ((bitmap[bucketno >> 3] & (1 << (bucketno & 7))) == 0); + + /* Window accounting: count this probe, and the drop if empty. */ + hashtable->prefilter_win_checks++; + if (empty) + { + hashtable->prefilter_win_drops++; + hashtable->outer_prefiltered++; + } + + /* End of window: stop filtering if it is no longer paying off. */ + if (hashtable->prefilter_win_checks >= PREFILTER_WINDOW) + { + if (hashtable->prefilter_win_drops * 100 < + hashtable->prefilter_win_checks * PREFILTER_MIN_DROP_PCT) + hashtable->prefilter_active = false; + hashtable->prefilter_win_checks = 0; + hashtable->prefilter_win_drops = 0; + } + + /* empty bucket: no inner match possible, drop without spilling */ + if (empty) + continue; + } + + mintuple = ExecFetchSlotMinimalTuple(outerTupleSlot, + &shouldFree); /* * Need to postpone this outer tuple to a later batch. diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 7b1eb6e61bc..00c9c103e16 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -919,6 +919,13 @@ boot_val => 'true', }, +{ name => 'enable_hashjoin_prefilter', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables pre-filter for outer tuples before spilling them in multi-batch hash joins.', + flags => 'GUC_EXPLAIN', + variable => 'enable_hashjoin_prefilter', + boot_val => 'false', +}, + { name => 'enable_incremental_sort', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', short_desc => 'Enables the planner\'s use of incremental sort steps.', flags => 'GUC_EXPLAIN', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 290ccbc543e..400cc687533 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -532,6 +532,7 @@ extern const struct config_enum_entry dynamic_shared_memory_options[]; * GUC option variables that are exported from this module */ bool AllowAlterSystem = true; +bool enable_hashjoin_prefilter = false; bool log_duration = false; bool Debug_print_plan = false; bool Debug_print_parse = false; diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index ac38cddaaf9..92630504fdf 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -428,6 +428,7 @@ #enable_gathermerge = on #enable_hashagg = on #enable_hashjoin = on +#enable_hashjoin_prefilter = off #enable_incremental_sort = on #enable_indexscan = on #enable_indexonlyscan = on diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index 4d342174b9a..ca5f69f9830 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -359,6 +359,23 @@ typedef struct HashJoinTableData BufFile **innerBatchFile; /* buffered virtual temp file per batch */ BufFile **outerBatchFile; /* buffered virtual temp file per batch */ + /* + * Per-batch pre-filter bitmaps, one bit per bucket, populated as inner + * tuples are written to their batch file during the inner build. + * During outer partitioning, an outer tuple whose bucket is empty + * cannot match. So for inner and semi joins it can be dropped before + * being spilled, saving the outerBatchFile write and the later read. + * + * prefilter_active is an adaptive runtime guard: it starts true and is + * cleared if the sampled eviction rate over a window of routed probe tuples + * is below break-even. + */ + uint8 **batch_bitmap; /* per-batch occupancy bitmaps, or NULL */ + bool prefilter_active; /* adaptive: still consulting the bitmap? */ + uint64 prefilter_win_checks; /* routed probes seen in current window */ + uint64 prefilter_win_drops; /* drops in current window */ + uint64 outer_prefiltered; /* outer tuples dropped before spilling */ + Size spaceUsed; /* memory space currently used by tuples */ Size spaceAllowed; /* upper limit for space used */ Size spacePeak; /* peak space used */ diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 4076990408e..37776baa7ab 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -227,6 +227,7 @@ typedef struct HashInstrumentation int nbatch; /* number of batches at end of execution */ int nbatch_original; /* planned number of batches */ Size space_peak; /* peak memory usage in bytes */ + uint64 outer_prefiltered; /* outer tuples dropped before spilling */ } HashInstrumentation; /* diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 7170a4bff98..9ca5a15ea5b 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -273,6 +273,8 @@ extern PGDLLIMPORT double hash_mem_multiplier; extern PGDLLIMPORT int maintenance_work_mem; extern PGDLLIMPORT int max_parallel_maintenance_workers; +extern PGDLLIMPORT bool enable_hashjoin_prefilter; + /* * Upper and lower hard limits for the buffer access strategy ring size * specified by the VacuumBufferUsageLimit GUC and BUFFER_USAGE_LIMIT option diff --git a/src/test/regress/expected/join_hash.out b/src/test/regress/expected/join_hash.out index 75009e29720..0557c544272 100644 --- a/src/test/regress/expected/join_hash.out +++ b/src/test/regress/expected/join_hash.out @@ -1166,3 +1166,198 @@ lateral (select t1.fivethous, i4.f1 from tenk1 t1 join int4_tbl i4 (4 rows) rollback; +-- +-- enable_hashjoin_prefilter: drop non-matching outer tuples before they are +-- spilled to a batch file. Valid only for inner and semi joins; outer and +-- anti joins must still emit unmatched outer rows, so the pre-filter must not +-- run for them. +-- +begin; +set local enable_hashjoin = on; +set local enable_mergejoin = off; +set local enable_nestloop = off; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +-- Locate the Hash node in an EXPLAIN (FORMAT json) plan. +create or replace function find_hash(node json) +returns json language plpgsql +as +$$ +declare + x json; + child json; +begin + if node->>'Node Type' = 'Hash' then + return node; + else + for child in select json_array_elements(node->'Plans') + loop + x := find_hash(child); + if x is not null then + return x; + end if; + end loop; + return null; + end if; +end; +$$; +-- original/final batch counts, as in the tests above. +create or replace function hash_join_batches(query text) +returns table (original int, final int) language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + original := hash_node->>'Original Hash Batches'; + final := hash_node->>'Hash Batches'; + return next; + end loop; +end; +$$; +-- Number of outer tuples removed by the pre-filter, or NULL when it did not +-- run (e.g. for outer/anti joins, or a single-batch join). +create or replace function hash_join_prefiltered(query text) +returns bigint language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; + result bigint; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + result := (hash_node->>'Outer Tuples Prefiltered')::bigint; + end loop; + return result; +end; +$$; +-- Build side: 20000 distinct keys, large enough to need multiple batches at +-- the work_mem above and correctly estimated so nbatch stays put (the regime +-- where the pre-filter bitmaps survive). +create table pf_build as + select g as id, repeat('x', 30) as t from generate_series(1, 20000) g; +analyze pf_build; +-- Probe side: 100000 rows, ~10% matching (keys 1..10000) and ~90% with no +-- match (keys far above the build range), interleaved. The interleaving +-- matters: the pre-filter samples its drop rate over a window of routed probe +-- tuples and switches itself off if that window is unrepresentative, so the +-- matching and non-matching rows must be mixed rather than segregated. Larger +-- than the build side so the planner hashes pf_build and probes this. +create table pf_probe as + select case when g % 10 = 0 then g / 10 + else 1000000 + g end as id + from generate_series(1, 100000) g; +analyze pf_probe; +-- Premise: the join is multi-batch and planned that way (no runtime growth). +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from pf_probe p join pf_build b using (id) +$$); + initially_multibatch | increased_batches +----------------------+------------------- + t | f +(1 row) + +-- (1) INNER join: identical results with the pre-filter off and on. +set local enable_hashjoin_prefilter = off; +select count(*), coalesce(sum(p.id), 0) as checksum + from pf_probe p join pf_build b using (id); + count | checksum +-------+---------- + 10000 | 50005000 +(1 row) + +set local enable_hashjoin_prefilter = on; +select count(*), coalesce(sum(p.id), 0) as checksum + from pf_probe p join pf_build b using (id); + count | checksum +-------+---------- + 10000 | 50005000 +(1 row) + +-- (2) the pre-filter actually dropped non-matching outer tuples. +select hash_join_prefiltered( +$$ + select count(*) from pf_probe p join pf_build b using (id) +$$) > 0 as inner_prefilter_fired; + inner_prefilter_fired +----------------------- + t +(1 row) + +-- (3) SEMI join: identical results off vs on, and the pre-filter may fire. +set local enable_hashjoin_prefilter = off; +select count(*) from pf_probe p + where exists (select 1 from pf_build b where b.id = p.id); + count +------- + 10000 +(1 row) + +set local enable_hashjoin_prefilter = on; +select count(*) from pf_probe p + where exists (select 1 from pf_build b where b.id = p.id); + count +------- + 10000 +(1 row) + +select hash_join_prefiltered( +$$ + select count(*) from pf_probe p + where exists (select 1 from pf_build b where b.id = p.id) +$$) > 0 as semi_prefilter_fired; + semi_prefilter_fired +---------------------- + t +(1 row) + +-- (4) LEFT join: unmatched outer rows must survive, so the pre-filter must +-- not run. (A wrong pre-filter would make this count drop below 100000.) +set local enable_hashjoin_prefilter = on; +select count(*) from pf_probe p left join pf_build b using (id); + count +-------- + 100000 +(1 row) + +select hash_join_prefiltered( +$$ + select count(*) from pf_probe p left join pf_build b using (id) +$$) is null as left_prefilter_disabled; + left_prefilter_disabled +------------------------- + t +(1 row) + +-- (5) ANTI join: the non-matching outer rows are exactly the result, so the +-- pre-filter must not run. +select count(*) from pf_probe p + where not exists (select 1 from pf_build b where b.id = p.id); + count +------- + 90000 +(1 row) + +select hash_join_prefiltered( +$$ + select count(*) from pf_probe p + where not exists (select 1 from pf_build b where b.id = p.id) +$$) is null as anti_prefilter_disabled; + anti_prefilter_disabled +------------------------- + t +(1 row) + +rollback; diff --git a/src/test/regress/sql/join_hash.sql b/src/test/regress/sql/join_hash.sql index 989390e6864..ab01b277532 100644 --- a/src/test/regress/sql/join_hash.sql +++ b/src/test/regress/sql/join_hash.sql @@ -625,3 +625,157 @@ lateral (select t1.fivethous, i4.f1 from tenk1 t1 join int4_tbl i4 on t1.fivethous = i4.f1+i8.q2 order by 1,2) ss; rollback; + +-- +-- enable_hashjoin_prefilter: drop non-matching outer tuples before they are +-- spilled to a batch file. Valid only for inner and semi joins; outer and +-- anti joins must still emit unmatched outer rows, so the pre-filter must not +-- run for them. +-- +begin; + +set local enable_hashjoin = on; +set local enable_mergejoin = off; +set local enable_nestloop = off; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; + +-- Locate the Hash node in an EXPLAIN (FORMAT json) plan. +create or replace function find_hash(node json) +returns json language plpgsql +as +$$ +declare + x json; + child json; +begin + if node->>'Node Type' = 'Hash' then + return node; + else + for child in select json_array_elements(node->'Plans') + loop + x := find_hash(child); + if x is not null then + return x; + end if; + end loop; + return null; + end if; +end; +$$; + +-- original/final batch counts, as in the tests above. +create or replace function hash_join_batches(query text) +returns table (original int, final int) language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + original := hash_node->>'Original Hash Batches'; + final := hash_node->>'Hash Batches'; + return next; + end loop; +end; +$$; + +-- Number of outer tuples removed by the pre-filter, or NULL when it did not +-- run (e.g. for outer/anti joins, or a single-batch join). +create or replace function hash_join_prefiltered(query text) +returns bigint language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; + result bigint; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + result := (hash_node->>'Outer Tuples Prefiltered')::bigint; + end loop; + return result; +end; +$$; + +-- Build side: 20000 distinct keys, large enough to need multiple batches at +-- the work_mem above and correctly estimated so nbatch stays put (the regime +-- where the pre-filter bitmaps survive). +create table pf_build as + select g as id, repeat('x', 30) as t from generate_series(1, 20000) g; +analyze pf_build; + +-- Probe side: 100000 rows, ~10% matching (keys 1..10000) and ~90% with no +-- match (keys far above the build range), interleaved. The interleaving +-- matters: the pre-filter samples its drop rate over a window of routed probe +-- tuples and switches itself off if that window is unrepresentative, so the +-- matching and non-matching rows must be mixed rather than segregated. Larger +-- than the build side so the planner hashes pf_build and probes this. +create table pf_probe as + select case when g % 10 = 0 then g / 10 + else 1000000 + g end as id + from generate_series(1, 100000) g; +analyze pf_probe; + +-- Premise: the join is multi-batch and planned that way (no runtime growth). +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from pf_probe p join pf_build b using (id) +$$); + +-- (1) INNER join: identical results with the pre-filter off and on. +set local enable_hashjoin_prefilter = off; +select count(*), coalesce(sum(p.id), 0) as checksum + from pf_probe p join pf_build b using (id); +set local enable_hashjoin_prefilter = on; +select count(*), coalesce(sum(p.id), 0) as checksum + from pf_probe p join pf_build b using (id); + +-- (2) the pre-filter actually dropped non-matching outer tuples. +select hash_join_prefiltered( +$$ + select count(*) from pf_probe p join pf_build b using (id) +$$) > 0 as inner_prefilter_fired; + +-- (3) SEMI join: identical results off vs on, and the pre-filter may fire. +set local enable_hashjoin_prefilter = off; +select count(*) from pf_probe p + where exists (select 1 from pf_build b where b.id = p.id); +set local enable_hashjoin_prefilter = on; +select count(*) from pf_probe p + where exists (select 1 from pf_build b where b.id = p.id); +select hash_join_prefiltered( +$$ + select count(*) from pf_probe p + where exists (select 1 from pf_build b where b.id = p.id) +$$) > 0 as semi_prefilter_fired; + +-- (4) LEFT join: unmatched outer rows must survive, so the pre-filter must +-- not run. (A wrong pre-filter would make this count drop below 100000.) +set local enable_hashjoin_prefilter = on; +select count(*) from pf_probe p left join pf_build b using (id); +select hash_join_prefiltered( +$$ + select count(*) from pf_probe p left join pf_build b using (id) +$$) is null as left_prefilter_disabled; + +-- (5) ANTI join: the non-matching outer rows are exactly the result, so the +-- pre-filter must not run. +select count(*) from pf_probe p + where not exists (select 1 from pf_build b where b.id = p.id); +select hash_join_prefiltered( +$$ + select count(*) from pf_probe p + where not exists (select 1 from pf_build b where b.id = p.id) +$$) is null as anti_prefilter_disabled; + +rollback; base-commit: 4df5fe3833a87f6629eb888ca5a385bcb0b179d9 -- 2.50.1 (Apple Git-155)