From 9866e34b9d7719bed781acdbf1961a5b933b455d Mon Sep 17 00:00:00 2001 From: Ben Mejia Date: Mon, 22 Jun 2026 15:35:39 -0700 Subject: [PATCH v1 2/2] Add a bitmap filter for single-batch prefiltering. Add a GUC to control probe filter: enable_hashjoin_probe_filter The probe site has its own one-shot adaptive guard with a higher threshold (~27%). Reported in EXPLAIN ANALYZE as "Probe Filter: N empty buckets skipped". --- src/backend/commands/explain.c | 15 ++++ src/backend/executor/nodeHash.c | 77 ++++++++++++++++++- src/backend/utils/misc/guc_parameters.dat | 7 ++ src/backend/utils/misc/guc_tables.c | 1 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/executor/hashjoin.h | 11 +++ src/include/executor/instrument_node.h | 1 + src/include/miscadmin.h | 1 + src/test/regress/expected/join_hash.out | 70 +++++++++++++++++ src/test/regress/sql/join_hash.sql | 53 +++++++++++++ 10 files changed, 234 insertions(+), 3 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index cb4044c310d..db6a842b550 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3488,6 +3488,21 @@ show_hash_info(HashState *hashstate, ExplainState *es) hinstrument.outer_prefiltered); } } + + /* Show probe-time empty-bucket skips if any */ + if (hinstrument.probe_filter_skips > 0) + { + if (es->format != EXPLAIN_FORMAT_TEXT) + ExplainPropertyInteger("Probe Filter Skips", NULL, + hinstrument.probe_filter_skips, es); + else + { + ExplainIndentText(es); + appendStringInfo(es->str, + "Probe Filter: " UINT64_FORMAT " empty buckets skipped\n", + hinstrument.probe_filter_skips); + } + } } } diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 389c9b2367a..26765b1f9f1 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -43,6 +43,14 @@ #include "utils/tuplestore.h" #include "utils/wait_event.h" +/* + * Adaptive guard for the probe-time empty-bucket filter. 
Sample the skip + * rate over a window; if it falls below break-even, stop consulting the + * bitmap. + */ +#define PROBE_FILTER_WINDOW 1024 +#define PROBE_FILTER_MIN_SKIP_PCT 27 + static void ExecHashIncreaseNumBatches(HashJoinTable hashtable); static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable); static void ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable); @@ -213,6 +221,26 @@ MultiExecPrivateHash(HashState *node) if (hashtable->nbuckets != hashtable->nbuckets_optimal) ExecHashIncreaseNumBuckets(hashtable); + /* + * Single-batch probe filter: build the bitmap filter now that nbuckets + * is final. + */ + if (hashtable->nbatch == 1 && enable_hashjoin_probe_filter && + hashtable->batch_bitmap == NULL) + { + MemoryContext oldctx = MemoryContextSwitchTo(hashtable->hashCxt); + size_t bitmap_bytes = (hashtable->nbuckets + 7) / 8; + int b; + + hashtable->batch_bitmap = palloc0_array(uint8 *, 1); + hashtable->batch_bitmap[0] = palloc0(bitmap_bytes); + for (b = 0; b < hashtable->nbuckets; b++) + if (hashtable->buckets.unshared[b] != NULL) + hashtable->batch_bitmap[0][b >> 3] |= (1 << (b & 7)); + hashtable->probe_filter_active = true; + MemoryContextSwitchTo(oldctx); + } + /* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */ hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple); if (hashtable->spaceUsed > hashtable->spacePeak) @@ -542,6 +570,10 @@ ExecHashTableCreate(HashState *state) hashtable->prefilter_win_checks = 0; hashtable->prefilter_win_drops = 0; hashtable->outer_prefiltered = 0; + hashtable->probe_filter_active = false; + hashtable->probe_filter_win_checks = 0; + hashtable->probe_filter_win_skips = 0; + hashtable->probe_filter_skips = 0; hashtable->spaceUsed = 0; hashtable->spacePeak = 0; hashtable->spaceAllowed = space_allowed; @@ -593,8 +625,9 @@ ExecHashTableCreate(HashState *state) hashtable->outerBatchFile = palloc0_array(BufFile *, nbatch); /* - * Allocate per-batch pre-filter bitmaps. 
- * Index 0 is left NULL, as batch 0 is never spilled. + * Allocate the pre-spill drop filter's per-batch occupancy bitmaps. + * Index 0 is left NULL, as batch 0 is never spilled. (The probe-time + * skip is single-batch only; MultiExecPrivateHash builds its bitmap.) */ if (enable_hashjoin_prefilter) { @@ -1126,7 +1159,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) /* Batch doubling invalidates the outer pre-storage filter. */ if (hashtable->batch_bitmap != NULL) { - for (int b = 1; b < oldnbatch; b++) + for (int b = 0; b < oldnbatch; b++) if (hashtable->batch_bitmap[b]) pfree(hashtable->batch_bitmap[b]); pfree(hashtable->batch_bitmap); @@ -2057,6 +2090,43 @@ ExecScanHashBucket(HashJoinState *hjstate, HashJoinTuple hashTuple = hjstate->hj_CurTuple; uint32 hashvalue = hjstate->hj_CurHashValue; + /* + * Probe-time empty-bucket skip: on the first probe of a standard bucket, + * check the bitmap filter and skip this bucket if empty. Also check the + * adaptive guard and disable the filter if the rate is below break-even. + */ + if (hashTuple == NULL && + enable_hashjoin_probe_filter && + hashtable->probe_filter_active && + hashtable->nbatch == 1 && + hashtable->batch_bitmap != NULL && + hjstate->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) + { + int bucketno = hjstate->hj_CurBucketNo; + uint8 *bitmap = hashtable->batch_bitmap[hashtable->curbatch]; + bool empty = ((bitmap[bucketno >> 3] & (1 << (bucketno & 7))) == 0); + + hashtable->probe_filter_win_checks++; + if (empty) + hashtable->probe_filter_win_skips++; + + /* End of probe filter window: skip only if it is paying off. 
*/ + if (hashtable->probe_filter_win_checks >= PROBE_FILTER_WINDOW) + { + if (hashtable->probe_filter_win_skips * 100 < + hashtable->probe_filter_win_checks * PROBE_FILTER_MIN_SKIP_PCT) + hashtable->probe_filter_active = false; + hashtable->probe_filter_win_checks = 0; + hashtable->probe_filter_win_skips = 0; + } + + if (empty) + { + hashtable->probe_filter_skips++; + return false; + } + } + /* * hj_CurTuple is the address of the tuple last returned from the current * bucket, or NULL if it's time to start scanning a new bucket. @@ -2980,6 +3050,7 @@ ExecHashAccumInstrumentation(HashInstrumentation *instrument, instrument->space_peak = Max(instrument->space_peak, hashtable->spacePeak); instrument->outer_prefiltered += hashtable->outer_prefiltered; + instrument->probe_filter_skips += hashtable->probe_filter_skips; } /* diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 00c9c103e16..a972e0d2364 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -926,6 +926,13 @@ boot_val => 'false', }, +{ name => 'enable_hashjoin_probe_filter', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables skipping empty hash buckets during the probe phase of single-batch hash joins.', + flags => 'GUC_EXPLAIN', + variable => 'enable_hashjoin_probe_filter', + boot_val => 'false', +}, + { name => 'enable_incremental_sort', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', short_desc => 'Enables the planner\'s use of incremental sort steps.', flags => 'GUC_EXPLAIN', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 400cc687533..b5affcc52b1 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -533,6 +533,7 @@ extern const struct config_enum_entry dynamic_shared_memory_options[]; */ bool AllowAlterSystem = true; bool 
enable_hashjoin_prefilter = false; +bool enable_hashjoin_probe_filter = false; bool log_duration = false; bool Debug_print_plan = false; bool Debug_print_parse = false; diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 92630504fdf..544b9d2e14e 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -429,6 +429,7 @@ #enable_hashagg = on #enable_hashjoin = on #enable_hashjoin_prefilter = off +#enable_hashjoin_probe_filter = off #enable_incremental_sort = on #enable_indexscan = on #enable_indexonlyscan = on diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index ca5f69f9830..487e00fc909 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -376,6 +376,17 @@ typedef struct HashJoinTableData uint64 prefilter_win_drops; /* drops in current window */ uint64 outer_prefiltered; /* outer tuples dropped before spilling */ + /* + * Probe-time empty-bucket skip (single-batch joins only). After the build, + * MultiExecPrivateHash marks each occupied bucket in batch_bitmap[0]; while + * probing, an empty bucket cannot match, so the chain walk is skipped. A + * one-shot adaptive guard disables it if the skip rate is below break-even. + */ + bool probe_filter_active; /* still consulting at probe time? 
*/ + uint64 probe_filter_win_checks; /* probes seen in current window */ + uint64 probe_filter_win_skips; /* skips in current window */ + uint64 probe_filter_skips; /* total empty buckets skipped at probe */ + Size spaceUsed; /* memory space currently used by tuples */ Size spaceAllowed; /* upper limit for space used */ Size spacePeak; /* peak space used */ diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 37776baa7ab..5df6f07034c 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -228,6 +228,7 @@ typedef struct HashInstrumentation int nbatch_original; /* planned number of batches */ Size space_peak; /* peak memory usage in bytes */ uint64 outer_prefiltered; /* outer tuples dropped before spilling */ + uint64 probe_filter_skips; /* empty buckets skipped at probe time */ } HashInstrumentation; /* diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 9ca5a15ea5b..250c416c8ec 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -274,6 +274,7 @@ extern PGDLLIMPORT int maintenance_work_mem; extern PGDLLIMPORT int max_parallel_maintenance_workers; extern PGDLLIMPORT bool enable_hashjoin_prefilter; +extern PGDLLIMPORT bool enable_hashjoin_probe_filter; /* * Upper and lower hard limits for the buffer access strategy ring size diff --git a/src/test/regress/expected/join_hash.out b/src/test/regress/expected/join_hash.out index 0557c544272..d1bbe1a177c 100644 --- a/src/test/regress/expected/join_hash.out +++ b/src/test/regress/expected/join_hash.out @@ -1241,6 +1241,26 @@ begin return result; end; $$; +-- Number of empty buckets the single-batch probe filter skipped, or NULL when +-- it did not run. 
+create or replace function hash_join_probe_skips(query text) +returns bigint language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; + result bigint; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + result := (hash_node->>'Probe Filter Skips')::bigint; + end loop; + return result; +end; +$$; -- Build side: 20000 distinct keys, large enough to need multiple batches at -- the work_mem above and correctly estimated so nbatch stays put (the regime -- where the pre-filter bitmaps survive). @@ -1360,4 +1380,54 @@ $$) is null as anti_prefilter_disabled; t (1 row) +-- (6) Probe filter must stay inert in a multi-batch join. With the drop +-- filter on, batch_bitmap[1..n] is allocated but index 0 is left NULL; the +-- probe filter is single-batch only and must not consult it. A correct count +-- (and no crash) guards against the nbatch==1 gate being removed. +set local enable_hashjoin_prefilter = on; +set local enable_hashjoin_probe_filter = on; +select count(*) from pf_probe p join pf_build b using (id); + count +------- + 10000 +(1 row) + +-- (7) Single-batch probe filter: with the inner in one batch and a sparsely +-- occupied bucket array, non-matching probe tuples land in empty buckets and +-- are skipped at probe time. Matching and non-matching keys are interleaved +-- so the adaptive sampling window sees a representative skip rate. 
+set local work_mem = '4MB'; +set local enable_hashjoin_prefilter = off; +set local enable_hashjoin_probe_filter = off; +create table pf_sb_build as + select g as id from generate_series(1, 5000) g; +analyze pf_sb_build; +create table pf_sb_probe as + select case when g % 50 = 0 then g / 50 else 1000000 + g end as id + from generate_series(1, 50000) g; +analyze pf_sb_probe; +-- identical result with the probe filter off and on +select count(*) from pf_sb_probe p join pf_sb_build b using (id); + count +------- + 1000 +(1 row) + +set local enable_hashjoin_probe_filter = on; +select count(*) from pf_sb_probe p join pf_sb_build b using (id); + count +------- + 1000 +(1 row) + +-- the probe filter actually skipped empty buckets +select hash_join_probe_skips( +$$ + select count(*) from pf_sb_probe p join pf_sb_build b using (id) +$$) > 0 as single_batch_probe_fired; + single_batch_probe_fired +-------------------------- + t +(1 row) + rollback; diff --git a/src/test/regress/sql/join_hash.sql b/src/test/regress/sql/join_hash.sql index ab01b277532..56086361c83 100644 --- a/src/test/regress/sql/join_hash.sql +++ b/src/test/regress/sql/join_hash.sql @@ -706,6 +706,27 @@ begin end; $$; +-- Number of empty buckets the single-batch probe filter skipped, or NULL when +-- it did not run. +create or replace function hash_join_probe_skips(query text) +returns bigint language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; + result bigint; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + result := (hash_node->>'Probe Filter Skips')::bigint; + end loop; + return result; +end; +$$; + -- Build side: 20000 distinct keys, large enough to need multiple batches at -- the work_mem above and correctly estimated so nbatch stays put (the regime -- where the pre-filter bitmaps survive). 
@@ -778,4 +799,36 @@ $$ where not exists (select 1 from pf_build b where b.id = p.id) $$) is null as anti_prefilter_disabled; +-- (6) Probe filter must stay inert in a multi-batch join. With the drop +-- filter on, batch_bitmap[1..n] is allocated but index 0 is left NULL; the +-- probe filter is single-batch only and must not consult it. A correct count +-- (and no crash) guards against the nbatch==1 gate being removed. +set local enable_hashjoin_prefilter = on; +set local enable_hashjoin_probe_filter = on; +select count(*) from pf_probe p join pf_build b using (id); + +-- (7) Single-batch probe filter: with the inner in one batch and a sparsely +-- occupied bucket array, non-matching probe tuples land in empty buckets and +-- are skipped at probe time. Matching and non-matching keys are interleaved +-- so the adaptive sampling window sees a representative skip rate. +set local work_mem = '4MB'; +set local enable_hashjoin_prefilter = off; +set local enable_hashjoin_probe_filter = off; +create table pf_sb_build as + select g as id from generate_series(1, 5000) g; +analyze pf_sb_build; +create table pf_sb_probe as + select case when g % 50 = 0 then g / 50 else 1000000 + g end as id + from generate_series(1, 50000) g; +analyze pf_sb_probe; +-- identical result with the probe filter off and on +select count(*) from pf_sb_probe p join pf_sb_build b using (id); +set local enable_hashjoin_probe_filter = on; +select count(*) from pf_sb_probe p join pf_sb_build b using (id); +-- the probe filter actually skipped empty buckets +select hash_join_probe_skips( +$$ + select count(*) from pf_sb_probe p join pf_sb_build b using (id) +$$) > 0 as single_batch_probe_fired; + rollback; -- 2.50.1 (Apple Git-155)