From 3a4be73ebded2c7cb683f2f0803dcf3badf0686a Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Sun, 31 May 2026 07:48:23 -0400 Subject: [PATCH addon 3/3] Build the hashjoin bloom filter eagerly for a CustomScan recipient When the outer relation's startup cost is below the hash-table build cost, ExecHashJoinImpl fetches the first outer tuple before building the hash table, to take the empty-outer shortcut. For a CustomScan that consumes a pushed-down bloom filter in its own scan loop, that is too late: its first tuple request -- which for a column store may decompress a whole row group -- happens before the filter exists, so the first batch is scanned unfiltered. Add a HashJoin.bloom_eager flag, set at plan time when the filter is pushed to a CustomScan recipient (which advertised CUSTOMPATH_SUPPORT_BLOOM_FILTERS), telling the executor to skip the empty-outer prefetch and build the hash table -- and the filter -- before the outer scan starts. This is driven by the same opt-in path as the recipient itself rather than a GUC, and only such a recipient pays the cost (a possibly-needless hash build when the outer turns out empty); stock-scan recipients, which probe per-row after producing a tuple anyway, are unaffected. --- src/backend/executor/nodeHashjoin.c | 11 +++++++++ src/backend/optimizer/plan/createplan.c | 30 ++++++++++++++++++------- src/include/nodes/plannodes.h | 10 +++++++++ 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 1eaf81285f8..9154310c09a 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -317,6 +317,17 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) */ node->hj_FirstOuterTupleSlot = NULL; } + else if (((HashJoin *) node->js.ps.plan)->bloom_eager) + { + /* + * We pushed a bloom filter to a CustomScan on the outer + * side that wants it at scan start (e.g. to skip row groups + * before decompression). 
Skip the empty-outer prefetch and + * build the hash table -- and the filter -- first, so it is + * ready before the outer scan produces its first tuple. + */ + node->hj_FirstOuterTupleSlot = NULL; + } else if (HJ_FILL_OUTER(node) || (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost && !node->hj_OuterNotEmpty)) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 5b01b3e45cc..a70f1104800 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -4993,16 +4993,30 @@ try_push_bloom_filter(PlannerInfo *root, HashJoin *hj, Plan *outer_plan) recipient->bloom_filters = lappend(recipient->bloom_filters, bf); /* - * If the recipient is a CustomScan that opted in, also build a separate - * filter per join key. Only such a recipient can make use of them (to - * test a single column against a dictionary or zone map); the combined - * filter is always built and is the more selective one for the per-row - * probe. There is nothing to gain for a single-key join, where the two - * coincide. + * A CustomScan recipient that opted in consumes the filter in its own + * scan loop, possibly at the storage level, so it wants two things a + * stock scan does not. */ - if (list_length(hashkeys) > 1 && IsA(recipient, CustomScan) && + if (IsA(recipient, CustomScan) && (((CustomScan *) recipient)->flags & CUSTOMPATH_SUPPORT_BLOOM_FILTERS)) - hj->bloom_perkey = true; + { + /* + * Build the hash table (and filter) before the outer scan starts, so + * the filter is available on the first tuple request rather than after + * a batch has already been scanned unfiltered. + */ + hj->bloom_eager = true; + + /* + * Also build a separate filter per join key, so the recipient can test + * a single column on its own (e.g. against a per-column dictionary or + * zone map). 
The combined filter is always built and is the more + * selective one for a per-row probe; there is nothing to gain for a + * single-key join, where the two coincide. + */ + if (list_length(hashkeys) > 1) + hj->bloom_perkey = true; + } /* * XXX We've manged to push the filter to the scan node, but maybe diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 21ec7ffae1a..0e011f3d4e2 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -1135,6 +1135,16 @@ typedef struct HashJoin * opt-in extra that nobody else pays for. */ bool bloom_perkey; + + /* + * Whether to build the hash table (and bloom filter) before fetching the + * first outer tuple, skipping the empty-outer prefetch optimization. Set + * at plan time when the filter is pushed to a CustomScan recipient, which + * may want to apply the filter the moment its scan starts (e.g. a column + * store skipping row groups before decompression) rather than after having + * already produced a batch unfiltered. See ExecHashJoinImpl. + */ + bool bloom_eager; } HashJoin; /* ---------------- -- 2.43.0