From 16047a0dabca1a0a31cc8d86b274859cd1166438 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Sun, 8 Jan 2023 22:04:41 +0100
Subject: [PATCH] Fix handling of NULLs in BRIN indexes

BRIN indexes did not properly distinguish between summaries for empty
(no rows) and all-NULL ranges. All summaries were initialized with
allnulls=true, and the opclasses simply reset allnulls to false when
processing the first non-NULL value. This however fails if the range
starts with a NULL value (or a sequence of NULL values), in which case
we forget the range contains NULL values.

This happens because the allnulls flag is used for two separate
purposes - to mark empty ranges (not representing any rows yet) and
ranges containing only NULL values.

Opclasses don't know which of these cases it is, and so don't know
whether to set hasnulls=true. Setting hasnulls=true in both cases would
make it correct, but it would also make BRIN indexes useless for queries
with IS NULL clauses - all ranges start empty (and thus allnulls=true),
so all ranges would end up with either allnulls=true or hasnulls=true.

The severity of the issue is somewhat reduced by the fact that it only
happens when adding values to an existing summary with allnulls=true,
not when the summarization is processing values in bulk (e.g. during
CREATE INDEX or automatic summarization). In this case the flags were
updated in a slightly different way, not forgetting the NULL values.

The best solution would be to introduce a new flag marking index tuples
representing ranges with no rows, but that would break on-disk format
and/or ABI, depending on where we put the flag. Considering we need to
backpatch this, that's not acceptable.

So instead we use an "impossible" combination of both flags (allnulls
and hasnulls) set to true, to mark "empty" ranges with no rows. In
principle "empty" is a feature of the whole index tuple, which may
contain multiple summaries in a multi-column index, but this is where
the flags are, unfortunately.

We could also skip storing index tuples for empty summaries, but then
we'd have to always process such ranges - even if there are no rows in
large parts of the table (e.g. after a bulk DELETE), it would still
require reading the pages etc. So we store them, but ignore them when
building the bitmap.

Backpatch to 11. The issue exists since BRIN indexes were introduced in
9.5, but older releases are already EOL.

Backpatch-through: 11
Reviewed-by: Justin Pryzby, Matthias van de Meent
Discussion: https://postgr.es/m/402430e4-7d9d-6cf1-09ef-464d80afff3b@enterprisedb.com
---
 src/backend/access/brin/brin.c                | 46 ++++++++++++++++
 src/backend/access/brin/brin_minmax.c         | 54 +++++++++++++++++--
 src/backend/access/brin/brin_tuple.c          | 16 +++++-
 ...summarization-and-inprogress-insertion.out |  8 +--
 ...ummarization-and-inprogress-insertion.spec |  1 +
 5 files changed, 116 insertions(+), 9 deletions(-)

diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 0becfde113..f1dd39e016 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -253,8 +253,19 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 			Datum		result;
 			BrinValues *bval;
 			FmgrInfo   *addValue;
+			bool		first_row;
+			bool		hasnulls;
 
 			bval = &dtup->bt_columns[keyno];
+
+			first_row = (bval->bv_hasnulls && bval->bv_allnulls);
+
+			/*
+			 * Does the range already have NULL values? Either of the flags can
+			 * be set, but we ignore the state before adding first row.
+			 */
+			hasnulls = (bval->bv_hasnulls || bval->bv_allnulls) && !first_row;
+
 			addValue = index_getprocinfo(idxRel, keyno + 1,
 										 BRIN_PROCNUM_ADDVALUE);
 			result = FunctionCall4Coll(addValue,
@@ -265,6 +276,13 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
 									   nulls[keyno]);
 			/* if that returned true, we need to insert the updated tuple */
 			need_insert |= DatumGetBool(result);
+
+			/*
+			 * If we had NULLS, and the opclass didn't set allnulls=true, clear
+			 * hasnulls so that we remember there are NULL values.
+			 */
+			if (hasnulls && !bval->bv_allnulls)
+				bval->bv_hasnulls = true;
 		}
 
 		if (!need_insert)
@@ -508,6 +526,17 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 									   CurrentMemoryContext);
 					}
 
+					/*
+					 * If the range has both allnulls and hasnulls set, it means
+					 * there are no rows in the range, so we can skip it (we have
+					 * scan keys, and we know there's nothing to match).
+					 */
+					if (bval->bv_allnulls && bval->bv_hasnulls)
+					{
+						addrange = false;
+						break;
+					}
+
 					/*
 					 * Check whether the scan key is consistent with the page
 					 * range values; if so, have the pages in the range added
@@ -645,8 +674,19 @@ brinbuildCallback(Relation index,
 		FmgrInfo   *addValue;
 		BrinValues *col;
 		Form_pg_attribute attr = TupleDescAttr(state->bs_bdesc->bd_tupdesc, i);
+		bool		first_row;
+		bool		hasnulls;
 
 		col = &state->bs_dtuple->bt_columns[i];
+
+		first_row = (col->bv_hasnulls && col->bv_allnulls);
+
+		/*
+		 * Does the range already have NULL values? Either of the flags can
+		 * be set, but we ignore the state before adding first row.
+		 */
+		hasnulls = (col->bv_hasnulls || col->bv_allnulls) && !first_row;
+
 		addValue = index_getprocinfo(index, i + 1,
 									 BRIN_PROCNUM_ADDVALUE);
 
@@ -658,6 +698,12 @@ brinbuildCallback(Relation index,
 						  PointerGetDatum(state->bs_bdesc),
 						  PointerGetDatum(col),
 						  values[i], isnull[i]);
+		/*
+		 * If we had NULLS, and the opclass didn't set allnulls=true, clear
+		 * hasnulls so that we remember there are NULL values.
+		 */
+		if (hasnulls && !col->bv_allnulls)
+			col->bv_hasnulls = true;
 	}
 }
 
diff --git a/src/backend/access/brin/brin_minmax.c b/src/backend/access/brin/brin_minmax.c
index 4b5d6a7213..fe19536c90 100644
--- a/src/backend/access/brin/brin_minmax.c
+++ b/src/backend/access/brin/brin_minmax.c
@@ -75,14 +75,38 @@ brin_minmax_add_value(PG_FUNCTION_ARGS)
 	Form_pg_attribute attr;
 	AttrNumber	attno;
 
+	/*
+	 * Is this the first tuple we're adding to the range range? We track
+	 * that by setting both bv_hasnulls and bval->bv_allnulls to true
+	 * during initialization. But it's not a valid combination (at most
+	 * one of those flags should be set), so we reset the second flag.
+	 *
+	 * XXX The caller is responsible for tracking first_row=true.
+	 */
+	if (column->bv_hasnulls && column->bv_allnulls)
+	{
+		column->bv_hasnulls = false;
+		updated = true;
+	}
+
 	/*
 	 * If the new value is null, we record that we saw it if it's the first
 	 * one; otherwise, there's nothing to do.
 	 */
 	if (isnull)
 	{
-		if (column->bv_hasnulls)
-			PG_RETURN_BOOL(false);
+		/*
+		 * We used to check "bv_hasnulls" which might result in having both
+		 * flags set. That used to be OK, because we just ignore hasnulls
+		 * flag in brin_form_tuple when bv_allnulls=true.
+		 *
+		 * But now we interpret this combination as "first row" so it would
+		 * confuse following calls. So make sure to only set one of these
+		 * flags - when allnulls=true we're done, as it already marks the
+		 * range as containing values.
+		 */
+		if (column->bv_allnulls)
+			PG_RETURN_BOOL(updated);
 
 		column->bv_hasnulls = true;
 		PG_RETURN_BOOL(true);
@@ -250,11 +274,32 @@ brin_minmax_union(PG_FUNCTION_ARGS)
 
 	Assert(col_a->bv_attno == col_b->bv_attno);
 
-	/* Adjust "hasnulls" */
+	/*
+	 * If B is empty (represents no rows), ignore it and just keep
+	 * A as is (might be empty etc.).
+	 */
+	if (col_b->bv_allnulls && col_b->bv_hasnulls)
+		PG_RETURN_VOID();
+
+	/*
+	 * Adjust "hasnulls".
+	 *
+	 * It may happen A has allnulls=true, and we should reset it. We
+	 * need to copy the values from B first, which happens later.
+	 * We know the next condition can't trigger, because B is not
+	 * empty so only one of the flags is true.
+	 */
 	if (!col_a->bv_hasnulls && col_b->bv_hasnulls)
 		col_a->bv_hasnulls = true;
 
-	/* If there are no values in B, there's nothing left to do */
+	/*
+	 * If there are no values in B, there's nothing left to do.
+	 *
+	 * Note this is mutually exclusive with the preceding condition.
+	 * We have skipped "empty" B, so hasnulls and allnulls can't be
+	 * both true. So if we adjusted hasnulls for A, there have to be
+	 * values for B (i.e. we're not terminating here).
+	 */
 	if (col_b->bv_allnulls)
 		PG_RETURN_VOID();
 
@@ -269,6 +314,7 @@ brin_minmax_union(PG_FUNCTION_ARGS)
 	 */
 	if (col_a->bv_allnulls)
 	{
+		/* This also applies if we adjusted hasnulls=true earlier. */
 		col_a->bv_allnulls = false;
 		col_a->bv_values[0] = datumCopy(col_b->bv_values[0],
 										attr->attbyval, attr->attlen);
diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c
index b3b453aed1..fa0bfd8699 100644
--- a/src/backend/access/brin/brin_tuple.c
+++ b/src/backend/access/brin/brin_tuple.c
@@ -493,8 +493,15 @@ brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
 	for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
 	{
 		dtuple->bt_columns[i].bv_attno = i + 1;
+
+		/*
+		 * Each memtuple starts as if it represents no rows, which is indicated
+		 * by having bot allnulls and hasnulls set to true. We track this for
+		 * all columns, because we don't have a flag for the whole memtuple.
+		 */
 		dtuple->bt_columns[i].bv_allnulls = true;
-		dtuple->bt_columns[i].bv_hasnulls = false;
+		dtuple->bt_columns[i].bv_hasnulls = true;
+
 		dtuple->bt_columns[i].bv_values = (Datum *) currdatum;
 		currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored;
 	}
@@ -557,6 +564,13 @@ brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple)
 	{
 		int			i;
 
+		/*
+		 * Make sure to overwrite the hasnulls flag, because it was initialized
+		 * to true by brin_memtuple_initialize and we don't want to skip it if
+		 * allnulls=true.
+		 */
+		dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno];
+
 		if (allnulls[keyno])
 		{
 			valueno += brdesc->bd_info[keyno]->oi_nstored;
diff --git a/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out b/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out
index 2a4755d099..584ac2602f 100644
--- a/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out
+++ b/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out
@@ -4,7 +4,7 @@ starting permutation: s2check s1b s2b s1i s2summ s1c s2c s2check
 step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
 itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value   
 ----------+------+------+--------+--------+-----------+--------
-         1|     0|     1|f       |f       |f          |{1 .. 1}
+         1|     0|     1|f       |t       |f          |{1 .. 1}
 (1 row)
 
 step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
@@ -26,7 +26,7 @@ step s2c: COMMIT;
 step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
 itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value      
 ----------+------+------+--------+--------+-----------+-----------
-         1|     0|     1|f       |f       |f          |{1 .. 1}   
+         1|     0|     1|f       |t       |f          |{1 .. 1}   
          2|     1|     1|f       |f       |f          |{1 .. 1000}
 (2 rows)
 
@@ -35,7 +35,7 @@ starting permutation: s2check s1b s1i s2vacuum s1c s2check
 step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
 itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value   
 ----------+------+------+--------+--------+-----------+--------
-         1|     0|     1|f       |f       |f          |{1 .. 1}
+         1|     0|     1|f       |t       |f          |{1 .. 1}
 (1 row)
 
 step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
@@ -45,7 +45,7 @@ step s1c: COMMIT;
 step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
 itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value      
 ----------+------+------+--------+--------+-----------+-----------
-         1|     0|     1|f       |f       |f          |{1 .. 1}   
+         1|     0|     1|f       |t       |f          |{1 .. 1}   
          2|     1|     1|f       |f       |f          |{1 .. 1000}
 (2 rows)
 
diff --git a/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec b/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec
index 19ac18a2e8..18ba92b7ba 100644
--- a/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec
+++ b/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec
@@ -9,6 +9,7 @@ setup
     ) WITH (fillfactor=10);
     CREATE INDEX brinidx ON brin_iso USING brin (value) WITH (pages_per_range=1);
     -- this fills the first page
+    INSERT INTO brin_iso VALUES (NULL);
     DO $$
     DECLARE curtid tid;
     BEGIN
-- 
2.39.0

