diff --git a/src/backend/access/brin/brin_bloom.c b/src/backend/access/brin/brin_bloom.c index 03e9d4b713..5dc213727b 100644 --- a/src/backend/access/brin/brin_bloom.c +++ b/src/backend/access/brin/brin_bloom.c @@ -203,6 +203,9 @@ typedef struct BloomOptions */ #define BLOOM_DEFAULT_FALSE_POSITIVE_RATE 0.01 /* 1% fp rate */ +/* With the minimum allowed false positive rate of 0.001, we need up to 10 hashes */ +#define BLOOM_MAX_NUM_PARTITIONS 10 + #define BloomGetNDistinctPerRange(opts) \ ((opts) && (((BloomOptions *) (opts))->nDistinctPerRange != 0) ? \ (((BloomOptions *) (opts))->nDistinctPerRange) : \ @@ -270,6 +273,7 @@ typedef struct BloomFilter uint8 nhashes; /* number of hash functions */ uint32 nbits; /* number of bits in the bitmap (size) */ uint32 nbits_set; /* number of bits set to 1 */ + uint16 partlens[BLOOM_MAX_NUM_PARTITIONS]; /* partition lengths */ /* data of the bloom filter (used both for sorted and hashed phase) */ char data[FLEXIBLE_ARRAY_MEMBER]; @@ -279,6 +283,114 @@ typedef struct BloomFilter static BloomFilter *bloom_switch_to_hashing(BloomFilter *filter); +/* + * generate_primes + * returns array of all primes less than limit + * + * WIP: very naive prime sieve; could be optimized using segmented ranges + */ +static uint16 * +generate_primes(int limit) +{ + /* upper bound of number of primes below limit */ + /* WIP: reference for this number */ + int numprimes = 1.26 * limit / log(limit); + + bool *is_composite = (bool *) palloc0(limit * sizeof(bool)); + uint16 *primes = (uint16 *) palloc0(numprimes * sizeof(uint16)); + + int maxfactor = floor(sqrt(limit)); + int factor = 2; /* first prime */ + + /* mark the sieve where the index is composite */ + while (factor < maxfactor) + { + for (int i = factor * factor; i < limit; i += factor) + is_composite[i] = true; + do { factor++; } while (is_composite[factor]); + } + + /* the unmarked numbers are prime, so copy over */ + for (int i = 2, j = 0; i < limit && j < numprimes; i++) + { + if (!is_composite[i]) + primes[j++] = i; + } + + /* there should still be some zeroes at the end, but make sure */ + primes[numprimes - 1] = 0; + + /* pretty large, so free it now (segmented ranges would make it smaller) */ + pfree(is_composite); + return primes; +} + +/* + * set_bloom_partitions + * Calculate k moduli for one-hashing bloom filter. + * + * Find consecutive primes whose sum is close to nbits and + * return the sum. Copy the primes to the filter to use as + * partition lengths. + * WIP: one-hashing bf paper ref somewhere + */ +static uint32 +set_bloom_partitions(int nbits, uint8 nhashes, uint16 *partlens) +{ + int min, diff, incr; + int pidx = 0; + int sum = 0; + int target_partlen = nbits / nhashes; + + /* + * Increase the limit to ensure we have some primes higher than + * the target partition length. The 100 value is arbitrary, but + * should be well over what we need. + */ + uint16 *primes = generate_primes(target_partlen + 100); + + /* + * In our array of primes, find a sequence of length nhashes, whose + * last item is close to our target partition length. The end of the + * array will be filled with zeros, so we need to guard against that. + */ + while (primes[pidx + nhashes - 1] <= target_partlen && + primes[pidx] > 0) + pidx++; + + for (int i = 0; i < nhashes; i++) + sum += primes[pidx + i]; + + /* + * Since all the primes are less than or equal the desired partition + * length, the sum is somewhat less than nbits. Increment the starting + * point until we find the sequence of primes whose sum is closest to + * nbits. It doesn't matter whether it's higher or lower. + */ + min = abs(nbits - sum); + for (;;) + { + incr = primes[pidx + nhashes] - primes[pidx]; + diff = abs(nbits - (sum + incr)); + if (diff >= min) + break; + + min = diff; + sum += incr; + pidx++; + } + + memcpy(partlens, &primes[pidx], nhashes * sizeof(uint16)); + + /* WIP: assuming it's not important to pfree primes */ + + /* + * The actual filter length will be the sum of the partition lengths + * rounded up to the nearest byte. + */ + return (uint32) ((sum + 7) / 8) * 8; +} + /* * bloom_init * Initialize the Bloom Filter, allocate all the memory. @@ -301,15 +413,13 @@ bloom_init(int ndistinct, double false_positive_rate) m = ceil((ndistinct * log(false_positive_rate)) / log(1.0 / (pow(2.0, log(2.0))))); - /* round m to whole bytes */ - m = ((m + 7) / 8) * 8; - /* * round(log(2.0) * m / ndistinct), but assume round() may not be * available on Windows */ k = log(2.0) * m / ndistinct; k = (k - floor(k) >= 0.5) ? ceil(k) : floor(k); + k = Min(k, BLOOM_MAX_NUM_PARTITIONS); /* * Allocate the bloom filter with a minimum size 64B (about 40B in the @@ -323,7 +433,9 @@ bloom_init(int ndistinct, double false_positive_rate) filter->flags = 0; /* implies SORTED phase */ filter->nhashes = (int) k; - filter->nbits = m; + + /* calculate the partition lengths and adjust m to match */ + filter->nbits = set_bloom_partitions(m, k, filter->partlens); SET_VARSIZE(filter, len); @@ -445,7 +557,7 @@ static BloomFilter * bloom_add_value(BloomFilter *filter, uint32 value, bool *updated) { int i; - uint32 big_h, h, d; + int part_boundary = 0; /* assume 'not updated' by default */ Assert(filter); @@ -514,17 +626,16 @@ bloom_add_value(BloomFilter *filter, uint32 value, bool *updated) /* we better be in the hashing phase */ Assert(BLOOM_IS_HASHED(filter)); - /* compute the hashes, used for the bloom filter */ - big_h = ((uint32) DatumGetInt64(hash_uint32(value))); - - h = big_h % filter->nbits; - d = big_h % (filter->nbits - 1); - /* compute the requested number of hashes */ for (i = 0; i < filter->nhashes; i++) { - int byte = (h / 8); - int bit = (h % 8); + int partlen = filter->partlens[i]; + int bitloc = part_boundary + (value % partlen); + + Assert(bitloc < filter->nbits); + + int byte = (bitloc / 8); + int bit = (bitloc % 8); /* if the bit is not set, set it and remember we did that */ if (! (filter->data[byte] & (0x01 << bit))) @@ -536,12 +647,7 @@ bloom_add_value(BloomFilter *filter, uint32 value, bool *updated) } /* next bit */ - h += d++; - if (h >= filter->nbits) - h -= filter->nbits; - - if (d == filter->nbits) - d = 0; + part_boundary += partlen; } return filter; @@ -571,6 +677,7 @@ bloom_switch_to_hashing(BloomFilter *filter) newfilter->nhashes = filter->nhashes; newfilter->nbits = filter->nbits; + memcpy(newfilter->partlens, filter->partlens, filter->nhashes * sizeof(uint16)); newfilter->flags |= BLOOM_FLAG_PHASE_HASH; SET_VARSIZE(newfilter, len); @@ -595,7 +702,7 @@ static bool bloom_contains_value(BloomFilter *filter, uint32 value) { int i; - uint32 big_h, h, d; + int part_boundary = 0; Assert(filter); @@ -624,28 +731,23 @@ bloom_contains_value(BloomFilter *filter, uint32 value) /* now the regular hashing mode */ Assert(BLOOM_IS_HASHED(filter)); - big_h = ((uint32) DatumGetInt64(hash_uint32(value))); - - h = big_h % filter->nbits; - d = big_h % (filter->nbits - 1); - /* compute the requested number of hashes */ for (i = 0; i < filter->nhashes; i++) { - int byte = (h / 8); - int bit = (h % 8); + int partlen = filter->partlens[i]; + int bitloc = part_boundary + (value % partlen); + + Assert(bitloc < filter->nbits); + + int byte = (bitloc / 8); + int bit = (bitloc % 8); /* if the bit is not set, the value is not there */ if (! (filter->data[byte] & (0x01 << bit))) return false; /* next bit */ - h += d++; - if (h >= filter->nbits) - h -= filter->nbits; - - if (d == filter->nbits) - d = 0; + part_boundary += partlen; } /* all hashes found in bloom filter */ @@ -1065,8 +1167,14 @@ brin_bloom_summary_out(PG_FUNCTION_ARGS) if (BLOOM_IS_HASHED(filter)) { - appendStringInfo(&str, "mode: hashed nhashes: %u nbits: %u nbits_set: %u", + appendStringInfo(&str, + "mode: hashed nhashes: %u nbits: %u nbits_set: %u partition lengths: [", filter->nhashes, filter->nbits, filter->nbits_set); + for (int i = 0; i < filter->nhashes - 1; i++) + { + appendStringInfo(&str, "%u, ", filter->partlens[i]); + } + appendStringInfo(&str, "%u]", filter->partlens[filter->nhashes - 1]); } else {