diff git a/contrib/pageinspect/expected/hash.out b/contrib/pageinspect/expected/hash.out
index 3ba01f6..518bdbe 100644
 a/contrib/pageinspect/expected/hash.out
+++ b/contrib/pageinspect/expected/hash.out
@@ 51,13 +51,13 @@ bsize  8152
bmsize  4096
bmshift  15
maxbucket  3
highmask  7
lowmask  3
+highmask  3
+lowmask  1
ovflpoint  2
firstfree  0
nmaps  1
procid  450
spares  {0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+spares  {0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
mapp  {5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask,
diff git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml
index 9f41bb2..682747e 100644
 a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ 667,11 +667,11 @@ bmshift  15
maxbucket  12512
highmask  16383
lowmask  8191
ovflpoint  14
+ovflpoint  49
firstfree  1204
nmaps  1
procid  450
spares  {0,0,0,0,0,0,1,1,1,1,1,4,59,704,1204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+spares  {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,4,4,4,45,55,58,59,508,567,628,704,1193,1202,1204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
mapp  {65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
diff git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 1541438..e0115de 100644
 a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ 58,35 +58,53 @@ rules to support a variable number of overflow pages while not having to
move primary bucket pages around after they are created.
Primary bucket pages (henceforth just "bucket pages") are allocated in
powerof2 groups, called "split points" in the code. Buckets 0 and 1
are created when the index is initialized. At the first split, buckets 2
and 3 are allocated; when bucket 4 is needed, buckets 47 are allocated;
when bucket 8 is needed, buckets 815 are allocated; etc. All the bucket
pages of a powerof2 group appear consecutively in the index. This
addressing scheme allows the physical location of a bucket page to be
computed from the bucket number relatively easily, using only a small
amount of control information. We take the log2() of the bucket number
to determine which split point S the bucket belongs to, and then simply
add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the
metapage) to compute the physical address. hashm_spares[S] can be
interpreted as the total number of overflow pages that have been allocated
before the bucket pages of splitpoint S. hashm_spares[0] is always 0,
so that buckets 0 and 1 (which belong to splitpoint 0) always appear at
block numbers 1 and 2, just after the meta page. We always have
hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
former. The difference between the two represents the number of overflow
pages appearing between the bucket page groups of splitpoints N and N+1.

+power-of-2 groups, called "split points" in the code. That means at every new
+splitpoint we double the existing number of buckets. Allocating huge chunks
+of bucket pages all at once isn't optimal, since it can take ages to consume
+them. To avoid this exponential growth of index size, we use a trick: the
+allocation of buckets at a splitpoint is broken up into 4 equal phases. If
+(2 ^ x) buckets in total need to be allocated at a splitpoint (from now on
+we shall call this a splitpoint group), then we allocate 1/4th (2 ^ (x - 2))
+of the total at each phase of the splitpoint group. The next quarter is
+allocated only once the buckets of the previous phase have been consumed.
+Since a group of fewer than 4 buckets cannot be divided into multiple phases,
+the first 3 groups have only one phase of allocation. Groups 0, 1 and 2
+allocate 1, 1 and 2 buckets respectively, all at once in a single phase. For
+groups > 2, where we allocate >= 4 buckets, the allocation is distributed
+among four equal phases. At group 3 we allocate 4 buckets in 4 different
+phases {1, 1, 1, 1}; the numbers in curly braces indicate the number of buckets
+allocated within each phase of splitpoint group 3. For splitpoint groups 4
+and 5 the phases are {2, 2, 2, 2}, giving 16 buckets in total, and
+{4, 4, 4, 4}, giving 32 buckets in total. We can see that each splitpoint group
+doubles the total number of buckets from the previous group, but does so
+incrementally. The bucket pages allocated within one phase of a splitpoint
+group will appear consecutively in the index. This addressing scheme allows
+the physical location of a bucket page to be computed from the bucket number
+relatively easily, using only a small amount of control information. As the
+function _hash_spareindex shows, for a given bucket number we first
+compute the splitpoint group it belongs to and then the phase within that
+group. Combining the two gives the global splitpoint phase number S to
+which the bucket belongs; we then simply add "hashm_spares[S] + 1"
+(where hashm_spares[] is an array stored in the metapage) to the given bucket
+number to compute its physical address. hashm_spares[S] can be interpreted
+as the total number of overflow pages that have been allocated before the
+bucket pages of splitpoint phase S. hashm_spares[0] is always 0, so that
+buckets 0 and 1 (which belong to the single-phase splitpoint groups 0 and 1
+respectively) always appear at block numbers 1 and 2, just after the meta page.
+We always have hashm_spares[N] <= hashm_spares[N+1], since the latter count
+includes the former. The difference between the two represents the number of
+overflow pages appearing between the bucket page groups of splitpoint phases N
+and N+1.
(Note: the above describes what happens when filling an initially minimally
sized hash index. In practice, we try to estimate the required index size
and allocate a suitable number of splitpoints immediately, to avoid
+sized hash index. In practice, we try to estimate the required index size and
+allocate a suitable number of splitpoint phases immediately, to avoid
expensive resplitting during initial index build.)
When S splitpoints exist altogether, the array entries hashm_spares[0]
through hashm_spares[S] are valid; hashm_spares[S] records the current
total number of overflow pages. New overflow pages are created as needed
at the end of the index, and recorded by incrementing hashm_spares[S].
When it is time to create a new splitpoint's worth of bucket pages, we
+When it is time to create a new splitpoint phase's worth of bucket pages, we
copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is
stored in the hashm_ovflpoint field of the meta page). This has the
effect of reserving the correct number of bucket pages at the end of the
@@ 101,7 +119,7 @@ We have to allow the case "greater than" because it's possible that during
an index extension we crash after allocating filesystem space and before
updating the metapage. Note that on filesystems that allow "holes" in
files, it's entirely likely that pages before the logical EOF are not yet
allocated: when we allocate a new splitpoint's worth of bucket pages, we
+allocated: when we allocate a new splitpoint phase's worth of bucket pages, we
physically zero the last such page to force the EOF up, and the first such
page will be used immediately, but the intervening pages are not written
until needed.
diff git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index a3cae21..fe0b4ef 100644
 a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ 49,7 +49,7 @@ bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
* Convert to absolute page number by adding the number of bucket pages
* that exist before this split point.
*/
 return (BlockNumber) ((1 << i) + ovflbitnum);
+ return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
}
/*
@@ 67,14 +67,15 @@ _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
/* Determine the split number containing this page */
for (i = 1; i <= splitnum; i++)
{
 if (ovflblkno <= (BlockNumber) (1 << i))
+ if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
break; /* oops */
 bitnum = ovflblkno  (1 << i);
+ bitnum = ovflblkno  _hash_get_totalbuckets(i);
/*
* bitnum has to be greater than number of overflow page added in
* previous split point. The overflow page at this splitnum (i) if any
 * should start from ((2 ^ i) + metap>hashm_spares[i  1] + 1).
+ * should start from
+ * (_hash_get_totalbuckets(i) + metap->hashm_spares[i - 1] + 1).
*/
if (bitnum > metap>hashm_spares[i  1] &&
bitnum <= metap>hashm_spares[i])
diff git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 61ca2ec..d7374fa 100644
 a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ 502,14 +502,15 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
Page page;
double dnumbuckets;
uint32 num_buckets;
 uint32 log2_num_buckets;
+ uint32 spare_index;
uint32 i;
/*
* Choose the number of initial bucket pages to match the fill factor
* given the estimated number of tuples. We round up the result to the
 * next power of 2, however, and always force at least 2 bucket pages. The
 * upper limit is determined by considerations explained in
+ * total number of buckets allocated through the end of the splitpoint
+ * phase (hashm_spares slot) it falls in, and always force at least 2 bucket
+ * pages. The upper limit is determined by considerations explained in
* _hash_expandtable().
*/
dnumbuckets = num_tuples / ffactor;
@@ 518,11 +519,10 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
else if (dnumbuckets >= (double) 0x40000000)
num_buckets = 0x40000000;
else
 num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
+ num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));
 log2_num_buckets = _hash_log2(num_buckets);
 Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
 Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
+ spare_index = _hash_spareindex(num_buckets);
+ Assert(spare_index < HASH_MAX_SPLITPOINTS);
page = BufferGetPage(buf);
if (initpage)
@@ 563,18 +563,20 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
/*
* We initialize the index with N buckets, 0 .. N1, occupying physical
 * blocks 1 to N. The first freespace bitmap page is in block N+1. Since
 * N is a power of 2, we can set the masks this way:
+ * blocks 1 to N. The first freespace bitmap page is in block N+1.
*/
 metap>hashm_maxbucket = metap>hashm_lowmask = num_buckets  1;
 metap>hashm_highmask = (num_buckets << 1)  1;
+ metap>hashm_maxbucket = num_buckets  1;
+
+ /* set highmask, which should be sufficient to cover num_buckets. */
+ metap>hashm_highmask = (1 << (_hash_log2(num_buckets)))  1;
+ metap>hashm_lowmask = (metap>hashm_highmask >> 1);
MemSet(metap>hashm_spares, 0, sizeof(metap>hashm_spares));
MemSet(metap>hashm_mapp, 0, sizeof(metap>hashm_mapp));
/* Set up mapping for one spare page after the initial splitpoints */
 metap>hashm_spares[log2_num_buckets] = 1;
 metap>hashm_ovflpoint = log2_num_buckets;
+ metap>hashm_spares[spare_index] = 1;
+ metap>hashm_ovflpoint = spare_index;
metap>hashm_firstfree = 0;
/*
@@ 773,25 +775,44 @@ restart_expand:
start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
/*
 * If the split point is increasing (hashm_maxbucket's log base 2
 * increases), we need to allocate a new batch of bucket pages.
+ * If the split point is increasing we need to allocate a new batch of
+ * bucket pages.
*/
 spare_ndx = _hash_log2(new_bucket + 1);
+ spare_ndx = _hash_spareindex(new_bucket + 1);
if (spare_ndx > metap>hashm_ovflpoint)
{
+ uint32 buckets_toadd = 0;
+ uint32 splitpoint_group = 0;
+
Assert(spare_ndx == metap>hashm_ovflpoint + 1);
/*
 * The number of buckets in the new splitpoint is equal to the total
 * number already in existence, i.e. new_bucket. Currently this maps
 * onetoone to blocks required, but someday we may need a more
 * complicated calculation here. We treat allocation of buckets as a
 * separate WALlogged action. Even if we fail after this operation,
 * won't leak bucket pages; rather, the next split will consume this
 * space. In any case, even without failure we don't use all the space
 * in one split operation.
+ * The number of buckets in the new splitpoint group is equal to the
+ * total number already in existence, but we do not allocate them all
+ * at once. Each splitpoint group has 4 slots (phases), and we distribute
+ * the buckets equally among them, so we allocate only one fourth of the
+ * new splitpoint group's buckets at a time, consuming one phase after
+ * another. We treat allocation of buckets as a separate WAL-logged
+ * action. Even if we fail after this operation, we won't leak bucket
+ * pages; rather, the next split will consume this space. In any case,
+ * even without failure we don't use all the space in one split
+ * operation.
+ */
+
+ splitpoint_group = SPLITPOINT_PHASE_TO_SPLITPOINT_GRP(spare_ndx);
+
+ /*
+ * Each phase in the splitpoint_group allocates one fourth of the total
+ * buckets to be allocated in that splitpoint_group. Groups with
+ * splitpoint_group < SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE have only
+ * one phase of allocation, so we allocate all of the buckets belonging
+ * to that group at once.
*/
 if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
+ buckets_toadd =
+ (splitpoint_group < SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE) ?
+ (new_bucket) :
+ ((1 << (splitpoint_group  1)) / SPLITPOINT_PHASES_PER_GRP);
+ if (!_hash_alloc_buckets(rel, start_nblkno, buckets_toadd))
{
/* can't split due to BlockNumber overflow */
_hash_relbuf(rel, buf_oblkno);
@@ 836,10 +857,9 @@ restart_expand:
}
/*
 * If the split point is increasing (hashm_maxbucket's log base 2
 * increases), we need to adjust the hashm_spares[] array and
 * hashm_ovflpoint so that future overflow pages will be created beyond
 * this new batch of bucket pages.
+ * If the split point is increasing we need to adjust the hashm_spares[]
+ * array and hashm_ovflpoint so that future overflow pages will be created
+ * beyond this new batch of bucket pages.
*/
if (spare_ndx > metap>hashm_ovflpoint)
{
diff git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index 2e99719..c2f2c71 100644
 a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ 150,6 +150,49 @@ _hash_log2(uint32 num)
}
/*
+ * _hash_spareindex -- returns the hashm_spares index (global splitpoint
+ * phase) for num_bucket, a 1-based bucket count (bucket number + 1)
+ */
+uint32
+_hash_spareindex(uint32 num_bucket)
+{
+	uint32		splitpoint_group = _hash_log2(num_bucket);
+
+	/* Groups below the threshold have one phase: index == group number. */
+	if (splitpoint_group < SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE)
+		return splitpoint_group;
+
+	/* Phases in all earlier groups, plus this bucket's phase in its group. */
+	return TOTAL_SPLITPOINT_PHASES_BEFORE_GROUP(splitpoint_group) +
+		SPLITPOINT_PHASES_WITHIN_GROUP(splitpoint_group,
+									   num_bucket - 1); /* to 0-based */
+}
+
+/*
+ * _hash_get_totalbuckets -- returns total number of buckets allocated till
+ * the given splitpoint phase.
+ */
+uint32
+_hash_get_totalbuckets(uint32 splitpoint_phase)
+{
+	uint32		splitpoint_group =
+		SPLITPOINT_PHASE_TO_SPLITPOINT_GRP(splitpoint_phase);
+
+	/* Single-phase groups: phase index == group, total is 2 ^ group. */
+	if (splitpoint_group < SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE)
+		return (1 << splitpoint_group);
+
+	/*
+	 * Buckets before this group, plus the buckets of every phase of this
+	 * group up to and including splitpoint_phase (hence the "+ 1" below).
+	 */
+	return BUCKETS_BEFORE_SP_GRP(splitpoint_group) +
+		BUCKETS_WITHIN_SP_GRP(splitpoint_group,
+			((splitpoint_phase - SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE) %
+			 SPLITPOINT_PHASES_PER_GRP) + 1);
+}
+
+/*
* _hash_checkpage  sanity checks on the format of all hash pages
*
* If flags is not zero, it is a bitwise OR of the acceptable values of
diff git a/src/include/access/hash.h b/src/include/access/hash.h
index eb1df57..64e98c2 100644
 a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ 36,7 +36,7 @@ typedef uint32 Bucket;
#define InvalidBucket ((Bucket) 0xFFFFFFFF)
#define BUCKET_TO_BLKNO(metap,B) \
 ((BlockNumber) ((B) + ((B) ? (metap)>hashm_spares[_hash_log2((B)+1)1] : 0)) + 1)
+ ((BlockNumber) ((B) + ((B) ? (metap)>hashm_spares[_hash_spareindex((B)+1)1] : 0)) + 1)
/*
* Special space for hash index pages.
@@ 180,9 +180,46 @@ typedef HashScanOpaqueData *HashScanOpaque;
* needing to fit into the metapage. (With 8K block size, 128 bitmaps
* limit us to 64 GB of overflow space...)
*/
#define HASH_MAX_SPLITPOINTS 32
+#define HASH_MAX_SPLITPOINTS 128
#define HASH_MAX_BITMAPS 128
+#define SPLITPOINT_PHASES_PER_GRP	4
+#define SPLITPOINT_PHASE_MASK		(SPLITPOINT_PHASES_PER_GRP - 1)
+#define SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE	3
+
+#define TOTAL_SPLITPOINT_PHASES_BEFORE_GROUP(sp_g) \
+	(((((sp_g) - SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE) << 2) + \
+	  SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE))
+
+/*
+ * This is just a trick to save a division operation. If you look at the
+ * bit pattern of the 0-based bucket_num, its 2nd and 3rd most significant bits
+ * indicate which phase of allocation the bucket_num belongs to within the
+ * group. This is because at every splitpoint group we allocate (2 ^ x) buckets
+ * and we have divided the allocation process into 4 equal phases. This macro
+ * returns a value from 0 to 3.
+ */
+#define SPLITPOINT_PHASES_WITHIN_GROUP(sp_g, bucket_num) \
+	(((bucket_num) >> ((sp_g) - SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE)) & \
+	 SPLITPOINT_PHASE_MASK)
+
+/*
+ * At every splitpoint group we double the total number of buckets. So at
+ * splitpoint group sp_g we allocate (1 << (sp_g - 1)) buckets, as we will have
+ * the same number of buckets already allocated before this group. For
+ * splitpoint groups >= SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE we allocate
+ * buckets in 4 equal phases, i.e. ((1 << (sp_g - 1)) >> 2) buckets per phase.
+ */
+#define BUCKETS_BEFORE_SP_GRP(sp_g)		(1 << ((sp_g) - 1))
+#define BUCKETS_WITHIN_SP_GRP(sp_g, nphase) \
+	((nphase) * ((1 << ((sp_g) - 1)) >> 2))
+
+#define SPLITPOINT_PHASE_TO_SPLITPOINT_GRP(sp_phase) \
+	(((sp_phase) < SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE) ? \
+	 (sp_phase) : \
+	 ((((sp_phase) - SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE) >> 2) + \
+	  SPLITPOINT_GROUPS_WITH_ONLY_ONE_PHASE))
+
typedef struct HashMetaPageData
{
uint32 hashm_magic; /* magic no. for hash tables */
@@ 382,6 +419,8 @@ extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
uint32 highmask, uint32 lowmask);
extern uint32 _hash_log2(uint32 num);
+extern uint32 _hash_spareindex(uint32 num_bucket);
+extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
extern bool _hash_convert_tuple(Relation index,