*** ../../../src/backend/access/hash/hash.c.orig	2007-09-23 19:01:09.000000000 -0700
--- ../../../src/backend/access/hash/hash.c	2007-09-24 18:01:27.709487000 -0700
***************
*** 27,35 ****
  /* Working state for hashbuild and its callback */
  typedef struct
  {
! 	double		indtuples;
  } HashBuildState;
  
  static void hashbuildCallback(Relation index,
  				HeapTuple htup,
  				Datum *values,
--- 27,45 ----
  /* Working state for hashbuild and its callback */
  typedef struct
  {
! 	double		indtuples;	/* The current number of index tuples */
! 	Relation	heapRel;	/* The index covers this heap relation */
! 	HSpool	   *spool;		/* Used to sort the index tuples before
! 							 * insertion into the index */
  } HashBuildState;
  
+ static void countTupleCallBack(Relation index,
+ 				HeapTuple htup,
+ 				Datum *values,
+ 				bool *isnull,
+ 				bool tupleIsAlive,
+ 				void *state);
+ 
  static void hashbuildCallback(Relation index,
  				HeapTuple htup,
  				Datum *values,
***************
*** 40,46 ****
--- 50,87 ----
  
  /*
   * hashbuild() -- build a new hash index.
+  *
+  * The algorithm:
+  *	(1) Initialize the build state
+  *	(2) Scan the heap file to determine the number of rows
+  *	(3) Transform the heap file tuples into index tuples (itups),
+  *	    while inserting them into a spool.  If the spool overflows
+  *	    memory, sort it into runs and spill it to disk
+  *	(4) Finish sorting the spool
+  *	(5) Pre-initialize all the buckets of the final index
+  *	(6) Insert itups from the spool into the index
+  *
+  * Sorting the tuples before inserting them into the index is a classical
+  * bulk-load technique, also used in the BTree code.  The sort is done in
+  * hash value order.  Pre-allocating the buckets minimizes the number of
+  * overflow pages.
+  * The reason for step (2) is that in order to sort, in step (3), one must
+  * know the hash value, which depends on the number of buckets, which in
+  * turn depends on the number of itups = the number of rows in the heap
+  * file.  Steps (3), (4) and (6) parallel similar steps in the BTree code.
+  *
+  * Here is an alternative algorithm:
+  *	(1') Same as (1)
+  *	(2') Scan the heap file, counting the number of rows, forming index
+  *	     tuples and inserting them into a spool (the spool is not presorted)
+  *	(3') Sort the spool
+  *	(4') Same as (5)
+  *	(5') Same as (6)
+  * Although the alternative would be somewhat faster, we prefer the
+  * algorithm above because it reuses more of the existing BTree code.
+  */
+ 
  Datum
  hashbuild(PG_FUNCTION_ARGS)
  {
***************
*** 50,55 ****
--- 91,100 ----
  	IndexBuildResult *result;
  	double		reltuples;
  	HashBuildState buildstate;
+ 	uint32		tuples;
+ 	HashMetaPage metap;
+ 	Buffer		metabuf;
+ 	uint32		num_bkt;	/* Estimated number of buckets in the final index */
  
  	/*
  	 * We expect to be called exactly once for any index relation. If that's
***************
*** 59,85 ****
  		elog(ERROR, "index \"%s\" already contains data",
  			 RelationGetRelationName(index));
  
! 	/* initialize the hash index metadata page */
! 	_hash_metapinit(index);
! 
! 	/* build the index */
  	buildstate.indtuples = 0;
  
! 	/* do the heap scan */
! 	reltuples = IndexBuildHeapScan(heap, index, indexInfo,
! 								   hashbuildCallback, (void *) &buildstate);
  
- 	/*
- 	 * Return statistics
- 	 */
- 	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
  
! 	result->heap_tuples = reltuples;
! 	result->index_tuples = buildstate.indtuples;
  
  	PG_RETURN_POINTER(result);
  }
  
  /*
   * Per-tuple callback from IndexBuildHeapScan
   */
--- 104,183 ----
  		elog(ERROR, "index \"%s\" already contains data",
  			 RelationGetRelationName(index));
  
! 	/* initialize the build state */
  	buildstate.indtuples = 0;
+ 	buildstate.heapRel = heap;
+ 	buildstate.spool = h_spoolinit(index);
! 
! 	/*
! 	 * Scan the heap file to determine the number of rows
! 	 */
! 	tuples = 0;
! 	IndexBuildHeapScan(heap, index, indexInfo,
! 					   countTupleCallBack, (void *) &tuples);
! 
! 	/* calculate the number of buckets in the final index */
! 	num_bkt = h_bkt_num(tuples, index);
! 	/* set the bucket mask for the compare function */
! 	h_set_bkt_mask(buildstate.spool, num_bkt);
! 
! 	/*
! 	 * Pre-initialize all the buckets of the final index
! 	 */
! 	_hash_metapinit(index, num_bkt);
! 
! 	/*
! 	 * Transform the heap file tuples into index tuples (itups), while
! 	 * inserting them into a spool.  If the spool overflows memory, sort
! 	 * it into runs and spill it to disk
! 	 */
! 	reltuples = IndexBuildHeapScan(heap, index, indexInfo,
! 								   hashbuildCallback, (void *) &buildstate);
! 
! 	/*
! 	 * Finish sorting the spool
! 	 */
! 	h_do_sort(buildstate.spool);
! 
! 	/*
! 	 * Insert itups from the spool into the index
! 	 */
! 	h_print_spool(buildstate.spool);
! 
! 	/* Read back the meta page just initialized, as a sanity check */
! 	metabuf = _hash_getbuf(index, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
! 	_hash_checkpage(index, metabuf, LH_META_PAGE);
! 	metap = (HashMetaPage) BufferGetPage(metabuf);
! 
! 	/* Gather result, destroy spool, return */
! 	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
! 	_hash_relbuf(index, metabuf);
! 	result->heap_tuples = reltuples;
! 	result->index_tuples = buildstate.indtuples;
! 	h_spooldestroy(buildstate.spool);
  
  	PG_RETURN_POINTER(result);
  }
  
+ /*
+  * Per-tuple callback used to count the tuples in the relation
+  * we are indexing.
+  */
+ static void
+ countTupleCallBack(Relation index, HeapTuple htup, Datum *values,
+ 				   bool *isnull, bool tupleIsAlive, void *ptr)
+ {
+ 	uint32	   *tuples = (uint32 *) ptr;
+ 
+ 	*tuples += 1;
+ }
+ 
  /*
   * Per-tuple callback from IndexBuildHeapScan
   */
***************
*** 104,111 ****
  		pfree(itup);
  		return;
  	}
! 
! 	_hash_doinsert(index, itup);
  
  	buildstate->indtuples += 1;
--- 202,212 ----
  		pfree(itup);
  		return;
  	}
! 	else
! 	{
! 		/* Place each itup into the spool for sorting */
! 		h_spool(itup, buildstate->spool);
! 	}
  
  	buildstate->indtuples += 1;
*** ../../../src/backend/access/hash/hashpage.c.orig	2007-09-23 19:31:22.000000000 -0700
--- ../../../src/backend/access/hash/hashpage.c	2007-09-23 20:46:09.000000000 -0700
***************
*** 313,326 ****
  /*
   * _hash_metapinit() -- Initialize the metadata page of a hash index,
!  *		the two buckets that we begin with and the initial
!  *		bitmap page.
   *
   * We are fairly cavalier about locking here, since we know that no one else
   * could be accessing this index.  In particular the rule about not holding
   * multiple buffer locks is ignored.
   */
  void
! _hash_metapinit(Relation rel)
  {
  	HashMetaPage metap;
  	HashPageOpaque pageopaque;
--- 313,326 ----
  /*
   * _hash_metapinit() -- Initialize the metadata page of a hash index,
!  *		the num_bkt buckets that we begin with and the initial
!  *		bitmap page.
   *
   * We are fairly cavalier about locking here, since we know that no one else
   * could be accessing this index.  In particular the rule about not holding
   * multiple buffer locks is ignored.
   */
  void
! _hash_metapinit(Relation rel, uint32 num_bkt)
  {
  	HashMetaPage metap;
  	HashPageOpaque pageopaque;
***************
*** 330,342 ****
  	int32		data_width;
  	int32		item_width;
  	int32		ffactor;
! 	uint16		i;
  
  	/* safety check */
  	if (RelationGetNumberOfBlocks(rel) != 0)
  		elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
  			 RelationGetRelationName(rel));
  
  	/*
  	 * Determine the target fill factor (in tuples per bucket) for this index.
  	 * The idea is to make the fill factor correspond to pages about as full
--- 330,354 ----
  	int32		data_width;
  	int32		item_width;
  	int32		ffactor;
! 	uint32		i;
! 	uint32		lg2buckets;
! 	uint32		pwr2;
! 	BlockNumber start_blk;
  
  	/* safety check */
  	if (RelationGetNumberOfBlocks(rel) != 0)
  		elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
  			 RelationGetRelationName(rel));
+ 
+ 	/*
+ 	 * The minimum number of buckets is 2; enforce that here.
+ 	 */
+ 	if (num_bkt < 2)
+ 		num_bkt = 2;
+ 
  	/*
  	 * Determine the target fill factor (in tuples per bucket) for this index.
  	 * The idea is to make the fill factor correspond to pages about as full
***************
*** 401,431 ****
  	 * We initialize the index with two buckets, 0 and 1, occupying physical
  	 * blocks 1 and 2.  The first freespace bitmap page is in block 3.
  	 */
! 	metap->hashm_maxbucket = metap->hashm_lowmask = 1;	/* nbuckets - 1 */
! 	metap->hashm_highmask = 3;	/* (nbuckets << 1) - 1 */
  
  	MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
  	MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
  
  	metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
! 	metap->hashm_ovflpoint = 1;
  	metap->hashm_firstfree = 0;
  
! 	/*
! 	 * Initialize the first two buckets
! 	 */
! 	for (i = 0; i <= 1; i++)
! 	{
! 		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
! 		pg = BufferGetPage(buf);
! 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
! 		pageopaque->hasho_prevblkno = InvalidBlockNumber;
! 		pageopaque->hasho_nextblkno = InvalidBlockNumber;
! 		pageopaque->hasho_bucket = i;
! 		pageopaque->hasho_flag = LH_BUCKET_PAGE;
  		pageopaque->hasho_page_id = HASHO_PAGE_ID;
! 		_hash_wrtbuf(rel, buf);
! 	}
  
  	/*
  	 * Initialize first bitmap page
--- 413,488 ----
  	 * We initialize the index with two buckets, 0 and 1, occupying physical
  	 * blocks 1 and 2.  The first freespace bitmap page is in block 3.
  	 */
! 
! 	/*
! 	 * We need this calculation to correctly set the splits and to set the
! 	 * value of the low/high mask.
! 	 */
! 	lg2buckets = _hash_log2_floor(num_bkt);
! 	pwr2 = 1 << lg2buckets;
! 
! 	metap->hashm_maxbucket = num_bkt - 1;
! 
! 	/*
! 	 * We want the highmask to mask the next larger power of 2 greater than
! 	 * the number of buckets we need.  So, if we need 9 buckets, our highmask
! 	 * value would be 15, and our lowmask value would be 7.
! 	 */
! 	metap->hashm_highmask = (pwr2 << 1) - 1;
! 	metap->hashm_lowmask = pwr2 - 1;
  
  	MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
  	MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
  
  	metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
! 
! 	/*
! 	 * No overflows will have occurred during this initialization process,
! 	 * so we just copy the value '1' into each position in the hashm_spares
! 	 * array.  This is needed to correctly determine how each bucket maps to
! 	 * its logical page on disk.
! 	 */
! 	for (i = 2; i <= _hash_log2(num_bkt); i++)
! 		metap->hashm_spares[i] = 1;
! 
! 	/*
! 	 * The overflow point must be ceil(lg2(num_bkt)) so that overflow pages
! 	 * are placed beyond our last bucket.
! 	 */
! 	metap->hashm_ovflpoint = _hash_log2(num_bkt);
  	metap->hashm_firstfree = 0;
! 
! 	/*
! 	 * We need to make sure the file system can handle the big block of
! 	 * pages we are about to allocate; _hash_alloc_buckets() will do that
! 	 * for us.
! 	 */
! 	start_blk = BUCKET_TO_BLKNO(metap, 2);
! 	_hash_alloc_buckets(rel, start_blk, (pwr2 << 1) - 2);
! 
! 	/*
! 	 * Initialize the first 'num_bkt' buckets
! 	 */
! 	for (i = 0; i < num_bkt; i++)
! 	{
! 		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
! 		pg = BufferGetPage(buf);
! 		_hash_pageinit(pg, BufferGetPageSize(buf));
! 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
! 		pageopaque->hasho_prevblkno = InvalidBlockNumber;
! 		pageopaque->hasho_nextblkno = InvalidBlockNumber;
! 		pageopaque->hasho_bucket = i;
! 		pageopaque->hasho_flag = LH_BUCKET_PAGE;
  		pageopaque->hasho_page_id = HASHO_PAGE_ID;
! 		_hash_wrtbuf(rel, buf);
! 	}
  
  	/*
  	 * Initialize first bitmap page
*** ../../../src/backend/access/hash/hashutil.c.orig	2007-09-23 19:47:27.000000000 -0700
--- ../../../src/backend/access/hash/hashutil.c	2007-09-23 19:49:44.000000000 -0700
***************
*** 120,125 ****
--- 120,144 ----
  	return bucket;
  }
  
+ /*
+  * _hash_log2_floor -- returns floor(lg2(num))
+  */
+ uint32
+ _hash_log2_floor(uint32 num)
+ {
+ 	uint32		i,
+ 				limit;
+ 
+ 	limit = 1;
+ 	for (i = 0; limit < num; limit <<= 1, i++)
+ 		;
+ 	if (limit == num)
+ 		return i;
+ 	else
+ 		return i - 1;
+ }
+ 
  /*
   * _hash_log2 -- returns ceil(lg2(num))
   */
*** ../../../src/backend/access/hash/Makefile.orig	2007-09-23 20:30:39.000000000 -0700
--- ../../../src/backend/access/hash/Makefile	2007-09-23 20:31:12.000000000 -0700
***************
*** 13,19 ****
  include $(top_builddir)/src/Makefile.global
  
  OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
!        hashsearch.o hashutil.o
  
  all: SUBSYS.o
  
--- 13,19 ----
  include $(top_builddir)/src/Makefile.global
  
  OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
!        hashsearch.o hashutil.o hashsort.o
  
  all: SUBSYS.o
  
*** ../../../src/backend/access/hash/README.orig	2007-09-24 00:07:32.000000000 -0700
--- ../../../src/backend/access/hash/README	2007-09-24 19:18:10.312700000 -0700
***************
*** 105,110 ****
--- 105,135 ----
  number 3, which is the first bitmap page and is allocated during index
  creation.
  
+ Building an index on a full table
+ ---------------------------------
+ 
+ An index can be created on a table already loaded with tuples.  Before the
+ index is built, an estimate of the number of bucket pages needed to hold
+ all the index tuples is calculated.  This is done by scanning the base
+ relation and counting the tuples to be indexed.  A fill factor (either the
+ default or the user-defined value, as applicable) is then applied to this
+ count to get the estimate, which is the number of bucket pages initialized.
+ If the estimate falls below two, a minimum of two bucket pages is
+ initialized.  The number of bucket pages allocated, however, is always the
+ power of 2 just above the estimate (see the sketch at the end of this
+ section).  For example, if the estimated number of buckets is 3124, then
+ 4096 buckets are allocated (and 3124 bucket pages are initialized); if the
+ estimate is 5456, then 8192 are allocated (and 5456 initialized), and so on.
+ 
+ A spool holds all the index tuples before they are inserted into the
+ index pages.  The contents of the spool are sorted in hash value order so
+ that there is little backtracking to bucket pages we have already visited.
+ 
+ The intention of creating as many pages as the estimate calls for is to
+ avoid bucket page splits, and hence redistribution of tuples.  Since we
+ create all the bucket pages we need up front and insert each tuple into
+ the bucket it belongs to, no splitting is needed during the build.
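+ 
+ To make the mask arithmetic concrete, here is a minimal standalone C
+ sketch (illustration only, not server code; log2_floor mirrors the
+ _hash_log2_floor() function added in hashutil.c):
+ 
+ 	#include <stdio.h>
+ 
+ 	typedef unsigned int uint32;
+ 
+ 	/* floor(lg2(num)), as in _hash_log2_floor() */
+ 	static uint32
+ 	log2_floor(uint32 num)
+ 	{
+ 		uint32	i, limit;
+ 
+ 		for (i = 0, limit = 1; limit < num; limit <<= 1, i++)
+ 			;
+ 		return (limit == num) ? i : i - 1;
+ 	}
+ 
+ 	int
+ 	main(void)
+ 	{
+ 		uint32	num_bkt = 3124;			/* estimated number of buckets */
+ 		uint32	pwr2 = 1 << log2_floor(num_bkt);	/* 2048 */
+ 
+ 		/* prints "maxbucket 3123, highmask 4095, lowmask 2047" */
+ 		printf("maxbucket %u, highmask %u, lowmask %u\n",
+ 			   num_bkt - 1, (pwr2 << 1) - 1, pwr2 - 1);
+ 		return 0;
+ 	}
+ 
+ The highmask covers the 4096 allocated buckets; a hash value that masks
+ to a bucket beyond maxbucket is remapped with the lowmask, as in
+ _hash_hashkey2bucket().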
  
  Lock definitions
  ----------------
  
*** ../../../src/backend/utils/sort/tuplesort.c.orig	2007-09-23 19:51:30.000000000 -0700
--- ../../../src/backend/utils/sort/tuplesort.c	2007-09-24 00:03:25.000000000 -0700
***************
*** 341,346 ****
--- 341,347 ----
  	Relation	indexRel;
  	ScanKey		indexScanKey;
  	bool		enforceUnique;	/* complain if we find duplicate tuples */
+ 	uint32		bkt_mask;		/* Bucket mask for hash sort */
  
  	/*
  	 * These variables are specific to the Datum case; they are set by
***************
*** 439,444 ****
--- 440,448 ----
  static void reversedirection_heap(Tuplesortstate *state);
  static int comparetup_index(const SortTuple *a, const SortTuple *b,
  				 Tuplesortstate *state);
+ static int comparetup_hashindex(const SortTuple *a, const SortTuple *b,
+ 				 Tuplesortstate *state);
+ 
  static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
  static void writetup_index(Tuplesortstate *state, int tapenum,
  			   SortTuple *stup);
***************
*** 2621,2626 ****
--- 2625,2673 ----
  				  &stup->isnull1);
  }
  
+ /*
+  * Set state parameters for sorting hash index tuples
+  */
+ void
+ tuplesort_set_hashindex(Tuplesortstate *state)
+ {
+ 	state->comparetup = comparetup_hashindex;
+ 	state->indexScanKey = NULL; /* scan key is not needed for a hash index */
+ 	state->enforceUnique = false;	/* no reason to enforce uniqueness in a
+ 									 * hash table */
+ }
+ 
+ void
+ tuplesort_set_bktmask(Tuplesortstate *state, uint32 mask)
+ {
+ 	state->bkt_mask = mask;
+ }
+ 
+ /*
+  * Special comparison function for the hash sort algorithm.  We compare the
+  * hash values of the keys, which are then masked to determine the proper
+  * hash table bucket.
+  */
+ static int
+ comparetup_hashindex(const SortTuple *a, const SortTuple *b,
+ 					 Tuplesortstate *state)
+ {
+ 	uint32		bkta,
+ 				bktb;
+ 
+ 	/* Allow interrupting long sorts */
+ 	CHECK_FOR_INTERRUPTS();
+ 
+ 	bkta = _hash_datum2hashkey(state->indexRel, a->datum1) &
+ 		(uint32) state->bkt_mask;
+ 	bktb = _hash_datum2hashkey(state->indexRel, b->datum1) &
+ 		(uint32) state->bkt_mask;
+ 
+ 	if (bkta > bktb)
+ 		return 1;
+ 	else if (bkta < bktb)
+ 		return -1;
+ 	else
+ 		return 0;
+ }
+ 
  static void
  reversedirection_heap(Tuplesortstate *state)
  {
*** ../../../src/include/access/hash.h.orig	2007-09-23 17:50:40.000000000 -0700
--- ../../../src/include/access/hash.h	2007-09-23 17:40:21.000000000 -0700
***************
*** 282,287 ****
--- 282,297 ----
  			   Bucket bucket, BlockNumber bucket_blkno,
  			   BufferAccessStrategy bstrategy);
  
+ /* hashsort.c */
+ typedef struct HSpool HSpool;
+ extern HSpool *h_spoolinit(Relation index);
+ extern void h_spool(IndexTuple itup, HSpool *hspool);
+ extern uint32 h_bkt_num(uint32 tuples, Relation rel);
+ extern void h_set_bkt_mask(HSpool *hspool, uint32 bkts);
+ extern void h_do_sort(HSpool *hspool);
+ extern void h_print_spool(HSpool *hspool);
+ extern void h_spooldestroy(HSpool *hspool);
+ 
  /* hashpage.c */
  extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
  extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
***************
*** 298,304 ****
  extern void _hash_wrtbuf(Relation rel, Buffer buf);
  extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
  				   int to_access);
! extern void _hash_metapinit(Relation rel);
  extern void _hash_pageinit(Page page, Size size);
  extern void _hash_expandtable(Relation rel, Buffer metabuf);
  
--- 308,314 ----
  extern void _hash_wrtbuf(Relation rel, Buffer buf);
  extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
  				   int to_access);
! extern void _hash_metapinit(Relation rel, uint32 num_bkt);
  extern void _hash_pageinit(Page page, Size size);
  extern void _hash_expandtable(Relation rel, Buffer metabuf);
  
***************
*** 320,325 ****
--- 330,336 ----
  extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
  					   uint32 highmask, uint32 lowmask);
  extern uint32 _hash_log2(uint32 num);
+ extern uint32 _hash_log2_floor(uint32 num);
  extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
  
  /* hash.c */
*** ../../../src/include/utils/tuplesort.h.orig	2007-09-23 18:52:07.000000000 -0700
--- ../../../src/include/utils/tuplesort.h	2007-09-23 18:59:13.000000000 -0700
***************
*** 55,60 ****
--- 55,63 ----
  						Oid sortOperator, bool nullsFirstFlag,
  						int workMem, bool randomAccess);
  
+ extern void tuplesort_set_hashindex(Tuplesortstate *state);
+ extern void tuplesort_set_bktmask(Tuplesortstate *state, uint32 mask);
+ 
  extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
  
  extern void tuplesort_puttupleslot(Tuplesortstate *state,
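
For reference, the destination bucket for a given hash value is computed by
masking, as in _hash_hashkey2bucket() in hashutil.c.  The following
self-contained C sketch (illustration only, not patch code; hashkey2bucket
is a local stand-in mirroring that function's logic) reproduces the mapping
for the README's example geometry of 3124 initialized buckets out of 4096
allocated:

	#include <stdio.h>

	typedef unsigned int uint32;

	/*
	 * Mirrors _hash_hashkey2bucket(): mask with highmask first, and fall
	 * back to lowmask when the result lands beyond the last bucket in use.
	 */
	static uint32
	hashkey2bucket(uint32 hashkey, uint32 maxbucket,
				   uint32 highmask, uint32 lowmask)
	{
		uint32	bucket = hashkey & highmask;

		if (bucket > maxbucket)
			bucket = bucket & lowmask;
		return bucket;
	}

	int
	main(void)
	{
		uint32	maxbucket = 3123,	/* 3124 buckets initialized */
				highmask = 4095,	/* 4096 buckets allocated */
				lowmask = 2047;

		/* 3500 > maxbucket, so it falls back to 3500 & 2047 = 1452 */
		printf("%u\n", hashkey2bucket(3500, maxbucket, highmask, lowmask));
		/* 2000 <= maxbucket, so it stays in bucket 2000 */
		printf("%u\n", hashkey2bucket(2000, maxbucket, highmask, lowmask));
		return 0;
	}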