*** ../../../src/backend/access/hash/hash.c.orig	2007-09-23 19:01:09.000000000 -0700
--- ../../../src/backend/access/hash/hash.c	2007-09-24 18:01:27.709487000 -0700
***************
*** 27,35 ****
  /* Working state for hashbuild and its callback */
  typedef struct
  {
! 	double		indtuples;
  } HashBuildState;
  
  static void hashbuildCallback(Relation index,
  				HeapTuple htup,
  				Datum *values,
--- 27,45 ----
  /* Working state for hashbuild and its callback */
  typedef struct
  {
! 	double		indtuples;	/* The current number of index tuples */
! 	Relation	heapRel;	/* The index covers this heap relation */
! 	HSpool	   *spool;		/* Used to sort the index tuples before
! 							 * insertion into the index */
  } HashBuildState;
  
+ static void countTupleCallBack(Relation index,
+ 				HeapTuple htup,
+ 				Datum *values,
+ 				bool *isnull,
+ 				bool tupleIsAlive,
+ 				void *state);
+ 
  static void hashbuildCallback(Relation index,
  				HeapTuple htup,
  				Datum *values,
***************
*** 40,46 ****
--- 50,87 ----
  
  /*
   * hashbuild() -- build a new hash index.
+  *
+  * The algorithm:
+  *	(1) Initialize the build state
+  *	(2) Scan the heap file to determine the number of rows
+  *	(3) Transform the heap file tuples into index tuples (itups),
+  *	    while inserting them into a spool.  If the spool overflows
+  *	    memory, sort it into runs and spill it to disk
+  *	(4) Finish sorting the spool
+  *	(5) Pre-initialize all the buckets of the final index
+  *	(6) Insert itups from the spool into the index
+  *
+  * Sorting the tuples before inserting them into the index is a classical
+  * bulk-load technique, also used in the BTree code.  The sort is done in
+  * hash value order.  Pre-allocating the buckets minimizes the number of
+  * overflow pages.
+  * The reason for step (2) is that in order to sort, in step (3), one must
+  * know the hash value, which depends on the number of buckets, which in
+  * turn depends on the number of itups = the number of rows in the heap
+  * file.  Steps (3), (4) and (6) parallel similar steps in the BTree code.
+  *
+  * Here is an alternative algorithm:
+  *	(1') Same as (1)
+  *	(2') Scan the heap file, counting the number of rows, forming index
+  *	     tuples and inserting them into a spool (the spool is not presorted)
+  *	(3') Sort the spool
+  *	(4') Same as (5)
+  *	(5') Same as (6)
+  * Although the alternative would be somewhat faster, we prefer the
+  * algorithm above because it reuses more of the existing BTree code.
+  */
+ 
  Datum
  hashbuild(PG_FUNCTION_ARGS)
  {
***************
*** 50,55 ****
--- 91,100 ----
  	IndexBuildResult *result;
  	double		reltuples;
  	HashBuildState buildstate;
+ 	uint32		tuples;
+ 	HashMetaPage metap;
+ 	Buffer		metabuf;
+ 	uint32		num_bkt;	/* Estimated number of buckets in the final index */
  
  	/*
  	 * We expect to be called exactly once for any index relation. If that's
***************
*** 59,85 ****
  		elog(ERROR, "index \"%s\" already contains data",
  			 RelationGetRelationName(index));
  
! 	/* initialize the hash index metadata page */
! 	_hash_metapinit(index);
! 
! 	/* build the index */
  	buildstate.indtuples = 0;
  
! 	/* do the heap scan */
! 	reltuples = IndexBuildHeapScan(heap, index, indexInfo,
! 								   hashbuildCallback, (void *) &buildstate);
  
- 	/*
- 	 * Return statistics
- 	 */
- 	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
  
! 	result->heap_tuples = reltuples;
! 	result->index_tuples = buildstate.indtuples;
  
  	PG_RETURN_POINTER(result);
  }
  
  /*
   * Per-tuple callback from IndexBuildHeapScan
   */
--- 104,183 ----
  		elog(ERROR, "index \"%s\" already contains data",
  			 RelationGetRelationName(index));
  
! 	/* initialize the build state */
  	buildstate.indtuples = 0;
+ 	buildstate.heapRel = heap;
+ 	buildstate.spool = h_spoolinit(index);
! 
! 	/*
! 	 * Scan the heap file to determine the number of rows
! 	 */
! 	tuples = 0;
! 	IndexBuildHeapScan(heap, index, indexInfo,
! 					   countTupleCallBack, (void *) &tuples);
! 
! 	/* calculate the number of buckets in the final index */
! 	num_bkt = h_bkt_num(tuples, index);
! 	/* set the bucket mask for the compare function */
! 	h_set_bkt_mask(buildstate.spool, num_bkt);
! 
! 	/*
! 	 * Pre-initialize all the buckets of the final index
! 	 */
! 	_hash_metapinit(index, num_bkt);
! 
! 	/*
! 	 * Transform the heap file tuples into index tuples (itups), while
! 	 * inserting them into a spool.  If the spool overflows memory, sort
! 	 * it into runs and spill it to disk
! 	 */
! 	reltuples = IndexBuildHeapScan(heap, index, indexInfo,
! 								   hashbuildCallback, (void *) &buildstate);
! 
! 	/*
! 	 * Finish sorting the spool
! 	 */
! 	h_do_sort(buildstate.spool);
! 
! 	/*
! 	 * Insert itups from the spool into the index
! 	 */
! 	h_print_spool(buildstate.spool);
! 
! 	/* Read back the meta page just initialized, as a sanity check */
! 	metabuf = _hash_getbuf(index, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
! 	_hash_checkpage(index, metabuf, LH_META_PAGE);
! 	metap = (HashMetaPage) BufferGetPage(metabuf);
! 
! 	/* Gather result, destroy spool, return */
! 	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
! 	_hash_relbuf(index, metabuf);
! 	result->heap_tuples = reltuples;
! 	result->index_tuples = buildstate.indtuples;
! 	h_spooldestroy(buildstate.spool);
  
  	PG_RETURN_POINTER(result);
  }
  
+ /*
+  * Per-tuple callback used to count the tuples in the relation
+  * we are indexing.
+  */
+ static void
+ countTupleCallBack(Relation index, HeapTuple htup, Datum *values,
+ 				   bool *isnull, bool tupleIsAlive, void *ptr)
+ {
+ 	uint32	   *tuples = (uint32 *) ptr;
+ 
+ 	*tuples += 1;
+ }
+ 
  /*
   * Per-tuple callback from IndexBuildHeapScan
   */
***************
*** 104,111 ****
  		pfree(itup);
  		return;
  	}
! 
! 	_hash_doinsert(index, itup);
  
  	buildstate->indtuples += 1;
--- 202,212 ----
  		pfree(itup);
  		return;
  	}
! 	else
! 	{
! 		/* Place each itup into the spool for sorting */
! 		h_spool(itup, buildstate->spool);
! 	}
  
  	buildstate->indtuples += 1;
*** ../../../src/backend/access/hash/hashpage.c.orig	2007-09-23 19:31:22.000000000 -0700
--- ../../../src/backend/access/hash/hashpage.c	2007-09-23 20:46:09.000000000 -0700
***************
*** 313,326 ****
  /*
   * _hash_metapinit() -- Initialize the metadata page of a hash index,
!  *		the two buckets that we begin with and the initial
!  *		bitmap page.
   *
   * We are fairly cavalier about locking here, since we know that no one else
   * could be accessing this index.  In particular the rule about not holding
   * multiple buffer locks is ignored.
   */
  void
! _hash_metapinit(Relation rel)
  {
  	HashMetaPage metap;
  	HashPageOpaque pageopaque;
--- 313,326 ----
  /*
   * _hash_metapinit() -- Initialize the metadata page of a hash index,
!  *		the num_bkt buckets that we begin with and the initial
!  *		bitmap page.
   *
   * We are fairly cavalier about locking here, since we know that no one else
   * could be accessing this index.  In particular the rule about not holding
   * multiple buffer locks is ignored.
   */
  void
! _hash_metapinit(Relation rel, uint32 num_bkt)
  {
  	HashMetaPage metap;
  	HashPageOpaque pageopaque;
***************
*** 330,342 ****
  	int32		data_width;
  	int32		item_width;
  	int32		ffactor;
! 	uint16		i;
  
  	/* safety check */
  	if (RelationGetNumberOfBlocks(rel) != 0)
  		elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
  			 RelationGetRelationName(rel));
  
  	/*
  	 * Determine the target fill factor (in tuples per bucket) for this index.
  	 * The idea is to make the fill factor correspond to pages about as full
--- 330,354 ----
  	int32		data_width;
  	int32		item_width;
  	int32		ffactor;
! 	uint32		i;
! 	uint32		lg2buckets;
! 	uint32		pwr2;
! 	BlockNumber start_blk;
  
  	/* safety check */
  	if (RelationGetNumberOfBlocks(rel) != 0)
  		elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
  			 RelationGetRelationName(rel));
+ 
+ 	/*
+ 	 * The minimum number of buckets is 2; enforce that here.
+ 	 */
+ 	if (num_bkt < 2)
+ 		num_bkt = 2;
+ 
  	/*
  	 * Determine the target fill factor (in tuples per bucket) for this index.
  	 * The idea is to make the fill factor correspond to pages about as full
***************
*** 401,431 ****
  	 * We initialize the index with two buckets, 0 and 1, occupying physical
  	 * blocks 1 and 2.  The first freespace bitmap page is in block 3.
  	 */
! 	metap->hashm_maxbucket = metap->hashm_lowmask = 1;	/* nbuckets - 1 */
! 	metap->hashm_highmask = 3;	/* (nbuckets << 1) - 1 */
  
  	MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
  	MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
  
  	metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
! 	metap->hashm_ovflpoint = 1;
  	metap->hashm_firstfree = 0;
  
! 	/*
! 	 * Initialize the first two buckets
! 	 */
! 	for (i = 0; i <= 1; i++)
! 	{
! 		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
! 		pg = BufferGetPage(buf);
! 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
! 		pageopaque->hasho_prevblkno = InvalidBlockNumber;
! 		pageopaque->hasho_nextblkno = InvalidBlockNumber;
! 		pageopaque->hasho_bucket = i;
! 		pageopaque->hasho_flag = LH_BUCKET_PAGE;
  		pageopaque->hasho_page_id = HASHO_PAGE_ID;
! 		_hash_wrtbuf(rel, buf);
! 	}
  
  	/*
  	 * Initialize first bitmap page
--- 413,488 ----
  	 * We initialize the index with two buckets, 0 and 1, occupying physical
  	 * blocks 1 and 2.  The first freespace bitmap page is in block 3.
  	 */
! 
! 	/*
! 	 * We need this calculation to correctly set the splits and to set the
! 	 * value of the low/high mask.
! 	 */
! 	lg2buckets = _hash_log2_floor(num_bkt);
! 	pwr2 = 1 << lg2buckets;
! 
! 	metap->hashm_maxbucket = num_bkt - 1;
! 
! 	/*
! 	 * We want the highmask to mask the next larger power of 2 greater than
! 	 * the number of buckets we need.  So, if we need 9 buckets, our highmask
! 	 * value would be 15, and our lowmask value would be 7.
! 	 */
! 	metap->hashm_highmask = (pwr2 << 1) - 1;
! 	metap->hashm_lowmask = pwr2 - 1;
  
  	MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
  	MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
  
  	metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
! 
! 	/*
! 	 * No overflows will have occurred during this initialization process,
! 	 * so we just copy the value '1' into each position in the hashm_spares
! 	 * array.  This is needed to correctly determine how each bucket maps to
! 	 * its logical page on disk.
! 	 */
! 	for (i = 2; i <= _hash_log2(num_bkt); i++)
! 		metap->hashm_spares[i] = 1;
! 
! 	/*
! 	 * The overflow point must be ceil(lg2(num_bkt)) so that overflow pages
! 	 * are placed beyond our last bucket.
! 	 */
! 	metap->hashm_ovflpoint = _hash_log2(num_bkt);
  	metap->hashm_firstfree = 0;
! 
! 	/*
! 	 * We need to make sure the file system can handle the big block of
! 	 * pages we are about to allocate; _hash_alloc_buckets() will do that
! 	 * for us.
! 	 */
! 	start_blk = BUCKET_TO_BLKNO(metap, 2);
! 	_hash_alloc_buckets(rel, start_blk, (pwr2 << 1) - 2);
! 
! 	/*
! 	 * Initialize the first 'num_bkt' buckets
! 	 */
! 	for (i = 0; i < num_bkt; i++)
! 	{
! 		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
! 		pg = BufferGetPage(buf);
! 		_hash_pageinit(pg, BufferGetPageSize(buf));
! 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
! 		pageopaque->hasho_prevblkno = InvalidBlockNumber;
! 		pageopaque->hasho_nextblkno = InvalidBlockNumber;
! 		pageopaque->hasho_bucket = i;
! 		pageopaque->hasho_flag = LH_BUCKET_PAGE;
  		pageopaque->hasho_page_id = HASHO_PAGE_ID;
! 		_hash_wrtbuf(rel, buf);
! 	}
  
  	/*
  	 * Initialize first bitmap page
*** ../../../src/backend/access/hash/hashutil.c.orig	2007-09-23 19:47:27.000000000 -0700
--- ../../../src/backend/access/hash/hashutil.c	2007-09-23 19:49:44.000000000 -0700
***************
*** 120,125 ****
--- 120,144 ----
  	return bucket;
  }
  
+ /*
+  * _hash_log2_floor -- returns floor(lg2(num))
+  */
+ uint32
+ _hash_log2_floor(uint32 num)
+ {
+ 	uint32		i,
+ 				limit;
+ 
+ 	limit = 1;
+ 	for (i = 0; limit < num; limit <<= 1, i++)
+ 		;
+ 	if (limit == num)
+ 		return i;
+ 	else
+ 		return i - 1;
+ }
+ 
  /*
   * _hash_log2 -- returns ceil(lg2(num))
   */
*** ../../../src/backend/access/hash/Makefile.orig	2007-09-23 20:30:39.000000000 -0700
--- ../../../src/backend/access/hash/Makefile	2007-09-23 20:31:12.000000000 -0700
***************
*** 13,19 ****
  include $(top_builddir)/src/Makefile.global
  
  OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
!        hashsearch.o hashutil.o
  
  all: SUBSYS.o
  
--- 13,19 ----
  include $(top_builddir)/src/Makefile.global
  
  OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
!        hashsearch.o hashutil.o hashsort.o
  
  all: SUBSYS.o
  
*** ../../../src/backend/access/hash/README.orig	2007-09-24 00:07:32.000000000 -0700
--- ../../../src/backend/access/hash/README	2007-09-24 19:18:10.312700000 -0700
***************
*** 105,110 ****
--- 105,135 ----
  number 3, which is the first bitmap page and is allocated during index
  creation.
  
+ Building an index on a full table
+ ---------------------------------
+ 
+ An index can be created on a table already loaded with tuples.  Before the
+ index is built, an estimate of the number of bucket pages needed to hold
+ all the index tuples is calculated.  This is done by scanning the base
+ relation and counting the tuples to be indexed.  A fill factor (either the
+ default or the user-defined value, as applicable) is then applied to this
+ count to get the estimate, which is the number of bucket pages initialized.
+ If the estimate falls below two, a minimum of two bucket pages is
+ initialized.  The number of bucket pages allocated, however, is always the
+ power of 2 just above the estimate (see the sketch at the end of this
+ section).  For example, if the estimated number of buckets is 3124, then
+ 4096 buckets are allocated (and 3124 bucket pages are initialized); if the
+ estimate is 5456, then 8192 are allocated (and 5456 initialized), and so on.
+ 
+ A spool holds all the index tuples before they are inserted into the
+ index pages.  The contents of the spool are sorted in hash value order so
+ that there is little backtracking to bucket pages we have already visited.
+ 
+ The intention of creating as many pages as the estimate calls for is to
+ avoid bucket page splits, and hence redistribution of tuples.  Since we
+ create all the bucket pages we need up front and insert each tuple into
+ the bucket it belongs to, no splitting is needed during the build.
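+ 
+ To make the mask arithmetic concrete, here is a minimal standalone C
+ sketch (illustration only, not server code; log2_floor mirrors the
+ _hash_log2_floor() function added in hashutil.c):
+ 
+ 	#include <stdio.h>
+ 
+ 	typedef unsigned int uint32;
+ 
+ 	/* floor(lg2(num)), as in _hash_log2_floor() */
+ 	static uint32
+ 	log2_floor(uint32 num)
+ 	{
+ 		uint32	i, limit;
+ 
+ 		for (i = 0, limit = 1; limit < num; limit <<= 1, i++)
+ 			;
+ 		return (limit == num) ? i : i - 1;
+ 	}
+ 
+ 	int
+ 	main(void)
+ 	{
+ 		uint32	num_bkt = 3124;			/* estimated number of buckets */
+ 		uint32	pwr2 = 1 << log2_floor(num_bkt);	/* 2048 */
+ 
+ 		/* prints "maxbucket 3123, highmask 4095, lowmask 2047" */
+ 		printf("maxbucket %u, highmask %u, lowmask %u\n",
+ 			   num_bkt - 1, (pwr2 << 1) - 1, pwr2 - 1);
+ 		return 0;
+ 	}
+ 
+ The highmask covers the 4096 allocated buckets; a hash value that masks
+ to a bucket beyond maxbucket is remapped with the lowmask, as in
+ _hash_hashkey2bucket().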
  
  Lock definitions
  ----------------
  
*** ../../../src/backend/utils/sort/tuplesort.c.orig	2007-09-23 19:51:30.000000000 -0700
--- ../../../src/backend/utils/sort/tuplesort.c	2007-09-24 00:03:25.000000000 -0700
***************
*** 341,346 ****
--- 341,347 ----
  	Relation	indexRel;
  	ScanKey		indexScanKey;
  	bool		enforceUnique;	/* complain if we find duplicate tuples */
+ 	uint32		bkt_mask;		/* Bucket mask for hash sort */
  
  	/*
  	 * These variables are specific to the Datum case; they are set by
***************
*** 439,444 ****
--- 440,448 ----
  static void reversedirection_heap(Tuplesortstate *state);
  static int comparetup_index(const SortTuple *a, const SortTuple *b,
  				 Tuplesortstate *state);
+ static int comparetup_hashindex(const SortTuple *a, const SortTuple *b,
+ 				 Tuplesortstate *state);
+ 
  static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
  static void writetup_index(Tuplesortstate *state, int tapenum,
  			   SortTuple *stup);
***************
*** 2621,2626 ****
--- 2625,2673 ----
  				  &stup->isnull1);
  }
  
+ /*
+  * Set state parameters for sorting hash index tuples
+  */
+ void
+ tuplesort_set_hashindex(Tuplesortstate *state)
+ {
+ 	state->comparetup = comparetup_hashindex;
+ 	state->indexScanKey = NULL; /* scan key is not needed for a hash index */
+ 	state->enforceUnique = false;	/* no reason to enforce uniqueness in a
+ 									 * hash table */
+ }
+ 
+ void
+ tuplesort_set_bktmask(Tuplesortstate *state, uint32 mask)
+ {
+ 	state->bkt_mask = mask;
+ }
+ 
+ /*
+  * Special comparison function for the hash sort algorithm.  We compare the
+  * hash values of the keys, which are then masked to determine the proper
+  * hash table bucket.
+  */
+ static int
+ comparetup_hashindex(const SortTuple *a, const SortTuple *b,
+ 					 Tuplesortstate *state)
+ {
+ 	uint32		bkta,
+ 				bktb;
+ 
+ 	/* Allow interrupting long sorts */
+ 	CHECK_FOR_INTERRUPTS();
+ 
+ 	bkta = _hash_datum2hashkey(state->indexRel, a->datum1) &
+ 		(uint32) state->bkt_mask;
+ 	bktb = _hash_datum2hashkey(state->indexRel, b->datum1) &
+ 		(uint32) state->bkt_mask;
+ 
+ 	if (bkta > bktb)
+ 		return 1;
+ 	else if (bkta < bktb)
+ 		return -1;
+ 	else
+ 		return 0;
+ }
+ 
  static void
  reversedirection_heap(Tuplesortstate *state)
  {
*** ../../../src/include/access/hash.h.orig	2007-09-23 17:50:40.000000000 -0700
--- ../../../src/include/access/hash.h	2007-09-23 17:40:21.000000000 -0700
***************
*** 282,287 ****
--- 282,297 ----
  			   Bucket bucket, BlockNumber bucket_blkno,
  			   BufferAccessStrategy bstrategy);
  
+ /* hashsort.c */
+ typedef struct HSpool HSpool;
+ extern HSpool *h_spoolinit(Relation index);
+ extern void h_spool(IndexTuple itup, HSpool *hspool);
+ extern uint32 h_bkt_num(uint32 tuples, Relation rel);
+ extern void h_set_bkt_mask(HSpool *hspool, uint32 bkts);
+ extern void h_do_sort(HSpool *hspool);
+ extern void h_print_spool(HSpool *hspool);
+ extern void h_spooldestroy(HSpool *hspool);
+ 
  /* hashpage.c */
  extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
  extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
***************
*** 298,304 ****
  extern void _hash_wrtbuf(Relation rel, Buffer buf);
  extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
  				   int to_access);
! extern void _hash_metapinit(Relation rel);
  extern void _hash_pageinit(Page page, Size size);
  extern void _hash_expandtable(Relation rel, Buffer metabuf);
  
--- 308,314 ----
  extern void _hash_wrtbuf(Relation rel, Buffer buf);
  extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
  				   int to_access);
! extern void _hash_metapinit(Relation rel, uint32 num_bkt);
  extern void _hash_pageinit(Page page, Size size);
  extern void _hash_expandtable(Relation rel, Buffer metabuf);
  
***************
*** 320,325 ****
--- 330,336 ----
  extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
  					   uint32 highmask, uint32 lowmask);
  extern uint32 _hash_log2(uint32 num);
+ extern uint32 _hash_log2_floor(uint32 num);
  extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
  
  /* hash.c */
*** ../../../src/include/utils/tuplesort.h.orig	2007-09-23 18:52:07.000000000 -0700
--- ../../../src/include/utils/tuplesort.h	2007-09-23 18:59:13.000000000 -0700
***************
*** 55,60 ****
--- 55,63 ----
  						Oid sortOperator, bool nullsFirstFlag,
  						int workMem, bool randomAccess);
  
+ extern void tuplesort_set_hashindex(Tuplesortstate *state);
+ extern void tuplesort_set_bktmask(Tuplesortstate *state, uint32 mask);
+ 
  extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
  
  extern void tuplesort_puttupleslot(Tuplesortstate *state,
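
For reference, the destination bucket for a given hash value is computed by
masking, as in _hash_hashkey2bucket() in hashutil.c.  The following
self-contained C sketch (illustration only, not patch code; hashkey2bucket
is a local stand-in mirroring that function's logic) reproduces the mapping
for the README's example geometry of 3124 initialized buckets out of 4096
allocated:

	#include <stdio.h>

	typedef unsigned int uint32;

	/*
	 * Mirrors _hash_hashkey2bucket(): mask with highmask first, and fall
	 * back to lowmask when the result lands beyond the last bucket in use.
	 */
	static uint32
	hashkey2bucket(uint32 hashkey, uint32 maxbucket,
				   uint32 highmask, uint32 lowmask)
	{
		uint32	bucket = hashkey & highmask;

		if (bucket > maxbucket)
			bucket = bucket & lowmask;
		return bucket;
	}

	int
	main(void)
	{
		uint32	maxbucket = 3123,	/* 3124 buckets initialized */
				highmask = 4095,	/* 4096 buckets allocated */
				lowmask = 2047;

		/* 3500 > maxbucket, so it falls back to 3500 & 2047 = 1452 */
		printf("%u\n", hashkey2bucket(3500, maxbucket, highmask, lowmask));
		/* 2000 <= maxbucket, so it stays in bucket 2000 */
		printf("%u\n", hashkey2bucket(2000, maxbucket, highmask, lowmask));
		return 0;
	}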