Skip site navigation (1) Skip section navigation (2)

Re: Hash Index Build Patch v2

From: Bruce Momjian <bruce(at)momjian(dot)us>
To: Tom Raney <twraney(at)comcast(dot)net>
Cc: pgsql-patches(at)postgresql(dot)org
Subject: Re: Hash Index Build Patch v2
Date: 2007-10-22 14:50:48
Message-ID: 200710221450.l9MEomV03017@momjian.us (view raw or flat)
Thread:
Lists: pgsql-patches
This has been saved for the 8.4 release:

	http://momjian.postgresql.org/cgi-bin/pgpatches_hold

---------------------------------------------------------------------------

Tom Raney wrote:
> This revised version of our patch uses the function estimate_rel_size() 
> from plancat.c to estimate the number of tuples in the parent relation.  
> This method is an alternative to scanning the parent relation to 
> estimate the number of tuples, as we did in the first version of the patch.
> 
> -Tom

> #include <stdio.h>
> #include <stdlib.h>
> 
> 
> /*
>  * Emit <tuples> pseudo-random integers, one per line, from a seedable
>  * generator.  Used to produce test data for the hash index build patch.
>  */
> int main(int argc, char **argv) {
>       char *p;
>       long i, tups, seed;
> 
>       if (argc < 3) {
>               printf("Too few args.  <tuples> <seed>\n");
>               return 1;       /* usage error: signal failure to the caller */
>       }
> 
>       tups = strtol(argv[1], &p, 0);
>       seed = strtol(argv[2], &p, 0);
>       srand((unsigned int) seed);
> 
>       for (i = 0; i < tups; i++) {
>               printf("%d\n", rand());
>       }
> 
>       return 0;
> 
> } 
> *** ./backend/access/hash/hash.c.orig	2007-09-23 19:01:09.000000000 -0700
> --- ./backend/access/hash/hash.c	2007-10-21 12:07:48.455594000 -0700
> ***************
> *** 22,33 ****
>   #include "access/hash.h"
>   #include "catalog/index.h"
>   #include "commands/vacuum.h"
>   
>   
>   /* Working state for hashbuild and its callback */
>   typedef struct
>   {
> ! 	double		indtuples;
>   } HashBuildState;
>   
>   static void hashbuildCallback(Relation index,
> --- 22,36 ----
>   #include "access/hash.h"
>   #include "catalog/index.h"
>   #include "commands/vacuum.h"
> + #include "optimizer/plancat.h"
>   
>   
>   /* Working state for hashbuild and its callback */
>   typedef struct
>   {
> ! 	double		indtuples; /* The current number of index tuples */
> ! 	Relation	heapRel;   /* The index covers this heap relation */
> ! 	HSpool		*spool;	   /* Used to sort the index tuples before insertion into the index */
>   } HashBuildState;
>   
>   static void hashbuildCallback(Relation index,
> ***************
> *** 40,46 ****
> --- 43,80 ----
>   
>   /*
>    *	hashbuild() -- build a new hash index.
> +  *
> +  *
> +  *   The algorithm:
> +  *    (1) Initialize the build state
> +  *    (2) Retrieve estimate of tuple count from estimate_rel_size(); 
> +  *    (3) Transform the heap file tuples into index tuples (itups),
> +  *        while inserting them into a spool.  If the spool overflows
> +  *        memory, sort it into runs and spill it to disk
> +  *    (4) Finish sorting the spool
> +  *    (5) Pre-initialize all the buckets of the final index
> +  *    (6) Insert itups from the spool into the index
> +  *
> +  *   Sorting the tuples before inserting them into the index is a classical
> +  * bulk-load technique, also used in the BTree code.  The sort is done in
> +  * hash value order.
> +  *   Pre-allocating the buckets minimizes the number of overflow pages.
> +  *   The reason for step (2) is that in order to sort, in step (3), one must
> +  * know the hash value, which depends on the number of buckets, which in turn
> +  * depends on the number of itups = the number of rows in the heap file.
> +  *   Steps (3),(4) and (6) parallel similar steps in the BTree code.
> +  *
> +  *   Here is an alternative algorithm:
> +  *    (1') Same as (1)
> +  *    (2') Scan the heap file, counting the number of rows, forming index
> +  *         tuples and inserting them into a spool (the spool is not presorted).
> +  *    (3') Sort the spool
> +  *    (4') same as (5)
> +  *    (5') same as (6)
> +  *    Although this algorithm would be somewhat faster, we prefer the existing
> +  * algorithm because it reuses existing BTree code.
>    */
> + 
>   Datum
>   hashbuild(PG_FUNCTION_ARGS)
>   {
> ***************
> *** 50,55 ****
> --- 84,94 ----
>   	IndexBuildResult *result;
>   	double		reltuples;
>   	HashBuildState buildstate;
> + 	double		tuples;
> + 	BlockNumber	pages;
> + 	HashMetaPage 	metap;
> + 	Buffer		metabuf;
> + 	uint32		num_bkt; /* Estimates number of buckets in the final index */
>   
>   	/*
>   	 * We expect to be called exactly once for any index relation. If that's
> ***************
> *** 59,81 ****
>   		elog(ERROR, "index \"%s\" already contains data",
>   			 RelationGetRelationName(index));
>   
> ! 	/* initialize the hash index metadata page */
> ! 	_hash_metapinit(index);
> ! 
> ! 	/* build the index */
>   	buildstate.indtuples = 0;
>   
> ! 	/* do the heap scan */
> ! 	reltuples = IndexBuildHeapScan(heap, index, indexInfo,
> ! 								   hashbuildCallback, (void *) &buildstate);
>   
> - 	/*
> - 	 * Return statistics
> - 	 */
> - 	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
>   
> ! 	result->heap_tuples = reltuples;
> ! 	result->index_tuples = buildstate.indtuples;
>   
>   	PG_RETURN_POINTER(result);
>   }
> --- 98,158 ----
>   		elog(ERROR, "index \"%s\" already contains data",
>   			 RelationGetRelationName(index));
>   
> ! 	/* initialize the build state */
>   	buildstate.indtuples = 0;
> + 	buildstate.heapRel = heap;
> + 	buildstate.spool = h_spoolinit(index);
>   
> !        /*
> !  	* Retrieve an estimate of the number of rows
> !  	*
> !         */
> !         tuples=0;
> ! 	estimate_rel_size(heap, NULL, &pages, &tuples);
> ! 
> !         num_bkt = h_bkt_num((uint32)tuples, index); /* calculate the number of buckets in the final index */
> !         h_set_bkt_mask(buildstate.spool, num_bkt); /* set the bucket mask for the compare function */
> ! 
> !         /*
> ! 	 * Pre-initialize all the buckets of the final index
> !          */
> !         _hash_metapinit(index, num_bkt);
> ! 
> !         /*
> !          * Transform the heap file tuples into index tuples (itups),
> !          * while inserting them into a spool.  If the spool overflows
> !          * memory, sort it into runs and spill it to disk
> !          *
> !          */
> !         reltuples = IndexBuildHeapScan(heap, index, indexInfo,
> !                                                                    hashbuildCallback, (void *) &buildstate);
> ! 
> ! 
> !         /*
> !          * Finish sorting the spool
> !          */
> !         h_do_sort(buildstate.spool);
> ! 
> !         /*
> !          * Insert itups from the spool into the index
> !          */
> !         h_print_spool(buildstate.spool);
> ! 
> ! 
> !         /* Read the meta page just initialized */
> !         metabuf = _hash_getbuf(index, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
> !         _hash_checkpage(index, metabuf, LH_META_PAGE);
> !         metap   = (HashMetaPage) BufferGetPage(metabuf);
> ! 
> !         /* Gather result, destroy spool, return */
> ! 
> !         result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
> !         _hash_relbuf(index,metabuf);
> !         result->heap_tuples = reltuples;
> !         result->index_tuples = buildstate.indtuples;
>   
>   
> !         h_spooldestroy(buildstate.spool);
>   
>   	PG_RETURN_POINTER(result);
>   }
> ***************
> *** 104,111 ****
>   		pfree(itup);
>   		return;
>   	}
> ! 
> ! 	_hash_doinsert(index, itup);
>   
>   	buildstate->indtuples += 1;
>   
> --- 181,191 ----
>   		pfree(itup);
>   		return;
>   	}
> !         else
> !         {
> !                 /* Place each itup into the spool for sorting */
> !                 h_spool(itup, buildstate->spool);
> !         }
>   
>   	buildstate->indtuples += 1;
>   
> *** ./backend/access/hash/hashpage.c.orig	2007-09-23 19:31:22.000000000 -0700
> --- ./backend/access/hash/hashpage.c	2007-09-23 20:46:09.000000000 -0700
> ***************
> *** 313,326 ****
>   /*
>    *	_hash_metapinit() -- Initialize the metadata page of a hash index,
>    *				the two buckets that we begin with and the initial
> !  *				bitmap page.
>    *
>    * We are fairly cavalier about locking here, since we know that no one else
>    * could be accessing this index.  In particular the rule about not holding
>    * multiple buffer locks is ignored.
>    */
>   void
> ! _hash_metapinit(Relation rel)
>   {
>   	HashMetaPage metap;
>   	HashPageOpaque pageopaque;
> --- 313,326 ----
>   /*
>    *	_hash_metapinit() -- Initialize the metadata page of a hash index,
>    *				the two buckets that we begin with and the initial
> !  *				bitmap page, plus num_bkt buckets.
>    *
>    * We are fairly cavalier about locking here, since we know that no one else
>    * could be accessing this index.  In particular the rule about not holding
>    * multiple buffer locks is ignored.
>    */
>   void
> ! _hash_metapinit(Relation rel, uint32 num_bkt)
>   {
>   	HashMetaPage metap;
>   	HashPageOpaque pageopaque;
> ***************
> *** 330,342 ****
>   	int32		data_width;
>   	int32		item_width;
>   	int32		ffactor;
> ! 	uint16		i;
>   
>   	/* safety check */
>   	if (RelationGetNumberOfBlocks(rel) != 0)
>   		elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
>   			 RelationGetRelationName(rel));
>   
>   	/*
>   	 * Determine the target fill factor (in tuples per bucket) for this index.
>   	 * The idea is to make the fill factor correspond to pages about as full
> --- 330,354 ----
>   	int32		data_width;
>   	int32		item_width;
>   	int32		ffactor;
> ! 	uint32		i;
> ! 	uint32		lg2buckets;
> ! 	uint32		pwr2;
> ! 	BlockNumber	start_blk;	
>   
>   	/* safety check */
>   	if (RelationGetNumberOfBlocks(rel) != 0)
>   		elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
>   			 RelationGetRelationName(rel));
>   
> + 
> +         /* The minimum number of buckets is 2, so if we are below
> +          * that number, let's fix that here
> +          */
> + 
> +         if (num_bkt < 2)
> +                 num_bkt = 2;
> + 
> + 
>   	/*
>   	 * Determine the target fill factor (in tuples per bucket) for this index.
>   	 * The idea is to make the fill factor correspond to pages about as full
> ***************
> *** 401,431 ****
>   	 * We initialize the index with two buckets, 0 and 1, occupying physical
>   	 * blocks 1 and 2.	The first freespace bitmap page is in block 3.
>   	 */
> ! 	metap->hashm_maxbucket = metap->hashm_lowmask = 1;	/* nbuckets - 1 */
> ! 	metap->hashm_highmask = 3;	/* (nbuckets << 1) - 1 */
>   
>   	MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
>   	MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
>   
>   	metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
> ! 	metap->hashm_ovflpoint = 1;
>   	metap->hashm_firstfree = 0;
>   
> ! 	/*
> ! 	 * Initialize the first two buckets
> ! 	 */
> ! 	for (i = 0; i <= 1; i++)
> ! 	{
> ! 		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
> ! 		pg = BufferGetPage(buf);
> ! 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
> ! 		pageopaque->hasho_prevblkno = InvalidBlockNumber;
> ! 		pageopaque->hasho_nextblkno = InvalidBlockNumber;
> ! 		pageopaque->hasho_bucket = i;
> ! 		pageopaque->hasho_flag = LH_BUCKET_PAGE;
>   		pageopaque->hasho_page_id = HASHO_PAGE_ID;
> ! 		_hash_wrtbuf(rel, buf);
> ! 	}
>   
>   	/*
>   	 * Initialize first bitmap page
> --- 413,488 ----
>   	 * We initialize the index with two buckets, 0 and 1, occupying physical
>   	 * blocks 1 and 2.	The first freespace bitmap page is in block 3.
>   	 */
> ! 
> !         /* We need this calculation to correctly set the splits
> !          * and to set the value of the low/high mask.
> !          */
> !         lg2buckets = _hash_log2_floor(num_bkt);
> ! 
> !         pwr2 = 1 << lg2buckets;
> ! 
> ! 
> !         metap->hashm_maxbucket = num_bkt -1;
> !         /* We want the highmask to mask the next larger
> !          * power of 2 value greated than the number of buckets
> !          * we need.  So, if we need 9 buckets, our high mask
> !          * value would be 15.  Our low mask value will be 7
> !          */
> ! 
> !         metap->hashm_highmask = (pwr2 << 1) -1;
> !         metap->hashm_lowmask = pwr2 - 1;
> ! 
>   
>   	MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
>   	MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
>   
>   	metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
> ! 
> ! 
> !         /*
> !          * No overflows will have occurred during this initialization process
> !          * so we need to just copy the value '1' into each position in the
> !          * hashm_spares array.  This is needed to correctly determine how each
> !          * bucket maps to the logical page on disk.
> !          *
> !          */
> ! 
> !         for (i = 2; i <= _hash_log2(num_bkt); i++)
> !                 metap->hashm_spares[i] = 1;
> ! 
> ! 
> !         /* This value needs to be the ceiling log to properly set the overflow
> !          * beyond our max bucket */
> !         metap->hashm_ovflpoint = _hash_log2(num_bkt);
> ! 
>   	metap->hashm_firstfree = 0;
>   
> ! 
> !         /* We need to make sure the file system can handle
> !          * this big block of pages we are going to allocate
> !          * _hash_alloc_buckets() will do that for us
> !          */
> ! 
> !         start_blk = BUCKET_TO_BLKNO(metap, 2);
> !         _hash_alloc_buckets(rel, start_blk, (pwr2<<1)-2);
> ! 
> !         /*
> !          * Initialize the first 'num_bkt' buckets
> !          */
> !         for (i = 0; i < num_bkt; i++)
> !         {
> ! 
> !                 buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
> !                 pg = BufferGetPage(buf);
> !                 _hash_pageinit(pg, BufferGetPageSize(buf));
> !                 pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
> !                 pageopaque->hasho_prevblkno = InvalidBlockNumber;
> !                 pageopaque->hasho_nextblkno = InvalidBlockNumber;
> !                 pageopaque->hasho_bucket = i;
> !                 pageopaque->hasho_flag = LH_BUCKET_PAGE;
>   		pageopaque->hasho_page_id = HASHO_PAGE_ID;
> !                 _hash_wrtbuf(rel, buf);
> !         }
>   
>   	/*
>   	 * Initialize first bitmap page
> *** ./backend/access/hash/hashutil.c.orig	2007-09-23 19:47:27.000000000 -0700
> --- ./backend/access/hash/hashutil.c	2007-09-23 19:49:44.000000000 -0700
> ***************
> *** 120,125 ****
> --- 120,144 ----
>   	return bucket;
>   }
>   
> + 
> + /* _hash_log2_floor -- returns floor(lg2(num))
> +  *
> +  */
> + uint32
> + _hash_log2_floor(uint32 num)
> + {
> +         uint32          i, limit;
> +         limit = 1;
> +         for (i=0; limit < num; limit <<= 1, i++)
> +                 ;
> +         if (limit == num)
> +                 return i;
> +         else
> +                 return i-1;
> + 
> + }
> + 
> + 
>   /*
>    * _hash_log2 -- returns ceil(lg2(num))
>    */
> *** ./backend/access/hash/Makefile.orig	2007-09-23 20:30:39.000000000 -0700
> --- ./backend/access/hash/Makefile	2007-09-23 20:31:12.000000000 -0700
> ***************
> *** 13,19 ****
>   include $(top_builddir)/src/Makefile.global
>   
>   OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
> !        hashsearch.o hashutil.o
>   
>   all: SUBSYS.o
>   
> --- 13,19 ----
>   include $(top_builddir)/src/Makefile.global
>   
>   OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
> !        hashsearch.o hashutil.o hashsort.o
>   
>   all: SUBSYS.o
>   
> *** ./backend/access/hash/README.orig	2007-09-24 00:07:32.000000000 -0700
> --- ./backend/access/hash/README	2007-10-21 12:22:00.569522000 -0700
> ***************
> *** 105,110 ****
> --- 105,137 ----
>   number 3, which is the first bitmap page and is allocated during index
>   creation.
>   
> + Building an index on a full table
> + ----------------------------------
> + 
> + An index can be created on a table already loaded with tuples.  Before it
> + is built, an estimate of the number of bucket pages that might be needed
> + to fit all the index tuples, is calculated.  This is done by calling the
> + function estimate_rel_size().  A Fill factor (either DEFAULT or
> + user-defined as applicable) is then used to get the estimate, which is 
> + the number of bucket pages initialized.  If the estimate falls below two,
> + then a minimum of two bucket pages is initialized.  However, the number
> + of bucket pages allocated is always a power of 2 value greater than the
> + estimate.  I.e., if the estimated number of buckets is 3124, then the
> + number of buckets allocated is 4096 (and the number of bucket pages
> + initialized is 3124). If the estimate is 5456, then the number allocated is
> + 8192 (and the number initialized is 5456) and so on.
> + 
> + A spool holds all the index tuples before they are inserted into the
> + index pages. The content of the spool is sorted on the hash value order so
> + that there will be less backtracking to the bucket pages we have already
> + visited.
> + 
> + The intention of creating as many as the estimated number of pages is to
> + avoid bucket page splits at splitpoint S and hence, redistribution of
> + tuples.  Since we create all the bucket pages we need and insert the tuples
> + based on the bucket number they belong to, there will not be any need for
> + splitting.
> + 
>   
>   Lock definitions
>   ----------------
> *** ./backend/utils/sort/tuplesort.c.orig	2007-09-23 19:51:30.000000000 -0700
> --- ./backend/utils/sort/tuplesort.c	2007-09-24 00:03:25.000000000 -0700
> ***************
> *** 341,346 ****
> --- 341,347 ----
>   	Relation	indexRel;
>   	ScanKey		indexScanKey;
>   	bool		enforceUnique;	/* complain if we find duplicate tuples */
> + 	uint32		bkt_mask;	/* Bucket mask for hash sort */
>   
>   	/*
>   	 * These variables are specific to the Datum case; they are set by
> ***************
> *** 439,444 ****
> --- 440,448 ----
>   static void reversedirection_heap(Tuplesortstate *state);
>   static int comparetup_index(const SortTuple *a, const SortTuple *b,
>   				 Tuplesortstate *state);
> + static int comparetup_hashindex(const SortTuple *a, const SortTuple *b,
> +                                Tuplesortstate *state);
> + 
>   static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
>   static void writetup_index(Tuplesortstate *state, int tapenum,
>   			   SortTuple *stup);
> ***************
> *** 2621,2626 ****
> --- 2625,2673 ----
>   								&stup->isnull1);
>   }
>   
> + /*
> +  * Set state parameters for sorting hash index tuples
> +  */
> + void tuplesort_set_hashindex(Tuplesortstate *state)
> + {
> +         state->comparetup = comparetup_hashindex;
> +         state->indexScanKey = NULL; /* Scan key is not needed in hash index */
> +         state->enforceUnique = false;   /* no reason to enforce uniqueness with a hash table */
> + 
> + }
> + 
> + 
> + void tuplesort_set_bktmask(Tuplesortstate *state, uint32 mask)
> + {
> +         state->bkt_mask = mask;
> + }
> + 
> + 
> + /* Special compare function called for the hash sort algorithm
> +  * We are comparing hash values of the key, which then are masked to
> +  * determine the proper hash table bucket.
> +  */
> + 
> + 
> + static int comparetup_hashindex(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
> + {
> +         uint32  bkta, bktb;
> + 
> +         /* Allow interrupting long sorts */
> +         CHECK_FOR_INTERRUPTS();
> + 
> +         bkta = (_hash_datum2hashkey(state->indexRel, a->datum1)) & (uint32)state->bkt_mask;
> +         bktb = (_hash_datum2hashkey(state->indexRel, b->datum1)) & (uint32)state->bkt_mask;
> + 
> +         if (bkta > bktb)
> +                 return 1;
> +         else if (bkta < bktb)
> +                 return -1;
> +         else
> +                 return 0;
> + 
> + }
> + 
>   static void
>   reversedirection_heap(Tuplesortstate *state)
>   {
> *** ./include/access/hash.h.orig	2007-09-23 17:50:40.000000000 -0700
> --- ./include/access/hash.h	2007-09-23 17:40:21.000000000 -0700
> ***************
> *** 282,287 ****
> --- 282,297 ----
>   								Bucket bucket, BlockNumber bucket_blkno,
>   								BufferAccessStrategy bstrategy);
>   
> + /*hashsort.c*/
> + typedef struct HSpool HSpool;
> + extern HSpool *h_spoolinit(Relation index);
> + extern void h_spool(IndexTuple itup, HSpool *hspool);
> + extern uint32 h_bkt_num(uint32 tuples, Relation rel);
> + extern void h_set_bkt_mask(HSpool *hspool, uint32 bkts);
> + extern void h_do_sort(HSpool *hspool);
> + extern void h_print_spool(HSpool *hspool);
> + extern void h_spooldestroy(HSpool *hspool);
> + 
>   /* hashpage.c */
>   extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
>   extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
> ***************
> *** 298,304 ****
>   extern void _hash_wrtbuf(Relation rel, Buffer buf);
>   extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
>   				   int to_access);
> ! extern void _hash_metapinit(Relation rel);
>   extern void _hash_pageinit(Page page, Size size);
>   extern void _hash_expandtable(Relation rel, Buffer metabuf);
>   
> --- 308,314 ----
>   extern void _hash_wrtbuf(Relation rel, Buffer buf);
>   extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
>   				   int to_access);
> ! extern void _hash_metapinit(Relation rel, uint32 pages);
>   extern void _hash_pageinit(Page page, Size size);
>   extern void _hash_expandtable(Relation rel, Buffer metabuf);
>   
> ***************
> *** 320,325 ****
> --- 330,336 ----
>   extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
>   					 uint32 highmask, uint32 lowmask);
>   extern uint32 _hash_log2(uint32 num);
> + extern uint32 _hash_log2_floor(uint32 num);
>   extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
>   
>   /* hash.c */
> *** ./include/utils/tuplesort.h.orig	2007-09-23 18:52:07.000000000 -0700
> --- ./include/utils/tuplesort.h	2007-09-23 18:59:13.000000000 -0700
> ***************
> *** 55,60 ****
> --- 55,63 ----
>   					  Oid sortOperator, bool nullsFirstFlag,
>   					  int workMem, bool randomAccess);
>   
> + extern void tuplesort_set_hashindex(Tuplesortstate *state);
> + extern void tuplesort_set_bktmask(Tuplesortstate *state, uint32 mask);
> + 
>   extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
>   
>   extern void tuplesort_puttupleslot(Tuplesortstate *state,

> /*---------------------------------------------------------------------------
>  * hashsort.c
>  *-------------------------------------------------------------------------*/
> 
> /* 
>  * Functions needed to support initializing and sorting tuples in the spool
>  * in hash.c
>  */
> 
> 
> #include "postgres.h"
> 
> #include "access/hash.h"
> #include "miscadmin.h"
> #include "storage/smgr.h"
> #include "utils/tuplesort.h"
> 
> /*
>  * HSpool -- working state for spooling and sorting hash index tuples
>  * before they are inserted into the index.
>  */
> struct HSpool
> {
> 	/* State data for tuplesort.c */
> 	Tuplesortstate *sortstate; 
> 	Relation index;		/* the index relation being built */
> };
> 
> /*
>  * h_spoolinit() -- create and initialize a spool for hash index tuples.
>  *
>  * Release the returned spool with h_spooldestroy().
>  */
> HSpool *h_spoolinit(Relation index)
> {
> 	HSpool *spool = (HSpool *) palloc0(sizeof(HSpool));
> 
> 	spool->index = index;
> 
> 	/*
> 	 * Sort the spool using up to maintenance_work_mem kilobytes of
> 	 * memory.
> 	 */
> 	spool->sortstate = tuplesort_begin_index(index, false,
> 						 maintenance_work_mem, false);
> 
> 	/*
> 	 * Swap in the hash-index-specific compare callback in place of the
> 	 * default index compare function.
> 	 */
> 	tuplesort_set_hashindex(spool->sortstate);
> 
> 	return spool;
> }
> 
> /*
>  * h_spool() -- add one index tuple to the spool for later sorting.
>  */
> void h_spool(IndexTuple itup, HSpool *hspool)
> {
> 	tuplesort_putindextuple(hspool->sortstate, itup);
> }
> 
> /* 
>  * h_bkt_num() -- estimate how many buckets the final hash table needs
>  * to hold 'tuples' index tuples.
>  *
>  */
> 
> uint32 h_bkt_num(uint32 tuples, Relation rel)
> {
> 	int32 ffactor;
> 	int32 data_width;
> 	int32 item_width;
> 
> 	/*
> 	 * Compute the target fill factor (tuples per bucket).  We cannot
> 	 * read it from the meta page because, at the point this function is
> 	 * called, the meta page has not been created yet.
> 	 */
> 	data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
> 					RelationGetDescr(rel)->attrs[0]->atttypmod);
> 	item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
> 					sizeof(ItemIdData);
> 
> 	ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
> 	if (ffactor < 10)
> 		ffactor = 10;
> 
> 	/* Round up so we always report at least one bucket. */
> 	return (tuples / ffactor) + 1;
> }
> 
> /*	h_set_bkt_mask() -- install the bucket mask used by the sort's
>  *	compare function.
>  *
>  *	To define an order for the index tuples, the 32 bit hash value of
>  *	the index key is masked during the sort.  The mask is one less than
>  *	the power of 2 covering the estimated bucket count: e.g. for 4,555
>  *	buckets the mask (or modulo) is 8,191 (hex value 1FFF).
>  *
>  */
> 
> void h_set_bkt_mask(HSpool *spool, uint32 bkts) {
> 	uint32 mask;
> 
> 	mask = (1 << _hash_log2(bkts)) - 1;
> 	tuplesort_set_bktmask(spool->sortstate, mask);	
> }
> 
> /*
>  * h_do_sort() -- finish sorting the spooled index tuples.
>  */
> void h_do_sort(HSpool *spool)
> {
> 	tuplesort_performsort(spool->sortstate);
> }
> 
> /*
>  * h_spooldestroy() -- release the tuplesort state and the spool itself.
>  */
> void h_spooldestroy(HSpool *hspool)
> {
> 	tuplesort_end(hspool->sortstate);
> 	pfree(hspool);
> }
> 
> 
> /* 
>  * h_print_spool() reads the itups back from the spool, which are now
>  * in sorted order by hash value. As each itup is read from the spool, it is
>  * inserted into the hash index.  Tuples whose first key attribute is
>  * NULL are skipped, matching the hashbuildCallback behavior.
>  *
>  */
> 
> void h_print_spool(HSpool *spool)
> {
> 	bool isfirstnull;
> 	bool should_free;
> 	IndexTuple itup;
> 	TupleDesc tupd = RelationGetDescr(spool->index);
> 	
> 	/* explicit != NULL: avoids the assignment-as-condition pitfall */
> 	while ((itup = tuplesort_getindextuple(spool->sortstate, true,
> 										   &should_free)) != NULL)
> 	{	
> 		(void) index_getattr(itup, 1, tupd, &isfirstnull);
> 		if (!isfirstnull)
> 		{
> 			_hash_doinsert(spool->index, itup);
> 		}
> 
> 		/*
> 		 * tuplesort tells us via should_free when it handed back a
> 		 * palloc'd copy; free it here or the copies leak for the
> 		 * whole index build.
> 		 */
> 		if (should_free)
> 			pfree(itup);
> 	}
> }
> 

> 
> ---------------------------(end of broadcast)---------------------------
> TIP 9: In versions below 8.0, the planner will ignore your desire to
>        choose an index scan if your joining column's datatypes do not
>        match

-- 
  Bruce Momjian  <bruce(at)momjian(dot)us>        http://momjian.us
  EnterpriseDB                             http://postgres.enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +

In response to

pgsql-patches by date

Next:From: Bruce MomjianDate: 2007-10-22 15:09:14
Subject: Re: EXECUTE USING for plpgsql (for 8.4)
Previous:From: Pavel StehuleDate: 2007-10-22 10:48:08
Subject: EXECUTE USING for plpgsql (for 8.4)

Privacy Policy | About PostgreSQL
Copyright © 1996-2014 The PostgreSQL Global Development Group