*** a/contrib/pageinspect/Makefile
--- b/contrib/pageinspect/Makefile
***************
*** 1,7 ****
  # contrib/pageinspect/Makefile
  
  MODULE_big	= pageinspect
! OBJS		= rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o
  
  EXTENSION = pageinspect
  DATA = pageinspect--1.1.sql pageinspect--1.0--1.1.sql \
--- 1,7 ----
  # contrib/pageinspect/Makefile
  
  MODULE_big	= pageinspect
! OBJS		= rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o mmfuncs.o
  
  EXTENSION = pageinspect
  DATA = pageinspect--1.1.sql pageinspect--1.0--1.1.sql \
*** /dev/null
--- b/contrib/pageinspect/mmfuncs.c
***************
*** 0 ****
--- 1,217 ----
+ /*
+  * mmfuncs.c
+  * 		Functions to investigate MinMax indexes
+  *
+  * Copyright (c) 2013, PostgreSQL Global Development Group
+  *
+  * IDENTIFICATION
+  * 		contrib/pageinspect/mmfuncs.c
+  */
+ #include "postgres.h"
+ 
+ #include "access/htup_details.h"
+ #include "access/minmax.h"
+ #include "access/minmax_tuple.h"
+ #include "catalog/index.h"
+ #include "funcapi.h"
+ #include "utils/builtins.h"
+ #include "utils/lsyscache.h"
+ #include "utils/rel.h"
+ #include "miscadmin.h"
+ 
+ Datum minmax_page_items(PG_FUNCTION_ARGS);
+ 
+ PG_FUNCTION_INFO_V1(minmax_page_items);
+ 
+ typedef struct mm_page_state
+ {
+ 	TupleDesc	tupdesc;
+ 	Page		page;
+ 	OffsetNumber offset;
+ 	bool		unusedItem;
+ 	bool		done;
+ 	AttrNumber	attno;
+ 	DeformedMMTuple *dtup;
+ 	FmgrInfo	outputfn[FLEXIBLE_ARRAY_MEMBER];
+ } mm_page_state;
+ 
+ /*
+  * Extract all item values from a minmax index page
+  *
+  * Usage: SELECT * FROM minmax_page_items(get_raw_page('idx', 1), 'idx'::regclass);
+  */
+ Datum
+ minmax_page_items(PG_FUNCTION_ARGS)
+ {
+ 	mm_page_state *state;
+ 	FuncCallContext *fctx;
+ 
+ 	if (!superuser())
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ 				 (errmsg("must be superuser to use raw page functions"))));
+ 
+ 	if (SRF_IS_FIRSTCALL())
+ 	{
+ 		bytea	   *raw_page = PG_GETARG_BYTEA_P(0);
+ 		Oid			indexRelid = PG_GETARG_OID(1);
+ 		int			raw_page_size;
+ 		TupleDesc	tupdesc;
+ 		MemoryContext mctx;
+ 		Relation	indexRel;
+ 		AttrNumber	attno;
+ 
+ 		raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
+ 
+ 		if (raw_page_size < SizeOfPageHeaderData)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("input page too small (%d bytes)", raw_page_size)));
+ 
+ 		/* create a function context for cross-call persistence */
+ 		fctx = SRF_FIRSTCALL_INIT();
+ 
+ 		/* switch to memory context appropriate for multiple function calls */
+ 		mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
+ 
+ 		/* Build a tuple descriptor for our result type */
+ 		if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ 			elog(ERROR, "return type must be a row type");
+ 
+ 		indexRel = index_open(indexRelid, AccessShareLock);
+ 
+ 		state = palloc(offsetof(mm_page_state, outputfn) +
+ 					   sizeof(FmgrInfo) * RelationGetDescr(indexRel)->natts);
+ 
+ 		state->tupdesc = CreateTupleDescCopy(RelationGetDescr(indexRel));
+ 		/* keep a private copy of the page, as other pageinspect code does */
+ 		state->page = palloc(raw_page_size);
+ 		memcpy(state->page, VARDATA(raw_page), raw_page_size);
+ 		state->offset = FirstOffsetNumber;
+ 		state->unusedItem = false;
+ 		state->done = false;
+ 		state->dtup = NULL;
+ 
+ 		index_close(indexRel, AccessShareLock);
+ 
+ 		for (attno = 1; attno <= state->tupdesc->natts; attno++)
+ 		{
+ 			Oid		output;
+ 			bool	isVarlena;
+ 
+ 			getTypeOutputInfo(state->tupdesc->attrs[attno - 1]->atttypid,
+ 							  &output, &isVarlena);
+ 			fmgr_info(output, &state->outputfn[attno - 1]);
+ 		}
+ 
+ 		fctx->user_fctx = state;
+ 		fctx->tuple_desc = BlessTupleDesc(tupdesc);
+ 
+ 		MemoryContextSwitchTo(mctx);
+ 	}
+ 
+ 	fctx = SRF_PERCALL_SETUP();
+ 	state = fctx->user_fctx;
+ 
+ 	if (!state->done)
+ 	{
+ 		HeapTuple	result;
+ 		Datum		values[6];
+ 		bool		nulls[6];
+ 
+ 		/*
+ 		 * We get called once for every attribute of every tuple on the
+ 		 * page.  At the start of a tuple, we get a NULL dtup; that's our
+ 		 * signal for obtaining and decoding the next one.  If that's not the
+ 		 * case, we output the next attribute.
+ 		 */
+ 		if (state->dtup == NULL)
+ 		{
+ 			MMTuple	   *tup;
+ 			MemoryContext mctx;
+ 			ItemId		itemId;
+ 
+ 			/* deformed tuple must live across calls */
+ 			mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
+ 
+ 			/* verify item status: if there's no data, we can't decode */
+ 			itemId = PageGetItemId(state->page, state->offset);
+ 			if (ItemIdIsUsed(itemId))
+ 			{
+ 				tup = (MMTuple *) PageGetItem(state->page,
+ 											  PageGetItemId(state->page,
+ 															state->offset));
+ 				state->dtup = minmax_deform_tuple(state->tupdesc, tup);
+ 				state->attno = 1;
+ 				state->unusedItem = false;
+ 			}
+ 			else
+ 				state->unusedItem = true;
+ 
+ 			MemoryContextSwitchTo(mctx);
+ 		}
+ 		else
+ 			state->attno++;
+ 
+ 		MemSet(nulls, 0, sizeof(nulls));
+ 
+ 		if (state->unusedItem)
+ 		{
+ 			values[0] = UInt16GetDatum(state->offset);
+ 			nulls[1] = true;
+ 			nulls[2] = true;
+ 			nulls[3] = true;
+ 			nulls[4] = true;
+ 			nulls[5] = true;
+ 		}
+ 		else
+ 		{
+ 			int		att = state->attno - 1;
+ 
+ 			values[0] = UInt16GetDatum(state->offset);
+ 			values[1] = UInt16GetDatum(state->attno);
+ 			values[2] = BoolGetDatum(state->dtup->values[att].allnulls);
+ 			values[3] = BoolGetDatum(state->dtup->values[att].hasnulls);
+ 			if (!state->dtup->values[att].allnulls)
+ 			{
+ 				FmgrInfo   *outputfn = &state->outputfn[att];
+ 				MMValues   *mmvalues = &state->dtup->values[att];
+ 
+ 				values[4] = CStringGetTextDatum(OutputFunctionCall(outputfn,
+ 																   mmvalues->min));
+ 				values[5] = CStringGetTextDatum(OutputFunctionCall(outputfn,
+ 																   mmvalues->max));
+ 			}
+ 			else
+ 			{
+ 				nulls[4] = true;
+ 				nulls[5] = true;
+ 			}
+ 		}
+ 
+ 		result = heap_form_tuple(fctx->tuple_desc, values, nulls);
+ 
+ 		/*
+ 		 * If the item was unused, jump straight to the next one; otherwise,
+ 		 * the only cleanup needed here is to set our signal to go to the next
+ 		 * tuple in the following iteration, by freeing the current one.
+ 		 */
+ 		if (state->unusedItem)
+ 			state->offset = OffsetNumberNext(state->offset);
+ 		else if (state->attno >= state->tupdesc->natts)
+ 		{
+ 			pfree(state->dtup);
+ 			state->dtup = NULL;
+ 			state->offset = OffsetNumberNext(state->offset);
+ 		}
+ 
+ 		/*
+ 		 * If we're beyond the end of the page, set flag to end the function in
+ 		 * the following iteration.
+ 		 */
+ 		if (state->offset > PageGetMaxOffsetNumber(state->page))
+ 			state->done = true;
+ 
+ 		SRF_RETURN_NEXT(fctx, HeapTupleGetDatum(result));
+ 	}
+ 
+ 	SRF_RETURN_DONE(fctx);
+ }
*** a/contrib/pageinspect/pageinspect--1.1.sql
--- b/contrib/pageinspect/pageinspect--1.1.sql
***************
*** 99,104 **** AS 'MODULE_PATHNAME', 'bt_page_items'
--- 99,118 ----
  LANGUAGE C STRICT;
  
  --
+ -- minmax_page_items()
+ --
+ CREATE FUNCTION minmax_page_items(IN page bytea, IN index_oid oid,
+ 	OUT itemoffset int,
+ 	OUT attnum int,
+ 	OUT allnulls bool,
+ 	OUT hasnulls bool,
+ 	OUT min text,
+ 	OUT max text)
+ RETURNS SETOF record
+ AS 'MODULE_PATHNAME', 'minmax_page_items'
+ LANGUAGE C STRICT;
+ 
+ --
  -- fsm_page_contents()
  --
  CREATE FUNCTION fsm_page_contents(IN page bytea)
*** a/contrib/pg_xlogdump/rmgrdesc.c
--- b/contrib/pg_xlogdump/rmgrdesc.c
***************
*** 13,18 ****
--- 13,19 ----
  #include "access/gist_private.h"
  #include "access/hash.h"
  #include "access/heapam_xlog.h"
+ #include "access/minmax_xlog.h"
  #include "access/multixact.h"
  #include "access/nbtree.h"
  #include "access/rmgr.h"
*** /dev/null
--- b/minmax-proposal
***************
*** 0 ****
--- 1,300 ----
+ Minmax Range Indexes
+ ====================
+ 
+ Minmax indexes are a new access method intended to enable very fast scanning of
+ extremely large tables.
+ 
+ The essential idea of a minmax index is to keep track of the min() and max()
+ values in consecutive groups of heap pages (page ranges).  These values can be
+ used by constraint exclusion to avoid scanning such pages, depending on query
+ quals.
+ 
+ The main drawback of this is having to update the stored min/max values of each
+ page range as tuples are inserted into them.
+ 
+ Other database systems already have this feature. Some examples:
+ 
+ * Oracle Exadata calls this "storage indexes"
+   http://richardfoote.wordpress.com/category/storage-indexes/
+ 
+ * Netezza has "zone maps"
+   http://nztips.com/2010/11/netezza-integer-join-keys/
+ 
+ * Infobright has this automatically within their "data packs"
+   http://www.infobright.org/Blog/Entry/organizing_data_and_more_about_rough_data_contest/
+ 
+ * MonetDB seems to have it
+   http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.108.2662
+   "Cooperative Scans: Dynamic Bandwidth Sharing in a DBMS"
+ 
+ Index creation
+ --------------
+ 
+ To create a minmax index, we use the standard syntax:
+ 
+   CREATE INDEX foo_minmax_idx ON foo USING MINMAX (a, b, e);
+ 
+ Partial indexes are not supported; since an index is concerned with minimum and
+ maximum values of the involved columns across all the pages in the table, it
+ doesn't make sense to exclude values.  Another way to see "partial" indexes
+ here would be those that only considered some pages in the table instead of all
+ of them; but this would be difficult to implement and manage and, most likely,
+ pointless.
+ 
+ Expressional indexes can probably be supported in the future, but we disallow
+ them initially for conceptual simplicity.
+ 
+ Having multiple minmax indexes in the same table is acceptable, though most of
+ the time it would make more sense to have a single index covering all the
+ interesting columns.  Multiple indexes might be useful for columns added later.
+ 
+ Access Method Design
+ --------------------
+ 
+ Since item pointers are not stored inside indexes of this type, it is not
+ possible to support the amgettuple interface.  Instead, we only provide
+ amgetbitmap support; scanning a relation using this index requires a recheck
+ node on top.  The amgetbitmap routine would return a TIDBitmap comprising all
+ the pages in those page ranges that match the query qualifications; the recheck
+ node prunes tuples that are not visible per snapshot and those that are not
+ visible per query quals.
+ 
+ For each supported datatype, we need an opclass with the following catalog
+ entries:
+ 
+ - support operators (pg_amop): same as btree (<, <=, =, >=, >)
+ 
+ These operators are used pervasively:
+ 
+ - The optimizer requires them to evaluate queries, so that the index is chosen
+   when queries on the indexed table are planned.
+ - During index construction (ambuild), they are used to determine the boundary
+   values for each page range.
+ - During index updates (aminsert), they are used to determine whether the new
+   heap tuple matches the existing index tuple; and if not, they are used to
+   construct the new index tuple.
+ 
+ In each index tuple (corresponding to one page range), we store:
+ - for each indexed column:
+   * minimum value across all tuples in the range
+   * maximum value across all tuples in the range
+   * are there nulls present in any tuple?
+   * are all the values in all tuples in the range null?
+ 
+ These null bits are stored in a single null bitmask whose length is twice the
+ number of columns.
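+ 
+ In memory, the patch manipulates these per-column summaries through a
+ "deformed" tuple representation, roughly as sketched below.  (This is only a
+ sketch inferred from how the fields are used elsewhere in the patch; the
+ authoritative definitions live in access/minmax_tuple.h.)
+ 
+   typedef struct MMValues
+   {
+       Datum       min;        /* minimum value seen in the page range */
+       Datum       max;        /* maximum value seen in the page range */
+       bool        hasnulls;   /* range contains at least one null */
+       bool        allnulls;   /* all values in the range are null */
+   } MMValues;
+ 
+   typedef struct DeformedMMTuple
+   {
+       MMValues    values[FLEXIBLE_ARRAY_MEMBER];      /* one per column */
+   } DeformedMMTuple;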
+ 
+ With the default INDEX_MAX_KEYS of 32, and considering columns of 8-byte length
+ types such as timestamptz or bigint, each tuple would be 522 bytes in length,
+ which seems reasonable.  There are 6 extra bytes for padding between the null
+ bitmask and the first data item, assuming 64-bit alignment; so the total size
+ for such an index would actually be 528 bytes.
+ 
+ This maximum index tuple size is calculated as: mt_info (2 bytes) + null bitmap
+ (8 bytes) + data value (8 bytes) * 32 * 2
+ 
+ (Of course, larger columns are possible, such as varchar, but creating minmax
+ indexes on such columns seems of little practical usefulness.  Also, the
+ usefulness of an index containing so many columns is dubious, at best.)
+ 
+ There can be gaps where some pages have no covering index entry. In particular,
+ the last few pages of the table would commonly not be summarized.
+ 
+ The Range Reverse Map
+ ---------------------
+ 
+ To find the index tuple for a particular page range, we have a
+ separate fork called the range reverse map.  This fork stores one TID per
+ range, which is the address of the index tuple summarizing that range.  Since
+ these map entries are fixed size, it is possible to compute the address of the
+ range map entry for any given heap page.
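+ 
+ For illustration, assuming fixed-size ItemPointerData entries packed into
+ standard pages, locating the entry for a given heap block is simple
+ arithmetic.  (A sketch only; the helper name and exact layout are made up
+ here, and the real logic lives in mmrevmap.c.)
+ 
+   #define REVMAP_ENTRIES_PER_PAGE \
+       ((BLCKSZ - MAXALIGN(SizeOfPageHeaderData)) / sizeof(ItemPointerData))
+ 
+   static void
+   revmap_entry_location(BlockNumber heapBlk, BlockNumber pagesPerRange,
+                         BlockNumber *revmapBlk, int *entryIdx)
+   {
+       BlockNumber rangeNo = heapBlk / pagesPerRange;
+ 
+       *revmapBlk = rangeNo / REVMAP_ENTRIES_PER_PAGE;
+       *entryIdx = rangeNo % REVMAP_ENTRIES_PER_PAGE;
+   }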
+ 
+ When a new heap tuple is inserted in a summarized page range, it is possible to
+ compare the existing index tuple with the new heap tuple.  If the heap tuple is
+ outside the minimum/maximum boundaries given by the index tuple for any indexed
+ column (or if the new heap tuple contains null values but the index tuple
+ indicates there are no nulls), it is necessary to create a new index tuple with
+ the new values.  To do this, a new index tuple is inserted, and the reverse range
+ map is updated to point to it.  The old index tuple is left in place, for later
+ garbage collection.
+ 
+ If the reverse range map points to an invalid TID, the corresponding page range
+ is not summarized.
+ 
+ A minmax index is updated by creating a new summary tuple whenever an
+ insertion outside the min-max interval occurs in the pages within the range.
+ 
+ To scan a table using a minmax index, we scan the reverse range map
+ sequentially.  This yields index tuples in ascending page range order.  Query
+ quals are matched to each index tuple; if they match, every page within the
+ page range is returned as part of the output TID bitmap.  If they don't match,
+ the whole range is skipped.  Ranges whose reverse range map entry is an invalid
+ index TID, that is unsummarized ranges, are also returned in the TID bitmap.
+ 
+ To store the range reverse map, we reuse the VISIBILITYMAP_FORKNUM, since a VM
+ does not make sense for a minmax index anyway (XXX -- really??)
+ 
+ When tuples are added to unsummarized pages, nothing needs to happen.
+ 
+ Heap tuples can be removed from anywhere without restriction.
+ 
+ Index entries that are not referenced from the revmap can be removed from the
+ main fork.  This currently happens at amvacuumcleanup, though it could be
+ carried out separately; no heap scan is necessary to determine which tuples
+ are unreachable.
+ 
+ Summarization
+ -------------
+ 
+ At index creation time, the whole table is scanned; for each page range the
+ minimum and maximum values of each indexed column, and the nulls bitmap, are
+ collected and stored in the index.  The possibly-incomplete range at the end
+ of the table is not included.
+ 
+ Once in a while, it is necessary to summarize a bunch of unsummarized pages
+ (because the table has grown since the index was created), or re-summarize a
+ range that has been marked invalid.  This is simple: scan the page range
+ calculating the min() and max() for each indexed column, then insert the new
+ index entry at the end of the index.  The main interesting questions are:
+ 
+ a) when to do it
+    The perfect time to do it is as soon as a complete page range of the
+    configured range size has been filled.
+ 
+ b) who does it (what process)
+    It doesn't seem a good idea to have a client-connected process do it;
+    it would incur unwanted latency.  Three other options are (i) to spawn a
+    specialized process to do it, which perhaps can be signalled by a
+    client-connected process that executes a scan and notices the need to run
+    summarization; (ii) to let autovacuum do it, as a separate new maintenance
+    task, which seems simple enough to bolt on top of the existing autovacuum
+    infrastructure, even though its timing constraints might be undesirable;
+    or (iii) to wait for an explicit user command.
+ 
+ The easiest way around this seems to be to have vacuum do it; that way we can
+ simply do re-summarization in the amvacuumcleanup routine.  Other answers would
+ mean we need a separate AM routine, which appears unwarranted at this stage.
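+ 
+ However re-summarization ends up being driven, summarizing one range only
+ requires reading the pages of that range.  A sketch using the
+ heap_setscanlimits() call added by this patch, where heapRel, rangeStart and
+ pagesPerRange stand for the table, the first block of the range and the range
+ size (the min/max accumulation, the index insertion and error handling are
+ omitted):
+ 
+   HeapScanDesc scan;
+   HeapTuple    tuple;
+ 
+   scan = heap_beginscan(heapRel, SnapshotAny, 0, NULL);
+   heap_setscanlimits(scan, rangeStart, pagesPerRange);
+   while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+   {
+       /* accumulate per-column min()/max() and null flags here */
+   }
+   heap_endscan(scan);
+   /* form the new MMTuple, insert it, and point the revmap entry at it */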
+ 
+ Vacuuming
+ ---------
+ 
+ Vacuuming a table that has a minmax index does not represent a significant
+ challenge.  Since no heap TIDs are stored, it's not necessary to scan the index
+ when heap tuples are removed.  It might be that some min() value can be
+ incremented, or some max() value can be decremented; but this would represent
+ an optimization opportunity only, not a correctness issue.  Perhaps it's
+ simpler to represent this as the need to re-run summarization on the affected
+ page range.
+ 
+ Note that if there are no indexes on the table other than the minmax index,
+ usage of maintenance_work_mem by vacuum can be decreased significantly, because
+ no detailed index scan needs to take place (and thus it's not necessary for
+ vacuum to save TIDs to remove).  This optimization opportunity is best left for
+ future improvement.
+ 
+ Locking considerations
+ ----------------------
+ 
+ To read the TID during an index scan, we follow this protocol (a code sketch
+ follows the list):
+ 
+ * read revmap page
+ * obtain share lock on the revmap buffer
+ * read the TID
+ * obtain share lock on buffer of main fork
+ * LockTuple the TID (using the index as relation).  A shared lock is
+   sufficient.  We need the LockTuple to prevent VACUUM from recycling
+   the index tuple; see below.
+ * release revmap buffer lock
+ * read the index tuple
+ * release the tuple lock
+ * release main fork buffer lock
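+ 
+ In code, the reader side looks roughly like this (mmGetHeapBlockItemptr()
+ performs the revmap steps, including taking the tuple lock, on the caller's
+ behalf; this is a sketch of the protocol, not the literal implementation):
+ 
+   mmGetHeapBlockItemptr(rmAccess, heapBlk, &iptr);  /* revmap read + tuple lock */
+   buf = ReadBuffer(idxRel, ItemPointerGetBlockNumber(&iptr));
+   LockBuffer(buf, BUFFER_LOCK_SHARE);
+   UnlockTuple(idxRel, &iptr, ShareLock);    /* buffer lock now protects tuple */
+   page = BufferGetPage(buf);
+   tup = (MMTuple *) PageGetItem(page,
+                   PageGetItemId(page, ItemPointerGetOffsetNumber(&iptr)));
+   /* ... read or deform the tuple while the buffer lock is held ... */
+   LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+   ReleaseBuffer(buf);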
+ 
+ 
+ To update the summary tuple for a page range, we use this protocol:
+ 
+ * insert a new index tuple somewhere in the main fork; note its TID
+ * read revmap page
+ * obtain exclusive lock on revmap buffer
+ * write the TID
+ * release lock
+ 
+ This ensures no concurrent reader can obtain a partially-written TID.
+ Note we don't need a tuple lock here.  Concurrent scans don't have to
+ worry about whether they got the old or new index tuple: if they get the
+ old one, the tighter values are okay from a correctness standpoint: thanks to
+ MVCC, they can't possibly see the just-inserted heap tuples anyway.
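+ 
+ The writer side (roughly what mm_doinsert() does) can be sketched as follows;
+ mmSetHeapBlockItemptr() is a made-up name for the revmap update routine, which
+ is assumed to take the exclusive revmap buffer lock internally:
+ 
+   /* insert the new summary tuple into the main fork, noting its TID */
+   off = PageAddItem(BufferGetPage(buf), (Item) tup, itemsz,
+                     InvalidOffsetNumber, false, false);
+   ItemPointerSet(&newtid, BufferGetBlockNumber(buf), off);
+ 
+   /* now make the revmap entry for this range point at the new tuple */
+   mmSetHeapBlockItemptr(rmAccess, heapBlk, &newtid);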
+ 
+ 
+ For vacuuming, we need to figure out which index tuples are no longer
+ referenced from the reverse range map.  This requires some brute force,
+ but is simple:
+ 
+ 1) scan the complete index, store each existing TID in a dynahash.
+    Hash key is the TID, hash value is a boolean initially set to false.
+ 2) scan the complete revmap sequentially, read the TIDs on each page.  Share
+    lock on each page is sufficient.  For each TID so obtained, grab the
+    element from the hash and update the boolean to true.
+ 3) Scan the index again; for each tuple found, search the hash table.
+    If the tuple is not present in the hash, it must have been added after our
+    initial scan; ignore it.  If the tuple is present in the hash and the flag is
+    true, then the tuple is referenced from the revmap; ignore it.  If the flag
+    is false, then the index tuple is no longer referenced by the revmap, but it
+    could be about to be accessed by a concurrent scan.  Do
+    ConditionalLockTuple.  If this fails, ignore the tuple (it's in use); it
+    will be deleted by a future vacuum.  If the lock is acquired, then we can
+    safely remove the index tuple.
+ 4) Index pages with free space can be detected by this second scan.  Register
+    those with the FSM.
+ 
+ Note this doesn't require scanning the heap at all, or being involved in
+ the heap's cleanup procedure.  Also, there is no need to LockBufferForCleanup,
+ which is a nice property because index scans keep pages pinned for long
+ periods.
+ 
+ 
+ 
+ Optimizer
+ ---------
+ 
+ In order to make this all work, the only thing we need to do is ensure we have a
+ good enough opclass and amcostestimate.  With this, the optimizer is able to pick
+ up the index on its own.
+ 
+ 
+ Open questions
+ --------------
+ 
+ * Same-size page ranges?
+   Current related literature seems to consider that each "index entry" in a
+   minmax index must cover the same number of pages.  There doesn't seem to be a
+   hard reason for this to be so; it might make sense to allow the index to
+   self-tune so that some index entries cover smaller page ranges, if this allows
+   the min()/max() values to be more compact.  This would incur larger minmax
+   overhead for the index itself, but might allow better pruning of page ranges
+   during scan.  In the limit of one index tuple per page, the index itself
+   would occupy too much space, even though we could skip reading the largest
+   possible number of heap pages because the min()/max() ranges are tight; in
+   the opposite limit of a single tuple summarizing the whole table, we wouldn't
+   be able to prune anything even though the index is very small.  This can
+   probably be made to work by using the reverse range map as an index in itself.
+ 
+ * More compact representation for TIDBitmap?
+   TIDBitmap is the structure used to represent bitmap scans.  The
+   representation of lossy page ranges is not optimal for our purposes, because
+   it uses a Bitmapset to represent pages in the range; since we're going to return
+   all pages in a large range, it might be more convenient to allow for a
+   struct that uses start and end page numbers to represent the range, instead.
+ 
+ 
+ 
+ References:
+ 
+ Email thread on pgsql-hackers
+   http://www.postgresql.org/message-id/1199296574.7260.149.camel@ebony.site
+   From: Simon Riggs
+   To: pgsql-hackers
+   Subject: Dynamic Partitioning using Segment Visibility Map
+ 
+ http://wiki.postgresql.org/wiki/Segment_Exclusion
+ http://wiki.postgresql.org/wiki/Segment_Visibility_Map
+ 
*** a/src/backend/access/Makefile
--- b/src/backend/access/Makefile
***************
*** 8,13 **** subdir = src/backend/access
  top_builddir = ../../..
  include $(top_builddir)/src/Makefile.global
  
! SUBDIRS	    = common gin gist hash heap index nbtree rmgrdesc spgist transam
  
  include $(top_srcdir)/src/backend/common.mk
--- 8,13 ----
  top_builddir = ../../..
  include $(top_builddir)/src/Makefile.global
  
! SUBDIRS	    = common gin gist hash heap index minmax nbtree rmgrdesc spgist transam
  
  include $(top_srcdir)/src/backend/common.mk
*** a/src/backend/access/heap/heapam.c
--- b/src/backend/access/heap/heapam.c
***************
*** 268,273 **** initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
--- 268,275 ----
  		scan->rs_startblock = 0;
  	}
  
+ 	scan->rs_initblock = 0;
+ 	scan->rs_numblocks = InvalidBlockNumber;
  	scan->rs_inited = false;
  	scan->rs_ctup.t_data = NULL;
  	ItemPointerSetInvalid(&scan->rs_ctup.t_self);
***************
*** 293,298 **** initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
--- 295,308 ----
  		pgstat_count_heap_scan(scan->rs_rd);
  }
  
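+ /*
+  * heap_setscanlimits - restrict range of a heapscan
+  *
+  * startBlk is the page to start at; numBlks is the number of pages to scan.
+  * initscan() leaves the scan unrestricted (rs_numblocks = InvalidBlockNumber).
+  */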
+ void
+ heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
+ {
+ 	scan->rs_startblock = startBlk;
+ 	scan->rs_initblock = startBlk;
+ 	scan->rs_numblocks = numBlks;
+ }
+ 
  /*
   * heapgetpage - subroutine for heapgettup()
   *
***************
*** 634,640 **** heapgettup(HeapScanDesc scan,
  		 */
  		if (backward)
  		{
! 			finished = (page == scan->rs_startblock);
  			if (page == 0)
  				page = scan->rs_nblocks;
  			page--;
--- 644,651 ----
  		 */
  		if (backward)
  		{
! 			finished = --scan->rs_numblocks <= 0 ||
! 				(page == scan->rs_startblock);
  			if (page == 0)
  				page = scan->rs_nblocks;
  			page--;
***************
*** 644,650 **** heapgettup(HeapScanDesc scan,
  			page++;
  			if (page >= scan->rs_nblocks)
  				page = 0;
! 			finished = (page == scan->rs_startblock);
  
  			/*
  			 * Report our new scan position for synchronization purposes. We
--- 655,662 ----
  			page++;
  			if (page >= scan->rs_nblocks)
  				page = 0;
! 			finished = --scan->rs_numblocks <= 0 ||
! 				(page == scan->rs_startblock);
  
  			/*
  			 * Report our new scan position for synchronization purposes. We
***************
*** 895,901 **** heapgettup_pagemode(HeapScanDesc scan,
  		 */
  		if (backward)
  		{
! 			finished = (page == scan->rs_startblock);
  			if (page == 0)
  				page = scan->rs_nblocks;
  			page--;
--- 907,913 ----
  		 */
  		if (backward)
  		{
! 			finished = --scan->rs_numblocks <= 0 || page == scan->rs_startblock;
  			if (page == 0)
  				page = scan->rs_nblocks;
  			page--;
***************
*** 905,911 **** heapgettup_pagemode(HeapScanDesc scan,
  			page++;
  			if (page >= scan->rs_nblocks)
  				page = 0;
! 			finished = (page == scan->rs_startblock);
  
  			/*
  			 * Report our new scan position for synchronization purposes. We
--- 917,923 ----
  			page++;
  			if (page >= scan->rs_nblocks)
  				page = 0;
! 			finished = --scan->rs_numblocks <= 0 || page == scan->rs_startblock;
  
  			/*
  			 * Report our new scan position for synchronization purposes. We
*** /dev/null
--- b/src/backend/access/minmax/Makefile
***************
*** 0 ****
--- 1,17 ----
+ #-------------------------------------------------------------------------
+ #
+ # Makefile--
+ #    Makefile for access/minmax
+ #
+ # IDENTIFICATION
+ #    src/backend/access/minmax/Makefile
+ #
+ #-------------------------------------------------------------------------
+ 
+ subdir = src/backend/access/minmax
+ top_builddir = ../../../..
+ include $(top_builddir)/src/Makefile.global
+ 
+ OBJS = minmax.o mmrevmap.o mmtuple.o mmxlog.o
+ 
+ include $(top_srcdir)/src/backend/common.mk
*** /dev/null
--- b/src/backend/access/minmax/minmax.c
***************
*** 0 ****
--- 1,1521 ----
+ /*
+  * minmax.c
+  *		Implementation of Minmax indexes for Postgres
+  *
+  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  * IDENTIFICATION
+  *	  src/backend/access/minmax/minmax.c
+  *
+  * TODO
+  * 		* do we need to reserve special space on pages?
+  * 		* support collatable datatypes
+  * 		* on heap insert, we always create a new index entry.  Need to mark
+  * 		  range as unsummarized at some point, to avoid index bloat?
+  * 		* index truncation on vacuum?
+  * 		* datumCopy() is needed in several places?
+  */
+ #include "postgres.h"
+ 
+ #include "access/htup_details.h"
+ #include "access/minmax.h"
+ #include "access/minmax_internal.h"
+ #include "access/minmax_revmap.h"
+ #include "access/minmax_tuple.h"
+ #include "access/minmax_xlog.h"
+ #include "access/relscan.h"
+ #include "access/xlogutils.h"
+ #include "catalog/index.h"
+ #include "catalog/pg_operator.h"
+ #include "commands/vacuum.h"
+ #include "miscadmin.h"
+ #include "pgstat.h"
+ #include "storage/bufmgr.h"
+ #include "storage/freespace.h"
+ #include "storage/lmgr.h"
+ #include "utils/datum.h"
+ #include "utils/lsyscache.h"
+ #include "utils/memutils.h"
+ #include "utils/syscache.h"
+ 
+ 
+ /*
+  * We use a MMBuildState during initial construction of a Minmax index.
+  * Within that struct, each column's contruction info is represented by a
+  * MMPerColBuildInfo struct.  The running state is all kept in a
+  * DeformedMMTuple.
+  */
+ typedef struct MMPerColBuildInfo
+ {
+ 	AttrNumber	heapAttno;
+ 	int			typLen;
+ 	bool		typByVal;
+ 	FmgrInfo	lt;
+ 	FmgrInfo	gt;
+ } MMPerColBuildInfo;
+ 
+ typedef struct MMBuildState
+ {
+ 	Relation	irel;
+ 	int			numtuples;
+ 	Buffer		currentInsertBuf;
+ 	BlockNumber currRangeStart;
+ 	BlockNumber nextRangeAt;
+ 	mmRevmapAccess *rmAccess;
+ 	TupleDesc	indexDesc;
+ 	TupleDesc	diskDesc;
+ 	DeformedMMTuple *dtuple;
+ 	MMPerColBuildInfo perColState[FLEXIBLE_ARRAY_MEMBER];
+ } MMBuildState;
+ 
+ static void mmbuildCallback(Relation index,
+ 				HeapTuple htup, Datum *values, bool *isnull,
+ 				bool tupleIsAlive, void *state);
+ static void get_mm_operator(Oid opfam, Oid idxtypid, Oid keytypid,
+ 				StrategyNumber strategy, FmgrInfo *finfo);
+ static inline bool invoke_mm_operator(FmgrInfo *operator, Oid collation,
+ 				   Datum left, Datum right);
+ static void mm_doinsert(Relation idxrel, mmRevmapAccess *rmAccess,
+ 			Buffer *buffer, BlockNumber heapblkno, MMTuple *tup, Size itemsz);
+ static Buffer mm_getnewbuffer(Relation irel);
+ static bool mm_getinsertbuffer(Relation irel, Buffer *buffer, Size itemsz);
+ 
+ 
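+ /*
+  * Number of heap pages covered by each index entry (and by each entry in the
+  * reverse range map).  XXX hardcoded for now; the proposal talks about a
+  * configurable range size.
+  */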
+ #define MINMAX_PAGES_PER_RANGE	2
+ 
+ 
+ /*
+  * A tuple in the heap is being inserted.  To keep a minmax index up to date,
+  * we need to obtain the relevant index tuple, compare its min()/max() stored
+  * values with those of the new tuple; if the tuple values are in range,
+  * there's nothing to do; otherwise we need to create a new index tuple and
+  * point the revmap to it.
+  *
+  * If the range is not currently summarized (i.e. the revmap returns InvalidTid
+  * for it), there's nothing to do either.
+  */
+ Datum
+ mminsert(PG_FUNCTION_ARGS)
+ {
+ 	Relation	idxRel = (Relation) PG_GETARG_POINTER(0);
+ 	Datum	   *values = (Datum *) PG_GETARG_POINTER(1);
+ 	bool	   *nulls = (bool *) PG_GETARG_POINTER(2);
+ 	ItemPointer heaptid = (ItemPointer) PG_GETARG_POINTER(3);
+ 
+ 	/* we ignore the rest of our arguments */
+ 	mmRevmapAccess *rmAccess;
+ 	Datum		indclassDatum;
+ 	bool		isnull;
+ 	oidvector  *indclass;
+ 	TupleDesc	tupdesc;
+ 	MMTuple    *mmtup;
+ 	DeformedMMTuple *dtup;
+ 	ItemPointerData idxtid;
+ 	BlockNumber heapBlk;
+ 	BlockNumber iblk;
+ 	OffsetNumber ioff;
+ 	Buffer		buf;
+ 	IndexInfo  *indexInfo;
+ 	Page		page;
+ 	int			keyno;
+ 	FmgrInfo   *lt;
+ 	FmgrInfo   *gt;
+ 	bool		need_insert = false;
+ 
+ 	rmAccess = mmRevmapAccessInit(idxRel, MINMAX_PAGES_PER_RANGE);
+ 
+ 	heapBlk = ItemPointerGetBlockNumber(heaptid);
+ 	mmGetHeapBlockItemptr(rmAccess, heapBlk, &idxtid);
+ 	/* tuple lock on idxtid is grabbed by mmGetHeapBlockItemptr */
+ 
+ 	if (!ItemPointerIsValid(&idxtid))
+ 	{
+ 		/* nothing to do, range is unsummarized */
+ 		mmRevmapAccessTerminate(rmAccess);
+ 		return BoolGetDatum(false);
+ 	}
+ 
+ 	tupdesc = RelationGetDescr(idxRel);
+ 	indexInfo = BuildIndexInfo(idxRel);
+ 
+ 	lt = palloc(sizeof(FmgrInfo) * indexInfo->ii_NumIndexAttrs);
+ 	gt = palloc(sizeof(FmgrInfo) * indexInfo->ii_NumIndexAttrs);
+ 
+ 	/* grab the operators we will need: < and > for each indexed column */
+ 	indclassDatum = SysCacheGetAttr(INDEXRELID, idxRel->rd_indextuple,
+ 									Anum_pg_index_indclass, &isnull);
+ 	Assert(!isnull);
+ 	indclass = (oidvector *) DatumGetPointer(indclassDatum);
+ 	for (keyno = 0; keyno < indexInfo->ii_NumIndexAttrs; keyno++)
+ 	{
+ 		Oid			opfam = get_opclass_family(indclass->values[keyno]);
+ 		Oid			idxtypid = tupdesc->attrs[keyno]->atttypid;
+ 
+ 		get_mm_operator(opfam, idxtypid, idxtypid, BTLessStrategyNumber,
+ 						&lt[keyno]);
+ 		get_mm_operator(opfam, idxtypid, idxtypid, BTGreaterStrategyNumber,
+ 						&gt[keyno]);
+ 	}
+ 
+ 	iblk = ItemPointerGetBlockNumber(&idxtid);
+ 	ioff = ItemPointerGetOffsetNumber(&idxtid);
+ 	buf = ReadBuffer(idxRel, iblk);
+ 
+ 	LockBuffer(buf, BUFFER_LOCK_SHARE);
+ 	UnlockTuple(idxRel, &idxtid, ShareLock);
+ 	page = BufferGetPage(buf);
+ 	mmtup = (MMTuple *) PageGetItem(page, PageGetItemId(page, ioff));
+ 
+ 	dtup = minmax_deform_tuple(tupdesc, mmtup);
+ 
+ 	/* compare the key values of the new tuple to the stored index values */
+ 	for (keyno = 0; keyno < indexInfo->ii_NumIndexAttrs; keyno++)
+ 	{
+ 		/*
+ 		 * If the new tuple contains a null in this attr, but the range index
+ 		 * tuple doesn't allow for nulls, we need a new summary tuple
+ 		 */
+ 		if (nulls[keyno])
+ 		{
+ 			if (!dtup->values[keyno].hasnulls)
+ 			{
+ 				dtup->values[keyno].hasnulls = true;
+ 				need_insert = true;
+ 			}
+ 			continue;
+ 		}
+ 
+ 		/*
+ 		 * If the new key value is not within the min/max interval for this
+ 		 * range, we need a new summary tuple
+ 		 */
+ 		if (invoke_mm_operator(&lt[keyno], InvalidOid, values[keyno],
+ 							   dtup->values[keyno].min))
+ 		{
+ 			dtup->values[keyno].min = values[keyno];	/* XXX datumCopy? */
+ 			need_insert = true;
+ 		}
+ 		if (invoke_mm_operator(&gt[keyno], InvalidOid, values[keyno],
+ 							   dtup->values[keyno].max))
+ 		{
+ 			dtup->values[keyno].max = values[keyno];	/* XXX datumCopy? */
+ 			need_insert = true;
+ 		}
+ 	}
+ 
+ 	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ 
+ 	if (need_insert)
+ 	{
+ 		TupleDesc	diskDesc;
+ 		Size		tupsz;
+ 		MMTuple    *tup;
+ 
+ 		diskDesc = minmax_get_descr(tupdesc);
+ 		tup = minmax_form_tuple(tupdesc, diskDesc, dtup, &tupsz);
+ 
+ 		mm_doinsert(idxRel, rmAccess, &buf, heapBlk, tup, tupsz);
+ 	}
+ 
+ 	ReleaseBuffer(buf);
+ 
+ 	mmRevmapAccessTerminate(rmAccess);
+ 
+ 	return BoolGetDatum(false);
+ }
+ 
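+ /*
+  * Begin an index scan.  There is no minmax-specific state to initialize, so
+  * the generic scan descriptor is all we need.
+  */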
+ Datum
+ mmbeginscan(PG_FUNCTION_ARGS)
+ {
+ 	Relation	r = (Relation) PG_GETARG_POINTER(0);
+ 	int			nkeys = PG_GETARG_INT32(1);
+ 	int			norderbys = PG_GETARG_INT32(2);
+ 	IndexScanDesc scan;
+ 
+ 	scan = RelationGetIndexScan(r, nkeys, norderbys);
+ 
+ 	PG_RETURN_POINTER(scan);
+ }
+ 
+ 
+ /*
+  * Execute the index scan.
+  *
+  * This works by reading index TIDs from the revmap, and obtaining the index
+  * tuples pointed to by them; the min/max values in them are compared to the
+  * scan keys.  We return into the TID bitmap all the pages in ranges
+  * corresponding to index tuples that match the scan keys.
+  *
+  * If a TID from the revmap is read as InvalidTID, we know that range is
+  * unsummarized.  Pages in those ranges need to be returned regardless of scan
+  * keys.
+  */
+ Datum
+ mmgetbitmap(PG_FUNCTION_ARGS)
+ {
+ 	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ 	TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
+ 	Relation	idxRel = scan->indexRelation;
+ 	Buffer		currIdxBuf = InvalidBuffer;
+ 	Oid			heapOid;
+ 	Relation	heapRel;
+ 	mmRevmapAccess *rmAccess;
+ 	BlockNumber nblocks;
+ 	BlockNumber heapBlk;
+ 	TupleDesc	tupdesc;
+ 	AttrNumber	keyno;
+ 	Datum		indclassDatum;
+ 	bool		isnull;
+ 	oidvector  *indclass;
+ 	FmgrInfo   *lt;
+ 	FmgrInfo   *lteq;
+ 	FmgrInfo   *gteq;
+ 	FmgrInfo   *gt;
+ 
+ 	pgstat_count_index_scan(idxRel);
+ 
+ 	heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
+ 	heapRel = heap_open(heapOid, AccessShareLock);
+ 	nblocks = RelationGetNumberOfBlocks(heapRel);
+ 	heap_close(heapRel, AccessShareLock);
+ 
+ 	tupdesc = RelationGetDescr(idxRel);
+ 
+ 	lt = palloc(sizeof(FmgrInfo) * scan->numberOfKeys);
+ 	lteq = palloc(sizeof(FmgrInfo) * scan->numberOfKeys);
+ 	gteq = palloc(sizeof(FmgrInfo) * scan->numberOfKeys);
+ 	gt = palloc(sizeof(FmgrInfo) * scan->numberOfKeys);
+ 
+ 	/*
+ 	 * lookup the operators needed to determine range containment of each key
+ 	 * value.
+ 	 */
+ 	indclassDatum = SysCacheGetAttr(INDEXRELID, idxRel->rd_indextuple,
+ 									Anum_pg_index_indclass, &isnull);
+ 	Assert(!isnull);
+ 	indclass = (oidvector *) DatumGetPointer(indclassDatum);
+ 	for (keyno = 0; keyno < scan->numberOfKeys; keyno++)
+ 	{
+ 		AttrNumber	keyattno;
+ 		Oid			opfam;
+ 		Oid			keytypid;
+ 		Oid			idxtypid;
+ 
+ 		keyattno = scan->keyData[keyno].sk_attno;
+ 		opfam = get_opclass_family(indclass->values[keyattno - 1]);
+ 		keytypid = scan->keyData[keyno].sk_subtype;
+ 		idxtypid = tupdesc->attrs[keyattno - 1]->atttypid;
+ 
+ 		get_mm_operator(opfam, idxtypid, keytypid, BTLessStrategyNumber,
+ 						&lt[keyno]);
+ 		get_mm_operator(opfam, idxtypid, keytypid, BTLessEqualStrategyNumber,
+ 						&lteq[keyno]);
+ 		get_mm_operator(opfam, idxtypid, keytypid, BTGreaterStrategyNumber,
+ 						&gt[keyno]);
+ 		get_mm_operator(opfam, idxtypid, keytypid, BTGreaterEqualStrategyNumber,
+ 						&gteq[keyno]);
+ 	}
+ 
+ 	/*
+ 	 * Now scan the revmap.  We start by querying for heap page 0,
+ 	 * incrementing by the number of pages per range; this gives us a full
+ 	 * view of the table.
+ 	 */
+ 	rmAccess = mmRevmapAccessInit(idxRel, MINMAX_PAGES_PER_RANGE);
+ 	for (heapBlk = 0; heapBlk < nblocks; heapBlk += MINMAX_PAGES_PER_RANGE)
+ 	{
+ 		ItemPointerData itupptr;
+ 		bool		addrange;
+ 
+ 		mmGetHeapBlockItemptr(rmAccess, heapBlk, &itupptr);
+ 
+ 		/*
+ 		 * For revmap items that return InvalidTID, we must return the whole
+ 		 * range; otherwise, fetch the index item and compare it to the scan
+ 		 * keys.
+ 		 */
+ 		if (!ItemPointerIsValid(&itupptr))
+ 		{
+ 			addrange = true;
+ 		}
+ 		else
+ 		{
+ 			Page		page;
+ 			OffsetNumber idxoffno;
+ 			BlockNumber idxblkno;
+ 			MMTuple    *tup;
+ 			DeformedMMTuple *dtup;
+ 			int			keyno;
+ 
+ 			idxoffno = ItemPointerGetOffsetNumber(&itupptr);
+ 			idxblkno = ItemPointerGetBlockNumber(&itupptr);
+ 
+ 			if (currIdxBuf == InvalidBuffer ||
+ 				idxblkno != BufferGetBlockNumber(currIdxBuf))
+ 			{
+ 				if (currIdxBuf != InvalidBuffer)
+ 					ReleaseBuffer(currIdxBuf);
+ 
+ 				currIdxBuf = ReadBuffer(idxRel, idxblkno);
+ 			}
+ 
+ 			/*
+ 			 * To keep the buffer locked for a short time, we grab and
+ 			 * immediately deform the index tuple to operate on.  As soon as
+ 			 * we have acquired the lock on the index buffer, we can release
+ 			 * the tuple lock the revmap acquired for us.  So vacuum would be
+ 			 * able to remove this index row as soon as we release the buffer
+ 			 * lock, if it has become stale.
+ 			 */
+ 			LockBuffer(currIdxBuf, BUFFER_LOCK_SHARE);
+ 
+ 			UnlockTuple(idxRel, &itupptr, ShareLock);
+ 
+ 			page = BufferGetPage(currIdxBuf);
+ 			tup = (MMTuple *)
+ 				PageGetItem(page, PageGetItemId(page, idxoffno));
+ 			/* XXX probably need copies */
+ 			dtup = minmax_deform_tuple(tupdesc, tup);
+ 
+ 			/* done with the index page */
+ 			LockBuffer(currIdxBuf, BUFFER_LOCK_UNLOCK);
+ 
+ 			/*
+ 			 * Compare scan keys with min/max values stored in range.  If scan
+ 			 * keys are matched, the page range must be added to the bitmap.
+ 			 */
+ 			for (keyno = 0, addrange = true;
+ 				 keyno < scan->numberOfKeys;
+ 				 keyno++)
+ 			{
+ 				ScanKey		key = &scan->keyData[keyno];
+ 				AttrNumber	keyattno = key->sk_attno;
+ 
+ 				/*
+ 				 * The analysis we need to make to decide whether to include a
+ 				 * page range in the output result is: is it possible for a
+ 				 * tuple contained within the min/max interval specified by
+ 				 * this index tuple to match what's specified by the scan key?
+ 				 * For example, for a query qual such as "WHERE col < 10" we
+ 				 * need to include a range whose minimum value is less than
+ 				 * 10.
+ 				 *
+ 				 * When there are multiple scan keys, failure to meet the
+ 				 * criteria for a single one of them is enough to discard the
+ 				 * range as a whole.
+ 				 */
+ 				switch (key->sk_strategy)
+ 				{
+ 					case BTLessStrategyNumber:
+ 						addrange =
+ 							invoke_mm_operator(&lt[keyno], InvalidOid,
+ 											   dtup->values[keyattno - 1].min,
+ 											   key->sk_argument);
+ 						break;
+ 					case BTLessEqualStrategyNumber:
+ 						addrange =
+ 							invoke_mm_operator(&lteq[keyno], InvalidOid,
+ 											   dtup->values[keyattno - 1].min,
+ 											   key->sk_argument);
+ 						break;
+ 					case BTEqualStrategyNumber:
+ 
+ 						/*
+ 						 * In the equality case (WHERE col = someval), we want
+ 						 * to return the current page range if the minimum
+ 						 * value in the range <= scan key, and the maximum
+ 						 * value >= scan key.
+ 						 */
+ 						addrange =
+ 							invoke_mm_operator(&lteq[keyno], InvalidOid,
+ 											   dtup->values[keyattno - 1].min,
+ 											   key->sk_argument);
+ 						if (!addrange)
+ 							break;
+ 						/* max() >= scankey */
+ 						addrange =
+ 							invoke_mm_operator(&gteq[keyno], InvalidOid,
+ 											   dtup->values[keyattno - 1].max,
+ 											   key->sk_argument);
+ 						break;
+ 					case BTGreaterEqualStrategyNumber:
+ 						addrange =
+ 							invoke_mm_operator(&gteq[keyno], InvalidOid,
+ 											   dtup->values[keyattno - 1].max,
+ 											   key->sk_argument);
+ 						break;
+ 					case BTGreaterStrategyNumber:
+ 						addrange =
+ 							invoke_mm_operator(&gt[keyno], InvalidOid,
+ 											   dtup->values[keyattno - 1].max,
+ 											   key->sk_argument);
+ 						break;
+ 				}
+ 
+ 				/*
+ 				 * If the current scan key doesn't match the range values,
+ 				 * don't look at further ones.
+ 				 */
+ 				if (!addrange)
+ 					break;
+ 			}
+ 
+ 			/* XXX anything to free here? */
+ 		}
+ 
+ 		if (addrange)
+ 		{
+ 			BlockNumber pageno;
+ 
+ 			for (pageno = heapBlk;
+ 				 pageno <= heapBlk + MINMAX_PAGES_PER_RANGE - 1;
+ 				 pageno++)
+ 				tbm_add_page(tbm, pageno);
+ 		}
+ 	}
+ 
+ 	mmRevmapAccessTerminate(rmAccess);
+ 	if (currIdxBuf != InvalidBuffer)
+ 		ReleaseBuffer(currIdxBuf);
+ 
+ 	pfree(lt);
+ 	pfree(lteq);
+ 	pfree(gt);
+ 	pfree(gteq);
+ 
+ 	PG_RETURN_INT64(MaxHeapTuplesPerPage);
+ }
+ 
+ 
+ Datum
+ mmrescan(PG_FUNCTION_ARGS)
+ {
+ 	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ 	ScanKey		scankey = (ScanKey) PG_GETARG_POINTER(1);
+ 
+ 	/* other arguments ignored */
+ 
+ 	if (scankey && scan->numberOfKeys > 0)
+ 	{
+ 		memmove(scan->keyData, scankey,
+ 				scan->numberOfKeys * sizeof(ScanKeyData));
+ 	}
+ 
+ 	PG_RETURN_VOID();
+ }
+ 
+ Datum
+ mmendscan(PG_FUNCTION_ARGS)
+ {
+ 	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ 
+ 	/* anything to do here? */
+ 	(void) scan;	/* silence compiler */
+ 
+ 	PG_RETURN_VOID();
+ }
+ 
+ Datum
+ mmmarkpos(PG_FUNCTION_ARGS)
+ {
+ 	elog(ERROR, "MinMax does not support mark/restore");
+ 	PG_RETURN_VOID();
+ }
+ 
+ Datum
+ mmrestrpos(PG_FUNCTION_ARGS)
+ {
+ 	elog(ERROR, "MinMax does not support mark/restore");
+ 	PG_RETURN_VOID();
+ }
+ 
+ /*
+  * Reset the per-column build state in an MMBuildState.
+  */
+ static void
+ clear_mm_percol_buildstate(MMBuildState *mmstate)
+ {
+ 	int			i;
+ 
+ 	for (i = 0; i < mmstate->indexDesc->natts; i++)
+ 	{
+ 		mmstate->dtuple->values[i].allnulls = true;
+ 		mmstate->dtuple->values[i].hasnulls = false;
+ 		mmstate->dtuple->values[i].min = (Datum) 0;
+ 		mmstate->dtuple->values[i].max = (Datum) 0;
+ 	}
+ }
+ 
+ /*
+  * Per-heap-tuple callback for IndexBuildHeapScan.
+  *
+  * Note we don't worry about the page range at the end of the table here; they
+  * are present in the build state struct but not inserted into the index.
+  * Caller must ensure to do so, if appropriate.
+  */
+ static void
+ mmbuildCallback(Relation index,
+ 				HeapTuple htup,
+ 				Datum *values,
+ 				bool *isnull,
+ 				bool tupleIsAlive,
+ 				void *state)
+ {
+ 	MMBuildState *mmstate = (MMBuildState *) state;
+ 	BlockNumber thisblock;
+ 	int			i;
+ 
+ 	thisblock = ItemPointerGetBlockNumber(&htup->t_self);
+ 
+ 	/*
+ 	 * If we're in a new block which belongs to the next range, summarize what
+ 	 * we've got and start afresh.
+ 	 */
+ 	if (thisblock == mmstate->nextRangeAt)
+ 	{
+ 		MMTuple    *tup;
+ 		Size		size;
+ 
+ #if 0
+ 		for (i = 0; i < mmstate->indexDesc->natts; i++)
+ 		{
+ 			elog(DEBUG2, "completed a range for column %d, range: %u .. %u",
+ 				 i,
+ 				 DatumGetUInt32(mmstate->dtuple->values[i].min),
+ 				 DatumGetUInt32(mmstate->dtuple->values[i].max));
+ 		}
+ #endif
+ 
+ 		/*
+ 		 * Create the index tuple containing min/max values, and insert it.
+ 		 */
+ 		tup = minmax_form_tuple(mmstate->indexDesc, mmstate->diskDesc,
+ 								mmstate->dtuple, &size);
+ 		mm_doinsert(mmstate->irel, mmstate->rmAccess,
+ 					&mmstate->currentInsertBuf, mmstate->currRangeStart, tup,
+ 					size);
+ 		mmstate->numtuples++;
+ 		pfree(tup);
+ 
+ 		/* and set state to correspond to the new current range */
+ 		mmstate->currRangeStart = mmstate->nextRangeAt;
+ 		mmstate->nextRangeAt = mmstate->currRangeStart + MINMAX_PAGES_PER_RANGE;
+ 
+ 		/* initialize aggregate state for the new range */
+ 		for (i = 0; i < mmstate->indexDesc->natts; i++)
+ 		{
+ 			if (!mmstate->dtuple->values[i].allnulls &&
+ 				!mmstate->perColState[i].typByVal)
+ 			{
+ 				pfree(DatumGetPointer(mmstate->dtuple->values[i].min));
+ 				pfree(DatumGetPointer(mmstate->dtuple->values[i].max));
+ 			}
+ 		}
+ 
+ 		clear_mm_percol_buildstate(mmstate);
+ 	}
+ 
+ 	/* Accumulate the current tuple into the running state */
+ 	for (i = 0; i < mmstate->indexDesc->natts; i++)
+ 	{
+ 		AttrNumber	heapAttno = mmstate->perColState[i].heapAttno;
+ 
+ 		/*
+ 		 * If the value in the current heap tuple is null, there's not much to
+ 		 * do other than keep track that we saw it.
+ 		 */
+ 		if (isnull[heapAttno - 1])
+ 		{
+ 			mmstate->dtuple->values[i].hasnulls = true;
+ 			continue;
+ 		}
+ 
+ 		/*
+ 		 * If this is the first tuple in the range containing a not-null value
+ 		 * for this column, initialize our state.
+ 		 */
+ 		if (mmstate->dtuple->values[i].allnulls)
+ 		{
+ 			mmstate->dtuple->values[i].allnulls = false;
+ 			mmstate->dtuple->values[i].min =
+ 				datumCopy(values[heapAttno - 1],
+ 						  mmstate->perColState[i].typByVal,
+ 						  mmstate->perColState[i].typLen);
+ 			mmstate->dtuple->values[i].max =
+ 				datumCopy(values[heapAttno - 1],
+ 						  mmstate->perColState[i].typByVal,
+ 						  mmstate->perColState[i].typLen);
+ 			continue;
+ 		}
+ 
+ 		/*
+ 		 * Otherwise, dtuple state was already initialized, and the current
+ 		 * tuple is not null: therefore we need to compare it to the current
+ 		 * state and possibly update the min/max boundaries.
+ 		 */
+ 		if (invoke_mm_operator(&mmstate->perColState[i].lt, InvalidOid,
+ 							   values[heapAttno - 1],
+ 							   mmstate->dtuple->values[i].min))
+ 		{
+ 			if (!mmstate->perColState[i].typByVal)
+ 				pfree(DatumGetPointer(mmstate->dtuple->values[i].min));
+ 			mmstate->dtuple->values[i].min =
+ 				datumCopy(values[heapAttno - 1],
+ 						  mmstate->perColState[i].typByVal,
+ 						  mmstate->perColState[i].typLen);
+ 		}
+ 
+ 		if (invoke_mm_operator(&mmstate->perColState[i].gt, InvalidOid,
+ 							   values[heapAttno - 1],
+ 							   mmstate->dtuple->values[i].max))
+ 		{
+ 			if (!mmstate->perColState[i].typByVal)
+ 				pfree(DatumGetPointer(mmstate->dtuple->values[i].max));
+ 			mmstate->dtuple->values[i].max =
+ 				datumCopy(values[heapAttno - 1],
+ 						  mmstate->perColState[i].typByVal,
+ 						  mmstate->perColState[i].typLen);
+ 		}
+ 	}
+ }
+ 
+ static MMBuildState *
+ initialize_mm_buildstate(Relation heapRel, Relation idxRel,
+ 						 mmRevmapAccess *rmAccess, IndexInfo *indexInfo)
+ {
+ 	MMBuildState *mmstate;
+ 	TupleDesc	heapDesc = RelationGetDescr(heapRel);
+ 	Datum		indclassDatum;
+ 	bool		isnull;
+ 	oidvector  *indclass;
+ 	int			i;
+ 
+ 	mmstate = palloc(offsetof(MMBuildState, perColState) +
+ 					 sizeof(MMPerColBuildInfo) * indexInfo->ii_NumIndexAttrs);
+ 
+ 	mmstate->irel = idxRel;
+ 	mmstate->numtuples = 0;
+ 	mmstate->currentInsertBuf = InvalidBuffer;
+ 	mmstate->currRangeStart = 0;
+ 	mmstate->nextRangeAt = MINMAX_PAGES_PER_RANGE;
+ 	mmstate->rmAccess = rmAccess;
+ 	mmstate->indexDesc = RelationGetDescr(idxRel);
+ 	mmstate->diskDesc = minmax_get_descr(mmstate->indexDesc);
+ 
+ 	mmstate->dtuple = palloc(offsetof(DeformedMMTuple, values) +
+ 							 sizeof(MMValues) * indexInfo->ii_NumIndexAttrs);
+ 	/* other stuff in dtuple is initialized below */
+ 
+ 	indclassDatum = SysCacheGetAttr(INDEXRELID, idxRel->rd_indextuple,
+ 									Anum_pg_index_indclass, &isnull);
+ 	Assert(!isnull);
+ 	indclass = (oidvector *) DatumGetPointer(indclassDatum);
+ 
+ 	for (i = 0; i < mmstate->indexDesc->natts; i++)
+ 	{
+ 		int			heapAttno;
+ 		Form_pg_attribute attr;
+ 		Oid			opfam = get_opclass_family(indclass->values[i]);
+ 		Oid			idxtypid = mmstate->indexDesc->attrs[i]->atttypid;
+ 
+ 		heapAttno = indexInfo->ii_KeyAttrNumbers[i];
+ 		if (heapAttno == 0)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ 					 errmsg("cannot create minmax indexes on expressions")));
+ 
+ 		attr = heapDesc->attrs[heapAttno - 1];
+ 		mmstate->perColState[i].heapAttno = heapAttno;
+ 		mmstate->perColState[i].typByVal = attr->attbyval;
+ 		mmstate->perColState[i].typLen = attr->attlen;
+ 		get_mm_operator(opfam, idxtypid, idxtypid, BTLessStrategyNumber,
+ 						&(mmstate->perColState[i].lt));
+ 		get_mm_operator(opfam, idxtypid, idxtypid, BTGreaterStrategyNumber,
+ 						&(mmstate->perColState[i].gt));
+ 
+ 		/* initialize per-column state */
+ 	}
+ 
+ 	clear_mm_percol_buildstate(mmstate);
+ 
+ 	return mmstate;
+ }
+ 
+ void
+ mm_init_metapage(Buffer meta)
+ {
+ 	MinmaxMetaPageData	*metadata;
+ 	Page		page = BufferGetPage(meta);
+ 
+ 	PageInit(page, BLCKSZ, 0);
+ 
+ 	metadata = (MinmaxMetaPageData *) PageGetContents(page);
+ 
+ 	metadata->minmaxMagic = MINMAX_META_MAGIC;
+ 	metadata->minmaxVersion = MINMAX_CURRENT_VERSION;
+ }
+ 
+ /*
+  * mmbuild() -- build a new minmax index.
+  */
+ Datum
+ mmbuild(PG_FUNCTION_ARGS)
+ {
+ 	Relation	heap = (Relation) PG_GETARG_POINTER(0);
+ 	Relation	index = (Relation) PG_GETARG_POINTER(1);
+ 	IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
+ 	IndexBuildResult *result;
+ 	double		reltuples;
+ 	mmRevmapAccess *rmAccess;
+ 	MMBuildState *mmstate;
+ 	Buffer		meta;
+ 
+ 	/*
+ 	 * We expect to be called exactly once for any index relation.
+ 	 */
+ 	if (RelationGetNumberOfBlocks(index) != 0)
+ 		elog(ERROR, "index \"%s\" already contains data",
+ 			 RelationGetRelationName(index));
+ 
+ 	/* partial indexes not supported */
+ 	if (indexInfo->ii_Predicate != NIL)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ 				 errmsg("partial indexes not supported")));
+ 	/* expressions not supported (yet?) */
+ 	if (indexInfo->ii_Expressions != NIL)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ 				 errmsg("expression indexes not supported")));
+ 
+ 	meta = mm_getnewbuffer(index);
+ 	START_CRIT_SECTION();
+ 	mm_init_metapage(meta);
+ 	MarkBufferDirty(meta);
+ 
+ 	if (RelationNeedsWAL(index))
+ 	{
+ 		XLogRecPtr	recptr;
+ 		XLogRecData	rdata;
+ 		Page		page;
+ 
+ 		rdata.buffer = InvalidBuffer;
+ 		rdata.data = (char *) &(index->rd_node);
+ 		rdata.len = sizeof(RelFileNode);
+ 		rdata.next = NULL;
+ 
+ 		recptr = XLogInsert(RM_MINMAX_ID, XLOG_MINMAX_CREATE_INDEX, &rdata);
+ 
+ 		page = BufferGetPage(meta);
+ 		PageSetLSN(page, recptr);
+ 	}
+ 
+ 	UnlockReleaseBuffer(meta);
+ 	END_CRIT_SECTION();
+ 
+ 	/* set up our "reverse map" fork */
+ 	mmRevmapCreate(index);
+ 
+ 	/*
+ 	 * Initialize our state, including the deformed tuple state.
+ 	 */
+ 	rmAccess = mmRevmapAccessInit(index, MINMAX_PAGES_PER_RANGE);
+ 	mmstate = initialize_mm_buildstate(heap, index, rmAccess, indexInfo);
+ 
+ 	/*
+ 	 * Now scan the relation.  No syncscan allowed here because we want the
+ 	 * heap blocks in order
+ 	 */
+ 	reltuples = IndexBuildHeapScan(heap, index, indexInfo, false,
+ 								   mmbuildCallback, (void *) mmstate);
+ 
+ 	/* XXX process the final batch, if needed */
+ 
+ 
+ 	/* release the last index buffer used */
+ 	if (!BufferIsInvalid(mmstate->currentInsertBuf))
+ 	{
+ 		ReleaseBuffer(mmstate->currentInsertBuf);
+ 		mmstate->currentInsertBuf = InvalidBuffer;
+ 	}
+ 
+ 	mmRevmapAccessTerminate(mmstate->rmAccess);
+ 
+ 	/*
+ 	 * Return statistics
+ 	 */
+ 	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+ 
+ 	result->heap_tuples = reltuples;
+ 	result->index_tuples = mmstate->numtuples;
+ 
+ 	PG_RETURN_POINTER(result);
+ }
+ 
+ Datum
+ mmbuildempty(PG_FUNCTION_ARGS)
+ {
+ 	ereport(ERROR,
+ 			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ 			 errmsg("unlogged MinMax indexes are not supported")));
+ 
+ 	PG_RETURN_VOID();
+ }
+ 
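+ /*
+  * Since a minmax index stores no heap TIDs, there is nothing to do when heap
+  * tuples are removed; stale index tuples are instead reclaimed during
+  * amvacuumcleanup.
+  */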
+ Datum
+ mmbulkdelete(PG_FUNCTION_ARGS)
+ {
+ 	PG_RETURN_POINTER(NULL);
+ }
+ 
+ /*
+  * qsort comparator for ItemPointerData items
+  */
+ static int
+ qsortCompareItemPointers(const void *a, const void *b)
+ {
+ 	return ItemPointerCompare((ItemPointer) a, (ItemPointer) b);
+ }
+ 
+ /*
+  * Remove index tuples that are no longer useful.
+  *
+  * While at it, return an array of block numbers for which the revmap returns
+  * InvalidTid; this is used in a later stage to execute re-summarization.
+  * (The block numbers correspond to the start heap page numbers with which each
+  * unsummarized range starts.)	Space for the array is palloc'ed, and must be
+  * freed by caller.
+  */
+ static void
+ remove_deletable_tuples(Relation idxRel, BlockNumber heapNumBlocks,
+ 						BufferAccessStrategy strategy,
+ 						BlockNumber **nonsummed, int *numnonsummed)
+ {
+ 	HASHCTL		hctl;
+ 	HTAB	   *tuples;
+ 	HASH_SEQ_STATUS status;
+ 	MemoryContext hashcxt;
+ 	BlockNumber nblocks;
+ 	BlockNumber blk;
+ 	mmRevmapAccess *rmAccess;
+ 	BlockNumber heapBlk;
+ 	int			numitems = 0;
+ 	int			numdeletable = 0;
+ 	ItemPointerData *deletable;
+ 	int			start;
+ 	int			i;
+ 	BlockNumber *nonsumm = NULL;
+ 	int			maxnonsumm = 0;
+ 	int			numnonsumm = 0;
+ 
+ 	typedef struct DeletableTuple
+ 	{
+ 		ItemPointerData tid;
+ 		bool		referenced;
+ 	} DeletableTuple;
+ 
+ 	nblocks = RelationGetNumberOfBlocks(idxRel);
+ 
+ 	hashcxt = AllocSetContextCreate(CurrentMemoryContext,
+ 									"mm remove deletable hash",
+ 									ALLOCSET_SMALL_MINSIZE,
+ 									ALLOCSET_SMALL_INITSIZE,
+ 									ALLOCSET_SMALL_MAXSIZE);
+ 
+ 	/* Initialize hash used to track deletable tuples */
+ 	memset(&hctl, 0, sizeof(hctl));
+ 	hctl.keysize = sizeof(ItemPointerData);
+ 	hctl.entrysize = sizeof(DeletableTuple);
+ 	hctl.hcxt = hashcxt;
+ 	hctl.hash = tag_hash;
+ 
+ 	/* assume ten entries per page.  No harm in getting this wrong */
+ 	tuples = hash_create("mmvacuumcleanup", nblocks * 10, &hctl,
+ 						 HASH_CONTEXT | HASH_FUNCTION | HASH_ELEM);
+ 
+ 	/*
+ 	 * Scan the index sequentially, entering each item into a hash table.
+ 	 * Initially, the items are marked as not referenced.
+ 	 */
+ 	for (blk = 0; blk < nblocks; blk++)
+ 	{
+ 		Buffer		buf;
+ 		Page		page;
+ 		OffsetNumber offno;
+ 
+ 		vacuum_delay_point();
+ 
+ 		buf = ReadBufferExtended(idxRel, MAIN_FORKNUM, blk, RBM_NORMAL,
+ 								 strategy);
+ 		LockBuffer(buf, BUFFER_LOCK_SHARE);
+ 		page = BufferGetPage(buf);
+ 
+ 		for (offno = 1; offno <= PageGetMaxOffsetNumber(page); offno++)
+ 		{
+ 			ItemPointerData tid;
+ 			ItemId		itemid;
+ 			bool		found;
+ 			DeletableTuple *hitem;
+ 
+ 			itemid = PageGetItemId(page, offno);
+ 			if (!ItemIdHasStorage(itemid))
+ 				continue;
+ 
+ 			ItemPointerSet(&tid, blk, offno);
+ 			hitem = (DeletableTuple *) hash_search(tuples,
+ 												   &tid,
+ 												   HASH_ENTER,
+ 												   &found);
+ 			Assert(!found);
+ 			hitem->referenced = false;
+ 		}
+ 		UnlockReleaseBuffer(buf);
+ 	}
+ 
+ 	/*
+ 	 * now scan the revmap, and determine which of these TIDs are still
+ 	 * referenced
+ 	 */
+ 	rmAccess = mmRevmapAccessInit(idxRel, MINMAX_PAGES_PER_RANGE);
+ 	for (heapBlk = 0, numitems = 0;
+ 		 heapBlk < heapNumBlocks;
+ 		 heapBlk += MINMAX_PAGES_PER_RANGE)
+ 	{
+ 		ItemPointerData itupptr;
+ 		DeletableTuple *hitem;
+ 		bool		found;
+ 
+ 		mmGetHeapBlockItemptr(rmAccess, heapBlk, &itupptr);
+ 
+ 		if (!ItemPointerIsValid(&itupptr))
+ 		{
+ 			/*
+ 			 * Ignore revmap entries set to invalid.  However, if the heap page
+ 			 * range is complete but not summarized, store its initial page
+ 			 * number in the unsummarized array, for later summarization.
+ 			 */
+ 			if (heapBlk + MINMAX_PAGES_PER_RANGE < heapNumBlocks)
+ 			{
+ 				if (maxnonsumm == 0)
+ 				{
+ 					Assert(!nonsumm);
+ 					maxnonsumm = 8;
+ 					nonsumm = palloc(sizeof(BlockNumber) * maxnonsumm);
+ 				}
+ 				else if (numnonsumm >= maxnonsumm)
+ 				{
+ 					maxnonsumm *= 2;
+ 					nonsumm = repalloc(nonsumm, sizeof(BlockNumber) * maxnonsumm);
+ 				}
+ 
+ 				nonsumm[numnonsumm++] = heapBlk;
+ 			}
+ 
+ 			continue;
+ 		}
+ 
+ 		hitem = (DeletableTuple *) hash_search(tuples,
+ 											   &itupptr,
+ 											   HASH_FIND,
+ 											   &found);
+ 		if (!found)
+ 			elog(ERROR, "reverse map references nonexistent index tuple %u/%u",
+ 				 ItemPointerGetBlockNumber(&itupptr),
+ 				 ItemPointerGetOffsetNumber(&itupptr));
+ 		hitem->referenced = true;
+ 		numitems++;
+ 	}
+ 
+ 	mmRevmapAccessTerminate(rmAccess);
+ 
+ 	/*
+ 	 * Now scan the hash, and keep track of the removable (i.e. not referenced,
+ 	 * not locked) tuples.  Allocate this in the hash context, so that it goes
+ 	 * away with it.
+ 	 */
+ 	deletable = MemoryContextAlloc(hashcxt,
+ 					sizeof(ItemPointerData) * hash_get_num_entries(tuples));
+ 
+ 	hash_freeze(tuples);
+ 	hash_seq_init(&status, tuples);
+ 	for (;;)
+ 	{
+ 		DeletableTuple *hitem;
+ 
+ 		hitem = hash_seq_search(&status);
+ 		if (!hitem)
+ 			break;
+ 		if (hitem->referenced)
+ 			continue;
+ 		if (!ConditionalLockTuple(idxRel, &hitem->tid, ExclusiveLock))
+ 			continue;
+ 
+ 		/*
+ 		 * By here, we know this tuple is not referenced from the revmap.
+ 		 * Also, since we hold the tuple lock, we know that if there is a
+ 		 * concurrent scan that had obtained the tuple before the reference
+ 		 * got removed, either that scan is not looking at the tuple (because
+ 		 * that would have prevented us from getting the tuple lock) or it is
+ 		 * holding the containing buffer's lock.  If the former, then there's
+ 		 * no problem with removing the tuple immediately; if the latter, we
+ 		 * will block below trying to acquire that lock, so by the time we are
+ 		 * unblocked, the concurrent scan will no longer be interested in the
+ 		 * tuple contents.  Therefore, this tuple can be removed from the
+ 		 * page.
+ 		 */
+ 		UnlockTuple(idxRel, &hitem->tid, ExclusiveLock);
+ 
+ 		deletable[numdeletable++] = hitem->tid;
+ 	}
+ 
+ 	/*
+ 	 * Now sort the array of deletable index tuples, and walk this array by
+ 	 * pages doing bulk deletion of items on each page; the free space map is
+ 	 * updated for pages on which we delete items.
+ 	 */
+ 	qsort(deletable, numdeletable, sizeof(ItemPointerData),
+ 		  qsortCompareItemPointers);
+ 
+ 	start = 0;
+ 	for (i = 0; i < numdeletable; i++)
+ 	{
+ 		if (i == numdeletable - 1 ||
+ 			(ItemPointerGetBlockNumber(&deletable[start]) !=
+ 			 ItemPointerGetBlockNumber(&deletable[i + 1])))
+ 		{
+ 			OffsetNumber *offnos;
+ 			int			noffs;
+ 			Buffer		buf;
+ 			Page		page;
+ 			int			j;
+ 			BlockNumber	blk;
+ 
+ 			vacuum_delay_point();
+ 
+ 			blk = ItemPointerGetBlockNumber(&deletable[start]);
+ 			buf = ReadBufferExtended(idxRel, MAIN_FORKNUM, blk,
+ 									 RBM_NORMAL, strategy);
+ 			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ 			page = BufferGetPage(buf);
+ 
+ 			noffs = i + 1 - start;
+ 			offnos = palloc(sizeof(OffsetNumber) * noffs);
+ 			for (j = 0; j < noffs; j++)
+ 				offnos[j] = ItemPointerGetOffsetNumber(&deletable[start + j]);
+ 
+ 			START_CRIT_SECTION();
+ 
+ 			PageIndexDeleteNoCompact(page, offnos, noffs);
+ 
+ 			MarkBufferDirty(buf);
+ 
+ 			/* XLOG stuff */
+ 			if (RelationNeedsWAL(idxRel))
+ 			{
+ 				xl_minmax_bulkremove	xlrec;
+ 				XLogRecPtr	recptr;
+ 				XLogRecData	rdata[2];
+ 				uint8		info = XLOG_MINMAX_BULKREMOVE;
+ 
+ 				xlrec.node = idxRel->rd_node;
+ 				xlrec.block = blk;
+ 				rdata[0].data = (char *) &xlrec;
+ 				rdata[0].len = SizeOfMinmaxBulkRemove;
+ 				rdata[0].buffer = InvalidBuffer;
+ 				rdata[0].next = &(rdata[1]);
+ 
+ 				/*
+ 				 * The OffsetNumber array is not actually in the buffer, but we
+ 				 * pretend that it is.  When XLogInsert stores the whole
+ 				 * buffer, the offset array need not be stored too.
+ 				 */
+ 				rdata[1].data = (char *) offnos;
+ 				rdata[1].len = sizeof(OffsetNumber) * noffs;
+ 				rdata[1].buffer = buf;
+ 				rdata[1].buffer_std = true;
+ 				rdata[1].next = NULL;
+ 
+ 				recptr = XLogInsert(RM_MINMAX_ID, info, rdata);
+ 
+ 				PageSetLSN(page, recptr);
+ 			}
+ 
+ 			END_CRIT_SECTION();
+ 
+ 			RecordPageWithFreeSpace(idxRel, blk, PageGetFreeSpace(page));
+ 
+ 			start = i + 1;
+ 
+ 			UnlockReleaseBuffer(buf);
+ 			pfree(offnos);
+ 		}
+ 	}
+ 
+ 	/* Finally, ensure the index's FSM is consistent */
+ 	FreeSpaceMapVacuum(idxRel);
+ 
+ 	*nonsummed = nonsumm;
+ 	*numnonsummed = numnonsumm;
+ 
+ 	hash_destroy(tuples);
+ }
+ 
+ /*
+  * Summarize the given page ranges of the given index.
+  */
+ static void
+ rerun_summarization(Relation idxRel, Relation heapRel, mmRevmapAccess *rmAccess,
+ 					BlockNumber *nonsummarized, int numnonsummarized)
+ {
+ 	int			i;
+ 	IndexInfo  *indexInfo;
+ 	MMBuildState *mmstate;
+ 
+ 	indexInfo = BuildIndexInfo(idxRel);
+ 
+ 	mmstate = initialize_mm_buildstate(heapRel, idxRel, rmAccess, indexInfo);
+ 
+ 	for (i = 0; i < numnonsummarized; i++)
+ 	{
+ 		BlockNumber blk = nonsummarized[i];
+ 		ItemPointerData iptr;
+ 		MMTuple    *tup;
+ 		Size		size;
+ 
+ 		mmGetHeapBlockItemptr(rmAccess, blk, &iptr);
+ 
+ 		mmstate->currRangeStart = blk;
+ 		mmstate->nextRangeAt = blk + MINMAX_PAGES_PER_RANGE;
+ 
+ 		/* it can't have been re-summarized concurrently ... */
+ 		Assert(!ItemPointerIsValid(&iptr));
+ 
+ 		IndexBuildHeapRangeScan(heapRel, idxRel, indexInfo, false,
+ 								blk, MINMAX_PAGES_PER_RANGE,
+ 								mmbuildCallback, (void *) mmstate);
+ 
+ 		/*
+ 		 * Create the index tuple containing min/max values, and insert it.
+ 		 * Note mmbuildCallback didn't have the chance to actually insert
+ 		 * anything into the index, because the heapscan should have ended
+ 		 * just as it reached the final tuple in the range.
+ 		 */
+ 		tup = minmax_form_tuple(mmstate->indexDesc, mmstate->diskDesc,
+ 								mmstate->dtuple, &size);
+ 		mm_doinsert(mmstate->irel, mmstate->rmAccess,
+ 					&mmstate->currentInsertBuf, mmstate->currRangeStart, tup,
+ 					size);
+ 		mmstate->numtuples++;
+ 		pfree(tup);
+ 
+ 		clear_mm_percol_buildstate(mmstate);
+ 	}
+ 
+ 	if (!BufferIsInvalid(mmstate->currentInsertBuf))
+ 	{
+ 		ReleaseBuffer(mmstate->currentInsertBuf);
+ 		mmstate->currentInsertBuf = InvalidBuffer;
+ 	}
+ }
+ 
+ /*
+  * During amvacuumcleanup of a MinMax index, we do three main things:
+  *
+  * 1) remove revmap entries which are no longer interesting (heap has been
+  * truncated).
+  *
+  * 2) remove index tuples that are no longer referenced from the revmap.
+  *
+  * 3) summarize ranges that are currently unsummarized.
+  */
+ Datum
+ mmvacuumcleanup(PG_FUNCTION_ARGS)
+ {
+ 	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
+ 	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
+ 	mmRevmapAccess *rmAccess;
+ 	BlockNumber *nonsummarized = NULL;
+ 	int			numnonsummarized;
+ 	Relation	heapRel;
+ 	BlockNumber	heapNumBlocks;
+ 
+ 	rmAccess = mmRevmapAccessInit(info->index, MINMAX_PAGES_PER_RANGE);
+ 
+ 	heapRel = heap_open(IndexGetRelation(RelationGetRelid(info->index), false),
+ 						AccessShareLock);
+ 
+ 	/*
+ 	 * First: truncate the revmap to the range that covers pages actually in
+ 	 * the heap.  We must do this while holding the relation extension lock,
+ 	 * or we risk someone else extending the relation in the meantime.
+ 	 */
+ 	LockRelationForExtension(heapRel, ShareLock);
+ 	heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
+ 	mmRevmapTruncate(rmAccess, heapNumBlocks);
+ 	UnlockRelationForExtension(heapRel, ShareLock);
+ 
+ 	/*
+ 	 * Second: scan the index, removing index tuples that are no longer
+ 	 * referenced from the revmap.  While at it, collect the page numbers
+ 	 * of ranges that are not summarized.
+ 	 */
+ 	remove_deletable_tuples(info->index, heapNumBlocks, info->strategy,
+ 							&nonsummarized, &numnonsummarized);
+ 
+ 	/* Finally, summarize the ranges collected above */
+ 	if (nonsummarized)
+ 	{
+ 		rerun_summarization(info->index, heapRel, rmAccess,
+ 							nonsummarized, numnonsummarized);
+ 		pfree(nonsummarized);
+ 	}
+ 
+ 	mmRevmapAccessTerminate(rmAccess);
+ 	heap_close(heapRel, AccessShareLock);
+ 
+ 	PG_RETURN_POINTER(stats);
+ }
+ 
+ Datum
+ mmcostestimate(PG_FUNCTION_ARGS)
+ {
+ 	PG_RETURN_INT64(0);
+ }
+ 
+ Datum
+ mmoptions(PG_FUNCTION_ARGS)
+ {
+ 	PG_RETURN_INT64(0);
+ }
+ 
+ /*
+  * Fill the given finfo to enable calls to the operator specified by the given
+  * parameters.
+  */
+ static void
+ get_mm_operator(Oid opfam, Oid idxtypid, Oid keytypid,
+ 				StrategyNumber strategy, FmgrInfo *finfo)
+ {
+ 	Oid			oprid;
+ 	HeapTuple	oper;
+ 
+ 	oprid = get_opfamily_member(opfam, idxtypid, keytypid, strategy);
+ 	if (!OidIsValid(oprid))
+ 		elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
+ 			 strategy, idxtypid, keytypid, opfam);
+ 
+ 	oper = SearchSysCache1(OPEROID, oprid);
+ 	if (!HeapTupleIsValid(oper))
+ 		elog(ERROR, "cache lookup failed for operator %u", oprid);
+ 
+ 	fmgr_info(((Form_pg_operator) GETSTRUCT(oper))->oprcode, finfo);
+ 	ReleaseSysCache(oper);
+ }
+ 
+ /*
+  * Invoke the given operator, and return the result as a C boolean.
+  */
+ static inline bool
+ invoke_mm_operator(FmgrInfo *operator, Oid collation, Datum left, Datum right)
+ {
+ 	Datum		result;
+ 
+ 	result = FunctionCall2Coll(operator, collation, left, right);
+ 
+ 	return DatumGetBool(result);
+ }
+ 
+ /*
+  * Insert an index tuple into the index relation.  The revmap is updated to
+  * mark the range containing the given page as pointing to the inserted entry.
+  *
+  * The buffer, if valid, is checked for free space to insert the new entry;
+  * if there isn't enough, a new buffer is obtained and pinned.
+  *
+  * The buffer is marked dirty.
+  */
+ static void
+ mm_doinsert(Relation idxrel, mmRevmapAccess *rmAccess, Buffer *buffer,
+ 			BlockNumber heapblkno, MMTuple *tup, Size itemsz)
+ {
+ 	Page		page;
+ 	BlockNumber blk;
+ 	OffsetNumber off;
+ 	bool		extended;
+ 
+ 	itemsz = MAXALIGN(itemsz);
+ 
+ 	extended = mm_getinsertbuffer(idxrel, buffer, itemsz);
+ 	page = BufferGetPage(*buffer);
+ 
+ 	if (PageGetFreeSpace(page) < itemsz)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ 				 errmsg("index row size %lu exceeds maximum for index \"%s\"",
+ 						(unsigned long) itemsz, RelationGetRelationName(idxrel))));
+ 
+ 	blk = BufferGetBlockNumber(*buffer);
+ 
+ 	/* the page modification, dirtying and WAL insert share a critical section */
+ 	START_CRIT_SECTION();
+ 
+ 	off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
+ 					  false, false);
+ 
+ 	MarkBufferDirty(*buffer);
+ 
+ 	/* XLOG stuff */
+ 	if (RelationNeedsWAL(idxrel))
+ 	{
+ 		xl_minmax_insert	xlrec;
+ 		XLogRecPtr	recptr;
+ 		XLogRecData	rdata[2];
+ 		uint8		info = XLOG_MINMAX_INSERT;
+ 
+ 
+ 		xlrec.target.node = idxrel->rd_node;
+ 		ItemPointerSet(&xlrec.target.tid, blk, off);
+ 		rdata[0].data = (char *) &xlrec;
+ 		rdata[0].len = SizeOfMinmaxInsert;
+ 		rdata[0].buffer = InvalidBuffer;
+ 		rdata[0].next = &(rdata[1]);
+ 
+ 		rdata[1].data = (char *) tup;
+ 		rdata[1].len = itemsz;
+ 		rdata[1].buffer = *buffer;
+ 		rdata[1].buffer_std = true;
+ 		rdata[1].next = NULL;
+ 
+ 		/*
+ 		 * If this is the first tuple in the page, we can reinit the page
+ 		 * instead of restoring the whole thing.  Set flag, and hide buffer
+ 		 * references from XLogInsert.
+ 		 */
+ 		if (extended)
+ 		{
+ 			info |= XLOG_MINMAX_INIT_PAGE;
+ 			rdata[1].buffer = InvalidBuffer;
+ 		}
+ 
+ 		recptr = XLogInsert(RM_MINMAX_ID, info, rdata);
+ 
+ 		PageSetLSN(page, recptr);
+ 	}
+ 
+ 	END_CRIT_SECTION();
+ 
+ 	/*
+ 	 * Note we need to keep the lock on the buffer until after the revmap
+ 	 * has been updated.  Otherwise, a concurrent scanner could try to obtain
+ 	 * the index tuple from the revmap before we're done writing it.
+ 	 */
+ 	mmSetHeapBlockItemptr(rmAccess, heapblkno, blk, off);
+ 
+ 	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
+ }
+ 
+ /*
+  * Return an exclusively-locked buffer obtained by extending the relation.
+  */
+ static Buffer
+ mm_getnewbuffer(Relation irel)
+ {
+ 	Buffer	buffer;
+ 	bool	needLock = !RELATION_IS_LOCAL(irel);
+ 
+ 	if (needLock)
+ 		LockRelationForExtension(irel, ExclusiveLock);
+ 
+ 	buffer = ReadBuffer(irel, P_NEW);
+ 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ 
+ 	if (needLock)
+ 		UnlockRelationForExtension(irel, ExclusiveLock);
+ 
+ 	return buffer;
+ }
+ 
+ /*
+  * Return a pinned and locked buffer which can be used to insert an index item
+  * of size itemsz.
+  *
+  * The passed buffer argument is tested for free space; if it has enough, it
+  * is locked and returned.  Otherwise, that buffer (if valid) is unpinned and
+  * a new buffer is obtained and returned pinned and locked.
+  *
+  * If there's no existing page with enough free space to accommodate the new
+  * item, the relation is extended.  The function returns true if this happens,
+  * false otherwise.
+  */
+ static bool
+ mm_getinsertbuffer(Relation irel, Buffer *buffer, Size itemsz)
+ {
+ 	Buffer		buf;
+ 	bool		extended = false;
+ 
+ 	buf = *buffer;
+ 
+ 	if (BufferIsInvalid(buf) ||
+ 		(PageGetFreeSpace(BufferGetPage(buf)) < itemsz))
+ 	{
+ 		Page		page;
+ 
+ 		/*
+ 		 * By the time we break out of this loop, buf is a locked and pinned
+ 		 * buffer which has enough free space to satisfy the requirement.
+ 		 */
+ 		for (;;)
+ 		{
+ 			BlockNumber	blk;
+ 			int			freespace;
+ 
+ 			blk = GetPageWithFreeSpace(irel, itemsz);
+ 			if (blk == InvalidBlockNumber)
+ 			{
+ 				/*
+ 				 * There's not enough free space in any existing index page,
+ 				 * according to the FSM: extend the relation to obtain a shiny
+ 				 * new page.
+ 				 */
+ 				buf = mm_getnewbuffer(irel);
+ 				page = BufferGetPage(buf);
+ 				PageInit(page, BLCKSZ, 0);
+ 
+ 				/*
+ 				 * If an entirely new page does not contain enough free space
+ 				 * for the new item, then surely that item is oversized.
+ 				 * Complain loudly.
+ 				 */
+ 				freespace = PageGetFreeSpace(page);
+ 				if (freespace < itemsz)
+ 					ereport(ERROR,
+ 							(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ 							 errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
+ 									(unsigned long) itemsz,
+ 									(unsigned long) freespace,
+ 									RelationGetRelationName(irel))));
+ 				extended = true;
+ 				break;
+ 			}
+ 
+ 			buf = ReadBuffer(irel, blk);
+ 			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ 			page = BufferGetPage(buf);
+ 			freespace = PageGetFreeSpace(page);
+ 			if (freespace >= itemsz)
+ 				break;
+ 
+ 			/* Not enough space: register reality and start over */
+ 			/* XXX register and unlock, or unlock and register?? */
+ 			RecordPageWithFreeSpace(irel, blk, freespace);
+ 			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ 		}
+ 
+ 		if (!BufferIsInvalid(*buffer))
+ 			ReleaseBuffer(*buffer);
+ 
+ 		*buffer = buf;
+ 	}
+ 	else
+ 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ 
+ 	return extended;
+ }
*** /dev/null
--- b/src/backend/access/minmax/mmrevmap.c
***************
*** 0 ****
--- 1,375 ----
+ /*
+  * mmrevmap.c
+  *		Reverse range map for MinMax indexes
+  *
+  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  * IDENTIFICATION
+  *	  src/backend/access/minmax/mmrevmap.c
+  */
+ #include "postgres.h"
+ 
+ #include "access/minmax.h"
+ #include "access/minmax_internal.h"
+ #include "access/minmax_revmap.h"
+ #include "access/minmax_xlog.h"
+ #include "access/rmgr.h"
+ #include "miscadmin.h"
+ #include "storage/bufmgr.h"
+ #include "storage/lmgr.h"
+ #include "storage/relfilenode.h"
+ #include "storage/smgr.h"
+ 
+ 
+ #define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+ #define IDXITEMS_PER_PAGE (MAPSIZE / SizeOfIptrData)
+ 
+ #define HEAPBLK_TO_REVMAP_BLK(pagesPerRange, heapBlk) \
+ 	((heapBlk / pagesPerRange) / IDXITEMS_PER_PAGE)
+ 
+ #define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \
+ 	((heapBlk / pagesPerRange) % IDXITEMS_PER_PAGE)
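+ 
+ /*
+  * Worked example: with BLCKSZ = 8192 and the standard page header, MAPSIZE is
+  * 8168 bytes and 1361 item pointers fit on each revmap page.  Taking
+  * pagesPerRange = 16 purely for the sake of the example, heap block 100000
+  * belongs to range 100000 / 16 = 6250, which is stored on revmap page
+  * 6250 / 1361 = 4, at array index 6250 % 1361 = 806.
+  */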
+ 
+ static bool mmRevmapExtend(mmRevmapAccess *rmAccess, BlockNumber blkno);
+ 
+ /* typedef appears in minmax_revmap.h */
+ struct mmRevmapAccess
+ {
+ 	Relation	idxrel;
+ 	BlockNumber pagesPerRange;
+ 	Buffer		currBuf;
+ 	BlockNumber physPagesInRevmap;
+ };
+ 
+ 
+ /*
+  * Initialize an access object for a reverse range map, which can be used to
+  * read from and update the map.  This must be freed with
+  * mmRevmapAccessTerminate when the caller is done with it.
+  */
+ mmRevmapAccess *
+ mmRevmapAccessInit(Relation idxrel, BlockNumber pagesPerRange)
+ {
+ 	mmRevmapAccess *rmAccess = palloc(sizeof(mmRevmapAccess));
+ 
+ 	RelationOpenSmgr(idxrel);
+ 
+ 	rmAccess->idxrel = idxrel;
+ 	rmAccess->pagesPerRange = pagesPerRange;
+ 	rmAccess->currBuf = InvalidBuffer;
+ 	rmAccess->physPagesInRevmap =
+ 		smgrnblocks(idxrel->rd_smgr, MM_REVMAP_FORKNUM);
+ 
+ 	return rmAccess;
+ }
+ 
+ /*
+  * Release resources associated with a revmap access object.
+  */
+ void
+ mmRevmapAccessTerminate(mmRevmapAccess *rmAccess)
+ {
+ 	if (rmAccess->currBuf != InvalidBuffer)
+ 		ReleaseBuffer(rmAccess->currBuf);
+ 	pfree(rmAccess);
+ }
+ 
+ /*
+  * In the given revmap page, belonging to a minmax index with pagesPerRange
+  * pages per range, set the element corresponding to heap block number heapBlk
+  * to the value (blkno, offno).
+  *
+  * Caller must have obtained the correct page.
+  *
+  * This is used both in regular operation and during WAL replay.
+  */
+ void
+ rm_page_set_iptr(Page page, int pagesPerRange, BlockNumber heapBlk,
+ 				 BlockNumber blkno, OffsetNumber offno)
+ {
+ 	ItemPointerData *iptr;
+ 
+ 	iptr = (ItemPointerData *) PageGetContents(page);
+ 	iptr += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk);
+ 
+ 	ItemPointerSet(iptr, blkno, offno);
+ }
+ 
+ /*
+  * Set the TID of the index entry corresponding to the range that includes
+  * the given heap page to the given item pointer.
+  *
+  * The map is extended, if necessary.
+  */
+ void
+ mmSetHeapBlockItemptr(mmRevmapAccess *rmAccess, BlockNumber heapBlk,
+ 					  BlockNumber blkno, OffsetNumber offno)
+ {
+ 	BlockNumber mapBlk;
+ 	bool		extend = false;
+ 
+ 	mapBlk = HEAPBLK_TO_REVMAP_BLK(rmAccess->pagesPerRange, heapBlk);
+ 
+ 	/*
+ 	 * If the revmap does not cover this block yet, extend it first.
+ 	 */
+ 	if (mapBlk >= rmAccess->physPagesInRevmap)
+ 		extend = mmRevmapExtend(rmAccess, mapBlk);
+ 
+ 	/*
+ 	 * Obtain the buffer we need to modify.  If we already have the correct
+ 	 * buffer pinned in our access struct, use it; otherwise, release the old
+ 	 * one (if valid) and read the one we need.
+ 	 */
+ 	if (rmAccess->currBuf == InvalidBuffer ||
+ 		mapBlk != BufferGetBlockNumber(rmAccess->currBuf))
+ 	{
+ 		if (rmAccess->currBuf != InvalidBuffer)
+ 			ReleaseBuffer(rmAccess->currBuf);
+ 		rmAccess->currBuf = ReadBufferExtended(rmAccess->idxrel,
+ 											   MM_REVMAP_FORKNUM, mapBlk,
+ 											   RBM_NORMAL, NULL);
+ 	}
+ 
+ 	LockBuffer(rmAccess->currBuf, BUFFER_LOCK_EXCLUSIVE);
+ 	START_CRIT_SECTION();
+ 
+ 	rm_page_set_iptr(BufferGetPage(rmAccess->currBuf),
+ 					 rmAccess->pagesPerRange,
+ 					 heapBlk,
+ 					 blkno, offno);
+ 
+ 	MarkBufferDirty(rmAccess->currBuf);
+ 
+ 	/* XLOG stuff */
+ 	if (RelationNeedsWAL(rmAccess->idxrel))
+ 	{
+ 		xl_minmax_rm_set	xlrec;
+ 		XLogRecPtr	recptr;
+ 		XLogRecData	rdata[2];
+ 		uint8		info;
+ 
+ 		info = XLOG_MINMAX_REVMAP_SET;
+ 
+ 		xlrec.node = rmAccess->idxrel->rd_node;
+ 		xlrec.mapBlock = mapBlk;
+ 		xlrec.pagesPerRange = rmAccess->pagesPerRange;
+ 		xlrec.heapBlock = heapBlk;
+ 		ItemPointerSet(&(xlrec.newval), blkno, offno);
+ 
+ 		rdata[0].data = (char *) &xlrec;
+ 		rdata[0].len = SizeOfMinmaxRevmapSet;
+ 		rdata[0].buffer = InvalidBuffer;
+ 		rdata[0].buffer_std = false;
+ 		rdata[0].next = &(rdata[1]);
+ 
+ 		rdata[1].data = NULL;
+ 		rdata[1].len = 0;
+ 		rdata[1].buffer = rmAccess->currBuf;
+ 		rdata[1].buffer_std = false;
+ 		rdata[1].next = NULL;
+ 
+ 		if (extend)
+ 		{
+ 			info |= XLOG_MINMAX_INIT_PAGE;
+ 			/* If the page is new, there's no need for a full page image */
+ 			rdata[0].next = NULL;
+ 		}
+ 
+ 		recptr = XLogInsert(RM_MINMAX_ID, info, rdata);
+ 
+ 		PageSetLSN(BufferGetPage(rmAccess->currBuf), recptr);
+ 	}
+ 
+ 	END_CRIT_SECTION();
+ 
+ 	LockBuffer(rmAccess->currBuf, BUFFER_LOCK_UNLOCK);
+ }
+ 
+ 
+ /*
+  * Return the TID of the index entry corresponding to the range that includes
+  * the given heap page.  If the TID is valid, the tuple is locked with LockTuple.
+  * It is the caller's responsibility to release that lock.
+  */
+ void
+ mmGetHeapBlockItemptr(mmRevmapAccess *rmAccess, BlockNumber heapBlk,
+ 					  ItemPointerData *out)
+ {
+ 	BlockNumber mapBlk;
+ 	ItemPointerData *iptr;
+ 
+ 	mapBlk = HEAPBLK_TO_REVMAP_BLK(rmAccess->pagesPerRange, heapBlk);
+ 
+ 	/*
+ 	 * If we are asked for a block of the map which is beyond what we know
+ 	 * about it, try to see if our fork has grown since we last checked its
+ 	 * size; a concurrent inserter could have extended it.
+ 	 */
+ 	if (mapBlk >= rmAccess->physPagesInRevmap)
+ 	{
+ 		RelationOpenSmgr(rmAccess->idxrel);
+ 		LockRelationForExtension(rmAccess->idxrel, ShareLock);
+ 		rmAccess->physPagesInRevmap =
+ 			smgrnblocks(rmAccess->idxrel->rd_smgr, MM_REVMAP_FORKNUM);
+ 
+ 		if (mapBlk >= rmAccess->physPagesInRevmap)
+ 		{
+ 			/* definitely not in range */
+ 
+ 			UnlockRelationForExtension(rmAccess->idxrel, ShareLock);
+ 			ItemPointerSetInvalid(out);
+ 			return;
+ 		}
+ 
+ 		/* the block exists now, proceed */
+ 		UnlockRelationForExtension(rmAccess->idxrel, ShareLock);
+ 	}
+ 
+ 	if (rmAccess->currBuf == InvalidBuffer ||
+ 		BufferGetBlockNumber(rmAccess->currBuf) != mapBlk)
+ 	{
+ 		if (rmAccess->currBuf != InvalidBuffer)
+ 			ReleaseBuffer(rmAccess->currBuf);
+ 
+ 		rmAccess->currBuf =
+ 			ReadBufferExtended(rmAccess->idxrel, MM_REVMAP_FORKNUM, mapBlk,
+ 							   RBM_NORMAL, NULL);
+ 	}
+ 
+ 	LockBuffer(rmAccess->currBuf, BUFFER_LOCK_SHARE);
+ 
+ 	iptr = (ItemPointerData *)
+ 		PageGetContents(BufferGetPage(rmAccess->currBuf));
+ 	iptr += HEAPBLK_TO_REVMAP_INDEX(rmAccess->pagesPerRange, heapBlk);
+ 
+ 	ItemPointerCopy(iptr, out);
+ 
+ 	if (ItemPointerIsValid(iptr))
+ 		LockTuple(rmAccess->idxrel, iptr, ShareLock);
+ 
+ 	LockBuffer(rmAccess->currBuf, BUFFER_LOCK_UNLOCK);
+ }
+ 
+ /*
+  * Create a single-page reverse range map fork for a new minmax index
+  *
+  * NB -- caller is assumed to WAL-log this operation
+  */
+ void
+ mmRevmapCreate(Relation idxrel)
+ {
+ 	bool		needLock;
+ 	Buffer		buf;
+ 	Page		page;
+ 
+ 	needLock = !RELATION_IS_LOCAL(idxrel);
+ 
+ 	/*
+ 	 * XXX it's unclear that we need this lock, considering that the relation
+ 	 * is likely being created ...
+ 	 */
+ 	if (needLock)
+ 		LockRelationForExtension(idxrel, ExclusiveLock);
+ 
+ 	RelationOpenSmgr(idxrel);
+ 	smgrcreate(idxrel->rd_smgr, MM_REVMAP_FORKNUM, false);
+ 	buf = ReadBufferExtended(idxrel, MM_REVMAP_FORKNUM, P_NEW, RBM_NORMAL,
+ 							 NULL);
+ 	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ 
+ 	/* keep the critical section confined to the actual page modification */
+ 	START_CRIT_SECTION();
+ 	page = BufferGetPage(buf);
+ 	PageInit(page, BLCKSZ, 0);
+ 	MarkBufferDirty(buf);
+ 	END_CRIT_SECTION();
+ 
+ 	UnlockReleaseBuffer(buf);
+ 
+ 	if (needLock)
+ 		UnlockRelationForExtension(idxrel, ExclusiveLock);
+ }
+ 
+ /*
+  * Extend the reverse range map to cover the given block number.  Return false
+  * if the map already covered the requested range (no extension actually done),
+  * true otherwise.
+  *
+  * NB -- caller is responsible for ensuring this action is properly WAL-logged.
+  */
+ static bool
+ mmRevmapExtend(mmRevmapAccess *rmAccess, BlockNumber blkno)
+ {
+ 	char		page[BLCKSZ];
+ 	bool		extended = false;
+ 
+ 	MemSet(page, 0, sizeof(page));
+ 	PageInit(page, BLCKSZ, 0);
+ 
+ 	LockRelationForExtension(rmAccess->idxrel, ExclusiveLock);
+ 
+ 	/*
+ 	 * First, refresh our idea of the current size; it might well have grown
+ 	 * to cover what we need since we last checked.
+ 	 */
+ 	RelationOpenSmgr(rmAccess->idxrel);
+ 	rmAccess->physPagesInRevmap =
+ 		smgrnblocks(rmAccess->idxrel->rd_smgr, MM_REVMAP_FORKNUM);
+ 
+ 	/*
+ 	 * Now extend it one page at a time.  This might seem a bit inefficient,
+ 	 * but normally we'd be extending by a single page anyway.
+ 	 */
+ 	while (blkno >= rmAccess->physPagesInRevmap)
+ 	{
+ 		extended = true;
+ 		PageSetChecksumInplace(page, blkno);
+ 		smgrextend(rmAccess->idxrel->rd_smgr, MM_REVMAP_FORKNUM,
+ 				   rmAccess->physPagesInRevmap, page, false);
+ 		rmAccess->physPagesInRevmap++;
+ 	}
+ 
+ 	Assert(rmAccess->physPagesInRevmap ==
+ 		   smgrnblocks(rmAccess->idxrel->rd_smgr, MM_REVMAP_FORKNUM));
+ 
+ 	UnlockRelationForExtension(rmAccess->idxrel, ExclusiveLock);
+ 
+ 	return extended;
+ }
+ 
+ /*
+  * Truncate a revmap to the size needed for a table of the given number of
+  * blocks.  This includes removing pages beyond the last one needed, and also
+  * zeroing out the excess entries in the last page.
+  *
+  * The caller should hold a lock that prevents the table from growing in
+  * the meantime.
+  */
+ void
+ mmRevmapTruncate(mmRevmapAccess *rmAccess, BlockNumber heapNumBlocks)
+ {
+ 	BlockNumber		rmBlks;
+ 	char	   *data;
+ 	Page		page;
+ 	Buffer		buffer;
+ 
+ 	/* Remove blocks at the end */
+ 	rmBlks = HEAPBLK_TO_REVMAP_BLK(rmAccess->pagesPerRange, heapNumBlocks);
+ 
+ 	RelationOpenSmgr(rmAccess->idxrel);
+ 	smgrtruncate(rmAccess->idxrel->rd_smgr, MM_REVMAP_FORKNUM, rmBlks + 1);
+ 
+ 	/* zero out the remaining items in the last page */
+ 	buffer = ReadBufferExtended(rmAccess->idxrel,
+ 								MM_REVMAP_FORKNUM, rmBlks,
+ 								RBM_NORMAL, NULL);
+ 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ 
+ 	page = PageGetContents(BufferGetPage(buffer));
+ 	data = page + sizeof(ItemPointerData) *
+ 		HEAPBLK_TO_REVMAP_INDEX(rmAccess->pagesPerRange, heapNumBlocks + 1);
+ 
+ 	memset(data, 0, page + MAPSIZE - data);
+ 	MarkBufferDirty(buffer);
+ 
+ 	UnlockReleaseBuffer(buffer);
+ }
*** /dev/null
--- b/src/backend/access/minmax/mmtuple.c
***************
*** 0 ****
--- 1,388 ----
+ /*
+  * MinMax-specific tuples
+  *		Method implementations for tuples in minmax indexes.
+  *
+  * The intended interface is that code outside this file only deals with
+  * DeformedMMTuples, and convert to and from the on-disk representation by
+  * using functions in this file.
+  *
+  * NOTES
+  *
+  * A minmax tuple is similar to a heap tuple, with a few key differences.  The
+  * first interesting difference is that the tuple header is much simpler,
+  * containing only the offset to the data area and a small flags area.
+  * Also, the stored
+  * data does not match the tuple descriptor exactly: for each attribute in the
+  * descriptor, the index tuple carries two values, one for the minimum value in
+  * that column and one for the maximum.
+  *
+  * Also, for each column there are two null bits: one (hasnulls) stores whether
+  * any tuple within the page range has that column set to null; the other
+  * (allnulls) stores whether the column values are all null.  If allnulls is
+  * true, then the tuple data area does not contain min/max values for that
+  * column at all; otherwise it does.  Note that the null bitmask, when
+  * present, always has double length; for typical indexes of four columns or
+  * less, it takes a single byte anyway.  It doesn't seem worth trying to
+  * optimize this further.
+  *
+  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  * IDENTIFICATION
+  *	  src/backend/access/minmax/mmtuple.c
+  */
+ #include "postgres.h"
+ 
+ #include "access/htup_details.h"
+ #include "access/minmax_tuple.h"
+ #include "access/tupdesc.h"
+ #include "access/tupmacs.h"
+ 
+ 
+ static inline void mm_deconstruct_tuple(char *tp, bits8 *nullbits, bool nulls,
+ 					 int natts, Form_pg_attribute *att,
+ 					 Datum *values, bool *allnulls, bool *hasnulls);
+ 
+ 
+ /*
+  * Generate an internal-style tuple descriptor to pass to minmax_form_tuple.
+  * These have no use outside this module.
+  *
+  * The argument is a minmax index's regular tuple descriptor.
+  */
+ TupleDesc
+ minmax_get_descr(TupleDesc tupdesc)
+ {
+ 	TupleDesc	diskDesc;
+ 	int			i,
+ 				j;
+ 
+ 	diskDesc = CreateTemplateTupleDesc(tupdesc->natts * 2, false);
+ 
+ 	for (i = 0, j = 1; i < tupdesc->natts; i++)
+ 	{
+ 		/* min */
+ 		TupleDescInitEntry(diskDesc,
+ 						   j++,
+ 						   NULL,
+ 						   tupdesc->attrs[i]->atttypid,
+ 						   tupdesc->attrs[i]->atttypmod,
+ 						   0);
+ 		/* max */
+ 		TupleDescInitEntry(diskDesc,
+ 						   j++,
+ 						   NULL,
+ 						   tupdesc->attrs[i]->atttypid,
+ 						   tupdesc->attrs[i]->atttypmod,
+ 						   0);
+ 	}
+ 
+ 	return diskDesc;
+ }
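+ 
+ /*
+  * For illustration: for a two-column minmax index, the descriptor built above
+  * has four attributes, laid out as (min1, max1, min2, max2), each pair
+  * reusing the type and typmod of the corresponding indexed column.
+  */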
+ 
+ /*
+  * Generate a new on-disk tuple to be inserted in a minmax index.
+  *
+  * The first tuple descriptor passed corresponds to the catalogued index info,
+  * that is, it is the index's descriptor; the second descriptor must be
+  * obtained by calling minmax_get_descr() on that descriptor.
+  *
+  * (The reason for this slightly grotty arrangement is that we use heap tuple
+  * functions to implement packing of a tuple into the on-disk format.)
+  */
+ MMTuple *
+ minmax_form_tuple(TupleDesc idxDsc, TupleDesc diskDsc, DeformedMMTuple *tuple,
+ 				  Size *size)
+ {
+ 	Datum	   *values;
+ 	bool	   *nulls;
+ 	bool		anynulls = false;
+ 	MMTuple    *rettuple;
+ 	int			keyno;
+ 	uint16		phony_infomask;
+ 	bits8	   *phony_nullbitmap;
+ 	Size		len,
+ 				hoff,
+ 				data_len;
+ 
+ 	Assert(diskDsc->natts > 0);
+ 
+ 	values = palloc(sizeof(Datum) * diskDsc->natts);
+ 	nulls = palloc0(sizeof(bool) * diskDsc->natts);
+ 	phony_nullbitmap = palloc(sizeof(bits8) * BITMAPLEN(diskDsc->natts));
+ 
+ 	/*
+ 	 * Set up the values/nulls arrays for heap_fill_tuple
+ 	 */
+ 	for (keyno = 0; keyno < idxDsc->natts; keyno++)
+ 	{
+ 		int		idxattno = keyno * 2;
+ 
+ 		/*
+ 		 * "allnulls" is set when there's no nonnull value in any row in
+ 		 * the column; set the nullable bits for both min and max attrs.
+ 		 */
+ 		if (tuple->values[keyno].allnulls)
+ 		{
+ 			nulls[idxattno] = true;
+ 			nulls[idxattno + 1] = true;
+ 			anynulls = true;
+ 			continue;
+ 		}
+ 
+ 		if (tuple->values[keyno].hasnulls)
+ 			anynulls = true;
+ 
+ 		values[idxattno] = tuple->values[keyno].min;
+ 		values[idxattno + 1] = tuple->values[keyno].max;
+ 	}
+ 
+ 	/* compute total space needed */
+ 	len = SizeOfMinMaxTuple;
+ 	if (anynulls)
+ 	{
+ 		/*
+ 		 * We need a double-length bitmap on an on-disk minmax index tuple;
+ 		 * the first half stores the "allnulls" bits, the second stores
+ 		 * "hasnulls".
+ 		 */
+ 		len += BITMAPLEN(idxDsc->natts * 2);
+ 	}
+ 
+ 	/*
+ 	 * TODO: we can probably do away with alignment here, and save some
+ 	 * precious disk space.  When there's no bitmap we can save 6 bytes. Maybe
+ 	 * we can use the first col's type alignment instead of maxalign.
+ 	 */
+ 	len = hoff = MAXALIGN(len);
+ 
+ 	data_len = heap_compute_data_size(diskDsc, values, nulls);
+ 
+ 	len += data_len;
+ 
+ 	rettuple = palloc0(len);
+ 	rettuple->mt_info = hoff;
+ 	Assert((rettuple->mt_info & MMIDX_OFFSET_MASK) == hoff);
+ 
+ 	/*
+ 	 * The infomask and null bitmap as computed by heap_fill_tuple are useless
+ 	 * to us.  However, that function will not accept a null infomask; and we
+ 	 * need to pass a valid null bitmap so that it will correctly skip
+ 	 * outputting null attributes in the data area.
+ 	 */
+ 	heap_fill_tuple(diskDsc,
+ 					values,
+ 					nulls,
+ 					(char *) rettuple + hoff,
+ 					data_len,
+ 					&phony_infomask,
+ 					phony_nullbitmap);
+ 
+ 	/* done with these */
+ 	pfree(values);
+ 	pfree(nulls);
+ 	pfree(phony_nullbitmap);
+ 
+ 	/*
+ 	 * Now fill in the real null bitmasks.	allnulls first.
+ 	 */
+ 	if (anynulls)
+ 	{
+ 		bits8	   *bitP;
+ 		int			bitmask;
+ 
+ 		rettuple->mt_info |= MMIDX_NULLS_MASK;
+ 
+ 		bitP = ((bits8 *) ((char *) rettuple + SizeOfMinMaxTuple)) - 1;
+ 		bitmask = HIGHBIT;
+ 		for (keyno = 0; keyno < idxDsc->natts; keyno++)
+ 		{
+ 			if (bitmask != HIGHBIT)
+ 				bitmask <<= 1;
+ 			else
+ 			{
+ 				bitP += 1;
+ 				*bitP = 0x0;
+ 				bitmask = 1;
+ 			}
+ 
+ 			if (tuple->values[keyno].allnulls)
+ 				continue;
+ 
+ 			*bitP |= bitmask;
+ 		}
+ 		/* hasnulls bits follow */
+ 		for (keyno = 0; keyno < idxDsc->natts; keyno++)
+ 		{
+ 			if (bitmask != HIGHBIT)
+ 				bitmask <<= 1;
+ 			else
+ 			{
+ 				bitP += 1;
+ 				*bitP = 0x0;
+ 				bitmask = 1;
+ 			}
+ 
+ 			if (tuple->values[keyno].hasnulls)
+ 				continue;
+ 
+ 			*bitP |= bitmask;
+ 		}
+ 	}
+ 
+ 	*size = len;
+ 	return rettuple;
+ }
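+ 
+ /*
+  * Worked example (assuming 8-byte maximum alignment): for a one-column int4
+  * index with min = 10, max = 20 and no nulls anywhere in the range, the tuple
+  * needs SizeOfMinMaxTuple (1 byte) of header and no null bitmap, so
+  * hoff = MAXALIGN(1) = 8; the two 4-byte datums add 8 more bytes, giving a
+  * 16-byte MMTuple with mt_info = 0x08.
+  */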
+ 
+ /*
+  * Free a tuple created by minmax_form_tuple
+  */
+ void
+ minmax_free_tuple(MMTuple *tuple)
+ {
+ 	pfree(tuple);
+ }
+ 
+ /*
+  * Convert a MMTuple back to a DeformedMMTuple.  This is the reverse of
+  * minmax_form_tuple.
+  *
+  * Note we don't need the "on disk tupdesc" here; we rely on our own routine to
+  * deconstruct the tuple from the on-disk format.
+  *
+  * XXX some callers might need copies of each datum; if so we need
+  * to apply datumCopy inside the loop.	We probably also need a
+  * minmax_free_dtuple() function.
+  */
+ DeformedMMTuple *
+ minmax_deform_tuple(TupleDesc tupdesc, MMTuple *tuple)
+ {
+ 	DeformedMMTuple *dtup;
+ 	Datum	   *values;
+ 	bool	   *allnulls;
+ 	bool	   *hasnulls;
+ 	char	   *tp;
+ 	bits8	   *nullbits = NULL;
+ 	int			keyno;
+ 
+ 	dtup = palloc(offsetof(DeformedMMTuple, values) +
+ 				  sizeof(MMValues) * tupdesc->natts);
+ 
+ 	values = palloc(sizeof(Datum) * tupdesc->natts * 2);
+ 	allnulls = palloc(sizeof(bool) * tupdesc->natts);
+ 	hasnulls = palloc(sizeof(bool) * tupdesc->natts);
+ 
+ 	tp = (char *) tuple + MMTupleDataOffset(tuple);
+ 
+ 	if (MMTupleHasNulls(tuple))
+ 		nullbits = (bits8 *) ((char *) tuple + SizeOfMinMaxTuple);
+ 	mm_deconstruct_tuple(tp, nullbits,
+ 						 MMTupleHasNulls(tuple),
+ 						 tupdesc->natts, tupdesc->attrs, values,
+ 						 allnulls, hasnulls);
+ 
+ 	for (keyno = 0; keyno < tupdesc->natts; keyno++)
+ 	{
+ 		if (allnulls[keyno])
+ 		{
+ 			dtup->values[keyno].allnulls = true;
+ 			continue;
+ 		}
+ 
+ 		/* XXX optional datumCopy() */
+ 		dtup->values[keyno].min = values[keyno * 2];
+ 		dtup->values[keyno].max = values[keyno * 2 + 1];
+ 		dtup->values[keyno].hasnulls = hasnulls[keyno];
+ 		dtup->values[keyno].allnulls = false;
+ 	}
+ 
+ 	pfree(values);
+ 	pfree(allnulls);
+ 	pfree(hasnulls);
+ 
+ 	return dtup;
+ }
+ 
+ /*
+  * mm_deconstruct_tuple
+  *		Guts of attribute extraction from an on-disk minmax tuple.
+  *
+  * Its arguments are:
+  *	tp			pointer to the tuple data area
+  *	nullbits	pointer to the tuple nulls bitmask
+  *	nulls		"has nulls" bit in tuple infomask
+  *	natts		number of array members in att
+  *	att			the tuple's TupleDesc Form_pg_attribute array
+  *	values		output values, size 2 * natts (alternates min and max)
+  *	allnulls	output "allnulls", size natts
+  *	hasnulls	output "hasnulls", size natts
+  *
+  * Output arrays are allocated by caller.
+  */
+ static inline void
+ mm_deconstruct_tuple(char *tp, bits8 *nullbits, bool nulls,
+ 					 int natts, Form_pg_attribute *att,
+ 					 Datum *values, bool *allnulls, bool *hasnulls)
+ {
+ 	int			attnum;
+ 	long		off = 0;
+ 
+ 	/*
+ 	 * First iterate to natts to obtain both null flags for each attribute.
+ 	 */
+ 	for (attnum = 0; attnum < natts; attnum++)
+ 	{
+ 		/*
+ 		 * the "all nulls" bit means that all values in the page range for
+ 		 * this column are nulls.  Therefore there are no values in the tuple
+ 		 * data area.
+ 		 */
+ 		if (nulls && att_isnull(attnum, nullbits))
+ 		{
+ 			values[attnum] = (Datum) 0;
+ 			allnulls[attnum] = true;
+ 			hasnulls[attnum] = true;	/* XXX ? */
+ 			continue;
+ 		}
+ 
+ 		allnulls[attnum] = false;
+ 
+ 		/*
+ 		 * the "has nulls" bit means that some tuples have nulls, but others
+ 		 * have not-null values.  So the tuple data does have data for this
+ 		 * column.
+ 		 *
+ 		 * The hasnulls bits follow the allnulls bits in the same bitmask.
+ 		 */
+ 		hasnulls[attnum] = nulls && att_isnull(natts + attnum, nullbits);
+ 	}
+ 
+ 	/*
+ 	 * Then we iterate to natts * 2 to obtain each attribute's min and max
+ 	 * values.	Note that since we reuse attribute entries (first for the
+ 	 * minimum value of the corresponding column, then for max), we cannot
+ 	 * cache offsets here.
+ 	 */
+ 	for (attnum = 0; attnum < natts * 2; attnum++)
+ 	{
+ 		int			true_attnum = attnum / 2;
+ 		Form_pg_attribute thisatt = att[true_attnum];
+ 
+ 		if (allnulls[true_attnum])
+ 			continue;
+ 
+ 		if (thisatt->attlen == -1)
+ 		{
+ 			off = att_align_pointer(off, thisatt->attalign, -1,
+ 									tp + off);
+ 		}
+ 		else
+ 		{
+ 			/* not varlena, so safe to use att_align_nominal */
+ 			off = att_align_nominal(off, thisatt->attalign);
+ 		}
+ 
+ 		values[attnum] = fetchatt(thisatt, tp + off);
+ 
+ 		off = att_addlength_pointer(off, thisatt->attlen, tp + off);
+ 	}
+ }
*** /dev/null
--- b/src/backend/access/minmax/mmxlog.c
***************
*** 0 ****
--- 1,212 ----
+ /*
+  * mmxlog.c
+  *		XLog replay routines for MinMax indexes
+  *
+  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  * IDENTIFICATION
+  *	  src/backend/access/minmax/mmxlog.c
+  */
+ #include "postgres.h"
+ 
+ #include "access/minmax.h"
+ #include "access/minmax_internal.h"
+ #include "access/minmax_tuple.h"
+ #include "access/minmax_xlog.h"
+ #include "access/xlogutils.h"
+ #include "storage/freespace.h"
+ 
+ 
+ /*
+  * xlog replay routines
+  */
+ static void
+ minmax_xlog_createidx(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	xl_minmax_createidx *xlrec = (xl_minmax_createidx *) XLogRecGetData(record);
+ 	Buffer		buf;
+ 	Page		page;
+ 
+ 	/* Backup blocks are not used in create_index records */
+ 	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+ 
+ 	/* create the index's metapage */
+ 	buf = XLogReadBuffer(xlrec->node, MINMAX_METAPAGE_BLKNO, true);
+ 	Assert(BufferIsValid(buf));
+ 	page = (Page) BufferGetPage(buf);
+ 	mm_init_metapage(buf);
+ 	PageSetLSN(page, lsn);
+ 	MarkBufferDirty(buf);
+ 	UnlockReleaseBuffer(buf);
+ 
+ 	/* also initialize its revmap fork */
+ 	buf = XLogReadBufferExtended(xlrec->node, MM_REVMAP_FORKNUM, 0, RBM_ZERO);
+ 	Assert(BufferIsValid(buf));
+ 	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ 	page = (Page) BufferGetPage(buf);
+ 	PageInit(page, BLCKSZ, 0);
+ 	PageSetLSN(page, lsn);
+ 	MarkBufferDirty(buf);
+ 	UnlockReleaseBuffer(buf);
+ }
+ 
+ static void
+ minmax_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	xl_minmax_insert *xlrec = (xl_minmax_insert *) XLogRecGetData(record);
+ 	BlockNumber	blkno;
+ 	Buffer		buffer;
+ 	Page		page;
+ 	OffsetNumber offnum;
+ 	int			tuplen;
+ 	MMTuple	   *mmtuple;
+ 
+ 	/* If we have a full-page image, restore it and we're done */
+ 	if (record->xl_info & XLR_BKP_BLOCK(0))
+ 	{
+ 		(void) RestoreBackupBlock(lsn, record, 0, false, false);
+ 		return;
+ 	}
+ 
+ 	blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid));
+ 	if (record->xl_info & XLOG_MINMAX_INIT_PAGE)
+ 	{
+ 		buffer = XLogReadBuffer(xlrec->target.node, blkno, true);
+ 		Assert(BufferIsValid(buffer));
+ 		page = (Page) BufferGetPage(buffer);
+ 
+ 		PageInit(page, BufferGetPageSize(buffer), 0);	/* XXX size correct?? */
+ 	}
+ 	else
+ 	{
+ 		buffer = XLogReadBuffer(xlrec->target.node, blkno, false);
+ 		if (!BufferIsValid(buffer))
+ 			return;
+ 		page = (Page) BufferGetPage(buffer);
+ 
+ 		if (lsn <= PageGetLSN(page))	/* changes are applied */
+ 		{
+ 			UnlockReleaseBuffer(buffer);
+ 			return;
+ 		}
+ 	}
+ 	offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
+ 	if (PageGetMaxOffsetNumber(page) + 1 < offnum)
+ 		elog(PANIC, "minmax_xlog_insert: invalid max offset number");
+ 
+ 	tuplen = record->xl_len - SizeOfMinmaxInsert;
+ 	mmtuple = (MMTuple *) ((char *) xlrec + SizeOfMinmaxInsert);
+ 
+ 	offnum = PageAddItem(page, (Item) mmtuple, tuplen, offnum, true, false);
+ 	if (offnum == InvalidOffsetNumber)
+ 		elog(PANIC, "minmax_xlog_insert: failed to add tuple");
+ 
+ 	PageSetLSN(page, lsn);
+ 
+ 	MarkBufferDirty(buffer);
+ 	UnlockReleaseBuffer(buffer);
+ 
+ 	/* XXX no FSM updates here ... */
+ }
+ 
+ static void
+ minmax_xlog_bulkremove(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	xl_minmax_bulkremove *xlrec = (xl_minmax_bulkremove *) XLogRecGetData(record);
+ 	Buffer		buffer;
+ 	Page		page;
+ 	OffsetNumber *offnos;
+ 	int			noffs;
+ 	Size		freespace;
+ 
+ 	/* If we have a full-page image, restore it and we're done */
+ 	if (record->xl_info & XLR_BKP_BLOCK(0))
+ 	{
+ 		(void) RestoreBackupBlock(lsn, record, 0, false, false);
+ 		return;
+ 	}
+ 
+ 	buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+ 	if (!BufferIsValid(buffer))
+ 		return;
+ 	page = (Page) BufferGetPage(buffer);
+ 
+ 	if (lsn <= PageGetLSN(page))	/* changes are applied */
+ 	{
+ 		UnlockReleaseBuffer(buffer);
+ 		return;
+ 	}
+ 
+ 	offnos = (OffsetNumber *) ((char *) xlrec + SizeOfMinmaxBulkRemove);
+ 	noffs = (record->xl_len - SizeOfMinmaxBulkRemove) / sizeof(OffsetNumber);
+ 
+ 	PageIndexDeleteNoCompact(page, offnos, noffs);
+ 	freespace = PageGetFreeSpace(page);
+ 
+ 	PageSetLSN(page, lsn);
+ 
+ 	MarkBufferDirty(buffer);
+ 	UnlockReleaseBuffer(buffer);
+ 
+ 	/* update FSM as well */
+ 	XLogRecordPageWithFreeSpace(xlrec->node, xlrec->block, freespace);
+ }
+ 
+ static void
+ minmax_xlog_revmap_set(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	xl_minmax_rm_set *xlrec = (xl_minmax_rm_set *) XLogRecGetData(record);
+ 	bool	init;
+ 	Buffer	buffer;
+ 	Page	page;
+ 
+ 	/* If we have a full-page image, restore it and we're done */
+ 	if (record->xl_info & XLR_BKP_BLOCK(0))
+ 	{
+ 		(void) RestoreBackupBlock(lsn, record, 0, false, false);
+ 		return;
+ 	}
+ 
+ 	init = (record->xl_info & XLOG_MINMAX_INIT_PAGE) != 0;
+ 	buffer = XLogReadBufferExtended(xlrec->node,
+ 									MM_REVMAP_FORKNUM, xlrec->mapBlock,
+ 									init ? RBM_ZERO : RBM_NORMAL);
+ 	Assert(BufferIsValid(buffer));
+ 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ 	page = BufferGetPage(buffer);
+ 	if (init)
+ 		PageInit(page, BufferGetPageSize(buffer), 0);
+ 
+ 	rm_page_set_iptr(page, xlrec->pagesPerRange, xlrec->heapBlock,
+ 					 ItemPointerGetBlockNumber(&(xlrec->newval)),
+ 					 ItemPointerGetOffsetNumber(&(xlrec->newval)));
+ 
+ 	PageSetLSN(page, lsn);
+ 	MarkBufferDirty(buffer);
+ 	UnlockReleaseBuffer(buffer);
+ }
+ 
+ void
+ minmax_redo(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+ 
+ 	switch (info & XLOG_MINMAX_OPMASK)
+ 	{
+ 		case XLOG_MINMAX_CREATE_INDEX:
+ 			minmax_xlog_createidx(lsn, record);
+ 			break;
+ 		case XLOG_MINMAX_INSERT:
+ 			minmax_xlog_insert(lsn, record);
+ 			break;
+ 		case XLOG_MINMAX_BULKREMOVE:
+ 			minmax_xlog_bulkremove(lsn, record);
+ 			break;
+ 		case XLOG_MINMAX_REVMAP_SET:
+ 			minmax_xlog_revmap_set(lsn, record);
+ 			break;
+ 		default:
+ 			elog(PANIC, "minmax_redo: unknown op code %u", info);
+ 	}
+ }
*** a/src/backend/access/rmgrdesc/Makefile
--- b/src/backend/access/rmgrdesc/Makefile
***************
*** 9,15 **** top_builddir = ../../../..
  include $(top_builddir)/src/Makefile.global
  
  OBJS = clogdesc.o dbasedesc.o gindesc.o gistdesc.o hashdesc.o heapdesc.o \
! 	   mxactdesc.o nbtdesc.o relmapdesc.o seqdesc.o smgrdesc.o spgdesc.o \
  	   standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o
  
  include $(top_srcdir)/src/backend/common.mk
--- 9,16 ----
  include $(top_builddir)/src/Makefile.global
  
  OBJS = clogdesc.o dbasedesc.o gindesc.o gistdesc.o hashdesc.o heapdesc.o \
! 	   minmaxdesc.o mxactdesc.o nbtdesc.o relmapdesc.o seqdesc.o \
! 	   smgrdesc.o spgdesc.o \
  	   standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o
  
  include $(top_srcdir)/src/backend/common.mk
*** /dev/null
--- b/src/backend/access/rmgrdesc/minmaxdesc.c
***************
*** 0 ****
--- 1,74 ----
+ /*-------------------------------------------------------------------------
+  *
+  * minmaxdesc.c
+  *	  rmgr descriptor routines for MinMax indexes
+  *
+  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  *
+  * IDENTIFICATION
+  *	  src/backend/access/rmgrdesc/minmaxdesc.c
+  *
+  *-------------------------------------------------------------------------
+  */
+ #include "postgres.h"
+ 
+ #include "access/minmax_xlog.h"
+ 
+ static void
+ out_target(StringInfo buf, xl_minmax_tid *target)
+ {
+ 	appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u",
+ 			 target->node.spcNode, target->node.dbNode, target->node.relNode,
+ 					 ItemPointerGetBlockNumber(&(target->tid)),
+ 					 ItemPointerGetOffsetNumber(&(target->tid)));
+ }
+ 
+ void
+ minmax_desc(StringInfo buf, uint8 xl_info, char *rec)
+ {
+ 	uint8		info = xl_info & ~XLR_INFO_MASK;
+ 
+ 	info &= XLOG_MINMAX_OPMASK;
+ 	if (info == XLOG_MINMAX_CREATE_INDEX)
+ 	{
+ 		xl_minmax_createidx *xlrec = (xl_minmax_createidx *) rec;
+ 
+ 		appendStringInfo(buf, "create index: %u/%u/%u",
+ 						 xlrec->node.spcNode, xlrec->node.dbNode,
+ 						 xlrec->node.relNode);
+ 	}
+ 	else if (info == XLOG_MINMAX_INSERT)
+ 	{
+ 		xl_minmax_insert *xlrec = (xl_minmax_insert *) rec;
+ 
+ 		if (xl_info & XLOG_MINMAX_INIT_PAGE)
+ 			appendStringInfo(buf, "insert(init): ");
+ 		else
+ 			appendStringInfo(buf, "insert: ");
+ 		out_target(buf, &(xlrec->target));
+ 	}
+ 	else if (info == XLOG_MINMAX_BULKREMOVE)
+ 	{
+ 		xl_minmax_bulkremove *xlrec = (xl_minmax_bulkremove *) rec;
+ 
+ 		appendStringInfo(buf, "bulkremove: rel %u/%u/%u blk %u",
+ 						 xlrec->node.spcNode, xlrec->node.dbNode,
+ 						 xlrec->node.relNode, xlrec->block);
+ 	}
+ 	else if (info == XLOG_MINMAX_REVMAP_SET)
+ 	{
+ 		xl_minmax_rm_set *xlrec = (xl_minmax_rm_set *) rec;
+ 
+ 		appendStringInfo(buf, "revmap set: rel %u/%u/%u mapblk %u pagesPerRange %u item %u value %u/%u",
+ 						 xlrec->node.spcNode, xlrec->node.dbNode,
+ 						 xlrec->node.relNode, xlrec->mapBlock,
+ 						 xlrec->pagesPerRange, xlrec->heapBlock,
+ 						 ItemPointerGetBlockNumber(&(xlrec->newval)),
+ 						 ItemPointerGetOffsetNumber(&(xlrec->newval)));
+ 	}
+ 	else
+ 		appendStringInfo(buf, "UNKNOWN");
+ }
+ 
*** a/src/backend/access/transam/rmgr.c
--- b/src/backend/access/transam/rmgr.c
***************
*** 12,17 ****
--- 12,18 ----
  #include "access/gist_private.h"
  #include "access/hash.h"
  #include "access/heapam_xlog.h"
+ #include "access/minmax_xlog.h"
  #include "access/multixact.h"
  #include "access/nbtree.h"
  #include "access/spgist.h"
*** a/src/backend/catalog/index.c
--- b/src/backend/catalog/index.c
***************
*** 2116,2121 **** IndexBuildHeapScan(Relation heapRelation,
--- 2116,2142 ----
  				   IndexBuildCallback callback,
  				   void *callback_state)
  {
+ 	return IndexBuildHeapRangeScan(heapRelation, indexRelation,
+ 								   indexInfo, allow_sync,
+ 								   0, InvalidBlockNumber,
+ 								   callback, callback_state);
+ }
+ 
+ /*
+  * As above, except that instead of scanning the complete heap, only the given
+  * range is scanned.  Scanning to end-of-rel can be signalled by passing
+  * InvalidBlockNumber as the number of blocks.
+  */
+ double
+ IndexBuildHeapRangeScan(Relation heapRelation,
+ 						Relation indexRelation,
+ 						IndexInfo *indexInfo,
+ 						bool allow_sync,
+ 						BlockNumber start_blockno,
+ 						BlockNumber numblocks,
+ 						IndexBuildCallback callback,
+ 						void *callback_state)
+ {
  	bool		is_system_catalog;
  	bool		checking_uniqueness;
  	HeapScanDesc scan;
***************
*** 2186,2191 **** IndexBuildHeapScan(Relation heapRelation,
--- 2207,2215 ----
  								true,	/* buffer access strategy OK */
  								allow_sync);	/* syncscan OK? */
  
+ 	/* set our endpoints */
+ 	heap_setscanlimits(scan, start_blockno, numblocks);
+ 
  	reltuples = 0;
  
  	/*
*** a/src/backend/storage/page/bufpage.c
--- b/src/backend/storage/page/bufpage.c
***************
*** 899,904 **** PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
--- 899,1074 ----
  	pfree(itemidbase);
  }
  
+ /*
+  * PageIndexDeleteNoCompact
+  *		Delete the given items from an index page, and defragment the resulting
+  *		free space, but do not compact the item pointers array.
+  *
+  * Unused items at the end of the array are removed.
+  *
+  * This is used for index AMs that require that existing TIDs of live tuples
+  * remain unchanged.
+  */
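+ /*
+  * For illustration: if a page holds items at offsets 1..5 and we are asked to
+  * delete offsets 2 and 4, then items 1, 3 and 5 keep their offset numbers
+  * (slots 2 and 4 are merely marked unused); only a run of unused slots at the
+  * very end of the array (none in this example) is truncated away.
+  */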
+ void
+ PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos, int nitems)
+ {
+ 	PageHeader	phdr = (PageHeader) page;
+ 	LocationIndex pd_lower = phdr->pd_lower;
+ 	LocationIndex pd_upper = phdr->pd_upper;
+ 	LocationIndex pd_special = phdr->pd_special;
+ 	int			nline,
+ 				nstorage;
+ 	OffsetNumber offnum;
+ 	int			nextitm;
+ 
+ 	/*
+ 	 * As with PageRepairFragmentation, paranoia seems justified.
+ 	 */
+ 	if (pd_lower < SizeOfPageHeaderData ||
+ 		pd_lower > pd_upper ||
+ 		pd_upper > pd_special ||
+ 		pd_special > BLCKSZ ||
+ 		pd_special != MAXALIGN(pd_special))
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_DATA_CORRUPTED),
+ 				 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ 						pd_lower, pd_upper, pd_special)));
+ 
+ 	/*
+ 	 * Scan the item pointer array and build a list of just the ones we are
+ 	 * going to keep.  Notice we do not modify the page just yet, since we are
+ 	 * still validity-checking.
+ 	 */
+ 	nline = PageGetMaxOffsetNumber(page);
+ 	nstorage = 0;
+ 	nextitm = 0;
+ 	for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
+ 	{
+ 		ItemId		lp;
+ 		ItemLength	itemlen;
+ 		ItemOffset	offset;
+ 
+ 		lp = PageGetItemId(page, offnum);
+ 
+ 		itemlen = ItemIdGetLength(lp);
+ 		offset = ItemIdGetOffset(lp);
+ 
+ 		if (ItemIdIsUsed(lp))
+ 		{
+ 			if (offset < pd_upper ||
+ 				(offset + itemlen) > pd_special ||
+ 				offset != MAXALIGN(offset))
+ 				ereport(ERROR,
+ 						(errcode(ERRCODE_DATA_CORRUPTED),
+ 						 errmsg("corrupted item pointer: offset = %u, length = %u",
+ 								offset, (unsigned int) itemlen)));
+ 
+ 			if (nextitm < nitems && offnum == itemnos[nextitm])
+ 			{
+ 				ItemIdSetUnused(lp);
+ 				nextitm++;
+ 			}
+ 			else if (ItemIdHasStorage(lp))
+ 				nstorage++;
+ 		}
+ 	}
+ 
+ 	/* this will catch invalid or out-of-order itemnos[] */
+ 	if (nextitm != nitems)
+ 		elog(ERROR, "incorrect index offsets supplied");
+ 
+ 	if (nstorage == 0)
+ 	{
+ 		/* Page is completely empty, so just reset it quickly */
+ 		phdr->pd_lower = SizeOfPageHeaderData;
+ 		phdr->pd_upper = pd_special;
+ 	}
+ 	else
+ 	{
+ 		/* There are live items: need to compact the page the hard way */
+ 		char		pageCopy[BLCKSZ];
+ 		itemIdSort	itemidbase,
+ 					itemidptr;
+ 		int			lastused;
+ 		int			i;
+ 		Size		totallen;
+ 		Offset		upper;
+ 
+ 		/*
+ 		 * First scan the page taking note of each item that we need to
+ 		 * preserve.  This includes both live items (those that contain data)
+ 		 * and interspersed unused ones.  It's critical to preserve these unused
+ 		 * items, because otherwise the offset numbers for later live items
+ 		 * would change, which is not acceptable.
+ 		 */
+ 		itemidbase = (itemIdSort) palloc(sizeof(itemIdSortData) * nline);
+ 		itemidptr = itemidbase;
+ 		totallen = 0;
+ 		for (i = 0; i < nline; i++, itemidptr++)
+ 		{
+ 			ItemId		lp;
+ 
+ 			itemidptr->offsetindex = i;
+ 
+ 			lp = PageGetItemId(page, i + 1);
+ 			if (ItemIdHasStorage(lp))
+ 			{
+ 				itemidptr->itemoff = ItemIdGetOffset(lp);
+ 				itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
+ 				totallen += itemidptr->alignedlen;
+ 			}
+ 			else
+ 			{
+ 				itemidptr->itemoff = 0;
+ 				itemidptr->alignedlen = 0;
+ 			}
+ 		}
+ 
+ 		if (totallen > (Size) (pd_special - pd_lower))
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_DATA_CORRUPTED),
+ 					 errmsg("corrupted item lengths: total %u, available space %u",
+ 							(unsigned int) totallen, pd_special - pd_lower)));
+ 
+ 		/*
+ 		 * Defragment the data areas of each tuple.  Note that since offset
+ 		 * numbers must remain unchanged in these pages, we can't do a qsort()
+ 		 * of the itemIdSort elements here; and because the elements are not
+ 		 * sorted by offset, we can't use memmove() to defragment the occupied
+ 		 * data space.  So we first create a temporary copy of the original
+ 		 * data page, from which we memcpy() each item's data onto the final
+ 		 * page.
+ 		 */
+ 		memcpy(pageCopy, page, BLCKSZ);
+ 		lastused = FirstOffsetNumber;
+ 		upper = pd_special;
+ 		PageClearHasFreeLinePointers(page);
+ 		for (i = 0, itemidptr = itemidbase; i < nline; i++, itemidptr++)
+ 		{
+ 			ItemId		lp;
+ 
+ 			if (itemidptr->alignedlen == 0)
+ 			{
+ 				PageSetHasFreeLinePointers(page);
+ 				continue;
+ 			}
+ 			lp = PageGetItemId(page, itemidptr->offsetindex + 1);
+ 			upper -= itemidptr->alignedlen;
+ 			memcpy((char *) page + upper,
+ 				   pageCopy + itemidptr->itemoff,
+ 				   itemidptr->alignedlen);
+ 			lp->lp_off = upper;
+ 
+ 			lastused = i + 1;
+ 		}
+ 
+ 		/* Set the new page limits */
+ 		phdr->pd_upper = upper;
+ 		phdr->pd_lower = SizeOfPageHeaderData + lastused * sizeof(ItemIdData);
+ 
+ 		pfree(itemidbase);
+ 	}
+ }
  
  /*
   * Set checksum for a page in shared buffers.
*** a/src/include/access/heapam.h
--- b/src/include/access/heapam.h
***************
*** 112,117 **** extern HeapScanDesc heap_beginscan_strat(Relation relation, Snapshot snapshot,
--- 112,119 ----
  					 bool allow_strat, bool allow_sync);
  extern HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot,
  				  int nkeys, ScanKey key);
+ extern void heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk,
+ 		   BlockNumber endBlk);
  extern void heap_rescan(HeapScanDesc scan, ScanKey key);
  extern void heap_endscan(HeapScanDesc scan);
  extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction);
*** /dev/null
--- b/src/include/access/minmax.h
***************
*** 0 ****
--- 1,35 ----
+ /*
+  * AM-callable functions for MinMax indexes
+  *
+  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  * IDENTIFICATION
+  *		src/include/access/minmax.h
+  */
+ #ifndef MINMAX_H
+ #define MINMAX_H
+ 
+ #include "fmgr.h"
+ 
+ 
+ /*
+  * prototypes for functions in minmax.c (external entry points for minmax)
+  */
+ extern Datum mmbuild(PG_FUNCTION_ARGS);
+ extern Datum mmbuildempty(PG_FUNCTION_ARGS);
+ extern Datum mminsert(PG_FUNCTION_ARGS);
+ extern Datum mmbeginscan(PG_FUNCTION_ARGS);
+ extern Datum mmgettuple(PG_FUNCTION_ARGS);
+ extern Datum mmgetbitmap(PG_FUNCTION_ARGS);
+ extern Datum mmrescan(PG_FUNCTION_ARGS);
+ extern Datum mmendscan(PG_FUNCTION_ARGS);
+ extern Datum mmmarkpos(PG_FUNCTION_ARGS);
+ extern Datum mmrestrpos(PG_FUNCTION_ARGS);
+ extern Datum mmbulkdelete(PG_FUNCTION_ARGS);
+ extern Datum mmvacuumcleanup(PG_FUNCTION_ARGS);
+ extern Datum mmcanreturn(PG_FUNCTION_ARGS);
+ extern Datum mmcostestimate(PG_FUNCTION_ARGS);
+ extern Datum mmoptions(PG_FUNCTION_ARGS);
+ 
+ #endif   /* MINMAX_H */
*** /dev/null
--- b/src/include/access/minmax_internal.h
***************
*** 0 ****
--- 1,39 ----
+ /*
+  * minmax_internal.h
+  *		internal declarations for MinMax indexes
+  *
+  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  * IDENTIFICATION
+  *		src/include/access/minmax_internal.h
+  */
+ #ifndef MINMAX_INTERNAL_H
+ #define MINMAX_INTERNAL_H
+ 
+ #include "storage/buf.h"
+ #include "storage/bufpage.h"
+ #include "storage/off.h"
+ 
+ /* Metapage definitions */
+ typedef struct MinmaxMetaPageData
+ {
+ 	int32	minmaxMagic;
+ 	int32	minmaxVersion;
+ } MinmaxMetaPageData;
+ 
+ #define MINMAX_CURRENT_VERSION		1
+ #define MINMAX_META_MAGIC			0xA8109CFA
+ 
+ #define MINMAX_METAPAGE_BLKNO	0
+ 
+ #define MM_REVMAP_FORKNUM	VISIBILITYMAP_FORKNUM		/* reuse the VM forknum */
+ 
+ 
+ extern void mm_init_metapage(Buffer meta);
+ extern void rm_page_set_iptr(Page page, int pagesPerRange, BlockNumber heapBlk,
+ 							 BlockNumber blkno, OffsetNumber offno);
+ 
+ 
+ #endif   /* MINMAX_INTERNAL_H */
*** /dev/null
--- b/src/include/access/minmax_revmap.h
***************
*** 0 ****
--- 1,34 ----
+ /*
+  * prototypes for minmax reverse range maps
+  *
+  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  * IDENTIFICATION
+  *		src/include/access/minmax_revmap.h
+  */
+ 
+ #ifndef MINMAX_REVMAP_H
+ #define MINMAX_REVMAP_H
+ 
+ #include "storage/block.h"
+ #include "storage/itemptr.h"
+ #include "storage/off.h"
+ #include "utils/relcache.h"
+ 
+ /* struct definition lives in mmrevmap.c */
+ typedef struct mmRevmapAccess mmRevmapAccess;
+ 
+ extern mmRevmapAccess *mmRevmapAccessInit(Relation idxrel,
+ 				  BlockNumber pagesPerRange);
+ extern void mmRevmapAccessTerminate(mmRevmapAccess *rmAccess);
+ 
+ extern void mmRevmapCreate(Relation idxrel);
+ extern void mmSetHeapBlockItemptr(mmRevmapAccess *rmAccess, BlockNumber blk,
+ 					  BlockNumber blkno, OffsetNumber offno);
+ extern void mmGetHeapBlockItemptr(mmRevmapAccess *rmAccess, BlockNumber blk,
+ 					  ItemPointerData *iptr);
+ extern void mmRevmapTruncate(mmRevmapAccess *rmAccess,
+ 				 BlockNumber heapNumBlocks);
+ 
+ #endif   /* MINMAX_REVMAP_H */
*** /dev/null
--- b/src/include/access/minmax_tuple.h
***************
*** 0 ****
--- 1,79 ----
+ /*
+  * Declarations for dealing with MinMax-specific tuples.
+  *
+  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  * IDENTIFICATION
+  *	  src/include/access/minmax_tuple.h
+  */
+ #ifndef MINMAX_TUPLE_H
+ #define MINMAX_TUPLE_H
+ 
+ #include "access/tupdesc.h"
+ 
+ 
+ /*
+  * This struct is used to represent the indexed values for one column, within
+  * one page range.
+  */
+ typedef struct MMValues
+ {
+ 	Datum		min;
+ 	Datum		max;
+ 	bool		hasnulls;
+ 	bool		allnulls;
+ } MMValues;
+ 
+ /*
+  * This struct represents one index tuple, comprising the minimum and
+  * maximum values for all indexed columns, within one page range.
+  * The number of elements in the values array is determined by the accompanying
+  * tuple descriptor.
+  */
+ typedef struct DeformedMMTuple
+ {
+ 	bool		nvalues;		/* XXX unused */
+ 	MMValues	values[FLEXIBLE_ARRAY_MEMBER];
+ } DeformedMMTuple;
+ 
+ /*
+  * An on-disk minmax tuple.  This is possibly followed by a nulls bitmask, with
+  * room for natts*2 null bits; min and max Datum values for each column follow
+  * that.
+  */
+ typedef struct MMTuple
+ {
+ 	/* ---------------
+ 	 * mt_info is laid out in the following fashion:
+ 	 *
+ 	 * bit 7 (high bit): has nulls
+ 	 * bit 6: unused
+ 	 * bit 5: unused
+ 	 * bits 0-4: offset of data
+ 	 * ---------------
+ 	 */
+ 	uint8		mt_info;
+ } MMTuple;
+ 
+ #define SizeOfMinMaxTuple	(offsetof(MMTuple, mt_info) + sizeof(uint8))
+ 
+ /*
+  * mt_info manipulation macros
+  */
+ #define MMIDX_OFFSET_MASK 0x1F
+ /* bit 0x20 is not used at present */
+ /* bit 0x40 is not used at present */
+ #define MMIDX_NULLS_MASK 0x80
+ 
+ #define MMTupleDataOffset(mmtup)	((Size) (((MMTuple *) (mmtup))->mt_info & MMIDX_OFFSET_MASK))
+ #define MMTupleHasNulls(mmtup)	((((MMTuple *) (mmtup))->mt_info & MMIDX_NULLS_MASK) != 0)
+ 
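+ /*
+  * Illustrative example (not part of the patch): given an MMTuple pointer
+  * "mmtup", the start of the data area and of the optional nulls bitmask
+  * can be located with the macros above; that the bitmask immediately
+  * follows the fixed header is an assumption of this sketch:
+  *
+  *		char	   *data = (char *) mmtup + MMTupleDataOffset(mmtup);
+  *		bits8	   *nulls = MMTupleHasNulls(mmtup) ?
+  *			(bits8 *) ((char *) mmtup + SizeOfMinMaxTuple) : NULL;
+  */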
+ 
+ extern TupleDesc minmax_get_descr(TupleDesc tupdesc);
+ extern MMTuple *minmax_form_tuple(TupleDesc idxDesc, TupleDesc diskDesc,
+ 				  DeformedMMTuple *tuple, Size *size);
+ extern void minmax_free_tuple(MMTuple *tuple);
+ extern DeformedMMTuple *minmax_deform_tuple(TupleDesc tupdesc, MMTuple *tuple);
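+ 
+ /*
+  * Illustrative round-trip sketch (an assumption, not taken from the patch;
+  * in particular, which descriptor minmax_deform_tuple expects is assumed
+  * here to be the index's own):
+  *
+  *		TupleDesc	diskDesc = minmax_get_descr(idxDesc);
+  *		Size		size;
+  *		MMTuple    *mmtup = minmax_form_tuple(idxDesc, diskDesc, dtup, &size);
+  *		DeformedMMTuple *dtup2 = minmax_deform_tuple(idxDesc, mmtup);
+  *
+  *		minmax_free_tuple(mmtup);
+  */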
+ 
+ #endif   /* MINMAX_TUPLE_H */
*** /dev/null
--- b/src/include/access/minmax_xlog.h
***************
*** 0 ****
--- 1,93 ----
+ /*-------------------------------------------------------------------------
+  *
+  * minmax_xlog.h
+  *	  POSTGRES MinMax access XLOG definitions.
+  *
+  *
+  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  * src/include/access/minmax_xlog.h
+  *
+  *-------------------------------------------------------------------------
+  */
+ #ifndef MINMAX_XLOG_H
+ #define MINMAX_XLOG_H
+ 
+ #include "access/xlog.h"
+ #include "storage/bufpage.h"
+ #include "storage/itemptr.h"
+ #include "storage/relfilenode.h"
+ #include "utils/relcache.h"
+ 
+ 
+ /*
+  * WAL record definitions for minmax's WAL operations
+  *
+  * XLOG allows us to store some information in the high 4 bits of the
+  * xl_info field of each log record.
+  */
+ #define XLOG_MINMAX_CREATE_INDEX	0x00
+ #define XLOG_MINMAX_INSERT			0x10
+ #define XLOG_MINMAX_BULKREMOVE		0x20
+ #define XLOG_MINMAX_REVMAP_SET		0x30
+ 
+ #define XLOG_MINMAX_OPMASK			0x70
+ /*
+  * When we insert the first item on a new page, we restore the entire page in
+  * redo.
+  */
+ #define XLOG_MINMAX_INIT_PAGE		0x80
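+ 
+ /*
+  * Illustrative example (not part of the patch): redo and desc routines
+  * would typically separate the operation code from the INIT_PAGE flag
+  * like this:
+  *
+  *		uint8		info = record->xl_info & ~XLR_INFO_MASK;
+  *
+  *		if (info & XLOG_MINMAX_INIT_PAGE)
+  *			... reinitialize the target page before replaying ...
+  *		switch (info & XLOG_MINMAX_OPMASK)
+  *		{
+  *			case XLOG_MINMAX_INSERT:
+  *				...
+  *		}
+  */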
+ 
+ /* This is what we need to know about a minmax index create */
+ typedef struct xl_minmax_createidx
+ {
+ 	RelFileNode	node;
+ } xl_minmax_createidx;
+ #define SizeOfMinmaxCreateIdx	(offsetof(xl_minmax_createidx, node) + sizeof(RelFileNode))
+ 
+ /* All that we need to find a minmax tuple */
+ typedef struct xl_minmax_tid
+ {
+ 	RelFileNode	node;
+ 	ItemPointerData tid;
+ } xl_minmax_tid;
+ 
+ #define SizeOfMinmaxTid		(offsetof(xl_minmax_tid, tid) + SizeOfIptrData)
+ 
+ /* This is what we need to know about a minmax tuple insert */
+ typedef struct xl_minmax_insert
+ {
+ 	xl_minmax_tid	target;
+ 	/* tuple data follows at end of struct */
+ } xl_minmax_insert;
+ 
+ #define SizeOfMinmaxInsert		(offsetof(xl_minmax_insert, target) + SizeOfMinmaxTid)
+ 
+ /* This is what we need to know about a bulk minmax tuple remove */
+ typedef struct xl_minmax_bulkremove
+ {
+ 	RelFileNode node;
+ 	BlockNumber	block;
+ 	/* offset number array follows at end of struct */
+ } xl_minmax_bulkremove;
+ 
+ #define SizeOfMinmaxBulkRemove	(offsetof(xl_minmax_bulkremove, block) + sizeof(BlockNumber))
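+ 
+ /*
+  * Illustrative note (an assumption, not taken from the patch): since the
+  * offset number array follows the fixed part, the full length of a
+  * bulk-remove record with "nitems" entries would be computed as
+  *
+  *		SizeOfMinmaxBulkRemove + nitems * sizeof(OffsetNumber)
+  */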
+ 
+ /* This is what we need to know about a revmap "set heap ptr" */
+ typedef struct xl_minmax_rm_set
+ {
+ 	RelFileNode		node;
+ 	BlockNumber		mapBlock;
+ 	int				pagesPerRange;
+ 	BlockNumber		heapBlock;
+ 	ItemPointerData newval;
+ } xl_minmax_rm_set;
+ 
+ #define SizeOfMinmaxRevmapSet	(offsetof(xl_minmax_rm_set, newval) + SizeOfIptrData)
+ 
+ 
+ extern void minmax_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern void minmax_redo(XLogRecPtr lsn, XLogRecord *record);
+ 
+ #endif	/* MINMAX_XLOG_H */
*** a/src/include/access/relscan.h
--- b/src/include/access/relscan.h
***************
*** 35,42 **** typedef struct HeapScanDescData
  	bool		rs_temp_snap;	/* unregister snapshot at scan end? */
  
  	/* state set up at initscan time */
! 	BlockNumber rs_nblocks;		/* number of blocks to scan */
  	BlockNumber rs_startblock;	/* block # to start at */
  	BufferAccessStrategy rs_strategy;	/* access strategy for reads */
  	bool		rs_syncscan;	/* report location to syncscan logic? */
  
--- 35,44 ----
  	bool		rs_temp_snap;	/* unregister snapshot at scan end? */
  
  	/* state set up at initscan time */
! 	BlockNumber rs_nblocks;		/* total number of blocks in rel */
  	BlockNumber rs_startblock;	/* block # to start at */
+ 	BlockNumber rs_initblock;	/* first block # of the range to scan */
+ 	BlockNumber rs_numblocks;	/* number of blocks to scan */
  	BufferAccessStrategy rs_strategy;	/* access strategy for reads */
  	bool		rs_syncscan;	/* report location to syncscan logic? */
  
*** a/src/include/access/rmgrlist.h
--- b/src/include/access/rmgrlist.h
***************
*** 42,44 **** PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup
--- 42,45 ----
  PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, NULL)
  PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, NULL, NULL, NULL)
  PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_xlog_startup, spg_xlog_cleanup, NULL)
+ PG_RMGR(RM_MINMAX_ID, "MinMax", minmax_redo, minmax_desc, NULL, NULL, NULL)
*** a/src/include/catalog/index.h
--- b/src/include/catalog/index.h
***************
*** 97,102 **** extern double IndexBuildHeapScan(Relation heapRelation,
--- 97,110 ----
  				   bool allow_sync,
  				   IndexBuildCallback callback,
  				   void *callback_state);
+ extern double IndexBuildHeapRangeScan(Relation heapRelation,
+ 						Relation indexRelation,
+ 						IndexInfo *indexInfo,
+ 						bool allow_sync,
+ 						BlockNumber start_blockno,
+ 						BlockNumber end_blockno,
+ 						IndexBuildCallback callback,
+ 						void *callback_state);
  
  extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot);
  
*** a/src/include/catalog/pg_am.h
--- b/src/include/catalog/pg_am.h
***************
*** 132,136 **** DESCR("GIN index access method");
--- 132,138 ----
  DATA(insert OID = 4000 (  spgist	0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions ));
  DESCR("SP-GiST index access method");
  #define SPGIST_AM_OID 4000
+ DATA(insert OID = 3847 (  minmax	5 0 f f f f t t f t t f f 0 mminsert mmbeginscan - mmgetbitmap mmrescan mmendscan mmmarkpos mmrestrpos mmbuild mmbuildempty mmbulkdelete mmvacuumcleanup - mmcostestimate mmoptions ));
+ DESCR("minmax index access method");
+ #define MINMAX_AM_OID 3847
  
  #endif   /* PG_AM_H */
*** a/src/include/catalog/pg_amop.h
--- b/src/include/catalog/pg_amop.h
***************
*** 781,784 **** DATA(insert (	3474   3831 3831 8 s	3892 4000 0 ));
--- 781,811 ----
  DATA(insert (	3474   3831 2283 16 s	3889 4000 0 ));
  DATA(insert (	3474   3831 3831 18 s	3882 4000 0 ));
  
+ /*
+  * MinMax int4_ops
+  */
+ DATA(insert (	3192   23 23 1 s	97	3847 0 ));
+ DATA(insert (	3192   23 23 2 s	523 3847 0 ));
+ DATA(insert (	3192   23 23 3 s	96	3847 0 ));
+ DATA(insert (	3192   23 23 4 s	525 3847 0 ));
+ DATA(insert (	3192   23 23 5 s	521 3847 0 ));
+ 
+ /*
+  * MinMax numeric_ops
+  */
+ DATA(insert (	3193   1700 1700 1 s 1754 3847 0 ));
+ DATA(insert (	3193   1700 1700 2 s 1755 3847 0 ));
+ DATA(insert (	3193   1700 1700 3 s 1752 3847 0 ));
+ DATA(insert (	3193   1700 1700 4 s 1757 3847 0 ));
+ DATA(insert (	3193   1700 1700 5 s 1756 3847 0 ));
+ 
+ /*
+  * MinMax text_ops
+  */
+ DATA(insert (	3194   25 25 1 s	664 3847 0 ));
+ DATA(insert (	3194   25 25 2 s	665 3847 0 ));
+ DATA(insert (	3194   25 25 3 s	98	3847 0 ));
+ DATA(insert (	3194   25 25 4 s	667 3847 0 ));
+ DATA(insert (	3194   25 25 5 s	666 3847 0 ));
+ 
  #endif   /* PG_AMOP_H */
*** a/src/include/catalog/pg_opclass.h
--- b/src/include/catalog/pg_opclass.h
***************
*** 227,231 **** DATA(insert (	4000	range_ops			PGNSP PGUID 3474  3831 t 0 ));
--- 227,234 ----
  DATA(insert (	4000	quad_point_ops		PGNSP PGUID 4015  600 t 0 ));
  DATA(insert (	4000	kd_point_ops		PGNSP PGUID 4016  600 f 0 ));
  DATA(insert (	4000	text_ops			PGNSP PGUID 4017  25 t 0 ));
+ DATA(insert (	3847	int4_ops			PGNSP PGUID 3192  23 t 0 ));
+ DATA(insert (	3847	numeric_ops			PGNSP PGUID 3193  1700 t 0 ));
+ DATA(insert (	3847	text_ops			PGNSP PGUID 3194  25 t 0 ));
  
  #endif   /* PG_OPCLASS_H */
*** a/src/include/catalog/pg_opfamily.h
--- b/src/include/catalog/pg_opfamily.h
***************
*** 147,151 **** DATA(insert OID = 4015 (	4000	quad_point_ops	PGNSP PGUID ));
--- 147,154 ----
  DATA(insert OID = 4016 (	4000	kd_point_ops	PGNSP PGUID ));
  DATA(insert OID = 4017 (	4000	text_ops		PGNSP PGUID ));
  #define TEXT_SPGIST_FAM_OID 4017
+ DATA(insert OID = 3192 (	3847	int4_ops		PGNSP PGUID ));
+ DATA(insert OID = 3193 (	3847	numeric_ops		PGNSP PGUID ));
+ DATA(insert OID = 3194 (	3847	text_ops		PGNSP PGUID ));
  
  #endif   /* PG_OPFAMILY_H */
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 561,566 **** DESCR("btree(internal)");
--- 561,594 ----
  DATA(insert OID = 2785 (  btoptions		   PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_  btoptions _null_ _null_ _null_ ));
  DESCR("btree(internal)");
  
+ DATA(insert OID = 3178 (  mmgetbitmap	   PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 20 "2281 2281" _null_ _null_ _null_ _null_	mmgetbitmap _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ DATA(insert OID = 3179 (  mminsert		   PGNSP PGUID 12 1 0 0 0 f f f f t f v 6 0 16 "2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_	mminsert _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ DATA(insert OID = 3180 (  mmbeginscan	   PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_	mmbeginscan _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ DATA(insert OID = 3181 (  mmrescan		   PGNSP PGUID 12 1 0 0 0 f f f f t f v 5 0 2278 "2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ mmrescan _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ DATA(insert OID = 3182 (  mmendscan		   PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ mmendscan _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ DATA(insert OID = 3183 (  mmmarkpos		   PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ mmmarkpos _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ DATA(insert OID = 3184 (  mmrestrpos		   PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ mmrestrpos _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ DATA(insert OID = 3185 (  mmbuild		   PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ mmbuild _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ DATA(insert OID = 3186 (  mmbuildempty	   PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ mmbuildempty _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ DATA(insert OID = 3187 (  mmbulkdelete	   PGNSP PGUID 12 1 0 0 0 f f f f t f v 4 0 2281 "2281 2281 2281 2281" _null_ _null_ _null_ _null_ mmbulkdelete _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ DATA(insert OID = 3188 (  mmvacuumcleanup   PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 2281 "2281 2281" _null_ _null_ _null_ _null_ mmvacuumcleanup _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ DATA(insert OID = 3190 (  mmcostestimate   PGNSP PGUID 12 1 0 0 0 f f f f t f v 7 0 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ mmcostestimate _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ DATA(insert OID = 3191 (  mmoptions		   PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_  mmoptions _null_ _null_ _null_ ));
+ DESCR("minmax(internal)");
+ 
+ 
  DATA(insert OID = 339 (  poly_same		   PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_same _null_ _null_ _null_ ));
  DATA(insert OID = 340 (  poly_contain	   PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_contain _null_ _null_ _null_ ));
  DATA(insert OID = 341 (  poly_left		   PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_left _null_ _null_ _null_ ));
*** a/src/include/storage/bufpage.h
--- b/src/include/storage/bufpage.h
***************
*** 403,408 **** extern Size PageGetExactFreeSpace(Page page);
--- 403,409 ----
  extern Size PageGetHeapFreeSpace(Page page);
  extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
  extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
+ extern void PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos, int nitems);
  extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
  extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
  
*** a/src/test/regress/expected/opr_sanity.out
--- b/src/test/regress/expected/opr_sanity.out
***************
*** 1076,1081 **** ORDER BY 1, 2, 3;
--- 1076,1086 ----
         2742 |            2 | @@@
         2742 |            3 | <@
         2742 |            4 | =
+        3847 |            1 | <
+        3847 |            2 | <=
+        3847 |            3 | =
+        3847 |            4 | >=
+        3847 |            5 | >
         4000 |            1 | <<
         4000 |            1 | ~<~
         4000 |            2 | &<
***************
*** 1098,1104 **** ORDER BY 1, 2, 3;
         4000 |           15 | >
         4000 |           16 | @>
         4000 |           18 | =
! (62 rows)
  
  -- Check that all opclass search operators have selectivity estimators.
  -- This is not absolutely required, but it seems a reasonable thing
--- 1103,1109 ----
         4000 |           15 | >
         4000 |           16 | @>
         4000 |           18 | =
! (67 rows)
  
  -- Check that all opclass search operators have selectivity estimators.
  -- This is not absolutely required, but it seems a reasonable thing
***************
*** 1271,1277 **** FROM pg_am am JOIN pg_opclass op ON opcmethod = am.oid
           amproclefttype = amprocrighttype AND amproclefttype = opcintype
  WHERE am.amname <> 'btree' AND am.amname <> 'gist' AND am.amname <> 'gin'
  GROUP BY amname, amsupport, opcname, amprocfamily
! HAVING count(*) != amsupport OR amprocfamily IS NULL;
   amname | opcname | count 
  --------+---------+-------
  (0 rows)
--- 1276,1282 ----
           amproclefttype = amprocrighttype AND amproclefttype = opcintype
  WHERE am.amname <> 'btree' AND am.amname <> 'gist' AND am.amname <> 'gin'
  GROUP BY amname, amsupport, opcname, amprocfamily
! HAVING count(*) != amsupport AND amprocfamily IS NOT NULL;
   amname | opcname | count 
  --------+---------+-------
  (0 rows)
*** a/src/test/regress/sql/opr_sanity.sql
--- b/src/test/regress/sql/opr_sanity.sql
***************
*** 978,984 **** FROM pg_am am JOIN pg_opclass op ON opcmethod = am.oid
           amproclefttype = amprocrighttype AND amproclefttype = opcintype
  WHERE am.amname <> 'btree' AND am.amname <> 'gist' AND am.amname <> 'gin'
  GROUP BY amname, amsupport, opcname, amprocfamily
! HAVING count(*) != amsupport OR amprocfamily IS NULL;
  
  SELECT amname, opcname, count(*)
  FROM pg_am am JOIN pg_opclass op ON opcmethod = am.oid
--- 978,984 ----
           amproclefttype = amprocrighttype AND amproclefttype = opcintype
  WHERE am.amname <> 'btree' AND am.amname <> 'gist' AND am.amname <> 'gin'
  GROUP BY amname, amsupport, opcname, amprocfamily
! HAVING count(*) != amsupport AND amprocfamily IS NOT NULL;
  
  SELECT amname, opcname, count(*)
  FROM pg_am am JOIN pg_opclass op ON opcmethod = am.oid
