*** a/GNUmakefile.in --- b/GNUmakefile.in *************** *** 75,81 **** distclean maintainer-clean: check: all ! check installcheck installcheck-parallel: $(MAKE) -C src/test $@ installcheck-world: --- 75,81 ---- check: all ! check dcheck installcheck installcheck-parallel: $(MAKE) -C src/test $@ installcheck-world: *** a/src/backend/access/heap/heapam.c --- b/src/backend/access/heap/heapam.c *************** *** 57,62 **** --- 57,63 ---- #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" + #include "storage/predicate.h" #include "storage/procarray.h" #include "storage/smgr.h" #include "storage/standby.h" *************** *** 261,280 **** heapgetpage(HeapScanDesc scan, BlockNumber page) { if (ItemIdIsNormal(lpp)) { bool valid; if (all_visible) valid = true; else { - HeapTupleData loctup; - loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); loctup.t_len = ItemIdGetLength(lpp); ItemPointerSet(&(loctup.t_self), page, lineoff); valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); } if (valid) scan->rs_vistuples[ntup++] = lineoff; } --- 262,283 ---- { if (ItemIdIsNormal(lpp)) { + HeapTupleData loctup; bool valid; if (all_visible) valid = true; else { loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); loctup.t_len = ItemIdGetLength(lpp); ItemPointerSet(&(loctup.t_self), page, lineoff); valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); } + + CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, buffer); + if (valid) scan->rs_vistuples[ntup++] = lineoff; } *************** *** 468,479 **** heapgettup(HeapScanDesc scan, --- 471,485 ---- snapshot, scan->rs_cbuf); + CheckForSerializableConflictOut(valid, scan->rs_rd, tuple, scan->rs_cbuf); + if (valid && key != NULL) HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd), nkeys, key, valid); if (valid) { + PredicateLockTuple(scan->rs_rd, tuple); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); return; } *************** *** 741,752 **** 
heapgettup_pagemode(HeapScanDesc scan, --- 747,760 ---- nkeys, key, valid); if (valid) { + PredicateLockTuple(scan->rs_rd, tuple); scan->rs_cindex = lineindex; return; } } else { + PredicateLockTuple(scan->rs_rd, tuple); scan->rs_cindex = lineindex; return; } *************** *** 1460,1467 **** heap_fetch(Relation relation, --- 1468,1478 ---- LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + CheckForSerializableConflictOut(valid, relation, tuple, buffer); + if (valid) { + PredicateLockTuple(relation, tuple); /* * All checks passed, so return the tuple as valid. Caller is now * responsible for releasing the buffer. *************** *** 1505,1517 **** heap_fetch(Relation relation, * heap_fetch, we do not report any pgstats count; caller may do so if wanted. */ bool ! heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, ! bool *all_dead) { Page dp = (Page) BufferGetPage(buffer); TransactionId prev_xmax = InvalidTransactionId; OffsetNumber offnum; bool at_chain_start; if (all_dead) *all_dead = true; --- 1516,1530 ---- * heap_fetch, we do not report any pgstats count; caller may do so if wanted. */ bool ! heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, ! 
Snapshot snapshot, bool *all_dead) { Page dp = (Page) BufferGetPage(buffer); TransactionId prev_xmax = InvalidTransactionId; OffsetNumber offnum; bool at_chain_start; + bool valid; + bool match_found; if (all_dead) *all_dead = true; *************** *** 1521,1526 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, --- 1534,1540 ---- Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer)); offnum = ItemPointerGetOffsetNumber(tid); at_chain_start = true; + match_found = false; /* Scan through possible multiple members of HOT-chain */ for (;;) *************** *** 1551,1556 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, --- 1565,1572 ---- heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp); heapTuple.t_len = ItemIdGetLength(lp); + heapTuple.t_tableOid = relation->rd_id; + heapTuple.t_self = *tid; /* * Shouldn't see a HEAP_ONLY tuple at chain start. *************** *** 1568,1579 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, break; /* If it's visible per the snapshot, we must return it */ ! if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer)) { ItemPointerSetOffsetNumber(tid, offnum); if (all_dead) *all_dead = false; ! return true; } /* --- 1584,1601 ---- break; /* If it's visible per the snapshot, we must return it */ ! valid = HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer); ! CheckForSerializableConflictOut(valid, relation, &heapTuple, buffer); ! if (valid) { ItemPointerSetOffsetNumber(tid, offnum); + PredicateLockTuple(relation, &heapTuple); if (all_dead) *all_dead = false; ! if (IsXactIsoLevelFullySerializable) ! match_found = true; ! else ! return true; } /* *************** *** 1602,1608 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, break; /* end of chain */ } ! return false; } /* --- 1624,1630 ---- break; /* end of chain */ } ! 
return match_found; } /* *************** *** 1621,1627 **** heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); LockBuffer(buffer, BUFFER_LOCK_SHARE); ! result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); return result; --- 1643,1649 ---- buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); LockBuffer(buffer, BUFFER_LOCK_SHARE); ! result = heap_hot_search_buffer(tid, relation, buffer, snapshot, all_dead); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); return result; *************** *** 1728,1735 **** heap_get_latest_tid(Relation relation, --- 1750,1760 ---- * result candidate. */ valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer); + CheckForSerializableConflictOut(valid, relation, &tp, buffer); if (valid) + { *tid = ctid; + } /* * If there's a valid t_ctid link, follow it, else we're done. *************** *** 1892,1897 **** heap_insert(Relation relation, HeapTuple tup, CommandId cid, --- 1917,1929 ---- buffer = RelationGetBufferForTuple(relation, heaptup->t_len, InvalidBuffer, options, bistate); + /* + * We're about to do the actual insert -- check for conflict at the + * relation or buffer level first, to avoid possibly having to roll + * back work we've just done. + */ + CheckForSerializableConflictIn(relation, NULL, buffer); + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); *************** *** 2192,2197 **** l1: --- 2224,2235 ---- return result; } + /* + * We're about to do the actual delete -- check for conflict first, + * to avoid possibly having to roll back work we've just done. 
+ */ + CheckForSerializableConflictIn(relation, &tp, buffer); + /* replace cid with a combo cid if necessary */ HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo); *************** *** 2545,2550 **** l2: --- 2583,2594 ---- return result; } + /* + * We're about to do the actual update -- check for conflict first, + * to avoid possibly having to roll back work we've just done. + */ + CheckForSerializableConflictIn(relation, &oldtup, buffer); + /* Fill in OID and transaction status data for newtup */ if (relation->rd_rel->relhasoids) { *************** *** 2690,2695 **** l2: --- 2734,2749 ---- } /* + * We're about to create the new tuple -- check for conflict first, + * to avoid possibly having to roll back work we've just done. + * + * NOTE: For a tuple insert, we only need to check for table locks, since + * predicate locking at the index level will cover ranges for anything + * except a table scan. Therefore, only provide the relation. + */ + CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); + + /* * At this point newbuf and buffer are both pinned and locked, and newbuf * has enough space for the new tuple. If they are the same buffer, only * one pin is held. *************** *** 2829,2834 **** l2: --- 2883,2894 ---- CacheInvalidateHeapTuple(relation, heaptup); /* + * TODO SSI: In order to support SIREAD locks at tuple granularity, any + * existing SIREAD locks on the old tuple must be copied to + * also refer to the new tuple, somewhere around this point? + */ + + /* * Release the lmgr tuple lock, if we had it. 
*/ if (have_tuple_lock) *** a/src/backend/access/index/indexam.c --- b/src/backend/access/index/indexam.c *************** *** 64,72 **** --- 64,74 ---- #include "access/relscan.h" #include "access/transam.h" + #include "access/xact.h" #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" + #include "storage/predicate.h" #include "utils/relcache.h" #include "utils/snapmgr.h" #include "utils/tqual.h" *************** *** 192,197 **** index_insert(Relation indexRelation, --- 194,204 ---- RELATION_CHECKS; GET_REL_PROCEDURE(aminsert); + if (!(indexRelation->rd_am->ampredlocks)) + CheckForSerializableConflictIn(indexRelation, + (HeapTuple) NULL, + InvalidBuffer); + /* * have the am's insert proc do all the work. */ *************** *** 266,271 **** index_beginscan_internal(Relation indexRelation, --- 273,281 ---- RELATION_CHECKS; GET_REL_PROCEDURE(ambeginscan); + if (!(indexRelation->rd_am->ampredlocks)) + PredicateLockRelation(indexRelation); + /* * We hold a reference count to the relcache entry throughout the scan. */ *************** *** 515,520 **** index_getnext(IndexScanDesc scan, ScanDirection direction) --- 525,531 ---- { ItemId lp; ItemPointer ctid; + bool valid; /* check for bogus TID */ if (offnum < FirstOffsetNumber || *************** *** 569,576 **** index_getnext(IndexScanDesc scan, ScanDirection direction) break; /* If it's visible per the snapshot, we must return it */ ! if (HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot, ! scan->xs_cbuf)) { /* * If the snapshot is MVCC, we know that it could accept at --- 580,592 ---- break; /* If it's visible per the snapshot, we must return it */ ! valid = HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot, ! scan->xs_cbuf); ! ! CheckForSerializableConflictOut(valid, scan->heapRelation, ! heapTuple, scan->xs_cbuf); ! ! 
if (valid) { /* * If the snapshot is MVCC, we know that it could accept at *************** *** 578,584 **** index_getnext(IndexScanDesc scan, ScanDirection direction) * any more members. Otherwise, check for continuation of the * HOT-chain, and set state for next time. */ ! if (IsMVCCSnapshot(scan->xs_snapshot)) scan->xs_next_hot = InvalidOffsetNumber; else if (HeapTupleIsHotUpdated(heapTuple)) { --- 594,601 ---- * any more members. Otherwise, check for continuation of the * HOT-chain, and set state for next time. */ ! if (IsMVCCSnapshot(scan->xs_snapshot) ! && !IsXactIsoLevelFullySerializable) scan->xs_next_hot = InvalidOffsetNumber; else if (HeapTupleIsHotUpdated(heapTuple)) { *************** *** 594,599 **** index_getnext(IndexScanDesc scan, ScanDirection direction) --- 611,618 ---- pgstat_count_heap_fetch(scan->indexRelation); + PredicateLockTuple(scan->heapRelation, heapTuple); + return heapTuple; } *** a/src/backend/access/nbtree/nbtinsert.c --- b/src/backend/access/nbtree/nbtinsert.c *************** *** 21,26 **** --- 21,27 ---- #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" + #include "storage/predicate.h" #include "utils/inval.h" #include "utils/tqual.h" *************** *** 175,180 **** top: --- 176,189 ---- if (checkUnique != UNIQUE_CHECK_EXISTING) { + /* + * The only conflict predicate locking cares about for indexes is when + * an index tuple insert conflicts with an existing lock. Since the + * actual location of the insert is hard to predict because of the + * random search used to prevent O(N^2) performance when there are many + * duplicate entries, we can just use the "first valid" page. 
+ */ + CheckForSerializableConflictIn(rel, NULL, buf); /* do the insertion */ _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel); _bt_insertonpg(rel, buf, stack, itup, offset, false); *************** *** 697,702 **** _bt_insertonpg(Relation rel, --- 706,714 ---- /* split the buffer into left and right halves */ rbuf = _bt_split(rel, buf, firstright, newitemoff, itemsz, itup, newitemonleft); + PredicateLockPageSplit(rel, + BufferGetBlockNumber(buf), + BufferGetBlockNumber(rbuf)); /*---------- * By here, *** a/src/backend/access/nbtree/nbtpage.c --- b/src/backend/access/nbtree/nbtpage.c *************** *** 1177,1182 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack) --- 1177,1188 ---- rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); /* + * Any insert which would have gone on the target block will now go to the + * right sibling block. + */ + PredicateLockPageCombine(rel, target, rightsib); + + /* * Next find and write-lock the current parent of the target page. This is * essentially the same as the corresponding step of splitting. */ *** a/src/backend/access/nbtree/nbtsearch.c --- b/src/backend/access/nbtree/nbtsearch.c *************** *** 21,26 **** --- 21,27 ---- #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" + #include "storage/predicate.h" #include "utils/lsyscache.h" #include "utils/rel.h" *************** *** 63,69 **** _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, --- 64,73 ---- /* If index is empty and access = BT_READ, no root page is created. */ if (!BufferIsValid(*bufP)) + { + PredicateLockRelation(rel); /* Nothing finer to lock exists. 
*/ return (BTStack) NULL; + } /* Loop iterates once per level descended in the tree */ for (;;) *************** *** 88,94 **** _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, --- 92,102 ---- page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISLEAF(opaque)) + { + if (access == BT_READ) + PredicateLockPage(rel, BufferGetBlockNumber(*bufP)); break; + } /* * Find the appropriate item on the internal page, and get the child *************** *** 199,204 **** _bt_moveright(Relation rel, --- 207,213 ---- elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); + PredicateLockPage(rel, BufferGetBlockNumber(buf)); return buf; } *************** *** 1142,1147 **** _bt_steppage(IndexScanDesc scan, ScanDirection dir) --- 1151,1157 ---- opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) { + PredicateLockPage(rel, blkno); /* see if there are any matches on this page */ /* note that this will clear moreRight if we can stop */ if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque))) *************** *** 1189,1194 **** _bt_steppage(IndexScanDesc scan, ScanDirection dir) --- 1199,1205 ---- opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) { + PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf)); /* see if there are any matches on this page */ /* note that this will clear moreLeft if we can stop */ if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page))) *************** *** 1352,1357 **** _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) --- 1363,1369 ---- if (!BufferIsValid(buf)) { /* empty index... */ + PredicateLockRelation(rel); /* Nothing finer to lock exists. */ return InvalidBuffer; } *************** *** 1431,1440 **** _bt_endpoint(IndexScanDesc scan, ScanDirection dir) --- 1443,1454 ---- if (!BufferIsValid(buf)) { /* empty index... */ + PredicateLockRelation(rel); /* Nothing finer to lock exists. 
*/ so->currPos.buf = InvalidBuffer; return false; } + PredicateLockPage(rel, BufferGetBlockNumber(buf)); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(P_ISLEAF(opaque)); *** a/src/backend/access/transam/xact.c --- b/src/backend/access/transam/xact.c *************** *** 39,44 **** --- 39,45 ---- #include "storage/bufmgr.h" #include "storage/fd.h" #include "storage/lmgr.h" + #include "storage/predicate.h" #include "storage/procarray.h" #include "storage/sinvaladt.h" #include "storage/smgr.h" *************** *** 1754,1759 **** CommitTransaction(void) --- 1755,1767 ---- AtEOXact_LargeObject(true); /* + * Mark serializable transaction as complete for predicate locking + * purposes. This should be done as late as we can put it and still + * allow errors to be raised for failure patterns found at commit. + */ + PreCommit_CheckForSerializationFailure(); + + /* * Insert notifications sent by NOTIFY commands into the queue. This * should be late in the pre-commit sequence to minimize time spent * holding the notify-insertion lock. *** a/src/backend/catalog/index.c --- b/src/backend/catalog/index.c *************** *** 2044,2050 **** IndexCheckExclusion(Relation heapRelation, * * After completing validate_index(), we wait until all transactions that * were alive at the time of the reference snapshot are gone; this is ! * necessary to be sure there are none left with a serializable snapshot * older than the reference (and hence possibly able to see tuples we did * not index). Then we mark the index "indisvalid" and commit. Subsequent * transactions will be able to use it for queries. --- 2044,2050 ---- * * After completing validate_index(), we wait until all transactions that * were alive at the time of the reference snapshot are gone; this is ! * necessary to be sure there are none left with a transaction-based snapshot * older than the reference (and hence possibly able to see tuples we did * not index). 
Then we mark the index "indisvalid" and commit. Subsequent * transactions will be able to use it for queries. *** a/src/backend/commands/trigger.c --- b/src/backend/commands/trigger.c *************** *** 2360,2366 **** ltrmark:; case HeapTupleUpdated: ReleaseBuffer(buffer); ! if (IsXactIsoLevelSerializable) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); --- 2360,2366 ---- case HeapTupleUpdated: ReleaseBuffer(buffer); ! if (IsXactIsoLevelXactSnapshotBased) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); *** a/src/backend/executor/execMain.c --- b/src/backend/executor/execMain.c *************** *** 1544,1550 **** EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, case HeapTupleUpdated: ReleaseBuffer(buffer); ! if (IsXactIsoLevelSerializable) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); --- 1544,1550 ---- case HeapTupleUpdated: ReleaseBuffer(buffer); ! if (IsXactIsoLevelXactSnapshotBased) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); *** a/src/backend/executor/nodeBitmapHeapscan.c --- b/src/backend/executor/nodeBitmapHeapscan.c *************** *** 42,47 **** --- 42,48 ---- #include "executor/nodeBitmapHeapscan.h" #include "pgstat.h" #include "storage/bufmgr.h" + #include "storage/predicate.h" #include "utils/memutils.h" #include "utils/snapmgr.h" #include "utils/tqual.h" *************** *** 351,357 **** bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) ItemPointerData tid; ItemPointerSet(&tid, page, offnum); ! if (heap_hot_search_buffer(&tid, buffer, snapshot, NULL)) scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); } } --- 352,358 ---- ItemPointerData tid; ItemPointerSet(&tid, page, offnum); ! 
if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, NULL)) scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); } } *** a/src/backend/executor/nodeIndexscan.c --- b/src/backend/executor/nodeIndexscan.c *************** *** 30,35 **** --- 30,36 ---- #include "executor/execdebug.h" #include "executor/nodeIndexscan.h" #include "optimizer/clauses.h" + #include "storage/predicate.h" #include "utils/array.h" #include "utils/lsyscache.h" #include "utils/memutils.h" *** a/src/backend/executor/nodeLockRows.c --- b/src/backend/executor/nodeLockRows.c *************** *** 130,136 **** lnext: break; case HeapTupleUpdated: ! if (IsXactIsoLevelSerializable) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); --- 130,136 ---- break; case HeapTupleUpdated: ! if (IsXactIsoLevelXactSnapshotBased) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); *** a/src/backend/executor/nodeModifyTable.c --- b/src/backend/executor/nodeModifyTable.c *************** *** 328,334 **** ldelete:; break; case HeapTupleUpdated: ! if (IsXactIsoLevelSerializable) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); --- 328,334 ---- break; case HeapTupleUpdated: ! if (IsXactIsoLevelXactSnapshotBased) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); *************** *** 516,522 **** lreplace:; break; case HeapTupleUpdated: ! if (IsXactIsoLevelSerializable) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); --- 516,522 ---- break; case HeapTupleUpdated: ! 
if (IsXactIsoLevelXactSnapshotBased) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); *** a/src/backend/executor/nodeSeqscan.c --- b/src/backend/executor/nodeSeqscan.c *************** *** 28,33 **** --- 28,34 ---- #include "access/relscan.h" #include "executor/execdebug.h" #include "executor/nodeSeqscan.h" + #include "storage/predicate.h" static void InitScanRelation(SeqScanState *node, EState *estate); static TupleTableSlot *SeqNext(SeqScanState *node); *************** *** 105,115 **** SeqRecheck(SeqScanState *node, TupleTableSlot *slot) --- 106,118 ---- * tuple. * We call the ExecScan() routine and pass it the appropriate * access method functions. + * For serializable transactions, we first lock the entire relation. * ---------------------------------------------------------------- */ TupleTableSlot * ExecSeqScan(SeqScanState *node) { + PredicateLockRelation(node->ss_currentRelation); return ExecScan((ScanState *) node, (ExecScanAccessMtd) SeqNext, (ExecScanRecheckMtd) SeqRecheck); *** a/src/backend/executor/nodeTidscan.c --- b/src/backend/executor/nodeTidscan.c *************** *** 31,36 **** --- 31,37 ---- #include "executor/nodeTidscan.h" #include "optimizer/clauses.h" #include "storage/bufmgr.h" + #include "storage/predicate.h" #include "utils/array.h" *** a/src/backend/storage/ipc/ipci.c --- b/src/backend/storage/ipc/ipci.c *************** *** 105,110 **** CreateSharedMemoryAndSemaphores(bool makePrivate, int port) --- 105,111 ---- sizeof(ShmemIndexEnt))); size = add_size(size, BufferShmemSize()); size = add_size(size, LockShmemSize()); + size = add_size(size, PredicateLockShmemSize()); size = add_size(size, ProcGlobalShmemSize()); size = add_size(size, XLOGShmemSize()); size = add_size(size, CLOGShmemSize()); *************** *** 200,205 **** CreateSharedMemoryAndSemaphores(bool makePrivate, int port) --- 201,211 ---- InitLocks(); /* + * Set up predicate lock manager + */ + 
InitPredicateLocks(); + + /* * Set up process table */ if (!IsUnderPostmaster) *** a/src/backend/storage/ipc/shmqueue.c --- b/src/backend/storage/ipc/shmqueue.c *************** *** 43,56 **** SHMQueueInit(SHM_QUEUE *queue) * SHMQueueIsDetached -- TRUE if element is not currently * in a queue. */ - #ifdef NOT_USED bool SHMQueueIsDetached(SHM_QUEUE *queue) { Assert(ShmemAddrIsValid(queue)); return (queue->prev == NULL); } - #endif /* * SHMQueueElemInit -- clear an element's links --- 43,54 ---- *** a/src/backend/storage/lmgr/Makefile --- b/src/backend/storage/lmgr/Makefile *************** *** 12,18 **** subdir = src/backend/storage/lmgr top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global ! OBJS = lmgr.o lock.o proc.o deadlock.o lwlock.o spin.o s_lock.o include $(top_srcdir)/src/backend/common.mk --- 12,18 ---- top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global ! OBJS = lmgr.o lock.o proc.o deadlock.o lwlock.o spin.o s_lock.o predicate.o include $(top_srcdir)/src/backend/common.mk *** /dev/null --- b/src/backend/storage/lmgr/predicate.c *************** *** 0 **** --- 1,2418 ---- + /*------------------------------------------------------------------------- + * + * predicate.c + * POSTGRES predicate locking + * to support full serializable transaction isolation + * + * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD + * locks, which are so different from normal locks that a distinct set of + * structures is required to handle them. + * + * (1) Besides tuples actually read, they must cover ranges of tuples + * which would have been read based on the predicate. This will + * require modelling the predicates through locks against database + * objects such as pages, index ranges, or entire tables. + * + * (2) They must be kept in RAM for quick access. 
Because of this, it + * isn't possible to always maintain tuple-level granularity -- when + * the space allocated to store these approaches exhaustion, a + * request for a lock may need to scan for situations where a single + * transaction holds many fine-grained locks which can be coalesced + * into a single coarser-grained lock. + * + * (3) They never block anything; they are more like flags than locks + * in that regard; although they refer to database objects and are + * used to identify rw-conflicts with normal write locks. + * + * (4) While they are associated with a transaction, they must survive + * a successful COMMIT of that transaction, and remain until all + * overlapping transactions complete. This even means that they + * must survive termination of the transaction's process. On a + * rollback of the top level transaction, all of that transaction's + * SIREAD locks should be released, however. + * + * (5) The only transactions which create SIREAD locks or check for + * conflicts with them are serializable transactions. + * + * (6) When a write lock for a top level transaction is found to cover + * an existing SIREAD lock for the same transaction, the SIREAD lock + * can be deleted. + * + * (7) A write from a serializable transaction must ensure that a xact + * record exists for the transaction, with the same lifespan (until + * all concurrent transaction complete or the transaction is rolled + * back) so that rw-dependencies to that transaction can be + * detected. + * + * + * Lightweight locks to manage access to the predicate locking shared + * memory objects must be taken in this order, and should be released in + * reverse order: + * + * SerializableFinishedListLock + * - Protects the list of transaction which have completed but which + * may yet matter because they overlap still-active transactions. 
+ * + * SerializablePredicateLockListLock + * - Special handling: use shared mode for walking the list *and* + * for modifying the list from the process running the owning + * transaction. No other process is allowed to walk the list, + * and any other process must acquire exclusive access to modify + * it. Once a transaction has completed, it is the holder of + * the SerializableFinishedListLock who can walk the list in + * shared mode. + * + * FirstPredicateLockMgrLock based partition locks + * - The same lock protects a target and all locks on that target. + * - When more than one is needed, acquire in ascending order. + * + * SerializableXactHashLock + * - Protects both SerializableXactHash and SerializableXidHash. + * + * + * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + /* + * INTERFACE ROUTINES + * + * housekeeping for setting up shared memory predicate lock structures + * InitPredicateLocks(void) + * PredicateLockShmemSize(void) + * + * predicate lock reporting + * PredicateLockData *GetPredicateLockStatusData(void) + * + * predicate lock maintenance + * RegisterSerializableTransaction(Snapshot snapshot) + * PredicateLockRelation(Relation relation) + * PredicateLockPage(Relation relation, BlockNumber blkno) + * PredicateLockTuple(Relation relation, HeapTuple tuple) + * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, + * BlockNumber newblkno); + * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, + * BlockNumber newblkno); + * ReleasePredicateLocks(bool isCommit) + * + * conflict detection (may also trigger rollback) + * CheckForSerializableConflictOut(bool valid, Relation relation, + * HeapTupleData *tup, Buffer buffer) + * CheckForSerializableConflictIn(Relation relation, HeapTupleData *tup, + * 
Buffer buffer) + * + * final rollback checking + * PreCommit_CheckForSerializationFailure(void) + */ + + #include "postgres.h" + + #include "access/transam.h" + #include "access/twophase.h" + #include "access/xact.h" + #include "miscadmin.h" + #include "storage/bufmgr.h" + #include "storage/predicate.h" + #include "utils/rel.h" + #include "utils/snapmgr.h" + + /* + * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable + * transaction or any of its subtransactions. + */ + typedef struct SERIALIZABLEXIDTAG + { + TransactionId xid; + } SERIALIZABLEXIDTAG; + + /* + * Information to link between an xid list and a top level serializable + * transaction. + */ + typedef struct SERIALIZABLEXID + { + /* hash key */ + SERIALIZABLEXIDTAG tag; + + /* data */ + SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */ + SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of + * xids */ + } SERIALIZABLEXID; + + /* + * Per-locked-object predicate lock information: + * + * tag -- uniquely identifies the object being locked + * predicateLocks -- list of predicate lock objects for this target. + */ + typedef struct PREDICATELOCKTARGET + { + /* hash key */ + PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ + + /* data */ + SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. 
with + * predicate lock target */ + } PREDICATELOCKTARGET; + + typedef struct PREDICATELOCKTAG + { + PREDICATELOCKTARGET *myTarget; + SERIALIZABLEXACT *myXact; + } PREDICATELOCKTAG; + + typedef struct PREDICATELOCK + { + /* hash key */ + PREDICATELOCKTAG tag; /* unique identifier of lockable object */ + + /* data */ + SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of + * predicate locks */ + SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of + * predicate locks */ + } PREDICATELOCK; + + /* + * Backend-local hash table of ancestor (coarser) locks and the number + * of (finer-grained) children locks that are currently held. This is + * used to determine when to promote multiple fine-grained locks to + * one coarse-grained lock. + */ + typedef struct LOCALPREDICATELOCK + { + /* hash key */ + PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ + + /* data */ + bool held; /* is lock held, or just its children? */ + int childLocks; /* number of child locks currently held */ + } LOCALPREDICATELOCK; + static HTAB *LocalPredicateLockHash = NULL; + + + /* + * Test the most selective fields first, for performance. 
+ * + * a is covered by b if all of the following hold: + * 1) a.database = b.database + * 2) a.relation = b.relation + * 3) b.offset is invalid (b is page-granularity or higher) + * 4) either of the following: + * 4a) a.offset is valid (a is tuple-granularity) and a.page = b.page + * or 4b) a.offset is invalid and b.page is invalid (a is + * page-granularity and b is relation-granularity + */ + #define TargetTagIsCoveredBy(covered_target, covering_target) \ + ((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */ \ + GET_PREDICATELOCKTARGETTAG_RELATION(covering_target)) \ + && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) == \ + InvalidOffsetNumber) /* (3) */ \ + && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) != \ + InvalidOffsetNumber) /* (4a) */ \ + && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \ + GET_PREDICATELOCKTARGETTAG_PAGE(covered_target))) \ + || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \ + InvalidBlockNumber) /* (4b) */ \ + && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target) \ + != InvalidBlockNumber))) \ + && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) == /* (1) */ \ + GET_PREDICATELOCKTARGETTAG_DB(covering_target))) + + /* + * The predicate locking target and lock shared hash tables are partitioned to + * reduce contention. To determine which partition a given target belongs to, + * compute the tag's hash code with PredicateLockTargetTagHashCode(), then + * apply one of these macros. + * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2! 
+ */ + #define PredicateLockHashPartition(hashcode) \ + ((hashcode) % NUM_PREDICATELOCK_PARTITIONS) + #define PredicateLockHashPartitionLock(hashcode) \ + ((LWLockId) (FirstPredicateLockMgrLock + PredicateLockHashPartition(hashcode))) + + #define NPREDICATELOCKTARGETENTS() \ + mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts)) + + #define SxactIsOnFinishedList(sxact) (!SHMQueueIsDetached(&((sxact)->finishedLink))) + + #define SxactIsCommitted(sxact) TransactionIdIsValid((sxact)->finishedBefore) + #define SxactCommittedBefore(sxactPivotOut, sxactOther) \ + ((!TransactionIdIsValid((sxactOther)->finishedBefore)) \ + || TransactionIdPrecedesOrEquals((sxactPivotOut)->finishedBefore, \ + (sxactOther)->finishedBefore)) + + /* + * When a public interface method is called for a split on an index relation, + * this is the test to see if we should do a quick return. + */ + #define SkipSplitTracking(relation) \ + (((relation)->rd_id < FirstBootstrapObjectId) \ + || ((relation)->rd_istemp)) + + /* + * When a public interface method is called for serializing a relation within + * the current transaction, this is the test to see if we should do a quick + * return. + */ + #define SkipSerialization(relation) \ + ((!IsXactIsoLevelFullySerializable) \ + || SkipSplitTracking(relation)) + + + /* + * Compute the hash code associated with a PREDICATELOCKTARGETTAG. + * + * To avoid unnecessary recomputations of the hash code, we try to do this + * just once per function, and then pass it around as needed. Aside from + * passing the hashcode to hash_search_with_hash_value(), we can extract + * the lock partition number from the hashcode. + */ + #define PredicateLockTargetTagHashCode(predicatelocktargettag) \ + (tag_hash((predicatelocktargettag), sizeof(PREDICATELOCKTARGETTAG))) + + /* + * Given a predicate lock tag, and the hash for its target, + * compute the lock hash. 
+ * + * To make the hash code also depend on the transaction, we xor the sxid + * struct's address into the hash code, left-shifted so that the + * partition-number bits don't change. Since this is only a hash, we + * don't care if we lose high-order bits of the address; use an + * intermediate variable to suppress cast-pointer-to-int warnings. + */ + #define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \ + ((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) \ + << LOG2_NUM_PREDICATELOCK_PARTITIONS) + + + /* This configuration variable is used to set the predicate lock table size */ + int max_predicate_locks_per_xact; /* set by guc.c */ + + /* + * These global variables are maintained when registering and cleaning up + * serializable transactions. They must be global across all backends, but + * are not needed outside this source file, so no .h declaration is needed. + */ + TransactionId SerializableGlobalXmin = InvalidTransactionId; + int SerializableGlobalXminCount = 0; + + /* + * The predicate locking hash tables are in shared memory. + * Each backend keeps pointers to them. + */ + static HTAB *SerializableXactHash; + static HTAB *SerializableXidHash; + static HTAB *PredicateLockTargetHash; + static HTAB *PredicateLockHash; + static SHM_QUEUE *FinishedSerializableTransactions; + + /* + * Keep a pointer to the currently-running serializable transaction (if any) + * for quick reference. + */ + typedef SERIALIZABLEXACT *SERIALIZABLEXACTPtr; + + #define InvalidSerializableXact ((SERIALIZABLEXACTPtr) NULL) + static volatile SERIALIZABLEXACT *MySerializableXact = InvalidSerializableXact; + + /* TODO SSI: Remove volatile qualifier and the then-unnecessary casts? */ + + /* The most recently used xid within this transaction, for optimizations. 
*/ + static TransactionId MyXid = InvalidTransactionId; + + + /* local functions */ + static uint32 predicatelock_hash(const void *key, Size keysize); + static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact); + static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *newtargettag); + static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag); + static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *targettag); + static int PredicateLockPromotionThreshold(const PREDICATELOCKTARGETTAG *tag); + static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag, + PREDICATELOCKTARGETTAG *parent); + static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag); + static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *tag); + static void EnsureMySerializableXidExists(void); + static void ClearOldPredicateLocks(void); + static bool XidIsConcurrent(TransactionId xid); + static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer); + static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag); + static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, + const SERIALIZABLEXACT *writer); + + /* + * InitPredicateLocks -- Initialize the predicate locking data structures. + * + * This is called from CreateSharedMemoryAndSemaphores(), which see for + * more comments. In the normal postmaster case, the shared hash tables + * are created here. Backends inherit the pointers + * to the shared tables via fork(). In the EXEC_BACKEND case, each + * backend re-executes this code to obtain pointers to the already existing + * shared hash tables. + */ + void + InitPredicateLocks(void) + { + HASHCTL info; + int hash_flags; + long init_table_size, + max_table_size; + bool found; + + /* + * Compute init/max size to request for predicate lock target hashtable. + * Note these calculations must agree with PredicateLockShmemSize! 
+ */ + max_table_size = NPREDICATELOCKTARGETENTS(); + init_table_size = max_table_size / 2; + + /* + * Allocate hash table for PREDICATELOCKTARGET structs. This stores + * per-predicate-lock-target information. + */ + MemSet(&info, 0, sizeof(info)); + info.keysize = sizeof(PREDICATELOCKTARGETTAG); + info.entrysize = sizeof(PREDICATELOCKTARGET); + info.hash = tag_hash; + info.num_partitions = NUM_PREDICATELOCK_PARTITIONS; + hash_flags = (HASH_ELEM | HASH_FUNCTION | HASH_PARTITION); + + PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash", + init_table_size, + max_table_size, + &info, + hash_flags); + + /* Assume an average of 2 xacts per target */ + max_table_size *= 2; + init_table_size *= 2; + + /* + * Allocate hash table for PREDICATELOCK structs. This stores per + * xact-lock-of-a-target information. + */ + MemSet(&info, 0, sizeof(info)); + info.keysize = sizeof(PREDICATELOCKTAG); + info.entrysize = sizeof(PREDICATELOCK); + info.hash = predicatelock_hash; + info.num_partitions = NUM_PREDICATELOCK_PARTITIONS; + hash_flags = (HASH_ELEM | HASH_FUNCTION | HASH_PARTITION); + + PredicateLockHash = ShmemInitHash("PREDICATELOCK hash", + init_table_size, + max_table_size, + &info, + hash_flags); + + /* + * Compute init/max size to request for serializable transaction + * hashtable. Note these calculations must agree with + * PredicateLockShmemSize! + */ + max_table_size = MaxBackends; + init_table_size = max_table_size / 2; + + /* + * Allocate hash table for SERIALIZABLEXACT structs. This stores per-vxid + * information for serializable transactions which have accessed data. + */ + MemSet(&info, 0, sizeof(info)); + info.keysize = sizeof(SERIALIZABLEXACTTAG); + info.entrysize = sizeof(SERIALIZABLEXACT); + info.hash = tag_hash; + hash_flags = (HASH_ELEM | HASH_FUNCTION); + + SerializableXactHash = ShmemInitHash("SERIALIZABLEXACT hash", + init_table_size, + max_table_size, + &info, + hash_flags); + + /* Assume an average of 10 serializable xids per backend. 
*/ + max_table_size *= 10; + init_table_size *= 10; + + /* + * Allocate hash table for SERIALIZABLEXID structs. This stores per-xid + * information for serializable transactions which have accessed data. + */ + MemSet(&info, 0, sizeof(info)); + info.keysize = sizeof(SERIALIZABLEXIDTAG); + info.entrysize = sizeof(SERIALIZABLEXID); + info.hash = tag_hash; + hash_flags = (HASH_ELEM | HASH_FUNCTION); + + SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash", + init_table_size, + max_table_size, + &info, + hash_flags); + + /* + * Create or attach to the header for the list of finished serializable + * transactions. + */ + FinishedSerializableTransactions = (SHM_QUEUE *) + ShmemInitStruct("FinishedSerializableTransactions", + sizeof(SHM_QUEUE), + &found); + if (!found) + SHMQueueInit(FinishedSerializableTransactions); + } + + /* + * Estimate shared-memory space used for predicate lock table + */ + Size + PredicateLockShmemSize(void) + { + Size size = 0; + long max_table_size; + + /* predicate lock target hash table */ + max_table_size = NPREDICATELOCKTARGETENTS(); + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(PREDICATELOCKTARGET))); + + /* predicate lock hash table */ + max_table_size *= 2; + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(PREDICATELOCK))); + + /* + * Since NPREDICATELOCKTARGETENTS is only an estimate, add 10% safety + * margin. + */ + size = add_size(size, size / 10); + + /* serializable transaction table */ + max_table_size = MaxBackends; + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(SERIALIZABLEXACT))); + + /* serializable subtransaction table */ + max_table_size *= 10; + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(SERIALIZABLEXID))); + + /* Head for list of serializable transactions. */ + size = add_size(size, sizeof(SHM_QUEUE)); + + return size; + } + + + /* + * Compute the hash code associated with a PREDICATELOCKTAG. 
+ * + * Because we want to use just one set of partition locks for both the + * PREDICATELOCKTARGET and PREDICATELOCK hash tables, we have to make sure + * that PREDICATELOCKs fall into the same partition number as their + * associated PREDICATELOCKTARGETs. dynahash.c expects the partition number + * to be the low-order bits of the hash code, and therefore a + * PREDICATELOCKTAG's hash code must have the same low-order bits as the + * associated PREDICATELOCKTARGETTAG's hash code. We achieve this with this + * specialized hash function. + */ + static uint32 + predicatelock_hash(const void *key, Size keysize) + { + const PREDICATELOCKTAG *predicatelocktag = (const PREDICATELOCKTAG *) key; + uint32 targethash; + + Assert(keysize == sizeof(PREDICATELOCKTAG)); + + /* Look into the associated target object, and compute its hash code */ + targethash = PredicateLockTargetTagHashCode(&predicatelocktag->myTarget->tag); + + return PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash); + } + + + /* + * GetPredicateLockStatusData + * Return a table containing the internal state of the predicate + * lock manager for use in pg_lock_status. + * + * Like GetLockStatusData, this function tries to hold the partition LWLocks + * for as short a time as possible by returning two arrays that simply + * contain the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT for each lock + * table entry. Multiple copies of the same PREDICATELOCKTARGETTAG and + * SERIALIZABLEXACT will likely appear. + */ + PredicateLockData * + GetPredicateLockStatusData(void) + { + PredicateLockData *data; + int i; + int els, + el; + HASH_SEQ_STATUS seqstat; + PREDICATELOCK *predlock; + + data = (PredicateLockData *) palloc(sizeof(PredicateLockData)); + + /* + * Acquire locks. To ensure consistency, take simultaneous locks on + * SerializableFinishedListLock, all partition locks in ascending order, + * then SerializableXactHashLock. TODO SSI: Do we really need to lock + * SerializableFinishedListLock? 
+ */ + LWLockAcquire(SerializableFinishedListLock, LW_SHARED); + for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++) + LWLockAcquire(FirstPredicateLockMgrLock + i, LW_SHARED); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + + /* Get number of locks and allocate appropriately-sized arrays. */ + els = hash_get_num_entries(PredicateLockHash); + data->nelements = els; + data->locktags = (PREDICATELOCKTARGETTAG *) + palloc(sizeof(PREDICATELOCKTARGETTAG) * els); + data->xacts = (SERIALIZABLEXACT *) + palloc(sizeof(SERIALIZABLEXACT) * els); + + + /* Scan through PredicateLockHash and copy contents */ + hash_seq_init(&seqstat, PredicateLockHash); + + el = 0; + + while ((predlock = (PREDICATELOCK *) hash_seq_search(&seqstat))) + { + data->locktags[el] = predlock->tag.myTarget->tag; + data->xacts[el] = *predlock->tag.myXact; + el++; + } + + Assert(el == els); + + /* Release locks in reverse order */ + LWLockRelease(SerializableXactHashLock); + for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--) + LWLockRelease(FirstPredicateLockMgrLock + i); + LWLockRelease(SerializableFinishedListLock); + + return data; + } + + + /* + * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact. + * It should be current for this process and be contained in + * SerializableXactHash. + */ + void + RegisterSerializableTransaction(const Snapshot snapshot) + { + PGPROC *proc; + SERIALIZABLEXACTTAG sxacttag; + SERIALIZABLEXACT *sxact; + bool found; + HASHCTL hash_ctl; + + /* We only do this for serializable transactions. Once. 
*/
+ Assert(IsXactIsoLevelFullySerializable);
+ Assert(MySerializableXact == InvalidSerializableXact);
+ 
+ proc = MyProc;
+ Assert(proc != NULL);
+ GET_VXID_FROM_PGPROC(sxacttag.vxid, *proc);
+ 
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ if (!TransactionIdIsValid(SerializableGlobalXmin))
+ {
+ Assert(SerializableGlobalXminCount == 0);
+ SerializableGlobalXmin = snapshot->xmin;
+ SerializableGlobalXminCount = 1;
+ }
+ else if (SerializableGlobalXmin == snapshot->xmin)
+ {
+ Assert(SerializableGlobalXminCount > 0);
+ SerializableGlobalXminCount++;
+ }
+ else
+ {
+ Assert(TransactionIdFollows(snapshot->xmin, SerializableGlobalXmin));
+ }
+ sxact = (SERIALIZABLEXACT *) hash_search(SerializableXactHash,
+ &sxacttag,
+ HASH_ENTER, &found);
+ Assert(!found);
+ if (!sxact)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ 
+ /* Initialize the structure. */
+ sxact->outConflict = InvalidSerializableXact;
+ sxact->inConflict = InvalidSerializableXact;
+ sxact->topXid = GetTopTransactionIdIfAny();
+ sxact->finishedBefore = InvalidTransactionId;
+ sxact->xmin = snapshot->xmin;
+ SHMQueueInit(&(sxact->predicateLocks));
+ SHMQueueInit(&(sxact->xids));
+ SHMQueueElemInit(&(sxact->finishedLink));
+ sxact->rolledBack = false;
+ LWLockRelease(SerializableXactHashLock);
+ 
+ MySerializableXact = sxact;
+ 
+ /* Initialize the backend-local hash table of parent locks */
+ Assert(LocalPredicateLockHash == NULL);
+ MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG);
+ hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK);
+ hash_ctl.hash = tag_hash;
+ LocalPredicateLockHash = hash_create("Local predicate lock",
+ max_predicate_locks_per_xact,
+ &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION);
+ }
+ 
+ /*
+ * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact. 
+ * It should be current for this process and be contained in + * SerializableXidHash. + */ + static void + EnsureMySerializableXidExists(void) + { + TransactionId xid; + + Assert(MySerializableXact != InvalidSerializableXact); + + MySerializableXact->topXid = GetTopTransactionIdIfAny(); + + /* + * If this isn't the xid we've most recently seen for this vxid, make sure + * it's in the hash table. + */ + xid = GetCurrentTransactionIdIfAny(); + if (MyXid != xid) + { + SERIALIZABLEXIDTAG sxidtag; + SERIALIZABLEXID *sxid; + bool found; + + Assert(TransactionIdIsValid(xid)); + + sxidtag.xid = xid; + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash, + &sxidtag, + HASH_ENTER, &found); + if (!sxid) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_predicate_locks_per_transaction."))); + + /* Initialize the structure. */ + if (!found) + { + sxid->myXact = (SERIALIZABLEXACT *) MySerializableXact; + SHMQueueInsertBefore(&(((SERIALIZABLEXACT *) MySerializableXact)->xids), + &(sxid->xactLink)); + } + LWLockRelease(SerializableXactHashLock); + MyXid = xid; + } + } + + + /* + * Check whether a particular lock is held by this transaction. + */ + static bool + PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag) + { + LOCALPREDICATELOCK *lock; + + /* check local hash table */ + lock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash, + targettag, + HASH_FIND, NULL); + + if (!lock) + return false; + + /* + * Found entry in the table, but still need to check whether it's actually + * held -- it could just be a parent of some held lock. + */ + return lock->held; + } + + /* + * Return the parent lock tag in the lock hierarchy: the next coarser + * lock that covers the provided tag. + * + * Returns true and sets *parent to the parent tag if one exists, + * returns false if none exists. 
+ */ + static bool + GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag, + PREDICATELOCKTARGETTAG *parent) + { + switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag)) + { + case PREDLOCKTAG_RELATION: + /* relation locks have no parent lock */ + return false; + + case PREDLOCKTAG_PAGE: + /* parent lock is relation lock */ + SET_PREDICATELOCKTARGETTAG_RELATION(*parent, + GET_PREDICATELOCKTARGETTAG_DB(*tag), + GET_PREDICATELOCKTARGETTAG_RELATION(*tag)); + + return true; + + case PREDLOCKTAG_TUPLE: + /* parent lock is page lock */ + SET_PREDICATELOCKTARGETTAG_PAGE(*parent, + GET_PREDICATELOCKTARGETTAG_DB(*tag), + GET_PREDICATELOCKTARGETTAG_RELATION(*tag), + GET_PREDICATELOCKTARGETTAG_PAGE(*tag)); + return true; + } + + /* not reachable */ + Assert(false); + return false; + } + + /* + * Check whether the lock we are considering is already covered by a + * coarser lock for our transaction. + */ + static bool + CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag) + { + PREDICATELOCKTARGETTAG targettag, + parenttag; + + targettag = *newtargettag; + + /* check parents iteratively until no more */ + while (GetParentPredicateLockTag(&targettag, &parenttag)) + { + targettag = parenttag; + if (PredicateLockExists(&targettag)) + return true; + } + + /* no more parents to check; lock is not covered */ + return false; + } + + + /* + * Delete child target locks owned by this process. + * This implementation is assuming that the usage of each target tag field + * is uniform. No need to make this hard if we don't have to. + * + * We aren't acquiring lightweight locks for the predicate lock or lock + * target structures associated with this transaction unless we're going + * to modify them, because no other process is permitted to modify our + * locks. 
+ */ + static void + DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag) + { + SERIALIZABLEXACT *sxact; + PREDICATELOCK *predlock; + + LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED); + sxact = (SERIALIZABLEXACT *) MySerializableXact; + predlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + &(sxact->predicateLocks), + offsetof(PREDICATELOCK, xactLink)); + while (predlock) + { + SHM_QUEUE *predlocksxactlink; + PREDICATELOCK *nextpredlock; + PREDICATELOCKTAG oldlocktag; + PREDICATELOCKTARGET *oldtarget; + PREDICATELOCKTARGETTAG oldtargettag; + + predlocksxactlink = &(predlock->xactLink); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + predlocksxactlink, + offsetof(PREDICATELOCK, xactLink)); + + oldlocktag = predlock->tag; + Assert(oldlocktag.myXact == sxact); + oldtarget = oldlocktag.myTarget; + oldtargettag = oldtarget->tag; + + if (TargetTagIsCoveredBy(oldtargettag, *newtargettag)) + { + uint32 oldtargettaghash; + LWLockId partitionLock; + PREDICATELOCK *rmpredlock; + PREDICATELOCKTARGET *rmtarget; + + oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag); + partitionLock = PredicateLockHashPartitionLock(oldtargettaghash); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + SHMQueueDelete(predlocksxactlink); + SHMQueueDelete(&(predlock->targetLink)); + rmpredlock = hash_search_with_hash_value + (PredicateLockHash, + &oldlocktag, + PredicateLockHashCodeFromTargetHashCode(&oldlocktag, + oldtargettaghash), + HASH_REMOVE, NULL); + Assert(rmpredlock == predlock); + + if (SHMQueueEmpty(&oldtarget->predicateLocks)) + { + rmtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &oldtargettag, + oldtargettaghash, + HASH_REMOVE, NULL); + Assert(rmtarget == oldtarget); + } + + LWLockRelease(partitionLock); + + DecrementParentLocks(&oldtargettag); + } + + predlock = nextpredlock; + } + LWLockRelease(SerializablePredicateLockListLock); + } + + /* + * Returns the promotion threshold for 
a given predicate lock + * target. This is the number of descendant locks required to promote + * to the specified tag. Note that the threshold includes non-direct + * descendants, e.g. both tuples and pages for a relation lock. + * + * TODO SSI: We should do something more intelligent about what the + * thresholds are, either making it proportional to the number of + * tuples in a page & pages in a relation, or at least making it a + * GUC. Currently the threshold is 3 for a page lock, and + * max_predicate_locks_per_transaction/2 for a relation lock, chosen + * entirely arbitrarily (and without benchmarking). + */ + static int + PredicateLockPromotionThreshold(const PREDICATELOCKTARGETTAG *tag) + { + switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag)) + { + case PREDLOCKTAG_RELATION: + return max_predicate_locks_per_xact / 2; + + case PREDLOCKTAG_PAGE: + return 3; + + case PREDLOCKTAG_TUPLE: + + /* + * not reachable: nothing is finer-granularity than a tuple, so we + * should never try to promote to it. + */ + Assert(false); + return 0; + } + + /* not reachable */ + Assert(false); + return 0; + } + + /* + * For all ancestors of a newly-acquired predicate lock, increment + * their child count in the parent hash table. If any of them have + * more descendants than their promotion threshold, acquire the + * coarsest such lock. + * + * Returns true if a parent lock was acquired and false otherwise. 
+ */ + static bool + CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag) + { + PREDICATELOCKTARGETTAG targettag, + nexttag, + promotiontag; + LOCALPREDICATELOCK *parentlock; + bool found, + promote; + + promote = false; + + targettag = *reqtag; + + /* check parents iteratively */ + while (GetParentPredicateLockTag(&targettag, &nexttag)) + { + targettag = nexttag; + parentlock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash, + &targettag, + HASH_ENTER, + &found); + if (!found) + { + parentlock->held = false; + parentlock->childLocks = 1; + } + else + parentlock->childLocks++; + + if (parentlock->childLocks >= + PredicateLockPromotionThreshold(&targettag)) + { + /* + * We should promote to this parent lock. Continue to check its + * ancestors, however, both to get their child counts right and to + * check whether we should just go ahead and promote to one of + * them. + */ + promotiontag = targettag; + promote = true; + } + } + + if (promote) + { + /* acquire coarsest ancestor eligible for promotion */ + PredicateLockAcquire(&promotiontag); + return true; + } + else + return false; + } + + /* + * When releasing a lock, decrement the child count on all ancestor + * locks. + * + * This is called only when releasing a lock via + * DeleteChildTargetLocks (i.e. when a lock becomes redundant because + * we've acquired its parent, possibly due to promotion) or when a new + * MVCC write lock makes the predicate lock unnecessary. There's no + * point in calling it when locks are released at transaction end, as + * this information is no longer needed. 
+ */ + static void + DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag) + { + PREDICATELOCKTARGETTAG parenttag, + nexttag; + + parenttag = *targettag; + + while (GetParentPredicateLockTag(&parenttag, &nexttag)) + { + uint32 targettaghash; + LOCALPREDICATELOCK *parentlock, + *rmlock; + + parenttag = nexttag; + targettaghash = PredicateLockTargetTagHashCode(&parenttag); + parentlock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + &parenttag, targettaghash, + HASH_FIND, NULL); + Assert(parentlock != NULL); + parentlock->childLocks--; + + Assert(parentlock->childLocks >= 0); + + if ((parentlock->childLocks == 0) && (!parentlock->held)) + { + rmlock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + &parenttag, targettaghash, + HASH_REMOVE, NULL); + Assert(rmlock == parentlock); + } + } + } + + /* + * Acquire a predicate lock on the specified target for the current + * connection if not already held. Create related serializable transaction + * and predicate lock target entries first if missing. + */ + static void + PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag) + { + uint32 targettaghash; + LWLockId partitionLock; + bool found; + PREDICATELOCKTARGET *target; + PREDICATELOCKTAG locktag; + PREDICATELOCK *lock; + LOCALPREDICATELOCK *locallock; + + EnsureMySerializableXidExists(); + + /* Do we have the lock already, or a covering lock? */ + if (PredicateLockExists(targettag)) + return; + + if (CoarserLockCovers(targettag)) + return; + + /* the same hash and LW lock apply to the lock target and the local lock. 
*/ + targettaghash = PredicateLockTargetTagHashCode(targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + + /* Acquire lock in local table */ + locallock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + targettag, targettaghash, + HASH_ENTER, &found); + /* We should not hold the lock (but its entry might still exist) */ + Assert(!found || !locallock->held); + locallock->held = true; + if (!found) + locallock->childLocks = 0; + + LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* Make sure that the target is represented. */ + target = (PREDICATELOCKTARGET *) + hash_search_with_hash_value(PredicateLockTargetHash, + targettag, targettaghash, + HASH_ENTER, &found); + if (!target) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_predicate_locks_per_transaction."))); + if (!found) + SHMQueueInit(&(target->predicateLocks)); + + /* We've got the sxact and target, make sure they're joined. */ + locktag.myTarget = target; + locktag.myXact = (SERIALIZABLEXACT *) MySerializableXact; + lock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, &locktag, + PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash), + HASH_ENTER, &found); + if (!lock) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_predicate_locks_per_transaction."))); + + if (!found) + { + SHMQueueInsertBefore(&(target->predicateLocks), &(lock->targetLink)); + SHMQueueInsertBefore((SHM_QUEUE *) &(MySerializableXact->predicateLocks), + &(lock->xactLink)); + } + + LWLockRelease(partitionLock); + LWLockRelease(SerializablePredicateLockListLock); + + /* + * Lock has been acquired. Check whether it should be promoted to a + * coarser granularity, or whether there are finer-granularity locks to + * clean up. 
+ */ + if (CheckAndPromotePredicateLockRequest(targettag)) + { + /* + * Lock request was promoted to a coarser-granularity lock, and that + * lock was acquired. It will delete this lock and any of its + * children, so we're done. + */ + } + else + { + /* Clean up any finer-granularity locks */ + if (GET_PREDICATELOCKTARGETTAG_TYPE(*targettag) != PREDLOCKTAG_TUPLE) + DeleteChildTargetLocks(targettag); + } + } + + + /* + * PredicateLockRelation + * + * Gets a predicate lock at the relation level. + * Skip if not in full serializable transaction isolation level. + * Skip if this is a temporary table. + * Clear any finer-grained predicate locks this session has on the relation. + */ + void + PredicateLockRelation(const Relation relation) + { + PREDICATELOCKTARGETTAG tag; + + if (SkipSerialization(relation)) + return; + + SET_PREDICATELOCKTARGETTAG_RELATION(tag, + relation->rd_node.dbNode, + relation->rd_id); + PredicateLockAcquire(&tag); + } + + /* + * PredicateLockPage + * + * Gets a predicate lock at the page level. + * Skip if not in full serializable transaction isolation level. + * Skip if this is a temporary table. + * Skip if a coarser predicate lock already covers this page. + * Clear any finer-grained predicate locks this session has on the relation. + */ + void + PredicateLockPage(const Relation relation, const BlockNumber blkno) + { + PREDICATELOCKTARGETTAG tag; + + if (SkipSerialization(relation)) + return; + + SET_PREDICATELOCKTARGETTAG_PAGE(tag, + relation->rd_node.dbNode, + relation->rd_id, + blkno); + PredicateLockAcquire(&tag); + } + + /* + * PredicateLockTuple + * + * Gets a predicate lock at the tuple level. + * Skip if not in full serializable transaction isolation level. + * Skip if this is a temporary table. + */ + void + PredicateLockTuple(const Relation relation, const HeapTuple tuple) + { + PREDICATELOCKTARGETTAG tag; + ItemPointer tid; + + if (SkipSerialization(relation)) + return; + + /* + * If it's a heap tuple, return if this xact wrote it. 
It might be useful
+ * to pass in the xmin from the tuple as another parameter.
+ */
+ if (relation->rd_index == NULL)
+ {
+ SERIALIZABLEXIDTAG sxidtag;
+ SERIALIZABLEXID *sxid;
+ 
+ sxidtag.xid = HeapTupleHeaderGetXmin(tuple->t_data);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ if (sxid)
+ {
+ if (sxid->myXact == MySerializableXact)
+ {
+ /* We wrote it; we already have a write lock. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ }
+ LWLockRelease(SerializableXactHashLock);
+ }
+ 
+ tid = &(tuple->t_self);
+ SET_PREDICATELOCKTARGETTAG_TUPLE(tag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ PredicateLockAcquire(&tag);
+ }
+ 
+ /*
+ * PredicateLockPageSplit
+ *
+ * Copies any predicate locks for the old page to the new page.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: A page split (or overflow) affects all serializable transactions,
+ * even if it occurs in the context of another transaction isolation level.
+ *
+ * NOTE: This currently leaves the local copy of the locks without
+ * information on the new lock which is in shared memory. This could cause
+ * problems if enough page splits occur on locked pages without the processes
+ * which hold the locks getting in and noticing. 
+ */ + void + PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, + const BlockNumber newblkno) + { + PREDICATELOCKTARGETTAG oldtargettag; + PREDICATELOCKTARGETTAG newtargettag; + uint32 oldtargettaghash; + LWLockId oldpartitionLock; + PREDICATELOCKTARGET *oldtarget; + uint32 newtargettaghash; + LWLockId newpartitionLock; + + if (SkipSplitTracking(relation)) + return; + + Assert(oldblkno != newblkno); + Assert(BlockNumberIsValid(oldblkno)); + Assert(BlockNumberIsValid(newblkno)); + + SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag, + relation->rd_node.dbNode, + relation->rd_id, + oldblkno); + SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag, + relation->rd_node.dbNode, + relation->rd_id, + newblkno); + + oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag); + newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag); + oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash); + newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash); + + LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE); + + /* + * We must get the partition locks in ascending sequence to avoid + * deadlocks. If old and new partitions are the same, we must request the + * lock only once. + */ + if (oldpartitionLock < newpartitionLock) + { + LWLockAcquire(oldpartitionLock, LW_SHARED); + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + } + else if (oldpartitionLock > newpartitionLock) + { + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + LWLockAcquire(oldpartitionLock, LW_SHARED); + } + else + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + + /* + * Look for the old target. If not found, that's OK; no predicate locks + * are affected, so we can just clean up and return. If it does exist, + * walk its list of predicate locks and create new ones for the new block + * number. 
+ */ + oldtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &oldtargettag, + oldtargettaghash, + HASH_FIND, NULL); + if (oldtarget) + { + PREDICATELOCKTARGET *newtarget; + bool found; + PREDICATELOCK *oldpredlock; + PREDICATELOCKTAG newpredlocktag; + + newtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &newtargettag, + newtargettaghash, + HASH_ENTER, &found); + Assert(!found); + if (!newtarget) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_predicate_locks_per_transaction."))); + SHMQueueInit(&(newtarget->predicateLocks)); + + newpredlocktag.myTarget = newtarget; + + oldpredlock = (PREDICATELOCK *) + SHMQueueNext(&(oldtarget->predicateLocks), + &(oldtarget->predicateLocks), + offsetof(PREDICATELOCK, targetLink)); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + while (oldpredlock) + { + SHM_QUEUE *predlocktargetlink; + PREDICATELOCK *nextpredlock; + PREDICATELOCK *newpredlock; + + predlocktargetlink = &(oldpredlock->targetLink); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(oldtarget->predicateLocks), + predlocktargetlink, + offsetof(PREDICATELOCK, targetLink)); + newpredlocktag.myXact = oldpredlock->tag.myXact; + + newpredlock = (PREDICATELOCK *) + hash_search_with_hash_value + (PredicateLockHash, + &newpredlocktag, + PredicateLockHashCodeFromTargetHashCode(&newpredlocktag, + newtargettaghash), + HASH_ENTER, &found); + if (!newpredlock) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_predicate_locks_per_transaction."))); + Assert(!found); + SHMQueueInsertBefore(&(newtarget->predicateLocks), + &(newpredlock->targetLink)); + SHMQueueInsertBefore(&(newpredlocktag.myXact->predicateLocks), + &(newpredlock->xactLink)); + + oldpredlock = nextpredlock; + } + LWLockRelease(SerializableXactHashLock); + } + + /* Release partition locks in reverse order of acquisition. 
*/ + if (oldpartitionLock < newpartitionLock) + { + LWLockRelease(newpartitionLock); + LWLockRelease(oldpartitionLock); + } + else if (oldpartitionLock > newpartitionLock) + { + LWLockRelease(oldpartitionLock); + LWLockRelease(newpartitionLock); + } + else + LWLockRelease(newpartitionLock); + LWLockRelease(SerializablePredicateLockListLock); + } + + /* + * PredicateLockPageCombine + * + * Combines predicate locks for two existing pages. + * Skip if this is a temporary table or toast table. + * + * NOTE: A page combine affects all serializable + * transactions, even if it occurs in the context of another + * transaction isolation level. + */ + void + PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno, + const BlockNumber newblkno) + { + PREDICATELOCKTARGETTAG oldtargettag; + PREDICATELOCKTARGETTAG newtargettag; + uint32 oldtargettaghash; + LWLockId oldpartitionLock; + PREDICATELOCKTARGET *oldtarget; + uint32 newtargettaghash; + LWLockId newpartitionLock; + + if (SkipSplitTracking(relation)) + return; + + Assert(oldblkno != newblkno); + Assert(BlockNumberIsValid(oldblkno)); + Assert(BlockNumberIsValid(newblkno)); + + SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag, + relation->rd_node.dbNode, + relation->rd_id, + oldblkno); + SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag, + relation->rd_node.dbNode, + relation->rd_id, + newblkno); + + oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag); + newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag); + oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash); + newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash); + + LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE); + + /* + * We must get the partition locks in ascending sequence to avoid + * deadlocks. If old and new partitions are the same, we must request the + * lock only once. 
+ */ + if (oldpartitionLock < newpartitionLock) + { + LWLockAcquire(oldpartitionLock, LW_EXCLUSIVE); + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + } + else if (oldpartitionLock > newpartitionLock) + { + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + LWLockAcquire(oldpartitionLock, LW_EXCLUSIVE); + } + else + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + + /* + * Look for the old target. If not found, that's OK; no predicate locks + * are affected, so we can just clean up and return. If it does exist, + * walk its list of predicate locks and create new ones for the new block + * number, while deleting the old ones. + */ + oldtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &oldtargettag, + oldtargettaghash, + HASH_FIND, NULL); + if (oldtarget) + { + PREDICATELOCKTARGET *newtarget; + PREDICATELOCK *oldpredlock; + PREDICATELOCKTAG newpredlocktag; + + newtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &newtargettag, + newtargettaghash, + HASH_FIND, NULL); + Assert(newtarget); + + newpredlocktag.myTarget = newtarget; + + oldpredlock = (PREDICATELOCK *) + SHMQueueNext(&(oldtarget->predicateLocks), + &(oldtarget->predicateLocks), + offsetof(PREDICATELOCK, targetLink)); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + while (oldpredlock) + { + SHM_QUEUE *predlocktargetlink; + PREDICATELOCK *nextpredlock; + PREDICATELOCK *newpredlock; + bool found; + + predlocktargetlink = &(oldpredlock->targetLink); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(oldtarget->predicateLocks), + predlocktargetlink, + offsetof(PREDICATELOCK, targetLink)); + newpredlocktag.myXact = oldpredlock->tag.myXact; + + hash_search_with_hash_value + (PredicateLockHash, + &oldpredlock->tag, + PredicateLockHashCodeFromTargetHashCode(&oldpredlock->tag, + oldtargettaghash), + HASH_REMOVE, NULL); + + newpredlock = (PREDICATELOCK *) + hash_search_with_hash_value + (PredicateLockHash, + &newpredlocktag, + 
PredicateLockHashCodeFromTargetHashCode(&newpredlocktag, + newtargettaghash), + HASH_ENTER, &found); + if (!newpredlock) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_predicate_locks_per_transaction."))); + if (!found) + { + SHMQueueInsertBefore(&(newtarget->predicateLocks), + &(newpredlock->targetLink)); + SHMQueueInsertBefore((SHM_QUEUE *) &(newpredlocktag.myXact->predicateLocks), + &(newpredlock->xactLink)); + } + + oldpredlock = nextpredlock; + } + LWLockRelease(SerializableXactHashLock); + Assert(SHMQueueIsDetached(&oldtarget->predicateLocks)); + hash_search_with_hash_value(PredicateLockTargetHash, + &oldtargettag, + oldtargettaghash, + HASH_REMOVE, NULL); + } + + /* Release partition locks in reverse order of acquisition. */ + if (oldpartitionLock < newpartitionLock) + { + LWLockRelease(newpartitionLock); + LWLockRelease(oldpartitionLock); + } + else if (oldpartitionLock > newpartitionLock) + { + LWLockRelease(oldpartitionLock); + LWLockRelease(newpartitionLock); + } + else + LWLockRelease(newpartitionLock); + } + + /* + * Walk the hash table and find the new xmin. + */ + static void + SetNewSerializableGlobalXmin(void) + { + HASH_SEQ_STATUS seqstat; + SERIALIZABLEXACT *sxact; + + SerializableGlobalXmin = InvalidTransactionId; + SerializableGlobalXminCount = 0; + hash_seq_init(&seqstat, SerializableXactHash); + while ((sxact = (SERIALIZABLEXACT *) hash_seq_search(&seqstat))) + { + if (!SxactIsOnFinishedList(sxact)) + { + if (!TransactionIdIsValid(SerializableGlobalXmin) + || TransactionIdPrecedes(sxact->xmin, SerializableGlobalXmin)) + { + SerializableGlobalXmin = sxact->xmin; + SerializableGlobalXminCount = 1; + } + else if (sxact->xmin == SerializableGlobalXmin) + SerializableGlobalXminCount++; + } + } + } + + /* + * ReleasePredicateLocks + * + * Releases predicate locks based on completion of the current + * transaction, whether committed or rolled back. 
+ * + * We do nothing unless this is a serializable transaction. + * + * For a rollback, the current transaction's predicate locks could be + * immediately released; however, we may still have conflict pointers to + * our transaction which could be expensive to find and eliminate right + * now, so we flag it as rolled back so that it will be ignored, and let + * cleanup happen later. + * + * This method must ensure that shared memory hash tables are cleaned + * up in some relatively timely fashion. + * + * If this transaction is committing and is holding any predicate locks, + * it must be added to a list of completed serializable transactions still + * holding locks. + */ + void + ReleasePredicateLocks(const bool isCommit) + { + bool needToClear; + + if (MySerializableXact == InvalidSerializableXact) + { + Assert(LocalPredicateLockHash == NULL); + return; + } + + Assert(IsXactIsoLevelFullySerializable); + + /* We'd better not already be on the cleanup list. */ + Assert(!SxactIsOnFinishedList((SERIALIZABLEXACT *) MySerializableXact)); + + /* + * If it's not a commit it's a rollback, and we can clear our locks + * immediately. TODO SSI: Clear the locks, but leave the sxact record. + */ + if (!isCommit) + MySerializableXact->rolledBack = true; + + /* + * Add this to the list of transactions to check for later cleanup. First + * turn pointers to already-terminated transactions to self-references. 
+ */ + if (MySerializableXact->inConflict != InvalidSerializableXact) + { + if (MySerializableXact->inConflict->rolledBack) + MySerializableXact->inConflict = InvalidSerializableXact; + else if (SxactIsCommitted(MySerializableXact->inConflict)) + MySerializableXact->inConflict = (SERIALIZABLEXACT *) MySerializableXact; + } + if (MySerializableXact->outConflict != InvalidSerializableXact) + { + if (MySerializableXact->outConflict->rolledBack) + MySerializableXact->outConflict = InvalidSerializableXact; + else if (SxactIsCommitted(MySerializableXact->outConflict)) + MySerializableXact->outConflict = (SERIALIZABLEXACT *) MySerializableXact; + } + + /* Add this to the list of transactions to check for later cleanup. */ + LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE); + SHMQueueInsertBefore(FinishedSerializableTransactions, + (SHM_QUEUE *) &(MySerializableXact->finishedLink)); + LWLockRelease(SerializableFinishedListLock); + + /* + * Check whether it's time to clean up old transactions. This can only be + * done when the last serializable transaction with the oldest xmin among + * serializable transactions completes. We then find the "new oldest" + * xmin and purge any transactions which finished before this transaction + * was launched. 
+ */ + needToClear = false; + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + if (TransactionIdPrecedes(SerializableGlobalXmin, RecentGlobalXmin)) + { + SetNewSerializableGlobalXmin(); + needToClear = true; + } + else if (MySerializableXact->xmin == SerializableGlobalXmin) + { + Assert(SerializableGlobalXminCount > 0); + if (--SerializableGlobalXminCount == 0) + { + SetNewSerializableGlobalXmin(); + needToClear = true; + } + } + LWLockRelease(SerializableXactHashLock); + + if (needToClear) + ClearOldPredicateLocks(); + + MySerializableXact = InvalidSerializableXact; + MyXid = InvalidTransactionId; + + /* Delete per-transaction lock table */ + hash_destroy(LocalPredicateLockHash); + LocalPredicateLockHash = NULL; + } + + /* + * Clear old predicate locks. + */ + static void + ClearOldPredicateLocks(void) + { + SERIALIZABLEXACT *finishedSxact; + + if (!LWLockConditionalAcquire(SerializableFinishedListLock, LW_EXCLUSIVE)) + return; + + finishedSxact = (SERIALIZABLEXACT *) + SHMQueueNext(FinishedSerializableTransactions, + FinishedSerializableTransactions, + offsetof(SERIALIZABLEXACT, finishedLink)); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + while (finishedSxact) + { + SERIALIZABLEXACT *nextSxact; + + nextSxact = (SERIALIZABLEXACT *) + SHMQueueNext(FinishedSerializableTransactions, + &(finishedSxact->finishedLink), + offsetof(SERIALIZABLEXACT, finishedLink)); + if (!TransactionIdIsValid(SerializableGlobalXmin) + || TransactionIdPrecedesOrEquals(finishedSxact->finishedBefore, + SerializableGlobalXmin)) + { + LWLockRelease(SerializableXactHashLock); + ReleaseOneSerializableXact(finishedSxact); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + finishedSxact = nextSxact; + } + LWLockRelease(SerializableXactHashLock); + LWLockRelease(SerializableFinishedListLock); + } + + /* + * This is the normal way to delete anything from any of the predicate + * locking hash tables. 
Given a transaction which we know can be deleted, + * delete all predicate locks held by that transaction, and any predicate + * lock targets which are now unreferenced by a lock; delete all xid values + * for the transaction; then delete the transaction. + */ + static void + ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact) + { + PREDICATELOCK *predlock; + SERIALIZABLEXID *sxid; + + Assert(sxact != NULL); + Assert(sxact->rolledBack || SxactIsCommitted(sxact)); + Assert(SxactIsOnFinishedList(sxact)); + + LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED); + predlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + &(sxact->predicateLocks), + offsetof(PREDICATELOCK, xactLink)); + while (predlock) + { + PREDICATELOCK *nextpredlock; + PREDICATELOCKTAG tag; + SHM_QUEUE *targetLink; + PREDICATELOCKTARGET *target; + PREDICATELOCKTARGETTAG targettag; + uint32 targettaghash; + LWLockId partitionLock; + + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + &(predlock->xactLink), + offsetof(PREDICATELOCK, xactLink)); + + tag = predlock->tag; + targetLink = &(predlock->targetLink); + target = tag.myTarget; + targettag = target->tag; + targettaghash = PredicateLockTargetTagHashCode(&targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + SHMQueueDelete(targetLink); + + /* + * No need to do retail removal from transaction object; it's going + * away. + */ + hash_search_with_hash_value(PredicateLockHash, &tag, + PredicateLockHashCodeFromTargetHashCode(&tag, + targettaghash), + HASH_REMOVE, NULL); + if (SHMQueueEmpty(&target->predicateLocks)) + hash_search_with_hash_value(PredicateLockTargetHash, + &targettag, targettaghash, HASH_REMOVE, NULL); + LWLockRelease(partitionLock); + predlock = nextpredlock; + } + LWLockRelease(SerializablePredicateLockListLock); + + /* Get rid of the xids and the record of the transaction itself. 
*/ + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + sxid = (SERIALIZABLEXID *) + SHMQueueNext(&(sxact->xids), + &(sxact->xids), + offsetof(SERIALIZABLEXID, xactLink)); + while (sxid) + { + SERIALIZABLEXID *nextsxid; + SERIALIZABLEXIDTAG tag; + + nextsxid = (SERIALIZABLEXID *) + SHMQueueNext(&(sxact->xids), + &(sxid->xactLink), + offsetof(SERIALIZABLEXID, xactLink)); + tag = sxid->tag; + hash_search(SerializableXidHash, &tag, HASH_REMOVE, NULL); + + /* + * No need to do retail removal from transaction object; it's going + * away. + */ + sxid = nextsxid; + } + SHMQueueDelete(&(sxact->finishedLink)); + hash_search(SerializableXactHash, &(sxact->tag), HASH_REMOVE, NULL); + LWLockRelease(SerializableXactHashLock); + } + + /* + * Tests whether the given transaction is concurrent with (overlaps) + * our current transaction. + */ + static bool + XidIsConcurrent(TransactionId xid) + { + Snapshot snap; + uint32 i; + + Assert(TransactionIdIsValid(xid)); + + /* + * We don't count our own transaction or its subtransactions as + * "concurrent". + */ + if (xid == GetTopTransactionIdIfAny()) + return false; + + snap = GetTransactionSnapshot(); + + if (TransactionIdPrecedes(xid, snap->xmin)) + return false; + + if (TransactionIdFollowsOrEquals(xid, snap->xmax)) + return true; + + for (i = 0; i < snap->xcnt; i++) + { + if (xid == snap->xip[i]) + return true; + } + + return false; + } + + /* + * CheckForSerializableConflictOut + * We are reading a tuple which has been modified. If it is visible to + * us but has been deleted, that indicates a rw-conflict out. If it's + * not visible and was created by a concurrent (overlapping) + * serializable transaction, that is also a rw-conflict out, + * + * The heap tables which we maintain for predicate locking will also be used + * to determine that the xmin from a row is related to a serializable + * transaction, and will provide a mapping to the top level transaction. 
+ * + * This function should be called just about anywhere in heapam.c that a + * tuple has been read. + */ + void + CheckForSerializableConflictOut(const bool valid, const Relation relation, + const HeapTuple tuple, const Buffer buffer) + { + TransactionId xid; + SERIALIZABLEXIDTAG sxidtag; + SERIALIZABLEXID *sxid; + SERIALIZABLEXACTTAG sxacttag; + SERIALIZABLEXACT *sxact; + + if (SkipSerialization(relation)) + return; + + if (valid) + { + /*---------------------------------------------------------------- + * TODO SSI: Figure out why the ItemPointerIsValid test is needed. + * We are sometimes failing with ip_posid == 0 in corner + * cases, like the following. Is this some underlying bug? + * If not, is this the best way to handle this? + * + * -- setup + * drop table ctl, receipt; + * create table ctl (k text not null primary key, deposit_date date not null); + * insert into ctl values ('receipt', date '2008-12-22'); + * create table receipt (receipt_no int not null primary key, deposit_date date not null, amount numeric(13,2)); + * insert into receipt values (1, (select deposit_date from ctl where k = 'receipt'), 1.00); + * insert into receipt values (2, (select deposit_date from ctl where k = 'receipt'), 2.00); + * + * -- connection 1 + * start transaction isolation level serializable ; + * insert into receipt values (3, (select deposit_date from ctl where k = 'receipt'), 4.00); + * + * -- connection 2 + * start transaction isolation level serializable ; + * update ctl set deposit_date = date '2008-12-23' where k = 'receipt'; + * + * -- connection 3 + * start transaction isolation level serializable ; + * select * from ctl; + * + * -- connection 2 + * rollback; + * + * -- connection 3 + * select * from re[nothing shows]ceipt; + * > no connection to the server + * > The connection to the server was lost. Attempting reset: Succeeded. 
+ *---------------------------------------------------------------- + */ + /* If there's a new tuple to key on, return to avoid duplicate work. */ + if (ItemPointerIsValid(&(tuple->t_data->t_ctid)) + && !ItemPointerEquals(&(tuple->t_self), &(tuple->t_data->t_ctid))) + return; + + /* + * We may bail out if previous xmax aborted, or if it committed but + * only locked the tuple without updating it. + */ + if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) + return; + + /* + * If there's a valid xmax, it must be from a concurrent transaction, + * since it deleted a tuple which is visible to us. + */ + xid = HeapTupleHeaderGetXmax(tuple->t_data); + if (!TransactionIdIsValid(xid)) + return; + } + else + { + /* + * We would read this row, but it isn't visible to us. + */ + xid = HeapTupleHeaderGetXmin(tuple->t_data); + } + + /* + * It's OK to look for conflicts with a share lock, and record them with + * an exclusive lock when found; we just have to release the shared lock + * before attempting to get the other lock, to prevent deadlocks. We will + * need to recheck that the entry still exists after getting the stronger + * lock, just in case it rolled back in the window where we weren't + * holding a lock. + */ + sxidtag.xid = xid; + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + sxid = (SERIALIZABLEXID *) + hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL); + if (!sxid) + { + /* It's not serializable or otherwise not important. */ + LWLockRelease(SerializableXactHashLock); + return; + } + sxact = sxid->myXact; + if (sxact == MySerializableXact || sxact->rolledBack) + { + /* We can't conflict with our own transaction or one rolled back. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + /* + * If this is a read-only transaction and the writing transaction has + * committed, and it doesn't have a rw-conflict out or has a conflict out + * to a transaction which overlaps this transaction, then no conflict. 
+ */ + if (XactReadOnly + && SxactIsCommitted(sxact) + && (!TransactionIdIsValid(sxact->outConflict) + || (sxact != sxact->outConflict + && (!SxactIsCommitted(sxact->outConflict) + || XidIsConcurrent(sxact->outConflict->topXid))))) + { + /* Read-only transaction will appear to run first. No conflict. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + sxacttag = sxact->tag; + LWLockRelease(SerializableXactHashLock); + + /* + * Make sure we have somewhere to record a conflict against this + * transaction. + */ + EnsureMySerializableXidExists(); + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + sxact = (SERIALIZABLEXACT *) + hash_search(SerializableXactHash, &sxacttag, HASH_FIND, NULL); + if (!sxact) + { + /* It must have been cleaned up, which means it wasn't useful. */ + LWLockRelease(SerializableXactHashLock); + return; + } + xid = sxact->topXid; + if (!XidIsConcurrent(xid)) + { + /* This write was already in our snapshot; no conflict. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + /* + * Flag the conflict. But first, if this conflict creates a dangerous + * structure, ereport an error. + */ + FlagRWConflict((SERIALIZABLEXACT *) MySerializableXact, sxact); + LWLockRelease(SerializableXactHashLock); + } + + /* + * Check a particular target for rw-dependency conflict in. + */ + static void + CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag) + { + uint32 targettaghash; + LWLockId partitionLock; + PREDICATELOCKTARGET *target; + PREDICATELOCK *predlock; + + Assert(MySerializableXact != InvalidSerializableXact); + + /* The same hash and LW lock apply to the lock target and the lock itself. 
*/ + targettaghash = PredicateLockTargetTagHashCode(targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + LWLockAcquire(partitionLock, LW_SHARED); + target = (PREDICATELOCKTARGET *) + hash_search_with_hash_value(PredicateLockTargetHash, + targettag, targettaghash, + HASH_FIND, NULL); + if (!target) + { + /* Nothing has this target locked; we're done here. */ + LWLockRelease(partitionLock); + return; + } + + /* + * Each lock for an overlapping transaction represents a conflict: a + * rw-dependency in to this transaction. + */ + predlock = (PREDICATELOCK *) + SHMQueueNext(&(target->predicateLocks), + &(target->predicateLocks), + offsetof(PREDICATELOCK, targetLink)); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + while (predlock) + { + SHM_QUEUE *predlocktargetlink; + PREDICATELOCK *nextpredlock; + SERIALIZABLEXACT *sxact; + + predlocktargetlink = &(predlock->targetLink); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(target->predicateLocks), + predlocktargetlink, + offsetof(PREDICATELOCK, targetLink)); + + sxact = predlock->tag.myXact; + if (sxact == MySerializableXact) + { + /* + * If we're getting a write lock on the tuple, we don't need a + * predicate (SIREAD) lock. At this point our transaction already + * has an ExclusiveRowLock on the relation, so we are OK to drop + * the predicate lock on the tuple, if found, without fearing that + * another write against the tuple will occur before the MVCC + * information makes it to the buffer. + */ + if (GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag)) + { + uint32 predlockhashcode; + PREDICATELOCKTARGET *rmtarget = NULL; + PREDICATELOCK *rmpredlock; + LOCALPREDICATELOCK *locallock, + *rmlocallock; + + /* + * This is a tuple on which we have a tuple predicate lock. We + * only have shared LW locks now; release those, and get + * exclusive locks only while we modify things. 
+ */ + LWLockRelease(SerializableXactHashLock); + LWLockRelease(partitionLock); + LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Remove the predicate lock from shared memory, if it hasn't + * been concurrently removed by an index page combine. + */ + predlockhashcode = PredicateLockHashCodeFromTargetHashCode + (&(predlock->tag), targettaghash); + rmpredlock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, + &(predlock->tag), + predlockhashcode, + HASH_FIND, NULL); + if (rmpredlock == predlock) + { + SHMQueueDelete(predlocktargetlink); + SHMQueueDelete(&(predlock->xactLink)); + + rmpredlock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, + &(predlock->tag), + predlockhashcode, + HASH_REMOVE, NULL); + Assert(rmpredlock == predlock); + + /* + * When a target is no longer used, remove it. + */ + if (SHMQueueEmpty(&target->predicateLocks)) + { + rmtarget = (PREDICATELOCKTARGET *) + hash_search_with_hash_value(PredicateLockTargetHash, + targettag, + targettaghash, + HASH_REMOVE, NULL); + Assert(rmtarget == target); + } + + LWLockRelease(SerializableXactHashLock); + LWLockRelease(partitionLock); + LWLockRelease(SerializablePredicateLockListLock); + + locallock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + targettag, targettaghash, + HASH_FIND, NULL); + Assert(locallock != NULL); + Assert(locallock->held); + locallock->held = false; + + if (locallock->childLocks == 0) + { + rmlocallock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + targettag, targettaghash, + HASH_REMOVE, NULL); + Assert(rmlocallock == locallock); + } + + DecrementParentLocks(targettag); + + if (rmtarget) + return; + + LWLockAcquire(partitionLock, LW_SHARED); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(target->predicateLocks), + &(target->predicateLocks), + 
offsetof(PREDICATELOCK, targetLink)); + + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + else + { + LWLockAcquire(partitionLock, LW_SHARED); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + } + } + else if (!(sxact->rolledBack) + && (!SxactIsCommitted(sxact) + || TransactionIdPrecedes(GetTransactionSnapshot()->xmin, + sxact->finishedBefore)) + && sxact->outConflict != MySerializableXact + && MySerializableXact->inConflict != sxact) + { + LWLockRelease(SerializableXactHashLock); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + FlagRWConflict(sxact, (SERIALIZABLEXACT *) MySerializableXact); + + LWLockRelease(SerializableXactHashLock); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + + predlock = nextpredlock; + } + LWLockRelease(SerializableXactHashLock); + LWLockRelease(partitionLock); + } + + /* + * CheckForSerializableConflictIn + * We are writing the given tuple. If that indicates a rw-conflict + * in from another serializable transaction, take appropriate action. + * + * Skip checking for any granularity for which a parameter is missing. + * + * A tuple update or delete is in conflict if we have a predicate lock + * against the relation or page in which the tuple exists, or against the + * tuple itself. A tuple insert is in conflict only if there is a predicate + * lock against the entire relation. + * + * The call to this function also indicates that we need an entry in the + * serializable transaction hash table, so that this write's conflicts can + * be detected for the proper lifetime, which is until this transaction and + * all overlapping serializable transactions have completed. 
+ */ + void + CheckForSerializableConflictIn(const Relation relation, const HeapTuple tuple, + const Buffer buffer) + { + PREDICATELOCKTARGETTAG targettag; + + if (SkipSerialization(relation)) + return; + + EnsureMySerializableXidExists(); + + /* + * It is important that we check for locks from the finest granularity to + * the coarsest granularity, so that granularity promotion doesn't cause + * us to miss a lock. The new (coarser) lock will be acquired before the + * old (finer) locks are released. + * + * It is not possible to take and hold a lock across the checks for all + * granularities because each target could be in a separate partition. + */ + if (tuple != NULL) + { + SET_PREDICATELOCKTARGETTAG_TUPLE(targettag, + relation->rd_node.dbNode, + relation->rd_id, + ItemPointerGetBlockNumber(&(tuple->t_data->t_ctid)), + ItemPointerGetOffsetNumber(&(tuple->t_data->t_ctid))); + CheckTargetForConflictsIn(&targettag); + } + + if (BufferIsValid(buffer)) + { + SET_PREDICATELOCKTARGETTAG_PAGE(targettag, + relation->rd_node.dbNode, + relation->rd_id, + BufferGetBlockNumber(buffer)); + CheckTargetForConflictsIn(&targettag); + } + + SET_PREDICATELOCKTARGETTAG_RELATION(targettag, + relation->rd_node.dbNode, + relation->rd_id); + CheckTargetForConflictsIn(&targettag); + } + + /* + * Flag a rw-dependency between two serializable transactions. + * If a conflict field is invalid set it to the other transaction, + * if it's already the other transaction leave it alone, otherwise + * use self-reference (so we don't need to keep a list). + * + * The caller is responsible for ensuring that we have a LW lock on + * the transaction hash table. + */ + static void + FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer) + { + Assert(reader != writer); + + /* First, see if this conflict causes failure. */ + OnConflict_CheckForSerializationFailure(reader, writer); + + /* Actually do the conflict flagging. 
*/ + if (writer->inConflict == InvalidSerializableXact + || writer->inConflict->rolledBack) + writer->inConflict = reader; + else if (writer->inConflict != reader) + writer->inConflict = writer; + if (reader->outConflict == InvalidSerializableXact + || reader->outConflict->rolledBack) + reader->outConflict = writer; + else if (reader->outConflict != writer) + reader->outConflict = reader; + } + + /* + * Check whether we should roll back one of these transactions + * instead of flagging a conflict. + */ + static void + OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, + const SERIALIZABLEXACT *writer) + { + bool failure; + + Assert(LWLockHeldByMe(SerializableXactHashLock)); + + failure = false; + + if (writer->inConflict != reader + && writer->outConflict != InvalidSerializableXact + && !(writer->outConflict->rolledBack)) + { + /* The writer is or is becoming a pivot. */ + /* Self-reference prevents checking commit sequence. */ + if (writer->outConflict == writer + + /* + * TODO SSI: Resolve this performance tweak issue. + * + * Back-and-forth reference is write skew; thus doomed; however, + * rolling back here increases chances that a retry will still fail. + * It may be better to let it happen at commit time. Only performance + * testing can determine whether the next line should be used. + * + * Leaving it out would be *especially* valuable if the PreCommit + * checking could be changed to allow a commit in a situation where it + * is leaving another transaction in a state where a commit must fail + * -- when the doomed transaction eventually tries to commit, it would + * probably be at a time when an immediate retry is very likely to + * succeed. + */ + /* || writer->outConflict == reader */ + ) + failure = true; + else if (SxactIsCommitted(writer->outConflict)) + { + if (SxactCommittedBefore(writer->outConflict, writer) + && SxactCommittedBefore(writer->outConflict, reader)) + /* The out side of the pivot committed first. 
*/ + failure = true; + } + else + { + if (writer->outConflict->inConflict == writer->outConflict) + /* Self-reference will prevent checking at commit. */ + failure = true; + } + } + + if (reader->outConflict != writer + && reader->inConflict != InvalidSerializableXact + && !(reader->inConflict->rolledBack)) + { + /* The reader is or is becoming a pivot. */ + if (SxactIsCommitted(writer)) + { + if (SxactCommittedBefore(writer, reader) + && (reader->inConflict == reader + || SxactCommittedBefore(writer, reader->inConflict))) + /* The out side committed first, as far as we can tell. */ + failure = true; + } + else if (writer->inConflict != InvalidSerializableXact + && writer->inConflict != reader) + /* Self-reference will prevent checking at commit. */ + failure = true; + } + + if (failure) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errhint("The transaction might succeed if retried."))); + } + + /* + * PreCommit_CheckForSerializationFailure + * Check for dangerous structures in a serializable transaction + * at commit. + * + * We're checking for a dangerous structure as each conflict is recorded. + * The only way we could have a problem at commit is if this is the "out" + * side of a pivot, and neither the "in" side nor the pivot itself has yet + * committed. + */ + void + PreCommit_CheckForSerializationFailure(void) + { + if (MySerializableXact == InvalidSerializableXact) + return; + + Assert(IsXactIsoLevelFullySerializable); + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Checking at conflict detection should only allow self-reference in if + * this transaction is on the out side of a pivot, so + * self-reference is OK here. 
+ */ + if (MySerializableXact->inConflict != InvalidSerializableXact + && MySerializableXact->inConflict != MySerializableXact + && !(MySerializableXact->inConflict->rolledBack) + && MySerializableXact->inConflict->inConflict != InvalidSerializableXact + && !SxactIsCommitted(MySerializableXact->inConflict) + && !SxactIsCommitted(MySerializableXact->inConflict->inConflict)) + { + MySerializableXact->finishedBefore = ShmemVariableCache->nextXid; + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errhint("The transaction might succeed if retried."))); + } + + MySerializableXact->finishedBefore = ShmemVariableCache->nextXid; + LWLockRelease(SerializableXactHashLock); + } *** a/src/backend/utils/adt/lockfuncs.c --- b/src/backend/utils/adt/lockfuncs.c *************** *** 17,22 **** --- 17,23 ---- #include "miscadmin.h" #include "storage/proc.h" #include "utils/builtins.h" + #include "storage/predicate.h" /* This must match enum LockTagType! 
*/ *************** *** 32,42 **** static const char *const LockTagTypeNames[] = { --- 33,52 ---- "advisory" }; + /* This must match enum PredicateLockTargetType (predicate.h) */ + static const char *const PredicateLockTagTypeNames[] = { + "relation", + "page", + "tuple" + }; + /* Working status for pg_lock_status */ typedef struct { LockData *lockData; /* state data from lmgr */ int currIdx; /* current PROCLOCK index */ + PredicateLockData *predLockData; /* state data for pred locks */ + int predLockIdx; /* current index for pred lock */ } PG_Lock_Status; *************** *** 69,74 **** pg_lock_status(PG_FUNCTION_ARGS) --- 79,85 ---- FuncCallContext *funcctx; PG_Lock_Status *mystatus; LockData *lockData; + PredicateLockData *predLockData; if (SRF_IS_FIRSTCALL()) { *************** *** 126,131 **** pg_lock_status(PG_FUNCTION_ARGS) --- 137,144 ---- mystatus->lockData = GetLockStatusData(); mystatus->currIdx = 0; + mystatus->predLockData = GetPredicateLockStatusData(); + mystatus->predLockIdx = 0; MemoryContextSwitchTo(oldcontext); } *************** *** 303,308 **** pg_lock_status(PG_FUNCTION_ARGS) --- 316,385 ---- SRF_RETURN_NEXT(funcctx, result); } + /* + * Have returned all regular locks. Now start on the SIREAD predicate + * locks. + */ + predLockData = mystatus->predLockData; + if (mystatus->predLockIdx < predLockData->nelements) + { + PREDICATELOCKTARGETTAG *predTag = &(predLockData->locktags[mystatus->predLockIdx]); + SERIALIZABLEXACT *xact = &(predLockData->xacts[mystatus->predLockIdx]); + Datum values[14]; + bool nulls[14]; + HeapTuple tuple; + Datum result; + + mystatus->predLockIdx++; + + /* + * Form tuple with appropriate data. 
+ */ + MemSet(values, 0, sizeof(values)); + MemSet(nulls, false, sizeof(nulls)); + + /* lock type */ + PredicateLockTargetType lockType = GET_PREDICATELOCKTARGETTAG_TYPE(*predTag); + + values[0] = CStringGetTextDatum(PredicateLockTagTypeNames[lockType]); + + /* lock target */ + values[1] = GET_PREDICATELOCKTARGETTAG_DB(*predTag); + values[2] = GET_PREDICATELOCKTARGETTAG_RELATION(*predTag); + if (lockType == PREDLOCKTAG_TUPLE) + values[4] = GET_PREDICATELOCKTARGETTAG_OFFSET(*predTag); + else + nulls[4] = true; + if ((lockType == PREDLOCKTAG_TUPLE) || + (lockType == PREDLOCKTAG_PAGE)) + values[3] = GET_PREDICATELOCKTARGETTAG_PAGE(*predTag); + else + nulls[3] = true; + + /* these fields are targets for other types of locks */ + nulls[5] = true; /* virtualxid */ + nulls[6] = true; /* transactionid */ + nulls[7] = true; /* classid */ + nulls[8] = true; /* objid */ + nulls[9] = true; /* objsubid */ + + /* lock holder */ + values[10] = VXIDGetDatum(xact->tag.vxid.backendId, + xact->tag.vxid.localTransactionId); + nulls[11] = true; /* pid */ + + /* + * Lock mode. Currently all predicate locks are SIReadLocks, which are + * always held (never waiting) + */ + values[12] = CStringGetTextDatum("SIReadLock"); + values[13] = BoolGetDatum(true); + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + SRF_RETURN_DONE(funcctx); } *** a/src/backend/utils/adt/ri_triggers.c --- b/src/backend/utils/adt/ri_triggers.c *************** *** 3308,3314 **** ri_PerformCheck(RI_QueryKey *qkey, SPIPlanPtr qplan, /* * In READ COMMITTED mode, we just need to use an up-to-date regular * snapshot, and we will see all rows that could be interesting. But in ! * SERIALIZABLE mode, we can't change the transaction snapshot. 
If the * caller passes detectNewRows == false then it's okay to do the query * with the transaction snapshot; otherwise we use a current snapshot, and * tell the executor to error out if it finds any rows under the current --- 3308,3314 ---- /* * In READ COMMITTED mode, we just need to use an up-to-date regular * snapshot, and we will see all rows that could be interesting. But in ! * xact-snapshot-based modes, we can't change the transaction snapshot. If the * caller passes detectNewRows == false then it's okay to do the query * with the transaction snapshot; otherwise we use a current snapshot, and * tell the executor to error out if it finds any rows under the current *************** *** 3316,3322 **** ri_PerformCheck(RI_QueryKey *qkey, SPIPlanPtr qplan, * that SPI_execute_snapshot will register the snapshots, so we don't need * to bother here. */ ! if (IsXactIsoLevelSerializable && detectNewRows) { CommandCounterIncrement(); /* be sure all my own work is visible */ test_snapshot = GetLatestSnapshot(); --- 3316,3322 ---- * that SPI_execute_snapshot will register the snapshots, so we don't need * to bother here. */ ! 
if (IsXactIsoLevelXactSnapshotBased && detectNewRows) { CommandCounterIncrement(); /* be sure all my own work is visible */ test_snapshot = GetLatestSnapshot(); *** a/src/backend/utils/misc/guc.c --- b/src/backend/utils/misc/guc.c *************** *** 59,64 **** --- 59,65 ---- #include "storage/bufmgr.h" #include "storage/standby.h" #include "storage/fd.h" + #include "storage/predicate.h" #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" #include "utils/builtins.h" *************** *** 1670,1675 **** static struct config_int ConfigureNamesInt[] = --- 1671,1687 ---- }, { + {"max_predicate_locks_per_transaction", PGC_POSTMASTER, LOCK_MANAGEMENT, + gettext_noop("Sets the maximum number of predicate locks per transaction."), + gettext_noop("The shared predicate lock table is sized on the assumption that " + "at most max_predicate_locks_per_transaction * max_connections distinct " + "objects will need to be locked at any one time.") + }, + &max_predicate_locks_per_xact, + 64, 10, INT_MAX, NULL, NULL + }, + + { {"authentication_timeout", PGC_SIGHUP, CONN_AUTH_SECURITY, gettext_noop("Sets the maximum allowed time to complete client authentication."), NULL, *** a/src/backend/utils/resowner/resowner.c --- b/src/backend/utils/resowner/resowner.c *************** *** 261,267 **** ResourceOwnerReleaseInternal(ResourceOwner owner, --- 261,270 ---- * the top of the recursion. */ if (owner == TopTransactionResourceOwner) + { ProcReleaseLocks(isCommit); + ReleasePredicateLocks(isCommit); + } } else { *** a/src/backend/utils/time/snapmgr.c --- b/src/backend/utils/time/snapmgr.c *************** *** 37,44 **** /* ! * CurrentSnapshot points to the only snapshot taken in a serializable ! * transaction, and to the latest one taken in a read-committed transaction. * SecondarySnapshot is a snapshot that's always up-to-date as of the current * instant, even on a serializable transaction. It should only be used for * special-purpose code (say, RI checking.) --- 37,44 ---- /* ! 
* CurrentSnapshot points to the only snapshot taken in a xact-snapshot-based ! * transaction; otherwise to the latest one taken. * SecondarySnapshot is a snapshot that's always up-to-date as of the current * instant, even on a serializable transaction. It should only be used for * special-purpose code (say, RI checking.) *************** *** 97,107 **** static int RegisteredSnapshots = 0; bool FirstSnapshotSet = false; /* ! * Remembers whether this transaction registered a serializable snapshot at * start. We cannot trust FirstSnapshotSet in combination with ! * IsXactIsoLevelSerializable, because GUC may be reset before us. */ ! static bool registered_serializable = false; static Snapshot CopySnapshot(Snapshot snapshot); --- 97,107 ---- bool FirstSnapshotSet = false; /* ! * Remembers whether this transaction registered a transaction-based snapshot at * start. We cannot trust FirstSnapshotSet in combination with ! * IsXactIsoLevelXactSnapshotBased, because GUC may be reset before us. */ ! static bool registered_xact_snapshot = false; static Snapshot CopySnapshot(Snapshot snapshot); *************** *** 130,150 **** GetTransactionSnapshot(void) FirstSnapshotSet = true; /* ! * In serializable mode, the first snapshot must live until end of ! * xact regardless of what the caller does with it, so we must ! * register it internally here and unregister it at end of xact. */ ! if (IsXactIsoLevelSerializable) { CurrentSnapshot = RegisterSnapshotOnOwner(CurrentSnapshot, TopTransactionResourceOwner); ! registered_serializable = true; } return CurrentSnapshot; } ! if (IsXactIsoLevelSerializable) return CurrentSnapshot; CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); --- 130,153 ---- FirstSnapshotSet = true; /* ! * In xact-snapshot-based isolation levels, the first snapshot must ! * live until end of xact regardless of what the caller does with it, ! * so we must register it internally here and unregister it at end of ! * xact. */ ! 
if (IsXactIsoLevelXactSnapshotBased) { CurrentSnapshot = RegisterSnapshotOnOwner(CurrentSnapshot, TopTransactionResourceOwner); ! registered_xact_snapshot = true; ! if (IsXactIsoLevelFullySerializable) ! RegisterSerializableTransaction(CurrentSnapshot); } return CurrentSnapshot; } ! if (IsXactIsoLevelXactSnapshotBased) return CurrentSnapshot; CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); *************** *** 155,161 **** GetTransactionSnapshot(void) /* * GetLatestSnapshot * Get a snapshot that is up-to-date as of the current instant, ! * even if we are executing in SERIALIZABLE mode. */ Snapshot GetLatestSnapshot(void) --- 158,164 ---- /* * GetLatestSnapshot * Get a snapshot that is up-to-date as of the current instant, ! * even if we are executing in xact-snapshot-based mode. */ Snapshot GetLatestSnapshot(void) *************** *** 515,527 **** void AtEarlyCommit_Snapshot(void) { /* ! * On a serializable transaction we must unregister our private refcount ! * to the serializable snapshot. */ ! if (registered_serializable) UnregisterSnapshotFromOwner(CurrentSnapshot, TopTransactionResourceOwner); ! registered_serializable = false; } --- 518,530 ---- AtEarlyCommit_Snapshot(void) { /* ! * On a xact-snapshot-based transaction we must unregister our private ! * refcount to the xact snapshot. */ ! if (registered_xact_snapshot) UnregisterSnapshotFromOwner(CurrentSnapshot, TopTransactionResourceOwner); ! registered_xact_snapshot = false; } *************** *** 557,561 **** AtEOXact_Snapshot(bool isCommit) SecondarySnapshot = NULL; FirstSnapshotSet = false; ! registered_serializable = false; } --- 560,564 ---- SecondarySnapshot = NULL; FirstSnapshotSet = false; ! 
registered_xact_snapshot = false; } *** a/src/include/access/heapam.h --- b/src/include/access/heapam.h *************** *** 82,89 **** extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction); extern bool heap_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf, Relation stats_relation); ! extern bool heap_hot_search_buffer(ItemPointer tid, Buffer buffer, ! Snapshot snapshot, bool *all_dead); extern bool heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, bool *all_dead); --- 82,89 ---- extern bool heap_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf, Relation stats_relation); ! extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, ! Buffer buffer, Snapshot snapshot, bool *all_dead); extern bool heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, bool *all_dead); *** a/src/include/access/xact.h --- b/src/include/access/xact.h *************** *** 32,41 **** extern int DefaultXactIsoLevel; extern int XactIsoLevel; /* ! * We only implement two isolation levels internally. This macro should ! * be used to check which one is selected. */ ! #define IsXactIsoLevelSerializable (XactIsoLevel >= XACT_REPEATABLE_READ) /* Xact read-only state */ extern bool DefaultXactReadOnly; --- 32,45 ---- extern int XactIsoLevel; /* ! * We implement three isolation levels internally. ! * The two stronger ones use one snapshot per database transaction; ! * the others use one snapshot per statement. ! * Serializable uses predicate locks. ! * These macros should be used to check which isolation level is selected. */ ! #define IsXactIsoLevelXactSnapshotBased (XactIsoLevel >= XACT_REPEATABLE_READ) ! 
#define IsXactIsoLevelFullySerializable (XactIsoLevel == XACT_SERIALIZABLE) /* Xact read-only state */ extern bool DefaultXactReadOnly; *** a/src/include/catalog/pg_am.h --- b/src/include/catalog/pg_am.h *************** *** 49,54 **** CATALOG(pg_am,2601) --- 49,55 ---- bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */ bool amstorage; /* can storage type differ from column type? */ bool amclusterable; /* does AM support cluster command? */ + bool ampredlocks; /* does AM handle predicate locks? */ Oid amkeytype; /* type of data in index, or InvalidOid */ regproc aminsert; /* "insert this tuple" function */ regproc ambeginscan; /* "start new scan" function */ *************** *** 76,82 **** typedef FormData_pg_am *Form_pg_am; * compiler constants for pg_am * ---------------- */ ! #define Natts_pg_am 26 #define Anum_pg_am_amname 1 #define Anum_pg_am_amstrategies 2 #define Anum_pg_am_amsupport 3 --- 77,83 ---- * compiler constants for pg_am * ---------------- */ ! #define Natts_pg_am 27 #define Anum_pg_am_amname 1 #define Anum_pg_am_amstrategies 2 #define Anum_pg_am_amsupport 3 *************** *** 89,124 **** typedef FormData_pg_am *Form_pg_am; #define Anum_pg_am_amsearchnulls 10 #define Anum_pg_am_amstorage 11 #define Anum_pg_am_amclusterable 12 ! #define Anum_pg_am_amkeytype 13 ! #define Anum_pg_am_aminsert 14 ! #define Anum_pg_am_ambeginscan 15 ! #define Anum_pg_am_amgettuple 16 ! #define Anum_pg_am_amgetbitmap 17 ! #define Anum_pg_am_amrescan 18 ! #define Anum_pg_am_amendscan 19 ! #define Anum_pg_am_ammarkpos 20 ! #define Anum_pg_am_amrestrpos 21 ! #define Anum_pg_am_ambuild 22 ! #define Anum_pg_am_ambulkdelete 23 ! #define Anum_pg_am_amvacuumcleanup 24 ! #define Anum_pg_am_amcostestimate 25 ! #define Anum_pg_am_amoptions 26 /* ---------------- * initial contents of pg_am * ---------------- */ ! 
DATA(insert OID = 403 ( btree 5 1 t t t t t t t f t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions )); DESCR("b-tree index access method"); #define BTREE_AM_OID 403 ! DATA(insert OID = 405 ( hash 1 1 f t f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); DESCR("hash index access method"); #define HASH_AM_OID 405 ! DATA(insert OID = 783 ( gist 0 7 f f f t t t t t t 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); DESCR("GiST index access method"); #define GIST_AM_OID 783 ! DATA(insert OID = 2742 ( gin 0 5 f f f t t f f t f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); DESCR("GIN index access method"); #define GIN_AM_OID 2742 --- 90,126 ---- #define Anum_pg_am_amsearchnulls 10 #define Anum_pg_am_amstorage 11 #define Anum_pg_am_amclusterable 12 ! #define Anum_pg_am_ampredlocks 13 ! #define Anum_pg_am_amkeytype 14 ! #define Anum_pg_am_aminsert 15 ! #define Anum_pg_am_ambeginscan 16 ! #define Anum_pg_am_amgettuple 17 ! #define Anum_pg_am_amgetbitmap 18 ! #define Anum_pg_am_amrescan 19 ! #define Anum_pg_am_amendscan 20 ! #define Anum_pg_am_ammarkpos 21 ! #define Anum_pg_am_amrestrpos 22 ! #define Anum_pg_am_ambuild 23 ! #define Anum_pg_am_ambulkdelete 24 ! #define Anum_pg_am_amvacuumcleanup 25 ! #define Anum_pg_am_amcostestimate 26 ! #define Anum_pg_am_amoptions 27 /* ---------------- * initial contents of pg_am * ---------------- */ ! 
DATA(insert OID = 403 ( btree 5 1 t t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions )); DESCR("b-tree index access method"); #define BTREE_AM_OID 403 ! DATA(insert OID = 405 ( hash 1 1 f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); DESCR("hash index access method"); #define HASH_AM_OID 405 ! DATA(insert OID = 783 ( gist 0 7 f f f t t t t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); DESCR("GiST index access method"); #define GIST_AM_OID 783 ! DATA(insert OID = 2742 ( gin 0 5 f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); DESCR("GIN index access method"); #define GIN_AM_OID 2742 *** a/src/include/storage/lwlock.h --- b/src/include/storage/lwlock.h *************** *** 27,32 **** --- 27,36 ---- #define LOG2_NUM_LOCK_PARTITIONS 4 #define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS) + /* Number of partitions the shared predicate lock tables are divided into */ + #define LOG2_NUM_PREDICATELOCK_PARTITIONS 4 + #define NUM_PREDICATELOCK_PARTITIONS (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS) + /* * We have a number of predefined LWLocks, plus a bunch of LWLocks that are * dynamically assigned (e.g., for shared buffers). The LWLock structures *************** *** 70,81 **** typedef enum LWLockId RelationMappingLock, AsyncCtlLock, AsyncQueueLock, /* Individual lock IDs end here */ FirstBufMappingLock, FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS, /* must be last except for MaxDynamicLWLock: */ ! 
NumFixedLWLocks = FirstLockMgrLock + NUM_LOCK_PARTITIONS, MaxDynamicLWLock = 1000000000 } LWLockId; --- 74,89 ---- RelationMappingLock, AsyncCtlLock, AsyncQueueLock, + SerializableXactHashLock, + SerializableFinishedListLock, + SerializablePredicateLockListLock, /* Individual lock IDs end here */ FirstBufMappingLock, FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS, + FirstPredicateLockMgrLock = FirstLockMgrLock + NUM_LOCK_PARTITIONS, /* must be last except for MaxDynamicLWLock: */ ! NumFixedLWLocks = FirstPredicateLockMgrLock + NUM_PREDICATELOCK_PARTITIONS, MaxDynamicLWLock = 1000000000 } LWLockId; *** /dev/null --- b/src/include/storage/predicate.h *************** *** 0 **** --- 1,174 ---- + /*------------------------------------------------------------------------- + * + * predicate.h + * POSTGRES predicate locking definitions. + * + * + * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + #ifndef PREDICATE_H + #define PREDICATE_H + + #include "access/htup.h" + #include "utils/snapshot.h" + + /* GUC variables */ + extern int max_predicate_locks_per_xact; + + /* + * The SERIALIZABLEXACTTAG struct identifies a serializable transaction. + */ + typedef struct SERIALIZABLEXACTTAG + { + VirtualTransactionId vxid; /* We always have one of these. */ + } SERIALIZABLEXACTTAG; + + /* + * Information needed for each serializable database transaction to support SSI techniques. + * TODO SSI: Should inConflict and outConflict be lists? That would allow us to reduce + * false positives, *and* would allow us to guarantee that an immediate retry + * of a transaction would never fail on the exact same conflicts. + * The RAM doesn't look like it would be the limiting factor, but CPU time might + * be -- we should have baseline benchmarks before attempting this. 
+ */ + typedef struct SERIALIZABLEXACT + { + /* hash key */ + SERIALIZABLEXACTTAG tag; + + /* data */ + struct SERIALIZABLEXACT *outConflict; /* ptr to write transaction + * whose data we couldn't + * read. invalid means no + * conflict; self-reference + * means multiple or + * committed. */ + struct SERIALIZABLEXACT *inConflict; /* ptr to read transaction + * which couldn't see our + * write. invalid means no + * conflict; self-reference + * means multiple or + * committed. */ + TransactionId topXid; /* top level xid for the transaction, if one + * exists */ + TransactionId finishedBefore; /* invalid means still running; else + * the struct expires when no tags < + * this. */ + TransactionId xmin; /* the transaction's snapshot xmin */ + SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */ + SHM_QUEUE xids; /* list of associated SERIALIZABLEXID objects */ + SHM_QUEUE finishedLink; /* list link in + * FinishedSerializableTransactions */ + bool rolledBack; /* ignore conflicts when true; allows deferred + * cleanup */ + } SERIALIZABLEXACT; + + + typedef enum PredicateLockTargetType + { + PREDLOCKTAG_RELATION, + PREDLOCKTAG_PAGE, + PREDLOCKTAG_TUPLE + /* TODO Other types may be needed for index locking */ + } PredicateLockTargetType; + + /* + * The PREDICATELOCKTARGETTAG struct is defined to fit into 16 + * bytes with no padding. Note that this would need adjustment if we were + * to widen Oid or BlockNumber to more than 32 bits. + */ + typedef struct PREDICATELOCKTARGETTAG + { + uint32 locktag_field1; /* a 32-bit ID field */ + uint32 locktag_field2; /* a 32-bit ID field */ + uint32 locktag_field3; /* a 32-bit ID field */ + uint16 locktag_field4; /* a 16-bit ID field */ + uint16 locktag_field5; /* a 16-bit ID field */ + } PREDICATELOCKTARGETTAG; + + /* + * These macros define how we map logical IDs of lockable objects into + * the physical fields of PREDICATELOCKTARGETTAG. Use these to set up values, + * rather than accessing the fields directly. 
Note multiple eval of target! + * + * TODO SSI: If we always use the same fields for the same type of value, + * we should rename these. Holding off until it's clear there are no exceptions. + * Since indexes are relations with blocks and tuples, it's looking likely that + * the rename will be possible. If not, we may need to divide the last field + * and use part of it for a target type, so that we know how to interpret the + * data.. + */ + #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = InvalidBlockNumber, \ + (locktag).locktag_field4 = InvalidOffsetNumber, \ + (locktag).locktag_field5 = 0) + + #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = (blocknum), \ + (locktag).locktag_field4 = InvalidOffsetNumber, \ + (locktag).locktag_field5 = 0) + + #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = (blocknum), \ + (locktag).locktag_field4 = (offnum), \ + (locktag).locktag_field5 = 0) + + #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \ + ((locktag).locktag_field1) + #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \ + ((locktag).locktag_field2) + #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \ + ((locktag).locktag_field3) + #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \ + ((locktag).locktag_field4) + #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \ + (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \ + (((locktag).locktag_field3 != InvalidBlockNumber) ? 
PREDLOCKTAG_PAGE : \ + PREDLOCKTAG_RELATION)) + + typedef struct PredicateLockData + { + int nelements; + PREDICATELOCKTARGETTAG *locktags; + SERIALIZABLEXACT *xacts; + } PredicateLockData; + + /* + * function prototypes + */ + + /* housekeeping for shared memory predicate lock structures */ + extern void InitPredicateLocks(void); + extern Size PredicateLockShmemSize(void); + + /* predicate lock reporting */ + extern PredicateLockData *GetPredicateLockStatusData(void); + + /* predicate lock maintenance */ + extern void RegisterSerializableTransaction(const Snapshot snapshot); + extern void PredicateLockRelation(const Relation relation); + extern void PredicateLockPage(const Relation relation, const BlockNumber blkno); + extern void PredicateLockTuple(const Relation relation, const HeapTuple tuple); + extern void PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno); + extern void PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno); + extern void ReleasePredicateLocks(const bool isCommit); + + /* conflict detection (may also trigger rollback) */ + extern void CheckForSerializableConflictOut(const bool valid, const Relation relation, const HeapTuple tuple, const Buffer buffer); + extern void CheckForSerializableConflictIn(const Relation relation, const HeapTuple tuple, const Buffer buffer); + + /* final rollback checking */ + extern void PreCommit_CheckForSerializationFailure(void); + + #endif /* PREDICATE_H */ *** a/src/include/storage/shmem.h --- b/src/include/storage/shmem.h *************** *** 70,74 **** extern void SHMQueueInsertBefore(SHM_QUEUE *queue, SHM_QUEUE *elem); --- 70,75 ---- extern Pointer SHMQueueNext(SHM_QUEUE *queue, SHM_QUEUE *curElem, Size linkOffset); extern bool SHMQueueEmpty(SHM_QUEUE *queue); + extern bool SHMQueueIsDetached(SHM_QUEUE *queue); #endif /* SHMEM_H */ *** a/src/test/regress/GNUmakefile --- b/src/test/regress/GNUmakefile 
*************** *** 135,140 **** tablespace-setup: --- 135,157 ---- ## + ## Prepare for dtester tests + ## + pg_dtester.py: pg_dtester.py.in GNUmakefile $(top_builddir)/src/Makefile.global + sed -e 's,@bindir@,$(bindir),g' \ + -e 's,@libdir@,$(libdir),g' \ + -e 's,@pkglibdir@,$(pkglibdir),g' \ + -e 's,@datadir@,$(datadir),g' \ + -e 's/@VERSION@/$(VERSION)/g' \ + -e 's/@host_tuple@/$(host_tuple)/g' \ + -e 's,@GMAKE@,$(MAKE),g' \ + -e 's/@enable_shared@/$(enable_shared)/g' \ + -e 's/@GCC@/$(GCC)/g' \ + $< >$@ + chmod a+x $@ + + + ## ## Run tests ## *************** *** 152,157 **** installcheck-parallel: all --- 169,179 ---- standbycheck: all $(pg_regress_call) --psqldir=$(PSQLDIR) --schedule=$(srcdir)/standby_schedule --use-existing + dcheck: pg_dtester.py + ./pg_dtester.py --temp-install --top-builddir=$(top_builddir) \ + --multibyte=$(MULTIBYTE) $(MAXCONNOPT) $(NOLOCALE) + + # old interfaces follow... runcheck: check *** /dev/null --- b/src/test/regress/pg_dtester.py.in *************** *** 0 **** --- 1,1626 ---- + #!/usr/bin/python + + #------------------------------------------------------------------------- + # + # dtester.py.in + # + # Sample test suite running two concurrent transactions, showing + # off some capabilities of dtester. 
+ # + # Copyright (c) 2006-2010, Markus Wanner + # + #------------------------------------------------------------------------- + + import re, os, sys, getopt + from twisted.internet import defer, reactor + from twisted.python import failure + + from dtester.events import EventMatcher, EventSource, Event, \ + ProcessOutputEvent, ProcessErrorEvent, ProcessEndedEvent + from dtester.exceptions import TestAborted, TestFailure + from dtester.test import TestSuite, BaseTest, SyncTest + from dtester.reporter import StreamReporter, CursesReporter + from dtester.runner import Runner, Timeout + + # ****** definition of tests and suites *********************************** + + class InstallationSuite(TestSuite): + + setUpDescription = "creating temporary installation" + tearDownDescription = "removing temporary installation" + + needs = (('shell', "IShell or something"),) + + def setUp(self): + # inherit getConfig from the shell + setattr(self, 'getConfig', self.shell.getConfig) + setattr(self, 'runCommand', self.shell.runCommand) + setattr(self, 'recursive_remove', self.shell.recursive_remove) + + # (re) create an installation directory + self.pg_inst_dir = self.shell.getConfig('inst_dir') + if os.path.exists(self.pg_inst_dir): + self.shell.recursive_remove(self.pg_inst_dir) + os.mkdir(self.pg_inst_dir) + + # install into that directory + proc = self.shell.runCommand('make', 'make', + args=['make', '-C', self.shell.getConfig('top-builddir'), + 'DESTDIR=%s' % self.pg_inst_dir, 'install', + 'with_perl=no', 'with_python=no'], + lineBasedOutput=True) + + d = self.waitFor(proc, EventMatcher(ProcessEndedEvent)) + d.addCallback(self.makeTerminated) + proc.start() + + # FIXME: how to properly handle these? 
+ self.shell.addEnvPath(self.shell.getConfig('bindir')) + self.shell.addEnvLibraryPath(self.shell.getConfig('libdir')) + return d + + def makeTerminated(self, event): + if event.exitCode != 0: + raise Exception("Initdb returned %d" % event.exitCode) + else: + return True + + def tearDown(self): + # The installation procedure should be able to simply override any + # formerly installed files, so we save the time to clean up the + # installation directory. + return + + + class InitdbSuite(TestSuite): + + args = (('number', int), ) + needs = (('shell', "IShell or something"),) + + def setUpDescription(self): + return "initializing database system %d" % self.number + + def tearDownDescription(self): + return "removing database system %d" % self.number + + def getNumber(self): + return self.number + + def getDir(self): + return self.dbdir + + def setUp(self): + self.dbdir = "%s%d" % \ + (self.shell.getConfig('pgdata_prefix'), self.number) + proc = self.shell.runCommand( + 'initdb-%d' % self.number, + 'initdb', args = [ + 'initdb', '-D', self.dbdir, + '-A', 'trust', '--noclean'], + lineBasedOutput=True) + + d = defer.Deferred() + proc.addHook(EventMatcher(ProcessEndedEvent), + self.initdb_terminated, d) + proc.start() + return d + + def initdb_terminated(self, event, d): + if event.exitCode != 0: + d.errback(Exception("Initdb returned %d" % event.exitCode)) + else: + d.callback(True) + + def tearDown(self): + self.shell.recursive_remove( + "%s%d" % (self.shell.getConfig('pgdata_prefix'), self.number)) + + + class PostmasterSuite(TestSuite): + + needs = (('shell', "IShell or something"), + ('dbdir', "IDatabaseDir"),) + + def setUpDescription(self): + return "starting database system %d" % self.dbdir.getNumber() + + def tearDownDescription(self): + return "stopping database system %d" % self.dbdir.getNumber() + + def getPort(self): + return self.port + + def setUp(self): + setattr(self, 'getNumber', self.dbdir.getNumber) + + self.port = self.shell.getConfig('temp-port') + 
self.dbdir.getNumber() + + args = ['postmaster', '-d5', + '-D', self.dbdir.getDir(), + '-i', '-p', str(self.port)] + if self.shell.getConfig('enable_cassert'): + args += "-A1" + + self.postmaster = self.shell.runCommand( + 'postmaster%d' % self.dbdir.getNumber(), + 'postmaster', + args = args, + lineBasedOutput=True) + + d = defer.Deferred() + self.readyHook = \ + self.postmaster.addHook(EventMatcher(ProcessErrorEvent, + "database system is ready to accept connections"), + self.postmaster_ready, d) + + self.unexpectedTerminationHook = \ + self.postmaster.addHook(EventMatcher(ProcessEndedEvent), + self.postmaster_terminated) + self.postmaster.start() + return d + + def postmaster_ready(self, event, d): + # it's sufficient if we're called once + self.postmaster.removeHook(self.readyHook) + d.callback(None) + + def postmaster_terminated(self, event): + exitCode = 'undef' + if hasattr(event, 'exitCode'): + exitCode = event.exitCode + elif hasattr(event, 'data'): + exitCode = repr(event.data) + self.abort("postmaster %d unexpectedly terminated (exit code %s)" % \ + (self.dbdir.getNumber(), exitCode)) + + def tearDown(self): + self.postmaster.removeHook(self.unexpectedTerminationHook) + if not self.aborted: + d = defer.Deferred() + self.postmaster.addHook(EventMatcher(ProcessEndedEvent), + lambda event: d.callback(None)) + self.postmaster.stop() + return d + else: + return True + + + class TestDatabaseSuite(TestSuite): + + args = (('dbname', str),) + needs = (('shell', "IShell or something"), + ('pg', "IPostmaster"),) + + def setUpDescription(self): + return "creating database %s at server %d" % \ + (self.dbname, self.pg.getNumber()) + + def tearDownDescription(self): + return "not (!) 
dropping database %s at server %d" % \ + (self.dbname, self.pg.getNumber()) + + def getDbname(self): + return self.dbname + + def setUp(self): + setattr(self, "getPort", self.pg.getPort) + setattr(self, "getNumber", self.pg.getNumber) + + self.proc = self.shell.runCommand( + 'createdb%d' % self.pg.getNumber(), + 'createdb', + args = ['createdb', + '-p', str(self.getPort()), self.dbname], + lineBasedOutput=True) + + d = defer.Deferred() + self.proc.addHook(EventMatcher(ProcessEndedEvent), + self.createdb_terminated, d) + self.proc.start() + return d + + def createdb_terminated(self, event, d): + if event.exitCode != 0: + d.errback(Exception("createdb terminated with code %d" % \ + event.exitCode)) + else: + d.callback(None) + + def tearDown(self): + if self.pg.aborted: + return True + + # Hm.. this interferes with the postmaster suites, which need + # to be started and stopped several times on top of a test database, + # however, creating and dropping it certainly depends on a running + # postmaster. Not sure how to solve this, at the moment I'm just + # skipping cleanup, i.e. dropdb. 
+ return True + + self.proc = self.shell.runCommand( + 'dropdb%d' % self.pg.getNumber(), + 'dropdb', + args = ['dropdb', + '-p', str(self.getPort()), self.dbname], + lineBasedOutput=True) + + d = defer.Deferred() + self.proc.addHook(EventMatcher(ProcessEndedEvent), + self.dropdb_terminated, d) + self.proc.start() + return d + + def dropdb_terminated(self, event, d): + if event.exitCode != 0: + d.errback(Exception("dropdb returned with %d" % \ + event.exitCode)) + else: + d.callback(None) + + + class SqlConnectionSuite(TestSuite): + + args = (('dbname', str),) + needs = (('shell', "IShell or something"), + ('db', "IPostmaster")) + + def setUpDescription(self): + return "connecting to database %s at server %d" % \ + (self.dbname, self.db.getNumber()) + def tearDownDescription(self): + return "disconnecting from database %s at server %d" % \ + (self.dbname, self.db.getNumber()) + + def getDbname(self): + return self.dbname + + def setUp(self): + self.psql = self.shell.runCommand( + 'psql%d' % self.db.getNumber(), + 'psql', + args=['psql', '-AEn', + '--pset=pager=off', '--pset=columns=0', + '-p', str(self.db.getPort()), + self.dbname]) + + # initialize the output buffer and attach a first output collector + # *before* the process is started. + self.output_buffer = "" + d = defer.Deferred() + self.outputCollectorDeferred = d + self.outputCollectorHook = self.psql.addHook( + EventMatcher(ProcessOutputEvent), self.outputCollector, + None, d) + + # Mark as being in used, until we get to the commandline + self.inUse = True + self.workQueue = [] + + # also add a termination hook + self.unexpectedTerminationHook = self.psql.addHook( + EventMatcher(ProcessEndedEvent), self.psql_terminated) + + # then schedule start of the psql process and return the deferred + # *before* starting the process. 
+ reactor.callLater(0.0, self.psql.start) + return d + + def psql_terminated(self, event): + exitCode = "undef" + if hasattr(event, 'exitCode'): + exitCode = event.exitCode + elif hasattr(event, 'data'): + exitCode = repr(event.data) + + # If there's an outputCollectorHook, the abort method won't catch + # and we have to wait for the timeout to trigger, instead of + # acting on process termination. We thus save the outputCollector + # deferred and send it an errback with the failure. + if self.outputCollectorHook: + self.outputCollectorDeferred.errback( \ + TestAborted("psql to server %d unexpectedly terminated (exit code %s)" % ( \ + self.db.getNumber(), exitCode))) + self.abort( + "psql to server %d unexpectedly terminated (exit code %s)" % ( \ + self.db.getNumber(), exitCode)) + + def tearDown(self): + self.psql.removeHook(self.unexpectedTerminationHook) + + d = defer.Deferred() + self.psql.addHook(EventMatcher(ProcessEndedEvent), + lambda event: d.callback(None)) + reactor.callLater(0.0, self.psql.write, "\\q\n") + reactor.callLater(5.0, self.psql.stop) + return d + + def outputCollector(self, event, query, d): + self.output_buffer += event.data + + cmdprompt = self.dbname + '=#' + cpos = self.output_buffer.find(cmdprompt) + + if cpos >= 0: + self.psql.removeHook(self.outputCollectorHook) + self.outputCollectorHook = False + result = self.output_buffer[:cpos] + self.output_buffer = self.output_buffer[cpos + len(cmdprompt):] + if len(self.output_buffer) > 0 and self.output_buffer != ' ': + print "rest: %s" % repr(self.output_buffer) + if d: + # remove the command prompt at the end + result = result[:cpos] + + if query: + # remove the query string at the beginning + query_len = len(query) + if result[:query_len] != query: + raise Exception("Query not found at beginning of psql answer.") + + result = result[query_len:] + while (len(result) > 1) and (result[0] in ("\n", "\r", " ")): + result = result[1:] + reactor.callLater(0.0, d.callback, result) + + self.inUse = 
False + if len(self.workQueue) > 0: + assert not self.inUse + job = self.workQueue.pop() + d1 = job['method'](*job['args']) + d1.chainDeferred(job['deferred']) + + def query(self, query): + if self.inUse: + d = defer.Deferred() + self.workQueue.append({'deferred': d, + 'method': self.query, + 'args': (query,)}) + return d + + assert not self.inUse + assert not self.outputCollectorHook + + self.inUse = True + self.output_buffer = "" + d = defer.Deferred() + self.outputCollectorHook = self.psql.addHook( + EventMatcher(ProcessOutputEvent), self.outputCollector, query, d) + d.addCallback(self.parseQueryResult) + + # defer writing to the process, so that the caller has the + # opportunity to add callbacks to the deferred we return. + reactor.callLater(0.0, self.psql.write, query + "\n") + + return d + + def parseQueryResult(self, result): + rawlines = result.split('\n') + + lines = [] + for line in rawlines: + line = line.strip() + if line.startswith("ROLLBACK"): + raise Exception("transaction rolled back (%s)" % query) + if line.startswith("message type"): + raise Exception("protocol error: %s" % line) + if len(line) > 0 and not line.startswith("NOTICE:") \ + and not line.startswith("ROLLBACK"): + lines.append(line) + + try: + assert len(lines) >= 2 + + lines = map(lambda x: x.strip(), lines) + headLine = lines[0] + tailLine = lines[-1] + + fields = headLine.split('|') + rows = [] + for row in lines[1:-1]: + attrs = row.split('|') + assert len(attrs) == len(fields) + x = {} + for i in range(len(attrs)): + x[fields[i]] = attrs[i].strip() + rows.append(x) + + x = re.compile("\((\d+) rows?\)").search(tailLine) + if x: + if not int(x.group(1)) == len(rows): + raise Exception("number of rows doesn't match: %s vs %d for: '%s'" % ( + x.group(1), len(rows), lines)) + else: + raise Exception("final number of rows line doesn't match.\n------------\n%s\n---------------\n" % lines) + return rows + except Exception, e: + import traceback + print "error parsing query result: %s" % e 
+ traceback.print_exc() + raise e + # return [] + + def operation(self, query, expResult=None): + if self.inUse: + d = defer.Deferred() + self.workQueue.append({'deferred': d, + 'method': self.operation, + 'args': (query, expResult)}) + return d + + assert not self.inUse + assert not self.outputCollectorHook + + self.inUse = True + self.output_buffer = "" + d = defer.Deferred() + self.outputCollectorDeferred = d + self.outputCollectorHook = self.psql.addHook( + EventMatcher(ProcessOutputEvent), self.outputCollector, query, d) + d.addCallback(self.checkQueryResult, query, expResult) + + # defer writing to the process, so that the caller has the + # opportunity to add callbacks to the deferred we return. + reactor.callLater(0.0, self.psql.write, query + "\n") + + return d + + def checkQueryResult(self, result, query, expResult): + lines = [] + for line in result.split("\n"): + line = line.strip() + if len(line) > 0 and not line.startswith("WARNING:") \ + and not line.startswith("NOTICE:"): + lines.append(line) + lines = "\n".join(lines) + if expResult: + if isinstance(expResult, str): + self.assertEqual(expResult, lines, + "didn't get expected result for query '%s'" % query) + elif isinstance(expResult, list): + if not lines in expResult: + raise TestFailure("didn't get expected result", + "no result matches, got:\n%s\nfor query: '%s'\n" % (lines, query)) + return lines + + + class TestDatabaseConnection(BaseTest): + + needs = (('conn', "ISqlConnection"),) + + description = "database connection" + + def run(self): + return self.conn.query("SELECT 1 AS test;") + + + # FIXME: that's not actually a test, but it modifies the database state + class PopulateTestDatabase(BaseTest): + + needs = (('conn', "ISqlConnection"),) + + description = "populate test database" + + def run(self): + conn = self.conn + + # Create a test table for use in TestConcurrentUpdates and fill it + # with two test tuples. 
+ d = conn.operation("CREATE TABLE test (i int PRIMARY KEY, t text);", + "CREATE TABLE") + d.addCallback(lambda x: conn.operation( + "INSERT INTO test VALUES (5, 'apple');", + "INSERT 0 1")) + d.addCallback(lambda x: conn.operation( + "INSERT INTO test VALUES (7, 'pear');", + "INSERT 0 1")) + d.addCallback(lambda x: conn.operation( + "INSERT INTO test VALUES (11, 'banana');", + "INSERT 0 1")) + return d + + + class PermutationTest(SyncTest): + """ Abstract class for testing a set of steps in all permutations of execution order. + This counts as a single test, although a subclass may accumulate counts which may be of + interest, and should therefore be shown regardless of success or failure of the test. + """ + + # stepDictionary maps a step ID to a function to run for that step. + stepDictionary = {} + + # stepThreading is a list of lists. + # All permutations of interleaving of steps from the sublists will be generated. + # Steps from within each sublist are kept in order; only the interleaving is variable. + stepThreading = [[]] + + # Override this to provide any per-iteration (permutation) setup. + def setUpIteration(self, stepIdList): + pass + + # Override this to provide any per-iteration (permutation) teardown. 
+ def tearDownIteration(self, stepIdList): + pass + + def runIterationStep(self, stepId): + p = self.stepDictionary[stepId] + p() + + def runIterationSteps(self, stepIdList): + try: + self.setUpIteration(stepIdList) + for stepId in stepIdList: + self.runIterationStep(stepId) + finally: + self.tearDownIteration(stepIdList) + + def runPermutations(self, a): + self.runPermutations_recurse([], a) + + def runPermutations_recurse(self, p, a): + found = False + for i in range(len(a)): + if len(a[i]) > 0: + found = True + r = p[:] + b = a[:] + r.append(b[i][0]) + b[i] = b[i][1:] + self.runPermutations_recurse(r, b) + if not found: + self.runIterationSteps(p) + + # If the dictionary is set up in this method, there can be references + # to class methods and fields. + def populateStepDictionary(self): + pass + + def run(self): + self.populateStepDictionary() + self.runPermutations(self.stepThreading) + # The last two lines of output for the last entry seem to disappear??? + print + print + + + class DummyPermutationTest(PermutationTest): + """ Simple test of the PermutationTest abstract class. + """ + + description = "simple test of the PermutationTest abstract class" + + stepThreading = [['r1x','c1'],['r2x','c2']] + + def setUpIteration(self, stepIdList): + print stepIdList + + def tearDownIteration(self, stepIdList): + print + + def printStepId(self, stepId): + print stepId, + + def populateStepDictionary(self): + self.stepDictionary = { + 'r1x': lambda : self.printStepId('r1x'), + 'c1': lambda : self.printStepId('c1'), + 'r2x': lambda : self.printStepId('r2x'), + 'c2': lambda : self.printStepId('c2') + } + + + class DatabasePermutationTest(PermutationTest): + """ Abstract class to provide framework for using an IterativeTest for database queries. 
+ """ + + commitRequiredCount = 0 + commitRequiredOK = 0 + rollbackRequiredCount = 0 + rollbackRequiredOK = 0 + commitPreferredCount = 0 + commitPreferredOK = 0 + + serializationFailure = False + + def commitRequired(self, stepIdList): + return True + + def rollbackRequired(self, stepIdList): + return False + + def countProgress(self, stepIdList): + if self.rollbackRequired(stepIdList): + self.rollbackRequiredCount += 1 + if self.serializationFailure: + self.rollbackRequiredOK += 1 + else: + if self.commitRequired(stepIdList): + self.commitRequiredCount += 1 + if not self.serializationFailure: + self.commitRequiredOK += 1 + else: + self.commitPreferredCount += 1 + if not self.serializationFailure: + self.commitPreferredOK += 1 + + def runIterationSteps(self, stepIdList): + try: + self.setUpIteration(stepIdList) + for stepId in stepIdList: + self.runIterationStep(stepId) + self.countProgress(stepIdList) + finally: + self.tearDownIteration(stepIdList) + + def tryOperation(self, conn, sql): + result = self.syncCall(10, conn.operation, sql), + for line in result: + if len(line) > 0 and line.startswith("ERROR: could not serialize"): + self.serializationFailure = True + else: + if len(line) > 0 and line.startswith("ERROR:"): + raise TestFailure("failure other than serializable encountered: " + line, line) + + def printStatistics(self): + print 'rollback required: ', self.rollbackRequiredOK, '/', self.rollbackRequiredCount + print 'commit required: ', self.commitRequiredOK, '/', self.commitRequiredCount + print 'commit preferred: ', self.commitPreferredOK, '/', self.commitPreferredCount + + def run(self): + self.populateStepDictionary() + self.runPermutations(self.stepThreading) + self.printStatistics() + # The last two lines of output for the last entry seem to disappear??? 
+ print + print + if self.rollbackRequiredOK < self.rollbackRequiredCount: + raise TestFailure("serialization anomalies incorrectly allowed", + "Database integrity not protected.") + if self.commitRequiredOK < self.commitRequiredCount: + raise TestFailure("serialization failure occurred when it should not have", + "Transactions we thought we knew how to recognize as safe resulted in a rollback..") + + def printStepResults(self, stepIdList): + if self.serializationFailure: + if self.commitRequired(stepIdList): + print 'rolled back ??' + else: + if not self.rollbackRequired(stepIdList): + print 'rolled back ?' + else: + print 'rolled back' + else: + if self.rollbackRequired(stepIdList): + print 'committed ***' + else: + print 'committed' + + + class SimpleWriteSkewTest(DatabasePermutationTest): + """ Write skew test. + This test has two serializable transactions: one which updates all + 'apple' rows to 'pear' and one which updates all 'pear' rows to + 'apple'. If these were serialized (run one at a time) either + value could be present, but not both. One must be rolled back to + prevent the write skew anomaly. 
+ """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "write skew test" + + stepThreading = [['rwx1','c1'],['rwx2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'rwx1': lambda : self.tryOperation(self.conn1, "UPDATE test SET t = 'apple' WHERE t = 'pear';"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'rwx2': lambda : self.tryOperation(self.conn2, "UPDATE test SET t = 'pear' WHERE t = 'apple';"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + print stepIdList, + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "UPDATE test SET t = 'apple' WHERE i = 5;", "UPDATE 1") + self.syncCall(10, self.conn1.operation, "UPDATE test SET t = 'pear' WHERE i = 7;", "UPDATE 1") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return (stepIdList.index('c1') < stepIdList.index('rwx2') + or stepIdList.index('c2') < stepIdList.index('rwx1')) + + def rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class ReceiptReportTest(DatabasePermutationTest): + """ Daily Report of Receipts test. + This test doesn't persist a bad state in the database; rather, it + provides a view of the data which is not consistent with any + order of execution of the serializable transactions. It + demonstrates a situation where the deposit date for receipts could + be changed and a report of the closed day's receipts subsequently + run which will miss a receipt from the date which has been closed. 
+ """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection'), + ('conn3', 'ISqlConnection')) + + description = "daily report of receipts test" + + stepThreading = [['rxwy1','c1'],['wx2','c2'],['rx3','ry3','c3']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'rxwy1': lambda : self.tryOperation(self.conn1, "INSERT INTO receipt VALUES (3, (SELECT deposit_date FROM ctl WHERE k = 'receipt'), 4.00);"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'wx2': lambda : self.tryOperation(self.conn2, "UPDATE ctl SET deposit_date = DATE '2008-12-23' WHERE k = 'receipt';"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;"), + 'rx3': lambda : self.tryOperation(self.conn3, "SELECT * FROM ctl WHERE k = 'receipt';"), + 'ry3': lambda : self.tryOperation(self.conn3, "SELECT * FROM receipt WHERE deposit_date = DATE '2008-12-22';"), + 'c3': lambda : self.tryOperation(self.conn3, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + print stepIdList, + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS ctl, receipt;") + self.syncCall(10, self.conn1.operation, "CREATE TABLE ctl (k text NOT NULL PRIMARY KEY, deposit_date date NOT NULL);") + self.syncCall(10, self.conn1.operation, "INSERT INTO ctl VALUES ('receipt', DATE '2008-12-22');") + self.syncCall(10, self.conn1.operation, "CREATE TABLE receipt (receipt_no int NOT NULL PRIMARY KEY, deposit_date date NOT NULL, amount numeric(13,2));") + self.syncCall(10, self.conn1.operation, "INSERT INTO receipt VALUES (1, (SELECT deposit_date FROM ctl WHERE k = 'receipt'), 1.00);") + self.syncCall(10, self.conn1.operation, "INSERT INTO receipt VALUES (2, (SELECT deposit_date FROM ctl WHERE k = 'receipt'), 2.00);") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, 
self.conn3.operation, "BEGIN TRANSACTION READ ONLY ISOLATION LEVEL SERIALIZABLE READ ONLY;", "BEGIN") + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.syncCall(10, self.conn3.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( (stepIdList.index('c1') < stepIdList.index('wx2') + and stepIdList.index('c1') < stepIdList.index('rx3')) + or (stepIdList.index('c2') < stepIdList.index('rxwy1') + and stepIdList.index('c2') < stepIdList.index('rx3')) + or (stepIdList.index('c3') < stepIdList.index('rxwy1') + and stepIdList.index('c3') < stepIdList.index('wx2')) + or (stepIdList.index('c2') < stepIdList.index('rxwy1') + and stepIdList.index('c3') < stepIdList.index('rxwy1')) + or (stepIdList.index('c1') < stepIdList.index('wx2') + and stepIdList.index('c3') < stepIdList.index('wx2')) + or (stepIdList.index('c1') < stepIdList.index('rx3') + and stepIdList.index('c2') < stepIdList.index('rx3'))) + + def rollbackRequired(self, stepIdList): + return ((stepIdList.index('c2') < stepIdList.index('c1') + and stepIdList.index('c2') < stepIdList.index('c3') + and stepIdList.index('rxwy1') < stepIdList.index('c2') + and stepIdList.index('rx3') < stepIdList.index('c1') + ############################################################# + # The following test excludes some rows from rollback + # required for which we know our current SSI algorithm + # requires a rollback, but which don't, in fact, cause + # any anomaly. If we determine that we can allow pivots + # in which conflictIn and conflictOut are separate and + # overlapping transactions, these can be committed. + # To include these permutations in the "rollback required" + # count, comment out the following line. 
+ and stepIdList.index('c2') < stepIdList.index('rx3') + ############################################################# + ) + + ############################################################# + # An anomaly can't actually occur based on the following + # "or" clause, but we know that our algorithm can't + # currently detect that, because T2's conflictIn is set + # to a self-reference because of multiple conflicts. + # To count these in the "rollback required" list, uncomment + # this section; otherwise they are "commit preferred".. + # or (stepIdList.index('rxwy1') < stepIdList.index('c1') + # and stepIdList.index('rxwy1') < stepIdList.index('c2') + # and stepIdList.index('rxwy1') < stepIdList.index('c3') + # and stepIdList.index('wx2') < stepIdList.index('c1') + # and stepIdList.index('wx2') < stepIdList.index('c2') + # and stepIdList.index('wx2') < stepIdList.index('c3') + # and stepIdList.index('rx3') < stepIdList.index('c1') + # and stepIdList.index('rx3') < stepIdList.index('c2') + # and stepIdList.index('rx3') < stepIdList.index('c3') + # ) + ############################################################# + ) + + + class TemporalRangeIntegrityTest(DatabasePermutationTest): + """ Temporal range integrity test. + Snapshot integrity fails with simple referential integrity tests, + but those don't make for good demonstrations because people just + say that foreign key definitions should be used instead. There + are many integrity tests which are conceptually very similar but + don't have built-in support which will fail when used in triggers. + This is intended to illustrate such cases. It is obviously very + hard to exercise all these permutations when the code is actually + in a trigger; this test pulls what would normally be inside of + triggers out to the top level to control the permutations. 
+ """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "temporal range integrity test" + + stepThreading = [['rx1','wy1','c1'],['ry2','wx2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'rx1': lambda : self.tryOperation(self.conn1, "SELECT count(*) FROM statute WHERE statute_cite = '123.45(1)a' AND eff_date <= DATE '2009-05-15' AND (exp_date IS NULL OR exp_date > DATE '2009-05-15');"), + 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO offense VALUES (1, '123.45(1)a', DATE '2009-05-15');"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'ry2': lambda : self.tryOperation(self.conn2, "SELECT count(*) FROM offense WHERE statute_cite = '123.45(1)a' AND offense_date >= DATE '2008-01-01';"), + 'wx2': lambda : self.tryOperation(self.conn2, "DELETE FROM statute WHERE statute_cite = '123.45(1)a' AND eff_date = DATE '2008-01-01';"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS statute, offense;", "DROP TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE statute (statute_cite text NOT NULL, eff_date date NOT NULL, exp_date date, CONSTRAINT statute_pkey PRIMARY KEY (statute_cite, eff_date));", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "INSERT INTO statute VALUES ('123.45(1)a', DATE '2008-01-01', NULL);", "INSERT 0 1") + self.syncCall(10, self.conn1.operation, "CREATE TABLE offense (offense_no int NOT NULL, statute_cite text NOT NULL, offense_date date NOT NULL, CONSTRAINT offense_pkey PRIMARY KEY (offense_no));", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + print stepIdList, + + def tearDownIteration(self, stepIdList): + 
self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( stepIdList.index('c1') < stepIdList.index('ry2') + or stepIdList.index('c2') < stepIdList.index('rx1')) + + def rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class ProjectManagerTest(DatabasePermutationTest): + """ Project manager test. + Ensure that the person who is on the project as a manager + is flagged as a project manager in the person table. + """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "project manager test" + + stepThreading = [['rx1','wy1','c1'],['ry2','wx2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'rx1': lambda : self.tryOperation(self.conn1, "SELECT count(*) FROM person WHERE person_id = 1 AND is_project_manager;"), + 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO project VALUES (101, 'Build Great Wall', 1);"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'ry2': lambda : self.tryOperation(self.conn2, "SELECT count(*) FROM project WHERE project_manager = 1;"), + 'wx2': lambda : self.tryOperation(self.conn2, "UPDATE person SET is_project_manager = false WHERE person_id = 1;"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS person, project;", "DROP TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE person (person_id int NOT NULL PRIMARY KEY, name text NOT NULL, is_project_manager bool NOT NULL);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "INSERT INTO person VALUES (1, 'Robert Haas', true);", "INSERT 0 1") + self.syncCall(10, self.conn1.operation, "CREATE TABLE project (project_no int NOT NULL PRIMARY KEY, description text NOT 
NULL, project_manager int NOT NULL);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + print stepIdList, + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( stepIdList.index('c1') < stepIdList.index('ry2') + or stepIdList.index('c2') < stepIdList.index('rx1')) + + def rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class ClassroomSchedulingTest(DatabasePermutationTest): + """ Classroom scheduling test. + Ensure that the classroom is not scheduled more than once + for any moment in time. + """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "classroom scheduling test" + + stepThreading = [['rx1','wy1','c1'],['ry2','wx2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'rx1': lambda : self.tryOperation(self.conn1, "SELECT count(*) FROM room_reservation WHERE room_id = '101' AND start_time < TIMESTAMP WITH TIME ZONE '2010-04-01 14:00' AND end_time > TIMESTAMP WITH TIME ZONE '2010-04-01 13:00';"), + 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO room_reservation VALUES ('101', TIMESTAMP WITH TIME ZONE '2010-04-01 13:00', TIMESTAMP WITH TIME ZONE '2010-04-01 14:00', 'Carol');"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'ry2': lambda : self.tryOperation(self.conn2, "SELECT count(*) FROM room_reservation WHERE room_id = '101' AND start_time < TIMESTAMP WITH TIME ZONE '2010-04-01 14:30' AND end_time > TIMESTAMP WITH TIME ZONE '2010-04-01 13:30';"), + 'wx2': lambda : self.tryOperation(self.conn2, "UPDATE room_reservation SET start_time = TIMESTAMP WITH TIME ZONE '2010-04-01 
13:30', end_time = TIMESTAMP WITH TIME ZONE '2010-04-01 14:30' WHERE room_id = '101' AND start_time = TIMESTAMP WITH TIME ZONE '2010-04-01 10:00';"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS room_reservation;", "DROP TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE room_reservation (room_id text NOT NULL, start_time timestamp with time zone NOT NULL, end_time timestamp with time zone NOT NULL, description text NOT NULL, CONSTRAINT room_reservation_pkey PRIMARY KEY (room_id, start_time));", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "INSERT INTO room_reservation VALUES ('101', TIMESTAMP WITH TIME ZONE '2010-04-01 10:00', TIMESTAMP WITH TIME ZONE '2010-04-01 11:00', 'Bob');", "INSERT 0 1") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + print stepIdList, + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( stepIdList.index('c1') < stepIdList.index('ry2') + or stepIdList.index('c2') < stepIdList.index('rx1')) + + def rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class TotalCashTest(DatabasePermutationTest): + """ Total cash test. + Another famous test of snapshot isolation anomaly. 
+ """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "total cash test" + + stepThreading = [['wx1','rxy1','c1'],['wy2','rxy2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'wx1': lambda : self.tryOperation(self.conn1, "UPDATE accounts SET balance = balance - 200 WHERE accountid = 'checking';"), + 'rxy1': lambda : self.tryOperation(self.conn1, "SELECT SUM(balance) FROM accounts;"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'wy2': lambda : self.tryOperation(self.conn2, "UPDATE accounts SET balance = balance - 200 WHERE accountid = 'savings';"), + 'rxy2': lambda : self.tryOperation(self.conn2, "SELECT SUM(balance) FROM accounts;"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS accounts;", "DROP TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE accounts (accountid text NOT NULL PRIMARY KEY, balance numeric not null);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "INSERT INTO accounts VALUES ('checking', 600),('savings',600);", "INSERT 0 2") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + print stepIdList, + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( stepIdList.index('c1') < stepIdList.index('wy2') + or stepIdList.index('c2') < stepIdList.index('wx1')) + + def rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class ReferentialIntegrityTest(DatabasePermutationTest): + """ Referential integrity test. 
+ The assumption here is that the application code issuing the SELECT + to test for the presence or absence of a related record would do the + right thing -- this script doesn't include that logic. + """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "referential integrity test" + + stepThreading = [['rx1','wy1','c1'],['rx2','ry2','wx2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'rx1': lambda : self.tryOperation(self.conn1, "SELECT i FROM a WHERE i = 1;"), + 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO b VALUES (1);"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'rx2': lambda : self.tryOperation(self.conn2, "SELECT i FROM a WHERE i = 1;"), + 'ry2': lambda : self.tryOperation(self.conn2, "SELECT a_id FROM b WHERE a_id = 1;"), + 'wx2': lambda : self.tryOperation(self.conn2, "DELETE FROM a WHERE i = 1;"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS a, b;", "DROP TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE a (i int PRIMARY KEY);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE b (a_id int);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "INSERT INTO a VALUES (1);", "INSERT 0 1") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + print stepIdList, + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( stepIdList.index('c1') < stepIdList.index('rx2') + or stepIdList.index('c2') < stepIdList.index('rx1')) + + def 
rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class RITriggerTest(DatabasePermutationTest): + """ Referential integrity trigger test. + """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "referential integrity trigger test" + + stepThreading = [['wxry1','c1'],['r2','wyrx2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'wxry1': lambda : self.tryOperation(self.conn1, "INSERT INTO child (parent_id) VALUES (0);"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'r2': lambda : self.tryOperation(self.conn2, "SELECT TRUE;"), + 'wyrx2': lambda : self.tryOperation(self.conn2, "DELETE FROM parent WHERE parent_id = 0;"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS parent, child;", "DROP TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE parent (parent_id SERIAL NOT NULL PRIMARY KEY);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE child (child_id SERIAL NOT NULL PRIMARY KEY, parent_id INTEGER NOT NULL);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "CREATE OR REPLACE FUNCTION ri_parent() RETURNS TRIGGER AS $body$\ + BEGIN\ + PERFORM TRUE FROM child WHERE parent_id = OLD.parent_id;\ + IF FOUND THEN\ + RAISE SQLSTATE '23503' USING MESSAGE = 'Parent ' || OLD.parent_id || ' still referenced during ' || TG_OP;\ + END IF;\ + RETURN NULL;\ + END;\ + $body$ LANGUAGE PLPGSQL VOLATILE;", "CREATE FUNCTION") + self.syncCall(10, self.conn1.operation, "CREATE TRIGGER ri_parent AFTER UPDATE OR DELETE ON parent FOR EACH ROW EXECUTE PROCEDURE ri_parent();", "CREATE TRIGGER") + self.syncCall(10, self.conn1.operation, "CREATE OR REPLACE FUNCTION ri_child() RETURNS TRIGGER AS $body$\ + BEGIN\ + PERFORM TRUE FROM parent WHERE parent_id = NEW.parent_id;\ + IF NOT 
FOUND THEN\ + RAISE SQLSTATE '23503' USING MESSAGE = 'Parent ' || NEW.parent_id || ' does not exist during ' || TG_OP;\ + END IF;\ + RETURN NULL;\ + END;\ + $body$ LANGUAGE PLPGSQL VOLATILE;", "CREATE FUNCTION") + self.syncCall(10, self.conn1.operation, "CREATE TRIGGER ri_child AFTER INSERT OR UPDATE ON child FOR EACH ROW EXECUTE PROCEDURE ri_child();", "CREATE TRIGGER") + self.syncCall(10, self.conn1.operation, "INSERT INTO parent VALUES(0);", "INSERT 0 1") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + print stepIdList, + + # Override the normal method to allow failures generated by the trigger code + # to be considered "success". Just so we can count things up. + def tryOperation(self, conn, sql): + result = self.syncCall(10, conn.operation, sql), + for line in result: + if len(line) > 0 and line.startswith("ERROR: could not serialize"): + self.serializationFailure = True + else: + if (len(line) > 0 and line.startswith("ERROR:") + and len(line) > 0 and not line.startswith("ERROR: Parent 0 ")): + raise TestFailure("failure other than serializable encountered: " + line, line) + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( stepIdList.index('c1') < stepIdList.index('r2') + or stepIdList.index('c2') < stepIdList.index('wxry1')) + + def rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class TestTrueSerializabilityConcurrentUpdates(SyncTest): + """ Runs three transactions concurrently, each reading from what the + other writes in turn. Should raise a serialization failure, but + instead leads to wrong results, ATM. 
+ """ + + description = "concurrent updates" + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection'), + ('conn3', 'ISqlConnection')) + + def execOnAllConnections(self, sql, expRes=None): + deferreds = [] + for conn in self.connections: + d = conn.operation(sql, expRes) + deferreds.append(d) + + d = defer.DeferredList(deferreds, + consumeErrors=True, fireOnOneErrback=True) + return d + + def readValueThenWrite(self, conn, readFromId, writeToId): + d = conn.query("SELECT t FROM test WHERE i = %d;" % readFromId) + d.addCallback(self.writeValueBack, conn, writeToId) + return d + + def writeValueBack(self, result, conn, writeToId): + self.assertEqual(1, len(result), + "expected exactly one result row") + row = result[0] + self.assertEqual(1, len(row), + "expected exactly one column") + value = row['t'] + d = conn.operation("UPDATE test SET t = '%s' WHERE i = %d;" % (value, writeToId), + "UPDATE") + return d + + def startConcurrentOperations(self): + d1 = self.readValueThenWrite(self.conn1, readFromId=5, writeToId=7) + d2 = self.readValueThenWrite(self.conn2, readFromId=7, writeToId=11) + d3 = self.readValueThenWrite(self.conn3, readFromId=11, writeToId=5) + return defer.DeferredList([d1, d2, d3], + consumeErrors=True, fireOnOneErrback=True) + + def run(self): + try: + self.sub_run() + finally: + self.syncCall(10, self.execOnAllConnections, "ROLLBACK;") + + def sub_run(self): + self.connections = [ + self.conn1, + self.conn2, + self.conn3] + + # begin a transaction on all three connections + self.syncCall(10, self.execOnAllConnections, + "BEGIN;", "BEGIN") + + # set their isolation level to SERIALIZABLE + self.syncCall(10, self.execOnAllConnections, + "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "SET") + + # concurrently let each of the three transactions read a value and + # write that to another tuple, wait for all the UPDATEs to complete + # before trying to commit any of the transactions + self.syncCall(10, self.startConcurrentOperations) + + # 
try to commit all three transactions (accepting both COMMIT or
+ # ERROR, we check the result later on).
+ self.syncCall(10, self.execOnAllConnections,
+ "COMMIT;", "COMMIT|ERROR");
+
+ # count the occurrence of each fruit
+ result = self.syncCall(10, self.conn1.query,
+ "SELECT t FROM test WHERE i IN (5, 7, 11);")
+ counters = {'banana': 0, 'apple': 0, 'pear': 0}
+ for row in result:
+ counters[row['t']] += 1
+
+ # you currently get one fruit each, as no transaction gets aborted,
+ # which is impossible if the transactions had been executed one
+ # after another.
+ if counters.values() == [1, 1, 1]:
+ raise TestFailure("conflict not detected",
+ "All transactions committed, so the conflict hasn't been detected.")
+
+ class TestTrueSerializabilityConcurrentInsert(BaseTest):
+ """ Runs two transactions, both doing an insert, first, then select
+ all the relevant rows (within the range 100 <= i < 110). We let the
+ first transaction commit before creating the cyclic dependency,
+ which forces transaction 2 to abort.
+ """ + + description = "concurrent insert" + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + def execOnAllConnections(self, sql, expRes=None): + deferreds = [] + for conn in self.connections: + d = conn.operation(sql, expRes) + deferreds.append(d) + + d = defer.DeferredList(deferreds, + consumeErrors=True, fireOnOneErrback=True) + return d + + def run(self): + self.connections = [ + self.conn1, + self.conn2] + + # begin a transaction on all three connections + d = self.execOnAllConnections("BEGIN;", "BEGIN") + + # set their isolation level to SERIALIZABLE + d.addCallback(lambda x: + self.execOnAllConnections( + "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "SET")) + + # let transaction 1 do an insert (so it acquires a snapshot) + d.addCallback(lambda x: + self.conn1.operation( + "INSERT INTO test (i, t) VALUES (101, 'orange');", "INSERT 0 1")) + + # then same for transaction 2 + d.addCallback(lambda x: + self.conn2.operation( + "INSERT INTO test (i, t) VALUES (102, 'grapefruit');", "INSERT 0 1")) + + # let transaction 1 read the relevant rows, so it acquires an SIREAD + # lock on the predicate. (The result is discarded). + d.addCallback(lambda x: + self.conn2.query("SELECT t FROM test WHERE i >= 100 AND i < 110;")) + + # then commit transaction 1 (which should still succeed) + d.addCallback(lambda x: + self.conn1.operation( + "COMMIT;", "COMMIT")) + + # try to read all rows with the second transaction's snapshot (which + # doesn't see the update of transaction 1) + d.addCallback(lambda x: + self.conn2.query("SELECT t FROM test WHERE i >= 100 AND i < 110;")) + + # With SSI in place, this should lock the same predicate with an + # SIREAD lock, which should bomb out on the orange (tuple i = 101) + # from transaction 1. + # + # dtester FIXME: Hm.. 
this could need some "expect to fail" help + # from dtester + d.addCallback(self.checkResult) + + # cleanup both transactions, especially in case of failure + d.addBoth(self.cleanup) + + return d + + def checkResult(self, result): + if not isinstance(result, failure.Failure): + raise TestFailure("conflict not detected", + "SELECT should raise a serialization error") + return result + + def cleanup(self, result): + d = self.execOnAllConnections("ROLLBACK;") + + # ignore errors above, but instead make sure we return the result + # we got here, especially if it was an error. + d.addBoth(lambda x: result) + return d + + class TestTrueSerializabilityConcurrentInsert2(BaseTest): + """ Pretty similar to the above test, except that the first transaction + doesn't read (and thus predicate lock) the relevant rows. This still + leaves a possible serialization ordering, even if it doesn't match + the real commit ordering. + + Uses rows 200 <= i < 210 + """ + + description = "concurrent insert" + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + def execOnAllConnections(self, sql, expRes=None): + deferreds = [] + for conn in self.connections: + d = conn.operation(sql, expRes) + deferreds.append(d) + + d = defer.DeferredList(deferreds, + consumeErrors=True, fireOnOneErrback=True) + return d + + def run(self): + self.connections = [ + self.conn1, + self.conn2] + + # begin a transaction on all three connections + d = self.execOnAllConnections("BEGIN;", "BEGIN") + + # set their isolation level to SERIALIZABLE + d.addCallback(lambda x: + self.execOnAllConnections( + "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "SET")) + + # let transaction 1 do an insert (so it acquires a snapshot) + d.addCallback(lambda x: + self.conn1.operation( + "INSERT INTO test (i, t) VALUES (201, 'orange');", "INSERT 0 1")) + + # then same for transaction 2 + d.addCallback(lambda x: + self.conn2.operation( + "INSERT INTO test (i, t) VALUES (202, 'grapefruit');", "INSERT 0 1")) + + 
# no SELECT here, so transaction 1 doesn't acquire any SIREAD lock + + # then commit transaction 1 (which should succeed) + d.addCallback(lambda x: + self.conn1.operation( + "COMMIT;", "COMMIT")) + + # try to read all rows with the second transaction's snapshot (which + # doesn't see the update of transaction 1) + d.addCallback(lambda x: + self.conn2.query("SELECT t FROM test WHERE i >= 200 AND i < 210;")) + + # With SSI in place, this should lock the same predicate as abover + # with an SIREAD lock. This includes the row just written by the + # first transaction. + # + # As long as there are no other edges, this still leaves a possible + # serialization ordering: if we executed the second transaction + # *before* the first one, the second didn't see the 'orange' row + # inserted "later" by the first transaction. That's the result we + # expect. + d.addCallback(self.checkResult) + + # commit transaction 2 + d.addCallback(lambda x: + self.conn2.operation( + "COMMIT;", "COMMIT")) + + # add a cleanup handler + d.addErrback(self.cleanup) + + return d + + def checkResult(self, result): + self.assertEqual(len(result), 1, + "Expected exactly one row, got %d (%s)" % ( + len(result), repr(result))) + self.assertEqual(result[0], {"t": "grapefruit"}, + "Expected to read the grapefruit row, but got %s" % (result[0],)) + + return result + + def cleanup(self, result): + d = self.execOnAllConnections("ROLLBACK;") + + # ignore errors above, but instead make sure we return the result + # we got here, especially if it was an error. + d.addBoth(lambda x: result) + return d + + + # ****** test running code ************************************************ + + class Logger(object): + """ A simplistic logger that just writes it all into one single file. 
+ """ + def __init__(self, logFileName): + self.logfile = open(logFileName, 'w') + + def __del__(self): + self.logfile.close() + + def callback(self, event): + self.logfile.write(str(event) + "\n") + self.logfile.flush() + + def main(argv): + print "Postgres dtester suite Copyright (c) 2004-2010, by Markus Wanner\n" + + postgres_configure_args = "@configure_args@" + + config = { + 'temp-port': 65432, + + # by default, use the same installation directory as make check + 'inst_dir': os.path.join(os.getcwd(), 'tmp_check/install'), + + # and a similar prefix + 'pgdata_prefix': os.path.join(os.getcwd(), 'tmp_check/data-dtester'), + 'logfile' : os.path.join(os.getcwd(), 'dtester.log'), + + 'enable_cassert': 'enable_cassert' in postgres_configure_args + } + + try: + opts, args = getopt.getopt(argv, + "h", + ["help", "temp-install", "top-builddir=", "temp-port=", + "multibyte="]) + except getopt.GetoptError: + usage() + sys.exit(2) + + for opt, arg in opts: + if opt in ("-h", "--help"): + usage() + sys.exit() + elif opt in ("--temp-install"): + config["temp-install"] = True + elif opt in ("--temp-port"): + try: + arg = int(arg) + if arg >= 1024 and arg <= 65535: + config["temp-port"] = arg + else: + print "temp-port out of range." 
+ sys.exit(2) + except ValueError: + print "Fatal: invalid temp-port specified" + sys.exit(2) + elif opt in ("--top-builddir"): + config["top-builddir"] = arg + + + if not config.has_key('bindir'): + bindir = '@bindir@' + if bindir[0] == '/': + bindir = bindir[1:] + config['bindir'] = os.path.join(config['inst_dir'], bindir) + if not config.has_key('libdir'): + libdir = '@libdir@' + if libdir[0] == '/': + libdir = libdir[1:] + config['libdir'] = os.path.join(config['inst_dir'], libdir) + if not config.has_key('datadir'): + datadir = '@datadir@' + if datadir[0] == '/': + datadir = datadir[1:] + config['datadir'] = os.path.join(config['inst_dir'], datadir) + + + # FIXME: should not have to be here + logger = Logger(config['logfile']) + config['main_logging_hook'] = (EventMatcher(Event), logger.callback) + + + # definition of tests and suites, including their dependencies + tdef = { + # runs 'make install' to make sure the installation is up to date + 'temp_install': {'class': InstallationSuite, + 'uses': ('__system__',)}, + + # runs initdb, providing the Postgres data directory + 'initdb-0': {'class': InitdbSuite, + 'uses': ('temp_install',), + 'args': (0,)}, + + # runs a postmaster on the created database directory + 'pg-0': {'class': PostmasterSuite, + 'uses': ('temp_install', 'initdb-0')}, + + # creates a test database on pg-0 + 'testdb': {'class': TestDatabaseSuite, + 'uses': ('temp_install', 'pg-0'), + 'args': ('testdb',)}, + + # open two connections + 'conn-0A': {'class': SqlConnectionSuite, + 'uses': ('temp_install', 'pg-0'), + 'args': ('testdb',), + 'depends': ('testdb',)}, + 'conn-0B': {'class': SqlConnectionSuite, + 'uses': ('temp_install', 'pg-0'), + 'args': ('testdb',), + 'depends': ('testdb',)}, + 'conn-0C': {'class': SqlConnectionSuite, + 'uses': ('temp_install', 'pg-0'), + 'args': ('testdb',), + 'depends': ('testdb',)}, + + # test the connections + 'test-conn-0A': {'class': TestDatabaseConnection, + 'uses': ('conn-0A',)}, + 'test-conn-0B': {'class': 
TestDatabaseConnection, + 'uses': ('conn-0B',)}, + 'test-conn-0C': {'class': TestDatabaseConnection, + 'uses': ('conn-0C',)}, + + # 'dummy-recursion': {'class': DummyPermutationTest}, + + # populate the test database + 'populate-testdb': {'class': PopulateTestDatabase, + 'uses': ('conn-0A',), + 'onlyAfter': ('test-conn-0A', 'test-conn-0B', + 'test-conn-0C')}, + + 'simple-write-skew': {'class': SimpleWriteSkewTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('populate-testdb',), + 'xfail': True}, + + 'receipt-report': {'class': ReceiptReportTest, + 'uses': ('conn-0A', 'conn-0B', 'conn-0C'), + 'onlyAfter': ('simple-write-skew',), + 'xfail': True}, + + 'temporal-range': {'class': TemporalRangeIntegrityTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('receipt-report',), + 'xfail': True}, + + 'project-manager': {'class': ProjectManagerTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('temporal-range',), + 'xfail': True}, + + 'classroom-scheduling': {'class': ClassroomSchedulingTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('project-manager',), + 'xfail': True}, + + 'total-cash': {'class': TotalCashTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('classroom-scheduling',), + 'xfail': True}, + + 'referential-integrity': {'class': ReferentialIntegrityTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('total-cash',), + 'xfail': True}, + + 'ri-trigger': {'class': RITriggerTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('referential-integrity',), + 'xfail': True} + + # 'ser-updates': {'class': TestTrueSerializabilityConcurrentUpdates, + # 'uses': ('conn-0A', 'conn-0B', 'conn-0C'), + # 'onlyAfter': ('populate-testdb',), + # 'xfail': True}, + # + # 'ser-insert': {'class': TestTrueSerializabilityConcurrentInsert, + # 'uses': ('conn-0A', 'conn-0B'), + # 'onlyAfter': ('ser-updates',), + # 'xfail': True}, + # + # 'ser-insert2': {'class': TestTrueSerializabilityConcurrentInsert2, + # 'uses': ('conn-0A', 'conn-0B'), + # 'onlyAfter': 
('ser-insert',)} + } + + + runner = Runner(testTimeout=600, suiteTimeout=3600) + runner.run(tdef, config) + + + if __name__ == "__main__": + main(sys.argv[1:]) +