*** a/GNUmakefile.in --- b/GNUmakefile.in *************** *** 75,81 **** distclean maintainer-clean: check: all ! check installcheck installcheck-parallel: $(MAKE) -C src/test $@ installcheck-world: --- 75,81 ---- check: all ! check dcheck installcheck installcheck-parallel: $(MAKE) -C src/test $@ installcheck-world: *** a/src/backend/access/heap/heapam.c --- b/src/backend/access/heap/heapam.c *************** *** 57,62 **** --- 57,63 ---- #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" + #include "storage/predicate.h" #include "storage/procarray.h" #include "storage/smgr.h" #include "storage/standby.h" *************** *** 261,280 **** heapgetpage(HeapScanDesc scan, BlockNumber page) { if (ItemIdIsNormal(lpp)) { bool valid; if (all_visible) valid = true; else { - HeapTupleData loctup; - loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); loctup.t_len = ItemIdGetLength(lpp); ItemPointerSet(&(loctup.t_self), page, lineoff); valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); } if (valid) scan->rs_vistuples[ntup++] = lineoff; } --- 262,283 ---- { if (ItemIdIsNormal(lpp)) { + HeapTupleData loctup; bool valid; if (all_visible) valid = true; else { loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); loctup.t_len = ItemIdGetLength(lpp); ItemPointerSet(&(loctup.t_self), page, lineoff); valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); } + + CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, buffer); + if (valid) scan->rs_vistuples[ntup++] = lineoff; } *************** *** 468,479 **** heapgettup(HeapScanDesc scan, --- 471,485 ---- snapshot, scan->rs_cbuf); + CheckForSerializableConflictOut(valid, scan->rs_rd, tuple, scan->rs_cbuf); + if (valid && key != NULL) HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd), nkeys, key, valid); if (valid) { + PredicateLockTuple(scan->rs_rd, tuple); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); return; } *************** *** 741,752 **** 
heapgettup_pagemode(HeapScanDesc scan, --- 747,760 ---- nkeys, key, valid); if (valid) { + PredicateLockTuple(scan->rs_rd, tuple); scan->rs_cindex = lineindex; return; } } else { + PredicateLockTuple(scan->rs_rd, tuple); scan->rs_cindex = lineindex; return; } *************** *** 1460,1467 **** heap_fetch(Relation relation, --- 1468,1478 ---- LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + CheckForSerializableConflictOut(valid, relation, tuple, buffer); + if (valid) { + PredicateLockTuple(relation, tuple); /* * All checks passed, so return the tuple as valid. Caller is now * responsible for releasing the buffer. *************** *** 1505,1517 **** heap_fetch(Relation relation, * heap_fetch, we do not report any pgstats count; caller may do so if wanted. */ bool ! heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, ! bool *all_dead) { Page dp = (Page) BufferGetPage(buffer); TransactionId prev_xmax = InvalidTransactionId; OffsetNumber offnum; bool at_chain_start; if (all_dead) *all_dead = true; --- 1516,1530 ---- * heap_fetch, we do not report any pgstats count; caller may do so if wanted. */ bool ! heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, ! 
Snapshot snapshot, bool *all_dead) { Page dp = (Page) BufferGetPage(buffer); TransactionId prev_xmax = InvalidTransactionId; OffsetNumber offnum; bool at_chain_start; + bool valid; + bool match_found; if (all_dead) *all_dead = true; *************** *** 1521,1526 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, --- 1534,1540 ---- Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer)); offnum = ItemPointerGetOffsetNumber(tid); at_chain_start = true; + match_found = false; /* Scan through possible multiple members of HOT-chain */ for (;;) *************** *** 1551,1556 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, --- 1565,1572 ---- heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp); heapTuple.t_len = ItemIdGetLength(lp); + heapTuple.t_tableOid = relation->rd_id; + heapTuple.t_self = *tid; /* * Shouldn't see a HEAP_ONLY tuple at chain start. *************** *** 1568,1579 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, break; /* If it's visible per the snapshot, we must return it */ ! if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer)) { ItemPointerSetOffsetNumber(tid, offnum); if (all_dead) *all_dead = false; ! return true; } /* --- 1584,1601 ---- break; /* If it's visible per the snapshot, we must return it */ ! valid = HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer); ! CheckForSerializableConflictOut(valid, relation, &heapTuple, buffer); ! if (valid) { ItemPointerSetOffsetNumber(tid, offnum); + PredicateLockTuple(relation, &heapTuple); if (all_dead) *all_dead = false; ! if (IsXactIsoLevelFullySerializable) ! match_found = true; ! else ! return true; } /* *************** *** 1602,1608 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, break; /* end of chain */ } ! return false; } /* --- 1624,1630 ---- break; /* end of chain */ } ! 
return match_found; } /* *************** *** 1621,1627 **** heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); LockBuffer(buffer, BUFFER_LOCK_SHARE); ! result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); return result; --- 1643,1649 ---- buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); LockBuffer(buffer, BUFFER_LOCK_SHARE); ! result = heap_hot_search_buffer(tid, relation, buffer, snapshot, all_dead); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); return result; *************** *** 1728,1735 **** heap_get_latest_tid(Relation relation, --- 1750,1760 ---- * result candidate. */ valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer); + CheckForSerializableConflictOut(valid, relation, &tp, buffer); if (valid) + { *tid = ctid; + } /* * If there's a valid t_ctid link, follow it, else we're done. *************** *** 1892,1897 **** heap_insert(Relation relation, HeapTuple tup, CommandId cid, --- 1917,1929 ---- buffer = RelationGetBufferForTuple(relation, heaptup->t_len, InvalidBuffer, options, bistate); + /* + * We're about to do the actual insert -- check for conflict at the + * relation or buffer level first, to avoid possibly having to roll + * back work we've just done. + */ + CheckForSerializableConflictIn(relation, NULL, buffer); + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); *************** *** 2192,2197 **** l1: --- 2224,2235 ---- return result; } + /* + * We're about to do the actual delete -- check for conflict first, + * to avoid possibly having to roll back work we've just done. 
+ */ + CheckForSerializableConflictIn(relation, &tp, buffer); + /* replace cid with a combo cid if necessary */ HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo); *************** *** 2545,2550 **** l2: --- 2583,2594 ---- return result; } + /* + * We're about to do the actual update -- check for conflict first, + * to avoid possibly having to roll back work we've just done. + */ + CheckForSerializableConflictIn(relation, &oldtup, buffer); + /* Fill in OID and transaction status data for newtup */ if (relation->rd_rel->relhasoids) { *************** *** 2690,2695 **** l2: --- 2734,2749 ---- } /* + * We're about to create the new tuple -- check for conflict first, + * to avoid possibly having to roll back work we've just done. + * + * NOTE: For a tuple insert, we only need to check for table locks, since + * predicate locking at the index level will cover ranges for anything + * except a table scan. Therefore, only provide the relation. + */ + CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); + + /* * At this point newbuf and buffer are both pinned and locked, and newbuf * has enough space for the new tuple. If they are the same buffer, only * one pin is held. *************** *** 2829,2834 **** l2: --- 2883,2894 ---- CacheInvalidateHeapTuple(relation, heaptup); /* + * TODO SSI: In order to support SIREAD locks at tuple granularity, any + * existing SIREAD locks on the old tuple must be copied to + * also refer to the new tuple, somewhere around this point? + */ + + /* * Release the lmgr tuple lock, if we had it. 
*/ if (have_tuple_lock) *** a/src/backend/access/index/indexam.c --- b/src/backend/access/index/indexam.c *************** *** 64,72 **** --- 64,74 ---- #include "access/relscan.h" #include "access/transam.h" + #include "access/xact.h" #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" + #include "storage/predicate.h" #include "utils/relcache.h" #include "utils/snapmgr.h" #include "utils/tqual.h" *************** *** 192,197 **** index_insert(Relation indexRelation, --- 194,204 ---- RELATION_CHECKS; GET_REL_PROCEDURE(aminsert); + if (!(indexRelation->rd_am->ampredlocks)) + CheckForSerializableConflictIn(indexRelation, + (HeapTuple) NULL, + InvalidBuffer); + /* * have the am's insert proc do all the work. */ *************** *** 266,271 **** index_beginscan_internal(Relation indexRelation, --- 273,281 ---- RELATION_CHECKS; GET_REL_PROCEDURE(ambeginscan); + if (!(indexRelation->rd_am->ampredlocks)) + PredicateLockRelation(indexRelation); + /* * We hold a reference count to the relcache entry throughout the scan. */ *************** *** 515,520 **** index_getnext(IndexScanDesc scan, ScanDirection direction) --- 525,531 ---- { ItemId lp; ItemPointer ctid; + bool valid; /* check for bogus TID */ if (offnum < FirstOffsetNumber || *************** *** 569,576 **** index_getnext(IndexScanDesc scan, ScanDirection direction) break; /* If it's visible per the snapshot, we must return it */ ! if (HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot, ! scan->xs_cbuf)) { /* * If the snapshot is MVCC, we know that it could accept at --- 580,592 ---- break; /* If it's visible per the snapshot, we must return it */ ! valid = HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot, ! scan->xs_cbuf); ! ! CheckForSerializableConflictOut(valid, scan->heapRelation, ! heapTuple, scan->xs_cbuf); ! ! 
if (valid) { /* * If the snapshot is MVCC, we know that it could accept at *************** *** 578,584 **** index_getnext(IndexScanDesc scan, ScanDirection direction) * any more members. Otherwise, check for continuation of the * HOT-chain, and set state for next time. */ ! if (IsMVCCSnapshot(scan->xs_snapshot)) scan->xs_next_hot = InvalidOffsetNumber; else if (HeapTupleIsHotUpdated(heapTuple)) { --- 594,601 ---- * any more members. Otherwise, check for continuation of the * HOT-chain, and set state for next time. */ ! if (IsMVCCSnapshot(scan->xs_snapshot) ! && !IsXactIsoLevelFullySerializable) scan->xs_next_hot = InvalidOffsetNumber; else if (HeapTupleIsHotUpdated(heapTuple)) { *************** *** 594,599 **** index_getnext(IndexScanDesc scan, ScanDirection direction) --- 611,618 ---- pgstat_count_heap_fetch(scan->indexRelation); + PredicateLockTuple(scan->heapRelation, heapTuple); + return heapTuple; } *** a/src/backend/access/nbtree/nbtinsert.c --- b/src/backend/access/nbtree/nbtinsert.c *************** *** 21,26 **** --- 21,27 ---- #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" + #include "storage/predicate.h" #include "utils/inval.h" #include "utils/tqual.h" *************** *** 175,180 **** top: --- 176,189 ---- if (checkUnique != UNIQUE_CHECK_EXISTING) { + /* + * The only conflict predicate locking cares about for indexes is when + * an index tuple insert conflicts with an existing lock. Since the + * actual location of the insert is hard to predict because of the + * random search used to prevent O(N^2) performance when there are many + * duplicate entries, we can just use the "first valid" page. 
+ */ + CheckForSerializableConflictIn(rel, NULL, buf); /* do the insertion */ _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel); _bt_insertonpg(rel, buf, stack, itup, offset, false); *************** *** 697,702 **** _bt_insertonpg(Relation rel, --- 706,714 ---- /* split the buffer into left and right halves */ rbuf = _bt_split(rel, buf, firstright, newitemoff, itemsz, itup, newitemonleft); + PredicateLockPageSplit(rel, + BufferGetBlockNumber(buf), + BufferGetBlockNumber(rbuf)); /*---------- * By here, *** a/src/backend/access/nbtree/nbtpage.c --- b/src/backend/access/nbtree/nbtpage.c *************** *** 1177,1182 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack) --- 1177,1188 ---- rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); /* + * Any insert which would have gone on the target block will now go to the + * right sibling block. + */ + PredicateLockPageCombine(rel, target, rightsib); + + /* * Next find and write-lock the current parent of the target page. This is * essentially the same as the corresponding step of splitting. */ *** a/src/backend/access/nbtree/nbtsearch.c --- b/src/backend/access/nbtree/nbtsearch.c *************** *** 21,26 **** --- 21,27 ---- #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" + #include "storage/predicate.h" #include "utils/lsyscache.h" #include "utils/rel.h" *************** *** 63,69 **** _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, --- 64,73 ---- /* If index is empty and access = BT_READ, no root page is created. */ if (!BufferIsValid(*bufP)) + { + PredicateLockRelation(rel); /* Nothing finer to lock exists. 
*/ return (BTStack) NULL; + } /* Loop iterates once per level descended in the tree */ for (;;) *************** *** 88,94 **** _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, --- 92,102 ---- page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISLEAF(opaque)) + { + if (access == BT_READ) + PredicateLockPage(rel, BufferGetBlockNumber(*bufP)); break; + } /* * Find the appropriate item on the internal page, and get the child *************** *** 199,204 **** _bt_moveright(Relation rel, --- 207,213 ---- elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); + PredicateLockPage(rel, BufferGetBlockNumber(buf)); return buf; } *************** *** 1142,1147 **** _bt_steppage(IndexScanDesc scan, ScanDirection dir) --- 1151,1157 ---- opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) { + PredicateLockPage(rel, blkno); /* see if there are any matches on this page */ /* note that this will clear moreRight if we can stop */ if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque))) *************** *** 1189,1194 **** _bt_steppage(IndexScanDesc scan, ScanDirection dir) --- 1199,1205 ---- opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) { + PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf)); /* see if there are any matches on this page */ /* note that this will clear moreLeft if we can stop */ if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page))) *************** *** 1352,1357 **** _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) --- 1363,1369 ---- if (!BufferIsValid(buf)) { /* empty index... */ + PredicateLockRelation(rel); /* Nothing finer to lock exists. */ return InvalidBuffer; } *************** *** 1431,1440 **** _bt_endpoint(IndexScanDesc scan, ScanDirection dir) --- 1443,1454 ---- if (!BufferIsValid(buf)) { /* empty index... */ + PredicateLockRelation(rel); /* Nothing finer to lock exists. 
*/ so->currPos.buf = InvalidBuffer; return false; } + PredicateLockPage(rel, BufferGetBlockNumber(buf)); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(P_ISLEAF(opaque)); *** a/src/backend/access/transam/xact.c --- b/src/backend/access/transam/xact.c *************** *** 39,44 **** --- 39,45 ---- #include "storage/bufmgr.h" #include "storage/fd.h" #include "storage/lmgr.h" + #include "storage/predicate.h" #include "storage/procarray.h" #include "storage/sinvaladt.h" #include "storage/smgr.h" *************** *** 1754,1759 **** CommitTransaction(void) --- 1755,1767 ---- AtEOXact_LargeObject(true); /* + * Mark serializable transaction as complete for predicate locking + * purposes. This should be done as late as we can put it and still + * allow errors to be raised for failure patterns found at commit. + */ + PreCommit_CheckForSerializationFailure(); + + /* * Insert notifications sent by NOTIFY commands into the queue. This * should be late in the pre-commit sequence to minimize time spent * holding the notify-insertion lock. *** a/src/backend/catalog/index.c --- b/src/backend/catalog/index.c *************** *** 2044,2050 **** IndexCheckExclusion(Relation heapRelation, * * After completing validate_index(), we wait until all transactions that * were alive at the time of the reference snapshot are gone; this is ! * necessary to be sure there are none left with a serializable snapshot * older than the reference (and hence possibly able to see tuples we did * not index). Then we mark the index "indisvalid" and commit. Subsequent * transactions will be able to use it for queries. --- 2044,2050 ---- * * After completing validate_index(), we wait until all transactions that * were alive at the time of the reference snapshot are gone; this is ! * necessary to be sure there are none left with a transaction-based snapshot * older than the reference (and hence possibly able to see tuples we did * not index). 
Then we mark the index "indisvalid" and commit. Subsequent * transactions will be able to use it for queries. *** a/src/backend/commands/trigger.c --- b/src/backend/commands/trigger.c *************** *** 2360,2366 **** ltrmark:; case HeapTupleUpdated: ReleaseBuffer(buffer); ! if (IsXactIsoLevelSerializable) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); --- 2360,2366 ---- case HeapTupleUpdated: ReleaseBuffer(buffer); ! if (IsXactIsoLevelXactSnapshotBased) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); *** a/src/backend/executor/execMain.c --- b/src/backend/executor/execMain.c *************** *** 1544,1550 **** EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, case HeapTupleUpdated: ReleaseBuffer(buffer); ! if (IsXactIsoLevelSerializable) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); --- 1544,1550 ---- case HeapTupleUpdated: ReleaseBuffer(buffer); ! if (IsXactIsoLevelXactSnapshotBased) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); *** a/src/backend/executor/nodeBitmapHeapscan.c --- b/src/backend/executor/nodeBitmapHeapscan.c *************** *** 42,47 **** --- 42,48 ---- #include "executor/nodeBitmapHeapscan.h" #include "pgstat.h" #include "storage/bufmgr.h" + #include "storage/predicate.h" #include "utils/memutils.h" #include "utils/snapmgr.h" #include "utils/tqual.h" *************** *** 351,357 **** bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) ItemPointerData tid; ItemPointerSet(&tid, page, offnum); ! if (heap_hot_search_buffer(&tid, buffer, snapshot, NULL)) scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); } } --- 352,358 ---- ItemPointerData tid; ItemPointerSet(&tid, page, offnum); ! 
if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, NULL)) scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); } } *** a/src/backend/executor/nodeIndexscan.c --- b/src/backend/executor/nodeIndexscan.c *************** *** 30,35 **** --- 30,36 ---- #include "executor/execdebug.h" #include "executor/nodeIndexscan.h" #include "optimizer/clauses.h" + #include "storage/predicate.h" #include "utils/array.h" #include "utils/lsyscache.h" #include "utils/memutils.h" *** a/src/backend/executor/nodeLockRows.c --- b/src/backend/executor/nodeLockRows.c *************** *** 130,136 **** lnext: break; case HeapTupleUpdated: ! if (IsXactIsoLevelSerializable) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); --- 130,136 ---- break; case HeapTupleUpdated: ! if (IsXactIsoLevelXactSnapshotBased) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); *** a/src/backend/executor/nodeModifyTable.c --- b/src/backend/executor/nodeModifyTable.c *************** *** 328,334 **** ldelete:; break; case HeapTupleUpdated: ! if (IsXactIsoLevelSerializable) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); --- 328,334 ---- break; case HeapTupleUpdated: ! if (IsXactIsoLevelXactSnapshotBased) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); *************** *** 516,522 **** lreplace:; break; case HeapTupleUpdated: ! if (IsXactIsoLevelSerializable) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); --- 516,522 ---- break; case HeapTupleUpdated: ! 
if (IsXactIsoLevelXactSnapshotBased) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); *** a/src/backend/executor/nodeSeqscan.c --- b/src/backend/executor/nodeSeqscan.c *************** *** 28,33 **** --- 28,34 ---- #include "access/relscan.h" #include "executor/execdebug.h" #include "executor/nodeSeqscan.h" + #include "storage/predicate.h" static void InitScanRelation(SeqScanState *node, EState *estate); static TupleTableSlot *SeqNext(SeqScanState *node); *************** *** 105,115 **** SeqRecheck(SeqScanState *node, TupleTableSlot *slot) --- 106,118 ---- * tuple. * We call the ExecScan() routine and pass it the appropriate * access method functions. + * For serializable transactions, we first lock the entire relation. * ---------------------------------------------------------------- */ TupleTableSlot * ExecSeqScan(SeqScanState *node) { + PredicateLockRelation(node->ss_currentRelation); return ExecScan((ScanState *) node, (ExecScanAccessMtd) SeqNext, (ExecScanRecheckMtd) SeqRecheck); *** a/src/backend/executor/nodeTidscan.c --- b/src/backend/executor/nodeTidscan.c *************** *** 31,36 **** --- 31,37 ---- #include "executor/nodeTidscan.h" #include "optimizer/clauses.h" #include "storage/bufmgr.h" + #include "storage/predicate.h" #include "utils/array.h" *** a/src/backend/storage/ipc/ipci.c --- b/src/backend/storage/ipc/ipci.c *************** *** 105,110 **** CreateSharedMemoryAndSemaphores(bool makePrivate, int port) --- 105,111 ---- sizeof(ShmemIndexEnt))); size = add_size(size, BufferShmemSize()); size = add_size(size, LockShmemSize()); + size = add_size(size, PredicateLockShmemSize()); size = add_size(size, ProcGlobalShmemSize()); size = add_size(size, XLOGShmemSize()); size = add_size(size, CLOGShmemSize()); *************** *** 200,205 **** CreateSharedMemoryAndSemaphores(bool makePrivate, int port) --- 201,211 ---- InitLocks(); /* + * Set up predicate lock manager + */ + 
InitPredicateLocks(); + + /* * Set up process table */ if (!IsUnderPostmaster) *** a/src/backend/storage/ipc/shmqueue.c --- b/src/backend/storage/ipc/shmqueue.c *************** *** 43,56 **** SHMQueueInit(SHM_QUEUE *queue) * SHMQueueIsDetached -- TRUE if element is not currently * in a queue. */ - #ifdef NOT_USED bool SHMQueueIsDetached(SHM_QUEUE *queue) { Assert(ShmemAddrIsValid(queue)); return (queue->prev == NULL); } - #endif /* * SHMQueueElemInit -- clear an element's links --- 43,54 ---- *** a/src/backend/storage/lmgr/Makefile --- b/src/backend/storage/lmgr/Makefile *************** *** 12,18 **** subdir = src/backend/storage/lmgr top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global ! OBJS = lmgr.o lock.o proc.o deadlock.o lwlock.o spin.o s_lock.o include $(top_srcdir)/src/backend/common.mk --- 12,18 ---- top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global ! OBJS = lmgr.o lock.o proc.o deadlock.o lwlock.o spin.o s_lock.o predicate.o include $(top_srcdir)/src/backend/common.mk *** /dev/null --- b/src/backend/storage/lmgr/predicate.c *************** *** 0 **** --- 1,2418 ---- + /*------------------------------------------------------------------------- + * + * predicate.c + * POSTGRES predicate locking + * to support full serializable transaction isolation + * + * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD + * locks, which are so different from normal locks that a distinct set of + * structures is required to handle them. + * + * (1) Besides tuples actually read, they must cover ranges of tuples + * which would have been read based on the predicate. This will + * require modelling the predicates through locks against database + * objects such as pages, index ranges, or entire tables. + * + * (2) They must be kept in RAM for quick access. 
Because of this, it + * isn't possible to always maintain tuple-level granularity -- when + * the space allocated to store these approaches exhaustion, a + * request for a lock may need to scan for situations where a single + * transaction holds many fine-grained locks which can be coalesced + * into a single coarser-grained lock. + * + * (3) They never block anything; they are more like flags than locks + * in that regard; although they refer to database objects and are + * used to identify rw-conflicts with normal write locks. + * + * (4) While they are associated with a transaction, they must survive + * a successful COMMIT of that transaction, and remain until all + * overlapping transactions complete. This even means that they + * must survive termination of the transaction's process. On a + * rollback of the top level transaction, all of that transaction's + * SIREAD locks should be released, however. + * + * (5) The only transactions which create SIREAD locks or check for + * conflicts with them are serializable transactions. + * + * (6) When a write lock for a top level transaction is found to cover + * an existing SIREAD lock for the same transaction, the SIREAD lock + * can be deleted. + * + * (7) A write from a serializable transaction must ensure that a xact + * record exists for the transaction, with the same lifespan (until + * all concurrent transaction complete or the transaction is rolled + * back) so that rw-dependencies to that transaction can be + * detected. + * + * + * Lightweight locks to manage access to the predicate locking shared + * memory objects must be taken in this order, and should be released in + * reverse order: + * + * SerializableFinishedListLock + * - Protects the list of transaction which have completed but which + * may yet matter because they overlap still-active transactions. 
+ * + * SerializablePredicateLockListLock + * - Special handling: use shared mode for walking the list *and* + * for modifying the list from the process running the owning + * transaction. No other process is allowed to walk the list, + * and any other process must acquire exclusive access to modify + * it. Once a transaction has completed, it is the holder of + * the SerializableFinishedListLock who can walk the list in + * shared mode. + * + * FirstPredicateLockMgrLock based partition locks + * - The same lock protects a target and all locks on that target. + * - When more than one is needed, acquire in ascending order. + * + * SerializableXactHashLock + * - Protects both SerializableXactHash and SerializableXidHash. + * + * + * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + /* + * INTERFACE ROUTINES + * + * housekeeping for setting up shared memory predicate lock structures + * InitPredicateLocks(void) + * PredicateLockShmemSize(void) + * + * predicate lock reporting + * PredicateLockData *GetPredicateLockStatusData(void) + * + * predicate lock maintenance + * RegisterSerializableTransaction(Snapshot snapshot) + * PredicateLockRelation(Relation relation) + * PredicateLockPage(Relation relation, BlockNumber blkno) + * PredicateLockTuple(Relation relation, HeapTuple tuple) + * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, + * BlockNumber newblkno); + * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, + * BlockNumber newblkno); + * ReleasePredicateLocks(bool isCommit) + * + * conflict detection (may also trigger rollback) + * CheckForSerializableConflictOut(bool valid, Relation relation, + * HeapTupleData *tup, Buffer buffer) + * CheckForSerializableConflictIn(Relation relation, HeapTupleData *tup, + * 
Buffer buffer) + * + * final rollback checking + * PreCommit_CheckForSerializationFailure(void) + */ + + #include "postgres.h" + + #include "access/transam.h" + #include "access/twophase.h" + #include "access/xact.h" + #include "miscadmin.h" + #include "storage/bufmgr.h" + #include "storage/predicate.h" + #include "utils/rel.h" + #include "utils/snapmgr.h" + + /* + * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable + * transaction or any of its subtransactions. + */ + typedef struct SERIALIZABLEXIDTAG + { + TransactionId xid; + } SERIALIZABLEXIDTAG; + + /* + * Information to link between an xid list and a top level serializable + * transaction. + */ + typedef struct SERIALIZABLEXID + { + /* hash key */ + SERIALIZABLEXIDTAG tag; + + /* data */ + SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */ + SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of + * xids */ + } SERIALIZABLEXID; + + /* + * Per-locked-object predicate lock information: + * + * tag -- uniquely identifies the object being locked + * predicateLocks -- list of predicate lock objects for this target. + */ + typedef struct PREDICATELOCKTARGET + { + /* hash key */ + PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ + + /* data */ + SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. 
with + * predicate lock target */ + } PREDICATELOCKTARGET; + + typedef struct PREDICATELOCKTAG + { + PREDICATELOCKTARGET *myTarget; + SERIALIZABLEXACT *myXact; + } PREDICATELOCKTAG; + + typedef struct PREDICATELOCK + { + /* hash key */ + PREDICATELOCKTAG tag; /* unique identifier of lockable object */ + + /* data */ + SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of + * predicate locks */ + SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of + * predicate locks */ + } PREDICATELOCK; + + /* + * Backend-local hash table of ancestor (coarser) locks and the number + * of (finer-grained) children locks that are currently held. This is + * used to determine when to promote multiple fine-grained locks to + * one coarse-grained lock. + */ + typedef struct LOCALPREDICATELOCK + { + /* hash key */ + PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ + + /* data */ + bool held; /* is lock held, or just its children? */ + int childLocks; /* number of child locks currently held */ + } LOCALPREDICATELOCK; + static HTAB *LocalPredicateLockHash = NULL; + + + /* + * Test the most selective fields first, for performance. 
+ * + * a is covered by b if all of the following hold: + * 1) a.database = b.database + * 2) a.relation = b.relation + * 3) b.offset is invalid (b is page-granularity or higher) + * 4) either of the following: + * 4a) a.offset is valid (a is tuple-granularity) and a.page = b.page + * or 4b) a.offset is invalid and b.page is invalid (a is + * page-granularity and b is relation-granularity + */ + #define TargetTagIsCoveredBy(covered_target, covering_target) \ + ((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */ \ + GET_PREDICATELOCKTARGETTAG_RELATION(covering_target)) \ + && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) == \ + InvalidOffsetNumber) /* (3) */ \ + && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) != \ + InvalidOffsetNumber) /* (4a) */ \ + && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \ + GET_PREDICATELOCKTARGETTAG_PAGE(covered_target))) \ + || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \ + InvalidBlockNumber) /* (4b) */ \ + && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target) \ + != InvalidBlockNumber))) \ + && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) == /* (1) */ \ + GET_PREDICATELOCKTARGETTAG_DB(covering_target))) + + /* + * The predicate locking target and lock shared hash tables are partitioned to + * reduce contention. To determine which partition a given target belongs to, + * compute the tag's hash code with PredicateLockTargetTagHashCode(), then + * apply one of these macros. + * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2! 
+ */ + #define PredicateLockHashPartition(hashcode) \ + ((hashcode) % NUM_PREDICATELOCK_PARTITIONS) + #define PredicateLockHashPartitionLock(hashcode) \ + ((LWLockId) (FirstPredicateLockMgrLock + PredicateLockHashPartition(hashcode))) + + #define NPREDICATELOCKTARGETENTS() \ + mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts)) + + #define SxactIsOnFinishedList(sxact) (!SHMQueueIsDetached(&((sxact)->finishedLink))) + + #define SxactIsCommitted(sxact) TransactionIdIsValid((sxact)->finishedBefore) + #define SxactCommittedBefore(sxactPivotOut, sxactOther) \ + ((!TransactionIdIsValid((sxactOther)->finishedBefore)) \ + || TransactionIdPrecedesOrEquals((sxactPivotOut)->finishedBefore, \ + (sxactOther)->finishedBefore)) + + /* + * When a public interface method is called for a split on an index relation, + * this is the test to see if we should do a quick return. + */ + #define SkipSplitTracking(relation) \ + (((relation)->rd_id < FirstBootstrapObjectId) \ + || ((relation)->rd_istemp)) + + /* + * When a public interface method is called for serializing a relation within + * the current transaction, this is the test to see if we should do a quick + * return. + */ + #define SkipSerialization(relation) \ + ((!IsXactIsoLevelFullySerializable) \ + || SkipSplitTracking(relation)) + + + /* + * Compute the hash code associated with a PREDICATELOCKTARGETTAG. + * + * To avoid unnecessary recomputations of the hash code, we try to do this + * just once per function, and then pass it around as needed. Aside from + * passing the hashcode to hash_search_with_hash_value(), we can extract + * the lock partition number from the hashcode. + */ + #define PredicateLockTargetTagHashCode(predicatelocktargettag) \ + (tag_hash((predicatelocktargettag), sizeof(PREDICATELOCKTARGETTAG))) + + /* + * Given a predicate lock tag, and the hash for its target, + * compute the lock hash. 
+ * + * To make the hash code also depend on the transaction, we xor the sxid + * struct's address into the hash code, left-shifted so that the + * partition-number bits don't change. Since this is only a hash, we + * don't care if we lose high-order bits of the address; use an + * intermediate variable to suppress cast-pointer-to-int warnings. + */ + #define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \ + ((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) \ + << LOG2_NUM_PREDICATELOCK_PARTITIONS) + + + /* This configuration variable is used to set the predicate lock table size */ + int max_predicate_locks_per_xact; /* set by guc.c */ + + /* + * These global variables are maintained when registering and cleaning up + * serializable transactions. They must be global across all backends, but + * are not needed outside this source file, so no .h declaration is needed. + */ + TransactionId SerializableGlobalXmin = InvalidTransactionId; + int SerializableGlobalXminCount = 0; + + /* + * The predicate locking hash tables are in shared memory. + * Each backend keeps pointers to them. + */ + static HTAB *SerializableXactHash; + static HTAB *SerializableXidHash; + static HTAB *PredicateLockTargetHash; + static HTAB *PredicateLockHash; + static SHM_QUEUE *FinishedSerializableTransactions; + + /* + * Keep a pointer to the currently-running serializable transaction (if any) + * for quick reference. + */ + typedef SERIALIZABLEXACT *SERIALIZABLEXACTPtr; + + #define InvalidSerializableXact ((SERIALIZABLEXACTPtr) NULL) + static volatile SERIALIZABLEXACT *MySerializableXact = InvalidSerializableXact; + + /* TODO SSI: Remove volatile qualifier and the then-unnecessary casts? */ + + /* The most recently used xid within this transaction, for optimizations. 
*/ + static TransactionId MyXid = InvalidTransactionId; + + + /* local functions */ + static uint32 predicatelock_hash(const void *key, Size keysize); + static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact); + static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *newtargettag); + static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag); + static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *targettag); + static int PredicateLockPromotionThreshold(const PREDICATELOCKTARGETTAG *tag); + static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag, + PREDICATELOCKTARGETTAG *parent); + static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag); + static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *tag); + static void EnsureMySerializableXidExists(void); + static void ClearOldPredicateLocks(void); + static bool XidIsConcurrent(TransactionId xid); + static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer); + static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag); + static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, + const SERIALIZABLEXACT *writer); + + /* + * InitPredicateLocks -- Initialize the predicate locking data structures. + * + * This is called from CreateSharedMemoryAndSemaphores(), which see for + * more comments. In the normal postmaster case, the shared hash tables + * are created here. Backends inherit the pointers + * to the shared tables via fork(). In the EXEC_BACKEND case, each + * backend re-executes this code to obtain pointers to the already existing + * shared hash tables. + */ + void + InitPredicateLocks(void) + { + HASHCTL info; + int hash_flags; + long init_table_size, + max_table_size; + bool found; + + /* + * Compute init/max size to request for predicate lock target hashtable. + * Note these calculations must agree with PredicateLockShmemSize! 
+ */ + max_table_size = NPREDICATELOCKTARGETENTS(); + init_table_size = max_table_size / 2; + + /* + * Allocate hash table for PREDICATELOCKTARGET structs. This stores + * per-predicate-lock-target information. + */ + MemSet(&info, 0, sizeof(info)); + info.keysize = sizeof(PREDICATELOCKTARGETTAG); + info.entrysize = sizeof(PREDICATELOCKTARGET); + info.hash = tag_hash; + info.num_partitions = NUM_PREDICATELOCK_PARTITIONS; + hash_flags = (HASH_ELEM | HASH_FUNCTION | HASH_PARTITION); + + PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash", + init_table_size, + max_table_size, + &info, + hash_flags); + + /* Assume an average of 2 xacts per target */ + max_table_size *= 2; + init_table_size *= 2; + + /* + * Allocate hash table for PREDICATELOCK structs. This stores per + * xact-lock-of-a-target information. + */ + MemSet(&info, 0, sizeof(info)); + info.keysize = sizeof(PREDICATELOCKTAG); + info.entrysize = sizeof(PREDICATELOCK); + info.hash = predicatelock_hash; + info.num_partitions = NUM_PREDICATELOCK_PARTITIONS; + hash_flags = (HASH_ELEM | HASH_FUNCTION | HASH_PARTITION); + + PredicateLockHash = ShmemInitHash("PREDICATELOCK hash", + init_table_size, + max_table_size, + &info, + hash_flags); + + /* + * Compute init/max size to request for serializable transaction + * hashtable. Note these calculations must agree with + * PredicateLockShmemSize! + */ + max_table_size = MaxBackends; + init_table_size = max_table_size / 2; + + /* + * Allocate hash table for SERIALIZABLEXACT structs. This stores per-vxid + * information for serializable transactions which have accessed data. + */ + MemSet(&info, 0, sizeof(info)); + info.keysize = sizeof(SERIALIZABLEXACTTAG); + info.entrysize = sizeof(SERIALIZABLEXACT); + info.hash = tag_hash; + hash_flags = (HASH_ELEM | HASH_FUNCTION); + + SerializableXactHash = ShmemInitHash("SERIALIZABLEXACT hash", + init_table_size, + max_table_size, + &info, + hash_flags); + + /* Assume an average of 10 serializable xids per backend. 
*/ + max_table_size *= 10; + init_table_size *= 10; + + /* + * Allocate hash table for SERIALIZABLEXID structs. This stores per-xid + * information for serializable transactions which have accessed data. + */ + MemSet(&info, 0, sizeof(info)); + info.keysize = sizeof(SERIALIZABLEXIDTAG); + info.entrysize = sizeof(SERIALIZABLEXID); + info.hash = tag_hash; + hash_flags = (HASH_ELEM | HASH_FUNCTION); + + SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash", + init_table_size, + max_table_size, + &info, + hash_flags); + + /* + * Create or attach to the header for the list of finished serializable + * transactions. + */ + FinishedSerializableTransactions = (SHM_QUEUE *) + ShmemInitStruct("FinishedSerializableTransactions", + sizeof(SHM_QUEUE), + &found); + if (!found) + SHMQueueInit(FinishedSerializableTransactions); + } + + /* + * Estimate shared-memory space used for predicate lock table + */ + Size + PredicateLockShmemSize(void) + { + Size size = 0; + long max_table_size; + + /* predicate lock target hash table */ + max_table_size = NPREDICATELOCKTARGETENTS(); + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(PREDICATELOCKTARGET))); + + /* predicate lock hash table */ + max_table_size *= 2; + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(PREDICATELOCK))); + + /* + * Since NPREDICATELOCKTARGETENTS is only an estimate, add 10% safety + * margin. + */ + size = add_size(size, size / 10); + + /* serializable transaction table */ + max_table_size = MaxBackends; + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(SERIALIZABLEXACT))); + + /* serializable subtransaction table */ + max_table_size *= 10; + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(SERIALIZABLEXID))); + + /* Head for list of serializable transactions. */ + size = add_size(size, sizeof(SHM_QUEUE)); + + return size; + } + + + /* + * Compute the hash code associated with a PREDICATELOCKTAG. 
+ * + * Because we want to use just one set of partition locks for both the + * PREDICATELOCKTARGET and PREDICATELOCK hash tables, we have to make sure + * that PREDICATELOCKs fall into the same partition number as their + * associated PREDICATELOCKTARGETs. dynahash.c expects the partition number + * to be the low-order bits of the hash code, and therefore a + * PREDICATELOCKTAG's hash code must have the same low-order bits as the + * associated PREDICATELOCKTARGETTAG's hash code. We achieve this with this + * specialized hash function. + */ + static uint32 + predicatelock_hash(const void *key, Size keysize) + { + const PREDICATELOCKTAG *predicatelocktag = (const PREDICATELOCKTAG *) key; + uint32 targethash; + + Assert(keysize == sizeof(PREDICATELOCKTAG)); + + /* Look into the associated target object, and compute its hash code */ + targethash = PredicateLockTargetTagHashCode(&predicatelocktag->myTarget->tag); + + return PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash); + } + + + /* + * GetPredicateLockStatusData + * Return a table containing the internal state of the predicate + * lock manager for use in pg_lock_status. + * + * Like GetLockStatusData, this function tries to hold the partition LWLocks + * for as short a time as possible by returning two arrays that simply + * contain the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT for each lock + * table entry. Multiple copies of the same PREDICATELOCKTARGETTAG and + * SERIALIZABLEXACT will likely appear. + */ + PredicateLockData * + GetPredicateLockStatusData(void) + { + PredicateLockData *data; + int i; + int els, + el; + HASH_SEQ_STATUS seqstat; + PREDICATELOCK *predlock; + + data = (PredicateLockData *) palloc(sizeof(PredicateLockData)); + + /* + * Acquire locks. To ensure consistency, take simultaneous locks on + * SerializableFinishedListLock, all partition locks in ascending order, + * then SerializableXactHashLock. TODO SSI: Do we really need to lock + * SerializableFinishedListLock? 
+ */ + LWLockAcquire(SerializableFinishedListLock, LW_SHARED); + for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++) + LWLockAcquire(FirstPredicateLockMgrLock + i, LW_SHARED); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + + /* Get number of locks and allocate appropriately-sized arrays. */ + els = hash_get_num_entries(PredicateLockHash); + data->nelements = els; + data->locktags = (PREDICATELOCKTARGETTAG *) + palloc(sizeof(PREDICATELOCKTARGETTAG) * els); + data->xacts = (SERIALIZABLEXACT *) + palloc(sizeof(SERIALIZABLEXACT) * els); + + + /* Scan through PredicateLockHash and copy contents */ + hash_seq_init(&seqstat, PredicateLockHash); + + el = 0; + + while ((predlock = (PREDICATELOCK *) hash_seq_search(&seqstat))) + { + data->locktags[el] = predlock->tag.myTarget->tag; + data->xacts[el] = *predlock->tag.myXact; + el++; + } + + Assert(el == els); + + /* Release locks in reverse order */ + LWLockRelease(SerializableXactHashLock); + for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--) + LWLockRelease(FirstPredicateLockMgrLock + i); + LWLockRelease(SerializableFinishedListLock); + + return data; + } + + + /* + * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact. + * It should be current for this process and be contained in + * SerializableXactHash. + */ + void + RegisterSerializableTransaction(const Snapshot snapshot) + { + PGPROC *proc; + SERIALIZABLEXACTTAG sxacttag; + SERIALIZABLEXACT *sxact; + bool found; + HASHCTL hash_ctl; + + /* We only do this for serializable transactions. Once. 
*/
+ Assert(IsXactIsoLevelFullySerializable);
+ Assert(MySerializableXact == InvalidSerializableXact);
+ 
+ proc = MyProc;
+ Assert(proc != NULL);
+ GET_VXID_FROM_PGPROC(sxacttag.vxid, *proc);
+ 
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ if (!TransactionIdIsValid(SerializableGlobalXmin))
+ {
+ Assert(SerializableGlobalXminCount == 0);
+ SerializableGlobalXmin = snapshot->xmin;
+ SerializableGlobalXminCount = 1;
+ }
+ else if (SerializableGlobalXmin == snapshot->xmin)
+ {
+ Assert(SerializableGlobalXminCount > 0);
+ SerializableGlobalXminCount++;
+ }
+ else
+ {
+ Assert(TransactionIdFollows(snapshot->xmin, SerializableGlobalXmin));
+ }
+ sxact = (SERIALIZABLEXACT *) hash_search(SerializableXactHash,
+ &sxacttag,
+ HASH_ENTER, &found);
+ Assert(!found);
+ if (!sxact)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ 
+ /* Initialize the structure. */
+ sxact->outConflict = InvalidSerializableXact;
+ sxact->inConflict = InvalidSerializableXact;
+ sxact->topXid = GetTopTransactionIdIfAny();
+ sxact->finishedBefore = InvalidTransactionId;
+ sxact->xmin = snapshot->xmin;
+ SHMQueueInit(&(sxact->predicateLocks));
+ SHMQueueInit(&(sxact->xids));
+ SHMQueueElemInit(&(sxact->finishedLink));
+ sxact->rolledBack = false;
+ LWLockRelease(SerializableXactHashLock);
+ 
+ MySerializableXact = sxact;
+ 
+ /* Initialize the backend-local hash table of parent locks */
+ Assert(LocalPredicateLockHash == NULL);
+ MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG);
+ hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK);
+ hash_ctl.hash = tag_hash;
+ LocalPredicateLockHash = hash_create("Local predicate lock",
+ max_predicate_locks_per_xact,
+ &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION);
+ }
+ 
+ /*
+ * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact. 
+ * It should be current for this process and be contained in + * SerializableXidHash. + */ + static void + EnsureMySerializableXidExists(void) + { + TransactionId xid; + + Assert(MySerializableXact != InvalidSerializableXact); + + MySerializableXact->topXid = GetTopTransactionIdIfAny(); + + /* + * If this isn't the xid we've most recently seen for this vxid, make sure + * it's in the hash table. + */ + xid = GetCurrentTransactionIdIfAny(); + if (MyXid != xid) + { + SERIALIZABLEXIDTAG sxidtag; + SERIALIZABLEXID *sxid; + bool found; + + Assert(TransactionIdIsValid(xid)); + + sxidtag.xid = xid; + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash, + &sxidtag, + HASH_ENTER, &found); + if (!sxid) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_predicate_locks_per_transaction."))); + + /* Initialize the structure. */ + if (!found) + { + sxid->myXact = (SERIALIZABLEXACT *) MySerializableXact; + SHMQueueInsertBefore(&(((SERIALIZABLEXACT *) MySerializableXact)->xids), + &(sxid->xactLink)); + } + LWLockRelease(SerializableXactHashLock); + MyXid = xid; + } + } + + + /* + * Check whether a particular lock is held by this transaction. + */ + static bool + PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag) + { + LOCALPREDICATELOCK *lock; + + /* check local hash table */ + lock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash, + targettag, + HASH_FIND, NULL); + + if (!lock) + return false; + + /* + * Found entry in the table, but still need to check whether it's actually + * held -- it could just be a parent of some held lock. + */ + return lock->held; + } + + /* + * Return the parent lock tag in the lock hierarchy: the next coarser + * lock that covers the provided tag. + * + * Returns true and sets *parent to the parent tag if one exists, + * returns false if none exists. 
+ */ + static bool + GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag, + PREDICATELOCKTARGETTAG *parent) + { + switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag)) + { + case PREDLOCKTAG_RELATION: + /* relation locks have no parent lock */ + return false; + + case PREDLOCKTAG_PAGE: + /* parent lock is relation lock */ + SET_PREDICATELOCKTARGETTAG_RELATION(*parent, + GET_PREDICATELOCKTARGETTAG_DB(*tag), + GET_PREDICATELOCKTARGETTAG_RELATION(*tag)); + + return true; + + case PREDLOCKTAG_TUPLE: + /* parent lock is page lock */ + SET_PREDICATELOCKTARGETTAG_PAGE(*parent, + GET_PREDICATELOCKTARGETTAG_DB(*tag), + GET_PREDICATELOCKTARGETTAG_RELATION(*tag), + GET_PREDICATELOCKTARGETTAG_PAGE(*tag)); + return true; + } + + /* not reachable */ + Assert(false); + return false; + } + + /* + * Check whether the lock we are considering is already covered by a + * coarser lock for our transaction. + */ + static bool + CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag) + { + PREDICATELOCKTARGETTAG targettag, + parenttag; + + targettag = *newtargettag; + + /* check parents iteratively until no more */ + while (GetParentPredicateLockTag(&targettag, &parenttag)) + { + targettag = parenttag; + if (PredicateLockExists(&targettag)) + return true; + } + + /* no more parents to check; lock is not covered */ + return false; + } + + + /* + * Delete child target locks owned by this process. + * This implementation is assuming that the usage of each target tag field + * is uniform. No need to make this hard if we don't have to. + * + * We aren't acquiring lightweight locks for the predicate lock or lock + * target structures associated with this transaction unless we're going + * to modify them, because no other process is permitted to modify our + * locks. 
+ */ + static void + DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag) + { + SERIALIZABLEXACT *sxact; + PREDICATELOCK *predlock; + + LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED); + sxact = (SERIALIZABLEXACT *) MySerializableXact; + predlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + &(sxact->predicateLocks), + offsetof(PREDICATELOCK, xactLink)); + while (predlock) + { + SHM_QUEUE *predlocksxactlink; + PREDICATELOCK *nextpredlock; + PREDICATELOCKTAG oldlocktag; + PREDICATELOCKTARGET *oldtarget; + PREDICATELOCKTARGETTAG oldtargettag; + + predlocksxactlink = &(predlock->xactLink); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + predlocksxactlink, + offsetof(PREDICATELOCK, xactLink)); + + oldlocktag = predlock->tag; + Assert(oldlocktag.myXact == sxact); + oldtarget = oldlocktag.myTarget; + oldtargettag = oldtarget->tag; + + if (TargetTagIsCoveredBy(oldtargettag, *newtargettag)) + { + uint32 oldtargettaghash; + LWLockId partitionLock; + PREDICATELOCK *rmpredlock; + PREDICATELOCKTARGET *rmtarget; + + oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag); + partitionLock = PredicateLockHashPartitionLock(oldtargettaghash); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + SHMQueueDelete(predlocksxactlink); + SHMQueueDelete(&(predlock->targetLink)); + rmpredlock = hash_search_with_hash_value + (PredicateLockHash, + &oldlocktag, + PredicateLockHashCodeFromTargetHashCode(&oldlocktag, + oldtargettaghash), + HASH_REMOVE, NULL); + Assert(rmpredlock == predlock); + + if (SHMQueueEmpty(&oldtarget->predicateLocks)) + { + rmtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &oldtargettag, + oldtargettaghash, + HASH_REMOVE, NULL); + Assert(rmtarget == oldtarget); + } + + LWLockRelease(partitionLock); + + DecrementParentLocks(&oldtargettag); + } + + predlock = nextpredlock; + } + LWLockRelease(SerializablePredicateLockListLock); + } + + /* + * Returns the promotion threshold for 
a given predicate lock + * target. This is the number of descendant locks required to promote + * to the specified tag. Note that the threshold includes non-direct + * descendants, e.g. both tuples and pages for a relation lock. + * + * TODO SSI: We should do something more intelligent about what the + * thresholds are, either making it proportional to the number of + * tuples in a page & pages in a relation, or at least making it a + * GUC. Currently the threshold is 3 for a page lock, and + * max_predicate_locks_per_transaction/2 for a relation lock, chosen + * entirely arbitrarily (and without benchmarking). + */ + static int + PredicateLockPromotionThreshold(const PREDICATELOCKTARGETTAG *tag) + { + switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag)) + { + case PREDLOCKTAG_RELATION: + return max_predicate_locks_per_xact / 2; + + case PREDLOCKTAG_PAGE: + return 3; + + case PREDLOCKTAG_TUPLE: + + /* + * not reachable: nothing is finer-granularity than a tuple, so we + * should never try to promote to it. + */ + Assert(false); + return 0; + } + + /* not reachable */ + Assert(false); + return 0; + } + + /* + * For all ancestors of a newly-acquired predicate lock, increment + * their child count in the parent hash table. If any of them have + * more descendants than their promotion threshold, acquire the + * coarsest such lock. + * + * Returns true if a parent lock was acquired and false otherwise. 
+ */ + static bool + CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag) + { + PREDICATELOCKTARGETTAG targettag, + nexttag, + promotiontag; + LOCALPREDICATELOCK *parentlock; + bool found, + promote; + + promote = false; + + targettag = *reqtag; + + /* check parents iteratively */ + while (GetParentPredicateLockTag(&targettag, &nexttag)) + { + targettag = nexttag; + parentlock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash, + &targettag, + HASH_ENTER, + &found); + if (!found) + { + parentlock->held = false; + parentlock->childLocks = 1; + } + else + parentlock->childLocks++; + + if (parentlock->childLocks >= + PredicateLockPromotionThreshold(&targettag)) + { + /* + * We should promote to this parent lock. Continue to check its + * ancestors, however, both to get their child counts right and to + * check whether we should just go ahead and promote to one of + * them. + */ + promotiontag = targettag; + promote = true; + } + } + + if (promote) + { + /* acquire coarsest ancestor eligible for promotion */ + PredicateLockAcquire(&promotiontag); + return true; + } + else + return false; + } + + /* + * When releasing a lock, decrement the child count on all ancestor + * locks. + * + * This is called only when releasing a lock via + * DeleteChildTargetLocks (i.e. when a lock becomes redundant because + * we've acquired its parent, possibly due to promotion) or when a new + * MVCC write lock makes the predicate lock unnecessary. There's no + * point in calling it when locks are released at transaction end, as + * this information is no longer needed. 
+ */ + static void + DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag) + { + PREDICATELOCKTARGETTAG parenttag, + nexttag; + + parenttag = *targettag; + + while (GetParentPredicateLockTag(&parenttag, &nexttag)) + { + uint32 targettaghash; + LOCALPREDICATELOCK *parentlock, + *rmlock; + + parenttag = nexttag; + targettaghash = PredicateLockTargetTagHashCode(&parenttag); + parentlock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + &parenttag, targettaghash, + HASH_FIND, NULL); + Assert(parentlock != NULL); + parentlock->childLocks--; + + Assert(parentlock->childLocks >= 0); + + if ((parentlock->childLocks == 0) && (!parentlock->held)) + { + rmlock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + &parenttag, targettaghash, + HASH_REMOVE, NULL); + Assert(rmlock == parentlock); + } + } + } + + /* + * Acquire a predicate lock on the specified target for the current + * connection if not already held. Create related serializable transaction + * and predicate lock target entries first if missing. + */ + static void + PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag) + { + uint32 targettaghash; + LWLockId partitionLock; + bool found; + PREDICATELOCKTARGET *target; + PREDICATELOCKTAG locktag; + PREDICATELOCK *lock; + LOCALPREDICATELOCK *locallock; + + EnsureMySerializableXidExists(); + + /* Do we have the lock already, or a covering lock? */ + if (PredicateLockExists(targettag)) + return; + + if (CoarserLockCovers(targettag)) + return; + + /* the same hash and LW lock apply to the lock target and the local lock. 
*/ + targettaghash = PredicateLockTargetTagHashCode(targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + + /* Acquire lock in local table */ + locallock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + targettag, targettaghash, + HASH_ENTER, &found); + /* We should not hold the lock (but its entry might still exist) */ + Assert(!found || !locallock->held); + locallock->held = true; + if (!found) + locallock->childLocks = 0; + + LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* Make sure that the target is represented. */ + target = (PREDICATELOCKTARGET *) + hash_search_with_hash_value(PredicateLockTargetHash, + targettag, targettaghash, + HASH_ENTER, &found); + if (!target) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_predicate_locks_per_transaction."))); + if (!found) + SHMQueueInit(&(target->predicateLocks)); + + /* We've got the sxact and target, make sure they're joined. */ + locktag.myTarget = target; + locktag.myXact = (SERIALIZABLEXACT *) MySerializableXact; + lock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, &locktag, + PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash), + HASH_ENTER, &found); + if (!lock) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_predicate_locks_per_transaction."))); + + if (!found) + { + SHMQueueInsertBefore(&(target->predicateLocks), &(lock->targetLink)); + SHMQueueInsertBefore((SHM_QUEUE *) &(MySerializableXact->predicateLocks), + &(lock->xactLink)); + } + + LWLockRelease(partitionLock); + LWLockRelease(SerializablePredicateLockListLock); + + /* + * Lock has been acquired. Check whether it should be promoted to a + * coarser granularity, or whether there are finer-granularity locks to + * clean up. 
+ */ + if (CheckAndPromotePredicateLockRequest(targettag)) + { + /* + * Lock request was promoted to a coarser-granularity lock, and that + * lock was acquired. It will delete this lock and any of its + * children, so we're done. + */ + } + else + { + /* Clean up any finer-granularity locks */ + if (GET_PREDICATELOCKTARGETTAG_TYPE(*targettag) != PREDLOCKTAG_TUPLE) + DeleteChildTargetLocks(targettag); + } + } + + + /* + * PredicateLockRelation + * + * Gets a predicate lock at the relation level. + * Skip if not in full serializable transaction isolation level. + * Skip if this is a temporary table. + * Clear any finer-grained predicate locks this session has on the relation. + */ + void + PredicateLockRelation(const Relation relation) + { + PREDICATELOCKTARGETTAG tag; + + if (SkipSerialization(relation)) + return; + + SET_PREDICATELOCKTARGETTAG_RELATION(tag, + relation->rd_node.dbNode, + relation->rd_id); + PredicateLockAcquire(&tag); + } + + /* + * PredicateLockPage + * + * Gets a predicate lock at the page level. + * Skip if not in full serializable transaction isolation level. + * Skip if this is a temporary table. + * Skip if a coarser predicate lock already covers this page. + * Clear any finer-grained predicate locks this session has on the relation. + */ + void + PredicateLockPage(const Relation relation, const BlockNumber blkno) + { + PREDICATELOCKTARGETTAG tag; + + if (SkipSerialization(relation)) + return; + + SET_PREDICATELOCKTARGETTAG_PAGE(tag, + relation->rd_node.dbNode, + relation->rd_id, + blkno); + PredicateLockAcquire(&tag); + } + + /* + * PredicateLockTuple + * + * Gets a predicate lock at the tuple level. + * Skip if not in full serializable transaction isolation level. + * Skip if this is a temporary table. + */ + void + PredicateLockTuple(const Relation relation, const HeapTuple tuple) + { + PREDICATELOCKTARGETTAG tag; + ItemPointer tid; + + if (SkipSerialization(relation)) + return; + + /* + * If it's a heap tuple, return if this xact wrote it. 
It might be useful
+ * to pass in the xmin from the tuple as another parameter.
+ */
+ if (relation->rd_index == NULL)
+ {
+ SERIALIZABLEXIDTAG sxidtag;
+ SERIALIZABLEXID *sxid;
+ 
+ sxidtag.xid = HeapTupleHeaderGetXmin(tuple->t_data);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ if (sxid)
+ {
+ if (sxid->myXact == MySerializableXact)
+ {
+ /* We wrote it; we already have a write lock. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ }
+ LWLockRelease(SerializableXactHashLock);
+ }
+ 
+ tid = &(tuple->t_self);
+ SET_PREDICATELOCKTARGETTAG_TUPLE(tag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ PredicateLockAcquire(&tag);
+ }
+ 
+ /*
+ * PredicateLockPageSplit
+ *
+ * Copies any predicate locks for the old page to the new page.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: A page split (or overflow) affects all serializable transactions,
+ * even if it occurs in the context of another transaction isolation level.
+ *
+ * NOTE: This currently leaves the local copy of the locks without
+ * information on the new lock which is in shared memory. This could cause
+ * problems if enough page splits occur on locked pages without the processes
+ * which hold the locks getting in and noticing. 
+ */ + void + PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, + const BlockNumber newblkno) + { + PREDICATELOCKTARGETTAG oldtargettag; + PREDICATELOCKTARGETTAG newtargettag; + uint32 oldtargettaghash; + LWLockId oldpartitionLock; + PREDICATELOCKTARGET *oldtarget; + uint32 newtargettaghash; + LWLockId newpartitionLock; + + if (SkipSplitTracking(relation)) + return; + + Assert(oldblkno != newblkno); + Assert(BlockNumberIsValid(oldblkno)); + Assert(BlockNumberIsValid(newblkno)); + + SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag, + relation->rd_node.dbNode, + relation->rd_id, + oldblkno); + SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag, + relation->rd_node.dbNode, + relation->rd_id, + newblkno); + + oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag); + newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag); + oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash); + newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash); + + LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE); + + /* + * We must get the partition locks in ascending sequence to avoid + * deadlocks. If old and new partitions are the same, we must request the + * lock only once. + */ + if (oldpartitionLock < newpartitionLock) + { + LWLockAcquire(oldpartitionLock, LW_SHARED); + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + } + else if (oldpartitionLock > newpartitionLock) + { + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + LWLockAcquire(oldpartitionLock, LW_SHARED); + } + else + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + + /* + * Look for the old target. If not found, that's OK; no predicate locks + * are affected, so we can just clean up and return. If it does exist, + * walk its list of predicate locks and create new ones for the new block + * number. 
+ */ + oldtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &oldtargettag, + oldtargettaghash, + HASH_FIND, NULL); + if (oldtarget) + { + PREDICATELOCKTARGET *newtarget; + bool found; + PREDICATELOCK *oldpredlock; + PREDICATELOCKTAG newpredlocktag; + + newtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &newtargettag, + newtargettaghash, + HASH_ENTER, &found); + Assert(!found); + if (!newtarget) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_predicate_locks_per_transaction."))); + SHMQueueInit(&(newtarget->predicateLocks)); + + newpredlocktag.myTarget = newtarget; + + oldpredlock = (PREDICATELOCK *) + SHMQueueNext(&(oldtarget->predicateLocks), + &(oldtarget->predicateLocks), + offsetof(PREDICATELOCK, targetLink)); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + while (oldpredlock) + { + SHM_QUEUE *predlocktargetlink; + PREDICATELOCK *nextpredlock; + PREDICATELOCK *newpredlock; + + predlocktargetlink = &(oldpredlock->targetLink); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(oldtarget->predicateLocks), + predlocktargetlink, + offsetof(PREDICATELOCK, targetLink)); + newpredlocktag.myXact = oldpredlock->tag.myXact; + + newpredlock = (PREDICATELOCK *) + hash_search_with_hash_value + (PredicateLockHash, + &newpredlocktag, + PredicateLockHashCodeFromTargetHashCode(&newpredlocktag, + newtargettaghash), + HASH_ENTER, &found); + if (!newpredlock) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_predicate_locks_per_transaction."))); + Assert(!found); + SHMQueueInsertBefore(&(newtarget->predicateLocks), + &(newpredlock->targetLink)); + SHMQueueInsertBefore(&(newpredlocktag.myXact->predicateLocks), + &(newpredlock->xactLink)); + + oldpredlock = nextpredlock; + } + LWLockRelease(SerializableXactHashLock); + } + + /* Release partition locks in reverse order of acquisition. 
*/ + if (oldpartitionLock < newpartitionLock) + { + LWLockRelease(newpartitionLock); + LWLockRelease(oldpartitionLock); + } + else if (oldpartitionLock > newpartitionLock) + { + LWLockRelease(oldpartitionLock); + LWLockRelease(newpartitionLock); + } + else + LWLockRelease(newpartitionLock); + LWLockRelease(SerializablePredicateLockListLock); + } + + /* + * PredicateLockPageCombine + * + * Combines predicate locks for two existing pages. + * Skip if this is a temporary table or toast table. + * + * NOTE: A page combine affects all serializable + * transactions, even if it occurs in the context of another + * transaction isolation level. + */ + void + PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno, + const BlockNumber newblkno) + { + PREDICATELOCKTARGETTAG oldtargettag; + PREDICATELOCKTARGETTAG newtargettag; + uint32 oldtargettaghash; + LWLockId oldpartitionLock; + PREDICATELOCKTARGET *oldtarget; + uint32 newtargettaghash; + LWLockId newpartitionLock; + + if (SkipSplitTracking(relation)) + return; + + Assert(oldblkno != newblkno); + Assert(BlockNumberIsValid(oldblkno)); + Assert(BlockNumberIsValid(newblkno)); + + SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag, + relation->rd_node.dbNode, + relation->rd_id, + oldblkno); + SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag, + relation->rd_node.dbNode, + relation->rd_id, + newblkno); + + oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag); + newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag); + oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash); + newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash); + + LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE); + + /* + * We must get the partition locks in ascending sequence to avoid + * deadlocks. If old and new partitions are the same, we must request the + * lock only once. 
+ */ + if (oldpartitionLock < newpartitionLock) + { + LWLockAcquire(oldpartitionLock, LW_EXCLUSIVE); + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + } + else if (oldpartitionLock > newpartitionLock) + { + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + LWLockAcquire(oldpartitionLock, LW_EXCLUSIVE); + } + else + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + + /* + * Look for the old target. If not found, that's OK; no predicate locks + * are affected, so we can just clean up and return. If it does exist, + * walk its list of predicate locks and create new ones for the new block + * number, while deleting the old ones. + */ + oldtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &oldtargettag, + oldtargettaghash, + HASH_FIND, NULL); + if (oldtarget) + { + PREDICATELOCKTARGET *newtarget; + PREDICATELOCK *oldpredlock; + PREDICATELOCKTAG newpredlocktag; + + newtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &newtargettag, + newtargettaghash, + HASH_FIND, NULL); + Assert(newtarget); + + newpredlocktag.myTarget = newtarget; + + oldpredlock = (PREDICATELOCK *) + SHMQueueNext(&(oldtarget->predicateLocks), + &(oldtarget->predicateLocks), + offsetof(PREDICATELOCK, targetLink)); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + while (oldpredlock) + { + SHM_QUEUE *predlocktargetlink; + PREDICATELOCK *nextpredlock; + PREDICATELOCK *newpredlock; + bool found; + + predlocktargetlink = &(oldpredlock->targetLink); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(oldtarget->predicateLocks), + predlocktargetlink, + offsetof(PREDICATELOCK, targetLink)); + newpredlocktag.myXact = oldpredlock->tag.myXact; + + hash_search_with_hash_value + (PredicateLockHash, + &oldpredlock->tag, + PredicateLockHashCodeFromTargetHashCode(&oldpredlock->tag, + oldtargettaghash), + HASH_REMOVE, NULL); + + newpredlock = (PREDICATELOCK *) + hash_search_with_hash_value + (PredicateLockHash, + &newpredlocktag, + 
PredicateLockHashCodeFromTargetHashCode(&newpredlocktag, + newtargettaghash), + HASH_ENTER, &found); + if (!newpredlock) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_predicate_locks_per_transaction."))); + if (!found) + { + SHMQueueInsertBefore(&(newtarget->predicateLocks), + &(newpredlock->targetLink)); + SHMQueueInsertBefore((SHM_QUEUE *) &(newpredlocktag.myXact->predicateLocks), + &(newpredlock->xactLink)); + } + + oldpredlock = nextpredlock; + } + LWLockRelease(SerializableXactHashLock); + Assert(SHMQueueIsDetached(&oldtarget->predicateLocks)); + hash_search_with_hash_value(PredicateLockTargetHash, + &oldtargettag, + oldtargettaghash, + HASH_REMOVE, NULL); + } + + /* Release partition locks in reverse order of acquisition. */ + if (oldpartitionLock < newpartitionLock) + { + LWLockRelease(newpartitionLock); + LWLockRelease(oldpartitionLock); + } + else if (oldpartitionLock > newpartitionLock) + { + LWLockRelease(oldpartitionLock); + LWLockRelease(newpartitionLock); + } + else + LWLockRelease(newpartitionLock); + } + + /* + * Walk the hash table and find the new xmin. + */ + static void + SetNewSerializableGlobalXmin(void) + { + HASH_SEQ_STATUS seqstat; + SERIALIZABLEXACT *sxact; + + SerializableGlobalXmin = InvalidTransactionId; + SerializableGlobalXminCount = 0; + hash_seq_init(&seqstat, SerializableXactHash); + while ((sxact = (SERIALIZABLEXACT *) hash_seq_search(&seqstat))) + { + if (!SxactIsOnFinishedList(sxact)) + { + if (!TransactionIdIsValid(SerializableGlobalXmin) + || TransactionIdPrecedes(sxact->xmin, SerializableGlobalXmin)) + { + SerializableGlobalXmin = sxact->xmin; + SerializableGlobalXminCount = 1; + } + else if (sxact->xmin == SerializableGlobalXmin) + SerializableGlobalXminCount++; + } + } + } + + /* + * ReleasePredicateLocks + * + * Releases predicate locks based on completion of the current + * transaction, whether committed or rolled back. 
+ * + * We do nothing unless this is a serializable transaction. + * + * For a rollback, the current transaction's predicate locks could be + * immediately released; however, we may still have conflict pointers to + * our transaction which could be expensive to find and eliminate right + * now, so we flag it as rolled back so that it will be ignored, and let + * cleanup happen later. + * + * This method must ensure that shared memory hash tables are cleaned + * up in some relatively timely fashion. + * + * If this transaction is committing and is holding any predicate locks, + * it must be added to a list of completed serializable transactions still + * holding locks. + */ + void + ReleasePredicateLocks(const bool isCommit) + { + bool needToClear; + + if (MySerializableXact == InvalidSerializableXact) + { + Assert(LocalPredicateLockHash == NULL); + return; + } + + Assert(IsXactIsoLevelFullySerializable); + + /* We'd better not already be on the cleanup list. */ + Assert(!SxactIsOnFinishedList((SERIALIZABLEXACT *) MySerializableXact)); + + /* + * If it's not a commit it's a rollback, and we can clear our locks + * immediately. TODO SSI: Clear the locks, but leave the sxact record. + */ + if (!isCommit) + MySerializableXact->rolledBack = true; + + /* + * Add this to the list of transactions to check for later cleanup. First + * turn pointers to already-terminated transactions to self-references. 
+ */ + if (MySerializableXact->inConflict != InvalidSerializableXact) + { + if (MySerializableXact->inConflict->rolledBack) + MySerializableXact->inConflict = InvalidSerializableXact; + else if (SxactIsCommitted(MySerializableXact->inConflict)) + MySerializableXact->inConflict = (SERIALIZABLEXACT *) MySerializableXact; + } + if (MySerializableXact->outConflict != InvalidSerializableXact) + { + if (MySerializableXact->outConflict->rolledBack) + MySerializableXact->outConflict = InvalidSerializableXact; + else if (SxactIsCommitted(MySerializableXact->outConflict)) + MySerializableXact->outConflict = (SERIALIZABLEXACT *) MySerializableXact; + } + + /* Add this to the list of transactions to check for later cleanup. */ + LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE); + SHMQueueInsertBefore(FinishedSerializableTransactions, + (SHM_QUEUE *) &(MySerializableXact->finishedLink)); + LWLockRelease(SerializableFinishedListLock); + + /* + * Check whether it's time to clean up old transactions. This can only be + * done when the last serializable transaction with the oldest xmin among + * serializable transactions completes. We then find the "new oldest" + * xmin and purge any transactions which finished before this transaction + * was launched. 
+ */ + needToClear = false; + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + if (TransactionIdPrecedes(SerializableGlobalXmin, RecentGlobalXmin)) + { + SetNewSerializableGlobalXmin(); + needToClear = true; + } + else if (MySerializableXact->xmin == SerializableGlobalXmin) + { + Assert(SerializableGlobalXminCount > 0); + if (--SerializableGlobalXminCount == 0) + { + SetNewSerializableGlobalXmin(); + needToClear = true; + } + } + LWLockRelease(SerializableXactHashLock); + + if (needToClear) + ClearOldPredicateLocks(); + + MySerializableXact = InvalidSerializableXact; + MyXid = InvalidTransactionId; + + /* Delete per-transaction lock table */ + hash_destroy(LocalPredicateLockHash); + LocalPredicateLockHash = NULL; + } + + /* + * Clear old predicate locks. + */ + static void + ClearOldPredicateLocks(void) + { + SERIALIZABLEXACT *finishedSxact; + + if (!LWLockConditionalAcquire(SerializableFinishedListLock, LW_EXCLUSIVE)) + return; + + finishedSxact = (SERIALIZABLEXACT *) + SHMQueueNext(FinishedSerializableTransactions, + FinishedSerializableTransactions, + offsetof(SERIALIZABLEXACT, finishedLink)); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + while (finishedSxact) + { + SERIALIZABLEXACT *nextSxact; + + nextSxact = (SERIALIZABLEXACT *) + SHMQueueNext(FinishedSerializableTransactions, + &(finishedSxact->finishedLink), + offsetof(SERIALIZABLEXACT, finishedLink)); + if (!TransactionIdIsValid(SerializableGlobalXmin) + || TransactionIdPrecedesOrEquals(finishedSxact->finishedBefore, + SerializableGlobalXmin)) + { + LWLockRelease(SerializableXactHashLock); + ReleaseOneSerializableXact(finishedSxact); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + finishedSxact = nextSxact; + } + LWLockRelease(SerializableXactHashLock); + LWLockRelease(SerializableFinishedListLock); + } + + /* + * This is the normal way to delete anything from any of the predicate + * locking hash tables. 
Given a transaction which we know can be deleted, + * delete all predicate locks held by that transaction, and any predicate + * lock targets which are now unreferenced by a lock; delete all xid values + * for the transaction; then delete the transaction. + */ + static void + ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact) + { + PREDICATELOCK *predlock; + SERIALIZABLEXID *sxid; + + Assert(sxact != NULL); + Assert(sxact->rolledBack || SxactIsCommitted(sxact)); + Assert(SxactIsOnFinishedList(sxact)); + + LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED); + predlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + &(sxact->predicateLocks), + offsetof(PREDICATELOCK, xactLink)); + while (predlock) + { + PREDICATELOCK *nextpredlock; + PREDICATELOCKTAG tag; + SHM_QUEUE *targetLink; + PREDICATELOCKTARGET *target; + PREDICATELOCKTARGETTAG targettag; + uint32 targettaghash; + LWLockId partitionLock; + + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + &(predlock->xactLink), + offsetof(PREDICATELOCK, xactLink)); + + tag = predlock->tag; + targetLink = &(predlock->targetLink); + target = tag.myTarget; + targettag = target->tag; + targettaghash = PredicateLockTargetTagHashCode(&targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + SHMQueueDelete(targetLink); + + /* + * No need to do retail removal from transaction object; it's going + * away. + */ + hash_search_with_hash_value(PredicateLockHash, &tag, + PredicateLockHashCodeFromTargetHashCode(&tag, + targettaghash), + HASH_REMOVE, NULL); + if (SHMQueueEmpty(&target->predicateLocks)) + hash_search_with_hash_value(PredicateLockTargetHash, + &targettag, targettaghash, HASH_REMOVE, NULL); + LWLockRelease(partitionLock); + predlock = nextpredlock; + } + LWLockRelease(SerializablePredicateLockListLock); + + /* Get rid of the xids and the record of the transaction itself. 
*/ + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + sxid = (SERIALIZABLEXID *) + SHMQueueNext(&(sxact->xids), + &(sxact->xids), + offsetof(SERIALIZABLEXID, xactLink)); + while (sxid) + { + SERIALIZABLEXID *nextsxid; + SERIALIZABLEXIDTAG tag; + + nextsxid = (SERIALIZABLEXID *) + SHMQueueNext(&(sxact->xids), + &(sxid->xactLink), + offsetof(SERIALIZABLEXID, xactLink)); + tag = sxid->tag; + hash_search(SerializableXidHash, &tag, HASH_REMOVE, NULL); + + /* + * No need to do retail removal from transaction object; it's going + * away. + */ + sxid = nextsxid; + } + SHMQueueDelete(&(sxact->finishedLink)); + hash_search(SerializableXactHash, &(sxact->tag), HASH_REMOVE, NULL); + LWLockRelease(SerializableXactHashLock); + } + + /* + * Tests whether the given transaction is concurrent with (overlaps) + * our current transaction. + */ + static bool + XidIsConcurrent(TransactionId xid) + { + Snapshot snap; + uint32 i; + + Assert(TransactionIdIsValid(xid)); + + /* + * We don't count our own transaction or its subtransactions as + * "concurrent". + */ + if (xid == GetTopTransactionIdIfAny()) + return false; + + snap = GetTransactionSnapshot(); + + if (TransactionIdPrecedes(xid, snap->xmin)) + return false; + + if (TransactionIdFollowsOrEquals(xid, snap->xmax)) + return true; + + for (i = 0; i < snap->xcnt; i++) + { + if (xid == snap->xip[i]) + return true; + } + + return false; + } + + /* + * CheckForSerializableConflictOut + * We are reading a tuple which has been modified. If it is visible to + * us but has been deleted, that indicates a rw-conflict out. If it's + * not visible and was created by a concurrent (overlapping) + * serializable transaction, that is also a rw-conflict out, + * + * The heap tables which we maintain for predicate locking will also be used + * to determine that the xmin from a row is related to a serializable + * transaction, and will provide a mapping to the top level transaction. 
+ * + * This function should be called just about anywhere in heapam.c that a + * tuple has been read. + */ + void + CheckForSerializableConflictOut(const bool valid, const Relation relation, + const HeapTuple tuple, const Buffer buffer) + { + TransactionId xid; + SERIALIZABLEXIDTAG sxidtag; + SERIALIZABLEXID *sxid; + SERIALIZABLEXACTTAG sxacttag; + SERIALIZABLEXACT *sxact; + + if (SkipSerialization(relation)) + return; + + if (valid) + { + /*---------------------------------------------------------------- + * TODO SSI: Figure out why the ItemPointerIsValid test is needed. + * We are sometimes failing with ip_posid == 0 in corner + * cases, like the following. Is this some underlying bug? + * If not, is this the best way to handle this? + * + * -- setup + * drop table ctl, receipt; + * create table ctl (k text not null primary key, deposit_date date not null); + * insert into ctl values ('receipt', date '2008-12-22'); + * create table receipt (receipt_no int not null primary key, deposit_date date not null, amount numeric(13,2)); + * insert into receipt values (1, (select deposit_date from ctl where k = 'receipt'), 1.00); + * insert into receipt values (2, (select deposit_date from ctl where k = 'receipt'), 2.00); + * + * -- connection 1 + * start transaction isolation level serializable ; + * insert into receipt values (3, (select deposit_date from ctl where k = 'receipt'), 4.00); + * + * -- connection 2 + * start transaction isolation level serializable ; + * update ctl set deposit_date = date '2008-12-23' where k = 'receipt'; + * + * -- connection 3 + * start transaction isolation level serializable ; + * select * from ctl; + * + * -- connection 2 + * rollback; + * + * -- connection 3 + * select * from re[nothing shows]ceipt; + * > no connection to the server + * > The connection to the server was lost. Attempting reset: Succeeded. 
+ *---------------------------------------------------------------- + */ + /* If there's a new tuple to key on, return to avoid duplicate work. */ + if (ItemPointerIsValid(&(tuple->t_data->t_ctid)) + && !ItemPointerEquals(&(tuple->t_self), &(tuple->t_data->t_ctid))) + return; + + /* + * We may bail out if previous xmax aborted, or if it committed but + * only locked the tuple without updating it. + */ + if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) + return; + + /* + * If there's a valid xmax, it must be from a concurrent transaction, + * since it deleted a tuple which is visible to us. + */ + xid = HeapTupleHeaderGetXmax(tuple->t_data); + if (!TransactionIdIsValid(xid)) + return; + } + else + { + /* + * We would read this row, but it isn't visible to us. + */ + xid = HeapTupleHeaderGetXmin(tuple->t_data); + } + + /* + * It's OK to look for conflicts with a share lock, and record them with + * an exclusive lock when found; we just have to release the shared lock + * before attempting to get the other lock, to prevent deadlocks. We will + * need to recheck that the entry still exists after getting the stronger + * lock, just in case it rolled back in the window where we weren't + * holding a lock. + */ + sxidtag.xid = xid; + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + sxid = (SERIALIZABLEXID *) + hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL); + if (!sxid) + { + /* It's not serializable or otherwise not important. */ + LWLockRelease(SerializableXactHashLock); + return; + } + sxact = sxid->myXact; + if (sxact == MySerializableXact || sxact->rolledBack) + { + /* We can't conflict with our own transaction or one rolled back. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + /* + * If this is a read-only transaction and the writing transaction has + * committed, and it doesn't have a rw-conflict out or has a conflict out + * to a transaction which overlaps this transaction, then no conflict. 
+ */ + if (XactReadOnly + && SxactIsCommitted(sxact) + && (!TransactionIdIsValid(sxact->outConflict) + || (sxact != sxact->outConflict + && (!SxactIsCommitted(sxact->outConflict) + || XidIsConcurrent(sxact->outConflict->topXid))))) + { + /* Read-only transaction will appear to run first. No conflict. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + sxacttag = sxact->tag; + LWLockRelease(SerializableXactHashLock); + + /* + * Make sure we have somewhere to record a conflict against this + * transaction. + */ + EnsureMySerializableXidExists(); + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + sxact = (SERIALIZABLEXACT *) + hash_search(SerializableXactHash, &sxacttag, HASH_FIND, NULL); + if (!sxact) + { + /* It must have been cleaned up, which means it wasn't useful. */ + LWLockRelease(SerializableXactHashLock); + return; + } + xid = sxact->topXid; + if (!XidIsConcurrent(xid)) + { + /* This write was already in our snapshot; no conflict. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + /* + * Flag the conflict. But first, if this conflict creates a dangerous + * structure, ereport an error. + */ + FlagRWConflict((SERIALIZABLEXACT *) MySerializableXact, sxact); + LWLockRelease(SerializableXactHashLock); + } + + /* + * Check a particular target for rw-dependency conflict in. + */ + static void + CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag) + { + uint32 targettaghash; + LWLockId partitionLock; + PREDICATELOCKTARGET *target; + PREDICATELOCK *predlock; + + Assert(MySerializableXact != InvalidSerializableXact); + + /* The same hash and LW lock apply to the lock target and the lock itself. 
*/ + targettaghash = PredicateLockTargetTagHashCode(targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + LWLockAcquire(partitionLock, LW_SHARED); + target = (PREDICATELOCKTARGET *) + hash_search_with_hash_value(PredicateLockTargetHash, + targettag, targettaghash, + HASH_FIND, NULL); + if (!target) + { + /* Nothing has this target locked; we're done here. */ + LWLockRelease(partitionLock); + return; + } + + /* + * Each lock for an overlapping transaction represents a conflict: a + * rw-dependency in to this transaction. + */ + predlock = (PREDICATELOCK *) + SHMQueueNext(&(target->predicateLocks), + &(target->predicateLocks), + offsetof(PREDICATELOCK, targetLink)); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + while (predlock) + { + SHM_QUEUE *predlocktargetlink; + PREDICATELOCK *nextpredlock; + SERIALIZABLEXACT *sxact; + + predlocktargetlink = &(predlock->targetLink); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(target->predicateLocks), + predlocktargetlink, + offsetof(PREDICATELOCK, targetLink)); + + sxact = predlock->tag.myXact; + if (sxact == MySerializableXact) + { + /* + * If we're getting a write lock on the tuple, we don't need a + * predicate (SIREAD) lock. At this point our transaction already + * has an ExclusiveRowLock on the relation, so we are OK to drop + * the predicate lock on the tuple, if found, without fearing that + * another write against the tuple will occur before the MVCC + * information makes it to the buffer. + */ + if (GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag)) + { + uint32 predlockhashcode; + PREDICATELOCKTARGET *rmtarget = NULL; + PREDICATELOCK *rmpredlock; + LOCALPREDICATELOCK *locallock, + *rmlocallock; + + /* + * This is a tuple on which we have a tuple predicate lock. We + * only have shared LW locks now; release those, and get + * exclusive locks only while we modify things. 
+ */ + LWLockRelease(SerializableXactHashLock); + LWLockRelease(partitionLock); + LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Remove the predicate lock from shared memory, if it hasn't + * been concurrently removed by an index page combine. + */ + predlockhashcode = PredicateLockHashCodeFromTargetHashCode + (&(predlock->tag), targettaghash); + rmpredlock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, + &(predlock->tag), + predlockhashcode, + HASH_FIND, NULL); + if (rmpredlock == predlock) + { + SHMQueueDelete(predlocktargetlink); + SHMQueueDelete(&(predlock->xactLink)); + + rmpredlock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, + &(predlock->tag), + predlockhashcode, + HASH_REMOVE, NULL); + Assert(rmpredlock == predlock); + + /* + * When a target is no longer used, remove it. + */ + if (SHMQueueEmpty(&target->predicateLocks)) + { + rmtarget = (PREDICATELOCKTARGET *) + hash_search_with_hash_value(PredicateLockTargetHash, + targettag, + targettaghash, + HASH_REMOVE, NULL); + Assert(rmtarget == target); + } + + LWLockRelease(SerializableXactHashLock); + LWLockRelease(partitionLock); + LWLockRelease(SerializablePredicateLockListLock); + + locallock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + targettag, targettaghash, + HASH_FIND, NULL); + Assert(locallock != NULL); + Assert(locallock->held); + locallock->held = false; + + if (locallock->childLocks == 0) + { + rmlocallock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + targettag, targettaghash, + HASH_REMOVE, NULL); + Assert(rmlocallock == locallock); + } + + DecrementParentLocks(targettag); + + if (rmtarget) + return; + + LWLockAcquire(partitionLock, LW_SHARED); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(target->predicateLocks), + &(target->predicateLocks), + 
offsetof(PREDICATELOCK, targetLink)); + + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + else + { + LWLockAcquire(partitionLock, LW_SHARED); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + } + } + else if (!(sxact->rolledBack) + && (!SxactIsCommitted(sxact) + || TransactionIdPrecedes(GetTransactionSnapshot()->xmin, + sxact->finishedBefore)) + && sxact->outConflict != MySerializableXact + && MySerializableXact->inConflict != sxact) + { + LWLockRelease(SerializableXactHashLock); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + FlagRWConflict(sxact, (SERIALIZABLEXACT *) MySerializableXact); + + LWLockRelease(SerializableXactHashLock); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + + predlock = nextpredlock; + } + LWLockRelease(SerializableXactHashLock); + LWLockRelease(partitionLock); + } + + /* + * CheckForSerializableConflictIn + * We are writing the given tuple. If that indicates a rw-conflict + * in from another serializable transaction, take appropriate action. + * + * Skip checking for any granularity for which a parameter is missing. + * + * A tuple update or delete is in conflict if we have a predicate lock + * against the relation or page in which the tuple exists, or against the + * tuple itself. A tuple insert is in conflict only if there is a predicate + * lock against the entire relation. + * + * The call to this function also indicates that we need an entry in the + * serializable transaction hash table, so that this write's conflicts can + * be detected for the proper lifetime, which is until this transaction and + * all overlapping serializable transactions have completed. 
+ */ + void + CheckForSerializableConflictIn(const Relation relation, const HeapTuple tuple, + const Buffer buffer) + { + PREDICATELOCKTARGETTAG targettag; + + if (SkipSerialization(relation)) + return; + + EnsureMySerializableXidExists(); + + /* + * It is important that we check for locks from the finest granularity to + * the coarsest granularity, so that granularity promotion doesn't cause + * us to miss a lock. The new (coarser) lock will be acquired before the + * old (finer) locks are released. + * + * It is not possible to take and hold a lock across the checks for all + * granularities because each target could be in a separate partition. + */ + if (tuple != NULL) + { + SET_PREDICATELOCKTARGETTAG_TUPLE(targettag, + relation->rd_node.dbNode, + relation->rd_id, + ItemPointerGetBlockNumber(&(tuple->t_data->t_ctid)), + ItemPointerGetOffsetNumber(&(tuple->t_data->t_ctid))); + CheckTargetForConflictsIn(&targettag); + } + + if (BufferIsValid(buffer)) + { + SET_PREDICATELOCKTARGETTAG_PAGE(targettag, + relation->rd_node.dbNode, + relation->rd_id, + BufferGetBlockNumber(buffer)); + CheckTargetForConflictsIn(&targettag); + } + + SET_PREDICATELOCKTARGETTAG_RELATION(targettag, + relation->rd_node.dbNode, + relation->rd_id); + CheckTargetForConflictsIn(&targettag); + } + + /* + * Flag a rw-dependency between two serializable transactions. + * If a conflict field is invalid set it to the other transaction, + * if it's already the other transaction leave it alone, otherwise + * use self-reference (so we don't need to keep a list). + * + * The caller is responsible for ensuring that we have a LW lock on + * the transaction hash table. + */ + static void + FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer) + { + Assert(reader != writer); + + /* First, see if this conflict causes failure. */ + OnConflict_CheckForSerializationFailure(reader, writer); + + /* Actually do the conflict flagging. 
*/ + if (writer->inConflict == InvalidSerializableXact + || writer->inConflict->rolledBack) + writer->inConflict = reader; + else if (writer->inConflict != reader) + writer->inConflict = writer; + if (reader->outConflict == InvalidSerializableXact + || reader->outConflict->rolledBack) + reader->outConflict = writer; + else if (reader->outConflict != writer) + reader->outConflict = reader; + } + + /* + * Check whether we should roll back one of these transactions + * instead of flagging a conflict. + */ + static void + OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, + const SERIALIZABLEXACT *writer) + { + bool failure; + + Assert(LWLockHeldByMe(SerializableXactHashLock)); + + failure = false; + + if (writer->inConflict != reader + && writer->outConflict != InvalidSerializableXact + && !(writer->outConflict->rolledBack)) + { + /* The writer is or is becoming a pivot. */ + /* Self-reference prevents checking commit sequence. */ + if (writer->outConflict == writer + + /* + * TODO SSI: Resolve this performance tweak issue. + * + * Back-and-forth reference is write skew; thus doomed; however, + * rolling back here increases chances that a retry will still fail. + * It may be better to let it happen at commit time. Only performance + * testing can determine whether the next line should be used. + * + * Leaving it out would be *especially* valuable if the PreCommit + * checking could be changed to allow a commit in a situation where it + * is leaving another transaction in a state where a commit must fail + * -- when the doomed transaction eventually tries to commit, it would + * probably be at a time when an immediate retry is very likely to + * succeed. + */ + /* || writer->outConflict == reader */ + ) + failure = true; + else if (SxactIsCommitted(writer->outConflict)) + { + if (SxactCommittedBefore(writer->outConflict, writer) + && SxactCommittedBefore(writer->outConflict, reader)) + /* The out side of the pivot committed first. 
*/ + failure = true; + } + else + { + if (writer->outConflict->inConflict == writer->outConflict) + /* Self-reference will prevent checking at commit. */ + failure = true; + } + } + + if (reader->outConflict != writer + && reader->inConflict != InvalidSerializableXact + && !(reader->inConflict->rolledBack)) + { + /* The reader is or is becoming a pivot. */ + if (SxactIsCommitted(writer)) + { + if (SxactCommittedBefore(writer, reader) + && (reader->inConflict == reader + || SxactCommittedBefore(writer, reader->inConflict))) + /* The out side committed first, as far as we can tell. */ + failure = true; + } + else if (writer->inConflict != InvalidSerializableXact + && writer->inConflict != reader) + /* Self-reference will prevent checking at commit. */ + failure = true; + } + + if (failure) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errhint("The transaction might succeed if retried."))); + } + + /* + * PreCommit_CheckForSerializationFailure + * Check for dangerous structures in a serializable transaction + * at commit. + * + * We're checking for a dangerous structure as each conflict is recorded. + * The only way we could have a problem at commit is if this is the "out" + * side of a pivot, and neither the "in" side nor the pivot itself has yet + * committed. + */ + void + PreCommit_CheckForSerializationFailure(void) + { + if (MySerializableXact == InvalidSerializableXact) + return; + + Assert(IsXactIsoLevelFullySerializable); + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Checking at conflict detection should only allow self-reference in if + * this transaction is on the out side of a pivot, so + * self-reference is OK here. 
+ */ + if (MySerializableXact->inConflict != InvalidSerializableXact + && MySerializableXact->inConflict != MySerializableXact + && !(MySerializableXact->inConflict->rolledBack) + && MySerializableXact->inConflict->inConflict != InvalidSerializableXact + && !SxactIsCommitted(MySerializableXact->inConflict) + && !SxactIsCommitted(MySerializableXact->inConflict->inConflict)) + { + MySerializableXact->finishedBefore = ShmemVariableCache->nextXid; + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errhint("The transaction might succeed if retried."))); + } + + MySerializableXact->finishedBefore = ShmemVariableCache->nextXid; + LWLockRelease(SerializableXactHashLock); + } *** a/src/backend/utils/adt/lockfuncs.c --- b/src/backend/utils/adt/lockfuncs.c *************** *** 17,22 **** --- 17,23 ---- #include "miscadmin.h" #include "storage/proc.h" #include "utils/builtins.h" + #include "storage/predicate.h" /* This must match enum LockTagType! 
*/ *************** *** 32,42 **** static const char *const LockTagTypeNames[] = { --- 33,52 ---- "advisory" }; + /* This must match enum PredicateLockTargetType (predicate.h) */ + static const char *const PredicateLockTagTypeNames[] = { + "relation", + "page", + "tuple" + }; + /* Working status for pg_lock_status */ typedef struct { LockData *lockData; /* state data from lmgr */ int currIdx; /* current PROCLOCK index */ + PredicateLockData *predLockData; /* state data for pred locks */ + int predLockIdx; /* current index for pred lock */ } PG_Lock_Status; *************** *** 69,74 **** pg_lock_status(PG_FUNCTION_ARGS) --- 79,85 ---- FuncCallContext *funcctx; PG_Lock_Status *mystatus; LockData *lockData; + PredicateLockData *predLockData; if (SRF_IS_FIRSTCALL()) { *************** *** 126,131 **** pg_lock_status(PG_FUNCTION_ARGS) --- 137,144 ---- mystatus->lockData = GetLockStatusData(); mystatus->currIdx = 0; + mystatus->predLockData = GetPredicateLockStatusData(); + mystatus->predLockIdx = 0; MemoryContextSwitchTo(oldcontext); } *************** *** 303,308 **** pg_lock_status(PG_FUNCTION_ARGS) --- 316,385 ---- SRF_RETURN_NEXT(funcctx, result); } + /* + * Have returned all regular locks. Now start on the SIREAD predicate + * locks. + */ + predLockData = mystatus->predLockData; + if (mystatus->predLockIdx < predLockData->nelements) + { + PREDICATELOCKTARGETTAG *predTag = &(predLockData->locktags[mystatus->predLockIdx]); + SERIALIZABLEXACT *xact = &(predLockData->xacts[mystatus->predLockIdx]); + Datum values[14]; + bool nulls[14]; + HeapTuple tuple; + Datum result; + + mystatus->predLockIdx++; + + /* + * Form tuple with appropriate data. 
+ */ + MemSet(values, 0, sizeof(values)); + MemSet(nulls, false, sizeof(nulls)); + + /* lock type */ + PredicateLockTargetType lockType = GET_PREDICATELOCKTARGETTAG_TYPE(*predTag); + + values[0] = CStringGetTextDatum(PredicateLockTagTypeNames[lockType]); + + /* lock target */ + values[1] = GET_PREDICATELOCKTARGETTAG_DB(*predTag); + values[2] = GET_PREDICATELOCKTARGETTAG_RELATION(*predTag); + if (lockType == PREDLOCKTAG_TUPLE) + values[4] = GET_PREDICATELOCKTARGETTAG_OFFSET(*predTag); + else + nulls[4] = true; + if ((lockType == PREDLOCKTAG_TUPLE) || + (lockType == PREDLOCKTAG_PAGE)) + values[3] = GET_PREDICATELOCKTARGETTAG_PAGE(*predTag); + else + nulls[3] = true; + + /* these fields are targets for other types of locks */ + nulls[5] = true; /* virtualxid */ + nulls[6] = true; /* transactionid */ + nulls[7] = true; /* classid */ + nulls[8] = true; /* objid */ + nulls[9] = true; /* objsubid */ + + /* lock holder */ + values[10] = VXIDGetDatum(xact->tag.vxid.backendId, + xact->tag.vxid.localTransactionId); + nulls[11] = true; /* pid */ + + /* + * Lock mode. Currently all predicate locks are SIReadLocks, which are + * always held (never waiting) + */ + values[12] = CStringGetTextDatum("SIReadLock"); + values[13] = BoolGetDatum(true); + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + SRF_RETURN_DONE(funcctx); } *** a/src/backend/utils/adt/ri_triggers.c --- b/src/backend/utils/adt/ri_triggers.c *************** *** 3308,3314 **** ri_PerformCheck(RI_QueryKey *qkey, SPIPlanPtr qplan, /* * In READ COMMITTED mode, we just need to use an up-to-date regular * snapshot, and we will see all rows that could be interesting. But in ! * SERIALIZABLE mode, we can't change the transaction snapshot. 
If the * caller passes detectNewRows == false then it's okay to do the query * with the transaction snapshot; otherwise we use a current snapshot, and * tell the executor to error out if it finds any rows under the current --- 3308,3314 ---- /* * In READ COMMITTED mode, we just need to use an up-to-date regular * snapshot, and we will see all rows that could be interesting. But in ! * xact-snapshot-based modes, we can't change the transaction snapshot. If the * caller passes detectNewRows == false then it's okay to do the query * with the transaction snapshot; otherwise we use a current snapshot, and * tell the executor to error out if it finds any rows under the current *************** *** 3316,3322 **** ri_PerformCheck(RI_QueryKey *qkey, SPIPlanPtr qplan, * that SPI_execute_snapshot will register the snapshots, so we don't need * to bother here. */ ! if (IsXactIsoLevelSerializable && detectNewRows) { CommandCounterIncrement(); /* be sure all my own work is visible */ test_snapshot = GetLatestSnapshot(); --- 3316,3322 ---- * that SPI_execute_snapshot will register the snapshots, so we don't need * to bother here. */ ! 
if (IsXactIsoLevelXactSnapshotBased && detectNewRows) { CommandCounterIncrement(); /* be sure all my own work is visible */ test_snapshot = GetLatestSnapshot(); *** a/src/backend/utils/misc/guc.c --- b/src/backend/utils/misc/guc.c *************** *** 59,64 **** --- 59,65 ---- #include "storage/bufmgr.h" #include "storage/standby.h" #include "storage/fd.h" + #include "storage/predicate.h" #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" #include "utils/builtins.h" *************** *** 1670,1675 **** static struct config_int ConfigureNamesInt[] = --- 1671,1687 ---- }, { + {"max_predicate_locks_per_transaction", PGC_POSTMASTER, LOCK_MANAGEMENT, + gettext_noop("Sets the maximum number of predicate locks per transaction."), + gettext_noop("The shared predicate lock table is sized on the assumption that " + "at most max_predicate_locks_per_transaction * max_connections distinct " + "objects will need to be locked at any one time.") + }, + &max_predicate_locks_per_xact, + 64, 10, INT_MAX, NULL, NULL + }, + + { {"authentication_timeout", PGC_SIGHUP, CONN_AUTH_SECURITY, gettext_noop("Sets the maximum allowed time to complete client authentication."), NULL, *** a/src/backend/utils/resowner/resowner.c --- b/src/backend/utils/resowner/resowner.c *************** *** 261,267 **** ResourceOwnerReleaseInternal(ResourceOwner owner, --- 261,270 ---- * the top of the recursion. */ if (owner == TopTransactionResourceOwner) + { ProcReleaseLocks(isCommit); + ReleasePredicateLocks(isCommit); + } } else { *** a/src/backend/utils/time/snapmgr.c --- b/src/backend/utils/time/snapmgr.c *************** *** 37,44 **** /* ! * CurrentSnapshot points to the only snapshot taken in a serializable ! * transaction, and to the latest one taken in a read-committed transaction. * SecondarySnapshot is a snapshot that's always up-to-date as of the current * instant, even on a serializable transaction. It should only be used for * special-purpose code (say, RI checking.) --- 37,44 ---- /* ! 
* CurrentSnapshot points to the only snapshot taken in a xact-snapshot-based ! * transaction; otherwise to the latest one taken. * SecondarySnapshot is a snapshot that's always up-to-date as of the current * instant, even on a serializable transaction. It should only be used for * special-purpose code (say, RI checking.) *************** *** 97,107 **** static int RegisteredSnapshots = 0; bool FirstSnapshotSet = false; /* ! * Remembers whether this transaction registered a serializable snapshot at * start. We cannot trust FirstSnapshotSet in combination with ! * IsXactIsoLevelSerializable, because GUC may be reset before us. */ ! static bool registered_serializable = false; static Snapshot CopySnapshot(Snapshot snapshot); --- 97,107 ---- bool FirstSnapshotSet = false; /* ! * Remembers whether this transaction registered a transaction-based snapshot at * start. We cannot trust FirstSnapshotSet in combination with ! * IsXactIsoLevelXactSnapshotBased, because GUC may be reset before us. */ ! static bool registered_xact_snapshot = false; static Snapshot CopySnapshot(Snapshot snapshot); *************** *** 130,150 **** GetTransactionSnapshot(void) FirstSnapshotSet = true; /* ! * In serializable mode, the first snapshot must live until end of ! * xact regardless of what the caller does with it, so we must ! * register it internally here and unregister it at end of xact. */ ! if (IsXactIsoLevelSerializable) { CurrentSnapshot = RegisterSnapshotOnOwner(CurrentSnapshot, TopTransactionResourceOwner); ! registered_serializable = true; } return CurrentSnapshot; } ! if (IsXactIsoLevelSerializable) return CurrentSnapshot; CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); --- 130,153 ---- FirstSnapshotSet = true; /* ! * In xact-snapshot-based isolation levels, the first snapshot must ! * live until end of xact regardless of what the caller does with it, ! * so we must register it internally here and unregister it at end of ! * xact. */ ! 
if (IsXactIsoLevelXactSnapshotBased) { CurrentSnapshot = RegisterSnapshotOnOwner(CurrentSnapshot, TopTransactionResourceOwner); ! registered_xact_snapshot = true; ! if (IsXactIsoLevelFullySerializable) ! RegisterSerializableTransaction(CurrentSnapshot); } return CurrentSnapshot; } ! if (IsXactIsoLevelXactSnapshotBased) return CurrentSnapshot; CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); *************** *** 155,161 **** GetTransactionSnapshot(void) /* * GetLatestSnapshot * Get a snapshot that is up-to-date as of the current instant, ! * even if we are executing in SERIALIZABLE mode. */ Snapshot GetLatestSnapshot(void) --- 158,164 ---- /* * GetLatestSnapshot * Get a snapshot that is up-to-date as of the current instant, ! * even if we are executing in xact-snapshot-based mode. */ Snapshot GetLatestSnapshot(void) *************** *** 515,527 **** void AtEarlyCommit_Snapshot(void) { /* ! * On a serializable transaction we must unregister our private refcount ! * to the serializable snapshot. */ ! if (registered_serializable) UnregisterSnapshotFromOwner(CurrentSnapshot, TopTransactionResourceOwner); ! registered_serializable = false; } --- 518,530 ---- AtEarlyCommit_Snapshot(void) { /* ! * On a xact-snapshot-based transaction we must unregister our private ! * refcount to the xact snapshot. */ ! if (registered_xact_snapshot) UnregisterSnapshotFromOwner(CurrentSnapshot, TopTransactionResourceOwner); ! registered_xact_snapshot = false; } *************** *** 557,561 **** AtEOXact_Snapshot(bool isCommit) SecondarySnapshot = NULL; FirstSnapshotSet = false; ! registered_serializable = false; } --- 560,564 ---- SecondarySnapshot = NULL; FirstSnapshotSet = false; ! 
registered_xact_snapshot = false; } *** a/src/include/access/heapam.h --- b/src/include/access/heapam.h *************** *** 82,89 **** extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction); extern bool heap_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf, Relation stats_relation); ! extern bool heap_hot_search_buffer(ItemPointer tid, Buffer buffer, ! Snapshot snapshot, bool *all_dead); extern bool heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, bool *all_dead); --- 82,89 ---- extern bool heap_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf, Relation stats_relation); ! extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, ! Buffer buffer, Snapshot snapshot, bool *all_dead); extern bool heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, bool *all_dead); *** a/src/include/access/xact.h --- b/src/include/access/xact.h *************** *** 32,41 **** extern int DefaultXactIsoLevel; extern int XactIsoLevel; /* ! * We only implement two isolation levels internally. This macro should ! * be used to check which one is selected. */ ! #define IsXactIsoLevelSerializable (XactIsoLevel >= XACT_REPEATABLE_READ) /* Xact read-only state */ extern bool DefaultXactReadOnly; --- 32,45 ---- extern int XactIsoLevel; /* ! * We implement three isolation levels internally. ! * The two stronger ones use one snapshot per database transaction; ! * the others use one snapshot per statement. ! * Serializable uses predicate locks. ! * These macros should be used to check which isolation level is selected. */ ! #define IsXactIsoLevelXactSnapshotBased (XactIsoLevel >= XACT_REPEATABLE_READ) ! 
#define IsXactIsoLevelFullySerializable (XactIsoLevel == XACT_SERIALIZABLE) /* Xact read-only state */ extern bool DefaultXactReadOnly; *** a/src/include/catalog/pg_am.h --- b/src/include/catalog/pg_am.h *************** *** 49,54 **** CATALOG(pg_am,2601) --- 49,55 ---- bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */ bool amstorage; /* can storage type differ from column type? */ bool amclusterable; /* does AM support cluster command? */ + bool ampredlocks; /* does AM handle predicate locks? */ Oid amkeytype; /* type of data in index, or InvalidOid */ regproc aminsert; /* "insert this tuple" function */ regproc ambeginscan; /* "start new scan" function */ *************** *** 76,82 **** typedef FormData_pg_am *Form_pg_am; * compiler constants for pg_am * ---------------- */ ! #define Natts_pg_am 26 #define Anum_pg_am_amname 1 #define Anum_pg_am_amstrategies 2 #define Anum_pg_am_amsupport 3 --- 77,83 ---- * compiler constants for pg_am * ---------------- */ ! #define Natts_pg_am 27 #define Anum_pg_am_amname 1 #define Anum_pg_am_amstrategies 2 #define Anum_pg_am_amsupport 3 *************** *** 89,124 **** typedef FormData_pg_am *Form_pg_am; #define Anum_pg_am_amsearchnulls 10 #define Anum_pg_am_amstorage 11 #define Anum_pg_am_amclusterable 12 ! #define Anum_pg_am_amkeytype 13 ! #define Anum_pg_am_aminsert 14 ! #define Anum_pg_am_ambeginscan 15 ! #define Anum_pg_am_amgettuple 16 ! #define Anum_pg_am_amgetbitmap 17 ! #define Anum_pg_am_amrescan 18 ! #define Anum_pg_am_amendscan 19 ! #define Anum_pg_am_ammarkpos 20 ! #define Anum_pg_am_amrestrpos 21 ! #define Anum_pg_am_ambuild 22 ! #define Anum_pg_am_ambulkdelete 23 ! #define Anum_pg_am_amvacuumcleanup 24 ! #define Anum_pg_am_amcostestimate 25 ! #define Anum_pg_am_amoptions 26 /* ---------------- * initial contents of pg_am * ---------------- */ ! 
DATA(insert OID = 403 ( btree 5 1 t t t t t t t f t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions )); DESCR("b-tree index access method"); #define BTREE_AM_OID 403 ! DATA(insert OID = 405 ( hash 1 1 f t f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); DESCR("hash index access method"); #define HASH_AM_OID 405 ! DATA(insert OID = 783 ( gist 0 7 f f f t t t t t t 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); DESCR("GiST index access method"); #define GIST_AM_OID 783 ! DATA(insert OID = 2742 ( gin 0 5 f f f t t f f t f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); DESCR("GIN index access method"); #define GIN_AM_OID 2742 --- 90,126 ---- #define Anum_pg_am_amsearchnulls 10 #define Anum_pg_am_amstorage 11 #define Anum_pg_am_amclusterable 12 ! #define Anum_pg_am_ampredlocks 13 ! #define Anum_pg_am_amkeytype 14 ! #define Anum_pg_am_aminsert 15 ! #define Anum_pg_am_ambeginscan 16 ! #define Anum_pg_am_amgettuple 17 ! #define Anum_pg_am_amgetbitmap 18 ! #define Anum_pg_am_amrescan 19 ! #define Anum_pg_am_amendscan 20 ! #define Anum_pg_am_ammarkpos 21 ! #define Anum_pg_am_amrestrpos 22 ! #define Anum_pg_am_ambuild 23 ! #define Anum_pg_am_ambulkdelete 24 ! #define Anum_pg_am_amvacuumcleanup 25 ! #define Anum_pg_am_amcostestimate 26 ! #define Anum_pg_am_amoptions 27 /* ---------------- * initial contents of pg_am * ---------------- */ ! 
DATA(insert OID = 403 ( btree 5 1 t t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions )); DESCR("b-tree index access method"); #define BTREE_AM_OID 403 ! DATA(insert OID = 405 ( hash 1 1 f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); DESCR("hash index access method"); #define HASH_AM_OID 405 ! DATA(insert OID = 783 ( gist 0 7 f f f t t t t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); DESCR("GiST index access method"); #define GIST_AM_OID 783 ! DATA(insert OID = 2742 ( gin 0 5 f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); DESCR("GIN index access method"); #define GIN_AM_OID 2742 *** a/src/include/storage/lwlock.h --- b/src/include/storage/lwlock.h *************** *** 27,32 **** --- 27,36 ---- #define LOG2_NUM_LOCK_PARTITIONS 4 #define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS) + /* Number of partitions the shared predicate lock tables are divided into */ + #define LOG2_NUM_PREDICATELOCK_PARTITIONS 4 + #define NUM_PREDICATELOCK_PARTITIONS (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS) + /* * We have a number of predefined LWLocks, plus a bunch of LWLocks that are * dynamically assigned (e.g., for shared buffers). The LWLock structures *************** *** 70,81 **** typedef enum LWLockId RelationMappingLock, AsyncCtlLock, AsyncQueueLock, /* Individual lock IDs end here */ FirstBufMappingLock, FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS, /* must be last except for MaxDynamicLWLock: */ ! 
NumFixedLWLocks = FirstLockMgrLock + NUM_LOCK_PARTITIONS, MaxDynamicLWLock = 1000000000 } LWLockId; --- 74,89 ---- RelationMappingLock, AsyncCtlLock, AsyncQueueLock, + SerializableXactHashLock, + SerializableFinishedListLock, + SerializablePredicateLockListLock, /* Individual lock IDs end here */ FirstBufMappingLock, FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS, + FirstPredicateLockMgrLock = FirstLockMgrLock + NUM_LOCK_PARTITIONS, /* must be last except for MaxDynamicLWLock: */ ! NumFixedLWLocks = FirstPredicateLockMgrLock + NUM_PREDICATELOCK_PARTITIONS, MaxDynamicLWLock = 1000000000 } LWLockId; *** /dev/null --- b/src/include/storage/predicate.h *************** *** 0 **** --- 1,174 ---- + /*------------------------------------------------------------------------- + * + * predicate.h + * POSTGRES predicate locking definitions. + * + * + * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + #ifndef PREDICATE_H + #define PREDICATE_H + + #include "access/htup.h" + #include "utils/snapshot.h" + + /* GUC variables */ + extern int max_predicate_locks_per_xact; + + /* + * The SERIALIZABLEXACTTAG struct identifies a serializable transaction. + */ + typedef struct SERIALIZABLEXACTTAG + { + VirtualTransactionId vxid; /* We always have one of these. */ + } SERIALIZABLEXACTTAG; + + /* + * Information needed for each serializable database transaction to support SSI techniques. + * TODO SSI: Should inConflict and outConflict be lists? That would allow us to reduce + * false positives, *and* would allow us to guarantee that an immediate retry + * of a transaction would never fail on the exact same conflicts. + * The RAM doesn't look like it would be the limiting factor, but CPU time might + * be -- we should have baseline benchmarks before attempting this. 
+ */ + typedef struct SERIALIZABLEXACT + { + /* hash key */ + SERIALIZABLEXACTTAG tag; + + /* data */ + struct SERIALIZABLEXACT *outConflict; /* ptr to write transaction + * whose data we couldn't + * read. invalid means no + * conflict; self-reference + * means multiple or + * committed. */ + struct SERIALIZABLEXACT *inConflict; /* ptr to read transaction + * which couldn't see our + * write. invalid means no + * conflict; self-reference + * means multiple or + * committed. */ + TransactionId topXid; /* top level xid for the transaction, if one + * exists */ + TransactionId finishedBefore; /* invalid means still running; else + * the struct expires when no tags < + * this. */ + TransactionId xmin; /* the transaction's snapshot xmin */ + SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */ + SHM_QUEUE xids; /* list of associated SERIALIZABLEXID objects */ + SHM_QUEUE finishedLink; /* list link in + * FinishedSerializableTransactions */ + bool rolledBack; /* ignore conflicts when true; allows deferred + * cleanup */ + } SERIALIZABLEXACT; + + + typedef enum PredicateLockTargetType + { + PREDLOCKTAG_RELATION, + PREDLOCKTAG_PAGE, + PREDLOCKTAG_TUPLE + /* TODO Other types may be needed for index locking */ + } PredicateLockTargetType; + + /* + * The PREDICATELOCKTARGETTAG struct is defined to fit into 16 + * bytes with no padding. Note that this would need adjustment if we were + * to widen Oid or BlockNumber to more than 32 bits. + */ + typedef struct PREDICATELOCKTARGETTAG + { + uint32 locktag_field1; /* a 32-bit ID field */ + uint32 locktag_field2; /* a 32-bit ID field */ + uint32 locktag_field3; /* a 32-bit ID field */ + uint16 locktag_field4; /* a 16-bit ID field */ + uint16 locktag_field5; /* a 16-bit ID field */ + } PREDICATELOCKTARGETTAG; + + /* + * These macros define how we map logical IDs of lockable objects into + * the physical fields of PREDICATELOCKTARGETTAG. Use these to set up values, + * rather than accessing the fields directly. 
Note multiple eval of target! + * + * TODO SSI: If we always use the same fields for the same type of value, + * we should rename these. Holding off until it's clear there are no exceptions. + * Since indexes are relations with blocks and tuples, it's looking likely that + * the rename will be possible. If not, we may need to divide the last field + * and use part of it for a target type, so that we know how to interpret the + * data.. + */ + #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = InvalidBlockNumber, \ + (locktag).locktag_field4 = InvalidOffsetNumber, \ + (locktag).locktag_field5 = 0) + + #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = (blocknum), \ + (locktag).locktag_field4 = InvalidOffsetNumber, \ + (locktag).locktag_field5 = 0) + + #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = (blocknum), \ + (locktag).locktag_field4 = (offnum), \ + (locktag).locktag_field5 = 0) + + #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \ + ((locktag).locktag_field1) + #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \ + ((locktag).locktag_field2) + #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \ + ((locktag).locktag_field3) + #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \ + ((locktag).locktag_field4) + #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \ + (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \ + (((locktag).locktag_field3 != InvalidBlockNumber) ? 
PREDLOCKTAG_PAGE : \ + PREDLOCKTAG_RELATION)) + + typedef struct PredicateLockData + { + int nelements; + PREDICATELOCKTARGETTAG *locktags; + SERIALIZABLEXACT *xacts; + } PredicateLockData; + + /* + * function prototypes + */ + + /* housekeeping for shared memory predicate lock structures */ + extern void InitPredicateLocks(void); + extern Size PredicateLockShmemSize(void); + + /* predicate lock reporting */ + extern PredicateLockData *GetPredicateLockStatusData(void); + + /* predicate lock maintenance */ + extern void RegisterSerializableTransaction(const Snapshot snapshot); + extern void PredicateLockRelation(const Relation relation); + extern void PredicateLockPage(const Relation relation, const BlockNumber blkno); + extern void PredicateLockTuple(const Relation relation, const HeapTuple tuple); + extern void PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno); + extern void PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno); + extern void ReleasePredicateLocks(const bool isCommit); + + /* conflict detection (may also trigger rollback) */ + extern void CheckForSerializableConflictOut(const bool valid, const Relation relation, const HeapTuple tuple, const Buffer buffer); + extern void CheckForSerializableConflictIn(const Relation relation, const HeapTuple tuple, const Buffer buffer); + + /* final rollback checking */ + extern void PreCommit_CheckForSerializationFailure(void); + + #endif /* PREDICATE_H */ *** a/src/include/storage/shmem.h --- b/src/include/storage/shmem.h *************** *** 70,74 **** extern void SHMQueueInsertBefore(SHM_QUEUE *queue, SHM_QUEUE *elem); --- 70,75 ---- extern Pointer SHMQueueNext(SHM_QUEUE *queue, SHM_QUEUE *curElem, Size linkOffset); extern bool SHMQueueEmpty(SHM_QUEUE *queue); + extern bool SHMQueueIsDetached(SHM_QUEUE *queue); #endif /* SHMEM_H */ *** a/src/test/regress/GNUmakefile --- b/src/test/regress/GNUmakefile 
*************** *** 135,140 **** tablespace-setup: --- 135,157 ---- ## + ## Prepare for dtester tests + ## + pg_dtester.py: pg_dtester.py.in GNUmakefile $(top_builddir)/src/Makefile.global + sed -e 's,@bindir@,$(bindir),g' \ + -e 's,@libdir@,$(libdir),g' \ + -e 's,@pkglibdir@,$(pkglibdir),g' \ + -e 's,@datadir@,$(datadir),g' \ + -e 's/@VERSION@/$(VERSION)/g' \ + -e 's/@host_tuple@/$(host_tuple)/g' \ + -e 's,@GMAKE@,$(MAKE),g' \ + -e 's/@enable_shared@/$(enable_shared)/g' \ + -e 's/@GCC@/$(GCC)/g' \ + $< >$@ + chmod a+x $@ + + + ## ## Run tests ## *************** *** 152,157 **** installcheck-parallel: all --- 169,179 ---- standbycheck: all $(pg_regress_call) --psqldir=$(PSQLDIR) --schedule=$(srcdir)/standby_schedule --use-existing + dcheck: pg_dtester.py + ./pg_dtester.py --temp-install --top-builddir=$(top_builddir) \ + --multibyte=$(MULTIBYTE) $(MAXCONNOPT) $(NOLOCALE) + + # old interfaces follow... runcheck: check *** /dev/null --- b/src/test/regress/pg_dtester.py.in *************** *** 0 **** --- 1,1626 ---- + #!/usr/bin/python + + #------------------------------------------------------------------------- + # + # dtester.py.in + # + # Sample test suite running two concurrent transactions, showing + # off some capabilities of dtester. 
+ # + # Copyright (c) 2006-2010, Markus Wanner + # + #------------------------------------------------------------------------- + + import re, os, sys, getopt + from twisted.internet import defer, reactor + from twisted.python import failure + + from dtester.events import EventMatcher, EventSource, Event, \ + ProcessOutputEvent, ProcessErrorEvent, ProcessEndedEvent + from dtester.exceptions import TestAborted, TestFailure + from dtester.test import TestSuite, BaseTest, SyncTest + from dtester.reporter import StreamReporter, CursesReporter + from dtester.runner import Runner, Timeout + + # ****** definition of tests and suites *********************************** + + class InstallationSuite(TestSuite): + + setUpDescription = "creating temporary installation" + tearDownDescription = "removing temporary installation" + + needs = (('shell', "IShell or something"),) + + def setUp(self): + # inherit getConfig from the shell + setattr(self, 'getConfig', self.shell.getConfig) + setattr(self, 'runCommand', self.shell.runCommand) + setattr(self, 'recursive_remove', self.shell.recursive_remove) + + # (re) create an installation directory + self.pg_inst_dir = self.shell.getConfig('inst_dir') + if os.path.exists(self.pg_inst_dir): + self.shell.recursive_remove(self.pg_inst_dir) + os.mkdir(self.pg_inst_dir) + + # install into that directory + proc = self.shell.runCommand('make', 'make', + args=['make', '-C', self.shell.getConfig('top-builddir'), + 'DESTDIR=%s' % self.pg_inst_dir, 'install', + 'with_perl=no', 'with_python=no'], + lineBasedOutput=True) + + d = self.waitFor(proc, EventMatcher(ProcessEndedEvent)) + d.addCallback(self.makeTerminated) + proc.start() + + # FIXME: how to properly handle these? 
+ self.shell.addEnvPath(self.shell.getConfig('bindir')) + self.shell.addEnvLibraryPath(self.shell.getConfig('libdir')) + return d + + def makeTerminated(self, event): + if event.exitCode != 0: + raise Exception("Initdb returned %d" % event.exitCode) + else: + return True + + def tearDown(self): + # The installation procedure should be able to simply override any + # formerly installed files, so we save the time to clean up the + # installation directory. + return + + + class InitdbSuite(TestSuite): + + args = (('number', int), ) + needs = (('shell', "IShell or something"),) + + def setUpDescription(self): + return "initializing database system %d" % self.number + + def tearDownDescription(self): + return "removing database system %d" % self.number + + def getNumber(self): + return self.number + + def getDir(self): + return self.dbdir + + def setUp(self): + self.dbdir = "%s%d" % \ + (self.shell.getConfig('pgdata_prefix'), self.number) + proc = self.shell.runCommand( + 'initdb-%d' % self.number, + 'initdb', args = [ + 'initdb', '-D', self.dbdir, + '-A', 'trust', '--noclean'], + lineBasedOutput=True) + + d = defer.Deferred() + proc.addHook(EventMatcher(ProcessEndedEvent), + self.initdb_terminated, d) + proc.start() + return d + + def initdb_terminated(self, event, d): + if event.exitCode != 0: + d.errback(Exception("Initdb returned %d" % event.exitCode)) + else: + d.callback(True) + + def tearDown(self): + self.shell.recursive_remove( + "%s%d" % (self.shell.getConfig('pgdata_prefix'), self.number)) + + + class PostmasterSuite(TestSuite): + + needs = (('shell', "IShell or something"), + ('dbdir', "IDatabaseDir"),) + + def setUpDescription(self): + return "starting database system %d" % self.dbdir.getNumber() + + def tearDownDescription(self): + return "stopping database system %d" % self.dbdir.getNumber() + + def getPort(self): + return self.port + + def setUp(self): + setattr(self, 'getNumber', self.dbdir.getNumber) + + self.port = self.shell.getConfig('temp-port') + 
self.dbdir.getNumber() + + args = ['postmaster', '-d5', + '-D', self.dbdir.getDir(), + '-i', '-p', str(self.port)] + if self.shell.getConfig('enable_cassert'): + args += "-A1" + + self.postmaster = self.shell.runCommand( + 'postmaster%d' % self.dbdir.getNumber(), + 'postmaster', + args = args, + lineBasedOutput=True) + + d = defer.Deferred() + self.readyHook = \ + self.postmaster.addHook(EventMatcher(ProcessErrorEvent, + "database system is ready to accept connections"), + self.postmaster_ready, d) + + self.unexpectedTerminationHook = \ + self.postmaster.addHook(EventMatcher(ProcessEndedEvent), + self.postmaster_terminated) + self.postmaster.start() + return d + + def postmaster_ready(self, event, d): + # it's sufficient if we're called once + self.postmaster.removeHook(self.readyHook) + d.callback(None) + + def postmaster_terminated(self, event): + exitCode = 'undef' + if hasattr(event, 'exitCode'): + exitCode = event.exitCode + elif hasattr(event, 'data'): + exitCode = repr(event.data) + self.abort("postmaster %d unexpectedly terminated (exit code %s)" % \ + (self.dbdir.getNumber(), exitCode)) + + def tearDown(self): + self.postmaster.removeHook(self.unexpectedTerminationHook) + if not self.aborted: + d = defer.Deferred() + self.postmaster.addHook(EventMatcher(ProcessEndedEvent), + lambda event: d.callback(None)) + self.postmaster.stop() + return d + else: + return True + + + class TestDatabaseSuite(TestSuite): + + args = (('dbname', str),) + needs = (('shell', "IShell or something"), + ('pg', "IPostmaster"),) + + def setUpDescription(self): + return "creating database %s at server %d" % \ + (self.dbname, self.pg.getNumber()) + + def tearDownDescription(self): + return "not (!) 
dropping database %s at server %d" % \ + (self.dbname, self.pg.getNumber()) + + def getDbname(self): + return self.dbname + + def setUp(self): + setattr(self, "getPort", self.pg.getPort) + setattr(self, "getNumber", self.pg.getNumber) + + self.proc = self.shell.runCommand( + 'createdb%d' % self.pg.getNumber(), + 'createdb', + args = ['createdb', + '-p', str(self.getPort()), self.dbname], + lineBasedOutput=True) + + d = defer.Deferred() + self.proc.addHook(EventMatcher(ProcessEndedEvent), + self.createdb_terminated, d) + self.proc.start() + return d + + def createdb_terminated(self, event, d): + if event.exitCode != 0: + d.errback(Exception("createdb terminated with code %d" % \ + event.exitCode)) + else: + d.callback(None) + + def tearDown(self): + if self.pg.aborted: + return True + + # Hm.. this interferes with the postmaster suites, which need + # to be started and stopped several times on top of a test database, + # however, creating and dropping it certainly depends on a running + # postmaster. Not sure how to solve this, at the moment I'm just + # skipping cleanup, i.e. dropdb. 
+ return True + + self.proc = self.shell.runCommand( + 'dropdb%d' % self.pg.getNumber(), + 'dropdb', + args = ['dropdb', + '-p', str(self.getPort()), self.dbname], + lineBasedOutput=True) + + d = defer.Deferred() + self.proc.addHook(EventMatcher(ProcessEndedEvent), + self.dropdb_terminated, d) + self.proc.start() + return d + + def dropdb_terminated(self, event, d): + if event.exitCode != 0: + d.errback(Exception("dropdb returned with %d" % \ + event.exitCode)) + else: + d.callback(None) + + + class SqlConnectionSuite(TestSuite): + + args = (('dbname', str),) + needs = (('shell', "IShell or something"), + ('db', "IPostmaster")) + + def setUpDescription(self): + return "connecting to database %s at server %d" % \ + (self.dbname, self.db.getNumber()) + def tearDownDescription(self): + return "disconnecting from database %s at server %d" % \ + (self.dbname, self.db.getNumber()) + + def getDbname(self): + return self.dbname + + def setUp(self): + self.psql = self.shell.runCommand( + 'psql%d' % self.db.getNumber(), + 'psql', + args=['psql', '-AEn', + '--pset=pager=off', '--pset=columns=0', + '-p', str(self.db.getPort()), + self.dbname]) + + # initialize the output buffer and attach a first output collector + # *before* the process is started. + self.output_buffer = "" + d = defer.Deferred() + self.outputCollectorDeferred = d + self.outputCollectorHook = self.psql.addHook( + EventMatcher(ProcessOutputEvent), self.outputCollector, + None, d) + + # Mark as being in used, until we get to the commandline + self.inUse = True + self.workQueue = [] + + # also add a termination hook + self.unexpectedTerminationHook = self.psql.addHook( + EventMatcher(ProcessEndedEvent), self.psql_terminated) + + # then schedule start of the psql process and return the deferred + # *before* starting the process. 
+ reactor.callLater(0.0, self.psql.start) + return d + + def psql_terminated(self, event): + exitCode = "undef" + if hasattr(event, 'exitCode'): + exitCode = event.exitCode + elif hasattr(event, 'data'): + exitCode = repr(event.data) + + # If there's an outputCollectorHook, the abort method won't catch + # and we have to wait for the timeout to trigger, instead of + # acting on process termination. We thus save the outputCollector + # deferred and send it an errback with the failure. + if self.outputCollectorHook: + self.outputCollectorDeferred.errback( \ + TestAborted("psql to server %d unexpectedly terminated (exit code %s)" % ( \ + self.db.getNumber(), exitCode))) + self.abort( + "psql to server %d unexpectedly terminated (exit code %s)" % ( \ + self.db.getNumber(), exitCode)) + + def tearDown(self): + self.psql.removeHook(self.unexpectedTerminationHook) + + d = defer.Deferred() + self.psql.addHook(EventMatcher(ProcessEndedEvent), + lambda event: d.callback(None)) + reactor.callLater(0.0, self.psql.write, "\\q\n") + reactor.callLater(5.0, self.psql.stop) + return d + + def outputCollector(self, event, query, d): + self.output_buffer += event.data + + cmdprompt = self.dbname + '=#' + cpos = self.output_buffer.find(cmdprompt) + + if cpos >= 0: + self.psql.removeHook(self.outputCollectorHook) + self.outputCollectorHook = False + result = self.output_buffer[:cpos] + self.output_buffer = self.output_buffer[cpos + len(cmdprompt):] + if len(self.output_buffer) > 0 and self.output_buffer != ' ': + print "rest: %s" % repr(self.output_buffer) + if d: + # remove the command prompt at the end + result = result[:cpos] + + if query: + # remove the query string at the beginning + query_len = len(query) + if result[:query_len] != query: + raise Exception("Query not found at beginning of psql answer.") + + result = result[query_len:] + while (len(result) > 1) and (result[0] in ("\n", "\r", " ")): + result = result[1:] + reactor.callLater(0.0, d.callback, result) + + self.inUse = 
False + if len(self.workQueue) > 0: + assert not self.inUse + job = self.workQueue.pop() + d1 = job['method'](*job['args']) + d1.chainDeferred(job['deferred']) + + def query(self, query): + if self.inUse: + d = defer.Deferred() + self.workQueue.append({'deferred': d, + 'method': self.query, + 'args': (query,)}) + return d + + assert not self.inUse + assert not self.outputCollectorHook + + self.inUse = True + self.output_buffer = "" + d = defer.Deferred() + self.outputCollectorHook = self.psql.addHook( + EventMatcher(ProcessOutputEvent), self.outputCollector, query, d) + d.addCallback(self.parseQueryResult) + + # defer writing to the process, so that the caller has the + # opportunity to add callbacks to the deferred we return. + reactor.callLater(0.0, self.psql.write, query + "\n") + + return d + + def parseQueryResult(self, result): + rawlines = result.split('\n') + + lines = [] + for line in rawlines: + line = line.strip() + if line.startswith("ROLLBACK"): + raise Exception("transaction rolled back (%s)" % query) + if line.startswith("message type"): + raise Exception("protocol error: %s" % line) + if len(line) > 0 and not line.startswith("NOTICE:") \ + and not line.startswith("ROLLBACK"): + lines.append(line) + + try: + assert len(lines) >= 2 + + lines = map(lambda x: x.strip(), lines) + headLine = lines[0] + tailLine = lines[-1] + + fields = headLine.split('|') + rows = [] + for row in lines[1:-1]: + attrs = row.split('|') + assert len(attrs) == len(fields) + x = {} + for i in range(len(attrs)): + x[fields[i]] = attrs[i].strip() + rows.append(x) + + x = re.compile("\((\d+) rows?\)").search(tailLine) + if x: + if not int(x.group(1)) == len(rows): + raise Exception("number of rows doesn't match: %s vs %d for: '%s'" % ( + x.group(1), len(rows), lines)) + else: + raise Exception("final number of rows line doesn't match.\n------------\n%s\n---------------\n" % lines) + return rows + except Exception, e: + import traceback + print "error parsing query result: %s" % e 
+ traceback.print_exc() + raise e + # return [] + + def operation(self, query, expResult=None): + if self.inUse: + d = defer.Deferred() + self.workQueue.append({'deferred': d, + 'method': self.operation, + 'args': (query, expResult)}) + return d + + assert not self.inUse + assert not self.outputCollectorHook + + self.inUse = True + self.output_buffer = "" + d = defer.Deferred() + self.outputCollectorDeferred = d + self.outputCollectorHook = self.psql.addHook( + EventMatcher(ProcessOutputEvent), self.outputCollector, query, d) + d.addCallback(self.checkQueryResult, query, expResult) + + # defer writing to the process, so that the caller has the + # opportunity to add callbacks to the deferred we return. + reactor.callLater(0.0, self.psql.write, query + "\n") + + return d + + def checkQueryResult(self, result, query, expResult): + lines = [] + for line in result.split("\n"): + line = line.strip() + if len(line) > 0 and not line.startswith("WARNING:") \ + and not line.startswith("NOTICE:"): + lines.append(line) + lines = "\n".join(lines) + if expResult: + if isinstance(expResult, str): + self.assertEqual(expResult, lines, + "didn't get expected result for query '%s'" % query) + elif isinstance(expResult, list): + if not lines in expResult: + raise TestFailure("didn't get expected result", + "no result matches, got:\n%s\nfor query: '%s'\n" % (lines, query)) + return lines + + + class TestDatabaseConnection(BaseTest): + + needs = (('conn', "ISqlConnection"),) + + description = "database connection" + + def run(self): + return self.conn.query("SELECT 1 AS test;") + + + # FIXME: that's not actually a test, but it modifies the database state + class PopulateTestDatabase(BaseTest): + + needs = (('conn', "ISqlConnection"),) + + description = "populate test database" + + def run(self): + conn = self.conn + + # Create a test table for use in TestConcurrentUpdates and fill it + # with two test tuples. 
+ d = conn.operation("CREATE TABLE test (i int PRIMARY KEY, t text);", + "CREATE TABLE") + d.addCallback(lambda x: conn.operation( + "INSERT INTO test VALUES (5, 'apple');", + "INSERT 0 1")) + d.addCallback(lambda x: conn.operation( + "INSERT INTO test VALUES (7, 'pear');", + "INSERT 0 1")) + d.addCallback(lambda x: conn.operation( + "INSERT INTO test VALUES (11, 'banana');", + "INSERT 0 1")) + return d + + + class PermutationTest(SyncTest): + """ Abstract class for testing a set of steps in all permutations of execution order. + This counts as a single test, although a subclass may accumulate counts which may be of + interest, and should therefore be shown regardless of success or failure of the test. + """ + + # stepDictionary maps a step ID to a function to run for that step. + stepDictionary = {} + + # stepThreading is a list of lists. + # All permutations of interleaving of steps from the sublists will be generated. + # Steps from within each sublist are kept in order; only the interleaving is variable. + stepThreading = [[]] + + # Override this to provide any per-iteration (permutation) setup. + def setUpIteration(self, stepIdList): + pass + + # Override this to provide any per-iteration (permutation) teardown. 
+ def tearDownIteration(self, stepIdList): + pass + + def runIterationStep(self, stepId): + p = self.stepDictionary[stepId] + p() + + def runIterationSteps(self, stepIdList): + try: + self.setUpIteration(stepIdList) + for stepId in stepIdList: + self.runIterationStep(stepId) + finally: + self.tearDownIteration(stepIdList) + + def runPermutations(self, a): + self.runPermutations_recurse([], a) + + def runPermutations_recurse(self, p, a): + found = False + for i in range(len(a)): + if len(a[i]) > 0: + found = True + r = p[:] + b = a[:] + r.append(b[i][0]) + b[i] = b[i][1:] + self.runPermutations_recurse(r, b) + if not found: + self.runIterationSteps(p) + + # If the dictionary is set up in this method, there can be references + # to class methods and fields. + def populateStepDictionary(self): + pass + + def run(self): + self.populateStepDictionary() + self.runPermutations(self.stepThreading) + # The last two lines of output for the last entry seem to disappear??? + print + print + + + class DummyPermutationTest(PermutationTest): + """ Simple test of the PermutationTest abstract class. + """ + + description = "simple test of the PermutationTest abstract class" + + stepThreading = [['r1x','c1'],['r2x','c2']] + + def setUpIteration(self, stepIdList): + print stepIdList + + def tearDownIteration(self, stepIdList): + print + + def printStepId(self, stepId): + print stepId, + + def populateStepDictionary(self): + self.stepDictionary = { + 'r1x': lambda : self.printStepId('r1x'), + 'c1': lambda : self.printStepId('c1'), + 'r2x': lambda : self.printStepId('r2x'), + 'c2': lambda : self.printStepId('c2') + } + + + class DatabasePermutationTest(PermutationTest): + """ Abstract class to provide framework for using an IterativeTest for database queries. 
+ """ + + commitRequiredCount = 0 + commitRequiredOK = 0 + rollbackRequiredCount = 0 + rollbackRequiredOK = 0 + commitPreferredCount = 0 + commitPreferredOK = 0 + + serializationFailure = False + + def commitRequired(self, stepIdList): + return True + + def rollbackRequired(self, stepIdList): + return False + + def countProgress(self, stepIdList): + if self.rollbackRequired(stepIdList): + self.rollbackRequiredCount += 1 + if self.serializationFailure: + self.rollbackRequiredOK += 1 + else: + if self.commitRequired(stepIdList): + self.commitRequiredCount += 1 + if not self.serializationFailure: + self.commitRequiredOK += 1 + else: + self.commitPreferredCount += 1 + if not self.serializationFailure: + self.commitPreferredOK += 1 + + def runIterationSteps(self, stepIdList): + try: + self.setUpIteration(stepIdList) + for stepId in stepIdList: + self.runIterationStep(stepId) + self.countProgress(stepIdList) + finally: + self.tearDownIteration(stepIdList) + + def tryOperation(self, conn, sql): + result = self.syncCall(10, conn.operation, sql), + for line in result: + if len(line) > 0 and line.startswith("ERROR: could not serialize"): + self.serializationFailure = True + else: + if len(line) > 0 and line.startswith("ERROR:"): + raise TestFailure("failure other than serializable encountered: " + line, line) + + def printStatistics(self): + print 'rollback required: ', self.rollbackRequiredOK, '/', self.rollbackRequiredCount + print 'commit required: ', self.commitRequiredOK, '/', self.commitRequiredCount + print 'commit preferred: ', self.commitPreferredOK, '/', self.commitPreferredCount + + def run(self): + self.populateStepDictionary() + self.runPermutations(self.stepThreading) + self.printStatistics() + # The last two lines of output for the last entry seem to disappear??? 
+ print + print + if self.rollbackRequiredOK < self.rollbackRequiredCount: + raise TestFailure("serialization anomalies incorrectly allowed", + "Database integrity not protected.") + if self.commitRequiredOK < self.commitRequiredCount: + raise TestFailure("serialization failure occurred when it should not have", + "Transactions we thought we knew how to recognize as safe resulted in a rollback..") + + def printStepResults(self, stepIdList): + if self.serializationFailure: + if self.commitRequired(stepIdList): + print 'rolled back ??' + else: + if not self.rollbackRequired(stepIdList): + print 'rolled back ?' + else: + print 'rolled back' + else: + if self.rollbackRequired(stepIdList): + print 'committed ***' + else: + print 'committed' + + + class SimpleWriteSkewTest(DatabasePermutationTest): + """ Write skew test. + This test has two serializable transactions: one which updates all + 'apple' rows to 'pear' and one which updates all 'pear' rows to + 'apple'. If these were serialized (run one at a time) either + value could be present, but not both. One must be rolled back to + prevent the write skew anomaly. 
+ """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "write skew test" + + stepThreading = [['rwx1','c1'],['rwx2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'rwx1': lambda : self.tryOperation(self.conn1, "UPDATE test SET t = 'apple' WHERE t = 'pear';"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'rwx2': lambda : self.tryOperation(self.conn2, "UPDATE test SET t = 'pear' WHERE t = 'apple';"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + print stepIdList, + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "UPDATE test SET t = 'apple' WHERE i = 5;", "UPDATE 1") + self.syncCall(10, self.conn1.operation, "UPDATE test SET t = 'pear' WHERE i = 7;", "UPDATE 1") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return (stepIdList.index('c1') < stepIdList.index('rwx2') + or stepIdList.index('c2') < stepIdList.index('rwx1')) + + def rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class ReceiptReportTest(DatabasePermutationTest): + """ Daily Report of Receipts test. + This test doesn't persist a bad state in the database; rather, it + provides a view of the data which is not consistent with any + order of execution of the serializable transactions. It + demonstrates a situation where the deposit date for receipts could + be changed and a report of the closed day's receipts subsequently + run which will miss a receipt from the date which has been closed. 
+ """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection'), + ('conn3', 'ISqlConnection')) + + description = "daily report of receipts test" + + stepThreading = [['rxwy1','c1'],['wx2','c2'],['rx3','ry3','c3']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'rxwy1': lambda : self.tryOperation(self.conn1, "INSERT INTO receipt VALUES (3, (SELECT deposit_date FROM ctl WHERE k = 'receipt'), 4.00);"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'wx2': lambda : self.tryOperation(self.conn2, "UPDATE ctl SET deposit_date = DATE '2008-12-23' WHERE k = 'receipt';"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;"), + 'rx3': lambda : self.tryOperation(self.conn3, "SELECT * FROM ctl WHERE k = 'receipt';"), + 'ry3': lambda : self.tryOperation(self.conn3, "SELECT * FROM receipt WHERE deposit_date = DATE '2008-12-22';"), + 'c3': lambda : self.tryOperation(self.conn3, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + print stepIdList, + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS ctl, receipt;") + self.syncCall(10, self.conn1.operation, "CREATE TABLE ctl (k text NOT NULL PRIMARY KEY, deposit_date date NOT NULL);") + self.syncCall(10, self.conn1.operation, "INSERT INTO ctl VALUES ('receipt', DATE '2008-12-22');") + self.syncCall(10, self.conn1.operation, "CREATE TABLE receipt (receipt_no int NOT NULL PRIMARY KEY, deposit_date date NOT NULL, amount numeric(13,2));") + self.syncCall(10, self.conn1.operation, "INSERT INTO receipt VALUES (1, (SELECT deposit_date FROM ctl WHERE k = 'receipt'), 1.00);") + self.syncCall(10, self.conn1.operation, "INSERT INTO receipt VALUES (2, (SELECT deposit_date FROM ctl WHERE k = 'receipt'), 2.00);") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, 
self.conn3.operation, "BEGIN TRANSACTION READ ONLY ISOLATION LEVEL SERIALIZABLE READ ONLY;", "BEGIN") + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.syncCall(10, self.conn3.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( (stepIdList.index('c1') < stepIdList.index('wx2') + and stepIdList.index('c1') < stepIdList.index('rx3')) + or (stepIdList.index('c2') < stepIdList.index('rxwy1') + and stepIdList.index('c2') < stepIdList.index('rx3')) + or (stepIdList.index('c3') < stepIdList.index('rxwy1') + and stepIdList.index('c3') < stepIdList.index('wx2')) + or (stepIdList.index('c2') < stepIdList.index('rxwy1') + and stepIdList.index('c3') < stepIdList.index('rxwy1')) + or (stepIdList.index('c1') < stepIdList.index('wx2') + and stepIdList.index('c3') < stepIdList.index('wx2')) + or (stepIdList.index('c1') < stepIdList.index('rx3') + and stepIdList.index('c2') < stepIdList.index('rx3'))) + + def rollbackRequired(self, stepIdList): + return ((stepIdList.index('c2') < stepIdList.index('c1') + and stepIdList.index('c2') < stepIdList.index('c3') + and stepIdList.index('rxwy1') < stepIdList.index('c2') + and stepIdList.index('rx3') < stepIdList.index('c1') + ############################################################# + # The following test excludes some rows from rollback + # required for which we know our current SSI algorithm + # requires a rollback, but which don't, in fact, cause + # any anomaly. If we determine that we can allow pivots + # in which conflictIn and conflictOut are separate and + # overlapping transactions, these can be committed. + # To include these permutations in the "rollback required" + # count, comment out the following line. 
+ and stepIdList.index('c2') < stepIdList.index('rx3') + ############################################################# + ) + + ############################################################# + # An anomaly can't actually occur based on the following + # "or" clause, but we know that our algorithm can't + # currently detect that, because T2's conflictIn is set + # to a self-reference because of multiple conflicts. + # To count these in the "rollback required" list, uncomment + # this section; otherwise they are "commit preferred".. + # or (stepIdList.index('rxwy1') < stepIdList.index('c1') + # and stepIdList.index('rxwy1') < stepIdList.index('c2') + # and stepIdList.index('rxwy1') < stepIdList.index('c3') + # and stepIdList.index('wx2') < stepIdList.index('c1') + # and stepIdList.index('wx2') < stepIdList.index('c2') + # and stepIdList.index('wx2') < stepIdList.index('c3') + # and stepIdList.index('rx3') < stepIdList.index('c1') + # and stepIdList.index('rx3') < stepIdList.index('c2') + # and stepIdList.index('rx3') < stepIdList.index('c3') + # ) + ############################################################# + ) + + + class TemporalRangeIntegrityTest(DatabasePermutationTest): + """ Temporal range integrity test. + Snapshot integrity fails with simple referential integrity tests, + but those don't make for good demonstrations because people just + say that foreign key definitions should be used instead. There + are many integrity tests which are conceptually very similar but + don't have built-in support which will fail when used in triggers. + This is intended to illustrate such cases. It is obviously very + hard to exercise all these permutations when the code is actually + in a trigger; this test pulls what would normally be inside of + triggers out to the top level to control the permutations. 
+ """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "temporal range integrity test" + + stepThreading = [['rx1','wy1','c1'],['ry2','wx2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'rx1': lambda : self.tryOperation(self.conn1, "SELECT count(*) FROM statute WHERE statute_cite = '123.45(1)a' AND eff_date <= DATE '2009-05-15' AND (exp_date IS NULL OR exp_date > DATE '2009-05-15');"), + 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO offense VALUES (1, '123.45(1)a', DATE '2009-05-15');"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'ry2': lambda : self.tryOperation(self.conn2, "SELECT count(*) FROM offense WHERE statute_cite = '123.45(1)a' AND offense_date >= DATE '2008-01-01';"), + 'wx2': lambda : self.tryOperation(self.conn2, "DELETE FROM statute WHERE statute_cite = '123.45(1)a' AND eff_date = DATE '2008-01-01';"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS statute, offense;", "DROP TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE statute (statute_cite text NOT NULL, eff_date date NOT NULL, exp_date date, CONSTRAINT statute_pkey PRIMARY KEY (statute_cite, eff_date));", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "INSERT INTO statute VALUES ('123.45(1)a', DATE '2008-01-01', NULL);", "INSERT 0 1") + self.syncCall(10, self.conn1.operation, "CREATE TABLE offense (offense_no int NOT NULL, statute_cite text NOT NULL, offense_date date NOT NULL, CONSTRAINT offense_pkey PRIMARY KEY (offense_no));", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + print stepIdList, + + def tearDownIteration(self, stepIdList): + 
self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( stepIdList.index('c1') < stepIdList.index('ry2') + or stepIdList.index('c2') < stepIdList.index('rx1')) + + def rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class ProjectManagerTest(DatabasePermutationTest): + """ Project manager test. + Ensure that the person who is on the project as a manager + is flagged as a project manager in the person table. + """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "project manager test" + + stepThreading = [['rx1','wy1','c1'],['ry2','wx2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'rx1': lambda : self.tryOperation(self.conn1, "SELECT count(*) FROM person WHERE person_id = 1 AND is_project_manager;"), + 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO project VALUES (101, 'Build Great Wall', 1);"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'ry2': lambda : self.tryOperation(self.conn2, "SELECT count(*) FROM project WHERE project_manager = 1;"), + 'wx2': lambda : self.tryOperation(self.conn2, "UPDATE person SET is_project_manager = false WHERE person_id = 1;"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS person, project;", "DROP TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE person (person_id int NOT NULL PRIMARY KEY, name text NOT NULL, is_project_manager bool NOT NULL);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "INSERT INTO person VALUES (1, 'Robert Haas', true);", "INSERT 0 1") + self.syncCall(10, self.conn1.operation, "CREATE TABLE project (project_no int NOT NULL PRIMARY KEY, description text NOT 
NULL, project_manager int NOT NULL);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + print stepIdList, + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( stepIdList.index('c1') < stepIdList.index('ry2') + or stepIdList.index('c2') < stepIdList.index('rx1')) + + def rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class ClassroomSchedulingTest(DatabasePermutationTest): + """ Classroom scheduling test. + Ensure that the classroom is not scheduled more than once + for any moment in time. + """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "classroom scheduling test" + + stepThreading = [['rx1','wy1','c1'],['ry2','wx2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'rx1': lambda : self.tryOperation(self.conn1, "SELECT count(*) FROM room_reservation WHERE room_id = '101' AND start_time < TIMESTAMP WITH TIME ZONE '2010-04-01 14:00' AND end_time > TIMESTAMP WITH TIME ZONE '2010-04-01 13:00';"), + 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO room_reservation VALUES ('101', TIMESTAMP WITH TIME ZONE '2010-04-01 13:00', TIMESTAMP WITH TIME ZONE '2010-04-01 14:00', 'Carol');"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'ry2': lambda : self.tryOperation(self.conn2, "SELECT count(*) FROM room_reservation WHERE room_id = '101' AND start_time < TIMESTAMP WITH TIME ZONE '2010-04-01 14:30' AND end_time > TIMESTAMP WITH TIME ZONE '2010-04-01 13:30';"), + 'wx2': lambda : self.tryOperation(self.conn2, "UPDATE room_reservation SET start_time = TIMESTAMP WITH TIME ZONE '2010-04-01 
13:30', end_time = TIMESTAMP WITH TIME ZONE '2010-04-01 14:30' WHERE room_id = '101' AND start_time = TIMESTAMP WITH TIME ZONE '2010-04-01 10:00';"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS room_reservation;", "DROP TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE room_reservation (room_id text NOT NULL, start_time timestamp with time zone NOT NULL, end_time timestamp with time zone NOT NULL, description text NOT NULL, CONSTRAINT room_reservation_pkey PRIMARY KEY (room_id, start_time));", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "INSERT INTO room_reservation VALUES ('101', TIMESTAMP WITH TIME ZONE '2010-04-01 10:00', TIMESTAMP WITH TIME ZONE '2010-04-01 11:00', 'Bob');", "INSERT 0 1") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + print stepIdList, + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( stepIdList.index('c1') < stepIdList.index('ry2') + or stepIdList.index('c2') < stepIdList.index('rx1')) + + def rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class TotalCashTest(DatabasePermutationTest): + """ Total cash test. + Another famous test of snapshot isolation anomaly. 
+ """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "total cash test" + + stepThreading = [['wx1','rxy1','c1'],['wy2','rxy2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'wx1': lambda : self.tryOperation(self.conn1, "UPDATE accounts SET balance = balance - 200 WHERE accountid = 'checking';"), + 'rxy1': lambda : self.tryOperation(self.conn1, "SELECT SUM(balance) FROM accounts;"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'wy2': lambda : self.tryOperation(self.conn2, "UPDATE accounts SET balance = balance - 200 WHERE accountid = 'savings';"), + 'rxy2': lambda : self.tryOperation(self.conn2, "SELECT SUM(balance) FROM accounts;"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS accounts;", "DROP TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE accounts (accountid text NOT NULL PRIMARY KEY, balance numeric not null);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "INSERT INTO accounts VALUES ('checking', 600),('savings',600);", "INSERT 0 2") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + print stepIdList, + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( stepIdList.index('c1') < stepIdList.index('wy2') + or stepIdList.index('c2') < stepIdList.index('wx1')) + + def rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class ReferentialIntegrityTest(DatabasePermutationTest): + """ Referential integrity test. 
+ The assumption here is that the application code issuing the SELECT + to test for the presence or absence of a related record would do the + right thing -- this script doesn't include that logic. + """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "referential integrity test" + + stepThreading = [['rx1','wy1','c1'],['rx2','ry2','wx2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'rx1': lambda : self.tryOperation(self.conn1, "SELECT i FROM a WHERE i = 1;"), + 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO b VALUES (1);"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'rx2': lambda : self.tryOperation(self.conn2, "SELECT i FROM a WHERE i = 1;"), + 'ry2': lambda : self.tryOperation(self.conn2, "SELECT a_id FROM b WHERE a_id = 1;"), + 'wx2': lambda : self.tryOperation(self.conn2, "DELETE FROM a WHERE i = 1;"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS a, b;", "DROP TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE a (i int PRIMARY KEY);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE b (a_id int);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "INSERT INTO a VALUES (1);", "INSERT 0 1") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + print stepIdList, + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( stepIdList.index('c1') < stepIdList.index('rx2') + or stepIdList.index('c2') < stepIdList.index('rx1')) + + def 
rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class RITriggerTest(DatabasePermutationTest): + """ Referential integrity trigger test. + """ + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + description = "referential integrity trigger test" + + stepThreading = [['wxry1','c1'],['r2','wyrx2','c2']] + + def populateStepDictionary(self): + self.stepDictionary = { + 'wxry1': lambda : self.tryOperation(self.conn1, "INSERT INTO child (parent_id) VALUES (0);"), + 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"), + 'r2': lambda : self.tryOperation(self.conn2, "SELECT TRUE;"), + 'wyrx2': lambda : self.tryOperation(self.conn2, "DELETE FROM parent WHERE parent_id = 0;"), + 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;") + } + + def setUpIteration(self, stepIdList): + self.serializationFailure = False + self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS parent, child;", "DROP TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE parent (parent_id SERIAL NOT NULL PRIMARY KEY);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "CREATE TABLE child (child_id SERIAL NOT NULL PRIMARY KEY, parent_id INTEGER NOT NULL);", "CREATE TABLE") + self.syncCall(10, self.conn1.operation, "CREATE OR REPLACE FUNCTION ri_parent() RETURNS TRIGGER AS $body$\ + BEGIN\ + PERFORM TRUE FROM child WHERE parent_id = OLD.parent_id;\ + IF FOUND THEN\ + RAISE SQLSTATE '23503' USING MESSAGE = 'Parent ' || OLD.parent_id || ' still referenced during ' || TG_OP;\ + END IF;\ + RETURN NULL;\ + END;\ + $body$ LANGUAGE PLPGSQL VOLATILE;", "CREATE FUNCTION") + self.syncCall(10, self.conn1.operation, "CREATE TRIGGER ri_parent AFTER UPDATE OR DELETE ON parent FOR EACH ROW EXECUTE PROCEDURE ri_parent();", "CREATE TRIGGER") + self.syncCall(10, self.conn1.operation, "CREATE OR REPLACE FUNCTION ri_child() RETURNS TRIGGER AS $body$\ + BEGIN\ + PERFORM TRUE FROM parent WHERE parent_id = NEW.parent_id;\ + IF NOT 
FOUND THEN\ + RAISE SQLSTATE '23503' USING MESSAGE = 'Parent ' || NEW.parent_id || ' does not exist during ' || TG_OP;\ + END IF;\ + RETURN NULL;\ + END;\ + $body$ LANGUAGE PLPGSQL VOLATILE;", "CREATE FUNCTION") + self.syncCall(10, self.conn1.operation, "CREATE TRIGGER ri_child AFTER INSERT OR UPDATE ON child FOR EACH ROW EXECUTE PROCEDURE ri_child();", "CREATE TRIGGER") + self.syncCall(10, self.conn1.operation, "INSERT INTO parent VALUES(0);", "INSERT 0 1") + self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN") + print stepIdList, + + # Override the normal method to allow failures generated by the trigger code + # to be considered "success". Just so we can count things up. + def tryOperation(self, conn, sql): + result = self.syncCall(10, conn.operation, sql), + for line in result: + if len(line) > 0 and line.startswith("ERROR: could not serialize"): + self.serializationFailure = True + else: + if (len(line) > 0 and line.startswith("ERROR:") + and len(line) > 0 and not line.startswith("ERROR: Parent 0 ")): + raise TestFailure("failure other than serializable encountered: " + line, line) + + def tearDownIteration(self, stepIdList): + self.syncCall(10, self.conn1.operation, "ROLLBACK;") + self.syncCall(10, self.conn2.operation, "ROLLBACK;") + self.printStepResults(stepIdList) + + def commitRequired(self, stepIdList): + return ( stepIdList.index('c1') < stepIdList.index('r2') + or stepIdList.index('c2') < stepIdList.index('wxry1')) + + def rollbackRequired(self, stepIdList): + return not self.commitRequired(stepIdList) + + + class TestTrueSerializabilityConcurrentUpdates(SyncTest): + """ Runs three transactions concurrently, each reading from what the + other writes in turn. Should raise a serialization failure, but + instead leads to wrong results, ATM. 
+ """ + + description = "concurrent updates" + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection'), + ('conn3', 'ISqlConnection')) + + def execOnAllConnections(self, sql, expRes=None): + deferreds = [] + for conn in self.connections: + d = conn.operation(sql, expRes) + deferreds.append(d) + + d = defer.DeferredList(deferreds, + consumeErrors=True, fireOnOneErrback=True) + return d + + def readValueThenWrite(self, conn, readFromId, writeToId): + d = conn.query("SELECT t FROM test WHERE i = %d;" % readFromId) + d.addCallback(self.writeValueBack, conn, writeToId) + return d + + def writeValueBack(self, result, conn, writeToId): + self.assertEqual(1, len(result), + "expected exactly one result row") + row = result[0] + self.assertEqual(1, len(row), + "expected exactly one column") + value = row['t'] + d = conn.operation("UPDATE test SET t = '%s' WHERE i = %d;" % (value, writeToId), + "UPDATE") + return d + + def startConcurrentOperations(self): + d1 = self.readValueThenWrite(self.conn1, readFromId=5, writeToId=7) + d2 = self.readValueThenWrite(self.conn2, readFromId=7, writeToId=11) + d3 = self.readValueThenWrite(self.conn3, readFromId=11, writeToId=5) + return defer.DeferredList([d1, d2, d3], + consumeErrors=True, fireOnOneErrback=True) + + def run(self): + try: + self.sub_run() + finally: + self.syncCall(10, self.execOnAllConnections, "ROLLBACK;") + + def sub_run(self): + self.connections = [ + self.conn1, + self.conn2, + self.conn3] + + # begin a transaction on all three connections + self.syncCall(10, self.execOnAllConnections, + "BEGIN;", "BEGIN") + + # set their isolation level to SERIALIZABLE + self.syncCall(10, self.execOnAllConnections, + "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "SET") + + # concurrently let each of the three transactions read a value and + # write that to another tuple, wait for all the UPDATEs to complete + # before trying to commit any of the transactions + self.syncCall(10, self.startConcurrentOperations) + + # 
try to commit all three transactions (accepting both COMMIT or
+ # ERROR, we check the result later on).
+ self.syncCall(10, self.execOnAllConnections,
+ "COMMIT;", "COMMIT|ERROR");
+
+ # count the occurrence of each fruit
+ result = self.syncCall(10, self.conn1.query,
+ "SELECT t FROM test WHERE i IN (5, 7, 11);")
+ counters = {'banana': 0, 'apple': 0, 'pear': 0}
+ for row in result:
+ counters[row['t']] += 1
+
+ # you currently get one fruit each, as no transaction gets aborted,
+ # which is impossible if the transactions had been executed one
+ # after another.
+ if counters.values() == [1, 1, 1]:
+ raise TestFailure("conflict not detected",
+ "All transactions committed, so the conflict hasn't been detected.")
+
+ class TestTrueSerializabilityConcurrentInsert(BaseTest):
+ """ Runs two transactions, both doing an insert, first, then select
+ all the relevant rows (within the range 100 <= i < 110). We let the
+ first transaction commit before creating the cyclic dependency,
+ which forces transaction 2 to abort.
+ """ + + description = "concurrent insert" + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + def execOnAllConnections(self, sql, expRes=None): + deferreds = [] + for conn in self.connections: + d = conn.operation(sql, expRes) + deferreds.append(d) + + d = defer.DeferredList(deferreds, + consumeErrors=True, fireOnOneErrback=True) + return d + + def run(self): + self.connections = [ + self.conn1, + self.conn2] + + # begin a transaction on all three connections + d = self.execOnAllConnections("BEGIN;", "BEGIN") + + # set their isolation level to SERIALIZABLE + d.addCallback(lambda x: + self.execOnAllConnections( + "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "SET")) + + # let transaction 1 do an insert (so it acquires a snapshot) + d.addCallback(lambda x: + self.conn1.operation( + "INSERT INTO test (i, t) VALUES (101, 'orange');", "INSERT 0 1")) + + # then same for transaction 2 + d.addCallback(lambda x: + self.conn2.operation( + "INSERT INTO test (i, t) VALUES (102, 'grapefruit');", "INSERT 0 1")) + + # let transaction 1 read the relevant rows, so it acquires an SIREAD + # lock on the predicate. (The result is discarded). + d.addCallback(lambda x: + self.conn2.query("SELECT t FROM test WHERE i >= 100 AND i < 110;")) + + # then commit transaction 1 (which should still succeed) + d.addCallback(lambda x: + self.conn1.operation( + "COMMIT;", "COMMIT")) + + # try to read all rows with the second transaction's snapshot (which + # doesn't see the update of transaction 1) + d.addCallback(lambda x: + self.conn2.query("SELECT t FROM test WHERE i >= 100 AND i < 110;")) + + # With SSI in place, this should lock the same predicate with an + # SIREAD lock, which should bomb out on the orange (tuple i = 101) + # from transaction 1. + # + # dtester FIXME: Hm.. 
this could need some "expect to fail" help + # from dtester + d.addCallback(self.checkResult) + + # cleanup both transactions, especially in case of failure + d.addBoth(self.cleanup) + + return d + + def checkResult(self, result): + if not isinstance(result, failure.Failure): + raise TestFailure("conflict not detected", + "SELECT should raise a serialization error") + return result + + def cleanup(self, result): + d = self.execOnAllConnections("ROLLBACK;") + + # ignore errors above, but instead make sure we return the result + # we got here, especially if it was an error. + d.addBoth(lambda x: result) + return d + + class TestTrueSerializabilityConcurrentInsert2(BaseTest): + """ Pretty similar to the above test, except that the first transaction + doesn't read (and thus predicate lock) the relevant rows. This still + leaves a possible serialization ordering, even if it doesn't match + the real commit ordering. + + Uses rows 200 <= i < 210 + """ + + description = "concurrent insert" + + needs = (('conn1', 'ISqlConnection'), + ('conn2', 'ISqlConnection')) + + def execOnAllConnections(self, sql, expRes=None): + deferreds = [] + for conn in self.connections: + d = conn.operation(sql, expRes) + deferreds.append(d) + + d = defer.DeferredList(deferreds, + consumeErrors=True, fireOnOneErrback=True) + return d + + def run(self): + self.connections = [ + self.conn1, + self.conn2] + + # begin a transaction on all three connections + d = self.execOnAllConnections("BEGIN;", "BEGIN") + + # set their isolation level to SERIALIZABLE + d.addCallback(lambda x: + self.execOnAllConnections( + "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "SET")) + + # let transaction 1 do an insert (so it acquires a snapshot) + d.addCallback(lambda x: + self.conn1.operation( + "INSERT INTO test (i, t) VALUES (201, 'orange');", "INSERT 0 1")) + + # then same for transaction 2 + d.addCallback(lambda x: + self.conn2.operation( + "INSERT INTO test (i, t) VALUES (202, 'grapefruit');", "INSERT 0 1")) + + 
# no SELECT here, so transaction 1 doesn't acquire any SIREAD lock + + # then commit transaction 1 (which should succeed) + d.addCallback(lambda x: + self.conn1.operation( + "COMMIT;", "COMMIT")) + + # try to read all rows with the second transaction's snapshot (which + # doesn't see the update of transaction 1) + d.addCallback(lambda x: + self.conn2.query("SELECT t FROM test WHERE i >= 200 AND i < 210;")) + + # With SSI in place, this should lock the same predicate as abover + # with an SIREAD lock. This includes the row just written by the + # first transaction. + # + # As long as there are no other edges, this still leaves a possible + # serialization ordering: if we executed the second transaction + # *before* the first one, the second didn't see the 'orange' row + # inserted "later" by the first transaction. That's the result we + # expect. + d.addCallback(self.checkResult) + + # commit transaction 2 + d.addCallback(lambda x: + self.conn2.operation( + "COMMIT;", "COMMIT")) + + # add a cleanup handler + d.addErrback(self.cleanup) + + return d + + def checkResult(self, result): + self.assertEqual(len(result), 1, + "Expected exactly one row, got %d (%s)" % ( + len(result), repr(result))) + self.assertEqual(result[0], {"t": "grapefruit"}, + "Expected to read the grapefruit row, but got %s" % (result[0],)) + + return result + + def cleanup(self, result): + d = self.execOnAllConnections("ROLLBACK;") + + # ignore errors above, but instead make sure we return the result + # we got here, especially if it was an error. + d.addBoth(lambda x: result) + return d + + + # ****** test running code ************************************************ + + class Logger(object): + """ A simplistic logger that just writes it all into one single file. 
+ """ + def __init__(self, logFileName): + self.logfile = open(logFileName, 'w') + + def __del__(self): + self.logfile.close() + + def callback(self, event): + self.logfile.write(str(event) + "\n") + self.logfile.flush() + + def main(argv): + print "Postgres dtester suite Copyright (c) 2004-2010, by Markus Wanner\n" + + postgres_configure_args = "@configure_args@" + + config = { + 'temp-port': 65432, + + # by default, use the same installation directory as make check + 'inst_dir': os.path.join(os.getcwd(), 'tmp_check/install'), + + # and a similar prefix + 'pgdata_prefix': os.path.join(os.getcwd(), 'tmp_check/data-dtester'), + 'logfile' : os.path.join(os.getcwd(), 'dtester.log'), + + 'enable_cassert': 'enable_cassert' in postgres_configure_args + } + + try: + opts, args = getopt.getopt(argv, + "h", + ["help", "temp-install", "top-builddir=", "temp-port=", + "multibyte="]) + except getopt.GetoptError: + usage() + sys.exit(2) + + for opt, arg in opts: + if opt in ("-h", "--help"): + usage() + sys.exit() + elif opt in ("--temp-install"): + config["temp-install"] = True + elif opt in ("--temp-port"): + try: + arg = int(arg) + if arg >= 1024 and arg <= 65535: + config["temp-port"] = arg + else: + print "temp-port out of range." 
+ sys.exit(2) + except ValueError: + print "Fatal: invalid temp-port specified" + sys.exit(2) + elif opt in ("--top-builddir"): + config["top-builddir"] = arg + + + if not config.has_key('bindir'): + bindir = '@bindir@' + if bindir[0] == '/': + bindir = bindir[1:] + config['bindir'] = os.path.join(config['inst_dir'], bindir) + if not config.has_key('libdir'): + libdir = '@libdir@' + if libdir[0] == '/': + libdir = libdir[1:] + config['libdir'] = os.path.join(config['inst_dir'], libdir) + if not config.has_key('datadir'): + datadir = '@datadir@' + if datadir[0] == '/': + datadir = datadir[1:] + config['datadir'] = os.path.join(config['inst_dir'], datadir) + + + # FIXME: should not have to be here + logger = Logger(config['logfile']) + config['main_logging_hook'] = (EventMatcher(Event), logger.callback) + + + # definition of tests and suites, including their dependencies + tdef = { + # runs 'make install' to make sure the installation is up to date + 'temp_install': {'class': InstallationSuite, + 'uses': ('__system__',)}, + + # runs initdb, providing the Postgres data directory + 'initdb-0': {'class': InitdbSuite, + 'uses': ('temp_install',), + 'args': (0,)}, + + # runs a postmaster on the created database directory + 'pg-0': {'class': PostmasterSuite, + 'uses': ('temp_install', 'initdb-0')}, + + # creates a test database on pg-0 + 'testdb': {'class': TestDatabaseSuite, + 'uses': ('temp_install', 'pg-0'), + 'args': ('testdb',)}, + + # open two connections + 'conn-0A': {'class': SqlConnectionSuite, + 'uses': ('temp_install', 'pg-0'), + 'args': ('testdb',), + 'depends': ('testdb',)}, + 'conn-0B': {'class': SqlConnectionSuite, + 'uses': ('temp_install', 'pg-0'), + 'args': ('testdb',), + 'depends': ('testdb',)}, + 'conn-0C': {'class': SqlConnectionSuite, + 'uses': ('temp_install', 'pg-0'), + 'args': ('testdb',), + 'depends': ('testdb',)}, + + # test the connections + 'test-conn-0A': {'class': TestDatabaseConnection, + 'uses': ('conn-0A',)}, + 'test-conn-0B': {'class': 
TestDatabaseConnection, + 'uses': ('conn-0B',)}, + 'test-conn-0C': {'class': TestDatabaseConnection, + 'uses': ('conn-0C',)}, + + # 'dummy-recursion': {'class': DummyPermutationTest}, + + # populate the test database + 'populate-testdb': {'class': PopulateTestDatabase, + 'uses': ('conn-0A',), + 'onlyAfter': ('test-conn-0A', 'test-conn-0B', + 'test-conn-0C')}, + + 'simple-write-skew': {'class': SimpleWriteSkewTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('populate-testdb',), + 'xfail': True}, + + 'receipt-report': {'class': ReceiptReportTest, + 'uses': ('conn-0A', 'conn-0B', 'conn-0C'), + 'onlyAfter': ('simple-write-skew',), + 'xfail': True}, + + 'temporal-range': {'class': TemporalRangeIntegrityTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('receipt-report',), + 'xfail': True}, + + 'project-manager': {'class': ProjectManagerTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('temporal-range',), + 'xfail': True}, + + 'classroom-scheduling': {'class': ClassroomSchedulingTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('project-manager',), + 'xfail': True}, + + 'total-cash': {'class': TotalCashTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('classroom-scheduling',), + 'xfail': True}, + + 'referential-integrity': {'class': ReferentialIntegrityTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('total-cash',), + 'xfail': True}, + + 'ri-trigger': {'class': RITriggerTest, + 'uses': ('conn-0A', 'conn-0B'), + 'onlyAfter': ('referential-integrity',), + 'xfail': True} + + # 'ser-updates': {'class': TestTrueSerializabilityConcurrentUpdates, + # 'uses': ('conn-0A', 'conn-0B', 'conn-0C'), + # 'onlyAfter': ('populate-testdb',), + # 'xfail': True}, + # + # 'ser-insert': {'class': TestTrueSerializabilityConcurrentInsert, + # 'uses': ('conn-0A', 'conn-0B'), + # 'onlyAfter': ('ser-updates',), + # 'xfail': True}, + # + # 'ser-insert2': {'class': TestTrueSerializabilityConcurrentInsert2, + # 'uses': ('conn-0A', 'conn-0B'), + # 'onlyAfter': 
('ser-insert',)} + } + + + runner = Runner(testTimeout=600, suiteTimeout=3600) + runner.run(tdef, config) + + + if __name__ == "__main__": + main(sys.argv[1:]) +