*** a/src/backend/access/gist/gist.c --- b/src/backend/access/gist/gist.c *************** *** 36,42 **** static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, bool unlockbuf, bool unlockleftchild); static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, GISTSTATE *giststate, List *splitinfo, bool releasebuf); ! #define ROTATEDIST(d) do { \ SplitedPageLayout *tmp=(SplitedPageLayout*)palloc(sizeof(SplitedPageLayout)); \ --- 36,42 ---- bool unlockbuf, bool unlockleftchild); static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, GISTSTATE *giststate, List *splitinfo, bool releasebuf); ! static void gistvacuumpage(Relation rel, Page page, Buffer buffer); #define ROTATEDIST(d) do { \ SplitedPageLayout *tmp=(SplitedPageLayout*)palloc(sizeof(SplitedPageLayout)); \ *************** *** 209,214 **** gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, --- 209,225 ---- * because the tuple vector passed to gistSplit won't include this tuple. */ is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); + + /* + * If leaf page is full, try at first to delete dead tuples. And then + * check again. + */ + if ((is_split) && GistPageIsLeaf(page) && GistPageHasGarbage(page)) + { + gistvacuumpage(rel, page, buffer); + is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); + } + if (is_split) { /* no space for insertion */ *************** *** 1440,1442 **** freeGISTstate(GISTSTATE *giststate) --- 1451,1522 ---- /* It's sufficient to delete the scanCxt */ MemoryContextDelete(giststate->scanCxt); } + + /* + * gistvacuumpage() -- try to remove LP_DEAD items from the given page. + */ + static void + gistvacuumpage(Relation rel, Page page, Buffer buffer) + { + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + OffsetNumber offnum, + minoff, + maxoff; + + /* + * Scan over all items to see which ones need to be deleted according to + * LP_DEAD flags. + */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemId)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + START_CRIT_SECTION(); + + PageIndexMultiDelete(page, deletable, ndeletable); + + /* + * Mark the page as not containing any LP_DEAD items. This is not + * certainly true (there might be some that have recently been marked, + * but weren't included in our target-item list), but it will almost + * always be true and it doesn't seem worth an additional page scan to + * check it. Remember that F_HAS_GARBAGE is only a hint anyway. + */ + GistClearPageHasGarbage(page); + + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + recptr = gistXLogUpdate(rel->rd_node, buffer, + deletable, ndeletable, + NULL, 0, InvalidBuffer); + + PageSetLSN(page, recptr); + } + else + PageSetLSN(page, gistGetFakeLSN(rel)); + + END_CRIT_SECTION(); + } + + /* + * Note: if we didn't find any LP_DEAD items, then the page's + * F_HAS_GARBAGE hint bit is falsely set. We do not bother expending a + * separate write to clear it, however. We will clear it when we split + * the page. + */ + } *** a/src/backend/access/gist/gistget.c --- b/src/backend/access/gist/gistget.c *************** *** 24,29 **** --- 24,116 ---- #include "utils/memutils.h" #include "utils/rel.h" + /* + * gistkillitems() -- set LP_DEAD state for items an indexscan caller has + * told us were killed. + * + * We match items by heap TID before mark them. If an item has moved off + * the current page due to a split, we'll fail to find it and do nothing + * (this is not an error case --- we assume the item will eventually get + * marked in a future indexscan). + * + * We re-read page here, so it's significant to check page LSN. If page + * has been modified since the last read (as determined by LSN), we dare not + * flag any antries because it is possible that the old entry was vacuumed + * away and the TID was re-used by a completely different heap tuple. + */ + static void + gistkillitems(IndexScanDesc scan) + { + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + Buffer buffer; + Page page; + OffsetNumber minoff; + OffsetNumber maxoff; + int i; + bool killedsomething = false; + + Assert(so->curBlkno != InvalidBlockNumber); + + buffer = ReadBuffer(scan->indexRelation, so->curBlkno); + if (!BufferIsValid(buffer)) + return; + + LockBuffer(buffer, GIST_SHARE); + gistcheckpage(scan->indexRelation, buffer); + page = BufferGetPage(buffer); + + /* + * If page LSN differs it means that the page was modified since the last read. + * killedItemes could be not valid so LP_DEAD hints applying is not safe. + */ + if(PageGetLSN(page) != so->curPageLSN) + { + UnlockReleaseBuffer(buffer); + so->numKilled = 0; /* reset counter */ + return; + } + + minoff = FirstOffsetNumber; + maxoff = PageGetMaxOffsetNumber(page); + + maxoff = PageGetMaxOffsetNumber(page); + for (i = 0; i < so->numKilled; i++) + { + if (so->killedItems != NULL) + { + OffsetNumber offnum = FirstOffsetNumber; + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + + if (ItemPointerEquals(&ituple->t_tid, &(so->killedItems[i]))) + { + /* found the item */ + ItemIdMarkDead(iid); + killedsomething = true; + break; /* out of inner search loop */ + } + offnum = OffsetNumberNext(offnum); + } + } + } + + if (killedsomething) + { + GistMarkPageHasGarbage(page); + MarkBufferDirtyHint(buffer, true); + } + + UnlockReleaseBuffer(buffer); + + /* + * Always reset the scan state, so we don't look for same items on other + * pages. + */ + so->numKilled = 0; + } /* * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? *************** *** 306,322 **** gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, MemoryContextReset(so->pageDataCxt); /* * check all tuples on page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { ! IndexTuple it = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); bool match; bool recheck; bool recheck_distances; /* * Must call gistindex_keytest in tempCxt, and clean up any leftover * junk afterward. */ --- 393,425 ---- MemoryContextReset(so->pageDataCxt); /* + * We save the LSN of the page as we read it, so that we know whether it + * safe to apply LP_DEAD hints to the page later. This allows us to drop + * the pin for MVCC scans, which allows vacuum to avoid blocking. + */ + so->curPageLSN = PageGetLSN(page); + + /* * check all tuples on page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { ! ItemId iid = PageGetItemId(page, i); ! IndexTuple it; bool match; bool recheck; bool recheck_distances; /* + * If the scan specifies not to return killed tuples, then we treat a + * killed tuple as not passing the qual. + */ + if(scan->ignore_killed_tuples && ItemIdIsDead(iid)) + continue; + + it = (IndexTuple) PageGetItem(page, iid); + /* * Must call gistindex_keytest in tempCxt, and clean up any leftover * junk afterward. */ *************** *** 331,337 **** gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, /* Ignore tuple if it doesn't match */ if (!match) continue; - if (tbm && GistPageIsLeaf(page)) { /* --- 434,439 ---- *************** *** 451,456 **** getNextNearest(IndexScanDesc scan) --- 553,575 ---- if (scan->xs_itup) { + /* + * If previously returned index tuple is not visible save it into + * so->killedItems. And at the end of the page scan mark all saved + * tuples as dead. + */ + if (scan->kill_prior_tuple) + { + if (so->killedItems == NULL) + { + MemoryContext oldCxt2 = MemoryContextSwitchTo(so->giststate->scanCxt); + + so->killedItems = (ItemPointerData *) palloc(MaxIndexTuplesPerPage * sizeof(ItemPointerData)); + MemoryContextSwitchTo(oldCxt2); + } + if ((so->numKilled < MaxIndexTuplesPerPage)) + so->killedItems[so->numKilled++] = scan->xs_ctup.t_self; + } /* free previously returned tuple */ pfree(scan->xs_itup); scan->xs_itup = NULL; *************** *** 515,520 **** getNextNearest(IndexScanDesc scan) --- 634,645 ---- } else { + if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0)) + gistkillitems(scan); + + /* save current item BlockNumber for next gistkillitems() call */ + so->curBlkno = item->blkno; + /* visit an index page, extract its items into queue */ CHECK_FOR_INTERRUPTS(); *************** *** 572,578 **** gistgettuple(PG_FUNCTION_ARGS) --- 697,715 ---- { if (so->curPageData < so->nPageData) { + if ((scan->kill_prior_tuple) && (so->curPageData > 0)) + { + if (so->killedItems == NULL) + { + MemoryContext oldCxt = MemoryContextSwitchTo(so->giststate->scanCxt); + + so->killedItems = (ItemPointerData *) palloc(MaxIndexTuplesPerPage * sizeof(ItemPointerData)); + MemoryContextSwitchTo(oldCxt); + } + if (so->numKilled < MaxIndexTuplesPerPage) + so->killedItems[so->numKilled++] = so->pageData[so->curPageData - 1].heapPtr; + } /* continuing to return tuples from a leaf page */ scan->xs_ctup.t_self = so->pageData[so->curPageData].heapPtr; scan->xs_recheck = so->pageData[so->curPageData].recheck; *************** *** 586,594 **** gistgettuple(PG_FUNCTION_ARGS) --- 723,751 ---- PG_RETURN_BOOL(true); } + /* + * Check the last returned tuple and add it to killitems if + * necessary + */ + if ((scan->kill_prior_tuple) && (so->curPageData > 0) && (so->curPageData == so->nPageData)) + { + + if (so->killedItems == NULL) + { + MemoryContext oldCxt = MemoryContextSwitchTo(so->giststate->scanCxt); + + so->killedItems = (ItemPointerData *) palloc(MaxIndexTuplesPerPage * sizeof(ItemPointerData)); + MemoryContextSwitchTo(oldCxt); + } + if ((so->numKilled < MaxIndexTuplesPerPage)) + so->killedItems[so->numKilled++] = so->pageData[so->curPageData - 1].heapPtr; + } /* find and process the next index page */ do { + if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0)) + gistkillitems(scan); + GISTSearchItem *item = getNextGISTSearchItem(so); if (!item) *************** *** 596,601 **** gistgettuple(PG_FUNCTION_ARGS) --- 753,761 ---- CHECK_FOR_INTERRUPTS(); + /* save current item BlockNumber for next gistkillitems() call */ + so->curBlkno = item->blkno; + /* * While scanning a leaf page, ItemPointers of matching heap * tuples are stored in so->pageData. If there are any on *** a/src/backend/access/gist/gistscan.c --- b/src/backend/access/gist/gistscan.c *************** *** 93,98 **** gistbeginscan(PG_FUNCTION_ARGS) --- 93,103 ---- memset(scan->xs_orderbynulls, true, sizeof(bool) * scan->numberOfOrderBys); } + so->killedItems = NULL; /* until needed */ + so->numKilled = 0; + so->curBlkno = InvalidBlockNumber; + so->curPageLSN = InvalidXLogRecPtr; + scan->opaque = so; /* *** a/src/include/access/gist.h --- b/src/include/access/gist.h *************** *** 41,48 **** */ #define F_LEAF (1 << 0) /* leaf page */ #define F_DELETED (1 << 1) /* the page has been deleted */ ! #define F_TUPLES_DELETED (1 << 2) /* some tuples on the page are dead */ #define F_FOLLOW_RIGHT (1 << 3) /* page to the right has no downlink */ typedef XLogRecPtr GistNSN; --- 41,51 ---- */ #define F_LEAF (1 << 0) /* leaf page */ #define F_DELETED (1 << 1) /* the page has been deleted */ ! #define F_TUPLES_DELETED (1 << 2) /* some tuples on the page were ! * deleted */ #define F_FOLLOW_RIGHT (1 << 3) /* page to the right has no downlink */ + #define F_HAS_GARBAGE (1 << 4) /* some tuples on the page are dead, + * but not deleted yet */ typedef XLogRecPtr GistNSN; *************** *** 137,142 **** typedef struct GISTENTRY --- 140,149 ---- #define GistMarkTuplesDeleted(page) ( GistPageGetOpaque(page)->flags |= F_TUPLES_DELETED) #define GistClearTuplesDeleted(page) ( GistPageGetOpaque(page)->flags &= ~F_TUPLES_DELETED) + #define GistPageHasGarbage(page) ( GistPageGetOpaque(page)->flags & F_HAS_GARBAGE) + #define GistMarkPageHasGarbage(page) ( GistPageGetOpaque(page)->flags |= F_HAS_GARBAGE) + #define GistClearPageHasGarbage(page) ( GistPageGetOpaque(page)->flags &= ~F_HAS_GARBAGE) + #define GistFollowRight(page) ( GistPageGetOpaque(page)->flags & F_FOLLOW_RIGHT) #define GistMarkFollowRight(page) ( GistPageGetOpaque(page)->flags |= F_FOLLOW_RIGHT) #define GistClearFollowRight(page) ( GistPageGetOpaque(page)->flags &= ~F_FOLLOW_RIGHT) *** a/src/include/access/gist_private.h --- b/src/include/access/gist_private.h *************** *** 22,27 **** --- 22,28 ---- #include "storage/bufmgr.h" #include "storage/buffile.h" #include "utils/hsearch.h" + #include "access/genam.h" /* * Maximum number of "halves" a page can be split into in one operation. *************** *** 161,166 **** typedef struct GISTScanOpaqueData --- 162,173 ---- /* pre-allocated workspace arrays */ double *distances; /* output area for gistindex_keytest */ + /* info about killed items if any (killedItems is NULL if never used) */ + ItemPointerData *killedItems; /* heap pointers of killed items */ + int numKilled; /* number of currently stored items */ + BlockNumber curBlkno; /* current number of block */ + GistNSN curPageLSN; /* pos in the WAL stream when page was read */ + /* In a non-ordered search, returnable heap items are stored here: */ GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; OffsetNumber nPageData; /* number of valid items in array */