/*------------------------------------------------------------------------- * * nbtfreelist.c * Freepage list handling for the btree indexed access method. * * Copyright (c) 2002, PostgreSQL Global Development Group * * * IDENTIFICATION * $Id$ * * NOTES * Alvaro Herrera's implementation of list of empty pages in BTrees. * * When a page is made free because there are no more items on it, it * should be entered in this list. This can be either a leaf pages * emptied by btbulkdelete(), or internal pages emptied when cleaning up * the parents of said leaf pages. * * btbulkdelete() should mark pages that are left empty with BTP_DEAD, * and then pass each dead page to _bt_processdead. This will take care * of adding the page to the freelist. When new pages are requested via * _bt_newbuf(), _bt_getfreepage() will take a page from the freelist or * return InvalidBuffer if the freelist is empty. In the latter case, * _bt_newbuf() should extend the relation. * * Note that for preventing deadlocks, when creating a new root page is * not possible to take a page from the freelist. * * The freelist is kept in BTFreeListData structures that keep the * BlockNumber of the free pages. There is one of those in the metapage. * When that one is filled up and _bt_addfree() is called again, two * things can happen. One is that there is a next page with a freelist, * and it is half empty. In that case, half the entries from the * metapage are copied to the next page, and the new free page is added * to the metapage's freelist. * * The other case is that the next page's freelist is full, or there * isn't a next page. In that case, the page we've been handed is used * as the new freelist page, and half the entries from the metapage are * copied there. The next time _bt_addfree() is called, the metapage * will have some free space, and the items can be copied from there to * this page that has been left half empty if the metapage fills again. * * This half-copying bussiness is to avoid ping-pong between the * metapage's freelist and the next page's when requesting and adding * freepages in the limit case that the metapage is full. It also * simplifies the handling of the lists so that only the metapage's list * is modified, and next page's is rarely touched. * *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/nbtree.h" #include "miscadmin.h" #include "storage/lmgr.h" static void _bt_deletefromparent(Relation rel, BlockNumber pblkno, Buffer buf); static void _bt_addfree(Relation rel, Buffer buf); static void _bt_levelfreelists(BTFreeListData *btf1, BTFreeListData *btf2); #ifdef NOT_USED static void _bt_printbtf(BTFreeListData *btf); #else #define _bt_printbtfchain(r,b) #endif /* * Add a page to the list of free pages. Also set it's BTP_FREE flag * and unset BTP_DEAD. * * Assumes the buffer is locked and pinned, and does not release the lock, * unpin nor write the buffer. */ static void _bt_addfree(Relation rel, Buffer buf) { Page page, metapg, nextpg; Buffer metabuf; BTMetaPageData *metad; BTFreeListData *btfmeta, *btfnext; BlockNumber freeblk, nextblk; page = BufferGetPage(buf); freeblk = BufferGetBlockNumber(buf); BTPageOpaque op = (BTPageOpaque) PageGetSpecialPointer(page); Assert(P_ISDEAD(op)); LockPage(rel, 0, ExclusiveLock); /* * Add the page to the freelist. * * Maybe lock should be BT_WRITE. But I think this should be safe, given * that I am protecting with LockPage(rel, 0) from other freelist-modifying * operations. */ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); btfmeta = &(metad->btm_freelist); /* The metapage's freelist has some free space. Add it there. */ if (btfmeta->btf_numfreepages < BT_MAX_FREELIST) { uint32 i; for (i = 0; i < BT_MAX_FREELIST; i++) { if (btfmeta->btf_freepages[i] == 0) { btfmeta->btf_freepages[i] = freeblk; btfmeta->btf_numfreepages++; op->btpo_flags |= BTP_FREE; op->btpo_flags &= ~BTP_DEAD; _bt_printbtfchain(rel, metabuf); _bt_wrtbuf(rel, metabuf); UnlockPage(rel, 0, ExclusiveLock); return; } } /* Shouldn't be here. */ Assert(false); } /* * The metapage's freelist is full. If the next freelist page is half * empty, copy half the entries from the metapage there and put the new * free page as the next freepage in the metapage. If the next freelist * page is full or it doesn't exist, use the new free page as the new * freelist page and copy half the entries in the metapage on it. This * is done to avoid ping-pong effects in the metapage. */ nextblk = btfmeta->btf_nextfreelist; if (nextblk != InvalidBlockNumber) { Buffer nextbuf; Page nextpg; BTFreeListData *btfnext; nextbuf = _bt_getbuf(rel, nextblk, BT_WRITE); nextpg = BufferGetPage(nextbuf); btfnext = (BTFreeListData *) PageGetContents (nextpg); /* There is a next page and it's half empty. Fill it up. */ if (btfnext->btf_numfreepages == BT_MAX_FREELIST/2) { _bt_levelfreelists(btfmeta, btfnext); btfmeta->btf_numfreepages++; btfmeta->btf_freepages[BT_MAX_FREELIST/2] = freeblk; _bt_printbtfchain(rel, metabuf); _bt_wrtbuf(rel, metabuf); _bt_wrtbuf(rel, nextbuf); op->btpo_flags |= BTP_FREE; op->btpo_flags &= ~BTP_DEAD; UnlockPage(rel, 0, ExclusiveLock); return; } _bt_relbuf(rel, nextbuf); } /* There is no next page, or it's full. Create a new one. */ nextpg = BufferGetPage(buf); btfnext = (BTFreeListData *) PageGetContents(nextpg); _bt_initfreelist(btfnext); btfnext->btf_nextfreelist = nextblk; btfmeta->btf_nextfreelist = freeblk; _bt_levelfreelists(btfmeta, btfnext); op->btpo_flags |= BTP_FREELIST; op->btpo_flags &= ~BTP_DEAD; _bt_printbtfchain(rel, metabuf); _bt_wrtbuf(rel, metabuf); UnlockPage(rel, 0, ExclusiveLock); return; } /* Initializes an empty BTFreeList */ void _bt_initfreelist(BTFreeListData *btf) { btf->btf_numfreepages = 0; btf->btf_nextfreelist = InvalidBlockNumber; MemSet(&(btf->btf_freepages), 0, sizeof(BlockNumber) * BT_MAX_FREELIST); } #ifdef NOT_USED /* Prints a freepage contents */ static void _bt_printbtf(BTFreeListData *btf) { uint32 i; char str[BT_MAX_FREELIST * 6 + 50]; char t[5]; sprintf(str, "Numfreepages: %u, Nextfreelist: %u, BlockNumbers: ", btf->btf_numfreepages, btf->btf_nextfreelist); for (i=0; i < BT_MAX_FREELIST; i++) { sprintf(t, "%u ", btf->btf_freepages[i]); strcat(str, t); } elog(NOTICE, str); } /* Prints a freepage chain */ void _bt_printbtfchain(Relation rel, Buffer buf) { BTPageOpaque op; Page page; BTFreeListData *btf; page = BufferGetPage(buf); op = (BTPageOpaque) PageGetSpecialPointer(page); if (op->btpo_flags & BTP_META) { BTMetaPageData *btm = BTPageGetMeta(page); btf = &(btm->btm_freelist); } else btf = (BTFreeListData *) PageGetContents(page); _bt_printbtf(btf); if (btf->btf_nextfreelist != InvalidBlockNumber) { BlockNumber next = btf->btf_nextfreelist; buf = ReadBuffer(rel, next); _bt_printbtfchain(rel, buf); ReleaseBuffer(buf); } } #endif /* * Move half the entries from the first BTFreeList to the second. * Update btf_numfreepages to the right values for both. * * The first list should be full. This is supposed to be the metapage's * freelist. */ void _bt_levelfreelists(BTFreeListData *btf1, BTFreeListData *btf2) { uint32 i; Assert(btf1->btf_numfreepages == BT_MAX_FREELIST); /* Copy the items */ /* FIXME: Do this with memcpy instead */ if (btf2->btf_numfreepages == BT_MAX_FREELIST/2) for (i = BT_MAX_FREELIST/2; i < BT_MAX_FREELIST; i++) btf2->btf_freepages[i] = btf1->btf_freepages[i]; else if (btf2->btf_numfreepages == 0) for (i = 0; i < BT_MAX_FREELIST/2; i++) btf2->btf_freepages[i] = btf1->btf_freepages[BT_MAX_FREELIST/2 + i]; else Assert(false); /* Zero the upper half of the first freelist */ /* FIXME: Use MemSet instead. */ for (i = BT_MAX_FREELIST/2; i < BT_MAX_FREELIST; i++) btf1->btf_freepages[i] = 0; btf1->btf_numfreepages = BT_MAX_FREELIST/2; btf2->btf_numfreepages += BT_MAX_FREELIST/2; } /* * Delete all the pointers to this page: side links from right and left * siblings and the pointer from the parent. Then add the page to the * freelist. * * The buffer should be LockedForCleanup. */ void _bt_processdead(Relation rel, Buffer buf) { BlockNumber lblkno, rblkno, parent; BTPageOpaque opaque; Page page; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* * If it's the root page, it should not be entered in the freelist. * Rather, mark it as leaf and return. */ if (P_ISROOT(opaque)) { opaque->btpo_flags &= ~BTP_DEAD; opaque->btpo_flags |= BTP_LEAF; return; } lblkno = opaque->btpo_prev; rblkno = opaque->btpo_next; parent = opaque->btpo_parent; /* Unlock early to prevent deadlocks */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); if (lblkno != 0) { /* FIXME */ /* I think I have to check this one for concurrent splits */ Buffer lbuf; Page lpage; BTPageOpaque lopaque; lbuf = _bt_getbuf(rel, lblkno, BT_WRITE); lpage = BufferGetPage(lbuf); lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); lopaque->btpo_next = rblkno; _bt_wrtbuf(rel, lbuf); } if (rblkno != 0) { Buffer rbuf; Page rpage; BTPageOpaque ropaque; rbuf = _bt_getbuf(rel, rblkno, BT_WRITE); rpage = BufferGetPage(rbuf); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); ropaque->btpo_prev = lblkno; _bt_wrtbuf(rel, rbuf); } /* * Delete the pointer to this page in the parent, recursively. * Note that this page cannot be the root page, since that was * checked earlier. */ _bt_deletefromparent(rel, parent, buf); LockBufferForCleanup(buf); _bt_addfree(rel, buf); _bt_wrtbuf(rel, buf); } /* * Delete the pointer to a child page. If the parent page is empty after * the deletion, delete the pointer from its parent too. */ static void _bt_deletefromparent(Relation rel, BlockNumber pblkno, Buffer buf) { Buffer pbuf; OffsetNumber offnum; Page ppage; BTPageOpaque pop; BlockNumber blkno = BufferGetBlockNumber(buf), max; pbuf = _bt_getbuf(rel, pblkno, BT_WRITE); Assert(!BufferIsInvalid(pbuf)); ppage = BufferGetPage(pbuf); pop = (BTPageOpaque) PageGetSpecialPointer(ppage); /* * Repeat until the correct parent page is found. Splits may * cause the parent page to move right. */ for (;;) { BlockNumber next; Assert(BufferIsValid(pbuf) && PageIsValid(ppage) && PageIsUsed(ppage)); /* Make sure no one else tries to look at the page. */ LockBuffer(pbuf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(pbuf); max = PageGetMaxOffsetNumber(ppage); /* * Look every offset of the page for the item pointing to the * dead page. */ for (offnum = FirstOffsetNumber; offnum <= max; offnum++) { BTItem item; ItemPointer iptr; item = (BTItem) PageGetItem(ppage, PageGetItemId(ppage, offnum)); iptr = &(item->bti_itup.t_tid); if (ItemPointerGetBlockNumber(iptr) == blkno) { /* Ok, we found the page. Now delete the pointer. */ ItemPointer iptrd = palloc(SizeOfIptrData); ItemPointerSet(iptrd, pblkno, offnum); _bt_itemdel(rel, pbuf, iptrd); /* * If the parent page is empty after this deletion, * mark it dead and free it too. */ if (_bt_pageisempty(ppage)) { pop->btpo_flags |= BTP_DEAD; /* _bt_processdead will call _bt_wrtbuf() on the buffer */ _bt_processdead(rel, pbuf); } else _bt_wrtbuf(rel, pbuf); return; } } /* * If we just finished scanning the rightmost page of the upper level, * there's something wrong. */ if (P_RIGHTMOST(pop)) elog(ERROR, "Unable to find parent page!"); /* Oops, the parent was split. Check its right sibling. */ next = pop->btpo_next; _bt_relbuf(rel, pbuf); pbuf = _bt_getbuf(rel, next, BT_WRITE); ppage = BufferGetPage(pbuf); pop = (BTPageOpaque) PageGetSpecialPointer(ppage); } } /* * Returns a page that has previously been entered into the freelist, * or InvalidBuffer if there is none. */ Buffer _bt_getfreepage(Relation rel) { Buffer metabuf, freebuf; Page metapg; BTMetaPageData *metad; BTFreeListData *freelst; BlockNumber freeblk; /* Lock page 0 to look at the freelist or extend */ LockPage(rel, 0, ExclusiveLock); /* * Maybe lock should be BT_WRITE. But I think this * should be safe, given that I am protecting with LockPage(rel, 0) * from other freelist-modifying operations. */ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); freelst = &(metad->btm_freelist); /* There is a freepage in the metapage freelist. Return it. */ if (freelst->btf_numfreepages > 0) { uint32 i; for (i = 0; i < BT_MAX_FREELIST; i++) { if (freelst->btf_freepages[i] != 0) { freelst->btf_numfreepages--; freeblk = freelst->btf_freepages[i]; freebuf = _bt_getbuf(rel, freeblk, BT_WRITE); freelst->btf_freepages[i] = 0; _bt_wrtbuf(rel, metabuf); UnlockPage(rel, 0, ExclusiveLock); return freebuf; } } /* Shouldn't be here */ Assert(false); } /* * The metapage's freelist is empty, but there's another freelist page. * Copy the list to the metapage's, and return the emptied page. */ if (freelst->btf_nextfreelist != InvalidBlockNumber) { BlockNumber nextblkno; Buffer nextbuf; Page nextpg; BTFreeListData *freel; BTPageOpaque opaque; nextblkno = freelst->btf_nextfreelist; nextbuf = _bt_getbuf(rel, nextblkno, BT_WRITE); nextpg = BufferGetPage(nextbuf); freel = (BTFreeListData *) PageGetContents(nextpg); memcpy(&(metad->btm_freelist), freel, sizeof(BTFreeListData)); _bt_wrtbuf(rel, metabuf); UnlockPage(rel, 0, ExclusiveLock); opaque = (BTPageOpaque) PageGetSpecialPointer(nextpg); opaque->btpo_flags |= BTP_FREE; opaque->btpo_flags &= ~BTP_FREELIST; return nextbuf; } /* There isn't any freepage. Return InvalidBuffer. */ _bt_relbuf(rel, metabuf); UnlockPage(rel, 0, ExclusiveLock); return InvalidBuffer; }