? src/backend/access/nbtree/.deps ? src/backend/access/nbtree/nbtfreelist.c Index: src/include/access/nbtree.h =================================================================== RCS file: /projects/cvsroot/pgsql-server/src/include/access/nbtree.h,v retrieving revision 1.63 diff -c -r1.63 nbtree.h *** src/include/access/nbtree.h 2002/07/02 05:48:44 1.63 --- src/include/access/nbtree.h 2002/12/18 07:32:46 *************** *** 20,25 **** --- 20,48 ---- #include "access/xlogutils.h" /* + * BTFreeListData is where freelists for BTrees are kept. There is one in + * the metapage. If there are more freepages than fit in the metapage, + * btf_nextfreelist has the BlockNumber of the next freelist page. + * + * Care should be taken that with BT_MAX_FREELIST elements, the struct + * does not overflow the metapage. This number should be even so that + * _bt_addfree() works correctly. + * + * XXX Maybe this can be of varying size, one for the metapage and another + * for the rest of the pages. This allows for more items on the following + * pages, saving some space. I don't think this is worth the trouble. + * + * XXX Determine the greatest number that can fit in the metapage. + */ + #define BT_MAX_FREELIST 1500 + typedef struct BTFreeListData + { + uint32 btf_numfreepages; + BlockNumber btf_nextfreelist; + BlockNumber btf_freepages[BT_MAX_FREELIST]; + } BTFreeListData; + + /* * BTPageOpaqueData -- At the end of every page, we store a pointer * to both siblings in the tree. This is used to do forward/backward * index scans. See Lehman and Yao's paper for more *************** *** 37,43 **** BlockNumber btpo_next; /* used for forward index scans */ BlockNumber btpo_parent; /* pointer to parent, but not updated on * parent split */ ! uint16 btpo_flags; /* LEAF?, ROOT?, FREE?, META?, REORDER? */ } BTPageOpaqueData; --- 60,67 ---- BlockNumber btpo_next; /* used for forward index scans */ BlockNumber btpo_parent; /* pointer to parent, but not updated on * parent split */ ! 
uint16 btpo_flags; /* LEAF?, ROOT?, FREE?, DEAD?, META?, ! REORDER? */ } BTPageOpaqueData; *************** *** 49,55 **** #define BTP_FREE (1 << 2) /* page not in use */ #define BTP_META (1 << 3) /* meta-page */ #define BTP_REORDER (1 << 4) /* items need reordering */ ! /* * The Meta page is always the first page in the btree index. --- 73,80 ---- #define BTP_FREE (1 << 2) /* page not in use */ #define BTP_META (1 << 3) /* meta-page */ #define BTP_REORDER (1 << 4) /* items need reordering */ ! #define BTP_DEAD (1 << 5) /* page is about to be freed */ ! #define BTP_FREELIST (1 << 6) /* page is used with a freelist */ /* * The Meta page is always the first page in the btree index. *************** *** 58,67 **** typedef struct BTMetaPageData { ! uint32 btm_magic; ! uint32 btm_version; ! BlockNumber btm_root; ! int32 btm_level; } BTMetaPageData; #define BTPageGetMeta(p) \ --- 83,93 ---- typedef struct BTMetaPageData { ! uint32 btm_magic; ! uint32 btm_version; ! BlockNumber btm_root; ! int32 btm_level; ! 
BTFreeListData btm_freelist; } BTMetaPageData; #define BTPageGetMeta(p) \ *************** *** 204,209 **** --- 230,238 ---- #define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE) #define P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF) #define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT) + #define P_ISFREE(opaque) ((opaque)->btpo_flags & BTP_FREE) + #define P_ISDEAD(opaque) ((opaque)->btpo_flags & BTP_DEAD) + #define P_ISFREELIST(opaque) ((opaque)->btpo_flags & BTP_FREELIST) /* * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost *************** *** 254,262 **** --- 283,293 ---- */ typedef struct xl_btree_delete { + bool isempty; /* page is empty after deletion */ xl_btreetid target; /* deleted tuple id */ } xl_btree_delete; + /* FIXME - this SizeOf may be incorrect: it ignores the isempty field added before target */ #define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData) /* *************** *** 356,367 **** extern void _bt_metapinit(Relation rel); extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); extern void _bt_relbuf(Relation rel, Buffer buf); extern void _bt_wrtbuf(Relation rel, Buffer buf); extern void _bt_wrtnorelbuf(Relation rel, Buffer buf); extern void _bt_pageinit(Page page, Size size); extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, int level); ! 
extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid); /* * prototypes for functions in nbtsearch.c --- 387,408 ---- extern void _bt_metapinit(Relation rel); extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); + extern Buffer _bt_newbuf(Relation rel, bool canreuse); extern void _bt_relbuf(Relation rel, Buffer buf); extern void _bt_wrtbuf(Relation rel, Buffer buf); extern void _bt_wrtnorelbuf(Relation rel, Buffer buf); extern void _bt_pageinit(Page page, Size size); extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, int level); ! extern bool _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid); ! extern bool _bt_pageisempty(Page page); ! ! /* ! * prototypes for functions in nbtfreelist.c ! */ ! extern void _bt_initfreelist(BTFreeListData *btf); ! extern void _bt_processdead(Relation rel, Buffer buf); ! extern Buffer _bt_getfreepage(Relation rel); ! extern void _bt_printbtfchain(Relation rel, Buffer buf); /* * prototypes for functions in nbtsearch.c Index: src/backend/access/nbtree/Makefile =================================================================== RCS file: /projects/cvsroot/pgsql-server/src/backend/access/nbtree/Makefile,v retrieving revision 1.11 diff -c -r1.11 Makefile *** src/backend/access/nbtree/Makefile 2001/07/15 22:48:16 1.11 --- src/backend/access/nbtree/Makefile 2002/12/18 07:32:46 *************** *** 13,19 **** include $(top_builddir)/src/Makefile.global OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \ ! nbtstrat.o nbtutils.o nbtsort.o all: SUBSYS.o --- 13,19 ---- include $(top_builddir)/src/Makefile.global OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \ ! 
nbtstrat.o nbtutils.o nbtsort.o nbtfreelist.o all: SUBSYS.o Index: src/backend/access/nbtree/README =================================================================== RCS file: /projects/cvsroot/pgsql-server/src/backend/access/nbtree/README,v retrieving revision 1.6 diff -c -r1.6 README *** src/backend/access/nbtree/README 2002/10/20 20:47:31 1.6 --- src/backend/access/nbtree/README 2002/12/18 07:32:47 *************** *** 37,43 **** copies of tree nodes are unshared. Postgres shares in-memory buffers among backends. As a result, we do page-level read locking on btree nodes in order to guarantee that no record is modified while we are ! examining it. This reduces concurrency but guaranteees correct behavior. An advantage is that when trading in a read lock for a write lock, we need not re-read the page after getting the write lock. Since we're also holding a pin on the shared buffer containing the --- 37,43 ---- copies of tree nodes are unshared. Postgres shares in-memory buffers among backends. As a result, we do page-level read locking on btree nodes in order to guarantee that no record is modified while we are ! examining it. This reduces concurrency but guarantees correct behavior. An advantage is that when trading in a read lock for a write lock, we need not re-read the page after getting the write lock. Since we're also holding a pin on the shared buffer containing the Index: src/backend/access/nbtree/nbtinsert.c =================================================================== RCS file: /projects/cvsroot/pgsql-server/src/backend/access/nbtree/nbtinsert.c,v retrieving revision 1.96 diff -c -r1.96 nbtinsert.c *** src/backend/access/nbtree/nbtinsert.c 2002/09/04 20:31:09 1.96 --- src/backend/access/nbtree/nbtinsert.c 2002/12/18 07:32:51 *************** *** 19,25 **** #include "access/nbtree.h" #include "miscadmin.h" - typedef struct { /* context data for _bt_checksplitloc */ --- 19,24 ---- *************** *** 708,714 **** OffsetNumber i; BTItem lhikey; ! 
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); origpage = BufferGetPage(buf); leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData)); rightpage = BufferGetPage(rbuf); --- 707,713 ---- OffsetNumber i; BTItem lhikey; ! rbuf = _bt_newbuf(rel, true); origpage = BufferGetPage(buf); leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData)); rightpage = BufferGetPage(rbuf); *************** *** 1290,1296 **** BTMetaPageData *metad; /* get a new root page */ ! rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); rootpage = BufferGetPage(rootbuf); rootblknum = BufferGetBlockNumber(rootbuf); metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); --- 1289,1295 ---- BTMetaPageData *metad; /* get a new root page */ ! rootbuf = _bt_newbuf(rel, false); rootpage = BufferGetPage(rootbuf); rootblknum = BufferGetBlockNumber(rootbuf); metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); Index: src/backend/access/nbtree/nbtpage.c =================================================================== RCS file: /projects/cvsroot/pgsql-server/src/backend/access/nbtree/nbtpage.c,v retrieving revision 1.58 diff -c -r1.58 nbtpage.c *** src/backend/access/nbtree/nbtpage.c 2002/08/06 02:36:33 1.58 --- src/backend/access/nbtree/nbtpage.c 2002/12/18 07:32:52 *************** *** 22,29 **** */ #include "postgres.h" - #include - #include "access/nbtree.h" #include "miscadmin.h" #include "storage/lmgr.h" --- 22,27 ---- *************** *** 74,79 **** --- 72,79 ---- metad.btm_version = BTREE_VERSION; metad.btm_root = P_NONE; metad.btm_level = 0; + _bt_initfreelist(&(metad.btm_freelist)); + memcpy((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad)); op = (BTPageOpaque) PageGetSpecialPointer(pg); *************** *** 158,164 **** * type on the new root page. Since this is the first page in * the tree, it's a leaf as well as the root. */ ! rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); rootblkno = BufferGetBlockNumber(rootbuf); rootpage = BufferGetPage(rootbuf); --- 158,164 ---- * type on the new root page. 
Since this is the first page in * the tree, it's a leaf as well as the root. */ ! rootbuf = _bt_newbuf(rel, false); rootblkno = BufferGetBlockNumber(rootbuf); rootpage = BufferGetPage(rootbuf); *************** *** 278,285 **** /* * Ok, here we have old root page with btpo_parent pointing to ! * upper level - check parent page because of there is good ! * chance that parent is root page. */ newrootbuf = _bt_getbuf(rel, rootopaque->btpo_parent, BT_READ); _bt_relbuf(rel, rootbuf); --- 278,285 ---- /* * Ok, here we have old root page with btpo_parent pointing to ! * upper level - check parent page because there is a good ! * chance that it is the new root page. */ newrootbuf = _bt_getbuf(rel, rootopaque->btpo_parent, BT_READ); _bt_relbuf(rel, rootbuf); *************** *** 316,348 **** { Buffer buf; ! if (blkno != P_NEW) ! { ! /* Read an existing block of the relation */ ! buf = ReadBuffer(rel, blkno); ! LockBuffer(buf, access); ! } ! else ! { ! Page page; ! /* ! * Extend the relation by one page. ! * ! * Extend bufmgr code is unclean and so we have to use extra locking ! * here. ! */ LockPage(rel, 0, ExclusiveLock); ! buf = ReadBuffer(rel, blkno); ! LockBuffer(buf, access); UnlockPage(rel, 0, ExclusiveLock); ! ! /* Initialize the new page before returning it */ ! page = BufferGetPage(buf); ! _bt_pageinit(page, BufferGetPageSize(buf)); } ! /* ref count and lock type are correct */ return buf; } --- 316,373 ---- { Buffer buf; ! /* New buffers are created using _bt_newbuf() */ ! Assert (blkno != P_NEW); ! /* Read an existing block of the relation */ ! buf = ReadBuffer(rel, blkno); ! ! LockBuffer(buf, access); ! ! /* ref count and lock type are correct */ ! return buf; ! } ! ! /* ! * _bt_newbuf() -- Creates a new buffer. ! * ! * When this routine returns, the buffer is locked with BT_WRITE ! * and with its reference count incremented. ! * ! * If canreuse is true, first tries to get a free page from the freelist; ! * if there isn't any, extends the relation. ! * ! 
* If canreuse is false, extend the relation. This is to prevent ! * deadlocking when splitting the root page. ! * ! */ ! Buffer ! _bt_newbuf(Relation rel, bool canreuse) ! { ! Page page; ! Buffer buf = InvalidBuffer; ! bool isfree = true; ! ! /* ! * If we are asked to give a new page, don't bother asking for ! * freepages (this is to avoid deadlocking when splitting the root). ! * Else, try to get a freepage and fallback to extending the relation if ! * one isn't found. ! */ ! if (!canreuse || ((buf = _bt_getfreepage(rel)) == InvalidBuffer)) ! { LockPage(rel, 0, ExclusiveLock); ! buf = ReadBuffer(rel, P_NEW); ! LockBuffer(buf, BT_WRITE); UnlockPage(rel, 0, ExclusiveLock); ! isfree = false; } + Assert(!BufferIsInvalid(buf)); ! /* Initialize the page before returning it */ ! page = BufferGetPage(buf); ! ! _bt_pageinit(page, BufferGetPageSize(buf)); return buf; } *************** *** 445,456 **** * * This routine assumes that the caller has pinned and locked the buffer, * and will write the buffer afterwards. */ ! void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid) { Page page = BufferGetPage(buf); OffsetNumber offno; offno = ItemPointerGetOffsetNumber(tid); --- 470,484 ---- * * This routine assumes that the caller has pinned and locked the buffer, * and will write the buffer afterwards. + * + * Returns whether the page is empty after the deletion. */ ! 
bool _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid) { Page page = BufferGetPage(buf); OffsetNumber offno; + bool isempty; offno = ItemPointerGetOffsetNumber(tid); *************** *** 458,463 **** --- 486,493 ---- PageIndexTupleDelete(page, offno); + isempty = _bt_pageisempty(page); + /* XLOG stuff */ if (!rel->rd_istemp) { *************** *** 467,472 **** --- 497,503 ---- xlrec.target.node = rel->rd_node; xlrec.target.tid = *tid; + xlrec.isempty = isempty; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeDelete; *************** *** 484,487 **** --- 515,535 ---- } END_CRIT_SECTION(); + return isempty; + } + + /* + * Returns whether a BTree page is empty. + * I can't use PageIsEmpty because non-rightmost pages will have the + * high key in the first item. + */ + bool + _bt_pageisempty(Page page) + { + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + OffsetNumber max = PageGetMaxOffsetNumber(page); + + if ((P_RIGHTMOST(opaque) && max == 0) || (!P_RIGHTMOST(opaque) && max == 1)) + return true; + return false; } Index: src/backend/access/nbtree/nbtree.c =================================================================== RCS file: /projects/cvsroot/pgsql-server/src/backend/access/nbtree/nbtree.c,v retrieving revision 1.94 diff -c -r1.94 nbtree.c *** src/backend/access/nbtree/nbtree.c 2002/11/15 01:26:08 1.94 --- src/backend/access/nbtree/nbtree.c 2002/12/18 07:33:00 *************** *** 594,599 **** --- 594,601 ---- IndexScanDesc scan; BTScanOpaque so; ItemPointer current; + List *freelist = NIL, + *block; tuples_removed = 0; num_index_tuples = 0; *************** *** 639,645 **** BlockNumber blkno; OffsetNumber offnum; BTItem btitem; ! BTPageOpaque opaque; IndexTuple itup; ItemPointer htup; --- 641,647 ---- BlockNumber blkno; OffsetNumber offnum; BTItem btitem; ! BTPageOpaque opaque = NULL; IndexTuple itup; ItemPointer htup; *************** *** 681,689 **** if (callback(htup, callback_state)) { ! 
/* Okay to delete the item from the page */ ! _bt_itemdel(rel, buf, current); /* Mark buffer dirty, but keep the lock and pin */ WriteNoReleaseBuffer(buf); --- 683,710 ---- if (callback(htup, callback_state)) { ! /* ! * Okay to delete the item from the page. If this was the ! * last tuple in the page, add the page to the freelist. ! * ! * FIXME: ! * If this is a rightmost or leftmost page, forget about ! * freelist. Too many things break if the rightmost page is ! * dropped; _bt_endpoint() in particular. This means the tree ! * will never shrink vertically, but I don't see another way. ! */ + if (_bt_itemdel(rel, buf, current)) + { + if (opaque == NULL) + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_RIGHTMOST(opaque) && !P_LEFTMOST(opaque)) + { + opaque->btpo_flags |= BTP_DEAD; + freelist = lconsi(blkno, freelist); + } + } + /* Mark buffer dirty, but keep the lock and pin */ WriteNoReleaseBuffer(buf); *************** *** 706,712 **** --- 727,749 ---- } index_endscan(scan); + /* Process the list of dead pages and put them in the freelist */ + foreach (block, freelist) + { + Buffer buf = ReadBuffer(rel, lfirsti(block)); + /* FIXME is this locking correct? */ + LockBufferForCleanup(buf); + /* _bt_processdead() will unlock and release the buffer */ + _bt_processdead(rel, buf); + } + /* + * FIXME: Try to truncate the freepages at the end. + * Note that the freelist should be updated too, perhaps by removing + * the pages that are above the truncation point. 
+ */ + + /* FIXME - maybe I should return the number of freed pages too */ /* return statistics */ num_pages = RelationGetNumberOfBlocks(rel); *************** *** 868,873 **** --- 905,913 ---- } PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid))); + + if (xlrec->isempty) + _bt_processdead(reln, buffer); PageSetLSN(page, lsn); PageSetSUI(page, ThisStartUpID); Index: src/backend/access/nbtree/nbtsearch.c =================================================================== RCS file: /projects/cvsroot/pgsql-server/src/backend/access/nbtree/nbtsearch.c,v retrieving revision 1.72 diff -c -r1.72 nbtsearch.c *** src/backend/access/nbtree/nbtsearch.c 2002/06/20 20:29:25 1.72 --- src/backend/access/nbtree/nbtsearch.c 2002/12/18 07:33:00 *************** *** 61,69 **** BlockNumber par_blkno; BTStack new_stack; - /* if this is a leaf page, we're done */ page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISLEAF(opaque)) break; --- 61,81 ---- BlockNumber par_blkno; BTStack new_stack; page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* + * If page is marked dead, jump to the right. + * FIXME: what to do if the page is rightmost? + */ + while (P_ISDEAD(opaque)) + { + Assert(!P_RIGHTMOST(opaque)); + *bufP = ReadBuffer(rel, opaque->btpo_next); + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + + /* if this is a leaf page, we're done */ if (P_ISLEAF(opaque)) break; *************** *** 150,156 **** * It could even have split more than once, so scan as far as needed. */ while (!P_RIGHTMOST(opaque) && ! _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0) { /* step right one page */ BlockNumber rblkno = opaque->btpo_next; --- 162,169 ---- * It could even have split more than once, so scan as far as needed. */ while (!P_RIGHTMOST(opaque) && ! (P_ISDEAD(opaque) || ! 
_bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)) { /* step right one page */ BlockNumber rblkno = opaque->btpo_next; Index: src/backend/access/nbtree/nbtsort.c =================================================================== RCS file: /projects/cvsroot/pgsql-server/src/backend/access/nbtree/nbtsort.c,v retrieving revision 1.70 diff -c -r1.70 nbtsort.c *** src/backend/access/nbtree/nbtsort.c 2002/11/15 01:26:08 1.70 --- src/backend/access/nbtree/nbtsort.c 2002/12/18 07:33:01 *************** *** 183,189 **** { BTPageOpaque opaque; ! *buf = _bt_getbuf(index, P_NEW, BT_WRITE); *page = BufferGetPage(*buf); /* Zero the page and set up standard page header info */ --- 183,189 ---- { BTPageOpaque opaque; ! *buf = _bt_newbuf(index, false); *page = BufferGetPage(*buf); /* Zero the page and set up standard page header info */