From 324706fb2e15facbc5686e98ee26bbdc5017366a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Mon, 10 Oct 2016 10:59:38 +0300
Subject: [PATCH 1/2] Simplify tape block format.

No more indirect blocks. The blocks form a linked list instead.

This saves some memory, because we don't need to have a buffer in memory to
hold the indirect block (or blocks). To reflect that, TAPE_BUFFER_OVERHEAD
is reduced from 3 to 1 buffer, which allows using more memory for building
the initial runs.
---
 src/backend/utils/sort/logtape.c   | 581 ++++++++++---------------------------
 src/backend/utils/sort/tuplesort.c |  17 +-
 src/include/utils/logtape.h        |   2 +-
 3 files changed, 156 insertions(+), 444 deletions(-)

diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index 6cc06b2..7e1f0d8 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -31,15 +31,8 @@
  * in BLCKSZ-size blocks.  Space allocation boils down to keeping track
  * of which blocks in the underlying file belong to which logical tape,
  * plus any blocks that are free (recycled and not yet reused).
- * The blocks in each logical tape are remembered using a method borrowed
- * from the Unix HFS filesystem: we store data block numbers in an
- * "indirect block".  If an indirect block fills up, we write it out to
- * the underlying file and remember its location in a second-level indirect
- * block.  In the same way second-level blocks are remembered in third-
- * level blocks, and so on if necessary (of course we're talking huge
- * amounts of data here).  The topmost indirect block of a given logical
- * tape is never actually written out to the physical file, but all lower-
- * level indirect blocks will be.
+ * The blocks in each logical tape form a chain, with a prev- and next-
+ * pointer in each block.
  *
  * The initial write pass is guaranteed to fill the underlying file
  * perfectly sequentially, no matter how data is divided into logical tapes.
@@ -87,58 +80,58 @@
 #include "utils/memutils.h"
 
 /*
- * Block indexes are "long"s, so we can fit this many per indirect block.
- * NB: we assume this is an exact fit!
- */
-#define BLOCKS_PER_INDIR_BLOCK	((int) (BLCKSZ / sizeof(long)))
-
-/*
- * We use a struct like this for each active indirection level of each
- * logical tape.  If the indirect block is not the highest level of its
- * tape, the "nextup" link points to the next higher level.  Only the
- * "ptrs" array is written out if we have to dump the indirect block to
- * disk.  If "ptrs" is not completely full, we store -1L in the first
- * unused slot at completion of the write phase for the logical tape.
+ * A TapeBlockTrailer is stored at the end of each BLCKSZ block.
+ *
+ * The first block of a tape has prev == -1.  The last block of a tape
+ * stores the number of valid bytes on the block, subtracted by BLCKSZ.
+ * That's always negative, so prev < 0 indicates the last block.
  */
-typedef struct IndirectBlock
+typedef struct TapeBlockTrailer
 {
-	int			nextSlot;		/* next pointer slot to write or read */
-	struct IndirectBlock *nextup;		/* parent indirect level, or NULL if
-										 * top */
-	long		ptrs[BLOCKS_PER_INDIR_BLOCK];	/* indexes of contained blocks */
-} IndirectBlock;
+	long		prev;	/* previous block on this tape, or -1 on first block */
+	long		next;	/* next block on this tape, or # of valid bytes on last block */
+} TapeBlockTrailer;
+
+#define TapeBlockGetTrailer(buf) ((TapeBlockTrailer *) ((char *) buf + BLCKSZ - sizeof(TapeBlockTrailer)))
+#define TapeBlockPayloadSize  (BLCKSZ - sizeof(TapeBlockTrailer))
+
+#define TapeBlockGetNBytes(buf) (TapeBlockGetTrailer(buf)->next < 0 ? (TapeBlockGetTrailer(buf)->next + BLCKSZ) : TapeBlockPayloadSize)
+
+#define TapeBlockIsLast(buf) (TapeBlockGetTrailer(buf)->next < 0)
 
 /*
  * This data structure represents a single "logical tape" within the set
- * of logical tapes stored in the same file.  We must keep track of the
- * current partially-read-or-written data block as well as the active
- * indirect block level(s).
+ * of logical tapes stored in the same file.
+ *
+ * While writing, we hold the current partially-written data block in the
+ * buffer.  While reading, we can hold multiple blocks in the buffer.  Note
+ * that we don't retain the trailers of a block when it's read into the
+ * buffer.  The buffer therefore contains one large contiguous chunk of data
+ * from the tape.
  */
 typedef struct LogicalTape
 {
-	IndirectBlock *indirect;	/* bottom of my indirect-block hierarchy */
 	bool		writing;		/* T while in write phase */
 	bool		frozen;			/* T if blocks should not be freed when read */
 	bool		dirty;			/* does buffer need to be written? */
 
 	/*
-	 * The total data volume in the logical tape is numFullBlocks * BLCKSZ +
-	 * lastBlockBytes.  BUT: we do not update lastBlockBytes during writing,
-	 * only at completion of a write phase.
+	 * Block numbers of the first, current, and next block of the tape.
+	 *
+	 * The "current" block number is only valid when writing, or reading from
+	 * a frozen tape.  (When reading from an unfrozen tape, we use a larger
+	 * read buffer that holds multiple blocks, so the "current" block is
+	 * ambiguous.)
 	 */
-	long		numFullBlocks;	/* number of complete blocks in log tape */
-	int			lastBlockBytes; /* valid bytes in last (incomplete) block */
+	long		firstBlockNumber;
+	long		curBlockNumber;
+	long		nextBlockNumber;
 
 	/*
-	 * Buffer for current data block.  Note we don't bother to store the
-	 * actual file block number of the data block (during the write phase it
-	 * hasn't been assigned yet, and during read we don't care anymore). But
-	 * we do need the relative block number so we can detect end-of-tape while
-	 * reading.
+	 * Buffer for current data block(s).
 	 */
 	char	   *buffer;			/* physical buffer (separately palloc'd) */
 	int			buffer_size;	/* allocated size of the buffer */
-	long		curBlockNumber; /* this block's logical blk# within tape */
 	int			pos;			/* next read/write position in buffer */
 	int			nbytes;			/* total # of valid bytes in buffer */
 
@@ -190,19 +183,6 @@ static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
 static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
 static long ltsGetFreeBlock(LogicalTapeSet *lts);
 static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);
-static void ltsRecordBlockNum(LogicalTapeSet *lts, IndirectBlock *indirect,
-				  long blocknum);
-static long ltsRewindIndirectBlock(LogicalTapeSet *lts,
-					   IndirectBlock *indirect,
-					   bool freezing);
-static long ltsRewindFrozenIndirectBlock(LogicalTapeSet *lts,
-							 IndirectBlock *indirect);
-static long ltsRecallNextBlockNum(LogicalTapeSet *lts,
-					  IndirectBlock *indirect,
-					  bool frozen);
-static long ltsRecallPrevBlockNum(LogicalTapeSet *lts,
-					  IndirectBlock *indirect);
-static void ltsDumpBuffer(LogicalTapeSet *lts, LogicalTape *lt);
 
 
 /*
@@ -245,46 +225,39 @@ ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
 /*
  * Read as many blocks as we can into the per-tape buffer.
  *
- * The caller can specify the next physical block number to read, in
- * datablocknum, or -1 to fetch the next block number from the internal block.
- * If datablocknum == -1, the caller must've already set curBlockNumber.
- *
  * Returns true if anything was read, 'false' on EOF.
  */
 static bool
-ltsReadFillBuffer(LogicalTapeSet *lts, LogicalTape *lt, long datablocknum)
+ltsReadFillBuffer(LogicalTapeSet *lts, LogicalTape *lt)
 {
 	lt->pos = 0;
 	lt->nbytes = 0;
 
 	do
 	{
-		/* Fetch next block number (unless provided by caller) */
-		if (datablocknum == -1)
-		{
-			datablocknum = ltsRecallNextBlockNum(lts, lt->indirect, lt->frozen);
-			if (datablocknum == -1L)
-				break;			/* EOF */
-			lt->curBlockNumber++;
-		}
+		char	   *thisbuf = lt->buffer + lt->nbytes;
+
+		/* Fetch next block number */
+		if (lt->nextBlockNumber == -1L)
+			break;			/* EOF */
 
 		/* Read the block */
-		ltsReadBlock(lts, datablocknum, (void *) (lt->buffer + lt->nbytes));
+		ltsReadBlock(lts, lt->nextBlockNumber, (void *) thisbuf);
 		if (!lt->frozen)
-			ltsReleaseBlock(lts, datablocknum);
+			ltsReleaseBlock(lts, lt->nextBlockNumber);
 
-		if (lt->curBlockNumber < lt->numFullBlocks)
-			lt->nbytes += BLCKSZ;
-		else
+		lt->nbytes += TapeBlockGetNBytes(thisbuf);
+		if (TapeBlockIsLast(thisbuf))
 		{
+			lt->nextBlockNumber = -1L;
 			/* EOF */
-			lt->nbytes += lt->lastBlockBytes;
 			break;
 		}
+		else
+			lt->nextBlockNumber = TapeBlockGetTrailer(thisbuf)->next;
 
 		/* Advance to next block, if we have buffer space left */
-		datablocknum = -1;
-	} while (lt->nbytes < lt->buffer_size);
+	} while (lt->buffer_size - lt->nbytes > BLCKSZ);
 
 	return (lt->nbytes > 0);
 }
@@ -369,203 +342,6 @@ ltsReleaseBlock(LogicalTapeSet *lts, long blocknum)
 }
 
 /*
- * These routines manipulate indirect-block hierarchies.  All are recursive
- * so that they don't have any specific limit on the depth of hierarchy.
- */
-
-/*
- * Record a data block number in a logical tape's lowest indirect block,
- * or record an indirect block's number in the next higher indirect level.
- */
-static void
-ltsRecordBlockNum(LogicalTapeSet *lts, IndirectBlock *indirect,
-				  long blocknum)
-{
-	if (indirect->nextSlot >= BLOCKS_PER_INDIR_BLOCK)
-	{
-		/*
-		 * This indirect block is full, so dump it out and recursively save
-		 * its address in the next indirection level.  Create a new
-		 * indirection level if there wasn't one before.
-		 */
-		long		indirblock = ltsGetFreeBlock(lts);
-
-		ltsWriteBlock(lts, indirblock, (void *) indirect->ptrs);
-		if (indirect->nextup == NULL)
-		{
-			indirect->nextup = (IndirectBlock *) palloc(sizeof(IndirectBlock));
-			indirect->nextup->nextSlot = 0;
-			indirect->nextup->nextup = NULL;
-		}
-		ltsRecordBlockNum(lts, indirect->nextup, indirblock);
-
-		/*
-		 * Reset to fill another indirect block at this level.
-		 */
-		indirect->nextSlot = 0;
-	}
-	indirect->ptrs[indirect->nextSlot++] = blocknum;
-}
-
-/*
- * Reset a logical tape's indirect-block hierarchy after a write pass
- * to prepare for reading.  We dump out partly-filled blocks except
- * at the top of the hierarchy, and we rewind each level to the start.
- * This call returns the first data block number, or -1L if the tape
- * is empty.
- *
- * Unless 'freezing' is true, release indirect blocks to the free pool after
- * reading them.
- */
-static long
-ltsRewindIndirectBlock(LogicalTapeSet *lts,
-					   IndirectBlock *indirect,
-					   bool freezing)
-{
-	/* Handle case of never-written-to tape */
-	if (indirect == NULL)
-		return -1L;
-
-	/* Insert sentinel if block is not full */
-	if (indirect->nextSlot < BLOCKS_PER_INDIR_BLOCK)
-		indirect->ptrs[indirect->nextSlot] = -1L;
-
-	/*
-	 * If block is not topmost, write it out, and recurse to obtain address of
-	 * first block in this hierarchy level.  Read that one in.
-	 */
-	if (indirect->nextup != NULL)
-	{
-		long		indirblock = ltsGetFreeBlock(lts);
-
-		ltsWriteBlock(lts, indirblock, (void *) indirect->ptrs);
-		ltsRecordBlockNum(lts, indirect->nextup, indirblock);
-		indirblock = ltsRewindIndirectBlock(lts, indirect->nextup, freezing);
-		Assert(indirblock != -1L);
-		ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
-		if (!freezing)
-			ltsReleaseBlock(lts, indirblock);
-	}
-
-	/*
-	 * Reset my next-block pointer, and then fetch a block number if any.
-	 */
-	indirect->nextSlot = 0;
-	if (indirect->ptrs[0] == -1L)
-		return -1L;
-	return indirect->ptrs[indirect->nextSlot++];
-}
-
-/*
- * Rewind a previously-frozen indirect-block hierarchy for another read pass.
- * This call returns the first data block number, or -1L if the tape
- * is empty.
- */
-static long
-ltsRewindFrozenIndirectBlock(LogicalTapeSet *lts,
-							 IndirectBlock *indirect)
-{
-	/* Handle case of never-written-to tape */
-	if (indirect == NULL)
-		return -1L;
-
-	/*
-	 * If block is not topmost, recurse to obtain address of first block in
-	 * this hierarchy level.  Read that one in.
-	 */
-	if (indirect->nextup != NULL)
-	{
-		long		indirblock;
-
-		indirblock = ltsRewindFrozenIndirectBlock(lts, indirect->nextup);
-		Assert(indirblock != -1L);
-		ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
-	}
-
-	/*
-	 * Reset my next-block pointer, and then fetch a block number if any.
-	 */
-	indirect->nextSlot = 0;
-	if (indirect->ptrs[0] == -1L)
-		return -1L;
-	return indirect->ptrs[indirect->nextSlot++];
-}
-
-/*
- * Obtain next data block number in the forward direction, or -1L if no more.
- *
- * Unless 'frozen' is true, release indirect blocks to the free pool after
- * reading them.
- */
-static long
-ltsRecallNextBlockNum(LogicalTapeSet *lts,
-					  IndirectBlock *indirect,
-					  bool frozen)
-{
-	/* Handle case of never-written-to tape */
-	if (indirect == NULL)
-		return -1L;
-
-	if (indirect->nextSlot >= BLOCKS_PER_INDIR_BLOCK ||
-		indirect->ptrs[indirect->nextSlot] == -1L)
-	{
-		long		indirblock;
-
-		if (indirect->nextup == NULL)
-			return -1L;			/* nothing left at this level */
-		indirblock = ltsRecallNextBlockNum(lts, indirect->nextup, frozen);
-		if (indirblock == -1L)
-			return -1L;			/* nothing left at this level */
-		ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
-		if (!frozen)
-			ltsReleaseBlock(lts, indirblock);
-		indirect->nextSlot = 0;
-	}
-	if (indirect->ptrs[indirect->nextSlot] == -1L)
-		return -1L;
-	return indirect->ptrs[indirect->nextSlot++];
-}
-
-/*
- * Obtain next data block number in the reverse direction, or -1L if no more.
- *
- * Note this fetches the block# before the one last returned, no matter which
- * direction of call returned that one.  If we fail, no change in state.
- *
- * This routine can only be used in 'frozen' state, so there's no need to
- * pass a parameter telling whether to release blocks ... we never do.
- */
-static long
-ltsRecallPrevBlockNum(LogicalTapeSet *lts,
-					  IndirectBlock *indirect)
-{
-	/* Handle case of never-written-to tape */
-	if (indirect == NULL)
-		return -1L;
-
-	if (indirect->nextSlot <= 1)
-	{
-		long		indirblock;
-
-		if (indirect->nextup == NULL)
-			return -1L;			/* nothing left at this level */
-		indirblock = ltsRecallPrevBlockNum(lts, indirect->nextup);
-		if (indirblock == -1L)
-			return -1L;			/* nothing left at this level */
-		ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
-
-		/*
-		 * The previous block would only have been written out if full, so we
-		 * need not search it for a -1 sentinel.
-		 */
-		indirect->nextSlot = BLOCKS_PER_INDIR_BLOCK + 1;
-	}
-	indirect->nextSlot--;
-	return indirect->ptrs[indirect->nextSlot - 1];
-}
-
-
-/*
  * Create a set of logical tapes in a temporary underlying file.
  *
  * Each tape is initialized in write state.
@@ -594,23 +370,21 @@ LogicalTapeSetCreate(int ntapes)
 
 	/*
 	 * Initialize per-tape structs.  Note we allocate the I/O buffer and
-	 * first-level indirect block for a tape only when it is first actually
-	 * written to.  This avoids wasting memory space when tuplesort.c
-	 * overestimates the number of tapes needed.
+	 * the first block for a tape only when it is first actually written
+	 * to.  This avoids wasting memory space when tuplesort.c overestimates
+	 * the number of tapes needed.
 	 */
 	for (i = 0; i < ntapes; i++)
 	{
 		lt = &lts->tapes[i];
-		lt->indirect = NULL;
 		lt->writing = true;
 		lt->frozen = false;
 		lt->dirty = false;
-		lt->numFullBlocks = 0L;
-		lt->lastBlockBytes = 0;
+		lt->firstBlockNumber = -1L;
+		lt->curBlockNumber = -1L;
 		lt->buffer = NULL;
 		lt->buffer_size = 0;
 		lt->read_buffer_size = BLCKSZ;
-		lt->curBlockNumber = 0L;
 		lt->pos = 0;
 		lt->nbytes = 0;
 	}
@@ -624,19 +398,12 @@ void
 LogicalTapeSetClose(LogicalTapeSet *lts)
 {
 	LogicalTape *lt;
-	IndirectBlock *ib,
-			   *nextib;
 	int			i;
 
 	BufFileClose(lts->pfile);
 	for (i = 0; i < lts->nTapes; i++)
 	{
 		lt = &lts->tapes[i];
-		for (ib = lt->indirect; ib != NULL; ib = nextib)
-		{
-			nextib = ib->nextup;
-			pfree(ib);
-		}
 		if (lt->buffer)
 			pfree(lt->buffer);
 	}
@@ -660,21 +427,6 @@ LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts)
 }
 
 /*
- * Dump the dirty buffer of a logical tape.
- */
-static void
-ltsDumpBuffer(LogicalTapeSet *lts, LogicalTape *lt)
-{
-	long		datablock = ltsGetFreeBlock(lts);
-
-	Assert(lt->dirty);
-	ltsWriteBlock(lts, datablock, (void *) lt->buffer);
-	ltsRecordBlockNum(lts, lt->indirect, datablock);
-	lt->dirty = false;
-	/* Caller must do other state update as needed */
-}
-
-/*
  * Write to a logical tape.
  *
  * There are no error returns; we ereport() on failure.
@@ -690,39 +442,53 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
 	lt = &lts->tapes[tapenum];
 	Assert(lt->writing);
 
-	/* Allocate data buffer and first indirect block on first write */
+	/* Allocate data buffer and first block on first write */
 	if (lt->buffer == NULL)
 	{
 		lt->buffer = (char *) palloc(BLCKSZ);
 		lt->buffer_size = BLCKSZ;
 	}
-	if (lt->indirect == NULL)
+	if (lt->curBlockNumber == -1)
 	{
-		lt->indirect = (IndirectBlock *) palloc(sizeof(IndirectBlock));
-		lt->indirect->nextSlot = 0;
-		lt->indirect->nextup = NULL;
+		Assert(lt->firstBlockNumber == -1);
+		Assert(lt->pos == 0);
+
+		lt->curBlockNumber = ltsGetFreeBlock(lts);
+		lt->firstBlockNumber = lt->curBlockNumber;
+
+		TapeBlockGetTrailer(lt->buffer)->prev = -1L;
 	}
 
 	Assert(lt->buffer_size == BLCKSZ);
 	while (size > 0)
 	{
-		if (lt->pos >= BLCKSZ)
+		if (lt->pos >= TapeBlockPayloadSize)
 		{
 			/* Buffer full, dump it out */
-			if (lt->dirty)
-				ltsDumpBuffer(lts, lt);
-			else
+			long		nextBlockNumber;
+
+			if (!lt->dirty)
 			{
 				/* Hmm, went directly from reading to writing? */
 				elog(ERROR, "invalid logtape state: should be dirty");
 			}
-			lt->numFullBlocks++;
-			lt->curBlockNumber++;
+
+			/*
+			 * First allocate the next block, so that we can store it
+			 * in the 'next' pointer of this block.
+			 */
+			nextBlockNumber = ltsGetFreeBlock(lts);
+			TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber;
+
+			ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
+
+			TapeBlockGetTrailer(lt->buffer)->prev = lt->curBlockNumber;
+			lt->curBlockNumber = nextBlockNumber;
 			lt->pos = 0;
 			lt->nbytes = 0;
 		}
 
-		nthistime = BLCKSZ - lt->pos;
+		nthistime = TapeBlockPayloadSize - lt->pos;
 		if (nthistime > size)
 			nthistime = size;
 		Assert(nthistime > 0);
@@ -748,7 +514,6 @@ void
 LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
 {
 	LogicalTape *lt;
-	long		datablocknum;
 
 	Assert(tapenum >= 0 && tapenum < lts->nTapes);
 	lt = &lts->tapes[tapenum];
@@ -759,14 +524,14 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
 		{
 			/*
 			 * Completion of a write phase.  Flush last partial data block,
-			 * flush any partial indirect blocks, rewind for normal
-			 * (destructive) read.
+			 * and rewind for normal (destructive) read.
 			 */
 			if (lt->dirty)
-				ltsDumpBuffer(lts, lt);
-			lt->lastBlockBytes = lt->nbytes;
+			{
+				TapeBlockGetTrailer(lt->buffer)->next = lt->nbytes - BLCKSZ;
+				ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
+			}
 			lt->writing = false;
-			datablocknum = ltsRewindIndirectBlock(lts, lt->indirect, false);
 		}
 		else
 		{
@@ -775,7 +540,6 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
 			 * pass.
 			 */
 			Assert(lt->frozen);
-			datablocknum = ltsRewindFrozenIndirectBlock(lts, lt->indirect);
 		}
 
 		/* Allocate a read buffer (unless the tape is empty) */
@@ -783,18 +547,17 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
 			pfree(lt->buffer);
 		lt->buffer = NULL;
 		lt->buffer_size = 0;
-		if (datablocknum != -1L)
+		if (lt->firstBlockNumber != -1L)
 		{
 			lt->buffer = palloc(lt->read_buffer_size);
 			lt->buffer_size = lt->read_buffer_size;
 		}
 
 		/* Read the first block, or reset if tape is empty */
-		lt->curBlockNumber = 0L;
+		lt->nextBlockNumber = lt->firstBlockNumber;
 		lt->pos = 0;
 		lt->nbytes = 0;
-		if (datablocknum != -1L)
-			ltsReadFillBuffer(lts, lt, datablocknum);
+		ltsReadFillBuffer(lts, lt);
 	}
 	else
 	{
@@ -802,30 +565,15 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
 		 * Completion of a read phase.  Rewind and prepare for write.
 		 *
 		 * NOTE: we assume the caller has read the tape to the end; otherwise
-		 * untouched data and indirect blocks will not have been freed. We
-		 * could add more code to free any unread blocks, but in current usage
-		 * of this module it'd be useless code.
+		 * untouched data will not have been released. We could add more code
+		 * to release any unread blocks, but in current usage of this module
+		 * it'd be useless code.
 		 */
-		IndirectBlock *ib,
-				   *nextib;
-
 		Assert(!lt->writing && !lt->frozen);
-		/* Must truncate the indirect-block hierarchy down to one level. */
-		if (lt->indirect)
-		{
-			for (ib = lt->indirect->nextup; ib != NULL; ib = nextib)
-			{
-				nextib = ib->nextup;
-				pfree(ib);
-			}
-			lt->indirect->nextSlot = 0;
-			lt->indirect->nextup = NULL;
-		}
 		lt->writing = true;
 		lt->dirty = false;
-		lt->numFullBlocks = 0L;
-		lt->lastBlockBytes = 0;
-		lt->curBlockNumber = 0L;
+		lt->firstBlockNumber = -1L;
+		lt->curBlockNumber = -1L;
 		lt->pos = 0;
 		lt->nbytes = 0;
 
@@ -860,7 +608,7 @@ LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
 		if (lt->pos >= lt->nbytes)
 		{
 			/* Try to load more data into buffer. */
-			if (!ltsReadFillBuffer(lts, lt, -1))
+			if (!ltsReadFillBuffer(lts, lt))
 				break;			/* EOF */
 		}
 
@@ -895,22 +643,23 @@ void
 LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
 {
 	LogicalTape *lt;
-	long		datablocknum;
 
 	Assert(tapenum >= 0 && tapenum < lts->nTapes);
 	lt = &lts->tapes[tapenum];
 	Assert(lt->writing);
 
 	/*
-	 * Completion of a write phase.  Flush last partial data block, flush any
-	 * partial indirect blocks, rewind for nondestructive read.
+	 * Completion of a write phase.  Flush last partial data block, and
+	 * rewind for nondestructive read.
 	 */
 	if (lt->dirty)
-		ltsDumpBuffer(lts, lt);
-	lt->lastBlockBytes = lt->nbytes;
+	{
+		TapeBlockGetTrailer(lt->buffer)->next = lt->nbytes - BLCKSZ;
+		ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
+		lt->writing = false;
+	}
 	lt->writing = false;
 	lt->frozen = true;
-	datablocknum = ltsRewindIndirectBlock(lts, lt->indirect, true);
 
 	/*
 	 * The seek and backspace functions assume a single block read buffer.
@@ -928,15 +677,20 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
 	}
 
 	/* Read the first block, or reset if tape is empty */
-	lt->curBlockNumber = 0L;
+	lt->curBlockNumber = lt->firstBlockNumber;
 	lt->pos = 0;
 	lt->nbytes = 0;
-	if (datablocknum != -1L)
+
+	if (lt->firstBlockNumber == -1L)
 	{
-		ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
-		lt->nbytes = (lt->curBlockNumber < lt->numFullBlocks) ?
-			BLCKSZ : lt->lastBlockBytes;
+		lt->nextBlockNumber = -1L;
 	}
+	ltsReadBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
+	if (TapeBlockIsLast(lt->buffer))
+		lt->nextBlockNumber = -1L;
+	else
+		lt->nextBlockNumber = TapeBlockGetTrailer(lt->buffer)->next;
+	lt->nbytes = TapeBlockGetNBytes(lt->buffer);
 }
 
 /*
@@ -947,15 +701,16 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
  * random access during write, and an unfrozen read tape may have
  * already discarded the desired data!
  *
- * Return value is TRUE if seek successful, FALSE if there isn't that much
- * data before the current point (in which case there's no state change).
+ * Return value is TRUE if seek successful, FALSE if we are at the very
+ * beginning of the tape, in which case there's no state change.
+ * Note that backspacing beyond the beginning of the tape is not allowed,
+ * and will cause an error, except for the special case that we are
+ * currently positioned at the very beginning of the tape.
  */
 bool
 LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
 {
 	LogicalTape *lt;
-	long		nblocks;
-	int			newpos;
 
 	Assert(tapenum >= 0 && tapenum < lts->nTapes);
 	lt = &lts->tapes[tapenum];
@@ -972,40 +727,35 @@ LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
 	}
 
 	/*
-	 * Not-so-easy case.  Figure out whether it's possible at all.
+	 * Not-so-easy case.  Walk back the chain of blocks.  This implementation
+	 * would be pretty inefficient for long seeks, but we really aren't
+	 * expecting that (a seek over one tuple is typical).
 	 */
+	if (lt->pos == 0 && TapeBlockGetTrailer(lt->buffer)->prev)
+		return false;		/* beginning of tape */
+
 	size -= (size_t) lt->pos;	/* part within this block */
-	nblocks = size / BLCKSZ;
-	size = size % BLCKSZ;
-	if (size)
-	{
-		nblocks++;
-		newpos = (int) (BLCKSZ - size);
-	}
-	else
-		newpos = 0;
-	if (nblocks > lt->curBlockNumber)
-		return false;			/* a seek too far... */
 
-	/*
-	 * OK, we need to back up nblocks blocks.  This implementation would be
-	 * pretty inefficient for long seeks, but we really aren't expecting that
-	 * (a seek over one tuple is typical).
-	 */
-	while (nblocks-- > 0)
+	while (size > TapeBlockPayloadSize)
 	{
-		long		datablocknum = ltsRecallPrevBlockNum(lts, lt->indirect);
+		long		datablocknum = TapeBlockGetTrailer(lt->buffer)->prev;
 
 		if (datablocknum == -1L)
 			elog(ERROR, "unexpected end of tape");
-		lt->curBlockNumber--;
-		if (nblocks == 0)
-		{
-			ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
-			lt->nbytes = BLCKSZ;
-		}
+
+		ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
+
+		if (TapeBlockGetTrailer(lt->buffer)->next != lt->curBlockNumber)
+			elog(ERROR, "broken tape");
+
+		lt->nbytes = TapeBlockPayloadSize;
+		lt->curBlockNumber = datablocknum;
+
+		size -= TapeBlockPayloadSize;
 	}
-	lt->pos = newpos;
+
+	lt->pos = TapeBlockPayloadSize - size;
+
 	return true;
 }
 
@@ -1014,10 +764,10 @@ LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
  *
  * *Only* a frozen-for-read tape can be seeked.
  *
- * Return value is TRUE if seek successful, FALSE if there isn't that much
- * data in the tape (in which case there's no state change).
+ * Must be called with a block/offset previously returned by
+ * LogicalTapeTell().
  */
-bool
+void
 LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
 				long blocknum, int offset)
 {
@@ -1026,53 +776,15 @@ LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
 	Assert(tapenum >= 0 && tapenum < lts->nTapes);
 	lt = &lts->tapes[tapenum];
 	Assert(lt->frozen);
-	Assert(offset >= 0 && offset <= BLCKSZ);
+	Assert(offset >= 0 && offset <= TapeBlockPayloadSize);
 	Assert(lt->buffer_size == BLCKSZ);
 
-	/*
-	 * Easy case for seek within current block.
-	 */
-	if (blocknum == lt->curBlockNumber && offset <= lt->nbytes)
-	{
-		lt->pos = offset;
-		return true;
-	}
-
-	/*
-	 * Not-so-easy case.  Figure out whether it's possible at all.
-	 */
-	if (blocknum < 0 || blocknum > lt->numFullBlocks ||
-		(blocknum == lt->numFullBlocks && offset > lt->lastBlockBytes))
-		return false;
-
-	/*
-	 * OK, advance or back up to the target block.  This implementation would
-	 * be pretty inefficient for long seeks, but we really aren't expecting
-	 * that (a seek over one tuple is typical).
-	 */
-	while (lt->curBlockNumber > blocknum)
-	{
-		long		datablocknum = ltsRecallPrevBlockNum(lts, lt->indirect);
-
-		if (datablocknum == -1L)
-			elog(ERROR, "unexpected end of tape");
-		if (--lt->curBlockNumber == blocknum)
-			ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
-	}
-	while (lt->curBlockNumber < blocknum)
-	{
-		long		datablocknum = ltsRecallNextBlockNum(lts, lt->indirect,
-														 lt->frozen);
+	if (blocknum != lt->curBlockNumber)
+		ltsReadBlock(lts, blocknum, (void *) lt->buffer);
 
-		if (datablocknum == -1L)
-			elog(ERROR, "unexpected end of tape");
-		if (++lt->curBlockNumber == blocknum)
-			ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
-	}
-	lt->nbytes = (lt->curBlockNumber < lt->numFullBlocks) ?
-		BLCKSZ : lt->lastBlockBytes;
+	if (offset > lt->nbytes)
+		elog(ERROR, "invalid tape seek position");
 	lt->pos = offset;
-	return true;
 }
 
 /*
@@ -1110,7 +822,8 @@ LogicalTapeSetBlocks(LogicalTapeSet *lts)
  * Set buffer size to use, when reading from given tape.
  */
 void
-LogicalTapeAssignReadBufferSize(LogicalTapeSet *lts, int tapenum, size_t avail_mem)
+LogicalTapeAssignReadBufferSize(LogicalTapeSet *lts, int tapenum,
+								size_t avail_mem)
 {
 	LogicalTape *lt;
 
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index dd83e7a..7c9232c 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -239,15 +239,15 @@ typedef enum
  * Parameters for calculation of number of tapes to use --- see inittapes()
  * and tuplesort_merge_order().
  *
- * In this calculation we assume that each tape will cost us about 3 blocks
- * worth of buffer space (which is an underestimate for very large data
- * volumes, but it's probably close enough --- see logtape.c).
+ * In this calculation we assume that each tape will cost us about 1 blocks
+ * worth of buffer space.  This ignores the overhead of all the other data
+ * structures needed for each tape, but it's probably close enough.
  *
  * MERGE_BUFFER_SIZE is how much data we'd like to read from each input
  * tape during a preread cycle (see discussion at top of file).
  */
 #define MINORDER		6		/* minimum merge order */
-#define TAPE_BUFFER_OVERHEAD		(BLCKSZ * 3)
+#define TAPE_BUFFER_OVERHEAD		BLCKSZ
 #define MERGE_BUFFER_SIZE			(BLCKSZ * 32)
 
  /*
@@ -3239,11 +3239,10 @@ tuplesort_restorepos(Tuplesortstate *state)
 			state->eof_reached = state->markpos_eof;
 			break;
 		case TSS_SORTEDONTAPE:
-			if (!LogicalTapeSeek(state->tapeset,
-								 state->result_tape,
-								 state->markpos_block,
-								 state->markpos_offset))
-				elog(ERROR, "tuplesort_restorepos failed");
+			LogicalTapeSeek(state->tapeset,
+							state->result_tape,
+							state->markpos_block,
+							state->markpos_offset);
 			state->eof_reached = state->markpos_eof;
 			break;
 		default:
diff --git a/src/include/utils/logtape.h b/src/include/utils/logtape.h
index 362a619..92e8245 100644
--- a/src/include/utils/logtape.h
+++ b/src/include/utils/logtape.h
@@ -35,7 +35,7 @@ extern void LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite);
 extern void LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum);
 extern bool LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum,
 					 size_t size);
-extern bool LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
+extern void LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
 				long blocknum, int offset);
 extern void LogicalTapeTell(LogicalTapeSet *lts, int tapenum,
 				long *blocknum, int *offset);
-- 
2.9.3

