From aae6423b6f0e7955a19d0ff28ec64071e808ca0d Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Sun, 31 Mar 2024 14:55:29 +0200
Subject: [PATCH v20240403 1/6] Properly align blocks in incremental backups

Align blocks stored in incremental files to BLCKSZ (typically 8K), so
that the incremental backups can work well with CoW filesystems.

The features based on block sharing in CoW filesystems require the
blocks to be aligned to the filesystem page size, but the format
of the incremental files did not meet that requirement as the files
have variable-length header and the data for the blocks was stored
right after that.

This would features like deduplication or block sharing built into
some file systems (e.g. ZFS, BTRFS, XFS) from working well.

This commit pads the file header with \0 to a multiple of BLCKSZ,
which means the block data is aligned to BLCKSZ too. The padding
is applied only to files with block data, so files with just the
header are left small. For files with blocks this adds a bit of
overhead, but as the number of blocks increases the overhead gets
negligible very quickly. Moreover, as the padding is \0 bytes, it
does compress extremely well.
---
 src/backend/backup/basebackup.c             | 26 ++++++++++++++
 src/backend/backup/basebackup_incremental.c | 39 ++++++++++++++++++---
 src/bin/pg_combinebackup/reconstruct.c      |  8 +++++
 src/include/backup/basebackup_incremental.h |  1 +
 4 files changed, 70 insertions(+), 4 deletions(-)

diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c
index 5fea86ad0fd..b956df072d5 100644
--- a/src/backend/backup/basebackup.c
+++ b/src/backend/backup/basebackup.c
@@ -1623,6 +1623,8 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
 	{
 		unsigned	magic = INCREMENTAL_MAGIC;
 		size_t		header_bytes_done = 0;
+		char		padding[BLCKSZ];
+		size_t		paddinglen;
 
 		/* Emit header data. */
 		push_to_sink(sink, &checksum_ctx, &header_bytes_done,
@@ -1635,6 +1637,23 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
 					 incremental_blocks,
 					 sizeof(BlockNumber) * num_incremental_blocks);
 
+		/*
+		 * Add padding to align header to a multiple of BLCKSZ, but only if
+		 * the incremental file has some blocks. If there are no blocks we
+		 * don't want make the file unnecessarily large, as that might make
+		 * some filesystem optimizations impossible.
+		 */
+		if (num_incremental_blocks > 0)
+		{
+			paddinglen = (BLCKSZ - (header_bytes_done % BLCKSZ));
+
+			memset(padding, 0, paddinglen);
+			bytes_done += paddinglen;
+
+			push_to_sink(sink, &checksum_ctx, &header_bytes_done,
+						 padding, paddinglen);
+		}
+
 		/* Flush out any data still in the buffer so it's again empty. */
 		if (header_bytes_done > 0)
 		{
@@ -1748,6 +1767,13 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
 		blkno += cnt / BLCKSZ;
 		bytes_done += cnt;
 
+		/*
+		 * Make sure incremental files with block data are properly aligned
+		 * (header is a multiple of BLCKSZ, blocks are BLCKSZ too).
+		 */
+		Assert(!((incremental_blocks != NULL && num_incremental_blocks > 0) &&
+				 (bytes_done % BLCKSZ != 0)));
+
 		/* Archive the data we just read. */
 		bbsink_archive_contents(sink, cnt);
 
diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c
index 990b2872eaf..9cd7d9e15ae 100644
--- a/src/backend/backup/basebackup_incremental.c
+++ b/src/backend/backup/basebackup_incremental.c
@@ -902,6 +902,36 @@ GetFileBackupMethod(IncrementalBackupInfo *ib, const char *path,
 	return BACK_UP_FILE_INCREMENTALLY;
 }
 
+/*
+ * Compute the size for a header of an incremental file containing a given
+ * number of blocks. The header is rounded to a multiple of BLCKSZ, but
+ * only if the file will store some block data.
+ */
+extern size_t
+GetIncrementalHeaderSize(unsigned num_blocks_required)
+{
+	size_t		result;
+
+	/* Make sure we're not going to overflow. */
+	Assert(num_blocks_required <= RELSEG_SIZE);
+
+	/*
+	 * Three four byte quantities (magic number, truncation block length,
+	 * block count) followed by block numbers.
+	 */
+	result = 3 * sizeof(uint32) + (sizeof(BlockNumber) * num_blocks_required);
+
+	/*
+	 * Round the header size to a multiple of BLCKSZ - when not a multiple
+	 * of BLCKSZ, add the missing fraction of a block. But do this only if
+	 * the file will store data for some blocks, otherwise keep it small.
+	 */
+	if ((num_blocks_required > 0) && (result % BLCKSZ != 0))
+		result += BLCKSZ - (result % BLCKSZ);
+
+	return result;
+}
+
 /*
  * Compute the size for an incremental file containing a given number of blocks.
  */
@@ -914,11 +944,12 @@ GetIncrementalFileSize(unsigned num_blocks_required)
 	Assert(num_blocks_required <= RELSEG_SIZE);
 
 	/*
-	 * Three four byte quantities (magic number, truncation block length,
-	 * block count) followed by block numbers followed by block contents.
+	 * Header with three four byte quantities (magic number, truncation block
+	 * length, block count) followed by block numbers, rounded to a multiple
+	 * of BLCKSZ (for files with block data), followed by block contents.
 	 */
-	result = 3 * sizeof(uint32);
-	result += (BLCKSZ + sizeof(BlockNumber)) * num_blocks_required;
+	result = GetIncrementalHeaderSize(num_blocks_required);
+	result += BLCKSZ * num_blocks_required;
 
 	return result;
 }
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c
index 41f06bb26b5..33c6da02a8c 100644
--- a/src/bin/pg_combinebackup/reconstruct.c
+++ b/src/bin/pg_combinebackup/reconstruct.c
@@ -472,6 +472,14 @@ make_incremental_rfile(char *filename)
 		sizeof(rf->truncation_block_length) +
 		sizeof(BlockNumber) * rf->num_blocks;
 
+	/*
+	 * Round header length to a multiple of BLCKSZ, so that blocks contents
+	 * are properly aligned. Only do this when the file actually has data for
+	 * some blocks.
+	 */
+	if ((rf->num_blocks > 0) && ((rf->header_length % BLCKSZ) != 0))
+		rf->header_length += (BLCKSZ - (rf->header_length % BLCKSZ));
+
 	return rf;
 }
 
diff --git a/src/include/backup/basebackup_incremental.h b/src/include/backup/basebackup_incremental.h
index ae5feb4b320..6ba9c9035e2 100644
--- a/src/include/backup/basebackup_incremental.h
+++ b/src/include/backup/basebackup_incremental.h
@@ -51,5 +51,6 @@ extern FileBackupMethod GetFileBackupMethod(IncrementalBackupInfo *ib,
 											BlockNumber *relative_block_numbers,
 											unsigned *truncation_block_length);
 extern size_t GetIncrementalFileSize(unsigned num_blocks_required);
+extern size_t GetIncrementalHeaderSize(unsigned num_blocks_required);
 
 #endif
-- 
2.44.0

