From 76e97361f7fa45008ec524f0a83eab5c3da46506 Mon Sep 17 00:00:00 2001 From: Bryan Green Date: Thu, 6 Nov 2025 10:56:02 -0600 Subject: [PATCH v3] Fix Windows file I/O to support files larger than 2GB PostgreSQL's Windows port has been unable to handle files larger than 2GB due to pervasive use of off_t for file offsets, which is only 32-bit on Windows. This causes signed integer overflow at exactly 2^31 bytes. The codebase already defines pgoff_t as __int64 (64-bit) on Windows for this purpose, and some function declarations in headers use it, but many implementations still used off_t. This issue is unlikely to affect most users since the default RELSEG_SIZE is 1GB, keeping individual segment files small. However, anyone building with --with-segsize larger than 2 (the value is given in GB) would hit this bug. Tested with --with-segsize=8 and verified that files can now grow beyond 4GB. This version also addresses three additional code paths in WAL handling that used casts to off_t when calling pg_pread() or pg_pwrite(): - xlogrecovery.c: pg_pread() called with cast to off_t - xlogreader.c: pg_pread() with cast to off_t - walreceiver.c: pg_pwrite() with cast to off_t While these are not critical (WAL segments have a max size of 1GB), the casts are now corrected to pgoff_t for consistency and to avoid any potential future issues. Note: off_t is still used in other parts of the codebase (e.g. buffile.c) which may have similar issues on Windows, but those are outside the critical path for relation file extension and can be addressed separately. On Unix-like systems, pgoff_t is defined as off_t, so this change only affects Windows behavior. 
--- src/backend/access/transam/xlogreader.c | 2 +- src/backend/access/transam/xlogrecovery.c | 2 +- src/backend/replication/walreceiver.c | 2 +- src/backend/storage/file/fd.c | 38 +-- src/backend/storage/smgr/md.c | 50 ++-- src/common/file_utils.c | 4 +- src/include/common/file_utils.h | 4 +- src/include/port/pg_iovec.h | 4 +- src/include/port/win32_port.h | 4 +- src/include/storage/fd.h | 26 +- src/port/win32pread.c | 10 +- src/port/win32pwrite.c | 10 +- src/test/modules/meson.build | 1 + src/test/modules/test_large_files/Makefile | 20 ++ src/test/modules/test_large_files/README | 53 ++++ src/test/modules/test_large_files/meson.build | 29 ++ .../t/001_windows_large_files.pl | 65 +++++ .../test_large_files--1.0.sql | 36 +++ .../test_large_files/test_large_files.c | 270 ++++++++++++++++++ .../test_large_files/test_large_files.control | 5 + 20 files changed, 557 insertions(+), 78 deletions(-) create mode 100644 src/test/modules/test_large_files/Makefile create mode 100644 src/test/modules/test_large_files/README create mode 100644 src/test/modules/test_large_files/meson.build create mode 100644 src/test/modules/test_large_files/t/001_windows_large_files.pl create mode 100644 src/test/modules/test_large_files/test_large_files--1.0.sql create mode 100644 src/test/modules/test_large_files/test_large_files.c create mode 100644 src/test/modules/test_large_files/test_large_files.control diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index dcc8d4f9c1..8ea837003f 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1574,7 +1574,7 @@ WALRead(XLogReaderState *state, /* Reset errno first; eases reporting non-errno-affecting errors */ errno = 0; - readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + readbytes = pg_pread(state->seg.ws_file, p, segbytes, (pgoff_t) startoff); #ifndef FRONTEND pgstat_report_wait_end(); diff --git 
a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 550de6e4a5..c723d03d96 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -3429,7 +3429,7 @@ retry: io_start = pgstat_prepare_io_time(track_wal_io_timing); pgstat_report_wait_start(WAIT_EVENT_WAL_READ); - r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); + r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff); if (r != XLOG_BLCKSZ) { char fname[MAXFNAMELEN]; diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 7361ffc9dc..ec243db3a4 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -928,7 +928,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, TimeLineID tli) start = pgstat_prepare_io_time(track_wal_io_timing); pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); - byteswritten = pg_pwrite(recvFile, buf, segbytes, (off_t) startoff); + byteswritten = pg_pwrite(recvFile, buf, segbytes, (pgoff_t) startoff); pgstat_report_wait_end(); pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index a4ec7959f3..b25e74831e 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -201,7 +201,7 @@ typedef struct vfd File nextFree; /* link to next free VFD, if in freelist */ File lruMoreRecently; /* doubly linked recency-of-use list */ File lruLessRecently; - off_t fileSize; /* current size of file (0 if not temporary) */ + pgoff_t fileSize; /* current size of file (0 if not temporary) */ char *fileName; /* name of file, or NULL for unused VFD */ /* NB: fileName is malloc'd, and must be free'd when closing the VFD */ int fileFlags; /* open(2) flags for (re)opening the file */ @@ -519,7 +519,7 @@ pg_file_exists(const char *name) * offset of 0 with nbytes 0 means that the entire file should be flushed */ void 
-pg_flush_data(int fd, off_t offset, off_t nbytes) +pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes) { /* * Right now file flushing is primarily used to avoid making later @@ -635,7 +635,7 @@ retry: * may simply not be enough address space. If so, silently fall * through to the next implementation. */ - if (nbytes <= (off_t) SSIZE_MAX) + if (nbytes <= (pgoff_t) SSIZE_MAX) p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset); else p = MAP_FAILED; @@ -697,7 +697,7 @@ retry: * Truncate an open file to a given length. */ static int -pg_ftruncate(int fd, off_t length) +pg_ftruncate(int fd, pgoff_t length) { int ret; @@ -714,7 +714,7 @@ retry: * Truncate a file to a given length by name. */ int -pg_truncate(const char *path, off_t length) +pg_truncate(const char *path, pgoff_t length) { int ret; #ifdef WIN32 @@ -1526,7 +1526,7 @@ FileAccess(File file) * Called whenever a temporary file is deleted to report its size. */ static void -ReportTemporaryFileUsage(const char *path, off_t size) +ReportTemporaryFileUsage(const char *path, pgoff_t size) { pgstat_report_tempfile(size); @@ -2077,7 +2077,7 @@ FileClose(File file) * this. 
*/ int -FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info) +FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info) { Assert(FileIsValid(file)); @@ -2108,7 +2108,7 @@ retry: { struct radvisory { - off_t ra_offset; /* offset into the file */ + pgoff_t ra_offset; /* offset into the file */ int ra_count; /* size of the read */ } ra; int returnCode; @@ -2133,7 +2133,7 @@ retry: } void -FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) +FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info) { int returnCode; @@ -2159,7 +2159,7 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) } ssize_t -FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, +FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info) { ssize_t returnCode; @@ -2216,7 +2216,7 @@ retry: int FileStartReadV(PgAioHandle *ioh, File file, - int iovcnt, off_t offset, + int iovcnt, pgoff_t offset, uint32 wait_event_info) { int returnCode; @@ -2241,7 +2241,7 @@ FileStartReadV(PgAioHandle *ioh, File file, } ssize_t -FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, +FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info) { ssize_t returnCode; @@ -2270,7 +2270,7 @@ FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, */ if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT)) { - off_t past_write = offset; + pgoff_t past_write = offset; for (int i = 0; i < iovcnt; ++i) past_write += iov[i].iov_len; @@ -2309,7 +2309,7 @@ retry: */ if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) { - off_t past_write = offset + returnCode; + pgoff_t past_write = offset + returnCode; if (past_write > vfdP->fileSize) { @@ -2373,7 +2373,7 @@ FileSync(File file, uint32 wait_event_info) * appropriate error. 
*/ int -FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info) +FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info) { int returnCode; ssize_t written; @@ -2418,7 +2418,7 @@ FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info) * appropriate error. */ int -FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info) +FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info) { #ifdef HAVE_POSIX_FALLOCATE int returnCode; @@ -2457,7 +2457,7 @@ retry: return FileZero(file, offset, amount, wait_event_info); } -off_t +pgoff_t FileSize(File file) { Assert(FileIsValid(file)); @@ -2468,14 +2468,14 @@ FileSize(File file) if (FileIsNotOpen(file)) { if (FileAccess(file) < 0) - return (off_t) -1; + return (pgoff_t) -1; } return lseek(VfdCache[file].fd, 0, SEEK_END); } int -FileTruncate(File file, off_t offset, uint32 wait_event_info) +FileTruncate(File file, pgoff_t offset, uint32 wait_event_info) { int returnCode; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 235ba7e191..e3f335a834 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -487,7 +487,7 @@ void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync) { - off_t seekpos; + pgoff_t seekpos; int nbytes; MdfdVec *v; @@ -515,9 +515,9 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE); if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) { @@ -578,7 +578,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, while (remblocks > 0) { 
BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); - off_t seekpos = (off_t) BLCKSZ * segstartblock; + pgoff_t seekpos = (pgoff_t) BLCKSZ * segstartblock; int numblocks; if (segstartblock + remblocks > RELSEG_SIZE) @@ -607,7 +607,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, int ret; ret = FileFallocate(v->mdfd_vfd, - seekpos, (off_t) BLCKSZ * numblocks, + seekpos, (pgoff_t) BLCKSZ * numblocks, WAIT_EVENT_DATA_FILE_EXTEND); if (ret != 0) { @@ -630,7 +630,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, * whole length of the extension. */ ret = FileZero(v->mdfd_vfd, - seekpos, (off_t) BLCKSZ * numblocks, + seekpos, (pgoff_t) BLCKSZ * numblocks, WAIT_EVENT_DATA_FILE_EXTEND); if (ret < 0) ereport(ERROR, @@ -745,7 +745,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, while (nblocks > 0) { - off_t seekpos; + pgoff_t seekpos; MdfdVec *v; int nblocks_this_segment; @@ -754,9 +754,9 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (v == NULL) return false; - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE); nblocks_this_segment = Min(nblocks, @@ -851,7 +851,7 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { struct iovec iov[PG_IOV_MAX]; int iovcnt; - off_t seekpos; + pgoff_t seekpos; int nbytes; MdfdVec *v; BlockNumber nblocks_this_segment; @@ -861,9 +861,9 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE); 
nblocks_this_segment = Min(nblocks, @@ -986,7 +986,7 @@ mdstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks) { - off_t seekpos; + pgoff_t seekpos; MdfdVec *v; BlockNumber nblocks_this_segment; struct iovec *iov; @@ -996,9 +996,9 @@ mdstartreadv(PgAioHandle *ioh, v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE); nblocks_this_segment = Min(nblocks, @@ -1068,7 +1068,7 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { struct iovec iov[PG_IOV_MAX]; int iovcnt; - off_t seekpos; + pgoff_t seekpos; int nbytes; MdfdVec *v; BlockNumber nblocks_this_segment; @@ -1078,9 +1078,9 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE); nblocks_this_segment = Min(nblocks, @@ -1173,7 +1173,7 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, while (nblocks > 0) { BlockNumber nflush = nblocks; - off_t seekpos; + pgoff_t seekpos; MdfdVec *v; int segnum_start, segnum_end; @@ -1202,9 +1202,9 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, Assert(nflush >= 1); Assert(nflush <= nblocks); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH); + 
FileWriteback(v->mdfd_vfd, seekpos, (pgoff_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH); nblocks -= nflush; blocknum += nflush; @@ -1348,7 +1348,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, */ BlockNumber lastsegblocks = nblocks - priorblocks; - if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) + if (FileTruncate(v->mdfd_vfd, (pgoff_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate file \"%s\" to %u blocks: %m", @@ -1484,9 +1484,9 @@ mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off) v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL); - *off = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(*off < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE); return FileGetRawDesc(v->mdfd_vfd); } @@ -1868,7 +1868,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { - off_t len; + pgoff_t len; len = FileSize(seg->mdfd_vfd); if (len < 0) diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 7b62687a2a..cdf08ab5cb 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -656,7 +656,7 @@ compute_remaining_iovec(struct iovec *destination, * error is returned, it is unspecified how much has been written. */ ssize_t -pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) +pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset) { struct iovec iov_copy[PG_IOV_MAX]; ssize_t sum = 0; @@ -706,7 +706,7 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) * is returned with errno set. 
*/ ssize_t -pg_pwrite_zeros(int fd, size_t size, off_t offset) +pg_pwrite_zeros(int fd, size_t size, pgoff_t offset) { static const PGIOAlignedBlock zbuffer = {0}; /* worth BLCKSZ */ void *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data; diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 9fd88953e4..4239713803 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -55,9 +55,9 @@ extern int compute_remaining_iovec(struct iovec *destination, extern ssize_t pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, - off_t offset); + pgoff_t offset); -extern ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset); +extern ssize_t pg_pwrite_zeros(int fd, size_t size, pgoff_t offset); /* Filename components */ #define PG_TEMP_FILES_DIR "pgsql_tmp" diff --git a/src/include/port/pg_iovec.h b/src/include/port/pg_iovec.h index 90be3af449..845ded8c71 100644 --- a/src/include/port/pg_iovec.h +++ b/src/include/port/pg_iovec.h @@ -51,7 +51,7 @@ struct iovec * this changes the current file position. */ static inline ssize_t -pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset) +pg_preadv(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset) { #if HAVE_DECL_PREADV /* @@ -90,7 +90,7 @@ pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset) * this changes the current file position. 
*/ static inline ssize_t -pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset) +pg_pwritev(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset) { #if HAVE_DECL_PWRITEV /* diff --git a/src/include/port/win32_port.h b/src/include/port/win32_port.h index ff7028bdc8..f54ccef7db 100644 --- a/src/include/port/win32_port.h +++ b/src/include/port/win32_port.h @@ -584,9 +584,9 @@ typedef unsigned short mode_t; #endif /* in port/win32pread.c */ -extern ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset); +extern ssize_t pg_pread(int fd, void *buf, size_t nbyte, pgoff_t offset); /* in port/win32pwrite.c */ -extern ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset); +extern ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, pgoff_t offset); #endif /* PG_WIN32_PORT_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index b77d8e5e30..3e821ce8fb 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -108,17 +108,17 @@ extern File PathNameOpenFile(const char *fileName, int fileFlags); extern File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); extern File OpenTemporaryFile(bool interXact); extern void FileClose(File file); -extern int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info); -extern ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info); -extern ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info); -extern int FileStartReadV(struct PgAioHandle *ioh, File file, int iovcnt, off_t offset, uint32 wait_event_info); +extern int FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info); +extern ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info); +extern ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 
wait_event_info); +extern int FileStartReadV(struct PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info); extern int FileSync(File file, uint32 wait_event_info); -extern int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info); -extern int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info); +extern int FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info); +extern int FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info); -extern off_t FileSize(File file); -extern int FileTruncate(File file, off_t offset, uint32 wait_event_info); -extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info); +extern pgoff_t FileSize(File file); +extern int FileTruncate(File file, pgoff_t offset, uint32 wait_event_info); +extern void FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info); extern char *FilePathName(File file); extern int FileGetRawDesc(File file); extern int FileGetRawFlags(File file); @@ -186,8 +186,8 @@ extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern bool pg_file_exists(const char *name); -extern void pg_flush_data(int fd, off_t offset, off_t nbytes); -extern int pg_truncate(const char *path, off_t length); +extern void pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes); +extern int pg_truncate(const char *path, pgoff_t length); extern void fsync_fname(const char *fname, bool isdir); extern int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); extern int durable_rename(const char *oldfile, const char *newfile, int elevel); @@ -196,7 +196,7 @@ extern void SyncDataDirectory(void); extern int data_sync_elevel(int elevel); static inline ssize_t -FileRead(File file, void *buffer, size_t amount, off_t offset, +FileRead(File file, void *buffer, size_t amount, pgoff_t offset, uint32 
wait_event_info) { struct iovec iov = { @@ -208,7 +208,7 @@ FileRead(File file, void *buffer, size_t amount, off_t offset, } static inline ssize_t -FileWrite(File file, const void *buffer, size_t amount, off_t offset, +FileWrite(File file, const void *buffer, size_t amount, pgoff_t offset, uint32 wait_event_info) { struct iovec iov = { diff --git a/src/port/win32pread.c b/src/port/win32pread.c index 32d56c462e..1f00dfd8e6 100644 --- a/src/port/win32pread.c +++ b/src/port/win32pread.c @@ -17,7 +17,7 @@ #include ssize_t -pg_pread(int fd, void *buf, size_t size, off_t offset) +pg_pread(int fd, void *buf, size_t size, pgoff_t offset) { OVERLAPPED overlapped = {0}; HANDLE handle; @@ -30,16 +30,16 @@ pg_pread(int fd, void *buf, size_t size, off_t offset) return -1; } - /* Avoid overflowing DWORD. */ + /* Avoid overflowing DWORD */ size = Min(size, 1024 * 1024 * 1024); - /* Note that this changes the file position, despite not using it. */ - overlapped.Offset = offset; + overlapped.Offset = (DWORD) offset; + overlapped.OffsetHigh = (DWORD) (offset >> 32); + if (!ReadFile(handle, buf, size, &result, &overlapped)) { if (GetLastError() == ERROR_HANDLE_EOF) return 0; - _dosmaperr(GetLastError()); return -1; } diff --git a/src/port/win32pwrite.c b/src/port/win32pwrite.c index 249aa6c468..d9a0d23c2b 100644 --- a/src/port/win32pwrite.c +++ b/src/port/win32pwrite.c @@ -15,9 +15,8 @@ #include "c.h" #include - ssize_t -pg_pwrite(int fd, const void *buf, size_t size, off_t offset) +pg_pwrite(int fd, const void *buf, size_t size, pgoff_t offset) { OVERLAPPED overlapped = {0}; HANDLE handle; @@ -30,11 +29,12 @@ pg_pwrite(int fd, const void *buf, size_t size, off_t offset) return -1; } - /* Avoid overflowing DWORD. */ + /* Avoid overflowing DWORD */ size = Min(size, 1024 * 1024 * 1024); - /* Note that this changes the file position, despite not using it. 
*/ - overlapped.Offset = offset; + overlapped.Offset = (DWORD) offset; + overlapped.OffsetHigh = (DWORD) (offset >> 32); + if (!WriteFile(handle, buf, size, &result, &overlapped)) { _dosmaperr(GetLastError()); diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 14fc761c4c..95af220a4d 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -28,6 +28,7 @@ subdir('test_ginpostinglist') subdir('test_int128') subdir('test_integerset') subdir('test_json_parser') +subdir('test_large_files') subdir('test_lfind') subdir('test_lwlock_tranches') subdir('test_misc') diff --git a/src/test/modules/test_large_files/Makefile b/src/test/modules/test_large_files/Makefile new file mode 100644 index 0000000000..26bb53a51f --- /dev/null +++ b/src/test/modules/test_large_files/Makefile @@ -0,0 +1,20 @@ +# src/test/modules/test_large_files/Makefile + +MODULE_big = test_large_files +OBJS = test_large_files.o + +EXTENSION = test_large_files +DATA = test_large_files--1.0.sql + +REGRESS = test_large_files + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_large_files +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_large_files/README b/src/test/modules/test_large_files/README new file mode 100644 index 0000000000..9df6a2ce84 --- /dev/null +++ b/src/test/modules/test_large_files/README @@ -0,0 +1,53 @@ +Test Module for Windows Large File I/O + +This test module provides functions to test PostgreSQL's ability to +handle files larger than 4GB on Windows. + +Requirements + +- Windows platform +- PostgreSQL built with segment size greater than 2GB +- NTFS filesystem (for sparse file support) + +Functions + +test_create_sparse_file(filename text, size_gb int) RETURNS boolean + +Creates a sparse file of the specified size in gigabytes. 
This allows +testing large offsets without actually writing gigabytes of data to +disk. + +test_sparse_write_read(filename text, offset_gb float8, test_data text) +RETURNS boolean + +Writes test data at the specified offset (in GB) using PostgreSQL's VFD +layer (FileWrite), then reads it back using FileRead to verify basic I/O +functionality. + +test_verify_offset_native(filename text, offset_gb float8, expected_data +text) RETURNS boolean + +Critical for validation: Uses native Windows APIs (ReadFile with proper +OVERLAPPED structure) to verify that data written by PostgreSQL is +actually at the correct offset. This catches bugs where both write and +read might use the same incorrect offset calculation (making a broken +test appear to pass). + +Without this verification, a test could pass even with broken offset +handling if both FileWrite and FileRead make the same mistake. + +What the Test Verifies + +1. Sparse file creation works on Windows +2. PostgreSQL's FileWrite can write at offsets > 4GB +3. PostgreSQL's FileRead can read from offsets > 4GB +4. Data is actually at the correct offset (verified with native Windows + APIs) + +The native verification step is critical because without it, a test +could pass even with broken offset handling. For example, if both +FileWrite and FileRead truncate offsets to 32 bits, writing at 4.5GB +would actually write at ~512MB, and reading at 4.5GB would read from +~512MB - the test would find matching data but at the wrong location. +The native verification catches this by independently checking the +actual file offset. 
diff --git a/src/test/modules/test_large_files/meson.build b/src/test/modules/test_large_files/meson.build new file mode 100644 index 0000000000..c755e2cf16 --- /dev/null +++ b/src/test/modules/test_large_files/meson.build @@ -0,0 +1,29 @@ +# src/test/modules/test_large_files/meson.build + +test_large_files_sources = files( + 'test_large_files.c', +) + +if host_system == 'windows' + test_large_files = shared_module('test_large_files', + test_large_files_sources, + kwargs: pg_test_mod_args, + ) + test_install_libs += test_large_files + + test_install_data += files( + 'test_large_files.control', + 'test_large_files--1.0.sql', + ) + + tests += { + 'name': 'test_large_files', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'tests': [ + 't/001_windows_large_files.pl', + ], + }, + } +endif diff --git a/src/test/modules/test_large_files/t/001_windows_large_files.pl b/src/test/modules/test_large_files/t/001_windows_large_files.pl new file mode 100644 index 0000000000..2fb0ef5e36 --- /dev/null +++ b/src/test/modules/test_large_files/t/001_windows_large_files.pl @@ -0,0 +1,65 @@ +#!/usr/bin/perl +# Copyright (c) 2025, PostgreSQL Global Development Group + +=pod + +=head1 NAME + +001_windows_large_files.pl - Test Windows support for files >4GB + +=head1 SYNOPSIS + + prove src/test/modules/test_large_files/t/001_windows_large_files.pl + +=head1 DESCRIPTION + +This test verifies that PostgreSQL on Windows can correctly handle file +operations at offsets beyond 4GB. This requires PostgreSQL to be +built with a segment size greater than 2GB. + +The test uses sparse files to avoid actually writing gigabytes of data. 
+ +=cut + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use File::Spec; +use File::Temp; + +if ($^O ne 'MSWin32') +{ + plan skip_all => 'test is Windows-specific'; +} + +plan tests => 4; + +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init; +$node->start; + +$node->safe_psql('postgres', 'CREATE EXTENSION test_large_files;'); +pass("test_large_files extension loaded"); + +my $tempdir = File::Temp->newdir(); +my $testfile = File::Spec->catfile($tempdir, 'large_file_test.dat'); + +note "Test file: $testfile"; + +my $create_result = $node->safe_psql('postgres', + "SELECT test_create_sparse_file('$testfile', 5);"); +is($create_result, 't', "Created 5GB sparse file"); + +my $test_4_5gb = $node->safe_psql('postgres', + "SELECT test_sparse_write_read('$testfile', 4.5, 'TEST_DATA_AT_4.5GB');"); +is($test_4_5gb, 't', "Write/read successful at 4.5GB offset"); + +my $verify_4_5gb = $node->safe_psql('postgres', + "SELECT test_verify_offset_native('$testfile', 4.5, 'TEST_DATA_AT_4.5GB');"); +is($verify_4_5gb, 't', "Native verification confirms data at correct 4.5GB offset"); + +$node->stop; + +done_testing(); diff --git a/src/test/modules/test_large_files/test_large_files--1.0.sql b/src/test/modules/test_large_files/test_large_files--1.0.sql new file mode 100644 index 0000000000..c4db84106c --- /dev/null +++ b/src/test/modules/test_large_files/test_large_files--1.0.sql @@ -0,0 +1,36 @@ +-- src/test/modules/test_large_files/test_large_files--1.0.sql + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_large_files" to load this file. \quit + +-- +-- test_create_sparse_file(filename text, size_gb int) returns boolean +-- +-- Creates a sparse file for testing. Windows only. 
+-- +CREATE FUNCTION test_create_sparse_file(filename text, size_gb int) +RETURNS boolean +AS 'MODULE_PATHNAME', 'test_create_sparse_file' +LANGUAGE C STRICT; + +-- +-- test_sparse_write_read(filename text, offset_gb float8, test_data text) returns boolean +-- +-- Writes data at a large offset and reads it back to verify correctness. +-- Tests pg_pwrite/pg_pread with offsets beyond 2GB and 4GB. Windows only. +-- +CREATE FUNCTION test_sparse_write_read(filename text, offset_gb float8, test_data text) +RETURNS boolean +AS 'MODULE_PATHNAME', 'test_sparse_write_read' +LANGUAGE C STRICT; + +-- +-- test_verify_offset_native(filename text, offset_gb float8, expected_data text) returns boolean +-- +-- Uses native Windows APIs to verify data is at the correct offset. +-- This ensures PostgreSQL's I/O didn't write to a wrapped/incorrect offset. +-- +CREATE FUNCTION test_verify_offset_native(filename text, offset_gb float8, expected_data text) +RETURNS boolean +AS 'MODULE_PATHNAME', 'test_verify_offset_native' +LANGUAGE C STRICT; diff --git a/src/test/modules/test_large_files/test_large_files.c b/src/test/modules/test_large_files/test_large_files.c new file mode 100644 index 0000000000..531230da4b --- /dev/null +++ b/src/test/modules/test_large_files/test_large_files.c @@ -0,0 +1,270 @@ +/* src/test/modules/test_large_files/test_large_files.c */ + +#include "postgres.h" + +#include "fmgr.h" +#include "storage/fd.h" +#include "utils/builtins.h" + +#ifdef WIN32 +#include +#include +#endif + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(test_sparse_write_read); +PG_FUNCTION_INFO_V1(test_create_sparse_file); +PG_FUNCTION_INFO_V1(test_verify_offset_native); + +/* + * test_verify_offset_native(filename text, offset_gb float8, expected_data text) returns boolean + * + * Uses native Windows APIs to read data at the specified offset and verify it matches. + * This ensures PostgreSQL's I/O functions wrote to the CORRECT offset, not a wrapped one. + * Windows only. 
+ */ +Datum +test_verify_offset_native(PG_FUNCTION_ARGS) +{ +#ifdef WIN32 + text *filename_text = PG_GETARG_TEXT_PP(0); + float8 offset_gb = PG_GETARG_FLOAT8(1); + text *expected_text = PG_GETARG_TEXT_PP(2); + char *filename; + char *expected_data; + char *read_buffer; + int expected_len; + int64 offset; + HANDLE hFile; + OVERLAPPED overlapped = {0}; + DWORD bytesRead; + bool success = false; + + filename = text_to_cstring(filename_text); + expected_data = text_to_cstring(expected_text); + expected_len = strlen(expected_data) + 1; + + /* Calculate offset in bytes */ + offset = (int64) (offset_gb * 1024.0 * 1024.0 * 1024.0); + + /* Open file with native Windows API */ + hFile = CreateFile(filename, + GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + NULL); + + if (hFile == INVALID_HANDLE_VALUE) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for verification: %lu", + filename, GetLastError()))); + + /* Set up OVERLAPPED structure with proper 64-bit offset */ + overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF); + overlapped.OffsetHigh = (DWORD)(offset >> 32); + + /* Allocate read buffer */ + read_buffer = palloc(expected_len); + + /* Read using native Windows API */ + if (!ReadFile(hFile, read_buffer, expected_len, &bytesRead, &overlapped)) + { + DWORD error = GetLastError(); + CloseHandle(hFile); + pfree(read_buffer); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("native ReadFile failed at offset %lld: %lu", + offset, error))); + } + + if (bytesRead != expected_len) + { + CloseHandle(hFile); + pfree(read_buffer); + ereport(ERROR, + (errmsg("native ReadFile read %lu bytes, expected %d", + bytesRead, expected_len))); + } + + /* Verify data matches */ + success = (memcmp(expected_data, read_buffer, expected_len) == 0); + + pfree(read_buffer); + CloseHandle(hFile); + + if (!success) + ereport(ERROR, + (errmsg("data mismatch at offset %lld: PostgreSQL wrote to wrong 
location", + offset))); + + PG_RETURN_BOOL(success); +#else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("this test is only supported on Windows"))); + PG_RETURN_BOOL(false); +#endif +} + +/* + * test_create_sparse_file(filename text, size_gb int) returns boolean + * + * Creates a sparse file of the specified size in gigabytes. + * Windows only. + */ +Datum +test_create_sparse_file(PG_FUNCTION_ARGS) +{ +#ifdef WIN32 + text *filename_text = PG_GETARG_TEXT_PP(0); + int32 size_gb = PG_GETARG_INT32(1); + char *filename; + HANDLE hFile; + DWORD bytesReturned; + LARGE_INTEGER fileSize; + bool success = false; + + filename = text_to_cstring(filename_text); + + /* Open/create the file */ + hFile = CreateFile(filename, + GENERIC_WRITE, + 0, + NULL, + CREATE_ALWAYS, + FILE_ATTRIBUTE_NORMAL, + NULL); + + if (hFile == INVALID_HANDLE_VALUE) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %lu", + filename, GetLastError()))); + + /* Mark as sparse */ + if (!DeviceIoControl(hFile, FSCTL_SET_SPARSE, NULL, 0, NULL, 0, + &bytesReturned, NULL)) + { + CloseHandle(hFile); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not set file sparse: %lu", GetLastError()))); + } + + /* Set file size */ + fileSize.QuadPart = (int64) size_gb * 1024 * 1024 * 1024; + if (!SetFilePointerEx(hFile, fileSize, NULL, FILE_BEGIN)) + { + CloseHandle(hFile); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not set file pointer: %lu", GetLastError()))); + } + + if (!SetEndOfFile(hFile)) + { + CloseHandle(hFile); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not set end of file: %lu", GetLastError()))); + } + + success = true; + CloseHandle(hFile); + + PG_RETURN_BOOL(success); +#else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("sparse files are only supported on Windows"))); + PG_RETURN_BOOL(false); +#endif +} + +/* + * test_sparse_write_read(filename text, offset_gb numeric, 
test_data text) returns boolean + * + * Writes test data at the specified offset (in GB) and reads it back to verify. + * Tests that pg_pwrite and pg_pread work correctly with large offsets. + * Windows only. + */ +Datum +test_sparse_write_read(PG_FUNCTION_ARGS) +{ +#ifdef WIN32 + text *filename_text = PG_GETARG_TEXT_PP(0); + float8 offset_gb = PG_GETARG_FLOAT8(1); + text *test_data_text = PG_GETARG_TEXT_PP(2); + char *filename; + char *test_data; + char *read_buffer; + int test_data_len; + pgoff_t offset; + int fd; + ssize_t written; + ssize_t nread; + bool success = false; + + filename = text_to_cstring(filename_text); + test_data = text_to_cstring(test_data_text); + test_data_len = strlen(test_data) + 1; /* include null terminator */ + + /* Calculate offset in bytes */ + offset = (pgoff_t) (offset_gb * 1024.0 * 1024.0 * 1024.0); + + /* Open the file using PostgreSQL's VFD layer */ + fd = BasicOpenFile(filename, O_RDWR | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", filename))); + + /* Write test data at the specified offset using pg_pwrite */ + written = pg_pwrite(fd, test_data, test_data_len, offset); + if (written != test_data_len) + { + close(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file at offset %lld: wrote %zd of %d bytes", + (long long) offset, written, test_data_len))); + } + + /* Allocate buffer for reading */ + read_buffer = palloc(test_data_len); + + /* Read back the data using pg_pread */ + nread = pg_pread(fd, read_buffer, test_data_len, offset); + if (nread != test_data_len) + { + close(fd); + pfree(read_buffer); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from file at offset %lld: read %zd of %d bytes", + (long long) offset, nread, test_data_len))); + } + + /* Verify data matches */ + success = (memcmp(test_data, read_buffer, test_data_len) == 0); + + pfree(read_buffer); + close(fd); + + if (!success) + 
ereport(ERROR, + (errmsg("data mismatch: read data does not match written data"))); + + PG_RETURN_BOOL(success); +#else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("this test is only supported on Windows"))); + PG_RETURN_BOOL(false); +#endif +} diff --git a/src/test/modules/test_large_files/test_large_files.control b/src/test/modules/test_large_files/test_large_files.control new file mode 100644 index 0000000000..9b0a30974b --- /dev/null +++ b/src/test/modules/test_large_files/test_large_files.control @@ -0,0 +1,5 @@ +# test_large_files extension +comment = 'Test module for large file I/O on Windows' +default_version = '1.0' +module_pathname = '$libdir/test_large_files' +relocatable = true -- 2.49.0