From 8383a00e7a2fe1f131416fb63e5e34fedf658243 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Fri, 3 Jul 2026 12:20:26 +0300
Subject: [PATCH v4 2/2] libpq: Drain all pending bytes from SSL/GSS during
 pqReadData()

The previous commit strengthened a workaround for a hang when large
messages are split across TLS records/GSS tokens. Because that
workaround is implemented in libpq internals, it can only help us when
libpq itself is polling on the socket. In nonblocking situations,
where the client above libpq is expected to poll, the same bugs can
show up.

As a contrived example, consider a large protocol-2.0 error coming
back from a server during PQconnectPoll(), split in an odd way across
two records:

    -- TLS record (8192-byte payload) --
    EEEE[...repeated a total of 8192 times]
    -- TLS record (8193-byte payload) --
    EEEE[...repeated a total of 8192 times]\0

The first record will fill the first half of the libpq receive buffer,
which is 16k long by default. The second record completely fills the
last half with its first 8192 bytes, leaving the terminating NULL in
the OpenSSL buffer. Since we still haven't seen the terminator at our
level, PQconnectPoll() will return PGRES_POLLING_READING, expecting to
come back when the server has sent "the rest" of the data.  But there
is nothing left to read from the socket; OpenSSL had to pull all of
the data in the 8193-byte record off of the wire to decrypt it.

A real server would probably not split up the records this way, nor
keep the connection open after sending a fatal connection error. But
servers that regularly use larger TLS records can get the libpq
receive buffer into the same state if DataRows are big enough, as
reported on the list. While the PostgreSQL server doesn't use larger
TLS records like that, other non-PostgreSQL servers that implement the
wire protocol are known to do that, as well as proxies that sit
between the server and the client

This is a layering violation. libpq makes decisions based on data in
the application buffer, above the transport buffer (whether SSL or
GSS), but clients are polling the socket below the transport buffer.
One way to fix this in a backportable way, without changing APIs too
much, is to ensure data never stays in the transport buffer. Then
pqReadData's postconditions will look similar for both raw sockets and
SSL/GSS: any available data is either in the application buffer, or
still on the socket.

Building on the prior commit, make pqReadData() to drain all pending
data from the transport layer into conn->inBuffer, expanding the
buffer as necessary. This is not particularly efficient from an
architectural perspective (the pqsecure_read() implementations take
care to fit their packets into the current buffer, and that effort is
now completely discarded), but it's hopefully easier to reason about
than a full rewrite would be for the back branches.

Author: Jacob Champion <jacob.champion@enterprisedb.com>
Reviewed-by: Mark Dilger <mark.dilger@enterprisedb.com>
Reviewed-by: solai v <solai.cdac@gmail.com>
Reported-by: Lars Kanis <lars@greiz-reinsdorf.de>
Discussion: https://postgr.es/m/2039ac58-d3e0-434b-ac1a-2a987f3b4cb1%40greiz-reinsdorf.de
Backpatch-through: 14
---
 src/interfaces/libpq/fe-misc.c           | 145 ++++++++++++++++++++++-
 src/interfaces/libpq/fe-secure-openssl.c |   4 +-
 src/interfaces/libpq/fe-secure.c         |   3 +-
 3 files changed, 148 insertions(+), 4 deletions(-)

diff --git a/src/interfaces/libpq/fe-misc.c b/src/interfaces/libpq/fe-misc.c
index 58ccca1e393..dd772838005 100644
--- a/src/interfaces/libpq/fe-misc.c
+++ b/src/interfaces/libpq/fe-misc.c
@@ -55,6 +55,8 @@ static int	pqPutMsgBytes(const void *buf, size_t len, PGconn *conn);
 static int	pqSendSome(PGconn *conn, int len);
 static int	pqSocketCheck(PGconn *conn, int forRead, int forWrite,
 						  pg_usec_time_t end_time);
+static int	pqReadData_internal(PGconn *conn);
+static int	pqDrainPending(PGconn *conn);
 
 /*
  * PQlibVersion: return the libpq version number
@@ -593,6 +595,13 @@ pqPutMsgEnd(PGconn *conn)
 
 /* ----------
  * pqReadData: read more data, if any is available
+ *
+ * Upon a successful return, callers may assume that either 1) all available
+ * bytes have been consumed from the socket, or 2) the socket is still marked
+ * readable by the OS.  (In other words: after a successful pqReadData, it's
+ * safe to tell a client to poll for readable bytes on the socket without any
+ * further draining of the SSL/GSS transport buffers.)
+ *
  * Possible return values:
  *	 1: successfully loaded at least one more byte
  *	 0: no data is presently available, but no error detected
@@ -605,8 +614,7 @@ pqPutMsgEnd(PGconn *conn)
 int
 pqReadData(PGconn *conn)
 {
-	int			someread = 0;
-	int			nread;
+	int			available;
 
 	if (conn->sock == PGINVALID_SOCKET)
 	{
@@ -614,6 +622,40 @@ pqReadData(PGconn *conn)
 		return -1;
 	}
 
+	available = pqReadData_internal(conn);
+	if (available < 0)
+		return -1;
+	else if (available > 0)
+	{
+		/*
+		 * Make sure there are no bytes stuck in layers between conn->inBuffer
+		 * and the socket, to make it safe for clients to poll on PQsocket().
+		 */
+		if (pqDrainPending(conn))
+			return -1;
+	}
+	else
+	{
+		/*
+		 * If we're not returning any bytes from the underlying transport,
+		 * that must imply there aren't any in the transport buffer...
+		 */
+		Assert(pqsecure_bytes_pending(conn) == 0);
+	}
+
+	return available;
+}
+
+/*
+ * Workhorse for pqReadData().  It's kept separate from the pqDrainPending()
+ * logic to avoid adding to this function's goto complexity.
+ */
+static int
+pqReadData_internal(PGconn *conn)
+{
+	int			someread = 0;
+	int			nread;
+
 	/* Left-justify any data in the buffer to make room */
 	if (conn->inStart < conn->inEnd)
 	{
@@ -800,6 +842,105 @@ definitelyFailed:
 	return -1;
 }
 
+/*---
+ * Drain any transport data that is already buffered in userspace and add it
+ * to conn->inBuffer, enlarging inBuffer if necessary.  The drain fails if
+ * inBuffer cannot be made to hold all available transport data.
+ *
+ * We assume that the underlying secure transport implementation does not
+ * attempt to read any more data from the socket while draining the transport
+ * buffer.  After a successful return, pqsecure_bytes_pending() must be zero.
+ *
+ * This operation is necessary to prevent deadlock, due to a layering
+ * violation designed into our asynchronous client API: pqReadData() and all
+ * the parsing routines above it receive data from the SSL/GSS transport
+ * buffer, but clients poll on the raw PQsocket() handle.  So data can be
+ * "lost" in the intermediate layer if we don't take it out here.
+ *
+ * To illustrate what we're trying to prevent, say that the server is sending
+ * two messages at once in response to a query (Aaaa and Bb), the libpq buffer
+ * is five characters in size, and TLS records max out at three-character
+ * payloads.  Here's what would happen if pqReadData() didn't call
+ * pqDrainPending():
+ *
+ *   Client    libpq      SSL      Socket
+ *     |         |         |         |
+ *     |      [     ]    [   ]     [   ]    [1] Buffers are empty, client is
+ *     x --------------------------> |          polling on socket
+ *     |         |         |         |
+ *     |      [     ]    [   ]     [xxx]    [2] First record is received; poll
+ *     | <-------------------------- |          signals read-ready
+ *     |         |         |         |
+ *     x ---> [     ]    [   ]     [xxx]    [3] Client calls PQconsumeInput()
+ *     |         |         |         |
+ *     |      [     ] -> [   ]     [xxx]    [4] libpq calls pqReadData() to fill
+ *     |         |         |         |          the receive buffer
+ *     |      [     ]    [Aaa] <-- [   ]    [5] SSL pulls payload off the wire
+ *     |         |         |         |          and decrypts it
+ *     |      [Aaa  ] <- [   ]     [   ]    [6] pqsecure_read() takes all data
+ *     |         |         |         |
+ *     | <--- [Aaa  ]    [   ]     [   ]    [7] PQconsumeInput() returns with a
+ *     x --------------------------> |          partial message, PQisBusy() is
+ *     |         |         |         |          still true, client polls again
+ *     |      [Aaa  ]    [   ]     [xxx]    [8] Second record is received; poll
+ *     | <-------------------------- |          signals read-ready
+ *     |         |         |         |
+ *     x ---> [Aaa  ]    [   ]     [xxx]    [9] Client calls PQconsumeInput()
+ *     |         |         |         |
+ *     |      [Aaa  ] -> [   ]     [xxx]   [10] libpq calls pqReadData() to fill
+ *     |         |         |         |          the receive buffer
+ *     |      [Aaa  ]    [aBb] <-- [   ]   [11] SSL decrypts
+ *     |         |         |         |
+ *     |      [AaaaB] <- [b  ]     [   ]   [12] pqsecure_read() fills its
+ *     |         |         |         |          buffer, taking only two bytes
+ *     | <--- [AaaaB]    [b  ]     [   ]   [13] PQconsumeInput() returns with a
+ *     |         |         |         |          complete message buffered;
+ *     |         |         |         |          PQisBusy() is false
+ *     x ---> [AaaaB]    [b  ]     [   ]   [14] Client calls PQgetResult()
+ *     |         |         |         |
+ *     | <--- [B    ]    [b  ]     [   ]   [15] Aaaa is returned; PQisBusy() is
+ *     x --------------------------> |          true and client polls again
+ *     .         |         |         .
+ *     .      [B    ]    [b  ]       .     [16] No packets, and client hangs.
+ *     .         |         |         .
+ *
+ * The pqDrainPending() call fixes the above scenario at step [13].  Before
+ * returning to the Client, it first expands the libpq buffer and moves the
+ * remaining data from the SSL buffer to the libpq buffer.
+ *
+ * The function returns 0 on success and -1 on error.  Success means that
+ * there was no data pending or it was successfully drained to conn->inBuffer.
+ * On error, conn->errorMessage is set.
+ */
+static int
+pqDrainPending(PGconn *conn)
+{
+	ssize_t		bytes_pending;
+	ssize_t		nread;
+
+	bytes_pending = pqsecure_bytes_pending(conn);
+	if (bytes_pending <= 0)
+		return bytes_pending;
+
+	/* Expand the input buffer if necessary. */
+	if (pqCheckInBufferSpace(conn->inEnd + (size_t) bytes_pending, conn))
+		return -1;				/* errorMessage already set */
+
+	nread = pqsecure_read(conn, conn->inBuffer + conn->inEnd,
+						  bytes_pending);
+	conn->inEnd += nread;
+
+	/* When there are bytes pending, the read function is not supposed to fail */
+	if (nread != bytes_pending)
+	{
+		libpq_append_conn_error(conn,
+								"drained only %zu of %zd pending bytes in transport buffer",
+								nread, bytes_pending);
+		return -1;
+	}
+	return 0;
+}
+
 /*
  * pqSendSome: send data waiting in the output buffer.
  *
diff --git a/src/interfaces/libpq/fe-secure-openssl.c b/src/interfaces/libpq/fe-secure-openssl.c
index 1b22ba7b7f6..f8b2184a1ce 100644
--- a/src/interfaces/libpq/fe-secure-openssl.c
+++ b/src/interfaces/libpq/fe-secure-openssl.c
@@ -236,7 +236,9 @@ pgtls_bytes_pending(PGconn *conn)
 	int			pending;
 
 	/*
-	 * OpenSSL readahead is documented to break SSL_pending().
+	 * OpenSSL readahead is documented to break SSL_pending().  Plus, we can't
+	 * afford to have OpenSSL take bytes off the socket without processing
+	 * them; that breaks the postconditions for pqsecure_drain_pending().
 	 */
 	Assert(!SSL_get_read_ahead(conn->ssl));
 
diff --git a/src/interfaces/libpq/fe-secure.c b/src/interfaces/libpq/fe-secure.c
index 907cdb9ea39..70faf8b2fe0 100644
--- a/src/interfaces/libpq/fe-secure.c
+++ b/src/interfaces/libpq/fe-secure.c
@@ -247,7 +247,8 @@ pqsecure_raw_read(PGconn *conn, void *ptr, size_t len)
  *	Return the number of bytes available in the transport buffer.
  *
  * If pqsecure_read() is called for this number of bytes, it's guaranteed to
- * return successfully without reading from the underlying socket.
+ * return successfully without reading from the underlying socket.  See
+ * pqDrainPending() for a more complete discussion of the concepts involved.
  */
 ssize_t
 pqsecure_bytes_pending(PGconn *conn)
-- 
2.47.3