From fffa7153f563a19663a02e44196f377b83bf217f Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Thu, 16 Apr 2026 14:56:11 -0700
Subject: [PATCH v3 2/4] Move UTF8 checks into unicode_case.c.

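Pre-checking UTF-8 is inefficient. Refactor the error paths so that
UTF-8 errors are caught while iterating: the conversion functions in
unicode_case.c now report how many bytes of the source were consumed,
and pg_locale_builtin.c throws the error if that is less than the
source length.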

Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/c355354e6c3f4a7aafb047361b73db247260fca0.camel@j-davis.com
---
 src/backend/utils/adt/pg_locale_builtin.c | 85 +++++++++++++++------
 src/common/unicode/case_test.c            | 33 +++++---
 src/common/unicode_case.c                 | 92 ++++++++++++++++-------
 src/include/common/unicode_case.h         |  8 +-
 4 files changed, 156 insertions(+), 62 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 7f167e751ea..96da9c6fcf3 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -62,21 +62,32 @@ initcap_wbnext(void *state)
 
 	while (wbstate->offset < wbstate->len)
 	{
-		char32_t	u = utf8_to_unicode((const unsigned char *) wbstate->str +
+		int			ulen = pg_utf_mblen((const unsigned char *) wbstate->str +
 										wbstate->offset);
-		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
+		char32_t	u;
+		bool		curr_alnum;
+
+		if (wbstate->offset + ulen > wbstate->len)
+		{
+			wbstate->offset = wbstate->len;
+			return wbstate->len;
+		}
+
+		u = utf8_to_unicode((const unsigned char *) wbstate->str +
+							wbstate->offset);
+		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
 		{
 			size_t		prev_offset = wbstate->offset;
 
 			wbstate->init = true;
-			wbstate->offset += unicode_utf8len(u);
+			wbstate->offset += ulen;
 			wbstate->prev_alnum = curr_alnum;
 			return prev_offset;
 		}
 
-		wbstate->offset += unicode_utf8len(u);
+		wbstate->offset += ulen;
 	}
 
 	return wbstate->len;
@@ -86,9 +97,16 @@ static size_t
 strlower_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
 				 pg_locale_t locale)
 {
-	pg_verifymbstr(src, srclen, false);
-	return unicode_strlower(dest, destsize, src, srclen,
-							locale->builtin.casemap_full);
+	size_t	consumed;
+	size_t	result;
+
+	result = unicode_strlower(dest, destsize, src, srclen, &consumed,
+							  locale->builtin.casemap_full);
+	if (consumed < srclen)
+		report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+								srclen - consumed);
+
+	return result;
 }
 
 static size_t
@@ -96,36 +114,57 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
 				 pg_locale_t locale)
 {
 	struct WordBoundaryState wbstate = {
-		.str = src,
-		.len = srclen,
-		.offset = 0,
-		.posix = !locale->builtin.casemap_full,
-		.init = false,
-		.prev_alnum = false,
+		.str						 = src,
+		.len						 = srclen,
+		.offset						 = 0,
+		.posix						 = !locale->builtin.casemap_full,
+		.init						 = false,
+		.prev_alnum					 = false,
 	};
+	size_t	consumed;
+	size_t	result;
+
+	result = unicode_strtitle(dest, destsize, src, srclen, &consumed,
+							  locale->builtin.casemap_full,
+							  initcap_wbnext, &wbstate);
 
-	pg_verifymbstr(src, srclen, false);
-	return unicode_strtitle(dest, destsize, src, srclen,
-							locale->builtin.casemap_full,
-							initcap_wbnext, &wbstate);
+	if (consumed < srclen)
+		report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+								srclen - consumed);
+
+	return result;
 }
 
 static size_t
 strupper_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
 				 pg_locale_t locale)
 {
-	pg_verifymbstr(src, srclen, false);
-	return unicode_strupper(dest, destsize, src, srclen,
-							locale->builtin.casemap_full);
+	size_t	consumed;
+	size_t	result;
+
+	result = unicode_strupper(dest, destsize, src, srclen, &consumed,
+							  locale->builtin.casemap_full);
+	if (consumed < srclen)
+		report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+								srclen - consumed);
+
+	return result;
 }
 
 static size_t
 strfold_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
 				pg_locale_t locale)
 {
-	pg_verifymbstr(src, srclen, false);
-	return unicode_strfold(dest, destsize, src, srclen,
-						   locale->builtin.casemap_full);
+	size_t	consumed;
+	size_t	result;
+
+	result = unicode_strfold(dest, destsize, src, srclen, &consumed,
+							 locale->builtin.casemap_full);
+	if (consumed < srclen)
+		report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+								srclen - consumed);
+
+	return result;
 }
 
 static bool
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index a0dbf00b671..ae0f86ffa0c 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -115,6 +115,7 @@ icu_test_full(char *str)
 	char		icu_fold[BUFSZ];
 	UErrorCode	status;
 	size_t		len = strlen(str);
+	size_t		consumed;
 
 	/* full case mapping doesn't use posix semantics */
 	struct WordBoundaryState wbstate = {
@@ -126,10 +127,10 @@ icu_test_full(char *str)
 		.prev_alnum = false,
 	};
 
-	unicode_strlower(lower, BUFSZ, str, len, true);
-	unicode_strtitle(title, BUFSZ, str, len, true, initcap_wbnext, &wbstate);
-	unicode_strupper(upper, BUFSZ, str, len, true);
-	unicode_strfold(fold, BUFSZ, str, len, true);
+	unicode_strlower(lower, BUFSZ, str, len, &consumed, true);
+	unicode_strtitle(title, BUFSZ, str, len, &consumed, true, initcap_wbnext, &wbstate);
+	unicode_strupper(upper, BUFSZ, str, len, &consumed, true);
+	unicode_strfold(fold, BUFSZ, str, len, &consumed, true);
 	status = U_ZERO_ERROR;
 	ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, len, &status);
 	status = U_ZERO_ERROR;
@@ -260,13 +261,16 @@ static size_t
 tfunc_lower(char *dst, size_t dstsize, const char *src,
 			size_t srclen)
 {
-	return unicode_strlower(dst, dstsize, src, srclen, true);
+	size_t		consumed;
+
+	return unicode_strlower(dst, dstsize, src, srclen, &consumed, true);
 }
 
 static size_t
 tfunc_title(char *dst, size_t dstsize, const char *src,
 			size_t srclen)
 {
+	size_t		consumed;
 	struct WordBoundaryState wbstate = {
 		.str = src,
 		.len = srclen,
@@ -275,27 +279,34 @@ tfunc_title(char *dst, size_t dstsize, const char *src,
 		.prev_alnum = false,
 	};
 
-	return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
-							&wbstate);
+	return unicode_strtitle(dst, dstsize, src, srclen, &consumed, true,
+							initcap_wbnext, &wbstate);
 }
 
 static size_t
 tfunc_upper(char *dst, size_t dstsize, const char *src,
 			size_t srclen)
 {
-	return unicode_strupper(dst, dstsize, src, srclen, true);
+	size_t		consumed;
+
+	return unicode_strupper(dst, dstsize, src, srclen, &consumed, true);
 }
 
 static size_t
 tfunc_fold(char *dst, size_t dstsize, const char *src,
 		   size_t srclen)
 {
-	return unicode_strfold(dst, dstsize, src, srclen, true);
+	size_t		consumed;
+
+	return unicode_strfold(dst, dstsize, src, srclen, &consumed, true);
 }
 
 static void
 test_convert_case(void)
 {
+	size_t		needed;
+	size_t		consumed;
+
 	/* test string with no case changes */
 	test_convert(tfunc_lower, "√∞", "√∞");
 	/* test adjust-to-cased behavior */
@@ -320,6 +331,10 @@ test_convert_case(void)
 	/* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
 	test_convert(tfunc_title, "\uFF11a", "\uFF11a");
 
+	/* invalid UTF8: 2-byte lead 0xCE truncated by end of input */
+	needed = unicode_strfold(NULL, 0, "abc\xCE", 4, &consumed, false);
+	Assert(consumed == 3);
+	Assert(needed == 3);
 
 #ifdef USE_ICU
 	icu_test_full("");
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index d6ee00b7d9c..4a692cfa249 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -40,8 +40,8 @@ static const char32_t *const casekind_map[NCaseKind] =
 
 static char32_t find_case_map(char32_t ucs, const char32_t *map);
 static size_t convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
-						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
-						   void *wbstate);
+						   size_t *pconsumed, CaseKind str_casekind, bool full,
+						   WordBoundaryNext wbnext, void *wbstate);
 static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
 								  const char *src, size_t srclen, size_t srcoff,
 								  char32_t *simple, const char32_t **special);
@@ -82,7 +82,8 @@ unicode_casefold_simple(char32_t code)
  * unicode_strlower()
  *
  * Convert src to lowercase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the number of bytes of src
+ * successfully consumed; a value less than srclen means invalid UTF-8.
  *
  * String src must be encoded in UTF-8.
  *
@@ -98,17 +99,18 @@ unicode_casefold_simple(char32_t code)
  */
 size_t
 unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen,
-				 bool full)
+				 size_t *pconsumed, bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
-						NULL);
+	return convert_case(dst, dstsize, src, srclen, pconsumed, CaseLower, full,
+						NULL, NULL);
 }
 
 /*
  * unicode_strtitle()
  *
  * Convert src to titlecase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the number of bytes of src
+ * successfully consumed; a value less than srclen means invalid UTF-8.
  *
  * String src must be encoded in UTF-8.
  *
@@ -134,17 +136,19 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen,
  */
 size_t
 unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen,
-				 bool full, WordBoundaryNext wbnext, void *wbstate)
+				 size_t *pconsumed, bool full, WordBoundaryNext wbnext,
+				 void *wbstate)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
-						wbstate);
+	return convert_case(dst, dstsize, src, srclen, pconsumed, CaseTitle, full,
+						wbnext, wbstate);
 }
 
 /*
  * unicode_strupper()
  *
  * Convert src to uppercase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the number of bytes of src
+ * successfully consumed; a value less than srclen means invalid UTF-8.
  *
  * String src must be encoded in UTF-8.
  *
@@ -160,17 +164,18 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen,
  */
 size_t
 unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen,
-				 bool full)
+				 size_t *pconsumed, bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
-						NULL);
+	return convert_case(dst, dstsize, src, srclen, pconsumed, CaseUpper, full,
+						NULL, NULL);
 }
 
 /*
  * unicode_strfold()
  *
  * Case fold src, and return the result length (not including terminating
- * NUL).
+ * NUL). Sets *pconsumed to the number of bytes of src successfully
+ * consumed; a value less than srclen means invalid UTF-8.
  *
  * String src must be encoded in UTF-8.
  *
@@ -183,10 +188,26 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen,
  */
 size_t
 unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
-				bool full)
+				size_t *pconsumed, bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
-						NULL);
+	return convert_case(dst, dstsize, src, srclen, pconsumed, CaseFold, full,
+						NULL, NULL);
+}
+
+/* local, inlinable version of pg_utf_mblen(); -1 means invalid lead byte */
+static int
+utf8_mblen(const unsigned char *s)
+{
+	if ((*s & 0x80) == 0)
+		return 1;				/* 0xxxxxxx */
+	else if ((*s & 0xe0) == 0xc0)
+		return 2;				/* 110xxxxx */
+	else if ((*s & 0xf0) == 0xe0)
+		return 3;				/* 1110xxxx */
+	else if ((*s & 0xf8) == 0xf0)
+		return 4;				/* 11110xxx */
+	else
+		return -1;				/* 10xxxxxx or 11111xxx: not a lead byte */
 
 /*
@@ -207,8 +228,8 @@ unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
  */
 static size_t
 convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
-			 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
-			 void *wbstate)
+			 size_t *pconsumed, CaseKind str_casekind, bool full,
+			 WordBoundaryNext wbnext, void *wbstate)
 {
 	/* character CaseKind varies while titlecasing */
 	CaseKind	chr_casekind = str_casekind;
@@ -227,12 +248,18 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 
 	while (srcoff < srclen)
 	{
-		char32_t	u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
-		int			u1len = unicode_utf8len(u1);
+		int			u1len = utf8_mblen((const unsigned char *) src + srcoff);
+		char32_t	u1;
 		char32_t	simple = 0;
 		const char32_t *special = NULL;
 		enum CaseMapResult casemap_result;
 
+		/* invalid lead byte, or sequence truncated by end of src */
+		if (u1len < 0 || srcoff + u1len > srclen)
+			break;
+
+		u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
+
 		if (str_casekind == CaseTitle)
 		{
 			if (srcoff == boundary)
@@ -293,6 +320,7 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 	if (result_len < dstsize)
 		dst[result_len] = '\0';
 
+	*pconsumed = srcoff;
 	return result_len;
 }
 
@@ -316,7 +344,14 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			char32_t	curr = utf8_to_unicode(str + i);
+			int			u1len = utf8_mblen((const unsigned char *) str + i);
+			char32_t	curr;
+
+			/* invalid lead byte, or sequence truncated by end of str */
+			if (u1len < 0 || i + u1len > len)
+				return false;
+
+			curr = utf8_to_unicode(str + i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
@@ -327,8 +362,6 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 		}
 		else if ((str[i] & 0xC0) == 0x80)
 			continue;
-
-		Assert(false);			/* invalid UTF-8 */
 	}
 
 	/* end of string is not followed by a Cased character */
@@ -340,7 +373,14 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			char32_t	curr = utf8_to_unicode(str + i);
+			int			u1len = utf8_mblen((const unsigned char *) str + i);
+			char32_t	curr;
+
+			/* invalid lead byte, or sequence truncated by end of str */
+			if (u1len < 0 || i + u1len > len)
+				return false;
+
+			curr = utf8_to_unicode(str + i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index 03add78cabe..1cbc0c14bc2 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -21,13 +21,13 @@ char32_t	unicode_titlecase_simple(char32_t code);
 char32_t	unicode_uppercase_simple(char32_t code);
 char32_t	unicode_casefold_simple(char32_t code);
 size_t		unicode_strlower(char *dst, size_t dstsize, const char *src,
-							 size_t srclen, bool full);
+							 size_t srclen, size_t *pconsumed, bool full);
 size_t		unicode_strtitle(char *dst, size_t dstsize, const char *src,
-							 size_t srclen, bool full,
+							 size_t srclen, size_t *pconsumed, bool full,
 							 WordBoundaryNext wbnext, void *wbstate);
 size_t		unicode_strupper(char *dst, size_t dstsize, const char *src,
-							 size_t srclen, bool full);
+							 size_t srclen, size_t *pconsumed, bool full);
 size_t		unicode_strfold(char *dst, size_t dstsize, const char *src,
-							size_t srclen, bool full);
+							size_t srclen, size_t *pconsumed, bool full);
 
 #endif							/* UNICODE_CASE_H */
-- 
2.43.0