From d3621a42037e91a02b24c8974bd48ce9a42febf4 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Mon, 22 Jun 2026 16:31:38 -0700
Subject: [PATCH v4 2/4] unicode_case.c: change API to signal UTF8 decoding
 error.

UTF-8 decoding errors are not expected at this point, but if one is
encountered, report the number of source bytes consumed so that the
caller can raise the appropriate error.

Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/c355354e6c3f4a7aafb047361b73db247260fca0.camel@j-davis.com
---
 src/backend/utils/adt/pg_locale_builtin.c | 50 +++++++++++++++++++----
 src/common/unicode/case_test.c            | 33 +++++++++++----
 src/common/unicode_case.c                 | 46 ++++++++++++---------
 src/include/common/unicode_case.h         |  8 ++--
 4 files changed, 95 insertions(+), 42 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 7c36fd5091b..5619daf43c3 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -98,8 +98,16 @@ static size_t
 strlower_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
 				 pg_locale_t locale)
 {
-	return unicode_strlower(dest, destsize, src, srclen,
-							locale->builtin.casemap_full);
+	size_t		consumed;
+	size_t		result;
+
+	result = unicode_strlower(dest, destsize, src, srclen, &consumed,
+							  locale->builtin.casemap_full);
+	if (consumed < srclen)
+		report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+								srclen - consumed);
+
+	return result;
 }
 
 static size_t
@@ -114,26 +122,50 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
 		.init = false,
 		.prev_alnum = false,
 	};
+	size_t		consumed;
+	size_t		result;
 
-	return unicode_strtitle(dest, destsize, src, srclen,
-							locale->builtin.casemap_full,
-							initcap_wbnext, &wbstate);
+	result = unicode_strtitle(dest, destsize, src, srclen, &consumed,
+							  locale->builtin.casemap_full,
+							  initcap_wbnext, &wbstate);
+
+	if (consumed < srclen)
+		report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+								srclen - consumed);
+
+	return result;
 }
 
 static size_t
 strupper_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
 				 pg_locale_t locale)
 {
-	return unicode_strupper(dest, destsize, src, srclen,
-							locale->builtin.casemap_full);
+	size_t		consumed;
+	size_t		result;
+
+	result = unicode_strupper(dest, destsize, src, srclen, &consumed,
+							  locale->builtin.casemap_full);
+	if (consumed < srclen)
+		report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+								srclen - consumed);
+
+	return result;
 }
 
 static size_t
 strfold_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
 				pg_locale_t locale)
 {
-	return unicode_strfold(dest, destsize, src, srclen,
-						   locale->builtin.casemap_full);
+	size_t		consumed;
+	size_t		result;
+
+	result = unicode_strfold(dest, destsize, src, srclen, &consumed,
+							 locale->builtin.casemap_full);
+	if (consumed < srclen)
+		report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+								srclen - consumed);
+
+	return result;
 }
 
 static bool
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index a0dbf00b671..ae0f86ffa0c 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -115,6 +115,7 @@ icu_test_full(char *str)
 	char		icu_fold[BUFSZ];
 	UErrorCode	status;
 	size_t		len = strlen(str);
+	size_t		consumed;
 
 	/* full case mapping doesn't use posix semantics */
 	struct WordBoundaryState wbstate = {
@@ -126,10 +127,10 @@ icu_test_full(char *str)
 		.prev_alnum = false,
 	};
 
-	unicode_strlower(lower, BUFSZ, str, len, true);
-	unicode_strtitle(title, BUFSZ, str, len, true, initcap_wbnext, &wbstate);
-	unicode_strupper(upper, BUFSZ, str, len, true);
-	unicode_strfold(fold, BUFSZ, str, len, true);
+	unicode_strlower(lower, BUFSZ, str, len, &consumed, true);
+	unicode_strtitle(title, BUFSZ, str, len, &consumed, true, initcap_wbnext, &wbstate);
+	unicode_strupper(upper, BUFSZ, str, len, &consumed, true);
+	unicode_strfold(fold, BUFSZ, str, len, &consumed, true);
 	status = U_ZERO_ERROR;
 	ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, len, &status);
 	status = U_ZERO_ERROR;
@@ -260,13 +261,16 @@ static size_t
 tfunc_lower(char *dst, size_t dstsize, const char *src,
 			size_t srclen)
 {
-	return unicode_strlower(dst, dstsize, src, srclen, true);
+	size_t		consumed;
+
+	return unicode_strlower(dst, dstsize, src, srclen, &consumed, true);
 }
 
 static size_t
 tfunc_title(char *dst, size_t dstsize, const char *src,
 			size_t srclen)
 {
+	size_t		consumed;
 	struct WordBoundaryState wbstate = {
 		.str = src,
 		.len = srclen,
@@ -275,27 +279,34 @@ tfunc_title(char *dst, size_t dstsize, const char *src,
 		.prev_alnum = false,
 	};
 
-	return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
-							&wbstate);
+	return unicode_strtitle(dst, dstsize, src, srclen, &consumed, true,
+							initcap_wbnext, &wbstate);
 }
 
 static size_t
 tfunc_upper(char *dst, size_t dstsize, const char *src,
 			size_t srclen)
 {
-	return unicode_strupper(dst, dstsize, src, srclen, true);
+	size_t		consumed;
+
+	return unicode_strupper(dst, dstsize, src, srclen, &consumed, true);
 }
 
 static size_t
 tfunc_fold(char *dst, size_t dstsize, const char *src,
 		   size_t srclen)
 {
-	return unicode_strfold(dst, dstsize, src, srclen, true);
+	size_t		consumed;
+
+	return unicode_strfold(dst, dstsize, src, srclen, &consumed, true);
 }
 
 static void
 test_convert_case(void)
 {
+	size_t		needed;
+	size_t		consumed;
+
 	/* test string with no case changes */
 	test_convert(tfunc_lower, "√∞", "√∞");
 	/* test adjust-to-cased behavior */
@@ -320,6 +331,10 @@ test_convert_case(void)
 	/* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
 	test_convert(tfunc_title, "\uFF11a", "\uFF11a");
 
+	/* invalid UTF8: conversion must stop before the truncated sequence */
+	needed = unicode_strfold(NULL, 0, "abc\xCE", 4, &consumed, false);
+	Assert(consumed == 3);
+	Assert(needed == 3);
 
 #ifdef USE_ICU
 	icu_test_full("");
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 42eb7d22211..79f8c7f14a7 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -40,8 +40,8 @@ static const char32_t *const casekind_map[NCaseKind] =
 
 static char32_t find_case_map(char32_t ucs, const char32_t *map);
 static size_t convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
-						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
-						   void *wbstate);
+						   size_t *pconsumed, CaseKind str_casekind, bool full,
+						   WordBoundaryNext wbnext, void *wbstate);
 static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
 								  const char *src, size_t srclen, size_t srcoff,
 								  char32_t *simple, const char32_t **special);
@@ -82,7 +82,8 @@ unicode_casefold_simple(char32_t code)
  * unicode_strlower()
  *
  * Convert src to lowercase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the number of bytes of src
+ * successfully consumed; a value less than srclen indicates a decoding error.
  *
  * String src must be encoded in UTF-8.
  *
@@ -98,17 +99,18 @@ unicode_casefold_simple(char32_t code)
  */
 size_t
 unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen,
-				 bool full)
+				 size_t *pconsumed, bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
-						NULL);
+	return convert_case(dst, dstsize, src, srclen, pconsumed, CaseLower, full,
+						NULL, NULL);
 }
 
 /*
  * unicode_strtitle()
  *
  * Convert src to titlecase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the number of bytes of src
+ * successfully consumed; a value less than srclen indicates a decoding error.
  *
  * String src must be encoded in UTF-8.
  *
@@ -134,17 +136,19 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen,
  */
 size_t
 unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen,
-				 bool full, WordBoundaryNext wbnext, void *wbstate)
+				 size_t *pconsumed, bool full, WordBoundaryNext wbnext,
+				 void *wbstate)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
-						wbstate);
+	return convert_case(dst, dstsize, src, srclen, pconsumed, CaseTitle, full,
+						wbnext, wbstate);
 }
 
 /*
  * unicode_strupper()
  *
  * Convert src to uppercase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the number of bytes of src
+ * successfully consumed; a value less than srclen indicates a decoding error.
  *
  * String src must be encoded in UTF-8.
  *
@@ -160,17 +164,18 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen,
  */
 size_t
 unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen,
-				 bool full)
+				 size_t *pconsumed, bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
-						NULL);
+	return convert_case(dst, dstsize, src, srclen, pconsumed, CaseUpper, full,
+						NULL, NULL);
 }
 
 /*
  * unicode_strfold()
  *
  * Case fold src, and return the result length (not including terminating
- * NUL).
+ * NUL). Sets *pconsumed to the number of bytes of src successfully consumed;
+ * a value less than srclen indicates a decoding error.
  *
  * String src must be encoded in UTF-8.
  *
@@ -183,10 +188,10 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen,
  */
 size_t
 unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
-				bool full)
+				size_t *pconsumed, bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
-						NULL);
+	return convert_case(dst, dstsize, src, srclen, pconsumed, CaseFold, full,
+						NULL, NULL);
 }
 
 /* local version of pg_utf_mblen() to be inlinable */
@@ -223,8 +228,8 @@ utf8_mblen(const unsigned char *s)
  */
 static size_t
 convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
-			 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
-			 void *wbstate)
+			 size_t *pconsumed, CaseKind str_casekind, bool full,
+			 WordBoundaryNext wbnext, void *wbstate)
 {
 	/* character CaseKind varies while titlecasing */
 	CaseKind	chr_casekind = str_casekind;
@@ -315,6 +320,7 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 	if (result_len < dstsize)
 		dst[result_len] = '\0';
 
+	*pconsumed = srcoff;
 	return result_len;
 }
 
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index 03add78cabe..1cbc0c14bc2 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -21,13 +21,13 @@ char32_t	unicode_titlecase_simple(char32_t code);
 char32_t	unicode_uppercase_simple(char32_t code);
 char32_t	unicode_casefold_simple(char32_t code);
 size_t		unicode_strlower(char *dst, size_t dstsize, const char *src,
-							 size_t srclen, bool full);
+							 size_t srclen, size_t *pconsumed, bool full);
 size_t		unicode_strtitle(char *dst, size_t dstsize, const char *src,
-							 size_t srclen, bool full,
+							 size_t srclen, size_t *pconsumed, bool full,
 							 WordBoundaryNext wbnext, void *wbstate);
 size_t		unicode_strupper(char *dst, size_t dstsize, const char *src,
-							 size_t srclen, bool full);
+							 size_t srclen, size_t *pconsumed, bool full);
 size_t		unicode_strfold(char *dst, size_t dstsize, const char *src,
-							size_t srclen, bool full);
+							size_t srclen, size_t *pconsumed, bool full);
 
 #endif							/* UNICODE_CASE_H */
-- 
2.43.0