From 25562a48cfbd93c25f23e2734d27307d6f5459d6 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Fri, 19 Jun 2026 14:58:47 -0700
Subject: [PATCH v5 5/5] unicode_case.c: use new utf8encode/utf8decode APIs.

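Replace the two-step pattern of looking up a sequence length
(pg_utf_mblen() or the local utf8_mblen()) and then calling
utf8_to_unicode() with a single utf8decode() call that is passed the
remaining input length, and replace unicode_to_utf8() with utf8encode().
The now-unused local utf8_mblen() is removed.

Extend case_test.c to cover surrogates, a continuation byte with no
leading byte, out-of-range sequences, and overlong encodings.

Discussion: https://postgr.es/m/c355354e6c3f4a7aafb047361b73db247260fca0.camel@j-davis.com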
---
 src/backend/utils/adt/pg_locale_builtin.c | 10 +--
 src/common/unicode/case_test.c            | 45 +++++++++----
 src/common/unicode_case.c                 | 77 +++++++++++------------
 3 files changed, 77 insertions(+), 55 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 5619daf43c3..63516e7174f 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -62,22 +62,22 @@ initcap_wbnext(void *state)
 
 	while (wbstate->offset < wbstate->len)
 	{
-		int			ulen = pg_utf_mblen((const unsigned char *) wbstate->str +
-										wbstate->offset);
+		int			ulen;
 		char32_t	u;
 		bool		curr_alnum;
 		size_t		prev_offset = wbstate->offset;
 
+		ulen = utf8decode(&u, (const unsigned char *) wbstate->str + wbstate->offset,
+						  wbstate->len - wbstate->offset);
+
 		/* invalid UTF8 */
-		if (wbstate->offset + ulen > wbstate->len)
+		if (ulen <= 0)
 		{
 			wbstate->init = true;
 			wbstate->offset = wbstate->len;
 			return prev_offset;
 		}
 
-		u = utf8_to_unicode((const unsigned char *) wbstate->str +
-							wbstate->offset);
 		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index 08421d9e5ca..9461b56742b 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -52,24 +52,35 @@ initcap_wbnext(void *state)
 {
 	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
 
-	while (wbstate->offset < wbstate->len &&
-		   wbstate->str[wbstate->offset] != '\0')
+	while (wbstate->offset < wbstate->len)
 	{
-		char32_t	u = utf8_to_unicode((const unsigned char *) wbstate->str +
-										wbstate->offset);
-		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
+		int			ulen;
+		char32_t	u;
+		bool		curr_alnum;
+		size_t		prev_offset = wbstate->offset;
 
-		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+		ulen = utf8decode(&u, (const unsigned char *) wbstate->str + wbstate->offset,
+						  wbstate->len - wbstate->offset);
+
+		/* invalid UTF8 */
+		if (ulen <= 0)
 		{
-			size_t		prev_offset = wbstate->offset;
+			wbstate->init = true;
+			wbstate->offset = wbstate->len;
+			return prev_offset;
+		}
+
+		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
+		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+		{
 			wbstate->init = true;
-			wbstate->offset += unicode_utf8len(u);
+			wbstate->offset += ulen;
 			wbstate->prev_alnum = curr_alnum;
 			return prev_offset;
 		}
 
-		wbstate->offset += unicode_utf8len(u);
+		wbstate->offset += ulen;
 	}
 
 	return wbstate->len;
@@ -179,7 +190,7 @@ test_icu(void)
 	{
 		pg_unicode_category category = unicode_category(code);
 
-		if (category != PG_U_UNASSIGNED)
+		if (category != PG_U_UNASSIGNED && category != PG_U_SURROGATE)
 		{
 			uint8_t		icu_category = u_charType(code);
 			char		code_str[5] = {0};
@@ -191,7 +202,7 @@ test_icu(void)
 			}
 
 			icu_test_simple(code);
-			unicode_to_utf8(code, (unsigned char *) code_str);
+			utf8encode((unsigned char *) code_str, 5, code);
 			icu_test_full(code_str);
 
 			successful++;
@@ -337,6 +348,18 @@ test_convert_case(void)
 	/* invalid UTF8: leading byte invalid length */
 	needed = unicode_strfold(NULL, 0, "abc\xF8xyz", 7, &consumed, false);
 	Assert(needed == 3 && consumed == 3);
+	/* invalid UTF8: surrogates */
+	needed = unicode_strfold(NULL, 0, "abc\xED\xA0\x81xyz", 7, &consumed, false);
+	Assert(needed == 3 && consumed == 3);
+	/* invalid UTF8: continuation with no leading byte */
+	needed = unicode_strfold(NULL, 0, "abc\x80xyz", 7, &consumed, false);
+	Assert(needed == 3 && consumed == 3);
+	/* invalid UTF8: out of range */
+	needed = unicode_strfold(NULL, 0, "abc\xF5\x80\x80\x80xyz", 7, &consumed, false);
+	Assert(needed == 3 && consumed == 3);
+	/* invalid UTF8: overlong */
+	needed = unicode_strfold(NULL, 0, "abc\xC1\xBFxyz", 7, &consumed, false);
+	Assert(needed == 3 && consumed == 3);
 
 #ifdef USE_ICU
 	icu_test_full("");
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 24753aaab09..4d8ee71e8dc 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -194,22 +194,6 @@ unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
 						NULL, NULL);
 }
 
-/* local version of pg_utf_mblen() to be inlinable */
-static int
-utf8_mblen(const unsigned char *s)
-{
-	if ((*s & 0x80) == 0)
-		return 1;
-	else if ((*s & 0xe0) == 0xc0)
-		return 2;
-	else if ((*s & 0xf0) == 0xe0)
-		return 3;
-	else if ((*s & 0xf8) == 0xf0)
-		return 4;
-	else
-		return -1;
-}
-
 /*
  * Implement Unicode Default Case Conversion algorithm.
  *
@@ -248,18 +232,19 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 
 	while (srcoff < srclen)
 	{
-		int			u1len = utf8_mblen((const unsigned char *) src + srcoff);
 		char32_t	u1;
+		int			u1len;
 		char32_t	simple = 0;
 		const char32_t *special = NULL;
 		enum CaseMapResult casemap_result;
 
+		u1len = utf8decode(&u1, (const unsigned char *) src + srcoff,
+						   srclen - srcoff);
+
 		/* invalid UTF8 */
-		if (u1len < 0 || srcoff + u1len > srclen)
+		if (u1len <= 0)
 			break;
 
-		u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
-
 		if (str_casekind == CaseTitle)
 		{
 			if (srcoff == boundary)
@@ -280,6 +265,7 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 				/* no mapping; copy bytes from src */
 				Assert(simple == 0);
 				Assert(special == NULL);
+
 				if (result_len + u1len <= dstsize)
 					memcpy(dst + result_len, src + srcoff, u1len);
 
@@ -289,11 +275,19 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 				{
 					/* replace with single character */
 					char32_t	u2 = simple;
-					char32_t	u2len = unicode_utf8len(u2);
+					int			u2len;
+					size_t		remaining = 0;
+					unsigned char *p = NULL;
+
+					if (dstsize > result_len)
+					{
+						remaining = dstsize - result_len;
+						p = (unsigned char *) dst + result_len;
+					}
 
 					Assert(special == NULL);
-					if (result_len + u2len <= dstsize)
-						unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+					u2len = utf8encode(p, remaining, u2);
+					Assert(u2len > 0);
 
 					result_len += u2len;
 				}
@@ -304,10 +298,18 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 				for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
 				{
 					char32_t	u2 = special[i];
-					size_t		u2len = unicode_utf8len(u2);
+					int			u2len;
+					size_t		remaining = 0;
+					unsigned char *p = NULL;
+
+					if (dstsize > result_len)
+					{
+						remaining = dstsize - result_len;
+						p = (unsigned char *) dst + result_len;
+					}
 
-					if (result_len + u2len <= dstsize)
-						unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+					u2len = utf8encode(p, remaining, u2);
+					Assert(u2len > 0);
 
 					result_len += u2len;
 				}
@@ -352,13 +354,10 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 		/* now at leading byte of previous sequence */
 		Assert((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0);
 
-		ulen = utf8_mblen((const unsigned char *) str + i);
-
-		/* invalid UTF8 */
-		if (ulen < 0 || i + ulen > len)
-			return false;
+		ulen = utf8decode(&curr, (const unsigned char *) str + i, len - i);
 
-		curr = utf8_to_unicode((const unsigned char *) str + i);
+		if (ulen <= 0)
+			return false;		/* invalid UTF8 */
 
 		if (!pg_u_prop_case_ignorable(curr))
 		{
@@ -367,18 +366,18 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 		}
 	}
 
-	ulen = utf8_mblen((const unsigned char *) str + offset);
+	ulen = utf8decode(&curr, (const unsigned char *) str + offset,
+					  len - offset);
+	if (ulen <= 0)
+		return false;			/* invalid UTF8 */
 
 	/* iterate forward looking for following character */
 	for (int i = offset + ulen; i < len;)
 	{
-		ulen = utf8_mblen((const unsigned char *) str + i);
-
-		/* invalid UTF8 */
-		if (ulen < 0 || i + ulen > len)
-			return false;
+		ulen = utf8decode(&curr, (const unsigned char *) str + i, len - i);
 
-		curr = utf8_to_unicode((const unsigned char *) str + i);
+		if (ulen <= 0)
+			return false;		/* invalid UTF8 */
 
 		if (!pg_u_prop_case_ignorable(curr))
 		{
-- 
2.43.0

