From c389e8f3d47c51183702f468846c0ad8c33beaae Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Tue, 23 Jun 2026 17:09:49 -0700
Subject: [PATCH v5 2/5] pg_unicode_fast: fix final sigma logic.

If a sigma is preceded only by Case_Ignorable characters, with no Cased
character before them, don't consider it to be a final sigma.

In the process, refactor so that the preceding and following
characters are found first, and then the rule is applied, to improve
clarity.

Discussion: https://postgr.es/m/c355354e6c3f4a7aafb047361b73db247260fca0.camel@j-davis.com
Backpatch-through: 18
---
 src/common/unicode_case.c                  | 88 ++++++++++------------
 src/test/regress/expected/collate.utf8.out |  6 ++
 src/test/regress/sql/collate.utf8.sql      |  1 +
 3 files changed, 47 insertions(+), 48 deletions(-)

diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 42eb7d22211..dd5b3ba86d0 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -323,75 +323,67 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
  * 3-17. The character at the given offset must be directly preceded by a
  * Cased character, and must not be directly followed by a Cased character.
  *
- * Case_Ignorable characters are ignored. NB: some characters may be both
+ * Case_Ignorable characters are ignored. Neither the beginning nor the end of
+ * the string counts as a Cased character. NB: some characters may be both
  * Cased and Case_Ignorable, in which case they are ignored.
  */
 static bool
 check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 {
-	/* the start of the string is not preceded by a Cased character */
-	if (offset == 0)
-		return false;
+	bool		preceded_by_cased = false;
+	bool		followed_by_cased = false;
+	char32_t	curr;
+	int			ulen;
 
-	/* iterate backwards, looking for Cased character */
-	for (int i = offset - 1; i >= 0; i--)
+	/* scan backwards for the nearest character that is not Case_Ignorable */
+	for (int i = offset; i > 0;)
 	{
-		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
-		{
-			int			u1len = utf8_mblen((const unsigned char *) str + i);
-			char32_t	curr;
+		/* step back one byte; keep going while on a continuation byte */
+		i--;
+		if ((str[i] & 0xC0) == 0x80)
+			continue;
 
-			/* invalid UTF8 */
-			if (u1len < 0 || i + u1len > len)
-				return false;
+		/* now at the leading byte of the previous UTF8 sequence */
+		Assert((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0);
 
-			curr = utf8_to_unicode(str + i);
+		ulen = utf8_mblen((const unsigned char *) str + i);
 
-			if (pg_u_prop_case_ignorable(curr))
-				continue;
-			else if (pg_u_prop_cased(curr))
-				break;
-			else
-				return false;
+		/* invalid UTF8, or sequence runs past the end of str */
+		if (ulen < 0 || i + ulen > len)
+			return false;
+
+		curr = utf8_to_unicode((const unsigned char *) str + i);
+
+		if (!pg_u_prop_case_ignorable(curr))
+		{
+			preceded_by_cased = pg_u_prop_cased(curr);
+			break;
 		}
-		else if ((str[i] & 0xC0) == 0x80)
-			continue;
-		else
-			return false;			/* invalid UTF8 */
 	}
 
-	/* end of string is not followed by a Cased character */
-	if (offset == len)
-		return true;
+	ulen = utf8_mblen((const unsigned char *) str + offset);
 
-	/* iterate forwards, looking for Cased character */
-	for (int i = offset + 1; i < len && str[i] != '\0'; i++)
+	/* scan forwards for the nearest character that is not Case_Ignorable */
+	for (int i = offset + ulen; i < len;)
 	{
-		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
-		{
-			int			u1len = utf8_mblen((const unsigned char *) str + i);
-			char32_t	curr;
+		ulen = utf8_mblen((const unsigned char *) str + i);
 
-			/* invalid UTF8 */
-			if (u1len < 0 || i + u1len > len)
-				return false;
+		/* invalid UTF8, or sequence runs past the end of str */
+		if (ulen < 0 || i + ulen > len)
+			return false;
 
-			curr = utf8_to_unicode(str + i);
+		curr = utf8_to_unicode((const unsigned char *) str + i);
 
-			if (pg_u_prop_case_ignorable(curr))
-				continue;
-			else if (pg_u_prop_cased(curr))
-				return false;
-			else
-				break;
+		if (!pg_u_prop_case_ignorable(curr))
+		{
+			followed_by_cased = pg_u_prop_cased(curr);
+			break;
 		}
-		else if ((str[i] & 0xC0) == 0x80)
-			continue;
-		else
-			return false;			/* invalid UTF8 */
+
+		i += ulen;
 	}
 
-	return true;
+	return (preceded_by_cased && !followed_by_cased);
 }
 
 /*
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index 0c3ab5c89b2..99fdc111fa4 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -263,6 +263,12 @@ SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
  ᾳσͅα
 (1 row)
 
+SELECT lower(U&'\0300\03A3' COLLATE PG_UNICODE_FAST);
+ lower 
+-------
+ ̀σ
+(1 row)
+
 -- properties
 SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
  ?column? 
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index d6d14220ab3..22aecee3a60 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -128,6 +128,7 @@ SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3
 SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391
 SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391
 SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
+SELECT lower(U&'\0300\03A3' COLLATE PG_UNICODE_FAST);
 
 -- properties
 
-- 
2.43.0

