From 75079fd0f30c37e2a0d0b22baf78441e0d0a7221 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sat, 4 Jul 2026 13:45:59 -0400
Subject: [PATCH v1] Fix mishandling of leading '\' in nondeterministic LIKE.

The loop in MatchText() processed a leading '\' without regard to
nondeterministic locales, which is problematic if what the '\'
precedes is an ordinary character that should be subject to
nondeterministic matching.  We'd insist on a literal match for it,
which is wrong and is inconsistent with how a '\' that follows some
ordinary characters is handled.  For example, 'AB' LIKE '\ab'
returned false under a case-insensitive collation, although
'AB' LIKE 'a\b' returned true.  Worse, we'd then advance the text
and pattern pointers by one byte, so that if the escaped character
is multibyte, the next loop iteration would take the nondeterministic
code path starting at a point within the character.  That could well
cause pg_strncoll() to misbehave.

The fix is quite simple: move the stanza that handles '\' down past
the one that handles nondeterminism.  The stanzas for '%' and '_'
are fine where they are, but the '\' stanza is only correct for
deterministic matching.  The logic for nondeterministic cases is
already prepared to do the right things with a '\'.

While here, I replaced tests of "locale && !locale->deterministic"
with a boolean local variable, reasoning that those are in the hot
loop paths so saving a branch and indirect fetch is worth the
trouble.  I also improved a number of related comments.
---
 src/backend/utils/adt/like_match.c            | 76 +++++++++++--------
 .../regress/expected/collate.icu.utf8.out     | 30 ++++++++
 src/test/regress/sql/collate.icu.utf8.sql     |  6 ++
 3 files changed, 80 insertions(+), 32 deletions(-)

diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c
index f5f72b82e21..21f3421d9ce 100644
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -83,6 +83,8 @@
 static int
 MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 {
+	bool		nondeterministic = (locale && !locale->deterministic);
+
 	/* Fast path for match-everything pattern */
 	if (plen == 1 && *p == '%')
 		return LIKE_TRUE;
@@ -96,23 +98,16 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 	 * occasions it is safe to advance by byte, as the text and pattern will
 	 * be in lockstep. This allows us to perform all comparisons between the
 	 * text and pattern on a byte by byte basis, even for multi-byte
-	 * encodings.
+	 * encodings.  (But that doesn't work in a nondeterministic locale, so the
+	 * nondeterministic case below has to advance the text by chars.)
 	 */
 	while (tlen > 0 && plen > 0)
 	{
-		if (*p == '\\')
-		{
-			/* Next pattern byte must match literally, whatever it is */
-			NextByte(p, plen);
-			/* ... and there had better be one, per SQL standard */
-			if (plen <= 0)
-				ereport(ERROR,
-						(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
-						 errmsg("LIKE pattern must not end with escape character")));
-			if (GETCHAR(*p) != GETCHAR(*t))
-				return LIKE_FALSE;
-		}
-		else if (*p == '%')
+		/*
+		 * At the top of this loop, we are not positioned immediately after an
+		 * escape, so we may take wildcards at face value.
+		 */
+		if (*p == '%')
 		{
 			char		firstpat;
 
@@ -161,9 +156,9 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 			 * the first pattern byte to each text byte to avoid recursing
 			 * more than we have to.  This fact also guarantees that we don't
 			 * have to consider a match to the zero-length substring at the
-			 * end of the text.  With a nondeterministic collation, we can't
-			 * rely on the first bytes being equal, so we have to recurse in
-			 * any case.
+			 * end of the text.  But with a nondeterministic collation, we
+			 * can't rely on the first byte of a match being equal, so we have
+			 * to recurse in any case.
 			 */
 			if (*p == '\\')
 			{
@@ -178,7 +173,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 
 			while (tlen > 0)
 			{
-				if (GETCHAR(*t) == firstpat || (locale && !locale->deterministic))
+				if (GETCHAR(*t) == firstpat || nondeterministic)
 				{
 					int			matched = MatchText(t, tlen, p, plen, locale);
 
@@ -202,7 +197,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 			NextByte(p, plen);
 			continue;
 		}
-		else if (locale && !locale->deterministic)
+		else if (nondeterministic)
 		{
 			/*
 			 * For nondeterministic locales, we find the next substring of the
@@ -222,9 +217,9 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 			char	   *buf = NULL;
 
 			/*
-			 * Determine next substring of pattern without wildcards.  p is
-			 * the start of the subpattern, p1 is one past the last byte. Also
-			 * track if we found an escape character.
+			 * Determine length of substring of pattern without wildcards.  p
+			 * is the start of the subpattern, p1 will advance to one past its
+			 * last byte.  Also track if we found an escape character.
 			 */
 			p1 = p;
 			p1len = plen;
@@ -242,12 +237,15 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 				}
 				else if (*p1 == '_' || *p1 == '%')
 					break;
+				/* Advance over regular or escaped character */
 				NextByte(p1, p1len);
 			}
 
 			/*
-			 * If we found an escape character, then make an unescaped copy of
-			 * the subpattern.
+			 * If we found an escape character, then make a de-escaped copy of
+			 * the subpattern that we can use to match literally.  Otherwise
+			 * we can use the subpattern in-place.  (buf holds the de-escaped
+			 * copy; be sure to pfree it before returning.)
 			 */
 			if (found_escape)
 			{
@@ -290,9 +288,10 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 			}
 
 			/*
-			 * Now build a substring of the text and try to match it against
-			 * the subpattern.  t is the start of the text, t1 is one past the
-			 * last byte.  We start with a zero-length string.
+			 * Consider each successively-longer substring of the remaining
+			 * text and try to match it against the subpattern.  t is the
+			 * start of the substring, t1 is one past its last byte.  We start
+			 * with a zero-length substring.
 			 */
 			t1 = t;
 			t1len = tlen;
@@ -300,16 +299,16 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 			{
 				int			cmp;
 
+				/* This could be slow, so allow interrupts */
 				CHECK_FOR_INTERRUPTS();
 
 				cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale);
 
 				/*
 				 * If we found a match, we have to test if the rest of pattern
-				 * can match against the rest of the string.  Otherwise we
-				 * have to continue here try matching with a longer substring.
-				 * (This is similar to the recursion for the '%' wildcard
-				 * above.)
+				 * can match against the rest of the text.  If not, we have to
+				 * continue and try the next longer substring.  (This is
+				 * similar to the recursion for the '%' wildcard above.)
 				 *
 				 * Note that we can't just wind forward p and t and continue
 				 * with the main loop.  This would fail for example with
@@ -344,7 +343,20 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 				}
 				else
 					NextChar(t1, t1len);
-			}
+			}					/* end loop over substrings starting at t */
+		}
+		/* the rest of this loop considers only deterministic cases */
+		else if (*p == '\\')
+		{
+			/* Next pattern byte must match literally, whatever it is */
+			NextByte(p, plen);
+			/* ... and there had better be one, per SQL standard */
+			if (plen <= 0)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+						 errmsg("LIKE pattern must not end with escape character")));
+			if (GETCHAR(*p) != GETCHAR(*t))
+				return LIKE_FALSE;
 		}
 		else if (GETCHAR(*p) != GETCHAR(*t))
 		{
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index 04e2f6df037..55578958504 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1481,6 +1481,36 @@ SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_inse
  t        | t
 (1 row)
 
+SELECT 'AB' LIKE 'ab' COLLATE case_insensitive AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT 'AB' LIKE 'a\b' COLLATE case_insensitive AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT 'AB' LIKE '\ab' COLLATE case_insensitive AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT 'AB' LIKE '\a%' COLLATE case_insensitive AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT 'AB' LIKE '\a\%' COLLATE case_insensitive AS f;
+ f 
+---
+ f
+(1 row)
+
 -- tests with array_sort
 SELECT array_sort('{a,B}'::text[] COLLATE case_insensitive);
  array_sort 
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 18c47e6e05a..f00605cb8a8 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -568,6 +568,12 @@ CREATE COLLATION case_insensitive (provider = icu, locale = '@colStrength=second
 SELECT 'abc' <= 'ABC' COLLATE case_sensitive, 'abc' >= 'ABC' COLLATE case_sensitive;
 SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_insensitive;
 
+SELECT 'AB' LIKE 'ab' COLLATE case_insensitive AS t;
+SELECT 'AB' LIKE 'a\b' COLLATE case_insensitive AS t;
+SELECT 'AB' LIKE '\ab' COLLATE case_insensitive AS t;
+SELECT 'AB' LIKE '\a%' COLLATE case_insensitive AS t;
+SELECT 'AB' LIKE '\a\%' COLLATE case_insensitive AS f;
+
 -- tests with array_sort
 SELECT array_sort('{a,B}'::text[] COLLATE case_insensitive);
 SELECT array_sort('{a,B}'::text[] COLLATE "C");
-- 
2.52.0

