From a8cf529a620201b6cb0bff179f10a008ace8a904 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Tue, 27 May 2025 14:41:21 +0900
Subject: [PATCH v3] Fix SIMILAR TO regex translation for character classes

The code that translates SIMILAR TO pattern matching expressions
to regular expressions didn't consider that brackets can be nested,
as in [[:alpha:]%], and replaced placeholders like _ and % where
it shouldn't.

Fix by tracking the nesting level of brackets while considering that
in expressions like []] and [^]] the first closing bracket is a
regular character.

Author: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at
Backpatch-through: 13
---
 src/backend/utils/adt/regexp.c        | 38 +++++++++--
 src/test/regress/expected/strings.out | 91 +++++++++++++++++++++++++++
 src/test/regress/sql/strings.sql      | 20 ++++++
 3 files changed, 143 insertions(+), 6 deletions(-)

diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index edee1f7880bd..6e2864cbbda8 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -773,8 +773,11 @@ similar_escape_internal(text *pat_text, text *esc_text)
 	int			plen,
 				elen;
 	bool		afterescape = false;
-	bool		incharclass = false;
 	int			nquotes = 0;
+	int			charclass_depth = 0;	/* Nesting level of character classes,
+										 * encompassed by square brackets */
+	int			charclass_start = 0;	/* 1 just after '[', 2 after a leading
+										 * caret, >2 once past class start */
 
 	p = VARDATA_ANY(pat_text);
 	plen = VARSIZE_ANY_EXHDR(pat_text);
@@ -904,7 +907,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
 		/* fast path */
 		if (afterescape)
 		{
-			if (pchar == '"' && !incharclass)	/* escape-double-quote? */
+			if (pchar == '"' && charclass_depth < 1)	/* escape-double-quote? */
 			{
 				/* emit appropriate part separator, per notes above */
 				if (nquotes == 0)
@@ -953,18 +956,41 @@ similar_escape_internal(text *pat_text, text *esc_text)
 			/* SQL escape character; do not send to output */
 			afterescape = true;
 		}
-		else if (incharclass)
+		else if (charclass_depth > 0)
 		{
 			if (pchar == '\\')
 				*r++ = '\\';
 			*r++ = pchar;
-			if (pchar == ']')
-				incharclass = false;
+
+			/*
+			 * Ignore a closing bracket at the start of a character class.
+			 * Such a bracket is taken literally rather than closing the
+			 * class.  "charclass_start" is 1 right at the beginning of a
+			 * class and 2 after an initial caret.
+			 */
+			if (pchar == ']' && charclass_start > 2)
+				charclass_depth--;
+			else if (pchar == '[')
+				charclass_depth++;
+
+			/*
+			 * If there is a caret right after the opening bracket, it negates
+			 * the character class, but a following closing bracket should
+			 * still be treated as a normal character.  That holds only for
+			 * the first caret, so only the values 1 and 2 mean that closing
+			 * brackets should be taken literally.
+			 */
+			if (pchar == '^')
+				charclass_start++;
+			else
+				charclass_start = 3;	/* definitely past the start */
 		}
 		else if (pchar == '[')
 		{
+			/* start of a character class */
 			*r++ = pchar;
-			incharclass = true;
+			charclass_depth++;
+			charclass_start = 1;
 		}
 		else if (pchar == '%')
 		{
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 174f0a68331b..601bc7d2ec25 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -614,6 +614,97 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
 SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
 ERROR:  invalid escape string
 HINT:  Escape string must be empty or one character.
+-- Characters that should be left alone in character classes when a
+-- SIMILAR TO regexp pattern is converted to POSIX style.
+-- Underscore "_"
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '_[_[:alpha:]_]_';
+                          QUERY PLAN                           
+---------------------------------------------------------------
+ Result
+   Output: ((InitPlan 1).col1 ~ '^(?:.[_[:alpha:]_].)$'::text)
+   InitPlan 1
+     ->  Result
+           Output: ''::text
+(5 rows)
+
+-- Percentage "%"
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '%[%[:alnum:]%]%';
+                           QUERY PLAN                            
+-----------------------------------------------------------------
+ Result
+   Output: ((InitPlan 1).col1 ~ '^(?:.*[%[:alnum:]%].*)$'::text)
+   InitPlan 1
+     ->  Result
+           Output: ''::text
+(5 rows)
+
+-- Dot "."
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '.[.[:alnum:].].';
+                           QUERY PLAN                            
+-----------------------------------------------------------------
+ Result
+   Output: ((InitPlan 1).col1 ~ '^(?:\.[.[:alnum:].]\.)$'::text)
+   InitPlan 1
+     ->  Result
+           Output: ''::text
+(5 rows)
+
+-- Dollar "$"
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '$[$[:alnum:]$]$';
+                           QUERY PLAN                            
+-----------------------------------------------------------------
+ Result
+   Output: ((InitPlan 1).col1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text)
+   InitPlan 1
+     ->  Result
+           Output: ''::text
+(5 rows)
+
+-- Opening parenthesis "("
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '([([:alnum:](](';
+                            QUERY PLAN                             
+-------------------------------------------------------------------
+ Result
+   Output: ((InitPlan 1).col1 ~ '^(?:(?:[([:alnum:](](?:)$'::text)
+   InitPlan 1
+     ->  Result
+           Output: ''::text
+(5 rows)
+
+-- Caret "^"
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
+                                      QUERY PLAN                                       
+---------------------------------------------------------------------------------------
+ Result
+   Output: ((InitPlan 1).col1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text)
+   InitPlan 1
+     ->  Result
+           Output: ''::text
+(5 rows)
+
+-- Closing square bracket "]" at the beginning of character class
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '[]%][^]%][^%]%';
+                          QUERY PLAN                           
+---------------------------------------------------------------
+ Result
+   Output: ((InitPlan 1).col1 ~ '^(?:[]%][^]%][^%].*)$'::text)
+   InitPlan 1
+     ->  Result
+           Output: ''::text
+(5 rows)
+
+-- Closing square bracket effective after two carets at the beginning
+-- of character class.
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '[^^]^';
+                      QUERY PLAN                      
+------------------------------------------------------
+ Result
+   Output: ((InitPlan 1).col1 ~ '^(?:[^^]\^)$'::text)
+   InitPlan 1
+     ->  Result
+           Output: ''::text
+(5 rows)
+
 -- Test backslash escapes in regexp_replace's replacement string
 SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
  regexp_replace 
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index f7b325baadf4..34c52b23604b 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -197,6 +197,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
 SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
 SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
 
+-- Characters that should be left alone in character classes when a
+-- SIMILAR TO regexp pattern is converted to POSIX style.
+-- Underscore "_"
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '_[_[:alpha:]_]_';
+-- Percentage "%"
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '%[%[:alnum:]%]%';
+-- Dot "."
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '.[.[:alnum:].].';
+-- Dollar "$"
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '$[$[:alnum:]$]$';
+-- Opening parenthesis "("
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '([([:alnum:](](';
+-- Caret "^"
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
+-- Closing square bracket "]" at the beginning of character class
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '[]%][^]%][^%]%';
+-- Closing square bracket effective after two carets at the beginning
+-- of character class.
+EXPLAIN (VERBOSE, COSTS OFF) SELECT (SELECT '') SIMILAR TO '[^^]^';
+
 -- Test backslash escapes in regexp_replace's replacement string
 SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
 SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');
-- 
2.49.0

