From 32b6d132232d40308c47faad111e75d5c03f6b63 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 19 Jun 2020 11:14:10 +0200 Subject: [PATCH 2/2] Add current substring regular expression syntax SQL99 had syntax SUBSTRING(text FROM pattern FOR escapechar), but this was replaced in SQL:2003 by the clearer SUBSTRING(text SIMILAR pattern ESCAPE escapechar); the newer form was never implemented in PostgreSQL. This patch adds that new syntax as an alternative in the parser, and updates documentation and tests to indicate that this is the preferred alternative now. --- doc/src/sgml/func.sgml | 20 ++++++++++++----- src/backend/parser/gram.y | 26 +++++++++++++++++++++- src/test/regress/expected/strings.out | 31 ++++++++++++++++----------- src/test/regress/sql/strings.sql | 26 +++++++++++----------- 4 files changed, 73 insertions(+), 30 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 9d71678029..eedf189546 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -2669,15 +2669,21 @@ <acronym>SQL</acronym> String Functions and Operators + substring ( string text SIMILAR pattern text ESCAPE escape text ) + text + + substring ( string text FROM pattern text FOR escape text ) text Extracts substring matching SQL regular expression; - see . + see . The first form has been + specified since SQL:2003; the second form was only in SQL:1999 and + should be considered obsolete. - substring('Thomas' from '%#"o_a#"_' for '#') + substring('Thomas' similar '%#"o_a#"_' escape '#') oma @@ -5160,7 +5166,11 @@ <function>SIMILAR TO</function> Regular Expressions The substring function with three parameters provides extraction of a substring that matches an SQL regular expression pattern. 
The function can be written according - to SQL99 syntax: + to standard SQL syntax: + +substring(string similar pattern escape escape-character) + + or using the now obsolete SQL99 syntax: substring(string from pattern for escape-character) @@ -5201,8 +5211,8 @@ <function>SIMILAR TO</function> Regular Expressions Some examples, with #" delimiting the return string: -substring('foobar' from '%#"o_b#"%' for '#') oob -substring('foobar' from '#"o_b#"%' for '#') NULL +substring('foobar' similar '%#"o_b#"%' escape '#') oob +substring('foobar' similar '#"o_b#"%' escape '#') NULL diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 1a843049f0..5f21039b11 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -14451,7 +14451,27 @@ position_list: | /*EMPTY*/ { $$ = NIL; } ; -/* SUBSTRING() arguments */ +/* + * SUBSTRING() arguments + * + * Note that SQL99 has both + * + * text FROM int FOR int + * + * and + * + * text FROM pattern FOR escape + * + * In the parser we map them both to a call to the substring() function and + * rely on type resolution to pick the right one. + * + * In SQL:2003, the second variant was changed to + * + * text SIMILAR pattern ESCAPE escape + * + * We could in theory map that to a different function internally, but + * since we still support the SQL99 version, we don't. + */ substr_list: a_expr FROM a_expr FOR a_expr { @@ -14483,6 +14503,10 @@ substr_list: makeTypeCast($3, SystemTypeName("int4"), -1)); } + | a_expr SIMILAR a_expr ESCAPE a_expr + { + $$ = list_make3($1, $3, $5); + } /* * We also want to support generic substring functions that * accept the usual generic list of arguments. 
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 6e98d183f6..8c034c9599 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -397,6 +397,13 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; (1 row) -- T581 regular expression substring (with SQL's bizarre regexp syntax) +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"(b_d)#"%' ESCAPE '#') AS "bcd"; + bcd +----- + bcd +(1 row) + +-- obsolete SQL99 syntax SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; bcd ----- @@ -404,75 +411,75 @@ SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; (1 row) -- No match should return NULL -SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' SIMILAR '#"(b_d)#"%' ESCAPE '#') IS NULL AS "True"; True ------ t (1 row) -- Null inputs should return NULL -SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' SIMILAR '%' ESCAPE NULL) IS NULL AS "True"; True ------ t (1 row) -SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING(NULL SIMILAR '%' ESCAPE '#') IS NULL AS "True"; True ------ t (1 row) -SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' SIMILAR NULL ESCAPE '#') IS NULL AS "True"; True ------ t (1 row) -- The first and last parts should act non-greedy -SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"g' ESCAPE '#') AS "bcdef"; bcdef ------- bcdef (1 row) -SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*' ESCAPE '#') AS "abcdefg"; abcdefg --------- abcdefg (1 row) -- Vertical bar in any part affects only that part -SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a|b#"%#"g' ESCAPE '#') AS "bcdef"; bcdef ------- bcdef (1 row) -SELECT 
SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"x|g' ESCAPE '#') AS "bcdef"; bcdef ------- bcdef (1 row) -SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%|ab#"g' ESCAPE '#') AS "bcdef"; bcdef ------- bcdef (1 row) -- Can't have more than two part separators -SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*#"x' ESCAPE '#') AS "error"; ERROR: SQL regular expression may not contain more than two escape-double-quote separators CONTEXT: SQL function "substring" statement 1 -- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty -SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%g' ESCAPE '#') AS "bcdefg"; bcdefg -------- bcdefg (1 row) -SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a%g' ESCAPE '#') AS "abcdefg"; abcdefg --------- abcdefg diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 3e89159a4f..14901a2692 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -132,31 +132,33 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; -- T581 regular expression substring (with SQL's bizarre regexp syntax) +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"(b_d)#"%' ESCAPE '#') AS "bcd"; +-- obsolete SQL99 syntax SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; -- No match should return NULL -SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' SIMILAR '#"(b_d)#"%' ESCAPE '#') IS NULL AS "True"; -- Null inputs should return NULL -SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True"; -SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True"; -SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' 
SIMILAR '%' ESCAPE NULL) IS NULL AS "True"; +SELECT SUBSTRING(NULL SIMILAR '%' ESCAPE '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' SIMILAR NULL ESCAPE '#') IS NULL AS "True"; -- The first and last parts should act non-greedy -SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef"; -SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"g' ESCAPE '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*' ESCAPE '#') AS "abcdefg"; -- Vertical bar in any part affects only that part -SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef"; -SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef"; -SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a|b#"%#"g' ESCAPE '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"x|g' ESCAPE '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%|ab#"g' ESCAPE '#') AS "bcdef"; -- Can't have more than two part separators -SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*#"x' ESCAPE '#') AS "error"; -- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty -SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg"; -SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%g' ESCAPE '#') AS "bcdefg"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a%g' ESCAPE '#') AS "abcdefg"; -- substring() with just two arguments is not allowed by SQL spec; -- we accept it, but we interpret the pattern as a POSIX regexp not SQL -- 2.27.0