From 50aa82f006d9f22a724c94ebdedcc711fbb600af Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sat, 27 Jun 2020 11:05:10 +0200 Subject: [PATCH v2 2/2] Add current substring regular expression syntax SQL:1999 had syntax SUBSTRING(text FROM pattern FOR escapechar) but this was replaced in SQL:2003 by the more clear SUBSTRING(text SIMILAR pattern ESCAPE escapechar) but this was never implemented in PostgreSQL. This patch adds that new syntax as an alternative in the parser, and updates documentation and tests to indicate that this is the preferred alternative now. Discussion: https://www.postgresql.org/message-id/flat/a15db31c-d0f8-8ce0-9039-578a31758adb%402ndquadrant.com --- contrib/citext/expected/citext.out | 2 +- contrib/citext/expected/citext_1.out | 2 +- contrib/citext/sql/citext.sql | 2 +- doc/src/sgml/func.sgml | 20 ++++++++++---- src/backend/catalog/information_schema.sql | 2 +- src/backend/parser/gram.y | 26 +++++++++++++++++- src/test/regress/expected/strings.out | 31 +++++++++++++--------- src/test/regress/sql/strings.sql | 26 +++++++++--------- 8 files changed, 77 insertions(+), 34 deletions(-) diff --git a/contrib/citext/expected/citext.out b/contrib/citext/expected/citext.out index 96800be9c0..ec99aaed5d 100644 --- a/contrib/citext/expected/citext.out +++ b/contrib/citext/expected/citext.out @@ -1602,7 +1602,7 @@ SELECT substring('Thomas'::citext from '...$') = 'mas' AS t; t (1 row) -SELECT substring('Thomas'::citext from '%#"o_a#"_' for '#') = 'oma' AS t; +SELECT substring('Thomas'::citext similar '%#"o_a#"_' escape '#') = 'oma' AS t; t --- t diff --git a/contrib/citext/expected/citext_1.out b/contrib/citext/expected/citext_1.out index 33e3676d3c..75fd08b7cc 100644 --- a/contrib/citext/expected/citext_1.out +++ b/contrib/citext/expected/citext_1.out @@ -1602,7 +1602,7 @@ SELECT substring('Thomas'::citext from '...$') = 'mas' AS t; t (1 row) -SELECT substring('Thomas'::citext from '%#"o_a#"_' for '#') = 'oma' AS t; +SELECT substring('Thomas'::citext similar '%#"o_a#"_' escape '#') = 'oma' AS t; t --- t diff --git a/contrib/citext/sql/citext.sql b/contrib/citext/sql/citext.sql index 261b73cfa6..10232f5a9f 100644 --- a/contrib/citext/sql/citext.sql +++ b/contrib/citext/sql/citext.sql @@ -564,7 +564,7 @@ CREATE TABLE caster ( SELECT substring('Thomas'::citext from 2 for 3) = 'hom' AS t; SELECT substring('Thomas'::citext from 2) = 'homas' AS t; SELECT substring('Thomas'::citext from '...$') = 'mas' AS t; -SELECT substring('Thomas'::citext from '%#"o_a#"_' for '#') = 'oma' AS t; +SELECT substring('Thomas'::citext similar '%#"o_a#"_' escape '#') = 'oma' AS t; SELECT trim(' trim '::citext) = 'trim' AS t; SELECT trim('xxxxxtrimxxxx'::citext, 'x'::citext) = 'trim' AS t; diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 7119f0b2ca..af4179b311 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -2669,15 +2669,21 @@ <acronym>SQL</acronym> String Functions and Operators + substring ( string text SIMILAR pattern text ESCAPE escape text ) + text + + substring ( string text FROM pattern text FOR escape text ) text Extracts substring matching SQL regular expression; - see . + see . The first form has + specified since SQL:2003; the second form was only in SQL:1999 and + should be considered obsolete. - substring('Thomas' from '%#"o_a#"_' for '#') + substring('Thomas' similar '%#"o_a#"_' escape '#') oma @@ -5160,7 +5166,11 @@ <function>SIMILAR TO</function> Regular Expressions The substring function with three parameters provides extraction of a substring that matches an SQL regular expression pattern. The function can be written according - to SQL99 syntax: + to standard SQL syntax: + +substring(string similar pattern escape escape-character) + + or using the now obsolete SQL:1999 syntax: substring(string from pattern for escape-character) @@ -5201,8 +5211,8 @@ <function>SIMILAR TO</function> Regular Expressions Some examples, with #" delimiting the return string: -substring('foobar' from '%#"o_b#"%' for '#') oob -substring('foobar' from '#"o_b#"%' for '#') NULL +substring('foobar' similar '%#"o_b#"%' escape '#') oob +substring('foobar' similar '#"o_b#"%' escape '#') NULL diff --git a/src/backend/catalog/information_schema.sql b/src/backend/catalog/information_schema.sql index 3e07fb107e..5ab47e7743 100644 --- a/src/backend/catalog/information_schema.sql +++ b/src/backend/catalog/information_schema.sql @@ -182,7 +182,7 @@ CREATE FUNCTION _pg_interval_type(typid oid, mod int4) RETURNS text AS $$SELECT CASE WHEN $1 IN (1186) /* interval */ - THEN pg_catalog.upper(substring(pg_catalog.format_type($1, $2) from 'interval[()0-9]* #"%#"' for '#')) + THEN pg_catalog.upper(substring(pg_catalog.format_type($1, $2) similar 'interval[()0-9]* #"%#"' escape '#')) ELSE null END$$; diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 1a843049f0..4ff35095b8 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -14451,7 +14451,27 @@ position_list: | /*EMPTY*/ { $$ = NIL; } ; -/* SUBSTRING() arguments */ +/* + * SUBSTRING() arguments + * + * Note that SQL:1999 has both + * + * text FROM int FOR int + * + * and + * + * text FROM pattern FOR escape + * + * In the parser we map them both to a call to the substring() function and + * rely on type resolution to pick the right one. + * + * In SQL:2003, the second variant was changed to + * + * text SIMILAR pattern ESCAPE escape + * + * We could in theory map that to a different function internally, but + * since we still support the SQL:1999 version, we don't. + */ substr_list: a_expr FROM a_expr FOR a_expr { @@ -14483,6 +14503,10 @@ substr_list: makeTypeCast($3, SystemTypeName("int4"), -1)); } + | a_expr SIMILAR a_expr ESCAPE a_expr + { + $$ = list_make3($1, $3, $5); + } /* * We also want to support generic substring functions that * accept the usual generic list of arguments. diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 6e98d183f6..8c034c9599 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -397,6 +397,13 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; (1 row) -- T581 regular expression substring (with SQL's bizarre regexp syntax) +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"(b_d)#"%' ESCAPE '#') AS "bcd"; + bcd +----- + bcd +(1 row) + +-- obsolete SQL99 syntax SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; bcd ----- @@ -404,75 +411,75 @@ SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; (1 row) -- No match should return NULL -SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' SIMILAR '#"(b_d)#"%' ESCAPE '#') IS NULL AS "True"; True ------ t (1 row) -- Null inputs should return NULL -SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' SIMILAR '%' ESCAPE NULL) IS NULL AS "True"; True ------ t (1 row) -SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING(NULL SIMILAR '%' ESCAPE '#') IS NULL AS "True"; True ------ t (1 row) -SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' SIMILAR NULL ESCAPE '#') IS NULL AS "True"; True ------ t (1 row) -- The first and last parts should act non-greedy -SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"g' ESCAPE '#') AS "bcdef"; bcdef ------- bcdef (1 row) -SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*' ESCAPE '#') AS "abcdefg"; abcdefg --------- abcdefg (1 row) -- Vertical bar in any part affects only that part -SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a|b#"%#"g' ESCAPE '#') AS "bcdef"; bcdef ------- bcdef (1 row) -SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"x|g' ESCAPE '#') AS "bcdef"; bcdef ------- bcdef (1 row) -SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%|ab#"g' ESCAPE '#') AS "bcdef"; bcdef ------- bcdef (1 row) -- Can't have more than two part separators -SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*#"x' ESCAPE '#') AS "error"; ERROR: SQL regular expression may not contain more than two escape-double-quote separators CONTEXT: SQL function "substring" statement 1 -- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty -SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%g' ESCAPE '#') AS "bcdefg"; bcdefg -------- bcdefg (1 row) -SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a%g' ESCAPE '#') AS "abcdefg"; abcdefg --------- abcdefg diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 3e89159a4f..14901a2692 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -132,31 +132,33 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; -- T581 regular expression substring (with SQL's bizarre regexp syntax) +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"(b_d)#"%' ESCAPE '#') AS "bcd"; +-- obsolete SQL99 syntax SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd"; -- No match should return NULL -SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' SIMILAR '#"(b_d)#"%' ESCAPE '#') IS NULL AS "True"; -- Null inputs should return NULL -SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True"; -SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True"; -SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' SIMILAR '%' ESCAPE NULL) IS NULL AS "True"; +SELECT SUBSTRING(NULL SIMILAR '%' ESCAPE '#') IS NULL AS "True"; +SELECT SUBSTRING('abcdefg' SIMILAR NULL ESCAPE '#') IS NULL AS "True"; -- The first and last parts should act non-greedy -SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef"; -SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"g' ESCAPE '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*' ESCAPE '#') AS "abcdefg"; -- Vertical bar in any part affects only that part -SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef"; -SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef"; -SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a|b#"%#"g' ESCAPE '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"x|g' ESCAPE '#') AS "bcdef"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%|ab#"g' ESCAPE '#') AS "bcdef"; -- Can't have more than two part separators -SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*#"x' ESCAPE '#') AS "error"; -- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty -SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg"; -SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%g' ESCAPE '#') AS "bcdefg"; +SELECT SUBSTRING('abcdefg' SIMILAR 'a%g' ESCAPE '#') AS "abcdefg"; -- substring() with just two arguments is not allowed by SQL spec; -- we accept it, but we interpret the pattern as a POSIX regexp not SQL -- 2.27.0