From 749715ef0758f309671f995ae71e0642c6562ecc Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 11 Dec 2019 13:32:17 +0100 Subject: [PATCH v1 2/2] Add SQL functions for Unicode normalization This adds SQL expressions NORMALIZE() and IS NORMALIZED to convert and check Unicode normal forms, per SQL standard. --- doc/src/sgml/func.sgml | 46 +++++++++ src/backend/catalog/sql_features.txt | 2 +- src/backend/catalog/system_views.sql | 15 +++ src/backend/parser/gram.y | 41 +++++++- src/backend/utils/adt/varlena.c | 130 ++++++++++++++++++++++++ src/include/catalog/pg_proc.dat | 8 ++ src/include/parser/kwlist.h | 6 ++ src/test/regress/expected/unicode.out | 81 +++++++++++++++ src/test/regress/expected/unicode_1.out | 3 + src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/unicode.sql | 32 ++++++ 12 files changed, 364 insertions(+), 3 deletions(-) create mode 100644 src/test/regress/expected/unicode.out create mode 100644 src/test/regress/expected/unicode_1.out create mode 100644 src/test/regress/sql/unicode.sql diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 57a1539506..bfe2fe8a5f 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -1500,6 +1500,28 @@ <acronym>SQL</acronym> String Functions and Operators Value: 42 + + + + normalized + + + Unicode normalization + + string is not form normalized + + boolean + + Checks whether the string is in the specified Unicode + normalization form. The optional parameter specifies the form: NFC (default), + NFD, NFKC, + NFKD. This expression can only be used if the server + encoding is UTF8. + + U&'\0061\0308bc' IS NFD NORMALIZED + true + + @@ -1550,6 +1572,30 @@ <acronym>SQL</acronym> String Functions and Operators tom + + + + normalize + + + Unicode normalization + + normalize(string text + , form ) + + text + + Converts the string in the first argument to the specified Unicode + normalization form. 
The optional second argument specifies the form + as an identifier: NFC (default), + NFD, NFKC, + NFKD. This function can only be used if the server + encoding is UTF8. + + normalize(U&'\0061\0308bc', NFC) + U&'\00E4bc' + + diff --git a/src/backend/catalog/sql_features.txt b/src/backend/catalog/sql_features.txt index ab3e381cff..59fb2812b4 100644 --- a/src/backend/catalog/sql_features.txt +++ b/src/backend/catalog/sql_features.txt @@ -257,7 +257,7 @@ F386 Set identity column generation clause YES F391 Long identifiers YES F392 Unicode escapes in identifiers YES F393 Unicode escapes in literals YES -F394 Optional normal form specification NO +F394 Optional normal form specification YES F401 Extended joined table YES F401 Extended joined table 01 NATURAL JOIN YES F401 Extended joined table 02 FULL OUTER JOIN YES diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index f7800f01a6..a7adf68449 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1332,6 +1332,21 @@ CREATE OR REPLACE FUNCTION STRICT STABLE PARALLEL SAFE AS 'jsonb_path_query_first_tz'; +-- default normalization form is NFC, per SQL standard +CREATE OR REPLACE FUNCTION + "normalize"(text, text DEFAULT 'NFC') +RETURNS text +LANGUAGE internal +STRICT IMMUTABLE PARALLEL SAFE +AS 'unicode_normalize_func'; + +CREATE OR REPLACE FUNCTION + is_normalized(text, text DEFAULT 'NFC') +RETURNS boolean +LANGUAGE internal +STRICT IMMUTABLE PARALLEL SAFE +AS 'unicode_is_normalized'; + -- -- The default permissions for functions mean that anyone can execute them. 
-- A number of functions shouldn't be executable by just anyone, but rather diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index c5086846de..8067dcbcbf 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -444,6 +444,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type substr_list trim_list %type opt_interval interval_second %type overlay_placing substr_from substr_for +%type unicode_normal_form %type opt_instead %type opt_unique opt_concurrently opt_verbose opt_full @@ -661,7 +662,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE - NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE + NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NFC NFD NFKC NFKD NO NONE + NORMALIZE NORMALIZED NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF NULLS_P NUMERIC @@ -13436,6 +13438,22 @@ a_expr: c_expr { $$ = $1; } list_make1($1), @2), @2); } + | a_expr IS NORMALIZED %prec IS + { + $$ = (Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make1($1), @2); + } + | a_expr IS unicode_normal_form NORMALIZED %prec IS + { + $$ = (Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make2($1, makeStringConst($3, @3)), @2); + } + | a_expr IS NOT NORMALIZED %prec IS + { + $$ = makeNotExpr((Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make1($1), @2), @2); + } + | a_expr IS NOT unicode_normal_form NORMALIZED %prec IS + { + $$ = makeNotExpr((Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make2($1, makeStringConst($4, @4)), @2), @2); + } | DEFAULT { /* @@ -13879,6 +13897,14 @@ func_expr_common_subexpr: { $$ = (Node *) makeFuncCall(SystemFuncName("date_part"), $3, @1); } + | NORMALIZE '(' a_expr ')' + { + $$ = (Node *) makeFuncCall(SystemFuncName("normalize"), list_make1($3), @1); + } + | NORMALIZE '(' a_expr ',' unicode_normal_form ')' + { + $$ = (Node *) 
makeFuncCall(SystemFuncName("normalize"), list_make2($3, makeStringConst($5, @5)), @1); + } | OVERLAY '(' overlay_list ')' { /* overlay(A PLACING B FROM C FOR D) is converted to @@ -14514,6 +14540,13 @@ extract_arg: | Sconst { $$ = $1; } ; +unicode_normal_form: + NFC { $$ = "nfc"; } + | NFD { $$ = "nfd"; } + | NFKC { $$ = "nfkc"; } + | NFKD { $$ = "nfkd"; } + ; + /* OVERLAY() arguments * SQL99 defines the OVERLAY() function: * o overlay(text placing text from int for int) @@ -15259,7 +15292,12 @@ unreserved_keyword: | NAMES | NEW | NEXT + | NFC + | NFD + | NFKC + | NFKD | NO + | NORMALIZED | NOTHING | NOTIFY | NOWAIT @@ -15437,6 +15475,7 @@ col_name_keyword: | NATIONAL | NCHAR | NONE + | NORMALIZE | NULLIF | NUMERIC | OUT_P diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 69165eb311..c78f25066e 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -21,6 +21,7 @@ #include "catalog/pg_collation.h" #include "catalog/pg_type.h" #include "common/int.h" +#include "common/unicode_norm.h" #include "lib/hyperloglog.h" #include "libpq/pqformat.h" #include "miscadmin.h" @@ -5956,3 +5957,132 @@ rest_of_char_same(const char *s1, const char *s2, int len) #include "levenshtein.c" #define LEVENSHTEIN_LESS_EQUAL #include "levenshtein.c" + + +/* + * Unicode support: the SQL-callable normalize() and is_normalized() functions and the form-name parser they share + */ +/* Translate a normalization form name (NFC, NFD, NFKC or NFKD, matched with pg_strcasecmp) into its UnicodeNormalizationForm value; reports an ERROR when the database encoding is not UTF8 or the name is not recognized. */ +static UnicodeNormalizationForm +unicode_norm_form_from_string(const char *formstr) +{ + UnicodeNormalizationForm form = -1; + + /* + * Might as well check this while we're here. 
+ */ + if (GetDatabaseEncoding() != PG_UTF8) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Unicode normalization can only be performed if server encoding is UTF8"))); + + if (pg_strcasecmp(formstr, "NFC") == 0) + form = UNICODE_NFC; + else if (pg_strcasecmp(formstr, "NFD") == 0) + form = UNICODE_NFD; + else if (pg_strcasecmp(formstr, "NFKC") == 0) + form = UNICODE_NFKC; + else if (pg_strcasecmp(formstr, "NFKD") == 0) + form = UNICODE_NFKD; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid normalization form: %s", formstr))); + + return form; +} +/* SQL-callable: returns text argument 0 converted to the normalization form named by text argument 1. */ +Datum +unicode_normalize_func(PG_FUNCTION_ARGS) +{ + text *input = PG_GETARG_TEXT_PP(0); + char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1)); + UnicodeNormalizationForm form; + int size; + pg_wchar *input_chars; + pg_wchar *output_chars; + unsigned char *p; + text *result; + int i; + /* resolve the form name; this call also rejects non-UTF8 database encodings */ + form = unicode_norm_form_from_string(formstr); + + /* convert to pg_wchar: decode one code point per UTF-8 character and zero-terminate the array */ + size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); + input_chars = palloc((size + 1) * sizeof(pg_wchar)); + p = (unsigned char *) VARDATA_ANY(input); + for (i = 0; i < size; i++) + { + input_chars[i] = utf8_to_unicode(p); + p += pg_utf_mblen(p); + } + input_chars[i] = (pg_wchar) '\0'; + Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); + + /* action: normalize the zero-terminated code point array into the requested form */ + output_chars = unicode_normalize(form, input_chars); + + /* convert back to UTF-8 string; this first pass only adds up the encoded byte length */ + size = 0; + for (pg_wchar *wp = output_chars; *wp; wp++) + { + unsigned char buf[4]; + + unicode_to_utf8(*wp, buf); + size += pg_utf_mblen(buf); + } + + result = palloc(size + VARHDRSZ); + SET_VARSIZE(result, size + VARHDRSZ); + /* second pass: encode each code point directly into the data area of result */ + p = (unsigned char *) VARDATA_ANY(result); + for (pg_wchar *wp = output_chars; *wp; wp++) + { + unicode_to_utf8(*wp, p); + p += pg_utf_mblen(p); + } + Assert((char *) p == (char *) result + size + VARHDRSZ); + + PG_RETURN_TEXT_P(result); +} +/* SQL-callable: returns true when text argument 0 is already in the normalization form named by text argument 1. */ +Datum +unicode_is_normalized(PG_FUNCTION_ARGS) +{ + 
text *input = PG_GETARG_TEXT_PP(0); + char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1)); + UnicodeNormalizationForm form; + int size; + pg_wchar *input_chars; + pg_wchar *output_chars; + unsigned char *p; + int i; + int output_size; + bool result; + /* resolve the form name; this call also rejects non-UTF8 database encodings */ + form = unicode_norm_form_from_string(formstr); + + /* convert to pg_wchar: decode one code point per UTF-8 character and zero-terminate the array */ + size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); + input_chars = palloc((size + 1) * sizeof(pg_wchar)); + p = (unsigned char *) VARDATA_ANY(input); + for (i = 0; i < size; i++) + { + input_chars[i] = utf8_to_unicode(p); + p += pg_utf_mblen(p); + } + input_chars[i] = (pg_wchar) '\0'; + Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); + + /* normalize and compare with original: the input counts as normalized only if the output has the same length and identical code points */ + output_chars = unicode_normalize(form, input_chars); + /* count the output code points up to the zero terminator */ + output_size = 0; + for (pg_wchar *wp = output_chars; *wp; wp++) + output_size++; + + result = (size == output_size) && + (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0); + + PG_RETURN_BOOL(result); +} diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index ac8f64b219..259d65c99a 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -10729,4 +10729,12 @@ proname => 'pg_partition_root', prorettype => 'regclass', proargtypes => 'regclass', prosrc => 'pg_partition_root' }, +{ oid => '4350', descr => 'Unicode normalization', + proname => 'normalize', prorettype => 'text', + proargtypes => 'text text', prosrc => 'unicode_normalize_func' }, + +{ oid => '4351', descr => 'check Unicode normalization', + proname => 'is_normalized', prorettype => 'bool', + proargtypes => 'text text', prosrc => 'unicode_is_normalized' }, + ] diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index 00ace8425e..7e77c651e6 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -259,8 +259,14 @@ PG_KEYWORD("natural", NATURAL, TYPE_FUNC_NAME_KEYWORD) PG_KEYWORD("nchar", 
NCHAR, COL_NAME_KEYWORD) PG_KEYWORD("new", NEW, UNRESERVED_KEYWORD) PG_KEYWORD("next", NEXT, UNRESERVED_KEYWORD) +PG_KEYWORD("nfc", NFC, UNRESERVED_KEYWORD) +PG_KEYWORD("nfd", NFD, UNRESERVED_KEYWORD) +PG_KEYWORD("nfkc", NFKC, UNRESERVED_KEYWORD) +PG_KEYWORD("nfkd", NFKD, UNRESERVED_KEYWORD) PG_KEYWORD("no", NO, UNRESERVED_KEYWORD) PG_KEYWORD("none", NONE, COL_NAME_KEYWORD) +PG_KEYWORD("normalize", NORMALIZE, COL_NAME_KEYWORD) +PG_KEYWORD("normalized", NORMALIZED, UNRESERVED_KEYWORD) PG_KEYWORD("not", NOT, RESERVED_KEYWORD) PG_KEYWORD("nothing", NOTHING, UNRESERVED_KEYWORD) PG_KEYWORD("notify", NOTIFY, UNRESERVED_KEYWORD) diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out new file mode 100644 index 0000000000..2a1e903696 --- /dev/null +++ b/src/test/regress/expected/unicode.out @@ -0,0 +1,81 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +SELECT U&'\0061\0308bc' <> U&'\00E4bc' COLLATE "C" AS sanity_check; + sanity_check +-------------- + t +(1 row) + +SELECT normalize(U&'\0061\0308\24D1c') = U&'\00E4\24D1c' COLLATE "C" AS test_default; + test_default +-------------- + t +(1 row) + +SELECT normalize(U&'\0061\0308\24D1c', NFC) = U&'\00E4\24D1c' COLLATE "C" AS test_nfc; + test_nfc +---------- + t +(1 row) + +SELECT normalize(U&'\00E4bc', NFC) = U&'\00E4bc' COLLATE "C" AS test_nfc_idem; + test_nfc_idem +--------------- + t +(1 row) + +SELECT normalize(U&'\00E4\24D1c', NFD) = U&'\0061\0308\24D1c' COLLATE "C" AS test_nfd; + test_nfd +---------- + t +(1 row) + +SELECT normalize(U&'\0061\0308\24D1c', NFKC) = U&'\00E4bc' COLLATE "C" AS test_nfkc; + test_nfkc +----------- + t +(1 row) + +SELECT normalize(U&'\00E4\24D1c', NFKD) = U&'\0061\0308bc' COLLATE "C" AS test_nfkd; + test_nfkd +----------- + t +(1 row) + +SELECT "normalize"('abc', 'def'); -- run-time error +ERROR: invalid normalization form: def +SELECT U&'\00E4\24D1c' IS NORMALIZED AS test_default; + test_default +-------------- + t 
+(1 row) + +SELECT U&'\00E4\24D1c' IS NFC NORMALIZED AS test_nfc; + test_nfc +---------- + t +(1 row) + +SELECT num, val, + val IS NFC NORMALIZED AS NFC, + val IS NFD NORMALIZED AS NFD, + val IS NFKC NORMALIZED AS NFKC, + val IS NFKD NORMALIZED AS NFKD +FROM + (VALUES (1, U&'\00E4bc'), + (2, U&'\0061\0308bc'), + (3, U&'\00E4\24D1c'), + (4, U&'\0061\0308\24D1c')) vals (num, val) +ORDER BY num; + num | val | nfc | nfd | nfkc | nfkd +-----+-----+-----+-----+------+------ + 1 | äbc | t | f | t | f + 2 | äbc | f | t | f | t + 3 | äⓑc | t | f | f | f + 4 | äⓑc | f | t | f | f +(4 rows) + +SELECT is_normalized('abc', 'def'); -- run-time error +ERROR: invalid normalization form: def diff --git a/src/test/regress/expected/unicode_1.out b/src/test/regress/expected/unicode_1.out new file mode 100644 index 0000000000..8505c4fa55 --- /dev/null +++ b/src/test/regress/expected/unicode_1.out @@ -0,0 +1,3 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index d33a4e143d..8351c361c8 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -27,7 +27,7 @@ test: strings numerology point lseg line box path polygon circle date time timet # geometry depends on point, lseg, box, path, polygon and circle # horology depends on interval, timetz, timestamp, timestamptz # ---------- -test: geometry horology regex oidjoins type_sanity opr_sanity misc_sanity comments expressions +test: geometry horology regex oidjoins type_sanity opr_sanity misc_sanity comments expressions unicode # ---------- # These four each depend on the previous one diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index f86f5c5682..2b89b3841e 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -49,6 +49,7 @@ test: opr_sanity test: misc_sanity test: comments test: expressions +test: unicode test: 
create_function_1 test: create_type test: create_table diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql new file mode 100644 index 0000000000..ccfc6fa77a --- /dev/null +++ b/src/test/regress/sql/unicode.sql @@ -0,0 +1,32 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +SELECT U&'\0061\0308bc' <> U&'\00E4bc' COLLATE "C" AS sanity_check; + +SELECT normalize(U&'\0061\0308\24D1c') = U&'\00E4\24D1c' COLLATE "C" AS test_default; +SELECT normalize(U&'\0061\0308\24D1c', NFC) = U&'\00E4\24D1c' COLLATE "C" AS test_nfc; +SELECT normalize(U&'\00E4bc', NFC) = U&'\00E4bc' COLLATE "C" AS test_nfc_idem; +SELECT normalize(U&'\00E4\24D1c', NFD) = U&'\0061\0308\24D1c' COLLATE "C" AS test_nfd; +SELECT normalize(U&'\0061\0308\24D1c', NFKC) = U&'\00E4bc' COLLATE "C" AS test_nfkc; +SELECT normalize(U&'\00E4\24D1c', NFKD) = U&'\0061\0308bc' COLLATE "C" AS test_nfkd; + +SELECT "normalize"('abc', 'def'); -- run-time error + +SELECT U&'\00E4\24D1c' IS NORMALIZED AS test_default; +SELECT U&'\00E4\24D1c' IS NFC NORMALIZED AS test_nfc; + +SELECT num, val, + val IS NFC NORMALIZED AS NFC, + val IS NFD NORMALIZED AS NFD, + val IS NFKC NORMALIZED AS NFKC, + val IS NFKD NORMALIZED AS NFKD +FROM + (VALUES (1, U&'\00E4bc'), + (2, U&'\0061\0308bc'), + (3, U&'\00E4\24D1c'), + (4, U&'\0061\0308\24D1c')) vals (num, val) +ORDER BY num; + +SELECT is_normalized('abc', 'def'); -- run-time error -- 2.24.0