From 749715ef0758f309671f995ae71e0642c6562ecc Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Wed, 11 Dec 2019 13:32:17 +0100
Subject: [PATCH v1 2/2] Add SQL functions for Unicode normalization

This adds SQL expressions NORMALIZE() and IS NORMALIZED to convert and
check Unicode normal forms, per SQL standard.
---
 doc/src/sgml/func.sgml                  |  46 +++++++++
 src/backend/catalog/sql_features.txt    |   2 +-
 src/backend/catalog/system_views.sql    |  15 +++
 src/backend/parser/gram.y               |  41 +++++++-
 src/backend/utils/adt/varlena.c         | 130 ++++++++++++++++++++++++
 src/include/catalog/pg_proc.dat         |   8 ++
 src/include/parser/kwlist.h             |   6 ++
 src/test/regress/expected/unicode.out   |  81 +++++++++++++++
 src/test/regress/expected/unicode_1.out |   3 +
 src/test/regress/parallel_schedule      |   2 +-
 src/test/regress/serial_schedule        |   1 +
 src/test/regress/sql/unicode.sql        |  32 ++++++
 12 files changed, 364 insertions(+), 3 deletions(-)
 create mode 100644 src/test/regress/expected/unicode.out
 create mode 100644 src/test/regress/expected/unicode_1.out
 create mode 100644 src/test/regress/sql/unicode.sql
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 57a1539506..bfe2fe8a5f 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1500,6 +1500,28 @@ <title><acronym>SQL</acronym> String Functions and Operators</title>
        <entry><literal>Value: 42</literal></entry>
       </row>
 
+      <row>
+       <entry>
+        <indexterm>
+         <primary>normalized</primary>
+        </indexterm>
+        <indexterm>
+         <primary>Unicode normalization</primary>
+        </indexterm>
+        <literal><parameter>string</parameter> is <optional>not</optional> <optional><parameter>form</parameter></optional> normalized</literal>
+       </entry>
+       <entry><type>boolean</type></entry>
+       <entry>
+        Checks whether the string is in the specified Unicode
+        normalization form.  The optional <parameter>form</parameter> key word
+        specifies the form: <literal>NFC</literal> (default),
+        <literal>NFD</literal>, <literal>NFKC</literal>, <literal>NFKD</literal>.
+        This expression can only be used if the server encoding is <literal>UTF8</literal>.
+       </entry>
+       <entry><literal>U&amp;'\0061\0308bc' IS NFD NORMALIZED</literal></entry>
+       <entry><literal>true</literal></entry>
+      </row>
+
       <row>
        <entry>
         <indexterm>
@@ -1550,6 +1572,30 @@ <title><acronym>SQL</acronym> String Functions and Operators</title>
        <entry><literal>tom</literal></entry>
       </row>
 
+      <row>
+       <entry>
+        <indexterm>
+         <primary>normalize</primary>
+        </indexterm>
+        <indexterm>
+         <primary>Unicode normalization</primary>
+        </indexterm>
+        <literal><function>normalize(<parameter>string</parameter> <type>text</type>
+        <optional>, <parameter>form</parameter> </optional>)</function></literal>
+       </entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Converts the string in the first argument to the specified Unicode
+        normalization form.  The optional second argument specifies the form
+        as an identifier: <literal>NFC</literal> (default),
+        <literal>NFD</literal>, <literal>NFKC</literal>,
+        <literal>NFKD</literal>.  This function can only be used if the server
+        encoding is <literal>UTF8</literal>.
+       </entry>
+       <entry><literal>normalize(U&amp;'\0061\0308bc', NFC)</literal></entry>
+       <entry><literal>U&amp;'\00E4bc'</literal></entry>
+      </row>
+
       <row>
        <entry>
         <indexterm>
diff --git a/src/backend/catalog/sql_features.txt b/src/backend/catalog/sql_features.txt
index ab3e381cff..59fb2812b4 100644
--- a/src/backend/catalog/sql_features.txt
+++ b/src/backend/catalog/sql_features.txt
@@ -257,7 +257,7 @@ F386	Set identity column generation clause			YES
 F391	Long identifiers			YES	
 F392	Unicode escapes in identifiers			YES	
 F393	Unicode escapes in literals			YES	
-F394	Optional normal form specification			NO	
+F394	Optional normal form specification			YES	
 F401	Extended joined table			YES	
 F401	Extended joined table	01	NATURAL JOIN	YES	
 F401	Extended joined table	02	FULL OUTER JOIN	YES	
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index f7800f01a6..a7adf68449 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1332,6 +1332,21 @@ CREATE OR REPLACE FUNCTION
 STRICT STABLE PARALLEL SAFE
 AS 'jsonb_path_query_first_tz';
 
+-- normalize() and is_normalized(): the form defaults to NFC, per SQL standard
+CREATE OR REPLACE FUNCTION
+  "normalize"(text, text DEFAULT 'NFC')
+RETURNS text
+LANGUAGE internal
+STRICT IMMUTABLE PARALLEL SAFE
+AS 'unicode_normalize_func';
+
+CREATE OR REPLACE FUNCTION
+  is_normalized(text, text DEFAULT 'NFC')
+RETURNS boolean
+LANGUAGE internal
+STRICT IMMUTABLE PARALLEL SAFE
+AS 'unicode_is_normalized';
+
 --
 -- The default permissions for functions mean that anyone can execute them.
 -- A number of functions shouldn't be executable by just anyone, but rather
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index c5086846de..8067dcbcbf 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -444,6 +444,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 %type <list>	substr_list trim_list
 %type <list>	opt_interval interval_second
 %type <node>	overlay_placing substr_from substr_for
+%type <str>		unicode_normal_form
 
 %type <boolean> opt_instead
 %type <boolean> opt_unique opt_concurrently opt_verbose opt_full
@@ -661,7 +662,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 
 	MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE
 
-	NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE
+	NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NFC NFD NFKC NFKD NO NONE
+	NORMALIZE NORMALIZED
 	NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF
 	NULLS_P NUMERIC
 
@@ -13436,6 +13438,22 @@ a_expr:		c_expr									{ $$ = $1; }
 												 list_make1($1), @2),
 									 @2);
 				}
+			| a_expr IS NORMALIZED								%prec IS
+				{
+					$$ = (Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make1($1), @2);
+				}
+			| a_expr IS unicode_normal_form NORMALIZED			%prec IS
+				{
+					$$ = (Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make2($1, makeStringConst($3, @3)), @2);
+				}
+			| a_expr IS NOT NORMALIZED							%prec IS
+				{
+					$$ = makeNotExpr((Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make1($1), @2), @2);
+				}
+			| a_expr IS NOT unicode_normal_form NORMALIZED		%prec IS
+				{
+					$$ = makeNotExpr((Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make2($1, makeStringConst($4, @4)), @2), @2);
+				}
 			| DEFAULT
 				{
 					/*
@@ -13879,6 +13897,14 @@ func_expr_common_subexpr:
 				{
 					$$ = (Node *) makeFuncCall(SystemFuncName("date_part"), $3, @1);
 				}
+			| NORMALIZE '(' a_expr ')'
+				{
+					$$ = (Node *) makeFuncCall(SystemFuncName("normalize"), list_make1($3), @1);
+				}
+			| NORMALIZE '(' a_expr ',' unicode_normal_form ')'
+				{
+					$$ = (Node *) makeFuncCall(SystemFuncName("normalize"), list_make2($3, makeStringConst($5, @5)), @1);
+				}
 			| OVERLAY '(' overlay_list ')'
 				{
 					/* overlay(A PLACING B FROM C FOR D) is converted to
@@ -14514,6 +14540,13 @@ extract_arg:
 			| Sconst								{ $$ = $1; }
 		;
 
+unicode_normal_form:
+			NFC										{ $$ = "nfc"; }
+			| NFD									{ $$ = "nfd"; }
+			| NFKC									{ $$ = "nfkc"; }
+			| NFKD									{ $$ = "nfkd"; }
+		;
+
 /* OVERLAY() arguments
  * SQL99 defines the OVERLAY() function:
  * o overlay(text placing text from int for int)
@@ -15259,7 +15292,12 @@ unreserved_keyword:
 			| NAMES
 			| NEW
 			| NEXT
+			| NFC
+			| NFD
+			| NFKC
+			| NFKD
 			| NO
+			| NORMALIZED
 			| NOTHING
 			| NOTIFY
 			| NOWAIT
@@ -15437,6 +15475,7 @@ col_name_keyword:
 			| NATIONAL
 			| NCHAR
 			| NONE
+			| NORMALIZE
 			| NULLIF
 			| NUMERIC
 			| OUT_P
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 69165eb311..c78f25066e 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -21,6 +21,7 @@
 #include "catalog/pg_collation.h"
 #include "catalog/pg_type.h"
 #include "common/int.h"
+#include "common/unicode_norm.h"
 #include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
@@ -5956,3 +5957,132 @@ rest_of_char_same(const char *s1, const char *s2, int len)
 #include "levenshtein.c"
 #define LEVENSHTEIN_LESS_EQUAL
 #include "levenshtein.c"
+
+
+/*
+ * Unicode support
+ */
+
+/*
+ * Translate a normalization form name (matched case-insensitively) into its
+ * UnicodeNormalizationForm value.  Raises an error for an unknown name, and
+ * also when the database encoding is not UTF8, so callers get that check too.
+ */
+static UnicodeNormalizationForm
+unicode_norm_form_from_string(const char *formstr)
+{
+	if (GetDatabaseEncoding() != PG_UTF8)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
+
+	if (pg_strcasecmp(formstr, "NFC") == 0)
+		return UNICODE_NFC;
+	if (pg_strcasecmp(formstr, "NFD") == 0)
+		return UNICODE_NFD;
+	if (pg_strcasecmp(formstr, "NFKC") == 0)
+		return UNICODE_NFKC;
+	if (pg_strcasecmp(formstr, "NFKD") == 0)
+		return UNICODE_NFKD;
+
+	ereport(ERROR,
+			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+			 errmsg("invalid normalization form: %s", formstr)));
+
+	return -1;					/* not reached; keeps the compiler quiet */
+}
+
+/* Convert the first argument to the normalization form named by the second. */
+Datum
+unicode_normalize_func(PG_FUNCTION_ARGS)
+{
+	text	   *src = PG_GETARG_TEXT_PP(0);
+	char	   *form_name = text_to_cstring(PG_GETARG_TEXT_PP(1));
+	UnicodeNormalizationForm form = unicode_norm_form_from_string(form_name);
+	int			nchars;
+	int			nbytes = 0;
+	int			idx = 0;
+	pg_wchar   *decoded;
+	pg_wchar   *normalized;
+	pg_wchar   *cp;
+	unsigned char *ptr;
+	text	   *result;
+
+	/* decode the UTF-8 input into a zero-terminated array of code points */
+	nchars = pg_mbstrlen_with_len(VARDATA_ANY(src), VARSIZE_ANY_EXHDR(src));
+	decoded = palloc((nchars + 1) * sizeof(pg_wchar));
+	ptr = (unsigned char *) VARDATA_ANY(src);
+	while (idx < nchars)
+	{
+		decoded[idx++] = utf8_to_unicode(ptr);
+		ptr += pg_utf_mblen(ptr);
+	}
+	decoded[idx] = (pg_wchar) '\0';
+	Assert((char *) ptr == VARDATA_ANY(src) + VARSIZE_ANY_EXHDR(src));
+
+	/* normalize; the output array is walked up to its zero terminator below */
+	normalized = unicode_normalize(form, decoded);
+
+	/* first pass: add up the UTF-8 length of each output code point */
+	for (cp = normalized; *cp; cp++)
+	{
+		unsigned char tmp[4];
+
+		unicode_to_utf8(*cp, tmp);
+		nbytes += pg_utf_mblen(tmp);
+	}
+
+	/* second pass: encode into a text value of exactly that payload size */
+	result = palloc(nbytes + VARHDRSZ);
+	SET_VARSIZE(result, nbytes + VARHDRSZ);
+	ptr = (unsigned char *) VARDATA_ANY(result);
+	for (cp = normalized; *cp; cp++)
+	{
+		unicode_to_utf8(*cp, ptr);
+		ptr += pg_utf_mblen(ptr);
+	}
+	Assert((char *) ptr == (char *) result + nbytes + VARHDRSZ);
+
+	PG_RETURN_TEXT_P(result);
+}
+
+/* Is the first argument already in the form named by the second argument? */
+Datum
+unicode_is_normalized(PG_FUNCTION_ARGS)
+{
+	text	   *src = PG_GETARG_TEXT_PP(0);
+	char	   *form_name = text_to_cstring(PG_GETARG_TEXT_PP(1));
+	UnicodeNormalizationForm form = unicode_norm_form_from_string(form_name);
+	int			nchars;
+	int			nnorm = 0;
+	int			idx = 0;
+	pg_wchar   *decoded;
+	pg_wchar   *normalized;
+	unsigned char *ptr;
+
+	/* decode the UTF-8 input into a zero-terminated array of code points */
+	nchars = pg_mbstrlen_with_len(VARDATA_ANY(src), VARSIZE_ANY_EXHDR(src));
+	decoded = palloc((nchars + 1) * sizeof(pg_wchar));
+	ptr = (unsigned char *) VARDATA_ANY(src);
+	while (idx < nchars)
+	{
+		decoded[idx++] = utf8_to_unicode(ptr);
+		ptr += pg_utf_mblen(ptr);
+	}
+	decoded[idx] = (pg_wchar) '\0';
+	Assert((char *) ptr == VARDATA_ANY(src) + VARSIZE_ANY_EXHDR(src));
+
+	/* normalize, then count the code points in the zero-terminated result */
+	normalized = unicode_normalize(form, decoded);
+
+	while (normalized[nnorm])
+		nnorm++;
+
+	/* a different length already proves the input was not normalized */
+	if (nnorm != nchars)
+		PG_RETURN_BOOL(false);
+
+	/* same length: normalized exactly when normalization changed nothing */
+	PG_RETURN_BOOL(memcmp(decoded, normalized,
+						  nchars * sizeof(pg_wchar)) == 0);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index ac8f64b219..259d65c99a 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10729,4 +10729,12 @@
   proname => 'pg_partition_root', prorettype => 'regclass',
   proargtypes => 'regclass', prosrc => 'pg_partition_root' },
 
+{ oid => '4350', descr => 'Unicode normalization',
+  proname => 'normalize', prorettype => 'text',
+  proargtypes => 'text text', prosrc => 'unicode_normalize_func' },
+
+{ oid => '4351', descr => 'check Unicode normalization',
+  proname => 'is_normalized', prorettype => 'bool',
+  proargtypes => 'text text', prosrc => 'unicode_is_normalized' },
+
 ]
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 00ace8425e..7e77c651e6 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -259,8 +259,14 @@ PG_KEYWORD("natural", NATURAL, TYPE_FUNC_NAME_KEYWORD)
 PG_KEYWORD("nchar", NCHAR, COL_NAME_KEYWORD)
 PG_KEYWORD("new", NEW, UNRESERVED_KEYWORD)
 PG_KEYWORD("next", NEXT, UNRESERVED_KEYWORD)
+PG_KEYWORD("nfc", NFC, UNRESERVED_KEYWORD)
+PG_KEYWORD("nfd", NFD, UNRESERVED_KEYWORD)
+PG_KEYWORD("nfkc", NFKC, UNRESERVED_KEYWORD)
+PG_KEYWORD("nfkd", NFKD, UNRESERVED_KEYWORD)
 PG_KEYWORD("no", NO, UNRESERVED_KEYWORD)
 PG_KEYWORD("none", NONE, COL_NAME_KEYWORD)
+PG_KEYWORD("normalize", NORMALIZE, COL_NAME_KEYWORD)
+PG_KEYWORD("normalized", NORMALIZED, UNRESERVED_KEYWORD)
 PG_KEYWORD("not", NOT, RESERVED_KEYWORD)
 PG_KEYWORD("nothing", NOTHING, UNRESERVED_KEYWORD)
 PG_KEYWORD("notify", NOTIFY, UNRESERVED_KEYWORD)
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
new file mode 100644
index 0000000000..2a1e903696
--- /dev/null
+++ b/src/test/regress/expected/unicode.out
@@ -0,0 +1,81 @@
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+SELECT U&'\0061\0308bc' <> U&'\00E4bc' COLLATE "C" AS sanity_check;
+ sanity_check 
+--------------
+ t
+(1 row)
+
+SELECT normalize(U&'\0061\0308\24D1c') = U&'\00E4\24D1c' COLLATE "C" AS test_default;
+ test_default 
+--------------
+ t
+(1 row)
+
+SELECT normalize(U&'\0061\0308\24D1c', NFC) = U&'\00E4\24D1c' COLLATE "C" AS test_nfc;
+ test_nfc 
+----------
+ t
+(1 row)
+
+SELECT normalize(U&'\00E4bc', NFC) = U&'\00E4bc' COLLATE "C" AS test_nfc_idem;
+ test_nfc_idem 
+---------------
+ t
+(1 row)
+
+SELECT normalize(U&'\00E4\24D1c', NFD) = U&'\0061\0308\24D1c' COLLATE "C" AS test_nfd;
+ test_nfd 
+----------
+ t
+(1 row)
+
+SELECT normalize(U&'\0061\0308\24D1c', NFKC) = U&'\00E4bc' COLLATE "C" AS test_nfkc;
+ test_nfkc 
+-----------
+ t
+(1 row)
+
+SELECT normalize(U&'\00E4\24D1c', NFKD) = U&'\0061\0308bc' COLLATE "C" AS test_nfkd;
+ test_nfkd 
+-----------
+ t
+(1 row)
+
+SELECT "normalize"('abc', 'def');  -- run-time error
+ERROR:  invalid normalization form: def
+SELECT U&'\00E4\24D1c' IS NORMALIZED AS test_default;
+ test_default 
+--------------
+ t
+(1 row)
+
+SELECT U&'\00E4\24D1c' IS NFC NORMALIZED AS test_nfc;
+ test_nfc 
+----------
+ t
+(1 row)
+
+SELECT num, val,
+    val IS NFC NORMALIZED AS NFC,
+    val IS NFD NORMALIZED AS NFD,
+    val IS NFKC NORMALIZED AS NFKC,
+    val IS NFKD NORMALIZED AS NFKD
+FROM
+  (VALUES (1, U&'\00E4bc'),
+          (2, U&'\0061\0308bc'),
+          (3, U&'\00E4\24D1c'),
+          (4, U&'\0061\0308\24D1c')) vals (num, val)
+ORDER BY num;
+ num | val | nfc | nfd | nfkc | nfkd 
+-----+-----+-----+-----+------+------
+   1 | äbc | t   | f   | t    | f
+   2 | äbc | f   | t   | f    | t
+   3 | äⓑc | t   | f   | f    | f
+   4 | äⓑc | f   | t   | f    | f
+(4 rows)
+
+SELECT is_normalized('abc', 'def');  -- run-time error
+ERROR:  invalid normalization form: def
diff --git a/src/test/regress/expected/unicode_1.out b/src/test/regress/expected/unicode_1.out
new file mode 100644
index 0000000000..8505c4fa55
--- /dev/null
+++ b/src/test/regress/expected/unicode_1.out
@@ -0,0 +1,3 @@
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index d33a4e143d..8351c361c8 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -27,7 +27,7 @@ test: strings numerology point lseg line box path polygon circle date time timet
 # geometry depends on point, lseg, box, path, polygon and circle
 # horology depends on interval, timetz, timestamp, timestamptz
 # ----------
-test: geometry horology regex oidjoins type_sanity opr_sanity misc_sanity comments expressions
+test: geometry horology regex oidjoins type_sanity opr_sanity misc_sanity comments expressions unicode
 
 # ----------
 # These four each depend on the previous one
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule
index f86f5c5682..2b89b3841e 100644
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -49,6 +49,7 @@ test: opr_sanity
 test: misc_sanity
 test: comments
 test: expressions
+test: unicode
 test: create_function_1
 test: create_type
 test: create_table
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
new file mode 100644
index 0000000000..ccfc6fa77a
--- /dev/null
+++ b/src/test/regress/sql/unicode.sql
@@ -0,0 +1,32 @@
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+SELECT U&'\0061\0308bc' <> U&'\00E4bc' COLLATE "C" AS sanity_check;
+
+SELECT normalize(U&'\0061\0308\24D1c') = U&'\00E4\24D1c' COLLATE "C" AS test_default;
+SELECT normalize(U&'\0061\0308\24D1c', NFC) = U&'\00E4\24D1c' COLLATE "C" AS test_nfc;
+SELECT normalize(U&'\00E4bc', NFC) = U&'\00E4bc' COLLATE "C" AS test_nfc_idem;
+SELECT normalize(U&'\00E4\24D1c', NFD) = U&'\0061\0308\24D1c' COLLATE "C" AS test_nfd;
+SELECT normalize(U&'\0061\0308\24D1c', NFKC) = U&'\00E4bc' COLLATE "C" AS test_nfkc;
+SELECT normalize(U&'\00E4\24D1c', NFKD) = U&'\0061\0308bc' COLLATE "C" AS test_nfkd;
+
+SELECT "normalize"('abc', 'def');  -- run-time error
+
+SELECT U&'\00E4\24D1c' IS NORMALIZED AS test_default;
+SELECT U&'\00E4\24D1c' IS NFC NORMALIZED AS test_nfc;
+
+SELECT num, val,
+    val IS NFC NORMALIZED AS NFC,
+    val IS NFD NORMALIZED AS NFD,
+    val IS NFKC NORMALIZED AS NFKC,
+    val IS NFKD NORMALIZED AS NFKD
+FROM
+  (VALUES (1, U&'\00E4bc'),
+          (2, U&'\0061\0308bc'),
+          (3, U&'\00E4\24D1c'),
+          (4, U&'\0061\0308\24D1c')) vals (num, val)
+ORDER BY num;
+
+SELECT is_normalized('abc', 'def');  -- run-time error
-- 
2.24.0