From 071a955d44b588d4633030e7a7d06c5cfb4ff838 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 20 Aug 2019 16:02:46 +0200 Subject: [PATCH v1] Add option to use ICU as global collation provider This adds the option to use ICU as the default collation provider for either the whole cluster or a database. New options for initdb, createdb, and CREATE DATABASE are used to select this. --- doc/src/sgml/ref/createdb.sgml | 9 ++ doc/src/sgml/ref/initdb.sgml | 23 ++++ src/backend/access/hash/hashfunc.c | 18 ++- src/backend/commands/dbcommands.c | 52 ++++++++- src/backend/regex/regc_pg_locale.c | 7 +- src/backend/utils/adt/formatting.c | 6 + src/backend/utils/adt/like.c | 20 +++- src/backend/utils/adt/like_support.c | 2 + src/backend/utils/adt/pg_locale.c | 168 ++++++++++++++++----------- src/backend/utils/adt/varchar.c | 22 +++- src/backend/utils/adt/varlena.c | 26 ++++- src/backend/utils/init/postinit.c | 21 ++++ src/bin/initdb/Makefile | 2 + src/bin/initdb/initdb.c | 63 ++++++++-- src/bin/initdb/t/001_initdb.pl | 18 ++- src/bin/pg_dump/pg_dump.c | 16 +++ src/bin/psql/describe.c | 8 ++ src/bin/scripts/Makefile | 2 + src/bin/scripts/createdb.c | 9 ++ src/bin/scripts/t/020_createdb.pl | 19 ++- src/include/catalog/pg_database.dat | 2 +- src/include/catalog/pg_database.h | 3 + src/include/utils/pg_locale.h | 6 + 23 files changed, 417 insertions(+), 105 deletions(-) diff --git a/doc/src/sgml/ref/createdb.sgml b/doc/src/sgml/ref/createdb.sgml index 8fc8128bf9..5b73afad91 100644 --- a/doc/src/sgml/ref/createdb.sgml +++ b/doc/src/sgml/ref/createdb.sgml @@ -85,6 +85,15 @@ Options + + + + + Specifies the collation provider for the database. + + + + diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index da5c8f5307..9ad7b2e112 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -165,6 +165,18 @@ Options + + + + + This option sets the collation provider for databases created in the + new cluster. 
It can be overridden in the CREATE + DATABASE command when new databases are subsequently + created. The default is libc. + + + + @@ -209,6 +221,17 @@ Options + + + + + Specifies the ICU locale if the ICU collation provider is used. If + this is not specified, the value from the + <option>--locale</option> option is used. + + + + diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index 6ec1ec3df3..2f8f220549 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -255,8 +255,13 @@ hashtext(PG_FUNCTION_ARGS) errmsg("could not determine which collation to use for string hashing"), errhint("Use the COLLATE clause to set the collation explicitly."))); - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) - mylocale = pg_newlocale_from_collation(collid); + if (!lc_collate_is_c(collid)) + { + if (collid != DEFAULT_COLLATION_OID) + mylocale = pg_newlocale_from_collation(collid); + else if (global_locale.provider == COLLPROVIDER_ICU) + mylocale = &global_locale; + } if (!mylocale || mylocale->deterministic) { @@ -311,8 +316,13 @@ hashtextextended(PG_FUNCTION_ARGS) errmsg("could not determine which collation to use for string hashing"), errhint("Use the COLLATE clause to set the collation explicitly."))); - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) - mylocale = pg_newlocale_from_collation(collid); + if (!lc_collate_is_c(collid)) + { + if (collid != DEFAULT_COLLATION_OID) + mylocale = pg_newlocale_from_collation(collid); + else if (global_locale.provider == COLLPROVIDER_ICU) + mylocale = &global_locale; + } if (!mylocale || mylocale->deterministic) { diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 95881a8550..a00f08682d 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -35,6 +35,7 @@ #include "catalog/indexing.h" #include "catalog/objectaccess.h" #include "catalog/pg_authid.h" +#include "catalog/pg_collation.h" 
#include "catalog/pg_database.h" #include "catalog/pg_db_role_setting.h" #include "catalog/pg_subscription.h" @@ -86,7 +87,8 @@ static bool get_db_info(const char *name, LOCKMODE lockmode, int *encodingP, bool *dbIsTemplateP, bool *dbAllowConnP, Oid *dbLastSysOidP, TransactionId *dbFrozenXidP, MultiXactId *dbMinMultiP, - Oid *dbTablespace, char **dbCollate, char **dbCtype); + Oid *dbTablespace, char **dbCollate, char **dbCtype, + char *dbCollProvider); static bool have_createdb_privilege(void); static void remove_dbtablespaces(Oid db_id); static bool check_db_file_conflict(Oid db_id); @@ -106,6 +108,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) int src_encoding; char *src_collate; char *src_ctype; + char src_collprovider; bool src_istemplate; bool src_allowconn; Oid src_lastsysoid; @@ -127,6 +130,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) DefElem *dlocale = NULL; DefElem *dcollate = NULL; DefElem *dctype = NULL; + DefElem *dcollprovider = NULL; DefElem *distemplate = NULL; DefElem *dallowconnections = NULL; DefElem *dconnlimit = NULL; @@ -135,6 +139,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) const char *dbtemplate = NULL; char *dbcollate = NULL; char *dbctype = NULL; + char dbcollprovider = '\0'; char *canonname; int encoding = -1; bool dbistemplate = false; @@ -212,6 +217,15 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) parser_errposition(pstate, defel->location))); dctype = defel; } + else if (strcmp(defel->defname, "collation_provider") == 0) + { + if (dcollprovider) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); + dcollprovider = defel; + } else if (strcmp(defel->defname, "is_template") == 0) { if (distemplate) @@ -301,6 +315,23 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dbcollate = defGetString(dcollate); if (dctype && dctype->arg) dbctype = defGetString(dctype); + if (dcollprovider && 
dcollprovider->arg) + { + char *collproviderstr = defGetString(dcollprovider); + +#ifdef USE_ICU + if (pg_strcasecmp(collproviderstr, "icu") == 0) + dbcollprovider = COLLPROVIDER_ICU; + else +#endif + if (pg_strcasecmp(collproviderstr, "libc") == 0) + dbcollprovider = COLLPROVIDER_LIBC; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("unrecognized collation provider: %s", + collproviderstr))); + } if (distemplate && distemplate->arg) dbistemplate = defGetBoolean(distemplate); if (dallowconnections && dallowconnections->arg) @@ -350,7 +381,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) &src_dboid, &src_owner, &src_encoding, &src_istemplate, &src_allowconn, &src_lastsysoid, &src_frozenxid, &src_minmxid, &src_deftablespace, - &src_collate, &src_ctype)) + &src_collate, &src_ctype, &src_collprovider)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("template database \"%s\" does not exist", @@ -376,6 +407,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dbcollate = src_collate; if (dbctype == NULL) dbctype = src_ctype; + if (dbcollprovider == '\0') + dbcollprovider = src_collprovider; /* Some encodings are client only */ if (!PG_VALID_BE_ENCODING(encoding)) @@ -383,6 +416,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("invalid server encoding %d", encoding))); + if (dbcollprovider == COLLPROVIDER_LIBC) + { /* Check that the chosen locales are valid, and get canonical spellings */ if (!check_locale(LC_COLLATE, dbcollate, &canonname)) ereport(ERROR, @@ -396,6 +431,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dbctype = canonname; check_encoding_locale_matches(encoding, dbcollate, dbctype); + } /* * Check that the new encoding and locale settings match the source @@ -559,6 +595,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) DirectFunctionCall1(namein, CStringGetDatum(dbcollate)); new_record[Anum_pg_database_datctype - 1] = 
DirectFunctionCall1(namein, CStringGetDatum(dbctype)); + new_record[Anum_pg_database_datcollprovider - 1] = CharGetDatum(dbcollprovider); new_record[Anum_pg_database_datistemplate - 1] = BoolGetDatum(dbistemplate); new_record[Anum_pg_database_datallowconn - 1] = BoolGetDatum(dballowconnections); new_record[Anum_pg_database_datconnlimit - 1] = Int32GetDatum(dbconnlimit); @@ -832,7 +869,7 @@ dropdb(const char *dbname, bool missing_ok) pgdbrel = table_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info(dbname, AccessExclusiveLock, &db_id, NULL, NULL, - &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) + &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) { if (!missing_ok) { @@ -1023,7 +1060,7 @@ RenameDatabase(const char *oldname, const char *newname) rel = table_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info(oldname, AccessExclusiveLock, &db_id, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", oldname))); @@ -1136,7 +1173,7 @@ movedb(const char *dbname, const char *tblspcname) pgdbrel = table_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info(dbname, AccessExclusiveLock, &db_id, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, &src_tblspcoid, NULL, NULL)) + NULL, NULL, NULL, NULL, NULL, &src_tblspcoid, NULL, NULL, NULL)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", dbname))); @@ -1768,7 +1805,8 @@ get_db_info(const char *name, LOCKMODE lockmode, int *encodingP, bool *dbIsTemplateP, bool *dbAllowConnP, Oid *dbLastSysOidP, TransactionId *dbFrozenXidP, MultiXactId *dbMinMultiP, - Oid *dbTablespace, char **dbCollate, char **dbCtype) + Oid *dbTablespace, char **dbCollate, char **dbCtype, + char *dbCollProvider) { bool result = false; Relation relation; @@ -1865,6 +1903,8 @@ get_db_info(const char *name, LOCKMODE 
lockmode, *dbCollate = pstrdup(NameStr(dbform->datcollate)); if (dbCtype) *dbCtype = pstrdup(NameStr(dbform->datctype)); + if (dbCollProvider) + *dbCollProvider = dbform->datcollprovider; ReleaseSysCache(tuple); result = true; break; diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 4a808b7606..510bd71371 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -241,7 +241,12 @@ pg_set_regex_collation(Oid collation) else { if (collation == DEFAULT_COLLATION_OID) - pg_regex_locale = 0; + { + if (global_locale.provider == COLLPROVIDER_ICU) + pg_regex_locale = &global_locale; + else + pg_regex_locale = 0; + } else if (OidIsValid(collation)) { /* diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index b3115e4bea..bb5a992b9f 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1570,6 +1570,8 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) } mylocale = pg_newlocale_from_collation(collid); } + else if (global_locale.provider == COLLPROVIDER_ICU) + mylocale = &global_locale; #ifdef USE_ICU if (mylocale && mylocale->provider == COLLPROVIDER_ICU) @@ -1693,6 +1695,8 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) } mylocale = pg_newlocale_from_collation(collid); } + else if (global_locale.provider == COLLPROVIDER_ICU) + mylocale = &global_locale; #ifdef USE_ICU if (mylocale && mylocale->provider == COLLPROVIDER_ICU) @@ -1817,6 +1821,8 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) } mylocale = pg_newlocale_from_collation(collid); } + else if (global_locale.provider == COLLPROVIDER_ICU) + mylocale = &global_locale; #ifdef USE_ICU if (mylocale && mylocale->provider == COLLPROVIDER_ICU) diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 13d5cb083c..57dd3fe59d 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -150,9 +150,14 @@ 
SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c) static inline int GenericMatchText(const char *s, int slen, const char *p, int plen, Oid collation) { - if (collation && !lc_ctype_is_c(collation) && collation != DEFAULT_COLLATION_OID) + if (collation && !lc_ctype_is_c(collation)) { - pg_locale_t locale = pg_newlocale_from_collation(collation); + pg_locale_t locale = 0; + + if (collation != DEFAULT_COLLATION_OID) + locale = pg_newlocale_from_collation(collation); + else if (global_locale.provider == COLLPROVIDER_ICU) + locale = &global_locale; if (locale && !locale->deterministic) ereport(ERROR, @@ -195,11 +200,14 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) } locale = pg_newlocale_from_collation(collation); - if (locale && !locale->deterministic) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("nondeterministic collations are not supported for ILIKE"))); } + else if (global_locale.provider == COLLPROVIDER_ICU) + locale = &global_locale; + + if (locale && !locale->deterministic) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("nondeterministic collations are not supported for ILIKE"))); /* * For efficiency reasons, in the single byte case we don't call lower() diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index c8fec7863f..09a28aab8e 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -966,6 +966,8 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, } locale = pg_newlocale_from_collation(collation); } + else if (global_locale.provider == COLLPROVIDER_ICU) + locale = &global_locale; } if (typeid != BYTEAOID) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index b2f08ead45..168890b6f6 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1206,6 +1206,9 @@ lc_collate_is_c(Oid collation) static int result = -1; 
char *localeptr; + if (global_locale.provider == COLLPROVIDER_ICU) + return false; + if (result >= 0) return (bool) result; localeptr = setlocale(LC_COLLATE, NULL); @@ -1256,6 +1259,9 @@ lc_ctype_is_c(Oid collation) static int result = -1; char *localeptr; + if (global_locale.provider == COLLPROVIDER_ICU) + return false; + if (result >= 0) return (bool) result; localeptr = setlocale(LC_CTYPE, NULL); @@ -1284,6 +1290,89 @@ lc_ctype_is_c(Oid collation) return (lookup_collation_cache(collation, true))->ctype_is_c; } +struct pg_locale_struct global_locale; + +void +make_icu_collator(const char *collcollate, const char *collctype, + struct pg_locale_struct *resultp) +{ +#ifdef USE_ICU + UCollator *collator; + UErrorCode status; + + if (strcmp(collcollate, collctype) != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("collations with different collate and ctype values are not supported by ICU"))); + + status = U_ZERO_ERROR; + collator = ucol_open(collcollate, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not open collator for locale \"%s\": %s", + collcollate, u_errorName(status)))); + + if (U_ICU_VERSION_MAJOR_NUM < 54) + icu_set_collation_attributes(collator, collcollate); + + /* We will leak this string if we get an error below :-( */ + resultp->info.icu.locale = MemoryContextStrdup(TopMemoryContext, + collcollate); + resultp->info.icu.ucol = collator; +#else /* not USE_ICU */ + /* could get here if a collation was created by a build with ICU */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ICU is not supported in this build"), \ + errhint("You need to rebuild PostgreSQL using --with-icu."))); +#endif /* not USE_ICU */ +} + +void +check_collation_version(HeapTuple colltuple) +{ + Form_pg_collation collform; + Datum collversion; + bool isnull; + + collform = (Form_pg_collation) GETSTRUCT(colltuple); + + collversion = SysCacheGetAttr(COLLOID, colltuple, Anum_pg_collation_collversion, + &isnull); + if 
(!isnull) + { + char *actual_versionstr; + char *collversionstr; + + actual_versionstr = get_collation_actual_version(collform->collprovider, + NameStr(collform->collcollate)); + if (!actual_versionstr) + { + /* + * This could happen when specifying a version in CREATE + * COLLATION for a libc locale, or manually creating a mess in + * the catalogs. + */ + ereport(ERROR, + (errmsg("collation \"%s\" has no actual version, but a version was specified", + NameStr(collform->collname)))); + } + collversionstr = TextDatumGetCString(collversion); + + if (strcmp(actual_versionstr, collversionstr) != 0) + ereport(WARNING, + (errmsg("collation \"%s\" has version mismatch", + NameStr(collform->collname)), + errdetail("The collation in the database was created using version %s, " + "but the operating system provides version %s.", + collversionstr, actual_versionstr), + errhint("Rebuild all objects affected by this collation and run " + "ALTER COLLATION %s REFRESH VERSION, " + "or build PostgreSQL with the right library version.", + quote_qualified_identifier(get_namespace_name(collform->collnamespace), + NameStr(collform->collname))))); + } +} /* simple subroutine for reporting errors from newlocale() */ #ifdef HAVE_LOCALE_T @@ -1357,8 +1446,6 @@ pg_newlocale_from_collation(Oid collid) const char *collctype pg_attribute_unused(); struct pg_locale_struct result; pg_locale_t resultp; - Datum collversion; - bool isnull; tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid)); if (!HeapTupleIsValid(tp)) @@ -1428,72 +1515,10 @@ pg_newlocale_from_collation(Oid collid) } else if (collform->collprovider == COLLPROVIDER_ICU) { -#ifdef USE_ICU - UCollator *collator; - UErrorCode status; - - if (strcmp(collcollate, collctype) != 0) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("collations with different collate and ctype values are not supported by ICU"))); - - status = U_ZERO_ERROR; - collator = ucol_open(collcollate, &status); - if (U_FAILURE(status)) - 
ereport(ERROR, - (errmsg("could not open collator for locale \"%s\": %s", - collcollate, u_errorName(status)))); - - if (U_ICU_VERSION_MAJOR_NUM < 54) - icu_set_collation_attributes(collator, collcollate); - - /* We will leak this string if we get an error below :-( */ - result.info.icu.locale = MemoryContextStrdup(TopMemoryContext, - collcollate); - result.info.icu.ucol = collator; -#else /* not USE_ICU */ - /* could get here if a collation was created by a build with ICU */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("ICU is not supported in this build"), \ - errhint("You need to rebuild PostgreSQL using --with-icu."))); -#endif /* not USE_ICU */ + make_icu_collator(collcollate, collctype, &result); } - collversion = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, - &isnull); - if (!isnull) - { - char *actual_versionstr; - char *collversionstr; - - actual_versionstr = get_collation_actual_version(collform->collprovider, collcollate); - if (!actual_versionstr) - { - /* - * This could happen when specifying a version in CREATE - * COLLATION for a libc locale, or manually creating a mess in - * the catalogs. 
- */ - ereport(ERROR, - (errmsg("collation \"%s\" has no actual version, but a version was specified", - NameStr(collform->collname)))); - } - collversionstr = TextDatumGetCString(collversion); - - if (strcmp(actual_versionstr, collversionstr) != 0) - ereport(WARNING, - (errmsg("collation \"%s\" has version mismatch", - NameStr(collform->collname)), - errdetail("The collation in the database was created using version %s, " - "but the operating system provides version %s.", - collversionstr, actual_versionstr), - errhint("Rebuild all objects affected by this collation and run " - "ALTER COLLATION %s REFRESH VERSION, " - "or build PostgreSQL with the right library version.", - quote_qualified_identifier(get_namespace_name(collform->collnamespace), - NameStr(collform->collname))))); - } + check_collation_version(tp); ReleaseSysCache(tp); @@ -1520,6 +1545,17 @@ get_collation_actual_version(char collprovider, const char *collcollate) { char *collversion; + if (collprovider == COLLPROVIDER_DEFAULT) + { +#ifdef USE_ICU + if (global_locale.provider == COLLPROVIDER_ICU) + collversion = get_collation_actual_version(global_locale.provider, + global_locale.info.icu.locale); + else +#endif + collversion = NULL; + } + else #ifdef USE_ICU if (collprovider == COLLPROVIDER_ICU) { diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c index 332dc860c4..703b87a7c2 100644 --- a/src/backend/utils/adt/varchar.c +++ b/src/backend/utils/adt/varchar.c @@ -751,7 +751,7 @@ bpchareq(PG_FUNCTION_ARGS) len2 = bcTruelen(arg2); if (lc_collate_is_c(collid) || - collid == DEFAULT_COLLATION_OID || + (collid == DEFAULT_COLLATION_OID && global_locale.deterministic) || pg_newlocale_from_collation(collid)->deterministic) { /* @@ -789,7 +789,7 @@ bpcharne(PG_FUNCTION_ARGS) len2 = bcTruelen(arg2); if (lc_collate_is_c(collid) || - collid == DEFAULT_COLLATION_OID || + (collid == DEFAULT_COLLATION_OID && global_locale.deterministic) || 
pg_newlocale_from_collation(collid)->deterministic) { /* @@ -995,8 +995,13 @@ hashbpchar(PG_FUNCTION_ARGS) keydata = VARDATA_ANY(key); keylen = bcTruelen(key); - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) - mylocale = pg_newlocale_from_collation(collid); + if (!lc_collate_is_c(collid)) + { + if (collid != DEFAULT_COLLATION_OID) + mylocale = pg_newlocale_from_collation(collid); + else if (global_locale.provider == COLLPROVIDER_ICU) + mylocale = &global_locale; + } if (!mylocale || mylocale->deterministic) { @@ -1055,8 +1060,13 @@ hashbpcharextended(PG_FUNCTION_ARGS) keydata = VARDATA_ANY(key); keylen = bcTruelen(key); - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) - mylocale = pg_newlocale_from_collation(collid); + if (!lc_collate_is_c(collid)) + { + if (collid != DEFAULT_COLLATION_OID) + mylocale = pg_newlocale_from_collation(collid); + else if (global_locale.provider == COLLPROVIDER_ICU) + mylocale = &global_locale; + } if (!mylocale || mylocale->deterministic) { diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index fa08b55eb6..e59cf2d49e 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1156,8 +1156,13 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state) check_collation_set(collid); - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) - mylocale = pg_newlocale_from_collation(collid); + if (!lc_collate_is_c(collid)) + { + if (collid != DEFAULT_COLLATION_OID) + mylocale = pg_newlocale_from_collation(collid); + else if (global_locale.provider == COLLPROVIDER_ICU) + mylocale = &global_locale; + } if (mylocale && !mylocale->deterministic) ereport(ERROR, @@ -1493,6 +1498,8 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) if (collid != DEFAULT_COLLATION_OID) mylocale = pg_newlocale_from_collation(collid); + else if (global_locale.provider == COLLPROVIDER_ICU) + mylocale = &global_locale; 
/* * memcmp() can't tell us which of two unequal strings sorts first, @@ -1714,7 +1721,7 @@ texteq(PG_FUNCTION_ARGS) check_collation_set(collid); if (lc_collate_is_c(collid) || - collid == DEFAULT_COLLATION_OID || + (collid == DEFAULT_COLLATION_OID && global_locale.deterministic) || pg_newlocale_from_collation(collid)->deterministic) { Datum arg1 = PG_GETARG_DATUM(0); @@ -1768,7 +1775,7 @@ textne(PG_FUNCTION_ARGS) check_collation_set(collid); if (lc_collate_is_c(collid) || - collid == DEFAULT_COLLATION_OID || + (collid == DEFAULT_COLLATION_OID && global_locale.deterministic) || pg_newlocale_from_collation(collid)->deterministic) { Datum arg1 = PG_GETARG_DATUM(0); @@ -1880,8 +1887,13 @@ text_starts_with(PG_FUNCTION_ARGS) check_collation_set(collid); - if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) - mylocale = pg_newlocale_from_collation(collid); + if (!lc_collate_is_c(collid)) + { + if (collid != DEFAULT_COLLATION_OID) + mylocale = pg_newlocale_from_collation(collid); + else if (global_locale.provider == COLLPROVIDER_ICU) + mylocale = &global_locale; + } if (mylocale && !mylocale->deterministic) ereport(ERROR, @@ -1996,6 +2008,8 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) */ if (collid != DEFAULT_COLLATION_OID) locale = pg_newlocale_from_collation(collid); + else if (global_locale.provider == COLLPROVIDER_ICU) + locale = &global_locale; /* * There is a further exception on Windows. 
When the database diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 43b9f17f72..83a36f619d 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -31,6 +31,7 @@ #include "catalog/indexing.h" #include "catalog/namespace.h" #include "catalog/pg_authid.h" +#include "catalog/pg_collation.h" #include "catalog/pg_database.h" #include "catalog/pg_db_role_setting.h" #include "catalog/pg_tablespace.h" @@ -404,6 +405,8 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect collate = NameStr(dbform->datcollate); ctype = NameStr(dbform->datctype); + if (dbform->datcollprovider == COLLPROVIDER_LIBC) + { if (pg_perm_setlocale(LC_COLLATE, collate) == NULL) ereport(FATAL, (errmsg("database locale is incompatible with operating system"), @@ -417,6 +420,24 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect errdetail("The database was initialized with LC_CTYPE \"%s\", " " which is not recognized by setlocale().", ctype), errhint("Recreate the database with another locale or install the missing locale."))); + } + else if (dbform->datcollprovider == COLLPROVIDER_ICU) + { + make_icu_collator(collate, ctype, &global_locale); + } + + global_locale.provider = dbform->datcollprovider; + global_locale.deterministic = true; // TODO + + { + HeapTuple tp; + + tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(DEFAULT_COLLATION_OID)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for collation %u", DEFAULT_COLLATION_OID); + check_collation_version(tp); + ReleaseSysCache(tp); + } /* Make the locale settings visible as GUC variables, too */ SetConfigOption("lc_collate", collate, PGC_INTERNAL, PGC_S_OVERRIDE); diff --git a/src/bin/initdb/Makefile b/src/bin/initdb/Makefile index 7c404430a9..a9335a8ba6 100644 --- a/src/bin/initdb/Makefile +++ b/src/bin/initdb/Makefile @@ -61,6 +61,8 @@ clean distclean maintainer-clean: # ensure that changes in datadir 
propagate into object file initdb.o: initdb.c $(top_builddir)/src/Makefile.global +export with_icu + check: $(prove_check) diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 88a261d9bd..62c310040a 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -129,6 +129,8 @@ static char *lc_monetary = NULL; static char *lc_numeric = NULL; static char *lc_time = NULL; static char *lc_messages = NULL; +static char collation_provider[] = {COLLPROVIDER_LIBC, '\0'}; +static char *icu_locale = NULL; static const char *default_text_search_config = NULL; static char *username = NULL; static bool pwprompt = false; @@ -1412,10 +1414,13 @@ bootstrap_template1(void) encodingid_to_string(encodingid)); bki_lines = replace_token(bki_lines, "LC_COLLATE", - escape_quotes_bki(lc_collate)); + escape_quotes_bki(collation_provider[0] == COLLPROVIDER_ICU ? icu_locale : lc_collate)); bki_lines = replace_token(bki_lines, "LC_CTYPE", - escape_quotes_bki(lc_ctype)); + escape_quotes_bki(collation_provider[0] == COLLPROVIDER_ICU ? icu_locale : lc_ctype)); + + bki_lines = replace_token(bki_lines, "COLLPROVIDER", + collation_provider); /* Also ensure backend isn't confused by this environment var: */ unsetenv("PGCLIENTENCODING"); @@ -1707,6 +1712,12 @@ setup_description(FILE *cmdfd) static void setup_collation(FILE *cmdfd) { + /* + * Set version of the default collation. + */ + PG_CMD_PRINTF("UPDATE pg_collation SET collversion = pg_collation_actual_version(oid) WHERE oid = %d;\n\n", + DEFAULT_COLLATION_OID); + /* * Add an SQL-standard name. We don't want to pin this, so it doesn't go * in pg_collation.h. 
But add it before reading system collations, so @@ -1995,8 +2006,6 @@ make_template0(FILE *cmdfd) { const char *const *line; static const char *const template0_setup[] = { - "CREATE DATABASE template0 IS_TEMPLATE = true ALLOW_CONNECTIONS = false;\n\n", - /* * We use the OID of template0 to determine datlastsysoid */ @@ -2021,6 +2030,9 @@ make_template0(FILE *cmdfd) NULL }; + PG_CMD_PRINTF("CREATE DATABASE template0 IS_TEMPLATE = true ALLOW_CONNECTIONS = false COLLATION_PROVIDER = %s;\n\n", + collation_provider[0] == COLLPROVIDER_ICU ? "icu" : "libc"); + for (line = template0_setup; *line; line++) PG_CMD_PUTS(*line); } @@ -2293,13 +2305,14 @@ setlocales(void) lc_monetary = locale; if (!lc_messages) lc_messages = locale; + if (!icu_locale) + icu_locale = locale; } /* * canonicalize locale names, and obtain any missing values from our * current environment */ - check_locale_name(LC_CTYPE, lc_ctype, &canonname); lc_ctype = canonname; check_locale_name(LC_COLLATE, lc_collate, &canonname); @@ -2318,6 +2331,18 @@ setlocales(void) check_locale_name(LC_CTYPE, lc_messages, &canonname); lc_messages = canonname; #endif + + /* + * If ICU is selected but no ICU locale has been given, take the + * lc_collate locale and chop off any encoding suffix. This should give + * the user a configuration that resembles their operating system's locale + * setup. 
+ */ + if (collation_provider[0] == COLLPROVIDER_ICU && !icu_locale) + { + icu_locale = pg_strdup(lc_collate); + icu_locale[strcspn(icu_locale, ".")] = '\0'; + } } /* @@ -2333,9 +2358,12 @@ usage(const char *progname) printf(_(" -A, --auth=METHOD default authentication method for local connections\n")); printf(_(" --auth-host=METHOD default authentication method for local TCP/IP connections\n")); printf(_(" --auth-local=METHOD default authentication method for local-socket connections\n")); + printf(_(" --collation-provider={libc|icu}\n" + " set default collation provider for new databases\n")); printf(_(" [-D, --pgdata=]DATADIR location for this database cluster\n")); printf(_(" -E, --encoding=ENCODING set default encoding for new databases\n")); printf(_(" -g, --allow-group-access allow group read/execute on data directory\n")); + printf(_(" --icu-locale set ICU locale for new databases\n")); printf(_(" --locale=LOCALE set default locale for new databases\n")); printf(_(" --lc-collate=, --lc-ctype=, --lc-messages=LOCALE\n" " --lc-monetary=, --lc-numeric=, --lc-time=LOCALE\n" @@ -2510,7 +2538,8 @@ setup_locale_encoding(void) strcmp(lc_ctype, lc_time) == 0 && strcmp(lc_ctype, lc_numeric) == 0 && strcmp(lc_ctype, lc_monetary) == 0 && - strcmp(lc_ctype, lc_messages) == 0) + strcmp(lc_ctype, lc_messages) == 0 && + (!icu_locale || strcmp(lc_ctype, icu_locale) == 0)) printf(_("The database cluster will be initialized with locale \"%s\".\n"), lc_ctype); else { @@ -2527,9 +2556,13 @@ setup_locale_encoding(void) lc_monetary, lc_numeric, lc_time); + if (icu_locale) + printf(_(" ICU: %s\n"), icu_locale); } - if (!encoding) + if (!encoding && collation_provider[0] == COLLPROVIDER_ICU) + encodingid = PG_UTF8; + else if (!encoding) { int ctype_enc; @@ -3029,6 +3062,8 @@ main(int argc, char *argv[]) {"wal-segsize", required_argument, NULL, 12}, {"data-checksums", no_argument, NULL, 'k'}, {"allow-group-access", no_argument, NULL, 'g'}, + {"collation-provider", required_argument, 
NULL, 13}, + {"icu-locale", required_argument, NULL, 14}, {NULL, 0, NULL, 0} }; @@ -3167,6 +3202,20 @@ main(int argc, char *argv[]) case 'g': SetDataDirectoryCreatePerm(PG_DIR_MODE_GROUP); break; + case 13: + if (strcmp(optarg, "icu") == 0) + collation_provider[0] = COLLPROVIDER_ICU; + else if (strcmp(optarg, "libc") == 0) + collation_provider[0] = COLLPROVIDER_LIBC; + else + { + pg_log_error("unrecognized collation provider: %s", optarg); + exit(1); + } + break; + case 14: + icu_locale = pg_strdup(optarg); + break; default: /* getopt_long already emitted a complaint */ fprintf(stderr, _("Try \"%s --help\" for more information.\n"), diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 8387b945d3..90f6fc8f14 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -8,7 +8,7 @@ use File::stat qw{lstat}; use PostgresNode; use TestLib; -use Test::More tests => 22; +use Test::More tests => 24; my $tempdir = TestLib::tempdir; my $xlogdir = "$tempdir/pgxlog"; @@ -89,3 +89,19 @@ ok(check_mode_recursive($datadir_group, 0750, 0640), 'check PGDATA permissions'); } + +# Collation provider tests + +if ($ENV{with_icu} eq 'yes') +{ + command_ok(['initdb', '--no-sync', '--collation-provider=icu', "$tempdir/data2"], + 'collation provider ICU'); +} +else +{ + command_fails(['initdb', '--no-sync', '--collation-provider=icu', "$tempdir/data2"], + 'collation provider ICU fails since no ICU support'); +} + +command_fails(['initdb', '--no-sync', '--collation-provider=xyz', "$tempdir/dataX"], + 'fails for invalid collation provider'); diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 34981401bf..b1932c227f 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -2638,6 +2638,7 @@ dumpDatabase(Archive *fout) i_datname, i_dba, i_encoding, + i_datcollprovider, i_collate, i_ctype, i_frozenxid, @@ -2652,6 +2653,7 @@ dumpDatabase(Archive *fout) const char *datname, *dba, *encoding, + 
*datcollprovider, *collate, *ctype, *datacl, @@ -2680,6 +2682,7 @@ dumpDatabase(Archive *fout) appendPQExpBuffer(dbQry, "SELECT tableoid, oid, datname, " "(%s datdba) AS dba, " "pg_encoding_to_char(encoding) AS encoding, " + "datcollprovider, " "datcollate, datctype, datfrozenxid, datminmxid, " "(SELECT array_agg(acl ORDER BY row_n) FROM " " (SELECT acl, row_n FROM " @@ -2772,6 +2775,7 @@ dumpDatabase(Archive *fout) i_datname = PQfnumber(res, "datname"); i_dba = PQfnumber(res, "dba"); i_encoding = PQfnumber(res, "encoding"); + i_datcollprovider = PQfnumber(res, "datcollprovider"); i_collate = PQfnumber(res, "datcollate"); i_ctype = PQfnumber(res, "datctype"); i_frozenxid = PQfnumber(res, "datfrozenxid"); @@ -2787,6 +2791,7 @@ dumpDatabase(Archive *fout) datname = PQgetvalue(res, 0, i_datname); dba = PQgetvalue(res, 0, i_dba); encoding = PQgetvalue(res, 0, i_encoding); + datcollprovider = PQgetvalue(res, 0, i_datcollprovider); collate = PQgetvalue(res, 0, i_collate); ctype = PQgetvalue(res, 0, i_ctype); frozenxid = atooid(PQgetvalue(res, 0, i_frozenxid)); @@ -2812,6 +2817,17 @@ dumpDatabase(Archive *fout) appendPQExpBufferStr(creaQry, " ENCODING = "); appendStringLiteralAH(creaQry, encoding, fout); } + if (strlen(datcollprovider) > 0) + { + appendPQExpBufferStr(creaQry, " COLLATION_PROVIDER = "); + if (datcollprovider[0] == 'c') + appendPQExpBufferStr(creaQry, "libc"); + else if (datcollprovider[0] == 'i') + appendPQExpBufferStr(creaQry, "icu"); + else + fatal("unrecognized collation provider: %s", + datcollprovider); + } if (strlen(collate) > 0 && strcmp(collate, ctype) == 0) { appendPQExpBufferStr(creaQry, " LOCALE = "); diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 774cc764ff..0a38eb15b1 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -879,6 +879,14 @@ listAllDbs(const char *pattern, bool verbose) " d.datctype as \"%s\",\n", gettext_noop("Collate"), gettext_noop("Ctype")); + if (pset.sversion >= 130000) + 
appendPQExpBuffer(&buf, + " CASE d.datcollprovider WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", + gettext_noop("Provider")); + else + appendPQExpBuffer(&buf, + " 'libc' AS \"%s\",\n", + gettext_noop("Provider")); appendPQExpBufferStr(&buf, " "); printACLColumn(&buf, "d.datacl"); if (verbose && pset.sversion >= 80200) diff --git a/src/bin/scripts/Makefile b/src/bin/scripts/Makefile index ede665090f..ef4f8593dc 100644 --- a/src/bin/scripts/Makefile +++ b/src/bin/scripts/Makefile @@ -53,6 +53,8 @@ clean distclean maintainer-clean: rm -f common.o scripts_parallel.o $(WIN32RES) rm -rf tmp_check +export with_icu + check: $(prove_check) diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c index 64bcc20cb4..5944fd3f63 100644 --- a/src/bin/scripts/createdb.c +++ b/src/bin/scripts/createdb.c @@ -37,6 +37,7 @@ main(int argc, char *argv[]) {"lc-ctype", required_argument, NULL, 2}, {"locale", required_argument, NULL, 'l'}, {"maintenance-db", required_argument, NULL, 3}, + {"collation-provider", required_argument, NULL, 4}, {NULL, 0, NULL, 0} }; @@ -59,6 +60,7 @@ main(int argc, char *argv[]) char *lc_collate = NULL; char *lc_ctype = NULL; char *locale = NULL; + char *collation_provider = NULL; PQExpBufferData sql; @@ -117,6 +119,9 @@ main(int argc, char *argv[]) case 3: maintenance_db = pg_strdup(optarg); break; + case 4: + collation_provider = pg_strdup(optarg); + break; default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); exit(1); @@ -193,6 +198,8 @@ main(int argc, char *argv[]) appendPQExpBuffer(&sql, " LC_COLLATE '%s'", lc_collate); if (lc_ctype) appendPQExpBuffer(&sql, " LC_CTYPE '%s'", lc_ctype); + if (collation_provider) + appendPQExpBuffer(&sql, " COLLATION_PROVIDER %s", collation_provider); appendPQExpBufferChar(&sql, ';'); @@ -250,6 +257,8 @@ help(const char *progname) printf(_("Usage:\n")); printf(_(" %s [OPTION]... 
[DBNAME] [DESCRIPTION]\n"), progname); printf(_("\nOptions:\n")); + printf(_(" --collation-provider={libc|icu}\n" + " collation provider for the database\n")); printf(_(" -D, --tablespace=TABLESPACE default tablespace for the database\n")); printf(_(" -e, --echo show the commands being sent to the server\n")); printf(_(" -E, --encoding=ENCODING encoding for the database\n")); diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index c0f6067a92..9e8220335f 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -3,7 +3,7 @@ use PostgresNode; use TestLib; -use Test::More tests => 13; +use Test::More tests => 16; program_help_ok('createdb'); program_version_ok('createdb'); @@ -22,5 +22,22 @@ qr/statement: CREATE DATABASE foobar2 ENCODING 'LATIN1'/, 'create database with encoding'); +if ($ENV{with_icu} eq 'yes') +{ + $node->issues_sql_like( + [ 'createdb', '-T', 'template0', '--collation-provider=icu', 'foobar3' ], + qr/statement: CREATE DATABASE foobar3 .* COLLATION_PROVIDER icu/, + 'create database with ICU'); +} +else +{ + $node->command_fails( + [ 'createdb', '-T', 'template0', '--collation-provider=icu', 'foobar3' ], + 'create database with ICU fails since no ICU support'); + pass; +} + $node->command_fails([ 'createdb', 'foobar1' ], 'fails if database already exists'); +$node->command_fails([ 'createdb', '-T', 'template0', '--collation-provider=xyz', 'foobarX' ], + 'fails for invalid collation provider'); diff --git a/src/include/catalog/pg_database.dat b/src/include/catalog/pg_database.dat index 89bd75d024..f261cdd838 100644 --- a/src/include/catalog/pg_database.dat +++ b/src/include/catalog/pg_database.dat @@ -15,7 +15,7 @@ { oid => '1', oid_symbol => 'TemplateDbOid', descr => 'default template for new databases', datname => 'template1', encoding => 'ENCODING', datcollate => 'LC_COLLATE', - datctype => 'LC_CTYPE', datistemplate => 't', datallowconn => 't', + datctype => 'LC_CTYPE', datcollprovider 
=> 'COLLPROVIDER', datistemplate => 't', datallowconn => 't', datconnlimit => '-1', datlastsysoid => '0', datfrozenxid => '0', datminmxid => '1', dattablespace => 'pg_default', datacl => '_null_' }, diff --git a/src/include/catalog/pg_database.h b/src/include/catalog/pg_database.h index 06fea45f53..ab3c0951df 100644 --- a/src/include/catalog/pg_database.h +++ b/src/include/catalog/pg_database.h @@ -46,6 +46,9 @@ CATALOG(pg_database,1262,DatabaseRelationId) BKI_SHARED_RELATION BKI_ROWTYPE_OID /* LC_CTYPE setting */ NameData datctype; + /* see pg_collation.collprovider */ + char datcollprovider; + /* allowed as CREATE DATABASE template? */ bool datistemplate; diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index b4b3aa5843..17fcee1e89 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -101,6 +101,12 @@ struct pg_locale_struct typedef struct pg_locale_struct *pg_locale_t; +extern struct pg_locale_struct global_locale; + +extern void make_icu_collator(const char *collcollate, const char *collctype, + struct pg_locale_struct *resultp); +extern void check_collation_version(HeapTuple colltuple); + extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); base-commit: 56f8f9624ba050c7c47dd97547b7fafb866f2bdd -- 2.22.0