From 73f3134eccdb88587cc20e7a7aa39e2f941b3d44 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 28 Aug 2024 20:52:04 +1200
Subject: [PATCH v2] Reject non-ASCII locale names.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit bf03cfd1 started scanning the available BCP 47 locale names on
Windows.  This caused an abort/crash in the Windows runtime library if
initdb used a locale name containing non-ASCII characters, because of
our use of the setlocale() save/restore pattern with C-strings.  After
switching to another locale with a different encoding, the saved name
could no longer be understood, and setlocale() would just abort the
process.

"Turkish_Türkiye.1254" is the example from bug reports, a name that
recently changed its spelling, but there are other examples of countries
with non-ASCII characters in their English language name.

To defend against this, the following changes are made:

1.  In initdb, reject non-ASCII locale names given explicitly on the
command line, returned by the operating system environment with
setlocale("", ...), or "canonicalized" by the operating system when we
set it.

2.  In initdb only, perform the save-and-restore with the wchar_t
variant of setlocale(), so that it is not subject to round trip failure
when the char-based encoding fails.

3.  In the backend, we don't have to worry about the save-and-restore
problem because we shouldn't have let any non-ASCII locale names in as
the database default, so we just have to make sure that CREATE DATABASE
also rejects them.

Anyone who encounters the new error message should either create a new
duplicated locale with an ASCII-only name using Windows Locale Builder,
or consider using BCP 47 names like "tr-TR".  Users already couldn't
initialize a cluster with "Turkish_Türkiye.1254" on PostgreSQL 16+.
We're just changing the failure mode from a crash to an error message.
Users with existing clusters that might eventually be upgraded to 16+
must already have taken one of those approaches to keep their cluster
running, so pg_upgrade to a later version should also still be possible
with this change.

Back-patch to 16, where bf03cfd1 landed.  Older versions are affected in
theory, but only 16 and later are generating crash reports.

Reviewed-by: Andrew Dunstan <andrew@dunslane.net>
Discussion: https://postgr.es/m/PH8PR21MB3902F334A3174C54058F792CE5182%40PH8PR21MB3902.namprd21.prod.outlook.com
---
 src/backend/commands/collationcmds.c |  1 -
 src/backend/utils/adt/pg_locale.c    | 31 +++++++++++++++++++++++
 src/bin/initdb/initdb.c              | 38 +++++++++++++++++++++++++++-
 3 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
index 63ef9a08411..5147e85ba1e 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -263,7 +263,6 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
 						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
 						 errmsg("parameter \"%s\" must be specified",
 								"lc_collate")));
-
 			if (!collctype)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 643cca05d38..497e2d35896 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -58,6 +58,7 @@
 #include "catalog/pg_collation.h"
 #include "catalog/pg_database.h"
 #include "common/hashfn.h"
+#include "common/string.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "utils/builtins.h"
@@ -334,6 +335,16 @@ check_locale(int category, const char *locale, char **canonname)
 	char	   *save;
 	char	   *res;
 
+	/* Don't let Windows' non-ASCII locale names in. */
+	if (!pg_is_ascii(locale))
+	{
+		ereport(WARNING,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("locale name \"%s\" contains non-ASCII characters",
+						locale)));
+		return false;
+	}
+
 	if (canonname)
 		*canonname = NULL;		/* in case of failure */
 
@@ -341,6 +352,14 @@ check_locale(int category, const char *locale, char **canonname)
 	if (!save)
 		return false;			/* won't happen, we hope */
 
+	/*
+	 * The global locale should already have been defended against Windows'
+	 * non-ASCII locale names that can't be safely saved-and-restored using C
+	 * strings, but just in case we see one here, we'll log a warning.
+	 */
+	if (!pg_is_ascii(save))
+		elog(WARNING, "locale name \"%s\" contains non-ASCII characters", save);
+
 	/* save may be pointing at a modifiable scratch variable, see above. */
 	save = pstrdup(save);
 
@@ -356,6 +375,18 @@ check_locale(int category, const char *locale, char **canonname)
 		elog(WARNING, "failed to restore old locale \"%s\"", save);
 	pfree(save);
 
+	/* Don't let Windows' non-ASCII locale names out. */
+	if (canonname && *canonname && !pg_is_ascii(*canonname))
+	{
+		ereport(WARNING,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("locale name \"%s\" contains non-ASCII characters",
+						*canonname)));
+		pfree(*canonname);
+		*canonname = NULL;
+		return false;
+	}
+
 	return (res != NULL);
 }
 
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index f00718a0150..03f16947416 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2132,18 +2132,42 @@ locale_date_order(const char *locale)
 static void
 check_locale_name(int category, const char *locale, char **canonname)
 {
+#ifdef WIN32
+	wchar_t    *save;
+#else
 	char	   *save;
+#endif
 	char	   *res;
 
+	/* Don't let Windows' non-ASCII locale names in. */
+	if (locale && !pg_is_ascii(locale))
+		pg_fatal("locale name \"%s\" contains non-ASCII characters", locale);
+
 	if (canonname)
 		*canonname = NULL;		/* in case of failure */
 
+	/*
+	 * We can't save-and-restore Windows' non-ASCII locales safely unless we
+	 * use the wchar_t variant, because the locale we switch to might change
+	 * the expected encoding of "save" when we restore.
+	 */
+#ifdef WIN32
+	save = _wsetlocale(category, NULL);
+	if (!save)
+		pg_fatal("_wsetlocale() failed");
+
+	/* save may be pointing at a modifiable scratch variable, so copy it. */
+	save = wcsdup(save);
+	if (save == NULL)
+		pg_fatal("out of memory");
+#else
 	save = setlocale(category, NULL);
 	if (!save)
 		pg_fatal("setlocale() failed");
 
 	/* save may be pointing at a modifiable scratch variable, so copy it. */
 	save = pg_strdup(save);
+#endif
 
 	/* for setlocale() call */
 	if (!locale)
@@ -2157,8 +2181,13 @@ check_locale_name(int category, const char *locale, char **canonname)
 		*canonname = pg_strdup(res);
 
 	/* restore old value. */
+#ifdef WIN32
+	if (!_wsetlocale(category, save))
+		pg_fatal("failed to restore old locale");
+#else
 	if (!setlocale(category, save))
 		pg_fatal("failed to restore old locale \"%s\"", save);
+#endif
 	free(save);
 
 	/* complain if locale wasn't valid */
@@ -2183,6 +2212,13 @@ check_locale_name(int category, const char *locale, char **canonname)
 			pg_fatal("invalid locale settings; check LANG and LC_* environment variables");
 		}
 	}
+
+	/*
+	 * Don't let Windows' non-ASCII locale names out, in the unlikely event
+	 * that an ASCII input name was canonicalized to a non-ASCII name.
+	 */
+	if (canonname && *canonname && !pg_is_ascii(*canonname))
+		pg_fatal("locale name \"%s\" contains non-ASCII characters", *canonname);
 }
 
 /*
-- 
2.46.0

