From b98e310a2c22ff0f0a0b6125189a7e7e7dd4b91d Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 14 May 2024 16:46:27 +1200
Subject: [PATCH v2] Make win32locale.c's kludge table dynamic.

Historically, we transformed Windows' locale names on entry to and exit
from setlocale() using a hard-coded lookup table, because they are
unstable and we can't really handle non-ASCII locale names.  We
shouldn't be using these unstable display names, but *setlocale()
itself* gave them to us, and it can't even understand everything it
returns, leading to a series of historical kludges.

Ideally we should soon switch to BCP47 codes for new clusters, but we'll
have to support the older names in ancient database clusters for now and
deal with the transition.

So, provide users with an emergency option for getting unstuck, since
they probably don't want to wait 3-6 months for a new binary with an
updated kludge table when their database fails to come up.  While there
are probably workarounds within Windows tools, it seems easy enough to
offer a textfile of name mappings.  It might also have applications in
the transition to BCP47.

To help Unix-based developers who couldn't otherwise test this code,
provide a developer-only way to replace setlocale() on Unix too, if the
macro DEBUG_SETLOCALE_MAP is defined.

Discussion: https://postgr.es/m/PH8PR21MB3902F334A3174C54058F792CE5182%40PH8PR21MB3902.namprd21.prod.outlook.com
---
 doc/src/sgml/charset.sgml        |  52 +++-
 src/include/pg_config_manual.h   |   6 +
 src/include/port.h               |  11 +
 src/port/chklocale.c             |   7 +
 src/port/win32setlocale.c        | 402 ++++++++++++++++++++++++-------
 src/tools/pgindent/typedefs.list |   1 +
 6 files changed, 396 insertions(+), 83 deletions(-)

diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
index 834cb30c85..62a4f86fde 100644
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -630,8 +630,58 @@ SELECT 'N-45' &lt; 'N-123' COLLATE mycollation5 as result;
     mailing list.
    </para>
   </sect2>
- </sect1>
 
+  <sect2 id="locale-windows">
+   <title>Windows locale names</title>
+   <para>
+    Before adopting locale names based on BCP 47 language tags, Windows
+    used names spelled out in full English words.  The traditional names are
+    still supported by Windows and PostgreSQL, but are not recommended.  They
+    can change unexpectedly in operating system updates, which can cause PostgreSQL to
+    fail.  It may have references to the old name in <literal>postgresql.conf</literal>
+    and the <link
+    linkend="catalog-pg-database"><structname>pg_database</structname></link>
+    catalog.  They also sometimes have non-ASCII characters, which are
+    not allowed in shared catalog tables.
+   </para>
+   <para>
+    To fix a cluster that is negatively affected by a renamed locale, a mapping
+    file can be installed to rename locales on the fly.  Its full path can either
+    be provided in the environment variable <literal>PG_SETLOCALE_MAP</literal>,
+    or in a file named <filename>share/setlocale.map</filename> in the installation
+    directory.
+   </para>
+   <para>
+    Each non-comment line of the file has a pattern and a replacement string
+    separated by an equal sign.  The pattern can contain
+    <literal>?</literal> wildcard characters, and optionally a <literal>*</literal>
+    wildcard to match any sequence.  The replacement name can be any sort of locale
+    name, including a stable BCP 47 locale name.  For example:
+<screen>
+# "Turkish_Turkey.1254" changed to "Turkish_Türkiye.1254" in a Windows update.
+# If either of these names is the database default, convert them to the
+# stable BCP 47 equivalent on the fly.  Use a wildcard to detect both
+# names and avoid encoding problems in the name itself.
+Turkish_T*.1254=tr-TR.1254
+</screen>
+   </para>
+   <para>
+    The mapping file is only intended to address renaming problems.  Mapping a
+    locale name to an incompatible one could cause index corruption, due to
+    differing collation order.
+   </para>
+   <para>
+    Unless a file is installed by the user, PostgreSQL behaves as though the
+    following file were installed, for historical reasons:
+<screen>
+Hong Kong S.A.R.=HKG
+.A.E.=ARE
+Chinese (Traditional)_Maca? S.A.R..950=ZHM
+Chinese_Maca? S.A.R..950=ZHM
+</screen>
+   </para>
+  </sect2>
+ </sect1>
 
  <sect1 id="collation">
   <title>Collation Support</title>
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index f941ee2faf..e04b02908a 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -141,6 +141,12 @@
 #define EXEC_BACKEND
 #endif
 
+/*
+ * Define DEBUG_SETLOCALE_MAP to test the setlocale() replacement function on
+ * Unix sytems.  Normally it is used only on Windows.
+ */
+/* #define DEBUG_SETLOCALE_MAP */
+
 /*
  * USE_POSIX_FADVISE controls whether Postgres will attempt to use the
  * posix_fadvise() kernel call.  Usually the automatic configure tests are
diff --git a/src/include/port.h b/src/include/port.h
index ae115d2d97..0f482391c6 100644
--- a/src/include/port.h
+++ b/src/include/port.h
@@ -501,6 +501,17 @@ extern bool wait_result_is_signal(int exit_status, int signum);
 extern bool wait_result_is_any_signal(int exit_status, bool include_command_not_found);
 extern int	wait_result_to_exit_code(int exit_status);
 
+/*
+ * Debug-only option to allow Unix systems to test the setlocale() replacement
+ * code we use on Windows.
+ */
+#ifdef DEBUG_SETLOCALE_MAP
+/* port/win32setlocale.c */
+#undef setlocale
+extern char *pgwin32_setlocale(int category, const char *locale);
+#define setlocale(a,b) pgwin32_setlocale(a,b)
+#endif
+
 /*
  * Interfaces that we assume all Unix system have.  We retain individual macros
  * for better documentation.
diff --git a/src/port/chklocale.c b/src/port/chklocale.c
index 8cb81c8640..7e8bcfbf2d 100644
--- a/src/port/chklocale.c
+++ b/src/port/chklocale.c
@@ -431,3 +431,10 @@ pg_get_encoding_from_locale(const char *ctype, bool write_message)
 }
 
 #endif							/* (HAVE_LANGINFO_H && CODESET) || WIN32 */
+
+/*
+ * An option allowing Unix developers to test the pgwin32_setlocale() function.
+ */
+#if !defined(WIN32) && defined(DEBUG_SETLOCALE_MAP)
+#include "win32setlocale.c"
+#endif
diff --git a/src/port/win32setlocale.c b/src/port/win32setlocale.c
index 9e2ab8cc3a..490b1ae0d5 100644
--- a/src/port/win32setlocale.c
+++ b/src/port/win32setlocale.c
@@ -36,25 +36,38 @@
 
 #undef setlocale
 
-struct locale_map
-{
-	/*
-	 * String in locale name to replace. Can be a single string (end is NULL),
-	 * or separate start and end strings. If two strings are given, the locale
-	 * name must contain both of them, and everything between them is
-	 * replaced. This is used for a poor-man's regexp search, allowing
-	 * replacement of "start.*end".
-	 */
-	const char *locale_name_start;
-	const char *locale_name_end;
-
-	const char *replacement;	/* string to replace the match with */
-};
+#ifndef FRONTEND
+#include "pg_config_paths.h"
+#endif
 
 /*
- * Mappings applied before calling setlocale(), to the argument.
+ * The path of a text file that can be created under PGDATA to override these
+ * rules.  It allows locale names to be overridden on input to setlocale, and
+ * on return from setlocale when querying the default.  This is intended as an
+ * option of last resort for users whose system becomes unstartable due to an
+ * operating system update that changes a locale name.
+ *
+ * # comments begin a hash sign
+ * call pattern=replacement
+ * return pattern=replacemnt
+ *
+ * The pattern syntax supports ? for any character (really byte), and * for
+ * any sequence, though only one star can be used in the whole pattern.
+ *
+ * The encoding of the file is effectively undefined; setlocale() works with
+ * the current Windows ACP, and PostgreSQL thinks the strings should be ASCII
+ * or some undefined superset.  This could lead to some confusion if databases
+ * have different encodings, so it's likely that replacements should use BCP47
+ * tags if possible.
  */
-static const struct locale_map locale_map_argument[] = {
+
+typedef struct Win32LocaleTableEntry
+{
+	const char *pattern;
+	const char *replacement;
+} Win32LocaleTableEntry;
+
+static const Win32LocaleTableEntry default_call_mapping_table[] = {
 	/*
 	 * "HKG" is listed here:
 	 * http://msdn.microsoft.com/en-us/library/cdax410z%28v=vs.71%29.aspx
@@ -63,8 +76,8 @@ static const struct locale_map locale_map_argument[] = {
 	 * "ARE" is the ISO-3166 three-letter code for U.A.E. It is not on the
 	 * above list, but seems to work anyway.
 	 */
-	{"Hong Kong S.A.R.", NULL, "HKG"},
-	{"U.A.E.", NULL, "ARE"},
+	{"Hong Kong S.A.R.", "HKG"},
+	{"U.A.E.", "ARE"},
 
 	/*
 	 * The ISO-3166 country code for Macau S.A.R. is MAC, but Windows doesn't
@@ -79,17 +92,13 @@ static const struct locale_map locale_map_argument[] = {
 	 *
 	 * Some versions of Windows spell it "Macau", others "Macao".
 	 */
-	{"Chinese (Traditional)_Macau S.A.R..950", NULL, "ZHM"},
-	{"Chinese_Macau S.A.R..950", NULL, "ZHM"},
-	{"Chinese (Traditional)_Macao S.A.R..950", NULL, "ZHM"},
-	{"Chinese_Macao S.A.R..950", NULL, "ZHM"},
-	{NULL, NULL, NULL}
+	{"Chinese (Traditional)_Maca? S.A.R..950", "ZHM"},
+	{"Chinese_Maca? S.A.R..950", "ZHM"},
+
+	{NULL}
 };
 
-/*
- * Mappings applied after calling setlocale(), to its return value.
- */
-static const struct locale_map locale_map_result[] = {
+static const Win32LocaleTableEntry return_mapping_table[] = {
 	/*
 	 * "Norwegian (Bokm&aring;l)" locale name contains the a-ring character.
 	 * Map it to a pure-ASCII alias.
@@ -100,84 +109,309 @@ static const struct locale_map locale_map_result[] = {
 	 * Just to make life even more complicated, some versions of Windows spell
 	 * the locale name without parentheses.  Translate that too.
 	 */
-	{"Norwegian (Bokm", "l)_Norway", "Norwegian_Norway"},
-	{"Norwegian Bokm", "l_Norway", "Norwegian_Norway"},
-	{NULL, NULL, NULL}
+	{"Norwegian (Bokm*l)_Norway", "Norwegian_Norway"},
+	{"Norwegian Bokm*l_Norway", "Norwegian_Norway"},
+
+	{NULL}
 };
 
-#define MAX_LOCALE_NAME_LEN		100
+static const Win32LocaleTableEntry *call_mapping_table;
 
-static const char *
-map_locale(const struct locale_map *map, const char *locale)
+/*
+ * Parse a line of the mapping file.  Returns 0 on success. Squawks to stderr
+ * on failure, but also sets the errno for setlocale() to fail with and
+ * returns -1 in that case.
+ */
+static int
+parse_line(const char *path,
+		   char *line,
+		   int line_number,
+		   char **pattern,
+		   char **replacement)
+{
+	const char *delimiter;
+	size_t		len;
+
+	/* Strip line endings. */
+	while ((len = strlen(line)) > 0 &&
+		   (line[len - 1] == '\r' || line[len - 1] == '\n'))
+		line[len - 1] = '\0';
+
+	/* Skip empty lines and shell-style comments. */
+	if (line[0] == '\0' || line[0] == '#')
+		return 0;
+
+	/* Look for the equal sign followed by something. */
+	delimiter = strchr(line, '=');
+	if (!delimiter || delimiter[1] == '\0')
+	{
+		fprintf(stderr,
+				"syntax error on line %d of \"%s\", expected pattern=replacement'\n",
+				line_number, path);
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* Copy the pattern. */
+	len = delimiter - line;
+	*pattern = malloc(len + 1);
+	if (!*pattern)
+	{
+		errno = ENOMEM;
+		return -1;
+	}
+	memcpy(*pattern, line, len);
+	(*pattern)[len] = '\0';
+
+	/* Copy the replacement. */
+	*replacement = strdup(delimiter + 1);
+	if (!*replacement)
+	{
+		free(*pattern);
+		errno = ENOMEM;
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Free a mapping table.  Only used for cleanup on failure, because otherwise
+ * the mapping table is built once and sticks around until process exit.
+ */
+static void
+free_call_mapping_table(Win32LocaleTableEntry *table)
+{
+	while (table->pattern)
+	{
+		free(unconstify(char *, table->pattern));
+		free(unconstify(char *, table->replacement));
+		table++;
+	}
+	free(table);
+}
+
+/*
+ * Initialize call_mapping_table and call_mapping_table_size.  On failure,
+ * errno is set and an error message is written to stderr.
+ */
+static int
+initialize_call_mapping_table(void)
 {
-	static char aliasbuf[MAX_LOCALE_NAME_LEN];
-	int			i;
+	FILE	   *file;
+	const char *path;
+	char		line[128];
+	int			line_number;
+	Win32LocaleTableEntry *table;
+	size_t		table_size;
+	size_t		table_capacity;
+
+	/* Has the user specified a map file in the environment? */
+	path = getenv("PG_SETLOCALE_MAP");
 
-	/* Check if the locale name matches any of the problematic ones. */
-	for (i = 0; map[i].locale_name_start != NULL; i++)
+	/*
+	 * In backends only, fall back to trying to find a file installed in the
+	 * share directory.
+	 */
+	if (!path)
 	{
-		const char *needle_start = map[i].locale_name_start;
-		const char *needle_end = map[i].locale_name_end;
-		const char *replacement = map[i].replacement;
-		char	   *match;
-		char	   *match_start = NULL;
-		char	   *match_end = NULL;
-
-		match = strstr(locale, needle_start);
-		if (match)
+#ifndef FRONTEND
+		path = PGSHAREDIR "/setlocale.map";
+#else
+		/* Use defaults in frontend. */
+		call_mapping_table = default_call_mapping_table;
+		return 0;
+#endif
+	}
+
+	/* If there is no mapping file at that path, use defaults. */
+	file = fopen(path, "r");
+	if (!file)
+	{
+		call_mapping_table = default_call_mapping_table;
+		return 0;
+	}
+
+	/* Initial guess at required space. */
+	table_size = 0;
+	table_capacity = 16;
+	table = malloc(sizeof(Win32LocaleTableEntry) * table_capacity);
+	if (table == NULL)
+	{
+		errno = ENOMEM;
+		return -1;
+	}
+	table->pattern = NULL;
+
+	/* Read the file line-by-line. */
+	line_number = 0;
+	while (fgets(line, sizeof(line), file))
+	{
+		char	   *pattern = NULL;
+		char	   *replacement;
+
+		if (parse_line(path,
+					   line,
+					   line_number++,
+					   &pattern,
+					   &replacement) != 0)
 		{
-			/*
-			 * Found a match for the first part. If this was a two-part
-			 * replacement, find the second part.
-			 */
-			match_start = match;
-			if (needle_end)
-			{
-				match = strstr(match_start + strlen(needle_start), needle_end);
-				if (match)
-					match_end = match + strlen(needle_end);
-				else
-					match_start = NULL;
-			}
-			else
-				match_end = match_start + strlen(needle_start);
+			int			save_errno = errno;
+
+			free_call_mapping_table(table);
+			fclose(file);
+			errno = save_errno;
+			return -1;
 		}
 
-		if (match_start)
+		/* Skip blank/comments. */
+		if (pattern == NULL)
+			continue;
+
+		/*
+		 * Grow by doubling on demand.  Allow an extra entry because we want a
+		 * null terminator.
+		 */
+		if (table_size + 1 == table_capacity)
 		{
-			/* Found a match. Replace the matched string. */
-			int			matchpos = match_start - locale;
-			int			replacementlen = strlen(replacement);
-			char	   *rest = match_end;
-			int			restlen = strlen(rest);
-
-			/* check that the result fits in the static buffer */
-			if (matchpos + replacementlen + restlen + 1 > MAX_LOCALE_NAME_LEN)
-				return NULL;
-
-			memcpy(&aliasbuf[0], &locale[0], matchpos);
-			memcpy(&aliasbuf[matchpos], replacement, replacementlen);
-			/* includes null terminator */
-			memcpy(&aliasbuf[matchpos + replacementlen], rest, restlen + 1);
-
-			return aliasbuf;
+			Win32LocaleTableEntry *new_table;
+
+			new_table = malloc(sizeof(*new_table) * table_capacity * 2);
+			if (new_table == NULL)
+			{
+				free_call_mapping_table(table);
+				fclose(file);
+				errno = ENOMEM;
+				return -1;
+			}
+			memcpy(new_table, table, sizeof(*table) * table_size);
+			free(table);
+			table = new_table;
+			table_capacity *= 2;
 		}
+
+		/* Fill in new entry. */
+		table[table_size].pattern = pattern;
+		table[table_size].replacement = replacement;
+		table_size++;
+
+		table[table_size].pattern = NULL;
 	}
 
-	/* no match, just return the original string */
-	return locale;
+	fclose(file);
+
+	/* Mapping table established for this process. */
+	call_mapping_table = table;
+
+	return 0;
 }
 
+/*
+ * Checks if n bytes of pattern and name match.  '?' is treated as a wildcard
+ * in the pattern, but all other bytes must be identical to match.
+ */
+static bool
+subpattern_matches(const char *pattern, const char *name, size_t n)
+{
+	while (n > 0)
+	{
+		/* Have we hit the end of the pattern or name? */
+		if (*pattern == '\0')
+			return *name == '\0';
+		else if (*name == '\0')
+			return false;
+
+		/* Otherwise matches wildcard or exact character. */
+		if (*pattern != '?' && *pattern != *name)
+			return false;
+
+		/* Next. */
+		n--;
+		pattern++;
+		name++;
+	}
+	return true;
+}
+
+/*
+ * Checks if a name matches a pattern, with an extremely simple pattern logic.
+ * The pattern may contain any number of '?' characters to match any character,
+ * and zero or one '*' characters to match any sequence of characters.
+ */
+static bool
+pattern_matches(const char *pattern, const char *name)
+{
+	const char *star;
+
+	if ((star = strchr(pattern, '*')))
+	{
+		size_t		len_pattern_before_star;
+		size_t		len_pattern_after_star;
+		size_t		len_name;
+
+		/* Does the name match the part before the star? */
+		len_pattern_before_star = star - pattern;
+		if (!subpattern_matches(pattern, name, len_pattern_before_star))
+			return false;
+
+		/* Step over the star in the pattern. */
+		pattern += len_pattern_before_star;
+		pattern++;
+		len_pattern_after_star = strlen(pattern);
+
+		/* Step over the star in the name. */
+		name += len_pattern_before_star;
+		len_name = strlen(name);
+		if (len_name < len_pattern_after_star)
+			return false;
+		name += len_name - len_pattern_after_star;
+	}
+
+	return subpattern_matches(pattern, name, SIZE_MAX);
+}
+
+/*
+ * Convert a locale name using the given table, if it contains a matching
+ * pattern.
+ */
+static const char *
+map_locale(const Win32LocaleTableEntry *table, const char *name)
+{
+	while (table->pattern)
+	{
+		if (pattern_matches(table->pattern, name))
+			return table->replacement;
+		table++;
+	}
+	return name;
+}
+
+/*
+ * This implementation sets errno and in some cases writes messages to stderr
+ * for catastrophic internal failures, though the POSIX function defines no
+ * errors so callers don't actually check errno.
+ */
 char *
 pgwin32_setlocale(int category, const char *locale)
 {
 	const char *argument;
 	char	   *result;
 
+	if (!call_mapping_table)
+	{
+		if (initialize_call_mapping_table() < 0)
+			return NULL;
+	}
+
+	/*
+	 * XXX Call value transformation is relevant as long as we think there are
+	 * existing systems that were initdb'd with unstable and non-ASCII locale
+	 * names.
+	 */
 	if (locale == NULL)
 		argument = NULL;
 	else
-		argument = map_locale(locale_map_argument, locale);
+		argument = map_locale(call_mapping_table, locale);
 
 	/* Call the real setlocale() function */
 	result = setlocale(category, argument);
@@ -185,9 +419,13 @@ pgwin32_setlocale(int category, const char *locale)
 	/*
 	 * setlocale() is specified to return a "char *" that the caller is
 	 * forbidden to modify, so casting away the "const" is innocuous.
+	 *
+	 * XXX Return value transformation is only relevant as long as we continue
+	 * to use setlocale("") as a way to query the default locale names, which
+	 * is the source of the unstable and non-ASCII locale names.
 	 */
 	if (result)
-		result = unconstify(char *, map_locale(locale_map_result, result));
+		result = unconstify(char *, map_locale(return_mapping_table, result));
 
 	return result;
 }
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 61ad417cde..2c8626dec4 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3131,6 +3131,7 @@ WalUsage
 WalWriteMethod
 WalWriteMethodOps
 Walfile
+Win32LocaleTableEntry
 WindowAgg
 WindowAggPath
 WindowAggState
-- 
2.39.2

