From 9a9026c29f3e9cd3c1b7fd92e053bcb5ecc5f6ae Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 29 Oct 2025 15:14:13 +1300
Subject: [PATCH 2/8] Formalize pg_wchar encoding schemes.

Create a bit more clarity about the different ways that pg_wchar can be
encoded, by naming the three schemes in use.  This also allows a
dispatch-table format in pg_locale_libc.c.

Discussion: https://www.postgresql.org/message-id/flat/CA%2BhUKG%2BhDkp1etcfy%3DtaxJ8ybf8KapyOjqdBRPF7yaoSoSj1_w%40mail.gmail.com
---
 src/backend/utils/adt/pg_locale_libc.c | 163 +++++++++++++------------
 src/common/wchar.c                     |  94 +++++++-------
 src/include/mb/pg_wchar.h              |  51 ++++++++
 src/tools/pgindent/typedefs.list       |   1 +
 4 files changed, 187 insertions(+), 122 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 761ed1a0603..1892ed3c5ce 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -43,20 +43,25 @@
  * the <ctype.h> functions since those will obey LC_CTYPE.  Note that these
  * collations don't give a fig about multibyte characters.
  *
- * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
+ * 2. PG_WCHAR_UTF32 encoding scheme:
+ *
+ * When working in UTF8 encoding, we use the <wctype.h> functions.
  * This assumes that every platform uses Unicode codepoints directly
  * as the wchar_t representation of Unicode.  On some platforms
  * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
  *
- * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
+ * 3. PG_WCHAR_CHAR and PG_WCHAR_CUSTOM encoding schemes:
+ *
+ * In all other encodings, we use the <ctype.h> functions for pg_wchar
  * values up to 255, and punt for values above that.  This is 100% correct
- * only in single-byte encodings such as LATINn.  However, non-Unicode
- * multibyte encodings are mostly Far Eastern character sets for which the
- * properties being tested here aren't very relevant for higher code values
- * anyway.  The difficulty with using the <wctype.h> functions with
- * non-Unicode multibyte encodings is that we can have no certainty that
- * the platform's wchar_t representation matches what we do in pg_wchar
- * conversions.
+ * only in single-byte encodings such as LATINn (PG_WCHAR_CHAR).  However,
+ * non-Unicode multibyte encodings (PG_WCHAR_CUSTOM) are all Far Eastern
+ * character sets for which the properties being tested here aren't very
+ * relevant for higher code values anyway.  The difficulty with using the
+ * <wctype.h> functions with non-Unicode multibyte encodings is that we can
+ * have no certainty that the platform's wchar_t representation matches what we
+ * do in pg_wchar conversions.  (MULE is also declared PG_WCHAR_CUSTOM but is
+ * not available as a multi-byte encoding in any known libc.)
  *
  * As a special case, in the "default" collation, (2) and (3) force ASCII
  * letters to follow ASCII upcase/downcase rules, while in a non-default
@@ -331,70 +336,75 @@ tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
 		return wc;
 }
 
-static const struct ctype_methods ctype_methods_libc_sb = {
-	.strlower = strlower_libc_sb,
-	.strtitle = strtitle_libc_sb,
-	.strupper = strupper_libc_sb,
-	.wc_isdigit = wc_isdigit_libc_sb,
-	.wc_isalpha = wc_isalpha_libc_sb,
-	.wc_isalnum = wc_isalnum_libc_sb,
-	.wc_isupper = wc_isupper_libc_sb,
-	.wc_islower = wc_islower_libc_sb,
-	.wc_isgraph = wc_isgraph_libc_sb,
-	.wc_isprint = wc_isprint_libc_sb,
-	.wc_ispunct = wc_ispunct_libc_sb,
-	.wc_isspace = wc_isspace_libc_sb,
-	.wc_isxdigit = wc_isxdigit_libc_sb,
-	.char_is_cased = char_is_cased_libc,
-	.char_tolower = char_tolower_libc,
-	.wc_toupper = toupper_libc_sb,
-	.wc_tolower = tolower_libc_sb,
-	.max_chr = UCHAR_MAX,
-};
-
-/*
- * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
- * single-byte semantics for pattern matching.
- */
-static const struct ctype_methods ctype_methods_libc_other_mb = {
-	.strlower = strlower_libc_mb,
-	.strtitle = strtitle_libc_mb,
-	.strupper = strupper_libc_mb,
-	.wc_isdigit = wc_isdigit_libc_sb,
-	.wc_isalpha = wc_isalpha_libc_sb,
-	.wc_isalnum = wc_isalnum_libc_sb,
-	.wc_isupper = wc_isupper_libc_sb,
-	.wc_islower = wc_islower_libc_sb,
-	.wc_isgraph = wc_isgraph_libc_sb,
-	.wc_isprint = wc_isprint_libc_sb,
-	.wc_ispunct = wc_ispunct_libc_sb,
-	.wc_isspace = wc_isspace_libc_sb,
-	.wc_isxdigit = wc_isxdigit_libc_sb,
-	.char_is_cased = char_is_cased_libc,
-	.char_tolower = char_tolower_libc,
-	.wc_toupper = toupper_libc_sb,
-	.wc_tolower = tolower_libc_sb,
-	.max_chr = UCHAR_MAX,
-};
+static const struct ctype_methods ctype_methods_libc[] = {
+	[PG_WCHAR_CHAR] = {
+		.strlower = strlower_libc_sb,
+		.strtitle = strtitle_libc_sb,
+		.strupper = strupper_libc_sb,
+		.wc_isdigit = wc_isdigit_libc_sb,
+		.wc_isalpha = wc_isalpha_libc_sb,
+		.wc_isalnum = wc_isalnum_libc_sb,
+		.wc_isupper = wc_isupper_libc_sb,
+		.wc_islower = wc_islower_libc_sb,
+		.wc_isgraph = wc_isgraph_libc_sb,
+		.wc_isprint = wc_isprint_libc_sb,
+		.wc_ispunct = wc_ispunct_libc_sb,
+		.wc_isspace = wc_isspace_libc_sb,
+		.wc_isxdigit = wc_isxdigit_libc_sb,
+		.char_is_cased = char_is_cased_libc,
+		.char_tolower = char_tolower_libc,
+		.wc_toupper = toupper_libc_sb,
+		.wc_tolower = tolower_libc_sb,
+		.max_chr = UCHAR_MAX,
+	},
+	[PG_WCHAR_UTF32] = {
+		.strlower = strlower_libc_mb,
+		.strtitle = strtitle_libc_mb,
+		.strupper = strupper_libc_mb,
+		.wc_isdigit = wc_isdigit_libc_mb,
+		.wc_isalpha = wc_isalpha_libc_mb,
+		.wc_isalnum = wc_isalnum_libc_mb,
+		.wc_isupper = wc_isupper_libc_mb,
+		.wc_islower = wc_islower_libc_mb,
+		.wc_isgraph = wc_isgraph_libc_mb,
+		.wc_isprint = wc_isprint_libc_mb,
+		.wc_ispunct = wc_ispunct_libc_mb,
+		.wc_isspace = wc_isspace_libc_mb,
+		.wc_isxdigit = wc_isxdigit_libc_mb,
+		.char_is_cased = char_is_cased_libc,
+		.char_tolower = char_tolower_libc,
+		.wc_toupper = toupper_libc_mb,
+		.wc_tolower = tolower_libc_mb,
+	},
 
-static const struct ctype_methods ctype_methods_libc_utf8 = {
-	.strlower = strlower_libc_mb,
-	.strtitle = strtitle_libc_mb,
-	.strupper = strupper_libc_mb,
-	.wc_isdigit = wc_isdigit_libc_mb,
-	.wc_isalpha = wc_isalpha_libc_mb,
-	.wc_isalnum = wc_isalnum_libc_mb,
-	.wc_isupper = wc_isupper_libc_mb,
-	.wc_islower = wc_islower_libc_mb,
-	.wc_isgraph = wc_isgraph_libc_mb,
-	.wc_isprint = wc_isprint_libc_mb,
-	.wc_ispunct = wc_ispunct_libc_mb,
-	.wc_isspace = wc_isspace_libc_mb,
-	.wc_isxdigit = wc_isxdigit_libc_mb,
-	.char_is_cased = char_is_cased_libc,
-	.char_tolower = char_tolower_libc,
-	.wc_toupper = toupper_libc_mb,
-	.wc_tolower = tolower_libc_mb,
+	/*
+	 * Custom pg_wchar formats converted from non-UTF8 multibyte encodings use
+	 * multibyte semantics for case mapping, but single-byte semantics for
+	 * pattern matching.
+	 *
+	 * XXX Therefore this gives incorrect results for pattern matching outside
+	 * the ASCII range.  Could be fixed.
+	 */
+	[PG_WCHAR_CUSTOM] = {
+		.strlower = strlower_libc_mb,
+		.strtitle = strtitle_libc_mb,
+		.strupper = strupper_libc_mb,
+		.wc_isdigit = wc_isdigit_libc_sb,
+		.wc_isalpha = wc_isalpha_libc_sb,
+		.wc_isalnum = wc_isalnum_libc_sb,
+		.wc_isupper = wc_isupper_libc_sb,
+		.wc_islower = wc_islower_libc_sb,
+		.wc_isgraph = wc_isgraph_libc_sb,
+		.wc_isprint = wc_isprint_libc_sb,
+		.wc_ispunct = wc_ispunct_libc_sb,
+		.wc_isspace = wc_isspace_libc_sb,
+		.wc_isxdigit = wc_isxdigit_libc_sb,
+		.char_is_cased = char_is_cased_libc,
+		.char_tolower = char_tolower_libc,
+		.wc_toupper = toupper_libc_sb,
+		.wc_tolower = tolower_libc_sb,
+		.max_chr = UCHAR_MAX,
+	},
 };
 
 static const struct collate_methods collate_methods_libc = {
@@ -763,14 +773,7 @@ create_pg_locale_libc(Oid collid, MemoryContext context)
 			result->collate = &collate_methods_libc;
 	}
 	if (!result->ctype_is_c)
-	{
-		if (GetDatabaseEncoding() == PG_UTF8)
-			result->ctype = &ctype_methods_libc_utf8;
-		else if (pg_database_encoding_max_length() > 1)
-			result->ctype = &ctype_methods_libc_other_mb;
-		else
-			result->ctype = &ctype_methods_libc_sb;
-	}
+		result->ctype = &ctype_methods_libc[pg_wchar_encoding_scheme(GetDatabaseEncoding())];
 
 	return result;
 }
diff --git a/src/common/wchar.c b/src/common/wchar.c
index a4bc29921de..f453587749a 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -2062,50 +2062,60 @@ pg_encoding_set_invalid(int encoding, char *dst)
  *-------------------------------------------------------------------
  */
 const pg_wchar_tbl pg_wchar_table[] = {
-	[PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
-	[PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
-	[PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
-	[PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
-	[PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
-	[PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
-	[PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
-	[PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
-	[PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
-	[PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
-	[PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
-	[PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
-	[PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
-	[PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
-	[PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
-	[PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
+	[PG_SQL_ASCII] = {PG_WCHAR_CHAR, pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
+	[PG_EUC_JP] = {PG_WCHAR_CUSTOM, pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
+	[PG_EUC_CN] = {PG_WCHAR_CUSTOM, pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
+	[PG_EUC_KR] = {PG_WCHAR_CUSTOM, pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
+	[PG_EUC_TW] = {PG_WCHAR_CUSTOM, pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
+	[PG_EUC_JIS_2004] = {PG_WCHAR_CUSTOM, pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
+	[PG_UTF8] = {PG_WCHAR_UTF32, pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
+	[PG_MULE_INTERNAL] = {PG_WCHAR_CUSTOM, pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
+	[PG_LATIN1] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_LATIN2] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_LATIN3] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_LATIN4] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_LATIN5] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_LATIN6] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_LATIN7] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_LATIN8] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_LATIN9] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_LATIN10] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_WIN1256] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_WIN1258] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_WIN866] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_WIN874] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_KOI8R] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_WIN1251] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_WIN1252] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_ISO_8859_5] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_ISO_8859_6] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_ISO_8859_7] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_ISO_8859_8] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_WIN1250] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_WIN1253] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_WIN1254] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_WIN1255] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_WIN1257] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_KOI8U] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+	[PG_SJIS] = {PG_WCHAR_NONE, 0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
+	[PG_BIG5] = {PG_WCHAR_NONE, 0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
+	[PG_GBK] = {PG_WCHAR_NONE, 0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
+	[PG_UHC] = {PG_WCHAR_NONE, 0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
+	[PG_GB18030] = {PG_WCHAR_NONE, 0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
+	[PG_JOHAB] = {PG_WCHAR_NONE, 0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
+	[PG_SHIFT_JIS_2004] = {PG_WCHAR_NONE, 0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
 };
 
+/*
+ * Returns the encoding scheme used for pg_wchar values converted from the
+ * given encoding.
+ */
+PgWcharEncodingScheme
+pg_wchar_encoding_scheme(int encoding)
+{
+	return pg_wchar_table[encoding].encoding_scheme;
+}
+
 /*
  * Returns the byte length of a multibyte character.
  *
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 4b4a9974b75..5db00cebcef 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -27,6 +27,55 @@
  */
 typedef unsigned int pg_wchar;
 
+/*
+ * Encoding schemes that pg_wchar might hold.
+ *
+ * Each encoding with pg_wchar support, single- or multi-byte, has a wide
+ * encoding scheme, conceptually like wchar_t in C.  Convert to and from char
+ * with the pg_mb2wchar*() and pg_wchar2mb*() functions.  In all encoding
+ * schemes, values 0-127 represent ASCII.  For higher values, see below.
+ *
+ * Locale providers make use of the known properties of these encoding schemes
+ * to implement ctype/wctype functionality.
+ */
+typedef enum PgWcharEncodingScheme
+{
+	/*
+	 * 8-bit characters in the database encoding, zero-extended to pg_wchar
+	 * width.
+	 */
+	PG_WCHAR_CHAR,
+
+	/*
+	 * 32-bit Unicode code points.  PostgreSQL assumes that, for locales using
+	 * UTF-8 encoding for char strings, each libc represents wchar_t as UTF-32
+	 * (or at least UTF-16 if wchar_t is narrow), so it has a special case for
+	 * this.
+	 */
+	PG_WCHAR_UTF32,
+
+	/*
+	 * For multi-byte database encodings other than UTF-8, the encoding is
+	 * unspecified outside the ASCII range.
+	 */
+	PG_WCHAR_CUSTOM,
+
+	/*
+	 * This scheme is not currently used by any of the supported encodings,
+	 * but is included here for completeness, providing terminology.  In a few
+	 * places, pg_wchar is used to transport wchar_t in whatever unknown
+	 * encoding libc uses for the database encoding.  This is second from last
+	 * so that lookup arrays don't have to waste an entry.
+	 */
+	PG_WCHAR_SYSTEM_WCHAR_T,
+
+	/*
+	 * pg_wchar conversion is not available for the database encoding.  This
+	 * is last so that lookup arrays don't have to waste an entry.
+	 */
+	PG_WCHAR_NONE,
+} PgWcharEncodingScheme;
+
 /*
  * Maximum byte length of multibyte characters in any backend encoding
  */
@@ -391,6 +440,7 @@ typedef int (*mbstr_verifier) (const unsigned char *mbstr, int len);
 
 typedef struct
 {
+	PgWcharEncodingScheme encoding_scheme;	/* pg_wchar representation */
 	mb2wchar_with_len_converter mb2wchar_with_len;	/* convert a multibyte
 													 * string to a wchar */
 	wchar2mb_with_len_converter wchar2mb_with_len;	/* convert a wchar string
@@ -713,6 +763,7 @@ extern int	SetClientEncoding(int encoding);
 extern void InitializeClientEncoding(void);
 extern int	pg_get_client_encoding(void);
 extern const char *pg_get_client_encoding_name(void);
+extern PgWcharEncodingScheme pg_wchar_encoding_scheme(int encoding);
 
 extern void SetDatabaseEncoding(int encoding);
 extern int	GetDatabaseEncoding(void);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index ac2da4c98cf..d6973751f12 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2264,6 +2264,7 @@ PgStat_WalCounters
 PgStat_WalStats
 PgXmlErrorContext
 PgXmlStrictness
+PgWcharEncodingScheme
 Pg_abi_values
 Pg_finfo_record
 Pg_magic_struct
-- 
2.50.1 (Apple Git-155)