From 9a9026c29f3e9cd3c1b7fd92e053bcb5ecc5f6ae Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Wed, 29 Oct 2025 15:14:13 +1300 Subject: [PATCH 2/8] Formalize pg_wchar encoding schemes. Create a bit more clarity about the different ways that pg_wchar can be encoded, by naming the three schemes in use. This also allows a dispatch-table format in pg_locale_libc.c. Discussion: https://www.postgresql.org/message-id/flat/CA%2BhUKG%2BhDkp1etcfy%3DtaxJ8ybf8KapyOjqdBRPF7yaoSoSj1_w%40mail.gmail.com --- src/backend/utils/adt/pg_locale_libc.c | 163 +++++++++++++------------ src/common/wchar.c | 94 +++++++------- src/include/mb/pg_wchar.h | 51 ++++++++ src/tools/pgindent/typedefs.list | 1 + 4 files changed, 187 insertions(+), 122 deletions(-) diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 761ed1a0603..1892ed3c5ce 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -43,20 +43,25 @@ * the <ctype.h> functions since those will obey LC_CTYPE. Note that these * collations don't give a fig about multibyte characters. * - * 2. When working in UTF8 encoding, we use the <wctype.h> functions. + * 2. PG_WCHAR_UTF32 encoding scheme: + * + * When working in UTF8 encoding, we use the <wctype.h> functions. * This assumes that every platform uses Unicode codepoints directly * as the wchar_t representation of Unicode. On some platforms * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. * - * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar + * 3. PG_WCHAR_CHAR and PG_WCHAR_CUSTOM encoding schemes: + * + * In all other encodings, we use the <ctype.h> functions for pg_wchar * values up to 255, and punt for values above that. This is 100% correct - * only in single-byte encodings such as LATINn. However, non-Unicode - * multibyte encodings are mostly Far Eastern character sets for which the - * properties being tested here aren't very relevant for higher code values - * anyway. 
The difficulty with using the <wctype.h> functions with - * non-Unicode multibyte encodings is that we can have no certainty that - * the platform's wchar_t representation matches what we do in pg_wchar - * conversions. + * only in single-byte encodings such as LATINn (PG_WCHAR_CHAR). However, + * non-Unicode multibyte encodings (PG_WCHAR_CUSTOM) are all Far Eastern + * character sets for which the properties being tested here aren't very + * relevant for higher code values anyway. The difficulty with using the + * <wctype.h> functions with non-Unicode multibyte encodings is that we can + * have no certainty that the platform's wchar_t representation matches what we + * do in pg_wchar conversions. (MULE is also declared PG_WCHAR_CUSTOM but is + * not available as a multi-byte encoding in any known libc.) * * As a special case, in the "default" collation, (2) and (3) force ASCII * letters to follow ASCII upcase/downcase rules, while in a non-default @@ -331,70 +336,75 @@ tolower_libc_mb(pg_wchar wc, pg_locale_t locale) return wc; } -static const struct ctype_methods ctype_methods_libc_sb = { - .strlower = strlower_libc_sb, - .strtitle = strtitle_libc_sb, - .strupper = strupper_libc_sb, - .wc_isdigit = wc_isdigit_libc_sb, - .wc_isalpha = wc_isalpha_libc_sb, - .wc_isalnum = wc_isalnum_libc_sb, - .wc_isupper = wc_isupper_libc_sb, - .wc_islower = wc_islower_libc_sb, - .wc_isgraph = wc_isgraph_libc_sb, - .wc_isprint = wc_isprint_libc_sb, - .wc_ispunct = wc_ispunct_libc_sb, - .wc_isspace = wc_isspace_libc_sb, - .wc_isxdigit = wc_isxdigit_libc_sb, - .char_is_cased = char_is_cased_libc, - .char_tolower = char_tolower_libc, - .wc_toupper = toupper_libc_sb, - .wc_tolower = tolower_libc_sb, - .max_chr = UCHAR_MAX, -}; - -/* - * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but - * single-byte semantics for pattern matching. 
- */ -static const struct ctype_methods ctype_methods_libc_other_mb = { - .strlower = strlower_libc_mb, - .strtitle = strtitle_libc_mb, - .strupper = strupper_libc_mb, - .wc_isdigit = wc_isdigit_libc_sb, - .wc_isalpha = wc_isalpha_libc_sb, - .wc_isalnum = wc_isalnum_libc_sb, - .wc_isupper = wc_isupper_libc_sb, - .wc_islower = wc_islower_libc_sb, - .wc_isgraph = wc_isgraph_libc_sb, - .wc_isprint = wc_isprint_libc_sb, - .wc_ispunct = wc_ispunct_libc_sb, - .wc_isspace = wc_isspace_libc_sb, - .wc_isxdigit = wc_isxdigit_libc_sb, - .char_is_cased = char_is_cased_libc, - .char_tolower = char_tolower_libc, - .wc_toupper = toupper_libc_sb, - .wc_tolower = tolower_libc_sb, - .max_chr = UCHAR_MAX, -}; +static const struct ctype_methods ctype_methods_libc[] = { + [PG_WCHAR_CHAR] = { + .strlower = strlower_libc_sb, + .strtitle = strtitle_libc_sb, + .strupper = strupper_libc_sb, + .wc_isdigit = wc_isdigit_libc_sb, + .wc_isalpha = wc_isalpha_libc_sb, + .wc_isalnum = wc_isalnum_libc_sb, + .wc_isupper = wc_isupper_libc_sb, + .wc_islower = wc_islower_libc_sb, + .wc_isgraph = wc_isgraph_libc_sb, + .wc_isprint = wc_isprint_libc_sb, + .wc_ispunct = wc_ispunct_libc_sb, + .wc_isspace = wc_isspace_libc_sb, + .wc_isxdigit = wc_isxdigit_libc_sb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_sb, + .wc_tolower = tolower_libc_sb, + .max_chr = UCHAR_MAX, + }, + [PG_WCHAR_UTF32] = { + .strlower = strlower_libc_mb, + .strtitle = strtitle_libc_mb, + .strupper = strupper_libc_mb, + .wc_isdigit = wc_isdigit_libc_mb, + .wc_isalpha = wc_isalpha_libc_mb, + .wc_isalnum = wc_isalnum_libc_mb, + .wc_isupper = wc_isupper_libc_mb, + .wc_islower = wc_islower_libc_mb, + .wc_isgraph = wc_isgraph_libc_mb, + .wc_isprint = wc_isprint_libc_mb, + .wc_ispunct = wc_ispunct_libc_mb, + .wc_isspace = wc_isspace_libc_mb, + .wc_isxdigit = wc_isxdigit_libc_mb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = 
toupper_libc_mb, + .wc_tolower = tolower_libc_mb, + }, -static const struct ctype_methods ctype_methods_libc_utf8 = { - .strlower = strlower_libc_mb, - .strtitle = strtitle_libc_mb, - .strupper = strupper_libc_mb, - .wc_isdigit = wc_isdigit_libc_mb, - .wc_isalpha = wc_isalpha_libc_mb, - .wc_isalnum = wc_isalnum_libc_mb, - .wc_isupper = wc_isupper_libc_mb, - .wc_islower = wc_islower_libc_mb, - .wc_isgraph = wc_isgraph_libc_mb, - .wc_isprint = wc_isprint_libc_mb, - .wc_ispunct = wc_ispunct_libc_mb, - .wc_isspace = wc_isspace_libc_mb, - .wc_isxdigit = wc_isxdigit_libc_mb, - .char_is_cased = char_is_cased_libc, - .char_tolower = char_tolower_libc, - .wc_toupper = toupper_libc_mb, - .wc_tolower = tolower_libc_mb, + /* + * Custom pg_wchar format converted from non-UTF8 multibyte encodings use + * multibyte semantics for case mapping, but single-byte semantics for + * pattern matching. + * + * XXX Therefore this gives incorrect results for pattern matching outside + * the ASCII range. Could be fixed. 
+ */ + [PG_WCHAR_CUSTOM] = { + .strlower = strlower_libc_mb, + .strtitle = strtitle_libc_mb, + .strupper = strupper_libc_mb, + .wc_isdigit = wc_isdigit_libc_sb, + .wc_isalpha = wc_isalpha_libc_sb, + .wc_isalnum = wc_isalnum_libc_sb, + .wc_isupper = wc_isupper_libc_sb, + .wc_islower = wc_islower_libc_sb, + .wc_isgraph = wc_isgraph_libc_sb, + .wc_isprint = wc_isprint_libc_sb, + .wc_ispunct = wc_ispunct_libc_sb, + .wc_isspace = wc_isspace_libc_sb, + .wc_isxdigit = wc_isxdigit_libc_sb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_sb, + .wc_tolower = tolower_libc_sb, + .max_chr = UCHAR_MAX, + }, }; static const struct collate_methods collate_methods_libc = { @@ -763,14 +773,7 @@ create_pg_locale_libc(Oid collid, MemoryContext context) result->collate = &collate_methods_libc; } if (!result->ctype_is_c) - { - if (GetDatabaseEncoding() == PG_UTF8) - result->ctype = &ctype_methods_libc_utf8; - else if (pg_database_encoding_max_length() > 1) - result->ctype = &ctype_methods_libc_other_mb; - else - result->ctype = &ctype_methods_libc_sb; - } + result->ctype = &ctype_methods_libc[pg_wchar_encoding_scheme(GetDatabaseEncoding())]; return result; } diff --git a/src/common/wchar.c b/src/common/wchar.c index a4bc29921de..f453587749a 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -2062,50 +2062,60 @@ pg_encoding_set_invalid(int encoding, char *dst) *------------------------------------------------------------------- */ const pg_wchar_tbl pg_wchar_table[] = { - [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1}, - [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3}, - [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2}, - [PG_EUC_KR] = 
{pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3}, - [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4}, - [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3}, - [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4}, - [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4}, - [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, 
pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, 
pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, - [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2}, - [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2}, - [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2}, - [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2}, - [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4}, - [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3}, - [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2}, + [PG_SQL_ASCII] = {PG_WCHAR_CHAR, pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1}, + [PG_EUC_JP] = {PG_WCHAR_CUSTOM, pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3}, + [PG_EUC_CN] = {PG_WCHAR_CUSTOM, pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2}, + 
[PG_EUC_KR] = {PG_WCHAR_CUSTOM, pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3}, + [PG_EUC_TW] = {PG_WCHAR_CUSTOM, pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4}, + [PG_EUC_JIS_2004] = {PG_WCHAR_CUSTOM, pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3}, + [PG_UTF8] = {PG_WCHAR_UTF32, pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4}, + [PG_MULE_INTERNAL] = {PG_WCHAR_CUSTOM, pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4}, + [PG_LATIN1] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_LATIN2] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_LATIN3] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_LATIN4] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_LATIN5] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_LATIN6] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_LATIN7] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_LATIN8] = {PG_WCHAR_CHAR, 
pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_LATIN9] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_LATIN10] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_WIN1256] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_WIN1258] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_WIN866] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_WIN874] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_KOI8R] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_WIN1251] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_WIN1252] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_ISO_8859_5] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_ISO_8859_6] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_ISO_8859_7] = 
{PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_ISO_8859_8] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_WIN1250] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_WIN1253] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_WIN1254] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_WIN1255] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_WIN1257] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_KOI8U] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, + [PG_SJIS] = {PG_WCHAR_NONE, 0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2}, + [PG_BIG5] = {PG_WCHAR_NONE, 0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2}, + [PG_GBK] = {PG_WCHAR_NONE, 0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2}, + [PG_UHC] = {PG_WCHAR_NONE, 0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2}, + [PG_GB18030] = {PG_WCHAR_NONE, 0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4}, + [PG_JOHAB] = {PG_WCHAR_NONE, 0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3}, + 
[PG_SHIFT_JIS_2004] = {PG_WCHAR_NONE, 0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2}, }; +/* + * Returns the encoding scheme of the pg_wchar values produced for the given + * encoding (which need not be the current database encoding). + */ +PgWcharEncodingScheme +pg_wchar_encoding_scheme(int encoding) +{ + return pg_wchar_table[encoding].encoding_scheme; +} + /* * Returns the byte length of a multibyte character. * diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 4b4a9974b75..5db00cebcef 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -27,6 +27,55 @@ */ typedef unsigned int pg_wchar; +/* + * Encoding schemes that pg_wchar might hold. + * + * Each multi-byte encoding has a corresponding wide encoding scheme, + * conceptually like wchar_t in C. Conversions to and from char should be + * performed by pg_mb2wchar*() and pg_wchar2mb*() functions. In all encoding + * schemes, values 0-127 represent ASCII. For higher values, see below. + * + * Locale providers make use of the known properties of these encoding schemes + * to implement ctype/wctype functionality. + */ +typedef enum PgWcharEncodingScheme +{ + /* + * 8-bit characters in the database encoding, zero-extended to pg_wchar + * width. + */ + PG_WCHAR_CHAR, + + /* + * 32-bit Unicode code points. PostgreSQL assumes that all libc + * implementations use UTF-32 or at least UTF-16 if wchar_t is narrow for + * locales that use UTF-8 encoding for char strings, so it has a special + * case for this. + */ + PG_WCHAR_UTF32, + + /* + * For multi-byte database encodings other than UTF-8, the encoding is + * unspecified outside the ASCII range. + */ + PG_WCHAR_CUSTOM, + + /* + * This scheme is not currently used by any of the supported encodings, + * but is included here for completeness, providing terminology. In a few + * places, pg_wchar is used to transport wchar_t in whatever unknown + * encoding libc uses for the database encoding. 
This is second from last + * so that lookup arrays don't have to waste an entry. + */ + PG_WCHAR_SYSTEM_WCHAR_T, + + /* + * pg_wchar conversion is not available for the database encoding. This + * is last so that lookup arrays don't have to waste an entry. + */ + PG_WCHAR_NONE, +} PgWcharEncodingScheme; + /* * Maximum byte length of multibyte characters in any backend encoding */ @@ -391,6 +440,7 @@ typedef int (*mbstr_verifier) (const unsigned char *mbstr, int len); typedef struct { + PgWcharEncodingScheme encoding_scheme; /* pg_wchar representation */ mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte * string to a wchar */ wchar2mb_with_len_converter wchar2mb_with_len; /* convert a wchar string @@ -713,6 +763,7 @@ extern int SetClientEncoding(int encoding); extern void InitializeClientEncoding(void); extern int pg_get_client_encoding(void); extern const char *pg_get_client_encoding_name(void); +extern PgWcharEncodingScheme pg_wchar_encoding_scheme(int encoding); extern void SetDatabaseEncoding(int encoding); extern int GetDatabaseEncoding(void); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index ac2da4c98cf..d6973751f12 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2264,6 +2264,7 @@ PgStat_WalCounters PgStat_WalStats PgXmlErrorContext PgXmlStrictness +PgWcharEncodingScheme Pg_abi_values Pg_finfo_record Pg_magic_struct -- 2.50.1 (Apple Git-155)