From 94fc0d0c2f2e7428f111fb952dda635b99c84da3 Mon Sep 17 00:00:00 2001 From: Henson Choi Date: Wed, 15 Apr 2026 08:46:56 +0900 Subject: [PATCH] Fix JOHAB encoding validation to match KS X 1001 Annex 3. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the encoding was added in 2002, pg_johab_mblen() and pg_johab_verifychar() have borrowed their byte-length and trail-byte rules from EUC-KR via pg_euc_mblen() and IS_EUC_RANGE_VALID(), which demand trail bytes in 0xA1-0xFE. JOHAB does not follow that rule: per KS X 1001:2004 Annex 3 Table 1, trail bytes may fall anywhere in 0x41-0x7E or 0x81-0xFE for Hangul syllables (0x31-0x7E or 0x91-0xFE for the other three categories), including the ASCII graphic range and in particular 0x5C, the backslash. As a result, most of the mappings shipped in johab_to_utf8.map were unreachable: the verifier rejected the byte sequences before they could be converted. The first multi-byte character in the source JOHAB.TXT (\x84\x44) and the originally reported sequence \x8A\x5C = "굎" were both affected. Rewrite pg_johab_mblen() and pg_johab_verifychar() to classify the leading byte into the four categories defined by Annex 3 Table 1 and accept only the trail-byte ranges specified for each category. pg_johab_dsplen() likewise stops delegating to pg_euc_dsplen() and now reports a display width of 2 for any byte with the high bit set. The encoding is strictly two bytes wide for any non-ASCII character, so also correct pg_wchar_table[PG_JOHAB].maxmblen from 3 to 2 and the corresponding column in charset.sgml. A new regression test covers the original bug sequences, boundary cases for each lead and trail range, and the invalid-byte gaps. The mapping tables themselves were already correct and are unchanged, so this fix is backward-compatible: sequences that decoded before continue to decode identically, and the sequences that were erroneously rejected now succeed. 
Bug: #19354 Reported-by: Jeroen Vermeulen Discussion: https://postgr.es/m/19354-eefe6d8b3e84f9f2@postgresql.org --- doc/src/sgml/charset.sgml | 2 +- src/common/wchar.c | 69 ++++++++++++++++----- src/test/regress/expected/johab.out | 87 +++++++++++++++++++++++++++ src/test/regress/expected/johab_1.out | 9 +++ src/test/regress/parallel_schedule | 2 +- src/test/regress/sql/johab.sql | 58 ++++++++++++++++++ 6 files changed, 209 insertions(+), 18 deletions(-) create mode 100644 src/test/regress/expected/johab.out create mode 100644 src/test/regress/expected/johab_1.out create mode 100644 src/test/regress/sql/johab.sql diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 746e40bb9d2..8ff7f7ed03d 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -1934,7 +1934,7 @@ ORDER BY c COLLATE ebcdic; Korean (Hangul) No No - 1–3 + 1–2 diff --git a/src/common/wchar.c b/src/common/wchar.c index a44ee73accf..f493e4d9a99 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -438,18 +438,45 @@ pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len) /* - * JOHAB + * JOHAB (KS X 1001:2004 Annex 3, a.k.a. the 2-byte combinational code) + * + * Byte ranges per Annex 3 Table 1: + * + * Category Lead byte Trail byte + * -------------------- ----------- --------------------- + * Hangul syllables 0x84 - 0xD3 0x41 - 0x7E, 0x81 - 0xFE + * User-defined area A 0xD8 0x31 - 0x7E, 0x91 - 0xFE + * Other characters 0xD9 - 0xDE 0x31 - 0x7E, 0x91 - 0xFE + * Hanja 0xE0 - 0xF9 0x31 - 0x7E, 0x91 - 0xFE + * + * ASCII (< 0x80) is single-byte. Lead bytes in the gaps between the ranges + * above (0x80-0x83, 0xD4-0xD7, 0xDF, 0xFA-0xFF) are invalid. Likewise, + * trail bytes that fall outside their allowed union are invalid: for Hangul + * this excludes 0x00-0x40, 0x7F-0x80, and 0xFF; for the other categories + * this excludes 0x00-0x30, 0x7F-0x90, and 0xFF. 
+ * + * Note that unlike EUC-KR, trail bytes may fall within the ASCII graphic + * range (including 0x5C backslash), so callers dealing with JOHAB text + * must not assume ASCII bytes are self-synchronizing. */ +#define IS_JOHAB_LEAD_HANGUL(c) ((c) >= 0x84 && (c) <= 0xD3) +#define IS_JOHAB_LEAD_OTHER(c) \ + (((c) >= 0xD8 && (c) <= 0xDE) || ((c) >= 0xE0 && (c) <= 0xF9)) + static int pg_johab_mblen(const unsigned char *s) { - return pg_euc_mblen(s); + if (IS_JOHAB_LEAD_HANGUL(*s) || IS_JOHAB_LEAD_OTHER(*s)) + return 2; + return 1; } static int pg_johab_dsplen(const unsigned char *s) { - return pg_euc_dsplen(s); + if (IS_HIGHBIT_SET(*s)) + return 2; + return pg_ascii_dsplen(s); } /* @@ -1156,25 +1183,35 @@ pg_euctw_verifystr(const unsigned char *s, int len) static int pg_johab_verifychar(const unsigned char *s, int len) { - int l, - mbl; - unsigned char c; + unsigned char b1, + b2; - l = mbl = pg_johab_mblen(s); + if (!IS_HIGHBIT_SET(*s)) + return 1; - if (len < l) + if (len < 2) return -1; - if (!IS_HIGHBIT_SET(*s)) - return mbl; + b1 = s[0]; + b2 = s[1]; - while (--l > 0) + /* + * Per KS X 1001:2004 Annex 3 Table 1, trailing byte ranges depend on the + * leading byte's category. 
+ */ + if (IS_JOHAB_LEAD_HANGUL(b1)) { - c = *++s; - if (!IS_EUC_RANGE_VALID(c)) - return -1; + /* Hangul syllables: 0x41-0x7E or 0x81-0xFE */ + if ((b2 >= 0x41 && b2 <= 0x7E) || (b2 >= 0x81 && b2 <= 0xFE)) + return 2; } - return mbl; + else if (IS_JOHAB_LEAD_OTHER(b1)) + { + /* User-defined, other characters, Hanja: 0x31-0x7E or 0x91-0xFE */ + if ((b2 >= 0x31 && b2 <= 0x7E) || (b2 >= 0x91 && b2 <= 0xFE)) + return 2; + } + return -1; } static int @@ -1901,7 +1938,7 @@ const pg_wchar_tbl pg_wchar_table[] = { [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2}, [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2}, [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4}, - [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3}, + [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 2}, [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2}, }; diff --git a/src/test/regress/expected/johab.out b/src/test/regress/expected/johab.out new file mode 100644 index 00000000000..d2eafdf73e4 --- /dev/null +++ b/src/test/regress/expected/johab.out @@ -0,0 +1,87 @@ +-- This test exercises the JOHAB client encoding (KS X 1001:2004 Annex 3). +-- JOHAB's valid byte ranges differ from EUC-KR: trail bytes may fall within +-- the ASCII graphic range (0x41-0x7E for Hangul, 0x31-0x7E for the other +-- categories), including 0x5C which is the ASCII backslash. The test runs +-- only in UTF8 databases, since some decoded characters have no equivalent +-- in other server encodings. 
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +-- Bug #19354 original report plus its neighbors: these three byte sequences +-- are valid Hangul syllables per Annex 3 Table 1 (lead 0x8A is in 0x84-0xD3, +-- trail 0x5B/0x5C/0x5D is in 0x41-0x7E) but were rejected by the prior +-- EUC-KR-derived check that demanded trail bytes in 0xA1-0xFE. +SELECT convert_from('\x8a5b'::bytea, 'johab') AS "0x8a5b", + convert_from('\x8a5c'::bytea, 'johab') AS "0x8a5c", + convert_from('\x8a5d'::bytea, 'johab') AS "0x8a5d"; + 0x8a5b | 0x8a5c | 0x8a5d +--------+--------+-------- + 굍 | 굎 | 굏 +(1 row) + +-- First multi-byte character in unicode.org's JOHAB.TXT, also rejected by +-- the prior check (trail 0x44 in Hangul range 0x41-0x7E). +SELECT convert_from('\x8444'::bytea, 'johab') AS "0x8444"; + 0x8444 +-------- + ㄳ +(1 row) + +-- Regression check for byte sequences that already decoded correctly under +-- the old rules (trail byte already within the old-allowed 0xA1-0xFE). +SELECT convert_from('\x89ef'::bytea, 'johab') AS "0x89ef", + convert_from('\x89a1'::bytea, 'johab') AS "0x89a1"; + 0x89ef | 0x89a1 +--------+-------- + 괦 | 고 +(1 row) + +-- Hanja range (lead 0xE0-0xF9) with trail bytes in the old-rejected region +-- 0x31-0xA0. Per Annex 3 Table 1 the Hanja trail range is 0x31-0x7E and +-- 0x91-0xFE. +SELECT convert_from('\xe031'::bytea, 'johab') AS "0xe031", + convert_from('\xe07e'::bytea, 'johab') AS "0xe07e", + convert_from('\xe091'::bytea, 'johab') AS "0xe091"; + 0xe031 | 0xe07e | 0xe091 +--------+--------+-------- + 伽 | 嵌 | 感 +(1 row) + +-- "Other characters" category (lead 0xD9-0xDE) with a low trail byte. +SELECT convert_from('\xd931'::bytea, 'johab') AS "0xd931"; + 0xd931 +-------- +   +(1 row) + +-- Invalid lead bytes: the gaps between the four lead-byte ranges defined by +-- Annex 3 Table 1. 
+SELECT convert_from('\x8041'::bytea, 'johab'); +ERROR: invalid byte sequence for encoding "JOHAB": 0x80 +SELECT convert_from('\xd541'::bytea, 'johab'); +ERROR: invalid byte sequence for encoding "JOHAB": 0xd5 +SELECT convert_from('\xdf41'::bytea, 'johab'); +ERROR: invalid byte sequence for encoding "JOHAB": 0xdf +SELECT convert_from('\xfa41'::bytea, 'johab'); +ERROR: invalid byte sequence for encoding "JOHAB": 0xfa +-- Invalid trail bytes: values inside the gaps within each trail-byte range. +-- For Hangul the gaps are 0x00-0x40, 0x7F-0x80, and 0xFF. +SELECT convert_from('\x8a40'::bytea, 'johab'); +ERROR: invalid byte sequence for encoding "JOHAB": 0x8a 0x40 +SELECT convert_from('\x8a7f'::bytea, 'johab'); +ERROR: invalid byte sequence for encoding "JOHAB": 0x8a 0x7f +SELECT convert_from('\x8a80'::bytea, 'johab'); +ERROR: invalid byte sequence for encoding "JOHAB": 0x8a 0x80 +-- For the other categories the gaps are 0x00-0x30, 0x7F-0x90, and 0xFF. +SELECT convert_from('\xe030'::bytea, 'johab'); +ERROR: invalid byte sequence for encoding "JOHAB": 0xe0 0x30 +SELECT convert_from('\xe07f'::bytea, 'johab'); +ERROR: invalid byte sequence for encoding "JOHAB": 0xe0 0x7f +SELECT convert_from('\xe090'::bytea, 'johab'); +ERROR: invalid byte sequence for encoding "JOHAB": 0xe0 0x90 +SELECT convert_from('\xe0ff'::bytea, 'johab'); +ERROR: invalid byte sequence for encoding "JOHAB": 0xe0 0xff +-- Incomplete sequence: a valid lead byte with no trail byte is rejected. +SELECT convert_from('\x8a'::bytea, 'johab'); +ERROR: invalid byte sequence for encoding "JOHAB": 0x8a diff --git a/src/test/regress/expected/johab_1.out b/src/test/regress/expected/johab_1.out new file mode 100644 index 00000000000..89028ad81e0 --- /dev/null +++ b/src/test/regress/expected/johab_1.out @@ -0,0 +1,9 @@ +-- This test exercises the JOHAB client encoding (KS X 1001:2004 Annex 3). 
+-- JOHAB's valid byte ranges differ from EUC-KR: trail bytes may fall within +-- the ASCII graphic range (0x41-0x7E for Hangul, 0x31-0x7E for the other +-- categories), including 0x5C which is the ASCII backslash. The test runs +-- only in UTF8 databases, since some decoded characters have no equivalent +-- in other server encodings. +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index cc365393bb7..63f7419d255 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t # geometry depends on point, lseg, line, box, path, polygon, circle # horology depends on date, time, timetz, timestamp, timestamptz, interval # ---------- -test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8 encoding euc_kr +test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8 encoding euc_kr johab # ---------- # Load huge amounts of data diff --git a/src/test/regress/sql/johab.sql b/src/test/regress/sql/johab.sql new file mode 100644 index 00000000000..7a919f430a7 --- /dev/null +++ b/src/test/regress/sql/johab.sql @@ -0,0 +1,58 @@ +-- This test exercises the JOHAB client encoding (KS X 1001:2004 Annex 3). +-- JOHAB's valid byte ranges differ from EUC-KR: trail bytes may fall within +-- the ASCII graphic range (0x41-0x7E for Hangul, 0x31-0x7E for the other +-- categories), including 0x5C which is the ASCII backslash. The test runs +-- only in UTF8 databases, since some decoded characters have no equivalent +-- in other server encodings. 
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +-- Bug #19354 original report plus its neighbors: these three byte sequences +-- are valid Hangul syllables per Annex 3 Table 1 (lead 0x8A is in 0x84-0xD3, +-- trail 0x5B/0x5C/0x5D is in 0x41-0x7E) but were rejected by the prior +-- EUC-KR-derived check that demanded trail bytes in 0xA1-0xFE. +SELECT convert_from('\x8a5b'::bytea, 'johab') AS "0x8a5b", + convert_from('\x8a5c'::bytea, 'johab') AS "0x8a5c", + convert_from('\x8a5d'::bytea, 'johab') AS "0x8a5d"; + +-- First multi-byte character in unicode.org's JOHAB.TXT, also rejected by +-- the prior check (trail 0x44 in Hangul range 0x41-0x7E). +SELECT convert_from('\x8444'::bytea, 'johab') AS "0x8444"; + +-- Regression check for byte sequences that already decoded correctly under +-- the old rules (trail byte already within the old-allowed 0xA1-0xFE). +SELECT convert_from('\x89ef'::bytea, 'johab') AS "0x89ef", + convert_from('\x89a1'::bytea, 'johab') AS "0x89a1"; + +-- Hanja range (lead 0xE0-0xF9) with trail bytes in the old-rejected region +-- 0x31-0xA0. Per Annex 3 Table 1 the Hanja trail range is 0x31-0x7E and +-- 0x91-0xFE. +SELECT convert_from('\xe031'::bytea, 'johab') AS "0xe031", + convert_from('\xe07e'::bytea, 'johab') AS "0xe07e", + convert_from('\xe091'::bytea, 'johab') AS "0xe091"; + +-- "Other characters" category (lead 0xD9-0xDE) with a low trail byte. +SELECT convert_from('\xd931'::bytea, 'johab') AS "0xd931"; + +-- Invalid lead bytes: the gaps between the four lead-byte ranges defined by +-- Annex 3 Table 1. +SELECT convert_from('\x8041'::bytea, 'johab'); +SELECT convert_from('\xd541'::bytea, 'johab'); +SELECT convert_from('\xdf41'::bytea, 'johab'); +SELECT convert_from('\xfa41'::bytea, 'johab'); + +-- Invalid trail bytes: values inside the gaps within each trail-byte range. +-- For Hangul the gaps are 0x00-0x40, 0x7F-0x80, and 0xFF. 
+SELECT convert_from('\x8a40'::bytea, 'johab'); +SELECT convert_from('\x8a7f'::bytea, 'johab'); +SELECT convert_from('\x8a80'::bytea, 'johab'); +-- For the other categories the gaps are 0x00-0x30, 0x7F-0x90, and 0xFF. +SELECT convert_from('\xe030'::bytea, 'johab'); +SELECT convert_from('\xe07f'::bytea, 'johab'); +SELECT convert_from('\xe090'::bytea, 'johab'); +SELECT convert_from('\xe0ff'::bytea, 'johab'); + +-- Incomplete sequence: a valid lead byte with no trail byte is rejected. +SELECT convert_from('\x8a'::bytea, 'johab'); -- 2.50.1 (Apple Git-155)