From 94fc0d0c2f2e7428f111fb952dda635b99c84da3 Mon Sep 17 00:00:00 2001
From: Henson Choi <assam258@gmail.com>
Date: Wed, 15 Apr 2026 08:46:56 +0900
Subject: [PATCH] Fix JOHAB encoding validation to match KS X 1001 Annex 3.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since the encoding was added in 2002, pg_johab_mblen() and
pg_johab_verifychar() have borrowed their byte-length and trail-byte
rules from EUC-KR via pg_euc_mblen() and IS_EUC_RANGE_VALID(), which
demand trail bytes in 0xA1-0xFE.  JOHAB does not follow that rule: per
KS X 1001:2004 Annex 3 Table 1, trail bytes may fall anywhere in
0x41-0x7E or 0x81-0xFE for Hangul syllables (0x31-0x7E or 0x91-0xFE
for the other three categories), including the ASCII graphic range
and in particular 0x5C, the backslash.  As a result, most of the
mappings shipped in johab_to_utf8.map were unreachable: the verifier
rejected the byte sequences before they could be converted.  The
first multi-byte character in the source JOHAB.TXT (\x84\x44) and the
originally reported sequence \x8A\x5C = "굎" were both affected.

Rewrite pg_johab_mblen() and pg_johab_verifychar() to classify the
leading byte into the four categories defined by Annex 3 Table 1 and
accept only the trail-byte ranges specified for each category.  The
encoding is strictly two bytes wide for any non-ASCII character, so
also correct pg_wchar_table[PG_JOHAB].maxmblen from 3 to 2 and the
corresponding column in charset.sgml.  A new regression test covers
the original bug sequences, representative boundary cases for the lead
and trail ranges, and the invalid-byte gaps.

The mapping tables themselves were already correct and are unchanged,
so this fix is backward-compatible: sequences that decoded before
continue to decode identically, and the sequences that were
erroneously rejected now succeed.

Bug: #19354
Reported-by: Jeroen Vermeulen <jtvjtv@gmail.com>
Discussion: https://postgr.es/m/19354-eefe6d8b3e84f9f2@postgresql.org
---
 doc/src/sgml/charset.sgml             |  2 +-
 src/common/wchar.c                    | 69 ++++++++++++++++-----
 src/test/regress/expected/johab.out   | 87 +++++++++++++++++++++++++++
 src/test/regress/expected/johab_1.out |  9 +++
 src/test/regress/parallel_schedule    |  2 +-
 src/test/regress/sql/johab.sql        | 58 ++++++++++++++++++
 6 files changed, 209 insertions(+), 18 deletions(-)
 create mode 100644 src/test/regress/expected/johab.out
 create mode 100644 src/test/regress/expected/johab_1.out
 create mode 100644 src/test/regress/sql/johab.sql
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
index 746e40bb9d2..8ff7f7ed03d 100644
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -1934,7 +1934,7 @@ ORDER BY c COLLATE ebcdic;
          <entry>Korean (Hangul)</entry>
          <entry>No</entry>
          <entry>No</entry>
-         <entry>1&ndash;3</entry>
+         <entry>1&ndash;2</entry>
          <entry></entry>
         </row>
         <row>
diff --git a/src/common/wchar.c b/src/common/wchar.c
index a44ee73accf..f493e4d9a99 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -438,18 +438,45 @@ pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
 
 
 /*
- * JOHAB
+ * JOHAB (KS X 1001:2004 Annex 3, a.k.a. the 2-byte combinational code)
+ *
+ * Byte ranges per Annex 3 Table 1:
+ *
+ *   Category              Lead byte    Trail byte
+ *   --------------------  -----------  ---------------------
+ *   Hangul syllables      0x84 - 0xD3  0x41 - 0x7E, 0x81 - 0xFE
+ *   User-defined area A   0xD8         0x31 - 0x7E, 0x91 - 0xFE
+ *   Other characters      0xD9 - 0xDE  0x31 - 0x7E, 0x91 - 0xFE
+ *   Hanja                 0xE0 - 0xF9  0x31 - 0x7E, 0x91 - 0xFE
+ *
+ * ASCII (< 0x80) is single-byte.  Lead bytes in the gaps between the ranges
+ * above (0x80-0x83, 0xD4-0xD7, 0xDF, 0xFA-0xFF) are invalid.  Likewise,
+ * trail bytes that fall outside their allowed union are invalid: for Hangul
+ * this excludes 0x00-0x40, 0x7F-0x80, and 0xFF; for the other categories
+ * this excludes 0x00-0x30, 0x7F-0x90, and 0xFF.
+ *
+ * Note that unlike EUC-KR, trail bytes may fall within the ASCII graphic
+ * range (including 0x5C backslash), so callers scanning JOHAB text must
+ * not assume that a byte below 0x80 is a stand-alone ASCII character.
  */
+#define IS_JOHAB_LEAD_HANGUL(c)	((c) >= 0x84 && (c) <= 0xD3)
+#define IS_JOHAB_LEAD_OTHER(c)	\
+	(((c) >= 0xD8 && (c) <= 0xDE) || ((c) >= 0xE0 && (c) <= 0xF9))
+
 static int
 pg_johab_mblen(const unsigned char *s)
 {
-	return pg_euc_mblen(s);
+	if (IS_JOHAB_LEAD_HANGUL(*s) || IS_JOHAB_LEAD_OTHER(*s))
+		return 2;
+	return 1;
 }
 
 static int
 pg_johab_dsplen(const unsigned char *s)
 {
-	return pg_euc_dsplen(s);
+	if (IS_HIGHBIT_SET(*s))
+		return 2;
+	return pg_ascii_dsplen(s);
 }
 
 /*
@@ -1156,25 +1183,35 @@ pg_euctw_verifystr(const unsigned char *s, int len)
 static int
 pg_johab_verifychar(const unsigned char *s, int len)
 {
-	int			l,
-				mbl;
-	unsigned char c;
+	unsigned char b1,
+				b2;
 
-	l = mbl = pg_johab_mblen(s);
+	if (!IS_HIGHBIT_SET(*s))
+		return 1;
 
-	if (len < l)
+	if (len < 2)
 		return -1;
 
-	if (!IS_HIGHBIT_SET(*s))
-		return mbl;
+	b1 = s[0];
+	b2 = s[1];
 
-	while (--l > 0)
+	/*
+	 * Per KS X 1001:2004 Annex 3 Table 1, trailing byte ranges depend on the
+	 * leading byte's category.
+	 */
+	if (IS_JOHAB_LEAD_HANGUL(b1))
 	{
-		c = *++s;
-		if (!IS_EUC_RANGE_VALID(c))
-			return -1;
+		/* Hangul syllables: 0x41-0x7E or 0x81-0xFE */
+		if ((b2 >= 0x41 && b2 <= 0x7E) || (b2 >= 0x81 && b2 <= 0xFE))
+			return 2;
 	}
-	return mbl;
+	else if (IS_JOHAB_LEAD_OTHER(b1))
+	{
+		/* User-defined, other characters, Hanja: 0x31-0x7E or 0x91-0xFE */
+		if ((b2 >= 0x31 && b2 <= 0x7E) || (b2 >= 0x91 && b2 <= 0xFE))
+			return 2;
+	}
+	return -1;
 }
 
 static int
@@ -1901,7 +1938,7 @@ const pg_wchar_tbl pg_wchar_table[] = {
 	[PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
 	[PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
 	[PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
-	[PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
+	[PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 2},
 	[PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
 };
 
diff --git a/src/test/regress/expected/johab.out b/src/test/regress/expected/johab.out
new file mode 100644
index 00000000000..d2eafdf73e4
--- /dev/null
+++ b/src/test/regress/expected/johab.out
@@ -0,0 +1,87 @@
+-- This test exercises the JOHAB client encoding (KS X 1001:2004 Annex 3).
+-- JOHAB's valid byte ranges differ from EUC-KR: trail bytes may fall within
+-- the ASCII graphic range (0x41-0x7E for Hangul, 0x31-0x7E for the other
+-- categories), including 0x5C which is the ASCII backslash.  The test runs
+-- only in UTF8 databases, since some decoded characters have no equivalent
+-- in other server encodings.
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- Bug #19354 original report plus its neighbors: these three byte sequences
+-- are valid Hangul syllables per Annex 3 Table 1 (lead 0x8A is in 0x84-0xD3,
+-- trail 0x5B/0x5C/0x5D is in 0x41-0x7E) but were rejected by the prior
+-- EUC-KR-derived check that demanded trail bytes in 0xA1-0xFE.
+SELECT convert_from('\x8a5b'::bytea, 'johab') AS "0x8a5b",
+       convert_from('\x8a5c'::bytea, 'johab') AS "0x8a5c",
+       convert_from('\x8a5d'::bytea, 'johab') AS "0x8a5d";
+ 0x8a5b | 0x8a5c | 0x8a5d 
+--------+--------+--------
+ 굍     | 굎     | 굏
+(1 row)
+
+-- First multi-byte character in unicode.org's JOHAB.TXT, also rejected by
+-- the prior check (trail 0x44 in Hangul range 0x41-0x7E).
+SELECT convert_from('\x8444'::bytea, 'johab') AS "0x8444";
+ 0x8444 
+--------
+ ㄳ
+(1 row)
+
+-- Regression check for byte sequences that already decoded correctly under
+-- the old rules (trail byte already within the old-allowed 0xA1-0xFE).
+SELECT convert_from('\x89ef'::bytea, 'johab') AS "0x89ef",
+       convert_from('\x89a1'::bytea, 'johab') AS "0x89a1";
+ 0x89ef | 0x89a1 
+--------+--------
+ 괦     | 고
+(1 row)
+
+-- Hanja range (lead 0xE0-0xF9) with trail bytes in the old-rejected region
+-- 0x31-0xA0.  Per Annex 3 Table 1 the Hanja trail range is 0x31-0x7E and
+-- 0x91-0xFE.
+SELECT convert_from('\xe031'::bytea, 'johab') AS "0xe031",
+       convert_from('\xe07e'::bytea, 'johab') AS "0xe07e",
+       convert_from('\xe091'::bytea, 'johab') AS "0xe091";
+ 0xe031 | 0xe07e | 0xe091 
+--------+--------+--------
+ 伽     | 嵌     | 感
+(1 row)
+
+-- "Other characters" category (lead 0xD9-0xDE) with a low trail byte.
+SELECT convert_from('\xd931'::bytea, 'johab') AS "0xd931";
+ 0xd931 
+--------
+ 　
+(1 row)
+
+-- Invalid lead bytes: the gaps between the four lead-byte ranges defined by
+-- Annex 3 Table 1.
+SELECT convert_from('\x8041'::bytea, 'johab');
+ERROR:  invalid byte sequence for encoding "JOHAB": 0x80
+SELECT convert_from('\xd541'::bytea, 'johab');
+ERROR:  invalid byte sequence for encoding "JOHAB": 0xd5
+SELECT convert_from('\xdf41'::bytea, 'johab');
+ERROR:  invalid byte sequence for encoding "JOHAB": 0xdf
+SELECT convert_from('\xfa41'::bytea, 'johab');
+ERROR:  invalid byte sequence for encoding "JOHAB": 0xfa
+-- Invalid trail bytes: values inside the gaps within each trail-byte range.
+-- For Hangul the gaps are 0x00-0x40, 0x7F-0x80, and 0xFF.
+SELECT convert_from('\x8a40'::bytea, 'johab');
+ERROR:  invalid byte sequence for encoding "JOHAB": 0x8a 0x40
+SELECT convert_from('\x8a7f'::bytea, 'johab');
+ERROR:  invalid byte sequence for encoding "JOHAB": 0x8a 0x7f
+SELECT convert_from('\x8a80'::bytea, 'johab');
+ERROR:  invalid byte sequence for encoding "JOHAB": 0x8a 0x80
+-- For the other categories the gaps are 0x00-0x30, 0x7F-0x90, and 0xFF.
+SELECT convert_from('\xe030'::bytea, 'johab');
+ERROR:  invalid byte sequence for encoding "JOHAB": 0xe0 0x30
+SELECT convert_from('\xe07f'::bytea, 'johab');
+ERROR:  invalid byte sequence for encoding "JOHAB": 0xe0 0x7f
+SELECT convert_from('\xe090'::bytea, 'johab');
+ERROR:  invalid byte sequence for encoding "JOHAB": 0xe0 0x90
+SELECT convert_from('\xe0ff'::bytea, 'johab');
+ERROR:  invalid byte sequence for encoding "JOHAB": 0xe0 0xff
+-- Incomplete sequence: a valid lead byte with no trail byte is rejected.
+SELECT convert_from('\x8a'::bytea, 'johab');
+ERROR:  invalid byte sequence for encoding "JOHAB": 0x8a
diff --git a/src/test/regress/expected/johab_1.out b/src/test/regress/expected/johab_1.out
new file mode 100644
index 00000000000..89028ad81e0
--- /dev/null
+++ b/src/test/regress/expected/johab_1.out
@@ -0,0 +1,9 @@
+-- This test exercises the JOHAB client encoding (KS X 1001:2004 Annex 3).
+-- JOHAB's valid byte ranges differ from EUC-KR: trail bytes may fall within
+-- the ASCII graphic range (0x41-0x7E for Hangul, 0x31-0x7E for the other
+-- categories), including 0x5C which is the ASCII backslash.  The test runs
+-- only in UTF8 databases, since some decoded characters have no equivalent
+-- in other server encodings.
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index cc365393bb7..63f7419d255 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t
 # geometry depends on point, lseg, line, box, path, polygon, circle
 # horology depends on date, time, timetz, timestamp, timestamptz, interval
 # ----------
-test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8 encoding euc_kr
+test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8 encoding euc_kr johab
 
 # ----------
 # Load huge amounts of data
diff --git a/src/test/regress/sql/johab.sql b/src/test/regress/sql/johab.sql
new file mode 100644
index 00000000000..7a919f430a7
--- /dev/null
+++ b/src/test/regress/sql/johab.sql
@@ -0,0 +1,58 @@
+-- This test exercises the JOHAB client encoding (KS X 1001:2004 Annex 3).
+-- JOHAB's valid byte ranges differ from EUC-KR: trail bytes may fall within
+-- the ASCII graphic range (0x41-0x7E for Hangul, 0x31-0x7E for the other
+-- categories), including 0x5C which is the ASCII backslash.  The test runs
+-- only in UTF8 databases, since some decoded characters have no equivalent
+-- in other server encodings.
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- Bug #19354 original report plus its neighbors: these three byte sequences
+-- are valid Hangul syllables per Annex 3 Table 1 (lead 0x8A is in 0x84-0xD3,
+-- trail 0x5B/0x5C/0x5D is in 0x41-0x7E) but were rejected by the prior
+-- EUC-KR-derived check that demanded trail bytes in 0xA1-0xFE.
+SELECT convert_from('\x8a5b'::bytea, 'johab') AS "0x8a5b",
+       convert_from('\x8a5c'::bytea, 'johab') AS "0x8a5c",
+       convert_from('\x8a5d'::bytea, 'johab') AS "0x8a5d";
+
+-- First multi-byte character in unicode.org's JOHAB.TXT, also rejected by
+-- the prior check (trail 0x44 in Hangul range 0x41-0x7E).
+SELECT convert_from('\x8444'::bytea, 'johab') AS "0x8444";
+
+-- Regression check for byte sequences that already decoded correctly under
+-- the old rules (trail byte already within the old-allowed 0xA1-0xFE).
+SELECT convert_from('\x89ef'::bytea, 'johab') AS "0x89ef",
+       convert_from('\x89a1'::bytea, 'johab') AS "0x89a1";
+
+-- Hanja range (lead 0xE0-0xF9) with trail bytes in the old-rejected region
+-- 0x31-0xA0.  Per Annex 3 Table 1 the Hanja trail range is 0x31-0x7E and
+-- 0x91-0xFE.
+SELECT convert_from('\xe031'::bytea, 'johab') AS "0xe031",
+       convert_from('\xe07e'::bytea, 'johab') AS "0xe07e",
+       convert_from('\xe091'::bytea, 'johab') AS "0xe091";
+
+-- "Other characters" category (lead 0xD9-0xDE) with a low trail byte.
+SELECT convert_from('\xd931'::bytea, 'johab') AS "0xd931";
+
+-- Invalid lead bytes: the gaps between the four lead-byte ranges defined by
+-- Annex 3 Table 1.
+SELECT convert_from('\x8041'::bytea, 'johab');
+SELECT convert_from('\xd541'::bytea, 'johab');
+SELECT convert_from('\xdf41'::bytea, 'johab');
+SELECT convert_from('\xfa41'::bytea, 'johab');
+
+-- Invalid trail bytes: values inside the gaps within each trail-byte range.
+-- For Hangul the gaps are 0x00-0x40, 0x7F-0x80, and 0xFF.
+SELECT convert_from('\x8a40'::bytea, 'johab');
+SELECT convert_from('\x8a7f'::bytea, 'johab');
+SELECT convert_from('\x8a80'::bytea, 'johab');
+-- For the other categories the gaps are 0x00-0x30, 0x7F-0x90, and 0xFF.
+SELECT convert_from('\xe030'::bytea, 'johab');
+SELECT convert_from('\xe07f'::bytea, 'johab');
+SELECT convert_from('\xe090'::bytea, 'johab');
+SELECT convert_from('\xe0ff'::bytea, 'johab');
+
+-- Incomplete sequence: a valid lead byte with no trail byte is rejected.
+SELECT convert_from('\x8a'::bytea, 'johab');
-- 
2.50.1 (Apple Git-155)