From dd5eb976fdefc447826b0310a782c2848c3f21a1 Mon Sep 17 00:00:00 2001
From: DoGeon Yoo
Date: Thu, 14 May 2026 15:44:19 +0900
Subject: [PATCH v1 1/2] Add regression test for UHC encoding (baseline capture)

UHC is a client-only encoding, so pg_uhc_verifychar() can only be
exercised indirectly through convert_from() in a UTF8 database. There
has been no dedicated regression test for it.

This commit adds src/test/regress/sql/uhc.sql covering:

- valid two-byte sequences at the CP949 lead/trail boundaries
  (trail 0x41, 0x5A, 0x61, 0x7A, 0x81, 0xFE; high leads 0xC7, 0xFD)
- invalid lead bytes (0x80, 0xFF)
- invalid trail bytes (0x40, 0x5B, 0x60, 0x7B, 0x80, 0xFF)
- the NONUTF8_INVALID sentinel pair (0x8d 0x20)
- a truncated two-byte character

The expected output records the *current* behavior on master, not the
desired behavior. In particular, the eight invalid-lead and
invalid-trail cases (0x80 0x41, 0xFF 0x41, 0x81 0x40, ...) are currently
accepted by pg_uhc_verifychar() and rejected only later by the
conversion table with "character with byte sequence ... has no
equivalent in encoding UTF8".

Capturing this behavior here makes the follow-up patch's diff
self-evident: a subsequent commit that tightens pg_uhc_verifychar() to
enforce the CP949 lead/trail byte ranges will turn those eight "has no
equivalent" errors into "invalid byte sequence" errors, without changing
any other test result.

uhc_1.out provides an early \quit fallback for non-UTF8 databases.
---
Notes (below the cut line, so not part of the commit message):
  expected/uhc.out is the expected output when the database encoding is
  UTF8.  expected/uhc_1.out is the alternative expected output for the
  early \quit that is taken when skip_test is true.

 src/test/regress/expected/uhc.out   | 86 +++++++++++++++++++++++++++++
 src/test/regress/expected/uhc_1.out |  6 ++
 src/test/regress/parallel_schedule  |  2 +-
 src/test/regress/sql/uhc.sql        | 36 ++++++++++++
 4 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 src/test/regress/expected/uhc.out
 create mode 100644 src/test/regress/expected/uhc_1.out
 create mode 100644 src/test/regress/sql/uhc.sql

diff --git a/src/test/regress/expected/uhc.out b/src/test/regress/expected/uhc.out
new file mode 100644
index 00000000000..d922cca7caf
--- /dev/null
+++ b/src/test/regress/expected/uhc.out
@@ -0,0 +1,86 @@
+-- This test is about UHC (Windows-949 / CP949) encoding. UHC is a
+-- client-only encoding, so exercise pg_uhc_verifychar() via convert_from()
+-- in a UTF8 database.
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- valid: EUC_KR-compatible Hangul (U+AC00 "가")
+SELECT encode(convert_to(convert_from('\xb0a1', 'UHC'), 'UTF8'), 'hex');
+ encode 
+--------
+ eab080
+(1 row)
+
+-- valid: CP949 lead/trail boundary values
+SELECT encode(convert_to(convert_from('\x8141', 'UHC'), 'UTF8'), 'hex'); -- trail 0x41
+ encode 
+--------
+ eab082
+(1 row)
+
+SELECT encode(convert_to(convert_from('\x815a', 'UHC'), 'UTF8'), 'hex'); -- trail 0x5A
+ encode 
+--------
+ eab0b4
+(1 row)
+
+SELECT encode(convert_to(convert_from('\x8161', 'UHC'), 'UTF8'), 'hex'); -- trail 0x61
+ encode 
+--------
+ eab0b5
+(1 row)
+
+SELECT encode(convert_to(convert_from('\x817a', 'UHC'), 'UTF8'), 'hex'); -- trail 0x7A
+ encode 
+--------
+ eab195
+(1 row)
+
+SELECT encode(convert_to(convert_from('\x8181', 'UHC'), 'UTF8'), 'hex'); -- trail 0x81
+ encode 
+--------
+ eab196
+(1 row)
+
+SELECT encode(convert_to(convert_from('\x81fe', 'UHC'), 'UTF8'), 'hex'); -- trail 0xFE
+ encode 
+--------
+ eab493
+(1 row)
+
+SELECT encode(convert_to(convert_from('\xc7a1', 'UHC'), 'UTF8'), 'hex'); -- high lead 0xC7
+ encode 
+--------
+ ed9088
+(1 row)
+
+SELECT encode(convert_to(convert_from('\xfda1', 'UHC'), 'UTF8'), 'hex'); -- high lead 0xFD
+ encode 
+--------
+ e788bb
+(1 row)
+
+-- invalid lead byte (0x80 and 0xFF are unused in CP949)
+SELECT convert_from('\x8041', 'UHC');
+ERROR:  character with byte sequence 0x80 0x41 in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\xff41', 'UHC');
+ERROR:  character with byte sequence 0xff 0x41 in encoding "UHC" has no equivalent in encoding "UTF8"
+-- invalid trail byte
+SELECT convert_from('\x8140', 'UHC'); -- 0x40
+ERROR:  character with byte sequence 0x81 0x40 in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\x815b', 'UHC'); -- 0x5B
+ERROR:  character with byte sequence 0x81 0x5b in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\x8160', 'UHC'); -- 0x60
+ERROR:  character with byte sequence 0x81 0x60 in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\x817b', 'UHC'); -- 0x7B
+ERROR:  character with byte sequence 0x81 0x7b in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\x8180', 'UHC'); -- 0x80
+ERROR:  character with byte sequence 0x81 0x80 in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\x81ff', 'UHC'); -- 0xFF
+ERROR:  character with byte sequence 0x81 0xff in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\x8d20', 'UHC'); -- NONUTF8_INVALID sentinel pair
+ERROR:  invalid byte sequence for encoding "UHC": 0x8d 0x20
+-- truncated two-byte character
+SELECT convert_from('\x81', 'UHC');
+ERROR:  invalid byte sequence for encoding "UHC": 0x81
diff --git a/src/test/regress/expected/uhc_1.out b/src/test/regress/expected/uhc_1.out
new file mode 100644
index 00000000000..9deb8b8ee1d
--- /dev/null
+++ b/src/test/regress/expected/uhc_1.out
@@ -0,0 +1,6 @@
+-- This test is about UHC (Windows-949 / CP949) encoding. UHC is a
+-- client-only encoding, so exercise pg_uhc_verifychar() via convert_from()
+-- in a UTF8 database.
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 8fa0a6c47fb..15d5e539961 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t
 # geometry depends on point, lseg, line, box, path, polygon, circle
 # horology depends on date, time, timetz, timestamp, timestamptz, interval
 # ----------
-test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8 encoding euc_kr
+test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8 encoding euc_kr uhc
 
 # ----------
 # Load huge amounts of data
diff --git a/src/test/regress/sql/uhc.sql b/src/test/regress/sql/uhc.sql
new file mode 100644
index 00000000000..6905ad084b4
--- /dev/null
+++ b/src/test/regress/sql/uhc.sql
@@ -0,0 +1,36 @@
+-- This test is about UHC (Windows-949 / CP949) encoding. UHC is a
+-- client-only encoding, so exercise pg_uhc_verifychar() via convert_from()
+-- in a UTF8 database.
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- valid: EUC_KR-compatible Hangul (U+AC00 "가")
+SELECT encode(convert_to(convert_from('\xb0a1', 'UHC'), 'UTF8'), 'hex');
+
+-- valid: CP949 lead/trail boundary values
+SELECT encode(convert_to(convert_from('\x8141', 'UHC'), 'UTF8'), 'hex'); -- trail 0x41
+SELECT encode(convert_to(convert_from('\x815a', 'UHC'), 'UTF8'), 'hex'); -- trail 0x5A
+SELECT encode(convert_to(convert_from('\x8161', 'UHC'), 'UTF8'), 'hex'); -- trail 0x61
+SELECT encode(convert_to(convert_from('\x817a', 'UHC'), 'UTF8'), 'hex'); -- trail 0x7A
+SELECT encode(convert_to(convert_from('\x8181', 'UHC'), 'UTF8'), 'hex'); -- trail 0x81
+SELECT encode(convert_to(convert_from('\x81fe', 'UHC'), 'UTF8'), 'hex'); -- trail 0xFE
+SELECT encode(convert_to(convert_from('\xc7a1', 'UHC'), 'UTF8'), 'hex'); -- high lead 0xC7
+SELECT encode(convert_to(convert_from('\xfda1', 'UHC'), 'UTF8'), 'hex'); -- high lead 0xFD
+
+-- invalid lead byte (0x80 and 0xFF are unused in CP949)
+SELECT convert_from('\x8041', 'UHC');
+SELECT convert_from('\xff41', 'UHC');
+
+-- invalid trail byte
+SELECT convert_from('\x8140', 'UHC'); -- 0x40
+SELECT convert_from('\x815b', 'UHC'); -- 0x5B
+SELECT convert_from('\x8160', 'UHC'); -- 0x60
+SELECT convert_from('\x817b', 'UHC'); -- 0x7B
+SELECT convert_from('\x8180', 'UHC'); -- 0x80
+SELECT convert_from('\x81ff', 'UHC'); -- 0xFF
+SELECT convert_from('\x8d20', 'UHC'); -- NONUTF8_INVALID sentinel pair
+
+-- truncated two-byte character
+SELECT convert_from('\x81', 'UHC');
-- 
2.43.0