From 0614fd3227eedffe91c31468def76400fd01d134 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Thu, 4 Jun 2026 12:56:36 +0900
Subject: [PATCH] Fix off-by-one with NFC recomposition for Hangul U+11A7 (TBASE)

The NFC recomposition treated TBASE (U+11A7) as a valid trailing
consonant (T), which does not match the Unicode specification: TBASE is
one below the start of the valid T range, which begins at U+11A8.  As a
result, a TBASE following an LV syllable was silently swallowed during
normalization, leading to an incorrect result.

A couple of regression tests are added to check more patterns with
Hangul recomposition and decomposition, on top of a test to check this
issue with TBASE.

Author: Diego Frias
Discussion: https://postgr.es/m/B92ED640-7D4A-4505-B09F-3548F58CBB16@dzfrias.dev
Backpatch-through: 14
---
 src/common/unicode_norm.c             |  2 +-
 src/test/regress/expected/unicode.out | 78 +++++++++++++++++++++++++++
 src/test/regress/sql/unicode.sql      | 20 +++++++
 3 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index cf84f2024140..0534ae34640f 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -236,7 +236,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
 	/* Check if two current characters are LV and T */
 	else if (start >= SBASE && start < (SBASE + SCOUNT) &&
 			 ((start - SBASE) % TCOUNT) == 0 &&
-			 code >= TBASE && code < (TBASE + TCOUNT))
+			 code > TBASE && code < (TBASE + TCOUNT))
 	{
 		/* make syllable of form LVT */
 		uint32		tindex = code - TBASE;
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
index 1e06de226491..63e48d3a961f 100644
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@@ -105,3 +105,81 @@ ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
 ERROR:  invalid normalization form: def
+-- Hangul NFC recomposition tests
+-- L+V -> LV composition (first and last)
+SELECT normalize(U&'\1100\1161', NFC) = U&'\AC00' COLLATE "C" AS hangul_lv_first;
+ hangul_lv_first 
+-----------------
+ t
+(1 row)
+
+SELECT normalize(U&'\1112\1175', NFC) = U&'\D788' COLLATE "C" AS hangul_lv_last;
+ hangul_lv_last 
+----------------
+ t
+(1 row)
+
+-- LV+T -> LVT composition
+SELECT normalize(U&'\AC00\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_lvt_first_t;
+ hangul_lvt_first_t 
+--------------------
+ t
+(1 row)
+
+SELECT normalize(U&'\AC00\11C2', NFC) = U&'\AC1B' COLLATE "C" AS hangul_lvt_last_t;
+ hangul_lvt_last_t 
+-------------------
+ t
+(1 row)
+
+SELECT normalize(U&'\D788\11A8', NFC) = U&'\D789' COLLATE "C" AS hangul_lvt_last_lv;
+ hangul_lvt_last_lv 
+--------------------
+ t
+(1 row)
+
+-- L+V+T -> LVT composition
+SELECT normalize(U&'\1100\1161\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_full_lvt_first;
+ hangul_full_lvt_first 
+-----------------------
+ t
+(1 row)
+
+SELECT normalize(U&'\1112\1175\11C2', NFC) = U&'\D7A3' COLLATE "C" AS hangul_full_lvt_last;
+ hangul_full_lvt_last 
+----------------------
+ t
+(1 row)
+
+-- TBASE (U+11A7) is not a valid T, so it must stay uncombined
+SELECT normalize(U&'\AC00\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_tbase_not_combined;
+ hangul_tbase_not_combined 
+---------------------------
+ t
+(1 row)
+
+SELECT normalize(U&'\1100\1161\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_lv_tbase_separate;
+ hangul_lv_tbase_separate 
+--------------------------
+ t
+(1 row)
+
+-- Hangul NFD decomposition tests
+SELECT normalize(U&'\AC00', NFD) = U&'\1100\1161' COLLATE "C" AS hangul_nfd_lv;
+ hangul_nfd_lv 
+---------------
+ t
+(1 row)
+
+SELECT normalize(U&'\AC01', NFD) = U&'\1100\1161\11A8' COLLATE "C" AS hangul_nfd_lvt;
+ hangul_nfd_lvt 
+----------------
+ t
+(1 row)
+
+SELECT normalize(U&'\D7A3', NFD) = U&'\1112\1175\11C2' COLLATE "C" AS hangul_nfd_last;
+ hangul_nfd_last 
+-----------------
+ t
+(1 row)
+
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
index e50adb68ed0d..951f86a336e8 100644
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@@ -36,3 +36,23 @@ FROM
 ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
+
+-- Hangul NFC recomposition tests
+-- L+V -> LV composition (first and last)
+SELECT normalize(U&'\1100\1161', NFC) = U&'\AC00' COLLATE "C" AS hangul_lv_first;
+SELECT normalize(U&'\1112\1175', NFC) = U&'\D788' COLLATE "C" AS hangul_lv_last;
+-- LV+T -> LVT composition
+SELECT normalize(U&'\AC00\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_lvt_first_t;
+SELECT normalize(U&'\AC00\11C2', NFC) = U&'\AC1B' COLLATE "C" AS hangul_lvt_last_t;
+SELECT normalize(U&'\D788\11A8', NFC) = U&'\D789' COLLATE "C" AS hangul_lvt_last_lv;
+-- L+V+T -> LVT composition
+SELECT normalize(U&'\1100\1161\11A8', NFC) = U&'\AC01' COLLATE "C" AS hangul_full_lvt_first;
+SELECT normalize(U&'\1112\1175\11C2', NFC) = U&'\D7A3' COLLATE "C" AS hangul_full_lvt_last;
+-- TBASE (U+11A7) is not a valid T, so it must stay uncombined
+SELECT normalize(U&'\AC00\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_tbase_not_combined;
+SELECT normalize(U&'\1100\1161\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangul_lv_tbase_separate;
+
+-- Hangul NFD decomposition tests
+SELECT normalize(U&'\AC00', NFD) = U&'\1100\1161' COLLATE "C" AS hangul_nfd_lv;
+SELECT normalize(U&'\AC01', NFD) = U&'\1100\1161\11A8' COLLATE "C" AS hangul_nfd_lvt;
+SELECT normalize(U&'\D7A3', NFD) = U&'\1112\1175\11C2' COLLATE "C" AS hangul_nfd_last;
-- 
2.54.0