From 8fc31198057582667738d226cc792c7970f25490 Mon Sep 17 00:00:00 2001 From: Henson Choi Date: Wed, 1 Jul 2026 15:05:56 +0900 Subject: [PATCH] Add tests exposing out-of-bounds reads in Unicode normalization lookup Add bounds assertions to normalization_index() and inverse_index(), plus regression tests that hit the last, unpadded block of the decomposition and recomposition Index tables. This patch only adds assertions and tests; it does not fix table generation. --- src/include/common/unicode_norm_table.h | 2 ++ src/test/regress/expected/unicode.out | 29 +++++++++++++++++++++++++ src/test/regress/sql/unicode.sql | 10 +++++++++ 3 files changed, 41 insertions(+) diff --git a/src/include/common/unicode_norm_table.h b/src/include/common/unicode_norm_table.h index 7bd198fac5f..99c42854759 100644 --- a/src/include/common/unicode_norm_table.h +++ b/src/include/common/unicode_norm_table.h @@ -6521,6 +6521,7 @@ normalization_index(char32_t cp) offset = decomp_table_offset[offset_idx]; + Assert(offset + (cp & 63) < lengthof(decomp_table_index)); return decomp_table_index[offset + (cp & 63)]; } @@ -6541,6 +6542,7 @@ inverse_index(char32_t cp) offset = inverse_table_offset[offset_idx]; + Assert(offset + (cp & 63) < lengthof(inverse_table_index)); return inverse_table_index[offset + (cp & 63)]; } diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out index 63e48d3a961..41de7a4e12f 100644 --- a/src/test/regress/expected/unicode.out +++ b/src/test/regress/expected/unicode.out @@ -183,3 +183,32 @@ SELECT normalize(U&'\D7A3', NFD) = U&'\1112\1175\11C2' COLLATE "C" AS hangul_nfd t (1 row) +-- Two-stage table boundary: the last block of the decomposition and +-- recomposition Index tables must be padded so the first out-of-bounds slot +-- and the block's last slot (cp & 63 == 63) stay in bounds. +-- NFD forces the decomposition lookup (paired with a decomposing character). 
+SELECT normalize(U&'\00C5\+02FA1E', NFD) = U&'\0041\030A\+02FA1E' COLLATE "C" AS decomp_first_oob; + decomp_first_oob +------------------ + t +(1 row) + +SELECT normalize(U&'\00C5\+02FA3F', NFD) = U&'\0041\030A\+02FA3F' COLLATE "C" AS decomp_last_slot; + decomp_last_slot +------------------ + t +(1 row) + +-- A trailing combining mark forces the NFC recomposition lookup. +SELECT normalize(U&'\+016D6A\0301', NFC) = U&'\+016D6A\0301' COLLATE "C" AS recomp_first_oob; + recomp_first_oob +------------------ + t +(1 row) + +SELECT normalize(U&'\+016D7F\0301', NFC) = U&'\+016D7F\0301' COLLATE "C" AS recomp_last_slot; + recomp_last_slot +------------------ + t +(1 row) + diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql index 951f86a336e..b9fe53144db 100644 --- a/src/test/regress/sql/unicode.sql +++ b/src/test/regress/sql/unicode.sql @@ -56,3 +56,13 @@ SELECT normalize(U&'\1100\1161\11A7', NFC) = U&'\AC00\11A7' COLLATE "C" AS hangu SELECT normalize(U&'\AC00', NFD) = U&'\1100\1161' COLLATE "C" AS hangul_nfd_lv; SELECT normalize(U&'\AC01', NFD) = U&'\1100\1161\11A8' COLLATE "C" AS hangul_nfd_lvt; SELECT normalize(U&'\D7A3', NFD) = U&'\1112\1175\11C2' COLLATE "C" AS hangul_nfd_last; + +-- Two-stage table boundary: the last block of the decomposition and +-- recomposition Index tables must be padded so the first out-of-bounds slot +-- and the block's last slot (cp & 63 == 63) stay in bounds. +-- NFD forces the decomposition lookup (paired with a decomposing character). +SELECT normalize(U&'\00C5\+02FA1E', NFD) = U&'\0041\030A\+02FA1E' COLLATE "C" AS decomp_first_oob; +SELECT normalize(U&'\00C5\+02FA3F', NFD) = U&'\0041\030A\+02FA3F' COLLATE "C" AS decomp_last_slot; +-- A trailing combining mark forces the NFC recomposition lookup. +SELECT normalize(U&'\+016D6A\0301', NFC) = U&'\+016D6A\0301' COLLATE "C" AS recomp_first_oob; +SELECT normalize(U&'\+016D7F\0301', NFC) = U&'\+016D7F\0301' COLLATE "C" AS recomp_last_slot; -- 2.50.1 (Apple Git-155)