From 917c3eaf9d42d21e6f4cca9b58fca5748bab12f0 Mon Sep 17 00:00:00 2001 From: "Chao Li (Evan)" Date: Mon, 11 Aug 2025 18:06:07 +0800 Subject: [PATCH v1] GB18030: Switch to using gb-18030-2000.ucm This is the first in a series of patches to upgrade GB18030 encoding from the 2000 standard to the 2022 standard. In this patch, UCS_to_GB18030.pl is modified to use gb-18030-2000.ucm. We do not check the UCM file into the source tree; to build the map files, run: make gb18030_to_utf8.map from src/backend/utils/mb/Unicode. Note that the gb-18030-2000.ucm used here is not the latest version from GitHub. A newer version exists that fixes a mapping error. To keep this patch focused, we are using a version of gb-18030-2000.ucm that matches the current gb-18030-2000.xml, so the mapping files remain unchanged in this commit. The next patch will update gb-18030-2022.ucm to the latest version. Author: Chao Li --- src/backend/utils/mb/Unicode/Makefile | 5 +- .../utils/mb/Unicode/UCS_to_GB18030.pl | 71 ++++++++++++------- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile index ad789b31e54..27424b2a001 100644 --- a/src/backend/utils/mb/Unicode/Makefile +++ b/src/backend/utils/mb/Unicode/Makefile @@ -54,7 +54,7 @@ $(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb-18030-2000.xml)) $(eval $(call map_rule,euc_kr,UCS_to_EUC_KR.pl,KSX1001.TXT)) $(eval $(call map_rule,euc_tw,UCS_to_EUC_TW.pl,CNS11643.TXT)) $(eval $(call map_rule,sjis,UCS_to_SJIS.pl,CP932.TXT)) -$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb-18030-2000.xml)) +$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb-18030-2000.ucm)) $(eval $(call map_rule,big5,UCS_to_BIG5.pl,CP950.TXT BIG5.TXT CP950.TXT)) $(eval $(call map_rule,euc_jis_2004,UCS_to_EUC_JIS_2004.pl,euc-jis-2004-std.txt)) $(eval $(call map_rule,shift_jis_2004,UCS_to_SHIFT_JIS_2004.pl,sjis-0213-2004-std.txt)) @@ -78,6 +78,9 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt: 
gb-18030-2000.xml windows-949-2000.xml: $(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F) +gb-18030-2000.ucm: + $(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/d9d3a6ed27bb98a7106763e940258f0be8cd995b/charset/data/ucm/$(@F) + GB2312.TXT: $(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt' diff --git a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl index ddcbd6ef0c4..e53cd2532cb 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl @@ -4,14 +4,18 @@ # # src/backend/utils/mb/Unicode/UCS_to_GB18030.pl # + # Generate UTF-8 <--> GB18030 code conversion tables from -# "gb-18030-2000.xml", obtained from -# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ +# "gb-18030-2000.ucm", a Unicode Character Mapping file (UCM) from ICU, +# obtained from https://github.com/unicode-org/icu-data/blob/d9d3a6ed27bb98a7106763e940258f0be8cd995b/charset/data/ucm/gb-18030-2000.ucm +# +# The lines we care about in the source file look like: +# <UXXXX> \xYY[\xYY...] |n +# where <UXXXX> is the Unicode code point in hex, +# and the \xYY... is the hex byte sequence for GB18030, +# and n is a flag indicating the type of mapping having +# a single value of 0. 
# -# The lines we care about in the source file look like -# <a u="009A" b="81 30 83 36"/> -# where the "u" field is the Unicode code point in hex, -# and the "b" field is the hex byte sequence for GB18030 use strict; use warnings FATAL => 'all'; @@ -22,29 +26,46 @@ my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_GB18030.pl'; # Read the input -my $in_file = "gb-18030-2000.xml"; - +my $in_file = "gb-18030-2000.ucm"; open(my $in, '<', $in_file) || die("cannot open $in_file"); my @mapping; +my $in_charmap = 0; + +while (<$in>) { + chomp; + # Enter CHARMAP section + if (/^CHARMAP/) { + $in_charmap = 1; + next; + } + # Exit CHARMAP section + if (/^END CHARMAP/) { + $in_charmap = 0; + last; + } + next unless $in_charmap; + # Skip comments and empty lines + next if /^#/ || /^$/; -while (<$in>) -{ - next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/); - my ($u, $c) = ($1, $2); - $c =~ s/ //g; - my $ucs = hex($u); - my $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - push @mapping, - { - ucs => $ucs, - code => $code, - direction => BOTH, - f => $in_file, - l => $. - }; + # Match lines like: <UXXXX> \xYY[\xYY...] |n, and use only (|0) mappings + if (/^<U([0-9A-Fa-f]+)>\s+((?:\\x[0-9A-Fa-f]{2})+)\s*\|(\d+)/) { + my ($u, $c, $flag) = ($1, $2, $3); + next if ($flag ne '0'); # non-0 flags + my $ucs = hex($u); + # Remove \x and concatenate bytes + my $c_hex = $c; + $c_hex =~ s/\\x//g; + my $code = hex($c_hex); + if ($code >= 0x80 && $ucs >= 0x0080) { + push @mapping, { + ucs => $ucs, + code => $code, + direction => BOTH, + f => $in_file, + l => $. + }; + } } } close($in); -- 2.39.5 (Apple Git-154)