From 64d5ef3b024368541181255700997f517c45cc56 Mon Sep 17 00:00:00 2001 From: Alexander Borisov Date: Mon, 2 Jun 2025 15:38:00 +0300 Subject: [PATCH v1 1/2] Moving Perl functions (Range index) to a common module. Functions for forming the Range index in case mapping are transferred to a Perl module. From src/common/unicode/generate-unicode_case_table.pl to src/tools/Ranges.pm. This is necessary for further use of this code to build Range indexes in Unicode normalization. --- .../unicode/generate-unicode_case_table.pl | 159 +---------- src/tools/Ranges.pm | 248 ++++++++++++++++++ 2 files changed, 255 insertions(+), 152 deletions(-) create mode 100644 src/tools/Ranges.pm diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl index 5d9ddd62803..21b4b4bc238 100644 --- a/src/common/unicode/generate-unicode_case_table.pl +++ b/src/common/unicode/generate-unicode_case_table.pl @@ -14,6 +14,7 @@ use Getopt::Long; use FindBin; use lib "$FindBin::RealBin/../../tools/"; +use Ranges; my $output_path = '.'; @@ -466,11 +467,11 @@ foreach my $entry (@{ $map{special} }) print $OT "\n};\n"; +my $case_table_name = "case_map"; my @codepoints = keys %simple; -my $range = make_ranges(\@codepoints, 500); -my @case_map_lines = range_tables($range); -my $case_map_length = scalar @case_map_lines; -my $case_map_table = join "\n", @case_map_lines; +my $range = Ranges::make(\@codepoints, 500); +my $case_map_table = Ranges::tables($range, $case_table_name, + sub { $simple{ $_[0] }{Index} || 0 }); print $OT <<"EOS"; @@ -479,10 +480,7 @@ print $OT <<"EOS"; * of the following arrays: case_map_lower, case_map_title, case_map_upper, * case_map_fold. */ -static const uint16 case_map[$case_map_length] = -{ $case_map_table -}; EOS @@ -507,12 +505,12 @@ case_index(pg_wchar cp) /* Fast path for codepoints < $fastpath_limit */ if (cp < $fastpath_limit) { - return case_map[cp]; + return $case_table_name\[cp]; } EOS -print $OT join("\n", @{ branch($range, 0, $#$range, 1) }); +print $OT Ranges::branch_as_text($range, 0, $#$range, 1, $case_table_name); print $OT <<"EOS"; @@ -522,146 +520,3 @@ print $OT <<"EOS"; EOS close $OT; - -# The function generates C code with a series of nested if-else conditions -# to search for the matching interval. -sub branch -{ - my ($range, $from, $to, $indent) = @_; - my ($idx, $space, $entry, $table, @result); - - $idx = ($from + int(($to - $from) / 2)); - return \@result unless exists $range->[$idx]; - - $space = "\t" x $indent; - - $entry = $range->[$idx]; - - # IF state - if ($idx == $from) - { - if ($idx == 0) - { - push @result, - sprintf("%sif (cp >= 0x%04X && cp < 0x%04X)\n%s{", - $space, $entry->{Start}, $entry->{End}, $space); - } - else - { - push @result, - sprintf("%sif (cp < 0x%04X)\n%s{", - $space, $entry->{End}, $space); - } - - push @result, - sprintf("%s\treturn case_map[cp - 0x%04X + %d];", - $space, $entry->{Start}, $entry->{Offset}); - } - else - { - push @result, - sprintf("%sif (cp < 0x%04X)\n%s{", $space, $entry->{End}, $space); - push @result, @{ branch($range, $from, $idx - 1, $indent + 1) }; - } - - push @result, $space . "}"; - - # return now if it's the last range - return \@result if $idx == (scalar @$range) - 1; - - # ELSE looks ahead to the next range to avoid adding an - # unnecessary level of branching. - $entry = @$range[ $idx + 1 ]; - - # ELSE state - push @result, - sprintf("%selse if (cp >= 0x%04X)\n%s{", - $space, $entry->{Start}, $space); - - if ($idx == $to) - { - push @result, - sprintf("%s\treturn case_map\[cp - 0x%04X + %d];", - $space, $entry->{Start}, $entry->{Offset}); - } - else - { - push @result, @{ branch($range, $idx + 1, $to, $indent + 1) }; - } - - push @result, $space . "}"; - - return \@result; -} - -# Group numbers into ranges where the difference between neighboring -# elements does not exceed $limit. If the difference is greater, a new -# range is created. This is used to break the sequence into intervals -# where the gaps between numbers are greater than limit. -# -# For example, if there are numbers 1, 2, 3, 5, 6 and limit = 1, then -# there is a difference of 2 between 3 and 5, which is greater than 1, -# so there will be ranges 1-3 and 5-6. -sub make_ranges -{ - my ($nums, $limit) = @_; - my ($prev, $start, $total, @sorted, @range); - - @sorted = sort { $a <=> $b } @$nums; - - die "expecting at least 2 codepoints" if (scalar @sorted < 2); - - $start = shift @sorted; - - die "expecting first codepoint to start at 0" unless $start == 0; - - $prev = $start; - $total = 0; - - # append final 'undef' to signal final iteration - push @sorted, undef; - - foreach my $curr (@sorted) - { - # if last iteration always append the range - if (!defined($curr) || ($curr - $prev > $limit)) - { - push @range, - { - Start => $start, - End => $prev + 1, - Offset => $total - }; - $total += $prev + 1 - $start; - $start = $curr; - } - - $prev = $curr; - } - - return \@range; -} - -# The function combines all ranges into the case_map table. Ranges may -# include codepoints without a case mapping at all, in which case the -# entry in case_map should be zero. -sub range_tables -{ - my ($range) = @_; - my (@lines, @result); - - foreach my $entry (@$range) - { - my $start = $entry->{Start}; - my $end = $entry->{End} - 1; - - foreach my $cp ($start .. $end) - { - my $idx = sprintf("%d,", ($simple{$cp}{Index} || 0)); - $idx .= "\t" if length($idx) < 4; - push @lines, sprintf("\t%s\t\t\t\t\t\t/* U+%06X */", $idx, $cp); - } - } - - return @lines; -} diff --git a/src/tools/Ranges.pm b/src/tools/Ranges.pm new file mode 100644 index 00000000000..b85c0fe6bde --- /dev/null +++ b/src/tools/Ranges.pm @@ -0,0 +1,248 @@ +#---------------------------------------------------------------------- +# +# Ranges.pm +# Perl module for generating optimized C code that implements searching +# value within ranges of values. +# +# This code generates a table with ranges of values and a function containing +# conditions for searching for a value in the table. +# This approach is very effective if there are consecutive values with gaps. +# The algorithm is well suited for creating effective character converters +# (case mapping, normalization, etc.). +# +# How it works. +# +# Values are sorted, after which ranges of values are created with a limit on +# the permissible gap between values. If the gap between values is less than +# the limit, the range is considered continuous (not discontinuous), and the +# values in the gap are filled with zeros. If the gap exceeds the specified +# limit, a new range is created. +# +# Let's take the following values with a gap limit of 10 as an example: +# 1, 2, 3, 50, 51, 1024, 1027, 1031 +# +# We will obtain three ranges: +# 1. 1 to 3 +# 2. 50 to 51 +# 3. 1024 to 1031 +# +# Next, a table is formed from these ranges, in which the data we need to +# compare with the values will be stored. In other words, an index. +# +# The values listed above will form the following table: +# { +# 0, /* The zero value is 0, it's like NULL */ +# 'a', 'b', 'c', /* 1 to 3 */ +# 'j', 'k', /* 50 to 51 */ +# 'x', 0, 0, 'y', 0, 0, 0, 'z' /* 1024 to 1031 */ +# } +# +# Next, the function code is generated to search for the required index in +# the table by value. +# For our example, it will look like this: +# +# if (cp < 0x0034) +# { +# if (cp >= 0x0001 && cp < 0x0004) +# { +# return greek_table[cp - 0x0001 + 0]; +# } +# else if (cp >= 0x0032) +# { +# return greek_table[cp - 0x0032 + 3]; +# } +# } +# else if (cp >= 0x0400) +# { +# if (cp < 0x0408) +# { +# return greek_table[cp - 0x0400 + 5]; +# } +# } +# +# return 0; +# +# In fact, we see that the code is a binary search built on if/else constructs +# that contain all the necessary data (checks and offsets) for correct indexing. +# +# Example of use: +# # Grouping of code points with a gap of <= 10 +# my $ranges = Ranges::make([0x41..0x5A, 0x1F00, 0x1F08], 10); +# +# # Table generation (gaps = 0) +# print Ranges::tables($ranges, "greek_table", sub { $_[0] + 1 }); +# +# # Search code generation +# print Ranges::branch_as_text($ranges, 0, $#$ranges, 0, "greek_table"); +# +# +# Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/tools/Ranges.pm +# +#---------------------------------------------------------------------- + +package Ranges; + +use strict; +use warnings FATAL => 'all'; + + +# The function generates C code with a series of nested if-else conditions +# to search for the matching interval. +sub branch +{ + my ($range, $from, $to, $indent, $table_name) = @_; + my ($idx, $space, $entry, @result); + + $idx = ($from + int(($to - $from) / 2)); + return \@result unless exists $range->[$idx]; + + $space = "\t" x $indent; + + $entry = $range->[$idx]; + + # IF state + if ($idx == $from) + { + if ($idx == 0) + { + push @result, + sprintf("%sif (cp >= 0x%04X && cp < 0x%04X)\n%s{", + $space, $entry->{Start}, $entry->{End}, $space); + } + else + { + push @result, + sprintf("%sif (cp < 0x%04X)\n%s{", + $space, $entry->{End}, $space); + } + + push @result, + sprintf("%s\treturn $table_name\[cp - 0x%04X + %d];", + $space, $entry->{Start}, $entry->{Offset}); + } + else + { + push @result, + sprintf("%sif (cp < 0x%04X)\n%s{", $space, $entry->{End}, $space); + push @result, + @{ branch($range, $from, $idx - 1, $indent + 1, $table_name) }; + } + + push @result, $space . "}"; + + # return now if it's the last range + return \@result if $idx == (scalar @$range) - 1; + + # ELSE looks ahead to the next range to avoid adding an + # unnecessary level of branching. + $entry = @$range[ $idx + 1 ]; + + # ELSE state + push @result, + sprintf("%selse if (cp >= 0x%04X)\n%s{", + $space, $entry->{Start}, $space); + + if ($idx == $to) + { + push @result, + sprintf("%s\treturn $table_name\[cp - 0x%04X + %d];", + $space, $entry->{Start}, $entry->{Offset}); + } + else + { + push @result, + @{ branch($range, $idx + 1, $to, $indent + 1, $table_name) }; + } + + push @result, $space . "}"; + + return \@result; +} + +sub branch_as_text +{ + return join "\n", @{ branch(@_) }; +} + +# Group numbers into ranges where the difference between neighboring +# elements does not exceed $limit. If the difference is greater, a new +# range is created. This is used to break the sequence into intervals +# where the gaps between numbers are greater than limit. +# +# For example, if there are numbers 1, 2, 3, 5, 6 and limit = 1, then +# there is a difference of 2 between 3 and 5, which is greater than 1, +# so there will be ranges 1-3 and 5-6. +sub make +{ + my ($nums, $limit) = @_; + my ($prev, $start, $total, @sorted, @range); + + @sorted = sort { $a <=> $b } @$nums; + + die "expecting at least 2 codepoints" if (scalar @sorted < 2); + + $start = shift @sorted; + $prev = $start; + $total = 0; + + # append final 'undef' to signal final iteration + push @sorted, undef; + + foreach my $curr (@sorted) + { + # if last iteration always append the range + if (!defined($curr) || ($curr - $prev > $limit)) + { + push @range, + { + Start => $start, + End => $prev + 1, + Offset => $total + }; + $total += $prev + 1 - $start; + $start = $curr; + } + + $prev = $curr; + } + + return \@range; +} + +# The function combines all ranges into the $name table. Ranges may include +# entries without a mapping at all, in which the entry in the $name table +# should be zero. +# Arguments: +# $range - Reference to an array of ranges +# $name - Name of the generated C table +# $callback - Function for calculating values +sub tables +{ + my ($range, $name, $callback) = @_; + my @lines; + + foreach my $entry (@$range) + { + my $start = $entry->{Start}; + my $end = $entry->{End} - 1; + + foreach my $cp ($start .. $end) + { + my $idx = sprintf("%d,", $callback->($cp)); + $idx .= "\t" if length($idx) < 4; + push @lines, sprintf("\t%s\t\t\t\t\t\t/* U+%06X */", $idx, $cp); + } + } + + my $length = scalar @lines; + + unshift @lines, "static const uint16 $name\[$length] =\n{"; + push @lines, "};"; + + return join "\n", @lines; +} + +1; -- 2.39.5 (Apple Git-154)