From 086a1ce809a2b427ec0f5bac960073f38b770c46 Mon Sep 17 00:00:00 2001 From: Alexander Borisov Date: Mon, 22 Dec 2025 14:44:00 +0300 Subject: [PATCH v8 2/4] Add Perl module (Sparse Array) to a common module. Perl module for generating compact C tables and lookup functions optimized for sparse numeric key distributions. The module creates a two-level sparse array (Offset and Index) for numeric values. This approach is well suited for Unicode code points where values come in ranges (have a non-fixed gap in data between ranges). Append to src/tools/GenerateSparseArray.pm. This is necessary for further use of this code to build Sparse Array for Unicode normalization, case, and Encoding. --- src/tools/GenerateSparseArray.pm | 374 +++++++++++++++++++++++++++++++ 1 file changed, 374 insertions(+) create mode 100644 src/tools/GenerateSparseArray.pm diff --git a/src/tools/GenerateSparseArray.pm b/src/tools/GenerateSparseArray.pm new file mode 100644 index 00000000000..8fef64231fa --- /dev/null +++ b/src/tools/GenerateSparseArray.pm @@ -0,0 +1,374 @@ +#---------------------------------------------------------------------- +# +# GenerateSparseArray.pm +# Perl module for generating compact C tables and lookup functions +# optimized for sparse numeric key distributions. +# +# The module creates a two-level sparse array (Offset and Index) for numeric +# values. This approach is well suited for Unicode code points where values +# come in ranges (have a non-fixed gap in data between ranges). +# +# +# How it works. +# +# Essentially, we break numeric values into fixed ranges by N. +# +# The module creates two tables: +# 1. Offset +# Contains offsets for the table Index. +# Stores the beginning of the range for a number. +# +# 2. Index +# Stores fixed ranges of a specified size. +# Contains the index for the data table. The data table is a user table in +# which the necessary data is stored. +# +# Algorithm for obtaining an index for a table with data: +# 1. 
We have the number 1300 (Unicode code point). +# 2. Calculate its index in the Offset table: 1300 >> SHIFT +# (8 bit = 256) = 5. offset[5]. +# 3. After obtaining the offset of the range start for the Index table, we +# calculate the specific position among 256 values for the number 1300. +# RANGE_MASK = (1 << SHIFT) - 1 +# offset[5] + (cp & RANGE_MASK) +# +# The zero index in the Index table is used as a dummy range (like a NULL). +# +# For example: +# use GenerateSparseArray; +# +# my %data; +# my $gsa = new GenerateSparseArray(8); +# +# foreach my $id (0x41..0x5A, 0x61..0x7A) { +# $gsa->push($id); +# $data{$id} = $id + 10; +# } +# +# my ($offset, $index, $func) = $gsa->generate( +# 'latin_greek_table', +# 'get_index', +# sub { $data{$_[0]} } +# ); +# +# print join("\n", $offset, $index, $func); +# +# Result: +# +# static const uint16 latin_greek_table_offset[2] = +# { +# 256, 0 +# }; +# +# static const uint16 latin_greek_table_index[379] = +# { +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 75, 76, 77, +# 78, 79, 80, 81, 82, 
83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, +# 97, 98, 99, 100, 0, 0, 0, 0, 0, 0, 107, 108, 109, 110, 111, 112, 113, 114, +# 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, +# 130, 131, 132 +# }; +# +# static uint16 +# get_index(char32_t cp) +# { +# uint16 offset_idx, offset; +# +# offset_idx = cp >> 8; +# +# if (offset_idx > 1) +# return 0; +# +# offset = latin_greek_table_offset[offset_idx]; +# +# return latin_greek_table_index[offset + (cp & 255)]; +# } +# +# We can balance tables (gaps between data). That is, you can change the +# density of data in a table by changing the SHIFT value. The smaller the +# SHIFT value, the denser the index table will be, and vice versa, the larger +# the value, the denser the offset table will be, and the index table will be +# more sparse (more empty values). +# +# For example, let's modify the example above: +# my $gsa = new GenerateSparseArray(4); # Not 8, but 4. +# +# The result for the same numbers: +# +# static const uint16 latin_greek_table_offset[9] = +# { +# 0, 0, 0, 0, 16, 32, 48, 64, 0 +# }; +# +# static const uint16 latin_greek_table_index[75] = +# { +# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 75, 76, 77, 78, 79, 80, +# 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, +# 100, 0, 0, 0, 0, 0, 0, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, +# 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, +# 132 +# }; +# +# static uint16 +# get_index(char32_t cp) +# { +# uint16 offset_idx, offset; +# +# offset_idx = cp >> 4; +# +# if (offset_idx > 8) +# return 0; +# +# offset = latin_greek_table_offset[offset_idx]; +# +# return latin_greek_table_index[offset + (cp & 15)]; +# } +# +# We can see that the tables have become more "balanced". +# +# +# Disadvantages of the algorithm? +# +# The algorithm performs well with smoothly increasing values +# (hi Unicode and Encoding). 
For example, from 100 to 1,000,000 in different
+# ranges: 100..500, 1000..1200, 10000..30000, and so on — everything will be
+# fine in the algorithm.
+#
+# For example, in the worst case scenario, let's take two ranges of numbers:
+# 10..150 and 5000000..10000000. Tables with very large dummy data will be
+# constructed for these values. It seems that these problems can be solved,
+# but this does not apply to the current tasks of using this algorithm.
+#
+# Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+#
+# src/tools/GenerateSparseArray.pm
+#
+#----------------------------------------------------------------------
+
+package GenerateSparseArray;
+
+use strict;
+use warnings FATAL => 'all';
+
+use PrettyLine;
+
+# new($range_shift)
+# Constructor for the sparse table generator object.
+#
+# Initializes paging parameters and internal state used to build the
+# two-level table (offset + index). If $range_shift is not provided (or is
+# false, e.g. 0), it defaults to 8, which gives a range size of 2^8 = 256
+# entries.
+#
+# Dies if $range_shift is not an integer between 1 and 31. The generated
+# lookup function shifts a char32_t by this amount, and in C a shift count
+# that is not smaller than the width of the operand is undefined behavior;
+# a negative or fractional shift would silently yield a meaningless
+# range size and mask.
+#
+# Internal fields:
+# keys - array of all registered numeric keys
+# max - initialized to 0; no code in this module updates it
+# index_uint - C integer type of the index table and of the lookup
+#              function's return value ("uint16" until generate()
+#              recomputes it)
+# offset_uint - C integer type of the offset table ("uint16" until
+#               generate() recomputes it)
+# offset_table_size - size of the offset table (filled at generate())
+# range_shift - bit count, controls range granularity
+# range_size - number of positions in one range (1 << range_shift)
+# range_mask - bitmask for extracting low bits ($range_size - 1)
+sub new
+{
+	my ($class, $range_shift) = @_;
+
+	# A missing or false shift selects the documented default.
+	$range_shift ||= 8;
+
+	die "the range shift must be an integer between 1 and 31"
+	  unless $range_shift =~ /\A[0-9]+\z/ && $range_shift <= 31;
+
+	my $range_size = 1 << $range_shift;
+	my $range_mask = $range_size - 1;
+
+	return bless {
+		keys => [],
+		max => 0,
+		index_uint => "uint16",
+		offset_uint => "uint16",
+		offset_table_size => 0,
+		range_shift => $range_shift,
+		range_size => $range_size,
+		range_mask => $range_mask
+	}, $class;
+}
+
+# push($key)
+# Append a new numeric key in the generator state.
+#
+# The key must be a non-negative integer; otherwise the function dies
+# with an error message.
+sub push
+{
+	my ($gsa, $key) = @_;
+
+	# \A...\z (unlike ^...$) also rejects a trailing newline, and the
+	# defined() test makes an undefined key fail with the intended message
+	# rather than with a fatal "uninitialized value" warning.
+	die "the key must be a number"
+	  unless defined $key && $key =~ /\A[0-9]+\z/;
+
+	return CORE::push @{ $gsa->{keys} }, $key;
+}
+
+# generate($table_name, $func_name, $callback)
+# Main generation routine that produces C tables and lookup function
+# from all previously added keys.
+#
+# $callback is called once per registered key (in ascending key order) and
+# must return the number to store for that key; an undefined result is
+# emitted as 0.
+#
+# Dies if no keys were added or if $callback is not a code reference.
+#
+# return [Offset table, Index table, lookup function].
+sub generate
+{
+	my ($gsa, $table_name, $func_name, $callback) = @_;
+	my (@sorted, @offsets, @data, $pos, $table_size);
+	my $max_offset = 0;
+	my $max_value = 0;
+
+	die "no values for table generation and functions"
+	  unless scalar(@{ $gsa->{keys} });
+
+	die "the callback must be a code reference"
+	  unless ref($callback) eq 'CODE';
+
+	# It is not essential, but for consistent table output, it is better
+	# to sort the data.
+	@sorted = sort { $a <=> $b } @{ $gsa->{keys} };
+	$table_size = (($sorted[-1] >> $gsa->{range_shift}) + 1);
+
+	# We immediately allocate the required size for the table. The extra
+	# trailing slot holds 0, i.e. it points at the dummy range; the
+	# generated function relies on it for offset_idx == $table_size.
+	$offsets[$table_size] = 0;
+
+	# Positions 0 .. range_size - 1 of the index table are the dummy range,
+	# so the first real range starts right after it.
+	$pos = $gsa->{range_size};
+
+	foreach my $key (@sorted)
+	{
+		my $offset_index = $key >> $gsa->{range_shift};
+		my $offset = $offsets[$offset_index];
+
+		unless (defined $offset)
+		{
+			$offset = $pos;
+			$offsets[$offset_index] = $offset;
+			$pos += $gsa->{range_size};
+
+			# Offsets only grow, so the latest one is the largest.
+			$max_offset = $offset;
+		}
+
+		my $value = $callback->($key);
+
+		$data[ $offset + ($key & $gsa->{range_mask}) ] = $value;
+
+		$max_value = $value if defined $value && $value > $max_value;
+	}
+
+	# Pad the last allocated range to its full size. The generated lookup
+	# indexes "offset + (cp & range_mask)" for every code point of a range,
+	# so a range cut off at the largest key would be read out of bounds.
+	$#data = $pos - 1;
+
+	$gsa->{offset_table_size} = $table_size;
+	$gsa->{offset_uint} = _uint_type($max_offset);
+	$gsa->{index_uint} = _uint_type($max_value);
+
+	return (
+		$gsa->_table_offset(\@offsets, $table_name),
+		$gsa->_table_index(\@data, $table_name),
+		$gsa->function($func_name, $table_name));
+}
+
+# function($func_name, $table_name)
+# Returns the C source of the lookup function for the tables named
+# "<table_name>_offset" and "<table_name>_index".
+#
+# It reads range_shift, range_mask, offset_table_size, offset_uint and
+# index_uint from the object, so it is meaningful after generate() has
+# filled them in.
+#
+# offset_idx is declared as char32_t: it holds "cp >> range_shift", which
+# can be far larger than any value stored in the offset table, so giving it
+# the offset table's element type would truncate it before the bounds
+# check and make distant code points alias valid entries.
+#
+# NOTE(review): the heredoc opener and the "static" line were damaged in
+# this copy of the patch; they are restored here from the FUNCTION
+# terminator and the example output in the file header. Confirm against
+# the original patch.
+sub function
+{
+	my ($gsa, $func_name, $table_name) = @_;
+
+	my $offset_name = "$table_name\_offset";
+	my $index_name = "$table_name\_index";
+
+	return <<FUNCTION;
+static $gsa->{index_uint}
+$func_name(char32_t cp)
+{
+	char32_t	offset_idx;
+	$gsa->{offset_uint} offset;
+
+	offset_idx = cp >> $gsa->{range_shift};
+
+	if (offset_idx > $gsa->{offset_table_size})
+		return 0;
+
+	offset = $offset_name\[offset_idx];
+
+	return $index_name\[offset + (cp & $gsa->{range_mask})];
+}
+FUNCTION
+}
+
+# _table_offset($table, $name)
+# Formats the offset table through table() as "<name>_offset", using the
+# offset integer type.
+#
+# NOTE(review): in this copy of the patch the text between
+# "my $comment = <" and "table(...)" is missing (apparently the heredoc
+# body, the rest of _table_offset and the head of _table_index). The
+# damaged fragment is kept exactly as found; restore it from the original
+# patch.
+sub _table_offset
+{
+	my ($gsa, $table, $name) = @_;
+	my $table_text =
+	  $gsa->table($table, "$name\_offset", $gsa->{offset_uint});
+
+	my $comment = <table($table, "$name\_index", $gsa->{index_uint});
+}
+
+# table($table, $name, $uint_type)
+# Formats raw table data as pretty-printed C static array declaration.
+#
+# Undefined elements of $table are emitted as 0. The array length in the
+# declaration is whatever PrettyLine reports via length().
+sub table
+{
+	my ($gsa, $table, $name, $uint_type) = @_;
+	my (@lines, $pretty, $length);
+
+	$pretty = PrettyLine->new();
+	$pretty->push(defined $_ ? $_ : 0) foreach (@$table);
+	$length = $pretty->length();
+
+	CORE::push @lines, "static const $uint_type $name\[$length] =\n{";
+	CORE::push @lines, @{ $pretty->result() };
+	CORE::push @lines, "};";
+
+	return join "\n", @lines;
+}
+
+# _uint_type($number)
+# Internal function that determines the size of uint for a number.
+#
+# Returns the name of the smallest of uint8, uint16, uint32 and uint64 that
+# can hold $number. Dies if $number is negative, is not an integer, or does
+# not fit in uint64.
+sub _uint_type
+{
+	my $num = 0 + $_[0];
+
+	die "value must be a non-negative integer: $num"
+	  if $num < 0 || $num != int($num);
+
+	# Each limit is assembled as (low bits | top bit), so no intermediate
+	# value is larger than the limit itself; in particular 1 << 64 is
+	# never evaluated.
+
+	# uint8: (1 << 8) - 1
+	return "uint8" if $num <= (((1 << 7) - 1) | (1 << 7));
+
+	# uint16: (1 << 16) - 1
+	return "uint16" if $num <= (((1 << 15) - 1) | (1 << 15));
+
+	# uint32: (1 << 32) - 1
+	return "uint32" if $num <= (((1 << 31) - 1) | (1 << 31));
+
+	# uint64: (1 << 64) - 1
+	return "uint64" if $num <= (((1 << 63) - 1) | (1 << 63));
+
+	die "value is greater than uint64: $num";
+}
+
+1;
-- 
2.39.5 (Apple Git-154)