From 086a1ce809a2b427ec0f5bac960073f38b770c46 Mon Sep 17 00:00:00 2001
From: Alexander Borisov <lex.borisov@gmail.com>
Date: Mon, 22 Dec 2025 14:44:00 +0300
Subject: [PATCH v8 2/4] Add Perl module (Sparse Array) to a common module.

Perl module for generating compact C tables and lookup functions
optimized for sparse numeric key distributions.

The module creates a two-level sparse array (Offset and Index) for numeric
values. This approach is well suited for Unicode code points where values
come in ranges (have a non-fixed gap in data between ranges).

The module is added as src/tools/GenerateSparseArray.pm.

This is necessary for further use of this code to build Sparse Array for
Unicode normalization, case, and Encoding.
---
 src/tools/GenerateSparseArray.pm | 374 +++++++++++++++++++++++++++++++
 1 file changed, 374 insertions(+)
 create mode 100644 src/tools/GenerateSparseArray.pm

diff --git a/src/tools/GenerateSparseArray.pm b/src/tools/GenerateSparseArray.pm
new file mode 100644
index 00000000000..8fef64231fa
--- /dev/null
+++ b/src/tools/GenerateSparseArray.pm
@@ -0,0 +1,374 @@
+#----------------------------------------------------------------------
+#
+# GenerateSparseArray.pm
+#    Perl module for generating compact C tables and lookup functions
+#    optimized for sparse numeric key distributions.
+#
+# The module creates a two-level sparse array (Offset and Index) for numeric
+# values. This approach is well suited for Unicode code points where values
+# come in ranges (have a non-fixed gap in data between ranges).
+#
+#
+# How it works.
+#
+# Essentially, we break numeric values into fixed ranges by N.
+#
+# The module creates two tables:
+# 1. Offset
+#     Contains offsets for the table Index.
+#     Stores the beginning of the range for a number.
+#
+# 2. Index
+#     Stores fixed ranges of a specified size.
+#     Contains the index for the data table. The data table is a user table in
+#     which the necessary data is stored.
+#
+# Algorithm for obtaining an index for a table with data:
+#     1. We have the number 1300 (Unicode code point).
+#     2. Calculate its index in the Offset table: 1300 >> SHIFT = 5
+#        (with SHIFT = 8, i.e. ranges of 256 values), which selects offset[5].
+#     3. After obtaining the offset of the range start for the Index table, we
+#        calculate the specific position among 256 values for the number 1300.
+#        RANGE_MASK = (1 << SHIFT) - 1
+#        offset[5] + (cp & RANGE_MASK)
+#
+# The zero index in the Index table is used as a dummy range (like a NULL).
+#
+# For example:
+#     use GenerateSparseArray;
+#
+#     my %data;
+#     my $gsa = GenerateSparseArray->new(8);
+#
+#     foreach my $id (0x41..0x5A, 0x61..0x7A) {
+#         $gsa->push($id);
+#         $data{$id} = $id + 10;
+#     }
+#
+#     my ($offset, $index, $func) = $gsa->generate(
+#         'latin_greek_table',
+#         'get_index',
+#         sub { $data{$_[0]} }
+#     );
+#
+#     print join("\n", $offset, $index, $func);
+#
+# Result (the integer types actually emitted are sized to the data):
+#
+# static const uint16 latin_greek_table_offset[2] =
+# {
+#     256, 0
+# };
+#
+# static const uint16 latin_greek_table_index[379] =
+# {
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 75, 76, 77,
+#     78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
+#     97, 98, 99, 100, 0, 0, 0, 0, 0, 0, 107, 108, 109, 110, 111, 112, 113, 114,
+#     115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+#     130, 131, 132
+# };
+#
+# static uint16
+# get_index(char32_t cp)
+# {
+#     uint16 offset_idx, offset;
+#
+#     offset_idx = cp >> 8;
+#
+#     if (offset_idx > 1)
+#         return 0;
+#
+#     offset = latin_greek_table_offset[offset_idx];
+#
+#     return latin_greek_table_index[offset + (cp & 255)];
+# }
+#
+# We can balance tables (gaps between data). That is, you can change the
+# density of data in a table by changing the SHIFT value. The smaller the
+# SHIFT value, the denser the index table will be, and vice versa, the larger
+# the value, the denser the offset table will be, and the index table will be
+# more sparse (more empty values).
+#
+# For example, let's modify the example above:
+#     my $gsa = GenerateSparseArray->new(4); # Not 8, but 4.
+#
+# The result for the same numbers:
+#
+# static const uint16 latin_greek_table_offset[9] =
+# {
+#     0, 0, 0, 0, 16, 32, 48, 64, 0
+# };
+#
+# static const uint16 latin_greek_table_index[75] =
+# {
+#     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 75, 76, 77, 78, 79, 80,
+#     81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+#     100, 0, 0, 0, 0, 0, 0, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+#     117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
+#     132
+# };
+#
+# static uint16
+# get_index(char32_t cp)
+# {
+#     uint16 offset_idx, offset;
+#
+#     offset_idx = cp >> 4;
+#
+#     if (offset_idx > 8)
+#         return 0;
+#
+#     offset = latin_greek_table_offset[offset_idx];
+#
+#     return latin_greek_table_index[offset + (cp & 15)];
+# }
+#
+# We can see that the tables have become more "balanced".
+#
+#
+# Disadvantages of the algorithm?
+#
+# The algorithm performs well with smoothly increasing values (such as
+# Unicode code points and character encodings). For example, values from 100
+# to 1,000,000 grouped in ranges such as 100..500, 1000..1200, 10000..30000,
+# and so on are handled well.
+#
+# For example, in the worst case scenario, let's take two ranges of numbers:
+# 10..150 and 5000000..10000000. Tables with very large dummy data will be
+# constructed for these values. It seems that these problems can be solved,
+# but this does not apply to the current tasks of using this algorithm.
+#
+# Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+#
+# src/tools/GenerateSparseArray.pm
+#
+#----------------------------------------------------------------------
+
+package GenerateSparseArray;
+
+use strict;
+use warnings FATAL => 'all';
+
+use PrettyLine;
+
+# new($range_shift)
+#    Constructor for the sparse table generator object.
+#
+#    $range_shift is the number of low key bits that select a position inside
+#    one range. A missing (or otherwise false) value is replaced by 8, which
+#    gives a range size of 2^8 = 256 entries.
+#
+#    Internal fields:
+#      keys              - array of all registered numeric keys
+#      max               - initialized to 0 here
+#      index_uint        - C type of index table entries (set by generate())
+#      offset_uint       - C type of offset table entries (set by generate())
+#      offset_table_size - largest valid offset table index (set by generate())
+#      range_shift       - bit count, controls range granularity
+#      range_size        - number of positions in one range (1 << range_shift)
+#      range_mask        - bitmask for extracting low bits ($range_size - 1)
+sub new
+{
+	my ($class, $range_shift) = @_;
+	my $shift = $range_shift || 8;
+	my $size = 1 << $shift;
+
+	my %state = (
+		keys => [],
+		max => 0,
+		index_uint => "uint16",
+		offset_uint => "uint16",
+		offset_table_size => 0,
+		range_shift => $shift,
+		range_size => $size,
+		range_mask => $size - 1);
+
+	return bless \%state, $class;
+}
+
+# push($key)
+#    Register a new numeric key in the generator state.
+#
+#    The key must be a non-negative decimal integer (digits only, nothing
+#    before or after them); otherwise the function dies with an error message.
+sub push
+{
+	my ($gsa, $key) = @_;
+
+	die "the key must be a number"
+	  unless defined $key && $key =~ /\A[0-9]+\z/;
+	CORE::push @{ $gsa->{keys} }, $key;
+}
+
+# generate($table_name, $func_name, $callback)
+#    Main generation routine that produces C tables and lookup function
+#    from all previously added keys. $callback->($key) supplies the value
+#    stored for each key; positions without a key are emitted as 0.
+#    Returns the list (Offset table, Index table, lookup function).
+sub generate
+{
+	my ($gsa, $table_name, $func_name, $callback) = @_;
+	my (@sorted, @offsets, @data, $pos, $table_size, $uint_offset);
+
+	die "no values for table generation and functions"
+	  unless scalar(@{ $gsa->{keys} });
+
+	# Sorting is not essential, but it keeps the table output consistent.
+	@sorted = sort { $a <=> $b } @{ $gsa->{keys} };
+	$table_size = (($sorted[-1] >> $gsa->{range_shift}) + 1);
+
+	# Size the offset table up front; the extra last slot is a dummy (0).
+	$offsets[$table_size] = 0;
+	$pos = $gsa->{range_size};
+
+	foreach my $key (@sorted)
+	{
+		my $offset_index = $key >> $gsa->{range_shift};
+		my $offset = $offsets[$offset_index];
+
+		unless (defined $offset)
+		{
+			$offset = $pos;
+			$offsets[$offset_index] = $offset;
+			$pos += $gsa->{range_size};
+			$uint_offset = _uint_type($offset);
+		}
+
+		my $index = $key & $gsa->{range_mask};
+		my $value = $callback->($key);
+
+		$data[ $offset + $index ] = $value;
+	}
+
+	# Pad the index table to the end of the last range; otherwise a lookup
+	# in that range above the largest key would read past the array.
+	$#data = $pos - 1;
+	my $max = (sort { $b <=> $a } grep { defined $_ } @data)[0];
+
+	$gsa->{offset_table_size} = $table_size;
+	$gsa->{offset_uint} = $uint_offset;
+	$gsa->{index_uint} = _uint_type($max);
+
+	return (
+		$gsa->_table_offset(\@offsets, $table_name),
+		$gsa->_table_index(\@data, $table_name),
+		$gsa->function($func_name, $table_name));
+}
+
+# function($func_name, $table_name)
+#    Returns C lookup code; offset_idx is uint32 so cp >> shift is not cut.
+sub function
+{
+	my ($gsa, $func_name, $table_name) = @_;
+	my $offset_name = "$table_name\_offset";
+	my $index_name = "$table_name\_index";
+	return <<FUNCTION;
+/*
+ * Lookup for tables:
+ *     $offset_name and $index_name.
+ */
+static $gsa->{index_uint}
+$func_name(char32_t cp)
+{
+	uint32		offset_idx;
+	$gsa->{offset_uint}		offset;
+
+	offset_idx = cp >> $gsa->{range_shift};
+
+	if (offset_idx > $gsa->{offset_table_size})
+		return 0;
+
+	offset = $offset_name\[offset_idx];
+
+	return $index_name\[offset + (cp & $gsa->{range_mask})];
+}
+FUNCTION
+}
+
+# _table_offset($table, $name)
+#    Returns the C declaration of the offset table with a leading comment.
+sub _table_offset
+{
+	my ($gsa, $table, $name) = @_;
+	my @header = (
+		"/*",
+		" * The table contains offsets to table $name\_index.",
+		" */");
+	my $body = $gsa->table($table, "$name\_offset", $gsa->{offset_uint});
+
+	return join("\n", @header, $body);
+}
+
+# _table_index($table, $name)
+#    Returns the C declaration of the index table, named "<name>_index".
+sub _table_index
+{
+	my ($gsa, $rows, $prefix) = @_;
+	return $gsa->table($rows, $prefix . "_index", $gsa->{index_uint});
+}
+
+# table($table, $name, $uint_type)
+#    Formats raw table data as a pretty-printed C static array declaration
+#    with element type $uint_type. Undefined entries are emitted as 0.
+sub table
+{
+	my ($gsa, $table, $name, $uint_type) = @_;
+	my (@lines, $pretty, $length);
+
+	# Direct method call: "new PrettyLine()" relies on parser heuristics.
+	$pretty = PrettyLine->new();
+	$pretty->push(defined $_ ? $_ : 0) foreach (@$table);
+	$length = $pretty->length();
+	CORE::push @lines, "static const $uint_type $name\[$length] =\n{";
+	CORE::push @lines, @{ $pretty->result() };
+	CORE::push @lines, "};";
+	return join "\n", @lines;
+}
+
+# _uint_type($number)
+#    Internal function that picks the narrowest C unsigned integer type able
+#    to hold $number.
+#    Called by generate() for the offset and index table types.
+#
+#    Returns one of the strings "uint8", "uint16", "uint32" or "uint64".
+#    Dies when $number is larger than the biggest limit tested below.
+#
+#    Notes:
+#      - The argument is forced to a number with "0 + ...". This module
+#        enables "use warnings FATAL => 'all'", so an undefined or
+#        non-numeric argument raises the corresponding warning as an error.
+#      - Only the upper bound of every type is tested, so a negative number
+#        is reported as "uint8".
+#      - The upper bound of an N-bit type is (1 << N) - 1. It is built as
+#        "all bits below the top bit" OR "the top bit", which yields the
+#        same value without ever shifting by N itself (presumably to stay
+#        within the native integer width for N = 64).
+sub _uint_type
+{
+	my $num = 0 + $_[0];
+
+	foreach my $bits (8, 16, 32, 64)
+	{
+		# Largest value representable in $bits bits is ($top - 1) | $top.
+		my $top = 1 << ($bits - 1);
+
+		return "uint$bits" if $num <= (($top - 1) | $top);
+	}
+
+	die "value is greater than uint64: $num";
+}
+
+1;
-- 
2.39.5 (Apple Git-154)