#!/usr/bin/perl

# Generate benchmark SQL files for PostgreSQL normalize() function.
#
# Reads NormalizationTest.txt (Unicode UCD) and produces four SQL files:
#   normalize_nfc.sql  -- benchmark for NFC  normalization
#   normalize_nfd.sql  -- benchmark for NFD  normalization
#   normalize_nfkc.sql -- benchmark for NFKC normalization
#   normalize_nfkd.sql -- benchmark for NFKD normalization
#
# Each file contains a single SELECT normalize(U&'...', <FORM>) statement
# with ~100000 codepoints.
#
# Usage:
#   perl generate_NF_C_D_KC_KD_sql.pl
#
# Prerequisites:
#   NormalizationTest.txt must be in the current directory.
#   Download from https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt
#
# Example pgbench invocation to run one of the generated SQL files:
#   pgbench --port=5432 --time=60 --progress=10 -f normalize_nfc.sql test
#

use strict;
use warnings FATAL => 'all';

my $norm_test_file = 'NormalizationTest.txt';

# One set of input sequences per normalization form.  Sequences are stored
# as hash keys so that duplicates collapse; the values carry no meaning.
my (%nfc_seqs, %nfd_seqs, %nfkc_seqs, %nfkd_seqs);

open my $fh, '<', $norm_test_file or die "Cannot open $norm_test_file: $!";

while (my $line = <$fh>) {
    # Only non-comment lines containing ';'-separated columns are of interest
    next if $line =~ /^#/ or $line !~ /;/;

    $line =~ s/\n$//;

    # First five columns: the first one is used as an input for every form;
    # the next four hold its NFC, NFD, NFKC and NFKD forms, in that order.
    my ($src, $nfc, $nfd, $nfkc, $nfkd) = split /;/, $line;

    # Trim spaces
    foreach my $col ($src, $nfc, $nfd, $nfkc, $nfkd) {
        $col =~ s/^\s+|\s+$//g;
    }

    # Add to NFC test strings if input != NFC(input)
    foreach my $input ($nfd, $src) {
        $nfc_seqs{$input} = 1 if $input && $nfc && $input ne $nfc;
    }

    # Add to NFD test strings if input != NFD(input)
    foreach my $input ($nfc, $src) {
        $nfd_seqs{$input} = 1 if $input && $nfd && $input ne $nfd;
    }

    # Add to NFKC test strings if input != NFKC(input)
    foreach my $input ($nfkd, $src) {
        $nfkc_seqs{$input} = 1 if $input && $nfkc && $input ne $nfkc;
    }

    # Add to NFKD test strings if input != NFKD(input)
    foreach my $input ($nfkc, $src) {
        $nfkd_seqs{$input} = 1 if $input && $nfkd && $input ne $nfkd;
    }
}

close $fh;

# Convert a sequence of codepoints into PostgreSQL Unicode escapes.
#
# $seq is a string of whitespace-separated hexadecimal codepoints
# (e.g. "0044 0307").
#
# Returns a reference to an array holding one escape per codepoint, in the
# order given: \XXXX for codepoints written with at most four hex digits,
# \+XXXXXX for longer ones.  The escapes are meant to be concatenated inside
# a U&'...' string literal.
sub to_pg_string {
    my ($seq) = @_;
    my @res;

    # split ' ' skips leading whitespace and collapses runs of whitespace,
    # so stray spaces never yield an empty token (which hex() would turn
    # into a spurious \0000 escape).
    foreach my $cp (split ' ', $seq) {
        if (length($cp) <= 4) {
            push @res, sprintf("\\%04X", hex($cp));
        } else {
            push @res, sprintf("\\+%06X", hex($cp));
        }
    }

    return \@res;
}

# Write one benchmark SQL file.
#
# $filename  - path of the SQL file to create (overwritten if it exists)
# $form      - normalization form name placed verbatim in the statement
# $seqs_ref  - reference to a hash whose keys are codepoint sequences in
#              the format accepted by to_pg_string()
#
# The file receives a single "SELECT normalize(U&'...', $form);" statement.
# The string literal is built from all sequences in sorted key order,
# repeated as whole passes until at least 100000 codepoints are present.
#
# Dies if the sequences contain no codepoints at all, or if the output file
# cannot be opened or closed.
sub generate_sql {
    my ($filename, $form, $seqs_ref) = @_;

    # Escape every sequence once, in sorted order so output is reproducible;
    # each pass of the fill loop below reuses these strings.
    my @chunks;
    my $codepoints_per_pass = 0;
    foreach my $seq (sort { $a cmp $b } keys %$seqs_ref) {
        my $escapes = to_pg_string($seq);

        push @chunks, join('', @$escapes);
        $codepoints_per_pass += scalar(@$escapes);
    }

    # With nothing to add per pass, the fill loop could never reach its
    # target and would spin forever; fail loudly instead.
    die "No codepoints available to generate $filename ($form)"
        if $codepoints_per_pass == 0;

    # Loop over sequences multiple times if necessary to reach ~100000 characters
    my @res;
    my $char_count = 0;
    while ($char_count < 100000) {
        push @res, @chunks;
        $char_count += $codepoints_per_pass;
    }

    open my $out, '>', $filename or die "Cannot open $filename: $!";
    print $out "SELECT normalize(U&'", join('', @res), "', $form);\n";
    # Buffered write errors only surface when the handle is closed
    close $out or die "Cannot close $filename: $!";

    print "Generated $filename with $char_count codepoints.\n";
}

# Produce one SQL file per normalization form: output file name, form name
# passed to normalize(), and the set of input sequences collected for it.
foreach my $job (
    ["normalize_nfc.sql",  "NFC",  \%nfc_seqs],
    ["normalize_nfd.sql",  "NFD",  \%nfd_seqs],
    ["normalize_nfkc.sql", "NFKC", \%nfkc_seqs],
    ["normalize_nfkd.sql", "NFKD", \%nfkd_seqs],
) {
    my ($sql_file, $form_name, $sequences) = @$job;
    generate_sql($sql_file, $form_name, $sequences);
}
