#!/usr/bin/perl
# Generate benchmark SQL files for PostgreSQL normalize() function.
#
# Reads NormalizationTest.txt (Unicode UCD) and produces four SQL files:
#   normalize_nfc.sql  -- benchmark for NFC normalization
#   normalize_nfd.sql  -- benchmark for NFD normalization
#   normalize_nfkc.sql -- benchmark for NFKC normalization
#   normalize_nfkd.sql -- benchmark for NFKD normalization
#
# Each file contains a single SELECT normalize(U&'...', FORM) statement
# (FORM being NFC, NFD, NFKC or NFKD) with ~100000 codepoints.
#
# Usage:
#   perl generate_NF_C_D_KC_KD_sql.pl
#
# Prerequisites:
#   NormalizationTest.txt must be in the current directory.
#   Download from https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt
#
# Example pgbench run for the generated SQL files:
#   pgbench --port=5432 --time=60 --progress=10 -f normalize_nfc.sql test
#

use strict;
use warnings FATAL => 'all';

my $norm_test_file = 'NormalizationTest.txt';

# Sets (hash keys) of space-separated hex codepoint sequences that are NOT
# already in the given normalization form, i.e. inputs that make normalize()
# do real work.
my %nfc_seqs;
my %nfd_seqs;
my %nfkc_seqs;
my %nfkd_seqs;

open my $fh, '<', $norm_test_file
  or die "Cannot open $norm_test_file: $!";

while (my $line = <$fh>)
{
	# Skip comment lines and anything without column separators
	# (e.g. the "@PartN" section markers).
	next if $line =~ /^#/;
	next unless $line =~ /;/;
	chomp $line;

	my ($c1, $c2, $c3, $c4, $c5) = split /;/, $line;

	# A short line would otherwise abort below with an obscure
	# "uninitialized value" fatal warning; report it clearly instead.
	die "$norm_test_file line $.: expected at least 5 ';'-separated columns\n"
	  unless defined $c5;

	# Trim spaces
	s/^\s+|\s+$//g for ($c1, $c2, $c3, $c4, $c5);

	# Add to NFC test strings if input != NFC(input)
	$nfc_seqs{$c3} = 1 if $c3 && $c2 && $c3 ne $c2;
	$nfc_seqs{$c1} = 1 if $c1 && $c2 && $c1 ne $c2;

	# Add to NFD test strings if input != NFD(input)
	$nfd_seqs{$c2} = 1 if $c2 && $c3 && $c2 ne $c3;
	$nfd_seqs{$c1} = 1 if $c1 && $c3 && $c1 ne $c3;

	# Add to NFKC test strings if input != NFKC(input)
	$nfkc_seqs{$c5} = 1 if $c5 && $c4 && $c5 ne $c4;
	$nfkc_seqs{$c1} = 1 if $c1 && $c4 && $c1 ne $c4;

	# Add to NFKD test strings if input != NFKD(input)
	$nfkd_seqs{$c4} = 1 if $c4 && $c5 && $c4 ne $c5;
	$nfkd_seqs{$c1} = 1 if $c1 && $c5 && $c1 ne $c5;
}
close $fh;

# Convert a whitespace-separated list of hex codepoints into escapes for a
# PostgreSQL U&'...' string literal.
#
# Codepoints written with up to four hex digits become \XXXX, longer ones
# become \+XXXXXX.
#
# Returns a reference to an array holding one escape per codepoint.
sub to_pg_string
{
	my ($seq) = @_;
	my @res;

	# split ' ' (rather than / /) ignores repeated or leading whitespace, so
	# no empty token can sneak in and be emitted as a bogus \0000 escape.
	foreach my $cp (split ' ', $seq)
	{
		if (length($cp) <= 4)
		{
			push @res, sprintf("\\%04X", hex($cp));
		}
		else
		{
			push @res, sprintf("\\+%06X", hex($cp));
		}
	}
	return \@res;
}

# Write $filename containing a single
#   SELECT normalize(U&'...', $form);
# statement.
#
# The string literal is built from the keys of %$seqs_ref in sorted order,
# and that whole run is repeated as many times as needed to reach at least
# 100000 codepoints.
#
# Dies if there are no sequences to emit, or if the file cannot be opened,
# written or closed.
sub generate_sql
{
	my ($filename, $form, $seqs_ref) = @_;
	my $min_codepoints = 100000;

	# Escape every sequence once; each repetition emits the same text, so
	# there is no need to redo the conversion on every pass.
	my $pass_sql = '';
	my $pass_count = 0;
	foreach my $seq (sort { $a cmp $b } keys %$seqs_ref)
	{
		my $escapes = to_pg_string($seq);
		$pass_sql .= join('', @$escapes);
		$pass_count += scalar(@$escapes);
	}

	# With nothing to emit the repeat loop below could never reach the
	# target and would spin forever.
	die "No test sequences collected for $form; cannot generate $filename\n"
	  if $pass_count == 0;

	# Repeat whole passes until we reach ~100000 codepoints.
	my $passes = 0;
	my $char_count = 0;
	while ($char_count < $min_codepoints)
	{
		$passes++;
		$char_count += $pass_count;
	}

	open my $out, '>', $filename
	  or die "Cannot open $filename: $!";
	print $out "SELECT normalize(U&'", $pass_sql x $passes, "', $form);\n"
	  or die "Cannot write to $filename: $!";

	# Buffered write errors (e.g. disk full) only surface at close.
	close $out
	  or die "Cannot close $filename: $!";

	print "Generated $filename with $char_count codepoints.\n";
}

generate_sql("normalize_nfc.sql", "NFC", \%nfc_seqs);
generate_sql("normalize_nfd.sql", "NFD", \%nfd_seqs);
generate_sql("normalize_nfkc.sql", "NFKC", \%nfkc_seqs);
generate_sql("normalize_nfkd.sql", "NFKD", \%nfkd_seqs);