From: Noah Misch Roughly extend string function tests to multibyte+toasted inputs. Not for commit. This operates by two mechanisms: - ascii2utf8sql.pl: mechanically translate selected src/test/regress/sql files to replace string literals with multibyte string literals. This is a cheap way to extend existing string function coverage to multibyte inputs. - fmgr.c, pg_lzcompress.c: automatically toast-compress function arguments of type "text". Don't avoid compression just because the input is small. Tests are not intended to pass. Setup a server for installcheck, then test with: sh run.sh make installcheck-parallel; grep +ERROR src/test/regress/regression.diffs Many additional errors appear, due to the crudity of this change. The key is the presence or absence of additional "invalid byte sequence for encoding" errors. The master branch gets 3 of those errors before commit 9f4fd11, and it gets 0 after that regression fix. The query "SELECT SUBSTRING(repeat(U&'\2026', 4000) FROM 1 FOR 1);" is another example that reaches the error after these changes (and with a revert of commit 9f4fd11). It doesn't normally reach toast compression, but these changes make it do so. Gemini 3 wrote much this. Discussion: https://postgr.es/m/FIXME diff --git a/ascii2utf8sql.pl b/ascii2utf8sql.pl new file mode 100644 index 0000000..03c4e3f --- /dev/null +++ b/ascii2utf8sql.pl @@ -0,0 +1,310 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use utf8; + +# Ensure standard filehandles handle UTF-8 properly +binmode STDIN, ":encoding(UTF-8)"; +binmode STDOUT, ":encoding(UTF-8)"; + +my %map; +my %regex_metachars = map { $_ => 1 } split //, '<.*+?|()[]{}^$\\-:='; + +# Exact strings that should bypass the transformation +my %denylist = ( + 'UTF8' => 1, + '\x' => 1); + +# Helper to map uppercase/lowercase pairs identically across byte boundaries +sub add_pair +{ + my ($u_ascii, $l_ascii, $u_uni, $l_uni) = @_; + $map{$u_ascii} = chr($u_uni); + $map{$l_ascii} = chr($l_uni); +} + +# 1. Letters mapping (Preserving Postgres casing rules) +# A-C: 1-byte +add_pair('A', 'a', 0x0041, 0x0061); +add_pair('B', 'b', 0x0042, 0x0062); +add_pair('C', 'c', 0x0043, 0x0063); + +# D-K: 2-byte (Latin Extended-A) +add_pair('D', 'd', 0x010A, 0x010B); # Ċ/ċ +add_pair('E', 'e', 0x0112, 0x0113); # Ē/ē +add_pair('F', 'f', 0x011E, 0x011F); # Ğ/ğ +add_pair('G', 'g', 0x0122, 0x0123); # Ģ/ģ +add_pair('H', 'h', 0x0128, 0x0129); # Ĩ/ĩ +add_pair('I', 'i', 0x012A, 0x012B); # Ī/ī +add_pair('J', 'j', 0x0132, 0x0133); # IJ/ij +add_pair('K', 'k', 0x0136, 0x0137); # Ķ/ķ + +# L-S: 3-byte (Fullwidth Latin) +for my $i (11 .. 18) +{ + add_pair(chr(0x41 + $i), chr(0x61 + $i), 0xFF21 + $i, 0xFF41 + $i); +} + +# T-Z: 4-byte (Deseret) +for my $i (19 .. 25) +{ + add_pair(chr(0x41 + $i), chr(0x61 + $i), 0x10400 + $i, 0x10428 + $i); +} + +# 2. Digits mapping +$map{'0'} = '0'; # 1-byte +$map{'1'} = chr(0x0661); # 2-byte (Arabic-Indic) +$map{'2'} = chr(0x0662); +$map{'3'} = chr(0x0663); +$map{'4'} = chr(0xFF14); # 3-byte (Fullwidth) +$map{'5'} = chr(0xFF15); +$map{'6'} = chr(0xFF16); +$map{'7'} = chr(0x1D7E9); # 4-byte (Math Sans-Serif) +$map{'8'} = chr(0x1D7EA); +$map{'9'} = chr(0x1D7EB); + +# 3. Other ASCII characters +$map{'\\'} = '\\'; # Lock backslash to 1-byte to protect E'\n' escapes +my @symbols; +for my $i (32 .. 126) +{ + my $c = chr($i); + push @symbols, $c if $c !~ /[A-Za-z0-9']/ && !exists $map{$c}; +} + +my $base2_sym = 0x00A1; # ¡ (2-byte) +my $base3_sym = 0x2010; # ‐ (3-byte) +my $base4_sym = 0x1D300; # 𝌀 (4-byte) + +for my $i (0 .. $#symbols) +{ + my $c = $symbols[$i]; + if ($i < 3) + { + $map{$c} = $c; + } + elsif ($i < 13) + { + $map{$c} = chr($base2_sym++); + } + elsif ($i < 23) + { + $map{$c} = chr($base3_sym++); + } + else + { + $map{$c} = chr($base4_sym++); + } +} + +sub transform_string +{ + my ($text, $mode) = @_; + + return $text if $mode eq 'control'; + return $text if $denylist{$text}; + + # For format strings, protect PostgreSQL %-escapes + # (e.g. %s, %I, %L, %% and parametrized ones like %1$s, %-10s) + if ($mode eq 'format') + { + my $res = ""; + # Scans the string chunk by chunk, matching anything up to a format specifier or end of string. + # The specifier regex captures: %, optional -, numbers, $, *, and the final type letter (s, I, L, or %). + while ($text =~ /\G(.*?)(%[-0-9\$*]*[sIL%]|$)/gcs) + { + my $pre = $1; + my $spec = $2; + + # Recursively apply 'normal' transformation to the text between format specifiers + $res .= transform_string($pre, 'normal') if length $pre; + $res .= $spec if length $spec; + } + return $res; + } + + my $res = ""; + my $is_regex = ($mode eq 'regex'); + my @chars = split //, $text; + + for (my $i = 0; $i < @chars; $i++) + { + my $c = $chars[$i]; + + # Pass through doubled single quotes untouched + if ($c eq "'" && $i + 1 < @chars && $chars[ $i + 1 ] eq "'") + { + $res .= "''"; + $i++; + next; + } + + if ($is_regex) + { + # Skip transformation for escaped regex sequences (e.g., \d, \s) + if ($c eq '\\' && $i + 1 < @chars) + { + $res .= $c . $chars[ $i + 1 ]; + $i++; + next; + } + if ($regex_metachars{$c}) + { + $res .= $c; + next; + } + } + + $res .= $map{$c} // $c; + } + return $res; +} + +# Slurp the entire input stream +local $/ = undef; +my $sql = ; +my $out = ""; + +# Stack tracks nested function calls: { name => 'regexp_replace', arg => 0 } +my @stack; +my $last_id = ""; + +pos($sql) = 0; + +# The regex evaluates alternatives in order. +# It matches comments and dollar-quotes first, returning them unmodified. +# It only captures and transforms single-quoted strings. +while ( + $sql =~ m/\G( + (--.*?\n) # $2: single-line comment + | (\/\*.*?\*\/) # $3: multi-line comment + | (\$([a-zA-Z0-9_]*)\$.*?\$\5\$) # $4: dollar quote, $5: tag + | (([EeuU]&?)?('(?:[^']|'')*')) # $6: full string, $7: prefix, $8: content + | ([a-zA-Z_][a-zA-Z0-9_]*) # $9: identifier + | ([(),]) # $10: punctuation + | (\s+) # $11: whitespace + | (.) # $12: other +)/gsx) +{ + if (defined $2) + { + $out .= $2; + $last_id = ""; + } + elsif (defined $3) + { + $out .= $3; + $last_id = ""; + } + elsif (defined $4) + { + $out .= $4; + $last_id = ""; + } + elsif (defined $6) + { + my $prefix = $7 // ""; + my $literal = $8; + $literal =~ s/^'//; + $literal =~ s/'$//; + + my $mode = 'normal'; + + # 1. Apply function argument rules based on the stack + if (@stack) + { + my $func = lc($stack[-1]{name}); + my $arg_idx = $stack[-1]{arg}; + + # Apply arg index rules for regexp functions + if ($func =~ /^regexp_/) + { + if ($arg_idx == 0) + { + $mode = 'normal'; + } + elsif ($arg_idx == 1) + { + $mode = 'regex'; + } + elsif ($arg_idx == 2 && $func eq 'regexp_replace') + { + $mode = 'normal'; + } + else + { + $mode = 'control'; + } + } + elsif ($func eq 'encode') + { + if ($arg_idx == 0 || $arg_idx == 1) + { + $mode = 'control'; + } + } + elsif ($func eq 'decode') + { + if ($arg_idx == 1) + { + $mode = 'control'; + } + } + elsif ($func eq 'format') + { + if ($arg_idx == 0) + { + $mode = 'format'; + } + } + } + + # 2. Look ahead in the unparsed text to see if it's cast to bytea or regclass + my $remainder = substr($sql, pos($sql)); + if ($remainder =~ /^\s*::\s*(?:bytea|regclass)\b/i) + { + $mode = 'control'; + } + + $out .= $prefix . "'" . transform_string($literal, $mode) . "'"; + $last_id = ""; + } + elsif (defined $9) + { + $out .= $9; + # Exclude common SQL syntax keywords from being treated as function names + if ($9 !~ /^(from|for|as|and|or|not|in)$/i) + { + $last_id = $9; + } + } + elsif (defined $10) + { + my $p = $10; + $out .= $p; + if ($p eq '(') + { + push @stack, { name => $last_id, arg => 0 }; + } + elsif ($p eq ',') + { + $stack[-1]{arg}++ if @stack; + } + elsif ($p eq ')') + { + pop @stack if @stack; + } + $last_id = ""; + } + elsif (defined $11) + { + $out .= $11; + } + elsif (defined $12) + { + $out .= $12; + $last_id = ""; + } +} + +print $out; diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..b000c34 --- /dev/null +++ b/run.sh @@ -0,0 +1,24 @@ +# To let coverage gaps inform which files to translate, I used bpftrace to get +# the call stacks that reach a pg_mblen* call w/ retval=1 but reach no such +# call with retval!=1. + +cd src/test/regress +# fairly clean +tests='text strings regex' +# much noisier +tests="$tests arrays collate.icu.utf8 tsdicts tsearch tsrf tstypes" +# expects v15+; v14 lacks test_setup +runnable=test_setup +for t in $tests; do + perl ../../../ascii2utf8sql.pl sql/$t.utf8.sql + # TODO script not ready for expected outputs: deletes more than it should + perl ../../..//ascii2utf8sql.pl expected/$t.utf8.out + runnable="$runnable $t.utf8" +done +cd ../../.. +make installcheck-tests TESTS="$runnable tablespace" "$@" +echo '==== all added errors' +echo '==== (includes false positives from flawed munging of expected output)' +grep '^[+]ERROR' src/test/regress/regression.diffs | sort | uniq -c +echo '==== all examples of goal error: invalid byte sequence for encoding' +grep 'invalid byte sequence for encoding' src/test/regress/regression.diffs diff --git a/src/backend/utils/fmgr/fmgr.c b/src/backend/utils/fmgr/fmgr.c index 4e26df7..9b51df5 100644 --- a/src/backend/utils/fmgr/fmgr.c +++ b/src/backend/utils/fmgr/fmgr.c @@ -34,11 +34,188 @@ #include "utils/lsyscache.h" #include "utils/syscache.h" +#include "access/xact.h" +#include "access/toast_compression.h" +#include "access/toast_internals.h" +#include "utils/fmgroids.h" + +/* https://gemini.google.com/u/1/app/dc868a99fe48c7e8 */ +/* stack of nested function pointers */ +#define MAX_NESTED_CALLS 1024 +static PGFunction fn_addr_stack[MAX_NESTED_CALLS]; +static int fn_addr_stack_ptr = 0; + +/* + * + * The wrapper function that receives the fully populated FunctionCallInfo, + * compresses varlenas in place, and calls the original function. + */ +static Datum +my_function_wrapper(FunctionCallInfo fcinfo) +{ + PGFunction orig_fn; + Datum result; + bool modified = false; + + /* Allocate backup array on the stack (max args in Postgres is 100) */ + Datum saved_values[100]; + + if (fn_addr_stack_ptr > 0) + orig_fn = fn_addr_stack[fn_addr_stack_ptr - 1]; + else + elog(ERROR, "test_toast_hook: fn_addr_stack empty"); + + /* Backup and compress */ + for (int i = 0; i < fcinfo->nargs; i++) + { + saved_values[i] = fcinfo->args[i].value; + + if (!fcinfo->args[i].isnull && get_fn_expr_argtype(fcinfo->flinfo, i) == TEXTOID) + { + Datum val = fcinfo->args[i].value; + + if (!VARATT_IS_EXTENDED(DatumGetPointer(val))) + { + Datum cval = toast_compress_datum(val, TOAST_PGLZ_COMPRESSION); + + if (cval) + { + fcinfo->args[i].value = cval; + modified = true; + } + } + } + } + + /* Execute */ + result = orig_fn(fcinfo); + + /* Restore the original pointers to protect the ExprState cache */ + if (modified) + { + for (int i = 0; i < fcinfo->nargs; i++) + { + fcinfo->args[i].value = saved_values[i]; + } + } + + return result; +} + +/* + * Force Postgres to route all function calls through fmgr_security_definer. + */ +#define MAX_CACHED_OIDS 65536 +/* + * 0 = unknown, 1 = hook needed, 2 = hook not needed + * + * Exempt functions used in a search of the PROCOID system, to avoid infinite + * recursion. We know none of these have text args. This list is probably + * incomplete, since associated relcache rebuilds are possible but rare. + * FIXME test w/ debug_discard_caches. + */ +static uint8 oid_hook_cache[MAX_CACHED_OIDS] = { + [F_BTHANDLER] = 2, + [F_BTOIDCMP] = 2, + [F_BTTEXTCMP] = 2, + [F_HEAP_TABLEAM_HANDLER] = 2, + [F_INT2GT] = 2, + [F_OIDEQ] = 2, +}; + +static bool +my_needs_fmgr_hook(Oid fn_oid) +{ + bool needs_hook = false; + HeapTuple tup; + + /* Fast path: check local cache */ + if (fn_oid < MAX_CACHED_OIDS) + { + if (oid_hook_cache[fn_oid] == 1) + return true; + if (oid_hook_cache[fn_oid] == 2) + return false; + } + + /* + * * If we cannot access the catalog safely, assume false to prevent + * crashes. The cache will remain 'unknown' and will be populated during + * the next in-transaction execution. + */ + if (!IsTransactionState()) + return false; + + /* + * So far, infinite recursion has always been associated with a btree + * opclass member relevant to the PROCOID syscache or its recursive + * dependencies. The solution has been to add the member to the + * oid_hook_cache initializer. + */ + if (stack_is_too_deep()) + elog(ERROR, "stack depth for fn_oid=%u", fn_oid); + + /* Safe to query the catalog to inspect the function's arguments */ + tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(fn_oid)); + + if (HeapTupleIsValid(tup)) + { + Form_pg_proc procForm = (Form_pg_proc) GETSTRUCT(tup); + + /* Check if any argument requires TEXTOID */ + for (int i = 0; i < procForm->pronargs; i++) + { + if (procForm->proargtypes.values[i] == TEXTOID) + { + needs_hook = true; + break; + } + } + ReleaseSysCache(tup); + } + + /* Populate the cache */ + if (fn_oid < MAX_CACHED_OIDS) + { + oid_hook_cache[fn_oid] = needs_hook ? 1 : 2; + } + + return needs_hook; +} + +/* + * Hook the execution cycle to swap the function pointer. + */ +static void +my_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private_data) +{ + if (event == FHET_START) + { + /* Stash the real function pointer */ + if (fn_addr_stack_ptr < MAX_NESTED_CALLS) + fn_addr_stack[fn_addr_stack_ptr++] = flinfo->fn_addr; + else + elog(ERROR, "test_toast_hook: MAX_NESTED_CALLS exceeded"); + + /* Replace the execution pointer with our argument-mutating wrapper */ + flinfo->fn_addr = my_function_wrapper; + } + else if (event == FHET_END || event == FHET_ABORT) + { + /* Pop the stack AND restore the original function pointer */ + if (fn_addr_stack_ptr > 0) + { + fn_addr_stack_ptr--; + flinfo->fn_addr = fn_addr_stack[fn_addr_stack_ptr]; + } + } +} + /* * Hooks for function calls */ -PGDLLIMPORT needs_fmgr_hook_type needs_fmgr_hook = NULL; -PGDLLIMPORT fmgr_hook_type fmgr_hook = NULL; +PGDLLIMPORT needs_fmgr_hook_type needs_fmgr_hook = my_needs_fmgr_hook; +PGDLLIMPORT fmgr_hook_type fmgr_hook = my_fmgr_hook; /* * Hashtable for fast lookup of external C functions @@ -164,7 +341,9 @@ fmgr_info_cxt_security(Oid functionId, FmgrInfo *finfo, MemoryContext mcxt, finfo->fn_mcxt = mcxt; finfo->fn_expr = NULL; /* caller may set this later */ - if ((fbp = fmgr_isbuiltin(functionId)) != NULL) + if ((fbp = fmgr_isbuiltin(functionId)) != NULL && + (!IsNormalProcessingMode() || + !FmgrHookIsNeeded(functionId))) { /* * Fast path for builtin functions: don't bother consulting pg_proc diff --git a/src/common/pg_lzcompress.c b/src/common/pg_lzcompress.c index 75d529d..aec17c6 100644 --- a/src/common/pg_lzcompress.c +++ b/src/common/pg_lzcompress.c @@ -233,7 +233,6 @@ static const PGLZ_Strategy strategy_default_data = { 10 /* Lower good match size by 10% at every loop * iteration */ }; -const PGLZ_Strategy *const PGLZ_strategy_default = &strategy_default_data; static const PGLZ_Strategy strategy_always_data = { @@ -246,6 +245,7 @@ static const PGLZ_Strategy strategy_always_data = { 6 /* Look harder for a good match */ }; const PGLZ_Strategy *const PGLZ_strategy_always = &strategy_always_data; +const PGLZ_Strategy *const PGLZ_strategy_default = &strategy_always_data; /* ---------- diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index f38688b..6e0b628 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -1455,13 +1455,13 @@ ERROR: regexp_split_to_array() does not support the "global" option -- change NULL-display back \pset null '' -- E021-11 position expression -SELECT POSITION('4' IN '1234567890') = '4' AS "4"; +SELECT POSITION('4' IN '1234567890') = $int4$4$int4$ AS "4"; 4 --- t (1 row) -SELECT POSITION('5' IN '1234567890') = '5' AS "5"; +SELECT POSITION('5' IN '1234567890') = $int4$5$int4$ AS "5"; 5 --- t diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index d8a0973..b26ae4d 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -395,9 +395,9 @@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', \pset null '' -- E021-11 position expression -SELECT POSITION('4' IN '1234567890') = '4' AS "4"; +SELECT POSITION('4' IN '1234567890') = $int4$4$int4$ AS "4"; -SELECT POSITION('5' IN '1234567890') = '5' AS "5"; +SELECT POSITION('5' IN '1234567890') = $int4$5$int4$ AS "5"; SELECT POSITION('\x11'::bytea IN ''::bytea) = 0 AS "0"; SELECT POSITION('\x33'::bytea IN '\x1122'::bytea) = 0 AS "0";