diff --git a/src/common/wchar.c b/src/common/wchar.c
index 0636b8765b..f48d79638c 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -13,8 +13,41 @@
 #include "c.h"
 
 #include "mb/pg_wchar.h"
+#include "port/pg_bswap.h"
 
 
+/* Verify a chunk of bytes for valid ASCII including a zero-byte check. */
+static inline int
+check_ascii(const uint64 chunk)
+{
+	uint64
+				highbits_set,
+				highbit_carry;
+
+	/* Check if any bytes in this chunk have the high bit set. */
+	highbits_set = chunk & UINT64CONST(0x8080808080808080);
+	if (highbits_set)
+		return 0;
+
+	/*
+	 * Check if there are any zero bytes in this chunk.
+	 *
+	 * First, add 0x7f to each byte. This sets the high bit in each byte,
+	 * unless it was a zero. We already checked that none of the bytes had the
+	 * high bit set previously, so the max value each byte can have after the
+	 * addition is 0x7f + 0x7f = 0xfe, and we don't need to worry about
+	 * carrying over to the next byte.
+	 */
+	highbit_carry = chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f);
+
+	/* Then check that the high bit is set in each byte. */
+	highbit_carry &= UINT64CONST(0x8080808080808080);
+	if (highbit_carry == UINT64CONST(0x8080808080808080))
+		return sizeof(chunk);
+	else
+		return 0;
+}
+
 /*
  * Operations on multi-byte encodings are driven by a table of helper
  * functions.
@@ -1728,6 +1761,67 @@ pg_gb18030_verifystr(const unsigned char *s, int len)
 	return s - start;
 }
 
+/*
+ * Workhorse for the fast path of pg_utf8_verifystr(). chunk_orig is 8 raw
+ * string bytes; it is converted to network byte order so the first byte is
+ * most significant. Returns the character's byte length, or -1 if invalid.
+ */
+static inline int
+pg_utf8_verifychar_internal(const uint64 chunk_orig)
+{
+	const uint64 chunk = (pg_hton64(chunk_orig));
+
+	/* high bit should be set */
+	Assert((chunk & 0x8000000000000000) != 0);
+
+	/* 2-byte lead with one continuation byte */
+	if ((chunk & 0xE0C0000000000000) == 0xC080000000000000)
+	{
+		/* check 2-byte overlong: 1100.000x.10xx.xxxx */
+		if (chunk < 0xC200000000000000)
+			return -1;
+
+		/* found valid sequence for code points U+0080 through U+07FF */
+		return 2;
+	}
+	/* 3-byte lead with two continuation bytes */
+	else if ((chunk & 0xF0C0C00000000000) == 0xE080800000000000)
+	{
+		/* check 3-byte overlong: 1110.0000 100x.xxxx 10xx.xxxx */
+		if (chunk < 0xE0A0000000000000)
+			return -1;
+
+		/* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */
+		if (chunk > 0xED9FBFFFffffffff && chunk < 0xEE00000000000000)
+			return -1;
+
+		/*
+		 * found valid sequence for code points U+0800 through U+D7FF or
+		 * U+E000 through U+FFFF
+		 */
+		return 3;
+	}
+	/* 4-byte lead with three continuation bytes */
+	else if ((chunk & 0xF8C0C0C000000000) == 0xF080808000000000)
+	{
+		/*
+		 * check 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx
+		 */
+		if (chunk < 0xF090000000000000)
+			return -1;
+
+		/* check too large: 1111.0100 1001.xxxx 10xx.xxxx 10xx.xxxx */
+		if (chunk > 0xF48FBFBFffffffff)
+			return -1;
+
+		/* found valid sequence for code points U+010000 through U+10FFFF */
+		return 4;
+	}
+	else
+		/* invalid byte */
+		return -1;
+}
+
 static int
 pg_utf8_verifychar(const unsigned char *s, int len)
 {
@@ -1761,28 +1855,62 @@ static int
 pg_utf8_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	uint64		chunk;
 
-	while (len > 0)
+	/*
+	 * Fast path while at least one full chunk remains, so that check_ascii()
+	 * and pg_utf8_verifychar_internal() need no length checks. The cast keeps
+	 * the comparison signed: a negative len must not wrap around to size_t.
+	 */
+	while (len >= (int) sizeof(chunk))
 	{
 		int			l;
 
+		memcpy(&chunk, s, sizeof(chunk));
+
 		/* fast path for ASCII-subset characters */
+		l = check_ascii(chunk);
+		if (l)
+			goto advance;
+
+		/*
+		 * Not a clean ASCII chunk, so verify one character; try ASCII first.
+		 */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
-				break;
-			l = 1;
-		}
-		else
-		{
-			l = pg_utf8_verifychar(s, len);
-			if (l == -1)
-				break;
+				goto end;
+			else
+			{
+				l = 1;
+				goto advance;
+			}
 		}
+
+		/* Validate the multibyte character using the chunk already loaded. */
+		l = pg_utf8_verifychar_internal(chunk);
+		if (l == -1)
+			goto end;
+
+advance:
+		s += l;
+		len -= l;
+	}
+
+	/* Slow path to handle the last few bytes in the string */
+	while (len > 0)
+	{
+		int			l;
+
+		l = pg_utf8_verifychar(s, len);
+		if (l == -1)
+			goto end;
+
 		s += l;
 		len -= l;
 	}
 
+end:
 	return s - start;
 }
 
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 04fdcba496..92b5df62c8 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -72,6 +72,91 @@ $$;
 --
 -- UTF-8
 --
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY); +insert into utf8_verification_inputs values + ('\xaf', 'bare continuation'), + ('\xc5', 'missing second byte in 2-byte char'), + ('\xc080', 'smallest 2-byte overlong'), + ('\xc1bf', 'largest 2-byte overlong'), + ('\xc280', 'next 2-byte after overlongs'), + ('\xdfbf', 'largest 2-byte'), + ('\xe9af', 'missing third byte in 3-byte char'), + ('\xe08080', 'smallest 3-byte overlong'), + ('\xe09fbf', 'largest 3-byte overlong'), + ('\xe0a080', 'next 3-byte after overlong'), + ('\xed9fbf', 'last before surrogates'), + ('\xeda080', 'smallest surrogate'), + ('\xedbfbf', 'largest surrogate'), + ('\xee8080', 'next after surrogates'), + ('\xefbfbf', 'largest 3-byte'), + ('\xf1afbf', 'missing fourth byte in 4-byte char'), + ('\xf0808080', 'smallest 4-byte overlong'), + ('\xf08fbfbf', 'largest 4-byte overlong'), + ('\xf0908080', 'next 4-byte after overlong'), + ('\xf48fbfbf', 'largest 4-byte'), + ('\xf4908080', 'smallest too large'), + ('\xfa9a9a8a8a', '5-byte'), + ('\x66006f', 'NUL byte'); +-- Test UTF-8 verification +select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs; + description | result | errorat | error +------------------------------------+------------+--------------+---------------------------------------------------------------- + bare continuation | \x | \xaf | invalid byte sequence for encoding "UTF8": 0xaf + missing second byte in 2-byte char | \x | \xc5 | invalid byte sequence for encoding "UTF8": 0xc5 + smallest 2-byte overlong | \x | \xc080 | invalid byte sequence for encoding "UTF8": 0xc0 0x80 + largest 2-byte overlong | \x | \xc1bf | invalid byte sequence for encoding "UTF8": 0xc1 0xbf + next 2-byte after overlongs | \xc280 | | + largest 2-byte | \xdfbf | | + missing third byte in 3-byte char | \x | \xe9af | invalid byte sequence for encoding "UTF8": 0xe9 0xaf + smallest 3-byte overlong | \x | \xe08080 | invalid byte sequence for encoding 
"UTF8": 0xe0 0x80 0x80 + largest 3-byte overlong | \x | \xe09fbf | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf + next 3-byte after overlong | \xe0a080 | | + last before surrogates | \xed9fbf | | + smallest surrogate | \x | \xeda080 | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80 + largest surrogate | \x | \xedbfbf | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf + next after surrogates | \xee8080 | | + largest 3-byte | \xefbfbf | | + missing fourth byte in 4-byte char | \x | \xf1afbf | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf + smallest 4-byte overlong | \x | \xf0808080 | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80 + largest 4-byte overlong | \x | \xf08fbfbf | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf + next 4-byte after overlong | \xf0908080 | | + largest 4-byte | \xf48fbfbf | | + smallest too large | \x | \xf4908080 | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80 + 5-byte | \x | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa + NUL byte | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 +(23 rows) + +-- Test UTF-8 verification with ASCII padding appended to provide +-- coverage for algorithms that work on multiple bytes at a time. +with test_bytes as ( + -- The error message for a sequence starting with a 4-byte lead + -- will contain all 4 bytes if they are present, so add 3 + -- ASCII bytes to the end to ensure consistent error messages. 
+ select + inbytes, + description, + (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error + from utf8_verification_inputs +), test_padded as ( + select + description, + (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error + from test_bytes +) +select + description, + b.error as orig_error, + p.error as error_after_padding +from test_padded p +join test_bytes b +using (description) +where p.error is distinct from b.error +order by description; + description | orig_error | error_after_padding +-------------+------------+--------------------- +(0 rows) + CREATE TABLE utf8_inputs (inbytes bytea, description text); insert into utf8_inputs values ('\x666f6f', 'valid, pure ASCII'), diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql index 8358682432..a3e12961db 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql @@ -74,6 +74,63 @@ $$; -- -- UTF-8 -- +-- The description column must be unique. 
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY); +insert into utf8_verification_inputs values + ('\xaf', 'bare continuation'), + ('\xc5', 'missing second byte in 2-byte char'), + ('\xc080', 'smallest 2-byte overlong'), + ('\xc1bf', 'largest 2-byte overlong'), + ('\xc280', 'next 2-byte after overlongs'), + ('\xdfbf', 'largest 2-byte'), + ('\xe9af', 'missing third byte in 3-byte char'), + ('\xe08080', 'smallest 3-byte overlong'), + ('\xe09fbf', 'largest 3-byte overlong'), + ('\xe0a080', 'next 3-byte after overlong'), + ('\xed9fbf', 'last before surrogates'), + ('\xeda080', 'smallest surrogate'), + ('\xedbfbf', 'largest surrogate'), + ('\xee8080', 'next after surrogates'), + ('\xefbfbf', 'largest 3-byte'), + ('\xf1afbf', 'missing fourth byte in 4-byte char'), + ('\xf0808080', 'smallest 4-byte overlong'), + ('\xf08fbfbf', 'largest 4-byte overlong'), + ('\xf0908080', 'next 4-byte after overlong'), + ('\xf48fbfbf', 'largest 4-byte'), + ('\xf4908080', 'smallest too large'), + ('\xfa9a9a8a8a', '5-byte'), + ('\x66006f', 'NUL byte'); + +-- Test UTF-8 verification +select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs; + +-- Test UTF-8 verification with ASCII padding appended to provide +-- coverage for algorithms that work on multiple bytes at a time. +with test_bytes as ( + -- The error message for a sequence starting with a 4-byte lead + -- will contain all 4 bytes if they are present, so add 3 + -- ASCII bytes to the end to ensure consistent error messages. 
+ select + inbytes, + description, + (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error + from utf8_verification_inputs +), test_padded as ( + select + description, + (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error + from test_bytes +) +select + description, + b.error as orig_error, + p.error as error_after_padding +from test_padded p +join test_bytes b +using (description) +where p.error is distinct from b.error +order by description; + CREATE TABLE utf8_inputs (inbytes bytea, description text); insert into utf8_inputs values ('\x666f6f', 'valid, pure ASCII'),