diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 10b73fb..4151ce2 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -5502,6 +5502,16 @@ pattern_selectivity(Const *patt, Pattern_Type ptype) /* + * This function is "character increment" function for bytea used in + * make_greater_string() that has same interface with pg_wchar_tbl.charinc. + */ +static bool byte_increment(unsigned char *ptr, int len) +{ + (*ptr)--; + return true; +} + +/* * Try to generate a string greater than the given string or any * string it is a prefix of. If successful, return a palloc'd string * in the form of a Const node; else return NULL. @@ -5540,6 +5550,7 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) int len; Datum cmpstr; text *cmptxt = NULL; + character_incrementer charincfunc; /* * Get a modifiable copy of the prefix string in C-string format, and set @@ -5601,27 +5612,38 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) } } + if (datatype != BYTEAOID) + charincfunc = pg_database_encoding_character_incrementer(); + else + charincfunc = &byte_increment; + while (len > 0) { - unsigned char *lastchar = (unsigned char *) (workstr + len - 1); - unsigned char savelastchar = *lastchar; + int charlen; + unsigned char *lastchar; + unsigned char savelastbyte; + Const *workstr_const; + + if (datatype == BYTEAOID) + charlen = 1; + else + charlen = len - pg_mbcliplen(workstr, len, len - 1); + + lastchar = (unsigned char *) (workstr + len - charlen); /* - * Try to generate a larger string by incrementing the last byte. + * savelastbyte has meaning only for datatype == BYTEAOID */ - while (*lastchar < (unsigned char) 255) - { - Const *workstr_const; + savelastbyte = *lastchar; - (*lastchar)++; + /* + * Try to generate a larger string by incrementing the last byte or + * character. + */ + if (charincfunc(lastchar, charlen)) { if (datatype != BYTEAOID) - { - /* do not generate invalid encoding sequences */ - if (!pg_verifymbstr(workstr, len, true)) - continue; workstr_const = string_to_const(workstr, datatype); - } else workstr_const = string_to_bytea_const(workstr, len); @@ -5636,26 +5658,17 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) pfree(workstr); return workstr_const; } - + /* No good, release unusable value and try again */ pfree(DatumGetPointer(workstr_const->constvalue)); pfree(workstr_const); } - /* restore last byte so we don't confuse pg_mbcliplen */ - *lastchar = savelastchar; - /* - * Truncate off the last character, which might be more than 1 byte, - * depending on the character encoding. + * Truncate off the last character or restore last byte for BYTEA. */ - if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1) - len = pg_mbcliplen(workstr, len, len - 1); - else - len -= 1; - - if (datatype != BYTEAOID) - workstr[len] = '\0'; + len -= charlen; + workstr[len] = (datatype != BYTEAOID ? '\0' : savelastbyte); } /* Failed... */ diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 5b0cf62..1d6aee0 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -935,6 +935,85 @@ pg_gb18030_dsplen(const unsigned char *s) /* *------------------------------------------------------------------- + * multibyte character incrementer + * + * These functions accept "charptr", a pointer to the first byte of a + * maybe-multibyte character. Try `increment' the character and return true if + * successed. If these functions returns false, the character is not modified. + * ------------------------------------------------------------------- + */ + +static bool pg_generic_charinc(unsigned char *charptr, int len) +{ + unsigned char *lastchar = (unsigned char *) (charptr + len - 1); + unsigned char savelastchar = *lastchar; + const char *const_charptr = (const char *)charptr; + + while (*lastchar < (unsigned char) 255) + { + (*lastchar)++; + if (!pg_verifymbstr(const_charptr, len, true)) + continue; + return true; + } + + *lastchar = savelastchar; + return false; +} + +static bool pg_utf8_increment(unsigned char *charptr, int length) +{ + unsigned char a; + unsigned char bak[4]; + + memcpy(bak, charptr, length); + switch (length) + { + default: + /* reject lengths 5 and 6 for now */ + return false; + case 4: + a = charptr[3]; + if (a < 0xBF) + { + charptr[3]++; + break; + } + charptr[3] = 0x80; + /* FALL THRU */ + case 3: + a = charptr[2]; + if (a < 0xBF) + { + charptr[2]++; + break; + } + charptr[2] = 0x80; + /* FALL THRU */ + case 2: + a = charptr[1]; + if ((*charptr == 0xed && a < 0x9F) || a < 0xBF) + { + charptr[1]++; + break; + } + charptr[1] = 0x80; + /* FALL THRU */ + case 1: + a = *charptr; + if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF7) { + memcpy(charptr, bak, length); + return false; + } + charptr[0]++; + break; + } + + return pg_utf8_islegal(charptr, length); +} + +/* + *------------------------------------------------------------------- * multibyte sequence validators * * These functions accept "s", a pointer to the first byte of a string, @@ -1341,48 +1420,48 @@ pg_utf8_islegal(const unsigned char *source, int length) *------------------------------------------------------------------- */ pg_wchar_tbl pg_wchar_table[] = { - {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */ - {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */ - {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */ - {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */ - {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */ - {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */ - {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */ - {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */ - {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */ - {0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */ - {0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */ - {0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */ - {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */ - {0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */ - {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */ + {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_generic_charinc, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */ + {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_generic_charinc, pg_eucjp_verifier, 3}, /* PG_EUC_JP */ + {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_generic_charinc, pg_euccn_verifier, 2}, /* PG_EUC_CN */ + {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_generic_charinc, pg_euckr_verifier, 3}, /* PG_EUC_KR */ + {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_generic_charinc, pg_euctw_verifier, 4}, /* PG_EUC_TW */ + {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_generic_charinc, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */ + {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_increment, pg_utf8_verifier, 4}, /* PG_UTF8 */ + {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_generic_charinc, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_LATIN1 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_LATIN2 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_LATIN3 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_LATIN4 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_LATIN5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_LATIN6 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_LATIN7 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_LATIN8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_LATIN9 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_LATIN10 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_WIN1256 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_WIN1258 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_WIN866 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_WIN874 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_KOI8R */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_WIN1251 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_WIN1252 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* ISO-8859-5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* ISO-8859-6 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* ISO-8859-7 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* ISO-8859-8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_WIN1250 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_WIN1253 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_WIN1254 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_WIN1255 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_WIN1257 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /* PG_KOI8U */ + {0, pg_sjis_mblen, pg_sjis_dsplen, pg_generic_charinc, pg_sjis_verifier, 2}, /* PG_SJIS */ + {0, pg_big5_mblen, pg_big5_dsplen, pg_generic_charinc, pg_big5_verifier, 2}, /* PG_BIG5 */ + {0, pg_gbk_mblen, pg_gbk_dsplen, pg_generic_charinc, pg_gbk_verifier, 2}, /* PG_GBK */ + {0, pg_uhc_mblen, pg_uhc_dsplen, pg_generic_charinc, pg_uhc_verifier, 2}, /* PG_UHC */ + {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_generic_charinc, pg_gb18030_verifier, 4}, /* PG_GB18030 */ + {0, pg_johab_mblen, pg_johab_dsplen, pg_generic_charinc, pg_johab_verifier, 3}, /* PG_JOHAB */ + {0, pg_sjis_mblen, pg_sjis_dsplen, pg_generic_charinc, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */ }; /* returns the byte length of a word for mule internal code */ @@ -1459,6 +1538,15 @@ pg_database_encoding_max_length(void) } /* + * fetch maximum length of the encoding for the current database + */ +character_incrementer +pg_database_encoding_character_incrementer(void) +{ + return pg_wchar_table[GetDatabaseEncoding()].charinc; +} + +/* * Verify mbstr to make sure that it is validly encoded in the current * database encoding. Otherwise same as pg_verify_mbstr(). */ diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 826c7af..356703a 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -284,6 +284,8 @@ typedef int (*mblen_converter) (const unsigned char *mbstr); typedef int (*mbdisplaylen_converter) (const unsigned char *mbstr); +typedef bool (*character_incrementer) (unsigned char *mbstr, int len); + typedef int (*mbverifier) (const unsigned char *mbstr, int len); typedef struct @@ -292,6 +294,7 @@ typedef struct * string to a wchar */ mblen_converter mblen; /* get byte length of a char */ mbdisplaylen_converter dsplen; /* get display width of a char */ + character_incrementer charinc; /* Character code incrementer if not null */ mbverifier mbverify; /* verify multibyte sequence */ int maxmblen; /* max bytes for a char in this encoding */ } pg_wchar_tbl; @@ -389,6 +392,7 @@ extern int pg_encoding_mbcliplen(int encoding, const char *mbstr, extern int pg_mbcharcliplen(const char *mbstr, int len, int imit); extern int pg_encoding_max_length(int encoding); extern int pg_database_encoding_max_length(void); +extern character_incrementer pg_database_encoding_character_incrementer(void); extern int PrepareClientEncoding(int encoding); extern int SetClientEncoding(int encoding);