--- postgresql-7.4.5/src/backend/utils/adt/oracle_compat.c 2003-08-08 23:42:06.000000000 +0200 +++ postgresql-8.0.0beta1/src/backend/utils/adt/oracle_compat.c 2004-06-07 00:17:01.000000000 +0200 @@ -9,23 +9,145 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql-server/src/backend/utils/adt/oracle_compat.c,v 1.48 2003/08/08 21:42:06 momjian Exp $ + * $PostgreSQL: pgsql-server/src/backend/utils/adt/oracle_compat.c,v 1.53 2004/06/06 22:17:01 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" #include <ctype.h> +#include <limits.h> +/* + * towlower() and friends should be in <wctype.h>, but some pre-C99 systems + * declare them in <wchar.h>. + */ +#ifdef HAVE_WCHAR_H +#include <wchar.h> +#endif +#ifdef HAVE_WCTYPE_H +#include <wctype.h> +#endif #include "utils/builtins.h" #include "mb/pg_wchar.h" +/* + * If the system provides the needed functions for wide-character manipulation + * (which are all standardized by C99), then we implement upper/lower/initcap + * using wide-character functions. Otherwise we use the traditional <ctype.h> + * functions, which of course will not work as desired in multibyte character + * sets. Note that in either case we are effectively assuming that the + * database character encoding matches the encoding implied by LC_CTYPE. + * + * We assume if we have these two functions, we have their friends too, and + * can use the wide-character method. + */ +#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) +#define USE_WIDE_UPPER_LOWER +#endif + static text *dotrim(const char *string, int stringlen, const char *set, int setlen, bool doltrim, bool dortrim); +#ifdef USE_WIDE_UPPER_LOWER + +/* + * Convert a TEXT value into a palloc'd wchar string. 
+ */ +static wchar_t * +texttowcs(const text *txt) +{ + int nbytes = VARSIZE(txt) - VARHDRSZ; + char *workstr; + wchar_t *result; + size_t ncodes; + + /* Overflow paranoia */ + if (nbytes < 0 || + nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* Need a null-terminated version of the input */ + workstr = (char *) palloc(nbytes + 1); + memcpy(workstr, VARDATA(txt), nbytes); + workstr[nbytes] = '\0'; + + /* Output workspace cannot have more codes than input bytes */ + result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); + + /* Do the conversion */ + ncodes = mbstowcs(result, workstr, nbytes + 1); + + if (ncodes == (size_t) -1) + { + /* + * Invalid multibyte character encountered. We try to give a useful + * error message by letting pg_verifymbstr check the string. But + * it's possible that the string is OK to us, and not OK to mbstowcs + * --- this suggests that the LC_CTYPE locale is different from the + * database encoding. Give a generic error message if verifymbstr + * can't find anything wrong. + */ + pg_verifymbstr(workstr, nbytes, false); + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid multibyte character for locale"))); + } + + Assert(ncodes <= (size_t) nbytes); + + return result; +} + + +/* + * Convert a wchar string into a palloc'd TEXT value. The wchar string + * must be zero-terminated, but we also require the caller to pass the string + * length, since it will know it anyway in current uses. 
+ */ +static text * +wcstotext(const wchar_t *str, int ncodes) +{ + text *result; + size_t nbytes; + + /* Overflow paranoia */ + if (ncodes < 0 || + ncodes > (int) ((INT_MAX - VARHDRSZ) / MB_CUR_MAX) - 1) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* Make workspace certainly large enough for result */ + result = (text *) palloc((ncodes + 1) * MB_CUR_MAX + VARHDRSZ); + + /* Do the conversion */ + nbytes = wcstombs((char *) VARDATA(result), str, + (ncodes + 1) * MB_CUR_MAX); + + if (nbytes == (size_t) -1) + { + /* Invalid multibyte character encountered ... shouldn't happen */ + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid multibyte character for locale"))); + } + + Assert(nbytes <= (size_t) (ncodes * MB_CUR_MAX)); + + VARATT_SIZEP(result) = nbytes + VARHDRSZ; + + return result; +} + +#endif /* USE_WIDE_UPPER_LOWER */ + + /******************************************************************** * * lower @@ -43,21 +165,45 @@ Datum lower(PG_FUNCTION_ARGS) { - text *string = PG_GETARG_TEXT_P_COPY(0); - char *ptr; - int m; +#ifdef USE_WIDE_UPPER_LOWER + /* use wide char code only when max encoding length > one */ + if (pg_database_encoding_max_length() > 1) + { + text *string = PG_GETARG_TEXT_P(0); + text *result; + wchar_t *workspace; + int i; - /* Since we copied the string, we can scribble directly on the value */ - ptr = VARDATA(string); - m = VARSIZE(string) - VARHDRSZ; + workspace = texttowcs(string); - while (m-- > 0) - { - *ptr = tolower((unsigned char) *ptr); - ptr++; + for (i = 0; workspace[i] != 0; i++) + workspace[i] = towlower(workspace[i]); + + result = wcstotext(workspace, i); + + pfree(workspace); + + PG_RETURN_TEXT_P(result); } + else +#endif /* USE_WIDE_UPPER_LOWER */ + { + text *string = PG_GETARG_TEXT_P_COPY(0); + char *ptr; + int m; + + /* Since we copied the string, we can scribble directly on the value */ + ptr = VARDATA(string); + m = VARSIZE(string) - VARHDRSZ; + + while 
(m-- > 0) + { + *ptr = tolower((unsigned char) *ptr); + ptr++; + } - PG_RETURN_TEXT_P(string); + PG_RETURN_TEXT_P(string); + } } @@ -78,21 +224,45 @@ Datum upper(PG_FUNCTION_ARGS) { - text *string = PG_GETARG_TEXT_P_COPY(0); - char *ptr; - int m; +#ifdef USE_WIDE_UPPER_LOWER + /* use wide char code only when max encoding length > one */ + if (pg_database_encoding_max_length() > 1) + { + text *string = PG_GETARG_TEXT_P(0); + text *result; + wchar_t *workspace; + int i; - /* Since we copied the string, we can scribble directly on the value */ - ptr = VARDATA(string); - m = VARSIZE(string) - VARHDRSZ; + workspace = texttowcs(string); - while (m-- > 0) - { - *ptr = toupper((unsigned char) *ptr); - ptr++; + for (i = 0; workspace[i] != 0; i++) + workspace[i] = towupper(workspace[i]); + + result = wcstotext(workspace, i); + + pfree(workspace); + + PG_RETURN_TEXT_P(result); } + else +#endif /* USE_WIDE_UPPER_LOWER */ + { + text *string = PG_GETARG_TEXT_P_COPY(0); + char *ptr; + int m; + + /* Since we copied the string, we can scribble directly on the value */ + ptr = VARDATA(string); + m = VARSIZE(string) - VARHDRSZ; + + while (m-- > 0) + { + *ptr = toupper((unsigned char) *ptr); + ptr++; + } - PG_RETURN_TEXT_P(string); + PG_RETURN_TEXT_P(string); + } } @@ -106,41 +276,67 @@ * * Purpose: * - * Returns string, with first letter of each word in uppercase, - * all other letters in lowercase. A word is delimited by white - * space. + * Returns string, with first letter of each word in uppercase, all + * other letters in lowercase. A word is defined as a sequence of + * alphanumeric characters, delimited by non-alphanumeric + * characters. 
* ********************************************************************/ Datum initcap(PG_FUNCTION_ARGS) { - text *string = PG_GETARG_TEXT_P_COPY(0); - char *ptr; - int m; +#ifdef USE_WIDE_UPPER_LOWER + /* use wide char code only when max encoding length > one */ + if (pg_database_encoding_max_length() > 1) + { + text *string = PG_GETARG_TEXT_P(0); + text *result; + wchar_t *workspace; + int wasalnum = 0; + int i; - /* Since we copied the string, we can scribble directly on the value */ - ptr = VARDATA(string); - m = VARSIZE(string) - VARHDRSZ; + workspace = texttowcs(string); - if (m > 0) - { - *ptr = toupper((unsigned char) *ptr); - ptr++; - m--; - } + for (i = 0; workspace[i] != 0; i++) + { + if (wasalnum) + workspace[i] = towlower(workspace[i]); + else + workspace[i] = towupper(workspace[i]); + wasalnum = iswalnum(workspace[i]); + } - while (m-- > 0) - { - /* Oracle capitalizes after all non-alphanumeric */ - if (!isalnum((unsigned char) ptr[-1])) - *ptr = toupper((unsigned char) *ptr); - else - *ptr = tolower((unsigned char) *ptr); - ptr++; + result = wcstotext(workspace, i); + + pfree(workspace); + + PG_RETURN_TEXT_P(result); } + else +#endif /* USE_WIDE_UPPER_LOWER */ + { + text *string = PG_GETARG_TEXT_P_COPY(0); + int wasalnum = 0; + char *ptr; + int m; + + /* Since we copied the string, we can scribble directly on the value */ + ptr = VARDATA(string); + m = VARSIZE(string) - VARHDRSZ; + + while (m-- > 0) + { + if (wasalnum) + *ptr = tolower((unsigned char) *ptr); + else + *ptr = toupper((unsigned char) *ptr); + wasalnum = isalnum((unsigned char) *ptr); + ptr++; + } - PG_RETURN_TEXT_P(string); + PG_RETURN_TEXT_P(string); + } } @@ -872,7 +1068,7 @@ ********************************************************************/ Datum -chr (PG_FUNCTION_ARGS) +chr(PG_FUNCTION_ARGS) { int32 cvalue = PG_GETARG_INT32(0); text *result; --- postgresql-7.4.5/configure.in 2004-08-18 05:11:25.000000000 +0200 +++ postgresql-8.0.0beta1/configure.in 2004-08-09 01:27:11.000000000 
+0200 @@ -866,7 +810,7 @@ # SunOS doesn't handle negative byte comparisons properly with +/- return AC_FUNC_MEMCMP -AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getpeereid memmove poll pstat setproctitle setsid sigprocmask symlink sysconf utime utimes waitpid]) +AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getpeereid memmove poll pstat setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs]) AC_CHECK_DECLS(fdatasync, [], [], [#include <unistd.h>]) --- postgresql-7.4.5/src/include/pg_config.h.in~ 2004-03-20 16:39:40.000000000 +0100 +++ postgresql-7.4.5/src/include/pg_config.h.in 2004-08-26 13:18:28.000000000 +0200 @@ -509,6 +509,9 @@ `HAVE_STRUCT_TM_TM_ZONE' instead. */ #undef HAVE_TM_ZONE +/* Define to 1 if you have the `towlower' function. */ +#undef HAVE_TOWLOWER + /* Define to 1 if you have the external array `tzname'. */ #undef HAVE_TZNAME @@ -542,6 +545,9 @@ /* Define to 1 if you have the `waitpid' function. */ #undef HAVE_WAITPID +/* Define to 1 if you have the `wcstombs' function. */ +#undef HAVE_WCSTOMBS + /* Define to the appropriate snprintf format for 64-bit ints, if any. */ #undef INT64_FORMAT