Index: doc/src/sgml/func.sgml =================================================================== RCS file: /cvsroot/pgsql/doc/src/sgml/func.sgml,v retrieving revision 1.263 diff -c -c -r1.263 func.sgml *** doc/src/sgml/func.sgml 6 Jul 2005 19:02:52 -0000 1.263 --- doc/src/sgml/func.sgml 10 Jul 2005 04:52:43 -0000 *************** *** 1257,1262 **** --- 1257,1282 ---- + regexp_replace(source text, + pattern text, + replacement text + , flags text) + text + Replace string that matches the regular expression + pattern in source to + replacement. + replacement can use \1-\9 and \&. + \1-\9 is a back reference to the n'th subexpression, and + \& is the entire matched string. + flags can use g(global) and i(ignore case). + When flags is not specified, case sensitive matching is used, and it replaces + only the instance. + + regexp_replace('1112223333', '(\\d{3})(\\d{3})(\\d{4})', '(\\1) \\2-\\3') + (111) 222-3333 + + + repeat(string text, number integer) text Repeat string the specified Index: src/backend/regex/regexec.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/regex/regexec.c,v retrieving revision 1.24 diff -c -c -r1.24 regexec.c *** src/backend/regex/regexec.c 29 Nov 2003 19:51:55 -0000 1.24 --- src/backend/regex/regexec.c 10 Jul 2005 04:52:44 -0000 *************** *** 110,115 **** --- 110,116 ---- regmatch_t *pmatch; rm_detail_t *details; chr *start; /* start of string */ + chr *search_start; /* search start of string */ chr *stop; /* just past end of string */ int err; /* error code if any (0 none) */ regoff_t *mem; /* memory vector for backtracking */ *************** *** 168,173 **** --- 169,175 ---- pg_regexec(regex_t *re, const chr *string, size_t len, + size_t search_start, rm_detail_t *details, size_t nmatch, regmatch_t pmatch[], *************** *** 219,224 **** --- 221,227 ---- v->pmatch = pmatch; v->details = details; v->start = (chr *) string; + v->search_start = (chr *) string + search_start; v->stop = (chr *) string + len; v->err = 0; if (backref) *************** *** 288,294 **** NOERR(); MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); cold = NULL; ! close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *) NULL); freedfa(s); NOERR(); if (v->g->cflags & REG_EXPECT) --- 291,298 ---- NOERR(); MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); cold = NULL; ! close = shortest(v, s, v->search_start, v->search_start, v->stop, ! &cold, (int *) NULL); freedfa(s); NOERR(); if (v->g->cflags & REG_EXPECT) *************** *** 415,421 **** assert(d != NULL && s != NULL); cold = NULL; ! close = v->start; do { MDEBUG(("\ncsearch at %ld\n", LOFF(close))); --- 419,425 ---- assert(d != NULL && s != NULL); cold = NULL; ! close = v->search_start; do { MDEBUG(("\ncsearch at %ld\n", LOFF(close))); Index: src/backend/utils/adt/regexp.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v retrieving revision 1.56 diff -c -c -r1.56 regexp.c *** src/backend/utils/adt/regexp.c 31 Dec 2004 22:01:22 -0000 1.56 --- src/backend/utils/adt/regexp.c 10 Jul 2005 04:52:45 -0000 *************** *** 81,118 **** /* ! * RE_compile_and_execute - compile and execute a RE, caching if possible * ! * Returns TRUE on match, FALSE on no match * ! * text_re --- the pattern, expressed as an *untoasted* TEXT object ! * dat --- the data to match against (need not be null-terminated) ! * dat_len --- the length of the data string ! * cflags --- compile options for the pattern ! * nmatch, pmatch --- optional return area for match details * ! * Both pattern and data are given in the database encoding. We internally ! * convert to array of pg_wchar which is what Spencer's regex package wants. */ ! static bool ! RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, ! int cflags, int nmatch, regmatch_t *pmatch) { int text_re_len = VARSIZE(text_re); - pg_wchar *data; - size_t data_len; pg_wchar *pattern; size_t pattern_len; int i; int regcomp_result; - int regexec_result; cached_re_str re_temp; char errMsg[100]; - /* Convert data string to wide characters */ - data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); - data_len = pg_mb2wchar_with_len(dat, data, dat_len); - /* * Look for a match among previously compiled REs. Since the data * structure is self-organizing with most-used entries at the front, --- 81,107 ---- /* ! * RE_compile_and_cache - compile a RE, caching if possible * ! * Returns regex_t * ! * text_re --- the pattern, expressed as an *untoasted* TEXT object ! * cflags --- compile options for the pattern * ! * Pattern is given in the database encoding. We internally convert to ! * array of pg_wchar which is what Spencer's regex package wants. */ ! static regex_t ! RE_compile_and_cache(text *text_re, int cflags) { int text_re_len = VARSIZE(text_re); pg_wchar *pattern; size_t pattern_len; int i; int regcomp_result; cached_re_str re_temp; char errMsg[100]; /* * Look for a match among previously compiled REs. Since the data * structure is self-organizing with most-used entries at the front, *************** *** 134,161 **** re_array[0] = re_temp; } ! /* Perform RE match and return result */ ! regexec_result = pg_regexec(&re_array[0].cre_re, ! data, ! data_len, ! NULL, /* no details */ ! nmatch, ! pmatch, ! 0); ! ! pfree(data); ! ! if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) ! { ! /* re failed??? */ ! pg_regerror(regexec_result, &re_array[0].cre_re, ! errMsg, sizeof(errMsg)); ! ereport(ERROR, ! (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), ! errmsg("regular expression failed: %s", errMsg))); ! } ! ! return (regexec_result == REG_OKAY); } } --- 123,129 ---- re_array[0] = re_temp; } ! return re_array[0].cre_re; } } *************** *** 220,229 **** --- 188,232 ---- re_array[0] = re_temp; num_res++; + return re_array[0].cre_re; + } + + /* + * RE_compile_and_execute - compile and execute a RE + * + * Returns TRUE on match, FALSE on no match + * + * text_re --- the pattern, expressed as an *untoasted* TEXT object + * dat --- the data to match against (need not be null-terminated) + * dat_len --- the length of the data string + * cflags --- compile options for the pattern + * nmatch, pmatch --- optional return area for match details + * + * Both pattern and data are given in the database encoding. We internally + * convert to array of pg_wchar which is what Spencer's regex package wants. + */ + static bool + RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, + int cflags, int nmatch, regmatch_t *pmatch) + { + pg_wchar *data; + size_t data_len; + int regexec_result; + regex_t re; + char errMsg[100]; + + /* Convert data string to wide characters */ + data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(dat, data, dat_len); + + /* Compile RE */ + re = RE_compile_and_cache(text_re, cflags); + /* Perform RE match and return result */ regexec_result = pg_regexec(&re_array[0].cre_re, data, data_len, + 0, NULL, /* no details */ nmatch, pmatch, *************** *** 428,442 **** eo = pmatch[0].rm_eo; } ! return (DirectFunctionCall3(text_substr, PointerGetDatum(s), Int32GetDatum(so + 1), ! Int32GetDatum(eo - so))); } PG_RETURN_NULL(); } /* similar_escape() * Convert a SQL99 regexp pattern to POSIX style, so it can be used by * our regexp engine. --- 431,519 ---- eo = pmatch[0].rm_eo; } ! return DirectFunctionCall3(text_substr, PointerGetDatum(s), Int32GetDatum(so + 1), ! Int32GetDatum(eo - so)); } PG_RETURN_NULL(); } + /* + * textregexreplace_noopt() + * Return a replace string matched by a regular expression. + * This function is a version that doesn't specify the option of + * textregexreplace. This is case sensitive, replace the first + * instance only. + */ + Datum + textregexreplace_noopt(PG_FUNCTION_ARGS) + { + text *s = PG_GETARG_TEXT_P(0); + text *p = PG_GETARG_TEXT_P(1); + text *r = PG_GETARG_TEXT_P(2); + regex_t re; + + re = RE_compile_and_cache(p, regex_flavor); + + return DirectFunctionCall4(replace_text_regexp, + PointerGetDatum(s), + PointerGetDatum(&re), + PointerGetDatum(r), + BoolGetDatum(false)); + } + + /* + * textregexreplace() + * Return a replace string matched by a regular expression. + */ + Datum + textregexreplace(PG_FUNCTION_ARGS) + { + text *s = PG_GETARG_TEXT_P(0); + text *p = PG_GETARG_TEXT_P(1); + text *r = PG_GETARG_TEXT_P(2); + text *opt = PG_GETARG_TEXT_P(3); + char *opt_p = VARDATA(opt); + int opt_len = (VARSIZE(opt) - VARHDRSZ); + int i; + bool global = false; + bool ignorecase = false; + regex_t re; + + /* parse options */ + for (i = 0; i < opt_len; i++) + { + switch (opt_p[i]) + { + case 'i': + ignorecase = true; + break; + case 'g': + global = true; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid option of regexp_replace: %c", + opt_p[i]))); + break; + } + } + + if (ignorecase) + re = RE_compile_and_cache(p, regex_flavor | REG_ICASE); + else + re = RE_compile_and_cache(p, regex_flavor); + + return DirectFunctionCall4(replace_text_regexp, + PointerGetDatum(s), + PointerGetDatum(&re), + PointerGetDatum(r), + BoolGetDatum(global)); + } + /* similar_escape() * Convert a SQL99 regexp pattern to POSIX style, so it can be used by * our regexp engine. Index: src/backend/utils/adt/varlena.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/utils/adt/varlena.c,v retrieving revision 1.126 diff -c -c -r1.126 varlena.c *** src/backend/utils/adt/varlena.c 7 Jul 2005 04:36:08 -0000 1.126 --- src/backend/utils/adt/varlena.c 10 Jul 2005 04:52:46 -0000 *************** *** 28,33 **** --- 28,34 ---- #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/pg_locale.h" + #include "regex/regex.h" typedef struct varlena unknown; *************** *** 1994,1999 **** --- 1995,2219 ---- } /* + * check_replace_text_has_escape_char + * check whether replace_text has escape char. + */ + static bool + check_replace_text_has_escape_char(const text *replace_text) + { + const char *p = VARDATA(replace_text); + const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ); + + if (pg_database_encoding_max_length() == 1) + { + for (; p < p_end; p++) + if (*p == '\\') return true; + } + else + { + for (; p < p_end; p += pg_mblen(p)) + if (*p == '\\') return true; + } + + return false; + } + + /* + * appendStringInfoRegexpSubstr + * append string by using back references of regexp. + */ + static void + appendStringInfoRegexpSubstr(StringInfo str, text *replace_text, + regmatch_t *pmatch, text *src_text) + { + const char *p = VARDATA(replace_text); + const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ); + + int eml = pg_database_encoding_max_length(); + + int substr_start = 1; + int ch_cnt; + + int so; + int eo; + + while (1) + { + /* Find escape char. */ + ch_cnt = 0; + if (eml == 1) + { + for (; p < p_end && *p != '\\'; p++) + ch_cnt++; + } + else + { + for (; p < p_end && *p != '\\'; p += pg_mblen(p)) + ch_cnt++; + } + + /* + * Copy the text when there is a text in the left of escape char + * or escape char is not found. + */ + if (ch_cnt) + { + text *append_text = text_substring(PointerGetDatum(replace_text), + substr_start, ch_cnt, false); + appendStringInfoString(str, PG_TEXT_GET_STR(append_text)); + pfree(append_text); + } + substr_start += ch_cnt + 1; + + if (p >= p_end) /* When escape char is not found. */ + break; + + /* See the next character of escape char. */ + p++; + so = eo = -1; + + if (*p >= '1' && *p <= '9') + { + /* Use the back reference of regexp. */ + int idx = *p - '0'; + so = pmatch[idx].rm_so; + eo = pmatch[idx].rm_eo; + p++; + substr_start++; + } + else if (*p == '&') + { + /* Use the entire matched string. */ + so = pmatch[0].rm_so; + eo = pmatch[0].rm_eo; + p++; + substr_start++; + } + + if (so != -1 && eo != -1) + { + /* Copy the text that is back reference of regexp. */ + text *append_text = text_substring(PointerGetDatum(src_text), + so + 1, (eo - so), false); + appendStringInfoString(str, PG_TEXT_GET_STR(append_text)); + pfree(append_text); + } + } + } + + #define REGEXP_REPLACE_BACKREF_CNT 10 + + /* + * replace_text_regexp + * replace text that matches to regexp in src_text to replace_text. + */ + Datum + replace_text_regexp(PG_FUNCTION_ARGS) + { + text *ret_text; + text *src_text = PG_GETARG_TEXT_P(0); + int src_text_len = VARSIZE(src_text) - VARHDRSZ; + regex_t *re = (regex_t *)PG_GETARG_POINTER(1); + text *replace_text = PG_GETARG_TEXT_P(2); + bool global = PG_GETARG_BOOL(3); + StringInfo str = makeStringInfo(); + int regexec_result; + regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT]; + pg_wchar *data; + size_t data_len; + int search_start; + int data_pos; + bool have_escape; + + /* Convert data string to wide characters. */ + data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(VARDATA(src_text), data, src_text_len); + + /* Check whether replace_text has escape char. */ + have_escape = check_replace_text_has_escape_char(replace_text); + + for (search_start = data_pos = 0; search_start <= data_len;) + { + regexec_result = pg_regexec(re, + data, + data_len, + search_start, + NULL, /* no details */ + REGEXP_REPLACE_BACKREF_CNT, + pmatch, + 0); + + if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) + { + char errMsg[100]; + + /* re failed??? */ + pg_regerror(regexec_result, re, errMsg, sizeof(errMsg)); + ereport(ERROR, + (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), + errmsg("regular expression failed: %s", errMsg))); + } + + if (regexec_result == REG_NOMATCH) + break; + + /* + * Copy the text when there is a text in the left of matched position. + */ + if (pmatch[0].rm_so - data_pos > 0) + { + text *left_text = text_substring(PointerGetDatum(src_text), + data_pos + 1, + pmatch[0].rm_so - data_pos, false); + appendStringInfoString(str, PG_TEXT_GET_STR(left_text)); + pfree(left_text); + } + + /* + * Copy the replace_text. Process back references when the + * replace_text has escape characters. + */ + if (have_escape) + appendStringInfoRegexpSubstr(str, replace_text, pmatch, src_text); + else + appendStringInfoString(str, PG_TEXT_GET_STR(replace_text)); + + search_start = data_pos = pmatch[0].rm_eo; + + /* + * When global option is off, replace the first instance only. + */ + if (!global) + break; + + /* + * Search from next character when the matching text is zero width. + */ + if (pmatch[0].rm_so == pmatch[0].rm_eo) + search_start++; + } + + /* + * Copy the text when there is a text at the right of last matched + * or regexp is not matched. + */ + if (data_pos < data_len) + { + text *right_text = text_substring(PointerGetDatum(src_text), + data_pos + 1, -1, true); + appendStringInfoString(str, PG_TEXT_GET_STR(right_text)); + pfree(right_text); + } + + ret_text = PG_STR_GET_TEXT(str->data); + pfree(str->data); + pfree(str); + pfree(data); + + PG_RETURN_TEXT_P(ret_text); + } + + /* * split_text * parse input string * return ord item (1 based) Index: src/include/catalog/catversion.h =================================================================== RCS file: /cvsroot/pgsql/src/include/catalog/catversion.h,v retrieving revision 1.286 diff -c -c -r1.286 catversion.h *** src/include/catalog/catversion.h 8 Jul 2005 04:12:26 -0000 1.286 --- src/include/catalog/catversion.h 10 Jul 2005 04:52:46 -0000 *************** *** 53,58 **** */ /* yyyymmddN */ ! #define CATALOG_VERSION_NO 200507081 #endif --- 53,58 ---- */ /* yyyymmddN */ ! #define CATALOG_VERSION_NO 200507101 #endif Index: src/include/catalog/pg_proc.h =================================================================== RCS file: /cvsroot/pgsql/src/include/catalog/pg_proc.h,v retrieving revision 1.374 diff -c -c -r1.374 pg_proc.h *** src/include/catalog/pg_proc.h 6 Jul 2005 19:02:53 -0000 1.374 --- src/include/catalog/pg_proc.h 10 Jul 2005 04:52:51 -0000 *************** *** 2207,2212 **** --- 2207,2216 ---- DESCR("return portion of string"); DATA(insert OID = 2087 ( replace PGNSP PGUID 12 f f t f i 3 25 "25 25 25" _null_ _null_ _null_ replace_text - _null_ )); DESCR("replace all occurrences of old_substr with new_substr in string"); + DATA(insert OID = 2284 ( regexp_replace PGNSP PGUID 12 f f t f i 3 25 "25 25 25" _null_ _null_ _null_ textregexreplace_noopt - _null_ )); + DESCR("replace text using regexp"); + DATA(insert OID = 2285 ( regexp_replace PGNSP PGUID 12 f f t f i 4 25 "25 25 25 25" _null_ _null_ _null_ textregexreplace - _null_ )); + DESCR("replace text using regexp"); DATA(insert OID = 2088 ( split_part PGNSP PGUID 12 f f t f i 3 25 "25 25 23" _null_ _null_ _null_ split_text - _null_ )); DESCR("split string by field_sep and return field_num"); DATA(insert OID = 2089 ( to_hex PGNSP PGUID 12 f f t f i 1 25 "23" _null_ _null_ _null_ to_hex32 - _null_ )); Index: src/include/regex/regex.h =================================================================== RCS file: /cvsroot/pgsql/src/include/regex/regex.h,v retrieving revision 1.26 diff -c -c -r1.26 regex.h *** src/include/regex/regex.h 29 Nov 2003 22:41:10 -0000 1.26 --- src/include/regex/regex.h 10 Jul 2005 04:52:51 -0000 *************** *** 163,169 **** * the prototypes for exported functions */ extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int); ! extern int pg_regexec(regex_t *, const pg_wchar *, size_t, rm_detail_t *, size_t, regmatch_t[], int); extern void pg_regfree(regex_t *); extern size_t pg_regerror(int, const regex_t *, char *, size_t); --- 163,169 ---- * the prototypes for exported functions */ extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int); ! extern int pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int); extern void pg_regfree(regex_t *); extern size_t pg_regerror(int, const regex_t *, char *, size_t); Index: src/include/utils/builtins.h =================================================================== RCS file: /cvsroot/pgsql/src/include/utils/builtins.h,v retrieving revision 1.259 diff -c -c -r1.259 builtins.h *** src/include/utils/builtins.h 6 Jul 2005 19:02:54 -0000 1.259 --- src/include/utils/builtins.h 10 Jul 2005 04:52:52 -0000 *************** *** 429,434 **** --- 429,436 ---- extern Datum texticregexeq(PG_FUNCTION_ARGS); extern Datum texticregexne(PG_FUNCTION_ARGS); extern Datum textregexsubstr(PG_FUNCTION_ARGS); + extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS); + extern Datum textregexreplace(PG_FUNCTION_ARGS); extern Datum similar_escape(PG_FUNCTION_ARGS); /* regproc.c */ *************** *** 566,571 **** --- 568,574 ---- extern bool SplitIdentifierString(char *rawstring, char separator, List **namelist); extern Datum replace_text(PG_FUNCTION_ARGS); + extern Datum replace_text_regexp(PG_FUNCTION_ARGS); extern Datum split_text(PG_FUNCTION_ARGS); extern Datum text_to_array(PG_FUNCTION_ARGS); extern Datum array_to_text(PG_FUNCTION_ARGS); Index: src/test/regress/expected/strings.out =================================================================== RCS file: /cvsroot/pgsql/src/test/regress/expected/strings.out,v retrieving revision 1.25 diff -c -c -r1.25 strings.out *** src/test/regress/expected/strings.out 20 May 2005 01:29:56 -0000 1.25 --- src/test/regress/expected/strings.out 10 Jul 2005 04:52:52 -0000 *************** *** 192,197 **** --- 192,225 ---- cde (1 row) + -- PostgreSQL extension to allow using back reference in replace string; + SELECT regexp_replace('1112223333', '(\\d{3})(\\d{3})(\\d{4})', '(\\1) \\2-\\3'); + regexp_replace + ---------------- + (111) 222-3333 + (1 row) + + SELECT regexp_replace('AAA BBB CCC ', '\\s+', ' ', 'g'); + regexp_replace + ---------------- + AAA BBB CCC + (1 row) + + SELECT regexp_replace('AAA', '^|$', 'Z', 'g'); + regexp_replace + ---------------- + ZAAAZ + (1 row) + + SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi'); + regexp_replace + ---------------- + Z Z + (1 row) + + -- invalid option of REGEXP_REPLACE + SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); + ERROR: invalid option of regexp_replace: z -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4"; 4 Index: src/test/regress/sql/strings.sql =================================================================== RCS file: /cvsroot/pgsql/src/test/regress/sql/strings.sql,v retrieving revision 1.16 diff -c -c -r1.16 strings.sql *** src/test/regress/sql/strings.sql 20 May 2005 01:29:56 -0000 1.16 --- src/test/regress/sql/strings.sql 10 Jul 2005 04:52:53 -0000 *************** *** 80,85 **** --- 80,92 ---- -- With a parenthesized subexpression, return only what matches the subexpr SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; + -- PostgreSQL extension to allow using back reference in replace string; + SELECT regexp_replace('1112223333', '(\\d{3})(\\d{3})(\\d{4})', '(\\1) \\2-\\3'); + SELECT regexp_replace('AAA BBB CCC ', '\\s+', ' ', 'g'); + SELECT regexp_replace('AAA', '^|$', 'Z', 'g'); + SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi'); + -- invalid option of REGEXP_REPLACE + SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4";