Re: regexp_replace

From: Bruce Momjian <pgman(at)candle(dot)pha(dot)pa(dot)us>
To: "a_ogawa00" <a_ogawa00(at)yahoo(dot)co(dot)jp>
Cc: pgsql-patches(at)postgresql(dot)org
Subject: Re: regexp_replace
Date: 2004-08-07 01:04:34
Message-ID: 200408070104.i7714Yp00734@candle.pha.pa.us
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-hackers pgsql-patches


This has been saved for the 8.1 release:

http:/momjian.postgresql.org/cgi-bin/pgpatches2

---------------------------------------------------------------------------

a_ogawa00 wrote:
>
> This patch provides a new function regexp_replace.
> regexp_replace extends a replace function and enables text search
> by the regular expression. And, a back reference can be used within
> a replace string.
> (This patch for PostgreSQL 7.4.3)
>
> Function: regexp_replace(str, pattern, replace_str)
> Retuen Type: text
> Description: Replace all matched string in str.
> pattern is regular expression pattern.
> replace_str is replace string that can use '\1' - '\9', and
> '\&'.
> '\1' - '\9' is back reference to the n'th subexpression.
> '\&' is matched string.
>
> (example1)
> select regexp_replace('ABC-DEF', '(\\w+)-(\\w+)', '\\2-\\1')
> result: DEF-ABC
>
> (example2)
> update tab1 set col1 = regexp_replace(col1, '[A-Z]', '');
>
> ---
> Atsushi Ogawa
> a_ogawa(at)hi-ho(dot)ne(dot)jp
>
> --- cut here ---
>
> *** ./src/backend/regex/regexec.c.orig Tue Jul 20 08:45:39 2004
> --- ./src/backend/regex/regexec.c Tue Jul 20 08:49:36 2004
> ***************
> *** 110,115 ****
> --- 110,116 ----
> regmatch_t *pmatch;
> rm_detail_t *details;
> chr *start; /* start of string */
> + chr *search_start; /* search start of string */
> chr *stop; /* just past end of string */
> int err; /* error code if any (0 none) */
> regoff_t *mem; /* memory vector for backtracking */
> ***************
> *** 168,173 ****
> --- 169,175 ----
> pg_regexec(regex_t *re,
> const chr *string,
> size_t len,
> + size_t search_start,
> rm_detail_t *details,
> size_t nmatch,
> regmatch_t pmatch[],
> ***************
> *** 219,224 ****
> --- 221,227 ----
> v->pmatch = pmatch;
> v->details = details;
> v->start = (chr *) string;
> + v->search_start = (chr *) string + search_start;
> v->stop = (chr *) string + len;
> v->err = 0;
> if (backref)
> ***************
> *** 288,294 ****
> NOERR();
> MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
> cold = NULL;
> ! close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *)
> NULL);
> freedfa(s);
> NOERR();
> if (v->g->cflags & REG_EXPECT)
> --- 291,298 ----
> NOERR();
> MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
> cold = NULL;
> ! close = shortest(v, s, v->search_start, v->search_start, v->stop,
> ! &cold, (int *) NULL);
> freedfa(s);
> NOERR();
> if (v->g->cflags & REG_EXPECT)
> ***************
> *** 415,421 ****
>
> assert(d != NULL && s != NULL);
> cold = NULL;
> ! close = v->start;
> do
> {
> MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
> --- 419,425 ----
>
> assert(d != NULL && s != NULL);
> cold = NULL;
> ! close = v->search_start;
> do
> {
> MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
> *** ./src/backend/utils/adt/regexp.c.orig Tue Jul 20 08:50:08 2004
> --- ./src/backend/utils/adt/regexp.c Tue Jul 20 09:00:05 2004
> ***************
> *** 80,116 ****
>
>
> /*
> ! * RE_compile_and_execute - compile and execute a RE, caching if possible
> *
> ! * Returns TRUE on match, FALSE on no match
> *
> ! * text_re --- the pattern, expressed as an *untoasted* TEXT object
> ! * dat --- the data to match against (need not be null-terminated)
> ! * dat_len --- the length of the data string
> ! * cflags --- compile options for the pattern
> ! * nmatch, pmatch --- optional return area for match details
> *
> ! * Both pattern and data are given in the database encoding. We
> internally
> ! * convert to array of pg_wchar which is what Spencer's regex package
> wants.
> */
> ! static bool
> ! RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
> ! int cflags, int nmatch, regmatch_t *pmatch)
> {
> int text_re_len = VARSIZE(text_re);
> - pg_wchar *data;
> - size_t data_len;
> pg_wchar *pattern;
> size_t pattern_len;
> int i;
> int regcomp_result;
> - int regexec_result;
> cached_re_str re_temp;
>
> - /* Convert data string to wide characters */
> - data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
> - data_len = pg_mb2wchar_with_len(dat, data, dat_len);
> -
> /*
> * Look for a match among previously compiled REs. Since the data
> * structure is self-organizing with most-used entries at the front,
> --- 80,105 ----
>
>
> /*
> ! * RE_compile_and_cache - compile a RE, caching if possible
> *
> ! * Returns regex_t
> *
> ! * text_re --- the pattern, expressed as an *untoasted* TEXT object
> ! * cflags --- compile options for the pattern
> *
> ! * Pattern is given in the database encoding. We internally convert to
> ! * array of pg_wchar which is what Spencer's regex package wants.
> */
> ! static regex_t
> ! RE_compile_and_cache(text *text_re, int cflags)
> {
> int text_re_len = VARSIZE(text_re);
> pg_wchar *pattern;
> size_t pattern_len;
> int i;
> int regcomp_result;
> cached_re_str re_temp;
>
> /*
> * Look for a match among previously compiled REs. Since the data
> * structure is self-organizing with most-used entries at the front,
> ***************
> *** 132,149 ****
> re_array[0] = re_temp;
> }
>
> ! /* Perform RE match and return result */
> ! regexec_result = pg_regexec(&re_array[0].cre_re,
> ! data,
> ! data_len,
> ! NULL, /* no details */
> ! nmatch,
> ! pmatch,
> ! 0);
> !
> ! pfree(data);
> !
> ! return (regexec_result == 0);
> }
> }
>
> --- 121,127 ----
> re_array[0] = re_temp;
> }
>
> ! return re_array[0].cre_re;
> }
> }
>
> ***************
> *** 210,219 ****
> --- 188,231 ----
> re_array[0] = re_temp;
> num_res++;
>
> + return re_array[0].cre_re;
> + }
> +
> + /*
> + * RE_compile_and_execute - compile and execute a RE, caching if possible
> + *
> + * Returns TRUE on match, FALSE on no match
> + *
> + * text_re --- the pattern, expressed as an *untoasted* TEXT object
> + * dat --- the data to match against (need not be null-terminated)
> + * dat_len --- the length of the data string
> + * cflags --- compile options for the pattern
> + * nmatch, pmatch --- optional return area for match details
> + *
> + * Both pattern and data are given in the database encoding. We
> internally
> + * convert to array of pg_wchar which is what Spencer's regex package
> wants.
> + */
> + static bool
> + RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
> + int cflags, int nmatch, regmatch_t *pmatch)
> + {
> + pg_wchar *data;
> + size_t data_len;
> + int regexec_result;
> + regex_t re;
> +
> + /* Convert data string to wide characters */
> + data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
> + data_len = pg_mb2wchar_with_len(dat, data, dat_len);
> +
> + /* Compile RE */
> + re = RE_compile_and_cache(text_re, cflags);
> +
> /* Perform RE match and return result */
> regexec_result = pg_regexec(&re_array[0].cre_re,
> data,
> data_len,
> + 0,
> NULL, /* no details */
> nmatch,
> pmatch,
> ***************
> *** 415,420 ****
> --- 427,452 ----
> }
>
> PG_RETURN_NULL();
> + }
> +
> + /*
> + * textregexreplace()
> + * Return a replace string matched by a regular expression.
> + */
> + Datum
> + textregexreplace(PG_FUNCTION_ARGS)
> + {
> + text *s = PG_GETARG_TEXT_P(0);
> + text *p = PG_GETARG_TEXT_P(1);
> + text *r = PG_GETARG_TEXT_P(2);
> + regex_t re;
> +
> + re = RE_compile_and_cache(p, regex_flavor);
> +
> + return (DirectFunctionCall3(replace_text_regexp,
> + PointerGetDatum(s),
> + PointerGetDatum(&re),
> + PointerGetDatum(r)));
> }
>
> /* similar_escape()
> *** ./src/backend/utils/adt/varlena.c.orig Tue Jul 20 09:00:17 2004
> --- ./src/backend/utils/adt/varlena.c Tue Jul 20 10:23:32 2004
> ***************
> *** 28,33 ****
> --- 28,34 ----
> #include "utils/builtins.h"
> #include "utils/lsyscache.h"
> #include "utils/pg_locale.h"
> + #include "regex/regex.h"
>
>
> typedef struct varlena unknown;
> ***************
> *** 1971,1976 ****
> --- 1972,2122 ----
> ret_text = PG_STR_GET_TEXT(str->data);
> pfree(str->data);
> pfree(str);
> +
> + PG_RETURN_TEXT_P(ret_text);
> + }
> +
> + /*
> + * have_escape_in_regexp_replace_str
> + * check replace string have escape char
> + */
> + static bool
> + have_escape_in_regexp_replace_str(const char *replace_str)
> + {
> + return (strchr(replace_str, '\\') != NULL);
> + }
> +
> + #define REGEXP_REPLACE_BACKREF_CNT 10
> + /*
> + * appendStringInfoRegexpSubstr
> + * append string for regexp back references.
> + */
> + static void
> + appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
> + regmatch_t *pmatch, text *buf_text, int search_start)
> + {
> + const char *pstart = PG_TEXT_GET_STR(replace_text);
> + const char *p = pstart;
> + const char *pnext;
> +
> + text *add_text;
> + int so;
> + int eo;
> +
> + for(;;) {
> + pnext = strchr(p, '\\');
> + if(pnext == NULL) break;
> +
> + add_text = text_substring(PointerGetDatum(replace_text),
> + p - pstart + 1, pnext - p, false);
> + appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
> + pfree(add_text);
> +
> + p = pnext + 1;
> + so = eo = -1;
> +
> + if(*p >= '1' && *p <= '9') {
> + int idx = *p - '0';
> + so = pmatch[idx].rm_so - search_start;
> + eo = pmatch[idx].rm_eo - search_start;
> + p++;
> + } else {
> + switch(*p) {
> + case '&':
> + so = pmatch[0].rm_so - search_start;
> + eo = pmatch[0].rm_eo - search_start;
> + p++;
> + break;
> + }
> + }
> +
> + if(so != -1 && eo != -1) {
> + add_text = text_substring(PointerGetDatum(buf_text),
> + so + 1, (eo - so), false);
> + appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
> + pfree(add_text);
> + }
> + }
> +
> + add_text = text_substring(PointerGetDatum(replace_text),
> + p - pstart + 1, -1, true);
> + appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
> + pfree(add_text);
> + }
> +
> + /*
> + * replace_text_regexp
> + * replace text using regexp
> + */
> + Datum
> + replace_text_regexp(PG_FUNCTION_ARGS)
> + {
> + text *left_text;
> + text *right_text;
> + text *buf_text;
> + text *ret_text;
> + text *src_text = PG_GETARG_TEXT_P(0);
> + char *src_text_str = PG_TEXT_GET_STR(src_text);
> + int src_text_len = TEXTLEN(src_text);
> + regex_t *re = (regex_t *)PG_GETARG_POINTER(1);
> + text *replace_text = PG_GETARG_TEXT_P(2);
> + char *replace_str = PG_TEXT_GET_STR(replace_text);
> + StringInfo str = makeStringInfo();
> + int regexec_result;
> + regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT];
> + pg_wchar *data;
> + size_t data_len;
> + int search_start;
> + bool have_escape;
> +
> + buf_text = TEXTDUP(src_text);
> +
> + /* Convert data string to wide characters */
> + data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
> + data_len = pg_mb2wchar_with_len(src_text_str, data,
> strlen(src_text_str));
> +
> + have_escape = have_escape_in_regexp_replace_str(replace_str);
> +
> + for(search_start = 0; search_start <= data_len;) {
> + regexec_result = pg_regexec(re,
> + data,
> + data_len,
> + search_start,
> + NULL, /* no details */
> + REGEXP_REPLACE_BACKREF_CNT,
> + pmatch,
> + 0);
> + if(regexec_result != 0) break;
> +
> + left_text = text_substring(PointerGetDatum(buf_text),
> + 1, pmatch[0].rm_so - search_start, false);
> + right_text = text_substring(PointerGetDatum(buf_text),
> + pmatch[0].rm_eo - search_start + 1,
> + -1, true);
> +
> + appendStringInfoString(str, PG_TEXT_GET_STR(left_text));
> + if(have_escape) {
> + appendStringInfoRegexpSubstr(str, replace_text, pmatch,
> + buf_text, search_start);
> + } else {
> + appendStringInfoString(str, replace_str);
> + }
> +
> + pfree(buf_text);
> + pfree(left_text);
> + buf_text = right_text;
> +
> + search_start = pmatch[0].rm_eo;
> + if(pmatch[0].rm_so == pmatch[0].rm_eo) search_start++;
> + }
> +
> + appendStringInfoString(str, PG_TEXT_GET_STR(buf_text));
> + pfree(buf_text);
> +
> + ret_text = PG_STR_GET_TEXT(str->data);
> + pfree(str->data);
> + pfree(str);
> + pfree(data);
>
> PG_RETURN_TEXT_P(ret_text);
> }
> *** ./src/include/catalog/pg_proc.h.orig Tue Jul 20 09:24:11 2004
> --- ./src/include/catalog/pg_proc.h Tue Jul 20 09:26:11 2004
> ***************
> *** 2186,2191 ****
> --- 2186,2193 ----
> DESCR("return portion of string");
> DATA(insert OID = 2087 ( replace PGNSP PGUID 12 f f t f i 3 25 "25 25
> 25" replace_text - _null_ ));
> DESCR("replace all occurrences of old_substr with new_substr in string");
> + DATA(insert OID = 2167 ( regexp_replace PGNSP PGUID 12 f f t f i 3 25
> "25 25 25" textregexreplace - _null_ ));
> + DESCR("replace text using regexp");
> DATA(insert OID = 2088 ( split_part PGNSP PGUID 12 f f t f i 3 25 "25
> 25 23" split_text - _null_ ));
> DESCR("split string by field_sep and return field_num");
> DATA(insert OID = 2089 ( to_hex PGNSP PGUID 12 f f t f i 1 25 "23"
> to_hex32 - _null_ ));
> *** ./src/include/regex/regex.h.orig Tue Jul 20 08:51:06 2004
> --- ./src/include/regex/regex.h Tue Jul 20 08:51:16 2004
> ***************
> *** 163,169 ****
> * the prototypes for exported functions
> */
> extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
> ! extern int pg_regexec(regex_t *, const pg_wchar *, size_t, rm_detail_t *,
> size_t, regmatch_t[], int);
> extern void pg_regfree(regex_t *);
> extern size_t pg_regerror(int, const regex_t *, char *, size_t);
>
> --- 163,169 ----
> * the prototypes for exported functions
> */
> extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
> ! extern int pg_regexec(regex_t *, const pg_wchar *, size_t, size_t,
> rm_detail_t *, size_t, regmatch_t[], int);
> extern void pg_regfree(regex_t *);
> extern size_t pg_regerror(int, const regex_t *, char *, size_t);
>
> *** ./src/include/utils/builtins.h.orig Tue Jul 20 09:11:19 2004
> --- ./src/include/utils/builtins.h Tue Jul 20 09:11:46 2004
> ***************
> *** 408,413 ****
> --- 408,414 ----
> extern Datum texticregexeq(PG_FUNCTION_ARGS);
> extern Datum texticregexne(PG_FUNCTION_ARGS);
> extern Datum textregexsubstr(PG_FUNCTION_ARGS);
> + extern Datum textregexreplace(PG_FUNCTION_ARGS);
> extern Datum similar_escape(PG_FUNCTION_ARGS);
> extern const char *assign_regex_flavor(const char *value,
> bool doit, bool interactive);
> ***************
> *** 537,542 ****
> --- 538,544 ----
> extern bool SplitIdentifierString(char *rawstring, char separator,
> List **namelist);
> extern Datum replace_text(PG_FUNCTION_ARGS);
> + extern Datum replace_text_regexp(PG_FUNCTION_ARGS);
> extern Datum split_text(PG_FUNCTION_ARGS);
> extern Datum text_to_array(PG_FUNCTION_ARGS);
> extern Datum array_to_text(PG_FUNCTION_ARGS);
>
> __________________________________________________
> Do You Yahoo!?
> http://bb.yahoo.co.jp/
>
>
> ---------------------------(end of broadcast)---------------------------
> TIP 3: if posting/reading through Usenet, please send an appropriate
> subscribe-nomail command to majordomo(at)postgresql(dot)org so that your
> message can get through to the mailing list cleanly
>

--
Bruce Momjian | http://candle.pha.pa.us
pgman(at)candle(dot)pha(dot)pa(dot)us | (610) 359-1001
+ If your life is a hard drive, | 13 Roberts Road
+ Christ can be your backup. | Newtown Square, Pennsylvania 19073

In response to

Browse pgsql-hackers by date

  From Date Subject
Next Message Bruce Momjian 2004-08-07 01:41:26 Re: parameter hints to the optimizer
Previous Message Bruce Momjian 2004-08-07 01:04:04 Re: Vacuum Cost Documentation?

Browse pgsql-patches by date

  From Date Subject
Next Message Joe Conway 2004-08-07 01:50:01 Re: [PATCHES] [BUGS] casting strings to multidimensional arrays yields
Previous Message Roger Leigh 2004-08-07 00:05:32 psql groff patch (for 8.1)