regexp_replace

From: "a_ogawa00" <a_ogawa00(at)yahoo(dot)co(dot)jp>
To: <pgsql-patches(at)postgresql(dot)org>
Subject: regexp_replace
Date: 2004-07-19 13:24:42
Message-ID: PIEMIKOOMKNIJLLLBCBBMEPLCAAA.a_ogawa00@yahoo.co.jp
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-hackers pgsql-patches


This patch provides a new function regexp_replace.
regexp_replace extends a replace function and enables text search
by the regular expression. And, a back reference can be used within
a replace string.
(This patch for PostgreSQL 7.4.3)

Function: regexp_replace(str, pattern, replace_str)
Retuen Type: text
Description: Replace all matched string in str.
pattern is regular expression pattern.
replace_str is replace string that can use '\1' - '\9', and
'\&'.
'\1' - '\9' is back reference to the n'th subexpression.
'\&' is matched string.

(example1)
select regexp_replace('ABC-DEF', '(\\w+)-(\\w+)', '\\2-\\1')
result: DEF-ABC

(example2)
update tab1 set col1 = regexp_replace(col1, '[A-Z]', '');

---
Atsushi Ogawa
a_ogawa(at)hi-ho(dot)ne(dot)jp

--- cut here ---

*** ./src/backend/regex/regexec.c.orig Tue Jul 20 08:45:39 2004
--- ./src/backend/regex/regexec.c Tue Jul 20 08:49:36 2004
***************
*** 110,115 ****
--- 110,116 ----
regmatch_t *pmatch;
rm_detail_t *details;
chr *start; /* start of string */
+ chr *search_start; /* search start of string */
chr *stop; /* just past end of string */
int err; /* error code if any (0 none) */
regoff_t *mem; /* memory vector for backtracking */
***************
*** 168,173 ****
--- 169,175 ----
pg_regexec(regex_t *re,
const chr *string,
size_t len,
+ size_t search_start,
rm_detail_t *details,
size_t nmatch,
regmatch_t pmatch[],
***************
*** 219,224 ****
--- 221,227 ----
v->pmatch = pmatch;
v->details = details;
v->start = (chr *) string;
+ v->search_start = (chr *) string + search_start;
v->stop = (chr *) string + len;
v->err = 0;
if (backref)
***************
*** 288,294 ****
NOERR();
MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
cold = NULL;
! close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *)
NULL);
freedfa(s);
NOERR();
if (v->g->cflags & REG_EXPECT)
--- 291,298 ----
NOERR();
MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
cold = NULL;
! close = shortest(v, s, v->search_start, v->search_start, v->stop,
! &cold, (int *) NULL);
freedfa(s);
NOERR();
if (v->g->cflags & REG_EXPECT)
***************
*** 415,421 ****

assert(d != NULL && s != NULL);
cold = NULL;
! close = v->start;
do
{
MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
--- 419,425 ----

assert(d != NULL && s != NULL);
cold = NULL;
! close = v->search_start;
do
{
MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
*** ./src/backend/utils/adt/regexp.c.orig Tue Jul 20 08:50:08 2004
--- ./src/backend/utils/adt/regexp.c Tue Jul 20 09:00:05 2004
***************
*** 80,116 ****

/*
! * RE_compile_and_execute - compile and execute a RE, caching if possible
*
! * Returns TRUE on match, FALSE on no match
*
! * text_re --- the pattern, expressed as an *untoasted* TEXT object
! * dat --- the data to match against (need not be null-terminated)
! * dat_len --- the length of the data string
! * cflags --- compile options for the pattern
! * nmatch, pmatch --- optional return area for match details
*
! * Both pattern and data are given in the database encoding. We
internally
! * convert to array of pg_wchar which is what Spencer's regex package
wants.
*/
! static bool
! RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
! int cflags, int nmatch, regmatch_t *pmatch)
{
int text_re_len = VARSIZE(text_re);
- pg_wchar *data;
- size_t data_len;
pg_wchar *pattern;
size_t pattern_len;
int i;
int regcomp_result;
- int regexec_result;
cached_re_str re_temp;

- /* Convert data string to wide characters */
- data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
- data_len = pg_mb2wchar_with_len(dat, data, dat_len);
-
/*
* Look for a match among previously compiled REs. Since the data
* structure is self-organizing with most-used entries at the front,
--- 80,105 ----

/*
! * RE_compile_and_cache - compile a RE, caching if possible
*
! * Returns regex_t
*
! * text_re --- the pattern, expressed as an *untoasted* TEXT object
! * cflags --- compile options for the pattern
*
! * Pattern is given in the database encoding. We internally convert to
! * array of pg_wchar which is what Spencer's regex package wants.
*/
! static regex_t
! RE_compile_and_cache(text *text_re, int cflags)
{
int text_re_len = VARSIZE(text_re);
pg_wchar *pattern;
size_t pattern_len;
int i;
int regcomp_result;
cached_re_str re_temp;

/*
* Look for a match among previously compiled REs. Since the data
* structure is self-organizing with most-used entries at the front,
***************
*** 132,149 ****
re_array[0] = re_temp;
}

! /* Perform RE match and return result */
! regexec_result = pg_regexec(&re_array[0].cre_re,
! data,
! data_len,
! NULL, /* no details */
! nmatch,
! pmatch,
! 0);
!
! pfree(data);
!
! return (regexec_result == 0);
}
}

--- 121,127 ----
re_array[0] = re_temp;
}

! return re_array[0].cre_re;
}
}

***************
*** 210,219 ****
--- 188,231 ----
re_array[0] = re_temp;
num_res++;

+ return re_array[0].cre_re;
+ }
+
+ /*
+ * RE_compile_and_execute - compile and execute a RE, caching if possible
+ *
+ * Returns TRUE on match, FALSE on no match
+ *
+ * text_re --- the pattern, expressed as an *untoasted* TEXT object
+ * dat --- the data to match against (need not be null-terminated)
+ * dat_len --- the length of the data string
+ * cflags --- compile options for the pattern
+ * nmatch, pmatch --- optional return area for match details
+ *
+ * Both pattern and data are given in the database encoding. We
internally
+ * convert to array of pg_wchar which is what Spencer's regex package
wants.
+ */
+ static bool
+ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
+ int cflags, int nmatch, regmatch_t *pmatch)
+ {
+ pg_wchar *data;
+ size_t data_len;
+ int regexec_result;
+ regex_t re;
+
+ /* Convert data string to wide characters */
+ data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+ data_len = pg_mb2wchar_with_len(dat, data, dat_len);
+
+ /* Compile RE */
+ re = RE_compile_and_cache(text_re, cflags);
+
/* Perform RE match and return result */
regexec_result = pg_regexec(&re_array[0].cre_re,
data,
data_len,
+ 0,
NULL, /* no details */
nmatch,
pmatch,
***************
*** 415,420 ****
--- 427,452 ----
}

PG_RETURN_NULL();
+ }
+
+ /*
+ * textregexreplace()
+ * Return a replace string matched by a regular expression.
+ */
+ Datum
+ textregexreplace(PG_FUNCTION_ARGS)
+ {
+ text *s = PG_GETARG_TEXT_P(0);
+ text *p = PG_GETARG_TEXT_P(1);
+ text *r = PG_GETARG_TEXT_P(2);
+ regex_t re;
+
+ re = RE_compile_and_cache(p, regex_flavor);
+
+ return (DirectFunctionCall3(replace_text_regexp,
+ PointerGetDatum(s),
+ PointerGetDatum(&re),
+ PointerGetDatum(r)));
}

/* similar_escape()
*** ./src/backend/utils/adt/varlena.c.orig Tue Jul 20 09:00:17 2004
--- ./src/backend/utils/adt/varlena.c Tue Jul 20 10:23:32 2004
***************
*** 28,33 ****
--- 28,34 ----
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/pg_locale.h"
+ #include "regex/regex.h"

typedef struct varlena unknown;
***************
*** 1971,1976 ****
--- 1972,2122 ----
ret_text = PG_STR_GET_TEXT(str->data);
pfree(str->data);
pfree(str);
+
+ PG_RETURN_TEXT_P(ret_text);
+ }
+
+ /*
+ * have_escape_in_regexp_replace_str
+ * check replace string have escape char
+ */
+ static bool
+ have_escape_in_regexp_replace_str(const char *replace_str)
+ {
+ return (strchr(replace_str, '\\') != NULL);
+ }
+
+ #define REGEXP_REPLACE_BACKREF_CNT 10
+ /*
+ * appendStringInfoRegexpSubstr
+ * append string for regexp back references.
+ */
+ static void
+ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
+ regmatch_t *pmatch, text *buf_text, int search_start)
+ {
+ const char *pstart = PG_TEXT_GET_STR(replace_text);
+ const char *p = pstart;
+ const char *pnext;
+
+ text *add_text;
+ int so;
+ int eo;
+
+ for(;;) {
+ pnext = strchr(p, '\\');
+ if(pnext == NULL) break;
+
+ add_text = text_substring(PointerGetDatum(replace_text),
+ p - pstart + 1, pnext - p, false);
+ appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
+ pfree(add_text);
+
+ p = pnext + 1;
+ so = eo = -1;
+
+ if(*p >= '1' && *p <= '9') {
+ int idx = *p - '0';
+ so = pmatch[idx].rm_so - search_start;
+ eo = pmatch[idx].rm_eo - search_start;
+ p++;
+ } else {
+ switch(*p) {
+ case '&':
+ so = pmatch[0].rm_so - search_start;
+ eo = pmatch[0].rm_eo - search_start;
+ p++;
+ break;
+ }
+ }
+
+ if(so != -1 && eo != -1) {
+ add_text = text_substring(PointerGetDatum(buf_text),
+ so + 1, (eo - so), false);
+ appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
+ pfree(add_text);
+ }
+ }
+
+ add_text = text_substring(PointerGetDatum(replace_text),
+ p - pstart + 1, -1, true);
+ appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
+ pfree(add_text);
+ }
+
+ /*
+ * replace_text_regexp
+ * replace text using regexp
+ */
+ Datum
+ replace_text_regexp(PG_FUNCTION_ARGS)
+ {
+ text *left_text;
+ text *right_text;
+ text *buf_text;
+ text *ret_text;
+ text *src_text = PG_GETARG_TEXT_P(0);
+ char *src_text_str = PG_TEXT_GET_STR(src_text);
+ int src_text_len = TEXTLEN(src_text);
+ regex_t *re = (regex_t *)PG_GETARG_POINTER(1);
+ text *replace_text = PG_GETARG_TEXT_P(2);
+ char *replace_str = PG_TEXT_GET_STR(replace_text);
+ StringInfo str = makeStringInfo();
+ int regexec_result;
+ regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT];
+ pg_wchar *data;
+ size_t data_len;
+ int search_start;
+ bool have_escape;
+
+ buf_text = TEXTDUP(src_text);
+
+ /* Convert data string to wide characters */
+ data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
+ data_len = pg_mb2wchar_with_len(src_text_str, data,
strlen(src_text_str));
+
+ have_escape = have_escape_in_regexp_replace_str(replace_str);
+
+ for(search_start = 0; search_start <= data_len;) {
+ regexec_result = pg_regexec(re,
+ data,
+ data_len,
+ search_start,
+ NULL, /* no details */
+ REGEXP_REPLACE_BACKREF_CNT,
+ pmatch,
+ 0);
+ if(regexec_result != 0) break;
+
+ left_text = text_substring(PointerGetDatum(buf_text),
+ 1, pmatch[0].rm_so - search_start, false);
+ right_text = text_substring(PointerGetDatum(buf_text),
+ pmatch[0].rm_eo - search_start + 1,
+ -1, true);
+
+ appendStringInfoString(str, PG_TEXT_GET_STR(left_text));
+ if(have_escape) {
+ appendStringInfoRegexpSubstr(str, replace_text, pmatch,
+ buf_text, search_start);
+ } else {
+ appendStringInfoString(str, replace_str);
+ }
+
+ pfree(buf_text);
+ pfree(left_text);
+ buf_text = right_text;
+
+ search_start = pmatch[0].rm_eo;
+ if(pmatch[0].rm_so == pmatch[0].rm_eo) search_start++;
+ }
+
+ appendStringInfoString(str, PG_TEXT_GET_STR(buf_text));
+ pfree(buf_text);
+
+ ret_text = PG_STR_GET_TEXT(str->data);
+ pfree(str->data);
+ pfree(str);
+ pfree(data);

PG_RETURN_TEXT_P(ret_text);
}
*** ./src/include/catalog/pg_proc.h.orig Tue Jul 20 09:24:11 2004
--- ./src/include/catalog/pg_proc.h Tue Jul 20 09:26:11 2004
***************
*** 2186,2191 ****
--- 2186,2193 ----
DESCR("return portion of string");
DATA(insert OID = 2087 ( replace PGNSP PGUID 12 f f t f i 3 25 "25 25
25" replace_text - _null_ ));
DESCR("replace all occurrences of old_substr with new_substr in string");
+ DATA(insert OID = 2167 ( regexp_replace PGNSP PGUID 12 f f t f i 3 25
"25 25 25" textregexreplace - _null_ ));
+ DESCR("replace text using regexp");
DATA(insert OID = 2088 ( split_part PGNSP PGUID 12 f f t f i 3 25 "25
25 23" split_text - _null_ ));
DESCR("split string by field_sep and return field_num");
DATA(insert OID = 2089 ( to_hex PGNSP PGUID 12 f f t f i 1 25 "23"
to_hex32 - _null_ ));
*** ./src/include/regex/regex.h.orig Tue Jul 20 08:51:06 2004
--- ./src/include/regex/regex.h Tue Jul 20 08:51:16 2004
***************
*** 163,169 ****
* the prototypes for exported functions
*/
extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
! extern int pg_regexec(regex_t *, const pg_wchar *, size_t, rm_detail_t *,
size_t, regmatch_t[], int);
extern void pg_regfree(regex_t *);
extern size_t pg_regerror(int, const regex_t *, char *, size_t);

--- 163,169 ----
* the prototypes for exported functions
*/
extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
! extern int pg_regexec(regex_t *, const pg_wchar *, size_t, size_t,
rm_detail_t *, size_t, regmatch_t[], int);
extern void pg_regfree(regex_t *);
extern size_t pg_regerror(int, const regex_t *, char *, size_t);

*** ./src/include/utils/builtins.h.orig Tue Jul 20 09:11:19 2004
--- ./src/include/utils/builtins.h Tue Jul 20 09:11:46 2004
***************
*** 408,413 ****
--- 408,414 ----
extern Datum texticregexeq(PG_FUNCTION_ARGS);
extern Datum texticregexne(PG_FUNCTION_ARGS);
extern Datum textregexsubstr(PG_FUNCTION_ARGS);
+ extern Datum textregexreplace(PG_FUNCTION_ARGS);
extern Datum similar_escape(PG_FUNCTION_ARGS);
extern const char *assign_regex_flavor(const char *value,
bool doit, bool interactive);
***************
*** 537,542 ****
--- 538,544 ----
extern bool SplitIdentifierString(char *rawstring, char separator,
List **namelist);
extern Datum replace_text(PG_FUNCTION_ARGS);
+ extern Datum replace_text_regexp(PG_FUNCTION_ARGS);
extern Datum split_text(PG_FUNCTION_ARGS);
extern Datum text_to_array(PG_FUNCTION_ARGS);
extern Datum array_to_text(PG_FUNCTION_ARGS);

__________________________________________________
Do You Yahoo!?
http://bb.yahoo.co.jp/

Responses

Browse pgsql-hackers by date

  From Date Subject
Next Message Pierre Emmanuel Gros 2004-07-19 13:40:39 storage engine , mysql syntax CREATE TABLE t (i INT) ENGINE = INNODB|BDB
Previous Message Rod Taylor 2004-07-19 12:28:08 Re: pg_dump bug fixing

Browse pgsql-patches by date

  From Date Subject
Next Message Bruce Momjian 2004-07-19 16:35:05 Re: [HACKERS] Point in Time Recovery
Previous Message Andreas Pflug 2004-07-19 10:54:17 Re: logfile subprocess and Fancy File Functions