Index: doc/src/sgml/func.sgml =================================================================== RCS file: /home/jeremyd/local/postgres/cvsuproot/pgsql/doc/src/sgml/func.sgml,v retrieving revision 1.357 diff -c -r1.357 func.sgml *** doc/src/sgml/func.sgml 1 Feb 2007 00:28:16 -0000 1.357 --- doc/src/sgml/func.sgml 10 Feb 2007 23:33:04 -0000 *************** *** 1446,1451 **** --- 1446,1464 ---- + regexp_matches(string text, pattern text [,flags text]) + text[] or setof record (if flags are given) + + Return all capture groups resulting from matching a POSIX regular + expression against the string. See + for more information on pattern + matching. + + regexp_matches('foobarbequebaz', '(bar)(beque)') + {bar,beque} + + + regexp_replace(string text, pattern text, replacement text [,flags text]) text *************** *** 1458,1463 **** --- 1471,1488 ---- + regexp_split(string text, pattern text [,flags text]) + setof text + + Split string using a POSIX regular expression as + the delimiter. See for more + information on pattern matching. + + regexp_split('hello world', E'\\s+') + helloworld (2 rows) + + + repeat(string text, number int) text Repeat string the specified *************** *** 2861,2869 **** substring - - regexp_replace - string SIMILAR TO pattern ESCAPE escape-character --- 2886,2891 ---- *************** *** 2982,2987 **** --- 3004,3021 ---- regular expression pattern matching + + substring + + + regexp_replace + + + regexp_matches + + + regexp_split + lists the available *************** *** 3112,3118 **** string containing zero or more single-letter flags that change the function's behavior. Flag i specifies case-insensitive matching, while flag g specifies replacement of each matching ! 
substring rather than only the first one. Other supported flags are ! m, n, p, w and ! x, whose meanings correspond to those shown in ! . *************** *** 3127,3132 **** --- 3164,3344 ---- + + The regexp_matches function returns all of the capture + groups resulting from matching POSIX regular expression patterns. + It has the syntax + regexp_matches(string, pattern, + , flags ). + If there is no match to the pattern, the function returns NULL. + If there is a match, all of the capture groups in the pattern + are returned in a text[]. If the flags are not given, + this text[] is the only returned value. If the + flags are given, the function returns the following values: + + + Columns Returned from <function>regexp_matches</function> + + + + Column Name + Column Type + Description + + + + + + prematch + text + NULL unless the r flag is given, else all of the string prior to the match + + + + fullmatch + text + The entire match of the regular expression + + + + matches + text[] + Contents of all of the capture groups in the regular expression, or an empty array if there were none + + + + postmatch + text + NULL unless the r flag is given, else all of the string after the match + + + +
+ + + The flags parameter is an optional text + string containing zero or more single-letter flags that change the + function's behavior. Flag i specifies case-insensitive + matching, while flag g causes the return of each matching + substring rather than only the first one. The r flag causes + the fields prematch and postmatch to be + filled in. This can be expensive, so it should be used sparingly. Other supported + flags are m, n, p, w and + x, whose meanings correspond to those shown in + . + + + + Some examples: + + SELECT * FROM regexp_matches('foobarbequebaz', '(bar)(beque)'); + regexp_matches + ---------------- + {bar,beque} + (1 row) + + SELECT * FROM regexp_matches('foobarbequebaz', '(bar)(beque)', 'g'); + prematch | fullmatch | matches | postmatch + ----------+-----------+-------------+----------- + NULL | barbeque | {bar,beque} | NULL + (1 row) + + SELECT * FROM regexp_matches('foobarbequebaz', '(bar)(beque)', 'r'); + prematch | fullmatch | matches | postmatch + ----------+-----------+-------------+----------- + foo | barbeque | {bar,beque} | baz + (1 row) + + SELECT * FROM regexp_matches('foobarbequebazilbarfbonk', '(b[^b]+)(b[^b]+)', 'gr'); + prematch | fullmatch | matches | postmatch + -------------+-----------+--------------+--------------- + foo | barbeque | {bar,beque} | bazilbarfbonk + foobarbeque | bazilbarf | {bazil,barf} | bonk + (2 rows) + + + + + + The regexp_split function splits a string using a POSIX + regular expression pattern as a delimiter. It has the syntax + regexp_split(string, pattern, + , flags ). + If there is no match to the pattern, the function returns the + string. If there is at least one match, for each match it returns + the text from the end of the last match (or the beginning of the string) to the + beginning of the match. When there are no more matches, it returns the text from + the end of the last match to the end of the string. 
+ The flags parameter is an optional text + string containing zero or more single-letter flags that change the + function's behavior. regexp_split supports + the flags i, m, n, p, + w and x, whose meanings are described in + . + + + + Some examples: + + + SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over the lazy dog', E'\\s+') AS foo; + foo | length + --------+-------- + the | 3 + quick | 5 + brown | 5 + fox | 3 + jumped | 6 + over | 4 + the | 3 + lazy | 4 + dog | 3 + (9 rows) + + SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over the lazy dog', E'\\s*') AS foo; + foo | length + -----+-------- + t | 1 + h | 1 + e | 1 + q | 1 + u | 1 + i | 1 + c | 1 + k | 1 + b | 1 + r | 1 + o | 1 + w | 1 + n | 1 + f | 1 + o | 1 + x | 1 + j | 1 + u | 1 + m | 1 + p | 1 + e | 1 + d | 1 + o | 1 + v | 1 + e | 1 + r | 1 + t | 1 + h | 1 + e | 1 + l | 1 + a | 1 + z | 1 + y | 1 + d | 1 + o | 1 + g | 1 + (36 rows) + + + + PostgreSQL's regular expressions are implemented using a package written by Henry Spencer. 
Much of Index: src/backend/utils/adt/regexp.c =================================================================== RCS file: /home/jeremyd/local/postgres/cvsuproot/pgsql/src/backend/utils/adt/regexp.c,v retrieving revision 1.68 diff -c -r1.68 regexp.c *** src/backend/utils/adt/regexp.c 5 Jan 2007 22:19:41 -0000 1.68 --- src/backend/utils/adt/regexp.c 10 Feb 2007 09:11:08 -0000 *************** *** 29,36 **** --- 29,39 ---- */ #include "postgres.h" + #include "funcapi.h" + #include "access/heapam.h" #include "regex/regex.h" #include "utils/builtins.h" + #include "utils/lsyscache.h" #include "utils/guc.h" *************** *** 75,83 **** --- 78,137 ---- regex_t cre_re; /* the compiled regular expression */ } cached_re_str; + typedef struct re_comp_flags + { + int cflags; + bool return_pre_and_post; + bool glob; + } re_comp_flags; + + typedef struct regexp_matches_ctx + { + text *orig_str; + size_t orig_len; + pg_wchar *wide_str; + size_t wide_len; + regex_t *cpattern; + regmatch_t *pmatch; + size_t offset; + + re_comp_flags flags; + + /* return type info */ + TupleDesc rettupdesc; + Oid rettype; + TypeFuncClass typefunc; + + /* text type info */ + Oid param_type; + int16 typlen; + bool typbyval; + char typalign; + } regexp_matches_ctx; + + typedef struct regexp_split_ctx + { + text *orig_str; + size_t orig_len; + pg_wchar *wide_str; + size_t wide_len; + regex_t *cpattern; + regmatch_t match; + size_t offset; + re_comp_flags flags; + } regexp_split_ctx; + + static int num_res = 0; /* # of cached re's */ static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */ + static regexp_matches_ctx *setup_regexp_matches(FunctionCallInfo fcinfo, + text *orig_str, text *pattern, + text *flags); + static ArrayType *perform_regexp_matches(regexp_matches_ctx *matchctx); + static HeapTuple form_re_match_tuple(ArrayType *match_ary, + regexp_matches_ctx *matchctx); + /* * RE_compile_and_cache - compile a RE, caching if possible *************** *** 88,94 **** * cflags --- compile 
options for the pattern * * Pattern is given in the database encoding. We internally convert to ! * array of pg_wchar which is what Spencer's regex package wants. */ static regex_t * RE_compile_and_cache(text *text_re, int cflags) --- 142,148 ---- * cflags --- compile options for the pattern * * Pattern is given in the database encoding. We internally convert to ! * an array of pg_wchar, which is what Spencer's regex package wants. */ static regex_t * RE_compile_and_cache(text *text_re, int cflags) *************** *** 191,238 **** } /* ! * RE_compile_and_execute - compile and execute a RE * * Returns TRUE on match, FALSE on no match * ! * text_re --- the pattern, expressed as an *untoasted* TEXT object ! * dat --- the data to match against (need not be null-terminated) ! * dat_len --- the length of the data string ! * cflags --- compile options for the pattern * nmatch, pmatch --- optional return area for match details * ! * Both pattern and data are given in the database encoding. We internally ! * convert to array of pg_wchar which is what Spencer's regex package wants. */ static bool ! RE_compile_and_execute(text *text_re, char *dat, int dat_len, ! int cflags, int nmatch, regmatch_t *pmatch) { - pg_wchar *data; - size_t data_len; int regexec_result; - regex_t *re; char errMsg[100]; - /* Convert data string to wide characters */ - data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); - data_len = pg_mb2wchar_with_len(dat, data, dat_len); - - /* Compile RE */ - re = RE_compile_and_cache(text_re, cflags); - /* Perform RE match and return result */ regexec_result = pg_regexec(re, data, data_len, ! 0, NULL, /* no details */ nmatch, pmatch, 0); - pfree(data); - if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) { /* re failed??? */ --- 245,280 ---- } /* ! * RE_wchar_execute - execute a RE * * Returns TRUE on match, FALSE on no match * ! * re --- the compiled pattern as returned by RE_compile_and_cache ! 
* data --- the data to match against (need not be null-terminated) ! * data_len --- the length of the data string ! * start_search -- the offset in the data to start searching * nmatch, pmatch --- optional return area for match details * ! * Data is given as array of pg_wchar which is what Spencer's regex package ! * wants. */ static bool ! RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len, size_t start_search, ! int nmatch, regmatch_t *pmatch) { int regexec_result; char errMsg[100]; /* Perform RE match and return result */ regexec_result = pg_regexec(re, data, data_len, ! start_search, NULL, /* no details */ nmatch, pmatch, 0); if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) { /* re failed??? */ *************** *** 245,250 **** --- 287,399 ---- return (regexec_result == REG_OKAY); } + /* + * RE_execute - execute a RE + * + * Returns TRUE on match, FALSE on no match + * + * re --- the compiled pattern as returned by RE_compile_and_cache + * dat --- the data to match against (need not be null-terminated) + * dat_len --- the length of the data string + * nmatch, pmatch --- optional return area for match details + * + * Data is given in the database encoding. We internally + * convert to array of pg_wchar which is what Spencer's regex package wants. 
+ */ + static bool + RE_execute(regex_t *re, char *dat, int dat_len, + int nmatch, regmatch_t *pmatch) + { + pg_wchar *data; + size_t data_len; + bool match; + + /* Convert data string to wide characters */ + data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(dat, data, dat_len); + + /* Perform RE match and return result */ + match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch); + pfree(data); + return match; + } + + /* + * RE_compile_and_execute - compile and execute a RE + * + * Returns TRUE on match, FALSE on no match + * + * text_re --- the pattern, expressed as an *untoasted* TEXT object + * dat --- the data to match against (need not be null-terminated) + * dat_len --- the length of the data string + * cflags --- compile options for the pattern + * nmatch, pmatch --- optional return area for match details + * + * Both pattern and data are given in the database encoding. We internally + * convert to array of pg_wchar which is what Spencer's regex package wants. 
+ */ + static bool + RE_compile_and_execute(text *text_re, char *dat, int dat_len, + int cflags, int nmatch, regmatch_t *pmatch) + { + regex_t *re; + + /* Compile RE */ + re = RE_compile_and_cache(text_re, cflags); + + return RE_execute(re, dat, dat_len, nmatch, pmatch); + } + + static void + parse_re_comp_flags(re_comp_flags *flags, text *opts) + { + MemSet(flags, 0, sizeof(re_comp_flags)); + flags->cflags = regex_flavor; + + if (opts) + { + char *opt_p = VARDATA(opts); + size_t opt_len = VARSIZE(opts) - VARHDRSZ; + int i; + + for (i = 0; i < opt_len; i++) + { + switch (opt_p[i]) + { + case 'g': + flags->glob = true; + break; + case 'i': + flags->cflags |= REG_ICASE; + break; + case 'm': + case 'n': + flags->cflags |= REG_NEWLINE; + break; + case 'p': + flags->cflags |= REG_NLSTOP; + flags->cflags &= ~REG_NLANCH; + break; + case 'r': + flags->return_pre_and_post = true; + break; + case 'w': + flags->cflags &= ~REG_NLSTOP; + flags->cflags |= REG_NLANCH; + break; + case 'x': + flags->cflags |= REG_EXPANDED; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid regexp option: %c", opt_p[i]))); + break; + } + } + } + } + /* * assign_regex_flavor - GUC hook to validate and set REGEX_FLAVOR *************** *** 469,507 **** text *p = PG_GETARG_TEXT_P(1); text *r = PG_GETARG_TEXT_P(2); text *opt = PG_GETARG_TEXT_P(3); - char *opt_p = VARDATA(opt); - int opt_len = (VARSIZE(opt) - VARHDRSZ); - int i; - bool glob = false; - bool ignorecase = false; regex_t *re; ! /* parse options */ ! for (i = 0; i < opt_len; i++) ! { ! switch (opt_p[i]) ! { ! case 'i': ! ignorecase = true; ! break; ! case 'g': ! glob = true; ! break; ! default: ! ereport(ERROR, ! (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("invalid option of regexp_replace: %c", ! opt_p[i]))); ! break; ! } ! } ! if (ignorecase) ! re = RE_compile_and_cache(p, regex_flavor | REG_ICASE); ! else ! re = RE_compile_and_cache(p, regex_flavor); ! 
PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, glob)); } /* similar_escape() --- 618,636 ---- text *p = PG_GETARG_TEXT_P(1); text *r = PG_GETARG_TEXT_P(2); text *opt = PG_GETARG_TEXT_P(3); regex_t *re; + re_comp_flags flags; ! parse_re_comp_flags(&flags, opt); ! if (flags.return_pre_and_post) ! ereport(ERROR, ! (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("invalid option of regexp_replace: r"))); ! re = RE_compile_and_cache(p, flags.cflags); ! ! PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob)); } /* similar_escape() *************** *** 625,630 **** --- 754,1109 ---- PG_RETURN_TEXT_P(result); } + + Datum + regexp_matches(PG_FUNCTION_ARGS) + { + FuncCallContext *funcctx; + MemoryContext oldcontext; + regexp_matches_ctx *matchctx; + + if (SRF_IS_FIRSTCALL()) + { + text *pattern = PG_GETARG_TEXT_P(1); + text *flags = PG_GETARG_TEXT_P(2); + + funcctx = SRF_FIRSTCALL_INIT(); + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* be sure to copy the input string into the multi-call ctx */ + matchctx = setup_regexp_matches(fcinfo, PG_GETARG_TEXT_P_COPY(0), + pattern, flags); + + MemoryContextSwitchTo(oldcontext); + funcctx->user_fctx = (void *) matchctx; + + /* Avoid run-away function by making sure we never iterate more than + * the length of the text + 1 (the number of matches an empty pattern + * will make is length + 1) + */ + if (matchctx->flags.glob) + funcctx->max_calls = matchctx->wide_len + 1; + else + funcctx->max_calls = 0; + } + + funcctx = SRF_PERCALL_SETUP(); + matchctx = (regexp_matches_ctx *) funcctx->user_fctx; + + if (funcctx->call_cntr > funcctx->max_calls) + { + /* if max_calls == 0, then we are doing a non-global match, we should + * stop now, no proplem. 
Otherwise, if we exceed max_calls something + * really wonky is going on, since it is returning more matches than + * there are characters in the string, which should not happen + */ + if (funcctx->max_calls != 0) + elog(ERROR, "set returning match function terminated after iterating %d times", funcctx->call_cntr); + SRF_RETURN_DONE(funcctx); + } + + if (matchctx->offset < matchctx->wide_len) + { + ArrayType *result_ary; + + if (matchctx->pmatch[0].rm_so == matchctx->pmatch[0].rm_eo) + matchctx->offset++; + + if ((result_ary = perform_regexp_matches(matchctx)) != NULL) + { + HeapTuple result = form_re_match_tuple(result_ary, matchctx); + matchctx->offset = matchctx->pmatch[0].rm_eo; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(result)); + } + /* else fall through and return done */ + } + + SRF_RETURN_DONE (funcctx); + } + + static HeapTuple + form_re_match_tuple(ArrayType *match_ary, regexp_matches_ctx *matchctx) + { + Datum vals[4]; + bool nulls[4]; + + MemSet(vals, 0, sizeof(vals)); + MemSet(nulls, 0, sizeof(nulls)); + + if (matchctx->flags.return_pre_and_post) + { + vals[0] = DirectFunctionCall3(text_substr, + PointerGetDatum(matchctx->orig_str), + Int32GetDatum(1), + Int32GetDatum(matchctx->pmatch[0].rm_so)); + + vals[3] = DirectFunctionCall2(text_substr_no_len, + PointerGetDatum(matchctx->orig_str), + Int32GetDatum(matchctx->pmatch[0].rm_eo + 1)); + } + else + { + nulls[0] = nulls[3] = true; + } + + vals[1] = DirectFunctionCall3(text_substr, + PointerGetDatum(matchctx->orig_str), + Int32GetDatum(matchctx->pmatch[0].rm_so + 1), + Int32GetDatum(matchctx->pmatch[0].rm_eo - matchctx->pmatch[0].rm_so)); + vals[2] = PointerGetDatum(match_ary); + + return heap_form_tuple(matchctx->rettupdesc, vals, nulls); + } + + /* This version is not SRF, and has no flags parameter */ + Datum + regexp_matches_noopts(PG_FUNCTION_ARGS) + { + regexp_matches_ctx *matchctx; + + matchctx = setup_regexp_matches(fcinfo, PG_GETARG_TEXT_P(0), + PG_GETARG_TEXT_P(1), NULL); + + /* + * 
probably not really needed for this version, will only happen + * if input str is zero-length + */ + if (matchctx->offset < matchctx->wide_len) + { + ArrayType *result_ary = perform_regexp_matches(matchctx); + + if (result_ary != NULL) + PG_RETURN_ARRAYTYPE_P(result_ary); + } + + PG_RETURN_NULL(); + } + + static regexp_matches_ctx * + setup_regexp_matches(FunctionCallInfo fcinfo, text *orig_str, text *pattern, text *flags) + { + regexp_matches_ctx *matchctx = palloc(sizeof(regexp_matches_ctx)); + + matchctx->orig_str = orig_str; + matchctx->orig_len = VARSIZE(matchctx->orig_str) - VARHDRSZ; + + parse_re_comp_flags(&matchctx->flags, flags); + + matchctx->cpattern = RE_compile_and_cache(pattern, matchctx->flags.cflags); + matchctx->pmatch = palloc(sizeof(regmatch_t) * (matchctx->cpattern->re_nsub + 1)); + matchctx->offset = 0; + + /* get text type oid, too lazy to do it some other way */ + matchctx->param_type = get_fn_expr_argtype(fcinfo->flinfo, 0); + get_typlenbyvalalign(matchctx->param_type, &matchctx->typlen, + &matchctx->typbyval, &matchctx->typalign); + + matchctx->typefunc = get_call_result_type(fcinfo, &matchctx->rettype, &matchctx->rettupdesc); + + if (matchctx->typefunc == TYPEFUNC_COMPOSITE) + matchctx->rettupdesc = BlessTupleDesc(matchctx->rettupdesc); + + matchctx->wide_str = (pg_wchar *) palloc((matchctx->orig_len + 1) * sizeof(pg_wchar)); + matchctx->wide_len = pg_mb2wchar_with_len(VARDATA(matchctx->orig_str), + matchctx->wide_str, matchctx->orig_len); + + matchctx->pmatch[0].rm_so = -1; + /* both < 0 but not equal */ + matchctx->pmatch[0].rm_eo = -2; + + return matchctx; + } + + static ArrayType * + perform_regexp_matches(regexp_matches_ctx *matchctx) + { + if (RE_wchar_execute(matchctx->cpattern, + matchctx->wide_str, + matchctx->wide_len, + matchctx->offset, + matchctx->cpattern->re_nsub + 1, + matchctx->pmatch)) + { + Datum *elems = palloc(matchctx->cpattern->re_nsub * sizeof(Datum)); + bool *nulls = palloc(matchctx->cpattern->re_nsub * 
sizeof(bool)); + int ndims = 1; + int dims[1] = {matchctx->cpattern->re_nsub}; + int lbs[1] = {1}; + int i; + + for (i = 0; i < matchctx->cpattern->re_nsub; i++) + { + int so = matchctx->pmatch[i+1].rm_so; + int eo = matchctx->pmatch[i+1].rm_eo; + + if (so < 0 || eo < 0) + { + elems[i] = 0; + nulls[i] = true; + } + else + { + elems[i] = DirectFunctionCall3(text_substr, + PointerGetDatum(matchctx->orig_str), + Int32GetDatum(so + 1), + Int32GetDatum(eo - so)); + nulls[i] = false; + } + } + + return construct_md_array(elems, nulls, ndims, dims, lbs, + matchctx->param_type, matchctx->typlen, + matchctx->typbyval, matchctx->typalign); + } + else + return NULL; + } + + #define PG_GETARG_TEXT_P_IF_EXISTS(_n) \ + (PG_NARGS() > _n ? PG_GETARG_TEXT_P(_n) : NULL) + + Datum + regexp_split(PG_FUNCTION_ARGS) + { + FuncCallContext *funcctx; + regexp_split_ctx *splitctx; + + if (SRF_IS_FIRSTCALL()) + { + text *pattern = PG_GETARG_TEXT_P(1); + text *flags = PG_GETARG_TEXT_P_IF_EXISTS(2); + MemoryContext oldcontext; + + funcctx = SRF_FIRSTCALL_INIT(); + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + splitctx = (regexp_split_ctx *) palloc(sizeof(regexp_split_ctx)); + + splitctx->orig_str = PG_GETARG_TEXT_P_COPY(0); + splitctx->orig_len = VARSIZE(splitctx->orig_str) - VARHDRSZ; + + parse_re_comp_flags(&splitctx->flags, flags); + if (splitctx->flags.glob) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("regexp_split does not support the global option"))); + + if (splitctx->flags.return_pre_and_post) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("regexp_split does not support the return pre and postmatch option"))); + + splitctx->cpattern = RE_compile_and_cache(pattern, splitctx->flags.cflags); + + splitctx->wide_str = (pg_wchar *) palloc((splitctx->orig_len + 1) * sizeof(pg_wchar)); + splitctx->wide_len = pg_mb2wchar_with_len(VARDATA(splitctx->orig_str), + splitctx->wide_str, splitctx->orig_len); + + 
splitctx->offset = 0; + + splitctx->match.rm_so = -1; + /* both < 0 but not equal */ + splitctx->match.rm_eo = -2; + + MemoryContextSwitchTo(oldcontext); + funcctx->user_fctx = (void *) splitctx; + + /* Avoid run-away function by making sure we never iterate more than + * the length of the text + */ + funcctx->max_calls = splitctx->wide_len; + } + + funcctx = SRF_PERCALL_SETUP(); + splitctx = (regexp_split_ctx *) funcctx->user_fctx; + + /* if we exceed max_calls something really wonky is going on, since it is + * returning more matches than there are characters in the string, which + * should not happen + */ + if (funcctx->call_cntr > funcctx->max_calls) + elog(ERROR, "set returning split function terminated after iterating %d times", funcctx->call_cntr); + + if (splitctx->offset < splitctx->wide_len) + { + regmatch_t *pmatch = &(splitctx->match); + bool first_match = (pmatch->rm_so < 0 || pmatch->rm_eo < 0); + bool incremented_offset = false; + + for (;;) + { + Datum result; + + /* if the last match was zero-length, we need to push the offset + * forward to avoid matching the same place forever + */ + if (pmatch->rm_so == pmatch->rm_eo) + { + splitctx->offset++; + incremented_offset = true; + } + + if (RE_wchar_execute(splitctx->cpattern, + splitctx->wide_str, + splitctx->wide_len, + splitctx->offset, + 1, + pmatch)) + { + /* if we are trying to match at the beginning of the string and + * we got a zero-length match, or if we just matched where we + * left off last time, go around the loop again and increment + * the offset. 
If we have incremented the offset already and + * it matched at the new offset, that's ok + */ + if ((first_match && pmatch->rm_so == pmatch->rm_eo) + || (pmatch->rm_so == splitctx->offset && !incremented_offset)) + { + first_match = false; + continue; + } + + /* the start parameter is one-based and the offset + * is zero-based, so we need to add one, but if we + * incremented the offset, we need to subtract one + * from the offset, so in that case the net change is + * zero. compensate in the length for the extra + * character from the incremented offset + */ + result = DirectFunctionCall3(text_substr, + PointerGetDatum(splitctx->orig_str), + Int32GetDatum(splitctx->offset + (incremented_offset ? 0 : 1)), + Int32GetDatum(pmatch->rm_so - splitctx->offset + (incremented_offset ? 1 : 0))); + + /* set the offset to the end of this match for next time */ + splitctx->offset = pmatch->rm_eo; + SRF_RETURN_NEXT(funcctx, result); + } + else + { + /* no more matches, return rest of string */ + result = DirectFunctionCall2(text_substr_no_len, + PointerGetDatum(splitctx->orig_str), + Int32GetDatum(splitctx->offset + (incremented_offset ? 
0 : 1))); + /* so we know we're done next time through */ + splitctx->offset = splitctx->wide_len; + SRF_RETURN_NEXT(funcctx, result); + } + + /* will never get here, continue above is only way this loop + * will loop + */ + } + } + + SRF_RETURN_DONE(funcctx); + } + + /* Hack to make oprsanity happy */ + Datum regexp_split_noopts(PG_FUNCTION_ARGS) + { + return regexp_split(fcinfo); + } + /* * report whether regex_flavor is currently BASIC */ Index: src/include/catalog/pg_proc.h =================================================================== RCS file: /home/jeremyd/local/postgres/cvsuproot/pgsql/src/include/catalog/pg_proc.h,v retrieving revision 1.443 diff -c -r1.443 pg_proc.h *** src/include/catalog/pg_proc.h 7 Feb 2007 23:11:30 -0000 1.443 --- src/include/catalog/pg_proc.h 10 Feb 2007 07:15:19 -0000 *************** *** 2259,2266 **** --- 2259,2274 ---- DESCR("replace text using regexp"); DATA(insert OID = 2285 ( regexp_replace PGNSP PGUID 12 1 0 f f t f i 4 25 "25 25 25 25" _null_ _null_ _null_ textregexreplace - _null_ )); DESCR("replace text using regexp"); + DATA(insert OID = 2760 ( regexp_matches PGNSP PGUID 12 1 0 f f t f i 2 1009 "25 25" _null_ _null_ _null_ regexp_matches_noopts - _null_ )); + DESCR("return all match groups for regexp"); + DATA(insert OID = 2761 ( regexp_matches PGNSP PGUID 12 1 10 f f t t i 3 2249 "25 25 25" "{25,25,25,25,25,1009,25}" "{i,i,i,o,o,o,o}" "{str,pattern,flags,prematch,fullmatch,matches,postmatch}" regexp_matches - _null_ )); + DESCR("return all match groups for regexp"); DATA(insert OID = 2088 ( split_part PGNSP PGUID 12 1 0 f f t f i 3 25 "25 25 23" _null_ _null_ _null_ split_text - _null_ )); DESCR("split string by field_sep and return field_num"); + DATA(insert OID = 2762 ( regexp_split PGNSP PGUID 12 1 1000 f f t t i 2 25 "25 25" _null_ _null_ _null_ regexp_split_noopts - _null_ )); + DESCR("split string by pattern"); + DATA(insert OID = 2763 ( regexp_split PGNSP PGUID 12 1 1000 f f t t i 3 25 "25 25 25" _null_ _null_ 
_null_ regexp_split - _null_ )); + DESCR("split string by pattern"); DATA(insert OID = 2089 ( to_hex PGNSP PGUID 12 1 0 f f t f i 1 25 "23" _null_ _null_ _null_ to_hex32 - _null_ )); DESCR("convert int4 number to hex"); DATA(insert OID = 2090 ( to_hex PGNSP PGUID 12 1 0 f f t f i 1 25 "20" _null_ _null_ _null_ to_hex64 - _null_ )); Index: src/include/utils/builtins.h =================================================================== RCS file: /home/jeremyd/local/postgres/cvsuproot/pgsql/src/include/utils/builtins.h,v retrieving revision 1.287 diff -c -r1.287 builtins.h *** src/include/utils/builtins.h 28 Jan 2007 16:16:54 -0000 1.287 --- src/include/utils/builtins.h 10 Feb 2007 07:15:19 -0000 *************** *** 478,483 **** --- 478,487 ---- extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS); extern Datum textregexreplace(PG_FUNCTION_ARGS); extern Datum similar_escape(PG_FUNCTION_ARGS); + extern Datum regexp_matches(PG_FUNCTION_ARGS); + extern Datum regexp_matches_noopts(PG_FUNCTION_ARGS); + extern Datum regexp_split(PG_FUNCTION_ARGS); + extern Datum regexp_split_noopts(PG_FUNCTION_ARGS); extern bool regex_flavor_is_basic(void); /* regproc.c */ Index: src/test/regress/expected/strings.out =================================================================== RCS file: /home/jeremyd/local/postgres/cvsuproot/pgsql/src/test/regress/expected/strings.out,v retrieving revision 1.29 diff -c -r1.29 strings.out *** src/test/regress/expected/strings.out 11 May 2006 19:15:36 -0000 1.29 --- src/test/regress/expected/strings.out 10 Feb 2007 07:15:19 -0000 *************** *** 217,225 **** Z Z (1 row) ! -- invalid option of REGEXP_REPLACE SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); ! ERROR: invalid option of regexp_replace: z -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4"; 4 --- 217,486 ---- Z Z (1 row) ! -- invalid regexp option SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); ! ERROR: invalid regexp option: z ! 
-- set so we can tell NULL from empty string ! \pset null '\\N' ! -- return all matches from regexp ! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$); ! regexp_matches ! ---------------- ! {bar,beque} ! (1 row) ! ! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, ''); ! prematch | fullmatch | matches | postmatch ! ----------+-----------+-------------+----------- ! \N | barbeque | {bar,beque} | \N ! (1 row) ! ! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'r'); ! prematch | fullmatch | matches | postmatch ! ----------+-----------+-------------+----------- ! foo | barbeque | {bar,beque} | baz ! (1 row) ! ! -- test case insensitive - first variant not possible ! SELECT * FROM regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i'); ! prematch | fullmatch | matches | postmatch ! ----------+-----------+-------------+----------- ! \N | bARbEqUE | {bAR,bEqUE} | \N ! (1 row) ! ! SELECT * FROM regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'ir'); ! prematch | fullmatch | matches | postmatch ! ----------+-----------+-------------+----------- ! foO | bARbEqUE | {bAR,bEqUE} | bAz ! (1 row) ! ! -- global option - more than one match ! SELECT * FROM regexp_matches('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'gr'); ! prematch | fullmatch | matches | postmatch ! -------------+-----------+--------------+--------------- ! foo | barbeque | {bar,beque} | bazilbarfbonk ! foobarbeque | bazilbarf | {bazil,barf} | bonk ! (2 rows) ! ! -- empty capture group (matched empty string) ! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$); ! regexp_matches ! ---------------- ! {bar,"",beque} ! (1 row) ! ! -- no match ! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(.+)(beque)$re$); ! regexp_matches ! ---------------- ! \N ! (1 row) ! ! -- optional capture group did not match, null entry in array ! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$); ! 
regexp_matches ! ------------------ ! {bar,NULL,beque} ! (1 row) ! ! -- nothing before match, empty string ! SELECT * FROM regexp_matches('barbequebaz', $re$(bar)(beque)$re$, 'r'); ! prematch | fullmatch | matches | postmatch ! ----------+-----------+-------------+----------- ! | barbeque | {bar,beque} | baz ! (1 row) ! ! SELECT * FROM regexp_matches('barbequebaz', $re$^(bar)(beque)$re$, 'r'); ! prematch | fullmatch | matches | postmatch ! ----------+-----------+-------------+----------- ! | barbeque | {bar,beque} | baz ! (1 row) ! ! SELECT * FROM regexp_matches('barbequebaz', $re$(^bar)(beque)$re$, 'r'); ! prematch | fullmatch | matches | postmatch ! ----------+-----------+-------------+----------- ! | barbeque | {bar,beque} | baz ! (1 row) ! ! -- nothing after match, empty string ! SELECT * FROM regexp_matches('foobarbeque', $re$(bar)(beque)$re$, 'r'); ! prematch | fullmatch | matches | postmatch ! ----------+-----------+-------------+----------- ! foo | barbeque | {bar,beque} | ! (1 row) ! ! SELECT * FROM regexp_matches('foobarbeque', $re$(bar)(beque)$$re$, 'r'); ! prematch | fullmatch | matches | postmatch ! ----------+-----------+-------------+----------- ! foo | barbeque | {bar,beque} | ! (1 row) ! ! SELECT * FROM regexp_matches('foobarbeque', $re$(bar)(beque$)$re$, 'r'); ! prematch | fullmatch | matches | postmatch ! ----------+-----------+-------------+----------- ! foo | barbeque | {bar,beque} | ! (1 row) ! ! -- no capture groups, should first throw error? ! SELECT * FROM regexp_matches('foobarbequebaz', $re$barbeque$re$); ! regexp_matches ! ---------------- ! {} ! (1 row) ! ! SELECT * FROM regexp_matches('foobarbequebaz', $re$barbeque$re$, ''); ! prematch | fullmatch | matches | postmatch ! ----------+-----------+---------+----------- ! \N | barbeque | {} | \N ! (1 row) ! ! SELECT * FROM regexp_matches('foobarbequebaz', $re$barbeque$re$, 'r'); ! prematch | fullmatch | matches | postmatch ! ----------+-----------+---------+----------- ! 
foo | barbeque | {} | baz ! (1 row) ! ! -- give me errors ! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'zipper'); ! ERROR: invalid regexp option: z ! SELECT * FROM regexp_matches('foobarbequebaz', $re$(barbeque$re$, ''); ! ERROR: invalid regular expression: parentheses () not balanced ! SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$, ''); ! ERROR: invalid regular expression: invalid repetition count(s) ! -- split string on regexp ! SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over the lazy dog', $re$\s+$re$) AS foo; ! foo | length ! --------+-------- ! the | 3 ! quick | 5 ! brown | 5 ! fox | 3 ! jumped | 6 ! over | 4 ! the | 3 ! lazy | 4 ! dog | 3 ! (9 rows) ! ! SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over the lazy dog', $re$\s*$re$) AS foo; ! foo | length ! -----+-------- ! t | 1 ! h | 1 ! e | 1 ! q | 1 ! u | 1 ! i | 1 ! c | 1 ! k | 1 ! b | 1 ! r | 1 ! o | 1 ! w | 1 ! n | 1 ! f | 1 ! o | 1 ! x | 1 ! j | 1 ! u | 1 ! m | 1 ! p | 1 ! e | 1 ! d | 1 ! o | 1 ! v | 1 ! e | 1 ! r | 1 ! t | 1 ! h | 1 ! e | 1 ! l | 1 ! a | 1 ! z | 1 ! y | 1 ! d | 1 ! o | 1 ! g | 1 ! (36 rows) ! ! SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over the lazy dog', '') AS foo; ! foo | length ! -----+-------- ! t | 1 ! h | 1 ! e | 1 ! | 1 ! q | 1 ! u | 1 ! i | 1 ! c | 1 ! k | 1 ! | 1 ! b | 1 ! r | 1 ! o | 1 ! w | 1 ! n | 1 ! | 1 ! f | 1 ! o | 1 ! x | 1 ! | 1 ! j | 1 ! u | 1 ! m | 1 ! p | 1 ! e | 1 ! d | 1 ! | 1 ! o | 1 ! v | 1 ! e | 1 ! r | 1 ! | 1 ! t | 1 ! h | 1 ! e | 1 ! | 1 ! l | 1 ! a | 1 ! z | 1 ! y | 1 ! | 1 ! d | 1 ! o | 1 ! g | 1 ! (44 rows) ! ! -- case insensitive ! SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'i') AS foo; ! foo | length ! -----------------------+-------- ! th | 2 ! QUick bROWn FOx jUMP | 21 ! d ov | 4 ! r TH | 4 ! lazy dOG | 9 ! (5 rows) ! ! -- no match of pattern ! 
SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over the lazy dog', 'nomatch') AS foo; ! foo | length ! ----------------------------------------------+-------- ! the quick brown fox jumped over the lazy dog | 44 ! (1 row) ! ! -- errors ! SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy') AS foo; ! ERROR: invalid regexp option: z ! -- global option meaningless for regexp_split ! SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g') AS foo; ! ERROR: regexp_split does not support the global option ! -- return pre and post option meaningless for regexp_split ! SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'r') AS foo; ! ERROR: regexp_split does not support the return pre and postmatch option ! -- change NULL-display back ! \pset null '' -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4"; 4 Index: src/test/regress/sql/strings.sql =================================================================== RCS file: /home/jeremyd/local/postgres/cvsuproot/pgsql/src/test/regress/sql/strings.sql,v retrieving revision 1.18 diff -c -r1.18 strings.sql *** src/test/regress/sql/strings.sql 6 Mar 2006 19:49:20 -0000 1.18 --- src/test/regress/sql/strings.sql 10 Feb 2007 07:15:19 -0000 *************** *** 85,93 **** SELECT regexp_replace('AAA BBB CCC ', E'\\s+', ' ', 'g'); SELECT regexp_replace('AAA', '^|$', 'Z', 'g'); SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi'); ! -- invalid option of REGEXP_REPLACE SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4"; --- 85,155 ---- SELECT regexp_replace('AAA BBB CCC ', E'\\s+', ' ', 'g'); SELECT regexp_replace('AAA', '^|$', 'Z', 'g'); SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi'); ! 
-- invalid regexp option SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); + -- set so we can tell NULL from empty string + \pset null '\\N' + + -- return all matches from regexp + SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$); + SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, ''); + SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'r'); + + -- test case insensitive - first variant not possible + SELECT * FROM regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i'); + SELECT * FROM regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'ir'); + + -- global option - more than one match + SELECT * FROM regexp_matches('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'gr'); + + -- empty capture group (matched empty string) + SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$); + -- no match + SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(.+)(beque)$re$); + -- optional capture group did not match, null entry in array + SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$); + + -- nothing before match, empty string + SELECT * FROM regexp_matches('barbequebaz', $re$(bar)(beque)$re$, 'r'); + SELECT * FROM regexp_matches('barbequebaz', $re$^(bar)(beque)$re$, 'r'); + SELECT * FROM regexp_matches('barbequebaz', $re$(^bar)(beque)$re$, 'r'); + + -- nothing after match, empty string + SELECT * FROM regexp_matches('foobarbeque', $re$(bar)(beque)$re$, 'r'); + SELECT * FROM regexp_matches('foobarbeque', $re$(bar)(beque)$$re$, 'r'); + SELECT * FROM regexp_matches('foobarbeque', $re$(bar)(beque$)$re$, 'r'); + + -- no capture groups, should first throw error? 
+ SELECT * FROM regexp_matches('foobarbequebaz', $re$barbeque$re$); + SELECT * FROM regexp_matches('foobarbequebaz', $re$barbeque$re$, ''); + SELECT * FROM regexp_matches('foobarbequebaz', $re$barbeque$re$, 'r'); + + -- give me errors + SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'zipper'); + SELECT * FROM regexp_matches('foobarbequebaz', $re$(barbeque$re$, ''); + SELECT * FROM regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$, ''); + + -- split string on regexp + SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over the lazy dog', $re$\s+$re$) AS foo; + + SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over the lazy dog', $re$\s*$re$) AS foo; + SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over the lazy dog', '') AS foo; + -- case insensitive + SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'i') AS foo; + -- no match of pattern + SELECT foo, length(foo) FROM regexp_split('the quick brown fox jumped over the lazy dog', 'nomatch') AS foo; + + -- errors + SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy') AS foo; + -- global option meaningless for regexp_split + SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g') AS foo; + -- return pre and post option meaningless for regexp_split + SELECT foo, length(foo) FROM regexp_split('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'r') AS foo; + + -- change NULL-display back + \pset null '' + -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4";