diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index fee0561961..36e446cb7b 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -3097,6 +3097,66 @@ repeat('Pg', 4) PgPgPgPg + + + + regexp_count + + regexp_count ( string text, pattern text [, position integer ] [, flags text ] ) + integer + + + Returns the number of times a pattern occurs for a match of a POSIX + regular expression to the string; see + . + + + regexp_count('123456789012', '\d{3}', 3) + 3 + + + + + + + regexp_instr + + regexp_instr ( string text, pattern text [, position integer ] [, occurence integer ] [, returnopt integer ] [, flags text ] [, group integer ] ) + integer + + + Returns the position within string where the + match of a POSIX regular expression occurs. It returns an integer + indicating the beginning or ending position of the matched substring, + depending on the value of the returnopt argument + (default beginning). If no match is found the function returns 0; + see . + + + regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 4) + 7 + + + + + + + regexp_like + + regexp_like ( string text, pattern text [, flags text ] ) + boolean + + + Evaluates the existence of a match to a POSIX regular expression + in string; see . + + + regexp_like('Hello'||chr(10)||'world', '^world$', 'm') + t + + + + @@ -3144,7 +3204,7 @@ repeat('Pg', 4) PgPgPgPg regexp_replace - regexp_replace ( string text, pattern text, replacement text [, flags text ] ) + regexp_replace ( string text, pattern text, replacement text [, position integer ] [, occurence integer ] [, flags text ] ) text @@ -3157,6 +3217,24 @@ repeat('Pg', 4) PgPgPgPg + + + + regexp_substr + + regexp_substr ( string text, pattern text [, position integer ] [, occurence integer ] [, flags text ] [, group integer ] ) + text + + + Return the substring within string corresponding to the + match of a POSIX regular expression; see . + + + regexp_substr('1234567890 1234557890', '(123)(4(5[56])(78))', 1, 2, 'i', 3) + 55 + + + @@ -5295,6 +5373,18 @@ substring('foobar' similar '#"o_b#"%' escape '#') NULL regexp_split_to_array + + regexp_like + + + regexp_count + + + regexp_instr + + + regexp_substr + lists the available @@ -5451,6 +5541,8 @@ substring('foobar' from 'o(.)b') o It has the syntax regexp_replace(source, pattern, replacement + , position + , occurrence , flags ). The source string is returned unchanged if there is no match to the pattern. If there is a @@ -5464,12 +5556,19 @@ substring('foobar' from 'o(.)b') o substring matching the entire pattern should be inserted. Write \\ if you need to put a literal backslash in the replacement text. - The flags parameter is an optional text - string containing zero or more single-letter flags that change the + pattern is searched in string starting + from an optional position or from the beginning + of source by default. Optional occurrence + parameter indicates which occurrence of pattern in + source should be replaced. Default value for occurrence + is 1, replace only the first occurrence. The flags parameter + is an optional text string containing zero or more single-letter flags that change the function's behavior. Flag i specifies case-insensitive matching, while flag g specifies replacement of each matching - substring rather than only the first one. Supported flags (though - not g) are + substring rather than only the first one. When occurrence + is set flag g has no effect. If occurrence + is set to zero, all occurrences are replaced which is similar to flag g. + Supported flags (though not g) are described in . @@ -5482,6 +5581,10 @@ regexp_replace('foobarbaz', 'b..', 'X', 'g') fooXX regexp_replace('foobarbaz', 'b(..)', 'X\1Y', 'g') fooXarYXazY +regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 0, 'i') + X PXstgrXSQL fXnctXXn +regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 3, 'i') + A PostgrXSQL function @@ -5669,6 +5772,163 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; in practice. Other software systems such as Perl use similar definitions. + + The regexp_like function evaluates the existence of a match + to a POSIX regular expression in string; returns a boolean + resulting from matching a POSIX regular expression pattern to a string. It has + the syntax regexp_like(string, + pattern , flags ). + pattern is searched in string starting + from the beginning of string. + The flags parameter is an optional text string + containing zero or more single-letter flags that change the function's behavior. + regexp_like accepts all the flags + shown in . + This function is similar to regexp operator ~ when used without + flags and similar to operator ~* when + used with flags 'i' only. + + + + Some examples: + +SELECT 'found' FROM (values('Hello'||chr(10)||'world') as f(col) WHERE regexp_like(col, '^world$', 'm'); + ?column? +---------- + found +(1 row) + + + + + + The regexp_count function returns the number of + captured substring(s) resulting from matching a POSIX regular + expression pattern to a string. It has the syntax regexp_count( + string, pattern , + position , flags ). + pattern is searched in string starting + from an optional position or from the beginning of string + by default. The flags parameter is an optional text string + containing zero or more single-letter flags that change the function's behavior. + regexp_count accepts all the flags + shown in . + The g flag is forced internally to count all matches. + This function returns 0 if there is no match or the number of matches as + an integer. + + + + Some examples: + +SELECT regexp_count('123123123123', '\d{3}', 1); + regexp_count +-------------- + 4 +(1 row) + +SELECT regexp_count('Hello'||CHR(10)||'world!', '^world!$', 1, 'm'); + regexp_count +-------------- + 1 +(1 row) + + + + + The regexp_instr function returns the beginning or ending + position of the matched substring resulting from matching a POSIX regular + expression pattern to a string. It has the syntax regexp_instr( + string, pattern , + position , occurrence + , returnopt + , flags + , group ). + pattern is searched in string starting + from an optional position or from the beginning + of string by default. Optional occurrence parameter + indicates which occurrence of pattern in string + should be searched. When optional returnopt parameter is set to 0 (default) + the function returns the position of the first character of the occurrence. When set to 1 returns + the position of the character after the occurrence. + The flags parameter is an optional text string + containing zero or more single-letter flags that change the function's behavior. + regexp_instr accepts all the flags + shown in . + The g flag is forced internally to track all matches. + For a pattern with capture groups, group is an integer indicating + which capture in pattern is the target of the function. + A capture group is a part of the pattern enclosed in parentheses. Capture groups can be nested. + They are numbered in the order in which their left parentheses appear in pattern. + If group is zero, then the position + of the entire substring that matches the pattern is returned. If pattern + does not have at least group capture groups, the function returns zero. + This function returns 0 if there is no match or the starting or ending position + of a match as an integer. + + + + Some examples: + +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 1, 6); + regexp_instr +-------------- + 32 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 3); + regexp_instr +-------------- + 5 +(1 row) + + + + + The regexp_substr function returns the + matched substring resulting from matching a POSIX regular + expression pattern to a string. It has the syntax regexp_substr( + string, pattern , + position , occurrence + , flags + , group ). + pattern is searched in string starting + from an optional position or from the beginning + of string by default. Optional parameter occurrence + indicates which occurrence of pattern in string + should be searched. The flags parameter is an optional text string + containing zero or more single-letter flags that change the function's behavior. + regexp_substr accepts all the flags + shown in . + The g flag is forced internally to track all matches. + For a pattern with capture groups, optional group is an integer indicating + which capture in pattern is the target of the function. A capture group + is a part of the pattern enclosed in parentheses. Capture groups can be nested. They are numbered in + the order in which their left parentheses appear in pattern. + If group is zero, then the position + of the entire substring that matches the pattern is returned. If pattern + does not have at least group capture groups, the function returns zero. + This function returns NULL if there is no match or the substring of the match. + + + + Some examples: + +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 1, 2); + regexp_substr +--------------- + , FR +(1 row) + +SELECT regexp_substr('1234567890 1234557890', '(123)(4(5[56])(78))', 1, 2, 'i', 3); + regexp_substr +--------------- + 55 +(1 row) + + + + diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index a32c5c82ab..624c57cec9 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -120,6 +120,7 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern, static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx); static Datum build_regexp_split_result(regexp_matches_ctx *splitctx); +static text *enclose_with_parenthesis(text *str); /* * RE_compile_and_cache - compile a RE, caching if possible @@ -555,7 +556,6 @@ texticregexne(PG_FUNCTION_ARGS) 0, NULL)); } - /* * textregexsubstr() * Return a substring matched by a regular expression. @@ -1063,6 +1063,508 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS) return regexp_matches(fcinfo); } +/* + * regexp_like() + * Return the true if a pattern match within a string. + */ +Datum +regexp_like(PG_FUNCTION_ARGS) +{ + text *str = PG_GETARG_TEXT_PP(0); + text *pattern = PG_GETARG_TEXT_PP(1); + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2); + + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + /* Determine options */ + parse_re_flags(&re_flags, flags); + + matchctx = setup_regexp_matches(str, pattern, &re_flags, + PG_GET_COLLATION(), true, false, false); + + if (matchctx->nmatches > 0) + PG_RETURN_BOOL(true); + + PG_RETURN_BOOL(false); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_like_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_like(fcinfo); +} + +/* + * regexp_count() + * Return the number of matches of a pattern within a string. + */ +Datum +regexp_count(PG_FUNCTION_ARGS) +{ + text *orig_str = NULL; + text *pattern = PG_GETARG_TEXT_PP(1); + int start = 1; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(3); + + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + if (PG_NARGS() > 2) + start = PG_GETARG_INT32(2); + + if (start < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", "start_position", start))); + + /* regexp_count(string, pattern, start[, flags]) */ + if (start > 1) + orig_str = DatumGetTextPP(DirectFunctionCall2(text_substr_no_len, + PG_GETARG_DATUM(0), + Int32GetDatum(start) + )); + else + orig_str = PG_GETARG_TEXT_PP(0); + + /* Determine options */ + parse_re_flags(&re_flags, flags); + + /* this function require flag 'g' */ + re_flags.glob = true; + + matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, + PG_GET_COLLATION(), true, false, false); + + PG_RETURN_INT32(matchctx->nmatches); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_count_no_start(PG_FUNCTION_ARGS) +{ + return regexp_count(fcinfo); +} + + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_count_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_count(fcinfo); +} + +/* + * textregexreplace_extended() + * Return a string matched by a regular expression, with replacement. + * Extends textregexreplace by allowing a start position and the + * choice of the occurrence to replace (0 means all occurrences). + */ +Datum +textregexreplace_extended(PG_FUNCTION_ARGS) +{ + text *s = NULL; + text *p = PG_GETARG_TEXT_PP(1); + text *pattern = enclose_with_parenthesis(PG_GETARG_TEXT_PP(1)); + text *r = PG_GETARG_TEXT_PP(2); + int start = 1; + int occurrence = 1; + int pos = 0; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5); + StringInfoData str; + regex_t *re; + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + text *after = NULL; + + /* start position */ + if (PG_NARGS() > 3) + start = PG_GETARG_INT32(3); + if (start < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", "start_position", start))); + if (start > 1) + s = DatumGetTextPP(DirectFunctionCall2(text_substr_no_len, + PG_GETARG_DATUM(0), + Int32GetDatum(start) + )); + else + s = PG_GETARG_TEXT_PP(0); + + /* occurrence to replace */ + if (PG_NARGS() > 4) + occurrence = PG_GETARG_INT32(4); + + if (occurrence < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", "occurrence", occurrence))); + + /* Determine options */ + parse_re_flags(&re_flags, flags); + re_flags.glob = true; + + /* lookup for pattern */ + matchctx = setup_regexp_matches(s, pattern, &re_flags, + PG_GET_COLLATION(), true, + false, re_flags.glob); + + /* If no match is found, then the function returns the original string */ + if (matchctx->nmatches == 0) + PG_RETURN_DATUM(PG_GETARG_DATUM(0)); + + /* Get the position of the occurence to replace */ + if (PG_NARGS() > 4 && occurrence > 0) + { + /* When occurrence exceed matches return the original string */ + if (occurrence > matchctx->nmatches) + PG_RETURN_DATUM(PG_GETARG_DATUM(0)); + + pos = (occurrence*matchctx->npatterns*2)-(matchctx->npatterns*2); + pos = matchctx->match_locs[pos]+start; + } + else + pos = start; + + /* Normal case without explicit VARIADIC marker */ + initStringInfo(&str); + + /* Get the string before the occurrence starting */ + if (pos > 1) + { + text *before = DatumGetTextPP(DirectFunctionCall3(text_substr, + PG_GETARG_DATUM(0), + Int32GetDatum(1), + Int32GetDatum(pos - 1))); + appendStringInfoString(&str, TextDatumGetCString(before)); + } + + /* all occurences must be replaced? */ + if (occurrence == 0) + re_flags.glob = true; + else + re_flags.glob = false; + + /* Compile the regular expression */ + re = RE_compile_and_cache(p, re_flags.cflags, PG_GET_COLLATION()); + + /* Get the substring starting at the right occurrence position */ + after = DatumGetTextPP(DirectFunctionCall2(text_substr_no_len, + PG_GETARG_DATUM(0), + Int32GetDatum(pos) + )); + + appendStringInfoString(&str, TextDatumGetCString(replace_text_regexp( + after, + (void *) re, + r, + re_flags.glob))); + + PG_RETURN_TEXT_P(CStringGetTextDatum(str.data)); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +textregexreplace_extended_no_occurrence(PG_FUNCTION_ARGS) +{ + return textregexreplace_extended(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +textregexreplace_extended_no_flags(PG_FUNCTION_ARGS) +{ + return textregexreplace_extended(fcinfo); +} + +/* + * Return text string between parenthesis + */ +static text * +enclose_with_parenthesis(text *str) +{ + int len = VARSIZE_ANY_EXHDR(str); + text *result; + char *ptr; + + result = palloc(len + VARHDRSZ + 2); + SET_VARSIZE(result, len + VARHDRSZ + 2); + ptr = VARDATA(result); + memcpy(ptr, "(", 1); + memcpy(ptr+1, VARDATA_ANY(str), len); + memcpy(ptr+len+1, ")", 1); + + return result; +} + +/* + * regexp_instr() + * Return the position within the string where the match was located + */ +Datum +regexp_instr(PG_FUNCTION_ARGS) +{ + text *orig_str = NULL; + text *pattern = NULL; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5); + int start = 1; + int occurrence = 1; + int return_opt = 0; + int subexpr = 0; + int pos; + + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + /* regexp_instr(string, pattern, start) */ + if (PG_NARGS() > 2) + start = PG_GETARG_INT32(2); + + if (start < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", "start_position", start))); + + if (start > 1) + orig_str = DatumGetTextPP(DirectFunctionCall2(text_substr_no_len, + PG_GETARG_DATUM(0), + Int32GetDatum(start) + )); + else + /* regexp_instr(string, pattern) */ + orig_str = PG_GETARG_TEXT_PP(0); + + if (orig_str == NULL) + PG_RETURN_NULL(); + + /* regexp_instr(string, pattern, start, occurrence) */ + if (PG_NARGS() > 3) + occurrence = PG_GETARG_INT32(3); + + if (occurrence <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", "occurrence", occurrence))); + + /* regexp_instr(string, pattern, start, occurrence, return_opt) */ + if (PG_NARGS() > 4) + return_opt = PG_GETARG_INT32(4); + + if (return_opt != 0 && return_opt != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", "return_option", return_opt))); + + /* regexp_instr(string, pattern, start, occurrence, return_opt, flags, subexpr) */ + if (PG_NARGS() > 6) + subexpr = PG_GETARG_INT32(6); + if (subexpr < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", "group", subexpr))); + + /* + * If subexpr is zero (default), then the position of the entire + * substring that matches the pattern is returned. Otherwise we + * will exactly register the subexpressions given in the pattern. + * Enclose pattern between parenthesis to register the position + * of the entire substring. + */ + pattern = enclose_with_parenthesis(PG_GETARG_TEXT_PP(1)); + + /* Determine options */ + parse_re_flags(&re_flags, flags); + /* this function require flag 'g' */ + re_flags.glob = true; + + matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, + PG_GET_COLLATION(), true, false, false); + + /* If no match is found, then the function returns 0 */ + if (matchctx->nmatches == 0) + PG_RETURN_INT32(0); + + /* When occurrence exceed matches return 0 */ + if (occurrence > matchctx->nmatches) + PG_RETURN_INT32(0); + + /* When subexpr exceed number of subexpression return 0 */ + if (subexpr > matchctx->npatterns - 1) + PG_RETURN_INT32(0); + + /* + * Returns the position of the first character of the occurrence + * or for subexpression in this occurrence. + */ + pos = (occurrence*matchctx->npatterns*2)-((matchctx->npatterns-subexpr)*2); + if (return_opt == 1) + pos += 1; + + PG_RETURN_INT32(matchctx->match_locs[pos]+start); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_start(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_occurrence(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_return_opt(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_subexpr(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* + * regexp_substr() + * Return the substring within the string that match a regular + * expression pattern + */ +Datum +regexp_substr(PG_FUNCTION_ARGS) +{ + text *orig_str = NULL; + text *pattern = NULL; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(4); + int start = 1; + int occurrence = 1; + int subexpr = 0; + int so, eo, pos; + + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + if (PG_NARGS() > 2) + start = PG_GETARG_INT32(2); + + if (start < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", "start_position", start))); + + /* regexp_substr(string, pattern, start) */ + if (start > 1) + orig_str = DatumGetTextPP(DirectFunctionCall2(text_substr_no_len, + PG_GETARG_DATUM(0), + Int32GetDatum(start) + )); + else + /* regexp_substr(string, pattern) */ + orig_str = PG_GETARG_TEXT_PP(0); + + if (orig_str == NULL) + PG_RETURN_NULL(); + + /* regexp_substr(string, pattern, start, occurrence) */ + if (PG_NARGS() > 3) + occurrence = PG_GETARG_INT32(3); + + if (occurrence <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", "occurrence", occurrence))); + + /* regexp_substr(string, pattern, start, occurrence, flags, subexpr) */ + if (PG_NARGS() > 5) + subexpr = PG_GETARG_INT32(5); + if (subexpr < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", "group", subexpr))); + + /* + * If subexpr is zero (default), then the position of the entire + * substring that matches the pattern is returned. Otherwise we + * will exactly register the subexpressions given in the pattern. + * Enclose pattern between parenthesis to register the position + * of the entire substring. + */ + pattern = enclose_with_parenthesis(PG_GETARG_TEXT_PP(1)); + + /* Determine options */ + parse_re_flags(&re_flags, flags); + /* this function require flag 'g' */ + re_flags.glob = true; + + matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, + PG_GET_COLLATION(), true, false, false); + + /* If no match is found, then the function returns NULL */ + if (matchctx->nmatches == 0) + PG_RETURN_NULL(); + + /* When occurrence exceed matches return NULL */ + if (occurrence > matchctx->nmatches) + PG_RETURN_NULL(); + + /* When subexpr exceed number of subexpression return NULL */ + if (subexpr > matchctx->npatterns - 1) + PG_RETURN_NULL(); + + /* Returns the substring corresponding to the occurrence. */ + pos = (occurrence*matchctx->npatterns*2)-((matchctx->npatterns-subexpr)*2); + so = matchctx->match_locs[pos]+1; + eo = matchctx->match_locs[pos+1]+1; + + PG_RETURN_DATUM(DirectFunctionCall3(text_substr, + PointerGetDatum(matchctx->orig_str), + Int32GetDatum(so), + Int32GetDatum(eo - so))); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_substr_no_start(PG_FUNCTION_ARGS) +{ + return regexp_substr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_substr_no_occurrence(PG_FUNCTION_ARGS) +{ + return regexp_substr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_substr_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_substr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_substr_no_subexpr(PG_FUNCTION_ARGS) +{ + return regexp_substr(fcinfo); +} + /* * setup_regexp_matches --- do the initial matching for regexp_match * and regexp_split functions diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 59d2b71ca9..fb91c54afe 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3560,6 +3560,63 @@ proname => 'regexp_matches', prorows => '10', proretset => 't', prorettype => '_text', proargtypes => 'text text text', prosrc => 'regexp_matches' }, +{ oid => '9614', descr => 'count match(es) for regexp', + proname => 'regexp_count', prorettype => 'int4', proargtypes => 'text text', + prosrc => 'regexp_count_no_start' }, +{ oid => '9615', descr => 'count match(es) for regexp', + proname => 'regexp_count', prorettype => 'int4', proargtypes => 'text text int4', + prosrc => 'regexp_count_no_flags' }, +{ oid => '9616', descr => 'count match(es) for regexp', + proname => 'regexp_count', prorettype => 'int4', + proargtypes => 'text text int4 text', prosrc => 'regexp_count' }, +{ oid => '9617', descr => 'position where the match for regexp was located', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text', + prosrc => 'regexp_instr_no_start' }, +{ oid => '9618', descr => 'position where the match for regexp was located', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text int4', + prosrc => 'regexp_instr_no_occurrence' }, +{ oid => '9619', descr => 'position where the match for regexp was located', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text int4 int4', + prosrc => 'regexp_instr_no_return_opt' }, +{ oid => '9620', descr => 'position where the match for regexp was located', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text int4 int4 int4', + prosrc => 'regexp_instr_no_flags' }, +{ oid => '9621', descr => 'position where the match for regexp was located', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text int4 int4 int4 text', + prosrc => 'regexp_instr_no_subexpr' }, +{ oid => '9622', descr => 'position where the match for regexp was located', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text int4 int4 int4 text int4', + prosrc => 'regexp_instr' }, +{ oid => '9623', descr => 'substring that matches the regexp pattern', + proname => 'regexp_substr', prorettype => 'text', proargtypes => 'text text', + prosrc => 'regexp_substr_no_start' }, +{ oid => '9624', descr => 'substring that matchies the regexp pattern', + proname => 'regexp_substr', prorettype => 'text', proargtypes => 'text text int4', + prosrc => 'regexp_substr_no_occurrence' }, +{ oid => '9625', descr => 'substring that matches the regexp pattern', + proname => 'regexp_substr', prorettype => 'text', proargtypes => 'text text int4 int4', + prosrc => 'regexp_substr_no_flags' }, +{ oid => '9626', descr => 'substring that matches the regexp pattern', + proname => 'regexp_substr', prorettype => 'text', proargtypes => 'text text int4 int4 text', + prosrc => 'regexp_substr_no_subexpr' }, +{ oid => '9627', descr => 'substring that matches the regexp pattern', + proname => 'regexp_substr', prorettype => 'text', proargtypes => 'text text int4 int4 text int4', + prosrc => 'regexp_substr' }, +{ oid => '9628', descr => 'evaluate match(es) for regexp', + proname => 'regexp_like', prorettype => 'bool', proargtypes => 'text text', + prosrc => 'regexp_like_no_flags' }, +{ oid => '9629', descr => 'evaluate match(es) for regexp', + proname => 'regexp_like', prorettype => 'bool', + proargtypes => 'text text text', prosrc => 'regexp_like' }, +{ oid => '9606', descr => 'replace text using regexp', + proname => 'regexp_replace', prorettype => 'text', + proargtypes => 'text text text int4 int4 text', prosrc => 'textregexreplace_extended' }, +{ oid => '9607', descr => 'replace text using regexp', + proname => 'regexp_replace', prorettype => 'text', + proargtypes => 'text text text int4 int4', prosrc => 'textregexreplace_extended_no_flags' }, +{ oid => '9608', descr => 'replace text using regexp', + proname => 'regexp_replace', prorettype => 'text', + proargtypes => 'text text text int4', prosrc => 'textregexreplace_extended_no_occurrence' }, { oid => '2088', descr => 'split string by field_sep and return field_num', proname => 'split_part', prorettype => 'text', proargtypes => 'text text int4', prosrc => 'split_part' }, diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index fb4573d85f..b9cef32e9e 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -905,6 +905,751 @@ SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ov ERROR: regexp_split_to_table() does not support the "global" option SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g'); ERROR: regexp_split_to_array() does not support the "global" option +-- regexp_like tests +SELECT regexp_like('a'||CHR(10)||'d', 'a.d'); + regexp_like +------------- + t +(1 row) + +SELECT regexp_like('a'||CHR(10)||'d', 'a.d', 'm'); + regexp_like +------------- + f +(1 row) + +SELECT regexp_like('a'||CHR(10)||'d', 'a.d', 'n'); + regexp_like +------------- + f +(1 row) + +SELECT regexp_like('Steven', '^Ste(v|ph)en$'); + regexp_like +------------- + t +(1 row) + +SELECT regexp_like('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^bar'); + regexp_like +------------- + f +(1 row) + +SELECT regexp_like('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', 'bar'); + regexp_like +------------- + t +(1 row) + +SELECT regexp_like('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^bar', 'm'); + regexp_like +------------- + t +(1 row) + +SELECT regexp_like('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^bar', 'n'); + regexp_like +------------- + t +(1 row) + +SELECT regexp_like('GREEN', '([aeiou])\1'); + regexp_like +------------- + f +(1 row) + +SELECT regexp_like('GREEN', '([aeiou])\1', 'i'); + regexp_like +------------- + t +(1 row) + +SELECT regexp_like('ORANGE' || chr(10) || 'GREEN', '([aeiou])\1', 'i'); + regexp_like +------------- + t +(1 row) + +SELECT regexp_like('ORANGE' || chr(10) || 'GREEN', '^..([aeiou])\1', 'i'); + regexp_like +------------- + f +(1 row) + +SELECT regexp_like('ORANGE' || chr(10) || 'GREEN', '([aeiou])\1', 'in'); + regexp_like +------------- + t +(1 row) + +SELECT regexp_like('ORANGE' || chr(10) || 'GREEN', '^..([aeiou])\1', 'in'); + regexp_like +------------- + t +(1 row) + +SELECT regexp_like('ORANGE' || chr(10) || 'GREEN', '^..([aeiou])\1', 'im'); + regexp_like +------------- + t +(1 row) + +SELECT REGEXP_LIKE('abc', 'a b c'); + regexp_like +------------- + f +(1 row) + +SELECT REGEXP_LIKE('abc', 'a b c','x'); + regexp_like +------------- + t +(1 row) + +-- count all matches for regexp +SELECT regexp_count('123123123123123', '(12)3'); + regexp_count +-------------- + 5 +(1 row) + +-- count all matches with start position +SELECT regexp_count('123123123123', '123', 0); + regexp_count +-------------- + 4 +(1 row) + +SELECT regexp_count('123123123123', '123', 1); + regexp_count +-------------- + 4 +(1 row) + +SELECT regexp_count('123123123123', '123', 3); + regexp_count +-------------- + 3 +(1 row) + +SELECT regexp_count('123123123123', '123', -3); +ERROR: invalid value for parameter "start_position": -3 +-- count all matches in NULL string with a start position +SELECT regexp_count(NULL, '123', 3); + regexp_count +-------------- + \N +(1 row) + +-- count all matches with a start position greater than string length +SELECT regexp_count('123', '123', 10); + regexp_count +-------------- + 0 +(1 row) + +-- count all matches from different regexp +SELECT regexp_count('ABC123', '[A-Z]'), regexp_count('A1B2C3', '[A-Z]'); + regexp_count | regexp_count +--------------+-------------- + 3 | 3 +(1 row) + +SELECT regexp_count('ABC123', '[A-Z][0-9]'), regexp_count('A1B2C3', '[A-Z][0-9]'); + regexp_count | regexp_count +--------------+-------------- + 1 | 3 +(1 row) + +SELECT regexp_count('ABC123', '[A-Z][0-9]{2}'), regexp_count('A1B2C3', '[A-Z][0-9]{2}'); + regexp_count | regexp_count +--------------+-------------- + 1 | 0 +(1 row) + +SELECT regexp_count('ABC123', '([A-Z][0-9]){2}'), regexp_count('A1B2C3', '([A-Z][0-9]){2}'); + regexp_count | regexp_count +--------------+-------------- + 0 | 1 +(1 row) + +SELECT regexp_count('ABC123A5', '^[A-Z][0-9]'), regexp_count('A1B2C3', '^[A-Z][0-9]'); + regexp_count | regexp_count +--------------+-------------- + 0 | 1 +(1 row) + +SELECT regexp_count('ABC123', '[A-Z][0-9]{2}'), regexp_count('A1B2C34', '[A-Z][0-9]{2}'); + regexp_count | regexp_count +--------------+-------------- + 1 | 1 +(1 row) + +-- count matches with newline case insensivity +SELECT regexp_count('a'||CHR(10)||'d', 'a.d'); + regexp_count +-------------- + 1 +(1 row) + +SELECT regexp_count('a'||CHR(10)||'d', 'a.d', 1, 's'); + regexp_count +-------------- + 1 +(1 row) + +-- count matches with newline case sensivity +SELECT regexp_count('a'||CHR(10)||'d', 'a.d', 1, 'n'); + regexp_count +-------------- + 0 +(1 row) + +-- count not multiline matches +SELECT regexp_count('a'||CHR(10)||'d', '^d$'); + regexp_count +-------------- + 0 +(1 row) + +-- count multiline matches +SELECT regexp_count('a'||CHR(10)||'d', '^d$', 1, 'm'); + regexp_count +-------------- + 1 +(1 row) + +SELECT regexp_count('Hello'||CHR(10)||'world!', '^world!$', 1, 'm'); -- 1 + regexp_count +-------------- + 1 +(1 row) + +-- Count the number of occurrences of the substring an in the string. +SELECT regexp_count('a man, a plan, a canal: Panama', 'an'); + regexp_count +-------------- + 4 +(1 row) + +-- Find the number of occurrences of the substring an in the string starting with the fifth character. +SELECT regexp_count('a man, a plan, a canal: Panama', 'an',5); + regexp_count +-------------- + 3 +(1 row) + +-- Find the number of occurrences of a substring containing a lower-case character +-- followed by an. In the first example, do not use a modifier. In the second example, +-- use the i modifier to force the regular expression to ignore case. +SELECT regexp_count('a man, a plan, a canal: Panama', '[a-z]an'); + regexp_count +-------------- + 3 +(1 row) + +SELECT regexp_count('a man, a plan, a canal: Panama', '[a-z]an', 1, 'i'); + regexp_count +-------------- + 4 +(1 row) + +DROP TABLE IF EXISTS regexp_temp; +NOTICE: table "regexp_temp" does not exist, skipping +CREATE TABLE regexp_temp(fullname varchar(20), email varchar(20)); +INSERT INTO regexp_temp (fullname, email) VALUES ('John Doe', 'johndoe@example.com'); +INSERT INTO regexp_temp (fullname, email) VALUES ('Jane Doe', 'janedoe'); +-- count matches case sensitive +SELECT fullname, regexp_count(fullname, 'e', 1, 'c') FROM regexp_temp; + fullname | regexp_count +----------+-------------- + John Doe | 1 + Jane Doe | 2 +(2 rows) + +SELECT fullname, regexp_count(fullname, 'D', 1, 'c') FROM regexp_temp; + fullname | regexp_count +----------+-------------- + John Doe | 1 + Jane Doe | 1 +(2 rows) + +SELECT fullname, regexp_count(fullname, 'd', 1, 'c') FROM regexp_temp; + fullname | regexp_count +----------+-------------- + John Doe | 0 + Jane Doe | 0 +(2 rows) + +-- count matches case insensitive +SELECT fullname, regexp_count(fullname, 'E', 1, 'i') FROM regexp_temp; + fullname | regexp_count +----------+-------------- + John Doe | 1 + Jane Doe | 2 +(2 rows) + +SELECT fullname, regexp_count(fullname, 'do', 1, 'i') FROM regexp_temp; + fullname | regexp_count +----------+-------------- + John Doe | 1 + Jane Doe | 1 +(2 rows) + +-- return the start position of the 6th occurence starting at beginning of the string +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 1, 6); + regexp_instr +-------------- + 32 +(1 row) + +-- return the start position of the 5th occurence starting after the first word +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 7, 5, 0); + regexp_instr +-------------- + 32 +(1 row) + +-- return the ending position of the 5th occurence starting after the first word +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 7, 5, 0); + regexp_instr +-------------- + 32 +(1 row) + +-- return the ending position of the 2nd occurence starting after the first word +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[T|Z|S][[:alpha:]]{5}', 7, 2, 1, 'i'); + regexp_instr +-------------- + 30 +(1 row) + +-- return the starting position corresponding to the different capture group +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 0); + regexp_instr +-------------- + 1 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 1); + regexp_instr +-------------- + 1 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 2); + regexp_instr +-------------- + 4 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 3); + regexp_instr +-------------- + 5 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 4); + regexp_instr +-------------- + 7 +(1 row) + +-- return the starting position corresponding to a non existant capture group +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 5); + regexp_instr +-------------- + 0 +(1 row) + +-- Same but with the ending position +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 0); + regexp_instr +-------------- + 9 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 1); + regexp_instr +-------------- + 4 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 2); + regexp_instr +-------------- + 9 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 3); + regexp_instr +-------------- + 7 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 4); + regexp_instr +-------------- + 9 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 5); + regexp_instr +-------------- + 0 +(1 row) + +-- start position of a valid email +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+') "valid_email" FROM regexp_temp; + email | valid_email +---------------------+------------- + johndoe@example.com | 1 + janedoe | 0 +(2 rows) + +-- ending position of a valid email +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+', 1, 1, 1) "valid_email" FROM regexp_temp; + email | valid_email +---------------------+------------- + johndoe@example.com | 20 + janedoe | 0 +(2 rows) + +-- start position of first capture group in the email (the dot part) +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+', 1, 1, 0, 'i', 1) FROM regexp_temp; + email | regexp_instr +---------------------+-------------- + johndoe@example.com | 16 + janedoe | 0 +(2 rows) + +-- ending position of first capture group in the email (the dot part) +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+', 1, 1, 1, 'i', 1) FROM regexp_temp; + email | regexp_instr +---------------------+-------------- + johndoe@example.com | 20 + janedoe | 0 +(2 rows) + +-- test negative values +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', -1, 1, 1, 'i', 1); +ERROR: invalid value for parameter "start_position": -1 +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, -1, 1, 'i', 1); +ERROR: invalid value for parameter "occurrence": -1 +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, -1, 'i', 1); +ERROR: invalid value for parameter "return_option": -1 +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', -1); +ERROR: invalid value for parameter "group": -1 +-- Find the first occurrence of a sequence of letters starting with the letter e +-- and ending with the letter y in the phrase "easy come, easy go." +SELECT regexp_instr('easy come, easy go','e\w*y'); + regexp_instr +-------------- + 1 +(1 row) + +-- Find the first sequence of letters starting with the letter e and ending with +-- the letter y in the string "easy come, easy go" starting at the second character +SELECT regexp_instr('easy come, easy go','e\w*y',2); + regexp_instr +-------------- + 12 +(1 row) + +-- Find the second sequence of letters starting with the letter e and ending with +-- the letter y in the string "easy come, easy go" starting at the first character. +SELECT regexp_instr('easy come, easy go','e\w*y',1,2); + regexp_instr +-------------- + 12 +(1 row) + +-- Find the position of the first character after the first whitespace in the string "easy come, easy go." +SELECT regexp_instr('easy come, easy go','\s',1,1,1); + regexp_instr +-------------- + 6 +(1 row) + +-- Find the position of the start of the third word in a string by capturing each +-- word as a subexpression, and returning the third subexpression's start position. +SELECT regexp_instr('one two three','(\w+)\s+(\w+)\s+(\w+)', 1,1,0,'',3); + regexp_instr +-------------- + 9 +(1 row) + +-- return the substring matching the regexp +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+'); + regexp_substr +---------------- + , zipcode town +(1 row) + +-- return the substring matching the regexp +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 24); + regexp_substr +--------------- + , FR +(1 row) + +-- return the substring matching the regexp at the first occurrence +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 1, 1); + regexp_substr +---------------- + , zipcode town +(1 row) + +-- return the substring matching the regexp at the second occurrence +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 1, 2); + regexp_substr +--------------- + , FR +(1 row) + +-- case sensitivity substring search +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1); + regexp_substr +--------------- + \N +(1 row) + +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1, 'i'); + regexp_substr +---------------- + , zipcode town +(1 row) + +-- case sensitivity substring search with no capture group +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1, 'i', 0); + regexp_substr +---------------- + , zipcode town +(1 row) + +-- case sensitivity substring search with non existing capture group +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1, 'i', 0); + regexp_substr +---------------- + , zipcode town +(1 row) + +-- return the substring matching the regexp at different occurrence and capture group +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, 1, 'i', 4); + regexp_substr +--------------- + 78 +(1 row) + +SELECT regexp_substr('1234567890 1234557890', '(123)(4(5[56])(78))', 1, 2, 'i', 3); + regexp_substr +--------------- + 55 +(1 row) + +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, 1, 'i', 0); + regexp_substr +--------------- + 12345678 +(1 row) + +-- test negative values +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', -1, 1, 'i', 4); +ERROR: invalid value for parameter "start_position": -1 +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, -1, 'i', 4); +ERROR: invalid value for parameter "occurrence": -1 +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, 1, 'i', -4); +ERROR: invalid value for parameter "group": -4 +-- Select the first substring of letters that end with "thy." +SELECT regexp_substr('healthy, wealthy, and wise','\w+thy'); + regexp_substr +--------------- + healthy +(1 row) + +-- Select the first substring of letters that ends with "thy" starting at the second character in the string. +SELECT regexp_substr('healthy, wealthy, and wise','\w+thy',2); + regexp_substr +--------------- + ealthy +(1 row) + +-- Return the contents of the third captured subexpression, which captures the third word in the string. +SELECT regexp_substr('one two three', '(\w+)\s+(\w+)\s+(\w+)', 1, 1, '', 3); + regexp_substr +--------------- + three +(1 row) + +DROP TABLE IF EXISTS regexp_temp; +-- Regression tests for extended regexp_replace() function with start position and occurrence +SELECT regexp_replace('512.123.4567', '([[:digit:]]{3})\.([[:digit:]]{3})\.([[:digit:]]{4})', '(\1) \2-\3', 1); + regexp_replace +---------------- + (512) 123-4567 +(1 row) + +SELECT regexp_replace('512.123.4567 612.123.4567', '([[:digit:]]{3})\.([[:digit:]]{3})\.([[:digit:]]{4})', '(\1) \2-\3', 1, 0); + regexp_replace +------------------------------- + (512) 123-4567 (612) 123-4567 +(1 row) + +SELECT regexp_replace('number your street,'||CHR(10)||' zipcode town, FR', '( ){2,}', ' ', 1, 0); + regexp_replace +--------------------- + number your street,+ + zipcode town, FR +(1 row) + +SELECT regexp_replace('number your street, zipcode town, FR', '( ){2,}', ' ', 9); + regexp_replace +-------------------------------------------- + number your street, zipcode town, FR +(1 row) + +SELECT regexp_replace('number your street, zipcode town, FR', '( ){2,}', ' ', 9, 0); + regexp_replace +---------------------------------------- + number your street, zipcode town, FR +(1 row) + +SELECT regexp_replace('number your street, zipcode town, FR', '( ){2,}', ' ', 9, 2); + regexp_replace +--------------------------------------------- + number your street, zipcode town, FR +(1 row) + +SELECT regexp_replace('number your street, zipcode town, FR', '( ){2,}', ' ', 9, 2, 'm'); + regexp_replace +--------------------------------------------- + number your street, zipcode town, FR +(1 row) + +SELECT regexp_replace('number your street, zipcode town, FR', '([EURT]){2,}', '[\1]', 9, 1, 'i'); + regexp_replace +------------------------------------------------- + number yo[r] street, zipcode town, FR +(1 row) + +SELECT regexp_replace('number your street, zipcode town, FR', '([EURT]){2,}', '[\1]', 9, 2, 'i'); + regexp_replace +---------------------------------------------- + number your s[t], zipcode town, FR +(1 row) + +SELECT regexp_replace ('A PostgreSQL function', 'A|e|i|o|u', 'X', 1, 2); + regexp_replace +----------------------- + A PXstgreSQL function +(1 row) + +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 0, 'i'); + regexp_replace +----------------------- + X PXstgrXSQL fXnctXXn +(1 row) + +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 1, 'i'); + regexp_replace +----------------------- + X PostgreSQL function +(1 row) + +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 2, 'i'); + regexp_replace +----------------------- + A PXstgreSQL function +(1 row) + +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 3, 'i'); + regexp_replace +----------------------- + A PostgrXSQL function +(1 row) + +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 9, 'i'); + regexp_replace +----------------------- + A PostgreSQL function +(1 row) + +SELECT regexp_replace ('A PostgreSQL function', 'A|e|i|o|u', 'X', 1, 9); + regexp_replace +----------------------- + A PostgreSQL function +(1 row) + +-- Invalid parameter values +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', -1, 0, 'i'); +ERROR: invalid value for parameter "start_position": -1 +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, -1, 'i'); +ERROR: invalid value for parameter "occurrence": -1 +-- Modifier 'g' should not be taken in account, we have an occurrence to replace +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 1, 'g'); + regexp_replace +----------------------- + A PXstgreSQL function +(1 row) + +-- Find groups of "word characters" (letters, numbers and underscore) ending with +-- "thy" in the string "healthy, wealthy, and wise" and replace them with nothing. +SELECT regexp_replace('healthy, wealthy, and wise','\w+thy', '', 'g'); + regexp_replace +---------------- + , , and wise +(1 row) + +SELECT regexp_replace('healthy, wealthy, and wise','\w+thy', '', 1, 0); + regexp_replace +---------------- + , , and wise +(1 row) + +SELECT regexp_replace('healthy, wealthy, and wise','\w+thy', '', 1, 0, 'g'); + regexp_replace +---------------- + , , and wise +(1 row) + +-- Find groups of word characters ending with "thy" and replace with "something." +SELECT regexp_replace('healthy, wealthy, and wise','\w+thy', 'something', 'g'); + regexp_replace +-------------------------------- + something, something, and wise +(1 row) + +-- Find groups of word characters ending with "thy" and replace with the string +-- "something" starting at the third character in the string. +SELECT regexp_replace('healthy, wealthy, and wise','\w+thy', 'something', 3, 0); + regexp_replace +---------------------------------- + hesomething, something, and wise +(1 row) + +-- Replace the second group of word characters ending with "thy" with "something." +SELECT regexp_replace('healthy, wealthy, and wise','\w+thy', 'something', 1, 2); + regexp_replace +------------------------------ + healthy, something, and wise +(1 row) + +-- Find groups of word characters ending with "thy" capturing the letters before +-- the "thy", and replace with the captured letters plus the letters "ish." +SELECT regexp_replace('healthy, wealthy, and wise','(\w+)thy', '\1ish', 'g'); + regexp_replace +---------------------------- + healish, wealish, and wise +(1 row) + +SELECT regexp_replace('healthy, wealthy, and wise','(\w+)thy', '\1ish', 1, 0); + regexp_replace +---------------------------- + healish, wealish, and wise +(1 row) + -- change NULL-display back \pset null '' -- E021-11 position expression diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 57a48c9d0b..7623e85f1e 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -255,6 +255,199 @@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g') AS foo; SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g'); +-- regexp_like tests +SELECT regexp_like('a'||CHR(10)||'d', 'a.d'); +SELECT regexp_like('a'||CHR(10)||'d', 'a.d', 'm'); +SELECT regexp_like('a'||CHR(10)||'d', 'a.d', 'n'); +SELECT regexp_like('Steven', '^Ste(v|ph)en$'); +SELECT regexp_like('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^bar'); +SELECT regexp_like('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', 'bar'); +SELECT regexp_like('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^bar', 'm'); +SELECT regexp_like('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^bar', 'n'); +SELECT regexp_like('GREEN', '([aeiou])\1'); +SELECT regexp_like('GREEN', '([aeiou])\1', 'i'); +SELECT regexp_like('ORANGE' || chr(10) || 'GREEN', '([aeiou])\1', 'i'); +SELECT regexp_like('ORANGE' || chr(10) || 'GREEN', '^..([aeiou])\1', 'i'); +SELECT regexp_like('ORANGE' || chr(10) || 'GREEN', '([aeiou])\1', 'in'); +SELECT regexp_like('ORANGE' || chr(10) || 'GREEN', '^..([aeiou])\1', 'in'); +SELECT regexp_like('ORANGE' || chr(10) || 'GREEN', '^..([aeiou])\1', 'im'); +SELECT REGEXP_LIKE('abc', 'a b c'); +SELECT REGEXP_LIKE('abc', 'a b c','x'); + +-- count all matches for regexp +SELECT regexp_count('123123123123123', '(12)3'); +-- count all matches with start position +SELECT regexp_count('123123123123', '123', 0); +SELECT regexp_count('123123123123', '123', 1); +SELECT regexp_count('123123123123', '123', 3); +SELECT regexp_count('123123123123', '123', -3); +-- count all matches in NULL string with a start position +SELECT regexp_count(NULL, '123', 3); +-- count all matches with a start position greater than string length +SELECT regexp_count('123', '123', 10); +-- count all matches from different regexp +SELECT regexp_count('ABC123', '[A-Z]'), regexp_count('A1B2C3', '[A-Z]'); +SELECT regexp_count('ABC123', '[A-Z][0-9]'), regexp_count('A1B2C3', '[A-Z][0-9]'); +SELECT regexp_count('ABC123', '[A-Z][0-9]{2}'), regexp_count('A1B2C3', '[A-Z][0-9]{2}'); +SELECT regexp_count('ABC123', '([A-Z][0-9]){2}'), regexp_count('A1B2C3', '([A-Z][0-9]){2}'); +SELECT regexp_count('ABC123A5', '^[A-Z][0-9]'), regexp_count('A1B2C3', '^[A-Z][0-9]'); +SELECT regexp_count('ABC123', '[A-Z][0-9]{2}'), regexp_count('A1B2C34', '[A-Z][0-9]{2}'); +-- count matches with newline case insensivity +SELECT regexp_count('a'||CHR(10)||'d', 'a.d'); +SELECT regexp_count('a'||CHR(10)||'d', 'a.d', 1, 's'); +-- count matches with newline case sensivity +SELECT regexp_count('a'||CHR(10)||'d', 'a.d', 1, 'n'); +-- count not multiline matches +SELECT regexp_count('a'||CHR(10)||'d', '^d$'); +-- count multiline matches +SELECT regexp_count('a'||CHR(10)||'d', '^d$', 1, 'm'); +SELECT regexp_count('Hello'||CHR(10)||'world!', '^world!$', 1, 'm'); -- 1 +-- Count the number of occurrences of the substring an in the string. +SELECT regexp_count('a man, a plan, a canal: Panama', 'an'); +-- Find the number of occurrences of the substring an in the string starting with the fifth character. +SELECT regexp_count('a man, a plan, a canal: Panama', 'an',5); +-- Find the number of occurrences of a substring containing a lower-case character +-- followed by an. In the first example, do not use a modifier. In the second example, +-- use the i modifier to force the regular expression to ignore case. +SELECT regexp_count('a man, a plan, a canal: Panama', '[a-z]an'); +SELECT regexp_count('a man, a plan, a canal: Panama', '[a-z]an', 1, 'i'); + +DROP TABLE IF EXISTS regexp_temp; +CREATE TABLE regexp_temp(fullname varchar(20), email varchar(20)); +INSERT INTO regexp_temp (fullname, email) VALUES ('John Doe', 'johndoe@example.com'); +INSERT INTO regexp_temp (fullname, email) VALUES ('Jane Doe', 'janedoe'); +-- count matches case sensitive +SELECT fullname, regexp_count(fullname, 'e', 1, 'c') FROM regexp_temp; +SELECT fullname, regexp_count(fullname, 'D', 1, 'c') FROM regexp_temp; +SELECT fullname, regexp_count(fullname, 'd', 1, 'c') FROM regexp_temp; +-- count matches case insensitive +SELECT fullname, regexp_count(fullname, 'E', 1, 'i') FROM regexp_temp; +SELECT fullname, regexp_count(fullname, 'do', 1, 'i') FROM regexp_temp; + +-- return the start position of the 6th occurence starting at beginning of the string +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 1, 6); +-- return the start position of the 5th occurence starting after the first word +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 7, 5, 0); +-- return the ending position of the 5th occurence starting after the first word +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 7, 5, 0); +-- return the ending position of the 2nd occurence starting after the first word +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[T|Z|S][[:alpha:]]{5}', 7, 2, 1, 'i'); +-- return the starting position corresponding to the different capture group +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 0); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 1); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 2); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 3); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 4); +-- return the starting position corresponding to a non existant capture group +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 5); +-- Same but with the ending position +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 0); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 1); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 2); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 3); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 4); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 5); +-- start position of a valid email +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+') "valid_email" FROM regexp_temp; +-- ending position of a valid email +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+', 1, 1, 1) "valid_email" FROM regexp_temp; +-- start position of first capture group in the email (the dot part) +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+', 1, 1, 0, 'i', 1) FROM regexp_temp; +-- ending position of first capture group in the email (the dot part) +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+', 1, 1, 1, 'i', 1) FROM regexp_temp; +-- test negative values +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', -1, 1, 1, 'i', 1); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, -1, 1, 'i', 1); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, -1, 'i', 1); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', -1); + +-- Find the first occurrence of a sequence of letters starting with the letter e +-- and ending with the letter y in the phrase "easy come, easy go." +SELECT regexp_instr('easy come, easy go','e\w*y'); +-- Find the first sequence of letters starting with the letter e and ending with +-- the letter y in the string "easy come, easy go" starting at the second character +SELECT regexp_instr('easy come, easy go','e\w*y',2); +-- Find the second sequence of letters starting with the letter e and ending with +-- the letter y in the string "easy come, easy go" starting at the first character. +SELECT regexp_instr('easy come, easy go','e\w*y',1,2); +-- Find the position of the first character after the first whitespace in the string "easy come, easy go." +SELECT regexp_instr('easy come, easy go','\s',1,1,1); +-- Find the position of the start of the third word in a string by capturing each +-- word as a subexpression, and returning the third subexpression's start position. +SELECT regexp_instr('one two three','(\w+)\s+(\w+)\s+(\w+)', 1,1,0,'',3); + +-- return the substring matching the regexp +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+'); +-- return the substring matching the regexp +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 24); +-- return the substring matching the regexp at the first occurrence +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 1, 1); +-- return the substring matching the regexp at the second occurrence +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 1, 2); +-- case sensitivity substring search +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1); +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1, 'i'); +-- case sensitivity substring search with no capture group +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1, 'i', 0); +-- case sensitivity substring search with non existing capture group +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1, 'i', 0); +-- return the substring matching the regexp at different occurrence and capture group +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, 1, 'i', 4); +SELECT regexp_substr('1234567890 1234557890', '(123)(4(5[56])(78))', 1, 2, 'i', 3); +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, 1, 'i', 0); +-- test negative values +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', -1, 1, 'i', 4); +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, -1, 'i', 4); +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, 1, 'i', -4); +-- Select the first substring of letters that end with "thy." +SELECT regexp_substr('healthy, wealthy, and wise','\w+thy'); +-- Select the first substring of letters that ends with "thy" starting at the second character in the string. +SELECT regexp_substr('healthy, wealthy, and wise','\w+thy',2); +-- Return the contents of the third captured subexpression, which captures the third word in the string. +SELECT regexp_substr('one two three', '(\w+)\s+(\w+)\s+(\w+)', 1, 1, '', 3); + +DROP TABLE IF EXISTS regexp_temp; + +-- Regression tests for extended regexp_replace() function with start position and occurrence +SELECT regexp_replace('512.123.4567', '([[:digit:]]{3})\.([[:digit:]]{3})\.([[:digit:]]{4})', '(\1) \2-\3', 1); +SELECT regexp_replace('512.123.4567 612.123.4567', '([[:digit:]]{3})\.([[:digit:]]{3})\.([[:digit:]]{4})', '(\1) \2-\3', 1, 0); +SELECT regexp_replace('number your street,'||CHR(10)||' zipcode town, FR', '( ){2,}', ' ', 1, 0); +SELECT regexp_replace('number your street, zipcode town, FR', '( ){2,}', ' ', 9); +SELECT regexp_replace('number your street, zipcode town, FR', '( ){2,}', ' ', 9, 0); +SELECT regexp_replace('number your street, zipcode town, FR', '( ){2,}', ' ', 9, 2); +SELECT regexp_replace('number your street, zipcode town, FR', '( ){2,}', ' ', 9, 2, 'm'); +SELECT regexp_replace('number your street, zipcode town, FR', '([EURT]){2,}', '[\1]', 9, 1, 'i'); +SELECT regexp_replace('number your street, zipcode town, FR', '([EURT]){2,}', '[\1]', 9, 2, 'i'); +SELECT regexp_replace ('A PostgreSQL function', 'A|e|i|o|u', 'X', 1, 2); +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 0, 'i'); +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 1, 'i'); +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 2, 'i'); +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 3, 'i'); +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 9, 'i'); +SELECT regexp_replace ('A PostgreSQL function', 'A|e|i|o|u', 'X', 1, 9); +-- Invalid parameter values +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', -1, 0, 'i'); +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, -1, 'i'); +-- Modifier 'g' should not be taken in account, we have an occurrence to replace +SELECT regexp_replace ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 1, 'g'); +-- Find groups of "word characters" (letters, numbers and underscore) ending with +-- "thy" in the string "healthy, wealthy, and wise" and replace them with nothing. +SELECT regexp_replace('healthy, wealthy, and wise','\w+thy', '', 'g'); +SELECT regexp_replace('healthy, wealthy, and wise','\w+thy', '', 1, 0); +SELECT regexp_replace('healthy, wealthy, and wise','\w+thy', '', 1, 0, 'g'); +-- Find groups of word characters ending with "thy" and replace with "something." +SELECT regexp_replace('healthy, wealthy, and wise','\w+thy', 'something', 'g'); +-- Find groups of word characters ending with "thy" and replace with the string +-- "something" starting at the third character in the string. +SELECT regexp_replace('healthy, wealthy, and wise','\w+thy', 'something', 3, 0); +-- Replace the second group of word characters ending with "thy" with "something." +SELECT regexp_replace('healthy, wealthy, and wise','\w+thy', 'something', 1, 2); +-- Find groups of word characters ending with "thy" capturing the letters before +-- the "thy", and replace with the captured letters plus the letters "ish." +SELECT regexp_replace('healthy, wealthy, and wise','(\w+)thy', '\1ish', 'g'); +SELECT regexp_replace('healthy, wealthy, and wise','(\w+)thy', '\1ish', 1, 0); + + -- change NULL-display back \pset null ''