diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index a5b6adc4bb..2bc9060e47 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -3108,6 +3108,80 @@ repeat('Pg', 4) PgPgPgPg + + + + regexp_count + + regexp_count ( string text, pattern text + [, start integer + [, flags text ] ] ) + integer + + + Returns the number of times the POSIX regular + expression pattern matches in + the string; see + . + + + regexp_count('123456789012', '\d\d\d', 2) + 3 + + + + + + + regexp_instr + + regexp_instr ( string text, pattern text + [, start integer + [, N integer + [, endoption integer + [, flags text + [, subexpr integer ] ] ] ] ] ) + integer + + + Returns the position within string where + the N'th match of the POSIX regular + expression pattern occurs, or zero if there is + no such match; see . + + + regexp_instr('ABCDEF', 'c(.)(..)', 1, 1, 0, 'i') + 3 + + + regexp_instr('ABCDEF', 'c(.)(..)', 1, 1, 0, 'i', 2) + 5 + + + + + + + regexp_like + + regexp_like ( string text, pattern text + [, start integer + [, flags text ] ] ) + boolean + + + Checks whether a match of the POSIX regular + expression pattern occurs + within string starting at + the start'th character; see + . + + + regexp_like('Hello World', 'world$', 1, 'i') + t + + + @@ -3117,8 +3191,9 @@ repeat('Pg', 4) PgPgPgPg text[] - Returns captured substrings resulting from the first match of a POSIX - regular expression to the string; see + Returns captured substrings resulting from the first match of the + POSIX regular expression pattern to + the string; see . @@ -3136,10 +3211,11 @@ repeat('Pg', 4) PgPgPgPg setof text[] - Returns captured substrings resulting from the first match of a - POSIX regular expression to the string, - or multiple matches if the g flag is used; - see . + Returns captured substrings resulting from the first match of the + POSIX regular expression pattern to + the string, or all matches if + the g flag is used; see + . regexp_matches('foobarbequebaz', 'ba.', 'g') @@ -3156,14 +3232,16 @@ repeat('Pg', 4) PgPgPgPg regexp_replace - regexp_replace ( string text, pattern text, replacement text [, flags text ] ) + regexp_replace ( string text, pattern text, replacement text + [, start integer ] + [, flags text ] ) text - Replaces substrings resulting from the first match of a - POSIX regular expression, or multiple substring matches - if the g flag is used; see . + Replaces the substring that is the first match to the POSIX + regular expression pattern, or all matches + if the g flag is used; see + . regexp_replace('Thomas', '.[mN]a.', 'M') @@ -3171,6 +3249,26 @@ repeat('Pg', 4) PgPgPgPg + + + regexp_replace ( string text, pattern text, replacement text, + start integer, + N integer + [, flags text ] ) + text + + + Replaces the substring that is the N'th + match to the POSIX regular expression pattern, + or all matches if N is zero; see + . + + + regexp_replace('Thomas', '.', 'X', 3, 2) + ThoXas + + + @@ -3213,6 +3311,35 @@ repeat('Pg', 4) PgPgPgPg + + + + regexp_substr + + regexp_substr ( string text, pattern text + [, start integer + [, N integer + [, flags text + [, subexpr integer ] ] ] ] ) + text + + + Returns the substring within string that + matches the N'th occurrence of the POSIX + regular expression pattern, + or NULL if there is no such match; see + . + + + regexp_substr('ABCDEF', 'c(.)(..)', 1, 1, 'i') + CDEF + + + regexp_substr('ABCDEF', 'c(.)(..)', 1, 1, 'i', 2) + EF + + + @@ -5377,6 +5504,15 @@ substring('foobar' similar '#"o_b#"%' escape '#') NULL substring + + regexp_count + + + regexp_instr + + + regexp_like + regexp_match @@ -5392,6 +5528,9 @@ substring('foobar' similar '#"o_b#"%' escape '#') NULL regexp_split_to_array + + regexp_substr + lists the available @@ -5542,6 +5681,114 @@ substring('foobar' from 'o(.)b') o + + The regexp_count function counts the number of + places where a POSIX regular expression pattern matches a string. + It has the syntax + regexp_count(string, + pattern + , start + , flags + ). + pattern is searched for + in string, normally from the beginning of + the string, but if the start parameter is + provided then beginning from that character index. + The flags parameter is an optional text + string containing zero or more single-letter flags that change the + function's behavior. For example, including i in + flags specifies case-insensitive matching. + Supported flags are described in + . + + + + Some examples: + +regexp_count('ABCABCAXYaxy', 'A.') 3 +regexp_count('ABCABCAXYaxy', 'A.', 1, 'i') 4 + + + + + The regexp_instr function returns the starting or + ending position of the N'th match of a + POSIX regular expression pattern to a string, or zero if there is no + such match. It has the syntax + regexp_instr(string, + pattern + , start + , N + , endoption + , flags + , subexpr + ). + pattern is searched for + in string, normally from the beginning of + the string, but if the start parameter is + provided then beginning from that character index. + If N is specified + then the N'th match of the pattern + is located, otherwise the first match is located. + If the endoption parameter is omitted or + specified as zero, the function returns the position of the first + character of the match. Otherwise, endoption + must be one, and the function returns the position of the character + following the match. + The flags parameter is an optional text + string containing zero or more single-letter flags that change the + function's behavior. Supported flags are described + in . + For a pattern containing parenthesized + subexpressions, subexpr is an integer + indicating which subexpression is of interest: the result identifies + the position of the substring matching that subexpression. + Subexpressions are numbered in the order of their leading parentheses. + When subexpr is omitted or zero, the result + identifies the position of the whole match regardless of + parenthesized subexpressions. + + + + Some examples: + +regexp_instr('number of your street, town zip, FR', '[^,]+', 1, 2) + 23 +regexp_instr('ABCDEFGHI', '(c..)(...)', 1, 1, 0, 'i', 2) + 6 + + + + + The regexp_like function checks whether a match + of a POSIX regular expression pattern occurs within a string, + returning boolean true or false. It has the syntax + regexp_like(string, + pattern + , start + , flags ). + pattern is searched for + in string, normally from the beginning of + the string, but if the start parameter is + provided then beginning from that character index. + The flags parameter is an optional text + string containing zero or more single-letter flags that change the + function's behavior. Supported flags are described + in . + This function has the same results as the ~ + operator if no flags are specified. If only the i + flag is specified, it has the same results as + the ~* operator. + + + + Some examples: + +regexp_like('Hello World', 'world') false +regexp_like('Hello World', 'world', 1, 'i') true + + + The regexp_match function returns a text array of captured substring(s) resulting from the first match of a POSIX @@ -5579,8 +5826,17 @@ SELECT regexp_match('foobarbequebaz', '(bar)(beque)'); {bar,beque} (1 row) - In the common case where you just want the whole matching substring - or NULL for no match, write something like + + + + + In the common case where you just want the whole matching substring + or NULL for no match, the best solution is to + use regexp_substr(). + However, regexp_substr() only exists + in PostgreSQL version 15 and up. When + working in older versions, you can extract the first element + of regexp_match()'s result, for example: SELECT (regexp_match('foobarbequebaz', 'bar.*que'))[1]; regexp_match @@ -5588,7 +5844,8 @@ SELECT (regexp_match('foobarbequebaz', 'bar.*que'))[1]; barbeque (1 row) - + + The regexp_matches function returns a set of text arrays @@ -5650,7 +5907,13 @@ SELECT col1, (SELECT regexp_matches(col2, '(bar)(beque)')) FROM tab; It has the syntax regexp_replace(source, pattern, replacement + , start + , N + , flags ). + (Notice that N cannot be specified + unless start is, + but flags can be given in any case.) The source string is returned unchanged if there is no match to the pattern. If there is a match, the source string is returned with the @@ -5663,11 +5926,22 @@ SELECT col1, (SELECT regexp_matches(col2, '(bar)(beque)')) FROM tab; substring matching the entire pattern should be inserted. Write \\ if you need to put a literal backslash in the replacement text. + pattern is searched for + in string, normally from the beginning of + the string, but if the start parameter is + provided then beginning from that character index. + By default, only the first match of the pattern is replaced. + If N is specified and is greater than zero, + then the N'th match of the pattern + is replaced. + If the g flag is given, or + if N is specified and is zero, then all + matches at or after the start position are + replaced. (The g flag is ignored + when N is specified.) The flags parameter is an optional text string containing zero or more single-letter flags that change the - function's behavior. Flag i specifies case-insensitive - matching, while flag g specifies replacement of each matching - substring rather than only the first one. Supported flags (though + function's behavior. Supported flags (though not g) are described in . @@ -5681,6 +5955,10 @@ regexp_replace('foobarbaz', 'b..', 'X', 'g') fooXX regexp_replace('foobarbaz', 'b(..)', 'X\1Y', 'g') fooXarYXazY +regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 0, 'i') + X PXstgrXSQL fXnctXXn +regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 3, 'i') + A PostgrXSQL function @@ -5712,7 +5990,6 @@ regexp_replace('foobarbaz', 'b(..)', 'X\1Y', 'g') Some examples: - SELECT foo FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', '\s+') AS foo; foo ------- @@ -5761,11 +6038,51 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; zero-length matches that occur at the start or end of the string or immediately after a previous match. This is contrary to the strict definition of regexp matching that is implemented by - regexp_match and - regexp_matches, but is usually the most convenient behavior + the other regexp functions, but is usually the most convenient behavior in practice. Other software systems such as Perl use similar definitions. + + The regexp_substr function returns the substring + that matches a POSIX regular expression pattern, + or NULL if there is no match. It has the syntax + regexp_substr(string, + pattern + , start + , N + , flags + , subexpr + ). + pattern is searched for + in string, normally from the beginning of + the string, but if the start parameter is + provided then beginning from that character index. + If N is specified + then the N'th match of the pattern + is returned, otherwise the first match is returned. + The flags parameter is an optional text + string containing zero or more single-letter flags that change the + function's behavior. Supported flags are described + in . + For a pattern containing parenthesized + subexpressions, subexpr is an integer + indicating which subexpression is of interest: the result is the + substring matching that subexpression. + Subexpressions are numbered in the order of their leading parentheses. + When subexpr is omitted or zero, the result + is the whole match regardless of parenthesized subexpressions. + + + + Some examples: + +regexp_substr('number of your street, town zip, FR', '[^,]+', 1, 2) + town zip +regexp_substr('ABCDEFGHI', '(c..)(...)', 1, 1, 'i', 2) + FGH + + + diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index a32c5c82ab..ede20a4d14 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -113,6 +113,7 @@ static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */ /* Local functions */ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *flags, + int start_search, Oid collation, bool use_subpatterns, bool ignore_degenerate, @@ -629,7 +630,7 @@ textregexreplace_noopt(PG_FUNCTION_ARGS) re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION()); - PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false)); + PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, 0, 1)); } /* @@ -646,11 +647,97 @@ textregexreplace(PG_FUNCTION_ARGS) regex_t *re; pg_re_flags flags; + /* + * regexp_replace() with four arguments will be preferentially resolved as + * this form when the fourth argument is of type UNKNOWN. However, the + * user might have intended to call textregexreplace_extended_no_n. If we + * see flags that look like an integer, emit the same error that + * parse_re_flags would, but add a HINT about how to fix it. + */ + if (VARSIZE_ANY_EXHDR(opt) > 0) + { + char *opt_p = VARDATA_ANY(opt); + + if (*opt_p >= '0' && *opt_p <= '9') + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid regular expression option: \"%.*s\"", + pg_mblen(opt_p), opt_p), + errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly."))); + } + parse_re_flags(&flags, opt); re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION()); - PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob)); + PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, 0, + flags.glob ? 0 : 1)); +} + +/* + * textregexreplace_extended() + * Return a string matched by a regular expression, with replacement. + * Extends textregexreplace by allowing a start position and the + * choice of the occurrence to replace (0 means all occurrences). + */ +Datum +textregexreplace_extended(PG_FUNCTION_ARGS) +{ + text *s = PG_GETARG_TEXT_PP(0); + text *p = PG_GETARG_TEXT_PP(1); + text *r = PG_GETARG_TEXT_PP(2); + int start = 1; + int n = 1; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5); + pg_re_flags re_flags; + regex_t *re; + + /* Collect optional parameters */ + if (PG_NARGS() > 3) + { + start = PG_GETARG_INT32(3); + if (start <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "start", start))); + } + if (PG_NARGS() > 4) + { + n = PG_GETARG_INT32(4); + if (n < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "n", n))); + } + + /* Determine options */ + parse_re_flags(&re_flags, flags); + + /* If N was not specified, deduce it from the 'g' flag */ + if (PG_NARGS() <= 4) + n = re_flags.glob ? 0 : 1; + + /* Compile the regular expression */ + re = RE_compile_and_cache(p, re_flags.cflags, PG_GET_COLLATION()); + + /* Do the replacement(s) */ + PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, start - 1, n)); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +textregexreplace_extended_no_n(PG_FUNCTION_ARGS) +{ + return textregexreplace_extended(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +textregexreplace_extended_no_flags(PG_FUNCTION_ARGS) +{ + return textregexreplace_extended(fcinfo); } /* @@ -958,6 +1045,255 @@ similar_escape(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(result); } +/* + * regexp_count() + * Return the number of matches of a pattern within a string. + */ +Datum +regexp_count(PG_FUNCTION_ARGS) +{ + text *str = PG_GETARG_TEXT_PP(0); + text *pattern = PG_GETARG_TEXT_PP(1); + int start = 1; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(3); + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + /* Collect optional parameters */ + if (PG_NARGS() > 2) + { + start = PG_GETARG_INT32(2); + if (start <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "start", start))); + } + + /* Determine options */ + parse_re_flags(&re_flags, flags); + /* User mustn't specify 'g' */ + if (re_flags.glob) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + /* translator: %s is a SQL function name */ + errmsg("%s does not support the \"global\" option", + "regexp_count()"))); + /* But we find all the matches anyway */ + re_flags.glob = true; + + /* Do the matching */ + matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1, + PG_GET_COLLATION(), + false, /* can ignore subexprs */ + false, false); + + PG_RETURN_INT32(matchctx->nmatches); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_count_no_start(PG_FUNCTION_ARGS) +{ + return regexp_count(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_count_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_count(fcinfo); +} + +/* + * regexp_instr() + * Return the match's position within the string + */ +Datum +regexp_instr(PG_FUNCTION_ARGS) +{ + text *str = PG_GETARG_TEXT_PP(0); + text *pattern = PG_GETARG_TEXT_PP(1); + int start = 1; + int n = 1; + int endoption = 0; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5); + int subexpr = 0; + int pos; + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + /* Collect optional parameters */ + if (PG_NARGS() > 2) + { + start = PG_GETARG_INT32(2); + if (start <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "start", start))); + } + if (PG_NARGS() > 3) + { + n = PG_GETARG_INT32(3); + if (n <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "n", n))); + } + if (PG_NARGS() > 4) + { + endoption = PG_GETARG_INT32(4); + if (endoption != 0 && endoption != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "endoption", endoption))); + } + if (PG_NARGS() > 6) + { + subexpr = PG_GETARG_INT32(6); + if (subexpr < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "subexpr", subexpr))); + } + + /* Determine options */ + parse_re_flags(&re_flags, flags); + /* User mustn't specify 'g' */ + if (re_flags.glob) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + /* translator: %s is a SQL function name */ + errmsg("%s does not support the \"global\" option", + "regexp_instr()"))); + /* But we find all the matches anyway */ + re_flags.glob = true; + + /* Do the matching */ + matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1, + PG_GET_COLLATION(), + (subexpr > 0), /* need submatches? */ + false, false); + + /* When n exceeds matches return 0 (includes case of no matches) */ + if (n > matchctx->nmatches) + PG_RETURN_INT32(0); + + /* When subexpr exceeds number of subexpressions return 0 */ + if (subexpr > matchctx->npatterns) + PG_RETURN_INT32(0); + + /* Select the appropriate match position to return */ + pos = (n - 1) * matchctx->npatterns; + if (subexpr > 0) + pos += subexpr - 1; + pos *= 2; + if (endoption == 1) + pos += 1; + + if (matchctx->match_locs[pos] >= 0) + PG_RETURN_INT32(matchctx->match_locs[pos] + 1); + else + PG_RETURN_INT32(0); /* position not identifiable */ +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_start(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_n(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_endoption(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_subexpr(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* + * regexp_like() + * Test for a pattern match within a string. + */ +Datum +regexp_like(PG_FUNCTION_ARGS) +{ + text *str = PG_GETARG_TEXT_PP(0); + text *pattern = PG_GETARG_TEXT_PP(1); + int start = 1; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(3); + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + /* Collect optional parameters */ + if (PG_NARGS() > 2) + { + start = PG_GETARG_INT32(2); + if (start <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "start", start))); + } + + /* Determine options */ + parse_re_flags(&re_flags, flags); + /* User mustn't specify 'g' */ + if (re_flags.glob) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + /* translator: %s is a SQL function name */ + errmsg("%s does not support the \"global\" option", + "regexp_like()"))); + + /* Do the matching */ + matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1, + PG_GET_COLLATION(), + false, /* can ignore subexprs */ + false, false); + + PG_RETURN_BOOL((matchctx->nmatches > 0) ? true : false); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_like_no_start(PG_FUNCTION_ARGS) +{ + return regexp_like(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_like_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_like(fcinfo); +} + /* * regexp_match() * Return the first substring(s) matching a pattern within a string. @@ -982,7 +1318,7 @@ regexp_match(PG_FUNCTION_ARGS) "regexp_match()"), errhint("Use the regexp_matches function instead."))); - matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, + matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, 0, PG_GET_COLLATION(), true, false, false); if (matchctx->nmatches == 0) @@ -1029,7 +1365,7 @@ regexp_matches(PG_FUNCTION_ARGS) /* be sure to copy the input string into the multi-call ctx */ matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern, - &re_flags, + &re_flags, 0, PG_GET_COLLATION(), true, false, false); @@ -1064,24 +1400,28 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS) } /* - * setup_regexp_matches --- do the initial matching for regexp_match - * and regexp_split functions + * setup_regexp_matches --- do the initial matching for regexp_match, + * regexp_split, and related functions * * To avoid having to re-find the compiled pattern on each call, we do * all the matching in one swoop. The returned regexp_matches_ctx contains * the locations of all the substrings matching the pattern. * - * The three bool parameters have only two patterns (one for matching, one for - * splitting) but it seems clearer to distinguish the functionality this way - * than to key it all off one "is_split" flag. We don't currently assume that - * fetching_unmatched is exclusive of fetching the matched text too; if it's - * set, the conversion buffer is large enough to fetch any single matched or - * unmatched string, but not any larger substring. (In practice, when splitting - * the matches are usually small anyway, and it didn't seem worth complicating - * the code further.) + * start_search: the character (not byte) offset in orig_str at which to + * begin the search. Returned positions are relative to orig_str anyway. + * use_subpatterns: collect data about matches to parenthesized subexpressions. + * ignore_degenerate: ignore zero-length matches. + * fetching_unmatched: caller wants to fetch unmatched substrings. + * + * We don't currently assume that fetching_unmatched is exclusive of fetching + * the matched text too; if it's set, the conversion buffer is large enough to + * fetch any single matched or unmatched string, but not any larger + * substring. (In practice, when splitting the matches are usually small + * anyway, and it didn't seem worth complicating the code further.) */ static regexp_matches_ctx * setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, + int start_search, Oid collation, bool use_subpatterns, bool ignore_degenerate, @@ -1099,7 +1439,6 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, int array_idx; int prev_match_end; int prev_valid_match_end; - int start_search; int maxlen = 0; /* largest fetch length in characters */ /* save original string --- we'll extract result substrings from it */ @@ -1142,7 +1481,6 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, /* search for the pattern, perhaps repeatedly */ prev_match_end = 0; prev_valid_match_end = 0; - start_search = 0; while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search, pmatch_len, pmatch)) { @@ -1367,7 +1705,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS) /* be sure to copy the input string into the multi-call ctx */ splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern, - &re_flags, + &re_flags, 0, PG_GET_COLLATION(), false, true, true); @@ -1422,7 +1760,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS) splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0), PG_GETARG_TEXT_PP(1), - &re_flags, + &re_flags, 0, PG_GET_COLLATION(), false, true, true); @@ -1489,6 +1827,125 @@ build_regexp_split_result(regexp_matches_ctx *splitctx) } } +/* + * regexp_substr() + * Return the substring that matches a regular expression pattern + */ +Datum +regexp_substr(PG_FUNCTION_ARGS) +{ + text *str = PG_GETARG_TEXT_PP(0); + text *pattern = PG_GETARG_TEXT_PP(1); + int start = 1; + int n = 1; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(4); + int subexpr = 0; + int so, + eo, + pos; + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + /* Collect optional parameters */ + if (PG_NARGS() > 2) + { + start = PG_GETARG_INT32(2); + if (start <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "start", start))); + } + if (PG_NARGS() > 3) + { + n = PG_GETARG_INT32(3); + if (n <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "n", n))); + } + if (PG_NARGS() > 5) + { + subexpr = PG_GETARG_INT32(5); + if (subexpr < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "subexpr", subexpr))); + } + + /* Determine options */ + parse_re_flags(&re_flags, flags); + /* User mustn't specify 'g' */ + if (re_flags.glob) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + /* translator: %s is a SQL function name */ + errmsg("%s does not support the \"global\" option", + "regexp_substr()"))); + /* But we find all the matches anyway */ + re_flags.glob = true; + + /* Do the matching */ + matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1, + PG_GET_COLLATION(), + (subexpr > 0), /* need submatches? */ + false, false); + + /* When n exceeds matches return NULL (includes case of no matches) */ + if (n > matchctx->nmatches) + PG_RETURN_NULL(); + + /* When subexpr exceeds number of subexpressions return NULL */ + if (subexpr > matchctx->npatterns) + PG_RETURN_NULL(); + + /* Select the appropriate match position to return */ + pos = (n - 1) * matchctx->npatterns; + if (subexpr > 0) + pos += subexpr - 1; + pos *= 2; + so = matchctx->match_locs[pos]; + eo = matchctx->match_locs[pos + 1]; + + if (so < 0 || eo < 0) + PG_RETURN_NULL(); /* unidentifiable location */ + + PG_RETURN_DATUM(DirectFunctionCall3(text_substr, + PointerGetDatum(matchctx->orig_str), + Int32GetDatum(so + 1), + Int32GetDatum(eo - so))); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_substr_no_start(PG_FUNCTION_ARGS) +{ + return regexp_substr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_substr_no_n(PG_FUNCTION_ARGS) +{ + return regexp_substr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_substr_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_substr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_substr_no_subexpr(PG_FUNCTION_ARGS) +{ + return regexp_substr(fcinfo); +} + /* * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp * diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index d2a11b1b5d..a0bde4e352 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -4496,23 +4496,28 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text, /* * replace_text_regexp * - * replace text that matches to regexp in src_text to replace_text. + * replace text that matches to regexp in src_text with replace_text. + * + * search_start: the character (not byte) offset in src_text at which to + * begin searching. + * n: if 0, replace all matches; if > 0, replace only the N'th match. * * Note: to avoid having to include regex.h in builtins.h, we declare * the regexp argument as void *, but really it's regex_t *. */ text * replace_text_regexp(text *src_text, void *regexp, - text *replace_text, bool glob) + text *replace_text, + int search_start, int n) { text *ret_text; regex_t *re = (regex_t *) regexp; int src_text_len = VARSIZE_ANY_EXHDR(src_text); + int nmatches = 0; StringInfoData buf; regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT]; pg_wchar *data; size_t data_len; - int search_start; int data_pos; char *start_ptr; bool have_escape; @@ -4530,7 +4535,6 @@ replace_text_regexp(text *src_text, void *regexp, start_ptr = (char *) VARDATA_ANY(src_text); data_pos = 0; - search_start = 0; while (search_start <= data_len) { int regexec_result; @@ -4560,6 +4564,23 @@ replace_text_regexp(text *src_text, void *regexp, errmsg("regular expression failed: %s", errMsg))); } + /* + * Count matches, and decide whether to replace this match. + */ + nmatches++; + if (n > 0 && nmatches != n) + { + /* + * No, so advance search_start, but not start_ptr/data_pos. (Thus, + * we treat the matched text as if it weren't matched, and copy it + * to the output later.) + */ + search_start = pmatch[0].rm_eo; + if (pmatch[0].rm_so == pmatch[0].rm_eo) + search_start++; + continue; + } + /* * Copy the text to the left of the match position. Note we are given * character not byte indexes. @@ -4596,9 +4617,9 @@ replace_text_regexp(text *src_text, void *regexp, data_pos = pmatch[0].rm_eo; /* - * When global option is off, replace the first instance only. + * If we only want to replace one occurrence, we're done. */ - if (!glob) + if (n > 0) break; /* diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 8cd0252082..32e5d25714 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3565,6 +3565,18 @@ { oid => '2285', descr => 'replace text using regexp', proname => 'regexp_replace', prorettype => 'text', proargtypes => 'text text text text', prosrc => 'textregexreplace' }, +{ oid => '9611', descr => 'replace text using regexp', + proname => 'regexp_replace', prorettype => 'text', + proargtypes => 'text text text int4 int4 text', + prosrc => 'textregexreplace_extended' }, +{ oid => '9612', descr => 'replace text using regexp', + proname => 'regexp_replace', prorettype => 'text', + proargtypes => 'text text text int4 int4', + prosrc => 'textregexreplace_extended_no_flags' }, +{ oid => '9613', descr => 'replace text using regexp', + proname => 'regexp_replace', prorettype => 'text', + proargtypes => 'text text text int4', + prosrc => 'textregexreplace_extended_no_n' }, { oid => '3396', descr => 'find first match for regexp', proname => 'regexp_match', prorettype => '_text', proargtypes => 'text text', prosrc => 'regexp_match_no_flags' }, @@ -3579,6 +3591,61 @@ proname => 'regexp_matches', prorows => '10', proretset => 't', prorettype => '_text', proargtypes => 'text text text', prosrc => 'regexp_matches' }, +{ oid => '9614', descr => 'count regexp matches', + proname => 'regexp_count', prorettype => 'int4', proargtypes => 'text text', + prosrc => 'regexp_count_no_start' }, +{ oid => '9615', descr => 'count regexp matches', + proname => 'regexp_count', prorettype => 'int4', + proargtypes => 'text text int4', prosrc => 'regexp_count_no_flags' }, +{ oid => '9616', descr => 'count regexp matches', + proname => 'regexp_count', prorettype => 'int4', + proargtypes => 'text text int4 text', prosrc => 'regexp_count' }, +{ oid => '9617', descr => 'position of regexp match', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text', + prosrc => 'regexp_instr_no_start' }, +{ oid => '9618', descr => 'position of regexp match', + proname => 'regexp_instr', prorettype => 'int4', + proargtypes => 'text text int4', prosrc => 'regexp_instr_no_n' }, +{ oid => '9619', descr => 'position of regexp match', + proname => 'regexp_instr', prorettype => 'int4', + proargtypes => 'text text int4 int4', prosrc => 'regexp_instr_no_endoption' }, +{ oid => '9620', descr => 'position of regexp match', + proname => 'regexp_instr', prorettype => 'int4', + proargtypes => 'text text int4 int4 int4', + prosrc => 'regexp_instr_no_flags' }, +{ oid => '9621', descr => 'position of regexp match', + proname => 'regexp_instr', prorettype => 'int4', + proargtypes => 'text text int4 int4 int4 text', + prosrc => 'regexp_instr_no_subexpr' }, +{ oid => '9622', descr => 'position of regexp match', + proname => 'regexp_instr', prorettype => 'int4', + proargtypes => 'text text int4 int4 int4 text int4', + prosrc => 'regexp_instr' }, +{ oid => '9623', descr => 'test for regexp match', + proname => 'regexp_like', prorettype => 'bool', proargtypes => 'text text', + prosrc => 'regexp_like_no_start' }, +{ oid => '9624', descr => 'test for regexp match', + proname => 'regexp_like', prorettype => 'bool', + proargtypes => 'text text int4', prosrc => 'regexp_like_no_flags' }, +{ oid => '9630', descr => 'test for regexp match', + proname => 'regexp_like', prorettype => 'bool', + proargtypes => 'text text int4 text', prosrc => 'regexp_like' }, +{ oid => '9625', descr => 'extract substring that matches regexp', + proname => 'regexp_substr', prorettype => 'text', proargtypes => 'text text', + prosrc => 'regexp_substr_no_start' }, +{ oid => '9626', descr => 'extract substring that matches regexp', + proname => 'regexp_substr', prorettype => 'text', + proargtypes => 'text text int4', prosrc => 'regexp_substr_no_n' }, +{ oid => '9627', descr => 'extract substring that matches regexp', + proname => 'regexp_substr', prorettype => 'text', + proargtypes => 'text text int4 int4', prosrc => 'regexp_substr_no_flags' }, +{ oid => '9628', descr => 'extract substring that matches regexp', + proname => 'regexp_substr', prorettype => 'text', + proargtypes => 'text text int4 int4 text', + prosrc => 'regexp_substr_no_subexpr' }, +{ oid => '9629', descr => 'extract substring that matches regexp', + proname => 'regexp_substr', prorettype => 'text', + proargtypes => 'text text int4 int4 text int4', prosrc => 'regexp_substr' }, { oid => '2088', descr => 'split string by field_sep and return field_num', proname => 'split_part', prorettype => 'text', proargtypes => 'text text int4', prosrc => 'split_part' }, diff --git a/src/include/utils/varlena.h b/src/include/utils/varlena.h index 5c39723332..6645e2af13 100644 --- a/src/include/utils/varlena.h +++ b/src/include/utils/varlena.h @@ -34,6 +34,7 @@ extern bool SplitDirectoriesString(char *rawstring, char separator, extern bool SplitGUCList(char *rawstring, char separator, List **namelist); extern text *replace_text_regexp(text *src_text, void *regexp, - text *replace_text, bool glob); + text *replace_text, + int search_start, int n); #endif diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 91aa819804..2359105348 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -515,6 +515,13 @@ SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; cde (1 row) +-- Check case where we have a match, but not a subexpression match +SELECT SUBSTRING('foo' FROM 'foo(bar)?') IS NULL AS t; + t +--- + t +(1 row) + -- Check behavior of SIMILAR TO, which uses largely the same regexp variant SELECT 'abcdefg' SIMILAR TO '_bcd%' AS true; true @@ -592,6 +599,378 @@ SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi'); -- invalid regexp option SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); ERROR: invalid regular expression option: "z" +-- extended regexp_replace tests +SELECT regexp_replace('A PostgreSQL function', 'A|e|i|o|u', 'X', 1); + regexp_replace +----------------------- + X PostgreSQL function +(1 row) + +SELECT regexp_replace('A PostgreSQL function', 'A|e|i|o|u', 'X', 1, 2); + regexp_replace +----------------------- + A PXstgreSQL function +(1 row) + +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 0, 'i'); + regexp_replace +----------------------- + X PXstgrXSQL fXnctXXn +(1 row) + +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 1, 'i'); + regexp_replace +----------------------- + X PostgreSQL function +(1 row) + +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 2, 'i'); + regexp_replace +----------------------- + A PXstgreSQL function +(1 row) + +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 3, 'i'); + regexp_replace +----------------------- + A PostgrXSQL function +(1 row) + +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 9, 'i'); + regexp_replace +----------------------- + A PostgreSQL function +(1 row) + +SELECT regexp_replace('A PostgreSQL function', 'A|e|i|o|u', 'X', 7, 0, 'i'); + regexp_replace +----------------------- + A PostgrXSQL fXnctXXn +(1 row) + +-- 'g' flag should be ignored when N is specified +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 1, 'g'); + regexp_replace +----------------------- + A PXstgreSQL function +(1 row) + +-- errors +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', -1, 0, 'i'); +ERROR: invalid value for parameter "start": -1 +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, -1, 'i'); +ERROR: invalid value for parameter "n": -1 +-- erroneous invocation of non-extended form +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', '1'); +ERROR: invalid regular expression option: "1" +HINT: If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly. +-- regexp_count tests +SELECT regexp_count('123123123123123', '(12)3'); + regexp_count +-------------- + 5 +(1 row) + +SELECT regexp_count('123123123123', '123', 1); + regexp_count +-------------- + 4 +(1 row) + +SELECT regexp_count('123123123123', '123', 3); + regexp_count +-------------- + 3 +(1 row) + +SELECT regexp_count('123123123123', '123', 33); + regexp_count +-------------- + 0 +(1 row) + +SELECT regexp_count('ABCABCABCABC', 'Abc', 1, ''); + regexp_count +-------------- + 0 +(1 row) + +SELECT regexp_count('ABCABCABCABC', 'Abc', 1, 'i'); + regexp_count +-------------- + 4 +(1 row) + +-- errors +SELECT regexp_count('123123123123', '123', 0); +ERROR: invalid value for parameter "start": 0 +SELECT regexp_count('123123123123', '123', -3); +ERROR: invalid value for parameter "start": -3 +-- regexp_like tests +SELECT regexp_like('Steven', '^Ste(v|ph)en$'); + regexp_like +------------- + t +(1 row) + +SELECT regexp_like('a'||CHR(10)||'d', 'a.d', 1, 'n'); + regexp_like +------------- + f +(1 row) + +SELECT regexp_like('a'||CHR(10)||'d', 'a.d', 1, 's'); + regexp_like +------------- + t +(1 row) + +SELECT regexp_like('abc', ' a . c ', 1, 'x'); + regexp_like +------------- + t +(1 row) + +SELECT regexp_like('abc', 'a.c', 2); + regexp_like +------------- + f +(1 row) + +SELECT regexp_like('abc', 'a.c', 0); -- error +ERROR: invalid value for parameter "start": 0 +SELECT regexp_like('abc', 'a.c', 1, 'g'); -- error +ERROR: regexp_like() does not support the "global" option +-- regexp_instr tests +SELECT regexp_instr('abcdefghi', 'd.f'); + regexp_instr +-------------- + 4 +(1 row) + +SELECT regexp_instr('abcdefghi', 'd.q'); + regexp_instr +-------------- + 0 +(1 row) + +SELECT regexp_instr('abcabcabc', 'a.c'); + regexp_instr +-------------- + 1 +(1 row) + +SELECT regexp_instr('abcabcabc', 'a.c', 2); + regexp_instr +-------------- + 4 +(1 row) + +SELECT regexp_instr('abcabcabc', 'a.c', 1, 3); + regexp_instr +-------------- + 7 +(1 row) + +SELECT regexp_instr('abcabcabc', 'a.c', 1, 4); + regexp_instr +-------------- + 0 +(1 row) + +SELECT regexp_instr('abcabcabc', 'A.C', 1, 2, 0, 'i'); + regexp_instr +-------------- + 4 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 0); + regexp_instr +-------------- + 1 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 1); + regexp_instr +-------------- + 1 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 2); + regexp_instr +-------------- + 4 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 3); + regexp_instr +-------------- + 5 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 4); + regexp_instr +-------------- + 7 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 5); + regexp_instr +-------------- + 0 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 0); + regexp_instr +-------------- + 9 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 1); + regexp_instr +-------------- + 4 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 2); + regexp_instr +-------------- + 9 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 3); + regexp_instr +-------------- + 7 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 4); + regexp_instr +-------------- + 9 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 5); + regexp_instr +-------------- + 0 +(1 row) + +-- Check case where we have a match, but not a subexpression match +SELECT regexp_instr('foo', 'foo(bar)?', 1, 1, 0, '', 1); + regexp_instr +-------------- + 0 +(1 row) + +-- errors +SELECT regexp_instr('abcabcabc', 'a.c', 0, 1); +ERROR: invalid value for parameter "start": 0 +SELECT regexp_instr('abcabcabc', 'a.c', 1, 0); +ERROR: invalid value for parameter "n": 0 +SELECT regexp_instr('abcabcabc', 'a.c', 1, 1, -1); +ERROR: invalid value for parameter "endoption": -1 +SELECT regexp_instr('abcabcabc', 'a.c', 1, 1, 2); +ERROR: invalid value for parameter "endoption": 2 +SELECT regexp_instr('abcabcabc', 'a.c', 1, 1, 0, 'g'); +ERROR: regexp_instr() does not support the "global" option +SELECT regexp_instr('abcabcabc', 'a.c', 1, 1, 0, '', -1); +ERROR: invalid value for parameter "subexpr": -1 +-- regexp_substr tests +SELECT regexp_substr('abcdefghi', 'd.f'); + regexp_substr +--------------- + def +(1 row) + +SELECT regexp_substr('abcdefghi', 'd.q') IS NULL AS t; + t +--- + t +(1 row) + +SELECT regexp_substr('abcabcabc', 'a.c'); + regexp_substr +--------------- + abc +(1 row) + +SELECT regexp_substr('abcabcabc', 'a.c', 2); + regexp_substr +--------------- + abc +(1 row) + +SELECT regexp_substr('abcabcabc', 'a.c', 1, 3); + regexp_substr +--------------- + abc +(1 row) + +SELECT regexp_substr('abcabcabc', 'a.c', 1, 4) IS NULL AS t; + t +--- + t +(1 row) + +SELECT regexp_substr('abcabcabc', 'A.C', 1, 2, 'i'); + regexp_substr +--------------- + abc +(1 row) + +SELECT regexp_substr('1234567890', '(123)(4(56)(78))', 1, 1, 'i', 0); + regexp_substr +--------------- + 12345678 +(1 row) + +SELECT regexp_substr('1234567890', '(123)(4(56)(78))', 1, 1, 'i', 1); + regexp_substr +--------------- + 123 +(1 row) + +SELECT regexp_substr('1234567890', '(123)(4(56)(78))', 1, 1, 'i', 2); + regexp_substr +--------------- + 45678 +(1 row) + +SELECT regexp_substr('1234567890', '(123)(4(56)(78))', 1, 1, 'i', 3); + regexp_substr +--------------- + 56 +(1 row) + +SELECT regexp_substr('1234567890', '(123)(4(56)(78))', 1, 1, 'i', 4); + regexp_substr +--------------- + 78 +(1 row) + +SELECT regexp_substr('1234567890', '(123)(4(56)(78))', 1, 1, 'i', 5) IS NULL AS t; + t +--- + t +(1 row) + +-- Check case where we have a match, but not a subexpression match +SELECT regexp_substr('foo', 'foo(bar)?', 1, 1, '', 1) IS NULL AS t; + t +--- + t +(1 row) + +-- errors +SELECT regexp_substr('abcabcabc', 'a.c', 0, 1); +ERROR: invalid value for parameter "start": 0 +SELECT regexp_substr('abcabcabc', 'a.c', 1, 0); +ERROR: invalid value for parameter "n": 0 +SELECT regexp_substr('abcabcabc', 'a.c', 1, 1, 'g'); +ERROR: regexp_substr() does not support the "global" option +SELECT regexp_substr('abcabcabc', 'a.c', 1, 1, '', -1); +ERROR: invalid value for parameter "subexpr": -1 -- set so we can tell NULL from empty string \pset null '\\N' -- return all matches from regexp diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 2c502534c2..92837fdd14 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -171,6 +171,8 @@ SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; -- With a parenthesized subexpression, return only what matches the subexpr SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; +-- Check case where we have a match, but not a subexpression match +SELECT SUBSTRING('foo' FROM 'foo(bar)?') IS NULL AS t; -- Check behavior of SIMILAR TO, which uses largely the same regexp variant SELECT 'abcdefg' SIMILAR TO '_bcd%' AS true; @@ -193,6 +195,95 @@ SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi'); -- invalid regexp option SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); +-- extended regexp_replace tests +SELECT regexp_replace('A PostgreSQL function', 'A|e|i|o|u', 'X', 1); +SELECT regexp_replace('A PostgreSQL function', 'A|e|i|o|u', 'X', 1, 2); +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 0, 'i'); +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 1, 'i'); +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 2, 'i'); +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 3, 'i'); +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 9, 'i'); +SELECT regexp_replace('A PostgreSQL function', 'A|e|i|o|u', 'X', 7, 0, 'i'); +-- 'g' flag should be ignored when N is specified +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 1, 'g'); +-- errors +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', -1, 0, 'i'); +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, -1, 'i'); +-- erroneous invocation of non-extended form +SELECT regexp_replace('A PostgreSQL function', 'a|e|i|o|u', 'X', '1'); + +-- regexp_count tests +SELECT regexp_count('123123123123123', '(12)3'); +SELECT regexp_count('123123123123', '123', 1); +SELECT regexp_count('123123123123', '123', 3); +SELECT regexp_count('123123123123', '123', 33); +SELECT regexp_count('ABCABCABCABC', 'Abc', 1, ''); +SELECT regexp_count('ABCABCABCABC', 'Abc', 1, 'i'); +-- errors +SELECT regexp_count('123123123123', '123', 0); +SELECT regexp_count('123123123123', '123', -3); + +-- regexp_like tests +SELECT regexp_like('Steven', '^Ste(v|ph)en$'); +SELECT regexp_like('a'||CHR(10)||'d', 'a.d', 1, 'n'); +SELECT regexp_like('a'||CHR(10)||'d', 'a.d', 1, 's'); +SELECT regexp_like('abc', ' a . c ', 1, 'x'); +SELECT regexp_like('abc', 'a.c', 2); +SELECT regexp_like('abc', 'a.c', 0); -- error +SELECT regexp_like('abc', 'a.c', 1, 'g'); -- error + +-- regexp_instr tests +SELECT regexp_instr('abcdefghi', 'd.f'); +SELECT regexp_instr('abcdefghi', 'd.q'); +SELECT regexp_instr('abcabcabc', 'a.c'); +SELECT regexp_instr('abcabcabc', 'a.c', 2); +SELECT regexp_instr('abcabcabc', 'a.c', 1, 3); +SELECT regexp_instr('abcabcabc', 'a.c', 1, 4); +SELECT regexp_instr('abcabcabc', 'A.C', 1, 2, 0, 'i'); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 0); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 1); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 2); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 3); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 4); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 5); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 0); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 1); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 2); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 3); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 4); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 5); +-- Check case where we have a match, but not a subexpression match +SELECT regexp_instr('foo', 'foo(bar)?', 1, 1, 0, '', 1); +-- errors +SELECT regexp_instr('abcabcabc', 'a.c', 0, 1); +SELECT regexp_instr('abcabcabc', 'a.c', 1, 0); +SELECT regexp_instr('abcabcabc', 'a.c', 1, 1, -1); +SELECT regexp_instr('abcabcabc', 'a.c', 1, 1, 2); +SELECT regexp_instr('abcabcabc', 'a.c', 1, 1, 0, 'g'); +SELECT regexp_instr('abcabcabc', 'a.c', 1, 1, 0, '', -1); + +-- regexp_substr tests +SELECT regexp_substr('abcdefghi', 'd.f'); +SELECT regexp_substr('abcdefghi', 'd.q') IS NULL AS t; +SELECT regexp_substr('abcabcabc', 'a.c'); +SELECT regexp_substr('abcabcabc', 'a.c', 2); +SELECT regexp_substr('abcabcabc', 'a.c', 1, 3); +SELECT regexp_substr('abcabcabc', 'a.c', 1, 4) IS NULL AS t; +SELECT regexp_substr('abcabcabc', 'A.C', 1, 2, 'i'); +SELECT regexp_substr('1234567890', '(123)(4(56)(78))', 1, 1, 'i', 0); +SELECT regexp_substr('1234567890', '(123)(4(56)(78))', 1, 1, 'i', 1); +SELECT regexp_substr('1234567890', '(123)(4(56)(78))', 1, 1, 'i', 2); +SELECT regexp_substr('1234567890', '(123)(4(56)(78))', 1, 1, 'i', 3); +SELECT regexp_substr('1234567890', '(123)(4(56)(78))', 1, 1, 'i', 4); +SELECT regexp_substr('1234567890', '(123)(4(56)(78))', 1, 1, 'i', 5) IS NULL AS t; +-- Check case where we have a match, but not a subexpression match +SELECT regexp_substr('foo', 'foo(bar)?', 1, 1, '', 1) IS NULL AS t; +-- errors +SELECT regexp_substr('abcabcabc', 'a.c', 0, 1); +SELECT regexp_substr('abcabcabc', 'a.c', 1, 0); +SELECT regexp_substr('abcabcabc', 'a.c', 1, 1, 'g'); +SELECT regexp_substr('abcabcabc', 'a.c', 1, 1, '', -1); + -- set so we can tell NULL from empty string \pset null '\\N'