src/backend/parser/gram.y | 5 +- src/backend/parser/parser.c | 279 ++++++++++++++++++- src/backend/parser/scan.l | 451 +++++++------------------------ src/fe_utils/psqlscan.l | 156 +++++------ src/include/mb/pg_wchar.h | 21 ++ src/include/parser/kwlist.h | 1 + src/include/parser/scanner.h | 2 +- src/interfaces/ecpg/preproc/ecpg.tokens | 1 - src/interfaces/ecpg/preproc/ecpg.trailer | 37 +-- src/interfaces/ecpg/preproc/ecpg.type | 6 +- src/interfaces/ecpg/preproc/parse.pl | 4 +- src/interfaces/ecpg/preproc/parser.c | 114 ++++++-- src/interfaces/ecpg/preproc/pgc.l | 269 +++++++++--------- src/pl/plpgsql/src/pl_gram.y | 2 +- src/test/regress/expected/strings.out | 12 +- src/test/regress/sql/strings.sql | 1 + 16 files changed, 707 insertions(+), 654 deletions(-) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index c5086846de..1f10340484 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -601,7 +601,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); * DOT_DOT is unused in the core SQL grammar, and so will always provoke * parse errors. It is needed by PL/pgSQL. 
*/ -%token IDENT FCONST SCONST BCONST XCONST Op +%token IDENT UIDENT FCONST SCONST UCONST BCONST XCONST Op %token ICONST PARAM %token TYPECAST DOT_DOT COLON_EQUALS EQUALS_GREATER %token LESS_EQUALS GREATER_EQUALS NOT_EQUALS @@ -691,7 +691,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); TREAT TRIGGER TRIM TRUE_P TRUNCATE TRUSTED TYPE_P TYPES_P - UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNLOGGED + UESCAPE UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNLOGGED UNTIL UPDATE USER USING VACUUM VALID VALIDATE VALIDATOR VALUE_P VALUES VARCHAR VARIADIC VARYING @@ -15374,6 +15374,7 @@ unreserved_keyword: | TRUSTED | TYPE_P | TYPES_P + | UESCAPE | UNBOUNDED | UNCOMMITTED | UNENCRYPTED diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c index 4c0c258cd7..6d4a9721ac 100644 --- a/src/backend/parser/parser.c +++ b/src/backend/parser/parser.c @@ -23,6 +23,12 @@ #include "parser/gramparse.h" #include "parser/parser.h" +#include "parser/scansup.h" +#include "mb/pg_wchar.h" + +static bool check_uescapechar(unsigned char escape); +static char *str_udeescape(char escape, char *str, int position, + core_yyscan_t yyscanner); /* @@ -75,6 +81,10 @@ raw_parser(const char *str) * scanner backtrack, which would cost more performance than this filter * layer does. * + * We also use this filter to convert UIDENT and UCONST sequences into + * plain IDENT and SCONST tokens. While that could be handled by additional + * productions in the main grammar, it's more efficient to do it like this. + * * The filter also provides a convenient place to translate between * the core_YYSTYPE and YYSTYPE representations (which are really the * same thing anyway, but notationally they're different). @@ -104,7 +114,7 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) * If this token isn't one that requires lookahead, just return it. If it * does, determine the token length. 
(We could get that via strlen(), but * since we have such a small set of possibilities, hardwiring seems - * feasible and more efficient.) + * feasible and more efficient --- at least for the fixed-length cases.) */ switch (cur_token) { @@ -117,6 +127,10 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) case WITH: cur_token_length = 4; break; + case UIDENT: + case UCONST: + cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp); + break; default: return cur_token; } @@ -190,7 +204,270 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) break; } break; + + case UIDENT: + case UCONST: + /* Look ahead for UESCAPE */ + if (next_token == UESCAPE) + { + /* Yup, so get third token, which had better be SCONST */ + const char *escstr; + + /* Again save and restore *llocp */ + cur_yylloc = *llocp; + + /* Un-truncate current token so errors point to third token */ + *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; + + /* Get third token */ + next_token = core_yylex(&(yyextra->lookahead_yylval), + llocp, yyscanner); + + /* If we throw error here, it will point to third token */ + if (next_token != SCONST) + scanner_yyerror("UESCAPE must be followed by a simple string literal", + yyscanner); + + escstr = yyextra->lookahead_yylval.str; + if (strlen(escstr) != 1 || !check_uescapechar(escstr[0])) + scanner_yyerror("invalid Unicode escape character", + yyscanner); + + /* Now restore *llocp; errors will point to first token */ + *llocp = cur_yylloc; + + /* Apply Unicode conversion */ + lvalp->core_yystype.str = + str_udeescape(escstr[0], + lvalp->core_yystype.str, + *llocp, + yyscanner); + + /* + * We don't need to revert the un-truncation of UESCAPE. What we + * do want to do is clear have_lookahead, thereby consuming + * all three tokens. 
+ */ + yyextra->have_lookahead = false; + } + else + { + /* No UESCAPE, so convert using default escape character */ + lvalp->core_yystype.str = + str_udeescape('\\', + lvalp->core_yystype.str, + *llocp, + yyscanner); + } + + if (cur_token == UIDENT) + { + /* It's an identifier, so truncate as appropriate */ + truncate_identifier(lvalp->core_yystype.str, + strlen(lvalp->core_yystype.str), + true); + cur_token = IDENT; + } + else if (cur_token == UCONST) + { + cur_token = SCONST; + } + break; } return cur_token; } + +/* convert hex digit (caller should have verified that) to value */ +static unsigned int +hexval(unsigned char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 0xA; + if (c >= 'A' && c <= 'F') + return c - 'A' + 0xA; + elog(ERROR, "invalid hexadecimal digit"); + return 0; /* not reached */ +} + +/* is Unicode code point acceptable in database's encoding? */ +static void +check_unicode_value(pg_wchar c, int pos, core_yyscan_t yyscanner) +{ + /* See also addunicode() in scan.l */ + if (c == 0 || c > 0x10FFFF) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode escape value"), + scanner_errposition(pos, yyscanner))); + + if (c > 0x7F && GetDatabaseEncoding() != PG_UTF8) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"), + scanner_errposition(pos, yyscanner))); +} + +/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? 
*/ +static bool +check_uescapechar(unsigned char escape) +{ + if (isxdigit(escape) + || escape == '+' + || escape == '\'' + || escape == '"' + || scanner_isspace(escape)) + return false; + else + return true; +} + +/* Process Unicode escapes in "str", producing a palloc'd plain string */ +static char * +str_udeescape(char escape, char *str, int position, + core_yyscan_t yyscanner) +{ + char *new, + *in, + *out; + int str_length; + pg_wchar pair_first = 0; + + str_length = strlen(str); + + /* + * This relies on the subtle assumption that a UTF-8 expansion cannot be + * longer than its escaped representation. + */ + new = palloc(str_length + 1); + + in = str; + out = new; + while (*in) + { + if (in[0] == escape) + { + if (in[1] == escape) + { + if (pair_first) + goto invalid_pair; + *out++ = escape; + in += 2; + } + else if (isxdigit((unsigned char) in[1]) && + isxdigit((unsigned char) in[2]) && + isxdigit((unsigned char) in[3]) && + isxdigit((unsigned char) in[4])) + { + pg_wchar unicode; + + unicode = (hexval(in[1]) << 12) + + (hexval(in[2]) << 8) + + (hexval(in[3]) << 4) + + hexval(in[4]); + check_unicode_value(unicode, + position + in - str + 3, /* 3 for U&" */ + yyscanner); + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + goto invalid_pair; + } + else if (is_utf16_surrogate_second(unicode)) + goto invalid_pair; + + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + unicode_to_utf8(unicode, (unsigned char *) out); + out += pg_mblen(out); + } + in += 5; + } + else if (in[1] == '+' && + isxdigit((unsigned char) in[2]) && + isxdigit((unsigned char) in[3]) && + isxdigit((unsigned char) in[4]) && + isxdigit((unsigned char) in[5]) && + isxdigit((unsigned char) in[6]) && + isxdigit((unsigned char) in[7])) + { + pg_wchar unicode; + + unicode = (hexval(in[2]) << 20) + + (hexval(in[3]) << 16) + + (hexval(in[4]) << 12) + + (hexval(in[5]) 
<< 8) + + (hexval(in[6]) << 4) + + hexval(in[7]); + check_unicode_value(unicode, + position + in - str + 3, /* 3 for U&" */ + yyscanner); + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + goto invalid_pair; + } + else if (is_utf16_surrogate_second(unicode)) + goto invalid_pair; + + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + unicode_to_utf8(unicode, (unsigned char *) out); + out += pg_mblen(out); + } + in += 8; + } + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode escape value"), + scanner_errposition(position + in - str + 3, /* 3 for U&" */ + yyscanner))); + } + else + { + if (pair_first) + goto invalid_pair; + + *out++ = *in++; + } + } + + /* unfinished surrogate pair? */ + if (pair_first) + goto invalid_pair; + + *out = '\0'; + + /* + * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII + * codes; but it's probably not worth the trouble, since this isn't likely + * to be a performance-critical path. 
+ */ + pg_verifymbstr(new, out - new, false); + return new; + +invalid_pair: + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode surrogate pair"), + scanner_errposition(position + in - str + 3, /* 3 for U&" */ + yyscanner))); + return NULL; /* keep compiler quiet */ +} diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index e1cae859e8..856f4bac3a 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -110,14 +110,9 @@ const uint16 ScanKeywordTokens[] = { static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner); static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner); static char *litbufdup(core_yyscan_t yyscanner); -static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner); static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner); static int process_integer_literal(const char *token, YYSTYPE *lval); -static bool is_utf16_surrogate_first(pg_wchar c); -static bool is_utf16_surrogate_second(pg_wchar c); -static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second); static void addunicode(pg_wchar c, yyscan_t yyscanner); -static bool check_uescapechar(unsigned char escape); #define yyerror(msg) scanner_yyerror(msg, yyscanner) @@ -149,6 +144,7 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); %option noyyalloc %option noyyrealloc %option noyyfree +%option stack %option warn %option prefix="core_yy" @@ -168,12 +164,11 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); * delimited identifiers (double-quoted identifiers) * hexadecimal numeric string * standard quoted strings + * quote stop (detect continued strings) * extended quoted strings (support backslash escape sequences) * $foo$ quoted strings * quoted identifier with Unicode escapes - * end of a quoted identifier with Unicode escapes, UESCAPE can follow * quoted string with Unicode escapes - * end of a quoted string with Unicode 
escapes, UESCAPE can follow * Unicode surrogate pair in extended quoted string * * Remember to add an <<EOF>> case whenever you add a new exclusive state! @@ -185,12 +180,11 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); %x xd %x xh %x xq +%x xqs %x xe %x xdolq %x xui -%x xuiend %x xus -%x xusend %x xeu /* @@ -231,19 +225,18 @@ special_whitespace ({space}+|{comment}{newline}) horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) +quote ' +/* If we see {quote} then {quotecontinue}, the quoted string continues */ +quotecontinue {whitespace_with_newline}{quote} + /* - * To ensure that {quotecontinue} can be scanned without having to back up - * if the full pattern isn't matched, we include trailing whitespace in - * {quotestop}. This matches all cases where {quotecontinue} fails to match, - * except for {quote} followed by whitespace and just one "-" (not two, - * which would start a {comment}). To cover that we have {quotefail}. - * The actions for {quotestop} and {quotefail} must throw back characters - * beyond the quote proper. + * {quotecontinuefail} is needed to avoid lexer backup when we fail to match + * {quotecontinue}. It might seem that this could just be {whitespace}*, + * but if there's a dash after {whitespace_with_newline}, it must be consumed + * to see if there's another dash --- which would start a {comment} and thus + * allow continuation of the {quotecontinue} token. */ -quote ' -quotestop {quote}{whitespace}* -quotecontinue {quote}{whitespace_with_newline}{quote} -quotefail {quote}{whitespace}*"-" +quotecontinuefail {whitespace}*"-"? 
/* Bit string * It is tempting to scan the string for only those characters @@ -304,21 +297,12 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ -/* Unicode escapes */ -uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} -/* error rule to avoid backup */ -uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] - /* Quoted identifier with Unicode escapes */ xuistart [uU]&{dquote} /* Quoted string with Unicode escapes */ xusstart [uU]&{quote} -/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */ -xustop1 {uescapefail}? -xustop2 {uescape} - /* error rule to avoid backup */ xufailed [uU]& @@ -428,7 +412,7 @@ other . /* Set location in case of syntax error in comment */ SET_YYLLOC(); yyextra->xcdepth = 0; - BEGIN(xc); + yy_push_state(xc, yyscanner); /* Put back any characters past slash-star; see above */ yyless(2); } @@ -442,7 +426,7 @@ other . {xcstop} { if (yyextra->xcdepth <= 0) - BEGIN(INITIAL); + yy_pop_state(yyscanner); else (yyextra->xcdepth)--; } @@ -472,25 +456,14 @@ other . * to mark it for the input routine as a binary string. */ SET_YYLLOC(); - BEGIN(xb); + yy_push_state(xb, yyscanner); startlit(); addlitchar('b', yyscanner); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - yylval->str = litbufdup(yyscanner); - return BCONST; - } {xhinside} | {xbinside} { addlit(yytext, yyleng, yyscanner); } -{quotecontinue} | -{quotecontinue} { - /* ignore */ - } <> { yyerror("unterminated bit string literal"); } {xhstart} { @@ -501,17 +474,10 @@ other . * to mark it for the input routine as a hex string. 
*/ SET_YYLLOC(); - BEGIN(xh); + yy_push_state(xh, yyscanner); startlit(); addlitchar('x', yyscanner); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - yylval->str = litbufdup(yyscanner); - return XCONST; - } <> { yyerror("unterminated hexadecimal string literal"); } {xnstart} { @@ -545,16 +511,16 @@ other . yyextra->saw_non_ascii = false; SET_YYLLOC(); if (yyextra->standard_conforming_strings) - BEGIN(xq); + yy_push_state(xq, yyscanner); else - BEGIN(xe); + yy_push_state(xe, yyscanner); startlit(); } {xestart} { yyextra->warn_on_first_escape = false; yyextra->saw_non_ascii = false; SET_YYLLOC(); - BEGIN(xe); + yy_push_state(xe, yyscanner); startlit(); } {xusstart} { @@ -565,56 +531,80 @@ other . errmsg("unsafe use of string constant with Unicode escapes"), errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."), lexer_errposition())); - BEGIN(xus); + yy_push_state(xus, yyscanner); startlit(); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); + +{quote} { /* - * check that the data remains valid if it might have been - * made invalid by unescaping any chars. + * When we are scanning a quoted string and see an end + * quote, we must look ahead for a possible continuation. + * If we don't see one, we know the end quote was in fact + * the end of the string. To reduce the lexer table size, + * we use a single "xqs" state to do the lookahead for all + * types of strings. */ - if (yyextra->saw_non_ascii) - pg_verifymbstr(yyextra->literalbuf, - yyextra->literallen, - false); - yylval->str = litbufdup(yyscanner); - return SCONST; - } -{quotestop} | -{quotefail} { - /* throw back all but the quote */ - yyless(1); - /* xusend state looks for possible UESCAPE */ - BEGIN(xusend); + yy_push_state(xqs, yyscanner); } -{whitespace} { - /* stay in xusend state over whitespace */ +{quotecontinue} { + /* + * Found a quote continuation, so return to the in-quote + * state and continue scanning the literal. 
+ */ + yy_pop_state(yyscanner); } -<> | -{other} | -{xustop1} { - /* no UESCAPE after the quote, throw back everything */ +{quotecontinuefail} | +<> | +{other} { + int token; + + /* + * Failed to see a quote continuation. Throw back + * everything after the end quote, and handle the string + * according to the state we were in previously. + */ yyless(0); - BEGIN(INITIAL); - yylval->str = litbuf_udeescape('\\', yyscanner); - return SCONST; - } -{xustop2} { - /* found UESCAPE after the end quote */ - BEGIN(INITIAL); - if (!check_uescapechar(yytext[yyleng - 2])) + + switch (yy_top_state(yyscanner)) { - SET_YYLLOC(); - ADVANCE_YYLLOC(yyleng - 2); - yyerror("invalid Unicode escape character"); + case xb: + yylval->str = litbufdup(yyscanner); + token = BCONST; + break; + case xh: + yylval->str = litbufdup(yyscanner); + token = XCONST; + break; + case xq: + /* fallthrough */ + case xe: + /* + * Check that the data remains valid if it + * might have been made invalid by unescaping + * any chars. + */ + if (yyextra->saw_non_ascii) + pg_verifymbstr(yyextra->literalbuf, + yyextra->literallen, + false); + yylval->str = litbufdup(yyscanner); + token = SCONST; + break; + case xus: + yylval->str = litbufdup(yyscanner); + token = UCONST; + break; + default: + yyerror("unhandled previous state in xqs"); } - yylval->str = litbuf_udeescape(yytext[yyleng - 2], - yyscanner); - return SCONST; + + /* go back to state before string start */ + yy_pop_state(yyscanner); + yy_pop_state(yyscanner); + + return token; } + {xqdouble} { addlitchar('\'', yyscanner); } @@ -693,9 +683,6 @@ other . if (c == '\0' || IS_HIGHBIT_SET(c)) yyextra->saw_non_ascii = true; } -{quotecontinue} { - /* ignore */ - } . { /* This is only needed for \ just before EOF */ addlitchar(yytext[0], yyscanner); @@ -705,7 +692,7 @@ other . {dolqdelim} { SET_YYLLOC(); yyextra->dolqstart = pstrdup(yytext); - BEGIN(xdolq); + yy_push_state(xdolq, yyscanner); startlit(); } {dolqfailed} { @@ -720,7 +707,7 @@ other . 
{ pfree(yyextra->dolqstart); yyextra->dolqstart = NULL; - BEGIN(INITIAL); + yy_pop_state(yyscanner); yylval->str = litbufdup(yyscanner); return SCONST; } @@ -749,18 +736,18 @@ other . {xdstart} { SET_YYLLOC(); - BEGIN(xd); + yy_push_state(xd, yyscanner); startlit(); } {xuistart} { SET_YYLLOC(); - BEGIN(xui); + yy_push_state(xui, yyscanner); startlit(); } {xdstop} { char *ident; - BEGIN(INITIAL); + yy_pop_state(yyscanner); if (yyextra->literallen == 0) yyerror("zero-length delimited identifier"); ident = litbufdup(yyscanner); @@ -769,54 +756,15 @@ other . yylval->str = ident; return IDENT; } -{dquote} { - yyless(1); - /* xuiend state looks for possible UESCAPE */ - BEGIN(xuiend); - } -{whitespace} { - /* stay in xuiend state over whitespace */ - } -<> | -{other} | -{xustop1} { - /* no UESCAPE after the quote, throw back everything */ - char *ident; - int identlen; - - yyless(0); - - BEGIN(INITIAL); +{dquote} { if (yyextra->literallen == 0) yyerror("zero-length delimited identifier"); - ident = litbuf_udeescape('\\', yyscanner); - identlen = strlen(ident); - if (identlen >= NAMEDATALEN) - truncate_identifier(ident, identlen, true); - yylval->str = ident; - return IDENT; - } -{xustop2} { - /* found UESCAPE after the end quote */ - char *ident; - int identlen; - BEGIN(INITIAL); - if (yyextra->literallen == 0) - yyerror("zero-length delimited identifier"); - if (!check_uescapechar(yytext[yyleng - 2])) - { - SET_YYLLOC(); - ADVANCE_YYLLOC(yyleng - 2); - yyerror("invalid Unicode escape character"); - } - ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner); - identlen = strlen(ident); - if (identlen >= NAMEDATALEN) - truncate_identifier(ident, identlen, true); - yylval->str = ident; - return IDENT; + yy_pop_state(yyscanner); + yylval->str = litbufdup(yyscanner); + return UIDENT; } + {xddouble} { addlitchar('"', yyscanner); } @@ -1288,55 +1236,12 @@ process_integer_literal(const char *token, YYSTYPE *lval) return ICONST; } -static unsigned int -hexval(unsigned char c) 
-{ - if (c >= '0' && c <= '9') - return c - '0'; - if (c >= 'a' && c <= 'f') - return c - 'a' + 0xA; - if (c >= 'A' && c <= 'F') - return c - 'A' + 0xA; - elog(ERROR, "invalid hexadecimal digit"); - return 0; /* not reached */ -} - -static void -check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner) -{ - if (GetDatabaseEncoding() == PG_UTF8) - return; - - if (c > 0x7F) - { - ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3); /* 3 for U&" */ - yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"); - } -} - -static bool -is_utf16_surrogate_first(pg_wchar c) -{ - return (c >= 0xD800 && c <= 0xDBFF); -} - -static bool -is_utf16_surrogate_second(pg_wchar c) -{ - return (c >= 0xDC00 && c <= 0xDFFF); -} - -static pg_wchar -surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) -{ - return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); -} - static void addunicode(pg_wchar c, core_yyscan_t yyscanner) { char buf[8]; + /* See also check_unicode_value() in parser.c */ if (c == 0 || c > 0x10FFFF) yyerror("invalid Unicode escape value"); if (c > 0x7F) @@ -1349,172 +1254,6 @@ addunicode(pg_wchar c, core_yyscan_t yyscanner) addlit(buf, pg_mblen(buf), yyscanner); } -/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? 
*/ -static bool -check_uescapechar(unsigned char escape) -{ - if (isxdigit(escape) - || escape == '+' - || escape == '\'' - || escape == '"' - || scanner_isspace(escape)) - { - return false; - } - else - return true; -} - -/* like litbufdup, but handle unicode escapes */ -static char * -litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner) -{ - char *new; - char *litbuf, - *in, - *out; - pg_wchar pair_first = 0; - - /* Make literalbuf null-terminated to simplify the scanning loop */ - litbuf = yyextra->literalbuf; - litbuf[yyextra->literallen] = '\0'; - - /* - * This relies on the subtle assumption that a UTF-8 expansion cannot be - * longer than its escaped representation. - */ - new = palloc(yyextra->literallen + 1); - - in = litbuf; - out = new; - while (*in) - { - if (in[0] == escape) - { - if (in[1] == escape) - { - if (pair_first) - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - *out++ = escape; - in += 2; - } - else if (isxdigit((unsigned char) in[1]) && - isxdigit((unsigned char) in[2]) && - isxdigit((unsigned char) in[3]) && - isxdigit((unsigned char) in[4])) - { - pg_wchar unicode; - - unicode = (hexval(in[1]) << 12) + - (hexval(in[2]) << 8) + - (hexval(in[3]) << 4) + - hexval(in[4]); - check_unicode_value(unicode, in, yyscanner); - if (pair_first) - { - if (is_utf16_surrogate_second(unicode)) - { - unicode = surrogate_pair_to_codepoint(pair_first, unicode); - pair_first = 0; - } - else - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - } - else if (is_utf16_surrogate_second(unicode)) - yyerror("invalid Unicode surrogate pair"); - - if (is_utf16_surrogate_first(unicode)) - pair_first = unicode; - else - { - unicode_to_utf8(unicode, (unsigned char *) out); - out += pg_mblen(out); - } - in += 5; - } - else if (in[1] == '+' && - isxdigit((unsigned char) in[2]) && - isxdigit((unsigned char) in[3]) && - isxdigit((unsigned char) in[4]) && - 
isxdigit((unsigned char) in[5]) && - isxdigit((unsigned char) in[6]) && - isxdigit((unsigned char) in[7])) - { - pg_wchar unicode; - - unicode = (hexval(in[2]) << 20) + - (hexval(in[3]) << 16) + - (hexval(in[4]) << 12) + - (hexval(in[5]) << 8) + - (hexval(in[6]) << 4) + - hexval(in[7]); - check_unicode_value(unicode, in, yyscanner); - if (pair_first) - { - if (is_utf16_surrogate_second(unicode)) - { - unicode = surrogate_pair_to_codepoint(pair_first, unicode); - pair_first = 0; - } - else - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - } - else if (is_utf16_surrogate_second(unicode)) - yyerror("invalid Unicode surrogate pair"); - - if (is_utf16_surrogate_first(unicode)) - pair_first = unicode; - else - { - unicode_to_utf8(unicode, (unsigned char *) out); - out += pg_mblen(out); - } - in += 8; - } - else - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode escape value"); - } - } - else - { - if (pair_first) - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - *out++ = *in++; - } - } - - /* unfinished surrogate pair? */ - if (pair_first) - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - - *out = '\0'; - - /* - * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII - * codes; but it's probably not worth the trouble, since this isn't likely - * to be a performance-critical path. 
- */ - pg_verifymbstr(new, out - new, false); - return new; -} - static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner) { diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l index ce20936339..71ada0b72e 100644 --- a/src/fe_utils/psqlscan.l +++ b/src/fe_utils/psqlscan.l @@ -86,6 +86,8 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner); %option noinput %option nounput %option noyywrap +%option noyy_top_state +%option stack %option warn %option prefix="psql_yy" @@ -114,12 +116,11 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner); * delimited identifiers (double-quoted identifiers) * hexadecimal numeric string * standard quoted strings + * quote stop (detect continued strings) * extended quoted strings (support backslash escape sequences) * $foo$ quoted strings * quoted identifier with Unicode escapes - * end of a quoted identifier with Unicode escapes, UESCAPE can follow * quoted string with Unicode escapes - * end of a quoted string with Unicode escapes, UESCAPE can follow * * Note: we intentionally don't mimic the backend's <xeu> state; we have * no need to distinguish it from <xe> state, and no good way to get out @@ -132,12 +133,11 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner); %x xd %x xh %x xq +%x xqs %x xe %x xdolq %x xui -%x xuiend %x xus -%x xusend /* * In order to make the world safe for Windows and Mac clients as well as @@ -177,19 +177,18 @@ special_whitespace ({space}+|{comment}{newline}) horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) +quote ' +/* If we see {quote} then {quotecontinue}, the quoted string continues */ +quotecontinue {whitespace_with_newline}{quote} + /* - * To ensure that {quotecontinue} can be scanned without having to back up - * if the full pattern isn't matched, we include trailing whitespace in - * {quotestop}. 
This matches all cases where {quotecontinue} fails to match, - * except for {quote} followed by whitespace and just one "-" (not two, - * which would start a {comment}). To cover that we have {quotefail}. - * The actions for {quotestop} and {quotefail} must throw back characters - * beyond the quote proper. + * {quotecontinuefail} is needed to avoid lexer backup when we fail to match + * {quotecontinue}. It might seem that this could just be {whitespace}*, + * but if there's a dash after {whitespace_with_newline}, it must be consumed + * to see if there's another dash --- which would start a {comment} and thus + * allow continuation of the {quotecontinue} token. */ -quote ' -quotestop {quote}{whitespace}* -quotecontinue {quote}{whitespace_with_newline}{quote} -quotefail {quote}{whitespace}*"-" +quotecontinuefail {whitespace}*"-"? /* Bit string * It is tempting to scan the string for only those characters @@ -250,21 +249,12 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ -/* Unicode escapes */ -uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} -/* error rule to avoid backup */ -uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] - /* Quoted identifier with Unicode escapes */ xuistart [uU]&{dquote} /* Quoted string with Unicode escapes */ xusstart [uU]&{quote} -/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */ -xustop1 {uescapefail}? -xustop2 {uescape} - /* error rule to avoid backup */ xufailed [uU]& @@ -399,7 +389,7 @@ other . {xcstart} { cur_state->xcdepth = 0; - BEGIN(xc); + yy_push_state(xc, yyscanner); /* Put back any characters past slash-star; see above */ yyless(2); ECHO; @@ -415,7 +405,7 @@ other . 
{xcstop} { if (cur_state->xcdepth <= 0) - BEGIN(INITIAL); + yy_pop_state(yyscanner); else cur_state->xcdepth--; ECHO; @@ -435,23 +425,13 @@ other . } /* */ {xbstart} { - BEGIN(xb); - ECHO; - } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); + yy_push_state(xb, yyscanner); ECHO; } {xhinside} | {xbinside} { ECHO; } -{quotecontinue} | -{quotecontinue} { - ECHO; - } {xhstart} { /* Hexadecimal bit type. @@ -460,13 +440,7 @@ other . * In the meantime, place a leading "x" on the string * to mark it for the input routine as a hex string. */ - BEGIN(xh); - ECHO; - } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); + yy_push_state(xh, yyscanner); ECHO; } @@ -477,45 +451,53 @@ other . {xqstart} { if (cur_state->std_strings) - BEGIN(xq); + yy_push_state(xq, yyscanner); else - BEGIN(xe); + yy_push_state(xe, yyscanner); ECHO; } {xestart} { - BEGIN(xe); + yy_push_state(xe, yyscanner); ECHO; } {xusstart} { - BEGIN(xus); - ECHO; - } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); + yy_push_state(xus, yyscanner); ECHO; } -{quotestop} | -{quotefail} { - /* throw back all but the quote */ - yyless(1); - BEGIN(xusend); + +{quote} { + /* + * When we are scanning a quoted string and see an end + * quote, we must look ahead for a possible continuation. + * If we don't see one, we know the end quote was in fact + * the end of the string. To reduce the lexer table size, + * we use a single "xqs" state to do the lookahead for all + * types of strings. + */ + yy_push_state(xqs, yyscanner); ECHO; } -{whitespace} { +{quotecontinue} { + /* + * Found a quote continuation, so return to the in-quote + * state and continue scanning the literal. + */ + yy_pop_state(yyscanner); ECHO; } -{other} | -{xustop1} { +{quotecontinuefail} | +{other} { + /* + * Failed to see a quote continuation. Throw back + * everything after the end quote. 
+ */ yyless(0); - BEGIN(INITIAL); - ECHO; - } -{xustop2} { - BEGIN(INITIAL); - ECHO; + + /* go back to state before string start */ + yy_pop_state(yyscanner); + yy_pop_state(yyscanner); } + {xqdouble} { ECHO; } @@ -540,9 +522,6 @@ other . {xehexesc} { ECHO; } -{quotecontinue} { - ECHO; - } . { /* This is only needed for \ just before EOF */ ECHO; @@ -550,7 +529,7 @@ other . {dolqdelim} { cur_state->dolqstart = pg_strdup(yytext); - BEGIN(xdolq); + yy_push_state(xdolq, yyscanner); ECHO; } {dolqfailed} { @@ -563,7 +542,7 @@ other . { free(cur_state->dolqstart); cur_state->dolqstart = NULL; - BEGIN(INITIAL); + yy_pop_state(yyscanner); } else { @@ -588,35 +567,22 @@ other . } {xdstart} { - BEGIN(xd); + yy_push_state(xd, yyscanner); ECHO; } {xuistart} { - BEGIN(xui); + yy_push_state(xui, yyscanner); ECHO; } {xdstop} { - BEGIN(INITIAL); - ECHO; - } -{dquote} { - yyless(1); - BEGIN(xuiend); - ECHO; - } -{whitespace} { + yy_pop_state(yyscanner); ECHO; } -{other} | -{xustop1} { - yyless(0); - BEGIN(INITIAL); - ECHO; - } -{xustop2} { - BEGIN(INITIAL); +{dquote} { + yy_pop_state(yyscanner); ECHO; } + {xddouble} { ECHO; } @@ -1084,8 +1050,7 @@ psql_scan(PsqlScanState state, switch (state->start_state) { case INITIAL: - case xuiend: /* we treat these like INITIAL */ - case xusend: + case xqs: /* we treat this like INITIAL */ if (state->paren_depth > 0) { result = PSCAN_INCOMPLETE; @@ -1240,7 +1205,8 @@ psql_scan_reselect_sql_lexer(PsqlScanState state) bool psql_scan_in_quote(PsqlScanState state) { - return state->start_state != INITIAL; + return state->start_state != INITIAL && + state->start_state != xqs; } /* diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 3e3e6c470e..0c4cb9c7d0 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -508,6 +508,27 @@ typedef uint32 (*utf_local_conversion_func) (uint32 code); (destencoding)) +/* + * Some handy functions for Unicode-specific tests. 
+ */ +static inline bool +is_utf16_surrogate_first(pg_wchar c) +{ + return (c >= 0xD800 && c <= 0xDBFF); +} + +static inline bool +is_utf16_surrogate_second(pg_wchar c) +{ + return (c >= 0xDC00 && c <= 0xDFFF); +} + +static inline pg_wchar +surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) +{ + return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); +} + /* * These functions are considered part of libpq's exported API and * are also declared in libpq-fe.h. diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index 00ace8425e..5893d317d8 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -416,6 +416,7 @@ PG_KEYWORD("truncate", TRUNCATE, UNRESERVED_KEYWORD) PG_KEYWORD("trusted", TRUSTED, UNRESERVED_KEYWORD) PG_KEYWORD("type", TYPE_P, UNRESERVED_KEYWORD) PG_KEYWORD("types", TYPES_P, UNRESERVED_KEYWORD) +PG_KEYWORD("uescape", UESCAPE, UNRESERVED_KEYWORD) PG_KEYWORD("unbounded", UNBOUNDED, UNRESERVED_KEYWORD) PG_KEYWORD("uncommitted", UNCOMMITTED, UNRESERVED_KEYWORD) PG_KEYWORD("unencrypted", UNENCRYPTED, UNRESERVED_KEYWORD) diff --git a/src/include/parser/scanner.h b/src/include/parser/scanner.h index 731a2bd264..b61ac65f54 100644 --- a/src/include/parser/scanner.h +++ b/src/include/parser/scanner.h @@ -48,7 +48,7 @@ typedef union core_YYSTYPE * However, those are not defined in this file, because bison insists on * defining them for itself. 
The token codes used by the core scanner are * the ASCII characters plus these: - * %token IDENT FCONST SCONST BCONST XCONST Op + * %token IDENT UIDENT FCONST SCONST UCONST BCONST XCONST Op * %token ICONST PARAM * %token TYPECAST DOT_DOT COLON_EQUALS EQUALS_GREATER * %token LESS_EQUALS GREATER_EQUALS NOT_EQUALS diff --git a/src/interfaces/ecpg/preproc/ecpg.tokens b/src/interfaces/ecpg/preproc/ecpg.tokens index 1d613af02f..8e0527fdb7 100644 --- a/src/interfaces/ecpg/preproc/ecpg.tokens +++ b/src/interfaces/ecpg/preproc/ecpg.tokens @@ -24,4 +24,3 @@ S_TYPEDEF %token CSTRING CVARIABLE CPP_LINE IP -%token DOLCONST ECONST NCONST UCONST UIDENT diff --git a/src/interfaces/ecpg/preproc/ecpg.trailer b/src/interfaces/ecpg/preproc/ecpg.trailer index f58b41e675..784d1d199e 100644 --- a/src/interfaces/ecpg/preproc/ecpg.trailer +++ b/src/interfaces/ecpg/preproc/ecpg.trailer @@ -1719,46 +1719,13 @@ ecpg_bconst: BCONST { $$ = make_name(); } ; ecpg_fconst: FCONST { $$ = make_name(); } ; -ecpg_sconst: - SCONST - { - /* could have been input as '' or $$ */ - $$ = (char *)mm_alloc(strlen($1) + 3); - $$[0]='\''; - strcpy($$+1, $1); - $$[strlen($1)+1]='\''; - $$[strlen($1)+2]='\0'; - free($1); - } - | ECONST - { - $$ = (char *)mm_alloc(strlen($1) + 4); - $$[0]='E'; - $$[1]='\''; - strcpy($$+2, $1); - $$[strlen($1)+2]='\''; - $$[strlen($1)+3]='\0'; - free($1); - } - | NCONST - { - $$ = (char *)mm_alloc(strlen($1) + 4); - $$[0]='N'; - $$[1]='\''; - strcpy($$+2, $1); - $$[strlen($1)+2]='\''; - $$[strlen($1)+3]='\0'; - free($1); - } - | UCONST { $$ = $1; } - | DOLCONST { $$ = $1; } +ecpg_sconst: SCONST { $$ = $1; } ; ecpg_xconst: XCONST { $$ = make_name(); } ; -ecpg_ident: IDENT { $$ = make_name(); } +ecpg_ident: IDENT { $$ = $1; } | CSTRING { $$ = make3_str(mm_strdup("\""), $1, mm_strdup("\"")); } - | UIDENT { $$ = $1; } ; quoted_ident_stringvar: name diff --git a/src/interfaces/ecpg/preproc/ecpg.type b/src/interfaces/ecpg/preproc/ecpg.type index 9497b91b9d..ffafa82af9 100644 --- 
a/src/interfaces/ecpg/preproc/ecpg.type +++ b/src/interfaces/ecpg/preproc/ecpg.type @@ -122,12 +122,8 @@ %type CSTRING %type CPP_LINE %type CVARIABLE -%type DOLCONST -%type ECONST -%type NCONST %type SCONST -%type UCONST -%type UIDENT +%type IDENT %type s_struct_union_symbol diff --git a/src/interfaces/ecpg/preproc/parse.pl b/src/interfaces/ecpg/preproc/parse.pl index 3619706cdc..47300b7083 100644 --- a/src/interfaces/ecpg/preproc/parse.pl +++ b/src/interfaces/ecpg/preproc/parse.pl @@ -218,8 +218,8 @@ sub main if ($a eq 'IDENT' && $prior eq '%nonassoc') { - # add two more tokens to the list - $str = $str . "\n%nonassoc CSTRING\n%nonassoc UIDENT"; + # add more tokens to the list + $str = $str . "\n%nonassoc CSTRING"; } $prior = $a; } diff --git a/src/interfaces/ecpg/preproc/parser.c b/src/interfaces/ecpg/preproc/parser.c index abae89d51b..ecf3c1b7e5 100644 --- a/src/interfaces/ecpg/preproc/parser.c +++ b/src/interfaces/ecpg/preproc/parser.c @@ -6,6 +6,10 @@ * This should match src/backend/parser/parser.c, except that we do not * need to bother with re-entrant interfaces. * + * Note: ECPG doesn't report error location like the backend does. + * This file will need work if we ever want it to. + * See backend/parser/parser.c + * * * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -27,8 +31,10 @@ static int lookahead_token; /* one-token lookahead */ static YYSTYPE lookahead_yylval; /* yylval for lookahead token */ static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */ static char *lookahead_yytext; /* start current token */ -static char *lookahead_end; /* end of current token */ -static char lookahead_hold_char; /* to be put back at *lookahead_end */ + + +static bool check_uescapechar(unsigned char escape); +static bool ecpg_isspace(char ch); /* @@ -43,13 +49,16 @@ static char lookahead_hold_char; /* to be put back at *lookahead_end */ * words. 
Furthermore it's not clear how to do that without re-introducing * scanner backtrack, which would cost more performance than this filter * layer does. + * + * We also use this filter to convert UIDENT and UCONST sequences into + * plain IDENT and SCONST tokens. While that could be handled by additional + * productions in the main grammar, it's more efficient to do it like this. */ int filtered_base_yylex(void) { int cur_token; int next_token; - int cur_token_length; YYSTYPE cur_yylval; YYLTYPE cur_yylloc; char *cur_yytext; @@ -61,41 +70,26 @@ filtered_base_yylex(void) base_yylval = lookahead_yylval; base_yylloc = lookahead_yylloc; base_yytext = lookahead_yytext; - *lookahead_end = lookahead_hold_char; have_lookahead = false; } else cur_token = base_yylex(); /* - * If this token isn't one that requires lookahead, just return it. If it - * does, determine the token length. (We could get that via strlen(), but - * since we have such a small set of possibilities, hardwiring seems - * feasible and more efficient.) + * If this token isn't one that requires lookahead, just return it. */ switch (cur_token) { case NOT: - cur_token_length = 3; - break; case NULLS_P: - cur_token_length = 5; - break; case WITH: - cur_token_length = 4; + case UIDENT: + case UCONST: break; default: return cur_token; } - /* - * Identify end+1 of current token. base_yylex() has temporarily stored a - * '\0' here, and will undo that when we call it again. We need to redo - * it to fully revert the lookahead call for error reporting purposes. 
- */ - lookahead_end = base_yytext + cur_token_length; - Assert(*lookahead_end == '\0'); - /* Save and restore lexer output variables around the call */ cur_yylval = base_yylval; cur_yylloc = base_yylloc; @@ -113,10 +107,6 @@ filtered_base_yylex(void) base_yylloc = cur_yylloc; base_yytext = cur_yytext; - /* Now revert the un-truncation of the current token */ - lookahead_hold_char = *lookahead_end; - *lookahead_end = '\0'; - have_lookahead = true; /* Replace cur_token if needed, based on lookahead */ @@ -157,7 +147,81 @@ filtered_base_yylex(void) break; } break; + case UIDENT: + case UCONST: + /* Look ahead for UESCAPE */ + if (next_token == UESCAPE) + { + /* Yup, so get third token, which had better be SCONST */ + const char *escstr; + + /* Again save and restore lexer output variables around the call */ + cur_yylval = base_yylval; + cur_yylloc = base_yylloc; + cur_yytext = base_yytext; + + /* Get third token */ + next_token = base_yylex(); + + if (next_token != SCONST) + mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal"); + + /* Save and check escape string, which the scanner returns with quotes */ + escstr = base_yylval.str; + if (strlen(escstr) != 3 || !check_uescapechar(escstr[1])) + mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character"); + + base_yylval = cur_yylval; + base_yylloc = cur_yylloc; + base_yytext = cur_yytext; + + /* Combine 3 tokens into 1 */ + base_yylval.str = psprintf("%s uescape %s", base_yylval.str, escstr); + + /* + * We don't need to revert the un-truncation of UESCAPE. What we + * do want to do is clear have_lookahead, thereby consuming + * all three tokens. + */ + have_lookahead = false; + } + + if (cur_token == UIDENT) + cur_token = IDENT; + else if (cur_token == UCONST) + cur_token = SCONST; + break; } return cur_token; } + +// WIP: if we want to check this here, find a better location +/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? 
*/ +static bool +check_uescapechar(unsigned char escape) +{ + if (isxdigit(escape) + || escape == '+' + || escape == '\'' + || escape == '"' + || ecpg_isspace(escape)) + return false; + else + return true; +} + +/* + * ecpg_isspace() --- return true if flex scanner considers char whitespace + */ +static bool +ecpg_isspace(char ch) +{ + if (ch == ' ' || + ch == '\t' || + ch == '\n' || + ch == '\r' || + ch == '\f') + return true; + return false; +} diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l index 488c89b7f4..c83b21118b 100644 --- a/src/interfaces/ecpg/preproc/pgc.l +++ b/src/interfaces/ecpg/preproc/pgc.l @@ -6,6 +6,9 @@ * * This is a modified version of src/backend/parser/scan.l * + * The ecpg scanner is not backup-free, so the fail rules are + * only here to simplify syncing this file with scan.l. + * * * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -61,7 +64,6 @@ static bool isdefine(void); static bool isinformixdefine(void); char *token_start; -static int state_before; struct _yy_buffer { @@ -89,6 +91,7 @@ static struct _if_value %option nodefault %option noinput %option noyywrap +%option stack %option warn %option yylineno %option prefix="base_yy" @@ -105,13 +108,13 @@ static struct _if_value * and to eliminate parsing troubles for numeric strings. 
* Exclusive states: * bit string literal - * extended C-style comments in C - * extended C-style comments in SQL + * extended C-style comments * delimited identifiers (double-quoted identifiers) * double-quoted strings in C * hexadecimal numeric string * national character quoted strings * standard quoted strings + * quote stop (detect continued strings) * extended quoted strings (support backslash escape sequences) * single-quoted strings in C * $foo$ quoted strings @@ -120,18 +123,21 @@ static struct _if_value * condition of an EXEC SQL IFDEF construct * skipping the inactive part of an EXEC SQL IFDEF construct * + * Note: we intentionally don't mimic the backend's state; we have + * no need to distinguish it from state. + * * Remember to add an <> case whenever you add a new exclusive state! * The default one is probably not the right thing. */ %x xb -%x xcc -%x xcsql +%x xc %x xd %x xdc %x xh %x xn %x xq +%x xqs %x xe %x xqc %x xdolq @@ -181,9 +187,17 @@ horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{whitespace}*) quote ' -quotestop {quote}{whitespace}* -quotecontinue {quote}{whitespace_with_newline}{quote} -quotefail {quote}{whitespace}*"-" +/* If we see {quote} then {quotecontinue}, the quoted string continues */ +quotecontinue {whitespace_with_newline}{quote} + +/* + * {quotecontinuefail} is needed to avoid lexer backup when we fail to match + * {quotecontinue}. It might seem that this could just be {whitespace}*, + * but if there's a dash after {whitespace_with_newline}, it must be consumed + * to see if there's another dash --- which would start a {comment} and thus + * allow continuation of the {quotecontinue} token. + */ +quotecontinuefail {whitespace}*"-"? /* Bit string */ @@ -237,19 +251,11 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ -/* Unicode escapes */ -/* (The ecpg scanner is not backup-free, so the fail rules in scan.l are - * not needed here, but could be added if desired.) 
- */ -uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} - /* Quoted identifier with Unicode escapes */ xuistart [uU]&{dquote} -xuistop {dquote}({whitespace}*{uescape})? /* Quoted string with Unicode escapes */ xusstart [uU]&{quote} -xusstop {quote}({whitespace}*{uescape})? /* special stuff for C strings */ xdcqq \\\\ @@ -408,54 +414,56 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ {whitespace} { /* ignore */ } +} /* */ +{ {xcstart} { token_start = yytext; - state_before = YYSTATE; xcdepth = 0; - BEGIN(xcsql); + yy_push_state(xc); /* Put back any characters past slash-star; see above */ yyless(2); fputs("/*", yyout); } -} /* */ +} /* */ -{xcstart} { - token_start = yytext; - state_before = YYSTATE; - xcdepth = 0; - BEGIN(xcc); - /* Put back any characters past slash-star; see above */ - yyless(2); - fputs("/*", yyout); - } -{xcstart} { ECHO; } -{xcstart} { - xcdepth++; - /* Put back any characters past slash-star; see above */ - yyless(2); - fputs("/_*", yyout); - } -{xcstop} { - if (xcdepth <= 0) +{ +{xcstart} { + if (yy_top_state() == SQL) { - ECHO; - BEGIN(state_before); - token_start = NULL; + xcdepth++; + /* Put back any characters past slash-star; see above */ + yyless(2); + fputs("/_*", yyout); } - else + else if (yy_top_state() == C) { - xcdepth--; - fputs("*_/", yyout); + ECHO; } } -{xcstop} { - ECHO; - BEGIN(state_before); - token_start = NULL; +{xcstop} { + if (yy_top_state() == SQL) + { + if (xcdepth <= 0) + { + ECHO; + yy_pop_state(); + token_start = NULL; + } + else + { + xcdepth--; + fputs("*_/", yyout); + } + } + else if (yy_top_state() == C) + { + ECHO; + yy_pop_state(); + token_start = NULL; + } } -{ {xcinside} { ECHO; } @@ -471,56 +479,34 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ <> { mmfatal(PARSE_ERROR, "unterminated /* comment"); } -} /* */ +} /* */ { {xbstart} { token_start = yytext; - BEGIN(xb); + yy_push_state(xb); startlit(); addlitchar('b'); } } 
/* */ -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(SQL); - if (literalbuf[strspn(literalbuf, "01") + 1] != '\0') - mmerror(PARSE_ERROR, ET_ERROR, "invalid bit string literal"); - base_yylval.str = mm_strdup(literalbuf); - return BCONST; - } {xhinside} | {xbinside} { addlit(yytext, yyleng); } -{quotecontinue} | -{quotecontinue} { - /* ignore */ - } <> { mmfatal(PARSE_ERROR, "unterminated bit string literal"); } {xhstart} { token_start = yytext; - BEGIN(xh); + yy_push_state(xh); startlit(); addlitchar('x'); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(SQL); - base_yylval.str = mm_strdup(literalbuf); - return XCONST; - } - <> { mmfatal(PARSE_ERROR, "unterminated hexadecimal string literal"); } {xqstart} { token_start = yytext; - state_before = YYSTATE; - BEGIN(xqc); + yy_push_state(xqc); startlit(); } @@ -530,59 +516,98 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ * Transfer it as-is to the backend. */ token_start = yytext; - state_before = YYSTATE; - BEGIN(xn); + yy_push_state(xn); startlit(); } {xqstart} { token_start = yytext; - state_before = YYSTATE; - BEGIN(xq); + yy_push_state(xq); startlit(); } {xestart} { token_start = yytext; - state_before = YYSTATE; - BEGIN(xe); + yy_push_state(xe); startlit(); } {xusstart} { token_start = yytext; - state_before = YYSTATE; - BEGIN(xus); + yy_push_state(xus); startlit(); - addlit(yytext, yyleng); } } /* */ -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return SCONST; - } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return ECONST; +{quote} { + /* + * When we are scanning a quoted string and see an end + * quote, we must look ahead for a possible continuation. + * If we don't see one, we know the end quote was in fact + * the end of the string. 
To reduce the lexer table size, + * we use a single "xqs" state to do the lookahead for all + * types of strings. + */ + yy_push_state(xqs); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return NCONST; +{quotecontinue} { + /* + * Found a quote continuation, so return to the in-quote + * state and continue scanning the literal. + */ + yy_pop_state(); } -{xusstop} { - addlit(yytext, yyleng); - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return UCONST; +{quotecontinuefail} | +<> | +{other} { + int token; + + /* + * Failed to see a quote continuation. Throw back + * everything after the end quote, and handle the string + * according to the state we were in previously. + */ + yyless(0); + + switch (yy_top_state()) + { + case xb: + if (literalbuf[strspn(literalbuf, "01") + 1] != '\0') + mmerror(PARSE_ERROR, ET_ERROR, "invalid bit string literal"); + base_yylval.str = mm_strdup(literalbuf); + token = BCONST; + break; + case xh: + base_yylval.str = mm_strdup(literalbuf); + token = XCONST; + break; + case xq: + /* fallthrough */ + case xqc: + base_yylval.str = psprintf("'%s'", literalbuf); + token = SCONST; + break; + case xe: + base_yylval.str = psprintf("E'%s'", literalbuf); + token = SCONST; + break; + case xn: + base_yylval.str = psprintf("N'%s'", literalbuf); + token = SCONST; + break; + case xus: + base_yylval.str = psprintf("U&'%s'", literalbuf); + token = UCONST; + break; + default: + mmfatal(PARSE_ERROR, "unhandled previous state in xqs\n"); + } + + /* go back to state before string start */ + yy_pop_state(); + yy_pop_state(); + + return token; } + {xqdouble} { addlitchar('\''); } {xqcquote} { addlitchar('\\'); @@ -604,9 +629,6 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ {xehexesc} { addlit(yytext, yyleng); } -{quotecontinue} { - /* ignore */ - } . 
{ /* This is only needed for \ just before EOF */ addlitchar(yytext[0]); @@ -619,7 +641,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ if (dolqstart) free(dolqstart); dolqstart = mm_strdup(yytext); - BEGIN(xdolq); + yy_push_state(xdolq); startlit(); addlit(yytext, yyleng); } @@ -637,9 +659,9 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ addlit(yytext, yyleng); free(dolqstart); dolqstart = NULL; - BEGIN(SQL); + yy_pop_state(); base_yylval.str = mm_strdup(literalbuf); - return DOLCONST; + return SCONST; } else { @@ -666,20 +688,17 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ { {xdstart} { - state_before = YYSTATE; - BEGIN(xd); + yy_push_state(xd); startlit(); } {xuistart} { - state_before = YYSTATE; - BEGIN(xui); + yy_push_state(xui); startlit(); - addlit(yytext, yyleng); } } /* */ {xdstop} { - BEGIN(state_before); + yy_pop_state(); if (literallen == 0) mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier"); /* The backend will truncate the identifier here. We do not as it does not change the result. */ @@ -687,17 +706,16 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ return CSTRING; } {xdstop} { - BEGIN(state_before); + yy_pop_state(); base_yylval.str = mm_strdup(literalbuf); return CSTRING; } -{xuistop} { - BEGIN(state_before); +{dquote} { + yy_pop_state(); if (literallen == 2) /* "U&" */ mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier"); /* The backend will truncate the identifier here. We do not as it does not change the result. 
*/ - addlit(yytext, yyleng); - base_yylval.str = mm_strdup(literalbuf); + base_yylval.str = psprintf("U&\"%s\"", literalbuf); return UIDENT; } {xddouble} { @@ -708,8 +726,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ } <> { mmfatal(PARSE_ERROR, "unterminated quoted identifier"); } {xdstart} { - state_before = YYSTATE; - BEGIN(xdc); + yy_push_state(xdc); startlit(); } {xdcinside} { diff --git a/src/pl/plpgsql/src/pl_gram.y b/src/pl/plpgsql/src/pl_gram.y index 454071a81f..3cdf9289c4 100644 --- a/src/pl/plpgsql/src/pl_gram.y +++ b/src/pl/plpgsql/src/pl_gram.y @@ -232,7 +232,7 @@ static void check_raise_parameters(PLpgSQL_stmt_raise *stmt); * Some of these are not directly referenced in this file, but they must be * here anyway. */ -%token IDENT FCONST SCONST BCONST XCONST Op +%token IDENT UIDENT FCONST SCONST UCONST BCONST XCONST Op %token ICONST PARAM %token TYPECAST DOT_DOT COLON_EQUALS EQUALS_GREATER %token LESS_EQUALS GREATER_EQUALS NOT_EQUALS diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 6d96843e5b..60cb86193c 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -48,17 +48,21 @@ SELECT 'tricky' AS U&"\" UESCAPE '!'; (1 row) SELECT U&'wrong: \061'; -ERROR: invalid Unicode escape value at or near "\061'" +ERROR: invalid Unicode escape value LINE 1: SELECT U&'wrong: \061'; ^ SELECT U&'wrong: \+0061'; -ERROR: invalid Unicode escape value at or near "\+0061'" +ERROR: invalid Unicode escape value LINE 1: SELECT U&'wrong: \+0061'; ^ +SELECT U&'wrong: +0061' UESCAPE +; +ERROR: UESCAPE must be followed by a simple string literal at or near "+" +LINE 1: SELECT U&'wrong: +0061' UESCAPE +; + ^ SELECT U&'wrong: +0061' UESCAPE '+'; -ERROR: invalid Unicode escape character at or near "+'" +ERROR: invalid Unicode escape character at or near "'+'" LINE 1: SELECT U&'wrong: +0061' UESCAPE '+'; - ^ + ^ SET standard_conforming_strings TO off; SELECT 
U&'d\0061t\+000061' AS U&"d\0061t\+000061"; ERROR: unsafe use of string constant with Unicode escapes diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 0afb94964b..c5cd15142a 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -27,6 +27,7 @@ SELECT 'tricky' AS U&"\" UESCAPE '!'; SELECT U&'wrong: \061'; SELECT U&'wrong: \+0061'; +SELECT U&'wrong: +0061' UESCAPE +; SELECT U&'wrong: +0061' UESCAPE '+'; SET standard_conforming_strings TO off;