diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index e1cae85..899da09 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -168,12 +168,14 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); * delimited identifiers (double-quoted identifiers) * hexadecimal numeric string * standard quoted strings + * quote stop (detect continued strings) * extended quoted strings (support backslash escape sequences) * $foo$ quoted strings * quoted identifier with Unicode escapes - * end of a quoted identifier with Unicode escapes, UESCAPE can follow * quoted string with Unicode escapes - * end of a quoted string with Unicode escapes, UESCAPE can follow + * end of a quoted string or identifier with Unicode escapes, + * UESCAPE can follow + * expecting escape character literal after UESCAPE * Unicode surrogate pair in extended quoted string * * Remember to add an <<EOF>> case whenever you add a new exclusive state! @@ -185,12 +187,13 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); %x xd %x xh %x xq +%x xqs %x xe %x xdolq %x xui -%x xuiend %x xus -%x xusend +%x xuend +%x xuchar %x xeu /* @@ -231,19 +234,18 @@ special_whitespace ({space}+|{comment}{newline}) horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) +quote ' +/* If we see {quote} then {quotecontinue}, the quoted string continues */ +quotecontinue {whitespace_with_newline}{quote} + /* - * To ensure that {quotecontinue} can be scanned without having to back up - * if the full pattern isn't matched, we include trailing whitespace in - * {quotestop}. This matches all cases where {quotecontinue} fails to match, - * except for {quote} followed by whitespace and just one "-" (not two, - * which would start a {comment}). To cover that we have {quotefail}. - * The actions for {quotestop} and {quotefail} must throw back characters - * beyond the quote proper. 
+ * {quotecontinuefail} is needed to avoid lexer backup when we fail to match + * {quotecontinue}. It might seem that this could just be {whitespace}*, + * but if there's a dash after {whitespace_with_newline}, it must be consumed + * to see if there's another dash --- which would start a {comment} and thus + * allow continuation of the {quotecontinue} token. */ -quote ' -quotestop {quote}{whitespace}* -quotecontinue {quote}{whitespace_with_newline}{quote} -quotefail {quote}{whitespace}*"-" +quotecontinuefail {whitespace}*"-"? /* Bit string * It is tempting to scan the string for only those characters @@ -304,10 +306,15 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ -/* Unicode escapes */ -uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} +/* Optional UESCAPE after a quoted string or identifier with Unicode escapes */ +uescape [uU][eE][sS][cC][aA][pP][eE] +/* error rule to avoid backup */ +uescapefail [uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] + +/* escape character literal */ +uescchar {quote}[^']{quote} /* error rule to avoid backup */ -uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] +uesccharfail {quote}[^']|{quote} /* Quoted identifier with Unicode escapes */ xuistart [uU]&{dquote} @@ -315,10 +322,6 @@ xuistart [uU]&{dquote} /* Quoted string with Unicode escapes */ xusstart [uU]&{quote} -/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */ -xustop1 {uescapefail}? -xustop2 {uescape} - /* error rule to avoid backup */ xufailed [uU]& @@ -476,21 +479,10 @@ other . 
startlit(); addlitchar('b', yyscanner); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - yylval->str = litbufdup(yyscanner); - return BCONST; - } {xhinside} | {xbinside} { addlit(yytext, yyleng, yyscanner); } -{quotecontinue} | -{quotecontinue} { - /* ignore */ - } <> { yyerror("unterminated bit string literal"); } {xhstart} { @@ -505,13 +497,6 @@ other . startlit(); addlitchar('x', yyscanner); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - yylval->str = litbufdup(yyscanner); - return XCONST; - } <> { yyerror("unterminated hexadecimal string literal"); } {xnstart} { @@ -568,53 +553,71 @@ other . BEGIN(xus); startlit(); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); + +{quote} { /* - * check that the data remains valid if it might have been - * made invalid by unescaping any chars. + * When we are scanning a quoted string and see an end + * quote, we must look ahead for a possible continuation. + * If we don't see one, we know the end quote was in fact + * the end of the string. To reduce the lexer table size, + * we use a single "xqs" state to do the lookahead for all + * types of strings. */ - if (yyextra->saw_non_ascii) - pg_verifymbstr(yyextra->literalbuf, - yyextra->literallen, - false); - yylval->str = litbufdup(yyscanner); - return SCONST; - } -{quotestop} | -{quotefail} { - /* throw back all but the quote */ - yyless(1); - /* xusend state looks for possible UESCAPE */ - BEGIN(xusend); + yyextra->state_before_quote_stop = YYSTATE; + BEGIN(xqs); } -{whitespace} { - /* stay in xusend state over whitespace */ +{quotecontinue} { + /* + * Found a quote continuation, so return to the in-quote + * state and continue scanning the literal. + */ + BEGIN(yyextra->state_before_quote_stop); } -<> | -{other} | -{xustop1} { - /* no UESCAPE after the quote, throw back everything */ +{quotecontinuefail} | +{other} | +<> { + /* + * Failed to see a quote continuation. 
Throw back + * everything after the end quote, and handle the string + * according to the state we were in previously. + */ yyless(0); - BEGIN(INITIAL); - yylval->str = litbuf_udeescape('\\', yyscanner); - return SCONST; - } -{xustop2} { - /* found UESCAPE after the end quote */ - BEGIN(INITIAL); - if (!check_uescapechar(yytext[yyleng - 2])) + + switch (yyextra->state_before_quote_stop) { - SET_YYLLOC(); - ADVANCE_YYLLOC(yyleng - 2); - yyerror("invalid Unicode escape character"); + case xb: + BEGIN(INITIAL); + yylval->str = litbufdup(yyscanner); + return BCONST; + case xh: + BEGIN(INITIAL); + yylval->str = litbufdup(yyscanner); + return XCONST; + case xe: + /* fallthrough */ + case xq: + BEGIN(INITIAL); + + /* + * Check that the data remains valid if it + * might have been made invalid by unescaping + * any chars. + */ + if (yyextra->saw_non_ascii) + pg_verifymbstr(yyextra->literalbuf, + yyextra->literallen, + false); + yylval->str = litbufdup(yyscanner); + return SCONST; + case xus: + /* xuend state looks for possible UESCAPE */ + BEGIN(xuend); + break; + default: + yyerror("unhandled previous state in xqs"); } - yylval->str = litbuf_udeescape(yytext[yyleng - 2], - yyscanner); - return SCONST; } + {xqdouble} { addlitchar('\'', yyscanner); } @@ -693,9 +696,6 @@ other . if (c == '\0' || IS_HIGHBIT_SET(c)) yyextra->saw_non_ascii = true; } -{quotecontinue} { - /* ignore */ - } . { /* This is only needed for \ just before EOF */ addlitchar(yytext[0], yyscanner); @@ -770,53 +770,89 @@ other . 
return IDENT; } {dquote} { - yyless(1); - /* xuiend state looks for possible UESCAPE */ - BEGIN(xuiend); + /* xuend state looks for possible UESCAPE */ + yyextra->state_before_quote_stop = YYSTATE; + BEGIN(xuend); } -{whitespace} { - /* stay in xuiend state over whitespace */ + +{whitespace} { + /* stay in xuend or xuchar state over whitespace */ } -<> | -{other} | -{xustop1} { +{uescapefail} | +{other} | +<> { /* no UESCAPE after the quote, throw back everything */ - char *ident; - int identlen; - yyless(0); - BEGIN(INITIAL); - if (yyextra->literallen == 0) - yyerror("zero-length delimited identifier"); - ident = litbuf_udeescape('\\', yyscanner); - identlen = strlen(ident); - if (identlen >= NAMEDATALEN) - truncate_identifier(ident, identlen, true); - yylval->str = ident; - return IDENT; + if (yyextra->state_before_quote_stop == xus) + { + BEGIN(INITIAL); + yylval->str = litbuf_udeescape('\\', yyscanner); + return SCONST; + } + else if (yyextra->state_before_quote_stop == xui) + { + char *ident; + int identlen; + + BEGIN(INITIAL); + if (yyextra->literallen == 0) + yyerror("zero-length delimited identifier"); + ident = litbuf_udeescape('\\', yyscanner); + identlen = strlen(ident); + if (identlen >= NAMEDATALEN) + truncate_identifier(ident, identlen, true); + yylval->str = ident; + return IDENT; + } + else + yyerror("unhandled previous state in xuend"); } -{xustop2} { +{uescape} { /* found UESCAPE after the end quote */ - char *ident; - int identlen; - - BEGIN(INITIAL); - if (yyextra->literallen == 0) - yyerror("zero-length delimited identifier"); + BEGIN(xuchar); + } +{uescchar} { + /* found escape character literal after UESCAPE */ if (!check_uescapechar(yytext[yyleng - 2])) { SET_YYLLOC(); ADVANCE_YYLLOC(yyleng - 2); yyerror("invalid Unicode escape character"); } - ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner); - identlen = strlen(ident); - if (identlen >= NAMEDATALEN) - truncate_identifier(ident, identlen, true); - yylval->str = ident; - return IDENT; 
+ + if (yyextra->state_before_quote_stop == xus) + { + BEGIN(INITIAL); + yylval->str = litbuf_udeescape(yytext[yyleng - 2], + yyscanner); + return SCONST; + } + else if (yyextra->state_before_quote_stop == xui) + { + char *ident; + int identlen; + + BEGIN(INITIAL); + if (yyextra->literallen == 0) + yyerror("zero-length delimited identifier"); + ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner); + identlen = strlen(ident); + if (identlen >= NAMEDATALEN) + truncate_identifier(ident, identlen, true); + yylval->str = ident; + return IDENT; + } + else + yyerror("unhandled previous state in xuchar"); + } +{uesccharfail} | +{other} | +<> { + SET_YYLLOC(); + yyerror("missing or invalid Unicode escape character"); } + {xddouble} { addlitchar('"', yyscanner); } diff --git a/src/include/parser/scanner.h b/src/include/parser/scanner.h index 731a2bd..72c2a28 100644 --- a/src/include/parser/scanner.h +++ b/src/include/parser/scanner.h @@ -99,6 +99,7 @@ typedef struct core_yy_extra_type int literallen; /* actual current string length */ int literalalloc; /* current allocated buffer size */ + int state_before_quote_stop; /* start cond. before end quote */ int xcdepth; /* depth of nesting in slash-star comments */ char *dolqstart; /* current $foo$ quote start string */