From d69c6e1dcfd96adb9dfe012b80d3922b89be6dbf Mon Sep 17 00:00:00 2001 From: Mark Dilger Date: Wed, 22 Jan 2020 13:28:52 -0800 Subject: [PATCH 09/11] Making json parsing work without throwing exceptions. This is largely based on Robert Haas's patch v2-0004-WIP-Return-errors-rather-than-using-ereport.patch --- src/backend/utils/adt/json.c | 473 ++++++++---------------------- src/backend/utils/adt/jsonb.c | 5 +- src/backend/utils/adt/jsonfuncs.c | 254 +++++++++++++++- src/include/common/jsonapi.h | 30 +- src/include/utils/jsonfuncs.h | 34 +++ 5 files changed, 432 insertions(+), 364 deletions(-) diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c index ff0764dbc5..54075d07e3 100644 --- a/src/backend/utils/adt/json.c +++ b/src/backend/utils/adt/json.c @@ -34,6 +34,13 @@ #include "utils/syscache.h" #include "utils/typcache.h" +#define INSIST(x) \ +do { \ + JsonParseErrorType parse_result; \ + if((parse_result = (x)) != JSON_SUCCESS) \ + return parse_result; \ +} while (0) + /* * The context of the parser is maintained by the recursive descent * mechanism, but is passed explicitly to the error reporting routine @@ -76,19 +83,17 @@ typedef struct JsonAggState Oid val_output_func; } JsonAggState; -static inline void json_lex(JsonLexContext *lex); -static inline void json_lex_string(JsonLexContext *lex); -static inline void json_lex_number(JsonLexContext *lex, char *s, - bool *num_err, int *total_len); -static inline void parse_scalar(JsonLexContext *lex, const JsonSemAction *sem); -static void parse_object_field(JsonLexContext *lex, const JsonSemAction *sem); -static void parse_object(JsonLexContext *lex, const JsonSemAction *sem); -static void parse_array_element(JsonLexContext *lex, const JsonSemAction *sem); -static void parse_array(JsonLexContext *lex, const JsonSemAction *sem); -static void report_parse_error(JsonParseContext ctx, JsonLexContext *lex) pg_attribute_noreturn(); -static void report_invalid_token(JsonLexContext *lex) 
pg_attribute_noreturn(); -static int report_json_context(JsonLexContext *lex); -static char *extract_mb_char(char *s); +static inline JsonTokenType lex_peek(JsonLexContext *lex) __attribute__((warn_unused_result)); +static inline JsonParseErrorType lex_expect(JsonParseContext ctx, JsonLexContext *lex, JsonTokenType token) __attribute__((warn_unused_result)); +static inline JsonParseErrorType json_lex_string(JsonLexContext *lex) __attribute__((warn_unused_result)); +static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s, + bool *num_err, int *total_len) __attribute__((warn_unused_result)); +static inline JsonParseErrorType parse_scalar(JsonLexContext *lex, const JsonSemAction *sem) __attribute__((warn_unused_result)); +static JsonParseErrorType parse_object_field(JsonLexContext *lex, const JsonSemAction *sem) __attribute__((warn_unused_result)); +static JsonParseErrorType parse_object(JsonLexContext *lex, const JsonSemAction *sem) __attribute__((warn_unused_result)); +static JsonParseErrorType parse_array_element(JsonLexContext *lex, const JsonSemAction *sem) __attribute__((warn_unused_result)); +static JsonParseErrorType parse_array(JsonLexContext *lex, const JsonSemAction *sem) __attribute__((warn_unused_result)); +static JsonParseErrorType report_parse_error(JsonParseContext ctx, JsonLexContext *lex) __attribute__((warn_unused_result)); static void composite_to_json(Datum composite, StringInfo result, bool use_line_feeds); static void array_dim_to_json(StringInfo result, int dim, int ndims, int *dims, @@ -126,13 +131,14 @@ lex_peek(JsonLexContext *lex) * move the lexer to the next token if the current look_ahead token matches * the parameter token. Otherwise, report an error. 
*/ -static inline void +static inline JsonParseErrorType lex_expect(JsonParseContext ctx, JsonLexContext *lex, JsonTokenType token) { if (lex_peek(lex) == token) - json_lex(lex); + INSIST(json_lex(lex)); else - report_parse_error(ctx, lex); + return report_parse_error(ctx, lex); + return JSON_SUCCESS; } /* chars to consider as part of an alphanumeric token */ @@ -175,7 +181,8 @@ IsValidJsonNumber(const char *str, int len) dummy_lex.input_length = len; } - json_lex_number(&dummy_lex, dummy_lex.input, &numeric_error, &total_len); + if (JSON_SUCCESS != json_lex_number(&dummy_lex, dummy_lex.input, &numeric_error, &total_len)) + return false; return (!numeric_error) && (total_len == dummy_lex.input_length); } @@ -192,7 +199,7 @@ json_in(PG_FUNCTION_ARGS) /* validate it */ lex = makeJsonLexContext(result, false); - pg_parse_json(lex, &nullSemAction); + pg_parse_json_or_throw(lex, &nullSemAction); /* Internal representation is the same as text, for now */ PG_RETURN_TEXT_P(result); @@ -239,7 +246,7 @@ json_recv(PG_FUNCTION_ARGS) /* Validate it. */ lex = makeJsonLexContextCstringLen(str, nbytes, false); - pg_parse_json(lex, &nullSemAction); + pg_parse_json_or_throw(lex, &nullSemAction); PG_RETURN_TEXT_P(cstring_to_text_with_len(str, nbytes)); } @@ -254,13 +261,13 @@ json_recv(PG_FUNCTION_ARGS) * action routines to be called at appropriate spots during parsing, and a * pointer to a state object to be passed to those routines. 
*/ -void +JsonParseErrorType pg_parse_json(JsonLexContext *lex, const JsonSemAction *sem) { JsonTokenType tok; /* get the initial token */ - json_lex(lex); + INSIST(json_lex(lex)); tok = lex_peek(lex); @@ -268,17 +275,17 @@ pg_parse_json(JsonLexContext *lex, const JsonSemAction *sem) switch (tok) { case JSON_TOKEN_OBJECT_START: - parse_object(lex, sem); + INSIST(parse_object(lex, sem)); break; case JSON_TOKEN_ARRAY_START: - parse_array(lex, sem); + INSIST(parse_array(lex, sem)); break; default: - parse_scalar(lex, sem); /* json can be a bare scalar */ + INSIST(parse_scalar(lex, sem)); /* json can be a bare scalar */ } - lex_expect(JSON_PARSE_END, lex, JSON_TOKEN_END); - + INSIST(lex_expect(JSON_PARSE_END, lex, JSON_TOKEN_END)); + return JSON_SUCCESS; } /* @@ -305,19 +312,20 @@ json_count_array_elements(JsonLexContext *lex) copylex.lex_level++; count = 0; - lex_expect(JSON_PARSE_ARRAY_START, &copylex, JSON_TOKEN_ARRAY_START); + INSIST(lex_expect(JSON_PARSE_ARRAY_START, &copylex, JSON_TOKEN_ARRAY_START)); if (lex_peek(&copylex) != JSON_TOKEN_ARRAY_END) { while (1) { count++; - parse_array_element(&copylex, &nullSemAction); + if (JSON_SUCCESS != parse_array_element(&copylex, &nullSemAction)) + break; if (copylex.token_type != JSON_TOKEN_COMMA) break; - json_lex(&copylex); + INSIST(json_lex(&copylex)); } } - lex_expect(JSON_PARSE_ARRAY_NEXT, &copylex, JSON_TOKEN_ARRAY_END); + INSIST(lex_expect(JSON_PARSE_ARRAY_NEXT, &copylex, JSON_TOKEN_ARRAY_END)); return count; } @@ -331,7 +339,7 @@ json_count_array_elements(JsonLexContext *lex) * - object ( { } ) * - object field */ -static inline void +static inline JsonParseErrorType parse_scalar(JsonLexContext *lex, const JsonSemAction *sem) { char *val = NULL; @@ -348,14 +356,14 @@ parse_scalar(JsonLexContext *lex, const JsonSemAction *sem) case JSON_TOKEN_NULL: break; default: - report_parse_error(JSON_PARSE_VALUE, lex); + return report_parse_error(JSON_PARSE_VALUE, lex); } /* if no semantic function, just consume the token */ if (sfunc == NULL) { - 
json_lex(lex); - return; + INSIST(json_lex(lex)); + return JSON_SUCCESS; } /* extract the de-escaped string value, or the raw lexeme */ @@ -374,13 +382,14 @@ parse_scalar(JsonLexContext *lex, const JsonSemAction *sem) } /* consume the token */ - json_lex(lex); + INSIST(json_lex(lex)); /* invoke the callback */ (*sfunc) (sem->semstate, val, tok); + return JSON_SUCCESS; } -static void +static JsonParseErrorType parse_object_field(JsonLexContext *lex, const JsonSemAction *sem) { /* @@ -396,12 +405,12 @@ parse_object_field(JsonLexContext *lex, const JsonSemAction *sem) JsonTokenType tok; if (lex_peek(lex) != JSON_TOKEN_STRING) - report_parse_error(JSON_PARSE_STRING, lex); + return report_parse_error(JSON_PARSE_STRING, lex); if ((ostart != NULL || oend != NULL) && lex->strval != NULL) fname = pstrdup(lex->strval->data); - json_lex(lex); + INSIST(json_lex(lex)); - lex_expect(JSON_PARSE_OBJECT_LABEL, lex, JSON_TOKEN_COLON); + INSIST(lex_expect(JSON_PARSE_OBJECT_LABEL, lex, JSON_TOKEN_COLON)); tok = lex_peek(lex); isnull = tok == JSON_TOKEN_NULL; @@ -412,20 +421,21 @@ parse_object_field(JsonLexContext *lex, const JsonSemAction *sem) switch (tok) { case JSON_TOKEN_OBJECT_START: - parse_object(lex, sem); + INSIST(parse_object(lex, sem)); break; case JSON_TOKEN_ARRAY_START: - parse_array(lex, sem); + INSIST(parse_array(lex, sem)); break; default: - parse_scalar(lex, sem); + INSIST(parse_scalar(lex, sem)); } if (oend != NULL) (*oend) (sem->semstate, fname, isnull); + return JSON_SUCCESS; } -static void +static JsonParseErrorType parse_object(JsonLexContext *lex, const JsonSemAction *sem) { /* @@ -450,35 +460,36 @@ parse_object(JsonLexContext *lex, const JsonSemAction *sem) lex->lex_level++; Assert(lex_peek(lex) == JSON_TOKEN_OBJECT_START); - json_lex(lex); + INSIST(json_lex(lex)); tok = lex_peek(lex); switch (tok) { case JSON_TOKEN_STRING: - parse_object_field(lex, sem); + INSIST(parse_object_field(lex, sem)); while (lex_peek(lex) == JSON_TOKEN_COMMA) { - json_lex(lex); - 
parse_object_field(lex, sem); + INSIST(json_lex(lex)); + INSIST(parse_object_field(lex, sem)); } break; case JSON_TOKEN_OBJECT_END: break; default: /* case of an invalid initial token inside the object */ - report_parse_error(JSON_PARSE_OBJECT_START, lex); + return report_parse_error(JSON_PARSE_OBJECT_START, lex); } - lex_expect(JSON_PARSE_OBJECT_NEXT, lex, JSON_TOKEN_OBJECT_END); + INSIST(lex_expect(JSON_PARSE_OBJECT_NEXT, lex, JSON_TOKEN_OBJECT_END)); lex->lex_level--; if (oend != NULL) (*oend) (sem->semstate); + return JSON_SUCCESS; } -static void +static JsonParseErrorType parse_array_element(JsonLexContext *lex, const JsonSemAction *sem) { json_aelem_action astart = sem->array_element_start; @@ -496,20 +507,21 @@ parse_array_element(JsonLexContext *lex, const JsonSemAction *sem) switch (tok) { case JSON_TOKEN_OBJECT_START: - parse_object(lex, sem); + INSIST(parse_object(lex, sem)); break; case JSON_TOKEN_ARRAY_START: - parse_array(lex, sem); + INSIST(parse_array(lex, sem)); break; default: - parse_scalar(lex, sem); + INSIST(parse_scalar(lex, sem)); } if (aend != NULL) (*aend) (sem->semstate, isnull); + return JSON_SUCCESS; } -static void +static JsonParseErrorType parse_array(JsonLexContext *lex, const JsonSemAction *sem) { /* @@ -532,31 +544,32 @@ parse_array(JsonLexContext *lex, const JsonSemAction *sem) */ lex->lex_level++; - lex_expect(JSON_PARSE_ARRAY_START, lex, JSON_TOKEN_ARRAY_START); + INSIST(lex_expect(JSON_PARSE_ARRAY_START, lex, JSON_TOKEN_ARRAY_START)); if (lex_peek(lex) != JSON_TOKEN_ARRAY_END) { - parse_array_element(lex, sem); + INSIST(parse_array_element(lex, sem)); while (lex_peek(lex) == JSON_TOKEN_COMMA) { - json_lex(lex); - parse_array_element(lex, sem); + INSIST(json_lex(lex)); + INSIST(parse_array_element(lex, sem)); } } - lex_expect(JSON_PARSE_ARRAY_NEXT, lex, JSON_TOKEN_ARRAY_END); + INSIST(lex_expect(JSON_PARSE_ARRAY_NEXT, lex, JSON_TOKEN_ARRAY_END)); lex->lex_level--; if (aend != NULL) (*aend) (sem->semstate); + return JSON_SUCCESS; 
} /* * Lex one token from the input stream. */ -static inline void +JsonParseErrorType json_lex(JsonLexContext *lex) { char *s; @@ -619,12 +632,12 @@ json_lex(JsonLexContext *lex) break; case '"': /* string */ - json_lex_string(lex); + INSIST(json_lex_string(lex)); lex->token_type = JSON_TOKEN_STRING; break; case '-': /* Negative number. */ - json_lex_number(lex, s + 1, NULL, NULL); + INSIST(json_lex_number(lex, s + 1, NULL, NULL)); lex->token_type = JSON_TOKEN_NUMBER; break; case '0': @@ -638,7 +651,7 @@ json_lex(JsonLexContext *lex) case '8': case '9': /* Positive number. */ - json_lex_number(lex, s, NULL, NULL); + INSIST(json_lex_number(lex, s, NULL, NULL)); lex->token_type = JSON_TOKEN_NUMBER; break; default: @@ -666,7 +679,7 @@ json_lex(JsonLexContext *lex) { lex->prev_token_terminator = lex->token_terminator; lex->token_terminator = s + 1; - report_invalid_token(lex); + return JSON_INVALID_TOKEN; } /* @@ -683,21 +696,22 @@ json_lex(JsonLexContext *lex) else if (memcmp(s, "null", 4) == 0) lex->token_type = JSON_TOKEN_NULL; else - report_invalid_token(lex); + return JSON_INVALID_TOKEN; } else if (p - s == 5 && memcmp(s, "false", 5) == 0) lex->token_type = JSON_TOKEN_FALSE; else - report_invalid_token(lex); + return JSON_INVALID_TOKEN; } } /* end of switch */ + return JSON_SUCCESS; } /* * The next token in the input stream is known to be a string; lex it. */ -static inline void +static inline JsonParseErrorType json_lex_string(JsonLexContext *lex) { char *s; @@ -718,7 +732,7 @@ json_lex_string(JsonLexContext *lex) if (len >= lex->input_length) { lex->token_terminator = s; - report_invalid_token(lex); + return JSON_INVALID_TOKEN; } else if (*s == '"') break; @@ -727,12 +741,7 @@ json_lex_string(JsonLexContext *lex) /* Per RFC4627, these characters MUST be escaped. 
*/ /* Since *s isn't printable, exclude it from the context string */ lex->token_terminator = s; - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Character with value 0x%02x must be escaped.", - (unsigned char) *s), - report_json_context(lex))); + return JSON_ESCAPING_REQUIRED; } else if (*s == '\\') { @@ -742,7 +751,7 @@ json_lex_string(JsonLexContext *lex) if (len >= lex->input_length) { lex->token_terminator = s; - report_invalid_token(lex); + return JSON_INVALID_TOKEN; } else if (*s == 'u') { @@ -756,7 +765,7 @@ json_lex_string(JsonLexContext *lex) if (len >= lex->input_length) { lex->token_terminator = s; - report_invalid_token(lex); + return JSON_INVALID_TOKEN; } else if (*s >= '0' && *s <= '9') ch = (ch * 16) + (*s - '0'); @@ -767,12 +776,7 @@ json_lex_string(JsonLexContext *lex) else { lex->token_terminator = s + pg_mblen(s); - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", - "json"), - errdetail("\"\\u\" must be followed by four hexadecimal digits."), - report_json_context(lex))); + return JSON_UNICODE_ESCAPE_FORMAT; } } if (lex->strval != NULL) @@ -783,33 +787,20 @@ json_lex_string(JsonLexContext *lex) if (ch >= 0xd800 && ch <= 0xdbff) { if (hi_surrogate != -1) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", - "json"), - errdetail("Unicode high surrogate must not follow a high surrogate."), - report_json_context(lex))); + return JSON_UNICODE_HIGH_SURROGATE; hi_surrogate = (ch & 0x3ff) << 10; continue; } else if (ch >= 0xdc00 && ch <= 0xdfff) { if (hi_surrogate == -1) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Unicode low surrogate must follow a high surrogate."), - report_json_context(lex))); + return JSON_UNICODE_LOW_SURROGATE; ch = 0x10000 + hi_surrogate + (ch 
& 0x3ff); hi_surrogate = -1; } if (hi_surrogate != -1) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Unicode low surrogate must follow a high surrogate."), - report_json_context(lex))); + return JSON_UNICODE_LOW_SURROGATE; /* * For UTF8, replace the escape sequence by the actual @@ -821,11 +812,7 @@ json_lex_string(JsonLexContext *lex) if (ch == 0) { /* We can't allow this, since our TEXT type doesn't */ - ereport(ERROR, - (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), - errmsg("unsupported Unicode escape sequence"), - errdetail("\\u0000 cannot be converted to text."), - report_json_context(lex))); + return JSON_UNICODE_CODE_POINT_ZERO; } else if (GetDatabaseEncoding() == PG_UTF8) { @@ -843,25 +830,14 @@ json_lex_string(JsonLexContext *lex) appendStringInfoChar(lex->strval, (char) ch); } else - { - ereport(ERROR, - (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), - errmsg("unsupported Unicode escape sequence"), - errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."), - report_json_context(lex))); - } + return JSON_UNICODE_HIGH_ESCAPE; } } else if (lex->strval != NULL) { if (hi_surrogate != -1) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", - "json"), - errdetail("Unicode low surrogate must follow a high surrogate."), - report_json_context(lex))); + return JSON_UNICODE_LOW_SURROGATE; switch (*s) { @@ -888,13 +864,7 @@ json_lex_string(JsonLexContext *lex) default: /* Not a valid string escape, so error out. 
*/ lex->token_terminator = s + pg_mblen(s); - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", - "json"), - errdetail("Escape sequence \"\\%s\" is invalid.", - extract_mb_char(s)), - report_json_context(lex))); + return JSON_ESCAPING_INVALID; } } else if (strchr("\"\\/bfnrt", *s) == NULL) @@ -907,39 +877,26 @@ json_lex_string(JsonLexContext *lex) * shown it's not a performance win. */ lex->token_terminator = s + pg_mblen(s); - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Escape sequence \"\\%s\" is invalid.", - extract_mb_char(s)), - report_json_context(lex))); + return JSON_ESCAPING_INVALID; } } else if (lex->strval != NULL) { if (hi_surrogate != -1) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Unicode low surrogate must follow a high surrogate."), - report_json_context(lex))); - + return JSON_UNICODE_LOW_SURROGATE; appendStringInfoChar(lex->strval, *s); } } if (hi_surrogate != -1) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Unicode low surrogate must follow a high surrogate."), - report_json_context(lex))); + return JSON_UNICODE_LOW_SURROGATE; /* Hooray, we found the end of the string! */ lex->prev_token_terminator = lex->token_terminator; lex->token_terminator = s + 1; + return JSON_SUCCESS; } /* @@ -970,7 +927,7 @@ json_lex_string(JsonLexContext *lex) * raising an error for a badly-formed number. Also, if total_len is not NULL * the distance from lex->input to the token end+1 is returned to *total_len. 
*/ -static inline void +static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s, bool *num_err, int *total_len) { @@ -1059,8 +1016,9 @@ json_lex_number(JsonLexContext *lex, char *s, lex->token_terminator = s; /* handle error if any */ if (error) - report_invalid_token(lex); + return JSON_INVALID_TOKEN; } + return JSON_SUCCESS; } /* @@ -1068,219 +1026,36 @@ json_lex_number(JsonLexContext *lex, char *s, * * lex->token_start and lex->token_terminator must identify the current token. */ -static void +static JsonParseErrorType report_parse_error(JsonParseContext ctx, JsonLexContext *lex) { - char *token; - int toklen; - /* Handle case where the input ended prematurely. */ if (lex->token_start == NULL || lex->token_type == JSON_TOKEN_END) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("The input string ended unexpectedly."), - report_json_context(lex))); - - /* Separate out the current token. */ - toklen = lex->token_terminator - lex->token_start; - token = palloc(toklen + 1); - memcpy(token, lex->token_start, toklen); - token[toklen] = '\0'; - - /* Complain, with the appropriate detail message. 
*/ - if (ctx == JSON_PARSE_END) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Expected end of input, but found \"%s\".", - token), - report_json_context(lex))); - else - { - switch (ctx) - { - case JSON_PARSE_VALUE: - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Expected JSON value, but found \"%s\".", - token), - report_json_context(lex))); - break; - case JSON_PARSE_STRING: - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Expected string, but found \"%s\".", - token), - report_json_context(lex))); - break; - case JSON_PARSE_ARRAY_START: - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Expected array element or \"]\", but found \"%s\".", - token), - report_json_context(lex))); - break; - case JSON_PARSE_ARRAY_NEXT: - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Expected \",\" or \"]\", but found \"%s\".", - token), - report_json_context(lex))); - break; - case JSON_PARSE_OBJECT_START: - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Expected string or \"}\", but found \"%s\".", - token), - report_json_context(lex))); - break; - case JSON_PARSE_OBJECT_LABEL: - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Expected \":\", but found \"%s\".", - token), - report_json_context(lex))); - break; - case JSON_PARSE_OBJECT_NEXT: - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Expected \",\" or \"}\", but 
found \"%s\".", - token), - report_json_context(lex))); - break; - case JSON_PARSE_OBJECT_COMMA: - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Expected string, but found \"%s\".", - token), - report_json_context(lex))); - break; - default: - elog(ERROR, "unexpected json parse state: %d", ctx); - } - } -} - -/* - * Report an invalid input token. - * - * lex->token_start and lex->token_terminator must identify the token. - */ -static void -report_invalid_token(JsonLexContext *lex) -{ - char *token; - int toklen; - - /* Separate out the offending token. */ - toklen = lex->token_terminator - lex->token_start; - token = palloc(toklen + 1); - memcpy(token, lex->token_start, toklen); - token[toklen] = '\0'; - - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "json"), - errdetail("Token \"%s\" is invalid.", token), - report_json_context(lex))); -} + return JSON_EXPECTED_MORE; -/* - * Report a CONTEXT line for bogus JSON input. - * - * lex->token_terminator must be set to identify the spot where we detected - * the error. Note that lex->token_start might be NULL, in case we recognized - * error at EOF. - * - * The return value isn't meaningful, but we make it non-void so that this - * can be invoked inside ereport(). 
- */ -static int -report_json_context(JsonLexContext *lex) -{ - const char *context_start; - const char *context_end; - const char *line_start; - int line_number; - char *ctxt; - int ctxtlen; - const char *prefix; - const char *suffix; - - /* Choose boundaries for the part of the input we will display */ - context_start = lex->input; - context_end = lex->token_terminator; - line_start = context_start; - line_number = 1; - for (;;) + switch (ctx) { - /* Always advance over newlines */ - if (context_start < context_end && *context_start == '\n') - { - context_start++; - line_start = context_start; - line_number++; - continue; - } - /* Otherwise, done as soon as we are close enough to context_end */ - if (context_end - context_start < 50) - break; - /* Advance to next multibyte character */ - if (IS_HIGHBIT_SET(*context_start)) - context_start += pg_mblen(context_start); - else - context_start++; + case JSON_PARSE_END: + return JSON_EXPECTED_END; + case JSON_PARSE_VALUE: + return JSON_EXPECTED_JSON; + case JSON_PARSE_STRING: + return JSON_EXPECTED_STRING; + case JSON_PARSE_ARRAY_START: + return JSON_EXPECTED_ARRAY_FIRST; + case JSON_PARSE_ARRAY_NEXT: + return JSON_EXPECTED_ARRAY_NEXT; + case JSON_PARSE_OBJECT_START: + return JSON_EXPECTED_OBJECT_FIRST; + case JSON_PARSE_OBJECT_LABEL: + return JSON_EXPECTED_COLON; + case JSON_PARSE_OBJECT_NEXT: + return JSON_EXPECTED_OBJECT_NEXT; + case JSON_PARSE_OBJECT_COMMA: + return JSON_EXPECTED_STRING; + default: + return JSON_BAD_PARSER_STATE; } - - /* - * We add "..." to indicate that the excerpt doesn't start at the - * beginning of the line ... but if we're within 3 characters of the - * beginning of the line, we might as well just show the whole line. 
- */ - if (context_start - line_start <= 3) - context_start = line_start; - - /* Get a null-terminated copy of the data to present */ - ctxtlen = context_end - context_start; - ctxt = palloc(ctxtlen + 1); - memcpy(ctxt, context_start, ctxtlen); - ctxt[ctxtlen] = '\0'; - - /* - * Show the context, prefixing "..." if not starting at start of line, and - * suffixing "..." if not ending at end of line. - */ - prefix = (context_start > line_start) ? "..." : ""; - suffix = (lex->token_type != JSON_TOKEN_END && context_end - lex->input < lex->input_length && *context_end != '\n' && *context_end != '\r') ? "..." : ""; - - return errcontext("JSON data, line %d: %s%s%s", - line_number, prefix, ctxt, suffix); -} - -/* - * Extract a single, possibly multi-byte char from the input string. - */ -static char * -extract_mb_char(char *s) -{ - char *res; - int len; - - len = pg_mblen(s); - res = palloc(len + 1); - memcpy(res, s, len); - res[len] = '\0'; - - return res; } /* @@ -2492,7 +2267,7 @@ json_typeof(PG_FUNCTION_ARGS) lex = makeJsonLexContext(json, false); /* Lex exactly one token from the input and check its type. 
*/ - json_lex(lex); + json_lex_or_throw(lex); tok = lex_peek(lex); switch (tok) { diff --git a/src/backend/utils/adt/jsonb.c b/src/backend/utils/adt/jsonb.c index c95e112184..63072f616e 100644 --- a/src/backend/utils/adt/jsonb.c +++ b/src/backend/utils/adt/jsonb.c @@ -272,7 +272,7 @@ jsonb_from_cstring(char *json, int len) sem.scalar = jsonb_in_scalar; sem.object_field_start = jsonb_in_object_field_start; - pg_parse_json(lex, &sem); + pg_parse_json_or_throw(lex, &sem); /* after parsing, the item member has the composed jsonb structure */ PG_RETURN_POINTER(JsonbValueToJsonb(state.res)); @@ -860,8 +860,7 @@ datum_to_jsonb(Datum val, bool is_null, JsonbInState *result, sem.scalar = jsonb_in_scalar; sem.object_field_start = jsonb_in_object_field_start; - pg_parse_json(lex, &sem); - + pg_parse_json_or_throw(lex, &sem); } break; case JSONBTYPE_JSONB: diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index 3979145ecc..be5d30239d 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -30,6 +30,7 @@ #include "utils/json.h" #include "utils/jsonb.h" #include "utils/lsyscache.h" +#include "utils/mbutils.h" #include "utils/memutils.h" #include "utils/syscache.h" #include "utils/typcache.h" @@ -328,6 +329,9 @@ typedef struct JsObject hash_destroy((jso)->val.json_hash); \ } while (0) +/* functions for json parsing */ +static char *extract_mb_char(char *s); + /* semantic action functions for json_object_keys */ static void okeys_object_field_start(void *state, char *fname, bool isnull); static void okeys_array_start(void *state); @@ -483,6 +487,23 @@ static void transform_string_values_object_field_start(void *state, char *fname, static void transform_string_values_array_element_start(void *state, bool isnull); static void transform_string_values_scalar(void *state, char *token, JsonTokenType tokentype); +/* + * Extract a single, possibly multi-byte char from the input string. 
+ */ +static char * +extract_mb_char(char *s) +{ + char *res; + int len; + + len = pg_mblen(s); + res = palloc(len + 1); + memcpy(res, s, len); + res[len] = '\0'; + + return res; +} + /* * makeJsonLexContext * @@ -625,7 +646,7 @@ json_object_keys(PG_FUNCTION_ARGS) sem->object_field_start = okeys_object_field_start; /* remainder are all NULL, courtesy of palloc0 above */ - pg_parse_json(lex, sem); + pg_parse_json_or_throw(lex, sem); /* keys are now in state->result */ pfree(lex->strval->data); @@ -656,6 +677,78 @@ json_object_keys(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +/* + * Report a CONTEXT line for bogus JSON input. + * + * lex->token_terminator must be set to identify the spot where we detected + * the error. Note that lex->token_start might be NULL, in case we recognized + * error at EOF. + * + * The return value isn't meaningful, but we make it non-void so that this + * can be invoked inside ereport(). + */ +int +report_json_context(JsonLexContext *lex) +{ + const char *context_start; + const char *context_end; + const char *line_start; + int line_number; + char *ctxt; + int ctxtlen; + const char *prefix; + const char *suffix; + + /* Choose boundaries for the part of the input we will display */ + context_start = lex->input; + context_end = lex->token_terminator; + line_start = context_start; + line_number = 1; + for (;;) + { + /* Always advance over newlines */ + if (context_start < context_end && *context_start == '\n') + { + context_start++; + line_start = context_start; + line_number++; + continue; + } + /* Otherwise, done as soon as we are close enough to context_end */ + if (context_end - context_start < 50) + break; + /* Advance to next multibyte character */ + if (IS_HIGHBIT_SET(*context_start)) + context_start += pg_mblen(context_start); + else + context_start++; + } + + /* + * We add "..." to indicate that the excerpt doesn't start at the + * beginning of the line ... 
but if we're within 3 characters of the + * beginning of the line, we might as well just show the whole line. + */ + if (context_start - line_start <= 3) + context_start = line_start; + + /* Get a null-terminated copy of the data to present */ + ctxtlen = context_end - context_start; + ctxt = palloc(ctxtlen + 1); + memcpy(ctxt, context_start, ctxtlen); + ctxt[ctxtlen] = '\0'; + + /* + * Show the context, prefixing "..." if not starting at start of line, and + * suffixing "..." if not ending at end of line. + */ + prefix = (context_start > line_start) ? "..." : ""; + suffix = (lex->token_type != JSON_TOKEN_END && context_end - lex->input < lex->input_length && *context_end != '\n' && *context_end != '\r') ? "..." : ""; + + return errcontext("JSON data, line %d: %s%s%s", + line_number, prefix, ctxt, suffix); +} + static void okeys_object_field_start(void *state, char *fname, bool isnull) { @@ -1019,7 +1112,7 @@ get_worker(text *json, sem->array_element_end = get_array_element_end; } - pg_parse_json(lex, sem); + pg_parse_json_or_throw(lex, sem); return state->tresult; } @@ -1567,7 +1660,7 @@ json_array_length(PG_FUNCTION_ARGS) sem->scalar = alen_scalar; sem->array_element_start = alen_array_element_start; - pg_parse_json(lex, sem); + pg_parse_json_or_throw(lex, sem); PG_RETURN_INT32(state->count); } @@ -1662,6 +1755,145 @@ jsonb_each_text(PG_FUNCTION_ARGS) return each_worker_jsonb(fcinfo, "jsonb_each_text", true); } +static char * +extract_token(JsonLexContext *lex) +{ + int toklen = lex->token_terminator - lex->token_start; + char *token = palloc(toklen + 1); + + memcpy(token, lex->token_start, toklen); + token[toklen] = '\0'; + return token; +} + +void +throw_json_parse_error(JsonParseErrorType error, JsonLexContext *lex) +{ + switch (error) + { + case JSON_SUCCESS: + elog(ERROR, "internal error in json parser"); + break; + case JSON_ESCAPING_INVALID: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", 
"json"), + errdetail("Escape sequence \"\\%s\" is invalid.", + extract_mb_char(lex->token_terminator - 1)), // XXX WRONG AND BUSTED + report_json_context(lex))); + case JSON_ESCAPING_REQUIRED: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("Character with value 0x%02x must be escaped.", + (unsigned char) *(lex->token_terminator)), + report_json_context(lex))); + case JSON_EXPECTED_END: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("Expected end of input, but found \"%s\".", + extract_token(lex)), + report_json_context(lex))); + case JSON_EXPECTED_ARRAY_FIRST: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("Expected array element or \"]\", but found \"%s\".", + extract_token(lex)), + report_json_context(lex))); + case JSON_EXPECTED_ARRAY_NEXT: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("Expected \",\" or \"]\", but found \"%s\".", + extract_token(lex)), + report_json_context(lex))); + case JSON_EXPECTED_COLON: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("Expected \":\", but found \"%s\".", + extract_token(lex)), + report_json_context(lex))); + case JSON_EXPECTED_JSON: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("Expected JSON value, but found \"%s\".", + extract_token(lex)), + report_json_context(lex))); + case JSON_EXPECTED_MORE: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("The input string ended unexpectedly."), + report_json_context(lex))); + case 
JSON_EXPECTED_OBJECT_FIRST: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("Expected string or \"}\", but found \"%s\".", + extract_token(lex)), + report_json_context(lex))); + case JSON_EXPECTED_OBJECT_NEXT: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("Expected \",\" or \"}\", but found \"%s\".", + extract_token(lex)), + report_json_context(lex))); + case JSON_EXPECTED_STRING: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("Expected string, but found \"%s\".", + extract_token(lex)), + report_json_context(lex))); + case JSON_INVALID_TOKEN: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("Token \"%s\" is invalid.", extract_token(lex)), + report_json_context(lex))); + case JSON_UNICODE_CODE_POINT_ZERO: + ereport(ERROR, + (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), + errmsg("unsupported Unicode escape sequence"), + errdetail("\\u0000 cannot be converted to text."), + report_json_context(lex))); + case JSON_UNICODE_ESCAPE_FORMAT: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("\"\\u\" must be followed by four hexadecimal digits."), + report_json_context(lex))); + case JSON_UNICODE_HIGH_ESCAPE: + ereport(ERROR, + (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), + errmsg("unsupported Unicode escape sequence"), + errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."), + report_json_context(lex))); + case JSON_UNICODE_HIGH_SURROGATE: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("Unicode high surrogate must 
not follow a high surrogate."), + report_json_context(lex))); + case JSON_UNICODE_LOW_SURROGATE: + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "json"), + errdetail("Unicode low surrogate must follow a high surrogate."), + report_json_context(lex))); + case JSON_BAD_PARSER_STATE: + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("json parser encountered an internally inconsistent state"), + report_json_context(lex))); /* TODO: Is it safe to call report_json_context here? Perhaps the bad parser state causes problems? */ + } +} + static Datum each_worker_jsonb(FunctionCallInfo fcinfo, const char *funcname, bool as_text) { @@ -1833,7 +2065,7 @@ each_worker(FunctionCallInfo fcinfo, bool as_text) "json_each temporary cxt", ALLOCSET_DEFAULT_SIZES); - pg_parse_json(lex, sem); + pg_parse_json_or_throw(lex, sem); MemoryContextDelete(state->tmp_cxt); @@ -2132,7 +2364,7 @@ elements_worker(FunctionCallInfo fcinfo, const char *funcname, bool as_text) "json_array_elements temporary cxt", ALLOCSET_DEFAULT_SIZES); - pg_parse_json(lex, sem); + pg_parse_json_or_throw(lex, sem); MemoryContextDelete(state->tmp_cxt); @@ -2504,7 +2736,7 @@ populate_array_json(PopulateArrayContext *ctx, char *json, int len) sem.array_element_end = populate_array_element_end; sem.scalar = populate_array_scalar; - pg_parse_json(state.lex, &sem); + pg_parse_json_or_throw(state.lex, &sem); /* number of dimensions should be already known */ Assert(ctx->ndims > 0 && ctx->dims); @@ -3361,7 +3593,7 @@ get_json_object_as_hash(char *json, int len, const char *funcname) sem->object_field_start = hash_object_field_start; sem->object_field_end = hash_object_field_end; - pg_parse_json(lex, sem); + pg_parse_json_or_throw(lex, sem); return tab; } @@ -3660,7 +3892,7 @@ populate_recordset_worker(FunctionCallInfo fcinfo, const char *funcname, state->lex = lex; - pg_parse_json(lex, sem); + pg_parse_json_or_throw(lex, sem); } else { @@ -3990,7 
+4222,7 @@ json_strip_nulls(PG_FUNCTION_ARGS) sem->array_element_start = sn_array_element_start; sem->object_field_start = sn_object_field_start; - pg_parse_json(lex, sem); + pg_parse_json_or_throw(lex, sem); PG_RETURN_TEXT_P(cstring_to_text_with_len(state->strval->data, state->strval->len)); @@ -5129,7 +5361,7 @@ iterate_json_values(text *json, uint32 flags, void *action_state, sem->scalar = iterate_values_scalar; sem->object_field_start = iterate_values_object_field_start; - pg_parse_json(lex, sem); + pg_parse_json_or_throw(lex, sem); } /* @@ -5249,7 +5481,7 @@ transform_json_string_values(text *json, void *action_state, sem->array_element_start = transform_string_values_array_element_start; sem->object_field_start = transform_string_values_object_field_start; - pg_parse_json(lex, sem); + pg_parse_json_or_throw(lex, sem); return cstring_to_text_with_len(state->strval->data, state->strval->len); } diff --git a/src/include/common/jsonapi.h b/src/include/common/jsonapi.h index 581fd48036..162437193a 100644 --- a/src/include/common/jsonapi.h +++ b/src/include/common/jsonapi.h @@ -33,6 +33,28 @@ typedef enum JSON_TOKEN_END } JsonTokenType; +typedef enum +{ + JSON_SUCCESS = 0, + JSON_ESCAPING_INVALID, + JSON_ESCAPING_REQUIRED, + JSON_EXPECTED_ARRAY_FIRST, + JSON_EXPECTED_ARRAY_NEXT, + JSON_EXPECTED_COLON, + JSON_EXPECTED_END, + JSON_EXPECTED_JSON, + JSON_EXPECTED_MORE, + JSON_EXPECTED_OBJECT_FIRST, + JSON_EXPECTED_OBJECT_NEXT, + JSON_EXPECTED_STRING, + JSON_INVALID_TOKEN, + JSON_UNICODE_CODE_POINT_ZERO, + JSON_UNICODE_ESCAPE_FORMAT, + JSON_UNICODE_HIGH_ESCAPE, + JSON_UNICODE_HIGH_SURROGATE, + JSON_UNICODE_LOW_SURROGATE, + JSON_BAD_PARSER_STATE +} JsonParseErrorType; /* * All the fields in this structure should be treated as read-only. @@ -104,7 +126,13 @@ extern const JsonSemAction nullSemAction; * points to. If the action pointers are NULL the parser * does nothing and just continues. 
*/ -extern void pg_parse_json(JsonLexContext *lex, const JsonSemAction *sem); +extern JsonParseErrorType pg_parse_json(JsonLexContext *lex, const JsonSemAction *sem) __attribute__((warn_unused_result)); + +/* + * Lex one token from the input stream. + */ +extern JsonParseErrorType json_lex(JsonLexContext *lex) __attribute__((warn_unused_result)); + /* * json_count_array_elements performs a fast secondary parse to determine the diff --git a/src/include/utils/jsonfuncs.h b/src/include/utils/jsonfuncs.h index bade7248f9..82a56eaf06 100644 --- a/src/include/utils/jsonfuncs.h +++ b/src/include/utils/jsonfuncs.h @@ -25,5 +25,39 @@ extern char *JsonEncodeDateTime(char *buf, Datum value, Oid typid, extern text *transform_json_string_values(text *json, void *action_state, JsonTransformStringValuesAction transform_action); extern JsonLexContext *makeJsonLexContext(text *json, bool need_escapes); +extern int report_json_context(JsonLexContext *lex); + +extern void throw_json_parse_error(JsonParseErrorType error, JsonLexContext *lex) pg_attribute_noreturn(); + +static inline void pg_parse_json_or_throw(JsonLexContext *lex, const JsonSemAction *sem); +static inline void json_lex_or_throw(JsonLexContext *lex); + +#define PARSE_OR_THROW(x, lex) \ +do { \ + JsonParseErrorType parse_result; \ + if ((parse_result = (x)) != JSON_SUCCESS) \ + throw_json_parse_error(parse_result, (lex)); \ +} while (0) + +/* + * pg_parse_json will parse the string in the lex calling the + * action functions in sem at the appropriate points. It is + * up to them to keep what state they need in semstate. If they + * need access to the state of the lexer, then its pointer + * should be passed to them as a member of whatever semstate + * points to. If the action pointers are NULL the parser + * does nothing and just continues. 
+ */ +static inline void +pg_parse_json_or_throw(JsonLexContext *lex, const JsonSemAction *sem) +{ + PARSE_OR_THROW(pg_parse_json(lex, sem), lex); +} + +static inline void +json_lex_or_throw(JsonLexContext *lex) +{ + PARSE_OR_THROW(json_lex(lex), lex); +} #endif /* JSONFUNCS_H */ -- 2.21.1 (Apple Git-122.3)