Browse Source
[3.10] bpo-46339: Fix crash in the parser when computing error text for multi-line f-strings (GH-30529) (GH-30542)
[3.10] bpo-46339: Fix crash in the parser when computing error text for multi-line f-strings (GH-30529) (GH-30542)
* bpo-46339: Fix crash in the parser when computing error text for multi-line f-strings (GH-30529)
Automerge-Triggered-By: GH:pablogsal
(cherry picked from commit cedec19be8
)
Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>
* Fix interactive mode
Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>
pull/30726/head
committed by
GitHub
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 19 additions and 427 deletions
-
6Lib/test/test_exceptions.py
-
3Misc/NEWS.d/next/Core and Builtins/2022-01-11-11-50-19.bpo-46339.OVumDZ.rst
-
12Parser/pegen.c
-
425Parser/pegen_errors.c
@ -0,0 +1,3 @@ |
|||
Fix a crash in the parser when retrieving the error text for multi-line |
|||
f-string expressions that do not start on the first line of the string. |
|||
Patch by Pablo Galindo |
@ -1,425 +0,0 @@ |
|||
#include <Python.h> |
|||
#include <errcode.h> |
|||
|
|||
#include "tokenizer.h" |
|||
#include "pegen.h" |
|||
|
|||
// TOKENIZER ERRORS |
|||
|
|||
void |
|||
_PyPegen_raise_tokenizer_init_error(PyObject *filename) |
|||
{ |
|||
if (!(PyErr_ExceptionMatches(PyExc_LookupError) |
|||
|| PyErr_ExceptionMatches(PyExc_SyntaxError) |
|||
|| PyErr_ExceptionMatches(PyExc_ValueError) |
|||
|| PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) { |
|||
return; |
|||
} |
|||
PyObject *errstr = NULL; |
|||
PyObject *tuple = NULL; |
|||
PyObject *type; |
|||
PyObject *value; |
|||
PyObject *tback; |
|||
PyErr_Fetch(&type, &value, &tback); |
|||
errstr = PyObject_Str(value); |
|||
if (!errstr) { |
|||
goto error; |
|||
} |
|||
|
|||
PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); |
|||
if (!tmp) { |
|||
goto error; |
|||
} |
|||
|
|||
tuple = PyTuple_Pack(2, errstr, tmp); |
|||
Py_DECREF(tmp); |
|||
if (!value) { |
|||
goto error; |
|||
} |
|||
PyErr_SetObject(PyExc_SyntaxError, tuple); |
|||
|
|||
error: |
|||
Py_XDECREF(type); |
|||
Py_XDECREF(value); |
|||
Py_XDECREF(tback); |
|||
Py_XDECREF(errstr); |
|||
Py_XDECREF(tuple); |
|||
} |
|||
|
|||
static inline void |
|||
raise_unclosed_parentheses_error(Parser *p) { |
|||
int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
|||
int error_col = p->tok->parencolstack[p->tok->level-1]; |
|||
RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, |
|||
error_lineno, error_col, error_lineno, -1, |
|||
"'%c' was never closed", |
|||
p->tok->parenstack[p->tok->level-1]); |
|||
} |
|||
|
|||
int |
|||
_Pypegen_tokenizer_error(Parser *p) |
|||
{ |
|||
if (PyErr_Occurred()) { |
|||
return -1; |
|||
} |
|||
|
|||
const char *msg = NULL; |
|||
PyObject* errtype = PyExc_SyntaxError; |
|||
Py_ssize_t col_offset = -1; |
|||
switch (p->tok->done) { |
|||
case E_TOKEN: |
|||
msg = "invalid token"; |
|||
break; |
|||
case E_EOF: |
|||
if (p->tok->level) { |
|||
raise_unclosed_parentheses_error(p); |
|||
} else { |
|||
RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); |
|||
} |
|||
return -1; |
|||
case E_DEDENT: |
|||
RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level"); |
|||
return -1; |
|||
case E_INTR: |
|||
if (!PyErr_Occurred()) { |
|||
PyErr_SetNone(PyExc_KeyboardInterrupt); |
|||
} |
|||
return -1; |
|||
case E_NOMEM: |
|||
PyErr_NoMemory(); |
|||
return -1; |
|||
case E_TABSPACE: |
|||
errtype = PyExc_TabError; |
|||
msg = "inconsistent use of tabs and spaces in indentation"; |
|||
break; |
|||
case E_TOODEEP: |
|||
errtype = PyExc_IndentationError; |
|||
msg = "too many levels of indentation"; |
|||
break; |
|||
case E_LINECONT: { |
|||
col_offset = p->tok->cur - p->tok->buf - 1; |
|||
msg = "unexpected character after line continuation character"; |
|||
break; |
|||
} |
|||
default: |
|||
msg = "unknown parsing error"; |
|||
} |
|||
|
|||
RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno, |
|||
col_offset >= 0 ? col_offset : 0, |
|||
p->tok->lineno, -1, msg); |
|||
return -1; |
|||
} |
|||
|
|||
int |
|||
_Pypegen_raise_decode_error(Parser *p) |
|||
{ |
|||
assert(PyErr_Occurred()); |
|||
const char *errtype = NULL; |
|||
if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { |
|||
errtype = "unicode error"; |
|||
} |
|||
else if (PyErr_ExceptionMatches(PyExc_ValueError)) { |
|||
errtype = "value error"; |
|||
} |
|||
if (errtype) { |
|||
PyObject *type; |
|||
PyObject *value; |
|||
PyObject *tback; |
|||
PyObject *errstr; |
|||
PyErr_Fetch(&type, &value, &tback); |
|||
errstr = PyObject_Str(value); |
|||
if (errstr) { |
|||
RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr); |
|||
Py_DECREF(errstr); |
|||
} |
|||
else { |
|||
PyErr_Clear(); |
|||
RAISE_SYNTAX_ERROR("(%s) unknown error", errtype); |
|||
} |
|||
Py_XDECREF(type); |
|||
Py_XDECREF(value); |
|||
Py_XDECREF(tback); |
|||
} |
|||
|
|||
return -1; |
|||
} |
|||
|
|||
static int |
|||
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { |
|||
// Tokenize the whole input to see if there are any tokenization |
|||
// errors such as mistmatching parentheses. These will get priority |
|||
// over generic syntax errors only if the line number of the error is |
|||
// before the one that we had for the generic error. |
|||
|
|||
// We don't want to tokenize to the end for interactive input |
|||
if (p->tok->prompt != NULL) { |
|||
return 0; |
|||
} |
|||
|
|||
PyObject *type, *value, *traceback; |
|||
PyErr_Fetch(&type, &value, &traceback); |
|||
|
|||
Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; |
|||
Py_ssize_t current_err_line = current_token->lineno; |
|||
|
|||
int ret = 0; |
|||
|
|||
for (;;) { |
|||
const char *start; |
|||
const char *end; |
|||
switch (_PyTokenizer_Get(p->tok, &start, &end)) { |
|||
case ERRORTOKEN: |
|||
if (p->tok->level != 0) { |
|||
int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
|||
if (current_err_line > error_lineno) { |
|||
raise_unclosed_parentheses_error(p); |
|||
ret = -1; |
|||
goto exit; |
|||
} |
|||
} |
|||
break; |
|||
case ENDMARKER: |
|||
break; |
|||
default: |
|||
continue; |
|||
} |
|||
break; |
|||
} |
|||
|
|||
|
|||
exit: |
|||
if (PyErr_Occurred()) { |
|||
Py_XDECREF(value); |
|||
Py_XDECREF(type); |
|||
Py_XDECREF(traceback); |
|||
} else { |
|||
PyErr_Restore(type, value, traceback); |
|||
} |
|||
return ret; |
|||
} |
|||
|
|||
// PARSER ERRORS |
|||
|
|||
void * |
|||
_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) |
|||
{ |
|||
if (p->fill == 0) { |
|||
va_list va; |
|||
va_start(va, errmsg); |
|||
_PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va); |
|||
va_end(va); |
|||
return NULL; |
|||
} |
|||
|
|||
Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; |
|||
Py_ssize_t col_offset; |
|||
Py_ssize_t end_col_offset = -1; |
|||
if (t->col_offset == -1) { |
|||
if (p->tok->cur == p->tok->buf) { |
|||
col_offset = 0; |
|||
} else { |
|||
const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf; |
|||
col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int); |
|||
} |
|||
} else { |
|||
col_offset = t->col_offset + 1; |
|||
} |
|||
|
|||
if (t->end_col_offset != -1) { |
|||
end_col_offset = t->end_col_offset + 1; |
|||
} |
|||
|
|||
va_list va; |
|||
va_start(va, errmsg); |
|||
_PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va); |
|||
va_end(va); |
|||
|
|||
return NULL; |
|||
} |
|||
|
|||
static PyObject * |
|||
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno) |
|||
{ |
|||
/* If the file descriptor is interactive, the source lines of the current |
|||
* (multi-line) statement are stored in p->tok->interactive_src_start. |
|||
* If not, we're parsing from a string, which means that the whole source |
|||
* is stored in p->tok->str. */ |
|||
assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin); |
|||
|
|||
char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str; |
|||
assert(cur_line != NULL); |
|||
|
|||
for (int i = 0; i < lineno - 1; i++) { |
|||
cur_line = strchr(cur_line, '\n') + 1; |
|||
} |
|||
|
|||
char *next_newline; |
|||
if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line |
|||
next_newline = cur_line + strlen(cur_line); |
|||
} |
|||
return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace"); |
|||
} |
|||
|
|||
void * |
|||
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, |
|||
Py_ssize_t lineno, Py_ssize_t col_offset, |
|||
Py_ssize_t end_lineno, Py_ssize_t end_col_offset, |
|||
const char *errmsg, va_list va) |
|||
{ |
|||
PyObject *value = NULL; |
|||
PyObject *errstr = NULL; |
|||
PyObject *error_line = NULL; |
|||
PyObject *tmp = NULL; |
|||
p->error_indicator = 1; |
|||
|
|||
if (end_lineno == CURRENT_POS) { |
|||
end_lineno = p->tok->lineno; |
|||
} |
|||
if (end_col_offset == CURRENT_POS) { |
|||
end_col_offset = p->tok->cur - p->tok->line_start; |
|||
} |
|||
|
|||
if (p->start_rule == Py_fstring_input) { |
|||
const char *fstring_msg = "f-string: "; |
|||
Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg); |
|||
|
|||
char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character |
|||
if (!new_errmsg) { |
|||
return (void *) PyErr_NoMemory(); |
|||
} |
|||
|
|||
// Copy both strings into new buffer |
|||
memcpy(new_errmsg, fstring_msg, strlen(fstring_msg)); |
|||
memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg)); |
|||
new_errmsg[len] = 0; |
|||
errmsg = new_errmsg; |
|||
} |
|||
errstr = PyUnicode_FromFormatV(errmsg, va); |
|||
if (!errstr) { |
|||
goto error; |
|||
} |
|||
|
|||
if (p->tok->fp_interactive) { |
|||
error_line = get_error_line_from_tokenizer_buffers(p, lineno); |
|||
} |
|||
else if (p->start_rule == Py_file_input) { |
|||
error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename, |
|||
(int) lineno, p->tok->encoding); |
|||
} |
|||
|
|||
if (!error_line) { |
|||
/* PyErr_ProgramTextObject was not called or returned NULL. If it was not called, |
|||
then we need to find the error line from some other source, because |
|||
p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly |
|||
failed or we're parsing from a string or the REPL. There's a third edge case where |
|||
we're actually parsing from a file, which has an E_EOF SyntaxError and in that case |
|||
`PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which |
|||
does not physically exist */ |
|||
assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); |
|||
|
|||
if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) { |
|||
Py_ssize_t size = p->tok->inp - p->tok->buf; |
|||
error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); |
|||
} |
|||
else if (p->tok->fp == NULL || p->tok->fp == stdin) { |
|||
error_line = get_error_line_from_tokenizer_buffers(p, lineno); |
|||
} |
|||
else { |
|||
error_line = PyUnicode_FromStringAndSize("", 0); |
|||
} |
|||
if (!error_line) { |
|||
goto error; |
|||
} |
|||
} |
|||
|
|||
if (p->start_rule == Py_fstring_input) { |
|||
col_offset -= p->starting_col_offset; |
|||
end_col_offset -= p->starting_col_offset; |
|||
} |
|||
|
|||
Py_ssize_t col_number = col_offset; |
|||
Py_ssize_t end_col_number = end_col_offset; |
|||
|
|||
if (p->tok->encoding != NULL) { |
|||
col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset); |
|||
if (col_number < 0) { |
|||
goto error; |
|||
} |
|||
if (end_col_number > 0) { |
|||
Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number); |
|||
if (end_col_offset < 0) { |
|||
goto error; |
|||
} else { |
|||
end_col_number = end_col_offset; |
|||
} |
|||
} |
|||
} |
|||
tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number); |
|||
if (!tmp) { |
|||
goto error; |
|||
} |
|||
value = PyTuple_Pack(2, errstr, tmp); |
|||
Py_DECREF(tmp); |
|||
if (!value) { |
|||
goto error; |
|||
} |
|||
PyErr_SetObject(errtype, value); |
|||
|
|||
Py_DECREF(errstr); |
|||
Py_DECREF(value); |
|||
if (p->start_rule == Py_fstring_input) { |
|||
PyMem_Free((void *)errmsg); |
|||
} |
|||
return NULL; |
|||
|
|||
error: |
|||
Py_XDECREF(errstr); |
|||
Py_XDECREF(error_line); |
|||
if (p->start_rule == Py_fstring_input) { |
|||
PyMem_Free((void *)errmsg); |
|||
} |
|||
return NULL; |
|||
} |
|||
|
|||
void |
|||
_Pypegen_set_syntax_error(Parser* p, Token* last_token) { |
|||
// Existing sintax error |
|||
if (PyErr_Occurred()) { |
|||
// Prioritize tokenizer errors to custom syntax errors raised |
|||
// on the second phase only if the errors come from the parser. |
|||
if (p->tok->done == E_DONE && PyErr_ExceptionMatches(PyExc_SyntaxError)) { |
|||
_PyPegen_tokenize_full_source_to_check_for_errors(p); |
|||
} |
|||
// Propagate the existing syntax error. |
|||
return; |
|||
} |
|||
// Initialization error |
|||
if (p->fill == 0) { |
|||
RAISE_SYNTAX_ERROR("error at start before reading any input"); |
|||
} |
|||
// Parser encountered EOF (End of File) unexpectedtly |
|||
if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) { |
|||
if (p->tok->level) { |
|||
raise_unclosed_parentheses_error(p); |
|||
} else { |
|||
RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); |
|||
} |
|||
return; |
|||
} |
|||
// Indentation error in the tokenizer |
|||
if (last_token->type == INDENT || last_token->type == DEDENT) { |
|||
RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent"); |
|||
return; |
|||
} |
|||
// Unknown error (generic case) |
|||
|
|||
// Use the last token we found on the first pass to avoid reporting |
|||
// incorrect locations for generic syntax errors just because we reached |
|||
// further away when trying to find specific syntax errors in the second |
|||
// pass. |
|||
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax"); |
|||
// _PyPegen_tokenize_full_source_to_check_for_errors will override the existing |
|||
// generic SyntaxError we just raised if errors are found. |
|||
_PyPegen_tokenize_full_source_to_check_for_errors(p); |
|||
} |
Write
Preview
Loading…
Cancel
Save
Reference in new issue