cpython/Parser/pegen_errors.c
Pablo Galindo Salgado 9ed36d533a
gh-113602: Bail out when the parser tries to override existing errors (#113607)
Signed-off-by: Pablo Galindo <pablogsal@gmail.com>
2024-01-02 13:00:52 +00:00

456 lines
15 KiB
C

#include <Python.h>
#include <errcode.h>
#include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject()
#include "lexer/state.h"
#include "lexer/lexer.h"
#include "pegen.h"
// TOKENIZER ERRORS
/*
 * Convert the pending tokenizer-initialization exception into a SyntaxError.
 *
 * Only a pending LookupError, SyntaxError, ValueError or UnicodeDecodeError
 * is converted; for anything else the function returns without touching the
 * error state. The replacement SyntaxError is raised with the arguments
 * (str(original_value), (filename, 0, -1, None)).
 *
 * If building the new exception arguments fails, the original exception is
 * dropped and whatever error the failing call set is left pending.
 */
void
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
{
    if (!(PyErr_ExceptionMatches(PyExc_LookupError)
          || PyErr_ExceptionMatches(PyExc_SyntaxError)
          || PyErr_ExceptionMatches(PyExc_ValueError)
          || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
        return;
    }
    PyObject *errstr = NULL;
    PyObject *tuple = NULL;
    PyObject *type;
    PyObject *value;
    PyObject *tback;
    PyErr_Fetch(&type, &value, &tback);
    errstr = PyObject_Str(value);
    if (!errstr) {
        goto error;
    }

    PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
    if (!tmp) {
        goto error;
    }

    tuple = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    // Check the object that was just created: testing `value` here would let
    // a failed PyTuple_Pack() go unnoticed and raise a SyntaxError without
    // arguments on top of the allocation error.
    if (!tuple) {
        goto error;
    }
    PyErr_SetObject(PyExc_SyntaxError, tuple);

error:
    Py_XDECREF(type);
    Py_XDECREF(value);
    Py_XDECREF(tback);
    Py_XDECREF(errstr);
    Py_XDECREF(tuple);
}
/*
 * Raise a SyntaxError saying that the bracket on top of the tokenizer's
 * parenthesis stack was never closed, located at the line/column recorded
 * for that bracket.
 *
 * The stacks are indexed with p->tok->level - 1, so this must only be called
 * while p->tok->level is non-zero.
 */
static inline void
raise_unclosed_parentheses_error(Parser *p) {
    int bracket = p->tok->parenstack[p->tok->level - 1];
    int open_col = p->tok->parencolstack[p->tok->level - 1];
    int open_lineno = p->tok->parenlinenostack[p->tok->level - 1];

    RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
                               open_lineno, open_col, open_lineno, -1,
                               "'%c' was never closed",
                               bracket);
}
/*
 * Turn the tokenizer's failure code (p->tok->done) into a Python exception.
 *
 * Always returns -1. When an exception is already pending it is left alone
 * and nothing else is modified; otherwise p->error_indicator is set and an
 * exception chosen from the failure code is raised.
 */
int
_Pypegen_tokenizer_error(Parser *p)
{
    // Never override an exception that is already pending.
    if (PyErr_Occurred()) {
        return -1;
    }

    p->error_indicator = 1;

    // Defaults used by the codes that share the generic "raise at the current
    // tokenizer line" path at the bottom of the function.
    PyObject *exc_type = PyExc_SyntaxError;
    const char *message = NULL;
    Py_ssize_t column = 0;

    switch (p->tok->done) {
        // Codes that raise their own exception and return right away.
        case E_EOF:
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            }
            else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_COLUMNOVERFLOW:
            PyErr_SetString(PyExc_OverflowError,
                            "Parser column offset overflow - source line is too big");
            return -1;

        // Codes that only select a type/message/column for the generic path.
        case E_TOKEN:
            message = "invalid token";
            break;
        case E_TABSPACE:
            exc_type = PyExc_TabError;
            message = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            exc_type = PyExc_IndentationError;
            message = "too many levels of indentation";
            break;
        case E_LINECONT:
            column = p->tok->cur - p->tok->buf - 1;
            // A negative offset is reported as column 0.
            if (column < 0) {
                column = 0;
            }
            message = "unexpected character after line continuation character";
            break;
        default:
            message = "unknown parsing error";
    }

    RAISE_ERROR_KNOWN_LOCATION(p, exc_type, p->tok->lineno,
                               column,
                               p->tok->lineno, -1, message);
    return -1;
}
/*
 * Re-raise a pending UnicodeError or ValueError as a SyntaxError whose
 * message is "(<kind>) <original message>".
 *
 * Must be called with an exception pending. Any other exception type is left
 * pending unchanged. Always returns -1.
 */
int
_Pypegen_raise_decode_error(Parser *p)
{
    assert(PyErr_Occurred());

    const char *kind;
    if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
        kind = "unicode error";
    }
    else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
        kind = "value error";
    }
    else {
        // Not a decoding problem: keep the pending exception as it is.
        return -1;
    }

    PyObject *exc_type;
    PyObject *exc_value;
    PyObject *exc_tb;
    PyErr_Fetch(&exc_type, &exc_value, &exc_tb);

    PyObject *text = PyObject_Str(exc_value);
    if (text == NULL) {
        // str() itself failed; discard that failure and report a generic
        // message instead.
        PyErr_Clear();
        RAISE_SYNTAX_ERROR("(%s) unknown error", kind);
    }
    else {
        RAISE_SYNTAX_ERROR("(%s) %U", kind, text);
        Py_DECREF(text);
    }

    Py_XDECREF(exc_type);
    Py_XDECREF(exc_value);
    Py_XDECREF(exc_tb);
    return -1;
}
/*
 * Run the tokenizer over the rest of the input looking for tokenization
 * errors such as mismatching parentheses. Such an error takes priority over
 * the generic syntax error only when it is reported on a line before the
 * line of the generic error.
 *
 * Returns 0 when nothing is done (interactive input) or no overriding error
 * was found, and -1 when the tokenizer left an exception pending or an
 * "unclosed parenthesis" error was raised here.
 */
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    // Set the pending exception aside while tokenizing; it is either put
    // back or dropped at the end.
    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;
    struct token new_token;
    _PyToken_Init(&new_token);

    // Consume tokens until the tokenizer reports an error or the end marker.
    int tok_type;
    do {
        tok_type = _PyTokenizer_Get(p->tok, &new_token);
    } while (tok_type != ERRORTOKEN && tok_type != ENDMARKER);

    if (tok_type == ERRORTOKEN) {
        if (PyErr_Occurred()) {
            ret = -1;
        }
        else if (p->tok->level != 0) {
            int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
            // Only report the unclosed bracket if it was opened on a line
            // before the current error line.
            if (current_err_line > error_lineno) {
                raise_unclosed_parentheses_error(p);
                ret = -1;
            }
        }
    }

    _PyToken_Free(&new_token);

    // If we're in an f-string, we want the syntax error in the expression part
    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
    // do not swallow it.
    if (!PyErr_Occurred() || p->tok->tok_mode_stack_index > 0) {
        PyErr_Restore(type, value, traceback);
    }
    else {
        // A new error replaces the saved one: release the saved exception.
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    }
    return ret;
}
// PARSER ERRORS
/*
 * Raise `errtype` with a printf-style message, taking the error location
 * from a parser token.
 *
 * The token is p->known_err_token when set; otherwise the token at p->mark
 * (use_mark != 0) or the last filled token (use_mark == 0). When no token has
 * been read yet the error is reported at line 0, column 0.
 *
 * Always returns NULL. Nothing is raised if the parser already has its error
 * indicator set together with a pending exception.
 */
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
{
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }

    va_list va;

    // No tokens yet: there is no token to take a location from.
    if (p->fill == 0) {
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }

    // The token at the mark has not been fetched yet: fetch it, giving up if
    // the tokenizer fails.
    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
        p->error_indicator = 1;
        return NULL;
    }

    Token *t;
    if (p->known_err_token != NULL) {
        t = p->known_err_token;
    }
    else if (use_mark) {
        t = p->tokens[p->mark];
    }
    else {
        t = p->tokens[p->fill - 1];
    }

    Py_ssize_t col_offset;
    if (t->col_offset != -1) {
        col_offset = t->col_offset + 1;
    }
    else if (p->tok->cur == p->tok->buf) {
        col_offset = 0;
    }
    else {
        // The token has no column: derive one from the tokenizer position.
        const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
        col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
    }

    Py_ssize_t end_col_offset = (t->end_col_offset != -1) ? t->end_col_offset + 1 : -1;

    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);
    return NULL;
}
/*
 * Return, as a new str object, the text of line `lineno` taken from the
 * tokenizer's in-memory source (without the trailing newline), or NULL with
 * an exception set if decoding fails.
 *
 * If the file descriptor is interactive, the source lines of the current
 * (multi-line) statement are stored in p->tok->interactive_src_start.
 * If not, we're parsing from a string, which means that the whole source
 * is stored in p->tok->str.
 */
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    assert(p->tok->fp != NULL || p->tok->str != NULL);

    char *line;
    const char *buf_end;
    if (p->tok->fp_interactive) {
        line = p->tok->interactive_src_start;
        buf_end = p->tok->interactive_src_end;
    }
    else {
        line = p->tok->str;
        buf_end = p->tok->inp;
    }

    if (line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return PyUnicode_FromStringAndSize("", 0);
    }

    // When a starting line number is set, lineno is made relative to it.
    Py_ssize_t relative_lineno;
    if (p->starting_lineno) {
        relative_lineno = lineno - p->starting_lineno + 1;
    }
    else {
        relative_lineno = lineno;
    }

    if (buf_end < line) {
        buf_end = line + strlen(line);
    }

    // Skip the lines that come before the requested one.
    int skipped = 0;
    while (skipped < relative_lineno - 1) {
        char *newline = strchr(line, '\n');
        // The assert is here for debug builds but the conditional that
        // follows is there so in release builds we do not crash at the cost
        // to report a potentially wrong line.
        assert(newline != NULL && newline + 1 < buf_end);
        if (newline == NULL || newline + 1 > buf_end) {
            break;
        }
        line = newline + 1;
        skipped++;
    }

    char *line_end = strchr(line, '\n');
    if (line_end == NULL) { // This is the last line
        line_end = line + strlen(line);
    }
    return PyUnicode_DecodeUTF8(line, line_end - line, "replace");
}
/*
 * Raise `errtype` with a formatted message and an explicit source location.
 *
 * The exception is raised with the arguments
 *   (message, (filename, lineno, col, error_line, end_lineno, end_col))
 * where `error_line` is the text of the offending line, looked up from the
 * tokenizer buffers or from the source file. When the tokenizer has an
 * encoding set, the column values are converted from byte offsets to
 * character offsets. `end_lineno` / `end_col_offset` may be CURRENT_POS to
 * use the tokenizer's current position.
 *
 * Sets p->error_indicator. Always returns NULL. Nothing is raised if the
 * parser already has its error indicator set together with a pending
 * exception.
 */
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* _PyErr_ProgramDecodedTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `_PyErr_ProgramDecodedTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            Py_ssize_t size = p->tok->inp - p->tok->buf;
            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            error_line = PyUnicode_FromStringAndSize("", 0);
        }
        if (!error_line) {
            goto error;
        }
    }

    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    if (p->tok->encoding != NULL) {
        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
        if (col_number < 0) {
            goto error;
        }
        if (end_col_number > 0) {
            Py_ssize_t end_col_chars = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
            if (end_col_chars < 0) {
                goto error;
            }
            end_col_number = end_col_chars;
        }
    }

    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    // The "N" format unit takes over our reference to error_line (CPython's
    // implementation also releases "N" arguments when the call fails). From
    // here on the string belongs to `tmp`, so forget our pointer: releasing
    // it again at `error:` would be a double decref / use-after-free.
    error_line = NULL;
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    return NULL;
}
/*
 * Make sure an appropriate syntax error is pending after a failed parse.
 *
 * If an exception is already set it is kept, except that for a SyntaxError
 * raised while the tokenizer itself is in a non-error state the rest of the
 * source is tokenized so that tokenizer errors can take priority. Otherwise
 * an error is raised from the parser/tokenizer state: nothing read at all,
 * unexpected EOF (possibly an unclosed bracket), unexpected indent/unindent,
 * or a generic "invalid syntax" located at `last_token`.
 */
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
        // Nothing was tokenized: stop here instead of falling through to code
        // that dereferences last_token and reads p->tokens[p->fill - 1],
        // which would be index -1.
        return;
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}
/*
 * Report that the parser ran out of stack: raise MemoryError and set the
 * parser's error indicator.
 */
void
_Pypegen_stack_overflow(Parser *p)
{
    PyErr_SetString(PyExc_MemoryError,
                    "Parser stack overflowed - Python source too complex to parse");
    p->error_indicator = 1;
}