cpython/Parser/pegen_errors.c
Lysandros Nikolaou cbf0afd8a1
gh-97973: Return all necessary information from the tokenizer (GH-97984)
Right now, the tokenizer only returns type and two pointers to the start and end of the token.
This PR modifies the tokenizer to return the type and set all of the necessary information,
so that the parser does not have to do this.
2022-10-06 16:07:17 -07:00

442 lines
14 KiB
C

#include <Python.h>
#include <errcode.h>
#include "tokenizer.h"
#include "pegen.h"
// TOKENIZER ERRORS
/* Turn a pending tokenizer-initialisation exception into a SyntaxError.
 *
 * If the pending exception is a LookupError, SyntaxError, ValueError or
 * UnicodeDecodeError, it is fetched and replaced by a SyntaxError whose
 * arguments are (str(original value), (filename, 0, -1, None)).
 * Any other pending exception is left untouched.
 *
 * If one of the intermediate objects cannot be created, the fetched
 * exception is released and no SyntaxError is set by this function.
 */
void
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
{
    if (!(PyErr_ExceptionMatches(PyExc_LookupError)
          || PyErr_ExceptionMatches(PyExc_SyntaxError)
          || PyErr_ExceptionMatches(PyExc_ValueError)
          || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
        return;
    }
    PyObject *errstr = NULL;
    PyObject *tuple = NULL;
    PyObject *tmp = NULL;
    PyObject *type;
    PyObject *value;
    PyObject *tback;
    PyErr_Fetch(&type, &value, &tback);
    errstr = PyObject_Str(value);
    if (!errstr) {
        goto error;
    }

    // Location details: (filename, lineno=0, offset=-1, text=None).
    tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
    if (!tmp) {
        goto error;
    }

    tuple = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    // Check the object we just tried to build, not the fetched exception
    // value: a failed pack must not reach PyErr_SetObject with NULL.
    if (!tuple) {
        goto error;
    }
    PyErr_SetObject(PyExc_SyntaxError, tuple);

error:
    Py_XDECREF(type);
    Py_XDECREF(value);
    Py_XDECREF(tback);
    Py_XDECREF(errstr);
    Py_XDECREF(tuple);
}
/* Raise a SyntaxError saying that the innermost bracket still recorded on
 * the tokenizer's paren stacks was never closed. The error is located at the
 * line and column stored for that bracket.
 * The stacks are indexed with level-1, so callers check p->tok->level first. */
static inline void
raise_unclosed_parentheses_error(Parser *p) {
    const int top = p->tok->level - 1;
    int open_lineno = p->tok->parenlinenostack[top];
    int open_col = p->tok->parencolstack[top];

    RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
                               open_lineno, open_col, open_lineno, -1,
                               "'%c' was never closed",
                               p->tok->parenstack[top]);
}
/* Translate the tokenizer's `done` status code into a Python exception.
 *
 * Always returns -1. If an exception is already pending it is left alone.
 * Otherwise the status code selects the exception type and message; codes
 * that are not handled explicitly produce "unknown parsing error".
 */
int
_Pypegen_tokenizer_error(Parser *p)
{
    if (PyErr_Occurred()) {
        return -1;
    }

    PyObject *exc_type = PyExc_SyntaxError;
    const char *message = NULL;
    Py_ssize_t col = 0;

    switch (p->tok->done) {
        /* Codes that raise on their own and return immediately. */
        case E_EOF:
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            }
            else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;

        /* Codes that only pick a type/message; raised after the switch. */
        case E_TOKEN:
            message = "invalid token";
            break;
        case E_TABSPACE:
            exc_type = PyExc_TabError;
            message = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            exc_type = PyExc_IndentationError;
            message = "too many levels of indentation";
            break;
        case E_LINECONT:
            col = p->tok->cur - p->tok->buf - 1;
            if (col < 0) {
                // Never report a negative column.
                col = 0;
            }
            message = "unexpected character after line continuation character";
            break;
        default:
            message = "unknown parsing error";
            break;
    }

    RAISE_ERROR_KNOWN_LOCATION(p, exc_type, p->tok->lineno, col,
                               p->tok->lineno, -1, message);
    return -1;
}
/* Re-raise a pending UnicodeError/ValueError as a SyntaxError.
 *
 * The new message has the form "(<kind>) <str(original value)>", or
 * "(<kind>) unknown error" when the original value cannot be stringified.
 * Any other pending exception is left as it is. Always returns -1.
 */
int
_Pypegen_raise_decode_error(Parser *p)
{
    assert(PyErr_Occurred());

    // UnicodeError is tested first, exactly as in the original ordering.
    const char *kind;
    if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
        kind = "unicode error";
    }
    else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
        kind = "value error";
    }
    else {
        return -1;
    }

    PyObject *exc_type;
    PyObject *exc_value;
    PyObject *exc_tb;
    PyErr_Fetch(&exc_type, &exc_value, &exc_tb);

    PyObject *text = PyObject_Str(exc_value);
    if (text == NULL) {
        // Drop the error raised by the failed str() before reporting.
        PyErr_Clear();
        RAISE_SYNTAX_ERROR("(%s) unknown error", kind);
    }
    else {
        RAISE_SYNTAX_ERROR("(%s) %U", kind, text);
        Py_DECREF(text);
    }

    Py_XDECREF(exc_type);
    Py_XDECREF(exc_value);
    Py_XDECREF(exc_tb);
    return -1;
}
/* Tokenize the rest of the input looking for an unclosed bracket.
 *
 * The tokenizer is driven until it yields ENDMARKER or ERRORTOKEN. If it
 * stops on ERRORTOKEN while a bracket is still open, and that bracket was
 * opened on a line before the line of the current error token, an
 * "'(' was never closed"-style error is raised and -1 is returned.
 * In every other case 0 is returned.
 *
 * The exception pending on entry is saved first. It is restored on exit
 * unless some exception is pending at that point, in which case the saved
 * one is released.
 *
 * Nothing is done (and 0 is returned) when the tokenizer has a prompt set,
 * because interactive input must not be tokenized to the end.
 */
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    if (p->tok->prompt != NULL) {
        return 0;
    }

    PyObject *saved_type, *saved_value, *saved_tb;
    PyErr_Fetch(&saved_type, &saved_value, &saved_tb);

    Token *err_token = p->known_err_token;
    if (err_token == NULL) {
        err_token = p->tokens[p->fill - 1];
    }
    Py_ssize_t current_err_line = err_token->lineno;

    int ret = 0;
    struct token new_token;

    for (;;) {
        int tok_type = _PyTokenizer_Get(p->tok, &new_token);
        if (tok_type == ENDMARKER) {
            break;
        }
        if (tok_type != ERRORTOKEN) {
            continue;
        }
        // ERRORTOKEN: only an open bracket from an earlier line wins.
        if (p->tok->level != 0) {
            int open_lineno = p->tok->parenlinenostack[p->tok->level-1];
            if (current_err_line > open_lineno) {
                raise_unclosed_parentheses_error(p);
                ret = -1;
            }
        }
        break;
    }

    if (PyErr_Occurred()) {
        Py_XDECREF(saved_value);
        Py_XDECREF(saved_type);
        Py_XDECREF(saved_tb);
    }
    else {
        PyErr_Restore(saved_type, saved_value, saved_tb);
    }
    return ret;
}
// PARSER ERRORS
/* Raise `errtype` with a printf-style message, located at the current error
 * token (p->known_err_token if set, otherwise the last filled token).
 *
 * When no token has been filled yet the location is line 0, column 0.
 * Column values taken from the token are shifted by one; an end column of
 * -1 is passed through when the token has none. Always returns NULL.
 */
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
{
    // Defaults describe the "nothing tokenized yet" location.
    Py_ssize_t lineno = 0;
    Py_ssize_t col_offset = 0;
    Py_ssize_t end_lineno = 0;
    Py_ssize_t end_col_offset = -1;

    if (p->fill != 0) {
        Token *t = p->known_err_token;
        if (t == NULL) {
            t = p->tokens[p->fill - 1];
        }

        if (t->col_offset != -1) {
            col_offset = t->col_offset + 1;
        }
        else if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        }
        else {
            // The token carries no column: derive one from the tokenizer's
            // current position.
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }

        if (t->end_col_offset != -1) {
            end_col_offset = t->end_col_offset + 1;
        }
        lineno = t->lineno;
        end_lineno = t->end_lineno;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, lineno, col_offset,
                                        end_lineno, end_col_offset, errmsg, va);
    va_end(va);
    return NULL;
}
/* Return the text of source line `lineno` as a new str object, read from
 * the tokenizer's in-memory buffers.
 *
 * For an interactive file descriptor the lines of the current (multi-line)
 * statement live in p->tok->interactive_src_start; otherwise the parser is
 * reading from a string and the whole source is in p->tok->str.
 * The returned line excludes its trailing newline and is decoded as UTF-8
 * with the "replace" error handler.
 */
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);

    char *line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (line == NULL) {
        assert(p->tok->fp_interactive);
        // The interactive buffers may never have been initialised if the
        // original source could not be decoded with the given locale.
        return PyUnicode_FromStringAndSize("", 0);
    }

    // Line numbers are relative to p->starting_lineno when that is set.
    Py_ssize_t relative_lineno = lineno;
    if (p->starting_lineno) {
        relative_lineno = lineno - p->starting_lineno + 1;
    }
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    // Skip over the lines that precede the requested one.
    for (int i = 0; i < relative_lineno - 1; i++) {
        char *newline = strchr(line, '\n');
        // Debug builds assert here; release builds stop early instead of
        // crashing, at the cost of possibly reporting the wrong line.
        assert(newline != NULL && newline + 1 < buf_end);
        if (newline == NULL || newline + 1 > buf_end) {
            break;
        }
        line = newline + 1;
    }

    char *line_end = strchr(line, '\n');
    if (line_end == NULL) {
        // Last line: it runs up to the terminating NUL.
        line_end = line + strlen(line);
    }
    return PyUnicode_DecodeUTF8(line, line_end - line, "replace");
}
/* Raise `errtype` with a formatted message and an explicit source location.
 *
 * The exception arguments are
 *   (message, (filename, lineno, col, error_line, end_lineno, end_col)).
 *
 * - `end_lineno` / `end_col_offset` equal to CURRENT_POS are replaced by the
 *   tokenizer's current line / current column within that line.
 * - When parsing an f-string (p->start_rule == Py_fstring_input) the message
 *   is prefixed with "f-string: " and both column offsets are made relative
 *   to p->starting_col_offset.
 * - When the tokenizer has an encoding set, byte offsets are converted to
 *   character offsets within the error line.
 *
 * Sets p->error_indicator and always returns NULL. If building the exception
 * fails, the error left pending by the failing call is reported instead.
 */
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    if (p->start_rule == Py_fstring_input) {
        const char *fstring_msg = "f-string: ";
        size_t prefix_len = strlen(fstring_msg);
        size_t msg_len = strlen(errmsg);
        size_t len = prefix_len + msg_len;

        char *new_errmsg = PyMem_Malloc(len + 1); // Both strings plus the NUL terminator
        if (!new_errmsg) {
            return (void *) PyErr_NoMemory();
        }

        // Copy both strings into new buffer
        memcpy(new_errmsg, fstring_msg, prefix_len);
        memcpy(new_errmsg + prefix_len, errmsg, msg_len);
        new_errmsg[len] = 0;
        // From here on errmsg is heap-allocated and is freed on every exit.
        errmsg = new_errmsg;
    }
    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            Py_ssize_t size = p->tok->inp - p->tok->buf;
            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            error_line = PyUnicode_FromStringAndSize("", 0);
        }
        if (!error_line) {
            goto error;
        }
    }

    if (p->start_rule == Py_fstring_input) {
        col_offset -= p->starting_col_offset;
        end_col_offset -= p->starting_col_offset;
    }

    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    if (p->tok->encoding != NULL) {
        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
        if (col_number < 0) {
            goto error;
        }
        if (end_col_number > 0) {
            Py_ssize_t end_char_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
            if (end_char_offset < 0) {
                goto error;
            }
            end_col_number = end_char_offset;
        }
    }

    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    // The "N" format unit hands our reference to error_line over to
    // Py_BuildValue, so we must not release it again on any later error path.
    error_line = NULL;
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;
}
/* Decide which SyntaxError to report after the parser has failed.
 *
 * `last_token` is the last token seen on the first parsing pass; it is used
 * as the location of the generic "invalid syntax" error.
 *
 * Order of precedence:
 *   1. An exception that is already pending is kept. If it is a SyntaxError
 *      and the tokenizer finished cleanly, the remaining source is tokenized
 *      so that an earlier unclosed bracket can take priority.
 *   2. No token was ever read: "error at start before reading any input".
 *   3. Unexpected EOF (unclosed bracket or plain EOF).
 *   4. Unexpected indent / unindent.
 *   5. Generic "invalid syntax", possibly overridden by an unclosed-bracket
 *      error found while tokenizing the rest of the source.
 */
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
        // No token exists, so `last_token` must not be inspected and the
        // error just raised must not be overwritten by the checks below.
        return;
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}