bpo-46054: Fix parsing error when parsing non-utf8 characters in source files (GH-30068)

This commit is contained in:
Pablo Galindo Salgado 2021-12-12 07:06:50 +00:00 committed by GitHub
parent 59435eea08
commit 4325a766f5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 19 additions and 8 deletions

View file

@ -2387,6 +2387,18 @@ def test_encodings(self):
finally:
unlink(TESTFN)
def test_non_utf8(self):
    # Write a small binary payload (0x7F, "ELF", then control/NUL bytes)
    # to TESTFN and hand that file to the interpreter as a script.
    try:
        with open(TESTFN, 'bw') as binary_file:
            binary_file.write(b'\x7fELF\x02\x01\x01\x00\x00\x00')
        # Only the third element of the helper's result (stderr) is inspected.
        _, _, raw_stderr = script_helper.assert_python_failure(
            '-Wd', '-X', 'utf8', TESTFN)
        stderr_lines = raw_stderr.decode('utf-8').splitlines()
        # The final stderr line must name the offending character.
        self.assertEqual(
            stderr_lines[-1],
            "SyntaxError: invalid non-printable character U+007F")
    finally:
        # Remove the scratch file whether or not the assertions passed.
        unlink(TESTFN)
def test_attributes_new_constructor(self):
args = ("bad.py", 1, 2, "abcdefg", 1, 100)
the_exception = SyntaxError("bad bad", args)

View file

@ -0,0 +1,2 @@
Fix parser error when parsing non-utf8 characters in source files. Patch by
Pablo Galindo.

View file

@ -819,10 +819,10 @@ tok_readline_raw(struct tok_state *tok)
tok_concatenate_interactive_new_line(tok, line) == -1) {
return 0;
}
if (*tok->inp == '\0') {
tok->inp = strchr(tok->inp, '\0');
if (tok->inp == tok->buf) {
return 0;
}
tok->inp = strchr(tok->inp, '\0');
} while (tok->inp[-1] != '\n');
return 1;
}
@ -984,12 +984,9 @@ tok_underflow_file(struct tok_state *tok) {
}
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!tok->encoding
&& (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) {
if (!ensure_utf8(tok->cur, tok)) {
error_ret(tok);
return 0;
}
if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
error_ret(tok);
return 0;
}
assert(tok->done == E_OK);
return tok->done == E_OK;