Issue #18960: Fix bugs with Python source code encoding in the second line.

* The first line of a Python script could be executed twice when a source
encoding other than 'utf-8' was specified on the second line.

* Now the source encoding declaration on the second line isn't effective if
the first line contains anything except a comment.

* As a consequence, 'python -x' now works again with files that have the
source encoding declaration on the second line, and can again be used
to make Python batch files on Windows.

* The tokenize module now ignores the source encoding declaration on the second
line if the first line contains anything except a comment.

* IDLE now ignores the source encoding declaration on the second line if the
first line contains anything except a comment.

* 2to3 and the findnocoding.py script now ignore the source encoding
declaration on the second line if the first line contains anything except
a comment.
This commit is contained in:
Serhiy Storchaka 2014-01-09 18:36:09 +02:00
parent 21e7d4cd5e
commit 768c16ce02
7 changed files with 87 additions and 5 deletions

View file

@ -64,6 +64,7 @@
### 'encoding' is used below in encode(), check!
# Encoding declaration: optional leading whitespace, then a comment that
# contains "coding:" or "coding=" followed by the encoding name (group 1).
coding_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
# A line holding no code: optional whitespace followed by a comment start,
# a line ending, or the end of the string.
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
def coding_spec(data):
"""Return the encoding declaration according to PEP 263.
@ -93,6 +94,8 @@ def coding_spec(data):
match = coding_re.match(line)
if match is not None:
break
if not blank_re.match(line):
return None
else:
return None
name = match.group(1)

View file

@ -237,6 +237,7 @@ def compat(self, token, iterable):
toks_append(tokval)
# Coding cookie (str pattern): optional leading whitespace, then a comment
# that contains "coding:" or "coding=" followed by the encoding name (group 1).
cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
# A line holding no code (bytes pattern): optional whitespace followed by a
# comment start, a line ending, or the end of the input.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
def _get_normal_name(orig_enc):
"""Imitates get_normal_name in tokenizer.c."""
@ -309,6 +310,8 @@ def find_cookie(line):
encoding = find_cookie(first)
if encoding:
return encoding, [first]
if not blank_re.match(first):
return default, [first]
second = read_or_stop()
if not second:

View file

@ -885,6 +885,39 @@ def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
readline = self.get_readline(lines)
self.assertRaises(SyntaxError, detect_encoding, readline)
def test_cookie_second_line_noncommented_first_line(self):
    # Line one holds real code, so the cookie on line two must be ignored.
    code_line = b"print('\xc2\xa3')\n"
    cookie_line = b'# vim: set fileencoding=iso8859-15 :\n'
    source = (code_line, cookie_line, b"print('\xe2\x82\xac')\n")
    readline = self.get_readline(source)
    found, consumed = detect_encoding(readline)
    # utf-8 is reported and only the first line has been read.
    self.assertEqual(found, 'utf-8')
    self.assertEqual(consumed, [code_line])
def test_cookie_second_line_commented_first_line(self):
    # Line one is only a comment, so the cookie on line two still applies.
    comment_line = b"#print('\xc2\xa3')\n"
    cookie_line = b'# vim: set fileencoding=iso8859-15 :\n'
    source = (comment_line, cookie_line, b"print('\xe2\x82\xac')\n")
    readline = self.get_readline(source)
    found, consumed = detect_encoding(readline)
    # The declared encoding is reported and both leading lines were read.
    self.assertEqual(found, 'iso8859-15')
    self.assertEqual(consumed, [comment_line, cookie_line])
def test_cookie_second_line_empty_first_line(self):
    # Line one is empty, so the cookie on line two still applies.
    empty_line = b'\n'
    cookie_line = b'# vim: set fileencoding=iso8859-15 :\n'
    source = (empty_line, cookie_line, b"print('\xe2\x82\xac')\n")
    readline = self.get_readline(source)
    found, consumed = detect_encoding(readline)
    # The declared encoding is reported and both leading lines were read.
    self.assertEqual(found, 'iso8859-15')
    self.assertEqual(consumed, [empty_line, cookie_line])
def test_latin1_normalization(self):
# See get_normal_name() in tokenizer.c.
encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",

View file

@ -32,6 +32,7 @@
import collections
from io import TextIOWrapper
# Coding cookie (str pattern): optional leading whitespace, then a comment
# that contains "coding:" or "coding=" followed by the encoding name (group 1).
cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
# A line holding no code (bytes pattern): optional whitespace followed by a
# comment start, a line ending, or the end of the input.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
@ -409,6 +410,8 @@ def find_cookie(line):
encoding = find_cookie(first)
if encoding:
return encoding, [first]
if not blank_re.match(first):
return default, [first]
second = read_or_stop()
if not second:

View file

@ -10,6 +10,13 @@ What's New in Python 3.3.4 release candidate 1?
Core and Builtins
-----------------
- Issue #18960: The first line of a Python script could be executed twice when
the source encoding was specified on the second line. Now the source encoding
declaration on the second line isn't effective if the first line contains
anything except a comment. 'python -x' now works again with files with
source encoding declarations, and can be used to make Python batch files
on Windows.
- Issue #19081: When a zipimport .zip file in sys.path being imported from
is modified during the lifetime of the Python process after zipimport has
already cached the zip's table of contents we detect this and recover
@ -36,6 +43,9 @@ Core and Builtins
Library
-------
- Issue #18960: The tokenize module now ignores the source encoding declaration
on the second line if the first line contains anything except a comment.
- Issue #20078: Reading malformed zipfiles no longer hangs with 100% CPU
consumption.
@ -204,6 +214,9 @@ Library
IDLE
----
- Issue #18960: IDLE now ignores the source encoding declaration on the second
line if the first line contains anything except a comment.
- Issue #20058: sys.stdin.readline() in IDLE now always returns only one line.
- Issue #19481: print() of string subclass instance in IDLE no longer hangs.
@ -281,6 +294,13 @@ Build
- Add workaround for VS 2010 nmake clean issue. VS 2010 doesn't set up PATH
for nmake.exe correctly.
Tools/Demos
-----------
- Issue #18960: 2to3 and the findnocoding.py script now ignore the source
encoding declaration on the second line if the first line contains anything
except a comment.
What's New in Python 3.3.3?
===========================

View file

@ -283,13 +283,27 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
char *cs;
int r = 1;
if (tok->cont_line)
if (tok->cont_line) {
/* It's a continuation line, so it can't be a coding spec. */
tok->read_coding_spec = 1;
return 1;
}
if (!get_coding_spec(line, &cs, size, tok))
return 0;
if (!cs)
if (!cs) {
Py_ssize_t i;
for (i = 0; i < size; i++) {
if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
break;
if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
/* Stop checking coding spec after a line containing
* anything except a comment. */
tok->read_coding_spec = 1;
break;
}
}
return 1;
}
tok->read_coding_spec = 1;
if (tok->encoding == NULL) {
assert(tok->decoding_state == STATE_RAW);
@ -476,13 +490,17 @@ fp_setreadl(struct tok_state *tok, const char* enc)
_Py_IDENTIFIER(open);
_Py_IDENTIFIER(readline);
int fd;
long pos;
io = PyImport_ImportModuleNoBlock("io");
if (io == NULL)
goto cleanup;
fd = fileno(tok->fp);
if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
/* Due to buffering the file offset for fd can be different from the file
* position of tok->fp. */
pos = ftell(tok->fp);
if (pos == -1 || lseek(fd, (off_t)pos, SEEK_SET) == (off_t)-1) {
PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
goto cleanup;
}
@ -751,7 +769,7 @@ decode_str(const char *input, int single, struct tok_state *tok)
if (newl[0]) {
if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
return error_ret(tok);
if (tok->enc == NULL && newl[1]) {
if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
tok, buf_setreadl))
return error_ret(tok);

View file

@ -33,6 +33,7 @@ def walk_python_files(self, paths, *args, **kwargs):
# Encoding declaration (bytes pattern): optional leading whitespace, then a
# comment containing "coding:" or "coding=" followed by the name (group 1).
decl_re = re.compile(rb'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)')
# A line holding no code (bytes pattern): optional whitespace followed by a
# comment start, a line ending, or the end of the input.
blank_re = re.compile(rb'^[ \t\f]*(?:[#\r\n]|$)')
def get_declaration(line):
match = decl_re.match(line)
@ -58,7 +59,8 @@ def needs_declaration(fullpath):
line1 = infile.readline()
line2 = infile.readline()
if get_declaration(line1) or get_declaration(line2):
if (get_declaration(line1) or
blank_re.match(line1) and get_declaration(line2)):
# the file does have an encoding declaration, so trust it
return False