diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 8fbd216ac66..75a7a809b4f 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -795,6 +795,8 @@ def test_short_files(self):
         self.assertEquals(encoding, 'utf-8')
         self.assertEquals(consumed_lines, [])
 
+        readline = self.get_readline((b'# coding: bad\n',))
+        self.assertRaises(SyntaxError, detect_encoding, readline)
 
 class TestTokenize(TestCase):
 
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index ec5a79a6453..16c4f3f0298 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -26,7 +26,7 @@
 
 import re, string, sys
 from token import *
-from codecs import lookup
+from codecs import lookup, BOM_UTF8
 from itertools import chain, repeat
 
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
@@ -251,11 +251,11 @@ def detect_encoding(readline):
 
     It detects the encoding from the presence of a utf-8 bom or an encoding
     cookie as specified in pep-0263. If both a bom and a cookie are present,
-    but disagree, a SyntaxError will be raised.
+    but disagree, a SyntaxError will be raised. If the encoding cookie is an
+    invalid charset, raise a SyntaxError.
 
     If no encoding is specified, then the default of 'utf-8' will be returned.
     """
-    utf8_bom = b'\xef\xbb\xbf'
     bom_found = False
     encoding = None
     def read_or_stop():
@@ -268,18 +268,25 @@ def find_cookie(line):
         try:
             line_string = line.decode('ascii')
         except UnicodeDecodeError:
-            pass
-        else:
-            matches = cookie_re.findall(line_string)
-            if matches:
-                encoding = matches[0]
-                if bom_found and lookup(encoding).name != 'utf-8':
-                    # This behaviour mimics the Python interpreter
-                    raise SyntaxError('encoding problem: utf-8')
-                return encoding
+            return None
+
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = matches[0]
+        try:
+            codec = lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found and codec.name != 'utf-8':
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError('encoding problem: utf-8')
+        return encoding
 
     first = read_or_stop()
-    if first.startswith(utf8_bom):
+    if first.startswith(BOM_UTF8):
         bom_found = True
         first = first[3:]
     if not first:
diff --git a/Misc/NEWS b/Misc/NEWS
index 10a07459906..6568a1c5a38 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -45,6 +45,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #4021: tokenize.detect_encoding() now raises a SyntaxError when the
+  codec cannot be found. This is for compatibility with the builtin behavior.
+
- Issue #4084: Fix max, min, max_mag and min_mag Decimal methods to give
   correct results in the case where one argument is a quiet NaN and the
   other is a finite number that requires rounding.
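
For reference, a quick sketch of the behaviour this patch introduces. It assumes
an interpreter with the patch applied; the buffer contents are made up for
illustration:

    import io
    from tokenize import detect_encoding

    # A recognizable cookie is returned as written in the source.
    good = io.BytesIO(b'# coding: latin-1\nx = 1\n')
    encoding, consumed_lines = detect_encoding(good.readline)
    print(encoding)            # -> 'latin-1'

    # A cookie naming an unknown codec now raises SyntaxError, matching
    # the interpreter, instead of leaking LookupError from codecs.lookup().
    bad = io.BytesIO(b'# coding: bad\nx = 1\n')
    try:
        detect_encoding(bad.readline)
    except SyntaxError as exc:
        print(exc)             # -> unknown encoding: bad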