diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 4bce1ca9c76..f8b16e52976 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -3,7 +3,7 @@ from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, open as tokenize_open, Untokenizer, generate_tokens, - NEWLINE) + NEWLINE, _generate_tokens_from_c_tokenizer) from io import BytesIO, StringIO import unittest from unittest import TestCase, mock @@ -12,7 +12,6 @@ import os import token - # Converts a source string into a list of textual representation # of the tokens such as: # ` NAME 'if' (1, 0) (1, 2)` @@ -1654,5 +1653,865 @@ def test_indentation_semantics_retained(self): self.check_roundtrip(code) +class CTokenizeTest(TestCase): + def check_tokenize(self, s, expected): + # Format the tokens in s in a table format. + # The ENDMARKER and final NEWLINE are omitted. + with self.subTest(source=s): + result = stringify_tokens_from_source( + _generate_tokens_from_c_tokenizer(s), s + ) + self.assertEqual(result, expected.rstrip().splitlines()) + + def test_int(self): + + self.check_tokenize('0xff <= 255', """\ + NUMBER '0xff' (1, 0) (1, 4) + LESSEQUAL '<=' (1, 5) (1, 7) + NUMBER '255' (1, 8) (1, 11) + """) + + self.check_tokenize('0b10 <= 255', """\ + NUMBER '0b10' (1, 0) (1, 4) + LESSEQUAL '<=' (1, 5) (1, 7) + NUMBER '255' (1, 8) (1, 11) + """) + + self.check_tokenize('0o123 <= 0O123', """\ + NUMBER '0o123' (1, 0) (1, 5) + LESSEQUAL '<=' (1, 6) (1, 8) + NUMBER '0O123' (1, 9) (1, 14) + """) + + self.check_tokenize('1234567 > ~0x15', """\ + NUMBER '1234567' (1, 0) (1, 7) + GREATER '>' (1, 8) (1, 9) + TILDE '~' (1, 10) (1, 11) + NUMBER '0x15' (1, 11) (1, 15) + """) + + self.check_tokenize('2134568 != 1231515', """\ + NUMBER '2134568' (1, 0) (1, 7) + NOTEQUAL '!=' (1, 8) (1, 10) + NUMBER '1231515' (1, 11) (1, 18) + """) + + self.check_tokenize('(-124561-1) & 200000000', """\ + LPAR '(' (1, 0) (1, 1) + MINUS '-' (1, 1) (1, 2) + NUMBER '124561' (1, 2) (1, 8) + MINUS '-' (1, 8) (1, 9) + NUMBER '1' (1, 9) (1, 10) + RPAR ')' (1, 10) (1, 11) + AMPER '&' (1, 12) (1, 13) + NUMBER '200000000' (1, 14) (1, 23) + """) + + self.check_tokenize('0xdeadbeef != -1', """\ + NUMBER '0xdeadbeef' (1, 0) (1, 10) + NOTEQUAL '!=' (1, 11) (1, 13) + MINUS '-' (1, 14) (1, 15) + NUMBER '1' (1, 15) (1, 16) + """) + + self.check_tokenize('0xdeadc0de & 12345', """\ + NUMBER '0xdeadc0de' (1, 0) (1, 10) + AMPER '&' (1, 11) (1, 12) + NUMBER '12345' (1, 13) (1, 18) + """) + + self.check_tokenize('0xFF & 0x15 | 1234', """\ + NUMBER '0xFF' (1, 0) (1, 4) + AMPER '&' (1, 5) (1, 6) + NUMBER '0x15' (1, 7) (1, 11) + VBAR '|' (1, 12) (1, 13) + NUMBER '1234' (1, 14) (1, 18) + """) + + def test_float(self): + + self.check_tokenize('x = 3.14159', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '3.14159' (1, 4) (1, 11) + """) + + self.check_tokenize('x = 314159.', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '314159.' 
(1, 4) (1, 11) + """) + + self.check_tokenize('x = .314159', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '.314159' (1, 4) (1, 11) + """) + + self.check_tokenize('x = 3e14159', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '3e14159' (1, 4) (1, 11) + """) + + self.check_tokenize('x = 3E123', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '3E123' (1, 4) (1, 9) + """) + + self.check_tokenize('x+y = 3e-1230', """\ + NAME 'x' (1, 0) (1, 1) + PLUS '+' (1, 1) (1, 2) + NAME 'y' (1, 2) (1, 3) + EQUAL '=' (1, 4) (1, 5) + NUMBER '3e-1230' (1, 6) (1, 13) + """) + + self.check_tokenize('x = 3.14e159', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '3.14e159' (1, 4) (1, 12) + """) + + def test_string(self): + + self.check_tokenize('x = \'\'; y = ""', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING "''" (1, 4) (1, 6) + SEMI ';' (1, 6) (1, 7) + NAME 'y' (1, 8) (1, 9) + EQUAL '=' (1, 10) (1, 11) + STRING '""' (1, 12) (1, 14) + """) + + self.check_tokenize('x = \'"\'; y = "\'"', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING '\\'"\\'' (1, 4) (1, 7) + SEMI ';' (1, 7) (1, 8) + NAME 'y' (1, 9) (1, 10) + EQUAL '=' (1, 11) (1, 12) + STRING '"\\'"' (1, 13) (1, 16) + """) + + self.check_tokenize('x = "doesn\'t "shrink", does it"', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING '"doesn\\'t "' (1, 4) (1, 14) + NAME 'shrink' (1, 14) (1, 20) + STRING '", does it"' (1, 20) (1, 31) + """) + + self.check_tokenize("x = 'abc' + 'ABC'", """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING "'abc'" (1, 4) (1, 9) + PLUS '+' (1, 10) (1, 11) + STRING "'ABC'" (1, 12) (1, 17) + """) + + self.check_tokenize('y = "ABC" + "ABC"', """\ + NAME 'y' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING '"ABC"' (1, 4) (1, 9) + PLUS '+' (1, 10) (1, 11) + STRING '"ABC"' (1, 12) (1, 17) + """) + + self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING "r'abc'" (1, 4) (1, 10) + PLUS '+' (1, 11) (1, 12) + STRING "r'ABC'" (1, 13) (1, 19) + PLUS '+' (1, 20) (1, 21) + STRING "R'ABC'" (1, 22) (1, 28) + PLUS '+' (1, 29) (1, 30) + STRING "R'ABC'" (1, 31) (1, 37) + """) + + self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\ + NAME 'y' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING 'r"abc"' (1, 4) (1, 10) + PLUS '+' (1, 11) (1, 12) + STRING 'r"ABC"' (1, 13) (1, 19) + PLUS '+' (1, 20) (1, 21) + STRING 'R"ABC"' (1, 22) (1, 28) + PLUS '+' (1, 29) (1, 30) + STRING 'R"ABC"' (1, 31) (1, 37) + """) + + self.check_tokenize("u'abc' + U'abc'", """\ + STRING "u'abc'" (1, 0) (1, 6) + PLUS '+' (1, 7) (1, 8) + STRING "U'abc'" (1, 9) (1, 15) + """) + + self.check_tokenize('u"abc" + U"abc"', """\ + STRING 'u"abc"' (1, 0) (1, 6) + PLUS '+' (1, 7) (1, 8) + STRING 'U"abc"' (1, 9) (1, 15) + """) + + self.check_tokenize("b'abc' + B'abc'", """\ + STRING "b'abc'" (1, 0) (1, 6) + PLUS '+' (1, 7) (1, 8) + STRING "B'abc'" (1, 9) (1, 15) + """) + + self.check_tokenize('b"abc" + B"abc"', """\ + STRING 'b"abc"' (1, 0) (1, 6) + PLUS '+' (1, 7) (1, 8) + STRING 'B"abc"' (1, 9) (1, 15) + """) + + self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\ + STRING "br'abc'" (1, 0) (1, 7) + PLUS '+' (1, 8) (1, 9) + STRING "bR'abc'" (1, 10) (1, 17) + PLUS '+' (1, 18) (1, 19) + STRING "Br'abc'" (1, 20) (1, 27) + PLUS '+' (1, 28) (1, 29) + STRING "BR'abc'" (1, 30) (1, 37) + """) + + self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\ + STRING 
'br"abc"' (1, 0) (1, 7) + PLUS '+' (1, 8) (1, 9) + STRING 'bR"abc"' (1, 10) (1, 17) + PLUS '+' (1, 18) (1, 19) + STRING 'Br"abc"' (1, 20) (1, 27) + PLUS '+' (1, 28) (1, 29) + STRING 'BR"abc"' (1, 30) (1, 37) + """) + + self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\ + STRING "rb'abc'" (1, 0) (1, 7) + PLUS '+' (1, 8) (1, 9) + STRING "rB'abc'" (1, 10) (1, 17) + PLUS '+' (1, 18) (1, 19) + STRING "Rb'abc'" (1, 20) (1, 27) + PLUS '+' (1, 28) (1, 29) + STRING "RB'abc'" (1, 30) (1, 37) + """) + + self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\ + STRING 'rb"abc"' (1, 0) (1, 7) + PLUS '+' (1, 8) (1, 9) + STRING 'rB"abc"' (1, 10) (1, 17) + PLUS '+' (1, 18) (1, 19) + STRING 'Rb"abc"' (1, 20) (1, 27) + PLUS '+' (1, 28) (1, 29) + STRING 'RB"abc"' (1, 30) (1, 37) + """) + + self.check_tokenize('"a\\\nde\\\nfg"', """\ + STRING '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3) + """) + + self.check_tokenize('u"a\\\nde"', """\ + STRING 'u"a\\\\\\nde"\' (1, 0) (2, 3) + """) + + self.check_tokenize('rb"a\\\nd"', """\ + STRING 'rb"a\\\\\\nd"\' (1, 0) (2, 2) + """) + + self.check_tokenize(r'"""a\ +b"""', """\ + STRING '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4) + """) + self.check_tokenize(r'u"""a\ +b"""', """\ + STRING 'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4) + """) + self.check_tokenize(r'rb"""a\ +b\ +c"""', """\ + STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4) + """) + + self.check_tokenize('f"abc"', """\ + STRING 'f"abc"' (1, 0) (1, 6) + """) + + self.check_tokenize('fR"a{b}c"', """\ + STRING 'fR"a{b}c"' (1, 0) (1, 9) + """) + + self.check_tokenize('f"""abc"""', """\ + STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10) + """) + + self.check_tokenize(r'f"abc\ +def"', """\ + STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4) + """) + + self.check_tokenize(r'Rf"abc\ +def"', """\ + STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4) + """) + + def test_function(self): + + self.check_tokenize('def d22(a, b, c=2, d=2, *k): pass', """\ + NAME 'def' (1, 0) (1, 3) + NAME 'd22' (1, 4) (1, 7) + LPAR '(' (1, 7) (1, 8) + NAME 'a' (1, 8) (1, 9) + COMMA ',' (1, 9) (1, 10) + NAME 'b' (1, 11) (1, 12) + COMMA ',' (1, 12) (1, 13) + NAME 'c' (1, 14) (1, 15) + EQUAL '=' (1, 15) (1, 16) + NUMBER '2' (1, 16) (1, 17) + COMMA ',' (1, 17) (1, 18) + NAME 'd' (1, 19) (1, 20) + EQUAL '=' (1, 20) (1, 21) + NUMBER '2' (1, 21) (1, 22) + COMMA ',' (1, 22) (1, 23) + STAR '*' (1, 24) (1, 25) + NAME 'k' (1, 25) (1, 26) + RPAR ')' (1, 26) (1, 27) + COLON ':' (1, 27) (1, 28) + NAME 'pass' (1, 29) (1, 33) + """) + + self.check_tokenize('def d01v_(a=1, *k, **w): pass', """\ + NAME 'def' (1, 0) (1, 3) + NAME 'd01v_' (1, 4) (1, 9) + LPAR '(' (1, 9) (1, 10) + NAME 'a' (1, 10) (1, 11) + EQUAL '=' (1, 11) (1, 12) + NUMBER '1' (1, 12) (1, 13) + COMMA ',' (1, 13) (1, 14) + STAR '*' (1, 15) (1, 16) + NAME 'k' (1, 16) (1, 17) + COMMA ',' (1, 17) (1, 18) + DOUBLESTAR '**' (1, 19) (1, 21) + NAME 'w' (1, 21) (1, 22) + RPAR ')' (1, 22) (1, 23) + COLON ':' (1, 23) (1, 24) + NAME 'pass' (1, 25) (1, 29) + """) + + self.check_tokenize('def d23(a: str, b: int=3) -> int: pass', """\ + NAME 'def' (1, 0) (1, 3) + NAME 'd23' (1, 4) (1, 7) + LPAR '(' (1, 7) (1, 8) + NAME 'a' (1, 8) (1, 9) + COLON ':' (1, 9) (1, 10) + NAME 'str' (1, 11) (1, 14) + COMMA ',' (1, 14) (1, 15) + NAME 'b' (1, 16) (1, 17) + COLON ':' (1, 17) (1, 18) + NAME 'int' (1, 19) (1, 22) + EQUAL '=' (1, 22) (1, 23) + NUMBER '3' (1, 23) (1, 24) + RPAR ')' (1, 24) (1, 25) + RARROW '->' (1, 26) (1, 28) + NAME 'int' (1, 29) (1, 32) + COLON ':' (1, 32) (1, 33) + NAME 'pass' (1, 34) (1, 38) + """) + + def 
test_comparison(self): + + self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " + "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\ + NAME 'if' (1, 0) (1, 2) + NUMBER '1' (1, 3) (1, 4) + LESS '<' (1, 5) (1, 6) + NUMBER '1' (1, 7) (1, 8) + GREATER '>' (1, 9) (1, 10) + NUMBER '1' (1, 11) (1, 12) + EQEQUAL '==' (1, 13) (1, 15) + NUMBER '1' (1, 16) (1, 17) + GREATEREQUAL '>=' (1, 18) (1, 20) + NUMBER '5' (1, 21) (1, 22) + LESSEQUAL '<=' (1, 23) (1, 25) + NUMBER '0x15' (1, 26) (1, 30) + LESSEQUAL '<=' (1, 31) (1, 33) + NUMBER '0x12' (1, 34) (1, 38) + NOTEQUAL '!=' (1, 39) (1, 41) + NUMBER '1' (1, 42) (1, 43) + NAME 'and' (1, 44) (1, 47) + NUMBER '5' (1, 48) (1, 49) + NAME 'in' (1, 50) (1, 52) + NUMBER '1' (1, 53) (1, 54) + NAME 'not' (1, 55) (1, 58) + NAME 'in' (1, 59) (1, 61) + NUMBER '1' (1, 62) (1, 63) + NAME 'is' (1, 64) (1, 66) + NUMBER '1' (1, 67) (1, 68) + NAME 'or' (1, 69) (1, 71) + NUMBER '5' (1, 72) (1, 73) + NAME 'is' (1, 74) (1, 76) + NAME 'not' (1, 77) (1, 80) + NUMBER '1' (1, 81) (1, 82) + COLON ':' (1, 82) (1, 83) + NAME 'pass' (1, 84) (1, 88) + """) + + def test_additive(self): + + self.check_tokenize('x = 1 - y + 15 - 1 + 0x124 + z + a[5]', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '1' (1, 4) (1, 5) + MINUS '-' (1, 6) (1, 7) + NAME 'y' (1, 8) (1, 9) + PLUS '+' (1, 10) (1, 11) + NUMBER '15' (1, 12) (1, 14) + MINUS '-' (1, 15) (1, 16) + NUMBER '1' (1, 17) (1, 18) + PLUS '+' (1, 19) (1, 20) + NUMBER '0x124' (1, 21) (1, 26) + PLUS '+' (1, 27) (1, 28) + NAME 'z' (1, 29) (1, 30) + PLUS '+' (1, 31) (1, 32) + NAME 'a' (1, 33) (1, 34) + LSQB '[' (1, 34) (1, 35) + NUMBER '5' (1, 35) (1, 36) + RSQB ']' (1, 36) (1, 37) + """) + + def test_multiplicative(self): + + self.check_tokenize('x = 1//1*1/5*12%0x12@42', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '1' (1, 4) (1, 5) + DOUBLESLASH '//' (1, 5) (1, 7) + NUMBER '1' (1, 7) (1, 8) + STAR '*' (1, 8) (1, 9) + NUMBER '1' (1, 9) (1, 10) + SLASH '/' (1, 10) (1, 11) + NUMBER '5' (1, 11) (1, 12) + STAR '*' (1, 12) (1, 13) + NUMBER '12' (1, 13) (1, 15) + PERCENT '%' (1, 15) (1, 16) + NUMBER '0x12' (1, 16) (1, 20) + AT '@' (1, 20) (1, 21) + NUMBER '42' (1, 21) (1, 23) + """) + + def test_unary(self): + + self.check_tokenize('~1 ^ 1 & 1 |1 ^ -1', """\ + TILDE '~' (1, 0) (1, 1) + NUMBER '1' (1, 1) (1, 2) + CIRCUMFLEX '^' (1, 3) (1, 4) + NUMBER '1' (1, 5) (1, 6) + AMPER '&' (1, 7) (1, 8) + NUMBER '1' (1, 9) (1, 10) + VBAR '|' (1, 11) (1, 12) + NUMBER '1' (1, 12) (1, 13) + CIRCUMFLEX '^' (1, 14) (1, 15) + MINUS '-' (1, 16) (1, 17) + NUMBER '1' (1, 17) (1, 18) + """) + + self.check_tokenize('-1*1/1+1*1//1 - ---1**1', """\ + MINUS '-' (1, 0) (1, 1) + NUMBER '1' (1, 1) (1, 2) + STAR '*' (1, 2) (1, 3) + NUMBER '1' (1, 3) (1, 4) + SLASH '/' (1, 4) (1, 5) + NUMBER '1' (1, 5) (1, 6) + PLUS '+' (1, 6) (1, 7) + NUMBER '1' (1, 7) (1, 8) + STAR '*' (1, 8) (1, 9) + NUMBER '1' (1, 9) (1, 10) + DOUBLESLASH '//' (1, 10) (1, 12) + NUMBER '1' (1, 12) (1, 13) + MINUS '-' (1, 14) (1, 15) + MINUS '-' (1, 16) (1, 17) + MINUS '-' (1, 17) (1, 18) + MINUS '-' (1, 18) (1, 19) + NUMBER '1' (1, 19) (1, 20) + DOUBLESTAR '**' (1, 20) (1, 22) + NUMBER '1' (1, 22) (1, 23) + """) + + def test_selector(self): + + self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\ + NAME 'import' (1, 0) (1, 6) + NAME 'sys' (1, 7) (1, 10) + COMMA ',' (1, 10) (1, 11) + NAME 'time' (1, 12) (1, 16) + NEWLINE '' (1, 16) (1, 16) + NAME 'x' (2, 0) (2, 1) + EQUAL '=' (2, 2) (2, 3) + NAME 'sys' (2, 4) (2, 7) + DOT '.' 
(2, 7) (2, 8) + NAME 'modules' (2, 8) (2, 15) + LSQB '[' (2, 15) (2, 16) + STRING "'time'" (2, 16) (2, 22) + RSQB ']' (2, 22) (2, 23) + DOT '.' (2, 23) (2, 24) + NAME 'time' (2, 24) (2, 28) + LPAR '(' (2, 28) (2, 29) + RPAR ')' (2, 29) (2, 30) + """) + + def test_method(self): + + self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\ + AT '@' (1, 0) (1, 1) + NAME 'staticmethod' (1, 1) (1, 13) + NEWLINE '' (1, 13) (1, 13) + NAME 'def' (2, 0) (2, 3) + NAME 'foo' (2, 4) (2, 7) + LPAR '(' (2, 7) (2, 8) + NAME 'x' (2, 8) (2, 9) + COMMA ',' (2, 9) (2, 10) + NAME 'y' (2, 10) (2, 11) + RPAR ')' (2, 11) (2, 12) + COLON ':' (2, 12) (2, 13) + NAME 'pass' (2, 14) (2, 18) + """) + + def test_tabs(self): + + self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\ + AT '@' (1, 0) (1, 1) + NAME 'staticmethod' (1, 1) (1, 13) + NEWLINE '' (1, 13) (1, 13) + NAME 'def' (2, 0) (2, 3) + NAME 'foo' (2, 4) (2, 7) + LPAR '(' (2, 7) (2, 8) + NAME 'x' (2, 8) (2, 9) + COMMA ',' (2, 9) (2, 10) + NAME 'y' (2, 10) (2, 11) + RPAR ')' (2, 11) (2, 12) + COLON ':' (2, 12) (2, 13) + NAME 'pass' (2, 14) (2, 18) + """) + + def test_async(self): + + self.check_tokenize('async = 1', """\ + ASYNC 'async' (1, 0) (1, 5) + EQUAL '=' (1, 6) (1, 7) + NUMBER '1' (1, 8) (1, 9) + """) + + self.check_tokenize('a = (async = 1)', """\ + NAME 'a' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + LPAR '(' (1, 4) (1, 5) + ASYNC 'async' (1, 5) (1, 10) + EQUAL '=' (1, 11) (1, 12) + NUMBER '1' (1, 13) (1, 14) + RPAR ')' (1, 14) (1, 15) + """) + + self.check_tokenize('async()', """\ + ASYNC 'async' (1, 0) (1, 5) + LPAR '(' (1, 5) (1, 6) + RPAR ')' (1, 6) (1, 7) + """) + + self.check_tokenize('class async(Bar):pass', """\ + NAME 'class' (1, 0) (1, 5) + ASYNC 'async' (1, 6) (1, 11) + LPAR '(' (1, 11) (1, 12) + NAME 'Bar' (1, 12) (1, 15) + RPAR ')' (1, 15) (1, 16) + COLON ':' (1, 16) (1, 17) + NAME 'pass' (1, 17) (1, 21) + """) + + self.check_tokenize('class async:pass', """\ + NAME 'class' (1, 0) (1, 5) + ASYNC 'async' (1, 6) (1, 11) + COLON ':' (1, 11) (1, 12) + NAME 'pass' (1, 12) (1, 16) + """) + + self.check_tokenize('await = 1', """\ + AWAIT 'await' (1, 0) (1, 5) + EQUAL '=' (1, 6) (1, 7) + NUMBER '1' (1, 8) (1, 9) + """) + + self.check_tokenize('foo.async', """\ + NAME 'foo' (1, 0) (1, 3) + DOT '.' (1, 3) (1, 4) + ASYNC 'async' (1, 4) (1, 9) + """) + + self.check_tokenize('async for a in b: pass', """\ + ASYNC 'async' (1, 0) (1, 5) + NAME 'for' (1, 6) (1, 9) + NAME 'a' (1, 10) (1, 11) + NAME 'in' (1, 12) (1, 14) + NAME 'b' (1, 15) (1, 16) + COLON ':' (1, 16) (1, 17) + NAME 'pass' (1, 18) (1, 22) + """) + + self.check_tokenize('async with a as b: pass', """\ + ASYNC 'async' (1, 0) (1, 5) + NAME 'with' (1, 6) (1, 10) + NAME 'a' (1, 11) (1, 12) + NAME 'as' (1, 13) (1, 15) + NAME 'b' (1, 16) (1, 17) + COLON ':' (1, 17) (1, 18) + NAME 'pass' (1, 19) (1, 23) + """) + + self.check_tokenize('async.foo', """\ + ASYNC 'async' (1, 0) (1, 5) + DOT '.' (1, 5) (1, 6) + NAME 'foo' (1, 6) (1, 9) + """) + + self.check_tokenize('async', """\ + ASYNC 'async' (1, 0) (1, 5) + """) + + self.check_tokenize('async\n#comment\nawait', """\ + ASYNC 'async' (1, 0) (1, 5) + NEWLINE '' (1, 5) (1, 5) + AWAIT 'await' (3, 0) (3, 5) + """) + + self.check_tokenize('async\n...\nawait', """\ + ASYNC 'async' (1, 0) (1, 5) + NEWLINE '' (1, 5) (1, 5) + ELLIPSIS '...' 
(2, 0) (2, 3) + NEWLINE '' (2, 3) (2, 3) + AWAIT 'await' (3, 0) (3, 5) + """) + + self.check_tokenize('async\nawait', """\ + ASYNC 'async' (1, 0) (1, 5) + NEWLINE '' (1, 5) (1, 5) + AWAIT 'await' (2, 0) (2, 5) + """) + + self.check_tokenize('foo.async + 1', """\ + NAME 'foo' (1, 0) (1, 3) + DOT '.' (1, 3) (1, 4) + ASYNC 'async' (1, 4) (1, 9) + PLUS '+' (1, 10) (1, 11) + NUMBER '1' (1, 12) (1, 13) + """) + + self.check_tokenize('async def foo(): pass', """\ + ASYNC 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + LPAR '(' (1, 13) (1, 14) + RPAR ')' (1, 14) (1, 15) + COLON ':' (1, 15) (1, 16) + NAME 'pass' (1, 17) (1, 21) + """) + + self.check_tokenize('''\ +async def foo(): + def foo(await): + await = 1 + if 1: + await +async += 1 +''', """\ + ASYNC 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + LPAR '(' (1, 13) (1, 14) + RPAR ')' (1, 14) (1, 15) + COLON ':' (1, 15) (1, 16) + NEWLINE '' (1, 16) (1, 16) + INDENT '' (2, -1) (2, -1) + NAME 'def' (2, 2) (2, 5) + NAME 'foo' (2, 6) (2, 9) + LPAR '(' (2, 9) (2, 10) + AWAIT 'await' (2, 10) (2, 15) + RPAR ')' (2, 15) (2, 16) + COLON ':' (2, 16) (2, 17) + NEWLINE '' (2, 17) (2, 17) + INDENT '' (3, -1) (3, -1) + AWAIT 'await' (3, 4) (3, 9) + EQUAL '=' (3, 10) (3, 11) + NUMBER '1' (3, 12) (3, 13) + NEWLINE '' (3, 13) (3, 13) + DEDENT '' (4, -1) (4, -1) + NAME 'if' (4, 2) (4, 4) + NUMBER '1' (4, 5) (4, 6) + COLON ':' (4, 6) (4, 7) + NEWLINE '' (4, 7) (4, 7) + INDENT '' (5, -1) (5, -1) + AWAIT 'await' (5, 4) (5, 9) + NEWLINE '' (5, 9) (5, 9) + DEDENT '' (6, -1) (6, -1) + DEDENT '' (6, -1) (6, -1) + ASYNC 'async' (6, 0) (6, 5) + PLUSEQUAL '+=' (6, 6) (6, 8) + NUMBER '1' (6, 9) (6, 10) + NEWLINE '' (6, 10) (6, 10) + """) + + self.check_tokenize('async def foo():\n async for i in 1: pass', """\ + ASYNC 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + LPAR '(' (1, 13) (1, 14) + RPAR ')' (1, 14) (1, 15) + COLON ':' (1, 15) (1, 16) + NEWLINE '' (1, 16) (1, 16) + INDENT '' (2, -1) (2, -1) + ASYNC 'async' (2, 2) (2, 7) + NAME 'for' (2, 8) (2, 11) + NAME 'i' (2, 12) (2, 13) + NAME 'in' (2, 14) (2, 16) + NUMBER '1' (2, 17) (2, 18) + COLON ':' (2, 18) (2, 19) + NAME 'pass' (2, 20) (2, 24) + DEDENT '' (2, -1) (2, -1) + """) + + self.check_tokenize('async def foo(async): await', """\ + ASYNC 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + LPAR '(' (1, 13) (1, 14) + ASYNC 'async' (1, 14) (1, 19) + RPAR ')' (1, 19) (1, 20) + COLON ':' (1, 20) (1, 21) + AWAIT 'await' (1, 22) (1, 27) + """) + + self.check_tokenize('''\ +def f(): + + def baz(): pass + async def bar(): pass + + await = 2''', """\ + NAME 'def' (1, 0) (1, 3) + NAME 'f' (1, 4) (1, 5) + LPAR '(' (1, 5) (1, 6) + RPAR ')' (1, 6) (1, 7) + COLON ':' (1, 7) (1, 8) + NEWLINE '' (1, 8) (1, 8) + INDENT '' (3, -1) (3, -1) + NAME 'def' (3, 2) (3, 5) + NAME 'baz' (3, 6) (3, 9) + LPAR '(' (3, 9) (3, 10) + RPAR ')' (3, 10) (3, 11) + COLON ':' (3, 11) (3, 12) + NAME 'pass' (3, 13) (3, 17) + NEWLINE '' (3, 17) (3, 17) + ASYNC 'async' (4, 2) (4, 7) + NAME 'def' (4, 8) (4, 11) + NAME 'bar' (4, 12) (4, 15) + LPAR '(' (4, 15) (4, 16) + RPAR ')' (4, 16) (4, 17) + COLON ':' (4, 17) (4, 18) + NAME 'pass' (4, 19) (4, 23) + NEWLINE '' (4, 23) (4, 23) + AWAIT 'await' (6, 2) (6, 7) + EQUAL '=' (6, 8) (6, 9) + NUMBER '2' (6, 10) (6, 11) + DEDENT '' (6, -1) (6, -1) + """) + + self.check_tokenize('''\ +async def f(): + + def baz(): pass + async def bar(): pass + + await = 2''', """\ + ASYNC 'async' (1, 0) (1, 5) + 
NAME 'def' (1, 6) (1, 9) + NAME 'f' (1, 10) (1, 11) + LPAR '(' (1, 11) (1, 12) + RPAR ')' (1, 12) (1, 13) + COLON ':' (1, 13) (1, 14) + NEWLINE '' (1, 14) (1, 14) + INDENT '' (3, -1) (3, -1) + NAME 'def' (3, 2) (3, 5) + NAME 'baz' (3, 6) (3, 9) + LPAR '(' (3, 9) (3, 10) + RPAR ')' (3, 10) (3, 11) + COLON ':' (3, 11) (3, 12) + NAME 'pass' (3, 13) (3, 17) + NEWLINE '' (3, 17) (3, 17) + ASYNC 'async' (4, 2) (4, 7) + NAME 'def' (4, 8) (4, 11) + NAME 'bar' (4, 12) (4, 15) + LPAR '(' (4, 15) (4, 16) + RPAR ')' (4, 16) (4, 17) + COLON ':' (4, 17) (4, 18) + NAME 'pass' (4, 19) (4, 23) + NEWLINE '' (4, 23) (4, 23) + AWAIT 'await' (6, 2) (6, 7) + EQUAL '=' (6, 8) (6, 9) + NUMBER '2' (6, 10) (6, 11) + DEDENT '' (6, -1) (6, -1) + """) + + def test_unicode(self): + + self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\ + NAME 'Örter' (1, 0) (1, 6) + EQUAL '=' (1, 7) (1, 8) + STRING "u'places'" (1, 9) (1, 18) + NEWLINE '' (1, 18) (1, 18) + NAME 'grün' (2, 0) (2, 5) + EQUAL '=' (2, 6) (2, 7) + STRING "U'green'" (2, 8) (2, 16) + """) + + def test_invalid_syntax(self): + def get_tokens(string): + return list(_generate_tokens_from_c_tokenizer(string)) + + self.assertRaises(SyntaxError, get_tokens, "(1+2]") + self.assertRaises(SyntaxError, get_tokens, "(1+2}") + self.assertRaises(SyntaxError, get_tokens, "{1+2]") + + self.assertRaises(SyntaxError, get_tokens, "1_") + self.assertRaises(SyntaxError, get_tokens, "1.2_") + self.assertRaises(SyntaxError, get_tokens, "1e2_") + self.assertRaises(SyntaxError, get_tokens, "1e+") + + self.assertRaises(SyntaxError, get_tokens, "\xa0") + self.assertRaises(SyntaxError, get_tokens, "€") + + self.assertRaises(SyntaxError, get_tokens, "0b12") + self.assertRaises(SyntaxError, get_tokens, "0b1_2") + self.assertRaises(SyntaxError, get_tokens, "0b2") + self.assertRaises(SyntaxError, get_tokens, "0b1_") + self.assertRaises(SyntaxError, get_tokens, "0b") + self.assertRaises(SyntaxError, get_tokens, "0o18") + self.assertRaises(SyntaxError, get_tokens, "0o1_8") + self.assertRaises(SyntaxError, get_tokens, "0o8") + self.assertRaises(SyntaxError, get_tokens, "0o1_") + self.assertRaises(SyntaxError, get_tokens, "0o") + self.assertRaises(SyntaxError, get_tokens, "0x1_") + self.assertRaises(SyntaxError, get_tokens, "0x") + self.assertRaises(SyntaxError, get_tokens, "1_") + self.assertRaises(SyntaxError, get_tokens, "012") + self.assertRaises(SyntaxError, get_tokens, "1.2_") + self.assertRaises(SyntaxError, get_tokens, "1e2_") + self.assertRaises(SyntaxError, get_tokens, "1e+") + + self.assertRaises(SyntaxError, get_tokens, "'sdfsdf") + self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''") + + self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000) + self.assertRaises(SyntaxError, get_tokens, "]") + + if __name__ == "__main__": unittest.main() diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 7d7736fe985..0b9e2383100 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -680,5 +680,13 @@ def error(message, filename=None, location=None): perror("unexpected error: %s" % err) raise +def _generate_tokens_from_c_tokenizer(source): + """Tokenize a source reading Python code as unicode strings using the internal C tokenizer""" + import _tokenize as c_tokenizer + for info in c_tokenizer.TokenizerIter(source): + tok, type, lineno, end_lineno, col_off, end_col_off, line = info + yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line) + + if __name__ == "__main__": main() diff --git a/Makefile.pre.in b/Makefile.pre.in index f503ac4d876..1007f440759 
100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -339,6 +339,7 @@ PARSER_HEADERS= \
 PYTHON_OBJS= \
         Python/_warnings.o \
         Python/Python-ast.o \
+        Python/Python-tokenize.o \
         Python/asdl.o \
         Python/ast.o \
         Python/ast_opt.o \
diff --git a/Modules/config.c.in b/Modules/config.c.in
index d69e8e88b0c..6081f957595 100644
--- a/Modules/config.c.in
+++ b/Modules/config.c.in
@@ -28,6 +28,7 @@ extern PyObject* PyMarshal_Init(void);
 extern PyObject* PyInit__imp(void);
 extern PyObject* PyInit_gc(void);
 extern PyObject* PyInit__ast(void);
+extern PyObject* PyInit__tokenize(void);
 extern PyObject* _PyWarnings_Init(void);
 extern PyObject* PyInit__string(void);

@@ -44,6 +45,9 @@ struct _inittab _PyImport_Inittab[] = {
     /* This lives in Python/Python-ast.c */
     {"_ast", PyInit__ast},

+    /* This lives in Python/Python-tokenize.c */
+    {"_tokenize", PyInit__tokenize},
+
     /* These entries are here for sys.builtin_module_names */
     {"builtins", NULL},
     {"sys", NULL},
diff --git a/PC/config.c b/PC/config.c
index 11743ea45a9..9d900c78e40 100644
--- a/PC/config.c
+++ b/PC/config.c
@@ -72,9 +72,8 @@ extern PyObject* _PyWarnings_Init(void);
 extern PyObject* PyInit__string(void);
 extern PyObject* PyInit__stat(void);
 extern PyObject* PyInit__opcode(void);
-
 extern PyObject* PyInit__contextvars(void);
-
+extern PyObject* PyInit__tokenize(void);

 /* tools/freeze/makeconfig.py marker for additional "extern" */
 /* -- ADDMODULE MARKER 1 -- */
@@ -83,7 +82,6 @@ extern PyObject* PyMarshal_Init(void);
 extern PyObject* PyInit__imp(void);

 struct _inittab _PyImport_Inittab[] = {
-
     {"_abc", PyInit__abc},
     {"array", PyInit_array},
     {"_ast", PyInit__ast},
@@ -105,6 +103,7 @@ struct _inittab _PyImport_Inittab[] = {
     {"_blake2", PyInit__blake2},
     {"time", PyInit_time},
     {"_thread", PyInit__thread},
+    {"_tokenize", PyInit__tokenize},
     {"_typing", PyInit__typing},
     {"_statistics", PyInit__statistics},
 #ifdef WIN32
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index ebc0f2879f9..b8cadf46935 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -488,6 +488,7 @@
+    <ClCompile Include="..\Python\Python-tokenize.c" />
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
new file mode 100644
index 00000000000..b9fb1693ce1
--- /dev/null
+++ b/Python/Python-tokenize.c
@@ -0,0 +1,195 @@
+#include "Python.h"
+#include "../Parser/tokenizer.h"
+
+static struct PyModuleDef _tokenizemodule;
+
+typedef struct {
+    PyTypeObject* TokenizerIter;
+} tokenize_state;
+
+static tokenize_state*
+get_tokenize_state(PyObject* module)
+{
+    return (tokenize_state*)PyModule_GetState(module);
+}
+
+#define _tokenize_get_state_by_type(type) \
+    get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))
+
+#include "clinic/Python-tokenize.c.h"
+
+/*[clinic input]
+module _tokenizer
+class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
+[clinic start generated code]*/
+/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
+
+typedef struct {
+    PyObject_HEAD
+    struct tok_state* tok;
+} tokenizeriterobject;
+
+/*[clinic input]
+@classmethod
+_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
+
+    source: str
+[clinic start generated code]*/
+
+static PyObject *
+tokenizeriter_new_impl(PyTypeObject *type, const char *source)
+/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
+{
+    tokenizeriterobject* self = (tokenizeriterobject*)type->tp_alloc(type, 0);
+    if (self == NULL) {
+        return NULL;
+    }
+    PyObject* filename = PyUnicode_FromString("<string>");
+    if (filename == NULL) {
+        return NULL;
+    }
+    self->tok = PyTokenizer_FromUTF8(source, 1);
+    if (self->tok == NULL) {
+        return NULL;
+    }
+    self->tok->filename = filename;
+    return (PyObject*)self;
+}
+
+static PyObject*
+tokenizeriter_next(tokenizeriterobject* it)
+{
+    const char* start;
+    const char* end;
+    int type = PyTokenizer_Get(it->tok, &start, &end);
+    if (type == ERRORTOKEN && PyErr_Occurred()) {
+        return NULL;
+    }
+    if (type == ERRORTOKEN || type == ENDMARKER) {
+        PyErr_SetString(PyExc_StopIteration, "EOF");
+        return NULL;
+    }
+    PyObject* str = NULL;
+    if (start == NULL || end == NULL) {
+        str = PyUnicode_FromString("");
+    } else {
+        str = PyUnicode_FromStringAndSize(start, end - start);
+    }
+    if (str == NULL) {
+        return NULL;
+    }
+
+    Py_ssize_t size = it->tok->inp - it->tok->buf;
+    PyObject* line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+    if (line == NULL) {
+        Py_DECREF(str);
+        return NULL;
+    }
+    const char* line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
+    int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
+    int end_lineno = it->tok->lineno;
+    int col_offset = -1;
+    int end_col_offset = -1;
+    if (start != NULL && start >= line_start) {
+        col_offset = (int)(start - line_start);
+    }
+    if (end != NULL && end >= it->tok->line_start) {
+        end_col_offset = (int)(end - it->tok->line_start);
+    }
+
+    return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+}
+
+static void
+tokenizeriter_dealloc(tokenizeriterobject* it)
+{
+    PyTypeObject* tp = Py_TYPE(it);
+    PyTokenizer_Free(it->tok);
+    tp->tp_free(it);
+    Py_DECREF(tp);
+}
+
+static PyType_Slot tokenizeriter_slots[] = {
+    {Py_tp_new, tokenizeriter_new},
+    {Py_tp_dealloc, tokenizeriter_dealloc},
+    {Py_tp_getattro, PyObject_GenericGetAttr},
+    {Py_tp_iter, PyObject_SelfIter},
+    {Py_tp_iternext, tokenizeriter_next},
+    {0, NULL},
+};
+
+static PyType_Spec tokenizeriter_spec = {
+    .name = "_tokenize.TokenizerIter",
+    .basicsize = sizeof(tokenizeriterobject),
+    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
+    .slots = tokenizeriter_slots,
+};
+
+
+static int
+tokenizemodule_exec(PyObject* m)
+{
+    tokenize_state* state = get_tokenize_state(m);
+    if (state == NULL) {
+        return -1;
+    }
+
+    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(
+        m, &tokenizeriter_spec, NULL);
+    if (state->TokenizerIter == NULL) {
+        return -1;
+    }
+    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+static PyMethodDef tokenize_methods[] = {
+    {NULL, NULL, 0, NULL} /* Sentinel */
+};
+
+static PyModuleDef_Slot tokenizemodule_slots[] = {
+    {Py_mod_exec, tokenizemodule_exec},
+    {0, NULL}
+};
+
+static int
+tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
+{
+    tokenize_state *state = get_tokenize_state(m);
+    Py_VISIT(state->TokenizerIter);
+    return 0;
+}
+
+static int
+tokenizemodule_clear(PyObject *m)
+{
+    tokenize_state *state = get_tokenize_state(m);
+    Py_CLEAR(state->TokenizerIter);
+    return 0;
+}
+
+static void
+tokenizemodule_free(void *m)
+{
+    tokenizemodule_clear((PyObject *)m);
+}
+
+static struct PyModuleDef _tokenizemodule = {
+    PyModuleDef_HEAD_INIT,
+    .m_name = "_tokenize",
+    .m_size = sizeof(tokenize_state),
+    .m_slots = tokenizemodule_slots,
+    .m_methods = tokenize_methods,
+    .m_traverse = tokenizemodule_traverse,
+    .m_clear = tokenizemodule_clear,
+    .m_free = tokenizemodule_free,
+};
+
+PyMODINIT_FUNC
+PyInit__tokenize(void)
+{
+    return PyModuleDef_Init(&_tokenizemodule);
+}
diff --git a/Python/clinic/Python-tokenize.c.h b/Python/clinic/Python-tokenize.c.h
new file mode 100644
index 00000000000..050b4d49448
--- /dev/null
+++ b/Python/clinic/Python-tokenize.c.h
@@ -0,0 +1,41 @@
+/*[clinic input]
+preserve
+[clinic start generated code]*/
+
+static PyObject *
+tokenizeriter_new_impl(PyTypeObject *type, const char *source);
+
+static PyObject *
+tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+{
+    PyObject *return_value = NULL;
+    static const char * const _keywords[] = {"source", NULL};
+    static _PyArg_Parser _parser = {NULL, _keywords, "tokenizeriter", 0};
+    PyObject *argsbuf[1];
+    PyObject * const *fastargs;
+    Py_ssize_t nargs = PyTuple_GET_SIZE(args);
+    const char *source;
+
+    fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 0, argsbuf);
+    if (!fastargs) {
+        goto exit;
+    }
+    if (!PyUnicode_Check(fastargs[0])) {
+        _PyArg_BadArgument("tokenizeriter", "argument 'source'", "str", fastargs[0]);
+        goto exit;
+    }
+    Py_ssize_t source_length;
+    source = PyUnicode_AsUTF8AndSize(fastargs[0], &source_length);
+    if (source == NULL) {
+        goto exit;
+    }
+    if (strlen(source) != (size_t)source_length) {
+        PyErr_SetString(PyExc_ValueError, "embedded null character");
+        goto exit;
+    }
+    return_value = tokenizeriter_new_impl(type, source);
+
+exit:
+    return return_value;
+}
+/*[clinic end generated code: output=dfcd64774e01bfe6 input=a9049054013a1b77]*/
diff --git a/Python/stdlib_module_names.h b/Python/stdlib_module_names.h
index 3c5f1768305..2f75c2e54cd 100644
--- a/Python/stdlib_module_names.h
+++ b/Python/stdlib_module_names.h
@@ -80,6 +80,7 @@ static const char* _Py_stdlib_module_names[] = {
 "_thread",
 "_threading_local",
 "_tkinter",
+"_tokenize",
 "_tracemalloc",
 "_typing",
 "_uuid",
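
Example usage (an illustration, not part of the patch): a minimal sketch of driving the private helper added to Lib/tokenize.py above, assuming an interpreter built with the new _tokenize extension module. The helper is internal, so its name and output format are not a stable API; the sample source string is arbitrary.

    from tokenize import _generate_tokens_from_c_tokenizer
    from token import tok_name

    source = "x = 3.14159\n"

    # Each yielded item is a tokenize.TokenInfo built from the C tokenizer's
    # output: (type, string, (start_line, start_col), (end_line, end_col), line).
    for tok in _generate_tokens_from_c_tokenizer(source):
        print(f"{tok_name[tok.type]:10} {tok.string!r:13} {tok.start} {tok.end}")

    # Invalid input surfaces as SyntaxError while the generator is consumed,
    # mirroring CTokenizeTest.test_invalid_syntax above, e.g.:
    # list(_generate_tokens_from_c_tokenizer("(1+2]"))  ->  raises SyntaxError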