From 96ec934e755355cfc5af036db8641646b7ddb45e Mon Sep 17 00:00:00 2001 From: Yury Selivanov Date: Thu, 23 Jul 2015 15:01:58 +0300 Subject: [PATCH] Issue #24619: Simplify async/await tokenization. This commit simplifies async/await tokenization in tokenizer.c, tokenize.py & lib2to3/tokenize.py. Previous solution was to keep a stack of async-def & def blocks, whereas the new approach is just to remember position of the outermost async-def block. This change won't bring any parsing performance improvements, but it makes the code much easier to read and validate. --- Lib/lib2to3/pgen2/tokenize.py | 33 ++++---- Lib/lib2to3/tests/test_parser.py | 22 ++++++ Lib/test/test_coroutines.py | 1 + Lib/test/test_tokenize.py | 73 ++++++++++++++++++ Lib/tokenize.py | 39 ++++++---- Parser/tokenizer.c | 126 ++++++++++--------------------- Parser/tokenizer.h | 21 ++---- 7 files changed, 183 insertions(+), 132 deletions(-) diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py index 896b0fa0ad4..1ff1c61ee22 100644 --- a/Lib/lib2to3/pgen2/tokenize.py +++ b/Lib/lib2to3/pgen2/tokenize.py @@ -366,10 +366,11 @@ def generate_tokens(readline): contline = None indents = [0] - # 'stashed' and 'ctx' are used for async/await parsing + # 'stashed' and 'async_*' are used for async/await parsing stashed = None - ctx = [('sync', 0)] - in_async = 0 + async_def = False + async_def_indent = 0 + async_def_nl = False while 1: # loop over lines in stream try: @@ -438,15 +439,18 @@ def generate_tokens(readline): ("", lnum, pos, line)) indents = indents[:-1] - cur_indent = indents[-1] - while len(ctx) > 1 and ctx[-1][1] >= cur_indent: - if ctx[-1][0] == 'async': - in_async -= 1 - assert in_async >= 0 - ctx.pop() + if async_def and async_def_indent >= indents[-1]: + async_def = False + async_def_nl = False + async_def_indent = 0 yield (DEDENT, '', (lnum, pos), (lnum, pos), line) + if async_def and async_def_nl and async_def_indent >= indents[-1]: + async_def = False + async_def_nl = False 
+ async_def_indent = 0 + else: # continued statement if not line: raise TokenError("EOF in multi-line statement", (lnum, 0)) @@ -466,10 +470,13 @@ def generate_tokens(readline): newline = NEWLINE if parenlev > 0: newline = NL + elif async_def: + async_def_nl = True if stashed: yield stashed stashed = None yield (newline, token, spos, epos, line) + elif initial == '#': assert not token.endswith("\n") if stashed: @@ -508,7 +515,7 @@ def generate_tokens(readline): yield (STRING, token, spos, epos, line) elif initial in namechars: # ordinary name if token in ('async', 'await'): - if in_async: + if async_def: yield (ASYNC if token == 'async' else AWAIT, token, spos, epos, line) continue @@ -523,15 +530,13 @@ def generate_tokens(readline): and stashed[0] == NAME and stashed[1] == 'async'): - ctx.append(('async', indents[-1])) - in_async += 1 + async_def = True + async_def_indent = indents[-1] yield (ASYNC, stashed[1], stashed[2], stashed[3], stashed[4]) stashed = None - else: - ctx.append(('sync', indents[-1])) if stashed: yield stashed diff --git a/Lib/lib2to3/tests/test_parser.py b/Lib/lib2to3/tests/test_parser.py index 107b5ab68b8..b533c01e28f 100644 --- a/Lib/lib2to3/tests/test_parser.py +++ b/Lib/lib2to3/tests/test_parser.py @@ -67,10 +67,32 @@ def test_await_expr(self): await x """) + self.validate("""async def foo(): + + def foo(): pass + + def foo(): pass + + await x + """) + + self.validate("""async def foo(): return await a""") + + self.validate("""def foo(): + def foo(): pass + async def foo(): await x + """) + self.invalid_syntax("await x") self.invalid_syntax("""def foo(): await x""") + self.invalid_syntax("""def foo(): + def foo(): pass + async def foo(): pass + await x + """) + def test_async_var(self): self.validate("""async = 1""") self.validate("""await = 1""") diff --git a/Lib/test/test_coroutines.py b/Lib/test/test_coroutines.py index 14682ca6047..10de85644ee 100644 --- a/Lib/test/test_coroutines.py +++ b/Lib/test/test_coroutines.py @@ -330,6 +330,7 
@@ async def bar(): return await_ async def f(): async def g(): pass await z + await = 1 self.assertTrue(inspect.iscoroutinefunction(f)) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index e3205628818..b7ca08949a3 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -840,6 +840,79 @@ OP ')' (1, 19) (1, 20) OP ':' (1, 20) (1, 21) AWAIT 'await' (1, 22) (1, 27) + + >>> dump_tokens('''def f(): + ... + ... def baz(): pass + ... async def bar(): pass + ... + ... await = 2''') + ENCODING 'utf-8' (0, 0) (0, 0) + NAME 'def' (1, 0) (1, 3) + NAME 'f' (1, 4) (1, 5) + OP '(' (1, 5) (1, 6) + OP ')' (1, 6) (1, 7) + OP ':' (1, 7) (1, 8) + NEWLINE '\\n' (1, 8) (1, 9) + NL '\\n' (2, 0) (2, 1) + INDENT ' ' (3, 0) (3, 2) + NAME 'def' (3, 2) (3, 5) + NAME 'baz' (3, 6) (3, 9) + OP '(' (3, 9) (3, 10) + OP ')' (3, 10) (3, 11) + OP ':' (3, 11) (3, 12) + NAME 'pass' (3, 13) (3, 17) + NEWLINE '\\n' (3, 17) (3, 18) + ASYNC 'async' (4, 2) (4, 7) + NAME 'def' (4, 8) (4, 11) + NAME 'bar' (4, 12) (4, 15) + OP '(' (4, 15) (4, 16) + OP ')' (4, 16) (4, 17) + OP ':' (4, 17) (4, 18) + NAME 'pass' (4, 19) (4, 23) + NEWLINE '\\n' (4, 23) (4, 24) + NL '\\n' (5, 0) (5, 1) + NAME 'await' (6, 2) (6, 7) + OP '=' (6, 8) (6, 9) + NUMBER '2' (6, 10) (6, 11) + DEDENT '' (7, 0) (7, 0) + + >>> dump_tokens('''async def f(): + ... + ... def baz(): pass + ... async def bar(): pass + ... + ... 
await = 2''') + ENCODING 'utf-8' (0, 0) (0, 0) + ASYNC 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'f' (1, 10) (1, 11) + OP '(' (1, 11) (1, 12) + OP ')' (1, 12) (1, 13) + OP ':' (1, 13) (1, 14) + NEWLINE '\\n' (1, 14) (1, 15) + NL '\\n' (2, 0) (2, 1) + INDENT ' ' (3, 0) (3, 2) + NAME 'def' (3, 2) (3, 5) + NAME 'baz' (3, 6) (3, 9) + OP '(' (3, 9) (3, 10) + OP ')' (3, 10) (3, 11) + OP ':' (3, 11) (3, 12) + NAME 'pass' (3, 13) (3, 17) + NEWLINE '\\n' (3, 17) (3, 18) + ASYNC 'async' (4, 2) (4, 7) + NAME 'def' (4, 8) (4, 11) + NAME 'bar' (4, 12) (4, 15) + OP '(' (4, 15) (4, 16) + OP ')' (4, 16) (4, 17) + OP ':' (4, 17) (4, 18) + NAME 'pass' (4, 19) (4, 23) + NEWLINE '\\n' (4, 23) (4, 24) + NL '\\n' (5, 0) (5, 1) + AWAIT 'await' (6, 2) (6, 7) + OP '=' (6, 8) (6, 9) + NUMBER '2' (6, 10) (6, 11) + DEDENT '' (7, 0) (7, 0) """ from test import support diff --git a/Lib/tokenize.py b/Lib/tokenize.py index c3efdda528d..65d06e53f3b 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -498,10 +498,11 @@ def _tokenize(readline, encoding): contline = None indents = [0] - # 'stashed' and 'ctx' are used for async/await parsing + # 'stashed' and 'async_*' are used for async/await parsing stashed = None - ctx = [('sync', 0)] - in_async = 0 + async_def = False + async_def_indent = 0 + async_def_nl = False if encoding is not None: if encoding == "utf-8-sig": @@ -579,15 +580,18 @@ def _tokenize(readline, encoding): ("", lnum, pos, line)) indents = indents[:-1] - cur_indent = indents[-1] - while len(ctx) > 1 and ctx[-1][1] >= cur_indent: - if ctx[-1][0] == 'async': - in_async -= 1 - assert in_async >= 0 - ctx.pop() + if async_def and async_def_indent >= indents[-1]: + async_def = False + async_def_nl = False + async_def_indent = 0 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) + if async_def and async_def_nl and async_def_indent >= indents[-1]: + async_def = False + async_def_nl = False + async_def_indent = 0 + else: # continued statement if not line: raise 
TokenError("EOF in multi-line statement", (lnum, 0)) @@ -609,8 +613,13 @@ def _tokenize(readline, encoding): if stashed: yield stashed stashed = None - yield TokenInfo(NL if parenlev > 0 else NEWLINE, - token, spos, epos, line) + if parenlev > 0: + yield TokenInfo(NL, token, spos, epos, line) + else: + yield TokenInfo(NEWLINE, token, spos, epos, line) + if async_def: + async_def_nl = True + elif initial == '#': assert not token.endswith("\n") if stashed: @@ -644,7 +653,7 @@ def _tokenize(readline, encoding): yield TokenInfo(STRING, token, spos, epos, line) elif initial.isidentifier(): # ordinary name if token in ('async', 'await'): - if in_async: + if async_def: yield TokenInfo( ASYNC if token == 'async' else AWAIT, token, spos, epos, line) @@ -660,15 +669,13 @@ def _tokenize(readline, encoding): and stashed.type == NAME and stashed.string == 'async'): - ctx.append(('async', indents[-1])) - in_async += 1 + async_def = True + async_def_indent = indents[-1] yield TokenInfo(ASYNC, stashed.string, stashed.start, stashed.end, stashed.line) stashed = None - else: - ctx.append(('sync', indents[-1])) if stashed: yield stashed diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 46c058083f8..04baeaf38ad 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -31,12 +31,6 @@ || c == '_'\ || (c >= 128)) -/* The following DEFTYPE* flags are used in 'tok_state->deftypestack', - and should be removed in 3.7, when async/await are regular - keywords. 
*/ -#define DEFTYPE_ASYNC 1 -#define DEFTYPE_HAS_NL 2 - extern char *PyOS_Readline(FILE *, FILE *, const char *); /* Return malloc'ed string including trailing \n; empty malloc'ed string for EOF; @@ -133,12 +127,6 @@ tok_new(void) tok->indent = 0; tok->indstack[0] = 0; - tok->def = 0; - tok->defstack[0] = 0; - tok->deftypestack[0] = 0; - tok->def_async_behind = 0; - tok->def_in_async = 0; - tok->atbol = 1; tok->pendin = 0; tok->prompt = tok->nextprompt = NULL; @@ -159,6 +147,11 @@ tok_new(void) tok->decoding_readline = NULL; tok->decoding_buffer = NULL; #endif + + tok->async_def = 0; + tok->async_def_indent = 0; + tok->async_def_nl = 0; + return tok; } @@ -1350,11 +1343,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) int c; int blankline, nonascii; - int tok_len; - struct tok_state ahead_tok; - char *ahead_tok_start = NULL, *ahead_top_end = NULL; - int ahead_tok_kind; - *p_start = *p_end = NULL; nextline: tok->start = NULL; @@ -1442,16 +1430,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (tok->pendin != 0) { if (tok->pendin < 0) { tok->pendin++; - - while (tok->def && tok->defstack[tok->def] >= tok->indent) { - if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) { - tok->def_in_async--; - assert(tok->def_in_async >= 0); - } - tok->def--; - assert(tok->def >= 0); - } - return DEDENT; } else { @@ -1460,20 +1438,19 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } } - if (!blankline && tok->level == 0 - && tok->def && tok->deftypestack[tok->def] & DEFTYPE_HAS_NL - && tok->defstack[tok->def] >= tok->indent) + if (tok->async_def + && !blankline + && tok->level == 0 + /* There was a NEWLINE after ASYNC DEF, + so we're past the signature. */ + && tok->async_def_nl + /* Current indentation level is less than or equal to + the level at which the async function was defined */ + && tok->async_def_indent >= tok->indent) { - /* The top function on the stack did have a NEWLINE - token, but didn't have an INDENT.
That means that - it's a one-line function and it should now be removed - from the stack. */ - if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) { - tok->def_in_async--; - assert(tok->def_in_async >= 0); - } - tok->def--; - assert(tok->def >= 0); + tok->async_def = 0; + tok->async_def_indent = 0; + tok->async_def_nl = 0; } again: @@ -1528,38 +1505,27 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) *p_start = tok->start; *p_end = tok->cur; - tok_len = tok->cur - tok->start; - if (tok_len == 3 && memcmp(tok->start, "def", 3) == 0) { - /* The current token is 'def'. */ - if (tok->def + 1 >= MAXINDENT) { - tok->done = E_TOODEEP; - tok->cur = tok->inp; - return ERRORTOKEN; + /* async/await parsing block. */ + if (tok->cur - tok->start == 5) { + /* Current token length is 5. */ + if (tok->async_def) { + /* We're inside an 'async def' function. */ + if (memcmp(tok->start, "async", 5) == 0) + return ASYNC; + if (memcmp(tok->start, "await", 5) == 0) + return AWAIT; } + else if (memcmp(tok->start, "async", 5) == 0) { + /* The current token is 'async'. + Look ahead one token.*/ - /* Advance defs stack. */ - tok->def++; - tok->defstack[tok->def] = tok->indent; + struct tok_state ahead_tok; + char *ahead_tok_start = NULL, *ahead_tok_end = NULL; + int ahead_tok_kind; - if (tok->def_async_behind) { - /* The previous token was 'async'. */ - tok->def_async_behind = 0; - tok->deftypestack[tok->def] = DEFTYPE_ASYNC; - tok->def_in_async++; - } - else { - /* This is a regular function (not async def). */ - tok->deftypestack[tok->def] = 0; - } - } - else if (tok_len == 5) { - if (memcmp(tok->start, "async", 5) == 0) { - /* The current token is 'async'. */ memcpy(&ahead_tok, tok, sizeof(ahead_tok)); - - /* Try to look ahead one token. 
*/ ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, - &ahead_top_end); + &ahead_tok_end); if (ahead_tok_kind == NAME && ahead_tok.cur - ahead_tok.start == 3 @@ -1567,22 +1533,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) { /* The next token is going to be 'def', so instead of returning 'async' NAME token, we return ASYNC. */ - tok->def_async_behind = 1; + tok->async_def_indent = tok->indent; + tok->async_def = 1; return ASYNC; } - else if (tok->def_in_async) - { - /* We're inside an 'async def' function, so we treat - 'async' token as ASYNC, instead of NAME. */ - return ASYNC; - } - - } - else if (memcmp(tok->start, "await", 5) == 0 && tok->def_in_async) - { - /* We're inside an 'async def' function, so we treat - 'await' token as AWAIT, instead of NAME. */ - return AWAIT; } } @@ -1597,12 +1551,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) *p_start = tok->start; *p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; - if (tok->def) { - /* Mark the top function on the stack that it had - at least one NEWLINE. That will help us to - distinguish one-line functions from functions - with multiple statements. */ - tok->deftypestack[tok->def] |= DEFTYPE_HAS_NL; + if (tok->async_def) { + /* We're somewhere inside an 'async def' function, and + we've encountered a NEWLINE after its signature. */ + tok->async_def_nl = 1; } return NEWLINE; } diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index e198a0b6f55..af053e250a3 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -66,21 +66,12 @@ struct tok_state { const char* str; const char* input; /* Tokenizer's newline translated copy of the string. */ - /* `def*` fields are for parsing async/await in a backwards compatible - way. They should be removed in 3.7, when they will become - regular constants. See PEP 492 for more details. */ - int defstack[MAXINDENT]; /* Stack of funcs & indents where they - were defined. 
*/ - int deftypestack[MAXINDENT]; /* Stack of func flags, see DEFTYPE_* - constants. */ - int def; /* Length of stack of func types/flags. */ - int def_async_behind; /* 1 if there was an 'async' token before - a 'def' token. */ - int def_in_async; /* Counter of how deep 'async def's - are nested. If greater than 0, - we are somewhere in an 'async def' - body, so 'async' and 'await' should - be parsed as keywords.*/ + /* async/await related fields; can be removed in 3.7 when async and await + become normal keywords. */ + int async_def; /* =1 if tokens are inside an 'async def' body. */ + int async_def_indent; /* Indentation level of the outermost 'async def'. */ + int async_def_nl; /* =1 if the outermost 'async def' had at least one + NEWLINE token after it. */ }; extern struct tok_state *PyTokenizer_FromString(const char *, int);