Issue #24619: Simplify async/await tokenization.

This commit simplifies async/await tokenization in tokenizer.c,
tokenize.py & lib2to3/tokenize.py.  The previous solution was to keep
a stack of async-def & def blocks, whereas the new approach is simply
to remember the position of the outermost async-def block.

This change won't bring any parsing performance improvements, but
it makes the code much easier to read and validate.
Yury Selivanov 2015-07-23 15:01:58 +03:00
parent f315c1c016
commit 96ec934e75
7 changed files with 183 additions and 132 deletions
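At its core, the change swaps the per-def stack for three flags that describe only the outermost async-def block. A minimal illustration of the before/after state, using the variable names from the diff below (this is not the tokenizer code itself):

    # Old approach: one stack entry per def/async-def block seen.
    ctx = [('sync', 0)]        # stack of ('sync' | 'async', indent) pairs
    in_async = 0               # nesting depth of 'async def' blocks

    # New approach: remember only the outermost 'async def'.
    async_def = False          # True while tokens are inside an 'async def' body
    async_def_indent = 0       # indentation level of that 'async def'
    async_def_nl = False       # True once the NEWLINE after its signature is seen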


@@ -366,10 +366,11 @@ def generate_tokens(readline):
contline = None
indents = [0]
# 'stashed' and 'ctx' are used for async/await parsing
# 'stashed' and 'async_*' are used for async/await parsing
stashed = None
ctx = [('sync', 0)]
in_async = 0
async_def = False
async_def_indent = 0
async_def_nl = False
while 1: # loop over lines in stream
try:
@@ -438,15 +439,18 @@ def generate_tokens(readline):
("<tokenize>", lnum, pos, line))
indents = indents[:-1]
cur_indent = indents[-1]
while len(ctx) > 1 and ctx[-1][1] >= cur_indent:
if ctx[-1][0] == 'async':
in_async -= 1
assert in_async >= 0
ctx.pop()
if async_def and async_def_indent >= indents[-1]:
async_def = False
async_def_nl = False
async_def_indent = 0
yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
if async_def and async_def_nl and async_def_indent >= indents[-1]:
async_def = False
async_def_nl = False
async_def_indent = 0
else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
@@ -466,10 +470,13 @@ def generate_tokens(readline):
newline = NEWLINE
if parenlev > 0:
newline = NL
elif async_def:
async_def_nl = True
if stashed:
yield stashed
stashed = None
yield (newline, token, spos, epos, line)
elif initial == '#':
assert not token.endswith("\n")
if stashed:
@@ -508,7 +515,7 @@ def generate_tokens(readline):
yield (STRING, token, spos, epos, line)
elif initial in namechars: # ordinary name
if token in ('async', 'await'):
if in_async:
if async_def:
yield (ASYNC if token == 'async' else AWAIT,
token, spos, epos, line)
continue
@@ -523,15 +530,13 @@ def generate_tokens(readline):
and stashed[0] == NAME
and stashed[1] == 'async'):
ctx.append(('async', indents[-1]))
in_async += 1
async_def = True
async_def_indent = indents[-1]
yield (ASYNC, stashed[1],
stashed[2], stashed[3],
stashed[4])
stashed = None
else:
ctx.append(('sync', indents[-1]))
if stashed:
yield stashed
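The DEDENT handling above is the heart of the simplification: instead of popping stack entries, the tokenizer just checks whether the current indentation has dropped back to (or past) the line that opened the outermost async def. A hedged stand-alone mirror of that test, with hypothetical function and argument names:

    def leave_async_region(cur_indent, async_def, async_def_indent, async_def_nl):
        # Leave the async region once the header NEWLINE has been seen and
        # we have dedented back to (or above) the 'async def' line itself.
        return async_def and async_def_nl and async_def_indent >= cur_indent

    assert leave_async_region(0, True, 0, True)       # dedented back to column 0
    assert not leave_async_region(4, True, 0, True)   # still inside the body
    assert not leave_async_region(0, True, 0, False)  # still on the one-line header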


@@ -67,10 +67,32 @@ def test_await_expr(self):
await x
""")
self.validate("""async def foo():
def foo(): pass
def foo(): pass
await x
""")
self.validate("""async def foo(): return await a""")
self.validate("""def foo():
def foo(): pass
async def foo(): await x
""")
self.invalid_syntax("await x")
self.invalid_syntax("""def foo():
await x""")
self.invalid_syntax("""def foo():
def foo(): pass
async def foo(): pass
await x
""")
def test_async_var(self):
self.validate("""async = 1""")
self.validate("""await = 1""")
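These lib2to3 tests mirror the PEP 492 rules: async/await are keywords only inside an async def and remain ordinary names elsewhere. The same cases can be checked against the real compiler on Python 3.5/3.6 (the versions this change targets); a quick sketch:

    # Accepted: 'async'/'await' are plain names at module level (PEP 492).
    compile("async = 1\nawait = 1\n", "<demo>", "exec")

    # Rejected: 'await x' inside a plain (non-async) def is not an await
    # expression, just two names in a row, hence a SyntaxError.
    try:
        compile("def foo():\n    await x\n", "<demo>", "exec")
    except SyntaxError as exc:
        print("rejected:", exc.msg)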


@@ -330,6 +330,7 @@ async def bar(): return await_
async def f():
async def g(): pass
await z
await = 1
self.assertTrue(inspect.iscoroutinefunction(f))
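The coroutine test only needs to confirm that the retokenized async def still compiles to a coroutine function; a stripped-down stand-alone version of the same check (hypothetical function names):

    import inspect

    async def f():
        async def g():      # nested async def inside the outer body
            pass
        return g

    assert inspect.iscoroutinefunction(f)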


@@ -840,6 +840,79 @@
OP ')' (1, 19) (1, 20)
OP ':' (1, 20) (1, 21)
AWAIT 'await' (1, 22) (1, 27)
>>> dump_tokens('''def f():
...
... def baz(): pass
... async def bar(): pass
...
... await = 2''')
ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'def' (1, 0) (1, 3)
NAME 'f' (1, 4) (1, 5)
OP '(' (1, 5) (1, 6)
OP ')' (1, 6) (1, 7)
OP ':' (1, 7) (1, 8)
NEWLINE '\\n' (1, 8) (1, 9)
NL '\\n' (2, 0) (2, 1)
INDENT ' ' (3, 0) (3, 2)
NAME 'def' (3, 2) (3, 5)
NAME 'baz' (3, 6) (3, 9)
OP '(' (3, 9) (3, 10)
OP ')' (3, 10) (3, 11)
OP ':' (3, 11) (3, 12)
NAME 'pass' (3, 13) (3, 17)
NEWLINE '\\n' (3, 17) (3, 18)
ASYNC 'async' (4, 2) (4, 7)
NAME 'def' (4, 8) (4, 11)
NAME 'bar' (4, 12) (4, 15)
OP '(' (4, 15) (4, 16)
OP ')' (4, 16) (4, 17)
OP ':' (4, 17) (4, 18)
NAME 'pass' (4, 19) (4, 23)
NEWLINE '\\n' (4, 23) (4, 24)
NL '\\n' (5, 0) (5, 1)
NAME 'await' (6, 2) (6, 7)
OP '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
DEDENT '' (7, 0) (7, 0)
>>> dump_tokens('''async def f():
...
... def baz(): pass
... async def bar(): pass
...
... await = 2''')
ENCODING 'utf-8' (0, 0) (0, 0)
ASYNC 'async' (1, 0) (1, 5)
NAME 'def' (1, 6) (1, 9)
NAME 'f' (1, 10) (1, 11)
OP '(' (1, 11) (1, 12)
OP ')' (1, 12) (1, 13)
OP ':' (1, 13) (1, 14)
NEWLINE '\\n' (1, 14) (1, 15)
NL '\\n' (2, 0) (2, 1)
INDENT ' ' (3, 0) (3, 2)
NAME 'def' (3, 2) (3, 5)
NAME 'baz' (3, 6) (3, 9)
OP '(' (3, 9) (3, 10)
OP ')' (3, 10) (3, 11)
OP ':' (3, 11) (3, 12)
NAME 'pass' (3, 13) (3, 17)
NEWLINE '\\n' (3, 17) (3, 18)
ASYNC 'async' (4, 2) (4, 7)
NAME 'def' (4, 8) (4, 11)
NAME 'bar' (4, 12) (4, 15)
OP '(' (4, 15) (4, 16)
OP ')' (4, 16) (4, 17)
OP ':' (4, 17) (4, 18)
NAME 'pass' (4, 19) (4, 23)
NEWLINE '\\n' (4, 23) (4, 24)
NL '\\n' (5, 0) (5, 1)
AWAIT 'await' (6, 2) (6, 7)
OP '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
DEDENT '' (7, 0) (7, 0)
"""
from test import support
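dump_tokens above is a helper local to test_tokenize; roughly the same dump can be produced with the public tokenize API on Python 3.5/3.6, where the ASYNC/AWAIT token types exist (helper name and output formatting are illustrative only):

    import io
    import tokenize
    from token import tok_name

    def dump(source):
        for tok in tokenize.generate_tokens(io.StringIO(source).readline):
            print("%-10s %-16r %s %s" % (tok_name[tok.type], tok.string,
                                         tok.start, tok.end))

    dump("async def f():\n"
         "\n"
         "  def baz(): pass\n"
         "  async def bar(): pass\n"
         "\n"
         "  await = 2")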


@@ -498,10 +498,11 @@ def _tokenize(readline, encoding):
contline = None
indents = [0]
# 'stashed' and 'ctx' are used for async/await parsing
# 'stashed' and 'async_*' are used for async/await parsing
stashed = None
ctx = [('sync', 0)]
in_async = 0
async_def = False
async_def_indent = 0
async_def_nl = False
if encoding is not None:
if encoding == "utf-8-sig":
@@ -579,15 +580,18 @@ def _tokenize(readline, encoding):
("<tokenize>", lnum, pos, line))
indents = indents[:-1]
cur_indent = indents[-1]
while len(ctx) > 1 and ctx[-1][1] >= cur_indent:
if ctx[-1][0] == 'async':
in_async -= 1
assert in_async >= 0
ctx.pop()
if async_def and async_def_indent >= indents[-1]:
async_def = False
async_def_nl = False
async_def_indent = 0
yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
if async_def and async_def_nl and async_def_indent >= indents[-1]:
async_def = False
async_def_nl = False
async_def_indent = 0
else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
@@ -609,8 +613,13 @@ def _tokenize(readline, encoding):
if stashed:
yield stashed
stashed = None
yield TokenInfo(NL if parenlev > 0 else NEWLINE,
token, spos, epos, line)
if parenlev > 0:
yield TokenInfo(NL, token, spos, epos, line)
else:
yield TokenInfo(NEWLINE, token, spos, epos, line)
if async_def:
async_def_nl = True
elif initial == '#':
assert not token.endswith("\n")
if stashed:
@@ -644,7 +653,7 @@ def _tokenize(readline, encoding):
yield TokenInfo(STRING, token, spos, epos, line)
elif initial.isidentifier(): # ordinary name
if token in ('async', 'await'):
if in_async:
if async_def:
yield TokenInfo(
ASYNC if token == 'async' else AWAIT,
token, spos, epos, line)
@@ -660,15 +669,13 @@ def _tokenize(readline, encoding):
and stashed.type == NAME
and stashed.string == 'async'):
ctx.append(('async', indents[-1]))
in_async += 1
async_def = True
async_def_indent = indents[-1]
yield TokenInfo(ASYNC, stashed.string,
stashed.start, stashed.end,
stashed.line)
stashed = None
else:
ctx.append(('sync', indents[-1]))
if stashed:
yield stashed
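The `stashed` variable implements a one-token lookahead in pure Python: an 'async' NAME is held back, and is turned into an ASYNC token only once the following token turns out to be 'def'. A toy, heavily simplified sketch of that idea over (type, string) pairs (no DEDENT handling, hypothetical names):

    def mark_async_defs(tokens):
        stashed = None                 # an 'async' NAME waiting for the next token
        async_def = False              # inside an 'async def' (dedents ignored here)
        out = []
        for typ, string in tokens:
            if typ == 'NAME' and string in ('async', 'await') and async_def:
                out.append(('ASYNC' if string == 'async' else 'AWAIT', string))
                continue
            if typ == 'NAME' and string == 'def' and stashed == ('NAME', 'async'):
                out.append(('ASYNC', 'async'))   # reinterpret the stashed token
                async_def = True
                stashed = None
            elif stashed:
                out.append(stashed)              # a plain 'async' name after all
                stashed = None
            if typ == 'NAME' and string == 'async' and not async_def:
                stashed = (typ, string)          # hold it until we see what follows
                continue
            out.append((typ, string))
        if stashed:
            out.append(stashed)
        return out

    print(mark_async_defs([('NAME', 'async'), ('NAME', 'def'), ('NAME', 'f'),
                           ('OP', '('), ('OP', ')'), ('OP', ':'),
                           ('NAME', 'await'), ('NAME', 'x')]))
    # [('ASYNC', 'async'), ('NAME', 'def'), ('NAME', 'f'), ('OP', '('),
    #  ('OP', ')'), ('OP', ':'), ('AWAIT', 'await'), ('NAME', 'x')]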


@@ -31,12 +31,6 @@
|| c == '_'\
|| (c >= 128))
/* The following DEFTYPE* flags are used in 'tok_state->deftypestack',
and should be removed in 3.7, when async/await are regular
keywords. */
#define DEFTYPE_ASYNC 1
#define DEFTYPE_HAS_NL 2
extern char *PyOS_Readline(FILE *, FILE *, const char *);
/* Return malloc'ed string including trailing \n;
empty malloc'ed string for EOF;
@@ -133,12 +127,6 @@ tok_new(void)
tok->indent = 0;
tok->indstack[0] = 0;
tok->def = 0;
tok->defstack[0] = 0;
tok->deftypestack[0] = 0;
tok->def_async_behind = 0;
tok->def_in_async = 0;
tok->atbol = 1;
tok->pendin = 0;
tok->prompt = tok->nextprompt = NULL;
@@ -159,6 +147,11 @@ tok_new(void)
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
#endif
tok->async_def = 0;
tok->async_def_indent = 0;
tok->async_def_nl = 0;
return tok;
}
@@ -1350,11 +1343,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
int c;
int blankline, nonascii;
int tok_len;
struct tok_state ahead_tok;
char *ahead_tok_start = NULL, *ahead_top_end = NULL;
int ahead_tok_kind;
*p_start = *p_end = NULL;
nextline:
tok->start = NULL;
@@ -1442,16 +1430,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
if (tok->pendin != 0) {
if (tok->pendin < 0) {
tok->pendin++;
while (tok->def && tok->defstack[tok->def] >= tok->indent) {
if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) {
tok->def_in_async--;
assert(tok->def_in_async >= 0);
}
tok->def--;
assert(tok->def >= 0);
}
return DEDENT;
}
else {
@@ -1460,20 +1438,19 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
}
}
if (!blankline && tok->level == 0
&& tok->def && tok->deftypestack[tok->def] & DEFTYPE_HAS_NL
&& tok->defstack[tok->def] >= tok->indent)
if (tok->async_def
&& !blankline
&& tok->level == 0
/* There was a NEWLINE after ASYNC DEF,
so we're past the signature. */
&& tok->async_def_nl
/* Current indentation level is less than where
the async function was defined */
&& tok->async_def_indent >= tok->indent)
{
/* The top function on the stack did have a NEWLINE
token, but didn't have an INDENT. That means that
it's a one-line function and it should now be removed
from the stack. */
if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) {
tok->def_in_async--;
assert(tok->def_in_async >= 0);
}
tok->def--;
assert(tok->def >= 0);
tok->async_def = 0;
tok->async_def_indent = 0;
tok->async_def_nl = 0;
}
again:
@@ -1528,38 +1505,27 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
*p_start = tok->start;
*p_end = tok->cur;
tok_len = tok->cur - tok->start;
if (tok_len == 3 && memcmp(tok->start, "def", 3) == 0) {
/* The current token is 'def'. */
if (tok->def + 1 >= MAXINDENT) {
tok->done = E_TOODEEP;
tok->cur = tok->inp;
return ERRORTOKEN;
/* async/await parsing block. */
if (tok->cur - tok->start == 5) {
/* Current token length is 5. */
if (tok->async_def) {
/* We're inside an 'async def' function. */
if (memcmp(tok->start, "async", 5) == 0)
return ASYNC;
if (memcmp(tok->start, "await", 5) == 0)
return AWAIT;
}
else if (memcmp(tok->start, "async", 5) == 0) {
/* The current token is 'async'.
Look ahead one token.*/
/* Advance defs stack. */
tok->def++;
tok->defstack[tok->def] = tok->indent;
struct tok_state ahead_tok;
char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
int ahead_tok_kind;
if (tok->def_async_behind) {
/* The previous token was 'async'. */
tok->def_async_behind = 0;
tok->deftypestack[tok->def] = DEFTYPE_ASYNC;
tok->def_in_async++;
}
else {
/* This is a regular function (not async def). */
tok->deftypestack[tok->def] = 0;
}
}
else if (tok_len == 5) {
if (memcmp(tok->start, "async", 5) == 0) {
/* The current token is 'async'. */
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
/* Try to look ahead one token. */
ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
&ahead_top_end);
&ahead_tok_end);
if (ahead_tok_kind == NAME
&& ahead_tok.cur - ahead_tok.start == 3
@ -1567,22 +1533,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
{
/* The next token is going to be 'def', so instead of
returning 'async' NAME token, we return ASYNC. */
tok->def_async_behind = 1;
tok->async_def_indent = tok->indent;
tok->async_def = 1;
return ASYNC;
}
else if (tok->def_in_async)
{
/* We're inside an 'async def' function, so we treat
'async' token as ASYNC, instead of NAME. */
return ASYNC;
}
}
else if (memcmp(tok->start, "await", 5) == 0 && tok->def_in_async)
{
/* We're inside an 'async def' function, so we treat
'await' token as AWAIT, instead of NAME. */
return AWAIT;
}
}
@@ -1597,12 +1551,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
*p_start = tok->start;
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
if (tok->def) {
/* Mark the top function on the stack that it had
at least one NEWLINE. That will help us to
distinguish one-line functions from functions
with multiple statements. */
tok->deftypestack[tok->def] |= DEFTYPE_HAS_NL;
if (tok->async_def) {
/* We're somewhere inside an 'async def' function, and
we've encountered a NEWLINE after its signature. */
tok->async_def_nl = 1;
}
return NEWLINE;
}
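In the C tokenizer there is no `stashed` variable; instead tok_get() is run on a memcpy'ed copy of the whole tok_state, so the real cursor never moves while peeking at the next token. The same "peek by copying the cursor" idea in Python, as a rough sketch (hypothetical class and function names):

    import copy

    class Cursor:
        def __init__(self, tokens):
            self.tokens = tokens
            self.pos = 0
        def next_token(self):
            tok = self.tokens[self.pos]
            self.pos += 1
            return tok

    def async_starts_def(cur):
        # We just read 'async'; is the next token 'def'?
        ahead = copy.copy(cur)          # shallow snapshot, like the memcpy above
        return ahead.next_token() == 'def'

    c = Cursor(['def', 'f', '(', ')', ':'])
    print(async_starts_def(c), c.pos)   # True 0 -- the real cursor did not move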


@@ -66,21 +66,12 @@ struct tok_state {
const char* str;
const char* input; /* Tokenizer's newline translated copy of the string. */
/* `def*` fields are for parsing async/await in a backwards compatible
way. They should be removed in 3.7, when they will become
regular constants. See PEP 492 for more details. */
int defstack[MAXINDENT]; /* Stack of funcs & indents where they
were defined. */
int deftypestack[MAXINDENT]; /* Stack of func flags, see DEFTYPE_*
constants. */
int def; /* Length of stack of func types/flags. */
int def_async_behind; /* 1 if there was an 'async' token before
a 'def' token. */
int def_in_async; /* Counter of how deep 'async def's
are nested. If greater than 0,
we are somewhere in an 'async def'
body, so 'async' and 'await' should
be parsed as keywords.*/
/* async/await related fields; can be removed in 3.7 when async and await
become normal keywords. */
int async_def; /* =1 if tokens are inside an 'async def' body. */
int async_def_indent; /* Indentation level of the outermost 'async def'. */
int async_def_nl; /* =1 if the outermost 'async def' had at least one
NEWLINE token after it. */
};
extern struct tok_state *PyTokenizer_FromString(const char *, int);