bpo-36143: Regenerate Lib/keyword.py from the Grammar and Tokens file using pgen (GH-12456)

Now that the parser generator is written in Python (Parser/pgen), we can use it to regenerate Lib/keyword.py, the module that lists the language keywords, instead of scraping the auto-generated C grammar files. This also makes it possible for the CI to check that the auto-generated files are up to date.
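In practice, such a CI freshness check can be as simple as regenerating and comparing; a minimal sketch, run from the root of a checkout (the scratch file name keyword_fresh.py is made up for illustration):

import filecmp, subprocess, sys

# Regenerate the keyword table from the grammar into a scratch file...
subprocess.run(
    [sys.executable, "-m", "Parser.pgen.keywordgen",
     "Grammar/Grammar", "Grammar/Tokens", "keyword_fresh.py"],
    check=True,
)
# ...and fail loudly if it no longer matches the checked-in module.
if not filecmp.cmp("Lib/keyword.py", "keyword_fresh.py"):
    sys.exit("Lib/keyword.py is stale; run `make regen-keyword`")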
Pablo Galindo 2019-03-25 22:01:12 +00:00 committed by GitHub
parent 027b09c5a1
commit 91759d9801
5 changed files with 124 additions and 197 deletions

Lib/keyword.py Executable file → Normal file

@@ -1,98 +1,55 @@
#! /usr/bin/env python3
"""Keywords (from "graminit.c")
"""Keywords (from "Grammar/Grammar")
This file is automatically generated; please don't muck it up!
To update the symbols in this file, 'cd' to the top directory of
the python source tree after building the interpreter and run:
the python source tree and run:
    ./python Lib/keyword.py
    python3 -m Parser.pgen.keywordgen Grammar/Grammar \
                                      Grammar/Tokens \
                                      Lib/keyword.py
Alternatively, you can run 'make regen-keyword'.
"""
__all__ = ["iskeyword", "kwlist"]
kwlist = [
#--start keywords--
        'False',
        'None',
        'True',
        'and',
        'as',
        'assert',
        'break',
        'class',
        'continue',
        'def',
        'del',
        'elif',
        'else',
        'except',
        'finally',
        'for',
        'from',
        'global',
        'if',
        'import',
        'in',
        'is',
        'lambda',
        'nonlocal',
        'not',
        'or',
        'pass',
        'raise',
        'return',
        'try',
        'while',
        'with',
        'yield',
#--end keywords--
        ]
kwlist.append('async')
kwlist.append('await')
kwlist.sort()
    'False',
    'None',
    'True',
    'and',
    'as',
    'assert',
    'async',
    'await',
    'break',
    'class',
    'continue',
    'def',
    'del',
    'elif',
    'else',
    'except',
    'finally',
    'for',
    'from',
    'global',
    'if',
    'import',
    'in',
    'is',
    'lambda',
    'nonlocal',
    'not',
    'or',
    'pass',
    'raise',
    'return',
    'try',
    'while',
    'with',
    'yield'
]
iskeyword = frozenset(kwlist).__contains__
def main():
    import sys, re
    args = sys.argv[1:]
    iptfile = args and args[0] or "Python/graminit.c"
    if len(args) > 1: optfile = args[1]
    else: optfile = "Lib/keyword.py"
    # load the output skeleton from the target, taking care to preserve its
    # newline convention.
    with open(optfile, newline='') as fp:
        format = fp.readlines()
    nl = format[0][len(format[0].strip()):] if format else '\n'
    # scan the source file for keywords
    with open(iptfile) as fp:
        strprog = re.compile('"([^"]+)"')
        lines = []
        for line in fp:
            if '{1, "' in line:
                match = strprog.search(line)
                if match:
                    lines.append("        '" + match.group(1) + "'," + nl)
    lines.sort()
    # insert the lines of keywords into the skeleton
    try:
        start = format.index("#--start keywords--" + nl) + 1
        end = format.index("#--end keywords--" + nl)
        format[start:end] = lines
    except ValueError:
        sys.stderr.write("target does not contain format markers\n")
        sys.exit(1)
    # write the output file
    with open(optfile, 'w', newline='') as fp:
        fp.writelines(format)
if __name__ == "__main__":
    main()
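For orientation, here is how the regenerated module behaves at the interpreter (assuming a build of this branch, where async and await are compiled-in keywords):

>>> import keyword
>>> keyword.iskeyword("await")    # baked into kwlist, no longer appended later
True
>>> keyword.iskeyword("print")    # a builtin, not a keyword
False
>>> keyword.kwlist == sorted(keyword.kwlist)
True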

Lib/test/test_keyword.py

@@ -1,20 +1,5 @@
import keyword
import unittest
from test import support
import filecmp
import os
import sys
import subprocess
import shutil
import textwrap
KEYWORD_FILE = support.findfile('keyword.py')
GRAMMAR_FILE = os.path.join(os.path.split(__file__)[0],
                            '..', '..', 'Python', 'graminit.c')
TEST_PY_FILE = 'keyword_test.py'
GRAMMAR_TEST_FILE = 'graminit_test.c'
PY_FILE_WITHOUT_KEYWORDS = 'minimal_keyword.py'
NONEXISTENT_FILE = 'not_here.txt'
class Test_iskeyword(unittest.TestCase):
@@ -35,103 +20,17 @@ def test_changing_the_kwlist_does_not_affect_iskeyword(self):
        keyword.kwlist = ['its', 'all', 'eggs', 'beans', 'and', 'a', 'slice']
        self.assertFalse(keyword.iskeyword('eggs'))
    def test_all_keywords_fail_to_be_used_as_names(self):
        for key in keyword.kwlist:
            with self.assertRaises(SyntaxError):
                exec(f"{key} = 42")
class TestKeywordGeneration(unittest.TestCase):
    def test_async_and_await_are_keywords(self):
        self.assertIn("async", keyword.kwlist)
        self.assertIn("await", keyword.kwlist)
    def _copy_file_without_generated_keywords(self, source_file, dest_file):
        with open(source_file, 'rb') as fp:
            lines = fp.readlines()
        nl = lines[0][len(lines[0].strip()):]
        with open(dest_file, 'wb') as fp:
            fp.writelines(lines[:lines.index(b"#--start keywords--" + nl) + 1])
            fp.writelines(lines[lines.index(b"#--end keywords--" + nl):])
    def _generate_keywords(self, grammar_file, target_keyword_py_file):
        proc = subprocess.Popen([sys.executable,
                                 KEYWORD_FILE,
                                 grammar_file,
                                 target_keyword_py_file], stderr=subprocess.PIPE)
        stderr = proc.communicate()[1]
        return proc.returncode, stderr
    @unittest.skipIf(not os.path.exists(GRAMMAR_FILE),
                     'test only works from source build directory')
    def test_real_grammar_and_keyword_file(self):
        self._copy_file_without_generated_keywords(KEYWORD_FILE, TEST_PY_FILE)
        self.addCleanup(support.unlink, TEST_PY_FILE)
        self.assertFalse(filecmp.cmp(KEYWORD_FILE, TEST_PY_FILE))
        self.assertEqual((0, b''), self._generate_keywords(GRAMMAR_FILE,
                                                           TEST_PY_FILE))
        self.assertTrue(filecmp.cmp(KEYWORD_FILE, TEST_PY_FILE))
    def test_grammar(self):
        self._copy_file_without_generated_keywords(KEYWORD_FILE, TEST_PY_FILE)
        self.addCleanup(support.unlink, TEST_PY_FILE)
        with open(GRAMMAR_TEST_FILE, 'w') as fp:
            # Some of these are probably implementation accidents.
            fp.writelines(textwrap.dedent("""\
            {2, 1},
            {11, "encoding_decl", 0, 2, states_79,
             "\000\000\040\000\000\000\000\000\000\000\000\000"
             "\000\000\000\000\000\000\000\000\000"},
            {1, "jello"},
            {326, 0},
            {1, "turnip"},
            \t{1, "This one is tab indented"
            {278, 0},
            {1, "crazy but legal"
            "also legal" {1, "
            {1, "continue"},
            {1, "lemon"},
            {1, "tomato"},
            {1, "wigii"},
            {1, 'no good'}
            {283, 0},
            {1, "too many spaces"}"""))
        self.addCleanup(support.unlink, GRAMMAR_TEST_FILE)
        self._generate_keywords(GRAMMAR_TEST_FILE, TEST_PY_FILE)
        expected = [
            "        'This one is tab indented',",
            "        'also legal',",
            "        'continue',",
            "        'crazy but legal',",
            "        'jello',",
            "        'lemon',",
            "        'tomato',",
            "        'turnip',",
            "        'wigii',",
        ]
        with open(TEST_PY_FILE) as fp:
            lines = fp.read().splitlines()
        start = lines.index("#--start keywords--") + 1
        end = lines.index("#--end keywords--")
        actual = lines[start:end]
        self.assertEqual(actual, expected)
    def test_empty_grammar_results_in_no_keywords(self):
        self._copy_file_without_generated_keywords(KEYWORD_FILE,
                                                   PY_FILE_WITHOUT_KEYWORDS)
        self.addCleanup(support.unlink, PY_FILE_WITHOUT_KEYWORDS)
        shutil.copyfile(KEYWORD_FILE, TEST_PY_FILE)
        self.addCleanup(support.unlink, TEST_PY_FILE)
        self.assertEqual((0, b''), self._generate_keywords(os.devnull,
                                                           TEST_PY_FILE))
        self.assertTrue(filecmp.cmp(TEST_PY_FILE, PY_FILE_WITHOUT_KEYWORDS))
    def test_keywords_py_without_markers_produces_error(self):
        rc, stderr = self._generate_keywords(os.devnull, os.devnull)
        self.assertNotEqual(rc, 0)
        self.assertRegex(stderr, b'does not contain format markers')
    def test_missing_grammar_file_produces_error(self):
        rc, stderr = self._generate_keywords(NONEXISTENT_FILE, KEYWORD_FILE)
        self.assertNotEqual(rc, 0)
        self.assertRegex(stderr, b'(?ms)' + NONEXISTENT_FILE.encode())
    def test_missing_keywords_py_file_produces_error(self):
        rc, stderr = self._generate_keywords(os.devnull, NONEXISTENT_FILE)
        self.assertNotEqual(rc, 0)
        self.assertRegex(stderr, b'(?ms)' + NONEXISTENT_FILE.encode())
    def test_keywords_are_sorted(self):
        self.assertListEqual(sorted(keyword.kwlist), keyword.kwlist)
if __name__ == "__main__":

Makefile.pre.in

@@ -724,7 +724,7 @@ regen-importlib: Programs/_freeze_importlib
# Regenerate all generated files
regen-all: regen-opcode regen-opcode-targets regen-typeslots regen-grammar \
		regen-token regen-symbol regen-ast regen-importlib clinic
		regen-token regen-keyword regen-symbol regen-ast regen-importlib clinic
############################################################################
# Special rules for object files
@@ -843,6 +843,15 @@ regen-token:
		$(srcdir)/Grammar/Tokens \
		$(srcdir)/Lib/token.py
.PHONY: regen-keyword
regen-keyword:
	# Regenerate Lib/keyword.py from Grammar/Grammar and Grammar/Tokens
	# using Parser/pgen
	$(PYTHON_FOR_REGEN) -m Parser.pgen.keywordgen $(srcdir)/Grammar/Grammar \
		$(srcdir)/Grammar/Tokens \
		$(srcdir)/Lib/keyword.py.new
	$(UPDATE_FILE) $(srcdir)/Lib/keyword.py $(srcdir)/Lib/keyword.py.new
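Why the two-step recipe: the generator writes to keyword.py.new and $(UPDATE_FILE) only installs it over the checked-in file when the contents actually differ, so a no-op regeneration leaves mtimes alone and triggers no rebuilds, matching the other regen- targets. Roughly, and only as a sketch of what that helper does:

import os, sys

def update_file(target, source):
    # Install `source` over `target` only when the bytes differ, so that
    # an already up-to-date tree is not rewritten.
    with open(source, "rb") as f:
        new_contents = f.read()
    if os.path.exists(target):
        with open(target, "rb") as f:
            if f.read() == new_contents:
                os.remove(source)   # already current; discard the .new file
                return
    os.replace(source, target)

if __name__ == "__main__":
    update_file(sys.argv[1], sys.argv[2])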
.PHONY: regen-symbol
regen-symbol: $(srcdir)/Include/graminit.h
	# Regenerate Lib/symbol.py from Include/graminit.h

Misc/NEWS.d (new blurb file)

@@ -0,0 +1,2 @@
Regenerate :mod:`keyword` from the Grammar and Tokens files using pgen. Patch
by Pablo Galindo.

Parser/pgen/keywordgen.py Normal file

@@ -0,0 +1,60 @@
"""Generate Lib/keyword.py from the Grammar and Tokens files using pgen"""

import argparse

from .pgen import ParserGenerator

TEMPLATE = r'''
"""Keywords (from "Grammar/Grammar")

This file is automatically generated; please don't muck it up!

To update the symbols in this file, 'cd' to the top directory of
the python source tree and run:

    python3 -m Parser.pgen.keywordgen Grammar/Grammar \
                                      Grammar/Tokens \
                                      Lib/keyword.py

Alternatively, you can run 'make regen-keyword'.
"""

__all__ = ["iskeyword", "kwlist"]

kwlist = [
    {keywords}
]

iskeyword = frozenset(kwlist).__contains__
'''.lstrip()

EXTRA_KEYWORDS = ["async", "await"]


def main():
    parser = argparse.ArgumentParser(description="Generate the Lib/keywords.py "
                                                 "file from the grammar.")
    parser.add_argument(
        "grammar", type=str, help="The file with the grammar definition in EBNF format"
    )
    parser.add_argument(
        "tokens", type=str, help="The file with the token definitions"
    )
    parser.add_argument(
        "keyword_file",
        type=argparse.FileType('w'),
        help="The path to write the keyword definitions",
    )
    args = parser.parse_args()
    p = ParserGenerator(args.grammar, args.tokens)
    grammar = p.make_grammar()

    with args.keyword_file as thefile:
        all_keywords = sorted(list(grammar.keywords) + EXTRA_KEYWORDS)

        keywords = ",\n    ".join(map(repr, all_keywords))
        thefile.write(TEMPLATE.format(keywords=keywords))


if __name__ == "__main__":
    main()
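The only formatting subtlety is the join string: repr() quotes each keyword, and ",\n    " supplies the comma plus the four-space indent for every entry after the first, whose indent comes from the template itself. A standalone illustration (keyword sample abbreviated; the real input is grammar.keywords):

# Reproduces the layout TEMPLATE.format() produces for the kwlist block.
all_keywords = sorted(["if", "lambda", "yield"] + ["async", "await"])
print("kwlist = [\n    " + ",\n    ".join(map(repr, all_keywords)) + "\n]")
# kwlist = [
#     'async',
#     'await',
#     'if',
#     'lambda',
#     'yield'
# ]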