diff --git a/Parser/pgen/__main__.py b/Parser/pgen/__main__.py index eea52618422..bb96e75beea 100644 --- a/Parser/pgen/__main__.py +++ b/Parser/pgen/__main__.py @@ -8,17 +8,15 @@ def main(): parser.add_argument( "grammar", type=str, help="The file with the grammar definition in EBNF format" ) - parser.add_argument( - "tokens", type=str, help="The file with the token definitions" - ) + parser.add_argument("tokens", type=str, help="The file with the token definitions") parser.add_argument( "graminit_h", - type=argparse.FileType('w'), + type=argparse.FileType("w"), help="The path to write the grammar's non-terminals as #defines", ) parser.add_argument( "graminit_c", - type=argparse.FileType('w'), + type=argparse.FileType("w"), help="The path to write the grammar as initialized data", ) diff --git a/Parser/pgen/automata.py b/Parser/pgen/automata.py new file mode 100644 index 00000000000..3147d8636ff --- /dev/null +++ b/Parser/pgen/automata.py @@ -0,0 +1,371 @@ +"""Classes representing state-machine concepts""" + +class NFA: + """A non-deterministic finite automaton (NFA) + + A non-deterministic automaton is a form of finite state + machine. An NFA's rules are less restrictive than a DFA's. + The NFA rules are: + + * A transition can be non-deterministic and can lead to + zero, one, or several target states. + + * An epsilon transition consuming empty input is valid. + Transitions consuming labeled symbols are also permitted. + + This class assumes that there is only one starting state and one + accepting (ending) state. + + Attributes: + name (str): The name of the rule the NFA is representing. + start (NFAState): The starting state. + end (NFAState): The ending state. + """ + + def __init__(self, start, end): + self.name = start.rule_name + self.start = start + self.end = end + + def __repr__(self): + return "NFA(start={}, end={})".format(self.start, self.end) + + def dump(self, writer=print): + """Dump a graphical representation of the NFA""" + todo = [self.start] + for i, state in enumerate(todo): + writer(" State", i, state is self.end and "(final)" or "") + for arc in state.arcs: + label = arc.label + next = arc.target + if next in todo: + j = todo.index(next) + else: + j = len(todo) + todo.append(next) + if label is None: + writer(" -> %d" % j) + else: + writer(" %s -> %d" % (label, j)) + + +class NFAArc: + """An arc representing a transition between two NFA states. + + NFA states can be connected in two ways: + + * A label transition: An input equal to the label must + be consumed to perform the transition. + * An epsilon transition: The transition can be taken without + consuming any input symbol. + + Attributes: + target (NFAState): The ending state of the transition arc. + label (Optional[str]): The label that must be consumed to make + the transition. An epsilon transition is represented + using `None`. + """ + + def __init__(self, target, label): + self.target = target + self.label = label + + def __repr__(self): + return "<%s: %s>" % (self.__class__.__name__, self.label) + + +class NFAState: + """A state of an NFA (non-deterministic finite automaton). + + Attributes: + rule_name (str): The name of the rule used to represent the NFA's + state (shared by every state of the same NFA). + arcs (List[NFAArc]): The list of arcs representing transitions + between the current NFA state and other NFA states via following + a label.
+ """ + + def __init__(self, rule_name): + self.rule_name = rule_name + self.arcs = [] + + def add_arc(self, target, label=None): + """Add a new arc to connect the state to a target state within the NFA + + The method adds a new arc to the list of arcs available as transitions + from the present state. An optional label indicates a named transition + that consumes an input while the absence of a label represents an epsilon + transition. + + Attributes: + target (NFAState): The end of the transition that the arc represents. + label (Optional[str]): The label that must be consumed for making + the transition. If the label is not provided the transition is assumed + to be an epsilon-transition. + """ + assert label is None or isinstance(label, str) + assert isinstance(target, NFAState) + self.arcs.append(NFAArc(target, label)) + + def __repr__(self): + return "<%s: from %s>" % (self.__class__.__name__, self.rule_name) + + +class DFA: + """A deterministic finite automata + + A deterministic finite automata is a form of a finite state machine + that obeys the following rules: + + * Each of the transitions is uniquely determined by + the source state and input symbol + * Reading an input symbol is required for each state + transition (no epsilon transitions). + + The finite-state machine will accept or reject a string of symbols + and only produces a unique computation of the automaton for each input + string. The DFA must have a unique starting state (represented as the first + element in the list of states) but can have multiple final states. + + Attributes: + name (str): The name of the rule the DFA is representing. + states (List[DFAState]): A collection of DFA states. + """ + + def __init__(self, name, states): + self.name = name + self.states = states + + @classmethod + def from_nfa(cls, nfa): + """Constructs a DFA from a NFA using the Rabin–Scott construction algorithm. + + To simulate the operation of a DFA on a given input string, it's + necessary to keep track of a single state at any time, or more precisely, + the state that the automaton will reach after seeing a prefix of the + input. In contrast, to simulate an NFA, it's necessary to keep track of + a set of states: all of the states that the automaton could reach after + seeing the same prefix of the input, according to the nondeterministic + choices made by the automaton. There are two possible sources of + non-determinism: + + 1) Multiple (one or more) transitions with the same label + + 'A' +-------+ + +----------->+ State +----------->+ + | | 2 | + +-------+ +-------+ + | State | + | 1 | +-------+ + +-------+ | State | + +----------->+ 3 +----------->+ + 'A' +-------+ + + 2) Epsilon transitions (transitions that can be taken without consuming any input) + + +-------+ +-------+ + | State | ε | State | + | 1 +----------->+ 2 +----------->+ + +-------+ +-------+ + + Looking at the first case above, we can't determine which transition should be + followed when given an input A. We could choose whether or not to follow the + transition while in the second case the problem is that we can choose both to + follow the transition or not doing it. To solve this problem we can imagine that + we follow all possibilities at the same time and we construct new states from the + set of all possible reachable states. 
For every case in the previous example: + + + 1) For multiple transitions with the same label we collapse all of the + final states into the same one + + +-------+ +-------+ + | State | 'A' | State | + | 1 +----------->+ 2-3 +----------->+ + +-------+ +-------+ + + 2) For epsilon transitions we collapse all epsilon-reachable states + into the same one + + +-------+ + | State | + | 1-2 +-----------> + +-------+ + + Because the DFA states consist of sets of NFA states, an n-state NFA + may be converted to a DFA with at most 2**n states. Notice that the + constructed DFA is not minimal and can be simplified or reduced + afterwards. + + Parameters: + nfa (NFA): The NFA to transform into a DFA. + """ + assert isinstance(nfa, NFA) + + def add_closure(nfa_state, base_nfa_set): + """Calculate the epsilon-closure of a given state + + Add to the *base_nfa_set* all the states that are + reachable from *nfa_state* via epsilon-transitions. + """ + assert isinstance(nfa_state, NFAState) + if nfa_state in base_nfa_set: + return + base_nfa_set.add(nfa_state) + for nfa_arc in nfa_state.arcs: + if nfa_arc.label is None: + add_closure(nfa_arc.target, base_nfa_set) + + # Calculate the epsilon-closure of the starting state + base_nfa_set = set() + add_closure(nfa.start, base_nfa_set) + + # Start by visiting the NFA starting state (there is only one). + states = [DFAState(nfa.name, base_nfa_set, nfa.end)] + + for state in states: # NB states grow while we're iterating + + # Find transitions from the current state to other reachable states + # and store them in a mapping that correlates the label to all the + # possible reachable states that can be obtained by consuming a + # token equal to the label. Each set of all the states that can + # be reached after following a label will be a DFA state. + arcs = {} + for nfa_state in state.nfa_set: + for nfa_arc in nfa_state.arcs: + if nfa_arc.label is not None: + nfa_set = arcs.setdefault(nfa_arc.label, set()) + # All states that can be reached by epsilon-transitions + # are also included in the set of reachable states. + add_closure(nfa_arc.target, nfa_set) + + # Now create new DFA states by visiting all possible transitions between + # the current DFA state and the new power-set states (each nfa_set) + # via the different labels. As the nodes are appended to *states* this + # is performing a depth-first search traversal over the power-set of + # the states of the original NFA. + for label, nfa_set in sorted(arcs.items()): + for existing_state in states: + if existing_state.nfa_set == nfa_set: + # The DFA state already exists for this rule. + next_state = existing_state + break + else: + next_state = DFAState(nfa.name, nfa_set, nfa.end) + states.append(next_state) + + # Add a transition between the current DFA state and the new + # DFA state (the power-set state) via the current label. + state.add_arc(next_state, label) + + return cls(nfa.name, states) + + def __iter__(self): + return iter(self.states) + + def simplify(self): + """Attempt to reduce the number of states of the DFA + + Transform the DFA into an equivalent DFA that has fewer states. Two + classes of states can be removed or merged from the original DFA without + affecting the language it accepts, in order to minimize it: + + * Unreachable states are those that cannot be reached from the initial + state of the DFA for any input string. + * Nondistinguishable states are those that cannot be distinguished + from one another for any input string.
+ + This algorithm does not achieve the optimal fully-reduced solution, but it + works well enough for the particularities of the Python grammar. The + algorithm repeatedly looks for two states that have the same set of + arcs (same labels pointing to the same nodes) and unifies them, until + things stop changing. + """ + changes = True + while changes: + changes = False + for i, state_i in enumerate(self.states): + for j in range(i + 1, len(self.states)): + state_j = self.states[j] + if state_i == state_j: + del self.states[j] + for state in self.states: + state.unifystate(state_j, state_i) + changes = True + break + + def dump(self, writer=print): + """Dump a graphical representation of the DFA""" + for i, state in enumerate(self.states): + writer(" State", i, state.is_final and "(final)" or "") + for label, next in sorted(state.arcs.items()): + writer(" %s -> %d" % (label, self.states.index(next))) + + +class DFAState(object): + """A state of a DFA + + Attributes: + rule_name (str): The name of the DFA rule containing the represented state. + nfa_set (Set[NFAState]): The set of NFA states used to create this state. + is_final (bool): True if the state represents an accepting state of the DFA + containing this state. + arcs (Dict[str, DFAState]): A mapping representing transitions between + the current DFA state and another DFA state via following a label. + """ + + def __init__(self, rule_name, nfa_set, final): + assert isinstance(nfa_set, set) + assert isinstance(next(iter(nfa_set)), NFAState) + assert isinstance(final, NFAState) + self.rule_name = rule_name + self.nfa_set = nfa_set + self.arcs = {} # map from terminals/nonterminals to DFAState + self.is_final = final in nfa_set + + def add_arc(self, target, label): + """Add a new arc to the current state. + + Parameters: + target (DFAState): The DFA state at the end of the arc. + label (str): The label representing the token that must be consumed + to perform this transition. + """ + assert isinstance(label, str) + assert label not in self.arcs + assert isinstance(target, DFAState) + self.arcs[label] = target + + def unifystate(self, old, new): + """Replace all arcs from the current state pointing to *old* with arcs to *new*. + + Parameters: + old (DFAState): The DFA state to be replaced. + new (DFAState): The DFA state to use as the replacement. + """ + for label, next_ in self.arcs.items(): + if next_ is old: + self.arcs[label] = new + + def __eq__(self, other): + # The nfa_set does not matter for equality + assert isinstance(other, DFAState) + if self.is_final != other.is_final: + return False + # We cannot just return self.arcs == other.arcs because that + # would invoke this method recursively if there are any cycles. + if len(self.arcs) != len(other.arcs): + return False + for label, next_ in self.arcs.items(): + if next_ is not other.arcs.get(label): + return False + return True + + __hash__ = None # For Py3 compatibility.
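As a minimal illustration of the `DFA.from_nfa` subset construction documented above (a hypothetical sketch, not part of the patch, assuming `Parser/pgen/automata.py` is importable as `automata`), the following hand-builds the NFA for a made-up rule `maybe_b: 'a' ['b']` and converts it:

    # Hypothetical usage sketch; the bare import path is an assumption.
    from automata import NFA, NFAState, DFA

    # NFA for a rule roughly equivalent to:  maybe_b: 'a' ['b']
    begin = NFAState("maybe_b")
    middle = NFAState("maybe_b")
    end = NFAState("maybe_b")
    begin.add_arc(middle, "'a'")   # consume 'a'
    middle.add_arc(end, "'b'")     # consume 'b' ...
    middle.add_arc(end)            # ... or skip it via an epsilon arc

    nfa = NFA(begin, end)
    nfa.dump()                     # textual dump of the NFA

    dfa = DFA.from_nfa(nfa)        # Rabin-Scott subset construction
    dfa.simplify()                 # merge states that share the same arcs
    dfa.dump()                     # epsilon arcs are gone; two states are final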
+ + def __repr__(self): + return "<%s: %s is_final=%s>" % ( + self.__class__.__name__, + self.rule_name, + self.is_final, + ) diff --git a/Parser/pgen/grammar.py b/Parser/pgen/grammar.py index 5cd652426b4..56188db775a 100644 --- a/Parser/pgen/grammar.py +++ b/Parser/pgen/grammar.py @@ -76,12 +76,14 @@ def produce_graminit_c(self, writer): def print_labels(self, writer): writer( - "static const label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels)) + "static const label labels[{n_labels}] = {{\n".format( + n_labels=len(self.labels) + ) ) for label, name in self.labels: label_name = '"{}"'.format(name) if name is not None else 0 writer( - ' {{{label}, {label_name}}},\n'.format( + " {{{label}, {label_name}}},\n".format( label=label, label_name=label_name ) ) diff --git a/Parser/pgen/keywordgen.py b/Parser/pgen/keywordgen.py index eeb3ef739fa..f0234a81b62 100644 --- a/Parser/pgen/keywordgen.py +++ b/Parser/pgen/keywordgen.py @@ -32,17 +32,16 @@ def main(): - parser = argparse.ArgumentParser(description="Generate the Lib/keywords.py " - "file from the grammar.") + parser = argparse.ArgumentParser( + description="Generate the Lib/keywords.py " "file from the grammar." + ) parser.add_argument( "grammar", type=str, help="The file with the grammar definition in EBNF format" ) - parser.add_argument( - "tokens", type=str, help="The file with the token definitions" - ) + parser.add_argument("tokens", type=str, help="The file with the token definitions") parser.add_argument( "keyword_file", - type=argparse.FileType('w'), + type=argparse.FileType("w"), help="The path to write the keyword definitions", ) args = parser.parse_args() diff --git a/Parser/pgen/metaparser.py b/Parser/pgen/metaparser.py new file mode 100644 index 00000000000..074a083fb74 --- /dev/null +++ b/Parser/pgen/metaparser.py @@ -0,0 +1,152 @@ +"""Parser for the Python metagrammar""" + +import io +import tokenize # from stdlib + +from .automata import NFA, NFAState + + +class GrammarParser: + """Parser for Python grammar files.""" + + _translation_table = { + tokenize.NAME: "NAME", + tokenize.STRING: "STRING", + tokenize.NEWLINE: "NEWLINE", + tokenize.NL: "NL", + tokenize.OP: "OP", + tokenize.ENDMARKER: "ENDMARKER", + tokenize.COMMENT: "COMMENT", + } + + def __init__(self, grammar): + self.grammar = grammar + grammar_adaptor = io.StringIO(grammar) + self.generator = tokenize.generate_tokens(grammar_adaptor.readline) + self._gettoken() # Initialize lookahead + self._current_rule_name = None + + def parse(self): + """Turn the grammar into a collection of NFAs""" + # grammar: (NEWLINE | rule)* ENDMARKER + while self.type != tokenize.ENDMARKER: + while self.type == tokenize.NEWLINE: + self._gettoken() + # rule: NAME ':' rhs NEWLINE + self._current_rule_name = self._expect(tokenize.NAME) + self._expect(tokenize.OP, ":") + a, z = self._parse_rhs() + self._expect(tokenize.NEWLINE) + + yield NFA(a, z) + + def _parse_rhs(self): + # rhs: items ('|' items)* + a, z = self._parse_items() + if self.value != "|": + return a, z + else: + aa = NFAState(self._current_rule_name) + zz = NFAState(self._current_rule_name) + while True: + # Allow to transit directly to the previous state and connect the end of the + # previous state to the end of the current one, effectively allowing to skip + # the current state. 
+ aa.add_arc(a) + z.add_arc(zz) + if self.value != "|": + break + + self._gettoken() + a, z = self._parse_items() + return aa, zz + + def _parse_items(self): + # items: item+ + a, b = self._parse_item() + while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["): + c, d = self._parse_item() + # Allow a transition between the end of the previous state + # and the beginning of the new one, connecting all the items + # together. In this way we can only reach the end if we visit + # all the items. + b.add_arc(c) + b = d + return a, b + + def _parse_item(self): + # item: '[' rhs ']' | atom ['+' | '*'] + if self.value == "[": + self._gettoken() + a, z = self._parse_rhs() + self._expect(tokenize.OP, "]") + # Make a transition from the beginning to the end so it is possible to + # advance for free to the next state of this item without consuming + # anything from the rhs. + a.add_arc(z) + return a, z + else: + a, z = self._parse_atom() + value = self.value + if value not in ("+", "*"): + return a, z + self._gettoken() + z.add_arc(a) + if value == "+": + # Create a cycle to the beginning so we go back to the old state in this + # item and repeat. + return a, z + else: + # The end state is the same as the beginning, so we can cycle arbitrarily + # and end in the beginning if necessary. + return a, a + + def _parse_atom(self): + # atom: '(' rhs ')' | NAME | STRING + if self.value == "(": + self._gettoken() + a, z = self._parse_rhs() + self._expect(tokenize.OP, ")") + return a, z + elif self.type in (tokenize.NAME, tokenize.STRING): + a = NFAState(self._current_rule_name) + z = NFAState(self._current_rule_name) + # We can transition to the next state only if we consume the value. + a.add_arc(z, self.value) + self._gettoken() + return a, z + else: + self._raise_error( + "expected (...) or NAME or STRING, got {} ({})", + self._translation_table.get(self.type, self.type), + self.value, + ) + + def _expect(self, type_, value=None): + if self.type != type_: + self._raise_error( + "expected {}, got {} ({})", + self._translation_table.get(type_, type_), + self._translation_table.get(self.type, self.type), + self.value, + ) + if value is not None and self.value != value: + self._raise_error("expected {}, got {}", value, self.value) + value = self.value + self._gettoken() + return value + + def _gettoken(self): + tup = next(self.generator) + while tup[0] in (tokenize.COMMENT, tokenize.NL): + tup = next(self.generator) + self.type, self.value, self.begin, self.end, self.line = tup + + def _raise_error(self, msg, *args): + if args: + try: + msg = msg.format(*args) + except Exception: + msg = " ".join([msg] + list(map(str, args))) + line = self.grammar.splitlines()[self.begin[0] - 1] + raise SyntaxError(msg, ("", self.begin[0], self.begin[1], line)) diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py index d52d58f64e4..d7dcb769334 100644 --- a/Parser/pgen/pgen.py +++ b/Parser/pgen/pgen.py @@ -1,42 +1,180 @@ +"""Python parser generator + + +This parser generator transforms a Python grammar file into parsing tables +that can be consumed by Python's LL(1) parser written in C. + +Concepts +-------- + +* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a + top-down parser for a subset of context-free languages. It parses the input + from Left to right, performing Leftmost derivation of the sentence, and can + only use 1 token of lookahead when parsing a sentence.
+ +* A parsing table is a collection of data that a generic implementation of the + LL(1) parser consumes to know how to parse a given context-free grammar. In + this case the collection of data involves Deterministic Finite Automata, + calculated first sets, keywords and transition labels. + +* A grammar is defined by production rules (or just 'productions') that specify + which symbols may replace which other symbols; these rules may be used to + generate strings, or to parse them. Each such rule has a head, or left-hand + side, which consists of the string that may be replaced, and a body, or + right-hand side, which consists of a string that may replace it. In the + Python grammar, rules are written in the form + + rule_name: rule_description; + + meaning the rule 'a: b' specifies that a can be replaced by b. A context-free + grammar is a grammar in which the left-hand side of each production rule + consists of only a single nonterminal symbol. Context-free grammars can + always be recognized by a non-deterministic automaton. + +* Terminal symbols are literal symbols which may appear in the outputs of the + production rules of the grammar and which cannot be changed using the rules + of the grammar. Applying the rules recursively to a source string of symbols + will usually terminate in a final output string consisting only of terminal + symbols. + +* Nonterminal symbols are those symbols which can be replaced. The grammar + includes a start symbol, a designated member of the set of nonterminals from + which all the strings in the language may be derived by successive + applications of the production rules. + +* The language defined by the grammar is defined as the set of terminal strings + that can be derived using the production rules. + +* The first sets of a rule (FIRST(rule)) are defined to be the set of terminals + that can appear in the first position of any string derived from the rule. + This is useful for LL(1) parsers as the parser is only allowed to look at the + next token in the input to know which rule it needs to parse. For example given + this grammar: + + start: '(' A | B ')' + A: 'a' '<' + B: 'b' '<' + + and the input '(b<)' the parser can only look at 'b' to know if it needs + to parse A or B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows + that it needs to continue parsing rule B because only that rule can start + with 'b'. + +Description +----------- + +The input for the parser generator is a grammar in extended BNF form (using * +for repetition, + for at-least-once repetition, [] for optional parts, | for +alternatives and () for grouping). + +Each rule in the grammar file is considered as a regular expression in its +own right. It is turned into a Non-deterministic Finite Automaton (NFA), +which is then turned into a Deterministic Finite Automaton (DFA), which is +then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, +or similar compiler books (this technique is more often used for lexical +analyzers). + +The DFAs are used by the parser as parsing tables in a special way that's +probably unique. Before they are usable, the FIRST sets of all non-terminals +are computed so the LL(1) parser consuming the parsing tables can distinguish +between different transitions. +Reference +--------- + +[Aho&Ullman 77] + Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977 + (first edition) +""" + +from ast import literal_eval import collections -import tokenize # from stdlib from .
import grammar, token +from .automata import DFA +from .metaparser import GrammarParser + +import enum + + +class LabelType(enum.Enum): + NONTERMINAL = 0 + NAMED_TOKEN = 1 + KEYWORD = 2 + OPERATOR = 3 + NONE = 4 + + +class Label(str): + def __init__(self, value): + self.type = self._get_type() + + def _get_type(self): + if self[0].isalpha(): + if self.upper() == self: + # NAMED tokens (ASYNC, NAME...) are all uppercase by convention + return LabelType.NAMED_TOKEN + else: + # If is not uppercase it must be a non terminal. + return LabelType.NONTERMINAL + else: + # Keywords and operators are wrapped in quotes + assert self[0] == self[-1] in ('"', "'"), self + value = literal_eval(self) + if value[0].isalpha(): + return LabelType.KEYWORD + else: + return LabelType.OPERATOR + + def __repr__(self): + return "{}({})".format(self.type, super().__repr__()) class ParserGenerator(object): - - def __init__(self, grammar_file, token_file, stream=None, verbose=False): - close_stream = None - if stream is None: - stream = open(grammar_file) - close_stream = stream.close + def __init__(self, grammar_file, token_file, verbose=False): + with open(grammar_file) as f: + self.grammar = f.read() with open(token_file) as tok_file: token_lines = tok_file.readlines() self.tokens = dict(token.generate_tokens(token_lines)) self.opmap = dict(token.generate_opmap(token_lines)) # Manually add <> so it does not collide with != - self.opmap['<>'] = "NOTEQUAL" + self.opmap["<>"] = "NOTEQUAL" self.verbose = verbose self.filename = grammar_file - self.stream = stream - self.generator = tokenize.generate_tokens(stream.readline) - self.gettoken() # Initialize lookahead - self.dfas, self.startsymbol = self.parse() - if close_stream is not None: - close_stream() - self.first = {} # map from symbol name to set of tokens - self.addfirstsets() + self.dfas, self.startsymbol = self.create_dfas() + self.first = {} # map from symbol name to set of tokens + self.calculate_first_sets() + + def create_dfas(self): + rule_to_dfas = collections.OrderedDict() + start_nonterminal = None + for nfa in GrammarParser(self.grammar).parse(): + if self.verbose: + print("Dump of NFA for", nfa.name) + nfa.dump() + dfa = DFA.from_nfa(nfa) + if self.verbose: + print("Dump of DFA for", dfa.name) + dfa.dump() + dfa.simplify() + rule_to_dfas[dfa.name] = dfa + + if start_nonterminal is None: + start_nonterminal = dfa.name + + return rule_to_dfas, start_nonterminal def make_grammar(self): c = grammar.Grammar() + c.all_labels = set() names = list(self.dfas.keys()) names.remove(self.startsymbol) names.insert(0, self.startsymbol) for name in names: i = 256 + len(c.symbol2number) - c.symbol2number[name] = i - c.number2symbol[i] = name + c.symbol2number[Label(name)] = i + c.number2symbol[i] = Label(name) + c.all_labels.add(name) for name in names: self.make_label(c, name) dfa = self.dfas[name] @@ -44,12 +182,13 @@ def make_grammar(self): for state in dfa: arcs = [] for label, next in sorted(state.arcs.items()): - arcs.append((self.make_label(c, label), dfa.index(next))) - if state.isfinal: - arcs.append((0, dfa.index(state))) + c.all_labels.add(label) + arcs.append((self.make_label(c, label), dfa.states.index(next))) + if state.is_final: + arcs.append((0, dfa.states.index(state))) states.append(arcs) c.states.append(states) - c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name)) + c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name)) c.start = c.symbol2number[self.startsymbol] if self.verbose: @@ -68,7 +207,7 @@ def 
make_grammar(self): ) return c - def make_first(self, c, name): + def make_first_sets(self, c, name): rawfirst = self.first[name] first = set() for label in sorted(rawfirst): @@ -78,67 +217,65 @@ def make_first(self, c, name): return first def make_label(self, c, label): - # XXX Maybe this should be a method on a subclass of converter? + label = Label(label) ilabel = len(c.labels) - if label[0].isalpha(): - # Either a symbol name or a named token - if label in c.symbol2number: - # A symbol name (a non-terminal) - if label in c.symbol2label: - return c.symbol2label[label] - else: - c.labels.append((c.symbol2number[label], None)) - c.symbol2label[label] = ilabel - return ilabel - else: - # A named token (NAME, NUMBER, STRING) - itoken = self.tokens.get(label, None) - assert isinstance(itoken, int), label - assert itoken in self.tokens.values(), label - if itoken in c.tokens: - return c.tokens[itoken] - else: - c.labels.append((itoken, None)) - c.tokens[itoken] = ilabel - return ilabel - else: - # Either a keyword or an operator - assert label[0] in ('"', "'"), label - value = eval(label) - if value[0].isalpha(): - # A keyword - if value in c.keywords: - return c.keywords[value] - else: - c.labels.append((self.tokens["NAME"], value)) - c.keywords[value] = ilabel - return ilabel - else: - # An operator (any non-numeric token) - tok_name = self.opmap[value] # Fails if unknown token - itoken = self.tokens[tok_name] - if itoken in c.tokens: - return c.tokens[itoken] - else: - c.labels.append((itoken, None)) - c.tokens[itoken] = ilabel - return ilabel - def addfirstsets(self): + if label.type == LabelType.NONTERMINAL: + if label in c.symbol2label: + return c.symbol2label[label] + else: + c.labels.append((c.symbol2number[label], None)) + c.symbol2label[label] = ilabel + return ilabel + elif label.type == LabelType.NAMED_TOKEN: + # A named token (NAME, NUMBER, STRING) + itoken = self.tokens.get(label, None) + assert isinstance(itoken, int), label + assert itoken in self.tokens.values(), label + if itoken in c.tokens: + return c.tokens[itoken] + else: + c.labels.append((itoken, None)) + c.tokens[itoken] = ilabel + return ilabel + elif label.type == LabelType.KEYWORD: + # A keyword + value = literal_eval(label) + if value in c.keywords: + return c.keywords[value] + else: + c.labels.append((self.tokens["NAME"], value)) + c.keywords[value] = ilabel + return ilabel + elif label.type == LabelType.OPERATOR: + # An operator (any non-numeric token) + value = literal_eval(label) + tok_name = self.opmap[value] # Fails if unknown token + itoken = self.tokens[tok_name] + if itoken in c.tokens: + return c.tokens[itoken] + else: + c.labels.append((itoken, None)) + c.tokens[itoken] = ilabel + return ilabel + else: + raise ValueError("Cannot categorize label {}".format(label)) + + def calculate_first_sets(self): names = list(self.dfas.keys()) for name in names: if name not in self.first: - self.calcfirst(name) + self.calculate_first_sets_for_rule(name) if self.verbose: print("First set for {dfa_name}".format(dfa_name=name)) for item in self.first[name]: print(" - {terminal}".format(terminal=item)) - def calcfirst(self, name): + def calculate_first_sets_for_rule(self, name): dfa = self.dfas[name] - self.first[name] = None # dummy to detect left recursion - state = dfa[0] + self.first[name] = None # dummy to detect left recursion + state = dfa.states[0] totalset = set() overlapcheck = {} for label, next in state.arcs.items(): @@ -148,7 +285,7 @@ def calcfirst(self, name): if fset is None: raise ValueError("recursion 
for rule %r" % name) else: - self.calcfirst(label) + self.calculate_first_sets_for_rule(label) fset = self.first[label] totalset.update(fset) overlapcheck[label] = fset @@ -159,248 +296,10 @@ def calcfirst(self, name): for label, itsfirst in overlapcheck.items(): for symbol in itsfirst: if symbol in inverse: - raise ValueError("rule %s is ambiguous; %s is in the" - " first sets of %s as well as %s" % - (name, symbol, label, inverse[symbol])) + raise ValueError( + "rule %s is ambiguous; %s is in the" + " first sets of %s as well as %s" + % (name, symbol, label, inverse[symbol]) + ) inverse[symbol] = label self.first[name] = totalset - - def parse(self): - dfas = collections.OrderedDict() - startsymbol = None - # MSTART: (NEWLINE | RULE)* ENDMARKER - while self.type != tokenize.ENDMARKER: - while self.type == tokenize.NEWLINE: - self.gettoken() - # RULE: NAME ':' RHS NEWLINE - name = self.expect(tokenize.NAME) - if self.verbose: - print("Processing rule {dfa_name}".format(dfa_name=name)) - self.expect(tokenize.OP, ":") - a, z = self.parse_rhs() - self.expect(tokenize.NEWLINE) - if self.verbose: - self.dump_nfa(name, a, z) - dfa = self.make_dfa(a, z) - if self.verbose: - self.dump_dfa(name, dfa) - self.simplify_dfa(dfa) - dfas[name] = dfa - if startsymbol is None: - startsymbol = name - return dfas, startsymbol - - def make_dfa(self, start, finish): - # To turn an NFA into a DFA, we define the states of the DFA - # to correspond to *sets* of states of the NFA. Then do some - # state reduction. Let's represent sets as dicts with 1 for - # values. - assert isinstance(start, NFAState) - assert isinstance(finish, NFAState) - def closure(state): - base = set() - addclosure(state, base) - return base - def addclosure(state, base): - assert isinstance(state, NFAState) - if state in base: - return - base.add(state) - for label, next in state.arcs: - if label is None: - addclosure(next, base) - states = [DFAState(closure(start), finish)] - for state in states: # NB states grows while we're iterating - arcs = {} - for nfastate in state.nfaset: - for label, next in nfastate.arcs: - if label is not None: - addclosure(next, arcs.setdefault(label, set())) - for label, nfaset in sorted(arcs.items()): - for st in states: - if st.nfaset == nfaset: - break - else: - st = DFAState(nfaset, finish) - states.append(st) - state.addarc(st, label) - return states # List of DFAState instances; first one is start - - def dump_nfa(self, name, start, finish): - print("Dump of NFA for", name) - todo = [start] - for i, state in enumerate(todo): - print(" State", i, state is finish and "(final)" or "") - for label, next in state.arcs: - if next in todo: - j = todo.index(next) - else: - j = len(todo) - todo.append(next) - if label is None: - print(" -> %d" % j) - else: - print(" %s -> %d" % (label, j)) - - def dump_dfa(self, name, dfa): - print("Dump of DFA for", name) - for i, state in enumerate(dfa): - print(" State", i, state.isfinal and "(final)" or "") - for label, next in sorted(state.arcs.items()): - print(" %s -> %d" % (label, dfa.index(next))) - - def simplify_dfa(self, dfa): - # This is not theoretically optimal, but works well enough. - # Algorithm: repeatedly look for two states that have the same - # set of arcs (same labels pointing to the same nodes) and - # unify them, until things stop changing. 
- - # dfa is a list of DFAState instances - changes = True - while changes: - changes = False - for i, state_i in enumerate(dfa): - for j in range(i+1, len(dfa)): - state_j = dfa[j] - if state_i == state_j: - #print " unify", i, j - del dfa[j] - for state in dfa: - state.unifystate(state_j, state_i) - changes = True - break - - def parse_rhs(self): - # RHS: ALT ('|' ALT)* - a, z = self.parse_alt() - if self.value != "|": - return a, z - else: - aa = NFAState() - zz = NFAState() - aa.addarc(a) - z.addarc(zz) - while self.value == "|": - self.gettoken() - a, z = self.parse_alt() - aa.addarc(a) - z.addarc(zz) - return aa, zz - - def parse_alt(self): - # ALT: ITEM+ - a, b = self.parse_item() - while (self.value in ("(", "[") or - self.type in (tokenize.NAME, tokenize.STRING)): - c, d = self.parse_item() - b.addarc(c) - b = d - return a, b - - def parse_item(self): - # ITEM: '[' RHS ']' | ATOM ['+' | '*'] - if self.value == "[": - self.gettoken() - a, z = self.parse_rhs() - self.expect(tokenize.OP, "]") - a.addarc(z) - return a, z - else: - a, z = self.parse_atom() - value = self.value - if value not in ("+", "*"): - return a, z - self.gettoken() - z.addarc(a) - if value == "+": - return a, z - else: - return a, a - - def parse_atom(self): - # ATOM: '(' RHS ')' | NAME | STRING - if self.value == "(": - self.gettoken() - a, z = self.parse_rhs() - self.expect(tokenize.OP, ")") - return a, z - elif self.type in (tokenize.NAME, tokenize.STRING): - a = NFAState() - z = NFAState() - a.addarc(z, self.value) - self.gettoken() - return a, z - else: - self.raise_error("expected (...) or NAME or STRING, got %s/%s", - self.type, self.value) - - def expect(self, type, value=None): - if self.type != type or (value is not None and self.value != value): - self.raise_error("expected %s/%s, got %s/%s", - type, value, self.type, self.value) - value = self.value - self.gettoken() - return value - - def gettoken(self): - tup = next(self.generator) - while tup[0] in (tokenize.COMMENT, tokenize.NL): - tup = next(self.generator) - self.type, self.value, self.begin, self.end, self.line = tup - # print(getattr(tokenize, 'tok_name')[self.type], repr(self.value)) - - def raise_error(self, msg, *args): - if args: - try: - msg = msg % args - except Exception: - msg = " ".join([msg] + list(map(str, args))) - raise SyntaxError(msg, (self.filename, self.end[0], - self.end[1], self.line)) - -class NFAState(object): - - def __init__(self): - self.arcs = [] # list of (label, NFAState) pairs - - def addarc(self, next, label=None): - assert label is None or isinstance(label, str) - assert isinstance(next, NFAState) - self.arcs.append((label, next)) - -class DFAState(object): - - def __init__(self, nfaset, final): - assert isinstance(nfaset, set) - assert isinstance(next(iter(nfaset)), NFAState) - assert isinstance(final, NFAState) - self.nfaset = nfaset - self.isfinal = final in nfaset - self.arcs = {} # map from label to DFAState - - def addarc(self, next, label): - assert isinstance(label, str) - assert label not in self.arcs - assert isinstance(next, DFAState) - self.arcs[label] = next - - def unifystate(self, old, new): - for label, next in self.arcs.items(): - if next is old: - self.arcs[label] = new - - def __eq__(self, other): - # Equality test -- ignore the nfaset instance variable - assert isinstance(other, DFAState) - if self.isfinal != other.isfinal: - return False - # Can't just return self.arcs == other.arcs, because that - # would invoke this method recursively, with cycles... 
- if len(self.arcs) != len(other.arcs): - return False - for label, next in self.arcs.items(): - if next is not other.arcs.get(label): - return False - return True - - __hash__ = None # For Py3 compatibility. diff --git a/Parser/pgen/token.py b/Parser/pgen/token.py index e7e8f3f1b66..2cff62ce3b2 100644 --- a/Parser/pgen/token.py +++ b/Parser/pgen/token.py @@ -6,21 +6,21 @@ def generate_tokens(tokens): for line in tokens: line = line.strip() - if not line or line.startswith('#'): + if not line or line.startswith("#"): continue name = line.split()[0] yield (name, next(numbers)) - yield ('N_TOKENS', next(numbers)) - yield ('NT_OFFSET', 256) + yield ("N_TOKENS", next(numbers)) + yield ("NT_OFFSET", 256) def generate_opmap(tokens): for line in tokens: line = line.strip() - if not line or line.startswith('#'): + if not line or line.startswith("#"): continue pieces = line.split() @@ -35,4 +35,4 @@ def generate_opmap(tokens): # with the token generation in "generate_tokens" because if this # symbol is included in Grammar/Tokens, it will collide with != # as it has the same name (NOTEQUAL). - yield ('<>', 'NOTEQUAL') + yield ("<>", "NOTEQUAL")
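For reference, a hypothetical end-to-end sketch (not part of the patch) that mirrors what ParserGenerator.create_dfas() now does: a tiny EBNF rule is fed through the new metaparser and automata modules. The example grammar string and the bare import paths are assumptions.

    # Hypothetical sketch; assumes Parser/pgen is on sys.path so the new
    # modules can be imported directly.
    from metaparser import GrammarParser
    from automata import DFA

    grammar = "assign: NAME ('=' NAME)* NEWLINE\n"

    for nfa in GrammarParser(grammar).parse():
        print("Rule:", nfa.name)
        nfa.dump()                  # NFA built by the metaparser
        dfa = DFA.from_nfa(nfa)     # Rabin-Scott subset construction
        dfa.simplify()              # merge states with identical arcs
        dfa.dump()                  # the states that become parsing tables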