Use unicodedata instead of eval.

2024-10-14 23:28:08 +00:00 · 2018-02-08 18:51:45 +02:00 · 2018-02-08 18:51:45 +02:00 · 2272cec13b
parent c0a268962a
commit 2272cec13b
1 changed files with 22 additions and 26 deletions
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -13,7 +13,7 @@
 # XXX: show string offset and offending character for all errors

 from sre_constants import *
-from ast import literal_eval
+import unicodedata

 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS = "*+?{"
@@ -26,10 +26,6 @@

 WHITESPACE = frozenset(" \t\n\r\v\f")

-UNICODE_NAME = ASCIILETTERS | DIGITS | frozenset(' -')
-CLOSING_BRACE = frozenset("}")
-OPENING_BRACE = frozenset("{")
-

 _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
 _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
@@ -270,19 +266,19 @@ def getwhile(self, n, charset):
            result += c
            self.__next()
        return result
-    def getuntil(self, terminator):
+    def getuntil(self, terminator, name):
        result = ''
        while True:
            c = self.next
            self.__next()
            if c is None:
                if not result:
-                    raise self.error("missing group name")
+                    raise self.error("missing " + name)
                raise self.error("missing %s, unterminated name" % terminator,
                                 len(result))
            if c == terminator:
                if not result:
-                    raise self.error("missing group name", 1)
+                    raise self.error("missing " + name, 1)
                break
            result += c
        return result
@@ -330,14 +326,14 @@ def _class_escape(source, escape):
            return LITERAL, c
        elif c == "N" and source.istext:
            # named unicode escape e.g. \N{EM DASH}
-            escape += source.getwhile(1, OPENING_BRACE)
-            escape += source.getwhile(100, UNICODE_NAME)
-            escape += source.getwhile(1, CLOSING_BRACE)
+            if not source.match('{'):
+                raise source.error("missing {")
+            charname = source.getuntil('}', 'character name')
            try:
-                c = ord(literal_eval('"%s"' % escape))
-            except SyntaxError:
-                charname = escape[2:].strip('{}')
-                raise source.error("unknown Unicode character name %s" % charname, len(escape))
+                c = ord(unicodedata.lookup(charname))
+            except KeyError:
+                raise source.error("undefined character name %r" % charname,
+                                   len(charname) + len(r'\N{}'))
            return LITERAL, c
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
@@ -389,14 +385,14 @@ def _escape(source, escape, state):
            return LITERAL, c
        elif c == "N" and source.istext:
            # named unicode escape e.g. \N{EM DASH}
-            escape += source.getwhile(1, OPENING_BRACE)
-            escape += source.getwhile(100, UNICODE_NAME)
-            escape += source.getwhile(1, CLOSING_BRACE)
+            if not source.match('{'):
+                raise source.error("missing {")
+            charname = source.getuntil('}', 'character name')
            try:
-                c = ord(literal_eval('"%s"' % escape))
-            except SyntaxError:
-                charname = escape[2:].strip('{}')
-                raise source.error("unknown Unicode character name %s" % charname, len(escape))
+                c = ord(unicodedata.lookup(charname))
+            except KeyError:
+                raise source.error("undefined character name %r" % charname,
+                                   len(charname) + len(r'\N{}'))
            return LITERAL, c
        elif c == "0":
            # octal escape
@@ -707,13 +703,13 @@ def _parse(source, state, verbose, nested, first=False):
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
-                        name = source.getuntil(">")
+                        name = source.getuntil(">", "group name")
                        if not name.isidentifier():
                            msg = "bad character in group name %r" % name
                            raise source.error(msg, len(name) + 1)
                    elif sourcematch("="):
                        # named backreference
-                        name = source.getuntil(")")
+                        name = source.getuntil(")", "group name")
                        if not name.isidentifier():
                            msg = "bad character in group name %r" % name
                            raise source.error(msg, len(name) + 1)
@@ -776,7 +772,7 @@ def _parse(source, state, verbose, nested, first=False):

                elif char == "(":
                    # conditional backreference group
-                    condname = source.getuntil(")")
+                    condname = source.getuntil(")", "group name")
                    if condname.isidentifier():
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
@@ -1005,7 +1001,7 @@ def addgroup(index, pos):
                name = ""
                if not s.match("<"):
                    raise s.error("missing <")
-                name = s.getuntil(">")
+                name = s.getuntil(">", "group name")
                if name.isidentifier():
                    try:
                        index = groupindex[name]