GH-96172 fix unicodedata.east_asian_width being wrong on unassigned code points (#96207)

2024-09-05 00:05:39 +00:00 · 2022-08-26 18:29:39 +02:00 · 2022-08-26 18:29:39 +02:00 · 9c197bc8bf
parent c1581a928c
commit 9c197bc8bf
4 changed files with 686 additions and 620 deletions
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -71,7 +71,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):

    # Update this if the database changes. Make sure to do a full rebuild
    # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370'
+    expectedchecksum = '4975f3ec0acd4a62465d18c9bf8519b1964181f6'

    @requires_resource('cpu')
    def test_function_checksum(self):
@@ -90,6 +90,7 @@ def test_function_checksum(self):
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
+                unicodedata.east_asian_width(char),
            ]
            h.update(''.join(data).encode("ascii"))
        result = h.hexdigest()
@@ -220,6 +221,23 @@ def test_east_asian_width(self):
        self.assertEqual(eaw('\u2010'), 'A')
        self.assertEqual(eaw('\U00020000'), 'W')

+    def test_east_asian_width_unassigned(self):
+        eaw = self.db.east_asian_width
+        # unassigned
+        for char in '\u0530\u0ece\u10c6\u20fc\uaaca\U000107bd\U000115f2':
+            self.assertEqual(eaw(char), 'N')
+            self.assertIs(self.db.name(char, None), None)
+
+        # unassigned but reserved for CJK
+        for char in '\uFA6E\uFADA\U0002A6E0\U0002FA20\U0003134B\U0003FFFD':
+            self.assertEqual(eaw(char), 'W')
+            self.assertIs(self.db.name(char, None), None)
+
+        # private use areas
+        for char in '\uE000\uF800\U000F0000\U000FFFEE\U00100000\U0010FFF0':
+            self.assertEqual(eaw(char), 'A')
+            self.assertIs(self.db.name(char, None), None)
+
    def test_east_asian_width_9_0_changes(self):
        self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
        self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
--- a/Misc/NEWS.d/next/Library/2022-08-23-13-30-30.gh-issue-96172.7WTHer.rst
+++ b/Misc/NEWS.d/next/Library/2022-08-23-13-30-30.gh-issue-96172.7WTHer.rst
@@ -0,0 +1,3 @@
+Fix a bug in ``unicodedata``: ``east_asian_width`` used to return the wrong
+value for unassigned characters, and for characters that are reserved but
+not yet assigned.
--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -77,7 +77,8 @@
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

-EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
+# "N" needs to be the first entry, see the comment in makeunicodedata
+EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]

 MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

@@ -135,6 +136,14 @@ def maketables(trace=0):

 def makeunicodedata(unicode, trace):

+    # The default value of east_asian_width is "N"; it applies to unassigned
+    # code points that are not mentioned in EastAsianWidth.txt.
+    # In addition, some reserved but unassigned code points in CJK ranges
+    # are classified as "W", and code points in private use areas have a
+    # width of "A". Both of these groups have entries in
+    # EastAsianWidth.txt.
+    # See https://unicode.org/reports/tr11/#Unassigned
+    assert EASTASIANWIDTH_NAMES[0] == "N"
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
@@ -160,12 +169,20 @@ def makeunicodedata(unicode, trace):
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
-            # add entry to index and item tables
-            i = cache.get(item)
-            if i is None:
-                cache[item] = i = len(table)
-                table.append(item)
-            index[char] = i
+        elif unicode.widths[char] is not None:
+            # an unassigned but reserved character, with a known
+            # east_asian_width
+            eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
+            item = (0, 0, 0, 0, eastasianwidth, 0)
+        else:
+            continue
+
+        # add entry to index and item tables
+        i = cache.get(item)
+        if i is None:
+            cache[item] = i = len(table)
+            table.append(item)
+        index[char] = i

    # 2) decomposition data

@@ -1085,6 +1102,7 @@ def __init__(self, version, cjk_check=True):
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].east_asian_width = widths[i]
+        self.widths = widths

        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if table[char]: