Make dbm.dumb encode strings as UTF-8. Also fix it so it accepts bytes and
strings. Closes issue #3799.
2024-09-20 00:11:49 +00:00 · 2008-11-21 00:17:53 +00:00 · 2008-11-21 00:17:53 +00:00 · 58425d3103
parent 6e0d68e9e2
commit 58425d3103
3 changed files with 52 additions and 27 deletions
--- a/Lib/dbm/dumb.py
+++ b/Lib/dbm/dumb.py
@ -84,6 +84,7 @@ def _update(self):
            for line in f:
                line = line.rstrip()
                key, pos_and_siz_pair = eval(line)
+                key = key.encode('Latin-1')
                self._index[key] = pos_and_siz_pair
            f.close()

@ -110,13 +111,16 @@ def _commit(self):
        f = self._io.open(self._dirfile, 'w')
        self._chmod(self._dirfile)
        for key, pos_and_siz_pair in self._index.items():
-            f.write("%r, %r\n" % (key, pos_and_siz_pair))
+            # Use Latin-1 since it has no qualms with any value in any
+            # position; UTF-8, though, does care sometimes.
+            f.write("%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair))
        f.close()

    sync = _commit

    def __getitem__(self, key):
-        key = key.decode("latin-1")
+        if isinstance(key, str):
+            key = key.encode('utf-8')
        pos, siz = self._index[key]     # may raise KeyError
        f = _io.open(self._datfile, 'rb')
        f.seek(pos)
@ -161,11 +165,12 @@ def _addkey(self, key, pos_and_siz_pair):
        f.close()

    def __setitem__(self, key, val):
-        if not isinstance(key, bytes):
-            raise TypeError("keys must be bytes")
-        key = key.decode("latin-1") # hashable bytes
+        if isinstance(key, str):
+            key = key.encode('utf-8')
+        elif not isinstance(key, (bytes, bytearray)):
+            raise TypeError("keys must be bytes or strings")
        if not isinstance(val, (bytes, bytearray)):
-            raise TypeError("values must be byte strings")
+            raise TypeError("values must be bytes")
        if key not in self._index:
            self._addkey(key, self._addval(val))
        else:
@ -191,7 +196,8 @@ def __setitem__(self, key, val):
            # (so that _commit() never gets called).

    def __delitem__(self, key):
-        key = key.decode("latin-1")
+        if isinstance(key, str):
+            key = key.encode('utf-8')
        # The blocks used by the associated value are lost.
        del self._index[key]
        # XXX It's unclear why we do a _commit() here (the code always
@ -201,14 +207,14 @@ def __delitem__(self, key):
        self._commit()

    def keys(self):
-        return [key.encode("latin-1") for key in self._index.keys()]
+        return list(self._index.keys())

    def items(self):
-        return [(key.encode("latin-1"), self[key.encode("latin-1")])
-                for key in self._index.keys()]
+        return [(key, self[key]) for key in self._index.keys()]

    def __contains__(self, key):
-        key = key.decode("latin-1")
+        if isinstance(key, str):
+            key = key.encode('utf-8')
        return key in self._index

    def iterkeys(self):
--- a/Lib/test/test_dbm_dumb.py
+++ b/Lib/test/test_dbm_dumb.py
@ -19,13 +19,14 @@ def _delete_files():
            pass

 class DumbDBMTestCase(unittest.TestCase):
-    _dict = {'0': b'',
-             'a': b'Python:',
-             'b': b'Programming',
-             'c': b'the',
-             'd': b'way',
-             'f': b'Guido',
-             'g': b'intended',
+    _dict = {b'0': b'',
+             b'a': b'Python:',
+             b'b': b'Programming',
+             b'c': b'the',
+             b'd': b'way',
+             b'f': b'Guido',
+             b'g': b'intended',
+             '\u00fc'.encode('utf-8') : b'!',
             }

    def __init__(self, *args):
@ -35,7 +36,7 @@ def test_dumbdbm_creation(self):
        f = dumbdbm.open(_fname, 'c')
        self.assertEqual(list(f.keys()), [])
        for key in self._dict:
-            f[key.encode("ascii")] = self._dict[key]
+            f[key] = self._dict[key]
        self.read_helper(f)
        f.close()

@ -73,7 +74,7 @@ def test_close_twice(self):
    def test_dumbdbm_modification(self):
        self.init_db()
        f = dumbdbm.open(_fname, 'w')
-        self._dict['g'] = f[b'g'] = b"indented"
+        self._dict[b'g'] = f[b'g'] = b"indented"
        self.read_helper(f)
        f.close()

@ -105,6 +106,21 @@ def test_write_write_read(self):
        self.assertEqual(f[b'1'], b'hello2')
        f.close()

+    def test_str_read(self):
+        self.init_db()
+        f = dumbdbm.open(_fname, 'r')
+        self.assertEqual(f['\u00fc'], self._dict['\u00fc'.encode('utf-8')])
+
+    def test_str_write_contains(self):
+        self.init_db()
+        f = dumbdbm.open(_fname)
+        f['\u00fc'] = b'!'
+        f.close()
+        f = dumbdbm.open(_fname, 'r')
+        self.assert_('\u00fc' in f)
+        self.assertEqual(f['\u00fc'.encode('utf-8')],
+                         self._dict['\u00fc'.encode('utf-8')])
+
    def test_line_endings(self):
        # test for bug #1172763: dumbdbm would die if the line endings
        # weren't what was expected.
@ -129,16 +145,16 @@ def test_line_endings(self):
    def read_helper(self, f):
        keys = self.keys_helper(f)
        for key in self._dict:
-            self.assertEqual(self._dict[key], f[key.encode("ascii")])
+            self.assertEqual(self._dict[key], f[key])

    def init_db(self):
        f = dumbdbm.open(_fname, 'w')
        for k in self._dict:
-            f[k.encode("ascii")] = self._dict[k]
+            f[k] = self._dict[k]
        f.close()

    def keys_helper(self, f):
-        keys = sorted(k.decode("ascii") for k in f.keys())
+        keys = sorted(f.keys())
        dkeys = sorted(self._dict.keys())
        self.assertEqual(keys, dkeys)
        return keys
@ -155,12 +171,12 @@ def test_random(self):
                if random.random() < 0.2:
                    if k in d:
                        del d[k]
-                        del f[k.encode("ascii")]
+                        del f[k]
                else:
                    v = random.choice((b'a', b'b', b'c')) * random.randrange(10000)
                    d[k] = v
-                    f[k.encode("ascii")] = v
-                    self.assertEqual(f[k.encode("ascii")], v)
+                    f[k] = v
+                    self.assertEqual(f[k], v)
            f.close()

            f = dumbdbm.open(_fname)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -19,7 +19,7 @@ Core and Builtins
 - Issue #3327: Don't overallocate in the modules_by_index list.

 - Issue #1721812:  Binary set operations and copy() returned the input type
-  instead of the appropriate base type.  This was incorrect because set 
+  instead of the appropriate base type.  This was incorrect because set
  subclasses would be created without their __init__() method being called.
  The corrected behavior brings sets into line with lists and dicts.

@ -33,6 +33,9 @@ Core and Builtins
 Library
 -------

+- Issue #3799: Fix dbm.dumb to accept strings as well as bytes for keys. String
+  keys are now written out in UTF-8.
+
 - Issue #4338: Fix distutils upload command.

 - Issue #4354: Fix distutils register command.