Make dbm.dumb encode strings as UTF-8. Also fix it so it accepts bytes and strings.

Closes issue #3799.
This commit is contained in:
Brett Cannon 2008-11-21 00:17:53 +00:00
parent 6e0d68e9e2
commit 58425d3103
3 changed files with 52 additions and 27 deletions

View file

@ -84,6 +84,7 @@ def _update(self):
for line in f:
line = line.rstrip()
key, pos_and_siz_pair = eval(line)
key = key.encode('Latin-1')
self._index[key] = pos_and_siz_pair
f.close()
@ -110,13 +111,16 @@ def _commit(self):
f = self._io.open(self._dirfile, 'w')
self._chmod(self._dirfile)
for key, pos_and_siz_pair in self._index.items():
f.write("%r, %r\n" % (key, pos_and_siz_pair))
# Use Latin-1 since it has no qualms with any value in any
# position; UTF-8, though, does care sometimes.
f.write("%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair))
f.close()
sync = _commit
def __getitem__(self, key):
key = key.decode("latin-1")
if isinstance(key, str):
key = key.encode('utf-8')
pos, siz = self._index[key] # may raise KeyError
f = _io.open(self._datfile, 'rb')
f.seek(pos)
@ -161,11 +165,12 @@ def _addkey(self, key, pos_and_siz_pair):
f.close()
def __setitem__(self, key, val):
if not isinstance(key, bytes):
raise TypeError("keys must be bytes")
key = key.decode("latin-1") # hashable bytes
if isinstance(key, str):
key = key.encode('utf-8')
elif not isinstance(key, (bytes, bytearray)):
raise TypeError("keys must be bytes or strings")
if not isinstance(val, (bytes, bytearray)):
raise TypeError("values must be byte strings")
raise TypeError("values must be bytes")
if key not in self._index:
self._addkey(key, self._addval(val))
else:
@ -191,7 +196,8 @@ def __setitem__(self, key, val):
# (so that _commit() never gets called).
def __delitem__(self, key):
key = key.decode("latin-1")
if isinstance(key, str):
key = key.encode('utf-8')
# The blocks used by the associated value are lost.
del self._index[key]
# XXX It's unclear why we do a _commit() here (the code always
@ -201,14 +207,14 @@ def __delitem__(self, key):
self._commit()
def keys(self):
return [key.encode("latin-1") for key in self._index.keys()]
return list(self._index.keys())
def items(self):
return [(key.encode("latin-1"), self[key.encode("latin-1")])
for key in self._index.keys()]
return [(key, self[key]) for key in self._index.keys()]
def __contains__(self, key):
key = key.decode("latin-1")
if isinstance(key, str):
key = key.encode('utf-8')
return key in self._index
def iterkeys(self):

View file

@ -19,13 +19,14 @@ def _delete_files():
pass
class DumbDBMTestCase(unittest.TestCase):
_dict = {'0': b'',
'a': b'Python:',
'b': b'Programming',
'c': b'the',
'd': b'way',
'f': b'Guido',
'g': b'intended',
_dict = {b'0': b'',
b'a': b'Python:',
b'b': b'Programming',
b'c': b'the',
b'd': b'way',
b'f': b'Guido',
b'g': b'intended',
'\u00fc'.encode('utf-8') : b'!',
}
def __init__(self, *args):
@ -35,7 +36,7 @@ def test_dumbdbm_creation(self):
f = dumbdbm.open(_fname, 'c')
self.assertEqual(list(f.keys()), [])
for key in self._dict:
f[key.encode("ascii")] = self._dict[key]
f[key] = self._dict[key]
self.read_helper(f)
f.close()
@ -73,7 +74,7 @@ def test_close_twice(self):
def test_dumbdbm_modification(self):
self.init_db()
f = dumbdbm.open(_fname, 'w')
self._dict['g'] = f[b'g'] = b"indented"
self._dict[b'g'] = f[b'g'] = b"indented"
self.read_helper(f)
f.close()
@ -105,6 +106,21 @@ def test_write_write_read(self):
self.assertEqual(f[b'1'], b'hello2')
f.close()
def test_str_read(self):
self.init_db()
f = dumbdbm.open(_fname, 'r')
self.assertEqual(f['\u00fc'], self._dict['\u00fc'.encode('utf-8')])
def test_str_write_contains(self):
self.init_db()
f = dumbdbm.open(_fname)
f['\u00fc'] = b'!'
f.close()
f = dumbdbm.open(_fname, 'r')
self.assert_('\u00fc' in f)
self.assertEqual(f['\u00fc'.encode('utf-8')],
self._dict['\u00fc'.encode('utf-8')])
def test_line_endings(self):
# test for bug #1172763: dumbdbm would die if the line endings
# weren't what was expected.
@ -129,16 +145,16 @@ def test_line_endings(self):
def read_helper(self, f):
keys = self.keys_helper(f)
for key in self._dict:
self.assertEqual(self._dict[key], f[key.encode("ascii")])
self.assertEqual(self._dict[key], f[key])
def init_db(self):
f = dumbdbm.open(_fname, 'w')
for k in self._dict:
f[k.encode("ascii")] = self._dict[k]
f[k] = self._dict[k]
f.close()
def keys_helper(self, f):
keys = sorted(k.decode("ascii") for k in f.keys())
keys = sorted(f.keys())
dkeys = sorted(self._dict.keys())
self.assertEqual(keys, dkeys)
return keys
@ -155,12 +171,12 @@ def test_random(self):
if random.random() < 0.2:
if k in d:
del d[k]
del f[k.encode("ascii")]
del f[k]
else:
v = random.choice((b'a', b'b', b'c')) * random.randrange(10000)
d[k] = v
f[k.encode("ascii")] = v
self.assertEqual(f[k.encode("ascii")], v)
f[k] = v
self.assertEqual(f[k], v)
f.close()
f = dumbdbm.open(_fname)

View file

@ -19,7 +19,7 @@ Core and Builtins
- Issue #3327: Don't overallocate in the modules_by_index list.
- Issue #1721812: Binary set operations and copy() returned the input type
instead of the appropriate base type. This was incorrect because set
instead of the appropriate base type. This was incorrect because set
subclasses would be created without their __init__() method being called.
The corrected behavior brings sets into line with lists and dicts.
@ -33,6 +33,9 @@ Core and Builtins
Library
-------
- Issue #3799: Fix dbm.dumb to accept strings as well as bytes for keys. String
keys are now written out in UTF-8.
- Issue #4338: Fix distutils upload command.
- Issue #4354: Fix distutils register command.