bpo-22908: Add seek and tell functionality to ZipExtFile (GH-4966)

This allows for nested zip files, tar files within zip files, zip files within tar files, etc.

Contributed by: John Jolly
This commit is contained in:
John Jolly 2018-01-30 01:51:35 -07:00 committed by Gregory P. Smith
parent 2e0ecde8d7
commit 066df4fd45
4 changed files with 121 additions and 3 deletions

View file

@ -246,9 +246,9 @@ ZipFile Objects
With *mode* ``'r'`` the file-like object
(``ZipExtFile``) is read-only and provides the following methods:
:meth:`~io.BufferedIOBase.read`, :meth:`~io.IOBase.readline`,
:meth:`~io.IOBase.readlines`, :meth:`__iter__`,
:meth:`~iterator.__next__`. These objects can operate independently of
the ZipFile.
:meth:`~io.IOBase.readlines`, :meth:`~io.IOBase.seek`,
:meth:`~io.IOBase.tell`, :meth:`__iter__`, :meth:`~iterator.__next__`.
These objects can operate independently of the ZipFile.
With ``mode='w'``, a writable file handle is returned, which supports the
:meth:`~io.BufferedIOBase.write` method. While a writable file handle is open,

View file

@ -1628,6 +1628,40 @@ def test_open_conflicting_handles(self):
self.assertEqual(zipf.read('baz'), msg3)
self.assertEqual(zipf.namelist(), ['foo', 'bar', 'baz'])
def test_seek_tell(self):
# Test seek functionality
txt = b"Where's Bruce?"
bloc = txt.find(b"Bruce")
# Check seek on a file
with zipfile.ZipFile(TESTFN, "w") as zipf:
zipf.writestr("foo.txt", txt)
with zipfile.ZipFile(TESTFN, "r") as zipf:
with zipf.open("foo.txt", "r") as fp:
fp.seek(bloc, os.SEEK_SET)
self.assertEqual(fp.tell(), bloc)
fp.seek(-bloc, os.SEEK_CUR)
self.assertEqual(fp.tell(), 0)
fp.seek(bloc, os.SEEK_CUR)
self.assertEqual(fp.tell(), bloc)
self.assertEqual(fp.read(5), txt[bloc:bloc+5])
fp.seek(0, os.SEEK_END)
self.assertEqual(fp.tell(), len(txt))
# Check seek on memory file
data = io.BytesIO()
with zipfile.ZipFile(data, mode="w") as zipf:
zipf.writestr("foo.txt", txt)
with zipfile.ZipFile(data, mode="r") as zipf:
with zipf.open("foo.txt", "r") as fp:
fp.seek(bloc, os.SEEK_SET)
self.assertEqual(fp.tell(), bloc)
fp.seek(-bloc, os.SEEK_CUR)
self.assertEqual(fp.tell(), 0)
fp.seek(bloc, os.SEEK_CUR)
self.assertEqual(fp.tell(), bloc)
self.assertEqual(fp.read(5), txt[bloc:bloc+5])
fp.seek(0, os.SEEK_END)
self.assertEqual(fp.tell(), len(txt))
def tearDown(self):
unlink(TESTFN)
unlink(TESTFN2)

View file

@ -696,6 +696,18 @@ def __init__(self, file, pos, close, lock, writing):
self._close = close
self._lock = lock
self._writing = writing
self.seekable = file.seekable
self.tell = file.tell
def seek(self, offset, whence=0):
with self._lock:
if self.writing():
raise ValueError("Can't reposition in the ZIP file while "
"there is an open writing handle on it. "
"Close the writing handle before trying to read.")
self._file.seek(self._pos)
self._pos = self._file.tell()
return self._pos
def read(self, n=-1):
with self._lock:
@ -746,6 +758,9 @@ class ZipExtFile(io.BufferedIOBase):
# Read from compressed files in 4k blocks.
MIN_READ_SIZE = 4096
# Chunk size to read during seek
MAX_SEEK_READ = 1 << 24
def __init__(self, fileobj, mode, zipinfo, decrypter=None,
close_fileobj=False):
self._fileobj = fileobj
@ -778,6 +793,17 @@ def __init__(self, fileobj, mode, zipinfo, decrypter=None,
else:
self._expected_crc = None
self._seekable = False
try:
if fileobj.seekable():
self._orig_compress_start = fileobj.tell()
self._orig_compress_size = zipinfo.compress_size
self._orig_file_size = zipinfo.file_size
self._orig_start_crc = self._running_crc
self._seekable = True
except AttributeError:
pass
def __repr__(self):
result = ['<%s.%s' % (self.__class__.__module__,
self.__class__.__qualname__)]
@ -963,6 +989,62 @@ def close(self):
finally:
super().close()
def seekable(self):
return self._seekable
def seek(self, offset, whence=0):
if not self._seekable:
raise io.UnsupportedOperation("underlying stream is not seekable")
curr_pos = self.tell()
if whence == 0: # Seek from start of file
new_pos = offset
elif whence == 1: # Seek from current position
new_pos = curr_pos + offset
elif whence == 2: # Seek from EOF
new_pos = self._orig_file_size + offset
else:
raise ValueError("whence must be os.SEEK_SET (0), "
"os.SEEK_CUR (1), or os.SEEK_END (2)")
if new_pos > self._orig_file_size:
new_pos = self._orig_file_size
if new_pos < 0:
new_pos = 0
read_offset = new_pos - curr_pos
buff_offset = read_offset + self._offset
if buff_offset >= 0 and buff_offset < len(self._readbuffer):
# Just move the _offset index if the new position is in the _readbuffer
self._offset = buff_offset
read_offset = 0
elif read_offset < 0:
# Position is before the current position. Reset the ZipExtFile
self._fileobj.seek(self._orig_compress_start)
self._running_crc = self._orig_start_crc
self._compress_left = self._orig_compress_size
self._left = self._orig_file_size
self._readbuffer = b''
self._offset = 0
self._decompressor = zipfile._get_decompressor(self._compress_type)
self._eof = False
read_offset = new_pos
while read_offset > 0:
read_len = min(self.MAX_SEEK_READ, read_offset)
self.read(read_len)
read_offset -= read_len
return self.tell()
def tell(self):
if not self._seekable:
raise io.UnsupportedOperation("underlying stream is not seekable")
filepos = self._orig_file_size - self._left - len(self._readbuffer) + self._offset
return filepos
class _ZipWriteFile(io.BufferedIOBase):
def __init__(self, zf, zinfo, zip64):

View file

@ -0,0 +1,2 @@
Added seek and tell to the ZipExtFile class. This only works if the file
object used to open the zipfile is seekable.