gh-102120: [TarFile] Add an iter function that doesn't cache (GH-102128)

This commit is contained in:
Robert O'Shea 2023-05-23 21:44:40 +01:00 committed by GitHub
parent 097b7830cd
commit 50fce89d12
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 7 deletions

View file

@ -318,7 +318,7 @@ be finalized; only the internally used file object will be closed. See the
.. versionadded:: 3.2
Added support for the context management protocol.
.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=1)
.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=1, stream=False)
All following arguments are optional and can be accessed as instance attributes
as well.
@ -369,6 +369,9 @@ be finalized; only the internally used file object will be closed. See the
The *pax_headers* argument is an optional dictionary of strings which
will be added as a pax global header if *format* is :const:`PAX_FORMAT`.
If *stream* is set to :const:`True`, information about the files in the archive
is not cached while the archive is being read, saving memory.
.. versionchanged:: 3.2
Use ``'surrogateescape'`` as the default for the *errors* argument.
@ -378,6 +381,8 @@ be finalized; only the internally used file object will be closed. See the
.. versionchanged:: 3.6
The *name* parameter accepts a :term:`path-like object`.
.. versionchanged:: 3.13
Add the *stream* parameter.
.. classmethod:: TarFile.open(...)

View file

@ -1633,7 +1633,7 @@ class TarFile(object):
def __init__(self, name=None, mode="r", fileobj=None, format=None,
tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
errors="surrogateescape", pax_headers=None, debug=None,
errorlevel=None, copybufsize=None):
errorlevel=None, copybufsize=None, stream=False):
"""Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
read from an existing archive, 'a' to append data to an existing
file or 'w' to create a new file overwriting an existing one. `mode'
@ -1665,6 +1665,8 @@ def __init__(self, name=None, mode="r", fileobj=None, format=None,
self.name = os.path.abspath(name) if name else None
self.fileobj = fileobj
self.stream = stream
# Init attributes.
if format is not None:
self.format = format
@ -2631,7 +2633,9 @@ def next(self):
break
if tarinfo is not None:
self.members.append(tarinfo)
# if streaming the file we do not want to cache the tarinfo
if not self.stream:
self.members.append(tarinfo)
else:
self._loaded = True
@ -2682,11 +2686,12 @@ def _getmember(self, name, tarinfo=None, normalize=False):
def _load(self):
"""Read through the entire archive file and look for readable
members.
members. This should not run if the file is set to stream.
"""
while self.next() is not None:
pass
self._loaded = True
if not self.stream:
while self.next() is not None:
pass
self._loaded = True
def _check(self, mode=None):
"""Check if TarFile is still open, and if the operation's mode

View file

@ -100,6 +100,14 @@ def setUp(self):
def tearDown(self):
self.tar.close()
class StreamModeTest(ReadTest):
# Only needs to change how the tarfile is opened to set
# stream mode
def setUp(self):
self.tar = tarfile.open(self.tarname, mode=self.mode,
encoding="iso8859-1",
stream=True)
class UstarReadTest(ReadTest, unittest.TestCase):
@ -852,6 +860,21 @@ class Bz2StreamReadTest(Bz2Test, StreamReadTest):
class LzmaStreamReadTest(LzmaTest, StreamReadTest):
pass
class TarStreamModeReadTest(StreamModeTest, unittest.TestCase):
def test_stream_mode_no_cache(self):
for _ in self.tar:
pass
self.assertEqual(self.tar.members, [])
class GzipStreamModeReadTest(GzipTest, TarStreamModeReadTest):
pass
class Bz2StreamModeReadTest(Bz2Test, TarStreamModeReadTest):
pass
class LzmaStreamModeReadTest(LzmaTest, TarStreamModeReadTest):
pass
class DetectReadTest(TarTest, unittest.TestCase):
def _testfunc_file(self, name, mode):

View file

@ -0,0 +1,2 @@
Added a stream mode to ``tarfile`` that allows reading archives
without caching information about their members.