diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index e8a2530fb8c..7126d8bd703 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -489,10 +489,20 @@ Path objects are traversable using the ``/`` operator. The final path component. -.. method:: Path.open(*, **) +.. method:: Path.open(mode='r', *, pwd, **) - Invoke :meth:`ZipFile.open` on the current path. Accepts - the same arguments as :meth:`ZipFile.open`. + Invoke :meth:`ZipFile.open` on the current path. + Allows opening for read or write, text or binary + through supported modes: 'r', 'w', 'rb', 'wb'. + Positional and keyword arguments are passed through to + :class:`io.TextIOWrapper` when opened as text and + rejected with :exc:`ValueError` otherwise. + ``pwd`` is the ``pwd`` parameter to + :meth:`ZipFile.open`. + + .. versionchanged:: 3.9 + Added support for text and binary modes for open. Default + mode is now text. .. method:: Path.iterdir() diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index 09fc8506006..643c5b477ba 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -5,6 +5,7 @@ import os import pathlib import posixpath +import string import struct import subprocess import sys @@ -2880,7 +2881,7 @@ def test_open(self): a, b, g = root.iterdir() with a.open() as strm: data = strm.read() - assert data == b"content of a" + assert data == "content of a" def test_read(self): for alpharep in self.zipfile_alpharep(): @@ -2974,6 +2975,11 @@ def test_joinpath_constant_time(self): # Check the file iterated all items assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES + # @func_timeout.func_set_timeout(3) + def test_implied_dirs_performance(self): + data = ['/'.join(string.ascii_lowercase + str(n)) for n in range(10000)] + zipfile.CompleteDirs._implied_dirs(data) + if __name__ == "__main__": unittest.main() diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 4510fac250b..55993c89b5b 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -17,7 +17,6 @@ import threading 
import time import contextlib -from collections import OrderedDict try: import zlib # We may need its compression method @@ -2102,24 +2101,6 @@ def _compile(file, optimize=-1): return (fname, archivename) -def _unique_everseen(iterable, key=None): - "List unique elements, preserving order. Remember all elements ever seen." - # unique_everseen('AAAABBBCCDAABBB') --> A B C D - # unique_everseen('ABBCcAD', str.lower) --> A B C D - seen = set() - seen_add = seen.add - if key is None: - for element in itertools.filterfalse(seen.__contains__, iterable): - seen_add(element) - yield element - else: - for element in iterable: - k = key(element) - if k not in seen: - seen_add(k) - yield element - - def _parents(path): """ Given a path with elements separated by @@ -2161,6 +2142,18 @@ def _ancestry(path): path, tail = posixpath.split(path) +_dedupe = dict.fromkeys +"""Deduplicate an iterable in original order""" + + +def _difference(minuend, subtrahend): + """ + Return items in minuend not in subtrahend, retaining order + with O(1) lookup. 
+ """ + return itertools.filterfalse(set(subtrahend).__contains__, minuend) + + class CompleteDirs(ZipFile): """ A ZipFile subclass that ensures that implied directories @@ -2170,13 +2163,8 @@ class CompleteDirs(ZipFile): @staticmethod def _implied_dirs(names): parents = itertools.chain.from_iterable(map(_parents, names)) - # Deduplicate entries in original order - implied_dirs = OrderedDict.fromkeys( - p + posixpath.sep for p in parents - # Cast names to a set for O(1) lookups - if p + posixpath.sep not in set(names) - ) - return implied_dirs + as_dirs = (p + posixpath.sep for p in parents) + return _dedupe(_difference(as_dirs, names)) def namelist(self): names = super(CompleteDirs, self).namelist() @@ -2305,20 +2293,31 @@ def __init__(self, root, at=""): self.root = FastLookup.make(root) self.at = at - @property - def open(self): - return functools.partial(self.root.open, self.at) + def open(self, mode='r', *args, **kwargs): + """ + Open this entry as text or binary following the semantics + of ``pathlib.Path.open()`` by passing arguments through + to io.TextIOWrapper(). 
+ """ + pwd = kwargs.pop('pwd', None) + zip_mode = mode[0] + stream = self.root.open(self.at, zip_mode, pwd=pwd) + if 'b' in mode: + if args or kwargs: + raise ValueError("encoding args invalid for binary operation") + return stream + return io.TextIOWrapper(stream, *args, **kwargs) @property def name(self): return posixpath.basename(self.at.rstrip("/")) def read_text(self, *args, **kwargs): - with self.open() as strm: - return io.TextIOWrapper(strm, *args, **kwargs).read() + with self.open('r', *args, **kwargs) as strm: + return strm.read() def read_bytes(self): - with self.open() as strm: + with self.open('rb') as strm: return strm.read() def _is_child(self, path): diff --git a/Misc/NEWS.d/next/Library/2020-02-17-22-38-15.bpo-39667.QuzEHH.rst b/Misc/NEWS.d/next/Library/2020-02-17-22-38-15.bpo-39667.QuzEHH.rst new file mode 100644 index 00000000000..acf503cc998 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-02-17-22-38-15.bpo-39667.QuzEHH.rst @@ -0,0 +1 @@ +Improve pathlib.Path compatibility on zipfile.Path and correct performance degradation as found in zipp 3.0. \ No newline at end of file