Mirror of https://github.com/python/cpython (synced 2024-10-14 14:53:36 +00:00)
GH-115060: Speed up pathlib.Path.glob() by removing redundant regex matching (#115061)
When expanding and filtering paths for a `**` wildcard segment, build an `re.Pattern` object from the subsequent pattern parts, rather than the entire pattern, and match against the `os.DirEntry` object prior to instantiating a path object. Also skip compiling a pattern when expanding a `*` wildcard segment.
This commit is contained in:
parent 9d1a353230
commit 6f93b4df92
|
@ -587,9 +587,13 @@ def iterdir(self):
|
||||||
def _scandir(self):
|
def _scandir(self):
|
||||||
return os.scandir(self)
|
return os.scandir(self)
|
||||||
|
|
||||||
def _make_child_entry(self, entry):
|
def _direntry_str(self, entry):
|
||||||
|
# Transform an entry yielded from _scandir() into a path string.
|
||||||
|
return entry.name if str(self) == '.' else entry.path
|
||||||
|
|
||||||
|
def _make_child_direntry(self, entry):
|
||||||
# Transform an entry yielded from _scandir() into a path object.
|
# Transform an entry yielded from _scandir() into a path object.
|
||||||
path_str = entry.name if str(self) == '.' else entry.path
|
path_str = self._direntry_str(entry)
|
||||||
path = self.with_segments(path_str)
|
path = self.with_segments(path_str)
|
||||||
path._str = path_str
|
path._str = path_str
|
||||||
path._drv = self.drive
|
path._drv = self.drive
|
||||||
|
|
|
@ -86,19 +86,29 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
|
||||||
continue
|
continue
|
||||||
except OSError:
|
except OSError:
|
||||||
continue
|
continue
|
||||||
if match(entry.name):
|
# Avoid cost of making a path object for non-matching paths by
|
||||||
yield parent_path._make_child_entry(entry)
|
# matching against the os.DirEntry.name string.
|
||||||
|
if match is None or match(entry.name):
|
||||||
|
yield parent_path._make_child_direntry(entry)
|
||||||
|
|
||||||
|
|
||||||
def _select_recursive(parent_paths, dir_only, follow_symlinks):
|
def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
|
||||||
"""Yield given paths and all their subdirectories, recursively."""
|
"""Yield given paths and all their children, recursively, filtering by
|
||||||
|
string and type.
|
||||||
|
"""
|
||||||
if follow_symlinks is None:
|
if follow_symlinks is None:
|
||||||
follow_symlinks = False
|
follow_symlinks = False
|
||||||
for parent_path in parent_paths:
|
for parent_path in parent_paths:
|
||||||
|
if match is not None:
|
||||||
|
# If we're filtering paths through a regex, record the length of
|
||||||
|
# the parent path. We'll pass it to match(path, pos=...) later.
|
||||||
|
parent_len = len(str(parent_path._make_child_relpath('_'))) - 1
|
||||||
paths = [parent_path._make_child_relpath('')]
|
paths = [parent_path._make_child_relpath('')]
|
||||||
while paths:
|
while paths:
|
||||||
path = paths.pop()
|
path = paths.pop()
|
||||||
yield path
|
if match is None or match(str(path), parent_len):
|
||||||
|
# Yield *directory* path that matches pattern (if any).
|
||||||
|
yield path
|
||||||
try:
|
try:
|
||||||
# We must close the scandir() object before proceeding to
|
# We must close the scandir() object before proceeding to
|
||||||
# avoid exhausting file descriptors when globbing deep trees.
|
# avoid exhausting file descriptors when globbing deep trees.
|
||||||
|
@ -108,14 +118,22 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
|
# Handle directory entry.
|
||||||
try:
|
try:
|
||||||
if entry.is_dir(follow_symlinks=follow_symlinks):
|
if entry.is_dir(follow_symlinks=follow_symlinks):
|
||||||
paths.append(path._make_child_entry(entry))
|
# Recurse into this directory.
|
||||||
|
paths.append(path._make_child_direntry(entry))
|
||||||
continue
|
continue
|
||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Handle file entry.
|
||||||
if not dir_only:
|
if not dir_only:
|
||||||
yield path._make_child_entry(entry)
|
# Avoid cost of making a path object for non-matching
|
||||||
|
# files by matching against the os.DirEntry object.
|
||||||
|
if match is None or match(path._direntry_str(entry), parent_len):
|
||||||
|
# Yield *file* path that matches pattern (if any).
|
||||||
|
yield path._make_child_direntry(entry)
|
||||||
|
|
||||||
|
|
||||||
def _select_unique(paths):
|
def _select_unique(paths):
|
||||||
|
@ -750,8 +768,14 @@ def _scandir(self):
|
||||||
from contextlib import nullcontext
|
from contextlib import nullcontext
|
||||||
return nullcontext(self.iterdir())
|
return nullcontext(self.iterdir())
|
||||||
|
|
||||||
def _make_child_entry(self, entry):
|
def _direntry_str(self, entry):
|
||||||
|
# Transform an entry yielded from _scandir() into a path string.
|
||||||
|
# PathBase._scandir() yields PathBase objects, so use str().
|
||||||
|
return str(entry)
|
||||||
|
|
||||||
|
def _make_child_direntry(self, entry):
|
||||||
# Transform an entry yielded from _scandir() into a path object.
|
# Transform an entry yielded from _scandir() into a path object.
|
||||||
|
# PathBase._scandir() yields PathBase objects, so this is a no-op.
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
def _make_child_relpath(self, name):
|
def _make_child_relpath(self, name):
|
||||||
|
@ -769,43 +793,49 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
|
||||||
|
|
||||||
stack = pattern._pattern_stack
|
stack = pattern._pattern_stack
|
||||||
specials = ('', '.', '..')
|
specials = ('', '.', '..')
|
||||||
filter_paths = False
|
|
||||||
deduplicate_paths = False
|
deduplicate_paths = False
|
||||||
sep = self.pathmod.sep
|
sep = self.pathmod.sep
|
||||||
paths = iter([self] if self.is_dir() else [])
|
paths = iter([self] if self.is_dir() else [])
|
||||||
while stack:
|
while stack:
|
||||||
part = stack.pop()
|
part = stack.pop()
|
||||||
if part in specials:
|
if part in specials:
|
||||||
|
# Join special component (e.g. '..') onto paths.
|
||||||
paths = _select_special(paths, part)
|
paths = _select_special(paths, part)
|
||||||
|
|
||||||
elif part == '**':
|
elif part == '**':
|
||||||
# Consume adjacent '**' components.
|
# Consume following '**' components, which have no effect.
|
||||||
while stack and stack[-1] == '**':
|
while stack and stack[-1] == '**':
|
||||||
stack.pop()
|
stack.pop()
|
||||||
|
|
||||||
# Consume adjacent non-special components and enable post-walk
|
# Consume following non-special components, provided we're
|
||||||
# regex filtering, provided we're treating symlinks consistently.
|
# treating symlinks consistently. Each component is joined
|
||||||
|
# onto 'part', which is used to generate an re.Pattern object.
|
||||||
if follow_symlinks is not None:
|
if follow_symlinks is not None:
|
||||||
while stack and stack[-1] not in specials:
|
while stack and stack[-1] not in specials:
|
||||||
filter_paths = True
|
part += sep + stack.pop()
|
||||||
stack.pop()
|
|
||||||
|
|
||||||
dir_only = bool(stack)
|
# If the previous loop consumed pattern components, compile an
|
||||||
paths = _select_recursive(paths, dir_only, follow_symlinks)
|
# re.Pattern object based on those components.
|
||||||
|
match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None
|
||||||
|
|
||||||
|
# Recursively walk directories, filtering by type and regex.
|
||||||
|
paths = _select_recursive(paths, bool(stack), follow_symlinks, match)
|
||||||
|
|
||||||
|
# De-duplicate if we've already seen a '**' component.
|
||||||
if deduplicate_paths:
|
if deduplicate_paths:
|
||||||
# De-duplicate if we've already seen a '**' component.
|
|
||||||
paths = _select_unique(paths)
|
paths = _select_unique(paths)
|
||||||
deduplicate_paths = True
|
deduplicate_paths = True
|
||||||
|
|
||||||
elif '**' in part:
|
elif '**' in part:
|
||||||
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
dir_only = bool(stack)
|
# If the pattern component isn't '*', compile an re.Pattern
|
||||||
match = _compile_pattern(part, sep, case_sensitive)
|
# object based on the component.
|
||||||
paths = _select_children(paths, dir_only, follow_symlinks, match)
|
match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None
|
||||||
if filter_paths:
|
|
||||||
# Filter out paths that don't match pattern.
|
# Iterate over directories' children filtering by type and regex.
|
||||||
prefix_len = len(str(self._make_child_relpath('_'))) - 1
|
paths = _select_children(paths, bool(stack), follow_symlinks, match)
|
||||||
match = _compile_pattern(pattern._pattern_str, sep, case_sensitive)
|
|
||||||
paths = (path for path in paths if match(path._pattern_str, prefix_len))
|
|
||||||
return paths
|
return paths
|
||||||
|
|
||||||
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
|
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
|
||||||
|
@ -854,7 +884,7 @@ def walk(self, top_down=True, on_error=None, follow_symlinks=False):
|
||||||
|
|
||||||
if is_dir:
|
if is_dir:
|
||||||
if not top_down:
|
if not top_down:
|
||||||
paths.append(path._make_child_entry(entry))
|
paths.append(path._make_child_direntry(entry))
|
||||||
dirnames.append(entry.name)
|
dirnames.append(entry.name)
|
||||||
else:
|
else:
|
||||||
filenames.append(entry.name)
|
filenames.append(entry.name)
|
||||||
|
|
|
@ -1250,6 +1250,19 @@ def test_glob_pathlike(self):
|
||||||
self.assertEqual(expect, set(p.glob(P(pattern))))
|
self.assertEqual(expect, set(p.glob(P(pattern))))
|
||||||
self.assertEqual(expect, set(p.glob(FakePath(pattern))))
|
self.assertEqual(expect, set(p.glob(FakePath(pattern))))
|
||||||
|
|
||||||
|
@needs_symlinks
|
||||||
|
def test_glob_dot(self):
|
||||||
|
P = self.cls
|
||||||
|
with os_helper.change_cwd(P(self.base, "dirC")):
|
||||||
|
self.assertEqual(
|
||||||
|
set(P('.').glob('*')), {P("fileC"), P("novel.txt"), P("dirD")})
|
||||||
|
self.assertEqual(
|
||||||
|
set(P('.').glob('**')), {P("fileC"), P("novel.txt"), P("dirD"), P("dirD/fileD"), P(".")})
|
||||||
|
self.assertEqual(
|
||||||
|
set(P('.').glob('**/*')), {P("fileC"), P("novel.txt"), P("dirD"), P("dirD/fileD")})
|
||||||
|
self.assertEqual(
|
||||||
|
set(P('.').glob('**/*/*')), {P("dirD/fileD")})
|
||||||
|
|
||||||
def test_rglob_pathlike(self):
|
def test_rglob_pathlike(self):
|
||||||
P = self.cls
|
P = self.cls
|
||||||
p = P(self.base, "dirC")
|
p = P(self.base, "dirC")
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
Speed up :meth:`pathlib.Path.glob` by removing redundant regex matching.
|
Loading…
Reference in a new issue