Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changes/next-release/bugfix-s3-1138.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"type": "bugfix",
"category": "``s3``",
"description": "Skip traversing directories that are fully excluded by ``--exclude``/``--include`` filters during ``s3 sync``/``cp``/``mv``/``rm``. Fixes a long-standing performance issue (#1138) and a non-zero rc bug for excluded unreadable files (#1117)."
}
59 changes: 59 additions & 0 deletions awscli/customizations/s3/filegenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from dateutil.tz import tzlocal

from awscli.compat import queue
from awscli.customizations.s3.fileinfo import FileInfo
from awscli.customizations.s3.utils import (
EPOCH_TIME,
BucketLister,
Expand Down Expand Up @@ -143,6 +144,8 @@ def __init__(
page_size=None,
result_queue=None,
request_parameters=None,
file_filter=None,
is_dst_walker=False,
):
self._client = client
self.operation_name = operation_name
Expand All @@ -154,6 +157,12 @@ def __init__(
self.request_parameters = {}
if request_parameters is not None:
self.request_parameters = request_parameters
self.file_filter = file_filter
# When True, this generator is the reverse/destination walker for
# ``sync``. Filter pruning consults ``dst_patterns`` instead of
# ``patterns`` because the paths it sees are rooted at the
# destination, not the source.
self.is_dst_walker = is_dst_walker

def call(self, files):
"""
Expand Down Expand Up @@ -230,6 +239,18 @@ def list_files(self, path, dir_op):
for name in names:
file_path = join(path, name)
if isdir(file_path):
# If the user's filter chain makes it impossible
# for any descendant to be included, prune the
# whole subtree instead of recursing into it.
if (
self.file_filter is not None
and self.file_filter.can_skip_directory(
file_path,
'local',
use_dst_patterns=self.is_dst_walker,
)
):
continue
# Anything in a directory will have a prefix of
# this current directory and will come before the
# remaining contents in this directory. This
Expand Down Expand Up @@ -305,6 +326,44 @@ def should_ignore_file(self, path):
path = path[:-1]
if os.path.islink(path):
return True
# Only run the prefilter / existence-before-filter logic when the
# user actually supplied --include/--exclude patterns. With an
# empty filter, ``Filter.call`` cannot suppress anything anyway,
# so the extra os.path.exists() call here would be pure overhead
# on large unfiltered sync/cp/mv walks.
if self.file_filter is not None:
relevant_patterns = (
self.file_filter.dst_patterns
if self.is_dst_walker
else self.file_filter.patterns
)
else:
relevant_patterns = None
if relevant_patterns:
# Validate existence *before* the filter has a chance to
# suppress the warning. A broken symlink or a path that
# disappeared between listdir and stat should still produce
# the standard "File does not exist." warning even when
# --exclude/--include would otherwise mask it.
if not os.path.exists(path):
return self.triggers_warning(path)
# Pre-filter using the user's --include/--exclude rules so the
# walker does not stat / listdir entries that the filter chain
# is going to drop anyway. The directory check must run before
# triggers_warning() because is_readable() calls os.listdir()
# on directories — without this guard, excluded subtrees would
# be listed just to test readability and the prune would be
# defeated. The file branch silences readability warnings for
# files that the filter would drop anyway.
if os.path.isdir(path):
if self.file_filter.can_skip_directory(
path, 'local', use_dst_patterns=self.is_dst_walker
):
return True
else:
probe = FileInfo(src=path, src_type='local')
if not list(self.file_filter.call([probe])):
return True
warning_triggered = self.triggers_warning(path)
if warning_triggered:
return True
Expand Down
81 changes: 81 additions & 0 deletions awscli/customizations/s3/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,42 @@ def _get_local_root(source_location, dir_op):
return rootdir


def _literal_prefix(pattern):
for i, ch in enumerate(pattern):
if ch in '*?[':
return pattern[:i]
return pattern


def _pattern_can_match_under(pattern, target_with_sep):
"""Return True if some descendant path under ``target_with_sep``
could match ``pattern``. False means no descendant can possibly
match, so the caller may safely skip the directory. False is
never returned when a match is actually possible.
"""
lit = _literal_prefix(pattern)
common = min(len(lit), len(target_with_sep))
if lit[:common] != target_with_sep[:common]:
return False
if len(target_with_sep) <= len(lit):
return True
if lit == pattern:
return False
return True


def _pattern_matches_all_under(pattern, target_with_sep):
"""Return True only when every descendant path under
``target_with_sep`` is guaranteed to match ``pattern``. False
means we cannot prove this — the caller must not assume a match.
"""
lit = _literal_prefix(pattern)
if not target_with_sep.startswith(lit):
return False
rest = pattern[len(lit):]
return bool(rest) and all(c == '*' for c in rest)


class Filter:
"""
This is a universal exclude/include filter.
Expand Down Expand Up @@ -161,3 +197,48 @@ def _match_pattern(self, pattern, file_info):
path_pattern,
)
return file_status

def can_skip_directory(
self, dir_path, src_type='local', use_dst_patterns=False
):
"""Return True only when the filter chain cannot include any
descendant of ``dir_path``, so the walker may skip listing it.
False means the directory must be traversed normally — either
a descendant could match, or we cannot prove otherwise.

``use_dst_patterns``: when True, evaluate against ``dst_patterns``
(rooted at the destination) instead of ``patterns`` (rooted at the
source). The reverse-direction file generator used during
``s3 sync s3://bucket/ ./local`` walks the local destination, so
it must consult the destination-rooted patterns to prune correctly.
"""
patterns = self.dst_patterns if use_dst_patterns else self.patterns
if not patterns:
return False
sep = os.sep if src_type == 'local' else '/'
target = dir_path.rstrip(sep) + sep
if src_type == 'local':
# ``fnmatch.fnmatch`` (used by ``_match_pattern``) normalizes
# case via ``os.path.normcase`` on Windows, so this prefix
# comparison must do the same — otherwise a case-different
# subtree on Windows would be incorrectly pruned. On POSIX
# ``normcase`` is identity.
target = os.path.normcase(target)
normalized = []
for pattern_type, pat in patterns:
if src_type == 'local':
pat = os.path.normcase(pat.replace('/', os.sep))
else:
pat = pat.replace(os.sep, '/')
normalized.append((pattern_type, pat))
for pattern_type, pat in normalized:
if pattern_type == 'include' and _pattern_can_match_under(
pat, target
):
return False
for pattern_type, pat in normalized:
if pattern_type == 'exclude' and _pattern_matches_all_under(
pat, target
):
return True
return False
4 changes: 4 additions & 0 deletions awscli/customizations/s3/subcommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -1470,19 +1470,23 @@ def run(self):
result_queue = queue.Queue()
operation_name = cmd_translation[paths_type]

file_filter = create_filter(self.parameters)
fgen_kwargs = {
'client': self._source_client,
'operation_name': operation_name,
'follow_symlinks': self.parameters['follow_symlinks'],
'page_size': self.parameters['page_size'],
'result_queue': result_queue,
'file_filter': file_filter,
}
rgen_kwargs = {
'client': self._client,
'operation_name': '',
'follow_symlinks': self.parameters['follow_symlinks'],
'page_size': self.parameters['page_size'],
'result_queue': result_queue,
'file_filter': file_filter,
'is_dst_walker': True,
}

fgen_request_parameters = (
Expand Down
Loading