This commit fixes two issues with content filtering:
- `ZipContentFilter` logic for pattern lists with a single allowed pattern.
  Previously such a filter would include all files.
- `generate_content_hash` now also applies the patterns before generating the
  hash, so the generated hash matches the content of the zip.
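For context, a minimal usage sketch of how the patched pieces fit together, mirroring what the new tests below do. This is not part of the commit; the source directory path is hypothetical and `Mock()` stands in for the parsed args, as in the tests.

```python
# Sketch only: wire the pattern filter into hash generation, per the diff below.
from unittest.mock import Mock

from package import ZipContentFilter, generate_content_hash

content_filter = ZipContentFilter(args=Mock())
content_filter.compile(["!.*", ".*.py$"])  # same pattern syntax as the new tests

# Only files that survive the compiled patterns contribute to the hash,
# so the hash matches the content of the resulting zip.
# "some_source_dir" is a hypothetical directory of sources.
content_hash = generate_content_hash(["some_source_dir"], content_filter.filter)
print(content_hash.hexdigest())
```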
sohaibiftikhar committed May 28, 2024
1 parent 4f77bfc commit 414fbb0
Showing 2 changed files with 100 additions and 10 deletions.
33 changes: 24 additions & 9 deletions package.py
@@ -235,9 +235,20 @@ def emit_dir_content(base_dir):
yield os.path.normpath(os.path.join(root, name))


def generate_content_hash(source_paths, hash_func=hashlib.sha256, log=None):
def generate_content_hash(
source_paths,
content_filter,
hash_func=hashlib.sha256,
log=None,
):
"""
Generate a content hash of the source paths.
:param content_filter: Callable[[str], Iterable[str]]
A function that filters the content of the source paths. Given a path
to a file or directory, it should return an iterable of paths to files
that should be included in the hash. At present we pass in the
ZipContentFilter.filter method for this purpose.
"""

if log:
@@ -248,8 +259,7 @@ def generate_content_hash(source_paths, hash_func=hashlib.sha256, log=None):
for source_path in source_paths:
if os.path.isdir(source_path):
source_dir = source_path
_log = log if log.isEnabledFor(DEBUG3) else None
for source_file in list_files(source_dir, log=_log):
for source_file in content_filter(source_dir):
update_hash(hash_obj, source_dir, source_file)
if log:
log.debug(os.path.join(source_dir, source_file))
@@ -589,10 +599,8 @@ def apply(path):
op, regex = r
neg = op is operator.not_
m = regex.fullmatch(path)
if neg and m:
d = False
elif m:
d = True
m = bool(m)
d = not m if neg else m
if d:
return path

@@ -648,6 +656,7 @@ class BuildPlanManager:
def __init__(self, args, log=None):
self._args = args
self._source_paths = None
self._patterns = []
self._log = log or logging.root

def hash(self, extra_paths):
@@ -660,7 +669,11 @@ def hash(self, extra_paths):
# runtime value, build command, and content of the build paths
# because they can have an effect on the resulting archive.
self._log.debug("Computing content hash on files...")
content_hash = generate_content_hash(content_hash_paths, log=self._log)
content_filter = ZipContentFilter(args=self._args)
content_filter.compile(self._patterns)
content_hash = generate_content_hash(
content_hash_paths, content_filter.filter, log=self._log
)
return content_hash

def plan(self, source_path, query):
@@ -800,7 +813,9 @@ def commands_step(path, commands):
patterns = claim.get("patterns")
commands = claim.get("commands")
if patterns:
step("set:filter", patterns_list(self._args, patterns))
patterns_as_list = patterns_list(self._args, patterns)
self._patterns.extend(patterns_as_list)
step("set:filter", patterns_as_list)
if commands:
commands_step(path, commands)
else:
77 changes: 76 additions & 1 deletion tests/test_zip_source.py
@@ -1,7 +1,10 @@
import os
from pathlib import Path
from unittest.mock import MagicMock, Mock

from package import BuildPlanManager
import pytest

from package import BuildPlanManager, ZipContentFilter, datatree


def test_zip_source_path_sh_work_dir():
@@ -44,3 +47,75 @@ def test_zip_source_path():

zip_source_path = zs.write_dirs.call_args_list[0][0][0]
assert zip_source_path == f"{os.getcwd()}"


@pytest.fixture
def source_path(tmp_path: str) -> Path:
"""Creates a tmp stage dir for running tests."""
tmp_path = Path(tmp_path)
source = tmp_path / "some_dir"
source.mkdir()
for files in ["file.py", "file2.py", "README.md", "requirements.txt"]:
(source / files).touch()
yield source


def test_zip_content_filter(source_path: Path):
"""Test the zip content filter does not take all positive."""
args = Mock()
query_data = {
"runtime": "python",
"source_path": {
"path": str(source_path),
"patterns": [".*.py$"],
},
}
query = datatree("prepare_query", **query_data)

file_filter = ZipContentFilter(args=args)
file_filter.compile(query.source_path.patterns)
filtered = list(file_filter.filter(query.source_path.path))
expected = [str(source_path / fname) for fname in ["file.py", "file2.py"]]
assert filtered == sorted(expected)

# Test that filtering with empty patterns returns all files.
file_filter = ZipContentFilter(args=args)
file_filter.compile([])
filtered = list(file_filter.filter(query.source_path.path))
expected = [
str(source_path / fname)
for fname in ["file.py", "file2.py", "README.md", "requirements.txt"]
]
assert filtered == sorted(expected)


def test_generate_hash(source_path: Path):
"""Tests prepare hash generation and also packaging."""
args = Mock()

query_data = {
"runtime": "python",
"source_path": {
"path": str(source_path),
"patterns": ["!.*", ".*.py$"],
},
}
query = datatree("prepare_query", **query_data)

bpm = BuildPlanManager(args)
bpm.plan(query.source_path, query)
hash1 = bpm.hash([]).hexdigest()

# Add a new file that does not match the pattern.
(source_path / "file3.pyc").touch()
bpm.plan(query.source_path, query)
hash2 = bpm.hash([]).hexdigest()
# Both hashes should still be the same.
assert hash1 == hash2

# Add a new file that does match the pattern.
(source_path / "file4.py").touch()
bpm.plan(query.source_path, query)
hash3 = bpm.hash([]).hexdigest()
# Hash should be different.
assert hash1 != hash3
