Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

consume packed wheel cache in zipapp creation #2175

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 37 additions & 9 deletions pex/bin/pex.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,23 @@ def configure_clp_pex_options(parser):
),
)

group.add_argument(
"--cache-dists",
"--no-cache-dists",
dest="cache_dists",
default=None,
action=HandleBoolAction,
help=(
"Whether to zip up each dist contained in the output PEX file into a fingerprinted "
"cache directory to speed up later PEX file builds. For `--layout packed`, this "
"behavior is enabled by default. "
"For `--layout zipapp`, this synthesizes the zip file from those cached zips with an "
"experimental zip merging technique, so this flag is disabled by default when building "
"a zipapp. This will re-use the same caches as `--layout packed`, so creating a "
"zipapp or packed PEX file from the same inputs will only populate the cache once. "
"This flag and behavior do not apply to other layouts."
),
)
group.add_argument(
"--compress",
"--compressed",
Expand All @@ -175,7 +192,11 @@ def configure_clp_pex_options(parser):
action=HandleBoolAction,
help=(
"Whether to compress zip entries when creating either a zipapp PEX file or a packed "
"PEX's bootstrap and dependency zip files. Does nothing for loose layout PEXes."
"PEX's bootstrap and dependency zip files. "
"Uncompressed PEX files are much faster to create from an empty cache, but are no "
"faster after the cache has been populated, and uncompressed cache entries will "
Copy link
Member

@jsirois jsirois Aug 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reading a second time there is just no reason to cache uncompressed at all. I definitely vote for fail fast. It seems compress=False, cached_dists=True should not be allowed at all. Further, there is no reason for the current packed layout behavior to be toggleable. If you're building a packed layout, the zips should be compressed zips since the whole point is cache friendliness. If there is no call to add a degree of freedom, don't add it because, at least in Pex, you can never take it back.

"consume many times more space on disk. "
"Does nothing for loose layout PEXes."
),
)

Expand All @@ -200,7 +221,7 @@ def configure_clp_pex_options(parser):
action=HandleVenvAction,
help="Convert the pex file to a venv before executing it. If 'prepend' or 'append' is "
"specified, then all scripts and console scripts provided by distributions in the pex file "
"will be added to the PATH in the corresponding position. If the the pex file will be run "
"will be added to the PATH in the corresponding position. If the pex file will be run "
"multiple times under a stable runtime PEX_ROOT, the venv creation will only be done once "
"and subsequent runs will enjoy lower startup latency.",
)
Expand Down Expand Up @@ -282,10 +303,12 @@ def configure_clp_pex_options(parser):
dest="compile",
default=False,
action=HandleBoolAction,
help="Compiling means that the built pex will include .pyc files, which will result in "
"slightly faster startup performance. However, compiling means that the generated pex "
help="Compiling means that the built PEX will include .pyc files, which will result in "
"slightly faster startup performance. However, compiling means that the generated PEX "
"likely will not be reproducible, meaning that if you were to run `./pex -o` with the "
"same inputs then the new pex would not be byte-for-byte identical to the original.",
"same inputs then the new PEX would not be byte-for-byte identical to the original. "
"Note that all PEX files are now unzipped and compiled when first executed, so this "
"flag only affects the startup performance of the first execution.",
)

group.add_argument(
Expand All @@ -294,10 +317,14 @@ def configure_clp_pex_options(parser):
dest="use_system_time",
default=False,
action=HandleBoolAction,
help="Use the current system time to generate timestamps for the new pex. Otherwise, Pex "
"will use midnight on January 1, 1980. By using system time, the generated pex "
"will not be reproducible, meaning that if you were to run `./pex -o` with the "
"same inputs then the new pex would not be byte-for-byte identical to the original.",
help="Convert modification times from the filesystem into timestamps for any zip file "
"entries. Otherwise, Pex will use midnight on January 1, 1980. By using system time, the "
"generated PEX will not be reproducible, meaning that if you were to run `./pex -o` with "
"the same inputs then the new pex PEX not be byte-for-byte identical to the original. "
"Note that zip file entries synthesized from the pex cache (including any resolved "
"distributions) will always use the reproducible timestamp regardless of this flag. "
"Any unzipped output file will retain the timestamps of their sources regardless of this "
"flag, although this will not affect their checksum.",
)

group.add_argument(
Expand Down Expand Up @@ -949,6 +976,7 @@ def do_main(
deterministic_timestamp=not options.use_system_time,
layout=options.layout,
compress=options.compress,
cache_dists=options.cache_dists,
)
if options.seed != Seed.NONE:
seed_info = seed_cache(
Expand Down
74 changes: 62 additions & 12 deletions pex/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import atexit
import contextlib
import errno
import io
import itertools
import os
import re
Expand Down Expand Up @@ -38,6 +39,8 @@
Union,
)

_DateTime = Tuple[int, int, int, int, int, int]


# We use the start of MS-DOS time, which is what zipfiles use (see section 4.4.6 of
# https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT).
Expand Down Expand Up @@ -137,6 +140,23 @@ def do_copy():
do_copy()


def copy_file_range(source, destination, length, buffer_size=io.DEFAULT_BUFFER_SIZE):
# type: (io.BufferedIOBase, io.BufferedIOBase, int, int) -> None
"""Implementation of shutil.copyfileobj() that only copies exactly `length` bytes."""
# We require a BufferedIOBase in order to avoid handling short reads or writes.
remaining_length = length
if buffer_size > length:
buffer_size = length
cur_buf = bytearray(buffer_size)
while remaining_length > buffer_size:
assert source.readinto(cur_buf) == buffer_size
assert destination.write(cur_buf) == buffer_size
remaining_length -= buffer_size
remainder = source.read(remaining_length)
assert len(remainder) == remaining_length
assert destination.write(remainder) == remaining_length


# See http://stackoverflow.com/questions/2572172/referencing-other-modules-in-atexit
class MktempTeardownRegistry(object):
def __init__(self):
Expand Down Expand Up @@ -173,7 +193,14 @@ class ZipEntry(namedtuple("ZipEntry", ["info", "data"])):
pass

@classmethod
def zip_entry_from_file(cls, filename, arcname=None, date_time=None):
def zip_entry_from_file(
cls,
filename, # type: str
arcname=None, # type: Optional[str]
date_time=None, # type: Optional[Tuple[int, ...]]
compression=zipfile.ZIP_STORED, # type: int
):
# type: (...) -> PermPreservingZipFile.ZipEntry
"""Construct a ZipEntry for a file on the filesystem.

Usually a similar `zip_info_from_file` method is provided by `ZipInfo`, but it is not
Expand All @@ -192,16 +219,20 @@ def zip_entry_from_file(cls, filename, arcname=None, date_time=None):
arcname += "/"
if date_time is None:
date_time = time.localtime(st.st_mtime)
zinfo = zipfile.ZipInfo(filename=arcname, date_time=date_time[:6])
zinfo = zipfile.ZipInfo(filename=arcname, date_time=cast("_DateTime", date_time[:6]))
zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 # Unix attributes
if isdir:
zinfo.file_size = 0
zinfo.external_attr |= 0x10 # MS-DOS directory flag
# Always store directories decompressed, because they are empty but take up 2 bytes when
# compressed.
zinfo.compress_type = zipfile.ZIP_STORED
data = b""
else:
zinfo.file_size = st.st_size
zinfo.compress_type = zipfile.ZIP_DEFLATED
# File contents may be compressed or decompressed. Decompressed is significantly faster
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like a gratuitous comment. You're selling here or else speaking to an end user. Clearly this is not the place to speak to the user and the coder doesn't need to be sold to, if a function takes a param ... it takes a param. If its a public function maybe sell in the API doc.

# to write, but caching makes up for that.
zinfo.compress_type = compression
with open(filename, "rb") as fp:
data = fp.read()
return cls.ZipEntry(info=zinfo, data=data)
Expand Down Expand Up @@ -281,18 +312,32 @@ def safe_mkdir(directory, clean=False):
return directory


def _ensure_parent(filename):
# type: (str) -> None
parent_dir = os.path.dirname(filename)
if parent_dir:
safe_mkdir(parent_dir)


def safe_open(filename, *args, **kwargs):
"""Safely open a file.

``safe_open`` ensures that the directory components leading up the specified file have been
created first.
"""
parent_dir = os.path.dirname(filename)
if parent_dir:
safe_mkdir(parent_dir)
_ensure_parent(filename)
return open(filename, *args, **kwargs) # noqa: T802


def safe_io_open(filename, *args, **kwargs):
# type: (str, Any, Any) -> io.IOBase
"""``safe_open()``, but using ``io.open()`` instead.

With the right arguments, this ensures the result produces a buffered file handle on py2."""
_ensure_parent(filename)
return cast("io.IOBase", io.open(filename, *args, **kwargs))


def safe_delete(filename):
# type: (str) -> None
"""Delete a file safely.
Expand Down Expand Up @@ -608,9 +653,13 @@ def delete(self):
# type: () -> None
shutil.rmtree(self.chroot)

# This directory traversal, file I/O, and compression can be made faster with complex
# parallelism and pipelining in a compiled language, but the result is much harder to package,
# and is still less performant than effective caching. See investigation in
# https://github.com/pantsbuild/pex/issues/2158 and https://github.com/pantsbuild/pex/pull/2175.
def zip(
self,
filename, # type: str
output_file, # type: Union[str, io.IOBase, io.BufferedRandom]
mode="w", # type: str
deterministic_timestamp=False, # type: bool
exclude_file=lambda _: False, # type: Callable[[str], bool]
Expand All @@ -628,7 +677,7 @@ def zip(
selected_files = self.files()

compression = zipfile.ZIP_DEFLATED if compress else zipfile.ZIP_STORED
with open_zip(filename, mode, compression) as zf:
with open_zip(output_file, mode, compression) as zf:

def write_entry(
filename, # type: str
Expand All @@ -638,11 +687,12 @@ def write_entry(
zip_entry = zf.zip_entry_from_file(
filename=filename,
arcname=os.path.relpath(arcname, strip_prefix) if strip_prefix else arcname,
date_time=DETERMINISTIC_DATETIME.timetuple()
if deterministic_timestamp
else None,
date_time=(
DETERMINISTIC_DATETIME.timetuple() if deterministic_timestamp else None
),
compression=compression,
)
zf.writestr(zip_entry.info, zip_entry.data, compression)
zf.writestr(zip_entry.info, zip_entry.data)

def get_parent_dir(path):
# type: (str) -> Optional[str]
Expand Down
Loading