From 5c2db0896a8f6061c8a99c3cf363b3e3d36a1a56 Mon Sep 17 00:00:00 2001 From: Laurenz Date: Fri, 27 Sep 2024 12:24:55 +0200 Subject: [PATCH] Use mksquashfs pseudofile definitions for AppImage build (#228) This changes the way that appimage squashfs files are built with rules_appimage. Instead of taring up all the files and then providing that to mksquashfs we build a pseudo file definitions list that tells mksquashfs where to get all the files from. This avoids the expensive tar operation when inputs change (building the pseudofile defs is pretty cheap). The mksquashfs action does take longer now (likely because it has to do thousands of `sh -c "cat {file}"` invocations) but overall the build times are lower for large appimages and the cachability remote execution ability should be better. There was also an issue with the most recent MkAppDir implementation where the tar was populated in a ThreadPool which was not deterministic, so file order inside the .tar and .sqfs may change, leading to changed artifact hashes. This is resolved with this. --- appimage/appimage.bzl | 49 ++++++---- appimage/private/mkappdir.py | 153 ++++++++++++++++-------------- tests/mkappdir_test.py | 179 +++++++++++------------------------ 3 files changed, 163 insertions(+), 218 deletions(-) diff --git a/appimage/appimage.bzl b/appimage/appimage.bzl index e7c132c..f40a789 100644 --- a/appimage/appimage.bzl +++ b/appimage/appimage.bzl @@ -31,37 +31,44 @@ def _appimage_impl(ctx): apprun = make_apprun(ctx) inputs = depset(direct = [manifest_file, apprun] + runfile_info.files) - # Create the AppDir tar - appdirtar = ctx.actions.declare_file(ctx.attr.name + ".tar") + # Create the AppDir mksquashfs pseudofile definitions + pseudofile_defs = ctx.actions.declare_file(ctx.attr.name + ".pseudofile_defs.txt") args = ctx.actions.args() args.add("--manifest").add(manifest_file.path) args.add("--apprun").add(apprun.path) - args.add(appdirtar.path) + args.add(pseudofile_defs.path) ctx.actions.run( mnemonic = "MkAppDir", inputs = inputs, executable = ctx.executable._mkappdir, arguments = [args], - outputs = [appdirtar], - execution_requirements = {"no-remote": "1"}, + outputs = [pseudofile_defs], + execution_requirements = {"no-remote-exec": "1"}, + ) + + # Create an empty directory that we can point mksquashfs at so all the added files come from the pseudofile defs + emptydir = ctx.actions.declare_directory(ctx.attr.name + ".emptydir") + ctx.actions.run_shell( + mnemonic = "MkEmptyDir", + outputs = [emptydir], + command = "mkdir -p %s" % emptydir.path, ) # Create the AppDir squashfs - mksquashfs_args = list(MKSQUASHFS_ARGS) - mksquashfs_args.extend(["-processors", str(MKSQUASHFS_NUM_PROCS)]) - mksquashfs_args.extend(["-mem", "{}M".format(MKSQUASHFS_MEM_MB)]) - mksquashfs_args.extend(ctx.attr.build_args) - mksquashfs_args.append("-tar") appdirsqfs = ctx.actions.declare_file(ctx.attr.name + ".sqfs") - ctx.actions.run_shell( - mnemonic = "Sqfstar", - inputs = [appdirtar], - command = "{exe} - {dst} {args} <{src}".format( - exe = ctx.executable._mksquashfs.path, - dst = appdirsqfs.path, - args = " ".join(mksquashfs_args), - src = appdirtar.path, - ), + mksquashfs_args = ctx.actions.args() + mksquashfs_args.add(emptydir.path + "/") + mksquashfs_args.add(appdirsqfs.path) + mksquashfs_args.add_all(MKSQUASHFS_ARGS) + mksquashfs_args.add("-processors").add(MKSQUASHFS_NUM_PROCS) + mksquashfs_args.add("-mem").add("%sM" % MKSQUASHFS_MEM_MB) + mksquashfs_args.add_all(ctx.attr.build_args) + mksquashfs_args.add("-pf").add(pseudofile_defs.path) + ctx.actions.run( + mnemonic = "MkSquashfs", + inputs = depset(direct = [pseudofile_defs, apprun, emptydir] + runfile_info.files), + executable = ctx.executable._mksquashfs, + arguments = [mksquashfs_args], tools = [ctx.executable._mksquashfs], outputs = [appdirsqfs], resource_set = _resources, @@ -79,7 +86,7 @@ def _appimage_impl(ctx): ctx.outputs.executable.path, ], outputs = [ctx.outputs.executable], - execution_requirements = {"no-remote": "1"}, + execution_requirements = {"no-remote-exec": "1"}, ) # Take the `binary` env and add the appimage target's env on top of it @@ -95,7 +102,7 @@ def _appimage_impl(ctx): runfiles = ctx.runfiles(files = [ctx.outputs.executable]), ), RunEnvironmentInfo(env), - OutputGroupInfo(appimage_debug = depset([manifest_file, appdirtar, appdirsqfs])), + OutputGroupInfo(appimage_debug = depset([manifest_file, pseudofile_defs, appdirsqfs])), ] _ATTRS = { diff --git a/appimage/private/mkappdir.py b/appimage/private/mkappdir.py index 35f0d80..a70ab45 100644 --- a/appimage/private/mkappdir.py +++ b/appimage/private/mkappdir.py @@ -1,4 +1,4 @@ -"""Prepare and build an AppImage AppDir tarball.""" +"""Prepare and build an AppImage AppDir Mksquashfs pseudo-file definitions file.""" from __future__ import annotations @@ -9,11 +9,8 @@ import os import shutil import sys -import tarfile -import tempfile -from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import TYPE_CHECKING, Callable, NamedTuple +from typing import TYPE_CHECKING, NamedTuple if TYPE_CHECKING: from collections.abc import Sequence @@ -101,43 +98,68 @@ def is_inside_bazel_cache(path: Path) -> bool: return os.fspath(path).startswith(get_output_base()) -def copy_file_or_dir(src: Path, dst: Path, preserve_symlinks: bool) -> None: - """Copy a file or dir from src to dst. +def get_all_parent_dirs(path: Path | str) -> list[Path]: + path = Path(path) + parts = path.parts if path.is_dir() else path.parts[:-1] + return [Path(*parts[: i + 1]) for i in range(len(parts))] - We use copy2 because it preserves metadata like permissions. - If preserve_symlinks is True we do not want to set follow_symlinks in order to keep - symlink targets preserved. + +def to_pseudofile_def_lines(src: Path, dst: Path, preserve_symlinks: bool) -> dict[str, str]: + """Return a pseudo-file definition line for a file or directory. + + See https://github.com/plougher/squashfs-tools/blob/e7b6490/examples/pseudo-file.example + Pseudo file definition format: + "filename d mode uid gid" create a directory + "filename m mode uid gid" modify filename + "filename f mode uid gid command" create file from stdout of command + "filename s mode uid gid symlink" create a symbolic link + (...) """ - dst.parent.mkdir(parents=True, exist_ok=True) + operations = {dir.as_posix(): "d 755 0 0" for dir in get_all_parent_dirs(dst)} + if ( + src.is_symlink() + and (preserve_symlinks or not src.readlink().exists()) + and not is_inside_bazel_cache(src.readlink()) + ): + operations[dst.as_posix()] = f"s {src.lstat().st_mode & 0o777:o} 0 0 {src.readlink()}" + elif src.is_file(): + operations[dst.as_posix()] = f"f {src.lstat().st_mode & 0o777:o} 0 0 cat {src}" + elif src.is_dir(): + operations[dst.as_posix()] = f"d {src.lstat().st_mode & 0o777:o} 0 0" + elif not src.exists(): + raise FileNotFoundError(f"{src=} does not exist") + else: + raise NotImplementedError(f"Cannot handle {src}") + + return operations + + +def copy_file_or_dir(src: Path, dst: Path, preserve_symlinks: bool) -> dict[str, str]: + """Copy a file or dir from src to dst.""" if not src.is_dir(): - shutil.copy2(src, dst, follow_symlinks=not preserve_symlinks) - return + return to_pseudofile_def_lines(src, dst, preserve_symlinks) - # Scan and recreate dangling symlinks - dangling: set[Path] = set() - for link in src.rglob("*"): - if not link.is_symlink() or link.exists(): - continue - new_link = dst / link.relative_to(src) - new_link.parent.mkdir(parents=True, exist_ok=True) - new_link.symlink_to(link.readlink()) - dangling.add(link) + operations: dict[str, str] = {} - copy_function = functools.partial(shutil.copy2, follow_symlinks=not preserve_symlinks) + # Scan and recreate symlinks manually + links = {link for link in src.rglob("*") if link.is_symlink()} + for link in links: + operations.update(to_pseudofile_def_lines(link, dst / link.relative_to(src), preserve_symlinks)) + + copies: dict[str, str] = {} shutil.copytree( src, dst, symlinks=preserve_symlinks, - ignore=lambda dir, names: [f for f in names if Path(dir) / f in dangling], - copy_function=copy_function, + ignore=lambda dir, names: [f for f in names if Path(dir) / f in links], + copy_function=lambda src, dst: copies.setdefault(dst, src), ignore_dangling_symlinks=False, dirs_exist_ok=True, ) + for dst_, src_ in copies.items(): + operations.update(to_pseudofile_def_lines(Path(src_), Path(dst_), preserve_symlinks)) - -def _copy_file_or_dir_later(src: Path, dst: Path, preserve_symlinks: bool) -> Callable[[], None]: - """Return a function that copies a file or dir from src to dst.""" - return lambda: copy_file_or_dir(src, dst, preserve_symlinks) + return operations def _move_relative_symlinks_in_files_to_their_own_section(manifest_data: _ManifestData) -> _ManifestData: @@ -198,7 +220,7 @@ def _prevent_duplicate_dsts_with_diverging_srcs(manifest_data: _ManifestData) -> return new_manifest_data -def populate_appdir(appdir: Path, manifest: Path) -> None: +def make_appdir_pseudofile_defs(manifest: Path) -> dict[str, str]: """Make the AppDir that will be squashfs'd into the AppImage. Note that the [AppImage Type2 Spec][appimage-spec] specifies that the contained [AppDir][appdir-spec] may contain a @@ -209,30 +231,30 @@ def populate_appdir(appdir: Path, manifest: Path) -> None: [desktop-spec]: https://specifications.freedesktop.org/desktop-entry-spec/desktop-entry-spec-latest.html [appimaged]: https://docs.appimage.org/user-guide/run-appimages.html#integrating-appimages-into-the-desktop """ - appdir.mkdir(parents=True, exist_ok=True) manifest_data = _ManifestData.from_json(manifest.read_text()) manifest_data = _move_relative_symlinks_in_files_to_their_own_section(manifest_data) manifest_data = _prevent_duplicate_dsts_with_diverging_srcs(manifest_data) - actions: list[Callable[[], None]] = [] + operations: dict[str, str] = {} for empty_file in manifest_data.empty_files: # example entry: "tests/test_py.runfiles/__init__.py" - (appdir / empty_file).parent.mkdir(parents=True, exist_ok=True) - (appdir / empty_file).touch() + for dir in get_all_parent_dirs(empty_file): + operations[dir.as_posix()] = "d 755 0 0" + operations[empty_file] = "f 755 0 0 true" for file in manifest_data.files: # example entry: {"dst": "tests/test_py.runfiles/_main/tests/data.txt", "src": "tests/data.txt"} - src = Path(file.src).resolve() - dst = Path(appdir / file.dst).resolve() - assert src.exists(), f"want to copy {src} to {dst}, but it does not exist" - actions.append(_copy_file_or_dir_later(src, dst, preserve_symlinks=True)) + operations.update(copy_file_or_dir(Path(file.src), Path(file.dst), preserve_symlinks=True)) for link in manifest_data.symlinks: # example entry: {"linkname": "tests/test_py", "target": "tests/test_py.runfiles/_main/tests/test_py"} - linkname = Path(link.linkname) - linkfile = (appdir / linkname).resolve() - linkfile.parent.mkdir(parents=True, exist_ok=True) + # example entry: {"linkname": + # "tests/test_py.runfiles/_main/../rules_python~0.27.1~python~python_3_11_x86_64-unknown-linux-gnu/bin/python3", + # "target": "python3.11"} + linkfile = Path(link.linkname) + for dir in get_all_parent_dirs(linkfile): + operations[dir.as_posix()] = "d 755 0 0" target = Path(link.target) if target.is_absolute(): # We keep absolute symlinks as is, but make no effort to copy the target into the runfiles as well. @@ -240,48 +262,37 @@ def populate_appdir(appdir: Path, manifest: Path) -> None: pass else: # Adapt relative symlinks to point relative to the new linkfile location - target = relative_path(appdir / target, linkfile.parent) - linkfile.symlink_to(target) + target = relative_path(target, linkfile.parent) + operations[linkfile.as_posix()] = f"s 755 0 0 {target}" for link in manifest_data.relative_symlinks: # example entry: {"linkname": # "tests/test_py.runfiles/_main/../rules_python~0.27.1~python~python_3_11_x86_64-unknown-linux-gnu/bin/python3", # "target": "python3.11"} - linkfile = (appdir / link.linkname).resolve() - linkfile.parent.mkdir(parents=True, exist_ok=True) - target = Path(link.target) - assert not target.is_absolute(), f"symlink {linkfile} must be relative, but points at {target}" - linkfile.symlink_to(target) + for dir in get_all_parent_dirs(link.linkname): + operations[dir.as_posix()] = "d 755 0 0" + operations[link.linkname] = f"s 755 0 0 {link.target}" for tree_artifact in manifest_data.tree_artifacts: # example entry: # {'dst': 'test.runfiles/_main/../rules_pycross~~lock_repos~pdm_deps/_lock/humanize@4.9.0', # 'src': 'bazel-out/k8-fastbuild/bin/external/rules_pycross~~lock_repos~pdm_deps/_lock/humanize@4.9.0'} - src = Path(tree_artifact.src).resolve() - dst = Path(appdir / tree_artifact.dst).resolve() - assert src.exists(), f"want to copy {src} to {dst}, but it does not exist" - actions.append(_copy_file_or_dir_later(src, dst, preserve_symlinks=False)) - - with ThreadPoolExecutor() as executor: - futures = [executor.submit(action) for action in actions] - for future in futures: - future.result() + operations.update(copy_file_or_dir(Path(tree_artifact.src), Path(tree_artifact.dst), preserve_symlinks=False)) + # Must not have `..` in file names: https://github.com/plougher/squashfs-tools/blob/4.6.1/squashfs-tools/unsquash-1.c#L377 + operations = {os.path.normpath(f): v for f, v in operations.items()} -def _make_executable(ti: tarfile.TarInfo) -> tarfile.TarInfo: - ti.mode |= 0o111 - return ti + return operations -def make_appdir_tar(manifest: Path, apprun: Path, output: Path) -> None: - """Create an AppImage AppDir (uncompressed) tar ready to be sqfstar'ed.""" - with tarfile.open(output, "w") as tar: - tar.add(apprun.resolve(), arcname="AppRun", filter=_make_executable) - with tempfile.TemporaryDirectory() as tmpdir: - appdir = Path(tmpdir) - populate_appdir(appdir, manifest) - for top_level in appdir.iterdir(): - tar.add(top_level, arcname=top_level.name) +def write_appdir_pseudofile_defs(manifest: Path, apprun: Path, output: Path) -> None: + """Write a mksquashfs pf file representing the AppDir.""" + lines = [ + f"AppRun f 777 0 0 cat {apprun}", + *sorted(f"{k} {v}" for k, v in make_appdir_pseudofile_defs(manifest).items()), + "", + ] + output.write_text("\n".join(lines)) def parse_args(args: Sequence[str]) -> argparse.Namespace: @@ -299,10 +310,10 @@ def parse_args(args: Sequence[str]) -> argparse.Namespace: type=Path, help="Path to AppRun script", ) - parser.add_argument("output", type=Path, help="Where to place output AppDir tar") + parser.add_argument("output", type=Path, help="Where to place output AppDir pseudo-file definition file") return parser.parse_args(args) if __name__ == "__main__": args = parse_args(sys.argv[1:]) - make_appdir_tar(args.manifest, args.apprun, args.output) + write_appdir_pseudofile_defs(args.manifest, args.apprun, args.output) diff --git a/tests/mkappdir_test.py b/tests/mkappdir_test.py index d71e93b..ad678e6 100644 --- a/tests/mkappdir_test.py +++ b/tests/mkappdir_test.py @@ -1,10 +1,10 @@ """Unit tests for mkappdir module.""" -import json +import contextlib import os import sys -import tarfile import tempfile +from collections.abc import Iterator from pathlib import Path import pytest @@ -48,131 +48,58 @@ def test_is_inside_bazel_cache(path: Path, expected: bool) -> None: assert inside == expected -@pytest.mark.parametrize("symlinks", [True, False]) -def test_copy_file_or_dir(symlinks: bool) -> None: - with tempfile.TemporaryDirectory(suffix=".src") as tmp_dir: - src = Path(tmp_dir) / "src" - src.mkdir() - - foo = src / "foo" - foo.mkdir(mode=0o701) - - bar = foo / "bar" - bar.write_text("bar") - bar.chmod(0o456) - os.utime(bar, (0, 200)) - os.utime(foo, (0, 100)) - - dir = src / "dir" - dir.mkdir() - - link = src / "baz/link" - link.parent.mkdir() - link.symlink_to("../foo/bar") - - dangling = src / "dangling" - dangling.symlink_to("invalid") - - dst_link = Path(tmp_dir) / "dstfile/baz/link" - dst_bar = Path(tmp_dir) / "dstfile/foo/bar" - mkappdir.copy_file_or_dir(link, dst_link, preserve_symlinks=symlinks) - mkappdir.copy_file_or_dir(bar, dst_bar, preserve_symlinks=symlinks) - - assert dst_link.exists() - assert dst_link.is_symlink() is symlinks - assert dst_link.read_text() == "bar" - - dst = Path(tmp_dir) / "dst_dir" - mkappdir.copy_file_or_dir(src, dst, preserve_symlinks=symlinks) - - assert set(dst.rglob("*")) == { - dst / "foo", - dst / "foo/bar", - dst / "dir", - dst / "baz", - dst / "baz/link", - dst / "dangling", +@pytest.mark.parametrize( + ("path", "expected"), + [ + ("/", ["/"]), + (".", []), + ("/a", ["/"]), + ("./a", []), + ("/dev/null", ["/", "/dev"]), + ("/dev", ["/", "/dev"]), + ("a", []), + ("a/b", ["a"]), + ("a/b/c", ["a", "a/b"]), + ("a/b/c/d", ["a", "a/b", "a/b/c"]), + ], +) +def test_get_all_parent_dirs(path: str, expected: list[str]) -> None: + assert mkappdir.get_all_parent_dirs(Path(path)) == list(map(Path, expected)) + + +@contextlib.contextmanager +def cd(path: Path | str) -> Iterator[None]: + old = Path.cwd() + os.chdir(path) + try: + yield + finally: + os.chdir(old) + + +def test_to_pseudofile_def_lines() -> None: + mkdef = mkappdir.to_pseudofile_def_lines + with tempfile.TemporaryDirectory() as tmp_dir, cd(tmp_dir): + src = Path("dir/file") + src.parent.mkdir(parents=True, exist_ok=True) + src.touch(0o601) + dangling = Path("dangling") + dangling.symlink_to("../invalid") + link = Path("link") + link.symlink_to(src) + + assert mkdef(src, Path("a/b/c/d"), True) == { + "a": "d 755 0 0", + "a/b": "d 755 0 0", + "a/b/c": "d 755 0 0", + "a/b/c/d": "f 601 0 0 cat dir/file", } - - foo = dst / "foo" - assert foo.exists() - assert foo.is_dir() - assert oct(foo.stat().st_mode) == "0o40701" - assert foo.stat().st_mtime == 100 - - file = dst / "foo/bar" - assert file.read_text() == "bar" - assert oct(file.stat().st_mode) == "0o100456" - assert file.stat().st_mtime == 200 - - link = dst / "baz/link" - assert link.exists() - assert link.is_file() - if symlinks: - assert link.readlink() == Path("../foo/bar") - else: - assert not link.is_symlink() - assert link.read_text() == "bar" - - dangling = dst / "dangling" - assert dangling.is_symlink() - assert not dangling.exists() - - -def test_populate_appdir() -> None: - with ( - tempfile.NamedTemporaryFile(suffix=".json") as manifest_file, - tempfile.TemporaryDirectory() as tmp_dir, - ): - manifest = Path(manifest_file.name) - manifest.write_text( - json.dumps( - { - "empty_files": ["empty_file"], - "files": [{"src": __file__, "dst": "dir/b"}], - "symlinks": [{"linkname": "link/symlink", "target": "dir/b"}], - "tree_artifacts": [{"src": Path(__file__).parent.as_posix(), "dst": "tree"}], - } - ) - ) - appdir = Path(tmp_dir) - mkappdir.populate_appdir(appdir, manifest) - - assert (appdir / "empty_file").read_text() == "" - assert (appdir / "dir/b").read_text() == Path(__file__).read_text() - assert (appdir / "link/symlink").is_symlink() - assert (appdir / "tree" / Path(__file__).name).read_text() == Path(__file__).read_text() - - -def test_make_appdir_tar() -> None: - with ( - tempfile.NamedTemporaryFile(suffix=".json") as manifest, - tempfile.NamedTemporaryFile(suffix=".AppRun") as apprun, - tempfile.NamedTemporaryFile(suffix=".tar") as output, - ): - Path(manifest.name).write_text( - json.dumps( - { - "empty_files": [], - "files": [], - "symlinks": [{"linkname": "link/symlink", "target": "AppRun"}], - "tree_artifacts": [], - } - ) - ) - Path(apprun.name).write_text("#!/bin/sh\n") - mkappdir.make_appdir_tar( - Path(manifest.name), - Path(apprun.name), - Path(output.name), - ) - with tarfile.open(output.name, "r:") as tar: - assert set(tar.getnames()) == {"link", "link/symlink", "AppRun"} - link = tar.extractfile("link/symlink") - assert link - assert link.read() == b"#!/bin/sh\n" - linkinfo = tar.getmember("link/symlink") - assert linkinfo.type == tarfile.SYMTYPE + assert mkdef(src, Path("dst"), True) == {"dst": "f 601 0 0 cat dir/file"} + perms = f"{dangling.lstat().st_mode & 0o777:o}" # default differs on Linux and macOS + assert mkdef(dangling, Path("dst"), True) == {"dst": f"s {perms} 0 0 ../invalid"} + assert mkdef(dangling, Path("dst"), False) == {"dst": f"s {perms} 0 0 ../invalid"} + assert mkdef(link, Path("dst"), True) == {"dst": f"s {perms} 0 0 dir/file"} + assert mkdef(link, Path("dst"), False) == {"dst": f"f {perms} 0 0 cat link"} if __name__ == "__main__":