Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

persistent cache for link parsing and interpreter compatibility #12258

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/12186.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Avoid downloading any dists in ``install --dry-run`` if PEP 658 ``.metadata`` files or lazy wheels are available.
1 change: 1 addition & 0 deletions news/12256.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Cache computed metadata from sdists and lazy wheels in ``~/.cache/pip/link-metadata`` when ``--use-feature=metadata-cache`` is enabled.
1 change: 1 addition & 0 deletions news/12257.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Store HTTP caching headers in ``~/.cache/pip/fetch-resolve`` to reduce bandwidth usage when ``--use-feature=metadata-cache`` is enabled.
1 change: 1 addition & 0 deletions news/12863.trivial.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Cache "concrete" dists by ``Distribution`` instead of ``InstallRequirement``.
1 change: 1 addition & 0 deletions news/12871.trivial.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Refactor much of ``RequirementPreparer`` to avoid duplicated code paths for metadata-only requirements.
170 changes: 145 additions & 25 deletions src/pip/_internal/cache.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""Cache Management
"""

import abc
import hashlib
import json
import logging
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type

from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
from pip._vendor.packaging.utils import canonicalize_name
Expand All @@ -15,21 +17,71 @@
from pip._internal.models.direct_url import DirectUrl
from pip._internal.models.link import Link
from pip._internal.models.wheel import Wheel
from pip._internal.req.req_install import InstallRequirement
from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds
from pip._internal.utils.urls import path_to_url
from pip._internal.vcs import vcs

logger = logging.getLogger(__name__)

_egg_info_re = re.compile(r"([a-z0-9_.]+)-([a-z0-9_.!+-]+)", re.IGNORECASE)

ORIGIN_JSON_NAME = "origin.json"


def _contains_egg_info(s: str) -> bool:
"""Determine whether the string looks like an egg_info.

:param s: The string to parse. E.g. foo-2.1
"""
return bool(_egg_info_re.search(s))


def should_cache(
    req: InstallRequirement,
) -> bool:
    """
    Return whether a built InstallRequirement can be stored in the persistent
    wheel cache, assuming the wheel cache is available, and _should_build()
    has determined a wheel needs to be built.
    """
    if not req.link:
        # Without a link there is nothing to key a cache entry on.
        return False

    if req.link.is_wheel:
        # Wheels are not rebuilt, so there is nothing to cache.
        return False

    if req.editable or not req.source_dir:
        # Never cache editable requirements, nor requirements lacking a
        # source tree to build from.
        return False

    if req.link.is_vcs:
        # VCS checkout. Do not cache unless it points to an immutable
        # commit hash.
        vcs_backend = vcs.get_backend_for_scheme(req.link.scheme)
        assert vcs_backend
        return vcs_backend.is_immutable_rev_checkout(req.link.url, req.source_dir)

    base, _ext = req.link.splitext()
    if _contains_egg_info(base):
        # Looks like a pinned "name-version" sdist; safe to cache.
        return True

    # Otherwise, do not cache.
    return False


def _hash_dict(d: Dict[str, str]) -> str:
"""Return a stable sha224 of a dictionary."""
s = json.dumps(d, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
return hashlib.sha224(s.encode("ascii")).hexdigest()


class Cache:
class Cache(abc.ABC):
"""An abstract class - provides cache directories for data from links

:param cache_dir: The root of the cache.
Expand All @@ -40,7 +92,9 @@ def __init__(self, cache_dir: str) -> None:
assert not cache_dir or os.path.isabs(cache_dir)
self.cache_dir = cache_dir or None

def _get_cache_path_parts(self, link: Link) -> List[str]:
def _get_cache_path_parts(
self, link: Link, *, interpreter_dependent: bool
) -> List[str]:
"""Get parts of part that must be os.path.joined with cache_dir"""

# We want to generate an url to use as our cache key, we don't want to
Expand All @@ -52,13 +106,14 @@ def _get_cache_path_parts(self, link: Link) -> List[str]:
if link.subdirectory_fragment:
key_parts["subdirectory"] = link.subdirectory_fragment

# Include interpreter name, major and minor version in cache key
# to cope with ill-behaved sdists that build a different wheel
# depending on the python version their setup.py is being run on,
# and don't encode the difference in compatibility tags.
# https://github.com/pypa/pip/issues/7296
key_parts["interpreter_name"] = interpreter_name()
key_parts["interpreter_version"] = interpreter_version()
if interpreter_dependent:
# Include interpreter name, major and minor version in cache key
# to cope with ill-behaved sdists that build a different wheel
# depending on the python version their setup.py is being run on,
# and don't encode the difference in compatibility tags.
# https://github.com/pypa/pip/issues/7296
key_parts["interpreter_name"] = interpreter_name()
key_parts["interpreter_version"] = interpreter_version()

# Encode our key url with sha224, we'll use this because it has similar
# security properties to sha256, but with a shorter total output (and
Expand All @@ -73,20 +128,59 @@ def _get_cache_path_parts(self, link: Link) -> List[str]:

return parts

def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]:
can_not_cache = not self.cache_dir or not canonical_package_name or not link
if can_not_cache:
return []
@abc.abstractmethod
def get_path_for_link(self, link: Link) -> str:
"""Return a directory to store cached items in for link."""
...

path = self.get_path_for_link(link)
if os.path.isdir(path):
return [(candidate, path) for candidate in os.listdir(path)]
return []
def cache_path(self, link: Link) -> Path:
    """Return the cache directory for ``link`` as a :class:`~pathlib.Path`."""
    return Path(self.get_path_for_link(link))


class LinkMetadataCache(Cache):
    """Persistently store the metadata of dists found at each link."""

    def get_path_for_link(self, link: Link) -> str:
        """Return a directory to store cached metadata for ``link``.

        Computing metadata may involve running an sdist's build backend, so
        the key is interpreter-dependent (see pypa/pip#7296).
        """
        # NOTE(review): the scraped text interleaved the removed abstract
        # method's ``raise NotImplementedError()`` above this body, which made
        # the implementation unreachable; the stale raise is dropped here.
        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
        assert self.cache_dir
        return os.path.join(self.cache_dir, "link-metadata", *parts)


class SerializableEntry(abc.ABC):
    """Interface for cache entries that serialize to a JSON-able dict."""

    @classmethod
    @abc.abstractmethod
    def suffix(cls) -> str:
        """Return the filename suffix identifying this entry type."""

    @abc.abstractmethod
    def serialize(self) -> Dict[str, Any]:
        """Return a JSON-serializable representation of this entry."""


class FetchResolveCache(Cache):
    """Cache directories for data fetched while resolving from index links."""

    def get_path_for_link(self, link: Link) -> str:
        # We are reading index links to extract other links from, not
        # executing any python code, so these caches are
        # interpreter-independent.
        assert self.cache_dir
        subdirs = self._get_cache_path_parts(link, interpreter_dependent=False)
        return os.path.join(self.cache_dir, "fetch-resolve", *subdirs)

    def hashed_entry_path(self, link: Link, entry: SerializableEntry) -> Path:
        """Path for ``entry`` under ``link``'s cache dir, keyed by contents."""
        digest = _hash_dict(entry.serialize())
        return self.cache_path(link) / (digest + entry.suffix())

    def clear_hashed_entries(
        self, link: Link, entry_type: Type[SerializableEntry]
    ) -> None:
        """Delete every cached entry of ``entry_type`` stored for ``link``."""
        glob_pattern = "*" + entry_type.suffix()
        for stale_entry in self.cache_path(link).glob(glob_pattern):
            logger.debug(
                "unlinking invalidated hashed link eval cache entry %s", stale_entry
            )
            stale_entry.unlink()


class WheelCacheBase(Cache):
"""Specializations to the cache concept for wheels."""

@abc.abstractmethod
def get(
self,
link: Link,
Expand All @@ -96,10 +190,27 @@ def get(
"""Returns a link to a cached item if it exists, otherwise returns the
passed link.
"""
raise NotImplementedError()
...

def _can_cache(self, link: Link, canonical_package_name: str) -> bool:
    """Whether a cache lookup is possible at all for this link/name pair."""
    if not self.cache_dir:
        return False
    return bool(link and canonical_package_name)

class SimpleWheelCache(Cache):
def _get_candidates(
    self, link: Link, canonical_package_name: str
) -> Iterator[Tuple[str, str]]:
    """Yield ``(filename, directory)`` pairs for files cached under ``link``."""
    if not self._can_cache(link, canonical_package_name):
        return
    cache_dir = self.get_path_for_link(link)
    if not os.path.isdir(cache_dir):
        return
    yield from (
        (entry.name, cache_dir)
        for entry in os.scandir(cache_dir)
        if entry.is_file()
    )


class SimpleWheelCache(WheelCacheBase):
"""A cache of wheels for future installs."""

def __init__(self, cache_dir: str) -> None:
Expand All @@ -120,7 +231,7 @@ def get_path_for_link(self, link: Link) -> str:

:param link: The link of the sdist for which this will cache wheels.
"""
parts = self._get_cache_path_parts(link)
parts = self._get_cache_path_parts(link, interpreter_dependent=True)
assert self.cache_dir
# Store wheels within the root cache_dir
return os.path.join(self.cache_dir, "wheels", *parts)
Expand All @@ -131,7 +242,7 @@ def get(
package_name: Optional[str],
supported_tags: List[Tag],
) -> Link:
candidates = []
candidates: List[Tuple[int, str, str]] = []

if not package_name:
return link
Expand Down Expand Up @@ -205,7 +316,7 @@ def __init__(
)


class WheelCache(Cache):
class WheelCache(WheelCacheBase):
"""Wraps EphemWheelCache and SimpleWheelCache into a single Cache

This Cache allows for gracefully degradation, using the ephem wheel cache
Expand All @@ -223,6 +334,15 @@ def get_path_for_link(self, link: Link) -> str:
def get_ephem_path_for_link(self, link: Link) -> str:
return self._ephem_cache.get_path_for_link(link)

def resolve_cache_dir(self, req: InstallRequirement) -> str:
    """Return the persistent or temporary cache directory where the built or
    downloaded wheel should be stored."""
    assert req.link, req
    use_persistent = bool(self.cache_dir) and should_cache(req)
    if use_persistent:
        return self.get_path_for_link(req.link)
    return self.get_ephem_path_for_link(req.link)

def get(
self,
link: Link,
Expand Down
2 changes: 2 additions & 0 deletions src/pip/_internal/cli/cmdoptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1009,6 +1009,8 @@ def check_list_path_option(options: Values) -> None:
default=[],
choices=[
"fast-deps",
"metadata-cache",
"truststore",
]
+ ALWAYS_ENABLED_FEATURES,
help="Enable new functionality, that may be backward incompatible.",
Expand Down
18 changes: 17 additions & 1 deletion src/pip/_internal/cli/req_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from optparse import Values
from typing import Any, List, Optional, Tuple

from pip._internal.cache import WheelCache
from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache
from pip._internal.cli import cmdoptions
from pip._internal.cli.index_command import IndexGroupCommand
from pip._internal.cli.index_command import SessionCommandMixin as SessionCommandMixin
Expand Down Expand Up @@ -127,6 +127,16 @@ def make_requirement_preparer(
"fast-deps has no effect when used with the legacy resolver."
)

if options.cache_dir and "metadata-cache" in options.features_enabled:
logger.warning(
"pip is using a local cache for metadata information. "
"This experimental feature is enabled through "
"--use-feature=metadata-cache and it is not ready for "
"production."
)
metadata_cache = LinkMetadataCache(options.cache_dir)
else:
metadata_cache = None
return RequirementPreparer(
build_dir=temp_build_dir_path,
src_dir=options.src_dir,
Expand All @@ -142,6 +152,7 @@ def make_requirement_preparer(
lazy_wheel=lazy_wheel,
verbosity=verbosity,
legacy_resolver=legacy_resolver,
metadata_cache=metadata_cache,
)

@classmethod
Expand Down Expand Up @@ -322,8 +333,13 @@ def _build_package_finder(
ignore_requires_python=ignore_requires_python,
)

if bool(options.cache_dir) and ("metadata-cache" in options.features_enabled):
fetch_resolve_cache = FetchResolveCache(options.cache_dir)
else:
fetch_resolve_cache = None
return PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
fetch_resolve_cache=fetch_resolve_cache,
)
5 changes: 3 additions & 2 deletions src/pip/_internal/commands/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ def run(self, options: Values, args: List[str]) -> int:
self.trace_basic_info(finder)

requirement_set = resolver.resolve(reqs, check_supported_wheels=True)
preparer.finalize_linked_requirements(
requirement_set.requirements.values(), require_dist_files=True
)

downloaded: List[str] = []
for req in requirement_set.requirements.values():
Expand All @@ -138,8 +141,6 @@ def run(self, options: Values, args: List[str]) -> int:
preparer.save_linked_requirement(req)
downloaded.append(req.name)

preparer.prepare_linked_requirements_more(requirement_set.requirements.values())

if downloaded:
write_output("Successfully downloaded %s", " ".join(downloaded))

Expand Down
7 changes: 6 additions & 1 deletion src/pip/_internal/commands/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ def add_options(self) -> None:
help=(
"Don't actually install anything, just print what would be. "
"Can be used in combination with --ignore-installed "
"to 'resolve' the requirements."
"to 'resolve' the requirements. If package metadata is available "
"or cached, --dry-run also avoids downloading the dependency at all."
),
)
self.cmd_opts.add_option(
Expand Down Expand Up @@ -379,6 +380,10 @@ def run(self, options: Values, args: List[str]) -> int:
requirement_set = resolver.resolve(
reqs, check_supported_wheels=not options.target_dir
)
preparer.finalize_linked_requirements(
requirement_set.requirements.values(),
require_dist_files=not options.dry_run,
)

if options.json_report_file:
report = InstallationReport(requirement_set.requirements_to_install)
Expand Down
5 changes: 3 additions & 2 deletions src/pip/_internal/commands/wheel.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ def run(self, options: Values, args: List[str]) -> int:
self.trace_basic_info(finder)

requirement_set = resolver.resolve(reqs, check_supported_wheels=True)
preparer.finalize_linked_requirements(
requirement_set.requirements.values(), require_dist_files=True
)

reqs_to_build: List[InstallRequirement] = []
for req in requirement_set.requirements.values():
Expand All @@ -153,8 +156,6 @@ def run(self, options: Values, args: List[str]) -> int:
elif should_build_for_wheel_command(req):
reqs_to_build.append(req)

preparer.prepare_linked_requirements_more(requirement_set.requirements.values())

# build wheels
build_successes, build_failures = build(
reqs_to_build,
Expand Down
Loading