feat: Allow to order the index urls and find links together for PackageFinder (#43)
frostming · Apr 6, 2023 · dbe8527 · dbe8527
1 parent 0c7b891
commit dbe8527
Show file tree

Hide file tree

Showing 7 changed files with 198 additions and 108 deletions.
diff --git a/README.md b/README.md
@@ -43,7 +43,7 @@ Get the best matching candidate for a requirement:
 
 ```python
 >>> from unearth import PackageFinder
->>> finder = PackageFinder(index_urls=['https://pypi.org/simple/'])
+>>> finder = PackageFinder(index_urls=["https://pypi.org/simple/"])
 >>> result = finder.find_best_match("flask>=2")
 >>> result.best_candidate
 Package(name='flask', version='2.1.2', link=<Link https://files.pythonhosted.org/packages/ba/76/e9580e494eaf6f09710b0f3b9000c9c0363e44af5390be32bb0394165853/Flask-2.1.2-py3-none-any.whl#sha256=fad5b446feb0d6db6aec0c3184d16a8c1f6c3e464b511649c8918a9be100b4fe (from https://pypi.org/simple/flask)>)

diff --git a/src/unearth/__init__.py b/src/unearth/__init__.py
@@ -7,12 +7,13 @@
 """
 from unearth.errors import HashMismatchError, UnpackError, URLError, VCSBackendError
 from unearth.evaluator import Package, TargetPython
-from unearth.finder import BestMatch, PackageFinder
+from unearth.finder import BestMatch, PackageFinder, Source
 from unearth.link import Link
 from unearth.vcs import vcs_support
 
 __all__ = [
     "Link",
+    "Source",
     "Package",
     "URLError",
     "BestMatch",

diff --git a/src/unearth/__main__.py b/src/unearth/__main__.py
@@ -24,8 +24,8 @@ class CLIArgs:
     index_urls: list[str]
     find_links: list[str]
     trusted_hosts: list[str]
-    no_binary: list[str]
-    only_binary: list[str]
+    no_binary: bool
+    only_binary: bool
     prefer_binary: bool
     all: bool
     link_only: bool
@@ -57,8 +57,8 @@ def cli_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--index-url",
         "-i",
-        dest="index_urls",
         metavar="URL",
+        dest="index_urls",
         action="append",
         help="(Multiple)(PEP 503)Simple Index URLs.",
     )
@@ -79,17 +79,13 @@ def cli_parser() -> argparse.ArgumentParser:
     )
     parser.add_argument(
         "--no-binary",
-        action="append",
-        metavar="PACKAGE",
-        help="(Multiple)Specify package names to exclude binary results, "
-        "or `:all:` to exclude all binary results.",
+        action="store_true",
+        help="Exclude binary packages from the results.",
     )
     parser.add_argument(
         "--only-binary",
-        action="append",
-        metavar="PACKAGE",
-        help="(Multiple)Specify package names to only allow binary results, "
-        "or `:all:` to enforce binary results for all packages.",
+        action="store_true",
+        help="Only include binary packages in the results.",
     )
     parser.add_argument(
         "--prefer-binary",
@@ -128,16 +124,17 @@ def cli(argv: list[str] | None = None) -> None:
     parser = cli_parser()
     args = cast(CLIArgs, parser.parse_args(argv))
     _setup_logger(args.verbose)
+    name = args.requirement.name
     finder = PackageFinder(
-        index_urls=args.index_urls or ["https://pypi.org/simple"],
+        index_urls=args.index_urls or ["https://pypi.org/simple/"],
         find_links=args.find_links or [],
         trusted_hosts=args.trusted_hosts or [],
-        no_binary=args.no_binary or [],
-        only_binary=args.only_binary or [],
-        prefer_binary=args.prefer_binary,
+        no_binary=[name] if args.no_binary else [],
+        only_binary=[name] if args.only_binary else [],
+        prefer_binary=[name] if args.prefer_binary else [],
         verbosity=int(args.verbose),
     )
-    matches = finder.find_matches(args.requirement)
+    matches = list(finder.find_matches(args.requirement))
     if not matches:
         print("No matches are found.", file=sys.stderr)
         sys.exit(1)

diff --git a/src/unearth/finder.py b/src/unearth/finder.py
@@ -3,10 +3,11 @@
 
 import atexit
 import functools
+import itertools
 import os
 import pathlib
 from tempfile import TemporaryDirectory
-from typing import Iterable, NamedTuple
+from typing import TYPE_CHECKING, Iterable, NamedTuple, Sequence
 from urllib.parse import urljoin
 
 import packaging.requirements
@@ -25,7 +26,17 @@
 from unearth.link import Link
 from unearth.preparer import unpack_link
 from unearth.session import PyPISession
-from unearth.utils import split_auth_from_url
+from unearth.utils import LazySequence
+
+if TYPE_CHECKING:
+    from typing import TypedDict
+
+    class Source(TypedDict):
+        url: str
+        type: str
+
+else:
+    Source = dict
 
 
 class BestMatch(NamedTuple):
@@ -34,9 +45,9 @@ class BestMatch(NamedTuple):
     #: The best matching package, or None if no match was found.
     best: Package | None
     #: The applicable packages, excluding those with unmatching versions.
-    applicable: list[Package]
+    applicable: Sequence[Package]
     #: All candidates found for the requirement.
-    candidates: list[Package]
+    candidates: Sequence[Package]
 
 
 class PackageFinder:
@@ -45,16 +56,16 @@ class PackageFinder:
     Args:
         session (PyPISession|None): The session to use for the finder.
             If not provided, a temporary session will be created.
-        index_urls: (Iterable[str]): The urls of the index pages.
-        find_links: (Iterable[str]): The urls or paths of the find links.
+        index_urls (Iterable[str]): The index URLs to search for packages.
+        find_links (Iterable[str]): The links to search for packages.
         trusted_hosts (Iterable[str]): The trusted hosts.
         target_python (TargetPython): The links must match
             the target Python
         ignore_compatibility (bool): Whether to ignore the compatibility check
         no_binary (Iterable[str]): The names of the packages to disallow wheels
         only_binary (Iterable[str]): The names of the packages to disallow non-wheels
-        prefer_binary (bool): Whether to prefer binary packages even if
-            newer sdist pacakges exist.
+        prefer_binary (Iterable[str]): The names of the packages to prefer binary
+            distributions even if newer sdist packages exist.
         respect_source_order (bool): If True, packages from the source coming earlier
             are more preferred, even if they have lower versions.
         verbosity (int): The verbosity level.
@@ -63,40 +74,65 @@ class PackageFinder:
     def __init__(
         self,
         session: PyPISession | None = None,
+        *,
         index_urls: Iterable[str] = (),
         find_links: Iterable[str] = (),
         trusted_hosts: Iterable[str] = (),
         target_python: TargetPython | None = None,
         ignore_compatibility: bool = False,
         no_binary: Iterable[str] = (),
         only_binary: Iterable[str] = (),
-        prefer_binary: bool = False,
+        prefer_binary: Iterable[str] = (),
         respect_source_order: bool = False,
         verbosity: int = 0,
     ) -> None:
-        self.index_urls = list(index_urls)
-        self.find_links = list(find_links)
+        self.sources: list[Source] = []
+        for url in index_urls:
+            self.add_index_url(url)
+        for url in find_links:
+            self.add_find_links(url)
         self.target_python = target_python or TargetPython()
         self.ignore_compatibility = ignore_compatibility
         self.no_binary = [canonicalize_name(name) for name in no_binary]
         self.only_binary = [canonicalize_name(name) for name in only_binary]
-        self.prefer_binary = prefer_binary
-        if session is None:
-            session = PyPISession(
-                index_urls=self.index_urls, trusted_hosts=trusted_hosts
-            )
-            atexit.register(session.close)
-        self.session = session
+        self.prefer_binary = [canonicalize_name(name) for name in prefer_binary]
+        self.trusted_hosts = trusted_hosts
+        self._session = session
         self.respect_source_order = respect_source_order
         self.verbosity = verbosity
 
         self._tag_priorities = {
             tag: i for i, tag in enumerate(self.target_python.supported_tags())
         }
-        # Index pages are preferred over find links.
-        self._source_order = [
-            split_auth_from_url(url)[1] for url in (self.index_urls + self.find_links)
-        ]
+
+    @property
+    def session(self) -> PyPISession:
+        if self._session is None:
+            index_urls = [
+                source["url"] for source in self.sources if source["type"] == "index"
+            ]
+            session = PyPISession(
+                index_urls=index_urls, trusted_hosts=self.trusted_hosts
+            )
+            atexit.register(session.close)
+            self._session = session
+        return self._session
+
+    def add_index_url(self, url: str) -> None:
+        """Add an index URL to the finder search scope.
+
+        Args:
+            url (str): The index URL to add.
+        """
+        self.sources.append({"url": url, "type": "index"})
+
+    def add_find_links(self, url: str) -> None:
+        """Add a find links URL to the finder search scope.
+
+        Args:
+            url (str): The find links URL to add.
+        """
+        self.sources.append({"url": url, "type": "find_links"})
 
     def build_evaluator(
         self,
@@ -174,24 +210,12 @@ def _sort_key(self, package: Package) -> tuple:
                 (self._tag_priorities.get(tag, pri - 1) for tag in file_tags),
                 default=pri - 1,
             )
-            if self.prefer_binary:
+            if canonicalize_name(package.name) in self.prefer_binary:
                 prefer_binary = True
-        comes_from = package.link.comes_from
-        source_index = len(self._source_order)
-
-        if comes_from is not None and self.respect_source_order:
-            source_index = next(
-                (
-                    i
-                    for i, url in enumerate(self._source_order)
-                    if comes_from.startswith(url)
-                ),
-                source_index,
-            )
+
         return (
             -int(link.is_yanked),
             int(prefer_binary),
-            -source_index,
             parse_version(package.version) if package.version is not None else 0,
             -pri,
             build_tag,
@@ -211,26 +235,39 @@ def _find_packages(
             hashes (dict[str, list[str]]|None): The hashes to filter on.
 
         Returns:
-            Iterable[Package]: The packages with the given name
+            Iterable[Package]: The packages with the given name, sorted by best match.
         """
         evaluator = self.build_evaluator(package_name, allow_yanked, hashes)
-        for index_url in self.index_urls:
-            package_link = self._build_index_page_link(index_url, package_name)
-            yield from self._evaluate_links(
-                collect_links_from_location(self.session, package_link), evaluator
-            )
-        for find_link in self.find_links:
-            link = self._build_find_link(find_link)
-            yield from self._evaluate_links(
-                collect_links_from_location(self.session, link, expand=True), evaluator
-            )
+
+        def find_one_source(source: Source) -> Iterable[Package]:
+            if source["type"] == "index":
+                link = self._build_index_page_link(source["url"], package_name)
+                result = self._evaluate_links(
+                    collect_links_from_location(self.session, link), evaluator
+                )
+            else:
+                link = self._build_find_link(source["url"])
+                result = self._evaluate_links(
+                    collect_links_from_location(self.session, link, expand=True),
+                    evaluator,
+                )
+            if self.respect_source_order:
+                # Sort the result within the individual source.
+                return sorted(result, key=self._sort_key, reverse=True)
+            return result
+
+        all_packages = itertools.chain.from_iterable(map(find_one_source, self.sources))
+        if self.respect_source_order:
+            return all_packages
+        # Otherwise, sort the result across all sources.
+        return sorted(all_packages, key=self._sort_key, reverse=True)
 
     def find_all_packages(
         self,
         package_name: str,
         allow_yanked: bool = False,
         hashes: dict[str, list[str]] | None = None,
-    ) -> list[Package]:
+    ) -> Sequence[Package]:
         """Find all packages with the given package name, best match first.
 
         Args:
@@ -239,13 +276,9 @@ def find_all_packages(
             hashes (dict[str, list[str]]|None): The hashes to filter on.
 
         Returns:
-            list[Package]: The packages list sorted by best match
+            Sequence[Package]: The packages list sorted by best match
         """
-        return sorted(
-            self._find_packages(package_name, allow_yanked, hashes),
-            key=self._sort_key,
-            reverse=True,
-        )
+        return LazySequence(self._find_packages(package_name, allow_yanked, hashes))
 
     def _find_packages_from_requirement(
         self,
@@ -266,7 +299,7 @@ def find_matches(
         allow_yanked: bool | None = None,
         allow_prereleases: bool | None = None,
         hashes: dict[str, list[str]] | None = None,
-    ) -> list[Package]:
+    ) -> Sequence[Package]:
         """Find all packages matching the given requirement, best match first.
 
         Args:
@@ -279,18 +312,16 @@ def find_matches(
             hashes (dict[str, list[str]]|None): The hashes to filter on.
 
         Returns:
-            list[Package]: The packages list sorted by best match
+            Sequence[Package]: The packages sorted by best match
         """
         if isinstance(requirement, str):
             requirement = packaging.requirements.Requirement(requirement)
-        return sorted(
+        return LazySequence(
             self._evaluate_packages(
                 self._find_packages_from_requirement(requirement, allow_yanked, hashes),
                 requirement,
                 allow_prereleases,
-            ),
-            key=self._sort_key,
-            reverse=True,
+            )
         )
 
     def find_best_match(
@@ -316,13 +347,14 @@ def find_best_match(
         """
         if isinstance(requirement, str):
             requirement = packaging.requirements.Requirement(requirement)
-        candidates = list(
-            self._find_packages_from_requirement(requirement, allow_yanked, hashes)
+        packages = self._find_packages_from_requirement(
+            requirement, allow_yanked, hashes
         )
-        applicable_candidates = list(
-            self._evaluate_packages(candidates, requirement, allow_prereleases)
+        candidates = LazySequence(packages)
+        applicable_candidates = LazySequence(
+            self._evaluate_packages(packages, requirement, allow_prereleases)
         )
-        best_match = max(applicable_candidates, key=self._sort_key, default=None)
+        best_match = next(iter(applicable_candidates), None)
         return BestMatch(best_match, applicable_candidates, candidates)
 
     def download_and_unpack(