Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: Avoid unnecessary URL processing while parsing links #13132

Merged
merged 1 commit
Jan 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/13132.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Optimize package collection by avoiding unnecessary URL parsing and other processing.
26 changes: 20 additions & 6 deletions src/pip/_internal/models/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,12 +170,23 @@ def _ensure_quoted_url(url: str) -> str:
and without double-quoting other characters.
"""
# Split the URL into parts according to the general structure
# `scheme://netloc/path;parameters?query#fragment`.
result = urllib.parse.urlparse(url)
# `scheme://netloc/path?query#fragment`.
result = urllib.parse.urlsplit(url)
# If the netloc is empty, then the URL refers to a local filesystem path.
is_local_path = not result.netloc
path = _clean_url_path(result.path, is_local_path=is_local_path)
return urllib.parse.urlunparse(result._replace(path=path))
return urllib.parse.urlunsplit(result._replace(path=path))


def _absolute_link_url(base_url: str, url: str) -> str:
"""
A faster implementation of urllib.parse.urljoin with a shortcut
for absolute http/https URLs.
"""
if url.startswith(("https://", "http://")):
return url
else:
return urllib.parse.urljoin(base_url, url)


@functools.total_ordering
Expand All @@ -185,6 +196,7 @@ class Link:
__slots__ = [
"_parsed_url",
"_url",
"_path",
"_hashes",
"comes_from",
"requires_python",
Expand Down Expand Up @@ -241,6 +253,8 @@ def __init__(
# Store the url as a private attribute to prevent accidentally
# trying to set a new value.
self._url = url
# The .path property is hot, so calculate its value ahead of time.
self._path = urllib.parse.unquote(self._parsed_url.path)

link_hash = LinkHash.find_hash_url_fragment(url)
hashes_from_link = {} if link_hash is None else link_hash.as_dict()
Expand Down Expand Up @@ -270,7 +284,7 @@ def from_json(
if file_url is None:
return None

url = _ensure_quoted_url(urllib.parse.urljoin(page_url, file_url))
url = _ensure_quoted_url(_absolute_link_url(page_url, file_url))
pyrequire = file_data.get("requires-python")
yanked_reason = file_data.get("yanked")
hashes = file_data.get("hashes", {})
Expand Down Expand Up @@ -322,7 +336,7 @@ def from_element(
if not href:
return None

url = _ensure_quoted_url(urllib.parse.urljoin(base_url, href))
url = _ensure_quoted_url(_absolute_link_url(base_url, href))
pyrequire = anchor_attribs.get("data-requires-python")
yanked_reason = anchor_attribs.get("data-yanked")

Expand Down Expand Up @@ -421,7 +435,7 @@ def netloc(self) -> str:

@property
def path(self) -> str:
return urllib.parse.unquote(self._parsed_url.path)
return self._path

def splitext(self) -> Tuple[str, str]:
return splitext(posixpath.basename(self.path.rstrip("/")))
Expand Down
Loading