From 227d8e8dd2f103728519b3aa779a0020690f83f2 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Thu, 14 Sep 2023 16:35:54 -0400 Subject: [PATCH] handle metadata email parsing errors --- src/pip/_internal/exceptions.py | 19 ++++++++++ src/pip/_internal/metadata/__init__.py | 10 +++++- src/pip/_internal/metadata/base.py | 15 ++++++++ src/pip/_internal/operations/prepare.py | 46 ++++++++----------------- 4 files changed, 58 insertions(+), 32 deletions(-) diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py index d95fe44b34a..e08f7389a65 100644 --- a/src/pip/_internal/exceptions.py +++ b/src/pip/_internal/exceptions.py @@ -253,6 +253,25 @@ def __str__(self) -> str: ) +class CacheMetadataError(PipError): + """Raised when de/serializing a requirement into the metadata cache.""" + + def __init__( + self, + req: "InstallRequirement", + reason: str, + ) -> None: + """ + :param req: The requirement we attempted to cache. + :param reason: Context about the precise error that occurred. + """ + self.req = req + self.reason = reason + + def __str__(self) -> str: + return f"{self.reason} for {self.req} from {self.req.link}" + + class UserInstallationInvalid(InstallationError): """A --user install is requested on an environment without user site.""" diff --git a/src/pip/_internal/metadata/__init__.py b/src/pip/_internal/metadata/__init__.py index aa232b6cabd..2ed1fe8d64a 100644 --- a/src/pip/_internal/metadata/__init__.py +++ b/src/pip/_internal/metadata/__init__.py @@ -6,7 +6,14 @@ from pip._internal.utils.misc import strtobool -from .base import BaseDistribution, BaseEnvironment, FilesystemWheel, MemoryWheel, Wheel +from .base import ( + BaseDistribution, + BaseEnvironment, + FilesystemWheel, + MemoryWheel, + Wheel, + serialize_metadata, +) if TYPE_CHECKING: from typing import Literal, Protocol @@ -23,6 +30,7 @@ "get_environment", "get_wheel_distribution", "select_backend", + "serialize_metadata", ] diff --git a/src/pip/_internal/metadata/base.py b/src/pip/_internal/metadata/base.py index dea3d025d61..6a024542ee6 100644 --- a/src/pip/_internal/metadata/base.py +++ b/src/pip/_internal/metadata/base.py @@ -1,6 +1,9 @@ import csv +import email.generator import email.message +import email.policy import functools +import io import json import logging import pathlib @@ -97,6 +100,18 @@ def _convert_installed_files_path( return str(pathlib.Path(*info, *entry)) +def serialize_metadata(msg: email.message.Message) -> str: + """Write a dist's metadata to a string. + + Calling ``str(dist.metadata)`` may raise an error by misinterpreting RST directives + as email headers. This method uses the more robust ``email.policy.EmailPolicy`` to + avoid those parsing errors.""" + out = io.StringIO() + g = email.generator.Generator(out, policy=email.policy.EmailPolicy()) + g.flatten(msg) + return out.getvalue() + + class RequiresEntry(NamedTuple): requirement: str extra: str diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index fdaa4d04326..c4056e434cc 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -4,7 +4,6 @@ # The following comment should be removed at some point in the future. # mypy: strict-optional=False -import email.errors import gzip import json import mimetypes @@ -12,7 +11,7 @@ import shutil from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional, Tuple from pip._vendor.packaging.utils import canonicalize_name from pip._vendor.requests.exceptions import InvalidSchema @@ -20,6 +19,7 @@ from pip._internal.cache import LinkMetadataCache, should_cache from pip._internal.distributions import make_distribution_for_install_requirement from pip._internal.exceptions import ( + CacheMetadataError, DirectoryUrlHashUnsupported, HashMismatch, HashUnpinned, @@ -32,6 +32,7 @@ from pip._internal.metadata import ( BaseDistribution, get_metadata_distribution, + serialize_metadata, ) from pip._internal.models.direct_url import ArchiveInfo from pip._internal.models.link import Link @@ -230,7 +231,7 @@ class CacheableDist: def from_dist(cls, link: Link, dist: BaseDistribution) -> "CacheableDist": """Extract the serializable data necessary to generate a metadata-only dist.""" return cls( - metadata=str(dist.metadata), + metadata=serialize_metadata(dist.metadata), filename=Path(link.filename), canonical_name=dist.canonical_name, ) @@ -243,7 +244,7 @@ def to_dist(self) -> BaseDistribution: canonical_name=self.canonical_name, ) - def to_json(self) -> Dict[str, Any]: + def to_json(self) -> Dict[str, str]: return { "metadata": self.metadata, "filename": str(self.filename), @@ -251,7 +252,7 @@ def to_json(self) -> Dict[str, Any]: } @classmethod - def from_json(cls, args: Dict[str, Any]) -> "CacheableDist": + def from_json(cls, args: Dict[str, str]) -> "CacheableDist": return cls( metadata=args["metadata"], filename=Path(args["filename"]), @@ -458,17 +459,10 @@ def _fetch_cached_metadata( "found cached metadata for link %s at %s", req.link, f.name ) args = json.load(f) - cached_dist = CacheableDist.from_json(args) - return cached_dist.to_dist() - except (OSError, json.JSONDecodeError, KeyError) as e: - logger.exception( - "error reading cached metadata for link %s at %s %s(%s)", - req.link, - cached_path, - e.__class__.__name__, - str(e), - ) - raise + cached_dist = CacheableDist.from_json(args) + return cached_dist.to_dist() + except Exception: + raise CacheMetadataError(req, "error reading cached metadata") def _cache_metadata( self, @@ -490,23 +484,13 @@ def _cache_metadata( # containing directory for the cache file exists before writing. os.makedirs(str(cached_path.parent), exist_ok=True) try: + cacheable_dist = CacheableDist.from_dist(req.link, metadata_dist) + args = cacheable_dist.to_json() + logger.debug("caching metadata for link %s at %s", req.link, cached_path) with gzip.open(cached_path, mode="wt", encoding="utf-8") as f: - cacheable_dist = CacheableDist.from_dist(req.link, metadata_dist) - args = cacheable_dist.to_json() - logger.debug("caching metadata for link %s at %s", req.link, f.name) json.dump(args, f) - except (OSError, email.errors.HeaderParseError) as e: - # TODO: Some dists raise email.errors.HeaderParseError when calling str() or - # bytes() on the metadata, which is an email.Message. This is probably a bug - # in email parsing. - logger.exception( - "error caching metadata for dist %s from %s: %s(%s)", - metadata_dist, - req.link, - e.__class__.__name__, - str(e), - ) - raise + except Exception: + raise CacheMetadataError(req, "failed to serialize metadata") def _fetch_metadata_using_link_data_attr( self,