diff --git a/docs/html/reference/requirements-file-format.md b/docs/html/reference/requirements-file-format.md
index 01047587161..020a6e51b5b 100644
--- a/docs/html/reference/requirements-file-format.md
+++ b/docs/html/reference/requirements-file-format.md
@@ -56,9 +56,9 @@ examples of all these forms, see {ref}`pip install Examples`.
 
 ### Encoding
 
-Requirements files are `utf-8` encoding by default and also support
-{pep}`263` style comments to change the encoding (i.e.
-`# -*- coding: <encoding name> -*-`).
+The default encoding for requirement files is `UTF-8` unless a different
+encoding is specified using a {pep}`263` style comment (e.g. `# -*- coding:
+<encoding name> -*-`).
 
 ### Line continuations
 
diff --git a/news/12771.feature.rst b/news/12771.feature.rst
new file mode 100644
index 00000000000..68b2f14aade
--- /dev/null
+++ b/news/12771.feature.rst
@@ -0,0 +1,2 @@
+Reorder the encoding detection when decoding a requirements file, relying on
+UTF-8 over the locale encoding by default.
diff --git a/src/pip/_internal/req/req_file.py b/src/pip/_internal/req/req_file.py
index dee7f2fe81b..f6ba70fe7f6 100644
--- a/src/pip/_internal/req/req_file.py
+++ b/src/pip/_internal/req/req_file.py
@@ -2,11 +2,14 @@
 Requirements file parsing
 """
 
+import codecs
+import locale
 import logging
 import optparse
 import os
 import re
 import shlex
+import sys
 import urllib.parse
 from dataclasses import dataclass
 from optparse import Values
@@ -26,7 +29,6 @@
 from pip._internal.cli import cmdoptions
 from pip._internal.exceptions import InstallationError, RequirementsFileParseError
 from pip._internal.models.search_scope import SearchScope
-from pip._internal.utils.encoding import auto_decode
 
 if TYPE_CHECKING:
     from pip._internal.index.package_finder import PackageFinder
@@ -82,6 +84,21 @@
     str(o().dest) for o in SUPPORTED_OPTIONS_EDITABLE_REQ
 ]
 
+# order of BOMS is important: codecs.BOM_UTF16_LE is a prefix of codecs.BOM_UTF32_LE
+# so data.startswith(BOM_UTF16_LE) would be true for UTF32_LE data
+BOMS: List[Tuple[bytes, str]] = [
+    (codecs.BOM_UTF8, "utf-8"),
+    (codecs.BOM_UTF32, "utf-32"),
+    (codecs.BOM_UTF32_BE, "utf-32-be"),
+    (codecs.BOM_UTF32_LE, "utf-32-le"),
+    (codecs.BOM_UTF16, "utf-16"),
+    (codecs.BOM_UTF16_BE, "utf-16-be"),
+    (codecs.BOM_UTF16_LE, "utf-16-le"),
+]
+
+PEP263_ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
+DEFAULT_ENCODING = "utf-8"
+
 logger = logging.getLogger(__name__)
 
 
@@ -568,7 +585,39 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]:
         # Assume this is a bare path.
         try:
             with open(url, "rb") as f:
-                content = auto_decode(f.read())
+                raw_content = f.read()
         except OSError as exc:
             raise InstallationError(f"Could not open requirements file: {exc}")
+
+    content = _decode_req_file(raw_content, url)
     return url, content
+
+
+def _decode_req_file(data: bytes, url: str) -> str:
+    for bom, encoding in BOMS:
+        if data.startswith(bom):
+            return data[len(bom) :].decode(encoding)
+
+    for line in data.split(b"\n")[:2]:
+        if line[0:1] == b"#":
+            result = PEP263_ENCODING_RE.search(line)
+            if result is not None:
+                encoding = result.groups()[0].decode("ascii")
+                return data.decode(encoding)
+
+    try:
+        return data.decode(DEFAULT_ENCODING)
+    except UnicodeDecodeError:
+        locale_encoding = locale.getpreferredencoding(False) or sys.getdefaultencoding()
+        logging.warning(
+            "unable to decode data from %s with default encoding %s, "
+            "falling back to encoding from locale: %s. "
+            "If this is intentional you should specify the encoding with a "
+            "PEP-263 style comment, e.g. '# -*- coding: %s -*-'",
+            url,
+            DEFAULT_ENCODING,
+            locale_encoding,
+            locale_encoding,
+        )
+        return data.decode(locale_encoding)
diff --git a/src/pip/_internal/utils/encoding.py b/src/pip/_internal/utils/encoding.py
deleted file mode 100644
index 008f06a79bf..00000000000
--- a/src/pip/_internal/utils/encoding.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import codecs
-import locale
-import re
-import sys
-from typing import List, Tuple
-
-BOMS: List[Tuple[bytes, str]] = [
-    (codecs.BOM_UTF8, "utf-8"),
-    (codecs.BOM_UTF16, "utf-16"),
-    (codecs.BOM_UTF16_BE, "utf-16-be"),
-    (codecs.BOM_UTF16_LE, "utf-16-le"),
-    (codecs.BOM_UTF32, "utf-32"),
-    (codecs.BOM_UTF32_BE, "utf-32-be"),
-    (codecs.BOM_UTF32_LE, "utf-32-le"),
-]
-
-ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
-
-
-def auto_decode(data: bytes) -> str:
-    """Check a bytes string for a BOM to correctly detect the encoding
-
-    Fallback to locale.getpreferredencoding(False) like open() on Python3"""
-    for bom, encoding in BOMS:
-        if data.startswith(bom):
-            return data[len(bom) :].decode(encoding)
-    # Lets check the first two lines as in PEP263
-    for line in data.split(b"\n")[:2]:
-        if line[0:1] == b"#" and ENCODING_RE.search(line):
-            result = ENCODING_RE.search(line)
-            assert result is not None
-            encoding = result.groups()[0].decode("ascii")
-            return data.decode(encoding)
-    return data.decode(
-        locale.getpreferredencoding(False) or sys.getdefaultencoding(),
-    )
diff --git a/tests/unit/test_req_file.py b/tests/unit/test_req_file.py
index 1cc030681db..60b14940d27 100644
--- a/tests/unit/test_req_file.py
+++ b/tests/unit/test_req_file.py
@@ -1,3 +1,4 @@
+import codecs
 import collections
 import logging
 import os
@@ -955,3 +956,116 @@ def test_install_requirements_with_options(
         )
 
         assert req.global_options == [global_option]
+
+    @pytest.mark.parametrize(
+        "raw_req_file,expected_name,expected_spec",
+        [
+            pytest.param(
+                b"Django==1.4.2",
+                "Django",
+                "==1.4.2",
+                id="defaults to UTF-8",
+            ),
+            pytest.param(
+                "# coding=latin1\nDjango==1.4.2 # Pas trop de café".encode("latin-1"),
+                "Django",
+                "==1.4.2",
+                id="decodes based on PEP-263 style headers",
+            ),
+        ],
+    )
+    def test_general_decoding(
+        self,
+        raw_req_file: bytes,
+        expected_name: str,
+        expected_spec: str,
+        tmpdir: Path,
+        session: PipSession,
+    ) -> None:
+        req_file = tmpdir / "requirements.txt"
+        req_file.write_bytes(raw_req_file)
+
+        reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
+
+        assert len(reqs) == 1
+        assert reqs[0].name == expected_name
+        assert reqs[0].specifier == expected_spec
+
+    @pytest.mark.parametrize(
+        "bom,encoding",
+        [
+            (codecs.BOM_UTF8, "utf-8"),
+            (codecs.BOM_UTF16_BE, "utf-16-be"),
+            (codecs.BOM_UTF16_LE, "utf-16-le"),
+            (codecs.BOM_UTF32_BE, "utf-32-be"),
+            (codecs.BOM_UTF32_LE, "utf-32-le"),
+            # BOM automatically added when encoding byte-order dependent encodings
+            (b"", "utf-16"),
+            (b"", "utf-32"),
+        ],
+    )
+    def test_decoding_with_BOM(
+        self, bom: bytes, encoding: str, tmpdir: Path, session: PipSession
+    ) -> None:
+        req_name = "Django"
+        req_specifier = "==1.4.2"
+        encoded_contents = bom + f"{req_name}{req_specifier}".encode(encoding)
+        req_file = tmpdir / "requirements.txt"
+        req_file.write_bytes(encoded_contents)
+
+        reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
+
+        assert len(reqs) == 1
+        assert reqs[0].name == req_name
+        assert reqs[0].specifier == req_specifier
+
+    def test_warns_and_fallsback_to_locale_on_utf8_decode_fail(
+        self,
+        tmpdir: Path,
+        session: PipSession,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        # \xff is valid in latin-1 but not UTF-8
+        data = b"pip<=24.0 # some comment\xff\n"
+        locale_encoding = "latin-1"
+        req_file = tmpdir / "requirements.txt"
+        req_file.write_bytes(data)
+
+        # it's hard to rely on a locale definitely existing for testing
+        # so patch things out for simplicity
+        with caplog.at_level(logging.WARNING), mock.patch(
+            "locale.getpreferredencoding", return_value=locale_encoding
+        ):
+            reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
+
+        assert len(caplog.records) == 1
+        assert (
+            caplog.records[0].msg
+            == "unable to decode data from %s with default encoding %s, "
+            "falling back to encoding from locale: %s. "
+            "If this is intentional you should specify the encoding with a "
+            "PEP-263 style comment, e.g. '# -*- coding: %s -*-'"
+        )
+        assert caplog.records[0].args == (
+            str(req_file),
+            "utf-8",
+            locale_encoding,
+            locale_encoding,
+        )
+
+        assert len(reqs) == 1
+        assert reqs[0].name == "pip"
+        assert str(reqs[0].specifier) == "<=24.0"
+
+    @pytest.mark.parametrize("encoding", ["utf-8", "gbk"])
+    def test_errors_on_non_decodable_data(
+        self, encoding: str, tmpdir: Path, session: PipSession
+    ) -> None:
+        data = b"\xff"
+        req_file = tmpdir / "requirements.txt"
+        req_file.write_bytes(data)
+
+        with pytest.raises(UnicodeDecodeError), mock.patch(
+            "locale.getpreferredencoding", return_value=encoding
+        ):
+            next(parse_reqfile(req_file.resolve(), session=session))
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 6627a89496d..e2d1710739a 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -3,7 +3,6 @@
 """
 
-import codecs
 import os
 import shutil
 import stat
@@ -12,7 +11,7 @@
 from io import BytesIO
 from pathlib import Path
 from typing import Any, Callable, Iterator, List, NoReturn, Optional, Tuple, Type
-from unittest.mock import Mock, patch
+from unittest.mock import Mock
 
 import pytest
@@ -21,7 +20,6 @@
 from pip._internal.exceptions import HashMismatch, HashMissing, InstallationError
 from pip._internal.utils.deprecation import PipDeprecationWarning, deprecated
 from pip._internal.utils.egg_link import egg_link_path_from_location
-from pip._internal.utils.encoding import BOMS, auto_decode
 from pip._internal.utils.glibc import (
     glibc_version_string,
     glibc_version_string_confstr,
@@ -445,48 +443,6 @@ def test_has_one_of(self) -> None:
         assert not empty_hashes.has_one_of({"sha256": "xyzt"})
 
 
-class TestEncoding:
-    """Tests for pip._internal.utils.encoding"""
-
-    def test_auto_decode_utf_16_le(self) -> None:
-        data = (
-            b"\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00"
-            b"=\x001\x00.\x004\x00.\x002\x00"
-        )
-        assert data.startswith(codecs.BOM_UTF16_LE)
-        assert auto_decode(data) == "Django==1.4.2"
-
-    def test_auto_decode_utf_16_be(self) -> None:
-        data = (
-            b"\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00="
-            b"\x00=\x001\x00.\x004\x00.\x002"
-        )
-        assert data.startswith(codecs.BOM_UTF16_BE)
-        assert auto_decode(data) == "Django==1.4.2"
-
-    def test_auto_decode_no_bom(self) -> None:
-        assert auto_decode(b"foobar") == "foobar"
-
-    def test_auto_decode_pep263_headers(self) -> None:
-        latin1_req = "# coding=latin1\n# Pas trop de café"
-        assert auto_decode(latin1_req.encode("latin1")) == latin1_req
-
-    def test_auto_decode_no_preferred_encoding(self) -> None:
-        om, em = Mock(), Mock()
-        om.return_value = "ascii"
-        em.return_value = None
-        data = "data"
-        with patch("sys.getdefaultencoding", om):
-            with patch("locale.getpreferredencoding", em):
-                ret = auto_decode(data.encode(sys.getdefaultencoding()))
-        assert ret == data
-
-    @pytest.mark.parametrize("encoding", [encoding for bom, encoding in BOMS])
-    def test_all_encodings_are_valid(self, encoding: str) -> None:
-        # we really only care that there is no LookupError
-        assert "".encode(encoding).decode(encoding) == ""
-
-
 def raises(error: Type[Exception]) -> NoReturn:
     raise error