diff --git a/docs/html/reference/requirements-file-format.md b/docs/html/reference/requirements-file-format.md
index 01047587161..020a6e51b5b 100644
--- a/docs/html/reference/requirements-file-format.md
+++ b/docs/html/reference/requirements-file-format.md
@@ -56,9 +56,9 @@ examples of all these forms, see {ref}`pip install Examples`.
### Encoding
-Requirements files are `utf-8` encoding by default and also support
-{pep}`263` style comments to change the encoding (i.e.
-`# -*- coding: <encoding name> -*-`).
+The default encoding for requirement files is `UTF-8` unless a different
+encoding is specified using a {pep}`263` style comment (e.g. `# -*- coding:
+<encoding name> -*-`).
### Line continuations
diff --git a/news/12771.feature.rst b/news/12771.feature.rst
new file mode 100644
index 00000000000..68b2f14aade
--- /dev/null
+++ b/news/12771.feature.rst
@@ -0,0 +1,2 @@
+Reorder the encoding detection when decoding a requirements file, relying on
+UTF-8 over the locale encoding by default.
diff --git a/src/pip/_internal/req/req_file.py b/src/pip/_internal/req/req_file.py
index dee7f2fe81b..f6ba70fe7f6 100644
--- a/src/pip/_internal/req/req_file.py
+++ b/src/pip/_internal/req/req_file.py
@@ -2,11 +2,14 @@
Requirements file parsing
"""
+import codecs
+import locale
import logging
import optparse
import os
import re
import shlex
+import sys
import urllib.parse
from dataclasses import dataclass
from optparse import Values
@@ -26,7 +29,6 @@
from pip._internal.cli import cmdoptions
from pip._internal.exceptions import InstallationError, RequirementsFileParseError
from pip._internal.models.search_scope import SearchScope
-from pip._internal.utils.encoding import auto_decode
if TYPE_CHECKING:
from pip._internal.index.package_finder import PackageFinder
@@ -82,6 +84,21 @@
str(o().dest) for o in SUPPORTED_OPTIONS_EDITABLE_REQ
]
+# order of BOMS is important: codecs.BOM_UTF16_LE is a prefix of codecs.BOM_UTF32_LE
+# so data.startswith(BOM_UTF16_LE) would be true for UTF32_LE data
+BOMS: List[Tuple[bytes, str]] = [
+ (codecs.BOM_UTF8, "utf-8"),
+ (codecs.BOM_UTF32, "utf-32"),
+ (codecs.BOM_UTF32_BE, "utf-32-be"),
+ (codecs.BOM_UTF32_LE, "utf-32-le"),
+ (codecs.BOM_UTF16, "utf-16"),
+ (codecs.BOM_UTF16_BE, "utf-16-be"),
+ (codecs.BOM_UTF16_LE, "utf-16-le"),
+]
+
+PEP263_ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
+DEFAULT_ENCODING = "utf-8"
+
logger = logging.getLogger(__name__)
@@ -568,7 +585,39 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]:
# Assume this is a bare path.
try:
with open(url, "rb") as f:
- content = auto_decode(f.read())
+ raw_content = f.read()
except OSError as exc:
raise InstallationError(f"Could not open requirements file: {exc}")
+
+ content = _decode_req_file(raw_content, url)
+
return url, content
+
+
+def _decode_req_file(data: bytes, url: str) -> str:
+ for bom, encoding in BOMS:
+ if data.startswith(bom):
+ return data[len(bom) :].decode(encoding)
+
+ for line in data.split(b"\n")[:2]:
+ if line[0:1] == b"#":
+ result = PEP263_ENCODING_RE.search(line)
+ if result is not None:
+ encoding = result.groups()[0].decode("ascii")
+ return data.decode(encoding)
+
+ try:
+ return data.decode(DEFAULT_ENCODING)
+ except UnicodeDecodeError:
+ locale_encoding = locale.getpreferredencoding(False) or sys.getdefaultencoding()
+        logger.warning(
+ "unable to decode data from %s with default encoding %s, "
+ "falling back to encoding from locale: %s. "
+ "If this is intentional you should specify the encoding with a "
+ "PEP-263 style comment, e.g. '# -*- coding: %s -*-'",
+ url,
+ DEFAULT_ENCODING,
+ locale_encoding,
+ locale_encoding,
+ )
+ return data.decode(locale_encoding)
diff --git a/src/pip/_internal/utils/encoding.py b/src/pip/_internal/utils/encoding.py
deleted file mode 100644
index 008f06a79bf..00000000000
--- a/src/pip/_internal/utils/encoding.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import codecs
-import locale
-import re
-import sys
-from typing import List, Tuple
-
-BOMS: List[Tuple[bytes, str]] = [
- (codecs.BOM_UTF8, "utf-8"),
- (codecs.BOM_UTF16, "utf-16"),
- (codecs.BOM_UTF16_BE, "utf-16-be"),
- (codecs.BOM_UTF16_LE, "utf-16-le"),
- (codecs.BOM_UTF32, "utf-32"),
- (codecs.BOM_UTF32_BE, "utf-32-be"),
- (codecs.BOM_UTF32_LE, "utf-32-le"),
-]
-
-ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
-
-
-def auto_decode(data: bytes) -> str:
- """Check a bytes string for a BOM to correctly detect the encoding
-
- Fallback to locale.getpreferredencoding(False) like open() on Python3"""
- for bom, encoding in BOMS:
- if data.startswith(bom):
- return data[len(bom) :].decode(encoding)
- # Lets check the first two lines as in PEP263
- for line in data.split(b"\n")[:2]:
- if line[0:1] == b"#" and ENCODING_RE.search(line):
- result = ENCODING_RE.search(line)
- assert result is not None
- encoding = result.groups()[0].decode("ascii")
- return data.decode(encoding)
- return data.decode(
- locale.getpreferredencoding(False) or sys.getdefaultencoding(),
- )
diff --git a/tests/unit/test_req_file.py b/tests/unit/test_req_file.py
index 1cc030681db..60b14940d27 100644
--- a/tests/unit/test_req_file.py
+++ b/tests/unit/test_req_file.py
@@ -1,3 +1,4 @@
+import codecs
import collections
import logging
import os
@@ -955,3 +956,116 @@ def test_install_requirements_with_options(
)
assert req.global_options == [global_option]
+
+ @pytest.mark.parametrize(
+ "raw_req_file,expected_name,expected_spec",
+ [
+ pytest.param(
+ b"Django==1.4.2",
+ "Django",
+ "==1.4.2",
+ id="defaults to UTF-8",
+ ),
+ pytest.param(
+ "# coding=latin1\nDjango==1.4.2 # Pas trop de café".encode("latin-1"),
+ "Django",
+ "==1.4.2",
+ id="decodes based on PEP-263 style headers",
+ ),
+ ],
+ )
+ def test_general_decoding(
+ self,
+ raw_req_file: bytes,
+ expected_name: str,
+ expected_spec: str,
+ tmpdir: Path,
+ session: PipSession,
+ ) -> None:
+ req_file = tmpdir / "requirements.txt"
+ req_file.write_bytes(raw_req_file)
+
+ reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
+
+ assert len(reqs) == 1
+ assert reqs[0].name == expected_name
+ assert reqs[0].specifier == expected_spec
+
+ @pytest.mark.parametrize(
+ "bom,encoding",
+ [
+ (codecs.BOM_UTF8, "utf-8"),
+ (codecs.BOM_UTF16_BE, "utf-16-be"),
+ (codecs.BOM_UTF16_LE, "utf-16-le"),
+ (codecs.BOM_UTF32_BE, "utf-32-be"),
+ (codecs.BOM_UTF32_LE, "utf-32-le"),
+ # BOM automatically added when encoding byte-order dependent encodings
+ (b"", "utf-16"),
+ (b"", "utf-32"),
+ ],
+ )
+ def test_decoding_with_BOM(
+ self, bom: bytes, encoding: str, tmpdir: Path, session: PipSession
+ ) -> None:
+ req_name = "Django"
+ req_specifier = "==1.4.2"
+ encoded_contents = bom + f"{req_name}{req_specifier}".encode(encoding)
+ req_file = tmpdir / "requirements.txt"
+ req_file.write_bytes(encoded_contents)
+
+ reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
+
+ assert len(reqs) == 1
+ assert reqs[0].name == req_name
+ assert reqs[0].specifier == req_specifier
+
+ def test_warns_and_fallsback_to_locale_on_utf8_decode_fail(
+ self,
+ tmpdir: Path,
+ session: PipSession,
+ caplog: pytest.LogCaptureFixture,
+ ) -> None:
+ # \xff is valid in latin-1 but not UTF-8
+ data = b"pip<=24.0 # some comment\xff\n"
+ locale_encoding = "latin-1"
+ req_file = tmpdir / "requirements.txt"
+ req_file.write_bytes(data)
+
+ # it's hard to rely on a locale definitely existing for testing
+ # so patch things out for simplicity
+ with caplog.at_level(logging.WARNING), mock.patch(
+ "locale.getpreferredencoding", return_value=locale_encoding
+ ):
+ reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
+
+ assert len(caplog.records) == 1
+ assert (
+ caplog.records[0].msg
+ == "unable to decode data from %s with default encoding %s, "
+ "falling back to encoding from locale: %s. "
+ "If this is intentional you should specify the encoding with a "
+ "PEP-263 style comment, e.g. '# -*- coding: %s -*-'"
+ )
+ assert caplog.records[0].args == (
+ str(req_file),
+ "utf-8",
+ locale_encoding,
+ locale_encoding,
+ )
+
+ assert len(reqs) == 1
+ assert reqs[0].name == "pip"
+ assert str(reqs[0].specifier) == "<=24.0"
+
+ @pytest.mark.parametrize("encoding", ["utf-8", "gbk"])
+ def test_errors_on_non_decodable_data(
+ self, encoding: str, tmpdir: Path, session: PipSession
+ ) -> None:
+ data = b"\xff"
+ req_file = tmpdir / "requirements.txt"
+ req_file.write_bytes(data)
+
+ with pytest.raises(UnicodeDecodeError), mock.patch(
+ "locale.getpreferredencoding", return_value=encoding
+ ):
+ next(parse_reqfile(req_file.resolve(), session=session))
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 6627a89496d..e2d1710739a 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -3,7 +3,6 @@
"""
-import codecs
import os
import shutil
import stat
@@ -12,7 +11,7 @@
from io import BytesIO
from pathlib import Path
from typing import Any, Callable, Iterator, List, NoReturn, Optional, Tuple, Type
-from unittest.mock import Mock, patch
+from unittest.mock import Mock
import pytest
@@ -21,7 +20,6 @@
from pip._internal.exceptions import HashMismatch, HashMissing, InstallationError
from pip._internal.utils.deprecation import PipDeprecationWarning, deprecated
from pip._internal.utils.egg_link import egg_link_path_from_location
-from pip._internal.utils.encoding import BOMS, auto_decode
from pip._internal.utils.glibc import (
glibc_version_string,
glibc_version_string_confstr,
@@ -445,48 +443,6 @@ def test_has_one_of(self) -> None:
assert not empty_hashes.has_one_of({"sha256": "xyzt"})
-class TestEncoding:
- """Tests for pip._internal.utils.encoding"""
-
- def test_auto_decode_utf_16_le(self) -> None:
- data = (
- b"\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00"
- b"=\x001\x00.\x004\x00.\x002\x00"
- )
- assert data.startswith(codecs.BOM_UTF16_LE)
- assert auto_decode(data) == "Django==1.4.2"
-
- def test_auto_decode_utf_16_be(self) -> None:
- data = (
- b"\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00="
- b"\x00=\x001\x00.\x004\x00.\x002"
- )
- assert data.startswith(codecs.BOM_UTF16_BE)
- assert auto_decode(data) == "Django==1.4.2"
-
- def test_auto_decode_no_bom(self) -> None:
- assert auto_decode(b"foobar") == "foobar"
-
- def test_auto_decode_pep263_headers(self) -> None:
- latin1_req = "# coding=latin1\n# Pas trop de café"
- assert auto_decode(latin1_req.encode("latin1")) == latin1_req
-
- def test_auto_decode_no_preferred_encoding(self) -> None:
- om, em = Mock(), Mock()
- om.return_value = "ascii"
- em.return_value = None
- data = "data"
- with patch("sys.getdefaultencoding", om):
- with patch("locale.getpreferredencoding", em):
- ret = auto_decode(data.encode(sys.getdefaultencoding()))
- assert ret == data
-
- @pytest.mark.parametrize("encoding", [encoding for bom, encoding in BOMS])
- def test_all_encodings_are_valid(self, encoding: str) -> None:
- # we really only care that there is no LookupError
- assert "".encode(encoding).decode(encoding) == ""
-
-
def raises(error: Type[Exception]) -> NoReturn:
raise error