From 921cbfd2165abdfe387bc283996ed9cde11f717d Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Wed, 24 Jul 2024 15:13:14 +0200 Subject: [PATCH 01/11] Verify that email headers are well-formed This should fail for custom fold() implementations that aren't careful about newlines. --- Lib/email/_policybase.py | 8 ++++++++ Lib/email/errors.py | 4 ++++ Lib/email/generator.py | 13 ++++++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/Lib/email/_policybase.py b/Lib/email/_policybase.py index 1c76ed63b61ae8..c7694a44e26639 100644 --- a/Lib/email/_policybase.py +++ b/Lib/email/_policybase.py @@ -157,6 +157,13 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta): message_factory -- the class to use to create new message objects. If the value is None, the default is Message. + verify_generated_headers + -- if true, the generator verifies that each header + is properly folded, so that a parser won't + treat it as multiple headers, start-of-body, or + part of another header. + This is a check against custom Header & fold() + implementations. """ raise_on_defect = False @@ -165,6 +172,7 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta): max_line_length = 78 mangle_from_ = False message_factory = None + verify_generated_headers = True def handle_defect(self, obj, defect): """Based on policy, either raise defect or call register_defect. diff --git a/Lib/email/errors.py b/Lib/email/errors.py index 3ad00565549968..02aa5eced6ae46 100644 --- a/Lib/email/errors.py +++ b/Lib/email/errors.py @@ -29,6 +29,10 @@ class CharsetError(MessageError): """An illegal charset was given.""" +class HeaderWriteError(MessageError): + """Error while writing headers.""" + + # These are parsing defects which the parser was able to work around. 
class MessageDefect(ValueError): """Base class for a message defect.""" diff --git a/Lib/email/generator.py b/Lib/email/generator.py index 9d058ceada24f8..42c84aa4da1044 100644 --- a/Lib/email/generator.py +++ b/Lib/email/generator.py @@ -14,12 +14,14 @@ from copy import deepcopy from io import StringIO, BytesIO from email.utils import _has_surrogates +from email.errors import HeaderWriteError UNDERSCORE = '_' NL = '\n' # XXX: no longer used by the code below. NLCRE = re.compile(r'\r\n|\r|\n') fcre = re.compile(r'^From ', re.MULTILINE) +NEWLINE_WITHOUT_FWSP = re.compile(r'\r\n[^ \t]|\r[^ \n\t]|\n[^ \t]') class Generator: @@ -222,7 +224,16 @@ def _dispatch(self, msg): def _write_headers(self, msg): for h, v in msg.raw_items(): - self.write(self.policy.fold(h, v)) + folded = self.policy.fold(h, v) + if self.policy.verify_generated_headers: + linesep = self.policy.linesep + if not folded.endswith(self.policy.linesep): + raise HeaderWriteError( + f'folded header does not end with {linesep!r}: {folded!r}') + if NEWLINE_WITHOUT_FWSP.search(folded.removesuffix(linesep)): + raise HeaderWriteError( + f'folded header contains newline: {folded!r}') + self.write(folded) # A blank line always separates headers from body self.write(self._NL) From bd7f922f6214364923aa5959ee5d9733f600ce85 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Wed, 24 Jul 2024 15:36:43 +0200 Subject: [PATCH 02/11] Encode header parts that contain newlines Per RFC 2047: > [...] these encoding schemes allow the > encoding of arbitrary octet values, mail readers that implement this > decoding should also ensure that display of the decoded data on the > recipient's terminal will not cause unwanted side-effects It seems that the "quoted-word" scheme is a valid way to include a newline character in a header value, just like we already allow undecodable bytes or control characters. They do need to be properly quoted when serialized to text, though. 
--- Credit for an earlier attempt: Co-Authored-By: Bas Bloemsaat --- Lib/email/_header_value_parser.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 7da1bbaf8a80d7..076dc55f8077ce 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -92,6 +92,8 @@ ASPECIALS = TSPECIALS | set("*'%") ATTRIBUTE_ENDS = ASPECIALS | WSP EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%') +NLSET = {'\n', '\r'} +SPECIALSNL = SPECIALS | NLSET def quote_string(value): return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"' @@ -2802,9 +2804,12 @@ def _refold_parse_tree(parse_tree, *, policy): wrap_as_ew_blocked -= 1 continue tstr = str(part) - if part.token_type == 'ptext' and set(tstr) & SPECIALS: + if part.token_type == 'ptext' and set(tstr) & SPECIALSNL: # Encode if tstr contains special characters. want_encoding = True + elif set(tstr) & NLSET: + # Encode if text contains newlines + want_encoding = True try: tstr.encode(encoding) charset = encoding From 59c06c3dd1a8435e76aa53d43d89ea9866181a4b Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Sat, 27 Jul 2024 15:50:42 +0200 Subject: [PATCH 03/11] Add tests and documentation --- Doc/library/email.policy.rst | 18 ++++++++ Doc/whatsnew/3.14.rst | 9 ++++ Lib/test/test_email/test_generator.py | 62 +++++++++++++++++++++++++++ Lib/test/test_email/test_policy.py | 27 ++++++++++++ 4 files changed, 116 insertions(+) diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index 83feedf728351e..9eac692f43aa01 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -229,6 +229,24 @@ added matters. To illustrate:: .. versionadded:: 3.6 + + .. 
attribute:: verify_generated_headers + + If :const:`True` (the default), the generator will raise + :exc:`~email.errors.HeaderWriteError` instead of writing a header + that is improperly folded or delimited, such that it would + be parsed as multiple headers or joined with adjacent data. + Such headers can be generated by custom header classes or bugs + in the ``email`` module. + + As it's a security feature, this defaults to ``True`` even in the + :class:`~email.policy.Compat32` policy. + For backwards compatible, but unsafe, behavior, it must be set to + ``False`` explicitly. + + .. versionadded:: 3.13 + + The following :class:`Policy` method is intended to be called by code using the email library to create policy instances with custom settings: diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index bd8bdcb6732fde..40b25af8a24fe7 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -267,6 +267,15 @@ collections.abc email ----- +* Headers with embedded newlines are now quoted on output. + + The :mod:`~email.generator` will now refuse to serialize (write) headers + that are improperly folded or delimited, such that they would be parsed as + multiple headers or joined with adjacent data. + If you need to turn this safety feature off, + set :attr:`~email.policy.Policy.verify_generated_headers`. + (Contributed by Bas Bloemsaat and Petr Viktorin in :gh:`121650`.) + * Remove the *isdst* parameter from :func:`email.utils.localtime`. (Contributed by Hugo van Kemenade in :gh:`118798`.) 
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index bc6f734d4fd0a9..c75a842c33578e 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -6,6 +6,7 @@ from email.generator import Generator, BytesGenerator from email.headerregistry import Address from email import policy +import email.errors from test.test_email import TestEmailBase, parameterize @@ -249,6 +250,44 @@ def test_rfc2231_wrapping_switches_to_default_len_if_too_narrow(self): g.flatten(msg) self.assertEqual(s.getvalue(), self.typ(expected)) + def test_keep_encoded_newlines(self): + msg = self.msgmaker(self.typ(textwrap.dedent("""\ + To: nobody + Subject: Bad subject=?UTF-8?Q?=0A?=Bcc: injection@example.com + + None + """))) + expected = textwrap.dedent("""\ + To: nobody + Subject: Bad subject=?UTF-8?Q?=0A?=Bcc: injection@example.com + + None + """) + s = self.ioclass() + g = self.genclass(s, policy=self.policy.clone(max_line_length=80)) + g.flatten(msg) + self.assertEqual(s.getvalue(), self.typ(expected)) + + def test_keep_long_encoded_newlines(self): + msg = self.msgmaker(self.typ(textwrap.dedent("""\ + To: nobody + Subject: Bad subject=?UTF-8?Q?=0A?=Bcc: injection@example.com + + None + """))) + expected = textwrap.dedent("""\ + To: nobody + Subject: Bad subject + =?utf-8?q?=0A?=Bcc: + injection@example.com + + None + """) + s = self.ioclass() + g = self.genclass(s, policy=self.policy.clone(max_line_length=30)) + g.flatten(msg) + self.assertEqual(s.getvalue(), self.typ(expected)) + class TestGenerator(TestGeneratorBase, TestEmailBase): @@ -273,6 +312,29 @@ def test_flatten_unicode_linesep(self): g.flatten(msg) self.assertEqual(s.getvalue(), self.typ(expected)) + def test_verify_generated_headers(self): + """gh-121650: by default the generator prevents header injection""" + class LiteralHeader(str): + name = 'Header' + def fold(self, **kwargs): + return self + + for text in ( + 'Value\r\nBad Injection\r\n', + 
'NoNewLine' + ): + with self.subTest(text=text): + message = message_from_string( + "Header: Value\r\n\r\nBody", + policy=self.policy, + ) + + del message['Header'] + message['Header'] = LiteralHeader(text) + + with self.assertRaises(email.errors.HeaderWriteError): + message.as_string() + class TestBytesGenerator(TestGeneratorBase, TestEmailBase): diff --git a/Lib/test/test_email/test_policy.py b/Lib/test/test_email/test_policy.py index c6b9c80efe1b54..1e32d85a218bbc 100644 --- a/Lib/test/test_email/test_policy.py +++ b/Lib/test/test_email/test_policy.py @@ -26,6 +26,7 @@ class PolicyAPITests(unittest.TestCase): 'raise_on_defect': False, 'mangle_from_': True, 'message_factory': None, + 'verify_generated_headers': True, } # These default values are the ones set on email.policy.default. # If any of these defaults change, the docs must be updated. @@ -294,6 +295,32 @@ def test_short_maxlen_error(self): with self.assertRaises(email.errors.HeaderParseError): policy.fold("Subject", subject) + def test_verify_generated_headers(self): + """Turning protection off allows header injection""" + policy = email.policy.default.clone(verify_generated_headers=False) + for text in ( + 'Header: Value\r\nBad: Injection\r\n', + 'Header: NoNewLine' + ): + with self.subTest(text=text): + message = email.message_from_string( + "Header: Value\r\n\r\nBody", + policy=policy, + ) + class LiteralHeader(str): + name = 'Header' + def fold(self, **kwargs): + return self + + del message['Header'] + message['Header'] = LiteralHeader(text) + + expected = text#.replace('\r\n', '\n') + self.assertEqual( + message.as_string(), + f"{text}\nBody", + ) + # XXX: Need subclassing tests. # For adding subclassed objects, make sure the usual rules apply (subclass # wins), but that the order still works (right overrides left). 
From 8e7d6f18f97f599326dc8964b5a61d49fd0e6e21 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Sat, 27 Jul 2024 16:10:45 +0200 Subject: [PATCH 04/11] Add a blurb --- .../Library/2024-07-27-16-10-41.gh-issue-121650.nf6oc9.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2024-07-27-16-10-41.gh-issue-121650.nf6oc9.rst diff --git a/Misc/NEWS.d/next/Library/2024-07-27-16-10-41.gh-issue-121650.nf6oc9.rst b/Misc/NEWS.d/next/Library/2024-07-27-16-10-41.gh-issue-121650.nf6oc9.rst new file mode 100644 index 00000000000000..83dd28d4ac575b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-27-16-10-41.gh-issue-121650.nf6oc9.rst @@ -0,0 +1,5 @@ +:mod:`email` headers with embedded newlines are now quoted on output. The +:mod:`~email.generator` will now refuse to serialize (write) headers that +are unsafely folded or delimited; see +:attr:`~email.policy.Policy.verify_generated_headers`. (Contributed by Bas +Bloemsaat and Petr Viktorin in :gh:`121650`.) From ef65562ecb31be7af25de53de56fc8a27eb08c39 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Sat, 27 Jul 2024 16:11:04 +0200 Subject: [PATCH 05/11] Add the news to 3.13; there's still hope for backporting --- Doc/whatsnew/3.13.rst | 9 +++++++++ Doc/whatsnew/3.14.rst | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index e89abfdd292f48..1a92f454fa9436 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -736,6 +736,15 @@ doctest email ----- +* Headers with embedded newlines are now quoted on output. + + The :mod:`~email.generator` will now refuse to serialize (write) headers + that are improperly folded or delimited, such that they would be parsed as + multiple headers or joined with adjacent data. + If you need to turn this safety feature off, + set :attr:`~email.policy.Policy.verify_generated_headers` to ``False``. + (Contributed by Bas Bloemsaat and Petr Viktorin in :gh:`121650`.) 
+ * :func:`email.utils.getaddresses` and :func:`email.utils.parseaddr` now return ``('', '')`` 2-tuples in more situations where invalid email addresses are encountered instead of potentially inaccurate values. Add optional *strict* diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 40b25af8a24fe7..bd8bdcb6732fde 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -267,15 +267,6 @@ collections.abc email ----- -* Headers with embedded newlines are now quoted on output. - - The :mod:`~email.generator` will now refuse to serialize (write) headers - that are improperly folded or delimited, such that they would be parsed as - multiple headers or joined with adjacent data. - If you need to turn this safety feature off, - set :attr:`~email.policy.Policy.verify_generated_headers`. - (Contributed by Bas Bloemsaat and Petr Viktorin in :gh:`121650`.) - * Remove the *isdst* parameter from :func:`email.utils.localtime`. (Contributed by Hugo van Kemenade in :gh:`118798`.) From 64dcb445ebad1f59342e51b0d58611985a6f9dcb Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Sat, 27 Jul 2024 16:26:32 +0200 Subject: [PATCH 06/11] Don't mark up ``True`` I'm not touching other instances in this file, since this PR might be backported to very old versions. --- Doc/library/email.policy.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index 9eac692f43aa01..314767d0802a08 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -232,7 +232,7 @@ added matters. To illustrate:: .. attribute:: verify_generated_headers - If :const:`True` (the default), the generator will raise + If ``True`` (the default), the generator will raise :exc:`~email.errors.HeaderWriteError` instead of writing a header that is improperly folded or delimited, such that it would be parsed as multiple headers or joined with adjacent data. 
From af4173361d85f6e7c9cdd4164fefda02d7d871b3 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Mon, 29 Jul 2024 15:24:35 +0200 Subject: [PATCH 07/11] Document HeaderWriteError --- Doc/library/email.errors.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Doc/library/email.errors.rst b/Doc/library/email.errors.rst index 33ab4265116178..f8f43d82a3df2e 100644 --- a/Doc/library/email.errors.rst +++ b/Doc/library/email.errors.rst @@ -58,6 +58,13 @@ The following exception classes are defined in the :mod:`email.errors` module: :class:`~email.mime.nonmultipart.MIMENonMultipart` (e.g. :class:`~email.mime.image.MIMEImage`). + +.. exception:: HeaderWriteError() + + Raised when an error occurs when the :mod:`~email.generator` outputs + headers. + + .. exception:: MessageDefect() This is the base class for all defects found when parsing email messages. From 596a25b66e10b821148d99098e427ed5734298e0 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 30 Jul 2024 09:29:10 +0200 Subject: [PATCH 08/11] Apply suggestions from code review Co-authored-by: Serhiy Storchaka --- Lib/email/_header_value_parser.py | 8 ++++---- Lib/test/test_email/test_policy.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 076dc55f8077ce..9119c27983f0cb 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2804,12 +2804,12 @@ def _refold_parse_tree(parse_tree, *, policy): wrap_as_ew_blocked -= 1 continue tstr = str(part) - if part.token_type == 'ptext' and set(tstr) & SPECIALSNL: + if part.token_type == 'ptext' # Encode if tstr contains special characters. 
- want_encoding = True - elif set(tstr) & NLSET: + want_encoding = not SPECIALSNL.isdisjoint(tstr) + else: # Encode if text contains newlines - want_encoding = True + want_encoding = not NLSET.isdisjoint(tstr) try: tstr.encode(encoding) charset = encoding diff --git a/Lib/test/test_email/test_policy.py b/Lib/test/test_email/test_policy.py index 1e32d85a218bbc..baa35fd68e49c5 100644 --- a/Lib/test/test_email/test_policy.py +++ b/Lib/test/test_email/test_policy.py @@ -315,7 +315,6 @@ def fold(self, **kwargs): del message['Header'] message['Header'] = LiteralHeader(text) - expected = text#.replace('\r\n', '\n') self.assertEqual( message.as_string(), f"{text}\nBody", From d7f5fc6499ce6495f8cc85656b1e463ded03a6d5 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 30 Jul 2024 09:30:48 +0200 Subject: [PATCH 09/11] Add missing colon --- Lib/email/_header_value_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 9119c27983f0cb..68e9e50b756d44 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2804,7 +2804,7 @@ def _refold_parse_tree(parse_tree, *, policy): wrap_as_ew_blocked -= 1 continue tstr = str(part) - if part.token_type == 'ptext' + if part.token_type == 'ptext': # Encode if tstr contains special characters. 
want_encoding = not SPECIALSNL.isdisjoint(tstr) else: From a768a812fb87aef95fb12a031f6720a65aaa74f1 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 30 Jul 2024 15:56:28 +0200 Subject: [PATCH 10/11] Only set want_encoding to True; don't reset it to False --- Lib/email/_header_value_parser.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 68e9e50b756d44..12e3f7f3e5495d 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2806,10 +2806,11 @@ def _refold_parse_tree(parse_tree, *, policy): tstr = str(part) if part.token_type == 'ptext': # Encode if tstr contains special characters. - want_encoding = not SPECIALSNL.isdisjoint(tstr) - else: - # Encode if text contains newlines - want_encoding = not NLSET.isdisjoint(tstr) + if not SPECIALSNL.isdisjoint(tstr): + want_encoding = True + elif not NLSET.isdisjoint(tstr): + # Encode if tstr contains newlines. + want_encoding = True try: tstr.encode(encoding) charset = encoding From f793b966f801605e56ff06f6b85eac0b8dd4aeea Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 30 Jul 2024 21:35:30 +0300 Subject: [PATCH 11/11] Update Lib/email/_header_value_parser.py --- Lib/email/_header_value_parser.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 12e3f7f3e5495d..ec2215a5e5f33c 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2804,13 +2804,13 @@ def _refold_parse_tree(parse_tree, *, policy): wrap_as_ew_blocked -= 1 continue tstr = str(part) - if part.token_type == 'ptext': - # Encode if tstr contains special characters. - if not SPECIALSNL.isdisjoint(tstr): - want_encoding = True - elif not NLSET.isdisjoint(tstr): - # Encode if tstr contains newlines. 
- want_encoding = True + if not want_encoding: + if part.token_type == 'ptext': + # Encode if tstr contains special characters. + want_encoding = not SPECIALSNL.isdisjoint(tstr) + else: + # Encode if tstr contains newlines. + want_encoding = not NLSET.isdisjoint(tstr) try: tstr.encode(encoding) charset = encoding