From 86d6c68d43e5c3dcb4abeeb790b58afdbfb32abc Mon Sep 17 00:00:00 2001
From: "Miss Islington (bot)"
 <31488909+miss-islington@users.noreply.github.com>
Date: Sun, 3 Nov 2024 15:30:16 +0100
Subject: [PATCH] [3.13] gh-104400: Add more tests to pygettext (GH-108173)
 (GH-126361)

(cherry picked from commit dcae5cd6abaae4f73e656ebc054f30d3f15ca7b8)

Co-authored-by: Tomas R <tomas.roun8@gmail.com>
---
 Lib/test/test_tools/i18n_data/docstrings.pot |  40 +++++++
 Lib/test/test_tools/i18n_data/docstrings.py  |  41 +++++++
 Lib/test/test_tools/i18n_data/fileloc.pot    |  35 ++++++
 Lib/test/test_tools/i18n_data/fileloc.py     |  26 +++++
 Lib/test/test_tools/i18n_data/messages.pot   |  67 +++++++++++
 Lib/test/test_tools/i18n_data/messages.py    |  64 +++++++++++
 Lib/test/test_tools/test_i18n.py             | 110 +++++++++++++++----
 Makefile.pre.in                              |   1 +
 8 files changed, 363 insertions(+), 21 deletions(-)
 create mode 100644 Lib/test/test_tools/i18n_data/docstrings.pot
 create mode 100644 Lib/test/test_tools/i18n_data/docstrings.py
 create mode 100644 Lib/test/test_tools/i18n_data/fileloc.pot
 create mode 100644 Lib/test/test_tools/i18n_data/fileloc.py
 create mode 100644 Lib/test/test_tools/i18n_data/messages.pot
 create mode 100644 Lib/test/test_tools/i18n_data/messages.py

diff --git a/Lib/test/test_tools/i18n_data/docstrings.pot b/Lib/test/test_tools/i18n_data/docstrings.pot
new file mode 100644
index 00000000000000..5af1d41422ff62
--- /dev/null
+++ b/Lib/test/test_tools/i18n_data/docstrings.pot
@@ -0,0 +1,40 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR ORGANIZATION
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2000-01-01 00:00+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: pygettext.py 1.5\n"
+
+
+#: docstrings.py:7
+#, docstring
+msgid ""
+msgstr ""
+
+#: docstrings.py:18
+#, docstring
+msgid ""
+"multiline\n"
+"    docstring\n"
+"    "
+msgstr ""
+
+#: docstrings.py:25
+#, docstring
+msgid "docstring1"
+msgstr ""
+
+#: docstrings.py:30
+#, docstring
+msgid "Hello, {}!"
+msgstr ""
+
diff --git a/Lib/test/test_tools/i18n_data/docstrings.py b/Lib/test/test_tools/i18n_data/docstrings.py
new file mode 100644
index 00000000000000..85d7f159d37775
--- /dev/null
+++ b/Lib/test/test_tools/i18n_data/docstrings.py
@@ -0,0 +1,41 @@
+# Test docstring extraction
+from gettext import gettext as _
+
+
+# Empty docstring
+def test(x):
+    """"""
+
+
+# Leading empty line
+def test2(x):
+
+    """docstring"""  # XXX This should be extracted but isn't.
+
+
+# XXX Multiline docstrings should be cleaned with `inspect.cleandoc`.
+def test3(x):
+    """multiline
+    docstring
+    """
+
+
+# Multiple docstrings - only the first should be extracted
+def test4(x):
+    """docstring1"""
+    """docstring2"""
+
+
+def test5(x):
+    """Hello, {}!""".format("world!")  # XXX This should not be extracted.
+
+
+# Nested docstrings
+def test6(x):
+    def inner(y):
+        """nested docstring"""  # XXX This should be extracted but isn't.
+
+
+class Outer:
+    class Inner:
+        "nested class docstring"  # XXX This should be extracted but isn't.
diff --git a/Lib/test/test_tools/i18n_data/fileloc.pot b/Lib/test/test_tools/i18n_data/fileloc.pot
new file mode 100644
index 00000000000000..dbd28687a73556
--- /dev/null
+++ b/Lib/test/test_tools/i18n_data/fileloc.pot
@@ -0,0 +1,35 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR ORGANIZATION
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2000-01-01 00:00+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: pygettext.py 1.5\n"
+
+
+#: fileloc.py:5 fileloc.py:6
+msgid "foo"
+msgstr ""
+
+#: fileloc.py:9
+msgid "bar"
+msgstr ""
+
+#: fileloc.py:14 fileloc.py:18
+#, docstring
+msgid "docstring"
+msgstr ""
+
+#: fileloc.py:22 fileloc.py:26
+#, docstring
+msgid "baz"
+msgstr ""
+
diff --git a/Lib/test/test_tools/i18n_data/fileloc.py b/Lib/test/test_tools/i18n_data/fileloc.py
new file mode 100644
index 00000000000000..c5d4d0595fea52
--- /dev/null
+++ b/Lib/test/test_tools/i18n_data/fileloc.py
@@ -0,0 +1,26 @@
+# Test file locations
+from gettext import gettext as _
+
+# Duplicate strings
+_('foo')
+_('foo')
+
+# Duplicate strings on the same line should only add one location to the output
+_('bar'), _('bar')
+
+
+# Duplicate docstrings
+class A:
+    """docstring"""
+
+
+def f():
+    """docstring"""
+
+
+# Duplicate message and docstring
+_('baz')
+
+
+def g():
+    """baz"""
diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot
new file mode 100644
index 00000000000000..ddfbd18349ef4f
--- /dev/null
+++ b/Lib/test/test_tools/i18n_data/messages.pot
@@ -0,0 +1,67 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR ORGANIZATION
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2000-01-01 00:00+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: pygettext.py 1.5\n"
+
+
+#: messages.py:5
+msgid ""
+msgstr ""
+
+#: messages.py:8 messages.py:9
+msgid "parentheses"
+msgstr ""
+
+#: messages.py:12
+msgid "Hello, world!"
+msgstr ""
+
+#: messages.py:15
+msgid ""
+"Hello,\n"
+"    multiline!\n"
+msgstr ""
+
+#: messages.py:29
+msgid "Hello, {}!"
+msgstr ""
+
+#: messages.py:33
+msgid "1"
+msgstr ""
+
+#: messages.py:33
+msgid "2"
+msgstr ""
+
+#: messages.py:34 messages.py:35
+msgid "A"
+msgstr ""
+
+#: messages.py:34 messages.py:35
+msgid "B"
+msgstr ""
+
+#: messages.py:36
+msgid "set"
+msgstr ""
+
+#: messages.py:42
+msgid "nested string"
+msgstr ""
+
+#: messages.py:47
+msgid "baz"
+msgstr ""
+
diff --git a/Lib/test/test_tools/i18n_data/messages.py b/Lib/test/test_tools/i18n_data/messages.py
new file mode 100644
index 00000000000000..f220294b8d5c67
--- /dev/null
+++ b/Lib/test/test_tools/i18n_data/messages.py
@@ -0,0 +1,64 @@
+# Test message extraction
+from gettext import gettext as _
+
+# Empty string
+_("")
+
+# Extra parentheses
+(_("parentheses"))
+((_("parentheses")))
+
+# Multiline strings
+_("Hello, "
+  "world!")
+
+_("""Hello,
+    multiline!
+""")
+
+# Invalid arguments
+_()
+_(None)
+_(1)
+_(False)
+_(x="kwargs are not allowed")
+_("foo", "bar")
+_("something", x="something else")
+
+# .format()
+_("Hello, {}!").format("world")  # valid
+_("Hello, {}!".format("world"))  # invalid
+
+# Nested structures
+_("1"), _("2")
+arr = [_("A"), _("B")]
+obj = {'a': _("A"), 'b': _("B")}
+{{{_('set')}}}
+
+
+# Nested functions and classes
+def test():
+    _("nested string")  # XXX This should be extracted but isn't.
+    [_("nested string")]
+
+
+class Foo:
+    def bar(self):
+        return _("baz")
+
+
+def bar(x=_('default value')):  # XXX This should be extracted but isn't.
+    pass
+
+
+def baz(x=[_('default value')]):  # XXX This should be extracted but isn't.
+    pass
+
+
+# Shadowing _()
+def _(x):
+    pass
+
+
+def _(x="don't extract me"):
+    pass
diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py
index c083a04475e726..21dead8f943bb7 100644
--- a/Lib/test/test_tools/test_i18n.py
+++ b/Lib/test/test_tools/test_i18n.py
@@ -1,9 +1,11 @@
 """Tests to cover the Tools/i18n package"""
 
 import os
+import re
 import sys
 import unittest
 from textwrap import dedent
+from pathlib import Path
 
 from test.support.script_helper import assert_python_ok
 from test.test_tools import skip_if_missing, toolsdir
@@ -12,20 +14,47 @@
 
 skip_if_missing()
 
+DATA_DIR = Path(__file__).resolve().parent / 'i18n_data'
+
+
+def normalize_POT_file(pot):
+    """Return *pot* with its volatile parts replaced by fixed values.
+
+    The POT-Creation-Date header and the Content-Type charset are pinned,
+    and os.sep in '#:' location lines becomes '/', so snapshots compare equal.
+    """
+    # Pin the creation timestamp.
+    pot = re.sub(r'"POT-Creation-Date: .+?\\n"',
+                 r'"POT-Creation-Date: 2000-01-01 00:00+0000\\n"',
+                 pot)
+
+    # Pin the charset to UTF-8 (currently there's no way to specify the output charset).
+    pot = re.sub(r'"Content-Type: text/plain; charset=.+?\\n"',
+                 r'"Content-Type: text/plain; charset=UTF-8\\n"',
+                 pot)
+
+    # Use '/' in file locations in case this test is
+    # running on Windows (where os.sep is '\').
+
+    def use_forward_slashes(match):
+        return match[0].replace(os.sep, "/")
+
+    return re.sub(r'#:.+', use_forward_slashes, pot)
+
 
 class Test_pygettext(unittest.TestCase):
     """Tests for the pygettext.py tool"""
 
-    script = os.path.join(toolsdir,'i18n', 'pygettext.py')
+    script = Path(toolsdir, 'i18n', 'pygettext.py')
 
     def get_header(self, data):
         """ utility: return the header of a .po file as a dictionary """
         headers = {}
         for line in data.split('\n'):
-            if not line or line.startswith(('#', 'msgid','msgstr')):
+            if not line or line.startswith(('#', 'msgid', 'msgstr')):
                 continue
             line = line.strip('"')
-            key, val = line.split(':',1)
+            key, val = line.split(':', 1)
             headers[key] = val.strip()
         return headers
 
@@ -53,13 +82,18 @@ def get_msgids(self, data):
 
         return msgids
 
+    def assert_POT_equal(self, expected, actual):
+        """Assert that two POT file contents are equal once both are normalized."""
+        self.maxDiff = None  # do not truncate the diff shown on failure
+        self.assertEqual(normalize_POT_file(expected), normalize_POT_file(actual))
+
     def extract_docstrings_from_str(self, module_content):
         """ utility: return all msgids extracted from module_content """
         filename = 'test_docstrings.py'
         with temp_cwd(None) as cwd:
             with open(filename, 'w', encoding='utf-8') as fp:
                 fp.write(module_content)
-            assert_python_ok(self.script, '-D', filename)
+            assert_python_ok('-Xutf8', self.script, '-D', filename)
             with open('messages.pot', encoding='utf-8') as fp:
                 data = fp.read()
         return self.get_msgids(data)
@@ -69,7 +103,7 @@ def test_header(self):
            http://www.gnu.org/software/gettext/manual/gettext.html#Header-Entry
         """
         with temp_cwd(None) as cwd:
-            assert_python_ok(self.script)
+            assert_python_ok('-Xutf8', self.script)
             with open('messages.pot', encoding='utf-8') as fp:
                 data = fp.read()
             header = self.get_header(data)
@@ -96,7 +130,7 @@ def test_POT_Creation_Date(self):
         """ Match the date format from xgettext for POT-Creation-Date """
         from datetime import datetime
         with temp_cwd(None) as cwd:
-            assert_python_ok(self.script)
+            assert_python_ok('-Xutf8', self.script)
             with open('messages.pot', encoding='utf-8') as fp:
                 data = fp.read()
             header = self.get_header(data)
@@ -310,6 +344,20 @@ def test_calls_in_fstring_with_partially_wrong_expression(self):
         self.assertNotIn('foo', msgids)
         self.assertIn('bar', msgids)
 
+    def test_pygettext_output(self):
+        """Check pygettext output for each i18n_data/*.py against its .pot snapshot."""
+        for input_file in DATA_DIR.glob('*.py'):
+            output_file = input_file.with_suffix('.pot')
+            with self.subTest(input_file=f'i18n_data/{input_file.name}'):
+                # Copy as bytes: no locale-dependent re-encoding of the fixture.
+                contents = input_file.read_bytes()
+                with temp_cwd(None):
+                    Path(input_file.name).write_bytes(contents)
+                    assert_python_ok('-Xutf8', self.script, '--docstrings', input_file.name)
+                    output = Path('messages.pot').read_text(encoding='utf-8')
+
+                self.assert_POT_equal(output_file.read_text(encoding='utf-8'), output)
+
     def test_files_list(self):
         """Make sure the directories are inspected for source files
            bpo-31920
@@ -318,21 +366,41 @@ def test_files_list(self):
         text2 = 'Text to translate2'
         text3 = 'Text to ignore'
         with temp_cwd(None), temp_dir(None) as sdir:
-            os.mkdir(os.path.join(sdir, 'pypkg'))
-            with open(os.path.join(sdir, 'pypkg', 'pymod.py'), 'w',
-                      encoding='utf-8') as sfile:
-                sfile.write(f'_({text1!r})')
-            os.mkdir(os.path.join(sdir, 'pkg.py'))
-            with open(os.path.join(sdir, 'pkg.py', 'pymod2.py'), 'w',
-                      encoding='utf-8') as sfile:
-                sfile.write(f'_({text2!r})')
-            os.mkdir(os.path.join(sdir, 'CVS'))
-            with open(os.path.join(sdir, 'CVS', 'pymod3.py'), 'w',
-                      encoding='utf-8') as sfile:
-                sfile.write(f'_({text3!r})')
-            assert_python_ok(self.script, sdir)
-            with open('messages.pot', encoding='utf-8') as fp:
-                data = fp.read()
+            pymod = Path(sdir, 'pypkg', 'pymod.py')
+            pymod.parent.mkdir()
+            pymod.write_text(f'_({text1!r})', encoding='utf-8')
+
+            pymod2 = Path(sdir, 'pkg.py', 'pymod2.py')
+            pymod2.parent.mkdir()
+            pymod2.write_text(f'_({text2!r})', encoding='utf-8')
+
+            pymod3 = Path(sdir, 'CVS', 'pymod3.py')
+            pymod3.parent.mkdir()
+            pymod3.write_text(f'_({text3!r})', encoding='utf-8')
+
+            assert_python_ok('-Xutf8', self.script, sdir)
+            data = Path('messages.pot').read_text(encoding='utf-8')
             self.assertIn(f'msgid "{text1}"', data)
             self.assertIn(f'msgid "{text2}"', data)
             self.assertNotIn(text3, data)
+
+
+def update_POT_snapshots():
+    """Regenerate the '.pot' snapshot next to every '*.py' file in DATA_DIR."""
+    for input_file in DATA_DIR.glob('*.py'):
+        output_file = input_file.with_suffix('.pot')
+        with temp_cwd(None):
+            Path(input_file.name).write_bytes(input_file.read_bytes())
+            assert_python_ok('-Xutf8', Test_pygettext.script, '--docstrings', input_file.name)
+            output = Path('messages.pot').read_text(encoding='utf-8')
+
+        output = normalize_POT_file(output)
+        output_file.write_text(output, encoding='utf-8')
+
+
+if __name__ == '__main__':
+    # Pass --snapshot-update to regenerate the POT snapshots instead of running tests.
+    if len(sys.argv) > 1 and sys.argv[1] == '--snapshot-update':
+        update_POT_snapshots()
+        sys.exit(0)
+    unittest.main()
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 445fa6381c20a4..32a338aa5c879b 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -2476,6 +2476,7 @@ TESTSUBDIRS=	idlelib/idle_test \
 		test/test_tomllib/data/valid/dates-and-times \
 		test/test_tomllib/data/valid/multiline-basic-str \
 		test/test_tools \
+		test/test_tools/i18n_data \
 		test/test_ttk \
 		test/test_unittest \
 		test/test_unittest/testmock \