diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py deleted file mode 100644 index c083a04475e726..00000000000000 --- a/Lib/test/test_tools/test_i18n.py +++ /dev/null @@ -1,338 +0,0 @@ -"""Tests to cover the Tools/i18n package""" - -import os -import sys -import unittest -from textwrap import dedent - -from test.support.script_helper import assert_python_ok -from test.test_tools import skip_if_missing, toolsdir -from test.support.os_helper import temp_cwd, temp_dir - - -skip_if_missing() - - -class Test_pygettext(unittest.TestCase): - """Tests for the pygettext.py tool""" - - script = os.path.join(toolsdir,'i18n', 'pygettext.py') - - def get_header(self, data): - """ utility: return the header of a .po file as a dictionary """ - headers = {} - for line in data.split('\n'): - if not line or line.startswith(('#', 'msgid','msgstr')): - continue - line = line.strip('"') - key, val = line.split(':',1) - headers[key] = val.strip() - return headers - - def get_msgids(self, data): - """ utility: return all msgids in .po file as a list of strings """ - msgids = [] - reading_msgid = False - cur_msgid = [] - for line in data.split('\n'): - if reading_msgid: - if line.startswith('"'): - cur_msgid.append(line.strip('"')) - else: - msgids.append('\n'.join(cur_msgid)) - cur_msgid = [] - reading_msgid = False - continue - if line.startswith('msgid '): - line = line[len('msgid '):] - cur_msgid.append(line.strip('"')) - reading_msgid = True - else: - if reading_msgid: - msgids.append('\n'.join(cur_msgid)) - - return msgids - - def extract_docstrings_from_str(self, module_content): - """ utility: return all msgids extracted from module_content """ - filename = 'test_docstrings.py' - with temp_cwd(None) as cwd: - with open(filename, 'w', encoding='utf-8') as fp: - fp.write(module_content) - assert_python_ok(self.script, '-D', filename) - with open('messages.pot', encoding='utf-8') as fp: - data = fp.read() - return self.get_msgids(data) - - def test_header(self): - """Make sure the required fields are in the header, according to: - http://www.gnu.org/software/gettext/manual/gettext.html#Header-Entry - """ - with temp_cwd(None) as cwd: - assert_python_ok(self.script) - with open('messages.pot', encoding='utf-8') as fp: - data = fp.read() - header = self.get_header(data) - - self.assertIn("Project-Id-Version", header) - self.assertIn("POT-Creation-Date", header) - self.assertIn("PO-Revision-Date", header) - self.assertIn("Last-Translator", header) - self.assertIn("Language-Team", header) - self.assertIn("MIME-Version", header) - self.assertIn("Content-Type", header) - self.assertIn("Content-Transfer-Encoding", header) - self.assertIn("Generated-By", header) - - # not clear if these should be required in POT (template) files - #self.assertIn("Report-Msgid-Bugs-To", header) - #self.assertIn("Language", header) - - #"Plural-Forms" is optional - - @unittest.skipIf(sys.platform.startswith('aix'), - 'bpo-29972: broken test on AIX') - def test_POT_Creation_Date(self): - """ Match the date format from xgettext for POT-Creation-Date """ - from datetime import datetime - with temp_cwd(None) as cwd: - assert_python_ok(self.script) - with open('messages.pot', encoding='utf-8') as fp: - data = fp.read() - header = self.get_header(data) - creationDate = header['POT-Creation-Date'] - - # peel off the escaped newline at the end of string - if creationDate.endswith('\\n'): - creationDate = creationDate[:-len('\\n')] - - # This will raise if the date format does not exactly match. - datetime.strptime(creationDate, '%Y-%m-%d %H:%M%z') - - def test_funcdocstring(self): - for doc in ('"""doc"""', "r'''doc'''", "R'doc'", 'u"doc"'): - with self.subTest(doc): - msgids = self.extract_docstrings_from_str(dedent('''\ - def foo(bar): - %s - ''' % doc)) - self.assertIn('doc', msgids) - - def test_funcdocstring_bytes(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - def foo(bar): - b"""doc""" - ''')) - self.assertFalse([msgid for msgid in msgids if 'doc' in msgid]) - - def test_funcdocstring_fstring(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - def foo(bar): - f"""doc""" - ''')) - self.assertFalse([msgid for msgid in msgids if 'doc' in msgid]) - - def test_classdocstring(self): - for doc in ('"""doc"""', "r'''doc'''", "R'doc'", 'u"doc"'): - with self.subTest(doc): - msgids = self.extract_docstrings_from_str(dedent('''\ - class C: - %s - ''' % doc)) - self.assertIn('doc', msgids) - - def test_classdocstring_bytes(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - class C: - b"""doc""" - ''')) - self.assertFalse([msgid for msgid in msgids if 'doc' in msgid]) - - def test_classdocstring_fstring(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - class C: - f"""doc""" - ''')) - self.assertFalse([msgid for msgid in msgids if 'doc' in msgid]) - - def test_moduledocstring(self): - for doc in ('"""doc"""', "r'''doc'''", "R'doc'", 'u"doc"'): - with self.subTest(doc): - msgids = self.extract_docstrings_from_str(dedent('''\ - %s - ''' % doc)) - self.assertIn('doc', msgids) - - def test_moduledocstring_bytes(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - b"""doc""" - ''')) - self.assertFalse([msgid for msgid in msgids if 'doc' in msgid]) - - def test_moduledocstring_fstring(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - f"""doc""" - ''')) - self.assertFalse([msgid for msgid in msgids if 'doc' in msgid]) - - def test_msgid(self): - msgids = self.extract_docstrings_from_str( - '''_("""doc""" r'str' u"ing")''') - self.assertIn('docstring', msgids) - - def test_msgid_bytes(self): - msgids = self.extract_docstrings_from_str('_(b"""doc""")') - self.assertFalse([msgid for msgid in msgids if 'doc' in msgid]) - - def test_msgid_fstring(self): - msgids = self.extract_docstrings_from_str('_(f"""doc""")') - self.assertFalse([msgid for msgid in msgids if 'doc' in msgid]) - - def test_funcdocstring_annotated_args(self): - """ Test docstrings for functions with annotated args """ - msgids = self.extract_docstrings_from_str(dedent('''\ - def foo(bar: str): - """doc""" - ''')) - self.assertIn('doc', msgids) - - def test_funcdocstring_annotated_return(self): - """ Test docstrings for functions with annotated return type """ - msgids = self.extract_docstrings_from_str(dedent('''\ - def foo(bar) -> str: - """doc""" - ''')) - self.assertIn('doc', msgids) - - def test_funcdocstring_defvalue_args(self): - """ Test docstring for functions with default arg values """ - msgids = self.extract_docstrings_from_str(dedent('''\ - def foo(bar=()): - """doc""" - ''')) - self.assertIn('doc', msgids) - - def test_funcdocstring_multiple_funcs(self): - """ Test docstring extraction for multiple functions combining - annotated args, annotated return types and default arg values - """ - msgids = self.extract_docstrings_from_str(dedent('''\ - def foo1(bar: tuple=()) -> str: - """doc1""" - - def foo2(bar: List[1:2]) -> (lambda x: x): - """doc2""" - - def foo3(bar: 'func'=lambda x: x) -> {1: 2}: - """doc3""" - ''')) - self.assertIn('doc1', msgids) - self.assertIn('doc2', msgids) - self.assertIn('doc3', msgids) - - def test_classdocstring_early_colon(self): - """ Test docstring extraction for a class with colons occurring within - the parentheses. - """ - msgids = self.extract_docstrings_from_str(dedent('''\ - class D(L[1:2], F({1: 2}), metaclass=M(lambda x: x)): - """doc""" - ''')) - self.assertIn('doc', msgids) - - def test_calls_in_fstrings(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - f"{_('foo bar')}" - ''')) - self.assertIn('foo bar', msgids) - - def test_calls_in_fstrings_raw(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - rf"{_('foo bar')}" - ''')) - self.assertIn('foo bar', msgids) - - def test_calls_in_fstrings_nested(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - f"""{f'{_("foo bar")}'}""" - ''')) - self.assertIn('foo bar', msgids) - - def test_calls_in_fstrings_attribute(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - f"{obj._('foo bar')}" - ''')) - self.assertIn('foo bar', msgids) - - def test_calls_in_fstrings_with_call_on_call(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - f"{type(str)('foo bar')}" - ''')) - self.assertNotIn('foo bar', msgids) - - def test_calls_in_fstrings_with_format(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - f"{_('foo {bar}').format(bar='baz')}" - ''')) - self.assertIn('foo {bar}', msgids) - - def test_calls_in_fstrings_with_wrong_input_1(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - f"{_(f'foo {bar}')}" - ''')) - self.assertFalse([msgid for msgid in msgids if 'foo {bar}' in msgid]) - - def test_calls_in_fstrings_with_wrong_input_2(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - f"{_(1)}" - ''')) - self.assertNotIn(1, msgids) - - def test_calls_in_fstring_with_multiple_args(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - f"{_('foo', 'bar')}" - ''')) - self.assertNotIn('foo', msgids) - self.assertNotIn('bar', msgids) - - def test_calls_in_fstring_with_keyword_args(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - f"{_('foo', bar='baz')}" - ''')) - self.assertNotIn('foo', msgids) - self.assertNotIn('bar', msgids) - self.assertNotIn('baz', msgids) - - def test_calls_in_fstring_with_partially_wrong_expression(self): - msgids = self.extract_docstrings_from_str(dedent('''\ - f"{_(f'foo') + _('bar')}" - ''')) - self.assertNotIn('foo', msgids) - self.assertIn('bar', msgids) - - def test_files_list(self): - """Make sure the directories are inspected for source files - bpo-31920 - """ - text1 = 'Text to translate1' - text2 = 'Text to translate2' - text3 = 'Text to ignore' - with temp_cwd(None), temp_dir(None) as sdir: - os.mkdir(os.path.join(sdir, 'pypkg')) - with open(os.path.join(sdir, 'pypkg', 'pymod.py'), 'w', - encoding='utf-8') as sfile: - sfile.write(f'_({text1!r})') - os.mkdir(os.path.join(sdir, 'pkg.py')) - with open(os.path.join(sdir, 'pkg.py', 'pymod2.py'), 'w', - encoding='utf-8') as sfile: - sfile.write(f'_({text2!r})') - os.mkdir(os.path.join(sdir, 'CVS')) - with open(os.path.join(sdir, 'CVS', 'pymod3.py'), 'w', - encoding='utf-8') as sfile: - sfile.write(f'_({text3!r})') - assert_python_ok(self.script, sdir) - with open('messages.pot', encoding='utf-8') as fp: - data = fp.read() - self.assertIn(f'msgid "{text1}"', data) - self.assertIn(f'msgid "{text2}"', data) - self.assertNotIn(text3, data) diff --git a/Lib/test/test_tools/test_i18n/__init__.py b/Lib/test/test_tools/test_i18n/__init__.py new file mode 100644 index 00000000000000..2f1449063fe6bb --- /dev/null +++ b/Lib/test/test_tools/test_i18n/__init__.py @@ -0,0 +1,6 @@ +import os.path +from test import support + + +def load_tests(*args): + return support.load_package_tests(os.path.dirname(__file__), *args) diff --git a/Lib/test/test_tools/test_i18n/data/all.pot b/Lib/test/test_tools/test_i18n/data/all.pot new file mode 100644 index 00000000000000..53a517a38fddcd --- /dev/null +++ b/Lib/test/test_tools/test_i18n/data/all.pot @@ -0,0 +1,164 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR ORGANIZATION +# FIRST AUTHOR , YEAR. +# +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"POT-Creation-Date: 2023-05-11 00:37+0200\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: pygettext.py 1.5\n" + +#: pypkg/docstrings.py:2 +#, docstring +msgid "" +"multiline\n" +"module docstring" +msgstr "" + +#: pypkg/docstrings.py:9 pypkg/docstrings.py:74 pypkg/messages.py:35 +#: pypkg/mixed.py:11 pypkg/mixed.py:15 +msgid "foo" +msgstr "" + +#: pypkg/docstrings.py:16 +#, docstring +msgid "docstring with some blank lines in front" +msgstr "" + +#: pypkg/docstrings.py:21 +#, docstring +msgid "" +"multiline\n" +"docstring with some more content" +msgstr "" + +#: pypkg/docstrings.py:27 +#, docstring +msgid "outer docstring" +msgstr "" + +#: pypkg/docstrings.py:29 +#, docstring +msgid "nested docstring" +msgstr "" + +#: pypkg/docstrings.py:33 +#, docstring +msgid "some docstring" +msgstr "" + +#: pypkg/docstrings.py:39 +#, docstring +msgid "" +"A very long docstring which should be correctly wrapped into multiple lines\n" +"in the output po file according to the maximum line width setting." +msgstr "" + +#: pypkg/docstrings.py:46 +#, docstring +msgid "Raw docstrings are ok" +msgstr "" + +#: pypkg/docstrings.py:50 +#, docstring +msgid "Unicode docstrings are ok" +msgstr "" + +#: pypkg/docstrings.py:66 +#, docstring +msgid "\"Some non-ascii dosctring: ěščř αβγδ" +msgstr "" + +#: pypkg/docstrings.py:78 +#, docstring +msgid "outer class docstring" +msgstr "" + +#: pypkg/docstrings.py:80 +#, docstring +msgid "inner class docstring" +msgstr "" + +#: pypkg/docstrings.py:83 pypkg/docstrings.py:86 +#, docstring +msgid "method docstring" +msgstr "" + +#: pypkg/docstrings.py:90 +#, docstring +msgid "Async function docstring" +msgstr "" + +#: pypkg/docstrings.py:92 +#, docstring +msgid "async function nested docstring" +msgstr "" + +#: pypkg/messages.py:5 pypkg/messages.py:7 pypkg/mixed.py:6 pypkg/mixed.py:8 +msgid "Hello, world!" +msgstr "" + +#: pypkg/messages.py:10 +msgid "" +"Hello,\n" +" multiline!\n" +msgstr "" + +#: pypkg/messages.py:14 +msgid "parentheses" +msgstr "" + +#: pypkg/messages.py:16 +msgid "Raw string \\ \\n \\t" +msgstr "" + +#: pypkg/messages.py:18 +msgid "unicode string" +msgstr "" + +#: pypkg/messages.py:20 +msgid "rawdocunicodestandard" +msgstr "" + +#: pypkg/messages.py:31 +msgid "Some non-ascii text: ěščř αβγδ" +msgstr "" + +#: pypkg/messages.py:33 +msgid "Some very long text which should wrap correctly into multiple lines while respecting the maximum line length" +msgstr "" + +#: pypkg/messages.py:35 +msgid "bar" +msgstr "" + +#: pypkg/messages.py:37 pypkg/messages.py:73 +msgid "baz" +msgstr "" + +#: pypkg/messages.py:55 +msgid "Hello, {}!" +msgstr "" + +#: pypkg/messages.py:59 pypkg/messages.py:60 +msgid "A" +msgstr "" + +#: pypkg/messages.py:59 pypkg/messages.py:60 +msgid "B" +msgstr "" + +#: pypkg/messages.py:64 +msgid "nested" +msgstr "" + +#: pypkg/messages.py:67 +msgid "param" +msgstr "" + diff --git a/Lib/test/test_tools/test_i18n/data/docstrings.pot b/Lib/test/test_tools/test_i18n/data/docstrings.pot new file mode 100644 index 00000000000000..5da0363cf48ffa --- /dev/null +++ b/Lib/test/test_tools/test_i18n/data/docstrings.pot @@ -0,0 +1,102 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR ORGANIZATION +# FIRST AUTHOR , YEAR. +# +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"POT-Creation-Date: 2023-05-10 16:10+0200\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: pygettext.py 1.5\n" + +#: docstrings.py:2 +#, docstring +msgid "" +"multiline\n" +"module docstring" +msgstr "" + +#: docstrings.py:9 docstrings.py:74 +#, docstring +msgid "foo" +msgstr "" + +#: docstrings.py:16 +#, docstring +msgid "docstring with some blank lines in front" +msgstr "" + +#: docstrings.py:21 +#, docstring +msgid "" +"multiline\n" +"docstring with some more content" +msgstr "" + +#: docstrings.py:27 +#, docstring +msgid "outer docstring" +msgstr "" + +#: docstrings.py:29 +#, docstring +msgid "nested docstring" +msgstr "" + +#: docstrings.py:33 +#, docstring +msgid "some docstring" +msgstr "" + +#: docstrings.py:39 +#, docstring +msgid "" +"A very long docstring which should be correctly wrapped into multiple lines\n" +"in the output po file according to the maximum line width setting." +msgstr "" + +#: docstrings.py:46 +#, docstring +msgid "Raw docstrings are ok" +msgstr "" + +#: docstrings.py:50 +#, docstring +msgid "Unicode docstrings are ok" +msgstr "" + +#: docstrings.py:66 +#, docstring +msgid "\"Some non-ascii dosctring: ěščř αβγδ" +msgstr "" + +#: docstrings.py:78 +#, docstring +msgid "outer class docstring" +msgstr "" + +#: docstrings.py:80 +#, docstring +msgid "inner class docstring" +msgstr "" + +#: docstrings.py:83 docstrings.py:86 +#, docstring +msgid "method docstring" +msgstr "" + +#: docstrings.py:90 +#, docstring +msgid "Async function docstring" +msgstr "" + +#: docstrings.py:92 +#, docstring +msgid "async function nested docstring" +msgstr "" + diff --git a/Lib/test/test_tools/test_i18n/data/docstrings.py b/Lib/test/test_tools/test_i18n/data/docstrings.py new file mode 100644 index 00000000000000..bcdcf60a6b0e37 --- /dev/null +++ b/Lib/test/test_tools/test_i18n/data/docstrings.py @@ -0,0 +1,92 @@ +# Test docstring extraction +""" +multiline +module docstring +""" + + +def test(x): + """foo""" + return 2*x + + +def test2(x): + + + """docstring with some blank lines in front""" + return 2*x + + +def test3(x): + """multiline + docstring with some more content + """ + + +def test4(x): + """outer docstring""" + def inner(y): + """nested docstring""" + + +def test5(x): + """some docstring""" + """another string""" + """and one more""" + + +def test6(x): + """ + A very long docstring which should be correctly wrapped into multiple lines + in the output po file according to the maximum line width setting. + """ + + +def test7(x): + r"""Raw docstrings are ok""" + + +def test8(x): + u"""Unicode docstrings are ok""" + + +def test9(x): + b"""bytes should not be picked up""" + + +def test10(x): + f"""f-strings should not be picked up""" + + +def test11(x): + """Hello, {}!""".format("docstring") + + +def test12(x): + """"Some non-ascii dosctring: ěščř αβγδ""" + + +def test13(x): + """""" + + +class Foo: + """foo""" + + +class Outer: + """outer class docstring""" + class Inner: + "inner class docstring" + + def inner_method(self): + """method docstring""" + + def outer_method(self): + """method docstring""" + + +async def async_test(x): + """Async function docstring""" + async def async_inner(y): + """async function nested docstring""" diff --git a/Lib/test/test_tools/test_i18n/data/messages.pot b/Lib/test/test_tools/test_i18n/data/messages.pot new file mode 100644 index 00000000000000..490b8bd728d74c --- /dev/null +++ b/Lib/test/test_tools/test_i18n/data/messages.pot @@ -0,0 +1,82 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR ORGANIZATION +# FIRST AUTHOR , YEAR. +# +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"POT-Creation-Date: 2023-05-11 00:33+0200\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: pygettext.py 1.5\n" + +#: messages.py:5 messages.py:7 +msgid "Hello, world!" +msgstr "" + +#: messages.py:10 +msgid "" +"Hello,\n" +" multiline!\n" +msgstr "" + +#: messages.py:14 +msgid "parentheses" +msgstr "" + +#: messages.py:16 +msgid "Raw string \\ \\n \\t" +msgstr "" + +#: messages.py:18 +msgid "unicode string" +msgstr "" + +#: messages.py:20 +msgid "rawdocunicodestandard" +msgstr "" + +#: messages.py:31 +msgid "Some non-ascii text: ěščř αβγδ" +msgstr "" + +#: messages.py:33 +msgid "Some very long text which should wrap correctly into multiple lines while respecting the maximum line length" +msgstr "" + +#: messages.py:35 +msgid "foo" +msgstr "" + +#: messages.py:35 +msgid "bar" +msgstr "" + +#: messages.py:37 messages.py:73 +msgid "baz" +msgstr "" + +#: messages.py:55 +msgid "Hello, {}!" +msgstr "" + +#: messages.py:59 messages.py:60 +msgid "A" +msgstr "" + +#: messages.py:59 messages.py:60 +msgid "B" +msgstr "" + +#: messages.py:64 +msgid "nested" +msgstr "" + +#: messages.py:67 +msgid "param" +msgstr "" + diff --git a/Lib/test/test_tools/test_i18n/data/messages.py b/Lib/test/test_tools/test_i18n/data/messages.py new file mode 100644 index 00000000000000..aea65a97eb1679 --- /dev/null +++ b/Lib/test/test_tools/test_i18n/data/messages.py @@ -0,0 +1,81 @@ +# Test message extraction +from gettext import gettext as _ + + +_("Hello, world!") + +_("Hello, " + "world!") + +_("""Hello, + multiline! +""") + +(_("parentheses")) + +_(r"Raw string \ \n \t") + +_(u"unicode string") + +_(r"raw" """doc""" u"unicode" "standard") + +_(f"f-strings should not get extracted") + +_(f"f-strings should not get {'extracted'}") + +text = 'extracted' +_(f"f-strings should not get {text}") + +_(b"bytes should not be extracted") + +_("Some non-ascii text: ěščř αβγδ") + +_("Some very long text which should wrap correctly into multiple lines while respecting the maximum line length") + +_("foo"), _("bar") + +_("baz"), _("baz") + +_("") + +_() + +_(None) + +_(1) + +_(False) + +_(x="no kw arguments") + +_("foo", "bar") + +_("something", x="something else") + +_("Hello, {}!").format("world") + +_("Hello, {}!".format("world")) + +arr = [_("A"), _("B")] +obj = {'a': _("A"), 'b': _("B")} + + +def test(): + print(_("nested")) + + +def test2(x=_("param")): + pass + + +class Foo: + def bar(self): + return _("baz") + + +def _(x): + pass + + +def _(x="don't extract me"): + pass diff --git a/Lib/test/test_tools/test_i18n/data/mixed.pot b/Lib/test/test_tools/test_i18n/data/mixed.pot new file mode 100644 index 00000000000000..2feb7624a4ce1a --- /dev/null +++ b/Lib/test/test_tools/test_i18n/data/mixed.pot @@ -0,0 +1,24 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR ORGANIZATION +# FIRST AUTHOR , YEAR. +# +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"POT-Creation-Date: 2023-05-10 16:11+0200\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: pygettext.py 1.5\n" + +#: mixed.py:6 mixed.py:8 +msgid "Hello, world!" +msgstr "" + +#: mixed.py:11 mixed.py:15 +msgid "foo" +msgstr "" + diff --git a/Lib/test/test_tools/test_i18n/data/mixed.py b/Lib/test/test_tools/test_i18n/data/mixed.py new file mode 100644 index 00000000000000..5a39d0eafbb7dd --- /dev/null +++ b/Lib/test/test_tools/test_i18n/data/mixed.py @@ -0,0 +1,15 @@ +# Test messages and docstrings with the same msgid +from gettext import gettext as _ + + +def test(): + """Hello, world!""" + + print(_("Hello, world!")) + + +_("foo") + + +class Foo: + """foo""" diff --git a/Lib/test/test_tools/test_i18n/test_i18n.py b/Lib/test/test_tools/test_i18n/test_i18n.py new file mode 100644 index 00000000000000..a3839acb86f78c --- /dev/null +++ b/Lib/test/test_tools/test_i18n/test_i18n.py @@ -0,0 +1,135 @@ +"""Tests to cover the Tools/i18n package""" + +import os +import re +import sys +import unittest +from pathlib import Path +from test.support.os_helper import temp_cwd +from test.support.script_helper import assert_python_ok +from test.test_tools import skip_if_missing, toolsdir + +skip_if_missing() + +DATA_DIR = Path(__file__).parent / 'data' + + +class Test_pygettext(unittest.TestCase): + """Tests for the pygettext.py tool""" + + script = os.path.join(toolsdir, 'i18n', 'pygettext.py') + + def get_header(self, data): + """ utility: return the header of a .po file as a dictionary """ + headers = {} + for line in data.split('\n'): + if not line or line.startswith(('#', 'msgid', 'msgstr')): + continue + line = line.strip('"') + key, val = line.split(':', 1) + headers[key] = val.strip() + return headers + + def assertPOEqual(self, expected, actual): + """Check if two PO files are equal""" + # Normalize the creation date + date_pattern = re.compile(r'"POT-Creation-Date: .+?\n"') + header = '"POT-Creation-Date: 2000-01-01 00:00+0000\\n"' + expected = re.sub(date_pattern, header, expected) + actual = re.sub(date_pattern, header, actual) + + # Normalize the path separators in case this test is running on a + # platform which does not use '/' as a default separator + fileloc_pattern = re.compile(r'#:.+') + + def replace(match): + return match[0].replace(os.sep, "/") + expected = re.sub(fileloc_pattern, replace, expected) + actual = re.sub(fileloc_pattern, replace, actual) + + self.assertEqual(expected, actual) + + def test_header(self): + """Make sure the required fields are in the header, according to: + http://www.gnu.org/software/gettext/manual/gettext.html#Header-Entry + """ + with temp_cwd(None): + assert_python_ok(self.script) + with open('messages.pot', encoding='utf-8') as fp: + data = fp.read() + header = self.get_header(data) + + self.assertIn("Project-Id-Version", header) + self.assertIn("POT-Creation-Date", header) + self.assertIn("PO-Revision-Date", header) + self.assertIn("Last-Translator", header) + self.assertIn("Language-Team", header) + self.assertIn("MIME-Version", header) + self.assertIn("Content-Type", header) + self.assertIn("Content-Transfer-Encoding", header) + self.assertIn("Generated-By", header) + + # not clear if these should be required in POT (template) files + # self.assertIn("Report-Msgid-Bugs-To", header) + # self.assertIn("Language", header) + + # "Plural-Forms" is optional + + @unittest.skipIf(sys.platform.startswith('aix'), + 'bpo-29972: broken test on AIX') + def test_POT_Creation_Date(self): + """ Match the date format from xgettext for POT-Creation-Date """ + from datetime import datetime + with temp_cwd(None): + assert_python_ok(self.script) + with open('messages.pot', encoding='utf-8') as fp: + data = fp.read() + header = self.get_header(data) + creationDate = header['POT-Creation-Date'] + + # peel off the escaped newline at the end of string + if creationDate.endswith('\\n'): + creationDate = creationDate[:-len('\\n')] + + # This will raise if the date format does not exactly match. + datetime.strptime(creationDate, '%Y-%m-%d %H:%M%z') + + def test_files(self): + """Test message and docstring extraction. + Compares the script output against the .po files in the data folder. + """ + filenames = (('messages.py', 'messages.pot'), + ('docstrings.py', 'docstrings.pot'), + ('mixed.py', 'mixed.pot')) + + for input_file, output_file in filenames: + with self.subTest(f'Input file: data/{input_file}'): + contents = (DATA_DIR / input_file).read_text(encoding='utf-8') + with temp_cwd(None): + Path(input_file).write_text(contents, encoding='utf-8') + assert_python_ok(self.script, '-D', input_file) + output = Path('messages.pot').read_text(encoding='utf-8') + + expected = (DATA_DIR / output_file).read_text(encoding='utf-8') + self.assertPOEqual(expected, output) + + def test_files_list(self): + """Make sure the directories are inspected for source files + bpo-31920 + """ + filenames = ('messages.py', 'docstrings.py', 'mixed.py') + + with temp_cwd(None): + pkg_dir = Path('pypkg') + pkg_dir.mkdir() + + for filename in filenames: + data = (DATA_DIR / filename).read_text(encoding='utf-8') + path = pkg_dir / filename + path.write_text(data, encoding='utf-8') + + assert_python_ok(self.script, '-D', 'pypkg') + output = Path('messages.pot').read_text(encoding='utf-8') + + expected = (Path(DATA_DIR) / 'all.pot').read_text(encoding='utf-8') + self.assertPOEqual(expected, output) diff --git a/Misc/NEWS.d/next/Tools-Demos/2023-05-11-23-32-25.gh-issue-104400.23vxm7.rst b/Misc/NEWS.d/next/Tools-Demos/2023-05-11-23-32-25.gh-issue-104400.23vxm7.rst new file mode 100644 index 00000000000000..f7bb05b99efe59 --- /dev/null +++ b/Misc/NEWS.d/next/Tools-Demos/2023-05-11-23-32-25.gh-issue-104400.23vxm7.rst @@ -0,0 +1 @@ +Use an AST parser instead of a tokenizer in pygettext diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 3a0b27ba420e7a..6a5a8d85c0a05e 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -1,11 +1,10 @@ #! /usr/bin/env python3 -# -*- coding: iso-8859-1 -*- # Originally written by Barry Warsaw # # Minimally patched to make it even more xgettext compatible # by Peter Funk # -# 2002-11-22 Jrgen Hermann +# 2002-11-22 Jürgen Hermann # Added checks that _() only contains string literals, and # command line args are resolved to module lists, i.e. you # can now pass a filename, a module or package name, or a @@ -73,6 +72,10 @@ --default-domain=name Rename the default output file from messages.pot to name.pot. + --charset=charset + Character set used to read the input files and to write the + output file (default "utf-8"). + -E --escape Replace non-ASCII characters with octal escape sequences. @@ -155,24 +158,24 @@ If `inputfile' is -, standard input is read. """) -import os +import ast +import getopt +import glob import importlib.machinery import importlib.util +import os import sys -import glob import time -import getopt -import ast -import token -import tokenize +from ast import (AsyncFunctionDef, ClassDef, FunctionDef, Module, NodeVisitor, + unparse) +from collections import defaultdict +from dataclasses import dataclass __version__ = '1.5' default_keywords = ['_'] DEFAULTKEYWORDS = ', '.join(default_keywords) -EMPTYSTRING = '' - # The normal pot-file header. msgmerge and Emacs's po-mode work better if it's # there. @@ -207,7 +210,7 @@ def make_escapes(pass_nonascii): global escapes, escape if pass_nonascii: # Allow non-ascii characters to pass through so that e.g. 'msgid - # "Hhe"' would result not result in 'msgid "H\366he"'. Otherwise we + # "Höhe"' would not result in 'msgid "H\366he"'. Otherwise we # escape any character outside the 32..126 range. mod = 128 escape = escape_ascii @@ -227,19 +230,11 @@ def make_escapes(pass_nonascii): def escape_ascii(s, encoding): return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s) + def escape_nonascii(s, encoding): return ''.join(escapes[b] for b in s.encode(encoding)) -def is_literal_string(s): - return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"') - - -def safe_eval(s): - # unwrap quotes, safely - return eval(s, {'__builtins__':{}}, {}) - - def normalize(s, encoding): # This converts the various Python string types into a format that is # appropriate for .po files, namely much closer to C style. @@ -306,211 +301,133 @@ def getFilesForName(name): return [] -class TokenEater: - def __init__(self, options): - self.__options = options - self.__messages = {} - self.__state = self.__waiting - self.__data = [] - self.__lineno = -1 - self.__freshmodule = 1 - self.__curfile = None - self.__enclosurecount = 0 - - def __call__(self, ttype, tstring, stup, etup, line): - # dispatch -## import token -## print('ttype:', token.tok_name[ttype], 'tstring:', tstring, -## file=sys.stderr) - self.__state(ttype, tstring, stup[0]) - - def __waiting(self, ttype, tstring, lineno): - opts = self.__options - # Do docstring extractions, if enabled - if opts.docstrings and not opts.nodocstrings.get(self.__curfile): - # module docstring? - if self.__freshmodule: - if ttype == tokenize.STRING and is_literal_string(tstring): - self.__addentry(safe_eval(tstring), lineno, isdocstring=1) - self.__freshmodule = 0 - return - if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING): - return - self.__freshmodule = 0 - # class or func/method docstring? - if ttype == tokenize.NAME and tstring in ('class', 'def'): - self.__state = self.__suiteseen - return - if ttype == tokenize.NAME and tstring in opts.keywords: - self.__state = self.__keywordseen +@dataclass +class Message: + filename: str + lineno: int + msgid: str + is_docstring: bool = False + + +def get_funcname(node): + if isinstance(node.func, ast.Name): + return node.func.id + elif isinstance(node.func, ast.Attribute): + return node.func.attr + else: + return None + + +class GettextVisitor(NodeVisitor): + def __init__(self, options, filename=None): + super().__init__() + self.options = options + self.filename = filename + self.messages = defaultdict(list) + + def _is_string_const(self, node): + return isinstance(node, ast.Constant) and isinstance(node.value, str) + + def _extract_docstring(self, node): + if not self.options.docstrings or self.options.nodocstrings.get(self.filename): return - if ttype == tokenize.STRING: - maybe_fstring = ast.parse(tstring, mode='eval').body - if not isinstance(maybe_fstring, ast.JoinedStr): - return - for value in filter(lambda node: isinstance(node, ast.FormattedValue), - maybe_fstring.values): - for call in filter(lambda node: isinstance(node, ast.Call), - ast.walk(value)): - func = call.func - if isinstance(func, ast.Name): - func_name = func.id - elif isinstance(func, ast.Attribute): - func_name = func.attr + + if not node.body: + return + + expr = node.body[0] + if isinstance(expr, ast.Expr) and self._is_string_const(expr.value): + if docstring := ast.get_docstring(node): + message = Message(self.filename, expr.lineno, docstring, is_docstring=True) + self.messages[docstring].append(message) + + def _extract_message(self, node): + funcname = get_funcname(node) + if funcname not in self.options.keywords: + return + + filename = self.filename + lineno = node.lineno + + if len(node.args) != 1: + print(f'*** {filename}:{lineno}: Seen unexpected amount of ' + f'positional arguments in gettext call: {unparse(node)}', + file=sys.stderr) + return + + if node.keywords: + print(f'*** {filename}:{lineno}: Seen unexpected keyword arguments ' + f'in gettext call: {unparse(node)}', file=sys.stderr) + return + + arg = node.args[0] + if not self._is_string_const(arg): + print(f'*** {filename}:{lineno}: Seen unexpected argument type ' + f'in gettext call: {unparse(node)}', file=sys.stderr) + return + + msgid = arg.value + if msgid == '': + print(f'*** {filename}:{lineno}: Empty msgid. It is reserved by GNU gettext: ' + 'gettext("") returns the header entry with ' + 'meta information, not the empty string.', + file=sys.stderr) + return + + message = Message(filename, lineno, msgid) + self.messages[msgid].append(message) + + def visit(self, node): + if type(node) in {Module, FunctionDef, AsyncFunctionDef, ClassDef}: + self._extract_docstring(node) + super().visit(node) + + def visit_Call(self, node): + self._extract_message(node) + self.generic_visit(node) + + +def format_pot_file(all_messages, options, encoding): + timestamp = time.strftime('%Y-%m-%d %H:%M%z') + output = pot_header % {'time': timestamp, 'version': __version__, + 'charset': encoding, + 'encoding': '8bit'} + inverted = defaultdict(dict) + + for msgid, messages in all_messages.items(): + occurrences = set((msg.filename, msg.lineno) for msg in messages) + occurrences = tuple(sorted(occurrences)) + inverted[occurrences][msgid] = messages + + sorted_occurrences = sorted(list(inverted.keys())) + + for occurrences in sorted_occurrences: + messages_dict = inverted[occurrences] + for msgid, messages in messages_dict.items(): + if options.writelocations: + locline = '#:' + for (filename, lineno) in occurrences: + s = f' {filename}:{lineno}' + if len(locline) + len(s) <= options.width: + locline = locline + s else: - continue - - if func_name not in opts.keywords: - continue - if len(call.args) != 1: - print(_( - '*** %(file)s:%(lineno)s: Seen unexpected amount of' - ' positional arguments in gettext call: %(source_segment)s' - ) % { - 'source_segment': ast.get_source_segment(tstring, call) or tstring, - 'file': self.__curfile, - 'lineno': lineno - }, file=sys.stderr) - continue - if call.keywords: - print(_( - '*** %(file)s:%(lineno)s: Seen unexpected keyword arguments' - ' in gettext call: %(source_segment)s' - ) % { - 'source_segment': ast.get_source_segment(tstring, call) or tstring, - 'file': self.__curfile, - 'lineno': lineno - }, file=sys.stderr) - continue - arg = call.args[0] - if not isinstance(arg, ast.Constant): - print(_( - '*** %(file)s:%(lineno)s: Seen unexpected argument type' - ' in gettext call: %(source_segment)s' - ) % { - 'source_segment': ast.get_source_segment(tstring, call) or tstring, - 'file': self.__curfile, - 'lineno': lineno - }, file=sys.stderr) - continue - if isinstance(arg.value, str): - self.__addentry(arg.value, lineno) - - def __suiteseen(self, ttype, tstring, lineno): - # skip over any enclosure pairs until we see the colon - if ttype == tokenize.OP: - if tstring == ':' and self.__enclosurecount == 0: - # we see a colon and we're not in an enclosure: end of def - self.__state = self.__suitedocstring - elif tstring in '([{': - self.__enclosurecount += 1 - elif tstring in ')]}': - self.__enclosurecount -= 1 - - def __suitedocstring(self, ttype, tstring, lineno): - # ignore any intervening noise - if ttype == tokenize.STRING and is_literal_string(tstring): - self.__addentry(safe_eval(tstring), lineno, isdocstring=1) - self.__state = self.__waiting - elif ttype not in (tokenize.NEWLINE, tokenize.INDENT, - tokenize.COMMENT): - # there was no class docstring - self.__state = self.__waiting - - def __keywordseen(self, ttype, tstring, lineno): - if ttype == tokenize.OP and tstring == '(': - self.__data = [] - self.__lineno = lineno - self.__state = self.__openseen - else: - self.__state = self.__waiting - - def __openseen(self, ttype, tstring, lineno): - if ttype == tokenize.OP and tstring == ')': - # We've seen the last of the translatable strings. Record the - # line number of the first line of the strings and update the list - # of messages seen. Reset state for the next batch. If there - # were no strings inside _(), then just ignore this entry. - if self.__data: - self.__addentry(EMPTYSTRING.join(self.__data)) - self.__state = self.__waiting - elif ttype == tokenize.STRING and is_literal_string(tstring): - self.__data.append(safe_eval(tstring)) - elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT, - token.NEWLINE, tokenize.NL]: - # warn if we see anything else than STRING or whitespace - print(_( - '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' - ) % { - 'token': tstring, - 'file': self.__curfile, - 'lineno': self.__lineno - }, file=sys.stderr) - self.__state = self.__waiting - - def __addentry(self, msg, lineno=None, isdocstring=0): - if lineno is None: - lineno = self.__lineno - if not msg in self.__options.toexclude: - entry = (self.__curfile, lineno) - self.__messages.setdefault(msg, {})[entry] = isdocstring - - def set_filename(self, filename): - self.__curfile = filename - self.__freshmodule = 1 - - def write(self, fp): - options = self.__options - timestamp = time.strftime('%Y-%m-%d %H:%M%z') - encoding = fp.encoding if fp.encoding else 'UTF-8' - print(pot_header % {'time': timestamp, 'version': __version__, - 'charset': encoding, - 'encoding': '8bit'}, file=fp) - # Sort the entries. First sort each particular entry's keys, then - # sort all the entries by their first item. - reverse = {} - for k, v in self.__messages.items(): - keys = sorted(v.keys()) - reverse.setdefault(tuple(keys), []).append((k, v)) - rkeys = sorted(reverse.keys()) - for rkey in rkeys: - rentries = reverse[rkey] - rentries.sort() - for k, v in rentries: - # If the entry was gleaned out of a docstring, then add a - # comment stating so. This is to aid translators who may wish - # to skip translating some unimportant docstrings. - isdocstring = any(v.values()) - # k is the message string, v is a dictionary-set of (filename, - # lineno) tuples. We want to sort the entries in v first by - # file name and then by line number. - v = sorted(v.keys()) - if not options.writelocations: - pass - # location comments are different b/w Solaris and GNU: - elif options.locationstyle == options.SOLARIS: - for filename, lineno in v: - d = {'filename': filename, 'lineno': lineno} - print(_( - '# File: %(filename)s, line: %(lineno)d') % d, file=fp) - elif options.locationstyle == options.GNU: - # fit as many locations on one line, as long as the - # resulting line length doesn't exceed 'options.width' - locline = '#:' - for filename, lineno in v: - d = {'filename': filename, 'lineno': lineno} - s = _(' %(filename)s:%(lineno)d') % d - if len(locline) + len(s) <= options.width: - locline = locline + s - else: - print(locline, file=fp) - locline = "#:" + s - if len(locline) > 2: - print(locline, file=fp) - if isdocstring: - print('#, docstring', file=fp) - print('msgid', normalize(k, encoding), file=fp) - print('msgstr ""\n', file=fp) + output += locline + '\n' + locline = '#:' + s + if len(locline) > 2: + output += locline + '\n' + + # If the entry was gleaned out of a docstring, then add a + # comment stating so. This is to aid translators who may wish + # to skip translating some unimportant docstrings. + is_docstring = all(msg.is_docstring for msg in messages) + if is_docstring: + output += '#, docstring\n' + + output += f'msgid {normalize(msgid, encoding)}\n' + output += 'msgstr ""\n' + output += '\n' + + return output def main(): @@ -519,7 +436,7 @@ def main(): opts, args = getopt.getopt( sys.argv[1:], 'ad:DEhk:Kno:p:S:Vvw:x:X:', - ['extract-all', 'default-domain=', 'escape', 'help', + ['extract-all', 'default-domain=', 'charset=', 'escape', 'help', 'keyword=', 'no-default-keywords', 'add-location', 'no-location', 'output=', 'output-dir=', 'style=', 'verbose', 'version', 'width=', 'exclude-file=', @@ -535,6 +452,7 @@ class Options: SOLARIS = 2 # defaults extractall = 0 # FIXME: currently this option has no effect at all. + charset = 'utf-8' escape = 0 keywords = [] outpath = '' @@ -560,6 +478,8 @@ class Options: options.extractall = 1 elif opt in ('-d', '--default-domain'): options.outfile = arg + '.pot' + elif opt in ('--charset',): + options.charset = arg elif opt in ('-E', '--escape'): options.escape = 1 elif opt in ('-D', '--docstrings'): @@ -631,7 +551,7 @@ class Options: args = expanded # slurp through all the files - eater = TokenEater(options) + visitor = GettextVisitor(options) for filename in args: if filename == '-': if options.verbose: @@ -641,18 +561,11 @@ class Options: else: if options.verbose: print(_('Working on %s') % filename) - fp = open(filename, 'rb') + fp = open(filename, encoding=options.charset) closep = 1 try: - eater.set_filename(filename) - try: - tokens = tokenize.tokenize(fp.readline) - for _token in tokens: - eater(*_token) - except tokenize.TokenError as e: - print('%s: %s, line %d, column %d' % ( - e.args[0], filename, e.args[1][0], e.args[1][1]), - file=sys.stderr) + visitor.filename = filename + visitor.visit(ast.parse(fp.read())) finally: if closep: fp.close() @@ -664,10 +577,10 @@ class Options: else: if options.outpath: options.outfile = os.path.join(options.outpath, options.outfile) - fp = open(options.outfile, 'w') + fp = open(options.outfile, 'w', encoding=options.charset) closep = 1 try: - eater.write(fp) + fp.write(format_pot_file(visitor.messages, options, options.charset)) finally: if closep: fp.close()