Force xgettext to return UTF-8

Reference: #14
Reported-by: Armijn Hemel @armijnhemel
Signed-off-by: Philippe Ombredanne <[email protected]>
aboutcode-org · Mar 16, 2024 · 812faf9 · 812faf9
1 parent d7a4a36
commit 812faf9
Show file tree

Hide file tree

Showing 6 changed files with 154 additions and 6 deletions.
diff --git a/src/source_inspector/strings_xgettext.py b/src/source_inspector/strings_xgettext.py
@@ -70,7 +70,9 @@ def collect_strings(location, clean=True):
     rc, result, err = command.execute(
         cmd_loc="xgettext",
         args=[
-            "--omit-header",
+            # this is a trick: a non-ASCII copyright holder forces xgettext to return UTF-8
+            # see https://github.com/nexB/source-inspector/issues/14#issuecomment-2001893496
+            '--copyright-holder="ø"',
             "--no-wrap",
             "--extract-all",
             "--from-code=UTF-8",
@@ -83,16 +85,18 @@ def collect_strings(location, clean=True):
     if rc != 0:
         raise Exception(open(err).read())
 
-    yield from parse_po_text(po_text=result, clean=clean)
+    yield from parse_po_text(po_text=result, drop_header=True, clean=clean)
 
 
-def parse_po_text(po_text, clean=True):
+def parse_po_text(po_text, drop_header=False, clean=True):
     """
     Yield mappings of strings collected from the ``po_text`` string.
     Clean strings if ``clean`` is True.
+    Drop the first block (the "header") if ``drop_header`` is True.
 
     The po text lines looks like this:
-    - Blocks sperated by 2 lines.
+    - Blocks separated by 2 lines.
+    - Optional first header block
     - The first lines starting with #: are comments with the line numbers.
     - The lines starting with #, are flags, not interesting
     - We care about the lines in the middle starting with the first msgid
@@ -112,8 +116,13 @@ def parse_po_text(po_text, clean=True):
     msgstr ""
     """
 
-    for chunk in po_text.split("\n\n"):
-        lines = chunk.splitlines(False)
+    blocks = po_text.split("\n\n")
+    if drop_header:
+        # drop the first block which is the header
+        blocks = blocks[1:]
+
+    for block in blocks:
+        lines = block.splitlines(False)
         line_numbers = []
         strings = []
         for line in lines:

diff --git a/tests/data/strings_xgettext/fdisk.c b/tests/data/strings_xgettext/fdisk.c
@@ -0,0 +1,32 @@
+/* vi: set sw=4 ts=4: */
+/*
+ * fdisk.c -- Partition table manipulator for Linux.
+ *
+ * Copyright (C) 1992  A. V. Le Blanc ([email protected])
+ * Copyright (C) 2001,2002 Vladimir Oleynik <[email protected]> (initial bb port)
+ *
+ * Licensed under GPLv2 or later, see file LICENSE in this source tree.
+ */
+//applet:IF_FDISK(APPLET(fdisk, BB_DIR_SBIN, BB_SUID_DROP))
+
+//kbuild:lib-$(CONFIG_FDISK) += fdisk.o
+
+/* Looks like someone forgot to add this to config system */
+//usage:#ifndef ENABLE_FEATURE_FDISK_BLKSIZE
+#include "libbb.h"
+#include "unicode.h"
+
+static void list_types(const char *const *sys);
+	"\x0a" "OS/2 Boot Manager",/* OS/2 Boot Manager */
+	"\x42" "SFS",
+	"\x63" "GNU HURD or SysV", /* GNU HURD or Mach or Sys V/386 (such as ISC UNIX) */
+	"\x80" "Old Minix",        /* Minix 1.4a and earlier */
+	"\x81" "Minix / old Linux",/* Minix 1.4b and later */
+	"\x82" "Linux swap",       /* also Solaris */
+	"\x83" "Linux",
+	"\x84" "OS/2 hidden C: drive",
+	"\x85" "Linux extended",
+	"\x86" "NTFS volume set",
+	"\x87" "NTFS volume set",
+	"\x8e" "Linux LVM",
+	"\x9f" "BSD/OS",           /* BSDI */
diff --git a/tests/data/strings_xgettext/fdisk.c-expected.json b/tests/data/strings_xgettext/fdisk.c-expected.json
@@ -0,0 +1,89 @@
+{
+  "files": [
+    {
+      "path": "fdisk.c",
+      "type": "file",
+      "source_strings": [
+        {
+          "line_numbers": [
+            20
+          ],
+          "string": "OS/2 Boot Manager"
+        },
+        {
+          "line_numbers": [
+            21
+          ],
+          "string": "BSFS"
+        },
+        {
+          "line_numbers": [
+            22
+          ],
+          "string": "cGNU HURD or SysV"
+        },
+        {
+          "line_numbers": [
+            23
+          ],
+          "string": "Old Minix"
+        },
+        {
+          "line_numbers": [
+            24
+          ],
+          "string": "Minix / old Linux"
+        },
+        {
+          "line_numbers": [
+            25
+          ],
+          "string": "Linux swap"
+        },
+        {
+          "line_numbers": [
+            26
+          ],
+          "string": "Linux"
+        },
+        {
+          "line_numbers": [
+            27
+          ],
+          "string": "OS/2 hidden C: drive"
+        },
+        {
+          "line_numbers": [
+            28
+          ],
+          "string": "Linux extended"
+        },
+        {
+          "line_numbers": [
+            29
+          ],
+          "string": "NTFS volume set"
+        },
+        {
+          "line_numbers": [
+            30
+          ],
+          "string": "NTFS volume set"
+        },
+        {
+          "line_numbers": [
+            31
+          ],
+          "string": "Linux LVM"
+        },
+        {
+          "line_numbers": [
+            32
+          ],
+          "string": "BSD/OS"
+        }
+      ],
+      "scan_errors": []
+    }
+  ]
+}
diff --git a/tests/data/strings_xgettext/fdisk.c.ABOUT b/tests/data/strings_xgettext/fdisk.c.ABOUT
@@ -0,0 +1,3 @@
+date: 2024-03-16
+download_url: https://git.busybox.net/busybox/plain/util-linux/fdisk.c?h=1_35_stable
+notes: stripped down to the few lines we care for
diff --git a/tests/data/strings_xgettext/lineedit.c-expected.json b/tests/data/strings_xgettext/lineedit.c-expected.json
@@ -65,6 +65,12 @@
           ],
           "string": "."
         },
+        {
+          "line_numbers": [
+            905
+          ],
+          "string": "A"
+        },
         {
           "line_numbers": [
             1000

diff --git a/tests/test_symbols_xgettext.py b/tests/test_symbols_xgettext.py
@@ -232,3 +232,12 @@ def test_strings_scanner_multilines_utf8(self):
 
         expected_loc = self.get_test_loc("lineedit.c-expected.json", must_exist=False)
         check_json_scan(expected_loc, result_file, regen=REGEN_TEST_FIXTURES)
+
+    def test_strings_scanner_unicode(self):
+        test_file = self.get_test_loc("fdisk.c")
+        result_file = self.get_temp_file("json")
+        args = ["--source-string", test_file, "--json-pp", result_file]
+        run_scan_click(args)
+
+        expected_loc = self.get_test_loc("fdisk.c-expected.json", must_exist=False)
+        check_json_scan(expected_loc, result_file, regen=REGEN_TEST_FIXTURES)