Skip to content

Commit

Permalink
Force xgettext to return UTF-8
Browse files Browse the repository at this point in the history
Reference: #14
Reported-by: Armijn Hemel @armijnhemel
Signed-off-by: Philippe Ombredanne <[email protected]>
  • Loading branch information
pombredanne committed Mar 16, 2024
1 parent d7a4a36 commit 812faf9
Show file tree
Hide file tree
Showing 6 changed files with 154 additions and 6 deletions.
21 changes: 15 additions & 6 deletions src/source_inspector/strings_xgettext.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ def collect_strings(location, clean=True):
rc, result, err = command.execute(
cmd_loc="xgettext",
args=[
"--omit-header",
# this is a trick to force xgettext to return UTF-8 output
# see https://github.com/nexB/source-inspector/issues/14#issuecomment-2001893496
'--copyright-holder="ø"',
"--no-wrap",
"--extract-all",
"--from-code=UTF-8",
Expand All @@ -83,16 +85,18 @@ def collect_strings(location, clean=True):
if rc != 0:
raise Exception(open(err).read())

yield from parse_po_text(po_text=result, clean=clean)
yield from parse_po_text(po_text=result, drop_header=True, clean=clean)


def parse_po_text(po_text, clean=True):
def parse_po_text(po_text, drop_header=False, clean=True):
"""
Yield mappings of strings collected from the ``po_text`` string.
Clean strings if ``clean`` is True.
Drop the first "header" block if ``drop_header`` is True.
The po text lines look like this:
- Blocks sperated by 2 lines.
- Blocks separated by 2 lines.
- Optional first header block
- The first lines starting with #: are comments with the line numbers.
- The lines starting with #, are flags, not interesting
- We care about the lines in the middle starting with the first msgid
Expand All @@ -112,8 +116,13 @@ def parse_po_text(po_text, clean=True):
msgstr ""
"""

for chunk in po_text.split("\n\n"):
lines = chunk.splitlines(False)
blocks = po_text.split("\n\n")
if drop_header:
# drop the first block which is the header
blocks = blocks[1:]

for block in blocks:
lines = block.splitlines(False)
line_numbers = []
strings = []
for line in lines:
Expand Down
32 changes: 32 additions & 0 deletions tests/data/strings_xgettext/fdisk.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/* vi: set sw=4 ts=4: */
/*
* fdisk.c -- Partition table manipulator for Linux.
*
* Copyright (C) 1992 A. V. Le Blanc ([email protected])
* Copyright (C) 2001,2002 Vladimir Oleynik <[email protected]> (initial bb port)
*
* Licensed under GPLv2 or later, see file LICENSE in this source tree.
*/
//applet:IF_FDISK(APPLET(fdisk, BB_DIR_SBIN, BB_SUID_DROP))

//kbuild:lib-$(CONFIG_FDISK) += fdisk.o

/* Looks like someone forgot to add this to config system */
//usage:#ifndef ENABLE_FEATURE_FDISK_BLKSIZE
#include "libbb.h"
#include "unicode.h"

static void list_types(const char *const *sys);
"\x0a" "OS/2 Boot Manager",/* OS/2 Boot Manager */
"\x42" "SFS",
"\x63" "GNU HURD or SysV", /* GNU HURD or Mach or Sys V/386 (such as ISC UNIX) */
"\x80" "Old Minix", /* Minix 1.4a and earlier */
"\x81" "Minix / old Linux",/* Minix 1.4b and later */
"\x82" "Linux swap", /* also Solaris */
"\x83" "Linux",
"\x84" "OS/2 hidden C: drive",
"\x85" "Linux extended",
"\x86" "NTFS volume set",
"\x87" "NTFS volume set",
"\x8e" "Linux LVM",
"\x9f" "BSD/OS", /* BSDI */
89 changes: 89 additions & 0 deletions tests/data/strings_xgettext/fdisk.c-expected.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
{
"files": [
{
"path": "fdisk.c",
"type": "file",
"source_strings": [
{
"line_numbers": [
20
],
"string": "OS/2 Boot Manager"
},
{
"line_numbers": [
21
],
"string": "BSFS"
},
{
"line_numbers": [
22
],
"string": "cGNU HURD or SysV"
},
{
"line_numbers": [
23
],
"string": "Old Minix"
},
{
"line_numbers": [
24
],
"string": "Minix / old Linux"
},
{
"line_numbers": [
25
],
"string": "Linux swap"
},
{
"line_numbers": [
26
],
"string": "Linux"
},
{
"line_numbers": [
27
],
"string": "OS/2 hidden C: drive"
},
{
"line_numbers": [
28
],
"string": "Linux extended"
},
{
"line_numbers": [
29
],
"string": "NTFS volume set"
},
{
"line_numbers": [
30
],
"string": "NTFS volume set"
},
{
"line_numbers": [
31
],
"string": "Linux LVM"
},
{
"line_numbers": [
32
],
"string": "BSD/OS"
}
],
"scan_errors": []
}
]
}
3 changes: 3 additions & 0 deletions tests/data/strings_xgettext/fdisk.c.ABOUT
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
date: 2024-03-16
download_url: https://git.busybox.net/busybox/plain/util-linux/fdisk.c?h=1_35_stable
notes: stripped down to the few lines we care about
6 changes: 6 additions & 0 deletions tests/data/strings_xgettext/lineedit.c-expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@
],
"string": "."
},
{
"line_numbers": [
905
],
"string": "A"
},
{
"line_numbers": [
1000
Expand Down
9 changes: 9 additions & 0 deletions tests/test_symbols_xgettext.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,12 @@ def test_strings_scanner_multilines_utf8(self):

expected_loc = self.get_test_loc("lineedit.c-expected.json", must_exist=False)
check_json_scan(expected_loc, result_file, regen=REGEN_TEST_FIXTURES)

def test_strings_scanner_unicode(self):
    """
    Scan the ``fdisk.c`` test file with the ``--source-string`` option and
    compare the pretty-printed JSON results with the expected
    ``fdisk.c-expected.json`` file.
    """
    source_loc = self.get_test_loc("fdisk.c")
    json_output = self.get_temp_file("json")
    run_scan_click(["--source-string", source_loc, "--json-pp", json_output])

    # The expected file may not exist yet when fixtures are being regenerated.
    expected_json = self.get_test_loc("fdisk.c-expected.json", must_exist=False)
    check_json_scan(expected_json, json_output, regen=REGEN_TEST_FIXTURES)

0 comments on commit 812faf9

Please sign in to comment.