Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve xgettext handlings #16

Merged
merged 6 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 0 additions & 45 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,48 +27,3 @@ jobs:
sudo apt-get install universal-ctags gettext
venv/bin/pytest -n 2 -vvs

- template: etc/ci/azure-posix.yml
parameters:
job_name: macos11_cpython
image_name: macOS-11
python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
test_suites:
all: |
brew install universal-ctags gettext
venv/bin/pytest -n 2 -vvs

- template: etc/ci/azure-posix.yml
parameters:
job_name: macos12_cpython
image_name: macOS-12
python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
test_suites:
all: |
brew install universal-ctags gettext
venv/bin/pytest -n 2 -vvs

- template: etc/ci/azure-posix.yml
parameters:
job_name: macos13_cpython
image_name: macOS-13
python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
test_suites:
all: |
brew install universal-ctags gettext
venv/bin/pytest -n 2 -vvs

# - template: etc/ci/azure-win.yml
# parameters:
# job_name: win2019_cpython
# image_name: windows-2019
# python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
# test_suites:
# all: venv\Scripts\pytest -n 2 -vvs
#
# - template: etc/ci/azure-win.yml
# parameters:
# job_name: win2022_cpython
# image_name: windows-2022
# python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
# test_suites:
# all: venv\Scripts\pytest -n 2 -vvs
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ install_requires =
scancode-toolkit
plugincode
commoncode
typecode

[options.packages.find]
where = src
Expand Down
104 changes: 88 additions & 16 deletions src/source_inspector/strings_xgettext.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@
#

import logging
import string

import attr
from commoncode import command
from commoncode.cliutils import SCAN_GROUP
from commoncode.cliutils import PluggableCommandLineOption
from plugincode.scan import ScanPlugin
from plugincode.scan import scan_impl
from typecode.contenttype import Type

"""
Extract strings from source code files with xgettext.
Expand Down Expand Up @@ -55,36 +57,50 @@ def get_source_strings(location, **kwargs):
"""
Return a mapping of strings for a source file at ``location``.
"""
return dict(source_strings=list(collect_strings(location=location, strip=True)))
return dict(source_strings=list(collect_strings(location=location, clean=True)))


def collect_strings(location, strip=False):
def collect_strings(location, clean=True):
"""
Yield mappings of strings collected from file at location.
Strip strings if ``strip`` is True.
Clean strings if ``clean`` is True.
"""
if not is_xgettext_installed():
return

if not Type(location).is_source:
return

rc, result, err = command.execute(
cmd_loc="xgettext",
args=["--omit-header", "--no-wrap", "--extract-all", "--output=-", location],
args=[
# this is a trick to force getting UTF back
# see https://github.com/nexB/source-inspector/issues/14#issuecomment-2001893496
'--copyright-holder="ø"',
"--no-wrap",
"--extract-all",
"--from-code=UTF-8",
"--output=-",
location,
],
to_files=False,
)

if rc != 0:
raise Exception(open(err).read())

yield from parse_po_text(po_text=result, strip=strip)
yield from parse_po_text(po_text=result, drop_header=True, clean=clean)


def parse_po_text(po_text, strip=False):
def parse_po_text(po_text, drop_header=False, clean=True):
"""
Yield mappings of strings collected from the ``po_text`` string.
Strip strings if ``strip`` is True.
Clean strings if ``clean`` is True.
Drop the "header" first block if ``drop_header`` is True

The po text lines looks like this:
- Blocks sperated by 2 lines.
- Blocks separated by 2 lines.
- Optional first header block
- The first lines starting with #: are comments with the line numbers.
- The lines starting with #, are flags, not interesting
- We care about the lines in the middle starting with the first msgid
Expand All @@ -104,14 +120,25 @@ def parse_po_text(po_text, strip=False):
msgstr ""
"""

for chunk in po_text.split("\n\n"):
lines = chunk.splitlines(False)
blocks = po_text.split("\n\n")
if drop_header:
# drop the first block which is the header
blocks = blocks[1:]

for block in blocks:
lines = block.splitlines(False)
line_numbers = []
strings = []
for line in lines:
if line.startswith("#: "):
_, _, start_line = line.rpartition(":")
line_numbers.append(int(start_line.strip()))
# we can have either of these two forms:
# #: lineedit.c:1571 lineedit.c:1587 lineedit.c:163
# #: lineedit.c:1571
_, _, line = line.partition("#: ")
filename, _, _ = line.partition(":")
numbers = line.replace(filename + ":", "")
numbers = [int(l) for ln in numbers.split() if (l := ln.strip())]
line_numbers.extend(numbers)

elif line.startswith(
(
Expand All @@ -130,12 +157,57 @@ def parse_po_text(po_text, strip=False):
elif line.startswith('"'):
strings.append(line)

strings = [l.strip('"').replace("\\n", "\n") for l in strings]
strings = [l.strip('"') for l in strings]
string = "".join(strings)
if strip:
string = string.strip()
if clean:
string = clean_string(string)
if string:
yield dict(line_numbers=line_numbers, string=string)

yield dict(line_numbers=line_numbers, string=string)

def clean_string(s):
    """
    Return a cleaned and normalized string built from the raw string ``s``.

    The returned value is always a string; it may be empty when nothing is
    left after cleaning, in which case callers are expected to skip it.

    Cleaning steps, in order:
    - strip surrounding double quotes,
    - turn the two-character escape sequence backslash + "n" into a real newline,
    - strip surrounding whitespace,
    - remove a fixed set of non-printable characters, both when spelled out as
      an escape sequence in the text and when present as the raw character,
    - strip surrounding whitespace again, as removing non-printables can expose
      new leading or trailing whitespace.
    """
    s = s.strip('"')
    s = s.replace("\\n", "\n")
    s = s.strip()

    # Map the textual escape sequence to the raw character it stands for.
    # Named escapes first, then hex escapes for the remaining control codes.
    # Tab (0x09), newline (0x0a) and carriage return (0x0d) are kept on purpose.
    non_printables = {
        "\\a": "\a",
        "\\b": "\b",
        "\\v": "\v",
        "\\f": "\f",
    }
    for code in (*range(0x01, 0x07), *range(0x0E, 0x20), 0x7F):
        non_printables[f"\\x{code:02x}"] = chr(code)

    for plain, encoded in non_printables.items():
        s = s.replace(plain, "")
        s = s.replace(encoded, "")

    # Removing a leading/trailing non-printable may leave whitespace at the
    # edges (or a whitespace-only string): strip once more so that callers
    # testing for an empty result behave as intended.
    return s.strip()


_IS_XGETTEXT_INSTALLED = None
Expand Down
4 changes: 4 additions & 0 deletions src/source_inspector/symbols_ctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from commoncode.cliutils import PluggableCommandLineOption
from plugincode.scan import ScanPlugin
from plugincode.scan import scan_impl
from typecode.contenttype import Type

"""
Extract symbols information from source code files with ctags.
Expand Down Expand Up @@ -67,6 +68,9 @@ def collect_symbols(location):
if not is_ctags_installed():
return

if not Type(location).is_source:
return

rc, result, err = command.execute(
cmd_loc="ctags",
args=["--output-format=json", "-f", "-", location],
Expand Down
32 changes: 32 additions & 0 deletions tests/data/strings_xgettext/fdisk.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/* vi: set sw=4 ts=4: */
/*
* fdisk.c -- Partition table manipulator for Linux.
*
* Copyright (C) 1992 A. V. Le Blanc ([email protected])
* Copyright (C) 2001,2002 Vladimir Oleynik <[email protected]> (initial bb port)
*
* Licensed under GPLv2 or later, see file LICENSE in this source tree.
*/
//applet:IF_FDISK(APPLET(fdisk, BB_DIR_SBIN, BB_SUID_DROP))

//kbuild:lib-$(CONFIG_FDISK) += fdisk.o

/* Looks like someone forgot to add this to config system */
//usage:#ifndef ENABLE_FEATURE_FDISK_BLKSIZE
#include "libbb.h"
#include "unicode.h"

static void list_types(const char *const *sys);
"\x0a" "OS/2 Boot Manager",/* OS/2 Boot Manager */
"\x42" "SFS",
"\x63" "GNU HURD or SysV", /* GNU HURD or Mach or Sys V/386 (such as ISC UNIX) */
"\x80" "Old Minix", /* Minix 1.4a and earlier */
"\x81" "Minix / old Linux",/* Minix 1.4b and later */
"\x82" "Linux swap", /* also Solaris */
"\x83" "Linux",
"\x84" "OS/2 hidden C: drive",
"\x85" "Linux extended",
"\x86" "NTFS volume set",
"\x87" "NTFS volume set",
"\x8e" "Linux LVM",
"\x9f" "BSD/OS", /* BSDI */
89 changes: 89 additions & 0 deletions tests/data/strings_xgettext/fdisk.c-expected.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
{
"files": [
{
"path": "fdisk.c",
"type": "file",
"source_strings": [
{
"line_numbers": [
20
],
"string": "OS/2 Boot Manager"
},
{
"line_numbers": [
21
],
"string": "BSFS"
},
{
"line_numbers": [
22
],
"string": "cGNU HURD or SysV"
},
{
"line_numbers": [
23
],
"string": "Old Minix"
},
{
"line_numbers": [
24
],
"string": "Minix / old Linux"
},
{
"line_numbers": [
25
],
"string": "Linux swap"
},
{
"line_numbers": [
26
],
"string": "Linux"
},
{
"line_numbers": [
27
],
"string": "OS/2 hidden C: drive"
},
{
"line_numbers": [
28
],
"string": "Linux extended"
},
{
"line_numbers": [
29
],
"string": "NTFS volume set"
},
{
"line_numbers": [
30
],
"string": "NTFS volume set"
},
{
"line_numbers": [
31
],
"string": "Linux LVM"
},
{
"line_numbers": [
32
],
"string": "BSD/OS"
}
],
"scan_errors": []
}
]
}
3 changes: 3 additions & 0 deletions tests/data/strings_xgettext/fdisk.c.ABOUT
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
date: 2024-03-16
download_url: https://git.busybox.net/busybox/plain/util-linux/fdisk.c?h=1_35_stable
notes: stripped down to the few lines we care for
Loading
Loading