Skip to content

Commit

Permalink
extend german normalizer (#59)
Browse files Browse the repository at this point in the history
Co-authored-by: JarbasAl <[email protected]>
  • Loading branch information
emphasize and JarbasAl authored Aug 21, 2023
1 parent d8596ee commit 711c9e2
Show file tree
Hide file tree
Showing 10 changed files with 33 additions and 12 deletions.
6 changes: 3 additions & 3 deletions lingua_franca/lang/parse_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def should_expand_contractions(self):

@property
def should_remove_symbols(self):
return self.config.get("remove_symbols", False)
return self.config.get("remove_symbols", True)

@property
def should_remove_accents(self):
Expand Down Expand Up @@ -104,8 +104,8 @@ def articles(self):
def symbols(self):
return self.config.get("symbols",
[".", ",", ";", "_", "!", "?", "<", ">",
"|", "(", ")", "=", "[", "]", "{",
"}", "»", "«", "*", "~", "^", "`"])
"|", "(", ")", "=", "[", "]", "{", "}",
"»", "«", "*", "~", "^", "`", "\""])

def expand_contractions(self, utterance):
""" Expand common contractions, e.g. "isn't" -> "is not" """
Expand Down
4 changes: 4 additions & 0 deletions lingua_franca/lang/parse_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -1156,6 +1156,10 @@ class GermanNormalizer(Normalizer):
with open(resolve_resource_file("text/de-de/normalize.json")) as f:
_default_config = json.load(f)

def remove_symbols(self, utterance):
    """Split hyphenated compounds into words, then strip symbols.

    A hyphen that joins a (possibly empty) run of word characters to a
    run of letters is replaced by a single space, e.g.
    "15-Minuten-Timer" becomes "15 Minuten Timer". The de-hyphenated
    text is then handed to the parent class's ``remove_symbols``.

    Args:
        utterance: the text to process.

    Returns:
        The result of the parent ``remove_symbols`` applied to the
        de-hyphenated text.
    """
    # [^\W\d_] means "any Unicode letter". Unlike [A-Za-z] it also covers
    # ä, ö, ü and ß, so compounds such as "Extra-Übung" are split too
    # instead of silently keeping their hyphen.
    utterance = re.sub(r"\b(\w*)-([^\W\d_]+)\b", r"\1 \2", utterance)
    return super().remove_symbols(utterance)


def normalize_de(text, remove_articles=True):
    """Normalize German text with a freshly created ``GermanNormalizer``.

    Args:
        text: the text to normalize.
        remove_articles: passed through positionally to
            ``GermanNormalizer.normalize``.

    Returns:
        Whatever ``GermanNormalizer.normalize`` returns for the input.
    """
    normalizer = GermanNormalizer()
    return normalizer.normalize(text, remove_articles)
2 changes: 1 addition & 1 deletion lingua_franca/res/text/az-az/normalize.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"lowercase": false,
"numbers_to_digits": true,
"expand_contractions": true,
"remove_symbols": false,
"remove_symbols": true,
"remove_accents": false,
"remove_articles": false,
"remove_stopwords": false,
Expand Down
2 changes: 1 addition & 1 deletion lingua_franca/res/text/cs-cz/normalize.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"lowercase": false,
"numbers_to_digits": true,
"expand_contractions": true,
"remove_symbols": false,
"remove_symbols": true,
"remove_accents": false,
"remove_articles": false,
"remove_stopwords": false,
Expand Down
2 changes: 1 addition & 1 deletion lingua_franca/res/text/de-de/normalize.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"lowercase": false,
"numbers_to_digits": true,
"expand_contractions": true,
"remove_symbols": false,
"remove_symbols": true,
"remove_accents": false,
"remove_articles": false,
"remove_stopwords": false,
Expand Down
2 changes: 1 addition & 1 deletion lingua_franca/res/text/en-us/normalize.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"lowercase": false,
"numbers_to_digits": true,
"expand_contractions": true,
"remove_symbols": false,
"remove_symbols": true,
"remove_accents": false,
"remove_articles": false,
"remove_stopwords": false,
Expand Down
2 changes: 1 addition & 1 deletion lingua_franca/res/text/ru-ru/normalize.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"lowercase": false,
"numbers_to_digits": true,
"expand_contractions": true,
"remove_symbols": false,
"remove_symbols": true,
"remove_accents": false,
"remove_articles": false,
"remove_stopwords": false,
Expand Down
2 changes: 1 addition & 1 deletion lingua_franca/res/text/sl-si/normalize.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"lowercase": false,
"numbers_to_digits": true,
"expand_contractions": false,
"remove_symbols": false,
"remove_symbols": true,
"remove_accents": false,
"remove_articles": false,
"remove_stopwords": false,
Expand Down
2 changes: 1 addition & 1 deletion lingua_franca/res/text/uk-uk/normalize.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"lowercase": false,
"numbers_to_digits": true,
"expand_contractions": true,
"remove_symbols": false,
"remove_symbols": true,
"remove_accents": false,
"remove_articles": false,
"remove_stopwords": false,
Expand Down
21 changes: 19 additions & 2 deletions test/unittests/test_parse_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ def test_articles(self):
self.assertEqual(
normalize("und noch ein Test", lang="de-de", remove_articles=True),
"und noch ein Test")
self.assertEqual(normalize("dies ist der Extra-Test", lang="de-de",
self.assertEqual(normalize("dies ist der extra Test", lang="de-de",
remove_articles=False),
"dies ist der Extra-Test")
"dies ist der extra Test")

def test_spaces(self):
self.assertEqual(normalize(" dies ist ein test", lang="de-de"),
Expand Down Expand Up @@ -80,6 +80,23 @@ def test_numbers(self):
self.assertEqual(
normalize("dies ist achtzehn neunzehn zwanzig", lang="de-de"),
"dies ist 18 19 20")

def test_symbols(self):
    """Check hyphen splitting and symbol stripping for German input."""
    # Each entry: (utterance, extra keyword arguments, expected result).
    cases = [
        ("starte einen 15-Minuten-Timer", {},
         "starte einen 15 Minuten Timer"),
        ('"starte einen 15-Minuten-Timer"', {},
         'starte einen 15 Minuten Timer'),
        ("gib mir noch ein Test!", {},
         "gib mir noch ein Test"),
        ("Ist das der letzte?", {"remove_articles": False},
         "Ist das der letzte"),
    ]
    for utterance, extra, expected in cases:
        self.assertEqual(normalize(utterance, lang="de-de", **extra),
                         expected)



class TestExtractNumber(unittest.TestCase):
Expand Down

0 comments on commit 711c9e2

Please sign in to comment.