diff --git a/lingua_franca/lang/parse_common.py b/lingua_franca/lang/parse_common.py index e4992677..a60b0b3a 100644 --- a/lingua_franca/lang/parse_common.py +++ b/lingua_franca/lang/parse_common.py @@ -51,7 +51,7 @@ def should_expand_contractions(self): @property def should_remove_symbols(self): - return self.config.get("remove_symbols", False) + return self.config.get("remove_symbols", True) @property def should_remove_accents(self): @@ -104,8 +104,8 @@ def articles(self): def symbols(self): return self.config.get("symbols", [".", ",", ";", "_", "!", "?", "<", ">", - "|", "(", ")", "=", "[", "]", "{", - "}", "»", "«", "*", "~", "^", "`"]) + "|", "(", ")", "=", "[", "]", "{", "}", + "»", "«", "*", "~", "^", "`", "\""]) def expand_contractions(self, utterance): """ Expand common contractions, e.g. "isn't" -> "is not" """ diff --git a/lingua_franca/lang/parse_de.py b/lingua_franca/lang/parse_de.py index 0e479c83..143e4aaf 100644 --- a/lingua_franca/lang/parse_de.py +++ b/lingua_franca/lang/parse_de.py @@ -1156,6 +1156,10 @@ class GermanNormalizer(Normalizer): with open(resolve_resource_file("text/de-de/normalize.json")) as f: _default_config = json.load(f) + def remove_symbols(self, utterance): + utterance = re.sub(r"\b(\w*)-([A-Za-z]+)\b", r"\1 \2", utterance) + return super().remove_symbols(utterance) + def normalize_de(text, remove_articles=True): return GermanNormalizer().normalize(text, remove_articles) diff --git a/lingua_franca/res/text/az-az/normalize.json b/lingua_franca/res/text/az-az/normalize.json index da2f0ca7..1a7729a4 100644 --- a/lingua_franca/res/text/az-az/normalize.json +++ b/lingua_franca/res/text/az-az/normalize.json @@ -2,7 +2,7 @@ "lowercase": false, "numbers_to_digits": true, "expand_contractions": true, - "remove_symbols": false, + "remove_symbols": true, "remove_accents": false, "remove_articles": false, "remove_stopwords": false, diff --git a/lingua_franca/res/text/cs-cz/normalize.json b/lingua_franca/res/text/cs-cz/normalize.json index 9493b787..c7836ee1 100644 --- a/lingua_franca/res/text/cs-cz/normalize.json +++ b/lingua_franca/res/text/cs-cz/normalize.json @@ -2,7 +2,7 @@ "lowercase": false, "numbers_to_digits": true, "expand_contractions": true, - "remove_symbols": false, + "remove_symbols": true, "remove_accents": false, "remove_articles": false, "remove_stopwords": false, diff --git a/lingua_franca/res/text/de-de/normalize.json b/lingua_franca/res/text/de-de/normalize.json index 3ede9681..59409081 100644 --- a/lingua_franca/res/text/de-de/normalize.json +++ b/lingua_franca/res/text/de-de/normalize.json @@ -2,7 +2,7 @@ "lowercase": false, "numbers_to_digits": true, "expand_contractions": true, - "remove_symbols": false, + "remove_symbols": true, "remove_accents": false, "remove_articles": false, "remove_stopwords": false, diff --git a/lingua_franca/res/text/en-us/normalize.json b/lingua_franca/res/text/en-us/normalize.json index 3dc31d30..9ae7bf39 100644 --- a/lingua_franca/res/text/en-us/normalize.json +++ b/lingua_franca/res/text/en-us/normalize.json @@ -2,7 +2,7 @@ "lowercase": false, "numbers_to_digits": true, "expand_contractions": true, - "remove_symbols": false, + "remove_symbols": true, "remove_accents": false, "remove_articles": false, "remove_stopwords": false, diff --git a/lingua_franca/res/text/ru-ru/normalize.json b/lingua_franca/res/text/ru-ru/normalize.json index 49bb02b3..b7322d99 100644 --- a/lingua_franca/res/text/ru-ru/normalize.json +++ b/lingua_franca/res/text/ru-ru/normalize.json @@ -2,7 +2,7 @@ "lowercase": false, "numbers_to_digits": true, "expand_contractions": true, - "remove_symbols": false, + "remove_symbols": true, "remove_accents": false, "remove_articles": false, "remove_stopwords": false, diff --git a/lingua_franca/res/text/sl-si/normalize.json b/lingua_franca/res/text/sl-si/normalize.json index a0892fda..2ee1d828 100644 --- a/lingua_franca/res/text/sl-si/normalize.json +++ b/lingua_franca/res/text/sl-si/normalize.json @@ -2,7 +2,7 @@ "lowercase": false, "numbers_to_digits": true, "expand_contractions": false, - "remove_symbols": false, + "remove_symbols": true, "remove_accents": false, "remove_articles": false, "remove_stopwords": false, diff --git a/lingua_franca/res/text/uk-uk/normalize.json b/lingua_franca/res/text/uk-uk/normalize.json index d8339dc1..aa13d2c2 100644 --- a/lingua_franca/res/text/uk-uk/normalize.json +++ b/lingua_franca/res/text/uk-uk/normalize.json @@ -2,7 +2,7 @@ "lowercase": false, "numbers_to_digits": true, "expand_contractions": true, - "remove_symbols": false, + "remove_symbols": true, "remove_accents": false, "remove_articles": false, "remove_stopwords": false, diff --git a/test/unittests/test_parse_de.py b/test/unittests/test_parse_de.py index 4d8c0cab..92289667 100644 --- a/test/unittests/test_parse_de.py +++ b/test/unittests/test_parse_de.py @@ -45,9 +45,9 @@ def test_articles(self): self.assertEqual( normalize("und noch ein Test", lang="de-de", remove_articles=True), "und noch ein Test") - self.assertEqual(normalize("dies ist der Extra-Test", lang="de-de", + self.assertEqual(normalize("dies ist der extra Test", lang="de-de", remove_articles=False), - "dies ist der Extra-Test") + "dies ist der extra Test") def test_spaces(self): self.assertEqual(normalize(" dies ist ein test", lang="de-de"), @@ -80,6 +80,23 @@ def test_numbers(self): self.assertEqual( normalize("dies ist achtzehn neunzehn zwanzig", lang="de-de"), "dies ist 18 19 20") + + def test_symbols(self): + self.assertEqual( + normalize("starte einen 15-Minuten-Timer", lang="de-de"), + "starte einen 15 Minuten Timer") + + self.assertEqual( + normalize('"starte einen 15-Minuten-Timer"', lang="de-de"), + 'starte einen 15 Minuten Timer') + + self.assertEqual(normalize("gib mir noch ein Test!", lang="de-de"), + "gib mir noch ein Test") + + self.assertEqual(normalize("Ist das der letzte?", lang="de-de", + remove_articles=False), + "Ist das der letzte") + class TestExtractNumber(unittest.TestCase):