Skip to content

Commit

Permalink
move to quebra tokenizer (#61)
Browse files Browse the repository at this point in the history
  • Loading branch information
emphasize authored Aug 15, 2023
1 parent 574e0af commit 1a81e47
Show file tree
Hide file tree
Showing 8 changed files with 52 additions and 27 deletions.
19 changes: 8 additions & 11 deletions lingua_franca/lang/parse_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@
from collections import namedtuple
import re
import json
from lingua_franca.internal import resolve_resource_file, FunctionNotLocalizedError
import unicodedata

from quebra_frases import word_tokenize
from lingua_franca.internal import resolve_resource_file, FunctionNotLocalizedError


class Normalizer:
"""
Expand All @@ -33,11 +35,7 @@ def __init__(self, config=None):

@staticmethod
def tokenize(utterance):
# Split things like 12%
utterance = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance)
# Split things like #1
utterance = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", utterance)
return utterance.split()
return word_tokenize(utterance)

@property
def should_lowercase(self):
Expand Down Expand Up @@ -105,7 +103,7 @@ def articles(self):
@property
def symbols(self):
return self.config.get("symbols",
[";", "_", "!", "?", "<", ">",
[".", ",", ";", "_", "!", "?", "<", ">",
"|", "(", ")", "=", "[", "]", "{",
"}", "»", "«", "*", "~", "^", "`"])

Expand Down Expand Up @@ -148,9 +146,8 @@ def remove_stopwords(self, utterance):
return utterance

def remove_symbols(self, utterance):
for s in self.symbols:
utterance = utterance.replace(s, " ")
return utterance
words = self.tokenize(utterance)
return " ".join([w for w in words if w not in self.symbols])

def remove_accents(self, utterance):
for s in self.accents:
Expand All @@ -171,9 +168,9 @@ def normalize(self, utterance="", remove_articles=None):
utterance = utterance.lower()
if self.should_expand_contractions:
utterance = self.expand_contractions(utterance)
utterance = self.replace_words(utterance)
if self.should_numbers_to_digits:
utterance = self.numbers_to_digits(utterance)
utterance = self.replace_words(utterance)

# removals
if self.should_remove_symbols:
Expand Down
2 changes: 1 addition & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ python-dateutil~=2.6
rapidfuzz
colour~=0.1
webcolors
quebra_frases
quebra_frases>=0.3.7
38 changes: 34 additions & 4 deletions test/unittests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from dateutil import tz

from lingua_franca import load_language, unload_language, set_default_lang
from lingua_franca.lang.parse_common import tokenize, Token
from lingua_franca.lang.parse_common import tokenize, Token, Normalizer
from lingua_franca.parse import extract_datetime, fuzzy_match, match_one, extract_langcode, yes_or_no
from lingua_franca.time import default_timezone, now_local, set_default_tz
from lingua_franca.internal import FunctionNotLocalizedError
Expand Down Expand Up @@ -101,7 +101,7 @@ def test_match_one(self):
self.assertEqual(match_one('enry', choices)[0], 4)


class TestParseCommon(unittest.TestCase):
class TestTokenize(unittest.TestCase):
def test_tokenize(self):
self.assertEqual(tokenize('One small step for man'),
[Token('One', 0), Token('small', 1), Token('step', 2),
Expand All @@ -115,8 +115,38 @@ def test_tokenize(self):
Token('1', 3)])

self.assertEqual(tokenize('hashtag #1world'),
[Token('hashtag', 0), Token('#1world', 1)])

[Token('hashtag', 0), Token('#', 1), Token('1world', 2)])

self.assertEqual(tokenize(",;_!?<>|()=[]{}»«*~^`."),
[Token(",", 0), Token(";", 1), Token("_",2), Token("!",3),
Token("?", 4), Token("<", 5), Token(">", 6), Token("|", 7),
Token("(", 8), Token(")", 9), Token("=", 10), Token("[", 11),
Token("]", 12), Token("{", 13), Token("}", 14), Token("»", 15),
Token("«", 16), Token("*", 17), Token("~", 18), Token("^", 19),
Token("`", 20), Token(".", 21)])


class TestRemoveSymbols(unittest.TestCase):
def test_remove_symbols_empty_string(self):
self.assertEqual(Normalizer().remove_symbols(""), "")

def test_remove_symbols_no_symbols(self):
self.assertEqual(Normalizer().remove_symbols("Hello world"), "Hello world")

def test_remove_symbols_one_symbol(self):
self.assertEqual(Normalizer().remove_symbols("Hello, world?!"), "Hello world")

def test_remove_symbols_only_symbols(self):
self.assertEqual(Normalizer().remove_symbols(",;_!?<>|()=[]{}»«*~^`"), "")

def test_remove_symbols_contraction(self):
self.assertEqual(Normalizer().remove_symbols("It's sunny and warm outside."),
"It's sunny and warm outside")

def test_remove_symbols_dates(self):
self.assertEqual(Normalizer().remove_symbols("(* 15/2/2018)"),
"15/2/2018")


class TestLangcode(unittest.TestCase):
def test_parse_lang_code(self):
Expand Down
4 changes: 2 additions & 2 deletions test/unittests/test_parse_cs.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,11 +218,11 @@ def test_extract_duration_cs(self):
" sto devadesát sedm dní, a"
" tři sto 91.6 sekund"),
(timedelta(weeks=3, days=497, seconds=391.6),
"vzbuď mě za , , a"))
"vzbuď mě za , , a"))
self.assertEqual(extract_duration("film je jedna hodina, padesát sedm"
" a půl minuty dlouhý"),
(timedelta(hours=1, minutes=57.5),
"film je , dlouhý"))
"film je , dlouhý"))
self.assertEqual(extract_duration("10-sekund"),
(timedelta(seconds=10.0), ""))
self.assertEqual(extract_duration("5-minut"),
Expand Down
4 changes: 2 additions & 2 deletions test/unittests/test_parse_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,15 +454,15 @@ def test_extract_duration_de(self):
" 497 tagen und"
" 391.6 sekunden"), lang="de-de"),
(timedelta(weeks=3, days=497, seconds=391.6),
"weck mich in, und"))
"weck mich in , und"))

self.assertEqual(extract_duration("weck mich in einer viertel stunde"),
(timedelta(hours=0.25), "weck mich in"))

self.assertEqual(extract_duration(("der film ist eine stunde, fünfzehn"
" einhalb minuten lang")),
(timedelta(hours=1, minutes=15.5),
"der film ist, lang"))
"der film ist , lang"))

# wenn überhaupt wäre anstatt -sekunde -sekündig[e][ns] notwendig
self.assertEqual(extract_duration("10-sekunden", lang="de-de"),
Expand Down
6 changes: 2 additions & 4 deletions test/unittests/test_parse_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ def test_extract_duration_en(self):
" hundred ninety seven days, and"
" three hundred 91.6 seconds"),
(timedelta(weeks=3, days=497, seconds=391.6),
"wake me up in , , and"))
"wake me up in , , and"))
self.assertEqual(extract_duration("10-seconds"),
(timedelta(seconds=10.0), ""))
self.assertEqual(extract_duration("5-minutes"),
Expand All @@ -595,7 +595,7 @@ def test_extract_duration_case_en(self):
self.assertEqual(extract_duration("The movie is one hour, fifty seven"
" and a half minutes long"),
(timedelta(hours=1, minutes=57.5),
"The movie is , long"))
"The movie is , long"))
self.assertEqual(extract_duration("Four and a Half minutes until"
" sunset"),
(timedelta(minutes=4.5), "until sunset"))
Expand Down Expand Up @@ -888,8 +888,6 @@ def testExtract(text, expected_date, expected_leftover):
"2017-06-27 17:00:00", "lets meet")
testExtract("lets meet at 8 a.m.",
"2017-06-28 08:00:00", "lets meet")
testExtract("remind me to wake up at 8 a.m",
"2017-06-28 08:00:00", "remind me to wake up")
testExtract("what is the weather on tuesday",
"2017-06-27 00:00:00", "what is weather")
testExtract("what is the weather on monday",
Expand Down
2 changes: 1 addition & 1 deletion test/unittests/test_parse_nl.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def test_extract_duration_nl(self):
self.assertEqual(extract_duration("zet een timer voor 1 uur", LANG),
(timedelta(seconds=3600), "zet 1 timer voor"))
self.assertEqual(extract_duration("een treinrit van 2 uur, 17 minuten en zestien seconden", LANG),
(timedelta(seconds=8236), "1 treinrit van , en"))
(timedelta(seconds=8236), "1 treinrit van , en"))
self.assertEqual(extract_duration("een uurtje", LANG),
(timedelta(seconds=3600), ""))

Expand Down
4 changes: 2 additions & 2 deletions test/unittests/test_parse_pl.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,11 @@ def test_extract_duration_pl(self):
self.assertEqual(extract_duration("obudź mnie za 3 tygodnie, czterysta dziewięćdziesiąt siedem dni i"
" trzysta 91.6 sekund"),
(timedelta(weeks=3, days=497, seconds=391.6),
"obudź mnie za , i"))
"obudź mnie za , i"))
self.assertEqual(extract_duration("ten film trwa jedną godzinę, pięćdziesiąt siedem i pół minuty",
lang='pl-pl'),
(timedelta(hours=1, minutes=57.5),
"ten film trwa ,"))
"ten film trwa ,"))
self.assertEqual(extract_duration("10-sekund"),
(timedelta(seconds=10.0), ""))
self.assertEqual(extract_duration("5-minut"),
Expand Down

0 comments on commit 1a81e47

Please sign in to comment.