Skip to content

Commit

Permalink
fix german time parser (#65)
Browse files Browse the repository at this point in the history
  • Loading branch information
emphasize authored Sep 13, 2023
1 parent ceb864e commit e277278
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 13 deletions.
91 changes: 79 additions & 12 deletions lingua_franca/lang/parse_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,20 @@ def _extract_real_number_with_text_de(tokens, short_scale):
if not next_word or not number:
val = f"{_val-1}:{int(60*_prev_val)}"
break

# correct time format (whisper "13.30 Uhr")
if all([isinstance(_current_val, float),
next_word.lower() in ["uhr", "pm", "a.m.", "p.m."]]):
components = word.split(".")
if len(components) == 2 and \
all(map(str.isdigit, components)) and\
int(components[0]) < 25 and int(components[1]) < 60:
_hstr, _mstr = components
_mstr = _mstr.ljust(2, "0")
tokens[idx] = Token(f"{_hstr}:{_mstr}", idx)
number_words.clear()
_val = _prev_val = None
continue

# spoken decimals
if _current_val is not None and _comma:
Expand Down Expand Up @@ -413,7 +427,7 @@ def date_found():
days = ['montag', 'dienstag', 'mittwoch',
'donnerstag', 'freitag', 'samstag', 'sonntag']
months = ['januar', 'februar', 'märz', 'april', 'mai', 'juni',
'juli', 'august', 'september', 'october', 'november',
'juli', 'august', 'september', 'oktober', 'november',
'dezember']
monthsShort = ['jan', 'feb', 'mär', 'apr', 'mai', 'juni', 'juli', 'aug',
'sept', 'oct', 'nov', 'dez']
Expand Down Expand Up @@ -627,6 +641,7 @@ def date_found():
wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
wordNextNextNextNext = words[idx + 4] if idx + 4 < len(words) else ""
wordNextNextNextNextNext = words[idx + 5] if idx + 5 < len(words) else ""

# parse noon, midnight, morning, afternoon, evening
used = 0
Expand Down Expand Up @@ -702,6 +717,19 @@ def date_found():
elif nextWord in timeQualifiersList:
used += 1
timeQualifier = "am"
elif nextWord == "uhr":
used += 1
if wordNextNext in eveningQualifiers:
used += 1
timeQualifier = "pm"
elif wordNextNext in timeQualifiersList:
used += 1
timeQualifier = "am"
elif strHH.isdigit():
if int(strHH) > 12:
timeQualifier = "pm"
else:
timeQualifier = "am"
else:
# try to parse numbers without colons
# 5 hours, 10 minutes etc.
Expand All @@ -714,9 +742,6 @@ def date_found():
else:
remainder += word[i]

if remainder == "":
timeQualifier = wordNext.replace(".", "").lstrip().rstrip()

if (
remainder == "pm" or
wordNext == "pm" or
Expand Down Expand Up @@ -760,7 +785,9 @@ def date_found():
strHH = word
used += 1
isTime = True
if wordNextNext in timeQualifiersList:
if wordNextNext in timeQualifiersList or \
wordNextNextNext in timeQualifiersList \
and not is_number_de(wordNextNext):
strMM = ""
if wordNextNext[:10] == "nachmittag":
used += 1
Expand All @@ -769,6 +796,13 @@ def date_found():
"nachmittag":
used += 2
timeQualifier = "pm"
elif wordNextNext[:6] == "mittag":
used += 1
timeQualifier = "am"
elif wordNextNext == "am" and wordNextNextNext == \
"mittag":
used += 2
timeQualifier = "am"
elif wordNextNext[:5] == "abend":
used += 1
timeQualifier = "pm"
Expand All @@ -793,14 +827,27 @@ def date_found():
elif is_numeric_de(wordNextNext):
strMM = wordNextNext
used += 1
if wordNextNextNext == timeQualifier:
# STT failure "16 Uhr 30 Uhr" (common with google)
if wordNextNextNext == "uhr":
used += 1
wordNextNextNext = wordNextNextNextNext
wordNextNextNextNext = wordNextNextNextNextNext
if wordNextNextNext in timeQualifiersList or \
wordNextNextNextNext in timeQualifiersList:
if wordNextNextNext[:10] == "nachmittag":
used += 1
timeQualifier = "pm"
elif wordNextNextNext == "am" and \
wordNextNextNextNext == "nachmittag":
used += 2
timeQualifier = "pm"
elif wordNextNext[:6] == "mittag":
used += 1
timeQualifier = "am"
elif wordNextNext == "am" and wordNextNextNext == \
"mittag":
used += 2
timeQualifier = "am"
elif wordNextNextNext[:5] == "abend":
used += 1
timeQualifier = "pm"
Expand All @@ -821,8 +868,19 @@ def date_found():
timeQualifier = "pm"
else:
timeQualifier = "am"
elif strHH.isdigit():
if int(strHH) > 12:
timeQualifier = "pm"
else:
timeQualifier = "am"
elif strHH.isdigit():
if int(strHH) > 12:
timeQualifier = "pm"
else:
timeQualifier = "am"

elif wordNext in timeQualifiersList:
elif wordNext in timeQualifiersList or \
wordNextNext in timeQualifiersList:
strHH = word
strMM = 00
isTime = True
Expand All @@ -832,6 +890,13 @@ def date_found():
elif wordNext == "am" and wordNextNext == "nachmittag":
used += 2
timeQualifier = "pm"
elif wordNextNext[:6] == "mittag":
used += 1
timeQualifier = "am"
elif wordNextNext == "am" and wordNextNextNext == \
"mittag":
used += 2
timeQualifier = "am"
elif wordNext[:5] == "abend":
used += 1
timeQualifier = "pm"
Expand All @@ -851,10 +916,8 @@ def date_found():
else:
timeQualifier = "am"

# if timeQualifier != "":
# military = True
# else:
# isTime = False
if timeQualifier == "":
isTime = False

strHH = int(strHH) if strHH else 0
strMM = int(strMM) if strMM else 0
Expand Down Expand Up @@ -923,7 +986,11 @@ def date_found():
for idx, en_month in enumerate(en_monthsShort):
datestr = datestr.replace(monthsShort[idx], en_month)

temp = datetime.strptime(datestr, "%B %d")
if hasYear:
temp = datetime.strptime(datestr, "%B %d %Y")
else:
temp = datetime.strptime(datestr, "%B %d")

if extractedDate.tzinfo:
temp = temp.replace(tzinfo=extractedDate.tzinfo)

Expand Down
32 changes: 31 additions & 1 deletion test/unittests/test_parse_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,12 +364,42 @@ def testExtract(text, expected_date, expected_leftover):

testExtract("wie ist das wetter am mittwoch um 07:00",
"2017-06-28 07:00:00", "wie ist das wetter")

testExtract("wie ist das wetter am mittwoch um 07:00 Uhr",
"2017-06-28 07:00:00", "wie ist das wetter")

# STT failure (speech recognition writes the time with a dot)
testExtract("wie ist das wetter am mittwoch um 07.00 Uhr",
"2017-06-28 07:00:00", "wie ist das wetter")

# STT failure (speech recognition writes the time with a dot)
testExtract("wie ist das wetter am mittwoch um 07.30 Uhr",
"2017-06-28 07:30:00", "wie ist das wetter")

testExtract("wie ist das wetter am mittwoch um 7 uhr",
"2017-06-28 07:00:00", "wie ist das wetter")

testExtract("wie ist das wetter am mittwoch um 7 uhr 30",
"2017-06-28 07:30:00", "wie ist das wetter")

# STT failure (speech recognition repeats "uhr" after the minutes)
testExtract("wie ist das wetter am mittwoch um 7 uhr 30 uhr",
"2017-06-28 07:30:00", "wie ist das wetter")

testExtract("wie ist das wetter am mittwoch um 7:30 Uhr abends",
"2017-06-28 19:30:00", "wie ist das wetter")

testExtract("wie ist das wetter am mittwoch um 7 uhr 30 am abend",
"2017-06-28 19:30:00", "wie ist das wetter")

testExtract("wie ist das wetter am mittwoch um 5 uhr nachmittags",
"2017-06-28 17:00:00", "wie ist das wetter")

testExtract("wie ist das wetter am mittwoch um 11 uhr mittags",
"2017-06-28 11:00:00", "wie ist das wetter")

testExtract("Mache einen Termin um 12:45 pm nächsten donnerstag",
"2017-07-06 12:45:00", "mache termin")
"2017-07-06 12:45:00", "mache 1 termin")

testExtract("wie ist das wetter an diesem donnerstag?",
"2017-06-29 00:00:00", "wie ist das wetter")
Expand Down

0 comments on commit e277278

Please sign in to comment.