From 645d3a3d0f5e3085fe6d487b735a70a2be224d83 Mon Sep 17 00:00:00 2001 From: NeonMariia <95533543+NeonMariia@users.noreply.github.com> Date: Fri, 7 Apr 2023 02:09:00 +0200 Subject: [PATCH] Format uk (#50) Co-authored-by: NeonDmitry Co-authored-by: Daniel McKnight <34697904+NeonDaniel@users.noreply.github.com> Co-authored-by: Swen Gross Co-authored-by: ChanceNCounter --- lingua_franca/format.py | 7 + lingua_franca/internal.py | 7 +- lingua_franca/lang/common_data_ru.py | 6 + lingua_franca/lang/common_data_uk.py | 347 ++++ lingua_franca/lang/format_uk.py | 504 +++++ lingua_franca/lang/parse_common.py | 10 +- lingua_franca/lang/parse_uk.py | 1806 +++++++++++++++++ lingua_franca/res/text/uk-uk/and.word | 3 + lingua_franca/res/text/uk-uk/date_time.json | 149 ++ .../res/text/uk-uk/date_time_test.json | 22 + lingua_franca/res/text/uk-uk/day.word | 1 + lingua_franca/res/text/uk-uk/days.word | 1 + lingua_franca/res/text/uk-uk/hour.word | 1 + lingua_franca/res/text/uk-uk/hours.word | 1 + lingua_franca/res/text/uk-uk/minute.word | 1 + lingua_franca/res/text/uk-uk/minutes.word | 1 + lingua_franca/res/text/uk-uk/normalize.json | 74 + lingua_franca/res/text/uk-uk/or.word | 1 + lingua_franca/res/text/uk-uk/second.word | 1 + lingua_franca/res/text/uk-uk/seconds.word | 1 + lingua_franca/res/text/uk-uk/yesno.json | 36 + test/unittests/test_format_ru.py | 2 + test/unittests/test_format_uk.py | 581 ++++++ test/unittests/test_parse_ru.py | 1 + 24 files changed, 3558 insertions(+), 6 deletions(-) create mode 100644 lingua_franca/lang/common_data_uk.py create mode 100644 lingua_franca/lang/format_uk.py create mode 100644 lingua_franca/lang/parse_uk.py create mode 100644 lingua_franca/res/text/uk-uk/and.word create mode 100644 lingua_franca/res/text/uk-uk/date_time.json create mode 100644 lingua_franca/res/text/uk-uk/date_time_test.json create mode 100644 lingua_franca/res/text/uk-uk/day.word create mode 100644 lingua_franca/res/text/uk-uk/days.word create mode 100644 
lingua_franca/res/text/uk-uk/hour.word create mode 100644 lingua_franca/res/text/uk-uk/hours.word create mode 100644 lingua_franca/res/text/uk-uk/minute.word create mode 100644 lingua_franca/res/text/uk-uk/minutes.word create mode 100644 lingua_franca/res/text/uk-uk/normalize.json create mode 100644 lingua_franca/res/text/uk-uk/or.word create mode 100644 lingua_franca/res/text/uk-uk/second.word create mode 100644 lingua_franca/res/text/uk-uk/seconds.word create mode 100644 lingua_franca/res/text/uk-uk/yesno.json create mode 100644 test/unittests/test_format_uk.py diff --git a/lingua_franca/format.py b/lingua_franca/format.py index 994a996e..eeab9f40 100644 --- a/lingua_franca/format.py +++ b/lingua_franca/format.py @@ -165,6 +165,9 @@ def _format_string(self, number, format_section, lang): def _decade_format(self, number, number_tuple, lang): s = self._format_string(number % 100, 'decade_format', lang) + decade = s.format(x=number_tuple.x, xx=number_tuple.xx, + x0=number_tuple.x0, x_in_x0=number_tuple.x_in_x0, + number=str(number % 100)) return s.format(x=number_tuple.x, xx=number_tuple.xx, x0=number_tuple.x0, x_in_x0=number_tuple.x_in_x0, number=str(number % 100)) @@ -172,6 +175,10 @@ def _decade_format(self, number, number_tuple, lang): def _number_format_hundreds(self, number, number_tuple, lang, formatted_decade): s = self._format_string(number % 1000, 'hundreds_format', lang) + hundreds = s.format(xxx=number_tuple.xxx, x00=number_tuple.x00, + x_in_x00=number_tuple.x_in_x00, + formatted_decade=formatted_decade, + number=str(number % 1000)) return s.format(xxx=number_tuple.xxx, x00=number_tuple.x00, x_in_x00=number_tuple.x_in_x00, formatted_decade=formatted_decade, diff --git a/lingua_franca/internal.py b/lingua_franca/internal.py index 497c0db1..dd20c8cb 100644 --- a/lingua_franca/internal.py +++ b/lingua_franca/internal.py @@ -10,13 +10,13 @@ _SUPPORTED_LANGUAGES = ("az", "ca", "cs", "da", "de", "en", "es", "fr", "hu", - "it", "nl", "pl", "pt", "ru", "sl", 
"sv", "fa", "eu") + "it", "nl", "pl", "pt", "ru", "sl", "sv", "fa", "eu", "uk") _SUPPORTED_FULL_LOCALIZATIONS = ("az-az", "ca-es", "cs-cz", "da-dk", "de-de", "en-au", "en-us", "es-es", "fr-fr", "hu-hu", "it-it", "nl-nl", "pl-pl", "fa-ir", "pt-pt", "ru-ru", "sl-si", - "sv-se", "tr-tr", "eu-eu") + "sv-se", "tr-tr", "eu-eu", "uk-uk") _DEFAULT_FULL_LANG_CODES = {'az': 'az-az', 'ca': 'ca-es', @@ -36,7 +36,8 @@ 'ru': 'ru-ru', 'sl': 'sl-si', 'sv': 'sv-se', - 'tr': 'tr-tr'} + 'tr': 'tr-tr', + 'uk': 'uk-uk'} __default_lang = None __active_lang_code = None diff --git a/lingua_franca/lang/common_data_ru.py b/lingua_franca/lang/common_data_ru.py index dfa795fe..aebf957a 100644 --- a/lingua_franca/lang/common_data_ru.py +++ b/lingua_franca/lang/common_data_ru.py @@ -292,6 +292,12 @@ 'минута': 'minutes', 'минуты': 'minutes', 'минут': 'minutes', + 'година': 'hours', + 'годин': 'hours', + 'години': 'hours', + 'годиною': 'hours', + 'годинами': 'hours', + 'годині': 'hours', 'час': 'hours', 'часа': 'hours', 'часов': 'hours', diff --git a/lingua_franca/lang/common_data_uk.py b/lingua_franca/lang/common_data_uk.py new file mode 100644 index 00000000..71b43933 --- /dev/null +++ b/lingua_franca/lang/common_data_uk.py @@ -0,0 +1,347 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from collections import OrderedDict + + +_NUM_STRING_UK = { + 0: "нуль", + 1: "один", + 2: "два", + 3: "три", + 4: "чотири", + 5: "п'ять", + 6: "шість", + 7: "сім", + 8: "вісім", + 9: "дев'ять", + 10: "десять", + 11: "одинадцять", + 12: "дванадцять", + 13: "тринадцять", + 14: "чотирнадцять", + 15: "п'ятнадцять", + 16: "шістнадцять", + 17: "сімнадцять", + 18: "вісімнадцять", + 19: "дев'ятнадцять", + 20: "двадцять", + 30: "тридцять", + 40: "сорок", + 50: "п'ятдесят", + 60: "шістдесят", + 70: "сімдесят", + 80: "вісімдесят", + 90: "дев'яносто", + 100: "сто", + 200: "двісті", + 300: "триста", + 400: "чотириста", + 500: "п'ятсот", + 600: "шістсот", + 700: "сімсот", + 800: "вісімсот", + 900: "дев'ятсот" +} + +_PLURALS = { + 'двох': 2, 'двум': 2, 'двома': 2, 'дві': 2, "двоє": 2, "двійка": 2, + 'обидва': 2, 'обидвох': 2, 'обидві': 2, 'обох': 2, 'обома': 2, 'обом': 2, + 'пара': 2, 'пари': 2, 'парою': 2, 'парами': 2, 'парі': 2, 'парах': 2, 'пару': 2, + 'трьох': 3, 'трьома': 3, 'трьом': 3, + 'чотирьох': 4, 'чотирьом': 4, 'чотирма': 4, + "п'ятьох": 5, "п'ятьом": 5, "п'ятьома": 5, + "шістьом": 6, "шести": 6, "шістьох": 6, "шістьма": 6, "шістьома": 6, + "семи": 7, "сімом": 7, "сімох": 7, "сімома": 7, "сьома": 7, + "восьми": 8, "вісьмох": 8, "вісьмом": 8, "вісьма": 8, "вісьмома": 8, + "дев'яти": 9, "дев'ятьох": 9, "дев'ятьом": 9, "дев'ятьма": 9, + "десяти": 10, "десятьох": 10, "десятьма": 10, "десятьома": 10, + "сорока": 40, + "сот": 100, "сотень": 100, "сотні": 100, "сотня": 100, + "двохсот": 200, "двомстам": 200, "двомастами": 200, "двохстах": 200, + "тисяч": 1000, "тисячі": 1000, "тисячу": 1000, "тисячах": 1000, + "тисячами": 1000, "тисячею": 1000 + } + + +_FRACTION_STRING_UK = { + 2: "друга", + 3: "третя", + 4: "четверта", + 5: "п'ята", + 6: "шоста", + 7: "сьома", + 8: "восьма", + 9: "дев'ята", + 10: "десята", + 11: "одинадцята", + 12: "дванадцята", + 13: "тринадцята", + 14: "чотирнадцята", + 15: "п'ятнадцята", + 16: "шістнадцята", + 17: "сімнадцята", + 18: "вісімнадцята", 
+ 19: "дев'ятнадцята", + 20: "двадцята", + 30: "тридцята", + 40: "сорокова", + 50: "п'ятдесята", + 60: "шістдесята", + 70: "сімдесята", + 80: "вісімдесята", + 90: "дев'яноста", + 1e2: "сота", + 1e3: "тисячна", + 1e6: "мільйонна", + 1e9: "мільярдна", + 1e-12: "більйонна", +} + + +_SHORT_SCALE_UK = OrderedDict([ + (1e3, "тисяча"), + (1e6, "мільйон"), + (1e9, "мільярд"), + (1e18, "трильйон"), + (1e12, "більйон"), + (1e15, "квадрилліон"), + (1e18, "квінтиліон"), + (1e21, "секстильйон"), + (1e24, "септилліон"), + (1e27, "октиліон"), + (1e30, "нонільйон"), + (1e33, "дециліон"), + (1e36, "ундеціліон"), + (1e39, "дуодециліон"), + (1e42, "тредециліон"), + (1e45, "кваттордециліон"), + (1e48, "квіндециліон"), + (1e51, "сексдециліон"), + (1e54, "септендециліон"), + (1e57, "октодециліон"), + (1e60, "новемдециліон"), + (1e63, "вігінтильйон"), + (1e66, "унвігінтільйон"), + (1e69, "дуовігінтильйон"), + (1e72, "тревігінтильйон"), + (1e75, "кватторвігінтільйон"), + (1e78, "квінвігінтильйон"), + (1e81, "секснвігінтіліон"), + (1e84, "септенвігінтильйон"), + (1e87, "октовігінтиліон"), + (1e90, "новемвігінтільйон"), + (1e93, "тригінтильйон"), +]) + + +_LONG_SCALE_UK = OrderedDict([ + (1e3, "тисяча"), + (1e6, "мільйон"), + (1e9, "мільярд"), + (1e12, "більйон"), + (1e15, "біліард"), + (1e18, "трильйон"), + (1e21, "трильярд"), + (1e24, "квадрилліон"), + (1e27, "квадрільярд"), + (1e30, "квінтиліон"), + (1e33, "квінтільярд"), + (1e36, "секстильйон"), + (1e39, "секстильярд"), + (1e42, "септилліон"), + (1e45, "септільярд"), + (1e48, "октиліон"), + (1e51, "октільярд"), + (1e54, "нонільйон"), + (1e57, "нонільярд"), + (1e60, "дециліон"), + (1e63, "дециліард"), + (1e66, "ундеціліон"), + (1e72, "дуодециліон"), + (1e78, "тредециліон"), + (1e84, "кваттордециліон"), + (1e90, "квіндециліон"), + (1e96, "сексдециліон"), + (1e102, "септендециліон"), + (1e108, "октодециліон"), + (1e114, "новемдециліон"), + (1e120, "вігінтильйон"), +]) + + +_ORDINAL_BASE_UK = { + 1: "перший", + 2: "другий", + 3: "третій", + 
4: "четвертий", + 5: "п'ятий", + 6: "шостий", + 7: "сьомий", + 8: "восьмий", + 9: "дев'ятий", + 10: "десятий", + 11: "одинадцятий", + 12: "дванадцятий", + 13: "тринадцятий", + 14: "чотирнадцятий", + 15: "п'ятнадцятий", + 16: "шістнадцятий", + 17: "сімнадцятий", + 18: "вісімнадцятий", + 19: "дев'ятнадцятий", + 20: "двадцятий", + 30: "тридцятий", + 40: "сороковий", + 50: "п'ятдесятий", + 60: "шістдесятий", + 70: "сімдесятий", + 80: "вісімдесятий", + 90: "дев'яностий", + 1e2: "сотий", + 2e2: "двохсотий", + 3e2: "трьохсотий", + 4e2: "чотирисотий", + 5e2: "п'ятисотий", + 6e2: "шістсотий", + 7e2: "семисотий", + 8e2: "восьмисотий", + 9e2: "дев'ятисотий", + 1e3: "тисячний" +} + + +_SHORT_ORDINAL_UK = { + 1e6: "мільйон", + 1e9: "мільярд", + 1e18: "трильйон", + 1e15: "квадрилліон", + 1e18: "квінтильйон", + 1e21: "секстильйон", + 1e24: "септилліон", + 1e27: "октиліон", + 1e30: "нонільйон", + 1e33: "дециліон", + 1e36: "ундеціліон", + 1e39: "дуодециліон", + 1e42: "тредециліон", + 1e45: "кваттордециліон", + 1e48: "квіндециліон", + 1e51: "сексдециліон", + 1e54: "септендециліон", + 1e57: "октодециліон", + 1e60: "новемдециліон", + 1e63: "вігінтильйон" +} +_SHORT_ORDINAL_UK.update(_ORDINAL_BASE_UK) + + +_LONG_ORDINAL_UK = { + 1e6: "мільйон", + 1e9: "мільярд", + 1e12: "більйон", + 1e15: "біліард", + 1e18: "трильйон", + 1e21: "трильярд", + 1e24: "квадрилліон", + 1e27: "квадрильярд", + 1e30: "квінтиліон", + 1e33: "квінтільярд", + 1e36: "секстильйон", + 1e39: "секстильярд", + 1e42: "септилліон", + 1e45: "септільярд", + 1e48: "октиліон", + 1e51: "октільярд", + 1e54: "нонільйон", + 1e57: "нонільярд", + 1e60: "дециліон", + 1e63: "дециліард", + 1e66: "ундеціліон", + 1e72: "дуодециліон", + 1e78: "тредециліон", + 1e84: "кваттордециліон", + 1e90: "квіндециліон", + 1e96: "сексдециліон", + 1e102: "септендециліон", + 1e108: "октодециліон", + 1e114: "новемдециліон", + 1e120: "вігінтильйон" +} +_LONG_ORDINAL_UK.update(_ORDINAL_BASE_UK) + +# hours +HOURS_UK = { + 1: 'перша', + 2: 'друга', + 3: 
'третя', + 4: 'четверта', + 5: "п'ята", + 6: 'шоста', + 7: 'сьома', + 8: 'восьма', + 9: "дев'ята", + 10: 'десята', + 11: 'одинадцята', + 12: 'дванадцята' + } +# Months + +_MONTHS_CONVERSION = { + 0: "january", + 1: "february", + 2: "march", + 3: "april", + 4: "may", + 5: "june", + 6: "july", + 7: "august", + 8: "september", + 9: "october", + 10: "november", + 11: "december" +} + +_MONTHS_UK = ["січень", "лютий", "березень", "квітень", "травень", "червень", + "липень", "серпень", "вересень", "жовтень", "листопад", + "грудень"] + +# Time +_TIME_UNITS_CONVERSION = { + "мікросекунд": "microseconds", + "мілісекунд": "milliseconds", + "секунда": "seconds", + "секунди": "seconds", + "секунд": "seconds", + "секунду": "seconds", + "хвилина": "minutes", + "хвилини": "minutes", + "хвилин": "minutes", + "хвилину": "minutes", + "година": "hours", + "годин": "hours", + "години": "hours", + "годину": "hours", + "годинами": "hours", + "годиною": "hours", + "днів": "days", + "день": "days", + "дні": "days", + "дня": "days", + "тиждень": "weeks", + "тижня": "weeks", + "тижні": "weeks", + "тижнів": "weeks" +} diff --git a/lingua_franca/lang/format_uk.py b/lingua_franca/lang/format_uk.py new file mode 100644 index 00000000..588cc0bf --- /dev/null +++ b/lingua_franca/lang/format_uk.py @@ -0,0 +1,504 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from lingua_franca.lang.format_common import convert_to_mixed_fraction +from lingua_franca.lang.common_data_uk import _NUM_STRING_UK, \ + _FRACTION_STRING_UK, _LONG_SCALE_UK, _SHORT_SCALE_UK, \ + _SHORT_ORDINAL_UK, _LONG_ORDINAL_UK, HOURS_UK +from lingua_franca.internal import FunctionNotLocalizedError + + +def nice_number_uk(number, speech=True, denominators=range(1, 21)): + """ Ukrainian helper for nice_number + + This function formats a float to human understandable functions. Like + 4.5 becomes "4 and a half" for speech and "4 1/2" for text + + Args: + number (int or float): the float to format + speech (bool): format for speech (True) or display (False) + denominators (iter of ints): denominators to use, default [1 .. 20] + Returns: + (str): The formatted string. + """ + + result = convert_to_mixed_fraction(number, denominators) + if not result: + # Give up, just represent as a 3 decimal number + return str(round(number, 3)) + + whole, num, den = result + + if not speech: + if num == 0: + # TODO: Number grouping? E.g. 
"1,000,000" + return str(whole) + else: + return '{} {}/{}'.format(whole, num, den) + + if num == 0: + return str(whole) + den_str = _FRACTION_STRING_UK[den] + + if whole == 0: + # if num == 1 and den <= 4: + # return_string = '{}'.format(den_str) + # else: + return_string = '{} {}'.format(num, den_str) + elif num == 1 and den == 2: + return_string = '{} з половиною'.format(whole) + else: + return_string = '{} і {} {}'.format(whole, num, den_str) + if 2 <= den <= 4: + if 2 <= num <= 4: + return_string = return_string[:-1] + 'і' + elif num > 4: + return_string = return_string[:-1] + 'ій' + elif den >= 5: + if 2 <= num <= 4: + return_string = return_string[:-1] + 'і' + elif num > 4: + return_string = return_string[:-1] + 'их' + + return return_string + + +def pronounce_number_uk(number, places=2, short_scale=True, scientific=False, + ordinals=False): + """ + Convert a number to it's spoken equivalent + + For example, '5.2' would return 'five point two' + + Args: + number(float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + short_scale (bool) : use short (True) or long scale (False) + https://en.wikipedia.org/wiki/Names_of_large_numbers + scientific (bool): pronounce in scientific notation + ordinals (bool): pronounce in ordinal form "first" instead of "one" + Returns: + (str): The pronounced number + """ + num = number + # deal with infinity + if num == float("inf"): + return "нескінченність" + elif num == float("-inf"): + return "мінус нескінченність" + if scientific: + number = '%E' % num + n, power = number.replace("+", "").split("E") + power = int(power) + if power != 0: + if ordinals: + # This handles negative powers separately from the normal + # handling since each call disables the scientific flag + if float(n) < 0: + first_part = 'мінус ' + pronounce_number_uk( + abs(float(n)), places, short_scale, False, ordinals=True) + else: + first_part = pronounce_number_uk( + abs(float(n)), places, short_scale, False, 
ordinals=True) + + if power < 0: + second_part = 'мінус ' + pronounce_number_uk( + abs(power), places, short_scale, False, ordinals=True) + else: + second_part = pronounce_number_uk( + abs(power), places, short_scale, False, ordinals=True) + if second_part.endswith('ий'): + second_part = second_part[:-2] + 'ому' + + return '{} на десять у {} ступені'.format( + first_part, second_part) + else: + # This handles negative powers separately from the normal + # handling since each call disables the scientific flag + return '{}{} на десять у ступені {}{}'.format( + 'мінус ' if float(n) < 0 else '', + pronounce_number_uk( + abs(float(n)), places, short_scale, False, ordinals=False), + 'мінус ' if power < 0 else '', + pronounce_number_uk(abs(power), places, short_scale, False, ordinals=False)) + + if short_scale: + number_names = _NUM_STRING_UK.copy() + number_names.update(_SHORT_SCALE_UK) + else: + number_names = _NUM_STRING_UK.copy() + number_names.update(_LONG_SCALE_UK) + + digits = [number_names[n] for n in range(0, 20)] + + tens = [number_names[n] for n in range(10, 100, 10)] + + if short_scale: + hundreds = [_SHORT_SCALE_UK[n] for n in _SHORT_SCALE_UK.keys()] + else: + hundreds = [_LONG_SCALE_UK[n] for n in _LONG_SCALE_UK.keys()] + + # deal with negative numbers + result = "" + if num < 0: + result = "мінус " + num = abs(num) + + # check for a direct match + if num in number_names and not ordinals: + result += number_names[num] + else: + def _sub_thousand(n, ordinals=False): + assert 0 <= n <= 999 + if n in _SHORT_ORDINAL_UK and ordinals: + return _SHORT_ORDINAL_UK[n] + if n <= 19: + return digits[n] + elif n <= 99: + q, r = divmod(n, 10) + return tens[q - 1] + (" " + _sub_thousand(r, ordinals) if r + else "") + else: + q, r = divmod(n, 100) + return _NUM_STRING_UK[q * 100] + (" " + _sub_thousand(r, ordinals) if r else "") + + def _short_scale(n): + if n > max(_SHORT_SCALE_UK.keys()): + return "нескінченність" + ordi = ordinals + + if int(n) != n: + ordi = False + n = 
int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000)): + if not z: + continue + number = _sub_thousand(z, not i and ordi) + if i: + if i >= len(hundreds): + return "" + if ordi: + if i * 1000 in _SHORT_ORDINAL_UK: + if z == 1: + number = _SHORT_ORDINAL_UK[i * 1000] + else: + if z > 5: + number = number[:-1] + "и" + number += _SHORT_ORDINAL_UK[i * 1000] + else: + if n not in _SHORT_SCALE_UK: + num = int("1" + "0" * (len(str(n)) // 3 * 3)) + + if number[-3:] == "два": + number = number[:-1] + "ох" + elif number[-2:] == "ри" or number[-2:] == "ре": + number = number[:-1] + "ьох" + elif number[-1:] == "ь": + number = number[:-1] + "и" + + if _SHORT_SCALE_UK[num].endswith('н'): + number += _SHORT_SCALE_UK[num] + "ний" + else: + number += _SHORT_SCALE_UK[num] + "ий" + else: + if _SHORT_SCALE_UK[n].endswith('н'): + number = _SHORT_SCALE_UK[n] + "ний" + else: + number = _SHORT_SCALE_UK[n] + "ий" + elif z == 1: + number = hundreds[i - 1] + else: + if i == 1: + if z % 10 == 1 and z % 100 // 10 != 1: + number = number[:-2] + "на" + elif z % 10 == 2 and z % 100 // 10 != 1: + number = number[:-1] + "і" + number += " " + plural_uk(z, "тисяча", "тисячі", "тисяч") + elif 1 <= z % 10 <= 4 and z % 100 // 10 != 1: + number += " " + hundreds[i - 1] + "а" + else: + number += " " + hundreds[i - 1] + "ів" + + res.append(number) + ordi = False + + return " ".join(reversed(res)) + + def _split_by(n, split=1000): + assert 0 <= n + res = [] + while n: + n, r = divmod(n, split) + res.append(r) + return res + + def _long_scale(n): + if n >= max(_LONG_SCALE_UK.keys()): + return "нескінченність" + ordi = ordinals + if int(n) != n: + ordi = False + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000000)): + if not z: + continue + number = pronounce_number_uk(z, places, True, scientific, + ordinals=ordi and not i) + # strip off the comma after the thousand + if i: + if i >= len(hundreds): + return "" + # plus one as we skip 'thousand' + # (and 
'hundred', but this is excluded by index value) + number = number.replace(',', '') + + if ordi: + if (i + 1) * 1000000 in _LONG_ORDINAL_UK: + if z == 1: + number = _LONG_ORDINAL_UK[ + (i + 1) * 1000000] + else: + number += _LONG_ORDINAL_UK[ + (i + 1) * 1000000] + else: + if n not in _LONG_SCALE_UK: + num = int("1" + "0" * (len(str(n)) // 3 * 3)) + + if number[-3:] == "два": + number = number[:-1] + "ох" + elif number[-2:] == "ри" or number[-2:] == "ре": + number = number[:-1] + "ьох" + elif number[-1:] == "ь": + number = number[:-1] + "и" + + number += _LONG_SCALE_UK[num] + "ний" + else: + number = " " + _LONG_SCALE_UK[n] + "ний" + elif z == 1: + number = hundreds[i] + elif z <= 4: + number += " " + hundreds[i] + "а" + else: + number += " " + hundreds[i] + "ів" + + res.append(number) + return " ".join(reversed(res)) + + if short_scale: + result += _short_scale(num) + else: + result += _long_scale(num) + + # deal with scientific notation unpronounceable as number + if not result and "e" in str(num): + return pronounce_number_uk(num, places, short_scale, scientific=True) + # Deal with fractional part + elif not num == int(num) and places > 0: + if abs(num) < 1.0 and (result == "мінус " or not result): + result += "нуль" + result += " крапка" + _num_str = str(num) + _num_str = _num_str.split(".")[1][0:places] + for char in _num_str: + result += " " + number_names[int(char)] + return result + + +def nice_time_uk(dt, speech=True, use_24hour=True, use_ampm=False): + """ + Format a time to a comfortable human format + For example, generate 'five thirty' for speech or '5:30' for + text display. + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. 
"03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M") + if dt.hour < 4: + string += " ночі" + elif dt.hour < 12: + string += " ранку" + elif dt.hour < 18: + string += " дня" + else: + string += " вечора" + else: + # e.g. "3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + if not speech: + return string + + # Generate a speakable version of the time + if use_24hour: + speak = "" + + # Either "0 8 hundred" or "13 hundred" + if string[0] == '0': + speak = pronounce_hour_uk(int(string[0])) + if not speak: + speak = pronounce_number_uk(int(string[0]))+' ' + speak += pronounce_number_uk(int(string[1])) + else: + speak = pronounce_hour_uk(int(string[0:2])) + if speak == None: + speak = pronounce_number_uk(int(string[0:2])) + + speak += " " + if string[3:5] == '00': + speak += "рівно" + else: + if string[3] == '0': + speak += pronounce_number_uk(0) + " " + speak += pronounce_number_uk(int(string[4])) + else: + speak += pronounce_number_uk(int(string[3:5])) + return speak + else: + if dt.hour == 0 and dt.minute == 0: + return "опівночі" + elif dt.hour == 12 and dt.minute == 0: + return "опівдні" + + hour = dt.hour % 12 or 12 # 12 hour clock and 0 is spoken as 12 + if dt.minute == 15: + speak = "чверть після " + pronounce_hour_genitive_uk(hour) + elif dt.minute == 30: + speak = "половина після " + pronounce_hour_genitive_uk(hour) + elif dt.minute == 45: + next_hour = (dt.hour + 1) % 12 or 12 + speak = "без четверті " + pronounce_hour_uk(next_hour) + else: + speak = pronounce_hour_uk(hour) + + if dt.minute == 0: + if not use_ampm: + if dt.hour % 12 == 1: + return speak + return speak + " " + plural_uk(dt.hour % 12, "година", "години", "годин") + else: + if dt.minute < 10: + speak += " нуль" + speak += " " + pronounce_number_uk(dt.minute) + + if use_ampm: + if dt.hour < 4: + speak += " ночі" + elif dt.hour < 12: + speak += " 
ранку" + elif dt.hour < 18: + speak += " дня" + else: + speak += " вечора" + return speak + + +def nice_duration_uk(duration, speech=True): + """ Convert duration to a nice spoken timespan + + Args: + seconds: number of seconds + minutes: number of minutes + hours: number of hours + days: number of days + Returns: + str: timespan as a string + """ + + if not speech: + raise FunctionNotLocalizedError + + days = int(duration // 86400) + hours = int(duration // 3600 % 24) + minutes = int(duration // 60 % 60) + seconds = int(duration % 60) + + out = '' + + if days > 0: + out += pronounce_number_uk(days) + out += " " + plural_uk(days, "день", "дня", "днів") + if hours > 0: + if out: + out += " " + out += pronounce_number_feminine_uk(hours) + if out == 'один': + out = 'одна' + out += " " + plural_uk(hours, "година", "години", "годин") + if minutes > 0: + if out: + out += " " + out += pronounce_number_feminine_uk(minutes) + out += " " + plural_uk(minutes, "хвилина", "хвилини", "хвилин") + if seconds > 0: + if out: + out += " " + out += pronounce_number_feminine_uk(seconds) + out += " " + plural_uk(seconds, "секунда", "секунди", "секунд") + + return out + + +def pronounce_hour_uk(num): + if num in HOURS_UK.keys(): + return HOURS_UK[num] + ' година' + + +def pronounce_hour_genitive_uk(num): + if num in HOURS_UK.keys(): + if num == 3: + gen_hour = HOURS_UK[num][:-1]+'ьої' + else: + gen_hour = HOURS_UK[num][:-1]+'ої' + return gen_hour + ' години' + + +def pronounce_number_feminine_uk(num): + pronounced = pronounce_number_uk(num) + num %= 100 + if num % 10 == 1 and num // 10 != 1: + return pronounced[:-2] + "на" + elif num % 10 == 2 and num // 10 != 1: + return pronounced[:-1] + "і" + + return pronounced + + +def plural_uk(num: int, one: str, few: str, many: str): + num %= 100 + if num // 10 == 1: + return many + if num % 10 == 1: + return one + if 2 <= num % 10 <= 4: + return few + return many diff --git a/lingua_franca/lang/parse_common.py 
b/lingua_franca/lang/parse_common.py index 97cf5be7..afd8aa2f 100644 --- a/lingua_franca/lang/parse_common.py +++ b/lingua_franca/lang/parse_common.py @@ -200,11 +200,15 @@ def match_yes_or_no(text, lang): with open(resource_file) as f: words = json.load(f) words = {k: [_.lower() for _ in v] for k, v in words.items()} - - text = unicodedata.normalize('NFD', text) \ - .encode('ascii', 'ignore').decode("utf-8") + # after encoding information is lost + if lang == 'uk-uk': + text = unicodedata.normalize('NFD', text) + else: + text = unicodedata.normalize('NFD', text) \ + .encode('ascii', 'ignore').decode("utf-8") text = text.lower() + # if user says yes but later says no, he changed his mind mid-sentence # the highest index is the last yesno word res = None diff --git a/lingua_franca/lang/parse_uk.py b/lingua_franca/lang/parse_uk.py new file mode 100644 index 00000000..03ee6b39 --- /dev/null +++ b/lingua_franca/lang/parse_uk.py @@ -0,0 +1,1806 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from datetime import datetime, timedelta + +from dateutil.relativedelta import relativedelta + +from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ + invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer +from lingua_franca.lang.common_data_uk import _NUM_STRING_UK, \ + _LONG_ORDINAL_UK, _LONG_SCALE_UK, _SHORT_SCALE_UK, _SHORT_ORDINAL_UK, \ + _FRACTION_STRING_UK, _MONTHS_CONVERSION, _MONTHS_UK, _TIME_UNITS_CONVERSION, \ + _ORDINAL_BASE_UK, _PLURALS + +import re +import json +from lingua_franca import resolve_resource_file +from lingua_franca.time import now_local + + +def generate_plurals_uk(originals): + """ + Return a new set or dict containing the plural form of the original values, + Generate different cases of values + + In English this means all with 's' appended to them. + + Args: + originals set(str) or dict(str, any): values to pluralize + + Returns: + set(str) or dict(str, any) + + """ + suffixes = ["а", "ах", "их", "ам", "ами", "ів", + "ям", "ох", "и", "на", "ни", "і", "ні", + "ий", "ний", 'ьох', 'ьома', 'ьом', 'ох', + 'ум', 'ма', 'ом'] + if isinstance(originals, dict): + thousand = {"тисяч": 1000, "тисячі": 1000, "тисячу": 1000, "тисячах": 1000} + hundred = {"сотня": 100, "сотні": 100, "сотень": 100} + result_dict = {key + suffix: value for key, value in originals.items() for suffix in suffixes} + result_dict.update(thousand) + result_dict.update(hundred) + return result_dict + thousand = ["тисяч", "тисячі", "тисячу", "тисячах"] + result_dict = {value + suffix for value in originals for suffix in suffixes} + result_dict.update(thousand) + return {value + suffix for value in originals for suffix in suffixes} + + +# negate next number (-2 = 0 - 2) +_NEGATIVES = {"мінус"} + +# sum the next number (twenty two = 20 + 2) +_SUMS = {"двадцять", "20", "тридцять", "30", "сорок", "40", "п'ятдесят", "50", + "шістдесят", "60", "сімдесят", "70", "вісімдесят", "80", "дев'яносто", "90", + "сто", "100", "двісті", 
"200", "триста", "300", "чотириста", "400", + "п'ятсот", "500", "шістсот", "600", "сімсот", "700", "вісімсот", "800", + "дев'ятсот", "900"} + +_MULTIPLIES_LONG_SCALE_UK = set(_LONG_SCALE_UK.values()) | \ + generate_plurals_uk(_LONG_SCALE_UK.values()) + + +_MULTIPLIES_SHORT_SCALE_UK = set(_SHORT_SCALE_UK.values()) | \ + generate_plurals_uk(_SHORT_SCALE_UK.values()) + +# split sentence parse separately and sum ( 2 and a half = 2 + 0.5 ) +_FRACTION_MARKER = {"і", "та", "з", " "} + +# decimal marker ( 1 point 5 = 1 + 0.5) +_DECIMAL_MARKER = {"ціла", "цілих", "точка", "крапка", "кома"} + +_STRING_NUM_UK = invert_dict(_NUM_STRING_UK) + +_STRING_NUM_UK.update(generate_plurals_uk(_STRING_NUM_UK)) +_STRING_NUM_UK.update(_PLURALS) +_STRING_NUM_UK.update({ + "трильйон": 1e18, + "половина": 0.5, "половиною": 0.5, "половини": 0.5, "половин": 0.5, "половинами": 0.5, "пів": 0.5, + "одна": 1, "одної": 1, "одній": 1, "одну": 1 +}) + +_WORDS_NEXT_UK = [ + "майбутня", "майбутнє", "майбутній", "майбутньому", "майбутнім", "майбутньої", "майбутнього", + "нова", "нове", "новий", "нового", "нової", "новим", "новою", "через", + "наступна", "наступне", "наступний", "наступній", "наступному", "наступним", "наступною", +] +_WORDS_PREV_UK = [ + "попередня", "попередній", "попереднім", "попередньої", + "попередню", "попереднього", "попередне", "тому", + "минула", "минулий", "минуле", "минулу", "минулого", "минулій", "минулому", + "минулої", "минулою", "минулим", + "та", "той", "ті", "те", "того", +] +_WORDS_CURRENT_UK = [ + "теперішній", "теперішня", "теперішні", "теперішній", "теперішньому", + "теперішньою", "теперішнім", "теперішнього", "теперішньої", + "дана", "даний", "дане", "даним", "даною", "даного", "даної", "даному", "даній", + "поточний", "поточна", "поточні", "поточне", "поточного", "поточної", + "поточному", "поточній", "поточним", "поточною", + "нинішній", "нинішня", "нинішнє", "нинішньому", "нинішній", + "нинішнього", "нинішньої", "нинішнім", "нинішньою", + "цей", "ця", "це", 
def _convert_words_to_numbers_uk(text, short_scale=True, ordinals=False):
    """
    Replace the number words in a string with their numeric values.

    The text is lower-cased and tokenized; every token span recognised as a
    number is collapsed into a single item holding the number's string
    form, while all other tokens are copied through unchanged.

    Args:
        text str: the text to convert
        short_scale boolean: True if short scale numbers should be used.
        ordinals boolean: True if ordinals (e.g. first, second, third)
                          should be parsed to their number values (1, 2, 3...)

    Returns:
        str
        The lower-cased text with numbers substituted where found.

    """
    tokens = tokenize(text.lower())
    pending = sorted(
        _extract_numbers_with_text_uk(tokens, short_scale, ordinals),
        key=lambda found: found.start_index)

    cursor = 0  # index of the next number that has not been fully consumed
    pieces = []
    for tok in tokens:
        current = pending[cursor] if cursor < len(pending) else None
        if current is None or tok.index < current.start_index:
            # ordinary word outside any number span
            pieces.append(tok.word)
            continue
        if tok.index == current.start_index:
            # first token of a span: emit the value once
            pieces.append(str(current.value))
        if tok.index == current.end_index:
            # last token of a span: move on to the next number
            cursor += 1
    return ' '.join(pieces)
def _extract_number_with_text_uk(tokens, short_scale=True,
                                 ordinals=False, fractional_numbers=True):
    """
    Extract a single number from a list of Tokens.

    Delegates the parsing to _extract_number_with_text_uk_helper and wraps
    its (value, tokens) result in a ReplaceableNumber.

    Args:
        tokens [Token]: the tokens to parse
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        fractional_numbers (bool): True if we should look for fractions and
            decimals.
    Returns:
        ReplaceableNumber

    """
    return ReplaceableNumber(
        *_extract_number_with_text_uk_helper(tokens, short_scale,
                                             ordinals, fractional_numbers))
def _extract_fraction_with_text_uk(tokens, short_scale, ordinals):
    """
    Extract a mixed number ("whole <marker> fraction") from the tokens.

    For each word in _FRACTION_MARKER the token list is split around that
    word. When the split yields three parts, the last number before the
    marker and the first number after it are combined, provided the first
    is at least 1 and the second lies strictly between 0 and 1. A bare
    fraction such as "one half" is not handled here; it is left to the
    whole-number parser.

    Args:
        tokens [Token]: words and their indexes in the original string.
        short_scale boolean:
        ordinals boolean:

    Returns:
        (int or float, [Token])
        The value found, and the list of relevant tokens.
        (None, None) if no fraction value is found.

    """
    for marker in _FRACTION_MARKER:
        parts = partition_list(tokens, lambda tok: tok.word == marker)
        if len(parts) != 3:
            continue

        head, separator, tail = parts
        whole_candidates = _extract_numbers_with_text_uk(
            head, short_scale, ordinals, fractional_numbers=False)
        fraction_candidates = _extract_numbers_with_text_uk(
            tail, short_scale, ordinals, fractional_numbers=True)

        # a marker with no number on either side ends the search
        if not (whole_candidates and fraction_candidates):
            return None, None

        whole = whole_candidates[-1]
        fraction = fraction_candidates[0]
        # the left side must be a whole part, the right side a proper fraction
        if whole.value >= 1 and 0 < fraction.value < 1:
            return (whole.value + fraction.value,
                    whole.tokens + separator + fraction.tokens)

    return None, None
def _extract_whole_number_with_text_uk(tokens, short_scale, ordinals):
    """
    Handle numbers not handled by the decimal or fraction functions. This is
    generally whole numbers. Note that phrases such as "one half" will be
    handled by this function, while "one and a half" are handled by the
    fraction function.

    The tokens are scanned left to right. Consecutive number words are
    collected in `number_words` while `val` accumulates their value:
    a word following one of _SUMS is added, a scale word (hundred, thousand,
    million, ...) multiplies, and completed groups are parked in `to_sum`
    until the end.

    If any token is a long-scale multiplier the long scale is used,
    whatever `short_scale` was passed.

    Args:
        tokens [Token]:
        short_scale boolean:
        ordinals boolean:

    Returns:
        int or float, [Tokens]
        The value parsed, and tokens that it corresponds to.
        The value is False when no number was found.

    """
    number_token = [token for token in tokens
                    if token.word.lower() in _MULTIPLIES_LONG_SCALE_UK]
    if number_token:
        short_scale = False
    multiplies, string_num_ordinal, string_num_scale = \
        _initialize_number_data(short_scale)

    # Inflected forms of "thousand" must act as multipliers too. Work on a
    # local copy: the set returned above is the module-level constant, and
    # updating it inside the loop (as before) made results depend on which
    # calls had already run.
    thousand_forms = ("тисяча", "тисячі", "тисячу", "тисячах",
                      "тисячами", "тисячею", "тисяч")
    multiplies = set(multiplies).union(thousand_forms)
    # Every multiplier needs a magnitude, otherwise `val` below would be
    # stale and the look-ahead lookup would raise KeyError.
    for form in thousand_forms:
        string_num_scale.setdefault(form, 1000)

    number_words = []  # type: [Token]
    val = False
    prev_val = None
    next_val = None
    to_sum = []
    for idx, token in enumerate(tokens):
        current_val = None
        if next_val:
            # this token was already consumed as the fraction of the
            # previous one ("2 fifths")
            next_val = None
            continue

        word = token.word
        if word in _NEGATIVES:
            number_words.append(token)
            continue

        prev_word = tokens[idx - 1].word if idx > 0 else ""
        prev_word = _text_uk_inflection_normalize(prev_word, 1)
        next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else ""
        next_word = _text_uk_inflection_normalize(next_word, 1)

        # Explicit ordinals are written with a trailing point (1., 2., ...)
        # rather than a suffix (1st, 2nd, ...)
        if is_numeric(word[:-1]) and word.endswith("."):
            word = word[:-1]

        # Normalize Ukrainian inflection of numbers (один, одна, одно,...)
        if not ordinals:
            if word not in _STRING_NUM_UK:
                word = _text_uk_inflection_normalize(word, 1)

        if word not in string_num_scale and \
                word not in _STRING_NUM_UK and \
                word not in _SUMS and \
                word not in multiplies and \
                not (ordinals and word in string_num_ordinal) and \
                not is_numeric(word) and \
                not is_fractional_uk(word, word, short_scale=short_scale) and \
                not look_for_fractions(word.split('/')):
            # not a number word: stop if a number was already collected,
            # otherwise drop any dangling negatives and keep looking
            words_only = [token.word for token in number_words]
            if number_words and not all([w in _NEGATIVES for w in words_only]):
                break
            else:
                number_words = []
                continue
        elif word not in multiplies \
                and prev_word not in multiplies \
                and prev_word not in _SUMS \
                and not (ordinals and prev_word in string_num_ordinal) \
                and prev_word not in _NEGATIVES:
            # nothing links this word to the previous one: start a new number
            number_words = [token]
        elif prev_word in _SUMS and word in _SUMS:
            number_words = [token]
        else:
            number_words.append(token)

        # is this word already a number ?
        if is_numeric(word):
            if word.isdigit():  # doesn't work with decimals
                val = int(word)
            else:
                val = float(word)
            current_val = val

        # is this word the name of a number ?
        if word in _STRING_NUM_UK:
            val = _STRING_NUM_UK.get(word)
            current_val = val
        elif word in string_num_scale:
            val = string_num_scale.get(word)
            current_val = val
        elif ordinals and word in string_num_ordinal:
            val = string_num_ordinal[word]
            current_val = val

        # is the prev word an ordinal number and current word is one?
        # second one, third one
        if ordinals and prev_word in string_num_ordinal and val == 1:
            val = prev_val

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if (prev_word in _SUMS and val and val < 10) \
                or (prev_word in _SUMS and val and val < 100 and prev_val >= 100) \
                or all([prev_word in multiplies, val < prev_val if prev_val else False]):
            val = prev_val + val

        # is the prev word a number and should we multiply it?
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # пара сотень, три пари пива
        if prev_word in ['пара', 'пари', 'парою', 'парами'] and current_val != 1000.0:
            val = val * 2
        if prev_val in _STRING_NUM_UK.values() and current_val == 100:
            val = prev_val * current_val

        # half cup
        if val is False:
            val = is_fractional_uk(word, word, short_scale=short_scale)
            current_val = val

        # 2 fifths
        if not ordinals:
            next_val = is_fractional_uk(next_word, word, short_scale=short_scale)
            if next_val:
                if not val:
                    val = 1
                val = val * next_val
                number_words.append(tokens[idx + 1])
        if word in ['пара', 'пари', 'парою', 'парами']:
            if prev_val:
                val = val * prev_val
            else:
                val = 2

        # is this a negative number?
        if val and prev_word and prev_word in _NEGATIVES:
            val = 0 - val

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like "2/3"
            a_pieces = word.split('/')
            if look_for_fractions(a_pieces):
                val = float(a_pieces[0]) / float(a_pieces[1])
        else:
            # checking if word is digit in order not to substitute
            # existing calculated value
            new_word = re.sub(r'\.', '', word)
            # `current_val` is None when the word contributed no value of
            # its own; guard it so the comparison cannot raise TypeError
            if prev_word in _SUMS \
                    and word not in _SUMS \
                    and not new_word.isdigit() \
                    and word not in multiplies \
                    and current_val is not None \
                    and current_val >= 10:
                # Backtrack - we've got numbers we can't sum
                number_words.pop()
                val = prev_val
                break

        prev_val = val
        if word in multiplies and next_word not in multiplies:
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            #
            # The current word is a scale word; `current_val` is its own
            # magnitude and `val` is the working value (for "two million",
            # `current_val` is 1000000 and `val` is 2000000).
            #
            # Look ahead over the remaining tokens. If every later scale
            # word is smaller than the current one, the current group is
            # complete: park `val` in `to_sum` and start assembling the
            # next group from zero. If an equal or larger scale word
            # follows, keep accumulating instead.
            #
            # Example: "nine million nine hundred seven thousand six
            # hundred fifty seven" yields to_sum == [9000000, 907000, 600]
            # and a trailing val of 57, i.e. 9907657 in total.
            time_to_sum = True
            for other_token in tokens[idx + 1:]:
                if other_token.word in multiplies:
                    if string_num_scale[other_token.word] >= current_val:
                        time_to_sum = False
                    else:
                        continue
                if not time_to_sum:
                    break
            if time_to_sum:
                to_sum.append(val)
                val = 0
                prev_val = 0

    if val is not None and to_sum:
        val += sum(to_sum)
    return val, number_words
def extract_duration_uk(text):
    """
    Convert a Ukrainian phrase into a duration.

    Number words are first converted to digits, then every
    "<number> <unit>" pair whose unit is a key of _TIME_UNITS_CONVERSION is
    consumed and added to the matching timedelta field.

    The words used in the duration will be consumed, and
    the remainder returned. Stand-alone digit tokens left in the remainder
    are dropped as well.

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str) or None:
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
                    None (not a tuple) is returned for empty input.
    """
    if not text:
        return None

    # Ukrainian inflection for time: хвилина, хвилини, хвилин - safe to use
    # the stem as pattern. For day: день, дня, днів - a short pattern is not
    # applicable, so the endings are listed.

    time_units = {
        'microseconds': 0,
        'milliseconds': 0,
        'seconds': 0,
        'minutes': 0,
        'hours': 0,
        'days': 0,
        'weeks': 0
    }

    # The capturing group needs a name after `(?P`; without it the pattern
    # does not compile and re.sub raises re.error.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}" \
              r"(?:ів|я|и|ин|і|унд|ни|ну|ку|дні|у|днів)?"
    text = _convert_words_to_numbers_uk(text)

    for (unit_uk, unit_en) in _TIME_UNITS_CONVERSION.items():
        unit_pattern = pattern.format(unit=unit_uk)

        def repl(match):
            # accumulate the amount and remove the matched words
            time_units[unit_en] += float(match.group("value"))
            return ''
        text = re.sub(unit_pattern, repl, text)

    # drop leftover stand-alone numbers from the remaining text
    new_text = [token for token in text.split(' ') if not token.isdigit()]
    text = " ".join(new_text).strip()
    duration = timedelta(**time_units) if any(time_units.values()) else None

    return duration, text
+ + Args: + text (str): string containing date words + anchor_date (datetime): A reference date/time for "tommorrow", etc + default_time (time): Time to set if no time was found in the string + + Returns: + [datetime, str]: An array containing the datetime and the remaining + text not consumed in the parsing, or None if no + date or time related text was found. + """ + + def clean_string(s): + # clean unneeded punctuation and capitalization among other things. + # Normalize Ukrainian inflection + s = s.lower().replace('?', '').replace('.', '').replace(',', '') + s = s.replace("сьогодні вечером|сьогодні ввечері|вечором", "ввечері") + s = s.replace("сьогодні вночі", "вночі") + word_list = s.split() + + for idx, word in enumerate(word_list): + ########## + # Ukrainian Day Ordinals - we do not use 1st,2nd format + # instead we use full ordinal number names with specific format(suffix) + # Example: двадцять третього - 23 + count_ordinals = 0 + if word == "третього": + count_ordinals = 3 + # Example: тридцять першого - 31 + elif word.endswith("ого"): + tmp = word[:-3] + tmp += "ий" + for nr, name in _ORDINAL_BASE_UK.items(): + if name == tmp: + count_ordinals = nr + # Example: тридцять перше > 31 + elif word.endswith("є") or word.endswith("е"): + tmp = word[:-1] + tmp += "ий" + for nr, name in _ORDINAL_BASE_UK.items(): + if name == tmp: + count_ordinals = nr + # If number is bigger than 19 check if next word is also ordinal + # and count them together + if count_ordinals > 19: + if word_list[idx + 1] == "третього": + count_ordinals += 3 + elif word_list[idx + 1].endswith("ого"): + tmp = word_list[idx + 1][:-3] + tmp += "ий" + for nr, name in _ORDINAL_BASE_UK.items(): + if name == tmp and nr < 10: + # write only if sum makes acceptable count of days in month + if (count_ordinals + nr) <= 31: + count_ordinals += nr + + if count_ordinals > 0: + word = str(count_ordinals) # Write normalized value into word + if count_ordinals > 20: + # If counted number is greater than 20, 
clear next word so it is not used again + word_list[idx + 1] = "" + ########## + # Remove inflection from Ukrainian months + word_list[idx] = word + return word_list + + def date_found(): + return found or \ + ( + date_string != "" or + year_offset != 0 or month_offset != 0 or + day_offset is True or hr_offset != 0 or + hr_abs or min_offset != 0 or + min_abs or sec_offset != 0 + ) + + if text == "": + return None + + anchor_date = anchor_date or now_local() + found = False + day_specified = False + day_offset = False + month_offset = 0 + year_offset = 0 + today = anchor_date.strftime("%w") + current_year = anchor_date.strftime("%Y") + from_flag = False + date_string = "" + has_year = False + time_qualifier = "" + + time_qualifiers_am = _WORDS_MORNING_UK + time_qualifiers_pm = ['дня', 'вечора'] + time_qualifiers_pm.extend(_WORDS_DAY_UK) + time_qualifiers_pm.extend(_WORDS_EVENING_UK) + time_qualifiers_pm.extend(_WORDS_NIGHT_UK) + time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm) + markers = ['на', 'у', 'в', 'о', 'до', 'це', + 'біля', 'цей', 'через', 'після', 'за', 'той'] + days = ["понеділок", "вівторок", "середа", + "четвер", "п'ятниця", "субота", "неділя"] + months = _MONTHS_UK + recur_markers = days + ['вихідні', 'вікенд'] + months_short = ["січ", "лют", "бер", "квіт", "трав", "червень", "лип", "серп", + "верес", "жовт", "листоп", "груд"] + year_multiples = ["десятиліття", "століття", "тисячоліття", "тисячоліть", "століть", + "сторіччя", "сторіч"] + + words = clean_string(text) + preposition = "" + + for idx, word in enumerate(words): + if word == "": + continue + + if word in markers: + preposition = word + + word = _text_uk_inflection_normalize(word, 2) + word_prev_prev = _text_uk_inflection_normalize( + words[idx - 2], 2) if idx > 1 else "" + word_prev = _text_uk_inflection_normalize( + words[idx - 1], 2) if idx > 0 else "" + word_next = _text_uk_inflection_normalize( + words[idx + 1], 2) if idx + 1 < len(words) else "" + word_next_next = 
_text_uk_inflection_normalize( + words[idx + 2], 2) if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + start = idx + used = 0 + if word in _WORDS_NOW_UK and not date_string: + result_str = " ".join(words[idx + 1:]) + result_str = ' '.join(result_str.split()) + extracted_date = anchor_date.replace(microsecond=0) + return [extracted_date, result_str] + elif word_next in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = extract_number_uk(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if word_next == "десятиліття" or word_next == "декада": + year_offset = multiplier * 10 + elif word_next == "століття" or word_next == "сторіччя": + year_offset = multiplier * 100 + elif word_next in ["тисячоліття", "тисячоліть"]: + year_offset = multiplier * 1000 + elif word_next in ["тисяча", "тисячі", "тисяч"]: + year_offset = multiplier * 1000 + elif word in time_qualifiers_list and preposition != "через" and word_next != "тому": + time_qualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "сьогодні" and not from_flag: + day_offset = 0 + used += 1 + elif word == "завтра" and not from_flag: + day_offset = 1 + used += 1 + elif word == "післязавтра" and not from_flag: + day_offset = 2 + used += 1 + elif word == "після" and word_next == "завтра" and not from_flag: + day_offset = 2 + used += 2 + elif word == "позавчора" and not from_flag: + day_offset = -2 + used += 1 + elif word == "вчора" and not from_flag: + day_offset = -1 + used += 1 + elif (word in ["день", "дня", "дні", "днів"] and + word_next == "після" and + word_next_next == "завтра" and + not from_flag and + (not word_prev or not word_prev[0].isdigit())): + day_offset = 2 + used = 2 + elif word in ["день", "дня", "дні", "днів"] and is_numeric(word_prev) and preposition == "через": + if word_prev and word_prev[0].isdigit(): + day_offset += int(word_prev) + start -= 1 + used = 2 + elif word 
in ["день", "дня", "дні", "днів"] and is_numeric(word_prev) and word_next == "тому": + if word_prev and word_prev[0].isdigit(): + day_offset += -int(word_prev) + start -= 1 + used = 3 + elif word in ["день", "дня", "дні", "днів"] and is_numeric(word_prev) and word_prev_prev == "на": + if word_prev and word_prev[0].isdigit(): + day_offset += int(word_prev) + start -= 1 + used = 2 + elif word == "сьогодні" and not from_flag and word_prev: + if word_prev[0].isdigit(): + day_offset += int(word_prev) * 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_UK: + day_offset = 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_UK: + day_offset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "тиждень" and not from_flag and preposition in ["через", "на"]: + if word_prev[0].isdigit(): + day_offset = int(word_prev) * 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_UK: + day_offset = 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_UK: + day_offset = -7 + start -= 1 + used = 2 + elif word == "місяць" and not from_flag and preposition in ["через", "на"]: + if word_prev[0].isdigit(): + month_offset = int(word_prev) + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_UK: + month_offset = 1 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_UK: + month_offset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "рік" and not from_flag and preposition in ["через", "на"]: + if word_prev[0].isdigit(): + if word_prev_prev[0].isdigit(): + year_offset = int(word_prev)*int(word_prev_prev) + else: + year_offset = int(word_prev) + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_UK: + year_offset = 1 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_UK: + year_offset = -1 + start -= 1 + used = 2 + elif word_prev == "через": + year_offset = 1 + used = 1 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not from_flag: + d = days.index(word) + day_offset = (d + 1) - int(today) + used = 1 + if day_offset < 0: + day_offset += 7 + if word_prev in _WORDS_NEXT_UK: + if day_offset <= 2: + day_offset += 7 + used += 1 + start -= 1 + elif word_prev in _WORDS_PREV_UK: + day_offset -= 7 + used += 1 + start -= 1 + elif word in months or word in months_short and not from_flag: + try: + m = months.index(word) + except ValueError: + m = months_short.index(word) + used += 1 + # Convert Ukrainian months to english + date_string = _MONTHS_CONVERSION.get(m) + if word_prev and (word_prev[0].isdigit() or + (word_prev == " " and word_prev_prev[0].isdigit())): + if word_prev == " " and word_prev_prev[0].isdigit(): + date_string += " " + words[idx - 2] + used += 1 + start -= 1 + else: + date_string += " " + word_prev + start -= 1 + used += 1 + if word_next and word_next[0].isdigit(): + date_string += " " + word_next + used += 1 + has_year = True + else: + has_year = False + + elif word_next and word_next[0].isdigit(): + date_string += " " + word_next + used += 1 + if word_next_next and word_next_next[0].isdigit(): + date_string += " " + word_next_next + used += 1 + has_year = True + else: + has_year = False + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + valid_followups = days + months + months_short + valid_followups.append("сьогодні") + valid_followups.append("завтра") + valid_followups.append("післязавтра") + valid_followups.append("вчора") + valid_followups.append("позавчора") + for followup in _WORDS_NEXT_UK: + valid_followups.append(followup) + for followup in _WORDS_PREV_UK: + valid_followups.append(followup) + for followup in _WORDS_CURRENT_UK: + valid_followups.append(followup) + for followup in _WORDS_NOW_UK: + valid_followups.append(followup) + if (word in ["до", "по", "з"]) and word_next in valid_followups: + used = 2 + from_flag = True + if word_next == "завтра": + day_offset += 1 + elif word_next == 
"післязавтра": + day_offset += 2 + elif word_next == "вчора": + day_offset -= 1 + elif word_next == "позавчора": + day_offset -= 2 + elif word_next in days: + d = days.index(word_next) + tmp_offset = (d + 1) - int(today) + used = 2 + if tmp_offset < 0: + tmp_offset += 7 + day_offset += tmp_offset + elif word_next_next and word_next_next in days: + d = days.index(word_next_next) + tmp_offset = (d + 1) - int(today) + used = 3 + if word_next in _WORDS_NEXT_UK: + if day_offset <= 2: + tmp_offset += 7 + used += 1 + start -= 1 + elif word_next in _WORDS_PREV_UK: + tmp_offset -= 7 + used += 1 + start -= 1 + day_offset += tmp_offset + if used > 0: + if start - 1 > 0 and (words[start - 1] in _WORDS_CURRENT_UK): + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + day_specified = True + + # parse time + hr_offset = 0 + min_offset = 0 + sec_offset = 0 + hr_abs = None + min_abs = None + military = False + preposition = "" + + for idx, word in enumerate(words): + if word == "": + continue + + if word in markers: + preposition = word + word = _text_uk_inflection_normalize(word, 1) + word_prev_prev = _text_uk_inflection_normalize( + words[idx - 2], 2) if idx > 1 else "" + word_prev = _text_uk_inflection_normalize( + words[idx - 1], 2) if idx > 0 else "" + word_next = _text_uk_inflection_normalize( + words[idx + 1], 2) if idx + 1 < len(words) else "" + word_next_next = _text_uk_inflection_normalize( + words[idx + 2], 2) if idx + 2 < len(words) else "" + + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "опівдні": + hr_abs = 12 + used += 1 + elif word == "північ": + hr_abs = 0 + used += 1 + elif word in _STRING_NUM_UK: + val = _STRING_NUM_UK.get(word) + elif word in _WORDS_MORNING_UK: + if hr_abs is None: + hr_abs = 8 + used += 1 + elif word in _WORDS_DAY_UK: + if hr_abs is None: + hr_abs = 15 + used += 1 + elif word in 
_WORDS_EVENING_UK: + if hr_abs is None: + hr_abs = 19 + used += 1 + if word_next != "" and word_next[0].isdigit() and ":" in word_next: + used -= 1 + elif word in _WORDS_NIGHT_UK: + if hr_abs is None: + hr_abs = 22 + # parse half an hour, quarter hour + # should be added different variations oh "hour forms" + elif word in ["година", "годину", "години"] and \ + (word_prev in markers or word_prev_prev in markers): + if word_prev in ["пів", "половина", "опів на", "опів"]: + min_offset = 30 + elif word_prev == "чверть": + min_offset = 15 + #parse in an hour + elif word_prev == "через": + hr_offset = 1 + else: + hr_offset = 1 + if word_prev_prev in markers: + words[idx - 2] = "" + if word_prev_prev in _WORDS_CURRENT_UK: + day_specified = True + words[idx - 1] = "" + used += 1 + hr_abs = -1 + min_abs = -1 + # parse 5:00 am, 12:00 p.m., etc + # parse in a minute + elif word == "хвилину" and word_prev == "через": + min_offset = 1 + words[idx - 1] = "" + used += 1 + # parse in a second + elif word == "секунду" and word_prev == "через": + sec_offset = 1 + words[idx - 1] = "" + used += 1 + elif word[0].isdigit(): + is_time = True + str_hh = "" + str_mm = "" + remainder = "" + word_next_next_next = words[idx + 3] \ + if idx + 3 < len(words) else "" + if word_next in _WORDS_EVENING_UK or word_next in _WORDS_NIGHT_UK or word_next_next in _WORDS_EVENING_UK \ + or word_next_next in _WORDS_NIGHT_UK or word_prev in _WORDS_EVENING_UK \ + or word_prev in _WORDS_NIGHT_UK or word_prev_prev in _WORDS_EVENING_UK \ + or word_prev_prev in _WORDS_NIGHT_UK or word_next_next_next in _WORDS_EVENING_UK \ + or word_next_next_next in _WORDS_NIGHT_UK: + remainder = "pm" + used += 1 + if word_prev in _WORDS_EVENING_UK or word_prev in _WORDS_NIGHT_UK: + words[idx - 1] = "" + if word_prev_prev in _WORDS_EVENING_UK or word_prev_prev in _WORDS_NIGHT_UK: + words[idx - 2] = "" + if word_next_next in _WORDS_EVENING_UK or word_next_next in _WORDS_NIGHT_UK: + used += 1 + if word_next_next_next in 
_WORDS_EVENING_UK or word_next_next_next in _WORDS_NIGHT_UK: + used += 1 + + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + str_hh += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + str_mm += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + hour = ["година", "годині"] + next_word = word_next.replace(".", "") + if next_word in ["am", "pm", "ночі", "ранку", "дня", "вечора"]: + remainder = next_word + used += 1 + # question with the case "година" + elif next_word in hour and word_next_next in ["am", "pm", "ночи", "утра", "дня", "вечера"]: + remainder = word_next_next + used += 2 + elif word_next in _WORDS_MORNING_UK: + remainder = "am" + used += 2 + elif word_next in _WORDS_DAY_UK: + remainder = "pm" + used += 2 + elif word_next in _WORDS_EVENING_UK: + remainder = "pm" + used += 2 + elif word_next == "цього" and word_next_next in _WORDS_MORNING_UK: + remainder = "am" + used = 2 + day_specified = True + elif word_next == "на" and word_next_next in _WORDS_DAY_UK: + remainder = "pm" + used = 2 + day_specified = True + elif word_next == "на" and word_next_next in _WORDS_EVENING_UK: + remainder = "pm" + used = 2 + day_specified = True + elif word_next == "в" and word_next_next in _WORDS_NIGHT_UK: + if str_hh and int(str_hh) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + elif word_next == "о" and word_next_next in _WORDS_NIGHT_UK: + if str_hh and int(str_hh) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + elif hr_abs and hr_abs != -1: + if hr_abs >= 12: + remainder = "pm" + else: + remainder = "am" + used += 1 + else: + if time_qualifier != "": + military = True + if str_hh and int(str_hh) <= 12 and \ + (time_qualifier in time_qualifiers_pm): + str_hh += str(int(str_hh) + 12) + + 
else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. + length = len(word) + str_num = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + str_num += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = word_next.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + word_next == "pm" or + remainder == "p.m." or + word_next == "p.m." or + (remainder == "дня" and preposition != 'через') or + (word_next == "дня" and preposition != 'через') or + remainder == "вечора" or + word_next == "вечора"): + str_hh = str_num + remainder = "pm" + used = 1 + if ( + remainder == "pm" or + word_next == "pm" or + remainder == "p.m." or + word_next == "p.m." or + (remainder == "дня" and preposition != 'через') or + (word_next == "дня" and preposition != 'через') or + remainder == "вечора" or + word_next == "вечора"): + str_hh = str_num + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + word_next == "am" or + remainder == "a.m." or + word_next == "a.m." 
or + remainder == "ночі" or + word_next == "ночі" or + remainder == "ранку" or + word_next == "ранку"): + str_hh = str_num + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + word_next in recur_markers or + word_next_next in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set str_hh so that is_time == True + # when am or pm is not specified + str_hh = str_num + used = 1 + else: + if int(str_num) > 100: + str_hh = str(int(str_num) // 100) + str_mm = str(int(str_num) % 100) + military = True + if word_next == "година": + used += 1 + elif ( + (word_next == "година" or word_next == "годину" or + remainder == "година") and + word[0] != '0' and + # (wordPrev != "в" and wordPrev != "на") + word_prev == "через" + and + ( + int(str_num) < 100 or + int(str_num) > 2400 + )): + # ignores military time + # "in 3 hours" + hr_offset = int(str_num) + used = 2 + is_time = False + hr_abs = -1 + min_abs = -1 + elif word_next == "хвилина" or \ + remainder == "хвилина": + # "in 10 minutes" + min_offset = int(str_num) + used = 2 + is_time = False + hr_abs = -1 + min_abs = -1 + elif word_next == "секунда" \ + or remainder == "секунда": + # in 5 seconds + sec_offset = int(str_num) + used = 2 + is_time = False + hr_abs = -1 + min_abs = -1 + elif int(str_num) > 100: + # military time, eg. "3300 hours" + str_hh = str(int(str_num) // 100) + str_mm = str(int(str_num) % 100) + military = True + if word_next == "час" or \ + remainder == "час": + used += 1 + elif word_next and word_next[0].isdigit(): + # military time, e.g. 
"04 38 hours" + str_hh = str_num + str_mm = word_next + military = True + used += 1 + if (word_next_next == "година" or + remainder == "час"): + used += 1 + elif ( + word_next == "" or word_next == "година" or + ( + (word_next == "в" or word_next == "на") and + ( + word_next_next == time_qualifier + ) + ) or word_next in _WORDS_EVENING_UK or + word_next_next in _WORDS_EVENING_UK): + + str_hh = str_num + str_mm = "00" + if word_next == "година": + used += 1 + if (word_next == "о" or word_next == "на" + or word_next_next == "о" or word_next_next == "на"): + used += (1 if (word_next == + "о" or word_next == "на") else 2) + word_next_next_next = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (word_next_next and + (word_next_next in time_qualifier or + word_next_next_next in time_qualifier)): + if (word_next_next in time_qualifiers_pm or + word_next_next_next in time_qualifiers_pm): + remainder = "pm" + used += 1 + if (word_next_next in time_qualifiers_am or + word_next_next_next in time_qualifiers_am): + remainder = "am" + used += 1 + + if time_qualifier != "": + if time_qualifier in time_qualifiers_pm: + remainder = "pm" + used += 1 + + elif time_qualifier in time_qualifiers_am: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + elif remainder == "година": + if word_next_next in ["ночі", "ранку"]: + remainder = "am" + used += 1 + elif word_next_next in ["дня", "вечора"]: + remainder = "pm" + used += 1 + else: + remainder = "" + + else: + is_time = False + hh = int(str_hh) if str_hh else 0 + mm = int(str_mm) if str_mm else 0 + hh = hh + 12 if remainder == "pm" and hh < 12 else hh + hh = hh - 12 if remainder == "am" and hh >= 12 else hh + if (not military and + remainder not in ['am', 'pm', 'година', 'хвилина', 'секунда'] and + ((not day_specified) or 0 <= day_offset < 1)): + + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + 
if anchor_date.hour < hh or (anchor_date.hour == hh and + anchor_date.minute < mm): + pass # No modification needed + elif anchor_date.hour < hh + 12: + hh += 12 + else: + # has passed, assume the next morning + day_offset += 1 + if time_qualifier in time_qualifiers_pm and hh < 12: + hh += 12 + + if hh > 24 or mm > 59: + is_time = False + used = 0 + if is_time: + hr_abs = hh + min_abs = mm + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + # if wordPrev == "o" or wordPrev == "oh": + # words[words.index(wordPrev)] = "" + + if word_prev == "скоро": + hr_offset = -1 + words[idx - 1] = "" + idx -= 1 + elif word_prev == "пізніше": + hr_offset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and word_prev in markers: + words[idx - 1] = "" + if word_prev in _WORDS_CURRENT_UK: + day_specified = True + if idx > 1 and word_prev_prev in markers: + words[idx - 2] = "" + if word_prev_prev in _WORDS_CURRENT_UK: + day_specified = True + + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None + + if day_offset is False: + day_offset = 0 + + # perform date manipulation + + extracted_date = anchor_date.replace(microsecond=0) + if date_string != "": + # date included an explicit date, e.g. 
"june 5" or "june 2, 2017" + try: + temp = datetime.strptime(date_string, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(date_string, "%B %d %Y") + extracted_date = extracted_date.replace(hour=0, minute=0, second=0) + if not has_year: + temp = temp.replace(year=extracted_date.year, + tzinfo=extracted_date.tzinfo) + if extracted_date < temp: + extracted_date = extracted_date.replace( + year=int(current_year), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extracted_date.tzinfo) + else: + extracted_date = extracted_date.replace( + year=int(current_year) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extracted_date.tzinfo) + else: + extracted_date = extracted_date.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extracted_date.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hr_offset == 0 and min_offset == 0 and sec_offset == 0: + extracted_date = extracted_date.replace(hour=0, minute=0, second=0) + + if year_offset != 0: + extracted_date = extracted_date + relativedelta(years=year_offset) + if month_offset != 0: + extracted_date = extracted_date + relativedelta(months=month_offset) + if day_offset != 0: + extracted_date = extracted_date + relativedelta(days=day_offset) + if hr_abs != -1 and min_abs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hr_abs is None and min_abs is None and default_time is not None: + hr_abs, min_abs = default_time.hour, default_time.minute + else: + hr_abs = hr_abs or 0 + min_abs = min_abs or 0 + + extracted_date = extracted_date + relativedelta(hours=hr_abs, + minutes=min_abs) + if (hr_abs != 0 or min_abs != 0) and date_string == "": + if not day_specified and anchor_date > extracted_date: + extracted_date = extracted_date + relativedelta(days=1) + if hr_offset != 0: + 
# NOTE: upstream flagged this function with "change logic here"; the lookup
# rules below are reproduced unchanged.
def is_fractional_uk(input_str, word, short_scale=True):
    """
    Look up a Ukrainian fraction word and return the number stored for it.

    The lookup table is rebuilt on every call. For each entry of
    ``_FRACTION_STRING_UK`` whose key is greater than 1, the base form and
    the same form with its last character swapped for a set of case/plural
    endings all map to that key. A fixed set of irregular words is layered
    on top. Most entries hold a denominator (2, 3, 4, ...), while a few
    hold the fraction itself (1 / 3, 0.25).

    Args:
        input_str (str): candidate fraction word; matched case-insensitively
        word (str): companion token supplied by the caller; it decides
            whether the stored number or its reciprocal is returned
        short_scale (bool): accepted for interface compatibility; not used

    Returns:
        (bool) or (int/float): False when ``input_str`` is not in the table.
            Otherwise the stored number when ``word`` equals ``input_str``
            or ``word`` is not in ``_STRING_NUM_UK``; in the remaining case
            ``1.0 / stored number``.
    """
    # Endings used to derive other cases / plurals from the base form.
    case_suffixes = ('ої', 'е', 'их', 'ою', 'і', 'ими', 'ій')

    table = {"ціла": 1}
    for denominator, base_form in _FRACTION_STRING_UK.items():
        if denominator > 1:
            table[str(base_form)] = denominator
            stem = base_form[:-1]
            for suffix in case_suffixes:
                table[stem + suffix] = denominator

    # Irregular forms; these override anything generated above.
    table.update({
        "половина": 2, "половиною": 2, "половини": 2, "половин": 2,
        "половинами": 2, "пів": 2,
        "шоста": 6,
        "третина": 1 / 3, "треть": 1 / 3, "треті": 3, "третьої": 3,
        "чверті": 4, "чверть": 0.25, "чвертю": 0.25
    })

    lookup_key = input_str.lower()
    if lookup_key not in table:
        return False

    stored = table[lookup_key]
    if word == input_str or word not in _STRING_NUM_UK:
        return stored
    return 1.0 / stored
def extract_numbers_uk(text, short_scale=True, ordinals=False):
    """
    Extract every number found in a string.

    The text is tokenized and handed to ``_extract_numbers_with_text_uk``;
    the ``value`` of each returned item is converted to ``float``.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): use "short scale" (True) or "long scale" (False)
            for large numbers -- over a million.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: the extracted numbers as floats
    """
    tokens = tokenize(text)
    extracted = _extract_numbers_with_text_uk(tokens, short_scale, ordinals)
    numbers = []
    for item in extracted:
        numbers.append(float(item.value))
    return numbers


class UkrainianNormalizer(Normalizer):
    # The default configuration is read from the bundled uk-uk resource
    # file once, while the class body executes (i.e. at import time).
    with open(resolve_resource_file("text/uk-uk/normalize.json"), encoding='utf8') as f:
        _default_config = json.load(f)


def normalize_uk(text, remove_articles=True):
    """ Ukrainian string normalization, delegated to UkrainianNormalizer """
    normalizer = UkrainianNormalizer()
    return normalizer.normalize(text, remove_articles)
def _text_uk_inflection_normalize(word, arg):
    """
    Ukrainian inflection normalizer.

    Maps a known inflected form to the base form the parsers compare
    against. The function is called from several places; ``arg`` selects
    which vocabulary applies.

    Args:
        word (str): token to normalize
        arg (int): 1 when called for cardinal numerals
            (_extract_whole_number_with_text_uk), 2 when called for
            date/time vocabulary (extract_datetime_uk)

    Returns:
        str: the base form, or ``word`` unchanged when no rule matches
            (including any other value of ``arg``)
    """
    if arg == 1:  # _extract_whole_number_with_text_uk
        if word in ("одна", "одним", "одно", "одною", "одного",
                    "одної", "одному", "одній", "одну"):
            return "один"
        return _plurals_normalizer(word)

    if arg == 2:  # extract_datetime_uk
        # (inflected forms, base form); checked in order, first match wins.
        rules = (
            (("година", "години", "годин", "годину", "годинами"), "година"),
            (("хвилина", "хвилини", "хвилину", "хвилин", "хвилька"),
             "хвилина"),
            (("секунд", "секунди", "секундами", "секунду", "сек"),
             "секунда"),
            (("днів", "дні", "днями", "дню", "днем"), "день"),
            (("тижні", "тижнів", "тижнями", "тиждень", "тижня"), "тиждень"),
            (("місяцем", "місяці", "місяця", "місяцях", "місяцями",
              "місяців"), "місяць"),
            (("року", "роки", "році", "роках", "роком", "роками", "років"),
             "рік"),
            (_WORDS_MORNING_UK, "вранці"),
            (("опівдні", "півдня"), "південь"),
            (_WORDS_EVENING_UK, "ввечері"),
            (_WORDS_NIGHT_UK, "ніч"),
            (("вікенд", "вихідних", "вихідними"), "вихідні"),
            (("столітті", "століттях", "століть"), "століття"),
            (("десятиліття", "десятиліть", "десятиліттях"), "десятиліття"),
            # Week days
            (("понеділка", "понеділки"), "понеділок"),
            (("вівторка", "вівторки"), "вівторок"),
            # Fixed: this used to return the Russian spelling "среда",
            # which differs from the nominative "середа" that passes
            # through this function unchanged.
            (("середу", "середи"), "середа"),
            (("четверга",), "четвер"),
            (("п'ятницю", "п'ятниці"), "п'ятниця"),
            (("суботу", "суботи"), "субота"),
            (("неділю", "неділі"), "неділя"),
            # Months with irregular stems
            (("лютому", "лютого", "лютим"), "лютий"),
            (("листопада", "листопаді", "листопадом"), "листопад"),
        )
        for forms, base_form in rules:
            if word in forms:
                return base_form

        # Remaining months: rebuild the "-ень" nominative from an oblique
        # form (e.g. "січня" -> "січень") and accept it only if it is a
        # known month name.
        candidate = ''
        if word[-3:] in ("ого", "ому"):
            candidate = word[:-3] + "ень"
        elif word[-2:] in ("ні", "ня"):
            candidate = word[:-2] + "ень"
        if candidate and candidate in _MONTHS_UK:
            return candidate

    return word


# Case endings of compound numerals, longest first so that an ending which
# is a prefix of another one (e.g. "надцяти" / "надцятими") cannot shadow it.
_TEENS_CASE_ENDINGS = ('надцятьома', 'надцятими', 'надцятьох', 'надцятьма',
                       'надцятьом', 'надцятим', 'надцяти')
_TENS_CASE_ENDINGS = ('дцятьома', 'дцятими', 'дцятьох', 'дцятьма',
                      'дцятьом', 'дцятим', 'дцяти')
_FIFTY_TO_EIGHTY_CASE_ENDINGS = ('десятьома', 'десятьох', 'десятьом',
                                 'десятьма', 'десяти')
_HUNDREDS_CASE_ENDINGS = ('стами', 'стам', 'стах', 'ста', 'сот')


def _strip_case_ending(word, endings):
    """Return ``word`` without the first of ``endings`` it ends with.

    ``endings`` must be ordered longest first. Returns None when no ending
    matches or when nothing would be left in front of the ending.
    """
    for ending in endings:
        if len(word) > len(ending) and word.endswith(ending):
            return word[:-len(ending)]
    return None


def _plurals_normalizer(word):
    """
    Ukrainian plurals normalizer.

    Normalizes plural / case endings of numerals to the base numeral.
    Uses the _PLURALS dictionary for exceptions that cannot be covered
    by rules.

    Changes compared with the previous implementation:
      * exactly one ending is stripped (the longest one the word ends
        with); previously every ending found anywhere in the word was
        concatenated, so words such as "дванадцятьома" were not normalized;
      * an ending only counts at the end of the word and must leave a
        non-empty stem;
      * in the hundreds branch a word whose stem is not a recognised
        plural is returned unchanged instead of truncated.

    Args:
        word (str): token to normalize

    Returns:
        str: normalized numeral, or ``word`` unchanged when no rule applies
    """
    if word in _STRING_NUM_UK:
        return word

    # plurals 2-10 and other exceptions
    if word in _PLURALS:
        return _NUM_STRING_UK[_PLURALS[word]]

    # plurals 11-19
    stem = _strip_case_ending(word, _TEENS_CASE_ENDINGS)
    if stem is not None:
        if 'один' in word:
            return "одинадцять"
        return stem + 'надцять'

    # plurals 20, 30
    stem = _strip_case_ending(word, _TENS_CASE_ENDINGS)
    if stem is not None:
        return stem + 'дцять'

    # plurals 50, 60, 70, 80
    stem = _strip_case_ending(word, _FIFTY_TO_EIGHTY_CASE_ENDINGS)
    if stem is not None:
        return stem + 'десят'

    # plurals of the hundreds: the stem is an inflected 3..9
    stem = _strip_case_ending(word, _HUNDREDS_CASE_ENDINGS)
    if stem is not None and stem in _PLURALS:
        value = _PLURALS[stem]
        first_part = _NUM_STRING_UK[value]
        if value in (3, 4):
            return first_part + 'ста'
        if value in (5, 6, 9):
            # drop the trailing soft sign before "сот"
            return first_part[:-1] + 'сот'
        if value in (7, 8):
            return first_part + 'сот'

    return word
{x_in_x00}ста"}, + "15": {"match": "^[34]\\d{3}$", "format": "{x_in_x000} тисячі {x_in_x00}сот"}, + + "16": {"match": "^[5-9]0\\d{2}$", "format": "{x_in_x000} тисяч"}, + "17": {"match": "^[5-9]1\\d{2}$", "format": "{x_in_x000} тисяч сто"}, + "18": {"match": "^[5-9]2\\d{2}$", "format": "{x_in_x000} тисяч двісті"}, + "19": {"match": "^[5-9][34]\\d{2}$", "format": "{x_in_x000} тисяч {x_in_x00}ста"}, + "20": {"match": "^[5-9]\\d{3}$", "format": "{x_in_x000} тисяч {x_in_x00}сот"}, + + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "5": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "до нашої ери" + }, + "date_format": { + "date_full": "{weekday}, {day} {month}, {formatted_year}", + "date_full_no_year": "{weekday}, {day} {month}", + "date_full_no_year_month": "{weekday}, {day}", + "today": "сьогодні", + "tomorrow": "завтра", + "yesterday": "вчора" + }, + "date_time_format": { + "date_time": "{formatted_date} о {formatted_time}" + }, + "weekday": { + "0": "у понеділок", + "1": "у вівторок", + "2": "у середу", + "3": "у четвер", + "4": "у п'ятницю", + "5": "у суботу", + "6": "у неділю" + }, + "date": { + "1": "першого", + "2": "другого", + "3": "третього", + "4": "четвертого", + "5": "п'ятого", + "6": "шостого", + "7": "сьомого", + "8": "восьмого", + "9": "дев'ятого", + "10": "десятого", + "11": "одинадцятого", + "12": "дванадцятого", + "13": "тринадцятого", + "14": "чотирнадцятого", + "15": "п'ятнадцятого", + "16": "шістнадцятого", + "17": "сімнадцятого", + "18": "вісімнадцятого", + "19": "дев'ятнадцятого", + "20": "двадцятого", + "21": "двадцять першого", + "22": "двадцять другого", + "23": "двадцять третього", 
+ "24": "двадцять четвертого", + "25": "двадцять п'ятого", + "26": "двадцять шостого", + "27": "двадцять сьомого", + "28": "двадцять восьмого", + "29": "двадцять дев'ятого", + "30": "тридцятого", + "31": "тридцять першого" + }, + "month": { + "1": "січня", + "2": "лютого", + "3": "березня", + "4": "квітня", + "5": "травня", + "6": "червня", + "7": "липня", + "8": "серпня", + "9": "вересня", + "10": "жовтня", + "11": "листопада", + "12": "грудня" + }, + "number": { + "0": "нуль", + "1": "перший", + "2": "другий", + "3": "третій", + "4": "четвертий", + "5": "п'ятий", + "6": "шостий", + "7": "сьомий", + "8": "восьмий", + "9": "дев'ятий", + "10": "десятий", + "11": "одинадцятий", + "12": "дванадцятий", + "13": "тринадцятий", + "14": "чотирнадцятий", + "15": "п'ятнадцятий", + "16": "шістнадцятий", + "17": "сімнадцятий", + "18": "вісімнадцятий", + "19": "дев'ятнадцятий", + "20": "двадцять", + "30": "тридцять", + "40": "сорок", + "50": "п'ятдесят", + "60": "шістдесят", + "70": "сімдесят", + "80": "вісімдесят", + "90": "дев'яносто" + } +} diff --git a/lingua_franca/res/text/uk-uk/date_time_test.json b/lingua_franca/res/text/uk-uk/date_time_test.json new file mode 100644 index 00000000..eb645131 --- /dev/null +++ b/lingua_franca/res/text/uk-uk/date_time_test.json @@ -0,0 +1,22 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "перший до нашої ери" } + }, + + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "у вівторок, тридцять першого січня, дві тисячі сімнадцятий"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "у неділю, четвертого лютого, дві тисячі вісімнадцятий"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "у неділю, четвертого лютого"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "у неділю, четвертого"}, + "5": 
{"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "завтра"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "сьогодні"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "вчора"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "у неділю, четвертого лютого"}, + "9": {"datetime_param": "2021, 2, 4, 0, 2, 3", "now": "None", "assertEqual": "у четвер, четвертого лютого, дві тисячі двадцять перший"} + }, + + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "у вівторок, тридцять першого січня, дві тисячі сімнадцятий о перша година двадцять два дня"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "у вівторок, тридцять першого січня, дві тисячі сімнадцятий о тринадцять двадцять два"} + } +} diff --git a/lingua_franca/res/text/uk-uk/day.word b/lingua_franca/res/text/uk-uk/day.word new file mode 100644 index 00000000..bcb325eb --- /dev/null +++ b/lingua_franca/res/text/uk-uk/day.word @@ -0,0 +1 @@ +день diff --git a/lingua_franca/res/text/uk-uk/days.word b/lingua_franca/res/text/uk-uk/days.word new file mode 100644 index 00000000..de6b478d --- /dev/null +++ b/lingua_franca/res/text/uk-uk/days.word @@ -0,0 +1 @@ +днів diff --git a/lingua_franca/res/text/uk-uk/hour.word b/lingua_franca/res/text/uk-uk/hour.word new file mode 100644 index 00000000..b14784b9 --- /dev/null +++ b/lingua_franca/res/text/uk-uk/hour.word @@ -0,0 +1 @@ +година diff --git a/lingua_franca/res/text/uk-uk/hours.word b/lingua_franca/res/text/uk-uk/hours.word new file mode 100644 index 00000000..29ae7ff9 --- /dev/null +++ b/lingua_franca/res/text/uk-uk/hours.word @@ -0,0 +1 @@ +годин diff --git a/lingua_franca/res/text/uk-uk/minute.word 
b/lingua_franca/res/text/uk-uk/minute.word new file mode 100644 index 00000000..a95dd785 --- /dev/null +++ b/lingua_franca/res/text/uk-uk/minute.word @@ -0,0 +1 @@ +хвилина diff --git a/lingua_franca/res/text/uk-uk/minutes.word b/lingua_franca/res/text/uk-uk/minutes.word new file mode 100644 index 00000000..bd206800 --- /dev/null +++ b/lingua_franca/res/text/uk-uk/minutes.word @@ -0,0 +1 @@ +хвилин diff --git a/lingua_franca/res/text/uk-uk/normalize.json b/lingua_franca/res/text/uk-uk/normalize.json new file mode 100644 index 00000000..d8339dc1 --- /dev/null +++ b/lingua_franca/res/text/uk-uk/normalize.json @@ -0,0 +1,74 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": false, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": {}, + "number_replacements": { + "ноль": "0", + "нуль": "0", + "нуля": "0", + "один": "1", + "одна": "1", + "одну": "1", + "одного": "1", + "пару": "2", + "пари": "2", + "пара": "2", + "два": "2", + "двох": "2", + "дві": "2", + "три": "3", + "трьох": "3", + "чотири": "4", + "п'ять": "5", + "шість": "6", + "сім": "7", + "вісім": "8", + "дев'ять": "9", + "десять": "10", + "одинадцять": "11", + "дванадцять": "12", + "тринадцять": "13", + "чотирнадцять": "14", + "п'ятнадцять": "15", + "шістнадцять": "16", + "сімнадцять": "17", + "вісімнадцять": "18", + "дев'ятнадцять": "19", + "двадцять": "20", + "тридцять": "30", + "сорок": "40", + "п'ятдесят": "50", + "шістдесят": "60", + "сімдесят": "70", + "вісімдесят": "80", + "дев'яносто": "90", + "сто": "100", + "двісті": "200", + "триста": "300", + "чотириста": "400", + "п'ятсот": "500", + "шістсот": "600", + "сімсот": "700", + "вісімсот": "800", + "дев'ятсот": "900", + "дві сотні": "200", + "три сотні": "300", + "чотири сотні": "400", + "п'ять сотень": "500", + "шість сотень": "600", + "сім сотень": "700", + "вісім сотень": "800", + "дев'ять сотень": "900", + "тисячі": 
"1000", + "тисяча": "1000", + "тисяч": "1000" + }, + "stopwords": [], + "articles": [] +} diff --git a/lingua_franca/res/text/uk-uk/or.word b/lingua_franca/res/text/uk-uk/or.word new file mode 100644 index 00000000..b48ea1e8 --- /dev/null +++ b/lingua_franca/res/text/uk-uk/or.word @@ -0,0 +1 @@ +або \ No newline at end of file diff --git a/lingua_franca/res/text/uk-uk/second.word b/lingua_franca/res/text/uk-uk/second.word new file mode 100644 index 00000000..f1be0808 --- /dev/null +++ b/lingua_franca/res/text/uk-uk/second.word @@ -0,0 +1 @@ +секунда diff --git a/lingua_franca/res/text/uk-uk/seconds.word b/lingua_franca/res/text/uk-uk/seconds.word new file mode 100644 index 00000000..ba1da0d7 --- /dev/null +++ b/lingua_franca/res/text/uk-uk/seconds.word @@ -0,0 +1 @@ +секунд diff --git a/lingua_franca/res/text/uk-uk/yesno.json b/lingua_franca/res/text/uk-uk/yesno.json new file mode 100644 index 00000000..4e620ae4 --- /dev/null +++ b/lingua_franca/res/text/uk-uk/yesno.json @@ -0,0 +1,36 @@ +{ + "yes": [ + "так", + "та", + "авжеж", + "да", + "ага", + "правильно", + "підтверджую", + "погоджуюсь" + ], + "no": [ + "ні", + "не", + "нє", + "незадовільно", + "недобре" + ], + "neutral_yes": [ + "хочу", + "будь ласка", + "точно", + "вірно", + "підходить", + "задовільно", + "очевидно", + "можна", + "задовільняє" + ], + "neutral_no": [ + "незадовільно", + "неправда", + "помилка", + "маячня" + ] +} \ No newline at end of file diff --git a/test/unittests/test_format_ru.py b/test/unittests/test_format_ru.py index 8112a36b..73211b3f 100644 --- a/test/unittests/test_format_ru.py +++ b/test/unittests/test_format_ru.py @@ -573,3 +573,5 @@ def test_join(self): if __name__ == "__main__": unittest.main() + +#%% diff --git a/test/unittests/test_format_uk.py b/test/unittests/test_format_uk.py new file mode 100644 index 00000000..4d6a8bd2 --- /dev/null +++ b/test/unittests/test_format_uk.py @@ -0,0 +1,581 @@ +# +# Copyright 2017 Mycroft AI Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import json +import unittest +import datetime +import ast +import sys +from pathlib import Path + +from lingua_franca import get_default_lang, set_default_lang, \ + load_language, unload_language +from lingua_franca.format import date_time_format +from lingua_franca.format import join_list +from lingua_franca.format import nice_date +from lingua_franca.format import nice_date_time +from lingua_franca.format import nice_duration +from lingua_franca.format import nice_number +from lingua_franca.format import nice_time +from lingua_franca.format import nice_year +from lingua_franca.format import pronounce_number +from lingua_franca.time import default_timezone + + +def setUpModule(): + load_language("uk-uk") + set_default_lang("uk") + + +def tearDownModule(): + unload_language("uk") + + +NUMBERS_FIXTURE_UK = { + 1.435634: "1.436", + 2: "2", + 5.0: "5", + 0.027: "0.027", + 0.5: "1 друга", + 1.333: "1 і 1 третя", + 2.666: "2 і 2 треті", + 0.25: "1 четверта", + 1.25: "1 і 1 четверта", + 0.75: "3 четверті", + 1.75: "1 і 3 четверті", + 3.4: "3 і 2 п'яті", + 16.8333: "16 і 5 шостих", + 12.5714: "12 і 4 сьомі", + 9.625: "9 і 5 восьмих", + 6.777: "6 і 7 дев'ятих", + 3.1: "3 і 1 десята", + 2.272: "2 і 3 одинадцяті", + 5.583: "5 і 7 дванадцятих", + 8.384: "8 і 5 тринадцятих", + 0.071: "1 чотирнадцята", + 6.466: "6 і 7 п'ятнадцятих", + 8.312: "8 і 5 шістнадцятих", + 2.176: "2 і 3 сімнадцяті", + 200.722: "200 і 13 вісімнадцятих", 
+ 7.421: "7 і 8 дев'ятнадцятих", + 0.05: "1 двадцята" +} + +def setUpModule(): + load_language("uk-uk") + set_default_lang("uk") + +class TestNiceNumberFormat(unittest.TestCase): + load_language("uk-uk") + set_default_lang("uk") + + def test_convert_float_to_nice_number(self): + load_language("uk-uk") + set_default_lang("uk") + for number, number_str in NUMBERS_FIXTURE_UK.items(): + self.assertEqual(nice_number(number, speech=True), number_str, + "повинен відформатувати {} як {}, а не {}".format( + number, number_str, nice_number(number, speech=True))) + + def test_specify_denominator(self): + self.assertEqual(nice_number(5.5, speech=True, denominators=[1, 2, 3]), + "5 з половиною", + "повинен відформатувати 5.5 як 5 з половиною, а не {}".format( + nice_number(5.5, speech=True, denominators=[1, 2, 3]))) + self.assertEqual(nice_number(2.333, speech=True, denominators=[1, 2]), + "2.333", + "повинен відформатувати 2.333 як 2.333, а не {}".format( + nice_number(2.333, speech=True, denominators=[1, 2]))) + + def test_no_speech(self): + self.assertEqual(nice_number(6.777, speech=False), + "6 7/9", + "повинен відформатувати 6.777 як 6 7/9, а не {}".format( + nice_number(6.777, speech=False))) + self.assertEqual(nice_number(6.0, speech=False), + "6", + "повинен відформатувати 6.0 як 6, а не {}".format( + nice_number(6.0, speech=False))) + + +class TestPronounceNumber(unittest.TestCase): + + def test_convert_int(self): + self.assertEqual(pronounce_number(0), "нуль") + self.assertEqual(pronounce_number(1), "один") + self.assertEqual(pronounce_number(10), "десять") + self.assertEqual(pronounce_number(15), "п'ятнадцять") + self.assertEqual(pronounce_number(20), "двадцять") + self.assertEqual(pronounce_number(27), "двадцять сім") + self.assertEqual(pronounce_number(30), "тридцять") + self.assertEqual(pronounce_number(33), "тридцять три") + + def test_convert_negative_int(self): + self.assertEqual(pronounce_number(-1), "мінус один") + self.assertEqual(pronounce_number(-10), 
"мінус десять") + self.assertEqual(pronounce_number(-15), "мінус п'ятнадцять") + self.assertEqual(pronounce_number(-20), "мінус двадцять") + self.assertEqual(pronounce_number(-27), "мінус двадцять сім") + self.assertEqual(pronounce_number(-30), "мінус тридцять") + self.assertEqual(pronounce_number(-33), "мінус тридцять три") + + def test_convert_decimals(self): + self.assertEqual(pronounce_number(0.05), "нуль крапка нуль п'ять") + self.assertEqual(pronounce_number(-0.05), "мінус нуль крапка нуль п'ять") + self.assertEqual(pronounce_number(1.234), + "один крапка два три") + self.assertEqual(pronounce_number(21.234), + "двадцять один крапка два три") + self.assertEqual(pronounce_number(21.234, places=1), + "двадцять один крапка два") + self.assertEqual(pronounce_number(21.234, places=0), + "двадцять один") + self.assertEqual(pronounce_number(21.234, places=3), + "двадцять один крапка два три чотири") + self.assertEqual(pronounce_number(21.234, places=4), + "двадцять один крапка два три чотири") + self.assertEqual(pronounce_number(21.234, places=5), + "двадцять один крапка два три чотири") + self.assertEqual(pronounce_number(-1.234), + "мінус один крапка два три") + self.assertEqual(pronounce_number(-21.234), + "мінус двадцять один крапка два три") + self.assertEqual(pronounce_number(-21.234, places=1), + "мінус двадцять один крапка два") + self.assertEqual(pronounce_number(-21.234, places=0), + "мінус двадцять один") + self.assertEqual(pronounce_number(-21.234, places=3), + "мінус двадцять один крапка два три чотири") + self.assertEqual(pronounce_number(-21.234, places=4), + "мінус двадцять один крапка два три чотири") + self.assertEqual(pronounce_number(-21.234, places=5), + "мінус двадцять один крапка два три чотири") + + def test_convert_stos(self): + self.assertEqual(pronounce_number(100), "сто") + self.assertEqual(pronounce_number(666), "шістсот шістдесят шість") + self.assertEqual(pronounce_number(1456), "тисяча чотириста п'ятдесят шість") + 
self.assertEqual(pronounce_number(103254654), "сто три мільйона " + "двісті п'ятдесят " + "чотири тисячі " + "шістсот " + "п'ятдесят чотири") + self.assertEqual(pronounce_number(1512457), "мільйон п'ятсот" + " дванадцять тисяч " + "чотириста п'ятдесят " + "сім") + self.assertEqual(pronounce_number(209996), "двісті дев'ять " + "тисяч дев'ятсот " + "дев'яносто шість") + + def test_convert_scientific_notation(self): + self.assertEqual(pronounce_number(0, scientific=True), "нуль") + self.assertEqual(pronounce_number(33, scientific=True), + "три крапка три на десять у ступені один") + self.assertEqual(pronounce_number(299792458, scientific=True), + "два крапка дев'ять дев'ять на десять у ступені вісім") + self.assertEqual(pronounce_number(299792458, places=6, + scientific=True), + "два крапка дев'ять дев'ять сім дев'ять два п'ять " + "на десять у ступені вісім") + self.assertEqual(pronounce_number(1.672e-27, places=3, + scientific=True), + "один крапка шість сім два на десять у ступені " + "мінус двадцять сім") + + def test_auto_scientific_notation(self): + self.assertEqual( + pronounce_number(1.1e-150), "один крапка один на десять у ступені " + "мінус сто п'ятдесят") + + def test_large_numbers(self): + self.maxDiff = None + self.assertEqual( + pronounce_number(299792458, short_scale=True), + "двісті дев'яносто дев'ять мільйонів сімсот " + "дев'яносто дві тисячі чотириста п'ятдесят вісім") + self.assertEqual( + pronounce_number(299792458, short_scale=False), + "двісті дев'яносто дев'ять мільйонів сімсот " + "дев'яносто дві тисячі чотириста п'ятдесят вісім") + self.assertEqual( + pronounce_number(100034000000299792458, short_scale=True), + "сто квадрилліонів тридцять чотири більйона " + "двісті дев'яносто дев'ять мільйонів сімсот " + "дев'яносто дві тисячі чотириста п'ятдесят вісім") + self.assertEqual( + pronounce_number(100034000000299792458, short_scale=False), + "сто більйонів тридцять чотири тисячі мільярдів " + "двісті дев'яносто дев'ять мільйонів сімсот " + 
"дев'яносто дві тисячі чотириста п'ятдесят вісім") + self.assertEqual( + pronounce_number(1e10, short_scale=True), + "десять мільярдів") + self.assertEqual( + pronounce_number(1e12, short_scale=True), + "більйон") + # TODO maybe beautify this + self.assertEqual( + pronounce_number(1000001, short_scale=True), + "мільйон один") + self.assertEqual(pronounce_number(95505896639631893, short_scale=True), + "дев'яносто п'ять більйонів " + "п'ятсот п'ять квінтиліонів " + "вісімсот дев'яносто шість мільярдів " + "шістсот тридцять дев'ять мільйонів " + "шістсот тридцять одна тисяча " + "вісімсот дев'яносто три") + self.assertEqual(pronounce_number(95505896639631893, + short_scale=False), + "дев'яносто п'ять тисяч п'ятсот п'ять мільярдів " + "вісімсот дев'яносто шість тисяч " + "шістсот тридцять дев'ять мільйонів " + "шістсот тридцять одна тисяча " + "вісімсот дев'яносто три") + self.assertEqual(pronounce_number(10e80, places=1), + "секснвігінтіліон") + # TODO floating point rounding issues might happen + # self.assertEqual(pronounce_number(1.9874522571e80, places=9), + # "сто дев'яносто вісім квінвігінтільйонів " + # # "сімсот сорок п'ять кватторвігінтільйонів " + # "двісті двадцять п'ять тревігінтільйонів " + # "сімсот дев'ять дуовігінтільйонів " + # "дев'ятсот дев'яносто дев'ять унвігінтільйонів " + # "дев'ятсот вісімдесят дев'ять вигінтильйонів " + # "сімсот тридцять новемдециліонів " + # "дев'ятсот девятнадцать октодецильйонів " + # "дев'ятсот дев'яносто дев'ять септендециліонів " + # "дев'ятсот п'ятдесят п'ять сексдециліонів " + # "чотириста дев'яносто вісім квіндециліонів " + # "двісті чотирнадцять кваттордециліонів " + # "вісімсот сорок п'ять тредецільйонів " + # "чотириста двадцять дев'ять дуодецильйонів " + # "чотириста сорок чотири ундецильйона " + # "триста тридцять шість дециліонів " + # "сімсот двадцять чотири нонільйону " + # "п'ятсот шістьдесят дев'ять октильйонів " + # "триста сімдесят п'ять септільйонів " + # "двісті тридцять дев'ять секстильйонів " + # 
"шістсот сімдесят квінтільйонів " + # "п'ятсот сімдесят чотири квадрильйона " + # "сімсот тридцять дев'ять трильйонів " + # "сімсот сорок вісім мільярдів " + # "чотириста сімдесят мільйонів " + # "дев'ятсот п'ятнадцять тысяч " + # "сімдесят два") + + # infinity + self.assertEqual( + pronounce_number(sys.float_info.max * 2), "нескінченність") + self.assertEqual( + pronounce_number(float("inf")), + "нескінченність") + self.assertEqual( + pronounce_number(float("-inf")), + "мінус нескінченність") + + def test_ordinals(self): + self.assertEqual(pronounce_number(1, ordinals=True), "перший") + self.assertEqual(pronounce_number(10, ordinals=True), "десятий") + self.assertEqual(pronounce_number(15, ordinals=True), "п'ятнадцятий") + self.assertEqual(pronounce_number(20, ordinals=True), "двадцятий") + self.assertEqual(pronounce_number(27, ordinals=True), "двадцять сьомий") + self.assertEqual(pronounce_number(30, ordinals=True), "тридцятий") + self.assertEqual(pronounce_number(33, ordinals=True), "тридцять третій") + self.assertEqual(pronounce_number(100, ordinals=True), "сотий") + self.assertEqual(pronounce_number(1000, ordinals=True), "тисячний") + self.assertEqual(pronounce_number(10000, ordinals=True), + "десятитисячний") + self.assertEqual(pronounce_number(18691, ordinals=True), + "вісімнадцять тисяч шістсот дев'яносто перший") + self.assertEqual(pronounce_number(1567, ordinals=True), + "тисяча п'ятсот шістдесят сьомий") + self.assertEqual(pronounce_number(1.672e-27, places=3, + scientific=True, ordinals=True), + "один крапка шість сім два на десять у мінус " + "двадцять сьомому ступені") + self.assertEqual(pronounce_number(1e6, ordinals=True), + "мільйонний") + self.assertEqual(pronounce_number(2e6, ordinals=True), + "двохмільйонний") + self.assertEqual(pronounce_number(3e6, ordinals=True), + "трьохмільйонний") + self.assertEqual(pronounce_number(4e6, ordinals=True), + "чотирьохмільйонний") + self.assertEqual(pronounce_number(18e6, ordinals=True), + 
"вісімнадцятимільйонний") + self.assertEqual(pronounce_number(18e12, ordinals=True), + "вісімнадцятибільйонний") + self.assertEqual(pronounce_number(18e18, ordinals=True, + short_scale=False), "вісімнадцятитрильйонний") + + +class TestNiceDateFormat(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Read date_time_test.json files for test data + cls.test_config = {} + p = Path(date_time_format.config_path) + print(p) + for sub_dir in [x for x in p.iterdir() if x.is_dir()]: + print(sub_dir) + if (sub_dir / "date_time_test.json").exists(): + print("Loading test for " + + str(sub_dir / "date_time_test.json")) + with (sub_dir / "date_time_test.json").open() as f: + cls.test_config[sub_dir.parts[-1]] = json.loads(f.read()) + + def test_convert_times(self): + dt = datetime.datetime(2017, 1, 31, + 13, 22, 3, tzinfo=default_timezone()) + + # Verify defaults haven't changed + self.assertEqual(nice_time(dt), + nice_time(dt, speech=True, use_24hour=True, use_ampm=False)) + + self.assertEqual(nice_time(dt, use_24hour=False), + "перша година двадцять два") + self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), + "перша година двадцять два дня") + self.assertEqual(nice_time(dt, speech=False, use_24hour=False), + "1:22") + self.assertEqual(nice_time(dt, speech=False, use_24hour=False, use_ampm=True), + "1:22 дня") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:22") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "13:22") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "тринадцять двадцять два") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "тринадцять двадцять два") + + dt = datetime.datetime(2017, 1, 31, + 13, 0, 3, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt, use_24hour=False), + "перша година") + self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), + "перша година дня") + self.assertEqual(nice_time(dt, use_24hour=False,
speech=False), + "1:00") + self.assertEqual(nice_time(dt, speech=False, use_24hour=False, use_ampm=True), + "1:00 дня") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:00") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "13:00") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "тринадцять рівно") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "тринадцять рівно") + + dt = datetime.datetime(2017, 1, 31, + 13, 2, 3, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt, use_24hour=False), + "перша година нуль два") + self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), + "перша година нуль два дня") + self.assertEqual(nice_time(dt, use_24hour=False, speech=False), + "1:02") + self.assertEqual(nice_time(dt, use_24hour=False, speech=False, use_ampm=True), + "1:02 дня") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "13:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "тринадцять нуль два") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "тринадцять нуль два") + + dt = datetime.datetime(2017, 1, 31, + 0, 2, 3, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt, use_24hour=False), + "дванадцята година нуль два") + self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), + "дванадцята година нуль два ночі") + self.assertEqual(nice_time(dt, speech=False, use_24hour=False), + "12:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=False, use_ampm=True), + "12:02 ночі") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "00:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "00:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "нуль нуль нуль два") + self.assertEqual(nice_time(dt, use_24hour=True, 
use_ampm=False), + "нуль нуль нуль два") + + dt = datetime.datetime(2018, 2, 8, + 1, 2, 33, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt, use_24hour=False), + "перша година нуль два") + self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), + "перша година нуль два ночі") + self.assertEqual(nice_time(dt, speech=False, use_24hour=False), + "1:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=False, use_ampm=True), + "1:02 ночі") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "01:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "01:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "нуль один нуль два") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "нуль один нуль два") + + dt = datetime.datetime(2017, 1, 31, + 12, 15, 9, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt, use_24hour=False), + "чверть після дванадцятої години") + self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), + "чверть після дванадцятої години дня") + + dt = datetime.datetime(2017, 1, 31, + 5, 30, 00, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt, use_24hour=False, use_ampm=True), + "половина після п'ятої години ранку") + + dt = datetime.datetime(2017, 1, 31, + 1, 45, 00, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt, use_24hour=False), + "без четверті друга година") + + def test_nice_date(self): + lang = "uk-uk" + i = 1 + print(self.test_config[lang]["test_nice_date"].get(str(i))) + while (self.test_config[lang].get("test_nice_date") and + self.test_config[lang]["test_nice_date"].get(str(i))): + p = self.test_config[lang]["test_nice_date"][str(i)] + dp = ast.literal_eval(p["datetime_param"]) + np = ast.literal_eval(p["now"]) + dt = datetime.datetime( + dp[0], dp[1], dp[2], dp[3], dp[4], dp[5], + tzinfo=default_timezone()) + now = None if not np else datetime.datetime( + np[0], np[1], np[2], np[3], np[4], 
np[5], + tzinfo=default_timezone()) + print("Testing for " + lang + " that " + str(dt) + + " is date " + p["assertEqual"]) + self.assertEqual(p["assertEqual"], + nice_date(dt, lang=lang, now=now)) + i = i + 1 + + # Test all days in a year for this language, + # that some output is produced + # for lang in self.test_config: + for dt in (datetime.datetime(2017, 12, 30, 0, 2, 3, + tzinfo=default_timezone()) + + datetime.timedelta(n) for n in range(368)): + self.assertTrue(len(nice_date(dt, lang=lang)) > 0) + + def test_nice_date_time(self): + lang = "uk-uk" + i = 1 + while (self.test_config[lang].get("test_nice_date_time") and + self.test_config[lang]["test_nice_date_time"].get(str(i))): + p = self.test_config[lang]["test_nice_date_time"][str(i)] + dp = ast.literal_eval(p["datetime_param"]) + np = ast.literal_eval(p["now"]) + dt = datetime.datetime( + dp[0], dp[1], dp[2], dp[3], dp[4], dp[5], + tzinfo=default_timezone()) + now = None if not np else datetime.datetime( + np[0], np[1], np[2], np[3], np[4], np[5]) + print("Testing for " + lang + " that " + str(dt) + + " is date time " + p["assertEqual"]) + self.assertEqual( + p["assertEqual"], + nice_date_time( + dt, lang=lang, now=now, + use_24hour=ast.literal_eval(p["use_24hour"]), + use_ampm=ast.literal_eval(p["use_ampm"]))) + i = i + 1 + + def test_nice_year(self): + lang = "uk-uk" + i = 1 + while (self.test_config[lang].get("test_nice_year") and + self.test_config[lang]["test_nice_year"].get(str(i))): + p = self.test_config[lang]["test_nice_year"][str(i)] + print(p) + dp = ast.literal_eval(p["datetime_param"]) + dt = datetime.datetime( + dp[0], dp[1], dp[2], dp[3], dp[4], dp[5], + tzinfo=default_timezone()) + print("Testing for " + lang + " that " + str(dt) + + " is year " + p["assertEqual"]) + self.assertEqual(p["assertEqual"], nice_year( + dt, lang=lang, bc=ast.literal_eval(p["bc"]))) + i = i + 1 + + # Test all years from 1 to 9998 for this language, + # that some output is produced + print("Test all years in " +
lang) + for i in range(1, 9999): + dt = datetime.datetime(i, 1, 31, 13, 2, 3, + tzinfo=default_timezone()) + self.assertTrue(len(nice_year(dt, lang=lang)) > 0) + + def test_nice_duration(self): + + self.assertEqual(nice_duration(1), "одна секунда") + self.assertEqual(nice_duration(3), "три секунди") + self.assertEqual(nice_duration(1, speech=False), "0:01") + self.assertEqual(nice_duration(61), "одна хвилина одна секунда") + self.assertEqual(nice_duration(121), "дві хвилини одна секунда") + self.assertEqual(nice_duration(61, speech=False), "1:01") + self.assertEqual(nice_duration(5000), + "одна година двадцять три хвилини двадцять секунд") + self.assertEqual(nice_duration(5000, speech=False), "1:23:20") + self.assertEqual(nice_duration(50000), + "тринадцять годин п'ятдесят три хвилини двадцять секунд") + self.assertEqual(nice_duration(50000, speech=False), "13:53:20") + self.assertEqual(nice_duration(500000), + "п'ять днів вісімнадцять годин п'ятдесят три хвилини двадцять секунд") # nopep8 + self.assertEqual(nice_duration(500000, speech=False), "5d 18:53:20") + self.assertEqual(nice_duration(datetime.timedelta(seconds=500000), + speech=False), + "5d 18:53:20") + + def test_join(self): + self.assertEqual(join_list(None, "і"), "") + self.assertEqual(join_list([], "і"), "") + + self.assertEqual(join_list(["a"], "і"), "a") + self.assertEqual(join_list(["a", "b"], "і"), "a і b") + self.assertEqual(join_list(["a", "b"], "або"), "a або b") + + self.assertEqual(join_list(["a", "b", "c"], "і"), "a, b і c") + self.assertEqual(join_list(["a", "b", "c"], "або"), "a, b або c") + self.assertEqual( + join_list(["a", "b", "c"], "або", ";"), "a; b або c") + self.assertEqual( + join_list(["a", "b", "c", "d"], "або"), "a, b, c або d") + + self.assertEqual(join_list([1, "b", 3, "d"], "або"), "1, b, 3 або d") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/unittests/test_parse_ru.py b/test/unittests/test_parse_ru.py index 7792a436..8fae04e3 100644 --- 
a/test/unittests/test_parse_ru.py +++ b/test/unittests/test_parse_ru.py @@ -187,6 +187,7 @@ def test_extract_number(self): self.assertEqual(extract_number("в общем 100%"), 100) def test_extract_duration_ru(self): + self.assertEqual(extract_duration("10 секунд"), (timedelta(seconds=10.0), "")) self.assertEqual(extract_duration("5 минут"),