From 5a2a29e1a116cad77c587b7294f057fb39aefc5d Mon Sep 17 00:00:00 2001 From: Juanjo Diaz Date: Wed, 18 Jan 2023 17:33:36 +0100 Subject: [PATCH] refactor: allow user to configure cache sizes --- simplemma/dictionary_pickler.py | 8 +- simplemma/langdetect.py | 11 +- simplemma/simplemma.py | 411 +++++++++++++------------ simplemma/simplemma.py.orig | 530 -------------------------------- simplemma/utils.py | 4 - tests/test_simplemma.py | 48 +-- tests/udscore.py | 7 +- 7 files changed, 254 insertions(+), 765 deletions(-) delete mode 100644 simplemma/simplemma.py.orig diff --git a/simplemma/dictionary_pickler.py b/simplemma/dictionary_pickler.py index 396cbdf..6f19f97 100644 --- a/simplemma/dictionary_pickler.py +++ b/simplemma/dictionary_pickler.py @@ -8,7 +8,13 @@ from typing import List, Dict, Optional from .constants import LANGLIST -from .utils import levenshtein_dist +from .utils import levenshtein_dist as raw_levenshtein_dist + +from functools import lru_cache + +@lru_cache(maxsize=65536) +def levenshtein_dist(str1: str, str2: str) -> int: + return raw_levenshtein_dist(str1, str2) try: from .rules import apply_rules diff --git a/simplemma/langdetect.py b/simplemma/langdetect.py index 520d45c..1faaaa7 100644 --- a/simplemma/langdetect.py +++ b/simplemma/langdetect.py @@ -6,7 +6,7 @@ from operator import itemgetter from typing import List, Optional, Tuple -from .simplemma import _return_lemma +from .simplemma import Lemmatizer from .dictionaries import DictionaryCache SPLIT_INPUT = re.compile(r"[^\W\d_]{3,}") @@ -41,6 +41,7 @@ def __init__(self, dictionaryCache: Optional[DictionaryCache] = None) -> None: dictionaryCache = DictionaryCache() assert isinstance(dictionaryCache, DictionaryCache) self.dictionaryCache: DictionaryCache = dictionaryCache + self.lemmatizer = Lemmatizer(self.dictionaryCache) def in_target_language( self, text: str, lang: Optional[Tuple[str]] = None, sample_size: int = 1000 @@ -52,7 +53,9 @@ def in_target_language( for token in prepare_text(text, sample_size): total += 1 for l in self.dictionaryCache.data: - candidate = _return_lemma(token, l.dict, greedy=True, lang=l.code) + candidate = self.lemmatizer._return_lemma( + token, l.dict, greedy=True, lang=l.code + ) if candidate is not None: in_target += 1 break @@ -78,7 +81,9 @@ def lang_detector( for l in self.dictionaryCache.data: in_target = 0 for token in tokens: - candidate = _return_lemma(token, l.dict, greedy=extensive, lang=l.code) + candidate = self.lemmatizer._return_lemma( + token, l.dict, greedy=extensive, lang=l.code + ) if candidate is not None: in_target += 1 # compute results diff --git a/simplemma/simplemma.py b/simplemma/simplemma.py index 646816d..c75e656 100644 --- a/simplemma/simplemma.py +++ b/simplemma/simplemma.py @@ -35,203 +35,6 @@ PUNCTUATION = {".", "?", "!", "…", "¿", "¡"} -def _simple_search( - token: str, datadict: Dict[str, str], initial: bool = False -) -> Optional[str]: - # beginning of sentence, reverse case - if initial: - token = token.lower() - candidate = datadict.get(token) - if candidate is None: - # try upper or lowercase - if token[0].isupper(): - candidate = datadict.get(token.lower()) - else: - candidate = datadict.get(token.capitalize()) - return candidate - - -def _greedy_search( - candidate: str, datadict: Dict[str, str], steps: int = 1, distance: int = 5 -) -> str: - i = 0 - while candidate in datadict and ( - len(datadict[candidate]) < len(candidate) - and levenshtein_dist(datadict[candidate], candidate) <= distance - ): - candidate = datadict[candidate] - i += 1 - if i 
>= steps: - break - return candidate - - -def _decompose( - token: str, datadict: Dict[str, str], affixlen: int = 0 -) -> Tuple[Optional[str], Optional[str]]: - candidate, plan_b = None, None - # this only makes sense for languages written from left to right - # AFFIXLEN or MINCOMPLEN can spare time for some languages - for count in range(1, len(token) - MINCOMPLEN + 1): - part1, part2 = token[:-count], token[-count:] - # part1_aff = token[:-(count + affixlen)] - lempart1 = _simple_search(part1, datadict) - if lempart1 is not None: - # maybe an affix? discard it - if count <= affixlen: - candidate = lempart1 - break - # account for case before looking for second part - if token[0].isupper(): - part2 = part2.capitalize() - lempart2 = _simple_search(part2, datadict) - if lempart2 is not None: - # candidate must be shorter - # try original case, then substitute - if lempart2[0].isupper(): - substitute = part2.lower() - else: - substitute = part2.capitalize() - # try other case - greedy_candidate = _greedy_search(substitute, datadict) - # shorten the second known part of the token - if greedy_candidate and len(greedy_candidate) < len(part2): - candidate = part1 + greedy_candidate.lower() - # backup: equal length or further candidates accepted - if candidate is None: - # try without capitalizing - lower_candidate = _simple_search(part2, datadict) - if lower_candidate and len(lower_candidate) <= len(part2): - candidate = part1 + lower_candidate.lower() - # even greedier - # with capital letter? - elif len(lempart2) < len(part2) + affixlen: - plan_b = part1 + lempart2.lower() - # print(part1, part2, affixlen, count, newcandidate, planb) - # elif newcandidate and len(newcandidate) < len(part2) + affixlen: - # plan_b = part1 + newcandidate.lower() - # print(part1, part2, affixlen, count, newcandidate, planb) - # else: - # print(part1, part2, affixlen, count, newcandidate) - break - return candidate, plan_b - - -def _dehyphen(token: str, datadict: Dict[str, str], greedy: bool) -> Optional[str]: - splitted = HYPHEN_REGEX.split(token) - if len(splitted) > 1 and splitted[-1]: - # try to find a word form without hyphen - subcandidate = "".join([t for t in splitted if t not in HYPHENS]).lower() - if token[0].isupper(): - subcandidate = subcandidate.capitalize() - candidate = datadict.get(subcandidate) - if candidate: - return candidate - # decompose - last_candidate = _simple_search(splitted[-1], datadict) - # search further - if last_candidate is None and greedy: - last_candidate = _affix_search(splitted[-1], datadict) - # return - if last_candidate is not None: - splitted[-1] = last_candidate - return "".join(splitted) - return None - - -def _affix_search( - wordform: str, datadict: Dict[str, str], maxlen: int = AFFIXLEN -) -> Optional[str]: - for length in range(maxlen, 1, -1): - candidate, plan_b = _decompose(wordform, datadict, affixlen=length) - if candidate is not None: - break - # exceptionally accept a longer solution - if candidate is None and plan_b is not None: - candidate = plan_b - return candidate - - -def _prefix_search(token: str, lang: str, datadict: Dict[str, str]) -> Optional[str]: - # load prefixes - if lang == "de": - preflist = GERMAN_PREFIXES - elif lang == "ru": - preflist = RUSSIAN_PREFIXES - else: - return None - # apply - prefix = None - for p in preflist: - if token.startswith(p): - prefix = p - break - # decompose according to predefined prefix - if prefix is not None: - subword = _simple_search(token[len(prefix) :], datadict) - if subword is not None: - if lang != "de" or 
token[len(prefix) : len(prefix) + 2] != "zu": - return prefix + subword.lower() - return None - - -def _suffix_search(token: str, datadict: Dict[str, str]) -> Optional[str]: - lastcount = 0 - for count in range(MINCOMPLEN, len(token) - MINCOMPLEN + 1): - part = _simple_search(token[-count:].capitalize(), datadict) - if part is not None and len(part) <= len(token[-count:]): - lastpart, lastcount = part, count - if lastcount > 0: - return token[:-lastcount] + lastpart.lower() - return None - - -def _return_lemma( - token: str, - datadict: Dict[str, str], - greedy: bool = True, - lang: Optional[str] = None, - initial: bool = False, -) -> Optional[str]: - # filters - if token.isnumeric(): - return token - # dictionary search - candidate = _simple_search(token, datadict, initial=initial) - # simple rules - if candidate is None and lang in RULES_LANGS: - candidate = apply_rules(token, lang, greedy) - # decomposition - if candidate is None: # and greedy is True - candidate = _dehyphen(token, datadict, greedy) - else: - newcandidate = _dehyphen(candidate, datadict, greedy) - if newcandidate is not None: - candidate = newcandidate - # stop here in some cases - # if not greedy: - # return candidate - limit = 6 if lang in SHORTER_GREEDY else 8 - if len(token) <= limit: - return candidate - # subword decomposition: predefined prefixes (absent from vocabulary if they are not words) - if candidate is None: - candidate = _prefix_search(token, lang, datadict) # type: ignore[arg-type] - # unsupervised suffix/affix search: not productive for all languages - if candidate is None and (greedy or lang in AFFIX_LANGS): - # define parameters - maxlen = LONGAFFIXLEN if lang in LONGER_AFFIXES else AFFIXLEN - # greedier subword decomposition: suffix search with character in between - # then suffixes - candidate = _affix_search(token, datadict, maxlen) or _suffix_search( - token, datadict - ) - # greedy mode: try further hops, not always a good idea - if candidate is not None and greedy: - candidate = _greedy_search(candidate, datadict) - return candidate - - def _control_input_type(token: Any) -> None: "Make sure the input is a string of length > 0." 
if not isinstance(token, str): @@ -241,12 +44,218 @@ def _control_input_type(token: Any) -> None: class Lemmatizer: - def __init__(self, dictionaryCache: Optional[DictionaryCache] = None, lemmatizationCacheMaxSize = 1048576, ) -> None: + def __init__( + self, + dictionaryCache: Optional[DictionaryCache] = None, + lemmatization_distance_cache_max_size=1048576, + levenshtein_distance_cache_max_size=1048576, + ) -> None: if dictionaryCache == None: dictionaryCache = DictionaryCache() assert isinstance(dictionaryCache, DictionaryCache) self.dictionaryCache: DictionaryCache = dictionaryCache - self.lemmatize = lru_cache(maxsize=1048576)(self._lemmatize) + self.lemmatize = lru_cache(maxsize=lemmatization_distance_cache_max_size)( + self._lemmatize + ) + self.levenshtein_dist = lru_cache(maxsize=levenshtein_distance_cache_max_size)( + levenshtein_dist + ) + + def _simple_search( + self, token: str, datadict: Dict[str, str], initial: bool = False + ) -> Optional[str]: + # beginning of sentence, reverse case + if initial: + token = token.lower() + candidate = datadict.get(token) + if candidate is None: + # try upper or lowercase + if token[0].isupper(): + candidate = datadict.get(token.lower()) + else: + candidate = datadict.get(token.capitalize()) + return candidate + + def _greedy_search( + self, + candidate: str, + datadict: Dict[str, str], + steps: int = 1, + distance: int = 5, + ) -> str: + i = 0 + while candidate in datadict and ( + len(datadict[candidate]) < len(candidate) + and self.levenshtein_dist(datadict[candidate], candidate) <= distance + ): + candidate = datadict[candidate] + i += 1 + if i >= steps: + break + return candidate + + def _decompose( + self, token: str, datadict: Dict[str, str], affixlen: int = 0 + ) -> Tuple[Optional[str], Optional[str]]: + candidate, plan_b = None, None + # this only makes sense for languages written from left to right + # AFFIXLEN or MINCOMPLEN can spare time for some languages + for count in range(1, len(token) - MINCOMPLEN + 1): + part1, part2 = token[:-count], token[-count:] + # part1_aff = token[:-(count + affixlen)] + lempart1 = self._simple_search(part1, datadict) + if lempart1 is not None: + # maybe an affix? discard it + if count <= affixlen: + candidate = lempart1 + break + # account for case before looking for second part + if token[0].isupper(): + part2 = part2.capitalize() + lempart2 = self._simple_search(part2, datadict) + if lempart2 is not None: + # candidate must be shorter + # try original case, then substitute + if lempart2[0].isupper(): + substitute = part2.lower() + else: + substitute = part2.capitalize() + # try other case + greedy_candidate = self._greedy_search(substitute, datadict) + # shorten the second known part of the token + if greedy_candidate and len(greedy_candidate) < len(part2): + candidate = part1 + greedy_candidate.lower() + # backup: equal length or further candidates accepted + if candidate is None: + # try without capitalizing + lower_candidate = self._simple_search(part2, datadict) + if lower_candidate and len(lower_candidate) <= len(part2): + candidate = part1 + lower_candidate.lower() + # even greedier + # with capital letter? 
+ elif len(lempart2) < len(part2) + affixlen: + plan_b = part1 + lempart2.lower() + # print(part1, part2, affixlen, count, newcandidate, planb) + # elif newcandidate and len(newcandidate) < len(part2) + affixlen: + # plan_b = part1 + newcandidate.lower() + # print(part1, part2, affixlen, count, newcandidate, planb) + # else: + # print(part1, part2, affixlen, count, newcandidate) + break + return candidate, plan_b + + def _dehyphen( + self, token: str, datadict: Dict[str, str], greedy: bool + ) -> Optional[str]: + splitted = HYPHEN_REGEX.split(token) + if len(splitted) > 1 and splitted[-1]: + # try to find a word form without hyphen + subcandidate = "".join([t for t in splitted if t not in HYPHENS]).lower() + if token[0].isupper(): + subcandidate = subcandidate.capitalize() + candidate = datadict.get(subcandidate) + if candidate: + return candidate + # decompose + last_candidate = self._simple_search(splitted[-1], datadict) + # search further + if last_candidate is None and greedy: + last_candidate = self._affix_search(splitted[-1], datadict) + # return + if last_candidate is not None: + splitted[-1] = last_candidate + return "".join(splitted) + return None + + def _affix_search( + self, wordform: str, datadict: Dict[str, str], maxlen: int = AFFIXLEN + ) -> Optional[str]: + for length in range(maxlen, 1, -1): + candidate, plan_b = self._decompose(wordform, datadict, affixlen=length) + if candidate is not None: + break + # exceptionally accept a longer solution + if candidate is None and plan_b is not None: + candidate = plan_b + return candidate + + def _prefix_search(self, token: str, lang: str, datadict: Dict[str, str]) -> Optional[str]: + # load prefixes + if lang == "de": + preflist = GERMAN_PREFIXES + elif lang == "ru": + preflist = RUSSIAN_PREFIXES + else: + return None + # apply + prefix = None + for p in preflist: + if token.startswith(p): + prefix = p + break + # decompose according to predefined prefix + if prefix is not None: + subword = self._simple_search(token[len(prefix) :], datadict) + if subword is not None: + if lang != "de" or token[len(prefix) : len(prefix) + 2] != "zu": + return prefix + subword.lower() + return None + + def _suffix_search(self, token: str, datadict: Dict[str, str]) -> Optional[str]: + lastcount = 0 + for count in range(MINCOMPLEN, len(token) - MINCOMPLEN + 1): + part = self._simple_search(token[-count:].capitalize(), datadict) + if part is not None and len(part) <= len(token[-count:]): + lastpart, lastcount = part, count + if lastcount > 0: + return token[:-lastcount] + lastpart.lower() + return None + + def _return_lemma( + self, + token: str, + datadict: Dict[str, str], + greedy: bool = True, + lang: Optional[str] = None, + initial: bool = False, + ) -> Optional[str]: + # filters + if token.isnumeric(): + return token + # dictionary search + candidate = self._simple_search(token, datadict, initial=initial) + # simple rules + if candidate is None and lang in RULES_LANGS: + candidate = apply_rules(token, lang, greedy) + # decomposition + if candidate is None: # and greedy is True + candidate = self._dehyphen(token, datadict, greedy) + else: + newcandidate = self._dehyphen(candidate, datadict, greedy) + if newcandidate is not None: + candidate = newcandidate + # stop here in some cases + # if not greedy: + # return candidate + limit = 6 if lang in SHORTER_GREEDY else 8 + if len(token) <= limit: + return candidate + # subword decomposition: predefined prefixes (absent from vocabulary if they are not words) + if candidate is None: + candidate = 
self._prefix_search(token, lang, datadict) # type: ignore[arg-type] + # unsupervised suffix/affix search: not productive for all languages + if candidate is None and (greedy or lang in AFFIX_LANGS): + # define parameters + maxlen = LONGAFFIXLEN if lang in LONGER_AFFIXES else AFFIXLEN + # greedier subword decomposition: suffix search with character in between + # then suffixes + candidate = self._affix_search( + token, datadict, maxlen + ) or self._suffix_search(token, datadict) + # greedy mode: try further hops, not always a good idea + if candidate is not None and greedy: + candidate = self._greedy_search(candidate, datadict) + return candidate def is_known( self, token: str, lang: Optional[Union[str, Tuple[str]]] = None @@ -256,7 +265,7 @@ def is_known( _control_input_type(token) _ = self.dictionaryCache.update_lang_data(lang) # ignore returned value return any( - _simple_search(token, language.dict) is not None + self._simple_search(token, language.dict) is not None for language in self.dictionaryCache.data ) @@ -280,7 +289,7 @@ def _lemmatize( # if greedy is None: # greedy = _define_greediness(language) # determine lemma - candidate = _return_lemma( + candidate = self._return_lemma( token, l.dict, greedy=greedy, lang=l.code, initial=initial ) if candidate is not None: diff --git a/simplemma/simplemma.py.orig b/simplemma/simplemma.py.orig deleted file mode 100644 index 0ed96a2..0000000 --- a/simplemma/simplemma.py.orig +++ /dev/null @@ -1,530 +0,0 @@ -"""Main module.""" - -import logging -import re - -from functools import lru_cache -from typing import Any, Dict, List, Iterator, Optional, Tuple, Union - -from .dictionaries import DictionaryCache -from .utils import levenshtein_dist - -try: - from .rules import apply_rules, GERMAN_PREFIXES, RULES_LANGS, RUSSIAN_PREFIXES - from .tokenizer import simple_tokenizer -# local error, also ModuleNotFoundError for Python >= 3.6 -except ImportError: # pragma: no cover - from rules import apply_rules, RULES_LANGS # type: ignore - from tokenizer import simple_tokenizer # type: ignore - - -cache = DictionaryCache() - -LOGGER = logging.getLogger(__name__) - -AFFIXLEN = 2 -LONGAFFIXLEN = 5 # better for some languages -MINCOMPLEN = 4 - -BETTER_LOWER = {"bg", "es", "hy", "lt", "lv", "pt", "sk"} -<<<<<<< HEAD -BUFFER_HACK = {"bg", "es", "et", "fi", "fr", "it", "lt", "pl", "sk"} # "da", "nl" - -# TODO: This custom behavior has to be simplified before it becomes unmaintainable -======= ->>>>>>> refactor: separate logic in multiple modules -LONGER_AFFIXES = {"et", "fi", "hu", "lt"} -SHORTER_GREEDY = {"bg", "et", "fi"} -AFFIX_LANGS = {"bg", "et", "fi", "hu", "lt", "lv", "nb", "pl", "ru", "sk", "tr"} - -HYPHEN_REGEX = re.compile(r"([_-])") -HYPHENS = {"-", "_"} -PUNCTUATION = {".", "?", "!", "…", "¿", "¡"} - -<<<<<<< HEAD -LANG_DATA = [] # type: List[LangDict] - -# class LangData: -# "Class to store word pairs and relevant information." -# __slots__ = ('dictionaries', 'languages') -# -# def __init__(self): -# self.languages = [] -# self.dictionaries = LangDict() - - -class LangDict: - "Class to store word pairs and relevant information for a single language." 
- __slots__ = ("code", "dict") - - def __init__(self, langcode: str, langdict: Dict[str, str]): - self.code: str = langcode - self.dict: Dict[str, str] = langdict - - -def _determine_path(listpath: str, langcode: str) -> str: - filename = f"{listpath}/{langcode}.txt" - return str(Path(__file__).parent / filename) - - -def _load_dict( - langcode: str, listpath: str = "lists", silent: bool = True -) -> Dict[str, str]: - filepath = _determine_path(listpath, langcode) - return _read_dict(filepath, langcode, silent) - - -def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]: - mydict, myadditions, i = {}, [], 0 # type: Dict[str, str], List[str], int - leftlimit = 1 if langcode in SAFE_LIMIT else 2 - # load data from list - with open(filepath, "r", encoding="utf-8") as filehandle: - for line in filehandle: - # skip potentially invalid lines - if line.startswith("-") or " " in line or re.search(r"[+_]", line): - continue - columns = line.strip().split("\t") - # invalid: remove noise - if ( - len(columns) != 2 - or len(columns[0]) < leftlimit - or ":" in columns[1] - # todo: exclude columns with punctuation! - ): - # or len(columns[1]) < 2: - if not silent: - LOGGER.warning("wrong format: %s", line.strip()) - continue - # too long - if langcode in VOC_LIMIT and ( - len(columns[0]) > MAXLENGTH or len(columns[1]) > MAXLENGTH - ): - continue - # length difference - if len(columns[0]) == 1 and len(columns[1]) > 6: - continue - if len(columns[0]) > 6 and len(columns[1]) == 1: - continue - # tackled by rules - if len(columns[1]) > 6: # columns[1] != columns[0] - rule = apply_rules(columns[1], langcode) - if rule == columns[0]: - continue - elif rule is not None and rule != columns[1]: - print(columns[1], columns[0], rule) - # process - if columns[1] in mydict and mydict[columns[1]] != columns[0]: - # prevent mistakes and noise coming from the lists - dist1, dist2 = _levenshtein_dist( - columns[1], mydict[columns[1]] - ), _levenshtein_dist(columns[1], columns[0]) - # fail-safe: delete potential false entry - # if dist1 >= len(columns[1]) and dist2 >= len(columns[1]): - # del mydict[columns[1]] - # continue - if dist1 == 0 or dist2 < dist1: # dist1 < 2 - mydict[columns[1]] = columns[0] - elif not silent: - LOGGER.warning( - "diverging: %s %s | %s %s", - columns[1], - mydict[columns[1]], - columns[1], - columns[0], - ) - LOGGER.debug("distances: %s %s", dist1, dist2) - else: - mydict[columns[1]] = columns[0] - # deal with verbal forms (mostly) - if langcode in BUFFER_HACK: - myadditions.append(columns[0]) - elif columns[0] not in mydict: - mydict[columns[0]] = columns[0] - i += 1 - # overwrite - for word in myadditions: - mydict[word] = word - LOGGER.debug("%s %s", langcode, i) - return dict(sorted(mydict.items())) - - -def _pickle_dict( - langcode: str, listpath: str = "lists", filepath: Optional[str] = None -) -> None: - mydict = _load_dict(langcode, listpath) - if filepath is None: - filename = f"data/{langcode}.plzma" - filepath = str(Path(__file__).parent / filename) - with lzma.open(filepath, "wb") as filehandle: # , filters=my_filters, preset=9 - pickle.dump(mydict, filehandle, protocol=4) - LOGGER.debug("%s %s", langcode, len(mydict)) - - -def _load_pickle(langcode: str) -> Dict[str, str]: - filename = f"data/{langcode}.plzma" - filepath = str(Path(__file__).parent / filename) - with lzma.open(filepath, "rb") as filehandle: - pickled_dict = pickle.load(filehandle) - assert isinstance(pickled_dict, dict) - return pickled_dict - - -def _control_lang(lang: Any) -> Tuple[str]: - 
"Make sure the lang variable is a valid tuple." - # convert string - if isinstance(lang, str): - lang = (lang,) - if not isinstance(lang, tuple): - raise TypeError("lang argument must be a two-letter language code") - return lang # type: ignore[return-value] - - -def _load_data(langs: Optional[Tuple[str]]) -> List[LangDict]: - """Decompress und unpickle lemmatization rules. - Takes one or several ISO 639-1 code language code as input. - Returns a list of dictionaries.""" - langlist = [] - assert isinstance(langs, tuple) - for lang in langs: - if lang not in LANGLIST: - LOGGER.error("language not supported: %s", lang) - continue - LOGGER.debug("loading %s", lang) - langlist.append(LangDict(lang, _load_pickle(lang))) - return langlist - - -def _update_lang_data(lang: Optional[Union[str, Tuple[str]]]) -> Tuple[str]: - # convert string - lang = _control_lang(lang) - # load corresponding data - global LANG_DATA - if not LANG_DATA or tuple(l.code for l in LANG_DATA) != lang: - LANG_DATA = _load_data(lang) - lemmatize.cache_clear() - return lang - - -@lru_cache(maxsize=65536) -def _levenshtein_dist(str1: str, str2: str) -> int: - # inspired by this noticeably faster code: - # https://gist.github.com/p-hash/9e0f9904ce7947c133308fbe48fe032b - if str1 == str2: - return 0 - if len(str1) > len(str2): - str1, str2 = str2, str1 - r1 = list(range(len(str2) + 1)) - r2 = [0] * len(r1) - for i, c1 in enumerate(str1): - r2[0] = i + 1 - for j, c2 in enumerate(str2): - if c1 == c2: - r2[j + 1] = r1[j] - else: - a1, a2, a3 = r2[j], r1[j], r1[j + 1] - if a1 > a2: - if a2 > a3: - r2[j + 1] = 1 + a3 - else: - r2[j + 1] = 1 + a2 - else: - if a1 > a3: - r2[j + 1] = 1 + a3 - else: - r2[j + 1] = 1 + a1 - aux = r1 - r1, r2 = r2, aux - return r1[-1] - -======= ->>>>>>> refactor: separate logic in multiple modules - -def _simple_search( - token: str, datadict: Dict[str, str], initial: bool = False -) -> Optional[str]: - # beginning of sentence, reverse case - if initial: - token = token.lower() - candidate = datadict.get(token) - if candidate is None: - # try upper or lowercase - if token[0].isupper(): - candidate = datadict.get(token.lower()) - else: - candidate = datadict.get(token.capitalize()) - return candidate - - -def _greedy_search( - candidate: str, datadict: Dict[str, str], steps: int = 1, distance: int = 5 -) -> str: - i = 0 - while candidate in datadict and ( - len(datadict[candidate]) < len(candidate) - and levenshtein_dist(datadict[candidate], candidate) <= distance - ): - candidate = datadict[candidate] - i += 1 - if i >= steps: - break - return candidate - - -def _decompose( - token: str, datadict: Dict[str, str], affixlen: int = 0 -) -> Tuple[Optional[str], Optional[str]]: - candidate, plan_b = None, None - # this only makes sense for languages written from left to right - # AFFIXLEN or MINCOMPLEN can spare time for some languages - for count in range(1, len(token) - MINCOMPLEN + 1): - part1, part2 = token[:-count], token[-count:] - # part1_aff = token[:-(count + affixlen)] - lempart1 = _simple_search(part1, datadict) - if lempart1 is not None: - # maybe an affix? 
discard it - if count <= affixlen: - candidate = lempart1 - break - # account for case before looking for second part - if token[0].isupper(): - part2 = part2.capitalize() - lempart2 = _simple_search(part2, datadict) - if lempart2 is not None: - # candidate must be shorter - # try original case, then substitute - if lempart2[0].isupper(): - substitute = part2.lower() - else: - substitute = part2.capitalize() - # try other case - greedy_candidate = _greedy_search(substitute, datadict) - # shorten the second known part of the token - if greedy_candidate and len(greedy_candidate) < len(part2): - candidate = part1 + greedy_candidate.lower() - # backup: equal length or further candidates accepted - if candidate is None: - # try without capitalizing - lower_candidate = _simple_search(part2, datadict) - if lower_candidate and len(lower_candidate) <= len(part2): - candidate = part1 + lower_candidate.lower() - # even greedier - # with capital letter? - elif len(lempart2) < len(part2) + affixlen: - plan_b = part1 + lempart2.lower() - # print(part1, part2, affixlen, count, newcandidate, planb) - # elif newcandidate and len(newcandidate) < len(part2) + affixlen: - # plan_b = part1 + newcandidate.lower() - # print(part1, part2, affixlen, count, newcandidate, planb) - # else: - # print(part1, part2, affixlen, count, newcandidate) - break - return candidate, plan_b - - -def _dehyphen(token: str, datadict: Dict[str, str], greedy: bool) -> Optional[str]: - splitted = HYPHEN_REGEX.split(token) - if len(splitted) > 1 and splitted[-1]: - # try to find a word form without hyphen - subcandidate = "".join([t for t in splitted if t not in HYPHENS]).lower() - if token[0].isupper(): - subcandidate = subcandidate.capitalize() - candidate = datadict.get(subcandidate) - if candidate: - return candidate - # decompose - last_candidate = _simple_search(splitted[-1], datadict) - # search further - if last_candidate is None and greedy: - last_candidate = _affix_search(splitted[-1], datadict) - # return - if last_candidate is not None: - splitted[-1] = last_candidate - return "".join(splitted) - return None - - -def _affix_search( - wordform: str, datadict: Dict[str, str], maxlen: int = AFFIXLEN -) -> Optional[str]: - for length in range(maxlen, 1, -1): - candidate, plan_b = _decompose(wordform, datadict, affixlen=length) - if candidate is not None: - break - # exceptionally accept a longer solution - if candidate is None and plan_b is not None: - candidate = plan_b - return candidate - - -def _prefix_search(token: str, lang: str, datadict: Dict[str, str]) -> Optional[str]: - # load prefixes - if lang == "de": - preflist = GERMAN_PREFIXES - elif lang == "ru": - preflist = RUSSIAN_PREFIXES - else: - return None - # apply - prefix = None - for p in preflist: - if token.startswith(p): - prefix = p - break - # decompose according to predefined prefix - if prefix is not None: - subword = _simple_search(token[len(prefix) :], datadict) - if subword is not None: - if lang != "de" or token[len(prefix) : len(prefix) + 2] != "zu": - return prefix + subword.lower() - return None - - -def _suffix_search(token: str, datadict: Dict[str, str]) -> Optional[str]: - lastcount = 0 - for count in range(MINCOMPLEN, len(token) - MINCOMPLEN + 1): - part = _simple_search(token[-count:].capitalize(), datadict) - if part is not None and len(part) <= len(token[-count:]): - lastpart, lastcount = part, count - if lastcount > 0: - return token[:-lastcount] + lastpart.lower() - return None - - -def _return_lemma( - token: str, - datadict: Dict[str, str], 
- greedy: bool = True, - lang: Optional[str] = None, - initial: bool = False, -) -> Optional[str]: - # filters - if token.isnumeric(): - return token - # dictionary search - candidate = _simple_search(token, datadict, initial=initial) - # simple rules - if candidate is None and lang in RULES_LANGS: - candidate = apply_rules(token, lang, greedy) - # decomposition - if candidate is None: # and greedy is True - candidate = _dehyphen(token, datadict, greedy) - else: - newcandidate = _dehyphen(candidate, datadict, greedy) - if newcandidate is not None: - candidate = newcandidate - # stop here in some cases - # if not greedy: - # return candidate - limit = 6 if lang in SHORTER_GREEDY else 8 - if len(token) <= limit: - return candidate - # subword decomposition: predefined prefixes (absent from vocabulary if they are not words) - if candidate is None: - candidate = _prefix_search(token, lang, datadict) # type: ignore[arg-type] - # unsupervised suffix/affix search: not productive for all languages - if candidate is None and (greedy or lang in AFFIX_LANGS): - # define parameters - maxlen = LONGAFFIXLEN if lang in LONGER_AFFIXES else AFFIXLEN - # greedier subword decomposition: suffix search with character in between - # then suffixes - candidate = _affix_search(token, datadict, maxlen) or _suffix_search( - token, datadict - ) - # greedy mode: try further hops, not always a good idea - if candidate is not None and greedy: - candidate = _greedy_search(candidate, datadict) - return candidate - - -def _control_input_type(token: Any) -> None: - "Make sure the input is a string of length > 0." - if not isinstance(token, str): - raise TypeError(f"Wrong input type, expected string, got {type(token)}") - if token == "": - raise ValueError("Wrong input type: empty string") - - -def is_known(token: str, lang: Optional[Union[str, Tuple[str]]] = None) -> bool: - """Tell if a token is present in one of the loaded dictionaries. - Case-insensitive, whole word forms only. Returns True or False.""" - _control_input_type(token) - _ = cache.update_lang_data(lang) # ignore returned value - return any( - _simple_search(token, language.dict) is not None for language in cache.data - ) - - -@lru_cache(maxsize=1048576) -def lemmatize( - token: str, - lang: Optional[Union[str, Tuple[str]]] = None, - greedy: bool = False, - silent: bool = True, - initial: bool = False, -) -> str: - """Try to reduce a token to its lemma form according to the - language list passed as input. - Returns a string. - Can raise ValueError by silent=False if no lemma has been found.""" - _control_input_type(token) - lang = cache.update_lang_data(lang) # use returned lang value - # start - for i, l in enumerate(cache.data, start=1): - # determine default greediness - # if greedy is None: - # greedy = _define_greediness(language) - # determine lemma - candidate = _return_lemma( - token, l.dict, greedy=greedy, lang=l.code, initial=initial - ) - if candidate is not None: - if i != 1: - LOGGER.debug("%s found in %s", token, l.code) - return candidate - if not silent: - raise ValueError(f"Token not found: {token}") - # try to simply lowercase # and len(token) < 10 ? - return token.lower() if lang[0] in BETTER_LOWER else token - - -def text_lemmatizer( - text: str, - lang: Optional[Union[str, Tuple[str]]] = None, - greedy: bool = False, - silent: bool = True, -) -> List[str]: - """Convenience function to lemmatize a text using a simple tokenizer. - Returns a list of tokens and lemmata.""" - lemmata = [] - last = "." 
# beginning is initial - for match in simple_tokenizer(text, iterate=True): - # lemmatize, simple heuristic for sentence boundary - lemmata.append( - lemmatize( - match[0], - lang=lang, - greedy=greedy, - silent=silent, - initial=last in PUNCTUATION, - ) - ) - last = match[0] - return lemmata - - -def lemma_iterator( - text: str, - lang: Optional[Union[str, Tuple[str]]] = None, - greedy: bool = False, - silent: bool = True, -) -> Iterator[str]: - """Convenience function to lemmatize a text using a simple tokenizer. - Returns a list of tokens and lemmata.""" - last = "." # beginning is initial - for match in simple_tokenizer(text, iterate=True): - # lemmatize - initial = last in PUNCTUATION - last = match[0] - yield lemmatize( - match[0], lang=lang, greedy=greedy, silent=silent, initial=initial - ) diff --git a/simplemma/utils.py b/simplemma/utils.py index 258886f..4ae7c68 100644 --- a/simplemma/utils.py +++ b/simplemma/utils.py @@ -1,7 +1,3 @@ -from functools import lru_cache - - -@lru_cache(maxsize=65536) def levenshtein_dist(str1: str, str2: str) -> int: # inspired by this noticeably faster code: # https://gist.github.com/p-hash/9e0f9904ce7947c133308fbe48fe032b diff --git a/tests/test_simplemma.py b/tests/test_simplemma.py index 1f4265a..70d1122 100644 --- a/tests/test_simplemma.py +++ b/tests/test_simplemma.py @@ -139,50 +139,52 @@ def test_logic(): assert lemmatizer.lemmatize(None, lang="en") is None with pytest.raises(ValueError): assert lemmatizer.lemmatize("", lang="en") is None - assert simplemma.simplemma._suffix_search("ccc",deDict) is None + assert lemmatizer._suffix_search("ccc",deDict) is None assert ( - simplemma.simplemma._return_lemma("Gender-Sternchens",deDict) + lemmatizer._return_lemma("Gender-Sternchens",deDict) == "Gendersternchen" ) assert ( - simplemma.simplemma._return_lemma("an-gespieltes",deDict) + lemmatizer._return_lemma("an-gespieltes",deDict) == "anspielen" ) assert ( - simplemma.simplemma._greedy_search( + lemmatizer._greedy_search( "getesteten",deDict, steps=0, distance=20 ) == "getestet" ) assert ( - simplemma.simplemma._greedy_search( + lemmatizer._greedy_search( "getesteten",deDict, steps=1, distance=20 ) == "getestet" ) assert ( - simplemma.simplemma._greedy_search( + lemmatizer._greedy_search( "getesteten",deDict, steps=2, distance=20 ) == "testen" ) assert ( - simplemma.simplemma._greedy_search( + lemmatizer._greedy_search( "getesteten",deDict, steps=2, distance=2 ) == "getestet" ) # prefixes - mydata = simplemma.simplemma._load_data(("de", "ru")) + dictionaryCache.update_lang_data(("de", "ru")) + deDict = dictionaryCache.data[0].dict + ruDict = dictionaryCache.data[1].dict assert ( - simplemma.simplemma._prefix_search("zerlemmatisiertes", "de", mydata[0].dict) + lemmatizer._prefix_search("zerlemmatisiertes", "de", deDict) == "zerlemmatisiert" ) assert ( - simplemma.simplemma._prefix_search("зафиксированные", "ru", mydata[1].dict) + lemmatizer._prefix_search("зафиксированные", "ru", ruDict) == "зафиксированный" ) @@ -257,23 +259,23 @@ def test_search(): dictionaryCache.update_lang_data(("en",)) enDict = dictionaryCache.data[0].dict lemmatizer = Lemmatizer(dictionaryCache) - assert simplemma.simplemma._simple_search("ignorant", enDict) == "ignorant" - assert simplemma.simplemma._simple_search("Ignorant", enDict) == "ignorant" + assert lemmatizer._simple_search("ignorant", enDict) == "ignorant" + assert lemmatizer._simple_search("Ignorant", enDict) == "ignorant" assert ( - simplemma.simplemma._dehyphen("magni-ficent", enDict, False) == "magnificent" + 
lemmatizer._dehyphen("magni-ficent", enDict, False) == "magnificent" ) - assert simplemma.simplemma._dehyphen("magni-ficents", enDict, False) is None - # assert simplemma.simplemma._greedy_search('Ignorance-Tests', enDict) == 'Ignorance-Test' + assert lemmatizer._dehyphen("magni-ficents", enDict, False) is None + # assert lemmatizer._greedy_search('Ignorance-Tests', enDict) == 'Ignorance-Test' # don't lemmatize numbers - assert simplemma.simplemma._return_lemma("01234", enDict) == "01234" + assert lemmatizer._return_lemma("01234", enDict) == "01234" # initial or not dictionaryCache.update_lang_data(("de",)) deDict = dictionaryCache.data[0].dict assert ( - simplemma.simplemma._simple_search("Dritte", deDict, initial=True) == "dritt" + lemmatizer._simple_search("Dritte", deDict, initial=True) == "dritt" ) assert ( - simplemma.simplemma._simple_search("Dritte", deDict, initial=False) + lemmatizer._simple_search("Dritte", deDict, initial=False) == "Dritter" ) @@ -346,7 +348,7 @@ def test_subwords(): == "PCR-Bestätigungstest" ) # assert ( - # lemmatize("standortübergreifend", lang="de", greedy=True) + # lemmatizer.lemmatize("standortübergreifend", lang="de", greedy=True) # == "standortübergreifend" # ) assert lemmatizer.lemmatize("obamamäßigsten", lang="de", greedy=True) == "obamamäßig" @@ -377,10 +379,10 @@ def test_subwords(): # assert lemmatizer.lemmatize("Bandmitgliedern", lang="de", greedy=True) == "Bandmitglied" # prefixes - assert lemmatize("lemmatisiertes", lang="de") == "lemmatisiert" - assert lemmatize("zerlemmatisiertes", lang="de") == "zerlemmatisiert" - assert lemmatize("фиксированные", lang="ru") == "фиксированный" - assert lemmatize("зафиксированные", lang="ru") == "зафиксированный" + assert lemmatizer.lemmatize("lemmatisiertes", lang="de") == "lemmatisiert" + assert lemmatizer.lemmatize("zerlemmatisiertes", lang="de") == "zerlemmatisiert" + assert lemmatizer.lemmatize("фиксированные", lang="ru") == "фиксированный" + assert lemmatizer.lemmatize("зафиксированные", lang="ru") == "зафиксированный" def test_tokenizer(): diff --git a/tests/udscore.py b/tests/udscore.py index edc5ce6..a154054 100644 --- a/tests/udscore.py +++ b/tests/udscore.py @@ -3,7 +3,7 @@ from collections import Counter from conllu import parse_incr -from simplemma import lemmatize +from simplemma import Lemmatizer data_files = [ @@ -73,10 +73,11 @@ else: initial = False - greedy_candidate = lemmatize( + lemmatizer = Lemmatizer() + greedy_candidate = lemmatizer.lemmatize( token["form"], lang=language, greedy=True, initial=initial ) - candidate = lemmatize( + candidate = lemmatizer.lemmatize( token["form"], lang=language, greedy=False, initial=initial )
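
Usage sketch (a minimal example, not taken from the patch itself): the new keyword arguments bound the two LRU caches that the Lemmatizer now owns. Importing DictionaryCache from simplemma.dictionaries is an assumption based on the package-internal ".dictionaries" imports above; both sizes default to 1048576 in the new signature.

    from simplemma import Lemmatizer
    from simplemma.dictionaries import DictionaryCache  # assumed public import path

    # Share one dictionary cache across components and size both LRU caches explicitly.
    dictionary_cache = DictionaryCache()
    lemmatizer = Lemmatizer(
        dictionary_cache,
        lemmatization_distance_cache_max_size=262144,  # wraps _lemmatize via lru_cache
        levenshtein_distance_cache_max_size=65536,     # wraps levenshtein_dist via lru_cache
    )

    print(lemmatizer.lemmatize("lemmatisiertes", lang="de"))  # "lemmatisiert", as asserted in test_subwords

Calling Lemmatizer() with no arguments still works and creates its own DictionaryCache, as tests/udscore.py does; smaller cache sizes trade repeated lookups for lower memory use.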