From 5a2a29e1a116cad77c587b7294f057fb39aefc5d Mon Sep 17 00:00:00 2001 From: Juanjo Diaz Date: Wed, 18 Jan 2023 17:33:36 +0100 Subject: [PATCH] refactor: allow user to configure cache sizes --- simplemma/dictionary_pickler.py | 8 +- simplemma/langdetect.py | 11 +- simplemma/simplemma.py | 411 +++++++++++++------------ simplemma/simplemma.py.orig | 530 -------------------------------- simplemma/utils.py | 4 - tests/test_simplemma.py | 48 +-- tests/udscore.py | 7 +- 7 files changed, 254 insertions(+), 765 deletions(-) delete mode 100644 simplemma/simplemma.py.orig diff --git a/simplemma/dictionary_pickler.py b/simplemma/dictionary_pickler.py index 396cbdf..6f19f97 100644 --- a/simplemma/dictionary_pickler.py +++ b/simplemma/dictionary_pickler.py @@ -8,7 +8,13 @@ from typing import List, Dict, Optional from .constants import LANGLIST -from .utils import levenshtein_dist +from .utils import levenshtein_dist as raw_levenshtein_dist + +from functools import lru_cache + +@lru_cache(maxsize=65536) +def levenshtein_dist(str1: str, str2: str) -> int: + return raw_levenshtein_dist(str1, str2) try: from .rules import apply_rules diff --git a/simplemma/langdetect.py b/simplemma/langdetect.py index 520d45c..1faaaa7 100644 --- a/simplemma/langdetect.py +++ b/simplemma/langdetect.py @@ -6,7 +6,7 @@ from operator import itemgetter from typing import List, Optional, Tuple -from .simplemma import _return_lemma +from .simplemma import Lemmatizer from .dictionaries import DictionaryCache SPLIT_INPUT = re.compile(r"[^\W\d_]{3,}") @@ -41,6 +41,7 @@ def __init__(self, dictionaryCache: Optional[DictionaryCache] = None) -> None: dictionaryCache = DictionaryCache() assert isinstance(dictionaryCache, DictionaryCache) self.dictionaryCache: DictionaryCache = dictionaryCache + self.lemmatizer = Lemmatizer(self.dictionaryCache) def in_target_language( self, text: str, lang: Optional[Tuple[str]] = None, sample_size: int = 1000 @@ -52,7 +53,9 @@ def in_target_language( for token in prepare_text(text, sample_size): total += 1 for l in self.dictionaryCache.data: - candidate = _return_lemma(token, l.dict, greedy=True, lang=l.code) + candidate = self.lemmatizer._return_lemma( + token, l.dict, greedy=True, lang=l.code + ) if candidate is not None: in_target += 1 break @@ -78,7 +81,9 @@ def lang_detector( for l in self.dictionaryCache.data: in_target = 0 for token in tokens: - candidate = _return_lemma(token, l.dict, greedy=extensive, lang=l.code) + candidate = self.lemmatizer._return_lemma( + token, l.dict, greedy=extensive, lang=l.code + ) if candidate is not None: in_target += 1 # compute results diff --git a/simplemma/simplemma.py b/simplemma/simplemma.py index 646816d..c75e656 100644 --- a/simplemma/simplemma.py +++ b/simplemma/simplemma.py @@ -35,203 +35,6 @@ PUNCTUATION = {".", "?", "!", "…", "¿", "¡"} -def _simple_search( - token: str, datadict: Dict[str, str], initial: bool = False -) -> Optional[str]: - # beginning of sentence, reverse case - if initial: - token = token.lower() - candidate = datadict.get(token) - if candidate is None: - # try upper or lowercase - if token[0].isupper(): - candidate = datadict.get(token.lower()) - else: - candidate = datadict.get(token.capitalize()) - return candidate - - -def _greedy_search( - candidate: str, datadict: Dict[str, str], steps: int = 1, distance: int = 5 -) -> str: - i = 0 - while candidate in datadict and ( - len(datadict[candidate]) < len(candidate) - and levenshtein_dist(datadict[candidate], candidate) <= distance - ): - candidate = datadict[candidate] - i += 1 - if i 
>= steps: - break - return candidate - - -def _decompose( - token: str, datadict: Dict[str, str], affixlen: int = 0 -) -> Tuple[Optional[str], Optional[str]]: - candidate, plan_b = None, None - # this only makes sense for languages written from left to right - # AFFIXLEN or MINCOMPLEN can spare time for some languages - for count in range(1, len(token) - MINCOMPLEN + 1): - part1, part2 = token[:-count], token[-count:] - # part1_aff = token[:-(count + affixlen)] - lempart1 = _simple_search(part1, datadict) - if lempart1 is not None: - # maybe an affix? discard it - if count <= affixlen: - candidate = lempart1 - break - # account for case before looking for second part - if token[0].isupper(): - part2 = part2.capitalize() - lempart2 = _simple_search(part2, datadict) - if lempart2 is not None: - # candidate must be shorter - # try original case, then substitute - if lempart2[0].isupper(): - substitute = part2.lower() - else: - substitute = part2.capitalize() - # try other case - greedy_candidate = _greedy_search(substitute, datadict) - # shorten the second known part of the token - if greedy_candidate and len(greedy_candidate) < len(part2): - candidate = part1 + greedy_candidate.lower() - # backup: equal length or further candidates accepted - if candidate is None: - # try without capitalizing - lower_candidate = _simple_search(part2, datadict) - if lower_candidate and len(lower_candidate) <= len(part2): - candidate = part1 + lower_candidate.lower() - # even greedier - # with capital letter? - elif len(lempart2) < len(part2) + affixlen: - plan_b = part1 + lempart2.lower() - # print(part1, part2, affixlen, count, newcandidate, planb) - # elif newcandidate and len(newcandidate) < len(part2) + affixlen: - # plan_b = part1 + newcandidate.lower() - # print(part1, part2, affixlen, count, newcandidate, planb) - # else: - # print(part1, part2, affixlen, count, newcandidate) - break - return candidate, plan_b - - -def _dehyphen(token: str, datadict: Dict[str, str], greedy: bool) -> Optional[str]: - splitted = HYPHEN_REGEX.split(token) - if len(splitted) > 1 and splitted[-1]: - # try to find a word form without hyphen - subcandidate = "".join([t for t in splitted if t not in HYPHENS]).lower() - if token[0].isupper(): - subcandidate = subcandidate.capitalize() - candidate = datadict.get(subcandidate) - if candidate: - return candidate - # decompose - last_candidate = _simple_search(splitted[-1], datadict) - # search further - if last_candidate is None and greedy: - last_candidate = _affix_search(splitted[-1], datadict) - # return - if last_candidate is not None: - splitted[-1] = last_candidate - return "".join(splitted) - return None - - -def _affix_search( - wordform: str, datadict: Dict[str, str], maxlen: int = AFFIXLEN -) -> Optional[str]: - for length in range(maxlen, 1, -1): - candidate, plan_b = _decompose(wordform, datadict, affixlen=length) - if candidate is not None: - break - # exceptionally accept a longer solution - if candidate is None and plan_b is not None: - candidate = plan_b - return candidate - - -def _prefix_search(token: str, lang: str, datadict: Dict[str, str]) -> Optional[str]: - # load prefixes - if lang == "de": - preflist = GERMAN_PREFIXES - elif lang == "ru": - preflist = RUSSIAN_PREFIXES - else: - return None - # apply - prefix = None - for p in preflist: - if token.startswith(p): - prefix = p - break - # decompose according to predefined prefix - if prefix is not None: - subword = _simple_search(token[len(prefix) :], datadict) - if subword is not None: - if lang != "de" or 
token[len(prefix) : len(prefix) + 2] != "zu": - return prefix + subword.lower() - return None - - -def _suffix_search(token: str, datadict: Dict[str, str]) -> Optional[str]: - lastcount = 0 - for count in range(MINCOMPLEN, len(token) - MINCOMPLEN + 1): - part = _simple_search(token[-count:].capitalize(), datadict) - if part is not None and len(part) <= len(token[-count:]): - lastpart, lastcount = part, count - if lastcount > 0: - return token[:-lastcount] + lastpart.lower() - return None - - -def _return_lemma( - token: str, - datadict: Dict[str, str], - greedy: bool = True, - lang: Optional[str] = None, - initial: bool = False, -) -> Optional[str]: - # filters - if token.isnumeric(): - return token - # dictionary search - candidate = _simple_search(token, datadict, initial=initial) - # simple rules - if candidate is None and lang in RULES_LANGS: - candidate = apply_rules(token, lang, greedy) - # decomposition - if candidate is None: # and greedy is True - candidate = _dehyphen(token, datadict, greedy) - else: - newcandidate = _dehyphen(candidate, datadict, greedy) - if newcandidate is not None: - candidate = newcandidate - # stop here in some cases - # if not greedy: - # return candidate - limit = 6 if lang in SHORTER_GREEDY else 8 - if len(token) <= limit: - return candidate - # subword decomposition: predefined prefixes (absent from vocabulary if they are not words) - if candidate is None: - candidate = _prefix_search(token, lang, datadict) # type: ignore[arg-type] - # unsupervised suffix/affix search: not productive for all languages - if candidate is None and (greedy or lang in AFFIX_LANGS): - # define parameters - maxlen = LONGAFFIXLEN if lang in LONGER_AFFIXES else AFFIXLEN - # greedier subword decomposition: suffix search with character in between - # then suffixes - candidate = _affix_search(token, datadict, maxlen) or _suffix_search( - token, datadict - ) - # greedy mode: try further hops, not always a good idea - if candidate is not None and greedy: - candidate = _greedy_search(candidate, datadict) - return candidate - - def _control_input_type(token: Any) -> None: "Make sure the input is a string of length > 0." 
if not isinstance(token, str): @@ -241,12 +44,218 @@ def _control_input_type(token: Any) -> None: class Lemmatizer: - def __init__(self, dictionaryCache: Optional[DictionaryCache] = None, lemmatizationCacheMaxSize = 1048576, ) -> None: + def __init__( + self, + dictionaryCache: Optional[DictionaryCache] = None, + lemmatization_distance_cache_max_size=1048576, + levenshtein_distance_cache_max_size=1048576, + ) -> None: if dictionaryCache == None: dictionaryCache = DictionaryCache() assert isinstance(dictionaryCache, DictionaryCache) self.dictionaryCache: DictionaryCache = dictionaryCache - self.lemmatize = lru_cache(maxsize=1048576)(self._lemmatize) + self.lemmatize = lru_cache(maxsize=lemmatization_distance_cache_max_size)( + self._lemmatize + ) + self.levenshtein_dist = lru_cache(maxsize=levenshtein_distance_cache_max_size)( + levenshtein_dist + ) + + def _simple_search( + self, token: str, datadict: Dict[str, str], initial: bool = False + ) -> Optional[str]: + # beginning of sentence, reverse case + if initial: + token = token.lower() + candidate = datadict.get(token) + if candidate is None: + # try upper or lowercase + if token[0].isupper(): + candidate = datadict.get(token.lower()) + else: + candidate = datadict.get(token.capitalize()) + return candidate + + def _greedy_search( + self, + candidate: str, + datadict: Dict[str, str], + steps: int = 1, + distance: int = 5, + ) -> str: + i = 0 + while candidate in datadict and ( + len(datadict[candidate]) < len(candidate) + and self.levenshtein_dist(datadict[candidate], candidate) <= distance + ): + candidate = datadict[candidate] + i += 1 + if i >= steps: + break + return candidate + + def _decompose( + self, token: str, datadict: Dict[str, str], affixlen: int = 0 + ) -> Tuple[Optional[str], Optional[str]]: + candidate, plan_b = None, None + # this only makes sense for languages written from left to right + # AFFIXLEN or MINCOMPLEN can spare time for some languages + for count in range(1, len(token) - MINCOMPLEN + 1): + part1, part2 = token[:-count], token[-count:] + # part1_aff = token[:-(count + affixlen)] + lempart1 = self._simple_search(part1, datadict) + if lempart1 is not None: + # maybe an affix? discard it + if count <= affixlen: + candidate = lempart1 + break + # account for case before looking for second part + if token[0].isupper(): + part2 = part2.capitalize() + lempart2 = self._simple_search(part2, datadict) + if lempart2 is not None: + # candidate must be shorter + # try original case, then substitute + if lempart2[0].isupper(): + substitute = part2.lower() + else: + substitute = part2.capitalize() + # try other case + greedy_candidate = self._greedy_search(substitute, datadict) + # shorten the second known part of the token + if greedy_candidate and len(greedy_candidate) < len(part2): + candidate = part1 + greedy_candidate.lower() + # backup: equal length or further candidates accepted + if candidate is None: + # try without capitalizing + lower_candidate = self._simple_search(part2, datadict) + if lower_candidate and len(lower_candidate) <= len(part2): + candidate = part1 + lower_candidate.lower() + # even greedier + # with capital letter? 
+ elif len(lempart2) < len(part2) + affixlen: + plan_b = part1 + lempart2.lower() + # print(part1, part2, affixlen, count, newcandidate, planb) + # elif newcandidate and len(newcandidate) < len(part2) + affixlen: + # plan_b = part1 + newcandidate.lower() + # print(part1, part2, affixlen, count, newcandidate, planb) + # else: + # print(part1, part2, affixlen, count, newcandidate) + break + return candidate, plan_b + + def _dehyphen( + self, token: str, datadict: Dict[str, str], greedy: bool + ) -> Optional[str]: + splitted = HYPHEN_REGEX.split(token) + if len(splitted) > 1 and splitted[-1]: + # try to find a word form without hyphen + subcandidate = "".join([t for t in splitted if t not in HYPHENS]).lower() + if token[0].isupper(): + subcandidate = subcandidate.capitalize() + candidate = datadict.get(subcandidate) + if candidate: + return candidate + # decompose + last_candidate = self._simple_search(splitted[-1], datadict) + # search further + if last_candidate is None and greedy: + last_candidate = self._affix_search(splitted[-1], datadict) + # return + if last_candidate is not None: + splitted[-1] = last_candidate + return "".join(splitted) + return None + + def _affix_search( + self, wordform: str, datadict: Dict[str, str], maxlen: int = AFFIXLEN + ) -> Optional[str]: + for length in range(maxlen, 1, -1): + candidate, plan_b = self._decompose(wordform, datadict, affixlen=length) + if candidate is not None: + break + # exceptionally accept a longer solution + if candidate is None and plan_b is not None: + candidate = plan_b + return candidate + + def _prefix_search(self, token: str, lang: str, datadict: Dict[str, str]) -> Optional[str]: + # load prefixes + if lang == "de": + preflist = GERMAN_PREFIXES + elif lang == "ru": + preflist = RUSSIAN_PREFIXES + else: + return None + # apply + prefix = None + for p in preflist: + if token.startswith(p): + prefix = p + break + # decompose according to predefined prefix + if prefix is not None: + subword = self._simple_search(token[len(prefix) :], datadict) + if subword is not None: + if lang != "de" or token[len(prefix) : len(prefix) + 2] != "zu": + return prefix + subword.lower() + return None + + def _suffix_search(self, token: str, datadict: Dict[str, str]) -> Optional[str]: + lastcount = 0 + for count in range(MINCOMPLEN, len(token) - MINCOMPLEN + 1): + part = self._simple_search(token[-count:].capitalize(), datadict) + if part is not None and len(part) <= len(token[-count:]): + lastpart, lastcount = part, count + if lastcount > 0: + return token[:-lastcount] + lastpart.lower() + return None + + def _return_lemma( + self, + token: str, + datadict: Dict[str, str], + greedy: bool = True, + lang: Optional[str] = None, + initial: bool = False, + ) -> Optional[str]: + # filters + if token.isnumeric(): + return token + # dictionary search + candidate = self._simple_search(token, datadict, initial=initial) + # simple rules + if candidate is None and lang in RULES_LANGS: + candidate = apply_rules(token, lang, greedy) + # decomposition + if candidate is None: # and greedy is True + candidate = self._dehyphen(token, datadict, greedy) + else: + newcandidate = self._dehyphen(candidate, datadict, greedy) + if newcandidate is not None: + candidate = newcandidate + # stop here in some cases + # if not greedy: + # return candidate + limit = 6 if lang in SHORTER_GREEDY else 8 + if len(token) <= limit: + return candidate + # subword decomposition: predefined prefixes (absent from vocabulary if they are not words) + if candidate is None: + candidate = 
self._prefix_search(token, lang, datadict) # type: ignore[arg-type] + # unsupervised suffix/affix search: not productive for all languages + if candidate is None and (greedy or lang in AFFIX_LANGS): + # define parameters + maxlen = LONGAFFIXLEN if lang in LONGER_AFFIXES else AFFIXLEN + # greedier subword decomposition: suffix search with character in between + # then suffixes + candidate = self._affix_search( + token, datadict, maxlen + ) or self._suffix_search(token, datadict) + # greedy mode: try further hops, not always a good idea + if candidate is not None and greedy: + candidate = self._greedy_search(candidate, datadict) + return candidate def is_known( self, token: str, lang: Optional[Union[str, Tuple[str]]] = None @@ -256,7 +265,7 @@ def is_known( _control_input_type(token) _ = self.dictionaryCache.update_lang_data(lang) # ignore returned value return any( - _simple_search(token, language.dict) is not None + self._simple_search(token, language.dict) is not None for language in self.dictionaryCache.data ) @@ -280,7 +289,7 @@ def _lemmatize( # if greedy is None: # greedy = _define_greediness(language) # determine lemma - candidate = _return_lemma( + candidate = self._return_lemma( token, l.dict, greedy=greedy, lang=l.code, initial=initial ) if candidate is not None: diff --git a/simplemma/simplemma.py.orig b/simplemma/simplemma.py.orig deleted file mode 100644 index 0ed96a2..0000000 --- a/simplemma/simplemma.py.orig +++ /dev/null @@ -1,530 +0,0 @@ -"""Main module.""" - -import logging -import re - -from functools import lru_cache -from typing import Any, Dict, List, Iterator, Optional, Tuple, Union - -from .dictionaries import DictionaryCache -from .utils import levenshtein_dist - -try: - from .rules import apply_rules, GERMAN_PREFIXES, RULES_LANGS, RUSSIAN_PREFIXES - from .tokenizer import simple_tokenizer -# local error, also ModuleNotFoundError for Python >= 3.6 -except ImportError: # pragma: no cover - from rules import apply_rules, RULES_LANGS # type: ignore - from tokenizer import simple_tokenizer # type: ignore - - -cache = DictionaryCache() - -LOGGER = logging.getLogger(__name__) - -AFFIXLEN = 2 -LONGAFFIXLEN = 5 # better for some languages -MINCOMPLEN = 4 - -BETTER_LOWER = {"bg", "es", "hy", "lt", "lv", "pt", "sk"} -<<<<<<< HEAD -BUFFER_HACK = {"bg", "es", "et", "fi", "fr", "it", "lt", "pl", "sk"} # "da", "nl" - -# TODO: This custom behavior has to be simplified before it becomes unmaintainable -======= ->>>>>>> refactor: separate logic in multiple modules -LONGER_AFFIXES = {"et", "fi", "hu", "lt"} -SHORTER_GREEDY = {"bg", "et", "fi"} -AFFIX_LANGS = {"bg", "et", "fi", "hu", "lt", "lv", "nb", "pl", "ru", "sk", "tr"} - -HYPHEN_REGEX = re.compile(r"([_-])") -HYPHENS = {"-", "_"} -PUNCTUATION = {".", "?", "!", "…", "¿", "¡"} - -<<<<<<< HEAD -LANG_DATA = [] # type: List[LangDict] - -# class LangData: -# "Class to store word pairs and relevant information." -# __slots__ = ('dictionaries', 'languages') -# -# def __init__(self): -# self.languages = [] -# self.dictionaries = LangDict() - - -class LangDict: - "Class to store word pairs and relevant information for a single language." 
- __slots__ = ("code", "dict") - - def __init__(self, langcode: str, langdict: Dict[str, str]): - self.code: str = langcode - self.dict: Dict[str, str] = langdict - - -def _determine_path(listpath: str, langcode: str) -> str: - filename = f"{listpath}/{langcode}.txt" - return str(Path(__file__).parent / filename) - - -def _load_dict( - langcode: str, listpath: str = "lists", silent: bool = True -) -> Dict[str, str]: - filepath = _determine_path(listpath, langcode) - return _read_dict(filepath, langcode, silent) - - -def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]: - mydict, myadditions, i = {}, [], 0 # type: Dict[str, str], List[str], int - leftlimit = 1 if langcode in SAFE_LIMIT else 2 - # load data from list - with open(filepath, "r", encoding="utf-8") as filehandle: - for line in filehandle: - # skip potentially invalid lines - if line.startswith("-") or " " in line or re.search(r"[+_]", line): - continue - columns = line.strip().split("\t") - # invalid: remove noise - if ( - len(columns) != 2 - or len(columns[0]) < leftlimit - or ":" in columns[1] - # todo: exclude columns with punctuation! - ): - # or len(columns[1]) < 2: - if not silent: - LOGGER.warning("wrong format: %s", line.strip()) - continue - # too long - if langcode in VOC_LIMIT and ( - len(columns[0]) > MAXLENGTH or len(columns[1]) > MAXLENGTH - ): - continue - # length difference - if len(columns[0]) == 1 and len(columns[1]) > 6: - continue - if len(columns[0]) > 6 and len(columns[1]) == 1: - continue - # tackled by rules - if len(columns[1]) > 6: # columns[1] != columns[0] - rule = apply_rules(columns[1], langcode) - if rule == columns[0]: - continue - elif rule is not None and rule != columns[1]: - print(columns[1], columns[0], rule) - # process - if columns[1] in mydict and mydict[columns[1]] != columns[0]: - # prevent mistakes and noise coming from the lists - dist1, dist2 = _levenshtein_dist( - columns[1], mydict[columns[1]] - ), _levenshtein_dist(columns[1], columns[0]) - # fail-safe: delete potential false entry - # if dist1 >= len(columns[1]) and dist2 >= len(columns[1]): - # del mydict[columns[1]] - # continue - if dist1 == 0 or dist2 < dist1: # dist1 < 2 - mydict[columns[1]] = columns[0] - elif not silent: - LOGGER.warning( - "diverging: %s %s | %s %s", - columns[1], - mydict[columns[1]], - columns[1], - columns[0], - ) - LOGGER.debug("distances: %s %s", dist1, dist2) - else: - mydict[columns[1]] = columns[0] - # deal with verbal forms (mostly) - if langcode in BUFFER_HACK: - myadditions.append(columns[0]) - elif columns[0] not in mydict: - mydict[columns[0]] = columns[0] - i += 1 - # overwrite - for word in myadditions: - mydict[word] = word - LOGGER.debug("%s %s", langcode, i) - return dict(sorted(mydict.items())) - - -def _pickle_dict( - langcode: str, listpath: str = "lists", filepath: Optional[str] = None -) -> None: - mydict = _load_dict(langcode, listpath) - if filepath is None: - filename = f"data/{langcode}.plzma" - filepath = str(Path(__file__).parent / filename) - with lzma.open(filepath, "wb") as filehandle: # , filters=my_filters, preset=9 - pickle.dump(mydict, filehandle, protocol=4) - LOGGER.debug("%s %s", langcode, len(mydict)) - - -def _load_pickle(langcode: str) -> Dict[str, str]: - filename = f"data/{langcode}.plzma" - filepath = str(Path(__file__).parent / filename) - with lzma.open(filepath, "rb") as filehandle: - pickled_dict = pickle.load(filehandle) - assert isinstance(pickled_dict, dict) - return pickled_dict - - -def _control_lang(lang: Any) -> Tuple[str]: - 
"Make sure the lang variable is a valid tuple." - # convert string - if isinstance(lang, str): - lang = (lang,) - if not isinstance(lang, tuple): - raise TypeError("lang argument must be a two-letter language code") - return lang # type: ignore[return-value] - - -def _load_data(langs: Optional[Tuple[str]]) -> List[LangDict]: - """Decompress und unpickle lemmatization rules. - Takes one or several ISO 639-1 code language code as input. - Returns a list of dictionaries.""" - langlist = [] - assert isinstance(langs, tuple) - for lang in langs: - if lang not in LANGLIST: - LOGGER.error("language not supported: %s", lang) - continue - LOGGER.debug("loading %s", lang) - langlist.append(LangDict(lang, _load_pickle(lang))) - return langlist - - -def _update_lang_data(lang: Optional[Union[str, Tuple[str]]]) -> Tuple[str]: - # convert string - lang = _control_lang(lang) - # load corresponding data - global LANG_DATA - if not LANG_DATA or tuple(l.code for l in LANG_DATA) != lang: - LANG_DATA = _load_data(lang) - lemmatize.cache_clear() - return lang - - -@lru_cache(maxsize=65536) -def _levenshtein_dist(str1: str, str2: str) -> int: - # inspired by this noticeably faster code: - # https://gist.github.com/p-hash/9e0f9904ce7947c133308fbe48fe032b - if str1 == str2: - return 0 - if len(str1) > len(str2): - str1, str2 = str2, str1 - r1 = list(range(len(str2) + 1)) - r2 = [0] * len(r1) - for i, c1 in enumerate(str1): - r2[0] = i + 1 - for j, c2 in enumerate(str2): - if c1 == c2: - r2[j + 1] = r1[j] - else: - a1, a2, a3 = r2[j], r1[j], r1[j + 1] - if a1 > a2: - if a2 > a3: - r2[j + 1] = 1 + a3 - else: - r2[j + 1] = 1 + a2 - else: - if a1 > a3: - r2[j + 1] = 1 + a3 - else: - r2[j + 1] = 1 + a1 - aux = r1 - r1, r2 = r2, aux - return r1[-1] - -======= ->>>>>>> refactor: separate logic in multiple modules - -def _simple_search( - token: str, datadict: Dict[str, str], initial: bool = False -) -> Optional[str]: - # beginning of sentence, reverse case - if initial: - token = token.lower() - candidate = datadict.get(token) - if candidate is None: - # try upper or lowercase - if token[0].isupper(): - candidate = datadict.get(token.lower()) - else: - candidate = datadict.get(token.capitalize()) - return candidate - - -def _greedy_search( - candidate: str, datadict: Dict[str, str], steps: int = 1, distance: int = 5 -) -> str: - i = 0 - while candidate in datadict and ( - len(datadict[candidate]) < len(candidate) - and levenshtein_dist(datadict[candidate], candidate) <= distance - ): - candidate = datadict[candidate] - i += 1 - if i >= steps: - break - return candidate - - -def _decompose( - token: str, datadict: Dict[str, str], affixlen: int = 0 -) -> Tuple[Optional[str], Optional[str]]: - candidate, plan_b = None, None - # this only makes sense for languages written from left to right - # AFFIXLEN or MINCOMPLEN can spare time for some languages - for count in range(1, len(token) - MINCOMPLEN + 1): - part1, part2 = token[:-count], token[-count:] - # part1_aff = token[:-(count + affixlen)] - lempart1 = _simple_search(part1, datadict) - if lempart1 is not None: - # maybe an affix? 
discard it - if count <= affixlen: - candidate = lempart1 - break - # account for case before looking for second part - if token[0].isupper(): - part2 = part2.capitalize() - lempart2 = _simple_search(part2, datadict) - if lempart2 is not None: - # candidate must be shorter - # try original case, then substitute - if lempart2[0].isupper(): - substitute = part2.lower() - else: - substitute = part2.capitalize() - # try other case - greedy_candidate = _greedy_search(substitute, datadict) - # shorten the second known part of the token - if greedy_candidate and len(greedy_candidate) < len(part2): - candidate = part1 + greedy_candidate.lower() - # backup: equal length or further candidates accepted - if candidate is None: - # try without capitalizing - lower_candidate = _simple_search(part2, datadict) - if lower_candidate and len(lower_candidate) <= len(part2): - candidate = part1 + lower_candidate.lower() - # even greedier - # with capital letter? - elif len(lempart2) < len(part2) + affixlen: - plan_b = part1 + lempart2.lower() - # print(part1, part2, affixlen, count, newcandidate, planb) - # elif newcandidate and len(newcandidate) < len(part2) + affixlen: - # plan_b = part1 + newcandidate.lower() - # print(part1, part2, affixlen, count, newcandidate, planb) - # else: - # print(part1, part2, affixlen, count, newcandidate) - break - return candidate, plan_b - - -def _dehyphen(token: str, datadict: Dict[str, str], greedy: bool) -> Optional[str]: - splitted = HYPHEN_REGEX.split(token) - if len(splitted) > 1 and splitted[-1]: - # try to find a word form without hyphen - subcandidate = "".join([t for t in splitted if t not in HYPHENS]).lower() - if token[0].isupper(): - subcandidate = subcandidate.capitalize() - candidate = datadict.get(subcandidate) - if candidate: - return candidate - # decompose - last_candidate = _simple_search(splitted[-1], datadict) - # search further - if last_candidate is None and greedy: - last_candidate = _affix_search(splitted[-1], datadict) - # return - if last_candidate is not None: - splitted[-1] = last_candidate - return "".join(splitted) - return None - - -def _affix_search( - wordform: str, datadict: Dict[str, str], maxlen: int = AFFIXLEN -) -> Optional[str]: - for length in range(maxlen, 1, -1): - candidate, plan_b = _decompose(wordform, datadict, affixlen=length) - if candidate is not None: - break - # exceptionally accept a longer solution - if candidate is None and plan_b is not None: - candidate = plan_b - return candidate - - -def _prefix_search(token: str, lang: str, datadict: Dict[str, str]) -> Optional[str]: - # load prefixes - if lang == "de": - preflist = GERMAN_PREFIXES - elif lang == "ru": - preflist = RUSSIAN_PREFIXES - else: - return None - # apply - prefix = None - for p in preflist: - if token.startswith(p): - prefix = p - break - # decompose according to predefined prefix - if prefix is not None: - subword = _simple_search(token[len(prefix) :], datadict) - if subword is not None: - if lang != "de" or token[len(prefix) : len(prefix) + 2] != "zu": - return prefix + subword.lower() - return None - - -def _suffix_search(token: str, datadict: Dict[str, str]) -> Optional[str]: - lastcount = 0 - for count in range(MINCOMPLEN, len(token) - MINCOMPLEN + 1): - part = _simple_search(token[-count:].capitalize(), datadict) - if part is not None and len(part) <= len(token[-count:]): - lastpart, lastcount = part, count - if lastcount > 0: - return token[:-lastcount] + lastpart.lower() - return None - - -def _return_lemma( - token: str, - datadict: Dict[str, str], 
- greedy: bool = True, - lang: Optional[str] = None, - initial: bool = False, -) -> Optional[str]: - # filters - if token.isnumeric(): - return token - # dictionary search - candidate = _simple_search(token, datadict, initial=initial) - # simple rules - if candidate is None and lang in RULES_LANGS: - candidate = apply_rules(token, lang, greedy) - # decomposition - if candidate is None: # and greedy is True - candidate = _dehyphen(token, datadict, greedy) - else: - newcandidate = _dehyphen(candidate, datadict, greedy) - if newcandidate is not None: - candidate = newcandidate - # stop here in some cases - # if not greedy: - # return candidate - limit = 6 if lang in SHORTER_GREEDY else 8 - if len(token) <= limit: - return candidate - # subword decomposition: predefined prefixes (absent from vocabulary if they are not words) - if candidate is None: - candidate = _prefix_search(token, lang, datadict) # type: ignore[arg-type] - # unsupervised suffix/affix search: not productive for all languages - if candidate is None and (greedy or lang in AFFIX_LANGS): - # define parameters - maxlen = LONGAFFIXLEN if lang in LONGER_AFFIXES else AFFIXLEN - # greedier subword decomposition: suffix search with character in between - # then suffixes - candidate = _affix_search(token, datadict, maxlen) or _suffix_search( - token, datadict - ) - # greedy mode: try further hops, not always a good idea - if candidate is not None and greedy: - candidate = _greedy_search(candidate, datadict) - return candidate - - -def _control_input_type(token: Any) -> None: - "Make sure the input is a string of length > 0." - if not isinstance(token, str): - raise TypeError(f"Wrong input type, expected string, got {type(token)}") - if token == "": - raise ValueError("Wrong input type: empty string") - - -def is_known(token: str, lang: Optional[Union[str, Tuple[str]]] = None) -> bool: - """Tell if a token is present in one of the loaded dictionaries. - Case-insensitive, whole word forms only. Returns True or False.""" - _control_input_type(token) - _ = cache.update_lang_data(lang) # ignore returned value - return any( - _simple_search(token, language.dict) is not None for language in cache.data - ) - - -@lru_cache(maxsize=1048576) -def lemmatize( - token: str, - lang: Optional[Union[str, Tuple[str]]] = None, - greedy: bool = False, - silent: bool = True, - initial: bool = False, -) -> str: - """Try to reduce a token to its lemma form according to the - language list passed as input. - Returns a string. - Can raise ValueError by silent=False if no lemma has been found.""" - _control_input_type(token) - lang = cache.update_lang_data(lang) # use returned lang value - # start - for i, l in enumerate(cache.data, start=1): - # determine default greediness - # if greedy is None: - # greedy = _define_greediness(language) - # determine lemma - candidate = _return_lemma( - token, l.dict, greedy=greedy, lang=l.code, initial=initial - ) - if candidate is not None: - if i != 1: - LOGGER.debug("%s found in %s", token, l.code) - return candidate - if not silent: - raise ValueError(f"Token not found: {token}") - # try to simply lowercase # and len(token) < 10 ? - return token.lower() if lang[0] in BETTER_LOWER else token - - -def text_lemmatizer( - text: str, - lang: Optional[Union[str, Tuple[str]]] = None, - greedy: bool = False, - silent: bool = True, -) -> List[str]: - """Convenience function to lemmatize a text using a simple tokenizer. - Returns a list of tokens and lemmata.""" - lemmata = [] - last = "." 
# beginning is initial - for match in simple_tokenizer(text, iterate=True): - # lemmatize, simple heuristic for sentence boundary - lemmata.append( - lemmatize( - match[0], - lang=lang, - greedy=greedy, - silent=silent, - initial=last in PUNCTUATION, - ) - ) - last = match[0] - return lemmata - - -def lemma_iterator( - text: str, - lang: Optional[Union[str, Tuple[str]]] = None, - greedy: bool = False, - silent: bool = True, -) -> Iterator[str]: - """Convenience function to lemmatize a text using a simple tokenizer. - Returns a list of tokens and lemmata.""" - last = "." # beginning is initial - for match in simple_tokenizer(text, iterate=True): - # lemmatize - initial = last in PUNCTUATION - last = match[0] - yield lemmatize( - match[0], lang=lang, greedy=greedy, silent=silent, initial=initial - ) diff --git a/simplemma/utils.py b/simplemma/utils.py index 258886f..4ae7c68 100644 --- a/simplemma/utils.py +++ b/simplemma/utils.py @@ -1,7 +1,3 @@ -from functools import lru_cache - - -@lru_cache(maxsize=65536) def levenshtein_dist(str1: str, str2: str) -> int: # inspired by this noticeably faster code: # https://gist.github.com/p-hash/9e0f9904ce7947c133308fbe48fe032b diff --git a/tests/test_simplemma.py b/tests/test_simplemma.py index 1f4265a..70d1122 100644 --- a/tests/test_simplemma.py +++ b/tests/test_simplemma.py @@ -139,50 +139,52 @@ def test_logic(): assert lemmatizer.lemmatize(None, lang="en") is None with pytest.raises(ValueError): assert lemmatizer.lemmatize("", lang="en") is None - assert simplemma.simplemma._suffix_search("ccc",deDict) is None + assert lemmatizer._suffix_search("ccc",deDict) is None assert ( - simplemma.simplemma._return_lemma("Gender-Sternchens",deDict) + lemmatizer._return_lemma("Gender-Sternchens",deDict) == "Gendersternchen" ) assert ( - simplemma.simplemma._return_lemma("an-gespieltes",deDict) + lemmatizer._return_lemma("an-gespieltes",deDict) == "anspielen" ) assert ( - simplemma.simplemma._greedy_search( + lemmatizer._greedy_search( "getesteten",deDict, steps=0, distance=20 ) == "getestet" ) assert ( - simplemma.simplemma._greedy_search( + lemmatizer._greedy_search( "getesteten",deDict, steps=1, distance=20 ) == "getestet" ) assert ( - simplemma.simplemma._greedy_search( + lemmatizer._greedy_search( "getesteten",deDict, steps=2, distance=20 ) == "testen" ) assert ( - simplemma.simplemma._greedy_search( + lemmatizer._greedy_search( "getesteten",deDict, steps=2, distance=2 ) == "getestet" ) # prefixes - mydata = simplemma.simplemma._load_data(("de", "ru")) + dictionaryCache.update_lang_data(("de", "ru")) + deDict = dictionaryCache.data[0].dict + ruDict = dictionaryCache.data[1].dict assert ( - simplemma.simplemma._prefix_search("zerlemmatisiertes", "de", mydata[0].dict) + lemmatizer._prefix_search("zerlemmatisiertes", "de", deDict) == "zerlemmatisiert" ) assert ( - simplemma.simplemma._prefix_search("зафиксированные", "ru", mydata[1].dict) + lemmatizer._prefix_search("зафиксированные", "ru", ruDict) == "зафиксированный" ) @@ -257,23 +259,23 @@ def test_search(): dictionaryCache.update_lang_data(("en",)) enDict = dictionaryCache.data[0].dict lemmatizer = Lemmatizer(dictionaryCache) - assert simplemma.simplemma._simple_search("ignorant", enDict) == "ignorant" - assert simplemma.simplemma._simple_search("Ignorant", enDict) == "ignorant" + assert lemmatizer._simple_search("ignorant", enDict) == "ignorant" + assert lemmatizer._simple_search("Ignorant", enDict) == "ignorant" assert ( - simplemma.simplemma._dehyphen("magni-ficent", enDict, False) == "magnificent" + 
lemmatizer._dehyphen("magni-ficent", enDict, False) == "magnificent" ) - assert simplemma.simplemma._dehyphen("magni-ficents", enDict, False) is None - # assert simplemma.simplemma._greedy_search('Ignorance-Tests', enDict) == 'Ignorance-Test' + assert lemmatizer._dehyphen("magni-ficents", enDict, False) is None + # assert lemmatizer._greedy_search('Ignorance-Tests', enDict) == 'Ignorance-Test' # don't lemmatize numbers - assert simplemma.simplemma._return_lemma("01234", enDict) == "01234" + assert lemmatizer._return_lemma("01234", enDict) == "01234" # initial or not dictionaryCache.update_lang_data(("de",)) deDict = dictionaryCache.data[0].dict assert ( - simplemma.simplemma._simple_search("Dritte", deDict, initial=True) == "dritt" + lemmatizer._simple_search("Dritte", deDict, initial=True) == "dritt" ) assert ( - simplemma.simplemma._simple_search("Dritte", deDict, initial=False) + lemmatizer._simple_search("Dritte", deDict, initial=False) == "Dritter" ) @@ -346,7 +348,7 @@ def test_subwords(): == "PCR-Bestätigungstest" ) # assert ( - # lemmatize("standortübergreifend", lang="de", greedy=True) + # lemmatizer.lemmatize("standortübergreifend", lang="de", greedy=True) # == "standortübergreifend" # ) assert lemmatizer.lemmatize("obamamäßigsten", lang="de", greedy=True) == "obamamäßig" @@ -377,10 +379,10 @@ def test_subwords(): # assert lemmatizer.lemmatize("Bandmitgliedern", lang="de", greedy=True) == "Bandmitglied" # prefixes - assert lemmatize("lemmatisiertes", lang="de") == "lemmatisiert" - assert lemmatize("zerlemmatisiertes", lang="de") == "zerlemmatisiert" - assert lemmatize("фиксированные", lang="ru") == "фиксированный" - assert lemmatize("зафиксированные", lang="ru") == "зафиксированный" + assert lemmatizer.lemmatize("lemmatisiertes", lang="de") == "lemmatisiert" + assert lemmatizer.lemmatize("zerlemmatisiertes", lang="de") == "zerlemmatisiert" + assert lemmatizer.lemmatize("фиксированные", lang="ru") == "фиксированный" + assert lemmatizer.lemmatize("зафиксированные", lang="ru") == "зафиксированный" def test_tokenizer(): diff --git a/tests/udscore.py b/tests/udscore.py index edc5ce6..a154054 100644 --- a/tests/udscore.py +++ b/tests/udscore.py @@ -3,7 +3,7 @@ from collections import Counter from conllu import parse_incr -from simplemma import lemmatize +from simplemma import Lemmatizer data_files = [ @@ -73,10 +73,11 @@ else: initial = False - greedy_candidate = lemmatize( + lemmatizer = Lemmatizer() + greedy_candidate = lemmatizer.lemmatize( token["form"], lang=language, greedy=True, initial=initial ) - candidate = lemmatize( + candidate = lemmatizer.lemmatize( token["form"], lang=language, greedy=False, initial=initial )
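
Usage sketch (a minimal example, not taken from the patch itself): the new keyword arguments bound the two LRU caches that the Lemmatizer now owns. Importing DictionaryCache from simplemma.dictionaries is an assumption based on the package-internal ".dictionaries" imports above; both sizes default to 1048576 in the new signature.

    from simplemma import Lemmatizer
    from simplemma.dictionaries import DictionaryCache  # assumed public import path

    # Share one dictionary cache across components and size both LRU caches explicitly.
    dictionary_cache = DictionaryCache()
    lemmatizer = Lemmatizer(
        dictionary_cache,
        lemmatization_distance_cache_max_size=262144,  # wraps _lemmatize via lru_cache
        levenshtein_distance_cache_max_size=65536,     # wraps levenshtein_dist via lru_cache
    )

    print(lemmatizer.lemmatize("lemmatisiertes", lang="de"))  # "lemmatisiert", as asserted in test_subwords

Calling Lemmatizer() with no arguments still works and creates its own DictionaryCache, as tests/udscore.py does; smaller cache sizes trade repeated lookups for lower memory use.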