Skip to content

Commit

Permalink
refactor: allow user to configure cache sizes
Browse files Browse the repository at this point in the history
  • Loading branch information
juanjoDiaz committed Jan 18, 2023
1 parent c360b77 commit 5a2a29e
Show file tree
Hide file tree
Showing 7 changed files with 254 additions and 765 deletions.
8 changes: 7 additions & 1 deletion simplemma/dictionary_pickler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@
from typing import List, Dict, Optional

from .constants import LANGLIST
from .utils import levenshtein_dist
from .utils import levenshtein_dist as raw_levenshtein_dist

from functools import lru_cache

@lru_cache(maxsize=65536)
def levenshtein_dist(str1: str, str2: str) -> int:
    """Memoized edit-distance lookup.

    Thin caching shim over the uncached project implementation
    (``raw_levenshtein_dist``): up to 65536 recent
    ``(str1, str2) -> distance`` results are retained in an LRU cache,
    so repeated comparisons of the same string pair are answered
    without recomputation.
    """
    distance = raw_levenshtein_dist(str1, str2)
    return distance

try:
from .rules import apply_rules
Expand Down
11 changes: 8 additions & 3 deletions simplemma/langdetect.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from operator import itemgetter
from typing import List, Optional, Tuple

from .simplemma import _return_lemma
from .simplemma import Lemmatizer
from .dictionaries import DictionaryCache

SPLIT_INPUT = re.compile(r"[^\W\d_]{3,}")
Expand Down Expand Up @@ -41,6 +41,7 @@ def __init__(self, dictionaryCache: Optional[DictionaryCache] = None) -> None:
dictionaryCache = DictionaryCache()
assert isinstance(dictionaryCache, DictionaryCache)
self.dictionaryCache: DictionaryCache = dictionaryCache
self.lemmatizer = Lemmatizer(self.dictionaryCache)

def in_target_language(
self, text: str, lang: Optional[Tuple[str]] = None, sample_size: int = 1000
Expand All @@ -52,7 +53,9 @@ def in_target_language(
for token in prepare_text(text, sample_size):
total += 1
for l in self.dictionaryCache.data:
candidate = _return_lemma(token, l.dict, greedy=True, lang=l.code)
candidate = self.lemmatizer._return_lemma(
token, l.dict, greedy=True, lang=l.code
)
if candidate is not None:
in_target += 1
break
Expand All @@ -78,7 +81,9 @@ def lang_detector(
for l in self.dictionaryCache.data:
in_target = 0
for token in tokens:
candidate = _return_lemma(token, l.dict, greedy=extensive, lang=l.code)
candidate = self.lemmatizer._return_lemma(
token, l.dict, greedy=extensive, lang=l.code
)
if candidate is not None:
in_target += 1
# compute results
Expand Down
Loading

0 comments on commit 5a2a29e

Please sign in to comment.