Skip to content

Commit

Permalink
refactor: allow user to configure cache sizes
Browse files Browse the repository at this point in the history
  • Loading branch information
juanjoDiaz committed Jan 18, 2023
1 parent c360b77 commit 5a2a29e
Show file tree
Hide file tree
Showing 7 changed files with 254 additions and 765 deletions.
8 changes: 7 additions & 1 deletion simplemma/dictionary_pickler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@
from typing import List, Dict, Optional

from .constants import LANGLIST
from .utils import levenshtein_dist
from .utils import levenshtein_dist as raw_levenshtein_dist

from functools import lru_cache

@lru_cache(maxsize=65536)
def levenshtein_dist(str1: str, str2: str) -> int:
    """Memoized edit-distance lookup.

    Thin caching shim over the uncached project implementation
    (``raw_levenshtein_dist``): up to 65536 recent
    ``(str1, str2) -> distance`` results are retained in an LRU cache,
    so repeated comparisons of the same string pair are answered
    without recomputation.
    """
    distance = raw_levenshtein_dist(str1, str2)
    return distance

try:
from .rules import apply_rules
Expand Down
11 changes: 8 additions & 3 deletions simplemma/langdetect.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from operator import itemgetter
from typing import List, Optional, Tuple

from .simplemma import _return_lemma
from .simplemma import Lemmatizer
from .dictionaries import DictionaryCache

SPLIT_INPUT = re.compile(r"[^\W\d_]{3,}")
Expand Down Expand Up @@ -41,6 +41,7 @@ def __init__(self, dictionaryCache: Optional[DictionaryCache] = None) -> None:
dictionaryCache = DictionaryCache()
assert isinstance(dictionaryCache, DictionaryCache)
self.dictionaryCache: DictionaryCache = dictionaryCache
self.lemmatizer = Lemmatizer(self.dictionaryCache)

def in_target_language(
self, text: str, lang: Optional[Tuple[str]] = None, sample_size: int = 1000
Expand All @@ -52,7 +53,9 @@ def in_target_language(
for token in prepare_text(text, sample_size):
total += 1
for l in self.dictionaryCache.data:
candidate = _return_lemma(token, l.dict, greedy=True, lang=l.code)
candidate = self.lemmatizer._return_lemma(
token, l.dict, greedy=True, lang=l.code
)
if candidate is not None:
in_target += 1
break
Expand All @@ -78,7 +81,9 @@ def lang_detector(
for l in self.dictionaryCache.data:
in_target = 0
for token in tokens:
candidate = _return_lemma(token, l.dict, greedy=extensive, lang=l.code)
candidate = self.lemmatizer._return_lemma(
token, l.dict, greedy=extensive, lang=l.code
)
if candidate is not None:
in_target += 1
# compute results
Expand Down
Loading

0 comments on commit 5a2a29e

Please sign in to comment.