feat: rename method names, create Tokenizer class and add possibility to pass custom Tokenizer to Lemmatizer
juanjoDiaz committed Jan 19, 2023
1 parent ff6e82b commit 806c334
Showing 13 changed files with 696 additions and 607 deletions.
9 changes: 5 additions & 4 deletions simplemma/__init__.py
@@ -7,8 +7,9 @@
 __version__ = "0.9.0"


-from .langdetect import LaguageDetector
-from .simplemma import Lemmatizer
-from .tokenizer import simple_tokenizer
-from .dictionaries import DictionaryCache
+from .dictionary_factory import DictionaryFactory
+from .tokenizer import Tokenizer
+from .lemmatizer import Lemmatizer
+from .language_detector import LaguageDetector

 from .dictionary_pickler import *
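
For orientation, a minimal sketch of what downstream code would import from the package root after this change (names as re-exported above; anything beyond these names is not shown in the diff):

# Hypothetical downstream import of the renamed public API
from simplemma import DictionaryFactory, Tokenizer, Lemmatizer, LaguageDetector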
62 changes: 0 additions & 62 deletions simplemma/dictionaries.py

This file was deleted.

63 changes: 63 additions & 0 deletions simplemma/dictionary_factory.py
@@ -0,0 +1,63 @@
"""Parts related to dictonaries."""
import lzma
import logging
import pickle

from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from .constants import LANGLIST

LOGGER = logging.getLogger(__name__)


def _validate_and_normalize_langs(
    langs: Optional[Union[str, Tuple[str]]]
) -> Tuple[str]:
    "Make sure the lang variable is a valid tuple."
    # convert string
    if isinstance(langs, str):
        langs = (langs,)

    if not isinstance(langs, tuple):
        raise TypeError("lang argument must be a two-letter language code")

    valid_langs = []
    for lang in langs:
        if lang not in LANGLIST:
            LOGGER.error("language not supported: %s", lang)
        else:
            valid_langs.append(lang)
    return tuple(valid_langs)  # type: ignore[return-value]


def _load_dictionary_from_disk(langcode: str) -> Dict[str, str]:
    filename = f"data/{langcode}.plzma"
    filepath = str(Path(__file__).parent / filename)
    with lzma.open(filepath, "rb") as filehandle:
        pickled_dict = pickle.load(filehandle)
        assert isinstance(pickled_dict, dict)
        return pickled_dict


class DictionaryFactory:
    def __init__(self, cache_max_size: int = 1048576):
        self.data: Dict[str, dict] = {}
        self._load_dictionary_from_disk = lru_cache(maxsize=cache_max_size)(
            _load_dictionary_from_disk
        )

    def get_dictionaries(
        self, langs: Optional[Union[str, Tuple[str]]]
    ) -> Dict[str, dict]:
        langs = _validate_and_normalize_langs(langs)

        if self.data and tuple(sorted(self.data.keys())) == sorted(langs):
            return self.data

        self.data = {}
        for lang in langs:
            LOGGER.debug("loading %s", lang)
            self.data[lang] = self._load_dictionary_from_disk(lang)
        return self.data
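
A minimal usage sketch of the new DictionaryFactory (not taken from the diff; the language codes "de" and "en" are illustrative values):

from simplemma import DictionaryFactory

# The factory loads the bundled .plzma dictionaries and caches them via lru_cache.
factory = DictionaryFactory()
# get_dictionaries accepts a single code or a tuple of codes and returns {code: dict}.
dictionaries = factory.get_dictionaries(("de", "en"))
for lang_code, dictionary in dictionaries.items():
    print(lang_code, len(dictionary))  # each value is a plain form->lemma dict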
32 changes: 16 additions & 16 deletions simplemma/langdetect.py → simplemma/language_detector.py
@@ -6,8 +6,8 @@
 from operator import itemgetter
 from typing import List, Optional, Tuple

-from .simplemma import Lemmatizer
-from .dictionaries import DictionaryCache
+from .lemmatizer import Lemmatizer
+from .dictionary_factory import DictionaryFactory

 SPLIT_INPUT = re.compile(r"[^\W\d_]{3,}")

@@ -36,25 +36,25 @@ def _return_default() -> List[Tuple[str, float]]:


 class LaguageDetector:
-    def __init__(self, dictionaryCache: Optional[DictionaryCache] = None) -> None:
+    def __init__(self, dictionaryCache: Optional[DictionaryFactory] = None) -> None:
         if dictionaryCache == None:
-            dictionaryCache = DictionaryCache()
-        assert isinstance(dictionaryCache, DictionaryCache)
-        self.dictionaryCache: DictionaryCache = dictionaryCache
+            dictionaryCache = DictionaryFactory()
+        assert isinstance(dictionaryCache, DictionaryFactory)
+        self.dictionaryCache: DictionaryFactory = dictionaryCache
         self.lemmatizer = Lemmatizer(self.dictionaryCache)

-    def in_target_language(
+    def detect_coverage_of_languages(
         self, text: str, lang: Optional[Tuple[str]] = None, sample_size: int = 1000
     ) -> float:
         """Determine which proportion of the text is in the target language(s)."""
         total = 0
         in_target = 0
-        self.dictionaryCache.update_lang_data(lang)
+        dictionaries = self.dictionaryCache.get_dictionaries(lang)
         for token in prepare_text(text, sample_size):
             total += 1
-            for l in self.dictionaryCache.data:
+            for lang_code, dictionary in dictionaries.items():
                 candidate = self.lemmatizer._return_lemma(
-                    token, l.dict, greedy=True, lang=l.code
+                    token, dictionary, greedy=True, lang=lang_code
                 )
                 if candidate is not None:
                     in_target += 1
@@ -63,7 +63,7 @@ def in_target_language(
             return in_target / total
         return 0

-    def lang_detector(
+    def detect_languages(
         self,
         text: str,
         lang: Optional[Tuple[str]] = None,
@@ -77,18 +77,18 @@ def lang_detector(
         if total_tokens == 0:
             return _return_default()
         # iterate
-        self.dictionaryCache.update_lang_data(lang)
-        for l in self.dictionaryCache.data:
+        dictionaries = self.dictionaryCache.get_dictionaries(lang)
+        for lang_code, dictionary in dictionaries.items():
             in_target = 0
             for token in tokens:
                 candidate = self.lemmatizer._return_lemma(
-                    token, l.dict, greedy=extensive, lang=l.code
+                    token, dictionary, greedy=extensive, lang=lang_code
                 )
                 if candidate is not None:
                     in_target += 1
             # compute results
             found_ratio = in_target / total_tokens
-            myresults[l.code] = found_ratio
+            myresults[lang_code] = found_ratio
             unknown = 1 - found_ratio or 0.0
             if myresults.get("unk") is None or unknown < myresults["unk"]:
                 myresults["unk"] = unknown
@@ -97,7 +97,7 @@ def lang_detector(
         if len(results) > 1:
             # in case of ex-aequo
             if extensive is False and results[0][1] == results[1][1]:
-                results = self.lang_detector(text, lang=lang, extensive=True)
+                results = self.detect_languages(text, lang=lang, extensive=True)
             # fallback
             if len(results) > 1 and results[0][1] == results[1][1]:
                 return _return_default()
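
A small sketch of how the renamed detector methods would be called (not taken from the diff; sample text and language codes are illustrative):

from simplemma import LaguageDetector

detector = LaguageDetector()  # builds a default DictionaryFactory and Lemmatizer
# Share of tokens covered by the target language(s); formerly in_target_language()
coverage = detector.detect_coverage_of_languages("Hunde und Katzen", lang=("de",))
# Ranked (language, ratio) candidates including an "unk" share; formerly lang_detector()
ranking = detector.detect_languages("Hunde und Katzen", lang=("de", "en"))
print(coverage, ranking)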
81 changes: 36 additions & 45 deletions simplemma/simplemma.py → simplemma/lemmatizer.py
@@ -6,16 +6,16 @@
 from functools import lru_cache
 from typing import Any, Dict, List, Iterator, Optional, Tuple, Union

-from .dictionaries import DictionaryCache
+from .dictionary_factory import DictionaryFactory
 from .utils import levenshtein_dist

 try:
     from .rules import apply_rules, GERMAN_PREFIXES, RULES_LANGS, RUSSIAN_PREFIXES
-    from .tokenizer import simple_tokenizer
+    from .tokenizer import Tokenizer

 # local error, also ModuleNotFoundError for Python >= 3.6
 except ImportError:  # pragma: no cover
     from rules import apply_rules, RULES_LANGS  # type: ignore
-    from tokenizer import simple_tokenizer  # type: ignore
+    from tokenizer import Tokenizer  # type: ignore

 LOGGER = logging.getLogger(__name__)

@@ -51,7 +51,7 @@
 PUNCTUATION = {".", "?", "!", "…", "¿", "¡"}


-def _control_input_type(token: Any) -> None:
+def _validate_input_type(token: str) -> None:
     "Make sure the input is a string of length > 0."
     if not isinstance(token, str):
         raise TypeError(f"Wrong input type, expected string, got {type(token)}")
@@ -62,16 +62,17 @@ def _control_input_type(token: Any) -> None:
 class Lemmatizer:
     def __init__(
         self,
-        dictionaryCache: Optional[DictionaryCache] = None,
+        dictionaryCache: Optional[DictionaryFactory] = None,
         lemmatization_distance_cache_max_size=1048576,
         levenshtein_distance_cache_max_size=1048576,
     ) -> None:
         if dictionaryCache == None:
-            dictionaryCache = DictionaryCache()
-        assert isinstance(dictionaryCache, DictionaryCache)
-        self.dictionaryCache: DictionaryCache = dictionaryCache
-        self.lemmatize = lru_cache(maxsize=lemmatization_distance_cache_max_size)(
-            self._lemmatize
+            dictionaryCache = DictionaryFactory()
+        assert isinstance(dictionaryCache, DictionaryFactory)
+        self.dictionaryCache: DictionaryFactory = dictionaryCache
+        self.tokenizer = Tokenizer()
+        self.lemmatize_token = lru_cache(maxsize=lemmatization_distance_cache_max_size)(
+            self._lemmatize_token
         )
         self.levenshtein_dist = lru_cache(maxsize=levenshtein_distance_cache_max_size)(
             levenshtein_dist
@@ -275,19 +276,19 @@ def _return_lemma(
             candidate = self._greedy_search(candidate, datadict)
         return candidate

-    def is_known(
+    def is_token_known(
         self, token: str, lang: Optional[Union[str, Tuple[str]]] = None
     ) -> bool:
         """Tell if a token is present in one of the loaded dictionaries.
        Case-insensitive, whole word forms only. Returns True or False."""
-        _control_input_type(token)
-        _ = self.dictionaryCache.update_lang_data(lang)  # ignore returned value
+        _validate_input_type(token)
+        dictionaries = self.dictionaryCache.get_dictionaries(lang)
         return any(
-            self._simple_search(token, language.dict) is not None
-            for language in self.dictionaryCache.data
+            self._simple_search(token, dictionary) is not None
+            for dictionary in dictionaries.values()
         )

-    def _lemmatize(
+    def _lemmatize_token(
         self,
         token: str,
         lang: Optional[Union[str, Tuple[str]]] = None,
@@ -299,27 +300,29 @@ def _lemmatize(
        language list passed as input.
        Returns a string.
        Can raise ValueError by silent=False if no lemma has been found."""
-        _control_input_type(token)
-        lang = self.dictionaryCache.update_lang_data(lang)  # use returned lang value
+        _validate_input_type(token)
+        dictionaries = self.dictionaryCache.get_dictionaries(
+            lang
+        )  # use returned lang value
         # start
-        for i, l in enumerate(self.dictionaryCache.data, start=1):
+        for i, (lang_code, dictionary) in enumerate(dictionaries.items(), start=1):
             # determine default greediness
             # if greedy is None:
             #     greedy = _define_greediness(language)
             # determine lemma
             candidate = self._return_lemma(
-                token, l.dict, greedy=greedy, lang=l.code, initial=initial
+                token, dictionary, greedy=greedy, lang=lang_code, initial=initial
             )
             if candidate is not None:
                 if i != 1:
-                    LOGGER.debug("%s found in %s", token, l.code)
+                    LOGGER.debug("%s found in %s", token, lang_code)
                 return candidate
         if not silent:
             raise ValueError(f"Token not found: {token}")
         # try to simply lowercase # and len(token) < 10 ?
-        return token.lower() if lang[0] in BETTER_LOWER else token
+        return token.lower() if list(dictionaries.keys())[0] in BETTER_LOWER else token

-    def text_lemmatizer(
+    def lemmatize_text(
         self,
         text: str,
         lang: Optional[Union[str, Tuple[str]]] = None,
@@ -328,23 +331,9 @@ def text_lemmatizer(
     ) -> List[str]:
         """Convenience function to lemmatize a text using a simple tokenizer.
        Returns a list of tokens and lemmata."""
-        lemmata = []
-        last = "."  # beginning is initial
-        for match in simple_tokenizer(text, iterate=True):
-            # lemmatize, simple heuristic for sentence boundary
-            lemmata.append(
-                self.lemmatize(
-                    match[0],
-                    lang=lang,
-                    greedy=greedy,
-                    silent=silent,
-                    initial=last in PUNCTUATION,
-                )
-            )
-            last = match[0]
-        return lemmata
+        return list(self.lemmatize_text_iterator(text, lang, greedy, silent))

-    def lemma_iterator(
+    def lemmatize_text_iterator(
         self,
         text: str,
         lang: Optional[Union[str, Tuple[str]]] = None,
@@ -354,10 +343,12 @@ def lemma_iterator(
        """Convenience function to lemmatize a text using a simple tokenizer.
        Returns a list of tokens and lemmata."""
         last = "."  # beginning is initial
-        for match in simple_tokenizer(text, iterate=True):
-            # lemmatize
-            initial = last in PUNCTUATION
-            last = match[0]
-            yield self.lemmatize(
-                match[0], lang=lang, greedy=greedy, silent=silent, initial=initial
+        for match in self.tokenizer.simple_tokenizer(text, iterate=True):
+            yield self.lemmatize_token(
+                match[0],
+                lang=lang,
+                greedy=greedy,
+                silent=silent,
+                initial=last in PUNCTUATION,
             )
+            last = match[0]
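
A usage sketch of the renamed Lemmatizer methods (not taken from the diff; the custom-Tokenizer hook mentioned in the commit title is not visible in this excerpt, so the default Tokenizer is assumed and the sample words are illustrative):

from simplemma import Lemmatizer

lemmatizer = Lemmatizer()  # default DictionaryFactory and Tokenizer

# formerly lemmatize()
print(lemmatizer.lemmatize_token("Katzen", lang="de"))
# formerly text_lemmatizer() / lemma_iterator()
print(lemmatizer.lemmatize_text("Hunde und Katzen laufen.", lang="de"))
# formerly is_known()
print(lemmatizer.is_token_known("Hunde", lang="de"))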
