feat: rename method names, create Tokenizer class and add possibility to pass custom Tokenizer to Lemmatizer
juanjoDiaz committed Jan 19, 2023
1 parent ff6e82b commit 806c334
Showing 13 changed files with 696 additions and 607 deletions.
9 changes: 5 additions & 4 deletions simplemma/__init__.py
@@ -7,8 +7,9 @@
 __version__ = "0.9.0"


-from .langdetect import LaguageDetector
-from .simplemma import Lemmatizer
-from .tokenizer import simple_tokenizer
-from .dictionaries import DictionaryCache
+from .dictionary_factory import DictionaryFactory
+from .tokenizer import Tokenizer
+from .lemmatizer import Lemmatizer
+from .language_detector import LaguageDetector

 from .dictionary_pickler import *
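
For orientation, a minimal sketch of what downstream code would import from the package root after this change (names as re-exported above; anything beyond these names is not shown in the diff):

# Hypothetical downstream import of the renamed public API
from simplemma import DictionaryFactory, Tokenizer, Lemmatizer, LaguageDetector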
62 changes: 0 additions & 62 deletions simplemma/dictionaries.py

This file was deleted.

63 changes: 63 additions & 0 deletions simplemma/dictionary_factory.py
@@ -0,0 +1,63 @@
"""Parts related to dictonaries."""
import lzma
import logging
import pickle

from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from .constants import LANGLIST

LOGGER = logging.getLogger(__name__)


def _validate_and_normalize_langs(
    langs: Optional[Union[str, Tuple[str]]]
) -> Tuple[str]:
    "Make sure the lang variable is a valid tuple."
    # convert string
    if isinstance(langs, str):
        langs = (langs,)

    if not isinstance(langs, tuple):
        raise TypeError("lang argument must be a two-letter language code")

    valid_langs = []
    for lang in langs:
        if lang not in LANGLIST:
            LOGGER.error("language not supported: %s", lang)
        else:
            valid_langs.append(lang)
    return tuple(valid_langs)  # type: ignore[return-value]


def _load_dictionary_from_disk(langcode: str) -> Dict[str, str]:
    filename = f"data/{langcode}.plzma"
    filepath = str(Path(__file__).parent / filename)
    with lzma.open(filepath, "rb") as filehandle:
        pickled_dict = pickle.load(filehandle)
        assert isinstance(pickled_dict, dict)
        return pickled_dict


class DictionaryFactory:
    def __init__(self, cache_max_size: int = 1048576):
        self.data: Dict[str, dict] = {}
        self._load_dictionary_from_disk = lru_cache(maxsize=cache_max_size)(
            _load_dictionary_from_disk
        )

    def get_dictionaries(
        self, langs: Optional[Union[str, Tuple[str]]]
    ) -> Dict[str, dict]:
        langs = _validate_and_normalize_langs(langs)

        if self.data and tuple(sorted(self.data.keys())) == sorted(langs):
            return self.data

        self.data = {}
        for lang in langs:
            LOGGER.debug("loading %s", lang)
            self.data[lang] = self._load_dictionary_from_disk(lang)
        return self.data
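
A minimal usage sketch of the new DictionaryFactory (not taken from the diff; the language codes "de" and "en" are illustrative values):

from simplemma import DictionaryFactory

# The factory loads the bundled .plzma dictionaries and caches them via lru_cache.
factory = DictionaryFactory()
# get_dictionaries accepts a single code or a tuple of codes and returns {code: dict}.
dictionaries = factory.get_dictionaries(("de", "en"))
for lang_code, dictionary in dictionaries.items():
    print(lang_code, len(dictionary))  # each value is a plain form->lemma dict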
32 changes: 16 additions & 16 deletions simplemma/langdetect.py → simplemma/language_detector.py
@@ -6,8 +6,8 @@
 from operator import itemgetter
 from typing import List, Optional, Tuple

-from .simplemma import Lemmatizer
-from .dictionaries import DictionaryCache
+from .lemmatizer import Lemmatizer
+from .dictionary_factory import DictionaryFactory

 SPLIT_INPUT = re.compile(r"[^\W\d_]{3,}")

@@ -36,25 +36,25 @@ def _return_default() -> List[Tuple[str, float]]:


 class LaguageDetector:
-    def __init__(self, dictionaryCache: Optional[DictionaryCache] = None) -> None:
+    def __init__(self, dictionaryCache: Optional[DictionaryFactory] = None) -> None:
         if dictionaryCache == None:
-            dictionaryCache = DictionaryCache()
-        assert isinstance(dictionaryCache, DictionaryCache)
-        self.dictionaryCache: DictionaryCache = dictionaryCache
+            dictionaryCache = DictionaryFactory()
+        assert isinstance(dictionaryCache, DictionaryFactory)
+        self.dictionaryCache: DictionaryFactory = dictionaryCache
         self.lemmatizer = Lemmatizer(self.dictionaryCache)

-    def in_target_language(
+    def detect_coverage_of_languages(
         self, text: str, lang: Optional[Tuple[str]] = None, sample_size: int = 1000
     ) -> float:
         """Determine which proportion of the text is in the target language(s)."""
         total = 0
         in_target = 0
-        self.dictionaryCache.update_lang_data(lang)
+        dictionaries = self.dictionaryCache.get_dictionaries(lang)
         for token in prepare_text(text, sample_size):
             total += 1
-            for l in self.dictionaryCache.data:
+            for lang_code, dictionary in dictionaries.items():
                 candidate = self.lemmatizer._return_lemma(
-                    token, l.dict, greedy=True, lang=l.code
+                    token, dictionary, greedy=True, lang=lang_code
                 )
                 if candidate is not None:
                     in_target += 1
@@ -63,7 +63,7 @@ def in_target_language(
             return in_target / total
         return 0

-    def lang_detector(
+    def detect_languages(
         self,
         text: str,
         lang: Optional[Tuple[str]] = None,
@@ -77,18 +77,18 @@ def lang_detector(
         if total_tokens == 0:
             return _return_default()
         # iterate
-        self.dictionaryCache.update_lang_data(lang)
-        for l in self.dictionaryCache.data:
+        dictionaries = self.dictionaryCache.get_dictionaries(lang)
+        for lang_code, dictionary in dictionaries.items():
             in_target = 0
             for token in tokens:
                 candidate = self.lemmatizer._return_lemma(
-                    token, l.dict, greedy=extensive, lang=l.code
+                    token, dictionary, greedy=extensive, lang=lang_code
                 )
                 if candidate is not None:
                     in_target += 1
             # compute results
             found_ratio = in_target / total_tokens
-            myresults[l.code] = found_ratio
+            myresults[lang_code] = found_ratio
             unknown = 1 - found_ratio or 0.0
             if myresults.get("unk") is None or unknown < myresults["unk"]:
                 myresults["unk"] = unknown
@@ -97,7 +97,7 @@ def lang_detector(
         if len(results) > 1:
             # in case of ex-aequo
             if extensive is False and results[0][1] == results[1][1]:
-                results = self.lang_detector(text, lang=lang, extensive=True)
+                results = self.detect_languages(text, lang=lang, extensive=True)
             # fallback
             if len(results) > 1 and results[0][1] == results[1][1]:
                 return _return_default()
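
A small sketch of how the renamed detector methods would be called (not taken from the diff; sample text and language codes are illustrative):

from simplemma import LaguageDetector

detector = LaguageDetector()  # builds a default DictionaryFactory and Lemmatizer
# Share of tokens covered by the target language(s); formerly in_target_language()
coverage = detector.detect_coverage_of_languages("Hunde und Katzen", lang=("de",))
# Ranked (language, ratio) candidates including an "unk" share; formerly lang_detector()
ranking = detector.detect_languages("Hunde und Katzen", lang=("de", "en"))
print(coverage, ranking)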
81 changes: 36 additions & 45 deletions simplemma/simplemma.py → simplemma/lemmatizer.py
@@ -6,16 +6,16 @@
 from functools import lru_cache
 from typing import Any, Dict, List, Iterator, Optional, Tuple, Union

-from .dictionaries import DictionaryCache
+from .dictionary_factory import DictionaryFactory
 from .utils import levenshtein_dist

 try:
     from .rules import apply_rules, GERMAN_PREFIXES, RULES_LANGS, RUSSIAN_PREFIXES
-    from .tokenizer import simple_tokenizer
+    from .tokenizer import Tokenizer

 # local error, also ModuleNotFoundError for Python >= 3.6
 except ImportError:  # pragma: no cover
     from rules import apply_rules, RULES_LANGS  # type: ignore
-    from tokenizer import simple_tokenizer  # type: ignore
+    from tokenizer import Tokenizer  # type: ignore

 LOGGER = logging.getLogger(__name__)

@@ -51,7 +51,7 @@
 PUNCTUATION = {".", "?", "!", "…", "¿", "¡"}


-def _control_input_type(token: Any) -> None:
+def _validate_input_type(token: str) -> None:
     "Make sure the input is a string of length > 0."
     if not isinstance(token, str):
         raise TypeError(f"Wrong input type, expected string, got {type(token)}")
@@ -62,16 +62,17 @@ def _control_input_type(token: Any) -> None:
 class Lemmatizer:
     def __init__(
         self,
-        dictionaryCache: Optional[DictionaryCache] = None,
+        dictionaryCache: Optional[DictionaryFactory] = None,
         lemmatization_distance_cache_max_size=1048576,
         levenshtein_distance_cache_max_size=1048576,
     ) -> None:
         if dictionaryCache == None:
-            dictionaryCache = DictionaryCache()
-        assert isinstance(dictionaryCache, DictionaryCache)
-        self.dictionaryCache: DictionaryCache = dictionaryCache
-        self.lemmatize = lru_cache(maxsize=lemmatization_distance_cache_max_size)(
-            self._lemmatize
+            dictionaryCache = DictionaryFactory()
+        assert isinstance(dictionaryCache, DictionaryFactory)
+        self.dictionaryCache: DictionaryFactory = dictionaryCache
+        self.tokenizer = Tokenizer()
+        self.lemmatize_token = lru_cache(maxsize=lemmatization_distance_cache_max_size)(
+            self._lemmatize_token
         )
         self.levenshtein_dist = lru_cache(maxsize=levenshtein_distance_cache_max_size)(
             levenshtein_dist
@@ -275,19 +276,19 @@ def _return_lemma(
             candidate = self._greedy_search(candidate, datadict)
         return candidate

-    def is_known(
+    def is_token_known(
         self, token: str, lang: Optional[Union[str, Tuple[str]]] = None
     ) -> bool:
         """Tell if a token is present in one of the loaded dictionaries.
        Case-insensitive, whole word forms only. Returns True or False."""
-        _control_input_type(token)
-        _ = self.dictionaryCache.update_lang_data(lang)  # ignore returned value
+        _validate_input_type(token)
+        dictionaries = self.dictionaryCache.get_dictionaries(lang)
         return any(
-            self._simple_search(token, language.dict) is not None
-            for language in self.dictionaryCache.data
+            self._simple_search(token, dictionary) is not None
+            for dictionary in dictionaries.values()
         )

-    def _lemmatize(
+    def _lemmatize_token(
         self,
         token: str,
         lang: Optional[Union[str, Tuple[str]]] = None,
@@ -299,27 +300,29 @@ def _lemmatize(
        language list passed as input.
        Returns a string.
        Can raise ValueError by silent=False if no lemma has been found."""
-        _control_input_type(token)
-        lang = self.dictionaryCache.update_lang_data(lang)  # use returned lang value
+        _validate_input_type(token)
+        dictionaries = self.dictionaryCache.get_dictionaries(
+            lang
+        )  # use returned lang value
         # start
-        for i, l in enumerate(self.dictionaryCache.data, start=1):
+        for i, (lang_code, dictionary) in enumerate(dictionaries.items(), start=1):
             # determine default greediness
             # if greedy is None:
             #     greedy = _define_greediness(language)
             # determine lemma
             candidate = self._return_lemma(
-                token, l.dict, greedy=greedy, lang=l.code, initial=initial
+                token, dictionary, greedy=greedy, lang=lang_code, initial=initial
             )
             if candidate is not None:
                 if i != 1:
-                    LOGGER.debug("%s found in %s", token, l.code)
+                    LOGGER.debug("%s found in %s", token, lang_code)
                 return candidate
         if not silent:
             raise ValueError(f"Token not found: {token}")
         # try to simply lowercase # and len(token) < 10 ?
-        return token.lower() if lang[0] in BETTER_LOWER else token
+        return token.lower() if list(dictionaries.keys())[0] in BETTER_LOWER else token

-    def text_lemmatizer(
+    def lemmatize_text(
         self,
         text: str,
         lang: Optional[Union[str, Tuple[str]]] = None,
@@ -328,23 +331,9 @@ def text_lemmatizer(
     ) -> List[str]:
         """Convenience function to lemmatize a text using a simple tokenizer.
        Returns a list of tokens and lemmata."""
-        lemmata = []
-        last = "."  # beginning is initial
-        for match in simple_tokenizer(text, iterate=True):
-            # lemmatize, simple heuristic for sentence boundary
-            lemmata.append(
-                self.lemmatize(
-                    match[0],
-                    lang=lang,
-                    greedy=greedy,
-                    silent=silent,
-                    initial=last in PUNCTUATION,
-                )
-            )
-            last = match[0]
-        return lemmata
+        return list(self.lemmatize_text_iterator(text, lang, greedy, silent))

-    def lemma_iterator(
+    def lemmatize_text_iterator(
         self,
         text: str,
         lang: Optional[Union[str, Tuple[str]]] = None,
@@ -354,10 +343,12 @@ def lemma_iterator(
        """Convenience function to lemmatize a text using a simple tokenizer.
        Returns a list of tokens and lemmata."""
         last = "."  # beginning is initial
-        for match in simple_tokenizer(text, iterate=True):
-            # lemmatize
-            initial = last in PUNCTUATION
-            last = match[0]
-            yield self.lemmatize(
-                match[0], lang=lang, greedy=greedy, silent=silent, initial=initial
+        for match in self.tokenizer.simple_tokenizer(text, iterate=True):
+            yield self.lemmatize_token(
+                match[0],
+                lang=lang,
+                greedy=greedy,
+                silent=silent,
+                initial=last in PUNCTUATION,
             )
+            last = match[0]
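
A usage sketch of the renamed Lemmatizer methods (not taken from the diff; the custom-Tokenizer hook mentioned in the commit title is not visible in this excerpt, so the default Tokenizer is assumed and the sample words are illustrative):

from simplemma import Lemmatizer

lemmatizer = Lemmatizer()  # default DictionaryFactory and Tokenizer

# formerly lemmatize()
print(lemmatizer.lemmatize_token("Katzen", lang="de"))
# formerly text_lemmatizer() / lemma_iterator()
print(lemmatizer.lemmatize_text("Hunde und Katzen laufen.", lang="de"))
# formerly is_known()
print(lemmatizer.is_token_known("Hunde", lang="de"))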
