Skip to content

Commit

Permalink
enable using get_language_detector with many languages + add unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
osma committed Sep 16, 2024
1 parent b66279e commit 4a69662
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 1 deletion.
4 changes: 3 additions & 1 deletion annif/simplemma_util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Wrapper code for using Simplemma functionality in Annif"""

from typing import Tuple, Union

from simplemma import LanguageDetector, Lemmatizer
from simplemma.strategies import DefaultStrategy
from simplemma.strategies.dictionaries import DefaultDictionaryFactory
Expand All @@ -11,5 +13,5 @@
lemmatizer = Lemmatizer(lemmatization_strategy=_lemmatization_strategy)


def get_language_detector(lang: str) -> LanguageDetector:
def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector:
    """Create a Simplemma ``LanguageDetector`` for the given language(s).

    ``lang`` may be a single language code (e.g. ``"en"``) or a tuple of
    language codes (e.g. ``("en", "fr")``); it is passed to
    ``LanguageDetector`` unchanged. The detector is constructed with the
    module-level ``_lemmatization_strategy``.
    """
    detector = LanguageDetector(
        lang,
        lemmatization_strategy=_lemmatization_strategy,
    )
    return detector
17 changes: 17 additions & 0 deletions tests/test_simplemma_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Unit tests for Simplemma utility functions"""

from annif.simplemma_util import get_language_detector


def test_get_language_detector():
    """A detector targeting only "en" reports 0.75 for the mixed sample."""
    sample = "She said 'au revoir' and left"
    english_detector = get_language_detector("en")
    assert english_detector.proportion_in_target_languages(sample) == 0.75


def test_get_language_detector_many():
    """A detector targeting ("en", "fr") reports 1.0 for the mixed sample."""
    sample = "She said 'au revoir' and left"
    bilingual_detector = get_language_detector(("en", "fr"))
    assert bilingual_detector.proportion_in_target_languages(sample) == 1.0

0 comments on commit 4a69662

Please sign in to comment.