diff --git a/annif/analyzer/simplemma.py b/annif/analyzer/simplemma.py
index e549a2585..3e1536882 100644
--- a/annif/analyzer/simplemma.py
+++ b/annif/analyzer/simplemma.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-import simplemma
+import annif.simplemma_util
 
 from . import analyzer
 
@@ -15,4 +15,4 @@ def __init__(self, param: str, **kwargs) -> None:
         super().__init__(**kwargs)
 
     def _normalize_word(self, word: str) -> str:
-        return simplemma.lemmatize(word, lang=self.lang)
+        return annif.simplemma_util.lemmatizer.lemmatize(word, lang=self.lang)
diff --git a/annif/openapi/annif.yaml b/annif/openapi/annif.yaml
index 74e8a4661..7f86f65f7 100644
--- a/annif/openapi/annif.yaml
+++ b/annif/openapi/annif.yaml
@@ -182,6 +182,49 @@ paths:
         "503":
           $ref: '#/components/responses/ServiceUnavailable'
       x-codegen-request-body-name: documents
+  /detect-language:
+    post:
+      tags:
+      - Language detection
+      summary: detect the language of a text given a list of candidate languages
+      operationId: annif.rest.detect_language
+      requestBody:
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - text
+                - languages
+              properties:
+                text:
+                  type: string
+                  description: input text
+                  example: A quick brown fox jumped over the lazy dog.
+                languages:
+                  type: array
+                  description: candidate languages as IETF BCP 47 codes
+                  items:
+                    type: string
+                    maxLength: 3
+                    minLength: 2
+                    example: en
+                  minItems: 1
+                  maxItems: 5
+        required: true
+      responses:
+        200:
+          description: successful operation
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/DetectedLanguages'
+        400:
+          description: Bad Request
+          content:
+            application/problem+json:
+              schema:
+                $ref: '#/components/schemas/Problem'
 components:
   schemas:
     ApiInfo:
@@ -316,6 +359,22 @@ components:
                     type: string
                     example: Vulpes vulpes
       description: A document with attached, known good subjects
+    DetectedLanguages:
+      type: object
+      properties:
+        results:
+          type: array
+          items:
+            type: object
+            properties:
+              language:
+                type: string
+                example: en
+                nullable: true
+              score:
+                type: number
+                example: 0.85
+      description: Candidate languages with their associated scores
     Problem:
       type: object
       properties:
diff --git a/annif/rest.py b/annif/rest.py
index c7f457687..18d5ecfe8 100644
--- a/annif/rest.py
+++ b/annif/rest.py
@@ -12,6 +12,7 @@
 from annif.corpus import Document, DocumentList, SubjectSet
 from annif.exception import AnnifException
 from annif.project import Access
+from annif.simplemma_util import get_language_detector
 
 if TYPE_CHECKING:
     from connexion.lifecycle import ConnexionResponse
@@ -82,6 +83,31 @@ def show_project(
     return project.dump(), 200, {"Content-Type": "application/json"}
 
 
+def detect_language(body: dict[str, Any]):
+    """return scores for detected languages formatted according to Swagger spec"""
+
+    text = body.get("text")
+    languages = body.get("languages")
+
+    detector = get_language_detector(tuple(languages))
+    try:
+        proportions = detector.proportion_in_each_language(text)
+    except ValueError:
+        return connexion.problem(
+            status=400,
+            title="Bad Request",
+            detail="unsupported candidate languages",
+        )
+
+    result = {
+        "results": [
+            {"language": lang if lang != "unk" else None, "score": score}
+            for lang, score in proportions.items()
+        ]
+    }
+    return result, 200, {"Content-Type": "application/json"}
+
+
 def _suggestion_to_dict(
     suggestion: SubjectSuggestion, subject_index: SubjectIndex, language: str
 ) -> dict[str, str | float | None]:
diff --git a/annif/simplemma_util.py b/annif/simplemma_util.py
new file mode 100644
index 000000000..4a8b8a1e6
--- /dev/null
+++ b/annif/simplemma_util.py
@@ -0,0 +1,17 @@
+"""Wrapper code for using Simplemma functionality in Annif"""
+
+from typing import Tuple, Union
+
+from simplemma import LanguageDetector, Lemmatizer
+from simplemma.strategies import DefaultStrategy
+from simplemma.strategies.dictionaries import DefaultDictionaryFactory
+
+LANG_CACHE_SIZE = 5  # How many language dictionaries to keep in memory at once (max)
+
+_dictionary_factory = DefaultDictionaryFactory(cache_max_size=LANG_CACHE_SIZE)
+_lemmatization_strategy = DefaultStrategy(dictionary_factory=_dictionary_factory)
+lemmatizer = Lemmatizer(lemmatization_strategy=_lemmatization_strategy)
+
+
+def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector:
+    return LanguageDetector(lang, lemmatization_strategy=_lemmatization_strategy)
diff --git a/annif/transform/langfilter.py b/annif/transform/langfilter.py
index f7c985485..e5cf8fdfe 100644
--- a/annif/transform/langfilter.py
+++ b/annif/transform/langfilter.py
@@ -5,9 +5,8 @@
 
 from typing import TYPE_CHECKING
 
-from simplemma.langdetect import in_target_language
-
 import annif
+import annif.simplemma_util
 
 from . import transform
 
@@ -31,6 +30,9 @@ def __init__(
         self.text_min_length = int(text_min_length)
         self.sentence_min_length = int(sentence_min_length)
         self.min_ratio = float(min_ratio)
+        self.language_detector = annif.simplemma_util.get_language_detector(
+            self.project.language
+        )
 
     def transform_fn(self, text: str) -> str:
         if len(text) < self.text_min_length:
@@ -41,7 +43,7 @@ def transform_fn(self, text: str) -> str:
             if len(sent) < self.sentence_min_length:
                 retained_sentences.append(sent)
                 continue
-            proportion = in_target_language(sent, lang=(self.project.language,))
+            proportion = self.language_detector.proportion_in_target_languages(sent)
             if proportion >= self.min_ratio:
                 retained_sentences.append(sent)
         return " ".join(retained_sentences)
diff --git a/pyproject.toml b/pyproject.toml
index 970fd2503..487bb8649 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,7 @@
 numpy = "1.26.*"
 optuna = "3.6.*"
 python-dateutil = "2.9.*"
 tomli = { version = "2.0.*", python = "<3.11" }
-simplemma = "0.9.*"
+simplemma = "~1.1.1"
 jsonschema = "4.21.*"
 huggingface-hub = "0.22.*"
diff --git a/tests/test_openapi.py b/tests/test_openapi.py
index 76f33695f..4b409c5da 100644
--- a/tests/test_openapi.py
+++ b/tests/test_openapi.py
@@ -126,3 +126,15 @@ def test_openapi_learn_novocab(app_client):
     data = []
     req = app_client.post("http://localhost:8000/v1/projects/novocab/learn", json=data)
     assert req.status_code == 503
+
+
+def test_rest_detect_language_no_candidates(app_client):
+    data = {"text": "example text", "languages": []}
+    req = app_client.post("http://localhost:8000/v1/detect-language", json=data)
+    assert req.status_code == 400
+
+
+def test_rest_detect_language_too_many_candidates(app_client):
+    data = {"text": "example text", "languages": ["en", "fr", "de", "it", "es", "nl"]}
+    req = app_client.post("http://localhost:8000/v1/detect-language", json=data)
+    assert req.status_code == 400
diff --git a/tests/test_rest.py b/tests/test_rest.py
index c905fc1de..43d42d784 100644
--- a/tests/test_rest.py
+++ b/tests/test_rest.py
@@ -53,6 +53,38 @@ def test_rest_show_project_nonexistent(app):
     assert result.status_code == 404
 
 
+def test_rest_detect_language_english(app):
+    # english text should be detected
+    with app.app_context():
+        result = annif.rest.detect_language(
+            {"text": "example text", "languages": ["en", "fi", "sv"]}
+        )[0]
+    assert {"language": "en", "score": 1} in result["results"]
+
+
+def test_rest_detect_language_unknown(app):
+    # an unknown language should return None
+    with app.app_context():
+        result = annif.rest.detect_language(
+            {"text": "exampley texty", "languages": ["fi", "sv"]}
+        )[0]
+    assert {"language": None, "score": 1} in result["results"]
+
+
+def test_rest_detect_language_no_text(app):
+    with app.app_context():
+        result = annif.rest.detect_language({"text": "", "languages": ["en"]})[0]
+    assert {"language": None, "score": 1} in result["results"]
+
+
+def test_rest_detect_language_unsupported_candidates(app):
+    with app.app_context():
+        result = annif.rest.detect_language(
+            {"text": "example text", "languages": ["unk"]}
+        )
+    assert result.status_code == 400
+
+
 def test_rest_suggest_public(app):
     # public projects should be accessible via REST
     with app.app_context():
diff --git a/tests/test_simplemma_util.py b/tests/test_simplemma_util.py
new file mode 100644
index 000000000..57ea8b83e
--- /dev/null
+++ b/tests/test_simplemma_util.py
@@ -0,0 +1,19 @@
+"""Unit tests for Simplemma utility functions"""
+
+import pytest
+
+from annif.simplemma_util import get_language_detector
+
+
+def test_get_language_detector():
+    detector = get_language_detector("en")
+    text = "She said 'au revoir' and left"
+    proportion = detector.proportion_in_target_languages(text)
+    assert proportion == pytest.approx(0.75)
+
+
+def test_get_language_detector_many():
+    detector = get_language_detector(("en", "fr"))
+    text = "She said 'au revoir' and left"
+    proportion = detector.proportion_in_target_languages(text)
+    assert proportion == pytest.approx(1.0)