Add language detection to REST API #659

Merged — 17 commits, Sep 17, 2024
Changes from all commits
4 changes: 2 additions & 2 deletions annif/analyzer/simplemma.py
@@ -2,7 +2,7 @@

from __future__ import annotations

-import simplemma
+import annif.simplemma_util

from . import analyzer

@@ -15,4 +15,4 @@ def __init__(self, param: str, **kwargs) -> None:
super().__init__(**kwargs)

def _normalize_word(self, word: str) -> str:
-        return simplemma.lemmatize(word, lang=self.lang)
+        return annif.simplemma_util.lemmatizer.lemmatize(word, lang=self.lang)
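As a quick illustration of what the analyzer now delegates to, here is a minimal sketch using the shared lemmatizer from the new `annif.simplemma_util` module introduced later in this diff; the example word and expected output are illustrative, not taken from the PR.

```python
from annif.simplemma_util import lemmatizer

# The analyzer now calls the shared, pre-configured Lemmatizer instance
# instead of the module-level simplemma.lemmatize() function.
print(lemmatizer.lemmatize("dogs", lang="en"))  # expected output: "dog"
```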
59 changes: 59 additions & 0 deletions annif/openapi/annif.yaml
@@ -182,6 +182,49 @@ paths:
"503":
$ref: '#/components/responses/ServiceUnavailable'
x-codegen-request-body-name: documents
/detect-language:
post:
tags:
- Language detection
summary: detect the language of a text given a list of candidate languages
operationId: annif.rest.detect_language
requestBody:
content:
application/json:
schema:
type: object
required:
- text
- languages
properties:
text:
type: string
description: input text
example: A quick brown fox jumped over the lazy dog.
languages:
type: array
description: candidate languages as IETF BCP 47 codes
items:
type: string
maxLength: 3
minLength: 2
example: en
minItems: 1
maxItems: 5
required: true
responses:
200:
description: successful operation
content:
application/json:
schema:
$ref: '#/components/schemas/DetectedLanguages'
400:
description: Bad Request
content:
application/problem+json:
schema:
$ref: '#/components/schemas/Problem'
components:
schemas:
ApiInfo:
@@ -316,6 +359,22 @@ components:
type: string
example: Vulpes vulpes
description: A document with attached, known good subjects
DetectedLanguages:
type: object
properties:
results:
type: array
items:
type: object
properties:
language:
type: string
example: en
nullable: true
score:
type: number
example: 0.85
description: Candidate languages with their associated scores
Problem:
type: object
properties:
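For reference, a minimal sketch of how a client could call the new endpoint once this spec is served; the base URL and the use of the `requests` library are assumptions for illustration, not part of this PR.

```python
import requests  # any HTTP client works; requests is assumed here

API_BASE = "http://localhost:5000/v1"  # hypothetical Annif instance

payload = {
    "text": "A quick brown fox jumped over the lazy dog.",
    "languages": ["en", "fi", "sv"],  # 1-5 candidate IETF BCP 47 codes
}
resp = requests.post(f"{API_BASE}/detect-language", json=payload)
resp.raise_for_status()
print(resp.json())
# Illustrative response shape (scores will vary):
# {"results": [{"language": "en", "score": 1.0},
#              {"language": "fi", "score": 0.0},
#              {"language": "sv", "score": 0.0},
#              {"language": None, "score": 0.0}]}
```

The `minItems`/`maxItems` and length constraints in the spec let Connexion reject empty or oversized candidate lists with a 400 before the handler runs, which is what the new tests in tests/test_openapi.py below exercise.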
26 changes: 26 additions & 0 deletions annif/rest.py
@@ -12,6 +12,7 @@
from annif.corpus import Document, DocumentList, SubjectSet
from annif.exception import AnnifException
from annif.project import Access
from annif.simplemma_util import get_language_detector

if TYPE_CHECKING:
from connexion.lifecycle import ConnexionResponse
@@ -82,6 +83,31 @@ def show_project(
return project.dump(), 200, {"Content-Type": "application/json"}


def detect_language(body: dict[str, Any]):
"""return scores for detected languages formatted according to Swagger spec"""

text = body.get("text")
languages = body.get("languages")

detector = get_language_detector(tuple(languages))
try:
proportions = detector.proportion_in_each_language(text)
except ValueError:
return connexion.problem(
status=400,
title="Bad Request",
detail="unsupported candidate languages",
)

result = {
"results": [
{"language": lang if lang != "unk" else None, "score": score}
for lang, score in proportions.items()
]
}
return result, 200, {"Content-Type": "application/json"}


def _suggestion_to_dict(
suggestion: SubjectSuggestion, subject_index: SubjectIndex, language: str
) -> dict[str, str | float | None]:
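To make the handler's mapping easier to follow, here is a sketch (not part of the diff) of how the `proportion_in_each_language` output is turned into the response payload, with simplemma's "unk" bucket mapped to a null language; the example text and scores are illustrative.

```python
from annif.simplemma_util import get_language_detector

detector = get_language_detector(("en", "fi", "sv"))
# A dict mapping each candidate language (plus "unk" for text that could not
# be attributed to any candidate) to the proportion of the text it covers.
proportions = detector.proportion_in_each_language(
    "A quick brown fox jumped over the lazy dog."
)

results = [
    {"language": lang if lang != "unk" else None, "score": score}
    for lang, score in proportions.items()
]
# e.g. [{"language": "en", "score": 1.0}, {"language": "fi", "score": 0.0}, ...]
```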
17 changes: 17 additions & 0 deletions annif/simplemma_util.py
@@ -0,0 +1,17 @@
"""Wrapper code for using Simplemma functionality in Annif"""

from typing import Tuple, Union

from simplemma import LanguageDetector, Lemmatizer
from simplemma.strategies import DefaultStrategy
from simplemma.strategies.dictionaries import DefaultDictionaryFactory

LANG_CACHE_SIZE = 5 # How many language dictionaries to keep in memory at once (max)

_dictionary_factory = DefaultDictionaryFactory(cache_max_size=LANG_CACHE_SIZE)
_lemmatization_strategy = DefaultStrategy(dictionary_factory=_dictionary_factory)
lemmatizer = Lemmatizer(lemmatization_strategy=_lemmatization_strategy)


def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector:
return LanguageDetector(lang, lemmatization_strategy=_lemmatization_strategy)
8 changes: 5 additions & 3 deletions annif/transform/langfilter.py
@@ -5,9 +5,8 @@

from typing import TYPE_CHECKING

-from simplemma.langdetect import in_target_language

import annif
+import annif.simplemma_util

from . import transform

@@ -31,6 +30,9 @@ def __init__(
self.text_min_length = int(text_min_length)
self.sentence_min_length = int(sentence_min_length)
self.min_ratio = float(min_ratio)
self.language_detector = annif.simplemma_util.get_language_detector(
self.project.language
)

def transform_fn(self, text: str) -> str:
if len(text) < self.text_min_length:
@@ -41,7 +43,7 @@ def transform_fn(self, text: str) -> str:
if len(sent) < self.sentence_min_length:
retained_sentences.append(sent)
continue
-            proportion = in_target_language(sent, lang=(self.project.language,))
+            proportion = self.language_detector.proportion_in_target_languages(sent)
if proportion >= self.min_ratio:
retained_sentences.append(sent)
return " ".join(retained_sentences)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -45,7 +45,7 @@ numpy = "1.26.*"
optuna = "3.6.*"
python-dateutil = "2.9.*"
tomli = { version = "2.0.*", python = "<3.11" }
simplemma = "0.9.*"
simplemma = "~1.1.1"
jsonschema = "4.21.*"
huggingface-hub = "0.22.*"

12 changes: 12 additions & 0 deletions tests/test_openapi.py
@@ -126,3 +126,15 @@ def test_openapi_learn_novocab(app_client):
data = []
req = app_client.post("http://localhost:8000/v1/projects/novocab/learn", json=data)
assert req.status_code == 503


def test_rest_detect_language_no_candidates(app_client):
data = {"text": "example text", "languages": []}
req = app_client.post("http://localhost:8000/v1/detect-language", json=data)
assert req.status_code == 400


def test_rest_detect_language_too_many_candidates(app_client):
data = {"text": "example text", "languages": ["en", "fr", "de", "it", "es", "nl"]}
req = app_client.post("http://localhost:8000/v1/detect-language", json=data)
assert req.status_code == 400
32 changes: 32 additions & 0 deletions tests/test_rest.py
@@ -53,6 +53,38 @@ def test_rest_show_project_nonexistent(app):
assert result.status_code == 404


def test_rest_detect_language_english(app):
# english text should be detected
with app.app_context():
result = annif.rest.detect_language(
{"text": "example text", "languages": ["en", "fi", "sv"]}
)[0]
assert {"language": "en", "score": 1} in result["results"]


def test_rest_detect_language_unknown(app):
# an unknown language should return None
with app.app_context():
result = annif.rest.detect_language(
{"text": "exampley texty", "languages": ["fi", "sv"]}
)[0]
assert {"language": None, "score": 1} in result["results"]


def test_rest_detect_language_no_text(app):
with app.app_context():
result = annif.rest.detect_language({"text": "", "languages": ["en"]})[0]
assert {"language": None, "score": 1} in result["results"]


def test_rest_detect_language_unsupported_candidates(app):
with app.app_context():
result = annif.rest.detect_language(
{"text": "example text", "languages": ["unk"]}
)
assert result.status_code == 400


def test_rest_suggest_public(app):
# public projects should be accessible via REST
with app.app_context():
19 changes: 19 additions & 0 deletions tests/test_simplemma_util.py
@@ -0,0 +1,19 @@
"""Unit tests for Simplemma utility functions"""

import pytest

from annif.simplemma_util import get_language_detector


def test_get_language_detector():
detector = get_language_detector("en")
text = "She said 'au revoir' and left"
proportion = detector.proportion_in_target_languages(text)
assert proportion == pytest.approx(0.75)


def test_get_language_detector_many():
detector = get_language_detector(("en", "fr"))
text = "She said 'au revoir' and left"
proportion = detector.proportion_in_target_languages(text)
assert proportion == pytest.approx(1.0)