Add language detection to REST API #659

Merged — 17 commits, Sep 17, 2024
Changes from all commits
4 changes: 2 additions & 2 deletions annif/analyzer/simplemma.py
@@ -2,7 +2,7 @@

from __future__ import annotations

-import simplemma
+import annif.simplemma_util

from . import analyzer

@@ -15,4 +15,4 @@ def __init__(self, param: str, **kwargs) -> None:
super().__init__(**kwargs)

def _normalize_word(self, word: str) -> str:
-        return simplemma.lemmatize(word, lang=self.lang)
+        return annif.simplemma_util.lemmatizer.lemmatize(word, lang=self.lang)
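As a quick illustration of what the analyzer now delegates to, here is a minimal sketch using the shared lemmatizer from the new `annif.simplemma_util` module introduced later in this diff; the example word and expected output are illustrative, not taken from the PR.

```python
from annif.simplemma_util import lemmatizer

# The analyzer now calls the shared, pre-configured Lemmatizer instance
# instead of the module-level simplemma.lemmatize() function.
print(lemmatizer.lemmatize("dogs", lang="en"))  # expected output: "dog"
```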
59 changes: 59 additions & 0 deletions annif/openapi/annif.yaml
@@ -182,6 +182,49 @@ paths:
"503":
$ref: '#/components/responses/ServiceUnavailable'
x-codegen-request-body-name: documents
/detect-language:
post:
tags:
- Language detection
summary: detect the language of a text given a list of candidate languages
operationId: annif.rest.detect_language
requestBody:
content:
application/json:
schema:
type: object
required:
- text
- languages
properties:
text:
type: string
description: input text
example: A quick brown fox jumped over the lazy dog.
languages:
type: array
description: candidate languages as IETF BCP 47 codes
items:
type: string
maxLength: 3
minLength: 2
example: en
minItems: 1
maxItems: 5
required: true
responses:
200:
description: successful operation
content:
application/json:
schema:
$ref: '#/components/schemas/DetectedLanguages'
400:
description: Bad Request
content:
application/problem+json:
schema:
$ref: '#/components/schemas/Problem'
components:
schemas:
ApiInfo:
@@ -316,6 +359,22 @@ components:
type: string
example: Vulpes vulpes
description: A document with attached, known good subjects
DetectedLanguages:
type: object
properties:
results:
type: array
items:
type: object
properties:
language:
type: string
example: en
nullable: true
score:
type: number
example: 0.85
description: Candidate languages with their associated scores
Problem:
type: object
properties:
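For reference, a minimal sketch of how a client could call the new endpoint once this spec is served; the base URL and the use of the `requests` library are assumptions for illustration, not part of this PR.

```python
import requests  # any HTTP client works; requests is assumed here

API_BASE = "http://localhost:5000/v1"  # hypothetical Annif instance

payload = {
    "text": "A quick brown fox jumped over the lazy dog.",
    "languages": ["en", "fi", "sv"],  # 1-5 candidate IETF BCP 47 codes
}
resp = requests.post(f"{API_BASE}/detect-language", json=payload)
resp.raise_for_status()
print(resp.json())
# Illustrative response shape (scores will vary):
# {"results": [{"language": "en", "score": 1.0},
#              {"language": "fi", "score": 0.0},
#              {"language": "sv", "score": 0.0},
#              {"language": None, "score": 0.0}]}
```

The `minItems`/`maxItems` and length constraints in the spec let Connexion reject empty or oversized candidate lists with a 400 before the handler runs, which is what the new tests in tests/test_openapi.py below exercise.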
26 changes: 26 additions & 0 deletions annif/rest.py
@@ -12,6 +12,7 @@
from annif.corpus import Document, DocumentList, SubjectSet
from annif.exception import AnnifException
from annif.project import Access
from annif.simplemma_util import get_language_detector

if TYPE_CHECKING:
from connexion.lifecycle import ConnexionResponse
@@ -82,6 +83,31 @@ def show_project(
return project.dump(), 200, {"Content-Type": "application/json"}


def detect_language(body: dict[str, Any]):
"""return scores for detected languages formatted according to Swagger spec"""

text = body.get("text")
languages = body.get("languages")

detector = get_language_detector(tuple(languages))
try:
proportions = detector.proportion_in_each_language(text)
except ValueError:
return connexion.problem(
status=400,
title="Bad Request",
detail="unsupported candidate languages",
)

result = {
"results": [
{"language": lang if lang != "unk" else None, "score": score}
for lang, score in proportions.items()
]
}
return result, 200, {"Content-Type": "application/json"}


def _suggestion_to_dict(
suggestion: SubjectSuggestion, subject_index: SubjectIndex, language: str
) -> dict[str, str | float | None]:
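To make the handler's mapping easier to follow, here is a sketch (not part of the diff) of how the `proportion_in_each_language` output is turned into the response payload, with simplemma's "unk" bucket mapped to a null language; the example text and scores are illustrative.

```python
from annif.simplemma_util import get_language_detector

detector = get_language_detector(("en", "fi", "sv"))
# A dict mapping each candidate language (plus "unk" for text that could not
# be attributed to any candidate) to the proportion of the text it covers.
proportions = detector.proportion_in_each_language(
    "A quick brown fox jumped over the lazy dog."
)

results = [
    {"language": lang if lang != "unk" else None, "score": score}
    for lang, score in proportions.items()
]
# e.g. [{"language": "en", "score": 1.0}, {"language": "fi", "score": 0.0}, ...]
```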
17 changes: 17 additions & 0 deletions annif/simplemma_util.py
@@ -0,0 +1,17 @@
"""Wrapper code for using Simplemma functionality in Annif"""

from typing import Tuple, Union

from simplemma import LanguageDetector, Lemmatizer
from simplemma.strategies import DefaultStrategy
from simplemma.strategies.dictionaries import DefaultDictionaryFactory

LANG_CACHE_SIZE = 5 # How many language dictionaries to keep in memory at once (max)

_dictionary_factory = DefaultDictionaryFactory(cache_max_size=LANG_CACHE_SIZE)
_lemmatization_strategy = DefaultStrategy(dictionary_factory=_dictionary_factory)
lemmatizer = Lemmatizer(lemmatization_strategy=_lemmatization_strategy)


def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector:
return LanguageDetector(lang, lemmatization_strategy=_lemmatization_strategy)
8 changes: 5 additions & 3 deletions annif/transform/langfilter.py
@@ -5,9 +5,8 @@

from typing import TYPE_CHECKING

-from simplemma.langdetect import in_target_language

import annif
+import annif.simplemma_util

from . import transform

@@ -31,6 +30,9 @@ def __init__(
self.text_min_length = int(text_min_length)
self.sentence_min_length = int(sentence_min_length)
self.min_ratio = float(min_ratio)
self.language_detector = annif.simplemma_util.get_language_detector(
self.project.language
)

def transform_fn(self, text: str) -> str:
if len(text) < self.text_min_length:
@@ -41,7 +43,7 @@ def transform_fn(self, text: str) -> str:
if len(sent) < self.sentence_min_length:
retained_sentences.append(sent)
continue
-            proportion = in_target_language(sent, lang=(self.project.language,))
+            proportion = self.language_detector.proportion_in_target_languages(sent)
if proportion >= self.min_ratio:
retained_sentences.append(sent)
return " ".join(retained_sentences)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -45,7 +45,7 @@ numpy = "1.26.*"
optuna = "3.6.*"
python-dateutil = "2.9.*"
tomli = { version = "2.0.*", python = "<3.11" }
simplemma = "0.9.*"
simplemma = "~1.1.1"
jsonschema = "4.21.*"
huggingface-hub = "0.22.*"

12 changes: 12 additions & 0 deletions tests/test_openapi.py
@@ -126,3 +126,15 @@ def test_openapi_learn_novocab(app_client):
data = []
req = app_client.post("http://localhost:8000/v1/projects/novocab/learn", json=data)
assert req.status_code == 503


def test_rest_detect_language_no_candidates(app_client):
data = {"text": "example text", "languages": []}
req = app_client.post("http://localhost:8000/v1/detect-language", json=data)
assert req.status_code == 400


def test_rest_detect_language_too_many_candidates(app_client):
data = {"text": "example text", "languages": ["en", "fr", "de", "it", "es", "nl"]}
req = app_client.post("http://localhost:8000/v1/detect-language", json=data)
assert req.status_code == 400
32 changes: 32 additions & 0 deletions tests/test_rest.py
@@ -53,6 +53,38 @@ def test_rest_show_project_nonexistent(app):
assert result.status_code == 404


def test_rest_detect_language_english(app):
# english text should be detected
with app.app_context():
result = annif.rest.detect_language(
{"text": "example text", "languages": ["en", "fi", "sv"]}
)[0]
assert {"language": "en", "score": 1} in result["results"]


def test_rest_detect_language_unknown(app):
# an unknown language should return None
with app.app_context():
result = annif.rest.detect_language(
{"text": "exampley texty", "languages": ["fi", "sv"]}
)[0]
assert {"language": None, "score": 1} in result["results"]


def test_rest_detect_language_no_text(app):
with app.app_context():
result = annif.rest.detect_language({"text": "", "languages": ["en"]})[0]
assert {"language": None, "score": 1} in result["results"]


def test_rest_detect_language_unsupported_candidates(app):
with app.app_context():
result = annif.rest.detect_language(
{"text": "example text", "languages": ["unk"]}
)
assert result.status_code == 400


def test_rest_suggest_public(app):
# public projects should be accessible via REST
with app.app_context():
19 changes: 19 additions & 0 deletions tests/test_simplemma_util.py
@@ -0,0 +1,19 @@
"""Unit tests for Simplemma utility functions"""

import pytest

from annif.simplemma_util import get_language_detector


def test_get_language_detector():
detector = get_language_detector("en")
text = "She said 'au revoir' and left"
proportion = detector.proportion_in_target_languages(text)
assert proportion == pytest.approx(0.75)


def test_get_language_detector_many():
detector = get_language_detector(("en", "fr"))
text = "She said 'au revoir' and left"
proportion = detector.proportion_in_target_languages(text)
assert proportion == pytest.approx(1.0)