From 5d9c081dbafb079c52de6ae3751c73f25d79d354 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 17 Sep 2024 14:19:05 +0300 Subject: [PATCH 1/3] add detect-language CLI command --- annif/cli.py | 22 ++++++++++++++++++++++ tests/test_cli.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/annif/cli.py b/annif/cli.py index 673dc42fb..8bbd90f88 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -24,6 +24,7 @@ OperationFailedException, ) from annif.project import Access +from annif.simplemma_util import get_language_detector from annif.util import metric_code logger = annif.logger @@ -735,5 +736,26 @@ def run_completion(shell): click.echo(script) +@cli.command("detect-language") +@click.argument("languages", nargs=-1) +def run_detect_language(languages): + """Detect the language of a text given a list of candidate languages.""" + + if not languages: + raise click.UsageError("At least one language is required as an argument") + + text = sys.stdin.read() + detector = get_language_detector(tuple(languages)) + try: + proportions = detector.proportion_in_each_language(text) + except ValueError as e: + raise click.UsageError(e) + + for lang, score in sorted(proportions.items(), key=lambda x: x[1], reverse=True): + if lang == "unk": + lang = "?" + click.echo(f"{lang}\t{score:.04f}") + + if __name__ == "__main__": cli() diff --git a/tests/test_cli.py b/tests/test_cli.py index d4c7f17d7..b19067a8e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1391,3 +1391,37 @@ def test_completion_show_project_project_ids_dummy(): def test_completion_load_vocab_vocab_ids_all(): completions = get_completions(annif.cli.cli, ["load-vocab"], "") assert completions == ["dummy", "dummy-noname", "yso"] + + +def test_detect_language(): + result = runner.invoke( + annif.cli.cli, + ["detect-language", "fi", "sv", "en"], + input="This is some example text", + ) + assert not result.exception + assert result.exit_code == 0 + assert result.output.split("\n")[0] == "en\t1.0000" + assert result.output.split("\n")[-2] == "?\t0.0000" + + +def test_detect_language_no_candidates(): + failed_result = runner.invoke( + annif.cli.cli, + ["detect-language"], + input="This is some example text", + ) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert "At least one language is required as an argument" in failed_result.output + + +def test_detect_language_unknown_language(): + failed_result = runner.invoke( + annif.cli.cli, + ["detect-language", "xxx"], + input="This is some example text", + ) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert "Error: Unsupported language: xxx" in failed_result.output From 2c06655e4c5a46e1e540c2c1cfd9df9de7cd0061 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 17 Sep 2024 14:59:10 +0300 Subject: [PATCH 2/3] refactor: extract detect_language into a utility method used by both REST and CLI --- annif/cli.py | 7 +++---- annif/rest.py | 17 ++++++----------- annif/simplemma_util.py | 8 +++++++- tests/test_simplemma_util.py | 11 ++++++++++- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 8bbd90f88..0e4bb0e26 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -24,7 +24,7 @@ OperationFailedException, ) from annif.project import Access -from annif.simplemma_util import get_language_detector +from annif.simplemma_util import detect_language from annif.util import metric_code logger = annif.logger @@ -745,13 +745,12 @@ def run_detect_language(languages): raise click.UsageError("At least one language is required as an argument") text = sys.stdin.read() - detector = get_language_detector(tuple(languages)) try: - proportions = detector.proportion_in_each_language(text) + proportions = detect_language(text, languages) except ValueError as e: raise click.UsageError(e) - for lang, score in sorted(proportions.items(), key=lambda x: x[1], reverse=True): + for lang, score in proportions.items(): if lang == "unk": lang = "?" click.echo(f"{lang}\t{score:.04f}") diff --git a/annif/rest.py b/annif/rest.py index d96786642..d00a16f97 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -9,10 +9,10 @@ import connexion import annif.registry +import annif.simplemma_util from annif.corpus import Document, DocumentList, SubjectSet from annif.exception import AnnifException from annif.project import Access -from annif.simplemma_util import get_language_detector if TYPE_CHECKING: from connexion.lifecycle import ConnexionResponse @@ -89,9 +89,8 @@ def detect_language(body: dict[str, Any]): text = body.get("text") languages = body.get("languages") - detector = get_language_detector(tuple(languages)) try: - proportions = detector.proportion_in_each_language(text) + proportions = annif.simplemma_util.detect_language(text, tuple(languages)) except ValueError: return connexion.problem( status=400, @@ -100,14 +99,10 @@ def detect_language(body: dict[str, Any]): ) result = { - "results": sorted( - [ - {"language": lang if lang != "unk" else None, "score": score} - for lang, score in proportions.items() - ], - key=lambda x: x["score"], - reverse=True, - ) + "results": [ + {"language": lang if lang != "unk" else None, "score": score} + for lang, score in proportions.items() + ] } return result, 200, {"Content-Type": "application/json"} diff --git a/annif/simplemma_util.py b/annif/simplemma_util.py index 4a8b8a1e6..8b4cb1340 100644 --- a/annif/simplemma_util.py +++ b/annif/simplemma_util.py @@ -1,6 +1,6 @@ """Wrapper code for using Simplemma functionality in Annif""" -from typing import Tuple, Union +from typing import Dict, Tuple, Union from simplemma import LanguageDetector, Lemmatizer from simplemma.strategies import DefaultStrategy @@ -15,3 +15,9 @@ def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector: return LanguageDetector(lang, lemmatization_strategy=_lemmatization_strategy) + + +def detect_language(text: str, languages: Tuple[str, ...]) -> Dict[str, float]: + detector = get_language_detector(languages) + proportions = detector.proportion_in_each_language(text) + return dict(sorted(proportions.items(), key=lambda x: x[1], reverse=True)) diff --git a/tests/test_simplemma_util.py b/tests/test_simplemma_util.py index 57ea8b83e..4f520f66f 100644 --- a/tests/test_simplemma_util.py +++ b/tests/test_simplemma_util.py @@ -2,7 +2,7 @@ import pytest -from annif.simplemma_util import get_language_detector +from annif.simplemma_util import detect_language, get_language_detector def test_get_language_detector(): @@ -17,3 +17,12 @@ def test_get_language_detector_many(): text = "She said 'au revoir' and left" proportion = detector.proportion_in_target_languages(text) assert proportion == pytest.approx(1.0) + + +def test_detect_language(): + text = "She said 'au revoir' and left" + languages = ("fr", "en") + proportions = detect_language(text, languages) + assert proportions["en"] == pytest.approx(0.75) + assert proportions["fr"] == pytest.approx(0.25) + assert list(proportions.keys())[0] == "en" From 0dda55b955263c3385470f39a335f40b2a9a6282 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 17 Sep 2024 16:17:49 +0300 Subject: [PATCH 3/3] support detecting the language of multiple files in a single CLI command --- annif/cli.py | 43 ++++++++++++++++++++++++++++--------------- tests/test_cli.py | 28 +++++++++++++++------------- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 0e4bb0e26..c2746ab3b 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -737,23 +737,36 @@ def run_completion(shell): @cli.command("detect-language") -@click.argument("languages", nargs=-1) -def run_detect_language(languages): - """Detect the language of a text given a list of candidate languages.""" +@click.argument("languages") +@click.argument( + "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1 +) +def run_detect_language(languages, paths): + """ + Detect the language of a single text document from standard input or for one or more + document file(s) given its/their path(s). + """ - if not languages: - raise click.UsageError("At least one language is required as an argument") + langs = tuple(languages.split(",")) - text = sys.stdin.read() - try: - proportions = detect_language(text, languages) - except ValueError as e: - raise click.UsageError(e) - - for lang, score in proportions.items(): - if lang == "unk": - lang = "?" - click.echo(f"{lang}\t{score:.04f}") + def detect_language_and_show(text, languages): + try: + proportions = detect_language(text, languages) + except ValueError as e: + raise click.UsageError(e) + for lang, score in proportions.items(): + if lang == "unk": + lang = "?" + click.echo(f"{lang}\t{score:.04f}") + + if paths and not (len(paths) == 1 and paths[0] == "-"): + doclist = cli_util.open_text_documents(paths, docs_limit=None) + for doc, path in zip(doclist.documents, paths): + click.echo(f"Detected languages for {path}") + detect_language_and_show(doc.text, langs) + else: + text = sys.stdin.read() + detect_language_and_show(text, langs) if __name__ == "__main__": diff --git a/tests/test_cli.py b/tests/test_cli.py index b19067a8e..134ea9bc2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1393,10 +1393,10 @@ def test_completion_load_vocab_vocab_ids_all(): assert completions == ["dummy", "dummy-noname", "yso"] -def test_detect_language(): +def test_detect_language_stdin(): result = runner.invoke( annif.cli.cli, - ["detect-language", "fi", "sv", "en"], + ["detect-language", "fi,sv,en"], input="This is some example text", ) assert not result.exception @@ -1405,17 +1405,6 @@ def test_detect_language(): assert result.output.split("\n")[-2] == "?\t0.0000" -def test_detect_language_no_candidates(): - failed_result = runner.invoke( - annif.cli.cli, - ["detect-language"], - input="This is some example text", - ) - assert failed_result.exception - assert failed_result.exit_code != 0 - assert "At least one language is required as an argument" in failed_result.output - - def test_detect_language_unknown_language(): failed_result = runner.invoke( annif.cli.cli, @@ -1425,3 +1414,16 @@ def test_detect_language_unknown_language(): assert failed_result.exception assert failed_result.exit_code != 0 assert "Error: Unsupported language: xxx" in failed_result.output + + +def test_detect_language_file_and_stdin(tmpdir): + docfile1 = tmpdir.join("doc-1.txt") + docfile1.write("nothing special") + + result = runner.invoke( + annif.cli.cli, ["detect-language", "fi,en", str(docfile1), "-"], input="kissa" + ) + + assert not result.exception + assert f"Detected languages for {docfile1}" in result.output + assert "Detected languages for -" in result.output