Skip to content

Commit

Permalink
More regex rule for post-processing and skip citations (#69)
Browse files Browse the repository at this point in the history
* Fix monkeypatch of no token raiser

* Add tool to record the deepl api behavior changes

* Bookkeeping

* Post-processing regex for inline url link

* Trigger apicall test on pull request
  • Loading branch information
unkcpz authored Dec 23, 2023
1 parent 1307f01 commit da5f31a
Show file tree
Hide file tree
Showing 15 changed files with 315 additions and 39 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/bookkeeping.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
name: bookkeeping CI

# run scheduled translation on given strings
# make the record of the translation in the bookkeeping folder

on:
push:
paths:
- "src/**"
branches:
- main
schedule:
# every week on Monday at 00:00 UTC
- cron: "0 0 * * 1"
workflow_dispatch:

jobs:

bookkeeping:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
submodules: recursive

- name: Install Python 3.11
uses: actions/setup-python@v5
with:
python-version: "3.11"

- name: Install aiida-core-i18n
run: |
pip install -e .
- name: Run bookkeeping
run: |
python -m aiida_core_i18n.bookkeeping
env:
DEEPL_TOKEN: ${{ secrets.DEEPL_TOKEN }}

- name: Make a PR for the bookkeeping
uses: peter-evans/create-pull-request@v5
with:
token: ${{ secrets.GITHUB_TOKEN }}
commit-message: Update bookkeeping
title: Update bookkeeping
body: |
This pull request updates the bookkeeping file.
branch: update-bookkeeping-${{ github.sha }}
reviewers: unkcpz
if: github.event_name != 'push'
4 changes: 4 additions & 0 deletions .github/workflows/ci-i18n-with-apicall.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ name: CI-i18n-with-apicall
# with apicall testing, so runs scheduled on main branch or triggered manually

on:
pull_request:
  paths:
    # Path filters are matched against the full file path, so the extension
    # is needed for a change to the origin text fixture to trigger this run.
    - "tests/statics/origin_text.txt"
    - "src/**"
schedule:
- cron: "0 0 1 * *" # every month on the 1st
workflow_dispatch:
Expand Down
56 changes: 51 additions & 5 deletions src/aiida_core_i18n/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import deepl
import os

DEEPL_TOKEN = os.environ.get("DEEPL_TOKEN")
def get_env_deepl_token() -> typing.Optional[str]:
    """Return the DeepL token stored in the ``DEEPL_TOKEN`` environment variable.

    The environment is read on every call, so a change made to the variable
    after this module was imported is picked up.

    :return: the token, or ``None`` when ``DEEPL_TOKEN`` is not set.
    """
    return os.environ.get("DEEPL_TOKEN")

def str_post_processing(raw_str: str) -> str:
"""deepl zh_CN has problem when translate the `` in CJK from English,
Expand All @@ -21,13 +23,33 @@ def str_post_processing(raw_str: str) -> str:

# for ``{content}`` make sure a space in front
res = re.sub(r'(?:(?:(?<!^)(?<!\s)(?<!`))(``\w.*?``))', r' \1', add_space, flags=re.ASCII)

# r"请访问 ``话语论坛 <https://aiida.discourse.group> `__``。" -> r"请访问 `话语论坛 <https://aiida.discourse.group>`__。"
res = re.sub(r"``(.*?)\s+`__``", r"`\1`__", res, flags=re.ASCII)

return res.strip()

def translate(inp_str: str, target_lang="ZH") -> str:
def met_skip_rule(inp_str: str) -> bool:
    """Return ``True`` when ``inp_str`` must be left untranslated.

    The only rule so far targets citations: a string that contains a DOI
    written as an RST link to ``https://doi.org/`` is skipped, e.g.::

        Martin Uhrin, It is a great day, Computational Materials Science **187**, 110086 (2021); DOI: `10.1016/j.commatsci.2020.110086 <https://doi.org/10.1016/j.commatsci.2020.110086>`_

    :param inp_str: the source string that is about to be translated.
    :return: ``True`` if a skip rule matched, ``False`` otherwise.
    """
    # ``re.match`` anchors at the start of the string, hence the leading
    # ``.*`` that allows any text in front of the DOI. The dot of the host
    # name is escaped so that only the literal ``doi.org`` is accepted.
    citation = r".*DOI: `.*? <https://doi\.org/.*?>`_"
    return re.match(citation, inp_str) is not None

def translate(inp_str: str, target_lang="ZH", post_processing: bool=True) -> str:
"""Call deepl API to tranlate and do post process"""
translator = deepl.Translator(DEEPL_TOKEN)
# If the inp_str meet the skip rule, return the inp_str immediately
if met_skip_rule(inp_str):
return inp_str

translator = deepl.Translator(get_env_deepl_token())

# We don't want to translate the code snippet, so we use
# a special string to replace the `` in the code snippet to avoid
# the translation.
# `` -> EDBS after translated, recover to ``
# EDBS for End Double BackSlash

Expand All @@ -50,7 +72,10 @@ def translate(inp_str: str, target_lang="ZH") -> str:
tstr = translated.text
tstr = tstr.replace('EDBS', '``')

res = str_post_processing(tstr)
if post_processing:
res = str_post_processing(tstr)
else:
res = tstr

return res

Expand Down Expand Up @@ -134,4 +159,25 @@ def is_translated(lines: typing.List[str]) -> bool:

return output_lines



def deepl_status(info: str = "verbose") -> typing.Union[int, "deepl.Usage"]:
    """Get the usage status of the DeepL API account.

    :param info: which piece of information to return: ``"verbose"`` for the
        object returned by ``Translator.get_usage()``, ``"count"`` for the
        character count, ``"limit"`` for the character limit, or ``"avail"``
        for the difference ``limit - count``.
    :return: the usage object for ``"verbose"``, otherwise the requested
        character figure.
    :raises RuntimeError: if the ``DEEPL_TOKEN`` environment variable is not set.
    :raises ValueError: if ``info`` is not one of the supported values.
    """
    import deepl

    token = get_env_deepl_token()
    if token is None:
        raise RuntimeError("Please set the 'DEEPL_TOKEN' environment variable")

    # Reject an unsupported ``info`` before contacting the API, so that a
    # typo does not cost a network round trip.
    if info not in ("verbose", "count", "limit", "avail"):
        raise ValueError("Please set the correct parameter")

    usage = deepl.Translator(token).get_usage()

    if info == "verbose":
        return usage

    character = usage.character
    if info == "count":
        return character.count
    if info == "limit":
        return character.limit
    return character.limit - character.count
39 changes: 19 additions & 20 deletions src/aiida_core_i18n/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import click
import pathlib

from aiida_core_i18n import po_translate
from aiida_core_i18n import po_translate, get_env_deepl_token

@click.group()
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
def cli():
"""CLI command: to run the translation using click"""
pass
Expand Down Expand Up @@ -41,28 +41,27 @@ def translate(po: pathlib.Path, max_chars: int, override_translation: bool, over
@click.option('-p', '--param', help='which information to show', type=click.Choice(['count', 'limit', 'verbose', 'avail']), default='verbose')
def status(param: str):
"""Show the status of the api translation limit"""
import os
import deepl
from aiida_core_i18n import deepl_status

DEEPL_TOKEN = os.environ.get("DEEPL_TOKEN")
if DEEPL_TOKEN is None:
click.echo("ERROR: Please set the DEEPL_TOKEN environment variable")
return
try:
click.echo(deepl_status(param))
except ValueError as exc:
click.echo(f"ERROR: {exc}")

translator = deepl.Translator(DEEPL_TOKEN)
@cli.command()
@click.argument('string', type=str)
@click.option('--target-lang', help='The target language', default='ZH', type=str)
@click.option('--post-processing/--no-post-processing', help='Do post processing', default=True, type=bool)
def deepl(string: str, target_lang: str, post_processing: bool):
"""Translate the string"""
from aiida_core_i18n import translate

usage = translator.get_usage()
# print the initial string
click.echo(f"Input: \n\t{string}")

if param == 'verbose':
click.echo(usage)
elif param == 'count':
click.echo(usage.character.count)
elif param == 'limit':
click.echo(usage.character.limit)
elif param == 'avail':
click.echo(usage.character.limit - usage.character.count)
else:
click.echo("ERROR: Please set the correct parameter")
# translate the string
res = translate(string, target_lang=target_lang, post_processing=post_processing)
click.echo(f"Output: \n\t{res}")


if __name__ == '__main__':
Expand Down
58 changes: 58 additions & 0 deletions src/aiida_core_i18n/bookkeeping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Tool to translate raw en strings to every languages
bookkeeping the Deepl API behavior by date.
"""
import pathlib
from datetime import datetime

from aiida_core_i18n import translate

# list of original strings (English)
inp_strs = [
r"Please visit the `Discourse forum <https://aiida.discourse.group>`__.",
]

rec_folder = pathlib.Path(__file__).parent / "_bookkeeping"

def rec(inp_strs: list[str], lang: str, output_folder: pathlib.Path):
    """Translate the raw strings and record each input/output pair in a dated file.

    The record is written to ``<output_folder>/<lang>_<YYYY-MM-DD>.txt`` using
    the date on which the function runs. ``translate`` is called with
    ``post_processing=False``, so the file holds the translation without the
    post-processing step.

    :param inp_strs: the original (English) strings to translate.
    :param lang: the target language, also used as the file name prefix.
    :param output_folder: the folder to write into; it is created if missing.
    """
    output_folder.mkdir(parents=True, exist_ok=True)

    # Read the date once so the file name and the header cannot disagree
    # when the run crosses midnight.
    today = datetime.now().strftime("%Y-%m-%d")

    # The translated text can be non-ASCII (e.g. Chinese), so the encoding is
    # fixed instead of relying on the locale default.
    with open(output_folder / f"{lang}_{today}.txt", "w", encoding="utf-8") as fh:
        # write the metadata (date, lang) to the file
        fh.write(f"Date: {today}\n")
        fh.write(f"Source: EN Target: {lang}\n\n")

        for inp_str in inp_strs:
            res = translate(inp_str, target_lang=lang, post_processing=False)

            fh.write(f"Input: \n\t{inp_str}\n")
            fh.write(f"Output: \n\t{res}\n\n")

if __name__ == "__main__":
    # record the raw strings and the translated strings
    rec(inp_strs, "ZH", rec_folder)

    # Compare the new record with the previous one. If they are identical
    # (ignoring the first line, which holds the date) the API behaves as
    # before and the newly created file is removed.

    # Find the two most recent records. The file names look like
    # ``ZH_2023-12-23.txt``: they share prefix and suffix, and ISO dates sort
    # chronologically as plain text, so ordering by the stem orders by date.
    files = sorted(rec_folder.glob("ZH_*.txt"), key=lambda f: f.stem, reverse=True)[:2]

    if len(files) < 2:
        print("No enough files to compare, exit")
        # ``exit`` is a helper injected by the ``site`` module; raising
        # SystemExit gives the same exit status without depending on it.
        raise SystemExit(1)

    # Compare raw bytes so the check does not depend on the locale encoding.
    with open(files[0], "rb") as fh0, open(files[1], "rb") as fh1:
        lines0 = fh0.readlines()[1:]
        lines1 = fh1.readlines()[1:]

    if lines0 == lines1:
        print("Same API behavior, remove the newly created file")
        files[0].unlink()


12 changes: 11 additions & 1 deletion tests/statics/origin_text.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,14 @@ msgid ""
msgstr ""
"`点击这里打开一个 issue <https\\://github.com/aiidateam/aiida-"
"core/issues/new?assignees=&labels=topic%2Fdocumentation&template=doc-"
"improvements.md&title=Docs%3A+404>`__"
"improvements.md&title=Docs%3A+404>`__"

#: ../../source/index.rst:174
msgid ""
"Martin Uhrin, Sebastiaan. P. Huber, Jusong Yu, Nicola Marzari, and Giovanni "
"Pizzi, *Workflows in AiiDA: Engineering a high-throughput, event-based "
"engine for robust and modular computational workflows*, Computational "
"Materials Science **187**, 110086 (2021); DOI: "
"`10.1016/j.commatsci.2020.110086 "
"<https://doi.org/10.1016/j.commatsci.2020.110086>`"
msgstr ""
54 changes: 49 additions & 5 deletions tests/test_translate.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pytest
import pathlib
from collections import namedtuple

from aiida_core_i18n import str_post_processing, po_translate
from aiida_core_i18n import str_post_processing, po_translate, translate

@pytest.fixture(scope="function")
def static_path() -> pathlib.Path:
Expand All @@ -18,11 +19,37 @@ def static_path() -> pathlib.Path:
("参见 :py:mod:`aiida.plugins`中的API文档。", "参见 :py:mod:`aiida.plugins`中的API文档。"),
]
)
def test_str_post_processing(input: str, expected: str):
def test_str_post_processing_legacy(input: str, expected: str):
"""test post process the string for code snippet"""
got = str_post_processing(input)
assert got == expected

# new test_str_post_processing where the en_source is recorded with the date.
@pytest.mark.parametrize(
    ('input', 'expected'),
    [
        (
            r"请访问 ``话语论坛 <https://aiida.discourse.group> `__``。",
            r"请访问 `话语论坛 <https://aiida.discourse.group>`__。",
        ),
    ],
)
def test_str_post_processing(input: str, expected: str):
    """Check that post-processing repairs a mangled inline URL link."""
    assert str_post_processing(input) == expected

@pytest.mark.parametrize(
    ('input', 'expected'),
    [
        (
            r"Martin Uhrin, It is a great day, Computational Materials Science **187**, 110086 (2021); DOI: `10.1016/j.commatsci.2020.110086 <https://doi.org/10.1016/j.commatsci.2020.110086>`_",
            r"Martin Uhrin, It is a great day, Computational Materials Science **187**, 110086 (2021); DOI: `10.1016/j.commatsci.2020.110086 <https://doi.org/10.1016/j.commatsci.2020.110086>`_",
        ),
    ],
)
def test_met_skip_rule(input: str, expected: str, monkeypatch):
    """A citation with a DOI link must come back from ``translate`` unchanged.

    ``deepl.Translator.translate_text`` is replaced by a stub that answers
    with a sentinel text, so the API is never called by this test.
    """
    def fake_translate_text(*args, **kwargs):
        # translate() reads the ``text`` attribute of the returned object
        reply = namedtuple("Dummy", ["text"])
        return reply("YOUSHALLNOTPASS")

    monkeypatch.setattr("deepl.Translator.translate_text", fake_translate_text)

    result = translate(input)
    assert result == expected

@pytest.fixture(scope="function")
def pot_str(static_path: pathlib.Path) -> str:
Expand All @@ -36,9 +63,26 @@ def test_po_translate_default(pot_str, file_regression):
"""The actuall process of po file
This consumes ~ 500 characters of deepl API
"""
from aiida_core_i18n import deepl_status

# This budget may not be enough for the whole file; if it is exceeded, the test fails
# and the value of max_chars needs to be reviewed.
max_chars = 500

# Get initial count
i_count = deepl_status("count")

lines = pot_str.splitlines()

translated_lines = po_translate(lines)
translated_lines = po_translate(lines, max_chars)
f_count = deepl_status("count")
used = f_count - i_count

if not used < max_chars:
pytest.fail(f"Used {used} characters, more than the max_chars {max_chars}")

print(f"Translated {used} characters in this test session")

file_regression.check('\n'.join(translated_lines))

@pytest.mark.parametrize("override", [True, False])
Expand Down Expand Up @@ -66,7 +110,7 @@ def test_po_translate_max_chars(pot_str, file_regression, monkeypatch, max_chars

def test_po_translate_raise_exception_when_no_auth_key(pot_str, monkeypatch):
"""Test the exception when no auth key"""
monkeypatch.delenv("DEEPL_TOKEN", raising=False)
monkeypatch.setattr("aiida_core_i18n.get_env_deepl_token", lambda: None)

with pytest.raises(Exception):
po_translate(pot_str.splitlines())
10 changes: 10 additions & 0 deletions tests/test_translate/test_po_translate_default.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,13 @@ msgstr ""
"`点击这里打开一个 issue <https\\://github.com/aiidateam/aiida-"
"core/issues/new?assignees=&labels=topic%2Fdocumentation&template=doc-"
"improvements.md&title=Docs%3A+404>`__"

#: ../../source/index.rst:174
msgid ""
"Martin Uhrin, Sebastiaan. P. Huber, Jusong Yu, Nicola Marzari, and Giovanni "
"Pizzi, *Workflows in AiiDA: Engineering a high-throughput, event-based "
"engine for robust and modular computational workflows*, Computational "
"Materials Science **187**, 110086 (2021); DOI: "
"`10.1016/j.commatsci.2020.110086 "
"<https://doi.org/10.1016/j.commatsci.2020.110086>`"
msgstr "Martin Uhrin、Sebastiaan.P. Huber, Jusong Yu, Nicola Marzari, and Giovanni Pizzi, *Workflows in AiiDA:为稳健的模块化计算工作流设计一个高通量、基于事件的引擎*,《计算材料科学》**187**,110086 (2021);DOI: `10.1016/j.commatsci.2020.110086 <https://doi.org/10.1016/j.commatsci.2020.110086>`"
Loading

0 comments on commit da5f31a

Please sign in to comment.