Skip to content

Commit

Permalink
添加llm_translate命令用于改进翻译效果
Browse files Browse the repository at this point in the history
  • Loading branch information
abse4411 committed Apr 23, 2024
1 parent b582986 commit 8a47658
Show file tree
Hide file tree
Showing 12 changed files with 337 additions and 174 deletions.
172 changes: 90 additions & 82 deletions README.md

Large diffs are not rendered by default.

171 changes: 90 additions & 81 deletions README_zh.md

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions command/base/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import argparse
from typing import Tuple

from config import default_config
from store import TranslationIndex
Expand Down Expand Up @@ -45,7 +46,7 @@ def __init__(self, name: str, description: str):
self._nick_name = None

@staticmethod
def parse_index_or_name(index_or_name: str):
def parse_index_or_name(index_or_name: str) -> Tuple[int, str]:
index = None
try:
index = int(index_or_name)
Expand All @@ -58,7 +59,7 @@ def parse_args(self, text: str):
super().parse_args(text)
self._index, self._nick_name = self.parse_index_or_name(self.args.index_or_name)

def get_translation_index(self):
def get_translation_index(self) -> TranslationIndex:
res = TranslationIndex.from_docid_or_nickname(self._index, self._nick_name)
assert res is not None, \
(f'Could\'n load this TranslationIndex with "{self.args.index_or_name}". '
Expand Down
8 changes: 4 additions & 4 deletions command/file/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def check_untranslated_tidmap(self, index: TranslationIndex):

class SaveFileBaseCmd(FileBaseCmd):
def __init__(self, name: str, file_type: str, file_ext: str):
description = f'Save untranslated lines of the give language to a {file_type} file.'
description = f'Save untranslated lines of the given language to a {file_type} file.'
super().__init__(name, description, file_type, file_ext)
save_filename = os.path.join(self.config.project_path, file_type, f'nickname_tag_lang.{file_ext}')
self._parser.add_argument("-f", "--file", required=False, type=str, metavar=f'{file_type}_file',
Expand Down Expand Up @@ -106,7 +106,7 @@ def invoke(self):

class DumpToFileBaseCmd(FileBaseCmd):
def __init__(self, name: str, file_type: str, file_ext: str):
description = f'Dump translations of the give language to a {file_type} file.'
description = f'Dump translations of the given language to a {file_type} file.'
super().__init__(name, description, file_type, file_ext)
save_filename = os.path.join(self.config.project_path, file_type, f'nickname_tag_lang_dump.{file_ext}')
self._parser.add_argument("-f", "--file", required=False, type=str, metavar=f'{file_type}_file',
Expand Down Expand Up @@ -147,7 +147,7 @@ def invoke(self):

class LoadFileBaseCmd(FileBaseCmd):
def __init__(self, name: str, file_type: str, file_ext: str):
description = f'Load translated lines of the give language from a {file_type} file.'
description = f'Load translated lines of the given language from a {file_type} file.'
super().__init__(name, description, file_type, file_ext)
save_filename = os.path.join(self.config.project_path, file_type, f'nickname_tag_lang.{file_ext}')
self._parser.add_argument("-f", "--file", required=False, type=str, metavar=f'{file_type}_file',
Expand Down Expand Up @@ -177,7 +177,7 @@ def invoke(self):

class UpdateFromFileBaseCmd(FileBaseCmd):
def __init__(self, name: str, file_type: str, file_ext: str):
description = f'Update translations of the give language from a {file_type} file.'
description = f'Update translations of the given language from a {file_type} file.'
super().__init__(name, description, file_type, file_ext)
save_filename = os.path.join(self.config.project_path, file_type, f'nickname_tag_lang_dump.{file_ext}')
self._parser.add_argument("-f", "--file", required=False, type=str, metavar=f'{file_type}_file',
Expand Down
1 change: 1 addition & 0 deletions command/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def invoke(self):

# Translator
register(TranslateCmd(), short_name='t')
register(LLMAugmentTranslateCmd(), short_name='lt')

# Operations for TranslationIndex
register(ListTranslationIndexCmd(), short_name='l')
Expand Down
1 change: 1 addition & 0 deletions command/translation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from .base import TranslateCmd
from .llm import LLMAugmentTranslateCmd
7 changes: 3 additions & 4 deletions command/translation/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(self):
self.reinit()

def reinit(self):
super().__init__('translate', 'Translate untranslated lines of the give language\n using '
super().__init__('translate', 'Translate untranslated lines of the given language\n using '
'the specified translator.')
self._parser.add_argument("-t", "--translator", type=str, choices=list(_TRANSLATOR.keys()), required=True,
help="The translator to use.")
Expand All @@ -54,7 +54,7 @@ def get_untranslated_lines(self):
tids_and_texts = tids_and_texts[:self.args.limit]
print(f'The max number of lines is set to {self.args.limit}.')
if not tids_and_texts:
print('No untranslated lines to update.')
print('No untranslated lines to translate.')
else:
res = []
if tids_and_texts:
Expand Down Expand Up @@ -121,11 +121,10 @@ def _update(tlist):
print(f'Discard blank line: {raw_text}')
else:
use_cnt += 1
new_tlist.append([tid, new_text])
new_tlist.append((tid, new_text))
continue
print(f'Find {use_cnt} translated lines, and discord {len(tlist) - use_cnt} lines')
index.update_translations(self.args.lang, new_tlist,
untranslated_only=True, discord_blank=accept_blank,
say_only=self.config.say_only)

self._translator.invoke(tids_and_texts, _update)
139 changes: 139 additions & 0 deletions command/translation/llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# projz_renpy_translation, a translator for RenPy games
# Copyright (C) 2023 github.com/abse4411
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging

import tqdm

from command import BaseLangIndexCmd
from config import default_config
from store.database.base import flush
from store.group import group_translations_by, ALL
from store.misc import strip_tags
from trans.openai_api import OpenAITranslator
from util import strip_or_none


class _InnerTranslator(OpenAITranslator):
    """OpenAITranslator variant whose chat history can be seeded and reset by the caller.

    Besides what the parent sets up, it remembers the role name used for
    assistant messages, read from the ``translator.open_ai`` config section.
    """

    def __init__(self, model: str = None, target_lang: str = None, max_turns: int = None, verbose: bool = True):
        super().__init__(model, target_lang, max_turns, verbose)
        # Falls back to 'assistant' when the config has no 'assistant_role' entry.
        self.assistant_role = default_config['translator']['open_ai'].get('assistant_role', 'assistant')

    def append_text(self, raw_text, new_text):
        """Record ``raw_text`` / ``new_text`` as one user/assistant exchange in the history.

        The user message is a copy of the ``_user_msg`` template with its
        content formatted for the current target language and ``raw_text``.
        """
        reply = dict(role=self.assistant_role, content=new_text)
        prompt = self._user_msg.copy()
        prompt['content'] = self._user_msg['content'].format(target_lang=self._target_lang, text=raw_text)
        # Hand both messages to the message manager as a single turn.
        self._msg_manager.put(prompt, reply)

    def clear_chat(self):
        """Reset the conversation by clearing the message manager."""
        self._msg_manager.clear()


class LLMAugmentTranslateCmd(BaseLangIndexCmd):
    """Command ``llm_translate``: translate untranslated lines with an LLM.

    Lines are visited group by group (grouped by ``filename`` and ordered via
    ``linenumber`` through ``group_translations_by``). Inside a group, lines
    that already have a translation are fed to the translator as prior chat
    turns, and lines without one are sent for translation.
    """

    def __init__(self):
        super().__init__('llm_translate', 'Translate untranslated lines of the given language\n using '
                                          'the LLM Augment Translating.')
        # Floor applied to the configured write_cache_size in invoke().
        self._cache_size = 100
        self._parser.add_argument('-m', '--model', help='The LLM model to use.')
        self._parser.add_argument('-t', '--target_lang', help='The {target_lang} in the prompt.')
        self._parser.add_argument("-a", "--auto", action='store_true',
                                  help="Load translation settings form config.")
        self._parser.add_argument("-ab", "--accept_blank", action='store_true',
                                  help="Accept blank translated lines from the translator when updating translations.")
        self._parser.add_argument('--limit', type=int, default=-1,
                                  help='The max number of lines to be translated. Negative values mean no limit.')

    @staticmethod
    def _source_text(entry):
        """Return the tag-stripped source text of ``entry`` (``None`` when ``strip_or_none`` yields None)."""
        return strip_or_none(strip_tags(entry['old_text']))

    def _store_translations(self, index, pairs):
        """Write a batch of ``(tid, new_text)`` pairs to ``index`` for the selected language."""
        # NOTE(review): `discord_blank` receives `accept_blank`, mirroring the call in
        # command/translation/base.py; the parameter name suggests the opposite meaning,
        # so confirm against update_translations().
        index.update_translations(self.args.lang, pairs,
                                  untranslated_only=True, discord_blank=self.args.accept_blank,
                                  say_only=self.config.say_only)

    def invoke(self):
        """Run the command.

        Resolves the model and target language (from config with ``--auto``,
        otherwise from ``--model`` / ``--target_lang``), counts the lines that
        need translating, then translates at most ``--limit`` of them, saving
        results in batches of ``write_cache_size`` (never fewer than
        ``self._cache_size``) and once more on exit.

        Raises:
            AssertionError: when ``--auto`` is absent and either the model or
                the target language is missing.
        """
        if self.args.auto:
            oconfig = default_config['translator']['open_ai']
            target_lang = oconfig['target_lang']
            model = oconfig['chat']['completions']['model']
            print(f'target_lang: {target_lang}')
            print(f'model: {model}')
        else:
            model = self.args.model
            target_lang = self.args.target_lang
            assert model and target_lang, (f'Both of model ({model}) and target_lang ({target_lang}) should provider '
                                           f'if arg "--auto" is not presented')

        cache_size = self.config['translator']['write_cache_size']
        if cache_size < self._cache_size:
            logging.warning(f'Low write_cache_size({cache_size}) means more frequent disk I/O operations,'
                            f' it may cause a high system load.')
            cache_size = self._cache_size

        index = self.get_translation_index()
        group_map = group_translations_by('filename', 'linenumber', ALL,
                                          index, self.args.lang, reverse=False,
                                          say_only=self.config.say_only)
        # A line needs translating when it has no translation yet and its
        # source text is not empty once tags are stripped.
        n_untrans = sum(1
                        for group in group_map.values()
                        for entry in group
                        if entry['new_text'] is None and self._source_text(entry) is not None)
        if n_untrans == 0:
            print('No untranslated lines to translate.')
            return
        if self.args.limit >= 0:
            print(f'The max number of lines is set to {self.args.limit}.')
            n_untrans = min(self.args.limit, n_untrans)
        if n_untrans == 0:
            # --limit 0: the budget is empty, so do not even open the translator.
            return

        accept_blank = self.args.accept_blank
        done = 0
        pending = []
        translator = _InnerTranslator(model, target_lang)
        try:
            with tqdm.tqdm(total=n_untrans, desc='Translating') as bar:
                for group in group_map.values():
                    # Each group starts with a fresh conversation.
                    translator.clear_chat()
                    for entry in group:
                        raw_text = self._source_text(entry)
                        if entry['new_text'] is not None:
                            # Already translated: replay it as a previous chat turn.
                            new_text = strip_or_none(strip_tags(entry['new_text']))
                            if raw_text and new_text:
                                translator.append_text(raw_text, new_text)
                            continue
                        if raw_text is None:
                            continue

                        bar.update(1)
                        done += 1
                        new_text = translator.translate(raw_text)
                        # The None check is purely defensive; the visible code does not
                        # show whether translate() can return None.
                        if new_text is None or new_text == raw_text:
                            print(f'Discard untranslated line: {raw_text}')
                        elif new_text.strip() == '' and not accept_blank:
                            print(f'Discard blank line: {raw_text}')
                        else:
                            pending.append((entry['tid'], new_text))

                        if done % cache_size == 0 and pending:
                            self._store_translations(index, pending)
                            pending = []
                            print('Flushing...')
                            flush()
                        # `>=` (not `>`): stop as soon as the budget is used up, otherwise
                        # one line beyond --limit would be translated.
                        if done >= n_untrans:
                            break
                    if done >= n_untrans:
                        break
        finally:
            try:
                translator.close()
            finally:
                # Save whatever is still buffered even if close() raised.
                if pending:
                    self._store_translations(index, pending)
1 change: 1 addition & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ projz:
open_ai:
target_lang: 'Chinese'
user_role: &user_role 'user'
assistant_role: &assistant_role 'assistant'
max_turns: 8 # Specify the maximum number of rounds for the conversation.
langs: ['Afrikaans','Albanian','Amharic','Arabic','Armenian','Asturian','Azerbaijani','Bashkir','Belarusian','Bengali','Bosnian','Breton','Bulgarian','Burmese','Catalan','Cebuano','Central Khmer','Chinese','Croatian','Czech','Danish','Dutch','English','Estonian','Finnish','Flemish','French','Fulah','Gaelic','Galician','Ganda','Georgian','German','Greek','Gujarati','Haitian','Haitian Creole','Hausa','Hebrew','Hindi','Hungarian','Icelandic','Igbo','Iloko','Indonesian','Irish','Italian','Japanese','Javanese','Kannada','Kazakh','Khmer','Korean','Lao','Latvian','Letzeburgesch','Lingala','Lithuanian','Luxembourgish','Macedonian','Malagasy','Malay','Malayalam','Marathi','Moldavian','Moldovan','Mongolian','Nepali','Northern Sotho','Norwegian','Occitan','Oriya','Panjabi','Pashto','Persian','Polish','Portuguese','Punjabi','Pushto','Romanian','Russian','Scottish Gaelic','Serbian','Sindhi','Sinhala','Sinhalese','Slovak','Slovenian','Somali','Spanish','Sundanese','Swahili','Swati','Swedish','Tagalog','Tamil','Thai','Tswana','Turkish','Ukrainian','Urdu','Uzbek','Valencian','Vietnamese','Welsh','Western Frisian','Wolof','Xhosa','Yiddish','Yoruba','Zulu']
models: ['qwen:0.5b','qwen:1.8b', 'qwen:4b', 'qwen:7b', 'qwen:14b', 'qwen:72b', 'gpt-4','gpt-4-0314','gpt-4-32k','gpt-4-32k-0314','gpt-3.5-turbo','gpt-3.5-turbo-0301']
Expand Down
2 changes: 1 addition & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from util import my_input, line_to_args, is_windows
import translator

__VERSION__ = '0.4.4'
__VERSION__ = '0.4.5'


def print_banner():
Expand Down
1 change: 1 addition & 0 deletions store/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def _merge(vi: dict, bi: dict):
res['old_text'] = res['code']
res['new_text'] = res['new_code']
# we pop items that may confuse the user
res.pop('block')
res.pop('new_code')
res.pop('who')
res.pop('what')
Expand Down
3 changes: 3 additions & 0 deletions trans/openai_api/wraaper.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ def put(self, user_msg, assistant_msg):
if len(self._msgs) // 2 > self._max_turns:
self._msgs = self._msgs[2:]

def clear(self):
# Empty the stored message list `_msgs` in place; `_sys_msg` is not modified here.
self._msgs.clear()

def __len__(self):
# Reported size is len(_sys_msg) plus len(_msgs).
# NOTE(review): if `_sys_msg` is a single message dict, len() counts its keys
# rather than one message — confirm its type in the constructor.
return len(self._sys_msg) + len(self._msgs)

Expand Down

0 comments on commit 8a47658

Please sign in to comment.