Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(translator): 优化 Gemini 翻译模板(吴恩达三步翻译法)并支持新模型 #439

Merged
merged 2 commits into from
Dec 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions book_maker/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ def parse_prompt_arg(prompt_arg):
else:
raise FileNotFoundError(f"{prompt_arg} not found")

if prompt is None or any(c not in prompt["user"] for c in ["{text}", "{language}"]):
raise ValueError("prompt must contain `{text}` and `{language}`")
#if prompt is None or any(c not in prompt["user"] for c in ["{text}", "{language}"]):
if prompt is None or any(c not in prompt["user"] for c in ["{text}"]):
raise ValueError("prompt must contain `{text}`")

if "user" not in prompt:
raise ValueError("prompt must contain the key of `user`")
Expand Down
2 changes: 2 additions & 0 deletions book_maker/loader/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from book_maker.loader.epub_loader import EPUBBookLoader
from book_maker.loader.txt_loader import TXTBookLoader
from book_maker.loader.srt_loader import SRTBookLoader
from book_maker.loader.md_loader import MarkdownBookLoader

# Registry mapping a book-format key ("epub", "txt", "srt", "md") to the
# loader class that handles that format.
# NOTE(review): the keys look like file extensions without the dot — confirm
# against the caller that performs the lookup.
BOOK_LOADER_DICT = {
"epub": EPUBBookLoader,
"txt": TXTBookLoader,
"srt": SRTBookLoader,
"md": MarkdownBookLoader,
# TODO add more here
}
176 changes: 176 additions & 0 deletions book_maker/loader/md_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import json
import sys
from pathlib import Path

from book_maker.utils import prompt_config_to_kwargs

from .base_loader import BaseBookLoader


class MarkdownBookLoader(BaseBookLoader):
    """Translate a Markdown file batch by batch and write a bilingual copy.

    The file is split into paragraphs (runs of non-blank lines, with every
    line that starts with ``#`` kept as a paragraph of its own). Paragraphs
    are grouped into batches of ``batch_size`` and each batch is sent to the
    translation model in a single ``translate`` call.

    NOTE: fenced code blocks are not treated specially, so a blank line or a
    ``#`` comment inside a fence still ends the current paragraph.
    """

    def __init__(
        self,
        md_name,
        model,
        key,
        resume,
        language,
        model_api_base=None,
        is_test=False,
        test_num=5,
        prompt_config=None,
        single_translate=False,
        context_flag=False,
        context_paragraph_limit=0,
        temperature=1.0,
    ) -> None:
        """Load the Markdown file and prepare the translation model.

        Args:
            md_name: Path of the Markdown file to translate.
            model: Translator class; it is instantiated here with ``key``,
                ``language``, ``api_base``, ``temperature`` and the keyword
                arguments derived from ``prompt_config``.
            key: Passed to ``model`` as its first positional argument.
            resume: When truthy, previously saved translations are loaded
                from ``self.bin_path`` and reused instead of re-translated.
            language: Passed to ``model`` as its second positional argument.
            model_api_base: Forwarded to ``model`` as ``api_base``.
            is_test: When truthy, translation stops early (see ``test_num``).
            test_num: In test mode, stop once the paragraph counter (which
                advances by ``batch_size`` per batch) exceeds this value.
            prompt_config: Converted with ``prompt_config_to_kwargs`` and
                forwarded to ``model``.
            single_translate: When truthy, only translations are written to
                the result; otherwise each source batch precedes its
                translation.
            context_flag: Accepted for interface compatibility; not used by
                this loader.
            context_paragraph_limit: Accepted for interface compatibility;
                not used by this loader.
            temperature: Forwarded to ``model``.

        Raises:
            Exception: If the Markdown file or the resume file cannot be read.
        """
        self.md_name = md_name
        self.translate_model = model(
            key,
            language,
            api_base=model_api_base,
            temperature=temperature,
            **prompt_config_to_kwargs(prompt_config),
        )
        self.is_test = is_test
        # One translated string per translated (non-skipped) batch, in order.
        self.p_to_save = []
        self.bilingual_result = []
        self.bilingual_temp_result = []
        self.test_num = test_num
        self.batch_size = 10
        self.single_translate = single_translate
        self.md_paragraphs = []

        try:
            with open(f"{md_name}", encoding="utf-8") as f:
                self.origin_book = f.read().splitlines()
        except Exception as e:
            raise Exception("can not load file") from e

        self.resume = resume
        self.bin_path = f"{Path(md_name).parent}/.{Path(md_name).stem}.temp.bin"
        if self.resume:
            self.load_state()

        self.process_markdown_content()

    def process_markdown_content(self):
        """Split ``self.origin_book`` (a list of lines) into Markdown paragraphs.

        The result is stored in ``self.md_paragraphs``. Blank lines separate
        paragraphs and are never stored themselves; a line starting with
        ``#`` closes the paragraph in progress and becomes its own paragraph.
        The list is rebuilt from scratch, so calling this twice is harmless.
        """
        self.md_paragraphs = []
        current_paragraph = []

        def flush():
            # Store the paragraph in progress, if any, and start a new one.
            if current_paragraph:
                self.md_paragraphs.append("\n".join(current_paragraph))
                current_paragraph.clear()

        for line in self.origin_book:
            stripped = line.strip()
            if not stripped:
                # Blank line: paragraph boundary only, never paragraph content.
                flush()
            elif stripped.startswith("#"):
                # Heading line: always a paragraph of its own.
                flush()
                self.md_paragraphs.append(line)
            else:
                current_paragraph.append(line)

        # Handle the last paragraph.
        flush()

    @staticmethod
    def _is_special_text(text):
        """Return True for text that is empty, all digits, or all whitespace."""
        return not text or text.isdigit() or text.isspace()

    def _make_new_book(self, book):
        """No-op in this loader."""
        pass

    def _iter_batches(self):
        """Yield the text of each batch: its paragraphs joined by a blank line."""
        for start in range(0, len(self.md_paragraphs), self.batch_size):
            yield "\n\n".join(self.md_paragraphs[start : start + self.batch_size])

    def _translate_batch(self, batch_text):
        """Translate one batch, retrying up to three times on AttributeError.

        Raises:
            Exception: If translation fails; the original error is chained.
        """
        max_retries = 3
        try:
            for attempt in range(1, max_retries + 1):
                try:
                    return self.translate_model.translate(batch_text)
                except AttributeError as ae:
                    print(f"翻译出错: {ae}")
                    if attempt == max_retries:
                        raise Exception("翻译模型初始化失败") from ae
        except Exception as e:
            print(f"翻译过程中出错: {e}")
            raise Exception("翻译过程中出现错误") from e

    def make_bilingual_book(self):
        """Translate every batch and write ``<stem>_bilingual.md``.

        When resuming, batches that already have a saved translation reuse
        it (and still appear in the output) instead of being translated
        again. On any error or Ctrl-C the progress and a temporary book are
        saved and the process exits with status 1.
        """
        # Advances by batch_size per batch; only drives the test-mode cut-off.
        paragraph_count = 0
        # Number of non-skipped batches handled so far; indexes into p_to_save.
        batch_count = 0
        resumed_batches = len(self.p_to_save) if self.resume else 0

        try:
            for batch_text in self._iter_batches():
                if self._is_special_text(batch_text):
                    continue

                if batch_count < resumed_batches:
                    # Already translated in a previous run.
                    translated = self.p_to_save[batch_count]
                else:
                    translated = self._translate_batch(batch_text)
                    self.p_to_save.append(translated)

                if not self.single_translate:
                    self.bilingual_result.append(batch_text)
                self.bilingual_result.append(translated)

                batch_count += 1
                paragraph_count += self.batch_size
                if self.is_test and paragraph_count > self.test_num:
                    break

            self.save_file(
                f"{Path(self.md_name).parent}/{Path(self.md_name).stem}_bilingual.md",
                self.bilingual_result,
            )

        except (KeyboardInterrupt, Exception) as e:
            print(f"发生错误: {e}")
            print("程序将保存进度,您可以稍后继续")
            self._save_progress()
            self._save_temp_book()
            sys.exit(1)  # non-zero exit code signals the failure

    def _save_temp_book(self):
        """Write ``<stem>_bilingual_temp.txt`` with what has been translated.

        Every source batch is written; each non-skipped batch is followed by
        its translation while saved translations remain. Batches are formed
        exactly as in ``make_bilingual_book`` so the two stay aligned.
        """
        self.bilingual_temp_result = []
        translations = iter(self.p_to_save)

        for batch_text in self._iter_batches():
            self.bilingual_temp_result.append(batch_text)
            if self._is_special_text(batch_text):
                continue
            translated = next(translations, None)
            if translated is not None:
                self.bilingual_temp_result.append(translated)

        self.save_file(
            f"{Path(self.md_name).parent}/{Path(self.md_name).stem}_bilingual_temp.txt",
            self.bilingual_temp_result,
        )

    def _save_progress(self):
        """Persist ``self.p_to_save`` to ``self.bin_path`` as a JSON list.

        JSON keeps each batch translation as one entry even when it spans
        several lines, which a newline-joined file cannot do.

        Raises:
            Exception: If the file cannot be written.
        """
        try:
            with open(self.bin_path, "w", encoding="utf-8") as f:
                json.dump(self.p_to_save, f, ensure_ascii=False)
        except Exception as e:
            raise Exception("can not save resume file") from e

    def load_state(self):
        """Restore ``self.p_to_save`` from ``self.bin_path``.

        Reads the JSON list written by ``_save_progress``; a file that is not
        a JSON list of strings is treated as the older newline-separated
        format and split into lines.

        Raises:
            Exception: If the file cannot be read.
        """
        try:
            with open(self.bin_path, encoding="utf-8") as f:
                raw = f.read()
        except Exception as e:
            raise Exception("can not load resume file") from e

        try:
            saved = json.loads(raw)
        except ValueError:
            saved = None

        if isinstance(saved, list) and all(isinstance(item, str) for item in saved):
            self.p_to_save = saved
        else:
            self.p_to_save = raw.splitlines()

    def save_file(self, book_path, content):
        """Write the strings in ``content`` to ``book_path``, one per line.

        Raises:
            Exception: If the file cannot be written.
        """
        try:
            with open(book_path, "w", encoding="utf-8") as f:
                f.write("\n".join(content))
        except Exception as e:
            raise Exception("can not save file") from e
10 changes: 9 additions & 1 deletion book_maker/translator/gemini_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
"gemini-1.5-flash-latest",
"gemini-1.5-flash-001",
"gemini-1.5-flash-002",
"gemini-2.0-flash-exp",
]


Expand Down Expand Up @@ -75,7 +76,7 @@ def __init__(
or environ.get(PROMPT_ENV_MAP["system"])
or None # Allow None, but not empty string
)

self.interval = 3
genai.configure(api_key=next(self.keys))
generation_config["temperature"] = temperature

Expand Down Expand Up @@ -119,6 +120,13 @@ def translate(self, text):
self.prompt.format(text=text, language=self.language)
)
t_text = self.convo.last.text.strip()
# 检查是否包含特定标签,如果有则只返回标签内的内容
tag_pattern = r'<step3_refined_translation>(.*?)</step3_refined_translation>'
tag_match = re.search(tag_pattern, t_text, re.DOTALL)
if tag_match:
print("[bold green]" + re.sub("\n{3,}", "\n\n", t_text) + "[/bold green]")
t_text = tag_match.group(1).strip()
#print("[bold green]" + re.sub("\n{3,}", "\n\n", t_text) + "[/bold green]")
break
except StopCandidateException as e:
print(
Expand Down
4 changes: 4 additions & 0 deletions prompt_md.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"system": "You are a highly skilled translator responsible for translating the content of books in Markdown format from English into Chinese.",
"user": "## Strategies\nYou will follow a three-step translation process:\n### 1. Translate the input content from English into Chinese, respect the intention of the original text, keep the original Markdown format unchanged, and do not delete or omit any content, nor add additional explanations or remarks.\n### 2. Read the original text and the translation carefully, and then put forward constructive criticism and helpful suggestions to improve the translation. The final style and tone of the translation should conform to the Chinese language style.\nYou must strictly follow the rules below.\n- Never change the Markdown markup structure. Don't add or remove links. Do not change any URL.\n- Never touch or change the contents of code blocks even if they appear to have a bug.\n- Always preserve the original line breaks. Do not add or remove blank lines.\n- Never touch any permalink at the end of each heading.\n- Never touch HTML-like tags such as `<Notes>`.\nWhen writing suggestions, pay attention to whether there are ways to improve the translation in terms of:\n- Accuracy (by correcting errors such as additions, mistranslations, omissions or untranslated text).\n- Fluency (by applying the rules of Chinese grammar, spelling and punctuation, and ensuring there is no unnecessary repetition).\n- Conciseness and abbreviation (please appropriately simplify and abbreviate the translation result while keeping the original meaning unchanged to avoid the translation being too lengthy).\n### 3. Based on the results of steps 1 and 2, refine and polish the translation, and do not add additional explanations or remarks.\n## Output\nFor each step of the translation process, output the results within the appropriate XML tags:\n<step1_initial_translation>\n[Insert your initial translation here.]\n</step1_initial_translation>\n<step2_reflection>\n[Insert your reflection on the translation and put forward specific here, useful and constructive suggestions to improve the translation. 
Each suggestion should target a specific part of the translation.]\n</step2_reflection>\n<step3_refined_translation>\n[Insert your refined and polished translation here.]\n</step3_refined_translation>\n## Input\nThe following is the content of the book that needs to be translated within the <INPUT> tag:\n<INPUT>{text}</INPUT>"
}
4 changes: 2 additions & 2 deletions prompt_template_sample.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"system": "You are a professional translator.",
"user": "Translate the given text to {language}. Be faithful or accurate in translation. Make the translation readable or intelligible. Be elegant or natural in translation. If the text cannot be translated, return the original text as is. Do not translate person's name. Do not add any additional text in the translation. The text to be translated is:\n{text}"
"system": "You are a highly skilled academic translator. Please complete the translation task according to the following instructions and provide only the final polished translation.",
"user": "## Strategies\nYou will follow a three-step translation process:\n### Step.1 Initial Direct Translation: Translate the content from English to Chinese sentence by sentence, respecting the original intent without deleting, omitting, or adding any extra explanations or notes.\n ### Step.2 Reflection and Revision: Carefully review both the input content and the initial direct translation from Step 1. Check if the translation conveys the original meaning, if the grammatical structure is correct, if word choices are appropriate, and if there are any ambiguities or polysemous words. The final style and tone should conform to Chinese language conventions. \nYou must strictly follow the rules below.\n- Don't add or remove links. Do not change any URL.\n- Do not translate the reference list.\n- Never touch,change or translate the mathematical formulas.\n- Never touch,change or translate the contents of code blocks even if they appear to have a bug.\n- Always preserve the original line breaks. Do not add or remove blank lines.\nProvide constructive criticism and helpful suggestions to improve: \n- translation accuracy (correct additions, mistranslations, omissions, or untranslated text errors),\n- fluency (apply Chinese grammar, spelling, and punctuation rules, and ensure no unnecessary repetition), \n- conciseness (streamline the translation results while maintaining the original meaning, avoiding wordiness).\n ### Step.3 Polish and Optimize: Based on the results from Steps 1 and 2, refine and polish the translation, ensuring the final translation adheres to Chinese style without additional explanations or notes. The content to be translated is wrapped in the following <INPUT> tags:\n\n<INPUT>{text}</INPUT>. \n\nPlease write and output only the final polished translation here: "
}
Loading