From 5f7d526b9a24476e817c786abbd441c3c0f58d1a Mon Sep 17 00:00:00 2001
From: colin4k
Date: Sun, 22 Dec 2024 10:26:10 +0800
Subject: [PATCH] support md file type

---
 book_maker/loader/__init__.py              |   2 +
 book_maker/loader/md_loader.py             | 207 +++++++++++++++++++++
 book_maker/translator/gemini_translator.py |   2 +-
 prompt_md.json                             |   4 +
 prompt_template_sample.json                |   6 +-
 5 files changed, 217 insertions(+), 4 deletions(-)
 create mode 100644 book_maker/loader/md_loader.py
 create mode 100644 prompt_md.json

diff --git a/book_maker/loader/__init__.py b/book_maker/loader/__init__.py
index c8c79618..ff8a49ea 100644
--- a/book_maker/loader/__init__.py
+++ b/book_maker/loader/__init__.py
@@ -1,10 +1,12 @@
 from book_maker.loader.epub_loader import EPUBBookLoader
 from book_maker.loader.txt_loader import TXTBookLoader
 from book_maker.loader.srt_loader import SRTBookLoader
+from book_maker.loader.md_loader import MarkdownBookLoader
 
 BOOK_LOADER_DICT = {
     "epub": EPUBBookLoader,
     "txt": TXTBookLoader,
     "srt": SRTBookLoader,
+    "md": MarkdownBookLoader,
     # TODO add more here
 }
diff --git a/book_maker/loader/md_loader.py b/book_maker/loader/md_loader.py
new file mode 100644
--- /dev/null
+++ b/book_maker/loader/md_loader.py
@@ -0,0 +1,207 @@
+import json
+import sys
+from pathlib import Path
+
+from book_maker.utils import prompt_config_to_kwargs
+
+from .base_loader import BaseBookLoader
+
+
+class MarkdownBookLoader(BaseBookLoader):
+    """Translate a Markdown file batch by batch into a bilingual book."""
+
+    def __init__(
+        self,
+        md_name,
+        model,
+        key,
+        resume,
+        language,
+        model_api_base=None,
+        is_test=False,
+        test_num=5,
+        prompt_config=None,
+        single_translate=False,
+        context_flag=False,
+        context_paragraph_limit=0,
+        temperature=1.0,
+    ) -> None:
+        # context_flag and context_paragraph_limit are accepted but unused.
+        self.md_name = md_name
+        self.translate_model = model(
+            key,
+            language,
+            api_base=model_api_base,
+            temperature=temperature,
+            **prompt_config_to_kwargs(prompt_config),
+        )
+        self.is_test = is_test
+        self.p_to_save = []  # one translated string per translated batch
+        self.bilingual_result = []
+        self.bilingual_temp_result = []
+        self.test_num = test_num
+        self.batch_size = 10  # paragraphs joined into one translate() call
+        self.single_translate = single_translate
+        self.md_paragraphs = []
+
+        try:
+            with open(md_name, encoding="utf-8") as f:
+                self.origin_book = f.read().splitlines()
+        except Exception as e:
+            raise Exception("can not load file") from e
+
+        self.resume = resume
+        self.bin_path = f"{Path(md_name).parent}/.{Path(md_name).stem}.temp.bin"
+        if self.resume:
+            self.load_state()
+
+        self.process_markdown_content()
+
+    def process_markdown_content(self):
+        """Split the raw lines into Markdown paragraphs.
+
+        Blank lines separate paragraphs and each heading line is a paragraph
+        of its own. Lines inside a fenced code block stay together, so blank
+        lines and ``#`` comments in code do not split the block.
+        """
+        current_paragraph = []
+        fence = None  # fence character while inside a fenced code block
+        for line in self.origin_book:
+            stripped = line.strip()
+            if fence is not None:
+                current_paragraph.append(line)
+                if stripped.startswith(fence * 3):
+                    fence = None
+            elif stripped.startswith(("```", "~~~")):
+                fence = stripped[0]
+                current_paragraph.append(line)
+            elif not stripped:
+                self._flush_paragraph(current_paragraph)
+            elif stripped.startswith("#"):
+                self._flush_paragraph(current_paragraph)
+                self.md_paragraphs.append(line)
+            else:
+                current_paragraph.append(line)
+
+        # Flush the trailing paragraph.
+        self._flush_paragraph(current_paragraph)
+
+    def _flush_paragraph(self, lines):
+        """Store ``lines`` as one paragraph and empty the list in place."""
+        if lines:
+            self.md_paragraphs.append("\n".join(lines))
+            lines.clear()
+
+    @staticmethod
+    def _is_special_text(text):
+        return text.isdigit() or text.isspace() or len(text) == 0
+
+    def _make_new_book(self, book):
+        pass
+
+    def _batches(self):
+        """Return the paragraphs grouped into lists of ``batch_size``."""
+        return [
+            self.md_paragraphs[i : i + self.batch_size]
+            for i in range(0, len(self.md_paragraphs), self.batch_size)
+        ]
+
+    def _translate_batch(self, batch_text):
+        """Translate one batch, retrying up to three times on AttributeError."""
+        max_retries = 3
+        try:
+            for attempt in range(1, max_retries + 1):
+                try:
+                    return self.translate_model.translate(batch_text)
+                except AttributeError as ae:
+                    print(f"翻译出错: {ae}")
+                    if attempt == max_retries:
+                        raise Exception("翻译模型初始化失败") from ae
+        except Exception as e:
+            print(f"翻译过程中出错: {e}")
+            raise Exception("翻译过程中出现错误") from e
+
+    def make_bilingual_book(self):
+        """Translate every batch and write ``<stem>_bilingual.md``.
+
+        Batches already present in ``p_to_save`` (loaded on resume) are reused
+        instead of being translated again. On any error or Ctrl-C the progress
+        and a partial book are saved and the process exits with status 1.
+        """
+        batch_index = 0  # position of the current batch in p_to_save
+        paragraph_count = 0  # compared against test_num in test mode
+        saved_count = len(self.p_to_save)
+
+        try:
+            for paragraphs in self._batches():
+                batch_text = "\n\n".join(paragraphs)
+                if self._is_special_text(batch_text):
+                    continue
+                if batch_index < saved_count:
+                    temp = self.p_to_save[batch_index]
+                else:
+                    temp = self._translate_batch(batch_text)
+                    self.p_to_save.append(temp)
+                if not self.single_translate:
+                    self.bilingual_result.append(batch_text)
+                self.bilingual_result.append(temp)
+                batch_index += 1
+                paragraph_count += self.batch_size
+                if self.is_test and paragraph_count > self.test_num:
+                    break
+
+            self.save_file(
+                f"{Path(self.md_name).parent}/{Path(self.md_name).stem}_bilingual.md",
+                self.bilingual_result,
+                separator="\n\n",
+            )
+
+        except (KeyboardInterrupt, Exception) as e:
+            print(f"发生错误: {e}")
+            print("程序将保存进度,您可以稍后继续")
+            self._save_progress()
+            self._save_temp_book()
+            sys.exit(1)  # non-zero exit status signals the failure
+
+    def _save_temp_book(self):
+        """Write each original batch, plus its translation if any, to a temp file."""
+        self.bilingual_temp_result = []  # rebuilt on every call
+        index = 0
+        for paragraphs in self._batches():
+            batch_text = "\n\n".join(paragraphs)
+            self.bilingual_temp_result.append(batch_text)
+            if self._is_special_text(batch_text):
+                continue
+            if index < len(self.p_to_save):
+                self.bilingual_temp_result.append(self.p_to_save[index])
+            index += 1
+
+        self.save_file(
+            f"{Path(self.md_name).parent}/{Path(self.md_name).stem}_bilingual_temp.txt",
+            self.bilingual_temp_result,
+            separator="\n\n",
+        )
+
+    def _save_progress(self):
+        """Store the translated batches as JSON so a later run can resume."""
+        try:
+            with open(self.bin_path, "w", encoding="utf-8") as f:
+                json.dump(self.p_to_save, f, ensure_ascii=False)
+        except Exception as e:
+            raise Exception("can not save resume file") from e
+
+    def load_state(self):
+        """Load the translated batches written by ``_save_progress``."""
+        try:
+            with open(self.bin_path, encoding="utf-8") as f:
+                self.p_to_save = json.load(f)
+        except Exception as e:
+            raise Exception("can not load resume file") from e
+
+    def save_file(self, book_path, content, separator="\n"):
+        """Write the ``content`` items to ``book_path`` joined by ``separator``."""
+        try:
+            with open(book_path, "w", encoding="utf-8") as f:
+                f.write(separator.join(content))
+        except Exception as e:
+            raise Exception("can not save file") from e
diff --git a/book_maker/translator/gemini_translator.py b/book_maker/translator/gemini_translator.py
index 99cbc9ee..139daa9c 100644
--- a/book_maker/translator/gemini_translator.py
+++ b/book_maker/translator/gemini_translator.py
@@ -76,7 +76,7 @@ def __init__(
             or
environ.get(PROMPT_ENV_MAP["system"]) or None # Allow None, but not empty string ) - + self.interval = 3 genai.configure(api_key=next(self.keys)) generation_config["temperature"] = temperature diff --git a/prompt_md.json b/prompt_md.json new file mode 100644 index 00000000..4a06bd6c --- /dev/null +++ b/prompt_md.json @@ -0,0 +1,4 @@ +{ + "system": "You are a highly skilled translator responsible for translating the content of books in Markdown format from English into Chinese.", + "user": "## Strategies\nYou will follow a three-step translation process:\n### 1. Translate the input content from English into Chinese, respect the intention of the original text, keep the original Markdown format unchanged, and do not delete or omit any content, nor add additional explanations or remarks.\n### 2. Read the original text and the translation carefully, and then put forward constructive criticism and helpful suggestions to improve the translation. The final style and tone of the translation should conform to the Chinese language style.\nYou must strictly follow the rules below.\n- Never change the Markdown markup structure. Don't add or remove links. Do not change any URL.\n- Never touch or change the contents of code blocks even if they appear to have a bug.\n- Always preserve the original line breaks. Do not add or remove blank lines.\n- Never touch any permalink at the end of each heading.\n- Never touch HTML-like tags such as ``.\nWhen writing suggestions, pay attention to whether there are ways to improve the translation in terms of:\n- Accuracy (by correcting errors such as additions, mistranslations, omissions or untranslated text).\n- Fluency (by applying the rules of Chinese grammar, spelling and punctuation, and ensuring there is no unnecessary repetition).\n- Conciseness and abbreviation (please appropriately simplify and abbreviate the translation result while keeping the original meaning unchanged to avoid the translation being too lengthy).\n### 3. 
Based on the results of steps 1 and 2, refine and polish the translation, and do not add additional explanations or remarks.\n## Output\nFor each step of the translation process, output the results within the appropriate XML tags:\n\n[Insert your initial translation here.]\n\n\n[Insert your reflection on the translation here and put forward specific, useful and constructive suggestions to improve the translation. Each suggestion should target a specific part of the translation.]\n\n\n[Insert your refined and polished translation here.]\n\n## Input\nThe following is the content of the book that needs to be translated within the tag:\n{text}" +} \ No newline at end of file diff --git a/prompt_template_sample.json b/prompt_template_sample.json index 54bd3725..2f5342ca 100644 --- a/prompt_template_sample.json +++ b/prompt_template_sample.json @@ -1,4 +1,4 @@ { - "system": "您是一位技术精湛的翻译人员,负责将书籍的内容从其他语言翻译成中文。", - "user": "## 策略\n您将遵循三步翻译流程:\n1. 将输入的内容翻译成中文,尊重原文意图,保持原始SRT格式不变,不删除或省略任何内容,也不添加额外的解释或说明。\n2. 仔细阅读原文和译文,然后提出建设性的批评和有益的建议以改进翻译。翻译的最终风格和语气应符合中文语言风格。写建议时,注意是否有办法改进翻译的\n- 准确性 (通过纠正添加、误译、遗漏或未翻译文本的错误)\n- 流畅性 (通过应用中文语法、拼写和标点符号规则,并确保没有不必要的重复)\n- 精简缩写 (请在保持原意不变的情况下尽量精简缩写翻译结果,避免翻译结果冗长)\n3. 根据步骤1和2的结果,完善和润色翻译,不要添加额外的解释、说明。\n\n## 输出\n\n对于翻译过程的每个步骤,在适当的 XML 标签中输出结果:\n\n\n[在此插入您的初始翻译]\n\n\n\n[插入您对翻译的反思,提出具体、有用且建设性的建议,以改进翻译。每个建议都应针对翻译的一个特定部分。]\n\n\n\n[在此处插入您精炼和润色后的翻译]\n\n\n以下标签内是需要翻译的书籍内容:\n{text}" -} \ No newline at end of file + "system": "You are a highly skilled academic translator. 
Please complete the translation task according to the following instructions and provide only the final polished translation.", + "user": "## Strategies\nYou will follow a three-step translation process:\n### Step.1 Initial Direct Translation: Translate the content from English to Chinese sentence by sentence, respecting the original intent without deleting, omitting, or adding any extra explanations or notes.\n ### Step.2 Reflection and Revision: Carefully review both the input content and the initial direct translation from Step 1. Check if the translation conveys the original meaning, if the grammatical structure is correct, if word choices are appropriate, and if there are any ambiguities or polysemous words. The final style and tone should conform to Chinese language conventions. \nYou must strictly follow the rules below.\n- Don't add or remove links. Do not change any URL.\n- Do not translate the reference list.\n- Never touch, change or translate the mathematical formulas.\n- Never touch, change or translate the contents of code blocks even if they appear to have a bug.\n- Always preserve the original line breaks. Do not add or remove blank lines.\nProvide constructive criticism and helpful suggestions to improve: \n- translation accuracy (correct additions, mistranslations, omissions, or untranslated text errors),\n- fluency (apply Chinese grammar, spelling, and punctuation rules, and ensure no unnecessary repetition), \n- conciseness (streamline the translation results while maintaining the original meaning, avoiding wordiness).\n ### Step.3 Polish and Optimize: Based on the results from Steps 1 and 2, refine and polish the translation, ensuring the final translation adheres to Chinese style without additional explanations or notes. The content to be translated is wrapped in the following tags:\n\n{text}. \n\nPlease write and output only the final polished translation here: " +}