Merge pull request #807 from popcion/main

add split translation
zyddnys · Feb 5, 2025 · f75b669 · f75b669
2 parents 126bb08 + 49dcd3d
commit f75b669
Show file tree

Hide file tree

Showing 2 changed files with 248 additions and 125 deletions.
diff --git a/manga_translator/translators/chatgpt.py b/manga_translator/translators/chatgpt.py
@@ -9,7 +9,7 @@
 import asyncio
 import time
 from typing import List, Dict
-
+from manga_translator.utils import is_valuable_text
 from .common import CommonTranslator, MissingAPIKeyException
 from .keys import OPENAI_API_KEY, OPENAI_HTTP_PROXY, OPENAI_API_BASE
 CONFIG = None
@@ -42,7 +42,7 @@ class GPT3Translator(CommonTranslator):
         'THA': 'Thai',
         'IND': 'Indonesian'
     }
-    _INVALID_REPEAT_COUNT = 2 # repeat up to 2 times if "invalid" translation was detected
+    _INVALID_REPEAT_COUNT = 0 # useless
     _MAX_REQUESTS_PER_MINUTE = 20
     _TIMEOUT = 40 # Seconds to wait for a response from the server before retrying
     _RETRY_ATTEMPTS = 3 # Number of times to retry an errored request before giving up
@@ -130,95 +130,177 @@ def _format_prompt_log(self, to_lang: str, prompt: str) -> str:
     async def _translate(self, from_lang: str, to_lang: str, queries: List[str]) -> List[str]:  
         translations = [''] * len(queries)  
         self.logger.debug(f'Temperature: {self.temperature}, TopP: {self.top_p}')  
+        MAX_SPLIT_ATTEMPTS = 5  # Default max split attempts  
+        RETRY_ATTEMPTS = self._RETRY_ATTEMPTS  
 
-        query_index = 0  
-        for prompt, query_size in self._assemble_prompts(from_lang, to_lang, queries):  
-            self.logger.debug('-- GPT Prompt --\n' + self._format_prompt_log(to_lang, prompt))  
+        async def translate_batch(prompt_queries, prompt_query_indices, split_level=0):  
+            nonlocal MAX_SPLIT_ATTEMPTS
+            split_prefix = ' (split)' if split_level > 0 else ''  
 
-            for attempt in range(self._RETRY_ATTEMPTS):  
-                try:  
-                    response = await self._request_translation(to_lang, prompt)  
-                    self.logger.debug('-- GPT Response --\n' + response)  
+            # Assemble prompt for the current batch  
+            prompt, query_size = self._assemble_prompts(from_lang, to_lang, prompt_queries).__next__()  
+            self.logger.debug(f'-- GPT Prompt{split_prefix} --\n' + self._format_prompt_log(to_lang, prompt))  
 
-                    # split 
+            for attempt in range(RETRY_ATTEMPTS):  
+                try:  
+                    # Start the translation request with timeout handling
+                    request_task = asyncio.create_task(self._request_translation(to_lang, prompt))
+                    started = time.time()
+                    timeout_attempt = 0
+                    ratelimit_attempt = 0
+                    server_error_attempt = 0
+                    while not request_task.done():
+                        await asyncio.sleep(0.1)
+                        if time.time() - started > self._TIMEOUT + (timeout_attempt * self._TIMEOUT / 2):
+                            # Server takes too long to respond
+                            if timeout_attempt >= self._TIMEOUT_RETRY_ATTEMPTS:
+                                raise Exception('Openai servers did not respond quickly enough.')
+                            timeout_attempt += 1
+                            self.logger.warn(f'Restarting request due to timeout. Attempt: {timeout_attempt}')
+                            request_task.cancel()
+                            request_task = asyncio.create_task(self._request_translation(to_lang, prompt))
+                            started = time.time()
+
+                    # Get the response
+                    response = await request_task  
+                    self.logger.debug(f'-- GPT Response{split_prefix} --\n' + response)  
+
+                    # Split response into translations  
                     new_translations = re.split(r'<\|\d+\|>', response)  
                     if not new_translations[0].strip():  
                         new_translations = new_translations[1:]  
 
-                    if len(queries) == 1 and len(new_translations) == 1 and not re.match(r'^\s*<\|\d+\|>', response) :
-                        self.logger.warn(f'Single query response does not contain prefix, retrying...')
-                        continue
+                    if len(prompt_queries) == 1 and len(new_translations) == 1 and not re.match(r'^\s*<\|\d+\|>', response):  
+                        self.logger.warn(f'Single query response does not contain prefix, retrying...(Attempt {attempt + 1})')  
+                        continue  
 
-                    #Check for error messages in translations                     
+                    # Check for error messages in translations  
                     ERROR_KEYWORDS = [  
-                        # ENG_KEYWORDS 
+                        # ENG_KEYWORDS  
                         #"sorry,",  
-                        "I'm sorry, I can't assist with that.",
+                        "I'm sorry, but I can't assist with that request.",
+                        "I'm sorry, I can't assist with that.",  
                         #"I apologize",  
                         #"assist with",  
                         "I cannot help with",  
                         "I must decline",  
-                        "I am not comfortable about",  
-                        "I will not engage with",  
+                        #"not comfortable",  
+                        #"engage with",  
                         "I cannot generate or create",  
-                        #"I'd prefer not to",  
-                        #"I must refrain from",  
+                        "I'd prefer not to",  
+                        "I must refrain from",  
                         "This goes beyond what I can",  
                         #"unable",  
                         "That's not something I can help with",  
                         #"appropriate",  
-
-                        # CHINESE_KEYWORDS
-                        "抱歉，我不", 
-                        "我无法满足该请求",
+                        # CHINESE_KEYWORDS  
+                        "抱歉，我不",  
+                        "我无法满足该请求",  
                         "对不起，我不",  
-                        "我无法将", 
-                        "我无法把", 
-                        "我无法回答你",
+                        "我无法将",  
+                        "我无法把",  
+                        "我无法回答你",  
                         "这超出了我的范围",  
                         "我不便回答",  
                         "我不能提供相关建议",  
                         "这类内容我不能处理",  
                         "我需要婉拒",  
-
-                        # JAPANESE_KEYWORDS 
-                        "申し訳ありませんが",
+                        # JAPANESE_KEYWORDS  
+                        "申し訳ありませんが",  
                     ]  
-                    if any(keyword in t for t in new_translations for keyword in ERROR_KEYWORDS):
-                        self.logger.warn(f'Error message detected in response, retrying... (Attempt {attempt + 1})')
-                        continue
+                    if any(keyword in t for t in new_translations for keyword in ERROR_KEYWORDS):  
+                        remaining_attempts = RETRY_ATTEMPTS - attempt - 1  
+                        self.logger.warn(f'Error message detected in response, remaining {remaining_attempts} time(s) before splitting the translation.')  
+                        continue  
 
                     if len(new_translations) < query_size:  
                         # Try splitting by newlines instead  
                         new_translations = re.split(r'\n', response)  
 
                     if len(new_translations) < query_size:  
-                        self.logger.warn(f'Incomplete response, retrying... (Attempt {attempt + 1})')  
+                        remaining_attempts = RETRY_ATTEMPTS - attempt - 1  
+                        self.logger.warn(f'Incomplete response, remaining {remaining_attempts} time(s) before splitting the translation.')  
                         continue  
 
                     # Trim excess translations and pad if necessary  
                     new_translations = new_translations[:query_size] + [''] * (query_size - len(new_translations))  
-
                     # Clean translations by keeping only the content before the first newline  
-                    new_translations = [t.split('\n')[0].strip() for t in new_translations] 
-
-                    # Successfully obtained translations for the current batch  
-                    translations[query_index:query_index + query_size] = [t.strip() for t in new_translations]  
-                    query_index += query_size  
-                    break  
+                    new_translations = [t.split('\n')[0].strip() for t in new_translations]  
+                    # Remove any potential prefix markers  
+                    new_translations = [re.sub(r'^\s*<\|\d+\|>\s*', '', t) for t in new_translations]  
+
+                    #for i, translation in enumerate(new_translations):
+                    #    if not is_valuable_text(translation):
+                    #        self.logger.info(f'Filtered out: {translation}')  
+                    #        self.logger.info('Reason: Text is not considered valuable.')  
+                    #        new_translations[i] = ''  
+
+                    # Check if any translations are empty  
+                    if any(not t.strip() for t in new_translations):  
+                        self.logger.warn(f'Empty translations detected. Resplitting the batch.') 
+                        break  # Exit retry loop and trigger split logic below 
+
+                    # Store the translations in the correct indices  
+                    for idx, translation in zip(prompt_query_indices, new_translations):  
+                        translations[idx] = translation  
+
+                    # Log progress  
+                    self.logger.info(f'Batch translated: {len([t for t in translations if t])}/{len(queries)} completed.')  
+                    self.logger.debug(f'Completed translations: {[t if t else queries[i] for i, t in enumerate(translations)]}')        
+                    return True  # Successfully translated this batch  
+
+                except openai.RateLimitError:  # Server returned ratelimit response
+                    ratelimit_attempt += 1
+                    if ratelimit_attempt >= self._RATELIMIT_RETRY_ATTEMPTS:
+                        raise
+                    self.logger.warn(
+                        f'Restarting request due to ratelimiting by openai servers. Attempt: {ratelimit_attempt}')
+                    await asyncio.sleep(2)
+                except openai.APIError:  
+                    server_error_attempt += 1
+                    if server_error_attempt >= self._RETRY_ATTEMPTS:
+                        self.logger.error(
+                            'Openai encountered a server error, possibly due to high server load. Use a different translator or try again later.')
+                        raise
+                    self.logger.warn(f'Restarting request due to a server error. Attempt: {server_error_attempt}')
+                    await asyncio.sleep(1)
                 except Exception as e:  
                     self.logger.error(f'Error during translation attempt: {e}')  
-                    if attempt == self._RETRY_ATTEMPTS - 1:  
+                    if attempt == RETRY_ATTEMPTS - 1:  
                         raise  
                     await asyncio.sleep(1)  
 
-        # Remove any potential prefix markers before returning the results  
-        translations = [re.sub(r'^\s*<\|\d+\|>\s*', '', t) for t in translations]  
+            # If retries exhausted and still not successful, proceed to split if allowed  
+            if split_level < MAX_SPLIT_ATTEMPTS:  
+                if split_level == 0:  
+                    self.logger.warn('Retry limit reached. Starting to split the translation batch.')  
+                else:  
+                    self.logger.warn('Further splitting the translation batch due to persistent errors.')  
+                mid_index = len(prompt_queries) // 2  
+                futures = []  
+                # Split the batch into two halves  
+                for sub_queries, sub_indices in [   
+                    (prompt_queries[:mid_index], prompt_query_indices[:mid_index]),  
+                    (prompt_queries[mid_index:], prompt_query_indices[mid_index:]),  
+                ]:  
+                    if sub_queries:  
+                        futures.append(translate_batch(sub_queries, sub_indices, split_level + 1))  
+                results = await asyncio.gather(*futures)  
+                return all(results)  
+            else:  
+                self.logger.error('Maximum split attempts reached. Unable to translate the following queries:')  
+                for idx in prompt_query_indices:  
+                    self.logger.error(f'Query: {queries[idx]}')  
+                return False  # Indicate failure for this batch   
+
+        # Begin translation process  
+        prompt_queries = queries  
+        prompt_query_indices = list(range(len(queries)))  
+        await translate_batch(prompt_queries, prompt_query_indices)  
 
         self.logger.debug(translations)  
         if self.token_count_last:  
             self.logger.info(f'Used {self.token_count_last} tokens (Total: {self.token_count})')  
-
         return translations
 
     async def _request_translation(self, to_lang: str, prompt: str) -> str:
@@ -334,7 +416,7 @@ async def _request_translation(self, to_lang: str, prompt: str) -> str:
 class GPT4Translator(GPT35TurboTranslator):
     _CONFIG_KEY = 'gpt4'
     _MAX_REQUESTS_PER_MINUTE = 200
-    _RETRY_ATTEMPTS = 5
+    _RETRY_ATTEMPTS = 3
     _MAX_TOKENS = 8192
 
     @property
@@ -357,7 +439,7 @@ async def _request_translation(self, to_lang: str, prompt: str) -> str:
 
         try:
             response = await self.client.chat.completions.create(
-                model='gpt-4o',
+                model='chatgpt-4o-latest',
                 messages=messages,
                 max_tokens=self._MAX_TOKENS // 2,
                 temperature=self.temperature,