Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add split translation #807

Merged
merged 14 commits into from
Feb 5, 2025
176 changes: 129 additions & 47 deletions manga_translator/translators/chatgpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import asyncio
import time
from typing import List, Dict

from manga_translator.utils import is_valuable_text
from .common import CommonTranslator, MissingAPIKeyException
from .keys import OPENAI_API_KEY, OPENAI_HTTP_PROXY, OPENAI_API_BASE
CONFIG = None
Expand Down Expand Up @@ -42,7 +42,7 @@ class GPT3Translator(CommonTranslator):
'THA': 'Thai',
'IND': 'Indonesian'
}
_INVALID_REPEAT_COUNT = 2 # repeat up to 2 times if "invalid" translation was detected
_INVALID_REPEAT_COUNT = 0 # useless
_MAX_REQUESTS_PER_MINUTE = 20
_TIMEOUT = 40 # Seconds to wait for a response from the server before retrying
_RETRY_ATTEMPTS = 3 # Number of times to retry an errored request before giving up
Expand Down Expand Up @@ -130,95 +130,177 @@ def _format_prompt_log(self, to_lang: str, prompt: str) -> str:
async def _translate(self, from_lang: str, to_lang: str, queries: List[str]) -> List[str]:
translations = [''] * len(queries)
self.logger.debug(f'Temperature: {self.temperature}, TopP: {self.top_p}')
MAX_SPLIT_ATTEMPTS = 5 # Default max split attempts
RETRY_ATTEMPTS = self._RETRY_ATTEMPTS

query_index = 0
for prompt, query_size in self._assemble_prompts(from_lang, to_lang, queries):
self.logger.debug('-- GPT Prompt --\n' + self._format_prompt_log(to_lang, prompt))
async def translate_batch(prompt_queries, prompt_query_indices, split_level=0):
nonlocal MAX_SPLIT_ATTEMPTS
split_prefix = ' (split)' if split_level > 0 else ''

for attempt in range(self._RETRY_ATTEMPTS):
try:
response = await self._request_translation(to_lang, prompt)
self.logger.debug('-- GPT Response --\n' + response)
# Assemble prompt for the current batch
prompt, query_size = self._assemble_prompts(from_lang, to_lang, prompt_queries).__next__()
self.logger.debug(f'-- GPT Prompt{split_prefix} --\n' + self._format_prompt_log(to_lang, prompt))

# split
for attempt in range(RETRY_ATTEMPTS):
try:
# Start the translation request with timeout handling
request_task = asyncio.create_task(self._request_translation(to_lang, prompt))
started = time.time()
timeout_attempt = 0
ratelimit_attempt = 0
server_error_attempt = 0
while not request_task.done():
await asyncio.sleep(0.1)
if time.time() - started > self._TIMEOUT + (timeout_attempt * self._TIMEOUT / 2):
# Server takes too long to respond
if timeout_attempt >= self._TIMEOUT_RETRY_ATTEMPTS:
raise Exception('Openai servers did not respond quickly enough.')
timeout_attempt += 1
self.logger.warn(f'Restarting request due to timeout. Attempt: {timeout_attempt}')
request_task.cancel()
request_task = asyncio.create_task(self._request_translation(to_lang, prompt))
started = time.time()

# Get the response
response = await request_task
self.logger.debug(f'-- GPT Response{split_prefix} --\n' + response)

# Split response into translations
new_translations = re.split(r'<\|\d+\|>', response)
if not new_translations[0].strip():
new_translations = new_translations[1:]

if len(queries) == 1 and len(new_translations) == 1 and not re.match(r'^\s*<\|\d+\|>', response) :
self.logger.warn(f'Single query response does not contain prefix, retrying...')
continue
if len(prompt_queries) == 1 and len(new_translations) == 1 and not re.match(r'^\s*<\|\d+\|>', response):
self.logger.warn(f'Single query response does not contain prefix, retrying...(Attempt {attempt + 1})')
continue

#Check for error messages in translations
# Check for error messages in translations
ERROR_KEYWORDS = [
# ENG_KEYWORDS
# ENG_KEYWORDS
#"sorry,",
"I'm sorry, I can't assist with that.",
"I'm sorry, but I can't assist with that request.",
"I'm sorry, I can't assist with that.",
#"I apologize",
#"assist with",
"I cannot help with",
"I must decline",
"I am not comfortable about",
"I will not engage with",
#"not comfortable",
#"engage with",
"I cannot generate or create",
#"I'd prefer not to",
#"I must refrain from",
"I'd prefer not to",
"I must refrain from",
"This goes beyond what I can",
#"unable",
"That's not something I can help with",
#"appropriate",

# CHINESE_KEYWORDS
"抱歉,我不",
"我无法满足该请求",
# CHINESE_KEYWORDS
"抱歉,我不",
"我无法满足该请求",
"对不起,我不",
"我无法将",
"我无法把",
"我无法回答你",
"我无法将",
"我无法把",
"我无法回答你",
"这超出了我的范围",
"我不便回答",
"我不能提供相关建议",
"这类内容我不能处理",
"我需要婉拒",

# JAPANESE_KEYWORDS
"申し訳ありませんが",
# JAPANESE_KEYWORDS
"申し訳ありませんが",
]
if any(keyword in t for t in new_translations for keyword in ERROR_KEYWORDS):
self.logger.warn(f'Error message detected in response, retrying... (Attempt {attempt + 1})')
continue
if any(keyword in t for t in new_translations for keyword in ERROR_KEYWORDS):
remaining_attempts = RETRY_ATTEMPTS - attempt - 1
self.logger.warn(f'Error message detected in response, remaining {remaining_attempts} time(s) before splitting the translation.')
continue

if len(new_translations) < query_size:
# Try splitting by newlines instead
new_translations = re.split(r'\n', response)

if len(new_translations) < query_size:
self.logger.warn(f'Incomplete response, retrying... (Attempt {attempt + 1})')
remaining_attempts = RETRY_ATTEMPTS - attempt - 1
self.logger.warn(f'Incomplete response, remaining {remaining_attempts} time(s) before splitting the translation.')
continue

# Trim excess translations and pad if necessary
new_translations = new_translations[:query_size] + [''] * (query_size - len(new_translations))

# Clean translations by keeping only the content before the first newline
new_translations = [t.split('\n')[0].strip() for t in new_translations]

# Successfully obtained translations for the current batch
translations[query_index:query_index + query_size] = [t.strip() for t in new_translations]
query_index += query_size
break
new_translations = [t.split('\n')[0].strip() for t in new_translations]
# Remove any potential prefix markers
new_translations = [re.sub(r'^\s*<\|\d+\|>\s*', '', t) for t in new_translations]

#for i, translation in enumerate(new_translations):
# if not is_valuable_text(translation):
# self.logger.info(f'Filtered out: {translation}')
# self.logger.info('Reason: Text is not considered valuable.')
# new_translations[i] = ''

# Check if any translations are empty
if any(not t.strip() for t in new_translations):
self.logger.warn(f'Empty translations detected. Resplitting the batch.')
break # Exit retry loop and trigger split logic below

# Store the translations in the correct indices
for idx, translation in zip(prompt_query_indices, new_translations):
translations[idx] = translation

# Log progress
self.logger.info(f'Batch translated: {len([t for t in translations if t])}/{len(queries)} completed.')
self.logger.debug(f'Completed translations: {[t if t else queries[i] for i, t in enumerate(translations)]}')
return True # Successfully translated this batch

except openai.RateLimitError: # Server returned ratelimit response
ratelimit_attempt += 1
if ratelimit_attempt >= self._RATELIMIT_RETRY_ATTEMPTS:
raise
self.logger.warn(
f'Restarting request due to ratelimiting by openai servers. Attempt: {ratelimit_attempt}')
await asyncio.sleep(2)
except openai.APIError:
server_error_attempt += 1
if server_error_attempt >= self._RETRY_ATTEMPTS:
self.logger.error(
'Openai encountered a server error, possibly due to high server load. Use a different translator or try again later.')
raise
self.logger.warn(f'Restarting request due to a server error. Attempt: {server_error_attempt}')
await asyncio.sleep(1)
except Exception as e:
self.logger.error(f'Error during translation attempt: {e}')
if attempt == self._RETRY_ATTEMPTS - 1:
if attempt == RETRY_ATTEMPTS - 1:
raise
await asyncio.sleep(1)

# Remove any potential prefix markers before returning the results
translations = [re.sub(r'^\s*<\|\d+\|>\s*', '', t) for t in translations]
# If retries exhausted and still not successful, proceed to split if allowed
if split_level < MAX_SPLIT_ATTEMPTS:
if split_level == 0:
self.logger.warn('Retry limit reached. Starting to split the translation batch.')
else:
self.logger.warn('Further splitting the translation batch due to persistent errors.')
mid_index = len(prompt_queries) // 2
futures = []
# Split the batch into two halves
for sub_queries, sub_indices in [
(prompt_queries[:mid_index], prompt_query_indices[:mid_index]),
(prompt_queries[mid_index:], prompt_query_indices[mid_index:]),
]:
if sub_queries:
futures.append(translate_batch(sub_queries, sub_indices, split_level + 1))
results = await asyncio.gather(*futures)
return all(results)
else:
self.logger.error('Maximum split attempts reached. Unable to translate the following queries:')
for idx in prompt_query_indices:
self.logger.error(f'Query: {queries[idx]}')
return False # Indicate failure for this batch

# Begin translation process
prompt_queries = queries
prompt_query_indices = list(range(len(queries)))
await translate_batch(prompt_queries, prompt_query_indices)

self.logger.debug(translations)
if self.token_count_last:
self.logger.info(f'Used {self.token_count_last} tokens (Total: {self.token_count})')

return translations

async def _request_translation(self, to_lang: str, prompt: str) -> str:
Expand Down Expand Up @@ -334,7 +416,7 @@ async def _request_translation(self, to_lang: str, prompt: str) -> str:
class GPT4Translator(GPT35TurboTranslator):
_CONFIG_KEY = 'gpt4'
_MAX_REQUESTS_PER_MINUTE = 200
_RETRY_ATTEMPTS = 5
_RETRY_ATTEMPTS = 3
_MAX_TOKENS = 8192

@property
Expand All @@ -357,7 +439,7 @@ async def _request_translation(self, to_lang: str, prompt: str) -> str:

try:
response = await self.client.chat.completions.create(
model='gpt-4o',
model='chatgpt-4o-latest',
messages=messages,
max_tokens=self._MAX_TOKENS // 2,
temperature=self.temperature,
Expand Down
Loading
Loading