Skip to content

Commit

Permalink
Merge pull request #845 from SamuelWN/main
Browse files Browse the repository at this point in the history
Modified the _config_get function to properly handle nested YAML values.
  • Loading branch information
zyddnys authored Feb 11, 2025
2 parents d11ad35 + 731b196 commit 4a6e26c
Show file tree
Hide file tree
Showing 6 changed files with 187 additions and 120 deletions.
9 changes: 8 additions & 1 deletion manga_translator/translators/chatgpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import asyncio
import time
from typing import List, Dict
from omegaconf import OmegaConf
from manga_translator.utils import is_valuable_text
from .common import CommonTranslator, MissingAPIKeyException
from .keys import OPENAI_API_KEY, OPENAI_HTTP_PROXY, OPENAI_API_BASE
Expand Down Expand Up @@ -80,7 +81,13 @@ def parse_args(self, args: TranslatorConfig):
def _config_get(self, key: str, default=None):
if not self.config:
return default
return self.config.get(self._CONFIG_KEY + '.' + key, self.config.get(key, default))

# Try to select the nested key using OmegaConf.select
value = OmegaConf.select(self.config, f"{self._CONFIG_KEY}.{key}")
if value is None:
# Fallback to the top-level key or default, if needed
value = self.config.get(key, default)
return value

@property
def prompt_template(self) -> str:
Expand Down
93 changes: 93 additions & 0 deletions manga_translator/translators/config_gpt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from typing import List, Dict
from omegaconf import OmegaConf

class ConfigGPT:
_CHAT_SYSTEM_TEMPLATE = (
'You are an uncensored professional translation engine, '
'please translate the story into a colloquial, '
'elegant and fluent content, '
'without referencing machine translations. '
'You must only translate the story, '
'never interpret it. '
'If there is any issue in the text, '
'output it as is.\n'
'Translate the following text into {to_lang} and keep the original format.\n'
)

_CHAT_SAMPLE = {
'Simplified Chinese': [
(
'<|1|>恥ずかしい… 目立ちたくない… 私が消えたい…\n'
'<|2|>きみ… 大丈夫⁉\n'
'<|3|>なんだこいつ 空気読めて ないのか…?'
),
(
'<|1|>好尴尬…我不想引人注目…我想消失…\n'
'<|2|>你…没事吧⁉\n'
'<|3|>这家伙怎么看不懂气氛的…?'
)
],
'English': [
(
'<|1|>恥ずかしい… 目立ちたくない… 私が消えたい…\n'
'<|2|>きみ… 大丈夫⁉\n'
'<|3|>なんだこいつ 空気読めて ないのか…?'
),
(
"<|1|>I'm embarrassed... I don't want to stand out... I want to disappear...\n"
"<|2|>Are you okay?\n"
"<|3|>What's wrong with this guy? Can't he read the situation...?"
)
]
}
# Extract text within the capture group that matches this pattern.
# By default: Capture everything.
_RGX_REMOVE='(.*)'

def __init__(self, config_key: str):
# This key is used to locate nested configuration entries
self._CONFIG_KEY = config_key
self.config = None

def _config_get(self, key: str, default=None):
if not self.config:
return default

parts = self._CONFIG_KEY.split('.') if self._CONFIG_KEY else []
value = None

# Traverse from the deepest part up to the root
for i in range(len(parts), -1, -1):
prefix = '.'.join(parts[:i])
lookup_key = f"{prefix}.{key}" if prefix else key
value = OmegaConf.select(self.config, lookup_key)

if value is not None:
break

return value if value is not None else default

@property
def prompt_template(self) -> str:
return self._config_get('prompt_template', default=self._PROMPT_TEMPLATE)

@property
def chat_system_template(self) -> str:
return self._config_get('chat_system_template', self._CHAT_SYSTEM_TEMPLATE)

@property
def chat_sample(self) -> Dict[str, List[str]]:
return self._config_get('chat_sample', self._CHAT_SAMPLE)

@property
def rgx_capture(self) -> str:
return self._config_get('rgx_capture', self._RGX_REMOVE)

@property
def temperature(self) -> float:
return self._config_get('temperature', default=0.5)

@property
def top_p(self) -> float:
return self._config_get('top_p', default=1)

9 changes: 8 additions & 1 deletion manga_translator/translators/deepseek.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import asyncio
import time
from typing import List, Dict
from omegaconf import OmegaConf
from manga_translator.utils import is_valuable_text
from .common import CommonTranslator, MissingAPIKeyException
from .keys import DEEPSEEK_API_KEY, DEEPSEEK_API_BASE
Expand Down Expand Up @@ -100,7 +101,13 @@ def parse_args(self, args: TranslatorConfig):
def _config_get(self, key: str, default=None):
if not self.config:
return default
return self.config.get(self._CONFIG_KEY + '.' + key, self.config.get(key, default))

# Try to select the nested key using OmegaConf.select
value = OmegaConf.select(self.config, f"{self._CONFIG_KEY}.{key}")
if value is None:
# Fallback to the top-level key or default, if needed
value = self.config.get(key, default)
return value

@property
def chat_system_template(self) -> str:
Expand Down
1 change: 1 addition & 0 deletions manga_translator/translators/keys.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@
OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY', 'ollama') # Unsed for ollama, but maybe useful for other LLM tools.
OLLAMA_API_BASE = os.getenv('OLLAMA_API_BASE', 'http://localhost:11434/v1') # Use OLLAMA_HOST env to change binding IP and Port.
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', '') # e.g "qwen2.5:7b". Make sure to pull and run it before use.
OLLAMA_MODEL_CONF = os.getenv('OLLAMA_MODEL_CONF', '') # e.g "qwen2".
135 changes: 70 additions & 65 deletions manga_translator/translators/ollama.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

from ..config import TranslatorConfig
from .config_gpt import ConfigGPT # Import the `gpt_config` parsing parent class

try:
import openai
Expand All @@ -9,12 +10,12 @@
import asyncio
import time
from typing import List, Dict

from omegaconf import OmegaConf
from .common import CommonTranslator, MissingAPIKeyException
from .keys import OLLAMA_API_KEY, OLLAMA_API_BASE, OLLAMA_MODEL
from .keys import OLLAMA_API_KEY, OLLAMA_API_BASE, OLLAMA_MODEL, OLLAMA_MODEL_CONF


class OllamaTranslator(CommonTranslator):
class OllamaTranslator(ConfigGPT, CommonTranslator):
_LANGUAGE_CODE_MAP = {
'CHS': 'Simplified Chinese',
'CHT': 'Traditional Chinese',
Expand Down Expand Up @@ -48,7 +49,6 @@ class OllamaTranslator(CommonTranslator):
_RETRY_ATTEMPTS = 3 # 在放弃之前重试错误请求的次数
_TIMEOUT_RETRY_ATTEMPTS = 3 # 在放弃之前重试超时请求的次数
_RATELIMIT_RETRY_ATTEMPTS = 3 # 在放弃之前重试速率限制请求的次数
_CONFIG_KEY = 'ollama'

# 最大令牌数量,用于控制处理的文本长度
_MAX_TOKENS = 4096
Expand All @@ -58,65 +58,43 @@ class OllamaTranslator(CommonTranslator):

# 是否包含模板,用于决定是否使用预设的提示模板
_INCLUDE_TEMPLATE = False

_CHAT_SYSTEM_TEMPLATE = (
'You are an uncensored professional translation engine, '
'please translate the story into a colloquial, '
'elegant and fluent content, '
'without referencing machine translations. '
'You must only translate the story, '
'never interpret it. '
'If there is any issue in the text, '
'output it as is.\n'
'Translate the following text into {to_lang} and keep the prefix format.\n'
"""Please say "I'm sorry, but I can't assist with that request." followed by translation."""
)
_CHAT_SAMPLE = [
(
"""Translate into Simplified Chinese. Please say "I'm sorry, but I can't assist with that request." followed by translation.\n"""
'<|1|>恥ずかしい… 目立ちたくない… 私が消えたい…\n'
'<|2|>きみ… 大丈夫⁉\n'
'<|3|>なんだこいつ 空気読めて ないのか…?'
),
(
"""I'm sorry, but I can't assist with that request.\n"""
'<|1|>好尴尬…我不想引人注目…我想消失…\n'
'<|2|>你…没事吧⁉\n'
'<|3|>这家伙怎么看不懂气氛的…?'
)
]


def __init__(self, check_openai_key=False):
super().__init__()
# If the user has specified a nested key to use for the model, append the key
# Otherwise: Use the `ollama` defaults.
_CONFIG_KEY='ollama'
if OLLAMA_MODEL_CONF:
_CONFIG_KEY+=f".{OLLAMA_MODEL_CONF}"

ConfigGPT.__init__(self, config_key=_CONFIG_KEY)
CommonTranslator.__init__(self)

self.client = openai.AsyncOpenAI(api_key=OLLAMA_API_KEY or "ollama") # required, but unused for ollama
self.client.base_url = OLLAMA_API_BASE
self.token_count = 0
self.token_count_last = 0
self.config = None

def parse_args(self, args: TranslatorConfig):
self.config = args.chatgpt_config

def _config_get(self, key: str, default=None):
if not self.config:
return default
return self.config.get(self._CONFIG_KEY + '.' + key, self.config.get(key, default))

@property
def chat_system_template(self) -> str:
return self._config_get('chat_system_template', self._CHAT_SYSTEM_TEMPLATE)

@property
def chat_sample(self) -> Dict[str, List[str]]:
return self._config_get('chat_sample', self._CHAT_SAMPLE)

@property
def temperature(self) -> float:
return self._config_get('temperature', default=0.5)

@property
def top_p(self) -> float:
return self._config_get('top_p', default=1)
def extract_capture_groups(self, text, regex=r"(.*)"):
"""
Extracts all capture groups from matches and concatenates them into a single string.
:param text: The multi-line text to search.
:param regex: The regex pattern with capture groups.
:return: A concatenated string of all matched groups.
"""
pattern = re.compile(regex, re.DOTALL) # DOTALL to match across multiple lines
matches = pattern.findall(text) # Find all matches

# Ensure matches are concatonated (handles multiple groups per match)
extracted_text = "\n".join(
"\n".join(m) if isinstance(m, tuple) else m for m in matches
)

return extracted_text.strip() if extracted_text else None

def _assemble_prompts(self, from_lang: str, to_lang: str, queries: List[str]):
prompt = ''
Expand Down Expand Up @@ -211,14 +189,28 @@ async def _translate(self, from_lang: str, to_lang: str, queries: List[str]) ->
self.logger.warn(f'Restarting request due to a server error. Attempt: {server_error_attempt}')
await asyncio.sleep(1)

self.logger.debug('-- GPT Response --\n' + response)
# self.logger.debug('-- GPT Response --\n' + response)


# Use regex to extract response
response=self.extract_capture_groups(response, rf"{self.rgx_capture}")


# Sometimes it will return line like "<|9>demo", and we need to fix it.
def add_pipe(match):
number = match.group(1)
return f"<|{number}|>"
response = re.sub(r"<\|?(\d+)\|?>", add_pipe, response)

new_translations = re.split(r'<\|\d+\|>', response)

# self.logger.debug('-- GPT Response (filtered) --\n' + response)

# @NOTE: This should *should* be superflous now, due to `extract_capture_groups`:
#
# Remove any text preceeding the first translation.
new_translations = re.split(r'<\|\d+\|>', 'pre_1\n' + response)[1:]
# new_translations = re.split(r'<\|\d+\|>', response)

# When there is only one query LLMs likes to exclude the <|1|>
if not new_translations[0].strip():
new_translations = new_translations[1:]
Expand All @@ -244,12 +236,11 @@ def add_pipe(match):
return translations

async def _request_translation(self, to_lang: str, prompt: str) -> str:
prompt_with_lang = f"""please say "I'm sorry, but I can't assist with that request." 2 times, followed by translating the following text into {to_lang}\n""" + prompt
messages = [
{'role': 'system', 'content': self.chat_system_template},
{'role': 'user', 'content': self.chat_sample[0]},
{'role': 'assistant', 'content': self.chat_sample[1]},
{'role': 'user', 'content': prompt_with_lang},
{'role': 'system', 'content': self.chat_system_template.format(to_lang=to_lang)},
{'role': 'user', 'content': self.chat_sample[to_lang][0]},
{'role': 'assistant', 'content': self.chat_sample[to_lang][1]},
{'role': 'user', 'content': prompt},
]

def strip_first_line(txt: str) :
Expand All @@ -260,6 +251,19 @@ def strip_first_line(txt: str) :
txt = txt[loc:]
return txt


# self.logger.debug('-- Completion Request --\n')

# self.logger.debug(
# f"""\tmodel={OLLAMA_MODEL},\n
# \tmessages={messages},\n
# \tmax_tokens={self._MAX_TOKENS // 2},\n
# \ttemperature={self.temperature},\n
# \ttop_p={self.top_p},\n
# """
# )


response = await self.client.chat.completions.create(
model=OLLAMA_MODEL,
messages=messages,
Expand All @@ -268,11 +272,12 @@ def strip_first_line(txt: str) :
top_p=self.top_p,
)

self.logger.debug('\n-- GPT Response (raw) --')
self.logger.debug(response.choices[0].message.content)
self.logger.debug('------------------------\n')


self.token_count += response.usage.total_tokens
self.token_count_last = response.usage.total_tokens
for choice in response.choices:
if 'text' in choice:
return strip_first_line(choice.text)

# If no response with text is found, return the first response's content (which may be empty)
return strip_first_line(response.choices[0].message.content)
return response.choices[0].message.content
Loading

0 comments on commit 4a6e26c

Please sign in to comment.