vilarin commited on
Commit
9984001
·
verified ·
1 Parent(s): e340fe7

Upload 6 files

Browse files
app/webui/README.md ADDED
File without changes
app/webui/app.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import gradio as gr
3
+ from process import model_load, lang_detector, diff_texts, translator, read_doc
4
+ from llama_index.core import SimpleDirectoryReader
5
+
6
def huanik(
    endpoint,
    model,
    api_key,
    source_lang,
    target_lang,
    source_text,
    country,
    max_tokens,
    context_window,
    num_output,
):
    """Gradio submit handler: load the chosen LLM and run the translation.

    Validates the inputs, configures the LLM through ``model_load``,
    collapses runs of newlines in the source text, runs ``translator`` and
    builds a highlighted diff between the initial and final translations.

    Returns:
        A 4-tuple ``(initial translation, reflection, final translation,
        diff component)`` matching the outputs wired to the submit button.

    Raises:
        gr.Error: if the source text is empty, the source and target
            languages are equal, or loading the model fails.
    """
    inputs_ok = bool(source_text) and source_lang != target_lang
    if not inputs_ok:
        raise gr.Error("Please check that the content or options are entered correctly.")

    try:
        model_load(endpoint, model, api_key, context_window, num_output)
    except Exception as e:
        raise gr.Error(f"An unexpected error occurred: {e}")

    # Collapse consecutive newlines so blank lines do not reach the prompts.
    source_text = re.sub(r'\n+', '\n', source_text)

    initial, reflection, final = translator(
        source_lang=source_lang,
        target_lang=target_lang,
        source_text=source_text,
        country=country,
        max_tokens=max_tokens,
    )

    # A fresh, visible component replaces the hidden placeholder in the UI.
    diff_view = gr.HighlightedText(
        diff_texts(initial, final),
        label="Diff translation",
        combine_adjacent=True,
        show_legend=True,
        visible=True,
        color_map={"removed": "red", "added": "green"},
    )

    return initial, reflection, final, diff_view
46
+
47
def update_model(endpoint):
    """Return a Gradio update that sets the model box to the endpoint's default.

    Raises:
        KeyError: if ``endpoint`` is not one of the known endpoint names.
    """
    default_models = {
        "Groq": "llama3-70b-8192",
        "OpenAI": "gpt-4o",
        "Cohere": "command-r",
        "TogetherAI": "Qwen/Qwen2-72B-Instruct",
        "Ollama": "llama3",
        "Huggingface": "mistralai/Mistral-7B-Instruct-v0.3",
    }
    chosen = default_models[endpoint]
    return gr.update(value=chosen)
57
+
58
def read_doc(file):
    """Load an uploaded document and return its text for the source textbox.

    Args:
        file: Path of the uploaded file, or a list/tuple of such paths.

    Returns:
        str: The text of every loaded document, joined with newlines.
    """
    # SimpleDirectoryReader expects ``input_files`` to be a list of paths; a
    # bare string would be iterated character by character.
    paths = list(file) if isinstance(file, (list, tuple)) else [file]
    docs = SimpleDirectoryReader(input_files=paths).load_data()
    # The result is displayed in a Textbox, so hand back plain text rather
    # than the list of Document objects.
    # NOTE(review): assumes each loaded document exposes ``.text`` - confirm
    # against the installed llama-index version.
    return "\n".join(doc.text for doc in docs)
61
+
62
# HTML heading rendered at the top of the page via gr.Markdown.
TITLE = """
<h1><a href="https://github.com/andrewyng/translation-agent">Translation-Agent</a> webUI</h1>
"""

# Custom CSS passed to gr.Blocks: centres the heading and hides the footer.
CSS = """
h1 {
text-align: center;
display: block;
height: 10vh;
align-content: center;
}
footer {
visibility: hidden;
}
"""
77
+
78
# Build the web UI: a narrow settings column, a wide text/result column and a
# row of action buttons underneath.
with gr.Blocks(theme="soft", css=CSS) as demo:
    gr.Markdown(TITLE)
    with gr.Row():
        # Left column: endpoint, model, credentials and language settings.
        with gr.Column(scale=1):
            endpoint = gr.Dropdown(
                label="Endpoint",
                choices=["Groq", "OpenAI", "Cohere", "TogetherAI", "Ollama", "Huggingface"],
                value="Groq",
            )
            model = gr.Textbox(label="Model", value="llama3-70b-8192")
            api_key = gr.Textbox(label="API_KEY", type="password")
            source_lang = gr.Textbox(
                label="Source Lang(Auto-Detect)",
                value="English",
            )
            target_lang = gr.Textbox(
                label="Target Lang",
                value="Spanish",
            )
            country = gr.Textbox(label="Country", value="Argentina", max_lines=1)
            with gr.Accordion("Advanced Options", open=False):
                max_tokens = gr.Slider(
                    label="Max tokens Per Chunk",
                    minimum=512,
                    maximum=2046,
                    value=1000,
                    step=8,
                )
                context_window = gr.Slider(
                    label="Context Window",
                    minimum=512,
                    maximum=8192,
                    value=4096,
                    step=8,
                )
                num_output = gr.Slider(
                    label="Output Num",
                    minimum=256,
                    maximum=8192,
                    value=512,
                    step=8,
                )
        # Right column: source text plus one tab per translation stage.
        with gr.Column(scale=4):
            source_text = gr.Textbox(
                label="Source Text",
                value=(
                    "How we live is so different from how we ought to live that he who studies "
                    "what ought to be done rather than what is done will learn the way to his downfall "
                    "rather than to his preservation."
                ),
                lines=5,
            )
            with gr.Tab("Final"):
                # Label typo fixed ("FInal" -> "Final").
                output_final = gr.Textbox(label="Final Translation", lines=3, show_copy_button=True)
            with gr.Tab("Initial"):
                output_init = gr.Textbox(label="Init Translation", lines=3, show_copy_button=True)
            with gr.Tab("Reflection"):
                output_reflect = gr.Textbox(label="Reflection", lines=3, show_copy_button=True)
            with gr.Tab("Diff"):
                # Hidden placeholder; huanik returns a visible replacement.
                output_diff = gr.HighlightedText(visible=False)
    with gr.Row():
        submit = gr.Button(value="Submit")
        upload = gr.UploadButton("Upload")
        clear = gr.ClearButton([source_text, output_init, output_reflect, output_final])

    # Event wiring.
    endpoint.change(fn=update_model, inputs=[endpoint], outputs=[model])
    source_text.change(lang_detector, source_text, source_lang)
    submit.click(
        fn=huanik,
        inputs=[
            endpoint,
            model,
            api_key,
            source_lang,
            target_lang,
            source_text,
            country,
            max_tokens,
            context_window,
            num_output,
        ],
        outputs=[output_init, output_reflect, output_final, output_diff],
    )
    upload.upload(fn=read_doc, inputs=upload, outputs=source_text)

if __name__ == "__main__":
    demo.queue(api_open=False).launch(show_api=False, share=False)
app/webui/patch.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # a monkey patch to use llama-index completion
2
+ from typing import Union, Callable
3
+ from functools import wraps
4
+ from src.translation_agent.utils import *
5
+
6
+
7
+ from llama_index.llms.groq import Groq
8
+ from llama_index.llms.cohere import Cohere
9
+ from llama_index.llms.openai import OpenAI
10
+ from llama_index.llms.together import TogetherLLM
11
+ from llama_index.llms.ollama import Ollama
12
+ from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
13
+
14
+ from llama_index.core import Settings
15
+ from llama_index.core.llms import ChatMessage
16
+
17
+
18
+ # Add your LLMs here
19
+
20
def model_load(
    endpoint: str,
    model: str,
    api_key: str = None,
    context_window: int = 4096,
    num_output: int = 512,
):
    """Instantiate the LLM for ``endpoint`` and install it in llama-index Settings.

    Args:
        endpoint: Provider name ("Groq", "Cohere", "OpenAI", "TogetherAI",
            "Ollama" or "Huggingface"); matched case-insensitively.
        model: Model identifier passed to the provider client.
        api_key: API key/token for the provider (not used for Ollama).
        context_window: Maximum input size, stored on ``Settings``.
        num_output: Number of tokens reserved for generation, stored on
            ``Settings``.

    Raises:
        ValueError: if ``endpoint`` is not a supported provider.
    """
    # The UI sends "Ollama" while this function used to test for "ollama";
    # normalising the name accepts both spellings.
    key = endpoint.strip().lower() if isinstance(endpoint, str) else endpoint

    if key == "groq":
        llm = Groq(
            model=model,
            api_key=api_key,
        )
    elif key == "cohere":
        llm = Cohere(
            model=model,
            api_key=api_key,
        )
    elif key == "openai":
        llm = OpenAI(
            model=model,
            api_key=api_key,
        )
    elif key == "togetherai":
        llm = TogetherLLM(
            model=model,
            api_key=api_key,
        )
    elif key == "ollama":
        llm = Ollama(
            model=model,
            request_timeout=120.0,
        )
    elif key == "huggingface":
        llm = HuggingFaceInferenceAPI(
            model_name=model,
            token=api_key,
            task="text-generation",
        )
    else:
        # Fail loudly instead of hitting an unbound ``llm`` below.
        raise ValueError(f"Unsupported endpoint: {endpoint!r}")

    Settings.llm = llm
    # maximum input size to the LLM
    Settings.context_window = context_window
    # number of tokens reserved for text generation.
    Settings.num_output = num_output
63
+
64
+
65
+
66
def completion_wrapper(func: Callable) -> Callable:
    """Build a llama-index backed stand-in for ``func``.

    ``func`` is used only as the ``functools.wraps`` target, so the returned
    callable carries its name and docstring; it is never called.
    """

    @wraps(func)
    def wrapper(
        prompt: str,
        system_message: str = "You are a helpful assistant.",
        temperature: float = 0.3,
        json_mode: bool = False,
    ) -> Union[str, dict]:
        """
        Generate a completion with the LLM currently stored in ``Settings.llm``.

        Args:
            prompt (str): The user's prompt or query.
            system_message (str, optional): System message that sets the assistant's context.
                Defaults to "You are a helpful assistant.".
            temperature (float, optional): Sampling temperature.
                Defaults to 0.3.
            json_mode (bool, optional): Request a JSON object response format.
                Ignored for HuggingFaceInferenceAPI. Defaults to False.

        Returns:
            The ``message.content`` of the chat response.
        """
        llm = Settings.llm
        user_turn = ChatMessage(role="user", content=prompt)
        extra_kwargs = {}

        if llm.class_name() == "HuggingFaceInferenceAPI":
            # For this backend the system message is set on the LLM object
            # instead of being sent as a chat message.
            llm.system_prompt = system_message
            messages = [user_turn]
        else:
            messages = [
                ChatMessage(role="system", content=system_message),
                user_turn,
            ]
            if json_mode:
                extra_kwargs["response_format"] = {"type": "json_object"}

        response = llm.chat(
            messages=messages,
            temperature=temperature,
            top_p=1,
            **extra_kwargs,
        )
        return response.message.content

    return wrapper
129
+
130
# Keep a handle on the original OpenAI-backed implementation, then swap in the
# llama-index backed wrapper.
import src.translation_agent.utils as _utils

openai_completion = get_completion
get_completion = completion_wrapper(openai_completion)
# The translation helpers defined in utils look up ``get_completion`` in their
# own module globals, so rebinding the name here alone would leave them calling
# the OpenAI client. Install the replacement on the utils module as well.
_utils.get_completion = get_completion
app/webui/process.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from polyglot.detect import Detector
2
+ from polyglot.text import Text
3
+ from difflib import Differ
4
+ from icecream import ic
5
+ from patch import *
6
+ from llama_index.core.node_parser import SentenceSplitter
7
+
8
def lang_detector(text):
    """Detect the language of ``text`` and return its name.

    Args:
        text: The text to analyse.

    Returns:
        str: The detected language name; ``"Input text too short"`` when the
        text is missing or shorter than five characters; or ``"ERROR:<msg>"``
        if detection fails.
    """
    # ``re`` is not provided by any import visible in this module, so import
    # it locally rather than rely on a star-import happening to expose it.
    import re

    min_chars = 5
    # ``not text`` also covers None, which would otherwise crash ``len``.
    if not text or len(text) < min_chars:
        return "Input text too short"
    try:
        detector = Detector(text).language
        lang_info = str(detector)
        # Pull the language name out of the detector's string representation.
        code = re.search(r"name: (\w+)", lang_info).group(1)
        return code
    except Exception as e:
        # Best effort: report the problem in the textbox instead of raising.
        return f"ERROR:{str(e)}"
19
+
20
def tokenize(text):
    """Split ``text`` into word tokens with polyglot.

    When the text contains spaces, a single ``' '`` token is placed between
    consecutive words; otherwise polyglot's word list is returned unchanged.
    """
    words = Text(text).words

    if ' ' not in text:
        return words

    # Follow every word with a space, then drop the trailing one.
    spaced = []
    for word in words:
        spaced.extend((word, ' '))
    return spaced[:-1]
35
+
36
+
37
def diff_texts(text1, text2):
    """Return a token-level diff of two texts as ``(token, category)`` pairs.

    ``category`` is ``'added'``, ``'removed'`` or ``None`` (unchanged).
    """
    categories = {'+': 'added', '-': 'removed'}
    comparison = Differ().compare(tokenize(text1), tokenize(text2))

    highlighted = []
    for entry in list(comparison):
        marker = entry[0]
        if marker == '?':
            # Differ's hint lines carry no token of their own.
            continue
        # The first two characters are the diff marker and a space.
        highlighted.append((entry[2:], categories.get(marker)))

    return highlighted
58
+
59
+ # Modified from src.translation_agent.utils.translate
60
def translator(
    source_lang,
    target_lang,
    source_text,
    country,
    max_tokens=MAX_TOKENS_PER_CHUNK
):
    """Translate ``source_text`` and return every stage of the pipeline.

    Texts below ``max_tokens`` tokens are translated as one chunk; longer
    texts are split with a sentence splitter and handled chunk by chunk.

    Returns:
        tuple: ``(initial translation, reflection, final translation)``.
    """
    num_tokens_in_text = num_tokens_in_string(source_text)

    ic(num_tokens_in_text)

    if num_tokens_in_text < max_tokens:
        ic("Translating text as single chunk")

        first_pass = one_chunk_initial_translation(
            source_lang, target_lang, source_text
        )
        critique = one_chunk_reflect_on_translation(
            source_lang, target_lang, source_text, first_pass, country
        )
        improved = one_chunk_improve_translation(
            source_lang, target_lang, source_text, first_pass, critique
        )
        return first_pass, critique, improved

    ic("Translating text as multiple chunks")

    token_size = calculate_chunk_size(
        token_count=num_tokens_in_text, token_limit=max_tokens
    )

    ic(token_size)

    # Split on sentence boundaries into chunks of the computed size.
    splitter = SentenceSplitter(chunk_size=token_size)
    source_chunks = splitter.split_text(source_text)

    first_pass_chunks = multichunk_initial_translation(
        source_lang, target_lang, source_chunks
    )
    critique_chunks = multichunk_reflect_on_translation(
        source_lang,
        target_lang,
        source_chunks,
        first_pass_chunks,
        country,
    )
    improved_chunks = multichunk_improve_translation(
        source_lang,
        target_lang,
        source_chunks,
        first_pass_chunks,
        critique_chunks,
    )

    return (
        "".join(first_pass_chunks),
        "".join(critique_chunks),
        "".join(improved_chunks),
    )
134
+
135
+
136
+
src/translation_agent/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .utils import translate
src/translation_agent/utils.py ADDED
@@ -0,0 +1,687 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+ from typing import Union
4
+
5
+ import openai
6
+ import tiktoken
7
+ from dotenv import load_dotenv
8
+ from icecream import ic
9
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
10
+
11
+
12
load_dotenv()  # read local .env file
# Module-level OpenAI client, created at import time with the key taken from
# the OPENAI_API_KEY environment variable.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

MAX_TOKENS_PER_CHUNK = (
    1000  # if text is more than this many tokens, we'll break it up into
)
# discrete chunks to translate one chunk at a time
19
+
20
+
21
def get_completion(
    prompt: str,
    system_message: str = "You are a helpful assistant.",
    model: str = "gpt-4-turbo",
    temperature: float = 0.3,
    json_mode: bool = False,
) -> Union[str, dict]:
    """
    Generate a chat completion using the module-level OpenAI client.

    Args:
        prompt (str): The user's prompt or query.
        system_message (str, optional): System message that sets the assistant's context.
            Defaults to "You are a helpful assistant.".
        model (str, optional): Name of the OpenAI model to use.
            Defaults to "gpt-4-turbo".
        temperature (float, optional): Sampling temperature.
            Defaults to 0.3.
        json_mode (bool, optional): Request a JSON object response format.
            Defaults to False.

    Returns:
        The message content of the first choice, in both modes.
    """
    request = {
        "model": model,
        "temperature": temperature,
        "top_p": 1,
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
    }
    if json_mode:
        request["response_format"] = {"type": "json_object"}

    response = client.chat.completions.create(**request)
    return response.choices[0].message.content
71
+
72
+
73
def one_chunk_initial_translation(
    source_lang: str, target_lang: str, source_text: str
) -> str:
    """
    Translate the entire text as one chunk using an LLM.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for translation.
        source_text (str): The text to be translated.

    Returns:
        str: The translated text.
    """

    system_message = f"You are an expert linguist, specializing in translation from {source_lang} to {target_lang}."

    # The f-string already interpolates every placeholder. The previous extra
    # ``.format()`` pass re-parsed the finished prompt and failed (or altered
    # the text) whenever the source text contained "{" or "}".
    prompt = f"""This is an {source_lang} to {target_lang} translation, please provide the {target_lang} translation for this text. \
Do not provide any explanations or text apart from the translation.
{source_lang}: {source_text}

{target_lang}:"""

    translation = get_completion(prompt, system_message=system_message)

    return translation
101
+
102
+
103
def one_chunk_reflect_on_translation(
    source_lang: str,
    target_lang: str,
    source_text: str,
    translation_1: str,
    country: str = "",
) -> str:
    """
    Use an LLM to reflect on the translation, treating the entire text as one chunk.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language of the translation.
        source_text (str): The original text in the source language.
        translation_1 (str): The initial translation of the source text.
        country (str): Country specified for target language. When empty, the
            prompt omits the regional style instruction.

    Returns:
        str: The LLM's reflection on the translation, providing constructive criticism and suggestions for improvement.
    """

    system_message = f"You are an expert linguist specializing in translation from {source_lang} to {target_lang}. \
You will be provided with a source text and its translation and your goal is to improve the translation."

    # Both prompts are f-strings and are therefore already fully interpolated.
    # The previous extra ``.format()`` pass re-parsed the finished prompt and
    # failed (or altered the text) whenever the source text or the translation
    # contained "{" or "}".
    if country != "":
        prompt = f"""Your task is to carefully read a source text and a translation from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions to improve the translation. \
The final style and tone of the translation should match the style of {target_lang} colloquially spoken in {country}.

The source text and initial translation, delimited by XML tags <SOURCE_TEXT></SOURCE_TEXT> and <TRANSLATION></TRANSLATION>, are as follows:

<SOURCE_TEXT>
{source_text}
</SOURCE_TEXT>

<TRANSLATION>
{translation_1}
</TRANSLATION>

When writing suggestions, pay attention to whether there are ways to improve the translation's \n\
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
(iii) style (by ensuring the translations reflect the style of the source text and takes into account any cultural context),\n\
(iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by only ensuring you use equivalent idioms {target_lang}).\n\

Write a list of specific, helpful and constructive suggestions for improving the translation.
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""

    else:
        prompt = f"""Your task is to carefully read a source text and a translation from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions to improve the translation. \

The source text and initial translation, delimited by XML tags <SOURCE_TEXT></SOURCE_TEXT> and <TRANSLATION></TRANSLATION>, are as follows:

<SOURCE_TEXT>
{source_text}
</SOURCE_TEXT>

<TRANSLATION>
{translation_1}
</TRANSLATION>

When writing suggestions, pay attention to whether there are ways to improve the translation's \n\
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
(iii) style (by ensuring the translations reflect the style of the source text and takes into account any cultural context),\n\
(iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by only ensuring you use equivalent idioms {target_lang}).\n\

Write a list of specific, helpful and constructive suggestions for improving the translation.
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""

    reflection = get_completion(prompt, system_message=system_message)
    return reflection
182
+
183
+
184
def one_chunk_improve_translation(
    source_lang: str,
    target_lang: str,
    source_text: str,
    translation_1: str,
    reflection: str,
) -> str:
    """
    Produce an edited translation that takes the reflection into account.

    The whole text is treated as a single chunk.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for the translation.
        source_text (str): The original text in the source language.
        translation_1 (str): The initial translation of the source text.
        reflection (str): Expert suggestions and criticism for the translation.

    Returns:
        str: The improved translation returned by the LLM.
    """

    editor_role = f"You are an expert linguist, specializing in translation editing from {source_lang} to {target_lang}."

    editing_prompt = f"""Your task is to carefully read, then edit, a translation from {source_lang} to {target_lang}, taking into
account a list of expert suggestions and constructive criticisms.

The source text, the initial translation, and the expert linguist suggestions are delimited by XML tags <SOURCE_TEXT></SOURCE_TEXT>, <TRANSLATION></TRANSLATION> and <EXPERT_SUGGESTIONS></EXPERT_SUGGESTIONS> \
as follows:

<SOURCE_TEXT>
{source_text}
</SOURCE_TEXT>

<TRANSLATION>
{translation_1}
</TRANSLATION>

<EXPERT_SUGGESTIONS>
{reflection}
</EXPERT_SUGGESTIONS>

Please take into account the expert suggestions when editing the translation. Edit the translation by ensuring:

(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules and ensuring there are no unnecessary repetitions), \
(iii) style (by ensuring the translations reflect the style of the source text)
(iv) terminology (inappropriate for context, inconsistent use), or
(v) other errors.

Output only the new translation and nothing else."""

    # The system message is passed positionally, as the second argument.
    return get_completion(editing_prompt, editor_role)
238
+
239
+
240
def one_chunk_translate_text(
    source_lang: str, target_lang: str, source_text: str, country: str = ""
) -> str:
    """
    Translate a single chunk of text through the translate/reflect/improve steps.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for the translation.
        source_text (str): The text to be translated.
        country (str): Country specified for target language.

    Returns:
        str: The improved translation of the source text.
    """
    # Step 1: draft translation.
    draft = one_chunk_initial_translation(source_lang, target_lang, source_text)

    # Step 2: critique of the draft.
    critique = one_chunk_reflect_on_translation(
        source_lang, target_lang, source_text, draft, country
    )

    # Step 3: revise the draft using the critique.
    return one_chunk_improve_translation(
        source_lang, target_lang, source_text, draft, critique
    )
270
+
271
+
272
def num_tokens_in_string(
    input_str: str, encoding_name: str = "cl100k_base"
) -> int:
    """
    Count the tokens in a string under a given tiktoken encoding.

    Args:
        input_str (str): The string to tokenize.
        encoding_name (str, optional): Name of the tiktoken encoding to use.
            Defaults to "cl100k_base".

    Returns:
        int: The number of tokens in the input string.

    Example:
        >>> text = "Hello, how are you?"
        >>> num_tokens = num_tokens_in_string(text)
        >>> print(num_tokens)
        5
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(input_str)
    return len(tokens)
295
+
296
+
297
def multichunk_initial_translation(
    source_lang: str, target_lang: str, source_text_chunks: List[str]
) -> List[str]:
    """
    Translate a chunked text one chunk at a time, giving the LLM the full text as context.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for translation.
        source_text_chunks (List[str]): The text chunks to be translated.

    Returns:
        List[str]: One translated chunk per source chunk, in order.
    """

    system_message = f"You are an expert linguist, specializing in translation from {source_lang} to {target_lang}."

    # Plain (non-f) template: the placeholders are filled in per chunk below.
    translation_prompt = """Your task is provide a professional translation from {source_lang} to {target_lang} of PART of a text.

The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>. Translate only the part within the source text
delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS>. You can use the rest of the source text as context, but do not translate any
of the other text. Do not output anything other than the translation of the indicated part of the text.

<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>

To reiterate, you should translate only this part of the text, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>

Output only the translation of the portion you are asked to translate, and nothing else.
"""

    translated = []
    for index, chunk in enumerate(source_text_chunks):
        # Rebuild the whole text with the current chunk wrapped in tags.
        before = "".join(source_text_chunks[:index])
        after = "".join(source_text_chunks[index + 1:])
        tagged_text = before + "<TRANSLATE_THIS>" + chunk + "</TRANSLATE_THIS>" + after

        prompt = translation_prompt.format(
            source_lang=source_lang,
            target_lang=target_lang,
            tagged_text=tagged_text,
            chunk_to_translate=chunk,
        )
        translated.append(get_completion(prompt, system_message=system_message))

    return translated
354
+
355
+
356
+ def multichunk_reflect_on_translation(
357
+ source_lang: str,
358
+ target_lang: str,
359
+ source_text_chunks: List[str],
360
+ translation_1_chunks: List[str],
361
+ country: str = "",
362
+ ) -> List[str]:
363
+ """
364
+ Provides constructive criticism and suggestions for improving a partial translation.
365
+
366
+ Args:
367
+ source_lang (str): The source language of the text.
368
+ target_lang (str): The target language of the translation.
369
+ source_text_chunks (List[str]): The source text divided into chunks.
370
+ translation_1_chunks (List[str]): The translated chunks corresponding to the source text chunks.
371
+ country (str): Country specified for target language.
372
+
373
+ Returns:
374
+ List[str]: A list of reflections containing suggestions for improving each translated chunk.
375
+ """
376
+
377
+ system_message = f"You are an expert linguist specializing in translation from {source_lang} to {target_lang}. \
378
+ You will be provided with a source text and its translation and your goal is to improve the translation."
379
+
380
+ if country != "":
381
+ reflection_prompt = """Your task is to carefully read a source text and part of a translation of that text from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions for improving the translation.
382
+ The final style and tone of the translation should match the style of {target_lang} colloquially spoken in {country}.
383
+
384
+ The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>, and the part that has been translated
385
+ is delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS> within the source text. You can use the rest of the source text
386
+ as context for critiquing the translated part.
387
+
388
+ <SOURCE_TEXT>
389
+ {tagged_text}
390
+ </SOURCE_TEXT>
391
+
392
+ To reiterate, only part of the text is being translated, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
393
+ <TRANSLATE_THIS>
394
+ {chunk_to_translate}
395
+ </TRANSLATE_THIS>
396
+
397
+ The translation of the indicated part, delimited below by <TRANSLATION> and </TRANSLATION>, is as follows:
398
+ <TRANSLATION>
399
+ {translation_1_chunk}
400
+ </TRANSLATION>
401
+
402
+ When writing suggestions, pay attention to whether there are ways to improve the translation's:\n\
403
+ (i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
404
+ (ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
405
+ (iii) style (by ensuring the translations reflect the style of the source text and takes into account any cultural context),\n\
406
+ (iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by only ensuring you use equivalent idioms {target_lang}).\n\
407
+
408
+ Write a list of specific, helpful and constructive suggestions for improving the translation.
409
+ Each suggestion should address one specific part of the translation.
410
+ Output only the suggestions and nothing else."""
411
+
412
+ else:
413
+ reflection_prompt = """Your task is to carefully read a source text and part of a translation of that text from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions for improving the translation.
414
+
415
+ The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>, and the part that has been translated
416
+ is delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS> within the source text. You can use the rest of the source text
417
+ as context for critiquing the translated part.
418
+
419
+ <SOURCE_TEXT>
420
+ {tagged_text}
421
+ </SOURCE_TEXT>
422
+
423
+ To reiterate, only part of the text is being translated, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
424
+ <TRANSLATE_THIS>
425
+ {chunk_to_translate}
426
+ </TRANSLATE_THIS>
427
+
428
+ The translation of the indicated part, delimited below by <TRANSLATION> and </TRANSLATION>, is as follows:
429
+ <TRANSLATION>
430
+ {translation_1_chunk}
431
+ </TRANSLATION>
432
+
433
+ When writing suggestions, pay attention to whether there are ways to improve the translation's:\n\
434
+ (i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
435
+ (ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
436
+ (iii) style (by ensuring the translations reflect the style of the source text and takes into account any cultural context),\n\
437
+ (iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by only ensuring you use equivalent idioms {target_lang}).\n\
438
+
439
+ Write a list of specific, helpful and constructive suggestions for improving the translation.
440
+ Each suggestion should address one specific part of the translation.
441
+ Output only the suggestions and nothing else."""
442
+
443
+ reflection_chunks = []
444
+ for i in range(len(source_text_chunks)):
445
+ # Will translate chunk i
446
+ tagged_text = (
447
+ "".join(source_text_chunks[0:i])
448
+ + "<TRANSLATE_THIS>"
449
+ + source_text_chunks[i]
450
+ + "</TRANSLATE_THIS>"
451
+ + "".join(source_text_chunks[i + 1 :])
452
+ )
453
+ if country != "":
454
+ prompt = reflection_prompt.format(
455
+ source_lang=source_lang,
456
+ target_lang=target_lang,
457
+ tagged_text=tagged_text,
458
+ chunk_to_translate=source_text_chunks[i],
459
+ translation_1_chunk=translation_1_chunks[i],
460
+ country=country,
461
+ )
462
+ else:
463
+ prompt = reflection_prompt.format(
464
+ source_lang=source_lang,
465
+ target_lang=target_lang,
466
+ tagged_text=tagged_text,
467
+ chunk_to_translate=source_text_chunks[i],
468
+ translation_1_chunk=translation_1_chunks[i],
469
+ )
470
+
471
+ reflection = get_completion(prompt, system_message=system_message)
472
+ reflection_chunks.append(reflection)
473
+
474
+ return reflection_chunks
475
+
476
+
477
def multichunk_improve_translation(
    source_lang: str,
    target_lang: str,
    source_text_chunks: List[str],
    translation_1_chunks: List[str],
    reflection_chunks: List[str],
) -> List[str]:
    """
    Rewrite every translated chunk using the suggestions that were written for it.

    For each index of ``source_text_chunks`` a prompt is assembled that contains the
    whole source text (with the current chunk wrapped in <TRANSLATE_THIS> tags), the
    chunk itself, its first-pass translation and the matching suggestions. The prompt
    is passed to ``get_completion`` and the reply is collected.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for translation.
        source_text_chunks (List[str]): The source text divided into chunks.
        translation_1_chunks (List[str]): The initial translation of each chunk,
            indexed in parallel with ``source_text_chunks``.
        reflection_chunks (List[str]): Suggestions for improving each translated
            chunk, indexed in parallel with ``source_text_chunks``.

    Returns:
        List[str]: One ``get_completion`` result per source chunk, in chunk order.

    Raises:
        IndexError: If ``translation_1_chunks`` or ``reflection_chunks`` has fewer
            entries than ``source_text_chunks``.
    """

    system_message = f"You are an expert linguist, specializing in translation editing from {source_lang} to {target_lang}."

    # NOTE: the trailing backslash after item (ii) is a line continuation inside the
    # literal, so items (ii) and (iii) end up on a single line of the prompt.
    # NOTE(review): the <EXPERT_SUGGESTIONS> section is introduced as "expert
    # translations"; the wording is kept as-is because it is runtime text.
    improvement_prompt = """Your task is to carefully read, then improve, a translation from {source_lang} to {target_lang}, taking into
account a set of expert suggestions and constructive criticisms. Below, the source text, initial translation, and expert suggestions are provided.

The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>, and the part that has been translated
is delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS> within the source text. You can use the rest of the source text
as context, but need to provide a translation only of the part indicated by <TRANSLATE_THIS> and </TRANSLATE_THIS>.

<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>

To reiterate, only part of the text is being translated, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>

The translation of the indicated part, delimited below by <TRANSLATION> and </TRANSLATION>, is as follows:
<TRANSLATION>
{translation_1_chunk}
</TRANSLATION>

The expert translations of the indicated part, delimited below by <EXPERT_SUGGESTIONS> and </EXPERT_SUGGESTIONS>, is as follows:
<EXPERT_SUGGESTIONS>
{reflection_chunk}
</EXPERT_SUGGESTIONS>

Taking into account the expert suggestions rewrite the translation to improve it, paying attention
to whether there are ways to improve the translation's

(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules and ensuring there are no unnecessary repetitions), \
(iii) style (by ensuring the translations reflect the style of the source text)
(iv) terminology (inappropriate for context, inconsistent use), or
(v) other errors.

Output only the new translation of the indicated part and nothing else."""

    improved_chunks = []
    for position, current_chunk in enumerate(source_text_chunks):
        # Surround the chunk being edited with the untouched text before and after
        # it so the model sees the full document as context.
        preceding = "".join(source_text_chunks[:position])
        following = "".join(source_text_chunks[position + 1 :])
        tagged_text = "".join(
            (
                preceding,
                "<TRANSLATE_THIS>",
                current_chunk,
                "</TRANSLATE_THIS>",
                following,
            )
        )

        prompt = improvement_prompt.format(
            source_lang=source_lang,
            target_lang=target_lang,
            tagged_text=tagged_text,
            chunk_to_translate=current_chunk,
            translation_1_chunk=translation_1_chunks[position],
            reflection_chunk=reflection_chunks[position],
        )

        improved_chunks.append(
            get_completion(prompt, system_message=system_message)
        )

    return improved_chunks
561
+
562
+
563
def multichunk_translation(
    source_lang, target_lang, source_text_chunks, country: str = ""
):
    """
    Run the translate -> reflect -> improve sequence over a list of text chunks.

    Args:
        source_lang (str): The source language of the text chunks.
        target_lang (str): The target language for translation.
        source_text_chunks (List[str]): The list of source text chunks to be translated.
        country (str): Country specified for target language. It is forwarded to the
            reflection step only; the initial-translation and improvement steps do
            not receive it.

    Returns:
        List[str]: The value returned by multichunk_improve_translation, i.e. the
        improved translation for each source text chunk.
    """

    # Step 1: first-pass translation of every chunk.
    first_pass = multichunk_initial_translation(
        source_lang, target_lang, source_text_chunks
    )

    # Step 2: critique of each first-pass translation.
    suggestions = multichunk_reflect_on_translation(
        source_lang,
        target_lang,
        source_text_chunks,
        first_pass,
        country,
    )

    # Step 3: rewrite each chunk using its critique.
    return multichunk_improve_translation(
        source_lang=source_lang,
        target_lang=target_lang,
        source_text_chunks=source_text_chunks,
        translation_1_chunks=first_pass,
        reflection_chunks=suggestions,
    )
601
+
602
+
603
def calculate_chunk_size(token_count: int, token_limit: int) -> int:
    """
    Calculate a balanced chunk size that does not exceed the token limit.

    Args:
        token_count (int): The total number of tokens.
        token_limit (int): The maximum number of tokens allowed per chunk.

    Returns:
        int: The calculated chunk size.

    Description:
        If the token count is less than or equal to the token limit, the function returns the token count as the chunk size.
        Otherwise, it calculates the number of chunks needed to accommodate all the tokens within the token limit
        (the token count divided by the token limit, rounded up).
        The base chunk size is the token count divided by the number of chunks.
        If there are remaining tokens after dividing the token count by the token limit,
        the chunk size is adjusted by adding the remaining tokens divided by the number of chunks.
        That adjustment alone could exceed the limit (e.g. 999 tokens with a limit of 500 would give 748),
        so the result is capped at the token limit.

    Example:
        >>> calculate_chunk_size(1000, 500)
        500
        >>> calculate_chunk_size(1530, 500)
        389
        >>> calculate_chunk_size(2242, 500)
        496
        >>> calculate_chunk_size(999, 500)
        500
    """

    if token_count <= token_limit:
        return token_count

    # Ceiling division: the fewest chunks that can hold every token.
    num_chunks = (token_count + token_limit - 1) // token_limit
    chunk_size = token_count // num_chunks

    remaining_tokens = token_count % token_limit
    if remaining_tokens > 0:
        chunk_size += remaining_tokens // num_chunks

    # The remainder adjustment must never push a chunk past the per-chunk limit.
    return min(chunk_size, token_limit)
642
+
643
+
644
def translate(
    source_lang,
    target_lang,
    source_text,
    country,
    max_tokens=MAX_TOKENS_PER_CHUNK,
):
    """Translate the source_text from source_lang to target_lang.

    The text is measured with num_tokens_in_string. When it has fewer than
    ``max_tokens`` tokens it is handed to one_chunk_translate_text in one piece.
    Otherwise (including when the count equals ``max_tokens``) it is split into
    chunks and processed by multichunk_translation.

    Args:
        source_lang: The source language of the text.
        target_lang: The target language for translation.
        source_text: The text to translate.
        country: Country specified for the target language; passed through to the
            translation helpers.
        max_tokens: Token threshold below which the text is translated as a single
            chunk; also used as the per-chunk limit when splitting.

    Returns:
        The result of one_chunk_translate_text for short texts, or the translated
        chunks concatenated with no separator for long texts.
    """

    # The names handed to ic() are kept stable on purpose.
    # NOTE(review): if ic is icecream's, it echoes the argument expression in its
    # output - confirm before renaming.
    num_tokens_in_text = num_tokens_in_string(source_text)

    ic(num_tokens_in_text)

    # Short input: translate it as one chunk and return straight away.
    if num_tokens_in_text < max_tokens:
        ic("Translating text as single chunk")

        return one_chunk_translate_text(
            source_lang, target_lang, source_text, country
        )

    ic("Translating text as multiple chunks")

    token_size = calculate_chunk_size(
        token_count=num_tokens_in_text, token_limit=max_tokens
    )

    ic(token_size)

    # Split on token counts (gpt-4 tiktoken encoding) with no overlap between chunks.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name="gpt-4",
        chunk_size=token_size,
        chunk_overlap=0,
    )
    chunks = splitter.split_text(source_text)

    translated_chunks = multichunk_translation(
        source_lang, target_lang, chunks, country
    )

    return "".join(translated_chunks)