import gradio as gr
from simplemma import simple_tokenizer
from difflib import Differ
from icecream import ic
from app.webui.patch import (
    model_load,
    num_tokens_in_string,
    one_chunk_initial_translation,
    one_chunk_reflect_on_translation,
    one_chunk_improve_translation,
)
from app.webui.patch import (
    calculate_chunk_size,
    multichunk_initial_translation,
    multichunk_reflect_on_translation,
    multichunk_improve_translation,
)
from llama_index.core.node_parser import SentenceSplitter
from translatepy.translators.google import GoogleTranslate
from translatepy.exceptions import UnknownLanguage
from translatepy import Language

gtranslator = GoogleTranslate()
# Module-level progress bar shared by both translation pipelines.
progress = gr.Progress()


def tokenize(text):
    # Use simplemma to tokenize the text
    words = simple_tokenizer(text)

    # If the text contains spaces, rebuild the word/space sequence so the
    # diff operates on whole words rather than characters.
    if ' ' in text:
        tokens = []
        for word in words:
            tokens.append(word)
            # Avoid adding a space around apostrophe tokens (e.g. contractions)
            if not word.startswith("'") and not word.endswith("'"):
                tokens.append(' ')  # Add space after each word
        return tokens[:-1]  # Drop the trailing space
    else:
        return words


def diff_texts(text1, text2):
    tokens1 = tokenize(text1)
    tokens2 = tokenize(text2)

    d = Differ()
    diff_result = list(d.compare(tokens1, tokens2))

    highlighted_text = []
    for token in diff_result:
        word = token[2:]
        category = None
        if token[0] == '+':
            category = 'added'
        elif token[0] == '-':
            category = 'removed'
        elif token[0] == '?':
            continue  # Skip Differ's '?' hint lines
        highlighted_text.append((word, category))

    return highlighted_text


# modified from src.translation_agent.utils.translate
def translator(
    source_lang: str,
    target_lang: str,
    source_text: str,
    country: str,
    max_tokens: int = 1000,
):
    """Translate the source_text from source_lang to target_lang."""
    num_tokens_in_text = num_tokens_in_string(source_text)

    ic(num_tokens_in_text)

    if num_tokens_in_text < max_tokens:
        ic("Translating text as a single chunk")

        progress((1, 3), desc="First translation...")
        init_translation = one_chunk_initial_translation(
            source_lang, target_lang, source_text
        )

        progress((2, 3), desc="Reflection...")
        reflection = one_chunk_reflect_on_translation(
            source_lang, target_lang, source_text, init_translation, country
        )

        progress((3, 3), desc="Second translation...")
        final_translation = one_chunk_improve_translation(
            source_lang, target_lang, source_text, init_translation, reflection
        )

        return init_translation, reflection, final_translation

    else:
        ic("Translating text as multiple chunks")

        progress((1, 5), desc="Calculate chunk size...")
        token_size = calculate_chunk_size(
            token_count=num_tokens_in_text, token_limit=max_tokens
        )

        ic(token_size)

        # Split the source text into sentence-aligned chunks
        text_parser = SentenceSplitter(
            chunk_size=token_size,
        )

        progress((2, 5), desc="Split source text...")
        source_text_chunks = text_parser.split_text(source_text)

        progress((3, 5), desc="First translation...")
        translation_1_chunks = multichunk_initial_translation(
            source_lang, target_lang, source_text_chunks
        )

        init_translation = "".join(translation_1_chunks)

        progress((4, 5), desc="Reflection...")
        reflection_chunks = multichunk_reflect_on_translation(
            source_lang,
            target_lang,
            source_text_chunks,
            translation_1_chunks,
            country,
        )

        reflection = "".join(reflection_chunks)

        progress((5, 5), desc="Second translation...")
        translation_2_chunks = multichunk_improve_translation(
            source_lang,
            target_lang,
            source_text_chunks,
            translation_1_chunks,
            reflection_chunks,
        )

        final_translation = "".join(translation_2_chunks)

        return init_translation, reflection, final_translation
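
# Usage sketch (an illustrative assumption, not original app wiring):
# diff_texts() returns (token, category) pairs in the shape that
# gr.HighlightedText expects, so the initial and improved translations can be
# compared visually. Widget names and labels below are hypothetical.
def _diff_demo_ui():
    with gr.Blocks() as demo:
        first = gr.Textbox(label="Initial translation")
        second = gr.Textbox(label="Improved translation")
        highlight = gr.HighlightedText(
            label="Diff",
            combine_adjacent=True,
            color_map={"added": "green", "removed": "red"},
        )
        # Recompute the highlighted diff whenever the improved text changes
        second.change(diff_texts, inputs=[first, second], outputs=highlight)
    return demo
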
def translator_sec(
    endpoint2: str,
    model2: str,
    api_key2: str,
    context_window: int,
    num_output: int,
    source_lang: str,
    target_lang: str,
    source_text: str,
    country: str,
    max_tokens: int = 1000,
    gt: bool = False,
):
    """Translate the source_text from source_lang to target_lang,
    loading a second endpoint/model for the reflection and improvement steps.
    If gt is True, the initial translation is done with Google Translate."""
    num_tokens_in_text = num_tokens_in_string(source_text)

    ic(num_tokens_in_text)

    if num_tokens_in_text < max_tokens:
        ic("Translating text as a single chunk")

        progress((1, 3), desc="First translation...")
        if gt:
            try:
                language = Language(target_lang)
            except UnknownLanguage as e:
                raise gr.Error(f"Unknown target language: {e}")
            init_translation = gtranslator.translate(source_text, language).result
        else:
            init_translation = one_chunk_initial_translation(
                source_lang, target_lang, source_text
            )

        # Switch to the second model for reflection and improvement
        try:
            model_load(endpoint2, model2, api_key2, context_window, num_output)
        except Exception as e:
            raise gr.Error(f"An unexpected error occurred: {e}")

        progress((2, 3), desc="Reflection...")
        reflection = one_chunk_reflect_on_translation(
            source_lang, target_lang, source_text, init_translation, country
        )

        progress((3, 3), desc="Second translation...")
        final_translation = one_chunk_improve_translation(
            source_lang, target_lang, source_text, init_translation, reflection
        )

        return init_translation, reflection, final_translation

    else:
        ic("Translating text as multiple chunks")

        progress((1, 5), desc="Calculate chunk size...")
        token_size = calculate_chunk_size(
            token_count=num_tokens_in_text, token_limit=max_tokens
        )

        ic(token_size)

        # Split the source text into sentence-aligned chunks
        text_parser = SentenceSplitter(
            chunk_size=token_size,
        )

        progress((2, 5), desc="Split source text...")
        source_text_chunks = text_parser.split_text(source_text)

        progress((3, 5), desc="First translation...")
        if gt:
            try:
                language = Language(target_lang)
            except UnknownLanguage as e:
                raise gr.Error(f"Unknown target language: {e}")
            # translatepy translates one string at a time, so map over chunks
            translation_1_chunks = [
                gtranslator.translate(chunk, language).result
                for chunk in source_text_chunks
            ]
        else:
            translation_1_chunks = multichunk_initial_translation(
                source_lang, target_lang, source_text_chunks
            )

        # Switch to the second model for reflection and improvement
        try:
            model_load(endpoint2, model2, api_key2, context_window, num_output)
        except Exception as e:
            raise gr.Error(f"An unexpected error occurred: {e}")

        init_translation = "".join(translation_1_chunks)

        progress((4, 5), desc="Reflection...")
        reflection_chunks = multichunk_reflect_on_translation(
            source_lang,
            target_lang,
            source_text_chunks,
            translation_1_chunks,
            country,
        )

        reflection = "".join(reflection_chunks)

        progress((5, 5), desc="Second translation...")
        translation_2_chunks = multichunk_improve_translation(
            source_lang,
            target_lang,
            source_text_chunks,
            translation_1_chunks,
            reflection_chunks,
        )

        final_translation = "".join(translation_2_chunks)

        return init_translation, reflection, final_translation
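

# A minimal CLI sketch of the single-model pipeline (an illustrative
# assumption, not part of the web UI wiring). It presumes model_load() has
# already configured the backend elsewhere, and that the module-level
# gr.Progress instance tolerates being called outside a Gradio event.
if __name__ == "__main__":
    sample_text = "The quick brown fox jumps over the lazy dog."
    draft, critique, improved = translator(
        source_lang="English",
        target_lang="Spanish",
        source_text=sample_text,
        country="Mexico",
    )
    print("Initial translation:", draft)
    print("Reflection:", critique)
    print("Final translation:", improved)
    # Token-level view of what the reflection pass changed
    print(diff_texts(draft, improved))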