# NOTE: The lines above this file originally contained Hugging Face Spaces
# page chrome ("Spaces:" / "Build error" build-log text) captured during
# extraction — they are not part of the program.
# ======================================================================================
# Replace the installed `inference` package with the inference folder from the
# IndicTrans2 checkout so that `from inference.engine import Model` below
# resolves to the IndicTrans2 code.
import shutil
import os
# Source and destination paths (Colab layout; dist-packages is site-packages here)
source_folder = "/content/Translation/IndicTrans2/inference"
destination_folder = "/usr/local/lib/python3.10/dist-packages"
# Get the folder name from the source path
folder_name = os.path.basename(source_folder)
# Create the new destination path with the folder name
new_destination_path = os.path.join(destination_folder, folder_name)
# Remove the destination folder if it exists (copytree fails on an existing target)
if os.path.exists(new_destination_path):
    shutil.rmtree(new_destination_path)
# Copy (not move) the folder into place; the source tree is left intact
shutil.copytree(source_folder, new_destination_path)
# ================================================================================
# Import necessary libraries
import requests
from dotenv import load_dotenv
import os
import gradio as gr
import pandas as pd
from mahaNLP.tagger import EntityRecognizer
from inference.engine import Model
from ai4bharat.transliteration import XlitEngine
# Initialize models
# IndicTrans2 fairseq checkpoint — Indic -> English (per the "indic-en" path)
model = Model(r"/content/Translation/indic-en/fairseq_model", model_type="fairseq")
# Marathi NER tagger used to find person names before translation
model2 = EntityRecognizer()
# IndicTrans2 fairseq checkpoint — English -> Indic (per the "en-indic" path)
model4 = Model(r"/content/Translation/en-indic/fairseq_model", model_type="fairseq")
# Transliteration engine for Indic-script input (used for person names)
e = XlitEngine(beam_width=10, src_script_type="indic")
# Function to load Marathi suffixes from file
def load_marathi_suffixes(file_path):
    """Load Marathi suffixes, one per line, from a UTF-8 text file.

    Blank lines are skipped: an empty suffix would make ``get_suffix``
    match every word (``str.endswith('')`` is always True) and strip
    the whole word down to ``''``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]
# Suffix list used by get_suffix to split honorifics off person names
marathi_suffixes = load_marathi_suffixes(r"/content/Translation/marathi_stopwords.txt")
# Function to get suffix of a word
def get_suffix(word, suffixes):
    """Split ``word`` into (stem, suffix) using the first matching suffix.

    Empty suffixes are skipped — every string ends with ``''``, so a stray
    blank entry in ``suffixes`` would otherwise swallow the entire word and
    return ``('', '')``. Returns ``(word, '')`` when nothing matches.
    """
    for suffix in suffixes:
        if suffix and word.endswith(suffix):
            main_word = word[:-len(suffix)].strip()
            return main_word, suffix
    return word, ''
# Function to perform Named Entity Recognition (NER) and handle suffixes separately
def ner_tagger(text, suffixes):
    """Tag `text` with the mahaNLP entity recognizer, splitting suffixes
    off tokens tagged as Person.

    Returns a list of (token, entity_group) pairs. A Person token with a
    recognized suffix is emitted as the stem tagged "Person" followed by
    the suffix tagged "Other"; all other tokens pass through unchanged.
    """
    labelled = model2.get_token_labels(text)
    result = []
    for row in labelled.itertuples(index=False):
        token, entity = row.word, row.entity_group
        if entity != "Person":
            result.append((token, entity))
            continue
        stem, suffix = get_suffix(token, suffixes)
        result.append((stem, "Person"))
        if suffix:
            result.append((suffix, "Other"))
    return result
# Function to transliterate person tokens
def transliterate_person_tokens(tokens):
    """Transliterate the name part of every (token, 'Person') pair via the
    XlitEngine; non-Person pairs pass through unchanged.

    If the token contains a space, the text after the last space is treated
    as a suffix and re-attached without transliteration.
    """
    out = []
    for token, label in tokens:
        if label != 'Person':
            out.append((token, label))
            continue
        parts = token.rsplit(' ', 1)
        main_name = parts[0]
        suffix = parts[1] if len(parts) > 1 else ''
        name_translit = e.translit_sentence(main_name, 'mr')
        rebuilt = name_translit + (' ' + suffix if suffix else '')
        out.append((rebuilt, label))
    return out
# Function to transliterate only person tags and maintain their positions
def transliterate_person_tags_only(text, suffixes):
    """Replace each Person token in `text` with a positional placeholder.

    Runs NER, transliterates every Person token, and substitutes it with
    "[PERSONi]" (i = token index) so the surrounding text can be machine
    translated without mangling names; the transliterated names are
    returned separately for re-insertion after translation.

    Fix: removed the dead `index_offset` counter from the original — it was
    incremented but never read.

    Returns:
        (tokens_with_placeholders, {token_index: transliterated_name})
    """
    # Perform Named Entity Recognition (NER)
    tokens = ner_tagger(text, suffixes)
    transliterated_text = []
    original_person_tokens = {}  # token index -> transliterated person name
    for index, (token, label) in enumerate(tokens):
        if label == 'Person':
            # Transliterate just this token
            transliterated_token = transliterate_person_tokens([(token, label)])
            original_person_tokens[index] = transliterated_token[0][0]
            # Placeholder keyed by the token's position
            transliterated_text.append(f"[PERSON{index}]")
        else:
            transliterated_text.append(token)
    return transliterated_text, original_person_tokens
def count_person_tags(text, suffixes):
    """Return how many NER tokens in `text` are tagged as Person."""
    labelled = ner_tagger(text, suffixes)
    return len([tok for tok, tag in labelled if tag == 'Person'])
def process_text(text, src_lang, tgt_lang, suffixes):
    """Translate `text`, preserving person names via transliteration.

    With more than 6 Person entities the text is translated as-is
    (presumably to avoid flooding the model with placeholders — the
    original heuristic is kept). Otherwise each person name becomes a
    [PERSONi] placeholder, the text is translated, and the transliterated
    names are substituted back at their placeholders.
    """
    if count_person_tags(text, suffixes) > 6:
        # Too many names: translate directly
        return model.batch_translate([text], src_lang, tgt_lang)[0]
    # Swap person names for placeholders, translate, then restore the names
    placeholder_tokens, person_names = transliterate_person_tags_only(text, suffixes)
    translated = model.batch_translate([' '.join(placeholder_tokens)], src_lang, tgt_lang)[0]
    for idx, name in person_names.items():
        translated = translated.replace(f"[PERSON{idx}]", name, 1)
    return translated
def translate_sentence_with_replacements(model, df, input_text):
    """Translate English -> Marathi, then patch known mistranslations.

    `df` is expected to hold columns 'eng_Latn', 'mar_Deva_wrong' and
    'mar_Deva'; rows whose English phrase words all occur in the input
    select wrong->correct Marathi replacements applied to the output.
    """
    # Translate the original sentence
    translated_sentence = model.batch_translate([input_text], "eng_Latn", "mar_Deva")[0]
    # Tokenize the original sentence (lowercased whole-word tokens)
    sentence_tokens = input_text.lower().split()
    # Find all rows where eng_Latn phrases match as whole phrases in the original sentence
    # NOTE(review): this actually checks that each WORD of the phrase occurs
    # somewhere in the sentence, not that the phrase occurs contiguously —
    # confirm that word-wise matching is the intended behavior.
    mask = df['eng_Latn'].apply(lambda x: all(word in sentence_tokens for word in x.lower().split()))
    filtered_df = df[mask]
    # Track replacements already applied so each wrong word is processed once
    replacements = {}
    for _, row in filtered_df.iterrows():
        mar_wrong_word = row['mar_Deva_wrong']
        mar_correct_word = row['mar_Deva']
        # Skip NaN / non-string spreadsheet cells
        if isinstance(mar_wrong_word, str) and isinstance(mar_correct_word, str):
            if mar_wrong_word in translated_sentence and mar_wrong_word not in replacements:
                # NOTE(review): str.replace with no count replaces EVERY occurrence
                translated_sentence = translated_sentence.replace(mar_wrong_word, mar_correct_word)
                replacements[mar_wrong_word] = mar_correct_word
    return translated_sentence
# Read the DataFrame of curated wrong->correct Marathi translations
# (columns used downstream: eng_Latn, mar_Deva_wrong, mar_Deva)
df1 = pd.read_excel(r"/content/Translation/Final_Translation_Data.xlsx")
# Function to translate Marathi to English
def translate_marathi_to_english(input_text):
    """Marathi -> English translation with person-name preservation."""
    return process_text(input_text, "mar_Deva", "eng_Latn", marathi_suffixes)
# Define the translation function for English to Marathi
def translate_english_to_marathi(input_text):
    """English -> Marathi translation with dictionary-based post-editing."""
    return translate_sentence_with_replacements(model4, df1, input_text)
# Define the translation function for English to Hindi
def translate_english_to_hindi(input_text):
    """English -> Hindi translation via the en-indic model.

    Bug fix: `batch_translate` takes a LIST of sentences — every other call
    site in this file passes `[text]` — but the original passed the raw
    string, handing the model an iterable of characters.
    """
    translated_text_hi = model4.batch_translate([input_text], "eng_Latn", "hin_Deva")[0]
    return translated_text_hi
# Define the translation function for Hindi to English
def translate_hindi_to_english(input_text):
    """Hindi -> English translation of a whole paragraph."""
    return model.translate_paragraph(input_text, "hin_Deva", "eng_Latn")
# Define the translation function for Gradio
def translate_with_gradio(input_text, src_lang, tgt_lang):
    """Route a translation request to the handler for (src_lang, tgt_lang).

    Supported directions: Marathi<->English and Hindi<->English; any other
    pair yields an explanatory message.
    """
    direction = (src_lang, tgt_lang)
    if direction == ("Marathi", "English"):
        return translate_marathi_to_english(input_text)
    if direction == ("English", "Marathi"):
        return translate_english_to_marathi(input_text)
    if direction == ("English", "Hindi"):
        return translate_english_to_hindi(input_text)
    if direction == ("Hindi", "English"):
        return translate_hindi_to_english(input_text)
    return "Translation direction not supported"
# Languages offered in both dropdowns (only the four pairs handled by
# translate_with_gradio actually translate)
languages = ['English', 'Marathi', 'Hindi']
# Create the Gradio interface
demo = gr.Interface(
    fn=translate_with_gradio,
    inputs=[
        gr.Text(label="Enter text"),
        gr.Dropdown(label="From",choices=languages,value="Marathi",),
        gr.Dropdown(label="To",choices=languages,value="English")
    ],
    outputs=gr.Textbox(label="Translation"),
    title="Multilingual Translation",
    description="Translate text between Marathi to English & English to Marathi and Hindi to English & English to Hindi",
)
# Launch the interface
# share=True publishes a temporary public Gradio link (needed on Colab)
demo.launch(share=True)