# NOTE: The lines above this file originally contained Hugging Face Spaces
# page chrome ("Spaces:" / "Build error" build-log text) captured during
# extraction — they are not part of the program.
# ======================================================================================
# Replace the installed `inference` package with the inference folder from the
# IndicTrans2 checkout so that `from inference.engine import Model` below
# resolves to the IndicTrans2 code.
import shutil
import os
# Source and destination paths (Colab layout; dist-packages is site-packages here)
source_folder = "/content/Translation/IndicTrans2/inference"
destination_folder = "/usr/local/lib/python3.10/dist-packages"
# Get the folder name from the source path
folder_name = os.path.basename(source_folder)
# Create the new destination path with the folder name
new_destination_path = os.path.join(destination_folder, folder_name)
# Remove the destination folder if it exists (copytree fails on an existing target)
if os.path.exists(new_destination_path):
    shutil.rmtree(new_destination_path)
# Copy (not move) the folder into place; the source tree is left intact
shutil.copytree(source_folder, new_destination_path)
# ================================================================================
# Import necessary libraries
import requests
from dotenv import load_dotenv
import os
import gradio as gr
import pandas as pd
from mahaNLP.tagger import EntityRecognizer
from inference.engine import Model
from ai4bharat.transliteration import XlitEngine
# Initialize models
# IndicTrans2 fairseq checkpoint — Indic -> English (per the "indic-en" path)
model = Model(r"/content/Translation/indic-en/fairseq_model", model_type="fairseq")
# Marathi NER tagger used to find person names before translation
model2 = EntityRecognizer()
# IndicTrans2 fairseq checkpoint — English -> Indic (per the "en-indic" path)
model4 = Model(r"/content/Translation/en-indic/fairseq_model", model_type="fairseq")
# Transliteration engine for Indic-script input (used for person names)
e = XlitEngine(beam_width=10, src_script_type="indic")
# Function to load Marathi suffixes from file
def load_marathi_suffixes(file_path):
    """Load Marathi suffixes, one per line, from a UTF-8 text file.

    Blank lines are skipped: an empty suffix would make ``get_suffix``
    match every word (``str.endswith('')`` is always True) and strip
    the whole word down to ``''``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]
# Suffix list used by get_suffix to split honorifics off person names
marathi_suffixes = load_marathi_suffixes(r"/content/Translation/marathi_stopwords.txt")
# Function to get suffix of a word
def get_suffix(word, suffixes):
    """Split ``word`` into (stem, suffix) using the first matching suffix.

    Empty suffixes are skipped — every string ends with ``''``, so a stray
    blank entry in ``suffixes`` would otherwise swallow the entire word and
    return ``('', '')``. Returns ``(word, '')`` when nothing matches.
    """
    for suffix in suffixes:
        if suffix and word.endswith(suffix):
            main_word = word[:-len(suffix)].strip()
            return main_word, suffix
    return word, ''
# Function to perform Named Entity Recognition (NER) and handle suffixes separately
def ner_tagger(text, suffixes):
    """Tag `text` with the mahaNLP entity recognizer, splitting suffixes
    off tokens tagged as Person.

    Returns a list of (token, entity_group) pairs. A Person token with a
    recognized suffix is emitted as the stem tagged "Person" followed by
    the suffix tagged "Other"; all other tokens pass through unchanged.
    """
    labelled = model2.get_token_labels(text)
    result = []
    for row in labelled.itertuples(index=False):
        token, entity = row.word, row.entity_group
        if entity != "Person":
            result.append((token, entity))
            continue
        stem, suffix = get_suffix(token, suffixes)
        result.append((stem, "Person"))
        if suffix:
            result.append((suffix, "Other"))
    return result
# Function to transliterate person tokens
def transliterate_person_tokens(tokens):
    """Transliterate the name part of every (token, 'Person') pair via the
    XlitEngine; non-Person pairs pass through unchanged.

    If the token contains a space, the text after the last space is treated
    as a suffix and re-attached without transliteration.
    """
    out = []
    for token, label in tokens:
        if label != 'Person':
            out.append((token, label))
            continue
        parts = token.rsplit(' ', 1)
        main_name = parts[0]
        suffix = parts[1] if len(parts) > 1 else ''
        name_translit = e.translit_sentence(main_name, 'mr')
        rebuilt = name_translit + (' ' + suffix if suffix else '')
        out.append((rebuilt, label))
    return out
# Function to transliterate only person tags and maintain their positions
def transliterate_person_tags_only(text, suffixes):
    """Replace each Person token in `text` with a positional placeholder.

    Runs NER, transliterates every Person token, and substitutes it with
    "[PERSONi]" (i = token index) so the surrounding text can be machine
    translated without mangling names; the transliterated names are
    returned separately for re-insertion after translation.

    Fix: removed the dead `index_offset` counter from the original — it was
    incremented but never read.

    Returns:
        (tokens_with_placeholders, {token_index: transliterated_name})
    """
    # Perform Named Entity Recognition (NER)
    tokens = ner_tagger(text, suffixes)
    transliterated_text = []
    original_person_tokens = {}  # token index -> transliterated person name
    for index, (token, label) in enumerate(tokens):
        if label == 'Person':
            # Transliterate just this token
            transliterated_token = transliterate_person_tokens([(token, label)])
            original_person_tokens[index] = transliterated_token[0][0]
            # Placeholder keyed by the token's position
            transliterated_text.append(f"[PERSON{index}]")
        else:
            transliterated_text.append(token)
    return transliterated_text, original_person_tokens
def count_person_tags(text, suffixes):
    """Return how many NER tokens in `text` are tagged as Person."""
    labelled = ner_tagger(text, suffixes)
    return len([tok for tok, tag in labelled if tag == 'Person'])
def process_text(text, src_lang, tgt_lang, suffixes):
    """Translate `text`, preserving person names via transliteration.

    With more than 6 Person entities the text is translated as-is
    (presumably to avoid flooding the model with placeholders — the
    original heuristic is kept). Otherwise each person name becomes a
    [PERSONi] placeholder, the text is translated, and the transliterated
    names are substituted back at their placeholders.
    """
    if count_person_tags(text, suffixes) > 6:
        # Too many names: translate directly
        return model.batch_translate([text], src_lang, tgt_lang)[0]
    # Swap person names for placeholders, translate, then restore the names
    placeholder_tokens, person_names = transliterate_person_tags_only(text, suffixes)
    translated = model.batch_translate([' '.join(placeholder_tokens)], src_lang, tgt_lang)[0]
    for idx, name in person_names.items():
        translated = translated.replace(f"[PERSON{idx}]", name, 1)
    return translated
def translate_sentence_with_replacements(model, df, input_text):
    """Translate English -> Marathi, then patch known mistranslations.

    `df` is expected to hold columns 'eng_Latn', 'mar_Deva_wrong' and
    'mar_Deva'; rows whose English phrase words all occur in the input
    select wrong->correct Marathi replacements applied to the output.
    """
    # Translate the original sentence
    translated_sentence = model.batch_translate([input_text], "eng_Latn", "mar_Deva")[0]
    # Tokenize the original sentence (lowercased whole-word tokens)
    sentence_tokens = input_text.lower().split()
    # Find all rows where eng_Latn phrases match as whole phrases in the original sentence
    # NOTE(review): this actually checks that each WORD of the phrase occurs
    # somewhere in the sentence, not that the phrase occurs contiguously —
    # confirm that word-wise matching is the intended behavior.
    mask = df['eng_Latn'].apply(lambda x: all(word in sentence_tokens for word in x.lower().split()))
    filtered_df = df[mask]
    # Track replacements already applied so each wrong word is processed once
    replacements = {}
    for _, row in filtered_df.iterrows():
        mar_wrong_word = row['mar_Deva_wrong']
        mar_correct_word = row['mar_Deva']
        # Skip NaN / non-string spreadsheet cells
        if isinstance(mar_wrong_word, str) and isinstance(mar_correct_word, str):
            if mar_wrong_word in translated_sentence and mar_wrong_word not in replacements:
                # NOTE(review): str.replace with no count replaces EVERY occurrence
                translated_sentence = translated_sentence.replace(mar_wrong_word, mar_correct_word)
                replacements[mar_wrong_word] = mar_correct_word
    return translated_sentence
# Read the DataFrame of curated wrong->correct Marathi translations
# (columns used downstream: eng_Latn, mar_Deva_wrong, mar_Deva)
df1 = pd.read_excel(r"/content/Translation/Final_Translation_Data.xlsx")
# Function to translate Marathi to English
def translate_marathi_to_english(input_text):
    """Marathi -> English translation with person-name preservation."""
    return process_text(input_text, "mar_Deva", "eng_Latn", marathi_suffixes)
# Define the translation function for English to Marathi
def translate_english_to_marathi(input_text):
    """English -> Marathi translation with dictionary-based post-editing."""
    return translate_sentence_with_replacements(model4, df1, input_text)
# Define the translation function for English to Hindi
def translate_english_to_hindi(input_text):
    """English -> Hindi translation via the en-indic model.

    Bug fix: `batch_translate` takes a LIST of sentences — every other call
    site in this file passes `[text]` — but the original passed the raw
    string, handing the model an iterable of characters.
    """
    translated_text_hi = model4.batch_translate([input_text], "eng_Latn", "hin_Deva")[0]
    return translated_text_hi
# Define the translation function for Hindi to English
def translate_hindi_to_english(input_text):
    """Hindi -> English translation of a whole paragraph."""
    return model.translate_paragraph(input_text, "hin_Deva", "eng_Latn")
# Define the translation function for Gradio
def translate_with_gradio(input_text, src_lang, tgt_lang):
    """Route a translation request to the handler for (src_lang, tgt_lang).

    Supported directions: Marathi<->English and Hindi<->English; any other
    pair yields an explanatory message.
    """
    direction = (src_lang, tgt_lang)
    if direction == ("Marathi", "English"):
        return translate_marathi_to_english(input_text)
    if direction == ("English", "Marathi"):
        return translate_english_to_marathi(input_text)
    if direction == ("English", "Hindi"):
        return translate_english_to_hindi(input_text)
    if direction == ("Hindi", "English"):
        return translate_hindi_to_english(input_text)
    return "Translation direction not supported"
# Languages offered in both dropdowns (only the four pairs handled by
# translate_with_gradio actually translate)
languages = ['English', 'Marathi', 'Hindi']
# Create the Gradio interface
demo = gr.Interface(
    fn=translate_with_gradio,
    inputs=[
        gr.Text(label="Enter text"),
        gr.Dropdown(label="From",choices=languages,value="Marathi",),
        gr.Dropdown(label="To",choices=languages,value="English")
    ],
    outputs=gr.Textbox(label="Translation"),
    title="Multilingual Translation",
    description="Translate text between Marathi to English & English to Marathi and Hindi to English & English to Hindi",
)
# Launch the interface
# share=True publishes a temporary public Gradio link (needed on Colab)
demo.launch(share=True)