Spaces:

chandan2706
/

Translation

Build error

App Files Files Community

chandan2706 committed on Apr 26, 2024

Commit

7d8c6d4

verified ·

1 Parent(s): d6a9fcb

create app.py

Browse files

translation gradio app file

Files changed (1) hide show

app.py +219 -0

app.py ADDED Viewed

	@@ -0,0 +1,219 @@

+================================================================================
+# Replace the installed "inference" folder with the inference folder of IndicTrans2.
+# Presumably this is what makes the later `from inference.engine import Model`
+# resolve to the IndicTrans2 code — confirm against the deployment environment.
+import shutil
+import os
+# Source and destination paths
+# NOTE(review): both paths are hard-coded (a /content checkout and a Python 3.10
+# dist-packages directory); the script fails if either does not exist.
+source_folder = "/content/Translation/IndicTrans2/inference"
+destination_folder = "/usr/local/lib/python3.10/dist-packages"
+# Get the folder name from the source path
+folder_name = os.path.basename(source_folder)
+# Create the new destination path with the folder name
+new_destination_path = os.path.join(destination_folder, folder_name)
+# Remove the destination folder if it exists (copytree below needs a fresh target)
+if os.path.exists(new_destination_path):
+    shutil.rmtree(new_destination_path)
+# Copy the folder; the source directory is left in place (this is a copy, not a move)
+shutil.copytree(source_folder, new_destination_path)
+================================================================================
+# Import necessary libraries
+import requests
+from dotenv import load_dotenv
+import os
+import gradio as gr
+import pandas as pd
+from mahaNLP.tagger import EntityRecognizer
+from inference.engine import Model
+from ai4bharat.transliteration import XlitEngine
+# Initialize models (all of this runs once, at import time)
+# Fairseq translation model loaded from the indic-en checkpoint directory;
+# used below for Marathi -> English and Hindi -> English.
+model = Model(r"/content/Translation/indic-en/fairseq_model", model_type="fairseq")
+# Entity recognizer; ner_tagger uses it to label each token of the input text.
+model2 = EntityRecognizer()
+# Fairseq translation model loaded from the en-indic checkpoint directory;
+# used below for English -> Marathi and English -> Hindi.
+model4 = Model(r"/content/Translation/en-indic/fairseq_model", model_type="fairseq")
+# Transliteration engine (beam width 10, Indic source script); used to
+# transliterate tokens tagged as person names.
+e = XlitEngine(beam_width=10, src_script_type="indic")
+# Function to load Marathi suffixes from file
+def load_marathi_suffixes(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        suffixes = [line.strip() for line in file]
+    return suffixes
+# Suffix list loaded once at import time and passed to process_text for
+# Marathi -> English requests. Note the source file is named "stopwords".
+marathi_suffixes = load_marathi_suffixes(r"/content/Translation/marathi_stopwords.txt")
+# Function to get suffix of a word
+def get_suffix(word, suffixes):
+    for suffix in suffixes:
+        if word.endswith(suffix):
+            main_word = word[:-len(suffix)].strip()
+            return main_word, suffix
+    return word, ''
+# Function to perform Named Entity Recognition (NER) and handle suffixes separately
+def ner_tagger(text, suffixes):
+    tag = model2.get_token_labels(text)
+    tokens = [(row.word, row.entity_group) for row in tag.itertuples(index=False)]
+    combined_tokens = []
+    for word, entity in tokens:
+        if entity == "Person":
+            main_word, suffix = get_suffix(word, suffixes)
+            combined_tokens.append((main_word, "Person"))
+            if suffix:
+                combined_tokens.append((suffix, "Other"))
+        else:
+            combined_tokens.append((word, entity))
+    return combined_tokens
+# Function to transliterate person tokens
+def transliterate_person_tokens(tokens):
+    transliterated_tokens = []
+    for token, label in tokens:
+        if label == 'Person':
+            split_token = token.rsplit(' ', 1)
+            if len(split_token) > 1:
+                main_name, suffix = split_token
+            else:
+                main_name = split_token[0]
+                suffix = ''
+            transliterated_main_name = e.translit_sentence(main_name, 'mr')
+            transliterated_token = transliterated_main_name + (' ' + suffix if suffix else '')
+            transliterated_tokens.append((transliterated_token, label))
+        else:
+            transliterated_tokens.append((token, label))
+    return transliterated_tokens
+# Function to transliterate only person tags and maintain their positions
+def transliterate_person_tags_only(text, suffixes):
+    # Perform Named Entity Recognition (NER)
+    tokens = ner_tagger(text, suffixes)
+    # Transliterate person tags only
+    transliterated_text = []
+    original_person_tokens = {}  # To store the transliterated person tokens and their original positions
+    index_offset = 0  # Offset for adjusting index when inserting placeholders
+    for index, (token, label) in enumerate(tokens):
+        if label == 'Person':
+            # Transliterate the token
+            transliterated_token = transliterate_person_tokens([(token, label)])
+            original_person_tokens[index] = transliterated_token[0][0]  # Store transliterated token and original position
+            transliterated_text.append(f"[PERSON{index}]")  # Add a placeholder for the transliterated person token
+            index_offset += 1  # Increase offset after inserting a placeholder
+        else:
+            transliterated_text.append(token)
+    return transliterated_text, original_person_tokens
+def count_person_tags(text, suffixes):
+    # Perform Named Entity Recognition (NER)
+    tokens = ner_tagger(text, suffixes)
+    # Count the number of person tags
+    person_count = sum(1 for token, label in tokens if label == 'Person')
+    return person_count
+def process_text(text, src_lang, tgt_lang, suffixes):
+    # Count the number of person tags
+    num_person_tags = count_person_tags(text, suffixes)
+    if num_person_tags > 6:
+        # Translate the text directly
+        translated_text = model.batch_translate([text], src_lang, tgt_lang)[0]
+    else:
+        # Transliterate person tags only
+        transliterated_text, original_person_tokens = transliterate_person_tags_only(text, suffixes)
+        # Translate the transliterated text
+        translated_text = model.batch_translate([' '.join(transliterated_text)], src_lang, tgt_lang)[0]
+        # Replace the placeholders with original person tokens in their original positions
+        for index, transliterated_token in original_person_tokens.items():
+            translated_text = translated_text.replace(f"[PERSON{index}]", transliterated_token, 1)
+    return translated_text
+def translate_sentence_with_replacements(model, df, input_text):
+    # Translate the original sentence
+    translated_sentence = model.batch_translate([input_text], "eng_Latn", "mar_Deva")[0]
+    # Tokenize the original sentence
+    sentence_tokens = input_text.lower().split()
+    # Find all rows where eng_Latn phrases match as whole phrases in the original sentence
+    mask = df['eng_Latn'].apply(lambda x: all(word in sentence_tokens for word in x.lower().split()))
+    filtered_df = df[mask]
+    # Store replacements
+    replacements = {}
+    for _, row in filtered_df.iterrows():
+        mar_wrong_word = row['mar_Deva_wrong']
+        mar_correct_word = row['mar_Deva']
+        if isinstance(mar_wrong_word, str) and isinstance(mar_correct_word, str):
+            if mar_wrong_word in translated_sentence and mar_wrong_word not in replacements:
+                translated_sentence = translated_sentence.replace(mar_wrong_word, mar_correct_word)
+                replacements[mar_wrong_word] = mar_correct_word
+    return translated_sentence
+# Read the DataFrame
+# Loaded once at import time; translate_english_to_marathi passes it to
+# translate_sentence_with_replacements, which reads the columns
+# 'eng_Latn', 'mar_Deva_wrong' and 'mar_Deva'.
+df1 = pd.read_excel(r"/content/Translation/Final_Translation_Data.xlsx")
+# Function to translate Marathi to English
+def translate_marathi_to_english(input_text):
+    translated_text_en = process_text(input_text, "mar_Deva", "eng_Latn", marathi_suffixes)
+    return translated_text_en
+# Define the translation function for English to Marathi
+def translate_english_to_marathi(input_text):
+    translated_text_mr = translate_sentence_with_replacements(model4, df1, input_text)
+    return translated_text_mr
+# Define the translation function for English to Hindi
+def translate_english_to_hindi(input_text):
+    translated_text_hi = model4.batch_translate(input_text, "eng_Latn", "hin_Deva")[0]
+    return translated_text_hi
+# Define the translation function for Hindi to English
+def translate_hindi_to_english(input_text):
+    translated_text_en = model.translate_paragraph(input_text, "hin_Deva", "eng_Latn")
+    return translated_text_en
+# Define the translation function for Gradio
+def translate_with_gradio(input_text, src_lang, tgt_lang):
+    if src_lang == "Marathi" and tgt_lang == "English":
+        return translate_marathi_to_english(input_text)
+    elif src_lang == "English" and tgt_lang == "Marathi":
+        return translate_english_to_marathi(input_text)
+    elif src_lang == "English" and tgt_lang == "Hindi":
+        return translate_english_to_hindi(input_text)
+    elif src_lang == "Hindi" and tgt_lang == "English":
+        return translate_hindi_to_english(input_text)
+    else:
+        return "Translation direction not supported"
+# Choices offered in both language dropdowns. Not every pair is supported:
+# translate_with_gradio returns an error string for e.g. Marathi -> Hindi.
+languages = ['English', 'Marathi', 'Hindi']
+# Create the Gradio interface
+# The three inputs map positionally onto translate_with_gradio's parameters
+# (input_text, src_lang, tgt_lang); the defaults select Marathi -> English.
+demo = gr.Interface(
+    fn=translate_with_gradio,
+    inputs=[
+        gr.Text(label="Enter text"),
+        gr.Dropdown(label="From",choices=languages,value="Marathi",),
+        gr.Dropdown(label="To",choices=languages,value="English")
+    ],
+    outputs=gr.Textbox(label="Translation"),
+    title="Multilingual Translation",
+    description="Translate text between Marathi to English & English to Marathi and Hindi to English & English to Hindi",
+)
+# Launch the interface
+# NOTE(review): share=True requests a public share link; confirm this is
+# wanted (or has any effect) in the hosting environment.
+demo.launch(share=True)