chandan2706 committed on
Commit 7d8c6d4 · verified · 1 Parent(s): d6a9fcb

create app.py


Translation Gradio app file

Files changed (1)
  1. app.py +219 -0
app.py ADDED
@@ -0,0 +1,219 @@
# ==============================================================================
# Replace the installed inference folder with the inference folder from IndicTrans2
import shutil
import os

# Source and destination paths
source_folder = "/content/Translation/IndicTrans2/inference"
destination_folder = "/usr/local/lib/python3.10/dist-packages"

# Get the folder name from the source path
folder_name = os.path.basename(source_folder)

# Create the new destination path with the folder name
new_destination_path = os.path.join(destination_folder, folder_name)

# Remove the destination folder if it already exists
if os.path.exists(new_destination_path):
    shutil.rmtree(new_destination_path)

# Copy the folder into site-packages
shutil.copytree(source_folder, new_destination_path)

# ==============================================================================
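# Note: copying the IndicTrans2 inference folder into site-packages is what lets the
# `from inference.engine import Model` import below resolve. The hard-coded paths above
# assume a Google Colab-style environment (/content, Python 3.10 dist-packages).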


# Import necessary libraries
import requests
from dotenv import load_dotenv
import os
import gradio as gr
import pandas as pd
from mahaNLP.tagger import EntityRecognizer
from inference.engine import Model
from ai4bharat.transliteration import XlitEngine

# Initialize models
model = Model(r"/content/Translation/indic-en/fairseq_model", model_type="fairseq")    # Indic -> English translation
model2 = EntityRecognizer()                                                             # Marathi named-entity recognizer
model4 = Model(r"/content/Translation/en-indic/fairseq_model", model_type="fairseq")   # English -> Indic translation
e = XlitEngine(beam_width=10, src_script_type="indic")                                  # Indic-script -> Roman transliteration

# Function to load Marathi suffixes from file
def load_marathi_suffixes(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        suffixes = [line.strip() for line in file]
    return suffixes

marathi_suffixes = load_marathi_suffixes(r"/content/Translation/marathi_stopwords.txt")

# Function to get suffix of a word
def get_suffix(word, suffixes):
    for suffix in suffixes:
        if word.endswith(suffix):
            main_word = word[:-len(suffix)].strip()
            return main_word, suffix
    return word, ''
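# Illustrative example (hypothetical input, not part of the original file): if "ने" is
# listed in the suffix file, get_suffix("रामने", marathi_suffixes) would return ("राम", "ने");
# a word with no matching suffix is returned unchanged with an empty suffix, e.g. ("घर", "").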

# Function to perform Named Entity Recognition (NER) and handle suffixes separately
def ner_tagger(text, suffixes):
    tag = model2.get_token_labels(text)
    tokens = [(row.word, row.entity_group) for row in tag.itertuples(index=False)]
    combined_tokens = []
    for word, entity in tokens:
        if entity == "Person":
            main_word, suffix = get_suffix(word, suffixes)
            combined_tokens.append((main_word, "Person"))
            if suffix:
                combined_tokens.append((suffix, "Other"))
        else:
            combined_tokens.append((word, entity))
    return combined_tokens

# Function to transliterate person tokens
def transliterate_person_tokens(tokens):
    transliterated_tokens = []
    for token, label in tokens:
        if label == 'Person':
            split_token = token.rsplit(' ', 1)
            if len(split_token) > 1:
                main_name, suffix = split_token
            else:
                main_name = split_token[0]
                suffix = ''
            transliterated_main_name = e.translit_sentence(main_name, 'mr')
            transliterated_token = transliterated_main_name + (' ' + suffix if suffix else '')
            transliterated_tokens.append((transliterated_token, label))
        else:
            transliterated_tokens.append((token, label))
    return transliterated_tokens
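# Illustrative example (hypothetical input/output; the exact romanisation depends on the
# XlitEngine model): transliterate_person_tokens([("राम", "Person"), ("गेला", "Other")])
# transliterates only the "Person" token, giving something like
# [("Ram", "Person"), ("गेला", "Other")].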

# Function to transliterate only person tags and maintain their positions
def transliterate_person_tags_only(text, suffixes):
    # Perform Named Entity Recognition (NER)
    tokens = ner_tagger(text, suffixes)

    # Transliterate person tags only
    transliterated_text = []
    original_person_tokens = {}  # To store the transliterated person tokens and their original positions
    index_offset = 0  # Offset for adjusting index when inserting placeholders
    for index, (token, label) in enumerate(tokens):
        if label == 'Person':
            # Transliterate the token
            transliterated_token = transliterate_person_tokens([(token, label)])
            original_person_tokens[index] = transliterated_token[0][0]  # Store transliterated token and original position
            transliterated_text.append(f"[PERSON{index}]")  # Add a placeholder for the transliterated person token
            index_offset += 1  # Increase offset after inserting a placeholder
        else:
            transliterated_text.append(token)

    return transliterated_text, original_person_tokens


def count_person_tags(text, suffixes):
    # Perform Named Entity Recognition (NER)
    tokens = ner_tagger(text, suffixes)

    # Count the number of person tags
    person_count = sum(1 for token, label in tokens if label == 'Person')

    return person_count


def process_text(text, src_lang, tgt_lang, suffixes):
    # Count the number of person tags
    num_person_tags = count_person_tags(text, suffixes)

    if num_person_tags > 6:
        # Translate the text directly
        translated_text = model.batch_translate([text], src_lang, tgt_lang)[0]
    else:
        # Transliterate person tags only
        transliterated_text, original_person_tokens = transliterate_person_tags_only(text, suffixes)

        # Translate the transliterated text
        translated_text = model.batch_translate([' '.join(transliterated_text)], src_lang, tgt_lang)[0]

        # Replace the placeholders with original person tokens in their original positions
        for index, transliterated_token in original_person_tokens.items():
            translated_text = translated_text.replace(f"[PERSON{index}]", transliterated_token, 1)

    return translated_text

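# Illustrative example (hypothetical input; actual output depends on the loaded models):
# process_text("राम घरी गेला", "mar_Deva", "eng_Latn", marathi_suffixes) translates the
# sentence Marathi -> English, routing person names through transliteration placeholders
# when the sentence contains six or fewer "Person" entities, and translating directly otherwise.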

def translate_sentence_with_replacements(model, df, input_text):
    # Translate the original sentence
    translated_sentence = model.batch_translate([input_text], "eng_Latn", "mar_Deva")[0]

    # Tokenize the original sentence
    sentence_tokens = input_text.lower().split()

    # Find all rows where eng_Latn phrases match as whole phrases in the original sentence
    mask = df['eng_Latn'].apply(lambda x: all(word in sentence_tokens for word in x.lower().split()))
    filtered_df = df[mask]

    # Store replacements
    replacements = {}
    for _, row in filtered_df.iterrows():
        mar_wrong_word = row['mar_Deva_wrong']
        mar_correct_word = row['mar_Deva']
        if isinstance(mar_wrong_word, str) and isinstance(mar_correct_word, str):
            if mar_wrong_word in translated_sentence and mar_wrong_word not in replacements:
                translated_sentence = translated_sentence.replace(mar_wrong_word, mar_correct_word)
                replacements[mar_wrong_word] = mar_correct_word

    return translated_sentence

# Read the replacement DataFrame (expected columns: 'eng_Latn', 'mar_Deva_wrong', 'mar_Deva')
df1 = pd.read_excel(r"/content/Translation/Final_Translation_Data.xlsx")


# Define the translation function for Marathi to English
def translate_marathi_to_english(input_text):
    translated_text_en = process_text(input_text, "mar_Deva", "eng_Latn", marathi_suffixes)
    return translated_text_en

# Define the translation function for English to Marathi
def translate_english_to_marathi(input_text):
    translated_text_mr = translate_sentence_with_replacements(model4, df1, input_text)
    return translated_text_mr

# Define the translation function for English to Hindi
def translate_english_to_hindi(input_text):
    translated_text_hi = model4.batch_translate([input_text], "eng_Latn", "hin_Deva")[0]
    return translated_text_hi

# Define the translation function for Hindi to English
def translate_hindi_to_english(input_text):
    translated_text_en = model.translate_paragraph(input_text, "hin_Deva", "eng_Latn")
    return translated_text_en

# Define the translation function for Gradio
def translate_with_gradio(input_text, src_lang, tgt_lang):
    if src_lang == "Marathi" and tgt_lang == "English":
        return translate_marathi_to_english(input_text)
    elif src_lang == "English" and tgt_lang == "Marathi":
        return translate_english_to_marathi(input_text)
    elif src_lang == "English" and tgt_lang == "Hindi":
        return translate_english_to_hindi(input_text)
    elif src_lang == "Hindi" and tgt_lang == "English":
        return translate_hindi_to_english(input_text)
    else:
        return "Translation direction not supported"

languages = ['English', 'Marathi', 'Hindi']

# Create the Gradio interface
demo = gr.Interface(
    fn=translate_with_gradio,
    inputs=[
        gr.Text(label="Enter text"),
        gr.Dropdown(label="From", choices=languages, value="Marathi"),
        gr.Dropdown(label="To", choices=languages, value="English"),
    ],
    outputs=gr.Textbox(label="Translation"),
    title="Multilingual Translation",
    description="Translate text between Marathi and English, and between Hindi and English",
)

# Launch the interface
demo.launch(share=True)
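
A minimal sketch (not part of the commit) of how the launched interface could be queried programmatically with the gradio_client package; the local URL and the default "/predict" endpoint name are assumptions about the running app, not taken from app.py:

from gradio_client import Client

# URL of the running app: whatever demo.launch() prints (local URL or temporary share link).
client = Client("http://127.0.0.1:7860/")

# Arguments follow the order of the Interface inputs: text, source language, target language.
result = client.predict("My name is Rahul.", "English", "Marathi", api_name="/predict")
print(result)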