Update app.py

app.py CHANGED
@@ -8,19 +8,16 @@ from gtts import gTTS
 import soundfile as sf
 from transformers import VitsTokenizer, VitsModel, set_seed
 
-# Clone and Install IndicTransToolkit repository
 if not os.path.exists('IndicTransToolkit'):
     os.system('git clone https://github.com/VarunGumma/IndicTransToolkit')
     os.system('cd IndicTransToolkit && python3 -m pip install --editable ./')
 
-# Ensure that IndicTransToolkit is installed and used properly
 from IndicTransToolkit import IndicProcessor
 
-# Initialize BLIP for image captioning
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cuda" if torch.cuda.is_available() else "cpu")
 
-
+@st.cache_resource
 def generate_caption(image_path):
     image = Image.open(image_path).convert("RGB")
     inputs = blip_processor(image, "image of", return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
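The substance of this commit is the @st.cache_resource decorators: Streamlit re-executes app.py from top to bottom on every widget interaction, and the decorator memoizes a function's return value, keyed on its arguments, for the lifetime of the server process. A minimal sketch of the pattern, assuming the expensive from_pretrained calls are what you want cached (the load_blip helper below is illustrative, not code from this file):

import streamlit as st
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

@st.cache_resource
def load_blip():
    # Runs once per server process; later reruns get the same objects
    # back instead of reloading the weights.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-large"
    ).to("cuda" if torch.cuda.is_available() else "cpu")
    return processor, model

Note that the module-level from_pretrained calls above still execute on every rerun; only the decorated functions gain caching. Streamlit's docs also steer serializable return values (strings, file paths) toward st.cache_data and reserve st.cache_resource for unserializable globals like models; the functions decorated in this commit return strings and paths, so either decorator would work here.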
@@ -29,7 +26,7 @@ def generate_caption(image_path):
     caption = blip_processor.decode(generated_ids[0], skip_special_tokens=True)
     return caption
 
-
+@st.cache_resource
 def translate_caption(caption, target_languages):
     # Load model and tokenizer
     model_name = "ai4bharat/indictrans2-en-indic-1B"
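The diff elides the generation step between preparing inputs and the decode call above; with BLIP that middle typically looks like the sketch below (an assumption about the elided lines, not the file's actual code). Note also that the new decorator keys the cached caption on image_path, so a rerun with the same upload skips BLIP inference entirely.

# Hedged sketch of the elided generation step in generate_caption:
with torch.no_grad():
    generated_ids = blip_model.generate(**inputs, max_new_tokens=50)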
@@ -44,20 +41,16 @@ def translate_caption(caption, target_languages):
     # Source language (English)
     src_lang = "eng_Latn"
     DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-    model_IT2.to(DEVICE)
-
-    # Integrating with workflow now
+    model_IT2.to(DEVICE)
+
     input_sentences = [caption]
     translations = {}
 
     for tgt_lang in target_languages:
-        # Preprocess input sentences
         batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
 
-        # Tokenize the sentences and generate input encodings
         inputs = tokenizer_IT2(batch, truncation=True, padding="longest", return_tensors="pt").to(DEVICE)
 
-        # Generate translations using the model
         with torch.no_grad():
             generated_tokens = model_IT2.generate(
                 **inputs,
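tokenizer_IT2, model_IT2, and ip are loaded in lines the diff does not show. A self-contained sketch of the IndicTransToolkit round trip used in this loop, assuming the toolkit's standard IndicProcessor API:

from IndicTransToolkit import IndicProcessor

ip = IndicProcessor(inference=True)
# Attach the source/target language tags IndicTrans2 expects:
batch = ip.preprocess_batch(
    ["A cat sleeps on the windowsill."],
    src_lang="eng_Latn",
    tgt_lang="hin_Deva",
)
# `batch` is then fed to tokenizer_IT2(...) exactly as in the loop above,
# and ip.postprocess_batch(...) undoes the tagging on the decoded output.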
@@ -68,23 +61,21 @@ def translate_caption(caption, target_languages):
                 num_return_sequences=1,
             )
 
-        # Decode the generated tokens into text
         with tokenizer_IT2.as_target_tokenizer():
             generated_tokens = tokenizer_IT2.batch_decode(generated_tokens.detach().cpu().tolist(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
 
-        # Postprocess the translations
         translated_texts = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
         translations[tgt_lang] = translated_texts[0]
 
     return translations
 
-
+@st.cache_resource
 def generate_audio_gtts(text, lang_code, output_file):
     tts = gTTS(text=text, lang=lang_code)
     tts.save(output_file)
     return output_file
 
-
+@st.cache_resource
 def generate_audio_fbmms(text, model_name, output_file):
     tokenizer = VitsTokenizer.from_pretrained(model_name)
     model = VitsModel.from_pretrained(model_name)
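generate_audio_fbmms is cut off after the model load. With transformers' VITS checkpoints (the Facebook MMS TTS family) the body typically finishes by running the model and writing the waveform with soundfile, which app.py imports as sf; a sketch under those assumptions:

inputs = tokenizer(text=text, return_tensors="pt")
set_seed(555)  # VITS synthesis is stochastic; a fixed seed makes output repeatable
with torch.no_grad():
    outputs = model(**inputs)
waveform = outputs.waveform[0].cpu().numpy()
sf.write(output_file, waveform, samplerate=model.config.sampling_rate)
return output_file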
@@ -114,11 +105,10 @@ if uploaded_image is not None:
     # Select target languages for translation
     target_languages = st.multiselect(
         "Select target languages for translation",
-        ["hin_Deva", "mar_Deva", "guj_Gujr", "urd_Arab"],
+        ["hin_Deva", "mar_Deva", "guj_Gujr", "urd_Arab"],
         ["hin_Deva", "mar_Deva"]
     )
 
-    # Generate Translations
     if target_languages:
         st.write("Translating Caption...")
         translations = translate_caption(caption, target_languages)
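The two lists passed to st.multiselect are easy to misread: after the label, the first positional argument is the full option set and the second is the default selection, so the app offers four FLORES-200 tags with Hindi and Marathi preselected:

target_languages = st.multiselect(
    "Select target languages for translation",
    ["hin_Deva", "mar_Deva", "guj_Gujr", "urd_Arab"],  # options shown to the user
    ["hin_Deva", "mar_Deva"],                          # preselected defaults
)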
@@ -126,7 +116,6 @@ if uploaded_image is not None:
         for lang, translation in translations.items():
             st.write(f"{lang}: {translation}")
 
-        # Default to gTTS for TTS
        for lang in target_languages:
            st.write(f"Using gTTS for {lang}...")
            lang_code = {
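The diff truncates inside the lang_code dict. gTTS takes ISO-639-1 codes rather than FLORES-200 tags, so a plausible mapping for the four languages above would be the following (an assumption for illustration; the file's actual dict is not shown):

lang_code = {
    "hin_Deva": "hi",  # Hindi
    "mar_Deva": "mr",  # Marathi
    "guj_Gujr": "gu",  # Gujarati
    "urd_Arab": "ur",  # Urdu
}.get(lang, "en")  # fall back to English if a tag is unmapped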