VDNT11 committed on
Commit
64dfa3e
1 Parent(s): 4012b48

Update app.py

Files changed (1)
app.py +7 -18
app.py CHANGED
@@ -8,19 +8,16 @@ from gtts import gTTS
 import soundfile as sf
 from transformers import VitsTokenizer, VitsModel, set_seed
 
-# Clone and Install IndicTransToolkit repository
 if not os.path.exists('IndicTransToolkit'):
     os.system('git clone https://github.com/VarunGumma/IndicTransToolkit')
     os.system('cd IndicTransToolkit && python3 -m pip install --editable ./')
 
-# Ensure that IndicTransToolkit is installed and used properly
 from IndicTransToolkit import IndicProcessor
 
-# Initialize BLIP for image captioning
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cuda" if torch.cuda.is_available() else "cpu")
 
-# Function to generate captions
+@st.cache_resource
 def generate_caption(image_path):
     image = Image.open(image_path).convert("RGB")
     inputs = blip_processor(image, "image of", return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
@@ -29,7 +26,7 @@ def generate_caption(image_path):
     caption = blip_processor.decode(generated_ids[0], skip_special_tokens=True)
     return caption
 
-# Function for translation using IndicTrans2
+@st.cache_resource
 def translate_caption(caption, target_languages):
     # Load model and tokenizer
     model_name = "ai4bharat/indictrans2-en-indic-1B"
@@ -44,20 +41,16 @@ def translate_caption(caption, target_languages):
     # Source language (English)
     src_lang = "eng_Latn"
     DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-    model_IT2.to(DEVICE)  # Move model to the device
-
-    # Integrating with workflow now
+    model_IT2.to(DEVICE)
+
     input_sentences = [caption]
     translations = {}
 
     for tgt_lang in target_languages:
-        # Preprocess input sentences
         batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
 
-        # Tokenize the sentences and generate input encodings
         inputs = tokenizer_IT2(batch, truncation=True, padding="longest", return_tensors="pt").to(DEVICE)
 
-        # Generate translations using the model
         with torch.no_grad():
             generated_tokens = model_IT2.generate(
                 **inputs,
@@ -68,23 +61,21 @@ def translate_caption(caption, target_languages):
                 num_return_sequences=1,
             )
 
-        # Decode the generated tokens into text
         with tokenizer_IT2.as_target_tokenizer():
             generated_tokens = tokenizer_IT2.batch_decode(generated_tokens.detach().cpu().tolist(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
 
-        # Postprocess the translations
         translated_texts = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
         translations[tgt_lang] = translated_texts[0]
 
     return translations
 
-# Function to generate audio using gTTS
+@st.cache_resource
 def generate_audio_gtts(text, lang_code, output_file):
     tts = gTTS(text=text, lang=lang_code)
     tts.save(output_file)
     return output_file
 
-# Function to generate audio using Facebook MMS-TTS
+@st.cache_resource
 def generate_audio_fbmms(text, model_name, output_file):
     tokenizer = VitsTokenizer.from_pretrained(model_name)
     model = VitsModel.from_pretrained(model_name)
@@ -114,11 +105,10 @@ if uploaded_image is not None:
     # Select target languages for translation
     target_languages = st.multiselect(
         "Select target languages for translation",
-        ["hin_Deva", "mar_Deva", "guj_Gujr", "urd_Arab"],  # Add more languages as needed
+        ["hin_Deva", "mar_Deva", "guj_Gujr", "urd_Arab"],
         ["hin_Deva", "mar_Deva"]
     )
 
-    # Generate Translations
     if target_languages:
         st.write("Translating Caption...")
         translations = translate_caption(caption, target_languages)
@@ -126,7 +116,6 @@ if uploaded_image is not None:
        for lang, translation in translations.items():
            st.write(f"{lang}: {translation}")
 
-       # Default to gTTS for TTS
        for lang in target_languages:
            st.write(f"Using gTTS for {lang}...")
            lang_code = {
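The substance of this commit is the new @st.cache_resource decorators on the captioning, translation, and TTS helpers, plus removal of redundant comments. Below is a minimal sketch of what the decorator does, assuming Streamlit >= 1.18 (where st.cache_resource was introduced); the load_translation_model helper is illustrative and not part of this commit:

import streamlit as st
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

@st.cache_resource
def load_translation_model(model_name: str):
    # The decorated function runs once per unique argument set; later
    # calls with the same arguments return the cached objects instead of
    # re-executing the body on every Streamlit rerun.
    # IndicTrans2 checkpoints ship custom modeling code, hence
    # trust_remote_code=True.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
    return tokenizer, model

# Hypothetical usage mirroring the model name used in this app:
tokenizer_IT2, model_IT2 = load_translation_model("ai4bharat/indictrans2-en-indic-1B")

One caveat: st.cache_resource is intended for shared resources such as models; for functions that return per-input data (like translate_caption or the TTS helpers here), Streamlit's documentation recommends st.cache_data instead, since st.cache_resource hands the same uncopied object to every session.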