import os
import subprocess
import sys
# Clone required repositories
def clone_repositories():
    repos = [
        ('https://github.com/AI4Bharat/IndicTrans2.git', 'indictrans2'),
        ('https://github.com/VarunGumma/IndicTransToolkit.git', 'indictranstoolkit')
    ]
    for repo_url, repo_dir in repos:
        if not os.path.exists(repo_dir):
            subprocess.check_call(['git', 'clone', repo_url, repo_dir])
        sys.path.append(os.path.abspath(repo_dir))

# Clone repositories before importing
clone_repositories()
import streamlit as st
import torch
import librosa
import matplotlib.pyplot as plt
from PIL import Image
import torchaudio
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler, StableDiffusionImg2ImgPipeline
import stanza
import numpy as np
from IndicTransToolkit import IndicProcessor
class TransGen:
    def __init__(
        self,
        translation_model="ai4bharat/indictrans2-indic-en-1B",
        stable_diff_model="stabilityai/stable-diffusion-2-base",
        src_lang='hin_Deva',
        tgt_lang='eng_Latn'
    ):
        # Load the IndicTrans2 translation model in 4-bit to reduce GPU memory usage.
        self.bnb_config = BitsAndBytesConfig(load_in_4bit=True)
        self.tokenizer = AutoTokenizer.from_pretrained(translation_model, trust_remote_code=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            translation_model, trust_remote_code=True, quantization_config=self.bnb_config
        )
        self.ip = IndicProcessor(inference=True)
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Text-to-image pipeline for the first frame, plus an img2img pipeline that
        # refines the previous image as the transcript grows.
        scheduler = EulerDiscreteScheduler.from_pretrained(stable_diff_model, subfolder="scheduler")
        self.pipe = StableDiffusionPipeline.from_pretrained(stable_diff_model, scheduler=scheduler, torch_dtype=torch.bfloat16)
        self.pipe = self.pipe.to("cuda")
        self.img2img_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(stable_diff_model, torch_dtype=torch.float16)
        self.img2img_pipe = self.img2img_pipe.to('cuda')
    def translate(self, input_sentences):
        batch = self.ip.preprocess_batch(
            input_sentences,
            src_lang=self.src_lang,
            tgt_lang=self.tgt_lang,
        )
        inputs = self.tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(self.model.device)  # keep inputs on the same device as the quantized model
        with torch.no_grad():
            generated_tokens = self.model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )
        with self.tokenizer.as_target_tokenizer():
            generated_tokens = self.tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        translations = self.ip.postprocess_batch(generated_tokens, lang=self.tgt_lang)
        return translations
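
    # Usage sketch (illustrative only, not part of the app's flow): translate() takes a
    # list of source-language sentences and returns their translations, e.g.
    #     transgen.translate(["यह एक परीक्षण वाक्य है"])  ->  ["This is a test sentence."]
    # The exact output text depends on the model.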
    def generate_image(self, prompt, prev_image, strength=1.0, guidance_scale=7.5):
        # Clamp user-provided values to the ranges the diffusers pipelines expect.
        strength = float(strength) if strength is not None else 1.0
        guidance_scale = float(guidance_scale) if guidance_scale is not None else 7.5
        strength = max(0.0, min(1.0, strength))
        if prev_image is not None:
            # Refine the previous image with img2img so successive frames stay coherent.
            image = self.img2img_pipe(
                prompt,
                image=prev_image,
                strength=strength,
                guidance_scale=guidance_scale,
                negative_prompt='generate text in image'
            ).images[0]
            return image
        # No previous image yet: generate the first frame from the prompt alone.
        return self.pipe(prompt).images[0]
    def run(self, input_sentences, strength, guidance_scale, prev_image=None):
        translations = self.translate(input_sentences)
        sentence = translations[0]
        image = self.generate_image(sentence, prev_image, strength, guidance_scale)
        return sentence, image
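
# Optional sketch (not wired into the current flow): Streamlit reruns the script on every
# interaction, so the heavyweight TransGen object could be cached once per session with
# st.cache_resource instead of being rebuilt for each uploaded file. The helper name below
# is an illustration, not part of the original app.
@st.cache_resource
def load_transgen():
    return TransGen()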
def transcribe_audio_to_hindi(audio_path: str) -> str:
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    whisper_pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"language": "hi"}  # force Hindi decoding (a generation kwarg, not a model kwarg)
    )
    # Whisper expects 16 kHz audio, so resample if the file uses a different rate.
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    result = whisper_pipe(waveform.squeeze(0).cpu().numpy(), return_timestamps=True)
    return result["text"]
# Download Stanza resources
stanza.download('hi')
nlp = stanza.Pipeline(lang='hi', processors='tokenize,pos')
def POS_policy(input_text):
    # Return the index of the last NOUN/VERB in the final sentence (0 if none is found);
    # this index is used as the trigger for generating a new image.
    doc = nlp(input_text)
    words = doc.sentences[-1].words
    n = len(words)
    i = n - 1
    while i >= 0:
        if words[i].upos in ['NOUN', 'VERB']:
            return i
        i -= 1
    return 0
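
# Illustrative sketch of how the policy is consumed (the example sentence is arbitrary and
# the returned index depends entirely on Stanza's POS tags for the input):
#
#     idx = POS_policy("लड़का बाज़ार गया")
#     if idx != 0:
#         # a trailing content word was found, so the partial sentence is worth rendering
#         ...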
def generate_images_from_audio(audio_path, base_strength=0.8, base_guidance_scale=12):
    text_tot = transcribe_audio_to_hindi(audio_path)
    st.write(f'Transcribed sentence: {text_tot}')
    cur_sent = ''
    prev_idx = 0
    generated_images = []
    prev_image = None
    transgen = TransGen()
    # Grow the sentence word by word; whenever the POS policy finds a new trailing
    # NOUN/VERB, translate the partial sentence and render (or refine) an image.
    for word in text_tot.split():
        cur_sent += word + ' '
        str_idx = POS_policy(cur_sent)
        if str_idx != 0 and str_idx != prev_idx:
            prev_idx = str_idx
            sent, prev_image = transgen.run(
                [cur_sent],
                base_strength,
                base_guidance_scale,
                prev_image
            )
            generated_images.append({
                'sentence': cur_sent,
                'image': prev_image
            })
    return generated_images
def main():
    st.title("Audio to Image Generation App")
    # File uploader
    uploaded_file = st.file_uploader("Choose a WAV audio file", type="wav")
    # Strength and Guidance Scale sliders
    base_strength = st.slider("Image Generation Strength", min_value=0.0, max_value=1.0, value=0.8, step=0.1)
    base_guidance_scale = st.slider("Guidance Scale", min_value=1.0, max_value=20.0, value=12.0, step=0.5)
    if uploaded_file is not None:
        # Save the uploaded file temporarily
        with open("temp_audio.wav", "wb") as f:
            f.write(uploaded_file.getvalue())
        # Generate images
        st.write("Generating Images...")
        generated_images = generate_images_from_audio("temp_audio.wav", base_strength, base_guidance_scale)
        # Display generated images
        st.write("Generated Images:")
        for img_data in generated_images:
            st.image(img_data['image'], caption=img_data['sentence'])
if __name__ == "__main__":
    main()