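"""Streamlit app: spoken Hindi -> English text -> a sequence of images.

Pipeline: Whisper (openai/whisper-large-v3) transcribes the uploaded audio,
IndicTrans2 translates the growing Hindi transcript to English, and Stable
Diffusion renders a new image whenever a Stanza POS tagger detects a new
trailing noun or verb in the sentence.
"""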
import os
import subprocess
import sys

# Clone required repositories and put them on the import path
def clone_repositories():
    repos = [
        ('https://github.com/AI4Bharat/IndicTrans2.git', 'indictrans2'),
        ('https://github.com/VarunGumma/IndicTransToolkit.git', 'indictranstoolkit')
    ]
    for repo_url, repo_dir in repos:
        if not os.path.exists(repo_dir):
            subprocess.check_call(['git', 'clone', repo_url, repo_dir])
        sys.path.append(os.path.abspath(repo_dir))

# Clone repositories before importing from them
clone_repositories()
import streamlit as st
import torch
import torchaudio
import stanza
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler, StableDiffusionImg2ImgPipeline
from IndicTransToolkit import IndicProcessor
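
# TransGen bundles the two generation stages: IndicTrans2 (loaded in 4-bit via
# bitsandbytes) for Hindi -> English translation, and Stable Diffusion for
# image generation. A text2img pipeline produces the first frame; an img2img
# pipeline refines each subsequent frame from the previous image so
# consecutive outputs stay visually coherent.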
class TransGen:
    def __init__(
        self,
        translation_model="ai4bharat/indictrans2-indic-en-1B",
        stable_diff_model="stabilityai/stable-diffusion-2-base",
        src_lang='hin_Deva',
        tgt_lang='eng_Latn'
    ):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        dtype = torch.float16 if device == "cuda" else torch.float32
        # Load the translation model in 4-bit to reduce GPU memory usage
        self.bnb_config = BitsAndBytesConfig(load_in_4bit=True)
        self.tokenizer = AutoTokenizer.from_pretrained(translation_model, trust_remote_code=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            translation_model, trust_remote_code=True, quantization_config=self.bnb_config
        )
        self.ip = IndicProcessor(inference=True)
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        scheduler = EulerDiscreteScheduler.from_pretrained(stable_diff_model, subfolder="scheduler")
        # Use one dtype for both pipelines so frames pass cleanly between them
        self.pipe = StableDiffusionPipeline.from_pretrained(
            stable_diff_model, scheduler=scheduler, torch_dtype=dtype
        )
        self.pipe = self.pipe.to(device)
        self.img2img_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(stable_diff_model, torch_dtype=dtype)
        self.img2img_pipe = self.img2img_pipe.to(device)
    def translate(self, input_sentences):
        """Translate a batch of Hindi sentences to English via IndicTrans2."""
        batch = self.ip.preprocess_batch(
            input_sentences,
            src_lang=self.src_lang,
            tgt_lang=self.tgt_lang,
        )
        inputs = self.tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(self.model.device)  # keep tensors on the same device as the model
        with torch.no_grad():
            generated_tokens = self.model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )
        # Decode with the target-language tokenizer settings
        with self.tokenizer.as_target_tokenizer():
            generated_tokens = self.tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        translations = self.ip.postprocess_batch(generated_tokens, lang=self.tgt_lang)
        return translations
    def generate_image(self, prompt, prev_image, strength=1.0, guidance_scale=7.5):
        strength = float(strength) if strength is not None else 1.0
        guidance_scale = float(guidance_scale) if guidance_scale is not None else 7.5
        strength = max(0.0, min(1.0, strength))  # img2img strength must lie in [0, 1]
        if prev_image is not None:
            # Refine the previous frame so consecutive images stay coherent
            image = self.img2img_pipe(
                prompt,
                image=prev_image,
                strength=strength,
                guidance_scale=guidance_scale,
                negative_prompt='generate text in image'
            ).images[0]
            return image
        # No previous frame yet: generate the first image from scratch
        result = self.pipe(prompt)
        return result.images[0]

    def run(self, input_sentences, strength, guidance_scale, prev_image=None):
        translations = self.translate(input_sentences)
        sentence = translations[0]
        image = self.generate_image(sentence, prev_image, strength, guidance_scale)
        return sentence, image
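
# Speech-to-text stage: transcribe a WAV file to Hindi text with
# openai/whisper-large-v3 through the transformers ASR pipeline.
# Loading the model inside the function is simple but slow; a long-running
# app could cache it with st.cache_resource instead.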
def transcribe_audio_to_hindi(audio_path: str) -> str:
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    whisper_pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )
    # Whisper expects mono 16 kHz input: downmix channels and resample if needed
    waveform, sample_rate = torchaudio.load(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    # Force Hindi decoding; "language" belongs in generate_kwargs, not model_kwargs
    result = whisper_pipe(
        waveform.squeeze(0).cpu().numpy(),
        return_timestamps=True,
        generate_kwargs={"language": "hi", "task": "transcribe"},
    )
    return result["text"]
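
# Segmentation policy: a Stanza POS tagger decides when the growing transcript
# has accumulated enough new content (a trailing noun or verb) to be worth
# rendering as a new image.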
# Download Stanza resources for Hindi POS tagging
stanza.download('hi')
nlp = stanza.Pipeline(lang='hi', processors='tokenize,pos')

def POS_policy(input_text):
    """Return the index of the last noun or verb in the final sentence of
    input_text, or 0 if none is found."""
    doc = nlp(input_text)
    words = doc.sentences[-1].words
    i = len(words) - 1
    while i >= 0:
        if words[i].upos in ['NOUN', 'VERB']:
            return i
        i -= 1
    return 0
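
# End-to-end driver: transcribe the audio, then walk the transcript word by
# word, emitting a new image whenever the POS policy finds a new trailing
# noun/verb. Each image is generated from the previous one (img2img) so the
# sequence evolves rather than restarting.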
def generate_images_from_audio(audio_path, base_strength=0.8, base_guidance_scale=12):
    text_tot = transcribe_audio_to_hindi(audio_path)
    st.write(f'Transcribed sentence: {text_tot}')
    cur_sent = ''
    prev_idx = 0
    prev_image = None
    generated_images = []
    transgen = TransGen()
    for word in text_tot.split():
        cur_sent += word + ' '
        str_idx = POS_policy(cur_sent)
        # Only regenerate when the last noun/verb position has moved
        if str_idx != 0 and str_idx != prev_idx:
            prev_idx = str_idx
            sent, image = transgen.run(
                [cur_sent],
                base_strength,
                base_guidance_scale,
                prev_image
            )
            prev_image = image  # feed this frame into the next img2img step
            generated_images.append({
                'sentence': cur_sent,
                'image': image
            })
    return generated_images
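
# Streamlit UI: upload a WAV file, tune strength and guidance scale, then
# display the incrementally generated images with their partial sentences.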
def main():
    st.title("Audio to Image Generation App")
    # File uploader
    uploaded_file = st.file_uploader("Choose a WAV audio file", type="wav")
    # Strength and guidance-scale sliders
    base_strength = st.slider("Image Generation Strength", min_value=0.0, max_value=1.0, value=0.8, step=0.1)
    base_guidance_scale = st.slider("Guidance Scale", min_value=1.0, max_value=20.0, value=12.0, step=0.5)
    if uploaded_file is not None:
        # Save the uploaded file temporarily
        with open("temp_audio.wav", "wb") as f:
            f.write(uploaded_file.getvalue())
        # Generate images
        st.write("Generating Images...")
        generated_images = generate_images_from_audio("temp_audio.wav", base_strength, base_guidance_scale)
        # Display generated images
        st.write("Generated Images:")
        for img_data in generated_images:
            st.image(img_data['image'], caption=img_data['sentence'])

if __name__ == "__main__":
    main()