# Spaces: Build error  (page-status text captured along with the source; not part of the program)
import gradio as gr | |
import time | |
import whisper | |
import torch | |
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler | |
# Whisper model shared by every SpeechToText call; created on first use.
_WHISPER_MODEL = None


def _get_whisper_model():
    """Return the shared Whisper "base" model, loading it on the first call."""
    global _WHISPER_MODEL
    if _WHISPER_MODEL is None:
        _WHISPER_MODEL = whisper.load_model("base")
    return _WHISPER_MODEL


def SpeechToText(audio):
    """Transcribe an audio recording to text with Whisper.

    Args:
        audio: Path of the audio file to transcribe, or None when nothing
            was recorded.

    Returns:
        The decoded text, or "" when ``audio`` is None.
    """
    if audio is None:
        return ""

    # Reuse one model instance instead of reloading the weights per request.
    model = _get_whisper_model()

    # Load the waveform and pad/trim it to the length Whisper works on.
    samples = whisper.pad_or_trim(whisper.load_audio(audio))

    # Make the log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(samples).to(model.device)

    # No language is set in the options, so decoding determines it itself;
    # a separate detect_language pass whose result is never read is not needed.
    options = whisper.DecodingOptions(fp16=False)
    return whisper.decode(model, mel, options).text
# Stable Diffusion pipeline shared by every img_Generation call; built lazily.
_SD_PIPELINE = None


def _get_sd_pipeline():
    """Build the Stable Diffusion 2 pipeline once and return the cached instance."""
    global _SD_PIPELINE
    if _SD_PIPELINE is None:
        model_id = "stabilityai/stable-diffusion-2"
        # Use the Euler scheduler instead of the checkpoint's default one.
        scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
        if torch.cuda.is_available():
            # Half-precision weights on the GPU, as in the original setup.
            # NOTE(review): newer diffusers releases prefer variant="fp16"
            # over revision="fp16" - confirm against the installed version.
            pipe = StableDiffusionPipeline.from_pretrained(
                model_id,
                scheduler=scheduler,
                revision="fp16",
                torch_dtype=torch.float16,
            )
            pipe = pipe.to("cuda")
        else:
            # No GPU: fall back to the default-precision weights on the CPU
            # instead of failing on .to("cuda").
            pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler)
        _SD_PIPELINE = pipe
    return _SD_PIPELINE


def img_Generation(text, num_inference_steps=80):
    """Generate one image from a text prompt with Stable Diffusion 2.

    Args:
        text: Prompt passed to the pipeline.
        num_inference_steps: Number of denoising steps passed to the
            pipeline; defaults to 80.

    Returns:
        The first image produced by the pipeline for the prompt.
    """
    print(text)
    # The pipeline is created on the first request and reused afterwards, so
    # the weights are not reloaded for every prompt.
    pipe = _get_sd_pipeline()
    return pipe(text, num_inference_steps=num_inference_steps).images[0]
def transcribe(audio):
    """Turn a recorded utterance into a generated image.

    The recording is transcribed with SpeechToText and the transcript is
    used as the prompt for img_Generation.

    Args:
        audio: Path of the recorded audio file, or None when nothing was
            recorded.

    Returns:
        The generated image, or None when there is no recording or the
        transcript is empty (so no image is generated from an empty prompt).
    """
    if audio is None:
        return None

    text = SpeechToText(audio)
    if not text.strip():
        # Nothing intelligible was said; skip the expensive diffusion run.
        return None

    return img_Generation(text)
# Gradio UI: a microphone recording goes in, the generated image comes out.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="image",
    title="Whisper2IMG",
    description="A Speech to Image Generation App Using OpenAI's Whisper and Stable Diffusion V.2",
)
demo.launch()