# Source: Hugging Face Space file "app.py" by divakaivan
# (commit abc0ae6, "Update app.py", verified, 2.73 kB).
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import gradio as gr
import torch
import numpy as np
from datasets import load_dataset, Audio
from transformers import pipeline
import librosa
from openai import OpenAI
# Speech-to-text: pipeline built from the "divakaivan/glaswegian-asr"
# checkpoint (no task is passed explicitly).
asr_pipe = pipeline(model="divakaivan/glaswegian-asr")

# Text-to-speech: text processor, acoustic model and vocoder.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Reference audio used to derive the speaker embedding: the "audio" column
# is cast to 16 kHz and only the "train" split is kept.
dataset = load_dataset("divakaivan/glaswegian_audio").cast_column(
    "audio", Audio(sampling_rate=16000)
)["train"]
def transcribe(audio):
    """Run the ASR pipeline on ``audio`` and return the recognised text.

    Args:
        audio: Input forwarded unchanged to ``asr_pipe``.

    Returns:
        The value stored under the ``"text"`` key of the pipeline output.
    """
    recognition = asr_pipe(audio)
    return recognition["text"]
def generate_response(text, api_key, model='gpt-3.5-turbo-0125'):
    """Send ``text`` as a single user message to the OpenAI chat API.

    Args:
        text: Prompt placed in the ``content`` field of one ``user`` message.
        api_key: OpenAI API key used to construct the client for this call.
        model: Chat model identifier. Defaults to the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The content of the first returned choice as a string. A missing
        (``None``) content is normalised to ``""`` so callers always get
        a ``str``.
    """
    client = OpenAI(api_key=api_key)
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": text}],
    )
    # The message content is optional in the API response; guard against
    # ``None`` instead of handing it on to the speech synthesiser.
    return completion.choices[0].message.content or ""
# Speaker-embedding tensor for the reference clip; filled in on first use
# so the embedding is not recomputed for every synthesis request.
_SPEAKER_EMBEDDINGS = None


def _reference_speaker_embeddings():
    """Return the cached speaker embedding of ``dataset[0]`` with a leading batch axis."""
    global _SPEAKER_EMBEDDINGS
    if _SPEAKER_EMBEDDINGS is None:
        embedding = create_speaker_embedding(dataset[0]["audio"]["array"])
        # ``as_tensor`` + ``unsqueeze`` adds the same leading axis as
        # ``torch.tensor([embedding])`` without the slow list-of-arrays path.
        _SPEAKER_EMBEDDINGS = torch.as_tensor(embedding).unsqueeze(0)
    return _SPEAKER_EMBEDDINGS


def synthesize_speech(text):
    """Convert ``text`` to speech using the SpeechT5 model and HiFi-GAN vocoder.

    Args:
        text: Text to be spoken.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000
        and ``samples`` is a NumPy ``int16`` array, the form accepted by
        a ``gr.Audio(type="numpy")`` output.
    """
    inputs = processor(text=text, return_tensors="pt")
    # Keep the token sequence within the model's configured text position
    # limit; longer inputs are truncated rather than passed through.
    # NOTE(review): assumes over-long inputs would otherwise fail inside
    # the model -- confirm against the SpeechT5 implementation.
    input_ids = inputs["input_ids"][..., : tts_model.config.max_text_positions]

    with torch.no_grad():
        spectrogram = tts_model.generate_speech(
            input_ids, _reference_speaker_embeddings()
        )
        speech = vocoder(spectrogram)

    # Clip before scaling so out-of-range samples saturate instead of
    # wrapping around when cast to int16.
    samples = np.clip(speech.numpy(), -1.0, 1.0)
    return (16000, (samples * 32767).astype(np.int16))
# SpeechBrain speaker encoder; loaded on first use and reused afterwards so
# the model is not re-instantiated on every call.
_SPEAKER_MODEL = None


def _load_speaker_model():
    """Return the shared ``EncoderClassifier``, loading it on the first call."""
    global _SPEAKER_MODEL
    if _SPEAKER_MODEL is None:
        import os
        from speechbrain.inference.speaker import EncoderClassifier

        spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
        device = "cuda" if torch.cuda.is_available() else "cpu"
        _SPEAKER_MODEL = EncoderClassifier.from_hparams(
            source=spk_model_name,
            run_opts={"device": device},
            savedir=os.path.join("/tmp", spk_model_name),
        )
    return _SPEAKER_MODEL


def create_speaker_embedding(waveform):
    """Compute a normalised speaker embedding for ``waveform``.

    Args:
        waveform: Audio samples accepted by ``torch.tensor`` (the caller in
            this file passes the ``"array"`` field of a dataset audio entry).

    Returns:
        A NumPy array holding the embedding, L2-normalised along dim 2 of
        the encoder output and then squeezed.
    """
    speaker_model = _load_speaker_model()
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings
def voice_assistant(audio, api_key):
    """Answer a spoken question with synthesized speech.

    The audio is transcribed, the transcript is sent to the chat model, and
    the model's reply is converted back to speech.

    Args:
        audio: Audio input forwarded to ``transcribe``; ``None`` when the
            user submitted without providing audio.
        api_key: OpenAI API key entered by the user.

    Returns:
        The value returned by ``synthesize_speech`` for the reply text.

    Raises:
        gr.Error: If no audio was provided or the API key is blank, so the
            UI shows a readable message instead of a downstream failure.
    """
    if audio is None:
        raise gr.Error("Please record or upload some audio first.")
    if not api_key or not api_key.strip():
        raise gr.Error("Please enter your OpenAI API key.")

    transcribed_text = transcribe(audio)
    # Strip stray whitespace that commonly comes along with a pasted key.
    response_text = generate_response(transcribed_text, api_key.strip())
    return synthesize_speech(response_text)
# UI components, created in the positional order ``voice_assistant`` expects
# its arguments (audio first, then the API key), followed by the output.
_audio_input = gr.Audio(type="filepath")
_api_key_input = gr.Textbox(label="OpenAI API Key", type="password")
_speech_output = gr.Audio(label="Response Speech", type="numpy")

# Wire the components to the assistant function and start the app.
iface = gr.Interface(
    title="Your Glaswegian Assistant",
    fn=voice_assistant,
    inputs=[_audio_input, _api_key_input],
    outputs=_speech_output,
)
iface.launch()