Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,586 Bytes
2ed7223 dc03737 c621812 8b70c99 039f770 f079dda 62dda31 dc03737 8b70c99 dc03737 8b70c99 dc03737 c621812 8b70c99 dc03737 8b70c99 dc03737 2ed7223 ab07d9e 8b70c99 2ed7223 e8c2661 8b70c99 dc03737 c621812 dc03737 2ed7223 c621812 05dddc6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import transformers
import gradio as gr
import librosa
import torch
import numpy as np
import spaces
from typing import Tuple
def _prepare_audio(sample_rate: int, samples: np.ndarray, target_sr: int = 16000) -> np.ndarray:
    """Return ``samples`` as mono float32 audio resampled to ``target_sr`` Hz."""
    if np.issubdtype(samples.dtype, np.signedinteger):
        # Signed integer PCM (e.g. int16 from a microphone) must be scaled into
        # [-1, 1); a bare astype() would keep the raw integer magnitudes.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / np.float32(full_scale)
    elif samples.dtype != np.float32:
        samples = samples.astype(np.float32)

    # Multi-channel recordings arrive as (samples, channels). Downmix before
    # resampling, because librosa.resample operates on the last axis.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if sample_rate != target_sr:
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
    return samples


@spaces.GPU(duration=120)
def transcribe_and_respond(audio_input: Tuple[int, np.ndarray]) -> str:
    """Run the Shuka audio-language pipeline on one recording.

    Args:
        audio_input: The value produced by ``gr.Audio(type="numpy")``, i.e. a
            ``(sample_rate, samples)`` tuple. The reversed order
            ``(samples, sample_rate)`` is also accepted for backward
            compatibility. May be ``None`` when no recording is available.

    Returns:
        The pipeline output, or a human-readable message when there is no
        audio or processing fails.
    """
    if audio_input is None:
        return "No audio received. Please record something first."

    try:
        # Gradio passes (sample_rate, samples); tolerate the reversed order too.
        first, second = audio_input
        if isinstance(first, np.ndarray):
            samples, sample_rate = first, second
        else:
            sample_rate, samples = first, second

        samples = np.asarray(samples)
        if samples.size == 0:
            return "No audio received. Please record something first."

        audio = _prepare_audio(int(sample_rate), samples)

        # NOTE(review): the pipeline is rebuilt on every call, which is slow;
        # consider loading it once at start-up if the hosting setup allows it.
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16,
        )

        # The user turn carries the audio placeholder shown in the model's
        # usage example; the pipeline substitutes the recording for it.
        turns = [
            {'role': 'system', 'content': 'Respond naturally and informatively.'},
            {'role': 'user', 'content': '<|audio|>'},
        ]

        output = pipe(
            {'audio': audio, 'turns': turns, 'sampling_rate': 16000},
            max_new_tokens=512,
        )
        return output
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"Error processing audio: {str(e)}"
# Input component: microphone capture delivered to the callback in NumPy form.
microphone_input = gr.Audio(sources="microphone", type="numpy")

# Single-function UI: recorded audio in, model text out.
iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=microphone_input,
    outputs="text",
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True,  # Enable live processing
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|