import transformers
import gradio as gr
import librosa
import torch
import numpy as np
import spaces
from typing import Tuple  

@spaces.GPU(duration=120)
def transcribe_and_respond(audio_input: Tuple[np.ndarray, int]) -> str:
    """Run the Shuka v1 audio-language model on microphone audio and return its reply.

    Parameters
    ----------
    audio_input:
        ``(samples, sample_rate)`` tuple as produced by ``gr.Audio(type="numpy")``.
        ``samples`` may be int16 PCM and mono ``(n,)`` or stereo ``(n, 2)``.

    Returns
    -------
    str
        The model's text response, or an error message if processing failed.
    """
    try:
        # Gradio's live mode fires the callback before any audio exists;
        # return quietly instead of crashing on the unpack below.
        if audio_input is None:
            return ""

        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16,
        )

        # Unpack the audio input
        audio, sr = audio_input

        # Downmix stereo (n, channels) capture to mono — librosa.resample
        # and the model expect a 1-D signal.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Normalize integer PCM (gr.Audio yields int16) into [-1, 1] floats;
        # a bare astype would keep raw +/-32768 amplitudes.
        if np.issubdtype(audio.dtype, np.integer):
            audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
        elif audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        # Resample if necessary — the model expects 16 kHz input.
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        # Define conversation turns
        turns = [
            {'role': 'system', 'content': 'Respond naturally and informatively.'},
            {'role': 'user', 'content': ''}
        ]

        # Run the pipeline with the audio and conversation turns
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': 16000}, max_new_tokens=512)

        # The pipeline may return a structured object; coerce to str so the
        # Gradio "text" output (and our -> str annotation) always holds.
        return output if isinstance(output, str) else str(output)

    except Exception as e:
        # Top-level boundary for the Gradio callback: surface the failure
        # in the UI rather than killing the app.
        return f"Error processing audio: {str(e)}"

# App wiring: a microphone capture feeds the model callback, whose text
# reply is rendered directly. live=True re-runs the callback as audio
# streams in rather than waiting for a submit click.
_DESCRIPTION = "Speak into your microphone, and the model will respond naturally and informatively."

_mic_input = gr.Audio(sources="microphone", type="numpy")

iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=_mic_input,
    outputs="text",
    title="Live Transcription and Response",
    description=_DESCRIPTION,
    live=True,
)

if __name__ == "__main__":
    iface.launch()