archit11 committed
Commit 8b70c99 · verified · 1 Parent(s): 8146e90

Update app.py

Files changed (1)
  1. app.py +26 -14
app.py CHANGED
@@ -2,36 +2,48 @@ import transformers
 import gradio as gr
 import librosa
 import torch
-import spaces
+import numpy as np
+
 
 @spaces.GPU(duration=120)
-def transcribe_and_respond(audio_file):
+def transcribe_and_respond(audio_input: Tuple[np.ndarray, int]) -> str:
     try:
         pipe = transformers.pipeline(
-            model='sarvamai/shuka_v1',
-            trust_remote_code=True,
-            device=0,
-            torch_dtype=torch.bfloat16
-        )
-
-        audio, sr = librosa.load(audio_file, sr=16000)
-
+            model='sarvamai/shuka_v1',
+            trust_remote_code=True,
+            device=0,
+            torch_dtype=torch.bfloat16
+        )
+        # Unpack the audio input
+        audio, sr = audio_input
+
+        # Ensure audio is float32
+        if audio.dtype != np.float32:
+            audio = audio.astype(np.float32)
+
+        # Resample if necessary
+        if sr != 16000:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+
+        # Define conversation turns
         turns = [
             {'role': 'system', 'content': 'Respond naturally and informatively.'},
             {'role': 'user', 'content': ''}
         ]
 
-        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
+        # Run the pipeline with the audio and conversation turns
+        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': 16000}, max_new_tokens=512)
 
+        # Return the model's response
         return output
 
     except Exception as e:
-        return f"Error: {str(e)}"
+        return f"Error processing audio: {str(e)}"
 
 iface = gr.Interface(
     fn=transcribe_and_respond,
-    inputs=gr.Audio(sources="microphone", type="filepath"),  # Accept audio input from microphone
-    outputs="text",  # Output as text
+    inputs=gr.Audio(source="microphone", type="numpy"),
+    outputs="text",
     title="Live Transcription and Response",
     description="Speak into your microphone, and the model will respond naturally and informatively.",
     live=True  # Enable live processing
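A note on the committed version: as written it should fail at import time, since `import spaces` was removed while the `@spaces.GPU` decorator remains, and `Tuple` is used in the new signature without `from typing import Tuple`. Two further points worth checking: with `type="numpy"`, Gradio passes the callback a `(sample_rate, data)` tuple (sample rate first, int16 samples), so `audio, sr = audio_input` swaps the two and the plain float32 cast leaves values unscaled; and `gr.Audio(source=...)` matches Gradio 3.x, while 4.x renamed the parameter to `sources=[...]`. Below is a minimal corrected sketch under those assumptions (Gradio 4.x, the Hugging Face `spaces` package for ZeroGPU); the trailing `iface.launch()` is assumed, since the hunk ends before it and is not part of the commit.

# A corrected sketch of app.py; not the committed code. Assumes Gradio 4.x
# (`sources=[...]`) and the Hugging Face `spaces` package for ZeroGPU.
from typing import Tuple

import gradio as gr
import librosa
import numpy as np
import spaces  # restored: required by the @spaces.GPU decorator below
import torch
import transformers


@spaces.GPU(duration=120)
def transcribe_and_respond(audio_input: Tuple[int, np.ndarray]) -> str:
    try:
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16,
        )

        # Gradio's "numpy" audio format is (sample_rate, data), not (data, sr).
        sr, audio = audio_input

        # Gradio delivers int16 PCM; scale to float32 in [-1.0, 1.0].
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32) / 32768.0

        # Downmix stereo to mono, then resample to the model's 16 kHz.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        turns = [
            {'role': 'system', 'content': 'Respond naturally and informatively.'},
            {'role': 'user', 'content': ''},
        ]

        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': 16000},
                      max_new_tokens=512)
        return output

    except Exception as e:
        return f"Error processing audio: {str(e)}"


iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs="text",
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True,  # Enable live processing
)

iface.launch()

Scaling the int16 samples into [-1.0, 1.0] matters because that matches what `librosa.load` produced in the previous revision of this function; without it the model would see waveform amplitudes in the tens of thousands.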