Safwanahmad619 committed on
Commit 9e5a4f6 · verified · 1 Parent(s): b6d33b5

Update app.py

Files changed (1)
  1. app.py +244 -100
app.py CHANGED
@@ -53,117 +53,261 @@
  # live=True
  # )

- # iface.launch()
- import os
  import gradio as gr
  import whisper
  from gtts import gTTS
- from anthropic import Anthropic  # Import the Anthropic client
- import io  # Import io for BytesIO
-
- # Get the Anthropic API key from environment variables
- ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
- if not ANTHROPIC_API_KEY:
-     raise ValueError("ANTHROPIC_API_KEY environment variable is not set.")
- client = Anthropic(api_key=ANTHROPIC_API_KEY)  # Initialize the Anthropic client
-
- # Load Whisper model
- model = whisper.load_model("base")  # You can also use "small", "medium", "large"
-
- def chatbot(audio=None):
-     try:
-         if audio is None:
-             return "No input detected. Please provide an audio input.", None
-
-         # Transcribe the audio input using Whisper
-         transcription = model.transcribe(audio)
-         user_input = transcription.get("text", "")
-
-         # Generate a response using Anthropic API
-         chat_completion = client.completions.create(
-             model="claude-v1",  # Specify the model
-             prompt=user_input,  # Provide the user input as the prompt
-             max_tokens_to_sample=100,  # Specify the maximum tokens to sample
-         )
-         response_text = chat_completion['completion']
-
-         # Convert the response text to speech using gTTS
-         tts = gTTS(text=response_text, lang='en')
-         response_audio_io = io.BytesIO()  # Create a BytesIO object
-         tts.save(response_audio_io)  # Save the audio to the BytesIO object
-         response_audio_io.seek(0)  # Rewind the BytesIO object
-
-         return response_text, response_audio_io
-
      except Exception as e:
-         return f"An error occurred: {e}", None
-
- def clear_inputs():
-     return None, None, None
-
- # Create a custom interface
- def build_interface():
-     with gr.Blocks(css="""
-         .block-title {
-             text-align: center;
-             color: white;
-             background-color: #4CAF50;
-             padding: 10px;
-             border-radius: 8px;
-         }
-         .gradio-row {
-             background-color: #f9f9f9;
-             border-radius: 8px;
-             padding: 20px;
-             margin: 10px;
-             box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.1);
-         }
-         .gradio-column {
-             padding: 10px;
-         }
-         .gradio-button {
-             background-color: #ff6347 !important;
-             color: white !important;
-             border-radius: 8px !important;
-             padding: 10px 20px !important;
-             font-size: 16px !important;
-             border: none !important;
-             cursor: pointer !important;
-             box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.2) !important;
-             transition: background-color 0.3s ease !important;
-         }
-         .gradio-button:hover {
-             background-color: #e5533d !important;
-         }
-     """) as demo:
-         gr.Markdown(
-             """
-             <h1 class="block-title">Voice-to-Voice AI Chatbot</h1>
-             """
-         )
-         with gr.Row(elem_classes="gradio-row"):
-             with gr.Column(elem_classes="gradio-column", scale=1):
-                 audio_input = gr.Audio(type="filepath", label="Record Your Voice")
-             with gr.Column(elem_classes="gradio-column", scale=2):
-                 chatbot_output_text = gr.Textbox(label="Chatbot Response")
-                 chatbot_output_audio = gr.Audio(label="Audio Response")
-
-         clear_button = gr.Button("Clear", elem_classes="gradio-button")
-
-         clear_button.click(
-             fn=clear_inputs,
-             outputs=[audio_input, chatbot_output_text, chatbot_output_audio]
-         )
-
-         audio_input.change(
-             fn=chatbot,
-             inputs=[audio_input],
-             outputs=[chatbot_output_text, chatbot_output_audio]
          )
-
-     return demo
-
- # Launch the interface
- if __name__ == "__main__":
-     interface = build_interface()
-     interface.launch()
 
  # live=True
  # )

+ # # iface.launch()
+ # import os
+ # import gradio as gr
+ # import whisper
+ # from gtts import gTTS
+ # from anthropic import Anthropic  # Import the Anthropic client
+ # import io  # Import io for BytesIO
+
+ # # Get the Anthropic API key from environment variables
+ # ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
+ # if not ANTHROPIC_API_KEY:
+ #     raise ValueError("ANTHROPIC_API_KEY environment variable is not set.")
+ # client = Anthropic(api_key=ANTHROPIC_API_KEY)  # Initialize the Anthropic client
+
+ # # Load Whisper model
+ # model = whisper.load_model("base")  # You can also use "small", "medium", "large"
+
+ # def chatbot(audio=None):
+ #     try:
+ #         if audio is None:
+ #             return "No input detected. Please provide an audio input.", None
+
+ #         # Transcribe the audio input using Whisper
+ #         transcription = model.transcribe(audio)
+ #         user_input = transcription.get("text", "")
+
+ #         # Generate a response using Anthropic API
+ #         chat_completion = client.completions.create(
+ #             model="claude-v1",  # Specify the model
+ #             prompt=user_input,  # Provide the user input as the prompt
+ #             max_tokens_to_sample=100,  # Specify the maximum tokens to sample
+ #         )
+ #         response_text = chat_completion['completion']
+
+ #         # Convert the response text to speech using gTTS
+ #         tts = gTTS(text=response_text, lang='en')
+ #         response_audio_io = io.BytesIO()  # Create a BytesIO object
+ #         tts.save(response_audio_io)  # Save the audio to the BytesIO object
+ #         response_audio_io.seek(0)  # Rewind the BytesIO object
+
+ #         return response_text, response_audio_io
+
+ #     except Exception as e:
+ #         return f"An error occurred: {e}", None
+
+ # def clear_inputs():
+ #     return None, None, None
+
+ # # Create a custom interface
+ # def build_interface():
+ #     with gr.Blocks(css="""
+ #         .block-title {
+ #             text-align: center;
+ #             color: white;
+ #             background-color: #4CAF50;
+ #             padding: 10px;
+ #             border-radius: 8px;
+ #         }
+ #         .gradio-row {
+ #             background-color: #f9f9f9;
+ #             border-radius: 8px;
+ #             padding: 20px;
+ #             margin: 10px;
+ #             box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.1);
+ #         }
+ #         .gradio-column {
+ #             padding: 10px;
+ #         }
+ #         .gradio-button {
+ #             background-color: #ff6347 !important;
+ #             color: white !important;
+ #             border-radius: 8px !important;
+ #             padding: 10px 20px !important;
+ #             font-size: 16px !important;
+ #             border: none !important;
+ #             cursor: pointer !important;
+ #             box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.2) !important;
+ #             transition: background-color 0.3s ease !important;
+ #         }
+ #         .gradio-button:hover {
+ #             background-color: #e5533d !important;
+ #         }
+ #     """) as demo:
+ #         gr.Markdown(
+ #             """
+ #             <h1 class="block-title">Voice-to-Voice AI Chatbot</h1>
+ #             """
+ #         )
+ #         with gr.Row(elem_classes="gradio-row"):
+ #             with gr.Column(elem_classes="gradio-column", scale=1):
+ #                 audio_input = gr.Audio(type="filepath", label="Record Your Voice")
+ #             with gr.Column(elem_classes="gradio-column", scale=2):
+ #                 chatbot_output_text = gr.Textbox(label="Chatbot Response")
+ #                 chatbot_output_audio = gr.Audio(label="Audio Response")
+
+ #         clear_button = gr.Button("Clear", elem_classes="gradio-button")
+
+ #         clear_button.click(
+ #             fn=clear_inputs,
+ #             outputs=[audio_input, chatbot_output_text, chatbot_output_audio]
+ #         )
+
+ #         audio_input.change(
+ #             fn=chatbot,
+ #             inputs=[audio_input],
+ #             outputs=[chatbot_output_text, chatbot_output_audio]
+ #         )
+
+ #     return demo
+
+ # # Launch the interface
+ # if __name__ == "__main__":
+ #     interface = build_interface()
+ #     interface.launch()
+
  import gradio as gr
  import whisper
  from gtts import gTTS
+ from groq import Groq
+ import os
+ import numpy as np
+ import soundfile as sf
+ import logging
+
+ # Configure logging
+ logging.basicConfig(level=logging.DEBUG)
+
+ # Initialize the Groq API key from environment variables
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+
+ if not GROQ_API_KEY:
+     raise RuntimeError("GROQ_API_KEY environment variable not set.")
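A note on the key lookup above: `os.getenv` only sees variables already present in the process environment (on Hugging Face Spaces, a repository secret named `GROQ_API_KEY`). For local runs, one optional convenience, not part of this commit, is python-dotenv with a `.env` file:

```python
# Hypothetical local-dev addition (assumes `pip install python-dotenv` and a
# .env file containing GROQ_API_KEY=...); must run before the os.getenv() call.
from dotenv import load_dotenv

load_dotenv()  # copies entries from .env into os.environ
```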
 
+ # Initialize Whisper model (No API key required)
+ try:
+     whisper_model = whisper.load_model("base")
+     logging.info("Whisper model loaded successfully.")
+ except Exception as e:
+     raise RuntimeError(f"Error loading Whisper model: {e}")
+
+ # Initialize Groq client (API key required for Groq API)
+ try:
+     client = Groq(
+         api_key=GROQ_API_KEY  # Use the API key from the environment variable
+     )
+     logging.info("Groq client initialized successfully.")
+ except Exception as e:
+     raise RuntimeError(f"Error initializing Groq client: {e}")
+
+ # Function to transcribe audio using Whisper
+ def transcribe_audio(audio):
+     try:
+         # Load audio file with soundfile
+         logging.debug(f"Loading audio file: {audio}")
+         audio_data, sample_rate = sf.read(audio, dtype='float32')  # Ensure dtype is float32
+         logging.debug(f"Audio loaded with sample rate: {sample_rate}, data shape: {audio_data.shape}")
+
+         # Whisper expects a specific sample rate
+         if sample_rate != 16000:
+             logging.debug(f"Resampling audio from {sample_rate} to 16000 Hz")
+             # Resample audio data to 16000 Hz
+             num_samples = int(len(audio_data) * (16000 / sample_rate))
+             audio_data_resampled = np.interp(np.linspace(0, len(audio_data), num_samples),
+                                              np.arange(len(audio_data)),
+                                              audio_data)
+             audio_data = audio_data_resampled.astype(np.float32)  # Ensure dtype is float32
+             sample_rate = 16000
+
+         # Perform the transcription
+         result = whisper_model.transcribe(audio_data)
+         logging.debug(f"Transcription result: {result['text']}")
+         return result['text']
      except Exception as e:
+         logging.error(f"Error during transcription: {e}")
+         return f"Error during transcription: {e}"
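The resampling above uses `np.interp`, i.e. plain linear interpolation, and assumes a 1-D signal; `sf.read` returns a 2-D `(frames, channels)` array for stereo recordings, which would break both the interpolation and Whisper's input expectations. A hedged alternative sketch, assuming SciPy is available as an extra dependency (`to_whisper_input` is a hypothetical helper, not part of this commit):

```python
import numpy as np
from math import gcd
from scipy.signal import resample_poly

def to_whisper_input(audio_data, sample_rate, target_rate=16000):
    """Hypothetical helper: mono mixdown plus polyphase resampling for Whisper."""
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)  # average channels: stereo -> mono
    if sample_rate != target_rate:
        g = gcd(target_rate, sample_rate)  # reduce to an integer up/down ratio
        audio_data = resample_poly(audio_data, target_rate // g, sample_rate // g)
    return audio_data.astype(np.float32)  # Whisper expects float32 input
```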
+ # Function to get response from LLaMA model using Groq API
+ def get_response(text):
+     try:
+         logging.debug(f"Sending request to Groq API with text: {text}")
+         chat_completion = client.chat.completions.create(
+             messages=[
+                 {
+                     "role": "user",
+                     "content": text,  # Using the transcribed text as input
+                 }
+             ],
+             model="llama3-8b-8192",  # Ensure the correct model is used
          )
+
+         # Extract and return the model's response
+         response_text = chat_completion.choices[0].message.content
+         logging.debug(f"Received response from Groq API: {response_text}")
+         return response_text
+     except Exception as e:
+         logging.error(f"Error during model response generation: {e}")
+         return f"Error during model response generation: {e}"
+
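Because the Groq SDK follows the OpenAI-style chat schema, the request above can also carry a `system` message, which helps keep spoken replies short. A minimal sketch (the prompt wording is an assumption, not from this commit):

```python
# Sketch: add a system message to constrain reply length for voice output.
chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a voice assistant. Answer in one or two short sentences."},
        {"role": "user", "content": text},
    ],
    model="llama3-8b-8192",
)
```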
+ # Function to convert text to speech using gTTS
+ def text_to_speech(text):
+     try:
+         tts = gTTS(text)
+         tts.save("response.mp3")
+         logging.debug("Text-to-speech conversion completed successfully.")
+         return "response.mp3"
+     except Exception as e:
+         logging.error(f"Error during text-to-speech conversion: {e}")
+         return f"Error during text-to-speech conversion: {e}"
+
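`tts.save("response.mp3")` writes a single fixed file in the working directory, so two concurrent sessions would overwrite each other's audio. A per-request variant, sketched with the standard-library `tempfile` module (`text_to_speech_tmp` is hypothetical, not part of this commit):

```python
import tempfile
from gtts import gTTS

def text_to_speech_tmp(text):
    """Hypothetical variant: write each reply to its own temporary file."""
    tts = gTTS(text)
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
        tts.save(fp.name)  # gTTS.save takes a filename
    return fp.name  # usable by gr.Audio(type="filepath") outputs
```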
+ # Combined function for Gradio
+ def chatbot(audio):
+     try:
+         # Step 1: Transcribe the audio input using Whisper
+         user_input = transcribe_audio(audio)
+
+         # Check if transcription returned an error
+         if "Error" in user_input:
+             return user_input, None
+
+         logging.debug(f"Transcribed text: {user_input}")
+
+         # Step 2: Get response from the LLaMA model using Groq API
+         response_text = get_response(user_input)
+
+         # Check if the response generation returned an error
+         if "Error" in response_text:
+             return response_text, None
+
+         logging.debug(f"Response text: {response_text}")
+
+         # Step 3: Convert the response text to speech using gTTS
+         response_audio = text_to_speech(response_text)
+
+         # Check if the text-to-speech conversion returned an error
+         if "Error" in response_audio:
+             return response_audio, None
+
+         # Step 4: Return the response text and response audio file
+         return response_text, response_audio
+
+     except Exception as e:
+         logging.error(f"Unexpected error occurred: {e}")
+         return f"Unexpected error occurred: {e}", None
+
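The `"Error" in ...` substring checks above will also fire on any legitimate transcription or reply that happens to contain the word "Error". An alternative pattern, assuming the helper functions were changed to raise instead of returning error strings, is `gr.Error`, which Gradio renders as an alert in the UI:

```python
# Sketch: exception-based error handling instead of substring checks
# (assumes transcribe_audio/get_response/text_to_speech re-raise on failure).
def chatbot_strict(audio):
    try:
        user_input = transcribe_audio(audio)
        response_text = get_response(user_input)
        return response_text, text_to_speech(response_text)
    except Exception as e:
        raise gr.Error(f"Pipeline failed: {e}")  # surfaced as a UI alert
```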
+ # Gradio Interface
+ iface = gr.Interface(
+     fn=chatbot,
+     inputs=gr.Audio(type="filepath"),
+     outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
+     live=True,
+     title="Voice-to-Voice Chatbot",
+     description="Speak to the bot, and it will respond with voice.",
+ )
+
+ try:
+     iface.launch()
+ except Exception as e:
+     logging.error(f"Error launching Gradio interface: {e}")