Spaces:

Safwanahmad619
/

voice-to-voice

Running

App Files Community

Safwanahmad619 committed on Aug 26, 2024

Commit

9e5a4f6

verified ·

1 Parent(s): b6d33b5

Update app.py

Browse files

Files changed (1) hide show

app.py +244 -100

app.py CHANGED Viewed

@@ -53,117 +53,261 @@
 #     live=True
 # )
-# iface.launch()
-import os
 import gradio as gr
 import whisper
 from gtts import gTTS
-from anthropic import Anthropic  # Import the Anthropic client
-import io  # Import io for BytesIO
-# Get the Anthropic API key from environment variables
-ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
-if not ANTHROPIC_API_KEY:
-    raise ValueError("ANTHROPIC_API_KEY environment variable is not set.")
-client = Anthropic(api_key=ANTHROPIC_API_KEY)  # Initialize the Anthropic client
-# Load Whisper model
-model = whisper.load_model("base")  # You can also use "small", "medium", "large"
-def chatbot(audio=None):
-    try:
-        if audio is None:
-            return "No input detected. Please provide an audio input.", None
-        # Transcribe the audio input using Whisper
-        transcription = model.transcribe(audio)
-        user_input = transcription.get("text", "")
-        # Generate a response using Anthropic API
-        chat_completion = client.completions.create(
-            model="claude-v1",  # Specify the model
-            prompt=user_input,   # Provide the user input as the prompt
-            max_tokens_to_sample=100,  # Specify the maximum tokens to sample
-        )
-        response_text = chat_completion['completion']
-        # Convert the response text to speech using gTTS
-        tts = gTTS(text=response_text, lang='en')
-        response_audio_io = io.BytesIO()  # Create a BytesIO object
-        tts.save(response_audio_io)  # Save the audio to the BytesIO object
-        response_audio_io.seek(0)  # Rewind the BytesIO object
-        return response_text, response_audio_io
     except Exception as e:
-        return f"An error occurred: {e}", None
-def clear_inputs():
-    return None, None, None
-# Create a custom interface
-def build_interface():
-    with gr.Blocks(css="""
-        .block-title {
-            text-align: center;
-            color: white;
-            background-color: #4CAF50;
-            padding: 10px;
-            border-radius: 8px;
-        }
-        .gradio-row {
-            background-color: #f9f9f9;
-            border-radius: 8px;
-            padding: 20px;
-            margin: 10px;
-            box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.1);
-        }
-        .gradio-column {
-            padding: 10px;
-        }
-        .gradio-button {
-            background-color: #ff6347 !important;
-            color: white !important;
-            border-radius: 8px !important;
-            padding: 10px 20px !important;
-            font-size: 16px !important;
-            border: none !important;
-            cursor: pointer !important;
-            box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.2) !important;
-            transition: background-color 0.3s ease !important;
-        }
-        .gradio-button:hover {
-            background-color: #e5533d !important;
-        }
-    """) as demo:
-        gr.Markdown(
-            """
-            <h1 class="block-title">Voice-to-Voice AI Chatbot</h1>
-            """
-        )
-        with gr.Row(elem_classes="gradio-row"):
-            with gr.Column(elem_classes="gradio-column", scale=1):
-                audio_input = gr.Audio(type="filepath", label="Record Your Voice")
-            with gr.Column(elem_classes="gradio-column", scale=2):
-                chatbot_output_text = gr.Textbox(label="Chatbot Response")
-                chatbot_output_audio = gr.Audio(label="Audio Response")
-        clear_button = gr.Button("Clear", elem_classes="gradio-button")
-        clear_button.click(
-            fn=clear_inputs,
-            outputs=[audio_input, chatbot_output_text, chatbot_output_audio]
-        )
-        audio_input.change(
-            fn=chatbot,
-            inputs=[audio_input],
-            outputs=[chatbot_output_text, chatbot_output_audio]
         )
-    return demo
-# Launch the interface
-if __name__ == "__main__":
-    interface = build_interface()
-    interface.launch()

 #     live=True
 # )
+# # iface.launch()
+# import os
+# import gradio as gr
+# import whisper
+# from gtts import gTTS
+# from anthropic import Anthropic  # Import the Anthropic client
+# import io  # Import io for BytesIO
+# # Get the Anthropic API key from environment variables
+# ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
+# if not ANTHROPIC_API_KEY:
+#     raise ValueError("ANTHROPIC_API_KEY environment variable is not set.")
+# client = Anthropic(api_key=ANTHROPIC_API_KEY)  # Initialize the Anthropic client
+# # Load Whisper model
+# model = whisper.load_model("base")  # You can also use "small", "medium", "large"
+# def chatbot(audio=None):
+#     try:
+#         if audio is None:
+#             return "No input detected. Please provide an audio input.", None
+#         # Transcribe the audio input using Whisper
+#         transcription = model.transcribe(audio)
+#         user_input = transcription.get("text", "")
+#         # Generate a response using Anthropic API
+#         chat_completion = client.completions.create(
+#             model="claude-v1",  # Specify the model
+#             prompt=user_input,   # Provide the user input as the prompt
+#             max_tokens_to_sample=100,  # Specify the maximum tokens to sample
+#         )
+#         response_text = chat_completion['completion']
+#         # Convert the response text to speech using gTTS
+#         tts = gTTS(text=response_text, lang='en')
+#         response_audio_io = io.BytesIO()  # Create a BytesIO object
+#         tts.save(response_audio_io)  # Save the audio to the BytesIO object
+#         response_audio_io.seek(0)  # Rewind the BytesIO object
+#         return response_text, response_audio_io
+#     except Exception as e:
+#         return f"An error occurred: {e}", None
+# def clear_inputs():
+#     return None, None, None
+# # Create a custom interface
+# def build_interface():
+#     with gr.Blocks(css="""
+#         .block-title {
+#             text-align: center;
+#             color: white;
+#             background-color: #4CAF50;
+#             padding: 10px;
+#             border-radius: 8px;
+#         }
+#         .gradio-row {
+#             background-color: #f9f9f9;
+#             border-radius: 8px;
+#             padding: 20px;
+#             margin: 10px;
+#             box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.1);
+#         }
+#         .gradio-column {
+#             padding: 10px;
+#         }
+#         .gradio-button {
+#             background-color: #ff6347 !important;
+#             color: white !important;
+#             border-radius: 8px !important;
+#             padding: 10px 20px !important;
+#             font-size: 16px !important;
+#             border: none !important;
+#             cursor: pointer !important;
+#             box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.2) !important;
+#             transition: background-color 0.3s ease !important;
+#         }
+#         .gradio-button:hover {
+#             background-color: #e5533d !important;
+#         }
+#     """) as demo:
+#         gr.Markdown(
+#             """
+#             <h1 class="block-title">Voice-to-Voice AI Chatbot</h1>
+#             """
+#         )
+#         with gr.Row(elem_classes="gradio-row"):
+#             with gr.Column(elem_classes="gradio-column", scale=1):
+#                 audio_input = gr.Audio(type="filepath", label="Record Your Voice")
+#             with gr.Column(elem_classes="gradio-column", scale=2):
+#                 chatbot_output_text = gr.Textbox(label="Chatbot Response")
+#                 chatbot_output_audio = gr.Audio(label="Audio Response")
+#         clear_button = gr.Button("Clear", elem_classes="gradio-button")
+#         clear_button.click(
+#             fn=clear_inputs,
+#             outputs=[audio_input, chatbot_output_text, chatbot_output_audio]
+#         )
+#         audio_input.change(
+#             fn=chatbot,
+#             inputs=[audio_input],
+#             outputs=[chatbot_output_text, chatbot_output_audio]
+#         )
+#     return demo
+# # Launch the interface
+# if __name__ == "__main__":
+#     interface = build_interface()
+#     interface.launch()
 import gradio as gr
 import whisper
 from gtts import gTTS
+from groq import Groq
+import os
+import numpy as np
+import soundfile as sf
+import logging
+# Configure logging
+# DEBUG level makes the logging.debug calls in the helper functions visible.
+logging.basicConfig(level=logging.DEBUG)
+# Initialize the Groq API key from environment variables
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+if not GROQ_API_KEY:
+    # Fail at start-up rather than on the first request.
+    raise RuntimeError("GROQ_API_KEY environment variable not set.")
+# Initialize Whisper model (No API key required)
+try:
+    whisper_model = whisper.load_model("base")
+    logging.info("Whisper model loaded successfully.")
+except Exception as e:
+    # Chain the original exception so its traceback is not lost.
+    raise RuntimeError(f"Error loading Whisper model: {e}") from e
+# Initialize Groq client (API key required for Groq API)
+try:
+    client = Groq(
+        api_key=GROQ_API_KEY  # Use the API key from the environment variable
+    )
+    logging.info("Groq client initialized successfully.")
+except Exception as e:
+    # Chain the original exception so its traceback is not lost.
+    raise RuntimeError(f"Error initializing Groq client: {e}") from e
+# Function to transcribe audio using Whisper
+def transcribe_audio(audio):
+    """Transcribe an audio file to text with the module-level Whisper model.
+
+    Args:
+        audio: Path of the audio file to read; it is passed straight to
+            ``sf.read``.
+
+    Returns:
+        The transcribed text on success, or a string starting with
+        ``"Error during transcription:"`` when any step raises.
+    """
+    try:
+        # Load audio file with soundfile
+        logging.debug(f"Loading audio file: {audio}")
+        audio_data, sample_rate = sf.read(audio, dtype='float32')  # Ensure dtype is float32
+        logging.debug(f"Audio loaded with sample rate: {sample_rate}, data shape: {audio_data.shape}")
+        # Multi-channel files come back as a 2-D (frames, channels) array,
+        # which np.interp below cannot handle; average the channels to mono.
+        if audio_data.ndim > 1:
+            audio_data = audio_data.mean(axis=1)
+        # Whisper expects a specific sample rate
+        if sample_rate != 16000:
+            logging.debug(f"Resampling audio from {sample_rate} to 16000 Hz")
+            # Resample audio data to 16000 Hz by linear interpolation
+            num_samples = int(len(audio_data) * (16000 / sample_rate))
+            # Output sample i corresponds to source index i * sample_rate / 16000,
+            # which keeps the time axis to scale and inside the input range.
+            positions = np.arange(num_samples) * (sample_rate / 16000)
+            audio_data = np.interp(positions,
+                                   np.arange(len(audio_data)),
+                                   audio_data)
+        # np.interp and the channel mean may change dtype; restore float32.
+        audio_data = np.ascontiguousarray(audio_data, dtype=np.float32)
+        # Perform the transcription
+        result = whisper_model.transcribe(audio_data)
+        logging.debug(f"Transcription result: {result['text']}")
+        return result['text']
     except Exception as e:
+        logging.error(f"Error during transcription: {e}")
+        return f"Error during transcription: {e}"
+# Function to get response from LLaMA model using Groq API
+def get_response(text):
+    """Send ``text`` to the Groq chat-completions API and return the reply.
+
+    Args:
+        text: The user's message, sent as a single ``user`` turn.
+
+    Returns:
+        The content of the first choice's message on success, or a string
+        starting with ``"Error during model response generation:"`` when the
+        request or the response handling raises.
+    """
+    try:
+        logging.debug(f"Sending request to Groq API with text: {text}")
+        # One-message conversation: no system prompt and no history.
+        user_turn = {"role": "user", "content": text}
+        completion = client.chat.completions.create(
+            model="llama3-8b-8192",
+            messages=[user_turn],
         )
+        # Only the first choice is used.
+        reply = completion.choices[0].message.content
+        logging.debug(f"Received response from Groq API: {reply}")
+        return reply
+    except Exception as err:
+        logging.error(f"Error during model response generation: {err}")
+        return f"Error during model response generation: {err}"
+# Function to convert text to speech using gTTS
+def text_to_speech(text):
+    """Convert ``text`` to an MP3 file with gTTS and return the file's path.
+
+    Args:
+        text: The text to synthesize.
+
+    Returns:
+        Path of the generated MP3 on success, or a string starting with
+        ``"Error during text-to-speech conversion:"`` when synthesis or
+        saving raises.
+    """
+    try:
+        import tempfile  # local import: only this function needs it
+
+        tts = gTTS(text)
+        # A fixed name such as "response.mp3" is shared by every request, so
+        # concurrent calls would overwrite each other's audio. Reserve a
+        # unique file instead. delete=False keeps it on disk for the caller;
+        # nothing here removes it afterwards.
+        with tempfile.NamedTemporaryFile(
+            prefix="response_", suffix=".mp3", delete=False
+        ) as tmp:
+            output_path = tmp.name
+        # Save after the handle is closed so gTTS can reopen the path itself.
+        tts.save(output_path)
+        logging.debug("Text-to-speech conversion completed successfully.")
+        return output_path
+    except Exception as e:
+        logging.error(f"Error during text-to-speech conversion: {e}")
+        return f"Error during text-to-speech conversion: {e}"
+# Combined function for Gradio
+def chatbot(audio):
+    """Run one voice turn: transcribe, query the model, synthesize the reply.
+
+    Args:
+        audio: Path of the recorded audio file, or ``None`` when no audio
+            was supplied.
+
+    Returns:
+        A ``(text, audio_path)`` tuple. On success ``text`` is the model's
+        reply and ``audio_path`` is the spoken version. On failure ``text``
+        is the error message and ``audio_path`` is ``None``.
+    """
+    try:
+        # Without this guard a missing recording surfaces as a low-level
+        # transcription error message.
+        if audio is None:
+            return "No input detected. Please provide an audio input.", None
+        # Step 1: Transcribe the audio input using Whisper
+        user_input = transcribe_audio(audio)
+        # The helpers report failure by returning a message with a fixed
+        # prefix. Match that prefix exactly: a plain `"Error" in ...` test
+        # would treat any sentence containing the word "Error" as a failure.
+        if user_input.startswith("Error during transcription:"):
+            return user_input, None
+        logging.debug(f"Transcribed text: {user_input}")
+        # Step 2: Get response from the LLaMA model using Groq API
+        response_text = get_response(user_input)
+        if response_text.startswith("Error during model response generation:"):
+            return response_text, None
+        logging.debug(f"Response text: {response_text}")
+        # Step 3: Convert the response text to speech using gTTS
+        response_audio = text_to_speech(response_text)
+        if response_audio.startswith("Error during text-to-speech conversion:"):
+            return response_audio, None
+        # Step 4: Return the response text and response audio file
+        return response_text, response_audio
+    except Exception as e:
+        logging.error(f"Unexpected error occurred: {e}")
+        return f"Unexpected error occurred: {e}", None
+# Gradio Interface
+# Wire the chatbot function to an audio input and text + audio outputs.
+iface = gr.Interface(
+    fn=chatbot,
+    inputs=gr.Audio(type="filepath"),  # chatbot receives a file path
+    outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
+    live=True,
+    title="Voice-to-Voice Chatbot",
+    description="Speak to the bot, and it will respond with voice.",
+)
+try:
+    iface.launch()
+except Exception as e:
+    # logging.exception logs at ERROR level like before, plus the traceback.
+    logging.exception(f"Error launching Gradio interface: {e}")