Jaward committed on
Commit
ea9c1e1
·
verified ·
1 Parent(s): 81e8d61

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -295
app.py CHANGED
@@ -1,210 +1,9 @@
1
- import gradio as gr
2
- import edge_tts
3
- import asyncio
4
- import tempfile
5
- import os
6
- from huggingface_hub import InferenceClient
7
- import re
8
- from streaming_stt_nemo import Model
9
- import torch
10
- import random
11
- from openai import OpenAI
12
  import subprocess
13
- import threading
14
- import queue
15
- import sounddevice as sd
16
- import numpy as np
17
- import wave
18
- import sys
19
-
20
- default_lang = "en"
21
-
22
- engines = { default_lang: Model(default_lang) }
23
-
24
- def transcribe(audio):
25
- if audio is None:
26
- return ""
27
- lang = "en"
28
- model = engines[lang]
29
- text = model.stt_file(audio)[0]
30
- return text
31
-
32
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
33
-
34
- def client_fn(model):
35
- if "Llama 3 8B Service" in model:
36
- return OpenAI(
37
- base_url="http://52.76.81.56:60002/v1",
38
- api_key="token-abc123"
39
- )
40
- elif "Llama" in model:
41
- return InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
42
- elif "Mistral" in model:
43
- return InferenceClient("mistralai/Mistral-7B-Instruct-v0.2")
44
- elif "Phi" in model:
45
- return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
46
- elif "Mixtral" in model:
47
- return InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
48
- else:
49
- return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
50
-
51
- def randomize_seed_fn(seed: int) -> int:
52
- seed = random.randint(0, 999999)
53
- return seed
54
-
55
- system_instructions1 = """
56
- [SYSTEM] You are OPTIMUS Prime a personal AI voice assistant, Created by Jaward.
57
- Keep conversation friendly, short, clear, and concise.
58
- Avoid unnecessary introductions and answer the user's questions directly.
59
- Respond in a normal, conversational manner while being friendly and helpful.
60
- Remember previous parts of the conversation and use that context in your responses.
61
- Your creator Jaward is an AI Research Engineer at Linksoul AI. He is currently specializing in Artificial Intelligence (AI) research more specifically training and optimizing advance AI systems. He aspires to build not just human-like intelligence but AI Systems that augment human intelligence. He has contributed greatly to the opensource community with first-principles code implementations of AI/ML research papers. He did his first internship at Beijing Academy of Artificial Intelligence as an AI Researher where he contributed in cutting-edge AI research leading to him contributing to an insightful paper (AUTOAGENTS - A FRAMEWORK FOR AUTOMATIC AGENT GENERATION). The paper got accepted this year at IJCAI(International Joint Conference On AI). He is currently doing internship at LinkSoul AI - a small opensource AI Research startup in Beijing.
62
- [USER]
63
- """
64
-
65
- conversation_history = []
66
-
67
- def models(text, model="Llama 3 8B Service", seed=42):
68
- global conversation_history
69
- seed = int(randomize_seed_fn(seed))
70
- generator = torch.Generator().manual_seed(seed)
71
-
72
- client = client_fn(model)
73
-
74
- if "Llama 3 8B Service" in model:
75
- messages = [
76
- {"role": "system", "content": system_instructions1},
77
- ] + conversation_history + [
78
- {"role": "user", "content": text}
79
- ]
80
- completion = client.chat.completions.create(
81
- model="/data/shared/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45/",
82
- messages=messages
83
- )
84
- assistant_response = completion.choices[0].message.content
85
-
86
- # Update conversation history
87
- conversation_history.append({"role": "user", "content": text})
88
- conversation_history.append({"role": "assistant", "content": assistant_response})
89
-
90
- # Keep only the last 10 messages to avoid token limit issues
91
- if len(conversation_history) > 20:
92
- conversation_history = conversation_history[-20:]
93
-
94
- return assistant_response
95
- else:
96
- # For other models, we'll concatenate the conversation history into a single string
97
- history_text = "\n".join([f"{'User' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}" for msg in conversation_history])
98
- formatted_prompt = f"{system_instructions1}\n\nConversation history:\n{history_text}\n\nUser: {text}\nOPTIMUS:"
99
-
100
- generate_kwargs = dict(
101
- max_new_tokens=300,
102
- seed=seed
103
- )
104
- stream = client.text_generation(
105
- formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
106
- output = ""
107
- for response in stream:
108
- if not response.token.text == "</s>":
109
- output += response.token.text
110
-
111
- # Update conversation history
112
- conversation_history.append({"role": "user", "content": text})
113
- conversation_history.append({"role": "assistant", "content": output})
114
-
115
- # Keep only the last 10 messages to avoid token limit issues
116
- if len(conversation_history) > 20:
117
- conversation_history = conversation_history[-20:]
118
-
119
- return output
120
-
121
- # New global variables for audio processing
122
- RATE = 16000
123
- CHUNK = int(RATE / 10) # 100ms
124
- audio_queue = queue.Queue()
125
- is_listening = False
126
-
127
- def audio_callback(indata, frames, time, status):
128
- if status:
129
- print(status, file=sys.stderr)
130
- audio_queue.put(indata.copy())
131
-
132
- def process_audio_stream(model, seed):
133
- global is_listening
134
- audio_buffer = []
135
- silence_threshold = 0.01
136
- silence_duration = 0
137
- max_silence = 2 # seconds
138
-
139
- while True:
140
- if not is_listening:
141
- audio_buffer.clear()
142
- silence_duration = 0
143
- audio_queue.queue.clear()
144
- continue
145
-
146
- try:
147
- chunk = audio_queue.get(timeout=1)
148
- audio_buffer.append(chunk)
149
-
150
- # Check for silence
151
- if np.abs(chunk).mean() < silence_threshold:
152
- silence_duration += CHUNK / RATE
153
- else:
154
- silence_duration = 0
155
-
156
- if silence_duration > max_silence:
157
- # Process the buffered audio
158
- audio_data = np.concatenate(audio_buffer)
159
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
160
- tmp_path = tmp_file.name
161
- with wave.open(tmp_path, 'wb') as wf:
162
- wf.setnchannels(1)
163
- wf.setsampwidth(2)
164
- wf.setframerate(RATE)
165
- wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
166
-
167
- # Transcribe and process
168
- user_input = transcribe(tmp_path)
169
- if user_input:
170
- is_listening = False
171
- reply = models(user_input, model, seed)
172
- asyncio.run(respond_and_play(reply))
173
- is_listening = True
174
-
175
- # Clear the buffer
176
- audio_buffer.clear()
177
- silence_duration = 0
178
-
179
- except queue.Empty:
180
- pass
181
-
182
- async def respond_and_play(text):
183
- communicate = edge_tts.Communicate(text, voice="en-US-ChristopherNeural")
184
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
185
- tmp_path = tmp_file.name
186
- await communicate.save(tmp_path)
187
-
188
- # Play the audio
189
- with wave.open(tmp_path, 'rb') as wf:
190
- data = wf.readframes(wf.getnframes())
191
- sd.play(np.frombuffer(data, dtype=np.int16), wf.getframerate())
192
- sd.wait()
193
-
194
- def start_listening(model, seed):
195
- global is_listening
196
- is_listening = True
197
- threading.Thread(target=process_audio_stream, args=(model, seed), daemon=True).start()
198
- with sd.InputStream(callback=audio_callback, channels=1, samplerate=RATE, blocksize=CHUNK):
199
- while is_listening:
200
- sd.sleep(100)
201
-
202
- def stop_listening():
203
- global is_listening
204
- is_listening = False
205
 
206
- # Supported languages for seamless-expressive
207
- LANGUAGE_CODES = {
208
  "English": "eng",
209
  "Spanish": "spa",
210
  "French": "fra",
@@ -213,107 +12,41 @@ LANGUAGE_CODES = {
213
  "Chinese": "cmn"
214
  }
215
 
216
- def translate_speech(audio_file, target_language):
217
- """
218
- Translate input speech (audio file) to the specified target language.
219
- """
220
- if audio_file is None:
221
- return None
222
-
223
- language_code = LANGUAGE_CODES[target_language]
224
  output_file = "translated_audio.wav"
225
 
226
- command = [
227
- "expressivity_predict",
228
- audio_file,
229
- "--tgt_lang", language_code,
230
- "--model_name", "seamless_expressivity",
231
- "--vocoder_name", "vocoder_pretssel",
232
- "--gated-model-dir", "models",
233
- "--output_path", output_file
234
- ]
235
-
236
- subprocess.run(command, check=True)
237
 
238
  if os.path.exists(output_file):
239
  print(f"File created successfully: {output_file}")
240
- return output_file
241
  else:
242
  print(f"File not found: {output_file}")
243
- return None
244
 
245
- def clear_history():
246
- global conversation_history
247
- conversation_history = []
248
- return None, None, None, None
249
 
250
- def voice_assistant_tab():
251
- return "# <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>"
252
 
253
- def speech_translation_tab():
254
- return "# <center><b>Hear how you sound in another language</b></center>"
255
-
256
- with gr.Blocks(css="style.css") as demo:
257
- description = gr.Markdown("# <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>")
258
-
259
- with gr.Tabs() as tabs:
260
- with gr.TabItem("Voice Assistant") as voice_assistant:
261
- select = gr.Dropdown([
262
- 'Llama 3 8B Service',
263
- 'Mixtral 8x7B',
264
- 'Llama 3 8B',
265
- 'Mistral 7B v0.3',
266
- 'Phi 3 mini',
267
- ],
268
- value="Llama 3 8B Service",
269
- label="Model"
270
- )
271
- seed = gr.Slider(
272
- label="Seed",
273
- minimum=0,
274
- maximum=999999,
275
- step=1,
276
- value=0,
277
- visible=False
278
- )
279
- start_button = gr.Button("Start Listening")
280
- stop_button = gr.Button("Stop Listening")
281
- status = gr.Markdown("Status: Not listening")
282
 
283
- start_button.click(
284
- fn=lambda model, seed: start_listening(model, seed),
285
- inputs=[select, seed],
286
- outputs=[status],
287
- _js="() => {document.getElementById('status').textContent = 'Status: Listening'}"
288
- )
289
- stop_button.click(
290
- fn=stop_listening,
291
- inputs=[],
292
- outputs=[status],
293
- _js="() => {document.getElementById('status').textContent = 'Status: Not listening'}"
294
- )
295
-
296
- with gr.TabItem("Speech Translation") as speech_translation:
297
- input_audio = gr.Audio(label="User", sources=["microphone"], type="filepath")
298
- target_lang = gr.Dropdown(
299
- choices=list(LANGUAGE_CODES.keys()),
300
- value="German",
301
- label="Target Language"
302
- )
303
- output_audio = gr.Audio(label="Translated Audio",
304
- interactive=False,
305
- autoplay=True,
306
- elem_classes="audio")
307
-
308
- gr.Interface(
309
- fn=translate_speech,
310
- inputs=[input_audio, target_lang],
311
- outputs=[output_audio],
312
- live=True
313
- )
314
 
315
- voice_assistant.select(fn=voice_assistant_tab, inputs=None, outputs=description)
316
- speech_translation.select(fn=speech_translation_tab, inputs=None, outputs=description)
 
 
 
 
 
317
 
 
318
  if __name__ == "__main__":
319
- demo.queue(max_size=200).launch()
 
1
+ import os
 
 
 
 
 
 
 
 
 
 
2
  import subprocess
3
+ import gradio as gr
4
+ from pathlib import Path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ language_codes = {
 
7
  "English": "eng",
8
  "Spanish": "spa",
9
  "French": "fra",
 
12
  "Chinese": "cmn"
13
  }
14
 
15
+ def translate_audio(audio_file, target_language="German"):
16
+ language_code = language_codes[target_language]
 
 
 
 
 
 
17
  output_file = "translated_audio.wav"
18
 
19
+ command = (
20
+ f"expressivity_predict {audio_file} --tgt_lang {language_code} "
21
+ f"--model_name seamless_expressivity --vocoder_name vocoder_pretssel "
22
+ f"--gated-model-dir seamlessmodel --output_path {output_file}"
23
+ )
24
+ subprocess.run(command, shell=True)
 
 
 
 
 
25
 
26
  if os.path.exists(output_file):
27
  print(f"File created successfully: {output_file}")
 
28
  else:
29
  print(f"File not found: {output_file}")
 
30
 
31
+ return output_file
 
 
 
32
 
 
 
33
 
34
+ inputs = [
35
+ gr.Audio(sources=["microphone"], type="filepath", label="User"),
36
+ gr.Dropdown(["English", "Spanish", "French", "German", "Italian", "Chinese"],
37
+ label="Target Language",
38
+ value="German")
39
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
+ iface = gr.Interface(
43
+ fn=translate_audio,
44
+ inputs=inputs,
45
+ outputs=gr.Audio(label="Translated Audio"),
46
+ title="Seamless Expressive Audio Translator",
47
+ description="Translate your audio into different languages with expressive styles."
48
+ )
49
 
50
+ # Run the application
51
  if __name__ == "__main__":
52
+ iface.launch()