hu-po committed on
Commit
a86e62a
·
1 Parent(s): 64c61b2

release 0.3

Browse files
app.py CHANGED
@@ -197,7 +197,26 @@ def make_voices(voices_yaml: str):
197
 
198
  # Define the main GradIO UI
199
  with gr.Blocks() as demo:
200
- gr.HTML('''<center><h1>Speech2Speech</h1></center>''')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  with gr.Tab("Conversation"):
202
  gr_convo_output = gr.HTML()
203
  with gr.Row():
@@ -249,8 +268,6 @@ with gr.Blocks() as demo:
249
 
250
  gr.HTML('''<center>
251
  Created by <a href="https://youtube.com/@hu-po">Hu Po</a> GitHub: <a href="https://github.com/hu-po/speech2speech">speech2speech</a>
252
- <br>
253
- Duplicate this space:<a href="https://huggingface.co/spaces/hu-po/speech2speech?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
254
  </center>
255
  ''')
256
 
 
197
 
198
  # Define the main GradIO UI
199
  with gr.Blocks() as demo:
200
+ gr.HTML('''
201
+ <center>
202
+ <h1>Speech2Speech</h1>
203
+ Make a private copy of this space to paste your API keys.
204
+ <br>
205
+ <a href="https://huggingface.co/spaces/hu-po/speech2speech?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
206
+ </center>''')
207
+ with gr.Row():
208
+ openai_api_key_textbox = gr.Textbox(
209
+ placeholder="Paste your OpenAI API key here",
210
+ show_label=False,
211
+ lines=1,
212
+ type="password",
213
+ )
214
+ elevenlabs_api_key_textbox = gr.Textbox(
215
+ placeholder="Paste your ElevenLabs API key here",
216
+ show_label=False,
217
+ lines=1,
218
+ type="password",
219
+ )
220
  with gr.Tab("Conversation"):
221
  gr_convo_output = gr.HTML()
222
  with gr.Row():
 
268
 
269
  gr.HTML('''<center>
270
  Created by <a href="https://youtube.com/@hu-po">Hu Po</a> GitHub: <a href="https://github.com/hu-po/speech2speech">speech2speech</a>
 
 
271
  </center>
272
  ''')
273
 
requirements.txt CHANGED
@@ -5,7 +5,7 @@ soundfile==0.12.1
5
  gradio==3.19.1
6
  scipy==1.10.1
7
  SpeechRecognition==3.9.0
8
- # pytube==12.1.2
9
  # git+https://github.com/pytube/pytube@master#egg=pytube
10
- librosa
11
- pytube
 
 
5
  gradio==3.19.1
6
  scipy==1.10.1
7
  SpeechRecognition==3.9.0
 
8
  # git+https://github.com/pytube/pytube@master#egg=pytube
9
+ pytube==12.1.2
10
+ # librosa
11
+ # torchlibrosa
src/src/__pycache__/elevenlabs.cpython-310.pyc ADDED
Binary file (4.65 kB). View file
 
src/src/__pycache__/elevenlabs.cpython-39.pyc ADDED
Binary file (4.64 kB). View file
 
src/src/__pycache__/openailib.cpython-310.pyc ADDED
Binary file (1.59 kB). View file
 
src/src/__pycache__/openailib.cpython-39.pyc ADDED
Binary file (1.59 kB). View file
 
src/src/__pycache__/tube.cpython-310.pyc ADDED
Binary file (1.82 kB). View file
 
src/src/__pycache__/tube.cpython-39.pyc ADDED
Binary file (1.81 kB). View file
 
src/src/__pycache__/utils.cpython-310.pyc ADDED
Binary file (639 Bytes). View file
 
src/src/__pycache__/utils.cpython-39.pyc ADDED
Binary file (637 Bytes). View file
 
src/src/elevenlabs.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import io
3
+ import logging
4
+ import os
5
+ import time
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from dataclasses import dataclass
8
+ from typing import List, Union, Tuple
9
+
10
+ import sounddevice as sd
11
+ import soundfile as sf
12
+ from elevenlabslib import ElevenLabsUser, ElevenLabsVoice
13
+
14
+ from .utils import timeit
15
+
16
+ logging.basicConfig(level=logging.INFO)
17
+ log = logging.getLogger(__name__)
18
+
19
# Module-wide ElevenLabs client; stays None until a key is available.
USER = None


def set_elevenlabs_key(elevenlabs_api_key_textbox=None):
    """Build the module-level ElevenLabs client from the configured API key.

    When ``elevenlabs_api_key_textbox`` is not ``None`` it is first stored in
    the ``ELEVENLABS_API_KEY`` environment variable. The client is then
    created from that variable and assigned to the global ``USER``. If a
    ``KeyError`` is raised while doing so, ``USER`` is set to ``None`` and a
    warning is logged.
    """
    global USER
    log.info("Setting ElevenLabs key.")
    if elevenlabs_api_key_textbox is not None:
        os.environ["ELEVENLABS_API_KEY"] = elevenlabs_api_key_textbox
    try:
        client = ElevenLabsUser(os.environ["ELEVENLABS_API_KEY"])
    except KeyError:
        client = None
        log.warning("ELEVENLABS_API_KEY not found in environment variables.")
    USER = client


# Pick up a key from the environment, if any, at import time.
set_elevenlabs_key()
34
+
35
@dataclass
class Speaker:
    """A conversation participant together with the voice that speaks for them."""

    # Name of the speaker.
    name: str
    # Voice object handed to the text-to-speech helpers for this speaker.
    voice: ElevenLabsVoice
    # Color associated with the speaker (presumably for UI rendering; confirm against caller).
    color: str
    # Optional description; None when not provided.
    description: Union[str, None] = None
41
+
42
+
43
async def text_to_speechbytes_async(text, speaker, loop):
    """Run ``text_to_speechbytes`` for ``speaker.voice`` in a worker thread.

    A dedicated ``ThreadPoolExecutor`` is created for the call and handed to
    ``loop.run_in_executor``; the awaited result is returned.
    """
    pool = ThreadPoolExecutor()
    try:
        return await loop.run_in_executor(pool, text_to_speechbytes, text, speaker.voice)
    finally:
        # Same cleanup a ``with`` block performs on exit.
        pool.shutdown(wait=True)
47
+
48
+
49
async def play_history(history: List[Tuple[Speaker, str]]):
    """Synthesize every line of ``history`` and play the clips one after another.

    Args:
        history: ``(speaker, text)`` pairs, in playback order.
    """
    loop = asyncio.get_event_loop()

    # Start synthesis for every line at once.
    tasks = [text_to_speechbytes_async(text, speaker, loop)
             for speaker, text in history]

    # gather() waits for ALL tasks and returns results in input order.
    for speech_bytes in await asyncio.gather(*tasks):
        # Decode inside a context manager so the SoundFile handle is closed.
        with sf.SoundFile(io.BytesIO(speech_bytes)) as sound_file:
            samples = sound_file.read()
            samplerate = sound_file.samplerate
        # NOTE: blocking=True makes this call wait until the clip has finished.
        sd.play(samples, samplerate=samplerate, blocking=True)
61
+
62
+
63
async def save_history(history: List[Tuple[Speaker, str]], audio_savepath: str):
    """Synthesize every line of ``history`` and save them as one audio file.

    Each clip is decoded on its own and its samples are appended to the output
    file. Joining the encoded byte streams and decoding them as a single file
    is not reliable, because each clip may carry its own header.

    Args:
        history: ``(speaker, text)`` pairs, in the order they should be heard.
        audio_savepath: Destination path of the combined audio file.
    """
    if not history:
        log.warning("Empty history, no audio written to %s.", audio_savepath)
        return

    loop = asyncio.get_event_loop()

    # Start synthesis for every line at once.
    tasks = [text_to_speechbytes_async(text, speaker, loop)
             for speaker, text in history]

    # gather() waits for ALL tasks and returns results in input order.
    all_speech_bytes = await asyncio.gather(*tasks)

    output_file = None
    try:
        for speech_bytes in all_speech_bytes:
            with sf.SoundFile(io.BytesIO(speech_bytes), mode='r') as clip:
                if output_file is None:
                    # The first clip decides the output sample rate and channel count.
                    # NOTE(review): assumes all clips share these parameters; confirm.
                    output_file = sf.SoundFile(
                        audio_savepath, mode='w',
                        samplerate=clip.samplerate,
                        channels=clip.channels,
                    )
                output_file.write(clip.read())
    finally:
        if output_file is not None:
            output_file.close()
84
+
85
+
86
def check_voice_exists(voice: Union[ElevenLabsVoice, str]) -> Union[ElevenLabsVoice, None]:
    """Look up ``voice`` through the module-level ElevenLabs client.

    Returns the first entry of ``USER.get_voices_by_name(voice)``, or ``None``
    when no client is configured (``USER is None``) or nothing matches.
    """
    if USER is None:
        log.warning(
            "No ElevenLabsUser found, have you set the ELEVENLABS_API_KEY environment variable?")
        return None

    log.info(f"Getting voice {voice}...")
    matches = USER.get_voices_by_name(voice)
    if not matches:
        return None

    log.info(f"Voice {voice} already exists, found {matches}.")
    return matches[0]
97
+
98
+
99
@timeit
def get_make_voice(voice: Union[ElevenLabsVoice, str], audio_path: List[str] = None) -> Union[ElevenLabsVoice, None]:
    """Return the voice ``voice``, cloning it from audio samples when it is missing.

    Args:
        voice: Voice identifier passed to ``check_voice_exists`` and, when
            cloning, to ``USER.clone_voice_bytes``.
        audio_path: List of sample audio files (``str`` or path-like) used for
            cloning. Only needed when the voice does not exist yet.

    Returns:
        The existing or newly cloned voice, or ``None`` when no ElevenLabs
        client is configured.

    Raises:
        AssertionError: If cloning is required but ``audio_path`` is missing
            or is not a list.
        ValueError: If the voice does not exist and cloning is not available.
    """
    if USER is None:
        log.warning(
            "No ElevenLabsUser found, have you set the ELEVENLABS_API_KEY environment variable?")
        return None

    existing_voice = check_voice_exists(voice)
    if existing_voice is not None:
        return existing_voice

    if not USER.get_voice_clone_available():
        raise ValueError(
            f"Voice {voice} does not exist and cloning is not available.")

    assert audio_path is not None, "audio_path must be provided"
    assert isinstance(audio_path, list), "audio_path must be a list"
    log.info(f"Cloning voice {voice}...")

    samples = {}
    for sample_path in audio_path:
        # Key by file name; os.fspath accepts both str and Path entries.
        sample_name = os.path.basename(os.fspath(sample_path))
        # Read inside a context manager so the handle is closed promptly.
        with open(sample_path, "rb") as sample_file:
            samples[sample_name] = sample_file.read()
    return USER.clone_voice_bytes(voice, samples)
121
+
122
+
123
@timeit
def text_to_speech(text: str, voice: ElevenLabsVoice):
    """Generate and play ``text`` with ``voice``; return the elapsed seconds.

    Playback is requested with ``playInBackground=False``. The return value is
    the wall-clock time spent inside ``voice.generate_and_play_audio``.
    """
    log.info(f"Generating audio using voice {voice}...")
    started_at = time.time()
    voice.generate_and_play_audio(text, playInBackground=False)
    return time.time() - started_at
130
+
131
+
132
@timeit
def text_to_speechbytes(text: str, voice: ElevenLabsVoice):
    """Return the result of ``voice.generate_audio_bytes(text)``, logging the request first."""
    log.info(f"Generating audio for voice {voice} text {text}...")
    return voice.generate_audio_bytes(text)
src/src/openailib.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+
4
+ from .utils import timeit
5
+
6
+ import openai
7
+
8
+ logging.basicConfig(level=logging.INFO)
9
+ log = logging.getLogger(__name__)
10
+
11
+
12
def set_openai_key(openai_api_key_textbox=None):
    """Configure ``openai.api_key`` from the given key or the environment.

    When ``openai_api_key_textbox`` is not ``None`` it is first stored in the
    ``OPENAI_API_KEY`` environment variable. ``openai.api_key`` is then set to
    the value of that variable, which is ``None`` if it is unset; in that case
    a warning is logged.
    """
    log.info("Setting OpenAI key.")
    if openai_api_key_textbox is not None:
        os.environ["OPENAI_API_KEY"] = openai_api_key_textbox

    # os.getenv returns None for a missing variable instead of raising
    # KeyError, so the missing-key case has to be detected explicitly.
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key is None:
        log.warning("OPENAI_API_KEY not found in environment variables.")
    openai.api_key = api_key


# Pick up a key from the environment, if any, at import time.
set_openai_key()
23
+
24
@timeit
def speech_to_text(audio_path):
    """Transcribe the audio file at ``audio_path`` with the ``whisper-1`` model.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The ``"text"`` field of the transcription response.
    """
    log.info("Transcribing audio...")
    # Keep the file open only for the duration of the API call.
    with open(audio_path, "rb") as audio_file:
        transcript = openai.Audio.transcribe("whisper-1", audio_file)
    text = transcript["text"]
    log.info(f"Transcript: \n\t{text}")
    return text
31
+
32
+
33
@timeit
def top_response(prompt, system=None, model="gpt-3.5-turbo", max_tokens=20, temperature=0.8):
    """Request a single chat completion and return its message content.

    Args:
        prompt: Content of the user message.
        system: Optional content of a system message placed before the user
            message; omitted when falsy.
        model: Model name passed to ``openai.ChatCompletion.create``.
        max_tokens: ``max_tokens`` value passed to the API.
        temperature: ``temperature`` value passed to the API.

    Returns:
        The ``content`` of the first choice's message.
    """
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    log.info(f"API call to {model} with prompt: \n\n\t{messages}\n\n")
    completion = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
        n=1,
        max_tokens=max_tokens,
    )
    log.info(f"API reponse: \n\t{completion}")
    reply: str = completion['choices'][0]['message']['content']
    return reply
src/src/tube.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Extract audio from a YouTube video
3
+
4
+ Usage:
5
+ tube.py <url> <person> [-s <start_time>] [-d <duration>]
6
+ '''
7
+
8
+ import subprocess
9
+ from pathlib import Path
10
+ import datetime
11
+ import argparse
12
+ import os
13
+ from pytube import YouTube
14
+
15
# Command-line interface: video URL and speaker label, plus an optional clip window.
parser = argparse.ArgumentParser(description='Extract audio from a YouTube video')
parser.add_argument('url', type=str, help='the YouTube video URL')
parser.add_argument('person', type=str, help='the name of the person speaking')
parser.add_argument('-s', '--start-time', type=float, default=0, help='the start time in minutes for the extracted audio (default: 0)')
# No default is set here: when -d is omitted the value is None, and
# extract_audio then leaves out ffmpeg's -t option entirely.
parser.add_argument('-d', '--duration', type=int, help='the duration in seconds for the extracted audio (default: until the end of the video)')
21
+
22
+
23
def _format_ffmpeg_time(seconds: float) -> str:
    """Format a non-negative number of seconds as ``HH:MM:SS.mmm``."""
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"


# 200 seconds seems to be max duration for single clips
def extract_audio(url: str, label: str, start_minute: float = 0, duration: int = 200):
    """Download a YouTube video and extract part of its audio to ``<label>.wav``.

    Args:
        url: The YouTube video URL.
        label: Base name (without extension) of the output file, which is
            written next to the downloaded video.
        start_minute: Start of the clip in minutes; truncated to whole seconds.
        duration: Clip length in seconds, or ``None`` to omit ffmpeg's ``-t``
            option.

    Returns:
        Path of the extracted ``.wav`` file.

    Raises:
        ValueError: If the video exposes no stream to download.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Download the YouTube video
    youtube_object = YouTube(url)
    stream = youtube_object.streams.first()
    if stream is None:
        raise ValueError(f"No downloadable stream found for {url}")
    video_path = Path(stream.download(skip_existing=True))

    # Set the output path using the audio file name
    output_path = video_path.parent / f"{label}.wav"

    # Build the ffmpeg command; times use the HH:MM:SS.mmm format
    start_time_seconds = int(start_minute * 60)
    cmd = ['ffmpeg', '-y', '-i', str(video_path),
           '-ss', _format_ffmpeg_time(start_time_seconds)]
    if duration is not None:
        cmd += ['-t', _format_ffmpeg_time(duration)]
    cmd += ['-q:a', '0', '-map', 'a', str(output_path)]

    try:
        # check=True surfaces ffmpeg failures instead of returning a path
        # to a file that was never written.
        subprocess.run(cmd, check=True)
    finally:
        # Remove only the .3gpp file downloaded by this call, not every
        # .3gpp file that happens to live in the same directory.
        if video_path.suffix == ".3gpp" and video_path.exists():
            video_path.unlink()

    return output_path
57
+
58
if __name__ == '__main__':
    # Read the command-line options and run one extraction with them.
    cli_args = parser.parse_args()
    extract_audio(
        url=cli_args.url,
        label=cli_args.person,
        start_minute=cli_args.start_time,
        duration=cli_args.duration,
    )
src/src/utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import logging
3
+
4
+ log = logging.getLogger(__name__)
5
+
6
# Decorator to time a function
def timeit(func):
    """Wrap ``func`` so that each call logs its wall-clock duration.

    The wrapper forwards all positional and keyword arguments, returns the
    wrapped function's result unchanged, and logs the duration at INFO level
    in yellow. ``functools.wraps`` keeps the wrapped function's name and
    docstring on the wrapper.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)
    def timed(*args, **kwargs):
        time_start = time.time()
        result = func(*args, **kwargs)
        # ANSI escape codes: yellow text, then reset.
        _yellow = "\x1b[33;20m"
        _reset = "\x1b[0m"
        _msg = f"{_yellow}{func.__name__} duration: {time.time() - time_start:.2f} seconds{_reset}"
        log.info(_msg)
        return result
    return timed