"""Gradio demo for the TorToiSe text-to-speech model.

The script installs tortoise-tts at start-up, loads the model once, and
exposes three ways of providing a reference voice: a built-in preset,
uploaded audio files, or a microphone recording.
"""

import os
import subprocess
import sys
import tempfile
from typing import List, Optional, Tuple

import gradio as gr
import numpy as np

# Setup and installation.
# NOTE: these run at import time and are deliberately best-effort (return
# codes are ignored): `git clone` fails harmlessly when the checkout already
# exists. The tortoise imports below depend on this step having succeeded.
os.system("git clone https://github.com/neonbjb/tortoise-tts.git")
sys.path.append("./tortoise-tts/")
os.system("pip install -r ./tortoise-tts/requirements.txt")
os.system("python ./tortoise-tts/setup.py install")

import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice

# Download and instantiate model
tts = TextToSpeech()

# Display parameters
VOICES = [
    "random",
    "train_atkins",
    "train_daws",
    "train_dotrice",
    "train_dreams",
    "train_empire",
    "train_grace",
    "train_kennard",
    "train_lescault",
    "train_mouse",
    "angie",
    "applejack",
    "daniel",
    "deniro",
    "emma",
    "freeman",
    "geralt",
    "halle",
    "jlaw",
    "lj",
    "mol",
    "myself",
    "pat",
    "pat2",
    "rainbow",
    "snakes",
    "tim_reynolds",
    "tom",
    "weaver",
    "william",
]
DEFAULT_VOICE = "random"
PRESETS = ["ultra_fast", "fast", "standard", "high_quality"]
DEFAULT_PRESET = "fast"
DEFAULT_TEXT = "Hello, world!"

README = """# TorToiSe

Tortoise is a text-to-speech model developed by James Betker. It is capable of zero-shot voice cloning from a small set of voice samples. GitHub repo: [neonbjb/tortoise-tts](https://github.com/neonbjb/tortoise-tts).

## Usage

1. Select a model preset and type the text to speak.
2. Load a voice - either by choosing a preset, uploading audio files, or recording via microphone. Select the option to split audio into chunks if the clips are much longer than 10 seconds each. Follow the guidelines in the [voice customization guide](https://github.com/neonbjb/tortoise-tts#voice-customization-guide).
3. Click **Generate**, and wait - it's called *tortoise* for a reason!
"""

# Sample rate used when loading reference audio / when saving generated audio.
TORTOISE_SR_IN = 22050
TORTOISE_SR_OUT = 24000


def chunk_audio(
    t: torch.Tensor, sample_rate: int, chunk_duration_sec: int
) -> List[torch.Tensor]:
    """Split a (channels, samples) tensor into consecutive fixed-length chunks.

    Args:
        t: Audio tensor indexed as ``t[:, sample]``.
        sample_rate: Samples per second of ``t``.
        chunk_duration_sec: Length of each chunk in seconds.

    Returns:
        The chunks in order. Every chunk except possibly the last holds
        ``sample_rate * chunk_duration_sec`` samples; no empty chunk is
        returned (an empty input yields an empty list).

    Raises:
        ValueError: If the resulting chunk length is not positive.
    """
    chunk_len = sample_rate * chunk_duration_sec
    if chunk_len <= 0:
        raise ValueError("sample_rate * chunk_duration_sec must be positive")
    # Stepping over integer start offsets avoids the float rounding of a
    # duration-based chunk count and never produces 0-width chunks.
    return [
        t[:, start : start + chunk_len]
        for start in range(0, t.shape[1], chunk_len)
    ]


def tts_main(
    voice_samples: Optional[List[torch.Tensor]], text: str, model_preset: str
) -> str:
    """Synthesize ``text`` with the global model and save it as a WAV file.

    Args:
        voice_samples: Reference clips passed through to
            ``tts.tts_with_preset`` (``None`` is forwarded unchanged).
        text: The text to speak.
        model_preset: One of the preset names accepted by tortoise
            (see ``PRESETS``).

    Returns:
        The path of the written file, always ``"generated.wav"``; each call
        overwrites the previous output.
    """
    gen = tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=None,
        preset=model_preset,
    )
    # Drop the leading dimension so torchaudio receives a 2-D tensor
    # (presumably (channels, samples) - TODO confirm against tortoise output).
    torchaudio.save("generated.wav", gen.squeeze(0).cpu(), TORTOISE_SR_OUT)
    return "generated.wav"


def tts_from_preset(voice: str, text: str, model_preset: str) -> str:
    """Generate speech using one of the voices bundled with tortoise.

    Args:
        voice: Voice name as understood by ``load_voice`` (see ``VOICES``).
        text: The text to speak.
        model_preset: Quality/speed preset name.

    Returns:
        Path to the generated WAV file.
    """
    voice_samples, _ = load_voice(voice)
    return tts_main(voice_samples, text, model_preset)


def tts_from_files(
    files: List[tempfile._TemporaryFileWrapper],
    do_chunk: bool,
    text: str,
    model_preset: str,
) -> str:
    """Generate speech using uploaded audio files as the reference voice.

    Args:
        files: Uploaded files; each is read from its ``.name`` path.
        do_chunk: When true, split every clip into 10-second chunks.
        text: The text to speak.
        model_preset: Quality/speed preset name.

    Returns:
        Path to the generated WAV file.

    Raises:
        ValueError: If no files were uploaded.
    """
    if not files:
        raise ValueError("No audio files were uploaded.")
    voice_samples = [load_audio(f.name, TORTOISE_SR_IN) for f in files]
    if do_chunk:
        voice_samples = [
            chunk
            for clip in voice_samples
            for chunk in chunk_audio(clip, TORTOISE_SR_IN, 10)
        ]
    return tts_main(voice_samples, text, model_preset)


def tts_from_recording(
    recording: Tuple[int, np.ndarray],
    do_chunk: bool,
    text: str,
    model_preset: str,
) -> str:
    """Generate speech using a microphone recording as the reference voice.

    Args:
        recording: ``(sample_rate, samples)`` pair. ``samples`` is either
            1-D (mono) or 2-D with one column per channel.
        do_chunk: When true, split the recording into 10-second chunks.
        text: The text to speak.
        model_preset: Quality/speed preset name.

    Returns:
        Path to the generated WAV file.

    Raises:
        ValueError: If nothing was recorded.
    """
    if recording is None:
        raise ValueError("No recording provided; record some audio first.")
    sample_rate, audio = recording

    # normalize- https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/utils/audio.py#L16
    norm_fix = 1
    if audio.dtype == np.int32:
        norm_fix = 2**31
    elif audio.dtype == np.int16:
        norm_fix = 2**15
    audio = torch.as_tensor(audio, dtype=torch.float32) / norm_fix

    # Always end up with a (1, samples) tensor. A mono recording arrives as a
    # 1-D array and previously kept that shape, which broke chunk_audio
    # (it indexes shape[1]) and handed a 1-D sample to the model.
    if audio.dim() == 1:
        audio = audio.unsqueeze(0)
    else:
        # (samples, channels) -> average the channels to convert to mono
        audio = audio.T.mean(dim=0, keepdim=True)

    audio = torchaudio.transforms.Resample(sample_rate, TORTOISE_SR_IN)(audio)
    if do_chunk:
        voice_samples = chunk_audio(audio, TORTOISE_SR_IN, 10)
    else:
        voice_samples = [audio]
    return tts_main(voice_samples, text, model_preset)


def tts_from_url(
    audio_url: str,
    start_time: int,
    end_time: int,
    do_chunk: bool,
    text: str,
    model_preset: str,
) -> str:
    """Generate speech using audio downloaded from a URL as the reference voice.

    The audio is fetched with ``yt-dlp`` into ``audio.mp3`` in the working
    directory (overwriting any previous download) and trimmed to
    ``[start_time, end_time)`` seconds.

    Args:
        audio_url: URL handed to ``yt-dlp``.
        start_time: Start of the excerpt, in seconds.
        end_time: End of the excerpt, in seconds.
        do_chunk: When true, split the excerpt into 10-second chunks.
        text: The text to speak.
        model_preset: Quality/speed preset name.

    Returns:
        Path to the generated WAV file.

    Raises:
        subprocess.CalledProcessError: If ``yt-dlp`` exits with an error.
    """
    # The URL is user input: pass it as a single argv element (no shell) and
    # after "--" so it can neither inject shell commands nor be parsed as an
    # option. check=True stops us from silently reusing a stale audio.mp3.
    subprocess.run(
        [
            "yt-dlp",
            "-x",
            "--audio-format",
            "mp3",
            "--force-overwrites",
            "-o",
            "audio.mp3",
            "--",
            audio_url,
        ],
        check=True,
    )
    audio = load_audio("audio.mp3", TORTOISE_SR_IN)
    # Slice bounds must be integers even if the UI delivers floats.
    first_sample = int(start_time * TORTOISE_SR_IN)
    last_sample = int(end_time * TORTOISE_SR_IN)
    audio = audio[:, first_sample:last_sample]
    if do_chunk:
        voice_samples = chunk_audio(audio, TORTOISE_SR_IN, 10)
    else:
        voice_samples = [audio]
    return tts_main(voice_samples, text, model_preset)


with gr.Blocks() as demo:
    gr.Markdown(README)

    # Inputs shared by every tab.
    preset = gr.Dropdown(PRESETS, label="Model preset", value=DEFAULT_PRESET)
    text = gr.Textbox(label="Text to speak", value=DEFAULT_TEXT)
    do_chunk_label = "Split audio into chunks? (for audio much longer than 10 seconds.)"
    do_chunk_default = True

    with gr.Tab("Choose preset voice"):
        inp1 = gr.Dropdown(VOICES, value=DEFAULT_VOICE, label="Preset voice")
        btn1 = gr.Button("Generate")

    with gr.Tab("Upload audio"):
        inp2 = gr.File(file_count="multiple")
        do_chunk2 = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
        btn2 = gr.Button("Generate")

    with gr.Tab("Record audio"):
        inp3 = gr.Audio(source="microphone")
        do_chunk3 = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
        btn3 = gr.Button("Generate")

    # The YouTube tab is currently disabled; kept for reference.
    # with gr.Tab("From YouTube"):
    #     inp4 = gr.Textbox(label="URL")
    #     do_chunk4 = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
    #     start_time = gr.Number(label="Start time (seconds)", precision=0)
    #     end_time = gr.Number(label="End time (seconds)", precision=0)
    #     btn4 = gr.Button("Generate")

    audio_out = gr.Audio()

    btn1.click(
        tts_from_preset,
        [inp1, text, preset],
        [audio_out],
    )
    btn2.click(
        tts_from_files,
        [inp2, do_chunk2, text, preset],
        [audio_out],
    )
    btn3.click(
        tts_from_recording,
        [inp3, do_chunk3, text, preset],
        [audio_out],
    )
    # btn4.click(
    #     tts_from_url,
    #     [inp4, start_time, end_time, do_chunk4, text, preset],
    #     [audio_out],
    # )

demo.launch()