import os
import re
import stat
import uuid
import time
import torch
import gradio as gr
import torchaudio
import subprocess
import numpy as np
from zipfile import ZipFile
from io import StringIO
import csv
import datetime
import langid
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
from huggingface_hub import HfApi
# Set up environment and API
os.environ["COQUI_TOS_AGREED"] = "1"
HF_TOKEN = os.environ.get("HF_TOKEN")
api = HfApi(token=HF_TOKEN)
repo_id = "your/repo-id" # Replace with your repository ID
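# NOTE: `api` is not used in the rest of this script; presumably it is kept
# around for pushing generated artifacts to the Hub.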
# Download and set up ffmpeg
print("Export newer ffmpeg binary for denoise filter")
ZipFile("ffmpeg.zip").extractall()
print("Make ffmpeg binary executable")
st = os.stat("ffmpeg")
os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
# Load XTTS model
print("Downloading if not downloaded Coqui XTTS V2")
from TTS.utils.manage import ModelManager
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
ModelManager().download_model(model_name)
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
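# Model files typically land in the per-user data dir,
# e.g. ~/.local/share/tts on Linux.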
print("XTTS downloaded")
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=False,  # adjust based on your setup
)
# Ensure model is on CPU
model.cpu()
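# Optional (assumption: a CUDA-enabled torch build is installed): move the
# model to GPU for much faster inference.
# if torch.cuda.is_available():
#     model.cuda()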
# Function for prediction
def predict(
    prompt,
    language,
    audio_file_pth,
    mic_file_path,
    use_mic,
    voice_cleanup,
    no_lang_auto_detect,
    agree,
):
    if not agree:
        gr.Warning("Please accept the Terms & Conditions!")
        return (None, None, None, None)
    if language not in config.languages:
        gr.Warning(f"Language '{language}' is not supported. Please choose one from the dropdown.")
        return (None, None, None, None)
    # Detect the prompt's language so a mismatch with the dropdown selection
    # can be flagged before synthesis.
    language_predicted = langid.classify(prompt)[0].strip()
    if language_predicted == "zh":
        # langid reports bare "zh"; XTTS expects "zh-cn"
        language_predicted = "zh-cn"
    if not no_lang_auto_detect and language_predicted != language:
        gr.Warning(
            f"Detected language ({language_predicted}) does not match the selected language ({language}). "
            "Tick 'Disable Language Auto-Detect' to bypass this check."
        )
        return (None, None, None, None)
    if len(prompt) < 2:
        gr.Warning("Please provide a longer prompt text.")
        return (None, None, None, None)
    if len(prompt) > 200:
        gr.Warning("Text length is limited to 200 characters.")
        return (None, None, None, None)
    if use_mic:
        if mic_file_path is None:
            gr.Warning("Please record your voice with the microphone.")
            return (None, None, None, None)
        speaker_wav = mic_file_path
    else:
        speaker_wav = audio_file_pth
    if voice_cleanup:
        try:
            out_filename = f"{speaker_wav}_{uuid.uuid4()}.wav"
            # Band-limit the reference (lowpass/highpass), then trim trailing
            # silence (via the areverse pair) and leading silence.
            # Note: str.split() breaks on paths that contain spaces.
            shell_command = (
                f"./ffmpeg -y -i {speaker_wav} -af "
                "lowpass=8000,highpass=75,"
                "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
                "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02 "
                f"{out_filename}"
            ).split()
            subprocess.run(shell_command, capture_output=False, text=True, check=True)
            speaker_wav = out_filename
        except subprocess.CalledProcessError:
            print("Error filtering audio; falling back to the unfiltered reference.")
    try:
        metrics_text = ""
        t_latent = time.time()
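        # Derive the speaker's GPT conditioning latents and global embedding
        # from the reference clip; gpt_cond_len and max_ref_length are in
        # seconds of reference audio.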
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
            audio_path=speaker_wav,
            gpt_cond_len=30,
            gpt_cond_chunk_len=4,
            max_ref_length=60,
        )
        latent_calculation_time = time.time() - t_latent
        metrics_text += f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
        # Pad sentence-final punctuation so XTTS produces more natural pauses.
        prompt = re.sub(r"([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
print("Generating audio...")
t0 = time.time()
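        # repetition_penalty discourages the GPT decoder from looping;
        # temperature trades stability for variety in the sampled speech.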
        out = model.inference(
            prompt,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=5.0,
            temperature=0.75,
        )
        inference_time = time.time() - t0
        metrics_text += f"Time to generate audio: {round(inference_time * 1000)} milliseconds\n"
        real_time_factor = inference_time * 24000 / out["wav"].shape[-1]
        metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
        torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
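        # output.wav now holds mono 24 kHz audio (unsqueeze(0) added the
        # channel dimension torchaudio.save expects).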
    except RuntimeError as e:
        print(f"RuntimeError: {str(e)}")
        gr.Warning("An error occurred. Please try again.")
        return (None, None, None, None)
    return (
        gr.make_waveform(audio="output.wav"),
        "output.wav",
        metrics_text,
        speaker_wav,
    )
# Gradio interface
with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown("## XTTS Demo")
        with gr.Column():
            pass
    with gr.Row():
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                info="One or two sentences at a time. Up to 200 characters.",
                value="Hello! Try your best to upload quality audio.",
            )
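            # The 17 languages supported by XTTS v2.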
            language_gr = gr.Dropdown(
                label="Language",
                choices=[
                    "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl",
                    "cs", "ar", "zh-cn", "ja", "ko", "hu", "hi",
                ],
                value="en",
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
                type="filepath",
                value="examples/female.wav",
            )
            mic_gr = gr.Audio(
                source="microphone",
                type="filepath",
                label="Use Microphone for Reference",
            )
            use_mic_gr = gr.Checkbox(
                label="Use Microphone",
                value=False,
            )
            clean_ref_gr = gr.Checkbox(
                label="Cleanup Reference Voice",
                value=False,
            )
            auto_det_lang_gr = gr.Checkbox(
                label="Disable Language Auto-Detect",
                value=False,
            )
            tos_gr = gr.Checkbox(
                label="Agree",
                value=False,
            )
            tts_button = gr.Button("Send")
        with gr.Column():
            video_gr = gr.Video(label="Waveform Visual")
            audio_gr = gr.Audio(label="Synthesized Audio", autoplay=True)
            out_text_gr = gr.Text(label="Metrics")
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

    tts_button.click(
        predict,
        inputs=[input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
        outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
    )

demo.queue()
demo.launch(debug=True)
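# Assumption: run with `python app.py`; Gradio serves on http://127.0.0.1:7860
# by default (pass share=True to launch() for a public link).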