Spaces:

archivartaunik
/

Bextts

Running on Zero

App Files Community

archivartaunik committed on Dec 28, 2024

Commit

c6fbbbc

verified ·

1 Parent(s): ea2768e

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -79

app.py CHANGED Viewed

@@ -1,69 +1,57 @@
-# Імпартуем патрэбныя модулі
-import gradio as gr
-import torch
-from huggingface_hub import hf_hub_download
 import os
-import sys
-import tempfile
-from scipy.io.wavfile import write
 from tqdm import tqdm
-import nltk
-from nltk.tokenize import sent_tokenize
-import warnings
-import logging
-# Наладжваем лагаванне
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Ігнаруем папярэджанні NVML
-warnings.filterwarnings("ignore", category=UserWarning, message="Can't initialize NVML")
-# Загружаем NLTK даныя
-nltk.download('punkt')
-# Клонуем рэпазіторый, калі ён яшчэ не загружаны
-if not os.path.exists("XTTSv2-Finetuning-for-New-Languages"):
-    logger.info("Кланаванне рэпазіторыя...")
-    os.system("git clone https://github.com/hellcatmon/XTTSv2-Finetuning-for-New-Languages.git")
-# Перамяшчаем тэчку TTS у асноўную дырэкторыю
-if os.path.exists("XTTSv2-Finetuning-for-New-Languages/TTS"):
-    logger.info("Перамяшчэнне тэчкі TTS...")
-    os.system("mv XTTSv2-Finetuning-for-New-Languages/TTS ./")
-# Дадаем тэчку TTS у PYTHONPATH
-sys.path.append("./TTS")
-from TTS.tts.configs.xtts_config import XttsConfig
-from TTS.tts.models.xtts import Xtts
-# Вызначэнне прылады (наўпрост CPU)
-device = "cpu"
-logger.info(f"Выбраная прылада: {device}")
-# Шлях да мадэлі ў Hugging Face
-repo_id = "archivartaunik/BE_XTTS_V2_60epoch3Dataset"
-checkpoint_file = hf_hub_download(repo_id, filename="model.pth")
-config_file = hf_hub_download(repo_id, filename="config.json")
-vocab_file = hf_hub_download(repo_id, filename="vocab.json")
-default_voice_file = hf_hub_download(repo_id, filename="voice.wav")
-# Загрузка канфігурацыі мадэлі
 config = XttsConfig()
-config.load_json(config_file)
-# Ініцыялізацыя і загрузка мадэлі без weights_only
 XTTS_MODEL = Xtts.init_from_config(config)
-XTTS_MODEL.load_checkpoint(config, checkpoint_path=checkpoint_file, vocab_path=vocab_file, use_deepspeed=False)
 XTTS_MODEL.to(device)
-def text_to_speech(belarusian_story, lang="be", speaker_audio_file=None):
-    # Калі файл не пададзены, выкарыстоўваем голас па змаўчанні
-    if not speaker_audio_file or (not isinstance(speaker_audio_file, str) and speaker_audio_file.name == ""):
-        speaker_audio_file = default_voice_file
-    logger.info("Атрыманне латэнтных умоў і эмацый...")
-    # Атрыманне латэнтных умоў і эмацый
     gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
         audio_path=speaker_audio_file,
         gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
@@ -71,12 +59,8 @@ def text_to_speech(belarusian_story, lang="be", speaker_audio_file=None):
         sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
     )
-    logger.info("Токенізацыя тэксту на асобныя сказы...")
-    # Токенізацыя тэксту на асобныя сказы
-    tts_texts = sent_tokenize(belarusian_story)
-    logger.info("Генерацыя аўдыё для кожнага сказы...")
-    # Генерацыя аўдыё для кожнага сказы
     wav_chunks = []
     for text in tqdm(tts_texts):
         wav_chunk = XTTS_MODEL.inference(
@@ -92,30 +76,30 @@ def text_to_speech(belarusian_story, lang="be", speaker_audio_file=None):
         )
         wav_chunks.append(torch.tensor(wav_chunk["wav"]))
-    logger.info("Аб'яднанне аўдыё частак у адзін масіў...")
-    # Аб'ядноўваем усе часткі аўдыё ў адзін масіў
-    out_wav = torch.cat(wav_chunks, dim=0).squeeze().cpu().numpy()
-    logger.info("Захаванне аўдыё ў часовы файл...")
-    # Захоўваем аўдыё ў часовы файл з памяншанай частатой
-    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    write(temp_file.name, 16000, out_wav)  # Выкарыстоўвайце 16000 Гц
-    logger.info("Вяртанне шляху да аўдыё файла.")
-    return temp_file.name
-# Стварэнне Gradio Інтэрфейсу
-demo = gr.Interface(
-    fn=text_to_speech,
-    inputs=[
-        gr.Textbox(lines=5, label="Тэкст на беларускай мове"),
-        gr.Textbox(value="be", label="Мова (па змаўчанні BE)", visible=False),
-        gr.Audio(type="filepath", label="Запішыце або загрузіце файл голасу (без іншых гукаў) не карацей 7 секунд", interactive=True),
-    ],
-    outputs="audio",
-    title="XTTS Belarusian TTS Demo",
-    description="Увядзіце тэкст, і мадэль пераўтворыць яго ў аўдыя. Вы можаце выкарыстоўваць голас па змаўчанні, загрузіць уласны файл або запісаць аўдыё.",
-)
 # Launch the app
 if __name__ == "__main__":

 import os
+import shutil
+import torch
+import torchaudio
+import gradio as gr
 from tqdm import tqdm
+from huggingface_hub import snapshot_download
+from underthesea import sent_tokenize
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+# Define repo and model paths
+repo_id = "archivartaunik/BE_XTTS_V2_60epoch3Dataset"
+destination_dir = "checkpoints/XTTS_v2.0_original_model_files/"
+# Download model files
+print("Downloading model files from Hugging Face...")
+local_repo_path = snapshot_download(repo_id)
+os.makedirs(destination_dir, exist_ok=True)
+print("Copying model files...")
+for root, _, files in os.walk(local_repo_path):
+    for file in files:
+        source_file = os.path.join(root, file)
+        relative_path = os.path.relpath(source_file, local_repo_path)
+        destination_file = os.path.join(destination_dir, relative_path)
+        os.makedirs(os.path.dirname(destination_file), exist_ok=True)
+        shutil.copy2(source_file, destination_file)
+print(f"Model files are saved in {destination_dir}.")
+# Initialize device
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# Load model
+print("Loading the model...")
+xtts_checkpoint = os.path.join(destination_dir, "model.pth")
+xtts_config = os.path.join(destination_dir, "config.json")
+xtts_vocab = os.path.join(destination_dir, "vocab.json")
 config = XttsConfig()
+config.load_json(xtts_config)
 XTTS_MODEL = Xtts.init_from_config(config)
+XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
 XTTS_MODEL.to(device)
+print("Model loaded successfully!")
+# Function for inference
+def tts_inference(belarusian_text):
+    lang = "be"
+    speaker_audio_file = os.path.join(destination_dir, "voice.wav")
     gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
         audio_path=speaker_audio_file,
         gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
         sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
     )
+    tts_texts = sent_tokenize(belarusian_text)
     wav_chunks = []
     for text in tqdm(tts_texts):
         wav_chunk = XTTS_MODEL.inference(
         )
         wav_chunks.append(torch.tensor(wav_chunk["wav"]))
+    out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()
+    # Save the generated audio
+    output_path = "output.wav"
+    torchaudio.save(output_path, out_wav, sample_rate=24000)
+    return output_path
+# Create Gradio app
+def gradio_app():
+    with gr.Blocks() as app:
+        gr.Markdown("# Belarusian TTS Inference App")
+        text_input = gr.Textbox(label="Enter Belarusian Text", placeholder="Быў раз...")
+        audio_output = gr.Audio(label="Generated Speech")
+        generate_button = gr.Button("Generate Speech")
+        generate_button.click(
+            fn=tts_inference,
+            inputs=text_input,
+            outputs=audio_output,
+        )
+    return app
 # Launch the app
 if __name__ == "__main__":