archivartaunik committed on
Commit
666d1ff
·
verified ·
1 Parent(s): c43cb5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -59
app.py CHANGED
@@ -1,92 +1,109 @@
1
  import spaces
2
  import gradio as gr
3
  import torch
4
- from huggingface_hub import hf_hub_download
5
  import os
6
  import sys
7
  import tempfile
 
8
  from scipy.io.wavfile import write
9
-
10
- # Clone the repository if it has not been downloaded yet
11
- if not os.path.exists("XTTSv2-Finetuning-for-New-Languages"):
12
- os.system("git clone https://github.com/hellcatmon/XTTSv2-Finetuning-for-New-Languages.git")
13
-
14
- # Move the TTS folder into the main directory
15
- if os.path.exists("XTTSv2-Finetuning-for-New-Languages/TTS"):
16
- os.system("mv XTTSv2-Finetuning-for-New-Languages/TTS ./")
17
-
18
- # Add the TTS folder to the module search path (sys.path)
19
- sys.path.append("./TTS")
20
  from tqdm import tqdm
21
  from underthesea import sent_tokenize
22
- from TTS.tts.configs.xtts_config import XttsConfig
23
- from TTS.tts.models.xtts import Xtts
24
 
25
- # Location of the model on Hugging Face
26
- repo_id = "archivartaunik/BE_XTTS_V2_60epoch3Dataset"
27
- checkpoint_file = hf_hub_download(repo_id, filename="model.pth")
28
- config_file = hf_hub_download(repo_id, filename="config.json")
29
- vocab_file = hf_hub_download(repo_id, filename="vocab.json")
30
- default_voice_file = hf_hub_download(repo_id, filename="voice.wav")
 
 
 
 
31
 
32
- # Load the model configuration
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  config = XttsConfig()
34
  config.load_json(config_file)
35
-
36
- # Initialise the model and load its checkpoint
37
  XTTS_MODEL = Xtts.init_from_config(config)
38
  XTTS_MODEL.load_checkpoint(config, checkpoint_path=checkpoint_file, vocab_path=vocab_file, use_deepspeed=False)
 
 
 
39
 
40
  @spaces.GPU(duration=60)
41
- def text_to_speech(belarusian_story, lang="be", speaker_audio_file=None):
42
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
43
- XTTS_MODEL.to(device)
44
- # If no file is provided, fall back to the default voice
45
  if not speaker_audio_file or (not isinstance(speaker_audio_file, str) and speaker_audio_file.name == ""):
46
- speaker_audio_file = default_voice_file
47
-
48
- # Compute the GPT conditioning latent and the speaker embedding from the reference audio
49
- gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
50
- audio_path=speaker_audio_file,
51
- gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
52
- max_ref_length=XTTS_MODEL.config.max_ref_len,
53
- sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
54
- )
 
 
55
 
56
- # Split the text into individual sentences
57
- tts_texts = sent_tokenize(belarusian_story)
 
 
58
 
59
- # Generate audio for each sentence
60
- wav_chunks = []
61
  for text in tqdm(tts_texts):
62
- wav_chunk = XTTS_MODEL.inference(
63
- text=text,
64
- language=lang,
65
- gpt_cond_latent=gpt_cond_latent,
66
- speaker_embedding=speaker_embedding,
67
- temperature=0.1,
68
- length_penalty=1.0,
69
- repetition_penalty=10.0,
70
- top_k=10,
71
- top_p=0.3,
72
- )
73
- wav_chunks.append(torch.tensor(wav_chunk["wav"]))
74
-
75
- # Concatenate all audio chunks into a single array
76
- out_wav = torch.cat(wav_chunks, dim=0).squeeze().cpu().numpy()
 
 
 
 
 
 
 
 
77
 
78
- # Save the audio to a temporary file
79
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
80
- write(temp_file.name, 24000, out_wav)
81
 
82
  return temp_file.name
83
 
84
-
85
  demo = gr.Interface(
86
  fn=text_to_speech,
87
  inputs=[
88
  gr.Textbox(lines=5, label="Тэкст на беларускай мове"),
89
- gr.Textbox(value="be", label="Мова (па змаўчанні BE)", visible=False),
90
  gr.Audio(type="filepath", label="Запішыце або загрузіце файл голасу (без іншых гукаў) не карацей 7 секунд", interactive=True),
91
  ],
92
  outputs="audio",
 
1
  import spaces
2
  import gradio as gr
3
  import torch
4
+ from huggingface_hub import hf_hub_download
5
  import os
6
  import sys
7
  import tempfile
8
+ from pathlib import Path
9
  from scipy.io.wavfile import write
10
+ import numpy as np
 
 
 
 
 
 
 
 
 
 
11
  from tqdm import tqdm
12
  from underthesea import sent_tokenize
 
 
13
 
14
+ try:  # use the TTS package directly when it is already importable
15
+ from TTS.tts.configs.xtts_config import XttsConfig
16
+ from TTS.tts.models.xtts import Xtts
17
+ except ImportError:  # otherwise clone the repo, move its TTS folder here and retry the imports
18
+ os.system("git clone https://github.com/hellcatmon/XTTSv2-Finetuning-for-New-Languages.git")
19
+ if os.path.exists("XTTSv2-Finetuning-for-New-Languages/TTS"):
20
+ os.system("mv XTTSv2-Finetuning-for-New-Languages/TTS ./")
21
+ sys.path.append("./TTS")
22
+ from TTS.tts.configs.xtts_config import XttsConfig
23
+ from TTS.tts.models.xtts import Xtts
24
 
25
+ # File paths
26
+ repo_id = "archivartaunik/BE_XTTS_V2_60epoch3Dataset"
27
+ model_path = Path("./model")
28
+ model_path.mkdir(exist_ok=True)
29
+ checkpoint_file = model_path / "model.pth"
30
+ config_file = model_path / "config.json"
31
+ vocab_file = model_path / "vocab.json"
32
+ default_voice_file = model_path / "voice.wav"
33
+
34
+ if not checkpoint_file.exists():
35
+ hf_hub_download(repo_id, filename="model.pth", local_dir=model_path)
36
+ if not config_file.exists():
37
+ hf_hub_download(repo_id, filename="config.json", local_dir=model_path)
38
+ if not vocab_file.exists():
39
+ hf_hub_download(repo_id, filename="vocab.json", local_dir=model_path)
40
+ if not default_voice_file.exists():
41
+ hf_hub_download(repo_id, filename="voice.wav", local_dir=model_path)
42
+
43
+ # Load the configuration and the model once
44
  config = XttsConfig()
45
  config.load_json(config_file)
 
 
46
  XTTS_MODEL = Xtts.init_from_config(config)
47
  XTTS_MODEL.load_checkpoint(config, checkpoint_path=checkpoint_file, vocab_path=vocab_file, use_deepspeed=False)
48
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
49
+ XTTS_MODEL.to(device)
50
+ sampling_rate = XTTS_MODEL.config.audio["sample_rate"]
51
 
52
  @spaces.GPU(duration=60)
53
+ def text_to_speech(belarusian_story, speaker_audio_file=None): # The lang argument was removed
 
 
 
54
  if not speaker_audio_file or (not isinstance(speaker_audio_file, str) and speaker_audio_file.name == ""):
55
+ speaker_audio_file = str(default_voice_file)
56
+
57
+ try:
58
+ gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
59
+ audio_path=speaker_audio_file,
60
+ gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
61
+ max_ref_length=XTTS_MODEL.config.max_ref_len,
62
+ sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
63
+ )
64
+ except Exception as e:  # NOTE(review): error paths return a message string while the Interface output is "audio" - confirm this is intended
65
+ return f"Error getting conditioning latents: {e}"
66
 
67
+ try:
68
+ tts_texts = sent_tokenize(belarusian_story)
69
+ except Exception as e:
70
+ return f"Error tokenizing text: {e}"
71
 
72
+ all_wavs = []
 
73
  for text in tqdm(tts_texts):
74
+ try:
75
+ with torch.no_grad():
76
+ wav_chunk = XTTS_MODEL.inference(
77
+ text=text,
78
+ language="be", # Language is fixed to "be"
79
+ gpt_cond_latent=gpt_cond_latent,
80
+ speaker_embedding=speaker_embedding,
81
+ temperature=0.1,
82
+ length_penalty=1.0,
83
+ repetition_penalty=10.0,
84
+ top_k=10,
85
+ top_p=0.3,
86
+ )
87
+ all_wavs.append(wav_chunk["wav"])
88
+ except Exception as e:
89
+ return f"Error generating audio: {e}"
90
+
91
+ try:
92
+ out_wav = np.concatenate(all_wavs)
93
+ except ValueError:
94
+ return "Немагчыма згенерыраваць аўдыё. Праверце ўваходны тэкст і аўдыёфайл."
95
+ except Exception as e:
96
+ return f"Error concatenating audio: {e}"
97
 
 
98
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")  # NOTE(review): delete=False and no cleanup here - the file persists after the call
99
+ write(temp_file.name, sampling_rate, out_wav)
100
 
101
  return temp_file.name
102
 
 
103
  demo = gr.Interface(
104
  fn=text_to_speech,
105
  inputs=[
106
  gr.Textbox(lines=5, label="Тэкст на беларускай мове"),
 
107
  gr.Audio(type="filepath", label="Запішыце або загрузіце файл голасу (без іншых гукаў) не карацей 7 секунд", interactive=True),
108
  ],
109
  outputs="audio",