filter if len(TTS) < 10 samples
Browse files- api.py +7 -0
- assets/audiobook_TTS.docx +2 -2
- msinference.py +9 -6
api.py
CHANGED
@@ -164,6 +164,13 @@ def tts_multi_sentence(precomputed_style_vector=None,
|
|
164 |
if precomputed_style_vector is not None:
|
165 |
x = []
|
166 |
for _sentence in text:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
x.append(msinference.inference(_sentence,
|
168 |
precomputed_style_vector)
|
169 |
)
|
|
|
164 |
if precomputed_style_vector is not None:
|
165 |
x = []
|
166 |
for _sentence in text:
|
167 |
+
|
168 |
+
# StyleTTS2 - pronunciation fix
|
169 |
+
|
170 |
+
_sentence = _sentence.lower() # .replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
|
171 |
+
if 'vctk_low#p326' in voice:
|
172 |
+
# fix the sleepy, drawn-out sounding of "abstract" (AAABS TRAACT) for this voice
|
173 |
+
_sentence = _sentence.replace('abstract', 'ahbstract') # 'ahstract'
|
174 |
x.append(msinference.inference(_sentence,
|
175 |
precomputed_style_vector)
|
176 |
)
|
assets/audiobook_TTS.docx
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7499070a3e0b743e102cf6181b22544ba4febd6fc57f12757187b9c85554502f
|
3 |
+
size 205578
|
msinference.py
CHANGED
@@ -49,7 +49,7 @@ textclenaer = TextCleaner()
|
|
49 |
|
50 |
|
51 |
to_mel = torchaudio.transforms.MelSpectrogram(
|
52 |
-
n_mels=80, n_fft=2048, win_length=1200, hop_length=
|
53 |
mean, std = -4, 4
|
54 |
|
55 |
# START UTIL
|
@@ -162,7 +162,6 @@ def inference(text,
|
|
162 |
use_gruut=False):
|
163 |
# Ignore .,; AT end of sentence; or just [-50:]
|
164 |
|
165 |
-
|
166 |
text = text.strip()
|
167 |
|
168 |
ps = global_phonemizer.phonemize([text])
|
@@ -240,10 +239,14 @@ def inference(text,
|
|
240 |
x = model.decoder(asr,
|
241 |
F0_pred, N_pred, ref.squeeze().unsqueeze(0))
|
242 |
|
243 |
-
x = x.
|
244 |
-
|
245 |
-
x /= np.abs(x).max() + 1e-7
|
246 |
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
return x
|
248 |
|
249 |
|
@@ -434,7 +437,7 @@ def foreign(text=None, # list of text
|
|
434 |
uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
|
435 |
_t = text_mapper.uromanize(_t, uroman_pl)
|
436 |
|
437 |
-
_t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
|
438 |
|
439 |
_t = text_mapper.filter_oov(_t, lang=lang)
|
440 |
|
|
|
49 |
|
50 |
|
51 |
to_mel = torchaudio.transforms.MelSpectrogram(
|
52 |
+
n_mels=80, n_fft=2048, win_length=1200, hop_length=328)
|
53 |
mean, std = -4, 4
|
54 |
|
55 |
# START UTIL
|
|
|
162 |
use_gruut=False):
|
163 |
# Ignore .,; AT end of sentence; or just [-50:]
|
164 |
|
|
|
165 |
text = text.strip()
|
166 |
|
167 |
ps = global_phonemizer.phonemize([text])
|
|
|
239 |
x = model.decoder(asr,
|
240 |
F0_pred, N_pred, ref.squeeze().unsqueeze(0))
|
241 |
|
242 |
+
x = x.cpu().numpy()[0, 0, :-400] # weird pulse at the end of sentences
|
|
|
|
|
243 |
|
244 |
+
print(x.shape,' A')
|
245 |
+
if x.shape[0] > 10:
|
246 |
+
x /= np.abs(x).max() + 1e-7
|
247 |
+
else:
|
248 |
+
print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n\nn', x.shape)
|
249 |
+
x = np.zeros(0)
|
250 |
return x
|
251 |
|
252 |
|
|
|
437 |
uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
|
438 |
_t = text_mapper.uromanize(_t, uroman_pl)
|
439 |
|
440 |
+
_t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u') # Parse STTS2 pronunciation on tts_mult()
|
441 |
|
442 |
_t = text_mapper.filter_oov(_t, lang=lang)
|
443 |
|