Dionyssos committed on
Commit
64e63ea
·
1 Parent(s): 989ad3f

filter if len(TTS) < 10 samples

Browse files
Files changed (3) hide show
  1. api.py +7 -0
  2. assets/audiobook_TTS.docx +2 -2
  3. msinference.py +9 -6
api.py CHANGED
@@ -164,6 +164,13 @@ def tts_multi_sentence(precomputed_style_vector=None,
164
  if precomputed_style_vector is not None:
165
  x = []
166
  for _sentence in text:
 
 
 
 
 
 
 
167
  x.append(msinference.inference(_sentence,
168
  precomputed_style_vector)
169
  )
 
164
  if precomputed_style_vector is not None:
165
  x = []
166
  for _sentence in text:
167
+
168
+ # StyleTTS2 - pronunciation Fx
169
+
170
+ _sentence = _sentence.lower() # .replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
171
+ if 'vctk_low#p326' in voice:
172
+ # fix sounding of sleepy AAABS TRAACT
173
+ _sentence = _sentence.replace('abstract', 'ahbstract') # 'ahstract'
174
  x.append(msinference.inference(_sentence,
175
  precomputed_style_vector)
176
  )
assets/audiobook_TTS.docx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bbdcb8fe14e0713954e3fa49dc53eaca041c2ac6cfa6de098e892f5a7da38c27
3
- size 221307
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7499070a3e0b743e102cf6181b22544ba4febd6fc57f12757187b9c85554502f
3
+ size 205578
msinference.py CHANGED
@@ -49,7 +49,7 @@ textclenaer = TextCleaner()
49
 
50
 
51
  to_mel = torchaudio.transforms.MelSpectrogram(
52
- n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
53
  mean, std = -4, 4
54
 
55
  # START UTIL
@@ -162,7 +162,6 @@ def inference(text,
162
  use_gruut=False):
163
  # Ignore .,; AT end of sentence; or just [-50:]
164
 
165
-
166
  text = text.strip()
167
 
168
  ps = global_phonemizer.phonemize([text])
@@ -240,10 +239,14 @@ def inference(text,
240
  x = model.decoder(asr,
241
  F0_pred, N_pred, ref.squeeze().unsqueeze(0))
242
 
243
- x = x.squeeze().cpu().numpy()[..., :-2504] # weird pulse at the end of sentences
244
-
245
- x /= np.abs(x).max() + 1e-7
246
 
 
 
 
 
 
 
247
  return x
248
 
249
 
@@ -434,7 +437,7 @@ def foreign(text=None, # list of text
434
  uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
435
  _t = text_mapper.uromanize(_t, uroman_pl)
436
 
437
- _t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
438
 
439
  _t = text_mapper.filter_oov(_t, lang=lang)
440
 
 
49
 
50
 
51
  to_mel = torchaudio.transforms.MelSpectrogram(
52
+ n_mels=80, n_fft=2048, win_length=1200, hop_length=328)
53
  mean, std = -4, 4
54
 
55
  # START UTIL
 
162
  use_gruut=False):
163
  # Ignore .,; AT end of sentence; or just [-50:]
164
 
 
165
  text = text.strip()
166
 
167
  ps = global_phonemizer.phonemize([text])
 
239
  x = model.decoder(asr,
240
  F0_pred, N_pred, ref.squeeze().unsqueeze(0))
241
 
242
+ x = x.cpu().numpy()[0, 0, :-400] # weird pulse at the end of sentences
 
 
243
 
244
+ print(x.shape,' A')
245
+ if x.shape[0] > 10:
246
+ x /= np.abs(x).max() + 1e-7
247
+ else:
248
+ print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n\nn', x.shape)
249
+ x = np.zeros(0)
250
  return x
251
 
252
 
 
437
  uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
438
  _t = text_mapper.uromanize(_t, uroman_pl)
439
 
440
+ _t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u') # Parse STTS2 pronounciation on tts_mult()
441
 
442
  _t = text_mapper.filter_oov(_t, lang=lang)
443