File size: 2,949 Bytes
fe62fb4 bdb4f02 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import numpy as np
import soundfile
import msinference
def _style_wav_path(folder, voice):
    '''Return the reference .wav path inside `folder` for a mimic-3 style voice id.

    The voice id is flattened into a file stem: '/' and '#' become '_',
    'cmu-arctic' becomes 'cmu_arctic' and every '_low' is dropped,
    e.g. 'en_US/vctk_low#p276' -> folder + 'en_US_vctk_p276.wav'.
    '''
    stem = (voice.replace('/', '_')
                 .replace('#', '_')
                 .replace('cmu-arctic', 'cmu_arctic')
                 .replace('_low', ''))
    return folder + stem + '.wav'


def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='af_ZA/google-nwu_1919',  # 'serbian', # 'en_US/vctk_low#p276', 'isl', 'abi',
              speed=1.4,  # only for non-english
              affect=True  # False = high clarity for partially sight
              ):
    '''Synthesise `text` with the requested voice and return peak-normalised audio.

    Routing on `voice`:
      * contains 'en_US/' or 'en_UK/' -> StyleTTS2, English voice, e.g.
        'en_US/vctk_low#p276' (English voices -> https://audeering.github.io/shift/).
        The style wav is read from 'assets/wavs/style_vector/' when `affect`
        is True, otherwise from its 'v2/' sub-folder.
      * otherwise contains both '_' and '/' -> StyleTTS2, English text with a
        foreign accent; style wav read from 'assets/wavs/mimic3_foreign_4x/'.
      * anything else -> MMS TTS fallback via msinference.foreign, with `voice`
        passed as the language, e.g. 'deu' (LHS code ->
        https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv).

    Args:
        text: Text to speak.
        voice: Voice id or language code, see routing above.
        speed: Forwarded only to the MMS fallback; ignored by the StyleTTS2 paths.
        affect: Only used for 'en_US/' / 'en_UK/' voices; False selects the
            'v2/' style vectors (high clarity for partially sighted listeners).

    Returns:
        The waveform scaled in place so that its peak magnitude is ~1.
        Per the original docstring this is a 24 kHz np.array
        (NOTE(review): sample rate and type come from msinference - confirm there).
    '''
    # Pick the folder holding the style reference wav; None selects the MMS fallback.
    if ('en_US/' in voice) or ('en_UK/' in voice):
        # mimic-3 format of voice (English text - English accent)
        style_dir = 'assets/wavs/style_vector/' + ('' if affect else 'v2/')
    elif '_' in voice and '/' in voice:
        # mimic-3 format of voice (English text - Foreign accent)
        style_dir = 'assets/wavs/mimic3_foreign_4x/'
    else:
        style_dir = None

    if style_dir is None:
        # Fallback - MMS TTS - Non-English.
        # The text is passed whole (not split into sentences): this avoids
        # re-loading VITS and the random speaker change issue.
        x = msinference.foreign(text=text,
                                lang=voice,  # voice = 'romanian', 'serbian' 'hungarian'
                                speed=speed)  # normalisation externally
    else:
        # StyleTTS2 - both English branches share the same inference settings.
        style_vector = msinference.compute_style(_style_wav_path(style_dir, voice))
        x = msinference.inference(text,
                                  style_vector,
                                  alpha=0.3,
                                  beta=0.7,
                                  diffusion_steps=7,
                                  embedding_scale=1)

    # volume: amplify speech to full [-1,1]. An empty waveform has no peak and
    # .max() would raise ValueError on it, so it is returned untouched.
    if np.size(x):
        x /= np.abs(x).max() + 1e-7  # epsilon guards against division by zero on silence
    print(x.shape, 'TTS OK')
    return x
# Demo: synthesise the default sentence with the default voice and save it
# to 'demo.wav' with a 24000 Hz sample rate.
soundfile.write(file='demo.wav', data=tts_entry(), samplerate=24000)
|