import os
from pathlib import Path

import numpy as np
import soundfile

import msinference

# Reference recordings of the foreign accents; each file name (minus '.wav')
# is later passed to tts_entry() as the voice id.
# build by mimic3 USE assets/wavs/style_vector/ for native
foreign_accents_en = sorted(os.listdir('assets/wavs/mimic3_foreign/'))


def _style_wav_stem(voice):
    '''Turn a mimic-3 style voice id into the stem of its style .wav file.

    e.g. 'en_US/vctk_low#p276' -> 'en_US_vctk_p276'
    '''
    return voice.replace('/', '_').replace('#', '_').replace(
        'cmu-arctic', 'cmu_arctic').replace('_low', '')


def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='af_ZA/google-nwu_1919',  # 'serbian', # 'en_US/vctk_low#p276', 'isl', 'abi',
              speed=1.4,  # only for non-english
              affect=True  # False = high clarity for partially sight
              ):
    '''Synthesise `text` with the requested voice and return the waveform.

    The voice string selects one of three paths:

      * contains 'en_US/' or 'en_UK/' -> StyleTTS2 with a style vector from
        assets/wavs/style_vector/ (sub-folder 'v2/' when `affect` is False),
        e.g. 'en_US/vctk_low#p276'
        English voices -> https://audeering.github.io/shift/
      * otherwise contains '_'        -> StyleTTS2 with a style vector from
        assets/wavs/mimic3_foreign_4x/ (English text, FOREIGN ACCENTS)
      * anything else                 -> msinference.foreign() with `voice`
        as the language, e.g. 'deu'
        language codes -> https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv

    Args:
        text: sentence(s) to speak.
        voice: voice id or language, see above.
        speed: forwarded to msinference.foreign() only; unused for the two
            StyleTTS2 paths.
        affect: picks the style-vector folder for en_US / en_UK voices.

    Returns:
        np.array holding the speech (24 kHz according to the original
        docstring), scaled in place so that its peak magnitude is ~1.
    '''
    # StyleTTS2 - English
    if ('en_US/' in voice) or ('en_UK/' in voice):
        # mimic-3 format of voice (English text - English accent)
        a = '' if affect else 'v2/'
        style_vector = msinference.compute_style(
            'assets/wavs/style_vector/' + a + _style_wav_stem(voice) + '.wav')
        x = msinference.inference(text,
                                  style_vector,
                                  alpha=0.3,
                                  beta=0.7,
                                  diffusion_steps=7,
                                  embedding_scale=1)

    elif '_' in voice:
        # mimic-3 format of voice (English text - Foreign accent).
        # Original author's note: MMS-TTS language names have no '_', while
        # every file in assets/wavs/mimic3_foreign_4x has one.
        style_vector = msinference.compute_style(
            'assets/wavs/mimic3_foreign_4x/' + _style_wav_stem(voice) + '.wav')
        x = msinference.inference(text,
                                  style_vector,
                                  alpha=0.3,
                                  beta=0.7,
                                  diffusion_steps=7,
                                  embedding_scale=1)

    else:
        # Fallback - MMS TTS - Non-English
        # dont split foreign sentences: Avoids re-load of VITS & random
        # speaker change issue
        x = msinference.foreign(text=text,
                                lang=voice,  # voice = 'romanian', 'serbian' 'hungarian'
                                speed=speed)  # normalisation externally

    # volume: amplify speech to full [-1,1]; the epsilon avoids dividing by
    # zero on an all-silent signal.
    x /= np.abs(x).max() + 1e-7

    print(x.shape, 'TTS OK')
    return x


# When True, skip synthesis and only rebuild the table from the .wav files
# already present in foreign_accents_en_dir/.
ONLY_PARSE_TABLE = False  # also generate .wavs if not already uploaded to issue and links are taken

if not ONLY_PARSE_TABLE:

    Path('foreign_accents_en_dir/').mkdir(parents=True, exist_ok=True)

    for v in foreign_accents_en:
        # Only '.wav' names map to a voice id; slicing four characters off
        # any other entry would produce a bogus voice.
        if not v.endswith('.wav'):
            continue
        _str = v[:-4]  # strip .wav
        print(_str)
        try:
            # check the output folder afterwards to count all phonemized voices
            soundfile.write('foreign_accents_en_dir/' + v,
                            tts_entry(voice=_str),
                            24000)
        except RuntimeError:
            # Raised from F.conv2d(input, weight, bias, self.stride, ...)
            # when the ENGLISH letters are not phonemisable for this
            # ACCENT/LANG:
            #   RuntimeError: Calculated padded input size per channel:
            #   (5 x 4). Kernel size: (5 x 5). Kernel size can't be greater
            #   than actual input size
            print('SKIP', _str)

# --------------------------------------------------- Uploads in hf issue
# Positional placeholders: entry j belongs to the j-th file of the sorted
# foreign_accents_en_dir/ listing. Replace '' with the link of the upload.
upload_wav = [
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '',
]

_pieces = []
for j, voice in enumerate(sorted(os.listdir('foreign_accents_en_dir/'))):
    # A voice without a placeholder still gets a row (empty link) instead of
    # being silently dropped; surplus placeholders are ignored.
    _f = upload_wav[j] if j < len(upload_wav) else ''
    _pieces.append('\n\n\n\n\n')
    # NOTE(review): in the file as received this literal had no f-prefix (so
    # {j} / {voice[:-4]} / {_f} were written out verbatim) and was broken
    # across physical lines. It looks like table markup may have been lost
    # from it - confirm the intended row template. Only the visible text is
    # kept here, with the placeholders actually interpolated.
    _pieces.append(
        f'\n\n\nvoice TTS\n\n\n{j}\n\n\n\n {voice[:-4]} \n\n\n\n {_f} \n\n\n')
tmp = ''.join(_pieces)

with open('table_for_issue_accents_upload.txt', 'w', encoding='utf-8') as f:
    f.write(tmp)