mrfakename
committed on
Sync from GitHub repo
Browse files · This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
- inference-cli.py +5 -4
inference-cli.py
CHANGED
@@ -118,7 +118,7 @@ if args.load_vocoder_from_local:
|
|
118 |
vocos.load_state_dict(state_dict)
|
119 |
vocos.eval()
|
120 |
else:
|
121 |
-
print("
|
122 |
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
|
123 |
|
124 |
print(f"Using {device} device")
|
@@ -323,7 +323,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, model,ckpt_file,file_voca
|
|
323 |
return final_wave, combined_spectrogram
|
324 |
|
325 |
def process_voice(ref_audio_orig, ref_text):
|
326 |
-
print("Converting
|
327 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
328 |
aseg = AudioSegment.from_file(ref_audio_orig)
|
329 |
|
@@ -361,7 +361,6 @@ def process_voice(ref_audio_orig, ref_text):
|
|
361 |
return ref_audio, ref_text
|
362 |
|
363 |
def infer(ref_audio, ref_text, gen_text, model,ckpt_file,file_vocab, remove_silence, cross_fade_duration=0.15):
|
364 |
-
print(gen_text)
|
365 |
# Add the functionality to ensure it ends with ". "
|
366 |
if not ref_text.endswith(". ") and not ref_text.endswith("。"):
|
367 |
if ref_text.endswith("."):
|
@@ -373,7 +372,6 @@ def infer(ref_audio, ref_text, gen_text, model,ckpt_file,file_vocab, remove_sile
|
|
373 |
audio, sr = torchaudio.load(ref_audio)
|
374 |
max_chars = int(len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
|
375 |
gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
|
376 |
-
print('ref_text', ref_text)
|
377 |
for i, gen_text in enumerate(gen_text_batches):
|
378 |
print(f'gen_text {i}', gen_text)
|
379 |
|
@@ -390,6 +388,9 @@ def process(ref_audio, ref_text, text_gen, model,ckpt_file,file_vocab, remove_si
|
|
390 |
voices["main"] = main_voice
|
391 |
for voice in voices:
|
392 |
voices[voice]['ref_audio'], voices[voice]['ref_text'] = process_voice(voices[voice]['ref_audio'], voices[voice]['ref_text'])
|
|
|
|
|
|
|
393 |
|
394 |
generated_audio_segments = []
|
395 |
reg1 = r'(?=\[\w+\])'
|
|
|
118 |
vocos.load_state_dict(state_dict)
|
119 |
vocos.eval()
|
120 |
else:
|
121 |
+
print("Download Vocos from huggingface charactr/vocos-mel-24khz")
|
122 |
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
|
123 |
|
124 |
print(f"Using {device} device")
|
|
|
323 |
return final_wave, combined_spectrogram
|
324 |
|
325 |
def process_voice(ref_audio_orig, ref_text):
|
326 |
+
print("Converting", ref_audio_orig)
|
327 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
328 |
aseg = AudioSegment.from_file(ref_audio_orig)
|
329 |
|
|
|
361 |
return ref_audio, ref_text
|
362 |
|
363 |
def infer(ref_audio, ref_text, gen_text, model,ckpt_file,file_vocab, remove_silence, cross_fade_duration=0.15):
|
|
|
364 |
# Add the functionality to ensure it ends with ". "
|
365 |
if not ref_text.endswith(". ") and not ref_text.endswith("。"):
|
366 |
if ref_text.endswith("."):
|
|
|
372 |
audio, sr = torchaudio.load(ref_audio)
|
373 |
max_chars = int(len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
|
374 |
gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
|
|
|
375 |
for i, gen_text in enumerate(gen_text_batches):
|
376 |
print(f'gen_text {i}', gen_text)
|
377 |
|
|
|
388 |
voices["main"] = main_voice
|
389 |
for voice in voices:
|
390 |
voices[voice]['ref_audio'], voices[voice]['ref_text'] = process_voice(voices[voice]['ref_audio'], voices[voice]['ref_text'])
|
391 |
+
print("Voice:", voice)
|
392 |
+
print("Ref_audio:", voices[voice]['ref_audio'])
|
393 |
+
print("Ref_text:", voices[voice]['ref_text'])
|
394 |
|
395 |
generated_audio_segments = []
|
396 |
reg1 = r'(?=\[\w+\])'
|