Update app.py
app.py CHANGED
@@ -125,18 +125,21 @@ class WhisperxModel:
         return self.align_model.align(segments, audio_path)
 
 @spaces.GPU
-def load_models(whisper_backend_name,
+def load_models(whisper_backend_name, ssrspeech_model_name):
     global transcribe_model, align_model, ssrspeech_model
 
-
+    alignment_model_name = "whisperX"
     if ssrspeech_model_name == "English":
         ssrspeech_model_name = "English"
         text_tokenizer = TextTokenizer(backend="espeak")
         language = "en"
+        whisper_model_name = "base.en"
+
     elif ssrspeech_model_name == "Mandarin":
         ssrspeech_model_name = "Mandarin"
         text_tokenizer = TextTokenizer(backend="espeak", language='cmn')
         language = "zh"
+        whisper_model_name = "base"
 
     if alignment_model_name is not None:
         align_model = WhisperxAlignModel(language)
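For reference, the per-language defaults that load_models now hard-codes can be summarized in one small mapping. The dict below is illustrative only and is not part of app.py; the values are copied from the hunk above, and the key names are made up.

    # Illustrative only -- not part of app.py. Per-language defaults from load_models().
    MODEL_CHOICES = {
        "English":  {"whisper_model": "base.en", "espeak_language": None,  "align_language": "en"},
        "Mandarin": {"whisper_model": "base",    "espeak_language": "cmn", "align_language": "zh"},
    }
    print(MODEL_CHOICES["Mandarin"]["whisper_model"])  # -> base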
@@ -212,12 +215,8 @@ def align(segments, audio_path):
 
     segments = align_model.align(segments, audio_path)
     state = get_transcribe_state(segments)
-    success_message = "<span style='color:green;'>Success: Alignment completed successfully!</span>"
 
-    return [
-        state["transcript_with_start_time"], state["transcript_with_end_time"],
-        state, success_message
-    ]
+    return state
 
 
 def get_output_audio(audio_tensors, codec_audio_sr):
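align() now returns the alignment state directly, and later code in this commit indexes it as nested segments and words. A minimal, hypothetical example of that shape (field names follow the whisperX-style output used above; the words and timestamps are invented):

    # Hypothetical example of the state consumed by run() below -- not real output.
    transcribe_state = {
        "segments": [
            {"words": [
                {"word": "Gwynplaine", "start": 0.32, "end": 0.91},
                {"word": "had", "start": 0.95, "end": 1.10},
            ]}
        ]
    }
    first_word = transcribe_state["segments"][0]["words"][0]
    print(first_word["word"], first_word["start"], first_word["end"])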
@@ -239,141 +238,166 @@ def replace_numbers_with_words(sentence):
 
 @spaces.GPU
 def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_k, top_p, temperature,
-        stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef,
-        audio_path,
-        mode, selected_sentence, previous_audio_tensors):
+        stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef, prompt_length,
+        audio_path, original_transcript, transcript, mode):
 
-    global ssrspeech_model
+    global transcribe_model, align_model, ssrspeech_model
     aug_text = True if aug_text == 1 else False
     if ssrspeech_model is None:
         raise gr.Error("ssrspeech model not loaded")
 
+    seed_everything(seed)
+
+    if ssrspeech_model_choice == "English":
+        language = "en"
+    elif ssrspeech_model_choice == "Mandarin":
+        language = "zh"
+
     # resample audio
     audio, _ = librosa.load(audio_path, sr=16000)
     sf.write(audio_path, audio, 16000)
-
-            if current_span[1] >= next_span[0] - threshold:
-                current_span[1] = max(current_span[1], next_span[1])
-            else:
-                combined_spans.append(current_span)
-                current_span = next_span
-        combined_spans.append(current_span)
-        return combined_spans
-
-            ssrspeech_model["text_tokenizer"],
-            ssrspeech_model["audio_tokenizer"],
-            audio_path, original_transcript, sentence, mask_interval,
-            cfg_coef, aug_text, False, True, False,
-            device, decode_config
-        )
-    else:
-        orig_spans = parse_tts_en(original_transcript, sentence) if ssrspeech_model_choice == 'English' else parse_tts_zh(original_transcript, sentence)
-        print("orig_spans: ", orig_spans)
-
-        starting_intervals = []
-        ending_intervals = []
-        for orig_span in orig_spans:
-            start, end = get_mask_interval(transcribe_state, orig_span)
-            starting_intervals.append(start)
-            ending_intervals.append(end)
-
-        mask_interval = torch.LongTensor(mask_interval)  # [M,2], M==1 for now
-        print("mask_interval: ", mask_interval)
-        gen_audio = inference_one_sample(
-            ssrspeech_model["model"],
-            ssrspeech_model["config"],
-            ssrspeech_model["phn2num"],
-            ssrspeech_model["text_tokenizer"],
-            ssrspeech_model["audio_tokenizer"],
-            audio_path, original_transcript, sentence, mask_interval,
-            cfg_coef, aug_text, False, True, True,
-            device, decode_config
-        )
-
-        gen_audio = gen_audio[0].cpu()
-        audio_tensors.append(gen_audio)
-
-        if mode != "Rerun":
-            output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-            sentences = [f"{idx}: {text}" for idx, text in enumerate(sentences)]
-            component = gr.Dropdown(choices=sentences, value=sentences[0])
-            return output_audio, inference_transcript, component, audio_tensors
+
+    # text normalization
+    target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
+    orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
+
+    orig_transcript, segments = transcribe(audio_path)
+    if language == 'zh':
+        converter = opencc.OpenCC('t2s')
+        orig_transcript = converter.convert(orig_transcript)
+        transcribe_state = align(traditional_to_simplified(segments), audio_path)
+        transcribe_state['segments'] = traditional_to_simplified(transcribe_state['segments'])
+    elif language == 'en':
+        orig_transcript = orig_transcript.lower()
+        target_transcript = target_transcript.lower()
+        transcribe_state = align(segments, audio_path)
+    print(orig_transcript)
+    print(target_transcript)
+
+    if mode == "TTS":
+        info = torchaudio.info(audio_path)
+        duration = info.num_frames / info.sample_rate
+        cut_length = duration
+        # Cut long audio for tts
+        if duration > prompt_length:
+            seg_num = len(transcribe_state['segments'])
+            for i in range(seg_num):
+                words = transcribe_state['segments'][i]['words']
+                for item in words:
+                    if item['end'] >= prompt_length:
+                        cut_length = min(item['end'], cut_length)
+
+        audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
+        sf.write(audio_path, audio, 16000)
+        orig_transcript, segments = transcribe(audio_path)
+
+        if language == 'zh':
+            converter = opencc.OpenCC('t2s')
+            orig_transcript = converter.convert(orig_transcript)
+            transcribe_state = align(traditional_to_simplified(segments), audio_path)
+            transcribe_state['segments'] = traditional_to_simplified(transcribe_state['segments'])
+        elif language == 'en':
+            orig_transcript = orig_transcript.lower()
+            target_transcript = target_transcript.lower()
+            transcribe_state = align(segments, audio_path)
+        print(orig_transcript)
+        target_transcript_copy = target_transcript  # for tts cut out
+        if language == 'en':
+            target_transcript_copy = target_transcript_copy.split(' ')[0]
+        elif language == 'zh':
+            target_transcript_copy = target_transcript_copy[0]
+        target_transcript = orig_transcript + ' ' + target_transcript if language == 'en' else orig_transcript + target_transcript
+        print(target_transcript)
+
+    if mode == "Edit":
+        operations, orig_spans = parse_edit_en(orig_transcript, target_transcript) if language == 'en' else parse_edit_zh(orig_transcript, target_transcript)
+        print(operations)
+        print("orig_spans: ", orig_spans)
+
+        if len(orig_spans) > 3:
+            raise gr.Error("Current model only supports maximum 3 editings")
+
+        starting_intervals = []
+        ending_intervals = []
+        for orig_span in orig_spans:
+            start, end = get_mask_interval(transcribe_state, orig_span)
+            starting_intervals.append(start)
+            ending_intervals.append(end)
+
+        print("intervals: ", starting_intervals, ending_intervals)
+
+        info = torchaudio.info(audio_path)
+        audio_dur = info.num_frames / info.sample_rate
+
+        def combine_spans(spans, threshold=0.2):
+            spans.sort(key=lambda x: x[0])
+            combined_spans = []
+            current_span = spans[0]
+
+            for i in range(1, len(spans)):
+                next_span = spans[i]
+                if current_span[1] >= next_span[0] - threshold:
+                    current_span[1] = max(current_span[1], next_span[1])
+                else:
+                    combined_spans.append(current_span)
+                    current_span = next_span
+            combined_spans.append(current_span)
+            return combined_spans
+
+        morphed_span = [[max(start - sub_amount, 0), min(end + sub_amount, audio_dur)]
+                        for start, end in zip(starting_intervals, ending_intervals)]  # in seconds
+        morphed_span = combine_spans(morphed_span, threshold=0.2)
+        print("morphed_spans: ", morphed_span)
+        mask_interval = [[round(span[0]*codec_sr), round(span[1]*codec_sr)] for span in morphed_span]
+        mask_interval = torch.LongTensor(mask_interval)  # [M,2], M==1 for now
     else:
-
-
+        info = torchaudio.info(audio_path)
+        audio_dur = info.num_frames / info.sample_rate
+
+        morphed_span = [(audio_dur, audio_dur)]  # in seconds
+        mask_interval = [[round(span[0]*codec_sr), round(span[1]*codec_sr)] for span in morphed_span]
+        mask_interval = torch.LongTensor(mask_interval)  # [M,2], M==1 for now
+        print("mask_interval: ", mask_interval)
+
+    decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr}
+
+    tts = True if mode == "TTS" else False
+    new_audio = inference_one_sample(
+        ssrspeech_model["model"],
+        ssrspeech_model["config"],
+        ssrspeech_model["phn2num"],
+        ssrspeech_model["text_tokenizer"],
+        ssrspeech_model["audio_tokenizer"],
+        audio_path, orig_transcript, target_transcript, mask_interval,
+        cfg_coef, aug_text, False, True, tts,
+        device, decode_config
+    )
+    audio_tensors = []
+    # save segments for comparison
+    new_audio = new_audio[0].cpu()
+    torchaudio.save(audio_path, new_audio, codec_audio_sr)
+    if tts:  # remove the start parts
+        new_transcript, new_segments = transcribe(audio_path)
+        if language == 'zh':
+            transcribe_state = align(traditional_to_simplified(new_segments), audio_path)
+            transcribe_state['segments'] = traditional_to_simplified(transcribe_state['segments'])
+            tmp1 = transcribe_state['segments'][0]['words'][0]['word']
+            tmp2 = target_transcript_copy
+        elif language == 'en':
+            transcribe_state = align(new_segments, audio_path)
+            tmp1 = transcribe_state['segments'][0]['words'][0]['word'].lower()
+            tmp2 = target_transcript_copy.lower()
+        if tmp1 == tmp2:
+            offset = transcribe_state['segments'][0]['words'][0]['start']
+        else:
+            offset = transcribe_state['segments'][0]['words'][1]['start']
+
+        new_audio, _ = torchaudio.load(audio_path, frame_offset=int(offset*codec_audio_sr))
+    audio_tensors.append(new_audio)
+    output_audio = get_output_audio(audio_tensors, codec_audio_sr)
+
+    success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
+    return output_audio, success_message
 
 
 demo_original_transcript = "Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather."
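The edit branch above merges nearby word-level spans and then converts them to codec-frame indices. The following is a self-contained sketch of that logic; the helper and variable names mirror the hunk, while the sample spans, duration, and printed result are made up for illustration.

    import torch

    def combine_spans(spans, threshold=0.2):
        # Merge spans whose gap is smaller than `threshold` seconds, as in the hunk above.
        spans.sort(key=lambda x: x[0])
        combined_spans = []
        current_span = spans[0]
        for next_span in spans[1:]:
            if current_span[1] >= next_span[0] - threshold:
                current_span[1] = max(current_span[1], next_span[1])
            else:
                combined_spans.append(current_span)
                current_span = next_span
        combined_spans.append(current_span)
        return combined_spans

    # Made-up word-level edit spans (seconds) plus the UI defaults from this commit.
    starting_intervals = [1.30, 1.55, 4.00]
    ending_intervals = [1.50, 2.10, 4.40]
    sub_amount = 0.12   # margin added on both sides of each edited span
    audio_dur = 6.0     # total audio duration in seconds
    codec_sr = 50       # codec frames per second (encodec-specific)

    morphed_span = [[max(s - sub_amount, 0), min(e + sub_amount, audio_dur)]
                    for s, e in zip(starting_intervals, ending_intervals)]
    morphed_span = combine_spans(morphed_span, threshold=0.2)
    mask_interval = torch.LongTensor([[round(s * codec_sr), round(e * codec_sr)]
                                      for s, e in morphed_span])
    print(mask_interval)  # rows are [start_frame, end_frame]: [[59, 111], [194, 226]]

The first two spans overlap once the 0.12 s margin is applied, so they collapse into a single mask interval; the third stays separate.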
@@ -405,23 +429,14 @@ def get_app():
             ssrspeech_model_choice = gr.Radio(label="ssrspeech model", value="English",
                                               choices=["English", "Mandarin"])
             whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["whisperX", "whisper"])
-            whisper_model_choice = gr.Radio(label="Whisper model", value="base.en",
-                                            choices=[None, "base.en", "small.en", "medium.en", "large"])
-            align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
 
         with gr.Row():
             with gr.Column(scale=2):
                 input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
                 with gr.Group():
                     original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
-                                                     info="Use whisperx model to get the transcript.
-                    with gr.Accordion("Word start time", open=False):
-                        transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
-                    with gr.Accordion("Word end time", open=False):
-                        transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
-
+                                                     info="Use whisperx model to get the transcript.")
                     transcribe_btn = gr.Button(value="Transcribe")
-                    align_btn = gr.Button(value="Align")
 
             with gr.Column(scale=3):
                 with gr.Group():
@@ -437,11 +452,6 @@ def get_app():
                 with gr.Accordion("Inference transcript", open=False):
                     inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
                                                       info="Inference was performed on this transcript.")
-                with gr.Group(visible=False) as long_tts_sentence_editor:
-                    sentence_selector = gr.Dropdown(label="Sentence", value=None,
-                                                    info="Select sentence you want to regenerate")
-                    sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
-                    rerun_btn = gr.Button(value="Rerun")
 
         with gr.Row():
             with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
@@ -453,57 +463,38 @@ def get_app():
                 aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                     info="set to 1 to use cfg")
                 cfg_coef = gr.Number(label="cfg_coef", value=1.5,
-                                     info="cfg guidance scale, 1.5 is a good value")
-
+                                     info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                prompt_length = gr.Number(label="prompt_length", value=3,
+                                          info="used for tts prompt, will automatically cut the prompt audio to this length")
+                sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
                 top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
-                temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not
+                temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not change")
                 top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
-                codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific,
-                codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific,
+                codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, do not change')
+                codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, do not change')
                 silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
 
         success_output = gr.HTML()
-        audio_tensors = gr.State()
-        transcribe_state = gr.State(value={"words_info": demo_words_info})
 
         load_models_btn.click(fn=load_models,
-                              inputs=[whisper_backend_choice,
+                              inputs=[whisper_backend_choice, ssrspeech_model_choice],
                               outputs=[models_selector, success_output])
 
-
+        semgents = None # not used
        transcribe_btn.click(fn=transcribe,
-                             inputs=[
-                             outputs=[original_transcript,
-
-                             inputs=[seed, original_transcript, input_audio],
-                             outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
-
+                             inputs=[input_audio],
+                             outputs=[original_transcript, semgents, success_output])
+
         run_btn.click(fn=run,
                       inputs=[
                           seed, sub_amount, ssrspeech_model_choice,
                           codec_audio_sr, codec_sr,
-                          top_k, top_p, temperature,
-
-
-
-                          mode, sentence_selector, audio_tensors
+                          top_k, top_p, temperature, stop_repetition, kvcache, silence_tokens,
+                          aug_text, cfg_coef, prompt_length,
+                          input_audio, original_transcript, transcript,
+                          mode
                       ],
-                      outputs=[output_audio,
-
-        sentence_selector.change(fn=load_sentence,
-                                 inputs=[sentence_selector, codec_audio_sr, audio_tensors],
-                                 outputs=[sentence_audio])
-        rerun_btn.click(fn=run,
-                        inputs=[
-                            seed, sub_amount, ssrspeech_model_choice,
-                            codec_audio_sr, codec_sr,
-                            top_k, top_p, temperature,
-                            stop_repetition,
-                            kvcache, silence_tokens, aug_text, cfg_coef,
-                            input_audio, transcribe_state, original_transcript, transcript,
-                            gr.State(value="Rerun"), sentence_selector, audio_tensors
-                        ],
-                        outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
+                      outputs=[output_audio, success_output])
 
     return app
 
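In the TTS path introduced by this commit, the prompt audio is cut at the first aligned word boundary that ends at or after prompt_length seconds before generation. Below is a standalone sketch of that step; the function name and the trim-in-place behaviour are assumptions for illustration, while the core loop mirrors the run() hunk above.

    import librosa
    import soundfile as sf
    import torchaudio

    def cut_prompt(audio_path, transcribe_state, prompt_length):
        # Illustrative sketch, not part of app.py: trim the prompt at the first
        # aligned word that ends at or after `prompt_length` seconds.
        info = torchaudio.info(audio_path)
        duration = info.num_frames / info.sample_rate
        cut_length = duration
        if duration > prompt_length:
            for segment in transcribe_state['segments']:
                for word in segment['words']:
                    if word['end'] >= prompt_length:
                        cut_length = min(word['end'], cut_length)
        audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
        sf.write(audio_path, audio, 16000)
        return cut_length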