Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -456,101 +456,101 @@ def get_app():
|
|
456 |
choices=[None, "base.en", "small.en", "medium.en", "large"])
|
457 |
align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
|
458 |
|
459 |
-
with gr.Row():
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
with gr.Row():
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
success_output = gr.HTML()
|
513 |
-
audio_tensors = gr.State()
|
514 |
-
transcribe_state = gr.State(value={"words_info": demo_words_info})
|
515 |
-
|
516 |
-
load_models_btn.click(fn=load_models,
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
transcribe_btn.click(fn=transcribe,
|
522 |
-
|
523 |
-
|
524 |
-
align_btn.click(fn=align,
|
525 |
-
|
526 |
-
|
527 |
-
|
528 |
-
run_btn.click(fn=run,
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
-
|
540 |
-
sentence_selector.change(fn=load_sentence,
|
541 |
-
|
542 |
-
|
543 |
-
rerun_btn.click(fn=run,
|
544 |
-
|
545 |
-
|
546 |
-
|
547 |
-
|
548 |
-
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
|
555 |
return app
|
556 |
|
|
|
456 |
choices=[None, "base.en", "small.en", "medium.en", "large"])
|
457 |
align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
|
458 |
|
459 |
+
# with gr.Row():
|
460 |
+
# with gr.Column(scale=2):
|
461 |
+
# input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
|
462 |
+
# with gr.Group():
|
463 |
+
# original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
|
464 |
+
# info="Use whisperx model to get the transcript. Fix and align it if necessary.")
|
465 |
+
# with gr.Accordion("Word start time", open=False):
|
466 |
+
# transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
|
467 |
+
# with gr.Accordion("Word end time", open=False):
|
468 |
+
# transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
|
469 |
+
|
470 |
+
# transcribe_btn = gr.Button(value="Transcribe")
|
471 |
+
# align_btn = gr.Button(value="Align")
|
472 |
+
|
473 |
+
# with gr.Column(scale=3):
|
474 |
+
# with gr.Group():
|
475 |
+
# transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
|
476 |
+
|
477 |
+
# with gr.Row():
|
478 |
+
# mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
|
479 |
+
|
480 |
+
# run_btn = gr.Button(value="Run")
|
481 |
+
|
482 |
+
# with gr.Column(scale=2):
|
483 |
+
# output_audio = gr.Audio(label="Output Audio")
|
484 |
+
# with gr.Accordion("Inference transcript", open=False):
|
485 |
+
# inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
|
486 |
+
# info="Inference was performed on this transcript.")
|
487 |
+
# with gr.Group(visible=False) as long_tts_sentence_editor:
|
488 |
+
# sentence_selector = gr.Dropdown(label="Sentence", value=None,
|
489 |
+
# info="Select sentence you want to regenerate")
|
490 |
+
# sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
|
491 |
+
# rerun_btn = gr.Button(value="Rerun")
|
492 |
+
|
493 |
+
# with gr.Row():
|
494 |
+
# with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
|
495 |
+
# stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
|
496 |
+
# info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
|
497 |
+
# seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
|
498 |
+
# kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
|
499 |
+
# info="set to 0 to use less VRAM, but with slower inference")
|
500 |
+
# aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
|
501 |
+
# info="set to 1 to use cfg")
|
502 |
+
# cfg_coef = gr.Number(label="cfg_coef", value=1.5,
|
503 |
+
# info="cfg guidance scale, 1.5 is a good value")
|
504 |
+
# sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment")
|
505 |
+
# top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
|
506 |
+
# temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
|
507 |
+
# top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
|
508 |
+
# codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
|
509 |
+
# codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
|
510 |
+
# silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
|
511 |
+
|
512 |
+
# success_output = gr.HTML()
|
513 |
+
# audio_tensors = gr.State()
|
514 |
+
# transcribe_state = gr.State(value={"words_info": demo_words_info})
|
515 |
+
|
516 |
+
# load_models_btn.click(fn=load_models,
|
517 |
+
# inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
|
518 |
+
# outputs=[models_selector])
|
519 |
+
|
520 |
+
|
521 |
+
# transcribe_btn.click(fn=transcribe,
|
522 |
+
# inputs=[seed, input_audio],
|
523 |
+
# outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
|
524 |
+
# align_btn.click(fn=align,
|
525 |
+
# inputs=[seed, original_transcript, input_audio],
|
526 |
+
# outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
|
527 |
+
|
528 |
+
# run_btn.click(fn=run,
|
529 |
+
# inputs=[
|
530 |
+
# seed, sub_amount, ssrspeech_model_choice,
|
531 |
+
# codec_audio_sr, codec_sr,
|
532 |
+
# top_k, top_p, temperature,
|
533 |
+
# stop_repetition,
|
534 |
+
# kvcache, silence_tokens, aug_text, cfg_coef,
|
535 |
+
# input_audio, transcribe_state, original_transcript, transcript,
|
536 |
+
# mode, sentence_selector, audio_tensors
|
537 |
+
# ],
|
538 |
+
# outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
|
539 |
+
|
540 |
+
# sentence_selector.change(fn=load_sentence,
|
541 |
+
# inputs=[sentence_selector, codec_audio_sr, audio_tensors],
|
542 |
+
# outputs=[sentence_audio])
|
543 |
+
# rerun_btn.click(fn=run,
|
544 |
+
# inputs=[
|
545 |
+
# seed, sub_amount, ssrspeech_model_choice,
|
546 |
+
# codec_audio_sr, codec_sr,
|
547 |
+
# top_k, top_p, temperature,
|
548 |
+
# stop_repetition,
|
549 |
+
# kvcache, silence_tokens, aug_text, cfg_coef,
|
550 |
+
# input_audio, transcribe_state, original_transcript, transcript,
|
551 |
+
# gr.State(value="Rerun"), sentence_selector, audio_tensors
|
552 |
+
# ],
|
553 |
+
# outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
|
554 |
|
555 |
return app
|
556 |
|