OpenSound committed on
Commit
11eb10f
·
1 Parent(s): e849aec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -95
app.py CHANGED
@@ -456,101 +456,101 @@ def get_app():
456
  choices=[None, "base.en", "small.en", "medium.en", "large"])
457
  align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
458
 
459
- with gr.Row():
460
- with gr.Column(scale=2):
461
- input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
462
- with gr.Group():
463
- original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
464
- info="Use whisperx model to get the transcript. Fix and align it if necessary.")
465
- with gr.Accordion("Word start time", open=False):
466
- transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
467
- with gr.Accordion("Word end time", open=False):
468
- transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
469
-
470
- transcribe_btn = gr.Button(value="Transcribe")
471
- align_btn = gr.Button(value="Align")
472
-
473
- with gr.Column(scale=3):
474
- with gr.Group():
475
- transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
476
-
477
- with gr.Row():
478
- mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
479
-
480
- run_btn = gr.Button(value="Run")
481
-
482
- with gr.Column(scale=2):
483
- output_audio = gr.Audio(label="Output Audio")
484
- with gr.Accordion("Inference transcript", open=False):
485
- inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
486
- info="Inference was performed on this transcript.")
487
- with gr.Group(visible=False) as long_tts_sentence_editor:
488
- sentence_selector = gr.Dropdown(label="Sentence", value=None,
489
- info="Select sentence you want to regenerate")
490
- sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
491
- rerun_btn = gr.Button(value="Rerun")
492
-
493
- with gr.Row():
494
- with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
495
- stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
496
- info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
497
- seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
498
- kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
499
- info="set to 0 to use less VRAM, but with slower inference")
500
- aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
501
- info="set to 1 to use cfg")
502
- cfg_coef = gr.Number(label="cfg_coef", value=1.5,
503
- info="cfg guidance scale, 1.5 is a good value")
504
- sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment")
505
- top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
506
- temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
507
- top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
508
- codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
509
- codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
510
- silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
511
-
512
- success_output = gr.HTML()
513
- audio_tensors = gr.State()
514
- transcribe_state = gr.State(value={"words_info": demo_words_info})
515
-
516
- load_models_btn.click(fn=load_models,
517
- inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
518
- outputs=[models_selector])
519
-
520
-
521
- transcribe_btn.click(fn=transcribe,
522
- inputs=[seed, input_audio],
523
- outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
524
- align_btn.click(fn=align,
525
- inputs=[seed, original_transcript, input_audio],
526
- outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
527
-
528
- run_btn.click(fn=run,
529
- inputs=[
530
- seed, sub_amount, ssrspeech_model_choice,
531
- codec_audio_sr, codec_sr,
532
- top_k, top_p, temperature,
533
- stop_repetition,
534
- kvcache, silence_tokens, aug_text, cfg_coef,
535
- input_audio, transcribe_state, original_transcript, transcript,
536
- mode, sentence_selector, audio_tensors
537
- ],
538
- outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
539
-
540
- sentence_selector.change(fn=load_sentence,
541
- inputs=[sentence_selector, codec_audio_sr, audio_tensors],
542
- outputs=[sentence_audio])
543
- rerun_btn.click(fn=run,
544
- inputs=[
545
- seed, sub_amount, ssrspeech_model_choice,
546
- codec_audio_sr, codec_sr,
547
- top_k, top_p, temperature,
548
- stop_repetition,
549
- kvcache, silence_tokens, aug_text, cfg_coef,
550
- input_audio, transcribe_state, original_transcript, transcript,
551
- gr.State(value="Rerun"), sentence_selector, audio_tensors
552
- ],
553
- outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
554
 
555
  return app
556
 
 
456
  choices=[None, "base.en", "small.en", "medium.en", "large"])
457
  align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
458
 
459
+ # with gr.Row():
460
+ # with gr.Column(scale=2):
461
+ # input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
462
+ # with gr.Group():
463
+ # original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
464
+ # info="Use whisperx model to get the transcript. Fix and align it if necessary.")
465
+ # with gr.Accordion("Word start time", open=False):
466
+ # transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
467
+ # with gr.Accordion("Word end time", open=False):
468
+ # transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
469
+
470
+ # transcribe_btn = gr.Button(value="Transcribe")
471
+ # align_btn = gr.Button(value="Align")
472
+
473
+ # with gr.Column(scale=3):
474
+ # with gr.Group():
475
+ # transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
476
+
477
+ # with gr.Row():
478
+ # mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
479
+
480
+ # run_btn = gr.Button(value="Run")
481
+
482
+ # with gr.Column(scale=2):
483
+ # output_audio = gr.Audio(label="Output Audio")
484
+ # with gr.Accordion("Inference transcript", open=False):
485
+ # inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
486
+ # info="Inference was performed on this transcript.")
487
+ # with gr.Group(visible=False) as long_tts_sentence_editor:
488
+ # sentence_selector = gr.Dropdown(label="Sentence", value=None,
489
+ # info="Select sentence you want to regenerate")
490
+ # sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
491
+ # rerun_btn = gr.Button(value="Rerun")
492
+
493
+ # with gr.Row():
494
+ # with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
495
+ # stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
496
+ # info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
497
+ # seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
498
+ # kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
499
+ # info="set to 0 to use less VRAM, but with slower inference")
500
+ # aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
501
+ # info="set to 1 to use cfg")
502
+ # cfg_coef = gr.Number(label="cfg_coef", value=1.5,
503
+ # info="cfg guidance scale, 1.5 is a good value")
504
+ # sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment")
505
+ # top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
506
+ # temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
507
+ # top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
508
+ # codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
509
+ # codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
510
+ # silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
511
+
512
+ # success_output = gr.HTML()
513
+ # audio_tensors = gr.State()
514
+ # transcribe_state = gr.State(value={"words_info": demo_words_info})
515
+
516
+ # load_models_btn.click(fn=load_models,
517
+ # inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
518
+ # outputs=[models_selector])
519
+
520
+
521
+ # transcribe_btn.click(fn=transcribe,
522
+ # inputs=[seed, input_audio],
523
+ # outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
524
+ # align_btn.click(fn=align,
525
+ # inputs=[seed, original_transcript, input_audio],
526
+ # outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
527
+
528
+ # run_btn.click(fn=run,
529
+ # inputs=[
530
+ # seed, sub_amount, ssrspeech_model_choice,
531
+ # codec_audio_sr, codec_sr,
532
+ # top_k, top_p, temperature,
533
+ # stop_repetition,
534
+ # kvcache, silence_tokens, aug_text, cfg_coef,
535
+ # input_audio, transcribe_state, original_transcript, transcript,
536
+ # mode, sentence_selector, audio_tensors
537
+ # ],
538
+ # outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
539
+
540
+ # sentence_selector.change(fn=load_sentence,
541
+ # inputs=[sentence_selector, codec_audio_sr, audio_tensors],
542
+ # outputs=[sentence_audio])
543
+ # rerun_btn.click(fn=run,
544
+ # inputs=[
545
+ # seed, sub_amount, ssrspeech_model_choice,
546
+ # codec_audio_sr, codec_sr,
547
+ # top_k, top_p, temperature,
548
+ # stop_repetition,
549
+ # kvcache, silence_tokens, aug_text, cfg_coef,
550
+ # input_audio, transcribe_state, original_transcript, transcript,
551
+ # gr.State(value="Rerun"), sentence_selector, audio_tensors
552
+ # ],
553
+ # outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
554
 
555
  return app
556