OpenSound committed on
Commit
11eb10f
·
1 Parent(s): e849aec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -95
app.py CHANGED
@@ -456,101 +456,101 @@ def get_app():
456
  choices=[None, "base.en", "small.en", "medium.en", "large"])
457
  align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
458
 
459
- with gr.Row():
460
- with gr.Column(scale=2):
461
- input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
462
- with gr.Group():
463
- original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
464
- info="Use whisperx model to get the transcript. Fix and align it if necessary.")
465
- with gr.Accordion("Word start time", open=False):
466
- transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
467
- with gr.Accordion("Word end time", open=False):
468
- transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
469
-
470
- transcribe_btn = gr.Button(value="Transcribe")
471
- align_btn = gr.Button(value="Align")
472
-
473
- with gr.Column(scale=3):
474
- with gr.Group():
475
- transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
476
-
477
- with gr.Row():
478
- mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
479
-
480
- run_btn = gr.Button(value="Run")
481
-
482
- with gr.Column(scale=2):
483
- output_audio = gr.Audio(label="Output Audio")
484
- with gr.Accordion("Inference transcript", open=False):
485
- inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
486
- info="Inference was performed on this transcript.")
487
- with gr.Group(visible=False) as long_tts_sentence_editor:
488
- sentence_selector = gr.Dropdown(label="Sentence", value=None,
489
- info="Select sentence you want to regenerate")
490
- sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
491
- rerun_btn = gr.Button(value="Rerun")
492
-
493
- with gr.Row():
494
- with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
495
- stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
496
- info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
497
- seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
498
- kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
499
- info="set to 0 to use less VRAM, but with slower inference")
500
- aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
501
- info="set to 1 to use cfg")
502
- cfg_coef = gr.Number(label="cfg_coef", value=1.5,
503
- info="cfg guidance scale, 1.5 is a good value")
504
- sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment")
505
- top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
506
- temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
507
- top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
508
- codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
509
- codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
510
- silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
511
-
512
- success_output = gr.HTML()
513
- audio_tensors = gr.State()
514
- transcribe_state = gr.State(value={"words_info": demo_words_info})
515
-
516
- load_models_btn.click(fn=load_models,
517
- inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
518
- outputs=[models_selector])
519
-
520
-
521
- transcribe_btn.click(fn=transcribe,
522
- inputs=[seed, input_audio],
523
- outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
524
- align_btn.click(fn=align,
525
- inputs=[seed, original_transcript, input_audio],
526
- outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
527
-
528
- run_btn.click(fn=run,
529
- inputs=[
530
- seed, sub_amount, ssrspeech_model_choice,
531
- codec_audio_sr, codec_sr,
532
- top_k, top_p, temperature,
533
- stop_repetition,
534
- kvcache, silence_tokens, aug_text, cfg_coef,
535
- input_audio, transcribe_state, original_transcript, transcript,
536
- mode, sentence_selector, audio_tensors
537
- ],
538
- outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
539
-
540
- sentence_selector.change(fn=load_sentence,
541
- inputs=[sentence_selector, codec_audio_sr, audio_tensors],
542
- outputs=[sentence_audio])
543
- rerun_btn.click(fn=run,
544
- inputs=[
545
- seed, sub_amount, ssrspeech_model_choice,
546
- codec_audio_sr, codec_sr,
547
- top_k, top_p, temperature,
548
- stop_repetition,
549
- kvcache, silence_tokens, aug_text, cfg_coef,
550
- input_audio, transcribe_state, original_transcript, transcript,
551
- gr.State(value="Rerun"), sentence_selector, audio_tensors
552
- ],
553
- outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
554
 
555
  return app
556
 
 
456
  choices=[None, "base.en", "small.en", "medium.en", "large"])
457
  align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
458
 
459
+ # with gr.Row():
460
+ # with gr.Column(scale=2):
461
+ # input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
462
+ # with gr.Group():
463
+ # original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
464
+ # info="Use whisperx model to get the transcript. Fix and align it if necessary.")
465
+ # with gr.Accordion("Word start time", open=False):
466
+ # transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
467
+ # with gr.Accordion("Word end time", open=False):
468
+ # transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
469
+
470
+ # transcribe_btn = gr.Button(value="Transcribe")
471
+ # align_btn = gr.Button(value="Align")
472
+
473
+ # with gr.Column(scale=3):
474
+ # with gr.Group():
475
+ # transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
476
+
477
+ # with gr.Row():
478
+ # mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
479
+
480
+ # run_btn = gr.Button(value="Run")
481
+
482
+ # with gr.Column(scale=2):
483
+ # output_audio = gr.Audio(label="Output Audio")
484
+ # with gr.Accordion("Inference transcript", open=False):
485
+ # inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
486
+ # info="Inference was performed on this transcript.")
487
+ # with gr.Group(visible=False) as long_tts_sentence_editor:
488
+ # sentence_selector = gr.Dropdown(label="Sentence", value=None,
489
+ # info="Select sentence you want to regenerate")
490
+ # sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
491
+ # rerun_btn = gr.Button(value="Rerun")
492
+
493
+ # with gr.Row():
494
+ # with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
495
+ # stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
496
+ # info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
497
+ # seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
498
+ # kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
499
+ # info="set to 0 to use less VRAM, but with slower inference")
500
+ # aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
501
+ # info="set to 1 to use cfg")
502
+ # cfg_coef = gr.Number(label="cfg_coef", value=1.5,
503
+ # info="cfg guidance scale, 1.5 is a good value")
504
+ # sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment")
505
+ # top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
506
+ # temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
507
+ # top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
508
+ # codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
509
+ # codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
510
+ # silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
511
+
512
+ # success_output = gr.HTML()
513
+ # audio_tensors = gr.State()
514
+ # transcribe_state = gr.State(value={"words_info": demo_words_info})
515
+
516
+ # load_models_btn.click(fn=load_models,
517
+ # inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
518
+ # outputs=[models_selector])
519
+
520
+
521
+ # transcribe_btn.click(fn=transcribe,
522
+ # inputs=[seed, input_audio],
523
+ # outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
524
+ # align_btn.click(fn=align,
525
+ # inputs=[seed, original_transcript, input_audio],
526
+ # outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
527
+
528
+ # run_btn.click(fn=run,
529
+ # inputs=[
530
+ # seed, sub_amount, ssrspeech_model_choice,
531
+ # codec_audio_sr, codec_sr,
532
+ # top_k, top_p, temperature,
533
+ # stop_repetition,
534
+ # kvcache, silence_tokens, aug_text, cfg_coef,
535
+ # input_audio, transcribe_state, original_transcript, transcript,
536
+ # mode, sentence_selector, audio_tensors
537
+ # ],
538
+ # outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
539
+
540
+ # sentence_selector.change(fn=load_sentence,
541
+ # inputs=[sentence_selector, codec_audio_sr, audio_tensors],
542
+ # outputs=[sentence_audio])
543
+ # rerun_btn.click(fn=run,
544
+ # inputs=[
545
+ # seed, sub_amount, ssrspeech_model_choice,
546
+ # codec_audio_sr, codec_sr,
547
+ # top_k, top_p, temperature,
548
+ # stop_repetition,
549
+ # kvcache, silence_tokens, aug_text, cfg_coef,
550
+ # input_audio, transcribe_state, original_transcript, transcript,
551
+ # gr.State(value="Rerun"), sentence_selector, audio_tensors
552
+ # ],
553
+ # outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
554
 
555
  return app
556