OpenSound committed
Commit 3daaddd · 1 Parent(s): 41c3bad

Update app.py

Files changed (1)
  1. app.py +169 -178
app.py CHANGED
@@ -125,18 +125,21 @@ class WhisperxModel:
         return self.align_model.align(segments, audio_path)
 
 @spaces.GPU
-def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, ssrspeech_model_name):
+def load_models(whisper_backend_name, ssrspeech_model_name):
     global transcribe_model, align_model, ssrspeech_model
 
-
+    alignment_model_name = "whisperX"
     if ssrspeech_model_name == "English":
         ssrspeech_model_name = "English"
         text_tokenizer = TextTokenizer(backend="espeak")
         language = "en"
+        whisper_model_name = "base.en"
+
     elif ssrspeech_model_name == "Mandarin":
         ssrspeech_model_name = "Mandarin"
         text_tokenizer = TextTokenizer(backend="espeak", language='cmn')
         language = "zh"
+        whisper_model_name = "base"
 
     if alignment_model_name is not None:
         align_model = WhisperxAlignModel(language)
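The loader now derives the Whisper checkpoint and espeak language from the single model choice (the `whisper_model_choice` and `align_model_choice` radios are removed further down). A standalone sketch of that mapping, not the app's code (the commit implements it with the `if`/`elif` shown above):

```python
# Illustrative only: the choice -> Whisper settings now hard-wired inside load_models.
WHISPER_BY_MODEL_CHOICE = {
    "English": {"whisper_model_name": "base.en", "language": "en"},
    "Mandarin": {"whisper_model_name": "base", "language": "zh"},
}

def pick_whisper_settings(ssrspeech_model_name: str) -> dict:
    """Return the Whisper settings implied by the ssrspeech model choice."""
    return WHISPER_BY_MODEL_CHOICE[ssrspeech_model_name]

print(pick_whisper_settings("Mandarin"))  # {'whisper_model_name': 'base', 'language': 'zh'}
```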
@@ -212,12 +215,8 @@ def align(segments, audio_path):
 
     segments = align_model.align(segments, audio_path)
     state = get_transcribe_state(segments)
-    success_message = "<span style='color:green;'>Success: Alignment completed successfully!</span>"
 
-    return [
-        state["transcript_with_start_time"], state["transcript_with_end_time"],
-        state, success_message
-    ]
+    return state
 
 
 def get_output_audio(audio_tensors, codec_audio_sr):
@@ -239,141 +238,166 @@ def replace_numbers_with_words(sentence):
 
 @spaces.GPU
 def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_k, top_p, temperature,
-        stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef,
-        audio_path, transcribe_state, original_transcript, transcript,
-        mode, selected_sentence, previous_audio_tensors):
+        stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef, prompt_length,
+        audio_path, original_transcript, transcript, mode):
 
-    global ssrspeech_model
+    global transcribe_model, align_model, ssrspeech_model
     aug_text = True if aug_text == 1 else False
     if ssrspeech_model is None:
         raise gr.Error("ssrspeech model not loaded")
 
+    seed_everything(seed)
+
+    if ssrspeech_model_choice == "English":
+        language = "en"
+    elif ssrspeech_model_choice == "Mandarin":
+        language = "zh"
+
     # resample audio
     audio, _ = librosa.load(audio_path, sr=16000)
     sf.write(audio_path, audio, 16000)
-
-    seed_everything(seed)
-    transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ") # replace numbers with words, so that the phonemizer can do a better job
-
-    if mode == "Rerun":
-        colon_position = selected_sentence.find(':')
-        selected_sentence_idx = int(selected_sentence[:colon_position])
-        sentences = [selected_sentence[colon_position + 1:]]
-    else:
-        sentences = [transcript.replace("\n", " ")]
-
-    audio_tensors = []
-    inference_transcript = ""
-    for sentence in sentences:
-        decode_config = {"top_k": top_k, "top_p": top_p, "temperature": temperature, "stop_repetition": stop_repetition,
-                         "kvcache": kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr}
-
-        # run the script to turn user input to the format that the model can take
-        if mode == "Edit":
-            operations, orig_spans = parse_edit_en(original_transcript, sentence) if ssrspeech_model_choice == 'English' else parse_edit_zh(original_transcript, sentence)
-            print(operations)
-            print("orig_spans: ", orig_spans)
-
-            if len(orig_spans) > 3:
-                raise gr.Error("Current model only supports maximum 3 editings")
-
-            starting_intervals = []
-            ending_intervals = []
-            for orig_span in orig_spans:
-                start, end = get_mask_interval(transcribe_state, orig_span)
-                starting_intervals.append(start)
-                ending_intervals.append(end)
-
-            print("intervals: ", starting_intervals, ending_intervals)
+
+    # text normalization
+    target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
+    orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
+
+    orig_transcript, segments = transcribe(audio_path)
+    if language == 'zh':
+        converter = opencc.OpenCC('t2s')
+        orig_transcript = converter.convert(orig_transcript)
+        transcribe_state = align(traditional_to_simplified(segments), audio_path)
+        transcribe_state['segments'] = traditional_to_simplified(transcribe_state['segments'])
+    elif language == 'en':
+        orig_transcript = orig_transcript.lower()
+        target_transcript = target_transcript.lower()
+        transcribe_state = align(segments, audio_path)
+    print(orig_transcript)
+    print(target_transcript)
+
+    if mode == "TTS":
+        info = torchaudio.info(audio_path)
+        duration = info.num_frames / info.sample_rate
+        cut_length = duration
+        # Cut long audio for tts
+        if duration > prompt_length:
+            seg_num = len(transcribe_state['segments'])
+            for i in range(seg_num):
+                words = transcribe_state['segments'][i]['words']
+                for item in words:
+                    if item['end'] >= prompt_length:
+                        cut_length = min(item['end'], cut_length)
+
+        audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
+        sf.write(audio_path, audio, 16000)
+        orig_transcript, segments = transcribe(audio_path)
 
-            info = torchaudio.info(audio_path)
-            audio_dur = info.num_frames / info.sample_rate
-
-            def combine_spans(spans, threshold=0.2):
-                spans.sort(key=lambda x: x[0])
-                combined_spans = []
-                current_span = spans[0]
+        if language == 'zh':
+            converter = opencc.OpenCC('t2s')
+            orig_transcript = converter.convert(orig_transcript)
+            transcribe_state = align(traditional_to_simplified(segments), audio_path)
+            transcribe_state['segments'] = traditional_to_simplified(transcribe_state['segments'])
+        elif language == 'en':
+            orig_transcript = orig_transcript.lower()
+            target_transcript = target_transcript.lower()
+            transcribe_state = align(segments, audio_path)
+        print(orig_transcript)
+        target_transcript_copy = target_transcript # for tts cut out
+        if language == 'en':
+            target_transcript_copy = target_transcript_copy.split(' ')[0]
+        elif language == 'zh':
+            target_transcript_copy = target_transcript_copy[0]
+        target_transcript = orig_transcript + ' ' + target_transcript if language == 'en' else orig_transcript + target_transcript
+        print(target_transcript)
+
+    if mode == "Edit":
+        operations, orig_spans = parse_edit_en(orig_transcript, target_transcript) if language == 'en' else parse_edit_zh(orig_transcript, target_transcript)
+        print(operations)
+        print("orig_spans: ", orig_spans)
 
-            for i in range(1, len(spans)):
-                next_span = spans[i]
-                if current_span[1] >= next_span[0] - threshold:
-                    current_span[1] = max(current_span[1], next_span[1])
-                else:
-                    combined_spans.append(current_span)
-                    current_span = next_span
-            combined_spans.append(current_span)
-            return combined_spans
+        if len(orig_spans) > 3:
+            raise gr.Error("Current model only supports maximum 3 editings")
 
-            morphed_span = [[max(start - sub_amount, 0), min(end + sub_amount, audio_dur)]
-                            for start, end in zip(starting_intervals, ending_intervals)] # in seconds
-            morphed_span = combine_spans(morphed_span, threshold=0.2)
-            print("morphed_spans: ", morphed_span)
-            mask_interval = [[round(span[0]*codec_sr), round(span[1]*codec_sr)] for span in morphed_span]
-            mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now
-
-            gen_audio = inference_one_sample(
-                ssrspeech_model["model"],
-                ssrspeech_model["config"],
-                ssrspeech_model["phn2num"],
-                ssrspeech_model["text_tokenizer"],
-                ssrspeech_model["audio_tokenizer"],
-                audio_path, original_transcript, sentence, mask_interval,
-                cfg_coef, aug_text, False, True, False,
-                device, decode_config
-            )
-        else:
-            orig_spans = parse_tts_en(original_transcript, sentence) if ssrspeech_model_choice == 'English' else parse_tts_zh(original_transcript, sentence)
-            print("orig_spans: ", orig_spans)
-
-            starting_intervals = []
-            ending_intervals = []
-            for orig_span in orig_spans:
-                start, end = get_mask_interval(transcribe_state, orig_span)
-                starting_intervals.append(start)
-                ending_intervals.append(end)
+        starting_intervals = []
+        ending_intervals = []
+        for orig_span in orig_spans:
+            start, end = get_mask_interval(transcribe_state, orig_span)
+            starting_intervals.append(start)
+            ending_intervals.append(end)
+
+        print("intervals: ", starting_intervals, ending_intervals)
+
+        info = torchaudio.info(audio_path)
+        audio_dur = info.num_frames / info.sample_rate
 
-            print("intervals: ", starting_intervals, ending_intervals)
+        def combine_spans(spans, threshold=0.2):
+            spans.sort(key=lambda x: x[0])
+            combined_spans = []
+            current_span = spans[0]
+
+            for i in range(1, len(spans)):
+                next_span = spans[i]
+                if current_span[1] >= next_span[0] - threshold:
+                    current_span[1] = max(current_span[1], next_span[1])
+                else:
+                    combined_spans.append(current_span)
+                    current_span = next_span
+            combined_spans.append(current_span)
+            return combined_spans
 
-            info = torchaudio.info(audio_path)
-            audio_dur = info.num_frames / info.sample_rate
-
-            morphed_span = [(max(start, 1/codec_sr), min(end, audio_dur))
-                            for start, end in zip(starting_intervals, ending_intervals)] # in seconds
-            mask_interval = [[round(span[0]*codec_sr), round(span[1]*codec_sr)] for span in morphed_span]
-            mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now
-            print("mask_interval: ", mask_interval)
-            gen_audio = inference_one_sample(
-                ssrspeech_model["model"],
-                ssrspeech_model["config"],
-                ssrspeech_model["phn2num"],
-                ssrspeech_model["text_tokenizer"],
-                ssrspeech_model["audio_tokenizer"],
-                audio_path, original_transcript, sentence, mask_interval,
-                cfg_coef, aug_text, False, True, True,
-                device, decode_config
-            )
-
-        gen_audio = gen_audio[0].cpu()
-        audio_tensors.append(gen_audio)
-
-    if mode != "Rerun":
-        output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-        sentences = [f"{idx}: {text}" for idx, text in enumerate(sentences)]
-        component = gr.Dropdown(choices=sentences, value=sentences[0])
-        return output_audio, inference_transcript, component, audio_tensors
+        morphed_span = [[max(start - sub_amount, 0), min(end + sub_amount, audio_dur)]
+                        for start, end in zip(starting_intervals, ending_intervals)] # in seconds
+        morphed_span = combine_spans(morphed_span, threshold=0.2)
+        print("morphed_spans: ", morphed_span)
+        mask_interval = [[round(span[0]*codec_sr), round(span[1]*codec_sr)] for span in morphed_span]
+        mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now
     else:
-        previous_audio_tensors[selected_sentence_idx] = audio_tensors[0]
-        output_audio = get_output_audio(previous_audio_tensors, codec_audio_sr)
-        sentence_audio = get_output_audio(audio_tensors, codec_audio_sr)
-        return output_audio, inference_transcript, sentence_audio, previous_audio_tensors
-
+        info = torchaudio.info(audio_path)
+        audio_dur = info.num_frames / info.sample_rate
+
+        morphed_span = [(audio_dur, audio_dur)] # in seconds
+        mask_interval = [[round(span[0]*codec_sr), round(span[1]*codec_sr)] for span in morphed_span]
+        mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now
+        print("mask_interval: ", mask_interval)
 
-def load_sentence(selected_sentence, codec_audio_sr, audio_tensors):
-    if selected_sentence is None:
-        return None
-    colon_position = selected_sentence.find(':')
-    selected_sentence_idx = int(selected_sentence[:colon_position])
-    return get_output_audio([audio_tensors[selected_sentence_idx]], codec_audio_sr)
+    decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr}
+
+    tts = True if mode == "TTS" else False
+    new_audio = inference_one_sample(
+        ssrspeech_model["model"],
+        ssrspeech_model["config"],
+        ssrspeech_model["phn2num"],
+        ssrspeech_model["text_tokenizer"],
+        ssrspeech_model["audio_tokenizer"],
+        audio_path, orig_transcript, target_transcript, mask_interval,
+        cfg_coef, aug_text, False, True, tts,
+        device, decode_config
+    )
+    audio_tensors = []
+    # save segments for comparison
+    new_audio = new_audio[0].cpu()
+    torchaudio.save(audio_path, new_audio, codec_audio_sr)
+    if tts: # remove the start parts
+        new_transcript, new_segments = transcribe(audio_path)
+        if language == 'zh':
+            transcribe_state = align(traditional_to_simplified(new_segments), audio_path)
+            transcribe_state['segments'] = traditional_to_simplified(transcribe_state['segments'])
+            tmp1 = transcribe_state['segments'][0]['words'][0]['word']
+            tmp2 = target_transcript_copy
+        elif language == 'en':
+            transcribe_state = align(new_segments, audio_path)
+            tmp1 = transcribe_state['segments'][0]['words'][0]['word'].lower()
+            tmp2 = target_transcript_copy.lower()
+        if tmp1 == tmp2:
+            offset = transcribe_state['segments'][0]['words'][0]['start']
+        else:
+            offset = transcribe_state['segments'][0]['words'][1]['start']
+
+        new_audio, _ = torchaudio.load(audio_path, frame_offset=int(offset*codec_audio_sr))
+    audio_tensors.append(new_audio)
+    output_audio = get_output_audio(audio_tensors, codec_audio_sr)
+
+    success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
+    return output_audio, success_message
 
 
 demo_original_transcript = "Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather."
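The new Edit branch pads each edited span by `sub_amount`, merges spans whose gap is at most 0.2 s with the `combine_spans` helper, and converts the result to codec-frame indices. A self-contained sketch of that behaviour, with made-up span values and the UI default `codec_sr=50`:

```python
import torch

def combine_spans(spans, threshold=0.2):
    # Same merging rule as the helper defined inside run(): spans whose gap
    # is at most `threshold` seconds are collapsed into a single span.
    spans.sort(key=lambda x: x[0])
    combined_spans = []
    current_span = spans[0]
    for next_span in spans[1:]:
        if current_span[1] >= next_span[0] - threshold:
            current_span[1] = max(current_span[1], next_span[1])
        else:
            combined_spans.append(current_span)
            current_span = next_span
    combined_spans.append(current_span)
    return combined_spans

codec_sr = 50  # codec frames per second (UI default)
spans = [[1.00, 1.40], [1.50, 2.10], [3.00, 3.20]]  # made-up edit spans, in seconds
merged = combine_spans(spans, threshold=0.2)        # -> [[1.0, 2.1], [3.0, 3.2]]
mask_interval = torch.LongTensor(
    [[round(s * codec_sr), round(e * codec_sr)] for s, e in merged]
)  # -> [[50, 105], [150, 160]] in codec frames
print(merged, mask_interval.tolist())
```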
@@ -405,23 +429,14 @@ def get_app():
                 ssrspeech_model_choice = gr.Radio(label="ssrspeech model", value="English",
                                                   choices=["English", "Mandarin"])
                 whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["whisperX", "whisper"])
-                whisper_model_choice = gr.Radio(label="Whisper model", value="base.en",
-                                                choices=[None, "base.en", "small.en", "medium.en", "large"])
-                align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
 
         with gr.Row():
             with gr.Column(scale=2):
                 input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
                 with gr.Group():
                     original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
-                                                     info="Use whisperx model to get the transcript. Fix and align it if necessary.")
-                    with gr.Accordion("Word start time", open=False):
-                        transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
-                    with gr.Accordion("Word end time", open=False):
-                        transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
-
+                                                     info="Use whisperx model to get the transcript.")
                 transcribe_btn = gr.Button(value="Transcribe")
-                align_btn = gr.Button(value="Align")
 
             with gr.Column(scale=3):
                 with gr.Group():
@@ -437,11 +452,6 @@ def get_app():
                     with gr.Accordion("Inference transcript", open=False):
                         inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
                                                           info="Inference was performed on this transcript.")
-                with gr.Group(visible=False) as long_tts_sentence_editor:
-                    sentence_selector = gr.Dropdown(label="Sentence", value=None,
-                                                    info="Select sentence you want to regenerate")
-                    sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
-                    rerun_btn = gr.Button(value="Rerun")
 
         with gr.Row():
             with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
@@ -453,57 +463,38 @@ def get_app():
                 aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                     info="set to 1 to use cfg")
                 cfg_coef = gr.Number(label="cfg_coef", value=1.5,
-                                     info="cfg guidance scale, 1.5 is a good value")
-                sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment")
+                                     info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                prompt_length = gr.Number(label="prompt_length", value=3,
+                                          info="used for tts prompt, will automatically cut the prompt audio to this length")
+                sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
                 top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
-                temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
+                temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not change")
                 top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
-                codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
-                codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
+                codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, do not change')
+                codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, do not change')
                 silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
 
         success_output = gr.HTML()
-        audio_tensors = gr.State()
-        transcribe_state = gr.State(value={"words_info": demo_words_info})
 
         load_models_btn.click(fn=load_models,
-                              inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
+                              inputs=[whisper_backend_choice, ssrspeech_model_choice],
                               outputs=[models_selector, success_output])
 
-
+        semgents = None # not used
         transcribe_btn.click(fn=transcribe,
-                             inputs=[seed, input_audio],
-                             outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
-        align_btn.click(fn=align,
-                        inputs=[seed, original_transcript, input_audio],
-                        outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
-
+                             inputs=[input_audio],
+                             outputs=[original_transcript, semgents, success_output])
+
         run_btn.click(fn=run,
                       inputs=[
                           seed, sub_amount, ssrspeech_model_choice,
                           codec_audio_sr, codec_sr,
-                          top_k, top_p, temperature,
-                          stop_repetition,
-                          kvcache, silence_tokens, aug_text, cfg_coef,
-                          input_audio, transcribe_state, original_transcript, transcript,
-                          mode, sentence_selector, audio_tensors
+                          top_k, top_p, temperature, stop_repetition, kvcache, silence_tokens,
+                          aug_text, cfg_coef, prompt_length,
+                          input_audio, original_transcript, transcript,
+                          mode
                       ],
-                      outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
-
-        sentence_selector.change(fn=load_sentence,
-                                 inputs=[sentence_selector, codec_audio_sr, audio_tensors],
-                                 outputs=[sentence_audio])
-        rerun_btn.click(fn=run,
-                        inputs=[
-                            seed, sub_amount, ssrspeech_model_choice,
-                            codec_audio_sr, codec_sr,
-                            top_k, top_p, temperature,
-                            stop_repetition,
-                            kvcache, silence_tokens, aug_text, cfg_coef,
-                            input_audio, transcribe_state, original_transcript, transcript,
-                            gr.State(value="Rerun"), sentence_selector, audio_tensors
-                        ],
-                        outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
+                      outputs=[output_audio, success_output])
 
     return app
 
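In the new TTS path the prompt audio is trimmed at the end of the first aligned word that finishes at or after `prompt_length` seconds, falling back to the full clip. A self-contained sketch of that selection rule (the word timings below are made up):

```python
def pick_cut_length(segments, prompt_length, duration):
    # Mirrors the prompt-trimming rule in the TTS branch of run(): keep audio
    # up to the end of the earliest word that ends at or after prompt_length;
    # otherwise keep the whole clip.
    cut_length = duration
    if duration > prompt_length:
        for seg in segments:
            for word in seg["words"]:
                if word["end"] >= prompt_length:
                    cut_length = min(word["end"], cut_length)
    return cut_length

# Made-up word alignments for illustration.
segments = [{"words": [
    {"word": "hello",  "start": 0.0, "end": 0.4},
    {"word": "there",  "start": 0.5, "end": 0.9},
    {"word": "friend", "start": 3.1, "end": 3.6},
]}]
print(pick_cut_length(segments, prompt_length=3, duration=7.2))  # 3.6
```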