OpenSound committed
Commit f52a712 · 1 Parent(s): 4b8ea71

Update app.py

Files changed (1): app.py (+5 -6)
app.py CHANGED
@@ -113,12 +113,11 @@ def get_transcribe_state(segments):
 
 @spaces.GPU
 def transcribe(audio_path):
-    align_model, _ = load_align_model(language_code=language, device=device)
     transcribe_model = load_model(transcribe_model_name, device, asr_options={"suppress_numerals": True, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None}, language=language)
     segments = transcribe_model.transcribe(audio_path, batch_size=8)["segments"]
     for segment in segments:
         segment['text'] = replace_numbers_with_words(segment['text'])
-    segments = align_model.align(segments, audio_path)
+    _, segments = align(segments, audio_path)
     state = get_transcribe_state(segments)
     success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
 
@@ -135,7 +134,7 @@ def align(segments, audio_path):
     segments = align_func(segments, align_model, metadata, audio, device, return_char_alignments=False)["segments"]
     state = get_transcribe_state(segments)
 
-    return state
+    return state, segments
 
 
 def get_output_audio(audio_tensors, codec_audio_sr):
@@ -177,7 +176,7 @@ def run(seed, sub_amount, codec_audio_sr, codec_sr, top_k, top_p, temperature,
     [orig_transcript, segments, _] = transcribe(audio_path)
     orig_transcript = orig_transcript.lower()
     target_transcript = target_transcript.lower()
-    transcribe_state = align(segments, audio_path)
+    transcribe_state, _ = align(segments, audio_path)
     print(orig_transcript)
     print(target_transcript)
 
@@ -201,7 +200,7 @@ def run(seed, sub_amount, codec_audio_sr, codec_sr, top_k, top_p, temperature,
 
     orig_transcript = orig_transcript.lower()
     target_transcript = target_transcript.lower()
-    transcribe_state = align(segments, audio_path)
+    transcribe_state, _ = align(segments, audio_path)
     print(orig_transcript)
     target_transcript_copy = target_transcript # for tts cut out
     target_transcript_copy = target_transcript_copy.split(' ')[0]
@@ -277,7 +276,7 @@ def run(seed, sub_amount, codec_audio_sr, codec_sr, top_k, top_p, temperature,
     torchaudio.save(audio_path, new_audio, codec_audio_sr)
     if tts: # remove the start parts
         [new_transcript, new_segments, _] = transcribe(audio_path)
-        transcribe_state = align(new_segments, audio_path)
+        transcribe_state, _ = align(new_segments, audio_path)
         tmp1 = transcribe_state['segments'][0]['words'][0]['word'].lower()
         tmp2 = target_transcript_copy.lower()
         if tmp1 == tmp2:
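
In short, this commit changes align() to return both the UI state and the word-aligned segments, and transcribe() now reuses that shared helper instead of loading its own alignment model, so callers in run() unpack two values. A minimal runnable sketch of the new contract, with the WhisperX-style model calls stubbed out (the real app.py uses load_model, align_func and a richer get_transcribe_state):

# Sketch only: the stubs below stand in for the ASR and forced-alignment
# models that app.py actually loads; only the return shapes match the commit.
def get_transcribe_state(segments):
    return {"segments": segments}

def align(segments, audio_path):
    # real code: align_func(segments, align_model, metadata, audio, device, ...)
    aligned = [dict(seg, words=seg.get("words", [])) for seg in segments]
    state = get_transcribe_state(aligned)
    # new contract: return the display state *and* the aligned segments
    return state, aligned

def transcribe(audio_path):
    # real code: transcribe_model.transcribe(audio_path, batch_size=8)["segments"]
    segments = [{"text": "hello world", "words": []}]
    # alignment is now delegated to the shared align() helper
    _, segments = align(segments, audio_path)
    state = get_transcribe_state(segments)
    return "hello world", segments, state

# Callers that only need the state discard the segments, as run() does:
# transcribe_state, _ = align(segments, audio_path)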