Spaces:

anzorq
/

w2v-bert-2.0-kbd

Paused

App Files Files Community

anzorq committed on May 20, 2024

Commit

beedcb4

verified ·

1 Parent(s): 9ff6d33

Disable real-time transcription

Browse files

Files changed (1) hide show

app.py +13 -42

app.py CHANGED Viewed

@@ -7,9 +7,9 @@ from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
 from pytube import YouTube
 from transformers import pipeline
 import re
-import numpy as np
-pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)
 replacements = [
     ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
@@ -25,46 +25,25 @@ def replace_symbols_back(text):
     return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)
 @spaces.GPU
-def transcribe_speech(audio):
     if audio is None:  # Handle the NoneType error for microphone input
         return "No audio received."
     transcription = pipe(audio, chunk_length_s=10)['text']
     return replace_symbols_back(transcription)
-@spaces.GPU
-def transcribe_streaming(stream, transcription, new_chunk):
-    if new_chunk is None:  # Handle the NoneType error for microphone input
-        return stream, transcription
-    sampling_rate, audio_data = new_chunk
-    audio_data = audio_data.astype(np.float32)
-    audio_data /= np.max(np.abs(audio_data))
-    # Convert audio data to mono if it has multiple channels
-    if audio_data.ndim > 1:
-        audio_data = np.mean(audio_data, axis=1)
-    if stream is not None:
-        stream = np.concatenate([stream, audio_data])
-    else:
-        stream = audio_data
-    new_transcription = pipe({"sampling_rate": sampling_rate, "raw": stream})['text']
-    transcription += " " + replace_symbols_back(new_transcription)
-    return stream, transcription
 def transcribe_from_youtube(url, progress=gr.Progress()):
     progress(0, "Downloading YouTube audio...")
     # Download audio from YouTube using pytube
     audio_path = YouTube(url).streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
-    progress(0.5, "Transcribing audio...")
     transcription = transcribe_speech(audio_path)
-    return audio_path, transcription
 def populate_metadata(url):
     yt = YouTube(url)
@@ -86,18 +65,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Tab("Microphone Input"):
         gr.Markdown("## Transcribe speech from microphone")
-        mic_audio = gr.Audio(sources='microphone', streaming=True)
-        transcription_output = gr.Textbox(label="Transcription", lines=10)
-        mic_audio.stream(fn=transcribe_streaming, inputs=[gr.State(), gr.State(""), mic_audio], outputs=[gr.State(), transcription_output])
-    with gr.Tab("File Upload"):
-        gr.Markdown("## Transcribe speech from uploaded file")
-        upload_audio = gr.Audio(sources="upload", type="filepath")
         transcribe_button = gr.Button("Transcribe")
-        file_transcription_output = gr.Textbox(label="Transcription")
-        transcribe_button.click(fn=transcribe_speech, inputs=upload_audio, outputs=file_transcription_output)
     with gr.Tab("YouTube URL"):
         gr.Markdown("## Transcribe speech from YouTube video")
@@ -109,9 +81,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         transcribe_button = gr.Button("Transcribe")
         transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
-        youtube_audio_output = gr.Audio(label="Downloaded Audio", type="filepath")
-        transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=[youtube_audio_output, transcription_output])
         youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])
 demo.launch()

 from pytube import YouTube
 from transformers import pipeline
 import re
+# pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd", device=0) # old model
+pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0) # new model with a new tokenizer
 replacements = [
     ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
     return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)
 @spaces.GPU
+def transcribe_speech(audio, progress=gr.Progress()):
     if audio is None:  # Handle the NoneType error for microphone input
         return "No audio received."
+    progress(0.5, desc="Transcribing audio...")
     transcription = pipe(audio, chunk_length_s=10)['text']
     return replace_symbols_back(transcription)
 def transcribe_from_youtube(url, progress=gr.Progress()):
     progress(0, "Downloading YouTube audio...")
     # Download audio from YouTube using pytube
     audio_path = YouTube(url).streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
     transcription = transcribe_speech(audio_path)
+    os.remove(audio_path)
+    return transcription
 def populate_metadata(url):
     yt = YouTube(url)
     with gr.Tab("Microphone Input"):
         gr.Markdown("## Transcribe speech from microphone")
+        mic_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label="Record or upload an audio")
         transcribe_button = gr.Button("Transcribe")
+        transcription_output = gr.Textbox(label="Transcription")
+        transcribe_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=transcription_output)
     with gr.Tab("YouTube URL"):
         gr.Markdown("## Transcribe speech from YouTube video")
         transcribe_button = gr.Button("Transcribe")
         transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
+        transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=transcription_output)
         youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])
 demo.launch()