anzorq committed on
Commit
beedcb4
·
verified ·
1 Parent(s): 9ff6d33

Disable real-time transcription

Browse files
Files changed (1) hide show
  1. app.py +13 -42
app.py CHANGED
@@ -7,9 +7,9 @@ from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
7
  from pytube import YouTube
8
  from transformers import pipeline
9
  import re
10
- import numpy as np
11
 
12
- pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)
 
13
 
14
  replacements = [
15
  ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
@@ -25,46 +25,25 @@ def replace_symbols_back(text):
25
  return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)
26
 
27
  @spaces.GPU
28
- def transcribe_speech(audio):
29
  if audio is None: # Handle the NoneType error for microphone input
30
  return "No audio received."
31
 
 
32
  transcription = pipe(audio, chunk_length_s=10)['text']
33
 
34
  return replace_symbols_back(transcription)
35
 
36
- @spaces.GPU
37
- def transcribe_streaming(stream, transcription, new_chunk):
38
- if new_chunk is None: # Handle the NoneType error for microphone input
39
- return stream, transcription
40
-
41
- sampling_rate, audio_data = new_chunk
42
- audio_data = audio_data.astype(np.float32)
43
- audio_data /= np.max(np.abs(audio_data))
44
-
45
- # Convert audio data to mono if it has multiple channels
46
- if audio_data.ndim > 1:
47
- audio_data = np.mean(audio_data, axis=1)
48
-
49
- if stream is not None:
50
- stream = np.concatenate([stream, audio_data])
51
- else:
52
- stream = audio_data
53
-
54
- new_transcription = pipe({"sampling_rate": sampling_rate, "raw": stream})['text']
55
- transcription += " " + replace_symbols_back(new_transcription)
56
-
57
- return stream, transcription
58
-
59
  def transcribe_from_youtube(url, progress=gr.Progress()):
60
  progress(0, "Downloading YouTube audio...")
61
  # Download audio from YouTube using pytube
62
  audio_path = YouTube(url).streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
63
-
64
- progress(0.5, "Transcribing audio...")
65
  transcription = transcribe_speech(audio_path)
 
 
66
 
67
- return audio_path, transcription
68
 
69
  def populate_metadata(url):
70
  yt = YouTube(url)
@@ -86,18 +65,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
86
 
87
  with gr.Tab("Microphone Input"):
88
  gr.Markdown("## Transcribe speech from microphone")
89
- mic_audio = gr.Audio(sources='microphone', streaming=True)
90
- transcription_output = gr.Textbox(label="Transcription", lines=10)
91
-
92
- mic_audio.stream(fn=transcribe_streaming, inputs=[gr.State(), gr.State(""), mic_audio], outputs=[gr.State(), transcription_output])
93
-
94
- with gr.Tab("File Upload"):
95
- gr.Markdown("## Transcribe speech from uploaded file")
96
- upload_audio = gr.Audio(sources="upload", type="filepath")
97
  transcribe_button = gr.Button("Transcribe")
98
- file_transcription_output = gr.Textbox(label="Transcription")
99
-
100
- transcribe_button.click(fn=transcribe_speech, inputs=upload_audio, outputs=file_transcription_output)
101
 
102
  with gr.Tab("YouTube URL"):
103
  gr.Markdown("## Transcribe speech from YouTube video")
@@ -109,9 +81,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
109
 
110
  transcribe_button = gr.Button("Transcribe")
111
  transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
112
- youtube_audio_output = gr.Audio(label="Downloaded Audio", type="filepath")
113
 
114
- transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=[youtube_audio_output, transcription_output])
115
  youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])
116
 
117
  demo.launch()
 
7
  from pytube import YouTube
8
  from transformers import pipeline
9
  import re
 
10
 
11
+ # pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd", device=0) # old model
12
+ pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0) # new model with a new tokenizer
13
 
14
  replacements = [
15
  ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
 
25
  return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)
26
 
27
  @spaces.GPU
28
+ def transcribe_speech(audio, progress=gr.Progress()):
29
  if audio is None: # Handle the NoneType error for microphone input
30
  return "No audio received."
31
 
32
+ progress(0.5, desc="Transcribing audio...")
33
  transcription = pipe(audio, chunk_length_s=10)['text']
34
 
35
  return replace_symbols_back(transcription)
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def transcribe_from_youtube(url, progress=gr.Progress()):
38
  progress(0, "Downloading YouTube audio...")
39
  # Download audio from YouTube using pytube
40
  audio_path = YouTube(url).streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
41
+
 
42
  transcription = transcribe_speech(audio_path)
43
+
44
+ os.remove(audio_path)
45
 
46
+ return transcription
47
 
48
  def populate_metadata(url):
49
  yt = YouTube(url)
 
65
 
66
  with gr.Tab("Microphone Input"):
67
  gr.Markdown("## Transcribe speech from microphone")
68
+ mic_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label="Record or upload an audio")
 
 
 
 
 
 
 
69
  transcribe_button = gr.Button("Transcribe")
70
+ transcription_output = gr.Textbox(label="Transcription")
71
+
72
+ transcribe_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=transcription_output)
73
 
74
  with gr.Tab("YouTube URL"):
75
  gr.Markdown("## Transcribe speech from YouTube video")
 
81
 
82
  transcribe_button = gr.Button("Transcribe")
83
  transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
 
84
 
85
+ transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=transcription_output)
86
  youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])
87
 
88
  demo.launch()