csukuangfj committed on
Commit
8cc7e65
·
1 Parent(s): b100127

Minor fixes

Browse files
Files changed (1) hide show
  1. app.py +89 -8
app.py CHANGED
@@ -40,6 +40,71 @@ def convert_to_wav(in_filename: str) -> str:
40
  return out_filename
41
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  def process(
44
  in_filename: str,
45
  language: str,
@@ -88,11 +153,16 @@ def process(
88
  rtf = (end - start) / duration
89
 
90
  logging.info(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")
91
- logging.info(f"Duration {duration: .3f} s")
92
- logging.info(f"RTF {rtf: .3f}")
 
 
 
 
 
93
  logging.info(f"hyp:\n{hyp}")
94
 
95
- return hyp
96
 
97
 
98
  title = "# Automatic Speech Recognition with Next-gen Kaldi"
@@ -107,6 +177,15 @@ See more information by visiting the following links:
107
  - <https://github.com/lhotse-speech/lhotse>
108
  """
109
 
 
 
 
 
 
 
 
 
 
110
 
111
  def update_model_dropdown(language: str):
112
  if language in language_to_models:
@@ -116,7 +195,7 @@ def update_model_dropdown(language: str):
116
  raise ValueError(f"Unsupported language: {language}")
117
 
118
 
119
- demo = gr.Blocks()
120
 
121
  with demo:
122
  gr.Markdown(title)
@@ -162,6 +241,7 @@ with demo:
162
  )
163
  upload_button = gr.Button("Submit for recognition")
164
  uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
 
165
 
166
  with gr.TabItem("Record from microphone"):
167
  microphone = gr.Audio(
@@ -173,9 +253,10 @@ with demo:
173
 
174
  record_button = gr.Button("Submit for recognition")
175
  recorded_output = gr.Textbox(label="Recognized speech from recordings")
 
176
 
177
  upload_button.click(
178
- process,
179
  inputs=[
180
  uploaded_file,
181
  language_radio,
@@ -183,10 +264,10 @@ with demo:
183
  decoding_method_radio,
184
  num_active_paths_slider,
185
  ],
186
- outputs=uploaded_output,
187
  )
188
  record_button.click(
189
- process,
190
  inputs=[
191
  microphone,
192
  language_radio,
@@ -194,7 +275,7 @@ with demo:
194
  decoding_method_radio,
195
  num_active_paths_slider,
196
  ],
197
- outputs=recorded_output,
198
  )
199
  gr.Markdown(description)
200
 
 
40
  return out_filename
41
 
42
 
43
def build_html_output(s: str, style: str = "result_item_success"):
    """Wrap ``s`` in the result-box HTML markup.

    Args:
      s: Text or HTML fragment placed inside the inner ``<div>``. It is
        inserted verbatim; no escaping is applied.
      style: CSS class added after ``result_item`` on the inner ``<div>``.

    Returns:
      An HTML string consisting of a ``<div class='result'>`` that contains
      a single ``<div class='result_item ...'>`` holding ``s``.
    """
    html = f"""
    <div class='result'>
        <div class='result_item {style}'>
          {s}
        </div>
    </div>
    """
    return html
51
+
52
+
53
def process_uploaded_file(
    in_filename: str,
    language: str,
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Run recognition on an uploaded file; click handler of the upload button.

    Args:
      in_filename: Filename of the uploaded file. ``None`` or ``""`` means
        nothing was uploaded.
      language: Forwarded unchanged to ``process``.
      repo_id: Forwarded unchanged to ``process``.
      decoding_method: Forwarded unchanged to ``process``.
      num_active_paths: Forwarded unchanged to ``process``.

    Returns:
      A pair ``(text, html)``. On success it is whatever ``process`` returns.
      If no file was given or ``process`` raised, ``text`` is ``""`` and
      ``html`` is an error box built by ``build_html_output``.
    """
    if not in_filename:
        return "", build_html_output(
            "Please first upload a file and then click "
            'the button "submit for recognition"',
            "result_item_error",
        )

    logging.info(f"Processing uploaded file: {in_filename}")
    try:
        return process(
            in_filename=in_filename,
            language=language,
            repo_id=repo_id,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    except Exception as e:
        # This is the UI boundary: the error is shown to the user instead of
        # propagating, so log it with its traceback or it is lost entirely.
        logging.exception(f"Failed to process uploaded file: {in_filename}")
        return "", build_html_output(str(e), "result_item_error")
78
+
79
+
80
def process_microphone(
    in_filename: str,
    language: str,
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Run recognition on a microphone recording; click handler of the record button.

    Args:
      in_filename: Filename of the recording. ``None`` or ``""`` means
        nothing was recorded.
      language: Forwarded unchanged to ``process``.
      repo_id: Forwarded unchanged to ``process``.
      decoding_method: Forwarded unchanged to ``process``.
      num_active_paths: Forwarded unchanged to ``process``.

    Returns:
      A pair ``(text, html)``. On success it is whatever ``process`` returns.
      If nothing was recorded or ``process`` raised, ``text`` is ``""`` and
      ``html`` is an error box built by ``build_html_output``.
    """
    if not in_filename:
        return "", build_html_output(
            "Please first click 'Record from microphone', speak, "
            "click 'Stop recording', and then "
            "click the button 'submit for recognition'",
            "result_item_error",
        )

    logging.info(f"Processing microphone: {in_filename}")
    try:
        return process(
            in_filename=in_filename,
            language=language,
            repo_id=repo_id,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    except Exception as e:
        # This is the UI boundary: the error is shown to the user instead of
        # propagating, so log it with its traceback or it is lost entirely.
        logging.exception(f"Failed to process microphone input: {in_filename}")
        return "", build_html_output(str(e), "result_item_error")
106
+
107
+
108
  def process(
109
  in_filename: str,
110
  language: str,
 
153
  rtf = (end - start) / duration
154
 
155
  logging.info(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")
156
+
157
# Timing summary. The <br/> tags are intentional: the same text is passed to
# build_html_output() and rendered as HTML.
# The precision must be spelled ".3f"; "3.f" is rejected by the format
# mini-language ("Format specifier missing precision").
info = f"""
Wave duration : {duration: .3f} s <br/>
Processing time: {end - start: .3f} s <br/>
RTF: {end - start: .3f}/{duration: .3f} = {(end - start)/duration:.3f} <br/>
"""
162
+ logging.info(info)
163
  logging.info(f"hyp:\n{hyp}")
164
 
165
+ return hyp, build_html_output(info)
166
 
167
 
168
  title = "# Automatic Speech Recognition with Next-gen Kaldi"
 
177
  - <https://github.com/lhotse-speech/lhotse>
178
  """
179
 
180
# css style is copied from
# https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
#
# Classes used by the markup that build_html_output() generates:
#   .result              - outer container, laid out as a flex column
#   .result_item         - padded, rounded box holding one message
#   .result_item_success - mediumaquamarine background (default style)
#   .result_item_error   - #ff7070 background (error messages)
css = """
.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
"""
188
+
189
 
190
  def update_model_dropdown(language: str):
191
  if language in language_to_models:
 
195
  raise ValueError(f"Unsupported language: {language}")
196
 
197
 
198
+ demo = gr.Blocks(css=css)
199
 
200
  with demo:
201
  gr.Markdown(title)
 
241
  )
242
  upload_button = gr.Button("Submit for recognition")
243
  uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
244
# Keep the component itself (not a 1-tuple): it is listed in `outputs=[...]`
# of upload_button.click, which expects components.
uploaded_html_info = gr.HTML(label="Info")
245
 
246
  with gr.TabItem("Record from microphone"):
247
  microphone = gr.Audio(
 
253
 
254
  record_button = gr.Button("Submit for recognition")
255
  recorded_output = gr.Textbox(label="Recognized speech from recordings")
256
# Keep the component itself (not a 1-tuple): it is listed in `outputs=[...]`
# of record_button.click, which expects components.
recorded_html_info = gr.HTML(label="Info")
257
 
258
  upload_button.click(
259
+ process_uploaded_file,
260
  inputs=[
261
  uploaded_file,
262
  language_radio,
 
264
  decoding_method_radio,
265
  num_active_paths_slider,
266
  ],
267
+ outputs=[uploaded_output, uploaded_html_info],
268
  )
269
  record_button.click(
270
+ process_microphone,
271
  inputs=[
272
  microphone,
273
  language_radio,
 
275
  decoding_method_radio,
276
  num_active_paths_slider,
277
  ],
278
+ outputs=[recorded_output, recorded_html_info],
279
  )
280
  gr.Markdown(description)
281