csukuangfj committed on
Commit
1dfc17d
·
1 Parent(s): 60adf6c

small fixes

Browse files
Files changed (3) hide show
  1. app.py +16 -6
  2. examples.py +42 -0
  3. model.py +146 -10
app.py CHANGED
@@ -19,6 +19,7 @@
19
  # References:
20
  # https://gradio.app/docs/#dropdown
21
 
 
22
  import logging
23
  import os
24
  import time
@@ -29,7 +30,7 @@ import torch
29
  import torchaudio
30
 
31
  from examples import examples
32
- from model import get_pretrained_model, language_to_models, sample_rate
33
 
34
  languages = list(language_to_models.keys())
35
 
@@ -39,6 +40,15 @@ def convert_to_wav(in_filename: str) -> str:
39
  out_filename = in_filename + ".wav"
40
  logging.info(f"Converting '{in_filename}' to '{out_filename}'")
41
  _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 '{out_filename}'")
 
 
 
 
 
 
 
 
 
42
  return out_filename
43
 
44
 
@@ -136,12 +146,8 @@ def process(
136
  decoding_method=decoding_method,
137
  num_active_paths=num_active_paths,
138
  )
139
- s = recognizer.create_stream()
140
 
141
- s.accept_wave_file(filename)
142
- recognizer.decode_stream(s)
143
-
144
- text = s.result.text
145
 
146
  date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
147
  end = time.time()
@@ -173,6 +179,10 @@ title = "# Automatic Speech Recognition with Next-gen Kaldi"
173
  description = """
174
  This space shows how to do automatic speech recognition with Next-gen Kaldi.
175
 
 
 
 
 
176
  It is running on CPU within a docker container provided by Hugging Face.
177
 
178
  See more information by visiting the following links:
 
19
  # References:
20
  # https://gradio.app/docs/#dropdown
21
 
22
+ import base64
23
  import logging
24
  import os
25
  import time
 
30
  import torchaudio
31
 
32
  from examples import examples
33
+ from model import decode, get_pretrained_model, language_to_models, sample_rate
34
 
35
  languages = list(language_to_models.keys())
36
 
 
40
  out_filename = in_filename + ".wav"
41
  logging.info(f"Converting '{in_filename}' to '{out_filename}'")
42
  _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 '{out_filename}'")
43
+ _ = os.system(
44
+ f"ffmpeg -hide_banner -loglevel error -i '{in_filename}' -ar 16000 '{out_filename}.flac'"
45
+ )
46
+
47
+ with open(out_filename + ".flac", "rb") as f:
48
+ s = "\n" + out_filename + "\n"
49
+ s += base64.b64encode(f.read()).decode()
50
+ logging.info(s)
51
+
52
  return out_filename
53
 
54
 
 
146
  decoding_method=decoding_method,
147
  num_active_paths=num_active_paths,
148
  )
 
149
 
150
+ text = decode(recognizer, filename)
 
 
 
151
 
152
  date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
153
  end = time.time()
 
179
  description = """
180
  This space shows how to do automatic speech recognition with Next-gen Kaldi.
181
 
182
+ Please visit
183
+ <https://huggingface.co/spaces/k2-fsa/streaming-automatic-speech-recognition>
184
+ for streaming speech recognition with **Next-gen Kaldi**.
185
+
186
  It is running on CPU within a docker container provided by Hugging Face.
187
 
188
  See more information by visiting the following links:
examples.py CHANGED
@@ -58,6 +58,48 @@ examples = [
58
  4,
59
  "./test_wavs/tibetan/a_0_cacm-A70_31117.wav",
60
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  # librispeech
62
  # https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13/tree/main/test_wavs
63
  [
 
58
  4,
59
  "./test_wavs/tibetan/a_0_cacm-A70_31117.wav",
60
  ],
61
+ [
62
+ "Chinese",
63
+ "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
64
+ "greedy_search",
65
+ 4,
66
+ "./test_wavs/alimeeting/R8003_M8001-8004-165.wav",
67
+ ],
68
+ [
69
+ "Chinese",
70
+ "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
71
+ "greedy_search",
72
+ 4,
73
+ "./test_wavs/alimeeting/R8008_M8013-8049-74.wav",
74
+ ],
75
+ [
76
+ "Chinese",
77
+ "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
78
+ "greedy_search",
79
+ 4,
80
+ "./test_wavs/alimeeting/R8009_M8020_N_SPK8026-8026-209.wav",
81
+ ],
82
+ [
83
+ "English",
84
+ "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
85
+ "greedy_search",
86
+ 4,
87
+ "./test_wavs/tedlium3/DanBarber_2010-219.wav",
88
+ ],
89
+ [
90
+ "English",
91
+ "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
92
+ "greedy_search",
93
+ 4,
94
+ "./test_wavs/tedlium3/DanielKahneman_2010-157.wav",
95
+ ],
96
+ [
97
+ "English",
98
+ "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
99
+ "greedy_search",
100
+ 4,
101
+ "./test_wavs/tedlium3/RobertGupta_2010U-15.wav",
102
+ ],
103
  # librispeech
104
  # https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13/tree/main/test_wavs
105
  [
model.py CHANGED
@@ -14,9 +14,13 @@
14
  # See the License for the specific language governing permissions and
15
  # limitations under the License.
16
 
17
- from huggingface_hub import hf_hub_download
18
- from functools import lru_cache
19
  import os
 
 
 
 
 
 
20
 
21
  os.system(
22
  "cp -v /home/user/.local/lib/python3.8/site-packages/k2/lib/*.so /home/user/.local/lib/python3.8/site-packages/sherpa/lib/"
@@ -25,10 +29,59 @@ os.system(
25
  import k2
26
  import sherpa
27
 
28
-
29
  sample_rate = 16000
30
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  @lru_cache(maxsize=30)
33
  def get_pretrained_model(
34
  repo_id: str,
@@ -59,6 +112,10 @@ def get_pretrained_model(
59
  return german_models[repo_id](
60
  repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
61
  )
 
 
 
 
62
  else:
63
  raise ValueError(f"Unsupported repo_id: {repo_id}")
64
 
@@ -176,7 +233,7 @@ def _get_gigaspeech_pre_trained_model(
176
 
177
 
178
  @lru_cache(maxsize=10)
179
- def _get_librispeech_pre_trained_model(
180
  repo_id: str,
181
  decoding_method: str,
182
  num_active_paths: int,
@@ -186,6 +243,9 @@ def _get_librispeech_pre_trained_model(
186
  "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13", # noqa
187
  "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11", # noqa
188
  "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14", # noqa
 
 
 
189
  ], repo_id
190
 
191
  filename = "cpu_jit.pt"
@@ -205,7 +265,15 @@ def _get_librispeech_pre_trained_model(
205
  repo_id=repo_id,
206
  filename=filename,
207
  )
208
- tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500")
 
 
 
 
 
 
 
 
209
 
210
  feat_config = sherpa.FeatureConfig()
211
  feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
@@ -311,12 +379,18 @@ def _get_alimeeting_pre_trained_model(
311
  num_active_paths: int,
312
  ):
313
  assert repo_id in [
 
314
  "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
315
  ], repo_id
316
 
 
 
 
 
 
317
  nn_model = _get_nn_model_filename(
318
  repo_id=repo_id,
319
- filename="cpu_jit_torch_1.7.1.pt",
320
  )
321
  tokens = _get_token_filename(repo_id=repo_id)
322
 
@@ -530,21 +604,76 @@ def _get_german_pre_trained_model(
530
  return recognizer
531
 
532
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
533
  chinese_models = {
534
  "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2": _get_wenetspeech_pre_trained_model, # noqa
 
535
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12": _get_aishell2_pretrained_model, # noqa
536
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12": _get_aishell2_pretrained_model, # noqa
537
  "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2": _get_aidatatang_200zh_pretrained_mode, # noqa
538
  "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2": _get_alimeeting_pre_trained_model, # noqa
539
  "csukuangfj/wenet-chinese-model": _get_wenet_model,
 
540
  }
541
 
542
  english_models = {
543
  "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2": _get_gigaspeech_pre_trained_model, # noqa
544
- "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": _get_librispeech_pre_trained_model, # noqa
545
- "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14": _get_librispeech_pre_trained_model, # noqa
546
- "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": _get_librispeech_pre_trained_model, # noqa
547
- "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13": _get_librispeech_pre_trained_model, # noqa
 
 
 
548
  "csukuangfj/wenet-english-model": _get_wenet_model,
549
  }
550
 
@@ -566,10 +695,16 @@ german_models = {
566
  "csukuangfj/wav2vec2.0-torchaudio": _get_german_pre_trained_model,
567
  }
568
 
 
 
 
 
 
569
  all_models = {
570
  **chinese_models,
571
  **english_models,
572
  **chinese_english_mixed_models,
 
573
  **tibetan_models,
574
  **arabic_models,
575
  **german_models,
@@ -579,6 +714,7 @@ language_to_models = {
579
  "Chinese": list(chinese_models.keys()),
580
  "English": list(english_models.keys()),
581
  "Chinese+English": list(chinese_english_mixed_models.keys()),
 
582
  "Tibetan": list(tibetan_models.keys()),
583
  "Arabic": list(arabic_models.keys()),
584
  "German": list(german_models.keys()),
 
14
  # See the License for the specific language governing permissions and
15
  # limitations under the License.
16
 
 
 
17
  import os
18
+ from functools import lru_cache
19
+ from typing import Union
20
+
21
+ import torch
22
+ import torchaudio
23
+ from huggingface_hub import hf_hub_download
24
 
25
  os.system(
26
  "cp -v /home/user/.local/lib/python3.8/site-packages/k2/lib/*.so /home/user/.local/lib/python3.8/site-packages/sherpa/lib/"
 
29
  import k2
30
  import sherpa
31
 
 
32
  sample_rate = 16000
33
 
34
 
35
def decode_offline_recognizer(
    recognizer: "sherpa.OfflineRecognizer",
    filename: str,
) -> str:
    """Transcribe a wave file with a non-streaming (offline) recognizer.

    Args:
      recognizer:
        The recognizer used for decoding. Only the offline stream API
        (``accept_wave_file`` and the ``result`` attribute of the stream)
        is used here, which is why the annotation is limited to the
        offline recognizer type.
      filename:
        Path of the wave file to decode.

    Returns:
      The recognized text with surrounding whitespace removed and
      converted to lowercase.
    """
    stream = recognizer.create_stream()
    stream.accept_wave_file(filename)
    recognizer.decode_stream(stream)

    return stream.result.text.strip().lower()
46
+
47
+
48
def decode_online_recognizer(
    recognizer: "sherpa.OnlineRecognizer",
    filename: str,
) -> str:
    """Transcribe a wave file with a streaming (online) recognizer.

    The whole file is fed to the stream at once, followed by a short run
    of zeros, and the stream is then decoded until the recognizer reports
    that it is no longer ready.

    Args:
      recognizer:
        The streaming recognizer used for decoding.
      filename:
        Path of the audio file to decode. Its sample rate must equal the
        module-level ``sample_rate``.

    Returns:
      The recognized text with surrounding whitespace removed and
      converted to lowercase.

    Raises:
      ValueError:
        If the sample rate of the file differs from ``sample_rate``.
    """
    samples, actual_sample_rate = torchaudio.load(filename)
    # An explicit check instead of ``assert`` so that it is not stripped
    # when Python runs with -O.
    if actual_sample_rate != sample_rate:
        raise ValueError(
            f"Expected sample rate {sample_rate}, "
            f"but '{filename}' has sample rate {actual_sample_rate}"
        )

    # Only the first channel is decoded.
    samples = samples[0].contiguous()

    stream = recognizer.create_stream()

    # 0.3 seconds of zeros appended after the audio.
    # NOTE(review): presumably this lets the streaming model emit the last
    # tokens of the utterance -- confirm against the sherpa documentation.
    tail_padding = torch.zeros(int(sample_rate * 0.3), dtype=torch.float32)
    stream.accept_waveform(sample_rate, samples)
    stream.accept_waveform(sample_rate, tail_padding)
    stream.input_finished()

    while recognizer.is_ready(stream):
        recognizer.decode_stream(stream)

    return recognizer.get_result(stream).text.strip().lower()
71
+
72
+
73
def decode(
    recognizer: Union[sherpa.OfflineRecognizer, sherpa.OnlineRecognizer],
    filename: str,
) -> str:
    """Transcribe an audio file, dispatching on the recognizer type.

    Args:
      recognizer:
        Either an offline (non-streaming) or an online (streaming)
        recognizer.
      filename:
        Path of the audio file to decode.

    Returns:
      The text returned by ``decode_offline_recognizer`` or
      ``decode_online_recognizer``, depending on the type of
      ``recognizer``.

    Raises:
      ValueError:
        If ``recognizer`` is neither of the two supported types.
    """
    if isinstance(recognizer, sherpa.OfflineRecognizer):
        return decode_offline_recognizer(recognizer, filename)

    if isinstance(recognizer, sherpa.OnlineRecognizer):
        return decode_online_recognizer(recognizer, filename)

    raise ValueError(f"Unknown recognizer type {type(recognizer)}")
83
+
84
+
85
  @lru_cache(maxsize=30)
86
  def get_pretrained_model(
87
  repo_id: str,
 
112
  return german_models[repo_id](
113
  repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
114
  )
115
+ elif repo_id in japanese_models:
116
+ return japanese_models[repo_id](
117
+ repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
118
+ )
119
  else:
120
  raise ValueError(f"Unsupported repo_id: {repo_id}")
121
 
 
233
 
234
 
235
  @lru_cache(maxsize=10)
236
+ def _get_english_model(
237
  repo_id: str,
238
  decoding_method: str,
239
  num_active_paths: int,
 
243
  "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13", # noqa
244
  "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11", # noqa
245
  "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14", # noqa
246
+ "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
247
+ "pkufool/icefall_asr_librispeech_conformer_ctc",
248
+ "WayneWiser/icefall-asr-librispeech-conformer-ctc2-jit-bpe-500-2022-07-21",
249
  ], repo_id
250
 
251
  filename = "cpu_jit.pt"
 
265
  repo_id=repo_id,
266
  filename=filename,
267
  )
268
+ subfolder = "data/lang_bpe_500"
269
+
270
+ if repo_id in (
271
+ "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
272
+ "pkufool/icefall_asr_librispeech_conformer_ctc",
273
+ ):
274
+ subfolder = "data/lang_bpe"
275
+
276
+ tokens = _get_token_filename(repo_id=repo_id, subfolder=subfolder)
277
 
278
  feat_config = sherpa.FeatureConfig()
279
  feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
 
379
  num_active_paths: int,
380
  ):
381
  assert repo_id in [
382
+ "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
383
  "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
384
  ], repo_id
385
 
386
+ if repo_id == "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7":
387
+ filename = "cpu_jit.pt"
388
+ elif repo_id == "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2":
389
+ filename = "cpu_jit_torch_1.7.1.pt"
390
+
391
  nn_model = _get_nn_model_filename(
392
  repo_id=repo_id,
393
+ filename=filename,
394
  )
395
  tokens = _get_token_filename(repo_id=repo_id)
396
 
 
604
  return recognizer
605
 
606
 
607
@lru_cache(maxsize=10)
def _get_japanese_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create a streaming recognizer for the CSJ Japanese transducer model.

    Args:
      repo_id:
        The model repo id followed by ``-fluent`` or ``-disfluent``. The
        part after the last ``-`` selects the ``exp_<kind>`` subfolder
        passed to ``_get_nn_model_filename``.
      decoding_method:
        Passed through to ``sherpa.OnlineRecognizerConfig``.
      num_active_paths:
        Passed through to ``sherpa.OnlineRecognizerConfig``.

    Returns:
      A ``sherpa.OnlineRecognizer`` constructed from the resulting config.
    """
    # Split "<repo>-<kind>" into the real repo id and the model variant.
    base_repo_id, variant = repo_id.rsplit("-", maxsplit=1)

    assert base_repo_id in [
        "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208"
    ], base_repo_id
    assert variant in ("fluent", "disfluent"), variant

    # The encoder, decoder and joiner live in the same variant subfolder.
    exp_dir = f"exp_{variant}"
    encoder_model, decoder_model, joiner_model = [
        _get_nn_model_filename(
            repo_id=base_repo_id,
            filename=model_file,
            subfolder=exp_dir,
        )
        for model_file in (
            "encoder_jit_trace.pt",
            "decoder_jit_trace.pt",
            "joiner_jit_trace.pt",
        )
    ]

    tokens = _get_token_filename(repo_id=base_repo_id)

    feat_config = sherpa.FeatureConfig()
    fbank_opts = feat_config.fbank_opts
    fbank_opts.frame_opts.samp_freq = sample_rate
    fbank_opts.mel_opts.num_bins = 80
    fbank_opts.frame_opts.dither = 0

    # nn_model is left empty because the three sub-models are given
    # separately.
    config = sherpa.OnlineRecognizerConfig(
        nn_model="",
        encoder_model=encoder_model,
        decoder_model=decoder_model,
        joiner_model=joiner_model,
        tokens=tokens,
        use_gpu=False,
        feat_config=feat_config,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
        chunk_size=32,
    )

    return sherpa.OnlineRecognizer(config)
655
+
656
+
657
  chinese_models = {
658
  "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2": _get_wenetspeech_pre_trained_model, # noqa
659
+ "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7": _get_alimeeting_pre_trained_model,
660
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12": _get_aishell2_pretrained_model, # noqa
661
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12": _get_aishell2_pretrained_model, # noqa
662
  "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2": _get_aidatatang_200zh_pretrained_mode, # noqa
663
  "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2": _get_alimeeting_pre_trained_model, # noqa
664
  "csukuangfj/wenet-chinese-model": _get_wenet_model,
665
+ # "csukuangfj/icefall-asr-wenetspeech-lstm-transducer-stateless-2022-10-14": _get_lstm_transducer_model,
666
  }
667
 
668
  english_models = {
669
  "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2": _get_gigaspeech_pre_trained_model, # noqa
670
+ "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": _get_english_model, # noqa
671
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14": _get_english_model, # noqa
672
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": _get_english_model, # noqa
673
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13": _get_english_model, # noqa
674
+ "videodanchik/icefall-asr-tedlium3-conformer-ctc2": _get_english_model,
675
+ "pkufool/icefall_asr_librispeech_conformer_ctc": _get_english_model,
676
+ "WayneWiser/icefall-asr-librispeech-conformer-ctc2-jit-bpe-500-2022-07-21": _get_english_model,
677
  "csukuangfj/wenet-english-model": _get_wenet_model,
678
  }
679
 
 
695
  "csukuangfj/wav2vec2.0-torchaudio": _get_german_pre_trained_model,
696
  }
697
 
698
# Keys are the model repo id plus a "-fluent" / "-disfluent" suffix; both
# variants are served by the same loader function.
japanese_models = dict.fromkeys(
    (
        "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-fluent",
        "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-disfluent",
    ),
    _get_japanese_pre_trained_model,
)
702
+
703
  all_models = {
704
  **chinese_models,
705
  **english_models,
706
  **chinese_english_mixed_models,
707
+ # **japanese_models,
708
  **tibetan_models,
709
  **arabic_models,
710
  **german_models,
 
714
  "Chinese": list(chinese_models.keys()),
715
  "English": list(english_models.keys()),
716
  "Chinese+English": list(chinese_english_mixed_models.keys()),
717
+ # "Japanese": list(japanese_models.keys()),
718
  "Tibetan": list(tibetan_models.keys()),
719
  "Arabic": list(arabic_models.keys()),
720
  "German": list(german_models.keys()),