csukuangfj committed on
Commit
c0c87b2
·
1 Parent(s): a1e6695

Update Russian models

Browse files
Files changed (2) hide show
  1. model.py +46 -4
  2. requirements.txt +2 -2
model.py CHANGED
@@ -147,20 +147,56 @@ def _get_chinese_dialect_models(repo_id: str) -> sherpa_onnx.OfflineRecognizer:
147
 
148
 
149
  @lru_cache(maxsize=10)
150
- def _get_russian_pre_trained_model(repo_id: str) -> sherpa_onnx.OfflineRecognizer:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  assert repo_id in (
152
  "alphacep/vosk-model-ru",
153
  "alphacep/vosk-model-small-ru",
 
154
  ), repo_id
155
 
156
  if repo_id == "alphacep/vosk-model-ru":
157
  model_dir = "am-onnx"
 
 
158
  elif repo_id == "alphacep/vosk-model-small-ru":
159
  model_dir = "am"
 
 
 
 
 
 
160
 
161
  encoder_model = _get_nn_model_filename(
162
  repo_id=repo_id,
163
- filename="encoder.onnx",
164
  subfolder=model_dir,
165
  )
166
 
@@ -176,7 +212,10 @@ def _get_russian_pre_trained_model(repo_id: str) -> sherpa_onnx.OfflineRecognize
176
  subfolder=model_dir,
177
  )
178
 
179
- tokens = _get_token_filename(repo_id=repo_id, subfolder="lang")
 
 
 
180
 
181
  recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
182
  tokens=tokens,
@@ -186,7 +225,8 @@ def _get_russian_pre_trained_model(repo_id: str) -> sherpa_onnx.OfflineRecognize
186
  num_threads=2,
187
  sample_rate=16000,
188
  feature_dim=80,
189
- decoding_method="greedy_search",
 
190
  )
191
 
192
  return recognizer
@@ -546,6 +586,8 @@ korean_models = {
546
  }
547
 
548
  russian_models = {
 
 
549
  "alphacep/vosk-model-ru": _get_russian_pre_trained_model,
550
  "alphacep/vosk-model-small-ru": _get_russian_pre_trained_model,
551
  }
 
147
 
148
 
149
  @lru_cache(maxsize=10)
150
+ def _get_russian_pre_trained_model_ctc(
151
+ repo_id: str, decoding_method: str, num_active_paths: int
152
+ ) -> sherpa_onnx.OfflineRecognizer:
153
+ assert repo_id in (
154
+ "csukuangfj/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24",
155
+ ), repo_id
156
+
157
+ model = _get_nn_model_filename(
158
+ repo_id=repo_id,
159
+ filename="model.int8.onnx",
160
+ subfolder=".",
161
+ )
162
+
163
+ tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
164
+
165
+ recognizer = sherpa_onnx.OfflineRecognizer.from_nemo_ctc(
166
+ model=model,
167
+ tokens=tokens,
168
+ num_threads=2,
169
+ )
170
+
171
+ return recognizer
172
+
173
+
174
+ @lru_cache(maxsize=10)
175
+ def _get_russian_pre_trained_model(
176
+ repo_id: str, decoding_method: str, num_active_paths: int
177
+ ) -> sherpa_onnx.OfflineRecognizer:
178
  assert repo_id in (
179
  "alphacep/vosk-model-ru",
180
  "alphacep/vosk-model-small-ru",
181
+ "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24",
182
  ), repo_id
183
 
184
  if repo_id == "alphacep/vosk-model-ru":
185
  model_dir = "am-onnx"
186
+ encoder = "encoder.onnx"
187
+ model_type = "transducer"
188
  elif repo_id == "alphacep/vosk-model-small-ru":
189
  model_dir = "am"
190
+ encoder = "encoder.onnx"
191
+ model_type = "transducer"
192
+ elif repo_id == "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24":
193
+ model_dir = "."
194
+ encoder = "encoder.int8.onnx"
195
+ model_type = "nemo_transducer"
196
 
197
  encoder_model = _get_nn_model_filename(
198
  repo_id=repo_id,
199
+ filename=encoder,
200
  subfolder=model_dir,
201
  )
202
 
 
212
  subfolder=model_dir,
213
  )
214
 
215
+ if repo_id == "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24":
216
+ tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
217
+ else:
218
+ tokens = _get_token_filename(repo_id=repo_id, subfolder="lang")
219
 
220
  recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
221
  tokens=tokens,
 
225
  num_threads=2,
226
  sample_rate=16000,
227
  feature_dim=80,
228
+ decoding_method=decoding_method,
229
+ model_type=model_type,
230
  )
231
 
232
  return recognizer
 
586
  }
587
 
588
  russian_models = {
589
+ "csukuangfj/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24": _get_russian_pre_trained_model,
590
+ "csukuangfj/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24": _get_russian_pre_trained_model_ctc,
591
  "alphacep/vosk-model-ru": _get_russian_pre_trained_model,
592
  "alphacep/vosk-model-small-ru": _get_russian_pre_trained_model,
593
  }
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
 
2
- sherpa-onnx>=1.9.21
3
  ffmpeg-python
4
- #https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-1.9.26-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
 
1
 
2
+ #sherpa-onnx>=1.9.21
3
  ffmpeg-python
4
+ https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/cpu/1.10.28/sherpa_onnx-1.10.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl