txya900619 committed
Commit 946455e · 1 Parent(s): 05b5714

feat: add app.py

Files changed (4)
  1. README.md +1 -1
  2. app.py +117 -0
  3. configs/models.yaml +5 -0
  4. requirements.txt +2 -0
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Asr Kaldi
+title: ASR Kaldi
 emoji: 🐨
 colorFrom: pink
 colorTo: indigo
app.py ADDED
@@ -0,0 +1,117 @@
+import json
+
+import gradio as gr
+from huggingface_hub import snapshot_download
+from omegaconf import OmegaConf
+from vosk import KaldiRecognizer, Model
+
+
+def load_vosk(model_id: str):
+    # Download the model snapshot from the Hugging Face Hub and load it as a Vosk model.
+    model_dir = snapshot_download(model_id)
+    return Model(model_path=model_dir)
+
+
+# Expose load_vosk as an OmegaConf resolver so configs/models.yaml can load
+# models with ${load_vosk:<repo_id>} while the config is being resolved.
+OmegaConf.register_new_resolver("load_vosk", load_vosk)
+
+models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
+
+
+def automatic_speech_recognition(model_id: str, audio_data, dialect_id: str = ""):
+    # audio_data comes from gr.Audio(type="numpy") as (sample_rate, numpy array).
+    # dialect_id is not wired to any input component yet, so it defaults to "".
+    model = models_config[model_id]["model"]
+
+    sample_rate, audio_array = audio_data
+    if audio_array.ndim == 2:
+        # Keep only the first channel of stereo recordings.
+        audio_array = audio_array[:, 0]
+
+    # KaldiRecognizer expects raw 16-bit PCM bytes at the recording's sample rate.
+    audio_bytes = audio_array.tobytes()
+
+    rec = KaldiRecognizer(model, sample_rate)
+    rec.SetWords(True)
+
+    results = []
+    # Feed the audio to the recognizer in 4000-byte chunks.
+    for start in range(0, len(audio_bytes), 4000):
+        end = min(start + 4000, len(audio_bytes))
+        data = audio_bytes[start:end]
+        if rec.AcceptWaveform(data):
+            raw_result = json.loads(rec.Result())
+            results.append(raw_result)
+
+    final_result = json.loads(rec.FinalResult())
+    results.append(final_result)
+
+    # Drop empty segments and strip the spaces Kaldi inserts between tokens.
+    filtered_lines = []
+    for result in results:
+        result["text"] = result["text"].replace(" ", "")
+        if len(result["text"]) > 0:
+            filtered_lines.append(result["text"])
+
+    return ",".join(filtered_lines) + "。"
+
+
+def when_model_selected(model_id: str):
+    # Show a dialect selector only for models that define a dialect mapping.
+    model_config = models_config[model_id]
+
+    if "dialect_mapping" not in model_config:
+        return gr.update(visible=False)
+
+    dialect_drop_down_choices = [
+        (k, v) for k, v in model_config["dialect_mapping"].items()
+    ]
+
+    return gr.update(
+        choices=dialect_drop_down_choices,
+        value=dialect_drop_down_choices[0][1],
+        visible=True,
+    )
+
+
+demo = gr.Blocks(
+    title="康統 Kaldi 語音辨識系統",  # "Kenkone Kaldi speech recognition system"
+    css="@import url(https://tauhu.tw/tauhu-oo.css);",
+    theme=gr.themes.Default(
+        font=(
+            "tauhu-oo",
+            gr.themes.GoogleFont("Source Sans Pro"),
+            "ui-sans-serif",
+            "system-ui",
+            "sans-serif",
+        )
+    ),
+)
+
+with demo:
+    default_model_id = list(models_config.keys())[0]
+    model_drop_down = gr.Dropdown(
+        models_config.keys(),
+        value=default_model_id,
+        label="模型",  # "Model"
+    )
+
+    gr.Markdown(
+        """
+        # 康統 Kaldi 語音辨識系統
+        """
+    )
+    gr.Interface(
+        automatic_speech_recognition,
+        inputs=[
+            model_drop_down,
+            gr.Audio(
+                label="上傳或錄音",  # "Upload or record"
+                type="numpy",
+                format="wav",
+                waveform_options=gr.WaveformOptions(
+                    sample_rate=16000,
+                ),
+            ),
+        ],
+        outputs=[
+            gr.Text(interactive=False, label="文字"),  # "Text"
+        ],
+        allow_flagging="auto",
+    )
+
+demo.launch()
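
Note: the when_model_selected helper above is defined but not attached to any event in this commit. A minimal sketch of how it could be wired up inside the with demo: block, assuming a hypothetical dialect_drop_down component (the component, its label, and the wiring are illustrative, not part of this commit):

# Sketch only: a dialect selector whose choices update when the model changes.
dialect_drop_down = gr.Dropdown(label="腔調", visible=False)  # "dialect"

# Repopulate (or hide) the dialect choices whenever a new model is picked.
model_drop_down.change(
    when_model_selected,
    inputs=[model_drop_down],
    outputs=[dialect_drop_down],
)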
configs/models.yaml ADDED
@@ -0,0 +1,5 @@
+htia-0.1:
+  model:
+    kenkone_evas_cs_v5: ${load_vosk:kenkone/kenkone_evas_cs_v5}
+    kenkone_evas_cs_v5_dental: ${load_vosk:kenkone/kenkone_evas_cs_v5_dental}
+    kenkone_evas_retrain_20240618: ${load_vosk:kenkone/kenkone_evas_retrain_20240618}
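
Note: each ${load_vosk:...} value in this file is handled by the custom resolver registered in app.py, so OmegaConf.to_object returns a plain dict whose leaves are already-loaded Vosk models. A standalone sketch of the same mechanism, using a string-returning stand-in for load_vosk (names and values here are illustrative only):

from omegaconf import OmegaConf

# Stand-in resolver; the real load_vosk downloads and loads a Vosk model.
OmegaConf.register_new_resolver("load_vosk", lambda repo_id: f"<model from {repo_id}>")

cfg = OmegaConf.create({"model": {"demo": "${load_vosk:kenkone/demo}"}})
print(OmegaConf.to_object(cfg)["model"]["demo"])  # -> <model from kenkone/demo>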
requirements.txt ADDED
@@ -0,0 +1,2 @@
+vosk
+omegaconf