游雁
committed on
Commit
·
5ed094c
1
Parent(s):
1b2f688
add
Browse files- .gitattributes +1 -0
- README.md +11 -8
- example/asr_example.wav +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
example/asr_example.wav filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -100,28 +100,31 @@ print(res)
|
|
100 |
Note: `model_hub`: represents the model repository, `ms` stands for selecting ModelScope download, `hf` stands for selecting Huggingface download.
|
101 |
|
102 |
### Speech Recognition (Streaming)
|
|
|
103 |
```python
|
104 |
from funasr import AutoModel
|
105 |
|
106 |
-
chunk_size = [0, 10, 5]
|
107 |
-
encoder_chunk_look_back = 4
|
108 |
-
decoder_chunk_look_back = 1
|
109 |
|
110 |
model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")
|
111 |
|
112 |
import soundfile
|
113 |
import os
|
114 |
|
115 |
-
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
|
116 |
speech, sample_rate = soundfile.read(wav_file)
|
117 |
-
chunk_stride = chunk_size[1] * 960
|
118 |
|
119 |
cache = {}
|
120 |
-
total_chunk_num = int(len((speech)-1)/chunk_stride+1)
|
121 |
for i in range(total_chunk_num):
|
122 |
-
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
|
123 |
is_final = i == total_chunk_num - 1
|
124 |
-
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size,
|
|
|
|
|
125 |
print(res)
|
126 |
```
|
127 |
Note: `chunk_size` is the configuration for streaming latency. `[0,10,5]` indicates that the real-time display granularity is `10*60=600ms`, and the lookahead information is `5*60=300ms`. Each inference input is `600ms` (sample points are `16000*0.6=9600`), and the output is the corresponding text. For the last speech segment input, `is_final=True` needs to be set to force the output of the last word.
|
|
|
100 |
Note: `model_hub`: represents the model repository, `ms` stands for selecting ModelScope download, `hf` stands for selecting Huggingface download.
|
101 |
|
102 |
### Speech Recognition (Streaming)
|
103 |
+
|
104 |
```python
|
105 |
from funasr import AutoModel
|
106 |
|
107 |
+
chunk_size = [0, 10, 5] # [0, 10, 5] 600ms, [0, 8, 4] 480ms
|
108 |
+
encoder_chunk_look_back = 4 # number of chunks to lookback for encoder self-attention
|
109 |
+
decoder_chunk_look_back = 1 # number of encoder chunks to lookback for decoder cross-attention
|
110 |
|
111 |
model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")
|
112 |
|
113 |
import soundfile
|
114 |
import os
|
115 |
|
116 |
+
wav_file = os.path.join(model.model_path, "../fa-zh/example/asr_example.wav")
|
117 |
speech, sample_rate = soundfile.read(wav_file)
|
118 |
+
chunk_stride = chunk_size[1] * 960 # 600ms
|
119 |
|
120 |
cache = {}
|
121 |
+
total_chunk_num = int(len((speech) - 1) / chunk_stride + 1)
|
122 |
for i in range(total_chunk_num):
|
123 |
+
speech_chunk = speech[i * chunk_stride:(i + 1) * chunk_stride]
|
124 |
is_final = i == total_chunk_num - 1
|
125 |
+
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size,
|
126 |
+
encoder_chunk_look_back=encoder_chunk_look_back,
|
127 |
+
decoder_chunk_look_back=decoder_chunk_look_back)
|
128 |
print(res)
|
129 |
```
|
130 |
Note: `chunk_size` is the configuration for streaming latency. `[0,10,5]` indicates that the real-time display granularity is `10*60=600ms`, and the lookahead information is `5*60=300ms`. Each inference input is `600ms` (sample points are `16000*0.6=9600`), and the output is the corresponding text. For the last speech segment input, `is_final=True` needs to be set to force the output of the last word.
|
example/asr_example.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:732f28a4445eb2b656675b946695e59752b7df824d475c4bc531c441f43b30f9
|
3 |
+
size 417742
|