Spaces:
Runtime error
Runtime error
added app
Browse files- app.py +44 -0
- pipeline.py +66 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Tuple
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
import torch
|
6 |
+
from torchaudio.transforms import Resample
|
7 |
+
|
8 |
+
from huggingface_hub import hf_hub_download
|
9 |
+
|
10 |
+
import gradio as gr
|
11 |
+
|
12 |
+
from pipeline import PreTrainedPipeline
|
13 |
+
|
14 |
+
|
15 |
+
# Hugging Face Hub repository id (not a full URL): used both as `repo_id` for the
# language-model download and as `model_path` for the acoustic model.
HF_HUB_URL = 'ales/wav2vec2-cv-be'
|
16 |
+
# Path of the language-model file inside the Hub repository (passed as `filename`
# to `hf_hub_download`).
LM_HUB_FP = 'language_model/cv8be_5gram.bin'
|
17 |
+
|
18 |
+
|
19 |
+
# Lazily-built PreTrainedPipeline shared by all requests, so the model weights and
# the language model are loaded once instead of on every call.
_PIPELINE = None


def _get_pipeline():
    """Return the shared recognition pipeline, building it on first use."""
    global _PIPELINE
    if _PIPELINE is None:
        # download Language Model from HF Hub
        lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
        # init pipeline
        _PIPELINE = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
    return _PIPELINE


def main(rate_audio_tuple: Tuple[int, np.ndarray]):
    """Transcribe one recording coming from the Gradio microphone input.

    Args:
        rate_audio_tuple: ``(sampling_rate, audio)`` pair. ``None`` (nothing was
            recorded) is accepted and yields an empty transcription.

    Returns:
        The recognized text, or ``''`` when there is no audio to recognize.
    """
    if rate_audio_tuple is None:
        return ''

    sampling_rate, audio = rate_audio_tuple

    waveform = np.asarray(audio)
    if np.issubdtype(waveform.dtype, np.integer):
        # Integer PCM must become floating point before resampling.
        # NOTE(review): scaling assumes signed PCM (e.g. int16) - confirm.
        full_scale = float(np.iinfo(waveform.dtype).max) + 1.0
        waveform = waveform.astype(np.float32) / full_scale
    else:
        waveform = waveform.astype(np.float32)

    if waveform.ndim > 1:
        # NOTE(review): assumes a (samples, channels) layout - confirm against the
        # Gradio version in use. Down-mix to mono so time is the only axis.
        waveform = waveform.mean(axis=-1)

    if waveform.size == 0:
        return ''

    # resample audio to 16kHz
    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
    audio_resampled = resampler(torch.tensor(waveform)).numpy().flatten()

    # recognize speech
    text_recognized = _get_pipeline()(inputs=audio_resampled)['text'][0]

    return text_recognized
|
36 |
+
|
37 |
+
|
38 |
+
# Web UI: microphone recording in, recognized text out.
iface = gr.Interface(fn=main, inputs='microphone', outputs="text")

# Start serving the interface.
iface.launch()
|
pipeline.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
|
3 |
+
from typing import Dict
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import pyctcdecode
|
7 |
+
|
8 |
+
from transformers import (
|
9 |
+
Wav2Vec2Processor,
|
10 |
+
Wav2Vec2ProcessorWithLM,
|
11 |
+
Wav2Vec2ForCTC,
|
12 |
+
)
|
13 |
+
|
14 |
+
|
15 |
+
class PreTrainedPipeline():
    """Speech recognizer combining a Wav2Vec2 CTC model with a KenLM-backed decoder."""

    def __init__(self, model_path: str, language_model_fp: str):
        """
        Args:
            model_path (:obj:`str`):
                Identifier or path handed to ``from_pretrained`` for both the
                model and its processor.
            language_model_fp (:obj:`str`):
                Path of the KenLM language model file given to the CTC decoder.
        """
        self.language_model_fp = language_model_fp

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = Wav2Vec2ForCTC.from_pretrained(model_path)
        self.model.to(self.device)

        processor = Wav2Vec2Processor.from_pretrained(model_path)
        self.sampling_rate = processor.feature_extractor.sampling_rate

        # Decoder labels are listed in token-id order (presumably so that label i
        # lines up with logit column i).
        vocab = processor.tokenizer.get_vocab()
        labels = [token for token, _ in sorted(vocab.items(), key=lambda item: item[1])]

        self.decoder = pyctcdecode.build_ctcdecoder(
            labels=labels,
            kenlm_model_path=self.language_model_fp,
        )

        self.processor_with_lm = Wav2Vec2ProcessorWithLM(
            feature_extractor=processor.feature_extractor,
            tokenizer=processor.tokenizer,
            decoder=self.decoder
        )

    def __call__(self, inputs: np.ndarray) -> Dict[str, list]:
        """
        Args:
            inputs (:obj:`np.ndarray`):
                The raw waveform of audio received. By default at 16KHz.
        Return:
            A :obj:`dict`. The returned object looks like {"text": [...]}, where
            the value is whatever ``batch_decode`` produced under its "text" key
            for the input audio.
        """
        features = self.processor_with_lm(
            inputs, return_tensors="pt",
            sampling_rate=self.sampling_rate
        )
        # The processor was asked for PyTorch tensors, so move the tensor to the
        # model's device instead of re-wrapping it with torch.tensor().
        input_values = features['input_values'].to(self.device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = self.model(input_values).logits

        text_predicted = self.processor_with_lm.batch_decode(logits.cpu().numpy())['text']

        return {
            "text": text_predicted
        }
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers==4.17.0
pyctcdecode==0.3.0
numpy
# app.py and pipeline.py import these directly
torch
torchaudio
huggingface_hub
https://github.com/kpu/kenlm/archive/master.zip
|