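# Gradio demo: transcribe spoken Japanese audio to hiragana with a
# fine-tuned wav2vec2 CTC model.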
import gradio as gr
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch

# Load the pretrained Japanese hiragana wav2vec2 model and its processor
model_name = "vumichien/wav2vec2-large-xlsr-japanese-hiragana"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)


def process_audio_file(file):
    # sr=None keeps the file's native sampling rate; librosa otherwise
    # resamples everything to 22.05 kHz on load.
    data, sr = librosa.load(file, sr=None)
    if sr != 16000:
        # librosa >= 0.10 only accepts keyword arguments here.
        data = librosa.resample(data, orig_sr=sr, target_sr=16000)
    inputs = processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
    return inputs


def transcribe(file):
    inputs = process_audio_file(file)
    with torch.no_grad():
        output_logit = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
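    # Greedy CTC decoding: take the most likely token at each frame;
    # batch_decode collapses repeats and strips blank tokens.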
    pred_ids = torch.argmax(output_logit, dim=-1)
    return processor.batch_decode(pred_ids)[0]


description = "A simple interface to transcribe from spoken Japanese to Hiragana."
article = "<p style='text-align: center'>&copy; 2022 Detomo</p>"
inputs = [gr.inputs.Audio(source="microphone", type="filepath", optional=True)]
examples = [
    ["samples/BASIC5000_0001.wav"],
    ["samples/BASIC5000_0005.wav"],
]
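
# Note: this script targets the older Gradio Interface API (gr.inputs.*,
# the layout/theme keywords, and enable_queue on launch); newer Gradio
# releases have since replaced these.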
iface = gr.Interface(
    fn=transcribe,
    inputs=inputs,
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Transcribe Japanese audio to Hiragana",
    description=description,
    article=article,
    allow_flagging='never',
    examples=examples
)
iface.launch(enable_queue=True, share=True)