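# Gradio demo: transcribe spoken Japanese audio to hiragana with a
# fine-tuned wav2vec2 CTC model.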
import gradio as gr
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch

# Load the pretrained Japanese hiragana wav2vec2 model and its processor
model_name = "vumichien/wav2vec2-large-xlsr-japanese-hiragana"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)


def process_audio_file(file):
    # sr=None keeps the file's native sampling rate; librosa otherwise
    # resamples everything to 22.05 kHz on load.
    data, sr = librosa.load(file, sr=None)
    if sr != 16000:
        # librosa >= 0.10 only accepts keyword arguments here.
        data = librosa.resample(data, orig_sr=sr, target_sr=16000)
    inputs = processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
    return inputs


def transcribe(file):
    inputs = process_audio_file(file)
    with torch.no_grad():
        output_logit = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
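    # Greedy CTC decoding: take the most likely token at each frame;
    # batch_decode collapses repeats and strips blank tokens.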
    pred_ids = torch.argmax(output_logit, dim=-1)
    return processor.batch_decode(pred_ids)[0]


description = "A simple interface to transcribe from spoken Japanese to Hiragana."
article = "<p style='text-align: center'>&copy; 2022 Detomo</p>"
inputs = [gr.inputs.Audio(source="microphone", type="filepath", optional=True)]
examples = [
    ["samples/BASIC5000_0001.wav"],
    ["samples/BASIC5000_0005.wav"],
]
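
# Note: this script targets the older Gradio Interface API (gr.inputs.*,
# the layout/theme keywords, and enable_queue on launch); newer Gradio
# releases have since replaced these.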
iface = gr.Interface(
    fn=transcribe,
    inputs=inputs,
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Transcribe Japanese audio to Hiragana",
    description=description,
    article=article,
    allow_flagging='never',
    examples=examples
)
iface.launch(enable_queue=True, share=True)