|
import json |
|
|
|
import gradio as gr |
|
import torch |
|
import uvicorn |
|
from transformers import AutoProcessor, pipeline |
|
|
|
# Location of the fine-tuned Whisper checkpoint and the language name that is
# handed to both the processor and the generation step.
model_path = "models/whisper-small-shanghainese"
language = "Chinese"

# Choose the compute target once: half precision on CUDA, full precision on CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32
|
|
|
# Load the processor stored with the checkpoint; its tokenizer and feature
# extractor are reused by the pipeline below.
processor = AutoProcessor.from_pretrained(model_path, language=language)

# Speech-recognition pipeline on the same checkpoint: 30-second chunking and
# at most 128 newly generated tokens per generation call.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model_path,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    max_new_tokens=128,
    device=device,
    torch_dtype=torch_dtype,
)
|
|
|
|
|
def transcribe(audio_file):
    """Transcribe an audio file and return the result as a JSON string.

    Args:
        audio_file: Path of the audio to transcribe, as delivered by the
            Gradio audio input. ``None`` means no audio was supplied.

    Returns:
        The pipeline output serialized with ``json.dumps`` (non-ASCII
        characters are kept as-is rather than escaped). When ``audio_file``
        is ``None`` an empty result, ``{"text": "", "chunks": []}``, is
        returned without invoking the model.
    """
    # Gradio hands over None when Submit is clicked without any audio; the
    # pipeline rejects that input, so answer with an empty transcription in
    # the same shape instead of surfacing a stack trace in the UI.
    if audio_file is None:
        return json.dumps({"text": "", "chunks": []}, ensure_ascii=False)

    generate_kwargs = {"task": "transcribe", "language": language}
    result = pipe(
        audio_file,
        return_timestamps=True,
        generate_kwargs=generate_kwargs,
    )
    return json.dumps(result, ensure_ascii=False)
|
|
|
|
|
def main():
    """Build the Gradio transcription page and serve it on port 7860."""
    with gr.Blocks() as demo:
        # Page title.
        with gr.Row(), gr.Column():
            gr.Markdown('''
            # Finetune whisper-small for Shanghainese
            ''')

        # Input side: audio widget (delivered as a file path) and the trigger.
        with gr.Row(), gr.Column():
            audio_in = gr.Audio(label="Audio", type="filepath")
            run_button = gr.Button("Submit", variant="primary")

        # Output side: the JSON text returned by transcribe().
        with gr.Row(), gr.Column():
            json_box = gr.TextArea(label="Output (JSON)")

        run_button.click(transcribe, inputs=[audio_in], outputs=[json_box])

    # Listen on every interface; no public share link is requested.
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)
|
|
|
|
|
if __name__ == "__main__":
    # main() builds the Gradio page and starts its server through launch().
    # It is a plain zero-argument function, not an ASGI application, so it
    # has to be called directly rather than handed to uvicorn.run().
    main()
|
|