import json

import gradio as gr
import torch
from transformers import AutoProcessor, pipeline

model_path = "models/whisper-small-shanghainese"
language = "Chinese"
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load the fine-tuned Whisper processor and build an ASR pipeline.
processor = AutoProcessor.from_pretrained(model_path, language=language)
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_path,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    torch_dtype=torch_dtype,
    device=device,
)


def transcribe(audio_file):
    # Transcribe the uploaded audio (with timestamps) and return the result as JSON.
    result = pipe(
        audio_file,
        return_timestamps=True,
        generate_kwargs={"task": "transcribe", "language": language},
    )
    return json.dumps(result, ensure_ascii=False)


def main():
    with gr.Blocks() as app:
        with gr.Row():
            with gr.Column():
                gr.Markdown("# Finetune whisper-small for Shanghainese")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(label="Audio", type="filepath")
                submit_btn = gr.Button("Submit", variant="primary")
        with gr.Row():
            with gr.Column():
                text_output = gr.TextArea(label="Output (JSON)")
        submit_btn.click(transcribe, inputs=[audio_input], outputs=[text_output])

    # gr.Blocks.launch() starts its own server, so no separate uvicorn call is needed.
    app.launch(share=False, server_name="0.0.0.0", server_port=7860)


if __name__ == "__main__":
    main()