import json

import gradio as gr
import torch
from transformers import AutoProcessor, pipeline

model_path = "models/whisper-small-shanghainese"
language = "Chinese"
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load the fine-tuned Whisper processor and build an ASR pipeline.
processor = AutoProcessor.from_pretrained(model_path, language=language)
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_path,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    torch_dtype=torch_dtype,
    device=device,
)


def transcribe(audio_file):
    # Transcribe the uploaded audio (with timestamps) and return the result as JSON.
    result = pipe(
        audio_file,
        return_timestamps=True,
        generate_kwargs={"task": "transcribe", "language": language},
    )
    return json.dumps(result, ensure_ascii=False)


def main():
    with gr.Blocks() as app:
        with gr.Row():
            with gr.Column():
                gr.Markdown("# Finetune whisper-small for Shanghainese")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(label="Audio", type="filepath")
                submit_btn = gr.Button("Submit", variant="primary")
        with gr.Row():
            with gr.Column():
                text_output = gr.TextArea(label="Output (JSON)")
        submit_btn.click(transcribe, inputs=[audio_input], outputs=[text_output])

    # gr.Blocks.launch() starts its own server, so no separate uvicorn call is needed.
    app.launch(share=False, server_name="0.0.0.0", server_port=7860)


if __name__ == "__main__":
    main()