File size: 1,588 Bytes
08ab644 5024e84 0888de7 08ab644 ab2fef5 08ab644 0888de7 08ab644 0888de7 08ab644 89f31d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
import os
import time
import warnings
from pathlib import Path
import gradio as gr
import librosa
import spaces
import torch
from transformers import pipeline, WhisperConfig
from parse_accent import parse_pitch_accent
from surface2katakana_with_acc import katakana_to_phones
# Silence every warning category for the lifetime of the process.
warnings.filterwarnings("ignore")

# True only when the SYSTEM environment variable is exactly "spaces".
is_hf = os.environ.get("SYSTEM") == "spaces"

# Generation settings forwarded to the ASR pipeline on every call
# (values taken from litagin / galgame-whisper-wip).
generate_kwargs = dict(max_new_tokens=256)
# Module-level speech-recognition pipeline, built once at import time and
# reused by every request. It is placed on CUDA when torch reports a GPU,
# otherwise it falls back to the CPU.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="AkitoP/whisper-large-v3-japense-phone_accent",
    device="cpu" if not torch.cuda.is_available() else "cuda",
    chunk_length_s=30,
)
@spaces.GPU
def transcribe(audio: str) -> "tuple[str, str, str]":
    """Transcribe an audio file and derive the two accent notations.

    Args:
        audio: Path to the audio file to transcribe (the UI's audio
            component is configured with ``type="filepath"``).

    Returns:
        A 3-tuple ``(text, hl, updown)`` where ``text`` is the pipeline's
        ``"text"`` output, ``hl`` is ``parse_pitch_accent(text)`` and
        ``updown`` is ``katakana_to_phones(text)``. The order matches the
        three output textboxes wired to the "Transcribe" button.

    Raises:
        gr.Error: If no audio was provided (``audio`` is ``None`` or empty).
    """
    # Gradio passes None when the button is pressed with no clip loaded;
    # fail with a readable message instead of an opaque pipeline error.
    if not audio:
        raise gr.Error("Please record or upload an audio clip before transcribing.")

    result = pipe(audio, generate_kwargs=generate_kwargs)["text"]
    # Echo the raw transcription to stdout so it shows up in the app logs.
    print(result)
    return result, parse_pitch_accent(result), katakana_to_phones(result)
# Markdown shown at the top of the page: a heading line followed by a short
# description of the model. The leading and trailing newlines are kept.
initial_md = (
    "\n"
    "# Whisper Large V3 Japanese Phone Accent\n"
    "A Whisper model fine-tuned to transcribe Japanese speech into Katakana "
    "with pitch accent annotations. Built on whisper-large-v3-turbo, it uses "
    "a subset (1/20) of the Galgame-Speech dataset and the jsut-5000 dataset.\n"
)
# Build the UI: intro text, one audio input, a button, and three read-outs
# for the values returned by `transcribe`.
with gr.Blocks() as app:
    gr.Markdown(initial_md)
    # type="filepath" makes Gradio hand the handler a path string.
    audio = gr.Audio(type="filepath")
    transcribe_btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Result")
    output_HL_style = gr.Textbox(label="HL Result (SBV2 style)")
    # NOTE(review): the label's leading symbols were garbled ("ββ") in the
    # reviewed copy; restored as up/down arrows to match the variable name.
    # Confirm against the original file.
    output_UPDOWN_style = gr.Textbox(label="↑↓ Result (GSV style)")
    # The output order must match the tuple order returned by `transcribe`.
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio],
        outputs=[output, output_HL_style, output_UPDOWN_style],
    )

app.launch(inbrowser=True)