File size: 1,588 Bytes
08ab644
 
 
 
 
 
 
 
 
5024e84
0888de7
 
08ab644
 
 
 
 
 
 
 
 
 
 
 
ab2fef5
08ab644
 
 
 
 
 
 
 
0888de7
08ab644
 
 
 
 
 
 
 
 
 
 
 
 
0888de7
 
 
08ab644
89f31d9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import os
import time
import warnings
from pathlib import Path

import gradio as gr
import librosa
import spaces
import torch
from transformers import pipeline, WhisperConfig
from parse_accent import parse_pitch_accent
from surface2katakana_with_acc import katakana_to_phones
# Silence every warning category for the whole process.
warnings.filterwarnings(action="ignore")

# True when the SYSTEM environment variable equals "spaces"
# (presumably set by the Hugging Face Spaces runtime -- confirm).
is_hf = os.environ.get("SYSTEM") == "spaces"
# reference from litagin / galgame-whisper-wip

# Extra generation arguments forwarded to the pipeline on every call.
generate_kwargs = dict(max_new_tokens=256)

# Speech-recognition pipeline, built once at import time and reused by every
# request; runs on CUDA when torch reports it as available, otherwise on CPU.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="AkitoP/whisper-large-v3-japense-phone_accent",
    device="cuda" if torch.cuda.is_available() else "cpu",
    chunk_length_s=30,
)


@spaces.GPU
def transcribe(audio: str) -> tuple[str, str, str]:
    """Transcribe an audio file and derive two accent renderings of the text.

    Args:
        audio: Path of the audio file to transcribe. May be ``None`` (or
            empty) when the handler is triggered before any audio has been
            provided.

    Returns:
        A 3-tuple, in the order the UI outputs are wired: the raw text
        produced by ``pipe``, ``parse_pitch_accent(text)`` and
        ``katakana_to_phones(text)``. The two helper results are displayed
        in textboxes and are assumed to be strings -- TODO confirm against
        the helper modules. Three empty strings are returned when no audio
        was given.
    """
    if not audio:
        # Nothing to transcribe: skip inference instead of handing None to
        # the pipeline, and clear all three outputs.
        return "", "", ""

    result = pipe(audio, generate_kwargs=generate_kwargs)["text"]
    print(result)  # keep the raw transcription in the process log
    return result, parse_pitch_accent(result), katakana_to_phones(result)


# Markdown rendered at the top of the page.
initial_md = """
# Whisper Large V3 Japanese Phone Accent

A Whisper model fine-tuned to transcribe Japanese speech into Katakana with pitch accent annotations. Built on whisper-large-v3-turbo, it uses a subset (1/20) of the Galgame-Speech dataset and the jsut-5000 dataset.
"""

# Page layout: header, audio input, trigger button and three result boxes.
with gr.Blocks() as app:
    gr.Markdown(initial_md)

    # The audio component hands the handler a file path rather than raw samples.
    audio = gr.Audio(type="filepath")
    transcribe_btn = gr.Button("Transcribe")

    # One textbox per value returned by transcribe(), in the same order.
    output = gr.Textbox(label="Result")
    output_HL_style = gr.Textbox(label="HL Result (SBV2 style)")
    output_UPDOWN_style = gr.Textbox(label="↑↓ Result (GSV style)")

    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio],
        outputs=[output, output_HL_style, output_UPDOWN_style],
    )

# Start the web server as soon as the module runs, opening a browser tab.
app.launch(inbrowser=True)