import os
import shutil
from pathlib import Path

import gradio as gr
import pandas as pd
from datasets import Audio, Dataset

from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
from speechline.transcribers import Wav2Vec2Transcriber
from speechline.utils.tokenizer import WordTokenizer
# Maximum number of (transcript, audio) output pairs pre-allocated in the UI.
MAX_SEGMENTS = 100
# Directory where segmented audio chunks and their transcripts are written.
OUTPUT_DIR = "tmp"
def segmentation_interface(choice: str):
    """Toggle which extra input is visible for the chosen segmentation method."""
    if choice == "Silence Gap":
        # Silence-based segmentation needs the silence duration slider.
        return gr.update(visible=True), gr.update(visible=False)
    elif choice == "Word Overlap":
        # Overlap-based segmentation needs the ground truth textbox instead.
        return gr.update(visible=False), gr.update(visible=True)
def run(audio_path, model, segmentation_type, silence_duration, ground_truth):
    transcriber = Wav2Vec2Transcriber(model)

    # Wrap the uploaded audio in a single-row dataset, resampled to the
    # transcriber's expected sampling rate.
    dataset = Dataset.from_dict({"audio": [audio_path]})
    dataset = dataset.cast_column(
        "audio", Audio(sampling_rate=transcriber.sampling_rate)
    )
    # Predict a word-level transcript with start/end time offsets.
    output_offsets = transcriber.predict(dataset, output_offsets=True)

    if segmentation_type == "Silence Gap":
        segmenter = SilenceSegmenter()
    elif segmentation_type == "Word Overlap":
        segmenter = WordOverlapSegmenter()

    tokenizer = WordTokenizer()

    # Clear any leftovers from a previous run.
    if os.path.exists(OUTPUT_DIR):
        shutil.rmtree(OUTPUT_DIR)

    # Chunk the audio into segments; each segment yields a .wav file and a
    # .tsv file of word offsets under OUTPUT_DIR.
    segmenter.chunk_audio_segments(
        audio_path,
        OUTPUT_DIR,
        output_offsets[0],
        minimum_chunk_duration=0,
        silence_duration=silence_duration,
        ground_truth=tokenizer(ground_truth),
    )

    # Collect the generated segments and reveal the matching output
    # components. sorted() yields each segment's .tsv before its .wav,
    # which matches the Dataframe/Audio order of the output components.
    outputs, idx = [], 0
    for path in sorted(Path(OUTPUT_DIR).rglob("*")):
        if path.suffix == ".tsv":
            transcript = pd.read_csv(
                path, sep="\t", names=["start_offset", "end_offset", "text"]
            )
            outputs.append(gr.Dataframe.update(value=transcript, visible=True))
        elif path.suffix == ".wav":
            outputs.append(gr.Audio.update(value=str(path), visible=True))
            idx += 1

    # Hide the remaining, unused component pairs.
    for _ in range(MAX_SEGMENTS - idx):
        outputs += [
            gr.Dataframe.update(visible=False),
            gr.Audio.update(visible=False),
        ]
    return outputs
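
# A minimal sketch of calling `run` outside the UI, assuming a local WAV file
# at "sample.wav" (hypothetical path) and the default silence-gap settings:
#
#     outputs = run(
#         audio_path="sample.wav",
#         model="facebook/wav2vec2-base-960h",
#         segmentation_type="Silence Gap",
#         silence_duration=0.1,
#         ground_truth="",
#     )
#
# The returned list alternates Dataframe and Audio component updates, one
# pair per segment, padded with hidden pairs up to MAX_SEGMENTS.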
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <center>
        # 🎙️ SpeechLine Demo
        [Repository](https://github.com/bookbot-kids/speechline) | [Documentation](https://bookbot-kids.github.io/speechline/)
        </center>
        """
    )
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(type="filepath")
            model = gr.Dropdown(
                choices=[
                    "facebook/wav2vec2-base-960h",
                ],
                value="facebook/wav2vec2-base-960h",
                label="Transcriber Model",
            )
            segmenter = gr.Radio(
                choices=["Silence Gap", "Word Overlap"],
                value="Silence Gap",
                label="Segmentation Method",
            )
            # Shown only for silence-gap segmentation.
            sil = gr.Slider(
                0, 1, value=0.1, step=0.1, label="Silence Duration", visible=True
            )
            # Shown only for word-overlap segmentation.
            gt = gr.Textbox(
                label="Ground Truth",
                placeholder="Enter Ground Truth Text",
                interactive=True,
                visible=False,
            )
            # Swap the silence slider and ground-truth textbox whenever the
            # segmentation method changes.
            segmenter.change(
                fn=segmentation_interface, inputs=segmenter, outputs=[sil, gt]
            )
            inputs = [audio, model, segmenter, sil, gt]
            transcribe_btn = gr.Button("Transcribe")
        with gr.Column():
            # Pre-allocate MAX_SEGMENTS (transcript, audio) output pairs;
            # `run` toggles their visibility per request.
            outputs = [
                gr.Dataframe(
                    visible=True, headers=["start_offset", "end_offset", "text"]
                ),
                gr.Audio(visible=True),
            ]
            for _ in range(MAX_SEGMENTS - 1):
                outputs += [gr.Dataframe(visible=False), gr.Audio(visible=False)]

    transcribe_btn.click(fn=run, inputs=inputs, outputs=outputs)

demo.launch()
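
# Note: the demo pre-renders MAX_SEGMENTS hidden output pairs, so the page can
# grow large; lowering MAX_SEGMENTS is a simple way to slim it down.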