Spaces:
Runtime error
Runtime error
File size: 4,851 Bytes
b1828a3 1bb7c1d b1828a3 1bb7c1d b1828a3 1bb7c1d b1828a3 1bb7c1d b1828a3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import gradio as gr
import librosa
import numpy as np
import moviepy.editor as mpy
from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline
# --- Video output settings -------------------------------------------------
fps = 25
max_duration = 60  # seconds; longer audio is truncated to this length
video_width, video_height = 640, 480

# --- Subtitle layout (image coordinates used by the frame renderer) --------
margin_left = margin_right = margin_top = 20
line_height = 44

# --- Assets and fill colours -----------------------------------------------
# Both files are resolved relative to the current working directory.
background_image = Image.open("background.png")
font = ImageFont.truetype("Lato-Regular.ttf", 40)
text_color = (255, 200, 200)       # words that are not currently active
highlight_color = (255, 255, 255)  # the word(s) active at the frame time
# Whisper checkpoint used for transcription. Smaller alternatives:
#   "openai/whisper-tiny"
#   "openai/whisper-base"
checkpoint = "openai/whisper-small"
pipe = pipeline(model=checkpoint)

# TODO: no longer need to set these manually once the models have been
# updated on the Hub.
# The alignment heads are specific to the checkpoint:
#   whisper-base:
#     [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
#   whisper-small: the value assigned below.
pipe.model.config.alignment_heads = [
    [5, 3],
    [5, 9],
    [8, 0],
    [8, 4],
    [8, 7],
    [8, 8],
    [9, 0],
    [9, 7],
    [9, 9],
    [10, 5],
]

# Transcription result of the most recent request; written by predict() and
# read by make_frame() while the video is rendered.
chunks = []
def make_frame(t):
    """Render the subtitle frame shown at time ``t``.

    Looks up the first entry in the module-level ``chunks`` list whose
    timestamp span contains ``t`` and draws its words, left to right with
    simple word wrapping, onto a copy of ``background_image``. Every word
    whose own timestamp span contains ``t`` is drawn in ``highlight_color``
    and underlined; all other words use ``text_color``. When no chunk is
    active, the unmodified background is returned.

    Args:
        t: Frame time in seconds (this function is handed to
            ``mpy.VideoClip`` as its frame generator).

    Returns:
        The rendered frame as a NumPy array built from the PIL image.
    """
    # TODO speed optimization: could cache the last image returned and if the
    # active chunk and active word didn't change, use that last image instead
    # of drawing the exact same thing again

    # TODO in the Henry V example, the word "desires" has an ending timestamp
    # that's too far into the future, and so the word stays highlighted.
    # Could fix this by finding the latest word that is active in the chunk
    # and only highlight that one.

    image = background_image.copy()
    draw = ImageDraw.Draw(image)

    # for debugging: draw frame time
    # draw.text((20, 20), str(t), fill=text_color, font=font)

    space_length = draw.textlength(" ", font)
    right_edge = video_width - margin_right
    x = margin_left
    y = margin_top

    for chunk in chunks:
        chunk_start = chunk["timestamp"][0]
        chunk_end = chunk["timestamp"][1]
        if chunk_end is None:
            # An open-ended chunk is treated as lasting until the clip limit.
            chunk_end = max_duration

        if not (chunk_start <= t <= chunk_end):
            continue

        # NOTE(review): assumes every chunk carries a "words" list of
        # {"text", "timestamp"} dicts -- TODO confirm against the output
        # format of the installed transformers version.
        for entry in chunk["words"]:
            word = entry["text"]
            word_start = entry["timestamp"][0]
            word_end = entry["timestamp"][1]
            if word_end is None:
                # Same fallback as for chunks: without it the comparison
                # below would raise TypeError and abort the whole render.
                word_end = chunk_end

            # Measure "word " minus the space rather than the bare word.
            word_length = draw.textlength(word + " ", font) - space_length
            if x + word_length >= right_edge:
                # Wrap to the start of the next line.
                x = margin_left
                y += line_height

            if word_start <= t <= word_end:
                color = highlight_color
                # 4-unit-high underline directly below the text line.
                draw.rectangle(
                    [x, y + line_height, x + word_length, y + line_height + 4],
                    fill=color,
                )
            else:
                color = text_color

            draw.text((x, y), word, fill=color, font=font)
            x += word_length + space_length

        # Only the first chunk that is active at time t is drawn.
        break

    return np.array(image)
def predict(audio_path):
    """Transcribe an audio file and render a word-highlighting subtitle video.

    The audio is loaded as mono, truncated to at most ``max_duration``
    seconds, resampled to the pipeline's feature-extractor rate and
    transcribed with word-level timestamps. The resulting chunks are stored
    in the module-level ``chunks`` list, which ``make_frame`` reads while
    moviepy renders the video with the original audio attached.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        The path of the written video file, ``"my_video.mp4"``.
    """
    # NOTE(review): the shared ``chunks`` global and the fixed output file
    # name mean overlapping requests would overwrite each other -- confirm
    # that requests are processed one at a time.
    global chunks

    audio_data, sr = librosa.load(audio_path, mono=True)
    duration = librosa.get_duration(y=audio_data, sr=sr)
    duration = min(max_duration, duration)
    audio_data = audio_data[:int(duration * sr)]

    # Run Whisper to get word-level timestamps.
    audio_inputs = librosa.resample(
        audio_data,
        orig_sr=sr,
        target_sr=pipe.feature_extractor.sampling_rate,
    )
    output = pipe(
        audio_inputs,
        chunk_length_s=30,
        stride_length_s=[4, 2],
        return_timestamps="word",
    )
    chunks = output["chunks"]
    # print(chunks)

    # Create the video.
    output_path = "my_video.mp4"
    clip = mpy.VideoClip(make_frame, duration=duration)
    audio_clip = mpy.AudioFileClip(audio_path).set_duration(duration)
    try:
        clip = clip.set_audio(audio_clip)
        clip.write_videofile(
            output_path, fps=fps, codec="libx264", audio_codec="aac"
        )
    finally:
        # The audio clip keeps a reader open on the source file; release it
        # even if encoding fails so repeated requests do not leak readers.
        audio_clip.close()
        clip.close()

    return output_path
# Text shown on the demo page: heading, introduction, and footer credits.
title = "Word-level timestamps with Whisper"
description = """
This demo shows Whisper <b>word-level timestamps</b> in action using Hugging Face Transformers. It creates a video showing subtitled audio with the current word highlighted.
This demo uses the <b>openai/whisper-small</b> checkpoint. Since it's only a demo, the output is limited to the first 60 seconds of audio.
"""
# Footer HTML. Every <p> and <li> is explicitly closed so the markup is
# well-formed.
article = """
<div style='margin:20px auto;'>
<p>Credits:</p>
<ul>
<li>Shakespeare's "Henry V" speech from <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)</li>
<li>"Here's to the Crazy Ones" speech by Steve Jobs</li>
<li>"Stupid People" comedy routine by Bill Engvall</li>
<li>Lato font by Łukasz Dziedzic (licensed under Open Font License)</li>
<li>Whisper model by OpenAI</li>
</ul>
</div>
"""
# Sample audio files offered as one-click examples in the UI.
examples = [
    "examples/steve_jobs_crazy_ones.mp3",
    "examples/henry5.wav",
    "examples/stupid_people.mp3",
]
# Assemble the Gradio UI: one uploaded audio file in, one rendered video out.
# NOTE(review): `source=` is the Gradio 3.x keyword for gr.Audio; newer
# releases use `sources=[...]` -- confirm against the pinned Gradio version.
audio_input = gr.Audio(label="Upload Audio", source="upload", type="filepath")
video_output = gr.Video(label="Output Video")

demo = gr.Interface(
    fn=predict,
    inputs=[audio_input],
    outputs=[video_output],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()
|