Matthijs Hollemans committed on
Commit
b1828a3
·
1 Parent(s): dbc8f56
Files changed (5) hide show
  1. .gitattributes +3 -0
  2. README.md +2 -2
  3. app.py +149 -0
  4. background.png +0 -0
  5. requirements.txt +8 -0
.gitattributes CHANGED
@@ -32,3 +32,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ *.wav filter=lfs diff=lfs merge=lfs -text
36
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
37
+ *.ttf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- title: Whisper Word Timestamps
3
- emoji: 💻
4
  colorFrom: yellow
5
  colorTo: indigo
6
  sdk: gradio
 
1
  ---
2
+ title: Whisper Word-Level Timestamps
3
+ emoji: 💭⏰
4
  colorFrom: yellow
5
  colorTo: indigo
6
  sdk: gradio
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import numpy as np
4
+ import moviepy.editor as mpy
5
+
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ from transformers import pipeline
8
+
9
+
10
+ fps = 25
11
+ max_duration = 60 # seconds
12
+ video_width = 640
13
+ video_height = 480
14
+ margin_left = 20
15
+ margin_right = 20
16
+ margin_top = 20
17
+ line_height = 44
18
+
19
+ background_image = Image.open("background.png")
20
+ font = ImageFont.truetype("Lato-Regular.ttf", 40)
21
+ text_color = (255, 200, 200)
22
+ highlight_color = (255, 255, 255)
23
+
24
+ # checkpoint = "openai/whisper-tiny"
25
+ # checkpoint = "openai/whisper-base"
26
+ checkpoint = "openai/whisper-small"
27
+ pipe = pipeline(model=checkpoint)
28
+
29
+ # TODO: no longer need to set these manually once the models have been updated on the Hub
30
+ # whisper-base
31
+ # pipe.model.config.alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
32
+ # whisper-small
33
+ pipe.model.config.alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]
34
+
35
+ chunks = []
36
+
37
+
38
def make_frame(t):
    """Render the subtitle frame for time ``t``.

    Draws the words of the first chunk in the module-level ``chunks`` list
    whose time span contains ``t`` on top of a copy of ``background_image``,
    wrapping lines at the right margin. The latest word whose own time span
    contains ``t`` is drawn in ``highlight_color`` and underlined; all other
    words use ``text_color``. If no chunk is active, the plain background is
    returned.

    Args:
        t: Frame time, compared directly against the chunk and word
            timestamps (seconds, like ``max_duration``).

    Returns:
        The rendered frame as a NumPy array (``np.array`` of the PIL image).
    """
    # TODO speed optimization: could cache the last image returned and if the
    # active chunk and active word didn't change, use that last image instead
    # of drawing the exact same thing again

    image = background_image.copy()
    draw = ImageDraw.Draw(image)

    # for debugging: draw frame time
    # draw.text((20, 20), str(t), fill=text_color, font=font)

    space_length = draw.textlength(" ", font)
    x = margin_left
    y = margin_top

    for chunk in chunks:
        chunk_start = chunk["timestamp"][0]
        chunk_end = chunk["timestamp"][1]
        if chunk_end is None:
            # An open-ended chunk is treated as running to the end of the clip.
            chunk_end = max_duration

        if not (chunk_start <= t <= chunk_end):
            continue

        entries = chunk["words"]

        # Pick the *latest* word whose time span contains t. Some words get an
        # ending timestamp that is too far in the future; highlighting every
        # matching word would leave such a word lit up next to later ones.
        active_index = None
        for index, entry in enumerate(entries):
            word_start = entry["timestamp"][0]
            word_end = entry["timestamp"][1]
            if word_end is None:
                # Same fallback as for chunks: comparing against None would
                # raise a TypeError.
                word_end = chunk_end
            if word_start <= t <= word_end:
                active_index = index

        for index, entry in enumerate(entries):
            word = entry["text"]

            # Measure the word together with a trailing space, then subtract
            # the width of the space again.
            word_length = draw.textlength(word + " ", font) - space_length
            if x + word_length >= video_width - margin_right:
                # Word does not fit on this line: wrap to the next one.
                x = margin_left
                y += line_height

            if index == active_index:
                color = highlight_color
                # Underline the active word with a 4-pixel-high bar.
                draw.rectangle(
                    [x, y + line_height, x + word_length, y + line_height + 4],
                    fill=color,
                )
            else:
                color = text_color

            draw.text((x, y), word, fill=color, font=font)
            x += word_length + space_length

        # Only the first chunk that is active at time t is drawn.
        break

    return np.array(image)
87
+
88
+
89
def predict(audio_path):
    """Transcribe an audio file and render a word-highlighted subtitle video.

    Loads at most ``max_duration`` seconds of the audio, runs the Whisper
    pipeline with word-level timestamps, stores the resulting chunks in the
    module-level ``chunks`` list (which ``make_frame`` reads), and writes a
    video combining the rendered frames with the original audio.

    NOTE: the result is exchanged through the global ``chunks`` and a fixed
    output file name, so concurrent calls would interfere with each other.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        The path of the written video file, ``"my_video.mp4"``.

    Raises:
        ValueError: If ``audio_path`` is ``None`` (no file was provided).
    """
    global chunks

    if audio_path is None:
        raise ValueError("No audio file was provided.")

    # Decode directly at the sampling rate the model expects and stop after
    # max_duration seconds. This avoids decoding the whole file and avoids
    # resampling twice (once to librosa's default rate, once to the model's).
    target_sr = pipe.feature_extractor.sampling_rate
    audio_inputs, _ = librosa.load(
        audio_path, sr=target_sr, mono=True, duration=max_duration
    )
    duration = librosa.get_duration(y=audio_inputs, sr=target_sr)
    duration = min(max_duration, duration)

    # Run Whisper to get word-level timestamps.
    output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word")
    chunks = output["chunks"]
    print(chunks)

    # Create the video.
    clip = mpy.VideoClip(make_frame, duration=duration)
    audio_clip = mpy.AudioFileClip(audio_path)
    try:
        clip = clip.set_audio(audio_clip.set_duration(duration))
        clip.write_videofile("my_video.mp4", fps=fps, codec="libx264", audio_codec="aac")
    finally:
        # Release the audio reader even if encoding fails.
        audio_clip.close()
    return "my_video.mp4"
109
+
110
+
111
+ title = "Word-level timestamps with Whisper"
112
+
113
+ description = """
114
+ This demo shows Whisper <b>word-level timestamps</b> in action using Hugging Face Transformers. It creates a video showing subtitled audio with the current word highlighted.
115
+
116
+ This demo uses the <b>openai/whisper-small</b> checkpoint. Since it's only a demo, the output is limited to the first 60 seconds of audio.
117
+ """
118
+
119
# HTML footer for the demo page: credits for the example audio, the font,
# and the model.
article = """
<div style='margin:20px auto;'>

<p>Credits:</p>

<ul>
<li>Shakespeare's "Henry V" speech from <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)</li>
<li>Lato font by Łukasz Dziedzic (licensed under Open Font License)</li>
<li>Whisper model by OpenAI</li>
</ul>

</div>
"""
132
+
133
+ examples = [
134
+ "examples/henry5.wav",
135
+ ]
136
+
137
+ gr.Interface(
138
+ fn=predict,
139
+ inputs=[
140
+ gr.Audio(label="Upload Audio", source="upload", type="filepath"),
141
+ ],
142
+ outputs=[
143
+ gr.Video(label="Output Video"),
144
+ ],
145
+ title=title,
146
+ description=description,
147
+ article=article,
148
+ examples=examples,
149
+ ).launch()
background.png ADDED
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/hollance/transformers.git@whisper_word_timestamps
2
+ torch
3
+ torchaudio
4
+ soundfile
5
+ librosa
6
+ moviepy
7
+ matplotlib
8
+ pillow