Matthijs Hollemans committed
Commit 4ba2008 · 1 Parent(s): 7af44d2

fix small bug

Files changed (1):
  1. app.py (+20 -13)
app.py CHANGED
@@ -8,6 +8,23 @@ from PIL import Image, ImageDraw, ImageFont
 from transformers import pipeline
 
 
+# checkpoint = "openai/whisper-tiny"
+# checkpoint = "openai/whisper-base"
+checkpoint = "openai/whisper-small"
+
+# We need to set alignment_heads on the model's generation_config (at least
+# until the models have been updated on the hub).
+# If you're going to use a different version of whisper, see the following
+# for which values to use for alignment_heads:
+# https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a
+
+# whisper-tiny
+# alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
+# whisper-base
+# alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
+# whisper-small
+alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]
+
 max_duration = 60  # seconds
 fps = 25
 video_width = 640
@@ -22,10 +39,6 @@ font = ImageFont.truetype("Lato-Regular.ttf", 40)
 text_color = (255, 200, 200)
 highlight_color = (255, 255, 255)
 
-# checkpoint = "openai/whisper-tiny"
-# checkpoint = "openai/whisper-base"
-checkpoint = "openai/whisper-small"
-
 if torch.cuda.is_available() and torch.cuda.device_count() > 0:
     from transformers import (
         AutomaticSpeechRecognitionPipeline,
@@ -45,18 +58,12 @@ if torch.cuda.is_available() and torch.cuda.device_count() > 0:
 else:
     pipe = pipeline(model=checkpoint)
 
-# TODO: no longer need to set these manually once the models have been updated on the Hub
-# whisper-tiny
-# pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
-# whisper-base
-# pipe.model.generation_config.alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
-# whisper-small
-pipe.model.generation_config.alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]
+pipe.model.generation_config.alignment_heads = alignment_heads
 
 chunks = []
 
 start_chunk = 0
-last_draws = []
+last_draws = None
 last_image = None
 
 
@@ -126,7 +133,7 @@ def predict(audio_path):
     global chunks, start_chunk, last_draws, last_image
 
     start_chunk = 0
-    last_draws = []
+    last_draws = None
     last_image = None
 
     audio_data, sr = librosa.load(audio_path, mono=True)
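
The core of the change is that the checkpoint name and its alignment_heads table now live at module level and are applied to the pipeline in one place. Below is a minimal usage sketch, not part of this commit, of how that configuration enables word-level timestamps with the transformers ASR pipeline; the file name "speech.wav" and the return_timestamps="word" call are assumptions about how the app consumes the pipeline, not taken from this diff.

# Minimal sketch (assumption, not part of the commit) of using the
# alignment_heads value defined above for word-level timestamps.
from transformers import pipeline

checkpoint = "openai/whisper-small"
# Cross-attention heads Whisper uses to align tokens to audio frames
# (values for whisper-small, copied from the diff above).
alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8],
                   [9, 0], [9, 7], [9, 9], [10, 5]]

pipe = pipeline(model=checkpoint)
pipe.model.generation_config.alignment_heads = alignment_heads

# Request word-level timestamps; each chunk carries a (start, end) tuple in seconds.
output = pipe("speech.wav", chunk_length_s=30, return_timestamps="word")
for chunk in output["chunks"]:
    print(chunk["timestamp"], chunk["text"])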