lmz radames committed on
Commit
516463c
·
1 Parent(s): 7c6fd42

add quantized models support (#4)

Browse files

- add quantized models support (e5f217ff864b7c048b430487ac42ad6e0ca23095)


Co-authored-by: Radamés Ajna <[email protected]>

Files changed (6) hide show
  1. build/m.d.ts +9 -2
  2. build/m.js +72 -9
  3. build/m_bg.wasm +2 -2
  4. build/m_bg.wasm.d.ts +2 -1
  5. index.html +59 -40
  6. whisperWorker.js +57 -14
build/m.d.ts CHANGED
@@ -8,8 +8,14 @@ export class Decoder {
8
  * @param {Uint8Array} weights
9
  * @param {Uint8Array} tokenizer
10
  * @param {Uint8Array} mel_filters
 
 
 
 
 
 
11
  */
12
- constructor(weights: Uint8Array, tokenizer: Uint8Array, mel_filters: Uint8Array);
13
  /**
14
  * @param {Uint8Array} wav_input
15
  * @returns {string}
@@ -22,11 +28,12 @@ export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembl
22
  export interface InitOutput {
23
  readonly memory: WebAssembly.Memory;
24
  readonly __wbg_decoder_free: (a: number) => void;
25
- readonly decoder_new: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => void;
26
  readonly decoder_decode: (a: number, b: number, c: number, d: number) => void;
27
  readonly main: (a: number, b: number) => number;
28
  readonly __wbindgen_add_to_stack_pointer: (a: number) => number;
29
  readonly __wbindgen_malloc: (a: number, b: number) => number;
 
30
  readonly __wbindgen_free: (a: number, b: number, c: number) => void;
31
  readonly __wbindgen_start: () => void;
32
  }
 
8
  * @param {Uint8Array} weights
9
  * @param {Uint8Array} tokenizer
10
  * @param {Uint8Array} mel_filters
11
+ * @param {Uint8Array} config
12
+ * @param {boolean} quantized
13
+ * @param {boolean} is_multilingual
14
+ * @param {boolean} timestamps
15
+ * @param {string | undefined} task
16
+ * @param {string | undefined} language
17
  */
18
+ constructor(weights: Uint8Array, tokenizer: Uint8Array, mel_filters: Uint8Array, config: Uint8Array, quantized: boolean, is_multilingual: boolean, timestamps: boolean, task?: string, language?: string);
19
  /**
20
  * @param {Uint8Array} wav_input
21
  * @returns {string}
 
28
  export interface InitOutput {
29
  readonly memory: WebAssembly.Memory;
30
  readonly __wbg_decoder_free: (a: number) => void;
31
+ readonly decoder_new: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number, j: number, k: number, l: number, m: number, n: number, o: number, p: number) => void;
32
  readonly decoder_decode: (a: number, b: number, c: number, d: number) => void;
33
  readonly main: (a: number, b: number) => number;
34
  readonly __wbindgen_add_to_stack_pointer: (a: number) => number;
35
  readonly __wbindgen_malloc: (a: number, b: number) => number;
36
+ readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
37
  readonly __wbindgen_free: (a: number, b: number, c: number) => void;
38
  readonly __wbindgen_start: () => void;
39
  }
build/m.js CHANGED
@@ -42,6 +42,63 @@ function passArray8ToWasm0(arg, malloc) {
42
  return ptr;
43
  }
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  let cachedInt32Memory0 = null;
46
 
47
  function getInt32Memory0() {
@@ -91,8 +148,14 @@ export class Decoder {
91
  * @param {Uint8Array} weights
92
  * @param {Uint8Array} tokenizer
93
  * @param {Uint8Array} mel_filters
 
 
 
 
 
 
94
  */
95
- constructor(weights, tokenizer, mel_filters) {
96
  try {
97
  const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
98
  const ptr0 = passArray8ToWasm0(weights, wasm.__wbindgen_malloc);
@@ -101,7 +164,13 @@ export class Decoder {
101
  const len1 = WASM_VECTOR_LEN;
102
  const ptr2 = passArray8ToWasm0(mel_filters, wasm.__wbindgen_malloc);
103
  const len2 = WASM_VECTOR_LEN;
104
- wasm.decoder_new(retptr, ptr0, len0, ptr1, len1, ptr2, len2);
 
 
 
 
 
 
105
  var r0 = getInt32Memory0()[retptr / 4 + 0];
106
  var r1 = getInt32Memory0()[retptr / 4 + 1];
107
  var r2 = getInt32Memory0()[retptr / 4 + 2];
@@ -183,15 +252,9 @@ function __wbg_get_imports() {
183
  const ret = new Error(getStringFromWasm0(arg0, arg1));
184
  return addHeapObject(ret);
185
  };
186
- imports.wbg.__wbg_log_f448472545eafac4 = function(arg0, arg1) {
187
  console.log(getStringFromWasm0(arg0, arg1));
188
  };
189
- imports.wbg.__wbg_time_fa135a7c2786e907 = function(arg0, arg1) {
190
- console.time(getStringFromWasm0(arg0, arg1));
191
- };
192
- imports.wbg.__wbg_timeEnd_594d82f147c9776f = function(arg0, arg1) {
193
- console.timeEnd(getStringFromWasm0(arg0, arg1));
194
- };
195
  imports.wbg.__wbindgen_throw = function(arg0, arg1) {
196
  throw new Error(getStringFromWasm0(arg0, arg1));
197
  };
 
42
  return ptr;
43
  }
44
 
45
+ const cachedTextEncoder = (typeof TextEncoder !== 'undefined' ? new TextEncoder('utf-8') : { encode: () => { throw Error('TextEncoder not available') } } );
46
+
47
+ const encodeString = (typeof cachedTextEncoder.encodeInto === 'function'
48
+ ? function (arg, view) {
49
+ return cachedTextEncoder.encodeInto(arg, view);
50
+ }
51
+ : function (arg, view) {
52
+ const buf = cachedTextEncoder.encode(arg);
53
+ view.set(buf);
54
+ return {
55
+ read: arg.length,
56
+ written: buf.length
57
+ };
58
+ });
59
+
60
+ function passStringToWasm0(arg, malloc, realloc) {
61
+
62
+ if (realloc === undefined) {
63
+ const buf = cachedTextEncoder.encode(arg);
64
+ const ptr = malloc(buf.length, 1) >>> 0;
65
+ getUint8Memory0().subarray(ptr, ptr + buf.length).set(buf);
66
+ WASM_VECTOR_LEN = buf.length;
67
+ return ptr;
68
+ }
69
+
70
+ let len = arg.length;
71
+ let ptr = malloc(len, 1) >>> 0;
72
+
73
+ const mem = getUint8Memory0();
74
+
75
+ let offset = 0;
76
+
77
+ for (; offset < len; offset++) {
78
+ const code = arg.charCodeAt(offset);
79
+ if (code > 0x7F) break;
80
+ mem[ptr + offset] = code;
81
+ }
82
+
83
+ if (offset !== len) {
84
+ if (offset !== 0) {
85
+ arg = arg.slice(offset);
86
+ }
87
+ ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0;
88
+ const view = getUint8Memory0().subarray(ptr + offset, ptr + len);
89
+ const ret = encodeString(arg, view);
90
+
91
+ offset += ret.written;
92
+ }
93
+
94
+ WASM_VECTOR_LEN = offset;
95
+ return ptr;
96
+ }
97
+
98
+ function isLikeNone(x) {
99
+ return x === undefined || x === null;
100
+ }
101
+
102
  let cachedInt32Memory0 = null;
103
 
104
  function getInt32Memory0() {
 
148
  * @param {Uint8Array} weights
149
  * @param {Uint8Array} tokenizer
150
  * @param {Uint8Array} mel_filters
151
+ * @param {Uint8Array} config
152
+ * @param {boolean} quantized
153
+ * @param {boolean} is_multilingual
154
+ * @param {boolean} timestamps
155
+ * @param {string | undefined} task
156
+ * @param {string | undefined} language
157
  */
158
+ constructor(weights, tokenizer, mel_filters, config, quantized, is_multilingual, timestamps, task, language) {
159
  try {
160
  const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
161
  const ptr0 = passArray8ToWasm0(weights, wasm.__wbindgen_malloc);
 
164
  const len1 = WASM_VECTOR_LEN;
165
  const ptr2 = passArray8ToWasm0(mel_filters, wasm.__wbindgen_malloc);
166
  const len2 = WASM_VECTOR_LEN;
167
+ const ptr3 = passArray8ToWasm0(config, wasm.__wbindgen_malloc);
168
+ const len3 = WASM_VECTOR_LEN;
169
+ var ptr4 = isLikeNone(task) ? 0 : passStringToWasm0(task, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
170
+ var len4 = WASM_VECTOR_LEN;
171
+ var ptr5 = isLikeNone(language) ? 0 : passStringToWasm0(language, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
172
+ var len5 = WASM_VECTOR_LEN;
173
+ wasm.decoder_new(retptr, ptr0, len0, ptr1, len1, ptr2, len2, ptr3, len3, quantized, is_multilingual, timestamps, ptr4, len4, ptr5, len5);
174
  var r0 = getInt32Memory0()[retptr / 4 + 0];
175
  var r1 = getInt32Memory0()[retptr / 4 + 1];
176
  var r2 = getInt32Memory0()[retptr / 4 + 2];
 
252
  const ret = new Error(getStringFromWasm0(arg0, arg1));
253
  return addHeapObject(ret);
254
  };
255
+ imports.wbg.__wbg_log_0d9af0379e7a06b8 = function(arg0, arg1) {
256
  console.log(getStringFromWasm0(arg0, arg1));
257
  };
 
 
 
 
 
 
258
  imports.wbg.__wbindgen_throw = function(arg0, arg1) {
259
  throw new Error(getStringFromWasm0(arg0, arg1));
260
  };
build/m_bg.wasm CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa61030475868a6533b28628cd20d4d49c7a00f0e2a044c1f141a3d80f3d8a72
3
- size 3659953
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:393c1add1a180c1f0403cf5bb26db587ec59d19bec0c756b613f89b5e12fa512
3
+ size 4070269
build/m_bg.wasm.d.ts CHANGED
@@ -2,10 +2,11 @@
2
  /* eslint-disable */
3
  export const memory: WebAssembly.Memory;
4
  export function __wbg_decoder_free(a: number): void;
5
- export function decoder_new(a: number, b: number, c: number, d: number, e: number, f: number, g: number): void;
6
  export function decoder_decode(a: number, b: number, c: number, d: number): void;
7
  export function main(a: number, b: number): number;
8
  export function __wbindgen_add_to_stack_pointer(a: number): number;
9
  export function __wbindgen_malloc(a: number, b: number): number;
 
10
  export function __wbindgen_free(a: number, b: number, c: number): void;
11
  export function __wbindgen_start(): void;
 
2
  /* eslint-disable */
3
  export const memory: WebAssembly.Memory;
4
  export function __wbg_decoder_free(a: number): void;
5
+ export function decoder_new(a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number, j: number, k: number, l: number, m: number, n: number, o: number, p: number): void;
6
  export function decoder_decode(a: number, b: number, c: number, d: number): void;
7
  export function main(a: number, b: number): number;
8
  export function __wbindgen_add_to_stack_pointer(a: number): number;
9
  export function __wbindgen_malloc(a: number, b: number): number;
10
+ export function __wbindgen_realloc(a: number, b: number, c: number, d: number): number;
11
  export function __wbindgen_free(a: number, b: number, c: number): void;
12
  export function __wbindgen_start(): void;
index.html CHANGED
@@ -6,7 +6,7 @@
6
  <body></body>
7
  </html>
8
 
9
- <!doctype html>
10
  <html>
11
  <head>
12
  <meta charset="UTF-8" />
@@ -26,9 +26,30 @@
26
 
27
  // models base url
28
  const MODELS = {
 
 
 
 
 
 
29
  tiny_en: {
30
  base_url:
31
- "https://huggingface.co/openai/whisper-tiny.en/resolve/refs%2Fpr%2F17/",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  },
33
  };
34
  const whisperWorker = new Worker("./whisperWorker.js", {
@@ -39,6 +60,7 @@
39
  weightsURL, // URL to the weights file
40
  modelID, // model ID
41
  tokenizerURL, // URL to the tokenizer file
 
42
  mel_filtersURL, // URL to the mel filters file
43
  audioURL, // URL to the audio file
44
  updateStatus // function to update the status
@@ -48,21 +70,25 @@
48
  weightsURL,
49
  modelID,
50
  tokenizerURL,
 
51
  mel_filtersURL,
52
  audioURL,
53
  });
54
- whisperWorker.addEventListener("message", (event) => {
55
  console.log(event.data);
56
  if ("status" in event.data) {
57
  updateStatus(event.data);
58
  }
59
  if ("error" in event.data) {
 
60
  reject(new Error(event.data.error));
61
  }
62
  if (event.data.status === "complete") {
 
63
  resolve(event.data);
64
  }
65
- });
 
66
  });
67
  }
68
 
@@ -125,13 +151,16 @@
125
  return;
126
  }
127
  const modelID = document.querySelector("#model").value;
128
- const modelURL = MODELS[modelID].base_url + "model.safetensors";
129
- const tokenizerURL = MODELS[modelID].base_url + "tokenizer.json";
 
 
130
 
131
  classifyAudio(
132
  modelURL,
133
  modelID,
134
  tokenizerURL,
 
135
  "mel_filters.safetensors",
136
  audioURL,
137
  updateStatus
@@ -175,8 +204,7 @@
175
  <a
176
  href="https://huggingface.co/openai/"
177
  target="_blank"
178
- class="underline hover:text-blue-500 hover:no-underline"
179
- >
180
  OpenAI Whisper models
181
  </a>
182
  and WASM runtime built with
@@ -193,37 +221,38 @@
193
  <label for="model" class="font-medium">Models Options: </label>
194
  <select
195
  id="model"
196
- class="border-2 border-gray-500 rounded-md font-light"
197
- >
198
  <option value="tiny_en" selected>tiny.en (151 MB)</option>
 
 
 
 
 
 
199
  </select>
200
  </div>
201
  <!-- drag and drop area -->
202
  <div class="relative">
203
  <div
204
  id="drop-area"
205
- class="flex flex-col items-center justify-center border-2 border-gray-300 border-dashed rounded-xl relative h-48 w-full overflow-hidden"
206
- >
207
  <div
208
- class="flex flex-col items-center justify-center space-y-1 text-center"
209
- >
210
  <svg
211
  width="25"
212
  height="25"
213
  viewBox="0 0 25 25"
214
  fill="none"
215
- xmlns="http://www.w3.org/2000/svg"
216
- >
217
  <path
218
  d="M3.5 24.3a3 3 0 0 1-1.9-.8c-.5-.5-.8-1.2-.8-1.9V2.9c0-.7.3-1.3.8-1.9.6-.5 1.2-.7 2-.7h18.6c.7 0 1.3.2 1.9.7.5.6.7 1.2.7 2v18.6c0 .7-.2 1.4-.7 1.9a3 3 0 0 1-2 .8H3.6Zm0-2.7h18.7V2.9H3.5v18.7Zm2.7-2.7h13.3c.3 0 .5 0 .6-.3v-.7l-3.7-5a.6.6 0 0 0-.6-.2c-.2 0-.4 0-.5.3l-3.5 4.6-2.4-3.3a.6.6 0 0 0-.6-.3c-.2 0-.4.1-.5.3l-2.7 3.6c-.1.2-.2.4 0 .7.1.2.3.3.6.3Z"
219
- fill="#000"
220
- />
221
  </svg>
222
  <div class="flex text-sm text-gray-600">
223
  <label
224
  for="file-upload"
225
- class="relative cursor-pointer bg-white rounded-md font-medium text-blue-950 hover:text-blue-700"
226
- >
227
  <span>Drag and drop your audio here</span>
228
  <span class="block text-xs">or</span>
229
  <span class="block text-xs">Click to upload</span>
@@ -234,15 +263,13 @@
234
  name="file-upload"
235
  type="file"
236
  accept="audio/*"
237
- class="sr-only"
238
- />
239
  </div>
240
  <audio
241
  id="audio"
242
  hidden
243
  controls
244
- class="w-full p-2 select-none"
245
- ></audio>
246
  </div>
247
  </div>
248
  <div>
@@ -250,43 +277,37 @@
250
  <h3 class="font-medium">Examples:</h3>
251
  <button
252
  data-value="samples_jfk.wav"
253
- class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
254
- >
255
  <span>jfk.wav</span>
256
  <span class="text-xs block"> (352 kB)</span>
257
  </button>
258
  <button
259
  data-value="samples_a13.wav"
260
- class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
261
- >
262
  <span>a13.wav</span>
263
  <span class="text-xs block"> (960 kB)</span>
264
  </button>
265
  <button
266
  data-value="samples_mm0.wav"
267
- class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
268
- >
269
  <span>mm0.wav</span>
270
  <span class="text-xs block new"> (957 kB)</span>
271
  </button>
272
  <button
273
  data-value="samples_gb0.wav"
274
- class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
275
- >
276
  <span>gb0.wav </span>
277
  <span class="text-xs block">(4.08 MB)</span>
278
  </button>
279
  <button
280
  data-value="samples_gb1.wav"
281
- class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
282
- >
283
  <span>gb1.wav </span>
284
  <span class="text-xs block">(6.36 MB)</span>
285
  </button>
286
  <button
287
  data-value="samples_hp0.wav"
288
- class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
289
- >
290
  <span>hp0.wav </span>
291
  <span class="text-xs block">(8.75 MB)</span>
292
  </button>
@@ -297,16 +318,14 @@
297
  <button
298
  id="detect"
299
  disabled
300
- class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded disabled:bg-gray-300 disabled:cursor-not-allowed"
301
- >
302
  Transcribe Audio
303
  </button>
304
  </div>
305
  <div>
306
  <h3 class="font-medium">Transcription:</h3>
307
  <div
308
- class="min-h-[250px] bg-slate-100 text-gray-500 p-4 rounded-md flex flex-col gap-2"
309
- >
310
  <p hidden id="output-generation" class="grid-rows-2"></p>
311
  <span id="output-status" class="m-auto font-light"
312
  >No transcription results yet</span
 
6
  <body></body>
7
  </html>
8
 
9
+ <!DOCTYPE html>
10
  <html>
11
  <head>
12
  <meta charset="UTF-8" />
 
26
 
27
  // models base url
28
  const MODELS = {
29
+ tiny_multilingual: {
30
+ base_url: "https://huggingface.co/openai/whisper-tiny/resolve/main/",
31
+ model: "model.safetensors",
32
+ tokenizer: "tokenizer.json",
33
+ config: "config.json",
34
+ },
35
  tiny_en: {
36
  base_url:
37
+ "https://huggingface.co/openai/whisper-tiny.en/resolve/main/",
38
+ model: "model.safetensors",
39
+ tokenizer: "tokenizer.json",
40
+ config: "config.json",
41
+ },
42
+ tiny_quantized_multilingual_q80: {
43
+ base_url: "https://huggingface.co/lmz/candle-whisper/resolve/main/",
44
+ model: "model-tiny-q80.gguf",
45
+ tokenizer: "tokenizer-tiny.json",
46
+ config: "config-tiny.json",
47
+ },
48
+ tiny_en_quantized_q80: {
49
+ base_url: "https://huggingface.co/lmz/candle-whisper/resolve/main/",
50
+ model: "model-tiny-q80.gguf",
51
+ tokenizer: "tokenizer-tiny-en.json",
52
+ config: "config-tiny-en.json",
53
  },
54
  };
55
  const whisperWorker = new Worker("./whisperWorker.js", {
 
60
  weightsURL, // URL to the weights file
61
  modelID, // model ID
62
  tokenizerURL, // URL to the tokenizer file
63
+ configURL, // model config URL
64
  mel_filtersURL, // URL to the mel filters file
65
  audioURL, // URL to the audio file
66
  updateStatus // function to update the status
 
70
  weightsURL,
71
  modelID,
72
  tokenizerURL,
73
+ configURL,
74
  mel_filtersURL,
75
  audioURL,
76
  });
77
+ function messageHandler(event) {
78
  console.log(event.data);
79
  if ("status" in event.data) {
80
  updateStatus(event.data);
81
  }
82
  if ("error" in event.data) {
83
+ whisperWorker.removeEventListener("message", messageHandler);
84
  reject(new Error(event.data.error));
85
  }
86
  if (event.data.status === "complete") {
87
+ whisperWorker.removeEventListener("message", messageHandler);
88
  resolve(event.data);
89
  }
90
+ }
91
+ whisperWorker.addEventListener("message", messageHandler);
92
  });
93
  }
94
 
 
151
  return;
152
  }
153
  const modelID = document.querySelector("#model").value;
154
+ const model = MODELS[modelID];
155
+ const modelURL = model.base_url + model.model;
156
+ const tokenizerURL = model.base_url + model.tokenizer;
157
+ const configURL = model.base_url + model.config;
158
 
159
  classifyAudio(
160
  modelURL,
161
  modelID,
162
  tokenizerURL,
163
+ configURL,
164
  "mel_filters.safetensors",
165
  audioURL,
166
  updateStatus
 
204
  <a
205
  href="https://huggingface.co/openai/"
206
  target="_blank"
207
+ class="underline hover:text-blue-500 hover:no-underline">
 
208
  OpenAI Whisper models
209
  </a>
210
  and WASM runtime built with
 
221
  <label for="model" class="font-medium">Models Options: </label>
222
  <select
223
  id="model"
224
+ class="border-2 border-gray-500 rounded-md font-light">
225
+ <option value="tiny_multilingual" selected>tiny.en (151 MB)</option>
226
  <option value="tiny_en" selected>tiny.en (151 MB)</option>
227
+ <option value="tiny_quantized_multilingual_q80">
228
+ tiny quantized q80 (41.5 MB)
229
+ </option>
230
+ <option value="tiny_en_quantized_q80">
231
+ tiny.en quantized q80 (41.8 MB)
232
+ </option>
233
  </select>
234
  </div>
235
  <!-- drag and drop area -->
236
  <div class="relative">
237
  <div
238
  id="drop-area"
239
+ class="flex flex-col items-center justify-center border-2 border-gray-300 border-dashed rounded-xl relative h-48 w-full overflow-hidden">
 
240
  <div
241
+ class="flex flex-col items-center justify-center space-y-1 text-center">
 
242
  <svg
243
  width="25"
244
  height="25"
245
  viewBox="0 0 25 25"
246
  fill="none"
247
+ xmlns="http://www.w3.org/2000/svg">
 
248
  <path
249
  d="M3.5 24.3a3 3 0 0 1-1.9-.8c-.5-.5-.8-1.2-.8-1.9V2.9c0-.7.3-1.3.8-1.9.6-.5 1.2-.7 2-.7h18.6c.7 0 1.3.2 1.9.7.5.6.7 1.2.7 2v18.6c0 .7-.2 1.4-.7 1.9a3 3 0 0 1-2 .8H3.6Zm0-2.7h18.7V2.9H3.5v18.7Zm2.7-2.7h13.3c.3 0 .5 0 .6-.3v-.7l-3.7-5a.6.6 0 0 0-.6-.2c-.2 0-.4 0-.5.3l-3.5 4.6-2.4-3.3a.6.6 0 0 0-.6-.3c-.2 0-.4.1-.5.3l-2.7 3.6c-.1.2-.2.4 0 .7.1.2.3.3.6.3Z"
250
+ fill="#000" />
 
251
  </svg>
252
  <div class="flex text-sm text-gray-600">
253
  <label
254
  for="file-upload"
255
+ class="relative cursor-pointer bg-white rounded-md font-medium text-blue-950 hover:text-blue-700">
 
256
  <span>Drag and drop your audio here</span>
257
  <span class="block text-xs">or</span>
258
  <span class="block text-xs">Click to upload</span>
 
263
  name="file-upload"
264
  type="file"
265
  accept="audio/*"
266
+ class="sr-only" />
 
267
  </div>
268
  <audio
269
  id="audio"
270
  hidden
271
  controls
272
+ class="w-full p-2 select-none"></audio>
 
273
  </div>
274
  </div>
275
  <div>
 
277
  <h3 class="font-medium">Examples:</h3>
278
  <button
279
  data-value="samples_jfk.wav"
280
+ class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline">
 
281
  <span>jfk.wav</span>
282
  <span class="text-xs block"> (352 kB)</span>
283
  </button>
284
  <button
285
  data-value="samples_a13.wav"
286
+ class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline">
 
287
  <span>a13.wav</span>
288
  <span class="text-xs block"> (960 kB)</span>
289
  </button>
290
  <button
291
  data-value="samples_mm0.wav"
292
+ class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline">
 
293
  <span>mm0.wav</span>
294
  <span class="text-xs block new"> (957 kB)</span>
295
  </button>
296
  <button
297
  data-value="samples_gb0.wav"
298
+ class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline">
 
299
  <span>gb0.wav </span>
300
  <span class="text-xs block">(4.08 MB)</span>
301
  </button>
302
  <button
303
  data-value="samples_gb1.wav"
304
+ class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline">
 
305
  <span>gb1.wav </span>
306
  <span class="text-xs block">(6.36 MB)</span>
307
  </button>
308
  <button
309
  data-value="samples_hp0.wav"
310
+ class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline">
 
311
  <span>hp0.wav </span>
312
  <span class="text-xs block">(8.75 MB)</span>
313
  </button>
 
318
  <button
319
  id="detect"
320
  disabled
321
+ class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded disabled:bg-gray-300 disabled:cursor-not-allowed">
 
322
  Transcribe Audio
323
  </button>
324
  </div>
325
  <div>
326
  <h3 class="font-medium">Transcription:</h3>
327
  <div
328
+ class="min-h-[250px] bg-slate-100 text-gray-500 p-4 rounded-md flex flex-col gap-2">
 
329
  <p hidden id="output-generation" class="grid-rows-2"></p>
330
  <span id="output-status" class="m-auto font-light"
331
  >No transcription results yet</span
whisperWorker.js CHANGED
@@ -17,23 +17,46 @@ class Whisper {
17
  static instance = {};
18
  // Retrieve the Whisper model. When called for the first time,
19
  // this will load the model and save it for future use.
20
- static async getInstance(weightsURL, modelID, tokenizerURL, mel_filtersURL) {
 
 
 
 
 
 
 
 
 
 
 
 
21
  // load individual modelID only once
22
  if (!this.instance[modelID]) {
23
  await init();
24
 
25
  self.postMessage({ status: "loading", message: "Loading Model" });
26
- const [weightsArrayU8, tokenizerArrayU8, mel_filtersArrayU8] =
27
- await Promise.all([
28
- fetchArrayBuffer(weightsURL),
29
- fetchArrayBuffer(tokenizerURL),
30
- fetchArrayBuffer(mel_filtersURL),
31
- ]);
 
 
 
 
 
32
 
33
  this.instance[modelID] = new Decoder(
34
  weightsArrayU8,
35
  tokenizerArrayU8,
36
- mel_filtersArrayU8
 
 
 
 
 
 
37
  );
38
  } else {
39
  self.postMessage({ status: "loading", message: "Model Already Loaded" });
@@ -43,17 +66,37 @@ class Whisper {
43
  }
44
 
45
  self.addEventListener("message", async (event) => {
46
- const { weightsURL, modelID, tokenizerURL, mel_filtersURL, audioURL } =
47
- event.data;
 
 
 
 
 
 
48
  try {
49
  self.postMessage({ status: "decoding", message: "Starting Decoder" });
50
-
51
- const decoder = await Whisper.getInstance(
 
 
 
 
 
 
 
 
52
  weightsURL,
53
  modelID,
54
  tokenizerURL,
55
- mel_filtersURL
56
- );
 
 
 
 
 
 
57
 
58
  self.postMessage({ status: "decoding", message: "Loading Audio" });
59
  const audioArrayU8 = await fetchArrayBuffer(audioURL);
 
17
  static instance = {};
18
  // Retrieve the Whisper model. When called for the first time,
19
  // this will load the model and save it for future use.
20
+ static async getInstance(params) {
21
+ const {
22
+ weightsURL,
23
+ modelID,
24
+ tokenizerURL,
25
+ mel_filtersURL,
26
+ configURL,
27
+ quantized,
28
+ is_multilingual,
29
+ timestamps,
30
+ task,
31
+ language,
32
+ } = params;
33
  // load individual modelID only once
34
  if (!this.instance[modelID]) {
35
  await init();
36
 
37
  self.postMessage({ status: "loading", message: "Loading Model" });
38
+ const [
39
+ weightsArrayU8,
40
+ tokenizerArrayU8,
41
+ mel_filtersArrayU8,
42
+ configArrayU8,
43
+ ] = await Promise.all([
44
+ fetchArrayBuffer(weightsURL),
45
+ fetchArrayBuffer(tokenizerURL),
46
+ fetchArrayBuffer(mel_filtersURL),
47
+ fetchArrayBuffer(configURL),
48
+ ]);
49
 
50
  this.instance[modelID] = new Decoder(
51
  weightsArrayU8,
52
  tokenizerArrayU8,
53
+ mel_filtersArrayU8,
54
+ configArrayU8,
55
+ quantized,
56
+ is_multilingual,
57
+ timestamps,
58
+ task,
59
+ language
60
  );
61
  } else {
62
  self.postMessage({ status: "loading", message: "Model Already Loaded" });
 
66
  }
67
 
68
  self.addEventListener("message", async (event) => {
69
+ const {
70
+ weightsURL,
71
+ modelID,
72
+ tokenizerURL,
73
+ configURL,
74
+ mel_filtersURL,
75
+ audioURL,
76
+ } = event.data;
77
  try {
78
  self.postMessage({ status: "decoding", message: "Starting Decoder" });
79
+ let quantized = false;
80
+ if (modelID.includes("quantized")) {
81
+ quantized = true;
82
+ }
83
+ let is_multilingual = false;
84
+ if (modelID.includes("multilingual")) {
85
+ is_multilingual = true;
86
+ }
87
+ let timestamps = true;
88
+ const decoder = await Whisper.getInstance({
89
  weightsURL,
90
  modelID,
91
  tokenizerURL,
92
+ mel_filtersURL,
93
+ configURL,
94
+ quantized,
95
+ is_multilingual,
96
+ timestamps,
97
+ task: null,
98
+ language: null,
99
+ });
100
 
101
  self.postMessage({ status: "decoding", message: "Loading Audio" });
102
  const audioArrayU8 = await fetchArrayBuffer(audioURL);