Spaces:
Running
Running
add quantized models support (#4)
Browse files
- add quantized models support (e5f217ff864b7c048b430487ac42ad6e0ca23095)
Co-authored-by: Radamés Ajna <[email protected]>
- build/m.d.ts +9 -2
- build/m.js +72 -9
- build/m_bg.wasm +2 -2
- build/m_bg.wasm.d.ts +2 -1
- index.html +59 -40
- whisperWorker.js +57 -14
build/m.d.ts
CHANGED
@@ -8,8 +8,14 @@ export class Decoder {
|
|
8 |
* @param {Uint8Array} weights
|
9 |
* @param {Uint8Array} tokenizer
|
10 |
* @param {Uint8Array} mel_filters
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
*/
|
12 |
-
constructor(weights: Uint8Array, tokenizer: Uint8Array, mel_filters: Uint8Array);
|
13 |
/**
|
14 |
* @param {Uint8Array} wav_input
|
15 |
* @returns {string}
|
@@ -22,11 +28,12 @@ export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembl
|
|
22 |
export interface InitOutput {
|
23 |
readonly memory: WebAssembly.Memory;
|
24 |
readonly __wbg_decoder_free: (a: number) => void;
|
25 |
-
readonly decoder_new: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => void;
|
26 |
readonly decoder_decode: (a: number, b: number, c: number, d: number) => void;
|
27 |
readonly main: (a: number, b: number) => number;
|
28 |
readonly __wbindgen_add_to_stack_pointer: (a: number) => number;
|
29 |
readonly __wbindgen_malloc: (a: number, b: number) => number;
|
|
|
30 |
readonly __wbindgen_free: (a: number, b: number, c: number) => void;
|
31 |
readonly __wbindgen_start: () => void;
|
32 |
}
|
|
|
8 |
* @param {Uint8Array} weights
|
9 |
* @param {Uint8Array} tokenizer
|
10 |
* @param {Uint8Array} mel_filters
|
11 |
+
* @param {Uint8Array} config
|
12 |
+
* @param {boolean} quantized
|
13 |
+
* @param {boolean} is_multilingual
|
14 |
+
* @param {boolean} timestamps
|
15 |
+
* @param {string | undefined} task
|
16 |
+
* @param {string | undefined} language
|
17 |
*/
|
18 |
+
constructor(weights: Uint8Array, tokenizer: Uint8Array, mel_filters: Uint8Array, config: Uint8Array, quantized: boolean, is_multilingual: boolean, timestamps: boolean, task?: string, language?: string);
|
19 |
/**
|
20 |
* @param {Uint8Array} wav_input
|
21 |
* @returns {string}
|
|
|
28 |
export interface InitOutput {
|
29 |
readonly memory: WebAssembly.Memory;
|
30 |
readonly __wbg_decoder_free: (a: number) => void;
|
31 |
+
readonly decoder_new: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number, j: number, k: number, l: number, m: number, n: number, o: number, p: number) => void;
|
32 |
readonly decoder_decode: (a: number, b: number, c: number, d: number) => void;
|
33 |
readonly main: (a: number, b: number) => number;
|
34 |
readonly __wbindgen_add_to_stack_pointer: (a: number) => number;
|
35 |
readonly __wbindgen_malloc: (a: number, b: number) => number;
|
36 |
+
readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
|
37 |
readonly __wbindgen_free: (a: number, b: number, c: number) => void;
|
38 |
readonly __wbindgen_start: () => void;
|
39 |
}
|
build/m.js
CHANGED
@@ -42,6 +42,63 @@ function passArray8ToWasm0(arg, malloc) {
|
|
42 |
return ptr;
|
43 |
}
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
let cachedInt32Memory0 = null;
|
46 |
|
47 |
function getInt32Memory0() {
|
@@ -91,8 +148,14 @@ export class Decoder {
|
|
91 |
* @param {Uint8Array} weights
|
92 |
* @param {Uint8Array} tokenizer
|
93 |
* @param {Uint8Array} mel_filters
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
*/
|
95 |
-
constructor(weights, tokenizer, mel_filters) {
|
96 |
try {
|
97 |
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
98 |
const ptr0 = passArray8ToWasm0(weights, wasm.__wbindgen_malloc);
|
@@ -101,7 +164,13 @@ export class Decoder {
|
|
101 |
const len1 = WASM_VECTOR_LEN;
|
102 |
const ptr2 = passArray8ToWasm0(mel_filters, wasm.__wbindgen_malloc);
|
103 |
const len2 = WASM_VECTOR_LEN;
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
var r0 = getInt32Memory0()[retptr / 4 + 0];
|
106 |
var r1 = getInt32Memory0()[retptr / 4 + 1];
|
107 |
var r2 = getInt32Memory0()[retptr / 4 + 2];
|
@@ -183,15 +252,9 @@ function __wbg_get_imports() {
|
|
183 |
const ret = new Error(getStringFromWasm0(arg0, arg1));
|
184 |
return addHeapObject(ret);
|
185 |
};
|
186 |
-
imports.wbg.
|
187 |
console.log(getStringFromWasm0(arg0, arg1));
|
188 |
};
|
189 |
-
imports.wbg.__wbg_time_fa135a7c2786e907 = function(arg0, arg1) {
|
190 |
-
console.time(getStringFromWasm0(arg0, arg1));
|
191 |
-
};
|
192 |
-
imports.wbg.__wbg_timeEnd_594d82f147c9776f = function(arg0, arg1) {
|
193 |
-
console.timeEnd(getStringFromWasm0(arg0, arg1));
|
194 |
-
};
|
195 |
imports.wbg.__wbindgen_throw = function(arg0, arg1) {
|
196 |
throw new Error(getStringFromWasm0(arg0, arg1));
|
197 |
};
|
|
|
42 |
return ptr;
|
43 |
}
|
44 |
|
45 |
+
const cachedTextEncoder = (typeof TextEncoder !== 'undefined' ? new TextEncoder('utf-8') : { encode: () => { throw Error('TextEncoder not available') } } );
|
46 |
+
|
47 |
+
const encodeString = (typeof cachedTextEncoder.encodeInto === 'function'
|
48 |
+
? function (arg, view) {
|
49 |
+
return cachedTextEncoder.encodeInto(arg, view);
|
50 |
+
}
|
51 |
+
: function (arg, view) {
|
52 |
+
const buf = cachedTextEncoder.encode(arg);
|
53 |
+
view.set(buf);
|
54 |
+
return {
|
55 |
+
read: arg.length,
|
56 |
+
written: buf.length
|
57 |
+
};
|
58 |
+
});
|
59 |
+
|
60 |
+
function passStringToWasm0(arg, malloc, realloc) {
|
61 |
+
|
62 |
+
if (realloc === undefined) {
|
63 |
+
const buf = cachedTextEncoder.encode(arg);
|
64 |
+
const ptr = malloc(buf.length, 1) >>> 0;
|
65 |
+
getUint8Memory0().subarray(ptr, ptr + buf.length).set(buf);
|
66 |
+
WASM_VECTOR_LEN = buf.length;
|
67 |
+
return ptr;
|
68 |
+
}
|
69 |
+
|
70 |
+
let len = arg.length;
|
71 |
+
let ptr = malloc(len, 1) >>> 0;
|
72 |
+
|
73 |
+
const mem = getUint8Memory0();
|
74 |
+
|
75 |
+
let offset = 0;
|
76 |
+
|
77 |
+
for (; offset < len; offset++) {
|
78 |
+
const code = arg.charCodeAt(offset);
|
79 |
+
if (code > 0x7F) break;
|
80 |
+
mem[ptr + offset] = code;
|
81 |
+
}
|
82 |
+
|
83 |
+
if (offset !== len) {
|
84 |
+
if (offset !== 0) {
|
85 |
+
arg = arg.slice(offset);
|
86 |
+
}
|
87 |
+
ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0;
|
88 |
+
const view = getUint8Memory0().subarray(ptr + offset, ptr + len);
|
89 |
+
const ret = encodeString(arg, view);
|
90 |
+
|
91 |
+
offset += ret.written;
|
92 |
+
}
|
93 |
+
|
94 |
+
WASM_VECTOR_LEN = offset;
|
95 |
+
return ptr;
|
96 |
+
}
|
97 |
+
|
98 |
+
function isLikeNone(x) {
|
99 |
+
return x === undefined || x === null;
|
100 |
+
}
|
101 |
+
|
102 |
let cachedInt32Memory0 = null;
|
103 |
|
104 |
function getInt32Memory0() {
|
|
|
148 |
* @param {Uint8Array} weights
|
149 |
* @param {Uint8Array} tokenizer
|
150 |
* @param {Uint8Array} mel_filters
|
151 |
+
* @param {Uint8Array} config
|
152 |
+
* @param {boolean} quantized
|
153 |
+
* @param {boolean} is_multilingual
|
154 |
+
* @param {boolean} timestamps
|
155 |
+
* @param {string | undefined} task
|
156 |
+
* @param {string | undefined} language
|
157 |
*/
|
158 |
+
constructor(weights, tokenizer, mel_filters, config, quantized, is_multilingual, timestamps, task, language) {
|
159 |
try {
|
160 |
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
161 |
const ptr0 = passArray8ToWasm0(weights, wasm.__wbindgen_malloc);
|
|
|
164 |
const len1 = WASM_VECTOR_LEN;
|
165 |
const ptr2 = passArray8ToWasm0(mel_filters, wasm.__wbindgen_malloc);
|
166 |
const len2 = WASM_VECTOR_LEN;
|
167 |
+
const ptr3 = passArray8ToWasm0(config, wasm.__wbindgen_malloc);
|
168 |
+
const len3 = WASM_VECTOR_LEN;
|
169 |
+
var ptr4 = isLikeNone(task) ? 0 : passStringToWasm0(task, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
|
170 |
+
var len4 = WASM_VECTOR_LEN;
|
171 |
+
var ptr5 = isLikeNone(language) ? 0 : passStringToWasm0(language, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
|
172 |
+
var len5 = WASM_VECTOR_LEN;
|
173 |
+
wasm.decoder_new(retptr, ptr0, len0, ptr1, len1, ptr2, len2, ptr3, len3, quantized, is_multilingual, timestamps, ptr4, len4, ptr5, len5);
|
174 |
var r0 = getInt32Memory0()[retptr / 4 + 0];
|
175 |
var r1 = getInt32Memory0()[retptr / 4 + 1];
|
176 |
var r2 = getInt32Memory0()[retptr / 4 + 2];
|
|
|
252 |
const ret = new Error(getStringFromWasm0(arg0, arg1));
|
253 |
return addHeapObject(ret);
|
254 |
};
|
255 |
+
imports.wbg.__wbg_log_0d9af0379e7a06b8 = function(arg0, arg1) {
|
256 |
console.log(getStringFromWasm0(arg0, arg1));
|
257 |
};
|
|
|
|
|
|
|
|
|
|
|
|
|
258 |
imports.wbg.__wbindgen_throw = function(arg0, arg1) {
|
259 |
throw new Error(getStringFromWasm0(arg0, arg1));
|
260 |
};
|
build/m_bg.wasm
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:393c1add1a180c1f0403cf5bb26db587ec59d19bec0c756b613f89b5e12fa512
|
3 |
+
size 4070269
|
build/m_bg.wasm.d.ts
CHANGED
@@ -2,10 +2,11 @@
|
|
2 |
/* eslint-disable */
|
3 |
export const memory: WebAssembly.Memory;
|
4 |
export function __wbg_decoder_free(a: number): void;
|
5 |
-
export function decoder_new(a: number, b: number, c: number, d: number, e: number, f: number, g: number): void;
|
6 |
export function decoder_decode(a: number, b: number, c: number, d: number): void;
|
7 |
export function main(a: number, b: number): number;
|
8 |
export function __wbindgen_add_to_stack_pointer(a: number): number;
|
9 |
export function __wbindgen_malloc(a: number, b: number): number;
|
|
|
10 |
export function __wbindgen_free(a: number, b: number, c: number): void;
|
11 |
export function __wbindgen_start(): void;
|
|
|
2 |
/* eslint-disable */
|
3 |
export const memory: WebAssembly.Memory;
|
4 |
export function __wbg_decoder_free(a: number): void;
|
5 |
+
export function decoder_new(a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number, i: number, j: number, k: number, l: number, m: number, n: number, o: number, p: number): void;
|
6 |
export function decoder_decode(a: number, b: number, c: number, d: number): void;
|
7 |
export function main(a: number, b: number): number;
|
8 |
export function __wbindgen_add_to_stack_pointer(a: number): number;
|
9 |
export function __wbindgen_malloc(a: number, b: number): number;
|
10 |
+
export function __wbindgen_realloc(a: number, b: number, c: number, d: number): number;
|
11 |
export function __wbindgen_free(a: number, b: number, c: number): void;
|
12 |
export function __wbindgen_start(): void;
|
index.html
CHANGED
@@ -6,7 +6,7 @@
|
|
6 |
<body></body>
|
7 |
</html>
|
8 |
|
9 |
-
<!
|
10 |
<html>
|
11 |
<head>
|
12 |
<meta charset="UTF-8" />
|
@@ -26,9 +26,30 @@
|
|
26 |
|
27 |
// models base url
|
28 |
const MODELS = {
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
tiny_en: {
|
30 |
base_url:
|
31 |
-
"https://huggingface.co/openai/whisper-tiny.en/resolve/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
},
|
33 |
};
|
34 |
const whisperWorker = new Worker("./whisperWorker.js", {
|
@@ -39,6 +60,7 @@
|
|
39 |
weightsURL, // URL to the weights file
|
40 |
modelID, // model ID
|
41 |
tokenizerURL, // URL to the tokenizer file
|
|
|
42 |
mel_filtersURL, // URL to the mel filters file
|
43 |
audioURL, // URL to the audio file
|
44 |
updateStatus // function to update the status
|
@@ -48,21 +70,25 @@
|
|
48 |
weightsURL,
|
49 |
modelID,
|
50 |
tokenizerURL,
|
|
|
51 |
mel_filtersURL,
|
52 |
audioURL,
|
53 |
});
|
54 |
-
|
55 |
console.log(event.data);
|
56 |
if ("status" in event.data) {
|
57 |
updateStatus(event.data);
|
58 |
}
|
59 |
if ("error" in event.data) {
|
|
|
60 |
reject(new Error(event.data.error));
|
61 |
}
|
62 |
if (event.data.status === "complete") {
|
|
|
63 |
resolve(event.data);
|
64 |
}
|
65 |
-
}
|
|
|
66 |
});
|
67 |
}
|
68 |
|
@@ -125,13 +151,16 @@
|
|
125 |
return;
|
126 |
}
|
127 |
const modelID = document.querySelector("#model").value;
|
128 |
-
const
|
129 |
-
const
|
|
|
|
|
130 |
|
131 |
classifyAudio(
|
132 |
modelURL,
|
133 |
modelID,
|
134 |
tokenizerURL,
|
|
|
135 |
"mel_filters.safetensors",
|
136 |
audioURL,
|
137 |
updateStatus
|
@@ -175,8 +204,7 @@
|
|
175 |
<a
|
176 |
href="https://huggingface.co/openai/"
|
177 |
target="_blank"
|
178 |
-
class="underline hover:text-blue-500 hover:no-underline"
|
179 |
-
>
|
180 |
OpenAI Whisper models
|
181 |
</a>
|
182 |
and WASM runtime built with
|
@@ -193,37 +221,38 @@
|
|
193 |
<label for="model" class="font-medium">Models Options: </label>
|
194 |
<select
|
195 |
id="model"
|
196 |
-
class="border-2 border-gray-500 rounded-md font-light"
|
197 |
-
|
198 |
<option value="tiny_en" selected>tiny.en (151 MB)</option>
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
</select>
|
200 |
</div>
|
201 |
<!-- drag and drop area -->
|
202 |
<div class="relative">
|
203 |
<div
|
204 |
id="drop-area"
|
205 |
-
class="flex flex-col items-center justify-center border-2 border-gray-300 border-dashed rounded-xl relative h-48 w-full overflow-hidden"
|
206 |
-
>
|
207 |
<div
|
208 |
-
class="flex flex-col items-center justify-center space-y-1 text-center"
|
209 |
-
>
|
210 |
<svg
|
211 |
width="25"
|
212 |
height="25"
|
213 |
viewBox="0 0 25 25"
|
214 |
fill="none"
|
215 |
-
xmlns="http://www.w3.org/2000/svg"
|
216 |
-
>
|
217 |
<path
|
218 |
d="M3.5 24.3a3 3 0 0 1-1.9-.8c-.5-.5-.8-1.2-.8-1.9V2.9c0-.7.3-1.3.8-1.9.6-.5 1.2-.7 2-.7h18.6c.7 0 1.3.2 1.9.7.5.6.7 1.2.7 2v18.6c0 .7-.2 1.4-.7 1.9a3 3 0 0 1-2 .8H3.6Zm0-2.7h18.7V2.9H3.5v18.7Zm2.7-2.7h13.3c.3 0 .5 0 .6-.3v-.7l-3.7-5a.6.6 0 0 0-.6-.2c-.2 0-.4 0-.5.3l-3.5 4.6-2.4-3.3a.6.6 0 0 0-.6-.3c-.2 0-.4.1-.5.3l-2.7 3.6c-.1.2-.2.4 0 .7.1.2.3.3.6.3Z"
|
219 |
-
fill="#000"
|
220 |
-
/>
|
221 |
</svg>
|
222 |
<div class="flex text-sm text-gray-600">
|
223 |
<label
|
224 |
for="file-upload"
|
225 |
-
class="relative cursor-pointer bg-white rounded-md font-medium text-blue-950 hover:text-blue-700"
|
226 |
-
>
|
227 |
<span>Drag and drop your audio here</span>
|
228 |
<span class="block text-xs">or</span>
|
229 |
<span class="block text-xs">Click to upload</span>
|
@@ -234,15 +263,13 @@
|
|
234 |
name="file-upload"
|
235 |
type="file"
|
236 |
accept="audio/*"
|
237 |
-
class="sr-only"
|
238 |
-
/>
|
239 |
</div>
|
240 |
<audio
|
241 |
id="audio"
|
242 |
hidden
|
243 |
controls
|
244 |
-
class="w-full p-2 select-none"
|
245 |
-
></audio>
|
246 |
</div>
|
247 |
</div>
|
248 |
<div>
|
@@ -250,43 +277,37 @@
|
|
250 |
<h3 class="font-medium">Examples:</h3>
|
251 |
<button
|
252 |
data-value="samples_jfk.wav"
|
253 |
-
class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
|
254 |
-
>
|
255 |
<span>jfk.wav</span>
|
256 |
<span class="text-xs block"> (352 kB)</span>
|
257 |
</button>
|
258 |
<button
|
259 |
data-value="samples_a13.wav"
|
260 |
-
class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
|
261 |
-
>
|
262 |
<span>a13.wav</span>
|
263 |
<span class="text-xs block"> (960 kB)</span>
|
264 |
</button>
|
265 |
<button
|
266 |
data-value="samples_mm0.wav"
|
267 |
-
class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
|
268 |
-
>
|
269 |
<span>mm0.wav</span>
|
270 |
<span class="text-xs block new"> (957 kB)</span>
|
271 |
</button>
|
272 |
<button
|
273 |
data-value="samples_gb0.wav"
|
274 |
-
class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
|
275 |
-
>
|
276 |
<span>gb0.wav </span>
|
277 |
<span class="text-xs block">(4.08 MB)</span>
|
278 |
</button>
|
279 |
<button
|
280 |
data-value="samples_gb1.wav"
|
281 |
-
class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
|
282 |
-
>
|
283 |
<span>gb1.wav </span>
|
284 |
<span class="text-xs block">(6.36 MB)</span>
|
285 |
</button>
|
286 |
<button
|
287 |
data-value="samples_hp0.wav"
|
288 |
-
class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
|
289 |
-
>
|
290 |
<span>hp0.wav </span>
|
291 |
<span class="text-xs block">(8.75 MB)</span>
|
292 |
</button>
|
@@ -297,16 +318,14 @@
|
|
297 |
<button
|
298 |
id="detect"
|
299 |
disabled
|
300 |
-
class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded disabled:bg-gray-300 disabled:cursor-not-allowed"
|
301 |
-
>
|
302 |
Transcribe Audio
|
303 |
</button>
|
304 |
</div>
|
305 |
<div>
|
306 |
<h3 class="font-medium">Transcription:</h3>
|
307 |
<div
|
308 |
-
class="min-h-[250px] bg-slate-100 text-gray-500 p-4 rounded-md flex flex-col gap-2"
|
309 |
-
>
|
310 |
<p hidden id="output-generation" class="grid-rows-2"></p>
|
311 |
<span id="output-status" class="m-auto font-light"
|
312 |
>No transcription results yet</span
|
|
|
6 |
<body></body>
|
7 |
</html>
|
8 |
|
9 |
+
<!DOCTYPE html>
|
10 |
<html>
|
11 |
<head>
|
12 |
<meta charset="UTF-8" />
|
|
|
26 |
|
27 |
// models base url
|
28 |
const MODELS = {
|
29 |
+
tiny_multilingual: {
|
30 |
+
base_url: "https://huggingface.co/openai/whisper-tiny/resolve/main/",
|
31 |
+
model: "model.safetensors",
|
32 |
+
tokenizer: "tokenizer.json",
|
33 |
+
config: "config.json",
|
34 |
+
},
|
35 |
tiny_en: {
|
36 |
base_url:
|
37 |
+
"https://huggingface.co/openai/whisper-tiny.en/resolve/main/",
|
38 |
+
model: "model.safetensors",
|
39 |
+
tokenizer: "tokenizer.json",
|
40 |
+
config: "config.json",
|
41 |
+
},
|
42 |
+
tiny_quantized_multilingual_q80: {
|
43 |
+
base_url: "https://huggingface.co/lmz/candle-whisper/resolve/main/",
|
44 |
+
model: "model-tiny-q80.gguf",
|
45 |
+
tokenizer: "tokenizer-tiny.json",
|
46 |
+
config: "config-tiny.json",
|
47 |
+
},
|
48 |
+
tiny_en_quantized_q80: {
|
49 |
+
base_url: "https://huggingface.co/lmz/candle-whisper/resolve/main/",
|
50 |
+
model: "model-tiny-en-q80.gguf",
|
51 |
+
tokenizer: "tokenizer-tiny-en.json",
|
52 |
+
config: "config-tiny-en.json",
|
53 |
},
|
54 |
};
|
55 |
const whisperWorker = new Worker("./whisperWorker.js", {
|
|
|
60 |
weightsURL, // URL to the weights file
|
61 |
modelID, // model ID
|
62 |
tokenizerURL, // URL to the tokenizer file
|
63 |
+
configURL, // model config URL
|
64 |
mel_filtersURL, // URL to the mel filters file
|
65 |
audioURL, // URL to the audio file
|
66 |
updateStatus // function to update the status
|
|
|
70 |
weightsURL,
|
71 |
modelID,
|
72 |
tokenizerURL,
|
73 |
+
configURL,
|
74 |
mel_filtersURL,
|
75 |
audioURL,
|
76 |
});
|
77 |
+
function messageHandler(event) {
|
78 |
console.log(event.data);
|
79 |
if ("status" in event.data) {
|
80 |
updateStatus(event.data);
|
81 |
}
|
82 |
if ("error" in event.data) {
|
83 |
+
whisperWorker.removeEventListener("message", messageHandler);
|
84 |
reject(new Error(event.data.error));
|
85 |
}
|
86 |
if (event.data.status === "complete") {
|
87 |
+
whisperWorker.removeEventListener("message", messageHandler);
|
88 |
resolve(event.data);
|
89 |
}
|
90 |
+
}
|
91 |
+
whisperWorker.addEventListener("message", messageHandler);
|
92 |
});
|
93 |
}
|
94 |
|
|
|
151 |
return;
|
152 |
}
|
153 |
const modelID = document.querySelector("#model").value;
|
154 |
+
const model = MODELS[modelID];
|
155 |
+
const modelURL = model.base_url + model.model;
|
156 |
+
const tokenizerURL = model.base_url + model.tokenizer;
|
157 |
+
const configURL = model.base_url + model.config;
|
158 |
|
159 |
classifyAudio(
|
160 |
modelURL,
|
161 |
modelID,
|
162 |
tokenizerURL,
|
163 |
+
configURL,
|
164 |
"mel_filters.safetensors",
|
165 |
audioURL,
|
166 |
updateStatus
|
|
|
204 |
<a
|
205 |
href="https://huggingface.co/openai/"
|
206 |
target="_blank"
|
207 |
+
class="underline hover:text-blue-500 hover:no-underline">
|
|
|
208 |
OpenAI Whisper models
|
209 |
</a>
|
210 |
and WASM runtime built with
|
|
|
221 |
<label for="model" class="font-medium">Models Options: </label>
|
222 |
<select
|
223 |
id="model"
|
224 |
+
class="border-2 border-gray-500 rounded-md font-light">
|
225 |
+
<option value="tiny_multilingual">tiny multilingual (151 MB)</option>
|
226 |
<option value="tiny_en" selected>tiny.en (151 MB)</option>
|
227 |
+
<option value="tiny_quantized_multilingual_q80">
|
228 |
+
tiny quantized q80 (41.5 MB)
|
229 |
+
</option>
|
230 |
+
<option value="tiny_en_quantized_q80">
|
231 |
+
tiny.en quantized q80 (41.8 MB)
|
232 |
+
</option>
|
233 |
</select>
|
234 |
</div>
|
235 |
<!-- drag and drop area -->
|
236 |
<div class="relative">
|
237 |
<div
|
238 |
id="drop-area"
|
239 |
+
class="flex flex-col items-center justify-center border-2 border-gray-300 border-dashed rounded-xl relative h-48 w-full overflow-hidden">
|
|
|
240 |
<div
|
241 |
+
class="flex flex-col items-center justify-center space-y-1 text-center">
|
|
|
242 |
<svg
|
243 |
width="25"
|
244 |
height="25"
|
245 |
viewBox="0 0 25 25"
|
246 |
fill="none"
|
247 |
+
xmlns="http://www.w3.org/2000/svg">
|
|
|
248 |
<path
|
249 |
d="M3.5 24.3a3 3 0 0 1-1.9-.8c-.5-.5-.8-1.2-.8-1.9V2.9c0-.7.3-1.3.8-1.9.6-.5 1.2-.7 2-.7h18.6c.7 0 1.3.2 1.9.7.5.6.7 1.2.7 2v18.6c0 .7-.2 1.4-.7 1.9a3 3 0 0 1-2 .8H3.6Zm0-2.7h18.7V2.9H3.5v18.7Zm2.7-2.7h13.3c.3 0 .5 0 .6-.3v-.7l-3.7-5a.6.6 0 0 0-.6-.2c-.2 0-.4 0-.5.3l-3.5 4.6-2.4-3.3a.6.6 0 0 0-.6-.3c-.2 0-.4.1-.5.3l-2.7 3.6c-.1.2-.2.4 0 .7.1.2.3.3.6.3Z"
|
250 |
+
fill="#000" />
|
|
|
251 |
</svg>
|
252 |
<div class="flex text-sm text-gray-600">
|
253 |
<label
|
254 |
for="file-upload"
|
255 |
+
class="relative cursor-pointer bg-white rounded-md font-medium text-blue-950 hover:text-blue-700">
|
|
|
256 |
<span>Drag and drop your audio here</span>
|
257 |
<span class="block text-xs">or</span>
|
258 |
<span class="block text-xs">Click to upload</span>
|
|
|
263 |
name="file-upload"
|
264 |
type="file"
|
265 |
accept="audio/*"
|
266 |
+
class="sr-only" />
|
|
|
267 |
</div>
|
268 |
<audio
|
269 |
id="audio"
|
270 |
hidden
|
271 |
controls
|
272 |
+
class="w-full p-2 select-none"></audio>
|
|
|
273 |
</div>
|
274 |
</div>
|
275 |
<div>
|
|
|
277 |
<h3 class="font-medium">Examples:</h3>
|
278 |
<button
|
279 |
data-value="samples_jfk.wav"
|
280 |
+
class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline">
|
|
|
281 |
<span>jfk.wav</span>
|
282 |
<span class="text-xs block"> (352 kB)</span>
|
283 |
</button>
|
284 |
<button
|
285 |
data-value="samples_a13.wav"
|
286 |
+
class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline">
|
|
|
287 |
<span>a13.wav</span>
|
288 |
<span class="text-xs block"> (960 kB)</span>
|
289 |
</button>
|
290 |
<button
|
291 |
data-value="samples_mm0.wav"
|
292 |
+
class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline">
|
|
|
293 |
<span>mm0.wav</span>
|
294 |
<span class="text-xs block new"> (957 kB)</span>
|
295 |
</button>
|
296 |
<button
|
297 |
data-value="samples_gb0.wav"
|
298 |
+
class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline">
|
|
|
299 |
<span>gb0.wav </span>
|
300 |
<span class="text-xs block">(4.08 MB)</span>
|
301 |
</button>
|
302 |
<button
|
303 |
data-value="samples_gb1.wav"
|
304 |
+
class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline">
|
|
|
305 |
<span>gb1.wav </span>
|
306 |
<span class="text-xs block">(6.36 MB)</span>
|
307 |
</button>
|
308 |
<button
|
309 |
data-value="samples_hp0.wav"
|
310 |
+
class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline">
|
|
|
311 |
<span>hp0.wav </span>
|
312 |
<span class="text-xs block">(8.75 MB)</span>
|
313 |
</button>
|
|
|
318 |
<button
|
319 |
id="detect"
|
320 |
disabled
|
321 |
+
class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded disabled:bg-gray-300 disabled:cursor-not-allowed">
|
|
|
322 |
Transcribe Audio
|
323 |
</button>
|
324 |
</div>
|
325 |
<div>
|
326 |
<h3 class="font-medium">Transcription:</h3>
|
327 |
<div
|
328 |
+
class="min-h-[250px] bg-slate-100 text-gray-500 p-4 rounded-md flex flex-col gap-2">
|
|
|
329 |
<p hidden id="output-generation" class="grid-rows-2"></p>
|
330 |
<span id="output-status" class="m-auto font-light"
|
331 |
>No transcription results yet</span
|
whisperWorker.js
CHANGED
@@ -17,23 +17,46 @@ class Whisper {
|
|
17 |
static instance = {};
|
18 |
// Retrieve the Whisper model. When called for the first time,
|
19 |
// this will load the model and save it for future use.
|
20 |
-
static async getInstance(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
// load individual modelID only once
|
22 |
if (!this.instance[modelID]) {
|
23 |
await init();
|
24 |
|
25 |
self.postMessage({ status: "loading", message: "Loading Model" });
|
26 |
-
const [
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
this.instance[modelID] = new Decoder(
|
34 |
weightsArrayU8,
|
35 |
tokenizerArrayU8,
|
36 |
-
mel_filtersArrayU8
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
);
|
38 |
} else {
|
39 |
self.postMessage({ status: "loading", message: "Model Already Loaded" });
|
@@ -43,17 +66,37 @@ class Whisper {
|
|
43 |
}
|
44 |
|
45 |
self.addEventListener("message", async (event) => {
|
46 |
-
const {
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
try {
|
49 |
self.postMessage({ status: "decoding", message: "Starting Decoder" });
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
weightsURL,
|
53 |
modelID,
|
54 |
tokenizerURL,
|
55 |
-
mel_filtersURL
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
self.postMessage({ status: "decoding", message: "Loading Audio" });
|
59 |
const audioArrayU8 = await fetchArrayBuffer(audioURL);
|
|
|
17 |
static instance = {};
|
18 |
// Retrieve the Whisper model. When called for the first time,
|
19 |
// this will load the model and save it for future use.
|
20 |
+
static async getInstance(params) {
|
21 |
+
const {
|
22 |
+
weightsURL,
|
23 |
+
modelID,
|
24 |
+
tokenizerURL,
|
25 |
+
mel_filtersURL,
|
26 |
+
configURL,
|
27 |
+
quantized,
|
28 |
+
is_multilingual,
|
29 |
+
timestamps,
|
30 |
+
task,
|
31 |
+
language,
|
32 |
+
} = params;
|
33 |
// load individual modelID only once
|
34 |
if (!this.instance[modelID]) {
|
35 |
await init();
|
36 |
|
37 |
self.postMessage({ status: "loading", message: "Loading Model" });
|
38 |
+
const [
|
39 |
+
weightsArrayU8,
|
40 |
+
tokenizerArrayU8,
|
41 |
+
mel_filtersArrayU8,
|
42 |
+
configArrayU8,
|
43 |
+
] = await Promise.all([
|
44 |
+
fetchArrayBuffer(weightsURL),
|
45 |
+
fetchArrayBuffer(tokenizerURL),
|
46 |
+
fetchArrayBuffer(mel_filtersURL),
|
47 |
+
fetchArrayBuffer(configURL),
|
48 |
+
]);
|
49 |
|
50 |
this.instance[modelID] = new Decoder(
|
51 |
weightsArrayU8,
|
52 |
tokenizerArrayU8,
|
53 |
+
mel_filtersArrayU8,
|
54 |
+
configArrayU8,
|
55 |
+
quantized,
|
56 |
+
is_multilingual,
|
57 |
+
timestamps,
|
58 |
+
task,
|
59 |
+
language
|
60 |
);
|
61 |
} else {
|
62 |
self.postMessage({ status: "loading", message: "Model Already Loaded" });
|
|
|
66 |
}
|
67 |
|
68 |
self.addEventListener("message", async (event) => {
|
69 |
+
const {
|
70 |
+
weightsURL,
|
71 |
+
modelID,
|
72 |
+
tokenizerURL,
|
73 |
+
configURL,
|
74 |
+
mel_filtersURL,
|
75 |
+
audioURL,
|
76 |
+
} = event.data;
|
77 |
try {
|
78 |
self.postMessage({ status: "decoding", message: "Starting Decoder" });
|
79 |
+
let quantized = false;
|
80 |
+
if (modelID.includes("quantized")) {
|
81 |
+
quantized = true;
|
82 |
+
}
|
83 |
+
let is_multilingual = false;
|
84 |
+
if (modelID.includes("multilingual")) {
|
85 |
+
is_multilingual = true;
|
86 |
+
}
|
87 |
+
let timestamps = true;
|
88 |
+
const decoder = await Whisper.getInstance({
|
89 |
weightsURL,
|
90 |
modelID,
|
91 |
tokenizerURL,
|
92 |
+
mel_filtersURL,
|
93 |
+
configURL,
|
94 |
+
quantized,
|
95 |
+
is_multilingual,
|
96 |
+
timestamps,
|
97 |
+
task: null,
|
98 |
+
language: null,
|
99 |
+
});
|
100 |
|
101 |
self.postMessage({ status: "decoding", message: "Loading Audio" });
|
102 |
const audioArrayU8 = await fetchArrayBuffer(audioURL);
|