Add support for the EXL2 4-bit KV cache; switch from metric gigabytes (1e9 bytes) to JEDEC gigabytes (GiB, 2^30 bytes)
#2
by
mo137
- opened
- index.html +17 -17
index.html
CHANGED
@@ -128,19 +128,16 @@
|
|
128 |
return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
|
129 |
}
|
130 |
|
131 |
-
function kvCache(context=8192, model_config,
|
132 |
const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
|
133 |
const n_embd_gqa = model_config["hidden_size"] / n_gqa
|
134 |
const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
|
135 |
const size = 2 * n_elements
|
136 |
-
|
137 |
-
return size
|
138 |
-
}
|
139 |
-
return size * 2
|
140 |
}
|
141 |
|
142 |
-
function contextSize(context=8192, model_config, bsz=512,
|
143 |
-
return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config,
|
144 |
}
|
145 |
|
146 |
function modelSize(model_config, bpw=4.5) {
|
@@ -152,22 +149,22 @@
|
|
152 |
const model_config = await modelConfig(document.getElementById("modelsearch").value)
|
153 |
const context = parseInt(document.getElementById("contextsize").value)
|
154 |
let bsz = 512
|
155 |
-
let
|
156 |
let bpw = 0
|
157 |
if (format === "gguf") {
|
158 |
bsz = parseInt(document.getElementById("batchsize").value)
|
159 |
bpw = gguf_quants[document.getElementById("quantsize").innerText]
|
160 |
|
161 |
} else if (format == "exl2") {
|
162 |
-
|
163 |
bpw = Number.parseFloat(document.getElementById("bpw").value)
|
164 |
}
|
165 |
|
166 |
const model_size = modelSize(model_config, bpw)
|
167 |
-
const context_size = contextSize(context, model_config, bsz,
|
168 |
-
const total_size = ((model_size + context_size) /
|
169 |
-
document.getElementById("resultmodel").innerText = (model_size /
|
170 |
-
document.getElementById("resultcontext").innerText = (context_size /
|
171 |
const result_total_el = document.getElementById("resulttotal");
|
172 |
result_total_el.innerText = total_size.toFixed(2)
|
173 |
|
@@ -401,13 +398,16 @@
|
|
401 |
class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
|
402 |
>
|
403 |
<label
|
404 |
-
for="
|
405 |
class="inline-block bg-white text-xs font-medium text-gray-900"
|
406 |
>
|
407 |
-
|
408 |
</label>
|
409 |
-
<
|
410 |
-
|
|
|
|
|
|
|
411 |
</div>
|
412 |
</div>
|
413 |
</div>
|
|
|
128 |
return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
|
129 |
}
|
130 |
|
131 |
+
/**
 * Estimate the size of the KV cache for a given context length.
 *
 * @param {number} context - Context length in tokens (defaults to 8192).
 * @param {Object} model_config - Model configuration; reads
 *   "num_attention_heads", "num_key_value_heads", "hidden_size" and
 *   "num_hidden_layers".
 * @param {number} cache_bit - Bits stored per cache element (defaults to 16).
 * @returns {number} Estimated cache size (presumably in bytes, since
 *   cache_bit is divided by 8 — confirm against callers).
 */
function kvCache(context=8192, model_config, cache_bit=16) {
  const n_heads = model_config["num_attention_heads"]
  // Configs without grouped-query attention may omit "num_key_value_heads";
  // fall back to one KV head per attention head instead of producing NaN.
  const configured_kv_heads = model_config["num_key_value_heads"]
  const n_kv_heads = (configured_kv_heads === undefined || configured_kv_heads === null)
    ? n_heads
    : configured_kv_heads
  const n_gqa = n_heads / n_kv_heads
  const n_embd_gqa = model_config["hidden_size"] / n_gqa
  const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
  // Factor of 2: presumably one tensor for keys and one for values.
  const size = 2 * n_elements
  // Scale the element count by the per-element width (bits / 8).
  return size * (cache_bit / 8)
}
|
138 |
|
139 |
+
/**
 * Total context-dependent memory estimate: input buffer + KV cache +
 * compute buffer, rounded to two decimal places.
 *
 * @param {number} context - Context length in tokens (defaults to 8192).
 * @param {Object} model_config - Model configuration passed through to the
 *   individual estimators.
 * @param {number} bsz - Batch size (defaults to 512).
 * @param {number} cache_bit - Bits per KV cache element (defaults to 16).
 * @returns {number} Sum of the three estimates, rounded via toFixed(2).
 */
function contextSize(context=8192, model_config, bsz=512, cache_bit=16) {
  const input_part = inputBuffer(context, model_config, bsz)
  const cache_part = kvCache(context, model_config, cache_bit)
  const compute_part = computeBuffer(context, model_config, bsz)
  const combined = input_part + cache_part + compute_part
  // Round through a string, then convert back to a number.
  const rounded_text = combined.toFixed(2)
  return Number.parseFloat(rounded_text)
}
|
142 |
|
143 |
function modelSize(model_config, bpw=4.5) {
|
|
|
149 |
const model_config = await modelConfig(document.getElementById("modelsearch").value)
|
150 |
const context = parseInt(document.getElementById("contextsize").value)
|
151 |
let bsz = 512
|
152 |
+
let cache_bit = 16
|
153 |
let bpw = 0
|
154 |
if (format === "gguf") {
|
155 |
bsz = parseInt(document.getElementById("batchsize").value)
|
156 |
bpw = gguf_quants[document.getElementById("quantsize").innerText]
|
157 |
|
158 |
} else if (format == "exl2") {
|
159 |
+
cache_bit = Number.parseInt(document.getElementById("kvCache").value)
|
160 |
bpw = Number.parseFloat(document.getElementById("bpw").value)
|
161 |
}
|
162 |
|
163 |
const model_size = modelSize(model_config, bpw)
|
164 |
+
const context_size = contextSize(context, model_config, bsz, cache_bit)
|
165 |
+
const total_size = ((model_size + context_size) / 2**30)
|
166 |
+
document.getElementById("resultmodel").innerText = (model_size / 2**30).toFixed(2)
|
167 |
+
document.getElementById("resultcontext").innerText = (context_size / 2**30).toFixed(2)
|
168 |
const result_total_el = document.getElementById("resulttotal");
|
169 |
result_total_el.innerText = total_size.toFixed(2)
|
170 |
|
|
|
398 |
class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
|
399 |
>
|
400 |
<label
|
401 |
+
for="kvCache"
|
402 |
class="inline-block bg-white text-xs font-medium text-gray-900"
|
403 |
>
|
404 |
+
KV Cache
|
405 |
</label>
|
406 |
+
<select id="kvCache" name="kvCache">
|
407 |
+
<option value="16">16 bit</option>
|
408 |
+
<option value="8">8 bit</option>
|
409 |
+
<option value="4">4 bit</option>
|
410 |
+
</select>
|
411 |
</div>
|
412 |
</div>
|
413 |
</div>
|