Update app.py
app.py CHANGED
```diff
@@ -33,6 +33,8 @@ import requests
 
 from huggingface_hub import snapshot_download
 
+from tqdm import tqdm
+
 default_max_context = 16384
 default_max_output = 512
 
@@ -48,7 +50,7 @@ available_models = [
     "8.0bpw"
 ]
 dirs = {}
-for model in available_models:
+for model in tqdm(available_models):
     dirs.update({model: snapshot_download(repo_id="turboderp/pixtral-12b-exl2", revision=model)})
 
 @spaces.GPU(duration=45)
@@ -137,15 +139,14 @@ def run_inference(message, history, model_picked, context_size, max_output):
     return result
 
 description="""A demo chat interface with Pixtral 12B EXL2 Quants, deployed using **ExllamaV2**!
-The model will be loaded once the GPU is available. This space specifically will load by default Pixtral at 4bpw from the following repository: [turboderp/pixtral-12b-exl2](https://huggingface.co/turboderp/pixtral-12b-exl2). Other quantization options are available.
-The current version of ExllamaV2 running is the dev branch, not the master branch: [ExllamaV2](https://github.com/turboderp/exllamav2/tree/dev).
-
+The model will be loaded once the GPU is available. This space specifically will load by default Pixtral at 4bpw from the following repository: [turboderp/pixtral-12b-exl2](https://huggingface.co/turboderp/pixtral-12b-exl2). Other quantization options are available.
+The current version of ExllamaV2 running is the dev branch, not the master branch: [ExllamaV2](https://github.com/turboderp/exllamav2/tree/dev).
 The model at **4bpw and 16k context size fits in less than 12GB of VRAM**, and at **2.5bpw and short context can potentially fit in 8GB of VRAM**!
 
 The current default settings are:
 - Model Quant: 4.0bpw
 - Context Size: 16k tokens
-- Max Output: 512 tokens
+- Max Output: 512 tokens
 You can select other quants and experiment!
 
 Thanks, turboderp!"""
```
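The functional change is the download loop in the second hunk. Below is a minimal standalone sketch of what that loop does after this commit; only `"8.0bpw"` is visible in the hunk, so the other revision name is an assumed placeholder taken from the Space description:

```python
from huggingface_hub import snapshot_download
from tqdm import tqdm

# Only "8.0bpw" appears in the hunk; "4.0bpw" is mentioned in the Space
# description, and the full list in app.py presumably has more entries.
available_models = ["4.0bpw", "8.0bpw"]

dirs = {}
# tqdm wraps the loop so startup shows one outer progress tick per quant
# revision; snapshot_download caches each revision locally (printing its
# own per-file progress bars) and returns the local directory path.
for model in tqdm(available_models):
    dirs.update({model: snapshot_download(repo_id="turboderp/pixtral-12b-exl2", revision=model)})
```

Every quant is fetched eagerly at startup, so on a cold start this loop is likely the slowest step; the outer bar mainly tells you which revision is currently being cached.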
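The third hunk only reflows the markdown `description` string, but its context lines show `run_inference` decorated with `@spaces.GPU(duration=45)`. The UI wiring itself is outside the shown hunks; a minimal sketch assuming a standard Gradio `ChatInterface` (the real handler also takes `model_picked`, `context_size`, and `max_output`, presumably supplied via `additional_inputs`):

```python
import gradio as gr
import spaces

description = "A demo chat interface with Pixtral 12B EXL2 Quants, deployed using **ExllamaV2**!"

# On ZeroGPU Spaces, the decorator requests a GPU for at most ~45 s per call.
@spaces.GPU(duration=45)
def run_inference(message, history):
    # Placeholder body: the real Space loads the quant chosen from `dirs`
    # and generates a reply with ExllamaV2.
    return f"echo: {message}"

demo = gr.ChatInterface(fn=run_inference, description=description)
demo.launch()
```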