Spaces:
Running
on
Zero
Running
on
Zero
WillHeld
committed on
Commit
·
f7e8ea0
0
Parent(s):
Try Starting Space from Scratch
Browse files- .gitattributes +35 -0
- .gitignore +2 -0
- README.md +13 -0
- app.py +213 -0
- files.txt +0 -0
- packages.txt +1 -0
- requirements.txt +5 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
*__pycache__*
|
2 |
+
user_study.json
|
README.md
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Diva Audio
|
3 |
+
emoji: 🔊
|
4 |
+
colorFrom: gray
|
5 |
+
colorTo: red
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.9.1
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mpl-2.0
|
11 |
+
---
|
12 |
+
|
13 |
+
An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
|
app.py
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
|
2 |
+
import os
|
3 |
+
import random
|
4 |
+
import sys
|
5 |
+
|
6 |
+
import xxhash
|
7 |
+
import gradio as gr
|
8 |
+
import librosa
|
9 |
+
import numpy as np
|
10 |
+
import soundfile as sf
|
11 |
+
import torch
|
12 |
+
import torch.nn.functional as F
|
13 |
+
from accelerate import infer_auto_device_map
|
14 |
+
from datasets import Audio
|
15 |
+
from safetensors.torch import load, load_model
|
16 |
+
import spaces
|
17 |
+
from torch import nn
|
18 |
+
from transformers import (
|
19 |
+
AutoModelForCausalLM,
|
20 |
+
AutoProcessor,
|
21 |
+
AutoTokenizer,
|
22 |
+
LlamaForCausalLM,
|
23 |
+
TextIteratorStreamer,
|
24 |
+
WhisperForConditionalGeneration,
|
25 |
+
AutoProcessor,
|
26 |
+
AutoModel,
|
27 |
+
)
|
28 |
+
from transformers.generation import GenerationConfig
|
29 |
+
|
30 |
+
anonymous = False
|
31 |
+
|
32 |
+
diva_model = AutoModel.from_pretrained(
|
33 |
+
"WillHeld/DiVA-llama-3-v0-8b", trust_remote_code=True
|
34 |
+
)
|
35 |
+
|
36 |
+
resampler = Audio(sampling_rate=16_000)
|
37 |
+
|
38 |
+
|
39 |
+
@spaces.GPU
|
40 |
+
@torch.no_grad
|
41 |
+
def diva_audio(audio_input, do_sample=False, temperature=0.001):
|
42 |
+
sr, y = audio_input
|
43 |
+
x = xxhash.xxh32(bytes(y)).hexdigest()
|
44 |
+
y = y.astype(np.float32)
|
45 |
+
y /= np.max(np.abs(y))
|
46 |
+
a = resampler.decode_example(
|
47 |
+
resampler.encode_example({"array": y, "sampling_rate": sr})
|
48 |
+
)
|
49 |
+
yield from diva_model.generate_stream(
|
50 |
+
a["array"], None, do_sample=do_sample, max_new_tokens=256
|
51 |
+
)
|
52 |
+
|
53 |
+
|
54 |
+
def transcribe_wrapper(audio_input, state, model_order):
|
55 |
+
spinner = "◒"
|
56 |
+
d_resp = gr.Textbox(
|
57 |
+
value="♫♪.ılılıll|̲̅̅●̲̅̅|̲̅̅=̲̅̅|̲̅̅●̲̅̅|llılılı.♫♪loading♫♪.ılılıll|̲̅̅●̲̅̅|̲̅̅=̲̅̅|̲̅̅●̲̅̅|llılılı.♫♪loading♫♪.ılılıll|̲̅̅●̲̅̅|̲̅̅=̲̅̅|̲̅̅●̲̅̅|llılılı.♫♪♫♪",
|
58 |
+
visible=True,
|
59 |
+
label=model_names[0] if not anonymous else f"Model {order}",
|
60 |
+
)
|
61 |
+
yield (
|
62 |
+
gr.Button(
|
63 |
+
value="Loading Weights onto ZeroGPU...",
|
64 |
+
interactive=False,
|
65 |
+
variant="primary",
|
66 |
+
),
|
67 |
+
d_resp,
|
68 |
+
state,
|
69 |
+
)
|
70 |
+
|
71 |
+
yield from transcribe(audio_input, state, model_order)
|
72 |
+
|
73 |
+
|
74 |
+
@spaces.GPU
|
75 |
+
def transcribe(audio_input, state, model_order):
|
76 |
+
if audio_input == None:
|
77 |
+
return (
|
78 |
+
"Click to run inference!",
|
79 |
+
"",
|
80 |
+
state,
|
81 |
+
)
|
82 |
+
|
83 |
+
def gen_from_diva():
|
84 |
+
diva_resp = diva_audio(audio_input)
|
85 |
+
for resp in diva_resp:
|
86 |
+
d_resp = gr.Textbox(
|
87 |
+
value=resp,
|
88 |
+
visible=True,
|
89 |
+
label=model_names[0] if not anonymous else f"Model {order}",
|
90 |
+
)
|
91 |
+
yield d_resp
|
92 |
+
|
93 |
+
spinner_id = 0
|
94 |
+
spinners = ["◐ ", "◓ ", "◑", "◒"]
|
95 |
+
|
96 |
+
for response in gen_from_diva():
|
97 |
+
spinner = spinners[spinner_id]
|
98 |
+
spinner_id = (spinner_id + 1) % 4
|
99 |
+
yield (
|
100 |
+
gr.Button(
|
101 |
+
value=spinner + " Generating Responses " + spinner,
|
102 |
+
interactive=False,
|
103 |
+
variant="primary",
|
104 |
+
),
|
105 |
+
response,
|
106 |
+
state,
|
107 |
+
)
|
108 |
+
yield (
|
109 |
+
gr.Button(value="Click to run inference!", interactive=True, variant="primary"),
|
110 |
+
response,
|
111 |
+
state,
|
112 |
+
)
|
113 |
+
|
114 |
+
|
115 |
+
def on_page_load(state, model_order):
|
116 |
+
if state == 0:
|
117 |
+
gr.Info(
|
118 |
+
"Record something you'd say to an AI Assistant! Think about what you usually use Siri, Google Assistant, or ChatGPT for."
|
119 |
+
)
|
120 |
+
state = 1
|
121 |
+
if anonymous:
|
122 |
+
random.shuffle(model_order)
|
123 |
+
return state, model_order
|
124 |
+
|
125 |
+
|
126 |
+
def recording_complete(state):
|
127 |
+
if state == 1:
|
128 |
+
gr.Info(
|
129 |
+
"Once you submit your recording, DiVA will stream back a response! This might take a second as ZeroGPU needs to load model weights into vRAM!."
|
130 |
+
)
|
131 |
+
state = 2
|
132 |
+
return (
|
133 |
+
gr.Button(value="Click to run inference!", interactive=True, variant="primary"),
|
134 |
+
state,
|
135 |
+
)
|
136 |
+
|
137 |
+
|
138 |
+
def clear_factory(button_id):
|
139 |
+
def clear(audio_input, model_order):
|
140 |
+
return (
|
141 |
+
model_order,
|
142 |
+
gr.Button(
|
143 |
+
value="Record Audio to Submit!",
|
144 |
+
interactive=False,
|
145 |
+
),
|
146 |
+
None,
|
147 |
+
None,
|
148 |
+
)
|
149 |
+
|
150 |
+
return clear
|
151 |
+
|
152 |
+
|
153 |
+
theme = gr.themes.Soft(
|
154 |
+
primary_hue=gr.themes.Color(
|
155 |
+
c100="#82000019",
|
156 |
+
c200="#82000033",
|
157 |
+
c300="#8200004c",
|
158 |
+
c400="#82000066",
|
159 |
+
c50="#8200007f",
|
160 |
+
c500="#8200007f",
|
161 |
+
c600="#82000099",
|
162 |
+
c700="#820000b2",
|
163 |
+
c800="#820000cc",
|
164 |
+
c900="#820000e5",
|
165 |
+
c950="#820000f2",
|
166 |
+
),
|
167 |
+
secondary_hue="rose",
|
168 |
+
neutral_hue="stone",
|
169 |
+
)
|
170 |
+
|
171 |
+
model_names = ["DiVA Llama 3 8B"]
|
172 |
+
model_shorthand = ["diva"]
|
173 |
+
with gr.Blocks(theme=theme) as demo:
|
174 |
+
state = gr.State(0)
|
175 |
+
model_order = gr.State([0, 1])
|
176 |
+
with gr.Row():
|
177 |
+
audio_input = gr.Audio(
|
178 |
+
sources=["microphone"], streaming=False, label="Audio Input"
|
179 |
+
)
|
180 |
+
|
181 |
+
with gr.Row():
|
182 |
+
btn = gr.Button(value="Record Audio to Submit!", interactive=False)
|
183 |
+
|
184 |
+
with gr.Row():
|
185 |
+
out1 = gr.Textbox(visible=False)
|
186 |
+
|
187 |
+
audio_input.stop_recording(
|
188 |
+
recording_complete,
|
189 |
+
[state],
|
190 |
+
[btn, state],
|
191 |
+
)
|
192 |
+
audio_input.start_recording(
|
193 |
+
lambda: gr.Button(
|
194 |
+
value="Uploading Audio to Cloud", interactive=False, variant="primary"
|
195 |
+
),
|
196 |
+
None,
|
197 |
+
btn,
|
198 |
+
)
|
199 |
+
btn.click(
|
200 |
+
fn=transcribe_wrapper,
|
201 |
+
inputs=[audio_input, state, model_order],
|
202 |
+
outputs=[btn, out1, state],
|
203 |
+
)
|
204 |
+
audio_input.clear(
|
205 |
+
clear_factory(None),
|
206 |
+
[audio_input, model_order],
|
207 |
+
[model_order, btn, audio_input, out1],
|
208 |
+
)
|
209 |
+
demo.load(
|
210 |
+
fn=on_page_load, inputs=[state, model_order], outputs=[state, model_order]
|
211 |
+
)
|
212 |
+
|
213 |
+
demo.launch(share=True)
|
files.txt
ADDED
File without changes
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
ffmpeg
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers==4.45.2
|
2 |
+
accelerate
|
3 |
+
peft
|
4 |
+
librosa
|
5 |
+
torchaudio
|