Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -121,36 +121,41 @@ class WhisperxModel:
             segment['text'] = replace_numbers_with_words(segment['text'])
         return self.align_model.align(segments, audio_path)

- [30 lines deleted, old lines 124-153; content not recoverable from this view]
+@spaces.GPU
+def load_models():
+    ssrspeech_model_name = "English"
+    text_tokenizer = TextTokenizer(backend="espeak")
+    language = "en"
+    transcribe_model_name = "base.en"
+
+    align_model = WhisperxAlignModel(language)
+    transcribe_model = WhisperxModel(transcribe_model_name, align_model, language)
+
+    ssrspeech_fn = f"{MODELS_PATH}/{ssrspeech_model_name}.pth"
+    if not os.path.exists(ssrspeech_fn):
+        os.system(f"wget https://huggingface.co/westbrook/SSR-Speech-{ssrspeech_model_name}/resolve/main/{ssrspeech_model_name}.pth -O " + ssrspeech_fn)
+
+    ckpt = torch.load(ssrspeech_fn)
+    model = ssr.SSR_Speech(ckpt["config"])
+    model.load_state_dict(ckpt["model"])
+    config = model.args
+    phn2num = ckpt["phn2num"]
+    model.to(device)
+
+    encodec_fn = f"{MODELS_PATH}/wmencodec.th"
+    if not os.path.exists(encodec_fn):
+        os.system(f"wget https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/wmencodec.th -O " + encodec_fn)
+
+    ssrspeech_model = {
+        "config": config,
+        "phn2num": phn2num,
+        "model": model,
+        "text_tokenizer": text_tokenizer,
+        "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
+    }
+    return transcribe_model, align_model, ssrspeech_model
+
+transcribe_model, align_model, ssrspeech_model = load_models()

 def get_transcribe_state(segments):
     transcript = " ".join([segment["text"] for segment in segments])
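For context: `spaces.GPU` (from the ZeroGPU `spaces` package) attaches a GPU only while a decorated function runs, so in this commit the checkpoint download and `model.to(device)` happen inside a GPU-scheduled call to `load_models()`. Below is a minimal sketch of the usual decorator pattern, with a hypothetical `run_inference` handler standing in for the app's real callbacks; it is illustration only, not code from this diff.

import spaces  # ZeroGPU scheduling helpers available on Hugging Face Spaces
import torch

@spaces.GPU(duration=60)  # hold a GPU for at most ~60 s per call
def run_inference(model, features):
    # CUDA is guaranteed only inside @spaces.GPU functions, so both the
    # device move and the forward pass happen here.
    model = model.to("cuda")
    with torch.no_grad():
        return model(features.to("cuda"))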
@@ -354,6 +359,14 @@ demo_text = {

 def get_app():
     with gr.Blocks() as app:
+        gr.Markdown("""
+        # EzAudio: High-quality Text-to-Audio Generator
+        Generate and edit audio from text using a diffusion transformer. Adjust advanced settings for more control.
+
+        Learn more about 🟣**EzAudio** on the [EzAudio Homepage](https://haidog-yaqub.github.io/EzAudio-Page/).
+
+        🚀 The **EzAudio-ControlNet (Energy Envelope)** demo is now live! Try it on [🤗EzAudio-ControlNet Space](https://huggingface.co/spaces/OpenSound/EzAudio-ControlNet).
+        """)
         with gr.Row():
             with gr.Column(scale=2):
                 input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
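One possible refinement, not part of this commit: the two `os.system("wget ...")` calls in the first hunk could be replaced with `huggingface_hub` downloads (the library is already a Gradio dependency), which add caching and resumable transfers. A sketch, assuming `MODELS_PATH` is defined in app.py as referenced above:

from huggingface_hub import hf_hub_download

# Sketch only: fetch the SSR-Speech checkpoint and the watermarked EnCodec
# weights from the repos referenced in the diff, instead of shelling out to wget.
ssrspeech_fn = hf_hub_download(
    repo_id="westbrook/SSR-Speech-English",
    filename="English.pth",
    local_dir=MODELS_PATH,  # MODELS_PATH as used elsewhere in app.py
)
encodec_fn = hf_hub_download(
    repo_id="westbrook/SSR-Speech-English",
    filename="wmencodec.th",
    local_dir=MODELS_PATH,
)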