OpenSound committed
Commit a67873a · 1 parent: de943de

Update app.py

Files changed (1): app.py +43 -30
app.py CHANGED
@@ -121,36 +121,41 @@ class WhisperxModel:
             segment['text'] = replace_numbers_with_words(segment['text'])
         return self.align_model.align(segments, audio_path)
 
-ssrspeech_model_name = "English"
-text_tokenizer = TextTokenizer(backend="espeak")
-language = "en"
-transcribe_model_name = "base.en"
-
-align_model = WhisperxAlignModel(language)
-transcribe_model = WhisperxModel(transcribe_model_name, align_model, language)
-
-ssrspeech_fn = f"{MODELS_PATH}/{ssrspeech_model_name}.pth"
-if not os.path.exists(ssrspeech_fn):
-    os.system(f"wget https://huggingface.co/westbrook/SSR-Speech-{ssrspeech_model_name}/resolve/main/{ssrspeech_model_name}.pth -O " + ssrspeech_fn)
-
-ckpt = torch.load(ssrspeech_fn)
-model = ssr.SSR_Speech(ckpt["config"])
-model.load_state_dict(ckpt["model"])
-config = model.args
-phn2num = ckpt["phn2num"]
-model.to(device)
-
-encodec_fn = f"{MODELS_PATH}/wmencodec.th"
-if not os.path.exists(encodec_fn):
-    os.system(f"wget https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/wmencodec.th -O " + encodec_fn)
-
-ssrspeech_model = {
-    "config": config,
-    "phn2num": phn2num,
-    "model": model,
-    "text_tokenizer": text_tokenizer,
-    "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
-}
+@spaces.GPU
+def load_models():
+    ssrspeech_model_name = "English"
+    text_tokenizer = TextTokenizer(backend="espeak")
+    language = "en"
+    transcribe_model_name = "base.en"
+
+    align_model = WhisperxAlignModel(language)
+    transcribe_model = WhisperxModel(transcribe_model_name, align_model, language)
+
+    ssrspeech_fn = f"{MODELS_PATH}/{ssrspeech_model_name}.pth"
+    if not os.path.exists(ssrspeech_fn):
+        os.system(f"wget https://huggingface.co/westbrook/SSR-Speech-{ssrspeech_model_name}/resolve/main/{ssrspeech_model_name}.pth -O " + ssrspeech_fn)
+
+    ckpt = torch.load(ssrspeech_fn)
+    model = ssr.SSR_Speech(ckpt["config"])
+    model.load_state_dict(ckpt["model"])
+    config = model.args
+    phn2num = ckpt["phn2num"]
+    model.to(device)
+
+    encodec_fn = f"{MODELS_PATH}/wmencodec.th"
+    if not os.path.exists(encodec_fn):
+        os.system(f"wget https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/wmencodec.th -O " + encodec_fn)
+
+    ssrspeech_model = {
+        "config": config,
+        "phn2num": phn2num,
+        "model": model,
+        "text_tokenizer": text_tokenizer,
+        "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
+    }
+    return transcribe_model, align_model, ssrspeech_model
+
+transcribe_model, align_model, ssrspeech_model = load_models()
 
 def get_transcribe_state(segments):
     transcript = " ".join([segment["text"] for segment in segments])
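The hunk above moves all model loading into a load_models() function decorated with @spaces.GPU, the decorator provided by the Hugging Face spaces package for ZeroGPU hardware: a GPU is attached only while a decorated function runs, so CUDA-touching initialization has to happen inside it. A minimal sketch of that pattern, assuming a ZeroGPU Space with the spaces package installed (the tiny placeholder model stands in for the real SSR-Speech and WhisperX checkpoints):

import spaces
import torch

@spaces.GPU  # requests a GPU for the duration of this call on a ZeroGPU Space
def load_models():
    # Heavy, CUDA-touching initialization belongs inside the decorated function.
    model = torch.nn.Linear(16, 16).to("cuda")  # placeholder for the real checkpoints
    return model

# Called once at startup, mirroring the commit above.
model = load_models()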
 
@@ -354,6 +359,14 @@ demo_text = {
 
 def get_app():
     with gr.Blocks() as app:
+        gr.Markdown("""
+        # EzAudio: High-quality Text-to-Audio Generator
+        Generate and edit audio from text using a diffusion transformer. Adjust advanced settings for more control.
+
+        Learn more about 🟣**EzAudio** on the [EzAudio Homepage](https://haidog-yaqub.github.io/EzAudio-Page/).
+
+        🚀 The **EzAudio-ControlNet (Energy Envelope)** demo is now live! Try it on [🤗EzAudio-ControlNet Space](https://huggingface.co/spaces/OpenSound/EzAudio-ControlNet).
+        """)
         with gr.Row():
             with gr.Column(scale=2):
                 input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
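The second hunk adds a gr.Markdown block as the first child of gr.Blocks, so the heading and links render at the top of the demo page before the input widgets. A minimal sketch of that layout, with a placeholder title and link rather than the actual demo copy:

import gradio as gr

with gr.Blocks() as app:
    gr.Markdown("""
    # Demo Title
    Short description shown above the rest of the UI, with a [link](https://example.com).
    """)
    with gr.Row():
        # The real app nests columns and more controls here.
        audio = gr.Audio(label="Input Audio", type="filepath")

app.launch()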