Spaces:
Running
Running
sync models
Browse files- app/models.py +67 -4
app/models.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
from gradio_client import handle_file
|
2 |
|
3 |
# Models to include in the leaderboard, only include models that users can vote on
|
@@ -48,6 +49,16 @@ AVAILABLE_MODELS = {
|
|
48 |
|
49 |
# IMS-Toucan
|
50 |
# 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
# HF TTS w issues
|
53 |
'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # unresponsive to exclamation marks # 4.29
|
@@ -168,7 +179,7 @@ HF_SPACES = {
|
|
168 |
'function': '/predict',
|
169 |
'text_param_index': 0,
|
170 |
'return_audio_index': 0,
|
171 |
-
'
|
172 |
'series': 'Edge TTS',
|
173 |
},
|
174 |
|
@@ -218,6 +229,34 @@ HF_SPACES = {
|
|
218 |
'is_zero_gpu_space': True,
|
219 |
'series': 'StyleTTS',
|
220 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
}
|
222 |
|
223 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
@@ -317,8 +356,10 @@ OVERRIDE_INPUTS = {
|
|
317 |
'mrfakename/E2-F5-TTS': {
|
318 |
0: DEFAULT_VOICE_SAMPLE, # voice sample
|
319 |
1: DEFAULT_VOICE_TRANSCRIPT, # transcript of sample (< 15 seconds required)
|
320 |
-
3:
|
321 |
-
|
|
|
|
|
322 |
},
|
323 |
|
324 |
# IMS-Toucan
|
@@ -337,6 +378,28 @@ OVERRIDE_INPUTS = {
|
|
337 |
2: 'en-us', # lang
|
338 |
3: 8, # lngsteps
|
339 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
340 |
}
|
341 |
|
342 |
|
@@ -385,7 +448,7 @@ def make_link_to_space(model_name, for_leaderboard=False):
|
|
385 |
try:
|
386 |
if(
|
387 |
for_leaderboard
|
388 |
-
and HF_SPACES[model_name]['
|
389 |
):
|
390 |
model_basename += ' π'
|
391 |
title += '; π = online only or proprietary'
|
|
|
1 |
+
import os
|
2 |
from gradio_client import handle_file
|
3 |
|
4 |
# Models to include in the leaderboard, only include models that users can vote on
|
|
|
49 |
|
50 |
# IMS-Toucan
|
51 |
# 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1
|
52 |
+
# StyleTTS v2
|
53 |
+
# 'Pendrokar/style-tts-2': 'Pendrokar/style-tts-2', # more votes in OG arena; emotionless
|
54 |
+
# StyleTTS kokoro
|
55 |
+
'hexgrad/kokoro': 'hexgrad/kokoro',
|
56 |
+
|
57 |
+
# MaskGCT (by Amphion)
|
58 |
+
# DEMANDS 300 seconds of ZeroGPU
|
59 |
+
# 'amphion/maskgct': 'amphion/maskgct',
|
60 |
+
# default ZeroGPU borrow time
|
61 |
+
'Svngoku/maskgct-audio-lab': 'Svngoku/maskgct-audio-lab',
|
62 |
|
63 |
# HF TTS w issues
|
64 |
'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # unresponsive to exclamation marks # 4.29
|
|
|
179 |
'function': '/predict',
|
180 |
'text_param_index': 0,
|
181 |
'return_audio_index': 0,
|
182 |
+
'is_closed_source': True,
|
183 |
'series': 'Edge TTS',
|
184 |
},
|
185 |
|
|
|
229 |
'is_zero_gpu_space': True,
|
230 |
'series': 'StyleTTS',
|
231 |
},
|
232 |
+
|
233 |
+
# StyleTTS v2 kokoro fine tune
|
234 |
+
'hexgrad/kokoro': {
|
235 |
+
'name': 'StyleTTS Kokoro',
|
236 |
+
'function': '/generate',
|
237 |
+
'text_param_index': 0,
|
238 |
+
'return_audio_index': 0,
|
239 |
+
'is_zero_gpu_space': True,
|
240 |
+
'series': 'StyleTTS',
|
241 |
+
},
|
242 |
+
|
243 |
+
# MaskGCT (by Amphion)
|
244 |
+
'amphion/maskgct': {
|
245 |
+
'name': 'MaskGCT',
|
246 |
+
'function': '/predict',
|
247 |
+
'text_param_index': 1,
|
248 |
+
'return_audio_index': 0,
|
249 |
+
'is_zero_gpu_space': True,
|
250 |
+
'series': 'MaskGCT',
|
251 |
+
},
|
252 |
+
'Svngoku/maskgct-audio-lab': {
|
253 |
+
'name': 'MaskGCT',
|
254 |
+
'function': '/predict',
|
255 |
+
'text_param_index': 1,
|
256 |
+
'return_audio_index': 0,
|
257 |
+
'is_zero_gpu_space': True,
|
258 |
+
'series': 'MaskGCT',
|
259 |
+
},
|
260 |
}
|
261 |
|
262 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
|
|
356 |
'mrfakename/E2-F5-TTS': {
|
357 |
0: DEFAULT_VOICE_SAMPLE, # voice sample
|
358 |
1: DEFAULT_VOICE_TRANSCRIPT, # transcript of sample (< 15 seconds required)
|
359 |
+
3: False, # cleanup silence
|
360 |
+
4: 0.15, #crossfade
|
361 |
+
5: 32, #nfe_slider
|
362 |
+
6: 1, #speed
|
363 |
},
|
364 |
|
365 |
# IMS-Toucan
|
|
|
378 |
2: 'en-us', # lang
|
379 |
3: 8, # lngsteps
|
380 |
},
|
381 |
+
|
382 |
+
# StyleTTS 2 kokoro
|
383 |
+
'hexgrad/kokoro': {
|
384 |
+
1: "af", #voice
|
385 |
+
2: None, #ps
|
386 |
+
3: 1, #speed
|
387 |
+
4: 3000, #trim
|
388 |
+
5: False, #use_gpu; fast enough multithreaded on CPU
|
389 |
+
6: os.getenv('KOKORO'), #sk
|
390 |
+
},
|
391 |
+
|
392 |
+
# maskGCT (by amphion)
|
393 |
+
'amphion/maskgct': {
|
394 |
+
0: DEFAULT_VOICE_SAMPLE, #prompt_wav
|
395 |
+
2: -1, #target_len
|
396 |
+
3: 25, #n_timesteps
|
397 |
+
},
|
398 |
+
'Svngoku/maskgct-audio-lab': {
|
399 |
+
0: DEFAULT_VOICE_SAMPLE, #prompt_wav
|
400 |
+
2: -1, #target_len
|
401 |
+
3: 25, #n_timesteps
|
402 |
+
},
|
403 |
}
|
404 |
|
405 |
|
|
|
448 |
try:
|
449 |
if(
|
450 |
for_leaderboard
|
451 |
+
and HF_SPACES[model_name]['is_closed_source']
|
452 |
):
|
453 |
model_basename += ' π'
|
454 |
title += '; π = online only or proprietary'
|