Pendrokar committed
Commit db217b9
2 Parent(s): 4f6adfb 0ba527d

Merge branch 'structured'

Files changed (18)
  1. LICENSE +0 -2
  2. app.py +3 -1694
  3. app/__init__.py +0 -0
  4. app/config.py +51 -0
  5. app/cookie.js +30 -0
  6. app/db.py +54 -0
  7. app/init.py +30 -0
  8. app/leaderboard.py +91 -0
  9. app/messages.py +98 -0
  10. app/models.py +469 -0
  11. app/synth.py +421 -0
  12. app/ui.py +52 -0
  13. app/ui_battle.py +75 -0
  14. app/ui_leaderboard.py +23 -0
  15. app/ui_vote.py +194 -0
  16. app/utils.py +20 -0
  17. app/vote.py +143 -0
  18. requirements.txt +1 -1
LICENSE CHANGED
@@ -1,5 +1,3 @@
- ZLIB/LIBPNG LICENSE
-
  Copyright (c) 2024 TTS-AGI Contributors
 
  This software is provided ‘as-is’, without any express or implied
app.py CHANGED
@@ -1,1695 +1,4 @@
- import gradio as gr
- import pandas as pd
- from langdetect import detect
- from datasets import load_dataset
- import threading, time, uuid, sqlite3, shutil, os, random, asyncio
- from pathlib import Path
- from huggingface_hub import CommitScheduler, delete_file, hf_hub_download
- from gradio_client import Client, handle_file
- import pyloudnorm as pyln
- import soundfile as sf
- import librosa
- from detoxify import Detoxify
- import os
- import tempfile
- from pydub import AudioSegment
- import itertools
- from typing import List, Tuple, Set, Dict
- from hashlib import md5, sha1
- import spaces

- class User:
- def __init__(self, user_id: str):
- self.user_id = user_id
- self.voted_pairs: Set[Tuple[str, str]] = set()
-
- class Sample:
- def __init__(self, filename: str, transcript: str, modelName: str):
- self.filename = filename
- self.transcript = transcript
- self.modelName = modelName
-
- def match_target_amplitude(sound, target_dBFS):
- change_in_dBFS = target_dBFS - sound.dBFS
- return sound.apply_gain(change_in_dBFS)
-
- # from gradio_space_ci import enable_space_ci
-
- # enable_space_ci()
-
-
-
- toxicity = Detoxify('original')
- sents = []
- with open('harvard_sentences.txt') as f:
- sents += f.read().strip().splitlines()
- with open('llama3_command-r_sentences_1st_person.txt') as f:
- sents += f.read().strip().splitlines()
- # With other punctuation marks
- # Exclamations - # conversational characters/animation entertainment/tv
- with open('llama3_command-r_sentences_excla.txt') as f:
- sents += f.read().strip().splitlines()
- # Questions - # conversational characters/animation entertainment/tv
- with open('llama3_command-r_questions.txt') as f:
- sents += f.read().strip().splitlines()
-
- # Credit: llama3_command-r sentences generated by user KingNish
-
- ####################################
- # Constants
- ####################################
- AVAILABLE_MODELS = {
- # 'XTTSv2': 'xtts',
- # 'WhisperSpeech': 'whisperspeech',
- # 'ElevenLabs': 'eleven',
- # 'OpenVoice': 'openvoice',
- # 'OpenVoice V2': 'openvoicev2',
- # 'Play.HT 2.0': 'playht',
- # 'MetaVoice': 'metavoice',
- # 'MeloTTS': 'melo',
- # 'StyleTTS 2': 'styletts2',
- # 'GPT-SoVITS': 'sovits',
- # 'Vokan TTS': 'vokan',
- # 'VoiceCraft 2.0': 'voicecraft',
- # 'Parler TTS': 'parler'
-
- # HF Gradio Spaces: # <works with gradio version #>
- # gradio version that works with most spaces: 4.29
- 'coqui/xtts': 'coqui/xtts', # 4.29 4.32
- # 'collabora/WhisperSpeech': 'collabora/WhisperSpeech', # 4.32 4.36.1 DEAD SPACE
- # 'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice', # same devs as MeloTTS, which scores higher # 4.29
- # 'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # same devs as MeloTTS, which scores higher # 4.29
- #'mrfakename/MetaVoice-1B-v0.1': 'mrfakename/MetaVoice-1B-v0.1', # 4.29 4.32 DEAD SPACE
- 'Pendrokar/xVASynth-TTS': 'Pendrokar/xVASynth-TTS', # 4.29 4.32 4.42.0
- # 'coqui/CoquiTTS': 'coqui/CoquiTTS',
- 'mrfakename/MeloTTS': 'mrfakename/MeloTTS', # 4.29 4.32
- 'fishaudio/fish-speech-1': 'fishaudio/fish-speech-1', # 4.29 4.32 4.36.1
-
- # E2 & F5 TTS
- # F5 model
- 'mrfakename/E2-F5-TTS': 'mrfakename/E2-F5-TTS', # 5.0
-
- # # Parler
- # Parler Large model
- # 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
- # Parler Mini model
- 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
- # 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # Mini is the default model of parler_tts
- # 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0 # overly jolly
-
- # # Microsoft Edge TTS
- 'innoai/Edge-TTS-Text-to-Speech': 'innoai/Edge-TTS-Text-to-Speech', # 4.29
-
- # IMS-Toucan
- # 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1 # randomly changes pitch male<=>female
-
- # IMS-Toucan English non-artificial
- # 'Flux9665/EnglishToucan': 'Flux9665/EnglishToucan', # 5.1; poor ratings => saving ZeroGPU resources;
-
- # StyleTTS v2
- # 'Pendrokar/style-tts-2': 'Pendrokar/style-tts-2', # more votes in OG arena; emotionless
- # StyleTTS kokoro
- 'hexgrad/kokoro': 'hexgrad/kokoro',
-
- # MaskGCT (by Amphion)
- # DEMANDS 300 seconds of ZeroGPU
- # 'amphion/maskgct': 'amphion/maskgct',
- # default ZeroGPU borrow time
- 'Svngoku/maskgct-audio-lab': 'Svngoku/maskgct-audio-lab',
-
- # HF TTS w issues
- 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
- # 'PolyAI/pheme': '/predict#0', # sleepy HF Space
- # 'amphion/Text-to-Speech': '/predict#0', # disabled also on original HF space due to poor ratings
- # 'suno/bark': '3#0', # Hallucinates
- # 'shivammehta25/Matcha-TTS': '5#0', # seems to require multiple requests for setup
- # 'styletts2/styletts2': '0#0', # API disabled, awaiting approval of PR #15
- # 'Manmay/tortoise-tts': '/predict#0', # Cannot retrieve streamed file; 403
- # 'pytorch/Tacotron2': '0#0', # old gradio
- }
-
- HF_SPACES = {
- # XTTS v2
- 'coqui/xtts': {
- 'name': 'XTTS v2',
- 'function': '1',
- 'text_param_index': 0,
- 'return_audio_index': 1,
- 'series': 'XTTS',
- },
- # WhisperSpeech
- 'collabora/WhisperSpeech': {
- 'name': 'WhisperSpeech',
- 'function': '/whisper_speech_demo',
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'series': 'WhisperSpeech',
- },
- # OpenVoice (MyShell.ai)
- 'myshell-ai/OpenVoice': {
- 'name':'OpenVoice',
- 'function': '1',
- 'text_param_index': 0,
- 'return_audio_index': 1,
- 'series': 'OpenVoice',
- },
- # OpenVoice v2 (MyShell.ai)
- 'myshell-ai/OpenVoiceV2': {
- 'name':'OpenVoice v2',
- 'function': '1',
- 'text_param_index': 0,
- 'return_audio_index': 1,
- 'series': 'OpenVoice',
- },
- # MetaVoice
- 'mrfakename/MetaVoice-1B-v0.1': {
- 'name':'MetaVoice-1B',
- 'function': '/tts',
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'series': 'MetaVoice-1B',
- },
- # xVASynth (CPU)
- 'Pendrokar/xVASynth-TTS': {
- 'name': 'xVASynth v3',
- 'function': '/predict',
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'series': 'xVASynth',
- },
- # CoquiTTS (CPU)
- 'coqui/CoquiTTS': {
- 'name': 'CoquiTTS',
- 'function': '0',
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'series': 'CoquiTTS',
- },
- # HierSpeech_TTS
- 'LeeSangHoon/HierSpeech_TTS': {
- 'name': 'HierSpeech++',
- 'function': '/predict',
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'series': 'HierSpeech++',
- },
- # MeloTTS (MyShell.ai)
- 'mrfakename/MeloTTS': {
- 'name': 'MeloTTS',
- 'function': '/synthesize',
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'series': 'MeloTTS',
- },
-
- # Parler
- 'parler-tts/parler_tts': {
- 'name': 'Parler Mini',
- 'function': '/gen_tts',
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'is_zero_gpu_space': True,
- 'series': 'Parler',
- },
- # Parler Large
- # 'parler-tts/parler_tts': {
- # 'name': 'Parler Large',
- # 'function': '/gen_tts',
- # 'text_param_index': 0,
- # 'return_audio_index': 0,
- # 'is_zero_gpu_space': True,
- # 'series': 'Parler',
- # },
- # Parler Mini using the Expresso dataset
- 'parler-tts/parler-tts-expresso': {
- 'name': 'Parler Mini Expresso',
- 'function': '/gen_tts',
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'is_zero_gpu_space': True,
- 'series': 'Parler',
- },
-
- # Microsoft Edge TTS
- 'innoai/Edge-TTS-Text-to-Speech': {
- 'name': 'Microsoft™ Edge TTS',
- 'function': '/predict',
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'is_closed_off': True,
- 'series': 'Edge TTS',
- },
-
- # Fish Speech
- 'fishaudio/fish-speech-1': {
- 'name': 'Fish Speech',
- 'function': '/inference_wrapper',
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'series': 'Fish Speech',
- },
-
- # E2/F5 TTS
- 'mrfakename/E2-F5-TTS': {
- 'name': 'F5 TTS',
- 'function': '/basic_tts',
- 'text_param_index': 2,
- 'return_audio_index': 0,
- 'is_zero_gpu_space': True,
- 'series': 'E2/F5 TTS',
- },
-
- # IMS-Toucan
- 'Flux9665/MassivelyMultilingualTTS': {
- 'name': 'IMS-Toucan',
- 'function': "/predict",
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'series': 'IMS-Toucan',
- },
-
- # IMS-Toucan English non-artificial
- 'Flux9665/EnglishToucan': {
- 'name': 'IMS-Toucan EN',
- 'function': "/predict",
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'is_zero_gpu_space': True,
- 'series': 'IMS-Toucan',
- },
-
- # StyleTTS v2
- 'Pendrokar/style-tts-2': {
- 'name': 'StyleTTS v2',
- 'function': '/synthesize',
- 'text_param_index': 0,
- 'return_audio_index': 0,
- # 'is_zero_gpu_space': True,
- 'series': 'StyleTTS',
- },
-
- # StyleTTS v2 kokoro fine tune
- 'hexgrad/kokoro': {
- 'name': 'StyleTTS Kokoro',
- 'function': '/generate',
- 'text_param_index': 0,
- 'return_audio_index': 0,
- 'is_zero_gpu_space': True,
- 'series': 'StyleTTS',
- },
-
- # MaskGCT (by Amphion)
- 'amphion/maskgct': {
- 'name': 'MaskGCT',
- 'function': '/predict',
- 'text_param_index': 1,
- 'return_audio_index': 0,
- 'is_zero_gpu_space': True,
- 'series': 'MaskGCT',
- },
- 'Svngoku/maskgct-audio-lab': {
- 'name': 'MaskGCT',
- 'function': '/predict',
- 'text_param_index': 1,
- 'return_audio_index': 0,
- 'is_zero_gpu_space': True,
- 'series': 'MaskGCT',
- },
-
- # TTS w issues
- # 'PolyAI/pheme': '/predict#0', #sleepy HF Space
- # 'amphion/Text-to-Speech': '/predict#0', #takes a whole minute to synthesize
- # 'suno/bark': '3#0', # Hallucinates
- # 'shivammehta25/Matcha-TTS': '5#0', #seems to require multiple requests for setup
- # 'styletts2/styletts2': '0#0', #API disabled
- # 'Manmay/tortoise-tts': '/predict#0', #Cannot skip text-from-file parameter
- # 'pytorch/Tacotron2': '0#0', #old gradio
- # 'fishaudio/fish-speech-1': '/inference_wrapper#0', heavy hallucinations
- }
-
- # for zero-shot TTS - voice sample used by XTTS (11 seconds)
- DEFAULT_VOICE_SAMPLE_STR = 'https://cdn-uploads.huggingface.co/production/uploads/63d52e0c4e5642795617f668/V6-rMmI-P59DA4leWDIcK.wav'
- DEFAULT_VOICE_SAMPLE = handle_file(DEFAULT_VOICE_SAMPLE_STR)
- DEFAULT_VOICE_TRANSCRIPT = "The Hispaniola was rolling scuppers under in the ocean swell. The booms were tearing at the blocks, the rudder was banging to and fro, and the whole ship creaking, groaning, and jumping like a manufactory."
-
- OVERRIDE_INPUTS = {
- 'coqui/xtts': {
- 1: 'en',
- 2: DEFAULT_VOICE_SAMPLE_STR, # voice sample
- 3: None, # mic voice sample
- 4: False, #use_mic
- 5: False, #cleanup_reference
- 6: False, #auto_detect
- },
- 'collabora/WhisperSpeech': {
- 1: DEFAULT_VOICE_SAMPLE, # voice sample
- 2: DEFAULT_VOICE_SAMPLE, # voice sample URL
- 3: 14.0, #Tempo - Gradio Slider issue: takes min. rather than value
- },
- 'myshell-ai/OpenVoice': {
- 1: 'default', # style
- 2: 'https://huggingface.co/spaces/myshell-ai/OpenVoiceV2/resolve/main/examples/speaker0.mp3', # voice sample
- },
- 'myshell-ai/OpenVoiceV2': {
- 1: 'en_us', # style
- 2: 'https://huggingface.co/spaces/myshell-ai/OpenVoiceV2/resolve/main/examples/speaker0.mp3', # voice sample
- },
- 'PolyAI/pheme': {
- 1: 'YOU1000000044_S0000798', # voice
- 2: 210,
- 3: 0.7, #Tempo - Gradio Slider issue: takes min. rather than value
- },
- 'Pendrokar/xVASynth-TTS': {
- 1: 'x_ex04', #fine-tuned voice model name
- 3: 1.0, #pacing/duration - Gradio Slider issue: takes min. rather than value
- },
- 'suno/bark': {
- 1: 'Speaker 3 (en)', # voice
- },
- 'amphion/Text-to-Speech': {
- 1: 'LikeManyWaters', # voice
- },
- 'LeeSangHoon/HierSpeech_TTS': {
- 1: handle_file('https://huggingface.co/spaces/LeeSangHoon/HierSpeech_TTS/resolve/main/example/female.wav'), # voice sample
- 2: 0.333,
- 3: 0.333,
- 4: 1,
- 5: 1,
- 6: 0,
- 7: 1111,
- },
- 'Manmay/tortoise-tts': {
- 1: None, # text-from-file
- 2: 'angie', # voice
- 3: 'disabled', # second voice for a dialogue
- 4: 'No', # split by newline
- },
- 'mrfakename/MeloTTS': {
- 1: 'EN-Default', # speaker; DEFAULT_VOICE_SAMPLE=EN-Default
- 2: 1, # speed
- 3: 'EN', # language
- },
- 'mrfakename/MetaVoice-1B-v0.1': {
- 1: 5, # float (numeric value between 0.0 and 10.0) in 'Speech Stability - improves text following for a challenging speaker' Slider component
- 2: 5, # float (numeric value between 1.0 and 5.0) in 'Speaker similarity - How closely to match speaker identity and speech style.' Slider component
- 3: "Preset voices", # Literal['Preset voices', 'Upload target voice'] in 'Choose voice' Radio component
- 4: "Bria", # Literal['Bria', 'Alex', 'Jacob'] in 'Preset voices' Dropdown component
- 5: None, # filepath in 'Upload a clean sample to clone. Sample should contain 1 speaker, be between 30-90 seconds and not contain background noise.' Audio component
- },
- 'parler-tts/parler_tts': {
- 1: 'Laura; Laura\'s female voice; very clear audio', # description/prompt
- },
- 'parler-tts/parler-tts-expresso': {
- 1: 'Elisabeth; Elisabeth\'s female voice; very clear audio', # description/prompt
- },
- 'innoai/Edge-TTS-Text-to-Speech': {
- 1: 'en-US-EmmaMultilingualNeural - en-US (Female)', # voice
- 2: 0, # pace rate
- 3: 0, # pitch
- },
-
- 'fishaudio/fish-speech-1': {
- 1: False, # normalize
- 2: handle_file('https://huggingface.co/spaces/fishaudio/fish-speech-1/resolve/main/examples/English.wav'), # reference_audio
- 3: 'In the ancient land of Eldoria, where the skies were painted with shades of mystic hues and the forests whispered secrets of old, there existed a dragon named Zephyros. Unlike the fearsome tales of dragons that plagued human hearts with terror, Zephyros was a creature of wonder and wisdom, revered by all who knew of his existence.', # reference_text
- 4: 0, # max_new_tokens
- 5: 200, # chunk_length
- 6: 0.7, # top_p
- 7: 1.2, # repetition_penalty
- 8: 0.7, # temperature
- 9: 0, #seed
- 10: "never", #use_memory_cache
- },
-
- 'mrfakename/E2-F5-TTS': {
- 0: DEFAULT_VOICE_SAMPLE, # voice sample
- 1: DEFAULT_VOICE_TRANSCRIPT, # transcript of sample (< 15 seconds required)
- 3: False, # cleanup silence
- 4: 0.15, #crossfade
- 5: 32, #nfe_slider
- 6: 1, #speed
- },
-
- # IMS-Toucan
- 'Flux9665/MassivelyMultilingualTTS': {
- 1: "English (eng)", #language
- 2: 0.6, #prosody_creativity
- 3: 1, #duration_scaling_factor
- 4: 41, #voice_seed
- 5: 7.5, #emb1
- 6: None, #reference_audio
- },
-
- # StyleTTS 2
- 'Pendrokar/style-tts-2': {
- 1: "f-us-2", #voice
- 2: 'en-us', # lang
- 3: 8, # lngsteps
- },
-
- # StyleTTS 2 kokoro
- 'hexgrad/kokoro': {
- 1: "af", #voice
- 2: None, #ps
- 3: 1, #speed
- 4: 3000, #trim
- 5: False, #use_gpu; fast enough with multithreaded with CPU
- 6: os.getenv('KOKORO'), #sk
- },
-
- # maskGCT (by amphion)
- 'amphion/maskgct': {
- 0: DEFAULT_VOICE_SAMPLE, #prompt_wav
- 2: -1, #target_len
- 3: 25, #n_timesteps
- },
- 'Svngoku/maskgct-audio-lab': {
- 0: DEFAULT_VOICE_SAMPLE, #prompt_wav
- 2: -1, #target_len
- 3: 25, #n_timesteps
- },
- }
-
- hf_clients: Dict[str, Client] = {}
- # cache audio samples for quick voting
- cached_samples: List[Sample] = []
- voting_users = {
- # userid as the key and User() as the value
- }
- # top five models in order to always have one of them picked and scrutinized
- top_five = ['fishaudio/fish-speech-1'] # fish 1.5
-
- def generate_matching_pairs(samples: List[Sample]) -> List[Tuple[Sample, Sample]]:
- transcript_groups: Dict[str, List[Sample]] = {}
- samples = random.sample(samples, k=len(samples))
- for sample in samples:
- if sample.transcript not in transcript_groups:
- transcript_groups[sample.transcript] = []
- transcript_groups[sample.transcript].append(sample)
-
- matching_pairs: List[Tuple[Sample, Sample]] = []
- for group in transcript_groups.values():
- matching_pairs.extend(list(itertools.combinations(group, 2)))
-
- return matching_pairs
-
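A hypothetical usage sketch of `generate_matching_pairs` (the `Sample` values below are invented for illustration): samples are grouped by transcript, so only models that synthesized the same sentence get paired for a vote.

# Hypothetical data: two samples share a transcript, one does not.
s1 = Sample('a.wav', 'Hello world.', 'model-a')
s2 = Sample('b.wav', 'Hello world.', 'model-b')
s3 = Sample('c.wav', 'Goodbye.', 'model-c')
pairs = generate_matching_pairs([s1, s2, s3])
assert len(pairs) == 1  # one 2-combination within the 'Hello world.' group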
- cached_audio = []
-
- # @spaces.GPU(duration=10)
- def asr_cached_for_dataset():
-
- for caudio in cached_audio:
- pass
- return True
-
- # List[Tuple[Sample, Sample]]
- all_pairs = []
-
- SPACE_ID = os.getenv('SPACE_ID')
- MAX_SAMPLE_TXT_LENGTH = 300
- MIN_SAMPLE_TXT_LENGTH = 10
- DB_DATASET_ID = os.getenv('DATASET_ID')
- DB_NAME = "database.db"
-
- SPACE_ID = 'TTS-AGI/TTS-Arena'
-
- # If /data available => means local storage is enabled => let's use it!
- DB_PATH = f"/data/{DB_NAME}" if os.path.isdir("/data") else DB_NAME
- print(f"Using {DB_PATH}")
- # AUDIO_DATASET_ID = "ttseval/tts-arena-new"
- CITATION_TEXT = """@misc{tts-arena,
- title = {Text to Speech Arena},
- author = {mrfakename and Srivastav, Vaibhav and Fourrier, Clémentine and Pouget, Lucain and Lacombe, Yoach and main and Gandhi, Sanchit},
- year = 2024,
- publisher = {Hugging Face},
- howpublished = "\\url{https://huggingface.co/spaces/TTS-AGI/TTS-Arena}"
- }"""
-
- ####################################
- # Functions
- ####################################
-
- def create_db_if_missing():
- conn = get_db()
- cursor = conn.cursor()
- cursor.execute('''
- CREATE TABLE IF NOT EXISTS model (
- name TEXT UNIQUE,
- upvote INTEGER,
- downvote INTEGER
- );
- ''')
- cursor.execute('''
- CREATE TABLE IF NOT EXISTS vote (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- username TEXT,
- model TEXT,
- vote INTEGER,
- timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
- );
- ''')
- cursor.execute('''
- CREATE TABLE IF NOT EXISTS votelog (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- username TEXT,
- chosen TEXT,
- rejected TEXT,
- timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
- );
- ''')
- cursor.execute('''
- CREATE TABLE IF NOT EXISTS spokentext (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- votelog_id INTEGER UNIQUE,
- spokentext TEXT,
- lang TEXT,
- timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
- );
- ''')
- # foreign keys
- cursor.execute('''
- CREATE UNIQUE INDEX IF NOT EXISTS st_to_vl ON spokentext(votelog_id);
- ''')
- def get_db():
- return sqlite3.connect(DB_PATH)
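Since `spokentext` rows reference `votelog` rows via `votelog_id`, the sentence behind each cast vote can be recovered with a join; a sketch (this query is illustrative and not part of the app):

# Illustrative lookup, not in the app: which sentence was each vote cast on?
conn = get_db()
rows = conn.execute('''
    SELECT v.chosen, v.rejected, s.spokentext
    FROM votelog v
    JOIN spokentext s ON s.votelog_id = v.id
''').fetchall()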
-
-
-
- ####################################
- # Space initialization
- ####################################
-
- # Download existing DB
- if not os.path.isfile(DB_PATH):
- print("Downloading DB...")
- try:
- cache_path = hf_hub_download(repo_id=DB_DATASET_ID, repo_type='dataset', filename=DB_NAME)
- shutil.copyfile(cache_path, DB_PATH)
- print("Downloaded DB")
- except Exception as e:
- print("Error while downloading DB:", e)
-
- # Create DB table (if doesn't exist)
- create_db_if_missing()
-
- hf_token = os.getenv('HF_TOKEN')
- # Sync local DB with remote repo every 5 minutes (only if a change is detected)
- scheduler = CommitScheduler(
- repo_id=DB_DATASET_ID,
- repo_type="dataset",
- folder_path=Path(DB_PATH).parent,
- every=5,
- allow_patterns=DB_NAME,
- )
-
- # Load audio dataset
- # audio_dataset = load_dataset(AUDIO_DATASET_ID)
-
-
- # prioritize low vote models
- sql = 'SELECT name FROM model WHERE (upvote + downvote) < 750 ORDER BY (upvote + downvote) ASC'
- conn = get_db()
- cursor = conn.cursor()
- cursor.execute(sql)
- data = cursor.fetchall()
- for model in data:
- if (
- len(top_five) >= 5
- ):
- break
-
- if model[0] in AVAILABLE_MODELS.keys():
- top_five.append(model[0])
- print(f"low vote top_five: {top_five}")
-
- ####################################
- # Router API
- ####################################
- # router = Client("TTS-AGI/tts-router", hf_token=hf_token)
- router = {}
- ####################################
- # Gradio app
- ####################################
- MUST_BE_LOGGEDIN = "Please login with Hugging Face to participate in the TTS Arena."
- DESCR = """
- # TTS Spaces Arena: Benchmarking Gradio hosted TTS Models in the Wild
-
- Vote to help the community find the best available text-to-speech model!
- """.strip()
- INSTR = """
- ## 🗳️ Vote
-
- * Press ⚡ to get cached sample pairs you've yet to vote on. (Fast 🐇)
- * Or press 🎲 to randomly use a sentence from the list. (Slow 🐢)
- * Or input text (🇺🇸 English only) to synthesize audio. (Slowest 🐌 due to _Toxicity_ test)
- * Listen to the two audio clips, one after the other.
- * _Vote on which audio sounds more natural to you._
- * Model names are revealed after the vote is cast.
-
- ⚠ Note: It **may take up to 30 seconds** to ***synthesize*** audio.
- """.strip()
- request = ''
- if SPACE_ID:
- request = f"""
- ### Request a model
-
- Clone the repo of this space and add your model by adding the parameters required by its HF Space. Then make a pull request to {SPACE_ID}.
- """
- ABOUT = f"""
- ## 📄 About
-
- The TTS Arena evaluates leading speech synthesis models. It is inspired by LMSYS's [Chatbot Arena](https://chat.lmsys.org/).
-
- ### Motivation
-
- The field of speech synthesis has long lacked an accurate method to measure the quality of different models. Objective metrics like WER (word error rate) are unreliable measures of model quality, and subjective measures such as MOS (mean opinion score) are typically small-scale experiments conducted with few listeners. As a result, these measurements are generally not useful for comparing two models of roughly similar quality. To address these drawbacks, we are inviting the community to rank models in an easy-to-use interface, and opening it up to the public in order to make both the opportunity to rank models and the results more easily accessible to everyone.
-
- ### The Arena
-
- The leaderboard allows a user to enter text, which will be synthesized by two models. After listening to each sample, the user can vote on which model sounds more natural. Due to the risks of human bias and abuse, model names are revealed only after a vote is submitted.
-
- ### Credits
-
- Thank you to the following individuals who helped make this* project possible:
-
- * VB ([Twitter](https://twitter.com/reach_vb) / [Hugging Face](https://huggingface.co/reach-vb))
- * Clémentine Fourrier ([Twitter](https://twitter.com/clefourrier) / [Hugging Face](https://huggingface.co/clefourrier))
- * Lucain Pouget ([Twitter](https://twitter.com/Wauplin) / [Hugging Face](https://huggingface.co/Wauplin))
- * Yoach Lacombe ([Twitter](https://twitter.com/yoachlacombe) / [Hugging Face](https://huggingface.co/ylacombe))
- * Main Horse ([Twitter](https://twitter.com/main_horse) / [Hugging Face](https://huggingface.co/main-horse))
- * Sanchit Gandhi ([Twitter](https://twitter.com/sanchitgandhi99) / [Hugging Face](https://huggingface.co/sanchit-gandhi))
- * Apolinário Passos ([Twitter](https://twitter.com/multimodalart) / [Hugging Face](https://huggingface.co/multimodalart))
- * Pedro Cuenca ([Twitter](https://twitter.com/pcuenq) / [Hugging Face](https://huggingface.co/pcuenq))
-
- \* ***You are currently in a cloned/forked space of TTS-AGI/TTS-Arena***
-
- {request}
-
- ### Privacy statement
-
- We may store text you enter and generated audio. We store a unique ID for each session. You agree that we may collect, share, and/or publish any data you input for research and/or commercial purposes.
-
- ### License
-
- Generated audio clips cannot be redistributed and may be used for personal, non-commercial use only.
-
- Random sentences are sourced from a filtered subset of the [Harvard Sentences](https://www.cs.columbia.edu/~hgs/audio/harvard.html) and also from KingNish's generated LLM sentences.
- """.strip()
-
- LDESC = f"""
- ## 🏆 Leaderboard
-
- Vote to help the community determine the best text-to-speech (TTS) models.
-
- The leaderboard displays models in descending order of how natural they sound (based on votes cast by the community).
-
- Important: In order to help keep results fair, the leaderboard hides results by default until the number of votes passes a threshold. Tick the `Reveal preliminary results` checkbox to show models without sufficient votes. Please note that preliminary results may be inaccurate. [This dataset is public](https://huggingface.co/datasets/{DB_DATASET_ID}) and only saves the hardcoded sentences while keeping the voters anonymous.
- """.strip()
-
- TTS_INFO = f"""
- ## 🗣 Contenders
-
- ### Open Source TTS capabilities table
-
- See [the dataset itself below](https://huggingface.co/datasets/Pendrokar/open_tts_tracker) for the legend and more in-depth information on each model.
- """.strip()
-
- model_series = []
- for model in HF_SPACES.values():
- model_series.append('%27'+ model['series'].replace('+', '%2B') +'%27')
- TTS_DATASET_IFRAME_ORDER = '%2C+'.join(model_series)
- TTS_DATASET_IFRAME = f"""
- <iframe
- src="https://huggingface.co/datasets/Pendrokar/open_tts_tracker/embed/sql-console/default/train?sql_console=true&sql=--+The+SQL+console+is+powered+by+DuckDB+WASM+and+runs+entirely+in+the+browser.%0A--+Get+started+by+typing+a+query+or+selecting+a+view+from+the+options+below.%0ASELECT+*%2C+%22Name%22+IN+%28{TTS_DATASET_IFRAME_ORDER}%29+AS+%22In+arena%22+FROM+train+WHERE+%22Insta-clone+%F0%9F%91%A5%22+IS+NOT+NULL+ORDER+BY+%22In+arena%22+DESC+LIMIT+50%3B&views%5B%5D=train"
- frameborder="0"
- width="100%"
- height="650px"
- ></iframe>
- """.strip()
-
- # def reload_audio_dataset():
- # global audio_dataset
- # audio_dataset = load_dataset(AUDIO_DATASET_ID)
- # return 'Reload Audio Dataset'
-
- def del_db(txt):
- if not txt.lower() == 'delete db':
- raise gr.Error('You did not enter "delete db"')
-
- # Delete local + remote
- os.remove(DB_PATH)
- delete_file(path_in_repo=DB_NAME, repo_id=DB_DATASET_ID, repo_type='dataset')
-
- # Recreate
- create_db_if_missing()
- return 'Delete DB'
-
- theme = gr.themes.Base(
- font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
- )
-
- model_names = {
- 'styletts2': 'StyleTTS 2',
- 'tacotron': 'Tacotron',
- 'tacotronph': 'Tacotron Phoneme',
- 'tacotrondca': 'Tacotron DCA',
- 'speedyspeech': 'Speedy Speech',
- 'overflow': 'Overflow TTS',
- 'vits': 'VITS',
- 'vitsneon': 'VITS Neon',
- 'neuralhmm': 'Neural HMM',
- 'glow': 'Glow TTS',
- 'fastpitch': 'FastPitch',
- 'jenny': 'Jenny',
- 'tortoise': 'Tortoise TTS',
- 'xtts2': 'Coqui XTTSv2',
- 'xtts': 'Coqui XTTS',
- 'openvoice': 'MyShell OpenVoice',
- 'elevenlabs': 'ElevenLabs',
- 'openai': 'OpenAI',
- 'hierspeech': 'HierSpeech++',
- 'pheme': 'PolyAI Pheme',
- 'speecht5': 'SpeechT5',
- 'metavoice': 'MetaVoice-1B',
- }
- model_licenses = {
- 'styletts2': 'MIT',
- 'tacotron': 'BSD-3',
- 'tacotronph': 'BSD-3',
- 'tacotrondca': 'BSD-3',
- 'speedyspeech': 'BSD-3',
- 'overflow': 'MIT',
- 'vits': 'MIT',
- 'openvoice': 'MIT',
- 'vitsneon': 'BSD-3',
- 'neuralhmm': 'MIT',
- 'glow': 'MIT',
- 'fastpitch': 'Apache 2.0',
- 'jenny': 'Jenny License',
- 'tortoise': 'Apache 2.0',
- 'xtts2': 'CPML (NC)',
- 'xtts': 'CPML (NC)',
- 'elevenlabs': 'Proprietary',
- 'eleven': 'Proprietary',
- 'openai': 'Proprietary',
- 'hierspeech': 'MIT',
- 'pheme': 'CC-BY',
- 'speecht5': 'MIT',
- 'metavoice': 'Apache 2.0',
- 'whisperspeech': 'MIT',
- 'Pendrokar/xVASynth': 'GPT3',
- }
- model_links = {
- 'styletts2': 'https://github.com/yl4579/StyleTTS2',
- 'tacotron': 'https://github.com/NVIDIA/tacotron2',
- 'speedyspeech': 'https://github.com/janvainer/speedyspeech',
- 'overflow': 'https://github.com/shivammehta25/OverFlow',
- 'vits': 'https://github.com/jaywalnut310/vits',
- 'openvoice': 'https://github.com/myshell-ai/OpenVoice',
- 'neuralhmm': 'https://github.com/ketranm/neuralHMM',
- 'glow': 'https://github.com/jaywalnut310/glow-tts',
- 'fastpitch': 'https://fastpitch.github.io/',
- 'tortoise': 'https://github.com/neonbjb/tortoise-tts',
- 'xtts2': 'https://huggingface.co/coqui/XTTS-v2',
- 'xtts': 'https://huggingface.co/coqui/XTTS-v1',
- 'elevenlabs': 'https://elevenlabs.io/',
- 'openai': 'https://help.openai.com/en/articles/8555505-tts-api',
- 'hierspeech': 'https://github.com/sh-lee-prml/HierSpeechpp',
- 'pheme': 'https://github.com/PolyAI-LDN/pheme',
- 'speecht5': 'https://github.com/microsoft/SpeechT5',
- 'metavoice': 'https://github.com/metavoiceio/metavoice-src',
- }
- def model_license(name):
- print(name)
- for k, v in AVAILABLE_MODELS.items():
- if k == name:
- if v in model_licenses:
- return model_licenses[v]
- print('---')
- return 'Unknown'
- def get_leaderboard(reveal_prelim = False):
- conn = get_db()
- cursor = conn.cursor()
- sql = 'SELECT name, upvote, downvote, name AS orig_name FROM model'
- # if not reveal_prelim: sql += ' WHERE EXISTS (SELECT 1 FROM model WHERE (upvote + downvote) > 750)'
- if not reveal_prelim: sql += ' WHERE (upvote + downvote) > 300'
- cursor.execute(sql)
- data = cursor.fetchall()
- df = pd.DataFrame(data, columns=['name', 'upvote', 'downvote', 'orig_name'])
- # df['license'] = df['name'].map(model_license)
- df['name'] = df['name'].replace(model_names)
- for i in range(len(df)):
- df.loc[i, "name"] = make_link_to_space(df['name'][i], True)
- df['votes'] = df['upvote'] + df['downvote']
- # df['score'] = round((df['upvote'] / df['votes']) * 100, 2) # Percentage score
-
- ## ELO SCORE
- df['score'] = 1200
- for i in range(len(df)):
- for j in range(len(df)):
- if i != j:
- expected_a = 1 / (1 + 10 ** ((df['score'][j] - df['score'][i]) / 400))
- expected_b = 1 / (1 + 10 ** ((df['score'][i] - df['score'][j]) / 400))
- actual_a = df['upvote'][i] / df['votes'][i]
- actual_b = df['upvote'][j] / df['votes'][j]
- df.at[i, 'score'] += round(32 * (actual_a - expected_a))
- df.at[j, 'score'] += round(32 * (actual_b - expected_b))
- df['score'] = round(df['score'])
- ## ELO SCORE
- df = df.sort_values(by='score', ascending=False)
- # medals
- def assign_medal(rank, assign):
- rank = str(rank + 1)
- if assign:
- if rank == '1':
- rank += '🥇'
- elif rank == '2':
- rank += '🥈'
- elif rank == '3':
- rank += '🥉'
-
- return '#'+ rank
-
- df['order'] = [assign_medal(i, not reveal_prelim and len(df) > 2) for i in range(len(df))]
- # fetch top_five
- for orig_name in df['orig_name']:
- if (
- reveal_prelim
- and len(top_five) < 5
- and orig_name in AVAILABLE_MODELS.keys()
- ):
- top_five.append(orig_name)
-
- df = df[['order', 'name', 'score', 'votes']]
- return df
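The `## ELO SCORE` loop in `get_leaderboard` above rates models in a single pass from aggregate vote ratios rather than from per-match history. A minimal standalone sketch of the same update rule (`elo_pass` and its `stats` input are hypothetical; it visits each unordered pair once and assumes every listed model has at least one vote, as the vote-threshold filter above ensures):

def elo_pass(stats, k=32):
    # stats: {model_name: (upvotes, downvotes)}; every model starts at 1200
    ratings = {name: 1200 for name in stats}
    names = list(stats)
    for i, a in enumerate(names):
        for b in names[i + 1:]:
            # expected score of `a` vs `b` on the logistic Elo curve
            expected_a = 1 / (1 + 10 ** ((ratings[b] - ratings[a]) / 400))
            actual_a = stats[a][0] / sum(stats[a])  # aggregate upvote ratio
            actual_b = stats[b][0] / sum(stats[b])
            ratings[a] += round(k * (actual_a - expected_a))
            ratings[b] += round(k * (actual_b - (1 - expected_a)))
    return ratings

As in the loop above, ratings mutate while iterating, so the result depends on the order in which models are visited.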
-
- def make_link_to_space(model_name, for_leaderboard=False):
- # create an anchor link if an HF space
- style = 'text-decoration: underline;text-decoration-style: dotted;'
- title = ''
-
- if model_name in AVAILABLE_MODELS:
- style += 'color: var(--link-text-color);'
- title = model_name
- else:
- style += 'font-style: italic;'
- title = 'Disabled for Arena (See AVAILABLE_MODELS within code for why)'
-
- model_basename = model_name
- if model_name in HF_SPACES:
- model_basename = HF_SPACES[model_name]['name']
-
- try:
- if(
- for_leaderboard
- and HF_SPACES[model_name]['is_closed_off']
- ):
- model_basename += ' 🔐'
- title += '; 🔐 = online only or proprietary'
- except:
- pass
-
- if '/' in model_name:
- return '🤗 <a target="_blank" style="'+ style +'" title="'+ title +'" href="'+ 'https://huggingface.co/spaces/'+ model_name +'">'+ model_basename +'</a>'
-
- # otherwise just return the model name
- return '<span style="'+ style +'" title="'+ title +'" href="'+ 'https://huggingface.co/spaces/'+ model_name +'">'+ model_name +'</span>'
-
- def markdown_link_to_space(model_name):
- # create an anchor link if an HF space using markdown syntax
- if '/' in model_name:
- return '🤗 [' + model_name + '](https://huggingface.co/spaces/' + model_name + ')'
- # otherwise just return the model name
- return model_name
-
- def mkuuid(uid):
- if not uid:
- uid = uuid.uuid4()
- return uid
- def upvote_model(model, uname):
- conn = get_db()
- cursor = conn.cursor()
- cursor.execute('UPDATE model SET upvote = upvote + 1 WHERE name = ?', (model,))
- if cursor.rowcount == 0:
- cursor.execute('INSERT OR REPLACE INTO model (name, upvote, downvote) VALUES (?, 1, 0)', (model,))
- cursor.execute('INSERT INTO vote (username, model, vote) VALUES (?, ?, ?)', (uname, model, 1,))
- with scheduler.lock:
- conn.commit()
- cursor.close()
- def log_text(text, voteid):
- # log only hardcoded sentences
- if (text not in sents):
- return
-
- conn = get_db()
- cursor = conn.cursor()
- # TODO: multilang
- cursor.execute('INSERT INTO spokentext (spokentext, lang, votelog_id) VALUES (?,?,?)', (text,'en',voteid))
- with scheduler.lock:
- conn.commit()
- cursor.close()
- def downvote_model(model, uname):
- conn = get_db()
- cursor = conn.cursor()
- cursor.execute('UPDATE model SET downvote = downvote + 1 WHERE name = ?', (model,))
- if cursor.rowcount == 0:
- cursor.execute('INSERT OR REPLACE INTO model (name, upvote, downvote) VALUES (?, 0, 1)', (model,))
- cursor.execute('INSERT INTO vote (username, model, vote) VALUES (?, ?, ?)', (uname, model, -1,))
- with scheduler.lock:
- conn.commit()
- cursor.close()
-
- def a_is_better(model1, model2, userid, text):
- return is_better(model1, model2, userid, text, True)
- def b_is_better(model1, model2, userid, text):
- return is_better(model1, model2, userid, text, False)
-
- def is_better(model1, model2, userid, text, chose_a):
- if(
- (
- not model1 in AVAILABLE_MODELS.keys()
- and not model1 in AVAILABLE_MODELS.values()
- )
- or (
- not model2 in AVAILABLE_MODELS.keys()
- and not model2 in AVAILABLE_MODELS.values()
- )
- ):
- raise gr.Error('Sorry, please try voting again.')
-
- # userid is unique for each cast vote pair
- userid = mkuuid(userid)
- if model1 and model2:
- conn = get_db()
- cursor = conn.cursor()
- sql_query = 'INSERT INTO votelog (username, chosen, rejected) VALUES (?, ?, ?)'
- if chose_a:
- cursor.execute(sql_query, (str(userid), model1, model2))
- else:
- cursor.execute(sql_query, (str(userid), model2, model1))
-
- with scheduler.lock:
- conn.commit()
- # also retrieve primary key ID
- cursor.execute('SELECT last_insert_rowid()')
- votelogid = cursor.fetchone()[0]
- cursor.close()
-
- if chose_a:
- upvote_model(model1, str(userid))
- downvote_model(model2, str(userid))
- else:
- upvote_model(model2, str(userid))
- downvote_model(model1, str(userid))
- log_text(text, votelogid)
-
- return reload(model1, model2, userid, chose_a=chose_a, chose_b=(not chose_a))
-
- def both_bad(model1, model2, userid):
- userid = mkuuid(userid)
- if model1 and model2:
- downvote_model(model1, str(userid))
- downvote_model(model2, str(userid))
- return reload(model1, model2, userid)
- def both_good(model1, model2, userid):
- userid = mkuuid(userid)
- if model1 and model2:
- upvote_model(model1, str(userid))
- upvote_model(model2, str(userid))
- return reload(model1, model2, userid)
- def reload(chosenmodel1=None, chosenmodel2=None, userid=None, chose_a=False, chose_b=False):
- # Select random splits
- chosenmodel1 = make_link_to_space(chosenmodel1)
- chosenmodel2 = make_link_to_space(chosenmodel2)
- out = [
- gr.update(interactive=False, visible=False),
- gr.update(interactive=False, visible=False)
- ]
- style = 'text-align: center; font-size: 1rem; margin-bottom: 0; padding: var(--input-padding)'
- if chose_a == True:
- out.append(gr.update(value=f'<p style="{style}">Your vote: {chosenmodel1}</p>', visible=True))
- out.append(gr.update(value=f'<p style="{style}">{chosenmodel2}</p>', visible=True))
- else:
- out.append(gr.update(value=f'<p style="{style}">{chosenmodel1}</p>', visible=True))
- out.append(gr.update(value=f'<p style="{style}">Your vote: {chosenmodel2}</p>', visible=True))
- out.append(gr.update(visible=True))
- return out
-
- with gr.Blocks() as leaderboard:
- gr.Markdown(LDESC)
- # df = gr.Dataframe(interactive=False, value=get_leaderboard())
- df = gr.Dataframe(
- interactive=False,
- min_width=0,
- wrap=True,
- column_widths=[30, 200, 50, 50],
- datatype=["str", "html", "number", "number"]
- )
- with gr.Row():
- reveal_prelim = gr.Checkbox(label="Reveal preliminary results", info="Show all models, including models with very few human ratings.", scale=1)
- reloadbtn = gr.Button("Refresh", scale=3)
- reveal_prelim.input(get_leaderboard, inputs=[reveal_prelim], outputs=[df])
- leaderboard.load(get_leaderboard, inputs=[reveal_prelim], outputs=[df])
- reloadbtn.click(get_leaderboard, inputs=[reveal_prelim], outputs=[df])
- # gr.Markdown("DISCLAIMER: The licenses listed may not be accurate or up to date, you are responsible for checking the licenses before using the models. Also note that some models may have additional usage restrictions.")
-
- def doloudnorm(path):
- data, rate = sf.read(path)
- meter = pyln.Meter(rate)
- loudness = meter.integrated_loudness(data)
- loudness_normalized_audio = pyln.normalize.loudness(data, loudness, -12.0)
- sf.write(path, loudness_normalized_audio, rate)
-
- def doresample(path_to_wav):
- pass
- ##########################
- # 2x speedup (hopefully) #
- ##########################
-
- def synthandreturn(text, request: gr.Request):
- text = text.strip()
- if len(text) > MAX_SAMPLE_TXT_LENGTH:
- raise gr.Error(f'You exceeded the limit of {MAX_SAMPLE_TXT_LENGTH} characters')
- if len(text) < MIN_SAMPLE_TXT_LENGTH:
- raise gr.Error(f'Please input a text longer than {MIN_SAMPLE_TXT_LENGTH} characters')
- if (
- # test toxicity if not a prepared text
- text not in sents
- and toxicity.predict(text)['toxicity'] > 0.8
- ):
- print(f'Detected toxic content! "{text}"')
- raise gr.Error('Your text failed the toxicity test')
- if not text:
- raise gr.Error(f'You did not enter any text')
- # Check language
- try:
- if (
- text not in sents
- and not detect(text) == "en"
- ):
- gr.Warning('Warning: The input text may not be in English')
- except:
- pass
- # Get two random models
-
- # forced model: your TTS model versus The World!!!
- # mdl1 = 'Pendrokar/xVASynth'
-
- # scrutinize the top five by always picking one of them
- if (len(top_five) >= 5):
- mdl1 = random.sample(top_five, 1)[0]
- vsModels = dict(AVAILABLE_MODELS)
- del vsModels[mdl1]
- # randomize position of the forced model
- mdl2 = random.sample(list(vsModels.keys()), 1)
- # forced random
- mdl1, mdl2 = random.sample(list([mdl1, mdl2[0]]), 2)
- else:
- # actual random
- mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
-
- print("[debug] Using", mdl1, mdl2)
- def predict_and_update_result(text, model, result_storage, request:gr.Request):
-
- hf_headers = {}
- try:
- if HF_SPACES[model]['is_zero_gpu_space']:
- hf_headers = {"X-IP-Token": request.headers['x-ip-token']}
- except:
- pass
-
- # re-attempt if necessary
- attempt_count = 0
- max_attempts = 1 # 3 may cause 429 Too Many Requests
- while attempt_count < max_attempts:
- try:
- if model in AVAILABLE_MODELS:
- if '/' in model:
- # Use public HF Space
- # if (model not in hf_clients):
- # hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
- mdl_space = Client(model, hf_token=hf_token, headers=hf_headers)
-
- # print(f"{model}: Fetching endpoints of HF Space")
- # assume the index is one of the first 9 return params
- return_audio_index = int(HF_SPACES[model]['return_audio_index'])
- endpoints = mdl_space.view_api(all_endpoints=True, print_info=False, return_format='dict')
-
- api_name = None
- fn_index = None
- end_parameters = None
- # has named endpoint
- if '/' == HF_SPACES[model]['function'][0]:
- # audio sync function name
- api_name = HF_SPACES[model]['function']
-
- end_parameters = _get_param_examples(
- endpoints['named_endpoints'][api_name]['parameters']
- )
- # has unnamed endpoint
- else:
- # endpoint index is the first character
- fn_index = int(HF_SPACES[model]['function'])
-
- end_parameters = _get_param_examples(
- endpoints['unnamed_endpoints'][str(fn_index)]['parameters']
- )
-
- # override some or all default parameters
- space_inputs = _override_params(end_parameters, model)
-
- # force text
- space_inputs[HF_SPACES[model]['text_param_index']] = text
-
- print(f"{model}: Sending request to HF Space")
- results = mdl_space.predict(*space_inputs, api_name=api_name, fn_index=fn_index)
-
- # return path to audio
- result = results
- if (not isinstance(results, str)):
- # return_audio_index may be a filepath string
- result = results[return_audio_index]
- if (isinstance(result, dict)):
- # return_audio_index is a dictionary
- result = results[return_audio_index]['value']
- else:
- # Use the private HF Space
- result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
- else:
- result = router.predict(text, model.lower(), api_name="/synthesize")
- break
- except Exception as e:
- attempt_count += 1
- raise gr.Error(f"{model}:"+ repr(e))
- # print(f"{model}: Unable to call API (attempt: {attempt_count})")
- # sleep for three seconds to avoid spamming the server with requests
- # time.sleep(3)
-
- # Fetch and store client again
- # hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
-
- if attempt_count >= max_attempts:
- raise gr.Error(f"{model}: Failed to call model")
- else:
- print('Done with', model)
-
- try:
- with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
- audio = AudioSegment.from_file(result)
- current_sr = audio.frame_rate
- if current_sr > 24000:
- print(f"{model}: Resampling")
- audio = audio.set_frame_rate(24000)
- try:
- print(f"{model}: Trying to normalize audio")
- audio = match_target_amplitude(audio, -20)
- except:
- print(f"{model}: [WARN] Unable to normalize audio")
- audio.export(f.name, format="wav")
- os.unlink(result)
- result = f.name
- gr.Info('Audio from a TTS model received')
- except:
- print(f"{model}: [WARN] Unable to resample audio")
- pass
- if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
- result_storage[model] = result
-
- def _get_param_examples(parameters):
- example_inputs = []
- for param_info in parameters:
- if (
- param_info['component'] == 'Radio'
- or param_info['component'] == 'Dropdown'
- or param_info['component'] == 'Audio'
- or param_info['python_type']['type'] == 'str'
- ):
- example_inputs.append(str(param_info['example_input']))
- continue
- if param_info['python_type']['type'] == 'int':
- example_inputs.append(int(param_info['example_input']))
- continue
- if param_info['python_type']['type'] == 'float':
- example_inputs.append(float(param_info['example_input']))
- continue
- if param_info['python_type']['type'] == 'bool':
- example_inputs.append(bool(param_info['example_input']))
- continue
-
- return example_inputs
-
- def _override_params(inputs, modelname):
- try:
- for key,value in OVERRIDE_INPUTS[modelname].items():
- inputs[key] = value
- print(f"{modelname}: Default inputs overridden by Arena")
- except:
- pass
-
- return inputs
-
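Taken together, `_get_param_examples` and `_override_params` turn a space's advertised example inputs into a positional argument list; a sketch with invented endpoint defaults of how the prompt text is then forced into `text_param_index` before calling the space:

# The default values below are invented; real ones come from view_api().
defaults = ['sample text', 'af', None, 1, 3000, False, None]
inputs = _override_params(defaults, 'hexgrad/kokoro')  # apply OVERRIDE_INPUTS
inputs[HF_SPACES['hexgrad/kokoro']['text_param_index']] = 'The birch canoe slid on the smooth planks.'
# inputs is now ready for mdl_space.predict(*inputs, ...)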
- def _cache_sample(text, model):
- # skip caching if not hardcoded sentence
- if (text not in sents):
- return False
-
- already_cached = False
- # check if already cached
- for cached_sample in cached_samples:
- # TODO: replace cached with newer version
- if (cached_sample.transcript == text and cached_sample.modelName == model):
- already_cached = True
- return True
-
- if (already_cached):
- return False
-
- try:
- cached_samples.append(Sample(results[model], text, model))
- except:
- print('Error when trying to cache sample')
- return False
-
- mdl1k = mdl1
- mdl2k = mdl2
- print(mdl1k, mdl2k)
- if mdl1 in AVAILABLE_MODELS.keys(): mdl1k=AVAILABLE_MODELS[mdl1]
- if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
- results = {}
- print(f"Sending models {mdl1k} and {mdl2k} to API")
-
- # do not use multithreading when both spaces are ZeroGPU type
- if (
- # exists
- 'is_zero_gpu_space' in HF_SPACES[mdl1]
- # is True
- and HF_SPACES[mdl1]['is_zero_gpu_space']
- and 'is_zero_gpu_space' in HF_SPACES[mdl2]
- and HF_SPACES[mdl2]['is_zero_gpu_space']
- ):
- # run Zero-GPU spaces one at a time
- predict_and_update_result(text, mdl1k, results, request)
- _cache_sample(text, mdl1k)
-
- predict_and_update_result(text, mdl2k, results, request)
- _cache_sample(text, mdl2k)
- else:
- # use multithreading
- thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results, request))
- thread2 = threading.Thread(target=predict_and_update_result, args=(text, mdl2k, results, request))
-
- thread1.start()
- # wait 3 seconds to calm hf.space domain
- time.sleep(3)
- thread2.start()
- # timeout in 2 minutes
- thread1.join(120)
- thread2.join(120)
-
- # cache the result
- for model in [mdl1k, mdl2k]:
- _cache_sample(text, model)
-
- #debug
- # print(results)
- # print(list(results.keys())[0])
- # y, sr = librosa.load(results[list(results.keys())[0]], sr=None)
- # print(sr)
- # print(list(results.keys())[1])
- # y, sr = librosa.load(results[list(results.keys())[1]], sr=None)
- # print(sr)
- #debug
- # outputs = [text, btn, r2, model1, model2, aud1, aud2, abetter, bbetter, prevmodel1, prevmodel2, nxtroundbtn]
-
- # all_pairs = generate_matching_pairs(cached_samples)
-
- print(f"Retrieving models {mdl1k} and {mdl2k} from API")
- return (
- text,
- "Synthesize",
- gr.update(visible=True), # r2
- mdl1, # model1
- mdl2, # model2
- gr.update(visible=True, value=results[mdl1k], interactive=False, autoplay=True), # aud1
- gr.update(visible=True, value=results[mdl2k], interactive=False, autoplay=False), # aud2
- gr.update(visible=True, interactive=False), #abetter
- gr.update(visible=True, interactive=False), #bbetter
- gr.update(visible=False), #prevmodel1
- gr.update(visible=False), #prevmodel2
- gr.update(visible=False), #nxt round btn
- # reset gr.State aplayed & bplayed
- False, #aplayed
- False, #bplayed
- )
-
- def unlock_vote(btn_index, aplayed, bplayed):
- # sample played
- if btn_index == 0:
- aplayed = True
- if btn_index == 1:
- bplayed = True
-
- # both audio samples played
- if bool(aplayed) and bool(bplayed):
- # print('Both audio samples played, voting unlocked')
- return [gr.update(interactive=True), gr.update(interactive=True), True, True]
-
- return [gr.update(), gr.update(), aplayed, bplayed]
-
- def play_other(bplayed):
- return bplayed
-
- def get_userid(session_hash: str, request):
- # JS cookie
- if (session_hash != ''):
- # print('auth by session cookie')
- return sha1(bytes(session_hash.encode('ascii')), usedforsecurity=False).hexdigest()
-
- if request.username:
- # print('auth by username')
- # by HuggingFace username - requires `auth` to be enabled therefore denying access to anonymous users
- return sha1(bytes(request.username.encode('ascii')), usedforsecurity=False).hexdigest()
- else:
- # print('auth by ip')
- # by IP address - unreliable when gradio within HTML iframe
- # return sha1(bytes(request.client.host.encode('ascii')), usedforsecurity=False).hexdigest()
- # by browser session cookie - Gradio on HF is run in an HTML iframe, access to parent session required to reach session token
- # return sha1(bytes(request.headers.encode('ascii'))).hexdigest()
- # by browser session hash - Not a cookie, session hash changes on page reload
- return sha1(bytes(request.session_hash.encode('ascii')), usedforsecurity=False).hexdigest()
-
- # Give user a cached audio sample pair they have yet to vote on
- def give_cached_sample(session_hash: str, request: gr.Request):
- # add new userid to voting_users from Browser session hash
- # stored only in RAM
- userid = get_userid(session_hash, request)
-
- if userid not in voting_users:
- voting_users[userid] = User(userid)
-
- def get_next_pair(user: User):
- # FIXME: all_pairs var out of scope
- # all_pairs = generate_matching_pairs(cached_samples)
-
- # for pair in all_pairs:
- for pair in generate_matching_pairs(cached_samples):
- hash1 = md5(bytes((pair[0].modelName + pair[0].transcript).encode('ascii'))).hexdigest()
- hash2 = md5(bytes((pair[1].modelName + pair[1].transcript).encode('ascii'))).hexdigest()
- pair_key = (hash1, hash2)
- if (
- pair_key not in user.voted_pairs
- # or in reversed order
- and (pair_key[1], pair_key[0]) not in user.voted_pairs
- ):
- return pair
- return None
-
- pair = get_next_pair(voting_users[userid])
- if pair is None:
- return [
- *clear_stuff(),
- # disable get cached sample button
- gr.update(interactive=False)
- ]
-
- return (
- gr.update(visible=True, value=pair[0].transcript, elem_classes=['blurred-text']),
- "Synthesize",
- gr.update(visible=True), # r2
- pair[0].modelName, # model1
- pair[1].modelName, # model2
- gr.update(visible=True, value=pair[0].filename, interactive=False, autoplay=True), # aud1
- gr.update(visible=True, value=pair[1].filename, interactive=False, autoplay=False), # aud2
- gr.update(visible=True, interactive=False), #abetter
- gr.update(visible=True, interactive=False), #bbetter
- gr.update(visible=False), #prevmodel1
- gr.update(visible=False), #prevmodel2
- gr.update(visible=False), #nxt round btn
- # reset aplayed, bplayed audio playback events
- False, #aplayed
- False, #bplayed
- # fetch cached btn
- gr.update(interactive=True)
- )
-
- # note the vote on cached sample pair
- def voted_on_cached(modelName1: str, modelName2: str, transcript: str, session_hash: str, request: gr.Request):
- userid = get_userid(session_hash, request)
- # print(f'userid voted on cached: {userid}')
-
- if userid not in voting_users:
- voting_users[userid] = User(userid)
-
- hash1 = md5(bytes((modelName1 + transcript).encode('ascii'))).hexdigest()
- hash2 = md5(bytes((modelName2 + transcript).encode('ascii'))).hexdigest()
-
- voting_users[userid].voted_pairs.add((hash1, hash2))
- return []
-
- def randomsent():
- return '⚡', random.choice(sents), '🎲'
- def clear_stuff():
- return [
- gr.update(visible=True, value="", elem_classes=[]),
- "Synthesize",
- gr.update(visible=False), # r2
- '', # model1
- '', # model2
- gr.update(visible=False, interactive=False, autoplay=False), # aud1
- gr.update(visible=False, interactive=False, autoplay=False), # aud2
- gr.update(visible=False, interactive=False), #abetter
- gr.update(visible=False, interactive=False), #bbetter
- gr.update(visible=False), #prevmodel1
- gr.update(visible=False), #prevmodel2
- gr.update(visible=False), #nxt round btn
- False, #aplayed
- False, #bplayed
- ]
-
- def disable():
- return [gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)]
- def enable():
- return [gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)]
- def unblur_text():
- return gr.update(elem_classes=[])
-
- # JavaScript within HTML head
- head_js = ""
- unblur_js = 'document.getElementById("arena-text-input").classList.remove("blurred-text")'
- shortcut_js = """
- <script>
- function shortcuts(e) {
- var event = document.all ? window.event : e;
- switch (e.target.tagName.toLowerCase()) {
- case "input":
- case "textarea":
- break;
- default:
- switch (e.key.toLowerCase()) {
- case "a":
- document.getElementById("arena-a-better").click();
- break;
- case "b":
- document.getElementById("arena-b-better").click();
- break;
- case "n":
- document.getElementById("arena-next-round").click();
- break;
- }
- }
- }
- document.addEventListener('keypress', shortcuts, false);
-
- """
- head_js += shortcut_js
- head_js += open("cookie.js").read()
- head_js += '</script>'
-
- with gr.Blocks() as vote:
- session_hash = gr.Textbox(visible=False, value='')
-
- # sample played, using Checkbox so that JS can fetch the value
- aplayed = gr.Checkbox(visible=False, value=False)
- bplayed = gr.Checkbox(visible=False, value=False)
- # voter ID
- useridstate = gr.State()
- gr.Markdown(INSTR)
- with gr.Group():
- with gr.Row():
- cachedt = gr.Button('⚡', scale=0, min_width=0, variant='tool', interactive=True)
- text = gr.Textbox(
- container=False,
- show_label=False,
- placeholder="Enter text to synthesize",
- lines=1,
- max_lines=1,
- scale=9999999,
- min_width=0,
- elem_id="arena-text-input",
- )
- randomt = gr.Button('🎲', scale=0, min_width=0, variant='tool')
- randomt\
- .click(randomsent, outputs=[cachedt, text, randomt])\
- .then(None, js="() => "+ unblur_js)
- btn = gr.Button("Synthesize", variant='primary')
- model1 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=False)
- model2 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=False)
- with gr.Row(visible=False) as r2:
- with gr.Column():
- with gr.Group():
- aud1 = gr.Audio(
- interactive=False,
- show_label=False,
- show_download_button=False,
- show_share_button=False,
- waveform_options={'waveform_progress_color': '#EF4444'},
- # var(--color-red-500)'}); gradio only accepts HEX and CSS color
- )
- abetter = gr.Button(
- "A is better [a]",
- elem_id='arena-a-better',
- variant='primary',
- interactive=False,
- )
- prevmodel1 = gr.HTML(show_label=False, value="Vote to reveal model A", visible=False)
- with gr.Column():
- with gr.Group():
- aud2 = gr.Audio(
- interactive=False,
- show_label=False,
- show_download_button=False,
- show_share_button=False,
- waveform_options={'waveform_progress_color': '#3C82F6'},
- # var(--secondary-500)'}); gradio only accepts HEX and CSS color
- )
- bbetter = gr.Button(
- "B is better [b]",
- elem_id='arena-b-better',
- variant='primary',
- interactive=False
- )
- prevmodel2 = gr.HTML(show_label=False, value="Vote to reveal model B", visible=False)
- nxtroundbtn = gr.Button(
- '⚡ Next round [n]',
- elem_id='arena-next-round',
- visible=False
- )
- outputs = [
- text,
- btn,
- r2,
- model1,
- model2,
- aud1,
- aud2,
- abetter,
- bbetter,
- prevmodel1,
- prevmodel2,
- nxtroundbtn,
- aplayed,
- bplayed,
- ]
- """
- text,
- "Synthesize",
- gr.update(visible=True), # r2
- gr.update(visible=True), # r2
1598
- mdl1, # model1
1599
- mdl2, # model2
1600
- gr.update(visible=True, value=results[mdl1]), # aud1
1601
- gr.update(visible=True, value=results[mdl2]), # aud2
1602
- gr.update(visible=True, interactive=False), #abetter
1603
- gr.update(visible=True, interactive=False), #bbetter
1604
- gr.update(visible=False), #prevmodel1
1605
- gr.update(visible=False), #prevmodel2
1606
- gr.update(visible=False), #nxt round btn"""
1607
- btn\
1608
- .click(disable, outputs=[btn, abetter, bbetter, cachedt])\
1609
- .then(synthandreturn, inputs=[text], outputs=outputs)\
1610
- .then(enable, outputs=[btn, gr.State(), gr.State(), cachedt])\
1611
- .then(None, js="() => "+ unblur_js)
1612
- nxtroundbtn\
1613
- .click(clear_stuff, outputs=outputs)\
1614
- .then(disable, outputs=[btn, abetter, bbetter, cachedt])\
1615
- .then(give_cached_sample, inputs=[session_hash], outputs=[*outputs, cachedt])\
1616
- .then(enable, outputs=[btn, gr.State(), gr.State(), gr.State()])
1617
-
1618
- # fetch a comparison pair from cache
1619
- cachedt\
1620
- .click(disable, outputs=[btn, abetter, bbetter, cachedt])\
1621
- .then(give_cached_sample, inputs=[session_hash], outputs=[*outputs, cachedt])\
1622
- .then(enable, outputs=[btn, gr.State(), gr.State(), gr.State()])
1623
- # TODO: await download of sample before allowing playback
1624
-
1625
- # Allow interaction with the vote buttons only when both audio samples have finished playing
1626
- aud1\
1627
- .stop(
1628
- unlock_vote,
1629
- outputs=[abetter, bbetter, aplayed, bplayed],
1630
- inputs=[gr.State(value=0), aplayed, bplayed],
1631
- )\
1632
- .then(
1633
- None,
1634
- inputs=[bplayed],
1635
- js="(b) => b ? 0 : document.querySelector('.row .gap+.gap button.play-pause-button[aria-label=Play]').click()",
1636
- )
1637
- # autoplay if unplayed
1638
- aud2\
1639
- .stop(
1640
- unlock_vote,
1641
- outputs=[abetter, bbetter, aplayed, bplayed],
1642
- inputs=[gr.State(value=1), aplayed, bplayed],
1643
- )\
1644
- .then(None, js="() => "+ unblur_js)
1645
-
1646
- nxt_outputs = [abetter, bbetter, prevmodel1, prevmodel2, nxtroundbtn]
1647
- abetter\
1648
- .click(a_is_better, outputs=nxt_outputs, inputs=[model1, model2, useridstate, text])\
1649
- .then(voted_on_cached, inputs=[model1, model2, text, session_hash], outputs=[])
1650
- bbetter\
1651
- .click(b_is_better, outputs=nxt_outputs, inputs=[model1, model2, useridstate, text])\
1652
- .then(voted_on_cached, inputs=[model1, model2, text, session_hash], outputs=[])
1653
- # skipbtn.click(b_is_better, outputs=outputs, inputs=[model1, model2, useridstate])
1654
-
1655
- # bothbad.click(both_bad, outputs=outputs, inputs=[model1, model2, useridstate])
1656
- # bothgood.click(both_good, outputs=outputs, inputs=[model1, model2, useridstate])
1657
-
1658
- # get session cookie
1659
- vote\
1660
- .load(
1661
- None,
1662
- None,
1663
- session_hash,
1664
- js="() => { return getArenaCookie('session') }",
1665
- )
1666
- # give a cached sample pair to voter; .then() did not work here
1667
- vote\
1668
- .load(give_cached_sample, inputs=[session_hash], outputs=[*outputs, cachedt])
1669
-
1670
- with gr.Blocks() as about:
1671
- gr.Markdown(ABOUT)
1672
- with gr.Blocks() as tts_info:
1673
- gr.Markdown(TTS_INFO)
1674
- gr.HTML(TTS_DATASET_IFRAME)
1675
- # with gr.Blocks() as admin:
1676
- # rdb = gr.Button("Reload Audio Dataset")
1677
- # # rdb.click(reload_audio_dataset, outputs=rdb)
1678
- # with gr.Group():
1679
- # dbtext = gr.Textbox(label="Type \"delete db\" to confirm", placeholder="delete db")
1680
- # ddb = gr.Button("Delete DB")
1681
- # ddb.click(del_db, inputs=dbtext, outputs=ddb)
1682
- # Blur cached sample text so the voting user picks up mispronouncements
1683
- with gr.Blocks(theme=theme, css="footer {visibility: hidden}textbox{resize:none} .blurred-text {filter: blur(0.15em);}", head=head_js, title="TTS Arena") as demo:
1684
- gr.Markdown(DESCR)
1685
- # gr.TabbedInterface([vote, leaderboard, about, admin], ['Vote', 'Leaderboard', 'About', 'Admin (ONLY IN BETA)'])
1686
- gr.TabbedInterface([vote, leaderboard, about, tts_info], ['🗳️ Vote', '🏆 Leaderboard', '📄 About', '🗣 Contenders'])
1687
- if CITATION_TEXT:
1688
- with gr.Row():
1689
- with gr.Accordion("Citation", open=False):
1690
- gr.Markdown(f"If you use this data in your publication, please cite us!\n\nCopy the BibTeX citation to cite this source:\n\n```bibtext\n{CITATION_TEXT}\n```\n\nPlease remember that all generated audio clips should be assumed unsuitable for redistribution or commercial use.")
1691
-
1692
-
1693
- demo\
1694
- .queue(api_open=False, default_concurrency_limit=4)\
1695
- .launch(show_api=False, show_error=True)
 
1
+ from app.ui import app
2
 
3
+ if __name__ == "__main__":
4
+ app.queue(default_concurrency_limit=50, api_open=False).launch(show_api=False)
app/__init__.py ADDED
File without changes
app/config.py ADDED
@@ -0,0 +1,51 @@
1
+ import os
2
+
3
+ # NOTE: Configure models in `models.py`
4
+
5
+ #########################
6
+ # General Configuration #
7
+ #########################
8
+
9
+ DB_NAME = "database.db"
10
+
11
+ TOXICITY_CHECK = False
12
+
13
+ MAX_SAMPLE_TXT_LENGTH = 300 # Maximum text length (characters)
14
+ MIN_SAMPLE_TXT_LENGTH = 10 # Minimum text length (characters)
15
+
16
+ DB_PATH = f"/data/{DB_NAME}" if os.path.isdir("/data") else DB_NAME # If /data available => means local storage is enabled => let's use it!
17
+
18
+ ROUTER_ID = "Pendrokar/xVASynth-TTS" # You should use a router space to route TTS models to avoid exposing your API keys!
19
+ # ROUTER_ID = "TTS-AGI/tts-router" # You should use a router space to route TTS models to avoid exposing your API keys!
20
+
21
+ SYNC_DB = True # Sync DB to HF dataset?
22
+ DB_DATASET_ID = os.getenv('DATASET_ID') # HF dataset ID, can be None if not syncing
23
+
24
+ SPACE_ID = os.getenv('SPACE_ID') # Don't change this! It detects if we're running in a HF Space
25
+
26
+ sents = []
27
+ with open(os.path.dirname(__file__) + '/../harvard_sentences.txt') as f:
28
+ sents += f.read().strip().splitlines()
29
+ with open(os.path.dirname(__file__) + '/../llama3_command-r_sentences_1st_person.txt') as f:
30
+ sents += f.read().strip().splitlines()
31
+ # With other punctuation marks
32
+ # Exclamations - # conversational characters/animation entertainment/tv
33
+ with open(os.path.dirname(__file__) + '/../llama3_command-r_sentences_excla.txt') as f:
34
+ sents += f.read().strip().splitlines()
35
+ # Questions - # conversational characters/animation entertainment/tv
36
+ with open(os.path.dirname(__file__) + '/../llama3_command-r_questions.txt') as f:
37
+ sents += f.read().strip().splitlines()
38
+
39
+ # Credit: llama3_command-r sentences generated by user KingNish
40
+
41
+ ######################
42
+ # TTS Arena Settings #
43
+ ######################
44
+
45
+ CITATION_TEXT = """@misc{tts-arena,
46
+ title = {Text to Speech Arena},
47
+ author = {mrfakename and Srivastav, Vaibhav and Fourrier, Clémentine and Pouget, Lucain and Lacombe, Yoach and main and Gandhi, Sanchit},
48
+ year = 2024,
49
+ publisher = {Hugging Face},
50
+ howpublished = "\\url{https://huggingface.co/spaces/TTS-AGI/TTS-Arena}"
51
+ }"""
app/cookie.js ADDED
@@ -0,0 +1,30 @@
1
+ window.getArenaCookie = function getArenaCookie(cname) {
2
+ let name = cname + "=";
3
+ let decodedCookie = decodeURIComponent(window.document.cookie);
4
+ let ca = decodedCookie.split(';');
5
+ for (let i = 0; i < ca.length; i++) {
6
+ let c = ca[i];
7
+ while (c.charAt(0) == ' ') {
8
+ c = c.substring(1);
9
+ }
10
+ if (c.indexOf(name) == 0) {
11
+ return c.substring(name.length, c.length);
12
+ }
13
+ }
14
+ return "";
15
+ }
16
+
17
+ window.setArenaCookie = function setArenaCookie(cname, cvalue, exdays) {
18
+ const d = new Date();
19
+ d.setTime(d.getTime() + (exdays * 24 * 60 * 60 * 1000));
20
+ let expires = "expires=" + d.toUTCString();
21
+ window.document.cookie = cname + "=" + cvalue + ";" + expires + "; path=/; SameSite=None; Secure";
22
+ }
23
+
24
+ if (window.getArenaCookie('session').length == 0)
25
+ {
26
+ const d = new Date();
27
+ // store cookie for 90 days, roughly the time after which the cached audio is deleted
28
+ window.setArenaCookie('session', d.getTime().toString(), 90);
29
+ console.log('Session cookie created')
30
+ }
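
Note: the cookie written above is read back on the Python side when the page loads, by attaching this file to the Blocks head and pulling the value into a hidden Textbox via Gradio's js hook — the same pattern the monolithic app.py used before this split. A minimal sketch:

import gradio as gr

head_js = '<script>' + open('app/cookie.js').read() + '</script>'  # roughly as app/ui.py assembles it

with gr.Blocks(head=head_js) as demo:
    session_hash = gr.Textbox(visible=False, value='')
    # copy the browser's session cookie into Python-visible state on page load
    demo.load(None, None, session_hash, js="() => { return getArenaCookie('session') }")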
app/db.py ADDED
@@ -0,0 +1,54 @@
1
+ import sqlite3
2
+ from .config import *
3
+ import os
4
+ import shutil
5
+ from huggingface_hub import hf_hub_download
6
+
7
+ def download_db():
8
+ if not os.path.isfile(DB_PATH):
9
+ print("Downloading DB...")
10
+ try:
11
+ cache_path = hf_hub_download(repo_id=DB_DATASET_ID, repo_type='dataset', filename=DB_NAME)
12
+ shutil.copyfile(cache_path, DB_PATH)
13
+ print("Downloaded DB")
14
+ except Exception as e:
15
+ print("Error while downloading DB:", e)
16
+
17
+ def get_db():
18
+ return sqlite3.connect(DB_PATH)
19
+
20
+ def create_db():
21
+ conn = get_db()
22
+ cursor = conn.cursor()
23
+ cursor.execute('''
24
+ CREATE TABLE IF NOT EXISTS model (
25
+ name TEXT UNIQUE,
26
+ upvote INTEGER,
27
+ downvote INTEGER
28
+ );
29
+ ''')
30
+ cursor.execute('''
31
+ CREATE TABLE IF NOT EXISTS vote (
32
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
33
+ username TEXT,
34
+ model TEXT,
35
+ vote INTEGER,
36
+ timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
37
+ );
38
+ ''')
39
+ cursor.execute('''
40
+ CREATE TABLE IF NOT EXISTS votelog (
41
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
42
+ username TEXT,
43
+ chosen TEXT,
44
+ rejected TEXT,
45
+ timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
46
+ );
47
+ ''')
48
+ cursor.execute('''
49
+ CREATE TABLE IF NOT EXISTS spokentext (
50
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
51
+ spokentext TEXT,
52
+ timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
53
+ );
54
+ ''')
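
Note: a sketch of how a vote could be recorded against this schema. The helper below is hypothetical (the real logic lives in app/vote.py, which synth.py imports), but the column names match the CREATE TABLE statements above:

from app.db import get_db

def record_vote(username, chosen, rejected, text):
    # hypothetical illustration of the schema; see app/vote.py for the real implementation
    conn = get_db()
    cursor = conn.cursor()
    cursor.execute('INSERT OR IGNORE INTO model (name, upvote, downvote) VALUES (?, 0, 0)', (chosen,))
    cursor.execute('INSERT OR IGNORE INTO model (name, upvote, downvote) VALUES (?, 0, 0)', (rejected,))
    cursor.execute('UPDATE model SET upvote = upvote + 1 WHERE name = ?', (chosen,))
    cursor.execute('UPDATE model SET downvote = downvote + 1 WHERE name = ?', (rejected,))
    cursor.execute('INSERT INTO votelog (username, chosen, rejected) VALUES (?, ?, ?)', (username, chosen, rejected))
    cursor.execute('INSERT INTO spokentext (spokentext) VALUES (?)', (text,))
    conn.commit()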
app/init.py ADDED
@@ -0,0 +1,30 @@
1
+ from .config import *
2
+ from .db import *
3
+ from huggingface_hub import CommitScheduler
4
+ from pathlib import Path
5
+ from gradio_client import Client
6
+ import os
7
+
8
+
9
+ scheduler = None
10
+
11
+ if SYNC_DB:
12
+ download_db()
13
+ # Sync local DB with remote repo every 5 minutes (only if a change is detected)
14
+ scheduler = CommitScheduler(
15
+ repo_id=DB_DATASET_ID,
16
+ repo_type="dataset",
17
+ folder_path=Path(DB_PATH).parent,
18
+ every=5,
19
+ allow_patterns=DB_NAME,
20
+ )
21
+
22
+ create_db()
23
+
24
+ # Load TTS Router
25
+ router = Client(ROUTER_ID, hf_token=os.getenv('HF_TOKEN'))
26
+
27
+ if TOXICITY_CHECK:
28
+ # Load toxicity model
29
+ from detoxify import Detoxify
30
+ toxicity = Detoxify('original')
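
Note: because the CommitScheduler above snapshots and pushes database.db in the background, DB writes are ideally serialized against an in-flight commit; huggingface_hub exposes a lock on the scheduler for this purpose. A sketch, assuming SYNC_DB is enabled (the helper name is made up):

from app.init import scheduler
from app.db import get_db

def guarded_write(sql, params=()):
    # hypothetical helper: hold the scheduler lock so a background commit
    # never uploads a half-written database file
    conn = get_db()
    if scheduler is not None:
        with scheduler.lock:
            conn.execute(sql, params)
            conn.commit()
    else:
        conn.execute(sql, params)
        conn.commit()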
app/leaderboard.py ADDED
@@ -0,0 +1,91 @@
1
+ from .config import *
2
+ from .db import *
3
+ from .models import *
4
+ from .synth import top_five
5
+
6
+ import pandas as pd
7
+
8
+ # for diff
9
+ leaderboard_df = {}
10
+ def get_leaderboard(reveal_prelim = False):
11
+ global leaderboard_df
12
+
13
+ conn = get_db()
14
+ cursor = conn.cursor()
15
+ sql = 'SELECT name, upvote, downvote, name AS orig_name FROM model'
16
+ if not reveal_prelim: sql += ' WHERE (upvote + downvote) > 300'
17
+ cursor.execute(sql)
18
+ data = cursor.fetchall()
19
+ df = pd.DataFrame(data, columns=['name', 'upvote', 'downvote', 'orig_name'])
20
+ # df['license'] = df['name'].map(model_license)
21
+ df['name'] = df['name'].replace(model_names)
22
+ for i in range(len(df)):
23
+ df.loc[i, "name"] = make_link_to_space(df['name'][i], True)
24
+ df['votes'] = df['upvote'] + df['downvote']
25
+ # df['score'] = round((df['upvote'] / df['votes']) * 100, 2) # Percentage score
26
+
27
+ ## ELO SCORE
28
+ df['score'] = 1200
29
+ df['score_diff'] = ""
30
+ for i in range(len(df)):
31
+ for j in range(len(df)):
32
+ if i != j:
33
+ try:
34
+ expected_a = 1 / (1 + 10 ** ((df['score'].iloc[j] - df['score'].iloc[i]) / 400))
35
+ expected_b = 1 / (1 + 10 ** ((df['score'].iloc[i] - df['score'].iloc[j]) / 400))
36
+ actual_a = df['upvote'].iloc[i] / df['votes'].iloc[i] if df['votes'].iloc[i] > 0 else 0.5
37
+ actual_b = df['upvote'].iloc[j] / df['votes'].iloc[j] if df['votes'].iloc[j] > 0 else 0.5
38
+ df.at[i, 'score'] += round(32 * (actual_a - expected_a))
39
+ df.at[j, 'score'] += round(32 * (actual_b - expected_b))
40
+ except Exception as e:
41
+ print(f"Error in ELO calculation for rows {i} and {j}: {str(e)}")
42
+ continue
43
+ df['score'] = round(df['score'])
44
+ df['score_diff'] = df['score']
45
+
46
+ if (
47
+ reveal_prelim == False
48
+ and len(leaderboard_df) == 0
49
+ ):
50
+ leaderboard_df = df
51
+
52
+ if (reveal_prelim == False):
53
+ for i in range(len(df)):
54
+ score_diff = (df['score'].iloc[i] - leaderboard_df['score'].iloc[i])
55
+ if (score_diff == 0):
56
+ continue
57
+ if (score_diff > 0):
58
+ plus = '<em style="color: green; font-family: monospace">+'
59
+ else:
60
+ plus = '<em style="color: red; font-family: monospace">'
61
+
62
+ df.at[i, 'score_diff'] = str(df['score'].iloc[i]) + plus + str(score_diff) +'</em>'
63
+
64
+ ## ELO SCORE
65
+ df = df.sort_values(by='score', ascending=False)
66
+ # medals
67
+ def assign_medal(rank, assign):
68
+ rank = str(rank + 1)
69
+ if assign:
70
+ if rank == '1':
71
+ rank += '🥇'
72
+ elif rank == '2':
73
+ rank += '🥈'
74
+ elif rank == '3':
75
+ rank += '🥉'
76
+
77
+ return '#'+ rank
78
+
79
+ df['order'] = [assign_medal(i, not reveal_prelim and len(df) > 2) for i in range(len(df))]
80
+ # fetch top_five
81
+ for orig_name in df['orig_name']:
82
+ if (
83
+ reveal_prelim
84
+ and len(top_five) < 5
85
+ and orig_name in AVAILABLE_MODELS.keys()
86
+ ):
87
+ top_five.append(orig_name)
88
+
89
+ df['score'] = df['score_diff']
90
+ df = df[['order', 'name', 'score', 'votes']]
91
+ return df
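
Note: the expected-score expression inside the double loop above is the standard Elo formula on a 400-point scale with K = 32, though it is applied once per ordered pair using aggregate win rates rather than per-match outcomes, so the result is an Elo-style score rather than a true match-by-match Elo. Isolated, the formula is:

def elo_expected(rating_a: float, rating_b: float) -> float:
    # probability that A beats B under the Elo model
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

# two fresh models both start at 1200, so each is expected to win half the time
assert abs(elo_expected(1200, 1200) - 0.5) < 1e-9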
app/messages.py ADDED
@@ -0,0 +1,98 @@
1
+ from .config import *
2
+ from .models import *
3
+
4
+ ############
5
+ # Messages #
6
+ ############
7
+
8
+ MUST_BE_LOGGEDIN = "Please login with Hugging Face to participate in the TTS Arena."
9
+ DESCR = """
10
+ # TTS Arena: Benchmarking TTS Models in the Wild
11
+ Vote to help the community find the best available text-to-speech model!
12
+ """.strip()
13
+ BATTLE_INSTR = """
14
+ ## Battle
15
+ Choose 2 candidates and vote on which one is better! Currently in beta.
16
+ * Input text (English only) to synthesize audio (or press 🎲 for random text).
17
+ * Listen to the two audio clips, one after the other.
18
+ * Vote on which audio sounds more natural to you.
19
+ """
20
+ INSTR = """
21
+ * Press ⚡ to quickly get cached sample pairs you've yet to vote on. (Fast 🐇)
22
+ * Or press 🎲 to randomly use a sentence from the list. (Slow 🐢)
23
+ * Or input text (🇺🇸 English only) to synthesize audio. (Slowest 🐌)
24
+ * _Listen to the two audio clips, one after the other, and then vote on which audio sounds more natural to you._
25
+ * Model names are revealed after the vote is cast.
26
+ ⚠ Note: It **may take up to 30 seconds** to ***synthesize*** audio.
27
+ """.strip()
28
+ request = ""
29
+ if SPACE_ID:
30
+ request = f"""
31
+ ### Request a model
32
+ Please [create a Discussion](https://huggingface.co/spaces/{SPACE_ID}/discussions/new) to request a model.
33
+ """
34
+ ABOUT = f"""
35
+ ## About
36
+ The TTS Arena evaluates leading speech synthesis models. It is inspired by LMsys's [Chatbot Arena](https://chat.lmsys.org/).
37
+ ### Motivation
38
+ The field of speech synthesis has long lacked an accurate method to measure the quality of different models. Objective metrics like WER (word error rate) are unreliable measures of model quality, and subjective measures such as MOS (mean opinion score) are typically small-scale experiments conducted with few listeners. As a result, these measurements are generally not useful for comparing two models of roughly similar quality. To address these drawbacks, we are inviting the community to rank models in an easy-to-use interface, and opening it to the public so that both the ranking process and its results are easily accessible to everyone.
39
+ ### The Arena
40
+ The arena allows a user to enter text, which will be synthesized by two models. After listening to each sample, the user can vote on which model sounds more natural. Due to the risks of human bias and abuse, model names are revealed only after a vote is submitted.
41
+ ### Credits
42
+ Thank you to the following individuals who helped make this project possible:
43
+ * VB ([Twitter](https://twitter.com/reach_vb) / [Hugging Face](https://huggingface.co/reach-vb))
44
+ * Clémentine Fourrier ([Twitter](https://twitter.com/clefourrier) / [Hugging Face](https://huggingface.co/clefourrier))
45
+ * Lucain Pouget ([Twitter](https://twitter.com/Wauplin) / [Hugging Face](https://huggingface.co/Wauplin))
46
+ * Yoach Lacombe ([Twitter](https://twitter.com/yoachlacombe) / [Hugging Face](https://huggingface.co/ylacombe))
47
+ * Main Horse ([Twitter](https://twitter.com/main_horse) / [Hugging Face](https://huggingface.co/main-horse))
48
+ * Sanchit Gandhi ([Twitter](https://twitter.com/sanchitgandhi99) / [Hugging Face](https://huggingface.co/sanchit-gandhi))
49
+ * Apolinário Passos ([Twitter](https://twitter.com/multimodalart) / [Hugging Face](https://huggingface.co/multimodalart))
50
+ * Pedro Cuenca ([Twitter](https://twitter.com/pcuenq) / [Hugging Face](https://huggingface.co/pcuenq))
51
+
52
+ \* ***You are currently in a cloned/forked space of TTS-AGI/TTS-Arena***
53
+
54
+ {request}
55
+ ### Privacy statement
56
+ We may store text you enter and generated audio. We store a unique ID for each session. You agree that we may collect, share, and/or publish any data you input for research and/or commercial purposes.
57
+ ### License
58
+ Generated audio clips cannot be redistributed and may be used for personal, non-commercial use only.
59
+ Random sentences are sourced from a filtered subset of the [Harvard Sentences](https://www.cs.columbia.edu/~hgs/audio/harvard.html).
60
+ """.strip()
61
+ LDESC = """
62
+ Vote to help the community determine the best text-to-speech (TTS) models.
63
+ The leaderboard displays models in descending order of how natural they sound (based on votes cast by the community).
64
+ Important: In order to help keep results fair, the leaderboard hides results by default until the number of votes passes a threshold. Tick the `Reveal preliminary results` checkbox to show models without sufficient votes. Please note that preliminary results may be inaccurate. [This dataset is public](https://huggingface.co/datasets/{DB_DATASET_ID}) and only saves the hardcoded sentences while keeping the voters anonymous.
65
+ """.strip()
66
+
67
+
68
+ model_series = []
69
+ for model in AVAILABLE_MODELS.keys():
70
+ # name up to first whitespace
71
+ model = model.split()[0]
72
+ model_series.append('%27'+ model +'%27')
73
+ try:
74
+ for model in HF_SPACES.values():
75
+ # url encode pluses +
76
+ model_series.append('%27'+ model['series'].replace('+', '%2B') +'%27')
77
+ except:
78
+ pass
79
+
80
+ TTS_INFO = f"""
81
+ ## 🗣 Contenders
82
+
83
+ ### 🔐 Closed Source TTS
84
+ * Microsoft _Edge TTS_
85
+
86
+ ### 🔓 Open Source TTS capabilities table
87
+
88
+ See [the full dataset itself](https://huggingface.co/datasets/Pendrokar/open_tts_tracker) for the legend and more in depth information of each model.
89
+ """.strip()
90
+ TTS_DATASET_IFRAME_ORDER = '%2C+'.join(model_series)
91
+ TTS_DATASET_IFRAME = f"""
92
+ <iframe
93
+ src="https://huggingface.co/datasets/Pendrokar/open_tts_tracker/embed/sql-console/default/train?sql_console=true&sql=--+The+SQL+console+is+powered+by+DuckDB+WASM+and+runs+entirely+in+the+browser.%0A--+Get+started+by+typing+a+query+or+selecting+a+view+from+the+options+below.%0ASELECT+*%2C+%22Name%22+IN+%28{TTS_DATASET_IFRAME_ORDER}%29+AS+%22In+arena%22+FROM+train+WHERE+%22Insta-clone+%F0%9F%91%A5%22+IS+NOT+NULL+ORDER+BY+%22In+arena%22+DESC+LIMIT+50%3B&views%5B%5D=train"
94
+ frameborder="0"
95
+ width="100%"
96
+ height="650px"
97
+ ></iframe>
98
+ """.strip()
app/models.py ADDED
@@ -0,0 +1,469 @@
1
+ import os
2
+ from gradio_client import handle_file
3
+
4
+ # Models to include in the leaderboard, only include models that users can vote on
5
+ AVAILABLE_MODELS = {
6
+ # 'XTTSv2': 'xtts',
7
+ # 'WhisperSpeech': 'whisperspeech',
8
+ # 'ElevenLabs': 'eleven',
9
+ # 'OpenVoice': 'openvoice',
10
+ # 'OpenVoice V2': 'openvoicev2',
11
+ # 'Play.HT 2.0': 'playht',
12
+ # 'Play.HT 3.0 Mini': 'playht3',
13
+ # 'MetaVoice': 'metavoice',
14
+ # 'MeloTTS': 'melo',
15
+ # 'StyleTTS 2': 'styletts2',
16
+ # 'GPT-SoVITS': 'sovits',
17
+ # 'Vokan TTS': 'vokan',
18
+ # 'VoiceCraft 2.0': 'voicecraft',
19
+ # 'Parler TTS': 'parler',
20
+ # 'Parler TTS Large': 'parlerlarge',
21
+ # 'Fish Speech v1.4': 'fish',
22
+
23
+ # HF Gradio Spaces: # <works with gradio version #>
24
+ # gradio version that works with most spaces: 4.29
25
+ # 'coqui/xtts': 'coqui/xtts', # 4.29 4.32
26
+ # 'collabora/WhisperSpeech': 'collabora/WhisperSpeech', # 4.32 4.36.1
27
+ # 'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice', # same devs as MeloTTS, which scores higher # 4.29
28
+ # 'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # same devs as MeloTTS, which scores higher # 4.29
29
+ # 'mrfakename/MetaVoice-1B-v0.1': 'mrfakename/MetaVoice-1B-v0.1', # 4.29 4.32
30
+ 'Pendrokar/xVASynth-TTS': 'Pendrokar/xVASynth-TTS', # 4.29 4.32 4.42.0
31
+ # 'coqui/CoquiTTS': 'coqui/CoquiTTS',
32
+ # 'mrfakename/MeloTTS': 'mrfakename/MeloTTS', # 4.29 4.32
33
+ # 'fishaudio/fish-speech-1': 'fishaudio/fish-speech-1', # 4.29 4.32 4.36.1
34
+
35
+ # E2 & F5 TTS
36
+ # F5 model
37
+ # 'mrfakename/E2-F5-TTS': 'mrfakename/E2-F5-TTS', # 5.0
38
+
39
+ # # Parler
40
+ # Parler Large model
41
+ # 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
42
+ # Parler Mini model
43
+ # 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
44
+ # 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # Mini is the default model of parler_tts
45
+ # 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0
46
+
47
+ # # Microsoft Edge TTS
48
+ # 'innoai/Edge-TTS-Text-to-Speech': 'innoai/Edge-TTS-Text-to-Speech', # 4.29
49
+
50
+ # IMS-Toucan
51
+ # 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1
52
+ # StyleTTS v2
53
+ # 'Pendrokar/style-tts-2': 'Pendrokar/style-tts-2', # more votes in OG arena; emotionless
54
+ # StyleTTS kokoro
55
+ 'hexgrad/kokoro': 'hexgrad/kokoro',
56
+
57
+ # MaskGCT (by Amphion)
58
+ # DEMANDS 300 seconds of ZeroGPU
59
+ # 'amphion/maskgct': 'amphion/maskgct',
60
+ # default ZeroGPU borrow time
61
+ 'Svngoku/maskgct-audio-lab': 'Svngoku/maskgct-audio-lab',
62
+
63
+ # HF TTS w issues
64
+ 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # unresponsive to exclamation marks # 4.29
65
+ # 'PolyAI/pheme': '/predict#0', # sleepy HF Space
66
+ # 'amphion/Text-to-Speech': '/predict#0', # disabled also on original HF space due to poor ratings
67
+ # 'suno/bark': '3#0', # Hallucinates
68
+ # 'shivammehta25/Matcha-TTS': '5#0', # seems to require multiple requests for setup
69
+ # 'styletts2/styletts2': '0#0', # API disabled, awaiting approval of PR #15
70
+ # 'Manmay/tortoise-tts': '/predict#0', # Cannot retrieve streamed file; 403
71
+ # 'pytorch/Tacotron2': '0#0', # old gradio
72
+ }
73
+
74
+ HF_SPACES = {
75
+ # XTTS v2
76
+ 'coqui/xtts': {
77
+ 'name': 'XTTS v2',
78
+ 'function': '1',
79
+ 'text_param_index': 0,
80
+ 'return_audio_index': 1,
81
+ 'series': 'XTTS',
82
+ },
83
+ # WhisperSpeech
84
+ 'collabora/WhisperSpeech': {
85
+ 'name': 'WhisperSpeech',
86
+ 'function': '/whisper_speech_demo',
87
+ 'text_param_index': 0,
88
+ 'return_audio_index': 0,
89
+ 'series': 'WhisperSpeech',
90
+ },
91
+ # OpenVoice (MyShell.ai)
92
+ 'myshell-ai/OpenVoice': {
93
+ 'name':'OpenVoice',
94
+ 'function': '1',
95
+ 'text_param_index': 0,
96
+ 'return_audio_index': 1,
97
+ 'series': 'OpenVoice',
98
+ },
99
+ # OpenVoice v2 (MyShell.ai)
100
+ 'myshell-ai/OpenVoiceV2': {
101
+ 'name':'OpenVoice v2',
102
+ 'function': '1',
103
+ 'text_param_index': 0,
104
+ 'return_audio_index': 1,
105
+ 'series': 'OpenVoice',
106
+ },
107
+ # MetaVoice
108
+ 'mrfakename/MetaVoice-1B-v0.1': {
109
+ 'name':'MetaVoice-1B',
110
+ 'function': '/tts',
111
+ 'text_param_index': 0,
112
+ 'return_audio_index': 0,
113
+ 'series': 'MetaVoice-1B',
114
+ },
115
+ # xVASynth (CPU)
116
+ 'Pendrokar/xVASynth-TTS': {
117
+ 'name': 'xVASynth v3',
118
+ 'function': '/predict',
119
+ 'text_param_index': 0,
120
+ 'return_audio_index': 0,
121
+ 'series': 'xVASynth',
122
+ },
123
+ # CoquiTTS (CPU)
124
+ 'coqui/CoquiTTS': {
125
+ 'name': 'CoquiTTS',
126
+ 'function': '0',
127
+ 'text_param_index': 0,
128
+ 'return_audio_index': 0,
129
+ 'series': 'CoquiTTS',
130
+ },
131
+ # HierSpeech_TTS
132
+ 'LeeSangHoon/HierSpeech_TTS': {
133
+ 'name': 'HierSpeech++',
134
+ 'function': '/predict',
135
+ 'text_param_index': 0,
136
+ 'return_audio_index': 0,
137
+ 'series': 'HierSpeech++',
138
+ },
139
+ # MeloTTS (MyShell.ai)
140
+ 'mrfakename/MeloTTS': {
141
+ 'name': 'MeloTTS',
142
+ 'function': '/synthesize',
143
+ 'text_param_index': 0,
144
+ 'return_audio_index': 0,
145
+ 'series': 'MeloTTS',
146
+ },
147
+
148
+ # Parler
149
+ 'parler-tts/parler_tts': {
150
+ 'name': 'Parler Mini',
151
+ 'function': '/gen_tts',
152
+ 'text_param_index': 0,
153
+ 'return_audio_index': 0,
154
+ 'is_zero_gpu_space': True,
155
+ 'series': 'Parler',
156
+ },
157
+ # Parler Large (same Space as Parler Mini above, so it stays disabled)
158
+ # 'parler-tts/parler_tts': {
159
+ # 'name': 'Parler Large',
160
+ # 'function': '/gen_tts',
161
+ # 'text_param_index': 0,
162
+ # 'return_audio_index': 0,
163
+ # 'is_zero_gpu_space': True,
164
+ # 'series': 'Parler',
165
+ # },
166
+ # Parler Mini fine-tuned on the Expresso dataset
167
+ 'parler-tts/parler-tts-expresso': {
168
+ 'name': 'Parler Mini Expresso',
169
+ 'function': '/gen_tts',
170
+ 'text_param_index': 0,
171
+ 'return_audio_index': 0,
172
+ 'is_zero_gpu_space': True,
173
+ 'series': 'Parler',
174
+ },
175
+
176
+ # Microsoft Edge TTS
177
+ 'innoai/Edge-TTS-Text-to-Speech': {
178
+ 'name': 'Edge TTS',
179
+ 'function': '/predict',
180
+ 'text_param_index': 0,
181
+ 'return_audio_index': 0,
182
+ 'is_closed_source': True,
183
+ 'series': 'Edge TTS',
184
+ },
185
+
186
+ # Fish Speech
187
+ 'fishaudio/fish-speech-1': {
188
+ 'name': 'Fish Speech',
189
+ 'function': '/inference_wrapper',
190
+ 'text_param_index': 0,
191
+ 'return_audio_index': 1,
192
+ 'series': 'Fish Speech',
193
+ },
194
+
195
+ # E2/F5 TTS
196
+ 'mrfakename/E2-F5-TTS': {
197
+ 'name': 'F5 of E2 TTS',
198
+ 'function': '/infer',
199
+ 'text_param_index': 2,
200
+ 'return_audio_index': 0,
201
+ 'is_zero_gpu_space': True,
202
+ 'series': 'E2/F5 TTS',
203
+ },
204
+
205
+ # IMS-Toucan
206
+ 'Flux9665/MassivelyMultilingualTTS': {
207
+ 'name': 'IMS-Toucan',
208
+ 'function': "/predict",
209
+ 'text_param_index': 0,
210
+ 'return_audio_index': 0,
211
+ 'series': 'IMS-Toucan',
212
+ },
213
+
214
+ # IMS-Toucan English non-artificial
215
+ 'Flux9665/EnglishToucan': {
216
+ 'name': 'IMS-Toucan EN',
217
+ 'function': "/predict",
218
+ 'text_param_index': 0,
219
+ 'return_audio_index': 0,
220
+ 'series': 'IMS-Toucan',
221
+ },
222
+
223
+ # StyleTTS v2
224
+ 'Pendrokar/style-tts-2': {
225
+ 'name': 'StyleTTS v2',
226
+ 'function': '/synthesize',
227
+ 'text_param_index': 0,
228
+ 'return_audio_index': 0,
229
+ 'is_zero_gpu_space': True,
230
+ 'series': 'StyleTTS',
231
+ },
232
+
233
+ # StyleTTS v2 kokoro fine tune
234
+ 'hexgrad/kokoro': {
235
+ 'name': 'StyleTTS Kokoro',
236
+ 'function': '/generate',
237
+ 'text_param_index': 0,
238
+ 'return_audio_index': 0,
239
+ 'is_zero_gpu_space': True,
240
+ 'series': 'StyleTTS',
241
+ },
242
+
243
+ # MaskGCT (by Amphion)
244
+ 'amphion/maskgct': {
245
+ 'name': 'MaskGCT',
246
+ 'function': '/predict',
247
+ 'text_param_index': 1,
248
+ 'return_audio_index': 0,
249
+ 'is_zero_gpu_space': True,
250
+ 'series': 'MaskGCT',
251
+ },
252
+ 'Svngoku/maskgct-audio-lab': {
253
+ 'name': 'MaskGCT',
254
+ 'function': '/predict',
255
+ 'text_param_index': 1,
256
+ 'return_audio_index': 0,
257
+ 'is_zero_gpu_space': True,
258
+ 'series': 'MaskGCT',
259
+ },
260
+ }
261
+
262
+ # for zero-shot TTS - voice sample used by XTTS (11 seconds)
263
+ DEFAULT_VOICE_SAMPLE_STR = 'https://cdn-uploads.huggingface.co/production/uploads/63d52e0c4e5642795617f668/V6-rMmI-P59DA4leWDIcK.wav'
264
+ DEFAULT_VOICE_SAMPLE = handle_file(DEFAULT_VOICE_SAMPLE_STR)
265
+ DEFAULT_VOICE_TRANSCRIPT = "The Hispaniola was rolling scuppers under in the ocean swell. The booms were tearing at the blocks, the rudder was banging to and fro, and the whole ship creaking, groaning, and jumping like a manufactory."
266
+
267
+ OVERRIDE_INPUTS = {
268
+ 'coqui/xtts': {
269
+ 1: 'en',
270
+ 2: DEFAULT_VOICE_SAMPLE_STR, # voice sample
271
+ 3: None, # mic voice sample
272
+ 4: False, #use_mic
273
+ 5: False, #cleanup_reference
274
+ 6: False, #auto_detect
275
+ },
276
+ 'collabora/WhisperSpeech': {
277
+ 1: DEFAULT_VOICE_SAMPLE, # voice sample
278
+ 2: DEFAULT_VOICE_SAMPLE, # voice sample URL
279
+ 3: 14.0, #Tempo - Gradio Slider issue: takes min. rather than value
280
+ },
281
+ 'myshell-ai/OpenVoice': {
282
+ 1: 'default', # style
283
+ 2: 'https://huggingface.co/spaces/myshell-ai/OpenVoiceV2/resolve/main/examples/speaker0.mp3', # voice sample
284
+ },
285
+ 'myshell-ai/OpenVoiceV2': {
286
+ 1: 'en_us', # style
287
+ 2: 'https://huggingface.co/spaces/myshell-ai/OpenVoiceV2/resolve/main/examples/speaker0.mp3', # voice sample
288
+ },
289
+ 'PolyAI/pheme': {
290
+ 1: 'YOU1000000044_S0000798', # voice
291
+ 2: 210,
292
+ 3: 0.7, #Tempo - Gradio Slider issue: takes min. rather than value
293
+ },
294
+ 'Pendrokar/xVASynth-TTS': {
295
+ 1: 'x_ex04', #fine-tuned voice model name
296
+ 3: 1.0, #pacing/duration - Gradio Slider issue: takes min. rather than value
297
+ },
298
+ 'suno/bark': {
299
+ 1: 'Speaker 3 (en)', # voice
300
+ },
301
+ 'amphion/Text-to-Speech': {
302
+ 1: 'LikeManyWaters', # voice
303
+ },
304
+ 'LeeSangHoon/HierSpeech_TTS': {
305
+ 1: handle_file('https://huggingface.co/spaces/LeeSangHoon/HierSpeech_TTS/resolve/main/example/female.wav'), # voice sample
306
+ 2: 0.333,
307
+ 3: 0.333,
308
+ 4: 1,
309
+ 5: 1,
310
+ 6: 0,
311
+ 7: 1111,
312
+ },
313
+ 'Manmay/tortoise-tts': {
314
+ 1: None, # text-from-file
315
+ 2: 'angie', # voice
316
+ 3: 'disabled', # second voice for a dialogue
317
+ 4: 'No', # split by newline
318
+ },
319
+ 'mrfakename/MeloTTS': {
320
+ 1: 'EN-Default', # speaker; DEFAULT_VOICE_SAMPLE=EN-Default
321
+ 2: 1, # speed
322
+ 3: 'EN', # language
323
+ },
324
+ 'mrfakename/MetaVoice-1B-v0.1': {
325
+ 1: 5, # float (numeric value between 0.0 and 10.0) in 'Speech Stability - improves text following for a challenging speaker' Slider component
326
+ 2: 5, # float (numeric value between 1.0 and 5.0) in 'Speaker similarity - How closely to match speaker identity and speech style.' Slider component
327
+ 3: "Preset voices", # Literal['Preset voices', 'Upload target voice'] in 'Choose voice' Radio component
328
+ 4: "Bria", # Literal['Bria', 'Alex', 'Jacob'] in 'Preset voices' Dropdown component
329
+ 5: None, # filepath in 'Upload a clean sample to clone. Sample should contain 1 speaker, be between 30-90 seconds and not contain background noise.' Audio component
330
+ },
331
+ 'parler-tts/parler_tts': {
332
+ 1: 'Laura; Laura\'s female voice; very clear audio', # description/prompt
333
+ },
334
+ 'parler-tts/parler-tts-expresso': {
335
+ 1: 'Elisabeth; Elisabeth\'s female voice; very clear audio', # description/prompt
336
+ },
337
+ 'innoai/Edge-TTS-Text-to-Speech': {
338
+ 1: 'en-US-EmmaMultilingualNeural - en-US (Female)', # voice
339
+ 2: 0, # pace rate
340
+ 3: 0, # pitch
341
+ },
342
+
343
+ 'fishaudio/fish-speech-1': {
344
+ 1: True, # enable_reference_audio
345
+ 2: handle_file('https://huggingface.co/spaces/fishaudio/fish-speech-1/resolve/main/examples/English.wav'), # reference_audio
346
+ 3: 'In the ancient land of Eldoria, where the skies were painted with shades of mystic hues and the forests whispered secrets of old, there existed a dragon named Zephyros. Unlike the fearsome tales of dragons that plagued human hearts with terror, Zephyros was a creature of wonder and wisdom, revered by all who knew of his existence.', # reference_text
347
+ 4: 0, # max_new_tokens
348
+ 5: 200, # chunk_length
349
+ 6: 0.7, # top_p
350
+ 7: 1.2, # repetition_penalty
351
+ 8: 0.7, # temperature
352
+ 9: 1, # batch_infer_num
353
+ 10: False, # if_load_asr_model
354
+ },
355
+
356
+ 'mrfakename/E2-F5-TTS': {
357
+ 0: DEFAULT_VOICE_SAMPLE, # voice sample
358
+ 1: DEFAULT_VOICE_TRANSCRIPT, # transcript of sample (< 15 seconds required)
359
+ 3: False, # cleanup silence
360
+ 4: 0.15, #crossfade
361
+ 5: 32, #nfe_slider
362
+ 6: 1, #speed
363
+ },
364
+
365
+ # IMS-Toucan
366
+ 'Flux9665/MassivelyMultilingualTTS': {
367
+ 1: "English (eng)", #language
368
+ 2: 0.6, #prosody_creativity
369
+ 3: 1, #duration_scaling_factor
370
+ 4: 41, #voice_seed
371
+ 5: -7.5, #emb1
372
+ 6: None, #reference_audio
373
+ },
374
+
375
+ # StyleTTS 2
376
+ 'Pendrokar/style-tts-2': {
377
+ 1: "f-us-2", #voice
378
+ 2: 'en-us', # lang
379
+ 3: 8, # lngsteps
380
+ },
381
+
382
+ # StyleTTS 2 kokoro
383
+ 'hexgrad/kokoro': {
384
+ 1: "af", #voice
385
+ 2: None, #ps
386
+ 3: 1, #speed
387
+ 4: 3000, #trim
388
+ 5: False, #use_gpu; fast enough when multithreaded on CPU
389
+ 6: os.getenv('KOKORO'), #sk
390
+ },
391
+
392
+ # maskGCT (by amphion)
393
+ 'amphion/maskgct': {
394
+ 0: DEFAULT_VOICE_SAMPLE, #prompt_wav
395
+ 2: -1, #target_len
396
+ 3: 25, #n_timesteps
397
+ },
398
+ 'Svngoku/maskgct-audio-lab': {
399
+ 0: DEFAULT_VOICE_SAMPLE, #prompt_wav
400
+ 2: -1, #target_len
401
+ 3: 25, #n_timesteps
402
+ },
403
+ }
404
+
405
+
406
+ # Model name mapping, can include models that users cannot vote on
407
+ model_names = {
408
+ 'styletts2': 'StyleTTS 2',
409
+ 'tacotron': 'Tacotron',
410
+ 'tacotronph': 'Tacotron Phoneme',
411
+ 'tacotrondca': 'Tacotron DCA',
412
+ 'speedyspeech': 'Speedy Speech',
413
+ 'overflow': 'Overflow TTS',
414
+ 'vits': 'VITS',
415
+ 'vitsneon': 'VITS Neon',
416
+ 'neuralhmm': 'Neural HMM',
417
+ 'glow': 'Glow TTS',
418
+ 'fastpitch': 'FastPitch',
419
+ 'jenny': 'Jenny',
420
+ 'tortoise': 'Tortoise TTS',
421
+ 'xtts2': 'Coqui XTTSv2',
422
+ 'xtts': 'Coqui XTTS',
423
+ 'openvoice': 'MyShell OpenVoice',
424
+ 'elevenlabs': 'ElevenLabs',
425
+ 'openai': 'OpenAI',
426
+ 'hierspeech': 'HierSpeech++',
427
+ 'pheme': 'PolyAI Pheme',
428
+ 'speecht5': 'SpeechT5',
429
+ 'metavoice': 'MetaVoice-1B',
430
+ }
431
+
432
+ def make_link_to_space(model_name, for_leaderboard=False):
433
+ # create an anchor link if an HF Space
434
+ style = 'text-decoration: underline;text-decoration-style: dotted;'
435
+ title = ''
436
+
437
+ if model_name in AVAILABLE_MODELS:
438
+ style += 'color: var(--link-text-color);'
439
+ title = model_name
440
+ else:
441
+ style += 'font-style: italic;'
442
+ title = 'Disabled for Arena (See AVAILABLE_MODELS within code for why)'
443
+
444
+ model_basename = model_name
445
+ if model_name in HF_SPACES:
446
+ model_basename = HF_SPACES[model_name]['name']
447
+
448
+ try:
449
+ if(
450
+ for_leaderboard
451
+ and HF_SPACES[model_name]['is_closed_source']
452
+ ):
453
+ model_basename += ' 🔐'
454
+ title += '; 🔐 = online only or proprietary'
455
+ except:
456
+ pass
457
+
458
+ if '/' in model_name:
459
+ return '🤗 <a target="_blank" style="'+ style +'" title="'+ title +'" href="'+ 'https://huggingface.co/spaces/'+ model_name +'">'+ model_basename +'</a>'
460
+
461
+ # otherwise just return the model name
462
+ return '<span style="'+ style +'" title="'+ title +'" href="'+ 'https://huggingface.co/spaces/'+ model_name +'">'+ model_name +'</span>'
463
+
464
+ def markdown_link_to_space(model_name):
465
+ # create an anchor link if an HF Space, using markdown syntax
466
+ if '/' in model_name:
467
+ return '🤗 [' + model_name + '](https://huggingface.co/spaces/' + model_name + ')'
468
+ # otherwise just return the model name
469
+ return model_name
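
Note: enabling a new contender touches up to three structures in this module: AVAILABLE_MODELS (makes it votable), HF_SPACES (endpoint metadata used by synth.py), and optionally OVERRIDE_INPUTS (pins every non-text parameter). A hypothetical entry — the Space ID, endpoint name, and indices below are made up and must match what Client.view_api() reports for the real Space:

# hypothetical Space; values must mirror the Space's actual API
AVAILABLE_MODELS['someuser/some-tts'] = 'someuser/some-tts'
HF_SPACES['someuser/some-tts'] = {
    'name': 'Some TTS',
    'function': '/synthesize',   # leading '/' marks a named endpoint
    'text_param_index': 0,       # positional parameter that receives the text
    'return_audio_index': 0,     # return value that holds the audio filepath
    'series': 'Some TTS',
}
OVERRIDE_INPUTS['someuser/some-tts'] = {
    1: 'default-voice',          # pin each non-text input to a fixed value
}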
app/synth.py ADDED
@@ -0,0 +1,421 @@
1
+ import time
2
+ from .models import *
3
+ from .utils import *
4
+ from .config import *
5
+ from .init import *
6
+ from .sample_caching import *
7
+
8
+ import gradio as gr
9
+ from pydub import AudioSegment
10
+ import random, os, threading, tempfile
11
+ from langdetect import detect
12
+ from .vote import log_text
13
+
14
+ # shortlist of up to five models so that one of them is always picked and scrutinized
15
+ top_five = ['fishaudio/fish-speech-1'] # fish 1.5
16
+ hf_token=os.getenv('HF_TOKEN')
17
+
18
+ # prioritize low vote models
19
+ sql = 'SELECT name FROM model WHERE (upvote + downvote) < 750 ORDER BY (upvote + downvote) ASC'
20
+ conn = get_db()
21
+ cursor = conn.cursor()
22
+ cursor.execute(sql)
23
+ data = cursor.fetchall()
24
+ for model in data:
25
+ if (
26
+ len(top_five) >= 5
27
+ ):
28
+ break
29
+
30
+ if model[0] in AVAILABLE_MODELS.keys():
31
+ top_five.append(model[0])
32
+ print(f"low vote top_five: {top_five}")
33
+
34
+ def random_m():
35
+ return random.sample(list(set(AVAILABLE_MODELS.keys())), 2)
36
+
37
+ def check_toxicity(text):
38
+ if not TOXICITY_CHECK:
39
+ return False
40
+ return toxicity.predict(text)['toxicity'] > 0.8
41
+
42
+ def synthandreturn(text, autoplay, request: gr.Request):
43
+ text = text.strip()
44
+ if len(text) > MAX_SAMPLE_TXT_LENGTH:
45
+ raise gr.Error(f'You exceeded the limit of {MAX_SAMPLE_TXT_LENGTH} characters')
46
+ if len(text) < MIN_SAMPLE_TXT_LENGTH:
47
+ raise gr.Error(f'Please input text longer than {MIN_SAMPLE_TXT_LENGTH} characters')
48
+ if (
49
+ # test toxicity if not prepared text
50
+ text not in sents
51
+ and check_toxicity(text)
52
+ ):
53
+ print(f'Detected toxic content! "{text}"')
54
+ raise gr.Error('Your text failed the toxicity test')
55
+ if not text:
56
+ raise gr.Error('You did not enter any text')
57
+ # Check language
58
+ try:
59
+ if (
60
+ text not in sents
61
+ and not detect(text) == "en"
62
+ ):
63
+ gr.Warning('Warning: The input text may not be in English')
64
+ except:
65
+ pass
66
+ # Get two random models
67
+
68
+ # forced model: your TTS model versus The World!!!
69
+ # mdl1 = 'Pendrokar/xVASynth'
70
+
71
+ # scrutinize the top five by always picking one of them
72
+ if (len(top_five) >= 5):
73
+ mdl1 = random.sample(top_five, 1)[0]
74
+ vsModels = dict(AVAILABLE_MODELS)
75
+ del vsModels[mdl1]
76
+ # randomize position of the forced model
77
+ mdl2 = random.sample(list(vsModels.keys()), 1)
78
+ # forced random
79
+ mdl1, mdl2 = random.sample(list([mdl1, mdl2[0]]), 2)
80
+ else:
81
+ # actual random
82
+ mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
83
+
84
+ print("[debug] Using", mdl1, mdl2)
85
+ def predict_and_update_result(text, model, result_storage, request:gr.Request):
86
+
87
+ hf_headers = {}
88
+ try:
89
+ if HF_SPACES[model]['is_zero_gpu_space']:
90
+ hf_headers = {"X-IP-Token": request.headers['x-ip-token']}
91
+ except:
92
+ pass
93
+
94
+ # re-attempt if necessary
95
+ attempt_count = 0
96
+ max_attempts = 1 # 3 may cause 429 Too Many Requests
97
+ while attempt_count < max_attempts:
98
+ try:
99
+ if model in AVAILABLE_MODELS:
100
+ if '/' in model:
101
+ # Use public HF Space
102
+ # if (model not in hf_clients):
103
+ # hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
104
+ mdl_space = Client(model, hf_token=hf_token, headers=hf_headers)
105
+
106
+ # print(f"{model}: Fetching endpoints of HF Space")
107
+ # assume the index is one of the first 9 return params
108
+ return_audio_index = int(HF_SPACES[model]['return_audio_index'])
109
+ endpoints = mdl_space.view_api(all_endpoints=True, print_info=False, return_format='dict')
110
+
111
+ api_name = None
112
+ fn_index = None
113
+ end_parameters = None
114
+ # has named endpoint
115
+ if '/' == HF_SPACES[model]['function'][0]:
116
+ # audio sync function name
117
+ api_name = HF_SPACES[model]['function']
118
+
119
+ end_parameters = _get_param_examples(
120
+ endpoints['named_endpoints'][api_name]['parameters']
121
+ )
122
+ # has unnamed endpoint
123
+ else:
124
+ # unnamed endpoint: the 'function' value is its numeric index
125
+ fn_index = int(HF_SPACES[model]['function'])
126
+
127
+ end_parameters = _get_param_examples(
128
+ endpoints['unnamed_endpoints'][str(fn_index)]['parameters']
129
+ )
130
+
131
+ # override some or all default parameters
132
+ space_inputs = _override_params(end_parameters, model)
133
+
134
+ # force text
135
+ space_inputs[HF_SPACES[model]['text_param_index']] = text
136
+
137
+ print(f"{model}: Sending request to HF Space")
138
+ results = mdl_space.predict(*space_inputs, api_name=api_name, fn_index=fn_index)
139
+
140
+ # return path to audio
141
+ result = results
142
+ if (not isinstance(results, str)):
143
+ # results is a tuple of outputs; pick the audio filepath by index
144
+ result = results[return_audio_index]
145
+ if (isinstance(result, dict)):
146
+ # some Spaces wrap the filepath in a dict under the 'value' key
147
+ result = results[return_audio_index]['value']
148
+ else:
149
+ # Use the private HF Space
150
+ result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
151
+ else:
152
+ result = router.predict(text, model.lower(), api_name="/synthesize")
153
+ break
154
+ except Exception as e:
155
+ attempt_count += 1
156
+ raise gr.Error(f"{model}:"+ repr(e))
157
+ # print(f"{model}: Unable to call API (attempt: {attempt_count})")
158
+ # sleep for three seconds to avoid spamming the server with requests
159
+ # time.sleep(3)
160
+
161
+ # Fetch and store client again
162
+ # hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
163
+
164
+ if attempt_count >= max_attempts:
165
+ raise gr.Error(f"{model}: Failed to call model")
166
+ else:
167
+ print('Done with', model)
168
+
169
+ try:
170
+ with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
171
+ audio = AudioSegment.from_file(result)
172
+ current_sr = audio.frame_rate
173
+ if current_sr > 24000:
174
+ print(f"{model}: Resampling")
175
+ audio = audio.set_frame_rate(24000)
176
+ try:
177
+ print(f"{model}: Trying to normalize audio")
178
+ audio = match_target_amplitude(audio, -20)
179
+ except:
180
+ print(f"{model}: [WARN] Unable to normalize audio")
181
+ audio.export(f.name, format="wav")
182
+ os.unlink(result)
183
+ result = f.name
184
+ gr.Info('Audio from a TTS model received')
185
+ except:
186
+ print(f"{model}: [WARN] Unable to resample audio")
187
+ pass
188
+ if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
189
+ result_storage[model] = result
190
+
191
+ def _get_param_examples(parameters):
192
+ example_inputs = []
193
+ for param_info in parameters:
194
+ if (
195
+ param_info['component'] == 'Radio'
196
+ or param_info['component'] == 'Dropdown'
197
+ or param_info['component'] == 'Audio'
198
+ or param_info['python_type']['type'] == 'str'
199
+ ):
200
+ example_inputs.append(str(param_info['example_input']))
201
+ continue
202
+ if param_info['python_type']['type'] == 'int':
203
+ example_inputs.append(int(param_info['example_input']))
204
+ continue
205
+ if param_info['python_type']['type'] == 'float':
206
+ example_inputs.append(float(param_info['example_input']))
207
+ continue
208
+ if param_info['python_type']['type'] == 'bool':
209
+ example_inputs.append(bool(param_info['example_input']))
210
+ continue
211
+
212
+ return example_inputs
213
+
214
+ def _override_params(inputs, modelname):
215
+ try:
216
+ for key,value in OVERRIDE_INPUTS[modelname].items():
217
+ inputs[key] = value
218
+ print(f"{modelname}: Default inputs overridden by Arena")
219
+ except:
220
+ pass
221
+
222
+ return inputs
223
+
224
+ def _cache_sample(text, model):
225
+ # skip caching if not hardcoded sentence
226
+ if (text not in sents):
227
+ return False
228
+
229
+ # skip if an identical sample is already cached;
230
+ # the early return means each (text, model) pair is cached at most once
231
+ for cached_sample in cached_samples:
232
+ # TODO: replace the cached sample with a newer version instead
233
+ if (cached_sample.transcript == text and cached_sample.modelName == model):
234
+ return True
235
+
236
+
237
+
238
+ # not cached yet; cache the freshly synthesized sample
239
+
240
+ try:
241
+ cached_samples.append(Sample(results[model], text, model))
242
+ except:
243
+ print('Error when trying to cache sample')
244
+ return False
245
+
246
+ mdl1k = mdl1
247
+ mdl2k = mdl2
248
+ print(mdl1k, mdl2k)
249
+ if mdl1 in AVAILABLE_MODELS.keys(): mdl1k=AVAILABLE_MODELS[mdl1]
250
+ if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
251
+ results = {}
252
+ print(f"Sending models {mdl1k} and {mdl2k} to API")
253
+
254
+ # do not use multithreading when both spaces are ZeroGPU type
255
+ if (
256
+ # exists
257
+ 'is_zero_gpu_space' in HF_SPACES[mdl1]
258
+ # is True
259
+ and HF_SPACES[mdl1]['is_zero_gpu_space']
260
+ and 'is_zero_gpu_space' in HF_SPACES[mdl2]
261
+ and HF_SPACES[mdl2]['is_zero_gpu_space']
262
+ ):
263
+ # run Zero-GPU spaces one at a time
264
+ predict_and_update_result(text, mdl1k, results, request)
265
+ _cache_sample(text, mdl1k)
266
+
267
+ predict_and_update_result(text, mdl2k, results, request)
268
+ _cache_sample(text, mdl2k)
269
+ else:
270
+ # use multithreading
271
+ thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results, request))
272
+ thread2 = threading.Thread(target=predict_and_update_result, args=(text, mdl2k, results, request))
273
+
274
+ thread1.start()
275
+ # stagger the second request by 3 seconds to avoid hammering the hf.space domain
276
+ time.sleep(3)
277
+ thread2.start()
278
+ # timeout in 2 minutes
279
+ thread1.join(120)
280
+ thread2.join(120)
281
+
282
+ # cache the result
283
+ for model in [mdl1k, mdl2k]:
284
+ _cache_sample(text, model)
285
+
286
+ print(f"Retrieving models {mdl1k} and {mdl2k} from API")
287
+ return (
288
+ text,
289
+ "Synthesize",
290
+ gr.update(visible=True), # r2
291
+ mdl1, # model1
292
+ mdl2, # model2
293
+ gr.update(visible=True, value=results[mdl1k], autoplay=autoplay), # aud1
294
+ gr.update(visible=True, value=results[mdl2k], autoplay=False), # aud2
295
+ gr.update(visible=True, interactive=False), #abetter
296
+ gr.update(visible=True, interactive=False), #bbetter
297
+ gr.update(visible=False), #prevmodel1
298
+ gr.update(visible=False), #prevmodel2
299
+ gr.update(visible=False), #nxt round btn
300
+ # reset gr.State aplayed & bplayed
301
+ False, #aplayed
302
+ False, #bplayed
303
+ )
304
+
305
+ # Battle Mode
306
+
307
+ def synthandreturn_battle(text, mdl1, mdl2, autoplay):
308
+ if mdl1 == mdl2:
309
+ raise gr.Error('You can\'t pick two of the same models.')
310
+ text = text.strip()
311
+ if len(text) > MAX_SAMPLE_TXT_LENGTH:
312
+ raise gr.Error(f'You exceeded the limit of {MAX_SAMPLE_TXT_LENGTH} characters')
313
+ if len(text) < MIN_SAMPLE_TXT_LENGTH:
314
+ raise gr.Error(f'Please input text longer than {MIN_SAMPLE_TXT_LENGTH} characters')
315
+ if (
316
+ # test toxicity if not prepared text
317
+ text not in sents
318
+ and check_toxicity(text)
319
+ ):
320
+ print(f'Detected toxic content! "{text}"')
321
+ raise gr.Error('Your text failed the toxicity test')
322
+ if not text:
323
+ raise gr.Error('You did not enter any text')
324
+ # Check language
325
+ try:
326
+ if not detect(text) == "en":
327
+ gr.Warning('Warning: The input text may not be in English')
328
+ except:
329
+ pass
330
+ # in Battle Mode the two models are picked by the user, not at random
331
+ log_text(text)
332
+ print("[debug] Using", mdl1, mdl2)
333
+ def predict_and_update_result(text, model, result_storage):
334
+ try:
335
+ if model in AVAILABLE_MODELS:
336
+ result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
337
+ else:
338
+ result = router.predict(text, model.lower(), api_name="/synthesize")
339
+ except:
340
+ raise gr.Error('Unable to call API, please try again :)')
341
+ print('Done with', model)
342
+ # try:
343
+ # doresample(result)
344
+ # except:
345
+ # pass
346
+ try:
347
+ with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
348
+ audio = AudioSegment.from_file(result)
349
+ current_sr = audio.frame_rate
350
+ if current_sr > 24000:
351
+ audio = audio.set_frame_rate(24000)
352
+ try:
353
+ print('Trying to normalize audio')
354
+ audio = match_target_amplitude(audio, -20)
355
+ except:
356
+ print('[WARN] Unable to normalize audio')
357
+ audio.export(f.name, format="wav")
358
+ os.unlink(result)
359
+ result = f.name
360
+ except:
361
+ pass
362
+ if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
363
+ print(model)
364
+ print(f"Running model {model}")
365
+ result_storage[model] = result
366
+ # try:
367
+ # doloudnorm(result)
368
+ # except:
369
+ # pass
370
+ mdl1k = mdl1
371
+ mdl2k = mdl2
372
+ print(mdl1k, mdl2k)
373
+ if mdl1 in AVAILABLE_MODELS.keys(): mdl1k=AVAILABLE_MODELS[mdl1]
374
+ if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
375
+ results = {}
376
+ print(f"Sending models {mdl1k} and {mdl2k} to API")
377
+ thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results))
378
+ thread2 = threading.Thread(target=predict_and_update_result, args=(text, mdl2k, results))
379
+
380
+ thread1.start()
381
+ thread2.start()
382
+ thread1.join()
383
+ thread2.join()
384
+
385
+ print(f"Retrieving models {mdl1k} and {mdl2k} from API")
386
+ return (
387
+ text,
388
+ "Synthesize",
389
+ gr.update(visible=True), # r2
390
+ mdl1, # model1
391
+ mdl2, # model2
392
+ gr.update(visible=True, value=results[mdl1k], autoplay=autoplay), # aud1
393
+ gr.update(visible=True, value=results[mdl2k], autoplay=False), # aud2
394
+ gr.update(visible=True, interactive=False), #abetter
395
+ gr.update(visible=True, interactive=False), #bbetter
396
+ gr.update(visible=False), #prevmodel1
397
+ gr.update(visible=False), #prevmodel2
398
+ gr.update(visible=False), #nxt round btn
399
+ )
400
+
401
+ def randomsent():
402
+ return '⚡', random.choice(sents), '🎲'
403
+ def randomsent_battle():
404
+ return tuple(randomsent()) + tuple(random_m())
405
+ def clear_stuff():
406
+ return [
407
+ gr.update(visible=True, value="", elem_classes=[]),
408
+ "Synthesize",
409
+ gr.update(visible=False), # r2
410
+ '', # model1
411
+ '', # model2
412
+ gr.update(visible=False, interactive=False, autoplay=False), # aud1
413
+ gr.update(visible=False, interactive=False, autoplay=False), # aud2
414
+ gr.update(visible=False, interactive=False), #abetter
415
+ gr.update(visible=False, interactive=False), #bbetter
416
+ gr.update(visible=False), #prevmodel1
417
+ gr.update(visible=False), #prevmodel2
418
+ gr.update(visible=False), #nxt round btn
419
+ False, #aplayed
420
+ False, #bplayed
421
+ ]
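
The dispatch above is a plain "two workers, one shared dict" pattern, serialized only when both targets are ZeroGPU Spaces. A minimal, self-contained sketch of the threaded branch — `fetch` here is a hypothetical stand-in for `predict_and_update_result`, not the app's API:

```python
import threading
import time

def fetch(text: str, model: str, results: dict) -> None:
    # stand-in worker; the real one calls the TTS Space and post-processes audio
    time.sleep(0.1)
    results[model] = f"/tmp/{model}.wav"

results = {}
threads = [
    threading.Thread(target=fetch, args=("Hello there!", m, results))
    for m in ("model_a", "model_b")
]
for t in threads:
    t.start()
for t in threads:
    t.join(timeout=120)  # bound the wait so a stalled Space cannot hang the UI

# Writes to distinct dict keys are atomic under CPython's GIL, so no lock is needed.
print(results)
```

Note that `Thread.join(timeout=...)` returns even if the worker is still running, so the caller must tolerate a missing key in `results` — the same reason `results[mdl1k]` in the return tuple can raise a `KeyError` when a Space times out.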
app/ui.py ADDED
@@ -0,0 +1,52 @@
+ import gradio as gr
+ from .config import *
+ from .messages import *
+ from .ui_vote import *
+ from .ui_leaderboard import *
+ from .ui_contenders import *
+
+ # JavaScript within the HTML head
+ head_js = ""
+ shortcut_js = """
+ <script>
+ function shortcuts(e) {
+     var event = document.all ? window.event : e;
+     switch (e.target.tagName.toLowerCase()) {
+         case "input":
+         case "textarea":
+             break;
+         default:
+             switch (e.key.toLowerCase()) {
+                 case "a":
+                     document.getElementById("arena-a-better").click();
+                     break;
+                 case "b":
+                     document.getElementById("arena-b-better").click();
+                     break;
+                 case "n":
+                     document.getElementById("arena-next-round").click();
+                     break;
+             }
+     }
+ }
+ document.addEventListener('keypress', shortcuts, false);
+
+ """
+ head_js += shortcut_js
+ head_js += open("app/cookie.js").read()
+ head_js += '</script>'
+
+ with gr.Blocks() as about:
+     gr.Markdown(ABOUT)
+
+ with gr.Blocks(
+     css="footer {visibility: hidden} textbox {resize: none} .blurred-text {filter: blur(0.15em);}",
+     head=head_js,
+     title="TTS Arena"
+ ) as app:
+     gr.Markdown(DESCR)
+     gr.TabbedInterface([vote, leaderboard, about, tts_info], ['🗳️ Vote', '🏆 Leaderboard', '📄 About', '🗣 Contenders'])
+     if CITATION_TEXT:
+         with gr.Row():
+             with gr.Accordion("Citation", open=False):
+                 gr.Markdown(f"If you use this data in your publication, please cite us!\n\nCopy the BibTeX citation to cite this source:\n\n```bibtex\n{CITATION_TEXT}\n```\n\nPlease note that all generated audio clips should be assumed unsuitable for redistribution or commercial use.")
app/ui_battle.py ADDED
@@ -0,0 +1,75 @@
+ import gradio as gr
+ from .config import *
+ from .ui import *
+ from .synth import *
+ from .vote import *
+ from .messages import *
+
+ def disable():
+     return [gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)]
+ def enable():
+     return [gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)]
+
+
+ with gr.Blocks() as battle:
+     battle_useridstate = gr.State()
+
+     gr.Markdown(BATTLE_INSTR)
+     model1 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=False)
+     model2 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=False)
+     with gr.Group():
+         with gr.Row():
+             text = gr.Textbox(container=False, show_label=False, placeholder="Enter text to synthesize", lines=1, max_lines=1, scale=9999999, min_width=0)
+             randomt_battle = gr.Button('🎲', scale=0, min_width=0, variant='tool')
+         with gr.Row():
+             with gr.Column(scale=10):
+                 model1s = gr.Dropdown(label="Model 1", container=False, show_label=False, choices=AVAILABLE_MODELS.keys(), interactive=True, value=list(AVAILABLE_MODELS.keys())[0])
+             with gr.Column(scale=10):
+                 model2s = gr.Dropdown(label="Model 2", container=False, show_label=False, choices=AVAILABLE_MODELS.keys(), interactive=True, value=list(AVAILABLE_MODELS.keys())[1])
+     randomt_battle.click(randomsent_battle, outputs=[text, randomt_battle, model1s, model2s])
+     btn = gr.Button("Synthesize", variant='primary')
+     with gr.Row(visible=False) as r2:
+         with gr.Column():
+             with gr.Group():
+                 aud1 = gr.Audio(interactive=False, show_label=False, show_download_button=False, show_share_button=False)
+                 abetter = gr.Button("A is better", variant='primary')
+                 prevmodel1 = gr.Textbox(interactive=False, show_label=False, container=False, value="Vote to reveal model A", text_align="center", lines=1, max_lines=1, visible=False)
+         with gr.Column():
+             with gr.Group():
+                 aud2 = gr.Audio(interactive=False, show_label=False, show_download_button=False, show_share_button=False)
+                 bbetter = gr.Button("B is better", variant='primary')
+                 prevmodel2 = gr.Textbox(interactive=False, show_label=False, container=False, value="Vote to reveal model B", text_align="center", lines=1, max_lines=1, visible=False)
+     autoplay = gr.Checkbox(
+         label="Autoplay audio",
+         value=True
+     )
+
+     outputs = [
+         text,
+         btn,
+         r2,
+         model1,
+         model2,
+         aud1,
+         aud2,
+         abetter,
+         bbetter,
+         prevmodel1,
+         prevmodel2,
+     ]
+     btn\
+         .click(disable, outputs=[btn, abetter, bbetter])\
+         .then(synthandreturn_battle, inputs=[text, model1s, model2s, autoplay], outputs=outputs)\
+         .then(enable, outputs=[btn, abetter, bbetter])
+     nxt_outputs = [abetter, bbetter, prevmodel1, prevmodel2]
+     abetter.click(a_is_better_battle, outputs=nxt_outputs, inputs=[model1, model2, battle_useridstate])
+     bbetter.click(b_is_better_battle, outputs=nxt_outputs, inputs=[model1, model2, battle_useridstate])
+     battle.load(random_m, outputs=[model1s, model2s])
+
+     # Autoplay second audio using JS
+     aud1\
+         .stop(
+             None,
+             inputs=[autoplay],
+             js="(b) => b ? 0 : document.querySelector('.row .gap+.gap button.play-pause-button[aria-label=Play]').click()",
+         )
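
The `disable`/`enable` bracketing around `btn.click` is a re-entrancy guard: the button locks while the two TTS calls run, then unlocks. The same pattern in isolation (a hedged sketch; `slow_task` stands in for `synthandreturn_battle`):

```python
import time
import gradio as gr

def slow_task(x):
    time.sleep(2)  # stands in for the two synthesis calls
    return f"done: {x}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    btn = gr.Button("Run")
    # lock the button while the task runs, then unlock it
    btn.click(lambda: gr.update(interactive=False), outputs=btn)\
        .then(slow_task, inputs=inp, outputs=out)\
        .then(lambda: gr.update(interactive=True), outputs=btn)

if __name__ == "__main__":
    demo.launch()
```

Locking the visible controls is cheaper than server-side queuing here, since the expensive work happens on remote Spaces anyway.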
app/ui_leaderboard.py ADDED
@@ -0,0 +1,23 @@
+ import gradio as gr
+ from .config import *
+ from .leaderboard import *
+ from .messages import *
+
+ with gr.Blocks() as leaderboard:
+     gr.Markdown(LDESC)
+     df = gr.Dataframe(
+         interactive=False,
+         min_width=0,
+         wrap=False,
+         column_widths=[30, 200, 50, 50],
+         datatype=["str", "html", "html", "number"]
+     )
+     reloadbtn = gr.Button("Refresh")
+     with gr.Row():
+         reveal_prelim = gr.Checkbox(label="Reveal preliminary results", info="Show all models, including models with very few human ratings.", scale=1)
+         # hide_battle_votes = gr.Checkbox(label="Hide Battle Mode votes", info="Exclude votes obtained through Battle Mode.", scale=1)
+     reveal_prelim.input(get_leaderboard, inputs=[reveal_prelim], outputs=[df])
+     # hide_battle_votes.input(get_leaderboard, inputs=[reveal_prelim, hide_battle_votes], outputs=[df])
+     leaderboard.load(get_leaderboard, inputs=[reveal_prelim], outputs=[df])
+     reloadbtn.click(get_leaderboard, inputs=[reveal_prelim], outputs=[df])
+     # gr.Markdown("DISCLAIMER: The licenses listed may not be accurate or up to date; you are responsible for checking the licenses before using the models. Also note that some models may have additional usage restrictions.")
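
Because the leaderboard cells carry HTML links to each model's Space, the `datatype` list mixes `"html"` and plain columns. A minimal reproduction with made-up rows (the real data comes from `get_leaderboard` and the votes DB):

```python
import gradio as gr
import pandas as pd

def get_board():
    # made-up rows for illustration only
    return pd.DataFrame({
        "#": ["1", "2"],
        "Model": ['<a href="#">Model A</a>', '<a href="#">Model B</a>'],
        "Score": ['<b>1020</b>', '<b>990</b>'],
        "Votes": [120, 98],
    })

with gr.Blocks() as demo:
    df = gr.Dataframe(interactive=False, datatype=["str", "html", "html", "number"])
    demo.load(get_board, outputs=[df])

if __name__ == "__main__":
    demo.launch()
```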
app/ui_vote.py ADDED
@@ -0,0 +1,194 @@
+ import gradio as gr
+ from .config import *
+ from .synth import *
+ from .vote import *
+ from .messages import *
+ from .sample_caching import *
+
+ blur_text_js = 'document.getElementById("arena-text-input").classList.add("blurred-text")'
+ unblur_text_js = 'document.getElementById("arena-text-input").classList.remove("blurred-text")'
+
+ def disable():
+     return [gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)]
+ def enable():
+     return [gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)]
+ def blur_text():
+     return gr.update(elem_classes=['blurred-text'])
+ def unblur_text():
+     return gr.update(elem_classes=[])
+
+ with gr.Blocks() as vote:
+     session_hash = gr.Textbox(visible=False, value='')
+
+     # sample played; Checkboxes are used so that JS can fetch the values
+     aplayed = gr.Checkbox(visible=False, value=False)
+     bplayed = gr.Checkbox(visible=False, value=False)
+     # voter ID
+     useridstate = gr.State()
+     gr.Markdown(INSTR)
+     with gr.Group():
+         with gr.Row():
+             cachedt = gr.Button('⚡', scale=0, min_width=0, variant='tool', interactive=True)
+             text = gr.Textbox(
+                 container=False,
+                 show_label=False,
+                 placeholder="Enter text to synthesize",
+                 lines=1,
+                 max_lines=1,
+                 scale=9999999,
+                 min_width=0,
+                 elem_id="arena-text-input",
+             )
+             randomt = gr.Button('🎲', scale=0, min_width=0, variant='tool')
+     randomt\
+         .click(randomsent, outputs=[cachedt, text, randomt])\
+         .then(None, js="() => " + unblur_text_js)
+     btn = gr.Button("Synthesize", variant='primary')
+     model1 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=False)
+     model2 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=False)
+     with gr.Row(visible=False) as r2:
+         with gr.Column():
+             with gr.Group():
+                 aud1 = gr.Audio(
+                     interactive=False,
+                     show_label=False,
+                     show_download_button=False,
+                     show_share_button=False,
+                     # waveform_options={'waveform_progress_color': '#EF4444'},
+                     # var(--color-red-500); Gradio only accepts HEX and CSS color names
+                 )
+                 abetter = gr.Button(
+                     "[A] is better",
+                     elem_id='arena-a-better',
+                     variant='primary',
+                     interactive=False,
+                 )
+                 prevmodel1 = gr.HTML(
+                     show_label=False,
+                     value="Vote to reveal model A",
+                     visible=False,
+                 )
+         with gr.Column():
+             with gr.Group():
+                 aud2 = gr.Audio(
+                     interactive=False,
+                     show_label=False,
+                     show_download_button=False,
+                     show_share_button=False,
+                     waveform_options={'waveform_progress_color': '#3C82F6'},
+                     # var(--secondary-500); Gradio only accepts HEX and CSS color names
+                 )
+                 bbetter = gr.Button(
+                     "[B] is better",
+                     elem_id='arena-b-better',
+                     variant='primary',
+                     interactive=False,
+                 )
+                 prevmodel2 = gr.HTML(
+                     show_label=False,
+                     value="Vote to reveal model B",
+                     visible=False,
+                 )
+     nxtroundbtn = gr.Button(
+         '⚡ [N]ext round',
+         elem_id='arena-next-round',
+         visible=False,
+         variant='primary',
+     )
+     autoplay = gr.Checkbox(
+         label="Autoplay audio",
+         value=True
+     )
+
+     outputs = [
+         text,
+         btn,
+         r2,
+         model1,
+         model2,
+         aud1,
+         aud2,
+         abetter,
+         bbetter,
+         prevmodel1,
+         prevmodel2,
+         nxtroundbtn,
+         aplayed,
+         bplayed,
+     ]
+     """
+     text,
+     "Synthesize",
+     gr.update(visible=True), # r2
+     mdl1, # model1
+     mdl2, # model2
+     gr.update(visible=True, value=results[mdl1]), # aud1
+     gr.update(visible=True, value=results[mdl2]), # aud2
+     gr.update(visible=True, interactive=False), # abetter
+     gr.update(visible=True, interactive=False), # bbetter
+     gr.update(visible=False), # prevmodel1
+     gr.update(visible=False), # prevmodel2
+     gr.update(visible=False), # nxt round btn"""
+     # , concurrency_count=1, concurrency_id="sync_queue"
+     btn\
+         .click(disable, outputs=[btn, abetter, bbetter, cachedt])\
+         .then(synthandreturn, inputs=[text, autoplay], outputs=outputs)\
+         .then(enable, outputs=[btn, gr.State(), gr.State(), gr.State()])\
+         .then(None, js="() => " + unblur_text_js)
+     # Next Round; blur the text
+     nxtroundbtn\
+         .click(clear_stuff, outputs=outputs)\
+         .then(disable, outputs=[btn, abetter, bbetter, cachedt])\
+         .then(give_cached_sample, inputs=[session_hash, autoplay], outputs=[*outputs, cachedt])\
+         .then(enable, outputs=[btn, gr.State(), gr.State(), gr.State()])
+     # blur the text
+     nxtroundbtn.click(None, js="() => " + blur_text_js)
+
+     # fetch a comparison pair from the cache
+     cachedt\
+         .click(disable, outputs=[btn, abetter, bbetter, cachedt])\
+         .then(give_cached_sample, inputs=[session_hash, autoplay], outputs=[*outputs, cachedt])\
+         .then(enable, outputs=[btn, gr.State(), gr.State(), gr.State()])
+     # TODO: await download of sample before allowing playback
+
+     # Allow interaction with the vote buttons only when both audio samples have finished playing
+     aud1\
+         .stop(
+             unlock_vote,
+             inputs=[autoplay, gr.State(value=0), aplayed, bplayed],
+             outputs=[abetter, bbetter, aplayed, bplayed],
+         )\
+         .then(
+             None,
+             inputs=[bplayed],  # `autoplay` is a component (always truthy at build time), so `bplayed if autoplay else True` reduced to this
+             js="(b) => b ? 0 : document.querySelector('.row .gap+.gap button.play-pause-button[aria-label=Play]').click()",
+         )
+     # autoplay if unplayed
+     aud2\
+         .stop(
+             unlock_vote,
+             inputs=[autoplay, gr.State(value=1), aplayed, bplayed],
+             outputs=[abetter, bbetter, aplayed, bplayed],
+         )
+     # unblur the text with JS; faster than sending output with elem_classes
+     aud2.stop(None, inputs=[aplayed], js="(a) => a ? " + unblur_text_js + " : 0;")
+
+     nxt_outputs = [abetter, bbetter, prevmodel1, prevmodel2, nxtroundbtn]
+     abetter\
+         .click(a_is_better, outputs=nxt_outputs, inputs=[model1, model2, useridstate, text])\
+         .then(voted_on_cached, inputs=[model1, model2, text, session_hash], outputs=[])
+     bbetter\
+         .click(b_is_better, outputs=nxt_outputs, inputs=[model1, model2, useridstate, text])\
+         .then(voted_on_cached, inputs=[model1, model2, text, session_hash], outputs=[])
+
+     # get session cookie
+     vote\
+         .load(
+             None,
+             None,
+             session_hash,
+             js="() => { return getArenaCookie('session') }",
+         )
+     # give a cached sample pair to the voter; .then() did not work here
+     vote\
+         .load(give_cached_sample, inputs=[session_hash, autoplay], outputs=[*outputs, cachedt])
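
Several chains above pass a component's value straight into a client-side snippet via `js=` with `fn=None`. The pattern in isolation (a hypothetical example; Gradio hands the listed inputs to the JS function as arguments):

```python
import gradio as gr

with gr.Blocks() as demo:
    played = gr.Checkbox(visible=False, value=False)
    btn = gr.Button("Log state")
    # the checkbox value arrives as the first argument of the JS function
    btn.click(None, inputs=[played], js="(p) => console.log('played:', p)")

if __name__ == "__main__":
    demo.launch()
```

Running these hooks client-side avoids a server round-trip, which is why the arena uses them for blurring the prompt text and for chaining autoplay of sample B after sample A ends.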
app/utils.py ADDED
@@ -0,0 +1,20 @@
+ import uuid
+ import soundfile as sf
+ import pydub
+ import pyloudnorm as pyln
+
+ def match_target_amplitude(sound, target_dBFS):
+     change_in_dBFS = target_dBFS - sound.dBFS
+     return sound.apply_gain(change_in_dBFS)
+
+ def mkuuid(uid):
+     if not uid:
+         uid = uuid.uuid4()
+     return uid
+
+ def doloudnorm(path):
+     data, rate = sf.read(path)
+     meter = pyln.Meter(rate)
+     loudness = meter.integrated_loudness(data)
+     loudness_normalized_audio = pyln.normalize.loudness(data, loudness, -12.0)
+     sf.write(path, loudness_normalized_audio, rate)
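
`doloudnorm` rewrites a file in place, normalized to −12 LUFS with pyloudnorm's BS.1770 meter. A hedged usage sketch (`sample.wav` is an illustrative path; assumes the `app` package is importable):

```python
from app.utils import doloudnorm, mkuuid

doloudnorm("sample.wav")   # overwrites the file, normalized to -12 LUFS
print(mkuuid(None))        # falls back to a fresh uuid4 when no ID is supplied
```

−12 LUFS is hotter than the −23 LUFS broadcast target of EBU R128, a reasonable choice for short clips auditioned back to back in a browser.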
app/vote.py ADDED
@@ -0,0 +1,143 @@
+ from .utils import *
+ from .config import *
+ from .models import *
+ from .db import *
+ from .init import *
+
+ import gradio as gr
+
+ # Logging
+
+ def log_text(text):
+     conn = get_db()
+     cursor = conn.cursor()
+     cursor.execute('INSERT INTO spokentext (spokentext) VALUES (?)', (text,))
+     if scheduler:
+         with scheduler.lock:
+             conn.commit()
+     else:
+         conn.commit()
+     cursor.close()
+
+ # Vote
+
+ def upvote_model(model, uname, battle=False):
+     conn = get_db()
+     cursor = conn.cursor()
+     if battle: uname = "unknown_battle"
+     cursor.execute('UPDATE model SET upvote = upvote + 1 WHERE name = ?', (model,))
+     if cursor.rowcount == 0:
+         cursor.execute('INSERT OR REPLACE INTO model (name, upvote, downvote) VALUES (?, 1, 0)', (model,))
+     cursor.execute('INSERT INTO vote (username, model, vote) VALUES (?, ?, ?)', (uname, model, 1,))
+     if scheduler:
+         with scheduler.lock:
+             conn.commit()
+     else:
+         conn.commit()
+     cursor.close()
+
+ def downvote_model(model, uname, battle=False):
+     conn = get_db()
+     cursor = conn.cursor()
+     if battle: uname = "unknown_battle"
+     cursor.execute('UPDATE model SET downvote = downvote + 1 WHERE name = ?', (model,))
+     if cursor.rowcount == 0:
+         cursor.execute('INSERT OR REPLACE INTO model (name, upvote, downvote) VALUES (?, 0, 1)', (model,))
+     cursor.execute('INSERT INTO vote (username, model, vote) VALUES (?, ?, ?)', (uname, model, -1,))
+     if scheduler:
+         with scheduler.lock:
+             conn.commit()
+     else:
+         conn.commit()
+     cursor.close()
+
+ # Battle Mode
+
+ def a_is_better_battle(model1, model2, userid):
+     return a_is_better(model1, model2, 'unknown_battle', '')
+ def b_is_better_battle(model1, model2, userid):
+     return b_is_better(model1, model2, 'unknown_battle', '')
+
+ # A/B better
+
+ def a_is_better(model1, model2, userid, text):
+     return is_better(model1, model2, userid, text, True)
+ def b_is_better(model1, model2, userid, text):
+     return is_better(model1, model2, userid, text, False)
+
+ def is_better(model1, model2, userid, text, chose_a):
+     if (
+         (
+             model1 not in AVAILABLE_MODELS.keys()
+             and model1 not in AVAILABLE_MODELS.values()
+         )
+         or (
+             model2 not in AVAILABLE_MODELS.keys()
+             and model2 not in AVAILABLE_MODELS.values()
+         )
+     ):
+         raise gr.Error('Sorry, please try voting again.')
+
+     # userid is unique for each cast vote pair
+     userid = mkuuid(userid)
+     if model1 and model2:
+         conn = get_db()
+         cursor = conn.cursor()
+         sql_query = 'INSERT INTO votelog (username, chosen, rejected) VALUES (?, ?, ?)'
+         if chose_a:
+             cursor.execute(sql_query, (str(userid), model1, model2))
+         else:
+             cursor.execute(sql_query, (str(userid), model2, model1))
+
+         with scheduler.lock:
+             conn.commit()
+             # also retrieve the primary key ID
+             cursor.execute('SELECT last_insert_rowid()')
+             votelogid = cursor.fetchone()[0]
+         cursor.close()
+
+         if chose_a:
+             upvote_model(model1, str(userid))
+             downvote_model(model2, str(userid))
+         else:
+             upvote_model(model2, str(userid))
+             downvote_model(model1, str(userid))
+         log_text(text)
+         # log_text(text, votelogid)
+
+     return reload(model1, model2, userid, chose_a=chose_a, chose_b=(not chose_a))
+
+ # Reload
+ def reload(chosenmodel1=None, chosenmodel2=None, userid=None, chose_a=False, chose_b=False):
+     chosenmodel1 = make_link_to_space(chosenmodel1)
+     chosenmodel2 = make_link_to_space(chosenmodel2)
+     out = [
+         gr.update(interactive=False, visible=False),
+         gr.update(interactive=False, visible=False)
+     ]
+     style = 'text-align: center; font-size: 1rem; margin-bottom: 0; padding: var(--input-padding)'
+     if chose_a:
+         out.append(gr.HTML(value=f'<p style="{style}">Your vote: {chosenmodel1}</p>', visible=True))
+         out.append(gr.HTML(value=f'<p style="{style}">{chosenmodel2}</p>', visible=True))
+     else:
+         out.append(gr.HTML(value=f'<p style="{style}">{chosenmodel1}</p>', visible=True))
+         out.append(gr.HTML(value=f'<p style="{style}">Your vote: {chosenmodel2}</p>', visible=True))
+     out.append(gr.update(visible=True))
+     return out
+
+ def unlock_vote(autoplay, btn_index, aplayed, bplayed):
+     if not autoplay:
+         return [gr.update(), gr.update(), aplayed, bplayed]
+
+     # mark the sample that just finished playing
+     if btn_index == 0:
+         aplayed = True
+     if btn_index == 1:
+         bplayed = True
+
+     # both audio samples have been played
+     if bool(aplayed) and bool(bplayed):
+         # print('Both audio samples played, voting unlocked')
+         return [gr.update(interactive=True), gr.update(interactive=True), True, True]
+
+     return [gr.update(), gr.update(), aplayed, bplayed]
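
The queries above imply roughly the following schema — reconstructed for illustration only; the authoritative DDL lives in app/db.py, which this excerpt does not show:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
# inferred from the INSERT/UPDATE statements above, not copied from app/db.py
cur.execute("CREATE TABLE model (name TEXT PRIMARY KEY, upvote INTEGER, downvote INTEGER)")
cur.execute("CREATE TABLE vote (id INTEGER PRIMARY KEY, username TEXT, model TEXT, vote INTEGER)")
cur.execute("CREATE TABLE votelog (id INTEGER PRIMARY KEY, username TEXT, chosen TEXT, rejected TEXT)")
cur.execute("CREATE TABLE spokentext (id INTEGER PRIMARY KEY, spokentext TEXT)")

# one A/B vote = one votelog row plus an upvote/downvote pair, as in is_better()
cur.execute("INSERT INTO votelog (username, chosen, rejected) VALUES (?, ?, ?)", ("u1", "ModelA", "ModelB"))
cur.execute("UPDATE model SET upvote = upvote + 1 WHERE name = ?", ("ModelA",))
if cur.rowcount == 0:
    cur.execute("INSERT OR REPLACE INTO model (name, upvote, downvote) VALUES (?, 1, 0)", ("ModelA",))
conn.commit()
print(cur.execute("SELECT * FROM model").fetchall())
```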
requirements.txt CHANGED
@@ -1,8 +1,8 @@
  datasets
  librosa
  soundfile
+ gradio-client
  git+https://github.com/unitaryai/detoxify
  pyloudnorm
  langdetect
  pydub
- transformers