Dionyssos committed on
Commit
c9121c5
·
1 Parent(s): 33b0763

visuals for 3mimic & 1human - draft

Browse files
Files changed (1) hide show
  1. mimic3_make_harvard_sentences.py +127 -231
mimic3_make_harvard_sentences.py CHANGED
@@ -1,10 +1,3 @@
1
- # 1. Synthesize Harvard Sentences via Mimic-3 - 1 voice
2
- # 1. Synthesize via StyleTTS2 --> use same or sweetdreams
3
- # 2. Run audinterface on this 767
4
- # 3. .mimic3_pkl .styletts2_pkl -> different durations
5
-
6
- # It may crash due to non-truly-blocking shutil.copyfile() saying onnx protobuf incomplete file
7
- # You have to rerun the script - it will copy all voices from hf:mimic3-voices to ~/.local/mimic3
8
  import shutil
9
  import csv
10
  import io
@@ -12,6 +5,7 @@ import os
12
  import typing
13
  import wave
14
  import sys
 
15
  from mimic3_tts.__main__ import (CommandLineInterfaceState,
16
  get_args,
17
  initialize_args,
@@ -21,7 +15,7 @@ from mimic3_tts.__main__ import (CommandLineInterfaceState,
21
  shutdown_tts,
22
  OutputNaming,
23
  process_line)
24
- # import msinference
25
  import time
26
  import json
27
  import pandas as pd
@@ -39,31 +33,44 @@ import audiofile
39
 
40
 
41
  # ================================================ LIST OF VOICES
42
- ROOT_DIR = '/data/dkounadis/mimic3-voices/'
43
- foreign_voices = []
44
- english_voices = []
45
- for lang in os.listdir(ROOT_DIR + 'voices'):
46
 
47
- for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
48
- if 'en_' in lang:
49
-
50
- try:
51
- with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
52
- for spk in f:
53
- english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
54
- # voice_spk_string = lang + '/' + voice + '#' + spk.rstrip() for spk in f
55
- except FileNotFoundError:
56
- english_voices.append(lang + '/' + voice)
57
-
58
- else:
59
 
60
- try:
61
- with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
62
- for spk in f:
63
- foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
64
 
65
- except FileNotFoundError:
66
- foreign_voices.append(lang + '/' + voice)
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  # ================================================== INTERFACE MODELS
68
  LABELS = [
69
  'arousal', 'dominance', 'valence',
@@ -156,8 +163,8 @@ interface = audinterface.Feature(
156
  process_func=process_function,
157
  # process_func_args={'outputs': 'logits_scene'},
158
  process_func_applies_sliding_window=False,
159
- win_dur=4.0,
160
- hop_dur=1.0,
161
  sampling_rate=16000,
162
  resample=True,
163
  verbose=True,
@@ -168,38 +175,6 @@ interface = audinterface.Feature(
168
 
169
 
170
 
171
-
172
-
173
-
174
-
175
-
176
-
177
-
178
-
179
-
180
-
181
-
182
-
183
-
184
-
185
-
186
-
187
- # Filter insufficient durations - prompt
188
- foreign_voices = [i for i in foreign_voices if i not in ['bn/multi_low#02194',
189
- 'uk_UK/m-ailabs_low#obruchov',
190
- 'uk_UK/m-ailabs_low#shepel',
191
- 'uk_UK/m-ailabs_low#loboda',
192
- 'uk_UK/m-ailabs_low#miskun',
193
- 'uk_UK/m-ailabs_low#sumska',
194
- 'uk_UK/m-ailabs_low#pysariev',
195
- ]]
196
-
197
- # print(english_voices, '\n_________________________\n', foreign_voices)
198
- # ----------------------
199
- # print(foreign_voices.keys(), len(foreign_voices))
200
- # raise SystemExit
201
-
202
-
203
  def process_lines(state: CommandLineInterfaceState, wav_path=None):
204
  '''MIMIC3 INTERNAL CALL that yields the sigh sound'''
205
 
@@ -264,114 +239,36 @@ def process_lines(state: CommandLineInterfaceState, wav_path=None):
264
  # https://huggingface.co/dkounadis/artificial-styletts2/tree/main/mimic3_foreign
265
 
266
  # STYLES Already Made - HF
267
- english_dir = 'english_pkl/'
268
- foreign_dir = 'foreign_pkl/'
269
 
270
- Path(english_dir).mkdir(parents=True, exist_ok=True)
271
- Path(foreign_dir).mkdir(parents=True, exist_ok=True)
272
 
273
 
274
-
275
- # # state.output_dir = '.noopy'
276
- # # state.interactive = False
277
- # # state.output_naming = OutputNaming.TIME
278
- # # # state.ssml = 1234546575
279
- # # state.stdout = True
280
- # # state.tts = True
281
- # process_lines(state, wav_path='tmp1.wav')
282
- # shutdown_tts(state)
283
- # x, fs = audiofile.read('tmp1.wav')
284
- # total_audio_mimic3.append(x)
285
- # print(fs, text, 'mimic3')
286
 
287
- # # MIMIC3 = = = = = = = = = = = = = = END
288
-
289
-
290
-
291
-
292
-
293
-
294
- # total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
295
- # audiofile.write(foreign_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 24000)
296
-
297
- # total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
298
- # audiofile.write(foreign_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 22050)
299
-
300
- # print('Saving:', foreign_dir + 'mimic3__' + _str + '.wav')
301
- # else:
302
- # print('Skip:', foreign_dir + 'styletts2__' + _str + '.wav')
303
-
304
-
305
-
306
-
307
-
308
-
309
-
310
-
311
-
312
-
313
-
314
-
315
-
316
-
317
-
318
-
319
-
320
-
321
-
322
-
323
-
324
-
325
-
326
-
327
-
328
-
329
-
330
-
331
-
332
-
333
-
334
-
335
-
336
-
337
-
338
-
339
-
340
-
341
-
342
-
343
- # load all harvard and for every voice -> load-its-style -> synth-mimic3 -> synth-stylett2 -> run-both-pkl
344
- # FOREIGN
345
- for folder, list_voices in [
346
- ['foreign', foreign_voices],
347
- ['english', english_voices],
348
- ]:
349
- print(folder, list_voices[:4], '\n\nEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE')
350
- for _id, _voice in enumerate(list_voices[:4]):
351
- _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
352
- _dir = folder + '_pkl/'
353
- if 'cmu-arctic' in _str:
354
- _str = _str.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
355
-
356
- print('\n\n\n\nExecuting', _voice,'\n\n\n\n\n')
357
-
358
- if (
359
- not os.path.isfile(_dir + 'mimic3__' + _str + '.wav') or
360
- not os.path.isfile(_dir + 'styletts2__' + _str + '.wav')
361
- ):
362
-
363
- # Mimic3 GitHub Quota exceeded:
364
- # https://github.com/MycroftAI/mimic3-voices
365
- # Above repo can exceed download quota of LFS
366
- # Copy mimic-voices from local copies
367
- # clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
368
- # copy to ~/
369
- #
370
- #
371
  home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
372
  Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
373
-
374
-
375
  speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
376
 
377
 
@@ -386,53 +283,28 @@ for folder, list_voices in [
386
  f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
387
  home_voice_dir + 'generator.onnx')
388
 
389
-
390
-
391
- # pre made
392
- prompt_path = f'mimic3_{folder}_4x/' + _str + '.wav'
393
-
394
-
395
 
 
 
 
 
 
 
 
 
396
 
 
397
 
 
 
398
 
399
 
 
400
 
401
-
402
-
403
-
404
-
405
- # ACTUAL TTS
406
-
407
-
408
- with open('harvard.json', 'r') as f:
409
- harvard_individual_sentences = json.load(f)['sentences']
410
- total_audio_mimic3 = []
411
- total_audio_stts2 = []
412
- ix = 0
413
- for list_of_10 in harvard_individual_sentences[:1]: # 77
414
- text = ' '.join(list_of_10['sentences'])
415
- # harvard.append(long_sentence.replace('.', ' '))
416
- # for text in list_of_10['sentences']:
417
- style_vec = msinference.compute_style(prompt_path)
418
- print(ix, text)
419
- ix += 1
420
-
421
-
422
- x = msinference.inference(text,
423
- style_vec,
424
- alpha=0.3,
425
- beta=0.7,
426
- diffusion_steps=7,
427
- embedding_scale=1)
428
-
429
- total_audio_stts2.append(x)
430
-
431
- # also synthesize mimic with the same sentence and voice
432
-
433
- # MIMIC-3 = = = = = = = = = = = = = = BEGIN
434
-
435
- rate = 1 # high speed sounds nice if used as speaker-reference audio for StyleTTS2
436
  _ssml = (
437
  '<speak>'
438
  '<prosody volume=\'64\'>'
@@ -472,51 +344,75 @@ for folder, list_voices in [
472
  process_lines(state, wav_path='tmp1.wav')
473
  shutdown_tts(state)
474
  x, fs = audiofile.read('tmp1.wav')
475
- total_audio_mimic3.append(x)
476
- print(fs, text, 'mimic3')
477
-
478
- # MIMIC3 = = = = = = = = = = = = = = END
 
 
 
 
 
 
479
 
480
 
481
 
 
 
 
482
 
483
 
 
 
 
 
 
 
 
 
484
 
485
- total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
486
- audiofile.write(_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 24000)
487
 
488
- total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
489
- audiofile.write(_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 22050)
490
 
491
- print('Saving:', _dir + 'mimic3__' + _str + '.wav')
492
- else:
493
- print('Skip:', _dir + 'styletts2__' + _str + '.wav')
494
 
495
-
496
- # AUD I N T E R F A C E
497
- # file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
498
 
499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
 
501
- for engine in ['mimic3', 'styletts2']:
502
- harvard_of_voice = f'{_dir}{engine}__{_str}'
503
- if not os.path.exists(harvard_of_voice + '.pkl'):
504
- df = interface.process_file(harvard_of_voice + '.wav')
505
- df.to_pickle(harvard_of_voice + '.pkl')
506
- else:
507
- # df = pd.read_pickle(harvard_of_voice + '.pkl')
508
- print(harvard_of_voice + '.pkl', 'FOUND')
509
 
510
 
 
 
 
 
 
 
 
 
 
511
 
512
 
513
 
514
- # Here we have pkls
 
515
 
516
 
517
 
518
 
519
-
520
 
521
  # ===============================================================================
522
  # V I S U A L S
 
 
 
 
 
 
 
 
1
  import shutil
2
  import csv
3
  import io
 
5
  import typing
6
  import wave
7
  import sys
8
+ import audresample
9
  from mimic3_tts.__main__ import (CommandLineInterfaceState,
10
  get_args,
11
  initialize_args,
 
15
  shutdown_tts,
16
  OutputNaming,
17
  process_line)
18
+ import msinference
19
  import time
20
  import json
21
  import pandas as pd
 
33
 
34
 
35
  # ================================================ LIST OF VOICES
36
+ # ROOT_DIR = '/data/dkounadis/mimic3-voices/'
37
+ # foreign_voices = []
38
+ # english_voices = []
39
+ # for lang in os.listdir(ROOT_DIR + 'voices'):
40
 
41
+ # for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
42
+ # if 'en_' in lang:
43
+
44
+ # try:
45
+ # with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
46
+ # for spk in f:
47
+ # english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
48
+ # # voice_spk_string = lang + '/' + voice + '#' + spk.rstrip() for spk in f
49
+ # except FileNotFoundError:
50
+ # english_voices.append(lang + '/' + voice)
51
+
52
+ # else:
53
 
54
+ # try:
55
+ # with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
56
+ # for spk in f:
57
+ # foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
58
 
59
+ # except FileNotFoundError:
60
+ # foreign_voices.append(lang + '/' + voice)
61
+ # #
62
+ # [print(i) for i in foreign_voices]
63
+ # print('\n_______________________________\n')
64
+ # [print(i) for i in english_voices]
65
+ # ====================================================== END PRINT LIST OF VOICES
66
+ list_voices = [
67
+ 'en_US/m-ailabs_low#mary_ann',
68
+ 'en_UK/apope_low',
69
+ 'de_DE/thorsten-emotion_low#neutral', # is the 4x really interesting we can just write it in Section
70
+ 'human'
71
+ ] # special - for human we load specific style file - no Mimic3 is run
72
+
73
+
74
  # ================================================== INTERFACE MODELS
75
  LABELS = [
76
  'arousal', 'dominance', 'valence',
 
163
  process_func=process_function,
164
  # process_func_args={'outputs': 'logits_scene'},
165
  process_func_applies_sliding_window=False,
166
+ win_dur=7.0,
167
+ hop_dur=4.0,
168
  sampling_rate=16000,
169
  resample=True,
170
  verbose=True,
 
175
 
176
 
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  def process_lines(state: CommandLineInterfaceState, wav_path=None):
179
  '''MIMIC3 INTERNAL CALL that yields the sigh sound'''
180
 
 
239
  # https://huggingface.co/dkounadis/artificial-styletts2/tree/main/mimic3_foreign
240
 
241
  # STYLES Already Made - HF
242
+ out_dir = 'out_dir/'
243
+ Path(out_dir).mkdir(parents=True, exist_ok=True)
244
 
 
 
245
 
246
 
247
+ for _id, _voice in enumerate(list_voices):
248
+ _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
249
+
250
+ if 'cmu-arctic' in _str:
251
+ _str = _str.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
 
 
 
 
 
 
 
252
 
253
+ print('\n\n\n\nExecuting', _voice,'\n\n\n\n\n')
254
+
255
+ if (
256
+ not os.path.isfile(out_dir + 'mimic3__' + _str + '.wav') or
257
+ not os.path.isfile(out_dir + 'styletts2__' + _str + '.wav')
258
+ ):
259
+
260
+ # Mimic3 GitHub Quota exceeded:
261
+ # https://github.com/MycroftAI/mimic3-voices
262
+ # Above repo can exceed download quota of LFS
263
+ # Copy mimic-voices from local copies
264
+ # clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
265
+ # copy to ~/
266
+ #
267
+ #
268
+ if 'human' not in _voice:
269
+ # ensure the mimic-3 generator .onnx exists
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
271
  Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
 
 
272
  speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
273
 
274
 
 
283
  f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
284
  home_voice_dir + 'generator.onnx')
285
 
286
+
 
 
 
 
 
287
 
288
+
289
+ # prompt_path = f'mimic3_{folder}_4x/' + _str + '.wav'
290
+ with open('harvard.json', 'r') as f:
291
+ harvard_individual_sentences = json.load(f)['sentences']
292
+ total_audio_mimic3 = []
293
+ total_audio_stts2 = []
294
+ ix = 0
295
+ for list_of_10 in harvard_individual_sentences[:1]: # 77
296
 
297
+ text = ' '.join(list_of_10['sentences'])
298
 
299
+ print(ix, text)
300
+ ix += 1
301
 
302
 
303
+ # Synthesize with Mimic-3, then use the result as the prompt for StyleTTS2
304
 
305
+ # MIMIC-3 if _voice is not HUMAN
306
+ if 'human' not in _voice:
307
+ rate = 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  _ssml = (
309
  '<speak>'
310
  '<prosody volume=\'64\'>'
 
344
  process_lines(state, wav_path='tmp1.wav')
345
  shutdown_tts(state)
346
  x, fs = audiofile.read('tmp1.wav')
347
+ print(x.shape)
348
+ else:
349
+ # MSP['valence.train.votes'].get().sort_values('7').index[-1]
350
+ human_style = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0235_0053.wav'
351
+ x, fs = audiofile.read(human_style)
352
+ print(x.shape,' human') # crop human to almost mimic-3 duration
353
+ total_audio_mimic3.append(x)
354
+ print(fs, text, 'mimic3')
355
+
356
+ # MIMIC3 = = = = = = = = = = = = = = END
357
 
358
 
359
 
360
+
361
+ style_vec = msinference.compute_style('tmp1.wav') # use mimic-3 as prompt
362
+
363
 
364
 
365
+ x = msinference.inference(text,
366
+ style_vec,
367
+ alpha=0.3,
368
+ beta=0.7,
369
+ diffusion_steps=7,
370
+ embedding_scale=1)
371
+
372
+ total_audio_stts2.append(x)
373
 
 
 
374
 
 
 
375
 
 
 
 
376
 
 
 
 
377
 
378
 
379
+ total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
380
+ total_audio_stts2 = audresample.resample(total_audio_stts2, original_rate=24000, target_rate=16000)[0] # for audinterface
381
+ audiofile.write(out_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 16000)
382
+
383
+ total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
384
+ total_audio_mimic3 = audresample.resample(total_audio_mimic3, original_rate=24000, target_rate=16000)[0]
385
+ audiofile.write(out_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 16000)
386
+
387
+ print('Saving:', out_dir + 'mimic3__' + _str + '.wav')
388
+ else:
389
+ print('Skip:', out_dir + 'styletts2__' + _str + '.wav')
390
+
391
+
392
+ # AUD I N T E R F A C E
393
+ # file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
394
 
 
 
 
 
 
 
 
 
395
 
396
 
397
+ for engine in ['mimic3', 'styletts2']:
398
+ harvard_of_voice = f'{out_dir}{engine}__{_str}'
399
+ if not os.path.exists(harvard_of_voice + '.pkl'):
400
+ df = interface.process_file(harvard_of_voice + '.wav')
401
+ df.to_pickle(harvard_of_voice + '.pkl')
402
+ else:
403
+ # df = pd.read_pickle(harvard_of_voice + '.pkl')
404
+ print(harvard_of_voice + '.pkl', 'FOUND')
405
+
406
 
407
 
408
 
409
+
410
+
411
 
412
 
413
 
414
 
415
+ print('\nVisuals\n')
416
 
417
  # ===============================================================================
418
  # V I S U A L S