yesyesjaewook committed on
Commit
3a7d489
·
1 Parent(s): 0f5fae7

Initial commit

Browse files
README.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: ko
7
+ datasets:
8
+ - Jaewook
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### yesyesjaewook/jets-jaewook-ko
15
+
16
+ This model was trained by yesyesjaewook using the jaewook recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Citing ESPnet
19
+
20
+ ```bibtex
21
+ @inproceedings{watanabe2018espnet,
22
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
23
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
24
+ year={2018},
25
+ booktitle={Proceedings of Interspeech},
26
+ pages={2207--2211},
27
+ doi={10.21437/Interspeech.2018-1456},
28
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
29
+ }
30
+ @inproceedings{hayashi2020espnet,
31
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
32
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
33
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
34
+ pages={7654--7658},
35
+ year={2020},
36
+ organization={IEEE}
37
+ }
38
+ ```
39
+
40
+ or arXiv:
41
+
42
+ ```bibtex
43
+ @misc{watanabe2018espnet,
44
+ title={ESPnet: End-to-End Speech Processing Toolkit},
45
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
46
+ year={2018},
47
+ eprint={1804.00015},
48
+ archivePrefix={arXiv},
49
+ primaryClass={cs.CL}
50
+ }
51
+ ```
experiments/tts_jaewook/config.yaml ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/fine_tune_jets_multi_lang.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: experiments/tts_jaewook
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 8
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - text2mel_loss
39
+ - min
40
+ - - train
41
+ - text2mel_loss
42
+ - min
43
+ - - train
44
+ - total_count
45
+ - max
46
+ keep_nbest_models: 100
47
+ nbest_averaging_interval: 0
48
+ grad_clip: -1
49
+ grad_clip_type: 2.0
50
+ grad_noise: false
51
+ accum_grad: 1
52
+ no_forward_run: false
53
+ resume: true
54
+ train_dtype: float32
55
+ use_amp: false
56
+ log_interval: 50
57
+ use_matplotlib: true
58
+ use_tensorboard: true
59
+ create_graph_in_tensorboard: false
60
+ use_wandb: true
61
+ wandb_project: null
62
+ wandb_id: null
63
+ wandb_entity: null
64
+ wandb_name: null
65
+ wandb_model_log_interval: -1
66
+ detect_anomaly: false
67
+ pretrain_path: null
68
+ init_param: []
69
+ ignore_init_mismatch: false
70
+ freeze_param:
71
+ - tts.generator.lid_emb
72
+ num_iters_per_epoch: 1000
73
+ batch_size: 20
74
+ valid_batch_size: null
75
+ batch_bins: 3500000
76
+ valid_batch_bins: null
77
+ train_shape_file:
78
+ - experiments/tts_stats_raw_phn_none/train/text_shape.phn
79
+ - experiments/tts_stats_raw_phn_none/train/speech_shape
80
+ valid_shape_file:
81
+ - experiments/tts_stats_raw_phn_none/valid/text_shape.phn
82
+ - experiments/tts_stats_raw_phn_none/valid/speech_shape
83
+ batch_type: numel
84
+ valid_batch_type: null
85
+ fold_length:
86
+ - 150
87
+ - 204800
88
+ sort_in_batch: descending
89
+ sort_batch: descending
90
+ multiple_iterator: false
91
+ chunk_length: 500
92
+ chunk_shift_ratio: 0.5
93
+ num_cache_chunks: 1024
94
+ chunk_excluded_key_prefixes: []
95
+ train_data_path_and_name_and_type:
96
+ - - dumps/raw/train_phn/text
97
+ - text
98
+ - text
99
+ - - dumps/raw/train_phn/wav.scp
100
+ - speech
101
+ - sound
102
+ - - dumps/raw/train_phn/utt2lid
103
+ - lids
104
+ - text_int
105
+ valid_data_path_and_name_and_type:
106
+ - - dumps/raw/dev_phn/text
107
+ - text
108
+ - text
109
+ - - dumps/raw/dev_phn/wav.scp
110
+ - speech
111
+ - sound
112
+ - - dumps/raw/dev_phn/utt2lid
113
+ - lids
114
+ - text_int
115
+ allow_variable_data_keys: false
116
+ max_cache_size: 0.0
117
+ max_cache_fd: 32
118
+ valid_max_cache_size: null
119
+ exclude_weight_decay: false
120
+ exclude_weight_decay_conf: {}
121
+ optim: adamw
122
+ optim_conf:
123
+ lr: 1.0e-05
124
+ betas:
125
+ - 0.8
126
+ - 0.99
127
+ eps: 1.0e-09
128
+ weight_decay: 0.0
129
+ scheduler: exponentiallr
130
+ scheduler_conf:
131
+ gamma: 0.999875
132
+ optim2: adamw
133
+ optim2_conf:
134
+ lr: 1.0e-05
135
+ betas:
136
+ - 0.8
137
+ - 0.99
138
+ eps: 1.0e-09
139
+ weight_decay: 0.0
140
+ scheduler2: exponentiallr
141
+ scheduler2_conf:
142
+ gamma: 0.999875
143
+ generator_first: true
144
+ token_list:
145
+ - <blank>
146
+ - <unk>
147
+ - <space>
148
+ - ᅡ
149
+ - ᅵ
150
+ - ᅳ
151
+ - ᄋ
152
+ - ᄀ
153
+ - ᄂ
154
+ - ᅥ
155
+ - ᄅ
156
+ - ᆫ
157
+ - ᅩ
158
+ - ᄃ
159
+ - ᆯ
160
+ - ᄉ
161
+ - ᄆ
162
+ - ᄌ
163
+ - ᅮ
164
+ - .
165
+ - ᄒ
166
+ - ᆮ
167
+ - ᆼ
168
+ - ᅧ
169
+ - ᅢ
170
+ - ᄇ
171
+ - ᅦ
172
+ - ᄊ
173
+ - ᄄ
174
+ - ᆷ
175
+ - ᄁ
176
+ - ᄎ
177
+ - ᄏ
178
+ - ᆨ
179
+ - ᄑ
180
+ - ᄐ
181
+ - ᅪ
182
+ - ᅭ
183
+ - ᅴ
184
+ - ᄍ
185
+ - ᆸ
186
+ - ᅣ
187
+ - ᅬ
188
+ - ᅱ
189
+ - ᄈ
190
+ - ᅯ
191
+ - '?'
192
+ - ᅨ
193
+ - ','
194
+ - ᅲ
195
+ - ᅫ
196
+ - '!'
197
+ - ᅤ
198
+ - ᅰ
199
+ - '~'
200
+ - <sos/eos>
201
+ odim: null
202
+ model_conf: {}
203
+ use_preprocessor: true
204
+ token_type: phn
205
+ bpemodel: null
206
+ non_linguistic_symbols: null
207
+ cleaner: korean_cleaner
208
+ g2p: g2pk_explicit_space
209
+ feats_extract: fbank
210
+ feats_extract_conf:
211
+ n_fft: 1024
212
+ hop_length: 256
213
+ win_length: null
214
+ fs: 22050
215
+ fmin: 0
216
+ fmax: null
217
+ n_mels: 80
218
+ normalize: global_mvn
219
+ normalize_conf:
220
+ stats_file: experiments/tts_stats_raw_phn_none/train/feats_stats.npz
221
+ tts: jets
222
+ tts_conf:
223
+ generator_type: jets_generator
224
+ generator_params:
225
+ langs: 96
226
+ adim: 256
227
+ aheads: 2
228
+ elayers: 4
229
+ eunits: 1024
230
+ dlayers: 4
231
+ dunits: 1024
232
+ positionwise_layer_type: conv1d
233
+ positionwise_conv_kernel_size: 3
234
+ duration_predictor_layers: 2
235
+ duration_predictor_chans: 256
236
+ duration_predictor_kernel_size: 3
237
+ use_masking: true
238
+ encoder_normalize_before: true
239
+ decoder_normalize_before: true
240
+ encoder_type: transformer
241
+ decoder_type: transformer
242
+ conformer_rel_pos_type: latest
243
+ conformer_pos_enc_layer_type: rel_pos
244
+ conformer_self_attn_layer_type: rel_selfattn
245
+ conformer_activation_type: swish
246
+ use_macaron_style_in_conformer: true
247
+ use_cnn_in_conformer: true
248
+ conformer_enc_kernel_size: 7
249
+ conformer_dec_kernel_size: 31
250
+ init_type: xavier_uniform
251
+ transformer_enc_dropout_rate: 0.2
252
+ transformer_enc_positional_dropout_rate: 0.2
253
+ transformer_enc_attn_dropout_rate: 0.2
254
+ transformer_dec_dropout_rate: 0.2
255
+ transformer_dec_positional_dropout_rate: 0.2
256
+ transformer_dec_attn_dropout_rate: 0.2
257
+ pitch_predictor_layers: 5
258
+ pitch_predictor_chans: 256
259
+ pitch_predictor_kernel_size: 5
260
+ pitch_predictor_dropout: 0.5
261
+ pitch_embed_kernel_size: 1
262
+ pitch_embed_dropout: 0.0
263
+ stop_gradient_from_pitch_predictor: true
264
+ energy_predictor_layers: 2
265
+ energy_predictor_chans: 256
266
+ energy_predictor_kernel_size: 3
267
+ energy_predictor_dropout: 0.5
268
+ energy_embed_kernel_size: 1
269
+ energy_embed_dropout: 0.0
270
+ stop_gradient_from_energy_predictor: false
271
+ generator_out_channels: 1
272
+ generator_channels: 512
273
+ generator_global_channels: -1
274
+ generator_kernel_size: 7
275
+ generator_upsample_scales:
276
+ - 8
277
+ - 8
278
+ - 2
279
+ - 2
280
+ generator_upsample_kernel_sizes:
281
+ - 16
282
+ - 16
283
+ - 4
284
+ - 4
285
+ generator_resblock_kernel_sizes:
286
+ - 3
287
+ - 7
288
+ - 11
289
+ generator_resblock_dilations:
290
+ - - 1
291
+ - 3
292
+ - 5
293
+ - - 1
294
+ - 3
295
+ - 5
296
+ - - 1
297
+ - 3
298
+ - 5
299
+ generator_use_additional_convs: true
300
+ generator_bias: true
301
+ generator_nonlinear_activation: LeakyReLU
302
+ generator_nonlinear_activation_params:
303
+ negative_slope: 0.1
304
+ generator_use_weight_norm: true
305
+ segment_size: 64
306
+ idim: 56
307
+ odim: 80
308
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
309
+ discriminator_params:
310
+ scales: 1
311
+ scale_downsample_pooling: AvgPool1d
312
+ scale_downsample_pooling_params:
313
+ kernel_size: 4
314
+ stride: 2
315
+ padding: 2
316
+ scale_discriminator_params:
317
+ in_channels: 1
318
+ out_channels: 1
319
+ kernel_sizes:
320
+ - 15
321
+ - 41
322
+ - 5
323
+ - 3
324
+ channels: 128
325
+ max_downsample_channels: 1024
326
+ max_groups: 16
327
+ bias: true
328
+ downsample_scales:
329
+ - 2
330
+ - 2
331
+ - 4
332
+ - 4
333
+ - 1
334
+ nonlinear_activation: LeakyReLU
335
+ nonlinear_activation_params:
336
+ negative_slope: 0.1
337
+ use_weight_norm: true
338
+ use_spectral_norm: false
339
+ follow_official_norm: false
340
+ periods:
341
+ - 2
342
+ - 3
343
+ - 5
344
+ - 7
345
+ - 11
346
+ period_discriminator_params:
347
+ in_channels: 1
348
+ out_channels: 1
349
+ kernel_sizes:
350
+ - 5
351
+ - 3
352
+ channels: 32
353
+ downsample_scales:
354
+ - 3
355
+ - 3
356
+ - 3
357
+ - 3
358
+ - 1
359
+ max_downsample_channels: 1024
360
+ bias: true
361
+ nonlinear_activation: LeakyReLU
362
+ nonlinear_activation_params:
363
+ negative_slope: 0.1
364
+ use_weight_norm: true
365
+ use_spectral_norm: false
366
+ generator_adv_loss_params:
367
+ average_by_discriminators: false
368
+ loss_type: mse
369
+ discriminator_adv_loss_params:
370
+ average_by_discriminators: false
371
+ loss_type: mse
372
+ feat_match_loss_params:
373
+ average_by_discriminators: false
374
+ average_by_layers: false
375
+ include_final_outputs: true
376
+ mel_loss_params:
377
+ fs: 22050
378
+ n_fft: 1024
379
+ hop_length: 256
380
+ win_length: null
381
+ window: hann
382
+ n_mels: 80
383
+ fmin: 0
384
+ fmax: null
385
+ log_base: null
386
+ lambda_adv: 1.0
387
+ lambda_mel: 45.0
388
+ lambda_feat_match: 2.0
389
+ lambda_var: 1.0
390
+ lambda_align: 2.0
391
+ sampling_rate: 22050
392
+ cache_generator_outputs: true
393
+ pitch_extract: dio
394
+ pitch_extract_conf:
395
+ reduction_factor: 1
396
+ use_token_averaged_f0: false
397
+ fs: 22050
398
+ n_fft: 1024
399
+ hop_length: 256
400
+ f0max: 400
401
+ f0min: 80
402
+ pitch_normalize: global_mvn
403
+ pitch_normalize_conf:
404
+ stats_file: experiments/tts_stats_raw_phn_none/train/pitch_stats.npz
405
+ energy_extract: energy
406
+ energy_extract_conf:
407
+ reduction_factor: 1
408
+ use_token_averaged_energy: false
409
+ fs: 22050
410
+ n_fft: 1024
411
+ hop_length: 256
412
+ win_length: null
413
+ energy_normalize: global_mvn
414
+ energy_normalize_conf:
415
+ stats_file: experiments/tts_stats_raw_phn_none/train/energy_stats.npz
416
+ required:
417
+ - output_dir
418
+ - token_list
419
+ version: '202301'
420
+ distributed: false
experiments/tts_jaewook/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffec6be5944f06e945f461f2cbc0ac7358a31bcd2718412a3f01d984728fc78b
3
+ size 333767147
experiments/tts_stats_raw_phn_none/train/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae615a5bf81d1ccdbd7577dea026a5e6fcdef3f67cc390c6cf2896979054b09e
3
+ size 770
experiments/tts_stats_raw_phn_none/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fc6e274870e5dc3fd137ab378d707332b8429fd4dc09a14ee9e7f88dd5f3584
3
+ size 1402
experiments/tts_stats_raw_phn_none/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17a62c752d615f82e2f0182696c4c22c9f1a57226035e06ada5490f2a3c0d4ef
3
+ size 770
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202301'
2
+ files:
3
+ model_file: experiments/tts_jaewook/model.pth
4
+ python: 3.10.9 (main, Dec 29 2022, 07:07:21) [GCC 11.3.0]
5
+ timestamp: 1680614714.734043
6
+ torch: 1.13.1+cu117
7
+ yaml_files:
8
+ train_config: experiments/tts_jaewook/config.yaml