saeki
committed on
Commit
·
699157c
1
Parent(s):
a6efb81
fix
Browse files- configs/test/melspec/audio_effect_transfer.yaml +28 -0
- configs/test/melspec/dual.yaml +12 -0
- configs/test/melspec/pretrain_jvs.yaml +47 -0
- configs/test/melspec/ssl_jsut.yaml +55 -0
- configs/test/melspec/ssl_tono.yaml +54 -0
- configs/test/vocfeats/audio_effect_transfer.yaml +28 -0
- configs/test/vocfeats/dual.yaml +12 -0
- configs/test/vocfeats/pretrain_jvs.yaml +49 -0
- configs/test/vocfeats/ssl_jsut.yaml +57 -0
- configs/test/vocfeats/ssl_tono.yaml +56 -0
- configs/train/melspec/dual.yaml +13 -0
- configs/train/melspec/pretrain_jvs.yaml +47 -0
- configs/train/melspec/ssl_jsut.yaml +54 -0
- configs/train/melspec/ssl_tono.yaml +54 -0
- configs/train/vocfeats/dual.yaml +12 -0
- configs/train/vocfeats/pretrain_jvs.yaml +49 -0
- configs/train/vocfeats/ssl_jsut.yaml +56 -0
- configs/train/vocfeats/ssl_tono.yaml +56 -0
configs/test/melspec/audio_effect_transfer.yaml
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
preprocessed_path: "./preprocessed/audio_effect_transfer"
|
3 |
+
output_path: "./output/melspec/audio_effect_transfer"
|
4 |
+
feature_type: "melspec"
|
5 |
+
source:
|
6 |
+
dataset_path: "./data/tono"
|
7 |
+
config_path: "./configs/test/melspec/ssl_tono.yaml"
|
8 |
+
ckpt_path: "./ckpts_tono/tono_melspec_multi_nopre_0217.ckpt"
|
9 |
+
target:
|
10 |
+
dataset_path: "./data/jvs_22k-low"
|
11 |
+
config_path: "./configs/test/melspec/pretrain_jvs.yaml"
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
sampling_rate: 22050
|
16 |
+
segment_length: -1
|
17 |
+
frame_shift: 256
|
18 |
+
|
19 |
+
model: null
|
20 |
+
|
21 |
+
train:
|
22 |
+
epoch: 100
|
23 |
+
batchsize: 8
|
24 |
+
multi_gpu_mode: False
|
25 |
+
num_workers: 4
|
26 |
+
learning_rate: 0.001
|
27 |
+
grad_clip_thresh: 1.0
|
28 |
+
logger_step: 1000
|
configs/test/melspec/dual.yaml
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "pretrain"
|
3 |
+
corpus_type: "multi-unseen"
|
4 |
+
source_path: "./data/jvs_22k"
|
5 |
+
aux_path: null
|
6 |
+
preprocessed_path: "./preprocessed/dual"
|
7 |
+
preprocess:
|
8 |
+
n_train: 90
|
9 |
+
n_val: 5
|
10 |
+
n_test: 5
|
11 |
+
sampling_rate: 22050
|
12 |
+
segment_length: 2
|
configs/test/melspec/pretrain_jvs.yaml
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "pretrain"
|
3 |
+
corpus_type: "multi-unseen" # (single, multi-seen, multi-unseen)
|
4 |
+
source_path: "./data/jvs_22k-low"
|
5 |
+
aux_path: "./data/jvs_22k"
|
6 |
+
preprocessed_path: "./preprocessed/jvs"
|
7 |
+
output_path: "./output/melspec/pretrain"
|
8 |
+
test_wav_path: null
|
9 |
+
feature_type: "melspec"
|
10 |
+
hifigan_path: "./hifigan/hifigan_melspec_universal"
|
11 |
+
power_norm: True
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
n_train: 90
|
16 |
+
n_val: 5
|
17 |
+
n_test: 5
|
18 |
+
sampling_rate: 22050
|
19 |
+
frame_length: 1024
|
20 |
+
frame_shift: 256
|
21 |
+
fft_length: 1024
|
22 |
+
fmin: 0
|
23 |
+
fmax: 8000
|
24 |
+
n_mels: 80
|
25 |
+
comp_factor: 1.0
|
26 |
+
min_magnitude: 0.00001
|
27 |
+
max_wav_value: 32768.0
|
28 |
+
segment_length: -1
|
29 |
+
|
30 |
+
train:
|
31 |
+
batchsize: 8
|
32 |
+
epoch: 50
|
33 |
+
alpha: 0.1
|
34 |
+
augment: True
|
35 |
+
multi_gpu_mode: False
|
36 |
+
num_workers: 4
|
37 |
+
learning_rate: 0.005
|
38 |
+
grad_clip_thresh: 1.0
|
39 |
+
logger_step: 1000
|
40 |
+
load_pretrained: False
|
41 |
+
pretrained_path: null
|
42 |
+
early_stopping: False
|
43 |
+
multi_scale_loss:
|
44 |
+
use_linear: False
|
45 |
+
gamma: 1.0
|
46 |
+
feature_loss:
|
47 |
+
type: "mae"
|
configs/test/melspec/ssl_jsut.yaml
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "ssl"
|
3 |
+
corpus_type: "single" # (single, multi-seen, multi-unseen)
|
4 |
+
source_path: "./data/jsut_22k-low"
|
5 |
+
aux_path: "./data/jsut_22k"
|
6 |
+
preprocessed_path: "./preprocessed/jsut-low"
|
7 |
+
output_path: "./output/melspec/jsut-low"
|
8 |
+
test_wav_path: null
|
9 |
+
feature_type: "melspec"
|
10 |
+
hifigan_path: "./hifigan/hifigan_melspec_universal"
|
11 |
+
power_norm: True
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
n_train: 4950
|
16 |
+
n_val: 25
|
17 |
+
n_test: 25
|
18 |
+
sampling_rate: 22050
|
19 |
+
frame_length: 1024
|
20 |
+
frame_shift: 256
|
21 |
+
fft_length: 1024
|
22 |
+
fmin: 0
|
23 |
+
fmax: 8000
|
24 |
+
n_mels: 80
|
25 |
+
comp_factor: 1.0
|
26 |
+
min_magnitude: 0.00001
|
27 |
+
bitrate: "16k"
|
28 |
+
max_wav_value: 32768.0
|
29 |
+
segment_length: -1
|
30 |
+
|
31 |
+
train:
|
32 |
+
batchsize: 1
|
33 |
+
epoch: 50
|
34 |
+
epoch_channel: 25
|
35 |
+
multi_gpu_mode: False
|
36 |
+
num_workers: 4
|
37 |
+
learning_rate: 0.001
|
38 |
+
alpha: 0.1
|
39 |
+
beta: 0.1
|
40 |
+
augment: False
|
41 |
+
grad_clip_thresh: 1.0
|
42 |
+
logger_step: 1000
|
43 |
+
load_pretrained: False
|
44 |
+
pretrained_path: null
|
45 |
+
fix_channel: False
|
46 |
+
early_stopping: False
|
47 |
+
multi_scale_loss:
|
48 |
+
use_linear: True
|
49 |
+
gamma: 1.0
|
50 |
+
feature_loss:
|
51 |
+
type: "mae"
|
52 |
+
|
53 |
+
dual:
|
54 |
+
enable: True
|
55 |
+
config_path: ./configs/test/melspec/dual.yaml
|
configs/test/melspec/ssl_tono.yaml
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "ssl"
|
3 |
+
corpus_type: "single" # (single, multi-seen, multi-unseen)
|
4 |
+
source_path: "./data/tono_22k"
|
5 |
+
aux_path: null
|
6 |
+
preprocessed_path: "./preprocessed/tono"
|
7 |
+
output_path: "./output/melspec/tono"
|
8 |
+
test_wav_path: null
|
9 |
+
feature_type: "melspec"
|
10 |
+
hifigan_path: "./hifigan/hifigan_melspec_universal"
|
11 |
+
power_norm: True
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
n_train: 270
|
16 |
+
n_val: 34
|
17 |
+
n_test: 30
|
18 |
+
sampling_rate: 22050
|
19 |
+
frame_length: 1024
|
20 |
+
frame_shift: 256
|
21 |
+
fft_length: 1024
|
22 |
+
fmin: 0
|
23 |
+
fmax: 8000
|
24 |
+
n_mels: 80
|
25 |
+
comp_factor: 1.0
|
26 |
+
min_magnitude: 0.00001
|
27 |
+
bitrate: "16k"
|
28 |
+
max_wav_value: 32768.0
|
29 |
+
segment_length: -1
|
30 |
+
|
31 |
+
train:
|
32 |
+
batchsize: 4
|
33 |
+
epoch: 50
|
34 |
+
epoch_channel: 25
|
35 |
+
multi_gpu_mode: False
|
36 |
+
num_workers: 4
|
37 |
+
learning_rate: 0.001
|
38 |
+
alpha: 0.1
|
39 |
+
beta: 0.1
|
40 |
+
grad_clip_thresh: 1.0
|
41 |
+
logger_step: 1000
|
42 |
+
load_pretrained: False
|
43 |
+
pretrained_path: null
|
44 |
+
fix_channel: False
|
45 |
+
early_stopping: False
|
46 |
+
multi_scale_loss:
|
47 |
+
use_linear: True
|
48 |
+
gamma: 1.0
|
49 |
+
feature_loss:
|
50 |
+
type: "mae"
|
51 |
+
|
52 |
+
dual:
|
53 |
+
enable: True
|
54 |
+
config_path: ./configs/test/melspec/dual.yaml
|
configs/test/vocfeats/audio_effect_transfer.yaml
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
preprocessed_path: "./preprocessed/audio_effect_transfer"
|
3 |
+
output_path: "./output/vocfeats/audio_effect_transfer"
|
4 |
+
feature_type: "vocfeats"
|
5 |
+
source:
|
6 |
+
dataset_path: "./data/tono"
|
7 |
+
config_path: "./configs/test/melspec/ssl_tono.yaml"
|
8 |
+
ckpt_path: "./ckpts_tono/tono_melspec_multi_nopre_0217.ckpt"
|
9 |
+
target:
|
10 |
+
dataset_path: "./data/jvs_22k-low"
|
11 |
+
config_path: "./configs/test/vocfeats/pretrain_jvs.yaml"
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
sampling_rate: 22050
|
16 |
+
segment_length: -1
|
17 |
+
frame_shift: 256
|
18 |
+
|
19 |
+
model: null
|
20 |
+
|
21 |
+
train:
|
22 |
+
epoch: 100
|
23 |
+
batchsize: 8
|
24 |
+
multi_gpu_mode: False
|
25 |
+
num_workers: 4
|
26 |
+
learning_rate: 0.001
|
27 |
+
grad_clip_thresh: 1.0
|
28 |
+
logger_step: 1000
|
configs/test/vocfeats/dual.yaml
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "pretrain"
|
3 |
+
corpus_type: "multi-unseen"
|
4 |
+
source_path: "./data/jvs_22k"
|
5 |
+
aux_path: null
|
6 |
+
preprocessed_path: "./preprocessed/dual"
|
7 |
+
preprocess:
|
8 |
+
n_train: 90
|
9 |
+
n_val: 5
|
10 |
+
n_test: 5
|
11 |
+
sampling_rate: 22050
|
12 |
+
segment_length: 2
|
configs/test/vocfeats/pretrain_jvs.yaml
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "pretrain"
|
3 |
+
corpus_type: "multi-unseen" # (single, multi-seen, multi-unseen)
|
4 |
+
source_path: "./data/jvs_22k-low"
|
5 |
+
aux_path: "./data/jvs_22k"
|
6 |
+
preprocessed_path: "./preprocessed/jvs"
|
7 |
+
output_path: "./output/vocfeats/pretrain"
|
8 |
+
test_wav_path: null
|
9 |
+
feature_type: "vocfeats"
|
10 |
+
hifigan_path: "./hifigan/hifigan_jvs_40d_600k"
|
11 |
+
power_norm: True
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
n_train: 90
|
16 |
+
n_val: 5
|
17 |
+
n_test: 5
|
18 |
+
sampling_rate: 22050
|
19 |
+
frame_length: 1024
|
20 |
+
frame_shift: 256
|
21 |
+
fft_length: 1024
|
22 |
+
fmin: 0
|
23 |
+
fmax: 8000
|
24 |
+
n_mels: 80
|
25 |
+
cep_order: 40
|
26 |
+
f0_extractor: "dio"
|
27 |
+
comp_factor: 1.0
|
28 |
+
min_magnitude: 0.00001
|
29 |
+
max_wav_value: 32768.0
|
30 |
+
segment_length: -1
|
31 |
+
|
32 |
+
train:
|
33 |
+
batchsize: 8
|
34 |
+
epoch: 50
|
35 |
+
alpha: 0.1
|
36 |
+
augment: True
|
37 |
+
multi_gpu_mode: False
|
38 |
+
num_workers: 4
|
39 |
+
learning_rate: 0.005
|
40 |
+
grad_clip_thresh: 1.0
|
41 |
+
logger_step: 1000
|
42 |
+
load_pretrained: False
|
43 |
+
pretrained_path: null
|
44 |
+
early_stopping: False
|
45 |
+
multi_scale_loss:
|
46 |
+
use_linear: True
|
47 |
+
gamma: 1.0
|
48 |
+
feature_loss:
|
49 |
+
type: "mae"
|
configs/test/vocfeats/ssl_jsut.yaml
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "ssl"
|
3 |
+
corpus_type: "single" # (single, multi-seen, multi-unseen)
|
4 |
+
source_path: "./data/jsut_22k-low"
|
5 |
+
aux_path: "./data/jsut_22k"
|
6 |
+
preprocessed_path: "./preprocessed/jsut-low"
|
7 |
+
output_path: "./output/vocfeats/jsut-low"
|
8 |
+
test_wav_path: null
|
9 |
+
feature_type: "vocfeats"
|
10 |
+
hifigan_path: "./hifigan/hifigan_jvs_40d_600k"
|
11 |
+
power_norm: True
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
n_train: 4950
|
16 |
+
n_val: 25
|
17 |
+
n_test: 25
|
18 |
+
sampling_rate: 22050
|
19 |
+
frame_length: 1024
|
20 |
+
frame_shift: 256
|
21 |
+
fft_length: 1024
|
22 |
+
fmin: 0
|
23 |
+
fmax: 8000
|
24 |
+
n_mels: 80
|
25 |
+
cep_order: 40
|
26 |
+
f0_extractor: "harvest"
|
27 |
+
comp_factor: 1.0
|
28 |
+
min_magnitude: 0.00001
|
29 |
+
bitrate: "16k"
|
30 |
+
max_wav_value: 32768.0
|
31 |
+
segment_length: -1
|
32 |
+
|
33 |
+
train:
|
34 |
+
batchsize: 1
|
35 |
+
epoch: 50
|
36 |
+
epoch_channel: 25
|
37 |
+
multi_gpu_mode: False
|
38 |
+
num_workers: 4
|
39 |
+
learning_rate: 0.001
|
40 |
+
alpha: 0.1
|
41 |
+
beta: 0.1
|
42 |
+
augment: False
|
43 |
+
grad_clip_thresh: 1.0
|
44 |
+
logger_step: 1000
|
45 |
+
load_pretrained: False
|
46 |
+
pretrained_path: null
|
47 |
+
fix_channel: False
|
48 |
+
early_stopping: False
|
49 |
+
multi_scale_loss:
|
50 |
+
use_linear: True
|
51 |
+
gamma: 1.0
|
52 |
+
feature_loss:
|
53 |
+
type: "mae"
|
54 |
+
|
55 |
+
dual:
|
56 |
+
enable: True
|
57 |
+
config_path: ./configs/test/vocfeats/dual.yaml
|
configs/test/vocfeats/ssl_tono.yaml
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "ssl"
|
3 |
+
corpus_type: "single" # (single, multi-seen, multi-unseen)
|
4 |
+
source_path: "./data/tono"
|
5 |
+
aux_path: null
|
6 |
+
preprocessed_path: "./preprocessed/tono-denoise"
|
7 |
+
output_path: "./output/vocfeats/tono-denoise"
|
8 |
+
test_wav_path: null
|
9 |
+
feature_type: "vocfeats"
|
10 |
+
hifigan_path: "./hifigan/hifigan_jvs_40d_600k"
|
11 |
+
power_norm: True
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
n_train: 270
|
16 |
+
n_val: 34
|
17 |
+
n_test: 30
|
18 |
+
sampling_rate: 22050
|
19 |
+
frame_length: 1024
|
20 |
+
frame_shift: 256
|
21 |
+
fft_length: 1024
|
22 |
+
fmin: 0
|
23 |
+
fmax: 8000
|
24 |
+
n_mels: 80
|
25 |
+
cep_order: 40
|
26 |
+
comp_factor: 1.0
|
27 |
+
min_magnitude: 0.00001
|
28 |
+
bitrate: "16k"
|
29 |
+
f0_extractor: "harvest"
|
30 |
+
max_wav_value: 32768.0
|
31 |
+
segment_length: -1
|
32 |
+
|
33 |
+
train:
|
34 |
+
batchsize: 4
|
35 |
+
epoch: 50
|
36 |
+
epoch_channel: 25
|
37 |
+
multi_gpu_mode: False
|
38 |
+
num_workers: 4
|
39 |
+
learning_rate: 0.001
|
40 |
+
alpha: 0.1
|
41 |
+
beta: 0.1
|
42 |
+
grad_clip_thresh: 1.0
|
43 |
+
logger_step: 1000
|
44 |
+
load_pretrained: False
|
45 |
+
pretrained_path: null
|
46 |
+
fix_channel: False
|
47 |
+
early_stopping: False
|
48 |
+
multi_scale_loss:
|
49 |
+
use_linear: True
|
50 |
+
gamma: 1.0
|
51 |
+
feature_loss:
|
52 |
+
type: "mae"
|
53 |
+
|
54 |
+
dual:
|
55 |
+
enable: True
|
56 |
+
config_path: ./configs/test/vocfeats/dual.yaml
|
configs/train/melspec/dual.yaml
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "pretrain"
|
3 |
+
corpus_type: "multi-unseen"
|
4 |
+
source_path: "./data/jvs_22k"
|
5 |
+
aux_path: null
|
6 |
+
preprocessed_path: "./preprocessed/dual"
|
7 |
+
preprocess:
|
8 |
+
n_train: 90
|
9 |
+
n_val: 5
|
10 |
+
n_test: 5
|
11 |
+
sampling_rate: 22050
|
12 |
+
segment_length: 2
|
13 |
+
|
configs/train/melspec/pretrain_jvs.yaml
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "pretrain"
|
3 |
+
corpus_type: "multi-unseen" # (single, multi-seen, multi-unseen)
|
4 |
+
source_path: "./data/jvs_22k-low"
|
5 |
+
aux_path: "./data/jvs_22k"
|
6 |
+
preprocessed_path: "./preprocessed/jvs"
|
7 |
+
output_path: "./output/melspec/pretrain"
|
8 |
+
test_wav_path: null
|
9 |
+
feature_type: "melspec"
|
10 |
+
hifigan_path: "./hifigan/hifigan_melspec_universal"
|
11 |
+
power_norm: True
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
n_train: 90
|
16 |
+
n_val: 5
|
17 |
+
n_test: 5
|
18 |
+
sampling_rate: 22050
|
19 |
+
frame_length: 1024
|
20 |
+
frame_shift: 256
|
21 |
+
fft_length: 1024
|
22 |
+
fmin: 0
|
23 |
+
fmax: 8000
|
24 |
+
n_mels: 80
|
25 |
+
comp_factor: 1.0
|
26 |
+
min_magnitude: 0.00001
|
27 |
+
max_wav_value: 32768.0
|
28 |
+
segment_length: 2
|
29 |
+
|
30 |
+
train:
|
31 |
+
batchsize: 8
|
32 |
+
epoch: 50
|
33 |
+
alpha: 0.1
|
34 |
+
augment: True
|
35 |
+
multi_gpu_mode: False
|
36 |
+
num_workers: 4
|
37 |
+
learning_rate: 0.005
|
38 |
+
grad_clip_thresh: 1.0
|
39 |
+
logger_step: 1000
|
40 |
+
load_pretrained: False
|
41 |
+
pretrained_path: null
|
42 |
+
early_stopping: False
|
43 |
+
multi_scale_loss:
|
44 |
+
use_linear: False
|
45 |
+
gamma: 1.0
|
46 |
+
feature_loss:
|
47 |
+
type: "mae"
|
configs/train/melspec/ssl_jsut.yaml
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "ssl"
|
3 |
+
corpus_type: "single" # (single, multi-seen, multi-unseen)
|
4 |
+
source_path: "./data/jsut_22k-low"
|
5 |
+
aux_path: "./data/jsut_22k"
|
6 |
+
preprocessed_path: "./preprocessed/jsut-low"
|
7 |
+
output_path: "./output/melspec/jsut-low"
|
8 |
+
test_wav_path: null
|
9 |
+
feature_type: "melspec"
|
10 |
+
hifigan_path: "./hifigan/hifigan_melspec_universal"
|
11 |
+
power_norm: True
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
n_train: 4950
|
16 |
+
n_val: 25
|
17 |
+
n_test: 25
|
18 |
+
sampling_rate: 22050
|
19 |
+
frame_length: 1024
|
20 |
+
frame_shift: 256
|
21 |
+
fft_length: 1024
|
22 |
+
fmin: 0
|
23 |
+
fmax: 8000
|
24 |
+
n_mels: 80
|
25 |
+
comp_factor: 1.0
|
26 |
+
min_magnitude: 0.00001
|
27 |
+
bitrate: "16k"
|
28 |
+
max_wav_value: 32768.0
|
29 |
+
segment_length: 2
|
30 |
+
|
31 |
+
train:
|
32 |
+
batchsize: 4
|
33 |
+
epoch: 50
|
34 |
+
epoch_channel: 25
|
35 |
+
multi_gpu_mode: False
|
36 |
+
num_workers: 4
|
37 |
+
learning_rate: 0.001
|
38 |
+
alpha: 0.1
|
39 |
+
beta: 0.1
|
40 |
+
grad_clip_thresh: 1.0
|
41 |
+
logger_step: 1000
|
42 |
+
load_pretrained: True
|
43 |
+
pretrained_path: null
|
44 |
+
fix_channel: False
|
45 |
+
early_stopping: False
|
46 |
+
multi_scale_loss:
|
47 |
+
use_linear: True
|
48 |
+
gamma: 1.0
|
49 |
+
feature_loss:
|
50 |
+
type: "mae"
|
51 |
+
|
52 |
+
dual:
|
53 |
+
enable: True
|
54 |
+
config_path: ./configs/train/melspec/dual.yaml
|
configs/train/melspec/ssl_tono.yaml
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "ssl"
|
3 |
+
corpus_type: "single" # (single, multi-seen, multi-unseen)
|
4 |
+
source_path: "./data/tono"
|
5 |
+
aux_path: null
|
6 |
+
preprocessed_path: "./preprocessed/tono"
|
7 |
+
output_path: "./output/melspec/tono"
|
8 |
+
test_wav_path: null
|
9 |
+
feature_type: "melspec"
|
10 |
+
hifigan_path: "./hifigan/hifigan_melspec_universal"
|
11 |
+
power_norm: True
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
n_train: 270
|
16 |
+
n_val: 34
|
17 |
+
n_test: 30
|
18 |
+
sampling_rate: 22050
|
19 |
+
frame_length: 1024
|
20 |
+
frame_shift: 256
|
21 |
+
fft_length: 1024
|
22 |
+
fmin: 0
|
23 |
+
fmax: 8000
|
24 |
+
n_mels: 80
|
25 |
+
comp_factor: 1.0
|
26 |
+
min_magnitude: 0.00001
|
27 |
+
bitrate: "16k"
|
28 |
+
max_wav_value: 32768.0
|
29 |
+
segment_length: 2
|
30 |
+
|
31 |
+
train:
|
32 |
+
batchsize: 4
|
33 |
+
epoch: 50
|
34 |
+
epoch_channel: 25
|
35 |
+
multi_gpu_mode: False
|
36 |
+
num_workers: 4
|
37 |
+
learning_rate: 0.001
|
38 |
+
alpha: 0.1
|
39 |
+
beta: 0.1
|
40 |
+
grad_clip_thresh: 1.0
|
41 |
+
logger_step: 1000
|
42 |
+
load_pretrained: False
|
43 |
+
pretrained_path: null
|
44 |
+
fix_channel: False
|
45 |
+
early_stopping: False
|
46 |
+
multi_scale_loss:
|
47 |
+
use_linear: True
|
48 |
+
gamma: 1.0
|
49 |
+
feature_loss:
|
50 |
+
type: "mae"
|
51 |
+
|
52 |
+
dual:
|
53 |
+
enable: True
|
54 |
+
config_path: ./configs/train/melspec/dual.yaml
|
configs/train/vocfeats/dual.yaml
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "pretrain"
|
3 |
+
corpus_type: "multi-unseen"
|
4 |
+
source_path: "./data/jvs_22k"
|
5 |
+
aux_path: null
|
6 |
+
preprocessed_path: "./preprocessed/dual"
|
7 |
+
preprocess:
|
8 |
+
n_train: 90
|
9 |
+
n_val: 5
|
10 |
+
n_test: 5
|
11 |
+
sampling_rate: 22050
|
12 |
+
segment_length: 2
|
configs/train/vocfeats/pretrain_jvs.yaml
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "pretrain"
|
3 |
+
corpus_type: "multi-unseen" # (single, multi-seen, multi-unseen)
|
4 |
+
source_path: "./data/jvs_22k-low"
|
5 |
+
aux_path: "./data/jvs_22k"
|
6 |
+
preprocessed_path: "./preprocessed/jvs"
|
7 |
+
output_path: "./output/vocfeats/pretrain"
|
8 |
+
test_wav_path: null
|
9 |
+
feature_type: "vocfeats"
|
10 |
+
hifigan_path: "./hifigan/hifigan_jvs_40d_600k"
|
11 |
+
power_norm: True
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
n_train: 90
|
16 |
+
n_val: 5
|
17 |
+
n_test: 5
|
18 |
+
sampling_rate: 22050
|
19 |
+
frame_length: 1024
|
20 |
+
frame_shift: 256
|
21 |
+
fft_length: 1024
|
22 |
+
fmin: 0
|
23 |
+
fmax: 8000
|
24 |
+
n_mels: 80
|
25 |
+
cep_order: 40
|
26 |
+
f0_extractor: "dio"
|
27 |
+
comp_factor: 1.0
|
28 |
+
min_magnitude: 0.00001
|
29 |
+
max_wav_value: 32768.0
|
30 |
+
segment_length: 2
|
31 |
+
|
32 |
+
train:
|
33 |
+
batchsize: 8
|
34 |
+
epoch: 50
|
35 |
+
alpha: 0.1
|
36 |
+
augment: True
|
37 |
+
multi_gpu_mode: False
|
38 |
+
num_workers: 4
|
39 |
+
learning_rate: 0.005
|
40 |
+
grad_clip_thresh: 1.0
|
41 |
+
logger_step: 1000
|
42 |
+
load_pretrained: False
|
43 |
+
pretrained_path: null
|
44 |
+
early_stopping: False
|
45 |
+
multi_scale_loss:
|
46 |
+
use_linear: True
|
47 |
+
gamma: 1.0
|
48 |
+
feature_loss:
|
49 |
+
type: "mae"
|
configs/train/vocfeats/ssl_jsut.yaml
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "ssl"
|
3 |
+
corpus_type: "single" # (single, multi-seen, multi-unseen)
|
4 |
+
source_path: "./data/jsut_22k-low"
|
5 |
+
aux_path: "./data/jsut_22k"
|
6 |
+
preprocessed_path: "./preprocessed/jsut-low"
|
7 |
+
output_path: "./output/vocfeats/jsut-low"
|
8 |
+
test_wav_path: null
|
9 |
+
feature_type: "vocfeats"
|
10 |
+
hifigan_path: "./hifigan/hifigan_jvs_40d_600k"
|
11 |
+
power_norm: True
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
n_train: 4950
|
16 |
+
n_val: 25
|
17 |
+
n_test: 25
|
18 |
+
sampling_rate: 22050
|
19 |
+
frame_length: 1024
|
20 |
+
frame_shift: 256
|
21 |
+
fft_length: 1024
|
22 |
+
fmin: 0
|
23 |
+
fmax: 8000
|
24 |
+
n_mels: 80
|
25 |
+
cep_order: 40
|
26 |
+
comp_factor: 1.0
|
27 |
+
min_magnitude: 0.00001
|
28 |
+
bitrate: "16k"
|
29 |
+
f0_extractor: "harvest"
|
30 |
+
max_wav_value: 32768.0
|
31 |
+
segment_length: 2
|
32 |
+
|
33 |
+
train:
|
34 |
+
batchsize: 4
|
35 |
+
epoch: 50
|
36 |
+
epoch_channel: 25
|
37 |
+
multi_gpu_mode: False
|
38 |
+
num_workers: 4
|
39 |
+
learning_rate: 0.001
|
40 |
+
alpha: 0.1
|
41 |
+
beta: 0.1
|
42 |
+
grad_clip_thresh: 1.0
|
43 |
+
logger_step: 1000
|
44 |
+
load_pretrained: True
|
45 |
+
pretrained_path: null
|
46 |
+
fix_channel: False
|
47 |
+
early_stopping: False
|
48 |
+
multi_scale_loss:
|
49 |
+
use_linear: True
|
50 |
+
gamma: 1.0
|
51 |
+
feature_loss:
|
52 |
+
type: "mae"
|
53 |
+
|
54 |
+
dual:
|
55 |
+
enable: True
|
56 |
+
config_path: ./configs/train/vocfeats/dual.yaml
|
configs/train/vocfeats/ssl_tono.yaml
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
general:
|
2 |
+
stage: "ssl"
|
3 |
+
corpus_type: "single" # (single, multi-seen, multi-unseen)
|
4 |
+
source_path: "./data/tono"
|
5 |
+
aux_path: null
|
6 |
+
preprocessed_path: "./preprocessed/tono"
|
7 |
+
output_path: "./output/vocfeats/tono"
|
8 |
+
test_wav_path: null
|
9 |
+
feature_type: "vocfeats"
|
10 |
+
hifigan_path: "./hifigan/hifigan_jvs_40d_600k"
|
11 |
+
power_norm: True
|
12 |
+
use_gst: False
|
13 |
+
|
14 |
+
preprocess:
|
15 |
+
n_train: 270
|
16 |
+
n_val: 34
|
17 |
+
n_test: 30
|
18 |
+
sampling_rate: 22050
|
19 |
+
frame_length: 1024
|
20 |
+
frame_shift: 256
|
21 |
+
fft_length: 1024
|
22 |
+
fmin: 0
|
23 |
+
fmax: 8000
|
24 |
+
n_mels: 80
|
25 |
+
cep_order: 40
|
26 |
+
comp_factor: 1.0
|
27 |
+
min_magnitude: 0.00001
|
28 |
+
bitrate: "16k"
|
29 |
+
f0_extractor: "harvest"
|
30 |
+
max_wav_value: 32768.0
|
31 |
+
segment_length: 2
|
32 |
+
|
33 |
+
train:
|
34 |
+
batchsize: 4
|
35 |
+
epoch: 50
|
36 |
+
epoch_channel: 25
|
37 |
+
multi_gpu_mode: False
|
38 |
+
num_workers: 4
|
39 |
+
learning_rate: 0.001
|
40 |
+
alpha: 0.1
|
41 |
+
beta: 0.1
|
42 |
+
grad_clip_thresh: 1.0
|
43 |
+
logger_step: 1000
|
44 |
+
load_pretrained: False
|
45 |
+
pretrained_path: null
|
46 |
+
fix_channel: False
|
47 |
+
early_stopping: False
|
48 |
+
multi_scale_loss:
|
49 |
+
use_linear: True
|
50 |
+
gamma: 1.0
|
51 |
+
feature_loss:
|
52 |
+
type: "mae"
|
53 |
+
|
54 |
+
dual:
|
55 |
+
enable: True
|
56 |
+
config_path: ./configs/train/vocfeats/dual.yaml
|