Spaces:
Paused
Paused
File size: 1,720 Bytes
d66c48f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
{
// Configuration for the "AutoregressiveTransformer" model type.
// This file uses `//` comments and trailing commas, so it must be read with a
// JSONC/JSON5-aware loader; a strict JSON parser will reject it.
"model_type": "AutoregressiveTransformer",
// Dataset names; only "emilia" is listed here.
"dataset": [
"emilia",
],
// Audio and feature-extraction settings.
"preprocess": {
// presumably measured in samples (320 / 16000 = 20 ms per frame) - TODO confirm
"hop_size": 320,
"sample_rate": 16000, // HuBERT and WavLM both use a 16000 Hz sample rate
"n_fft": 1920,
"num_mels": 128,
"win_size": 1920, // same value as n_fft
"fmin": 0,
// NOTE(review): fmax (12000) is above sample_rate / 2 (8000) - confirm these mel
// settings are intended for this sample rate.
"fmax": 12000,
// presumably statistics used to normalize mel features - verify against the consumer
"mel_var": 8.14,
"mel_mean": -4.92,
// Empty in this file; presumably supplied or overridden elsewhere - verify.
"processed_dir": "",
"valid_file": "valid.json",
"train_file": "train.json",
// presumably utterance duration bounds in seconds - TODO confirm
"min_dur": 3,
"max_dur": 30,
"load_phone": true,
},
// Model settings: transformer dimensions, token types, and codec definitions.
"model": {
"autoregressive_transformer": {
// Equals tts_input_vocab_size + vc_input_vocab_size below (1024 + 32).
"input_vocab_size": 1056,
// Same value as output_repcodec.codebook_size below.
"output_vocab_size": 8192,
"hidden_size": 1536,
"intermediate_size": 6144,
"num_hidden_layers": 12,
"num_attention_heads": 16,
"use_global_style_encoder": false
},
"train_both_vc_and_tts": true,
"vc_input_token_type": "hubert_vevo_codec",
// NOTE(review): presumably matches the "c32" codec config referenced by
// input_repcodec.config_path - confirm.
"vc_input_vocab_size": 32,
"tts_input_token_type": "g2p",
"tts_input_vocab_size": 1024,
"output_token_type": "hubert_codec",
// Relative path ("./Amphion/..."); presumably resolved against the working
// directory - verify against the loader.
"representation_stat_mean_var_path": "./Amphion/models/vc/vevo/config/hubert_large_l18_mean_std.npz",
"input_repcodec": {
// Relative path, like representation_stat_mean_var_path above.
"config_path": "./Amphion/models/vc/vevo/config/hubert_large_l18_c32.yaml",
},
"output_repcodec": {
"codebook_size": 8192, // VQ Codebook Size
"hidden_size": 1024, // Representations Dim
"codebook_dim": 8,
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 12,
}
},
} |