Spaces:
Paused
Paused
{
    "model_type": "AutoregressiveTransformer",
    "dataset": [
        "emilia",
    ],
    "preprocess": {
        "hop_size": 320,
        "sample_rate": 16000, // HuBERT and WavLM both use a 16000 Hz sample rate
        "n_fft": 1920,
        "num_mels": 128,
        "win_size": 1920,
        "fmin": 0,
        "fmax": 12000,
        "mel_var": 8.14,
        "mel_mean": -4.92,
        "processed_dir": "",
        "valid_file": "valid.json",
        "train_file": "train.json",
        "min_dur": 3,
        "max_dur": 30,
        "load_phone": true,
    },
    "model": {
        "autoregressive_transformer": {
            "input_vocab_size": 1056,
            "output_vocab_size": 8192,
            "hidden_size": 1536,
            "intermediate_size": 6144,
            "num_hidden_layers": 12,
            "num_attention_heads": 16,
            "use_global_style_encoder": false
        },
        "train_both_vc_and_tts": true,
        "vc_input_token_type": "hubert_vevo_codec",
        "vc_input_vocab_size": 32,
        "tts_input_token_type": "g2p",
        "tts_input_vocab_size": 1024,
        "output_token_type": "hubert_codec",
        "representation_stat_mean_var_path": "./Amphion/models/vc/vevo/config/hubert_large_l18_mean_std.npz",
        "input_repcodec": {
            "config_path": "./Amphion/models/vc/vevo/config/hubert_large_l18_c32.yaml",
        },
        "output_repcodec": {
            "codebook_size": 8192, // VQ Codebook Size
            "hidden_size": 1024, // Representations Dim
            "codebook_dim": 8,
            "vocos_dim": 384,
            "vocos_intermediate_dim": 2048,
            "vocos_num_layers": 12,
        }
    },
}