Dongchao
/

UniAudio

Model card | Files and versions | Community

Dongchao committed on Oct 3, 2023

Commit

9552794

1 Parent(s): 229c19a

Upload 7 files

Browse files

Files changed (7) hide show

codec/tts_model/ckpt_01215000.pth +3 -0
codec/tts_model/config.yaml +158 -0
codec/tts_model/model.pth +3 -0
codec/universal_model/ckpt_01455000.pth +3 -0
codec/universal_model/config.yaml +157 -0
codec/universal_model/model.pth +3 -0
hubert_base_ls960.pt +3 -0

codec/tts_model/ckpt_01215000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e8c35c42dc3cb9960a1d00e06956b17e5506829df4b132b5b11a55e2e14f116
+size 276692277

codec/tts_model/config.yaml ADDED Viewed

	@@ -0,0 +1,158 @@

+generator:
+  name: SoundStream
+  config:
+    n_filters: 32
+    D: 128
+    target_bandwidths:
+    - 0.5
+    - 1
+    - 1.5
+    - 2
+    - 4
+    - 6
+    ratios:
+    - 8
+    - 5
+    - 4
+    - 2
+    sample_rate: 16000
+    bins: 1024
+d_list:
+- mfd
+mfd:
+  name: MultiFrequencyDiscriminator
+  config:
+    hop_lengths:
+    - 32
+    - 64
+    - 128
+    - 256
+    - 512
+    - 1024
+    hidden_channels:
+    - 64
+    - 128
+    - 256
+    - 512
+    - 512
+    - 512
+    domain: double
+    mel_scale: true
+    sample_rate: 16000
+mpd:
+  name: MultiPeriodDiscriminator
+  config:
+    period_sizes:
+    - 2
+    - 3
+    - 5
+    - 7
+    - 11
+    period_kernel_size: 5
+msd:
+  name: MultiScaleDiscriminator
+  config:
+    num_scales: 3
+    pool_kernel_size: 4
+    pool_stride: 2
+optimizer:
+  g:
+    name: AdamW
+    config:
+      lr: 0.0002
+      betas:
+      - 0.8
+      - 0.99
+      eps: 1.0e-06
+  d:
+    name: AdamW
+    config:
+      lr: 0.0002
+      betas:
+      - 0.8
+      - 0.99
+      eps: 1.0e-06
+lr_scheduler:
+  g:
+    name: ExponentialLR
+    config:
+      gamma: 0.999
+  d:
+    name: ExponentialLR
+    config:
+      gamma: 0.999
+criterion:
+  g_criterion:
+    name: losses.generator_loss.GeneratorSTFTLoss
+    config:
+      use_mel_loss: false
+      adv_criterion: MSEGLoss
+      mel_loss_weight: 45
+      use_feature_match: true
+      feat_match_loss_weight: 20
+      use_full_stft_loss: true
+      use_sub_stft_loss: true
+      full_stft_loss_weight: 1
+      sub_stft_loss_weight: 1
+      mel_scale_loss:
+        sampling_rate: 16000
+        n_fft: 1024
+        num_mels: 80
+        hop_size: 160
+        win_size: 800
+        fmin: 0
+      full_multi_scale_stft_loss:
+        fft_sizes:
+        - 512
+        - 1024
+        - 2048
+        win_sizes:
+        - 480
+        - 960
+        - 1200
+        hop_sizes:
+        - 120
+        - 240
+        - 300
+      sub_multi_scale_stft_loss:
+        num_bands: 6
+        fft_sizes:
+        - 128
+        - 256
+        - 256
+        win_sizes:
+        - 80
+        - 120
+        - 200
+        hop_sizes:
+        - 20
+        - 40
+        - 50
+  d_criterion:
+    name: losses.discriminator_loss.MSEDiscriminatorLoss
+    config: null
+  commit_loss_weight: 1.0
+training_file: /apdcephfs_cq2/share_1297902/speech_user/shaunxliu/data/codec_data_24k/train_valid_lists/train.lst
+validation_file: /apdcephfs_cq2/share_1297902/speech_user/shaunxliu/data/codec_data_24k/train_valid_lists/valid_256.lst
+seed: 2333
+cudnn_deterministic: false
+tensorboard: true
+checkpoint_interval: 5000
+summary_interval: 100
+validation_interval: 5000
+num_epoches: 5000
+print_freq: 10
+discriminator_iter_start: 0
+num_ckpt_keep: 10
+segment_size: 48000
+audio_norm_scale: 0.95
+batch_size: 8
+num_workers: 8
+num_plots: 8
+local_rank: 0
+basic_model_config: config/encodec_16k_6kbps_v3_vqdp.yaml
+exp_model_config: null
+log_dir: exp_log/encodec_16k_6kbps_v3_vqdp_1disc
+ngpus_per_node: 8
+sample_rate: 16000
+model_ckpt_dir: exp_log/encodec_16k_6kbps_v3_vqdp_1disc/model_ckpts

codec/tts_model/model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e8c35c42dc3cb9960a1d00e06956b17e5506829df4b132b5b11a55e2e14f116
+size 276692277

codec/universal_model/ckpt_01455000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a497cc8ef0e0819c23e9aaf7fd15d0b3bd7bb28817818f51c03cf591ca29e25
+size 291880869

codec/universal_model/config.yaml ADDED Viewed

	@@ -0,0 +1,157 @@

+generator:
+  name: SoundStream
+  config:
+    n_filters: 32
+    D: 256
+    target_bandwidths:
+    - 0.5
+    - 1
+    - 1.5
+    - 2
+    - 4
+    ratios:
+    - 8
+    - 5
+    - 4
+    - 2
+    sample_rate: 16000
+    bins: 1024
+d_list:
+- mfd
+mfd:
+  name: MultiFrequencyDiscriminator
+  config:
+    hop_lengths:
+    - 32
+    - 64
+    - 128
+    - 256
+    - 512
+    - 1024
+    hidden_channels:
+    - 64
+    - 128
+    - 256
+    - 512
+    - 512
+    - 512
+    domain: double
+    mel_scale: true
+    sample_rate: 16000
+mpd:
+  name: MultiPeriodDiscriminator
+  config:
+    period_sizes:
+    - 2
+    - 3
+    - 5
+    - 7
+    - 11
+    period_kernel_size: 5
+msd:
+  name: MultiScaleDiscriminator
+  config:
+    num_scales: 3
+    pool_kernel_size: 4
+    pool_stride: 2
+optimizer:
+  g:
+    name: AdamW
+    config:
+      lr: 0.0002
+      betas:
+      - 0.8
+      - 0.99
+      eps: 1.0e-06
+  d:
+    name: AdamW
+    config:
+      lr: 0.0002
+      betas:
+      - 0.8
+      - 0.99
+      eps: 1.0e-06
+lr_scheduler:
+  g:
+    name: ExponentialLR
+    config:
+      gamma: 0.999
+  d:
+    name: ExponentialLR
+    config:
+      gamma: 0.999
+criterion:
+  g_criterion:
+    name: losses.generator_loss.GeneratorSTFTLoss
+    config:
+      use_mel_loss: false
+      adv_criterion: MSEGLoss
+      mel_loss_weight: 45
+      use_feature_match: true
+      feat_match_loss_weight: 20
+      use_full_stft_loss: true
+      use_sub_stft_loss: true
+      full_stft_loss_weight: 1
+      sub_stft_loss_weight: 1
+      mel_scale_loss:
+        sampling_rate: 16000
+        n_fft: 1024
+        num_mels: 80
+        hop_size: 160
+        win_size: 800
+        fmin: 0
+      full_multi_scale_stft_loss:
+        fft_sizes:
+        - 512
+        - 1024
+        - 2048
+        win_sizes:
+        - 480
+        - 960
+        - 1200
+        hop_sizes:
+        - 120
+        - 240
+        - 300
+      sub_multi_scale_stft_loss:
+        num_bands: 6
+        fft_sizes:
+        - 128
+        - 256
+        - 256
+        win_sizes:
+        - 80
+        - 120
+        - 200
+        hop_sizes:
+        - 20
+        - 40
+        - 50
+  d_criterion:
+    name: losses.discriminator_loss.MSEDiscriminatorLoss
+    config: null
+  commit_loss_weight: 1.0
+training_file: /apdcephfs_cq2/share_1297902/speech_user/shaunxliu/dongchao/audio_encodec/group_vqvae_16k_res2/big_data/train.lst
+validation_file: /apdcephfs_cq2/share_1297902/speech_user/shaunxliu/dongchao/audio_encodec/group_vqvae_16k_res2/big_data/val.lst
+seed: 2333
+cudnn_deterministic: false
+tensorboard: true
+checkpoint_interval: 5000
+summary_interval: 100
+validation_interval: 5000
+num_epoches: 5000
+print_freq: 10
+discriminator_iter_start: 0
+num_ckpt_keep: 10
+segment_size: 32000
+audio_norm_scale: 0.95
+batch_size: 8
+num_workers: 8
+num_plots: 8
+local_rank: 1
+basic_model_config: config/encodec_16k_6kbps_v3_vqdp.yaml
+exp_model_config: null
+log_dir: /apdcephfs_cq2/share_1297902/speech_user/shaunxliu/dongchao/code/SoundStream2/log/2022-11-14-19-48/exp/encodec_16k_6kbps_v3_vqdp_1disc
+ngpus_per_node: 8
+sample_rate: 16000
+model_ckpt_dir: /apdcephfs_cq2/share_1297902/speech_user/shaunxliu/dongchao/code/SoundStream2/log/2022-11-14-19-48/exp/encodec_16k_6kbps_v3_vqdp_1disc/model_ckpts

codec/universal_model/model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a497cc8ef0e0819c23e9aaf7fd15d0b3bd7bb28817818f51c03cf591ca29e25
+size 291880869

hubert_base_ls960.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1703cf8d2cdc76f8c046f5f6a9bcd224e0e6caf4744cad1a1f4199c32cac8c8d
+size 1136468879