yangwang825 committed
Commit 0a4a54d · 1 Parent(s): 76fcc9f

Upload 3 files

Files changed (3)
  1. embedding_model.ckpt +3 -0
  2. hyperparams.yaml +199 -0
  3. label_encoder.txt +0 -0
embedding_model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e5458351fcb867dd9fbb60822c91d302dcc2512e9a3be03f714668c3c026ffae
+ size 26960925
hyperparams.yaml ADDED
@@ -0,0 +1,199 @@
+ # Generated 2022-12-15 from:
+ # /home/pcp22wc/exps/speaker-recognition/hparams/train_etdnn.yaml
+ # yamllint disable
+ # ################################
+ # Model: Speaker identification with ECAPA
+ # Authors: Hwidong Na & Mirco Ravanelli
+ # ################################
+
+ # Basic parameters
+ seed: 914
+ __set_seed: !apply:torch.manual_seed [914]
+ output_folder: results/etdnn_augment/914
+ save_folder: results/etdnn_augment/914/save
+ train_log: results/etdnn_augment/914/train_log.txt
+
+ # Data files
+ data_folder: /fastdata/pcp22wc/audio/VoxCeleb2/dev, /fastdata/pcp22wc/audio/VoxCeleb1/test # e.g. /path/to/Voxceleb
+ train_annotation: save/train.csv
+ valid_annotation: save/dev.csv
+
+ # Folder to extract data augmentation files
+ rir_folder: /fastdata/pcp22wc/audio # Change it if needed
+ musan_folder: /fastdata/pcp22wc/audio/musan
+ music_csv: save/music.csv
+ noise_csv: save/noise.csv
+ speech_csv: save/speech.csv
+
+ # Use the following links for the official voxceleb splits:
+ # VoxCeleb1 (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
+ # VoxCeleb1-H (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt
+ # VoxCeleb1-E (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt.
+ # VoxCeleb1-E and VoxCeleb1-H lists are drawn from the VoxCeleb1 training set.
+ # Therefore you cannot use any files in VoxCeleb1 for training if you are using these lists for testing.
+ verification_file: https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
+
+ skip_prep: true
+ ckpt_interval_minutes: 15 # save checkpoint every N min
+
+ # Training parameters
+ number_of_epochs: 40
+ batch_size: 512
+ lr: 0.001
+ lr_final: 0.0001
+ step_size: 65000
+ sample_rate: 16000
+ sentence_len: 3.0 # seconds
+ shuffle: true
+ random_chunk: true
+
+ # Feature parameters
+ n_mels: 80
+ deltas: false
+
+ # Number of speakers
+ out_n_neurons: 5994 #1211 for vox1 # 5994 for vox2, 7205 for vox1+vox2
+
+ dataloader_options:
+     batch_size: 512
+     shuffle: true
+     num_workers: 8
+
+ # Functions
+ compute_features: &id007 !new:speechbrain.lobes.features.Fbank
+     # augment_wavedrop: !ref <augment_wavedrop>
+     # augment_speed: !ref <augment_speed>
+     n_mels: 80
+     deltas: false
+
+ embedding_model: &id008 !new:speechbrain.lobes.models.Xvector.Xvector
+     in_channels: 80
+     activation: !name:torch.nn.LeakyReLU
+     tdnn_blocks: 10
+     tdnn_channels: [512, 512, 512, 512, 512, 512, 512, 512, 512, 1500]
+     tdnn_kernel_sizes: [5, 1, 3, 1, 3, 1, 5, 1, 1, 1]
+     tdnn_dilations: [2, 1, 1, 1, 1, 1, 2, 1, 1, 1]
+     lin_neurons: 512
+
+ classifier: &id009 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
+     input_size: 512
+     out_neurons: 5994
+
+ epoch_counter: &id011 !new:speechbrain.utils.epoch_loop.EpochCounter
+     limit: 40
+
+
+ augment_wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
+     sample_rate: 16000
+     speeds: [100]
+
+ augment_speed: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
+     sample_rate: 16000
+     speeds: [95, 100, 105]
+
+ add_rev: &id001 !new:speechbrain.lobes.augment.EnvCorrupt
+     openrir_folder: /fastdata/pcp22wc/audio
+     openrir_max_noise_len: 3.0 # seconds
+     reverb_prob: 1.0
+     noise_prob: 0.0
+     noise_snr_low: 0
+     noise_snr_high: 15
+     rir_scale_factor: 1.0
+
+ add_noise: &id002 !new:speechbrain.lobes.augment.EnvCorrupt
+     openrir_folder: /fastdata/pcp22wc/audio
+     openrir_max_noise_len: 3.0 # seconds
+     reverb_prob: 0.0
+     noise_prob: 1.0
+     noise_snr_low: 0
+     noise_snr_high: 15
+     rir_scale_factor: 1.0
+
+ add_rev_noise: &id003 !new:speechbrain.lobes.augment.EnvCorrupt
+     openrir_folder: /fastdata/pcp22wc/audio
+     openrir_max_noise_len: 3.0 # seconds
+     reverb_prob: 1.0
+     noise_prob: 1.0
+     noise_snr_low: 0
+     noise_snr_high: 15
+     rir_scale_factor: 1.0
+
+ add_noise_musan: &id004 !new:speechbrain.lobes.augment.EnvCorrupt
+     noise_csv: save/noise.csv
+     babble_prob: 0.0
+     reverb_prob: 0.0
+     noise_prob: 1.0
+     noise_snr_low: 0
+     noise_snr_high: 15
+
+ add_music_musan: &id005 !new:speechbrain.lobes.augment.EnvCorrupt
+     noise_csv: save/music.csv
+     babble_prob: 0.0
+     reverb_prob: 0.0
+     noise_prob: 1.0
+     noise_snr_low: 0
+     noise_snr_high: 15
+
+ add_speech_musan: &id006 !new:speechbrain.lobes.augment.EnvCorrupt
+     noise_csv: save/speech.csv
+     babble_prob: 0.0
+     reverb_prob: 0.0
+     noise_prob: 1.0
+     noise_snr_low: 0
+     noise_snr_high: 15
+
+ # Definition of the augmentation pipeline.
+ # If concat_augment = False, the augmentation techniques are applied
+ # in sequence. If concat_augment = True, all the augmented signals
+ # are concatenated in a single big batch.
+
+ augment_pipeline: [*id001, *id002, *id003, *id004, *id005, *id006]
+ concat_augment: true
+
+ mean_var_norm: &id010 !new:speechbrain.processing.features.InputNormalization
+
+     norm_type: sentence
+     std_norm: false
+
+ modules:
+     compute_features: *id007
+     add_rev: *id001
+     add_noise: *id002
+     add_rev_noise: *id003
+     add_noise_musan: *id004
+     add_music_musan: *id005
+     add_speech_musan: *id006
+     embedding_model: *id008
+     classifier: *id009
+     mean_var_norm: *id010
+ compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
+     loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
+         margin: 0.2
+         scale: 30
+
+ # compute_error: !name:speechbrain.nnet.losses.classification_error
+
+ opt_class: !name:torch.optim.Adam
+     lr: 0.001
+     weight_decay: 0.000002
+
+ lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
+     initial_value: 0.001
+     final_value: 0.0001
+     epoch_count: 40
+
+ # Logging + checkpoints
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+     save_file: results/etdnn_augment/914/train_log.txt
+
+ error_stats: !name:speechbrain.utils.metric_stats.MetricStats
+     metric: !name:speechbrain.nnet.losses.classification_error
+         reduction: batch
+
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+     checkpoints_dir: results/etdnn_augment/914/save
+     recoverables:
+         embedding_model: *id008
+         classifier: *id009
+         normalizer: *id010
+         counter: *id011
label_encoder.txt ADDED
The diff for this file is too large to render. See raw diff
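
The hyperparams.yaml above fully specifies the embedding network: an x-vector style speechbrain.lobes.models.Xvector.Xvector with 10 TDNN blocks, 80-band filterbank input, and 512-dimensional embeddings, trained on VoxCeleb with an additive angular margin loss. A minimal Python sketch of how the uploaded embedding_model.ckpt could be rebuilt and queried for embeddings follows; it assumes the checkpoint is a plain state_dict as saved by the SpeechBrain checkpointer, and it omits the sentence-level mean_var_norm (InputNormalization) step applied during training.

    # Minimal sketch (assumptions noted in comments): rebuild the embedding
    # model exactly as defined in hyperparams.yaml and extract an embedding.
    import torch
    from speechbrain.lobes.features import Fbank
    from speechbrain.lobes.models.Xvector import Xvector

    # Same feature and model settings as hyperparams.yaml
    compute_features = Fbank(n_mels=80, deltas=False)
    embedding_model = Xvector(
        in_channels=80,
        activation=torch.nn.LeakyReLU,
        tdnn_blocks=10,
        tdnn_channels=[512, 512, 512, 512, 512, 512, 512, 512, 512, 1500],
        tdnn_kernel_sizes=[5, 1, 3, 1, 3, 1, 5, 1, 1, 1],
        tdnn_dilations=[2, 1, 1, 1, 1, 1, 2, 1, 1, 1],
        lin_neurons=512,
    )

    # Assumption: embedding_model.ckpt holds a plain state_dict; adjust the
    # loading step if the checkpoint layout differs.
    state_dict = torch.load("embedding_model.ckpt", map_location="cpu")
    embedding_model.load_state_dict(state_dict)
    embedding_model.eval()

    # Extract a 512-dimensional speaker embedding from a 16 kHz waveform.
    signal = torch.randn(1, 16000)           # placeholder 1-second waveform
    feats = compute_features(signal)          # (batch, time, 80) log Mel filterbanks
    with torch.no_grad():
        embedding = embedding_model(feats)    # (batch, 1, 512)
    print(embedding.shape)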