num_units: 1750
tokenizer: !new:sentencepiece.SentencePieceProcessor
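# (The SentencePiece model file itself is attached to this processor by the
# pretrainer defined at the bottom of this file.)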
# Audio input normalization:
# Yes, this is intentional: the input audio is resampled to 22050 Hz, and the
# features below are then computed with the (mismatched) 16 kHz settings.
audio_normalizer: !new:speechbrain.dataio.preprocess.AudioNormalizer
    sample_rate: 22050
# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 40
# Model parameters
activation: !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 2
cnn_channels: (64, 128)
inter_layer_pooling_size: (2, 2)
cnn_kernelsize: (3, 3)
time_pooling_size: 4
rnn_class: !name:speechbrain.nnet.RNN.LSTM
rnn_layers: 3
rnn_neurons: 512
rnn_bidirectional: True
dnn_blocks: 1
dnn_neurons: 512
emb_size: 128
dec_neurons: 512
dec_layers: 1
output_neurons: !ref <num_units>  # Number of tokens (same as LM)
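# Decoding parameters.
# Note that the blank, bos, eos, and unk tokens all share index 0 here.
# ctc_weight_decode: 0.0 means CTC scores are not mixed into the beam search.
# Only test_beam_size is referenced below; valid_beam_size is presumably used
# by a validation-time searcher in the corresponding training recipe.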
blank_index: 0
bos_index: 0
eos_index: 0
unk_index: 0
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_beam_size: 4
test_beam_size: 12
eos_threshold: 1.2
using_max_attn_shift: False
max_attn_shift: 240
ctc_weight_decode: 0.0
coverage_penalty: 3.0
temperature: 2.0
# Feature extraction
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
# Feature normalization (mean and std)
normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
# The CRDNN model is an encoder that combines CNNs, RNNs, and DNNs.
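# With time_pooling: True, the time axis is downsampled by time_pooling_size
# (here 4) before the recurrent layers, shortening the sequences they process.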
encoder: !new:speechbrain.lobes.models.CRDNN.CRDNN
    input_shape: [null, null, !ref <n_mels>]
    activation: !ref <activation>
    dropout: !ref <dropout>
    cnn_blocks: !ref <cnn_blocks>
    cnn_channels: !ref <cnn_channels>
    cnn_kernelsize: !ref <cnn_kernelsize>
    inter_layer_pooling_size: !ref <inter_layer_pooling_size>
    time_pooling: True
    using_2d_pooling: False
    time_pooling_size: !ref <time_pooling_size>
    rnn_class: !ref <rnn_class>
    rnn_layers: !ref <rnn_layers>
    rnn_neurons: !ref <rnn_neurons>
    rnn_bidirectional: !ref <rnn_bidirectional>
    rnn_re_init: True
    dnn_blocks: !ref <dnn_blocks>
    dnn_neurons: !ref <dnn_neurons>
    use_rnnp: False
# Embedding (from indexes to an embedding space of dimension emb_size).
embedding: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <emb_size>
# Attention-based RNN decoder.
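# attn_type: location selects location-aware attention; the channels and
# kernel_size entries below configure the 1-D convolution applied to the
# previous attention weights.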
decoder: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <dnn_neurons>
    input_size: !ref <emb_size>
    rnn_type: gru
    attn_type: location
    hidden_size: !ref <dec_neurons>
    attn_dim: 1024
    num_layers: !ref <dec_layers>
    scaling: 1.0
    channels: 10
    kernel_size: 100
    re_init: True
    dropout: !ref <dropout>
# Linear transformation on the top of the decoder.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>
# Linear transformation on the top of the encoder.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons>
# Final softmax (for log posteriors computation).
log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True
# Gathering all the submodels in a single model object.
model: !new:torch.nn.ModuleList
    - - !ref <encoder>
      - !ref <embedding>
      - !ref <decoder>
      - !ref <ctc_lin>
      - !ref <seq_lin>
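# The full encoding pipeline: feature extraction -> normalization -> CRDNN
# encoder, with sequence lengths propagated through each step.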
full_encode_step: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalize>
    model: !ref <encoder>
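# Beam search over the attentional decoder, used at test time.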
test_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <embedding>
    decoder: !ref <decoder>
    linear: !ref <seq_lin>
    ctc_linear: !ref <ctc_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
    eos_threshold: !ref <eos_threshold>
    using_max_attn_shift: !ref <using_max_attn_shift>
    max_attn_shift: !ref <max_attn_shift>
    coverage_penalty: !ref <coverage_penalty>
    ctc_weight: !ref <ctc_weight_decode>
    temperature: !ref <temperature>
# Objects in "modules" dict will have their parameters moved to the correct
# device, as well as having train()/eval() called on them by the Brain class
modules:
    encoder: !ref <full_encode_step>
    decoder: !ref <test_search>
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        normalizer: !ref <normalize>
        tokenizer: !ref <tokenizer>
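
# A minimal usage sketch (an assumption, not part of this file): when this is
# the hyperparams.yaml of a SpeechBrain pretrained model directory, it can be
# loaded through the EncoderDecoderASR interface, which expects exactly the
# "modules" (encoder/decoder) and "tokenizer" entries defined above:
#
#     from speechbrain.pretrained import EncoderDecoderASR
#     asr = EncoderDecoderASR.from_hparams(source="path/to/model_dir")
#     print(asr.transcribe_file("example.wav"))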