# ################################
# Model: wav2vec2 + DNN + CTC
# Augmentation: SpecAugment
# Authors:
# Sung-Lin Yeh 2021
# Pooneh Mousavi 2023
# ################################
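
# This file is in SpeechBrain's HyperPyYAML format: !new: instantiates a
# class, !name: binds a class or function without calling it, and
# !ref <key> substitutes the value defined under another key.
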
# BPE parameters
token_type: unigram # ["unigram", "bpe", "char"]
character_coverage: 1.0
# Model parameters
# activation: !name:torch.nn.LeakyReLU
dnn_neurons: 1024
wav2vec_output_dim: 1024
dropout: 0.15
sample_rate: 16000
wav2vec2_hub: facebook/wav2vec2-large-lv60
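
# facebook/wav2vec2-large-lv60 yields 1024-dimensional hidden states (hence
# wav2vec_output_dim: 1024) and, like all wav2vec2 models, expects 16 kHz
# mono audio (hence sample_rate: 16000).
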
# Outputs
output_neurons: 1000 # BPE vocabulary size; blank token is index 0, bos/eos are set below
# Decoding parameters
# Make sure the bos and eos indices match those of the BPE tokenizer
blank_index: 0
bos_index: 1
eos_index: 2
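
# Encoder head on top of the wav2vec2 features: three Linear -> BatchNorm1d ->
# LeakyReLU blocks, with dropout after the first two. The input sizes of the
# !name: layers are inferred by Sequential from input_shape.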
enc: !new:speechbrain.nnet.containers.Sequential
    input_shape: [null, null, !ref <wav2vec_output_dim>]
    linear1: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <dnn_neurons>
        bias: True
    bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
    activation: !new:torch.nn.LeakyReLU
    drop: !new:torch.nn.Dropout
        p: !ref <dropout>
    linear2: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <dnn_neurons>
        bias: True
    bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
    activation2: !new:torch.nn.LeakyReLU
    drop2: !new:torch.nn.Dropout
        p: !ref <dropout>
    linear3: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <dnn_neurons>
        bias: True
    bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
    activation3: !new:torch.nn.LeakyReLU
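
# Pretrained wav2vec2 encoder downloaded from the HuggingFace hub; it is kept
# frozen here and its weights are cached under save_path.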
wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
    source: !ref <wav2vec2_hub>
    output_norm: True
    freeze: True
    save_path: wav2vec2_checkpoint
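
# Variant (sketch): setting freeze: False above fine-tunes wav2vec2 jointly
# with the head; recipes doing so usually give the wav2vec2 parameters their
# own, smaller learning rate.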

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: !ref <blank_index>
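
# The three blocks above form the CTC output path: ctc_lin projects the
# encoder output to the output_neurons tokens, and log_softmax produces the
# per-frame log-probabilities that ctc_cost expects.

# asr_model collects the trainable modules so the optimizer and the
# checkpointing/pretraining machinery get a single handle on them.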
asr_model: !new:torch.nn.ModuleList
    - [!ref <enc>, !ref <ctc_lin>]
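
# The SentencePiece processor below is instantiated empty; the pretrainer at
# the bottom of this file loads the trained tokenizer model into it.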
tokenizer: !new:sentencepiece.SentencePieceProcessor
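
# Full inference pipeline: waveform -> wav2vec2 features -> DNN head ->
# CTC linear -> log-probabilities. LengthsCapableSequential also forwards
# relative lengths to the layers that accept them (here, wav2vec2).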
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    wav2vec2: !ref <wav2vec2>
    enc: !ref <enc>
    ctc_lin: !ref <ctc_lin>
    log_softmax: !ref <log_softmax>
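
# The entries under modules are the networks the inference interface wraps in
# a ModuleDict and moves to the computing device.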
modules:
    encoder: !ref <encoder>

# Beam search decoding parameters
decoding_function: !name:speechbrain.decoders.CTCBeamSearcher
test_beam_search:
    beam_size: 100
    beam_prune_logp: -12.0
    token_prune_min_logp: -1.2
    prune_history: True
    blank_index: !ref <blank_index>
    topk: 1
    alpha: 1.0
    beta: 0.5
    kenlm_model_path: speechbrain/asr-wav2vec2-commonvoice-14-en/en_5gram.arpa # comment this line out if you do not want n-gram rescoring
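
# alpha and beta only take effect when kenlm_model_path points to an n-gram
# LM: alpha weights the LM score and beta the word-insertion bonus during
# beam search. Removing kenlm_model_path decodes with acoustic scores alone.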

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        wav2vec2: !ref <wav2vec2>
        asr: !ref <asr_model>
        tokenizer: !ref <tokenizer>
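
# Typical usage from Python (a sketch: assumes this file ships as
# hyperparams.yaml in the model repo referenced above and SpeechBrain's
# inference API; "example.wav" is a placeholder):
#
#     from speechbrain.inference.ASR import EncoderASR
#
#     asr = EncoderASR.from_hparams(
#         source="speechbrain/asr-wav2vec2-commonvoice-14-en"
#     )
#     print(asr.transcribe_file("example.wav"))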