# ############################################################################
# Model: Direct SLU
# Encoder: Pre-trained ASR encoder -> LSTM
# Decoder: GRU + beamsearch
# Tokens: BPE with unigram
# losses: NLL
# Training: Fluent Speech Commands
# Authors: Loren Lugosch, Mirco Ravanelli 2020
# ############################################################################

# ----------------------------------------------------------------------------
# Model parameters
# ----------------------------------------------------------------------------
sample_rate: 16000
emb_size: 128
dec_neurons: 512
output_neurons: 51  # index 0 is shared by bos and eos (see bos_index/eos_index)
ASR_encoder_dim: 512
encoder_dim: 256

# ----------------------------------------------------------------------------
# Decoding parameters (all consumed by `beam_searcher` below)
# ----------------------------------------------------------------------------
bos_index: 0
eos_index: 0
min_decode_ratio: 0.0
max_decode_ratio: 10.0
slu_beam_size: 80
eos_threshold: 1.5
temperature: 1.25

# ----------------------------------------------------------------------------
# Models
# ----------------------------------------------------------------------------

# Identifier of the pre-trained ASR model; nothing in this file references it,
# so it is presumably read by the code that loads these hyperparameters.
asr_model_source: speechbrain/asr-crdnn-rnnlm-librispeech

# SLU encoder: a 2-layer bidirectional LSTM followed by a linear projection.
# The linear layer takes `encoder_dim * 2` inputs because the LSTM is
# bidirectional with `hidden_size = encoder_dim`, and maps back to
# `encoder_dim`.
slu_enc: !new:speechbrain.nnet.containers.Sequential
    input_shape: [null, null, !ref <ASR_encoder_dim>]
    lstm: !new:speechbrain.nnet.RNN.LSTM
        input_size: !ref <ASR_encoder_dim>
        bidirectional: True
        hidden_size: !ref <encoder_dim>
        num_layers: 2
    linear: !new:speechbrain.nnet.linear.Linear
        input_size: !ref <encoder_dim> * 2
        n_neurons: !ref <encoder_dim>

# Embedding table for output tokens (one row per output neuron).
output_emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <emb_size>

# Attentional GRU decoder that reads the `encoder_dim`-sized encoder outputs.
dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <encoder_dim>
    input_size: !ref <emb_size>
    rnn_type: gru
    attn_type: keyvalue
    hidden_size: !ref <dec_neurons>
    attn_dim: 512
    num_layers: 3
    scaling: 1.0
    dropout: 0.0

# Projection from decoder state to output-token scores.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>

# All trainable pieces grouped in one ModuleList so that they can be loaded
# as a single `model` entry by the pretrainer.
model: !new:torch.nn.ModuleList
    - [!ref <slu_enc>, !ref <output_emb>, !ref <dec>, !ref <seq_lin>]

tokenizer: !new:sentencepiece.SentencePieceProcessor

# Objects whose parameters/state are loaded from pre-trained files.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        tokenizer: !ref <tokenizer>

# Beam search over the decoder, wired to the decoding parameters above.
beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <output_emb>
    decoder: !ref <dec>
    linear: !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <slu_beam_size>
    eos_threshold: !ref <eos_threshold>
    temperature: !ref <temperature>
    # NOTE(review): max_attn_shift is presumably unused while
    # using_max_attn_shift is False - confirm against the searcher's docs.
    using_max_attn_shift: False
    max_attn_shift: 30
    coverage_penalty: 0.

# Modules exposed to the code that loads this file.
modules:
    slu_enc: !ref <slu_enc>
    beam_searcher: !ref <beam_searcher>