{
  "architectures": [
    "DistilxLSTM"
  ],
  "model_type": "xlstm",
  "pad_token_id": 151643,
  "torch_dtype": "float32",
  "transformers_version": "4.47.1",
  "xlstm_cfg": {
    "_block_map": "1,0,1,0,1,0",
    "add_embedding_dropout": false,
    "add_post_blocks_norm": true,
    "bias": false,
    "context_length": 512,
    "dropout": 0.0,
    "embedding_dim": 1536,
    "mlstm_block": {
      "_block_idx": null,
      "_num_blocks": 6,
      "mlstm": {
        "_inner_embedding_dim": 3072,
        "_num_blocks": 6,
        "_proj_up_dim": 3072,
        "bias": false,
        "context_length": 512,
        "conv1d_kernel_size": 4,
        "dropout": 0.0,
        "embedding_dim": 1536,
        "num_heads": 16,
        "proj_factor": 2.0,
        "qkv_proj_blocksize": 32,
        "round_proj_up_dim_up": true,
        "round_proj_up_to_multiple_of": 64
      }
    },
    "num_blocks": 6,
    "slstm_at": [
      0,
      2,
      4
    ],
    "slstm_block": {
      "_block_idx": null,
      "_num_blocks": 6,
      "feedforward": {
        "_num_blocks": 1,
        "_proj_up_dim": 0,
        "act_fn": "gelu",
        "bias": false,
        "dropout": 0.0,
        "embedding_dim": -1,
        "ff_type": "ffn_gated",
        "proj_factor": 1.7,
        "round_proj_up_dim_up": true,
        "round_proj_up_to_multiple_of": 64
      },
      "slstm": {
        "_block_idx": null,
        "_num_blocks": 6,
        "backend": "cuda",
        "batch_size": 8,
        "bias_init": "powerlaw_blockdependent",
        "constants": {},
        "conv1d_kernel_size": 4,
        "dropout": 0.0,
        "dtype": "bfloat16",
        "dtype_a": "float32",
        "dtype_b": "float32",
        "dtype_g": "bfloat16",
        "dtype_r": "bfloat16",
        "dtype_s": "bfloat16",
        "dtype_w": "bfloat16",
        "embedding_dim": 1536,
        "enable_automatic_mixed_precision": true,
        "forward_clipval": null,
        "function": "slstm",
        "gradient_recurrent_clipval": null,
        "gradient_recurrent_cut": false,
        "group_norm_weight": true,
        "hidden_size": 1536,
        "initial_val": 0.0,
        "input_shape": "BSGNH",
        "internal_input_shape": "SBNGH",
        "num_gates": 4,
        "num_heads": 16,
        "num_states": 4,
        "output_shape": "BNSH",
        "recurrent_weight_init": "zeros"
      }
    },
    "tie_weights": false,
    "vocab_size": 151936,
    "weight_decay_on_embedding": false
  }
}