Gurveer05 committed (verified)
Commit: f80d36c · Parent: 8f94303

Add new SentenceTransformer model.

1_Pooling/config.json CHANGED
@@ -1,7 +1,7 @@
 {
-  "word_embedding_dimension": 384,
-  "pooling_mode_cls_token": false,
-  "pooling_mode_mean_tokens": true,
+  "word_embedding_dimension": 768,
+  "pooling_mode_cls_token": true,
+  "pooling_mode_mean_tokens": false,
   "pooling_mode_max_tokens": false,
   "pooling_mode_mean_sqrt_len_tokens": false,
   "pooling_mode_weightedmean_tokens": false,
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-base_model: sentence-transformers/all-MiniLM-L6-v2
+base_model: Alibaba-NLP/gte-base-en-v1.5
 library_name: sentence-transformers
 pipeline_tag: sentence-similarity
 tags:
@@ -174,17 +174,17 @@ widget:
   - Thinks x = y is an axis
 ---
 
-# SentenceTransformer based on sentence-transformers/all-MiniLM-L6-v2
+# SentenceTransformer based on Alibaba-NLP/gte-base-en-v1.5
 
-This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) on the csv dataset. It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
+This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [Alibaba-NLP/gte-base-en-v1.5](https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5) on the csv dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
 
 ## Model Details
 
 ### Model Description
 - **Model Type:** Sentence Transformer
-- **Base model:** [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) <!-- at revision 8b3219a92973c328a8e22fadcfa821b5dc75636a -->
-- **Maximum Sequence Length:** 256 tokens
-- **Output Dimensionality:** 384 tokens
+- **Base model:** [Alibaba-NLP/gte-base-en-v1.5](https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5) <!-- at revision a8e4f3e0ee719c75bc30d12b8eae0f8440502718 -->
+- **Maximum Sequence Length:** 8192 tokens
+- **Output Dimensionality:** 768 tokens
 - **Similarity Function:** Cosine Similarity
 - **Training Dataset:**
     - csv
@@ -201,9 +201,8 @@ This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [s
 
 ```
 SentenceTransformer(
-  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel
-  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
-  (2): Normalize()
+  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: NewModel
+  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
 )
 ```
 
@@ -231,7 +230,7 @@ sentences = [
 ]
 embeddings = model.encode(sentences)
 print(embeddings.shape)
-# [3, 384]
+# [3, 768]
 
 # Get the similarity scores for the embeddings
 similarities = model.similarity(embeddings, embeddings)
@@ -285,10 +284,10 @@ You can finetune this model on your own dataset.
 * Size: 12,210 training samples
 * Columns: <code>qa_pair_text</code>, <code>MisconceptionName</code>, and <code>negative</code>
 * Approximate statistics based on the first 1000 samples:
-  |         | qa_pair_text | MisconceptionName | negative |
-  |:--------|:-------------|:------------------|:---------|
-  | type    | string | string | string |
-  | details | <ul><li>min: 54 tokens</li><li>mean: 121.45 tokens</li><li>max: 256 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 15.16 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 14.49 tokens</li><li>max: 40 tokens</li></ul> |
+  |         | qa_pair_text | MisconceptionName | negative |
+  |:--------|:-------------|:------------------|:---------|
+  | type    | string | string | string |
+  | details | <ul><li>min: 54 tokens</li><li>mean: 124.3 tokens</li><li>max: 618 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 15.16 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 14.49 tokens</li><li>max: 40 tokens</li></ul> |
 * Samples:
   | qa_pair_text | MisconceptionName | negative |
   |:-------------|:------------------|:---------|
@@ -311,10 +310,10 @@ You can finetune this model on your own dataset.
 * Size: 9,640 evaluation samples
 * Columns: <code>qa_pair_text</code>, <code>MisconceptionName</code>, and <code>negative</code>
 * Approximate statistics based on the first 1000 samples:
-  |         | qa_pair_text | MisconceptionName | negative |
-  |:--------|:-------------|:------------------|:---------|
-  | type    | string | string | string |
-  | details | <ul><li>min: 56 tokens</li><li>mean: 119.35 tokens</li><li>max: 256 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 14.51 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 13.86 tokens</li><li>max: 40 tokens</li></ul> |
+  |         | qa_pair_text | MisconceptionName | negative |
+  |:--------|:-------------|:------------------|:---------|
+  | type    | string | string | string |
+  | details | <ul><li>min: 56 tokens</li><li>mean: 123.29 tokens</li><li>max: 1092 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 14.51 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 13.86 tokens</li><li>max: 40 tokens</li></ul> |
 * Samples:
   | qa_pair_text | MisconceptionName | negative |
   |:-------------|:------------------|:---------|
@@ -335,7 +334,7 @@ You can finetune this model on your own dataset.
 - `eval_strategy`: steps
 - `per_device_train_batch_size`: 32
 - `per_device_eval_batch_size`: 32
-- `gradient_accumulation_steps`: 8
+- `gradient_accumulation_steps`: 16
 - `learning_rate`: 1e-05
 - `weight_decay`: 0.01
 - `num_train_epochs`: 40
@@ -359,7 +358,7 @@ You can finetune this model on your own dataset.
 - `per_device_eval_batch_size`: 32
 - `per_gpu_train_batch_size`: None
 - `per_gpu_eval_batch_size`: None
-- `gradient_accumulation_steps`: 8
+- `gradient_accumulation_steps`: 16
 - `eval_accumulation_steps`: None
 - `torch_empty_cache_steps`: None
 - `learning_rate`: 1e-05
@@ -468,64 +467,42 @@ You can finetune this model on your own dataset.
 ### Training Logs
 | Epoch | Step | Training Loss | loss |
 |:-----------:|:-------:|:-------------:|:----------:|
-| 0.5026 | 12 | 2.2789 | - |
-| 1.0052 | 24 | 2.1642 | 1.9746 |
-| 1.4974 | 36 | 2.0463 | - |
-| 2.0 | 48 | 1.8955 | 1.6808 |
-| 2.4921 | 60 | 1.7692 | - |
-| 2.9948 | 72 | 1.6528 | 1.4532 |
-| 3.4869 | 84 | 1.5298 | - |
-| 3.9895 | 96 | 1.4338 | 1.2853 |
-| 4.4817 | 108 | 1.3374 | - |
-| 4.9843 | 120 | 1.3084 | 1.2465 |
-| 5.4764 | 132 | 1.2921 | - |
-| 5.9791 | 144 | 1.2143 | 1.1766 |
-| 6.4712 | 156 | 1.1689 | - |
-| 6.9738 | 168 | 1.1656 | 1.1518 |
-| 7.4660 | 180 | 1.1172 | - |
-| 7.9686 | 192 | 1.0737 | 1.1080 |
-| 8.4607 | 204 | 1.0373 | - |
-| 8.9634 | 216 | 1.0445 | 1.0874 |
-| 9.4555 | 228 | 0.9707 | - |
-| 9.9581 | 240 | 0.9644 | 1.0649 |
-| 10.4503 | 252 | 0.9252 | - |
-| 10.9529 | 264 | 0.9211 | 1.0367 |
-| 11.4450 | 276 | 0.8645 | - |
-| 11.9476 | 288 | 0.8635 | 1.0297 |
-| 12.4398 | 300 | 0.8279 | - |
-| 12.9424 | 312 | 0.819 | 1.0161 |
-| 13.4346 | 324 | 0.7684 | - |
-| 13.9372 | 336 | 0.7842 | 1.0016 |
-| 14.4293 | 348 | 0.7448 | - |
-| 14.9319 | 360 | 0.7321 | 0.9951 |
-| 15.4241 | 372 | 0.7064 | - |
-| 15.9267 | 384 | 0.7161 | 0.9835 |
-| 16.4188 | 396 | 0.6692 | - |
-| 16.9215 | 408 | 0.6594 | 0.9774 |
-| 17.4136 | 420 | 0.6405 | - |
-| 17.9162 | 432 | 0.638 | 0.9723 |
-| 18.4084 | 444 | 0.6 | - |
-| 18.9110 | 456 | 0.6122 | 0.9706 |
-| 19.4031 | 468 | 0.5763 | - |
-| 19.9058 | 480 | 0.5787 | 0.9732 |
-| 20.3979 | 492 | 0.5432 | - |
-| 20.9005 | 504 | 0.5599 | 0.9618 |
-| 21.3927 | 516 | 0.5245 | - |
-| 21.8953 | 528 | 0.5278 | 0.9626 |
-| 22.3874 | 540 | 0.4989 | - |
-| 22.8901 | 552 | 0.509 | 0.9583 |
-| 23.3822 | 564 | 0.4674 | - |
-| **23.8848** | **576** | **0.4854** | **0.9573** |
-| 24.3770 | 588 | 0.4619 | - |
-| 24.8796 | 600 | 0.4631 | 0.9615 |
-| 25.3717 | 612 | 0.4339 | - |
-| 25.8743 | 624 | 0.4427 | 0.9593 |
-| 26.3665 | 636 | 0.4225 | - |
-| 26.8691 | 648 | 0.4245 | 0.9694 |
-| 27.3613 | 660 | 0.3936 | - |
-| 27.8639 | 672 | 0.4168 | 0.9586 |
-| 28.3560 | 684 | 0.3835 | - |
-| 28.8586 | 696 | 0.3921 | 0.9629 |
+| 0.5026 | 6 | 2.8901 | - |
+| 1.0052 | 12 | 2.5455 | 2.1423 |
+| 1.4974 | 18 | 2.2716 | - |
+| 2.0 | 24 | 2.0293 | 1.7440 |
+| 2.4921 | 30 | 1.8326 | - |
+| 2.9948 | 36 | 1.6703 | 1.4220 |
+| 3.4869 | 42 | 1.4876 | - |
+| 3.9895 | 48 | 1.3571 | 1.2232 |
+| 4.4817 | 54 | 1.2347 | - |
+| 4.9843 | 60 | 1.2289 | 1.1891 |
+| 5.4764 | 66 | 1.1551 | - |
+| 5.9791 | 72 | 1.0629 | 1.1069 |
+| 6.4712 | 78 | 1.0166 | - |
+| 6.9738 | 84 | 1.0095 | 1.0651 |
+| 7.4660 | 90 | 0.8951 | - |
+| 7.9686 | 96 | 0.8782 | 1.0386 |
+| 8.4607 | 102 | 0.8305 | - |
+| 8.9634 | 108 | 0.809 | 1.0174 |
+| 9.4555 | 114 | 0.7202 | - |
+| 9.9581 | 120 | 0.7403 | 1.0041 |
+| 10.4503 | 126 | 0.6737 | - |
+| 10.9529 | 132 | 0.6499 | 0.9903 |
+| 11.4450 | 138 | 0.6149 | - |
+| 11.9476 | 144 | 0.6185 | 0.9889 |
+| 12.4398 | 150 | 0.5492 | - |
+| **12.9424** | **156** | **0.5595** | **0.9878** |
+| 13.4346 | 162 | 0.5146 | - |
+| 13.9372 | 168 | 0.5097 | 0.9927 |
+| 14.4293 | 174 | 0.4584 | - |
+| 14.9319 | 180 | 0.4746 | 0.9912 |
+| 15.4241 | 186 | 0.4331 | - |
+| 15.9267 | 192 | 0.424 | 1.0016 |
+| 16.4188 | 198 | 0.3946 | - |
+| 16.9215 | 204 | 0.4077 | 1.0002 |
+| 17.4136 | 210 | 0.366 | - |
+| 17.9162 | 216 | 0.3721 | 1.0070 |
 
 * The bold row denotes the saved checkpoint.
 
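One practical consequence of the `gradient_accumulation_steps` change above: with a per-device batch size of 32, the effective batch per optimizer step grows from 256 to 512 samples. A minimal sketch of the listed hyperparameters (assuming a single device and the `SentenceTransformerTrainingArguments` API; `output_dir` is a placeholder):

```python
from sentence_transformers import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="output",              # placeholder
    eval_strategy="steps",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=16,   # was 8 before this commit
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=40,
)

# Effective train batch size on one device:
# 32 * 16 = 512 samples per optimizer step (previously 32 * 8 = 256).
```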
 
config.json CHANGED
@@ -1,26 +1,44 @@
 {
-  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
+  "_name_or_path": "Alibaba-NLP/gte-base-en-v1.5",
   "architectures": [
-    "BertModel"
+    "NewModel"
   ],
-  "attention_probs_dropout_prob": 0.1,
+  "attention_probs_dropout_prob": 0.0,
+  "auto_map": {
+    "AutoConfig": "Alibaba-NLP/new-impl--configuration.NewConfig",
+    "AutoModel": "Alibaba-NLP/new-impl--modeling.NewModel",
+    "AutoModelForMaskedLM": "Alibaba-NLP/new-impl--modeling.NewForMaskedLM",
+    "AutoModelForMultipleChoice": "Alibaba-NLP/new-impl--modeling.NewForMultipleChoice",
+    "AutoModelForQuestionAnswering": "Alibaba-NLP/new-impl--modeling.NewForQuestionAnswering",
+    "AutoModelForSequenceClassification": "Alibaba-NLP/new-impl--modeling.NewForSequenceClassification",
+    "AutoModelForTokenClassification": "Alibaba-NLP/new-impl--modeling.NewForTokenClassification"
+  },
   "classifier_dropout": null,
-  "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
-  "hidden_size": 384,
+  "hidden_size": 768,
   "initializer_range": 0.02,
-  "intermediate_size": 1536,
+  "intermediate_size": 3072,
   "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 512,
-  "model_type": "bert",
+  "layer_norm_type": "layer_norm",
+  "logn_attention_clip1": false,
+  "logn_attention_scale": false,
+  "max_position_embeddings": 8192,
+  "model_type": "new",
   "num_attention_heads": 12,
-  "num_hidden_layers": 6,
+  "num_hidden_layers": 12,
+  "pack_qkv": true,
   "pad_token_id": 0,
-  "position_embedding_type": "absolute",
+  "position_embedding_type": "rope",
+  "rope_scaling": {
+    "factor": 2.0,
+    "type": "ntk"
+  },
+  "rope_theta": 500000,
   "torch_dtype": "float32",
   "transformers_version": "4.44.0",
-  "type_vocab_size": 2,
-  "use_cache": true,
-  "vocab_size": 30522
+  "type_vocab_size": 0,
+  "unpad_inputs": false,
+  "use_memory_efficient_attention": false,
+  "vocab_size": 30528
 }
 
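Because the new `auto_map` entries point at remote code (`Alibaba-NLP/new-impl`), plain `transformers` loading needs `trust_remote_code=True`. A minimal sketch against the base model's published config, whose values should match the diff above:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Alibaba-NLP/gte-base-en-v1.5", trust_remote_code=True)
print(config.model_type)               # "new"
print(config.hidden_size)              # 768
print(config.max_position_embeddings)  # 8192
print(config.rope_scaling)             # {'factor': 2.0, 'type': 'ntk'}
```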
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3dc81fcda60ac280c966d16b9cc07ebe8f5e13619f4caa4425c7f6e5dd344a91
-size 90864192
+oid sha256:e9a3cff2df37b85a7a0b3b7953d156029625c6b2ee5b6ed8bea9e8f144b64982
+size 547119128
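As a back-of-the-envelope check on the weight file sizes (assuming essentially all bytes are float32 parameters and ignoring the small safetensors header):

```python
# 4 bytes per float32 parameter
print(90_864_192 / 4)    # ~22.7M parameters (all-MiniLM-L6-v2 scale)
print(547_119_128 / 4)   # ~136.8M parameters (gte-base-en-v1.5 scale)
```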
modules.json CHANGED
@@ -10,11 +10,5 @@
     "name": "1",
     "path": "1_Pooling",
     "type": "sentence_transformers.models.Pooling"
-  },
-  {
-    "idx": 2,
-    "name": "2",
-    "path": "2_Normalize",
-    "type": "sentence_transformers.models.Normalize"
   }
 ]
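Two pipeline changes are visible here and in `1_Pooling/config.json`: pooling switches from mean pooling to CLS-token pooling over 768-dimensional token embeddings, and the trailing `Normalize` module is dropped, so `encode()` output is no longer unit-length by default. A minimal sketch of both points (placeholder Hub id; `normalize_embeddings=True` is only needed when unit vectors are required, e.g. so that dot product equals cosine similarity):

```python
from sentence_transformers import SentenceTransformer, models

# How the updated 1_Pooling/config.json maps onto a Pooling module
pooling = models.Pooling(
    word_embedding_dimension=768,    # hidden size of the new base model
    pooling_mode_cls_token=True,     # CLS pooling enabled in this commit
    pooling_mode_mean_tokens=False,  # mean pooling disabled
)
print(pooling)  # repr shows the same config dict as the model card's architecture block

# Without a Normalize module, request L2-normalized embeddings explicitly when needed
model = SentenceTransformer("Gurveer05/your-model-id", trust_remote_code=True)  # placeholder id
embeddings = model.encode(["an example sentence"], normalize_embeddings=True)
```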
sentence_bert_config.json CHANGED
@@ -1,4 +1,4 @@
 {
-  "max_seq_length": 256,
+  "max_seq_length": 8192,
   "do_lower_case": false
 }
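With `max_seq_length` raised from 256 to 8192, inputs are truncated much later, but attention cost grows quickly with sequence length; the limit can be lowered at load time without retraining. A minimal sketch (placeholder Hub id):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Gurveer05/your-model-id", trust_remote_code=True)  # placeholder id
print(model.max_seq_length)  # 8192 after this commit

# Cap the effective sequence length if memory becomes a concern
model.max_seq_length = 1024
```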
tokenizer.json CHANGED
@@ -2,7 +2,7 @@
   "version": "1.0",
   "truncation": {
     "direction": "Right",
-    "max_length": 256,
+    "max_length": 8192,
     "strategy": "LongestFirst",
     "stride": 0
   },
tokenizer_config.json CHANGED
@@ -43,12 +43,10 @@
   },
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
   "do_lower_case": true,
   "mask_token": "[MASK]",
-  "max_length": 128,
-  "model_max_length": 256,
-  "never_split": null,
+  "max_length": 512,
+  "model_max_length": 8192,
   "pad_to_multiple_of": null,
   "pad_token": "[PAD]",
   "pad_token_type_id": 0,