Gurveer05 committed
Commit 8f94303 · verified · 1 Parent(s): 95dcba8

Add new SentenceTransformer model.

1_Pooling/config.json CHANGED
@@ -1,7 +1,7 @@
 {
-  "word_embedding_dimension": 768,
-  "pooling_mode_cls_token": true,
-  "pooling_mode_mean_tokens": false,
+  "word_embedding_dimension": 384,
+  "pooling_mode_cls_token": false,
+  "pooling_mode_mean_tokens": true,
   "pooling_mode_max_tokens": false,
   "pooling_mode_mean_sqrt_len_tokens": false,
   "pooling_mode_weightedmean_tokens": false,
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-base_model: Alibaba-NLP/gte-base-en-v1.5
+base_model: sentence-transformers/all-MiniLM-L6-v2
 library_name: sentence-transformers
 pipeline_tag: sentence-similarity
 tags:
@@ -174,17 +174,17 @@ widget:
 - Thinks x = y is an axis
 ---
 
-# SentenceTransformer based on Alibaba-NLP/gte-base-en-v1.5
+# SentenceTransformer based on sentence-transformers/all-MiniLM-L6-v2
 
-This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [Alibaba-NLP/gte-base-en-v1.5](https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5) on the csv dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
+This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) on the csv dataset. It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
 
 ## Model Details
 
 ### Model Description
 - **Model Type:** Sentence Transformer
-- **Base model:** [Alibaba-NLP/gte-base-en-v1.5](https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5) <!-- at revision a8e4f3e0ee719c75bc30d12b8eae0f8440502718 -->
-- **Maximum Sequence Length:** 8192 tokens
-- **Output Dimensionality:** 768 tokens
+- **Base model:** [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) <!-- at revision 8b3219a92973c328a8e22fadcfa821b5dc75636a -->
+- **Maximum Sequence Length:** 256 tokens
+- **Output Dimensionality:** 384 tokens
 - **Similarity Function:** Cosine Similarity
 - **Training Dataset:**
     - csv
@@ -201,8 +201,9 @@ This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [A
 
 ```
 SentenceTransformer(
-  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: NewModel
-  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel
+  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+  (2): Normalize()
 )
 ```
 
@@ -230,7 +231,7 @@ sentences = [
 ]
 embeddings = model.encode(sentences)
 print(embeddings.shape)
-# [3, 768]
+# [3, 384]
 
 # Get the similarity scores for the embeddings
 similarities = model.similarity(embeddings, embeddings)
@@ -284,10 +285,10 @@ You can finetune this model on your own dataset.
 * Size: 12,210 training samples
 * Columns: <code>qa_pair_text</code>, <code>MisconceptionName</code>, and <code>negative</code>
 * Approximate statistics based on the first 1000 samples:
-  |         | qa_pair_text | MisconceptionName | negative |
-  |:--------|:-------------|:------------------|:---------|
-  | type    | string       | string            | string   |
-  | details | <ul><li>min: 54 tokens</li><li>mean: 124.3 tokens</li><li>max: 618 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 15.16 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 14.49 tokens</li><li>max: 40 tokens</li></ul> |
+  |         | qa_pair_text | MisconceptionName | negative |
+  |:--------|:-------------|:------------------|:---------|
+  | type    | string       | string            | string   |
+  | details | <ul><li>min: 54 tokens</li><li>mean: 121.45 tokens</li><li>max: 256 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 15.16 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 14.49 tokens</li><li>max: 40 tokens</li></ul> |
 * Samples:
   | qa_pair_text | MisconceptionName | negative |
   |:-------------|:------------------|:---------|
@@ -310,10 +311,10 @@ You can finetune this model on your own dataset.
 * Size: 9,640 evaluation samples
 * Columns: <code>qa_pair_text</code>, <code>MisconceptionName</code>, and <code>negative</code>
 * Approximate statistics based on the first 1000 samples:
-  |         | qa_pair_text | MisconceptionName | negative |
-  |:--------|:-------------|:------------------|:---------|
-  | type    | string       | string            | string   |
-  | details | <ul><li>min: 56 tokens</li><li>mean: 123.29 tokens</li><li>max: 1092 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 14.51 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 13.86 tokens</li><li>max: 40 tokens</li></ul> |
+  |         | qa_pair_text | MisconceptionName | negative |
+  |:--------|:-------------|:------------------|:---------|
+  | type    | string       | string            | string   |
+  | details | <ul><li>min: 56 tokens</li><li>mean: 119.35 tokens</li><li>max: 256 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 14.51 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 13.86 tokens</li><li>max: 40 tokens</li></ul> |
 * Samples:
   | qa_pair_text | MisconceptionName | negative |
   |:-------------|:------------------|:---------|
@@ -334,12 +335,12 @@ You can finetune this model on your own dataset.
 - `eval_strategy`: steps
 - `per_device_train_batch_size`: 32
 - `per_device_eval_batch_size`: 32
-- `gradient_accumulation_steps`: 32
+- `gradient_accumulation_steps`: 8
 - `learning_rate`: 1e-05
 - `weight_decay`: 0.01
-- `num_train_epochs`: 20
+- `num_train_epochs`: 40
 - `lr_scheduler_type`: cosine
-- `lr_scheduler_kwargs`: {'num_cycles': 10}
+- `lr_scheduler_kwargs`: {'num_cycles': 20}
 - `warmup_ratio`: 0.1
 - `fp16`: True
 - `load_best_model_at_end`: True
@@ -358,7 +359,7 @@ You can finetune this model on your own dataset.
 - `per_device_eval_batch_size`: 32
 - `per_gpu_train_batch_size`: None
 - `per_gpu_eval_batch_size`: None
-- `gradient_accumulation_steps`: 32
+- `gradient_accumulation_steps`: 8
 - `eval_accumulation_steps`: None
 - `torch_empty_cache_steps`: None
 - `learning_rate`: 1e-05
@@ -367,10 +368,10 @@ You can finetune this model on your own dataset.
 - `adam_beta2`: 0.999
 - `adam_epsilon`: 1e-08
 - `max_grad_norm`: 1.0
-- `num_train_epochs`: 20
+- `num_train_epochs`: 40
 - `max_steps`: -1
 - `lr_scheduler_type`: cosine
-- `lr_scheduler_kwargs`: {'num_cycles': 10}
+- `lr_scheduler_kwargs`: {'num_cycles': 20}
 - `warmup_ratio`: 0.1
 - `warmup_steps`: 0
 - `log_level`: passive
@@ -465,41 +466,66 @@ You can finetune this model on your own dataset.
 </details>
 
 ### Training Logs
-| Epoch | Step | Training Loss | loss |
-|:-----------:|:------:|:-------------:|:----------:|
-| 0.5026 | 3 | 2.9133 | - |
-| 1.0052 | 6 | 2.5832 | 2.1410 |
-| 1.4974 | 9 | 2.2895 | - |
-| 2.0 | 12 | 2.0454 | 1.7594 |
-| 2.4921 | 15 | 1.8939 | - |
-| 2.9948 | 18 | 1.8752 | 1.6653 |
-| 3.4869 | 21 | 1.7731 | - |
-| 3.9895 | 24 | 1.6771 | 1.4987 |
-| 4.4817 | 27 | 1.6388 | - |
-| 4.9843 | 30 | 1.5924 | 1.3795 |
-| 5.4764 | 33 | 1.4895 | - |
-| 5.9791 | 36 | 1.4837 | 1.3370 |
-| 6.4712 | 39 | 1.4183 | - |
-| 6.9738 | 42 | 1.3677 | 1.2660 |
-| 7.4660 | 45 | 1.3165 | - |
-| 7.9686 | 48 | 1.3034 | 1.2091 |
-| 8.4607 | 51 | 1.199 | - |
-| 8.9634 | 54 | 1.2276 | 1.1851 |
-| 9.4555 | 57 | 1.1421 | - |
-| 9.9581 | 60 | 1.1234 | 1.1398 |
-| 10.4503 | 63 | 1.0703 | - |
-| 10.9529 | 66 | 1.0716 | 1.1000 |
-| 11.4450 | 69 | 0.9864 | - |
-| 11.9476 | 72 | 1.0047 | 1.0839 |
-| 12.4398 | 75 | 0.9381 | - |
-| 12.9424 | 78 | 0.9298 | 1.0559 |
-| 13.4346 | 81 | 0.8725 | - |
-| 13.9372 | 84 | 0.8813 | 1.0333 |
-| 14.4293 | 87 | 0.7988 | - |
-| 14.9319 | 90 | 0.8256 | 1.0245 |
-| 15.4241 | 93 | 0.7617 | - |
-| **15.9267** | **96** | **0.7551** | **1.0121** |
-| 16.4188 | 99 | 0.713 | - |
+| Epoch | Step | Training Loss | loss |
+|:-----------:|:-------:|:-------------:|:----------:|
+| 0.5026 | 12 | 2.2789 | - |
+| 1.0052 | 24 | 2.1642 | 1.9746 |
+| 1.4974 | 36 | 2.0463 | - |
+| 2.0 | 48 | 1.8955 | 1.6808 |
+| 2.4921 | 60 | 1.7692 | - |
+| 2.9948 | 72 | 1.6528 | 1.4532 |
+| 3.4869 | 84 | 1.5298 | - |
+| 3.9895 | 96 | 1.4338 | 1.2853 |
+| 4.4817 | 108 | 1.3374 | - |
+| 4.9843 | 120 | 1.3084 | 1.2465 |
+| 5.4764 | 132 | 1.2921 | - |
+| 5.9791 | 144 | 1.2143 | 1.1766 |
+| 6.4712 | 156 | 1.1689 | - |
+| 6.9738 | 168 | 1.1656 | 1.1518 |
+| 7.4660 | 180 | 1.1172 | - |
+| 7.9686 | 192 | 1.0737 | 1.1080 |
+| 8.4607 | 204 | 1.0373 | - |
+| 8.9634 | 216 | 1.0445 | 1.0874 |
+| 9.4555 | 228 | 0.9707 | - |
+| 9.9581 | 240 | 0.9644 | 1.0649 |
+| 10.4503 | 252 | 0.9252 | - |
+| 10.9529 | 264 | 0.9211 | 1.0367 |
+| 11.4450 | 276 | 0.8645 | - |
+| 11.9476 | 288 | 0.8635 | 1.0297 |
+| 12.4398 | 300 | 0.8279 | - |
+| 12.9424 | 312 | 0.819 | 1.0161 |
+| 13.4346 | 324 | 0.7684 | - |
+| 13.9372 | 336 | 0.7842 | 1.0016 |
+| 14.4293 | 348 | 0.7448 | - |
+| 14.9319 | 360 | 0.7321 | 0.9951 |
+| 15.4241 | 372 | 0.7064 | - |
+| 15.9267 | 384 | 0.7161 | 0.9835 |
+| 16.4188 | 396 | 0.6692 | - |
+| 16.9215 | 408 | 0.6594 | 0.9774 |
+| 17.4136 | 420 | 0.6405 | - |
+| 17.9162 | 432 | 0.638 | 0.9723 |
+| 18.4084 | 444 | 0.6 | - |
+| 18.9110 | 456 | 0.6122 | 0.9706 |
+| 19.4031 | 468 | 0.5763 | - |
+| 19.9058 | 480 | 0.5787 | 0.9732 |
+| 20.3979 | 492 | 0.5432 | - |
+| 20.9005 | 504 | 0.5599 | 0.9618 |
+| 21.3927 | 516 | 0.5245 | - |
+| 21.8953 | 528 | 0.5278 | 0.9626 |
+| 22.3874 | 540 | 0.4989 | - |
+| 22.8901 | 552 | 0.509 | 0.9583 |
+| 23.3822 | 564 | 0.4674 | - |
+| **23.8848** | **576** | **0.4854** | **0.9573** |
+| 24.3770 | 588 | 0.4619 | - |
+| 24.8796 | 600 | 0.4631 | 0.9615 |
+| 25.3717 | 612 | 0.4339 | - |
+| 25.8743 | 624 | 0.4427 | 0.9593 |
+| 26.3665 | 636 | 0.4225 | - |
+| 26.8691 | 648 | 0.4245 | 0.9694 |
+| 27.3613 | 660 | 0.3936 | - |
+| 27.8639 | 672 | 0.4168 | 0.9586 |
+| 28.3560 | 684 | 0.3835 | - |
+| 28.8586 | 696 | 0.3921 | 0.9629 |
 
 * The bold row denotes the saved checkpoint.
 
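The card's updated usage example can be sanity-checked end to end. A minimal sketch, assuming `sentence-transformers` is installed; the repo id below is a placeholder, since the model's actual id does not appear in this diff:

```python
from sentence_transformers import SentenceTransformer
import numpy as np

# Placeholder id — substitute this model's actual repo id.
model = SentenceTransformer("Gurveer05/placeholder-model")

sentences = ["Thinks x = y is an axis", "Confuses the x and y axes"]
embeddings = model.encode(sentences)

print(embeddings.shape)  # (2, 384) after this commit, down from (2, 768)
# The new Normalize module returns unit-length vectors, so cosine
# similarity coincides with the dot product:
print(np.linalg.norm(embeddings, axis=1))        # ~ [1.0, 1.0]
print(model.similarity(embeddings, embeddings))  # 2x2 similarity matrix
```
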
config.json CHANGED
@@ -1,44 +1,26 @@
 {
-  "_name_or_path": "Alibaba-NLP/gte-base-en-v1.5",
+  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
   "architectures": [
-    "NewModel"
+    "BertModel"
   ],
-  "attention_probs_dropout_prob": 0.0,
-  "auto_map": {
-    "AutoConfig": "Alibaba-NLP/new-impl--configuration.NewConfig",
-    "AutoModel": "Alibaba-NLP/new-impl--modeling.NewModel",
-    "AutoModelForMaskedLM": "Alibaba-NLP/new-impl--modeling.NewForMaskedLM",
-    "AutoModelForMultipleChoice": "Alibaba-NLP/new-impl--modeling.NewForMultipleChoice",
-    "AutoModelForQuestionAnswering": "Alibaba-NLP/new-impl--modeling.NewForQuestionAnswering",
-    "AutoModelForSequenceClassification": "Alibaba-NLP/new-impl--modeling.NewForSequenceClassification",
-    "AutoModelForTokenClassification": "Alibaba-NLP/new-impl--modeling.NewForTokenClassification"
-  },
+  "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
+  "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
+  "hidden_size": 384,
   "initializer_range": 0.02,
-  "intermediate_size": 3072,
+  "intermediate_size": 1536,
   "layer_norm_eps": 1e-12,
-  "layer_norm_type": "layer_norm",
-  "logn_attention_clip1": false,
-  "logn_attention_scale": false,
-  "max_position_embeddings": 8192,
-  "model_type": "new",
+  "max_position_embeddings": 512,
+  "model_type": "bert",
   "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pack_qkv": true,
+  "num_hidden_layers": 6,
   "pad_token_id": 0,
-  "position_embedding_type": "rope",
-  "rope_scaling": {
-    "factor": 2.0,
-    "type": "ntk"
-  },
-  "rope_theta": 500000,
+  "position_embedding_type": "absolute",
   "torch_dtype": "float32",
   "transformers_version": "4.44.0",
-  "type_vocab_size": 0,
-  "unpad_inputs": false,
-  "use_memory_efficient_attention": false,
-  "vocab_size": 30528
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
 }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7d10283288ad7a6ce66a5175fb51467f4d3b6f939ee20bd89479c7bab36286f8
-size 547119128
+oid sha256:3dc81fcda60ac280c966d16b9cc07ebe8f5e13619f4caa4425c7f6e5dd344a91
+size 90864192
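
The LFS pointer sizes line up with the architecture change: at four bytes per float32 parameter, the byte counts above imply roughly 137M parameters for gte-base-en-v1.5 and 22.7M for all-MiniLM-L6-v2 (the safetensors header adds a little overhead). A quick back-of-the-envelope check:

```python
# Approximate parameter counts implied by the checkpoint sizes above,
# assuming float32 (4 bytes per weight) plus a small safetensors header.
old_bytes = 547_119_128
new_bytes = 90_864_192
print(f"old: ~{old_bytes / 4 / 1e6:.1f}M params")  # ~136.8M (gte-base-en-v1.5)
print(f"new: ~{new_bytes / 4 / 1e6:.1f}M params")  # ~22.7M (all-MiniLM-L6-v2)
```
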
modules.json CHANGED
@@ -10,5 +10,11 @@
     "name": "1",
     "path": "1_Pooling",
     "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
   }
 ]
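
The new third module L2-normalizes the pooled embedding; `sentence_transformers.models.Normalize` is equivalent to a single functional call. A sketch of the operation it appends:

```python
import torch
import torch.nn.functional as F

pooled = torch.randn(2, 384)            # output of the Pooling module
unit = F.normalize(pooled, p=2, dim=1)  # what the Normalize module applies

print(unit.norm(dim=1))  # tensor([1., 1.]) — unit-length embeddings
# On unit vectors, cosine similarity is just the dot product:
print(unit @ unit.T)
```
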
sentence_bert_config.json CHANGED
@@ -1,4 +1,4 @@
 {
-  "max_seq_length": 8192,
+  "max_seq_length": 256,
   "do_lower_case": false
 }
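
Cutting `max_seq_length` from 8192 to 256 means longer inputs are now truncated, which is why the dataset statistics in the new card cap out at exactly 256 tokens (previously 618 and 1092). A quick way to observe the truncation, assuming the base model's tokenizer:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
ids = tok("word " * 1000, truncation=True, max_length=256)["input_ids"]
print(len(ids))  # 256 — tokens beyond max_seq_length are dropped
```
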
tokenizer.json CHANGED
@@ -2,7 +2,7 @@
   "version": "1.0",
   "truncation": {
     "direction": "Right",
-    "max_length": 8192,
+    "max_length": 256,
     "strategy": "LongestFirst",
     "stride": 0
   },
tokenizer_config.json CHANGED
@@ -43,10 +43,12 @@
   },
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
   "do_lower_case": true,
   "mask_token": "[MASK]",
-  "max_length": 512,
-  "model_max_length": 8192,
+  "max_length": 128,
+  "model_max_length": 256,
+  "never_split": null,
   "pad_to_multiple_of": null,
   "pad_token": "[PAD]",
   "pad_token_type_id": 0,