Gurveer05 committed (verified)
Commit: f80d36c · Parent: 8f94303

Add new SentenceTransformer model.

1_Pooling/config.json CHANGED
@@ -1,7 +1,7 @@
 {
-  "word_embedding_dimension": 384,
-  "pooling_mode_cls_token": false,
-  "pooling_mode_mean_tokens": true,
+  "word_embedding_dimension": 768,
+  "pooling_mode_cls_token": true,
+  "pooling_mode_mean_tokens": false,
   "pooling_mode_max_tokens": false,
   "pooling_mode_mean_sqrt_len_tokens": false,
   "pooling_mode_weightedmean_tokens": false,
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-base_model: sentence-transformers/all-MiniLM-L6-v2
+base_model: Alibaba-NLP/gte-base-en-v1.5
 library_name: sentence-transformers
 pipeline_tag: sentence-similarity
 tags:
@@ -174,17 +174,17 @@ widget:
   - Thinks x = y is an axis
 ---
 
-# SentenceTransformer based on sentence-transformers/all-MiniLM-L6-v2
+# SentenceTransformer based on Alibaba-NLP/gte-base-en-v1.5
 
-This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) on the csv dataset. It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
+This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [Alibaba-NLP/gte-base-en-v1.5](https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5) on the csv dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
 
 ## Model Details
 
 ### Model Description
 - **Model Type:** Sentence Transformer
-- **Base model:** [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) <!-- at revision 8b3219a92973c328a8e22fadcfa821b5dc75636a -->
-- **Maximum Sequence Length:** 256 tokens
-- **Output Dimensionality:** 384 tokens
+- **Base model:** [Alibaba-NLP/gte-base-en-v1.5](https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5) <!-- at revision a8e4f3e0ee719c75bc30d12b8eae0f8440502718 -->
+- **Maximum Sequence Length:** 8192 tokens
+- **Output Dimensionality:** 768 tokens
 - **Similarity Function:** Cosine Similarity
 - **Training Dataset:**
     - csv
@@ -201,9 +201,8 @@ This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [s
 
 ```
 SentenceTransformer(
-  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel
-  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
-  (2): Normalize()
+  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: NewModel
+  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
 )
 ```
 
@@ -231,7 +230,7 @@ sentences = [
 ]
 embeddings = model.encode(sentences)
 print(embeddings.shape)
-# [3, 384]
+# [3, 768]
 
 # Get the similarity scores for the embeddings
 similarities = model.similarity(embeddings, embeddings)
@@ -285,10 +284,10 @@ You can finetune this model on your own dataset.
 * Size: 12,210 training samples
 * Columns: <code>qa_pair_text</code>, <code>MisconceptionName</code>, and <code>negative</code>
 * Approximate statistics based on the first 1000 samples:
-  |         | qa_pair_text | MisconceptionName | negative |
-  |:--------|:-------------|:------------------|:---------|
-  | type    | string | string | string |
-  | details | <ul><li>min: 54 tokens</li><li>mean: 121.45 tokens</li><li>max: 256 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 15.16 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 14.49 tokens</li><li>max: 40 tokens</li></ul> |
+  |         | qa_pair_text | MisconceptionName | negative |
+  |:--------|:-------------|:------------------|:---------|
+  | type    | string | string | string |
+  | details | <ul><li>min: 54 tokens</li><li>mean: 124.3 tokens</li><li>max: 618 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 15.16 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 14.49 tokens</li><li>max: 40 tokens</li></ul> |
 * Samples:
   | qa_pair_text | MisconceptionName | negative |
   |:-------------|:------------------|:---------|
@@ -311,10 +310,10 @@ You can finetune this model on your own dataset.
 * Size: 9,640 evaluation samples
 * Columns: <code>qa_pair_text</code>, <code>MisconceptionName</code>, and <code>negative</code>
 * Approximate statistics based on the first 1000 samples:
-  |         | qa_pair_text | MisconceptionName | negative |
-  |:--------|:-------------|:------------------|:---------|
-  | type    | string | string | string |
-  | details | <ul><li>min: 56 tokens</li><li>mean: 119.35 tokens</li><li>max: 256 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 14.51 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 13.86 tokens</li><li>max: 40 tokens</li></ul> |
+  |         | qa_pair_text | MisconceptionName | negative |
+  |:--------|:-------------|:------------------|:---------|
+  | type    | string | string | string |
+  | details | <ul><li>min: 56 tokens</li><li>mean: 123.29 tokens</li><li>max: 1092 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 14.51 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 13.86 tokens</li><li>max: 40 tokens</li></ul> |
 * Samples:
   | qa_pair_text | MisconceptionName | negative |
   |:-------------|:------------------|:---------|
@@ -335,7 +334,7 @@ You can finetune this model on your own dataset.
 - `eval_strategy`: steps
 - `per_device_train_batch_size`: 32
 - `per_device_eval_batch_size`: 32
-- `gradient_accumulation_steps`: 8
+- `gradient_accumulation_steps`: 16
 - `learning_rate`: 1e-05
 - `weight_decay`: 0.01
 - `num_train_epochs`: 40
@@ -359,7 +358,7 @@ You can finetune this model on your own dataset.
 - `per_device_eval_batch_size`: 32
 - `per_gpu_train_batch_size`: None
 - `per_gpu_eval_batch_size`: None
-- `gradient_accumulation_steps`: 8
+- `gradient_accumulation_steps`: 16
 - `eval_accumulation_steps`: None
 - `torch_empty_cache_steps`: None
 - `learning_rate`: 1e-05
@@ -468,64 +467,42 @@ You can finetune this model on your own dataset.
 ### Training Logs
 | Epoch | Step | Training Loss | loss |
 |:-----------:|:-------:|:-------------:|:----------:|
-| 0.5026 | 12 | 2.2789 | - |
-| 1.0052 | 24 | 2.1642 | 1.9746 |
-| 1.4974 | 36 | 2.0463 | - |
-| 2.0 | 48 | 1.8955 | 1.6808 |
-| 2.4921 | 60 | 1.7692 | - |
-| 2.9948 | 72 | 1.6528 | 1.4532 |
-| 3.4869 | 84 | 1.5298 | - |
-| 3.9895 | 96 | 1.4338 | 1.2853 |
-| 4.4817 | 108 | 1.3374 | - |
-| 4.9843 | 120 | 1.3084 | 1.2465 |
-| 5.4764 | 132 | 1.2921 | - |
-| 5.9791 | 144 | 1.2143 | 1.1766 |
-| 6.4712 | 156 | 1.1689 | - |
-| 6.9738 | 168 | 1.1656 | 1.1518 |
-| 7.4660 | 180 | 1.1172 | - |
-| 7.9686 | 192 | 1.0737 | 1.1080 |
-| 8.4607 | 204 | 1.0373 | - |
-| 8.9634 | 216 | 1.0445 | 1.0874 |
-| 9.4555 | 228 | 0.9707 | - |
-| 9.9581 | 240 | 0.9644 | 1.0649 |
-| 10.4503 | 252 | 0.9252 | - |
-| 10.9529 | 264 | 0.9211 | 1.0367 |
-| 11.4450 | 276 | 0.8645 | - |
-| 11.9476 | 288 | 0.8635 | 1.0297 |
-| 12.4398 | 300 | 0.8279 | - |
-| 12.9424 | 312 | 0.819 | 1.0161 |
-| 13.4346 | 324 | 0.7684 | - |
-| 13.9372 | 336 | 0.7842 | 1.0016 |
-| 14.4293 | 348 | 0.7448 | - |
-| 14.9319 | 360 | 0.7321 | 0.9951 |
-| 15.4241 | 372 | 0.7064 | - |
-| 15.9267 | 384 | 0.7161 | 0.9835 |
-| 16.4188 | 396 | 0.6692 | - |
-| 16.9215 | 408 | 0.6594 | 0.9774 |
-| 17.4136 | 420 | 0.6405 | - |
-| 17.9162 | 432 | 0.638 | 0.9723 |
-| 18.4084 | 444 | 0.6 | - |
-| 18.9110 | 456 | 0.6122 | 0.9706 |
-| 19.4031 | 468 | 0.5763 | - |
-| 19.9058 | 480 | 0.5787 | 0.9732 |
-| 20.3979 | 492 | 0.5432 | - |
-| 20.9005 | 504 | 0.5599 | 0.9618 |
-| 21.3927 | 516 | 0.5245 | - |
-| 21.8953 | 528 | 0.5278 | 0.9626 |
-| 22.3874 | 540 | 0.4989 | - |
-| 22.8901 | 552 | 0.509 | 0.9583 |
-| 23.3822 | 564 | 0.4674 | - |
-| **23.8848** | **576** | **0.4854** | **0.9573** |
-| 24.3770 | 588 | 0.4619 | - |
-| 24.8796 | 600 | 0.4631 | 0.9615 |
-| 25.3717 | 612 | 0.4339 | - |
-| 25.8743 | 624 | 0.4427 | 0.9593 |
-| 26.3665 | 636 | 0.4225 | - |
-| 26.8691 | 648 | 0.4245 | 0.9694 |
-| 27.3613 | 660 | 0.3936 | - |
-| 27.8639 | 672 | 0.4168 | 0.9586 |
-| 28.3560 | 684 | 0.3835 | - |
-| 28.8586 | 696 | 0.3921 | 0.9629 |
+| 0.5026 | 6 | 2.8901 | - |
+| 1.0052 | 12 | 2.5455 | 2.1423 |
+| 1.4974 | 18 | 2.2716 | - |
+| 2.0 | 24 | 2.0293 | 1.7440 |
+| 2.4921 | 30 | 1.8326 | - |
+| 2.9948 | 36 | 1.6703 | 1.4220 |
+| 3.4869 | 42 | 1.4876 | - |
+| 3.9895 | 48 | 1.3571 | 1.2232 |
+| 4.4817 | 54 | 1.2347 | - |
+| 4.9843 | 60 | 1.2289 | 1.1891 |
+| 5.4764 | 66 | 1.1551 | - |
+| 5.9791 | 72 | 1.0629 | 1.1069 |
+| 6.4712 | 78 | 1.0166 | - |
+| 6.9738 | 84 | 1.0095 | 1.0651 |
+| 7.4660 | 90 | 0.8951 | - |
+| 7.9686 | 96 | 0.8782 | 1.0386 |
+| 8.4607 | 102 | 0.8305 | - |
+| 8.9634 | 108 | 0.809 | 1.0174 |
+| 9.4555 | 114 | 0.7202 | - |
+| 9.9581 | 120 | 0.7403 | 1.0041 |
+| 10.4503 | 126 | 0.6737 | - |
+| 10.9529 | 132 | 0.6499 | 0.9903 |
+| 11.4450 | 138 | 0.6149 | - |
+| 11.9476 | 144 | 0.6185 | 0.9889 |
+| 12.4398 | 150 | 0.5492 | - |
+| **12.9424** | **156** | **0.5595** | **0.9878** |
+| 13.4346 | 162 | 0.5146 | - |
+| 13.9372 | 168 | 0.5097 | 0.9927 |
+| 14.4293 | 174 | 0.4584 | - |
+| 14.9319 | 180 | 0.4746 | 0.9912 |
+| 15.4241 | 186 | 0.4331 | - |
+| 15.9267 | 192 | 0.424 | 1.0016 |
+| 16.4188 | 198 | 0.3946 | - |
+| 16.9215 | 204 | 0.4077 | 1.0002 |
+| 17.4136 | 210 | 0.366 | - |
+| 17.9162 | 216 | 0.3721 | 1.0070 |
 
 * The bold row denotes the saved checkpoint.
 
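One practical consequence of the `gradient_accumulation_steps` change above: with a per-device batch size of 32, the effective batch per optimizer step grows from 256 to 512 samples. A minimal sketch of the listed hyperparameters (assuming a single device and the `SentenceTransformerTrainingArguments` API; `output_dir` is a placeholder):

```python
from sentence_transformers import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="output",              # placeholder
    eval_strategy="steps",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=16,   # was 8 before this commit
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=40,
)

# Effective train batch size on one device:
# 32 * 16 = 512 samples per optimizer step (previously 32 * 8 = 256).
```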
 
config.json CHANGED
@@ -1,26 +1,44 @@
 {
-  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
+  "_name_or_path": "Alibaba-NLP/gte-base-en-v1.5",
   "architectures": [
-    "BertModel"
+    "NewModel"
   ],
-  "attention_probs_dropout_prob": 0.1,
+  "attention_probs_dropout_prob": 0.0,
+  "auto_map": {
+    "AutoConfig": "Alibaba-NLP/new-impl--configuration.NewConfig",
+    "AutoModel": "Alibaba-NLP/new-impl--modeling.NewModel",
+    "AutoModelForMaskedLM": "Alibaba-NLP/new-impl--modeling.NewForMaskedLM",
+    "AutoModelForMultipleChoice": "Alibaba-NLP/new-impl--modeling.NewForMultipleChoice",
+    "AutoModelForQuestionAnswering": "Alibaba-NLP/new-impl--modeling.NewForQuestionAnswering",
+    "AutoModelForSequenceClassification": "Alibaba-NLP/new-impl--modeling.NewForSequenceClassification",
+    "AutoModelForTokenClassification": "Alibaba-NLP/new-impl--modeling.NewForTokenClassification"
+  },
   "classifier_dropout": null,
-  "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
-  "hidden_size": 384,
+  "hidden_size": 768,
   "initializer_range": 0.02,
-  "intermediate_size": 1536,
+  "intermediate_size": 3072,
   "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 512,
-  "model_type": "bert",
+  "layer_norm_type": "layer_norm",
+  "logn_attention_clip1": false,
+  "logn_attention_scale": false,
+  "max_position_embeddings": 8192,
+  "model_type": "new",
   "num_attention_heads": 12,
-  "num_hidden_layers": 6,
+  "num_hidden_layers": 12,
+  "pack_qkv": true,
   "pad_token_id": 0,
-  "position_embedding_type": "absolute",
+  "position_embedding_type": "rope",
+  "rope_scaling": {
+    "factor": 2.0,
+    "type": "ntk"
+  },
+  "rope_theta": 500000,
   "torch_dtype": "float32",
   "transformers_version": "4.44.0",
-  "type_vocab_size": 2,
-  "use_cache": true,
-  "vocab_size": 30522
+  "type_vocab_size": 0,
+  "unpad_inputs": false,
+  "use_memory_efficient_attention": false,
+  "vocab_size": 30528
 }
 
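Because the new `auto_map` entries point at remote code (`Alibaba-NLP/new-impl`), plain `transformers` loading needs `trust_remote_code=True`. A minimal sketch against the base model's published config, whose values should match the diff above:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Alibaba-NLP/gte-base-en-v1.5", trust_remote_code=True)
print(config.model_type)               # "new"
print(config.hidden_size)              # 768
print(config.max_position_embeddings)  # 8192
print(config.rope_scaling)             # {'factor': 2.0, 'type': 'ntk'}
```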
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3dc81fcda60ac280c966d16b9cc07ebe8f5e13619f4caa4425c7f6e5dd344a91
-size 90864192
+oid sha256:e9a3cff2df37b85a7a0b3b7953d156029625c6b2ee5b6ed8bea9e8f144b64982
+size 547119128
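As a back-of-the-envelope check on the weight file sizes (assuming essentially all bytes are float32 parameters and ignoring the small safetensors header):

```python
# 4 bytes per float32 parameter
print(90_864_192 / 4)    # ~22.7M parameters (all-MiniLM-L6-v2 scale)
print(547_119_128 / 4)   # ~136.8M parameters (gte-base-en-v1.5 scale)
```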
modules.json CHANGED
@@ -10,11 +10,5 @@
     "name": "1",
     "path": "1_Pooling",
     "type": "sentence_transformers.models.Pooling"
-  },
-  {
-    "idx": 2,
-    "name": "2",
-    "path": "2_Normalize",
-    "type": "sentence_transformers.models.Normalize"
   }
 ]
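Two pipeline changes are visible here and in `1_Pooling/config.json`: pooling switches from mean pooling to CLS-token pooling over 768-dimensional token embeddings, and the trailing `Normalize` module is dropped, so `encode()` output is no longer unit-length by default. A minimal sketch of both points (placeholder Hub id; `normalize_embeddings=True` is only needed when unit vectors are required, e.g. so that dot product equals cosine similarity):

```python
from sentence_transformers import SentenceTransformer, models

# How the updated 1_Pooling/config.json maps onto a Pooling module
pooling = models.Pooling(
    word_embedding_dimension=768,    # hidden size of the new base model
    pooling_mode_cls_token=True,     # CLS pooling enabled in this commit
    pooling_mode_mean_tokens=False,  # mean pooling disabled
)
print(pooling)  # repr shows the same config dict as the model card's architecture block

# Without a Normalize module, request L2-normalized embeddings explicitly when needed
model = SentenceTransformer("Gurveer05/your-model-id", trust_remote_code=True)  # placeholder id
embeddings = model.encode(["an example sentence"], normalize_embeddings=True)
```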
sentence_bert_config.json CHANGED
@@ -1,4 +1,4 @@
 {
-  "max_seq_length": 256,
+  "max_seq_length": 8192,
   "do_lower_case": false
 }
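With `max_seq_length` raised from 256 to 8192, inputs are truncated much later, but attention cost grows quickly with sequence length; the limit can be lowered at load time without retraining. A minimal sketch (placeholder Hub id):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Gurveer05/your-model-id", trust_remote_code=True)  # placeholder id
print(model.max_seq_length)  # 8192 after this commit

# Cap the effective sequence length if memory becomes a concern
model.max_seq_length = 1024
```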
tokenizer.json CHANGED
@@ -2,7 +2,7 @@
   "version": "1.0",
   "truncation": {
     "direction": "Right",
-    "max_length": 256,
+    "max_length": 8192,
     "strategy": "LongestFirst",
     "stride": 0
   },
tokenizer_config.json CHANGED
@@ -43,12 +43,10 @@
   },
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
   "do_lower_case": true,
   "mask_token": "[MASK]",
-  "max_length": 128,
-  "model_max_length": 256,
-  "never_split": null,
+  "max_length": 512,
+  "model_max_length": 8192,
   "pad_to_multiple_of": null,
   "pad_token": "[PAD]",
   "pad_token_type_id": 0,