metadata
language:
- de
- en
- es
- fr
- it
- nl
- pl
- pt
- ru
- zh
library_name: sentence-transformers
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- dataset_size:10K<n<100K
- loss:MatryoshkaLoss
- loss:CosineSimilarityLoss
base_model: aari1995/gbert-large-2-cls-nlisim
metrics:
- pearson_cosine
- spearman_cosine
- pearson_manhattan
- spearman_manhattan
- pearson_euclidean
- spearman_euclidean
- pearson_dot
- spearman_dot
- pearson_max
- spearman_max
widget:
- source_sentence: Ein Mann spricht.
  sentences:
  - Ein Mann spricht in ein Mikrofon.
  - Der Mann spielt auf den Tastaturen.
  - Zwei Mädchen gehen im Ozean spazieren.
- source_sentence: Eine Flagge weht.
  sentences:
  - Die Flagge bewegte sich in der Luft.
  - Ein Hund fährt auf einem Skateboard.
  - Zwei Frauen sitzen in einem Cafe.
- source_sentence: Ein Mann übt Boxen
  sentences:
  - Ein Affe praktiziert Kampfsportarten.
  - Eine Person faltet ein Blatt Papier.
  - Eine Frau geht mit ihrem Hund spazieren.
- source_sentence: Das Tor ist gelb.
  sentences:
  - Das Tor ist blau.
  - Die Frau hält die Hände des Mannes.
  - NATO-Soldat bei afghanischem Angriff getötet
- source_sentence: Zwei Frauen laufen.
  sentences:
  - Frauen laufen.
  - Die Frau prüft die Augen des Mannes.
  - Ein Mann ist auf einem Dach
pipeline_tag: sentence-similarity
model-index:
- name: SentenceTransformer based on aari1995/gbert-large-2-cls-nlisim
results:
- task:
type: semantic-similarity
name: Semantic Similarity
dataset:
name: sts dev 1024
type: sts-dev-1024
metrics:
- type: pearson_cosine
value: 0.8417806877288009
name: Pearson Cosine
- type: spearman_cosine
value: 0.8452891310343582
name: Spearman Cosine
- type: pearson_manhattan
value: 0.8418749526406495
name: Pearson Manhattan
- type: spearman_manhattan
value: 0.8450348906331776
name: Spearman Manhattan
- type: pearson_euclidean
value: 0.8422615095001257
name: Pearson Euclidean
- type: spearman_euclidean
value: 0.8453390990427703
name: Spearman Euclidean
- type: pearson_dot
value: 0.8416625079549063
name: Pearson Dot
- type: spearman_dot
value: 0.8450616171323844
name: Spearman Dot
- type: pearson_max
value: 0.8422615095001257
name: Pearson Max
- type: spearman_max
value: 0.8453390990427703
name: Spearman Max
- task:
type: semantic-similarity
name: Semantic Similarity
dataset:
name: sts dev 768
type: sts-dev-768
metrics:
- type: pearson_cosine
value: 0.8418107096367227
name: Pearson Cosine
- type: spearman_cosine
value: 0.8453863409322975
name: Spearman Cosine
- type: pearson_manhattan
value: 0.8418527770289471
name: Pearson Manhattan
- type: spearman_manhattan
value: 0.8448328869253576
name: Spearman Manhattan
- type: pearson_euclidean
value: 0.8422791953749277
name: Pearson Euclidean
- type: spearman_euclidean
value: 0.8451547857394669
name: Spearman Euclidean
- type: pearson_dot
value: 0.8417682812591724
name: Pearson Dot
- type: spearman_dot
value: 0.8446927200809794
name: Spearman Dot
- type: pearson_max
value: 0.8422791953749277
name: Pearson Max
- type: spearman_max
value: 0.8453863409322975
name: Spearman Max
- task:
type: semantic-similarity
name: Semantic Similarity
dataset:
name: sts dev 512
type: sts-dev-512
metrics:
- type: pearson_cosine
value: 0.8394808864309438
name: Pearson Cosine
- type: spearman_cosine
value: 0.8437551103291275
name: Spearman Cosine
- type: pearson_manhattan
value: 0.8420246416513741
name: Pearson Manhattan
- type: spearman_manhattan
value: 0.8447335398769396
name: Spearman Manhattan
- type: pearson_euclidean
value: 0.8422722079216611
name: Pearson Euclidean
- type: spearman_euclidean
value: 0.8448909261141044
name: Spearman Euclidean
- type: pearson_dot
value: 0.8358204287638725
name: Pearson Dot
- type: spearman_dot
value: 0.8380004733308642
name: Spearman Dot
- type: pearson_max
value: 0.8422722079216611
name: Pearson Max
- type: spearman_max
value: 0.8448909261141044
name: Spearman Max
- task:
type: semantic-similarity
name: Semantic Similarity
dataset:
name: sts dev 256
type: sts-dev-256
metrics:
- type: pearson_cosine
value: 0.833879413726309
name: Pearson Cosine
- type: spearman_cosine
value: 0.8392439788855341
name: Spearman Cosine
- type: pearson_manhattan
value: 0.8379618268497928
name: Pearson Manhattan
- type: spearman_manhattan
value: 0.839860826315925
name: Spearman Manhattan
- type: pearson_euclidean
value: 0.838931461279174
name: Pearson Euclidean
- type: spearman_euclidean
value: 0.8404811150299943
name: Spearman Euclidean
- type: pearson_dot
value: 0.8230557648139373
name: Pearson Dot
- type: spearman_dot
value: 0.8242532718299653
name: Spearman Dot
- type: pearson_max
value: 0.838931461279174
name: Pearson Max
- type: spearman_max
value: 0.8404811150299943
name: Spearman Max
- task:
type: semantic-similarity
name: Semantic Similarity
dataset:
name: sts dev 128
type: sts-dev-128
metrics:
- type: pearson_cosine
value: 0.8253967606033702
name: Pearson Cosine
- type: spearman_cosine
value: 0.8335750690073012
name: Spearman Cosine
- type: pearson_manhattan
value: 0.8341588626988476
name: Pearson Manhattan
- type: spearman_manhattan
value: 0.8343994326050966
name: Spearman Manhattan
- type: pearson_euclidean
value: 0.8355263623880292
name: Pearson Euclidean
- type: spearman_euclidean
value: 0.8358857095028451
name: Spearman Euclidean
- type: pearson_dot
value: 0.8035163216908426
name: Pearson Dot
- type: spearman_dot
value: 0.8050271037746011
name: Spearman Dot
- type: pearson_max
value: 0.8355263623880292
name: Pearson Max
- type: spearman_max
value: 0.8358857095028451
name: Spearman Max
- task:
type: semantic-similarity
name: Semantic Similarity
dataset:
name: sts dev 64
type: sts-dev-64
metrics:
- type: pearson_cosine
value: 0.8150661334039712
name: Pearson Cosine
- type: spearman_cosine
value: 0.8265558538619309
name: Spearman Cosine
- type: pearson_manhattan
value: 0.8241988539394505
name: Pearson Manhattan
- type: spearman_manhattan
value: 0.8238763145175863
name: Spearman Manhattan
- type: pearson_euclidean
value: 0.8274925218859535
name: Pearson Euclidean
- type: spearman_euclidean
value: 0.8270778062044848
name: Spearman Euclidean
- type: pearson_dot
value: 0.7773847317840161
name: Pearson Dot
- type: spearman_dot
value: 0.7790338242936304
name: Spearman Dot
- type: pearson_max
value: 0.8274925218859535
name: Pearson Max
- type: spearman_max
value: 0.8270778062044848
name: Spearman Max
- task:
type: semantic-similarity
name: Semantic Similarity
dataset:
name: sts test 1024
type: sts-test-1024
metrics:
- type: pearson_cosine
value: 0.8130772714952826
name: Pearson Cosine
- type: spearman_cosine
value: 0.8188901246173036
name: Spearman Cosine
- type: pearson_manhattan
value: 0.8208715312691268
name: Pearson Manhattan
- type: spearman_manhattan
value: 0.8195095089412118
name: Spearman Manhattan
- type: pearson_euclidean
value: 0.820344720619671
name: Pearson Euclidean
- type: spearman_euclidean
value: 0.8189263018901494
name: Spearman Euclidean
- type: pearson_dot
value: 0.8127924456922464
name: Pearson Dot
- type: spearman_dot
value: 0.8185815083131535
name: Spearman Dot
- type: pearson_max
value: 0.8208715312691268
name: Pearson Max
- type: spearman_max
value: 0.8195095089412118
name: Spearman Max
- task:
type: semantic-similarity
name: Semantic Similarity
dataset:
name: sts test 768
type: sts-test-768
metrics:
- type: pearson_cosine
value: 0.8121757739236393
name: Pearson Cosine
- type: spearman_cosine
value: 0.8182913347635533
name: Spearman Cosine
- type: pearson_manhattan
value: 0.820604714791802
name: Pearson Manhattan
- type: spearman_manhattan
value: 0.8190481839997107
name: Spearman Manhattan
- type: pearson_euclidean
value: 0.8197462057663948
name: Pearson Euclidean
- type: spearman_euclidean
value: 0.8183157116237637
name: Spearman Euclidean
- type: pearson_dot
value: 0.8106698462984598
name: Pearson Dot
- type: spearman_dot
value: 0.8148932181769889
name: Spearman Dot
- type: pearson_max
value: 0.820604714791802
name: Pearson Max
- type: spearman_max
value: 0.8190481839997107
name: Spearman Max
- task:
type: semantic-similarity
name: Semantic Similarity
dataset:
name: sts test 512
type: sts-test-512
metrics:
- type: pearson_cosine
value: 0.8096452235754106
name: Pearson Cosine
- type: spearman_cosine
value: 0.816264314810491
name: Spearman Cosine
- type: pearson_manhattan
value: 0.8180021560255247
name: Pearson Manhattan
- type: spearman_manhattan
value: 0.8165486306356095
name: Spearman Manhattan
- type: pearson_euclidean
value: 0.8173829404008947
name: Pearson Euclidean
- type: spearman_euclidean
value: 0.8158592878546184
name: Spearman Euclidean
- type: pearson_dot
value: 0.8059176831913651
name: Pearson Dot
- type: spearman_dot
value: 0.8088972406630007
name: Spearman Dot
- type: pearson_max
value: 0.8180021560255247
name: Pearson Max
- type: spearman_max
value: 0.8165486306356095
name: Spearman Max
- task:
type: semantic-similarity
name: Semantic Similarity
dataset:
name: sts test 256
type: sts-test-256
metrics:
- type: pearson_cosine
value: 0.8070921035712145
name: Pearson Cosine
- type: spearman_cosine
value: 0.8150266310280979
name: Spearman Cosine
- type: pearson_manhattan
value: 0.818409081545237
name: Pearson Manhattan
- type: spearman_manhattan
value: 0.8167245415653657
name: Spearman Manhattan
- type: pearson_euclidean
value: 0.8176811220335696
name: Pearson Euclidean
- type: spearman_euclidean
value: 0.8158894222194816
name: Spearman Euclidean
- type: pearson_dot
value: 0.795483328805793
name: Pearson Dot
- type: spearman_dot
value: 0.7956062163122977
name: Spearman Dot
- type: pearson_max
value: 0.818409081545237
name: Pearson Max
- type: spearman_max
value: 0.8167245415653657
name: Spearman Max
- task:
type: semantic-similarity
name: Semantic Similarity
dataset:
name: sts test 128
type: sts-test-128
metrics:
- type: pearson_cosine
value: 0.7974039089035316
name: Pearson Cosine
- type: spearman_cosine
value: 0.8093067652791092
name: Spearman Cosine
- type: pearson_manhattan
value: 0.8125792968401813
name: Pearson Manhattan
- type: spearman_manhattan
value: 0.8121486514324944
name: Spearman Manhattan
- type: pearson_euclidean
value: 0.8119102513178551
name: Pearson Euclidean
- type: spearman_euclidean
value: 0.811152531425261
name: Spearman Euclidean
- type: pearson_dot
value: 0.7739555890021923
name: Pearson Dot
- type: spearman_dot
value: 0.770072655568691
name: Spearman Dot
- type: pearson_max
value: 0.8125792968401813
name: Pearson Max
- type: spearman_max
value: 0.8121486514324944
name: Spearman Max
- task:
type: semantic-similarity
name: Semantic Similarity
dataset:
name: sts test 64
type: sts-test-64
metrics:
- type: pearson_cosine
value: 0.7873069617689994
name: Pearson Cosine
- type: spearman_cosine
value: 0.8024994399645912
name: Spearman Cosine
- type: pearson_manhattan
value: 0.8048161563115213
name: Pearson Manhattan
- type: spearman_manhattan
value: 0.8031972835914969
name: Spearman Manhattan
- type: pearson_euclidean
value: 0.8060416893207731
name: Pearson Euclidean
- type: spearman_euclidean
value: 0.8041515980374414
name: Spearman Euclidean
- type: pearson_dot
value: 0.747911221220991
name: Pearson Dot
- type: spearman_dot
value: 0.7386011869481828
name: Spearman Dot
- type: pearson_max
value: 0.8060416893207731
name: Pearson Max
- type: spearman_max
value: 0.8041515980374414
name: Spearman Max
SentenceTransformer based on aari1995/gbert-large-2-cls-nlisim
This is a sentence-transformers model finetuned from aari1995/gbert-large-2-cls-nlisim on the PhilipMay/stsb_multi_mt dataset. It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
Model Details
Model Description
- Model Type: Sentence Transformer
- Base model: aari1995/gbert-large-2-cls-nlisim
- Maximum Sequence Length: 8192 tokens
- Output Dimensionality: 1024 dimensions
- Similarity Function: Cosine Similarity
- Training Dataset: PhilipMay/stsb_multi_mt
- Languages: de, en, es, fr, it, nl, pl, pt, ru, zh
Model Sources
Full Model Architecture
SentenceTransformer(
(0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: JinaBertModel
(1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
Usage
Direct Usage (Sentence Transformers)
First install the Sentence Transformers library:
pip install -U sentence-transformers
Then you can load this model and run inference.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("aari1995/gbert-large-2-cls-pawsx-nli-sts")
sentences = [
'Zwei Frauen laufen.',
'Frauen laufen.',
'Die Frau prüft die Augen des Mannes.',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
Evaluation
Metrics
Semantic Similarity (dataset: sts-dev-1024)
Metric |
Value |
pearson_cosine |
0.8418 |
spearman_cosine |
0.8453 |
pearson_manhattan |
0.8419 |
spearman_manhattan |
0.845 |
pearson_euclidean |
0.8423 |
spearman_euclidean |
0.8453 |
pearson_dot |
0.8417 |
spearman_dot |
0.8451 |
pearson_max |
0.8423 |
spearman_max |
0.8453 |
Semantic Similarity (dataset: sts-dev-768)
Metric |
Value |
pearson_cosine |
0.8418 |
spearman_cosine |
0.8454 |
pearson_manhattan |
0.8419 |
spearman_manhattan |
0.8448 |
pearson_euclidean |
0.8423 |
spearman_euclidean |
0.8452 |
pearson_dot |
0.8418 |
spearman_dot |
0.8447 |
pearson_max |
0.8423 |
spearman_max |
0.8454 |
Semantic Similarity (dataset: sts-dev-512)
Metric |
Value |
pearson_cosine |
0.8395 |
spearman_cosine |
0.8438 |
pearson_manhattan |
0.842 |
spearman_manhattan |
0.8447 |
pearson_euclidean |
0.8423 |
spearman_euclidean |
0.8449 |
pearson_dot |
0.8358 |
spearman_dot |
0.838 |
pearson_max |
0.8423 |
spearman_max |
0.8449 |
Semantic Similarity (dataset: sts-dev-256)
Metric |
Value |
pearson_cosine |
0.8339 |
spearman_cosine |
0.8392 |
pearson_manhattan |
0.838 |
spearman_manhattan |
0.8399 |
pearson_euclidean |
0.8389 |
spearman_euclidean |
0.8405 |
pearson_dot |
0.8231 |
spearman_dot |
0.8243 |
pearson_max |
0.8389 |
spearman_max |
0.8405 |
Semantic Similarity (dataset: sts-dev-128)
Metric |
Value |
pearson_cosine |
0.8254 |
spearman_cosine |
0.8336 |
pearson_manhattan |
0.8342 |
spearman_manhattan |
0.8344 |
pearson_euclidean |
0.8355 |
spearman_euclidean |
0.8359 |
pearson_dot |
0.8035 |
spearman_dot |
0.805 |
pearson_max |
0.8355 |
spearman_max |
0.8359 |
Semantic Similarity (dataset: sts-dev-64)
Metric |
Value |
pearson_cosine |
0.8151 |
spearman_cosine |
0.8266 |
pearson_manhattan |
0.8242 |
spearman_manhattan |
0.8239 |
pearson_euclidean |
0.8275 |
spearman_euclidean |
0.8271 |
pearson_dot |
0.7774 |
spearman_dot |
0.779 |
pearson_max |
0.8275 |
spearman_max |
0.8271 |
Semantic Similarity (dataset: sts-test-1024)
Metric |
Value |
pearson_cosine |
0.8131 |
spearman_cosine |
0.8189 |
pearson_manhattan |
0.8209 |
spearman_manhattan |
0.8195 |
pearson_euclidean |
0.8203 |
spearman_euclidean |
0.8189 |
pearson_dot |
0.8128 |
spearman_dot |
0.8186 |
pearson_max |
0.8209 |
spearman_max |
0.8195 |
Semantic Similarity (dataset: sts-test-768)
Metric |
Value |
pearson_cosine |
0.8122 |
spearman_cosine |
0.8183 |
pearson_manhattan |
0.8206 |
spearman_manhattan |
0.819 |
pearson_euclidean |
0.8197 |
spearman_euclidean |
0.8183 |
pearson_dot |
0.8107 |
spearman_dot |
0.8149 |
pearson_max |
0.8206 |
spearman_max |
0.819 |
Semantic Similarity (dataset: sts-test-512)
Metric |
Value |
pearson_cosine |
0.8096 |
spearman_cosine |
0.8163 |
pearson_manhattan |
0.818 |
spearman_manhattan |
0.8165 |
pearson_euclidean |
0.8174 |
spearman_euclidean |
0.8159 |
pearson_dot |
0.8059 |
spearman_dot |
0.8089 |
pearson_max |
0.818 |
spearman_max |
0.8165 |
Semantic Similarity (dataset: sts-test-256)
Metric |
Value |
pearson_cosine |
0.8071 |
spearman_cosine |
0.815 |
pearson_manhattan |
0.8184 |
spearman_manhattan |
0.8167 |
pearson_euclidean |
0.8177 |
spearman_euclidean |
0.8159 |
pearson_dot |
0.7955 |
spearman_dot |
0.7956 |
pearson_max |
0.8184 |
spearman_max |
0.8167 |
Semantic Similarity (dataset: sts-test-128)
Metric |
Value |
pearson_cosine |
0.7974 |
spearman_cosine |
0.8093 |
pearson_manhattan |
0.8126 |
spearman_manhattan |
0.8121 |
pearson_euclidean |
0.8119 |
spearman_euclidean |
0.8112 |
pearson_dot |
0.774 |
spearman_dot |
0.7701 |
pearson_max |
0.8126 |
spearman_max |
0.8121 |
Semantic Similarity (dataset: sts-test-64)
Metric |
Value |
pearson_cosine |
0.7873 |
spearman_cosine |
0.8025 |
pearson_manhattan |
0.8048 |
spearman_manhattan |
0.8032 |
pearson_euclidean |
0.806 |
spearman_euclidean |
0.8042 |
pearson_dot |
0.7479 |
spearman_dot |
0.7386 |
pearson_max |
0.806 |
spearman_max |
0.8042 |
Training Details
Training Dataset
PhilipMay/stsb_multi_mt
- Dataset: PhilipMay/stsb_multi_mt at 3acaa3d
- Size: 22,996 training samples
- Columns: sentence1, sentence2, and score
- Approximate statistics based on the first 1000 samples:
|         | sentence1 | sentence2 | score |
|:--------|:----------|:----------|:------|
| type    | string    | string    | float |
| details | min: 6 tokens, mean: 18.13 tokens, max: 65 tokens | min: 6 tokens, mean: 18.25 tokens, max: 90 tokens | min: 0.0, mean: 0.54, max: 1.0 |
- Samples:
| sentence1 | sentence2 | score |
|:----------|:----------|:------|
| schütze wegen mordes an schwarzem us-jugendlichen angeklagt | gedanken zu den rassenbeziehungen unter einem schwarzen präsidenten | 0.1599999964237213 |
| fußballspieler kicken einen fußball in das tor. | Ein Fußballspieler schießt ein Tor. | 0.7599999904632568 |
| obama lockert abschiebungsregeln für junge einwanderer | usa lockert abschiebebestimmungen für jugendliche: napolitano | 0.800000011920929 |
- Loss: MatryoshkaLoss with these parameters:
{
"loss": "CosineSimilarityLoss",
"matryoshka_dims": [
1024,
768,
512,
256,
128,
64
],
"matryoshka_weights": [
1,
1,
1,
1,
1,
1
],
"n_dims_per_step": -1
}
Evaluation Dataset
PhilipMay/stsb_multi_mt
- Dataset: PhilipMay/stsb_multi_mt at 3acaa3d
- Size: 1,500 evaluation samples
- Columns: sentence1, sentence2, and score
- Approximate statistics based on the first 1000 samples:
|         | sentence1 | sentence2 | score |
|:--------|:----------|:----------|:------|
| type    | string    | string    | float |
| details | min: 5 tokens, mean: 16.54 tokens, max: 53 tokens | min: 5 tokens, mean: 16.53 tokens, max: 47 tokens | min: 0.0, mean: 0.47, max: 1.0 |
- Samples:
| sentence1 | sentence2 | score |
|:----------|:----------|:------|
| Ein Mann mit einem Schutzhelm tanzt. | Ein Mann mit einem Schutzhelm tanzt. | 1.0 |
| Ein kleines Kind reitet auf einem Pferd. | Ein Kind reitet auf einem Pferd. | 0.949999988079071 |
| Ein Mann verfüttert eine Maus an eine Schlange. | Der Mann füttert die Schlange mit einer Maus. | 1.0 |
- Loss: MatryoshkaLoss with these parameters:
{
"loss": "CosineSimilarityLoss",
"matryoshka_dims": [
1024,
768,
512,
256,
128,
64
],
"matryoshka_weights": [
1,
1,
1,
1,
1,
1
],
"n_dims_per_step": -1
}
Training Hyperparameters
Non-Default Hyperparameters
- eval_strategy: steps
- per_device_train_batch_size: 4
- per_device_eval_batch_size: 16
- learning_rate: 5e-06
- num_train_epochs: 1
- warmup_ratio: 0.1
- bf16: True
All Hyperparameters
Click to expand
overwrite_output_dir
: False
do_predict
: False
eval_strategy
: steps
prediction_loss_only
: True
per_device_train_batch_size
: 4
per_device_eval_batch_size
: 16
per_gpu_train_batch_size
: None
per_gpu_eval_batch_size
: None
gradient_accumulation_steps
: 1
eval_accumulation_steps
: None
learning_rate
: 5e-06
weight_decay
: 0.0
adam_beta1
: 0.9
adam_beta2
: 0.999
adam_epsilon
: 1e-08
max_grad_norm
: 1.0
num_train_epochs
: 1
max_steps
: -1
lr_scheduler_type
: linear
lr_scheduler_kwargs
: {}
warmup_ratio
: 0.1
warmup_steps
: 0
log_level
: passive
log_level_replica
: warning
log_on_each_node
: True
logging_nan_inf_filter
: True
save_safetensors
: True
save_on_each_node
: False
save_only_model
: False
restore_callback_states_from_checkpoint
: False
no_cuda
: False
use_cpu
: False
use_mps_device
: False
seed
: 42
data_seed
: None
jit_mode_eval
: False
use_ipex
: False
bf16
: True
fp16
: False
fp16_opt_level
: O1
half_precision_backend
: auto
bf16_full_eval
: False
fp16_full_eval
: False
tf32
: None
local_rank
: 0
ddp_backend
: None
tpu_num_cores
: None
tpu_metrics_debug
: False
debug
: []
dataloader_drop_last
: False
dataloader_num_workers
: 0
dataloader_prefetch_factor
: None
past_index
: -1
disable_tqdm
: False
remove_unused_columns
: True
label_names
: None
load_best_model_at_end
: False
ignore_data_skip
: False
fsdp
: []
fsdp_min_num_params
: 0
fsdp_config
: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
fsdp_transformer_layer_cls_to_wrap
: None
accelerator_config
: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
deepspeed
: None
label_smoothing_factor
: 0.0
optim
: adamw_torch
optim_args
: None
adafactor
: False
group_by_length
: False
length_column_name
: length
ddp_find_unused_parameters
: None
ddp_bucket_cap_mb
: None
ddp_broadcast_buffers
: False
dataloader_pin_memory
: True
dataloader_persistent_workers
: False
skip_memory_metrics
: True
use_legacy_prediction_loop
: False
push_to_hub
: False
resume_from_checkpoint
: None
hub_model_id
: None
hub_strategy
: every_save
hub_private_repo
: False
hub_always_push
: False
gradient_checkpointing
: False
gradient_checkpointing_kwargs
: None
include_inputs_for_metrics
: False
eval_do_concat_batches
: True
fp16_backend
: auto
push_to_hub_model_id
: None
push_to_hub_organization
: None
mp_parameters
:
auto_find_batch_size
: False
full_determinism
: False
torchdynamo
: None
ray_scope
: last
ddp_timeout
: 1800
torch_compile
: False
torch_compile_backend
: None
torch_compile_mode
: None
dispatch_batches
: None
split_batches
: None
include_tokens_per_second
: False
include_num_input_tokens_seen
: False
neftune_noise_alpha
: None
optim_target_modules
: None
batch_eval_metrics
: False
eval_on_start
: False
batch_sampler
: batch_sampler
multi_dataset_batch_sampler
: proportional
Training Logs
Epoch |
Step |
Training Loss |
loss |
sts-dev-1024_spearman_cosine |
sts-dev-128_spearman_cosine |
sts-dev-256_spearman_cosine |
sts-dev-512_spearman_cosine |
sts-dev-64_spearman_cosine |
sts-dev-768_spearman_cosine |
sts-test-1024_spearman_cosine |
sts-test-128_spearman_cosine |
sts-test-256_spearman_cosine |
sts-test-512_spearman_cosine |
sts-test-64_spearman_cosine |
sts-test-768_spearman_cosine |
0.0174 |
100 |
0.2958 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.0348 |
200 |
0.2914 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.0522 |
300 |
0.2691 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.0696 |
400 |
0.253 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.0870 |
500 |
0.2458 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.1044 |
600 |
0.2594 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.1218 |
700 |
0.2339 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.1392 |
800 |
0.2245 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.1565 |
900 |
0.2122 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.1739 |
1000 |
0.2369 |
0.2394 |
0.8402 |
0.8277 |
0.8352 |
0.8393 |
0.8164 |
0.8404 |
- |
- |
- |
- |
- |
- |
0.1913 |
1100 |
0.2308 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.2087 |
1200 |
0.2292 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.2261 |
1300 |
0.2232 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.2435 |
1400 |
0.2001 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.2609 |
1500 |
0.2139 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.2783 |
1600 |
0.1906 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.2957 |
1700 |
0.1895 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.3131 |
1800 |
0.2011 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.3305 |
1900 |
0.1723 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.3479 |
2000 |
0.1886 |
0.2340 |
0.8448 |
0.8321 |
0.8385 |
0.8435 |
0.8233 |
0.8449 |
- |
- |
- |
- |
- |
- |
0.3653 |
2100 |
0.1719 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.3827 |
2200 |
0.1879 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.4001 |
2300 |
0.187 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.4175 |
2400 |
0.1487 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.4349 |
2500 |
0.1752 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.4523 |
2600 |
0.1475 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.4696 |
2700 |
0.1695 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.4870 |
2800 |
0.1615 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.5044 |
2900 |
0.1558 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.5218 |
3000 |
0.1713 |
0.2357 |
0.8457 |
0.8344 |
0.8406 |
0.8447 |
0.8266 |
0.8461 |
- |
- |
- |
- |
- |
- |
0.5392 |
3100 |
0.1556 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.5566 |
3200 |
0.1743 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.5740 |
3300 |
0.1426 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.5914 |
3400 |
0.1519 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.6088 |
3500 |
0.1763 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.6262 |
3600 |
0.1456 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.6436 |
3700 |
0.1649 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.6610 |
3800 |
0.1427 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.6784 |
3900 |
0.1284 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.6958 |
4000 |
0.1533 |
0.2344 |
0.8417 |
0.8291 |
0.8357 |
0.8402 |
0.8225 |
0.8421 |
- |
- |
- |
- |
- |
- |
0.7132 |
4100 |
0.1397 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.7306 |
4200 |
0.1505 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.7480 |
4300 |
0.1355 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.7654 |
4400 |
0.1275 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.7827 |
4500 |
0.1599 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.8001 |
4600 |
0.1493 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.8175 |
4700 |
0.1497 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.8349 |
4800 |
0.1492 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.8523 |
4900 |
0.1378 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.8697 |
5000 |
0.1391 |
0.2362 |
0.8453 |
0.8336 |
0.8392 |
0.8438 |
0.8266 |
0.8454 |
- |
- |
- |
- |
- |
- |
0.8871 |
5100 |
0.1622 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.9045 |
5200 |
0.1456 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.9219 |
5300 |
0.1367 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.9393 |
5400 |
0.1243 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.9567 |
5500 |
0.1389 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.9741 |
5600 |
0.1338 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
0.9915 |
5700 |
0.1146 |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
1.0 |
5749 |
- |
- |
- |
- |
- |
- |
- |
- |
0.8189 |
0.8093 |
0.8150 |
0.8163 |
0.8025 |
0.8183 |
Framework Versions
- Python: 3.9.16
- Sentence Transformers: 3.0.0
- Transformers: 4.42.0.dev0
- PyTorch: 2.2.2+cu118
- Accelerate: 0.31.0
- Datasets: 2.19.1
- Tokenizers: 0.19.1
Citation
BibTeX
Sentence Transformers
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
MatryoshkaLoss
@misc{kusupati2024matryoshka,
title={Matryoshka Representation Learning},
author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
year={2024},
eprint={2205.13147},
archivePrefix={arXiv},
primaryClass={cs.LG}
}