German_Semantic_V3 / README.md
aari1995's picture
Add new SentenceTransformer model.
4eccf7a verified
|
raw
history blame
61.9 kB
metadata
language:
  - de
  - en
  - es
  - fr
  - it
  - nl
  - pl
  - pt
  - ru
  - zh
library_name: sentence-transformers
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - dataset_size:10K<n<100K
  - loss:MatryoshkaLoss
  - loss:CosineSimilarityLoss
base_model: aari1995/gbert-large-2-cls-nlisim
metrics:
  - pearson_cosine
  - spearman_cosine
  - pearson_manhattan
  - spearman_manhattan
  - pearson_euclidean
  - spearman_euclidean
  - pearson_dot
  - spearman_dot
  - pearson_max
  - spearman_max
widget:
  - source_sentence: Ein Mann spricht.
    sentences:
      - Ein Mann spricht in ein Mikrofon.
      - Der Mann spielt auf den Tastaturen.
      - Zwei Mädchen gehen im Ozean spazieren.
  - source_sentence: Eine Flagge weht.
    sentences:
      - Die Flagge bewegte sich in der Luft.
      - Ein Hund fährt auf einem Skateboard.
      - Zwei Frauen sitzen in einem Cafe.
  - source_sentence: Ein Mann übt Boxen
    sentences:
      - Ein Affe praktiziert Kampfsportarten.
      - Eine Person faltet ein Blatt Papier.
      - Eine Frau geht mit ihrem Hund spazieren.
  - source_sentence: Das Tor ist gelb.
    sentences:
      - Das Tor ist blau.
      - Die Frau hält die Hände des Mannes.
      - NATO-Soldat bei afghanischem Angriff getötet
  - source_sentence: Zwei Frauen laufen.
    sentences:
      - Frauen laufen.
      - Die Frau prüft die Augen des Mannes.
      - Ein Mann ist auf einem Dach
pipeline_tag: sentence-similarity
model-index:
  - name: SentenceTransformer based on aari1995/gbert-large-2-cls-nlisim
    results:
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 1024
          type: sts-dev-1024
        metrics:
          - type: pearson_cosine
            value: 0.8417806877288009
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8452891310343582
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8418749526406495
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8450348906331776
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8422615095001257
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8453390990427703
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.8416625079549063
            name: Pearson Dot
          - type: spearman_dot
            value: 0.8450616171323844
            name: Spearman Dot
          - type: pearson_max
            value: 0.8422615095001257
            name: Pearson Max
          - type: spearman_max
            value: 0.8453390990427703
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 768
          type: sts-dev-768
        metrics:
          - type: pearson_cosine
            value: 0.8418107096367227
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8453863409322975
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8418527770289471
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8448328869253576
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8422791953749277
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8451547857394669
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.8417682812591724
            name: Pearson Dot
          - type: spearman_dot
            value: 0.8446927200809794
            name: Spearman Dot
          - type: pearson_max
            value: 0.8422791953749277
            name: Pearson Max
          - type: spearman_max
            value: 0.8453863409322975
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 512
          type: sts-dev-512
        metrics:
          - type: pearson_cosine
            value: 0.8394808864309438
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8437551103291275
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8420246416513741
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8447335398769396
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8422722079216611
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8448909261141044
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.8358204287638725
            name: Pearson Dot
          - type: spearman_dot
            value: 0.8380004733308642
            name: Spearman Dot
          - type: pearson_max
            value: 0.8422722079216611
            name: Pearson Max
          - type: spearman_max
            value: 0.8448909261141044
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 256
          type: sts-dev-256
        metrics:
          - type: pearson_cosine
            value: 0.833879413726309
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8392439788855341
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8379618268497928
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.839860826315925
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.838931461279174
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8404811150299943
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.8230557648139373
            name: Pearson Dot
          - type: spearman_dot
            value: 0.8242532718299653
            name: Spearman Dot
          - type: pearson_max
            value: 0.838931461279174
            name: Pearson Max
          - type: spearman_max
            value: 0.8404811150299943
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 128
          type: sts-dev-128
        metrics:
          - type: pearson_cosine
            value: 0.8253967606033702
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8335750690073012
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8341588626988476
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8343994326050966
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8355263623880292
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8358857095028451
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.8035163216908426
            name: Pearson Dot
          - type: spearman_dot
            value: 0.8050271037746011
            name: Spearman Dot
          - type: pearson_max
            value: 0.8355263623880292
            name: Pearson Max
          - type: spearman_max
            value: 0.8358857095028451
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 64
          type: sts-dev-64
        metrics:
          - type: pearson_cosine
            value: 0.8150661334039712
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8265558538619309
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8241988539394505
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8238763145175863
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8274925218859535
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8270778062044848
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.7773847317840161
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7790338242936304
            name: Spearman Dot
          - type: pearson_max
            value: 0.8274925218859535
            name: Pearson Max
          - type: spearman_max
            value: 0.8270778062044848
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 1024
          type: sts-test-1024
        metrics:
          - type: pearson_cosine
            value: 0.8130772714952826
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8188901246173036
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8208715312691268
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8195095089412118
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.820344720619671
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8189263018901494
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.8127924456922464
            name: Pearson Dot
          - type: spearman_dot
            value: 0.8185815083131535
            name: Spearman Dot
          - type: pearson_max
            value: 0.8208715312691268
            name: Pearson Max
          - type: spearman_max
            value: 0.8195095089412118
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 768
          type: sts-test-768
        metrics:
          - type: pearson_cosine
            value: 0.8121757739236393
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8182913347635533
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.820604714791802
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8190481839997107
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8197462057663948
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8183157116237637
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.8106698462984598
            name: Pearson Dot
          - type: spearman_dot
            value: 0.8148932181769889
            name: Spearman Dot
          - type: pearson_max
            value: 0.820604714791802
            name: Pearson Max
          - type: spearman_max
            value: 0.8190481839997107
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 512
          type: sts-test-512
        metrics:
          - type: pearson_cosine
            value: 0.8096452235754106
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.816264314810491
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8180021560255247
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8165486306356095
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8173829404008947
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8158592878546184
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.8059176831913651
            name: Pearson Dot
          - type: spearman_dot
            value: 0.8088972406630007
            name: Spearman Dot
          - type: pearson_max
            value: 0.8180021560255247
            name: Pearson Max
          - type: spearman_max
            value: 0.8165486306356095
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 256
          type: sts-test-256
        metrics:
          - type: pearson_cosine
            value: 0.8070921035712145
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8150266310280979
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.818409081545237
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8167245415653657
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8176811220335696
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8158894222194816
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.795483328805793
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7956062163122977
            name: Spearman Dot
          - type: pearson_max
            value: 0.818409081545237
            name: Pearson Max
          - type: spearman_max
            value: 0.8167245415653657
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 128
          type: sts-test-128
        metrics:
          - type: pearson_cosine
            value: 0.7974039089035316
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8093067652791092
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8125792968401813
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8121486514324944
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8119102513178551
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.811152531425261
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.7739555890021923
            name: Pearson Dot
          - type: spearman_dot
            value: 0.770072655568691
            name: Spearman Dot
          - type: pearson_max
            value: 0.8125792968401813
            name: Pearson Max
          - type: spearman_max
            value: 0.8121486514324944
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 64
          type: sts-test-64
        metrics:
          - type: pearson_cosine
            value: 0.7873069617689994
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8024994399645912
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8048161563115213
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8031972835914969
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8060416893207731
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8041515980374414
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.747911221220991
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7386011869481828
            name: Spearman Dot
          - type: pearson_max
            value: 0.8060416893207731
            name: Pearson Max
          - type: spearman_max
            value: 0.8041515980374414
            name: Spearman Max

SentenceTransformer based on aari1995/gbert-large-2-cls-nlisim

This is a sentence-transformers model finetuned from aari1995/gbert-large-2-cls-nlisim on the PhilipMay/stsb_multi_mt dataset. It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

Model Details

Model Description

  • Model Type: Sentence Transformer
  • Base model: aari1995/gbert-large-2-cls-nlisim
  • Maximum Sequence Length: 8192 tokens
  • Output Dimensionality: 1024 dimensions
  • Similarity Function: Cosine Similarity
  • Training Dataset: PhilipMay/stsb_multi_mt
  • Languages: de, en, es, fr, it, nl, pl, pt, ru, zh

Model Sources

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: JinaBertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("aari1995/gbert-large-2-cls-pawsx-nli-sts")
# Run inference
sentences = [
    'Zwei Frauen laufen.',
    'Frauen laufen.',
    'Die Frau prüft die Augen des Mannes.',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 1024]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]

Evaluation

Metrics

Semantic Similarity (dataset: sts-dev-1024)

Metric Value
pearson_cosine 0.8418
spearman_cosine 0.8453
pearson_manhattan 0.8419
spearman_manhattan 0.845
pearson_euclidean 0.8423
spearman_euclidean 0.8453
pearson_dot 0.8417
spearman_dot 0.8451
pearson_max 0.8423
spearman_max 0.8453

Semantic Similarity (dataset: sts-dev-768)

Metric Value
pearson_cosine 0.8418
spearman_cosine 0.8454
pearson_manhattan 0.8419
spearman_manhattan 0.8448
pearson_euclidean 0.8423
spearman_euclidean 0.8452
pearson_dot 0.8418
spearman_dot 0.8447
pearson_max 0.8423
spearman_max 0.8454

Semantic Similarity (dataset: sts-dev-512)

Metric Value
pearson_cosine 0.8395
spearman_cosine 0.8438
pearson_manhattan 0.842
spearman_manhattan 0.8447
pearson_euclidean 0.8423
spearman_euclidean 0.8449
pearson_dot 0.8358
spearman_dot 0.838
pearson_max 0.8423
spearman_max 0.8449

Semantic Similarity (dataset: sts-dev-256)

Metric Value
pearson_cosine 0.8339
spearman_cosine 0.8392
pearson_manhattan 0.838
spearman_manhattan 0.8399
pearson_euclidean 0.8389
spearman_euclidean 0.8405
pearson_dot 0.8231
spearman_dot 0.8243
pearson_max 0.8389
spearman_max 0.8405

Semantic Similarity (dataset: sts-dev-128)

Metric Value
pearson_cosine 0.8254
spearman_cosine 0.8336
pearson_manhattan 0.8342
spearman_manhattan 0.8344
pearson_euclidean 0.8355
spearman_euclidean 0.8359
pearson_dot 0.8035
spearman_dot 0.805
pearson_max 0.8355
spearman_max 0.8359

Semantic Similarity (dataset: sts-dev-64)

Metric Value
pearson_cosine 0.8151
spearman_cosine 0.8266
pearson_manhattan 0.8242
spearman_manhattan 0.8239
pearson_euclidean 0.8275
spearman_euclidean 0.8271
pearson_dot 0.7774
spearman_dot 0.779
pearson_max 0.8275
spearman_max 0.8271

Semantic Similarity (dataset: sts-test-1024)

Metric Value
pearson_cosine 0.8131
spearman_cosine 0.8189
pearson_manhattan 0.8209
spearman_manhattan 0.8195
pearson_euclidean 0.8203
spearman_euclidean 0.8189
pearson_dot 0.8128
spearman_dot 0.8186
pearson_max 0.8209
spearman_max 0.8195

Semantic Similarity (dataset: sts-test-768)

Metric Value
pearson_cosine 0.8122
spearman_cosine 0.8183
pearson_manhattan 0.8206
spearman_manhattan 0.819
pearson_euclidean 0.8197
spearman_euclidean 0.8183
pearson_dot 0.8107
spearman_dot 0.8149
pearson_max 0.8206
spearman_max 0.819

Semantic Similarity (dataset: sts-test-512)

Metric Value
pearson_cosine 0.8096
spearman_cosine 0.8163
pearson_manhattan 0.818
spearman_manhattan 0.8165
pearson_euclidean 0.8174
spearman_euclidean 0.8159
pearson_dot 0.8059
spearman_dot 0.8089
pearson_max 0.818
spearman_max 0.8165

Semantic Similarity (dataset: sts-test-256)

Metric Value
pearson_cosine 0.8071
spearman_cosine 0.815
pearson_manhattan 0.8184
spearman_manhattan 0.8167
pearson_euclidean 0.8177
spearman_euclidean 0.8159
pearson_dot 0.7955
spearman_dot 0.7956
pearson_max 0.8184
spearman_max 0.8167

Semantic Similarity (dataset: sts-test-128)

Metric Value
pearson_cosine 0.7974
spearman_cosine 0.8093
pearson_manhattan 0.8126
spearman_manhattan 0.8121
pearson_euclidean 0.8119
spearman_euclidean 0.8112
pearson_dot 0.774
spearman_dot 0.7701
pearson_max 0.8126
spearman_max 0.8121

Semantic Similarity (dataset: sts-test-64)

Metric Value
pearson_cosine 0.7873
spearman_cosine 0.8025
pearson_manhattan 0.8048
spearman_manhattan 0.8032
pearson_euclidean 0.806
spearman_euclidean 0.8042
pearson_dot 0.7479
spearman_dot 0.7386
pearson_max 0.806
spearman_max 0.8042

Training Details

Training Dataset

PhilipMay/stsb_multi_mt

  • Dataset: PhilipMay/stsb_multi_mt at 3acaa3d
  • Size: 22,996 training samples
  • Columns: sentence1, sentence2, and score
  • Approximate statistics based on the first 1000 samples:
    |         | sentence1 | sentence2 | score |
    |---------|-----------|-----------|-------|
    | type    | string    | string    | float |
    | details | min: 6 tokens; mean: 18.13 tokens; max: 65 tokens | min: 6 tokens; mean: 18.25 tokens; max: 90 tokens | min: 0.0; mean: 0.54; max: 1.0 |
  • Samples:
    | sentence1 | sentence2 | score |
    |-----------|-----------|-------|
    | schütze wegen mordes an schwarzem us-jugendlichen angeklagt | gedanken zu den rassenbeziehungen unter einem schwarzen präsidenten | 0.1599999964237213 |
    | fußballspieler kicken einen fußball in das tor. | Ein Fußballspieler schießt ein Tor. | 0.7599999904632568 |
    | obama lockert abschiebungsregeln für junge einwanderer | usa lockert abschiebebestimmungen für jugendliche: napolitano | 0.800000011920929 |
  • Loss: MatryoshkaLoss with these parameters:
    {
        "loss": "CosineSimilarityLoss",
        "matryoshka_dims": [
            1024,
            768,
            512,
            256,
            128,
            64
        ],
        "matryoshka_weights": [
            1,
            1,
            1,
            1,
            1,
            1
        ],
        "n_dims_per_step": -1
    }
    

Evaluation Dataset

PhilipMay/stsb_multi_mt

  • Dataset: PhilipMay/stsb_multi_mt at 3acaa3d
  • Size: 1,500 evaluation samples
  • Columns: sentence1, sentence2, and score
  • Approximate statistics based on the first 1000 samples:
    |         | sentence1 | sentence2 | score |
    |---------|-----------|-----------|-------|
    | type    | string    | string    | float |
    | details | min: 5 tokens; mean: 16.54 tokens; max: 53 tokens | min: 5 tokens; mean: 16.53 tokens; max: 47 tokens | min: 0.0; mean: 0.47; max: 1.0 |
  • Samples:
    | sentence1 | sentence2 | score |
    |-----------|-----------|-------|
    | Ein Mann mit einem Schutzhelm tanzt. | Ein Mann mit einem Schutzhelm tanzt. | 1.0 |
    | Ein kleines Kind reitet auf einem Pferd. | Ein Kind reitet auf einem Pferd. | 0.949999988079071 |
    | Ein Mann verfüttert eine Maus an eine Schlange. | Der Mann füttert die Schlange mit einer Maus. | 1.0 |
  • Loss: MatryoshkaLoss with these parameters:
    {
        "loss": "CosineSimilarityLoss",
        "matryoshka_dims": [
            1024,
            768,
            512,
            256,
            128,
            64
        ],
        "matryoshka_weights": [
            1,
            1,
            1,
            1,
            1,
            1
        ],
        "n_dims_per_step": -1
    }
    

Training Hyperparameters

Non-Default Hyperparameters

  • eval_strategy: steps
  • per_device_train_batch_size: 4
  • per_device_eval_batch_size: 16
  • learning_rate: 5e-06
  • num_train_epochs: 1
  • warmup_ratio: 0.1
  • bf16: True

All Hyperparameters

Click to expand
  • overwrite_output_dir: False
  • do_predict: False
  • eval_strategy: steps
  • prediction_loss_only: True
  • per_device_train_batch_size: 4
  • per_device_eval_batch_size: 16
  • per_gpu_train_batch_size: None
  • per_gpu_eval_batch_size: None
  • gradient_accumulation_steps: 1
  • eval_accumulation_steps: None
  • learning_rate: 5e-06
  • weight_decay: 0.0
  • adam_beta1: 0.9
  • adam_beta2: 0.999
  • adam_epsilon: 1e-08
  • max_grad_norm: 1.0
  • num_train_epochs: 1
  • max_steps: -1
  • lr_scheduler_type: linear
  • lr_scheduler_kwargs: {}
  • warmup_ratio: 0.1
  • warmup_steps: 0
  • log_level: passive
  • log_level_replica: warning
  • log_on_each_node: True
  • logging_nan_inf_filter: True
  • save_safetensors: True
  • save_on_each_node: False
  • save_only_model: False
  • restore_callback_states_from_checkpoint: False
  • no_cuda: False
  • use_cpu: False
  • use_mps_device: False
  • seed: 42
  • data_seed: None
  • jit_mode_eval: False
  • use_ipex: False
  • bf16: True
  • fp16: False
  • fp16_opt_level: O1
  • half_precision_backend: auto
  • bf16_full_eval: False
  • fp16_full_eval: False
  • tf32: None
  • local_rank: 0
  • ddp_backend: None
  • tpu_num_cores: None
  • tpu_metrics_debug: False
  • debug: []
  • dataloader_drop_last: False
  • dataloader_num_workers: 0
  • dataloader_prefetch_factor: None
  • past_index: -1
  • disable_tqdm: False
  • remove_unused_columns: True
  • label_names: None
  • load_best_model_at_end: False
  • ignore_data_skip: False
  • fsdp: []
  • fsdp_min_num_params: 0
  • fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
  • fsdp_transformer_layer_cls_to_wrap: None
  • accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
  • deepspeed: None
  • label_smoothing_factor: 0.0
  • optim: adamw_torch
  • optim_args: None
  • adafactor: False
  • group_by_length: False
  • length_column_name: length
  • ddp_find_unused_parameters: None
  • ddp_bucket_cap_mb: None
  • ddp_broadcast_buffers: False
  • dataloader_pin_memory: True
  • dataloader_persistent_workers: False
  • skip_memory_metrics: True
  • use_legacy_prediction_loop: False
  • push_to_hub: False
  • resume_from_checkpoint: None
  • hub_model_id: None
  • hub_strategy: every_save
  • hub_private_repo: False
  • hub_always_push: False
  • gradient_checkpointing: False
  • gradient_checkpointing_kwargs: None
  • include_inputs_for_metrics: False
  • eval_do_concat_batches: True
  • fp16_backend: auto
  • push_to_hub_model_id: None
  • push_to_hub_organization: None
  • mp_parameters:
  • auto_find_batch_size: False
  • full_determinism: False
  • torchdynamo: None
  • ray_scope: last
  • ddp_timeout: 1800
  • torch_compile: False
  • torch_compile_backend: None
  • torch_compile_mode: None
  • dispatch_batches: None
  • split_batches: None
  • include_tokens_per_second: False
  • include_num_input_tokens_seen: False
  • neftune_noise_alpha: None
  • optim_target_modules: None
  • batch_eval_metrics: False
  • eval_on_start: False
  • batch_sampler: batch_sampler
  • multi_dataset_batch_sampler: proportional

Training Logs

Epoch Step Training Loss loss sts-dev-1024_spearman_cosine sts-dev-128_spearman_cosine sts-dev-256_spearman_cosine sts-dev-512_spearman_cosine sts-dev-64_spearman_cosine sts-dev-768_spearman_cosine sts-test-1024_spearman_cosine sts-test-128_spearman_cosine sts-test-256_spearman_cosine sts-test-512_spearman_cosine sts-test-64_spearman_cosine sts-test-768_spearman_cosine
0.0174 100 0.2958 - - - - - - - - - - - - -
0.0348 200 0.2914 - - - - - - - - - - - - -
0.0522 300 0.2691 - - - - - - - - - - - - -
0.0696 400 0.253 - - - - - - - - - - - - -
0.0870 500 0.2458 - - - - - - - - - - - - -
0.1044 600 0.2594 - - - - - - - - - - - - -
0.1218 700 0.2339 - - - - - - - - - - - - -
0.1392 800 0.2245 - - - - - - - - - - - - -
0.1565 900 0.2122 - - - - - - - - - - - - -
0.1739 1000 0.2369 0.2394 0.8402 0.8277 0.8352 0.8393 0.8164 0.8404 - - - - - -
0.1913 1100 0.2308 - - - - - - - - - - - - -
0.2087 1200 0.2292 - - - - - - - - - - - - -
0.2261 1300 0.2232 - - - - - - - - - - - - -
0.2435 1400 0.2001 - - - - - - - - - - - - -
0.2609 1500 0.2139 - - - - - - - - - - - - -
0.2783 1600 0.1906 - - - - - - - - - - - - -
0.2957 1700 0.1895 - - - - - - - - - - - - -
0.3131 1800 0.2011 - - - - - - - - - - - - -
0.3305 1900 0.1723 - - - - - - - - - - - - -
0.3479 2000 0.1886 0.2340 0.8448 0.8321 0.8385 0.8435 0.8233 0.8449 - - - - - -
0.3653 2100 0.1719 - - - - - - - - - - - - -
0.3827 2200 0.1879 - - - - - - - - - - - - -
0.4001 2300 0.187 - - - - - - - - - - - - -
0.4175 2400 0.1487 - - - - - - - - - - - - -
0.4349 2500 0.1752 - - - - - - - - - - - - -
0.4523 2600 0.1475 - - - - - - - - - - - - -
0.4696 2700 0.1695 - - - - - - - - - - - - -
0.4870 2800 0.1615 - - - - - - - - - - - - -
0.5044 2900 0.1558 - - - - - - - - - - - - -
0.5218 3000 0.1713 0.2357 0.8457 0.8344 0.8406 0.8447 0.8266 0.8461 - - - - - -
0.5392 3100 0.1556 - - - - - - - - - - - - -
0.5566 3200 0.1743 - - - - - - - - - - - - -
0.5740 3300 0.1426 - - - - - - - - - - - - -
0.5914 3400 0.1519 - - - - - - - - - - - - -
0.6088 3500 0.1763 - - - - - - - - - - - - -
0.6262 3600 0.1456 - - - - - - - - - - - - -
0.6436 3700 0.1649 - - - - - - - - - - - - -
0.6610 3800 0.1427 - - - - - - - - - - - - -
0.6784 3900 0.1284 - - - - - - - - - - - - -
0.6958 4000 0.1533 0.2344 0.8417 0.8291 0.8357 0.8402 0.8225 0.8421 - - - - - -
0.7132 4100 0.1397 - - - - - - - - - - - - -
0.7306 4200 0.1505 - - - - - - - - - - - - -
0.7480 4300 0.1355 - - - - - - - - - - - - -
0.7654 4400 0.1275 - - - - - - - - - - - - -
0.7827 4500 0.1599 - - - - - - - - - - - - -
0.8001 4600 0.1493 - - - - - - - - - - - - -
0.8175 4700 0.1497 - - - - - - - - - - - - -
0.8349 4800 0.1492 - - - - - - - - - - - - -
0.8523 4900 0.1378 - - - - - - - - - - - - -
0.8697 5000 0.1391 0.2362 0.8453 0.8336 0.8392 0.8438 0.8266 0.8454 - - - - - -
0.8871 5100 0.1622 - - - - - - - - - - - - -
0.9045 5200 0.1456 - - - - - - - - - - - - -
0.9219 5300 0.1367 - - - - - - - - - - - - -
0.9393 5400 0.1243 - - - - - - - - - - - - -
0.9567 5500 0.1389 - - - - - - - - - - - - -
0.9741 5600 0.1338 - - - - - - - - - - - - -
0.9915 5700 0.1146 - - - - - - - - - - - - -
1.0 5749 - - - - - - - - 0.8189 0.8093 0.8150 0.8163 0.8025 0.8183

Framework Versions

  • Python: 3.9.16
  • Sentence Transformers: 3.0.0
  • Transformers: 4.42.0.dev0
  • PyTorch: 2.2.2+cu118
  • Accelerate: 0.31.0
  • Datasets: 2.19.1
  • Tokenizers: 0.19.1

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MatryoshkaLoss

@misc{kusupati2024matryoshka,
    title={Matryoshka Representation Learning}, 
    author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
    year={2024},
    eprint={2205.13147},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}