metadata
base_model: Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2
datasets:
- Omartificial-Intelligence-Space/Arabic-stsb
- Omartificial-Intelligence-Space/Arabic-NLi-Pair-Class
language:
- ar
library_name: sentence-transformers
metrics:
- pearson_cosine
- spearman_cosine
- pearson_manhattan
- spearman_manhattan
- pearson_euclidean
- spearman_euclidean
- pearson_dot
- spearman_dot
- pearson_max
- spearman_max
pipeline_tag: sentence-similarity
tags:
- mteb
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- dataset_size:947818
- loss:SoftmaxLoss
- loss:CosineSimilarityLoss
- transformers
model-index:
- name: Omartificial-Intelligence-Space/GATE-AraBert-v1
results:
- dataset:
config: ar
name: MTEB MIRACLRetrievalHardNegatives (ar)
revision: 95c8db7d4a6e9c1d8a60601afd63d553ae20a2eb
split: dev
type: miracl/mmteb-miracl-hardnegatives
metrics:
- type: main_score
value: 57.737
- type: map_at_1
value: 32.108
- type: map_at_3
value: 44.405
- type: map_at_5
value: 47.164
- type: map_at_10
value: 49.477
- type: ndcg_at_1
value: 48.7
- type: ndcg_at_3
value: 51.161
- type: ndcg_at_5
value: 53.923
- type: ndcg_at_10
value: 57.737
- type: recall_at_1
value: 32.108
- type: recall_at_3
value: 52.675
- type: recall_at_5
value: 60.709
- type: recall_at_10
value: 70.61
- type: precision_at_1
value: 48.7
- type: precision_at_3
value: 29.7
- type: precision_at_5
value: 21.34
- type: precision_at_10
value: 12.98
- type: mrr_at_1
value: 48.7
- type: mrr_at_3
value: 57.5333
- type: mrr_at_5
value: 59.1333
- type: mrr_at_10
value: 60.1163
task:
type: Retrieval
- dataset:
config: ar
name: MTEB MintakaRetrieval (ar)
revision: efa78cc2f74bbcd21eff2261f9e13aebe40b814e
split: test
type: mintaka/mmteb-mintaka
metrics:
- type: main_score
value: 19.13
- type: map_at_1
value: 11.212
- type: map_at_3
value: 14.904
- type: map_at_5
value: 15.719
- type: map_at_10
value: 16.375
- type: ndcg_at_1
value: 11.212
- type: ndcg_at_3
value: 16.08
- type: ndcg_at_5
value: 17.543
- type: ndcg_at_10
value: 19.13
- type: recall_at_1
value: 11.212
- type: recall_at_3
value: 19.473
- type: recall_at_5
value: 23.014
- type: recall_at_10
value: 27.916
- type: precision_at_1
value: 11.212
- type: precision_at_3
value: 6.491
- type: precision_at_5
value: 4.603
- type: precision_at_10
value: 2.792
- type: mrr_at_1
value: 11.212
- type: mrr_at_3
value: 14.9039
- type: mrr_at_5
value: 15.7187
- type: mrr_at_10
value: 16.3746
task:
type: Retrieval
- dataset:
config: ar
name: MTEB MLQARetrieval (ar)
revision: 397ed406c1a7902140303e7faf60fff35b58d285
split: validation
type: mlqa/mmteb-mlqa
metrics:
- type: main_score
value: 62.58
- type: map_at_1
value: 47.776
- type: map_at_3
value: 55.545
- type: map_at_5
value: 56.812
- type: map_at_10
value: 57.756
- type: ndcg_at_1
value: 47.776
- type: ndcg_at_3
value: 57.978
- type: ndcg_at_5
value: 60.276
- type: ndcg_at_10
value: 62.58
- type: recall_at_1
value: 47.776
- type: recall_at_3
value: 64.99
- type: recall_at_5
value: 70.6
- type: recall_at_10
value: 77.756
- type: precision_at_1
value: 47.776
- type: precision_at_3
value: 21.663
- type: precision_at_5
value: 14.12
- type: precision_at_10
value: 7.776
- type: mrr_at_1
value: 47.7756
- type: mrr_at_3
value: 55.5448
- type: mrr_at_5
value: 56.8117
- type: mrr_at_10
value: 57.7562
task:
type: Retrieval
- dataset:
config: default
name: MTEB SadeemQuestionRetrieval (ar)
revision: 3cb0752b182e5d5d740df547748b06663c8e0bd9
split: test
type: sadeem/mmteb-sadeem
metrics:
- type: main_score
value: 63.155
- type: map_at_1
value: 28.674
- type: map_at_3
value: 52.21
- type: map_at_5
value: 53.052
- type: map_at_10
value: 53.498
- type: ndcg_at_1
value: 28.674
- type: ndcg_at_3
value: 60.604
- type: ndcg_at_5
value: 62.092
- type: ndcg_at_10
value: 63.155
- type: recall_at_1
value: 28.674
- type: recall_at_3
value: 85.112
- type: recall_at_5
value: 88.655
- type: recall_at_10
value: 91.91
- type: precision_at_1
value: 28.674
- type: precision_at_3
value: 28.371
- type: precision_at_5
value: 17.731
- type: precision_at_10
value: 9.191
- type: mrr_at_1
value: 25.371
- type: mrr_at_3
value: 50.2314
- type: mrr_at_5
value: 51.0212
- type: mrr_at_10
value: 51.4811
task:
type: Retrieval
- dataset:
config: ar-ar
name: MTEB STS17 (ar-ar)
revision: faeb762787bd10488a50c8b5be4a3b82e411949c
split: test
type: mteb/sts17-crosslingual-sts
metrics:
- type: cosine_pearson
value: 82.06597171670848
- type: cosine_spearman
value: 82.7809395809498
- type: euclidean_pearson
value: 79.23996991139896
- type: euclidean_spearman
value: 81.5287595404711
- type: main_score
value: 82.7809395809498
- type: manhattan_pearson
value: 78.95407006608013
- type: manhattan_spearman
value: 81.15109493737467
task:
type: STS
- dataset:
config: ar
name: MTEB STS22.v2 (ar)
revision: d31f33a128469b20e357535c39b82fb3c3f6f2bd
split: test
type: mteb/sts22-crosslingual-sts
metrics:
- type: cosine_pearson
value: 54.912880452465004
- type: cosine_spearman
value: 63.09788380910325
- type: euclidean_pearson
value: 57.92665617677832
- type: euclidean_spearman
value: 62.76032598469037
- type: main_score
value: 63.09788380910325
- type: manhattan_pearson
value: 58.0736648155273
- type: manhattan_spearman
value: 62.94190582776664
task:
type: STS
- dataset:
config: ar
name: MTEB STS22 (ar)
revision: de9d86b3b84231dc21f76c7b7af1f28e2f57f6e3
split: test
type: mteb/sts22-crosslingual-sts
metrics:
- type: cosine_pearson
value: 51.72534929358701
- type: cosine_spearman
value: 59.75149627160101
- type: euclidean_pearson
value: 53.894835373598774
- type: euclidean_spearman
value: 59.44278354697161
- type: main_score
value: 59.75149627160101
- type: manhattan_pearson
value: 54.076675975406985
- type: manhattan_spearman
value: 59.610061143235725
task:
type: STS
widget:
- source_sentence: امرأة تكتب شيئاً
sentences:
- مراهق يتحدث إلى فتاة عبر كاميرا الإنترنت
- امرأة تقطع البصل الأخضر.
- مجموعة من كبار السن يتظاهرون حول طاولة الطعام.
- source_sentence: تتشكل النجوم في مناطق تكوين النجوم، والتي تنشأ نفسها من السحب الجزيئية.
sentences:
- لاعب كرة السلة على وشك تسجيل نقاط لفريقه.
- المقال التالي مأخوذ من نسختي من "أطلس البطريق الجديد للتاريخ الوسطى"
- قد يكون من الممكن أن يوجد نظام شمسي مثل نظامنا خارج المجرة
- source_sentence: >-
تحت السماء الزرقاء مع الغيوم البيضاء، يصل طفل لمس مروحة طائرة واقفة على
حقل من العشب.
sentences:
- امرأة تحمل كأساً
- طفل يحاول لمس مروحة طائرة
- اثنان من عازبين عن الشرب يستعدون للعشاء
- source_sentence: رجل في منتصف العمر يحلق لحيته في غرفة ذات جدران بيضاء والتي لا تبدو كحمام
sentences:
- فتى يخطط اسمه على مكتبه
- رجل ينام
- المرأة وحدها وهي نائمة في غرفة نومها
- source_sentence: الكلب البني مستلقي على جانبه على سجادة بيج، مع جسم أخضر في المقدمة.
sentences:
- شخص طويل القامة
- المرأة تنظر من النافذة.
- لقد مات الكلب
license: apache-2.0
GATE-AraBert-V1
This is GATE | General Arabic Text Embedding, a sentence-embedding model trained using SentenceTransformers in a multi-task setup. The model is trained on the AllNLI and STS datasets.
Model Details
Model Description
- Model Type: Sentence Transformer
- Base model: Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2
- Maximum Sequence Length: 512 tokens
- Output Dimensionality: 768 dimensions
- Similarity Function: Cosine Similarity
- Training Datasets:
- Language: ar
Usage
Direct Usage (Sentence Transformers)
First install the Sentence Transformers library:
pip install -U sentence-transformers
Then you can load this model and run inference.
from sentence_transformers import SentenceTransformer
# Download from the 🤗 Hub
model = SentenceTransformer("Omartificial-Intelligence-Space/GATE-AraBert-v1")
# Run inference
sentences = [
'الكلب البني مستلقي على جانبه على سجادة بيج، مع جسم أخضر في المقدمة.',
'لقد مات الكلب',
'شخص طويل القامة',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]
# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
Evaluation
Metrics
Semantic Similarity
- Dataset:
sts-dev
- Evaluated with
EmbeddingSimilarityEvaluator
Metric | Value |
---|---|
pearson_cosine | 0.8391 |
spearman_cosine | 0.841 |
pearson_manhattan | 0.8277 |
spearman_manhattan | 0.8361 |
pearson_euclidean | 0.8274 |
spearman_euclidean | 0.8358 |
pearson_dot | 0.8154 |
spearman_dot | 0.818 |
pearson_max | 0.8391 |
spearman_max | 0.841 |
Semantic Similarity
- Dataset:
sts-test
- Evaluated with
EmbeddingSimilarityEvaluator
Metric | Value |
---|---|
pearson_cosine | 0.813 |
spearman_cosine | 0.8173 |
pearson_manhattan | 0.8114 |
spearman_manhattan | 0.8164 |
pearson_euclidean | 0.8103 |
spearman_euclidean | 0.8158 |
pearson_dot | 0.7908 |
spearman_dot | 0.7887 |
pearson_max | 0.813 |
spearman_max | 0.8173 |
Acknowledgments
The author would like to thank Prince Sultan University for their invaluable support in this project. Their contributions and resources have been instrumental in the development and fine-tuning of these models.
## Citation
If you use GATE, please cite it as follows:
@misc{nacar2025GATE,
title={GATE: General Arabic Text Embedding for Enhanced Semantic Textual Similarity with Hybrid Loss Training},
author={Omer Nacar and Anis Koubaa and Serry Taiseer Sibaee and Lahouari Ghouti},
year={2025},
note={Submitted to COLING 2025},
url={https://huggingface.co/Omartificial-Intelligence-Space/GATE-AraBert-v1},
}