|
--- |
|
base_model: Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka |
|
datasets: |
|
- Omartificial-Intelligence-Space/Arabic-stsb |
|
language: |
|
- ar |
|
library_name: sentence-transformers |
|
metrics: |
|
- pearson_cosine |
|
- spearman_cosine |
|
- pearson_manhattan |
|
- spearman_manhattan |
|
- pearson_euclidean |
|
- spearman_euclidean |
|
- pearson_dot |
|
- spearman_dot |
|
- pearson_max |
|
- spearman_max |
|
pipeline_tag: sentence-similarity |
|
tags: |
|
- sentence-transformers |
|
- sentence-similarity |
|
- feature-extraction |
|
- generated_from_trainer |
|
- dataset_size:947818 |
|
- loss:SoftmaxLoss |
|
- loss:CosineSimilarityLoss |
|
widget: |
|
- source_sentence: امرأة تكتب شيئاً |
|
sentences: |
|
- مراهق يتحدث إلى فتاة عبر كاميرا الإنترنت |
|
- امرأة تقطع البصل الأخضر. |
|
- مجموعة من كبار السن يتظاهرون حول طاولة الطعام. |
|
- source_sentence: تتشكل النجوم في مناطق تكوين النجوم، والتي تنشأ نفسها من السحب الجزيئية. |
|
sentences: |
|
- لاعب كرة السلة على وشك تسجيل نقاط لفريقه. |
|
- المقال التالي مأخوذ من نسختي من "أطلس البطريق الجديد للتاريخ الوسطى" |
|
- قد يكون من الممكن أن يوجد نظام شمسي مثل نظامنا خارج المجرة |
|
- source_sentence: تحت السماء الزرقاء مع الغيوم البيضاء، يصل طفل لمس مروحة طائرة واقفة |
|
على حقل من العشب. |
|
sentences: |
|
- امرأة تحمل كأساً |
|
- طفل يحاول لمس مروحة طائرة |
|
- اثنان من عازبين عن الشرب يستعدون للعشاء |
|
- source_sentence: رجل في منتصف العمر يحلق لحيته في غرفة ذات جدران بيضاء والتي لا |
|
تبدو كحمام |
|
sentences: |
|
- فتى يخطط اسمه على مكتبه |
|
- رجل ينام |
|
- المرأة وحدها وهي نائمة في غرفة نومها |
|
- source_sentence: الكلب البني مستلقي على جانبه على سجادة بيج، مع جسم أخضر في المقدمة. |
|
sentences: |
|
- شخص طويل القامة |
|
- المرأة تنظر من النافذة. |
|
- لقد مات الكلب |
|
model-index: |
|
- name: Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka |
|
results: |
|
- dataset: |
|
config: ar |
|
name: MTEB MintakaRetrieval (ar) |
|
revision: efa78cc2f74bbcd21eff2261f9e13aebe40b814e |
|
split: test |
|
type: mintaka/mmteb-mintaka |
|
metrics: |
|
- type: main_score |
|
value: 17.121 |
|
- type: map_at_1 |
|
value: 9.805 |
|
- type: map_at_3 |
|
value: 12.574 |
|
- type: map_at_5 |
|
value: 13.468 |
|
- type: map_at_10 |
|
value: 14.294 |
|
- type: ndcg_at_1 |
|
value: 9.805 |
|
- type: ndcg_at_3 |
|
value: 13.504 |
|
- type: ndcg_at_5 |
|
value: 15.114 |
|
- type: ndcg_at_10 |
|
value: 17.121 |
|
- type: recall_at_1 |
|
value: 9.805 |
|
- type: recall_at_3 |
|
value: 16.205 |
|
- type: recall_at_5 |
|
value: 20.109 |
|
- type: recall_at_10 |
|
value: 26.328 |
|
- type: precision_at_1 |
|
value: 9.805 |
|
- type: precision_at_3 |
|
value: 5.402 |
|
- type: precision_at_5 |
|
value: 4.022 |
|
- type: precision_at_10 |
|
value: 2.633 |
|
- type: mrr_at_1 |
|
value: 9.8048 |
|
- type: mrr_at_3 |
|
value: 12.5738 |
|
- type: mrr_at_5 |
|
value: 13.468 |
|
- type: mrr_at_10 |
|
value: 14.2936 |
|
task: |
|
type: Retrieval |
|
- dataset: |
|
config: ar |
|
name: MTEB MIRACLRetrievalHardNegatives (ar) |
|
revision: 95c8db7d4a6e9c1d8a60601afd63d553ae20a2eb |
|
split: dev |
|
type: miracl/mmteb-miracl-hardnegatives |
|
metrics: |
|
- type: main_score |
|
value: 17.751 |
|
- type: map_at_1 |
|
value: 6.057 |
|
- type: map_at_3 |
|
value: 9.526 |
|
- type: map_at_5 |
|
value: 10.812 |
|
- type: map_at_10 |
|
value: 12.51 |
|
- type: ndcg_at_1 |
|
value: 8.9 |
|
- type: ndcg_at_3 |
|
value: 11.773 |
|
- type: ndcg_at_5 |
|
value: 13.94 |
|
- type: ndcg_at_10 |
|
value: 17.751 |
|
- type: recall_at_1 |
|
value: 6.057 |
|
- type: recall_at_3 |
|
value: 13.244 |
|
- type: recall_at_5 |
|
value: 18.536 |
|
- type: recall_at_10 |
|
value: 28.793 |
|
- type: precision_at_1 |
|
value: 8.9 |
|
- type: precision_at_3 |
|
value: 7.1 |
|
- type: precision_at_5 |
|
value: 6.02 |
|
- type: precision_at_10 |
|
value: 4.84 |
|
- type: mrr_at_1 |
|
value: 8.9 |
|
- type: mrr_at_3 |
|
value: 13.5833 |
|
- type: mrr_at_5 |
|
value: 15.2683 |
|
- type: mrr_at_10 |
|
value: 16.9415 |
|
task: |
|
type: Retrieval |
|
- dataset: |
|
config: ar |
|
name: MTEB MLQARetrieval (ar) |
|
revision: 397ed406c1a7902140303e7faf60fff35b58d285 |
|
split: validation |
|
type: mlqa/mmteb-mlqa |
|
metrics: |
|
- type: main_score |
|
value: 58.026 |
|
- type: map_at_1 |
|
value: 42.553 |
|
- type: map_at_3 |
|
value: 50.709 |
|
- type: map_at_5 |
|
value: 51.899 |
|
- type: map_at_10 |
|
value: 52.972 |
|
- type: ndcg_at_1 |
|
value: 42.553 |
|
- type: ndcg_at_3 |
|
value: 53.336 |
|
- type: ndcg_at_5 |
|
value: 55.484 |
|
- type: ndcg_at_10 |
|
value: 58.026 |
|
- type: recall_at_1 |
|
value: 42.553 |
|
- type: recall_at_3 |
|
value: 60.928 |
|
- type: recall_at_5 |
|
value: 66.151 |
|
- type: recall_at_10 |
|
value: 73.888 |
|
- type: precision_at_1 |
|
value: 42.553 |
|
- type: precision_at_3 |
|
value: 20.309 |
|
- type: precision_at_5 |
|
value: 13.23 |
|
- type: precision_at_10 |
|
value: 7.389 |
|
- type: mrr_at_1 |
|
value: 42.5532 |
|
- type: mrr_at_3 |
|
value: 50.7092 |
|
- type: mrr_at_5 |
|
value: 51.8988 |
|
- type: mrr_at_10 |
|
value: 52.9717 |
|
task: |
|
type: Retrieval |
|
- dataset: |
|
config: default |
|
name: MTEB SadeemQuestionRetrieval (ar) |
|
revision: 3cb0752b182e5d5d740df547748b06663c8e0bd9 |
|
split: test |
|
type: sadeem/mmteb-sadeem |
|
metrics: |
|
- type: main_score |
|
value: 59.306 |
|
- type: map_at_1 |
|
value: 25.945 |
|
- type: map_at_3 |
|
value: 47.766 |
|
- type: map_at_5 |
|
value: 48.994 |
|
- type: map_at_10 |
|
value: 49.675 |
|
- type: ndcg_at_1 |
|
value: 25.945 |
|
- type: ndcg_at_3 |
|
value: 55.479 |
|
- type: ndcg_at_5 |
|
value: 57.679 |
|
- type: ndcg_at_10 |
|
value: 59.306 |
|
- type: recall_at_1 |
|
value: 25.945 |
|
- type: recall_at_3 |
|
value: 77.98 |
|
- type: recall_at_5 |
|
value: 83.293 |
|
- type: recall_at_10 |
|
value: 88.272 |
|
- type: precision_at_1 |
|
value: 25.945 |
|
- type: precision_at_3 |
|
value: 25.993 |
|
- type: precision_at_5 |
|
value: 16.659 |
|
- type: precision_at_10 |
|
value: 8.827 |
|
- type: mrr_at_1 |
|
value: 24.988 |
|
- type: mrr_at_3 |
|
value: 47.056 |
|
- type: mrr_at_5 |
|
value: 48.2671 |
|
- type: mrr_at_10 |
|
value: 48.9239 |
|
task: |
|
type: Retrieval |
|
--- |
|
|
|
# GATE-AraBert-v0 |
|
|
|
This is a General Arabic Text Embedding model trained with the Sentence Transformers library in a multi-task setup on the AllNLI and STS datasets.
|
|
|
## Model Details |
|
|
|
### Model Description |
|
- **Model Type:** Sentence Transformer |
|
- **Base model:** [Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2](https://huggingface.co/Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2) <!-- at revision 5ce4f80f3ede26de623d6ac10681399dba5c684a --> |
|
- **Maximum Sequence Length:** 512 tokens |
|
- **Output Dimensionality:** 768 dimensions (see the sketch after this list for a quick way to check this and the sequence length)
|
- **Similarity Function:** Cosine Similarity |
|
- **Training Datasets:** |
|
- [all-nli](https://huggingface.co/datasets/Omartificial-Intelligence-Space/Arabic-NLi-Pair-Class) |
|
- [sts](https://huggingface.co/datasets/Omartificial-Intelligence-Space/arabic-stsb) |
|
- **Language:** ar |
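
If you want to confirm the sequence length and embedding dimensionality listed above, both can be read from the loaded model object. A minimal sketch (the values in the comments are the ones stated in this card):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Omartificial-Intelligence-Space/GATE-AraBert-v0")

# Maximum number of input tokens the encoder will process per sentence
print(model.max_seq_length)                      # 512

# Dimensionality of the sentence embeddings this model outputs
print(model.get_sentence_embedding_dimension())  # 768
```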
|
|
|
|
|
## Usage |
|
|
|
### Direct Usage (Sentence Transformers) |
|
|
|
First install the Sentence Transformers library: |
|
|
|
```bash |
|
pip install -U sentence-transformers |
|
``` |
|
|
|
Then you can load this model and run inference. |
|
```python |
|
from sentence_transformers import SentenceTransformer |
|
|
|
# Download from the 🤗 Hub |
|
model = SentenceTransformer("Omartificial-Intelligence-Space/GATE-AraBert-v0") |
|
# Run inference |
|
sentences = [ |
|
'الكلب البني مستلقي على جانبه على سجادة بيج، مع جسم أخضر في المقدمة.', |
|
'لقد مات الكلب', |
|
'شخص طويل القامة', |
|
] |
|
embeddings = model.encode(sentences) |
|
print(embeddings.shape) |
|
# [3, 768] |
|
|
|
# Get the similarity scores for the embeddings |
|
similarities = model.similarity(embeddings, embeddings) |
|
print(similarities.shape) |
|
# [3, 3] |
|
``` |
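
Since the metadata above also reports MTEB retrieval scores, the same embeddings can be used for Arabic semantic search. Below is a minimal sketch using `sentence_transformers.util.semantic_search`, reusing a query and candidate sentences from the widget examples above; any other Arabic texts work the same way.

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("Omartificial-Intelligence-Space/GATE-AraBert-v0")

# Query and candidates taken from the widget examples in this card
query = "تحت السماء الزرقاء مع الغيوم البيضاء، يصل طفل لمس مروحة طائرة واقفة على حقل من العشب."
corpus = [
    "امرأة تحمل كأساً",
    "طفل يحاول لمس مروحة طائرة",
    "اثنان من عازبين عن الشرب يستعدون للعشاء",
]

# Encode both sides as tensors so they can be compared directly
query_embedding = model.encode(query, convert_to_tensor=True)
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

# Rank the candidates by cosine similarity to the query
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=3)[0]
for hit in hits:
    print(corpus[hit["corpus_id"]], round(hit["score"], 4))
```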
|
|
|
|
|
|
|
## Evaluation |
|
|
|
### Metrics |
|
|
|
#### Semantic Similarity |
|
* Dataset: `sts-dev` |
|
* Evaluated with [<code>EmbeddingSimilarityEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.EmbeddingSimilarityEvaluator) (a reproduction sketch follows the table below)
|
|
|
| Metric | Value | |
|
|:--------------------|:-----------| |
|
| pearson_cosine | 0.8384 | |
|
| **spearman_cosine** | **0.8389** | |
|
| pearson_manhattan | 0.8248 | |
|
| spearman_manhattan | 0.8329 | |
|
| pearson_euclidean   | 0.8250     |
|
| spearman_euclidean | 0.8337 | |
|
| pearson_dot | 0.8072 | |
|
| spearman_dot | 0.8098 | |
|
| pearson_max | 0.8384 | |
|
| spearman_max | 0.8389 | |
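
To reproduce an evaluation like this one, `EmbeddingSimilarityEvaluator` can be run on the Arabic STS data directly. A minimal sketch, assuming the dataset exposes `sentence1`, `sentence2`, and `score` columns and a `validation` split; adjust the split, column names, and score scale to match the actual dataset.

```python
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction

model = SentenceTransformer("Omartificial-Intelligence-Space/GATE-AraBert-v0")

# Assumed split and column names; check the dataset card before running
stsb = load_dataset("Omartificial-Intelligence-Space/Arabic-stsb", split="validation")

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=stsb["sentence1"],
    sentences2=stsb["sentence2"],
    scores=stsb["score"],  # rescale to [0, 1] first if the gold scores use a 0-5 scale
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)
print(evaluator(model))  # reports Pearson/Spearman for cosine, Euclidean, Manhattan, and dot similarities
```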
|
|
|
#### Semantic Similarity |
|
* Dataset: `sts-test` |
|
* Evaluated with [<code>EmbeddingSimilarityEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.EmbeddingSimilarityEvaluator) |
|
|
|
| Metric | Value | |
|
|:--------------------|:-----------| |
|
| pearson_cosine | 0.7908 | |
|
| **spearman_cosine** | **0.7893** | |
|
| pearson_manhattan | 0.7923 | |
|
| spearman_manhattan | 0.7947 | |
|
| pearson_euclidean | 0.7904 | |
|
| spearman_euclidean | 0.7934 | |
|
| pearson_dot | 0.7404 | |
|
| spearman_dot | 0.7354 | |
|
| pearson_max | 0.7923 | |
|
| spearman_max | 0.7947 | |
|
|
|
|