Build in query prompt to Sentence Transformers Config (#19)
Browse files- Build in query prompt to Sentence Transformers Config (b192313ec7397fa525650967f22d04201dfa0788)
Co-authored-by: Jonathan Wang <[email protected]>
- README.md +21 -17
- config_sentence_transformers.json +5 -2
README.md
CHANGED
@@ -2621,7 +2621,7 @@ Here, we provide several ways to produce sentence embeddings. Please note that y
|
|
2621 |
|
2622 |
## Quickstart
|
2623 |
|
2624 |
-
Here, we provide several ways to produce sentence embeddings. Please note that you have to provide the prompt `Represent this sentence for searching relevant passages:` for query if you want to use it for retrieval. Besides that you don't need any prompt.
|
2625 |
|
2626 |
### sentence-transformers
|
2627 |
|
@@ -2640,11 +2640,11 @@ dimensions = 512
|
|
2640 |
# 2. load model
|
2641 |
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", truncate_dim=dimensions)
|
2642 |
|
2643 |
-
#
|
2644 |
-
|
2645 |
|
|
|
2646 |
docs = [
|
2647 |
-
query,
|
2648 |
"A man is eating food.",
|
2649 |
"A man is eating pasta.",
|
2650 |
"The girl is carrying a baby.",
|
@@ -2652,19 +2652,24 @@ docs = [
|
|
2652 |
]
|
2653 |
|
2654 |
# 3. Encode
|
2655 |
-
|
|
|
|
|
|
|
|
|
|
|
2656 |
|
2657 |
# Optional: Quantize the embeddings
|
2658 |
-
|
|
|
2659 |
|
2660 |
-
similarities = cos_sim(
|
2661 |
print('similarities:', similarities)
|
2662 |
|
2663 |
|
2664 |
-
```
|
2665 |
### Transformers
|
2666 |
|
2667 |
-
|
2668 |
from typing import Dict
|
2669 |
|
2670 |
import torch
|
@@ -2712,18 +2717,18 @@ embeddings = pooling(outputs, inputs, 'cls')
|
|
2712 |
|
2713 |
similarities = cos_sim(embeddings[0], embeddings[1:])
|
2714 |
print('similarities:', similarities)
|
2715 |
-
|
2716 |
|
2717 |
### Transformers.js
|
2718 |
|
2719 |
If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@xenova/transformers) using:
|
2720 |
-
|
2721 |
npm i @xenova/transformers
|
2722 |
-
|
2723 |
|
2724 |
You can then use the model to compute embeddings like this:
|
2725 |
|
2726 |
-
|
2727 |
import { pipeline, cos_sim } from '@xenova/transformers';
|
2728 |
|
2729 |
// Create a feature extraction pipeline
|
@@ -2745,13 +2750,13 @@ const output = await extractor(docs, { pooling: 'cls' });
|
|
2745 |
const [source_embeddings, ...document_embeddings ] = output.tolist();
|
2746 |
const similarities = document_embeddings.map(x => cos_sim(source_embeddings, x));
|
2747 |
console.log(similarities); // [0.7919578577247139, 0.6369278664248345, 0.16512018371357193, 0.3620778366720027]
|
2748 |
-
|
2749 |
|
2750 |
### Using API
|
2751 |
|
2752 |
You can use the model via our API as follows:
|
2753 |
|
2754 |
-
|
2755 |
from mixedbread_ai.client import MixedbreadAI, EncodingFormat
|
2756 |
from sklearn.metrics.pairwise import cosine_similarity
|
2757 |
import os
|
@@ -2773,10 +2778,9 @@ res = mxbai.embeddings(
|
|
2773 |
|
2774 |
encoded_embeddings = res.data[0].embedding
|
2775 |
print(res.dimensions, encoded_embeddings.ubinary, encoded_embeddings.float_, encoded_embeddings.int_8)
|
2776 |
-
```
|
2777 |
|
2778 |
-
The API comes with native int8 and binary quantization support! Check out the [docs](https://mixedbread.ai/docs) for more information.
|
2779 |
|
|
|
2780 |
## Evaluation
|
2781 |
As of March 2024, our model achieves SOTA performance for BERT-large sized models on the [MTEB](https://huggingface.co/spaces/mteb/leaderboard). It outperforms commercial models like OpenAI's text-embedding-3-large and matches the performance of models 20x its size like the [echo-mistral-7b](https://huggingface.co/jspringer/echo-mistral-7b-instruct-lasttoken). Our model was trained with no overlap of the MTEB data, which indicates that our model generalizes well across several domains, tasks and text lengths. We know there are some limitations with this model, which will be fixed in v2.
|
2782 |
|
|
|
2621 |
|
2622 |
## Quickstart
|
2623 |
|
2624 |
+
Here, we provide several ways to produce sentence embeddings. Please note that you have to provide the prompt `Represent this sentence for searching relevant passages: ` for the query if you want to use the model for retrieval. Besides that, you don't need any prompt.
|
2625 |
|
2626 |
### sentence-transformers
|
2627 |
|
|
|
2640 |
# 2. load model
|
2641 |
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", truncate_dim=dimensions)
|
2642 |
|
2643 |
+
# The prompt used for query retrieval tasks:
|
2644 |
+
# query_prompt = 'Represent this sentence for searching relevant passages: '
|
2645 |
|
2646 |
+
query = "A man is eating a piece of bread"
|
2647 |
docs = [
|
|
|
2648 |
"A man is eating food.",
|
2649 |
"A man is eating pasta.",
|
2650 |
"The girl is carrying a baby.",
|
|
|
2652 |
]
|
2653 |
|
2654 |
# 3. Encode
|
2655 |
+
query_embedding = model.encode(query, prompt_name="query")
|
2656 |
+
# Equivalent Alternatives:
|
2657 |
+
# query_embedding = model.encode(query_prompt + query)
|
2658 |
+
# query_embedding = model.encode(query, prompt=query_prompt)
|
2659 |
+
|
2660 |
+
docs_embeddings = model.encode(docs)
|
2661 |
|
2662 |
# Optional: Quantize the embeddings
|
2663 |
+
binary_query_embedding = quantize_embeddings(query_embedding, precision="ubinary")
|
2664 |
+
binary_docs_embeddings = quantize_embeddings(docs_embeddings, precision="ubinary")
|
2665 |
|
2666 |
+
similarities = cos_sim(query_embedding, docs_embeddings)
|
2667 |
print('similarities:', similarities)
|
2668 |
|
2669 |
|
|
|
2670 |
### Transformers
|
2671 |
|
2672 |
+
|
2673 |
from typing import Dict
|
2674 |
|
2675 |
import torch
|
|
|
2717 |
|
2718 |
similarities = cos_sim(embeddings[0], embeddings[1:])
|
2719 |
print('similarities:', similarities)
|
2720 |
+
|
2721 |
|
2722 |
### Transformers.js
|
2723 |
|
2724 |
If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@xenova/transformers) using:
|
2725 |
+
|
2726 |
npm i @xenova/transformers
|
2727 |
+
|
2728 |
|
2729 |
You can then use the model to compute embeddings like this:
|
2730 |
|
2731 |
+
|
2732 |
import { pipeline, cos_sim } from '@xenova/transformers';
|
2733 |
|
2734 |
// Create a feature extraction pipeline
|
|
|
2750 |
const [source_embeddings, ...document_embeddings ] = output.tolist();
|
2751 |
const similarities = document_embeddings.map(x => cos_sim(source_embeddings, x));
|
2752 |
console.log(similarities); // [0.7919578577247139, 0.6369278664248345, 0.16512018371357193, 0.3620778366720027]
|
2753 |
+
|
2754 |
|
2755 |
### Using API
|
2756 |
|
2757 |
You can use the model via our API as follows:
|
2758 |
|
2759 |
+
|
2760 |
from mixedbread_ai.client import MixedbreadAI, EncodingFormat
|
2761 |
from sklearn.metrics.pairwise import cosine_similarity
|
2762 |
import os
|
|
|
2778 |
|
2779 |
encoded_embeddings = res.data[0].embedding
|
2780 |
print(res.dimensions, encoded_embeddings.ubinary, encoded_embeddings.float_, encoded_embeddings.int_8)
|
|
|
2781 |
|
|
|
2782 |
|
2783 |
+
The API comes with native int8 and binary quantization support! Check out the [docs](https://mixedbread.ai/docs) for more information.
|
2784 |
## Evaluation
|
2785 |
As of March 2024, our model achieves SOTA performance for BERT-large sized models on the [MTEB](https://huggingface.co/spaces/mteb/leaderboard). It outperforms commercial models like OpenAI's text-embedding-3-large and matches the performance of models 20x its size like the [echo-mistral-7b](https://huggingface.co/jspringer/echo-mistral-7b-instruct-lasttoken). Our model was trained with no overlap of the MTEB data, which indicates that our model generalizes well across several domains, tasks and text lengths. We know there are some limitations with this model, which will be fixed in v2.
|
2786 |
|
config_sentence_transformers.json
CHANGED
@@ -4,6 +4,9 @@
|
|
4 |
"transformers": "4.37.0",
|
5 |
"pytorch": "2.1.0+cu121"
|
6 |
},
|
7 |
-
"prompts": {
|
8 |
-
|
|
|
|
|
|
|
9 |
}
|
|
|
4 |
"transformers": "4.37.0",
|
5 |
"pytorch": "2.1.0+cu121"
|
6 |
},
|
7 |
+
"prompts": {
|
8 |
+
"query": "Represent this sentence for searching relevant passages: ",
|
9 |
+
"text": ""
|
10 |
+
},
|
11 |
+
"default_prompt_name": "text"
|
12 |
}
|