Tom Aarsen committed on
Commit 9862f98 · 1 Parent(s): 8836013

Introduce custom Sentence Transformer module

Files changed (4):
  1. README.md +12 -74
  2. custom_st.py +87 -0
  3. modules.json +12 -6
  4. sentence_bert_config.json +4 -1
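
In short: the custom `jasper_vl_tokenize` / `jasper_vl_forward` functions that the README previously asked users to monkey-patch onto the model are moved into a `custom_st.MultiModalTransformer` module, which Sentence Transformers now loads automatically via `modules.json`. A minimal sketch of the simplified loading path (assuming `trust_remote_code=True`, with `model_name` left as a placeholder for this repository's Hub id):

```python
from sentence_transformers import SentenceTransformer

model_name = "..."  # placeholder: this repository's Hub id, as set in the README

# The modules declared in modules.json (custom_st.MultiModalTransformer + Normalize)
# are instantiated automatically; no manual patching of tokenize()/forward() is needed.
model = SentenceTransformer(model_name, trust_remote_code=True)
embeddings = model.encode(["Why the sky is blue?"], prompt_name="s2p_query")
```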
README.md CHANGED
@@ -8985,66 +8985,10 @@ This work was accomplished during my free time; please grant time a little time.
 
 ## Usage
 ```python
-
-import functools
-import PIL
-import numpy as np
 import torch
-from typing import Dict
-from io import BytesIO
-from transformers import SiglipImageProcessor
 from sentence_transformers import SentenceTransformer
 
 
-def jasper_vl_forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
-    trans_features = {"input_ids": features["input_ids"], "attention_mask": features["attention_mask"]}
-    if "pixel_values" in features:
-        trans_features["pixel_values"] = features["pixel_values"]
-    sentence_embedding = self.auto_model(**trans_features, **kwargs)["sentence_embedding"]
-    features.update({"sentence_embedding": sentence_embedding})
-    return features
-
-
-def jasper_vl_tokenize(self, texts: list[Dict] | list[str]) -> dict[str, torch.Tensor]:
-    img_start_token = "<|jasper_img_start|>"
-    img_token = "<|jasper_img_token|>"
-    img_end_token = "<|jasper_img_end|>"
-    num_img_tokens = 300
-
-    def process_text_item(item):
-        if isinstance(item, str):
-            return item, []
-        text, images = "", []
-        for sub_item in item:
-            if sub_item["type"] == "text":
-                text += sub_item["content"]
-            elif sub_item["type"] == "image_bytes":
-                text += img_start_token + img_token * num_img_tokens + img_end_token
-                images.append(PIL.Image.open(BytesIO(sub_item["content"])).convert("RGB"))
-            elif sub_item["type"] == "image_path":
-                text += img_start_token + img_token * num_img_tokens + img_end_token
-                images.append(PIL.Image.open(sub_item["content"]).convert("RGB"))
-            else:
-                raise ValueError(f"unknown data type {sub_item['type']}")
-        return text, images
-
-    all_texts, all_images = [], []
-    for item in texts:
-        text, images = process_text_item(item)
-        all_texts.append(text)
-        all_images.extend(images)
-    ipt = self.tokenizer(all_texts, padding="longest", truncation=True, max_length=1024, return_tensors="pt")
-    if all_images:
-        ipt["pixel_values"] = self.processor(
-            images=all_images,
-            return_tensors="pt"
-        )["pixel_values"]
-    # For the sake of demonstration, external variables are used here, please modify the code according to your own environment.
-    if use_gpu:
-        ipt["pixel_values"] = ipt["pixel_values"].bfloat16()
-    return ipt
-
-
 DOC1 = """
 Blue light is scattered in all directions by the tiny molecules of air in Earth's atmosphere.
 Blue is scattered more than other colors because it travels as shorter, smaller waves. This is why we see a blue sky most of the time.
@@ -9062,10 +9006,6 @@ Color combinations: Decide how to best complement your preferred color with othe
 Color palette: Limit your color palette to a main color and one or two additional colors.
 60-30-10 rule: Use a primary color 60% of the time, a secondary color 30% of the time, and an accent color 10% of the time
 """
-prompt_dict = {
-    "s2p_query": "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: ",
-    "s2s_query": "Instruct: Retrieve semantically similar text.\nQuery: "
-}
 if __name__ == "__main__":
     # load model
     use_gpu = False
@@ -9073,7 +9013,7 @@ if __name__ == "__main__":
     model = SentenceTransformer(
         model_name,
         trust_remote_code=True,
-        device="cpu",
+        device="cpu" if not use_gpu else "cuda",
         model_kwargs={
             "torch_dtype": torch.bfloat16 if use_gpu else torch.float32,
             "attn_implementation": "sdpa"
@@ -9082,13 +9022,10 @@ if __name__ == "__main__":
         ## 1024 is recommended
         # set is_text_encoder 'True', if you do not encode image
         config_kwargs={"is_text_encoder": False, "vector_dim": 1024},
-        tokenizer_kwargs={"padding_side": "right"}
     )
-    # jasper model cannot directly be used in SentenceTransformer, do some modifications
-    model.processor = SiglipImageProcessor.from_pretrained(model_name)
-    model.tokenize = functools.partial(jasper_vl_tokenize, model)
-    model._first_module().forward = functools.partial(jasper_vl_forward, model._first_module())
+    # We can reduce the max_seq_length from the default of 2048 for faster encoding
     model.max_seq_length = 1024
+
     # data
     q_list = [
        "Why the sky is blue?",
@@ -9099,16 +9036,17 @@ if __name__ == "__main__":
        [{"type": "image_path", "content": "./assets/img1.png"}, {"type": "text", "content": "Hope this image helps!"}],
        DOC2,
        [{"type": "image_path", "content": "./assets/img2.png"}],
-
    ]
-    q_vecs = model.encode([prompt_dict["s2p_query"] + text for text in q_list], normalize_embeddings=True)
-    doc_vecs = model.encode(doc_list, normalize_embeddings=True)
-    print(np.matmul(q_vecs, doc_vecs.T))
-    # the output is:
-    # [[0.777521   0.75944513 0.24291277 0.2187205 ]
-    #  [0.32261407 0.30536035 0.74208796 0.5484469 ]]
-
+    q_vecs = model.encode(q_list, prompt_name="s2p_query")
+    doc_vecs = model.encode(doc_list)
 
+    # calculate similarity
+    similarities = model.similarity(q_vecs, doc_vecs)
+    print(similarities)
+    # the output is:
+    # tensor([[0.7775, 0.7594, 0.2429, 0.2187],
+    #         [0.3226, 0.3054, 0.7421, 0.5484]])
 ```
+
 ## License
 **This model should not be used for any commercial purpose!**
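
For reviewers skimming the new usage block: the input format accepted by `model.encode` is unchanged. Each item is either a plain string or a list of `{"type": ..., "content": ...}` parts, where `type` may be `text`, `image_path`, or `image_bytes` (see `tokenize()` in `custom_st.py`). A small sketch of the bytes variant, reusing the asset path from the README (`model_name` is again a placeholder for this repository's Hub id):

```python
from sentence_transformers import SentenceTransformer

model_name = "..."  # placeholder: this repository's Hub id
model = SentenceTransformer(model_name, trust_remote_code=True)

# Encode a document that mixes an image (passed as raw bytes) with text.
with open("./assets/img1.png", "rb") as f:
    img_bytes = f.read()

doc_vec = model.encode([
    [
        {"type": "image_bytes", "content": img_bytes},
        {"type": "text", "content": "Hope this image helps!"},
    ]
])
```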
custom_st.py ADDED
@@ -0,0 +1,87 @@
+from typing import Any, Dict, Optional
+import PIL
+import torch
+import PIL
+import torch
+from typing import Dict
+from io import BytesIO
+from transformers import SiglipImageProcessor
+from sentence_transformers.models import Transformer as BaseTransformer
+
+
+class MultiModalTransformer(BaseTransformer):
+
+    def __init__(
+        self,
+        model_name_or_path: str,
+        cache_dir: Optional[str] = None,
+        tokenizer_args: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        super().__init__(model_name_or_path, **kwargs)
+        if tokenizer_args is None:
+            tokenizer_args = {}
+        self.processor = SiglipImageProcessor.from_pretrained(
+            model_name_or_path, cache_dir=cache_dir, **tokenizer_args
+        )
+
+    def forward(
+        self, features: dict[str, torch.Tensor], **kwargs
+    ) -> dict[str, torch.Tensor]:
+        trans_features = {
+            "input_ids": features["input_ids"],
+            "attention_mask": features["attention_mask"],
+        }
+        if "pixel_values" in features:
+            trans_features["pixel_values"] = features["pixel_values"].to(
+                self.auto_model.dtype
+            )
+
+        sentence_embedding = self.auto_model(**trans_features, **kwargs)[
+            "sentence_embedding"
+        ]
+        features.update({"sentence_embedding": sentence_embedding})
+        return features
+
+    def tokenize(self, texts: list[Dict] | list[str]) -> dict[str, torch.Tensor]:
+        img_start_token = "<|jasper_img_start|>"
+        img_token = "<|jasper_img_token|>"
+        img_end_token = "<|jasper_img_end|>"
+        num_img_tokens = 300
+
+        def process_text_item(item):
+            if isinstance(item, str):
+                return item, []
+            text, images = "", []
+            for sub_item in item:
+                if sub_item["type"] == "text":
+                    text += sub_item["content"]
+                elif sub_item["type"] == "image_bytes":
+                    text += img_start_token + img_token * num_img_tokens + img_end_token
+                    images.append(
+                        PIL.Image.open(BytesIO(sub_item["content"])).convert("RGB")
+                    )
+                elif sub_item["type"] == "image_path":
+                    text += img_start_token + img_token * num_img_tokens + img_end_token
+                    images.append(PIL.Image.open(sub_item["content"]).convert("RGB"))
+                else:
+                    raise ValueError(f"unknown data type {sub_item['type']}")
+            return text, images
+
+        all_texts, all_images = [], []
+        for item in texts:
+            text, images = process_text_item(item)
+            all_texts.append(text)
+            all_images.extend(images)
+        ipt = self.tokenizer(
+            all_texts,
+            padding="longest",
+            truncation=True,
+            max_length=1024,
+            return_tensors="pt",
+        )
+        if all_images:
+            ipt["pixel_values"] = self.processor(
+                images=all_images, return_tensors="pt"
+            )["pixel_values"]
+        return ipt
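
As context for the new module: `tokenize()` replaces every image part with `<|jasper_img_start|>`, 300 `<|jasper_img_token|>` placeholders, and `<|jasper_img_end|>` in the text stream and attaches SigLIP `pixel_values` next to `input_ids`/`attention_mask`; `forward()` then casts the pixel values to the model dtype and returns the model's `sentence_embedding`. An illustrative sketch of the features produced for one mixed input (`model_name` is a placeholder for this repository's Hub id):

```python
from sentence_transformers import SentenceTransformer

model_name = "..."  # placeholder: this repository's Hub id
model = SentenceTransformer(model_name, trust_remote_code=True)

# Illustrative only: inspect what the custom module returns for one text+image input.
module = model._first_module()  # the MultiModalTransformer declared in modules.json
features = module.tokenize([
    [
        {"type": "text", "content": "Hope this image helps!"},
        {"type": "image_path", "content": "./assets/img1.png"},
    ]
])
print(features.keys())              # input_ids, attention_mask, pixel_values
print(features["input_ids"].shape)  # the image adds 300 placeholder tokens plus start/end markers
```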
modules.json CHANGED
@@ -1,8 +1,14 @@
 [
-  {
-    "idx": 0,
-    "name": "0",
-    "path": "",
-    "type": "sentence_transformers.models.Transformer"
-  }
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "custom_st.MultiModalTransformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
 ]
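
For reference, `modules.json` is the Sentence Transformers pipeline definition: module `0` (with `"path": ""`, i.e. the repository root) is now the custom `MultiModalTransformer`, and module `1` adds an L2 `Normalize` step loaded from `1_Normalize/`, which is presumably why the README no longer passes `normalize_embeddings=True`. Conceptually, the loaded pipeline is equivalent to the hand-built one below (a sketch only; in practice `SentenceTransformer(model_name, trust_remote_code=True)` assembles it from `modules.json`, and `model_name` is a placeholder):

```python
from sentence_transformers import SentenceTransformer, models
from custom_st import MultiModalTransformer

model_name = "..."  # placeholder: this repository's Hub id

modules = [
    # idx 0, "path": "" -> loaded from the repository root
    # (may additionally need model_args={"trust_remote_code": True} for the custom architecture)
    MultiModalTransformer(model_name),
    # idx 1, "path": "1_Normalize" -> L2-normalizes the sentence embeddings
    models.Normalize(),
]
model = SentenceTransformer(modules=modules)
```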
sentence_bert_config.json CHANGED
@@ -1,4 +1,7 @@
 {
   "max_seq_length": 2048,
-  "do_lower_case": false
+  "do_lower_case": false,
+  "tokenizer_args": {
+    "padding_side": "right"
+  }
 }
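
The new `tokenizer_args` block replaces the `tokenizer_kwargs={"padding_side": "right"}` argument that the README previously passed at load time, so right-padding should now be applied automatically when the tokenizer is created. A quick sanity check (a sketch, with `model_name` again a placeholder for this repository's Hub id):

```python
from sentence_transformers import SentenceTransformer

model_name = "..."  # placeholder: this repository's Hub id
model = SentenceTransformer(model_name, trust_remote_code=True)
print(model.tokenizer.padding_side)  # expected: "right"
```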