---
# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
# Doc / guide: https://huggingface.co/docs/hub/model-cards
tags:
- FlagEmbedding
- Embedding
- Hybrid Retrieval
- ONNX
- Optimum
- ONNXRuntime
- Multilingual
license: mit
base_model: BAAI/bge-m3
---

# Model Card for philipchung/bge-m3-onnx

This is the [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) inference model converted to ONNX format for use with Optimum ONNX Runtime with CPU acceleration. The model outputs all three embedding types (dense, sparse, ColBERT).

No ONNX optimizations are applied to this model. If you want to apply optimizations, use the export script included in this repo to generate an optimized version of the ONNX model (a rough illustration of what that step can look like with Optimum's `ORTOptimizer` appears at the end of this card). Some of the code is adapted from [aapot/bge-m3-onnx](https://huggingface.co/aapot/bge-m3-onnx).

The model in this repo inherits from `PreTrainedModel`, so the ONNX model can be downloaded from the Hugging Face Hub and loaded directly with the `from_pretrained()` method.

## How to Use

```python
from collections import defaultdict

import numpy as np
from optimum.onnxruntime import ORTModelForCustomTasks
from transformers import AutoTokenizer

# Download ONNX model from the Hugging Face Hub
onnx_model = ORTModelForCustomTasks.from_pretrained("philipchung/bge-m3-onnx")
tokenizer = AutoTokenizer.from_pretrained("philipchung/bge-m3-onnx")

# Inference forward pass
sentences = ["First test sentence.", "Second test sentence"]
inputs = tokenizer(
    sentences,
    padding="longest",
    return_tensors="np",
)
outputs = onnx_model.forward(**inputs)


def process_token_weights(
    token_weights: np.ndarray, input_ids: list
) -> defaultdict[str, float]:
    """Convert sparse token weights into a dictionary mapping token indices
    to their corresponding weights.

    Adapted from the _process_token_weights() function defined within the
    encode() method of the original FlagEmbedding.bge_m3.BGEM3FlagModel.
    """
    result: defaultdict[str, float] = defaultdict(int)
    unused_tokens = {
        tokenizer.cls_token_id,
        tokenizer.eos_token_id,
        tokenizer.pad_token_id,
        tokenizer.unk_token_id,
    }
    # Keep the maximum weight seen for each token index.
    for w, idx in zip(token_weights, input_ids, strict=False):
        if idx not in unused_tokens and w > 0:
            idx = str(idx)
            if w > result[idx]:
                result[idx] = w
    return result


# Each sentence yields a dict[str, list[float] | dict[str, float] | list[list[float]]]
# containing its dense, sparse, and colbert embeddings.
embeddings_list = []
for input_ids, dense_vec, sparse_vec, colbert_vec in zip(
    inputs["input_ids"],
    outputs["dense_vecs"],
    outputs["sparse_vecs"],
    outputs["colbert_vecs"],
    strict=False,
):
    # Convert token weights into a dictionary of token indices and weights
    token_weights = sparse_vec.astype(float).squeeze(-1)
    sparse_embeddings = process_token_weights(
        token_weights,
        input_ids.tolist(),
    )
    multivector_embedding = {
        "dense": dense_vec.astype(float).tolist(),  # (1024,)
        "sparse": dict(sparse_embeddings),  # dict[token_index, weight]
        "colbert": colbert_vec.astype(float).tolist(),  # (token_len, 1024)
    }
    embeddings_list.append(multivector_embedding)
```
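The three embedding types can be combined into a single hybrid relevance score, following the scheme used by `FlagEmbedding.BGEM3FlagModel`: inner product for the dense vectors, weight overlap on shared tokens for the sparse vectors, and max-sim late interaction for the ColBERT vectors. Below is a minimal sketch against the `embeddings_list` produced above, treating the first sentence as the query and the second as the document purely for illustration; the `weights` values are an assumption, not something shipped with this repo.

```python
import numpy as np


def dense_score(q: list[float], d: list[float]) -> float:
    """Inner product of the (already normalized) dense vectors."""
    return float(np.dot(q, d))


def sparse_score(q: dict[str, float], d: dict[str, float]) -> float:
    """Sum of weight products over token indices shared by query and document."""
    return float(sum(w * d[t] for t, w in q.items() if t in d))


def colbert_score(q: list[list[float]], d: list[list[float]]) -> float:
    """Late interaction: mean over query tokens of max similarity to any doc token."""
    sim = np.asarray(q) @ np.asarray(d).T  # (query_tokens, doc_tokens)
    return float(sim.max(axis=1).mean())


query, doc = embeddings_list[0], embeddings_list[1]
weights = {"dense": 0.4, "sparse": 0.2, "colbert": 0.4}  # assumed weights, tune for your task
hybrid_score = (
    weights["dense"] * dense_score(query["dense"], doc["dense"])
    + weights["sparse"] * sparse_score(query["sparse"], doc["sparse"])
    + weights["colbert"] * colbert_score(query["colbert"], doc["colbert"])
)
print(f"Hybrid relevance score: {hybrid_score:.4f}")
```

Note that with batch padding (`padding="longest"`), the ColBERT vectors of shorter sentences may include padded positions, which can slightly affect the max-sim score.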
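If you want to experiment with graph optimizations outside of the export script, the sketch below shows one way to do it with Optimum's `ORTOptimizer`. This is a hedged illustration, not the repo's method: the save path is a placeholder, and `ORTOptimizer` may not support every custom-task model, in which case use the export script included in this repo instead.

```python
from optimum.onnxruntime import ORTModelForCustomTasks, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

model = ORTModelForCustomTasks.from_pretrained("philipchung/bge-m3-onnx")
optimizer = ORTOptimizer.from_pretrained(model)

# Level 1 applies basic, hardware-independent graph optimizations.
optimization_config = OptimizationConfig(optimization_level=1)
optimizer.optimize(
    optimization_config=optimization_config,
    save_dir="bge-m3-onnx-optimized",  # placeholder output directory
)
```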