ZNV Embedding utilizes a 6B LLM (Large Language Model) for embedding, achieving excellent embedding results.

In a single inference, we used two prompts to extract two different embeddings for a sentence, and then concatenated them.

Model usage method:

  1. Define ZNVEmbeddingModel
import os
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer, AutoConfig,
)
import torch
import torch.nn.functional as F
import numpy as np


class ZNVEmbeddingModel(torch.nn.Module):
    def __init__(self, model_name_or_path):
        super(ZNVEmbeddingModel, self).__init__()
        self.prompt_prefix = "阅读下文,然后答题\n"
        self.prompt_suffixes = ["\n1.一个字总结上文的意思是:",
                                "\n2.上文深层次的意思是:"]
        self.hidden_size = 4096
        self.model_name_or_path = model_name_or_path
        self.linear_suffixes = torch.nn.ModuleList(
            [torch.nn.Linear(self.hidden_size, self.hidden_size//len(self.prompt_suffixes))
             for _ in range(len(self.prompt_suffixes))])
        self.tokenizer, self.llama = self.load_llama()

        self.tanh = torch.nn.Tanh()
        self.suffixes_ids = []
        self.suffixes_ids_len = []
        self.suffixes_len = 0
        for suffix in self.prompt_suffixes:
            ids = self.tokenizer(suffix, return_tensors="pt")["input_ids"].tolist()[0]
            self.suffixes_ids += ids
            self.suffixes_ids_len.append(len(ids))
            self.suffixes_len += len(ids)

        self.suffixes_ones = torch.ones(self.suffixes_len)
        self.suffixes_ids = torch.tensor(self.suffixes_ids)

        linear_file = os.path.join(model_name_or_path, "linears")
        load_layers = torch.load(linear_file)
        model_state = self.state_dict()
        model_state.update(load_layers)
        self.load_state_dict(model_state, strict=False)

    def load_llama(self):
        llm_path = os.path.join(self.model_name_or_path)
        config = AutoConfig.from_pretrained(llm_path)
        tokenizer = LlamaTokenizer.from_pretrained(self.model_name_or_path)
        tokenizer.padding_side = "left"
        model = LlamaForCausalLM.from_pretrained(
            llm_path,
            config=config,
            low_cpu_mem_usage=True
        )
        model.config.use_cache = False
        return tokenizer, model

    def forward(self, sentences):
        prompts_embeddings = []
        sentences = [self.prompt_prefix + s for s in sentences]
        inputs = self.tokenizer(sentences, max_length=256, padding=True, truncation=True,
                                return_tensors='pt')
        attention_mask = inputs["attention_mask"]
        input_ids = inputs["input_ids"]
        batch_size = len(sentences)
        suffixes_ones = self.suffixes_ones.unsqueeze(0)
        suffixes_ones = suffixes_ones.repeat(batch_size, 1)
        device = next(self.parameters()).device
        attention_mask = torch.cat([attention_mask, suffixes_ones], dim=-1).to(device)

        suffixes_ids = self.suffixes_ids.unsqueeze(0)
        suffixes_ids = suffixes_ids.repeat(batch_size, 1)
        input_ids = torch.cat([input_ids, suffixes_ids], dim=-1).to(device)
        last_hidden_state = self.llama.base_model.base_model(attention_mask=attention_mask, input_ids=input_ids).last_hidden_state
        index = -1
        for i in range(len(self.suffixes_ids_len)):
            embedding = last_hidden_state[:, index, :]
            embedding = self.linear_suffixes[i](embedding)
            prompts_embeddings.append(embedding)
            index -= self.suffixes_ids_len[-i-1]

        output_embedding = torch.cat(prompts_embeddings, dim=-1)
        output_embedding = self.tanh(output_embedding)
        output_embedding = F.normalize(output_embedding, p=2, dim=1)
        return output_embedding

    def encode(self, sentences, batch_size=10, **kwargs):
        size = len(sentences)
        embeddings = None
        handled = 0
        while handled < size:
            tokens = sentences[handled:handled + batch_size]
            output_embeddings = self.forward(tokens)
            result = output_embeddings.cpu().numpy()
            handled += result.shape[0]
            if embeddings is not None:
                embeddings = np.concatenate((embeddings, result), axis=0)
            else:
                embeddings = result
        return embeddings
  1. Use ZNVEmbeddingModel for Embedding.
znv_model = ZNVEmbeddingModel("your_model_path")
znv_model.eval()
with torch.no_grad():
    output = znv_model(["请问你的电话号码是多少?","可以告诉我你的手机号吗?"])
    cos_sim = F.cosine_similarity(output[0],output[1],dim=0)
    print(cos_sim)
Downloads last month
601
Safetensors
Model size
6.06B params
Tensor type
BF16
·
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Spaces using sentosa/ZNV-Embedding 2

Evaluation results