the embeddings from Transformers and Sentence Transformers do not match

#1
by kenzo226 - opened

Thanks for sharing this model. I found that the embeddings generated from the two interfaces do not match. Is there anything missing?

# sentence transformer
from sentence_transformers import SentenceTransformer
# Load the model through the Sentence Transformers interface.
general_model = SentenceTransformer("infgrad/stella-base-zh-v3-1792d")

text = ["B没打篮球是因为受伤了。"]
# Encode the `text` list defined above and ask the library for
# normalized embeddings.
text_vectors = general_model.encode(text, normalize_embeddings=True)
# array([-0.00718741, -0.01593147,  0.01091087, -0.0029516 ,  0.02063012], dtype=float32)

# transformers: embed the same `text` with the plain Hugging Face model,
# pooling and normalizing by hand.
from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import normalize

model = AutoModel.from_pretrained('infgrad/stella-base-zh-v3-1792d')
tokenizer = AutoTokenizer.from_pretrained('infgrad/stella-base-zh-v3-1792d')

# Tokenize the batch: pad to the longest item, truncate at 1024 tokens,
# and return PyTorch tensors.
batch_data = tokenizer(
    text=text,
    padding="longest",
    return_tensors="pt",
    max_length=1024,
    truncation=True,
)

attention_mask = batch_data["attention_mask"]
model_output = model(**batch_data)
# Zero the hidden states at masked (padding) positions so they do not
# contribute to the sum below.
last_hidden = model_output.last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
# Mean pooling: sum over dim 1, divided by the number of attended tokens
# in each sequence.
vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
# L2-normalize each row with scikit-learn's `normalize`.
vectors = normalize(vectors.detach(), norm="l2", axis=1, )
print(vectors.shape)  # 1,768
# array([-0.00420516,  0.04585313,  0.03543789, -0.02160062, -0.05666691])

@kenzo226 Hi, the vectors have a shape of [batch-size, 1792]: I added a Linear layer (1792 output features) on top of the last_hidden_state, so you cannot get the vectors from the BERT model alone. Please check the modules.json file:

[
 {
  "idx": 0,
  "name": "0",
  "path": "",
  "type": "sentence_transformers.models.Transformer"
 },
 {
  "idx": 1,
  "name": "1",
  "path": "1_Pooling",
  "type": "sentence_transformers.models.Pooling"
 },
 {
  "idx": 2,
  "name": "2",
  "path": "2_Dense",
  "type": "sentence_transformers.models.Dense"
 }
]

Please use SentenceTransformer to get the vectors, or apply the fully connected layer manually (i.e. WX + B); W and B are in ./2_Dense/pytorch_model.bin

Thanks for the reply! I still want to give the original Transformers interface a try. I tried using a dense network with the weights loaded from 2_Dense; the shapes match, but the values do not. I am not sure whether something is missing or whether I did something incorrectly, possibly in the pooling operation.

# transformer bert output
import importlib

import torch
from sklearn.preprocessing import normalize
from torch import nn
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained('infgrad/stella-base-zh-v3-1792d')
tokenizer = AutoTokenizer.from_pretrained('infgrad/stella-base-zh-v3-1792d')

# Tokenize the batch: pad to the longest item, truncate at 1024 tokens,
# and return PyTorch tensors.
batch_data = tokenizer(
    text=text,
    padding="longest",
    return_tensors="pt",
    max_length=1024,
    truncation=True,
)

# pooling: masked mean over dim 1 of the last hidden state
attention_mask = batch_data["attention_mask"]
model_output = model(**batch_data)
# Zero the hidden states at masked (padding) positions so they do not
# contribute to the sum below.
last_hidden = model_output.last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
# Sum over dim 1, divided by the number of attended tokens per sequence.
vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
# NOTE(review): this L2-normalizes the pooled vectors *before* the dense
# layer is applied; the training code quoted later in this thread normalizes
# only the final vectors.
vectors = normalize(vectors.detach(), norm="l2", axis=1, ) # 1, 768

# dense layer
class CustomLinear(nn.Module):
    """Linear projection optionally followed by an activation module.

    The projection is stored under the attribute name ``linear``, so the
    module's state-dict keys are ``linear.weight`` and ``linear.bias``.

    Args:
        in_features: Size of each input vector.
        out_features: Size of each output vector.
        bias: Whether the linear layer has a bias term.
        activation_function: Zero-argument callable (typically an
            ``nn.Module`` class) that builds the activation, or ``None``
            to apply no activation.
    """

    def __init__(self, in_features, out_features, bias=True, activation_function=None):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias=bias)
        # The factory is called here, so callers pass the class itself
        # rather than an instance.
        self.activation = None if activation_function is None else activation_function()

    def forward(self, x):
        """Project ``x`` and apply the activation when one is configured."""
        projected = self.linear(x)
        if self.activation is None:
            return projected
        return self.activation(projected)

# Load configuration
config = {
    "in_features": 768,
    "out_features": 1792,
    "bias": True,
    "activation_function": "torch.nn.modules.linear.Identity"
}

# Resolve the activation class from its dotted path with importlib instead of
# eval(): the string is configuration data, and eval() would execute whatever
# expression it contained.
activation_module, _, activation_name = config["activation_function"].rpartition(".")
activation_cls = getattr(importlib.import_module(activation_module), activation_name)

# Create the model
dense_model = CustomLinear(
    in_features=config["in_features"],
    out_features=config["out_features"],
    bias=config["bias"],
    activation_function=activation_cls,
)

# map_location="cpu" lets the checkpoint load on a machine without the device
# it was saved from.
dense_model.load_state_dict(
    torch.load('/stella-base-zh-v3-1792d/2_Dense/pytorch_model.bin', map_location="cpu")
)
dense_model.eval()

# Inference only, so skip building the autograd graph.
with torch.no_grad():
    dense_output = dense_model(torch.tensor(vectors, dtype=torch.float))
dense_output.detach().numpy() # 1, 1792

# array([-0.00759222, -0.01908403,  0.00410945,  0.00283392,  0.02646818], dtype=float32), transformer result
# array([-0.00718741, -0.01593147,  0.01091087, -0.0029516 ,  0.02063012], dtype=float32), sentence transformer result

This is my training code, please check.


# self.bert is the BERT model; self.vec_linear is presumably the Linear layer
# with 1792 output features mentioned earlier in this thread (not shown here).
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # outputs[0] is last_hidden_state, shaped (batch, seq_len, embedding)
        last_hidden_state = outputs[0]
        # The projection is applied to every token vector, before pooling.
        last_hidden_state = self.vec_linear(last_hidden_state)
        # Zero the projected states at masked (padding) positions.
        last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
        # Masked mean over the sequence dimension.
        vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
        # L2 normalization is the last step, applied to the pooled vectors.
        vectors = F.normalize(vectors, 2.0, dim=1)

Please remove vectors = normalize(vectors.detach(), norm="l2", axis=1, ) # 1, 768, and normalize dense_output instead.

Thanks for the reply, the values match after rearranging the normalization operation:)

This comment has been hidden

Sign up or log in to comment