the embeddings from Transformers and Sentence Transformers do not match
Thanks for sharing this model. I found that the embeddings generated through the two interfaces below do not match. Am I missing something?
# sentence transformer
from sentence_transformers import SentenceTransformer
# Load the checkpoint through sentence-transformers.
general_model = SentenceTransformer("infgrad/stella-base-zh-v3-1792d")
text = ["B没打篮球是因为受伤了。"]
# Encode the `text` list defined above (the snippet originally passed an
# undefined name, `corpus`) and L2-normalize the embeddings so they can be
# compared with the transformers-based result.
text_vectors = general_model.encode(text, normalize_embeddings=True)
# array([-0.00718741, -0.01593147, 0.01091087, -0.0029516 , 0.02063012], dtype=float32)
# transformers
from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import normalize
# Load the model and tokenizer through plain transformers (no
# sentence-transformers pipeline involved).
model = AutoModel.from_pretrained('infgrad/stella-base-zh-v3-1792d')
tokenizer = AutoTokenizer.from_pretrained('infgrad/stella-base-zh-v3-1792d')
# Tokenize `text`: pad to the longest item in the batch, truncate at 1024
# tokens, and return PyTorch tensors.
batch_data = tokenizer(
text=text,
padding="longest",
return_tensors="pt",
max_length=1024,
truncation=True,
)
attention_mask = batch_data["attention_mask"]
model_output = model(**batch_data)
# Zero the hidden states at padded positions so they do not contribute to
# the sum below.
last_hidden = model_output.last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
# Mean pooling: sum over dim 1 divided by the number of attended tokens.
vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
# L2-normalize each row with sklearn's `normalize`.
vectors = normalize(vectors.detach(), norm="l2", axis=1, )
# NOTE: per the model author's reply in this thread, the checkpoint also has
# a Dense module (2_Dense, 1792 output features) that this snippet never
# applies, which is why the result is 768-d and differs from the
# SentenceTransformer output.
print(vectors.shape) # 1,768
# array([-0.00420516, 0.04585313, 0.03543789, -0.02160062, -0.05666691])
@kenzo226
Hi, the vectors have a shape of [batch_size, 1792]: I added a Linear layer (1792 output features) on top of last_hidden_state, so you cannot get the vectors from the BERT model alone. Please check the modules.json
file:
[
{
"idx": 0,
"name": "0",
"path": "",
"type": "sentence_transformers.models.Transformer"
},
{
"idx": 1,
"name": "1",
"path": "1_Pooling",
"type": "sentence_transformers.models.Pooling"
},
{
"idx": 2,
"name": "2",
"path": "2_Dense",
"type": "sentence_transformers.models.Dense"
}
]
Please use SentenceTransformer to get the vectors, or apply the fully connected layer manually (i.e. Wx + b); W and b are stored in ./2_Dense/pytorch_model.bin
Thanks for the reply! I still want to give plain transformers a try. I added the dense layer and loaded its weights from 2_Dense; the shapes match, but the values do not. I am not sure whether something is missing or whether I did something incorrectly, possibly in the pooling operation.
# transformer bert output
import torch
from sklearn.preprocessing import normalize
from torch import nn
from transformers import AutoModel, AutoTokenizer
# Load the model and tokenizer through plain transformers.
model = AutoModel.from_pretrained('infgrad/stella-base-zh-v3-1792d')
tokenizer = AutoTokenizer.from_pretrained('infgrad/stella-base-zh-v3-1792d')
# Tokenize `text`: pad to the longest item in the batch, truncate at 1024
# tokens, and return PyTorch tensors.
batch_data = tokenizer(
text=text,
padding="longest",
return_tensors="pt",
max_length=1024,
truncation=True,
)
# pooling
attention_mask = batch_data["attention_mask"]
model_output = model(**batch_data)
# Zero the hidden states at padded positions so they do not contribute to
# the sum below.
last_hidden = model_output.last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
# Mean pooling: sum over dim 1 divided by the number of attended tokens.
vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
# NOTE: this L2-normalizes the pooled vectors BEFORE the dense layer is
# applied; the model author's reply later in the thread asks to remove this
# line and normalize the dense output instead.
vectors = normalize(vectors.detach(), norm="l2", axis=1, ) # 1, 768
# dense layer
class CustomLinear(nn.Module):
    """Linear projection with an optional activation applied to its output.

    Args:
        in_features: Size of each input vector.
        out_features: Size of each output vector.
        bias: Whether the linear layer has an additive bias term.
        activation_function: A zero-argument callable (typically an
            ``nn.Module`` class such as ``nn.Identity``) that is called once
            to build the activation. ``None`` means no activation.
    """

    def __init__(self, in_features, out_features, bias=True, activation_function=None):
        super().__init__()
        # The attribute must stay named `linear`: the parameters are
        # registered as `linear.weight` / `linear.bias`, and the 2_Dense
        # checkpoint loaded into this module presumably uses those keys.
        self.linear = nn.Linear(in_features, out_features, bias=bias)
        # Instantiate the activation here; the caller passes the class, not
        # an instance.
        self.activation = activation_function() if activation_function is not None else None

    def forward(self, x):
        """Return ``activation(linear(x))``, or ``linear(x)`` if no activation is set."""
        projected = self.linear(x)
        if self.activation is None:
            return projected
        return self.activation(projected)
# Load configuration
# Hard-coded here; presumably mirrors the model's 2_Dense config
# (768 -> 1792, with bias, identity activation) - TODO confirm against the
# checkpoint.
config = {
"in_features": 768,
"out_features": 1792,
"bias": True,
"activation_function": "torch.nn.modules.linear.Identity"
}
# Create the model
# `eval` turns the dotted path string into the class object. This is only
# acceptable because the string is the hard-coded literal above; never do
# this with a value read from an untrusted file.
dense_model = CustomLinear(
in_features=config["in_features"],
out_features=config["out_features"],
bias=config["bias"],
activation_function=eval(config["activation_function"])
)
# Load the weights saved under 2_Dense into the custom layer.
dense_model.load_state_dict(torch.load('/stella-base-zh-v3-1792d/2_Dense/pytorch_model.bin'))
# Switch the module to evaluation mode.
dense_model.eval()
# `vectors` is the array returned by sklearn's `normalize` above, i.e. it was
# already L2-normalized before this projection, and the projected output is
# not normalized afterwards. The author's reply later in the thread asks for
# the opposite order.
dense_output = dense_model(torch.tensor(vectors, dtype=torch.float))
dense_output.detach().numpy() # 1, 1792
# array([-0.00759222, -0.01908403, 0.00410945, 0.00283392, 0.02646818], dtype=float32), transformer result
# array([-0.00718741, -0.01593147, 0.01091087, -0.0029516 , 0.02063012], dtype=float32), sentence transformer result
This is my training code; please check it.
# Fragment of the training-time forward pass (the enclosing method is not
# shown in the post).
# self.bert is the BERT model
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# last_hidden_state has shape bsz * seq_len * embedding
last_hidden_state = outputs[0]
# Apply the linear projection to every token BEFORE pooling.
# (self.vec_linear is presumably the Linear layer with 1792 output features
# mentioned earlier in the thread - TODO confirm.)
last_hidden_state = self.vec_linear(last_hidden_state)
# Zero the projected states at padded positions, then mean-pool over dim 1
# using the number of attended tokens as the divisor.
last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
# The only normalization in this fragment: L2 over dim 1, after projection
# and pooling.
vectors = F.normalize(vectors, 2.0, dim=1)
Please remove the line vectors = normalize(vectors.detach(), norm="l2", axis=1, ) # 1, 768 (the pooled vectors must not be normalized before the dense layer),
and normalize dense_output instead.
Thanks for the reply! The values match after moving the normalization so that it is applied after the dense layer :)