from dataclasses import dataclass
from typing import Optional, Tuple

import paddle
from paddle import nn

from paddlenlp.transformers import RobertaConfig as XLMRobertaConfig
from paddlenlp.transformers import RobertaModel as XLMRobertaModel
from paddlenlp.transformers import RobertaPretrainedModel
from paddlenlp.transformers.model_outputs import ModelOutput


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids (`paddle.Tensor`): Token indices, possibly containing padding.
        padding_idx (`int`): The index used for padding tokens.
        past_key_values_length (`int`, *optional*, defaults to 0): Offset added when past key values are cached.

    Returns:
        `paddle.Tensor`: Position ids with the same shape as `input_ids`.
    """
    mask = (input_ids != padding_idx).cast("int64")
    incremental_indices = (paddle.cumsum(mask, axis=1) + past_key_values_length) * mask
    return incremental_indices + padding_idx
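
# Worked example (illustrative): for input_ids = [[0, 31414, 232, 2, 1, 1]] with padding_idx = 1,
# the mask is [[1, 1, 1, 1, 0, 0]], its cumulative sum is [[1, 2, 3, 4, 4, 4]], and after
# re-masking and adding padding_idx the position ids are [[2, 3, 4, 5, 1, 1]]: real tokens
# count up from padding_idx + 1 while padding positions stay at padding_idx.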


@dataclass
class TransformationModelOutput(ModelOutput):
    """
    Base class for text model's outputs that also contains a projection of the last hidden states.

    Args:
        projection_state (`paddle.Tensor` of shape `(batch_size, sequence_length, project_dim)`, *optional*):
            The text embeddings obtained by applying the projection layer to the last hidden state.
        last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    projection_state: Optional[paddle.Tensor] = None
    last_hidden_state: paddle.Tensor = None
    hidden_states: Optional[Tuple[paddle.Tensor]] = None
    attentions: Optional[Tuple[paddle.Tensor]] = None


class RobertaSeriesConfig(XLMRobertaConfig):
    model_type = "roberta"

    def __init__(
        self,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        project_dim=512,
        pooler_fn="cls",
        learn_encoder=False,
        use_attention_mask=True,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
        self.project_dim = project_dim
        self.pooler_fn = pooler_fn
        self.learn_encoder = learn_encoder
        self.use_attention_mask = use_attention_mask
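
# Note: `project_dim`, `pooler_fn`, `learn_encoder` and `use_attention_mask` are stored on the
# config itself; the special token ids get XLM-R style defaults, and any remaining keyword
# arguments (e.g. `hidden_size`, `num_hidden_layers`) are forwarded to the underlying `RobertaConfig`.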


class RobertaSeriesModelWithTransformation(RobertaPretrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
    base_model_prefix = "roberta"
    config_class = RobertaSeriesConfig

    def __init__(self, config: RobertaSeriesConfig):
        super().__init__(config)
        self.roberta = XLMRobertaModel(config)
        self.transformation = nn.Linear(config.hidden_size, config.project_dim)
        self.apply(self.init_weights)

    def forward(
        self,
        input_ids: Optional[paddle.Tensor] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        token_type_ids: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Build position ids from the inputs when they are not provided explicitly.
        if position_ids is None:
            position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id)

        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Project the final hidden states into the `project_dim`-dimensional text embedding space.
        projection_state = self.transformation(outputs.last_hidden_state)

        return TransformationModelOutput(
            projection_state=projection_state,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
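

# A minimal smoke-test sketch, assuming a PaddleNLP version compatible with the config-based
# imports above. The hyperparameters below are illustrative and do not correspond to a released
# AltDiffusion checkpoint (real weights would normally be loaded via `from_pretrained`).
if __name__ == "__main__":
    config = RobertaSeriesConfig(
        vocab_size=1000,  # illustrative values, kept small for a quick run
        hidden_size=64,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=128,
        max_position_embeddings=64,
        project_dim=32,  # width of the extra `transformation` projection
    )
    model = RobertaSeriesModelWithTransformation(config)
    model.eval()

    # Token id 1 is the default padding index, so the last two positions are treated as padding.
    input_ids = paddle.to_tensor([[0, 5, 6, 7, 2, 1, 1]])
    with paddle.no_grad():
        out = model(input_ids=input_ids, return_dict=True)

    print(out.last_hidden_state.shape)  # [1, 7, 64]
    print(out.projection_state.shape)  # [1, 7, 32]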