vince62s committed · verified
Commit b225361 · 1 Parent(s): 36b0403

Upload 2 files

Files changed (2)
  1. configuration_xlm_roberta.py +157 -0
  2. modelling_xlm_roberta.py +1706 -0
configuration_xlm_roberta.py ADDED
@@ -0,0 +1,157 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ XLM-RoBERTa configuration"""
17
+ from collections import OrderedDict
18
+ from typing import Mapping
19
+
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.onnx import OnnxConfig
22
+ from transformers.utils import logging
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
+ #from transformers.deprecated._archive_maps import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP # noqa: F401, E402
29
+
30
+
31
+ class XLMRobertaConfig(PretrainedConfig):
32
+ r"""
33
+ This is the configuration class to store the configuration of a [`XLMRobertaModel`] or a [`TFXLMRobertaModel`]. It
34
+ is used to instantiate a XLM-RoBERTa model according to the specified arguments, defining the model architecture.
35
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the XLMRoBERTa
36
+ [FacebookAI/xlm-roberta-base](https://huggingface.co/FacebookAI/xlm-roberta-base) architecture.
37
+
38
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
39
+ documentation from [`PretrainedConfig`] for more information.
40
+
41
+
42
+ Args:
43
+ vocab_size (`int`, *optional*, defaults to 30522):
44
+ Vocabulary size of the XLM-RoBERTa model. Defines the number of different tokens that can be represented by
45
+ the `input_ids` passed when calling [`XLMRobertaModel`] or [`TFXLMRobertaModel`].
46
+ hidden_size (`int`, *optional*, defaults to 768):
47
+ Dimensionality of the encoder layers and the pooler layer.
48
+ num_hidden_layers (`int`, *optional*, defaults to 12):
49
+ Number of hidden layers in the Transformer encoder.
50
+ num_attention_heads (`int`, *optional*, defaults to 12):
51
+ Number of attention heads for each attention layer in the Transformer encoder.
52
+ intermediate_size (`int`, *optional*, defaults to 3072):
53
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
54
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
55
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
56
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
57
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
58
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
59
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
60
+ The dropout ratio for the attention probabilities.
61
+ max_position_embeddings (`int`, *optional*, defaults to 512):
62
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
63
+ just in case (e.g., 512 or 1024 or 2048).
64
+ type_vocab_size (`int`, *optional*, defaults to 2):
65
+ The vocabulary size of the `token_type_ids` passed when calling [`XLMRobertaModel`] or
66
+ [`TFXLMRobertaModel`].
67
+ initializer_range (`float`, *optional*, defaults to 0.02):
68
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
69
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
70
+ The epsilon used by the layer normalization layers.
71
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
72
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
73
+ positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
74
+ [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
75
+ For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
76
+ with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
77
+ is_decoder (`bool`, *optional*, defaults to `False`):
78
+ Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
79
+ use_cache (`bool`, *optional*, defaults to `True`):
80
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
81
+ relevant if `config.is_decoder=True`.
82
+ classifier_dropout (`float`, *optional*):
83
+ The dropout ratio for the classification head.
84
+
85
+ Examples:
86
+
87
+ ```python
88
+ >>> from transformers import XLMRobertaConfig, XLMRobertaModel
89
+
90
+ >>> # Initializing a XLM-RoBERTa FacebookAI/xlm-roberta-base style configuration
91
+ >>> configuration = XLMRobertaConfig()
92
+
93
+ >>> # Initializing a model (with random weights) from the FacebookAI/xlm-roberta-base style configuration
94
+ >>> model = XLMRobertaModel(configuration)
95
+
96
+ >>> # Accessing the model configuration
97
+ >>> configuration = model.config
98
+ ```"""
99
+
100
+ model_type = "xlm-roberta"
101
+
102
+ def __init__(
103
+ self,
104
+ vocab_size=30522,
105
+ hidden_size=768,
106
+ num_hidden_layers=12,
107
+ num_attention_heads=12,
108
+ intermediate_size=3072,
109
+ hidden_act="gelu",
110
+ hidden_dropout_prob=0.1,
111
+ attention_probs_dropout_prob=0.1,
112
+ max_position_embeddings=512,
113
+ type_vocab_size=2,
114
+ initializer_range=0.02,
115
+ layer_norm_eps=1e-12,
116
+ pad_token_id=1,
117
+ bos_token_id=0,
118
+ eos_token_id=2,
119
+ position_embedding_type="absolute",
120
+ use_cache=True,
121
+ classifier_dropout=None,
122
+ **kwargs,
123
+ ):
124
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
125
+
126
+ self.vocab_size = vocab_size
127
+ self.hidden_size = hidden_size
128
+ self.num_hidden_layers = num_hidden_layers
129
+ self.num_attention_heads = num_attention_heads
130
+ self.hidden_act = hidden_act
131
+ self.intermediate_size = intermediate_size
132
+ self.hidden_dropout_prob = hidden_dropout_prob
133
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
134
+ self.max_position_embeddings = max_position_embeddings
135
+ self.type_vocab_size = type_vocab_size
136
+ self.initializer_range = initializer_range
137
+ self.layer_norm_eps = layer_norm_eps
138
+ self.position_embedding_type = position_embedding_type
139
+ self.use_cache = use_cache
140
+ self.classifier_dropout = classifier_dropout
141
+
142
+
143
+ # Copied from transformers.models.roberta.configuration_roberta.RobertaOnnxConfig with Roberta->XLMRoberta
144
+ class XLMRobertaOnnxConfig(OnnxConfig):
145
+ @property
146
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
147
+ if self.task == "multiple-choice":
148
+ dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
149
+ else:
150
+ dynamic_axis = {0: "batch", 1: "sequence"}
151
+ return OrderedDict(
152
+ [
153
+ ("input_ids", dynamic_axis),
154
+ ("attention_mask", dynamic_axis),
155
+ ]
156
+ )
157
+
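For reference, a minimal usage sketch of the two classes defined above (illustration only, not part of the uploaded file; it assumes the file is importable from the working directory, matching the flat import used in modelling_xlm_roberta.py):

```python
# Illustrative sketch only: exercising the classes from configuration_xlm_roberta.py.
from configuration_xlm_roberta import XLMRobertaConfig, XLMRobertaOnnxConfig

# A configuration built from the defaults declared above.
config = XLMRobertaConfig()
print(config.hidden_size, config.num_hidden_layers)  # 768 12

# The ONNX config exposes the dynamic axes declared in its `inputs` property for export tooling.
onnx_config = XLMRobertaOnnxConfig(config)
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'})])
```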
modelling_xlm_roberta.py ADDED
@@ -0,0 +1,1706 @@
1
+ # coding=utf-8
2
+ # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch XLM-RoBERTa model."""
17
+
18
+ import math
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.utils.checkpoint
23
+ from torch import nn
24
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
25
+
26
+ from transformers.activations import ACT2FN, gelu
27
+ from transformers.modeling_outputs import (
28
+ BaseModelOutputWithPastAndCrossAttentions,
29
+ BaseModelOutputWithPoolingAndCrossAttentions,
30
+ CausalLMOutputWithCrossAttentions,
31
+ MaskedLMOutput,
32
+ MultipleChoiceModelOutput,
33
+ QuestionAnsweringModelOutput,
34
+ SequenceClassifierOutput,
35
+ TokenClassifierOutput,
36
+ )
37
+ from transformers import PreTrainedModel
38
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
39
+ from transformers.utils import (
40
+ add_code_sample_docstrings,
41
+ add_start_docstrings,
42
+ add_start_docstrings_to_model_forward,
43
+ logging,
44
+ replace_return_docstrings,
45
+ )
46
+ from configuration_xlm_roberta import XLMRobertaConfig
47
+
48
+
49
+ logger = logging.get_logger(__name__)
50
+
51
+ _CHECKPOINT_FOR_DOC = "FacebookAI/xlm-roberta-base"
52
+ _CONFIG_FOR_DOC = "XLMRobertaConfig"
53
+
54
+
55
+ #from ..deprecated._archive_maps import XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST # noqa: F401, E402
56
+
57
+
58
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->XLMRoberta
59
+ class XLMRobertaEmbeddings(nn.Module):
60
+ """
61
+ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
62
+ """
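+ # Note: the "tiny tweak" is that position ids start at padding_idx + 1 and padded tokens keep
+ # padding_idx as their position id (see create_position_ids_from_input_ids). Illustrative example,
+ # assuming padding_idx=1: input_ids [[0, 31414, 232, 2, 1, 1]] -> position_ids [[2, 3, 4, 5, 1, 1]].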
63
+
64
+ # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
65
+ def __init__(self, config):
66
+ super().__init__()
67
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
68
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
69
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
70
+
71
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
72
+ # any TensorFlow checkpoint file
73
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
74
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
75
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
76
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
77
+ self.register_buffer(
78
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
79
+ )
80
+ self.register_buffer(
81
+ "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
82
+ )
83
+
84
+ # End copy
85
+ self.padding_idx = config.pad_token_id
86
+ self.position_embeddings = nn.Embedding(
87
+ config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
88
+ )
89
+
90
+ def forward(
91
+ self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
92
+ ):
93
+ if position_ids is None:
94
+ if input_ids is not None:
95
+ # Create the position ids from the input token ids. Any padded tokens remain padded.
96
+ position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
97
+ else:
98
+ position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
99
+
100
+ if input_ids is not None:
101
+ input_shape = input_ids.size()
102
+ else:
103
+ input_shape = inputs_embeds.size()[:-1]
104
+
105
+ seq_length = input_shape[1]
106
+
107
+ # When token_type_ids is None (usually because it was left to be auto-generated), fall back to the all-zeros
108
+ # buffer registered in the constructor. The registered buffer lets users trace the model without passing token_type_ids and solves
109
+ # issue #5664
110
+ if token_type_ids is None:
111
+ if hasattr(self, "token_type_ids"):
112
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
113
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
114
+ token_type_ids = buffered_token_type_ids_expanded
115
+ else:
116
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
117
+
118
+ if inputs_embeds is None:
119
+ inputs_embeds = self.word_embeddings(input_ids)
120
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
121
+
122
+ embeddings = inputs_embeds + token_type_embeddings
123
+ if self.position_embedding_type == "absolute":
124
+ position_embeddings = self.position_embeddings(position_ids)
125
+ embeddings += position_embeddings
126
+ embeddings = self.LayerNorm(embeddings)
127
+ embeddings = self.dropout(embeddings)
128
+ return embeddings
129
+
130
+ def create_position_ids_from_inputs_embeds(self, inputs_embeds):
131
+ """
132
+ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
133
+
134
+ Args:
135
+ inputs_embeds: torch.Tensor
136
+
137
+ Returns: torch.Tensor
138
+ """
139
+ input_shape = inputs_embeds.size()[:-1]
140
+ sequence_length = input_shape[1]
141
+
142
+ position_ids = torch.arange(
143
+ self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
144
+ )
145
+ return position_ids.unsqueeze(0).expand(input_shape)
146
+
147
+
148
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->XLMRoberta
149
+ class XLMRobertaSelfAttention(nn.Module):
150
+ def __init__(self, config, position_embedding_type=None):
151
+ super().__init__()
152
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
153
+ raise ValueError(
154
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
155
+ f"heads ({config.num_attention_heads})"
156
+ )
157
+
158
+ self.num_attention_heads = config.num_attention_heads
159
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
160
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
161
+
162
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
163
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
164
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
165
+
166
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
167
+ self.position_embedding_type = position_embedding_type or getattr(
168
+ config, "position_embedding_type", "absolute"
169
+ )
170
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
171
+ self.max_position_embeddings = config.max_position_embeddings
172
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
173
+
174
+ self.is_decoder = config.is_decoder
175
+
176
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
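+ # Reshape (batch, seq_len, all_head_size) -> (batch, num_heads, seq_len, attention_head_size).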
177
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
178
+ x = x.view(new_x_shape)
179
+ return x.permute(0, 2, 1, 3)
180
+
181
+ def forward(
182
+ self,
183
+ hidden_states: torch.Tensor,
184
+ attention_mask: Optional[torch.FloatTensor] = None,
185
+ head_mask: Optional[torch.FloatTensor] = None,
186
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
187
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
188
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
189
+ output_attentions: Optional[bool] = False,
190
+ ) -> Tuple[torch.Tensor]:
191
+ mixed_query_layer = self.query(hidden_states)
192
+
193
+ # If this is instantiated as a cross-attention module, the keys
194
+ # and values come from an encoder; the attention mask needs to be
195
+ # such that the encoder's padding tokens are not attended to.
196
+ is_cross_attention = encoder_hidden_states is not None
197
+
198
+ if is_cross_attention and past_key_value is not None:
199
+ # reuse k,v, cross_attentions
200
+ key_layer = past_key_value[0]
201
+ value_layer = past_key_value[1]
202
+ attention_mask = encoder_attention_mask
203
+ elif is_cross_attention:
204
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
205
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
206
+ attention_mask = encoder_attention_mask
207
+ elif past_key_value is not None:
208
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
209
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
210
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
211
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
212
+ else:
213
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
214
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
215
+
216
+ query_layer = self.transpose_for_scores(mixed_query_layer)
217
+
218
+ use_cache = past_key_value is not None
219
+ if self.is_decoder:
220
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
221
+ # Further calls to cross_attention layer can then reuse all cross-attention
222
+ # key/value_states (first "if" case)
223
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
224
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
225
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
226
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
227
+ past_key_value = (key_layer, value_layer)
228
+
229
+ # Take the dot product between "query" and "key" to get the raw attention scores.
230
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
231
+
232
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
233
+ query_length, key_length = query_layer.shape[2], key_layer.shape[2]
234
+ if use_cache:
235
+ position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
236
+ -1, 1
237
+ )
238
+ else:
239
+ position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
240
+ position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
241
+ distance = position_ids_l - position_ids_r
242
+
243
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
244
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
245
+
246
+ if self.position_embedding_type == "relative_key":
247
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
248
+ attention_scores = attention_scores + relative_position_scores
249
+ elif self.position_embedding_type == "relative_key_query":
250
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
251
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
252
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
253
+
254
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
255
+ if attention_mask is not None:
256
+ # Apply the attention mask (precomputed for all layers in the XLMRobertaModel forward() function)
257
+ attention_scores = attention_scores + attention_mask
258
+
259
+ # Normalize the attention scores to probabilities.
260
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
261
+
262
+ # This is actually dropping out entire tokens to attend to, which might
263
+ # seem a bit unusual, but is taken from the original Transformer paper.
264
+ attention_probs = self.dropout(attention_probs)
265
+
266
+ # Mask heads if we want to
267
+ if head_mask is not None:
268
+ attention_probs = attention_probs * head_mask
269
+
270
+ context_layer = torch.matmul(attention_probs, value_layer)
271
+
272
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
273
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
274
+ context_layer = context_layer.view(new_context_layer_shape)
275
+
276
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
277
+
278
+ if self.is_decoder:
279
+ outputs = outputs + (past_key_value,)
280
+ return outputs
281
+
282
+
283
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput with Roberta->XLMRoberta
284
+ class XLMRobertaSelfOutput(nn.Module):
285
+ def __init__(self, config):
286
+ super().__init__()
287
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
288
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
289
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
290
+
291
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
292
+ hidden_states = self.dense(hidden_states)
293
+ hidden_states = self.dropout(hidden_states)
294
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
295
+ return hidden_states
296
+
297
+
298
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->XLMRoberta
299
+ class XLMRobertaAttention(nn.Module):
300
+ def __init__(self, config, position_embedding_type=None):
301
+ super().__init__()
302
+ self.self = XLMRobertaSelfAttention(config, position_embedding_type=position_embedding_type)
303
+ self.output = XLMRobertaSelfOutput(config)
304
+ self.pruned_heads = set()
305
+
306
+ def prune_heads(self, heads):
307
+ if len(heads) == 0:
308
+ return
309
+ heads, index = find_pruneable_heads_and_indices(
310
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
311
+ )
312
+
313
+ # Prune linear layers
314
+ self.self.query = prune_linear_layer(self.self.query, index)
315
+ self.self.key = prune_linear_layer(self.self.key, index)
316
+ self.self.value = prune_linear_layer(self.self.value, index)
317
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
318
+
319
+ # Update hyper params and store pruned heads
320
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
321
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
322
+ self.pruned_heads = self.pruned_heads.union(heads)
323
+
324
+ def forward(
325
+ self,
326
+ hidden_states: torch.Tensor,
327
+ attention_mask: Optional[torch.FloatTensor] = None,
328
+ head_mask: Optional[torch.FloatTensor] = None,
329
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
330
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
331
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
332
+ output_attentions: Optional[bool] = False,
333
+ ) -> Tuple[torch.Tensor]:
334
+ self_outputs = self.self(
335
+ hidden_states,
336
+ attention_mask,
337
+ head_mask,
338
+ encoder_hidden_states,
339
+ encoder_attention_mask,
340
+ past_key_value,
341
+ output_attentions,
342
+ )
343
+ attention_output = self.output(self_outputs[0], hidden_states)
344
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
345
+ return outputs
346
+
347
+
348
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaIntermediate with Roberta->XLMRoberta
349
+ class XLMRobertaIntermediate(nn.Module):
350
+ def __init__(self, config):
351
+ super().__init__()
352
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
353
+ if isinstance(config.hidden_act, str):
354
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
355
+ else:
356
+ self.intermediate_act_fn = config.hidden_act
357
+
358
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
359
+ hidden_states = self.dense(hidden_states)
360
+ hidden_states = self.intermediate_act_fn(hidden_states)
361
+ return hidden_states
362
+
363
+
364
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaOutput with Roberta->XLMRoberta
365
+ class XLMRobertaOutput(nn.Module):
366
+ def __init__(self, config):
367
+ super().__init__()
368
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
369
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
370
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
371
+
372
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
373
+ hidden_states = self.dense(hidden_states)
374
+ hidden_states = self.dropout(hidden_states)
375
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
376
+ return hidden_states
377
+
378
+
379
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->XLMRoberta
380
+ class XLMRobertaLayer(nn.Module):
381
+ def __init__(self, config):
382
+ super().__init__()
383
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
384
+ self.seq_len_dim = 1
385
+ self.attention = XLMRobertaAttention(config)
386
+ self.is_decoder = config.is_decoder
387
+ self.add_cross_attention = config.add_cross_attention
388
+ if self.add_cross_attention:
389
+ if not self.is_decoder:
390
+ raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
391
+ self.crossattention = XLMRobertaAttention(config, position_embedding_type="absolute")
392
+ self.intermediate = XLMRobertaIntermediate(config)
393
+ self.output = XLMRobertaOutput(config)
394
+
395
+ def forward(
396
+ self,
397
+ hidden_states: torch.Tensor,
398
+ attention_mask: Optional[torch.FloatTensor] = None,
399
+ head_mask: Optional[torch.FloatTensor] = None,
400
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
401
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
402
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
403
+ output_attentions: Optional[bool] = False,
404
+ ) -> Tuple[torch.Tensor]:
405
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
406
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
407
+ self_attention_outputs = self.attention(
408
+ hidden_states,
409
+ attention_mask,
410
+ head_mask,
411
+ output_attentions=output_attentions,
412
+ past_key_value=self_attn_past_key_value,
413
+ )
414
+ attention_output = self_attention_outputs[0]
415
+
416
+ # if decoder, the last output is tuple of self-attn cache
417
+ if self.is_decoder:
418
+ outputs = self_attention_outputs[1:-1]
419
+ present_key_value = self_attention_outputs[-1]
420
+ else:
421
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
422
+
423
+ cross_attn_present_key_value = None
424
+ if self.is_decoder and encoder_hidden_states is not None:
425
+ if not hasattr(self, "crossattention"):
426
+ raise ValueError(
427
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
428
+ " by setting `config.add_cross_attention=True`"
429
+ )
430
+
431
+ # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
432
+ cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
433
+ cross_attention_outputs = self.crossattention(
434
+ attention_output,
435
+ attention_mask,
436
+ head_mask,
437
+ encoder_hidden_states,
438
+ encoder_attention_mask,
439
+ cross_attn_past_key_value,
440
+ output_attentions,
441
+ )
442
+ attention_output = cross_attention_outputs[0]
443
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
444
+
445
+ # add cross-attn cache to positions 3,4 of present_key_value tuple
446
+ cross_attn_present_key_value = cross_attention_outputs[-1]
447
+ present_key_value = present_key_value + cross_attn_present_key_value
448
+
449
+ layer_output = apply_chunking_to_forward(
450
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
451
+ )
452
+ outputs = (layer_output,) + outputs
453
+
454
+ # if decoder, return the attn key/values as the last output
455
+ if self.is_decoder:
456
+ outputs = outputs + (present_key_value,)
457
+
458
+ return outputs
459
+
460
+ def feed_forward_chunk(self, attention_output):
461
+ intermediate_output = self.intermediate(attention_output)
462
+ layer_output = self.output(intermediate_output, attention_output)
463
+ return layer_output
464
+
465
+
466
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->XLMRoberta
467
+ class XLMRobertaEncoder(nn.Module):
468
+ def __init__(self, config):
469
+ super().__init__()
470
+ self.config = config
471
+ self.layer = nn.ModuleList([XLMRobertaLayer(config) for _ in range(config.num_hidden_layers)])
472
+ self.gradient_checkpointing = False
473
+
474
+ def forward(
475
+ self,
476
+ hidden_states: torch.Tensor,
477
+ attention_mask: Optional[torch.FloatTensor] = None,
478
+ head_mask: Optional[torch.FloatTensor] = None,
479
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
480
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
481
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
482
+ use_cache: Optional[bool] = None,
483
+ output_attentions: Optional[bool] = False,
484
+ output_hidden_states: Optional[bool] = False,
485
+ return_dict: Optional[bool] = True,
486
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
487
+ all_hidden_states = () if output_hidden_states else None
488
+ all_self_attentions = () if output_attentions else None
489
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
490
+
491
+ if self.gradient_checkpointing and self.training:
492
+ if use_cache:
493
+ logger.warning_once(
494
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
495
+ )
496
+ use_cache = False
497
+
498
+ next_decoder_cache = () if use_cache else None
499
+ for i, layer_module in enumerate(self.layer):
500
+ if output_hidden_states:
501
+ all_hidden_states = all_hidden_states + (hidden_states,)
502
+
503
+ layer_head_mask = head_mask[i] if head_mask is not None else None
504
+ past_key_value = past_key_values[i] if past_key_values is not None else None
505
+
506
+ if self.gradient_checkpointing and self.training:
507
+ layer_outputs = self._gradient_checkpointing_func(
508
+ layer_module.__call__,
509
+ hidden_states,
510
+ attention_mask,
511
+ layer_head_mask,
512
+ encoder_hidden_states,
513
+ encoder_attention_mask,
514
+ past_key_value,
515
+ output_attentions,
516
+ )
517
+ else:
518
+ layer_outputs = layer_module(
519
+ hidden_states,
520
+ attention_mask,
521
+ layer_head_mask,
522
+ encoder_hidden_states,
523
+ encoder_attention_mask,
524
+ past_key_value,
525
+ output_attentions,
526
+ )
527
+
528
+ hidden_states = layer_outputs[0]
529
+ if use_cache:
530
+ next_decoder_cache += (layer_outputs[-1],)
531
+ if output_attentions:
532
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
533
+ if self.config.add_cross_attention:
534
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
535
+
536
+ if output_hidden_states:
537
+ all_hidden_states = all_hidden_states + (hidden_states,)
538
+
539
+ if not return_dict:
540
+ return tuple(
541
+ v
542
+ for v in [
543
+ hidden_states,
544
+ next_decoder_cache,
545
+ all_hidden_states,
546
+ all_self_attentions,
547
+ all_cross_attentions,
548
+ ]
549
+ if v is not None
550
+ )
551
+ return BaseModelOutputWithPastAndCrossAttentions(
552
+ last_hidden_state=hidden_states,
553
+ past_key_values=next_decoder_cache,
554
+ hidden_states=all_hidden_states,
555
+ attentions=all_self_attentions,
556
+ cross_attentions=all_cross_attentions,
557
+ )
558
+
559
+
560
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaPooler with Roberta->XLMRoberta
561
+ class XLMRobertaPooler(nn.Module):
562
+ def __init__(self, config):
563
+ super().__init__()
564
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
565
+ self.activation = nn.Tanh()
566
+
567
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
568
+ # We "pool" the model by simply taking the hidden state corresponding
569
+ # to the first token.
570
+ first_token_tensor = hidden_states[:, 0]
571
+ pooled_output = self.dense(first_token_tensor)
572
+ pooled_output = self.activation(pooled_output)
573
+ return pooled_output
574
+
575
+
576
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->XLMRoberta
577
+ class XLMRobertaPreTrainedModel(PreTrainedModel):
578
+ """
579
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
580
+ models.
581
+ """
582
+
583
+ config_class = XLMRobertaConfig
584
+ base_model_prefix = "roberta"
585
+ supports_gradient_checkpointing = True
586
+ _no_split_modules = ["XLMRobertaEmbeddings", "XLMRobertaSelfAttention"]
587
+
588
+ # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
589
+ def _init_weights(self, module):
590
+ """Initialize the weights"""
591
+ if isinstance(module, nn.Linear):
592
+ # Slightly different from the TF version which uses truncated_normal for initialization
593
+ # cf https://github.com/pytorch/pytorch/pull/5617
594
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
595
+ if module.bias is not None:
596
+ module.bias.data.zero_()
597
+ elif isinstance(module, nn.Embedding):
598
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
599
+ if module.padding_idx is not None:
600
+ module.weight.data[module.padding_idx].zero_()
601
+ elif isinstance(module, nn.LayerNorm):
602
+ module.bias.data.zero_()
603
+ module.weight.data.fill_(1.0)
604
+
605
+
606
+ XLM_ROBERTA_START_DOCSTRING = r"""
607
+
608
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
609
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
610
+ etc.)
611
+
612
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
613
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
614
+ and behavior.
615
+
616
+ Parameters:
617
+ config ([`XLMRobertaConfig`]): Model configuration class with all the parameters of the
618
+ model. Initializing with a config file does not load the weights associated with the model, only the
619
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
620
+ """
621
+
622
+ XLM_ROBERTA_INPUTS_DOCSTRING = r"""
623
+ Args:
624
+ input_ids (`torch.LongTensor` of shape `({0})`):
625
+ Indices of input sequence tokens in the vocabulary.
626
+
627
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
628
+ [`PreTrainedTokenizer.__call__`] for details.
629
+
630
+ [What are input IDs?](../glossary#input-ids)
631
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
632
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
633
+
634
+ - 1 for tokens that are **not masked**,
635
+ - 0 for tokens that are **masked**.
636
+
637
+ [What are attention masks?](../glossary#attention-mask)
638
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
639
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
640
+ 1]`:
641
+
642
+ - 0 corresponds to a *sentence A* token,
643
+ - 1 corresponds to a *sentence B* token.
644
+
645
+ [What are token type IDs?](../glossary#token-type-ids)
646
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
647
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
648
+ config.max_position_embeddings - 1]`.
649
+
650
+ [What are position IDs?](../glossary#position-ids)
651
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
652
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
653
+
654
+ - 1 indicates the head is **not masked**,
655
+ - 0 indicates the head is **masked**.
656
+
657
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
658
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
659
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
660
+ model's internal embedding lookup matrix.
661
+ output_attentions (`bool`, *optional*):
662
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
663
+ tensors for more detail.
664
+ output_hidden_states (`bool`, *optional*):
665
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
666
+ more detail.
667
+ return_dict (`bool`, *optional*):
668
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
669
+ """
670
+
671
+
672
+ @add_start_docstrings(
673
+ "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
674
+ XLM_ROBERTA_START_DOCSTRING,
675
+ )
676
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaModel with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
677
+ class XLMRobertaModel(XLMRobertaPreTrainedModel):
678
+ """
679
+
680
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
681
+ cross-attention is added between the self-attention layers, following the architecture described in *Attention is
682
+ all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
683
+ Kaiser and Illia Polosukhin.
684
+
685
+ To behave as a decoder, the model needs to be initialized with the `is_decoder` argument of the configuration set
686
+ to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
687
+ `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
688
+
689
+ .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
690
+
691
+ """
692
+
693
+ # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRoberta
694
+ def __init__(self, config, add_pooling_layer=True):
695
+ super().__init__(config)
696
+ self.config = config
697
+
698
+ self.embeddings = XLMRobertaEmbeddings(config)
699
+ self.encoder = XLMRobertaEncoder(config)
700
+
701
+ self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None
702
+
703
+ # Initialize weights and apply final processing
704
+ self.post_init()
705
+
706
+ def get_input_embeddings(self):
707
+ return self.embeddings.word_embeddings
708
+
709
+ def set_input_embeddings(self, value):
710
+ self.embeddings.word_embeddings = value
711
+
712
+ def _prune_heads(self, heads_to_prune):
713
+ """
714
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
715
+ class PreTrainedModel
716
+ """
717
+ for layer, heads in heads_to_prune.items():
718
+ self.encoder.layer[layer].attention.prune_heads(heads)
719
+
720
+ @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
721
+ @add_code_sample_docstrings(
722
+ checkpoint=_CHECKPOINT_FOR_DOC,
723
+ output_type=BaseModelOutputWithPoolingAndCrossAttentions,
724
+ config_class=_CONFIG_FOR_DOC,
725
+ )
726
+ # Copied from transformers.models.bert.modeling_bert.BertModel.forward
727
+ def forward(
728
+ self,
729
+ input_ids: Optional[torch.Tensor] = None,
730
+ attention_mask: Optional[torch.Tensor] = None,
731
+ token_type_ids: Optional[torch.Tensor] = None,
732
+ position_ids: Optional[torch.Tensor] = None,
733
+ head_mask: Optional[torch.Tensor] = None,
734
+ inputs_embeds: Optional[torch.Tensor] = None,
735
+ encoder_hidden_states: Optional[torch.Tensor] = None,
736
+ encoder_attention_mask: Optional[torch.Tensor] = None,
737
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
738
+ use_cache: Optional[bool] = None,
739
+ output_attentions: Optional[bool] = None,
740
+ output_hidden_states: Optional[bool] = None,
741
+ return_dict: Optional[bool] = None,
742
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
743
+ r"""
744
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
745
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
746
+ the model is configured as a decoder.
747
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
748
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
749
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
750
+
751
+ - 1 for tokens that are **not masked**,
752
+ - 0 for tokens that are **masked**.
753
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
754
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
755
+
756
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
757
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
758
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
759
+ use_cache (`bool`, *optional*):
760
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
761
+ `past_key_values`).
762
+ """
763
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
764
+ output_hidden_states = (
765
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
766
+ )
767
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
768
+
769
+ if self.config.is_decoder:
770
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
771
+ else:
772
+ use_cache = False
773
+
774
+ if input_ids is not None and inputs_embeds is not None:
775
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
776
+ elif input_ids is not None:
777
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
778
+ input_shape = input_ids.size()
779
+ elif inputs_embeds is not None:
780
+ input_shape = inputs_embeds.size()[:-1]
781
+ else:
782
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
783
+
784
+ batch_size, seq_length = input_shape
785
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
786
+
787
+ # past_key_values_length
788
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
789
+
790
+ if attention_mask is None:
791
+ attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
792
+
793
+ if token_type_ids is None:
794
+ if hasattr(self.embeddings, "token_type_ids"):
795
+ buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
796
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
797
+ token_type_ids = buffered_token_type_ids_expanded
798
+ else:
799
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
800
+
801
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
802
+ # ourselves in which case we just need to make it broadcastable to all heads.
803
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
804
+
805
+ # If a 2D or 3D attention mask is provided for the cross-attention
806
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
807
+ if self.config.is_decoder and encoder_hidden_states is not None:
808
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
809
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
810
+ if encoder_attention_mask is None:
811
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
812
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
813
+ else:
814
+ encoder_extended_attention_mask = None
815
+
816
+ # Prepare head mask if needed
817
+ # 1.0 in head_mask indicates we keep the head
818
+ # attention_probs has shape bsz x n_heads x N x N
819
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
820
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
821
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
822
+
823
+ embedding_output = self.embeddings(
824
+ input_ids=input_ids,
825
+ position_ids=position_ids,
826
+ token_type_ids=token_type_ids,
827
+ inputs_embeds=inputs_embeds,
828
+ past_key_values_length=past_key_values_length,
829
+ )
830
+ encoder_outputs = self.encoder(
831
+ embedding_output,
832
+ attention_mask=extended_attention_mask,
833
+ head_mask=head_mask,
834
+ encoder_hidden_states=encoder_hidden_states,
835
+ encoder_attention_mask=encoder_extended_attention_mask,
836
+ past_key_values=past_key_values,
837
+ use_cache=use_cache,
838
+ output_attentions=output_attentions,
839
+ output_hidden_states=output_hidden_states,
840
+ return_dict=return_dict,
841
+ )
842
+ sequence_output = encoder_outputs[0]
843
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
844
+
845
+ if not return_dict:
846
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
847
+
848
+ return BaseModelOutputWithPoolingAndCrossAttentions(
849
+ last_hidden_state=sequence_output,
850
+ pooler_output=pooled_output,
851
+ past_key_values=encoder_outputs.past_key_values,
852
+ hidden_states=encoder_outputs.hidden_states,
853
+ attentions=encoder_outputs.attentions,
854
+ cross_attentions=encoder_outputs.cross_attentions,
855
+ )
856
+
857
+
858
+ @add_start_docstrings(
859
+ "XLM-RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.",
860
+ XLM_ROBERTA_START_DOCSTRING,
861
+ )
862
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
863
+ class XLMRobertaForCausalLM(XLMRobertaPreTrainedModel):
864
+ _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
865
+
866
+ def __init__(self, config):
867
+ super().__init__(config)
868
+
869
+ if not config.is_decoder:
870
+ logger.warning("If you want to use `XLMRobertaForCausalLM` as a standalone, add `is_decoder=True`.")
871
+
872
+ self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
873
+ self.lm_head = XLMRobertaLMHead(config)
874
+
875
+ # Initialize weights and apply final processing
876
+ self.post_init()
877
+
878
+ def get_output_embeddings(self):
879
+ return self.lm_head.decoder
880
+
881
+ def set_output_embeddings(self, new_embeddings):
882
+ self.lm_head.decoder = new_embeddings
883
+
884
+ @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
885
+ @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
886
+ def forward(
887
+ self,
888
+ input_ids: Optional[torch.LongTensor] = None,
889
+ attention_mask: Optional[torch.FloatTensor] = None,
890
+ token_type_ids: Optional[torch.LongTensor] = None,
891
+ position_ids: Optional[torch.LongTensor] = None,
892
+ head_mask: Optional[torch.FloatTensor] = None,
893
+ inputs_embeds: Optional[torch.FloatTensor] = None,
894
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
895
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
896
+ labels: Optional[torch.LongTensor] = None,
897
+ past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
898
+ use_cache: Optional[bool] = None,
899
+ output_attentions: Optional[bool] = None,
900
+ output_hidden_states: Optional[bool] = None,
901
+ return_dict: Optional[bool] = None,
902
+ ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
903
+ r"""
904
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
905
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
906
+ the model is configured as a decoder.
907
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
908
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
909
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
910
+
911
+ - 1 for tokens that are **not masked**,
912
+ - 0 for tokens that are **masked**.
913
+
914
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
915
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
916
+ `[-100, 0, ..., config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are
917
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
918
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
919
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
920
+
921
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
922
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
923
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
924
+ use_cache (`bool`, *optional*):
925
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
926
+ `past_key_values`).
927
+
928
+ Returns:
929
+
930
+ Example:
931
+
932
+ ```python
933
+ >>> from transformers import AutoTokenizer, XLMRobertaForCausalLM, AutoConfig
934
+ >>> import torch
935
+
936
+ >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
937
+ >>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
938
+ >>> config.is_decoder = True
939
+ >>> model = XLMRobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)
940
+
941
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
942
+ >>> outputs = model(**inputs)
943
+
944
+ >>> prediction_logits = outputs.logits
945
+ ```"""
946
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
947
+ if labels is not None:
948
+ use_cache = False
949
+
950
+ outputs = self.roberta(
951
+ input_ids,
952
+ attention_mask=attention_mask,
953
+ token_type_ids=token_type_ids,
954
+ position_ids=position_ids,
955
+ head_mask=head_mask,
956
+ inputs_embeds=inputs_embeds,
957
+ encoder_hidden_states=encoder_hidden_states,
958
+ encoder_attention_mask=encoder_attention_mask,
959
+ past_key_values=past_key_values,
960
+ use_cache=use_cache,
961
+ output_attentions=output_attentions,
962
+ output_hidden_states=output_hidden_states,
963
+ return_dict=return_dict,
964
+ )
965
+
966
+ sequence_output = outputs[0]
967
+ prediction_scores = self.lm_head(sequence_output)
968
+
969
+ lm_loss = None
970
+ if labels is not None:
971
+ # move labels to correct device to enable model parallelism
972
+ labels = labels.to(prediction_scores.device)
973
+ # we are doing next-token prediction; shift prediction scores and input ids by one
974
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
975
+ labels = labels[:, 1:].contiguous()
976
+ loss_fct = CrossEntropyLoss()
977
+ lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
978
+
979
+ if not return_dict:
980
+ output = (prediction_scores,) + outputs[2:]
981
+ return ((lm_loss,) + output) if lm_loss is not None else output
982
+
983
+ return CausalLMOutputWithCrossAttentions(
984
+ loss=lm_loss,
985
+ logits=prediction_scores,
986
+ past_key_values=outputs.past_key_values,
987
+ hidden_states=outputs.hidden_states,
988
+ attentions=outputs.attentions,
989
+ cross_attentions=outputs.cross_attentions,
990
+ )
991
+
992
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
993
+ input_shape = input_ids.shape
994
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
995
+ if attention_mask is None:
996
+ attention_mask = input_ids.new_ones(input_shape)
997
+
998
+ # cut decoder_input_ids if past_key_values is used
999
+ if past_key_values is not None:
1000
+ past_length = past_key_values[0][0].shape[2]
1001
+
1002
+ # Some generation methods already pass only the last input ID
1003
+ if input_ids.shape[1] > past_length:
1004
+ remove_prefix_length = past_length
1005
+ else:
1006
+ # Default to old behavior: keep only final ID
1007
+ remove_prefix_length = input_ids.shape[1] - 1
1008
+
1009
+ input_ids = input_ids[:, remove_prefix_length:]
1010
+
1011
+ return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
1012
+
1013
+ def _reorder_cache(self, past_key_values, beam_idx):
1014
+ reordered_past = ()
1015
+ for layer_past in past_key_values:
1016
+ reordered_past += (
1017
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1018
+ )
1019
+ return reordered_past
1020
+
1021
+
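+ # Illustrative sketch (not part of the model): how `use_cache` / `past_key_values` interact with
+ # `prepare_inputs_for_generation` above. Once a cache is available, only the newest token has to be fed
+ # at each step. The model/tokenizer arguments are whatever XLM-R causal-LM checkpoint the caller provides
+ # (an assumption of this example); this helper is never called by the model itself.
+ def _example_incremental_decoding(model, tokenizer, prompt="Hello", steps=3):
+     import torch
+ 
+     input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
+     past_key_values = None
+     for _ in range(steps):
+         # Reuse the model's own helper: it trims `input_ids` to the new tokens when a cache is present.
+         model_inputs = model.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values)
+         with torch.no_grad():
+             outputs = model(**model_inputs, use_cache=True)
+         past_key_values = outputs.past_key_values
+         next_token = outputs.logits[:, -1, :].argmax(dim=-1, keepdim=True)
+         input_ids = torch.cat([input_ids, next_token], dim=-1)
+     return tokenizer.decode(input_ids[0])
+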
1022
+ @add_start_docstrings(
1023
+ """XLM-RoBERTa Model with a `language modeling` head on top.""",
1024
+ XLM_ROBERTA_START_DOCSTRING,
1025
+ )
1026
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
1027
+ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
1028
+ _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
1029
+
1030
+ def __init__(self, config):
1031
+ super().__init__(config)
1032
+
1033
+ if config.is_decoder:
1034
+ logger.warning(
1035
+ "If you want to use `XLMRobertaForMaskedLM` make sure `config.is_decoder=False` for "
1036
+ "bi-directional self-attention."
1037
+ )
1038
+
1039
+ self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
1040
+ self.lm_head = XLMRobertaLMHead(config)
1041
+
1042
+ # Initialize weights and apply final processing
1043
+ self.post_init()
1044
+
1045
+ def get_output_embeddings(self):
1046
+ return self.lm_head.decoder
1047
+
1048
+ def set_output_embeddings(self, new_embeddings):
1049
+ self.lm_head.decoder = new_embeddings
1050
+
1051
+ @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1052
+ @add_code_sample_docstrings(
1053
+ checkpoint=_CHECKPOINT_FOR_DOC,
1054
+ output_type=MaskedLMOutput,
1055
+ config_class=_CONFIG_FOR_DOC,
1056
+ mask="<mask>",
1057
+ expected_output="' Paris'",
1058
+ expected_loss=0.1,
1059
+ )
1060
+ def forward(
1061
+ self,
1062
+ input_ids: Optional[torch.LongTensor] = None,
1063
+ attention_mask: Optional[torch.FloatTensor] = None,
1064
+ token_type_ids: Optional[torch.LongTensor] = None,
1065
+ position_ids: Optional[torch.LongTensor] = None,
1066
+ head_mask: Optional[torch.FloatTensor] = None,
1067
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1068
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1069
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
1070
+ labels: Optional[torch.LongTensor] = None,
1071
+ output_attentions: Optional[bool] = None,
1072
+ output_hidden_states: Optional[bool] = None,
1073
+ return_dict: Optional[bool] = None,
1074
+ ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
1075
+ r"""
1076
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1077
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
1078
+ config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the
1079
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1080
+ kwargs (`Dict[str, any]`, optional, defaults to *{}*):
1081
+ Used to hide legacy arguments that have been deprecated.
1082
+ """
1083
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1084
+
1085
+ outputs = self.roberta(
1086
+ input_ids,
1087
+ attention_mask=attention_mask,
1088
+ token_type_ids=token_type_ids,
1089
+ position_ids=position_ids,
1090
+ head_mask=head_mask,
1091
+ inputs_embeds=inputs_embeds,
1092
+ encoder_hidden_states=encoder_hidden_states,
1093
+ encoder_attention_mask=encoder_attention_mask,
1094
+ output_attentions=output_attentions,
1095
+ output_hidden_states=output_hidden_states,
1096
+ return_dict=return_dict,
1097
+ )
1098
+ sequence_output = outputs[0]
1099
+ prediction_scores = self.lm_head(sequence_output)
1100
+
1101
+ masked_lm_loss = None
1102
+ if labels is not None:
1103
+ # move labels to correct device to enable model parallelism
1104
+ labels = labels.to(prediction_scores.device)
1105
+ loss_fct = CrossEntropyLoss()
1106
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1107
+
1108
+ if not return_dict:
1109
+ output = (prediction_scores,) + outputs[2:]
1110
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1111
+
1112
+ return MaskedLMOutput(
1113
+ loss=masked_lm_loss,
1114
+ logits=prediction_scores,
1115
+ hidden_states=outputs.hidden_states,
1116
+ attentions=outputs.attentions,
1117
+ )
1118
+
1119
+
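+ # Illustrative sketch (not part of the model): building `labels` for the masked-LM head above. Positions
+ # that should not contribute to the loss are set to -100 so that CrossEntropyLoss ignores them; the loss is
+ # only computed at the masked position(s). The sample text and masked position are assumptions.
+ def _example_masked_lm_labels(tokenizer, text="Paris is the capital of France.", masked_position=4):
+     inputs = tokenizer(text, return_tensors="pt")
+     input_ids = inputs["input_ids"].clone()
+     labels = inputs["input_ids"].clone()
+     # Replace one token by <mask> in the inputs, keep its original id as the target.
+     input_ids[0, masked_position] = tokenizer.mask_token_id
+     labels[input_ids != tokenizer.mask_token_id] = -100
+     return {"input_ids": input_ids, "attention_mask": inputs["attention_mask"]}, labels
+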
1120
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead
1121
+ class XLMRobertaLMHead(nn.Module):
1122
+ """Roberta Head for masked language modeling."""
1123
+
1124
+ def __init__(self, config):
1125
+ super().__init__()
1126
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
1127
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1128
+
1129
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
1130
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
1131
+ self.decoder.bias = self.bias
1132
+
1133
+ def forward(self, features, **kwargs):
1134
+ x = self.dense(features)
1135
+ x = gelu(x)
1136
+ x = self.layer_norm(x)
1137
+
1138
+ # project back to size of vocabulary with bias
1139
+ x = self.decoder(x)
1140
+
1141
+ return x
1142
+
1143
+ def _tie_weights(self):
1144
+ # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
1145
+ # For accelerate compatibility and to not break backward compatibility
1146
+ if self.decoder.bias.device.type == "meta":
1147
+ self.decoder.bias = self.bias
1148
+ else:
1149
+ self.bias = self.decoder.bias
1150
+
1151
+
1152
+ @add_start_docstrings(
1153
+ """
1154
+ XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
1155
+ pooled output) e.g. for GLUE tasks.
1156
+ """,
1157
+ XLM_ROBERTA_START_DOCSTRING,
1158
+ )
1159
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
1160
+ class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel):
1161
+ def __init__(self, config):
1162
+ super().__init__(config)
1163
+ self.num_labels = config.num_labels
1164
+ self.config = config
1165
+
1166
+ self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
1167
+ self.classifier = XLMRobertaClassificationHead(config)
1168
+
1169
+ # Initialize weights and apply final processing
1170
+ self.post_init()
1171
+
1172
+ @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1173
+ @add_code_sample_docstrings(
1174
+ checkpoint="cardiffnlp/twitter-roberta-base-emotion",
1175
+ output_type=SequenceClassifierOutput,
1176
+ config_class=_CONFIG_FOR_DOC,
1177
+ expected_output="'optimism'",
1178
+ expected_loss=0.08,
1179
+ )
1180
+ def forward(
1181
+ self,
1182
+ input_ids: Optional[torch.LongTensor] = None,
1183
+ attention_mask: Optional[torch.FloatTensor] = None,
1184
+ token_type_ids: Optional[torch.LongTensor] = None,
1185
+ position_ids: Optional[torch.LongTensor] = None,
1186
+ head_mask: Optional[torch.FloatTensor] = None,
1187
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1188
+ labels: Optional[torch.LongTensor] = None,
1189
+ output_attentions: Optional[bool] = None,
1190
+ output_hidden_states: Optional[bool] = None,
1191
+ return_dict: Optional[bool] = None,
1192
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
1193
+ r"""
1194
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1195
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1196
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
1197
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1198
+ """
1199
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1200
+
1201
+ outputs = self.roberta(
1202
+ input_ids,
1203
+ attention_mask=attention_mask,
1204
+ token_type_ids=token_type_ids,
1205
+ position_ids=position_ids,
1206
+ head_mask=head_mask,
1207
+ inputs_embeds=inputs_embeds,
1208
+ output_attentions=output_attentions,
1209
+ output_hidden_states=output_hidden_states,
1210
+ return_dict=return_dict,
1211
+ )
1212
+ sequence_output = outputs[0]
1213
+ logits = self.classifier(sequence_output)
1214
+
1215
+ loss = None
1216
+ if labels is not None:
1217
+ # move labels to correct device to enable model parallelism
1218
+ labels = labels.to(logits.device)
1219
+ if self.config.problem_type is None:
1220
+ if self.num_labels == 1:
1221
+ self.config.problem_type = "regression"
1222
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1223
+ self.config.problem_type = "single_label_classification"
1224
+ else:
1225
+ self.config.problem_type = "multi_label_classification"
1226
+
1227
+ if self.config.problem_type == "regression":
1228
+ loss_fct = MSELoss()
1229
+ if self.num_labels == 1:
1230
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1231
+ else:
1232
+ loss = loss_fct(logits, labels)
1233
+ elif self.config.problem_type == "single_label_classification":
1234
+ loss_fct = CrossEntropyLoss()
1235
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1236
+ elif self.config.problem_type == "multi_label_classification":
1237
+ loss_fct = BCEWithLogitsLoss()
1238
+ loss = loss_fct(logits, labels)
1239
+
1240
+ if not return_dict:
1241
+ output = (logits,) + outputs[2:]
1242
+ return ((loss,) + output) if loss is not None else output
1243
+
1244
+ return SequenceClassifierOutput(
1245
+ loss=loss,
1246
+ logits=logits,
1247
+ hidden_states=outputs.hidden_states,
1248
+ attentions=outputs.attentions,
1249
+ )
1250
+
1251
+
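+ # Illustrative sketch (not part of the model): how the head above picks its loss when `config.problem_type`
+ # is unset. The branches mirror `XLMRobertaForSequenceClassification.forward`.
+ def _example_infer_problem_type(num_labels, labels):
+     import torch
+ 
+     if num_labels == 1:
+         return "regression"  # MSELoss on a single score
+     if labels.dtype in (torch.long, torch.int):
+         return "single_label_classification"  # CrossEntropyLoss over `num_labels` classes
+     return "multi_label_classification"  # BCEWithLogitsLoss on float multi-hot targets
+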
1252
+ @add_start_docstrings(
1253
+ """
1254
+ XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
1255
+ a softmax) e.g. for RocStories/SWAG tasks.
1256
+ """,
1257
+ XLM_ROBERTA_START_DOCSTRING,
1258
+ )
1259
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
1260
+ class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel):
1261
+ def __init__(self, config):
1262
+ super().__init__(config)
1263
+
1264
+ self.roberta = XLMRobertaModel(config)
1265
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
1266
+ self.classifier = nn.Linear(config.hidden_size, 1)
1267
+
1268
+ # Initialize weights and apply final processing
1269
+ self.post_init()
1270
+
1271
+ @add_start_docstrings_to_model_forward(
1272
+ XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
1273
+ )
1274
+ @add_code_sample_docstrings(
1275
+ checkpoint=_CHECKPOINT_FOR_DOC,
1276
+ output_type=MultipleChoiceModelOutput,
1277
+ config_class=_CONFIG_FOR_DOC,
1278
+ )
1279
+ def forward(
1280
+ self,
1281
+ input_ids: Optional[torch.LongTensor] = None,
1282
+ token_type_ids: Optional[torch.LongTensor] = None,
1283
+ attention_mask: Optional[torch.FloatTensor] = None,
1284
+ labels: Optional[torch.LongTensor] = None,
1285
+ position_ids: Optional[torch.LongTensor] = None,
1286
+ head_mask: Optional[torch.FloatTensor] = None,
1287
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1288
+ output_attentions: Optional[bool] = None,
1289
+ output_hidden_states: Optional[bool] = None,
1290
+ return_dict: Optional[bool] = None,
1291
+ ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
1292
+ r"""
1293
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1294
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
1295
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
1296
+ `input_ids` above)
1297
+ """
1298
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1299
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
1300
+
1301
+ flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
1302
+ flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
1303
+ flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
1304
+ flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
1305
+ flat_inputs_embeds = (
1306
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
1307
+ if inputs_embeds is not None
1308
+ else None
1309
+ )
1310
+
1311
+ outputs = self.roberta(
1312
+ flat_input_ids,
1313
+ position_ids=flat_position_ids,
1314
+ token_type_ids=flat_token_type_ids,
1315
+ attention_mask=flat_attention_mask,
1316
+ head_mask=head_mask,
1317
+ inputs_embeds=flat_inputs_embeds,
1318
+ output_attentions=output_attentions,
1319
+ output_hidden_states=output_hidden_states,
1320
+ return_dict=return_dict,
1321
+ )
1322
+ pooled_output = outputs[1]
1323
+
1324
+ pooled_output = self.dropout(pooled_output)
1325
+ logits = self.classifier(pooled_output)
1326
+ reshaped_logits = logits.view(-1, num_choices)
1327
+
1328
+ loss = None
1329
+ if labels is not None:
1330
+ # move labels to correct device to enable model parallelism
1331
+ labels = labels.to(reshaped_logits.device)
1332
+ loss_fct = CrossEntropyLoss()
1333
+ loss = loss_fct(reshaped_logits, labels)
1334
+
1335
+ if not return_dict:
1336
+ output = (reshaped_logits,) + outputs[2:]
1337
+ return ((loss,) + output) if loss is not None else output
1338
+
1339
+ return MultipleChoiceModelOutput(
1340
+ loss=loss,
1341
+ logits=reshaped_logits,
1342
+ hidden_states=outputs.hidden_states,
1343
+ attentions=outputs.attentions,
1344
+ )
1345
+
1346
+
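+ # Illustrative sketch (not part of the model): the input layout expected by the multiple-choice head above.
+ # Each (prompt, choice) pair is tokenized separately and stacked into a (batch_size, num_choices, seq_len)
+ # tensor; the forward pass flattens it to (batch_size * num_choices, seq_len) and reshapes the logits back.
+ def _example_multiple_choice_inputs(tokenizer, prompt, choices):
+     encoding = tokenizer([prompt] * len(choices), choices, return_tensors="pt", padding=True)
+     # Add the batch dimension: a single example with `len(choices)` candidates.
+     return {k: v.unsqueeze(0) for k, v in encoding.items()}
+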
1347
+ class FeedForward(nn.Module):
1348
+ """Feed Forward Neural Network.
1349
+
1350
+ Args:
1351
+ in_dim (int): Number of input features.
1352
+ out_dim (int): Number of output features. Defaults to 1 (a single score).
1353
+ hidden_sizes (List[int]): List with hidden layer sizes. Defaults to [3072, 1024].
1354
+ activations (str): Name of the activation function to be used in the hidden
1355
+ layers. Defaults to 'Tanh'.
1356
+ final_activation (Optional[str]): Final activation, if any.
1357
+ dropout (float): Dropout probability used in the hidden layers. Defaults to 0.1.
1358
+ """
1359
+
1360
+ def __init__(
1361
+ self,
1362
+ in_dim: int = 1024,
1363
+ out_dim: int = 1,
1364
+ hidden_sizes: List[int] = [3072, 1024],
1365
+ activations: str = "Tanh",
1366
+ final_activation: Optional[str] = None,
1367
+ dropout: float = 0.1,
1368
+ ) -> None:
1369
+ super().__init__()
1370
+ modules = []
1371
+ modules.append(nn.Linear(in_dim, hidden_sizes[0]))
1372
+ modules.append(self.build_activation(activations))
1373
+ modules.append(nn.Dropout(dropout))
1374
+
1375
+ for i in range(1, len(hidden_sizes)):
1376
+ modules.append(nn.Linear(hidden_sizes[i - 1], hidden_sizes[i]))
1377
+ modules.append(self.build_activation(activations))
1378
+ modules.append(nn.Dropout(dropout))
1379
+
1380
+ modules.append(nn.Linear(hidden_sizes[-1], int(out_dim)))
1381
+ if final_activation is not None:
1382
+ modules.append(self.build_activation(final_activation))
1383
+
1384
+ self.ff = nn.Sequential(*modules)
1385
+
1386
+ def build_activation(self, activation: str) -> nn.Module:
1387
+ if hasattr(nn, activation.title()):
1388
+ return getattr(nn, activation.title())()
1389
+ else:
1390
+ raise Exception(f"{activation} is not a valid activation function!")
1391
+
1392
+ def forward(self, in_features: torch.Tensor) -> torch.Tensor:
1393
+ return self.ff(in_features)
1394
+
1395
+
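+ # Illustrative sketch (not part of the model): with its defaults the FeedForward above maps a 1024-dim
+ # feature vector through Tanh hidden layers of sizes 3072 and 1024 down to a single score. The batch size
+ # of 8 is an arbitrary assumption for the example.
+ def _example_feed_forward():
+     import torch
+ 
+     ff = FeedForward(in_dim=1024, out_dim=1, hidden_sizes=[3072, 1024])
+     features = torch.randn(8, 1024)  # e.g. 8 pooled sentence representations
+     scores = ff(features)  # -> shape (8, 1)
+     return scores
+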
1396
+ @add_start_docstrings(
1397
+ """
1398
+ XLM-RoBERTa Model with a feed-forward estimation head on top of the hidden states (a small MLP producing a
1399
+ single score per position), e.g. for quality-estimation / regression tasks.
1400
+ """,
1401
+ XLM_ROBERTA_START_DOCSTRING,
1402
+ )
1403
+ # Adapted from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
1404
+ class XLMRobertaForEstimation(XLMRobertaPreTrainedModel):
1405
+ def __init__(self, config):
1406
+ super().__init__(config)
1407
+
1408
+ self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
1410
+ self.estimator = FeedForward()
1411
+
1412
+ # Initialize weights and apply final processing
1413
+ self.post_init()
1414
+
1415
+ def forward(
1416
+ self,
1417
+ input_ids: Optional[torch.LongTensor] = None,
1418
+ token_type_ids: Optional[torch.LongTensor] = None,
1419
+ attention_mask: Optional[torch.FloatTensor] = None,
1420
+ labels: Optional[torch.LongTensor] = None,
1421
+ position_ids: Optional[torch.LongTensor] = None,
1422
+ head_mask: Optional[torch.FloatTensor] = None,
1423
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1424
+ output_attentions: Optional[bool] = None,
1425
+ output_hidden_states: Optional[bool] = None,
1426
+ return_dict: Optional[bool] = None,
1427
+ ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
1428
+ r"""
1429
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1430
+ Labels for computing the loss. The loss computation is inherited unchanged from the multiple-choice head
1431
+ this class is adapted from: indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size
1432
+ of the second dimension of the input tensors. (See `input_ids` above.)
1433
+ """
1434
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1435
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
1436
+
1437
+ flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
1438
+ flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
1439
+ flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
1440
+ flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
1441
+ flat_inputs_embeds = (
1442
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
1443
+ if inputs_embeds is not None
1444
+ else None
1445
+ )
1446
+
1447
+ outputs = self.roberta(
1448
+ flat_input_ids,
1449
+ position_ids=flat_position_ids,
1450
+ token_type_ids=flat_token_type_ids,
1451
+ attention_mask=flat_attention_mask,
1452
+ head_mask=head_mask,
1453
+ inputs_embeds=flat_inputs_embeds,
1454
+ output_attentions=output_attentions,
1455
+ output_hidden_states=output_hidden_states,
1456
+ return_dict=return_dict,
1457
+ )
1458
+
1459
+ logits = self.estimator(outputs[0])
1460
+ reshaped_logits = logits.view(-1, num_choices)
1461
+
1462
+ loss = None
1463
+ if labels is not None:
1464
+ # move labels to correct device to enable model parallelism
1465
+ labels = labels.to(reshaped_logits.device)
1466
+ loss_fct = CrossEntropyLoss()
1467
+ loss = loss_fct(reshaped_logits, labels)
1468
+
1469
+ if not return_dict:
1470
+ output = (reshaped_logits,) + outputs[2:]
1471
+ return ((loss,) + output) if loss is not None else output
1472
+
1473
+ return MultipleChoiceModelOutput(
1474
+ loss=loss,
1475
+ logits=reshaped_logits,
1476
+ hidden_states=outputs.hidden_states,
1477
+ attentions=outputs.attentions,
1478
+ )
1479
+
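+ # Illustrative sketch (not part of the model): with 2D inputs of shape (batch_size, seq_len) the estimation
+ # head above applies the FeedForward estimator to every hidden state and returns logits reshaped to
+ # (batch_size, seq_len), i.e. one score per token. The default FeedForward assumes hidden_size == 1024
+ # (an xlm-roberta-large sized encoder) -- both the checkpoint and the sample text are assumptions.
+ def _example_estimation_scores(model, tokenizer, text="Hello world"):
+     import torch
+ 
+     inputs = tokenizer(text, return_tensors="pt")
+     with torch.no_grad():
+         outputs = model(**inputs)
+     return outputs.logits  # (batch_size, seq_len) token-level scores
+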
1480
+ @add_start_docstrings(
1481
+ """
1482
+ XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
1483
+ for Named-Entity-Recognition (NER) tasks.
1484
+ """,
1485
+ XLM_ROBERTA_START_DOCSTRING,
1486
+ )
1487
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
1488
+ class XLMRobertaForTokenClassification(XLMRobertaPreTrainedModel):
1489
+ def __init__(self, config):
1490
+ super().__init__(config)
1491
+ self.num_labels = config.num_labels
1492
+
1493
+ self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
1494
+ classifier_dropout = (
1495
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
1496
+ )
1497
+ self.dropout = nn.Dropout(classifier_dropout)
1498
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1499
+
1500
+ # Initialize weights and apply final processing
1501
+ self.post_init()
1502
+
1503
+ @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1504
+ @add_code_sample_docstrings(
1505
+ checkpoint="Jean-Baptiste/roberta-large-ner-english",
1506
+ output_type=TokenClassifierOutput,
1507
+ config_class=_CONFIG_FOR_DOC,
1508
+ expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
1509
+ expected_loss=0.01,
1510
+ )
1511
+ def forward(
1512
+ self,
1513
+ input_ids: Optional[torch.LongTensor] = None,
1514
+ attention_mask: Optional[torch.FloatTensor] = None,
1515
+ token_type_ids: Optional[torch.LongTensor] = None,
1516
+ position_ids: Optional[torch.LongTensor] = None,
1517
+ head_mask: Optional[torch.FloatTensor] = None,
1518
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1519
+ labels: Optional[torch.LongTensor] = None,
1520
+ output_attentions: Optional[bool] = None,
1521
+ output_hidden_states: Optional[bool] = None,
1522
+ return_dict: Optional[bool] = None,
1523
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
1524
+ r"""
1525
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1526
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
1527
+ """
1528
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1529
+
1530
+ outputs = self.roberta(
1531
+ input_ids,
1532
+ attention_mask=attention_mask,
1533
+ token_type_ids=token_type_ids,
1534
+ position_ids=position_ids,
1535
+ head_mask=head_mask,
1536
+ inputs_embeds=inputs_embeds,
1537
+ output_attentions=output_attentions,
1538
+ output_hidden_states=output_hidden_states,
1539
+ return_dict=return_dict,
1540
+ )
1541
+
1542
+ sequence_output = outputs[0]
1543
+
1544
+ sequence_output = self.dropout(sequence_output)
1545
+ logits = self.classifier(sequence_output)
1546
+
1547
+ loss = None
1548
+ if labels is not None:
1549
+ # move labels to correct device to enable model parallelism
1550
+ labels = labels.to(logits.device)
1551
+ loss_fct = CrossEntropyLoss()
1552
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1553
+
1554
+ if not return_dict:
1555
+ output = (logits,) + outputs[2:]
1556
+ return ((loss,) + output) if loss is not None else output
1557
+
1558
+ return TokenClassifierOutput(
1559
+ loss=loss,
1560
+ logits=logits,
1561
+ hidden_states=outputs.hidden_states,
1562
+ attentions=outputs.attentions,
1563
+ )
1564
+
1565
+
1566
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->XLMRoberta
1567
+ class XLMRobertaClassificationHead(nn.Module):
1568
+ """Head for sentence-level classification tasks."""
1569
+
1570
+ def __init__(self, config):
1571
+ super().__init__()
1572
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
1573
+ classifier_dropout = (
1574
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
1575
+ )
1576
+ self.dropout = nn.Dropout(classifier_dropout)
1577
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
1578
+
1579
+ def forward(self, features, **kwargs):
1580
+ x = features[:, 0, :] # take <s> token (equiv. to [CLS])
1581
+ x = self.dropout(x)
1582
+ x = self.dense(x)
1583
+ x = torch.tanh(x)
1584
+ x = self.dropout(x)
1585
+ x = self.out_proj(x)
1586
+ return x
1587
+
1588
+
1589
+ @add_start_docstrings(
1590
+ """
1591
+ XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
1592
+ linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1593
+ """,
1594
+ XLM_ROBERTA_START_DOCSTRING,
1595
+ )
1596
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
1597
+ class XLMRobertaForQuestionAnswering(XLMRobertaPreTrainedModel):
1598
+ def __init__(self, config):
1599
+ super().__init__(config)
1600
+ self.num_labels = config.num_labels
1601
+
1602
+ self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
1603
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
1604
+
1605
+ # Initialize weights and apply final processing
1606
+ self.post_init()
1607
+
1608
+ @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1609
+ @add_code_sample_docstrings(
1610
+ checkpoint="deepset/roberta-base-squad2",
1611
+ output_type=QuestionAnsweringModelOutput,
1612
+ config_class=_CONFIG_FOR_DOC,
1613
+ expected_output="' puppet'",
1614
+ expected_loss=0.86,
1615
+ )
1616
+ def forward(
1617
+ self,
1618
+ input_ids: Optional[torch.LongTensor] = None,
1619
+ attention_mask: Optional[torch.FloatTensor] = None,
1620
+ token_type_ids: Optional[torch.LongTensor] = None,
1621
+ position_ids: Optional[torch.LongTensor] = None,
1622
+ head_mask: Optional[torch.FloatTensor] = None,
1623
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1624
+ start_positions: Optional[torch.LongTensor] = None,
1625
+ end_positions: Optional[torch.LongTensor] = None,
1626
+ output_attentions: Optional[bool] = None,
1627
+ output_hidden_states: Optional[bool] = None,
1628
+ return_dict: Optional[bool] = None,
1629
+ ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
1630
+ r"""
1631
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1632
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1633
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1634
+ are not taken into account for computing the loss.
1635
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1636
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1637
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1638
+ are not taken into account for computing the loss.
1639
+ """
1640
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1641
+
1642
+ outputs = self.roberta(
1643
+ input_ids,
1644
+ attention_mask=attention_mask,
1645
+ token_type_ids=token_type_ids,
1646
+ position_ids=position_ids,
1647
+ head_mask=head_mask,
1648
+ inputs_embeds=inputs_embeds,
1649
+ output_attentions=output_attentions,
1650
+ output_hidden_states=output_hidden_states,
1651
+ return_dict=return_dict,
1652
+ )
1653
+
1654
+ sequence_output = outputs[0]
1655
+
1656
+ logits = self.qa_outputs(sequence_output)
1657
+ start_logits, end_logits = logits.split(1, dim=-1)
1658
+ start_logits = start_logits.squeeze(-1).contiguous()
1659
+ end_logits = end_logits.squeeze(-1).contiguous()
1660
+
1661
+ total_loss = None
1662
+ if start_positions is not None and end_positions is not None:
1663
+ # If we are on multi-GPU, split add a dimension
1664
+ if len(start_positions.size()) > 1:
1665
+ start_positions = start_positions.squeeze(-1)
1666
+ if len(end_positions.size()) > 1:
1667
+ end_positions = end_positions.squeeze(-1)
1668
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
1669
+ ignored_index = start_logits.size(1)
1670
+ start_positions = start_positions.clamp(0, ignored_index)
1671
+ end_positions = end_positions.clamp(0, ignored_index)
1672
+
1673
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1674
+ start_loss = loss_fct(start_logits, start_positions)
1675
+ end_loss = loss_fct(end_logits, end_positions)
1676
+ total_loss = (start_loss + end_loss) / 2
1677
+
1678
+ if not return_dict:
1679
+ output = (start_logits, end_logits) + outputs[2:]
1680
+ return ((total_loss,) + output) if total_loss is not None else output
1681
+
1682
+ return QuestionAnsweringModelOutput(
1683
+ loss=total_loss,
1684
+ start_logits=start_logits,
1685
+ end_logits=end_logits,
1686
+ hidden_states=outputs.hidden_states,
1687
+ attentions=outputs.attentions,
1688
+ )
1689
+
1690
+
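+ # Illustrative sketch (not part of the model): turning the start/end logits of the question-answering head
+ # above into an answer string with a simple argmax. The model/tokenizer are whatever XLM-R QA checkpoint the
+ # caller provides (an assumption of this example); a production decoder would also score span combinations.
+ def _example_extract_span(model, tokenizer, question, context):
+     import torch
+ 
+     inputs = tokenizer(question, context, return_tensors="pt")
+     with torch.no_grad():
+         outputs = model(**inputs)
+     start = int(outputs.start_logits.argmax(dim=-1))
+     end = int(outputs.end_logits.argmax(dim=-1))
+     answer_ids = inputs["input_ids"][0, start : end + 1]
+     return tokenizer.decode(answer_ids, skip_special_tokens=True)
+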
1691
+ # Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
1692
+ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
1693
+ """
1694
+ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
1695
+ are ignored. This is modified from fairseq's `utils.make_positions`.
1696
+
1697
+ Args:
1698
+ input_ids (torch.Tensor): input ids. padding_idx (int): index of the padding token. past_key_values_length (int): length of the cached prefix, if any.
1699
+
1700
+ Returns: torch.Tensor
1701
+ """
1702
+ # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
1703
+ mask = input_ids.ne(padding_idx).int()
1704
+ incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
1705
+ return incremental_indices.long() + padding_idx
1706
+
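+ # Illustrative worked example (not part of the model): with padding_idx = 1, real tokens are numbered
+ # 2, 3, 4, ... while padding positions keep the index 1. The token ids below are arbitrary.
+ def _example_position_ids():
+     import torch
+ 
+     input_ids = torch.tensor([[0, 250, 321, 2, 1, 1]])  # the last two tokens are <pad> (id 1)
+     position_ids = create_position_ids_from_input_ids(input_ids, padding_idx=1)
+     return position_ids  # -> tensor([[2, 3, 4, 5, 1, 1]])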