sdadas commited on
Commit
2786c02
·
verified ·
1 Parent(s): 0bc59fb

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "XLMRobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "auto_map": {
7
+ "AutoModel": "modeling_xlm_roberta.XLMRobertaModel",
8
+ "AutoModelForSequenceClassification": "modeling_xlm_roberta.XLMRobertaForSequenceClassification",
9
+ "AutoConfig": "modeling_xlm_roberta.XLMRobertaConfig"
10
+ },
11
+ "bos_token_id": 0,
12
+ "classifier_dropout": null,
13
+ "eos_token_id": 2,
14
+ "hidden_act": "gelu",
15
+ "hidden_dropout_prob": 0.1,
16
+ "hidden_size": 1024,
17
+ "id2label": {
18
+ "0": "LABEL_0"
19
+ },
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 4096,
22
+ "label2id": {
23
+ "LABEL_0": 0
24
+ },
25
+ "layer_norm_eps": 1e-05,
26
+ "max_position_embeddings": 8194,
27
+ "model_type": "xlm-roberta",
28
+ "num_attention_heads": 16,
29
+ "num_hidden_layers": 24,
30
+ "output_past": true,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "torch_dtype": "bfloat16",
34
+ "transformers_version": "4.44.0",
35
+ "type_vocab_size": 1,
36
+ "use_cache": true,
37
+ "vocab_size": 250002
38
+ }
configuration_xlm_roberta.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ XLM-RoBERTa configuration"""
17
+ from collections import OrderedDict
18
+ from typing import Mapping
19
+
20
+ from transformers import PretrainedConfig
21
+ from transformers.onnx import OnnxConfig
22
+ from transformers.utils import logging
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
class XLMRobertaConfig(PretrainedConfig):
    r"""
    Configuration container for an XLM-RoBERTa model ([`XLMRobertaModel`] or [`TFXLMRobertaModel`]).

    The values held here define the model architecture. Building the configuration with every default left
    untouched gives a setup similar to the
    [FacebookAI/xlm-roberta-base](https://huggingface.co/FacebookAI/xlm-roberta-base) architecture.

    The class derives from [`PretrainedConfig`]; see the documentation of [`PretrainedConfig`] for the options it
    provides to control model outputs. Keyword arguments that are not listed in the signature (`is_decoder`, for
    instance) are passed on unchanged to `PretrainedConfig.__init__` through `**kwargs`.

    Args:
        vocab_size (`int`, *optional*, defaults to 30522):
            Size of the vocabulary, i.e. how many distinct token ids may appear in the `input_ids` given to
            [`XLMRobertaModel`] or [`TFXLMRobertaModel`].
        hidden_size (`int`, *optional*, defaults to 768):
            Width of the encoder layers and of the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            How many hidden layers the Transformer encoder has.
        num_attention_heads (`int`, *optional*, defaults to 12):
            How many attention heads each attention layer of the encoder has.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Width of the "intermediate" (feed-forward) layer of the encoder.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            Non-linearity used in the encoder and pooler, given either as a function or as one of the strings
            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"`.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            Dropout probability applied to the fully connected layers of the embeddings, encoder and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            Dropout ratio applied to the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            Longest sequence length the model may ever be used with. Usually chosen generously
            (e.g. 512, 1024 or 2048).
        type_vocab_size (`int`, *optional*, defaults to 2):
            Vocabulary size of the `token_type_ids` given to [`XLMRobertaModel`] or [`TFXLMRobertaModel`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated_normal_initializer used for all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            Epsilon of the layer normalization layers.
        pad_token_id (`int`, *optional*, defaults to 1):
            Id of the padding token; forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 0):
            Id of the beginning-of-sequence token; forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 2):
            Id of the end-of-sequence token; forwarded to [`PretrainedConfig`].
        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
            One of `"absolute"`, `"relative_key"`, `"relative_key_query"`. Use `"absolute"` for positional
            embeddings. `"relative_key"` is described in
            [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155);
            `"relative_key_query"` is *Method 4* of
            [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        classifier_dropout (`float`, *optional*):
            Dropout ratio of the classification head.

    Examples:

    ```python
    >>> from transformers import XLMRobertaConfig, XLMRobertaModel

    >>> # Build a configuration in the style of FacebookAI/xlm-roberta-base
    >>> configuration = XLMRobertaConfig()

    >>> # Build a randomly initialised model from that configuration
    >>> model = XLMRobertaModel(configuration)

    >>> # Read the configuration back from the model
    >>> configuration = model.config
    ```"""

    model_type = "xlm-roberta"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        position_embedding_type="absolute",
        use_cache=True,
        classifier_dropout=None,
        **kwargs,
    ):
        # The special-token ids and every extra keyword argument are handled by the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        # Architecture sizes.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size

        # Regularisation.
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob

        # Embedding tables, initialisation and normalisation.
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type

        # Decoding cache and classification head.
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout
137
+
138
+
139
+ # Copied from transformers.models.roberta.configuration_roberta.RobertaOnnxConfig with Roberta->XLMRoberta
140
class XLMRobertaOnnxConfig(OnnxConfig):
    """ONNX export description for XLM-RoBERTa: names the model inputs and their dynamic axes."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """
        Return an ordered mapping from input name to its dynamic axes.

        For the `"multiple-choice"` task the axes are batch / choice / sequence; for every other task they are
        batch / sequence. `input_ids` and `attention_mask` (in that order) share the same axes mapping.
        """
        axes = (
            {0: "batch", 1: "choice", 2: "sequence"}
            if self.task == "multiple-choice"
            else {0: "batch", 1: "sequence"}
        )

        spec = OrderedDict()
        spec["input_ids"] = axes
        spec["attention_mask"] = axes
        return spec
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f967137261b4d74ff7a87624e1649fcd2dc9f550b40d6a7592dcafd0b94581e4
3
+ size 2271071852
modeling_xlm_roberta.py ADDED
@@ -0,0 +1,1961 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch XLM-RoBERTa model."""
17
+
18
+ import math
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
26
+
27
+ from transformers.activations import ACT2FN, gelu
28
+ from transformers.modeling_outputs import (
29
+ BaseModelOutputWithPastAndCrossAttentions,
30
+ BaseModelOutputWithPoolingAndCrossAttentions,
31
+ CausalLMOutputWithCrossAttentions,
32
+ MaskedLMOutput,
33
+ MultipleChoiceModelOutput,
34
+ QuestionAnsweringModelOutput,
35
+ SequenceClassifierOutput,
36
+ TokenClassifierOutput,
37
+ )
38
+ from transformers.modeling_utils import PreTrainedModel
39
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
40
+ from transformers.utils import (
41
+ add_code_sample_docstrings,
42
+ add_start_docstrings,
43
+ add_start_docstrings_to_model_forward,
44
+ is_flash_attn_2_available,
45
+ is_flash_attn_greater_or_equal_2_10,
46
+ logging,
47
+ replace_return_docstrings,
48
+ )
49
+ from .configuration_xlm_roberta import XLMRobertaConfig
50
+
51
+
52
+ if is_flash_attn_2_available():
53
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
54
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
55
+
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+ _CHECKPOINT_FOR_DOC = "FacebookAI/xlm-roberta-base"
60
+ _CONFIG_FOR_DOC = "XLMRobertaConfig"
61
+
62
+
63
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
64
def _get_unpad_data(attention_mask):
    """
    Derive the bookkeeping needed to work on un-padded (packed) sequences.

    Args:
        attention_mask: mask whose last dimension is summed per row; non-zero entries mark kept tokens.
            (Presumably shaped `(batch, seq_len)` -- the code only requires that summing over the last
            dimension yields one length per sequence.)

    Returns:
        A tuple `(indices, cu_seqlens, max_seqlen_in_batch)`:
        - `indices`: 1-D tensor with the positions of the non-zero entries of the flattened mask.
        - `cu_seqlens`: int32 cumulative sum of the per-row lengths with a leading 0.
        - `max_seqlen_in_batch`: the largest per-row length, as a Python number.
    """
    lengths = attention_mask.sum(dim=-1, dtype=torch.int32)

    flat_mask = attention_mask.flatten()
    indices = torch.nonzero(flat_mask, as_tuple=False).flatten()

    longest = lengths.max().item()

    running_total = torch.cumsum(lengths, dim=0, dtype=torch.int32)
    # Prepend a single zero so that cu_seqlens[i]:cu_seqlens[i + 1] delimits sequence i.
    cu_seqlens = F.pad(running_total, (1, 0))

    return indices, cu_seqlens, longest
74
+
75
+
76
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->XLMRoberta
77
class XLMRobertaEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.

    The module sums word embeddings, token-type embeddings and (when `position_embedding_type` is `"absolute"`)
    position embeddings, then applies LayerNorm and dropout. Position ids generated from `inputs_embeds` start at
    `padding_idx + 1`.
    """

    def __init__(self, config):
        """
        Build the embedding tables, normalisation, dropout and helper buffers.

        Args:
            config: object exposing `vocab_size`, `hidden_size`, `pad_token_id`, `max_position_embeddings`,
                `type_vocab_size`, `layer_norm_eps`, `hidden_dropout_prob` and, optionally,
                `position_embedding_type` (falls back to `"absolute"`).
        """
        super().__init__()
        self.padding_idx = config.pad_token_id

        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # Created exactly once, directly with `padding_idx`. The previous code first allocated (and randomly
        # initialised) a table without `padding_idx` and then replaced it at the end of `__init__`; the discarded
        # table was pure overhead. Creating it here keeps the child-module order and state-dict keys unchanged.
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # Non-persistent buffers: they follow the module across devices but are not written to the state dict.
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """
        Compute the embedded representation of the inputs.

        Args:
            input_ids: token ids; used to look up word embeddings and, when `position_ids` is not given, to
                derive position ids via `create_position_ids_from_input_ids`.
            token_type_ids: segment ids; when `None`, the all-zero `token_type_ids` buffer is sliced and
                expanded to the input shape.
            position_ids: explicit position ids; when `None` they are derived from `input_ids` or, failing
                that, from `inputs_embeds`.
            inputs_embeds: pre-computed word embeddings, used instead of `input_ids` when given.
            past_key_values_length: forwarded to `create_position_ids_from_input_ids`.

        Returns:
            The embeddings after LayerNorm and dropout.
        """
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Fall back to the all-zero buffer registered in the constructor. Having it as a buffer lets users trace
        # the model without passing token_type_ids (issue #5664).
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                token_type_ids = buffered_token_type_ids.expand(input_shape[0], seq_length)
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor of sequential ids starting at `padding_idx + 1`, expanded to the shape of
            `inputs_embeds` without its last dimension.
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)
164
+
165
+
166
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->XLMRoberta
167
class XLMRobertaSelfAttention(nn.Module):
    """
    Multi-head scaled dot-product attention with optional relative position scores.

    Works as self-attention on `hidden_states`, or as cross-attention when `encoder_hidden_states` is given.
    When `config.is_decoder` is set, the key/value tensors are appended to the returned tuple.
    """

    def __init__(self, config, position_embedding_type=None):
        """
        Args:
            config: object exposing `hidden_size`, `num_attention_heads`, `attention_probs_dropout_prob`,
                `is_decoder`, `max_position_embeddings` (relative modes only) and, optionally,
                `position_embedding_type`.
            position_embedding_type: overrides `config.position_embedding_type` when truthy.

        Raises:
            ValueError: if `hidden_size` is not a multiple of `num_attention_heads` and the config has no
                `embedding_size` attribute.
        """
        super().__init__()
        self.config = config

        divisible = config.hidden_size % config.num_attention_heads == 0
        if not divisible and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Registration order (query, key, value, dropout, distance_embedding) is kept as in the original.
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        fallback_type = getattr(config, "position_embedding_type", "absolute")
        self.position_embedding_type = position_embedding_type or fallback_type
        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            self.max_position_embeddings = config.max_position_embeddings
            # One embedding per signed distance in [-(max - 1), max - 1].
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Split the last dimension into (heads, head_size) and move the head axis in front of the sequence axis."""
        head_split = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        return x.view(head_split).permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """
        Run attention and return `(context,)`, extended with the attention probabilities when
        `output_attentions` is truthy and with `(key, value)` when the module is a decoder.

        Args:
            hidden_states: input from which queries (and, for self-attention, keys/values) are projected.
            attention_mask: additive mask added to the scaled scores; replaced by `encoder_attention_mask`
                in cross-attention.
            head_mask: multiplicative mask applied to the attention probabilities.
            encoder_hidden_states: when given, keys/values come from it (cross-attention).
            encoder_attention_mask: additive mask used in cross-attention.
            past_key_value: cached `(key, value)`; reused as-is in cross-attention, concatenated along the
                sequence axis (dim 2) in self-attention.
            output_attentions: whether to also return the attention probabilities.
        """
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        # Remember whether a cache was supplied before `past_key_value` is possibly overwritten below.
        had_cache = past_key_value is not None

        if encoder_hidden_states is not None:
            # Cross-attention: keys/values belong to the encoder, so its padding mask applies.
            attention_mask = encoder_attention_mask
            if had_cache:
                key_layer = past_key_value[0]
                value_layer = past_key_value[1]
            else:
                key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
                value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            if had_cache:
                # Uni-directional self-attention: extend the cached keys/values with the new ones.
                key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
                value_layer = torch.cat([past_key_value[1], value_layer], dim=2)

        if self.is_decoder:
            # Hand the (possibly extended) keys/values back so later calls can reuse them.
            past_key_value = (key_layer, value_layer)

        # Raw attention scores: dot product between every query and every key.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            device = hidden_states.device
            if had_cache:
                rows = torch.tensor(key_length - 1, dtype=torch.long, device=device).view(-1, 1)
            else:
                rows = torch.arange(query_length, dtype=torch.long, device=device).view(-1, 1)
            cols = torch.arange(key_length, dtype=torch.long, device=device).view(1, -1)

            # Shift signed distances into the non-negative index range of the embedding table.
            positional_embedding = self.distance_embedding(rows - cols + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            from_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
            if self.position_embedding_type == "relative_key":
                attention_scores = attention_scores + from_query
            else:
                from_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + from_query + from_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the additive attention mask.
            attention_scores = attention_scores + attention_mask

        # Normalise to probabilities, then drop whole attended-to tokens (as in the original Transformer paper).
        attention_probs = self.dropout(nn.functional.softmax(attention_scores, dim=-1))

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Weighted sum of values, then merge the head axes back into one feature axis.
        context_layer = torch.matmul(attention_probs, value_layer).permute(0, 2, 1, 3).contiguous()
        context_layer = context_layer.view(context_layer.size()[:-2] + (self.all_head_size,))

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs
300
+
301
+
302
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaFlashAttention2 with Roberta->XLMRoberta
303
+ class XLMRobertaFlashAttention2(XLMRobertaSelfAttention):
304
+ def __init__(self, *args, **kwargs):
305
+ super().__init__(*args, **kwargs)
306
+
307
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
308
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
309
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
310
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
311
+
312
+ self.is_causal = False
313
+
314
+ if self.position_embedding_type != "absolute":
315
+ raise ValueError("XLMRobertaFlashAttention2 only supports absolute position embeddings")
316
+
317
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
318
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
319
+ x = x.view(new_x_shape)
320
+ return x
321
+
322
+ def forward(
323
+ self,
324
+ hidden_states: torch.Tensor,
325
+ attention_mask: Optional[torch.FloatTensor] = None,
326
+ head_mask: Optional[torch.FloatTensor] = None,
327
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
328
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
329
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
330
+ output_attentions: Optional[bool] = False,
331
+ ) -> Tuple[torch.Tensor, ...]:
332
+ """
333
+ Parameters:
334
+ query: torch.tensor(bs, seq_length, dim)
335
+ key: torch.tensor(bs, seq_length, dim)
336
+ value: torch.tensor(bs, seq_length, dim)
337
+ mask: torch.tensor(bs, seq_length)
338
+
339
+ Returns:
340
+ weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs,
341
+ seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
342
+ """
343
+ if output_attentions:
344
+ raise ValueError("XLMRobertaFlashAttention2 attention does not support output_attentions")
345
+ if head_mask is not None:
346
+ raise ValueError("XLMRobertaFlashAttention2 attention does not support head_mask")
347
+
348
+ mixed_query_layer = self.query(hidden_states)
349
+
350
+ # If this is instantiated as a cross-attention module, the keys
351
+ # and values come from an encoder; the attention mask needs to be
352
+ # such that the encoder's padding tokens are not attended to.
353
+ is_cross_attention = encoder_hidden_states is not None
354
+
355
+ if is_cross_attention and past_key_value is not None:
356
+ # reuse k,v, cross_attentions
357
+ key_states = past_key_value[0]
358
+ value_states = past_key_value[1]
359
+ attention_mask = encoder_attention_mask
360
+ elif is_cross_attention:
361
+ key_states = self.transpose_for_scores(self.key(encoder_hidden_states))
362
+ value_states = self.transpose_for_scores(self.value(encoder_hidden_states))
363
+ attention_mask = encoder_attention_mask
364
+ elif past_key_value is not None:
365
+ key_states = self.transpose_for_scores(self.key(hidden_states))
366
+ value_states = self.transpose_for_scores(self.value(hidden_states))
367
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
368
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
369
+ else:
370
+ key_states = self.transpose_for_scores(self.key(hidden_states))
371
+ value_states = self.transpose_for_scores(self.value(hidden_states))
372
+
373
+ # attention_mask is of the "extended attention mask" at this stage, i.e. it's 0 for positions that need attention
374
+ # and the lowest possible value for positions that should be masked. So, an "all attention" mask sums to 0.
375
+ # In that case, we can safely set it to None to avoid unnecessary computation for variable length attention.
376
+ if attention_mask.sum().item() == 0:
377
+ attention_mask = None
378
+ else:
379
+ # Otherwise, we want to undo the "extended attention mask" format, as flash attention doesn't work with it.
380
+ attention_mask = torch.where(attention_mask[:, 0, 0, :] == 0, 1.0, 0.0)
381
+
382
+ query_states = self.transpose_for_scores(mixed_query_layer)
383
+ # At this stage, the key, value and query states all have the shape of
384
+ # batch_size x seq_len x head_dim x hidden_dim
385
+
386
+ if self.is_decoder:
387
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
388
+ # Further calls to cross_attention layer can then reuse all cross-attention
389
+ # key/value_states (first "if" case)
390
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
391
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
392
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
393
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
394
+ past_key_value = (key_states, value_states)
395
+
396
+ seq_len = query_states.shape[1]
397
+
398
+ attn_dropout = self.config.attention_probs_dropout_prob if self.training else 0.0
399
+
400
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
401
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
402
+ # cast them back in the correct dtype just to be sure everything works as expected.
403
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
404
+ # in fp32. (LlamaRMSNorm handles it correctly)
405
+
406
+ if query_states.dtype == torch.float32:
407
+ if torch.is_autocast_enabled():
408
+ target_dtype = torch.get_autocast_gpu_dtype()
409
+ # Handle the case where the model is quantized
410
+ elif hasattr(self.config, "_pre_quantization_dtype"):
411
+ target_dtype = self.config._pre_quantization_dtype
412
+ else:
413
+ target_dtype = self.q_lin.weight.dtype
414
+
415
+ logger.warning_once(
416
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
417
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
418
+ f" {target_dtype}."
419
+ )
420
+
421
+ query_states = query_states.to(target_dtype)
422
+ key_states = key_states.to(target_dtype)
423
+ value_states = value_states.to(target_dtype)
424
+
425
+ attn_weights = self._flash_attention_forward(
426
+ query_states, key_states, value_states, attention_mask, seq_len, dropout=attn_dropout
427
+ )
428
+
429
+ new_shape = attn_weights.size()[:-2] + (self.all_head_size,)
430
+ attn_output = attn_weights.view(new_shape)
431
+
432
+ outputs = (attn_output,)
433
+
434
+ if self.is_decoder:
435
+ outputs = outputs + (past_key_value,)
436
+ return outputs
437
+
438
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
439
def _flash_attention_forward(
    self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
    """
    Dispatch the attention computation to the Flash Attention kernels.

    With an `attention_mask`, the inputs are first unpadded through `self._upad_input`, run through
    `flash_attn_varlen_func`, and the result is padded back with `pad_input`. Without a mask,
    `flash_attn_func` is called directly on the inputs.

    Args:
        query_states (`torch.Tensor`):
            Input query states to be passed to Flash Attention API
        key_states (`torch.Tensor`):
            Input key states to be passed to Flash Attention API
        value_states (`torch.Tensor`):
            Input value states to be passed to Flash Attention API
        attention_mask (`torch.Tensor`):
            The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
            position of padding tokens and 1 for the position of non-padding tokens. `None` selects the
            dense (no unpadding) code path.
        query_length (`int`):
            Query length forwarded to `_upad_input` and `pad_input`; also consulted by the top-left-mask
            workaround below.
        dropout (`float`):
            Attention dropout
        softmax_scale (`float`, *optional*):
            The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)

    Returns:
        The output of the flash attention call, re-padded when a mask was supplied.
    """
    top_left_mask = self._flash_attn_uses_top_left_mask
    causal = self.is_causal
    if top_left_mask:
        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
        causal = causal and query_length != 1

    # No mask: nothing to unpad, call the dense kernel and return right away.
    if attention_mask is None:
        return flash_attn_func(
            query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
        )

    # A mask was supplied: strip the padded positions before calling the variable-length kernel.
    batch_size = query_states.shape[0]
    unpadded = self._upad_input(query_states, key_states, value_states, attention_mask, query_length)
    query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = unpadded

    cu_seqlens_q, cu_seqlens_k = cu_seq_lens
    max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

    attn_output_unpad = flash_attn_varlen_func(
        query_states,
        key_states,
        value_states,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_in_batch_q,
        max_seqlen_k=max_seqlen_in_batch_k,
        dropout_p=dropout,
        softmax_scale=softmax_scale,
        causal=causal,
    )

    # Scatter the unpadded result back into a (batch_size, query_length, ...) layout.
    return pad_input(attn_output_unpad, indices_q, batch_size, query_length)
497
+
498
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input with num_heads->num_attention_heads
499
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
    """
    Remove padded positions from the query/key/value tensors ahead of a variable-length flash attention call.

    Keys and values are flattened over their first two dimensions and gathered with the indices returned by
    `_get_unpad_data(attention_mask)`. The query is handled in one of three ways depending on how
    `query_length` compares with the key/value sequence length (see the inline comments).

    Returns:
        A 6-tuple `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`.
    """
    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
    batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
    flat_kv_shape = (batch_size * kv_seq_len, num_key_value_heads, head_dim)

    key_layer = index_first_axis(key_layer.reshape(flat_kv_shape), indices_k)
    value_layer = index_first_axis(value_layer.reshape(flat_kv_shape), indices_k)

    if query_length == kv_seq_len:
        # Query and key/value share the same length: reuse the key-side unpadding metadata.
        flat_q_shape = (batch_size * kv_seq_len, self.num_attention_heads, head_dim)
        query_layer = index_first_axis(query_layer.reshape(flat_q_shape), indices_k)
        indices_q = indices_k
        cu_seqlens_q = cu_seqlens_k
        max_seqlen_in_batch_q = max_seqlen_in_batch_k
    elif query_length == 1:
        # Single query position per batch entry: every query "sequence" has length one.
        max_seqlen_in_batch_q = 1
        cu_seqlens_q = torch.arange(
            batch_size + 1, dtype=torch.int32, device=query_layer.device
        )  # There is a memcpy here, that is very bad.
        indices_q = cu_seqlens_q[:-1]
        query_layer = query_layer.squeeze(1)
    else:
        # The -q_len: slice assumes left padding.
        attention_mask = attention_mask[:, -query_length:]
        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

    cu_seq_lens = (cu_seqlens_q, cu_seqlens_k)
    max_seq_lens = (max_seqlen_in_batch_q, max_seqlen_in_batch_k)
    return (query_layer, key_layer, value_layer, indices_q, cu_seq_lens, max_seq_lens)
536
+
537
+
538
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaSdpaAttention with Roberta->XLMRoberta
539
class XLMRobertaSdpaAttention(XLMRobertaSelfAttention):
    """
    XLMRoberta attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `XLMRobertaSelfAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.

    The SDPA call used here neither returns attention probabilities nor applies a per-head mask, so `forward`
    delegates to the parent (manual) implementation whenever `output_attentions` is set or a `head_mask` is given.
    """

    def __init__(self, config, position_embedding_type=None):
        super().__init__(config, position_embedding_type)

        self.is_causal = False

        if self.position_embedding_type != "absolute":
            raise ValueError("XLMRobertaSdpaAttention only supports absolute position embeddings")

    # Adapted from LlamaAttention.forward and XLMRobertaFlashAttention2.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Compute attention with `torch.nn.functional.scaled_dot_product_attention`.

        Args:
            hidden_states: Input used for the query projection (and for key/value in self-attention).
            attention_mask: Mask passed to SDPA as `attn_mask`; replaced by `encoder_attention_mask` in
                cross-attention.
            head_mask: Per-head mask. Not supported by SDPA; when given, the parent implementation is used.
            encoder_hidden_states: When not `None`, the module acts as cross-attention and keys/values are
                projected from this tensor (or reused from `past_key_value`).
            encoder_attention_mask: Mask used in place of `attention_mask` for cross-attention.
            past_key_value: Cached `(key_states, value_states)`; reused for cross-attention, concatenated
                along dim 2 for self-attention.
            output_attentions: When truthy, the parent implementation is used.

        Returns:
            `(attn_output,)`, followed by the `(key_states, value_states)` cache when `self.is_decoder` is set.
            When falling back, whatever the parent `forward` returns.

        Raises:
            ValueError: If the SDPA output does not have the expected
                `(batch_size, num_attention_heads, seq_len, attention_head_size)` size.
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "XLMRobertaModel is using XLMRobertaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
        elif head_mask is not None:
            # Previously the head mask was silently dropped on this path, producing unmasked results.
            logger.warning_once(
                "XLMRobertaModel is using XLMRobertaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `head_mask`. Falling back to the manual attention implementation. "
                'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )

        if output_attentions or head_mask is not None:
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
            )

        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_states = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_states = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_states = self.transpose_for_scores(self.key(hidden_states))
            value_states = self.transpose_for_scores(self.value(hidden_states))
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            key_states = self.transpose_for_scores(self.key(hidden_states))
            value_states = self.transpose_for_scores(self.value(hidden_states))

        query_states = self.transpose_for_scores(mixed_query_layer)
        # At this stage the query states have the shape
        # batch_size x num_attention_heads x seq_len x attention_head_size (this is what the size check on the
        # SDPA output below relies on); key/value states use the same layout with their own sequence length.

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)

        batch_size, _, seq_len, _ = query_states.size()

        attn_dropout = self.config.attention_probs_dropout_prob if self.training else 0.0

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # `is_causal` is only requested when no explicit mask is passed; `self.is_causal` is set to False in
        # `__init__`, so with the attributes set there this always evaluates to False.
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            dropout_p=attn_dropout,
            is_causal=self.is_causal and attention_mask is None and seq_len > 1,
        )

        if attn_output.size() != (batch_size, self.num_attention_heads, seq_len, self.attention_head_size):
            raise ValueError(
                f"`attn_output` should be of size {(batch_size, self.num_attention_heads, seq_len, self.attention_head_size)}, but is"
                f" {attn_output.size()}"
            )

        # Merge the heads back: (batch, heads, seq, head_size) -> (batch, seq, all_head_size).
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(batch_size, seq_len, self.all_head_size)

        outputs = (attn_output,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs
656
+
657
+
658
# Registry of attention implementations; `XLMRobertaAttention` indexes it with `config._attn_implementation`.
XLM_ROBERTA_ATTENTION_CLASSES = dict(
    eager=XLMRobertaSelfAttention,
    sdpa=XLMRobertaSdpaAttention,
    flash_attention_2=XLMRobertaFlashAttention2,
)
663
+
664
+
665
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput with Roberta->XLMRoberta
666
class XLMRobertaSelfOutput(nn.Module):
    """Dense projection, dropout, then LayerNorm over the sum with the residual input."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
678
+
679
+
680
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->XLMRoberta,ROBERTA->XLM_ROBERTA
681
class XLMRobertaAttention(nn.Module):
    """
    Attention sub-layer: an attention implementation chosen from `XLM_ROBERTA_ATTENTION_CLASSES` by
    `config._attn_implementation`, followed by `XLMRobertaSelfOutput`.
    """

    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        attention_cls = XLM_ROBERTA_ATTENTION_CLASSES[config._attn_implementation]
        self.self = attention_cls(config, position_embedding_type=position_embedding_type)
        self.output = XLMRobertaSelfOutput(config)
        self.pruned_heads = set()

    # Copied from transformers.models.bert.modeling_bert.BertAttention.prune_heads
    def prune_heads(self, heads):
        """Prune the given attention heads from the projections and update the head bookkeeping."""
        if len(heads) == 0:
            return

        attn = self.self
        heads, index = find_pruneable_heads_and_indices(
            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        attn.query = prune_linear_layer(attn.query, index)
        attn.key = prune_linear_layer(attn.key, index)
        attn.value = prune_linear_layer(attn.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        attn.num_attention_heads = attn.num_attention_heads - len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    # Copied from transformers.models.bert.modeling_bert.BertAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """
        Run the attention module, then the output projection with `hidden_states` as the residual.

        Returns:
            A tuple whose first element is the projected attention output; any further elements returned by
            the attention module are appended unchanged.
        """
        attention_results = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        projected = self.output(attention_results[0], hidden_states)
        # add attentions if we output them
        return (projected,) + attention_results[1:]
733
+
734
+
735
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaIntermediate with Roberta->XLMRoberta
736
class XLMRobertaIntermediate(nn.Module):
    """Feed-forward expansion: a linear layer from `hidden_size` to `intermediate_size` plus an activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # A string is looked up in ACT2FN; anything else is used as the activation as-is.
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `intermediate_act_fn(dense(hidden_states))`."""
        return self.intermediate_act_fn(self.dense(hidden_states))
749
+
750
+
751
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaOutput with Roberta->XLMRoberta
752
class XLMRobertaOutput(nn.Module):
    """Feed-forward contraction from `intermediate_size` back to `hidden_size`, with dropout and residual LayerNorm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        contracted = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(contracted + input_tensor)
764
+
765
+
766
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->XLMRoberta
767
class XLMRobertaLayer(nn.Module):
    """
    One transformer layer: self-attention, an optional cross-attention block (built only when
    `config.add_cross_attention` is set), and the chunked feed-forward block.
    """

    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = XLMRobertaAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            # Cross-attention is only meaningful for decoders.
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = XLMRobertaAttention(config, position_embedding_type="absolute")
        self.intermediate = XLMRobertaIntermediate(config)
        self.output = XLMRobertaOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """
        Run the layer and return a tuple starting with the layer output.

        The remaining elements are the extra outputs of the attention block(s) and, when `self.is_decoder`
        is set, the present key/value cache as the last element.

        Raises:
            ValueError: If `encoder_hidden_states` is passed to a decoder layer built without cross-attention.
        """
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_cache = None if past_key_value is None else past_key_value[:2]
        self_attn_results = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_cache,
        )
        attention_output = self_attn_results[0]

        if self.is_decoder:
            # if decoder, the last output is tuple of self-attn cache
            outputs = self_attn_results[1:-1]
            present_key_value = self_attn_results[-1]
        else:
            # add self attentions if we output attention weights
            outputs = self_attn_results[1:]

        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
            cross_attn_cache = None if past_key_value is None else past_key_value[-2:]
            cross_attn_results = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_cache,
                output_attentions,
            )
            attention_output = cross_attn_results[0]
            # add cross attentions if we output attention weights
            outputs = outputs + cross_attn_results[1:-1]

            # add cross-attn cache to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_results[-1]

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        """Apply the intermediate and output blocks, using `attention_output` as the residual input."""
        expanded = self.intermediate(attention_output)
        return self.output(expanded, attention_output)
851
+
852
+
853
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->XLMRoberta
854
class XLMRobertaEncoder(nn.Module):
    """Stack of `config.num_hidden_layers` `XLMRobertaLayer` modules applied in sequence."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([XLMRobertaLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        """
        Run every layer in order, optionally collecting hidden states, attentions and the key/value cache.

        Returns:
            A `BaseModelOutputWithPastAndCrossAttentions` when `return_dict` is truthy, otherwise a tuple of
            the non-`None` values among (last hidden state, cache, all hidden states, self-attentions,
            cross-attentions), in that order.
        """
        collect_cross = output_attentions and self.config.add_cross_attention
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if collect_cross else None

        checkpointing = self.gradient_checkpointing and self.training
        if checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        next_decoder_cache = () if use_cache else None
        for layer_idx, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_args = (
                hidden_states,
                attention_mask,
                None if head_mask is None else head_mask[layer_idx],
                encoder_hidden_states,
                encoder_attention_mask,
                None if past_key_values is None else past_key_values[layer_idx],
                output_attentions,
            )
            if checkpointing:
                layer_outputs = self._gradient_checkpointing_func(layer_module.__call__, *layer_args)
            else:
                layer_outputs = layer_module(*layer_args)

            hidden_states = layer_outputs[0]
            if use_cache:
                # The layer returns its cache as the last element.
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_dict:
            return BaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=hidden_states,
                past_key_values=next_decoder_cache,
                hidden_states=all_hidden_states,
                attentions=all_self_attentions,
                cross_attentions=all_cross_attentions,
            )

        candidates = [
            hidden_states,
            next_decoder_cache,
            all_hidden_states,
            all_self_attentions,
            all_cross_attentions,
        ]
        return tuple(v for v in candidates if v is not None)
945
+
946
+
947
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaPooler with Roberta->XLMRoberta
948
class XLMRobertaPooler(nn.Module):
    """Pools a sequence by passing the first token's hidden state through a dense layer and tanh."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `tanh(dense(hidden_states[:, 0]))`."""
        # "Pooling" here simply means taking the hidden state of the first token.
        first_token = hidden_states[:, 0]
        return self.activation(self.dense(first_token))
961
+
962
+
963
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->XLMRoberta
964
class XLMRobertaPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = XLMRobertaConfig
    base_model_prefix = "roberta"
    supports_gradient_checkpointing = True
    # Name every attention class registered in `XLM_ROBERTA_ATTENTION_CLASSES`, not only the eager one, so the
    # SDPA and flash-attention modules get the same treatment as `XLMRobertaSelfAttention`.
    # NOTE(review): presumably matched by module class name when a device map is built - confirm against the
    # Transformers/Accelerate documentation.
    _no_split_modules = [
        "XLMRobertaEmbeddings",
        "XLMRobertaSelfAttention",
        "XLMRobertaSdpaAttention",
        "XLMRobertaFlashAttention2",
    ]
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
    def _init_weights(self, module):
        """
        Initialize the weights of `module` in place.

        `nn.Linear` and `nn.Embedding` weights are drawn from a normal distribution with standard deviation
        `config.initializer_range`; linear biases and the embedding row at `padding_idx` are zeroed.
        `nn.LayerNorm` gets a zero bias and unit weight. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
993
+
994
+
995
+ XLM_ROBERTA_START_DOCSTRING = r"""
996
+
997
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
998
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
999
+ etc.)
1000
+
1001
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
1002
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
1003
+ and behavior.
1004
+
1005
+ Parameters:
1006
+ config ([`XLMRobertaConfig`]): Model configuration class with all the parameters of the
1007
+ model. Initializing with a config file does not load the weights associated with the model, only the
1008
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1009
+ """
1010
+
1011
+ XLM_ROBERTA_INPUTS_DOCSTRING = r"""
1012
+ Args:
1013
+ input_ids (`torch.LongTensor` of shape `({0})`):
1014
+ Indices of input sequence tokens in the vocabulary.
1015
+
1016
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1017
+ [`PreTrainedTokenizer.__call__`] for details.
1018
+
1019
+ [What are input IDs?](../glossary#input-ids)
1020
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
1021
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1022
+
1023
+ - 1 for tokens that are **not masked**,
1024
+ - 0 for tokens that are **masked**.
1025
+
1026
+ [What are attention masks?](../glossary#attention-mask)
1027
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
1028
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1029
+ 1]`:
1030
+
1031
+ - 0 corresponds to a *sentence A* token,
1032
+ - 1 corresponds to a *sentence B* token.
1033
+
1034
+ [What are token type IDs?](../glossary#token-type-ids)
1035
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
1036
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1037
+ config.max_position_embeddings - 1]`.
1038
+
1039
+ [What are position IDs?](../glossary#position-ids)
1040
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
1041
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
1042
+
1043
+ - 1 indicates the head is **not masked**,
1044
+ - 0 indicates the head is **masked**.
1045
+
1046
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
1047
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1048
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1049
+ model's internal embedding lookup matrix.
1050
+ output_attentions (`bool`, *optional*):
1051
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1052
+ tensors for more detail.
1053
+ output_hidden_states (`bool`, *optional*):
1054
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1055
+ more detail.
1056
+ return_dict (`bool`, *optional*):
1057
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1058
+ """
1059
+
1060
+
1061
@add_start_docstrings(
    "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaModel with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaModel(XLMRobertaPreTrainedModel):
    """

    XLM-RoBERTa backbone made of an embedding layer, an encoder stack and an optional pooler.

    The model can run as an encoder (self-attention only) or as a decoder, in which case a cross-attention layer is
    inserted between the self-attention layers, following the architecture described in *Attention is all you need*_
    by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and
    Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument
    and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward
    pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762

    """

    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRoberta
    def __init__(self, config, add_pooling_layer=True):
        """Build the embeddings, the encoder and (unless `add_pooling_layer` is false) the pooler."""
        super().__init__(config)
        self.config = config

        self.embeddings = XLMRobertaEmbeddings(config)
        self.encoder = XLMRobertaEncoder(config)

        # Task heads that only consume the sequence output construct the model without a pooler.
        if add_pooling_layer:
            self.pooler = XLMRobertaPooler(config)
        else:
            self.pooler = None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the word-embedding module held by the embedding layer."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word-embedding module held by the embedding layer with `value`."""
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer_idx, head_ids in heads_to_prune.items():
            layer_attention = self.encoder.layer[layer_idx].attention
            layer_attention.prune_heads(head_ids)

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    # Copied from transformers.models.bert.modeling_bert.BertModel.forward
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        cfg = self.config

        # Unset output switches fall back to the values stored in the configuration.
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if return_dict is None:
            return_dict = cfg.use_return_dict

        # Caching is forced off unless the model is configured as a decoder.
        if not cfg.is_decoder:
            use_cache = False
        elif use_cache is None:
            use_cache = cfg.use_cache

        # Exactly one of `input_ids` / `inputs_embeds` has to be supplied.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            device = input_ids.device
        else:
            # Drop the trailing hidden dimension to recover (batch, sequence).
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device

        batch_size, seq_length = input_shape

        # Length of the cached prefix, read from dimension 2 of the first cached tensor.
        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]

        if attention_mask is None:
            # Default mask attends to every position, cached prefix included.
            total_length = seq_length + past_key_values_length
            attention_mask = torch.ones((batch_size, total_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                # Reuse the buffer registered on the embeddings, cut to the current length.
                buffered_ids = self.embeddings.token_type_ids[:, :seq_length]
                token_type_ids = buffered_ids.expand(batch_size, seq_length)
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        encoder_extended_attention_mask = None
        if cfg.is_decoder and encoder_hidden_states is not None:
            enc_batch_size, enc_seq_length, _ = encoder_hidden_states.size()
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones((enc_batch_size, enc_seq_length), device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, cfg.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]
        pooled_output = None if self.pooler is None else self.pooler(sequence_output)

        if return_dict:
            return BaseModelOutputWithPoolingAndCrossAttentions(
                last_hidden_state=sequence_output,
                pooler_output=pooled_output,
                past_key_values=encoder_outputs.past_key_values,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
                cross_attentions=encoder_outputs.cross_attentions,
            )

        # Tuple form: sequence output, pooled output, then whatever else the encoder returned.
        return (sequence_output, pooled_output) + encoder_outputs[1:]
1245
+
1246
+
1247
@add_start_docstrings(
    "XLM-RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.",
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaForCausalLM(XLMRobertaPreTrainedModel):
    """XLM-RoBERTa backbone (without pooler) followed by an LM head, trained with a next-token loss."""

    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        """Build the pooler-free backbone and the LM head; warn when `config.is_decoder` is not set."""
        super().__init__(config)

        if not config.is_decoder:
            # The message names this class (the upstream text referred to a class that does not exist here).
            logger.warning("If you want to use `XLMRobertaForCausalLM` as a standalone, add `is_decoder=True`.")

        self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
        self.lm_head = XLMRobertaLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        """Return the vocabulary projection layer of the LM head."""
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the vocabulary projection layer of the LM head with `new_embeddings`."""
        self.lm_head.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, XLMRobertaForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
        >>> config = AutoConfig.from_pretrained("FacebookAI/xlm-roberta-base")
        >>> config.is_decoder = True
        >>> model = XLMRobertaForCausalLM.from_pretrained("FacebookAI/xlm-roberta-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            # Caching is switched off whenever a loss is requested.
            use_cache = False

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        lm_loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(prediction_scores.device)
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            # Index 1 of the backbone tuple (the pooled-output slot) is skipped.
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        """
        Assemble the keyword arguments for one generation step.

        When a cache is supplied, `input_ids` is trimmed so that only tokens not already covered by the cache are
        kept. Returns a dict with `input_ids`, `attention_mask` and `past_key_values`; `model_kwargs` is accepted
        but not forwarded.
        """
        input_shape = input_ids.shape
        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        # cut decoder_input_ids if past_key_values is used
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}

    def _reorder_cache(self, past_key_values, beam_idx):
        """Return the cache with every tensor re-indexed along dimension 0 by `beam_idx`, as a tuple of tuples."""
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )
1409
+
1410
+
1411
@add_start_docstrings(
    """XLM-RoBERTa Model with a `language modeling` head on top.""",
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
    """XLM-RoBERTa backbone (without pooler) followed by an LM head, trained with a masked-token loss."""

    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        """Build the pooler-free backbone and the LM head; warn when `config.is_decoder` is set."""
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `XLMRobertaForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
        self.lm_head = XLMRobertaLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        """Return the vocabulary projection layer of the LM head."""
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the vocabulary projection layer of the LM head with `new_embeddings`."""
        self.lm_head.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        mask="<mask>",
        expected_output="' Paris'",
        expected_loss=0.1,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        prediction_scores = self.lm_head(outputs[0])

        masked_lm_loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            flat_labels = labels.to(prediction_scores.device).view(-1)
            flat_scores = prediction_scores.view(-1, self.config.vocab_size)
            masked_lm_loss = CrossEntropyLoss()(flat_scores, flat_labels)

        if return_dict:
            return MaskedLMOutput(
                loss=masked_lm_loss,
                logits=prediction_scores,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: optional loss, logits, then the backbone extras from index 2 onwards.
        output = (prediction_scores,) + outputs[2:]
        return output if masked_lm_loss is None else (masked_lm_loss,) + output
1507
+
1508
+
1509
# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead
class XLMRobertaLMHead(nn.Module):
    """Roberta Head for masked language modeling."""

    def __init__(self, config):
        """Create the dense transform, the layer norm and the vocabulary decoder sharing `self.bias`."""
        super().__init__()
        hidden_size = config.hidden_size
        vocab_size = config.vocab_size

        self.dense = nn.Linear(hidden_size, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(hidden_size, vocab_size)
        # The decoder's bias is replaced by a zero-initialised parameter that is also exposed as `self.bias`.
        self.bias = nn.Parameter(torch.zeros(vocab_size))
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        """Apply dense -> gelu -> layer norm, then project to vocabulary size (bias included)."""
        transformed = self.layer_norm(gelu(self.dense(features)))

        # project back to size of vocabulary with bias
        return self.decoder(transformed)

    def _tie_weights(self):
        """Re-link `self.bias` and `self.decoder.bias` so both names refer to the same parameter."""
        # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
        # For accelerate compatibility and to not break backward compatibility
        if self.decoder.bias.device.type != "meta":
            self.bias = self.decoder.bias
        else:
            self.decoder.bias = self.bias
1539
+
1540
+
1541
@add_start_docstrings(
    """
    XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel):
    """XLM-RoBERTa backbone (without pooler) followed by `XLMRobertaClassificationHead`."""

    def __init__(self, config):
        """Build the pooler-free backbone and the classification head from `config`."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
        self.classifier = XLMRobertaClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="cardiffnlp/twitter-roberta-base-emotion",
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="'optimism'",
        expected_loss=0.08,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = self.classifier(outputs[0])

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)

            # Infer the problem type once from the label count and label dtype, and store it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    inferred_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    inferred_type = "single_label_classification"
                else:
                    inferred_type = "multi_label_classification"
                self.config.problem_type = inferred_type

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                loss = BCEWithLogitsLoss()(logits, labels)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: optional loss, logits, then the backbone extras from index 2 onwards.
        output = (logits,) + outputs[2:]
        return output if loss is None else (loss,) + output
1639
+
1640
+
1641
@add_start_docstrings(
    """
    XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
    a softmax) e.g. for RocStories/SWAG tasks.
    """,
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel):
    """XLM-RoBERTa backbone (with pooler), dropout and a single-output linear scorer applied per choice."""

    def __init__(self, config):
        """Build the backbone, the dropout layer and the one-logit-per-choice classifier."""
        super().__init__(config)

        self.roberta = XLMRobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(
        XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
    )
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """

        def _merge_choices(tensor, trailing_dims=1):
            # Collapse every leading dimension into one, keeping the last `trailing_dims` dimensions.
            if tensor is None:
                return None
            return tensor.view(-1, *tensor.shape[-trailing_dims:])

        if return_dict is None:
            return_dict = self.config.use_return_dict

        # The number of choices is dimension 1 of whichever input was supplied.
        if input_ids is not None:
            num_choices = input_ids.shape[1]
        else:
            num_choices = inputs_embeds.shape[1]

        outputs = self.roberta(
            _merge_choices(input_ids),
            position_ids=_merge_choices(position_ids),
            token_type_ids=_merge_choices(token_type_ids),
            attention_mask=_merge_choices(attention_mask),
            head_mask=head_mask,
            inputs_embeds=_merge_choices(inputs_embeds, trailing_dims=2),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 1 of the backbone output is the pooled output.
        pooled_output = self.dropout(outputs[1])
        logits = self.classifier(pooled_output)
        # Regroup the per-row scores so each row holds the scores of one example's choices.
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(reshaped_logits.device)
            loss = CrossEntropyLoss()(reshaped_logits, labels)

        if return_dict:
            return MultipleChoiceModelOutput(
                loss=loss,
                logits=reshaped_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: optional loss, logits, then the backbone extras from index 2 onwards.
        output = (reshaped_logits,) + outputs[2:]
        return output if loss is None else (loss,) + output
1734
+
1735
+
1736
@add_start_docstrings(
    """
    XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
    for Named-Entity-Recognition (NER) tasks.
    """,
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaForTokenClassification(XLMRobertaPreTrainedModel):
    """XLM-RoBERTa backbone (without pooler), dropout and a per-token linear classifier."""

    def __init__(self, config):
        """Build the pooler-free backbone, the dropout layer and the per-token classifier."""
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = XLMRobertaModel(config, add_pooling_layer=False)

        # `classifier_dropout` takes precedence; the hidden dropout probability is the fallback.
        classifier_dropout = config.classifier_dropout
        if classifier_dropout is None:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="Jean-Baptiste/roberta-large-ner-english",
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
        expected_loss=0.01,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        token_states = self.dropout(outputs[0])
        logits = self.classifier(token_states)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: optional loss, logits, then the backbone extras from index 2 onwards.
        output = (logits,) + outputs[2:]
        return output if loss is None else (loss,) + output
1820
+
1821
+
1822
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->XLMRoberta
1823
class XLMRobertaClassificationHead(nn.Module):
    """Sentence-level classification head applied to the first token's hidden state."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)

        # Use the dedicated classifier dropout when configured; otherwise fall back to the
        # dropout rate used for hidden states.
        dropout_prob = config.classifier_dropout
        if dropout_prob is None:
            dropout_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        # Position 0 holds the <s> token (equiv. to [CLS]).
        first_token = features[:, 0, :]
        # dropout -> dense -> tanh -> dropout -> projection to label logits
        hidden = torch.tanh(self.dense(self.dropout(first_token)))
        return self.out_proj(self.dropout(hidden))
1843
+
1844
+
1845
@add_start_docstrings(
    """
    XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaForQuestionAnswering(XLMRobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Encoder built without a pooling layer; one linear layer yields the span logits.
        self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="deepset/roberta-base-squad2",
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="' puppet'",
        expected_loss=0.86,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # An explicit argument wins; otherwise defer to the model configuration.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Split the projected output along its last dimension into start and end logits.
        span_logits = self.qa_outputs(encoder_outputs[0])
        start_logits, end_logits = span_logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # Multi-GPU gathering can add a trailing dimension to the targets; drop it.
            if start_positions.dim() > 1:
                start_positions = start_positions.squeeze(-1)
            if end_positions.dim() > 1:
                end_positions = end_positions.squeeze(-1)

            # Targets outside the model inputs are clamped to `ignored_index`, which the
            # loss then skips through `ignore_index`.
            ignored_index = start_logits.size(1)
            span_criterion = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = span_criterion(start_logits, start_positions.clamp(0, ignored_index))
            end_loss = span_criterion(end_logits, end_positions.clamp(0, ignored_index))
            total_loss = (start_loss + end_loss) / 2

        if return_dict:
            return QuestionAnsweringModelOutput(
                loss=total_loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: optional loss, both logit tensors, then the encoder outputs from index 2 on.
        output = (start_logits, end_logits) + encoder_outputs[2:]
        return output if total_loss is None else (total_loss,) + output
1945
+
1946
+
1947
+ # Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
1948
def create_position_ids_from_input_ids(
    input_ids: torch.Tensor, padding_idx: int, past_key_values_length: int = 0
) -> torch.Tensor:
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids (`torch.Tensor`):
            Token ids. Positions are counted along dimension 1, so the tensor needs at least two dimensions.
        padding_idx (`int`):
            Id of the padding token. Entries equal to it keep `padding_idx` as their position id.
        past_key_values_length (`int`, *optional*, defaults to 0):
            Offset added to the position of every non-padding token.

    Returns:
        `torch.Tensor`: an int64 tensor with the same shape as `input_ids`. The k-th non-padding token of a row
        (counting from 1) gets `padding_idx + past_key_values_length + k`; padding entries get `padding_idx`.
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    # Running count of non-padding tokens, shifted by the cache length; multiplying by the mask
    # zeroes the padding slots so they end up exactly at `padding_idx` after the final shift.
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:222975faa02f5257c6e8c734e85973e48c8d42d7d37d90b894c73efa1841d76a
3
+ size 17083154
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "mask_token": "<mask>",
49
+ "model_max_length": 8192,
50
+ "pad_token": "<pad>",
51
+ "sep_token": "</s>",
52
+ "sp_model_kwargs": {},
53
+ "tokenizer_class": "XLMRobertaTokenizer",
54
+ "unk_token": "<unk>"
55
+ }