Files changed (3)
  1. modeling_qwen.py +0 -1431
  2. modules.json +1 -1
  3. tokenization_qwen.py +0 -267
modeling_qwen.py DELETED
@@ -1,1431 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
- # and OPT implementations in this library. It has been modified from its
6
- # original forms to accommodate minor architectural differences compared
7
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
- #
9
- # Licensed under the Apache License, Version 2.0 (the "License");
10
- # you may not use this file except in compliance with the License.
11
- # You may obtain a copy of the License at
12
- #
13
- # http://www.apache.org/licenses/LICENSE-2.0
14
- #
15
- # Unless required by applicable law or agreed to in writing, software
16
- # distributed under the License is distributed on an "AS IS" BASIS,
17
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
- # See the License for the specific language governing permissions and
19
- # limitations under the License.
20
- """ PyTorch Qwen2 model."""
21
- from transformers import Qwen2Config
22
- import inspect
23
- import math
24
- import os
25
- import warnings
26
- from typing import List, Optional, Tuple, Union
27
-
28
- import torch
29
- import torch.nn.functional as F
30
- import torch.utils.checkpoint
31
- from torch import nn
32
- from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
33
-
34
- from transformers.activations import ACT2FN
35
- from transformers.cache_utils import Cache, DynamicCache
36
- from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa, _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
37
- from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
38
- from transformers.modeling_utils import PreTrainedModel
39
- from transformers.utils import (
40
- add_start_docstrings,
41
- add_start_docstrings_to_model_forward,
42
- is_flash_attn_2_available,
43
- is_flash_attn_greater_or_equal_2_10,
44
- logging,
45
- replace_return_docstrings,
46
- )
47
-
48
-
49
- # if is_flash_attn_2_available():
50
- # from flash_attn import flash_attn_func, flash_attn_varlen_func
51
- # from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
52
-
53
- # _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
54
-
55
- # REMOVING THE CODE ABOVE SHOULD ALLOW RUNNING THIS MODEL WITHOUT FLASH ATTENTION SUPPORT.
56
-
57
-
58
- logger = logging.get_logger(__name__)
59
-
60
-
61
- _CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta"
62
- _CONFIG_FOR_DOC = "Qwen2Config"
63
-
64
- QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [
65
- "Qwen/Qwen2-7B-beta",
66
- # See all Qwen2 models at https://huggingface.co/models?filter=qwen2
67
- ]
68
-
69
-
70
- # Copied from transformers.models.llama.modeling_llama._get_unpad_data
71
- def _get_unpad_data(attention_mask):
72
- seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
73
- indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
74
- max_seqlen_in_batch = seqlens_in_batch.max().item()
75
- cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
76
- return (
77
- indices,
78
- cu_seqlens,
79
- max_seqlen_in_batch,
80
- )
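- # `indices` selects the non-padding positions of the flattened (batch * seq_len) axis, `cu_seqlens`
- # holds the cumulative sequence lengths expected by the flash-attn varlen kernels, and
- # `max_seqlen_in_batch` is the length of the longest unpadded sequence in the batch.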
81
-
82
-
83
- # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2
84
- class Qwen2RMSNorm(nn.Module):
85
- def __init__(self, hidden_size, eps=1e-6):
86
- """
87
- Qwen2RMSNorm is equivalent to T5LayerNorm
88
- """
89
- super().__init__()
90
- self.weight = nn.Parameter(torch.ones(hidden_size))
91
- self.variance_epsilon = eps
92
-
93
- def forward(self, hidden_states):
94
- input_dtype = hidden_states.dtype
95
- hidden_states = hidden_states.to(torch.float32)
96
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
97
- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
98
- return self.weight * hidden_states.to(input_dtype)
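- # In other words: y = weight * x / sqrt(mean(x**2, dim=-1) + eps), with the reduction done in
- # float32 for numerical stability before casting back to the input dtype.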
99
-
100
-
101
- # Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Qwen2
102
- class Qwen2RotaryEmbedding(nn.Module):
103
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
104
- super().__init__()
105
-
106
- self.dim = dim
107
- self.max_position_embeddings = max_position_embeddings
108
- self.base = base
109
- inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
110
- self.register_buffer("inv_freq", inv_freq, persistent=False)
111
-
112
- # Build here to make `torch.jit.trace` work.
113
- self._set_cos_sin_cache(
114
- seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
115
- )
116
-
117
- def _set_cos_sin_cache(self, seq_len, device, dtype):
118
- self.max_seq_len_cached = seq_len
119
- t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
120
-
121
- freqs = torch.outer(t, self.inv_freq)
122
- # Different from paper, but it uses a different permutation in order to obtain the same calculation
123
- emb = torch.cat((freqs, freqs), dim=-1)
124
- self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
125
- self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
126
-
127
- def forward(self, x, seq_len=None):
128
- # x: [bs, num_attention_heads, seq_len, head_size]
129
- if seq_len > self.max_seq_len_cached:
130
- self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
131
-
132
- return (
133
- self.cos_cached[:seq_len].to(dtype=x.dtype),
134
- self.sin_cached[:seq_len].to(dtype=x.dtype),
135
- )
136
-
137
-
138
- # Copied from transformers.models.llama.modeling_llama.rotate_half
139
- def rotate_half(x):
140
- """Rotates half the hidden dims of the input."""
141
- x1 = x[..., : x.shape[-1] // 2]
142
- x2 = x[..., x.shape[-1] // 2 :]
143
- return torch.cat((-x2, x1), dim=-1)
144
-
145
-
146
- # Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
147
- def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
148
- """Applies Rotary Position Embedding to the query and key tensors.
149
-
150
- Args:
151
- q (`torch.Tensor`): The query tensor.
152
- k (`torch.Tensor`): The key tensor.
153
- cos (`torch.Tensor`): The cosine part of the rotary embedding.
154
- sin (`torch.Tensor`): The sine part of the rotary embedding.
155
- position_ids (`torch.Tensor`):
156
- The position indices of the tokens corresponding to the query and key tensors. For example, this can be
157
- used to pass offsetted position ids when working with a KV-cache.
158
- unsqueeze_dim (`int`, *optional*, defaults to 1):
159
- The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
160
- sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
161
- that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
162
- k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
163
- cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
164
- the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
165
- Returns:
166
- `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
167
- """
168
- cos = cos[position_ids].unsqueeze(unsqueeze_dim)
169
- sin = sin[position_ids].unsqueeze(unsqueeze_dim)
170
- q_embed = (q * cos) + (rotate_half(q) * sin)
171
- k_embed = (k * cos) + (rotate_half(k) * sin)
172
- return q_embed, k_embed
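- # Shape walkthrough: with q, k of shape (batch, num_heads, seq_len, head_dim) and cached cos/sin of
- # shape (max_seq_len, head_dim), cos[position_ids] and sin[position_ids] are (batch, seq_len, head_dim);
- # unsqueeze_dim=1 turns them into (batch, 1, seq_len, head_dim) so they broadcast over the head axis.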
173
-
174
-
175
- # Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2
176
- class Qwen2MLP(nn.Module):
177
- def __init__(self, config):
178
- super().__init__()
179
- self.config = config
180
- self.hidden_size = config.hidden_size
181
- self.intermediate_size = config.intermediate_size
182
- self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
183
- self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
184
- self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
185
- self.act_fn = ACT2FN[config.hidden_act]
186
-
187
- def forward(self, x):
188
- return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
189
-
190
-
191
- # Copied from transformers.models.llama.modeling_llama.repeat_kv
192
- def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
193
- """
194
- This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
195
- num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
196
- """
197
- batch, num_key_value_heads, slen, head_dim = hidden_states.shape
198
- if n_rep == 1:
199
- return hidden_states
200
- hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
201
- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
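- # Example (illustrative head counts): with num_attention_heads=28 and num_key_value_heads=4 (n_rep=7),
- # a (batch, 4, seqlen, head_dim) key/value tensor becomes (batch, 28, seqlen, head_dim) so that it
- # lines up with the query heads in the attention matmul.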
202
-
203
-
204
- class Qwen2Attention(nn.Module):
205
- """
206
- Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
207
- and "Generating Long Sequences with Sparse Transformers".
208
- """
209
-
210
- def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
211
- super().__init__()
212
- self.config = config
213
- self.layer_idx = layer_idx
214
- if layer_idx is None:
215
- logger.warning_once(
216
- f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
217
- "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
218
- "when creating this class."
219
- )
220
-
221
- self.hidden_size = config.hidden_size
222
- self.num_heads = config.num_attention_heads
223
- self.head_dim = self.hidden_size // self.num_heads
224
- self.num_key_value_heads = config.num_key_value_heads
225
- self.num_key_value_groups = self.num_heads // self.num_key_value_heads
226
- self.max_position_embeddings = config.max_position_embeddings
227
- self.rope_theta = config.rope_theta
228
- self.is_causal = True
229
- self.attention_dropout = config.attention_dropout
230
-
231
- if (self.head_dim * self.num_heads) != self.hidden_size:
232
- raise ValueError(
233
- f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
234
- f" and `num_heads`: {self.num_heads})."
235
- )
236
- self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
237
- self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
238
- self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
239
- self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
240
-
241
- self.rotary_emb = Qwen2RotaryEmbedding(
242
- self.head_dim,
243
- max_position_embeddings=self.max_position_embeddings,
244
- base=self.rope_theta,
245
- )
246
-
247
- def forward(
248
- self,
249
- hidden_states: torch.Tensor,
250
- attention_mask: Optional[torch.Tensor] = None,
251
- position_ids: Optional[torch.LongTensor] = None,
252
- past_key_value: Optional[Cache] = None,
253
- output_attentions: bool = False,
254
- use_cache: bool = False,
255
- **kwargs,
256
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
257
- if "padding_mask" in kwargs:
258
- warnings.warn(
259
- "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
260
- )
261
- bsz, q_len, _ = hidden_states.size()
262
-
263
- query_states = self.q_proj(hidden_states)
264
- key_states = self.k_proj(hidden_states)
265
- value_states = self.v_proj(hidden_states)
266
-
267
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
268
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
269
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
270
-
271
- kv_seq_len = key_states.shape[-2]
272
- if past_key_value is not None:
273
- if self.layer_idx is None:
274
- raise ValueError(
275
- f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
276
- "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
277
- "with a layer index."
278
- )
279
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
280
- cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
281
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
282
-
283
- if past_key_value is not None:
284
- cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
285
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
286
-
287
- # repeat k/v heads if n_kv_heads < n_heads
288
- key_states = repeat_kv(key_states, self.num_key_value_groups)
289
- value_states = repeat_kv(value_states, self.num_key_value_groups)
290
-
291
- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
292
-
293
- if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
294
- raise ValueError(
295
- f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
296
- f" {attn_weights.size()}"
297
- )
298
-
299
- if attention_mask is not None:
300
- if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
301
- raise ValueError(
302
- f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
303
- )
304
-
305
- attn_weights = attn_weights + attention_mask
306
-
307
- # upcast attention to fp32
308
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
309
- attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
310
- attn_output = torch.matmul(attn_weights, value_states)
311
-
312
- if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
313
- raise ValueError(
314
- f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
315
- f" {attn_output.size()}"
316
- )
317
-
318
- attn_output = attn_output.transpose(1, 2).contiguous()
319
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
320
-
321
- attn_output = self.o_proj(attn_output)
322
-
323
- if not output_attentions:
324
- attn_weights = None
325
-
326
- return attn_output, attn_weights, past_key_value
327
-
328
-
329
- class Qwen2FlashAttention2(Qwen2Attention):
330
- """
331
- Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention`
332
- as the weights of the module stays untouched. The only required change would be on the forward pass
333
- where it needs to correctly call the public API of flash attention and deal with padding tokens
334
- in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
335
- config.max_window_layers layers.
336
- """
337
-
338
- # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
339
- def __init__(self, *args, **kwargs):
340
- super().__init__(*args, **kwargs)
341
-
342
- # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
343
- # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
344
- # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
345
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
346
-
347
- def forward(
348
- self,
349
- hidden_states: torch.Tensor,
350
- attention_mask: Optional[torch.Tensor] = None,
351
- position_ids: Optional[torch.LongTensor] = None,
352
- past_key_value: Optional[Cache] = None,
353
- output_attentions: bool = False,
354
- use_cache: bool = False,
355
- is_causal: bool = False,
356
- **kwargs,
357
- ):
358
- if "padding_mask" in kwargs:
359
- warnings.warn(
360
- "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
361
- )
362
-
363
- # overwrite attention_mask with padding_mask
364
- attention_mask = kwargs.pop("padding_mask")
365
- bsz, q_len, _ = hidden_states.size()
366
-
367
- query_states = self.q_proj(hidden_states)
368
- key_states = self.k_proj(hidden_states)
369
- value_states = self.v_proj(hidden_states)
370
-
371
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
372
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
373
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
374
-
375
- kv_seq_len = key_states.shape[-2]
376
- if past_key_value is not None:
377
- if self.layer_idx is None:
378
- raise ValueError(
379
- f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
380
- "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
381
- "with a layer index."
382
- )
383
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
384
-
385
- # Because the input can be padded, the absolute sequence length depends on the max position id.
386
- rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
387
- cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
388
-
389
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
390
-
391
- use_sliding_windows = (
392
- _flash_supports_window_size
393
- and getattr(self.config, "sliding_window", None) is not None
394
- and kv_seq_len > self.config.sliding_window
395
- and self.config.use_sliding_window
396
- )
397
-
398
- if not _flash_supports_window_size:
399
- logger.warning_once(
400
- "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
401
- " make sure to upgrade flash-attn library."
402
- )
403
-
404
- if past_key_value is not None:
405
- # Activate cache slicing only if the config has a `sliding_window` attribute set
406
- cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
407
- if (
408
- getattr(self.config, "sliding_window", None) is not None
409
- and kv_seq_len > self.config.sliding_window
410
- and cache_has_contents
411
- ):
412
- slicing_tokens = 1 - self.config.sliding_window
413
-
414
- past_key = past_key_value[self.layer_idx][0]
415
- past_value = past_key_value[self.layer_idx][1]
416
-
417
- past_key = past_key[:, :, slicing_tokens:, :].contiguous()
418
- past_value = past_value[:, :, slicing_tokens:, :].contiguous()
419
-
420
- if past_key.shape[-2] != self.config.sliding_window - 1:
421
- raise ValueError(
422
- f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
423
- f" {past_key.shape}"
424
- )
425
-
426
- if attention_mask is not None:
427
- attention_mask = attention_mask[:, slicing_tokens:]
428
- attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
429
-
430
- cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
431
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
432
-
433
- # repeat k/v heads if n_kv_heads < n_heads
434
- key_states = repeat_kv(key_states, self.num_key_value_groups)
435
- value_states = repeat_kv(value_states, self.num_key_value_groups)
436
- dropout_rate = 0.0 if not self.training else self.attention_dropout
437
-
438
- # In PEFT, usually we cast the layer norms in float32 for training stability reasons
439
- # therefore the input hidden states gets silently casted in float32. Hence, we need
440
- # cast them back in float16 just to be sure everything works as expected.
441
- input_dtype = query_states.dtype
442
- if input_dtype == torch.float32:
443
- if torch.is_autocast_enabled():
444
- target_dtype = torch.get_autocast_gpu_dtype()
445
- # Handle the case where the model is quantized
446
- elif hasattr(self.config, "_pre_quantization_dtype"):
447
- target_dtype = self.config._pre_quantization_dtype
448
- else:
449
- target_dtype = self.q_proj.weight.dtype
450
-
451
- logger.warning_once(
452
- f"The input hidden states seems to be silently casted in float32, this might be related to"
453
- f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
454
- f" {target_dtype}."
455
- )
456
-
457
- query_states = query_states.to(target_dtype)
458
- key_states = key_states.to(target_dtype)
459
- value_states = value_states.to(target_dtype)
460
-
461
- # Reshape to the expected shape for Flash Attention
462
- query_states = query_states.transpose(1, 2)
463
- key_states = key_states.transpose(1, 2)
464
- value_states = value_states.transpose(1, 2)
465
-
466
- attn_output = self._flash_attention_forward(
467
- query_states,
468
- key_states,
469
- value_states,
470
- attention_mask,
471
- q_len,
472
- dropout=dropout_rate,
473
- use_sliding_windows=use_sliding_windows,
474
- is_causal=is_causal
475
- )
476
-
477
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
478
- attn_output = self.o_proj(attn_output)
479
-
480
- if not output_attentions:
481
- attn_weights = None
482
-
483
- return attn_output, attn_weights, past_key_value
484
-
485
- def _flash_attention_forward(
486
- self,
487
- query_states,
488
- key_states,
489
- value_states,
490
- attention_mask,
491
- query_length,
492
- dropout=0.0,
493
- softmax_scale=None,
494
- use_sliding_windows=False,
495
- is_causal=True,
496
- ):
497
- """
498
- Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
499
- first unpad the input, then computes the attention scores and pad the final attention scores.
500
-
501
- Args:
502
- query_states (`torch.Tensor`):
503
- Input query states to be passed to Flash Attention API
504
- key_states (`torch.Tensor`):
505
- Input key states to be passed to Flash Attention API
506
- value_states (`torch.Tensor`):
507
- Input value states to be passed to Flash Attention API
508
- attention_mask (`torch.Tensor`):
509
- The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
510
- position of padding tokens and 1 for the position of non-padding tokens.
511
- dropout (`int`, *optional*):
512
- Attention dropout
513
- softmax_scale (`float`, *optional*):
514
- The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
515
- use_sliding_windows (`bool`, *optional*):
516
- Whether to activate sliding window attention.
517
- """
518
- if not self._flash_attn_uses_top_left_mask:
519
- causal = is_causal
520
- else:
521
- # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
522
- causal = is_causal and query_length != 1
523
-
524
- # Decide whether to use SWA or not by layer index.
525
- if use_sliding_windows and self.layer_idx >= self.config.max_window_layers:
526
- use_sliding_windows = False
527
-
528
- # Contains at least one padding token in the sequence
529
- if attention_mask is not None:
530
- batch_size = query_states.shape[0]
531
- query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
532
- query_states, key_states, value_states, attention_mask, query_length
533
- )
534
-
535
- cu_seqlens_q, cu_seqlens_k = cu_seq_lens
536
- max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
537
-
538
- if not use_sliding_windows:
539
- attn_output_unpad = flash_attn_varlen_func(
540
- query_states,
541
- key_states,
542
- value_states,
543
- cu_seqlens_q=cu_seqlens_q,
544
- cu_seqlens_k=cu_seqlens_k,
545
- max_seqlen_q=max_seqlen_in_batch_q,
546
- max_seqlen_k=max_seqlen_in_batch_k,
547
- dropout_p=dropout,
548
- softmax_scale=softmax_scale,
549
- causal=causal,
550
- )
551
- else:
552
- attn_output_unpad = flash_attn_varlen_func(
553
- query_states,
554
- key_states,
555
- value_states,
556
- cu_seqlens_q=cu_seqlens_q,
557
- cu_seqlens_k=cu_seqlens_k,
558
- max_seqlen_q=max_seqlen_in_batch_q,
559
- max_seqlen_k=max_seqlen_in_batch_k,
560
- dropout_p=dropout,
561
- softmax_scale=softmax_scale,
562
- causal=causal,
563
- window_size=(self.config.sliding_window, self.config.sliding_window),
564
- )
565
-
566
- attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
567
- else:
568
- if not use_sliding_windows:
569
- attn_output = flash_attn_func(
570
- query_states,
571
- key_states,
572
- value_states,
573
- dropout,
574
- softmax_scale=softmax_scale,
575
- causal=causal,
576
- )
577
- else:
578
- attn_output = flash_attn_func(
579
- query_states,
580
- key_states,
581
- value_states,
582
- dropout,
583
- softmax_scale=softmax_scale,
584
- causal=causal,
585
- window_size=(self.config.sliding_window, self.config.sliding_window),
586
- )
587
-
588
- return attn_output
589
-
590
- # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
591
- def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
592
- batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
593
-
594
- # On the first iteration we need to properly re-create the padding mask
595
- # by slicing it in the proper place
596
- if kv_seq_len != attention_mask.shape[-1]:
597
- attention_mask_num_tokens = attention_mask.shape[-1]
598
- attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
599
-
600
- indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
601
-
602
- key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
603
- value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
604
-
605
- if query_length == kv_seq_len:
606
- query_layer = index_first_axis(
607
- query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
608
- )
609
- cu_seqlens_q = cu_seqlens_k
610
- max_seqlen_in_batch_q = max_seqlen_in_batch_k
611
- indices_q = indices_k
612
- elif query_length == 1:
613
- max_seqlen_in_batch_q = 1
614
- cu_seqlens_q = torch.arange(
615
- batch_size + 1, dtype=torch.int32, device=query_layer.device
616
- ) # There is a memcpy here, that is very bad.
617
- indices_q = cu_seqlens_q[:-1]
618
- query_layer = query_layer.squeeze(1)
619
- else:
620
- # The -q_len: slice assumes left padding.
621
- attention_mask = attention_mask[:, -query_length:]
622
- query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
623
-
624
- return (
625
- query_layer,
626
- key_layer,
627
- value_layer,
628
- indices_q,
629
- (cu_seqlens_q, cu_seqlens_k),
630
- (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
631
- )
632
-
633
-
634
- # Copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Qwen2
635
- class Qwen2SdpaAttention(Qwen2Attention):
636
- """
637
- Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
638
- `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
639
- SDPA API.
640
- """
641
-
642
- # Adapted from Qwen2Attention.forward
643
- def forward(
644
- self,
645
- hidden_states: torch.Tensor,
646
- attention_mask: Optional[torch.Tensor] = None,
647
- position_ids: Optional[torch.LongTensor] = None,
648
- past_key_value: Optional[Cache] = None,
649
- output_attentions: bool = False,
650
- use_cache: bool = False,
651
- is_causal: bool = True,
652
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
653
- if output_attentions:
654
- # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
655
- logger.warning_once(
656
- "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
657
- 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
658
- )
659
- return super().forward(
660
- hidden_states=hidden_states,
661
- attention_mask=attention_mask,
662
- position_ids=position_ids,
663
- past_key_value=past_key_value,
664
- output_attentions=output_attentions,
665
- use_cache=use_cache,
666
- is_causal=is_causal
667
- )
668
-
669
- bsz, q_len, _ = hidden_states.size()
670
-
671
- query_states = self.q_proj(hidden_states)
672
- key_states = self.k_proj(hidden_states)
673
- value_states = self.v_proj(hidden_states)
674
-
675
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
676
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
677
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
678
-
679
- kv_seq_len = key_states.shape[-2]
680
- if past_key_value is not None:
681
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
682
- cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
683
-
684
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
685
-
686
- if past_key_value is not None:
687
- cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
688
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
689
-
690
- key_states = repeat_kv(key_states, self.num_key_value_groups)
691
- value_states = repeat_kv(value_states, self.num_key_value_groups)
692
-
693
- if attention_mask is not None:
694
- if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
695
- raise ValueError(
696
- f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
697
- )
698
-
699
- # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
700
- # Reference: https://github.com/pytorch/pytorch/issues/112577.
701
- if query_states.device.type == "cuda" and attention_mask is not None:
702
- query_states = query_states.contiguous()
703
- key_states = key_states.contiguous()
704
- value_states = value_states.contiguous()
705
-
706
- attn_output = torch.nn.functional.scaled_dot_product_attention(
707
- query_states,
708
- key_states,
709
- value_states,
710
- attn_mask=attention_mask,
711
- dropout_p=self.attention_dropout if self.training else 0.0,
712
- # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
713
- is_causal=is_causal and attention_mask is None and q_len > 1,
714
- )
715
-
716
- attn_output = attn_output.transpose(1, 2).contiguous()
717
- attn_output = attn_output.view(bsz, q_len, self.hidden_size)
718
-
719
- attn_output = self.o_proj(attn_output)
720
-
721
- return attn_output, None, past_key_value
722
-
723
-
724
- QWEN2_ATTENTION_CLASSES = {
725
- "eager": Qwen2Attention,
726
- "flash_attention_2": Qwen2FlashAttention2,
727
- "sdpa": Qwen2SdpaAttention,
728
- }
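- # The lookup key comes from config._attn_implementation, which transformers resolves from the
- # `attn_implementation` argument passed to `from_pretrained` (e.g. "eager", "sdpa", or "flash_attention_2").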
729
-
730
-
731
- class Qwen2DecoderLayer(nn.Module):
732
- def __init__(self, config: Qwen2Config, layer_idx: int):
733
- super().__init__()
734
- self.hidden_size = config.hidden_size
735
-
736
- if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
737
- logger.warning_once(
738
- f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
739
- "unexpected results may be encountered."
740
- )
741
- self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
742
-
743
- self.mlp = Qwen2MLP(config)
744
- self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
745
- self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
746
-
747
- def forward(
748
- self,
749
- hidden_states: torch.Tensor,
750
- attention_mask: Optional[torch.Tensor] = None,
751
- position_ids: Optional[torch.LongTensor] = None,
752
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
753
- output_attentions: Optional[bool] = False,
754
- use_cache: Optional[bool] = False,
755
- is_causal: Optional[bool] = True,
756
- **kwargs,
757
- ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
758
- if "padding_mask" in kwargs:
759
- warnings.warn(
760
- "Passing `padding_mask` is deprecated and will be removed in v4.37. "
761
- "Please make sure use `attention_mask` instead.`"
762
- )
763
- """
764
- Args:
765
- hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
766
- attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
767
- `(batch, sequence_length)` where padding elements are indicated by 0.
768
- output_attentions (`bool`, *optional*):
769
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
770
- returned tensors for more detail.
771
- use_cache (`bool`, *optional*):
772
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
773
- (see `past_key_values`).
774
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
775
- """
776
-
777
- residual = hidden_states
778
-
779
- hidden_states = self.input_layernorm(hidden_states)
780
-
781
- # Self Attention
782
- hidden_states, self_attn_weights, present_key_value = self.self_attn(
783
- hidden_states=hidden_states,
784
- attention_mask=attention_mask,
785
- position_ids=position_ids,
786
- past_key_value=past_key_value,
787
- output_attentions=output_attentions,
788
- use_cache=use_cache,
789
- is_causal=is_causal,
790
- )
791
- hidden_states = residual + hidden_states
792
-
793
- # Fully Connected
794
- residual = hidden_states
795
- hidden_states = self.post_attention_layernorm(hidden_states)
796
- hidden_states = self.mlp(hidden_states)
797
- hidden_states = residual + hidden_states
798
-
799
- outputs = (hidden_states,)
800
-
801
- if output_attentions:
802
- outputs += (self_attn_weights,)
803
-
804
- if use_cache:
805
- outputs += (present_key_value,)
806
-
807
- return outputs
808
-
809
-
810
- QWEN2_START_DOCSTRING = r"""
811
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
812
- library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
813
- etc.)
814
-
815
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
816
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
817
- and behavior.
818
-
819
- Parameters:
820
- config ([`Qwen2Config`]):
821
- Model configuration class with all the parameters of the model. Initializing with a config file does not
822
- load the weights associated with the model, only the configuration. Check out the
823
- [`~PreTrainedModel.from_pretrained`] method to load the model weights.
824
- """
825
-
826
-
827
- @add_start_docstrings(
828
- "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
829
- QWEN2_START_DOCSTRING,
830
- )
831
- class Qwen2PreTrainedModel(PreTrainedModel):
832
- config_class = Qwen2Config
833
- base_model_prefix = "model"
834
- supports_gradient_checkpointing = True
835
- _no_split_modules = ["Qwen2DecoderLayer"]
836
- _skip_keys_device_placement = "past_key_values"
837
- _supports_flash_attn_2 = True
838
- _supports_sdpa = True
839
- _supports_cache_class = True
840
-
841
- def _init_weights(self, module):
842
- std = self.config.initializer_range
843
- if isinstance(module, nn.Linear):
844
- module.weight.data.normal_(mean=0.0, std=std)
845
- if module.bias is not None:
846
- module.bias.data.zero_()
847
- elif isinstance(module, nn.Embedding):
848
- module.weight.data.normal_(mean=0.0, std=std)
849
- if module.padding_idx is not None:
850
- module.weight.data[module.padding_idx].zero_()
851
-
852
-
853
- QWEN2_INPUTS_DOCSTRING = r"""
854
- Args:
855
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
856
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
857
- it.
858
-
859
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
860
- [`PreTrainedTokenizer.__call__`] for details.
861
-
862
- [What are input IDs?](../glossary#input-ids)
863
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
864
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
865
-
866
- - 1 for tokens that are **not masked**,
867
- - 0 for tokens that are **masked**.
868
-
869
- [What are attention masks?](../glossary#attention-mask)
870
-
871
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
872
- [`PreTrainedTokenizer.__call__`] for details.
873
-
874
- If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
875
- `past_key_values`).
876
-
877
- If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
878
- and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
879
- information on the default strategy.
880
-
881
- - 1 indicates the head is **not masked**,
882
- - 0 indicates the head is **masked**.
883
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
884
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
885
- config.n_positions - 1]`.
886
-
887
- [What are position IDs?](../glossary#position-ids)
888
- past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
889
- Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
890
- blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
891
- returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
892
-
893
- Two formats are allowed:
894
- - a [`~cache_utils.Cache`] instance;
895
- - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
896
- shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
897
- cache format.
898
-
899
- The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
900
- legacy cache format will be returned.
901
-
902
- If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
903
- have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
904
- of shape `(batch_size, sequence_length)`.
905
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
906
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
907
- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
908
- model's internal embedding lookup matrix.
909
- use_cache (`bool`, *optional*):
910
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
911
- `past_key_values`).
912
- output_attentions (`bool`, *optional*):
913
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
914
- tensors for more detail.
915
- output_hidden_states (`bool`, *optional*):
916
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
917
- more detail.
918
- return_dict (`bool`, *optional*):
919
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
920
- """
921
-
922
-
923
- @add_start_docstrings(
924
- "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
925
- QWEN2_START_DOCSTRING,
926
- )
927
- class Qwen2Model(Qwen2PreTrainedModel):
928
- """
929
- Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
930
-
931
- Args:
932
- config: Qwen2Config
933
- """
934
-
935
- def __init__(self, config: Qwen2Config):
936
- super().__init__(config)
937
- self.padding_idx = config.pad_token_id
938
- self.vocab_size = config.vocab_size
939
-
940
- self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
941
- self.layers = nn.ModuleList(
942
- [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
943
- )
944
- self._attn_implementation = config._attn_implementation
945
- self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
946
-
947
- self.gradient_checkpointing = False
948
- # Initialize weights and apply final processing
949
- self.post_init()
950
-
951
- def get_input_embeddings(self):
952
- return self.embed_tokens
953
-
954
- def set_input_embeddings(self, value):
955
- self.embed_tokens = value
956
-
957
- @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
958
- def forward(
959
- self,
960
- input_ids: torch.LongTensor = None,
961
- attention_mask: Optional[torch.Tensor] = None,
962
- position_ids: Optional[torch.LongTensor] = None,
963
- past_key_values: Optional[List[torch.FloatTensor]] = None,
964
- inputs_embeds: Optional[torch.FloatTensor] = None,
965
- use_cache: Optional[bool] = None,
966
- output_attentions: Optional[bool] = None,
967
- output_hidden_states: Optional[bool] = None,
968
- return_dict: Optional[bool] = None,
969
- labels: Optional[torch.LongTensor] = None,
970
- is_causal: Optional[bool] = False,
971
- ) -> Union[Tuple, BaseModelOutputWithPast]:
972
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
973
- output_hidden_states = (
974
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
975
- )
976
- use_cache = use_cache if use_cache is not None else self.config.use_cache
977
-
978
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
979
-
980
- # retrieve input_ids and inputs_embeds
981
- if input_ids is not None and inputs_embeds is not None:
982
- raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
983
- elif input_ids is not None:
984
- batch_size, seq_length = input_ids.shape
985
- elif inputs_embeds is not None:
986
- batch_size, seq_length, _ = inputs_embeds.shape
987
- else:
988
- raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
989
-
990
- if self.gradient_checkpointing and self.training:
991
- if use_cache:
992
- logger.warning_once(
993
- "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
994
- )
995
- use_cache = False
996
-
997
- past_key_values_length = 0
998
-
999
- if use_cache:
1000
- use_legacy_cache = not isinstance(past_key_values, Cache)
1001
- if use_legacy_cache:
1002
- past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1003
- past_key_values_length = past_key_values.get_usable_length(seq_length)
1004
-
1005
- if position_ids is None:
1006
- device = input_ids.device if input_ids is not None else inputs_embeds.device
1007
- position_ids = torch.arange(
1008
- past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
1009
- )
1010
- position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
1011
- else:
1012
- position_ids = position_ids.view(-1, seq_length).long()
1013
-
1014
- if inputs_embeds is None:
1015
- inputs_embeds = self.embed_tokens(input_ids)
1016
-
1017
- if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
1018
- is_padding_right = attention_mask[:, -1].sum().item() != batch_size
1019
- if is_padding_right:
1020
- raise ValueError(
1021
- "You are attempting to perform batched generation with padding_side='right'"
1022
- " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to "
1023
- " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
1024
- )
1025
-
1026
- if self._attn_implementation == "flash_attention_2":
1027
- # 2d mask is passed through the layers
1028
- attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
1029
- elif self._attn_implementation == "sdpa" and not output_attentions:
1030
- # output_attentions=True can not be supported when using SDPA, and we fall back on
1031
- # the manual implementation that requires a 4D causal mask in all cases.
1032
- if is_causal:
1033
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
1034
- attention_mask,
1035
- (batch_size, seq_length),
1036
- inputs_embeds,
1037
- past_key_values_length,
1038
- )
1039
- else:
1040
- attention_mask = _prepare_4d_attention_mask_for_sdpa(
1041
- attention_mask, inputs_embeds.dtype
1042
- )
1043
- else:
1044
- # 4d mask is passed through the layers
1045
- if is_causal:
1046
- # Causal mask with -3.3895e+38 where no attention should be
1047
- attention_mask = _prepare_4d_causal_attention_mask(
1048
- attention_mask,
1049
- (batch_size, seq_length),
1050
- inputs_embeds,
1051
- past_key_values_length,
1052
- sliding_window=self.config.sliding_window,
1053
- )
1054
- else:
1055
- # Shape: batch_size, 1, query_length, key_value_length
1056
- attention_mask = _prepare_4d_attention_mask(
1057
- attention_mask, inputs_embeds.dtype
1058
- )
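- # Note: unlike the upstream Qwen2 implementation, when `is_causal` is False this model builds a plain
- # (non-causal) padding mask, so the layers can attend bidirectionally over the sequence.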
1059
-
1060
- hidden_states = inputs_embeds
1061
-
1062
- # decoder layers
1063
- all_hidden_states = () if output_hidden_states else None
1064
- all_self_attns = () if output_attentions else None
1065
- next_decoder_cache = None
1066
-
1067
- for decoder_layer in self.layers:
1068
- if output_hidden_states:
1069
- all_hidden_states += (hidden_states,)
1070
-
1071
- if self.gradient_checkpointing and self.training:
1072
- layer_outputs = self._gradient_checkpointing_func(
1073
- decoder_layer.__call__,
1074
- hidden_states,
1075
- attention_mask,
1076
- position_ids,
1077
- past_key_values,
1078
- output_attentions,
1079
- use_cache,
1080
- is_causal,
1081
- )
1082
- else:
1083
- layer_outputs = decoder_layer(
1084
- hidden_states,
1085
- attention_mask=attention_mask,
1086
- position_ids=position_ids,
1087
- past_key_value=past_key_values,
1088
- output_attentions=output_attentions,
1089
- use_cache=use_cache,
1090
- is_causal=is_causal,
1091
- )
1092
-
1093
- hidden_states = layer_outputs[0]
1094
-
1095
- if use_cache:
1096
- next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1097
-
1098
- if output_attentions:
1099
- all_self_attns += (layer_outputs[1],)
1100
-
1101
- hidden_states = self.norm(hidden_states)
1102
-
1103
- # add hidden states from the last decoder layer
1104
- if output_hidden_states:
1105
- all_hidden_states += (hidden_states,)
1106
-
1107
- next_cache = None
1108
- if use_cache:
1109
- next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
1110
-
1111
- if not return_dict:
1112
- return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1113
- return BaseModelOutputWithPast(
1114
- last_hidden_state=hidden_states,
1115
- past_key_values=next_cache,
1116
- hidden_states=all_hidden_states,
1117
- attentions=all_self_attns,
1118
- )
1119
-
1120
-
1121
- class Qwen2ForCausalLM(Qwen2PreTrainedModel):
1122
- _tied_weights_keys = ["lm_head.weight"]
1123
-
1124
- def __init__(self, config):
1125
- super().__init__(config)
1126
- self.model = Qwen2Model(config)
1127
- self.vocab_size = config.vocab_size
1128
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1129
-
1130
- # Initialize weights and apply final processing
1131
- self.post_init()
1132
-
1133
- def get_input_embeddings(self):
1134
- return self.model.embed_tokens
1135
-
1136
- def set_input_embeddings(self, value):
1137
- self.model.embed_tokens = value
1138
-
1139
- def get_output_embeddings(self):
1140
- return self.lm_head
1141
-
1142
- def set_output_embeddings(self, new_embeddings):
1143
- self.lm_head = new_embeddings
1144
-
1145
- def set_decoder(self, decoder):
1146
- self.model = decoder
1147
-
1148
- def get_decoder(self):
1149
- return self.model
1150
-
1151
- @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
1152
- @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1153
- def forward(
1154
- self,
1155
- input_ids: torch.LongTensor = None,
1156
- attention_mask: Optional[torch.Tensor] = None,
1157
- position_ids: Optional[torch.LongTensor] = None,
1158
- past_key_values: Optional[List[torch.FloatTensor]] = None,
1159
- inputs_embeds: Optional[torch.FloatTensor] = None,
1160
- labels: Optional[torch.LongTensor] = None,
1161
- use_cache: Optional[bool] = None,
1162
- output_attentions: Optional[bool] = None,
1163
- output_hidden_states: Optional[bool] = None,
1164
- return_dict: Optional[bool] = None,
1165
- is_causal: Optional[bool] = False,
1166
- ) -> Union[Tuple, CausalLMOutputWithPast]:
1167
- r"""
1168
- Args:
1169
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1170
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1171
- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1172
- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1173
-
1174
- Returns:
1175
-
1176
- Example:
1177
-
1178
- ```python
1179
- >>> from transformers import AutoTokenizer, Qwen2ForCausalLM
1180
-
1181
- >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1182
- >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1183
-
1184
- >>> prompt = "Hey, are you conscious? Can you talk to me?"
1185
- >>> inputs = tokenizer(prompt, return_tensors="pt")
1186
-
1187
- >>> # Generate
1188
- >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1189
- >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1190
- "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1191
- ```"""
1192
-
1193
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1194
- output_hidden_states = (
1195
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1196
- )
1197
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1198
-
1199
- # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1200
- outputs = self.model(
1201
- input_ids=input_ids,
1202
- attention_mask=attention_mask,
1203
- position_ids=position_ids,
1204
- past_key_values=past_key_values,
1205
- inputs_embeds=inputs_embeds,
1206
- use_cache=use_cache,
1207
- output_attentions=output_attentions,
1208
- output_hidden_states=output_hidden_states,
1209
- return_dict=return_dict,
1210
- is_causal=is_causal,
1211
- )
1212
-
1213
- hidden_states = outputs[0]
1214
- logits = self.lm_head(hidden_states)
1215
- logits = logits.float()
1216
-
1217
- loss = None
1218
- if labels is not None:
1219
- # Shift so that tokens < n predict n
1220
- shift_logits = logits[..., :-1, :].contiguous()
1221
- shift_labels = labels[..., 1:].contiguous()
1222
- # Flatten the tokens
1223
- loss_fct = CrossEntropyLoss()
1224
- shift_logits = shift_logits.view(-1, self.config.vocab_size)
1225
- shift_labels = shift_labels.view(-1)
1226
- # Enable model parallelism
1227
- shift_labels = shift_labels.to(shift_logits.device)
1228
- loss = loss_fct(shift_logits, shift_labels)
1229
-
1230
- if not return_dict:
1231
- output = (logits,) + outputs[1:]
1232
- return (loss,) + output if loss is not None else output
1233
-
1234
- return CausalLMOutputWithPast(
1235
- loss=loss,
1236
- logits=logits,
1237
- past_key_values=outputs.past_key_values,
1238
- hidden_states=outputs.hidden_states,
1239
- attentions=outputs.attentions,
1240
- )
1241
-
1242
- def prepare_inputs_for_generation(
1243
- self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1244
- ):
1245
- # Omit tokens covered by past_key_values
1246
- if past_key_values is not None:
1247
- if isinstance(past_key_values, Cache):
1248
- cache_length = past_key_values.get_seq_length()
1249
- past_length = past_key_values.seen_tokens
1250
- max_cache_length = past_key_values.get_max_length()
1251
- else:
1252
- cache_length = past_length = past_key_values[0][0].shape[2]
1253
- max_cache_length = None
1254
-
1255
- # Keep only the unprocessed tokens:
1256
- # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1257
- # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
1258
- # input)
1259
- if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1260
- input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
1261
- # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1262
- # input_ids based on the past_length.
1263
- elif past_length < input_ids.shape[1]:
1264
- input_ids = input_ids[:, past_length:]
1265
- # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
1266
-
1267
- # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
1268
- if (
1269
- max_cache_length is not None
1270
- and attention_mask is not None
1271
- and cache_length + input_ids.shape[1] > max_cache_length
1272
- ):
1273
- attention_mask = attention_mask[:, -max_cache_length:]
1274
-
1275
- position_ids = kwargs.get("position_ids", None)
1276
- if attention_mask is not None and position_ids is None:
1277
- # create position_ids on the fly for batch generation
1278
- position_ids = attention_mask.long().cumsum(-1) - 1
1279
- position_ids.masked_fill_(attention_mask == 0, 1)
1280
- if past_key_values:
1281
- position_ids = position_ids[:, -input_ids.shape[1] :]
1282
-
1283
- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1284
- if inputs_embeds is not None and past_key_values is None:
1285
- model_inputs = {"inputs_embeds": inputs_embeds}
1286
- else:
1287
- model_inputs = {"input_ids": input_ids}
1288
-
1289
- model_inputs.update(
1290
- {
1291
- "position_ids": position_ids,
1292
- "past_key_values": past_key_values,
1293
- "use_cache": kwargs.get("use_cache"),
1294
- "attention_mask": attention_mask,
1295
- }
1296
- )
1297
- return model_inputs
1298
-
1299
- @staticmethod
1300
- def _reorder_cache(past_key_values, beam_idx):
1301
- reordered_past = ()
1302
- for layer_past in past_key_values:
1303
- reordered_past += (
1304
- tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1305
- )
1306
- return reordered_past
1307
-
1308
-
1309
- @add_start_docstrings(
1310
- """
1311
- The Qwen2 Model transformer with a sequence classification head on top (linear layer).
1312
-
1313
- [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1314
- (e.g. GPT-2) do.
1315
-
1316
- Since it does classification on the last token, it needs to know the position of the last token. If a
1317
- `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1318
- no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1319
- padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1320
- each row of the batch).
1321
- """,
1322
- QWEN2_START_DOCSTRING,
1323
- )
1324
- class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
1325
- def __init__(self, config):
1326
- super().__init__(config)
1327
- self.num_labels = config.num_labels
1328
- self.model = Qwen2Model(config)
1329
- self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1330
-
1331
- # Initialize weights and apply final processing
1332
- self.post_init()
1333
-
1334
- def get_input_embeddings(self):
1335
- return self.model.embed_tokens
1336
-
1337
- def set_input_embeddings(self, value):
1338
- self.model.embed_tokens = value
1339
-
1340
- @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
1341
- def forward(
1342
- self,
1343
- input_ids: torch.LongTensor = None,
1344
- attention_mask: Optional[torch.Tensor] = None,
1345
- position_ids: Optional[torch.LongTensor] = None,
1346
- past_key_values: Optional[List[torch.FloatTensor]] = None,
1347
- inputs_embeds: Optional[torch.FloatTensor] = None,
1348
- labels: Optional[torch.LongTensor] = None,
1349
- use_cache: Optional[bool] = None,
1350
- output_attentions: Optional[bool] = None,
1351
- output_hidden_states: Optional[bool] = None,
1352
- return_dict: Optional[bool] = None,
1353
- is_causal: Optional[bool] = True,
1354
- ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1355
- r"""
1356
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1357
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1358
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1359
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1360
- """
1361
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1362
-
1363
- transformer_outputs = self.model(
1364
- input_ids,
1365
- attention_mask=attention_mask,
1366
- position_ids=position_ids,
1367
- past_key_values=past_key_values,
1368
- inputs_embeds=inputs_embeds,
1369
- use_cache=use_cache,
1370
- output_attentions=output_attentions,
1371
- output_hidden_states=output_hidden_states,
1372
- return_dict=return_dict,
1373
- is_causal=is_causal,
1374
- )
1375
- hidden_states = transformer_outputs[0]
1376
- logits = self.score(hidden_states)
1377
-
1378
- if input_ids is not None:
1379
- batch_size = input_ids.shape[0]
1380
- else:
1381
- batch_size = inputs_embeds.shape[0]
1382
-
1383
- if self.config.pad_token_id is None and batch_size != 1:
1384
- raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1385
- if self.config.pad_token_id is None:
1386
- sequence_lengths = -1
1387
- else:
1388
- if input_ids is not None:
1389
- # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1390
- sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1391
- sequence_lengths = sequence_lengths % input_ids.shape[-1]
1392
- sequence_lengths = sequence_lengths.to(logits.device)
1393
- else:
1394
- sequence_lengths = -1
1395
-
1396
- pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1397
-
1398
- loss = None
1399
- if labels is not None:
1400
- labels = labels.to(logits.device)
1401
- if self.config.problem_type is None:
1402
- if self.num_labels == 1:
1403
- self.config.problem_type = "regression"
1404
- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1405
- self.config.problem_type = "single_label_classification"
1406
- else:
1407
- self.config.problem_type = "multi_label_classification"
1408
-
1409
- if self.config.problem_type == "regression":
1410
- loss_fct = MSELoss()
1411
- if self.num_labels == 1:
1412
- loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1413
- else:
1414
- loss = loss_fct(pooled_logits, labels)
1415
- elif self.config.problem_type == "single_label_classification":
1416
- loss_fct = CrossEntropyLoss()
1417
- loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1418
- elif self.config.problem_type == "multi_label_classification":
1419
- loss_fct = BCEWithLogitsLoss()
1420
- loss = loss_fct(pooled_logits, labels)
1421
- if not return_dict:
1422
- output = (pooled_logits,) + transformer_outputs[1:]
1423
- return ((loss,) + output) if loss is not None else output
1424
-
1425
- return SequenceClassifierOutputWithPast(
1426
- loss=loss,
1427
- logits=pooled_logits,
1428
- past_key_values=transformer_outputs.past_key_values,
1429
- hidden_states=transformer_outputs.hidden_states,
1430
- attentions=transformer_outputs.attentions,
1431
- )
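
For reference, a minimal sketch of the last-non-padding-token pooling step that the deleted `Qwen2ForSequenceClassification.forward` performs; the pad id, input ids, and logits below are toy stand-ins, not values from the checkpoint.

```python
# Minimal sketch of the pooling step in the deleted Qwen2ForSequenceClassification.forward:
# pick, for every sequence, the logits at the last non-padding position.
# Toy tensors only; nothing here comes from the actual model.
import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 8, 3, 0, 0],    # padded row: last real token at index 2
                          [7, 2, 9, 4, 6]])   # full row: last token at index 4
logits = torch.randn(2, 5, 3)                 # (batch, seq_len, num_labels), stand-in for score(hidden_states)

# argmax over the pad mask gives the first pad position; minus one gives the last real token.
# The modulo keeps rows without padding (argmax 0 - 1 = -1) at seq_len - 1, for ONNX-friendly indexing.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]

pooled_logits = logits[torch.arange(input_ids.shape[0]), sequence_lengths]
print(sequence_lengths.tolist())  # [2, 4]
print(pooled_logits.shape)        # torch.Size([2, 3])
```
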
modules.json CHANGED
@@ -14,7 +14,7 @@
14
   {
15
   "idx": 2,
16
   "name": "2",
17
- "path": "2_Dense_1024",
17
+ "path": "2_Dense_2048",
18
   "type": "sentence_transformers.models.Dense"
19
   }
20
   ]
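
The only change to `modules.json` swaps the final projection from `2_Dense_1024` to `2_Dense_2048`, which suggests the pooled embedding is now projected to 2048 dimensions rather than 1024. A hedged way to confirm which Dense head is active after pulling the update; the repo id below is a placeholder, not the actual model name.

```python
# Sketch only: check which Dense head modules.json wires in after the change.
# "your-org/your-qwen2-embedding-model" is a placeholder repo id; substitute the real one.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("your-org/your-qwen2-embedding-model", trust_remote_code=True)

print(model)  # the last module should now report Dense with out_features=2048
print(model.get_sentence_embedding_dimension())  # expected: 2048 with the updated modules.json

emb = model.encode(["Hello world"])
print(emb.shape)  # (1, 2048)
```
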
tokenization_qwen.py DELETED
@@ -1,267 +0,0 @@
1
-
2
- from typing import List, Optional
3
- from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as OriginalQwen2Tokenizer
4
- from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast as OriginalQwen2TokenizerFast
5
- from tokenizers import processors
6
-
7
- VOCAB_FILES_NAMES = {
8
- "vocab_file": "vocab.json",
9
- "merges_file": "merges.txt",
10
- "tokenizer_file": "tokenizer.json",
11
- }
12
-
13
- class Qwen2Tokenizer(OriginalQwen2Tokenizer):
14
- """
15
- Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.
16
-
17
- As with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens, so a word will
18
- be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:
19
-
20
- ```python
21
- >>> from transformers import Qwen2Tokenizer
22
-
23
- >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
24
- >>> tokenizer("Hello world")["input_ids"]
25
- [9707, 1879]
26
-
27
- >>> tokenizer(" Hello world")["input_ids"]
28
- [21927, 1879]
29
- ```
30
- This is expected.
31
-
32
- You should not use GPT2Tokenizer instead, because of the different pretokenization rules.
33
-
34
- This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
35
- this superclass for more information regarding those methods.
36
-
37
- Args:
38
- vocab_file (`str`):
39
- Path to the vocabulary file.
40
- merges_file (`str`):
41
- Path to the merges file.
42
- errors (`str`, *optional*, defaults to `"replace"`):
43
- Paradigm to follow when decoding bytes to UTF-8. See
44
- [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
45
- unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
46
- The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
47
- token instead.
48
- bos_token (`str`, *optional*):
49
- The beginning of sequence token. Not applicable for this tokenizer.
50
- eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
51
- The end of sequence token.
52
- pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
53
- The token used for padding, for example when batching sequences of different lengths.
54
- clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
55
- Whether or not the model should clean up the spaces that were added when splitting the input text during the
56
- tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
57
- split_special_tokens (`bool`, *optional*, defaults to `False`):
58
- Whether or not the special tokens should be split during the tokenization process. The default behavior is
59
- to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") =
60
- ['<|endoftext|>`]. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will be give `['<',
61
- '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
62
- add_eos_token (`bool`, *optional*, defaults to `False`):
63
- Whether or not to add an `eos_token` at the end of sequences.
64
- """
65
-
66
- def __init__(
67
- self,
68
- vocab_file,
69
- merges_file,
70
- errors="replace",
71
- unk_token="<|endoftext|>",
72
- bos_token=None,
73
- eos_token="<|endoftext|>",
74
- pad_token="<|endoftext|>",
75
- clean_up_tokenization_spaces=False,
76
- split_special_tokens=False,
77
- add_eos_token=False,
78
- **kwargs,
79
- ):
80
- # The add_eos_token code was inspired by the LlamaTokenizer
81
- self.add_eos_token = add_eos_token
82
-
83
- super().__init__(
84
- vocab_file=vocab_file,
85
- merges_file=merges_file,
86
- errors=errors,
87
- unk_token=unk_token,
88
- bos_token=bos_token,
89
- eos_token=eos_token,
90
- pad_token=pad_token,
91
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
92
- split_special_tokens=split_special_tokens,
93
- add_eos_token=add_eos_token,
94
- **kwargs,
95
- )
96
-
97
- def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
98
- eos_token_id = [self.eos_token_id] if self.add_eos_token else []
99
-
100
- output = token_ids_0 + eos_token_id
101
-
102
- if token_ids_1 is not None:
103
- output = output + token_ids_1 + eos_token_id
104
-
105
- return output
106
-
107
- def get_special_tokens_mask(
108
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
109
- ) -> List[int]:
110
- """
111
- Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
112
- special tokens using the tokenizer `prepare_for_model` method.
113
-
114
- Args:
115
- token_ids_0 (`List[int]`):
116
- List of IDs.
117
- token_ids_1 (`List[int]`, *optional*):
118
- Optional second list of IDs for sequence pairs.
119
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
120
- Whether or not the token list is already formatted with special tokens for the model.
121
-
122
- Returns:
123
- `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
124
- """
125
- if already_has_special_tokens:
126
- return super().get_special_tokens_mask(
127
- token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
128
- )
129
-
130
- eos_token_id = [1] if self.add_eos_token else []
131
-
132
- if token_ids_1 is None:
133
- return ([0] * len(token_ids_0)) + eos_token_id
134
- return (
135
- ([0] * len(token_ids_0))
136
- + eos_token_id
137
- + ([0] * len(token_ids_1))
138
- + eos_token_id
139
- )
140
-
141
- def create_token_type_ids_from_sequences(
142
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
143
- ) -> List[int]:
144
- """
145
- Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A Qwen2
146
- sequence pair mask has the following format:
147
-
148
- ```
149
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
150
- | first sequence | second sequence |
151
- ```
152
-
153
- if token_ids_1 is None, only returns the first portion of the mask (0s).
154
-
155
- Args:
156
- token_ids_0 (`List[int]`):
157
- List of ids.
158
- token_ids_1 (`List[int]`, *optional*):
159
- Optional second list of IDs for sequence pairs.
160
-
161
- Returns:
162
- `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
163
- """
164
- eos_token_id = [self.eos_token_id] if self.add_eos_token else []
165
-
166
- output = [0] * len(token_ids_0 + eos_token_id)
167
-
168
- if token_ids_1 is not None:
169
- output += [1] * len(token_ids_1 + eos_token_id)
170
-
171
- return output
172
-
173
- class Qwen2TokenizerFast(OriginalQwen2TokenizerFast):
174
- """
175
- Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
176
- Byte-Pair-Encoding.
177
-
178
- As with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens, so a word will
179
- be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:
180
-
181
- ```python
182
- >>> from transformers import Qwen2TokenizerFast
183
-
184
- >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
185
- >>> tokenizer("Hello world")["input_ids"]
186
- [9707, 1879]
187
-
188
- >>> tokenizer(" Hello world")["input_ids"]
189
- [21927, 1879]
190
- ```
191
- This is expected.
192
-
193
- This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
194
- refer to this superclass for more information regarding those methods.
195
-
196
- Args:
197
- vocab_file (`str`, *optional*):
198
- Path to the vocabulary file.
199
- merges_file (`str`, *optional*):
200
- Path to the merges file.
201
- tokenizer_file (`str`, *optional*):
202
- Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
203
- contains everything needed to load the tokenizer.
204
- unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
205
- The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
206
- token instead. Not applicable to this tokenizer.
207
- bos_token (`str`, *optional*):
208
- The beginning of sequence token. Not applicable for this tokenizer.
209
- eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
210
- The end of sequence token.
211
- pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
212
- The token used for padding, for example when batching sequences of different lengths.
213
- add_eos_token (`bool`, *optional*, defaults to `False`):
214
- Whether or not to add an `eos_token` at the end of sequences.
215
- """
216
-
217
- slow_tokenizer_class = Qwen2Tokenizer
218
- padding_side = "left"
219
-
220
- def __init__(
221
- self,
222
- vocab_file=None,
223
- merges_file=None,
224
- tokenizer_file=None,
225
- unk_token="<|endoftext|>",
226
- bos_token=None,
227
- eos_token="<|endoftext|>",
228
- pad_token="<|endoftext|>",
229
- add_eos_token=False,
230
- **kwargs,
231
- ):
232
- super().__init__(
233
- vocab_file=vocab_file,
234
- merges_file=merges_file,
235
- tokenizer_file=tokenizer_file,
236
- unk_token=unk_token,
237
- bos_token=bos_token,
238
- eos_token=eos_token,
239
- pad_token=pad_token,
240
- **kwargs,
241
- )
242
-
243
- self._add_eos_token = add_eos_token
244
- self.update_post_processor()
245
-
246
- def update_post_processor(self):
247
- """
248
- Updates the underlying post processor with the current `eos_token`.
249
- """
250
- eos = self.eos_token
251
- eos_token_id = self.eos_token_id
252
- if eos is None and self.add_eos_token:
253
- raise ValueError("add_eos_token = True but eos_token = None")
254
-
255
- single = f"$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
256
- pair = f"{single} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
257
-
258
- special_tokens = []
259
- if self.add_eos_token:
260
- special_tokens.append((eos, eos_token_id))
261
- self._tokenizer.post_processor = processors.TemplateProcessing(
262
- single=single, pair=pair, special_tokens=special_tokens
263
- )
264
-
265
- @property
266
- def add_eos_token(self):
267
- return self._add_eos_token
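
With the custom `tokenization_qwen.py` deleted, the `add_eos_token` switch it provided is gone; callers that relied on an `<|endoftext|>` appended to every encoding now have to add it explicitly with the stock tokenizer. A minimal sketch, using the public `Qwen/Qwen-tokenizer` checkpoint referenced in the docstrings above; exact ids depend on the tokenizer actually shipped with this repository.

```python
# Sketch: reproduce add_eos_token=True by hand, now that the deleted Qwen2TokenizerFast
# subclass no longer appends the eos token for us. The ids shown come from the docstring
# example above and may differ for the tokenizer shipped in this repository.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-tokenizer")

ids = tokenizer("Hello world", add_special_tokens=False)["input_ids"]
ids_with_eos = ids + [tokenizer.eos_token_id]  # manual equivalent of add_eos_token=True

print(ids)           # [9707, 1879] per the docstring example
print(ids_with_eos)  # same ids followed by the <|endoftext|> id
```
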