CRIS-Yang committed on
Commit
7b6241f
·
verified ·
1 Parent(s): 9f5551e

Model Initial Update 1

Browse files

Update without safetensors

added_tokens.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</ground>": 32101,
3
+ "</objects>": 32103,
4
+ "<ground>": 32100,
5
+ "<obj0>": 32000,
6
+ "<obj10>": 32010,
7
+ "<obj11>": 32011,
8
+ "<obj12>": 32012,
9
+ "<obj13>": 32013,
10
+ "<obj14>": 32014,
11
+ "<obj15>": 32015,
12
+ "<obj16>": 32016,
13
+ "<obj17>": 32017,
14
+ "<obj18>": 32018,
15
+ "<obj19>": 32019,
16
+ "<obj1>": 32001,
17
+ "<obj20>": 32020,
18
+ "<obj21>": 32021,
19
+ "<obj22>": 32022,
20
+ "<obj23>": 32023,
21
+ "<obj24>": 32024,
22
+ "<obj25>": 32025,
23
+ "<obj26>": 32026,
24
+ "<obj27>": 32027,
25
+ "<obj28>": 32028,
26
+ "<obj29>": 32029,
27
+ "<obj2>": 32002,
28
+ "<obj30>": 32030,
29
+ "<obj31>": 32031,
30
+ "<obj32>": 32032,
31
+ "<obj33>": 32033,
32
+ "<obj34>": 32034,
33
+ "<obj35>": 32035,
34
+ "<obj36>": 32036,
35
+ "<obj37>": 32037,
36
+ "<obj38>": 32038,
37
+ "<obj39>": 32039,
38
+ "<obj3>": 32003,
39
+ "<obj40>": 32040,
40
+ "<obj41>": 32041,
41
+ "<obj42>": 32042,
42
+ "<obj43>": 32043,
43
+ "<obj44>": 32044,
44
+ "<obj45>": 32045,
45
+ "<obj46>": 32046,
46
+ "<obj47>": 32047,
47
+ "<obj48>": 32048,
48
+ "<obj49>": 32049,
49
+ "<obj4>": 32004,
50
+ "<obj50>": 32050,
51
+ "<obj51>": 32051,
52
+ "<obj52>": 32052,
53
+ "<obj53>": 32053,
54
+ "<obj54>": 32054,
55
+ "<obj55>": 32055,
56
+ "<obj56>": 32056,
57
+ "<obj57>": 32057,
58
+ "<obj58>": 32058,
59
+ "<obj59>": 32059,
60
+ "<obj5>": 32005,
61
+ "<obj60>": 32060,
62
+ "<obj61>": 32061,
63
+ "<obj62>": 32062,
64
+ "<obj63>": 32063,
65
+ "<obj64>": 32064,
66
+ "<obj65>": 32065,
67
+ "<obj66>": 32066,
68
+ "<obj67>": 32067,
69
+ "<obj68>": 32068,
70
+ "<obj69>": 32069,
71
+ "<obj6>": 32006,
72
+ "<obj70>": 32070,
73
+ "<obj71>": 32071,
74
+ "<obj72>": 32072,
75
+ "<obj73>": 32073,
76
+ "<obj74>": 32074,
77
+ "<obj75>": 32075,
78
+ "<obj76>": 32076,
79
+ "<obj77>": 32077,
80
+ "<obj78>": 32078,
81
+ "<obj79>": 32079,
82
+ "<obj7>": 32007,
83
+ "<obj80>": 32080,
84
+ "<obj81>": 32081,
85
+ "<obj82>": 32082,
86
+ "<obj83>": 32083,
87
+ "<obj84>": 32084,
88
+ "<obj85>": 32085,
89
+ "<obj86>": 32086,
90
+ "<obj87>": 32087,
91
+ "<obj88>": 32088,
92
+ "<obj89>": 32089,
93
+ "<obj8>": 32008,
94
+ "<obj90>": 32090,
95
+ "<obj91>": 32091,
96
+ "<obj92>": 32092,
97
+ "<obj93>": 32093,
98
+ "<obj94>": 32094,
99
+ "<obj95>": 32095,
100
+ "<obj96>": 32096,
101
+ "<obj97>": 32097,
102
+ "<obj98>": 32098,
103
+ "<obj99>": 32099,
104
+ "<obj9>": 32009,
105
+ "<objects>": 32102
106
+ }
config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ChatRexAuxForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "modeling_chatrex.ChatRexAuxConfig",
7
+ "AutoModelForCausalLM": "modeling_chatrex.ChatRexAuxForConditionalGeneration"
8
+ },
9
+ "ignore_index": -100,
10
+ "image_token_index": 32000,
11
+ "model_type": "chatrex",
12
+ "projector_depth": 2,
13
+ "projector_hidden_act": "gelu",
14
+ "text_config": {
15
+ "_name_or_path": "huggingface_checkpoints/lmsys/vicuna-7b-v1.5",
16
+ "architectures": [
17
+ "LlamaForCausalLM"
18
+ ],
19
+ "max_position_embeddings": 4096,
20
+ "model_type": "llama",
21
+ "pad_token_id": 0,
22
+ "rms_norm_eps": 1e-05,
23
+ "torch_dtype": "bfloat16",
24
+ "vocab_size": 32104
25
+ },
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.44.2",
28
+ "vision_aux_config": {
29
+ "optimize_vision_tower_aux": false,
30
+ "type": "OpenCLIPVisionTower",
31
+ "use_last_feat": true,
32
+ "vision_tower": "openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup"
33
+ },
34
+ "vision_config": {
35
+ "_name_or_path": "huggingface_checkpoints/openai/clip-vit-large-patch14-336",
36
+ "dropout": 0.0,
37
+ "hidden_size": 1024,
38
+ "image_size": 336,
39
+ "intermediate_size": 4096,
40
+ "model_type": "clip_vision_model",
41
+ "num_attention_heads": 16,
42
+ "num_hidden_layers": 24,
43
+ "patch_size": 14,
44
+ "projection_dim": 768
45
+ },
46
+ "vision_feature_layer": -2,
47
+ "vision_feature_select_strategy": "default",
48
+ "visual_prompt_encoder_config": {
49
+ "add_pos_embedding": true,
50
+ "channel_per_level": [
51
+ 192,
52
+ 384,
53
+ 768,
54
+ 1536
55
+ ],
56
+ "output_size": 7,
57
+ "pos_embedding_dim": 2880,
58
+ "spatail_scale": 0.25,
59
+ "type": "MultiLevelROIVisualPrompt",
60
+ "with_additional_projection": false
61
+ },
62
+ "visual_prompt_hidden_size": 2880
63
+ }
convnext.py ADDED
@@ -0,0 +1,624 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from functools import partial
3
+ from typing import Callable, List, Optional, Tuple, Union
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from open_clip.factory import get_model_config
8
+ from open_clip.model import CLIPVisionCfg
9
+ from timm.layers import (AvgPool2dSame, ClassifierHead, DropPath,
10
+ GlobalResponseNormMlp, LayerNorm, LayerNorm2d, Mlp,
11
+ NormMlpClassifierHead, create_conv2d, get_act_layer,
12
+ make_divisible, to_ntuple, trunc_normal_)
13
+ from timm.models._builder import build_model_with_cfg
14
+ from timm.models._features import feature_take_indices
15
+ from timm.models._manipulate import checkpoint_seq, named_apply
16
+
17
+
18
+ __all__ = ['ConvNeXt'] # model_registry will add each entrypoint fn to this
19
+
20
+
21
class Downsample(nn.Module):
    """Shortcut-branch resampler: optional 2x2 average pooling followed by an optional 1x1 conv.

    Pooling is inserted when ``stride > 1`` or ``dilation > 1``; the 1x1 convolution is
    inserted only when the channel count changes. Otherwise each step is an identity.
    """

    def __init__(self, in_chs, out_chs, stride=1, dilation=1):
        super().__init__()
        needs_pool = stride > 1 or dilation > 1
        if not needs_pool:
            self.pool = nn.Identity()
        else:
            # With dilation the spatial size is kept, so the pool runs at stride 1.
            pool_stride = 1 if dilation != 1 else stride
            same_padded = pool_stride == 1 and dilation > 1
            pool_cls = AvgPool2dSame if same_padded else nn.AvgPool2d
            self.pool = pool_cls(2, pool_stride, ceil_mode=True, count_include_pad=False)

        if in_chs == out_chs:
            self.conv = nn.Identity()
        else:
            self.conv = create_conv2d(in_chs, out_chs, 1, stride=1)

    def forward(self, x):
        """Apply the pooling step and then the channel projection."""
        return self.conv(self.pool(x))
41
+
42
+
43
class ConvNeXtBlock(nn.Module):
    """Residual ConvNeXt block: depthwise conv -> norm -> MLP -> optional layer scale -> add shortcut.

    Two data layouts are supported for the norm + MLP part:
      * ``conv_mlp=True``: norm and MLP are applied directly to the (N, C, H, W) tensor.
      * ``conv_mlp=False``: the tensor is permuted to (N, H, W, C) for norm + MLP and
        permuted back afterwards.
    """

    def __init__(
            self,
            in_chs: int,
            out_chs: Optional[int] = None,
            kernel_size: int = 7,
            stride: int = 1,
            dilation: Union[int, Tuple[int, int]] = (1, 1),
            mlp_ratio: float = 4,
            conv_mlp: bool = False,
            conv_bias: bool = True,
            use_grn: bool = False,
            ls_init_value: Optional[float] = 1e-6,
            act_layer: Union[str, Callable] = 'gelu',
            norm_layer: Optional[Callable] = None,
            drop_path: float = 0.,
    ):
        """
        Args:
            in_chs: Number of input channels.
            out_chs: Number of output channels; falls back to ``in_chs`` when falsy.
            kernel_size: Kernel size of the depthwise convolution.
            stride: Stride of the depthwise convolution.
            dilation: Pair (or scalar expanded to a pair); element 0 is used for the
                depthwise conv and the shortcut, and a mismatch between the two elements
                forces a non-identity shortcut.
            mlp_ratio: Hidden width of the MLP as a multiple of ``out_chs``.
            conv_mlp: Keep the tensor channels-first and use a conv-style MLP / 2d norm.
            conv_bias: Whether the depthwise convolution has a bias.
            use_grn: Use ``GlobalResponseNormMlp`` instead of ``Mlp``.
            ls_init_value: Initial layer-scale value; layer scale is disabled when None.
            act_layer: Activation (name or callable) resolved through ``get_act_layer``.
            norm_layer: Norm constructor; defaults to LayerNorm2d / LayerNorm by layout.
            drop_path: Stochastic-depth probability; disabled when not > 0.
        """
        super().__init__()
        out_chs = out_chs or in_chs
        dilation_pair = to_ntuple(2)(dilation)
        activation = get_act_layer(act_layer)
        norm_layer = norm_layer or (LayerNorm2d if conv_mlp else LayerNorm)
        mlp_cls = GlobalResponseNormMlp if use_grn else Mlp
        build_mlp = partial(mlp_cls, use_conv=conv_mlp)

        self.use_conv_mlp = conv_mlp
        self.conv_dw = create_conv2d(
            in_chs,
            out_chs,
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation_pair[0],
            depthwise=True,
            bias=conv_bias,
        )
        self.norm = norm_layer(out_chs)
        self.mlp = build_mlp(out_chs, int(mlp_ratio * out_chs), act_layer=activation)

        # Per-channel layer scale. The attribute name `ramma` is part of the state-dict
        # keys, so it must not be renamed.
        if ls_init_value is None:
            self.ramma = None
        else:
            self.ramma = nn.Parameter(ls_init_value * torch.ones(out_chs))

        shape_changes = in_chs != out_chs or stride != 1 or dilation_pair[0] != dilation_pair[1]
        if shape_changes:
            self.shortcut = Downsample(in_chs, out_chs, stride=stride, dilation=dilation_pair[0])
        else:
            self.shortcut = nn.Identity()

        self.drop_path = nn.Identity() if not drop_path > 0. else DropPath(drop_path)

    def forward(self, x):
        """Run the block on a channels-first input and return the residual sum."""
        identity = x
        x = self.conv_dw(x)
        if not self.use_conv_mlp:
            # channels-last detour for LayerNorm / Linear based MLP
            x = x.permute(0, 2, 3, 1)
            x = self.mlp(self.norm(x))
            x = x.permute(0, 3, 1, 2)
        else:
            x = self.mlp(self.norm(x))

        if self.ramma is not None:
            x = x.mul(self.ramma.reshape(1, -1, 1, 1))

        return self.drop_path(x) + self.shortcut(identity)
129
+
130
+
131
class ConvNeXtStage(nn.Module):
    """One resolution stage: an optional norm + conv downsample followed by ``depth`` ConvNeXtBlocks."""

    def __init__(
            self,
            in_chs,
            out_chs,
            kernel_size=7,
            stride=2,
            depth=2,
            dilation=(1, 1),
            drop_path_rates=None,
            ls_init_value=1.0,
            conv_mlp=False,
            conv_bias=True,
            use_grn=False,
            act_layer='gelu',
            norm_layer=None,
            norm_layer_cl=None
    ):
        super().__init__()
        # Toggled externally (see ConvNeXt.set_grad_checkpointing).
        self.grad_checkpointing = False

        dilation_changes = dilation[0] != dilation[1]
        if in_chs != out_chs or stride > 1 or dilation_changes:
            reduce_kernel = 2 if (stride > 1 or dilation_changes) else 1
            reduce_pad = 'same' if dilation[1] > 1 else 0  # same padding needed if dilation used
            pre_norm = norm_layer(in_chs)
            reduce_conv = create_conv2d(
                in_chs,
                out_chs,
                kernel_size=reduce_kernel,
                stride=stride,
                dilation=dilation[0],
                padding=reduce_pad,
                bias=conv_bias,
            )
            self.downsample = nn.Sequential(pre_norm, reduce_conv)
            in_chs = out_chs
        else:
            self.downsample = nn.Identity()

        if not drop_path_rates:
            drop_path_rates = [0.] * depth
        # Blocks get the channels-first norm only when they keep the tensor channels-first.
        block_norm = norm_layer if conv_mlp else norm_layer_cl

        block_list = []
        block_in = in_chs
        for block_idx in range(depth):
            block = ConvNeXtBlock(
                in_chs=block_in,
                out_chs=out_chs,
                kernel_size=kernel_size,
                dilation=dilation[1],
                drop_path=drop_path_rates[block_idx],
                ls_init_value=ls_init_value,
                conv_mlp=conv_mlp,
                conv_bias=conv_bias,
                use_grn=use_grn,
                act_layer=act_layer,
                norm_layer=block_norm,
            )
            block_list.append(block)
            block_in = out_chs
        self.blocks = nn.Sequential(*block_list)

    def forward(self, x):
        """Downsample, then run the blocks (through ``checkpoint_seq`` when checkpointing is on)."""
        x = self.downsample(x)
        use_checkpoint = self.grad_checkpointing and not torch.jit.is_scripting()
        if use_checkpoint:
            return checkpoint_seq(self.blocks, x)
        return self.blocks(x)
198
+
199
+
200
class ConvNeXt(nn.Module):
    r"""ConvNeXt backbone with classifier head.

    PyTorch implementation of `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf

    The network is a stem, four ``ConvNeXtStage`` modules and a pooling / norm / fc head.
    """

    def __init__(
            self,
            in_chans: int = 3,
            num_classes: int = 1000,
            global_pool: str = 'avg',
            output_stride: int = 32,
            depths: Tuple[int, ...] = (3, 3, 9, 3),
            dims: Tuple[int, ...] = (96, 192, 384, 768),
            kernel_sizes: Union[int, Tuple[int, ...]] = 7,
            ls_init_value: Optional[float] = 1e-6,
            stem_type: str = 'patch',
            patch_size: int = 4,
            head_init_scale: float = 1.,
            head_norm_first: bool = False,
            head_hidden_size: Optional[int] = None,
            conv_mlp: bool = False,
            conv_bias: bool = True,
            use_grn: bool = False,
            act_layer: Union[str, Callable] = 'gelu',
            norm_layer: Optional[Union[str, Callable]] = None,
            norm_eps: Optional[float] = None,
            drop_rate: float = 0.,
            drop_path_rate: float = 0.,
    ):
        """
        Args:
            in_chans: Number of input image channels.
            num_classes: Number of classes for the classification head.
            global_pool: Global pooling type used by the head.
            output_stride: Output stride of the network, one of (8, 16, 32).
            depths: Number of blocks in each of the four stages.
            dims: Channel width of each of the four stages.
            kernel_sizes: Depthwise kernel size(s), scalar or one per stage.
            ls_init_value: Layer-scale init value, disabled if None.
            stem_type: One of 'patch', 'overlap', 'overlap_tiered'.
            patch_size: Kernel size and stride of the 'patch' stem.
            head_init_scale: Scale applied to head Linear weights/biases at init.
            head_norm_first: Use norm -> pool -> fc instead of pool -> norm -> fc.
            head_hidden_size: Hidden size of the MLP head (only valid when head_norm_first is False).
            conv_mlp: Use the channels-first (conv) MLP in every block.
            conv_bias: Use bias in convolutions.
            use_grn: Use Global Response Norm MLPs in the blocks.
            act_layer: Activation layer type for the blocks.
            norm_layer: Norm layer constructor; requires ``conv_mlp`` when given.
            norm_eps: Optional epsilon bound onto the norm constructors.
            drop_rate: Dropout rate of the head.
            drop_path_rate: Maximum stochastic-depth rate (linearly ramped over all blocks).
        """
        super().__init__()
        assert output_stride in (8, 16, 32)
        kernel_sizes = to_ntuple(4)(kernel_sizes)

        # `norm_layer` is used for stem / downsample / head, `norm_layer_cl` is handed to the
        # stages for blocks that run channels-last.
        default_norm = norm_layer is None
        if default_norm:
            norm_layer = LayerNorm2d
            norm_layer_cl = LayerNorm2d if conv_mlp else LayerNorm
        else:
            assert conv_mlp,\
                'If a norm_layer is specified, conv MLP must be used so all norm expect rank-4, channels-first input'
            norm_layer_cl = norm_layer
        if norm_eps is not None:
            if default_norm:
                # a caller-supplied norm_layer is left as given; only the default gets eps bound
                norm_layer = partial(norm_layer, eps=norm_eps)
            norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)

        self.num_classes = num_classes
        self.drop_rate = drop_rate
        self.feature_info = []

        assert stem_type in ('patch', 'overlap', 'overlap_tiered')
        if stem_type == 'patch':
            # single strided conv, kernel == stride == patch_size
            patch_conv = nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias)
            self.stem = nn.Sequential(patch_conv, norm_layer(dims[0]))
            stem_stride = patch_size
        else:
            mid_chs = make_divisible(dims[0] // 2) if 'tiered' in stem_type else dims[0]
            first_conv = nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias)
            second_conv = nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias)
            self.stem = nn.Sequential(first_conv, second_conv, norm_layer(dims[0]))
            stem_stride = 4

        # One list of drop-path rates per stage, ramping linearly from 0 to drop_path_rate.
        rate_ramp = torch.linspace(0, drop_path_rate, sum(depths))
        dp_rates = [chunk.tolist() for chunk in rate_ramp.split(depths)]

        stage_modules = []
        prev_chs = dims[0]
        curr_stride = stem_stride
        dilation = 1
        # 4 feature resolution stages, each consisting of multiple residual blocks
        for stage_idx in range(4):
            stride = 2 if (curr_stride == 2 or stage_idx > 0) else 1
            if curr_stride >= output_stride and stride > 1:
                # target output stride reached: trade further striding for dilation
                dilation *= stride
                stride = 1
            curr_stride *= stride
            first_dilation = 1 if dilation in (1, 2) else 2
            out_chs = dims[stage_idx]
            stage_modules.append(ConvNeXtStage(
                prev_chs,
                out_chs,
                kernel_size=kernel_sizes[stage_idx],
                stride=stride,
                dilation=(first_dilation, dilation),
                depth=depths[stage_idx],
                drop_path_rates=dp_rates[stage_idx],
                ls_init_value=ls_init_value,
                conv_mlp=conv_mlp,
                conv_bias=conv_bias,
                use_grn=use_grn,
                act_layer=act_layer,
                norm_layer=norm_layer,
                norm_layer_cl=norm_layer_cl,
            ))
            prev_chs = out_chs
            # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
            self.feature_info.append(dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{stage_idx}'))
        self.stages = nn.Sequential(*stage_modules)
        self.num_features = self.head_hidden_size = prev_chs

        # head_norm_first: norm -> global pool -> fc; otherwise pool -> norm -> fc
        if head_norm_first:
            assert not head_hidden_size
            self.norm_pre = norm_layer(self.num_features)
            self.head = ClassifierHead(
                self.num_features,
                num_classes,
                pool_type=global_pool,
                drop_rate=self.drop_rate,
            )
        else:
            self.norm_pre = nn.Identity()
            self.head = NormMlpClassifierHead(
                self.num_features,
                num_classes,
                hidden_size=head_hidden_size,
                pool_type=global_pool,
                drop_rate=self.drop_rate,
                norm_layer=norm_layer,
                act_layer='gelu',
            )
            self.head_hidden_size = self.head.num_features

        init_fn = partial(_init_weights, head_init_scale=head_init_scale)
        named_apply(init_fn, self)

    @torch.jit.ignore
    def group_matcher(self, coarse=False):
        """Return the regex patterns that group parameters into stem / block groups."""
        if coarse:
            block_patterns = r'^stages\.(\d+)'
        else:
            block_patterns = [
                (r'^stages\.(\d+)\.downsample', (0,)),  # blocks
                (r'^stages\.(\d+)\.blocks\.(\d+)', None),
                (r'^norm_pre', (99999,))
            ]
        return dict(stem=r'^stem', blocks=block_patterns)

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        """Set the ``grad_checkpointing`` flag on every stage."""
        for stage in self.stages:
            stage.grad_checkpointing = enable

    @torch.jit.ignore
    def get_classifier(self) -> nn.Module:
        """Return the final fc layer of the head."""
        return self.head.fc

    def reset_classifier(self, num_classes: int, global_pool: Optional[str] = None):
        """Record the new class count and delegate to ``self.head.reset``."""
        self.num_classes = num_classes
        self.head.reset(num_classes, global_pool)

    def forward_intermediates(
            self,
            x: torch.Tensor,
            indices: Optional[Union[int, List[int], Tuple[int]]] = None,
            norm: bool = False,
            stop_early: bool = False,
            output_fmt: str = 'NCHW',
            intermediates_only: bool = False,
    ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
        """Run the stem and stages, collecting selected intermediate feature maps.

        Feature index 0 is the stem output; index ``k`` (k >= 1) is the output of stage ``k - 1``.

        Args:
            x: Input image tensor.
            indices: Selection passed to ``feature_take_indices``.
            norm: Accepted for interface compatibility; not used in this implementation.
            stop_early: Only run the stages up to the last requested index (ignored when scripting).
            output_fmt: Must be 'NCHW'.
            intermediates_only: Return only the list of intermediates.

        Returns:
            The list of intermediates, or ``(norm_pre(x), intermediates)`` when
            ``intermediates_only`` is False.
        """
        assert output_fmt in ('NCHW',), 'Output shape must be NCHW.'
        take_indices, max_index = feature_take_indices(len(self.stages) + 1, indices)
        collected = []

        feat_idx = 0  # stem is index 0
        x = self.stem(x)
        if feat_idx in take_indices:
            collected.append(x)

        # can't slice blocks in torchscript
        run_every_stage = torch.jit.is_scripting() or not stop_early
        stages = self.stages if run_every_stage else self.stages[:max_index]
        for stage in stages:
            feat_idx += 1
            x = stage(x)
            if feat_idx in take_indices:
                # norm_pre is deliberately not applied to intermediates, even when norm=True
                collected.append(x)

        if intermediates_only:
            return collected

        return self.norm_pre(x), collected

    def prune_intermediate_layers(
            self,
            indices: Union[int, List[int], Tuple[int]] = 1,
            prune_norm: bool = False,
            prune_head: bool = True,
    ):
        """Drop the stages (and optionally norm / head) not needed for the requested intermediates."""
        take_indices, max_index = feature_take_indices(len(self.stages) + 1, indices)
        self.stages = self.stages[:max_index]  # truncate blocks w/ stem as idx 0
        if prune_norm:
            self.norm_pre = nn.Identity()
        if prune_head:
            self.reset_classifier(0, '')
        return take_indices

    def forward_features(self, x):
        """Stem -> stages -> ``norm_pre``."""
        return self.norm_pre(self.stages(self.stem(x)))

    def forward_head(self, x, pre_logits: bool = False):
        """Apply the head; with ``pre_logits`` the head is called with ``pre_logits=True``."""
        if pre_logits:
            return self.head(x, pre_logits=True)
        return self.head(x)

    def forward(self, x):
        """Full forward pass: features followed by the head."""
        features = self.forward_features(x)
        return self.forward_head(features)
453
+
454
+
455
+ def _init_weights(module, name=None, head_init_scale=1.0):
456
+ if isinstance(module, nn.Conv2d):
457
+ trunc_normal_(module.weight, std=.02)
458
+ if module.bias is not None:
459
+ nn.init.zeros_(module.bias)
460
+ elif isinstance(module, nn.Linear):
461
+ trunc_normal_(module.weight, std=.02)
462
+ nn.init.zeros_(module.bias)
463
+ if name and 'head.' in name:
464
+ module.weight.data.mul_(head_init_scale)
465
+ module.bias.data.mul_(head_init_scale)
466
+
467
+
468
def checkpoint_filter_fn(state_dict, model):
    """Remap FB / open_clip ConvNeXt checkpoint keys to the key layout of this file's ConvNeXt.

    Three input layouts are handled:
      * a dict that already contains ``head.norm.weight`` or ``norm_pre.weight`` is returned
        unchanged;
      * an open_clip checkpoint (keys under ``visual.trunk.``) has the prefix stripped and its
        projection / MLP head mapped onto ``head.*``;
      * otherwise the keys are treated as FB-style (``downsample_layers.*``, ``stages.N.M.*``,
        ``dwconv``, ``pwconv``, ``grn.*``, ``head.*``, ``norm.*``) and renamed.

    Layer-scale parameters are registered as ``ramma`` by ``ConvNeXtBlock`` in this file, so
    keys ending in ``.gamma`` are renamed to ``.ramma`` and both ``grn.gamma`` and
    ``grn.ramma`` are mapped to ``mlp.grn.weight``.

    Args:
        state_dict: Checkpoint mapping (optionally nested under a ``'model'`` entry).
        model: Target model; its ``state_dict()`` supplies shapes for 2-D non-head tensors.

    Returns:
        The remapped mapping of parameter name to tensor.
    """
    import re

    def _layer_scale_key(key):
        # Rename a trailing layer-scale `gamma` to the `ramma` attribute name used here.
        if key.endswith('.gamma'):
            return key[:-len('gamma')] + 'ramma'
        return key

    if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict:
        # NOTE(review): passed through untouched; a checkpoint in this layout that still uses
        # `gamma` layer-scale keys would not match the `ramma` parameters — confirm whether
        # such checkpoints need to be supported.
        return state_dict  # non-FB checkpoint
    if 'model' in state_dict:
        state_dict = state_dict['model']

    out_dict = {}
    if 'visual.trunk.stem.0.weight' in state_dict:
        trunk_prefix = 'visual.trunk.'
        out_dict = {
            _layer_scale_key(k.replace(trunk_prefix, '')): v
            for k, v in state_dict.items() if k.startswith(trunk_prefix)
        }
        if 'visual.head.proj.weight' in state_dict:
            out_dict['head.fc.weight'] = state_dict['visual.head.proj.weight']
            # the source head has no bias here; a zero bias is synthesised for head.fc
            out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.proj.weight'].shape[0])
        elif 'visual.head.mlp.fc1.weight' in state_dict:
            out_dict['head.pre_logits.fc.weight'] = state_dict['visual.head.mlp.fc1.weight']
            out_dict['head.pre_logits.fc.bias'] = state_dict['visual.head.mlp.fc1.bias']
            out_dict['head.fc.weight'] = state_dict['visual.head.mlp.fc2.weight']
            out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.mlp.fc2.weight'].shape[0])
        return out_dict

    for k, v in state_dict.items():
        k = k.replace('downsample_layers.0.', 'stem.')
        k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
        k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
        k = k.replace('dwconv', 'conv_dw')
        k = k.replace('pwconv', 'mlp.fc')
        if 'grn' in k:
            k = k.replace('grn.beta', 'mlp.grn.bias')
            # accept the source spelling as well as the renamed one
            k = k.replace('grn.gamma', 'mlp.grn.weight')
            k = k.replace('grn.ramma', 'mlp.grn.weight')
            v = v.reshape(v.shape[-1])
        else:
            k = _layer_scale_key(k)
        k = k.replace('head.', 'head.fc.')
        if k.startswith('norm.'):
            k = k.replace('norm', 'head.norm')
        if v.ndim == 2 and 'head' not in k:
            # 2-D non-head tensors are reshaped to whatever shape the target model holds
            model_shape = model.state_dict()[k].shape
            v = v.reshape(model_shape)
        out_dict[k] = v

    return out_dict
508
+
509
+
510
def _create_convnext(variant, pretrained=False, **kwargs):
    """Build a ``ConvNeXt`` for ``variant`` through ``build_model_with_cfg``.

    Checkpoints are routed through ``checkpoint_filter_fn`` and the feature config requests
    output indices (0, 1, 2, 3) with ``flatten_sequential=True``.
    """
    wants_fcmae = kwargs.get('pretrained_cfg', '') == 'fcmae'
    if wants_fcmae:
        # NOTE fcmae pretrained weights have no classifier or final norm-layer (`head.norm`),
        # so strict loading is relaxed unless the caller already chose a value.
        kwargs.setdefault('pretrained_strict', False)

    feature_cfg = dict(out_indices=(0, 1, 2, 3), flatten_sequential=True)
    return build_model_with_cfg(
        ConvNeXt, variant, pretrained,
        pretrained_filter_fn=checkpoint_filter_fn,
        feature_cfg=feature_cfg,
        **kwargs)
522
+
523
def convnext_large(pretrained=False, **kwargs) -> ConvNeXt:
    """Create the 'convnext_large' variant: depths [3, 3, 27, 3], dims [192, 384, 768, 1536].

    Keyword arguments override the variant defaults.
    """
    variant_defaults = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536])
    merged_args = {**variant_defaults, **kwargs}
    return _create_convnext('convnext_large', pretrained=pretrained, **merged_args)
527
+
528
+
529
+
530
class CLIP(nn.Module):
    """Vision-only CLIP container whose ``visual`` tower is always ``convnext_large()``.

    The constructor accepts ``embed_dim``, ``vision_cfg``, ``quick_gelu``, ``cast_dtype`` and
    arbitrary extra keyword arguments so it can be called with an open_clip model config, but
    only ``output_dict`` is stored; the remaining arguments are not used here.
    """
    output_dict: torch.jit.Final[bool]

    def __init__(
            self,
            embed_dim: int,
            vision_cfg: CLIPVisionCfg,
            quick_gelu: bool = False,
            cast_dtype: Optional[torch.dtype] = None,
            output_dict: bool = False,
            **kwargs,
    ):
        super().__init__()
        self.output_dict = output_dict

        # The tower is built with default arguments (pretrained=False), i.e. without loading weights.
        visual_tower = convnext_large()
        self.visual = visual_tower
546
+
547
class ConvNextVisionEncoder(nn.Module):
    """ConvNeXt-large vision tower that exposes the feature map of every stage.

    The stem and the stages are taken from a ``CLIP`` wrapper built from the open_clip model
    config named by ``model_type``. Feature extraction always runs under ``torch.no_grad()``.
    """

    def __init__(
            self,
    ):
        super().__init__()
        self.model_type = "convnext_large_d_320"
        self.model_channel = [192, 384, 768, 1536]  # stage 0-3

        clip_model = CLIP(**get_model_config(self.model_type), use_text=False)

        # decompose stem and stages blocks in vision tower
        self.vision_stem = clip_model.visual.stem
        self.vision_stages = clip_model.visual.stages

    def forward(self, images):
        """Extract per-stage features for a batch tensor or a list of single images.

        Args:
            images: Either a tensor accepted by the stem, or a list of tensors; each list
                element is moved to the module's device/dtype and given a leading batch
                dimension before being encoded on its own.

        Returns:
            dict with
              * ``"image_features"``: for tensor input, the list of stage outputs; for list
                input, one such list per image.
              * ``"last_feat"``: the last element of ``"image_features"``.
        """
        if isinstance(images, list):
            image_features = [
                self.backbone(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
                for image in images
            ]
        else:
            image_features = self.backbone(
                images.to(device=self.device, dtype=self.dtype),
            )

        return {
            "image_features": image_features,
            "last_feat": image_features[-1],
        }

    def backbone(self, images: torch.Tensor) -> List[torch.Tensor]:
        """Process the input images through the backbone network without tracking gradients.

        Inputs:
            images (torch.Tensor): The input images.

        Returns:
            List[torch.Tensor]: One contiguous feature map per stage, in stage order.
        """
        with torch.no_grad():
            results = self.basic_forward(images)
            feature_maps = [stage_feat.contiguous() for stage_feat in results.values()]
        return feature_maps

    def basic_forward(self, images):
        """Run stem and stages, returning ``{"stage_<idx>": output}`` for every stage."""
        results = {}
        x = self.vision_stem(images)
        for _idx in range(len(self.vision_stages)):
            x = self.vision_stages[_idx](x)
            results[f"stage_{_idx}"] = x
        return results

    @property
    def dtype(self):
        """dtype of the first stem layer's weight."""
        return self.vision_stem[0].weight.dtype

    @property
    def device(self):
        """Device of the first stem layer's weight."""
        return self.vision_stem[0].weight.device

    @property
    def config(self):
        # NOTE(review): `vision_config` is not assigned anywhere in this class; unless it is
        # set from outside, reading this property raises AttributeError — confirm intent.
        return self.vision_config

    @property
    def hidden_size(self):
        """Sum of the per-stage channel counts in ``model_channel``."""
        return sum(self.model_channel)
621
+
622
if __name__ == '__main__':
    # Smoke check: build the encoder and list its parameter / buffer names.
    encoder = ConvNextVisionEncoder()
    state_keys = encoder.state_dict().keys()
    print(state_keys)
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_chatrex.py ADDED
@@ -0,0 +1,880 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import math
4
+ import os
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from open_clip.factory import get_model_config, load_state_dict
12
+ from open_clip.model import (CLIPTextCfg, CLIPVisionCfg, _build_text_tower,
13
+ _build_vision_tower,
14
+ convert_to_custom_text_state_dict)
15
+ from open_clip.transformer import text_global_pool
16
+ from torch import nn
17
+ from torchvision.ops import roi_align
18
+ from transformers import (CONFIG_MAPPING, AutoConfig, AutoModel,
19
+ AutoModelForCausalLM, GenerationConfig,
20
+ PretrainedConfig, PreTrainedModel, StoppingCriteria,
21
+ StoppingCriteriaList)
22
+ from transformers.activations import ACT2FN
23
+ from transformers.configuration_utils import PretrainedConfig
24
+ from transformers.generation import GenerationConfig
25
+ from transformers.modeling_utils import load_state_dict
26
+ from transformers.utils import logging, strtobool
27
+
28
+ from .convnext import ConvNextVisionEncoder
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+ XLA_USE_BF16 = os.environ.get("XLA_USE_BF16", "0").upper()
33
+ XLA_DOWNCAST_BF16 = os.environ.get("XLA_DOWNCAST_BF16", "0").upper()
34
+
35
+ IGNORE_INDEX = -100
36
+ DEFAULT_PAD_TOKEN_INDEX = 0
37
+ IMAGE_TOKEN_INDEX = -200
38
+ DEFAULT_IMAGE_TOKEN = "<image>"
39
+
40
+ # For Objects
41
+ DEFAULT_OBJECT_TOKEN = "<obj<i>>"
42
+ DEFAULT_OBJECT_FEATURE_TOKEN = "<objfeat>"
43
+ DEFAULT_OBJECT_INDEX = -300
44
+
45
+ # For Grounding
46
+ DEFAULT_GROUNDING_START = "<ground>"
47
+ DEFAULT_GROUNDING_END = "</ground>"
48
+ DEFAULT_GROUNDING_OBJECTS_START = "<objects>"
49
+ DEFAULT_GROUNDING_OBJECTS_END = "</objects>"
50
+
51
def is_fsdp_enabled():
    """Return True only when torch.distributed is available and initialised and both the
    ``ACCELERATE_USE_FSDP`` and ``FSDP_CPU_RAM_EFFICIENT_LOADING`` environment flags parse as true.

    The checks short-circuit in that order, so the environment variables are only parsed once
    the distributed backend is up.
    """
    if not torch.distributed.is_available():
        return False
    if not torch.distributed.is_initialized():
        return False
    if strtobool(os.environ.get("ACCELERATE_USE_FSDP", "False")) != 1:
        return False
    return strtobool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1
58
+
59
+
60
+
61
+
62
def get_token_slices(input_ids: torch.Tensor):
    """
    Split a token-id sequence into consecutive text / image / object spans.

    Every position equal to IMAGE_TOKEN_INDEX or DEFAULT_OBJECT_INDEX becomes its own
    one-token span; each run of other ids between (or around) those markers becomes a
    'text' span.

    Args:
        input_ids (torch.Tensor): Token ids. Positions are taken from the first index tensor
            returned by ``torch.where`` and the total length from ``len(input_ids)``
            (assumes a 1-D tensor — TODO confirm against callers).

    Returns:
        Tuple[List[Dict[str, Any]], bool]: ``(slices, has_object)`` where each slice is
            ``{"type": 'text' | 'image' | 'object', "span": [start, end]}`` with an exclusive
            end index, and ``has_object`` tells whether any object marker was found.
    """
    # marker id -> slice type
    marker_kinds = {IMAGE_TOKEN_INDEX: "image", DEFAULT_OBJECT_INDEX: "object"}

    image_positions = torch.where(input_ids == IMAGE_TOKEN_INDEX)[0]
    object_positions = torch.where(input_ids == DEFAULT_OBJECT_INDEX)[0]
    has_object = len(object_positions) > 0

    # all marker positions in ascending order, with the marker id found at each
    marker_positions, _ = torch.sort(torch.cat((image_positions, object_positions)))
    marker_ids = input_ids[marker_positions]

    slices = []
    cursor = 0
    for pos, marker in zip(marker_positions.tolist(), marker_ids.tolist()):
        if cursor < pos:
            # plain tokens between the previous marker and this one
            slices.append({"type": "text", "span": [cursor, pos]})
        slices.append({"type": marker_kinds[marker], "span": [pos, pos + 1]})
        cursor = pos + 1

    total = len(input_ids)
    if cursor < total:
        # trailing text after the last marker
        slices.append({"type": "text", "span": [cursor, total]})

    return slices, has_object
104
+
105
+
106
def prepare_inputs_labels_for_multimodal(
    llm,
    input_ids: torch.LongTensor = None,
    position_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    labels: Optional[torch.LongTensor] = None,
    pixel_values: Optional[torch.FloatTensor] = None,
    bbox_feats=None,
    extra_llm_input_embed: nn.Embedding = None,
    **kwargs,
):
    """Replace image/object placeholders in ``input_ids`` with feature embeddings.

    Text tokens are embedded with ``llm.get_input_embeddings()``; every
    IMAGE_TOKEN_INDEX position is replaced by ``pixel_values[k]`` and every
    DEFAULT_OBJECT_INDEX position by one row of ``bbox_feats[sample]``. The
    per-sample sequences are then right-padded to a common length.

    Args:
        llm: Object exposing ``get_input_embeddings()``.
        input_ids: Batched token ids containing the placeholder values.
        position_ids, attention_mask, labels: Optional companions of
            ``input_ids``; each is returned as ``None`` if it was passed as ``None``.
        past_key_values: Passed through unchanged.
        pixel_values: Indexable collection of per-image embeddings, consumed in
            order. When ``None`` the inputs are returned untouched.
            (assumes each entry is (num_tokens, hidden) -- TODO confirm with caller)
        bbox_feats: Indexable collection with one entry per sample, each entry
            indexable by object order within that sample.
        extra_llm_input_embed: Unused; kept for interface compatibility.
        **kwargs: Ignored.

    Returns:
        dict: keys ``input_ids`` (``None`` when embeddings were built),
        ``position_ids``, ``attention_mask``, ``past_key_values``,
        ``inputs_embeds`` and ``labels``.

    Raises:
        ValueError: If an object placeholder has no matching entry in ``bbox_feats``.
    """
    if pixel_values is None:
        return {
            "input_ids": input_ids,
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "inputs_embeds": None,
            "labels": labels,
        }

    # Remember which optional inputs were absent so they can be returned as None.
    _labels = labels
    _position_ids = position_ids
    _attention_mask = attention_mask
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
    else:
        attention_mask = attention_mask.bool()
    if position_ids is None:
        position_ids = torch.arange(
            0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
        )
    if labels is None:
        labels = torch.full_like(input_ids, IGNORE_INDEX)

    # remove the padding using attention_mask -- TODO: double check
    input_ids = [
        cur_input_ids[cur_attention_mask]
        for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
    ]
    labels = [
        cur_labels[cur_attention_mask]
        for cur_labels, cur_attention_mask in zip(labels, attention_mask)
    ]

    embed_tokens = llm.get_input_embeddings()

    new_inputs_embeds = []
    new_labels = []
    cur_image_idx = 0
    cur_object_idx = 0
    for batch_idx, cur_input_ids in enumerate(input_ids):
        num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
        if num_images == 0:
            # Text-only sample: an entry of pixel_values is still consumed and
            # concatenated as an empty slice.
            cur_pixel_values = pixel_values[cur_image_idx]
            cur_inputs_embeds_1 = embed_tokens(cur_input_ids)
            cur_inputs_embeds = torch.cat(
                [cur_inputs_embeds_1, cur_pixel_values[0:0]], dim=0
            )
            new_inputs_embeds.append(cur_inputs_embeds)
            new_labels.append(labels[batch_idx])
            cur_image_idx += 1
            cur_object_idx += 1
            continue

        cur_labels = labels[batch_idx]
        token_slices, _ = get_token_slices(cur_input_ids)
        result_input_embeddings = []
        result_output_labels = []
        cur_gt_bnox_indice = 0  # index of the next object feature within this sample
        for token_slice in token_slices:
            slice_type = token_slice["type"]
            slice_start, slice_end = token_slice["span"]
            if slice_type == "text":
                cur_input_ids_noim = cur_input_ids[slice_start:slice_end]
                cur_labels_noim = cur_labels[slice_start:slice_end]
                result_input_embeddings.append(embed_tokens(cur_input_ids_noim))
                result_output_labels.append(cur_labels_noim)
            elif slice_type == "image":
                cur_input_embeds = pixel_values[cur_image_idx]
                result_input_embeddings.append(cur_input_embeds)
                # Image positions never contribute to the loss.
                result_output_labels.append(
                    torch.full(
                        (cur_input_embeds.shape[0],),
                        IGNORE_INDEX,
                        device=cur_labels.device,
                        dtype=cur_labels.dtype,
                    )
                )
                cur_image_idx += 1
            elif slice_type == "object":
                try:
                    object_embed = bbox_feats[cur_object_idx][
                        cur_gt_bnox_indice
                    ].unsqueeze(0)
                except Exception as err:
                    # Build the message defensively: the per-sample entry may be
                    # None or missing, which is exactly what we are reporting.
                    sample_feats = None
                    if bbox_feats is not None and cur_object_idx < len(bbox_feats):
                        sample_feats = bbox_feats[cur_object_idx]
                    raise ValueError(
                        f"current boxe_feats.shape: {getattr(sample_feats, 'shape', None)}, "
                        f"requested object {cur_gt_bnox_indice} of sample {cur_object_idx}"
                    ) from err
                result_input_embeddings.append(object_embed)
                cur_gt_bnox_indice += 1
                result_output_labels.append(
                    torch.full(
                        (1,),
                        IGNORE_INDEX,
                        device=cur_labels.device,
                        dtype=cur_labels.dtype,
                    )
                )
        # bbox_feats holds one entry per sample, so advance once per sample.
        cur_object_idx += 1
        result_input_embeddings = torch.cat(result_input_embeddings)
        result_output_labels = torch.cat(result_output_labels)
        assert len(result_output_labels) == len(result_input_embeddings)
        new_inputs_embeds.append(result_input_embeddings)
        new_labels.append(result_output_labels)

    # Combine them
    max_len = max(x.shape[0] for x in new_inputs_embeds)
    batch_size = len(new_inputs_embeds)

    new_inputs_embeds_padded = []
    new_labels_padded = torch.full(
        (batch_size, max_len),
        IGNORE_INDEX,
        dtype=new_labels[0].dtype,
        device=new_labels[0].device,
    )
    attention_mask = torch.zeros(
        (batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device
    )
    position_ids = torch.zeros(
        (batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device
    )

    for i, (cur_new_embed, cur_new_labels) in enumerate(
        zip(new_inputs_embeds, new_labels)
    ):
        cur_len = cur_new_embed.shape[0]
        # Right-pad the embeddings with zeros up to max_len.
        new_inputs_embeds_padded.append(
            torch.cat(
                (
                    cur_new_embed,
                    torch.zeros(
                        (max_len - cur_len, cur_new_embed.shape[1]),
                        dtype=cur_new_embed.dtype,
                        device=cur_new_embed.device,
                    ),
                ),
                dim=0,
            )
        )
        if cur_len > 0:
            new_labels_padded[i, :cur_len] = cur_new_labels
            attention_mask[i, :cur_len] = True
            position_ids[i, :cur_len] = torch.arange(
                0, cur_len, dtype=position_ids.dtype, device=position_ids.device
            )

    new_inputs_embeds = torch.stack(new_inputs_embeds_padded, dim=0)

    if _labels is None:
        new_labels = None
    else:
        new_labels = new_labels_padded

    if _attention_mask is None:
        attention_mask = None
    else:
        attention_mask = attention_mask.to(dtype=_attention_mask.dtype)

    if _position_ids is None:
        position_ids = None

    return {
        "input_ids": None,
        "position_ids": position_ids,
        "attention_mask": attention_mask,
        "past_key_values": past_key_values,
        "inputs_embeds": new_inputs_embeds,
        "labels": new_labels,
    }
286
+
287
class StopWordStoppingCriteria(StoppingCriteria):
    """Stopping criterion that fires when the decoded text ends with a stop word."""

    def __init__(self, tokenizer, stop_word):
        self.stop_word = stop_word
        self.length = len(stop_word)
        self.tokenizer = tokenizer

    def __call__(self, input_ids, *args, **kwargs) -> bool:
        # Only the first sequence of the batch is inspected.
        decoded = self.tokenizer.decode(input_ids[0])
        # Line breaks are dropped before comparing the tail with the stop word.
        for line_break in ('\r', '\n'):
            decoded = decoded.replace(line_break, '')
        tail = decoded[-self.length:]
        return tail == self.stop_word
299
+
300
def get_stop_criteria(
    tokenizer,
    stop_words=None,
):
    """Build a ``StoppingCriteriaList`` with one stop-word criterion per word.

    Args:
        tokenizer: Tokenizer handed to every ``StopWordStoppingCriteria``.
        stop_words: Iterable of stop strings. ``None`` (the default) or an empty
            iterable yields an empty criteria list.

    Returns:
        StoppingCriteriaList: The assembled criteria, in the order given.
    """
    stop_criteria = StoppingCriteriaList()
    # `None` replaces the former mutable `[]` default; both mean "no stop words".
    for word in stop_words or ():
        stop_criteria.append(StopWordStoppingCriteria(tokenizer, word))
    return stop_criteria
308
+
309
class DualPathFuseModule(nn.Module):
    """Fuse low-resolution token features with a high-resolution feature map.

    The high-resolution map is projected to the low-resolution channel width,
    the low-resolution tokens are refined by a depthwise 7x7 convolution, and
    the two are summed with a learned scalar gate (tanh) per sample.
    """

    # change channel+gate+sum
    def __init__(self, low_res_dim, high_res_dim, zero_init=True):
        super().__init__()

        # High-resolution ("slow") path: 1x1 conv, then 1x1 projection to low_res_dim.
        self.slow_conv = nn.Conv2d(high_res_dim, high_res_dim, 1)
        self.slow_proj = nn.Conv2d(high_res_dim, low_res_dim, 1)

        # Low-resolution ("fast") path: depthwise 7x7 conv, then 1x1 projection.
        self.fast_conv = nn.Conv2d(
            low_res_dim, low_res_dim, 7, padding=3, groups=low_res_dim
        )
        self.fast_proj = nn.Conv2d(low_res_dim, low_res_dim, 1)

        # Scalar gate computed from the token-averaged concatenation of both paths.
        self.gate = nn.Sequential(
            nn.Linear(low_res_dim * 2, low_res_dim // 2),
            nn.GELU(),
            nn.Linear(low_res_dim // 2, 1),
        )

        nn.init.xavier_uniform_(self.slow_conv.weight)
        nn.init.xavier_uniform_(self.fast_conv.weight)
        nn.init.zeros_(self.slow_conv.bias)
        nn.init.zeros_(self.fast_conv.bias)
        # With zero_init both projections start at zero, so the module initially
        # returns its low-resolution input unchanged.
        init_projection = nn.init.zeros_ if zero_init else nn.init.xavier_uniform_
        init_projection(self.slow_proj.weight)
        init_projection(self.fast_proj.weight)
        nn.init.zeros_(self.slow_proj.bias)
        nn.init.zeros_(self.fast_proj.bias)

    def forward(self, low_res_feat, high_res_feat, sampler=None):
        """Return the fused tokens, shaped like ``low_res_feat``.

        Args:
            low_res_feat: 3-D tensor (batch, tokens, dim); ``tokens`` is treated
                as a square grid.
            high_res_feat: 4-D tensor (batch, channels, height, width).
            sampler: Unused.
        """
        batch, _, _, _ = high_res_feat.shape  # (2, 1536, 24, 24)
        _, _, dim = low_res_feat.shape  # (2, 576, 1024)

        slow_map = self.slow_proj(F.gelu(self.slow_conv(high_res_feat)))  # (2, 1024, 24, 24)
        slow_tokens = slow_map.view(batch, dim, -1).transpose(1, 2)  # (2, 576, 1024)

        side = int(math.sqrt(low_res_feat.shape[1]))  # 24
        fast_map = low_res_feat.transpose(1, 2).view(batch, dim, side, side)  # (2, 1024, 24, 24)
        fast_map = fast_map + self.fast_proj(F.gelu(self.fast_conv(fast_map)))
        fast_tokens = fast_map.view(batch, dim, side * side).transpose(1, 2)  # (2, 576, 1024)

        pooled = torch.cat([fast_tokens, slow_tokens], -1).mean(1)
        gate = self.gate(pooled).unsqueeze(1)  # (2, 1, 1)
        return fast_tokens + slow_tokens * gate.tanh()
365
+
366
class ProjectorConfig(PretrainedConfig):
    """Configuration of the MLP projector built by ``ProjectorModel``.

    Args:
        visual_hidden_size: Input feature width of the first linear layer.
        llm_hidden_size: Output width of every linear layer.
        depth: Number of linear layers.
        hidden_act: Key into ``ACT2FN`` for the activation between layers.
        bias: Whether the linear layers use a bias term.
    """

    model_type = "projector"
    _auto_class = "AutoConfig"

    def __init__(
        self,
        visual_hidden_size=4096,
        llm_hidden_size=4096,
        depth=2,
        hidden_act="gelu",
        bias=True,
        **kwargs,
    ):
        # Own fields are set before delegating the remaining kwargs to the base class.
        own_fields = {
            "visual_hidden_size": visual_hidden_size,
            "llm_hidden_size": llm_hidden_size,
            "depth": depth,
            "hidden_act": hidden_act,
            "bias": bias,
        }
        for field_name, field_value in own_fields.items():
            setattr(self, field_name, field_value)
        super().__init__(**kwargs)
385
+
386
class ProjectorModel(PreTrainedModel):
    """MLP of ``config.depth`` linear layers projecting features to the LLM width."""

    _auto_class = "AutoModel"
    config_class = ProjectorConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = []

    def __init__(self, config: ProjectorConfig) -> None:
        super().__init__(config)
        self.gradient_checkpointing = False

        # First layer maps visual_hidden_size -> llm_hidden_size; each further
        # layer is preceded by the configured activation.
        layers = [
            nn.Linear(
                config.visual_hidden_size, config.llm_hidden_size, bias=config.bias
            )
        ]
        for _ in range(config.depth - 1):
            layers.extend(
                (
                    ACT2FN[config.hidden_act],
                    nn.Linear(
                        config.llm_hidden_size, config.llm_hidden_size, bias=config.bias
                    ),
                )
            )
        self.model = nn.Sequential(*layers)

    def enable_input_require_grads(self):
        """Register a forward hook that marks the projector output as requiring grad."""

        def _mark_output_requires_grad(module, input, output):
            output.requires_grad_(True)

        self.model.register_forward_hook(_mark_output_requires_grad)

    def _set_gradient_checkpointing(self, module, value=False):
        if not isinstance(module, ProjectorModel):
            return
        module.gradient_checkpointing = value

    def forward(self, x):
        return self.model(x)
425
+
426
+
427
def gen_sineembed_for_position(pos_tensor, dim_of_pos_feats):
    """Generate a sine/cosine position embedding from a position tensor.

    Args:
        pos_tensor (torch.Tensor): shape [batch_size, N, 2] holding (x, y) or
            [batch_size, N, 4] holding (cx, cy, w, h).
        dim_of_pos_feats (int): number of embedding channels produced *per
            coordinate*. Must be even, because sine and cosine channels are
            interleaved pairwise.

    Returns:
        torch.Tensor: float32 tensor of shape
        [batch_size, N, pos_tensor.size(-1) * dim_of_pos_feats]. Coordinate
        blocks are concatenated in the order (y, x) or (y, x, w, h).

    Raises:
        ValueError: If the last dimension of ``pos_tensor`` is neither 2 nor 4.
    """
    num_coords = pos_tensor.size(-1)
    # Validate before indexing so unsupported shapes always raise ValueError.
    if num_coords not in (2, 4):
        raise ValueError("Unknown pos_tensor shape(-1):{}".format(num_coords))

    scale = 2 * math.pi
    dim_t = torch.arange(
        dim_of_pos_feats, dtype=torch.float32, device=pos_tensor.device
    )
    # Channels 2k and 2k+1 share one frequency.
    dim_t = 10000 ** (2 * (dim_t // 2) / dim_of_pos_feats)

    def _embed_coordinate(coord_index):
        # sin on even channels, cos on odd channels, interleaved back together.
        angles = (pos_tensor[:, :, coord_index] * scale)[:, :, None] / dim_t
        return torch.stack(
            (angles[:, :, 0::2].sin(), angles[:, :, 1::2].cos()), dim=3
        ).flatten(2)

    # y is emitted before x; w and h follow for 4-coordinate input.
    coord_order = (1, 0) if num_coords == 2 else (1, 0, 2, 3)
    return torch.cat([_embed_coordinate(idx) for idx in coord_order], dim=2)
472
+
473
+
474
class MultiLevelROIVisualPrompt(nn.Module):
    """Pool one feature vector per box from multi-level feature maps.

    Args:
        output_size (Optional[int]): RoIAlign output size. Default is None.
        channel_per_level (List[int]): Channels per feature level.
            Default is [192, 384, 768, 1536].
        spatail_scale (Optional[float]): Scale mapping box coordinates to the
            feature map, forwarded to ``roi_align`` as ``spatial_scale``. The
            misspelled name is kept for backward compatibility. Default is None.
        visual_prompt_hidden_size (int): Accepted for compatibility; not used by
            this module. Default is 1024.
        add_pos_embedding (bool): Whether to add a sine position embedding of
            the boxes to the pooled features. Default is False.
        pos_embedding_dim (int): Total width of the position embedding (split
            evenly over the four box coordinates). Default is 1024.
    """

    def __init__(
        self,
        output_size: int = None,
        channel_per_level: List[int] = [192, 384, 768, 1536],
        spatail_scale: float = None,
        visual_prompt_hidden_size: int = 1024,
        add_pos_embedding: bool = False,
        pos_embedding_dim: int = 1024,
    ):
        super(MultiLevelROIVisualPrompt, self).__init__()
        self.output_size = output_size
        self.channel_per_level = channel_per_level
        self.spatail_scale = spatail_scale
        self.add_pos_embedding = add_pos_embedding
        self.pos_embedding_dim = pos_embedding_dim

    @staticmethod
    def _xyxy_to_normalized_cxcywh(boxes, img_width, img_height):
        """Return new (cx, cy, w, h) boxes normalized by image size; input is not modified."""
        x1 = boxes[:, 0] / img_width
        y1 = boxes[:, 1] / img_height
        x2 = boxes[:, 2] / img_width
        y2 = boxes[:, 3] / img_height
        width = x2 - x1
        height = y2 - y1
        return torch.stack((x1 + width / 2, y1 + height / 2, width, height), dim=1)

    def forward(
        self,
        multi_level_features: List[torch.Tensor],
        boxes: Union[torch.Tensor, List[torch.Tensor]],
    ) -> torch.Tensor:
        """Run RoIAlign on the concatenated multi-level features and average-pool.

        All levels are bilinearly resized to the largest spatial size, concatenated
        along channels, RoI-aligned and averaged over the RoI grid.

        Args:
            multi_level_features (List[Tensor[N, C, H, W]]): Feature maps from
                different levels.
            boxes (Tensor[K, 5] or List[Tensor[L, 4]]): Box coordinates in
                (x1, y1, x2, y2) format. The position embedding path reads
                ``boxes[0]`` as an (L, 4) tensor, i.e. it expects the list form.
                Neither the list nor its tensors are modified.

        Returns:
            Tensor[1, K, C]: One pooled feature per RoI.
        """
        # roi_align needs floating point boxes. Work on our own container so the
        # caller's list is left untouched.
        if isinstance(boxes, torch.Tensor):
            roi_boxes = boxes.float()
        else:
            roi_boxes = list(boxes)
            roi_boxes[0] = roi_boxes[0].float()

        max_height = max([feature.shape[2] for feature in multi_level_features])
        max_width = max([feature.shape[3] for feature in multi_level_features])

        # interpolate to the same size (level 0 is used as-is)
        concat_multi_level_feature = []
        for level, feature in enumerate(multi_level_features):
            if level != 0:
                concat_multi_level_feature.append(
                    F.interpolate(
                        feature.float(),
                        size=(max_height, max_width),
                        mode="bilinear",
                        align_corners=False,
                    )
                )
            else:
                concat_multi_level_feature.append(feature.float())
        concat_multi_level_feature = torch.cat(concat_multi_level_feature, dim=1)

        out_box_feat = roi_align(
            concat_multi_level_feature,
            roi_boxes,
            output_size=self.output_size,
            spatial_scale=self.spatail_scale,
        )

        # Average Pooling -> n,c -> 1,n,c
        out_box_feat = out_box_feat.mean(dim=(2, 3)).reshape(
            1, out_box_feat.shape[0], out_box_feat.shape[1]
        )
        if self.add_pos_embedding:
            # Boxes are unnormalized xyxy; normalize by the image size implied by
            # the feature map size and the spatial scale, then convert to
            # (cx, cy, w, h). The helper builds a new tensor, so the caller's
            # boxes are never written to.
            original_img_width = max_width / self.spatail_scale
            original_img_height = max_height / self.spatail_scale
            pos_boxes = self._xyxy_to_normalized_cxcywh(
                roi_boxes[0].to(out_box_feat.dtype),  # (N, 4)
                original_img_width,
                original_img_height,
            )
            pos_embed = gen_sineembed_for_position(
                pos_boxes.unsqueeze(0), self.pos_embedding_dim // 4
            )
            out_box_feat = out_box_feat + pos_embed

        return out_box_feat
569
+
570
+
571
+
572
class ChatRexAuxConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of ChatRexAux model.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Union[AutoConfig, dict]`, *optional*):
            The config object or dictionary of the vision backbone. A dictionary without a `"model_type"` key is
            treated as `"clip_vision_model"`; `None` builds a `"clip_vision_model"` config with the defaults set
            in `__init__`.
        vision_aux_config (*optional*):
            Stored on the config as given.
        visual_prompt_encoder_config (*optional*):
            Stored on the config as given.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object or dictionary of the text backbone. A dictionary without a `"model_type"` key is
            treated as `"llama"`; `None` builds a default `"llama"` config.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        image_token_index (`int`, *optional*, defaults to 32000):
            The image token index to encode the image prompt.
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function used by the multimodal projector.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Can be one of `"default"` or `"full"`.
        vision_feature_layer (`int`, *optional*, defaults to -2):
            The index of the layer to select the vision feature.
        projector_depth (`int`, *optional*, defaults to 2):
            Number of linear layers in the projectors.
        visual_prompt_hidden_size (`int`, *optional*, defaults to 2880):
            Input feature width of the visual prompt projector.

    Example:

    ```python
    >>> # Initializing a configuration with the default vision and text backbones
    >>> configuration = ChatRexAuxConfig()

    >>> # Accessing the sub-configurations
    >>> vision_config = configuration.vision_config
    >>> text_config = configuration.text_config
    ```"""

    model_type = "chatrex"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        vision_aux_config=None,
        visual_prompt_encoder_config=None,
        text_config=None,
        ignore_index=-100,
        image_token_index=32000,
        projector_hidden_act="gelu",
        vision_feature_select_strategy="default",
        vision_feature_layer=-2,
        projector_depth=2,
        visual_prompt_hidden_size=2880,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.projector_hidden_act = projector_hidden_act
        self.projector_depth = projector_depth
        self.visual_prompt_hidden_size = visual_prompt_hidden_size
        self.visual_prompt_encoder_config = visual_prompt_encoder_config

        if vision_feature_select_strategy not in ["default", "full"]:
            # Trailing space keeps the two concatenated sentences apart.
            raise ValueError(
                "vision_feature_select_strategy should be one of 'default', 'full'. "
                f"Got: {vision_feature_select_strategy}"
            )

        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer

        if isinstance(vision_config, dict):
            # As before, the default model type is written into the passed dict.
            vision_config.setdefault("model_type", "clip_vision_model")
            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
            vision_config = CONFIG_MAPPING["clip_vision_model"](
                intermediate_size=4096,
                hidden_size=1024,
                patch_size=14,
                image_size=336,
                num_hidden_layers=24,
                num_attention_heads=16,
                vocab_size=32000,
                projection_dim=768,
            )

        self.vision_config = vision_config
        self.vision_aux_config = vision_aux_config

        if isinstance(text_config, dict):
            text_config.setdefault("model_type", "llama")
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            text_config = CONFIG_MAPPING["llama"]()

        self.text_config = text_config

        super().__init__(**kwargs)
687
+
688
+
689
class ChatRexAuxPreTrainedModel(PreTrainedModel):
    """Base class binding ``ChatRexAuxConfig`` to the transformers model machinery."""

    config_class = ChatRexAuxConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlavaVisionAttention"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_cache_class = True

    # def _init_weights(self, module):
    #     # important: this ported version of Llava isn't meant for training from scratch - only
    #     # inference and fine-tuning - so the proper init weights code has been removed - the original codebase
    #     # https://github.com/haotian-liu/LLaVA/tree/main/llava should serve for that purpose
    #     std = (
    #         self.config.initializer_range
    #         if hasattr(self.config, "initializer_range")
    #         else self.config.text_config.initializer_range
    #     )

    #     if hasattr(module, "class_embedding"):
    #         module.class_embedding.data.normal_(mean=0.0, std=std)

    #     if isinstance(module, (nn.Linear, nn.Conv2d)):
    #         module.weight.data.normal_(mean=0.0, std=std)
    #         if module.bias is not None:
    #             module.bias.data.zero_()
    #     elif isinstance(module, nn.Embedding):
    #         module.weight.data.normal_(mean=0.0, std=std)
    #         if module.padding_idx is not None:
    #             module.weight.data[module.padding_idx].zero_()

    @property
    def _supports_sdpa(self):
        """
        Retrieve the language model's attribute to check whether the model
        supports SDPA or not.

        The concrete model in this file stores its language model as ``llm``;
        ``language_model`` is kept as a fallback for subclasses using that name.
        """
        language_model = getattr(self, "llm", None)
        if language_model is None:
            language_model = self.language_model
        return language_model._supports_sdpa
727
+
728
+
729
class ChatRexAuxForConditionalGeneration(ChatRexAuxPreTrainedModel):
    """ChatRex model: dual vision encoders, box-prompt encoder and a causal LLM."""

    def __init__(self, config: ChatRexAuxConfig):
        super().__init__(config)
        # low resolution vision encoder
        self.vision_encoder = AutoModel.from_config(config.vision_config)
        # high resolution vision encoder
        self.vision_encoder_aux = ConvNextVisionEncoder()

        # vision projector
        projector_config = ProjectorConfig(
            visual_hidden_size=config.vision_config.hidden_size,
            llm_hidden_size=config.text_config.hidden_size,
            depth=config.projector_depth,
        )
        self.projector = ProjectorModel(projector_config)

        # visual prompt projector
        vp_projector_config = ProjectorConfig(
            visual_hidden_size=config.visual_prompt_hidden_size,
            llm_hidden_size=config.text_config.hidden_size,
            depth=config.projector_depth,
        )
        self.vp_projector = ProjectorModel(vp_projector_config)

        # fuser
        self.fuser = DualPathFuseModule(
            low_res_dim=config.vision_config.hidden_size,
            high_res_dim=1536,
        )

        # visual prompt encoder
        self.vp_encoder = MultiLevelROIVisualPrompt(
            output_size=7,
            channel_per_level=[192, 384, 768, 1536],
            spatail_scale=192 / 768,
            add_pos_embedding=True,
            pos_embedding_dim=2880,
        )

        # genconfig
        self.gen_config = None

        self.vocab_size = config.text_config.vocab_size
        self.llm = AutoModelForCausalLM.from_config(
            config.text_config, attn_implementation=config._attn_implementation
        )
        self.pad_token_id = (
            self.config.pad_token_id if self.config.pad_token_id is not None else -1
        )
        self.post_init()

    def _prepare_data_for_llm(self, data):
        """Encode images and boxes in ``data`` and build the LLM inputs.

        When ``data`` has no ``"pixel_values"`` key it is returned unchanged.
        Otherwise the low-resolution encoder output and the auxiliary encoder's
        last feature map are fused and projected, optional ``"gt_boxes"`` are
        turned into per-sample box features (``None`` for samples without
        boxes), and the result of ``prepare_inputs_labels_for_multimodal`` is
        returned.

        NOTE(review): block nesting was reconstructed from control flow; confirm
        that the box-feature and LLM-input steps belong inside the
        ``"pixel_values"`` branch.
        """
        if "pixel_values" in data:
            visual_outputs = self.vision_encoder(
                data["pixel_values"].to(self.vision_encoder.dtype),
                output_hidden_states=True,
            )
            encoder_name = type(self.vision_encoder).__name__
            if encoder_name in [
                "CLIPVisionModel",
                "CLIPVisionModelAnyRes",
            ]:
                # second-to-last hidden state, dropping the first token
                visual_outputs = visual_outputs.hidden_states[-2][:, 1:]
            elif encoder_name == "SiglipVisionModel":
                visual_outputs = visual_outputs.hidden_states[-2]
            else:
                raise NotImplementedError(
                    f"Unsupported vision encoder: {encoder_name}"
                )

            # aux encoder
            if self.vision_encoder_aux is not None:
                pixels_aux = []
                for pixels in data["pixel_values_aux"]:
                    if pixels.dim() == 3:
                        pixels = pixels.unsqueeze(0)
                    elif pixels.dim() == 4:
                        pixels = pixels.permute(1, 0, 2, 3)
                    pixels_aux.append(pixels)
                visual_outputs_aux = torch.cat(
                    pixels_aux, dim=0
                )  # shape (2, 3, 768, 768)
                aux_output = self.vision_encoder_aux(visual_outputs_aux)
                visual_outputs_aux = aux_output["image_features"]
                last_feat = aux_output["last_feat"]  # (B, 1536, 24, 24)
            # fuser
            fuse_features = self.fuser(
                low_res_feat=visual_outputs, high_res_feat=last_feat
            )  # (2, 576, 1024)
            pixel_values = self.projector(fuse_features)
            data["pixel_values"] = pixel_values

            # extract visual prompt features
            bbox_visual_outputs = []
            if "gt_boxes" in data:
                for batch_idx, boxes in enumerate(data["gt_boxes"]):
                    if len(boxes) == 0:
                        bbox_visual_outputs.append(None)
                        continue
                    multi_level_aux_features = [
                        visual_output_aux[batch_idx].unsqueeze(0)
                        for visual_output_aux in visual_outputs_aux
                    ]
                    # copy=True: never hand the caller's own tensor to the
                    # encoder, which may normalize boxes in place.
                    boxes = boxes.to(torch.float32, copy=True)
                    out_vp_feat = self.vp_encoder(
                        multi_level_aux_features,
                        [boxes],
                    ).squeeze(0)
                    out_vp_feat = out_vp_feat.to(pixel_values.dtype)
                    out_vp_feat = self.vp_projector(out_vp_feat)
                    bbox_visual_outputs.append(out_vp_feat)
                # b,n,c
                data["bbox_feats"] = bbox_visual_outputs

            data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data)
        return data

    def generate(self, data_dict: Dict[str, Any], gen_config=None, tokenizer=None):
        """Perform inference on the given data.

        Args:
            data_dict (Dict[str, Any]): The data to perform inference on.
            gen_config: Generation config; falls back to ``self.gen_config``
                when ``None``.
            tokenizer: Tokenizer used for ``bos_token_id`` and for decoding the
                output. Required despite the ``None`` default.

        Returns:
            str: The decoded first generated sequence with ``<s>`` / ``</s>``
            removed and surrounding whitespace stripped.

        Raises:
            ValueError: If ``tokenizer`` is ``None``.
        """
        if tokenizer is None:
            raise ValueError("generate() requires a tokenizer.")

        data_dict = self._prepare_data_for_llm(data_dict)
        # inputs_embeds is only present (and non-None) when images were processed.
        inputs_embeds = data_dict.get("inputs_embeds")
        if inputs_embeds is not None:
            data_dict["inputs_embeds"] = inputs_embeds.to(self.llm.dtype)
        stop_criteria = get_stop_criteria(tokenizer=tokenizer, stop_words=[])
        generate_output = self.llm.generate(
            **data_dict,
            generation_config=self.gen_config if gen_config is None else gen_config,
            streamer=None,
            bos_token_id=tokenizer.bos_token_id,
            stopping_criteria=stop_criteria,
        )
        # Logged at debug level instead of printed to stdout on every call.
        logger.debug("generate_output: %s", generate_output)
        prediction = tokenizer.decode(
            generate_output[0], skip_special_tokens=False
        ).strip()
        prediction = prediction.replace("<s>", "").replace("</s>", "").strip()
        return prediction
877
+
878
+
879
+ AutoConfig.register("chatrex", ChatRexAuxConfig)
880
+ AutoModelForCausalLM.register(ChatRexAuxConfig, ChatRexAuxForConditionalGeneration)
preprocessing_chatrex.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Processor class for ChatRex.
3
+ """
4
+
5
+ from typing import Optional
6
+
7
+ import PIL
8
+ from PIL import Image
9
+
10
+ try:
11
+ from typing import Unpack
12
+ except ImportError:
13
+ from typing_extensions import Unpack
14
+
15
+ import re
16
+ from typing import List, Optional, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+ import torchvision.transforms.functional as F
21
+ from transformers import AutoTokenizer
22
+ from transformers.image_utils import ImageInput
23
+ from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
24
+ TextKwargs)
25
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
26
+ from transformers.utils import logging
27
+
28
# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)



# Special index values and token strings used by the processor.
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN_INDEX = 0
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"

# For Objects
DEFAULT_OBJECT_TOKEN = "<obj<i>>"
DEFAULT_OBJECT_FEATURE_TOKEN = "<objfeat>"
DEFAULT_OBJECT_INDEX = -300

# For Grounding
DEFAULT_GROUNDING_START = "<ground>"
DEFAULT_GROUNDING_END = "</ground>"
DEFAULT_GROUNDING_OBJECTS_START = "<objects>"
DEFAULT_GROUNDING_OBJECTS_END = "</objects>"
47
+
48
def xyxy_to_xywh(boxes):
    """
    Convert boxes from xyxy to xywh format.

    Parameters:
        boxes (numpy.ndarray): An array-like of shape (N, 4) where N is the number of boxes.
            Each box is represented as [x_min, y_min, x_max, y_max].

    Returns:
        numpy.ndarray: An array of shape (N, 4) where each box is represented as [x_min, y_min, w, h].
            Empty input yields an empty (0, 4) array.
    """
    boxes = np.array(boxes)
    # An empty list becomes a 1-D array that cannot be column-indexed.
    if boxes.size == 0:
        return np.zeros((0, 4), dtype=boxes.dtype)
    x_min, y_min, x_max, y_max = (
        boxes[:, 0],
        boxes[:, 1],
        boxes[:, 2],
        boxes[:, 3],
    )
    w = x_max - x_min
    h = y_max - y_min
    return np.stack([x_min, y_min, w, h], axis=1)
69
+
70
+
71
def xywh_to_xyxy(boxes):
    """
    Convert boxes from xywh to xyxy format.

    Parameters:
        boxes (numpy.ndarray): An array-like of shape (N, 4) where N is the number of boxes.
            Each box is represented as [x, y, width, height].

    Returns:
        numpy.ndarray: An array of shape (N, 4) where each box is represented as [x_min, y_min, x_max, y_max].
            Empty input yields an empty (0, 4) array.
    """
    boxes = np.array(boxes)
    # An empty list becomes a 1-D array that cannot be column-indexed.
    if boxes.size == 0:
        return np.zeros((0, 4), dtype=boxes.dtype)
    x, y, width, height = (
        boxes[:, 0],
        boxes[:, 1],
        boxes[:, 2],
        boxes[:, 3],
    )
    x_max = x + width
    y_max = y + height
    return np.stack([x, y, x_max, y_max], axis=1)
92
+
93
def expand2square(pil_img, background_color):
    """Pad a PIL image to a square canvas filled with ``background_color``.

    A square image is returned unchanged (same object). Otherwise a new image
    whose side equals the longer edge is created and the original is pasted so
    that it is centred along the shorter axis.
    """
    img_w, img_h = pil_img.size
    if img_w == img_h:
        return pil_img

    side = max(img_w, img_h)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    # Wide image -> shift down; tall image -> shift right.
    if img_w > img_h:
        offset = (0, (img_w - img_h) // 2)
    else:
        offset = ((img_h - img_w) // 2, 0)
    canvas.paste(pil_img, offset)
    return canvas
105
+
106
def pad_boxes(gt_boxes, old_size):
    """Shift box origins to account for square padding of the image.

    Args:
        gt_boxes: Boxes of shape (N, 4); only columns 0 (x) and 1 (y) are changed.
        old_size: ``(width, height)`` of the image before padding.

    Returns:
        numpy.ndarray: A float32 copy of ``gt_boxes`` with x/y offset by the
        left/top padding, computed as half the width/height difference
        (floor division).
    """
    old_w, old_h = old_size
    shifted = np.array(gt_boxes).astype(np.float32)

    # Only the left/top padding moves box origins.
    if old_w > old_h:
        offset_x, offset_y = 0, (old_w - old_h) // 2
    else:
        offset_x, offset_y = (old_h - old_w) // 2, 0

    shifted[:, 0] += offset_x  # x
    shifted[:, 1] += offset_y  # y
    return shifted
123
+
124
+
125
def resize_boxes(gt_boxes, old_size, new_size):
    """Scale xywh boxes from the padded image to the resized image.

    Args:
        gt_boxes: Boxes of shape (N, 4) as [x, y, w, h].
        old_size: ``(width, height)`` before resizing; the larger of the two is
            used as the reference edge for both axes.
        new_size: ``(height, width)`` after resizing (note the order).

    Returns:
        numpy.ndarray: A float32 copy of ``gt_boxes`` with x/w scaled by
        ``new_w / max(old_size)`` and y/h scaled by ``new_h / max(old_size)``.
    """
    old_w, old_h = old_size
    new_h, new_w = new_size
    reference_edge = max(old_w, old_h)
    factor_x = new_w / reference_edge
    factor_y = new_h / reference_edge

    scaled = np.array(gt_boxes).astype(np.float32)
    scaled[:, [0, 2]] *= factor_x  # x, w
    scaled[:, [1, 3]] *= factor_y  # y, h
    return scaled
140
+
141
def split_special_strings(input_string: str, special_strings: Optional[List[str]] = None):
    """Split the input string into a list of strings, keeping the special strings.

    Args:
        input_string (str): The input string to split.
        special_strings (Optional[List[str]]): Literal substrings to split on.
            They are regex-escaped and kept as separate items in the result.
            If ``None`` or empty (or containing only empty strings), the input
            is returned unsplit.

    Example:

        input_string = "<image>\n<obj0><objfeat><obj1><objfeat>\n I am happy today."
        special_strings = ["<image>", "<objfeat>"]
        output = ['<image>', '\n<obj0>', '<objfeat>', '<obj1>', '<objfeat>', '\n I am happy today.']

    Returns:
        list: A list of strings, with the special strings separated from the rest of the input string.
    """
    # Drop empty entries: an empty alternative would match between every
    # character and shred the input.
    delimiters = [s for s in (special_strings or []) if s]
    if not delimiters:
        return [input_string] if input_string else []

    # Create a regex pattern to match the special strings
    pattern = "|".join(map(re.escape, delimiters))

    # The capturing group makes re.split keep the delimiters in the result.
    split_list = re.split(f"({pattern})", input_string)

    # Remove empty strings from the list
    return [s for s in split_list if s]
165
+
166
def tokenizer_image_object_token(prompt, tokenizer):
    """Tokenize ``prompt``, substituting placeholder ids for special markers.

    The prompt is split on DEFAULT_IMAGE_TOKEN and DEFAULT_OBJECT_FEATURE_TOKEN.
    Each marker becomes a single placeholder id (IMAGE_TOKEN_INDEX or
    DEFAULT_OBJECT_INDEX); every other piece is encoded by ``tokenizer`` without
    special tokens. The result always starts with ``tokenizer.bos_token_id``.

    Returns:
        list: The token ids as a flat Python list.
    """
    placeholder_ids = {
        DEFAULT_IMAGE_TOKEN: IMAGE_TOKEN_INDEX,
        DEFAULT_OBJECT_FEATURE_TOKEN: DEFAULT_OBJECT_INDEX,
    }
    token_ids = [tokenizer.bos_token_id]
    for piece in split_special_strings(prompt, list(placeholder_ids)):
        if piece in placeholder_ids:
            token_ids.append(placeholder_ids[piece])
        else:
            token_ids.extend(tokenizer.encode(piece, add_special_tokens=False))
    return token_ids
179
+
180
class ChatRexProcessor(ProcessorMixin):
    """Processor pairing an image processor with a tokenizer for ChatRex inference.

    ``process`` turns one image, its candidate boxes and a question into the
    tensors stored under ``pixel_values``, ``pixel_values_aux``, ``gt_boxes``
    and ``input_ids``.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor = None, tokenizer : AutoTokenizer = None, **kwargs):
        # NOTE(review): extra **kwargs are accepted but not forwarded; the
        # image_processor / tokenizer attributes are presumably set by
        # ProcessorMixin.__init__ -- confirm against the installed transformers.
        super().__init__(image_processor, tokenizer)
        self._special_tokens = None
        # Prompt template; only INSTRUCTION is used by ``process``.
        self.template = dict(
            SYSTEM=('A chat between a curious user and an artificial '
                    'intelligence assistant. The assistant gives '
                    'helpful, detailed, and polite answers to the '
                    'user\'s questions. {system}\n '),
            INSTRUCTION=('USER: {input} ASSISTANT:'),
            SEP='\n')

    def process(
        self,
        image: Union[str, Image.Image],
        bbox: List[List[int]],
        question: str,
    ):
        """Prepare input data for inference.

        Args:
            image (Union[str, Image.Image]): The image to process. A string is
                treated as a path and opened as RGB.
            bbox (List[List[int]]): A list of bounding boxes for the image. Each bounding box should
                be in order of [x_min, y_min, x_max, y_max], in original-image pixels.
            question (str): The question to ask about the image. Any
                ``<image>`` marker inside it is removed; one is prepended instead.

        Returns:
            dict: ``pixel_values_aux`` (image-processor output with a leading
            batch dimension), ``pixel_values`` (the same image bilinearly
            resized to 336x336), ``gt_boxes`` (xyxy boxes mapped into the
            ``pixel_values_aux`` frame, with a leading batch dimension) and
            ``input_ids`` (prompt token ids with a leading batch dimension).
        """
        data_dict = {}

        # step1 load image
        # isinstance (rather than an exact type comparison) also accepts str subclasses.
        if isinstance(image, str):
            image = Image.open(image).convert("RGB")
        ori_w, ori_h = F.get_image_size(image)
        # Pad to square with the processor's mean colour before preprocessing.
        image = expand2square(
            image,
            tuple(int(x * 255) for x in self.image_processor.image_mean),
        )
        pad_w, pad_h = F.get_image_size(image)
        image_aux = self.image_processor.preprocess(image, return_tensors="pt")[
            "pixel_values"
        ][0]
        resize_h, resize_w = image_aux.shape[-2:]
        data_dict["pixel_values_aux"] = image_aux.unsqueeze(0)
        image = image_aux.clone()
        image = torch.nn.functional.interpolate(
            image[None],
            size=[336, 336],
            mode="bilinear",
            align_corners=False,
        )[0]
        data_dict["pixel_values"] = image.unsqueeze(0)

        # step2 load boxes
        # Boxes follow the image: shift for the padding, then scale for the resize.
        bbox = xyxy_to_xywh(bbox)
        bbox = pad_boxes(bbox, (ori_w, ori_h))
        bbox = resize_boxes(bbox, (pad_w, pad_h), (resize_h, resize_w))
        data_dict["gt_boxes"] = torch.tensor(xywh_to_xyxy(bbox)).unsqueeze(0)

        # step3 prepare question
        # Builds "<obj0><objfeat><obj1><objfeat>..." with one pair per box.
        total_num_boxes = len(bbox)
        obj_tokens = [
            DEFAULT_OBJECT_TOKEN.replace("<i>", str(i)) for i in range(total_num_boxes)
        ]
        obj_tokens = (
            DEFAULT_OBJECT_FEATURE_TOKEN.join(obj_tokens) + DEFAULT_OBJECT_FEATURE_TOKEN
        )
        question = question.replace(DEFAULT_IMAGE_TOKEN, "")
        question = DEFAULT_IMAGE_TOKEN + "\n" + obj_tokens + "\n" + question

        inputs = self.template["INSTRUCTION"].format(input=question, round=1)

        # step4 tokenize question
        input_ids = tokenizer_image_object_token(inputs, self.tokenizer)
        data_dict["input_ids"] = torch.tensor(input_ids).unsqueeze(0)

        return data_dict


ChatRexProcessor.register_for_auto_class()
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 768,
4
+ "width": 768
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_processor_type": "CLIPImageProcessor",
17
+ "image_std": [
18
+ 0.26862954,
19
+ 0.26130258,
20
+ 0.27577711
21
+ ],
22
+ "processor_class": "ChatRexProcessor",
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "shortest_edge": 768
27
+ }
28
+ }
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "preprocessing_chatrex.ChatRexProcessor"
4
+ },
5
+ "processor_class": "ChatRexProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,876 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "<obj0>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "32001": {
39
+ "content": "<obj1>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "32002": {
47
+ "content": "<obj2>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "32003": {
55
+ "content": "<obj3>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "32004": {
63
+ "content": "<obj4>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "32005": {
71
+ "content": "<obj5>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "32006": {
79
+ "content": "<obj6>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "32007": {
87
+ "content": "<obj7>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "32008": {
95
+ "content": "<obj8>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "32009": {
103
+ "content": "<obj9>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "32010": {
111
+ "content": "<obj10>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "32011": {
119
+ "content": "<obj11>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": true
125
+ },
126
+ "32012": {
127
+ "content": "<obj12>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": true
133
+ },
134
+ "32013": {
135
+ "content": "<obj13>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": true
141
+ },
142
+ "32014": {
143
+ "content": "<obj14>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": true
149
+ },
150
+ "32015": {
151
+ "content": "<obj15>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": true
157
+ },
158
+ "32016": {
159
+ "content": "<obj16>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": true
165
+ },
166
+ "32017": {
167
+ "content": "<obj17>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": true
173
+ },
174
+ "32018": {
175
+ "content": "<obj18>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": true
181
+ },
182
+ "32019": {
183
+ "content": "<obj19>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": true
189
+ },
190
+ "32020": {
191
+ "content": "<obj20>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": true
197
+ },
198
+ "32021": {
199
+ "content": "<obj21>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": true
205
+ },
206
+ "32022": {
207
+ "content": "<obj22>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": true
213
+ },
214
+ "32023": {
215
+ "content": "<obj23>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "32024": {
223
+ "content": "<obj24>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "32025": {
231
+ "content": "<obj25>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "32026": {
239
+ "content": "<obj26>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "32027": {
247
+ "content": "<obj27>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "32028": {
255
+ "content": "<obj28>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "32029": {
263
+ "content": "<obj29>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "32030": {
271
+ "content": "<obj30>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "32031": {
279
+ "content": "<obj31>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "32032": {
287
+ "content": "<obj32>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "32033": {
295
+ "content": "<obj33>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ },
302
+ "32034": {
303
+ "content": "<obj34>",
304
+ "lstrip": false,
305
+ "normalized": false,
306
+ "rstrip": false,
307
+ "single_word": false,
308
+ "special": true
309
+ },
310
+ "32035": {
311
+ "content": "<obj35>",
312
+ "lstrip": false,
313
+ "normalized": false,
314
+ "rstrip": false,
315
+ "single_word": false,
316
+ "special": true
317
+ },
318
+ "32036": {
319
+ "content": "<obj36>",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false,
324
+ "special": true
325
+ },
326
+ "32037": {
327
+ "content": "<obj37>",
328
+ "lstrip": false,
329
+ "normalized": false,
330
+ "rstrip": false,
331
+ "single_word": false,
332
+ "special": true
333
+ },
334
+ "32038": {
335
+ "content": "<obj38>",
336
+ "lstrip": false,
337
+ "normalized": false,
338
+ "rstrip": false,
339
+ "single_word": false,
340
+ "special": true
341
+ },
342
+ "32039": {
343
+ "content": "<obj39>",
344
+ "lstrip": false,
345
+ "normalized": false,
346
+ "rstrip": false,
347
+ "single_word": false,
348
+ "special": true
349
+ },
350
+ "32040": {
351
+ "content": "<obj40>",
352
+ "lstrip": false,
353
+ "normalized": false,
354
+ "rstrip": false,
355
+ "single_word": false,
356
+ "special": true
357
+ },
358
+ "32041": {
359
+ "content": "<obj41>",
360
+ "lstrip": false,
361
+ "normalized": false,
362
+ "rstrip": false,
363
+ "single_word": false,
364
+ "special": true
365
+ },
366
+ "32042": {
367
+ "content": "<obj42>",
368
+ "lstrip": false,
369
+ "normalized": false,
370
+ "rstrip": false,
371
+ "single_word": false,
372
+ "special": true
373
+ },
374
+ "32043": {
375
+ "content": "<obj43>",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false,
380
+ "special": true
381
+ },
382
+ "32044": {
383
+ "content": "<obj44>",
384
+ "lstrip": false,
385
+ "normalized": false,
386
+ "rstrip": false,
387
+ "single_word": false,
388
+ "special": true
389
+ },
390
+ "32045": {
391
+ "content": "<obj45>",
392
+ "lstrip": false,
393
+ "normalized": false,
394
+ "rstrip": false,
395
+ "single_word": false,
396
+ "special": true
397
+ },
398
+ "32046": {
399
+ "content": "<obj46>",
400
+ "lstrip": false,
401
+ "normalized": false,
402
+ "rstrip": false,
403
+ "single_word": false,
404
+ "special": true
405
+ },
406
+ "32047": {
407
+ "content": "<obj47>",
408
+ "lstrip": false,
409
+ "normalized": false,
410
+ "rstrip": false,
411
+ "single_word": false,
412
+ "special": true
413
+ },
414
+ "32048": {
415
+ "content": "<obj48>",
416
+ "lstrip": false,
417
+ "normalized": false,
418
+ "rstrip": false,
419
+ "single_word": false,
420
+ "special": true
421
+ },
422
+ "32049": {
423
+ "content": "<obj49>",
424
+ "lstrip": false,
425
+ "normalized": false,
426
+ "rstrip": false,
427
+ "single_word": false,
428
+ "special": true
429
+ },
430
+ "32050": {
431
+ "content": "<obj50>",
432
+ "lstrip": false,
433
+ "normalized": false,
434
+ "rstrip": false,
435
+ "single_word": false,
436
+ "special": true
437
+ },
438
+ "32051": {
439
+ "content": "<obj51>",
440
+ "lstrip": false,
441
+ "normalized": false,
442
+ "rstrip": false,
443
+ "single_word": false,
444
+ "special": true
445
+ },
446
+ "32052": {
447
+ "content": "<obj52>",
448
+ "lstrip": false,
449
+ "normalized": false,
450
+ "rstrip": false,
451
+ "single_word": false,
452
+ "special": true
453
+ },
454
+ "32053": {
455
+ "content": "<obj53>",
456
+ "lstrip": false,
457
+ "normalized": false,
458
+ "rstrip": false,
459
+ "single_word": false,
460
+ "special": true
461
+ },
462
+ "32054": {
463
+ "content": "<obj54>",
464
+ "lstrip": false,
465
+ "normalized": false,
466
+ "rstrip": false,
467
+ "single_word": false,
468
+ "special": true
469
+ },
470
+ "32055": {
471
+ "content": "<obj55>",
472
+ "lstrip": false,
473
+ "normalized": false,
474
+ "rstrip": false,
475
+ "single_word": false,
476
+ "special": true
477
+ },
478
+ "32056": {
479
+ "content": "<obj56>",
480
+ "lstrip": false,
481
+ "normalized": false,
482
+ "rstrip": false,
483
+ "single_word": false,
484
+ "special": true
485
+ },
486
+ "32057": {
487
+ "content": "<obj57>",
488
+ "lstrip": false,
489
+ "normalized": false,
490
+ "rstrip": false,
491
+ "single_word": false,
492
+ "special": true
493
+ },
494
+ "32058": {
495
+ "content": "<obj58>",
496
+ "lstrip": false,
497
+ "normalized": false,
498
+ "rstrip": false,
499
+ "single_word": false,
500
+ "special": true
501
+ },
502
+ "32059": {
503
+ "content": "<obj59>",
504
+ "lstrip": false,
505
+ "normalized": false,
506
+ "rstrip": false,
507
+ "single_word": false,
508
+ "special": true
509
+ },
510
+ "32060": {
511
+ "content": "<obj60>",
512
+ "lstrip": false,
513
+ "normalized": false,
514
+ "rstrip": false,
515
+ "single_word": false,
516
+ "special": true
517
+ },
518
+ "32061": {
519
+ "content": "<obj61>",
520
+ "lstrip": false,
521
+ "normalized": false,
522
+ "rstrip": false,
523
+ "single_word": false,
524
+ "special": true
525
+ },
526
+ "32062": {
527
+ "content": "<obj62>",
528
+ "lstrip": false,
529
+ "normalized": false,
530
+ "rstrip": false,
531
+ "single_word": false,
532
+ "special": true
533
+ },
534
+ "32063": {
535
+ "content": "<obj63>",
536
+ "lstrip": false,
537
+ "normalized": false,
538
+ "rstrip": false,
539
+ "single_word": false,
540
+ "special": true
541
+ },
542
+ "32064": {
543
+ "content": "<obj64>",
544
+ "lstrip": false,
545
+ "normalized": false,
546
+ "rstrip": false,
547
+ "single_word": false,
548
+ "special": true
549
+ },
550
+ "32065": {
551
+ "content": "<obj65>",
552
+ "lstrip": false,
553
+ "normalized": false,
554
+ "rstrip": false,
555
+ "single_word": false,
556
+ "special": true
557
+ },
558
+ "32066": {
559
+ "content": "<obj66>",
560
+ "lstrip": false,
561
+ "normalized": false,
562
+ "rstrip": false,
563
+ "single_word": false,
564
+ "special": true
565
+ },
566
+ "32067": {
567
+ "content": "<obj67>",
568
+ "lstrip": false,
569
+ "normalized": false,
570
+ "rstrip": false,
571
+ "single_word": false,
572
+ "special": true
573
+ },
574
+ "32068": {
575
+ "content": "<obj68>",
576
+ "lstrip": false,
577
+ "normalized": false,
578
+ "rstrip": false,
579
+ "single_word": false,
580
+ "special": true
581
+ },
582
+ "32069": {
583
+ "content": "<obj69>",
584
+ "lstrip": false,
585
+ "normalized": false,
586
+ "rstrip": false,
587
+ "single_word": false,
588
+ "special": true
589
+ },
590
+ "32070": {
591
+ "content": "<obj70>",
592
+ "lstrip": false,
593
+ "normalized": false,
594
+ "rstrip": false,
595
+ "single_word": false,
596
+ "special": true
597
+ },
598
+ "32071": {
599
+ "content": "<obj71>",
600
+ "lstrip": false,
601
+ "normalized": false,
602
+ "rstrip": false,
603
+ "single_word": false,
604
+ "special": true
605
+ },
606
+ "32072": {
607
+ "content": "<obj72>",
608
+ "lstrip": false,
609
+ "normalized": false,
610
+ "rstrip": false,
611
+ "single_word": false,
612
+ "special": true
613
+ },
614
+ "32073": {
615
+ "content": "<obj73>",
616
+ "lstrip": false,
617
+ "normalized": false,
618
+ "rstrip": false,
619
+ "single_word": false,
620
+ "special": true
621
+ },
622
+ "32074": {
623
+ "content": "<obj74>",
624
+ "lstrip": false,
625
+ "normalized": false,
626
+ "rstrip": false,
627
+ "single_word": false,
628
+ "special": true
629
+ },
630
+ "32075": {
631
+ "content": "<obj75>",
632
+ "lstrip": false,
633
+ "normalized": false,
634
+ "rstrip": false,
635
+ "single_word": false,
636
+ "special": true
637
+ },
638
+ "32076": {
639
+ "content": "<obj76>",
640
+ "lstrip": false,
641
+ "normalized": false,
642
+ "rstrip": false,
643
+ "single_word": false,
644
+ "special": true
645
+ },
646
+ "32077": {
647
+ "content": "<obj77>",
648
+ "lstrip": false,
649
+ "normalized": false,
650
+ "rstrip": false,
651
+ "single_word": false,
652
+ "special": true
653
+ },
654
+ "32078": {
655
+ "content": "<obj78>",
656
+ "lstrip": false,
657
+ "normalized": false,
658
+ "rstrip": false,
659
+ "single_word": false,
660
+ "special": true
661
+ },
662
+ "32079": {
663
+ "content": "<obj79>",
664
+ "lstrip": false,
665
+ "normalized": false,
666
+ "rstrip": false,
667
+ "single_word": false,
668
+ "special": true
669
+ },
670
+ "32080": {
671
+ "content": "<obj80>",
672
+ "lstrip": false,
673
+ "normalized": false,
674
+ "rstrip": false,
675
+ "single_word": false,
676
+ "special": true
677
+ },
678
+ "32081": {
679
+ "content": "<obj81>",
680
+ "lstrip": false,
681
+ "normalized": false,
682
+ "rstrip": false,
683
+ "single_word": false,
684
+ "special": true
685
+ },
686
+ "32082": {
687
+ "content": "<obj82>",
688
+ "lstrip": false,
689
+ "normalized": false,
690
+ "rstrip": false,
691
+ "single_word": false,
692
+ "special": true
693
+ },
694
+ "32083": {
695
+ "content": "<obj83>",
696
+ "lstrip": false,
697
+ "normalized": false,
698
+ "rstrip": false,
699
+ "single_word": false,
700
+ "special": true
701
+ },
702
+ "32084": {
703
+ "content": "<obj84>",
704
+ "lstrip": false,
705
+ "normalized": false,
706
+ "rstrip": false,
707
+ "single_word": false,
708
+ "special": true
709
+ },
710
+ "32085": {
711
+ "content": "<obj85>",
712
+ "lstrip": false,
713
+ "normalized": false,
714
+ "rstrip": false,
715
+ "single_word": false,
716
+ "special": true
717
+ },
718
+ "32086": {
719
+ "content": "<obj86>",
720
+ "lstrip": false,
721
+ "normalized": false,
722
+ "rstrip": false,
723
+ "single_word": false,
724
+ "special": true
725
+ },
726
+ "32087": {
727
+ "content": "<obj87>",
728
+ "lstrip": false,
729
+ "normalized": false,
730
+ "rstrip": false,
731
+ "single_word": false,
732
+ "special": true
733
+ },
734
+ "32088": {
735
+ "content": "<obj88>",
736
+ "lstrip": false,
737
+ "normalized": false,
738
+ "rstrip": false,
739
+ "single_word": false,
740
+ "special": true
741
+ },
742
+ "32089": {
743
+ "content": "<obj89>",
744
+ "lstrip": false,
745
+ "normalized": false,
746
+ "rstrip": false,
747
+ "single_word": false,
748
+ "special": true
749
+ },
750
+ "32090": {
751
+ "content": "<obj90>",
752
+ "lstrip": false,
753
+ "normalized": false,
754
+ "rstrip": false,
755
+ "single_word": false,
756
+ "special": true
757
+ },
758
+ "32091": {
759
+ "content": "<obj91>",
760
+ "lstrip": false,
761
+ "normalized": false,
762
+ "rstrip": false,
763
+ "single_word": false,
764
+ "special": true
765
+ },
766
+ "32092": {
767
+ "content": "<obj92>",
768
+ "lstrip": false,
769
+ "normalized": false,
770
+ "rstrip": false,
771
+ "single_word": false,
772
+ "special": true
773
+ },
774
+ "32093": {
775
+ "content": "<obj93>",
776
+ "lstrip": false,
777
+ "normalized": false,
778
+ "rstrip": false,
779
+ "single_word": false,
780
+ "special": true
781
+ },
782
+ "32094": {
783
+ "content": "<obj94>",
784
+ "lstrip": false,
785
+ "normalized": false,
786
+ "rstrip": false,
787
+ "single_word": false,
788
+ "special": true
789
+ },
790
+ "32095": {
791
+ "content": "<obj95>",
792
+ "lstrip": false,
793
+ "normalized": false,
794
+ "rstrip": false,
795
+ "single_word": false,
796
+ "special": true
797
+ },
798
+ "32096": {
799
+ "content": "<obj96>",
800
+ "lstrip": false,
801
+ "normalized": false,
802
+ "rstrip": false,
803
+ "single_word": false,
804
+ "special": true
805
+ },
806
+ "32097": {
807
+ "content": "<obj97>",
808
+ "lstrip": false,
809
+ "normalized": false,
810
+ "rstrip": false,
811
+ "single_word": false,
812
+ "special": true
813
+ },
814
+ "32098": {
815
+ "content": "<obj98>",
816
+ "lstrip": false,
817
+ "normalized": false,
818
+ "rstrip": false,
819
+ "single_word": false,
820
+ "special": true
821
+ },
822
+ "32099": {
823
+ "content": "<obj99>",
824
+ "lstrip": false,
825
+ "normalized": false,
826
+ "rstrip": false,
827
+ "single_word": false,
828
+ "special": true
829
+ },
830
+ "32100": {
831
+ "content": "<ground>",
832
+ "lstrip": false,
833
+ "normalized": false,
834
+ "rstrip": false,
835
+ "single_word": false,
836
+ "special": true
837
+ },
838
+ "32101": {
839
+ "content": "</ground>",
840
+ "lstrip": false,
841
+ "normalized": false,
842
+ "rstrip": false,
843
+ "single_word": false,
844
+ "special": true
845
+ },
846
+ "32102": {
847
+ "content": "<objects>",
848
+ "lstrip": false,
849
+ "normalized": false,
850
+ "rstrip": false,
851
+ "single_word": false,
852
+ "special": true
853
+ },
854
+ "32103": {
855
+ "content": "</objects>",
856
+ "lstrip": false,
857
+ "normalized": false,
858
+ "rstrip": false,
859
+ "single_word": false,
860
+ "special": true
861
+ }
862
+ },
863
+ "bos_token": "<s>",
864
+ "clean_up_tokenization_spaces": false,
865
+ "eos_token": "</s>",
866
+ "legacy": false,
867
+ "model_max_length": 4096,
868
+ "pad_token": "<unk>",
869
+ "padding_side": "right",
870
+ "processor_class": "LlavaProcessor",
871
+ "sp_model_kwargs": {},
872
+ "spaces_between_special_tokens": false,
873
+ "tokenizer_class": "LlamaTokenizer",
874
+ "unk_token": "<unk>",
875
+ "use_default_system_prompt": false
876
+ }