VictorSanh committed
Commit 1f31f46
Parent(s): 8d33f67

trying the autoconfig

Files changed:
- config.json +1 -1
- configuration_img2html.py +12 -14
- modeling_img2html.py +8 -8
config.json CHANGED
@@ -10,7 +10,7 @@
   ],
   "attention_dropout": 0.0,
   "auto_map": {
-    "
+    "AutoConfig": "configuration_img2html.Img2HTMLConfig",
     "AutoModelForCausalLM": "modeling_img2html.Img2HTMLForVisionText2Text"
   },
   "bos_token_id": 1,
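The new `AutoConfig` entry in `auto_map` is what lets `transformers` resolve the custom configuration class when the repo is loaded with `trust_remote_code=True`. Below is a minimal sketch of the loading path this enables, assuming the `HuggingFaceM4/Img2HTML` repo (listed in `IMG2HTML_PRETRAINED_MODEL_ARCHIVE_LIST` in modeling_img2html.py) is accessible:

# Sketch: resolve the remote config through the auto_map entry added above.
# trust_remote_code=True is required because Img2HTMLConfig is defined in the
# repo's configuration_img2html.py, not inside the transformers library.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "HuggingFaceM4/Img2HTML",
    trust_remote_code=True,
)
print(type(config).__name__)  # expected: Img2HTMLConfig
print(config.model_type)      # expected: "img2html"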
configuration_img2html.py CHANGED
@@ -24,16 +24,15 @@ MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 }
 
 
-class VMistralVisionConfig(PretrainedConfig):
+class Img2HTMLVisionConfig(PretrainedConfig):
     r"""
     """
-    model_type = "
+    model_type = "img2html"
 
     def __init__(
         self,
         hidden_size=768,
         intermediate_size=3072,
-        projection_dim=512,
         num_hidden_layers=12,
         num_attention_heads=12,
         num_channels=3,
@@ -51,7 +50,6 @@ class VMistralVisionConfig(PretrainedConfig):
 
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
-        self.projection_dim = projection_dim
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         self.num_channels = num_channels
@@ -65,7 +63,7 @@ class VMistralVisionConfig(PretrainedConfig):
         self._flash_attn_2_enabled = _flash_attn_2_enabled
 
 
-class VMistralPerceiverConfig(PretrainedConfig):
+class Img2HTMLPerceiverConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate a
     Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
@@ -91,7 +89,7 @@ class VMistralPerceiverConfig(PretrainedConfig):
         qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
             Whether or not to use qk layer norms in the perceiver
     """
-    model_type = "
+    model_type = "img2html"
 
     def __init__(
         self,
@@ -111,7 +109,7 @@ class VMistralPerceiverConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-class VMistralConfig(PretrainedConfig):
+class Img2HTMLConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate a
     Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
@@ -203,7 +201,7 @@ class VMistralConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
-    model_type = "
+    model_type = "img2html"
     is_composition = False
 
     def __init__(
@@ -282,17 +280,17 @@ class VMistralConfig(PretrainedConfig):
         self.attention_dropout = attention_dropout
 
         if perceiver_config is None:
-            self.perceiver_config = VMistralPerceiverConfig()
+            self.perceiver_config = Img2HTMLPerceiverConfig()
         elif isinstance(perceiver_config, dict):
-            self.perceiver_config = VMistralPerceiverConfig(**perceiver_config)
+            self.perceiver_config = Img2HTMLPerceiverConfig(**perceiver_config)
-        elif isinstance(perceiver_config, VMistralPerceiverConfig):
+        elif isinstance(perceiver_config, Img2HTMLPerceiverConfig):
             self.perceiver_config = perceiver_config
 
         if vision_config is None:
-            self.vision_config = VMistralVisionConfig()
+            self.vision_config = Img2HTMLVisionConfig()
         elif isinstance(vision_config, dict):
-            self.vision_config = VMistralVisionConfig(**vision_config)
+            self.vision_config = Img2HTMLVisionConfig(**vision_config)
-        elif isinstance(vision_config, VMistralVisionConfig):
+        elif isinstance(vision_config, Img2HTMLVisionConfig):
             self.vision_config = vision_config
 
         super().__init__(
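For reference, the `perceiver_config` / `vision_config` branches above accept three forms: nothing (defaults are built), a plain dict (expanded into the sub-config class), or an already-built sub-config object (stored as-is). A small sketch of that behaviour, assuming this configuration_img2html.py is importable as a local module and that the remaining constructor arguments all have defaults:

# Sketch: the three accepted forms for the nested sub-configs.
from configuration_img2html import Img2HTMLConfig, Img2HTMLVisionConfig

# 1) Omitted: Img2HTMLVisionConfig() / Img2HTMLPerceiverConfig() defaults are built.
cfg_default = Img2HTMLConfig()

# 2) Plain dict: expanded into Img2HTMLVisionConfig(**vision_config).
cfg_from_dict = Img2HTMLConfig(vision_config={"hidden_size": 1024})

# 3) Pre-built sub-config: kept as-is.
cfg_from_obj = Img2HTMLConfig(vision_config=Img2HTMLVisionConfig(hidden_size=1024))

assert isinstance(cfg_from_dict.vision_config, Img2HTMLVisionConfig)
assert cfg_from_dict.vision_config.hidden_size == 1024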
modeling_img2html.py CHANGED
@@ -43,7 +43,7 @@ from transformers import PreTrainedModel
 from transformers.utils import logging
 from transformers.modeling_outputs import ModelOutput
 
-from .configuration_img2html import VMistralConfig
+from .configuration_img2html import Img2HTMLConfig
 from .vision import SiglipVisionModel
 
 
@@ -55,7 +55,7 @@ if is_flash_attn_2_available():
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "VMistralConfig"
+_CONFIG_FOR_DOC = "Img2HTMLConfig"
 
 IMG2HTML_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "HuggingFaceM4/Img2HTML"
@@ -698,7 +698,7 @@ class MistralAttention(nn.Module):
     and "Generating Long Sequences with Sparse Transformers".
     """
 
-    def __init__(self, config: VMistralConfig, qk_layer_norms: bool = False):
+    def __init__(self, config: Img2HTMLConfig, qk_layer_norms: bool = False):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -1093,7 +1093,7 @@ class MistralFlashAttention2(MistralAttention):
 
 
 class MistralDecoderLayer(nn.Module):
-    def __init__(self, config: VMistralConfig):
+    def __init__(self, config: Img2HTMLConfig):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.self_attn = (
@@ -1176,7 +1176,7 @@ MISTRAL_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`VMistralConfig`]):
+        config ([`Img2HTMLConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1188,7 +1188,7 @@ MISTRAL_START_DOCSTRING = r"""
     MISTRAL_START_DOCSTRING,
 )
 class VMistralPreTrainedModel(PreTrainedModel):
-    config_class = VMistralConfig
+    config_class = Img2HTMLConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["MistralDecoderLayer"]
@@ -1290,10 +1290,10 @@ class VMistralModel(VMistralPreTrainedModel):
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`]
 
     Args:
-        config: VMistralConfig
+        config: Img2HTMLConfig
     """
 
-    def __init__(self, config: VMistralConfig, vision_model=None):
+    def __init__(self, config: Img2HTMLConfig, vision_model=None):
         super().__init__(config)
         self.config = config
         self.padding_idx = config.pad_token_id
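Since `config_class` now points at `Img2HTMLConfig` and `auto_map` already exposes `Img2HTMLForVisionText2Text` under `AutoModelForCausalLM`, the model can be loaded end to end through the auto classes. A sketch, assuming the `HuggingFaceM4/Img2HTML` checkpoint is reachable and fits in memory:

# Sketch: load the remote model; config_class ensures model.config is rebuilt
# as an Img2HTMLConfig instance rather than a generic PretrainedConfig.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceM4/Img2HTML",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
print(type(model).__name__)         # expected: Img2HTMLForVisionText2Text
print(type(model.config).__name__)  # expected: Img2HTMLConfig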