VictorSanh committed · Commit 1f31f46 · 1 Parent(s): 8d33f67

trying the autoconfig

Files changed (3):
  1. config.json +1 -1
  2. configuration_img2html.py +12 -14
  3. modeling_img2html.py +8 -8
config.json CHANGED
@@ -10,7 +10,7 @@
   ],
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoProcessor": "processing_img2html.Img2HTMLProcessor",
+    "AutoConfig": "configuration_img2html.Img2HTMLConfig",
     "AutoModelForCausalLM": "modeling_img2html.Img2HTMLForVisionText2Text"
   },
   "bos_token_id": 1,
configuration_img2html.py CHANGED
@@ -24,16 +24,15 @@ MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 }
 
 
-class VMistralVisionConfig(PretrainedConfig):
+class Img2HTMLVisionConfig(PretrainedConfig):
     r"""
     """
-    model_type = "vmistral"
+    model_type = "img2html"
 
     def __init__(
         self,
         hidden_size=768,
         intermediate_size=3072,
-        projection_dim=512,
         num_hidden_layers=12,
         num_attention_heads=12,
         num_channels=3,
@@ -51,7 +50,6 @@ class VMistralVisionConfig(PretrainedConfig):
 
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
-        self.projection_dim = projection_dim
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         self.num_channels = num_channels
@@ -65,7 +63,7 @@ class VMistralVisionConfig(PretrainedConfig):
         self._flash_attn_2_enabled = _flash_attn_2_enabled
 
 
-class VMistralPerceiverConfig(PretrainedConfig):
+class Img2HTMLPerceiverConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an
     Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
@@ -91,7 +89,7 @@ class VMistralPerceiverConfig(PretrainedConfig):
         qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
             Whether or not to use qk layer norms in perceiver
     """
-    model_type = "vmistral"
+    model_type = "img2html"
 
     def __init__(
         self,
@@ -111,7 +109,7 @@ class VMistralPerceiverConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-class VMistralConfig(PretrainedConfig):
+class Img2HTMLConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an
     Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
@@ -203,7 +201,7 @@ class VMistralConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
-    model_type = "vmistral"
+    model_type = "img2html"
     is_composition = False
 
     def __init__(
@@ -282,17 +280,17 @@ class VMistralConfig(PretrainedConfig):
         self.attention_dropout = attention_dropout
 
         if perceiver_config is None:
-            self.perceiver_config = VMistralPerceiverConfig()
+            self.perceiver_config = Img2HTMLPerceiverConfig()
         elif isinstance(perceiver_config, dict):
-            self.perceiver_config = VMistralPerceiverConfig(**perceiver_config)
-        elif isinstance(perceiver_config, VMistralPerceiverConfig):
+            self.perceiver_config = Img2HTMLPerceiverConfig(**perceiver_config)
+        elif isinstance(perceiver_config, Img2HTMLPerceiverConfig):
             self.perceiver_config = perceiver_config
 
         if vision_config is None:
-            self.vision_config = VMistralVisionConfig()
+            self.vision_config = Img2HTMLVisionConfig()
         elif isinstance(vision_config, dict):
-            self.vision_config = VMistralVisionConfig(**vision_config)
-        elif isinstance(vision_config, VMistralVisionConfig):
+            self.vision_config = Img2HTMLVisionConfig(**vision_config)
+        elif isinstance(vision_config, Img2HTMLVisionConfig):
             self.vision_config = vision_config
 
         super().__init__(
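The `Img2HTMLConfig` constructor accepts the perceiver and vision sub-configs either as plain dicts or as already-built config objects, as the branches above show. A minimal sketch of both paths, assuming the repo's `configuration_img2html.py` is importable locally and using only parameter names visible in this diff:

```python
from configuration_img2html import (
    Img2HTMLConfig,
    Img2HTMLPerceiverConfig,
    Img2HTMLVisionConfig,
)

# Plain dicts are rebuilt into typed sub-configs by the isinstance branches.
config = Img2HTMLConfig(
    vision_config={"hidden_size": 768, "num_hidden_layers": 12},
    perceiver_config={"qk_layer_norms_perceiver": True},
)
assert isinstance(config.vision_config, Img2HTMLVisionConfig)
assert isinstance(config.perceiver_config, Img2HTMLPerceiverConfig)

# Already-built sub-configs are stored as-is.
config = Img2HTMLConfig(vision_config=Img2HTMLVisionConfig(hidden_size=1152))
```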
modeling_img2html.py CHANGED
@@ -43,7 +43,7 @@ from transformers import PreTrainedModel
 from transformers.utils import logging
 from transformers.modeling_outputs import ModelOutput
 
-from .configuration_img2html import VMistralConfig
+from .configuration_img2html import Img2HTMLConfig
 from .vision import SiglipVisionModel
 
 
@@ -55,7 +55,7 @@ if is_flash_attn_2_available():
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "VMistralConfig"
+_CONFIG_FOR_DOC = "Img2HTMLConfig"
 
 IMG2HTML_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "HuggingFaceM4/Img2HTML"
@@ -698,7 +698,7 @@ class MistralAttention(nn.Module):
     and "Generating Long Sequences with Sparse Transformers".
     """
 
-    def __init__(self, config: VMistralConfig, qk_layer_norms: bool = False):
+    def __init__(self, config: Img2HTMLConfig, qk_layer_norms: bool = False):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -1093,7 +1093,7 @@ class MistralFlashAttention2(MistralAttention):
 
 
 class MistralDecoderLayer(nn.Module):
-    def __init__(self, config: VMistralConfig):
+    def __init__(self, config: Img2HTMLConfig):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.self_attn = (
@@ -1176,7 +1176,7 @@ MISTRAL_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`VMistralConfig`]):
+        config ([`Img2HTMLConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1188,7 +1188,7 @@ MISTRAL_START_DOCSTRING = r"""
     MISTRAL_START_DOCSTRING,
 )
 class VMistralPreTrainedModel(PreTrainedModel):
-    config_class = VMistralConfig
+    config_class = Img2HTMLConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["MistralDecoderLayer"]
@@ -1290,10 +1290,10 @@ class VMistralModel(VMistralPreTrainedModel):
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`]
 
     Args:
-        config: VMistralConfig
+        config: Img2HTMLConfig
     """
 
-    def __init__(self, config: VMistralConfig, vision_model=None):
+    def __init__(self, config: Img2HTMLConfig, vision_model=None):
         super().__init__(config)
         self.config = config
         self.padding_idx = config.pad_token_id
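Setting `config_class = Img2HTMLConfig` is what ties the modeling code back to the new config: `PreTrainedModel.from_pretrained` uses that attribute to parse `config.json` when the custom model class is loaded directly (for example after `AutoModelForCausalLM` resolves it through `auto_map`). A minimal check, assuming the repo files are importable locally and that `Img2HTMLForVisionText2Text` from the `auto_map` entry is defined in `modeling_img2html.py`:

```python
from modeling_img2html import Img2HTMLForVisionText2Text
from configuration_img2html import Img2HTMLConfig

# config_class on the PreTrainedModel subclass decides which config type
# from_pretrained builds from config.json.
model = Img2HTMLForVisionText2Text.from_pretrained("HuggingFaceM4/Img2HTML")
assert isinstance(model.config, Img2HTMLConfig)
```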