|
from typing import List, Optional |
|
from transformers import PretrainedConfig |
|
|
|
# Default hyperparameters for the "mtgv/MobileVLM-1.7B" checkpoint, mirroring
# its Hugging Face config.json (transformers 4.33.1) with JSON null/true/false
# rendered as Python None/True/False. `MobileVLMConfig.__init__` below reads
# these entries as its argument defaults. Keys prefixed "mm_" configure the
# multimodal (vision->language) projector and vision-feature selection.
config = {
    "_name_or_path": "mtgv/MobileVLM-1.7B",
    "architectures": [
        "MobileLlamaForCausalLM"
    ],
    "bos_token_id": 1,
    "eos_token_id": 2,
    "freeze_mm_mlp_adapter": False,
    "hidden_act": "silu",
    "hidden_size": 2048,
    "image_aspect_ratio": "pad",
    "image_grid_pinpoints": None,
    "initializer_range": 0.02,
    "intermediate_size": 5632,
    "max_position_embeddings": 2048,
    "max_sequence_length": 2048,
    "mm_hidden_size": 1024,
    "mm_projector_type": "ldpnet",
    "mm_use_im_patch_token": False,
    "mm_use_im_start_end": False,
    # "patch" keeps per-patch features from the vision tower;
    # layer -2 selects the second-to-last hidden layer.
    "mm_vision_select_feature": "patch",
    "mm_vision_select_layer": -2,
    "mm_vision_tower": "openai/clip-vit-large-patch14-336",
    "model_type": "mobilevlm",
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "num_key_value_heads": 16,
    "pad_token_id": 0,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-06,
    "rope_scaling": None,
    "rope_theta": 10000.0,
    "tie_word_embeddings": False,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.33.1",
    "tune_mm_mlp_adapter": False,
    "use_cache": True,
    "use_mm_proj": True,
    "vision_tower_type": "clip",
    "vocab_size": 32000
}
|
|
|
class MobileVLMConfig(PretrainedConfig):
    """Configuration for MobileVLM models.

    MobileVLM couples a MobileLlama causal language model with a CLIP vision
    tower and a small multimodal projector (the ``mm_*`` options). Every
    argument defaults to the value in the module-level ``config`` dict (the
    ``mtgv/MobileVLM-1.7B`` checkpoint); any extra keyword arguments are
    forwarded to :class:`transformers.PretrainedConfig`.
    """

    model_type = "mobilevlm"

    def __init__(
        self,
        # Default is None (not config["architectures"]) so the module-level
        # list is never shared as a mutable default across instances.
        architectures: Optional[List[str]] = None,
        bos_token_id: Optional[int] = config["bos_token_id"],
        eos_token_id: Optional[int] = config["eos_token_id"],
        freeze_mm_mlp_adapter: Optional[bool] = config["freeze_mm_mlp_adapter"],
        hidden_act: Optional[str] = config["hidden_act"],
        hidden_size: Optional[int] = config["hidden_size"],
        image_aspect_ratio: Optional[str] = config["image_aspect_ratio"],
        # Presumably a list of grid sizes when enabled (LLaVA-style); the
        # reference checkpoint uses None, so the annotation is kept loose.
        image_grid_pinpoints: Optional[List] = config["image_grid_pinpoints"],
        initializer_range: Optional[float] = config["initializer_range"],
        intermediate_size: Optional[int] = config["intermediate_size"],
        max_position_embeddings: Optional[int] = config["max_position_embeddings"],
        max_sequence_length: Optional[int] = config["max_sequence_length"],
        mm_hidden_size: Optional[int] = config["mm_hidden_size"],
        mm_projector_type: Optional[str] = config["mm_projector_type"],
        mm_use_im_patch_token: Optional[bool] = config["mm_use_im_patch_token"],
        mm_use_im_start_end: Optional[bool] = config["mm_use_im_start_end"],
        mm_vision_select_feature: Optional[str] = config["mm_vision_select_feature"],
        mm_vision_select_layer: Optional[int] = config["mm_vision_select_layer"],
        mm_vision_tower: Optional[str] = config["mm_vision_tower"],
        num_attention_heads: Optional[int] = config["num_attention_heads"],
        num_hidden_layers: Optional[int] = config["num_hidden_layers"],
        num_key_value_heads: Optional[int] = config["num_key_value_heads"],
        pad_token_id: Optional[int] = config["pad_token_id"],
        pretraining_tp: Optional[int] = config["pretraining_tp"],
        rms_norm_eps: Optional[float] = config["rms_norm_eps"],
        # RoPE scaling is a dict of options in LLaMA-style configs (None
        # disables it); the previous Optional[float] annotation was wrong.
        rope_scaling: Optional[dict] = config["rope_scaling"],
        rope_theta: Optional[float] = config["rope_theta"],
        tie_word_embeddings: Optional[bool] = config["tie_word_embeddings"],
        torch_dtype: Optional[str] = config["torch_dtype"],
        transformers_version: Optional[str] = config["transformers_version"],
        tune_mm_mlp_adapter: Optional[bool] = config["tune_mm_mlp_adapter"],
        use_cache: Optional[bool] = config["use_cache"],
        use_mm_proj: Optional[bool] = config["use_mm_proj"],
        vision_tower_type: Optional[str] = config["vision_tower_type"],
        vocab_size: Optional[int] = config["vocab_size"],
        **kwargs,
    ):
        # Copy the default so no instance can mutate the shared
        # config["architectures"] list through self.architectures.
        if architectures is None:
            architectures = list(config["architectures"])

        # Language-model topology.
        self.hidden_act = hidden_act
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.num_key_value_heads = num_key_value_heads
        self.max_position_embeddings = max_position_embeddings
        self.max_sequence_length = max_sequence_length
        self.vocab_size = vocab_size
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.rope_scaling = rope_scaling
        self.rope_theta = rope_theta
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache

        # Multimodal (vision tower + projector) options.
        self.freeze_mm_mlp_adapter = freeze_mm_mlp_adapter
        self.tune_mm_mlp_adapter = tune_mm_mlp_adapter
        self.image_aspect_ratio = image_aspect_ratio
        self.image_grid_pinpoints = image_grid_pinpoints
        self.mm_hidden_size = mm_hidden_size
        self.mm_projector_type = mm_projector_type
        self.mm_use_im_patch_token = mm_use_im_patch_token
        self.mm_use_im_start_end = mm_use_im_start_end
        self.mm_vision_select_feature = mm_vision_select_feature
        self.mm_vision_select_layer = mm_vision_select_layer
        self.mm_vision_tower = mm_vision_tower
        self.use_mm_proj = use_mm_proj
        self.vision_tower_type = vision_tower_type

        # BUGFIX: PretrainedConfig.__init__ consumes (pops) the keys below and
        # assigns them itself. The previous code set them on `self` and then
        # called super().__init__(**kwargs) WITHOUT them, so the parent reset
        # e.g. bos/eos/pad_token_id and tie_word_embeddings to its own
        # defaults. Passing them through preserves the requested values.
        super().__init__(
            architectures=architectures,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            torch_dtype=torch_dtype,
            transformers_version=transformers_version,
            **kwargs,
        )
|
|