{
  "_name_or_path": "",
  "architectures": [
    "MultiModalLLM_PT"
  ],
  "auto_map": {
    "AutoConfig": "model_config.VideoChatEConfig",
    "AutoModel": "modeling_videochate.MultiModalLLM_PT"
  },
  "model_config": {
    "bridge": {
      "extra_num_query_token": 64,
      "name": "qformer",
      "num_query_token": 32,
      "qformer_attention_probs_dropout_prob": 0.1,
      "qformer_drop_path_rate": 0.2,
      "qformer_hidden_dropout_prob": 0.1
    },
    "freeze_bridge": false,
    "freeze_llm": false,
    "freeze_vision_encoder": false,
    "llm": {
      "lora_alpha": 32,
      "lora_dropout": 0.1,
      "lora_r": 16,
      "name": "mistral_7b",
      "pretrained_llm_path": "mistralai/Mistral-7B-Instruct-v0.3",
      "use_lora": true,
      "hidden_size": 4096
    },
    "loss": {
      "use_vision_regression_loss": false
    },
    "pretrained_paths": {},
    "vision_encoder": {
      "name": "vit_l14",
      "img_size": 224,
      "patch_size": 16,
      "d_model": 1024,
      "encoder_embed_dim": 1024,
      "encoder_depth": 24,
      "encoder_num_heads": 16,
      "drop_path_rate": 0.0,
      "num_frames": 16,
      "tubelet_size": 1,
      "use_checkpoint": false,
      "checkpoint_num": 0,
      "return_index": -2,
      "vit_add_ln": true,
      "pretrained": null
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.38.0",
  "use_flash_attention": true,
  "use_cache": true,
  "build_decoder": true,
  "hidden_size": 4096
}