# Module-level metadata mapping: descriptive top-level fields plus a nested
# "config" mapping of numeric settings and boolean feature flags.
model_card_metadata = {
    # --- descriptive fields ---
    "license": "apache-2.0",
    "language": ["en"],
    "metrics": ["accuracy", "bertscore"],
    # NOTE(review): this is a list of two names; confirm the consumer accepts
    # a list here rather than a single string.
    "library_name": ["adapter-transformers", "transformers"],
    "model_name": "AutoModel",
    "model_type": "multimodal-transformer",
    "tags": ["multimodal", "transformer"],
    "datasets": ["dataset1", "dataset2"],
    "finetuned_from": "pretrained-model",
    # --- nested settings ---
    "config": {
        # size / dropout values
        "hidden_size": 768,
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "intermediate_size": 2048,
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
        # image_* / patch values
        "image_size": 224,
        "image_channels": 3,
        "patch_size": 16,
        # position / vocabulary values
        "max_position_embeddings": 512,
        "vocab_size": 30522,
        "type_vocab_size": 2,
        # audio_* values (units are not stated here; presumably Hz and
        # samples -- TODO confirm)
        "audio_sample_rate": 16000,
        "audio_frame_size": 1024,
        "audio_hop_size": 512,
        # enable_* flags -- every one is switched on
        "enable_vqa": True,
        "enable_caption": True,
        "enable_retrieval": True,
        "enable_asr": True,
        "enable_realtime_asr": True,
        # batch / rate / step values
        "batch_size": 32,
        "learning_rate": 0.0001,
        "weight_decay": 0.01,
        "warmup_steps": 10000,
        "max_steps": 100000,
    },
}