DVGFormer / config.json
yunzhong-hou's picture
checkpoint upload
3f581bb
{
"_name_or_path": "logs/checkpoint",
"action_downsample": 1,
"action_fps": 15,
"action_option": "dense",
"architectures": [
"DVGFormerModel"
],
"backbone_downsample": 14,
"cropped_sensor_width": 36.0,
"drone_types": [
0,
1
],
"fix_image_width": true,
"focal_alpha": 0.9,
"fps": 3,
"fps_downsample": 5,
"gpt2_config": {
"action_downsample": 1,
"action_fps": 15,
"architectures": [
"UAVPoseNetModel"
],
"backbone_downsample": 14,
"cropped_sensor_width": 36.0,
"fps_downsample": 5,
"hdf5_fname": "dataset_full.h5",
"image_resolution": [
168,
294
],
"model_type": "gpt2",
"n_action_to_predict": 5,
"n_embd": 384,
"n_head": 6,
"n_positions": 1562,
"n_token_image": 45,
"n_token_to_predict": 5,
"n_token_total": 52,
"per_token_preds": 1,
"root": "youtube_drone_videos",
"torch_dtype": "bfloat16",
"vision_feat_dim": 384
},
"hdf5_fname": "dataset_full.h5",
"hidden_size": 384,
"ignore_value": -100,
"image_featmap_shape": [
5,
9
],
"image_resolution": [
168,
294
],
"loss_coef_action": 1,
"loss_coef_drone_type": 0,
"loss_coef_future": 0,
"loss_coef_state": 0,
"loss_coef_stop": 0,
"max_model_frames": 150,
"model_type": "dvgformer",
"motion_option": "local",
"n_action_to_predict": 5,
"n_future_frames": 15,
"n_token_action": 1,
"n_token_boa": 1,
"n_token_drone_type": 1,
"n_token_frame": 52,
"n_token_image": 45,
"n_token_noise": 1,
"n_token_predict": 5,
"n_token_prepend": 2,
"n_token_quality": 0,
"n_token_state": 1,
"n_token_to_predict": 5,
"n_token_total": 52,
"num_quantile_bins": 10,
"pad_side": "right",
"pad_token_value": 0,
"per_token_preds": 1,
"prediction_option": "iterative",
"root": "youtube_drone_videos",
"test_gt_forcing": "allframe",
"torch_dtype": "bfloat16",
"transformers_version": "4.45.2",
"use_depth": true,
"use_quality_mlps": false,
"vision_backbone": "dinov2_vits14_reg",
"vision_feat_dim": 384
}