ericonaldo committed on
Commit
00342af
·
verified ·
1 Parent(s): 85adcf3

Upload folder using huggingface_hub

Browse files
configs/kosmos_ph_calvin_abc.json ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robovlm_name": "RoboKosMos",
3
+ "parent": null,
4
+ "task_name": "calvin_finetune",
5
+ "model": "kosmos",
6
+ "model_url": "https://huggingface.co/microsoft/kosmos-2-patch14-224",
7
+ "seq_len": 1,
8
+ "image_size": 224,
9
+ "image_mean": [
10
+ 0.48145466,
11
+ 0.4578275,
12
+ 0.40821073
13
+ ],
14
+ "image_std": [
15
+ 0.26862954,
16
+ 0.26130258,
17
+ 0.27577711
18
+ ],
19
+ "window_size": 16,
20
+ "fwd_pred_next_n": 10,
21
+ "arm_gripper_loss_ratio": 0.01,
22
+ "cap_loss_ratio": 0.05,
23
+ "fwd_loss_ratio": 0,
24
+ "seed": 123,
25
+ "batch_size": 4,
26
+ "num_workers": 16,
27
+ "data_scale": 1,
28
+ "optimizer": "adam",
29
+ "learning_rate": 0.0001,
30
+ "min_lr_scale": 0.01,
31
+ "weight_decay": 0,
32
+ "warmup_epochs": 0.25,
33
+ "warmup_steps": 0,
34
+ "warmup_ratio": null,
35
+ "use_hand_rgb": true,
36
+ "use_time_causal_attn": false,
37
+ "use_mim_obs_loss": false,
38
+ "use_pixel_loss": true,
39
+ "use_obs_queries": true,
40
+ "use_vision_resampler": false,
41
+ "vision_masked_ratio": 0.9,
42
+ "use_tube_mask": false,
43
+ "output_root": "checkpoints/kosmos/calvin_finetune",
44
+ "log_root": "logs/kosmos/calvin_finetune",
45
+ "cache_root": "cache/kosmos",
46
+ "model_load_path": null,
47
+ "model_load_source": "torch",
48
+ "resume": null,
49
+ "model_path": ".vlms/kosmos-2-patch14-224",
50
+ "model_config": ".vlms/kosmos-2-patch14-224/config.json",
51
+ "train_setup": {
52
+ "precision": "16",
53
+ "predict_action": true,
54
+ "predict_forward": false,
55
+ "predict_forward_hand": false,
56
+ "predict_caption": false,
57
+ "train_vision": true,
58
+ "bits": -1,
59
+ "freeze_mm_mlp_adapter": false,
60
+ "freeze_backbone": false,
61
+ "freeze_resampler": false,
62
+ "tune_mm_mlp_adapter": false,
63
+ "mm_use_im_start_end": false,
64
+ "mm_use_im_patch_token": false,
65
+ "gradient_checkpointing": false,
66
+ "lora_enable": false,
67
+ "mm_projector_lr": 0.0001,
68
+ "lora_r": 64,
69
+ "lora_alpha": 16,
70
+ "lora_dropout": 0.05,
71
+ "lora_bias": "none",
72
+ "train_text_embedding": true
73
+ },
74
+ "vision_resampler": {
75
+ "vis_dim": 1024,
76
+ "depth": 8,
77
+ "dim_head": 64,
78
+ "heads": 8,
79
+ "num_latents": 64
80
+ },
81
+ "act_encoder": null,
82
+ "act_head": {
83
+ "type": "LSTMDecoder",
84
+ "hidden_size": 1024,
85
+ "action_dim": 7,
86
+ "down_sample": "none",
87
+ "latent": 1,
88
+ "fwd_pred_next_n": 1,
89
+ "window_size": 1,
90
+ "action_space": "continuous",
91
+ "with_history": true,
92
+ "history_type": "post"
93
+ },
94
+ "fwd_head": null,
95
+ "tokenizer": {
96
+ "type": "AutoProcessor",
97
+ "pretrained_model_name_or_path": ".vlms/kosmos-2-patch14-224",
98
+ "tokenizer_type": "kosmos",
99
+ "max_text_len": 256,
100
+ "additional_special_tokens": null
101
+ },
102
+ "trainer": {
103
+ "accelerator": "gpu",
104
+ "strategy": "deepspeed_stage_2",
105
+ "precision": "16",
106
+ "logger": [
107
+ "tensorboard"
108
+ ],
109
+ "gradient_clip_val": 1.0,
110
+ "use_distributed_sampler": false,
111
+ "log_every_n_steps": 10,
112
+ "max_epochs": 5,
113
+ "val_check_interval": null,
114
+ "check_val_every_n_epoch": 1,
115
+ "max_steps": -1,
116
+ "accumulate_grad_batches": 1
117
+ },
118
+ "train_dataset": {
119
+ "type": "DiskCalvinDataset",
120
+ "data_dir": "datasets/calvin_data/task_ABC_D/training",
121
+ "shift_first": false,
122
+ "model_name": "kosmos",
123
+ "rgb_pad": 10,
124
+ "gripper_pad": 4
125
+ },
126
+ "val_dataset": {
127
+ "type": "DiskCalvinDataset",
128
+ "data_dir": "datasets/calvin_data/task_ABC_D/validation",
129
+ "model_name": "kosmos"
130
+ },
131
+ "norm_action": true,
132
+ "norm_min": -0.65,
133
+ "norm_max": 0.65,
134
+ "raw_config_path": "configs/kosmos/finetune_kosmos_cont-lstm-post_full-ft_text_vision_wd=0_hist=16_act=10_use-hand_aug-shift_act-norm_lr-1e-4_abc.json",
135
+ "config": "configs/kosmos/finetune_kosmos_cont-lstm-post_full-ft_text_vision_wd=0_hist=16_act=10_use-hand_aug-shift_act-norm_lr-1e-4_abc.json",
136
+ "gpus": 8,
137
+ "num_nodes": 4,
138
+ "log_dir": "logs/kosmos/calvin_finetune/2024-11-20/04-00",
139
+ "output_dir": "checkpoints/kosmos/calvin_finetune/2024-11-20/04-00",
140
+ "data_dir": null,
141
+ "annotation_file": null,
142
+ "data_subfolder": null,
143
+ "task_num": null,
144
+ "exp_name": "04-00",
145
+ "use_multi_modal_emb": false,
146
+ "no_video_pretrained_model": false,
147
+ "finetune": false,
148
+ "llm": {
149
+ "type": null,
150
+ "n_embd": null,
151
+ "n_layer": null,
152
+ "n_head": null
153
+ }
154
+ }
configs/kosmos_ph_calvin_abcd.json ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robovlm_name": "RoboKosMos",
3
+ "parent": null,
4
+ "task_name": "calvin_finetune",
5
+ "model": "kosmos",
6
+ "model_url": "https://huggingface.co/microsoft/kosmos-2-patch14-224",
7
+ "seq_len": 1,
8
+ "image_size": 224,
9
+ "image_mean": [
10
+ 0.48145466,
11
+ 0.4578275,
12
+ 0.40821073
13
+ ],
14
+ "image_std": [
15
+ 0.26862954,
16
+ 0.26130258,
17
+ 0.27577711
18
+ ],
19
+ "window_size": 16,
20
+ "fwd_pred_next_n": 10,
21
+ "arm_gripper_loss_ratio": 0.01,
22
+ "cap_loss_ratio": 0.05,
23
+ "fwd_loss_ratio": 0,
24
+ "seed": 123,
25
+ "batch_size": 4,
26
+ "num_workers": 16,
27
+ "data_scale": 1,
28
+ "optimizer": "adam",
29
+ "learning_rate": 0.0001,
30
+ "min_lr_scale": 0.01,
31
+ "weight_decay": 0,
32
+ "warmup_epochs": 0.25,
33
+ "warmup_steps": 0,
34
+ "warmup_ratio": null,
35
+ "use_hand_rgb": true,
36
+ "use_time_causal_attn": false,
37
+ "use_mim_obs_loss": false,
38
+ "use_pixel_loss": true,
39
+ "use_obs_queries": true,
40
+ "use_vision_resampler": false,
41
+ "vision_masked_ratio": 0.9,
42
+ "use_tube_mask": false,
43
+ "output_root": "checkpoints/kosmos/calvin_finetune",
44
+ "log_root": "logs/kosmos/calvin_finetune",
45
+ "cache_root": "cache/kosmos",
46
+ "model_load_path": null,
47
+ "model_load_source": "torch",
48
+ "resume": null,
49
+ "model_path": "/mnt/bn/robotics-data-lxh-lq/LLaVA/kosmos-2-patch14-224",
50
+ "model_config": "/mnt/bn/robotics-data-lxh-lq/LLaVA/kosmos-2-patch14-224/config.json",
51
+ "train_setup": {
52
+ "precision": "16",
53
+ "predict_action": true,
54
+ "predict_forward": false,
55
+ "predict_forward_hand": false,
56
+ "predict_caption": false,
57
+ "train_vision": true,
58
+ "bits": -1,
59
+ "freeze_mm_mlp_adapter": false,
60
+ "freeze_backbone": false,
61
+ "freeze_resampler": false,
62
+ "tune_mm_mlp_adapter": false,
63
+ "mm_use_im_start_end": false,
64
+ "mm_use_im_patch_token": false,
65
+ "gradient_checkpointing": false,
66
+ "lora_enable": false,
67
+ "mm_projector_lr": 0.0001,
68
+ "lora_r": 64,
69
+ "lora_alpha": 16,
70
+ "lora_dropout": 0.05,
71
+ "lora_bias": "none",
72
+ "train_text_embedding": true
73
+ },
74
+ "vision_resampler": {
75
+ "vis_dim": 1024,
76
+ "depth": 8,
77
+ "dim_head": 64,
78
+ "heads": 8,
79
+ "num_latents": 64
80
+ },
81
+ "act_encoder": null,
82
+ "act_head": {
83
+ "type": "LSTMDecoder",
84
+ "hidden_size": 1024,
85
+ "action_dim": 7,
86
+ "down_sample": "none",
87
+ "latent": 1,
88
+ "fwd_pred_next_n": 1,
89
+ "window_size": 1,
90
+ "action_space": "continuous",
91
+ "with_history": true,
92
+ "history_type": "post"
93
+ },
94
+ "fwd_head": null,
95
+ "tokenizer": {
96
+ "type": "AutoProcessor",
97
+ "pretrained_model_name_or_path": "/mnt/bn/robotics-data-lxh-lq/LLaVA/kosmos-2-patch14-224",
98
+ "tokenizer_type": "kosmos",
99
+ "max_text_len": 256,
100
+ "additional_special_tokens": null
101
+ },
102
+ "trainer": {
103
+ "accelerator": "gpu",
104
+ "strategy": "deepspeed_stage_2",
105
+ "precision": "16",
106
+ "logger": [
107
+ "tensorboard"
108
+ ],
109
+ "gradient_clip_val": 1.0,
110
+ "use_distributed_sampler": false,
111
+ "log_every_n_steps": 10,
112
+ "max_epochs": 5,
113
+ "val_check_interval": null,
114
+ "check_val_every_n_epoch": 1,
115
+ "max_steps": -1,
116
+ "accumulate_grad_batches": 1
117
+ },
118
+ "train_dataset": {
119
+ "type": "DiskCalvinDataset",
120
+ "data_dir": "/mnt/bn/robotics/manipulation_data/calvin_data/task_ABCD_D/training",
121
+ "shift_first": false,
122
+ "model_name": "kosmos",
123
+ "rgb_pad": 10,
124
+ "gripper_pad": 4
125
+ },
126
+ "val_dataset": {
127
+ "type": "DiskCalvinDataset",
128
+ "data_dir": "/mnt/bn/robotics/manipulation_data/calvin_data/task_ABCD_D/validation",
129
+ "model_name": "kosmos"
130
+ },
131
+ "norm_action": true,
132
+ "norm_min": -0.65,
133
+ "norm_max": 0.65,
134
+ "scheduler": "cosine",
135
+ "raw_config_path": "configs/kosmos/finetune_kosmos_cont-lstm-post_full-ft_text_vision_wd=0_hist=16_act=10_use-hand_aug-shift_act-norm_lr-1e-4.json",
136
+ "config": "configs/kosmos/finetune_kosmos_cont-lstm-post_full-ft_text_vision_wd=0_hist=16_act=10_use-hand_aug-shift_act-norm_lr-1e-4.json",
137
+ "gpus": 8,
138
+ "num_nodes": 4,
139
+ "log_dir": "logs/kosmos/calvin_finetune/2024-11-20/04-00",
140
+ "output_dir": "checkpoints/kosmos/calvin_finetune/2024-11-20/04-00",
141
+ "data_dir": null,
142
+ "annotation_file": null,
143
+ "data_subfolder": null,
144
+ "task_num": null,
145
+ "exp_name": "17-54",
146
+ "use_multi_modal_emb": false,
147
+ "no_video_pretrained_model": false,
148
+ "finetune": false,
149
+ "llm": {
150
+ "type": null,
151
+ "n_embd": null,
152
+ "n_layer": null,
153
+ "n_head": null
154
+ }
155
+ }
configs/kosmos_ph_oxe-pretrain.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "parent": null,
3
+ "model": "kosmos",
4
+ "seq_len": 1,
5
+ "image_size": 224,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_std": [
12
+ 0.26862954,
13
+ 0.26130258,
14
+ 0.27577711
15
+ ],
16
+ "window_size": 16,
17
+ "fwd_pred_next_n": 10,
18
+ "arm_gripper_loss_ratio": 0.01,
19
+ "cap_loss_ratio": 0.05,
20
+ "fwd_loss_ratio": 0,
21
+ "seed": 123,
22
+ "batch_size": 4,
23
+ "num_workers": 16,
24
+ "data_scale": 1,
25
+ "optimizer": "adam",
26
+ "learning_rate": 2e-05,
27
+ "min_lr_scale": 0.01,
28
+ "weight_decay": 0,
29
+ "warmup_epochs": 0,
30
+ "warmup_steps": 5000,
31
+ "warmup_ratio": null,
32
+ "use_hand_rgb": true,
33
+ "use_time_causal_attn": false,
34
+ "use_mim_obs_loss": false,
35
+ "use_pixel_loss": true,
36
+ "use_obs_queries": true,
37
+ "use_vision_resampler": false,
38
+ "vision_masked_ratio": 0.9,
39
+ "use_tube_mask": false,
40
+ "output_root": "/mnt/bn/robotics-data-lxh-lq-v2/checkpoints/video_pretrain_manipulation/kosmos/calvin_finetune",
41
+ "log_root": "/mnt/bn/robotics-data-lxh-lq-v2/logs/video_pretrain_manipulation/kosmos/calvin_finetune",
42
+ "cache_root": "/mnt/bn/robotics-data-lxh-lq-v2/cache/video_pretrain_manipulation/kosmos",
43
+ "model_load_path": null,
44
+ "model_load_source": "torch",
45
+ "resume": null,
46
+ "model_path": "/mnt/bn/robotics-data-lxh-lq/LLaVA/kosmos-2-patch14-224",
47
+ "model_config": "/mnt/bn/robotics-data-lxh-lq/LLaVA/kosmos-2-patch14-224/config.json",
48
+ "train_setup": {
49
+ "precision": "16-mixed",
50
+ "predict_action": true,
51
+ "predict_forward": false,
52
+ "predict_forward_hand": false,
53
+ "predict_caption": false,
54
+ "train_vision": true,
55
+ "bits": -1,
56
+ "freeze_mm_mlp_adapter": false,
57
+ "freeze_backbone": false,
58
+ "freeze_resampler": false,
59
+ "tune_mm_mlp_adapter": false,
60
+ "mm_use_im_start_end": false,
61
+ "mm_use_im_patch_token": false,
62
+ "gradient_checkpointing": false,
63
+ "lora_enable": false,
64
+ "mm_projector_lr": 0.0001,
65
+ "lora_r": 64,
66
+ "lora_alpha": 16,
67
+ "lora_dropout": 0.05,
68
+ "lora_bias": "none",
69
+ "train_text_embedding": true
70
+ },
71
+ "vision_resampler": {
72
+ "vis_dim": 1024,
73
+ "depth": 8,
74
+ "dim_head": 64,
75
+ "heads": 8,
76
+ "num_latents": 64
77
+ },
78
+ "act_encoder": null,
79
+ "act_head": {
80
+ "type": "LSTMDecoder",
81
+ "hidden_size": 1024,
82
+ "action_dim": 7,
83
+ "down_sample": "none",
84
+ "latent": 1,
85
+ "fwd_pred_next_n": 1,
86
+ "window_size": 1,
87
+ "action_space": "continuous",
88
+ "with_history": true,
89
+ "history_type": "post"
90
+ },
91
+ "fwd_head": null,
92
+ "tokenizer": {
93
+ "type": "AutoProcessor",
94
+ "pretrained_model_name_or_path": "/mnt/bn/robotics-data-lxh-lq/LLaVA/kosmos-2-patch14-224",
95
+ "tokenizer_type": "kosmos",
96
+ "max_text_len": 256,
97
+ "additional_special_tokens": null
98
+ },
99
+ "trainer": {
100
+ "accelerator": "gpu",
101
+ "strategy": "deepspeed_stage_2",
102
+ "precision": "16-mixed",
103
+ "logger": [
104
+ "tensorboard"
105
+ ],
106
+ "gradient_clip_val": 1.0,
107
+ "use_distributed_sampler": false,
108
+ "log_every_n_steps": 10,
109
+ "max_epochs": 5,
110
+ "val_check_interval": 10000,
111
+ "check_val_every_n_epoch": null,
112
+ "max_steps": 100000,
113
+ "accumulate_grad_batches": 1,
114
+ "limit_val_batches": 1000
115
+ },
116
+ "train_dataset": {
117
+ "type": "OpenVLADataset",
118
+ "data_root_dir": "/mnt/bn/robotics-data-lxh-lq/openvla/datasets/open-x-embodiment",
119
+ "model_name": "kosmos",
120
+ "image_aug": true,
121
+ "mode": "train",
122
+ "data_mix": "oxe_magic_soup",
123
+ "window_sample": "sliding",
124
+ "organize_type": "interleave",
125
+ "shuffle_buffer_size": 51200,
126
+ "train": true
127
+ },
128
+ "val_dataset": {
129
+ "type": "OpenVLADataset",
130
+ "data_root_dir": "/mnt/bn/robotics-data-lxh-lq/openvla/datasets/open-x-embodiment",
131
+ "model_name": "kosmos",
132
+ "mode": "train",
133
+ "data_mix": "rt_1",
134
+ "window_sample": "sliding",
135
+ "organize_type": "interleave",
136
+ "shuffle_buffer_size": 10000,
137
+ "train": false
138
+ },
139
+ "raw_config_path": "configs/kosmos/oxe_new/finetune_kosmos_cont-lstm-post_full-ft_text_vision_wd=0_hist=16_act=10_use-hand_aug-shift_act-norm_lr-2e-5_oxe.json",
140
+ "config": "configs/kosmos/oxe_new/finetune_kosmos_cont-lstm-post_full-ft_text_vision_wd=0_hist=16_act=10_use-hand_aug-shift_act-norm_lr-2e-5_oxe.json",
141
+ "gpus": 8,
142
+ "num_nodes": 4,
143
+ "log_dir": "/mnt/bn/robotics-data-lxh-lq-v2/logs/video_pretrain_manipulation/kosmos/calvin_finetune/2024-11-21/01-16",
144
+ "output_dir": "/mnt/bn/robotics-data-lxh-lq-v2/checkpoints/video_pretrain_manipulation/kosmos/calvin_finetune/2024-11-21/01-16",
145
+ "data_dir": null,
146
+ "annotation_file": null,
147
+ "data_subfolder": null,
148
+ "task_num": null,
149
+ "exp_name": "01-16",
150
+ "use_multi_modal_emb": false,
151
+ "no_video_pretrained_model": false,
152
+ "finetune": false,
153
+ "llm": {
154
+ "type": null,
155
+ "n_embd": null,
156
+ "n_layer": null,
157
+ "n_head": null
158
+ }
159
+ }