belkhale committed on
Commit
ca81022
·
verified ·
1 Parent(s): f70d13e

Upload folder using huggingface_hub

Browse files
Files changed (42) hide show
  1. .gitattributes +2 -0
  2. checkpoints/step-009999-epoch-00-loss=0.7280.pt +3 -0
  3. checkpoints/step-019999-epoch-01-loss=1.4845.pt +3 -0
  4. checkpoints/step-020792-epoch-01-loss=0.5268.pt +3 -0
  5. config.json +61 -0
  6. config.yaml +54 -0
  7. prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7.jsonl +0 -0
  8. run-metrics.jsonl +1 -0
  9. wandb/debug-internal.log +17 -0
  10. wandb/debug.log +32 -0
  11. wandb/latest-run/files/config.yaml +105 -0
  12. wandb/latest-run/files/output.log +4 -0
  13. wandb/latest-run/files/wandb-metadata.json +134 -0
  14. wandb/latest-run/files/wandb-summary.json +1 -0
  15. wandb/latest-run/logs/debug-core.log +16 -0
  16. wandb/latest-run/logs/debug-internal.log +17 -0
  17. wandb/latest-run/logs/debug.log +32 -0
  18. wandb/latest-run/run-jcj67gg8.wandb +3 -0
  19. wandb/run-20241105_192502-8vxhoj6d/files/config.yaml +103 -0
  20. wandb/run-20241105_192502-8vxhoj6d/files/output.log +35 -0
  21. wandb/run-20241105_192502-8vxhoj6d/files/wandb-metadata.json +134 -0
  22. wandb/run-20241105_192502-8vxhoj6d/files/wandb-summary.json +1 -0
  23. wandb/run-20241105_192502-8vxhoj6d/logs/debug-core.log +14 -0
  24. wandb/run-20241105_192502-8vxhoj6d/logs/debug-internal.log +16 -0
  25. wandb/run-20241105_192502-8vxhoj6d/logs/debug.log +26 -0
  26. wandb/run-20241105_192502-8vxhoj6d/run-8vxhoj6d.wandb +0 -0
  27. wandb/run-20241105_192659-mqdqjqly/files/config.yaml +103 -0
  28. wandb/run-20241105_192659-mqdqjqly/files/output.log +43 -0
  29. wandb/run-20241105_192659-mqdqjqly/files/wandb-metadata.json +134 -0
  30. wandb/run-20241105_192659-mqdqjqly/files/wandb-summary.json +1 -0
  31. wandb/run-20241105_192659-mqdqjqly/logs/debug-core.log +14 -0
  32. wandb/run-20241105_192659-mqdqjqly/logs/debug-internal.log +16 -0
  33. wandb/run-20241105_192659-mqdqjqly/logs/debug.log +26 -0
  34. wandb/run-20241105_192659-mqdqjqly/run-mqdqjqly.wandb +0 -0
  35. wandb/run-20241105_193102-jcj67gg8/files/config.yaml +105 -0
  36. wandb/run-20241105_193102-jcj67gg8/files/output.log +4 -0
  37. wandb/run-20241105_193102-jcj67gg8/files/wandb-metadata.json +134 -0
  38. wandb/run-20241105_193102-jcj67gg8/files/wandb-summary.json +1 -0
  39. wandb/run-20241105_193102-jcj67gg8/logs/debug-core.log +16 -0
  40. wandb/run-20241105_193102-jcj67gg8/logs/debug-internal.log +17 -0
  41. wandb/run-20241105_193102-jcj67gg8/logs/debug.log +32 -0
  42. wandb/run-20241105_193102-jcj67gg8/run-jcj67gg8.wandb +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wandb/latest-run/run-jcj67gg8.wandb filter=lfs diff=lfs merge=lfs -text
37
+ wandb/run-20241105_193102-jcj67gg8/run-jcj67gg8.wandb filter=lfs diff=lfs merge=lfs -text
checkpoints/step-009999-epoch-00-loss=0.7280.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcb0567c48a8af51b96e8b73b794de80964badc806defd7badc77007782a5a0a
3
+ size 2630986501
checkpoints/step-019999-epoch-01-loss=1.4845.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71573cb46a219059c298c41f8facc10271b95e0a4cb3c1e0e8c39ac3a66079b8
3
+ size 2630986501
checkpoints/step-020792-epoch-01-loss=0.5268.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d86525b8d21865b2e83d4a403ee9d1b641579bb41259111d7e3cc7f7ed46564a
3
+ size 2630986501
config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": {
3
+ "align_stage_components": [
4
+ "download/llava-laion-cc-sbu-558k/chat.json",
5
+ "download/llava-laion-cc-sbu-558k"
6
+ ],
7
+ "dataset_id": "llava-v15",
8
+ "dataset_root_dir": "/hai/scratch/belkhale/datasets/prismatic-vlms",
9
+ "finetune_stage_components": [
10
+ "download/llava-v1.5-instruct/llava_v1_5_mix665k.json",
11
+ "download/llava-v1.5-instruct"
12
+ ],
13
+ "type": "llava-v15"
14
+ },
15
+ "hf_token": ".hf_token",
16
+ "model": {
17
+ "align_epochs": 1,
18
+ "align_global_batch_size": 96,
19
+ "align_learning_rate": 0.001,
20
+ "align_lr_scheduler_type": "linear-warmup+cosine-decay",
21
+ "align_max_grad_norm": 1.0,
22
+ "align_max_steps": null,
23
+ "align_per_device_batch_size": 16,
24
+ "align_save_every_n_steps": 10000,
25
+ "align_train_strategy": "fsdp-shard-grad-op",
26
+ "align_warmup_ratio": 0.03,
27
+ "align_weight_decay": 0.0,
28
+ "arch_specifier": "no-align+fused-gelu-mlp",
29
+ "enable_gradient_checkpointing": true,
30
+ "enable_mixed_precision_training": true,
31
+ "finetune_epochs": 2,
32
+ "finetune_global_batch_size": 64,
33
+ "finetune_learning_rate": 2e-05,
34
+ "finetune_lr_scheduler_type": "linear-warmup+cosine-decay",
35
+ "finetune_max_grad_norm": 1.0,
36
+ "finetune_max_steps": null,
37
+ "finetune_per_device_batch_size": 4,
38
+ "finetune_save_every_n_steps": 10000,
39
+ "finetune_train_strategy": "fsdp-full-shard",
40
+ "finetune_warmup_ratio": 0.03,
41
+ "finetune_weight_decay": 0.1,
42
+ "image_resize_strategy": "resize-naive",
43
+ "llm_backbone_id": "qwen25-0_5b-extra",
44
+ "llm_max_length": 32768,
45
+ "model_id": "prism-qwen25-extra-dinosiglip-224px+0_5b",
46
+ "reduce_in_full_precision": false,
47
+ "type": "prism-qwen25-extra-dinosiglip-224px+0_5b",
48
+ "vision_backbone_id": "dinosiglip-vit-so-224px"
49
+ },
50
+ "pretrained_checkpoint": null,
51
+ "run_id": "prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7",
52
+ "run_root_dir": "runs",
53
+ "seed": 7,
54
+ "stage": "finetune",
55
+ "trackers": [
56
+ "jsonl",
57
+ "wandb"
58
+ ],
59
+ "wandb_entity": null,
60
+ "wandb_project": "prismatic"
61
+ }
config.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset:
2
+ align_stage_components:
3
+ - download/llava-laion-cc-sbu-558k/chat.json
4
+ - download/llava-laion-cc-sbu-558k
5
+ dataset_id: llava-v15
6
+ dataset_root_dir: /hai/scratch/belkhale/datasets/prismatic-vlms
7
+ finetune_stage_components:
8
+ - download/llava-v1.5-instruct/llava_v1_5_mix665k.json
9
+ - download/llava-v1.5-instruct
10
+ type: llava-v15
11
+ hf_token: .hf_token
12
+ model:
13
+ align_epochs: 1
14
+ align_global_batch_size: 96
15
+ align_learning_rate: 0.001
16
+ align_lr_scheduler_type: linear-warmup+cosine-decay
17
+ align_max_grad_norm: 1.0
18
+ align_max_steps: null
19
+ align_per_device_batch_size: 16
20
+ align_save_every_n_steps: 10000
21
+ align_train_strategy: fsdp-shard-grad-op
22
+ align_warmup_ratio: 0.03
23
+ align_weight_decay: 0.0
24
+ arch_specifier: no-align+fused-gelu-mlp
25
+ enable_gradient_checkpointing: true
26
+ enable_mixed_precision_training: true
27
+ finetune_epochs: 2
28
+ finetune_global_batch_size: 64
29
+ finetune_learning_rate: 2.0e-05
30
+ finetune_lr_scheduler_type: linear-warmup+cosine-decay
31
+ finetune_max_grad_norm: 1.0
32
+ finetune_max_steps: null
33
+ finetune_per_device_batch_size: 4
34
+ finetune_save_every_n_steps: 10000
35
+ finetune_train_strategy: fsdp-full-shard
36
+ finetune_warmup_ratio: 0.03
37
+ finetune_weight_decay: 0.1
38
+ image_resize_strategy: resize-naive
39
+ llm_backbone_id: qwen25-0_5b-extra
40
+ llm_max_length: 32768
41
+ model_id: prism-qwen25-extra-dinosiglip-224px+0_5b
42
+ reduce_in_full_precision: false
43
+ type: prism-qwen25-extra-dinosiglip-224px+0_5b
44
+ vision_backbone_id: dinosiglip-vit-so-224px
45
+ pretrained_checkpoint: null
46
+ run_id: prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7
47
+ run_root_dir: runs
48
+ seed: 7
49
+ stage: finetune
50
+ trackers:
51
+ - jsonl
52
+ - wandb
53
+ wandb_entity: null
54
+ wandb_project: prismatic
prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
run-metrics.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hparams": {"dataset": {"align_stage_components": ["download/llava-laion-cc-sbu-558k/chat.json", "download/llava-laion-cc-sbu-558k"], "dataset_id": "llava-v15", "dataset_root_dir": "/hai/scratch/belkhale/datasets/prismatic-vlms", "finetune_stage_components": ["download/llava-v1.5-instruct/llava_v1_5_mix665k.json", "download/llava-v1.5-instruct"], "type": "llava-v15"}, "hf_token": ".hf_token", "model": {"align_epochs": 1, "align_global_batch_size": 96, "align_learning_rate": 0.001, "align_lr_scheduler_type": "linear-warmup+cosine-decay", "align_max_grad_norm": 1.0, "align_max_steps": null, "align_per_device_batch_size": 16, "align_save_every_n_steps": 10000, "align_train_strategy": "fsdp-shard-grad-op", "align_warmup_ratio": 0.03, "align_weight_decay": 0.0, "arch_specifier": "no-align+fused-gelu-mlp", "enable_gradient_checkpointing": true, "enable_mixed_precision_training": true, "finetune_epochs": 2, "finetune_global_batch_size": 64, "finetune_learning_rate": 2e-05, "finetune_lr_scheduler_type": "linear-warmup+cosine-decay", "finetune_max_grad_norm": 1.0, "finetune_max_steps": null, "finetune_per_device_batch_size": 4, "finetune_save_every_n_steps": 10000, "finetune_train_strategy": "fsdp-full-shard", "finetune_warmup_ratio": 0.03, "finetune_weight_decay": 0.1, "image_resize_strategy": "resize-naive", "llm_backbone_id": "qwen25-0_5b-extra", "llm_max_length": 32768, "model_id": "prism-qwen25-extra-dinosiglip-224px+0_5b", "reduce_in_full_precision": false, "type": "prism-qwen25-extra-dinosiglip-224px+0_5b", "vision_backbone_id": "dinosiglip-vit-so-224px"}, "pretrained_checkpoint": null, "run_id": "prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7", "run_root_dir": "runs", "seed": 7, "stage": "finetune", "trackers": ["jsonl", "wandb"], "wandb_entity": null, "wandb_project": "prismatic"}, "run_id": "prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7"}
wandb/debug-internal.log ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-11-05T19:31:02.519998294-08:00","level":"INFO","msg":"using version","core version":"0.18.5"}
2
+ {"time":"2024-11-05T19:31:02.520021589-08:00","level":"INFO","msg":"created symlink","path":"runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug-core.log"}
3
+ {"time":"2024-11-05T19:31:02.738801523-08:00","level":"INFO","msg":"created new stream","id":"jcj67gg8"}
4
+ {"time":"2024-11-05T19:31:02.738835256-08:00","level":"INFO","msg":"stream: started","id":"jcj67gg8"}
5
+ {"time":"2024-11-05T19:31:02.738905513-08:00","level":"INFO","msg":"sender: started","stream_id":"jcj67gg8"}
6
+ {"time":"2024-11-05T19:31:02.738892436-08:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"jcj67gg8"}}
7
+ {"time":"2024-11-05T19:31:02.738902832-08:00","level":"INFO","msg":"handler: started","stream_id":{"value":"jcj67gg8"}}
8
+ {"time":"2024-11-05T19:31:03.056521545-08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-11-05T23:38:31.946246118-08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2024-11-05T23:38:32.023973784-08:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2024-11-05T23:38:32.922341453-08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.775009129}],"total_operations":1}}
12
+ {"time":"2024-11-05T23:38:33.390878425-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2024-11-05T23:38:34.555591357-08:00","level":"INFO","msg":"stream: closing","id":"jcj67gg8"}
14
+ {"time":"2024-11-05T23:38:34.555667186-08:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"jcj67gg8"}}
15
+ {"time":"2024-11-05T23:38:34.555706801-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"jcj67gg8"}}
16
+ {"time":"2024-11-05T23:38:34.555872-08:00","level":"INFO","msg":"sender: closed","stream_id":"jcj67gg8"}
17
+ {"time":"2024-11-05T23:38:34.558526671-08:00","level":"INFO","msg":"stream: closed","id":"jcj67gg8"}
wandb/debug.log ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Configure stats pid to 2188020
3
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/.config/wandb/settings
4
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/openvla-mini/wandb/settings
5
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'_service_wait': '300'}
6
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'scripts/pretrain.py', 'program_abspath': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py', 'program': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py'}
8
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:_log_setup():534] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug.log
10
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:_log_setup():535] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug-internal.log
11
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():621] calling init triggers
12
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-0_5b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 64, 'finetune_per_device_batch_size': 4, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': '/hai/scratch/belkhale/datasets/prismatic-vlms'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None}
14
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():671] starting backend
15
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-11-05 19:31:02,513 INFO MainThread:2188020 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-11-05 19:31:02,513 INFO MainThread:2188020 [wandb_init.py:init():688] backend started and connected
18
+ 2024-11-05 19:31:02,515 INFO MainThread:2188020 [wandb_init.py:init():783] updated telemetry
19
+ 2024-11-05 19:31:02,573 INFO MainThread:2188020 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-11-05 19:31:03,050 INFO MainThread:2188020 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-11-05 19:31:03,226 INFO MainThread:2188020 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-11-05 19:31:03,230 INFO MainThread:2188020 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-11-05 23:38:31,920 INFO MainThread:2188020 [wandb_run.py:_finish():2158] finishing run belkhale/prismatic/jcj67gg8
27
+ 2024-11-05 23:38:31,920 INFO MainThread:2188020 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
28
+ 2024-11-05 23:38:31,921 INFO MainThread:2188020 [wandb_run.py:_restore():2408] restore
29
+ 2024-11-05 23:38:31,921 INFO MainThread:2188020 [wandb_run.py:_restore():2414] restore done
30
+ 2024-11-05 23:38:34,516 INFO MainThread:2188020 [wandb_run.py:_footer_history_summary_info():3975] rendering history
31
+ 2024-11-05 23:38:34,517 INFO MainThread:2188020 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
32
+ 2024-11-05 23:38:34,534 INFO MainThread:2188020 [wandb_run.py:_footer_sync_info():3934] logging synced files
wandb/latest-run/files/config.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.18.5
4
+ m: []
5
+ python_version: 3.10.15
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 2
10
+ - 3
11
+ - 11
12
+ - 41
13
+ - 49
14
+ - 55
15
+ - 63
16
+ - 71
17
+ "2":
18
+ - 1
19
+ - 2
20
+ - 3
21
+ - 11
22
+ - 41
23
+ - 49
24
+ - 55
25
+ - 63
26
+ - 71
27
+ "3":
28
+ - 2
29
+ - 13
30
+ - 16
31
+ - 23
32
+ - 55
33
+ - 61
34
+ "4": 3.10.15
35
+ "5": 0.18.5
36
+ "6": 4.40.1
37
+ "8":
38
+ - 5
39
+ "12": 0.18.5
40
+ "13": linux-x86_64
41
+ dataset:
42
+ value:
43
+ align_stage_components:
44
+ - download/llava-laion-cc-sbu-558k/chat.json
45
+ - download/llava-laion-cc-sbu-558k
46
+ dataset_id: llava-v15
47
+ dataset_root_dir: /hai/scratch/belkhale/datasets/prismatic-vlms
48
+ finetune_stage_components:
49
+ - download/llava-v1.5-instruct/llava_v1_5_mix665k.json
50
+ - download/llava-v1.5-instruct
51
+ type: llava-v15
52
+ hf_token:
53
+ value: .hf_token
54
+ model:
55
+ value:
56
+ align_epochs: 1
57
+ align_global_batch_size: 96
58
+ align_learning_rate: 0.001
59
+ align_lr_scheduler_type: linear-warmup+cosine-decay
60
+ align_max_grad_norm: 1
61
+ align_max_steps: null
62
+ align_per_device_batch_size: 16
63
+ align_save_every_n_steps: 10000
64
+ align_train_strategy: fsdp-shard-grad-op
65
+ align_warmup_ratio: 0.03
66
+ align_weight_decay: 0
67
+ arch_specifier: no-align+fused-gelu-mlp
68
+ enable_gradient_checkpointing: true
69
+ enable_mixed_precision_training: true
70
+ finetune_epochs: 2
71
+ finetune_global_batch_size: 64
72
+ finetune_learning_rate: 2e-05
73
+ finetune_lr_scheduler_type: linear-warmup+cosine-decay
74
+ finetune_max_grad_norm: 1
75
+ finetune_max_steps: null
76
+ finetune_per_device_batch_size: 4
77
+ finetune_save_every_n_steps: 10000
78
+ finetune_train_strategy: fsdp-full-shard
79
+ finetune_warmup_ratio: 0.03
80
+ finetune_weight_decay: 0.1
81
+ image_resize_strategy: resize-naive
82
+ llm_backbone_id: qwen25-0_5b-extra
83
+ llm_max_length: 32768
84
+ model_id: prism-qwen25-extra-dinosiglip-224px+0_5b
85
+ reduce_in_full_precision: false
86
+ type: prism-qwen25-extra-dinosiglip-224px+0_5b
87
+ vision_backbone_id: dinosiglip-vit-so-224px
88
+ pretrained_checkpoint:
89
+ value: null
90
+ run_id:
91
+ value: prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7
92
+ run_root_dir:
93
+ value: runs
94
+ seed:
95
+ value: 7
96
+ stage:
97
+ value: finetune
98
+ trackers:
99
+ value:
100
+ - jsonl
101
+ - wandb
102
+ wandb_entity:
103
+ value: null
104
+ wandb_project:
105
+ value: prismatic
wandb/latest-run/files/output.log ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 11/05 [19:31:03] INFO | >> [*] Starting Training Loop pretrain.py:227
2
+
3
+ 11/05 [23:38:31] INFO | >> [*] Done with Training =>> pretrain.py:231
4
+ Finalizing Metrics
wandb/latest-run/files/wandb-metadata.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-116-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.15",
4
+ "startedAt": "2024-11-06T03:31:02.513650Z",
5
+ "args": [
6
+ "--model.type",
7
+ "prism-qwen25-extra-dinosiglip-224px+0_5b",
8
+ "--model.finetune_global_batch_size",
9
+ "64",
10
+ "--model.finetune_per_device_batch_size",
11
+ "4"
12
+ ],
13
+ "program": "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py",
14
+ "codePath": "scripts/pretrain.py",
15
+ "git": {
16
+ "remote": "[email protected]:Stanford-ILIAD/openvla-mini.git",
17
+ "commit": "05073927b096dab7d326a3e39db9262f08d3a8ae"
18
+ },
19
+ "email": "[email protected]",
20
+ "root": "runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7",
21
+ "host": "haic-hgx-2.stanford.edu",
22
+ "username": "belkhale",
23
+ "executable": "/hai/scratch/belkhale/miniforge3/envs/vla/bin/python3.10",
24
+ "codePathLocal": "scripts/pretrain.py",
25
+ "cpu_count": 112,
26
+ "cpu_count_logical": 224,
27
+ "gpu": "NVIDIA H100 80GB HBM3",
28
+ "gpu_count": 8,
29
+ "disk": {
30
+ "/": {
31
+ "total": "942725181440",
32
+ "used": "50880847872"
33
+ }
34
+ },
35
+ "memory": {
36
+ "total": "2164104577024"
37
+ },
38
+ "cpu": {
39
+ "count": 112,
40
+ "countLogical": 224
41
+ },
42
+ "gpu_nvidia": [
43
+ {
44
+ "name": "NVIDIA H100 80GB HBM3",
45
+ "memoryTotal": "85520809984",
46
+ "cudaCores": 16896,
47
+ "architecture": "Hopper"
48
+ },
49
+ {
50
+ "name": "NVIDIA H100 80GB HBM3",
51
+ "memoryTotal": "85520809984",
52
+ "cudaCores": 16896,
53
+ "architecture": "Hopper"
54
+ },
55
+ {
56
+ "name": "NVIDIA H100 80GB HBM3",
57
+ "memoryTotal": "85520809984",
58
+ "cudaCores": 16896,
59
+ "architecture": "Hopper"
60
+ },
61
+ {
62
+ "name": "NVIDIA H100 80GB HBM3",
63
+ "memoryTotal": "85520809984",
64
+ "cudaCores": 16896,
65
+ "architecture": "Hopper"
66
+ },
67
+ {
68
+ "name": "NVIDIA H100 80GB HBM3",
69
+ "memoryTotal": "85520809984",
70
+ "cudaCores": 16896,
71
+ "architecture": "Hopper"
72
+ },
73
+ {
74
+ "name": "NVIDIA H100 80GB HBM3",
75
+ "memoryTotal": "85520809984",
76
+ "cudaCores": 16896,
77
+ "architecture": "Hopper"
78
+ },
79
+ {
80
+ "name": "NVIDIA H100 80GB HBM3",
81
+ "memoryTotal": "85520809984",
82
+ "cudaCores": 16896,
83
+ "architecture": "Hopper"
84
+ },
85
+ {
86
+ "name": "NVIDIA H100 80GB HBM3",
87
+ "memoryTotal": "85520809984",
88
+ "cudaCores": 16896,
89
+ "architecture": "Hopper"
90
+ }
91
+ ],
92
+ "slurm": {
93
+ "cluster_name": "haic",
94
+ "conf": "/usr/local/etc/slurm.conf",
95
+ "cpus_on_node": "64",
96
+ "cpus_per_task": "64",
97
+ "gpus_on_node": "8",
98
+ "gtids": "0",
99
+ "job_account": "models",
100
+ "job_cpus_per_node": "64",
101
+ "job_end_time": "1731122999",
102
+ "job_gid": "37",
103
+ "job_gpus": "0,1,2,3,4,5,6,7",
104
+ "job_id": "11026",
105
+ "job_name": "pretrain",
106
+ "job_nodelist": "haic-hgx-2",
107
+ "job_num_nodes": "1",
108
+ "job_partition": "hai",
109
+ "job_qos": "models",
110
+ "job_start_time": "1730863799",
111
+ "job_uid": "377095",
112
+ "job_user": "belkhale",
113
+ "jobid": "11026",
114
+ "localid": "0",
115
+ "mem_per_node": "102400",
116
+ "nnodes": "1",
117
+ "nodeid": "0",
118
+ "nodelist": "haic-hgx-2",
119
+ "nprocs": "1",
120
+ "ntasks": "1",
121
+ "ntasks_per_node": "1",
122
+ "prio_process": "0",
123
+ "procid": "0",
124
+ "script_context": "prolog_task",
125
+ "submit_dir": "/hai/scratch/belkhale/openvla-mini",
126
+ "submit_host": "haic.stanford.edu",
127
+ "task_pid": "2187908",
128
+ "tasks_per_node": "1",
129
+ "topology_addr": "haic-hgx-2",
130
+ "topology_addr_pattern": "node",
131
+ "tres_per_task": "cpu=64"
132
+ },
133
+ "cudaVersion": "12.4"
134
+ }
wandb/latest-run/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":14849},"_runtime":14849.432571063,"_step":20792,"Finetune/Loss":0.734754204750061,"Finetune/Loss (Raw)":0.7624474763870239,"Finetune/Learning Rate":0,"Finetune/Step Time":0.7374007441103458,"_timestamp":1.7308787040734835e+09,"Finetune/Step":20792}
wandb/latest-run/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-11-05T19:31:02.167132681-08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/belkhale/tmpowkszwq0/port-2188020.txt","pid":2188020,"debug":false,"disable-analytics":false}
2
+ {"time":"2024-11-05T19:31:02.167154904-08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2024-11-05T19:31:02.168180089-08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":2188020}
4
+ {"time":"2024-11-05T19:31:02.168180088-08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":35793,"Zone":""}}
5
+ {"time":"2024-11-05T19:31:02.357031058-08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:51484"}
6
+ {"time":"2024-11-05T19:31:02.518306545-08:00","level":"INFO","msg":"handleInformInit: received","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
7
+ {"time":"2024-11-05T19:31:02.738838516-08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
8
+ {"time":"2024-11-05T23:38:34.554528568-08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
9
+ {"time":"2024-11-05T23:38:34.558563756-08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
10
+ {"time":"2024-11-05T23:42:06.504927152-08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:51484"}
11
+ {"time":"2024-11-05T23:42:06.505861575-08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:51484"}
12
+ {"time":"2024-11-05T23:42:06.505880856-08:00","level":"INFO","msg":"server is shutting down"}
13
+ {"time":"2024-11-05T23:42:06.50594903-08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:51484"}
14
+ {"time":"2024-11-05T23:42:06.506095025-08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:51484"}
15
+ {"time":"2024-11-05T23:42:06.506152479-08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:51484"}
16
+ {"time":"2024-11-05T23:42:06.506171224-08:00","level":"INFO","msg":"server is closed"}
wandb/latest-run/logs/debug-internal.log ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-11-05T19:31:02.519998294-08:00","level":"INFO","msg":"using version","core version":"0.18.5"}
2
+ {"time":"2024-11-05T19:31:02.520021589-08:00","level":"INFO","msg":"created symlink","path":"runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug-core.log"}
3
+ {"time":"2024-11-05T19:31:02.738801523-08:00","level":"INFO","msg":"created new stream","id":"jcj67gg8"}
4
+ {"time":"2024-11-05T19:31:02.738835256-08:00","level":"INFO","msg":"stream: started","id":"jcj67gg8"}
5
+ {"time":"2024-11-05T19:31:02.738905513-08:00","level":"INFO","msg":"sender: started","stream_id":"jcj67gg8"}
6
+ {"time":"2024-11-05T19:31:02.738892436-08:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"jcj67gg8"}}
7
+ {"time":"2024-11-05T19:31:02.738902832-08:00","level":"INFO","msg":"handler: started","stream_id":{"value":"jcj67gg8"}}
8
+ {"time":"2024-11-05T19:31:03.056521545-08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-11-05T23:38:31.946246118-08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2024-11-05T23:38:32.023973784-08:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2024-11-05T23:38:32.922341453-08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.775009129}],"total_operations":1}}
12
+ {"time":"2024-11-05T23:38:33.390878425-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2024-11-05T23:38:34.555591357-08:00","level":"INFO","msg":"stream: closing","id":"jcj67gg8"}
14
+ {"time":"2024-11-05T23:38:34.555667186-08:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"jcj67gg8"}}
15
+ {"time":"2024-11-05T23:38:34.555706801-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"jcj67gg8"}}
16
+ {"time":"2024-11-05T23:38:34.555872-08:00","level":"INFO","msg":"sender: closed","stream_id":"jcj67gg8"}
17
+ {"time":"2024-11-05T23:38:34.558526671-08:00","level":"INFO","msg":"stream: closed","id":"jcj67gg8"}
wandb/latest-run/logs/debug.log ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Configure stats pid to 2188020
3
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/.config/wandb/settings
4
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/openvla-mini/wandb/settings
5
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'_service_wait': '300'}
6
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'scripts/pretrain.py', 'program_abspath': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py', 'program': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py'}
8
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:_log_setup():534] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug.log
10
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:_log_setup():535] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug-internal.log
11
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():621] calling init triggers
12
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-0_5b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 64, 'finetune_per_device_batch_size': 4, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': '/hai/scratch/belkhale/datasets/prismatic-vlms'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None}
14
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():671] starting backend
15
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-11-05 19:31:02,513 INFO MainThread:2188020 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-11-05 19:31:02,513 INFO MainThread:2188020 [wandb_init.py:init():688] backend started and connected
18
+ 2024-11-05 19:31:02,515 INFO MainThread:2188020 [wandb_init.py:init():783] updated telemetry
19
+ 2024-11-05 19:31:02,573 INFO MainThread:2188020 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-11-05 19:31:03,050 INFO MainThread:2188020 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-11-05 19:31:03,226 INFO MainThread:2188020 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-11-05 19:31:03,230 INFO MainThread:2188020 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-11-05 23:38:31,920 INFO MainThread:2188020 [wandb_run.py:_finish():2158] finishing run belkhale/prismatic/jcj67gg8
27
+ 2024-11-05 23:38:31,920 INFO MainThread:2188020 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
28
+ 2024-11-05 23:38:31,921 INFO MainThread:2188020 [wandb_run.py:_restore():2408] restore
29
+ 2024-11-05 23:38:31,921 INFO MainThread:2188020 [wandb_run.py:_restore():2414] restore done
30
+ 2024-11-05 23:38:34,516 INFO MainThread:2188020 [wandb_run.py:_footer_history_summary_info():3975] rendering history
31
+ 2024-11-05 23:38:34,517 INFO MainThread:2188020 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
32
+ 2024-11-05 23:38:34,534 INFO MainThread:2188020 [wandb_run.py:_footer_sync_info():3934] logging synced files
wandb/latest-run/run-jcj67gg8.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1e028b995d945fb180c85455d7219269515a28888eda671f635380e4dac0d23
3
+ size 37709802
wandb/run-20241105_192502-8vxhoj6d/files/config.yaml ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.18.5
4
+ m: []
5
+ python_version: 3.10.15
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 2
10
+ - 3
11
+ - 11
12
+ - 41
13
+ - 49
14
+ - 55
15
+ - 63
16
+ - 71
17
+ "2":
18
+ - 1
19
+ - 2
20
+ - 3
21
+ - 11
22
+ - 41
23
+ - 49
24
+ - 55
25
+ - 63
26
+ - 71
27
+ "3":
28
+ - 13
29
+ - 16
30
+ - 23
31
+ - 55
32
+ "4": 3.10.15
33
+ "5": 0.18.5
34
+ "6": 4.40.1
35
+ "8":
36
+ - 5
37
+ "12": 0.18.5
38
+ "13": linux-x86_64
39
+ dataset:
40
+ value:
41
+ align_stage_components:
42
+ - download/llava-laion-cc-sbu-558k/chat.json
43
+ - download/llava-laion-cc-sbu-558k
44
+ dataset_id: llava-v15
45
+ dataset_root_dir: /hai/scratch/belkhale/datasets/prismatic-vlms
46
+ finetune_stage_components:
47
+ - download/llava-v1.5-instruct/llava_v1_5_mix665k.json
48
+ - download/llava-v1.5-instruct
49
+ type: llava-v15
50
+ hf_token:
51
+ value: .hf_token
52
+ model:
53
+ value:
54
+ align_epochs: 1
55
+ align_global_batch_size: 96
56
+ align_learning_rate: 0.001
57
+ align_lr_scheduler_type: linear-warmup+cosine-decay
58
+ align_max_grad_norm: 1
59
+ align_max_steps: null
60
+ align_per_device_batch_size: 16
61
+ align_save_every_n_steps: 10000
62
+ align_train_strategy: fsdp-shard-grad-op
63
+ align_warmup_ratio: 0.03
64
+ align_weight_decay: 0
65
+ arch_specifier: no-align+fused-gelu-mlp
66
+ enable_gradient_checkpointing: true
67
+ enable_mixed_precision_training: true
68
+ finetune_epochs: 2
69
+ finetune_global_batch_size: 128
70
+ finetune_learning_rate: 2e-05
71
+ finetune_lr_scheduler_type: linear-warmup+cosine-decay
72
+ finetune_max_grad_norm: 1
73
+ finetune_max_steps: null
74
+ finetune_per_device_batch_size: 16
75
+ finetune_save_every_n_steps: 10000
76
+ finetune_train_strategy: fsdp-full-shard
77
+ finetune_warmup_ratio: 0.03
78
+ finetune_weight_decay: 0.1
79
+ image_resize_strategy: resize-naive
80
+ llm_backbone_id: qwen25-0_5b-extra
81
+ llm_max_length: 32768
82
+ model_id: prism-qwen25-extra-dinosiglip-224px+0_5b
83
+ reduce_in_full_precision: false
84
+ type: prism-qwen25-extra-dinosiglip-224px+0_5b
85
+ vision_backbone_id: dinosiglip-vit-so-224px
86
+ pretrained_checkpoint:
87
+ value: null
88
+ run_id:
89
+ value: prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7
90
+ run_root_dir:
91
+ value: runs
92
+ seed:
93
+ value: 7
94
+ stage:
95
+ value: finetune
96
+ trackers:
97
+ value:
98
+ - jsonl
99
+ - wandb
100
+ wandb_entity:
101
+ value: null
102
+ wandb_project:
103
+ value: prismatic
wandb/run-20241105_192502-8vxhoj6d/files/output.log ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 11/05 [19:25:05] INFO | >> [*] Starting Training Loop pretrain.py:227
2
+ Traceback (most recent call last):
3
+ File "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py", line 241, in <module>
4
+ pretrain()
5
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/draccus/argparsing.py", line 203, in wrapper_inner
6
+ response = fn(cfg, *args, **kwargs)
7
+ File "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py", line 228, in pretrain
8
+ train_strategy.run_training(train_dataset, collator, metrics, stage=cfg.stage, seed=cfg.seed)
9
+ File "/hai/scratch/belkhale/openvla-mini/prismatic/training/strategies/base_strategy.py", line 190, in run_training
10
+ output: CausalLMOutputWithPast = self.vlm(
11
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
12
+ return self._call_impl(*args, **kwargs)
13
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
14
+ return forward_call(*args, **kwargs)
15
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
16
+ output = self._fsdp_wrapped_module(*args, **kwargs)
17
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
18
+ return self._call_impl(*args, **kwargs)
19
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
20
+ return forward_call(*args, **kwargs)
21
+ File "/hai/scratch/belkhale/openvla-mini/prismatic/models/vlms/prismatic.py", line 470, in forward
22
+ return self.llm_backbone(
23
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
24
+ return self._call_impl(*args, **kwargs)
25
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
26
+ return forward_call(*args, **kwargs)
27
+ File "/hai/scratch/belkhale/openvla-mini/prismatic/models/backbones/llm/base_llm.py", line 221, in forward
28
+ output: CausalLMOutputWithPast = self.llm(
29
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
30
+ return self._call_impl(*args, **kwargs)
31
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
32
+ return forward_call(*args, **kwargs)
33
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 1183, in forward
34
+ logits = logits.float()
35
+ torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 49.14 GiB. GPU 0 has a total capacity of 79.10 GiB of which 42.31 GiB is free. Including non-PyTorch memory, this process has 36.77 GiB memory in use. Of the allocated memory 29.84 GiB is allocated by PyTorch, and 1.02 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
wandb/run-20241105_192502-8vxhoj6d/files/wandb-metadata.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-116-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.15",
4
+ "startedAt": "2024-11-06T03:25:02.892307Z",
5
+ "args": [
6
+ "--model.type",
7
+ "prism-qwen25-extra-dinosiglip-224px+0_5b",
8
+ "--model.finetune_global_batch_size",
9
+ "128",
10
+ "--model.finetune_per_device_batch_size",
11
+ "16"
12
+ ],
13
+ "program": "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py",
14
+ "codePath": "scripts/pretrain.py",
15
+ "git": {
16
+ "remote": "[email protected]:Stanford-ILIAD/openvla-mini.git",
17
+ "commit": "05073927b096dab7d326a3e39db9262f08d3a8ae"
18
+ },
19
+ "email": "[email protected]",
20
+ "root": "runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7",
21
+ "host": "haic-hgx-2.stanford.edu",
22
+ "username": "belkhale",
23
+ "executable": "/hai/scratch/belkhale/miniforge3/envs/vla/bin/python3.10",
24
+ "codePathLocal": "scripts/pretrain.py",
25
+ "cpu_count": 112,
26
+ "cpu_count_logical": 224,
27
+ "gpu": "NVIDIA H100 80GB HBM3",
28
+ "gpu_count": 8,
29
+ "disk": {
30
+ "/": {
31
+ "total": "942725181440",
32
+ "used": "50880245760"
33
+ }
34
+ },
35
+ "memory": {
36
+ "total": "2164104577024"
37
+ },
38
+ "cpu": {
39
+ "count": 112,
40
+ "countLogical": 224
41
+ },
42
+ "gpu_nvidia": [
43
+ {
44
+ "name": "NVIDIA H100 80GB HBM3",
45
+ "memoryTotal": "85520809984",
46
+ "cudaCores": 16896,
47
+ "architecture": "Hopper"
48
+ },
49
+ {
50
+ "name": "NVIDIA H100 80GB HBM3",
51
+ "memoryTotal": "85520809984",
52
+ "cudaCores": 16896,
53
+ "architecture": "Hopper"
54
+ },
55
+ {
56
+ "name": "NVIDIA H100 80GB HBM3",
57
+ "memoryTotal": "85520809984",
58
+ "cudaCores": 16896,
59
+ "architecture": "Hopper"
60
+ },
61
+ {
62
+ "name": "NVIDIA H100 80GB HBM3",
63
+ "memoryTotal": "85520809984",
64
+ "cudaCores": 16896,
65
+ "architecture": "Hopper"
66
+ },
67
+ {
68
+ "name": "NVIDIA H100 80GB HBM3",
69
+ "memoryTotal": "85520809984",
70
+ "cudaCores": 16896,
71
+ "architecture": "Hopper"
72
+ },
73
+ {
74
+ "name": "NVIDIA H100 80GB HBM3",
75
+ "memoryTotal": "85520809984",
76
+ "cudaCores": 16896,
77
+ "architecture": "Hopper"
78
+ },
79
+ {
80
+ "name": "NVIDIA H100 80GB HBM3",
81
+ "memoryTotal": "85520809984",
82
+ "cudaCores": 16896,
83
+ "architecture": "Hopper"
84
+ },
85
+ {
86
+ "name": "NVIDIA H100 80GB HBM3",
87
+ "memoryTotal": "85520809984",
88
+ "cudaCores": 16896,
89
+ "architecture": "Hopper"
90
+ }
91
+ ],
92
+ "slurm": {
93
+ "cluster_name": "haic",
94
+ "conf": "/usr/local/etc/slurm.conf",
95
+ "cpus_on_node": "64",
96
+ "cpus_per_task": "64",
97
+ "gpus_on_node": "8",
98
+ "gtids": "0",
99
+ "job_account": "models",
100
+ "job_cpus_per_node": "64",
101
+ "job_end_time": "1731122631",
102
+ "job_gid": "37",
103
+ "job_gpus": "0,1,2,3,4,5,6,7",
104
+ "job_id": "11023",
105
+ "job_name": "pretrain",
106
+ "job_nodelist": "haic-hgx-2",
107
+ "job_num_nodes": "1",
108
+ "job_partition": "hai",
109
+ "job_qos": "models",
110
+ "job_start_time": "1730863431",
111
+ "job_uid": "377095",
112
+ "job_user": "belkhale",
113
+ "jobid": "11023",
114
+ "localid": "0",
115
+ "mem_per_node": "102400",
116
+ "nnodes": "1",
117
+ "nodeid": "0",
118
+ "nodelist": "haic-hgx-2",
119
+ "nprocs": "1",
120
+ "ntasks": "1",
121
+ "ntasks_per_node": "1",
122
+ "prio_process": "0",
123
+ "procid": "0",
124
+ "script_context": "prolog_task",
125
+ "submit_dir": "/hai/scratch/belkhale/openvla-mini",
126
+ "submit_host": "haic.stanford.edu",
127
+ "task_pid": "2182609",
128
+ "tasks_per_node": "1",
129
+ "topology_addr": "haic-hgx-2",
130
+ "topology_addr_pattern": "node",
131
+ "tres_per_task": "cpu=64"
132
+ },
133
+ "cudaVersion": "12.4"
134
+ }
wandb/run-20241105_192502-8vxhoj6d/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":22}}
wandb/run-20241105_192502-8vxhoj6d/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-11-05T19:25:02.690703685-08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/belkhale/tmp97kp6p__/port-2182724.txt","pid":2182724,"debug":false,"disable-analytics":false}
2
+ {"time":"2024-11-05T19:25:02.690734566-08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2024-11-05T19:25:02.692100649-08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":2182724}
4
+ {"time":"2024-11-05T19:25:02.69208955-08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41525,"Zone":""}}
5
+ {"time":"2024-11-05T19:25:02.744814731-08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:59560"}
6
+ {"time":"2024-11-05T19:25:02.895430941-08:00","level":"INFO","msg":"handleInformInit: received","streamId":"8vxhoj6d","id":"127.0.0.1:59560"}
7
+ {"time":"2024-11-05T19:25:03.194511518-08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"8vxhoj6d","id":"127.0.0.1:59560"}
8
+ {"time":"2024-11-05T19:25:25.782400709-08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:59560"}
9
+ {"time":"2024-11-05T19:25:25.782779877-08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:59560"}
10
+ {"time":"2024-11-05T19:25:25.782799272-08:00","level":"INFO","msg":"server is shutting down"}
11
+ {"time":"2024-11-05T19:25:25.783947145-08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:59560"}
12
+ {"time":"2024-11-05T19:25:27.304522773-08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:59560"}
13
+ {"time":"2024-11-05T19:25:27.304560668-08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:59560"}
14
+ {"time":"2024-11-05T19:25:27.304578483-08:00","level":"INFO","msg":"server is closed"}
wandb/run-20241105_192502-8vxhoj6d/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-11-05T19:25:02.896392755-08:00","level":"INFO","msg":"using version","core version":"0.18.5"}
2
+ {"time":"2024-11-05T19:25:02.896416546-08:00","level":"INFO","msg":"created symlink","path":"runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_192502-8vxhoj6d/logs/debug-core.log"}
3
+ {"time":"2024-11-05T19:25:03.194464738-08:00","level":"INFO","msg":"created new stream","id":"8vxhoj6d"}
4
+ {"time":"2024-11-05T19:25:03.194508127-08:00","level":"INFO","msg":"stream: started","id":"8vxhoj6d"}
5
+ {"time":"2024-11-05T19:25:03.194578475-08:00","level":"INFO","msg":"sender: started","stream_id":"8vxhoj6d"}
6
+ {"time":"2024-11-05T19:25:03.194561979-08:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"8vxhoj6d"}}
7
+ {"time":"2024-11-05T19:25:03.194595326-08:00","level":"INFO","msg":"handler: started","stream_id":{"value":"8vxhoj6d"}}
8
+ {"time":"2024-11-05T19:25:03.382215025-08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-11-05T19:25:25.782775323-08:00","level":"INFO","msg":"stream: closing","id":"8vxhoj6d"}
10
+ {"time":"2024-11-05T19:25:25.784095739-08:00","level":"INFO","msg":"Stopping system monitor"}
11
+ {"time":"2024-11-05T19:25:25.785772774-08:00","level":"INFO","msg":"Stopped system monitor"}
12
+ {"time":"2024-11-05T19:25:27.185173962-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2024-11-05T19:25:27.303068441-08:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"8vxhoj6d"}}
14
+ {"time":"2024-11-05T19:25:27.303126471-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"8vxhoj6d"}}
15
+ {"time":"2024-11-05T19:25:27.303147247-08:00","level":"INFO","msg":"sender: closed","stream_id":"8vxhoj6d"}
16
+ {"time":"2024-11-05T19:25:27.304415448-08:00","level":"INFO","msg":"stream: closed","id":"8vxhoj6d"}
wandb/run-20241105_192502-8vxhoj6d/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Configure stats pid to 2182724
3
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/.config/wandb/settings
4
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/openvla-mini/wandb/settings
5
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'_service_wait': '300'}
6
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'scripts/pretrain.py', 'program_abspath': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py', 'program': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py'}
8
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_init.py:_log_setup():534] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_192502-8vxhoj6d/logs/debug.log
10
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_init.py:_log_setup():535] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_192502-8vxhoj6d/logs/debug-internal.log
11
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_init.py:init():621] calling init triggers
12
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-0_5b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 128, 'finetune_per_device_batch_size': 16, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': '/hai/scratch/belkhale/datasets/prismatic-vlms'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None}
14
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_init.py:init():671] starting backend
15
+ 2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-11-05 19:25:02,891 INFO MainThread:2182724 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-11-05 19:25:02,892 INFO MainThread:2182724 [wandb_init.py:init():688] backend started and connected
18
+ 2024-11-05 19:25:02,893 INFO MainThread:2182724 [wandb_init.py:init():783] updated telemetry
19
+ 2024-11-05 19:25:03,020 INFO MainThread:2182724 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-11-05 19:25:03,377 INFO MainThread:2182724 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-11-05 19:25:05,102 INFO MainThread:2182724 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-11-05 19:25:05,102 INFO MainThread:2182724 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-11-05 19:25:05,103 INFO MainThread:2182724 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-11-05 19:25:05,103 INFO MainThread:2182724 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-11-05 19:25:05,128 INFO MainThread:2182724 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-11-05 19:25:25,783 WARNING MsgRouterThr:2182724 [router.py:message_loop():77] message_loop has been closed
wandb/run-20241105_192502-8vxhoj6d/run-8vxhoj6d.wandb ADDED
Binary file (30.6 kB). View file
 
wandb/run-20241105_192659-mqdqjqly/files/config.yaml ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.18.5
4
+ m: []
5
+ python_version: 3.10.15
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 2
10
+ - 3
11
+ - 11
12
+ - 41
13
+ - 49
14
+ - 55
15
+ - 63
16
+ - 71
17
+ "2":
18
+ - 1
19
+ - 2
20
+ - 3
21
+ - 11
22
+ - 41
23
+ - 49
24
+ - 55
25
+ - 63
26
+ - 71
27
+ "3":
28
+ - 13
29
+ - 16
30
+ - 23
31
+ - 55
32
+ "4": 3.10.15
33
+ "5": 0.18.5
34
+ "6": 4.40.1
35
+ "8":
36
+ - 5
37
+ "12": 0.18.5
38
+ "13": linux-x86_64
39
+ dataset:
40
+ value:
41
+ align_stage_components:
42
+ - download/llava-laion-cc-sbu-558k/chat.json
43
+ - download/llava-laion-cc-sbu-558k
44
+ dataset_id: llava-v15
45
+ dataset_root_dir: /hai/scratch/belkhale/datasets/prismatic-vlms
46
+ finetune_stage_components:
47
+ - download/llava-v1.5-instruct/llava_v1_5_mix665k.json
48
+ - download/llava-v1.5-instruct
49
+ type: llava-v15
50
+ hf_token:
51
+ value: .hf_token
52
+ model:
53
+ value:
54
+ align_epochs: 1
55
+ align_global_batch_size: 96
56
+ align_learning_rate: 0.001
57
+ align_lr_scheduler_type: linear-warmup+cosine-decay
58
+ align_max_grad_norm: 1
59
+ align_max_steps: null
60
+ align_per_device_batch_size: 16
61
+ align_save_every_n_steps: 10000
62
+ align_train_strategy: fsdp-shard-grad-op
63
+ align_warmup_ratio: 0.03
64
+ align_weight_decay: 0
65
+ arch_specifier: no-align+fused-gelu-mlp
66
+ enable_gradient_checkpointing: true
67
+ enable_mixed_precision_training: true
68
+ finetune_epochs: 2
69
+ finetune_global_batch_size: 64
70
+ finetune_learning_rate: 2e-05
71
+ finetune_lr_scheduler_type: linear-warmup+cosine-decay
72
+ finetune_max_grad_norm: 1
73
+ finetune_max_steps: null
74
+ finetune_per_device_batch_size: 8
75
+ finetune_save_every_n_steps: 10000
76
+ finetune_train_strategy: fsdp-full-shard
77
+ finetune_warmup_ratio: 0.03
78
+ finetune_weight_decay: 0.1
79
+ image_resize_strategy: resize-naive
80
+ llm_backbone_id: qwen25-0_5b-extra
81
+ llm_max_length: 32768
82
+ model_id: prism-qwen25-extra-dinosiglip-224px+0_5b
83
+ reduce_in_full_precision: false
84
+ type: prism-qwen25-extra-dinosiglip-224px+0_5b
85
+ vision_backbone_id: dinosiglip-vit-so-224px
86
+ pretrained_checkpoint:
87
+ value: null
88
+ run_id:
89
+ value: prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7
90
+ run_root_dir:
91
+ value: runs
92
+ seed:
93
+ value: 7
94
+ stage:
95
+ value: finetune
96
+ trackers:
97
+ value:
98
+ - jsonl
99
+ - wandb
100
+ wandb_entity:
101
+ value: null
102
+ wandb_project:
103
+ value: prismatic
wandb/run-20241105_192659-mqdqjqly/files/output.log ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 11/05 [19:26:59] INFO | >> [*] Starting Training Loop pretrain.py:227
2
+ Traceback (most recent call last):
3
+ File "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py", line 241, in <module>
4
+ pretrain()
5
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/draccus/argparsing.py", line 203, in wrapper_inner
6
+ response = fn(cfg, *args, **kwargs)
7
+ File "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py", line 228, in pretrain
8
+ train_strategy.run_training(train_dataset, collator, metrics, stage=cfg.stage, seed=cfg.seed)
9
+ File "/hai/scratch/belkhale/openvla-mini/prismatic/training/strategies/base_strategy.py", line 190, in run_training
10
+ output: CausalLMOutputWithPast = self.vlm(
11
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
12
+ return self._call_impl(*args, **kwargs)
13
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
14
+ return forward_call(*args, **kwargs)
15
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
16
+ output = self._fsdp_wrapped_module(*args, **kwargs)
17
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
18
+ return self._call_impl(*args, **kwargs)
19
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
20
+ return forward_call(*args, **kwargs)
21
+ File "/hai/scratch/belkhale/openvla-mini/prismatic/models/vlms/prismatic.py", line 470, in forward
22
+ return self.llm_backbone(
23
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
24
+ return self._call_impl(*args, **kwargs)
25
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
26
+ return forward_call(*args, **kwargs)
27
+ File "/hai/scratch/belkhale/openvla-mini/prismatic/models/backbones/llm/base_llm.py", line 221, in forward
28
+ output: CausalLMOutputWithPast = self.llm(
29
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
30
+ return self._call_impl(*args, **kwargs)
31
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
32
+ return forward_call(*args, **kwargs)
33
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 1196, in forward
34
+ loss = loss_fct(shift_logits, shift_labels)
35
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
36
+ return self._call_impl(*args, **kwargs)
37
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
38
+ return forward_call(*args, **kwargs)
39
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/loss.py", line 1179, in forward
40
+ return F.cross_entropy(input, target, weight=self.weight,
41
+ File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/functional.py", line 3059, in cross_entropy
42
+ return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
43
+ torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 24.57 GiB. GPU 0 has a total capacity of 79.10 GiB of which 20.18 GiB is free. Including non-PyTorch memory, this process has 58.91 GiB memory in use. Of the allocated memory 52.22 GiB is allocated by PyTorch, and 798.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
wandb/run-20241105_192659-mqdqjqly/files/wandb-metadata.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-116-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.15",
4
+ "startedAt": "2024-11-06T03:26:59.309206Z",
5
+ "args": [
6
+ "--model.type",
7
+ "prism-qwen25-extra-dinosiglip-224px+0_5b",
8
+ "--model.finetune_global_batch_size",
9
+ "64",
10
+ "--model.finetune_per_device_batch_size",
11
+ "8"
12
+ ],
13
+ "program": "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py",
14
+ "codePath": "scripts/pretrain.py",
15
+ "git": {
16
+ "remote": "[email protected]:Stanford-ILIAD/openvla-mini.git",
17
+ "commit": "05073927b096dab7d326a3e39db9262f08d3a8ae"
18
+ },
19
+ "email": "[email protected]",
20
+ "root": "runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7",
21
+ "host": "haic-hgx-2.stanford.edu",
22
+ "username": "belkhale",
23
+ "executable": "/hai/scratch/belkhale/miniforge3/envs/vla/bin/python3.10",
24
+ "codePathLocal": "scripts/pretrain.py",
25
+ "cpu_count": 112,
26
+ "cpu_count_logical": 224,
27
+ "gpu": "NVIDIA H100 80GB HBM3",
28
+ "gpu_count": 8,
29
+ "disk": {
30
+ "/": {
31
+ "total": "942725181440",
32
+ "used": "50880540672"
33
+ }
34
+ },
35
+ "memory": {
36
+ "total": "2164104577024"
37
+ },
38
+ "cpu": {
39
+ "count": 112,
40
+ "countLogical": 224
41
+ },
42
+ "gpu_nvidia": [
43
+ {
44
+ "name": "NVIDIA H100 80GB HBM3",
45
+ "memoryTotal": "85520809984",
46
+ "cudaCores": 16896,
47
+ "architecture": "Hopper"
48
+ },
49
+ {
50
+ "name": "NVIDIA H100 80GB HBM3",
51
+ "memoryTotal": "85520809984",
52
+ "cudaCores": 16896,
53
+ "architecture": "Hopper"
54
+ },
55
+ {
56
+ "name": "NVIDIA H100 80GB HBM3",
57
+ "memoryTotal": "85520809984",
58
+ "cudaCores": 16896,
59
+ "architecture": "Hopper"
60
+ },
61
+ {
62
+ "name": "NVIDIA H100 80GB HBM3",
63
+ "memoryTotal": "85520809984",
64
+ "cudaCores": 16896,
65
+ "architecture": "Hopper"
66
+ },
67
+ {
68
+ "name": "NVIDIA H100 80GB HBM3",
69
+ "memoryTotal": "85520809984",
70
+ "cudaCores": 16896,
71
+ "architecture": "Hopper"
72
+ },
73
+ {
74
+ "name": "NVIDIA H100 80GB HBM3",
75
+ "memoryTotal": "85520809984",
76
+ "cudaCores": 16896,
77
+ "architecture": "Hopper"
78
+ },
79
+ {
80
+ "name": "NVIDIA H100 80GB HBM3",
81
+ "memoryTotal": "85520809984",
82
+ "cudaCores": 16896,
83
+ "architecture": "Hopper"
84
+ },
85
+ {
86
+ "name": "NVIDIA H100 80GB HBM3",
87
+ "memoryTotal": "85520809984",
88
+ "cudaCores": 16896,
89
+ "architecture": "Hopper"
90
+ }
91
+ ],
92
+ "slurm": {
93
+ "cluster_name": "haic",
94
+ "conf": "/usr/local/etc/slurm.conf",
95
+ "cpus_on_node": "64",
96
+ "cpus_per_task": "64",
97
+ "gpus_on_node": "8",
98
+ "gtids": "0",
99
+ "job_account": "models",
100
+ "job_cpus_per_node": "64",
101
+ "job_end_time": "1731122748",
102
+ "job_gid": "37",
103
+ "job_gpus": "0,1,2,3,4,5,6,7",
104
+ "job_id": "11024",
105
+ "job_name": "pretrain",
106
+ "job_nodelist": "haic-hgx-2",
107
+ "job_num_nodes": "1",
108
+ "job_partition": "hai",
109
+ "job_qos": "models",
110
+ "job_start_time": "1730863548",
111
+ "job_uid": "377095",
112
+ "job_user": "belkhale",
113
+ "jobid": "11024",
114
+ "localid": "0",
115
+ "mem_per_node": "102400",
116
+ "nnodes": "1",
117
+ "nodeid": "0",
118
+ "nodelist": "haic-hgx-2",
119
+ "nprocs": "1",
120
+ "ntasks": "1",
121
+ "ntasks_per_node": "1",
122
+ "prio_process": "0",
123
+ "procid": "0",
124
+ "script_context": "prolog_task",
125
+ "submit_dir": "/hai/scratch/belkhale/openvla-mini",
126
+ "submit_host": "haic.stanford.edu",
127
+ "task_pid": "2184784",
128
+ "tasks_per_node": "1",
129
+ "topology_addr": "haic-hgx-2",
130
+ "topology_addr_pattern": "node",
131
+ "tres_per_task": "cpu=64"
132
+ },
133
+ "cudaVersion": "12.4"
134
+ }
wandb/run-20241105_192659-mqdqjqly/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":18}}
wandb/run-20241105_192659-mqdqjqly/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-11-05T19:26:58.953439705-08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/belkhale/tmp1fqrk6pa/port-2184904.txt","pid":2184904,"debug":false,"disable-analytics":false}
2
+ {"time":"2024-11-05T19:26:58.953464408-08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2024-11-05T19:26:58.954979044-08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41281,"Zone":""}}
4
+ {"time":"2024-11-05T19:26:58.955028412-08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":2184904}
5
+ {"time":"2024-11-05T19:26:59.144036286-08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:38180"}
6
+ {"time":"2024-11-05T19:26:59.313342183-08:00","level":"INFO","msg":"handleInformInit: received","streamId":"mqdqjqly","id":"127.0.0.1:38180"}
7
+ {"time":"2024-11-05T19:26:59.533375776-08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"mqdqjqly","id":"127.0.0.1:38180"}
8
+ {"time":"2024-11-05T19:27:18.207609388-08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:38180"}
9
+ {"time":"2024-11-05T19:27:18.208156833-08:00","level":"INFO","msg":"server is shutting down"}
10
+ {"time":"2024-11-05T19:27:18.20815625-08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:38180"}
11
+ {"time":"2024-11-05T19:27:18.208292453-08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:38180"}
12
+ {"time":"2024-11-05T19:27:18.887422598-08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:38180"}
13
+ {"time":"2024-11-05T19:27:18.887444653-08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:38180"}
14
+ {"time":"2024-11-05T19:27:18.887460329-08:00","level":"INFO","msg":"server is closed"}
wandb/run-20241105_192659-mqdqjqly/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-11-05T19:26:59.314379425-08:00","level":"INFO","msg":"using version","core version":"0.18.5"}
2
+ {"time":"2024-11-05T19:26:59.314391049-08:00","level":"INFO","msg":"created symlink","path":"runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_192659-mqdqjqly/logs/debug-core.log"}
3
+ {"time":"2024-11-05T19:26:59.533339524-08:00","level":"INFO","msg":"created new stream","id":"mqdqjqly"}
4
+ {"time":"2024-11-05T19:26:59.533372406-08:00","level":"INFO","msg":"stream: started","id":"mqdqjqly"}
5
+ {"time":"2024-11-05T19:26:59.533420404-08:00","level":"INFO","msg":"sender: started","stream_id":"mqdqjqly"}
6
+ {"time":"2024-11-05T19:26:59.533413693-08:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"mqdqjqly"}}
7
+ {"time":"2024-11-05T19:26:59.533432721-08:00","level":"INFO","msg":"handler: started","stream_id":{"value":"mqdqjqly"}}
8
+ {"time":"2024-11-05T19:26:59.715136674-08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-11-05T19:27:18.2081844-08:00","level":"INFO","msg":"stream: closing","id":"mqdqjqly"}
10
+ {"time":"2024-11-05T19:27:18.20827275-08:00","level":"INFO","msg":"Stopping system monitor"}
11
+ {"time":"2024-11-05T19:27:18.209227648-08:00","level":"INFO","msg":"Stopped system monitor"}
12
+ {"time":"2024-11-05T19:27:18.690761255-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2024-11-05T19:27:18.88603778-08:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"mqdqjqly"}}
14
+ {"time":"2024-11-05T19:27:18.886115295-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"mqdqjqly"}}
15
+ {"time":"2024-11-05T19:27:18.88614639-08:00","level":"INFO","msg":"sender: closed","stream_id":"mqdqjqly"}
16
+ {"time":"2024-11-05T19:27:18.887360713-08:00","level":"INFO","msg":"stream: closed","id":"mqdqjqly"}
wandb/run-20241105_192659-mqdqjqly/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Configure stats pid to 2184904
3
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/.config/wandb/settings
4
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/openvla-mini/wandb/settings
5
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'_service_wait': '300'}
6
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'scripts/pretrain.py', 'program_abspath': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py', 'program': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py'}
8
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_init.py:_log_setup():534] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_192659-mqdqjqly/logs/debug.log
10
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_init.py:_log_setup():535] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_192659-mqdqjqly/logs/debug-internal.log
11
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_init.py:init():621] calling init triggers
12
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-0_5b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 64, 'finetune_per_device_batch_size': 8, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': '/hai/scratch/belkhale/datasets/prismatic-vlms'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None}
14
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_init.py:init():671] starting backend
15
+ 2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-11-05 19:26:59,308 INFO MainThread:2184904 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-11-05 19:26:59,309 INFO MainThread:2184904 [wandb_init.py:init():688] backend started and connected
18
+ 2024-11-05 19:26:59,310 INFO MainThread:2184904 [wandb_init.py:init():783] updated telemetry
19
+ 2024-11-05 19:26:59,372 INFO MainThread:2184904 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-11-05 19:26:59,708 INFO MainThread:2184904 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-11-05 19:26:59,921 INFO MainThread:2184904 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-11-05 19:26:59,921 INFO MainThread:2184904 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-11-05 19:26:59,921 INFO MainThread:2184904 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-11-05 19:26:59,921 INFO MainThread:2184904 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-11-05 19:26:59,924 INFO MainThread:2184904 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-11-05 19:27:18,208 WARNING MsgRouterThr:2184904 [router.py:message_loop():77] message_loop has been closed
wandb/run-20241105_192659-mqdqjqly/run-mqdqjqly.wandb ADDED
Binary file (21 kB). View file
 
wandb/run-20241105_193102-jcj67gg8/files/config.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.18.5
4
+ m: []
5
+ python_version: 3.10.15
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 2
10
+ - 3
11
+ - 11
12
+ - 41
13
+ - 49
14
+ - 55
15
+ - 63
16
+ - 71
17
+ "2":
18
+ - 1
19
+ - 2
20
+ - 3
21
+ - 11
22
+ - 41
23
+ - 49
24
+ - 55
25
+ - 63
26
+ - 71
27
+ "3":
28
+ - 2
29
+ - 13
30
+ - 16
31
+ - 23
32
+ - 55
33
+ - 61
34
+ "4": 3.10.15
35
+ "5": 0.18.5
36
+ "6": 4.40.1
37
+ "8":
38
+ - 5
39
+ "12": 0.18.5
40
+ "13": linux-x86_64
41
+ dataset:
42
+ value:
43
+ align_stage_components:
44
+ - download/llava-laion-cc-sbu-558k/chat.json
45
+ - download/llava-laion-cc-sbu-558k
46
+ dataset_id: llava-v15
47
+ dataset_root_dir: /hai/scratch/belkhale/datasets/prismatic-vlms
48
+ finetune_stage_components:
49
+ - download/llava-v1.5-instruct/llava_v1_5_mix665k.json
50
+ - download/llava-v1.5-instruct
51
+ type: llava-v15
52
+ hf_token:
53
+ value: .hf_token
54
+ model:
55
+ value:
56
+ align_epochs: 1
57
+ align_global_batch_size: 96
58
+ align_learning_rate: 0.001
59
+ align_lr_scheduler_type: linear-warmup+cosine-decay
60
+ align_max_grad_norm: 1
61
+ align_max_steps: null
62
+ align_per_device_batch_size: 16
63
+ align_save_every_n_steps: 10000
64
+ align_train_strategy: fsdp-shard-grad-op
65
+ align_warmup_ratio: 0.03
66
+ align_weight_decay: 0
67
+ arch_specifier: no-align+fused-gelu-mlp
68
+ enable_gradient_checkpointing: true
69
+ enable_mixed_precision_training: true
70
+ finetune_epochs: 2
71
+ finetune_global_batch_size: 64
72
+ finetune_learning_rate: 2e-05
73
+ finetune_lr_scheduler_type: linear-warmup+cosine-decay
74
+ finetune_max_grad_norm: 1
75
+ finetune_max_steps: null
76
+ finetune_per_device_batch_size: 4
77
+ finetune_save_every_n_steps: 10000
78
+ finetune_train_strategy: fsdp-full-shard
79
+ finetune_warmup_ratio: 0.03
80
+ finetune_weight_decay: 0.1
81
+ image_resize_strategy: resize-naive
82
+ llm_backbone_id: qwen25-0_5b-extra
83
+ llm_max_length: 32768
84
+ model_id: prism-qwen25-extra-dinosiglip-224px+0_5b
85
+ reduce_in_full_precision: false
86
+ type: prism-qwen25-extra-dinosiglip-224px+0_5b
87
+ vision_backbone_id: dinosiglip-vit-so-224px
88
+ pretrained_checkpoint:
89
+ value: null
90
+ run_id:
91
+ value: prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7
92
+ run_root_dir:
93
+ value: runs
94
+ seed:
95
+ value: 7
96
+ stage:
97
+ value: finetune
98
+ trackers:
99
+ value:
100
+ - jsonl
101
+ - wandb
102
+ wandb_entity:
103
+ value: null
104
+ wandb_project:
105
+ value: prismatic
wandb/run-20241105_193102-jcj67gg8/files/output.log ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 11/05 [19:31:03] INFO | >> [*] Starting Training Loop pretrain.py:227
2
+
3
+ 11/05 [23:38:31] INFO | >> [*] Done with Training =>> pretrain.py:231
4
+ Finalizing Metrics
wandb/run-20241105_193102-jcj67gg8/files/wandb-metadata.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-116-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.15",
4
+ "startedAt": "2024-11-06T03:31:02.513650Z",
5
+ "args": [
6
+ "--model.type",
7
+ "prism-qwen25-extra-dinosiglip-224px+0_5b",
8
+ "--model.finetune_global_batch_size",
9
+ "64",
10
+ "--model.finetune_per_device_batch_size",
11
+ "4"
12
+ ],
13
+ "program": "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py",
14
+ "codePath": "scripts/pretrain.py",
15
+ "git": {
16
+ "remote": "[email protected]:Stanford-ILIAD/openvla-mini.git",
17
+ "commit": "05073927b096dab7d326a3e39db9262f08d3a8ae"
18
+ },
19
+ "email": "[email protected]",
20
+ "root": "runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7",
21
+ "host": "haic-hgx-2.stanford.edu",
22
+ "username": "belkhale",
23
+ "executable": "/hai/scratch/belkhale/miniforge3/envs/vla/bin/python3.10",
24
+ "codePathLocal": "scripts/pretrain.py",
25
+ "cpu_count": 112,
26
+ "cpu_count_logical": 224,
27
+ "gpu": "NVIDIA H100 80GB HBM3",
28
+ "gpu_count": 8,
29
+ "disk": {
30
+ "/": {
31
+ "total": "942725181440",
32
+ "used": "50880847872"
33
+ }
34
+ },
35
+ "memory": {
36
+ "total": "2164104577024"
37
+ },
38
+ "cpu": {
39
+ "count": 112,
40
+ "countLogical": 224
41
+ },
42
+ "gpu_nvidia": [
43
+ {
44
+ "name": "NVIDIA H100 80GB HBM3",
45
+ "memoryTotal": "85520809984",
46
+ "cudaCores": 16896,
47
+ "architecture": "Hopper"
48
+ },
49
+ {
50
+ "name": "NVIDIA H100 80GB HBM3",
51
+ "memoryTotal": "85520809984",
52
+ "cudaCores": 16896,
53
+ "architecture": "Hopper"
54
+ },
55
+ {
56
+ "name": "NVIDIA H100 80GB HBM3",
57
+ "memoryTotal": "85520809984",
58
+ "cudaCores": 16896,
59
+ "architecture": "Hopper"
60
+ },
61
+ {
62
+ "name": "NVIDIA H100 80GB HBM3",
63
+ "memoryTotal": "85520809984",
64
+ "cudaCores": 16896,
65
+ "architecture": "Hopper"
66
+ },
67
+ {
68
+ "name": "NVIDIA H100 80GB HBM3",
69
+ "memoryTotal": "85520809984",
70
+ "cudaCores": 16896,
71
+ "architecture": "Hopper"
72
+ },
73
+ {
74
+ "name": "NVIDIA H100 80GB HBM3",
75
+ "memoryTotal": "85520809984",
76
+ "cudaCores": 16896,
77
+ "architecture": "Hopper"
78
+ },
79
+ {
80
+ "name": "NVIDIA H100 80GB HBM3",
81
+ "memoryTotal": "85520809984",
82
+ "cudaCores": 16896,
83
+ "architecture": "Hopper"
84
+ },
85
+ {
86
+ "name": "NVIDIA H100 80GB HBM3",
87
+ "memoryTotal": "85520809984",
88
+ "cudaCores": 16896,
89
+ "architecture": "Hopper"
90
+ }
91
+ ],
92
+ "slurm": {
93
+ "cluster_name": "haic",
94
+ "conf": "/usr/local/etc/slurm.conf",
95
+ "cpus_on_node": "64",
96
+ "cpus_per_task": "64",
97
+ "gpus_on_node": "8",
98
+ "gtids": "0",
99
+ "job_account": "models",
100
+ "job_cpus_per_node": "64",
101
+ "job_end_time": "1731122999",
102
+ "job_gid": "37",
103
+ "job_gpus": "0,1,2,3,4,5,6,7",
104
+ "job_id": "11026",
105
+ "job_name": "pretrain",
106
+ "job_nodelist": "haic-hgx-2",
107
+ "job_num_nodes": "1",
108
+ "job_partition": "hai",
109
+ "job_qos": "models",
110
+ "job_start_time": "1730863799",
111
+ "job_uid": "377095",
112
+ "job_user": "belkhale",
113
+ "jobid": "11026",
114
+ "localid": "0",
115
+ "mem_per_node": "102400",
116
+ "nnodes": "1",
117
+ "nodeid": "0",
118
+ "nodelist": "haic-hgx-2",
119
+ "nprocs": "1",
120
+ "ntasks": "1",
121
+ "ntasks_per_node": "1",
122
+ "prio_process": "0",
123
+ "procid": "0",
124
+ "script_context": "prolog_task",
125
+ "submit_dir": "/hai/scratch/belkhale/openvla-mini",
126
+ "submit_host": "haic.stanford.edu",
127
+ "task_pid": "2187908",
128
+ "tasks_per_node": "1",
129
+ "topology_addr": "haic-hgx-2",
130
+ "topology_addr_pattern": "node",
131
+ "tres_per_task": "cpu=64"
132
+ },
133
+ "cudaVersion": "12.4"
134
+ }
wandb/run-20241105_193102-jcj67gg8/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":14849},"_runtime":14849.432571063,"_step":20792,"Finetune/Loss":0.734754204750061,"Finetune/Loss (Raw)":0.7624474763870239,"Finetune/Learning Rate":0,"Finetune/Step Time":0.7374007441103458,"_timestamp":1.7308787040734835e+09,"Finetune/Step":20792}
wandb/run-20241105_193102-jcj67gg8/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-11-05T19:31:02.167132681-08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/belkhale/tmpowkszwq0/port-2188020.txt","pid":2188020,"debug":false,"disable-analytics":false}
2
+ {"time":"2024-11-05T19:31:02.167154904-08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2024-11-05T19:31:02.168180089-08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":2188020}
4
+ {"time":"2024-11-05T19:31:02.168180088-08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":35793,"Zone":""}}
5
+ {"time":"2024-11-05T19:31:02.357031058-08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:51484"}
6
+ {"time":"2024-11-05T19:31:02.518306545-08:00","level":"INFO","msg":"handleInformInit: received","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
7
+ {"time":"2024-11-05T19:31:02.738838516-08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
8
+ {"time":"2024-11-05T23:38:34.554528568-08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
9
+ {"time":"2024-11-05T23:38:34.558563756-08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
10
+ {"time":"2024-11-05T23:42:06.504927152-08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:51484"}
11
+ {"time":"2024-11-05T23:42:06.505861575-08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:51484"}
12
+ {"time":"2024-11-05T23:42:06.505880856-08:00","level":"INFO","msg":"server is shutting down"}
13
+ {"time":"2024-11-05T23:42:06.50594903-08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:51484"}
14
+ {"time":"2024-11-05T23:42:06.506095025-08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:51484"}
15
+ {"time":"2024-11-05T23:42:06.506152479-08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:51484"}
16
+ {"time":"2024-11-05T23:42:06.506171224-08:00","level":"INFO","msg":"server is closed"}
wandb/run-20241105_193102-jcj67gg8/logs/debug-internal.log ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-11-05T19:31:02.519998294-08:00","level":"INFO","msg":"using version","core version":"0.18.5"}
2
+ {"time":"2024-11-05T19:31:02.520021589-08:00","level":"INFO","msg":"created symlink","path":"runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug-core.log"}
3
+ {"time":"2024-11-05T19:31:02.738801523-08:00","level":"INFO","msg":"created new stream","id":"jcj67gg8"}
4
+ {"time":"2024-11-05T19:31:02.738835256-08:00","level":"INFO","msg":"stream: started","id":"jcj67gg8"}
5
+ {"time":"2024-11-05T19:31:02.738905513-08:00","level":"INFO","msg":"sender: started","stream_id":"jcj67gg8"}
6
+ {"time":"2024-11-05T19:31:02.738892436-08:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"jcj67gg8"}}
7
+ {"time":"2024-11-05T19:31:02.738902832-08:00","level":"INFO","msg":"handler: started","stream_id":{"value":"jcj67gg8"}}
8
+ {"time":"2024-11-05T19:31:03.056521545-08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-11-05T23:38:31.946246118-08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2024-11-05T23:38:32.023973784-08:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2024-11-05T23:38:32.922341453-08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.775009129}],"total_operations":1}}
12
+ {"time":"2024-11-05T23:38:33.390878425-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2024-11-05T23:38:34.555591357-08:00","level":"INFO","msg":"stream: closing","id":"jcj67gg8"}
14
+ {"time":"2024-11-05T23:38:34.555667186-08:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"jcj67gg8"}}
15
+ {"time":"2024-11-05T23:38:34.555706801-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"jcj67gg8"}}
16
+ {"time":"2024-11-05T23:38:34.555872-08:00","level":"INFO","msg":"sender: closed","stream_id":"jcj67gg8"}
17
+ {"time":"2024-11-05T23:38:34.558526671-08:00","level":"INFO","msg":"stream: closed","id":"jcj67gg8"}
wandb/run-20241105_193102-jcj67gg8/logs/debug.log ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Configure stats pid to 2188020
3
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/.config/wandb/settings
4
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/openvla-mini/wandb/settings
5
+ 2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'_service_wait': '300'}
6
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'scripts/pretrain.py', 'program_abspath': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py', 'program': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py'}
8
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:_log_setup():534] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug.log
10
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:_log_setup():535] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug-internal.log
11
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():621] calling init triggers
12
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-0_5b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 64, 'finetune_per_device_batch_size': 4, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': '/hai/scratch/belkhale/datasets/prismatic-vlms'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None}
14
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():671] starting backend
15
+ 2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-11-05 19:31:02,513 INFO MainThread:2188020 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-11-05 19:31:02,513 INFO MainThread:2188020 [wandb_init.py:init():688] backend started and connected
18
+ 2024-11-05 19:31:02,515 INFO MainThread:2188020 [wandb_init.py:init():783] updated telemetry
19
+ 2024-11-05 19:31:02,573 INFO MainThread:2188020 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-11-05 19:31:03,050 INFO MainThread:2188020 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-11-05 19:31:03,226 INFO MainThread:2188020 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-11-05 19:31:03,230 INFO MainThread:2188020 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-11-05 23:38:31,920 INFO MainThread:2188020 [wandb_run.py:_finish():2158] finishing run belkhale/prismatic/jcj67gg8
27
+ 2024-11-05 23:38:31,920 INFO MainThread:2188020 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
28
+ 2024-11-05 23:38:31,921 INFO MainThread:2188020 [wandb_run.py:_restore():2408] restore
29
+ 2024-11-05 23:38:31,921 INFO MainThread:2188020 [wandb_run.py:_restore():2414] restore done
30
+ 2024-11-05 23:38:34,516 INFO MainThread:2188020 [wandb_run.py:_footer_history_summary_info():3975] rendering history
31
+ 2024-11-05 23:38:34,517 INFO MainThread:2188020 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
32
+ 2024-11-05 23:38:34,534 INFO MainThread:2188020 [wandb_run.py:_footer_sync_info():3934] logging synced files
wandb/run-20241105_193102-jcj67gg8/run-jcj67gg8.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1e028b995d945fb180c85455d7219269515a28888eda671f635380e4dac0d23
3
+ size 37709802