Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- checkpoints/step-009999-epoch-00-loss=0.7280.pt +3 -0
- checkpoints/step-019999-epoch-01-loss=1.4845.pt +3 -0
- checkpoints/step-020792-epoch-01-loss=0.5268.pt +3 -0
- config.json +61 -0
- config.yaml +54 -0
- prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7.jsonl +0 -0
- run-metrics.jsonl +1 -0
- wandb/debug-internal.log +17 -0
- wandb/debug.log +32 -0
- wandb/latest-run/files/config.yaml +105 -0
- wandb/latest-run/files/output.log +4 -0
- wandb/latest-run/files/wandb-metadata.json +134 -0
- wandb/latest-run/files/wandb-summary.json +1 -0
- wandb/latest-run/logs/debug-core.log +16 -0
- wandb/latest-run/logs/debug-internal.log +17 -0
- wandb/latest-run/logs/debug.log +32 -0
- wandb/latest-run/run-jcj67gg8.wandb +3 -0
- wandb/run-20241105_192502-8vxhoj6d/files/config.yaml +103 -0
- wandb/run-20241105_192502-8vxhoj6d/files/output.log +35 -0
- wandb/run-20241105_192502-8vxhoj6d/files/wandb-metadata.json +134 -0
- wandb/run-20241105_192502-8vxhoj6d/files/wandb-summary.json +1 -0
- wandb/run-20241105_192502-8vxhoj6d/logs/debug-core.log +14 -0
- wandb/run-20241105_192502-8vxhoj6d/logs/debug-internal.log +16 -0
- wandb/run-20241105_192502-8vxhoj6d/logs/debug.log +26 -0
- wandb/run-20241105_192502-8vxhoj6d/run-8vxhoj6d.wandb +0 -0
- wandb/run-20241105_192659-mqdqjqly/files/config.yaml +103 -0
- wandb/run-20241105_192659-mqdqjqly/files/output.log +43 -0
- wandb/run-20241105_192659-mqdqjqly/files/wandb-metadata.json +134 -0
- wandb/run-20241105_192659-mqdqjqly/files/wandb-summary.json +1 -0
- wandb/run-20241105_192659-mqdqjqly/logs/debug-core.log +14 -0
- wandb/run-20241105_192659-mqdqjqly/logs/debug-internal.log +16 -0
- wandb/run-20241105_192659-mqdqjqly/logs/debug.log +26 -0
- wandb/run-20241105_192659-mqdqjqly/run-mqdqjqly.wandb +0 -0
- wandb/run-20241105_193102-jcj67gg8/files/config.yaml +105 -0
- wandb/run-20241105_193102-jcj67gg8/files/output.log +4 -0
- wandb/run-20241105_193102-jcj67gg8/files/wandb-metadata.json +134 -0
- wandb/run-20241105_193102-jcj67gg8/files/wandb-summary.json +1 -0
- wandb/run-20241105_193102-jcj67gg8/logs/debug-core.log +16 -0
- wandb/run-20241105_193102-jcj67gg8/logs/debug-internal.log +17 -0
- wandb/run-20241105_193102-jcj67gg8/logs/debug.log +32 -0
- wandb/run-20241105_193102-jcj67gg8/run-jcj67gg8.wandb +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
wandb/latest-run/run-jcj67gg8.wandb filter=lfs diff=lfs merge=lfs -text
|
37 |
+
wandb/run-20241105_193102-jcj67gg8/run-jcj67gg8.wandb filter=lfs diff=lfs merge=lfs -text
|
checkpoints/step-009999-epoch-00-loss=0.7280.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bcb0567c48a8af51b96e8b73b794de80964badc806defd7badc77007782a5a0a
|
3 |
+
size 2630986501
|
checkpoints/step-019999-epoch-01-loss=1.4845.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:71573cb46a219059c298c41f8facc10271b95e0a4cb3c1e0e8c39ac3a66079b8
|
3 |
+
size 2630986501
|
checkpoints/step-020792-epoch-01-loss=0.5268.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d86525b8d21865b2e83d4a403ee9d1b641579bb41259111d7e3cc7f7ed46564a
|
3 |
+
size 2630986501
|
config.json
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": {
|
3 |
+
"align_stage_components": [
|
4 |
+
"download/llava-laion-cc-sbu-558k/chat.json",
|
5 |
+
"download/llava-laion-cc-sbu-558k"
|
6 |
+
],
|
7 |
+
"dataset_id": "llava-v15",
|
8 |
+
"dataset_root_dir": "/hai/scratch/belkhale/datasets/prismatic-vlms",
|
9 |
+
"finetune_stage_components": [
|
10 |
+
"download/llava-v1.5-instruct/llava_v1_5_mix665k.json",
|
11 |
+
"download/llava-v1.5-instruct"
|
12 |
+
],
|
13 |
+
"type": "llava-v15"
|
14 |
+
},
|
15 |
+
"hf_token": ".hf_token",
|
16 |
+
"model": {
|
17 |
+
"align_epochs": 1,
|
18 |
+
"align_global_batch_size": 96,
|
19 |
+
"align_learning_rate": 0.001,
|
20 |
+
"align_lr_scheduler_type": "linear-warmup+cosine-decay",
|
21 |
+
"align_max_grad_norm": 1.0,
|
22 |
+
"align_max_steps": null,
|
23 |
+
"align_per_device_batch_size": 16,
|
24 |
+
"align_save_every_n_steps": 10000,
|
25 |
+
"align_train_strategy": "fsdp-shard-grad-op",
|
26 |
+
"align_warmup_ratio": 0.03,
|
27 |
+
"align_weight_decay": 0.0,
|
28 |
+
"arch_specifier": "no-align+fused-gelu-mlp",
|
29 |
+
"enable_gradient_checkpointing": true,
|
30 |
+
"enable_mixed_precision_training": true,
|
31 |
+
"finetune_epochs": 2,
|
32 |
+
"finetune_global_batch_size": 64,
|
33 |
+
"finetune_learning_rate": 2e-05,
|
34 |
+
"finetune_lr_scheduler_type": "linear-warmup+cosine-decay",
|
35 |
+
"finetune_max_grad_norm": 1.0,
|
36 |
+
"finetune_max_steps": null,
|
37 |
+
"finetune_per_device_batch_size": 4,
|
38 |
+
"finetune_save_every_n_steps": 10000,
|
39 |
+
"finetune_train_strategy": "fsdp-full-shard",
|
40 |
+
"finetune_warmup_ratio": 0.03,
|
41 |
+
"finetune_weight_decay": 0.1,
|
42 |
+
"image_resize_strategy": "resize-naive",
|
43 |
+
"llm_backbone_id": "qwen25-0_5b-extra",
|
44 |
+
"llm_max_length": 32768,
|
45 |
+
"model_id": "prism-qwen25-extra-dinosiglip-224px+0_5b",
|
46 |
+
"reduce_in_full_precision": false,
|
47 |
+
"type": "prism-qwen25-extra-dinosiglip-224px+0_5b",
|
48 |
+
"vision_backbone_id": "dinosiglip-vit-so-224px"
|
49 |
+
},
|
50 |
+
"pretrained_checkpoint": null,
|
51 |
+
"run_id": "prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7",
|
52 |
+
"run_root_dir": "runs",
|
53 |
+
"seed": 7,
|
54 |
+
"stage": "finetune",
|
55 |
+
"trackers": [
|
56 |
+
"jsonl",
|
57 |
+
"wandb"
|
58 |
+
],
|
59 |
+
"wandb_entity": null,
|
60 |
+
"wandb_project": "prismatic"
|
61 |
+
}
|
config.yaml
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset:
|
2 |
+
align_stage_components:
|
3 |
+
- download/llava-laion-cc-sbu-558k/chat.json
|
4 |
+
- download/llava-laion-cc-sbu-558k
|
5 |
+
dataset_id: llava-v15
|
6 |
+
dataset_root_dir: /hai/scratch/belkhale/datasets/prismatic-vlms
|
7 |
+
finetune_stage_components:
|
8 |
+
- download/llava-v1.5-instruct/llava_v1_5_mix665k.json
|
9 |
+
- download/llava-v1.5-instruct
|
10 |
+
type: llava-v15
|
11 |
+
hf_token: .hf_token
|
12 |
+
model:
|
13 |
+
align_epochs: 1
|
14 |
+
align_global_batch_size: 96
|
15 |
+
align_learning_rate: 0.001
|
16 |
+
align_lr_scheduler_type: linear-warmup+cosine-decay
|
17 |
+
align_max_grad_norm: 1.0
|
18 |
+
align_max_steps: null
|
19 |
+
align_per_device_batch_size: 16
|
20 |
+
align_save_every_n_steps: 10000
|
21 |
+
align_train_strategy: fsdp-shard-grad-op
|
22 |
+
align_warmup_ratio: 0.03
|
23 |
+
align_weight_decay: 0.0
|
24 |
+
arch_specifier: no-align+fused-gelu-mlp
|
25 |
+
enable_gradient_checkpointing: true
|
26 |
+
enable_mixed_precision_training: true
|
27 |
+
finetune_epochs: 2
|
28 |
+
finetune_global_batch_size: 64
|
29 |
+
finetune_learning_rate: 2.0e-05
|
30 |
+
finetune_lr_scheduler_type: linear-warmup+cosine-decay
|
31 |
+
finetune_max_grad_norm: 1.0
|
32 |
+
finetune_max_steps: null
|
33 |
+
finetune_per_device_batch_size: 4
|
34 |
+
finetune_save_every_n_steps: 10000
|
35 |
+
finetune_train_strategy: fsdp-full-shard
|
36 |
+
finetune_warmup_ratio: 0.03
|
37 |
+
finetune_weight_decay: 0.1
|
38 |
+
image_resize_strategy: resize-naive
|
39 |
+
llm_backbone_id: qwen25-0_5b-extra
|
40 |
+
llm_max_length: 32768
|
41 |
+
model_id: prism-qwen25-extra-dinosiglip-224px+0_5b
|
42 |
+
reduce_in_full_precision: false
|
43 |
+
type: prism-qwen25-extra-dinosiglip-224px+0_5b
|
44 |
+
vision_backbone_id: dinosiglip-vit-so-224px
|
45 |
+
pretrained_checkpoint: null
|
46 |
+
run_id: prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7
|
47 |
+
run_root_dir: runs
|
48 |
+
seed: 7
|
49 |
+
stage: finetune
|
50 |
+
trackers:
|
51 |
+
- jsonl
|
52 |
+
- wandb
|
53 |
+
wandb_entity: null
|
54 |
+
wandb_project: prismatic
|
prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
run-metrics.jsonl
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"hparams": {"dataset": {"align_stage_components": ["download/llava-laion-cc-sbu-558k/chat.json", "download/llava-laion-cc-sbu-558k"], "dataset_id": "llava-v15", "dataset_root_dir": "/hai/scratch/belkhale/datasets/prismatic-vlms", "finetune_stage_components": ["download/llava-v1.5-instruct/llava_v1_5_mix665k.json", "download/llava-v1.5-instruct"], "type": "llava-v15"}, "hf_token": ".hf_token", "model": {"align_epochs": 1, "align_global_batch_size": 96, "align_learning_rate": 0.001, "align_lr_scheduler_type": "linear-warmup+cosine-decay", "align_max_grad_norm": 1.0, "align_max_steps": null, "align_per_device_batch_size": 16, "align_save_every_n_steps": 10000, "align_train_strategy": "fsdp-shard-grad-op", "align_warmup_ratio": 0.03, "align_weight_decay": 0.0, "arch_specifier": "no-align+fused-gelu-mlp", "enable_gradient_checkpointing": true, "enable_mixed_precision_training": true, "finetune_epochs": 2, "finetune_global_batch_size": 64, "finetune_learning_rate": 2e-05, "finetune_lr_scheduler_type": "linear-warmup+cosine-decay", "finetune_max_grad_norm": 1.0, "finetune_max_steps": null, "finetune_per_device_batch_size": 4, "finetune_save_every_n_steps": 10000, "finetune_train_strategy": "fsdp-full-shard", "finetune_warmup_ratio": 0.03, "finetune_weight_decay": 0.1, "image_resize_strategy": "resize-naive", "llm_backbone_id": "qwen25-0_5b-extra", "llm_max_length": 32768, "model_id": "prism-qwen25-extra-dinosiglip-224px+0_5b", "reduce_in_full_precision": false, "type": "prism-qwen25-extra-dinosiglip-224px+0_5b", "vision_backbone_id": "dinosiglip-vit-so-224px"}, "pretrained_checkpoint": null, "run_id": "prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7", "run_root_dir": "runs", "seed": 7, "stage": "finetune", "trackers": ["jsonl", "wandb"], "wandb_entity": null, "wandb_project": "prismatic"}, "run_id": "prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7"}
|
wandb/debug-internal.log
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2024-11-05T19:31:02.519998294-08:00","level":"INFO","msg":"using version","core version":"0.18.5"}
|
2 |
+
{"time":"2024-11-05T19:31:02.520021589-08:00","level":"INFO","msg":"created symlink","path":"runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug-core.log"}
|
3 |
+
{"time":"2024-11-05T19:31:02.738801523-08:00","level":"INFO","msg":"created new stream","id":"jcj67gg8"}
|
4 |
+
{"time":"2024-11-05T19:31:02.738835256-08:00","level":"INFO","msg":"stream: started","id":"jcj67gg8"}
|
5 |
+
{"time":"2024-11-05T19:31:02.738905513-08:00","level":"INFO","msg":"sender: started","stream_id":"jcj67gg8"}
|
6 |
+
{"time":"2024-11-05T19:31:02.738892436-08:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"jcj67gg8"}}
|
7 |
+
{"time":"2024-11-05T19:31:02.738902832-08:00","level":"INFO","msg":"handler: started","stream_id":{"value":"jcj67gg8"}}
|
8 |
+
{"time":"2024-11-05T19:31:03.056521545-08:00","level":"INFO","msg":"Starting system monitor"}
|
9 |
+
{"time":"2024-11-05T23:38:31.946246118-08:00","level":"INFO","msg":"Stopping system monitor"}
|
10 |
+
{"time":"2024-11-05T23:38:32.023973784-08:00","level":"INFO","msg":"Stopped system monitor"}
|
11 |
+
{"time":"2024-11-05T23:38:32.922341453-08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.775009129}],"total_operations":1}}
|
12 |
+
{"time":"2024-11-05T23:38:33.390878425-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
13 |
+
{"time":"2024-11-05T23:38:34.555591357-08:00","level":"INFO","msg":"stream: closing","id":"jcj67gg8"}
|
14 |
+
{"time":"2024-11-05T23:38:34.555667186-08:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"jcj67gg8"}}
|
15 |
+
{"time":"2024-11-05T23:38:34.555706801-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"jcj67gg8"}}
|
16 |
+
{"time":"2024-11-05T23:38:34.555872-08:00","level":"INFO","msg":"sender: closed","stream_id":"jcj67gg8"}
|
17 |
+
{"time":"2024-11-05T23:38:34.558526671-08:00","level":"INFO","msg":"stream: closed","id":"jcj67gg8"}
|
wandb/debug.log
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
|
2 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Configure stats pid to 2188020
|
3 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/.config/wandb/settings
|
4 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/openvla-mini/wandb/settings
|
5 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'_service_wait': '300'}
|
6 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
|
7 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'scripts/pretrain.py', 'program_abspath': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py', 'program': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py'}
|
8 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Applying login settings: {}
|
9 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:_log_setup():534] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug.log
|
10 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:_log_setup():535] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug-internal.log
|
11 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():621] calling init triggers
|
12 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
|
13 |
+
config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-0_5b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 64, 'finetune_per_device_batch_size': 4, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': '/hai/scratch/belkhale/datasets/prismatic-vlms'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None}
|
14 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():671] starting backend
|
15 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():675] sending inform_init request
|
16 |
+
2024-11-05 19:31:02,513 INFO MainThread:2188020 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
17 |
+
2024-11-05 19:31:02,513 INFO MainThread:2188020 [wandb_init.py:init():688] backend started and connected
|
18 |
+
2024-11-05 19:31:02,515 INFO MainThread:2188020 [wandb_init.py:init():783] updated telemetry
|
19 |
+
2024-11-05 19:31:02,573 INFO MainThread:2188020 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
|
20 |
+
2024-11-05 19:31:03,050 INFO MainThread:2188020 [wandb_init.py:init():867] starting run threads in backend
|
21 |
+
2024-11-05 19:31:03,226 INFO MainThread:2188020 [wandb_run.py:_console_start():2463] atexit reg
|
22 |
+
2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2311] redirect: wrap_raw
|
23 |
+
2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2376] Wrapping output streams.
|
24 |
+
2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2401] Redirects installed.
|
25 |
+
2024-11-05 19:31:03,230 INFO MainThread:2188020 [wandb_init.py:init():911] run started, returning control to user process
|
26 |
+
2024-11-05 23:38:31,920 INFO MainThread:2188020 [wandb_run.py:_finish():2158] finishing run belkhale/prismatic/jcj67gg8
|
27 |
+
2024-11-05 23:38:31,920 INFO MainThread:2188020 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
|
28 |
+
2024-11-05 23:38:31,921 INFO MainThread:2188020 [wandb_run.py:_restore():2408] restore
|
29 |
+
2024-11-05 23:38:31,921 INFO MainThread:2188020 [wandb_run.py:_restore():2414] restore done
|
30 |
+
2024-11-05 23:38:34,516 INFO MainThread:2188020 [wandb_run.py:_footer_history_summary_info():3975] rendering history
|
31 |
+
2024-11-05 23:38:34,517 INFO MainThread:2188020 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
|
32 |
+
2024-11-05 23:38:34,534 INFO MainThread:2188020 [wandb_run.py:_footer_sync_info():3934] logging synced files
|
wandb/latest-run/files/config.yaml
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_wandb:
|
2 |
+
value:
|
3 |
+
cli_version: 0.18.5
|
4 |
+
m: []
|
5 |
+
python_version: 3.10.15
|
6 |
+
t:
|
7 |
+
"1":
|
8 |
+
- 1
|
9 |
+
- 2
|
10 |
+
- 3
|
11 |
+
- 11
|
12 |
+
- 41
|
13 |
+
- 49
|
14 |
+
- 55
|
15 |
+
- 63
|
16 |
+
- 71
|
17 |
+
"2":
|
18 |
+
- 1
|
19 |
+
- 2
|
20 |
+
- 3
|
21 |
+
- 11
|
22 |
+
- 41
|
23 |
+
- 49
|
24 |
+
- 55
|
25 |
+
- 63
|
26 |
+
- 71
|
27 |
+
"3":
|
28 |
+
- 2
|
29 |
+
- 13
|
30 |
+
- 16
|
31 |
+
- 23
|
32 |
+
- 55
|
33 |
+
- 61
|
34 |
+
"4": 3.10.15
|
35 |
+
"5": 0.18.5
|
36 |
+
"6": 4.40.1
|
37 |
+
"8":
|
38 |
+
- 5
|
39 |
+
"12": 0.18.5
|
40 |
+
"13": linux-x86_64
|
41 |
+
dataset:
|
42 |
+
value:
|
43 |
+
align_stage_components:
|
44 |
+
- download/llava-laion-cc-sbu-558k/chat.json
|
45 |
+
- download/llava-laion-cc-sbu-558k
|
46 |
+
dataset_id: llava-v15
|
47 |
+
dataset_root_dir: /hai/scratch/belkhale/datasets/prismatic-vlms
|
48 |
+
finetune_stage_components:
|
49 |
+
- download/llava-v1.5-instruct/llava_v1_5_mix665k.json
|
50 |
+
- download/llava-v1.5-instruct
|
51 |
+
type: llava-v15
|
52 |
+
hf_token:
|
53 |
+
value: .hf_token
|
54 |
+
model:
|
55 |
+
value:
|
56 |
+
align_epochs: 1
|
57 |
+
align_global_batch_size: 96
|
58 |
+
align_learning_rate: 0.001
|
59 |
+
align_lr_scheduler_type: linear-warmup+cosine-decay
|
60 |
+
align_max_grad_norm: 1
|
61 |
+
align_max_steps: null
|
62 |
+
align_per_device_batch_size: 16
|
63 |
+
align_save_every_n_steps: 10000
|
64 |
+
align_train_strategy: fsdp-shard-grad-op
|
65 |
+
align_warmup_ratio: 0.03
|
66 |
+
align_weight_decay: 0
|
67 |
+
arch_specifier: no-align+fused-gelu-mlp
|
68 |
+
enable_gradient_checkpointing: true
|
69 |
+
enable_mixed_precision_training: true
|
70 |
+
finetune_epochs: 2
|
71 |
+
finetune_global_batch_size: 64
|
72 |
+
finetune_learning_rate: 2e-05
|
73 |
+
finetune_lr_scheduler_type: linear-warmup+cosine-decay
|
74 |
+
finetune_max_grad_norm: 1
|
75 |
+
finetune_max_steps: null
|
76 |
+
finetune_per_device_batch_size: 4
|
77 |
+
finetune_save_every_n_steps: 10000
|
78 |
+
finetune_train_strategy: fsdp-full-shard
|
79 |
+
finetune_warmup_ratio: 0.03
|
80 |
+
finetune_weight_decay: 0.1
|
81 |
+
image_resize_strategy: resize-naive
|
82 |
+
llm_backbone_id: qwen25-0_5b-extra
|
83 |
+
llm_max_length: 32768
|
84 |
+
model_id: prism-qwen25-extra-dinosiglip-224px+0_5b
|
85 |
+
reduce_in_full_precision: false
|
86 |
+
type: prism-qwen25-extra-dinosiglip-224px+0_5b
|
87 |
+
vision_backbone_id: dinosiglip-vit-so-224px
|
88 |
+
pretrained_checkpoint:
|
89 |
+
value: null
|
90 |
+
run_id:
|
91 |
+
value: prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7
|
92 |
+
run_root_dir:
|
93 |
+
value: runs
|
94 |
+
seed:
|
95 |
+
value: 7
|
96 |
+
stage:
|
97 |
+
value: finetune
|
98 |
+
trackers:
|
99 |
+
value:
|
100 |
+
- jsonl
|
101 |
+
- wandb
|
102 |
+
wandb_entity:
|
103 |
+
value: null
|
104 |
+
wandb_project:
|
105 |
+
value: prismatic
|
wandb/latest-run/files/output.log
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
11/05 [19:31:03] INFO | >> [*] Starting Training Loop pretrain.py:227
|
2 |
+
|
3 |
+
11/05 [23:38:31] INFO | >> [*] Done with Training =>> pretrain.py:231
|
4 |
+
Finalizing Metrics
|
wandb/latest-run/files/wandb-metadata.json
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-116-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.15",
|
4 |
+
"startedAt": "2024-11-06T03:31:02.513650Z",
|
5 |
+
"args": [
|
6 |
+
"--model.type",
|
7 |
+
"prism-qwen25-extra-dinosiglip-224px+0_5b",
|
8 |
+
"--model.finetune_global_batch_size",
|
9 |
+
"64",
|
10 |
+
"--model.finetune_per_device_batch_size",
|
11 |
+
"4"
|
12 |
+
],
|
13 |
+
"program": "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py",
|
14 |
+
"codePath": "scripts/pretrain.py",
|
15 |
+
"git": {
|
16 |
+
"remote": "[email protected]:Stanford-ILIAD/openvla-mini.git",
|
17 |
+
"commit": "05073927b096dab7d326a3e39db9262f08d3a8ae"
|
18 |
+
},
|
19 |
+
"email": "[email protected]",
|
20 |
+
"root": "runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7",
|
21 |
+
"host": "haic-hgx-2.stanford.edu",
|
22 |
+
"username": "belkhale",
|
23 |
+
"executable": "/hai/scratch/belkhale/miniforge3/envs/vla/bin/python3.10",
|
24 |
+
"codePathLocal": "scripts/pretrain.py",
|
25 |
+
"cpu_count": 112,
|
26 |
+
"cpu_count_logical": 224,
|
27 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
28 |
+
"gpu_count": 8,
|
29 |
+
"disk": {
|
30 |
+
"/": {
|
31 |
+
"total": "942725181440",
|
32 |
+
"used": "50880847872"
|
33 |
+
}
|
34 |
+
},
|
35 |
+
"memory": {
|
36 |
+
"total": "2164104577024"
|
37 |
+
},
|
38 |
+
"cpu": {
|
39 |
+
"count": 112,
|
40 |
+
"countLogical": 224
|
41 |
+
},
|
42 |
+
"gpu_nvidia": [
|
43 |
+
{
|
44 |
+
"name": "NVIDIA H100 80GB HBM3",
|
45 |
+
"memoryTotal": "85520809984",
|
46 |
+
"cudaCores": 16896,
|
47 |
+
"architecture": "Hopper"
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"name": "NVIDIA H100 80GB HBM3",
|
51 |
+
"memoryTotal": "85520809984",
|
52 |
+
"cudaCores": 16896,
|
53 |
+
"architecture": "Hopper"
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"name": "NVIDIA H100 80GB HBM3",
|
57 |
+
"memoryTotal": "85520809984",
|
58 |
+
"cudaCores": 16896,
|
59 |
+
"architecture": "Hopper"
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"name": "NVIDIA H100 80GB HBM3",
|
63 |
+
"memoryTotal": "85520809984",
|
64 |
+
"cudaCores": 16896,
|
65 |
+
"architecture": "Hopper"
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"name": "NVIDIA H100 80GB HBM3",
|
69 |
+
"memoryTotal": "85520809984",
|
70 |
+
"cudaCores": 16896,
|
71 |
+
"architecture": "Hopper"
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"name": "NVIDIA H100 80GB HBM3",
|
75 |
+
"memoryTotal": "85520809984",
|
76 |
+
"cudaCores": 16896,
|
77 |
+
"architecture": "Hopper"
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"name": "NVIDIA H100 80GB HBM3",
|
81 |
+
"memoryTotal": "85520809984",
|
82 |
+
"cudaCores": 16896,
|
83 |
+
"architecture": "Hopper"
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"name": "NVIDIA H100 80GB HBM3",
|
87 |
+
"memoryTotal": "85520809984",
|
88 |
+
"cudaCores": 16896,
|
89 |
+
"architecture": "Hopper"
|
90 |
+
}
|
91 |
+
],
|
92 |
+
"slurm": {
|
93 |
+
"cluster_name": "haic",
|
94 |
+
"conf": "/usr/local/etc/slurm.conf",
|
95 |
+
"cpus_on_node": "64",
|
96 |
+
"cpus_per_task": "64",
|
97 |
+
"gpus_on_node": "8",
|
98 |
+
"gtids": "0",
|
99 |
+
"job_account": "models",
|
100 |
+
"job_cpus_per_node": "64",
|
101 |
+
"job_end_time": "1731122999",
|
102 |
+
"job_gid": "37",
|
103 |
+
"job_gpus": "0,1,2,3,4,5,6,7",
|
104 |
+
"job_id": "11026",
|
105 |
+
"job_name": "pretrain",
|
106 |
+
"job_nodelist": "haic-hgx-2",
|
107 |
+
"job_num_nodes": "1",
|
108 |
+
"job_partition": "hai",
|
109 |
+
"job_qos": "models",
|
110 |
+
"job_start_time": "1730863799",
|
111 |
+
"job_uid": "377095",
|
112 |
+
"job_user": "belkhale",
|
113 |
+
"jobid": "11026",
|
114 |
+
"localid": "0",
|
115 |
+
"mem_per_node": "102400",
|
116 |
+
"nnodes": "1",
|
117 |
+
"nodeid": "0",
|
118 |
+
"nodelist": "haic-hgx-2",
|
119 |
+
"nprocs": "1",
|
120 |
+
"ntasks": "1",
|
121 |
+
"ntasks_per_node": "1",
|
122 |
+
"prio_process": "0",
|
123 |
+
"procid": "0",
|
124 |
+
"script_context": "prolog_task",
|
125 |
+
"submit_dir": "/hai/scratch/belkhale/openvla-mini",
|
126 |
+
"submit_host": "haic.stanford.edu",
|
127 |
+
"task_pid": "2187908",
|
128 |
+
"tasks_per_node": "1",
|
129 |
+
"topology_addr": "haic-hgx-2",
|
130 |
+
"topology_addr_pattern": "node",
|
131 |
+
"tres_per_task": "cpu=64"
|
132 |
+
},
|
133 |
+
"cudaVersion": "12.4"
|
134 |
+
}
|
wandb/latest-run/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb":{"runtime":14849},"_runtime":14849.432571063,"_step":20792,"Finetune/Loss":0.734754204750061,"Finetune/Loss (Raw)":0.7624474763870239,"Finetune/Learning Rate":0,"Finetune/Step Time":0.7374007441103458,"_timestamp":1.7308787040734835e+09,"Finetune/Step":20792}
|
wandb/latest-run/logs/debug-core.log
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2024-11-05T19:31:02.167132681-08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/belkhale/tmpowkszwq0/port-2188020.txt","pid":2188020,"debug":false,"disable-analytics":false}
|
2 |
+
{"time":"2024-11-05T19:31:02.167154904-08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
|
3 |
+
{"time":"2024-11-05T19:31:02.168180089-08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":2188020}
|
4 |
+
{"time":"2024-11-05T19:31:02.168180088-08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":35793,"Zone":""}}
|
5 |
+
{"time":"2024-11-05T19:31:02.357031058-08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:51484"}
|
6 |
+
{"time":"2024-11-05T19:31:02.518306545-08:00","level":"INFO","msg":"handleInformInit: received","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
|
7 |
+
{"time":"2024-11-05T19:31:02.738838516-08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
|
8 |
+
{"time":"2024-11-05T23:38:34.554528568-08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
|
9 |
+
{"time":"2024-11-05T23:38:34.558563756-08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
|
10 |
+
{"time":"2024-11-05T23:42:06.504927152-08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:51484"}
|
11 |
+
{"time":"2024-11-05T23:42:06.505861575-08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:51484"}
|
12 |
+
{"time":"2024-11-05T23:42:06.505880856-08:00","level":"INFO","msg":"server is shutting down"}
|
13 |
+
{"time":"2024-11-05T23:42:06.50594903-08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:51484"}
|
14 |
+
{"time":"2024-11-05T23:42:06.506095025-08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:51484"}
|
15 |
+
{"time":"2024-11-05T23:42:06.506152479-08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:51484"}
|
16 |
+
{"time":"2024-11-05T23:42:06.506171224-08:00","level":"INFO","msg":"server is closed"}
|
wandb/latest-run/logs/debug-internal.log
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2024-11-05T19:31:02.519998294-08:00","level":"INFO","msg":"using version","core version":"0.18.5"}
|
2 |
+
{"time":"2024-11-05T19:31:02.520021589-08:00","level":"INFO","msg":"created symlink","path":"runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug-core.log"}
|
3 |
+
{"time":"2024-11-05T19:31:02.738801523-08:00","level":"INFO","msg":"created new stream","id":"jcj67gg8"}
|
4 |
+
{"time":"2024-11-05T19:31:02.738835256-08:00","level":"INFO","msg":"stream: started","id":"jcj67gg8"}
|
5 |
+
{"time":"2024-11-05T19:31:02.738905513-08:00","level":"INFO","msg":"sender: started","stream_id":"jcj67gg8"}
|
6 |
+
{"time":"2024-11-05T19:31:02.738892436-08:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"jcj67gg8"}}
|
7 |
+
{"time":"2024-11-05T19:31:02.738902832-08:00","level":"INFO","msg":"handler: started","stream_id":{"value":"jcj67gg8"}}
|
8 |
+
{"time":"2024-11-05T19:31:03.056521545-08:00","level":"INFO","msg":"Starting system monitor"}
|
9 |
+
{"time":"2024-11-05T23:38:31.946246118-08:00","level":"INFO","msg":"Stopping system monitor"}
|
10 |
+
{"time":"2024-11-05T23:38:32.023973784-08:00","level":"INFO","msg":"Stopped system monitor"}
|
11 |
+
{"time":"2024-11-05T23:38:32.922341453-08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.775009129}],"total_operations":1}}
|
12 |
+
{"time":"2024-11-05T23:38:33.390878425-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
13 |
+
{"time":"2024-11-05T23:38:34.555591357-08:00","level":"INFO","msg":"stream: closing","id":"jcj67gg8"}
|
14 |
+
{"time":"2024-11-05T23:38:34.555667186-08:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"jcj67gg8"}}
|
15 |
+
{"time":"2024-11-05T23:38:34.555706801-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"jcj67gg8"}}
|
16 |
+
{"time":"2024-11-05T23:38:34.555872-08:00","level":"INFO","msg":"sender: closed","stream_id":"jcj67gg8"}
|
17 |
+
{"time":"2024-11-05T23:38:34.558526671-08:00","level":"INFO","msg":"stream: closed","id":"jcj67gg8"}
|
wandb/latest-run/logs/debug.log
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
|
2 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Configure stats pid to 2188020
|
3 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/.config/wandb/settings
|
4 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/openvla-mini/wandb/settings
|
5 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'_service_wait': '300'}
|
6 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
|
7 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'scripts/pretrain.py', 'program_abspath': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py', 'program': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py'}
|
8 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Applying login settings: {}
|
9 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:_log_setup():534] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug.log
|
10 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:_log_setup():535] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug-internal.log
|
11 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():621] calling init triggers
|
12 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
|
13 |
+
config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-0_5b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 64, 'finetune_per_device_batch_size': 4, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': '/hai/scratch/belkhale/datasets/prismatic-vlms'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None}
|
14 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():671] starting backend
|
15 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():675] sending inform_init request
|
16 |
+
2024-11-05 19:31:02,513 INFO MainThread:2188020 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
17 |
+
2024-11-05 19:31:02,513 INFO MainThread:2188020 [wandb_init.py:init():688] backend started and connected
|
18 |
+
2024-11-05 19:31:02,515 INFO MainThread:2188020 [wandb_init.py:init():783] updated telemetry
|
19 |
+
2024-11-05 19:31:02,573 INFO MainThread:2188020 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
|
20 |
+
2024-11-05 19:31:03,050 INFO MainThread:2188020 [wandb_init.py:init():867] starting run threads in backend
|
21 |
+
2024-11-05 19:31:03,226 INFO MainThread:2188020 [wandb_run.py:_console_start():2463] atexit reg
|
22 |
+
2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2311] redirect: wrap_raw
|
23 |
+
2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2376] Wrapping output streams.
|
24 |
+
2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2401] Redirects installed.
|
25 |
+
2024-11-05 19:31:03,230 INFO MainThread:2188020 [wandb_init.py:init():911] run started, returning control to user process
|
26 |
+
2024-11-05 23:38:31,920 INFO MainThread:2188020 [wandb_run.py:_finish():2158] finishing run belkhale/prismatic/jcj67gg8
|
27 |
+
2024-11-05 23:38:31,920 INFO MainThread:2188020 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
|
28 |
+
2024-11-05 23:38:31,921 INFO MainThread:2188020 [wandb_run.py:_restore():2408] restore
|
29 |
+
2024-11-05 23:38:31,921 INFO MainThread:2188020 [wandb_run.py:_restore():2414] restore done
|
30 |
+
2024-11-05 23:38:34,516 INFO MainThread:2188020 [wandb_run.py:_footer_history_summary_info():3975] rendering history
|
31 |
+
2024-11-05 23:38:34,517 INFO MainThread:2188020 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
|
32 |
+
2024-11-05 23:38:34,534 INFO MainThread:2188020 [wandb_run.py:_footer_sync_info():3934] logging synced files
|
wandb/latest-run/run-jcj67gg8.wandb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b1e028b995d945fb180c85455d7219269515a28888eda671f635380e4dac0d23
|
3 |
+
size 37709802
|
wandb/run-20241105_192502-8vxhoj6d/files/config.yaml
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_wandb:
|
2 |
+
value:
|
3 |
+
cli_version: 0.18.5
|
4 |
+
m: []
|
5 |
+
python_version: 3.10.15
|
6 |
+
t:
|
7 |
+
"1":
|
8 |
+
- 1
|
9 |
+
- 2
|
10 |
+
- 3
|
11 |
+
- 11
|
12 |
+
- 41
|
13 |
+
- 49
|
14 |
+
- 55
|
15 |
+
- 63
|
16 |
+
- 71
|
17 |
+
"2":
|
18 |
+
- 1
|
19 |
+
- 2
|
20 |
+
- 3
|
21 |
+
- 11
|
22 |
+
- 41
|
23 |
+
- 49
|
24 |
+
- 55
|
25 |
+
- 63
|
26 |
+
- 71
|
27 |
+
"3":
|
28 |
+
- 13
|
29 |
+
- 16
|
30 |
+
- 23
|
31 |
+
- 55
|
32 |
+
"4": 3.10.15
|
33 |
+
"5": 0.18.5
|
34 |
+
"6": 4.40.1
|
35 |
+
"8":
|
36 |
+
- 5
|
37 |
+
"12": 0.18.5
|
38 |
+
"13": linux-x86_64
|
39 |
+
dataset:
|
40 |
+
value:
|
41 |
+
align_stage_components:
|
42 |
+
- download/llava-laion-cc-sbu-558k/chat.json
|
43 |
+
- download/llava-laion-cc-sbu-558k
|
44 |
+
dataset_id: llava-v15
|
45 |
+
dataset_root_dir: /hai/scratch/belkhale/datasets/prismatic-vlms
|
46 |
+
finetune_stage_components:
|
47 |
+
- download/llava-v1.5-instruct/llava_v1_5_mix665k.json
|
48 |
+
- download/llava-v1.5-instruct
|
49 |
+
type: llava-v15
|
50 |
+
hf_token:
|
51 |
+
value: .hf_token
|
52 |
+
model:
|
53 |
+
value:
|
54 |
+
align_epochs: 1
|
55 |
+
align_global_batch_size: 96
|
56 |
+
align_learning_rate: 0.001
|
57 |
+
align_lr_scheduler_type: linear-warmup+cosine-decay
|
58 |
+
align_max_grad_norm: 1
|
59 |
+
align_max_steps: null
|
60 |
+
align_per_device_batch_size: 16
|
61 |
+
align_save_every_n_steps: 10000
|
62 |
+
align_train_strategy: fsdp-shard-grad-op
|
63 |
+
align_warmup_ratio: 0.03
|
64 |
+
align_weight_decay: 0
|
65 |
+
arch_specifier: no-align+fused-gelu-mlp
|
66 |
+
enable_gradient_checkpointing: true
|
67 |
+
enable_mixed_precision_training: true
|
68 |
+
finetune_epochs: 2
|
69 |
+
finetune_global_batch_size: 128
|
70 |
+
finetune_learning_rate: 2e-05
|
71 |
+
finetune_lr_scheduler_type: linear-warmup+cosine-decay
|
72 |
+
finetune_max_grad_norm: 1
|
73 |
+
finetune_max_steps: null
|
74 |
+
finetune_per_device_batch_size: 16
|
75 |
+
finetune_save_every_n_steps: 10000
|
76 |
+
finetune_train_strategy: fsdp-full-shard
|
77 |
+
finetune_warmup_ratio: 0.03
|
78 |
+
finetune_weight_decay: 0.1
|
79 |
+
image_resize_strategy: resize-naive
|
80 |
+
llm_backbone_id: qwen25-0_5b-extra
|
81 |
+
llm_max_length: 32768
|
82 |
+
model_id: prism-qwen25-extra-dinosiglip-224px+0_5b
|
83 |
+
reduce_in_full_precision: false
|
84 |
+
type: prism-qwen25-extra-dinosiglip-224px+0_5b
|
85 |
+
vision_backbone_id: dinosiglip-vit-so-224px
|
86 |
+
pretrained_checkpoint:
|
87 |
+
value: null
|
88 |
+
run_id:
|
89 |
+
value: prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7
|
90 |
+
run_root_dir:
|
91 |
+
value: runs
|
92 |
+
seed:
|
93 |
+
value: 7
|
94 |
+
stage:
|
95 |
+
value: finetune
|
96 |
+
trackers:
|
97 |
+
value:
|
98 |
+
- jsonl
|
99 |
+
- wandb
|
100 |
+
wandb_entity:
|
101 |
+
value: null
|
102 |
+
wandb_project:
|
103 |
+
value: prismatic
|
wandb/run-20241105_192502-8vxhoj6d/files/output.log
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
11/05 [19:25:05] INFO | >> [*] Starting Training Loop pretrain.py:227
|
2 |
+
Traceback (most recent call last):
|
3 |
+
File "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py", line 241, in <module>
|
4 |
+
pretrain()
|
5 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/draccus/argparsing.py", line 203, in wrapper_inner
|
6 |
+
response = fn(cfg, *args, **kwargs)
|
7 |
+
File "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py", line 228, in pretrain
|
8 |
+
train_strategy.run_training(train_dataset, collator, metrics, stage=cfg.stage, seed=cfg.seed)
|
9 |
+
File "/hai/scratch/belkhale/openvla-mini/prismatic/training/strategies/base_strategy.py", line 190, in run_training
|
10 |
+
output: CausalLMOutputWithPast = self.vlm(
|
11 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
12 |
+
return self._call_impl(*args, **kwargs)
|
13 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
14 |
+
return forward_call(*args, **kwargs)
|
15 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
|
16 |
+
output = self._fsdp_wrapped_module(*args, **kwargs)
|
17 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
18 |
+
return self._call_impl(*args, **kwargs)
|
19 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
20 |
+
return forward_call(*args, **kwargs)
|
21 |
+
File "/hai/scratch/belkhale/openvla-mini/prismatic/models/vlms/prismatic.py", line 470, in forward
|
22 |
+
return self.llm_backbone(
|
23 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
24 |
+
return self._call_impl(*args, **kwargs)
|
25 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
26 |
+
return forward_call(*args, **kwargs)
|
27 |
+
File "/hai/scratch/belkhale/openvla-mini/prismatic/models/backbones/llm/base_llm.py", line 221, in forward
|
28 |
+
output: CausalLMOutputWithPast = self.llm(
|
29 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
30 |
+
return self._call_impl(*args, **kwargs)
|
31 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
32 |
+
return forward_call(*args, **kwargs)
|
33 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 1183, in forward
|
34 |
+
logits = logits.float()
|
35 |
+
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 49.14 GiB. GPU 0 has a total capacity of 79.10 GiB of which 42.31 GiB is free. Including non-PyTorch memory, this process has 36.77 GiB memory in use. Of the allocated memory 29.84 GiB is allocated by PyTorch, and 1.02 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
wandb/run-20241105_192502-8vxhoj6d/files/wandb-metadata.json
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-116-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.15",
|
4 |
+
"startedAt": "2024-11-06T03:25:02.892307Z",
|
5 |
+
"args": [
|
6 |
+
"--model.type",
|
7 |
+
"prism-qwen25-extra-dinosiglip-224px+0_5b",
|
8 |
+
"--model.finetune_global_batch_size",
|
9 |
+
"128",
|
10 |
+
"--model.finetune_per_device_batch_size",
|
11 |
+
"16"
|
12 |
+
],
|
13 |
+
"program": "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py",
|
14 |
+
"codePath": "scripts/pretrain.py",
|
15 |
+
"git": {
|
16 |
+
"remote": "[email protected]:Stanford-ILIAD/openvla-mini.git",
|
17 |
+
"commit": "05073927b096dab7d326a3e39db9262f08d3a8ae"
|
18 |
+
},
|
19 |
+
"email": "[email protected]",
|
20 |
+
"root": "runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7",
|
21 |
+
"host": "haic-hgx-2.stanford.edu",
|
22 |
+
"username": "belkhale",
|
23 |
+
"executable": "/hai/scratch/belkhale/miniforge3/envs/vla/bin/python3.10",
|
24 |
+
"codePathLocal": "scripts/pretrain.py",
|
25 |
+
"cpu_count": 112,
|
26 |
+
"cpu_count_logical": 224,
|
27 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
28 |
+
"gpu_count": 8,
|
29 |
+
"disk": {
|
30 |
+
"/": {
|
31 |
+
"total": "942725181440",
|
32 |
+
"used": "50880245760"
|
33 |
+
}
|
34 |
+
},
|
35 |
+
"memory": {
|
36 |
+
"total": "2164104577024"
|
37 |
+
},
|
38 |
+
"cpu": {
|
39 |
+
"count": 112,
|
40 |
+
"countLogical": 224
|
41 |
+
},
|
42 |
+
"gpu_nvidia": [
|
43 |
+
{
|
44 |
+
"name": "NVIDIA H100 80GB HBM3",
|
45 |
+
"memoryTotal": "85520809984",
|
46 |
+
"cudaCores": 16896,
|
47 |
+
"architecture": "Hopper"
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"name": "NVIDIA H100 80GB HBM3",
|
51 |
+
"memoryTotal": "85520809984",
|
52 |
+
"cudaCores": 16896,
|
53 |
+
"architecture": "Hopper"
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"name": "NVIDIA H100 80GB HBM3",
|
57 |
+
"memoryTotal": "85520809984",
|
58 |
+
"cudaCores": 16896,
|
59 |
+
"architecture": "Hopper"
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"name": "NVIDIA H100 80GB HBM3",
|
63 |
+
"memoryTotal": "85520809984",
|
64 |
+
"cudaCores": 16896,
|
65 |
+
"architecture": "Hopper"
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"name": "NVIDIA H100 80GB HBM3",
|
69 |
+
"memoryTotal": "85520809984",
|
70 |
+
"cudaCores": 16896,
|
71 |
+
"architecture": "Hopper"
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"name": "NVIDIA H100 80GB HBM3",
|
75 |
+
"memoryTotal": "85520809984",
|
76 |
+
"cudaCores": 16896,
|
77 |
+
"architecture": "Hopper"
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"name": "NVIDIA H100 80GB HBM3",
|
81 |
+
"memoryTotal": "85520809984",
|
82 |
+
"cudaCores": 16896,
|
83 |
+
"architecture": "Hopper"
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"name": "NVIDIA H100 80GB HBM3",
|
87 |
+
"memoryTotal": "85520809984",
|
88 |
+
"cudaCores": 16896,
|
89 |
+
"architecture": "Hopper"
|
90 |
+
}
|
91 |
+
],
|
92 |
+
"slurm": {
|
93 |
+
"cluster_name": "haic",
|
94 |
+
"conf": "/usr/local/etc/slurm.conf",
|
95 |
+
"cpus_on_node": "64",
|
96 |
+
"cpus_per_task": "64",
|
97 |
+
"gpus_on_node": "8",
|
98 |
+
"gtids": "0",
|
99 |
+
"job_account": "models",
|
100 |
+
"job_cpus_per_node": "64",
|
101 |
+
"job_end_time": "1731122631",
|
102 |
+
"job_gid": "37",
|
103 |
+
"job_gpus": "0,1,2,3,4,5,6,7",
|
104 |
+
"job_id": "11023",
|
105 |
+
"job_name": "pretrain",
|
106 |
+
"job_nodelist": "haic-hgx-2",
|
107 |
+
"job_num_nodes": "1",
|
108 |
+
"job_partition": "hai",
|
109 |
+
"job_qos": "models",
|
110 |
+
"job_start_time": "1730863431",
|
111 |
+
"job_uid": "377095",
|
112 |
+
"job_user": "belkhale",
|
113 |
+
"jobid": "11023",
|
114 |
+
"localid": "0",
|
115 |
+
"mem_per_node": "102400",
|
116 |
+
"nnodes": "1",
|
117 |
+
"nodeid": "0",
|
118 |
+
"nodelist": "haic-hgx-2",
|
119 |
+
"nprocs": "1",
|
120 |
+
"ntasks": "1",
|
121 |
+
"ntasks_per_node": "1",
|
122 |
+
"prio_process": "0",
|
123 |
+
"procid": "0",
|
124 |
+
"script_context": "prolog_task",
|
125 |
+
"submit_dir": "/hai/scratch/belkhale/openvla-mini",
|
126 |
+
"submit_host": "haic.stanford.edu",
|
127 |
+
"task_pid": "2182609",
|
128 |
+
"tasks_per_node": "1",
|
129 |
+
"topology_addr": "haic-hgx-2",
|
130 |
+
"topology_addr_pattern": "node",
|
131 |
+
"tres_per_task": "cpu=64"
|
132 |
+
},
|
133 |
+
"cudaVersion": "12.4"
|
134 |
+
}
|
wandb/run-20241105_192502-8vxhoj6d/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb":{"runtime":22}}
|
wandb/run-20241105_192502-8vxhoj6d/logs/debug-core.log
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2024-11-05T19:25:02.690703685-08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/belkhale/tmp97kp6p__/port-2182724.txt","pid":2182724,"debug":false,"disable-analytics":false}
|
2 |
+
{"time":"2024-11-05T19:25:02.690734566-08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
|
3 |
+
{"time":"2024-11-05T19:25:02.692100649-08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":2182724}
|
4 |
+
{"time":"2024-11-05T19:25:02.69208955-08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41525,"Zone":""}}
|
5 |
+
{"time":"2024-11-05T19:25:02.744814731-08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:59560"}
|
6 |
+
{"time":"2024-11-05T19:25:02.895430941-08:00","level":"INFO","msg":"handleInformInit: received","streamId":"8vxhoj6d","id":"127.0.0.1:59560"}
|
7 |
+
{"time":"2024-11-05T19:25:03.194511518-08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"8vxhoj6d","id":"127.0.0.1:59560"}
|
8 |
+
{"time":"2024-11-05T19:25:25.782400709-08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:59560"}
|
9 |
+
{"time":"2024-11-05T19:25:25.782779877-08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:59560"}
|
10 |
+
{"time":"2024-11-05T19:25:25.782799272-08:00","level":"INFO","msg":"server is shutting down"}
|
11 |
+
{"time":"2024-11-05T19:25:25.783947145-08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:59560"}
|
12 |
+
{"time":"2024-11-05T19:25:27.304522773-08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:59560"}
|
13 |
+
{"time":"2024-11-05T19:25:27.304560668-08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:59560"}
|
14 |
+
{"time":"2024-11-05T19:25:27.304578483-08:00","level":"INFO","msg":"server is closed"}
|
wandb/run-20241105_192502-8vxhoj6d/logs/debug-internal.log
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2024-11-05T19:25:02.896392755-08:00","level":"INFO","msg":"using version","core version":"0.18.5"}
|
2 |
+
{"time":"2024-11-05T19:25:02.896416546-08:00","level":"INFO","msg":"created symlink","path":"runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_192502-8vxhoj6d/logs/debug-core.log"}
|
3 |
+
{"time":"2024-11-05T19:25:03.194464738-08:00","level":"INFO","msg":"created new stream","id":"8vxhoj6d"}
|
4 |
+
{"time":"2024-11-05T19:25:03.194508127-08:00","level":"INFO","msg":"stream: started","id":"8vxhoj6d"}
|
5 |
+
{"time":"2024-11-05T19:25:03.194578475-08:00","level":"INFO","msg":"sender: started","stream_id":"8vxhoj6d"}
|
6 |
+
{"time":"2024-11-05T19:25:03.194561979-08:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"8vxhoj6d"}}
|
7 |
+
{"time":"2024-11-05T19:25:03.194595326-08:00","level":"INFO","msg":"handler: started","stream_id":{"value":"8vxhoj6d"}}
|
8 |
+
{"time":"2024-11-05T19:25:03.382215025-08:00","level":"INFO","msg":"Starting system monitor"}
|
9 |
+
{"time":"2024-11-05T19:25:25.782775323-08:00","level":"INFO","msg":"stream: closing","id":"8vxhoj6d"}
|
10 |
+
{"time":"2024-11-05T19:25:25.784095739-08:00","level":"INFO","msg":"Stopping system monitor"}
|
11 |
+
{"time":"2024-11-05T19:25:25.785772774-08:00","level":"INFO","msg":"Stopped system monitor"}
|
12 |
+
{"time":"2024-11-05T19:25:27.185173962-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
13 |
+
{"time":"2024-11-05T19:25:27.303068441-08:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"8vxhoj6d"}}
|
14 |
+
{"time":"2024-11-05T19:25:27.303126471-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"8vxhoj6d"}}
|
15 |
+
{"time":"2024-11-05T19:25:27.303147247-08:00","level":"INFO","msg":"sender: closed","stream_id":"8vxhoj6d"}
|
16 |
+
{"time":"2024-11-05T19:25:27.304415448-08:00","level":"INFO","msg":"stream: closed","id":"8vxhoj6d"}
|
wandb/run-20241105_192502-8vxhoj6d/logs/debug.log
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
|
2 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Configure stats pid to 2182724
|
3 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/.config/wandb/settings
|
4 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/openvla-mini/wandb/settings
|
5 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'_service_wait': '300'}
|
6 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
|
7 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'scripts/pretrain.py', 'program_abspath': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py', 'program': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py'}
|
8 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_setup.py:_flush():79] Applying login settings: {}
|
9 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_init.py:_log_setup():534] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_192502-8vxhoj6d/logs/debug.log
|
10 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_init.py:_log_setup():535] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_192502-8vxhoj6d/logs/debug-internal.log
|
11 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_init.py:init():621] calling init triggers
|
12 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
|
13 |
+
config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-0_5b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 128, 'finetune_per_device_batch_size': 16, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': '/hai/scratch/belkhale/datasets/prismatic-vlms'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None}
|
14 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_init.py:init():671] starting backend
|
15 |
+
2024-11-05 19:25:02,890 INFO MainThread:2182724 [wandb_init.py:init():675] sending inform_init request
|
16 |
+
2024-11-05 19:25:02,891 INFO MainThread:2182724 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
17 |
+
2024-11-05 19:25:02,892 INFO MainThread:2182724 [wandb_init.py:init():688] backend started and connected
|
18 |
+
2024-11-05 19:25:02,893 INFO MainThread:2182724 [wandb_init.py:init():783] updated telemetry
|
19 |
+
2024-11-05 19:25:03,020 INFO MainThread:2182724 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
|
20 |
+
2024-11-05 19:25:03,377 INFO MainThread:2182724 [wandb_init.py:init():867] starting run threads in backend
|
21 |
+
2024-11-05 19:25:05,102 INFO MainThread:2182724 [wandb_run.py:_console_start():2463] atexit reg
|
22 |
+
2024-11-05 19:25:05,102 INFO MainThread:2182724 [wandb_run.py:_redirect():2311] redirect: wrap_raw
|
23 |
+
2024-11-05 19:25:05,103 INFO MainThread:2182724 [wandb_run.py:_redirect():2376] Wrapping output streams.
|
24 |
+
2024-11-05 19:25:05,103 INFO MainThread:2182724 [wandb_run.py:_redirect():2401] Redirects installed.
|
25 |
+
2024-11-05 19:25:05,128 INFO MainThread:2182724 [wandb_init.py:init():911] run started, returning control to user process
|
26 |
+
2024-11-05 19:25:25,783 WARNING MsgRouterThr:2182724 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20241105_192502-8vxhoj6d/run-8vxhoj6d.wandb
ADDED
Binary file (30.6 kB). View file
|
|
wandb/run-20241105_192659-mqdqjqly/files/config.yaml
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_wandb:
|
2 |
+
value:
|
3 |
+
cli_version: 0.18.5
|
4 |
+
m: []
|
5 |
+
python_version: 3.10.15
|
6 |
+
t:
|
7 |
+
"1":
|
8 |
+
- 1
|
9 |
+
- 2
|
10 |
+
- 3
|
11 |
+
- 11
|
12 |
+
- 41
|
13 |
+
- 49
|
14 |
+
- 55
|
15 |
+
- 63
|
16 |
+
- 71
|
17 |
+
"2":
|
18 |
+
- 1
|
19 |
+
- 2
|
20 |
+
- 3
|
21 |
+
- 11
|
22 |
+
- 41
|
23 |
+
- 49
|
24 |
+
- 55
|
25 |
+
- 63
|
26 |
+
- 71
|
27 |
+
"3":
|
28 |
+
- 13
|
29 |
+
- 16
|
30 |
+
- 23
|
31 |
+
- 55
|
32 |
+
"4": 3.10.15
|
33 |
+
"5": 0.18.5
|
34 |
+
"6": 4.40.1
|
35 |
+
"8":
|
36 |
+
- 5
|
37 |
+
"12": 0.18.5
|
38 |
+
"13": linux-x86_64
|
39 |
+
dataset:
|
40 |
+
value:
|
41 |
+
align_stage_components:
|
42 |
+
- download/llava-laion-cc-sbu-558k/chat.json
|
43 |
+
- download/llava-laion-cc-sbu-558k
|
44 |
+
dataset_id: llava-v15
|
45 |
+
dataset_root_dir: /hai/scratch/belkhale/datasets/prismatic-vlms
|
46 |
+
finetune_stage_components:
|
47 |
+
- download/llava-v1.5-instruct/llava_v1_5_mix665k.json
|
48 |
+
- download/llava-v1.5-instruct
|
49 |
+
type: llava-v15
|
50 |
+
hf_token:
|
51 |
+
value: .hf_token
|
52 |
+
model:
|
53 |
+
value:
|
54 |
+
align_epochs: 1
|
55 |
+
align_global_batch_size: 96
|
56 |
+
align_learning_rate: 0.001
|
57 |
+
align_lr_scheduler_type: linear-warmup+cosine-decay
|
58 |
+
align_max_grad_norm: 1
|
59 |
+
align_max_steps: null
|
60 |
+
align_per_device_batch_size: 16
|
61 |
+
align_save_every_n_steps: 10000
|
62 |
+
align_train_strategy: fsdp-shard-grad-op
|
63 |
+
align_warmup_ratio: 0.03
|
64 |
+
align_weight_decay: 0
|
65 |
+
arch_specifier: no-align+fused-gelu-mlp
|
66 |
+
enable_gradient_checkpointing: true
|
67 |
+
enable_mixed_precision_training: true
|
68 |
+
finetune_epochs: 2
|
69 |
+
finetune_global_batch_size: 64
|
70 |
+
finetune_learning_rate: 2e-05
|
71 |
+
finetune_lr_scheduler_type: linear-warmup+cosine-decay
|
72 |
+
finetune_max_grad_norm: 1
|
73 |
+
finetune_max_steps: null
|
74 |
+
finetune_per_device_batch_size: 8
|
75 |
+
finetune_save_every_n_steps: 10000
|
76 |
+
finetune_train_strategy: fsdp-full-shard
|
77 |
+
finetune_warmup_ratio: 0.03
|
78 |
+
finetune_weight_decay: 0.1
|
79 |
+
image_resize_strategy: resize-naive
|
80 |
+
llm_backbone_id: qwen25-0_5b-extra
|
81 |
+
llm_max_length: 32768
|
82 |
+
model_id: prism-qwen25-extra-dinosiglip-224px+0_5b
|
83 |
+
reduce_in_full_precision: false
|
84 |
+
type: prism-qwen25-extra-dinosiglip-224px+0_5b
|
85 |
+
vision_backbone_id: dinosiglip-vit-so-224px
|
86 |
+
pretrained_checkpoint:
|
87 |
+
value: null
|
88 |
+
run_id:
|
89 |
+
value: prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7
|
90 |
+
run_root_dir:
|
91 |
+
value: runs
|
92 |
+
seed:
|
93 |
+
value: 7
|
94 |
+
stage:
|
95 |
+
value: finetune
|
96 |
+
trackers:
|
97 |
+
value:
|
98 |
+
- jsonl
|
99 |
+
- wandb
|
100 |
+
wandb_entity:
|
101 |
+
value: null
|
102 |
+
wandb_project:
|
103 |
+
value: prismatic
|
wandb/run-20241105_192659-mqdqjqly/files/output.log
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
11/05 [19:26:59] INFO | >> [*] Starting Training Loop pretrain.py:227
|
2 |
+
Traceback (most recent call last):
|
3 |
+
File "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py", line 241, in <module>
|
4 |
+
pretrain()
|
5 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/draccus/argparsing.py", line 203, in wrapper_inner
|
6 |
+
response = fn(cfg, *args, **kwargs)
|
7 |
+
File "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py", line 228, in pretrain
|
8 |
+
train_strategy.run_training(train_dataset, collator, metrics, stage=cfg.stage, seed=cfg.seed)
|
9 |
+
File "/hai/scratch/belkhale/openvla-mini/prismatic/training/strategies/base_strategy.py", line 190, in run_training
|
10 |
+
output: CausalLMOutputWithPast = self.vlm(
|
11 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
12 |
+
return self._call_impl(*args, **kwargs)
|
13 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
14 |
+
return forward_call(*args, **kwargs)
|
15 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
|
16 |
+
output = self._fsdp_wrapped_module(*args, **kwargs)
|
17 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
18 |
+
return self._call_impl(*args, **kwargs)
|
19 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
20 |
+
return forward_call(*args, **kwargs)
|
21 |
+
File "/hai/scratch/belkhale/openvla-mini/prismatic/models/vlms/prismatic.py", line 470, in forward
|
22 |
+
return self.llm_backbone(
|
23 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
24 |
+
return self._call_impl(*args, **kwargs)
|
25 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
26 |
+
return forward_call(*args, **kwargs)
|
27 |
+
File "/hai/scratch/belkhale/openvla-mini/prismatic/models/backbones/llm/base_llm.py", line 221, in forward
|
28 |
+
output: CausalLMOutputWithPast = self.llm(
|
29 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
30 |
+
return self._call_impl(*args, **kwargs)
|
31 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
32 |
+
return forward_call(*args, **kwargs)
|
33 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 1196, in forward
|
34 |
+
loss = loss_fct(shift_logits, shift_labels)
|
35 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
36 |
+
return self._call_impl(*args, **kwargs)
|
37 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
38 |
+
return forward_call(*args, **kwargs)
|
39 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/modules/loss.py", line 1179, in forward
|
40 |
+
return F.cross_entropy(input, target, weight=self.weight,
|
41 |
+
File "/hai/scratch/belkhale/miniforge3/envs/vla/lib/python3.10/site-packages/torch/nn/functional.py", line 3059, in cross_entropy
|
42 |
+
return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
|
43 |
+
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 24.57 GiB. GPU 0 has a total capacity of 79.10 GiB of which 20.18 GiB is free. Including non-PyTorch memory, this process has 58.91 GiB memory in use. Of the allocated memory 52.22 GiB is allocated by PyTorch, and 798.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
wandb/run-20241105_192659-mqdqjqly/files/wandb-metadata.json
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-116-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.15",
|
4 |
+
"startedAt": "2024-11-06T03:26:59.309206Z",
|
5 |
+
"args": [
|
6 |
+
"--model.type",
|
7 |
+
"prism-qwen25-extra-dinosiglip-224px+0_5b",
|
8 |
+
"--model.finetune_global_batch_size",
|
9 |
+
"64",
|
10 |
+
"--model.finetune_per_device_batch_size",
|
11 |
+
"8"
|
12 |
+
],
|
13 |
+
"program": "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py",
|
14 |
+
"codePath": "scripts/pretrain.py",
|
15 |
+
"git": {
|
16 |
+
"remote": "[email protected]:Stanford-ILIAD/openvla-mini.git",
|
17 |
+
"commit": "05073927b096dab7d326a3e39db9262f08d3a8ae"
|
18 |
+
},
|
19 |
+
"email": "[email protected]",
|
20 |
+
"root": "runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7",
|
21 |
+
"host": "haic-hgx-2.stanford.edu",
|
22 |
+
"username": "belkhale",
|
23 |
+
"executable": "/hai/scratch/belkhale/miniforge3/envs/vla/bin/python3.10",
|
24 |
+
"codePathLocal": "scripts/pretrain.py",
|
25 |
+
"cpu_count": 112,
|
26 |
+
"cpu_count_logical": 224,
|
27 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
28 |
+
"gpu_count": 8,
|
29 |
+
"disk": {
|
30 |
+
"/": {
|
31 |
+
"total": "942725181440",
|
32 |
+
"used": "50880540672"
|
33 |
+
}
|
34 |
+
},
|
35 |
+
"memory": {
|
36 |
+
"total": "2164104577024"
|
37 |
+
},
|
38 |
+
"cpu": {
|
39 |
+
"count": 112,
|
40 |
+
"countLogical": 224
|
41 |
+
},
|
42 |
+
"gpu_nvidia": [
|
43 |
+
{
|
44 |
+
"name": "NVIDIA H100 80GB HBM3",
|
45 |
+
"memoryTotal": "85520809984",
|
46 |
+
"cudaCores": 16896,
|
47 |
+
"architecture": "Hopper"
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"name": "NVIDIA H100 80GB HBM3",
|
51 |
+
"memoryTotal": "85520809984",
|
52 |
+
"cudaCores": 16896,
|
53 |
+
"architecture": "Hopper"
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"name": "NVIDIA H100 80GB HBM3",
|
57 |
+
"memoryTotal": "85520809984",
|
58 |
+
"cudaCores": 16896,
|
59 |
+
"architecture": "Hopper"
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"name": "NVIDIA H100 80GB HBM3",
|
63 |
+
"memoryTotal": "85520809984",
|
64 |
+
"cudaCores": 16896,
|
65 |
+
"architecture": "Hopper"
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"name": "NVIDIA H100 80GB HBM3",
|
69 |
+
"memoryTotal": "85520809984",
|
70 |
+
"cudaCores": 16896,
|
71 |
+
"architecture": "Hopper"
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"name": "NVIDIA H100 80GB HBM3",
|
75 |
+
"memoryTotal": "85520809984",
|
76 |
+
"cudaCores": 16896,
|
77 |
+
"architecture": "Hopper"
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"name": "NVIDIA H100 80GB HBM3",
|
81 |
+
"memoryTotal": "85520809984",
|
82 |
+
"cudaCores": 16896,
|
83 |
+
"architecture": "Hopper"
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"name": "NVIDIA H100 80GB HBM3",
|
87 |
+
"memoryTotal": "85520809984",
|
88 |
+
"cudaCores": 16896,
|
89 |
+
"architecture": "Hopper"
|
90 |
+
}
|
91 |
+
],
|
92 |
+
"slurm": {
|
93 |
+
"cluster_name": "haic",
|
94 |
+
"conf": "/usr/local/etc/slurm.conf",
|
95 |
+
"cpus_on_node": "64",
|
96 |
+
"cpus_per_task": "64",
|
97 |
+
"gpus_on_node": "8",
|
98 |
+
"gtids": "0",
|
99 |
+
"job_account": "models",
|
100 |
+
"job_cpus_per_node": "64",
|
101 |
+
"job_end_time": "1731122748",
|
102 |
+
"job_gid": "37",
|
103 |
+
"job_gpus": "0,1,2,3,4,5,6,7",
|
104 |
+
"job_id": "11024",
|
105 |
+
"job_name": "pretrain",
|
106 |
+
"job_nodelist": "haic-hgx-2",
|
107 |
+
"job_num_nodes": "1",
|
108 |
+
"job_partition": "hai",
|
109 |
+
"job_qos": "models",
|
110 |
+
"job_start_time": "1730863548",
|
111 |
+
"job_uid": "377095",
|
112 |
+
"job_user": "belkhale",
|
113 |
+
"jobid": "11024",
|
114 |
+
"localid": "0",
|
115 |
+
"mem_per_node": "102400",
|
116 |
+
"nnodes": "1",
|
117 |
+
"nodeid": "0",
|
118 |
+
"nodelist": "haic-hgx-2",
|
119 |
+
"nprocs": "1",
|
120 |
+
"ntasks": "1",
|
121 |
+
"ntasks_per_node": "1",
|
122 |
+
"prio_process": "0",
|
123 |
+
"procid": "0",
|
124 |
+
"script_context": "prolog_task",
|
125 |
+
"submit_dir": "/hai/scratch/belkhale/openvla-mini",
|
126 |
+
"submit_host": "haic.stanford.edu",
|
127 |
+
"task_pid": "2184784",
|
128 |
+
"tasks_per_node": "1",
|
129 |
+
"topology_addr": "haic-hgx-2",
|
130 |
+
"topology_addr_pattern": "node",
|
131 |
+
"tres_per_task": "cpu=64"
|
132 |
+
},
|
133 |
+
"cudaVersion": "12.4"
|
134 |
+
}
|
wandb/run-20241105_192659-mqdqjqly/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb":{"runtime":18}}
|
wandb/run-20241105_192659-mqdqjqly/logs/debug-core.log
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2024-11-05T19:26:58.953439705-08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/belkhale/tmp1fqrk6pa/port-2184904.txt","pid":2184904,"debug":false,"disable-analytics":false}
|
2 |
+
{"time":"2024-11-05T19:26:58.953464408-08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
|
3 |
+
{"time":"2024-11-05T19:26:58.954979044-08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41281,"Zone":""}}
|
4 |
+
{"time":"2024-11-05T19:26:58.955028412-08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":2184904}
|
5 |
+
{"time":"2024-11-05T19:26:59.144036286-08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:38180"}
|
6 |
+
{"time":"2024-11-05T19:26:59.313342183-08:00","level":"INFO","msg":"handleInformInit: received","streamId":"mqdqjqly","id":"127.0.0.1:38180"}
|
7 |
+
{"time":"2024-11-05T19:26:59.533375776-08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"mqdqjqly","id":"127.0.0.1:38180"}
|
8 |
+
{"time":"2024-11-05T19:27:18.207609388-08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:38180"}
|
9 |
+
{"time":"2024-11-05T19:27:18.208156833-08:00","level":"INFO","msg":"server is shutting down"}
|
10 |
+
{"time":"2024-11-05T19:27:18.20815625-08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:38180"}
|
11 |
+
{"time":"2024-11-05T19:27:18.208292453-08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:38180"}
|
12 |
+
{"time":"2024-11-05T19:27:18.887422598-08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:38180"}
|
13 |
+
{"time":"2024-11-05T19:27:18.887444653-08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:38180"}
|
14 |
+
{"time":"2024-11-05T19:27:18.887460329-08:00","level":"INFO","msg":"server is closed"}
|
wandb/run-20241105_192659-mqdqjqly/logs/debug-internal.log
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2024-11-05T19:26:59.314379425-08:00","level":"INFO","msg":"using version","core version":"0.18.5"}
|
2 |
+
{"time":"2024-11-05T19:26:59.314391049-08:00","level":"INFO","msg":"created symlink","path":"runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_192659-mqdqjqly/logs/debug-core.log"}
|
3 |
+
{"time":"2024-11-05T19:26:59.533339524-08:00","level":"INFO","msg":"created new stream","id":"mqdqjqly"}
|
4 |
+
{"time":"2024-11-05T19:26:59.533372406-08:00","level":"INFO","msg":"stream: started","id":"mqdqjqly"}
|
5 |
+
{"time":"2024-11-05T19:26:59.533420404-08:00","level":"INFO","msg":"sender: started","stream_id":"mqdqjqly"}
|
6 |
+
{"time":"2024-11-05T19:26:59.533413693-08:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"mqdqjqly"}}
|
7 |
+
{"time":"2024-11-05T19:26:59.533432721-08:00","level":"INFO","msg":"handler: started","stream_id":{"value":"mqdqjqly"}}
|
8 |
+
{"time":"2024-11-05T19:26:59.715136674-08:00","level":"INFO","msg":"Starting system monitor"}
|
9 |
+
{"time":"2024-11-05T19:27:18.2081844-08:00","level":"INFO","msg":"stream: closing","id":"mqdqjqly"}
|
10 |
+
{"time":"2024-11-05T19:27:18.20827275-08:00","level":"INFO","msg":"Stopping system monitor"}
|
11 |
+
{"time":"2024-11-05T19:27:18.209227648-08:00","level":"INFO","msg":"Stopped system monitor"}
|
12 |
+
{"time":"2024-11-05T19:27:18.690761255-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
13 |
+
{"time":"2024-11-05T19:27:18.88603778-08:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"mqdqjqly"}}
|
14 |
+
{"time":"2024-11-05T19:27:18.886115295-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"mqdqjqly"}}
|
15 |
+
{"time":"2024-11-05T19:27:18.88614639-08:00","level":"INFO","msg":"sender: closed","stream_id":"mqdqjqly"}
|
16 |
+
{"time":"2024-11-05T19:27:18.887360713-08:00","level":"INFO","msg":"stream: closed","id":"mqdqjqly"}
|
wandb/run-20241105_192659-mqdqjqly/logs/debug.log
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
|
2 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Configure stats pid to 2184904
|
3 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/.config/wandb/settings
|
4 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/openvla-mini/wandb/settings
|
5 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'_service_wait': '300'}
|
6 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
|
7 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'scripts/pretrain.py', 'program_abspath': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py', 'program': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py'}
|
8 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_setup.py:_flush():79] Applying login settings: {}
|
9 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_init.py:_log_setup():534] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_192659-mqdqjqly/logs/debug.log
|
10 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_init.py:_log_setup():535] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_192659-mqdqjqly/logs/debug-internal.log
|
11 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_init.py:init():621] calling init triggers
|
12 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
|
13 |
+
config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-0_5b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 64, 'finetune_per_device_batch_size': 8, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': '/hai/scratch/belkhale/datasets/prismatic-vlms'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None}
|
14 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_init.py:init():671] starting backend
|
15 |
+
2024-11-05 19:26:59,307 INFO MainThread:2184904 [wandb_init.py:init():675] sending inform_init request
|
16 |
+
2024-11-05 19:26:59,308 INFO MainThread:2184904 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
17 |
+
2024-11-05 19:26:59,309 INFO MainThread:2184904 [wandb_init.py:init():688] backend started and connected
|
18 |
+
2024-11-05 19:26:59,310 INFO MainThread:2184904 [wandb_init.py:init():783] updated telemetry
|
19 |
+
2024-11-05 19:26:59,372 INFO MainThread:2184904 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
|
20 |
+
2024-11-05 19:26:59,708 INFO MainThread:2184904 [wandb_init.py:init():867] starting run threads in backend
|
21 |
+
2024-11-05 19:26:59,921 INFO MainThread:2184904 [wandb_run.py:_console_start():2463] atexit reg
|
22 |
+
2024-11-05 19:26:59,921 INFO MainThread:2184904 [wandb_run.py:_redirect():2311] redirect: wrap_raw
|
23 |
+
2024-11-05 19:26:59,921 INFO MainThread:2184904 [wandb_run.py:_redirect():2376] Wrapping output streams.
|
24 |
+
2024-11-05 19:26:59,921 INFO MainThread:2184904 [wandb_run.py:_redirect():2401] Redirects installed.
|
25 |
+
2024-11-05 19:26:59,924 INFO MainThread:2184904 [wandb_init.py:init():911] run started, returning control to user process
|
26 |
+
2024-11-05 19:27:18,208 WARNING MsgRouterThr:2184904 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20241105_192659-mqdqjqly/run-mqdqjqly.wandb
ADDED
Binary file (21 kB). View file
|
|
wandb/run-20241105_193102-jcj67gg8/files/config.yaml
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_wandb:
|
2 |
+
value:
|
3 |
+
cli_version: 0.18.5
|
4 |
+
m: []
|
5 |
+
python_version: 3.10.15
|
6 |
+
t:
|
7 |
+
"1":
|
8 |
+
- 1
|
9 |
+
- 2
|
10 |
+
- 3
|
11 |
+
- 11
|
12 |
+
- 41
|
13 |
+
- 49
|
14 |
+
- 55
|
15 |
+
- 63
|
16 |
+
- 71
|
17 |
+
"2":
|
18 |
+
- 1
|
19 |
+
- 2
|
20 |
+
- 3
|
21 |
+
- 11
|
22 |
+
- 41
|
23 |
+
- 49
|
24 |
+
- 55
|
25 |
+
- 63
|
26 |
+
- 71
|
27 |
+
"3":
|
28 |
+
- 2
|
29 |
+
- 13
|
30 |
+
- 16
|
31 |
+
- 23
|
32 |
+
- 55
|
33 |
+
- 61
|
34 |
+
"4": 3.10.15
|
35 |
+
"5": 0.18.5
|
36 |
+
"6": 4.40.1
|
37 |
+
"8":
|
38 |
+
- 5
|
39 |
+
"12": 0.18.5
|
40 |
+
"13": linux-x86_64
|
41 |
+
dataset:
|
42 |
+
value:
|
43 |
+
align_stage_components:
|
44 |
+
- download/llava-laion-cc-sbu-558k/chat.json
|
45 |
+
- download/llava-laion-cc-sbu-558k
|
46 |
+
dataset_id: llava-v15
|
47 |
+
dataset_root_dir: /hai/scratch/belkhale/datasets/prismatic-vlms
|
48 |
+
finetune_stage_components:
|
49 |
+
- download/llava-v1.5-instruct/llava_v1_5_mix665k.json
|
50 |
+
- download/llava-v1.5-instruct
|
51 |
+
type: llava-v15
|
52 |
+
hf_token:
|
53 |
+
value: .hf_token
|
54 |
+
model:
|
55 |
+
value:
|
56 |
+
align_epochs: 1
|
57 |
+
align_global_batch_size: 96
|
58 |
+
align_learning_rate: 0.001
|
59 |
+
align_lr_scheduler_type: linear-warmup+cosine-decay
|
60 |
+
align_max_grad_norm: 1
|
61 |
+
align_max_steps: null
|
62 |
+
align_per_device_batch_size: 16
|
63 |
+
align_save_every_n_steps: 10000
|
64 |
+
align_train_strategy: fsdp-shard-grad-op
|
65 |
+
align_warmup_ratio: 0.03
|
66 |
+
align_weight_decay: 0
|
67 |
+
arch_specifier: no-align+fused-gelu-mlp
|
68 |
+
enable_gradient_checkpointing: true
|
69 |
+
enable_mixed_precision_training: true
|
70 |
+
finetune_epochs: 2
|
71 |
+
finetune_global_batch_size: 64
|
72 |
+
finetune_learning_rate: 2e-05
|
73 |
+
finetune_lr_scheduler_type: linear-warmup+cosine-decay
|
74 |
+
finetune_max_grad_norm: 1
|
75 |
+
finetune_max_steps: null
|
76 |
+
finetune_per_device_batch_size: 4
|
77 |
+
finetune_save_every_n_steps: 10000
|
78 |
+
finetune_train_strategy: fsdp-full-shard
|
79 |
+
finetune_warmup_ratio: 0.03
|
80 |
+
finetune_weight_decay: 0.1
|
81 |
+
image_resize_strategy: resize-naive
|
82 |
+
llm_backbone_id: qwen25-0_5b-extra
|
83 |
+
llm_max_length: 32768
|
84 |
+
model_id: prism-qwen25-extra-dinosiglip-224px+0_5b
|
85 |
+
reduce_in_full_precision: false
|
86 |
+
type: prism-qwen25-extra-dinosiglip-224px+0_5b
|
87 |
+
vision_backbone_id: dinosiglip-vit-so-224px
|
88 |
+
pretrained_checkpoint:
|
89 |
+
value: null
|
90 |
+
run_id:
|
91 |
+
value: prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7
|
92 |
+
run_root_dir:
|
93 |
+
value: runs
|
94 |
+
seed:
|
95 |
+
value: 7
|
96 |
+
stage:
|
97 |
+
value: finetune
|
98 |
+
trackers:
|
99 |
+
value:
|
100 |
+
- jsonl
|
101 |
+
- wandb
|
102 |
+
wandb_entity:
|
103 |
+
value: null
|
104 |
+
wandb_project:
|
105 |
+
value: prismatic
|
wandb/run-20241105_193102-jcj67gg8/files/output.log
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
11/05 [19:31:03] INFO | >> [*] Starting Training Loop pretrain.py:227
|
2 |
+
|
3 |
+
11/05 [23:38:31] INFO | >> [*] Done with Training =>> pretrain.py:231
|
4 |
+
Finalizing Metrics
|
wandb/run-20241105_193102-jcj67gg8/files/wandb-metadata.json
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-116-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.15",
|
4 |
+
"startedAt": "2024-11-06T03:31:02.513650Z",
|
5 |
+
"args": [
|
6 |
+
"--model.type",
|
7 |
+
"prism-qwen25-extra-dinosiglip-224px+0_5b",
|
8 |
+
"--model.finetune_global_batch_size",
|
9 |
+
"64",
|
10 |
+
"--model.finetune_per_device_batch_size",
|
11 |
+
"4"
|
12 |
+
],
|
13 |
+
"program": "/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py",
|
14 |
+
"codePath": "scripts/pretrain.py",
|
15 |
+
"git": {
|
16 |
+
"remote": "[email protected]:Stanford-ILIAD/openvla-mini.git",
|
17 |
+
"commit": "05073927b096dab7d326a3e39db9262f08d3a8ae"
|
18 |
+
},
|
19 |
+
"email": "[email protected]",
|
20 |
+
"root": "runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7",
|
21 |
+
"host": "haic-hgx-2.stanford.edu",
|
22 |
+
"username": "belkhale",
|
23 |
+
"executable": "/hai/scratch/belkhale/miniforge3/envs/vla/bin/python3.10",
|
24 |
+
"codePathLocal": "scripts/pretrain.py",
|
25 |
+
"cpu_count": 112,
|
26 |
+
"cpu_count_logical": 224,
|
27 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
28 |
+
"gpu_count": 8,
|
29 |
+
"disk": {
|
30 |
+
"/": {
|
31 |
+
"total": "942725181440",
|
32 |
+
"used": "50880847872"
|
33 |
+
}
|
34 |
+
},
|
35 |
+
"memory": {
|
36 |
+
"total": "2164104577024"
|
37 |
+
},
|
38 |
+
"cpu": {
|
39 |
+
"count": 112,
|
40 |
+
"countLogical": 224
|
41 |
+
},
|
42 |
+
"gpu_nvidia": [
|
43 |
+
{
|
44 |
+
"name": "NVIDIA H100 80GB HBM3",
|
45 |
+
"memoryTotal": "85520809984",
|
46 |
+
"cudaCores": 16896,
|
47 |
+
"architecture": "Hopper"
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"name": "NVIDIA H100 80GB HBM3",
|
51 |
+
"memoryTotal": "85520809984",
|
52 |
+
"cudaCores": 16896,
|
53 |
+
"architecture": "Hopper"
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"name": "NVIDIA H100 80GB HBM3",
|
57 |
+
"memoryTotal": "85520809984",
|
58 |
+
"cudaCores": 16896,
|
59 |
+
"architecture": "Hopper"
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"name": "NVIDIA H100 80GB HBM3",
|
63 |
+
"memoryTotal": "85520809984",
|
64 |
+
"cudaCores": 16896,
|
65 |
+
"architecture": "Hopper"
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"name": "NVIDIA H100 80GB HBM3",
|
69 |
+
"memoryTotal": "85520809984",
|
70 |
+
"cudaCores": 16896,
|
71 |
+
"architecture": "Hopper"
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"name": "NVIDIA H100 80GB HBM3",
|
75 |
+
"memoryTotal": "85520809984",
|
76 |
+
"cudaCores": 16896,
|
77 |
+
"architecture": "Hopper"
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"name": "NVIDIA H100 80GB HBM3",
|
81 |
+
"memoryTotal": "85520809984",
|
82 |
+
"cudaCores": 16896,
|
83 |
+
"architecture": "Hopper"
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"name": "NVIDIA H100 80GB HBM3",
|
87 |
+
"memoryTotal": "85520809984",
|
88 |
+
"cudaCores": 16896,
|
89 |
+
"architecture": "Hopper"
|
90 |
+
}
|
91 |
+
],
|
92 |
+
"slurm": {
|
93 |
+
"cluster_name": "haic",
|
94 |
+
"conf": "/usr/local/etc/slurm.conf",
|
95 |
+
"cpus_on_node": "64",
|
96 |
+
"cpus_per_task": "64",
|
97 |
+
"gpus_on_node": "8",
|
98 |
+
"gtids": "0",
|
99 |
+
"job_account": "models",
|
100 |
+
"job_cpus_per_node": "64",
|
101 |
+
"job_end_time": "1731122999",
|
102 |
+
"job_gid": "37",
|
103 |
+
"job_gpus": "0,1,2,3,4,5,6,7",
|
104 |
+
"job_id": "11026",
|
105 |
+
"job_name": "pretrain",
|
106 |
+
"job_nodelist": "haic-hgx-2",
|
107 |
+
"job_num_nodes": "1",
|
108 |
+
"job_partition": "hai",
|
109 |
+
"job_qos": "models",
|
110 |
+
"job_start_time": "1730863799",
|
111 |
+
"job_uid": "377095",
|
112 |
+
"job_user": "belkhale",
|
113 |
+
"jobid": "11026",
|
114 |
+
"localid": "0",
|
115 |
+
"mem_per_node": "102400",
|
116 |
+
"nnodes": "1",
|
117 |
+
"nodeid": "0",
|
118 |
+
"nodelist": "haic-hgx-2",
|
119 |
+
"nprocs": "1",
|
120 |
+
"ntasks": "1",
|
121 |
+
"ntasks_per_node": "1",
|
122 |
+
"prio_process": "0",
|
123 |
+
"procid": "0",
|
124 |
+
"script_context": "prolog_task",
|
125 |
+
"submit_dir": "/hai/scratch/belkhale/openvla-mini",
|
126 |
+
"submit_host": "haic.stanford.edu",
|
127 |
+
"task_pid": "2187908",
|
128 |
+
"tasks_per_node": "1",
|
129 |
+
"topology_addr": "haic-hgx-2",
|
130 |
+
"topology_addr_pattern": "node",
|
131 |
+
"tres_per_task": "cpu=64"
|
132 |
+
},
|
133 |
+
"cudaVersion": "12.4"
|
134 |
+
}
|
wandb/run-20241105_193102-jcj67gg8/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_wandb":{"runtime":14849},"_runtime":14849.432571063,"_step":20792,"Finetune/Loss":0.734754204750061,"Finetune/Loss (Raw)":0.7624474763870239,"Finetune/Learning Rate":0,"Finetune/Step Time":0.7374007441103458,"_timestamp":1.7308787040734835e+09,"Finetune/Step":20792}
|
wandb/run-20241105_193102-jcj67gg8/logs/debug-core.log
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2024-11-05T19:31:02.167132681-08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/belkhale/tmpowkszwq0/port-2188020.txt","pid":2188020,"debug":false,"disable-analytics":false}
|
2 |
+
{"time":"2024-11-05T19:31:02.167154904-08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
|
3 |
+
{"time":"2024-11-05T19:31:02.168180089-08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":2188020}
|
4 |
+
{"time":"2024-11-05T19:31:02.168180088-08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":35793,"Zone":""}}
|
5 |
+
{"time":"2024-11-05T19:31:02.357031058-08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:51484"}
|
6 |
+
{"time":"2024-11-05T19:31:02.518306545-08:00","level":"INFO","msg":"handleInformInit: received","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
|
7 |
+
{"time":"2024-11-05T19:31:02.738838516-08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
|
8 |
+
{"time":"2024-11-05T23:38:34.554528568-08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
|
9 |
+
{"time":"2024-11-05T23:38:34.558563756-08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"jcj67gg8","id":"127.0.0.1:51484"}
|
10 |
+
{"time":"2024-11-05T23:42:06.504927152-08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:51484"}
|
11 |
+
{"time":"2024-11-05T23:42:06.505861575-08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:51484"}
|
12 |
+
{"time":"2024-11-05T23:42:06.505880856-08:00","level":"INFO","msg":"server is shutting down"}
|
13 |
+
{"time":"2024-11-05T23:42:06.50594903-08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:51484"}
|
14 |
+
{"time":"2024-11-05T23:42:06.506095025-08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:51484"}
|
15 |
+
{"time":"2024-11-05T23:42:06.506152479-08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:51484"}
|
16 |
+
{"time":"2024-11-05T23:42:06.506171224-08:00","level":"INFO","msg":"server is closed"}
|
wandb/run-20241105_193102-jcj67gg8/logs/debug-internal.log
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2024-11-05T19:31:02.519998294-08:00","level":"INFO","msg":"using version","core version":"0.18.5"}
|
2 |
+
{"time":"2024-11-05T19:31:02.520021589-08:00","level":"INFO","msg":"created symlink","path":"runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug-core.log"}
|
3 |
+
{"time":"2024-11-05T19:31:02.738801523-08:00","level":"INFO","msg":"created new stream","id":"jcj67gg8"}
|
4 |
+
{"time":"2024-11-05T19:31:02.738835256-08:00","level":"INFO","msg":"stream: started","id":"jcj67gg8"}
|
5 |
+
{"time":"2024-11-05T19:31:02.738905513-08:00","level":"INFO","msg":"sender: started","stream_id":"jcj67gg8"}
|
6 |
+
{"time":"2024-11-05T19:31:02.738892436-08:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"jcj67gg8"}}
|
7 |
+
{"time":"2024-11-05T19:31:02.738902832-08:00","level":"INFO","msg":"handler: started","stream_id":{"value":"jcj67gg8"}}
|
8 |
+
{"time":"2024-11-05T19:31:03.056521545-08:00","level":"INFO","msg":"Starting system monitor"}
|
9 |
+
{"time":"2024-11-05T23:38:31.946246118-08:00","level":"INFO","msg":"Stopping system monitor"}
|
10 |
+
{"time":"2024-11-05T23:38:32.023973784-08:00","level":"INFO","msg":"Stopped system monitor"}
|
11 |
+
{"time":"2024-11-05T23:38:32.922341453-08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.775009129}],"total_operations":1}}
|
12 |
+
{"time":"2024-11-05T23:38:33.390878425-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
13 |
+
{"time":"2024-11-05T23:38:34.555591357-08:00","level":"INFO","msg":"stream: closing","id":"jcj67gg8"}
|
14 |
+
{"time":"2024-11-05T23:38:34.555667186-08:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"jcj67gg8"}}
|
15 |
+
{"time":"2024-11-05T23:38:34.555706801-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"jcj67gg8"}}
|
16 |
+
{"time":"2024-11-05T23:38:34.555872-08:00","level":"INFO","msg":"sender: closed","stream_id":"jcj67gg8"}
|
17 |
+
{"time":"2024-11-05T23:38:34.558526671-08:00","level":"INFO","msg":"stream: closed","id":"jcj67gg8"}
|
wandb/run-20241105_193102-jcj67gg8/logs/debug.log
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
|
2 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Configure stats pid to 2188020
|
3 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/.config/wandb/settings
|
4 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from /hai/scratch/belkhale/openvla-mini/wandb/settings
|
5 |
+
2024-11-05 19:31:02,511 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'_service_wait': '300'}
|
6 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
|
7 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'scripts/pretrain.py', 'program_abspath': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py', 'program': '/hai/scratch/belkhale/openvla-mini/scripts/pretrain.py'}
|
8 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_setup.py:_flush():79] Applying login settings: {}
|
9 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:_log_setup():534] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug.log
|
10 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:_log_setup():535] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7/wandb/run-20241105_193102-jcj67gg8/logs/debug-internal.log
|
11 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():621] calling init triggers
|
12 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
|
13 |
+
config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-0_5b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 64, 'finetune_per_device_batch_size': 4, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': '/hai/scratch/belkhale/datasets/prismatic-vlms'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+0_5b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None}
|
14 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():671] starting backend
|
15 |
+
2024-11-05 19:31:02,512 INFO MainThread:2188020 [wandb_init.py:init():675] sending inform_init request
|
16 |
+
2024-11-05 19:31:02,513 INFO MainThread:2188020 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
17 |
+
2024-11-05 19:31:02,513 INFO MainThread:2188020 [wandb_init.py:init():688] backend started and connected
|
18 |
+
2024-11-05 19:31:02,515 INFO MainThread:2188020 [wandb_init.py:init():783] updated telemetry
|
19 |
+
2024-11-05 19:31:02,573 INFO MainThread:2188020 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
|
20 |
+
2024-11-05 19:31:03,050 INFO MainThread:2188020 [wandb_init.py:init():867] starting run threads in backend
|
21 |
+
2024-11-05 19:31:03,226 INFO MainThread:2188020 [wandb_run.py:_console_start():2463] atexit reg
|
22 |
+
2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2311] redirect: wrap_raw
|
23 |
+
2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2376] Wrapping output streams.
|
24 |
+
2024-11-05 19:31:03,227 INFO MainThread:2188020 [wandb_run.py:_redirect():2401] Redirects installed.
|
25 |
+
2024-11-05 19:31:03,230 INFO MainThread:2188020 [wandb_init.py:init():911] run started, returning control to user process
|
26 |
+
2024-11-05 23:38:31,920 INFO MainThread:2188020 [wandb_run.py:_finish():2158] finishing run belkhale/prismatic/jcj67gg8
|
27 |
+
2024-11-05 23:38:31,920 INFO MainThread:2188020 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
|
28 |
+
2024-11-05 23:38:31,921 INFO MainThread:2188020 [wandb_run.py:_restore():2408] restore
|
29 |
+
2024-11-05 23:38:31,921 INFO MainThread:2188020 [wandb_run.py:_restore():2414] restore done
|
30 |
+
2024-11-05 23:38:34,516 INFO MainThread:2188020 [wandb_run.py:_footer_history_summary_info():3975] rendering history
|
31 |
+
2024-11-05 23:38:34,517 INFO MainThread:2188020 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
|
32 |
+
2024-11-05 23:38:34,534 INFO MainThread:2188020 [wandb_run.py:_footer_sync_info():3934] logging synced files
|
wandb/run-20241105_193102-jcj67gg8/run-jcj67gg8.wandb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b1e028b995d945fb180c85455d7219269515a28888eda671f635380e4dac0d23
|
3 |
+
size 37709802
|