export NCCL_BLOCKING_WAIT=1  # make NCCL collectives blocking so timeouts/errors raise instead of hanging
export NCCL_IB_DISABLE=1     # disable the InfiniBand transport (fall back to TCP sockets)
export NCCL_DEBUG=INFO
export NCCL_P2P_DISABLE=1    # disable direct GPU-to-GPU (P2P) transfers over NVLink/PCIe
# See https://github.com/NVIDIA/nccl/issues/631
#export TORCH_DISTRIBUTED_DEBUG=DETAIL
export TORCH_DISTRIBUTED_DEBUG=OFF
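
# Optional sanity check (not part of the run): confirm the NCCL settings above
# are in the environment the launcher will inherit:
#   python -c 'import os; print({k: v for k, v in os.environ.items() if k.startswith("NCCL")})'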
# Launch on GPUs 4-7 of this node; --include takes <host>:<comma-separated GPU indices>.
deepspeed \
--include=localhost:4,5,6,7 \
--master_port 8921 \
train.py \
\
--model_name_or_path lmsys/vicuna-13b-v1.5 \
--data_file ./data/finetune-pairs.json \
--debug_single_layer False \
--dryrun False \
--use_lora False \
--ctx_length 2048 \
--datamap_nprocs 10 \
--use_flash_att2 True \
--load_8bit False \
--num_train_epochs 3 \
\
--output_dir ./output \
--save_strategy "steps" \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 1 \
--report_to "tensorboard" \
\
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 12 \
--max_grad_norm 1.0 \
--learning_rate 2e-5 \
--warmup_ratio 0.03 \
--fp16 False \
--bf16 True \
--deepspeed $(python ds_config.py \
--en_param_offload False \
--en_act_ackpt False \
--en_sparse_attn False \
)
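
# ds_config.py is not shown here; the launcher only assumes it prints the path
# of a generated DeepSpeed JSON config to stdout, which $(...) splices into
# --deepspeed. A minimal hypothetical sketch of that contract (the flag names
# match the call above, but the config contents, e.g. the ZeRO stage, are
# assumptions, not the project's actual settings):
#
#   import argparse, json, tempfile
#
#   def str2bool(v):
#       return str(v).lower() in ("true", "1", "yes")
#
#   p = argparse.ArgumentParser()
#   p.add_argument("--en_param_offload", type=str2bool, default=False)
#   p.add_argument("--en_act_ackpt", type=str2bool, default=False)
#   p.add_argument("--en_sparse_attn", type=str2bool, default=False)
#   args = p.parse_args()
#
#   cfg = {
#       "train_micro_batch_size_per_gpu": "auto",
#       "gradient_accumulation_steps": "auto",
#       "bf16": {"enabled": "auto"},
#       "zero_optimization": {"stage": 3},
#   }
#   if args.en_param_offload:
#       cfg["zero_optimization"]["offload_param"] = {"device": "cpu"}
#   if args.en_act_ackpt:
#       cfg["activation_checkpointing"] = {"partition_activations": True}
#   # (en_sparse_attn is accepted but left unused in this sketch)
#
#   with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
#       json.dump(cfg, f)
#   print(f.name)  # the path consumed by --deepspeed above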