# mathy-vicuna-13B-FFT / Aug11_01-11-56_watgpu-100.train.sh
export NCCL_BLOCKING_WAIT=1  # make NCCL collectives blocking so failures surface as timeout errors instead of hangs
export NCCL_IB_DISABLE=1     # disable the InfiniBand transport (fall back to IP sockets)
export NCCL_DEBUG=INFO       # verbose NCCL logging
export NCCL_P2P_DISABLE=1    # disable direct GPU-to-GPU (P2P) transfers over NVLink/PCIe
# See https://github.com/NVIDIA/nccl/issues/631
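# (Optional) Before deciding whether P2P needs to be disabled, the GPU
# interconnect topology (NVLink vs. PCIe paths) can be inspected with:
#   nvidia-smi topo -m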
#export TORCH_DISTRIBUTED_DEBUG=DETAIL
export TORCH_DISTRIBUTED_DEBUG=OFF
#--data_file ./data/finetune-pairs.json \
deepspeed \
--include=localhost:4,5,6,7 \
--master_port 8921 \
train.py \
\
--model_name_or_path lmsys/vicuna-13b-v1.5 \
--data_file ./data/finetune-pairs.json \
--debug_single_layer False \
--dryrun False \
--use_lora False \
--ctx_length 2048 \
--datamap_nprocs 10 \
--use_flash_att2 True \
--load_8bit False \
--num_train_epochs 3 \
\
--output_dir ./output \
--save_strategy "steps" \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 1 \
--report_to "tensorboard" \
\
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 12 \
--max_grad_norm 1.0 \
--learning_rate 2e-5 \
--warmup_ratio 0.03 \
--fp16 False \
--bf16 True \
--deepspeed $(python ds_config.py \
--en_param_offload False \
--en_act_ackpt False \
--en_sparse_attn False \
)
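
# Effective batch size per optimizer step, worked out from the flags above:
#   4 GPUs (localhost:4,5,6,7) x per_device_train_batch_size 1
#   x gradient_accumulation_steps 12 = 48 sequences.
#
# The --deepspeed flag expects a path to a DeepSpeed JSON config; ds_config.py
# (repo-local) is assumed to write such a config and print its path. A minimal
# hand-written config matching the flags above might look roughly like this
# (hedged sketch only; the keys are standard DeepSpeed options, but the actual
# ds_config.py output, e.g. the ZeRO stage, may differ):
#   {
#     "train_micro_batch_size_per_gpu": 1,
#     "gradient_accumulation_steps": 12,
#     "gradient_clipping": 1.0,
#     "bf16": { "enabled": true },
#     "zero_optimization": { "stage": 2 }
#   }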