#!/usr/bin/env bash
# Launch a DeepSpeed fine-tuning run of lmsys/vicuna-13b-v1.5 on local
# GPUs 4-7, with the DeepSpeed JSON config generated on the fly by
# ds_config.py. Requires: deepspeed, train.py, ds_config.py, and
# ./data/finetune-pairs.json in the working directory.
set -euo pipefail

# --- NCCL backend settings ---------------------------------------------------
# See https://github.com/NVIDIA/nccl/issues/631
export NCCL_BLOCKING_WAIT=1   # blocking waits with the NCCL backend
                              # NOTE(review): deprecated in newer PyTorch in
                              # favor of TORCH_NCCL_BLOCKING_WAIT — confirm
export NCCL_IB_DISABLE=1      # disable InfiniBand transport
export NCCL_DEBUG=INFO
export NCCL_P2P_DISABLE=1     # disable direct access between GPUs
                              # (NVLink or PCI peer-to-peer)

#export TORCH_DISTRIBUTED_DEBUG=DETAIL
export TORCH_DISTRIBUTED_DEBUG=OFF

#--data_file ./data/finetune-pairs.json
deepspeed \
  --include=localhost:4,5,6,7 \
  --master_port 8921 \
  train.py \
  \
  --model_name_or_path lmsys/vicuna-13b-v1.5 \
  --data_file ./data/finetune-pairs.json \
  --debug_single_layer False \
  --dryrun False \
  --use_lora False \
  --ctx_length 2048 \
  --datamap_nprocs 10 \
  --use_flash_att2 True \
  --load_8bit False \
  --num_train_epochs 3 \
  \
  --output_dir ./output \
  --save_strategy "steps" \
  --save_steps 100 \
  --save_total_limit 2 \
  --logging_steps 1 \
  --report_to "tensorboard" \
  \
  --per_device_train_batch_size 1 \
  --gradient_accumulation_steps 12 \
  --max_grad_norm 1.0 \
  --learning_rate 2e-5 \
  --warmup_ratio 0.03 \
  --fp16 False \
  --bf16 True \
  --deepspeed "$(python ds_config.py \
    --en_param_offload False \
    --en_act_ackpt False \
    --en_sparse_attn False \
  )"
  # Quoted substitution: prevents word-splitting of the generated config
  # path/string (SC2046); with 'set -e', a ds_config.py failure aborts the
  # launch instead of passing an empty --deepspeed argument.