# PPO training configuration (LoRA fine-tuning of Llama-3.1-8B-Instruct).
# NOTE(review): key names appear to follow the LLaMA-Factory training-config
# schema -- confirm against the version of the trainer that consumes this file.
#
# Values must be plain YAML scalars with nothing after them: a trailing
# table-cell "|" would turn booleans and numbers into strings, and a bare "|"
# line is parsed as a block-scalar indicator rather than as a blank line.

### model
model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
reward_model: chchen/Llama-3.1-8B-Instruct-reward-500
trust_remote_code: true

### method
stage: ppo
do_train: true
finetuning_type: lora
lora_target: all

### dataset
dataset: bct_non_cot_sft_500
dataset_dir: data_private
template: llama3
cutoff_len: 1024
overwrite_cache: true
preprocessing_num_workers: 16

### output
output_dir: saves/Llama-3.1-8B-Instruct/ppo-500/train
logging_steps: 10
save_steps: 50
plot_loss: true
overwrite_output_dir: true
save_total_limit: 3
push_to_hub: true
hub_model_id: chchen/Llama-3.1-8B-Instruct-2410-ppo-500

### train
per_device_train_batch_size: 4
gradient_accumulation_steps: 8
learning_rate: 1.0e-5
num_train_epochs: 10.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### generate
max_new_tokens: 512
top_k: 0
top_p: 0.9