|
myrank: 0 local_rank: 0 device_count: 8 world_size: 1 device: cuda:4 |
|
==================================================================================================== |
|
- platform : local |
|
- local_rank : 0 |
|
- rank : 0 |
|
- device : cuda:4 |
|
- world_size : 1 |
|
- random_seed : 110 |
|
- lr : 0.0002 |
|
- weight_decay : 0.01 |
|
- correct_bias : True |
|
- adam_epislon : 1e-06 |
|
- no_decay_bias : False |
|
- adam_beta1 : 0.9 |
|
- adam_beta2 : 0.999 |
|
- scheduler : linear |
|
- max_step : None |
|
- max_epoch : 5 |
|
- warmup_step : 500 |
|
- i_steps : 0 |
|
- i_lrs : 0.00025 |
|
- train_data : ./data/e2e/train.jsonl |
|
- valid_data : ./data/e2e/valid.jsonl |
|
- train_batch_size : 8 |
|
- valid_batch_size : 4 |
|
- grad_acc : 1 |
|
- clip : 0.0 |
|
- seq_len : 512 |
|
- model_card : gpt2.md |
|
- init_checkpoint : ./pretrained_checkpoints/gpt2-medium-pytorch_model.bin |
|
- fp16 : False |
|
- log_interval : 100 |
|
- eval_interval : 2000 |
|
- save_interval : 1000 |
|
- work_dir : ./trained_models/GPT2_M/e2e |
|
- lora_dim : 4 |
|
- lora_alpha : 32 |
|
- obj : clm |
|
- lora_dropout : 0.1 |
|
- label_smooth : 0.1 |
|
- roll_interval : -1 |
|
- roll_lr : 1e-05 |
|
- roll_step : 100 |
|
- eval_epoch : 1 |
|
- dist : <module 'torch.distributed' from '/home/inc/miniconda3/envs/fedadp-new/lib/python3.7/site-packages/torch/distributed/__init__.py'> |
|
==================================================================================================== |
|
Experiment dir : ./trained_models/GPT2_M/e2e |
|
train_loader=5258, train_data=42064 |
|
valid_loader=1168, valid_data=4672 |
|
scaling = 8.0 |
|
loading model pretrained weight. |
|
GPT2LMModel( |
|
(transformer): GPT2Model( |
|
(wte): Embedding(50257, 1024) |
|
(wpe): Embedding(1024, 1024) |
|
(h): ModuleList( |
|
(0): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(1): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(2): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(3): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(4): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(5): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(6): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(7): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(8): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(9): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(10): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(11): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(12): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(13): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(14): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(15): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(16): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(17): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(18): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(19): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(20): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(21): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(22): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
(23): Block( |
|
(ln_1): LayerNorm() |
|
(attn): Attention( |
|
(c_attn): MergedLinear( |
|
in_features=1024, out_features=3072, bias=True |
|
(lora_dropout): Dropout(p=0.1, inplace=False) |
|
) |
|
(c_proj): Conv1D() |
|
) |
|
(ln_2): LayerNorm() |
|
(mlp): MLP( |
|
(c_fc): Conv1D() |
|
(c_proj): Conv1D() |
|
) |
|
) |
|
) |
|
(ln_f): LayerNorm() |
|
) |
|
(lm_head): GPT2LMHead( |
|
(decoder): Linear(in_features=1024, out_features=50257, bias=False) |
|
) |
|
) |
|
vocab_size: 50257 |
|
n_ctx: 1024 |
|
n_positions: 1024 |
|
n_embd: 1024 |
|
n_layer: 24 |
|
n_head: 16 |
|
layer_norm_epsilon: 1e-05 |
|
initializer_range: 0.02 |
|
lora_attn_dim: 4 |
|
lora_attn_alpha: 32 |
|
lora_dropout: 0.1 |
|
lora_r_dropout: 0.0 |
|
fix_dropout: 0.0 |
|
Namespace(adam_beta1=0.9, adam_beta2=0.999, adam_epislon=1e-06, clip=0.0, correct_bias=True, device=device(type='cuda', index=4), dist=<module 'torch.distributed' from '/home/inc/miniconda3/envs/fedadp-new/lib/python3.7/site-packages/torch/distributed/__init__.py'>, eval_epoch=1, eval_interval=2000, fp16=False, grad_acc=1, i_lrs='0.00025', i_steps='0', init_checkpoint='./pretrained_checkpoints/gpt2-medium-pytorch_model.bin', label_smooth=0.1, local_rank=0, log_interval=100, logging=functools.partial(<function logging at 0x7f90cac2ae60>, log_path='./trained_models/GPT2_M/e2e/log.txt'), lora_alpha=32, lora_dim=4, lora_dropout=0.1, lr=0.0002, max_epoch=5, max_step=None, model_card='gpt2.md', no_decay_bias=False, obj='clm', platform='local', random_seed=110, rank=0, roll_interval=-1, roll_lr=1e-05, roll_step=100, save_interval=1000, scheduler='linear', seq_len=512, train_batch_size=8, train_data='./data/e2e/train.jsonl', valid_batch_size=4, valid_data='./data/e2e/valid.jsonl', warmup_step=500, weight_decay=0.01, work_dir='./trained_models/GPT2_M/e2e', world_size=1) |
|
optimizer: AdamW ( |
|
Parameter Group 0 |
|
betas: (0.9, 0.999) |
|
correct_bias: True |
|
eps: 1e-06 |
|
lr: 0.0002 |
|
weight_decay: 0.01 |
|
) |
|
set max_step: 26290 |
|
train_data.num_batches: 5258 |
|
start to train the model................ 1 |
|
/home/inc/Documents/fzh/python/LoRA-main/examples/NLG/src/optimizer.py:117: UserWarning: This overload of addcdiv_ is deprecated: |
|
addcdiv_(Number value, Tensor tensor1, Tensor tensor2) |
|
Consider using one of the following signatures instead: |
|
addcdiv_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1050.) |
|
p.data.addcdiv_(-step_size, exp_avg, denom) |
|
|
|
|
|
| epoch 1 step 100 | 100 batches | lr 4e-05 | ms/batch 612.69 | loss 5.06 | avg loss 5.52 | ppl 250.72 |
|
| epoch 1 step 200 | 200 batches | lr 8e-05 | ms/batch 608.52 | loss 3.21 | avg loss 3.70 | ppl 40.58 |
|
| epoch 1 step 300 | 300 batches | lr 0.00012 | ms/batch 609.77 | loss 2.98 | avg loss 3.08 | ppl 21.74 |
|
| epoch 1 step 400 | 400 batches | lr 0.00016 | ms/batch 610.18 | loss 3.11 | avg loss 2.98 | ppl 19.63 |
|
| epoch 1 step 500 | 500 batches | lr 0.0002 | ms/batch 610.03 | loss 2.84 | avg loss 2.89 | ppl 18.03 |
|
| epoch 1 step 600 | 600 batches | lr 0.000199 | ms/batch 608.84 | loss 2.77 | avg loss 2.83 | ppl 16.93 |
|
| epoch 1 step 700 | 700 batches | lr 0.000198 | ms/batch 611.37 | loss 2.88 | avg loss 2.80 | ppl 16.37 |
|
| epoch 1 step 800 | 800 batches | lr 0.000198 | ms/batch 611.10 | loss 2.48 | avg loss 2.76 | ppl 15.76 |
|
| epoch 1 step 900 | 900 batches | lr 0.000197 | ms/batch 610.61 | loss 2.50 | avg loss 2.75 | ppl 15.59 |
|
| epoch 1 step 1000 | 1000 batches | lr 0.000196 | ms/batch 610.44 | loss 3.19 | avg loss 2.77 | ppl 15.95 |
|
saving checkpoint ./trained_models/GPT2_M/e2e/model.1000.pt |
|
| epoch 1 step 1100 | 1100 batches | lr 0.000195 | ms/batch 612.14 | loss 2.76 | avg loss 2.73 | ppl 15.41 |
|
| epoch 1 step 1200 | 1200 batches | lr 0.000195 | ms/batch 608.16 | loss 3.02 | avg loss 2.76 | ppl 15.84 |
|
| epoch 1 step 1300 | 1300 batches | lr 0.000194 | ms/batch 610.06 | loss 2.55 | avg loss 2.75 | ppl 15.62 |
|
| epoch 1 step 1400 | 1400 batches | lr 0.000193 | ms/batch 609.24 | loss 2.35 | avg loss 2.70 | ppl 14.93 |
|
| epoch 1 step 1500 | 1500 batches | lr 0.000192 | ms/batch 607.91 | loss 2.53 | avg loss 2.72 | ppl 15.24 |
|
| epoch 1 step 1600 | 1600 batches | lr 0.000191 | ms/batch 608.62 | loss 2.53 | avg loss 2.67 | ppl 14.50 |
|
| epoch 1 step 1700 | 1700 batches | lr 0.000191 | ms/batch 608.92 | loss 2.66 | avg loss 2.71 | ppl 14.99 |
|
| epoch 1 step 1800 | 1800 batches | lr 0.00019 | ms/batch 608.44 | loss 2.55 | avg loss 2.69 | ppl 14.75 |
|
| epoch 1 step 1900 | 1900 batches | lr 0.000189 | ms/batch 609.27 | loss 2.43 | avg loss 2.66 | ppl 14.31 |
|
| epoch 1 step 2000 | 2000 batches | lr 0.000188 | ms/batch 607.05 | loss 2.71 | avg loss 2.66 | ppl 14.36 |
|
saving checkpoint ./trained_models/GPT2_M/e2e/model.2000.pt |
|
/home/inc/miniconda3/envs/fedadp-new/lib/python3.7/site-packages/torch/nn/_reduction.py:42: UserWarning: size_average and reduce args will be deprecated, please use reduction='none' instead. |
|
warnings.warn(warning.format(ret)) |
|
eval samples: 0 loss: tensor(1.1374, device='cuda:4') |
|
eval samples: 100 loss: tensor(1.0985, device='cuda:4') |
|
eval samples: 200 loss: tensor(1.2215, device='cuda:4') |
|
eval samples: 300 loss: tensor(1.2918, device='cuda:4') |
|
eval samples: 400 loss: tensor(1.6716, device='cuda:4') |
|
eval samples: 500 loss: tensor(1.9854, device='cuda:4') |
|
eval samples: 600 loss: tensor(1.2216, device='cuda:4') |
|
eval samples: 700 loss: tensor(1.0347, device='cuda:4') |
|
eval samples: 800 loss: tensor(1.5289, device='cuda:4') |
|
eval samples: 900 loss: tensor(1.5743, device='cuda:4') |
|
eval samples: 1000 loss: tensor(1.3339, device='cuda:4') |
|
eval samples: 1100 loss: tensor(1.3198, device='cuda:4') |
|
average loss 1.3344345796496084 |
|
---------------------------------------------------------------------------------------------------- |
|
| Eval 1 at step 2000 | time: 137.89s | valid loss 1.33 | valid ppl 3.80 | best ppl 3.80 |
|
---------------------------------------------------------------------------------------------------- |
|
| epoch 1 step 2100 | 2100 batches | lr 0.000188 | ms/batch 1988.14 | loss 2.64 | avg loss 2.68 | ppl 14.57 |
|
| epoch 1 step 2200 | 2200 batches | lr 0.000187 | ms/batch 608.77 | loss 2.45 | avg loss 2.66 | ppl 14.34 |
|
| epoch 1 step 2300 | 2300 batches | lr 0.000186 | ms/batch 610.52 | loss 2.60 | avg loss 2.67 | ppl 14.38 |
|
| epoch 1 step 2400 | 2400 batches | lr 0.000185 | ms/batch 608.14 | loss 2.70 | avg loss 2.67 | ppl 14.49 |
|
| epoch 1 step 2500 | 2500 batches | lr 0.000184 | ms/batch 607.87 | loss 2.52 | avg loss 2.64 | ppl 14.05 |
|
| epoch 1 step 2600 | 2600 batches | lr 0.000184 | ms/batch 608.44 | loss 2.54 | avg loss 2.70 | ppl 14.85 |
|
| epoch 1 step 2700 | 2700 batches | lr 0.000183 | ms/batch 608.49 | loss 2.87 | avg loss 2.69 | ppl 14.72 |
|
| epoch 1 step 2800 | 2800 batches | lr 0.000182 | ms/batch 608.82 | loss 2.44 | avg loss 2.66 | ppl 14.26 |
|
| epoch 1 step 2900 | 2900 batches | lr 0.000181 | ms/batch 609.19 | loss 2.69 | avg loss 2.68 | ppl 14.52 |
|
| epoch 1 step 3000 | 3000 batches | lr 0.000181 | ms/batch 609.05 | loss 2.73 | avg loss 2.64 | ppl 13.99 |
|
saving checkpoint ./trained_models/GPT2_M/e2e/model.3000.pt |
|
| epoch 1 step 3100 | 3100 batches | lr 0.00018 | ms/batch 609.17 | loss 2.63 | avg loss 2.64 | ppl 14.04 |
|
| epoch 1 step 3200 | 3200 batches | lr 0.000179 | ms/batch 609.50 | loss 2.57 | avg loss 2.66 | ppl 14.28 |
|
| epoch 1 step 3300 | 3300 batches | lr 0.000178 | ms/batch 607.31 | loss 2.47 | avg loss 2.62 | ppl 13.76 |
|
| epoch 1 step 3400 | 3400 batches | lr 0.000178 | ms/batch 604.83 | loss 2.54 | avg loss 2.60 | ppl 13.49 |
|
| epoch 1 step 3500 | 3500 batches | lr 0.000177 | ms/batch 607.92 | loss 2.62 | avg loss 2.63 | ppl 13.90 |
|
| epoch 1 step 3600 | 3600 batches | lr 0.000176 | ms/batch 608.49 | loss 2.41 | avg loss 2.62 | ppl 13.78 |
|
| epoch 1 step 3700 | 3700 batches | lr 0.000175 | ms/batch 605.91 | loss 2.58 | avg loss 2.59 | ppl 13.36 |
|
| epoch 1 step 3800 | 3800 batches | lr 0.000174 | ms/batch 607.54 | loss 2.46 | avg loss 2.64 | ppl 13.97 |
|
| epoch 1 step 3900 | 3900 batches | lr 0.000174 | ms/batch 610.01 | loss 2.68 | avg loss 2.66 | ppl 14.24 |
|
| epoch 1 step 4000 | 4000 batches | lr 0.000173 | ms/batch 607.98 | loss 2.78 | avg loss 2.64 | ppl 14.04 |
|
saving checkpoint ./trained_models/GPT2_M/e2e/model.4000.pt |
|
eval samples: 0 loss: tensor(1.1133, device='cuda:4') |
|
eval samples: 100 loss: tensor(1.0210, device='cuda:4') |
|
eval samples: 200 loss: tensor(1.1742, device='cuda:4') |
|
eval samples: 300 loss: tensor(1.2072, device='cuda:4') |
|
eval samples: 400 loss: tensor(1.6256, device='cuda:4') |
|
eval samples: 500 loss: tensor(1.9378, device='cuda:4') |
|
eval samples: 600 loss: tensor(1.0971, device='cuda:4') |
|
eval samples: 700 loss: tensor(1.0210, device='cuda:4') |
|
eval samples: 800 loss: tensor(1.4538, device='cuda:4') |
|
eval samples: 900 loss: tensor(1.5298, device='cuda:4') |
|
eval samples: 1000 loss: tensor(1.2354, device='cuda:4') |
|
eval samples: 1100 loss: tensor(1.2567, device='cuda:4') |
|
average loss 1.2714025441506138 |
|
---------------------------------------------------------------------------------------------------- |
|
| Eval 2 at step 4000 | time: 138.19s | valid loss 1.27 | valid ppl 3.57 | best ppl 3.57 |
|
---------------------------------------------------------------------------------------------------- |
|
| epoch 1 step 4100 | 4100 batches | lr 0.000172 | ms/batch 1990.32 | loss 2.81 | avg loss 2.62 | ppl 13.78 |
|
| epoch 1 step 4200 | 4200 batches | lr 0.000171 | ms/batch 608.76 | loss 3.11 | avg loss 2.61 | ppl 13.57 |
|
| epoch 1 step 4300 | 4300 batches | lr 0.000171 | ms/batch 610.45 | loss 2.46 | avg loss 2.61 | ppl 13.63 |
|
| epoch 1 step 4400 | 4400 batches | lr 0.00017 | ms/batch 610.84 | loss 2.96 | avg loss 2.62 | ppl 13.74 |
|
| epoch 1 step 4500 | 4500 batches | lr 0.000169 | ms/batch 611.36 | loss 2.78 | avg loss 2.61 | ppl 13.58 |
|
| epoch 1 step 4600 | 4600 batches | lr 0.000168 | ms/batch 612.08 | loss 2.81 | avg loss 2.57 | ppl 13.07 |
|
| epoch 1 step 4700 | 4700 batches | lr 0.000167 | ms/batch 615.36 | loss 2.90 | avg loss 2.63 | ppl 13.91 |
|
| epoch 1 step 4800 | 4800 batches | lr 0.000167 | ms/batch 611.17 | loss 2.99 | avg loss 2.61 | ppl 13.55 |
|
| epoch 1 step 4900 | 4900 batches | lr 0.000166 | ms/batch 608.81 | loss 2.73 | avg loss 2.60 | ppl 13.47 |
|
| epoch 1 step 5000 | 5000 batches | lr 0.000165 | ms/batch 609.73 | loss 2.50 | avg loss 2.58 | ppl 13.26 |
|
saving checkpoint ./trained_models/GPT2_M/e2e/model.5000.pt |
|
| epoch 1 step 5100 | 5100 batches | lr 0.000164 | ms/batch 609.36 | loss 2.27 | avg loss 2.59 | ppl 13.33 |
|
| epoch 1 step 5200 | 5200 batches | lr 0.000164 | ms/batch 611.66 | loss 2.39 | avg loss 2.62 | ppl 13.78 |
|
saving checkpoint ./trained_models/GPT2_M/e2e/model.5258.pt |
|
start to train the model................ 2 |
|
| epoch 2 step 5300 | 42 batches | lr 0.000163 | ms/batch 256.06 | loss 2.41 | avg loss 2.61 | ppl 13.53 |
|
| epoch 2 step 5400 | 142 batches | lr 0.000162 | ms/batch 609.01 | loss 2.63 | avg loss 2.61 | ppl 13.58 |
|
| epoch 2 step 5500 | 242 batches | lr 0.000161 | ms/batch 612.10 | loss 2.45 | avg loss 2.59 | ppl 13.30 |
|
| epoch 2 step 5600 | 342 batches | lr 0.00016 | ms/batch 611.07 | loss 2.67 | avg loss 2.59 | ppl 13.27 |
|
| epoch 2 step 5700 | 442 batches | lr 0.00016 | ms/batch 611.19 | loss 2.52 | avg loss 2.64 | ppl 13.95 |
|
| epoch 2 step 5800 | 542 batches | lr 0.000159 | ms/batch 611.61 | loss 2.87 | avg loss 2.57 | ppl 13.10 |
|
| epoch 2 step 5900 | 642 batches | lr 0.000158 | ms/batch 612.67 | loss 3.17 | avg loss 2.58 | ppl 13.25 |
|
| epoch 2 step 6000 | 742 batches | lr 0.000157 | ms/batch 610.88 | loss 2.45 | avg loss 2.59 | ppl 13.32 |
|
saving checkpoint ./trained_models/GPT2_M/e2e/model.6000.pt |
|
eval samples: 0 loss: tensor(1.0454, device='cuda:4') |
|
eval samples: 100 loss: tensor(0.9909, device='cuda:4') |
|
eval samples: 200 loss: tensor(1.1352, device='cuda:4') |
|
eval samples: 300 loss: tensor(1.1335, device='cuda:4') |
|
eval samples: 400 loss: tensor(1.5766, device='cuda:4') |
|
eval samples: 500 loss: tensor(2.0034, device='cuda:4') |
|
eval samples: 600 loss: tensor(1.1043, device='cuda:4') |
|
eval samples: 700 loss: tensor(0.9965, device='cuda:4') |
|
eval samples: 800 loss: tensor(1.4912, device='cuda:4') |
|
eval samples: 900 loss: tensor(1.5128, device='cuda:4') |
|
eval samples: 1000 loss: tensor(1.1385, device='cuda:4') |
|
eval samples: 1100 loss: tensor(1.2201, device='cuda:4') |
|
average loss 1.239899498908079 |
|
---------------------------------------------------------------------------------------------------- |
|
| Eval 3 at step 6000 | time: 138.83s | valid loss 1.24 | valid ppl 3.46 | best ppl 3.46 |
|
---------------------------------------------------------------------------------------------------- |
|
| epoch 2 step 6100 | 842 batches | lr 0.000157 | ms/batch 1999.78 | loss 2.55 | avg loss 2.61 | ppl 13.54 |
|
| epoch 2 step 6200 | 942 batches | lr 0.000156 | ms/batch 612.01 | loss 2.72 | avg loss 2.60 | ppl 13.48 |
|
| epoch 2 step 6300 | 1042 batches | lr 0.000155 | ms/batch 611.75 | loss 2.61 | avg loss 2.58 | ppl 13.26 |
|
| epoch 2 step 6400 | 1142 batches | lr 0.000154 | ms/batch 612.29 | loss 2.48 | avg loss 2.58 | ppl 13.15 |
|
| epoch 2 step 6500 | 1242 batches | lr 0.000153 | ms/batch 613.03 | loss 2.90 | avg loss 2.62 | ppl 13.67 |
|
| epoch 2 step 6600 | 1342 batches | lr 0.000153 | ms/batch 611.04 | loss 3.07 | avg loss 2.58 | ppl 13.16 |
|
| epoch 2 step 6700 | 1442 batches | lr 0.000152 | ms/batch 611.17 | loss 2.79 | avg loss 2.56 | ppl 12.96 |
|
| epoch 2 step 6800 | 1542 batches | lr 0.000151 | ms/batch 614.47 | loss 2.50 | avg loss 2.56 | ppl 12.95 |
|
| epoch 2 step 6900 | 1642 batches | lr 0.00015 | ms/batch 610.47 | loss 2.71 | avg loss 2.56 | ppl 12.99 |
|
| epoch 2 step 7000 | 1742 batches | lr 0.00015 | ms/batch 608.59 | loss 2.56 | avg loss 2.59 | ppl 13.37 |
|
saving checkpoint ./trained_models/GPT2_M/e2e/model.7000.pt |
|
| epoch 2 step 7100 | 1842 batches | lr 0.000149 | ms/batch 610.96 | loss 2.32 | avg loss 2.57 | ppl 13.01 |
|
| epoch 2 step 7200 | 1942 batches | lr 0.000148 | ms/batch 610.97 | loss 2.41 | avg loss 2.53 | ppl 12.50 |
|
| epoch 2 step 7300 | 2042 batches | lr 0.000147 | ms/batch 611.57 | loss 2.48 | avg loss 2.57 | ppl 13.10 |
|
| epoch 2 step 7400 | 2142 batches | lr 0.000146 | ms/batch 610.40 | loss 2.39 | avg loss 2.56 | ppl 12.89 |
|
| epoch 2 step 7500 | 2242 batches | lr 0.000146 | ms/batch 610.66 | loss 2.63 | avg loss 2.57 | ppl 13.04 |
|
| epoch 2 step 7600 | 2342 batches | lr 0.000145 | ms/batch 610.52 | loss 2.63 | avg loss 2.58 | ppl 13.26 |
|
| epoch 2 step 7700 | 2442 batches | lr 0.000144 | ms/batch 608.69 | loss 2.22 | avg loss 2.54 | ppl 12.73 |
|
| epoch 2 step 7800 | 2542 batches | lr 0.000143 | ms/batch 609.99 | loss 2.35 | avg loss 2.57 | ppl 13.07 |
|
| epoch 2 step 7900 | 2642 batches | lr 0.000143 | ms/batch 609.05 | loss 2.72 | avg loss 2.60 | ppl 13.47 |
|
| epoch 2 step 8000 | 2742 batches | lr 0.000142 | ms/batch 609.02 | loss 2.57 | avg loss 2.59 | ppl 13.30 |
|
saving checkpoint ./trained_models/GPT2_M/e2e/model.8000.pt |
|
eval samples: 0 loss: tensor(1.0535, device='cuda:4') |
|
eval samples: 100 loss: tensor(0.9691, device='cuda:4') |
|
eval samples: 200 loss: tensor(1.1137, device='cuda:4') |
|
eval samples: 300 loss: tensor(1.1214, device='cuda:4') |
|
eval samples: 400 loss: tensor(1.5688, device='cuda:4') |
|
eval samples: 500 loss: tensor(1.9425, device='cuda:4') |
|
eval samples: 600 loss: tensor(1.0476, device='cuda:4') |
|
eval samples: 700 loss: tensor(0.9898, device='cuda:4') |
|
eval samples: 800 loss: tensor(1.4776, device='cuda:4') |
|
eval samples: 900 loss: tensor(1.5046, device='cuda:4') |
|
eval samples: 1000 loss: tensor(1.1689, device='cuda:4') |
|
eval samples: 1100 loss: tensor(1.1641, device='cuda:4') |
|
average loss 1.2270236368456933 |
|
---------------------------------------------------------------------------------------------------- |
|
| Eval 4 at step 8000 | time: 138.04s | valid loss 1.23 | valid ppl 3.41 | best ppl 3.41 |
|
---------------------------------------------------------------------------------------------------- |
|
| epoch 2 step 8100 | 2842 batches | lr 0.000141 | ms/batch 1991.53 | loss 2.46 | avg loss 2.56 | ppl 12.98 |
|
| epoch 2 step 8200 | 2942 batches | lr 0.00014 | ms/batch 609.84 | loss 2.50 | avg loss 2.60 | ppl 13.49 |
|
| epoch 2 step 8300 | 3042 batches | lr 0.00014 | ms/batch 610.87 | loss 2.47 | avg loss 2.54 | ppl 12.72 |
|
| epoch 2 step 8400 | 3142 batches | lr 0.000139 | ms/batch 610.92 | loss 2.41 | avg loss 2.57 | ppl 13.03 |
|
| epoch 2 step 8500 | 3242 batches | lr 0.000138 | ms/batch 611.04 | loss 2.81 | avg loss 2.56 | ppl 12.89 |
|
| epoch 2 step 8600 | 3342 batches | lr 0.000137 | ms/batch 612.82 | loss 2.40 | avg loss 2.55 | ppl 12.87 |
|
| epoch 2 step 8700 | 3442 batches | lr 0.000136 | ms/batch 611.25 | loss 2.47 | avg loss 2.52 | ppl 12.43 |
|
| epoch 2 step 8800 | 3542 batches | lr 0.000136 | ms/batch 611.59 | loss 2.57 | avg loss 2.55 | ppl 12.86 |
|
| epoch 2 step 8900 | 3642 batches | lr 0.000135 | ms/batch 611.43 | loss 2.33 | avg loss 2.54 | ppl 12.62 |
|
| epoch 2 step 9000 | 3742 batches | lr 0.000134 | ms/batch 610.78 | loss 2.96 | avg loss 2.55 | ppl 12.78 |
|
saving checkpoint ./trained_models/GPT2_M/e2e/model.9000.pt |
|
| epoch 2 step 9100 | 3842 batches | lr 0.000133 | ms/batch 608.39 | loss 2.67 | avg loss 2.55 | ppl 12.81 |
|
| epoch 2 step 9200 | 3942 batches | lr 0.000133 | ms/batch 611.72 | loss 2.65 | avg loss 2.58 | ppl 13.17 |
|
| epoch 2 step 9300 | 4042 batches | lr 0.000132 | ms/batch 611.24 | loss 2.60 | avg loss 2.58 | ppl 13.15 |
|
| epoch 2 step 9400 | 4142 batches | lr 0.000131 | ms/batch 613.45 | loss 2.58 | avg loss 2.56 | ppl 12.95 |
|
| epoch 2 step 9500 | 4242 batches | lr 0.00013 | ms/batch 611.51 | loss 2.40 | avg loss 2.54 | ppl 12.71 |
|
| epoch 2 step 9600 | 4342 batches | lr 0.000129 | ms/batch 613.03 | loss 2.62 | avg loss 2.53 | ppl 12.55 |
|
| epoch 2 step 9700 | 4442 batches | lr 0.000129 | ms/batch 612.45 | loss 2.26 | avg loss 2.54 | ppl 12.74 |
|
| epoch 2 step 9800 | 4542 batches | lr 0.000128 | ms/batch 610.95 | loss 2.78 | avg loss 2.55 | ppl 12.82 |
|
| epoch 2 step 9900 | 4642 batches | lr 0.000127 | ms/batch 608.32 | loss 2.61 | avg loss 2.52 | ppl 12.37 |
|
| epoch 2 step 10000 | 4742 batches | lr 0.000126 | ms/batch 610.72 | loss 2.45 | avg loss 2.54 | ppl 12.73 |
|
saving checkpoint ./trained_models/GPT2_M/e2e/model.10000.pt |
|
eval samples: 0 loss: tensor(1.0123, device='cuda:4') |
|
eval samples: 100 loss: tensor(1.0022, device='cuda:4') |
|
eval samples: 200 loss: tensor(1.0972, device='cuda:4') |
|
eval samples: 300 loss: tensor(1.1317, device='cuda:4') |
|
eval samples: 400 loss: tensor(1.5788, device='cuda:4') |
|
eval samples: 500 loss: tensor(1.9430, device='cuda:4') |
|
eval samples: 600 loss: tensor(1.0426, device='cuda:4') |
|
eval samples: 700 loss: tensor(0.9720, device='cuda:4') |
|
eval samples: 800 loss: tensor(1.4556, device='cuda:4') |
|
eval samples: 900 loss: tensor(1.4790, device='cuda:4') |
|
eval samples: 1000 loss: tensor(1.1323, device='cuda:4') |
|
eval samples: 1100 loss: tensor(1.1691, device='cuda:4') |
|
average loss 1.2222425683006033 |
|
---------------------------------------------------------------------------------------------------- |
|
| Eval 5 at step 10000 | time: 139.05s | valid loss 1.22 | valid ppl 3.39 | best ppl 3.39 |
|
---------------------------------------------------------------------------------------------------- |
|
| epoch 2 step 10100 | 4842 batches | lr 0.000126 | ms/batch 2003.85 | loss 2.46 | avg loss 2.55 | ppl 12.79 |
|
| epoch 2 step 10200 | 4942 batches | lr 0.000125 | ms/batch 609.56 | loss 2.62 | avg loss 2.56 | ppl 12.88 |
|
| epoch 2 step 10300 | 5042 batches | lr 0.000124 | ms/batch 610.36 | loss 2.85 | avg loss 2.51 | ppl 12.28 |
|
| epoch 2 step 10400 | 5142 batches | lr 0.000123 | ms/batch 610.63 | loss 2.40 | avg loss 2.57 | ppl 13.05 |
|
| epoch 2 step 10500 | 5242 batches | lr 0.000122 | ms/batch 613.64 | loss 2.43 | avg loss 2.52 | ppl 12.45 |
|
saving checkpoint ./trained_models/GPT2_M/e2e/model.10516.pt |
|
start to train the model................ 3 |
|
| epoch 3 step 10600 | 84 batches | lr 0.000122 | ms/batch 510.61 | loss 2.63 | avg loss 2.53 | ppl 12.61 |
|
| epoch 3 step 10700 | 184 batches | lr 0.000121 | ms/batch 613.48 | loss 2.67 | avg loss 2.56 | ppl 13.00 |
|
| epoch 3 step 10800 | 284 batches | lr 0.00012 | ms/batch 608.43 | loss 2.48 | avg loss 2.52 | ppl 12.39 |
|
| epoch 3 step 10900 | 384 batches | lr 0.000119 | ms/batch 611.59 | loss 2.69 | avg loss 2.56 | ppl 12.91 |
|
|
|
|
|
|
|
|
|
|
|
Running MS-COCO evaluator... |
|
creating index... |
|
index created! |
|
Loading and preparing results... |
|
DONE (t=0.00s) |
|
creating index... |
|
index created! |
|
tokenization... |
|
PTBTokenizer tokenized 22530 tokens at 184928.37 tokens per second. |
|
PTBTokenizer tokenized 2122 tokens at 21442.98 tokens per second. |
|
setting up scorers... |
|
computing METEOR score... |
|
METEOR: 0.485 |
|
computing Rouge score... |
|
ROUGE_L: 0.761 |
|
computing CIDEr score... |
|
CIDEr: 3.314 |
|
Running Py-MTEval metrics... |
|
SCORES: |
|
============== |
|
BLEU: 0.7401 |
|
NIST: 8.6766 |
|
METEOR: 0.4851 |
|
ROUGE_L: 0.7614 |
|
CIDEr: 3.3144 |
|
=== lora.Linear, model.5258.pt === |
|
|
|
BLEU: 0.7905 |
|
NIST: 9.1684 |
|
METEOR: 0.5016 |
|
ROUGE_L: 0.7865 |
|
CIDEr: 3.4686 |
|
=== lora.MergedLinear, model.26290.pt === |