|
{ |
|
"best_metric": 0.8260869565217391, |
|
"best_model_checkpoint": "MAE-CT-M1N0-M12_v8_split4/checkpoint-5265", |
|
"epoch": 98.0046875, |
|
"eval_steps": 500, |
|
"global_step": 6400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0015625, |
|
"grad_norm": 5.617676734924316, |
|
"learning_rate": 1.5625e-07, |
|
"loss": 0.6877, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.003125, |
|
"grad_norm": 5.341566562652588, |
|
"learning_rate": 3.125e-07, |
|
"loss": 0.6905, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0046875, |
|
"grad_norm": 2.849367141723633, |
|
"learning_rate": 4.6875000000000006e-07, |
|
"loss": 0.6788, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.00625, |
|
"grad_norm": 4.359816551208496, |
|
"learning_rate": 6.25e-07, |
|
"loss": 0.6791, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0078125, |
|
"grad_norm": 1.5862407684326172, |
|
"learning_rate": 7.8125e-07, |
|
"loss": 0.6755, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.009375, |
|
"grad_norm": 3.142695903778076, |
|
"learning_rate": 9.375000000000001e-07, |
|
"loss": 0.6773, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01015625, |
|
"eval_accuracy": 0.43478260869565216, |
|
"eval_loss": 0.7108777165412903, |
|
"eval_runtime": 5.4861, |
|
"eval_samples_per_second": 4.192, |
|
"eval_steps_per_second": 1.094, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.00078125, |
|
"grad_norm": 4.359466075897217, |
|
"learning_rate": 1.0937500000000001e-06, |
|
"loss": 0.617, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.00234375, |
|
"grad_norm": 4.6110968589782715, |
|
"learning_rate": 1.25e-06, |
|
"loss": 0.658, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.00390625, |
|
"grad_norm": 2.396003246307373, |
|
"learning_rate": 1.40625e-06, |
|
"loss": 0.6424, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.00546875, |
|
"grad_norm": 5.18359899520874, |
|
"learning_rate": 1.5625e-06, |
|
"loss": 0.6857, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.00703125, |
|
"grad_norm": 5.112548351287842, |
|
"learning_rate": 1.71875e-06, |
|
"loss": 0.5847, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.00859375, |
|
"grad_norm": 3.414761543273926, |
|
"learning_rate": 1.8750000000000003e-06, |
|
"loss": 0.6395, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.01015625, |
|
"grad_norm": 8.075427055358887, |
|
"learning_rate": 2.0312500000000002e-06, |
|
"loss": 0.7393, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.01015625, |
|
"eval_accuracy": 0.43478260869565216, |
|
"eval_loss": 0.7720307111740112, |
|
"eval_runtime": 4.9012, |
|
"eval_samples_per_second": 4.693, |
|
"eval_steps_per_second": 1.224, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.0015625, |
|
"grad_norm": 5.541121959686279, |
|
"learning_rate": 2.1875000000000002e-06, |
|
"loss": 0.5912, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.003125, |
|
"grad_norm": 7.405311584472656, |
|
"learning_rate": 2.3437500000000002e-06, |
|
"loss": 0.6909, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.0046875, |
|
"grad_norm": 7.448993682861328, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.6436, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.00625, |
|
"grad_norm": 7.356657028198242, |
|
"learning_rate": 2.65625e-06, |
|
"loss": 0.6642, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.0078125, |
|
"grad_norm": 4.84994649887085, |
|
"learning_rate": 2.8125e-06, |
|
"loss": 0.5426, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.009375, |
|
"grad_norm": 8.717711448669434, |
|
"learning_rate": 2.96875e-06, |
|
"loss": 0.6483, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.01015625, |
|
"eval_accuracy": 0.43478260869565216, |
|
"eval_loss": 0.8130948543548584, |
|
"eval_runtime": 4.9011, |
|
"eval_samples_per_second": 4.693, |
|
"eval_steps_per_second": 1.224, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 3.00078125, |
|
"grad_norm": 5.927984237670898, |
|
"learning_rate": 3.125e-06, |
|
"loss": 0.7289, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.00234375, |
|
"grad_norm": 5.716071605682373, |
|
"learning_rate": 3.28125e-06, |
|
"loss": 0.5937, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.00390625, |
|
"grad_norm": 6.823408603668213, |
|
"learning_rate": 3.4375e-06, |
|
"loss": 0.6431, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.00546875, |
|
"grad_norm": 11.641050338745117, |
|
"learning_rate": 3.59375e-06, |
|
"loss": 0.5708, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.00703125, |
|
"grad_norm": 6.452781677246094, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.5963, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.00859375, |
|
"grad_norm": 6.331055641174316, |
|
"learning_rate": 3.90625e-06, |
|
"loss": 0.5717, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.01015625, |
|
"grad_norm": 16.759431838989258, |
|
"learning_rate": 4.0625000000000005e-06, |
|
"loss": 0.5872, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.01015625, |
|
"eval_accuracy": 0.43478260869565216, |
|
"eval_loss": 0.7177644968032837, |
|
"eval_runtime": 4.7105, |
|
"eval_samples_per_second": 4.883, |
|
"eval_steps_per_second": 1.274, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.0015625, |
|
"grad_norm": 14.158082962036133, |
|
"learning_rate": 4.21875e-06, |
|
"loss": 0.5765, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.003125, |
|
"grad_norm": 12.876073837280273, |
|
"learning_rate": 4.3750000000000005e-06, |
|
"loss": 0.5683, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.0046875, |
|
"grad_norm": 26.516162872314453, |
|
"learning_rate": 4.53125e-06, |
|
"loss": 0.4597, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.00625, |
|
"grad_norm": 16.54762840270996, |
|
"learning_rate": 4.6875000000000004e-06, |
|
"loss": 0.6127, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.0078125, |
|
"grad_norm": 16.069034576416016, |
|
"learning_rate": 4.84375e-06, |
|
"loss": 0.6515, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.009375, |
|
"grad_norm": 11.875980377197266, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5612, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 0.6202592253684998, |
|
"eval_runtime": 4.8326, |
|
"eval_samples_per_second": 4.759, |
|
"eval_steps_per_second": 1.242, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 5.00078125, |
|
"grad_norm": 17.546005249023438, |
|
"learning_rate": 5.156250000000001e-06, |
|
"loss": 0.5529, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 5.00234375, |
|
"grad_norm": 7.8157782554626465, |
|
"learning_rate": 5.3125e-06, |
|
"loss": 0.4669, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.00390625, |
|
"grad_norm": 15.228111267089844, |
|
"learning_rate": 5.468750000000001e-06, |
|
"loss": 0.647, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.00546875, |
|
"grad_norm": 21.23976707458496, |
|
"learning_rate": 5.625e-06, |
|
"loss": 0.4997, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.00703125, |
|
"grad_norm": 17.036767959594727, |
|
"learning_rate": 5.781250000000001e-06, |
|
"loss": 0.4676, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 5.00859375, |
|
"grad_norm": 23.452939987182617, |
|
"learning_rate": 5.9375e-06, |
|
"loss": 0.5638, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 5.01015625, |
|
"grad_norm": 3.643690347671509, |
|
"learning_rate": 6.093750000000001e-06, |
|
"loss": 0.2855, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 5.01015625, |
|
"eval_accuracy": 0.391304347826087, |
|
"eval_loss": 0.7646759748458862, |
|
"eval_runtime": 4.8497, |
|
"eval_samples_per_second": 4.743, |
|
"eval_steps_per_second": 1.237, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 6.0015625, |
|
"grad_norm": 15.79963493347168, |
|
"learning_rate": 6.25e-06, |
|
"loss": 0.4038, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.003125, |
|
"grad_norm": 9.234519958496094, |
|
"learning_rate": 6.406250000000001e-06, |
|
"loss": 0.7449, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 6.0046875, |
|
"grad_norm": 15.391752243041992, |
|
"learning_rate": 6.5625e-06, |
|
"loss": 0.3984, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 6.00625, |
|
"grad_norm": 20.15530014038086, |
|
"learning_rate": 6.718750000000001e-06, |
|
"loss": 0.5221, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 6.0078125, |
|
"grad_norm": 2.97125244140625, |
|
"learning_rate": 6.875e-06, |
|
"loss": 0.3465, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 6.009375, |
|
"grad_norm": 15.078941345214844, |
|
"learning_rate": 7.031250000000001e-06, |
|
"loss": 0.3332, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 6.01015625, |
|
"eval_accuracy": 0.391304347826087, |
|
"eval_loss": 0.9563228487968445, |
|
"eval_runtime": 4.7826, |
|
"eval_samples_per_second": 4.809, |
|
"eval_steps_per_second": 1.255, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 7.00078125, |
|
"grad_norm": 51.98073959350586, |
|
"learning_rate": 7.1875e-06, |
|
"loss": 0.4844, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 7.00234375, |
|
"grad_norm": 5.570568561553955, |
|
"learning_rate": 7.343750000000001e-06, |
|
"loss": 0.6797, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 7.00390625, |
|
"grad_norm": 18.14199447631836, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.428, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 7.00546875, |
|
"grad_norm": 20.653505325317383, |
|
"learning_rate": 7.656250000000001e-06, |
|
"loss": 0.4506, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 7.00703125, |
|
"grad_norm": 22.278905868530273, |
|
"learning_rate": 7.8125e-06, |
|
"loss": 0.3975, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 7.00859375, |
|
"grad_norm": 4.0497260093688965, |
|
"learning_rate": 7.96875e-06, |
|
"loss": 0.2203, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 7.01015625, |
|
"grad_norm": 2.5937061309814453, |
|
"learning_rate": 8.125000000000001e-06, |
|
"loss": 0.5376, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 7.01015625, |
|
"eval_accuracy": 0.43478260869565216, |
|
"eval_loss": 1.0380140542984009, |
|
"eval_runtime": 4.8039, |
|
"eval_samples_per_second": 4.788, |
|
"eval_steps_per_second": 1.249, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 8.0015625, |
|
"grad_norm": 22.033885955810547, |
|
"learning_rate": 8.281250000000001e-06, |
|
"loss": 0.442, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 8.003125, |
|
"grad_norm": 9.786211967468262, |
|
"learning_rate": 8.4375e-06, |
|
"loss": 0.3946, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 8.0046875, |
|
"grad_norm": 26.71308708190918, |
|
"learning_rate": 8.59375e-06, |
|
"loss": 0.4643, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 8.00625, |
|
"grad_norm": 27.76260757446289, |
|
"learning_rate": 8.750000000000001e-06, |
|
"loss": 0.498, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 8.0078125, |
|
"grad_norm": 37.68694305419922, |
|
"learning_rate": 8.906250000000001e-06, |
|
"loss": 0.4055, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 8.009375, |
|
"grad_norm": 13.42328929901123, |
|
"learning_rate": 9.0625e-06, |
|
"loss": 0.3236, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 8.01015625, |
|
"eval_accuracy": 0.782608695652174, |
|
"eval_loss": 0.6012648940086365, |
|
"eval_runtime": 4.8341, |
|
"eval_samples_per_second": 4.758, |
|
"eval_steps_per_second": 1.241, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 9.00078125, |
|
"grad_norm": 28.908456802368164, |
|
"learning_rate": 9.21875e-06, |
|
"loss": 0.2175, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 9.00234375, |
|
"grad_norm": 50.885311126708984, |
|
"learning_rate": 9.375000000000001e-06, |
|
"loss": 0.3656, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 9.00390625, |
|
"grad_norm": 13.825862884521484, |
|
"learning_rate": 9.531250000000001e-06, |
|
"loss": 0.5367, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 9.00546875, |
|
"grad_norm": 10.76695442199707, |
|
"learning_rate": 9.6875e-06, |
|
"loss": 0.1877, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 9.00703125, |
|
"grad_norm": 54.17179870605469, |
|
"learning_rate": 9.84375e-06, |
|
"loss": 0.4309, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 9.00859375, |
|
"grad_norm": 3.338512659072876, |
|
"learning_rate": 1e-05, |
|
"loss": 0.3109, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 9.01015625, |
|
"grad_norm": 29.185001373291016, |
|
"learning_rate": 9.98263888888889e-06, |
|
"loss": 0.2583, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 9.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 0.6641891598701477, |
|
"eval_runtime": 4.6699, |
|
"eval_samples_per_second": 4.925, |
|
"eval_steps_per_second": 1.285, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 10.0015625, |
|
"grad_norm": 42.67464828491211, |
|
"learning_rate": 9.965277777777778e-06, |
|
"loss": 0.212, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 10.003125, |
|
"grad_norm": 3.2036612033843994, |
|
"learning_rate": 9.947916666666667e-06, |
|
"loss": 0.0744, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 10.0046875, |
|
"grad_norm": 47.8891716003418, |
|
"learning_rate": 9.930555555555557e-06, |
|
"loss": 0.2478, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 10.00625, |
|
"grad_norm": 0.49096113443374634, |
|
"learning_rate": 9.913194444444446e-06, |
|
"loss": 0.5103, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 10.0078125, |
|
"grad_norm": 42.20719909667969, |
|
"learning_rate": 9.895833333333334e-06, |
|
"loss": 0.1993, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 10.009375, |
|
"grad_norm": 54.64994430541992, |
|
"learning_rate": 9.878472222222223e-06, |
|
"loss": 0.519, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 10.01015625, |
|
"eval_accuracy": 0.6521739130434783, |
|
"eval_loss": 0.8796622157096863, |
|
"eval_runtime": 4.5152, |
|
"eval_samples_per_second": 5.094, |
|
"eval_steps_per_second": 1.329, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 11.00078125, |
|
"grad_norm": 3.232229471206665, |
|
"learning_rate": 9.861111111111112e-06, |
|
"loss": 0.118, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 11.00234375, |
|
"grad_norm": 2.1435928344726562, |
|
"learning_rate": 9.84375e-06, |
|
"loss": 0.3272, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 11.00390625, |
|
"grad_norm": 73.0626220703125, |
|
"learning_rate": 9.826388888888889e-06, |
|
"loss": 0.1605, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 11.00546875, |
|
"grad_norm": 0.07265316694974899, |
|
"learning_rate": 9.80902777777778e-06, |
|
"loss": 0.2396, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 11.00703125, |
|
"grad_norm": 0.3741931915283203, |
|
"learning_rate": 9.791666666666666e-06, |
|
"loss": 0.2981, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 11.00859375, |
|
"grad_norm": 3.6390833854675293, |
|
"learning_rate": 9.774305555555557e-06, |
|
"loss": 0.2877, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 11.01015625, |
|
"grad_norm": 29.540559768676758, |
|
"learning_rate": 9.756944444444445e-06, |
|
"loss": 0.2594, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 11.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 0.8122978210449219, |
|
"eval_runtime": 4.7158, |
|
"eval_samples_per_second": 4.877, |
|
"eval_steps_per_second": 1.272, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 12.0015625, |
|
"grad_norm": 54.16925048828125, |
|
"learning_rate": 9.739583333333334e-06, |
|
"loss": 0.3136, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 12.003125, |
|
"grad_norm": 77.38468933105469, |
|
"learning_rate": 9.722222222222223e-06, |
|
"loss": 0.2467, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 12.0046875, |
|
"grad_norm": 87.69710540771484, |
|
"learning_rate": 9.704861111111113e-06, |
|
"loss": 0.2896, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 12.00625, |
|
"grad_norm": 71.97374725341797, |
|
"learning_rate": 9.6875e-06, |
|
"loss": 0.3536, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 12.0078125, |
|
"grad_norm": 0.159846693277359, |
|
"learning_rate": 9.670138888888889e-06, |
|
"loss": 0.2543, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 12.009375, |
|
"grad_norm": 2.6350886821746826, |
|
"learning_rate": 9.652777777777779e-06, |
|
"loss": 0.2015, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 12.01015625, |
|
"eval_accuracy": 0.6521739130434783, |
|
"eval_loss": 1.2629700899124146, |
|
"eval_runtime": 4.3577, |
|
"eval_samples_per_second": 5.278, |
|
"eval_steps_per_second": 1.377, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 13.00078125, |
|
"grad_norm": 171.4978790283203, |
|
"learning_rate": 9.635416666666668e-06, |
|
"loss": 0.3562, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 13.00234375, |
|
"grad_norm": 8.83787727355957, |
|
"learning_rate": 9.618055555555556e-06, |
|
"loss": 0.1326, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 13.00390625, |
|
"grad_norm": 0.10538630187511444, |
|
"learning_rate": 9.600694444444445e-06, |
|
"loss": 0.2292, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 13.00546875, |
|
"grad_norm": 108.46855926513672, |
|
"learning_rate": 9.583333333333335e-06, |
|
"loss": 0.2593, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 13.00703125, |
|
"grad_norm": 148.98483276367188, |
|
"learning_rate": 9.565972222222222e-06, |
|
"loss": 0.595, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 13.00859375, |
|
"grad_norm": 0.1856043040752411, |
|
"learning_rate": 9.548611111111113e-06, |
|
"loss": 0.4979, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 13.01015625, |
|
"grad_norm": 0.03162425383925438, |
|
"learning_rate": 9.531250000000001e-06, |
|
"loss": 0.3333, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 13.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 1.4961862564086914, |
|
"eval_runtime": 6.1439, |
|
"eval_samples_per_second": 3.744, |
|
"eval_steps_per_second": 0.977, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 14.0015625, |
|
"grad_norm": 0.2646234929561615, |
|
"learning_rate": 9.51388888888889e-06, |
|
"loss": 0.0677, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 14.003125, |
|
"grad_norm": 0.07416494190692902, |
|
"learning_rate": 9.496527777777779e-06, |
|
"loss": 0.1394, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 14.0046875, |
|
"grad_norm": 127.45858001708984, |
|
"learning_rate": 9.479166666666667e-06, |
|
"loss": 0.5357, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 14.00625, |
|
"grad_norm": 44.76925277709961, |
|
"learning_rate": 9.461805555555556e-06, |
|
"loss": 0.2052, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 14.0078125, |
|
"grad_norm": 0.2831000089645386, |
|
"learning_rate": 9.444444444444445e-06, |
|
"loss": 0.2634, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 14.009375, |
|
"grad_norm": 62.27622604370117, |
|
"learning_rate": 9.427083333333335e-06, |
|
"loss": 0.1593, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 14.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 1.1972347497940063, |
|
"eval_runtime": 5.5894, |
|
"eval_samples_per_second": 4.115, |
|
"eval_steps_per_second": 1.073, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 15.00078125, |
|
"grad_norm": 51.28601837158203, |
|
"learning_rate": 9.409722222222224e-06, |
|
"loss": 0.0407, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 15.00234375, |
|
"grad_norm": 0.2956711947917938, |
|
"learning_rate": 9.392361111111112e-06, |
|
"loss": 0.1513, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 15.00390625, |
|
"grad_norm": 149.50283813476562, |
|
"learning_rate": 9.375000000000001e-06, |
|
"loss": 0.3649, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 15.00546875, |
|
"grad_norm": 139.6261444091797, |
|
"learning_rate": 9.35763888888889e-06, |
|
"loss": 0.4625, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 15.00703125, |
|
"grad_norm": 125.6368408203125, |
|
"learning_rate": 9.340277777777778e-06, |
|
"loss": 0.6565, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 15.00859375, |
|
"grad_norm": 30.86675453186035, |
|
"learning_rate": 9.322916666666667e-06, |
|
"loss": 0.2548, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 15.01015625, |
|
"grad_norm": 0.12581364810466766, |
|
"learning_rate": 9.305555555555557e-06, |
|
"loss": 0.1296, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 15.01015625, |
|
"eval_accuracy": 0.782608695652174, |
|
"eval_loss": 1.1893320083618164, |
|
"eval_runtime": 5.5025, |
|
"eval_samples_per_second": 4.18, |
|
"eval_steps_per_second": 1.09, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 16.0015625, |
|
"grad_norm": 23.72901725769043, |
|
"learning_rate": 9.288194444444444e-06, |
|
"loss": 0.1616, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 16.003125, |
|
"grad_norm": 0.08215180039405823, |
|
"learning_rate": 9.270833333333334e-06, |
|
"loss": 0.311, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 16.0046875, |
|
"grad_norm": 0.03871282935142517, |
|
"learning_rate": 9.253472222222223e-06, |
|
"loss": 0.0348, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 16.00625, |
|
"grad_norm": 0.054497916251420975, |
|
"learning_rate": 9.236111111111112e-06, |
|
"loss": 0.0279, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 16.0078125, |
|
"grad_norm": 238.58651733398438, |
|
"learning_rate": 9.21875e-06, |
|
"loss": 0.0624, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 16.009375, |
|
"grad_norm": 0.07025988399982452, |
|
"learning_rate": 9.201388888888889e-06, |
|
"loss": 0.3097, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 16.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 1.5245493650436401, |
|
"eval_runtime": 5.6836, |
|
"eval_samples_per_second": 4.047, |
|
"eval_steps_per_second": 1.056, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 17.00078125, |
|
"grad_norm": 0.025951266288757324, |
|
"learning_rate": 9.18402777777778e-06, |
|
"loss": 0.2226, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 17.00234375, |
|
"grad_norm": 0.01526107732206583, |
|
"learning_rate": 9.166666666666666e-06, |
|
"loss": 0.0156, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 17.00390625, |
|
"grad_norm": 170.8768768310547, |
|
"learning_rate": 9.149305555555557e-06, |
|
"loss": 0.1, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 17.00546875, |
|
"grad_norm": 0.23553895950317383, |
|
"learning_rate": 9.131944444444445e-06, |
|
"loss": 0.0009, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 17.00703125, |
|
"grad_norm": 0.02782592922449112, |
|
"learning_rate": 9.114583333333334e-06, |
|
"loss": 0.052, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 17.00859375, |
|
"grad_norm": 0.1742725521326065, |
|
"learning_rate": 9.097222222222223e-06, |
|
"loss": 0.0406, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 17.01015625, |
|
"grad_norm": 0.024772852659225464, |
|
"learning_rate": 9.079861111111113e-06, |
|
"loss": 0.1145, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 17.01015625, |
|
"eval_accuracy": 0.782608695652174, |
|
"eval_loss": 1.2978923320770264, |
|
"eval_runtime": 6.3812, |
|
"eval_samples_per_second": 3.604, |
|
"eval_steps_per_second": 0.94, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 18.0015625, |
|
"grad_norm": 0.14872823655605316, |
|
"learning_rate": 9.0625e-06, |
|
"loss": 0.2639, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 18.003125, |
|
"grad_norm": 0.047333743423223495, |
|
"learning_rate": 9.045138888888889e-06, |
|
"loss": 0.1541, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 18.0046875, |
|
"grad_norm": 1.0252918004989624, |
|
"learning_rate": 9.027777777777779e-06, |
|
"loss": 0.0615, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 18.00625, |
|
"grad_norm": 0.021228138357400894, |
|
"learning_rate": 9.010416666666668e-06, |
|
"loss": 0.2217, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 18.0078125, |
|
"grad_norm": 0.05429055541753769, |
|
"learning_rate": 8.993055555555556e-06, |
|
"loss": 0.0296, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 18.009375, |
|
"grad_norm": 0.0435396246612072, |
|
"learning_rate": 8.975694444444445e-06, |
|
"loss": 0.2288, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 18.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 1.7657958269119263, |
|
"eval_runtime": 6.8688, |
|
"eval_samples_per_second": 3.348, |
|
"eval_steps_per_second": 0.874, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 19.00078125, |
|
"grad_norm": 0.018049171194434166, |
|
"learning_rate": 8.958333333333334e-06, |
|
"loss": 0.0557, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 19.00234375, |
|
"grad_norm": 0.08535438030958176, |
|
"learning_rate": 8.940972222222222e-06, |
|
"loss": 0.0036, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 19.00390625, |
|
"grad_norm": 0.017897306010127068, |
|
"learning_rate": 8.923611111111113e-06, |
|
"loss": 0.0007, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 19.00546875, |
|
"grad_norm": 38.087764739990234, |
|
"learning_rate": 8.906250000000001e-06, |
|
"loss": 0.5809, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 19.00703125, |
|
"grad_norm": 0.016219746321439743, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 0.2081, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 19.00859375, |
|
"grad_norm": 0.010095655918121338, |
|
"learning_rate": 8.871527777777779e-06, |
|
"loss": 0.0009, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 19.01015625, |
|
"grad_norm": 0.044915877282619476, |
|
"learning_rate": 8.854166666666667e-06, |
|
"loss": 0.0217, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 19.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 2.6376562118530273, |
|
"eval_runtime": 5.9605, |
|
"eval_samples_per_second": 3.859, |
|
"eval_steps_per_second": 1.007, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 20.0015625, |
|
"grad_norm": 0.01931353099644184, |
|
"learning_rate": 8.836805555555556e-06, |
|
"loss": 0.0008, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 20.003125, |
|
"grad_norm": 0.01681886427104473, |
|
"learning_rate": 8.819444444444445e-06, |
|
"loss": 0.3116, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 20.0046875, |
|
"grad_norm": 364.32012939453125, |
|
"learning_rate": 8.802083333333335e-06, |
|
"loss": 0.0818, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 20.00625, |
|
"grad_norm": 0.008102004416286945, |
|
"learning_rate": 8.784722222222224e-06, |
|
"loss": 0.0012, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 20.0078125, |
|
"grad_norm": 0.010571327991783619, |
|
"learning_rate": 8.767361111111112e-06, |
|
"loss": 0.0852, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 20.009375, |
|
"grad_norm": 0.3356912434101105, |
|
"learning_rate": 8.750000000000001e-06, |
|
"loss": 0.1368, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 20.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 1.6947327852249146, |
|
"eval_runtime": 5.9562, |
|
"eval_samples_per_second": 3.862, |
|
"eval_steps_per_second": 1.007, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 21.00078125, |
|
"grad_norm": 0.00919839832931757, |
|
"learning_rate": 8.73263888888889e-06, |
|
"loss": 0.2066, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 21.00234375, |
|
"grad_norm": 0.012858807109296322, |
|
"learning_rate": 8.715277777777778e-06, |
|
"loss": 0.1892, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 21.00390625, |
|
"grad_norm": 0.17203153669834137, |
|
"learning_rate": 8.697916666666667e-06, |
|
"loss": 0.0069, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 21.00546875, |
|
"grad_norm": 0.08047995716333389, |
|
"learning_rate": 8.680555555555557e-06, |
|
"loss": 0.1126, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 21.00703125, |
|
"grad_norm": 0.025560539215803146, |
|
"learning_rate": 8.663194444444444e-06, |
|
"loss": 0.1641, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 21.00859375, |
|
"grad_norm": 0.16339129209518433, |
|
"learning_rate": 8.645833333333335e-06, |
|
"loss": 0.0022, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 21.01015625, |
|
"grad_norm": 342.7193908691406, |
|
"learning_rate": 8.628472222222223e-06, |
|
"loss": 0.1717, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 21.01015625, |
|
"eval_accuracy": 0.6521739130434783, |
|
"eval_loss": 1.8904714584350586, |
|
"eval_runtime": 5.9032, |
|
"eval_samples_per_second": 3.896, |
|
"eval_steps_per_second": 1.016, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 22.0015625, |
|
"grad_norm": 0.08339407294988632, |
|
"learning_rate": 8.611111111111112e-06, |
|
"loss": 0.0007, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 22.003125, |
|
"grad_norm": 28.949604034423828, |
|
"learning_rate": 8.59375e-06, |
|
"loss": 0.108, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 22.0046875, |
|
"grad_norm": 0.04607783257961273, |
|
"learning_rate": 8.57638888888889e-06, |
|
"loss": 0.2159, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 22.00625, |
|
"grad_norm": 0.006548835895955563, |
|
"learning_rate": 8.559027777777778e-06, |
|
"loss": 0.0005, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 22.0078125, |
|
"grad_norm": 117.19865417480469, |
|
"learning_rate": 8.541666666666666e-06, |
|
"loss": 0.1625, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 22.009375, |
|
"grad_norm": 0.009444577619433403, |
|
"learning_rate": 8.524305555555557e-06, |
|
"loss": 0.0014, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 22.01015625, |
|
"eval_accuracy": 0.6521739130434783, |
|
"eval_loss": 2.1503288745880127, |
|
"eval_runtime": 5.9406, |
|
"eval_samples_per_second": 3.872, |
|
"eval_steps_per_second": 1.01, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 23.00078125, |
|
"grad_norm": 0.013862960040569305, |
|
"learning_rate": 8.506944444444445e-06, |
|
"loss": 0.0014, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 23.00234375, |
|
"grad_norm": 117.46649932861328, |
|
"learning_rate": 8.489583333333334e-06, |
|
"loss": 0.135, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 23.00390625, |
|
"grad_norm": 0.004416728392243385, |
|
"learning_rate": 8.472222222222223e-06, |
|
"loss": 0.0008, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 23.00546875, |
|
"grad_norm": 0.08458108454942703, |
|
"learning_rate": 8.454861111111111e-06, |
|
"loss": 0.0225, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 23.00703125, |
|
"grad_norm": 0.017984559759497643, |
|
"learning_rate": 8.4375e-06, |
|
"loss": 0.0002, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 23.00859375, |
|
"grad_norm": 0.006690251640975475, |
|
"learning_rate": 8.420138888888889e-06, |
|
"loss": 0.002, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 23.01015625, |
|
"grad_norm": 0.0025458575692027807, |
|
"learning_rate": 8.402777777777779e-06, |
|
"loss": 0.012, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 23.01015625, |
|
"eval_accuracy": 0.6521739130434783, |
|
"eval_loss": 2.050550699234009, |
|
"eval_runtime": 6.4241, |
|
"eval_samples_per_second": 3.58, |
|
"eval_steps_per_second": 0.934, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 24.0015625, |
|
"grad_norm": 0.011097385548055172, |
|
"learning_rate": 8.385416666666668e-06, |
|
"loss": 0.006, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 24.003125, |
|
"grad_norm": 0.08708302676677704, |
|
"learning_rate": 8.368055555555556e-06, |
|
"loss": 0.0016, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 24.0046875, |
|
"grad_norm": 0.04860710725188255, |
|
"learning_rate": 8.350694444444445e-06, |
|
"loss": 0.0002, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 24.00625, |
|
"grad_norm": 0.0030894328374415636, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.001, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 24.0078125, |
|
"grad_norm": 0.005946220364421606, |
|
"learning_rate": 8.315972222222222e-06, |
|
"loss": 0.0002, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 24.009375, |
|
"grad_norm": 0.04797302186489105, |
|
"learning_rate": 8.298611111111113e-06, |
|
"loss": 0.0007, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 24.01015625, |
|
"eval_accuracy": 0.6521739130434783, |
|
"eval_loss": 2.3373279571533203, |
|
"eval_runtime": 5.8943, |
|
"eval_samples_per_second": 3.902, |
|
"eval_steps_per_second": 1.018, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 25.00078125, |
|
"grad_norm": 0.013759410940110683, |
|
"learning_rate": 8.281250000000001e-06, |
|
"loss": 0.0002, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 25.00234375, |
|
"grad_norm": 0.00804007425904274, |
|
"learning_rate": 8.263888888888888e-06, |
|
"loss": 0.0002, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 25.00390625, |
|
"grad_norm": 0.004292096011340618, |
|
"learning_rate": 8.246527777777779e-06, |
|
"loss": 0.0001, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 25.00546875, |
|
"grad_norm": 0.011900864541530609, |
|
"learning_rate": 8.229166666666667e-06, |
|
"loss": 0.0052, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 25.00703125, |
|
"grad_norm": 32.002662658691406, |
|
"learning_rate": 8.211805555555556e-06, |
|
"loss": 0.0032, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 25.00859375, |
|
"grad_norm": 0.12309475988149643, |
|
"learning_rate": 8.194444444444445e-06, |
|
"loss": 0.0002, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 25.01015625, |
|
"grad_norm": 0.0057711414992809296, |
|
"learning_rate": 8.177083333333335e-06, |
|
"loss": 0.0001, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 25.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 1.6162152290344238, |
|
"eval_runtime": 5.9473, |
|
"eval_samples_per_second": 3.867, |
|
"eval_steps_per_second": 1.009, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 26.0015625, |
|
"grad_norm": 0.004880541004240513, |
|
"learning_rate": 8.159722222222222e-06, |
|
"loss": 0.0012, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 26.003125, |
|
"grad_norm": 0.0035077305510640144, |
|
"learning_rate": 8.142361111111112e-06, |
|
"loss": 0.0135, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 26.0046875, |
|
"grad_norm": 3.304060697555542, |
|
"learning_rate": 8.125000000000001e-06, |
|
"loss": 0.1258, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 26.00625, |
|
"grad_norm": 0.008674221113324165, |
|
"learning_rate": 8.10763888888889e-06, |
|
"loss": 0.0002, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 26.0078125, |
|
"grad_norm": 0.056495197117328644, |
|
"learning_rate": 8.090277777777778e-06, |
|
"loss": 0.0001, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 26.009375, |
|
"grad_norm": 0.046838484704494476, |
|
"learning_rate": 8.072916666666667e-06, |
|
"loss": 0.0002, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 26.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 2.7662017345428467, |
|
"eval_runtime": 6.1012, |
|
"eval_samples_per_second": 3.77, |
|
"eval_steps_per_second": 0.983, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 27.00078125, |
|
"grad_norm": 0.008759694173932076, |
|
"learning_rate": 8.055555555555557e-06, |
|
"loss": 0.0138, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 27.00234375, |
|
"grad_norm": 0.00761327613145113, |
|
"learning_rate": 8.038194444444444e-06, |
|
"loss": 0.0009, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 27.00390625, |
|
"grad_norm": 0.13704411685466766, |
|
"learning_rate": 8.020833333333335e-06, |
|
"loss": 0.0002, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 27.00546875, |
|
"grad_norm": 0.2657420337200165, |
|
"learning_rate": 8.003472222222223e-06, |
|
"loss": 0.0026, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 27.00703125, |
|
"grad_norm": 0.1798246055841446, |
|
"learning_rate": 7.986111111111112e-06, |
|
"loss": 0.0664, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 27.00859375, |
|
"grad_norm": 0.0032853896263986826, |
|
"learning_rate": 7.96875e-06, |
|
"loss": 0.0001, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 27.01015625, |
|
"grad_norm": 0.002602074760943651, |
|
"learning_rate": 7.95138888888889e-06, |
|
"loss": 0.104, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 27.01015625, |
|
"eval_accuracy": 0.782608695652174, |
|
"eval_loss": 1.5636570453643799, |
|
"eval_runtime": 5.8552, |
|
"eval_samples_per_second": 3.928, |
|
"eval_steps_per_second": 1.025, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 28.0015625, |
|
"grad_norm": 0.03211245685815811, |
|
"learning_rate": 7.934027777777778e-06, |
|
"loss": 0.0001, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 28.003125, |
|
"grad_norm": 0.00280300946906209, |
|
"learning_rate": 7.916666666666667e-06, |
|
"loss": 0.0001, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 28.0046875, |
|
"grad_norm": 0.009329872205853462, |
|
"learning_rate": 7.899305555555557e-06, |
|
"loss": 0.0001, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 28.00625, |
|
"grad_norm": 0.09660506993532181, |
|
"learning_rate": 7.881944444444446e-06, |
|
"loss": 0.0002, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 28.0078125, |
|
"grad_norm": 0.0060371337458491325, |
|
"learning_rate": 7.864583333333334e-06, |
|
"loss": 0.0001, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 28.009375, |
|
"grad_norm": 0.12182425707578659, |
|
"learning_rate": 7.847222222222223e-06, |
|
"loss": 0.1848, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 28.01015625, |
|
"eval_accuracy": 0.5217391304347826, |
|
"eval_loss": 3.688724994659424, |
|
"eval_runtime": 6.7118, |
|
"eval_samples_per_second": 3.427, |
|
"eval_steps_per_second": 0.894, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 29.00078125, |
|
"grad_norm": 0.0027912850491702557, |
|
"learning_rate": 7.829861111111112e-06, |
|
"loss": 0.3939, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 29.00234375, |
|
"grad_norm": 0.005003814585506916, |
|
"learning_rate": 7.8125e-06, |
|
"loss": 0.0001, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 29.00390625, |
|
"grad_norm": 0.006808743346482515, |
|
"learning_rate": 7.795138888888889e-06, |
|
"loss": 0.016, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 29.00546875, |
|
"grad_norm": 0.0020619197748601437, |
|
"learning_rate": 7.77777777777778e-06, |
|
"loss": 0.0001, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 29.00703125, |
|
"grad_norm": 0.0028212652541697025, |
|
"learning_rate": 7.760416666666666e-06, |
|
"loss": 0.1682, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 29.00859375, |
|
"grad_norm": 0.005537331569939852, |
|
"learning_rate": 7.743055555555556e-06, |
|
"loss": 0.0002, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 29.01015625, |
|
"grad_norm": 0.010945729911327362, |
|
"learning_rate": 7.725694444444445e-06, |
|
"loss": 0.0015, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 29.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 1.7132729291915894, |
|
"eval_runtime": 6.9636, |
|
"eval_samples_per_second": 3.303, |
|
"eval_steps_per_second": 0.862, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 30.0015625, |
|
"grad_norm": 37.507930755615234, |
|
"learning_rate": 7.708333333333334e-06, |
|
"loss": 0.2236, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 30.003125, |
|
"grad_norm": 0.008836254477500916, |
|
"learning_rate": 7.690972222222222e-06, |
|
"loss": 0.1809, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 30.0046875, |
|
"grad_norm": 0.21053043007850647, |
|
"learning_rate": 7.673611111111113e-06, |
|
"loss": 0.0063, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 30.00625, |
|
"grad_norm": 0.00869149062782526, |
|
"learning_rate": 7.656250000000001e-06, |
|
"loss": 0.0012, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 30.0078125, |
|
"grad_norm": 0.004439413081854582, |
|
"learning_rate": 7.638888888888888e-06, |
|
"loss": 0.037, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 30.009375, |
|
"grad_norm": 0.005357842892408371, |
|
"learning_rate": 7.621527777777779e-06, |
|
"loss": 0.0001, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 30.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.186380386352539, |
|
"eval_runtime": 6.2949, |
|
"eval_samples_per_second": 3.654, |
|
"eval_steps_per_second": 0.953, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 31.00078125, |
|
"grad_norm": 0.004626353271305561, |
|
"learning_rate": 7.6041666666666666e-06, |
|
"loss": 0.1059, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 31.00234375, |
|
"grad_norm": 0.014557418413460255, |
|
"learning_rate": 7.586805555555556e-06, |
|
"loss": 0.0001, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 31.00390625, |
|
"grad_norm": 0.0017245536437258124, |
|
"learning_rate": 7.569444444444445e-06, |
|
"loss": 0.1546, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 31.00546875, |
|
"grad_norm": 0.005222524981945753, |
|
"learning_rate": 7.552083333333334e-06, |
|
"loss": 0.001, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 31.00703125, |
|
"grad_norm": 0.001107494463212788, |
|
"learning_rate": 7.534722222222223e-06, |
|
"loss": 0.0002, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 31.00859375, |
|
"grad_norm": 0.0143509516492486, |
|
"learning_rate": 7.517361111111112e-06, |
|
"loss": 0.0007, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 31.01015625, |
|
"grad_norm": 0.0029032255988568068, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.0008, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 31.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 1.945203185081482, |
|
"eval_runtime": 6.3117, |
|
"eval_samples_per_second": 3.644, |
|
"eval_steps_per_second": 0.951, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 32.0015625, |
|
"grad_norm": 0.002496402943506837, |
|
"learning_rate": 7.482638888888889e-06, |
|
"loss": 0.0002, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 32.003125, |
|
"grad_norm": 0.010571641847491264, |
|
"learning_rate": 7.465277777777778e-06, |
|
"loss": 0.0859, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 32.0046875, |
|
"grad_norm": 0.057661667466163635, |
|
"learning_rate": 7.447916666666667e-06, |
|
"loss": 0.2547, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 32.00625, |
|
"grad_norm": 0.003463858738541603, |
|
"learning_rate": 7.4305555555555565e-06, |
|
"loss": 0.1449, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 32.0078125, |
|
"grad_norm": 0.006739518139511347, |
|
"learning_rate": 7.413194444444445e-06, |
|
"loss": 0.0699, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 32.009375, |
|
"grad_norm": 0.0035657647531479597, |
|
"learning_rate": 7.395833333333335e-06, |
|
"loss": 0.0002, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 32.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 1.7982096672058105, |
|
"eval_runtime": 6.1396, |
|
"eval_samples_per_second": 3.746, |
|
"eval_steps_per_second": 0.977, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 33.00078125, |
|
"grad_norm": 0.007901329547166824, |
|
"learning_rate": 7.3784722222222225e-06, |
|
"loss": 0.0041, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 33.00234375, |
|
"grad_norm": 0.005937185604125261, |
|
"learning_rate": 7.361111111111112e-06, |
|
"loss": 0.2072, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 33.00390625, |
|
"grad_norm": 0.16729001700878143, |
|
"learning_rate": 7.343750000000001e-06, |
|
"loss": 0.0766, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 33.00546875, |
|
"grad_norm": 0.006795287132263184, |
|
"learning_rate": 7.326388888888889e-06, |
|
"loss": 0.0002, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 33.00703125, |
|
"grad_norm": 0.010546072386205196, |
|
"learning_rate": 7.309027777777779e-06, |
|
"loss": 0.0002, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 33.00859375, |
|
"grad_norm": 0.22133323550224304, |
|
"learning_rate": 7.291666666666667e-06, |
|
"loss": 0.0001, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 33.01015625, |
|
"grad_norm": 0.00504819443449378, |
|
"learning_rate": 7.274305555555556e-06, |
|
"loss": 0.0001, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 33.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.3271660804748535, |
|
"eval_runtime": 6.4197, |
|
"eval_samples_per_second": 3.583, |
|
"eval_steps_per_second": 0.935, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 34.0015625, |
|
"grad_norm": 0.002802980365231633, |
|
"learning_rate": 7.256944444444445e-06, |
|
"loss": 0.0002, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 34.003125, |
|
"grad_norm": 0.0017406290862709284, |
|
"learning_rate": 7.239583333333334e-06, |
|
"loss": 0.0001, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 34.0046875, |
|
"grad_norm": 0.011353565379977226, |
|
"learning_rate": 7.222222222222223e-06, |
|
"loss": 0.0001, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 34.00625, |
|
"grad_norm": 0.005228589754551649, |
|
"learning_rate": 7.204861111111112e-06, |
|
"loss": 0.0002, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 34.0078125, |
|
"grad_norm": 0.0054736933670938015, |
|
"learning_rate": 7.1875e-06, |
|
"loss": 0.0618, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 34.009375, |
|
"grad_norm": 0.003737458260729909, |
|
"learning_rate": 7.170138888888889e-06, |
|
"loss": 0.0072, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 34.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.5864713191986084, |
|
"eval_runtime": 6.7103, |
|
"eval_samples_per_second": 3.428, |
|
"eval_steps_per_second": 0.894, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 35.00078125, |
|
"grad_norm": 0.0033578260336071253, |
|
"learning_rate": 7.152777777777778e-06, |
|
"loss": 0.0001, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 35.00234375, |
|
"grad_norm": 0.002227027900516987, |
|
"learning_rate": 7.135416666666667e-06, |
|
"loss": 0.0004, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 35.00390625, |
|
"grad_norm": 0.003478831145912409, |
|
"learning_rate": 7.1180555555555565e-06, |
|
"loss": 0.1353, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 35.00546875, |
|
"grad_norm": 0.0026797729078680277, |
|
"learning_rate": 7.100694444444445e-06, |
|
"loss": 0.0772, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 35.00703125, |
|
"grad_norm": 0.028473230078816414, |
|
"learning_rate": 7.083333333333335e-06, |
|
"loss": 0.0002, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 35.00859375, |
|
"grad_norm": 0.0039703757502138615, |
|
"learning_rate": 7.0659722222222225e-06, |
|
"loss": 0.2555, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 35.01015625, |
|
"grad_norm": 0.0012055512052029371, |
|
"learning_rate": 7.048611111111112e-06, |
|
"loss": 0.275, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 35.01015625, |
|
"eval_accuracy": 0.5652173913043478, |
|
"eval_loss": 4.006451606750488, |
|
"eval_runtime": 5.9574, |
|
"eval_samples_per_second": 3.861, |
|
"eval_steps_per_second": 1.007, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 36.0015625, |
|
"grad_norm": 0.1346377581357956, |
|
"learning_rate": 7.031250000000001e-06, |
|
"loss": 0.3966, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 36.003125, |
|
"grad_norm": 0.06270433962345123, |
|
"learning_rate": 7.013888888888889e-06, |
|
"loss": 0.6715, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 36.0046875, |
|
"grad_norm": 0.18773694336414337, |
|
"learning_rate": 6.996527777777779e-06, |
|
"loss": 0.0284, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 36.00625, |
|
"grad_norm": 0.009740647859871387, |
|
"learning_rate": 6.979166666666667e-06, |
|
"loss": 0.0009, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 36.0078125, |
|
"grad_norm": 0.008271731436252594, |
|
"learning_rate": 6.961805555555556e-06, |
|
"loss": 0.0328, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 36.009375, |
|
"grad_norm": 0.012119914405047894, |
|
"learning_rate": 6.944444444444445e-06, |
|
"loss": 0.0004, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 36.01015625, |
|
"eval_accuracy": 0.782608695652174, |
|
"eval_loss": 1.434956431388855, |
|
"eval_runtime": 5.65, |
|
"eval_samples_per_second": 4.071, |
|
"eval_steps_per_second": 1.062, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 37.00078125, |
|
"grad_norm": 0.004492076113820076, |
|
"learning_rate": 6.927083333333334e-06, |
|
"loss": 0.0001, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 37.00234375, |
|
"grad_norm": 0.016119619831442833, |
|
"learning_rate": 6.909722222222223e-06, |
|
"loss": 0.0001, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 37.00390625, |
|
"grad_norm": 0.010017652064561844, |
|
"learning_rate": 6.8923611111111124e-06, |
|
"loss": 0.0002, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 37.00546875, |
|
"grad_norm": 0.0022193919867277145, |
|
"learning_rate": 6.875e-06, |
|
"loss": 0.0002, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 37.00703125, |
|
"grad_norm": 0.05469023436307907, |
|
"learning_rate": 6.857638888888889e-06, |
|
"loss": 0.0067, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 37.00859375, |
|
"grad_norm": 0.0024135003332048655, |
|
"learning_rate": 6.840277777777778e-06, |
|
"loss": 0.0001, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 37.01015625, |
|
"grad_norm": 0.005584715865552425, |
|
"learning_rate": 6.822916666666667e-06, |
|
"loss": 0.0001, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 37.01015625, |
|
"eval_accuracy": 0.782608695652174, |
|
"eval_loss": 1.8395758867263794, |
|
"eval_runtime": 5.6621, |
|
"eval_samples_per_second": 4.062, |
|
"eval_steps_per_second": 1.06, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 38.0015625, |
|
"grad_norm": 0.002385572763159871, |
|
"learning_rate": 6.8055555555555566e-06, |
|
"loss": 0.0001, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 38.003125, |
|
"grad_norm": 0.0032217069528996944, |
|
"learning_rate": 6.788194444444444e-06, |
|
"loss": 0.0001, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 38.0046875, |
|
"grad_norm": 0.003425781149417162, |
|
"learning_rate": 6.770833333333334e-06, |
|
"loss": 0.0001, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 38.00625, |
|
"grad_norm": 0.0016244082944467664, |
|
"learning_rate": 6.7534722222222225e-06, |
|
"loss": 0.0001, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 38.0078125, |
|
"grad_norm": 0.011852677911520004, |
|
"learning_rate": 6.736111111111112e-06, |
|
"loss": 0.0001, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 38.009375, |
|
"grad_norm": 0.0012439934071153402, |
|
"learning_rate": 6.718750000000001e-06, |
|
"loss": 0.1562, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 38.01015625, |
|
"eval_accuracy": 0.6521739130434783, |
|
"eval_loss": 2.678842306137085, |
|
"eval_runtime": 5.7899, |
|
"eval_samples_per_second": 3.972, |
|
"eval_steps_per_second": 1.036, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 39.00078125, |
|
"grad_norm": 0.0010783456964418292, |
|
"learning_rate": 6.701388888888889e-06, |
|
"loss": 0.0107, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 39.00234375, |
|
"grad_norm": 0.4099120497703552, |
|
"learning_rate": 6.684027777777779e-06, |
|
"loss": 0.0001, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 39.00390625, |
|
"grad_norm": 0.0027852212078869343, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.174, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 39.00546875, |
|
"grad_norm": 0.0022237482480704784, |
|
"learning_rate": 6.649305555555556e-06, |
|
"loss": 0.0001, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 39.00703125, |
|
"grad_norm": 0.002032435964792967, |
|
"learning_rate": 6.631944444444445e-06, |
|
"loss": 0.0001, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 39.00859375, |
|
"grad_norm": 0.004132281057536602, |
|
"learning_rate": 6.614583333333334e-06, |
|
"loss": 0.0786, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 39.01015625, |
|
"grad_norm": 0.0035320252645760775, |
|
"learning_rate": 6.597222222222223e-06, |
|
"loss": 0.0001, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 39.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.000983476638794, |
|
"eval_runtime": 5.9001, |
|
"eval_samples_per_second": 3.898, |
|
"eval_steps_per_second": 1.017, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 40.0015625, |
|
"grad_norm": 0.0028500519692897797, |
|
"learning_rate": 6.5798611111111125e-06, |
|
"loss": 0.1721, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 40.003125, |
|
"grad_norm": 0.007120965048670769, |
|
"learning_rate": 6.5625e-06, |
|
"loss": 0.0001, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 40.0046875, |
|
"grad_norm": 0.0011918245581910014, |
|
"learning_rate": 6.545138888888889e-06, |
|
"loss": 0.0001, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 40.00625, |
|
"grad_norm": 0.0032589342445135117, |
|
"learning_rate": 6.5277777777777784e-06, |
|
"loss": 0.0001, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 40.0078125, |
|
"grad_norm": 0.001322840340435505, |
|
"learning_rate": 6.510416666666667e-06, |
|
"loss": 0.0001, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 40.009375, |
|
"grad_norm": 0.002229139907285571, |
|
"learning_rate": 6.493055555555557e-06, |
|
"loss": 0.0001, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 40.01015625, |
|
"eval_accuracy": 0.6521739130434783, |
|
"eval_loss": 2.4220354557037354, |
|
"eval_runtime": 4.322, |
|
"eval_samples_per_second": 5.322, |
|
"eval_steps_per_second": 1.388, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 41.00078125, |
|
"grad_norm": 0.002013720339164138, |
|
"learning_rate": 6.475694444444444e-06, |
|
"loss": 0.0044, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 41.00234375, |
|
"grad_norm": 0.002689501503482461, |
|
"learning_rate": 6.458333333333334e-06, |
|
"loss": 0.0003, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 41.00390625, |
|
"grad_norm": 0.0017358016921207309, |
|
"learning_rate": 6.4409722222222226e-06, |
|
"loss": 0.0001, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 41.00546875, |
|
"grad_norm": 0.0026384200900793076, |
|
"learning_rate": 6.423611111111112e-06, |
|
"loss": 0.0001, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 41.00703125, |
|
"grad_norm": 0.004121453035622835, |
|
"learning_rate": 6.406250000000001e-06, |
|
"loss": 0.0002, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 41.00859375, |
|
"grad_norm": 0.005099486093968153, |
|
"learning_rate": 6.3888888888888885e-06, |
|
"loss": 0.0001, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 41.01015625, |
|
"grad_norm": 0.0035702483728528023, |
|
"learning_rate": 6.371527777777778e-06, |
|
"loss": 0.1117, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 41.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.3290226459503174, |
|
"eval_runtime": 4.9361, |
|
"eval_samples_per_second": 4.66, |
|
"eval_steps_per_second": 1.216, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 42.0015625, |
|
"grad_norm": 0.001132201636210084, |
|
"learning_rate": 6.354166666666667e-06, |
|
"loss": 0.0, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 42.003125, |
|
"grad_norm": 0.0018950949888676405, |
|
"learning_rate": 6.336805555555556e-06, |
|
"loss": 0.0001, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 42.0046875, |
|
"grad_norm": 0.014954060316085815, |
|
"learning_rate": 6.319444444444445e-06, |
|
"loss": 0.0001, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 42.00625, |
|
"grad_norm": 0.035089947283267975, |
|
"learning_rate": 6.302083333333334e-06, |
|
"loss": 0.105, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 42.0078125, |
|
"grad_norm": 0.017378531396389008, |
|
"learning_rate": 6.284722222222223e-06, |
|
"loss": 0.0001, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 42.009375, |
|
"grad_norm": 0.0027489045169204473, |
|
"learning_rate": 6.2673611111111125e-06, |
|
"loss": 0.0001, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 42.01015625, |
|
"eval_accuracy": 0.5652173913043478, |
|
"eval_loss": 3.1235392093658447, |
|
"eval_runtime": 5.0758, |
|
"eval_samples_per_second": 4.531, |
|
"eval_steps_per_second": 1.182, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 43.00078125, |
|
"grad_norm": 0.0023033316247165203, |
|
"learning_rate": 6.25e-06, |
|
"loss": 0.0001, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 43.00234375, |
|
"grad_norm": 0.0024786265566945076, |
|
"learning_rate": 6.232638888888889e-06, |
|
"loss": 0.0001, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 43.00390625, |
|
"grad_norm": 0.0007389386300928891, |
|
"learning_rate": 6.2152777777777785e-06, |
|
"loss": 0.0, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 43.00546875, |
|
"grad_norm": 0.11342310905456543, |
|
"learning_rate": 6.197916666666667e-06, |
|
"loss": 0.0007, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 43.00703125, |
|
"grad_norm": 0.0016803776379674673, |
|
"learning_rate": 6.180555555555557e-06, |
|
"loss": 0.0147, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 43.00859375, |
|
"grad_norm": 0.0018280907534062862, |
|
"learning_rate": 6.163194444444444e-06, |
|
"loss": 0.0001, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 43.01015625, |
|
"grad_norm": 0.0025411536917090416, |
|
"learning_rate": 6.145833333333334e-06, |
|
"loss": 0.0001, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 43.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 2.906446933746338, |
|
"eval_runtime": 5.6809, |
|
"eval_samples_per_second": 4.049, |
|
"eval_steps_per_second": 1.056, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 44.0015625, |
|
"grad_norm": 0.0019124329555779696, |
|
"learning_rate": 6.128472222222223e-06, |
|
"loss": 0.0005, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 44.003125, |
|
"grad_norm": 0.0017936074873432517, |
|
"learning_rate": 6.111111111111112e-06, |
|
"loss": 0.1123, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 44.0046875, |
|
"grad_norm": 0.0036375941708683968, |
|
"learning_rate": 6.093750000000001e-06, |
|
"loss": 0.0646, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 44.00625, |
|
"grad_norm": 0.07857771962881088, |
|
"learning_rate": 6.0763888888888885e-06, |
|
"loss": 0.0001, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 44.0078125, |
|
"grad_norm": 0.0012261528754606843, |
|
"learning_rate": 6.059027777777778e-06, |
|
"loss": 0.2223, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 44.009375, |
|
"grad_norm": 0.0008247962687164545, |
|
"learning_rate": 6.041666666666667e-06, |
|
"loss": 0.0003, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 44.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.135910987854004, |
|
"eval_runtime": 5.8456, |
|
"eval_samples_per_second": 3.935, |
|
"eval_steps_per_second": 1.026, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 45.00078125, |
|
"grad_norm": 0.0016557525377720594, |
|
"learning_rate": 6.024305555555556e-06, |
|
"loss": 0.0001, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 45.00234375, |
|
"grad_norm": 0.020362816751003265, |
|
"learning_rate": 6.006944444444445e-06, |
|
"loss": 0.0001, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 45.00390625, |
|
"grad_norm": 0.003076077438890934, |
|
"learning_rate": 5.989583333333334e-06, |
|
"loss": 0.0167, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 45.00546875, |
|
"grad_norm": 0.0016468287212774158, |
|
"learning_rate": 5.972222222222222e-06, |
|
"loss": 0.0001, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 45.00703125, |
|
"grad_norm": 0.0008026000577956438, |
|
"learning_rate": 5.954861111111112e-06, |
|
"loss": 0.1236, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 45.00859375, |
|
"grad_norm": 0.0015771895414218307, |
|
"learning_rate": 5.9375e-06, |
|
"loss": 0.0001, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 45.01015625, |
|
"grad_norm": 0.0010327126365154982, |
|
"learning_rate": 5.920138888888889e-06, |
|
"loss": 0.0007, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 45.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.1225311756134033, |
|
"eval_runtime": 5.8749, |
|
"eval_samples_per_second": 3.915, |
|
"eval_steps_per_second": 1.021, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 46.0015625, |
|
"grad_norm": 0.0014787332620471716, |
|
"learning_rate": 5.9027777777777785e-06, |
|
"loss": 0.0001, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 46.003125, |
|
"grad_norm": 0.04597454518079758, |
|
"learning_rate": 5.885416666666667e-06, |
|
"loss": 0.0001, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 46.0046875, |
|
"grad_norm": 0.0013207652373239398, |
|
"learning_rate": 5.868055555555557e-06, |
|
"loss": 0.0, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 46.00625, |
|
"grad_norm": 0.004010067321360111, |
|
"learning_rate": 5.8506944444444444e-06, |
|
"loss": 0.0, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 46.0078125, |
|
"grad_norm": 0.0014423461398109794, |
|
"learning_rate": 5.833333333333334e-06, |
|
"loss": 0.0548, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 46.009375, |
|
"grad_norm": 0.10172294080257416, |
|
"learning_rate": 5.815972222222223e-06, |
|
"loss": 0.0031, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 46.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 2.9251842498779297, |
|
"eval_runtime": 5.9272, |
|
"eval_samples_per_second": 3.88, |
|
"eval_steps_per_second": 1.012, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 47.00078125, |
|
"grad_norm": 0.001549496315419674, |
|
"learning_rate": 5.798611111111112e-06, |
|
"loss": 0.0, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 47.00234375, |
|
"grad_norm": 0.0004972199094481766, |
|
"learning_rate": 5.781250000000001e-06, |
|
"loss": 0.0004, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 47.00390625, |
|
"grad_norm": 0.0068430183455348015, |
|
"learning_rate": 5.7638888888888886e-06, |
|
"loss": 0.0001, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 47.00546875, |
|
"grad_norm": 0.0006451302324421704, |
|
"learning_rate": 5.746527777777778e-06, |
|
"loss": 0.0001, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 47.00703125, |
|
"grad_norm": 0.0031021556351333857, |
|
"learning_rate": 5.729166666666667e-06, |
|
"loss": 0.0, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 47.00859375, |
|
"grad_norm": 0.0017613332020118833, |
|
"learning_rate": 5.711805555555556e-06, |
|
"loss": 0.0001, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 47.01015625, |
|
"grad_norm": 0.0005297003081068397, |
|
"learning_rate": 5.694444444444445e-06, |
|
"loss": 0.0, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 47.01015625, |
|
"eval_accuracy": 0.5652173913043478, |
|
"eval_loss": 3.3918874263763428, |
|
"eval_runtime": 5.9599, |
|
"eval_samples_per_second": 3.859, |
|
"eval_steps_per_second": 1.007, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 48.0015625, |
|
"grad_norm": 0.009134139865636826, |
|
"learning_rate": 5.677083333333334e-06, |
|
"loss": 0.0, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 48.003125, |
|
"grad_norm": 0.002516545820981264, |
|
"learning_rate": 5.659722222222222e-06, |
|
"loss": 0.0, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 48.0046875, |
|
"grad_norm": 0.0007491998258046806, |
|
"learning_rate": 5.642361111111112e-06, |
|
"loss": 0.0, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 48.00625, |
|
"grad_norm": 0.04278244078159332, |
|
"learning_rate": 5.625e-06, |
|
"loss": 0.1166, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 48.0078125, |
|
"grad_norm": 0.0011739269830286503, |
|
"learning_rate": 5.607638888888889e-06, |
|
"loss": 0.0001, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 48.009375, |
|
"grad_norm": 0.00377621385268867, |
|
"learning_rate": 5.5902777777777785e-06, |
|
"loss": 0.0003, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 48.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.8240015506744385, |
|
"eval_runtime": 5.9267, |
|
"eval_samples_per_second": 3.881, |
|
"eval_steps_per_second": 1.012, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 49.00078125, |
|
"grad_norm": 0.04634140804409981, |
|
"learning_rate": 5.572916666666667e-06, |
|
"loss": 0.0002, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 49.00234375, |
|
"grad_norm": 0.007841131649911404, |
|
"learning_rate": 5.555555555555557e-06, |
|
"loss": 0.0, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 49.00390625, |
|
"grad_norm": 0.001621553674340248, |
|
"learning_rate": 5.5381944444444445e-06, |
|
"loss": 0.0001, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 49.00546875, |
|
"grad_norm": 0.0036857123486697674, |
|
"learning_rate": 5.520833333333334e-06, |
|
"loss": 0.2286, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 49.00703125, |
|
"grad_norm": 0.05553280934691429, |
|
"learning_rate": 5.503472222222223e-06, |
|
"loss": 0.0001, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 49.00859375, |
|
"grad_norm": 3.9317102432250977, |
|
"learning_rate": 5.486111111111112e-06, |
|
"loss": 0.0039, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 49.01015625, |
|
"grad_norm": 0.0012907739728689194, |
|
"learning_rate": 5.468750000000001e-06, |
|
"loss": 0.0014, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 49.01015625, |
|
"eval_accuracy": 0.5652173913043478, |
|
"eval_loss": 2.4431238174438477, |
|
"eval_runtime": 6.054, |
|
"eval_samples_per_second": 3.799, |
|
"eval_steps_per_second": 0.991, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 50.0015625, |
|
"grad_norm": 0.004488171543926001, |
|
"learning_rate": 5.451388888888889e-06, |
|
"loss": 0.0, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 50.003125, |
|
"grad_norm": 0.0019073676085099578, |
|
"learning_rate": 5.434027777777778e-06, |
|
"loss": 0.0002, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 50.0046875, |
|
"grad_norm": 0.0031424148473888636, |
|
"learning_rate": 5.416666666666667e-06, |
|
"loss": 0.0, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 50.00625, |
|
"grad_norm": 0.001569430693052709, |
|
"learning_rate": 5.399305555555556e-06, |
|
"loss": 0.0001, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 50.0078125, |
|
"grad_norm": 0.007558396551758051, |
|
"learning_rate": 5.381944444444445e-06, |
|
"loss": 0.0001, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 50.009375, |
|
"grad_norm": 0.07257850468158722, |
|
"learning_rate": 5.364583333333334e-06, |
|
"loss": 0.0001, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 50.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.2488174438476562, |
|
"eval_runtime": 6.1093, |
|
"eval_samples_per_second": 3.765, |
|
"eval_steps_per_second": 0.982, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 51.00078125, |
|
"grad_norm": 0.002742693992331624, |
|
"learning_rate": 5.347222222222222e-06, |
|
"loss": 0.1927, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 51.00234375, |
|
"grad_norm": 0.0056789107620716095, |
|
"learning_rate": 5.329861111111112e-06, |
|
"loss": 0.0003, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 51.00390625, |
|
"grad_norm": 0.0010554116452112794, |
|
"learning_rate": 5.3125e-06, |
|
"loss": 0.0, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 51.00546875, |
|
"grad_norm": 0.002496332162991166, |
|
"learning_rate": 5.295138888888889e-06, |
|
"loss": 0.0, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 51.00703125, |
|
"grad_norm": 0.0018673554295673966, |
|
"learning_rate": 5.2777777777777785e-06, |
|
"loss": 0.0001, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 51.00859375, |
|
"grad_norm": 0.0005963024450466037, |
|
"learning_rate": 5.260416666666666e-06, |
|
"loss": 0.0001, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 51.01015625, |
|
"grad_norm": 0.001718234270811081, |
|
"learning_rate": 5.243055555555556e-06, |
|
"loss": 0.0, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 51.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 2.616856575012207, |
|
"eval_runtime": 5.9739, |
|
"eval_samples_per_second": 3.85, |
|
"eval_steps_per_second": 1.004, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 52.0015625, |
|
"grad_norm": 0.0014240281889215112, |
|
"learning_rate": 5.2256944444444445e-06, |
|
"loss": 0.0, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 52.003125, |
|
"grad_norm": 0.0026255918201059103, |
|
"learning_rate": 5.208333333333334e-06, |
|
"loss": 0.0, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 52.0046875, |
|
"grad_norm": 0.04270762950181961, |
|
"learning_rate": 5.190972222222223e-06, |
|
"loss": 0.0289, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 52.00625, |
|
"grad_norm": 0.0017076137010008097, |
|
"learning_rate": 5.173611111111112e-06, |
|
"loss": 0.1845, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 52.0078125, |
|
"grad_norm": 0.0011627430794760585, |
|
"learning_rate": 5.156250000000001e-06, |
|
"loss": 0.0, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 52.009375, |
|
"grad_norm": 0.0007181333494372666, |
|
"learning_rate": 5.138888888888889e-06, |
|
"loss": 0.0, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 52.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.411802291870117, |
|
"eval_runtime": 6.0467, |
|
"eval_samples_per_second": 3.804, |
|
"eval_steps_per_second": 0.992, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 53.00078125, |
|
"grad_norm": 0.12792538106441498, |
|
"learning_rate": 5.121527777777778e-06, |
|
"loss": 0.0001, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 53.00234375, |
|
"grad_norm": 0.0005251476541161537, |
|
"learning_rate": 5.104166666666667e-06, |
|
"loss": 0.0005, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 53.00390625, |
|
"grad_norm": 0.13350893557071686, |
|
"learning_rate": 5.086805555555556e-06, |
|
"loss": 0.0001, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 53.00546875, |
|
"grad_norm": 0.005934035871177912, |
|
"learning_rate": 5.069444444444445e-06, |
|
"loss": 0.0001, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 53.00703125, |
|
"grad_norm": 0.001763555221259594, |
|
"learning_rate": 5.0520833333333344e-06, |
|
"loss": 0.0, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 53.00859375, |
|
"grad_norm": 0.1637350171804428, |
|
"learning_rate": 5.034722222222222e-06, |
|
"loss": 0.0001, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 53.01015625, |
|
"grad_norm": 0.012841137126088142, |
|
"learning_rate": 5.017361111111112e-06, |
|
"loss": 0.0002, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 53.01015625, |
|
"eval_accuracy": 0.5652173913043478, |
|
"eval_loss": 2.4927992820739746, |
|
"eval_runtime": 6.0136, |
|
"eval_samples_per_second": 3.825, |
|
"eval_steps_per_second": 0.998, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 54.0015625, |
|
"grad_norm": 0.0011469083838164806, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 54.003125, |
|
"grad_norm": 336.460693359375, |
|
"learning_rate": 4.982638888888889e-06, |
|
"loss": 0.0136, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 54.0046875, |
|
"grad_norm": 0.0007544786785729229, |
|
"learning_rate": 4.9652777777777786e-06, |
|
"loss": 0.0, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 54.00625, |
|
"grad_norm": 0.0013037599856033921, |
|
"learning_rate": 4.947916666666667e-06, |
|
"loss": 0.0001, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 54.0078125, |
|
"grad_norm": 112.71292114257812, |
|
"learning_rate": 4.930555555555556e-06, |
|
"loss": 0.0073, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 54.009375, |
|
"grad_norm": 0.016042951494455338, |
|
"learning_rate": 4.9131944444444445e-06, |
|
"loss": 0.0001, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 54.01015625, |
|
"eval_accuracy": 0.5652173913043478, |
|
"eval_loss": 3.6148810386657715, |
|
"eval_runtime": 5.8165, |
|
"eval_samples_per_second": 3.954, |
|
"eval_steps_per_second": 1.032, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 55.00078125, |
|
"grad_norm": 0.02267097495496273, |
|
"learning_rate": 4.895833333333333e-06, |
|
"loss": 0.0275, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 55.00234375, |
|
"grad_norm": 0.002501361072063446, |
|
"learning_rate": 4.878472222222223e-06, |
|
"loss": 0.0, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 55.00390625, |
|
"grad_norm": 0.0013894840376451612, |
|
"learning_rate": 4.861111111111111e-06, |
|
"loss": 0.0, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 55.00546875, |
|
"grad_norm": 0.0008909847820177674, |
|
"learning_rate": 4.84375e-06, |
|
"loss": 0.0, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 55.00703125, |
|
"grad_norm": 0.007556704338639975, |
|
"learning_rate": 4.8263888888888895e-06, |
|
"loss": 0.239, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 55.00859375, |
|
"grad_norm": 0.0007475628517568111, |
|
"learning_rate": 4.809027777777778e-06, |
|
"loss": 0.0707, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 55.01015625, |
|
"grad_norm": 0.0013660925906151533, |
|
"learning_rate": 4.791666666666668e-06, |
|
"loss": 0.0, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 55.01015625, |
|
"eval_accuracy": 0.5652173913043478, |
|
"eval_loss": 3.297786235809326, |
|
"eval_runtime": 5.8585, |
|
"eval_samples_per_second": 3.926, |
|
"eval_steps_per_second": 1.024, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 56.0015625, |
|
"grad_norm": 0.000969829095993191, |
|
"learning_rate": 4.774305555555556e-06, |
|
"loss": 0.0002, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 56.003125, |
|
"grad_norm": 0.003255591494962573, |
|
"learning_rate": 4.756944444444445e-06, |
|
"loss": 0.0015, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 56.0046875, |
|
"grad_norm": 119.21192932128906, |
|
"learning_rate": 4.739583333333334e-06, |
|
"loss": 0.2332, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 56.00625, |
|
"grad_norm": 0.0014074137434363365, |
|
"learning_rate": 4.722222222222222e-06, |
|
"loss": 0.0102, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 56.0078125, |
|
"grad_norm": 0.0026245727203786373, |
|
"learning_rate": 4.704861111111112e-06, |
|
"loss": 0.0001, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 56.009375, |
|
"grad_norm": 0.001142139662988484, |
|
"learning_rate": 4.6875000000000004e-06, |
|
"loss": 0.0, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 56.01015625, |
|
"eval_accuracy": 0.5217391304347826, |
|
"eval_loss": 2.9059712886810303, |
|
"eval_runtime": 6.0003, |
|
"eval_samples_per_second": 3.833, |
|
"eval_steps_per_second": 1.0, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 57.00078125, |
|
"grad_norm": 0.0010896347230300307, |
|
"learning_rate": 4.670138888888889e-06, |
|
"loss": 0.0061, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 57.00234375, |
|
"grad_norm": 0.0015965335769578815, |
|
"learning_rate": 4.652777777777779e-06, |
|
"loss": 0.0001, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 57.00390625, |
|
"grad_norm": 0.24039725959300995, |
|
"learning_rate": 4.635416666666667e-06, |
|
"loss": 0.0011, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 57.00546875, |
|
"grad_norm": 0.0013993968022987247, |
|
"learning_rate": 4.618055555555556e-06, |
|
"loss": 0.0002, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 57.00703125, |
|
"grad_norm": 0.0015714854234829545, |
|
"learning_rate": 4.6006944444444446e-06, |
|
"loss": 0.0028, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 57.00859375, |
|
"grad_norm": 0.002782302675768733, |
|
"learning_rate": 4.583333333333333e-06, |
|
"loss": 0.0001, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 57.01015625, |
|
"grad_norm": 0.000537557527422905, |
|
"learning_rate": 4.565972222222223e-06, |
|
"loss": 0.1108, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 57.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.0361039638519287, |
|
"eval_runtime": 5.9188, |
|
"eval_samples_per_second": 3.886, |
|
"eval_steps_per_second": 1.014, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 58.0015625, |
|
"grad_norm": 0.001980976667255163, |
|
"learning_rate": 4.548611111111111e-06, |
|
"loss": 0.0, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 58.003125, |
|
"grad_norm": 0.0013299890561029315, |
|
"learning_rate": 4.53125e-06, |
|
"loss": 0.0001, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 58.0046875, |
|
"grad_norm": 0.0005312523571774364, |
|
"learning_rate": 4.5138888888888895e-06, |
|
"loss": 0.1662, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 58.00625, |
|
"grad_norm": 0.001108271419070661, |
|
"learning_rate": 4.496527777777778e-06, |
|
"loss": 0.0001, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 58.0078125, |
|
"grad_norm": 0.00438848789781332, |
|
"learning_rate": 4.479166666666667e-06, |
|
"loss": 0.0004, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 58.009375, |
|
"grad_norm": 0.0009947356302291155, |
|
"learning_rate": 4.461805555555556e-06, |
|
"loss": 0.0, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 58.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.3928611278533936, |
|
"eval_runtime": 5.7082, |
|
"eval_samples_per_second": 4.029, |
|
"eval_steps_per_second": 1.051, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 59.00078125, |
|
"grad_norm": 0.009486984461545944, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 0.0, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 59.00234375, |
|
"grad_norm": 0.0018799022072926164, |
|
"learning_rate": 4.427083333333334e-06, |
|
"loss": 0.2343, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 59.00390625, |
|
"grad_norm": 937.9744873046875, |
|
"learning_rate": 4.409722222222222e-06, |
|
"loss": 0.1035, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 59.00546875, |
|
"grad_norm": 0.1265738308429718, |
|
"learning_rate": 4.392361111111112e-06, |
|
"loss": 0.1981, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 59.00703125, |
|
"grad_norm": 0.009575917385518551, |
|
"learning_rate": 4.3750000000000005e-06, |
|
"loss": 0.0003, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 59.00859375, |
|
"grad_norm": 0.0943688154220581, |
|
"learning_rate": 4.357638888888889e-06, |
|
"loss": 0.0001, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 59.01015625, |
|
"grad_norm": 0.0005097028333693743, |
|
"learning_rate": 4.340277777777779e-06, |
|
"loss": 0.0, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 59.01015625, |
|
"eval_accuracy": 0.5652173913043478, |
|
"eval_loss": 3.5174272060394287, |
|
"eval_runtime": 5.8322, |
|
"eval_samples_per_second": 3.944, |
|
"eval_steps_per_second": 1.029, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 60.0015625, |
|
"grad_norm": 0.000787988887168467, |
|
"learning_rate": 4.322916666666667e-06, |
|
"loss": 0.0, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 60.003125, |
|
"grad_norm": 0.0006839316338300705, |
|
"learning_rate": 4.305555555555556e-06, |
|
"loss": 0.0003, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 60.0046875, |
|
"grad_norm": 0.0026071376632899046, |
|
"learning_rate": 4.288194444444445e-06, |
|
"loss": 0.0001, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 60.00625, |
|
"grad_norm": 0.004166269209235907, |
|
"learning_rate": 4.270833333333333e-06, |
|
"loss": 0.0, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 60.0078125, |
|
"grad_norm": 0.0007391138351522386, |
|
"learning_rate": 4.253472222222223e-06, |
|
"loss": 0.0, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 60.009375, |
|
"grad_norm": 0.0005469180759973824, |
|
"learning_rate": 4.236111111111111e-06, |
|
"loss": 0.0007, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 60.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.1117029190063477, |
|
"eval_runtime": 6.4071, |
|
"eval_samples_per_second": 3.59, |
|
"eval_steps_per_second": 0.936, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 61.00078125, |
|
"grad_norm": 0.0014843333046883345, |
|
"learning_rate": 4.21875e-06, |
|
"loss": 0.0, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 61.00234375, |
|
"grad_norm": 0.0012195684248581529, |
|
"learning_rate": 4.2013888888888896e-06, |
|
"loss": 0.0185, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 61.00390625, |
|
"grad_norm": 0.005012029781937599, |
|
"learning_rate": 4.184027777777778e-06, |
|
"loss": 0.0, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 61.00546875, |
|
"grad_norm": 0.004374057520180941, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 0.0, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 61.00703125, |
|
"grad_norm": 0.0008582215523347259, |
|
"learning_rate": 4.149305555555556e-06, |
|
"loss": 0.0, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 61.00859375, |
|
"grad_norm": 0.0006553750718012452, |
|
"learning_rate": 4.131944444444444e-06, |
|
"loss": 0.0001, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 61.01015625, |
|
"grad_norm": 0.0005242990446276963, |
|
"learning_rate": 4.114583333333334e-06, |
|
"loss": 0.0, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 61.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.527374505996704, |
|
"eval_runtime": 7.3754, |
|
"eval_samples_per_second": 3.118, |
|
"eval_steps_per_second": 0.814, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 62.0015625, |
|
"grad_norm": 0.0005454741767607629, |
|
"learning_rate": 4.097222222222222e-06, |
|
"loss": 0.0, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 62.003125, |
|
"grad_norm": 0.0011714308056980371, |
|
"learning_rate": 4.079861111111111e-06, |
|
"loss": 0.0, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 62.0046875, |
|
"grad_norm": 0.001519624493084848, |
|
"learning_rate": 4.0625000000000005e-06, |
|
"loss": 0.0, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 62.00625, |
|
"grad_norm": 0.0004894700832664967, |
|
"learning_rate": 4.045138888888889e-06, |
|
"loss": 0.0, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 62.0078125, |
|
"grad_norm": 0.0012128478847444057, |
|
"learning_rate": 4.027777777777779e-06, |
|
"loss": 0.0, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 62.009375, |
|
"grad_norm": 0.005811003036797047, |
|
"learning_rate": 4.010416666666667e-06, |
|
"loss": 0.0, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 62.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.514857530593872, |
|
"eval_runtime": 7.5237, |
|
"eval_samples_per_second": 3.057, |
|
"eval_steps_per_second": 0.797, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 63.00078125, |
|
"grad_norm": 0.004614518489688635, |
|
"learning_rate": 3.993055555555556e-06, |
|
"loss": 0.0, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 63.00234375, |
|
"grad_norm": 0.0026280442252755165, |
|
"learning_rate": 3.975694444444445e-06, |
|
"loss": 0.0, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 63.00390625, |
|
"grad_norm": 0.0007883926155045629, |
|
"learning_rate": 3.958333333333333e-06, |
|
"loss": 0.0, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 63.00546875, |
|
"grad_norm": 0.0003756518417503685, |
|
"learning_rate": 3.940972222222223e-06, |
|
"loss": 0.0, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 63.00703125, |
|
"grad_norm": 0.005922640673816204, |
|
"learning_rate": 3.9236111111111114e-06, |
|
"loss": 0.0, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 63.00859375, |
|
"grad_norm": 0.0009902652818709612, |
|
"learning_rate": 3.90625e-06, |
|
"loss": 0.0, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 63.01015625, |
|
"grad_norm": 0.0009228180279023945, |
|
"learning_rate": 3.88888888888889e-06, |
|
"loss": 0.0, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 63.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.4864935874938965, |
|
"eval_runtime": 57.8974, |
|
"eval_samples_per_second": 0.397, |
|
"eval_steps_per_second": 0.104, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 64.0015625, |
|
"grad_norm": 0.006006907671689987, |
|
"learning_rate": 3.871527777777778e-06, |
|
"loss": 0.0001, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 64.003125, |
|
"grad_norm": 0.0028358676936477423, |
|
"learning_rate": 3.854166666666667e-06, |
|
"loss": 0.0, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 64.0046875, |
|
"grad_norm": 0.002369464607909322, |
|
"learning_rate": 3.836805555555556e-06, |
|
"loss": 0.0, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 64.00625, |
|
"grad_norm": 0.0018403598805889487, |
|
"learning_rate": 3.819444444444444e-06, |
|
"loss": 0.0, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 64.0078125, |
|
"grad_norm": 0.0008932430064305663, |
|
"learning_rate": 3.8020833333333333e-06, |
|
"loss": 0.0, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 64.009375, |
|
"grad_norm": 0.0009429508936591446, |
|
"learning_rate": 3.7847222222222224e-06, |
|
"loss": 0.0, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 64.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.231806516647339, |
|
"eval_runtime": 7.6108, |
|
"eval_samples_per_second": 3.022, |
|
"eval_steps_per_second": 0.788, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 65.00078125, |
|
"grad_norm": 0.0008638093713670969, |
|
"learning_rate": 3.7673611111111114e-06, |
|
"loss": 0.0, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 65.00234375, |
|
"grad_norm": 0.0004404432838782668, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.0, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 65.00390625, |
|
"grad_norm": 0.0019350500078871846, |
|
"learning_rate": 3.732638888888889e-06, |
|
"loss": 0.0, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 65.00546875, |
|
"grad_norm": 0.000627523404546082, |
|
"learning_rate": 3.7152777777777783e-06, |
|
"loss": 0.0, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 65.00703125, |
|
"grad_norm": 0.0004430541303008795, |
|
"learning_rate": 3.6979166666666673e-06, |
|
"loss": 0.0, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 65.00859375, |
|
"grad_norm": 0.0006956512806937099, |
|
"learning_rate": 3.680555555555556e-06, |
|
"loss": 0.0, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 65.01015625, |
|
"grad_norm": 0.00038146533188410103, |
|
"learning_rate": 3.6631944444444446e-06, |
|
"loss": 0.0, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 65.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.1843786239624023, |
|
"eval_runtime": 138.2448, |
|
"eval_samples_per_second": 0.166, |
|
"eval_steps_per_second": 0.043, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 66.0015625, |
|
"grad_norm": 0.0004239015397615731, |
|
"learning_rate": 3.6458333333333333e-06, |
|
"loss": 0.0, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 66.003125, |
|
"grad_norm": 0.00977334938943386, |
|
"learning_rate": 3.6284722222222224e-06, |
|
"loss": 0.0, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 66.0046875, |
|
"grad_norm": 0.0007026152452453971, |
|
"learning_rate": 3.6111111111111115e-06, |
|
"loss": 0.0, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 66.00625, |
|
"grad_norm": 0.00105427170637995, |
|
"learning_rate": 3.59375e-06, |
|
"loss": 0.0, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 66.0078125, |
|
"grad_norm": 0.002872730838134885, |
|
"learning_rate": 3.576388888888889e-06, |
|
"loss": 0.0, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 66.009375, |
|
"grad_norm": 0.00210633035749197, |
|
"learning_rate": 3.5590277777777783e-06, |
|
"loss": 0.0, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 66.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.218122959136963, |
|
"eval_runtime": 6.6565, |
|
"eval_samples_per_second": 3.455, |
|
"eval_steps_per_second": 0.901, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 67.00078125, |
|
"grad_norm": 0.03868458420038223, |
|
"learning_rate": 3.5416666666666673e-06, |
|
"loss": 0.0, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 67.00234375, |
|
"grad_norm": 0.0006052263779565692, |
|
"learning_rate": 3.524305555555556e-06, |
|
"loss": 0.0, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 67.00390625, |
|
"grad_norm": 0.0004380837199278176, |
|
"learning_rate": 3.5069444444444447e-06, |
|
"loss": 0.0, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 67.00546875, |
|
"grad_norm": 0.0006919961306266487, |
|
"learning_rate": 3.4895833333333333e-06, |
|
"loss": 0.0, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 67.00703125, |
|
"grad_norm": 0.0007833715644665062, |
|
"learning_rate": 3.4722222222222224e-06, |
|
"loss": 0.0, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 67.00859375, |
|
"grad_norm": 0.0011872323229908943, |
|
"learning_rate": 3.4548611111111115e-06, |
|
"loss": 0.0, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 67.01015625, |
|
"grad_norm": 0.0018731305608525872, |
|
"learning_rate": 3.4375e-06, |
|
"loss": 0.0, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 67.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.293574571609497, |
|
"eval_runtime": 7.7812, |
|
"eval_samples_per_second": 2.956, |
|
"eval_steps_per_second": 0.771, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 68.0015625, |
|
"grad_norm": 0.0009602176141925156, |
|
"learning_rate": 3.420138888888889e-06, |
|
"loss": 0.0, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 68.003125, |
|
"grad_norm": 0.0007613273337483406, |
|
"learning_rate": 3.4027777777777783e-06, |
|
"loss": 0.0, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 68.0046875, |
|
"grad_norm": 0.0003638800699263811, |
|
"learning_rate": 3.385416666666667e-06, |
|
"loss": 0.0, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 68.00625, |
|
"grad_norm": 0.0005400192458182573, |
|
"learning_rate": 3.368055555555556e-06, |
|
"loss": 0.0, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 68.0078125, |
|
"grad_norm": 0.000447685772087425, |
|
"learning_rate": 3.3506944444444447e-06, |
|
"loss": 0.0, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 68.009375, |
|
"grad_norm": 0.00042887049494311213, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.0, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 68.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.3043465614318848, |
|
"eval_runtime": 7.4553, |
|
"eval_samples_per_second": 3.085, |
|
"eval_steps_per_second": 0.805, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 69.00078125, |
|
"grad_norm": 0.0013600009260699153, |
|
"learning_rate": 3.3159722222222224e-06, |
|
"loss": 0.0, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 69.00234375, |
|
"grad_norm": 0.0006413975497707725, |
|
"learning_rate": 3.2986111111111115e-06, |
|
"loss": 0.0, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 69.00390625, |
|
"grad_norm": 0.0006146755767986178, |
|
"learning_rate": 3.28125e-06, |
|
"loss": 0.0, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 69.00546875, |
|
"grad_norm": 0.000594130833633244, |
|
"learning_rate": 3.2638888888888892e-06, |
|
"loss": 0.0, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 69.00703125, |
|
"grad_norm": 0.0006359369726851583, |
|
"learning_rate": 3.2465277777777783e-06, |
|
"loss": 0.0, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 69.00859375, |
|
"grad_norm": 0.0011723951902240515, |
|
"learning_rate": 3.229166666666667e-06, |
|
"loss": 0.0, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 69.01015625, |
|
"grad_norm": 0.00037415928090922534, |
|
"learning_rate": 3.211805555555556e-06, |
|
"loss": 0.0, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 69.01015625, |
|
"eval_accuracy": 0.6521739130434783, |
|
"eval_loss": 3.136009931564331, |
|
"eval_runtime": 6.68, |
|
"eval_samples_per_second": 3.443, |
|
"eval_steps_per_second": 0.898, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 70.0015625, |
|
"grad_norm": 0.0006099409656599164, |
|
"learning_rate": 3.1944444444444443e-06, |
|
"loss": 0.0, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 70.003125, |
|
"grad_norm": 0.0007047757972031832, |
|
"learning_rate": 3.1770833333333333e-06, |
|
"loss": 0.0, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 70.0046875, |
|
"grad_norm": 0.0007766408962197602, |
|
"learning_rate": 3.1597222222222224e-06, |
|
"loss": 0.0, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 70.00625, |
|
"grad_norm": 0.0007212317432276905, |
|
"learning_rate": 3.1423611111111115e-06, |
|
"loss": 0.0, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 70.0078125, |
|
"grad_norm": 0.00269865314476192, |
|
"learning_rate": 3.125e-06, |
|
"loss": 0.0, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 70.009375, |
|
"grad_norm": 0.0007821142789907753, |
|
"learning_rate": 3.1076388888888892e-06, |
|
"loss": 0.0186, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 70.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.3659093379974365, |
|
"eval_runtime": 7.3745, |
|
"eval_samples_per_second": 3.119, |
|
"eval_steps_per_second": 0.814, |
|
"step": 4615 |
|
}, |
|
{ |
|
"epoch": 71.00078125, |
|
"grad_norm": 0.0032316665165126324, |
|
"learning_rate": 3.0902777777777783e-06, |
|
"loss": 0.0, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 71.00234375, |
|
"grad_norm": 0.006559448316693306, |
|
"learning_rate": 3.072916666666667e-06, |
|
"loss": 0.0901, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 71.00390625, |
|
"grad_norm": 0.002651064656674862, |
|
"learning_rate": 3.055555555555556e-06, |
|
"loss": 0.0, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 71.00546875, |
|
"grad_norm": 0.0003016916452907026, |
|
"learning_rate": 3.0381944444444443e-06, |
|
"loss": 0.0, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 71.00703125, |
|
"grad_norm": 0.0018887541955336928, |
|
"learning_rate": 3.0208333333333334e-06, |
|
"loss": 0.0, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 71.00859375, |
|
"grad_norm": 0.00041917903581634164, |
|
"learning_rate": 3.0034722222222224e-06, |
|
"loss": 0.1323, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 71.01015625, |
|
"grad_norm": 0.003749463940039277, |
|
"learning_rate": 2.986111111111111e-06, |
|
"loss": 0.0, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 71.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.5226190090179443, |
|
"eval_runtime": 7.2879, |
|
"eval_samples_per_second": 3.156, |
|
"eval_steps_per_second": 0.823, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 72.0015625, |
|
"grad_norm": 0.00048035994404926896, |
|
"learning_rate": 2.96875e-06, |
|
"loss": 0.0, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 72.003125, |
|
"grad_norm": 0.0015026311157271266, |
|
"learning_rate": 2.9513888888888892e-06, |
|
"loss": 0.0, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 72.0046875, |
|
"grad_norm": 0.0003598359180614352, |
|
"learning_rate": 2.9340277777777783e-06, |
|
"loss": 0.0001, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 72.00625, |
|
"grad_norm": 0.0005997862317599356, |
|
"learning_rate": 2.916666666666667e-06, |
|
"loss": 0.0, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 72.0078125, |
|
"grad_norm": 0.0005218391306698322, |
|
"learning_rate": 2.899305555555556e-06, |
|
"loss": 0.0, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 72.009375, |
|
"grad_norm": 0.0006006735493429005, |
|
"learning_rate": 2.8819444444444443e-06, |
|
"loss": 0.0, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 72.01015625, |
|
"eval_accuracy": 0.6521739130434783, |
|
"eval_loss": 2.7737224102020264, |
|
"eval_runtime": 7.702, |
|
"eval_samples_per_second": 2.986, |
|
"eval_steps_per_second": 0.779, |
|
"step": 4745 |
|
}, |
|
{ |
|
"epoch": 73.00078125, |
|
"grad_norm": 0.0005741188651882112, |
|
"learning_rate": 2.8645833333333334e-06, |
|
"loss": 0.0, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 73.00234375, |
|
"grad_norm": 0.0005552352522499859, |
|
"learning_rate": 2.8472222222222224e-06, |
|
"loss": 0.0, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 73.00390625, |
|
"grad_norm": 0.0004445587401278317, |
|
"learning_rate": 2.829861111111111e-06, |
|
"loss": 0.0, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 73.00546875, |
|
"grad_norm": 0.006175428628921509, |
|
"learning_rate": 2.8125e-06, |
|
"loss": 0.0, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 73.00703125, |
|
"grad_norm": 0.0004637633974198252, |
|
"learning_rate": 2.7951388888888893e-06, |
|
"loss": 0.0, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 73.00859375, |
|
"grad_norm": 0.0002843421825673431, |
|
"learning_rate": 2.7777777777777783e-06, |
|
"loss": 0.0, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 73.01015625, |
|
"grad_norm": 0.00034766693715937436, |
|
"learning_rate": 2.760416666666667e-06, |
|
"loss": 0.0, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 73.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.672950029373169, |
|
"eval_runtime": 7.9091, |
|
"eval_samples_per_second": 2.908, |
|
"eval_steps_per_second": 0.759, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 74.0015625, |
|
"grad_norm": 0.00034793929080478847, |
|
"learning_rate": 2.743055555555556e-06, |
|
"loss": 0.0, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 74.003125, |
|
"grad_norm": 0.0005387531709857285, |
|
"learning_rate": 2.7256944444444443e-06, |
|
"loss": 0.0, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 74.0046875, |
|
"grad_norm": 0.001991413999348879, |
|
"learning_rate": 2.7083333333333334e-06, |
|
"loss": 0.0, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 74.00625, |
|
"grad_norm": 0.0003038486756850034, |
|
"learning_rate": 2.6909722222222225e-06, |
|
"loss": 0.0, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 74.0078125, |
|
"grad_norm": 0.0008349215495400131, |
|
"learning_rate": 2.673611111111111e-06, |
|
"loss": 0.0, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 74.009375, |
|
"grad_norm": 0.000454833876574412, |
|
"learning_rate": 2.65625e-06, |
|
"loss": 0.0, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 74.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.7864532470703125, |
|
"eval_runtime": 8.0486, |
|
"eval_samples_per_second": 2.858, |
|
"eval_steps_per_second": 0.745, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 75.00078125, |
|
"grad_norm": 0.00047676265239715576, |
|
"learning_rate": 2.6388888888888893e-06, |
|
"loss": 0.0, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 75.00234375, |
|
"grad_norm": 0.0028695412911474705, |
|
"learning_rate": 2.621527777777778e-06, |
|
"loss": 0.0, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 75.00390625, |
|
"grad_norm": 0.0006085538188926876, |
|
"learning_rate": 2.604166666666667e-06, |
|
"loss": 0.0, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 75.00546875, |
|
"grad_norm": 0.00026899942895397544, |
|
"learning_rate": 2.586805555555556e-06, |
|
"loss": 0.0, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 75.00703125, |
|
"grad_norm": 0.0008939497638493776, |
|
"learning_rate": 2.5694444444444443e-06, |
|
"loss": 0.0, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 75.00859375, |
|
"grad_norm": 0.0004671047499869019, |
|
"learning_rate": 2.5520833333333334e-06, |
|
"loss": 0.0, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 75.01015625, |
|
"grad_norm": 0.0005671957042068243, |
|
"learning_rate": 2.5347222222222225e-06, |
|
"loss": 0.0, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 75.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.792165994644165, |
|
"eval_runtime": 7.0834, |
|
"eval_samples_per_second": 3.247, |
|
"eval_steps_per_second": 0.847, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 76.0015625, |
|
"grad_norm": 0.0005376060144044459, |
|
"learning_rate": 2.517361111111111e-06, |
|
"loss": 0.0, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 76.003125, |
|
"grad_norm": 0.002598729683086276, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.0072, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 76.0046875, |
|
"grad_norm": 0.014847584068775177, |
|
"learning_rate": 2.4826388888888893e-06, |
|
"loss": 0.0, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 76.00625, |
|
"grad_norm": 0.00036059808917343616, |
|
"learning_rate": 2.465277777777778e-06, |
|
"loss": 0.1283, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 76.0078125, |
|
"grad_norm": 0.00047456120955757797, |
|
"learning_rate": 2.4479166666666666e-06, |
|
"loss": 0.0, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 76.009375, |
|
"grad_norm": 0.003510046051815152, |
|
"learning_rate": 2.4305555555555557e-06, |
|
"loss": 0.0, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 76.01015625, |
|
"eval_accuracy": 0.6086956521739131, |
|
"eval_loss": 3.055154800415039, |
|
"eval_runtime": 18.4062, |
|
"eval_samples_per_second": 1.25, |
|
"eval_steps_per_second": 0.326, |
|
"step": 5005 |
|
}, |
|
{ |
|
"epoch": 77.00078125, |
|
"grad_norm": 0.0008884906419552863, |
|
"learning_rate": 2.4131944444444448e-06, |
|
"loss": 0.0068, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 77.00234375, |
|
"grad_norm": 0.0005388921708799899, |
|
"learning_rate": 2.395833333333334e-06, |
|
"loss": 0.0, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 77.00390625, |
|
"grad_norm": 0.00851092953234911, |
|
"learning_rate": 2.3784722222222225e-06, |
|
"loss": 0.0, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 77.00546875, |
|
"grad_norm": 0.00045580798177979887, |
|
"learning_rate": 2.361111111111111e-06, |
|
"loss": 0.0001, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 77.00703125, |
|
"grad_norm": 0.00034425558988004923, |
|
"learning_rate": 2.3437500000000002e-06, |
|
"loss": 0.0, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 77.00859375, |
|
"grad_norm": 0.0005117100663483143, |
|
"learning_rate": 2.3263888888888893e-06, |
|
"loss": 0.0, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 77.01015625, |
|
"grad_norm": 0.0008873433689586818, |
|
"learning_rate": 2.309027777777778e-06, |
|
"loss": 0.0, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 77.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.493333578109741, |
|
"eval_runtime": 7.0979, |
|
"eval_samples_per_second": 3.24, |
|
"eval_steps_per_second": 0.845, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 78.0015625, |
|
"grad_norm": 0.0033883508294820786, |
|
"learning_rate": 2.2916666666666666e-06, |
|
"loss": 0.0, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 78.003125, |
|
"grad_norm": 0.001539805089123547, |
|
"learning_rate": 2.2743055555555557e-06, |
|
"loss": 0.0, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 78.0046875, |
|
"grad_norm": 0.0013049524277448654, |
|
"learning_rate": 2.2569444444444448e-06, |
|
"loss": 0.0, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 78.00625, |
|
"grad_norm": 0.001957050757482648, |
|
"learning_rate": 2.2395833333333334e-06, |
|
"loss": 0.0, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 78.0078125, |
|
"grad_norm": 0.0028707508463412523, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 0.0, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 78.009375, |
|
"grad_norm": 0.0010582773247733712, |
|
"learning_rate": 2.204861111111111e-06, |
|
"loss": 0.0044, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 78.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.1810925006866455, |
|
"eval_runtime": 6.9675, |
|
"eval_samples_per_second": 3.301, |
|
"eval_steps_per_second": 0.861, |
|
"step": 5135 |
|
}, |
|
{ |
|
"epoch": 79.00078125, |
|
"grad_norm": 0.00025594123871997, |
|
"learning_rate": 2.1875000000000002e-06, |
|
"loss": 0.0, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 79.00234375, |
|
"grad_norm": 0.0011205327464267612, |
|
"learning_rate": 2.1701388888888893e-06, |
|
"loss": 0.0, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 79.00390625, |
|
"grad_norm": 0.0011309379478916526, |
|
"learning_rate": 2.152777777777778e-06, |
|
"loss": 0.0, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 79.00546875, |
|
"grad_norm": 0.0006058907019905746, |
|
"learning_rate": 2.1354166666666666e-06, |
|
"loss": 0.0, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 79.00703125, |
|
"grad_norm": 0.0004243666189722717, |
|
"learning_rate": 2.1180555555555557e-06, |
|
"loss": 0.0, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 79.00859375, |
|
"grad_norm": 0.16494303941726685, |
|
"learning_rate": 2.1006944444444448e-06, |
|
"loss": 0.0, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 79.01015625, |
|
"grad_norm": 0.0006716122734360397, |
|
"learning_rate": 2.0833333333333334e-06, |
|
"loss": 0.0, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 79.01015625, |
|
"eval_accuracy": 0.782608695652174, |
|
"eval_loss": 1.9050878286361694, |
|
"eval_runtime": 6.8445, |
|
"eval_samples_per_second": 3.36, |
|
"eval_steps_per_second": 0.877, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 80.0015625, |
|
"grad_norm": 0.0005032762419432402, |
|
"learning_rate": 2.065972222222222e-06, |
|
"loss": 0.0, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 80.003125, |
|
"grad_norm": 0.0006515904678963125, |
|
"learning_rate": 2.048611111111111e-06, |
|
"loss": 0.0, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 80.0046875, |
|
"grad_norm": 0.0008974867523647845, |
|
"learning_rate": 2.0312500000000002e-06, |
|
"loss": 0.0, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 80.00625, |
|
"grad_norm": 0.0004513378662522882, |
|
"learning_rate": 2.0138888888888893e-06, |
|
"loss": 0.0, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 80.0078125, |
|
"grad_norm": 0.0005900393007323146, |
|
"learning_rate": 1.996527777777778e-06, |
|
"loss": 0.0, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 80.009375, |
|
"grad_norm": 0.0005228605587035418, |
|
"learning_rate": 1.9791666666666666e-06, |
|
"loss": 0.0, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 80.01015625, |
|
"eval_accuracy": 0.8260869565217391, |
|
"eval_loss": 1.8406822681427002, |
|
"eval_runtime": 7.235, |
|
"eval_samples_per_second": 3.179, |
|
"eval_steps_per_second": 0.829, |
|
"step": 5265 |
|
}, |
|
{ |
|
"epoch": 81.00078125, |
|
"grad_norm": 0.0006630319985561073, |
|
"learning_rate": 1.9618055555555557e-06, |
|
"loss": 0.0, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 81.00234375, |
|
"grad_norm": 0.0007084131939336658, |
|
"learning_rate": 1.944444444444445e-06, |
|
"loss": 0.0, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 81.00390625, |
|
"grad_norm": 0.0004504051757976413, |
|
"learning_rate": 1.9270833333333334e-06, |
|
"loss": 0.0, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 81.00546875, |
|
"grad_norm": 0.0005867110448889434, |
|
"learning_rate": 1.909722222222222e-06, |
|
"loss": 0.0, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 81.00703125, |
|
"grad_norm": 0.00044120638631284237, |
|
"learning_rate": 1.8923611111111112e-06, |
|
"loss": 0.0027, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 81.00859375, |
|
"grad_norm": 0.004088283982127905, |
|
"learning_rate": 1.8750000000000003e-06, |
|
"loss": 0.0, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 81.01015625, |
|
"grad_norm": 0.00046825598110444844, |
|
"learning_rate": 1.8576388888888891e-06, |
|
"loss": 0.0, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 81.01015625, |
|
"eval_accuracy": 0.782608695652174, |
|
"eval_loss": 2.196652889251709, |
|
"eval_runtime": 7.2441, |
|
"eval_samples_per_second": 3.175, |
|
"eval_steps_per_second": 0.828, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 82.0015625, |
|
"grad_norm": 0.001545197912491858, |
|
"learning_rate": 1.840277777777778e-06, |
|
"loss": 0.0001, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 82.003125, |
|
"grad_norm": 0.0010275169042870402, |
|
"learning_rate": 1.8229166666666666e-06, |
|
"loss": 0.0, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 82.0046875, |
|
"grad_norm": 0.0010215662186965346, |
|
"learning_rate": 1.8055555555555557e-06, |
|
"loss": 0.0, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 82.00625, |
|
"grad_norm": 0.0014606657205149531, |
|
"learning_rate": 1.7881944444444446e-06, |
|
"loss": 0.0, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 82.0078125, |
|
"grad_norm": 0.00041382870404049754, |
|
"learning_rate": 1.7708333333333337e-06, |
|
"loss": 0.0, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 82.009375, |
|
"grad_norm": 0.0007086304831318557, |
|
"learning_rate": 1.7534722222222223e-06, |
|
"loss": 0.0, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 82.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.3231327533721924, |
|
"eval_runtime": 6.3536, |
|
"eval_samples_per_second": 3.62, |
|
"eval_steps_per_second": 0.944, |
|
"step": 5395 |
|
}, |
|
{ |
|
"epoch": 83.00078125, |
|
"grad_norm": 0.0003945750358980149, |
|
"learning_rate": 1.7361111111111112e-06, |
|
"loss": 0.0, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 83.00234375, |
|
"grad_norm": 0.0006641672225669026, |
|
"learning_rate": 1.71875e-06, |
|
"loss": 0.0, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 83.00390625, |
|
"grad_norm": 0.0003971403057221323, |
|
"learning_rate": 1.7013888888888891e-06, |
|
"loss": 0.0, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 83.00546875, |
|
"grad_norm": 0.0005819292273372412, |
|
"learning_rate": 1.684027777777778e-06, |
|
"loss": 0.0, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 83.00703125, |
|
"grad_norm": 0.0006034942343831062, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.0, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 83.00859375, |
|
"grad_norm": 0.00044165682629682124, |
|
"learning_rate": 1.6493055555555557e-06, |
|
"loss": 0.0, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 83.01015625, |
|
"grad_norm": 0.0006319324602372944, |
|
"learning_rate": 1.6319444444444446e-06, |
|
"loss": 0.0, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 83.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.342456579208374, |
|
"eval_runtime": 7.9283, |
|
"eval_samples_per_second": 2.901, |
|
"eval_steps_per_second": 0.757, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 84.0015625, |
|
"grad_norm": 0.0004467582912184298, |
|
"learning_rate": 1.6145833333333335e-06, |
|
"loss": 0.0, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 84.003125, |
|
"grad_norm": 0.0006506055360659957, |
|
"learning_rate": 1.5972222222222221e-06, |
|
"loss": 0.0, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 84.0046875, |
|
"grad_norm": 0.0003559678152669221, |
|
"learning_rate": 1.5798611111111112e-06, |
|
"loss": 0.0, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 84.00625, |
|
"grad_norm": 0.0005574678652919829, |
|
"learning_rate": 1.5625e-06, |
|
"loss": 0.0, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 84.0078125, |
|
"grad_norm": 0.0009164654766209424, |
|
"learning_rate": 1.5451388888888892e-06, |
|
"loss": 0.1025, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 84.009375, |
|
"grad_norm": 0.0003405215102247894, |
|
"learning_rate": 1.527777777777778e-06, |
|
"loss": 0.0, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 84.01015625, |
|
"eval_accuracy": 0.5652173913043478, |
|
"eval_loss": 2.840322971343994, |
|
"eval_runtime": 8.0804, |
|
"eval_samples_per_second": 2.846, |
|
"eval_steps_per_second": 0.743, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 85.00078125, |
|
"grad_norm": 0.0005728736286982894, |
|
"learning_rate": 1.5104166666666667e-06, |
|
"loss": 0.0, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 85.00234375, |
|
"grad_norm": 0.0006991293630562723, |
|
"learning_rate": 1.4930555555555555e-06, |
|
"loss": 0.2353, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 85.00390625, |
|
"grad_norm": 0.0010961528168991208, |
|
"learning_rate": 1.4756944444444446e-06, |
|
"loss": 0.0, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 85.00546875, |
|
"grad_norm": 0.0006365890149027109, |
|
"learning_rate": 1.4583333333333335e-06, |
|
"loss": 0.0, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 85.00703125, |
|
"grad_norm": 0.000564310175832361, |
|
"learning_rate": 1.4409722222222221e-06, |
|
"loss": 0.0, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 85.00859375, |
|
"grad_norm": 0.0025850527454167604, |
|
"learning_rate": 1.4236111111111112e-06, |
|
"loss": 0.0, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 85.01015625, |
|
"grad_norm": 0.0003799022815655917, |
|
"learning_rate": 1.40625e-06, |
|
"loss": 0.0, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 85.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.342359781265259, |
|
"eval_runtime": 4.4336, |
|
"eval_samples_per_second": 5.188, |
|
"eval_steps_per_second": 1.353, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 86.0015625, |
|
"grad_norm": 0.000343962456099689, |
|
"learning_rate": 1.3888888888888892e-06, |
|
"loss": 0.0, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 86.003125, |
|
"grad_norm": 0.0007259986596181989, |
|
"learning_rate": 1.371527777777778e-06, |
|
"loss": 0.0, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 86.0046875, |
|
"grad_norm": 0.0005732372519560158, |
|
"learning_rate": 1.3541666666666667e-06, |
|
"loss": 0.0, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 86.00625, |
|
"grad_norm": 0.0008637688006274402, |
|
"learning_rate": 1.3368055555555556e-06, |
|
"loss": 0.0, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 86.0078125, |
|
"grad_norm": 0.001968758413568139, |
|
"learning_rate": 1.3194444444444446e-06, |
|
"loss": 0.0, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 86.009375, |
|
"grad_norm": 0.0009530284442007542, |
|
"learning_rate": 1.3020833333333335e-06, |
|
"loss": 0.0, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 86.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.4246294498443604, |
|
"eval_runtime": 6.8425, |
|
"eval_samples_per_second": 3.361, |
|
"eval_steps_per_second": 0.877, |
|
"step": 5655 |
|
}, |
|
{ |
|
"epoch": 87.00078125, |
|
"grad_norm": 0.0003137109742965549, |
|
"learning_rate": 1.2847222222222222e-06, |
|
"loss": 0.0, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 87.00234375, |
|
"grad_norm": 0.0005719884647987783, |
|
"learning_rate": 1.2673611111111112e-06, |
|
"loss": 0.0, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 87.00390625, |
|
"grad_norm": 0.0004218370304442942, |
|
"learning_rate": 1.25e-06, |
|
"loss": 0.0, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 87.00546875, |
|
"grad_norm": 0.0008452400797978044, |
|
"learning_rate": 1.232638888888889e-06, |
|
"loss": 0.0, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 87.00703125, |
|
"grad_norm": 0.0008123002480715513, |
|
"learning_rate": 1.2152777777777778e-06, |
|
"loss": 0.0, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 87.00859375, |
|
"grad_norm": 0.009576586075127125, |
|
"learning_rate": 1.197916666666667e-06, |
|
"loss": 0.0, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 87.01015625, |
|
"grad_norm": 0.0009949628729373217, |
|
"learning_rate": 1.1805555555555556e-06, |
|
"loss": 0.0, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 87.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.428931951522827, |
|
"eval_runtime": 5.7719, |
|
"eval_samples_per_second": 3.985, |
|
"eval_steps_per_second": 1.04, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 88.0015625, |
|
"grad_norm": 0.0004862360074184835, |
|
"learning_rate": 1.1631944444444446e-06, |
|
"loss": 0.0, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 88.003125, |
|
"grad_norm": 0.0013689215993508697, |
|
"learning_rate": 1.1458333333333333e-06, |
|
"loss": 0.0, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 88.0046875, |
|
"grad_norm": 0.0004143691621720791, |
|
"learning_rate": 1.1284722222222224e-06, |
|
"loss": 0.0, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 88.00625, |
|
"grad_norm": 0.000784511910751462, |
|
"learning_rate": 1.111111111111111e-06, |
|
"loss": 0.0, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 88.0078125, |
|
"grad_norm": 0.0004031602293252945, |
|
"learning_rate": 1.0937500000000001e-06, |
|
"loss": 0.0, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 88.009375, |
|
"grad_norm": 0.001072776154614985, |
|
"learning_rate": 1.076388888888889e-06, |
|
"loss": 0.0, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 88.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.43101167678833, |
|
"eval_runtime": 4.7751, |
|
"eval_samples_per_second": 4.817, |
|
"eval_steps_per_second": 1.257, |
|
"step": 5785 |
|
}, |
|
{ |
|
"epoch": 89.00078125, |
|
"grad_norm": 0.0005272148991934955, |
|
"learning_rate": 1.0590277777777778e-06, |
|
"loss": 0.0, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 89.00234375, |
|
"grad_norm": 0.0004827801021747291, |
|
"learning_rate": 1.0416666666666667e-06, |
|
"loss": 0.0, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 89.00390625, |
|
"grad_norm": 0.0005783537635579705, |
|
"learning_rate": 1.0243055555555556e-06, |
|
"loss": 0.0, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 89.00546875, |
|
"grad_norm": 0.0006673180614598095, |
|
"learning_rate": 1.0069444444444447e-06, |
|
"loss": 0.0, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 89.00703125, |
|
"grad_norm": 0.0004277201369404793, |
|
"learning_rate": 9.895833333333333e-07, |
|
"loss": 0.0, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 89.00859375, |
|
"grad_norm": 0.0005442069377750158, |
|
"learning_rate": 9.722222222222224e-07, |
|
"loss": 0.0, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 89.01015625, |
|
"grad_norm": 0.0005160087021067739, |
|
"learning_rate": 9.54861111111111e-07, |
|
"loss": 0.0, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 89.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.43607234954834, |
|
"eval_runtime": 4.5703, |
|
"eval_samples_per_second": 5.032, |
|
"eval_steps_per_second": 1.313, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 90.0015625, |
|
"grad_norm": 0.0002721658383961767, |
|
"learning_rate": 9.375000000000001e-07, |
|
"loss": 0.0, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 90.003125, |
|
"grad_norm": 0.0004188539751339704, |
|
"learning_rate": 9.20138888888889e-07, |
|
"loss": 0.0, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 90.0046875, |
|
"grad_norm": 0.014377003535628319, |
|
"learning_rate": 9.027777777777779e-07, |
|
"loss": 0.0, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 90.00625, |
|
"grad_norm": 0.001269037718884647, |
|
"learning_rate": 8.854166666666668e-07, |
|
"loss": 0.0, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 90.0078125, |
|
"grad_norm": 0.0020640110597014427, |
|
"learning_rate": 8.680555555555556e-07, |
|
"loss": 0.0, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 90.009375, |
|
"grad_norm": 0.00035188894253224134, |
|
"learning_rate": 8.506944444444446e-07, |
|
"loss": 0.0, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 90.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.366661310195923, |
|
"eval_runtime": 4.486, |
|
"eval_samples_per_second": 5.127, |
|
"eval_steps_per_second": 1.338, |
|
"step": 5915 |
|
}, |
|
{ |
|
"epoch": 91.00078125, |
|
"grad_norm": 0.00033934033126570284, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 0.0, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 91.00234375, |
|
"grad_norm": 0.0003388167533557862, |
|
"learning_rate": 8.159722222222223e-07, |
|
"loss": 0.0, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 91.00390625, |
|
"grad_norm": 0.0002893786586355418, |
|
"learning_rate": 7.986111111111111e-07, |
|
"loss": 0.0, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 91.00546875, |
|
"grad_norm": 0.0014214407419785857, |
|
"learning_rate": 7.8125e-07, |
|
"loss": 0.0, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 91.00703125, |
|
"grad_norm": 0.00035594150540418923, |
|
"learning_rate": 7.63888888888889e-07, |
|
"loss": 0.0, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 91.00859375, |
|
"grad_norm": 0.0007969331927597523, |
|
"learning_rate": 7.465277777777778e-07, |
|
"loss": 0.0, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 91.01015625, |
|
"grad_norm": 0.004214271903038025, |
|
"learning_rate": 7.291666666666667e-07, |
|
"loss": 0.0, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 91.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.3626725673675537, |
|
"eval_runtime": 67.4451, |
|
"eval_samples_per_second": 0.341, |
|
"eval_steps_per_second": 0.089, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 92.0015625, |
|
"grad_norm": 0.0013107710983604193, |
|
"learning_rate": 7.118055555555556e-07, |
|
"loss": 0.0, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 92.003125, |
|
"grad_norm": 0.0003296121140010655, |
|
"learning_rate": 6.944444444444446e-07, |
|
"loss": 0.0, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 92.0046875, |
|
"grad_norm": 0.00023893473553471267, |
|
"learning_rate": 6.770833333333333e-07, |
|
"loss": 0.0, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 92.00625, |
|
"grad_norm": 0.0014689895324409008, |
|
"learning_rate": 6.597222222222223e-07, |
|
"loss": 0.0, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 92.0078125, |
|
"grad_norm": 0.0019727437756955624, |
|
"learning_rate": 6.423611111111111e-07, |
|
"loss": 0.0, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 92.009375, |
|
"grad_norm": 0.0005726708914153278, |
|
"learning_rate": 6.25e-07, |
|
"loss": 0.0, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 92.01015625, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 2.3715200424194336, |
|
"eval_runtime": 4.4622, |
|
"eval_samples_per_second": 5.154, |
|
"eval_steps_per_second": 1.345, |
|
"step": 6045 |
|
}, |
|
{ |
|
"epoch": 93.00078125, |
|
"grad_norm": 0.00046812117216177285, |
|
"learning_rate": 6.076388888888889e-07, |
|
"loss": 0.0039, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 93.00234375, |
|
"grad_norm": 0.0027324645780026913, |
|
"learning_rate": 5.902777777777778e-07, |
|
"loss": 0.0, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 93.00390625, |
|
"grad_norm": 0.00047073099995031953, |
|
"learning_rate": 5.729166666666667e-07, |
|
"loss": 0.0, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 93.00546875, |
|
"grad_norm": 0.0006114395800977945, |
|
"learning_rate": 5.555555555555555e-07, |
|
"loss": 0.0, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 93.00703125, |
|
"grad_norm": 0.00047333547263406217, |
|
"learning_rate": 5.381944444444445e-07, |
|
"loss": 0.0, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 93.00859375, |
|
"grad_norm": 0.0005634532426483929, |
|
"learning_rate": 5.208333333333334e-07, |
|
"loss": 0.0, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 93.01015625, |
|
"grad_norm": 0.0014537627575919032, |
|
"learning_rate": 5.034722222222223e-07, |
|
"loss": 0.0, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 93.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.3772759437561035, |
|
"eval_runtime": 4.6391, |
|
"eval_samples_per_second": 4.958, |
|
"eval_steps_per_second": 1.293, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 94.0015625, |
|
"grad_norm": 0.00027765726554207504, |
|
"learning_rate": 4.861111111111112e-07, |
|
"loss": 0.0, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 94.003125, |
|
"grad_norm": 0.001291173743084073, |
|
"learning_rate": 4.6875000000000006e-07, |
|
"loss": 0.0, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 94.0046875, |
|
"grad_norm": 0.0007418213644996285, |
|
"learning_rate": 4.5138888888888893e-07, |
|
"loss": 0.0, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 94.00625, |
|
"grad_norm": 0.0003440550353843719, |
|
"learning_rate": 4.340277777777778e-07, |
|
"loss": 0.0, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 94.0078125, |
|
"grad_norm": 0.0003957599401473999, |
|
"learning_rate": 4.1666666666666667e-07, |
|
"loss": 0.0, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 94.009375, |
|
"grad_norm": 0.0005624539335258305, |
|
"learning_rate": 3.9930555555555553e-07, |
|
"loss": 0.0, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 94.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.4264447689056396, |
|
"eval_runtime": 4.4311, |
|
"eval_samples_per_second": 5.191, |
|
"eval_steps_per_second": 1.354, |
|
"step": 6175 |
|
}, |
|
{ |
|
"epoch": 95.00078125, |
|
"grad_norm": 0.0005723352078348398, |
|
"learning_rate": 3.819444444444445e-07, |
|
"loss": 0.0, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 95.00234375, |
|
"grad_norm": 0.0003953687846660614, |
|
"learning_rate": 3.6458333333333337e-07, |
|
"loss": 0.0, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 95.00390625, |
|
"grad_norm": 0.003462144872173667, |
|
"learning_rate": 3.472222222222223e-07, |
|
"loss": 0.0, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 95.00546875, |
|
"grad_norm": 0.0016035564476624131, |
|
"learning_rate": 3.2986111111111116e-07, |
|
"loss": 0.0, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 95.00703125, |
|
"grad_norm": 0.0005412808386608958, |
|
"learning_rate": 3.125e-07, |
|
"loss": 0.0, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 95.00859375, |
|
"grad_norm": 0.0007983644027262926, |
|
"learning_rate": 2.951388888888889e-07, |
|
"loss": 0.0, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 95.01015625, |
|
"grad_norm": 0.0008442331454716623, |
|
"learning_rate": 2.7777777777777776e-07, |
|
"loss": 0.0, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 95.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.439257860183716, |
|
"eval_runtime": 4.2971, |
|
"eval_samples_per_second": 5.352, |
|
"eval_steps_per_second": 1.396, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 96.0015625, |
|
"grad_norm": 0.0006277182837948203, |
|
"learning_rate": 2.604166666666667e-07, |
|
"loss": 0.0, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 96.003125, |
|
"grad_norm": 0.0005379779613576829, |
|
"learning_rate": 2.430555555555556e-07, |
|
"loss": 0.0, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 96.0046875, |
|
"grad_norm": 0.00022837742289993912, |
|
"learning_rate": 2.2569444444444447e-07, |
|
"loss": 0.0, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 96.00625, |
|
"grad_norm": 0.0003217007906641811, |
|
"learning_rate": 2.0833333333333333e-07, |
|
"loss": 0.0, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 96.0078125, |
|
"grad_norm": 0.0003856255498249084, |
|
"learning_rate": 1.9097222222222225e-07, |
|
"loss": 0.0, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 96.009375, |
|
"grad_norm": 0.0003599489573389292, |
|
"learning_rate": 1.7361111111111115e-07, |
|
"loss": 0.0, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 96.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.4449336528778076, |
|
"eval_runtime": 4.3099, |
|
"eval_samples_per_second": 5.337, |
|
"eval_steps_per_second": 1.392, |
|
"step": 6305 |
|
}, |
|
{ |
|
"epoch": 97.00078125, |
|
"grad_norm": 0.00033434602664783597, |
|
"learning_rate": 1.5625e-07, |
|
"loss": 0.0, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 97.00234375, |
|
"grad_norm": 0.0005138604319654405, |
|
"learning_rate": 1.3888888888888888e-07, |
|
"loss": 0.0, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 97.00390625, |
|
"grad_norm": 0.0004908979753963649, |
|
"learning_rate": 1.215277777777778e-07, |
|
"loss": 0.0, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 97.00546875, |
|
"grad_norm": 0.000500280992127955, |
|
"learning_rate": 1.0416666666666667e-07, |
|
"loss": 0.0, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 97.00703125, |
|
"grad_norm": 0.00043059204472228885, |
|
"learning_rate": 8.680555555555557e-08, |
|
"loss": 0.0, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 97.00859375, |
|
"grad_norm": 0.00024899342679418623, |
|
"learning_rate": 6.944444444444444e-08, |
|
"loss": 0.0, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 97.01015625, |
|
"grad_norm": 0.00028796697733923793, |
|
"learning_rate": 5.208333333333333e-08, |
|
"loss": 0.0, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 97.01015625, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.445077419281006, |
|
"eval_runtime": 4.3184, |
|
"eval_samples_per_second": 5.326, |
|
"eval_steps_per_second": 1.389, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 98.0015625, |
|
"grad_norm": 0.0007087057456374168, |
|
"learning_rate": 3.472222222222222e-08, |
|
"loss": 0.0, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 98.003125, |
|
"grad_norm": 0.0005677673034369946, |
|
"learning_rate": 1.736111111111111e-08, |
|
"loss": 0.0, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 98.0046875, |
|
"grad_norm": 0.0004224831354804337, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 98.0046875, |
|
"eval_accuracy": 0.7391304347826086, |
|
"eval_loss": 2.4450957775115967, |
|
"eval_runtime": 5.2109, |
|
"eval_samples_per_second": 4.414, |
|
"eval_steps_per_second": 1.151, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 98.0046875, |
|
"step": 6400, |
|
"total_flos": 1.1111931193490001e+20, |
|
"train_loss": 0.0934151310825655, |
|
"train_runtime": 12972.4051, |
|
"train_samples_per_second": 1.973, |
|
"train_steps_per_second": 0.493 |
|
}, |
|
{ |
|
"epoch": 98.0046875, |
|
"eval_accuracy": 0.8266666666666667, |
|
"eval_loss": 1.4311991930007935, |
|
"eval_runtime": 15.3975, |
|
"eval_samples_per_second": 4.871, |
|
"eval_steps_per_second": 1.234, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 98.0046875, |
|
"eval_accuracy": 0.8266666666666667, |
|
"eval_loss": 1.4311994314193726, |
|
"eval_runtime": 14.103, |
|
"eval_samples_per_second": 5.318, |
|
"eval_steps_per_second": 1.347, |
|
"step": 6400 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 6400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1111931193490001e+20, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|