{
  "best_metric": 0.9951154529307282,
  "best_model_checkpoint": "vit-hybrid-base-bit-384-finetuned-ibird/checkpoint-1131",
  "epoch": 2.9973474801061006,
  "eval_steps": 500,
  "global_step": 1695,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 10.60638427734375,
      "learning_rate": 2.9411764705882355e-06,
      "loss": 3.3842,
      "step": 10
    },
    {
      "epoch": 0.04,
      "grad_norm": 10.456100463867188,
      "learning_rate": 5.882352941176471e-06,
      "loss": 3.1282,
      "step": 20
    },
    {
      "epoch": 0.05,
      "grad_norm": 9.986790657043457,
      "learning_rate": 8.823529411764707e-06,
      "loss": 2.8815,
      "step": 30
    },
    {
      "epoch": 0.07,
      "grad_norm": 10.580012321472168,
      "learning_rate": 1.1764705882352942e-05,
      "loss": 2.3087,
      "step": 40
    },
    {
      "epoch": 0.09,
      "grad_norm": 7.9744133949279785,
      "learning_rate": 1.4705882352941177e-05,
      "loss": 1.5822,
      "step": 50
    },
    {
      "epoch": 0.11,
      "grad_norm": 6.688446044921875,
      "learning_rate": 1.7647058823529414e-05,
      "loss": 0.9381,
      "step": 60
    },
    {
      "epoch": 0.12,
      "grad_norm": 4.64487361907959,
      "learning_rate": 2.058823529411765e-05,
      "loss": 0.4115,
      "step": 70
    },
    {
      "epoch": 0.14,
      "grad_norm": 5.120860576629639,
      "learning_rate": 2.3529411764705884e-05,
      "loss": 0.1915,
      "step": 80
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.6903035640716553,
      "learning_rate": 2.647058823529412e-05,
      "loss": 0.0935,
      "step": 90
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.5217204093933105,
      "learning_rate": 2.9411764705882354e-05,
      "loss": 0.0949,
      "step": 100
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.7664793133735657,
      "learning_rate": 3.235294117647059e-05,
      "loss": 0.0545,
      "step": 110
    },
    {
      "epoch": 0.21,
      "grad_norm": 4.868194580078125,
      "learning_rate": 3.529411764705883e-05,
      "loss": 0.0495,
      "step": 120
    },
    {
      "epoch": 0.23,
      "grad_norm": 3.152761459350586,
      "learning_rate": 3.8235294117647055e-05,
      "loss": 0.0566,
      "step": 130
    },
    {
      "epoch": 0.25,
      "grad_norm": 3.1319901943206787,
      "learning_rate": 4.11764705882353e-05,
      "loss": 0.0522,
      "step": 140
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.599245309829712,
      "learning_rate": 4.411764705882353e-05,
      "loss": 0.0605,
      "step": 150
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.845979630947113,
      "learning_rate": 4.705882352941177e-05,
      "loss": 0.0521,
      "step": 160
    },
    {
      "epoch": 0.3,
      "grad_norm": 6.3421759605407715,
      "learning_rate": 5e-05,
      "loss": 0.0849,
      "step": 170
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.421968698501587,
      "learning_rate": 4.967213114754098e-05,
      "loss": 0.0502,
      "step": 180
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.6349968910217285,
      "learning_rate": 4.934426229508197e-05,
      "loss": 0.0161,
      "step": 190
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.3493717908859253,
      "learning_rate": 4.9016393442622957e-05,
      "loss": 0.027,
      "step": 200
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.24897021055221558,
      "learning_rate": 4.868852459016394e-05,
      "loss": 0.0481,
      "step": 210
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.10974353551864624,
      "learning_rate": 4.836065573770492e-05,
      "loss": 0.017,
      "step": 220
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.431795358657837,
      "learning_rate": 4.8032786885245904e-05,
      "loss": 0.0494,
      "step": 230
    },
    {
      "epoch": 0.42,
      "grad_norm": 3.4805123805999756,
      "learning_rate": 4.770491803278689e-05,
      "loss": 0.0388,
      "step": 240
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.0460708141326904,
      "learning_rate": 4.737704918032787e-05,
      "loss": 0.0507,
      "step": 250
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.7923760414123535,
      "learning_rate": 4.704918032786885e-05,
      "loss": 0.0781,
      "step": 260
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.829329252243042,
      "learning_rate": 4.672131147540984e-05,
      "loss": 0.0446,
      "step": 270
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.8546658158302307,
      "learning_rate": 4.6393442622950825e-05,
      "loss": 0.0543,
      "step": 280
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.2265116274356842,
      "learning_rate": 4.6065573770491805e-05,
      "loss": 0.0528,
      "step": 290
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.7868409156799316,
      "learning_rate": 4.5737704918032786e-05,
      "loss": 0.0442,
      "step": 300
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.05122404918074608,
      "learning_rate": 4.540983606557377e-05,
      "loss": 0.0473,
      "step": 310
    },
    {
      "epoch": 0.57,
      "grad_norm": 4.265463829040527,
      "learning_rate": 4.508196721311476e-05,
      "loss": 0.0358,
      "step": 320
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.3292916417121887,
      "learning_rate": 4.475409836065574e-05,
      "loss": 0.0291,
      "step": 330
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.2873409986495972,
      "learning_rate": 4.442622950819673e-05,
      "loss": 0.0681,
      "step": 340
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.5806503891944885,
      "learning_rate": 4.409836065573771e-05,
      "loss": 0.0838,
      "step": 350
    },
    {
      "epoch": 0.64,
      "grad_norm": 4.996669769287109,
      "learning_rate": 4.377049180327869e-05,
      "loss": 0.0182,
      "step": 360
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.058223385363817215,
      "learning_rate": 4.3442622950819674e-05,
      "loss": 0.0454,
      "step": 370
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.5088571906089783,
      "learning_rate": 4.311475409836066e-05,
      "loss": 0.0444,
      "step": 380
    },
    {
      "epoch": 0.69,
      "grad_norm": 6.739996433258057,
      "learning_rate": 4.278688524590164e-05,
      "loss": 0.0445,
      "step": 390
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.22746062278747559,
      "learning_rate": 4.245901639344262e-05,
      "loss": 0.0258,
      "step": 400
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.04663905128836632,
      "learning_rate": 4.213114754098361e-05,
      "loss": 0.0023,
      "step": 410
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.3145718574523926,
      "learning_rate": 4.1803278688524595e-05,
      "loss": 0.0205,
      "step": 420
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.06759347021579742,
      "learning_rate": 4.1475409836065575e-05,
      "loss": 0.0779,
      "step": 430
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.7269498705863953,
      "learning_rate": 4.1147540983606556e-05,
      "loss": 0.0194,
      "step": 440
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.6663650274276733,
      "learning_rate": 4.081967213114754e-05,
      "loss": 0.0495,
      "step": 450
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.015244757756590843,
      "learning_rate": 4.049180327868853e-05,
      "loss": 0.0217,
      "step": 460
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.05687205493450165,
      "learning_rate": 4.016393442622951e-05,
      "loss": 0.0157,
      "step": 470
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.06807345896959305,
      "learning_rate": 3.983606557377049e-05,
      "loss": 0.0021,
      "step": 480
    },
    {
      "epoch": 0.87,
      "grad_norm": 6.138463497161865,
      "learning_rate": 3.950819672131148e-05,
      "loss": 0.0473,
      "step": 490
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.23685503005981445,
      "learning_rate": 3.9180327868852464e-05,
      "loss": 0.0289,
      "step": 500
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.19919660687446594,
      "learning_rate": 3.8852459016393444e-05,
      "loss": 0.0659,
      "step": 510
    },
    {
      "epoch": 0.92,
      "grad_norm": 2.63227915763855,
      "learning_rate": 3.8524590163934424e-05,
      "loss": 0.0618,
      "step": 520
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.18995435535907745,
      "learning_rate": 3.819672131147541e-05,
      "loss": 0.0151,
      "step": 530
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.05399240553379059,
      "learning_rate": 3.78688524590164e-05,
      "loss": 0.0168,
      "step": 540
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.13513301312923431,
      "learning_rate": 3.754098360655738e-05,
      "loss": 0.0492,
      "step": 550
    },
    {
      "epoch": 0.99,
      "grad_norm": 8.097681999206543,
      "learning_rate": 3.721311475409836e-05,
      "loss": 0.0997,
      "step": 560
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.9906749555950266,
      "eval_loss": 0.031034117564558983,
      "eval_runtime": 124.3045,
      "eval_samples_per_second": 18.117,
      "eval_steps_per_second": 2.269,
      "step": 565
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.08757388591766357,
      "learning_rate": 3.6885245901639346e-05,
      "loss": 0.0243,
      "step": 570
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.03581409156322479,
      "learning_rate": 3.655737704918033e-05,
      "loss": 0.0101,
      "step": 580
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.01882335916161537,
      "learning_rate": 3.622950819672131e-05,
      "loss": 0.0034,
      "step": 590
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.12845346331596375,
      "learning_rate": 3.590163934426229e-05,
      "loss": 0.006,
      "step": 600
    },
    {
      "epoch": 1.08,
      "grad_norm": 6.836170196533203,
      "learning_rate": 3.557377049180328e-05,
      "loss": 0.0054,
      "step": 610
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.014669831842184067,
      "learning_rate": 3.524590163934427e-05,
      "loss": 0.0007,
      "step": 620
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.0303161833435297,
      "learning_rate": 3.491803278688525e-05,
      "loss": 0.0272,
      "step": 630
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.010958039201796055,
      "learning_rate": 3.459016393442623e-05,
      "loss": 0.0016,
      "step": 640
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.10876976698637009,
      "learning_rate": 3.4262295081967214e-05,
      "loss": 0.0113,
      "step": 650
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.024475092068314552,
      "learning_rate": 3.39344262295082e-05,
      "loss": 0.0199,
      "step": 660
    },
    {
      "epoch": 1.18,
      "grad_norm": 3.7424721717834473,
      "learning_rate": 3.360655737704918e-05,
      "loss": 0.0153,
      "step": 670
    },
    {
      "epoch": 1.2,
      "grad_norm": 3.548234224319458,
      "learning_rate": 3.327868852459017e-05,
      "loss": 0.0124,
      "step": 680
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.018901199102401733,
      "learning_rate": 3.295081967213115e-05,
      "loss": 0.0059,
      "step": 690
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.01598968915641308,
      "learning_rate": 3.2622950819672136e-05,
      "loss": 0.0272,
      "step": 700
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.01613859459757805,
      "learning_rate": 3.2295081967213116e-05,
      "loss": 0.013,
      "step": 710
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.030256085097789764,
      "learning_rate": 3.19672131147541e-05,
      "loss": 0.0061,
      "step": 720
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.08354157209396362,
      "learning_rate": 3.163934426229508e-05,
      "loss": 0.0008,
      "step": 730
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.1932225227355957,
      "learning_rate": 3.131147540983606e-05,
      "loss": 0.0019,
      "step": 740
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.4999471604824066,
      "learning_rate": 3.098360655737705e-05,
      "loss": 0.012,
      "step": 750
    },
    {
      "epoch": 1.34,
      "grad_norm": 0.016076169908046722,
      "learning_rate": 3.065573770491804e-05,
      "loss": 0.0006,
      "step": 760
    },
    {
      "epoch": 1.36,
      "grad_norm": 0.010563875548541546,
      "learning_rate": 3.0327868852459017e-05,
      "loss": 0.0034,
      "step": 770
    },
    {
      "epoch": 1.38,
      "grad_norm": 0.020218759775161743,
      "learning_rate": 3e-05,
      "loss": 0.0016,
      "step": 780
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.016868168488144875,
      "learning_rate": 2.967213114754098e-05,
      "loss": 0.0033,
      "step": 790
    },
    {
      "epoch": 1.41,
      "grad_norm": 0.013298319652676582,
      "learning_rate": 2.934426229508197e-05,
      "loss": 0.0037,
      "step": 800
    },
    {
      "epoch": 1.43,
      "grad_norm": 0.062481291592121124,
      "learning_rate": 2.901639344262295e-05,
      "loss": 0.0118,
      "step": 810
    },
    {
      "epoch": 1.45,
      "grad_norm": 1.4717094898223877,
      "learning_rate": 2.8688524590163935e-05,
      "loss": 0.0104,
      "step": 820
    },
    {
      "epoch": 1.47,
      "grad_norm": 1.5437506437301636,
      "learning_rate": 2.8360655737704922e-05,
      "loss": 0.0062,
      "step": 830
    },
    {
      "epoch": 1.49,
      "grad_norm": 0.03338282182812691,
      "learning_rate": 2.8032786885245906e-05,
      "loss": 0.0025,
      "step": 840
    },
    {
      "epoch": 1.5,
      "grad_norm": 1.8838506937026978,
      "learning_rate": 2.7704918032786886e-05,
      "loss": 0.0193,
      "step": 850
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.007599662058055401,
      "learning_rate": 2.737704918032787e-05,
      "loss": 0.0083,
      "step": 860
    },
    {
      "epoch": 1.54,
      "grad_norm": 0.0383157841861248,
      "learning_rate": 2.7049180327868856e-05,
      "loss": 0.0059,
      "step": 870
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.004423599224537611,
      "learning_rate": 2.6721311475409837e-05,
      "loss": 0.0019,
      "step": 880
    },
    {
      "epoch": 1.57,
      "grad_norm": 7.753259658813477,
      "learning_rate": 2.639344262295082e-05,
      "loss": 0.0197,
      "step": 890
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.0051222569309175014,
      "learning_rate": 2.6065573770491804e-05,
      "loss": 0.0128,
      "step": 900
    },
    {
      "epoch": 1.61,
      "grad_norm": 0.006367386784404516,
      "learning_rate": 2.573770491803279e-05,
      "loss": 0.0097,
      "step": 910
    },
    {
      "epoch": 1.63,
      "grad_norm": 0.016590917482972145,
      "learning_rate": 2.540983606557377e-05,
      "loss": 0.0163,
      "step": 920
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.006148540414869785,
      "learning_rate": 2.5081967213114754e-05,
      "loss": 0.0107,
      "step": 930
    },
    {
      "epoch": 1.66,
      "grad_norm": 0.021444685757160187,
      "learning_rate": 2.4754098360655738e-05,
      "loss": 0.008,
      "step": 940
    },
    {
      "epoch": 1.68,
      "grad_norm": 0.27239102125167847,
      "learning_rate": 2.442622950819672e-05,
      "loss": 0.0009,
      "step": 950
    },
    {
      "epoch": 1.7,
      "grad_norm": 2.461951971054077,
      "learning_rate": 2.4098360655737705e-05,
      "loss": 0.0224,
      "step": 960
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.009482527151703835,
      "learning_rate": 2.377049180327869e-05,
      "loss": 0.0032,
      "step": 970
    },
    {
      "epoch": 1.73,
      "grad_norm": 0.0694417804479599,
      "learning_rate": 2.3442622950819672e-05,
      "loss": 0.001,
      "step": 980
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.008940008468925953,
      "learning_rate": 2.311475409836066e-05,
      "loss": 0.0063,
      "step": 990
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.008408569730818272,
      "learning_rate": 2.278688524590164e-05,
      "loss": 0.0026,
      "step": 1000
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.00473896786570549,
      "learning_rate": 2.2459016393442626e-05,
      "loss": 0.0162,
      "step": 1010
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.0444784052670002,
      "learning_rate": 2.2131147540983607e-05,
      "loss": 0.0004,
      "step": 1020
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.002952872309833765,
      "learning_rate": 2.1803278688524594e-05,
      "loss": 0.0079,
      "step": 1030
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.009093045257031918,
      "learning_rate": 2.1475409836065574e-05,
      "loss": 0.0049,
      "step": 1040
    },
    {
      "epoch": 1.86,
      "grad_norm": 5.8584818840026855,
      "learning_rate": 2.114754098360656e-05,
      "loss": 0.0048,
      "step": 1050
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.025469429790973663,
      "learning_rate": 2.081967213114754e-05,
      "loss": 0.0118,
      "step": 1060
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.005341151263564825,
      "learning_rate": 2.0491803278688525e-05,
      "loss": 0.0016,
      "step": 1070
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.006102704908698797,
      "learning_rate": 2.0163934426229508e-05,
      "loss": 0.001,
      "step": 1080
    },
    {
      "epoch": 1.93,
      "grad_norm": 0.009585659019649029,
      "learning_rate": 1.9836065573770492e-05,
      "loss": 0.0003,
      "step": 1090
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.003977091982960701,
      "learning_rate": 1.9508196721311475e-05,
      "loss": 0.0034,
      "step": 1100
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.008223881945014,
      "learning_rate": 1.918032786885246e-05,
      "loss": 0.0016,
      "step": 1110
    },
    {
      "epoch": 1.98,
      "grad_norm": 0.014407954178750515,
      "learning_rate": 1.8852459016393442e-05,
      "loss": 0.0013,
      "step": 1120
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.3026231825351715,
      "learning_rate": 1.8524590163934426e-05,
      "loss": 0.0005,
      "step": 1130
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.9951154529307282,
      "eval_loss": 0.01963997818529606,
      "eval_runtime": 122.5197,
      "eval_samples_per_second": 18.381,
      "eval_steps_per_second": 2.302,
      "step": 1131
    },
    {
      "epoch": 2.02,
      "grad_norm": 0.005760945845395327,
      "learning_rate": 1.8196721311475413e-05,
      "loss": 0.0028,
      "step": 1140
    },
    {
      "epoch": 2.03,
      "grad_norm": 0.00677720969542861,
      "learning_rate": 1.7868852459016393e-05,
      "loss": 0.0008,
      "step": 1150
    },
    {
      "epoch": 2.05,
      "grad_norm": 0.004843315575271845,
      "learning_rate": 1.754098360655738e-05,
      "loss": 0.002,
      "step": 1160
    },
    {
      "epoch": 2.07,
      "grad_norm": 0.003445906564593315,
      "learning_rate": 1.721311475409836e-05,
      "loss": 0.0002,
      "step": 1170
    },
    {
      "epoch": 2.09,
      "grad_norm": 0.0024833856150507927,
      "learning_rate": 1.6885245901639347e-05,
      "loss": 0.0028,
      "step": 1180
    },
    {
      "epoch": 2.1,
      "grad_norm": 0.024947261437773705,
      "learning_rate": 1.6557377049180328e-05,
      "loss": 0.0003,
      "step": 1190
    },
    {
      "epoch": 2.12,
      "grad_norm": 0.0022959029302001,
      "learning_rate": 1.6229508196721314e-05,
      "loss": 0.0002,
      "step": 1200
    },
    {
      "epoch": 2.14,
      "grad_norm": 0.01967487297952175,
      "learning_rate": 1.5901639344262295e-05,
      "loss": 0.0003,
      "step": 1210
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.357111394405365,
      "learning_rate": 1.557377049180328e-05,
      "loss": 0.0005,
      "step": 1220
    },
    {
      "epoch": 2.18,
      "grad_norm": 0.0058966344222426414,
      "learning_rate": 1.5245901639344262e-05,
      "loss": 0.0001,
      "step": 1230
    },
    {
      "epoch": 2.19,
      "grad_norm": 0.0033721905201673508,
      "learning_rate": 1.4918032786885247e-05,
      "loss": 0.0002,
      "step": 1240
    },
    {
      "epoch": 2.21,
      "grad_norm": 0.0022145358379930258,
      "learning_rate": 1.4590163934426229e-05,
      "loss": 0.0002,
      "step": 1250
    },
    {
      "epoch": 2.23,
      "grad_norm": 0.035282738506793976,
      "learning_rate": 1.4262295081967214e-05,
      "loss": 0.0002,
      "step": 1260
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.0015229666605591774,
      "learning_rate": 1.3934426229508196e-05,
      "loss": 0.0002,
      "step": 1270
    },
    {
      "epoch": 2.26,
      "grad_norm": 0.005502559244632721,
      "learning_rate": 1.3606557377049181e-05,
      "loss": 0.0021,
      "step": 1280
    },
    {
      "epoch": 2.28,
      "grad_norm": 0.0072832051664590836,
      "learning_rate": 1.3278688524590163e-05,
      "loss": 0.0001,
      "step": 1290
    },
    {
      "epoch": 2.3,
      "grad_norm": 0.001554520451463759,
      "learning_rate": 1.2950819672131149e-05,
      "loss": 0.0002,
      "step": 1300
    },
    {
      "epoch": 2.32,
      "grad_norm": 0.0054123131558299065,
      "learning_rate": 1.2622950819672132e-05,
      "loss": 0.0002,
      "step": 1310
    },
    {
      "epoch": 2.33,
      "grad_norm": 0.013851379975676537,
      "learning_rate": 1.2295081967213116e-05,
      "loss": 0.0066,
      "step": 1320
    },
    {
      "epoch": 2.35,
      "grad_norm": 0.0056920950300991535,
      "learning_rate": 1.19672131147541e-05,
      "loss": 0.0002,
      "step": 1330
    },
    {
      "epoch": 2.37,
      "grad_norm": 0.00168787338770926,
      "learning_rate": 1.1639344262295083e-05,
      "loss": 0.0002,
      "step": 1340
    },
    {
      "epoch": 2.39,
      "grad_norm": 0.005322947632521391,
      "learning_rate": 1.1311475409836065e-05,
      "loss": 0.0002,
      "step": 1350
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.002933880779892206,
      "learning_rate": 1.0983606557377048e-05,
      "loss": 0.0002,
      "step": 1360
    },
    {
      "epoch": 2.42,
      "grad_norm": 0.0015972905093804002,
      "learning_rate": 1.0655737704918032e-05,
      "loss": 0.0003,
      "step": 1370
    },
    {
      "epoch": 2.44,
      "grad_norm": 0.0015742258401587605,
      "learning_rate": 1.0327868852459017e-05,
      "loss": 0.0003,
      "step": 1380
    },
    {
      "epoch": 2.46,
      "grad_norm": 0.003087802790105343,
      "learning_rate": 1e-05,
      "loss": 0.0002,
      "step": 1390
    },
    {
      "epoch": 2.48,
      "grad_norm": 0.002106766914948821,
      "learning_rate": 9.672131147540984e-06,
      "loss": 0.0008,
      "step": 1400
    },
    {
      "epoch": 2.49,
      "grad_norm": 0.058892298489809036,
      "learning_rate": 9.344262295081968e-06,
      "loss": 0.0003,
      "step": 1410
    },
    {
      "epoch": 2.51,
      "grad_norm": 0.0024353417102247477,
      "learning_rate": 9.016393442622952e-06,
      "loss": 0.0002,
      "step": 1420
    },
    {
      "epoch": 2.53,
      "grad_norm": 0.003295810893177986,
      "learning_rate": 8.688524590163935e-06,
      "loss": 0.0028,
      "step": 1430
    },
    {
      "epoch": 2.55,
      "grad_norm": 0.006564658600836992,
      "learning_rate": 8.360655737704919e-06,
      "loss": 0.0001,
      "step": 1440
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.17080430686473846,
      "learning_rate": 8.032786885245902e-06,
      "loss": 0.0005,
      "step": 1450
    },
    {
      "epoch": 2.58,
      "grad_norm": 0.012207315303385258,
      "learning_rate": 7.704918032786886e-06,
      "loss": 0.0008,
      "step": 1460
    },
    {
      "epoch": 2.6,
      "grad_norm": 0.07983198761940002,
      "learning_rate": 7.3770491803278695e-06,
      "loss": 0.0003,
      "step": 1470
    },
    {
      "epoch": 2.62,
      "grad_norm": 0.0021029170602560043,
      "learning_rate": 7.049180327868852e-06,
      "loss": 0.0002,
      "step": 1480
    },
    {
      "epoch": 2.63,
      "grad_norm": 0.001665195683017373,
      "learning_rate": 6.721311475409836e-06,
      "loss": 0.0002,
      "step": 1490
    },
    {
      "epoch": 2.65,
      "grad_norm": 0.0021821956615895033,
      "learning_rate": 6.393442622950819e-06,
      "loss": 0.0003,
      "step": 1500
    },
    {
      "epoch": 2.67,
      "grad_norm": 0.0023076001089066267,
      "learning_rate": 6.065573770491804e-06,
      "loss": 0.0007,
      "step": 1510
    },
    {
      "epoch": 2.69,
      "grad_norm": 0.002858547493815422,
      "learning_rate": 5.737704918032787e-06,
      "loss": 0.0002,
      "step": 1520
    },
    {
      "epoch": 2.71,
      "grad_norm": 0.014774510636925697,
      "learning_rate": 5.409836065573771e-06,
      "loss": 0.0002,
      "step": 1530
    },
    {
      "epoch": 2.72,
      "grad_norm": 0.0014713788405060768,
      "learning_rate": 5.0819672131147545e-06,
      "loss": 0.0001,
      "step": 1540
    },
    {
      "epoch": 2.74,
      "grad_norm": 0.0027496495749801397,
      "learning_rate": 4.754098360655738e-06,
      "loss": 0.0002,
      "step": 1550
    },
    {
      "epoch": 2.76,
      "grad_norm": 0.002773697255179286,
      "learning_rate": 4.426229508196722e-06,
      "loss": 0.0002,
      "step": 1560
    },
    {
      "epoch": 2.78,
      "grad_norm": 0.002084126928821206,
      "learning_rate": 4.098360655737704e-06,
      "loss": 0.0001,
      "step": 1570
    },
    {
      "epoch": 2.79,
      "grad_norm": 0.0016491630813106894,
      "learning_rate": 3.770491803278689e-06,
      "loss": 0.0002,
      "step": 1580
    },
    {
      "epoch": 2.81,
      "grad_norm": 0.003069406608119607,
      "learning_rate": 3.4426229508196724e-06,
      "loss": 0.0074,
      "step": 1590
    },
    {
      "epoch": 2.83,
      "grad_norm": 0.001330082886852324,
      "learning_rate": 3.114754098360656e-06,
      "loss": 0.0003,
      "step": 1600
    },
    {
      "epoch": 2.85,
      "grad_norm": 0.9783486127853394,
      "learning_rate": 2.7868852459016396e-06,
      "loss": 0.0007,
      "step": 1610
    },
    {
      "epoch": 2.86,
      "grad_norm": 0.001621302799321711,
      "learning_rate": 2.459016393442623e-06,
      "loss": 0.0001,
      "step": 1620
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.05432431399822235,
      "learning_rate": 2.1311475409836067e-06,
      "loss": 0.0002,
      "step": 1630
    },
    {
      "epoch": 2.9,
      "grad_norm": 0.001987756695598364,
      "learning_rate": 1.8032786885245903e-06,
      "loss": 0.0003,
      "step": 1640
    },
    {
      "epoch": 2.92,
      "grad_norm": 0.0034532160498201847,
      "learning_rate": 1.4754098360655739e-06,
      "loss": 0.0001,
      "step": 1650
    },
    {
      "epoch": 2.94,
      "grad_norm": 0.0017541700508445501,
      "learning_rate": 1.1475409836065575e-06,
      "loss": 0.0112,
      "step": 1660
    },
    {
      "epoch": 2.95,
      "grad_norm": 0.02740754559636116,
      "learning_rate": 8.19672131147541e-07,
      "loss": 0.0002,
      "step": 1670
    },
    {
      "epoch": 2.97,
      "grad_norm": 0.0015147700905799866,
      "learning_rate": 4.918032786885246e-07,
      "loss": 0.0001,
      "step": 1680
    },
    {
      "epoch": 2.99,
      "grad_norm": 2.0458450317382812,
      "learning_rate": 1.639344262295082e-07,
      "loss": 0.0014,
      "step": 1690
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.9942273534635879,
      "eval_loss": 0.02003500424325466,
      "eval_runtime": 123.0739,
      "eval_samples_per_second": 18.298,
      "eval_steps_per_second": 2.291,
      "step": 1695
    },
    {
      "epoch": 3.0,
      "step": 1695,
      "total_flos": 1.4134330246261506e+19,
      "train_loss": 0.10345972847079468,
      "train_runtime": 4872.3509,
      "train_samples_per_second": 11.138,
      "train_steps_per_second": 0.348
    }
  ],
  "logging_steps": 10,
  "max_steps": 1695,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 1.4134330246261506e+19,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}