{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9999783273010988,
  "eval_steps": 3000,
  "global_step": 34605,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0014448465934129443,
      "grad_norm": 23.53125,
      "learning_rate": 2.2575809568531125e-10,
      "loss": 18.2618,
      "step": 50
    },
    {
      "epoch": 0.0028896931868258886,
      "grad_norm": 23.046875,
      "learning_rate": 4.515161913706225e-10,
      "loss": 18.3279,
      "step": 100
    },
    {
      "epoch": 0.004334539780238833,
      "grad_norm": 23.703125,
      "learning_rate": 6.772742870559338e-10,
      "loss": 18.2094,
      "step": 150
    },
    {
      "epoch": 0.005779386373651777,
      "grad_norm": 21.046875,
      "learning_rate": 9.03032382741245e-10,
      "loss": 18.0812,
      "step": 200
    },
    {
      "epoch": 0.007224232967064722,
      "grad_norm": 21.34375,
      "learning_rate": 1.1287904784265563e-09,
      "loss": 18.0178,
      "step": 250
    },
    {
      "epoch": 0.008669079560477666,
      "grad_norm": 23.390625,
      "learning_rate": 1.3545485741118676e-09,
      "loss": 18.0735,
      "step": 300
    },
    {
      "epoch": 0.01011392615389061,
      "grad_norm": 22.078125,
      "learning_rate": 1.5803066697971788e-09,
      "loss": 18.0169,
      "step": 350
    },
    {
      "epoch": 0.011558772747303554,
      "grad_norm": 19.1875,
      "learning_rate": 1.80606476548249e-09,
      "loss": 17.9299,
      "step": 400
    },
    {
      "epoch": 0.0130036193407165,
      "grad_norm": 22.75,
      "learning_rate": 2.0318228611678016e-09,
      "loss": 17.7586,
      "step": 450
    },
    {
      "epoch": 0.014448465934129445,
      "grad_norm": 20.984375,
      "learning_rate": 2.2575809568531127e-09,
      "loss": 17.7956,
      "step": 500
    },
    {
      "epoch": 0.015893312527542388,
      "grad_norm": 22.453125,
      "learning_rate": 2.4833390525384237e-09,
      "loss": 17.7873,
      "step": 550
    },
    {
      "epoch": 0.01733815912095533,
      "grad_norm": 19.71875,
      "learning_rate": 2.709097148223735e-09,
      "loss": 17.7643,
      "step": 600
    },
    {
      "epoch": 0.018783005714368278,
      "grad_norm": 20.1875,
      "learning_rate": 2.9348552439090465e-09,
      "loss": 17.6861,
      "step": 650
    },
    {
      "epoch": 0.02022785230778122,
      "grad_norm": 20.8125,
      "learning_rate": 3.1606133395943576e-09,
      "loss": 17.6883,
      "step": 700
    },
    {
      "epoch": 0.021672698901194165,
      "grad_norm": 23.875,
      "learning_rate": 3.386371435279669e-09,
      "loss": 17.6176,
      "step": 750
    },
    {
      "epoch": 0.02311754549460711,
      "grad_norm": 19.96875,
      "learning_rate": 3.61212953096498e-09,
      "loss": 17.6334,
      "step": 800
    },
    {
      "epoch": 0.024562392088020055,
      "grad_norm": 20.734375,
      "learning_rate": 3.837887626650292e-09,
      "loss": 17.5943,
      "step": 850
    },
    {
      "epoch": 0.026007238681433,
      "grad_norm": 21.09375,
      "learning_rate": 4.063645722335603e-09,
      "loss": 17.5436,
      "step": 900
    },
    {
      "epoch": 0.027452085274845942,
      "grad_norm": 19.34375,
      "learning_rate": 4.289403818020914e-09,
      "loss": 17.6069,
      "step": 950
    },
    {
      "epoch": 0.02889693186825889,
      "grad_norm": 23.109375,
      "learning_rate": 4.515161913706225e-09,
      "loss": 17.4956,
      "step": 1000
    },
    {
      "epoch": 0.030341778461671833,
      "grad_norm": 23.53125,
      "learning_rate": 4.740920009391537e-09,
      "loss": 17.624,
      "step": 1050
    },
    {
      "epoch": 0.031786625055084776,
      "grad_norm": 19.40625,
      "learning_rate": 4.966678105076847e-09,
      "loss": 17.6303,
      "step": 1100
    },
    {
      "epoch": 0.03323147164849772,
      "grad_norm": 20.9375,
      "learning_rate": 5.19243620076216e-09,
      "loss": 17.5501,
      "step": 1150
    },
    {
      "epoch": 0.03467631824191066,
      "grad_norm": 18.9375,
      "learning_rate": 5.41819429644747e-09,
      "loss": 17.5943,
      "step": 1200
    },
    {
      "epoch": 0.036121164835323606,
      "grad_norm": 25.625,
      "learning_rate": 5.643952392132782e-09,
      "loss": 17.4464,
      "step": 1250
    },
    {
      "epoch": 0.037566011428736557,
      "grad_norm": 18.203125,
      "learning_rate": 5.869710487818093e-09,
      "loss": 17.5614,
      "step": 1300
    },
    {
      "epoch": 0.0390108580221495,
      "grad_norm": 19.4375,
      "learning_rate": 6.095468583503404e-09,
      "loss": 17.3942,
      "step": 1350
    },
    {
      "epoch": 0.04045570461556244,
      "grad_norm": 20.71875,
      "learning_rate": 6.321226679188715e-09,
      "loss": 17.3659,
      "step": 1400
    },
    {
      "epoch": 0.04190055120897539,
      "grad_norm": 17.828125,
      "learning_rate": 6.546984774874027e-09,
      "loss": 17.432,
      "step": 1450
    },
    {
      "epoch": 0.04334539780238833,
      "grad_norm": 22.234375,
      "learning_rate": 6.772742870559338e-09,
      "loss": 17.5557,
      "step": 1500
    },
    {
      "epoch": 0.044790244395801274,
      "grad_norm": 21.0625,
      "learning_rate": 6.998500966244649e-09,
      "loss": 17.4979,
      "step": 1550
    },
    {
      "epoch": 0.04623509098921422,
      "grad_norm": 18.84375,
      "learning_rate": 7.22425906192996e-09,
      "loss": 17.2844,
      "step": 1600
    },
    {
      "epoch": 0.04767993758262717,
      "grad_norm": 22.40625,
      "learning_rate": 7.4500171576152714e-09,
      "loss": 17.3339,
      "step": 1650
    },
    {
      "epoch": 0.04912478417604011,
      "grad_norm": 19.390625,
      "learning_rate": 7.675775253300584e-09,
      "loss": 17.313,
      "step": 1700
    },
    {
      "epoch": 0.050569630769453054,
      "grad_norm": 19.328125,
      "learning_rate": 7.901533348985894e-09,
      "loss": 17.3682,
      "step": 1750
    },
    {
      "epoch": 0.052014477362866,
      "grad_norm": 22.421875,
      "learning_rate": 8.127291444671207e-09,
      "loss": 17.3144,
      "step": 1800
    },
    {
      "epoch": 0.05345932395627894,
      "grad_norm": 19.4375,
      "learning_rate": 8.353049540356517e-09,
      "loss": 17.3066,
      "step": 1850
    },
    {
      "epoch": 0.054904170549691884,
      "grad_norm": 19.984375,
      "learning_rate": 8.578807636041828e-09,
      "loss": 17.4404,
      "step": 1900
    },
    {
      "epoch": 0.05634901714310483,
      "grad_norm": 19.921875,
      "learning_rate": 8.804565731727138e-09,
      "loss": 17.3098,
      "step": 1950
    },
    {
      "epoch": 0.05779386373651778,
      "grad_norm": 18.578125,
      "learning_rate": 9.03032382741245e-09,
      "loss": 17.2569,
      "step": 2000
    },
    {
      "epoch": 0.05923871032993072,
      "grad_norm": 21.890625,
      "learning_rate": 9.256081923097763e-09,
      "loss": 17.3394,
      "step": 2050
    },
    {
      "epoch": 0.060683556923343665,
      "grad_norm": 25.4375,
      "learning_rate": 9.481840018783073e-09,
      "loss": 17.3198,
      "step": 2100
    },
    {
      "epoch": 0.06212840351675661,
      "grad_norm": 21.171875,
      "learning_rate": 9.707598114468384e-09,
      "loss": 17.3099,
      "step": 2150
    },
    {
      "epoch": 0.06357325011016955,
      "grad_norm": 26.203125,
      "learning_rate": 9.933356210153695e-09,
      "loss": 17.2845,
      "step": 2200
    },
    {
      "epoch": 0.0650180967035825,
      "grad_norm": 21.296875,
      "learning_rate": 1.0159114305839007e-08,
      "loss": 17.2587,
      "step": 2250
    },
    {
      "epoch": 0.06646294329699544,
      "grad_norm": 22.65625,
      "learning_rate": 1.038487240152432e-08,
      "loss": 17.3986,
      "step": 2300
    },
    {
      "epoch": 0.06790778989040838,
      "grad_norm": 23.625,
      "learning_rate": 1.061063049720963e-08,
      "loss": 17.3046,
      "step": 2350
    },
    {
      "epoch": 0.06935263648382133,
      "grad_norm": 21.203125,
      "learning_rate": 1.083638859289494e-08,
      "loss": 17.2043,
      "step": 2400
    },
    {
      "epoch": 0.07079748307723427,
      "grad_norm": 24.640625,
      "learning_rate": 1.1062146688580251e-08,
      "loss": 17.2274,
      "step": 2450
    },
    {
      "epoch": 0.07224232967064721,
      "grad_norm": 20.03125,
      "learning_rate": 1.1287904784265563e-08,
      "loss": 17.2168,
      "step": 2500
    },
    {
      "epoch": 0.07368717626406017,
      "grad_norm": 23.5,
      "learning_rate": 1.1513662879950874e-08,
      "loss": 17.1795,
      "step": 2550
    },
    {
      "epoch": 0.07513202285747311,
      "grad_norm": 19.890625,
      "learning_rate": 1.1739420975636186e-08,
      "loss": 17.2713,
      "step": 2600
    },
    {
      "epoch": 0.07657686945088606,
      "grad_norm": 20.796875,
      "learning_rate": 1.1965179071321498e-08,
      "loss": 17.371,
      "step": 2650
    },
    {
      "epoch": 0.078021716044299,
      "grad_norm": 21.640625,
      "learning_rate": 1.2190937167006807e-08,
      "loss": 17.199,
      "step": 2700
    },
    {
      "epoch": 0.07946656263771194,
      "grad_norm": 24.96875,
      "learning_rate": 1.241669526269212e-08,
      "loss": 17.2428,
      "step": 2750
    },
    {
      "epoch": 0.08091140923112489,
      "grad_norm": 18.640625,
      "learning_rate": 1.264245335837743e-08,
      "loss": 17.2682,
      "step": 2800
    },
    {
      "epoch": 0.08235625582453783,
      "grad_norm": 22.984375,
      "learning_rate": 1.2868211454062742e-08,
      "loss": 17.1985,
      "step": 2850
    },
    {
      "epoch": 0.08380110241795077,
      "grad_norm": 52.03125,
      "learning_rate": 1.3093969549748055e-08,
      "loss": 17.2148,
      "step": 2900
    },
    {
      "epoch": 0.08524594901136372,
      "grad_norm": 20.71875,
      "learning_rate": 1.3319727645433364e-08,
      "loss": 17.1465,
      "step": 2950
    },
    {
      "epoch": 0.08669079560477666,
      "grad_norm": 21.015625,
      "learning_rate": 1.3545485741118676e-08,
      "loss": 17.1231,
      "step": 3000
    },
    {
      "epoch": 0.08669079560477666,
      "eval_loss": 2.1503310203552246,
      "eval_runtime": 340.0537,
      "eval_samples_per_second": 2742.314,
      "eval_steps_per_second": 42.849,
      "step": 3000
    },
    {
      "epoch": 0.0881356421981896,
      "grad_norm": 19.109375,
      "learning_rate": 1.3771243836803987e-08,
      "loss": 17.2298,
      "step": 3050
    },
    {
      "epoch": 0.08958048879160255,
      "grad_norm": 20.453125,
      "learning_rate": 1.3997001932489299e-08,
      "loss": 17.0646,
      "step": 3100
    },
    {
      "epoch": 0.09102533538501549,
      "grad_norm": 22.234375,
      "learning_rate": 1.4222760028174611e-08,
      "loss": 17.2016,
      "step": 3150
    },
    {
      "epoch": 0.09247018197842843,
      "grad_norm": 18.921875,
      "learning_rate": 1.444851812385992e-08,
      "loss": 17.1826,
      "step": 3200
    },
    {
      "epoch": 0.09391502857184139,
      "grad_norm": 18.0,
      "learning_rate": 1.4674276219545232e-08,
      "loss": 17.1939,
      "step": 3250
    },
    {
      "epoch": 0.09535987516525433,
      "grad_norm": 19.859375,
      "learning_rate": 1.4900034315230543e-08,
      "loss": 17.104,
      "step": 3300
    },
    {
      "epoch": 0.09680472175866728,
      "grad_norm": 25.421875,
      "learning_rate": 1.5125792410915855e-08,
      "loss": 17.1392,
      "step": 3350
    },
    {
      "epoch": 0.09824956835208022,
      "grad_norm": 23.34375,
      "learning_rate": 1.5351550506601167e-08,
      "loss": 17.2487,
      "step": 3400
    },
    {
      "epoch": 0.09969441494549316,
      "grad_norm": 21.78125,
      "learning_rate": 1.5577308602286476e-08,
      "loss": 17.1119,
      "step": 3450
    },
    {
      "epoch": 0.10113926153890611,
      "grad_norm": 21.125,
      "learning_rate": 1.580306669797179e-08,
      "loss": 17.2458,
      "step": 3500
    },
    {
      "epoch": 0.10258410813231905,
      "grad_norm": 20.96875,
      "learning_rate": 1.6028824793657098e-08,
      "loss": 17.1168,
      "step": 3550
    },
    {
      "epoch": 0.104028954725732,
      "grad_norm": 23.03125,
      "learning_rate": 1.6254582889342413e-08,
      "loss": 17.1953,
      "step": 3600
    },
    {
      "epoch": 0.10547380131914494,
      "grad_norm": 20.828125,
      "learning_rate": 1.6480340985027722e-08,
      "loss": 17.1476,
      "step": 3650
    },
    {
      "epoch": 0.10691864791255788,
      "grad_norm": 20.09375,
      "learning_rate": 1.6706099080713034e-08,
      "loss": 17.1084,
      "step": 3700
    },
    {
      "epoch": 0.10836349450597083,
      "grad_norm": 21.5,
      "learning_rate": 1.6931857176398343e-08,
      "loss": 17.3216,
      "step": 3750
    },
    {
      "epoch": 0.10980834109938377,
      "grad_norm": 19.453125,
      "learning_rate": 1.7157615272083656e-08,
      "loss": 17.1359,
      "step": 3800
    },
    {
      "epoch": 0.11125318769279671,
      "grad_norm": 19.640625,
      "learning_rate": 1.7383373367768968e-08,
      "loss": 17.1561,
      "step": 3850
    },
    {
      "epoch": 0.11269803428620966,
      "grad_norm": 21.0,
      "learning_rate": 1.7609131463454277e-08,
      "loss": 17.0558,
      "step": 3900
    },
    {
      "epoch": 0.11414288087962261,
      "grad_norm": 22.21875,
      "learning_rate": 1.7834889559139592e-08,
      "loss": 17.149,
      "step": 3950
    },
    {
      "epoch": 0.11558772747303556,
      "grad_norm": 23.109375,
      "learning_rate": 1.80606476548249e-08,
      "loss": 17.0748,
      "step": 4000
    },
    {
      "epoch": 0.1170325740664485,
      "grad_norm": 19.265625,
      "learning_rate": 1.828640575051021e-08,
      "loss": 17.1038,
      "step": 4050
    },
    {
      "epoch": 0.11847742065986144,
      "grad_norm": 22.34375,
      "learning_rate": 1.8512163846195526e-08,
      "loss": 17.0105,
      "step": 4100
    },
    {
      "epoch": 0.11992226725327439,
      "grad_norm": 20.25,
      "learning_rate": 1.8737921941880835e-08,
      "loss": 17.0041,
      "step": 4150
    },
    {
      "epoch": 0.12136711384668733,
      "grad_norm": 23.484375,
      "learning_rate": 1.8963680037566147e-08,
      "loss": 17.1879,
      "step": 4200
    },
    {
      "epoch": 0.12281196044010027,
      "grad_norm": 23.859375,
      "learning_rate": 1.9189438133251456e-08,
      "loss": 17.0091,
      "step": 4250
    },
    {
      "epoch": 0.12425680703351322,
      "grad_norm": 23.609375,
      "learning_rate": 1.9415196228936768e-08,
      "loss": 16.9834,
      "step": 4300
    },
    {
      "epoch": 0.12570165362692617,
      "grad_norm": 20.171875,
      "learning_rate": 1.964095432462208e-08,
      "loss": 16.9839,
      "step": 4350
    },
    {
      "epoch": 0.1271465002203391,
      "grad_norm": 20.75,
      "learning_rate": 1.986671242030739e-08,
      "loss": 16.9732,
      "step": 4400
    },
    {
      "epoch": 0.12859134681375206,
      "grad_norm": 21.078125,
      "learning_rate": 2.0092470515992705e-08,
      "loss": 17.0603,
      "step": 4450
    },
    {
      "epoch": 0.130036193407165,
      "grad_norm": 19.796875,
      "learning_rate": 2.0318228611678014e-08,
      "loss": 17.0293,
      "step": 4500
    },
    {
      "epoch": 0.13148104000057795,
      "grad_norm": 21.53125,
      "learning_rate": 2.0543986707363323e-08,
      "loss": 17.0322,
      "step": 4550
    },
    {
      "epoch": 0.13292588659399088,
      "grad_norm": 23.171875,
      "learning_rate": 2.076974480304864e-08,
      "loss": 17.0431,
      "step": 4600
    },
    {
      "epoch": 0.13437073318740383,
      "grad_norm": 20.203125,
      "learning_rate": 2.0995502898733947e-08,
      "loss": 17.0135,
      "step": 4650
    },
    {
      "epoch": 0.13581557978081676,
      "grad_norm": 20.1875,
      "learning_rate": 2.122126099441926e-08,
      "loss": 16.905,
      "step": 4700
    },
    {
      "epoch": 0.13726042637422972,
      "grad_norm": 20.703125,
      "learning_rate": 2.144701909010457e-08,
      "loss": 17.082,
      "step": 4750
    },
    {
      "epoch": 0.13870527296764265,
      "grad_norm": 21.265625,
      "learning_rate": 2.167277718578988e-08,
      "loss": 16.9678,
      "step": 4800
    },
    {
      "epoch": 0.1401501195610556,
      "grad_norm": 20.640625,
      "learning_rate": 2.1898535281475193e-08,
      "loss": 17.0,
      "step": 4850
    },
    {
      "epoch": 0.14159496615446854,
      "grad_norm": 20.390625,
      "learning_rate": 2.2124293377160502e-08,
      "loss": 16.9941,
      "step": 4900
    },
    {
      "epoch": 0.1430398127478815,
      "grad_norm": 21.15625,
      "learning_rate": 2.2350051472845818e-08,
      "loss": 17.0649,
      "step": 4950
    },
    {
      "epoch": 0.14448465934129442,
      "grad_norm": 19.21875,
      "learning_rate": 2.2575809568531127e-08,
      "loss": 17.0574,
      "step": 5000
    },
    {
      "epoch": 0.14592950593470738,
      "grad_norm": 19.59375,
      "learning_rate": 2.2801567664216436e-08,
      "loss": 17.1142,
      "step": 5050
    },
    {
      "epoch": 0.14737435252812034,
      "grad_norm": 20.828125,
      "learning_rate": 2.3027325759901748e-08,
      "loss": 16.957,
      "step": 5100
    },
    {
      "epoch": 0.14881919912153327,
      "grad_norm": 21.125,
      "learning_rate": 2.325308385558706e-08,
      "loss": 17.1079,
      "step": 5150
    },
    {
      "epoch": 0.15026404571494623,
      "grad_norm": 21.890625,
      "learning_rate": 2.3478841951272372e-08,
      "loss": 17.0049,
      "step": 5200
    },
    {
      "epoch": 0.15170889230835916,
      "grad_norm": 22.828125,
      "learning_rate": 2.370460004695768e-08,
      "loss": 17.0316,
      "step": 5250
    },
    {
      "epoch": 0.1531537389017721,
      "grad_norm": 22.703125,
      "learning_rate": 2.3930358142642997e-08,
      "loss": 17.0376,
      "step": 5300
    },
    {
      "epoch": 0.15459858549518504,
      "grad_norm": 20.90625,
      "learning_rate": 2.4156116238328306e-08,
      "loss": 17.1354,
      "step": 5350
    },
    {
      "epoch": 0.156043432088598,
      "grad_norm": 22.609375,
      "learning_rate": 2.4381874334013615e-08,
      "loss": 16.9243,
      "step": 5400
    },
    {
      "epoch": 0.15748827868201093,
      "grad_norm": 20.859375,
      "learning_rate": 2.460763242969893e-08,
      "loss": 17.0623,
      "step": 5450
    },
    {
      "epoch": 0.1589331252754239,
      "grad_norm": 22.15625,
      "learning_rate": 2.483339052538424e-08,
      "loss": 17.0041,
      "step": 5500
    },
    {
      "epoch": 0.16037797186883682,
      "grad_norm": 23.078125,
      "learning_rate": 2.505914862106955e-08,
      "loss": 16.945,
      "step": 5550
    },
    {
      "epoch": 0.16182281846224977,
      "grad_norm": 20.5,
      "learning_rate": 2.528490671675486e-08,
      "loss": 16.886,
      "step": 5600
    },
    {
      "epoch": 0.1632676650556627,
      "grad_norm": 19.859375,
      "learning_rate": 2.5510664812440173e-08,
      "loss": 16.8898,
      "step": 5650
    },
    {
      "epoch": 0.16471251164907566,
      "grad_norm": 22.875,
      "learning_rate": 2.5736422908125485e-08,
      "loss": 17.0453,
      "step": 5700
    },
    {
      "epoch": 0.16615735824248862,
      "grad_norm": 21.015625,
      "learning_rate": 2.5962181003810794e-08,
      "loss": 17.01,
      "step": 5750
    },
    {
      "epoch": 0.16760220483590155,
      "grad_norm": 24.078125,
      "learning_rate": 2.618793909949611e-08,
      "loss": 17.0377,
      "step": 5800
    },
    {
      "epoch": 0.1690470514293145,
      "grad_norm": 19.53125,
      "learning_rate": 2.641369719518142e-08,
      "loss": 16.8983,
      "step": 5850
    },
    {
      "epoch": 0.17049189802272743,
      "grad_norm": 19.234375,
      "learning_rate": 2.6639455290866727e-08,
      "loss": 17.0153,
      "step": 5900
    },
    {
      "epoch": 0.1719367446161404,
      "grad_norm": 21.40625,
      "learning_rate": 2.6865213386552043e-08,
      "loss": 16.9406,
      "step": 5950
    },
    {
      "epoch": 0.17338159120955332,
      "grad_norm": 21.703125,
      "learning_rate": 2.7090971482237352e-08,
      "loss": 16.9213,
      "step": 6000
    },
    {
      "epoch": 0.17338159120955332,
      "eval_loss": 2.117004871368408,
      "eval_runtime": 351.2723,
      "eval_samples_per_second": 2654.733,
      "eval_steps_per_second": 41.481,
      "step": 6000
    },
    {
      "epoch": 0.17482643780296628,
      "grad_norm": 23.40625,
      "learning_rate": 2.7316729577922664e-08,
      "loss": 16.9717,
      "step": 6050
    },
    {
      "epoch": 0.1762712843963792,
      "grad_norm": 21.09375,
      "learning_rate": 2.7542487673607973e-08,
      "loss": 17.0021,
      "step": 6100
    },
    {
      "epoch": 0.17771613098979216,
      "grad_norm": 20.078125,
      "learning_rate": 2.7768245769293285e-08,
      "loss": 16.9975,
      "step": 6150
    },
    {
      "epoch": 0.1791609775832051,
      "grad_norm": 20.46875,
      "learning_rate": 2.7994003864978598e-08,
      "loss": 16.9298,
      "step": 6200
    },
    {
      "epoch": 0.18060582417661805,
      "grad_norm": 21.453125,
      "learning_rate": 2.8219761960663907e-08,
      "loss": 16.8852,
      "step": 6250
    },
    {
      "epoch": 0.18205067077003098,
      "grad_norm": 20.90625,
      "learning_rate": 2.8445520056349222e-08,
      "loss": 17.0017,
      "step": 6300
    },
    {
      "epoch": 0.18349551736344394,
      "grad_norm": 23.765625,
      "learning_rate": 2.867127815203453e-08,
      "loss": 16.9424,
      "step": 6350
    },
    {
      "epoch": 0.18494036395685687,
      "grad_norm": 21.328125,
      "learning_rate": 2.889703624771984e-08,
      "loss": 16.9559,
      "step": 6400
    },
    {
      "epoch": 0.18638521055026983,
      "grad_norm": 21.8125,
      "learning_rate": 2.9122794343405156e-08,
      "loss": 16.9595,
      "step": 6450
    },
    {
      "epoch": 0.18783005714368278,
      "grad_norm": 22.53125,
      "learning_rate": 2.9348552439090465e-08,
      "loss": 16.8973,
      "step": 6500
    },
    {
      "epoch": 0.1892749037370957,
      "grad_norm": 21.3125,
      "learning_rate": 2.9574310534775777e-08,
      "loss": 16.9388,
      "step": 6550
    },
    {
      "epoch": 0.19071975033050867,
      "grad_norm": 25.25,
      "learning_rate": 2.9800068630461086e-08,
      "loss": 16.8936,
      "step": 6600
    },
    {
      "epoch": 0.1921645969239216,
      "grad_norm": 21.921875,
      "learning_rate": 3.0025826726146395e-08,
      "loss": 16.9861,
      "step": 6650
    },
    {
      "epoch": 0.19360944351733456,
      "grad_norm": 22.34375,
      "learning_rate": 3.025158482183171e-08,
      "loss": 16.7772,
      "step": 6700
    },
    {
      "epoch": 0.19505429011074749,
      "grad_norm": 21.921875,
      "learning_rate": 3.047734291751702e-08,
      "loss": 16.8555,
      "step": 6750
    },
    {
      "epoch": 0.19649913670416044,
      "grad_norm": 25.28125,
      "learning_rate": 3.0703101013202335e-08,
      "loss": 16.7765,
      "step": 6800
    },
    {
      "epoch": 0.19794398329757337,
      "grad_norm": 21.625,
      "learning_rate": 3.0928859108887644e-08,
      "loss": 16.868,
      "step": 6850
    },
    {
      "epoch": 0.19938882989098633,
      "grad_norm": 25.046875,
      "learning_rate": 3.115461720457295e-08,
      "loss": 16.9386,
      "step": 6900
    },
    {
      "epoch": 0.20083367648439926,
      "grad_norm": 23.671875,
      "learning_rate": 3.138037530025826e-08,
      "loss": 16.9085,
      "step": 6950
    },
    {
      "epoch": 0.20227852307781222,
      "grad_norm": 22.234375,
      "learning_rate": 3.160613339594358e-08,
      "loss": 16.8305,
      "step": 7000
    },
    {
      "epoch": 0.20372336967122515,
      "grad_norm": 18.609375,
      "learning_rate": 3.183189149162889e-08,
      "loss": 16.7971,
      "step": 7050
    },
    {
      "epoch": 0.2051682162646381,
      "grad_norm": 21.78125,
      "learning_rate": 3.2057649587314195e-08,
      "loss": 16.8924,
      "step": 7100
    },
    {
      "epoch": 0.20661306285805103,
      "grad_norm": 19.921875,
      "learning_rate": 3.228340768299951e-08,
      "loss": 16.9471,
      "step": 7150
    },
    {
      "epoch": 0.208057909451464,
      "grad_norm": 20.75,
      "learning_rate": 3.2509165778684826e-08,
      "loss": 16.8681,
      "step": 7200
    },
    {
      "epoch": 0.20950275604487695,
      "grad_norm": 20.859375,
      "learning_rate": 3.2734923874370135e-08,
      "loss": 16.7762,
      "step": 7250
    },
    {
      "epoch": 0.21094760263828988,
      "grad_norm": 19.671875,
      "learning_rate": 3.2960681970055444e-08,
      "loss": 16.8033,
      "step": 7300
    },
    {
      "epoch": 0.21239244923170283,
      "grad_norm": 22.171875,
      "learning_rate": 3.318644006574075e-08,
      "loss": 16.9011,
      "step": 7350
    },
    {
      "epoch": 0.21383729582511576,
      "grad_norm": 20.09375,
      "learning_rate": 3.341219816142607e-08,
      "loss": 16.7137,
      "step": 7400
    },
    {
      "epoch": 0.21528214241852872,
      "grad_norm": 22.828125,
      "learning_rate": 3.363795625711138e-08,
      "loss": 16.8546,
      "step": 7450
    },
    {
      "epoch": 0.21672698901194165,
      "grad_norm": 22.640625,
      "learning_rate": 3.3863714352796687e-08,
      "loss": 16.9191,
      "step": 7500
    },
    {
      "epoch": 0.2181718356053546,
      "grad_norm": 28.5,
      "learning_rate": 3.4089472448482e-08,
      "loss": 16.8509,
      "step": 7550
    },
    {
      "epoch": 0.21961668219876754,
      "grad_norm": 20.484375,
      "learning_rate": 3.431523054416731e-08,
      "loss": 16.8247,
      "step": 7600
    },
    {
      "epoch": 0.2210615287921805,
      "grad_norm": 20.5625,
      "learning_rate": 3.454098863985262e-08,
      "loss": 16.7724,
      "step": 7650
    },
    {
      "epoch": 0.22250637538559342,
      "grad_norm": 22.0625,
      "learning_rate": 3.4766746735537936e-08,
      "loss": 16.7584,
      "step": 7700
    },
    {
      "epoch": 0.22395122197900638,
      "grad_norm": 20.375,
      "learning_rate": 3.499250483122325e-08,
      "loss": 16.7699,
      "step": 7750
    },
    {
      "epoch": 0.2253960685724193,
      "grad_norm": 20.65625,
      "learning_rate": 3.5218262926908553e-08,
      "loss": 16.8213,
      "step": 7800
    },
    {
      "epoch": 0.22684091516583227,
      "grad_norm": 22.171875,
      "learning_rate": 3.544402102259387e-08,
      "loss": 16.9104,
      "step": 7850
    },
    {
      "epoch": 0.22828576175924523,
      "grad_norm": 22.109375,
      "learning_rate": 3.5669779118279185e-08,
      "loss": 16.7777,
      "step": 7900
    },
    {
      "epoch": 0.22973060835265816,
      "grad_norm": 22.109375,
      "learning_rate": 3.589553721396449e-08,
      "loss": 16.7741,
      "step": 7950
    },
    {
      "epoch": 0.2311754549460711,
      "grad_norm": 20.90625,
      "learning_rate": 3.61212953096498e-08,
      "loss": 16.7486,
      "step": 8000
    },
    {
      "epoch": 0.23262030153948404,
      "grad_norm": 21.265625,
      "learning_rate": 3.634705340533512e-08,
      "loss": 16.7256,
      "step": 8050
    },
    {
      "epoch": 0.234065148132897,
      "grad_norm": 23.40625,
      "learning_rate": 3.657281150102042e-08,
      "loss": 16.7431,
      "step": 8100
    },
    {
      "epoch": 0.23550999472630993,
      "grad_norm": 21.0625,
      "learning_rate": 3.6798569596705736e-08,
      "loss": 16.777,
      "step": 8150
    },
    {
      "epoch": 0.2369548413197229,
      "grad_norm": 23.125,
      "learning_rate": 3.702432769239105e-08,
      "loss": 16.6821,
      "step": 8200
    },
    {
      "epoch": 0.23839968791313582,
      "grad_norm": 19.203125,
      "learning_rate": 3.725008578807636e-08,
      "loss": 16.7046,
      "step": 8250
    },
    {
      "epoch": 0.23984453450654877,
      "grad_norm": 21.015625,
      "learning_rate": 3.747584388376167e-08,
      "loss": 16.7624,
      "step": 8300
    },
    {
      "epoch": 0.2412893810999617,
      "grad_norm": 20.90625,
      "learning_rate": 3.770160197944698e-08,
      "loss": 16.801,
      "step": 8350
    },
    {
      "epoch": 0.24273422769337466,
      "grad_norm": 22.703125,
      "learning_rate": 3.7927360075132294e-08,
      "loss": 16.7837,
      "step": 8400
    },
    {
      "epoch": 0.2441790742867876,
      "grad_norm": 23.5,
      "learning_rate": 3.81531181708176e-08,
      "loss": 16.7856,
      "step": 8450
    },
    {
      "epoch": 0.24562392088020055,
      "grad_norm": 19.84375,
      "learning_rate": 3.837887626650291e-08,
      "loss": 16.7071,
      "step": 8500
    },
    {
      "epoch": 0.24706876747361348,
      "grad_norm": 21.78125,
      "learning_rate": 3.860463436218823e-08,
      "loss": 16.7162,
      "step": 8550
    },
    {
      "epoch": 0.24851361406702643,
      "grad_norm": 21.109375,
      "learning_rate": 3.8830392457873536e-08,
      "loss": 16.7331,
      "step": 8600
    },
    {
      "epoch": 0.2499584606604394,
      "grad_norm": 21.875,
      "learning_rate": 3.9056150553558845e-08,
      "loss": 16.7945,
      "step": 8650
    },
    {
      "epoch": 0.25140330725385235,
      "grad_norm": 21.1875,
      "learning_rate": 3.928190864924416e-08,
      "loss": 16.7675,
      "step": 8700
    },
    {
      "epoch": 0.2528481538472653,
      "grad_norm": 21.375,
      "learning_rate": 3.9507666744929476e-08,
      "loss": 16.7202,
      "step": 8750
    },
    {
      "epoch": 0.2542930004406782,
      "grad_norm": 21.25,
      "learning_rate": 3.973342484061478e-08,
      "loss": 16.6576,
      "step": 8800
    },
    {
      "epoch": 0.25573784703409114,
      "grad_norm": 20.84375,
      "learning_rate": 3.9959182936300094e-08,
      "loss": 16.6249,
      "step": 8850
    },
    {
      "epoch": 0.2571826936275041,
      "grad_norm": 20.703125,
      "learning_rate": 4.018494103198541e-08,
      "loss": 16.7702,
      "step": 8900
    },
    {
      "epoch": 0.25862754022091705,
      "grad_norm": 23.203125,
      "learning_rate": 4.041069912767071e-08,
      "loss": 16.687,
      "step": 8950
    },
    {
      "epoch": 0.26007238681433,
      "grad_norm": 24.96875,
      "learning_rate": 4.063645722335603e-08,
      "loss": 16.6843,
      "step": 9000
    },
    {
      "epoch": 0.26007238681433,
      "eval_loss": 2.087214946746826,
      "eval_runtime": 348.1727,
      "eval_samples_per_second": 2678.367,
      "eval_steps_per_second": 41.85,
      "step": 9000
    },
    {
      "epoch": 0.2615172334077429,
      "grad_norm": 22.578125,
      "learning_rate": 4.0862215319041343e-08,
      "loss": 16.6685,
      "step": 9050
    },
    {
      "epoch": 0.2629620800011559,
      "grad_norm": 21.125,
      "learning_rate": 4.1087973414726646e-08,
      "loss": 16.8334,
      "step": 9100
    },
    {
      "epoch": 0.2644069265945688,
      "grad_norm": 23.84375,
      "learning_rate": 4.131373151041196e-08,
      "loss": 16.5566,
      "step": 9150
    },
    {
      "epoch": 0.26585177318798175,
      "grad_norm": 18.640625,
      "learning_rate": 4.153948960609728e-08,
      "loss": 16.7514,
      "step": 9200
    },
    {
      "epoch": 0.2672966197813947,
      "grad_norm": 22.53125,
      "learning_rate": 4.1765247701782586e-08,
      "loss": 16.688,
      "step": 9250
    },
    {
      "epoch": 0.26874146637480767,
      "grad_norm": 23.46875,
      "learning_rate": 4.1991005797467895e-08,
      "loss": 16.6654,
      "step": 9300
    },
    {
      "epoch": 0.2701863129682206,
      "grad_norm": 20.984375,
      "learning_rate": 4.2216763893153204e-08,
      "loss": 16.7596,
      "step": 9350
    },
    {
      "epoch": 0.27163115956163353,
      "grad_norm": 21.9375,
      "learning_rate": 4.244252198883852e-08,
      "loss": 16.6031,
      "step": 9400
    },
    {
      "epoch": 0.2730760061550465,
      "grad_norm": 20.703125,
      "learning_rate": 4.266828008452383e-08,
      "loss": 16.702,
      "step": 9450
    },
    {
      "epoch": 0.27452085274845944,
      "grad_norm": 22.328125,
      "learning_rate": 4.289403818020914e-08,
      "loss": 16.6338,
      "step": 9500
    },
    {
      "epoch": 0.2759656993418724,
      "grad_norm": 20.0,
      "learning_rate": 4.311979627589445e-08,
      "loss": 16.7581,
      "step": 9550
    },
    {
      "epoch": 0.2774105459352853,
      "grad_norm": 20.46875,
      "learning_rate": 4.334555437157976e-08,
      "loss": 16.7627,
      "step": 9600
    },
    {
      "epoch": 0.2788553925286983,
      "grad_norm": 21.90625,
      "learning_rate": 4.357131246726507e-08,
      "loss": 16.7124,
      "step": 9650
    },
    {
      "epoch": 0.2803002391221112,
      "grad_norm": 22.46875,
      "learning_rate": 4.3797070562950386e-08,
      "loss": 16.7091,
      "step": 9700
    },
    {
      "epoch": 0.28174508571552415,
      "grad_norm": 23.078125,
      "learning_rate": 4.40228286586357e-08,
      "loss": 16.6876,
      "step": 9750
    },
    {
      "epoch": 0.2831899323089371,
      "grad_norm": 19.53125,
      "learning_rate": 4.4248586754321004e-08,
      "loss": 16.5888,
      "step": 9800
    },
    {
      "epoch": 0.28463477890235006,
      "grad_norm": 22.921875,
      "learning_rate": 4.447434485000632e-08,
      "loss": 16.7229,
      "step": 9850
    },
    {
      "epoch": 0.286079625495763,
      "grad_norm": 21.265625,
      "learning_rate": 4.4700102945691635e-08,
      "loss": 16.6842,
      "step": 9900
    },
    {
      "epoch": 0.2875244720891759,
      "grad_norm": 20.5625,
      "learning_rate": 4.492586104137694e-08,
      "loss": 16.614,
      "step": 9950
    },
    {
      "epoch": 0.28896931868258885,
      "grad_norm": 21.203125,
      "learning_rate": 4.515161913706225e-08,
      "loss": 16.6419,
      "step": 10000
    },
    {
      "epoch": 0.29041416527600183,
      "grad_norm": 20.375,
      "learning_rate": 4.537737723274757e-08,
      "loss": 16.6317,
      "step": 10050
    },
    {
      "epoch": 0.29185901186941476,
      "grad_norm": 21.375,
      "learning_rate": 4.560313532843287e-08,
      "loss": 16.6707,
      "step": 10100
    },
    {
      "epoch": 0.2933038584628277,
      "grad_norm": 23.4375,
      "learning_rate": 4.5828893424118187e-08,
      "loss": 16.6595,
      "step": 10150
    },
    {
      "epoch": 0.2947487050562407,
      "grad_norm": 20.703125,
      "learning_rate": 4.6054651519803496e-08,
      "loss": 16.639,
      "step": 10200
    },
    {
      "epoch": 0.2961935516496536,
      "grad_norm": 20.453125,
      "learning_rate": 4.628040961548881e-08,
      "loss": 16.6537,
      "step": 10250
    },
    {
      "epoch": 0.29763839824306654,
      "grad_norm": 21.234375,
      "learning_rate": 4.650616771117412e-08,
      "loss": 16.6099,
      "step": 10300
    },
    {
      "epoch": 0.29908324483647947,
      "grad_norm": 22.84375,
      "learning_rate": 4.673192580685943e-08,
      "loss": 16.6629,
      "step": 10350
    },
    {
      "epoch": 0.30052809142989245,
      "grad_norm": 20.796875,
      "learning_rate": 4.6957683902544745e-08,
      "loss": 16.6335,
      "step": 10400
    },
    {
      "epoch": 0.3019729380233054,
      "grad_norm": 21.65625,
      "learning_rate": 4.7183441998230054e-08,
      "loss": 16.6122,
      "step": 10450
    },
    {
      "epoch": 0.3034177846167183,
      "grad_norm": 23.046875,
      "learning_rate": 4.740920009391536e-08,
      "loss": 16.6171,
      "step": 10500
    },
    {
      "epoch": 0.30486263121013124,
      "grad_norm": 20.71875,
      "learning_rate": 4.763495818960068e-08,
      "loss": 16.5997,
      "step": 10550
    },
    {
      "epoch": 0.3063074778035442,
      "grad_norm": 22.71875,
      "learning_rate": 4.7860716285285994e-08,
      "loss": 16.615,
      "step": 10600
    },
    {
      "epoch": 0.30775232439695716,
      "grad_norm": 21.53125,
      "learning_rate": 4.8086474380971296e-08,
      "loss": 16.6133,
      "step": 10650
    },
    {
      "epoch": 0.3091971709903701,
      "grad_norm": 22.828125,
      "learning_rate": 4.831223247665661e-08,
      "loss": 16.5908,
      "step": 10700
    },
    {
      "epoch": 0.310642017583783,
      "grad_norm": 22.859375,
      "learning_rate": 4.853799057234193e-08,
      "loss": 16.5617,
      "step": 10750
    },
    {
      "epoch": 0.312086864177196,
      "grad_norm": 30.734375,
      "learning_rate": 4.876374866802723e-08,
      "loss": 16.5051,
      "step": 10800
    },
    {
      "epoch": 0.31353171077060893,
      "grad_norm": 20.203125,
      "learning_rate": 4.8989506763712545e-08,
      "loss": 16.6504,
      "step": 10850
    },
    {
      "epoch": 0.31497655736402186,
      "grad_norm": 26.859375,
      "learning_rate": 4.921526485939786e-08,
      "loss": 16.6061,
      "step": 10900
    },
    {
      "epoch": 0.31642140395743484,
      "grad_norm": 20.828125,
      "learning_rate": 4.944102295508316e-08,
      "loss": 16.6288,
      "step": 10950
    },
    {
      "epoch": 0.3178662505508478,
      "grad_norm": 25.515625,
      "learning_rate": 4.966678105076848e-08,
      "loss": 16.5228,
      "step": 11000
    },
    {
      "epoch": 0.3193110971442607,
      "grad_norm": 23.671875,
      "learning_rate": 4.9892539146453794e-08,
      "loss": 16.4985,
      "step": 11050
    },
    {
      "epoch": 0.32075594373767363,
      "grad_norm": 23.25,
      "learning_rate": 5.01182972421391e-08,
      "loss": 16.6149,
      "step": 11100
    },
    {
      "epoch": 0.3222007903310866,
      "grad_norm": 20.375,
      "learning_rate": 5.034405533782441e-08,
      "loss": 16.5877,
      "step": 11150
    },
    {
      "epoch": 0.32364563692449955,
      "grad_norm": 20.53125,
      "learning_rate": 5.056981343350972e-08,
      "loss": 16.4745,
      "step": 11200
    },
    {
      "epoch": 0.3250904835179125,
      "grad_norm": 19.71875,
      "learning_rate": 5.0795571529195036e-08,
      "loss": 16.5593,
      "step": 11250
    },
    {
      "epoch": 0.3265353301113254,
      "grad_norm": 20.53125,
      "learning_rate": 5.1021329624880345e-08,
      "loss": 16.4393,
      "step": 11300
    },
    {
      "epoch": 0.3279801767047384,
      "grad_norm": 22.078125,
      "learning_rate": 5.1247087720565654e-08,
      "loss": 16.5429,
      "step": 11350
    },
    {
      "epoch": 0.3294250232981513,
      "grad_norm": 20.5,
      "learning_rate": 5.147284581625097e-08,
      "loss": 16.4117,
      "step": 11400
    },
    {
      "epoch": 0.33086986989156425,
      "grad_norm": 23.640625,
      "learning_rate": 5.169860391193628e-08,
      "loss": 16.5424,
      "step": 11450
    },
    {
      "epoch": 0.33231471648497724,
      "grad_norm": 21.625,
      "learning_rate": 5.192436200762159e-08,
      "loss": 16.6651,
      "step": 11500
    },
    {
      "epoch": 0.33375956307839016,
      "grad_norm": 24.453125,
      "learning_rate": 5.2150120103306903e-08,
      "loss": 16.5083,
      "step": 11550
    },
    {
      "epoch": 0.3352044096718031,
      "grad_norm": 22.46875,
      "learning_rate": 5.237587819899222e-08,
      "loss": 16.4798,
      "step": 11600
    },
    {
      "epoch": 0.336649256265216,
      "grad_norm": 22.703125,
      "learning_rate": 5.260163629467752e-08,
      "loss": 16.6141,
      "step": 11650
    },
    {
      "epoch": 0.338094102858629,
      "grad_norm": 23.921875,
      "learning_rate": 5.282739439036284e-08,
      "loss": 16.6017,
      "step": 11700
    },
    {
      "epoch": 0.33953894945204194,
      "grad_norm": 24.140625,
      "learning_rate": 5.305315248604815e-08,
      "loss": 16.55,
      "step": 11750
    },
    {
      "epoch": 0.34098379604545487,
      "grad_norm": 21.9375,
      "learning_rate": 5.3278910581733455e-08,
      "loss": 16.5809,
      "step": 11800
    },
    {
      "epoch": 0.3424286426388678,
      "grad_norm": 22.3125,
      "learning_rate": 5.350466867741877e-08,
      "loss": 16.5002,
      "step": 11850
    },
    {
      "epoch": 0.3438734892322808,
      "grad_norm": 20.984375,
      "learning_rate": 5.3730426773104086e-08,
      "loss": 16.6775,
      "step": 11900
    },
    {
      "epoch": 0.3453183358256937,
      "grad_norm": 20.28125,
      "learning_rate": 5.395618486878939e-08,
      "loss": 16.5301,
      "step": 11950
    },
    {
      "epoch": 0.34676318241910664,
      "grad_norm": 21.578125,
      "learning_rate": 5.4181942964474704e-08,
      "loss": 16.501,
      "step": 12000
    },
    {
      "epoch": 0.34676318241910664,
      "eval_loss": 2.0641098022460938,
      "eval_runtime": 340.7339,
      "eval_samples_per_second": 2736.839,
      "eval_steps_per_second": 42.764,
      "step": 12000
    },
    {
      "epoch": 0.34820802901251957,
      "grad_norm": 22.203125,
      "learning_rate": 5.440770106016002e-08,
      "loss": 16.6513,
      "step": 12050
    },
    {
      "epoch": 0.34965287560593256,
      "grad_norm": 22.296875,
      "learning_rate": 5.463345915584533e-08,
      "loss": 16.584,
      "step": 12100
    },
    {
      "epoch": 0.3510977221993455,
      "grad_norm": 22.953125,
      "learning_rate": 5.485921725153064e-08,
      "loss": 16.5184,
      "step": 12150
    },
    {
      "epoch": 0.3525425687927584,
      "grad_norm": 19.984375,
      "learning_rate": 5.5084975347215946e-08,
      "loss": 16.4685,
      "step": 12200
    },
    {
      "epoch": 0.3539874153861714,
      "grad_norm": 22.515625,
      "learning_rate": 5.531073344290126e-08,
      "loss": 16.5087,
      "step": 12250
    },
    {
      "epoch": 0.35543226197958433,
      "grad_norm": 22.984375,
      "learning_rate": 5.553649153858657e-08,
      "loss": 16.4561,
      "step": 12300
    },
    {
      "epoch": 0.35687710857299726,
      "grad_norm": 20.421875,
      "learning_rate": 5.576224963427188e-08,
      "loss": 16.502,
      "step": 12350
    },
    {
      "epoch": 0.3583219551664102,
      "grad_norm": 21.234375,
      "learning_rate": 5.5988007729957195e-08,
      "loss": 16.4946,
      "step": 12400
    },
    {
      "epoch": 0.3597668017598232,
      "grad_norm": 20.953125,
      "learning_rate": 5.6213765825642504e-08,
      "loss": 16.4988,
      "step": 12450
    },
    {
      "epoch": 0.3612116483532361,
      "grad_norm": 23.421875,
      "learning_rate": 5.643952392132781e-08,
      "loss": 16.5329,
      "step": 12500
    },
    {
      "epoch": 0.36265649494664903,
      "grad_norm": 31.59375,
      "learning_rate": 5.666528201701313e-08,
      "loss": 16.4695,
      "step": 12550
    },
    {
      "epoch": 0.36410134154006196,
      "grad_norm": 22.203125,
      "learning_rate": 5.6891040112698444e-08,
      "loss": 16.4166,
      "step": 12600
    },
    {
      "epoch": 0.36554618813347495,
      "grad_norm": 23.828125,
      "learning_rate": 5.7116798208383747e-08,
      "loss": 16.3887,
      "step": 12650
    },
    {
      "epoch": 0.3669910347268879,
      "grad_norm": 25.0,
      "learning_rate": 5.734255630406906e-08,
      "loss": 16.3849,
      "step": 12700
    },
    {
      "epoch": 0.3684358813203008,
      "grad_norm": 21.875,
      "learning_rate": 5.756831439975438e-08,
      "loss": 16.489,
      "step": 12750
    },
    {
      "epoch": 0.36988072791371374,
      "grad_norm": 22.859375,
      "learning_rate": 5.779407249543968e-08,
      "loss": 16.4234,
      "step": 12800
    },
    {
      "epoch": 0.3713255745071267,
      "grad_norm": 20.53125,
      "learning_rate": 5.8019830591124996e-08,
      "loss": 16.3971,
      "step": 12850
    },
    {
      "epoch": 0.37277042110053965,
      "grad_norm": 23.0625,
      "learning_rate": 5.824558868681031e-08,
      "loss": 16.4946,
      "step": 12900
    },
    {
      "epoch": 0.3742152676939526,
      "grad_norm": 21.78125,
      "learning_rate": 5.8471346782495613e-08,
      "loss": 16.5603,
      "step": 12950
    },
    {
      "epoch": 0.37566011428736557,
      "grad_norm": 21.296875,
      "learning_rate": 5.869710487818093e-08,
      "loss": 16.3374,
      "step": 13000
    },
    {
      "epoch": 0.3771049608807785,
      "grad_norm": 22.90625,
      "learning_rate": 5.8922862973866245e-08,
      "loss": 16.3641,
      "step": 13050
    },
    {
      "epoch": 0.3785498074741914,
      "grad_norm": 22.84375,
      "learning_rate": 5.9148621069551554e-08,
      "loss": 16.4834,
      "step": 13100
    },
    {
      "epoch": 0.37999465406760435,
      "grad_norm": 20.40625,
      "learning_rate": 5.937437916523686e-08,
      "loss": 16.4651,
      "step": 13150
    },
    {
      "epoch": 0.38143950066101734,
      "grad_norm": 23.140625,
      "learning_rate": 5.960013726092217e-08,
      "loss": 16.5031,
      "step": 13200
    },
    {
      "epoch": 0.38288434725443027,
      "grad_norm": 22.625,
      "learning_rate": 5.982589535660749e-08,
      "loss": 16.3802,
      "step": 13250
    },
    {
      "epoch": 0.3843291938478432,
      "grad_norm": 20.375,
      "learning_rate": 6.005165345229279e-08,
      "loss": 16.3925,
      "step": 13300
    },
    {
      "epoch": 0.3857740404412561,
      "grad_norm": 20.734375,
      "learning_rate": 6.02774115479781e-08,
      "loss": 16.399,
      "step": 13350
    },
    {
      "epoch": 0.3872188870346691,
      "grad_norm": 22.5,
      "learning_rate": 6.050316964366342e-08,
      "loss": 16.3255,
      "step": 13400
    },
    {
      "epoch": 0.38866373362808204,
      "grad_norm": 22.0625,
      "learning_rate": 6.072892773934872e-08,
      "loss": 16.5031,
      "step": 13450
    },
    {
      "epoch": 0.39010858022149497,
      "grad_norm": 23.171875,
      "learning_rate": 6.095468583503404e-08,
      "loss": 16.3906,
      "step": 13500
    },
    {
      "epoch": 0.3915534268149079,
      "grad_norm": 23.46875,
      "learning_rate": 6.118044393071935e-08,
      "loss": 16.4107,
      "step": 13550
    },
    {
      "epoch": 0.3929982734083209,
      "grad_norm": 24.046875,
      "learning_rate": 6.140620202640467e-08,
      "loss": 16.4128,
      "step": 13600
    },
    {
      "epoch": 0.3944431200017338,
      "grad_norm": 23.53125,
      "learning_rate": 6.163196012208997e-08,
      "loss": 16.4372,
      "step": 13650
    },
    {
      "epoch": 0.39588796659514675,
      "grad_norm": 23.734375,
      "learning_rate": 6.185771821777529e-08,
      "loss": 16.5197,
      "step": 13700
    },
    {
      "epoch": 0.39733281318855973,
      "grad_norm": 22.0625,
      "learning_rate": 6.20834763134606e-08,
      "loss": 16.4171,
      "step": 13750
    },
    {
      "epoch": 0.39877765978197266,
      "grad_norm": 21.953125,
      "learning_rate": 6.23092344091459e-08,
      "loss": 16.3508,
      "step": 13800
    },
    {
      "epoch": 0.4002225063753856,
      "grad_norm": 23.796875,
      "learning_rate": 6.253499250483122e-08,
      "loss": 16.4303,
      "step": 13850
    },
    {
      "epoch": 0.4016673529687985,
      "grad_norm": 21.484375,
      "learning_rate": 6.276075060051652e-08,
      "loss": 16.3384,
      "step": 13900
    },
    {
      "epoch": 0.4031121995622115,
      "grad_norm": 21.3125,
      "learning_rate": 6.298650869620184e-08,
      "loss": 16.3742,
      "step": 13950
    },
    {
      "epoch": 0.40455704615562443,
      "grad_norm": 21.53125,
      "learning_rate": 6.321226679188715e-08,
      "loss": 16.3369,
      "step": 14000
    },
    {
      "epoch": 0.40600189274903736,
      "grad_norm": 24.578125,
      "learning_rate": 6.343802488757247e-08,
      "loss": 16.4018,
      "step": 14050
    },
    {
      "epoch": 0.4074467393424503,
      "grad_norm": 26.078125,
      "learning_rate": 6.366378298325779e-08,
      "loss": 16.4958,
      "step": 14100
    },
    {
      "epoch": 0.4088915859358633,
      "grad_norm": 20.296875,
      "learning_rate": 6.388954107894309e-08,
      "loss": 16.4418,
      "step": 14150
    },
    {
      "epoch": 0.4103364325292762,
      "grad_norm": 26.171875,
      "learning_rate": 6.411529917462839e-08,
      "loss": 16.3343,
      "step": 14200
    },
    {
      "epoch": 0.41178127912268914,
      "grad_norm": 19.671875,
      "learning_rate": 6.43410572703137e-08,
      "loss": 16.3505,
      "step": 14250
    },
    {
      "epoch": 0.41322612571610207,
      "grad_norm": 21.265625,
      "learning_rate": 6.456681536599902e-08,
      "loss": 16.3802,
      "step": 14300
    },
    {
      "epoch": 0.41467097230951505,
      "grad_norm": 22.59375,
      "learning_rate": 6.479257346168434e-08,
      "loss": 16.3253,
      "step": 14350
    },
    {
      "epoch": 0.416115818902928,
      "grad_norm": 24.015625,
      "learning_rate": 6.501833155736965e-08,
      "loss": 16.3624,
      "step": 14400
    },
    {
      "epoch": 0.4175606654963409,
      "grad_norm": 26.5,
      "learning_rate": 6.524408965305495e-08,
      "loss": 16.2562,
      "step": 14450
    },
    {
      "epoch": 0.4190055120897539,
      "grad_norm": 19.75,
      "learning_rate": 6.546984774874027e-08,
      "loss": 16.3554,
      "step": 14500
    },
    {
      "epoch": 0.4204503586831668,
      "grad_norm": 24.234375,
      "learning_rate": 6.569560584442557e-08,
      "loss": 16.4004,
      "step": 14550
    },
    {
      "epoch": 0.42189520527657975,
      "grad_norm": 22.9375,
      "learning_rate": 6.592136394011089e-08,
      "loss": 16.3374,
      "step": 14600
    },
    {
      "epoch": 0.4233400518699927,
      "grad_norm": 24.21875,
      "learning_rate": 6.61471220357962e-08,
      "loss": 16.2601,
      "step": 14650
    },
    {
      "epoch": 0.42478489846340567,
      "grad_norm": 23.203125,
      "learning_rate": 6.63728801314815e-08,
      "loss": 16.331,
      "step": 14700
    },
    {
      "epoch": 0.4262297450568186,
      "grad_norm": 23.640625,
      "learning_rate": 6.659863822716682e-08,
      "loss": 16.3419,
      "step": 14750
    },
    {
      "epoch": 0.42767459165023153,
      "grad_norm": 25.453125,
      "learning_rate": 6.682439632285214e-08,
      "loss": 16.2981,
      "step": 14800
    },
    {
      "epoch": 0.42911943824364446,
      "grad_norm": 25.765625,
      "learning_rate": 6.705015441853745e-08,
      "loss": 16.2609,
      "step": 14850
    },
    {
      "epoch": 0.43056428483705744,
      "grad_norm": 20.46875,
      "learning_rate": 6.727591251422276e-08,
      "loss": 16.3126,
      "step": 14900
    },
    {
      "epoch": 0.43200913143047037,
      "grad_norm": 23.25,
      "learning_rate": 6.750167060990807e-08,
      "loss": 16.2534,
      "step": 14950
    },
    {
      "epoch": 0.4334539780238833,
      "grad_norm": 21.4375,
      "learning_rate": 6.772742870559337e-08,
      "loss": 16.2914,
      "step": 15000
    },
    {
      "epoch": 0.4334539780238833,
      "eval_loss": 2.0395615100860596,
      "eval_runtime": 351.3281,
      "eval_samples_per_second": 2654.311,
      "eval_steps_per_second": 41.474,
      "step": 15000
    },
    {
      "epoch": 0.43489882461729623,
      "grad_norm": 20.796875,
      "learning_rate": 6.795318680127869e-08,
      "loss": 16.2964,
      "step": 15050
    },
    {
      "epoch": 0.4363436712107092,
      "grad_norm": 21.578125,
      "learning_rate": 6.8178944896964e-08,
      "loss": 16.3424,
      "step": 15100
    },
    {
      "epoch": 0.43778851780412215,
      "grad_norm": 21.703125,
      "learning_rate": 6.840470299264932e-08,
      "loss": 16.2815,
      "step": 15150
    },
    {
      "epoch": 0.4392333643975351,
      "grad_norm": 22.578125,
      "learning_rate": 6.863046108833462e-08,
      "loss": 16.2694,
      "step": 15200
    },
    {
      "epoch": 0.44067821099094806,
      "grad_norm": 26.828125,
      "learning_rate": 6.885621918401994e-08,
      "loss": 16.2776,
      "step": 15250
    },
    {
      "epoch": 0.442123057584361,
      "grad_norm": 21.828125,
      "learning_rate": 6.908197727970524e-08,
      "loss": 16.3022,
      "step": 15300
    },
    {
      "epoch": 0.4435679041777739,
      "grad_norm": 20.9375,
      "learning_rate": 6.930773537539056e-08,
      "loss": 16.2117,
      "step": 15350
    },
    {
      "epoch": 0.44501275077118685,
      "grad_norm": 25.59375,
      "learning_rate": 6.953349347107587e-08,
      "loss": 16.3209,
      "step": 15400
    },
    {
      "epoch": 0.44645759736459983,
      "grad_norm": 24.96875,
      "learning_rate": 6.975925156676119e-08,
      "loss": 16.2672,
      "step": 15450
    },
    {
      "epoch": 0.44790244395801276,
      "grad_norm": 20.359375,
      "learning_rate": 6.99850096624465e-08,
      "loss": 16.2478,
      "step": 15500
    },
    {
      "epoch": 0.4493472905514257,
      "grad_norm": 20.28125,
      "learning_rate": 7.021076775813179e-08,
      "loss": 16.2187,
      "step": 15550
    },
    {
      "epoch": 0.4507921371448386,
      "grad_norm": 21.1875,
      "learning_rate": 7.043652585381711e-08,
      "loss": 16.2327,
      "step": 15600
    },
    {
      "epoch": 0.4522369837382516,
      "grad_norm": 21.0625,
      "learning_rate": 7.066228394950242e-08,
      "loss": 16.321,
      "step": 15650
    },
    {
      "epoch": 0.45368183033166454,
      "grad_norm": 24.265625,
      "learning_rate": 7.088804204518774e-08,
      "loss": 16.2536,
      "step": 15700
    },
    {
      "epoch": 0.45512667692507747,
      "grad_norm": 22.953125,
      "learning_rate": 7.111380014087305e-08,
      "loss": 16.2889,
      "step": 15750
    },
    {
      "epoch": 0.45657152351849045,
      "grad_norm": 22.265625,
      "learning_rate": 7.133955823655837e-08,
      "loss": 16.2099,
      "step": 15800
    },
    {
      "epoch": 0.4580163701119034,
      "grad_norm": 21.25,
      "learning_rate": 7.156531633224367e-08,
      "loss": 16.2138,
      "step": 15850
    },
    {
      "epoch": 0.4594612167053163,
      "grad_norm": 22.125,
      "learning_rate": 7.179107442792897e-08,
      "loss": 16.2212,
      "step": 15900
    },
    {
      "epoch": 0.46090606329872924,
      "grad_norm": 21.515625,
      "learning_rate": 7.201683252361429e-08,
      "loss": 16.2495,
      "step": 15950
    },
    {
      "epoch": 0.4623509098921422,
      "grad_norm": 22.609375,
      "learning_rate": 7.22425906192996e-08,
      "loss": 16.2045,
      "step": 16000
    },
    {
      "epoch": 0.46379575648555516,
      "grad_norm": 22.15625,
      "learning_rate": 7.246834871498492e-08,
      "loss": 16.2562,
      "step": 16050
    },
    {
      "epoch": 0.4652406030789681,
      "grad_norm": 22.703125,
      "learning_rate": 7.269410681067024e-08,
      "loss": 16.2227,
      "step": 16100
    },
    {
      "epoch": 0.466685449672381,
      "grad_norm": 22.484375,
      "learning_rate": 7.291986490635554e-08,
      "loss": 16.2847,
      "step": 16150
    },
    {
      "epoch": 0.468130296265794,
      "grad_norm": 23.125,
      "learning_rate": 7.314562300204084e-08,
      "loss": 16.2231,
      "step": 16200
    },
    {
      "epoch": 0.46957514285920693,
      "grad_norm": 21.703125,
      "learning_rate": 7.337138109772616e-08,
      "loss": 16.2217,
      "step": 16250
    },
    {
      "epoch": 0.47101998945261986,
      "grad_norm": 21.34375,
      "learning_rate": 7.359713919341147e-08,
      "loss": 16.0962,
      "step": 16300
    },
    {
      "epoch": 0.4724648360460328,
      "grad_norm": 24.234375,
      "learning_rate": 7.382289728909679e-08,
      "loss": 16.1981,
      "step": 16350
    },
    {
      "epoch": 0.4739096826394458,
      "grad_norm": 22.09375,
      "learning_rate": 7.40486553847821e-08,
      "loss": 16.228,
      "step": 16400
    },
    {
      "epoch": 0.4753545292328587,
      "grad_norm": 20.671875,
      "learning_rate": 7.42744134804674e-08,
      "loss": 16.2862,
      "step": 16450
    },
    {
      "epoch": 0.47679937582627163,
      "grad_norm": 21.265625,
      "learning_rate": 7.450017157615272e-08,
      "loss": 16.3092,
      "step": 16500
    },
    {
      "epoch": 0.4782442224196846,
      "grad_norm": 24.90625,
      "learning_rate": 7.472592967183802e-08,
      "loss": 16.2476,
      "step": 16550
    },
    {
      "epoch": 0.47968906901309755,
      "grad_norm": 23.328125,
      "learning_rate": 7.495168776752334e-08,
      "loss": 16.1906,
      "step": 16600
    },
    {
      "epoch": 0.4811339156065105,
      "grad_norm": 23.140625,
      "learning_rate": 7.517744586320865e-08,
      "loss": 16.154,
      "step": 16650
    },
    {
      "epoch": 0.4825787621999234,
      "grad_norm": 22.234375,
      "learning_rate": 7.540320395889396e-08,
      "loss": 16.2017,
      "step": 16700
    },
    {
      "epoch": 0.4840236087933364,
      "grad_norm": 22.890625,
      "learning_rate": 7.562896205457927e-08,
      "loss": 16.345,
      "step": 16750
    },
    {
      "epoch": 0.4854684553867493,
      "grad_norm": 21.625,
      "learning_rate": 7.585472015026459e-08,
      "loss": 16.2125,
      "step": 16800
    },
    {
      "epoch": 0.48691330198016225,
      "grad_norm": 21.640625,
      "learning_rate": 7.60804782459499e-08,
      "loss": 16.217,
      "step": 16850
    },
    {
      "epoch": 0.4883581485735752,
      "grad_norm": 23.421875,
      "learning_rate": 7.63062363416352e-08,
      "loss": 16.2419,
      "step": 16900
    },
    {
      "epoch": 0.48980299516698816,
      "grad_norm": 25.9375,
      "learning_rate": 7.653199443732052e-08,
      "loss": 16.1463,
      "step": 16950
    },
    {
      "epoch": 0.4912478417604011,
      "grad_norm": 22.703125,
      "learning_rate": 7.675775253300582e-08,
      "loss": 16.0852,
      "step": 17000
    },
    {
      "epoch": 0.492692688353814,
      "grad_norm": 21.96875,
      "learning_rate": 7.698351062869114e-08,
      "loss": 16.2164,
      "step": 17050
    },
    {
      "epoch": 0.49413753494722695,
      "grad_norm": 25.6875,
      "learning_rate": 7.720926872437645e-08,
      "loss": 16.1581,
      "step": 17100
    },
    {
      "epoch": 0.49558238154063994,
      "grad_norm": 22.953125,
      "learning_rate": 7.743502682006177e-08,
      "loss": 16.1946,
      "step": 17150
    },
    {
      "epoch": 0.49702722813405287,
      "grad_norm": 21.984375,
      "learning_rate": 7.766078491574707e-08,
      "loss": 16.2602,
      "step": 17200
    },
    {
      "epoch": 0.4984720747274658,
      "grad_norm": 23.78125,
      "learning_rate": 7.788654301143239e-08,
      "loss": 16.1283,
      "step": 17250
    },
    {
      "epoch": 0.4999169213208788,
      "grad_norm": 21.0625,
      "learning_rate": 7.811230110711769e-08,
      "loss": 16.1301,
      "step": 17300
    },
    {
      "epoch": 0.5013617679142917,
      "grad_norm": 27.609375,
      "learning_rate": 7.8338059202803e-08,
      "loss": 16.0988,
      "step": 17350
    },
    {
      "epoch": 0.5028066145077047,
      "grad_norm": 24.4375,
      "learning_rate": 7.856381729848832e-08,
      "loss": 16.207,
      "step": 17400
    },
    {
      "epoch": 0.5042514611011176,
      "grad_norm": 23.765625,
      "learning_rate": 7.878957539417364e-08,
      "loss": 16.2054,
      "step": 17450
    },
    {
      "epoch": 0.5056963076945306,
      "grad_norm": 22.140625,
      "learning_rate": 7.901533348985895e-08,
      "loss": 16.173,
      "step": 17500
    },
    {
      "epoch": 0.5071411542879435,
      "grad_norm": 23.53125,
      "learning_rate": 7.924109158554424e-08,
      "loss": 16.043,
      "step": 17550
    },
    {
      "epoch": 0.5085860008813564,
      "grad_norm": 21.78125,
      "learning_rate": 7.946684968122956e-08,
      "loss": 16.246,
      "step": 17600
    },
    {
      "epoch": 0.5100308474747693,
      "grad_norm": 22.125,
      "learning_rate": 7.969260777691487e-08,
      "loss": 16.2203,
      "step": 17650
    },
    {
      "epoch": 0.5114756940681823,
      "grad_norm": 24.15625,
      "learning_rate": 7.991836587260019e-08,
      "loss": 16.1279,
      "step": 17700
    },
    {
      "epoch": 0.5129205406615952,
      "grad_norm": 22.1875,
      "learning_rate": 8.01441239682855e-08,
      "loss": 16.0409,
      "step": 17750
    },
    {
      "epoch": 0.5143653872550082,
      "grad_norm": 23.640625,
      "learning_rate": 8.036988206397082e-08,
      "loss": 16.0298,
      "step": 17800
    },
    {
      "epoch": 0.5158102338484212,
      "grad_norm": 24.015625,
      "learning_rate": 8.059564015965612e-08,
      "loss": 16.0894,
      "step": 17850
    },
    {
      "epoch": 0.5172550804418341,
      "grad_norm": 25.109375,
      "learning_rate": 8.082139825534142e-08,
      "loss": 16.1767,
      "step": 17900
    },
    {
      "epoch": 0.518699927035247,
      "grad_norm": 19.859375,
      "learning_rate": 8.104715635102674e-08,
      "loss": 16.0735,
      "step": 17950
    },
    {
      "epoch": 0.52014477362866,
      "grad_norm": 21.375,
      "learning_rate": 8.127291444671206e-08,
      "loss": 16.1829,
      "step": 18000
    },
    {
      "epoch": 0.52014477362866,
      "eval_loss": 2.015727996826172,
      "eval_runtime": 347.0309,
      "eval_samples_per_second": 2687.178,
      "eval_steps_per_second": 41.988,
      "step": 18000
    },
    {
      "epoch": 0.5215896202220729,
      "grad_norm": 20.765625,
      "learning_rate": 8.149867254239737e-08,
      "loss": 16.1783,
      "step": 18050
    },
    {
      "epoch": 0.5230344668154858,
      "grad_norm": 21.921875,
      "learning_rate": 8.172443063808269e-08,
      "loss": 16.0167,
      "step": 18100
    },
    {
      "epoch": 0.5244793134088989,
      "grad_norm": 21.1875,
      "learning_rate": 8.195018873376799e-08,
      "loss": 16.0192,
      "step": 18150
    },
    {
      "epoch": 0.5259241600023118,
      "grad_norm": 22.78125,
      "learning_rate": 8.217594682945329e-08,
      "loss": 16.1586,
      "step": 18200
    },
    {
      "epoch": 0.5273690065957247,
      "grad_norm": 21.15625,
      "learning_rate": 8.240170492513861e-08,
      "loss": 16.0389,
      "step": 18250
    },
    {
      "epoch": 0.5288138531891377,
      "grad_norm": 24.90625,
      "learning_rate": 8.262746302082392e-08,
      "loss": 16.1815,
      "step": 18300
    },
    {
      "epoch": 0.5302586997825506,
      "grad_norm": 21.171875,
      "learning_rate": 8.285322111650924e-08,
      "loss": 16.1257,
      "step": 18350
    },
    {
      "epoch": 0.5317035463759635,
      "grad_norm": 23.96875,
      "learning_rate": 8.307897921219455e-08,
      "loss": 16.1506,
      "step": 18400
    },
    {
      "epoch": 0.5331483929693764,
      "grad_norm": 23.3125,
      "learning_rate": 8.330473730787986e-08,
      "loss": 16.1117,
      "step": 18450
    },
    {
      "epoch": 0.5345932395627894,
      "grad_norm": 23.328125,
      "learning_rate": 8.353049540356517e-08,
      "loss": 16.1372,
      "step": 18500
    },
    {
      "epoch": 0.5360380861562024,
      "grad_norm": 21.265625,
      "learning_rate": 8.375625349925047e-08,
      "loss": 16.1619,
      "step": 18550
    },
    {
      "epoch": 0.5374829327496153,
      "grad_norm": 22.53125,
      "learning_rate": 8.398201159493579e-08,
      "loss": 16.0791,
      "step": 18600
    },
    {
      "epoch": 0.5389277793430283,
      "grad_norm": 27.203125,
      "learning_rate": 8.42077696906211e-08,
      "loss": 16.2036,
      "step": 18650
    },
    {
      "epoch": 0.5403726259364412,
      "grad_norm": 23.25,
      "learning_rate": 8.443352778630641e-08,
      "loss": 16.0342,
      "step": 18700
    },
    {
      "epoch": 0.5418174725298541,
      "grad_norm": 22.453125,
      "learning_rate": 8.465928588199172e-08,
      "loss": 16.1121,
      "step": 18750
    },
    {
      "epoch": 0.5432623191232671,
      "grad_norm": 27.1875,
      "learning_rate": 8.488504397767704e-08,
      "loss": 16.1368,
      "step": 18800
    },
    {
      "epoch": 0.54470716571668,
      "grad_norm": 24.609375,
      "learning_rate": 8.511080207336235e-08,
      "loss": 16.006,
      "step": 18850
    },
    {
      "epoch": 0.546152012310093,
      "grad_norm": 25.890625,
|
"learning_rate": 8.533656016904766e-08, |
|
"loss": 16.0605, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 0.547596858903506, |
|
"grad_norm": 21.46875, |
|
"learning_rate": 8.556231826473297e-08, |
|
"loss": 16.1316, |
|
"step": 18950 |
|
}, |
|
{ |
|
"epoch": 0.5490417054969189, |
|
"grad_norm": 21.75, |
|
"learning_rate": 8.578807636041827e-08, |
|
"loss": 16.0824, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.5504865520903318, |
|
"grad_norm": 23.796875, |
|
"learning_rate": 8.601383445610359e-08, |
|
"loss": 16.0387, |
|
"step": 19050 |
|
}, |
|
{ |
|
"epoch": 0.5519313986837447, |
|
"grad_norm": 21.625, |
|
"learning_rate": 8.62395925517889e-08, |
|
"loss": 16.1834, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 0.5533762452771577, |
|
"grad_norm": 20.375, |
|
"learning_rate": 8.646535064747422e-08, |
|
"loss": 16.0675, |
|
"step": 19150 |
|
}, |
|
{ |
|
"epoch": 0.5548210918705706, |
|
"grad_norm": 20.65625, |
|
"learning_rate": 8.669110874315952e-08, |
|
"loss": 15.9053, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 0.5562659384639835, |
|
"grad_norm": 21.984375, |
|
"learning_rate": 8.691686683884484e-08, |
|
"loss": 16.0319, |
|
"step": 19250 |
|
}, |
|
{ |
|
"epoch": 0.5577107850573966, |
|
"grad_norm": 19.9375, |
|
"learning_rate": 8.714262493453014e-08, |
|
"loss": 16.0288, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 0.5591556316508095, |
|
"grad_norm": 21.234375, |
|
"learning_rate": 8.736838303021546e-08, |
|
"loss": 16.0782, |
|
"step": 19350 |
|
}, |
|
{ |
|
"epoch": 0.5606004782442224, |
|
"grad_norm": 23.84375, |
|
"learning_rate": 8.759414112590077e-08, |
|
"loss": 15.987, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 0.5620453248376354, |
|
"grad_norm": 26.1875, |
|
"learning_rate": 8.781989922158609e-08, |
|
"loss": 16.1125, |
|
"step": 19450 |
|
}, |
|
{ |
|
"epoch": 0.5634901714310483, |
|
"grad_norm": 22.640625, |
|
"learning_rate": 8.80456573172714e-08, |
|
"loss": 16.066, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.5649350180244612, |
|
"grad_norm": 23.9375, |
|
"learning_rate": 8.827141541295669e-08, |
|
"loss": 15.9326, |
|
"step": 19550 |
|
}, |
|
{ |
|
"epoch": 0.5663798646178742, |
|
"grad_norm": 23.34375, |
|
"learning_rate": 8.849717350864201e-08, |
|
"loss": 16.0386, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 0.5678247112112872, |
|
"grad_norm": 22.703125, |
|
"learning_rate": 8.872293160432732e-08, |
|
"loss": 16.0198, |
|
"step": 19650 |
|
}, |
|
{ |
|
"epoch": 0.5692695578047001, |
|
"grad_norm": 24.21875, |
|
"learning_rate": 8.894868970001264e-08, |
|
"loss": 15.9735, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 0.570714404398113, |
|
"grad_norm": 23.1875, |
|
"learning_rate": 8.917444779569795e-08, |
|
"loss": 15.9592, |
|
"step": 19750 |
|
}, |
|
{ |
|
"epoch": 0.572159250991526, |
|
"grad_norm": 21.96875, |
|
"learning_rate": 8.940020589138327e-08, |
|
"loss": 16.0753, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 0.5736040975849389, |
|
"grad_norm": 23.59375, |
|
"learning_rate": 8.962596398706857e-08, |
|
"loss": 15.9445, |
|
"step": 19850 |
|
}, |
|
{ |
|
"epoch": 0.5750489441783518, |
|
"grad_norm": 21.453125, |
|
"learning_rate": 8.985172208275388e-08, |
|
"loss": 15.9417, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 0.5764937907717648, |
|
"grad_norm": 26.203125, |
|
"learning_rate": 9.007748017843919e-08, |
|
"loss": 15.9153, |
|
"step": 19950 |
|
}, |
|
{ |
|
"epoch": 0.5779386373651777, |
|
"grad_norm": 26.90625, |
|
"learning_rate": 9.03032382741245e-08, |
|
"loss": 16.0521, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.5793834839585907, |
|
"grad_norm": 24.5625, |
|
"learning_rate": 9.052899636980982e-08, |
|
"loss": 16.0634, |
|
"step": 20050 |
|
}, |
|
{ |
|
"epoch": 0.5808283305520037, |
|
"grad_norm": 22.6875, |
|
"learning_rate": 9.075475446549514e-08, |
|
"loss": 16.0152, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 0.5822731771454166, |
|
"grad_norm": 21.15625, |
|
"learning_rate": 9.098051256118044e-08, |
|
"loss": 15.9701, |
|
"step": 20150 |
|
}, |
|
{ |
|
"epoch": 0.5837180237388295, |
|
"grad_norm": 23.03125, |
|
"learning_rate": 9.120627065686574e-08, |
|
"loss": 15.9496, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 0.5851628703322425, |
|
"grad_norm": 22.53125, |
|
"learning_rate": 9.143202875255106e-08, |
|
"loss": 15.97, |
|
"step": 20250 |
|
}, |
|
{ |
|
"epoch": 0.5866077169256554, |
|
"grad_norm": 23.046875, |
|
"learning_rate": 9.165778684823637e-08, |
|
"loss": 15.8332, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 0.5880525635190683, |
|
"grad_norm": 21.40625, |
|
"learning_rate": 9.188354494392169e-08, |
|
"loss": 15.977, |
|
"step": 20350 |
|
}, |
|
{ |
|
"epoch": 0.5894974101124814, |
|
"grad_norm": 24.3125, |
|
"learning_rate": 9.210930303960699e-08, |
|
"loss": 15.9583, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 0.5909422567058943, |
|
"grad_norm": 23.359375, |
|
"learning_rate": 9.23350611352923e-08, |
|
"loss": 16.0266, |
|
"step": 20450 |
|
}, |
|
{ |
|
"epoch": 0.5923871032993072, |
|
"grad_norm": 23.25, |
|
"learning_rate": 9.256081923097762e-08, |
|
"loss": 15.9804, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.5938319498927201, |
|
"grad_norm": 20.296875, |
|
"learning_rate": 9.278657732666292e-08, |
|
"loss": 15.9473, |
|
"step": 20550 |
|
}, |
|
{ |
|
"epoch": 0.5952767964861331, |
|
"grad_norm": 21.1875, |
|
"learning_rate": 9.301233542234824e-08, |
|
"loss": 15.9547, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 0.596721643079546, |
|
"grad_norm": 23.40625, |
|
"learning_rate": 9.323809351803356e-08, |
|
"loss": 16.0047, |
|
"step": 20650 |
|
}, |
|
{ |
|
"epoch": 0.5981664896729589, |
|
"grad_norm": 22.03125, |
|
"learning_rate": 9.346385161371886e-08, |
|
"loss": 15.9565, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 0.5996113362663719, |
|
"grad_norm": 23.96875, |
|
"learning_rate": 9.368960970940417e-08, |
|
"loss": 15.978, |
|
"step": 20750 |
|
}, |
|
{ |
|
"epoch": 0.6010561828597849, |
|
"grad_norm": 26.078125, |
|
"learning_rate": 9.391536780508949e-08, |
|
"loss": 15.9615, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 0.6025010294531978, |
|
"grad_norm": 22.421875, |
|
"learning_rate": 9.41411259007748e-08, |
|
"loss": 15.9731, |
|
"step": 20850 |
|
}, |
|
{ |
|
"epoch": 0.6039458760466108, |
|
"grad_norm": 28.59375, |
|
"learning_rate": 9.436688399646011e-08, |
|
"loss": 15.8916, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 0.6053907226400237, |
|
"grad_norm": 21.375, |
|
"learning_rate": 9.459264209214542e-08, |
|
"loss": 15.9134, |
|
"step": 20950 |
|
}, |
|
{ |
|
"epoch": 0.6068355692334366, |
|
"grad_norm": 23.015625, |
|
"learning_rate": 9.481840018783072e-08, |
|
"loss": 15.9756, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.6068355692334366, |
|
"eval_loss": 1.990402102470398, |
|
"eval_runtime": 346.2542, |
|
"eval_samples_per_second": 2693.206, |
|
"eval_steps_per_second": 42.082, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.6082804158268496, |
|
"grad_norm": 21.765625, |
|
"learning_rate": 9.504415828351604e-08, |
|
"loss": 15.9846, |
|
"step": 21050 |
|
}, |
|
{ |
|
"epoch": 0.6097252624202625, |
|
"grad_norm": 21.875, |
|
"learning_rate": 9.526991637920136e-08, |
|
"loss": 15.92, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 0.6111701090136755, |
|
"grad_norm": 23.140625, |
|
"learning_rate": 9.549567447488667e-08, |
|
"loss": 16.0141, |
|
"step": 21150 |
|
}, |
|
{ |
|
"epoch": 0.6126149556070885, |
|
"grad_norm": 22.1875, |
|
"learning_rate": 9.572143257057199e-08, |
|
"loss": 15.9318, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 0.6140598022005014, |
|
"grad_norm": 24.421875, |
|
"learning_rate": 9.594719066625728e-08, |
|
"loss": 15.9115, |
|
"step": 21250 |
|
}, |
|
{ |
|
"epoch": 0.6155046487939143, |
|
"grad_norm": 22.734375, |
|
"learning_rate": 9.617294876194259e-08, |
|
"loss": 15.847, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 0.6169494953873272, |
|
"grad_norm": 23.078125, |
|
"learning_rate": 9.639870685762791e-08, |
|
"loss": 16.0101, |
|
"step": 21350 |
|
}, |
|
{ |
|
"epoch": 0.6183943419807402, |
|
"grad_norm": 21.265625, |
|
"learning_rate": 9.662446495331322e-08, |
|
"loss": 15.9588, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 0.6198391885741531, |
|
"grad_norm": 24.171875, |
|
"learning_rate": 9.685022304899854e-08, |
|
"loss": 15.9525, |
|
"step": 21450 |
|
}, |
|
{ |
|
"epoch": 0.621284035167566, |
|
"grad_norm": 23.078125, |
|
"learning_rate": 9.707598114468385e-08, |
|
"loss": 15.9591, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.6227288817609791, |
|
"grad_norm": 26.1875, |
|
"learning_rate": 9.730173924036914e-08, |
|
"loss": 15.8973, |
|
"step": 21550 |
|
}, |
|
{ |
|
"epoch": 0.624173728354392, |
|
"grad_norm": 22.1875, |
|
"learning_rate": 9.752749733605446e-08, |
|
"loss": 15.921, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 0.6256185749478049, |
|
"grad_norm": 34.90625, |
|
"learning_rate": 9.775325543173977e-08, |
|
"loss": 15.8763, |
|
"step": 21650 |
|
}, |
|
{ |
|
"epoch": 0.6270634215412179, |
|
"grad_norm": 20.875, |
|
"learning_rate": 9.797901352742509e-08, |
|
"loss": 15.9879, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 0.6285082681346308, |
|
"grad_norm": 22.796875, |
|
"learning_rate": 9.82047716231104e-08, |
|
"loss": 15.8933, |
|
"step": 21750 |
|
}, |
|
{ |
|
"epoch": 0.6299531147280437, |
|
"grad_norm": 25.84375, |
|
"learning_rate": 9.843052971879572e-08, |
|
"loss": 15.7943, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 0.6313979613214566, |
|
"grad_norm": 21.1875, |
|
"learning_rate": 9.865628781448102e-08, |
|
"loss": 15.842, |
|
"step": 21850 |
|
}, |
|
{ |
|
"epoch": 0.6328428079148697, |
|
"grad_norm": 25.890625, |
|
"learning_rate": 9.888204591016633e-08, |
|
"loss": 15.9081, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 0.6342876545082826, |
|
"grad_norm": 20.859375, |
|
"learning_rate": 9.910780400585164e-08, |
|
"loss": 15.825, |
|
"step": 21950 |
|
}, |
|
{ |
|
"epoch": 0.6357325011016955, |
|
"grad_norm": 20.953125, |
|
"learning_rate": 9.933356210153696e-08, |
|
"loss": 15.8846, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.6371773476951085, |
|
"grad_norm": 22.0, |
|
"learning_rate": 9.955932019722227e-08, |
|
"loss": 15.8721, |
|
"step": 22050 |
|
}, |
|
{ |
|
"epoch": 0.6386221942885214, |
|
"grad_norm": 24.765625, |
|
"learning_rate": 9.978507829290759e-08, |
|
"loss": 15.7167, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 0.6400670408819343, |
|
"grad_norm": 23.984375, |
|
"learning_rate": 1.0001083638859289e-07, |
|
"loss": 15.8247, |
|
"step": 22150 |
|
}, |
|
{ |
|
"epoch": 0.6415118874753473, |
|
"grad_norm": 19.5, |
|
"learning_rate": 1.002365944842782e-07, |
|
"loss": 15.8111, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 0.6429567340687602, |
|
"grad_norm": 25.640625, |
|
"learning_rate": 1.0046235257996351e-07, |
|
"loss": 15.9381, |
|
"step": 22250 |
|
}, |
|
{ |
|
"epoch": 0.6444015806621732, |
|
"grad_norm": 19.84375, |
|
"learning_rate": 1.0068811067564882e-07, |
|
"loss": 15.7998, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 0.6458464272555862, |
|
"grad_norm": 21.296875, |
|
"learning_rate": 1.0091386877133414e-07, |
|
"loss": 15.8191, |
|
"step": 22350 |
|
}, |
|
{ |
|
"epoch": 0.6472912738489991, |
|
"grad_norm": 22.0, |
|
"learning_rate": 1.0113962686701944e-07, |
|
"loss": 15.8098, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 0.648736120442412, |
|
"grad_norm": 24.46875, |
|
"learning_rate": 1.0136538496270476e-07, |
|
"loss": 15.8501, |
|
"step": 22450 |
|
}, |
|
{ |
|
"epoch": 0.650180967035825, |
|
"grad_norm": 23.3125, |
|
"learning_rate": 1.0159114305839007e-07, |
|
"loss": 15.8631, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.6516258136292379, |
|
"grad_norm": 25.875, |
|
"learning_rate": 1.0181690115407538e-07, |
|
"loss": 15.6956, |
|
"step": 22550 |
|
}, |
|
{ |
|
"epoch": 0.6530706602226508, |
|
"grad_norm": 24.703125, |
|
"learning_rate": 1.0204265924976069e-07, |
|
"loss": 15.8223, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 0.6545155068160639, |
|
"grad_norm": 22.15625, |
|
"learning_rate": 1.02268417345446e-07, |
|
"loss": 15.9404, |
|
"step": 22650 |
|
}, |
|
{ |
|
"epoch": 0.6559603534094768, |
|
"grad_norm": 23.59375, |
|
"learning_rate": 1.0249417544113131e-07, |
|
"loss": 15.7929, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 0.6574052000028897, |
|
"grad_norm": 20.5, |
|
"learning_rate": 1.0271993353681662e-07, |
|
"loss": 15.8155, |
|
"step": 22750 |
|
}, |
|
{ |
|
"epoch": 0.6588500465963026, |
|
"grad_norm": 22.984375, |
|
"learning_rate": 1.0294569163250194e-07, |
|
"loss": 15.8803, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 0.6602948931897156, |
|
"grad_norm": 22.859375, |
|
"learning_rate": 1.0317144972818726e-07, |
|
"loss": 15.8193, |
|
"step": 22850 |
|
}, |
|
{ |
|
"epoch": 0.6617397397831285, |
|
"grad_norm": 24.109375, |
|
"learning_rate": 1.0339720782387256e-07, |
|
"loss": 15.6904, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 0.6631845863765414, |
|
"grad_norm": 28.3125, |
|
"learning_rate": 1.0362296591955787e-07, |
|
"loss": 15.8436, |
|
"step": 22950 |
|
}, |
|
{ |
|
"epoch": 0.6646294329699545, |
|
"grad_norm": 24.234375, |
|
"learning_rate": 1.0384872401524318e-07, |
|
"loss": 15.7791, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.6660742795633674, |
|
"grad_norm": 22.40625, |
|
"learning_rate": 1.0407448211092849e-07, |
|
"loss": 15.8147, |
|
"step": 23050 |
|
}, |
|
{ |
|
"epoch": 0.6675191261567803, |
|
"grad_norm": 21.90625, |
|
"learning_rate": 1.0430024020661381e-07, |
|
"loss": 15.8562, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 0.6689639727501933, |
|
"grad_norm": 22.296875, |
|
"learning_rate": 1.0452599830229912e-07, |
|
"loss": 15.8147, |
|
"step": 23150 |
|
}, |
|
{ |
|
"epoch": 0.6704088193436062, |
|
"grad_norm": 24.0, |
|
"learning_rate": 1.0475175639798444e-07, |
|
"loss": 15.7467, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 0.6718536659370191, |
|
"grad_norm": 23.09375, |
|
"learning_rate": 1.0497751449366973e-07, |
|
"loss": 15.8176, |
|
"step": 23250 |
|
}, |
|
{ |
|
"epoch": 0.673298512530432, |
|
"grad_norm": 29.296875, |
|
"learning_rate": 1.0520327258935504e-07, |
|
"loss": 15.7216, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 0.674743359123845, |
|
"grad_norm": 22.8125, |
|
"learning_rate": 1.0542903068504036e-07, |
|
"loss": 15.707, |
|
"step": 23350 |
|
}, |
|
{ |
|
"epoch": 0.676188205717258, |
|
"grad_norm": 23.859375, |
|
"learning_rate": 1.0565478878072567e-07, |
|
"loss": 15.752, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 0.677633052310671, |
|
"grad_norm": 22.40625, |
|
"learning_rate": 1.0588054687641099e-07, |
|
"loss": 15.7979, |
|
"step": 23450 |
|
}, |
|
{ |
|
"epoch": 0.6790778989040839, |
|
"grad_norm": 23.6875, |
|
"learning_rate": 1.061063049720963e-07, |
|
"loss": 15.7577, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.6805227454974968, |
|
"grad_norm": 20.421875, |
|
"learning_rate": 1.063320630677816e-07, |
|
"loss": 15.7758, |
|
"step": 23550 |
|
}, |
|
{ |
|
"epoch": 0.6819675920909097, |
|
"grad_norm": 22.734375, |
|
"learning_rate": 1.0655782116346691e-07, |
|
"loss": 15.7973, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 0.6834124386843227, |
|
"grad_norm": 20.765625, |
|
"learning_rate": 1.0678357925915222e-07, |
|
"loss": 15.8134, |
|
"step": 23650 |
|
}, |
|
{ |
|
"epoch": 0.6848572852777356, |
|
"grad_norm": 24.046875, |
|
"learning_rate": 1.0700933735483754e-07, |
|
"loss": 15.7493, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 0.6863021318711486, |
|
"grad_norm": 24.34375, |
|
"learning_rate": 1.0723509545052286e-07, |
|
"loss": 15.6879, |
|
"step": 23750 |
|
}, |
|
{ |
|
"epoch": 0.6877469784645616, |
|
"grad_norm": 23.109375, |
|
"learning_rate": 1.0746085354620817e-07, |
|
"loss": 15.807, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 0.6891918250579745, |
|
"grad_norm": 25.34375, |
|
"learning_rate": 1.0768661164189347e-07, |
|
"loss": 15.8162, |
|
"step": 23850 |
|
}, |
|
{ |
|
"epoch": 0.6906366716513874, |
|
"grad_norm": 21.140625, |
|
"learning_rate": 1.0791236973757878e-07, |
|
"loss": 15.7665, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 0.6920815182448004, |
|
"grad_norm": 22.796875, |
|
"learning_rate": 1.0813812783326409e-07, |
|
"loss": 15.7595, |
|
"step": 23950 |
|
}, |
|
{ |
|
"epoch": 0.6935263648382133, |
|
"grad_norm": 22.578125, |
|
"learning_rate": 1.0836388592894941e-07, |
|
"loss": 15.7217, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.6935263648382133, |
|
"eval_loss": 1.9680598974227905, |
|
"eval_runtime": 341.6298, |
|
"eval_samples_per_second": 2729.662, |
|
"eval_steps_per_second": 42.651, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.6949712114316262, |
|
"grad_norm": 21.015625, |
|
"learning_rate": 1.0858964402463472e-07, |
|
"loss": 15.778, |
|
"step": 24050 |
|
}, |
|
{ |
|
"epoch": 0.6964160580250391, |
|
"grad_norm": 24.84375, |
|
"learning_rate": 1.0881540212032004e-07, |
|
"loss": 15.7675, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 0.6978609046184522, |
|
"grad_norm": 26.171875, |
|
"learning_rate": 1.0904116021600534e-07, |
|
"loss": 15.7371, |
|
"step": 24150 |
|
}, |
|
{ |
|
"epoch": 0.6993057512118651, |
|
"grad_norm": 20.859375, |
|
"learning_rate": 1.0926691831169066e-07, |
|
"loss": 15.8547, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 0.700750597805278, |
|
"grad_norm": 22.34375, |
|
"learning_rate": 1.0949267640737596e-07, |
|
"loss": 15.6903, |
|
"step": 24250 |
|
}, |
|
{ |
|
"epoch": 0.702195444398691, |
|
"grad_norm": 23.140625, |
|
"learning_rate": 1.0971843450306127e-07, |
|
"loss": 15.7397, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 0.7036402909921039, |
|
"grad_norm": 21.953125, |
|
"learning_rate": 1.0994419259874659e-07, |
|
"loss": 15.6126, |
|
"step": 24350 |
|
}, |
|
{ |
|
"epoch": 0.7050851375855168, |
|
"grad_norm": 25.8125, |
|
"learning_rate": 1.1016995069443189e-07, |
|
"loss": 15.8006, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 0.7065299841789298, |
|
"grad_norm": 25.0625, |
|
"learning_rate": 1.1039570879011721e-07, |
|
"loss": 15.6731, |
|
"step": 24450 |
|
}, |
|
{ |
|
"epoch": 0.7079748307723428, |
|
"grad_norm": 26.265625, |
|
"learning_rate": 1.1062146688580252e-07, |
|
"loss": 15.7758, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.7094196773657557, |
|
"grad_norm": 23.390625, |
|
"learning_rate": 1.1084722498148783e-07, |
|
"loss": 15.7123, |
|
"step": 24550 |
|
}, |
|
{ |
|
"epoch": 0.7108645239591687, |
|
"grad_norm": 21.984375, |
|
"learning_rate": 1.1107298307717314e-07, |
|
"loss": 15.7585, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 0.7123093705525816, |
|
"grad_norm": 23.765625, |
|
"learning_rate": 1.1129874117285846e-07, |
|
"loss": 15.7461, |
|
"step": 24650 |
|
}, |
|
{ |
|
"epoch": 0.7137542171459945, |
|
"grad_norm": 22.265625, |
|
"learning_rate": 1.1152449926854376e-07, |
|
"loss": 15.7875, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 0.7151990637394074, |
|
"grad_norm": 22.375, |
|
"learning_rate": 1.1175025736422907e-07, |
|
"loss": 15.6815, |
|
"step": 24750 |
|
}, |
|
{ |
|
"epoch": 0.7166439103328204, |
|
"grad_norm": 25.4375, |
|
"learning_rate": 1.1197601545991439e-07, |
|
"loss": 15.7047, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 0.7180887569262333, |
|
"grad_norm": 30.53125, |
|
"learning_rate": 1.122017735555997e-07, |
|
"loss": 15.7082, |
|
"step": 24850 |
|
}, |
|
{ |
|
"epoch": 0.7195336035196463, |
|
"grad_norm": 23.859375, |
|
"learning_rate": 1.1242753165128501e-07, |
|
"loss": 15.6932, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 0.7209784501130593, |
|
"grad_norm": 21.890625, |
|
"learning_rate": 1.1265328974697032e-07, |
|
"loss": 15.7004, |
|
"step": 24950 |
|
}, |
|
{ |
|
"epoch": 0.7224232967064722, |
|
"grad_norm": 22.21875, |
|
"learning_rate": 1.1287904784265563e-07, |
|
"loss": 15.5976, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.7238681432998851, |
|
"grad_norm": 20.765625, |
|
"learning_rate": 1.1310480593834094e-07, |
|
"loss": 15.651, |
|
"step": 25050 |
|
}, |
|
{ |
|
"epoch": 0.7253129898932981, |
|
"grad_norm": 21.203125, |
|
"learning_rate": 1.1333056403402626e-07, |
|
"loss": 15.7333, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 0.726757836486711, |
|
"grad_norm": 22.25, |
|
"learning_rate": 1.1355632212971157e-07, |
|
"loss": 15.6802, |
|
"step": 25150 |
|
}, |
|
{ |
|
"epoch": 0.7282026830801239, |
|
"grad_norm": 22.8125, |
|
"learning_rate": 1.1378208022539689e-07, |
|
"loss": 15.6639, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 0.729647529673537, |
|
"grad_norm": 21.140625, |
|
"learning_rate": 1.1400783832108218e-07, |
|
"loss": 15.6816, |
|
"step": 25250 |
|
}, |
|
{ |
|
"epoch": 0.7310923762669499, |
|
"grad_norm": 23.0, |
|
"learning_rate": 1.1423359641676749e-07, |
|
"loss": 15.5984, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 0.7325372228603628, |
|
"grad_norm": 23.40625, |
|
"learning_rate": 1.1445935451245281e-07, |
|
"loss": 15.7119, |
|
"step": 25350 |
|
}, |
|
{ |
|
"epoch": 0.7339820694537758, |
|
"grad_norm": 21.296875, |
|
"learning_rate": 1.1468511260813812e-07, |
|
"loss": 15.6212, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 0.7354269160471887, |
|
"grad_norm": 20.03125, |
|
"learning_rate": 1.1491087070382344e-07, |
|
"loss": 15.659, |
|
"step": 25450 |
|
}, |
|
{ |
|
"epoch": 0.7368717626406016, |
|
"grad_norm": 22.984375, |
|
"learning_rate": 1.1513662879950876e-07, |
|
"loss": 15.771, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.7383166092340145, |
|
"grad_norm": 21.84375, |
|
"learning_rate": 1.1536238689519404e-07, |
|
"loss": 15.7338, |
|
"step": 25550 |
|
}, |
|
{ |
|
"epoch": 0.7397614558274275, |
|
"grad_norm": 22.234375, |
|
"learning_rate": 1.1558814499087936e-07, |
|
"loss": 15.6507, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 0.7412063024208405, |
|
"grad_norm": 25.3125, |
|
"learning_rate": 1.1581390308656468e-07, |
|
"loss": 15.7084, |
|
"step": 25650 |
|
}, |
|
{ |
|
"epoch": 0.7426511490142534, |
|
"grad_norm": 26.171875, |
|
"learning_rate": 1.1603966118224999e-07, |
|
"loss": 15.5601, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 0.7440959956076664, |
|
"grad_norm": 22.515625, |
|
"learning_rate": 1.1626541927793531e-07, |
|
"loss": 15.7191, |
|
"step": 25750 |
|
}, |
|
{ |
|
"epoch": 0.7455408422010793, |
|
"grad_norm": 24.375, |
|
"learning_rate": 1.1649117737362062e-07, |
|
"loss": 15.6457, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 0.7469856887944922, |
|
"grad_norm": 23.640625, |
|
"learning_rate": 1.1671693546930592e-07, |
|
"loss": 15.572, |
|
"step": 25850 |
|
}, |
|
{ |
|
"epoch": 0.7484305353879052, |
|
"grad_norm": 24.375, |
|
"learning_rate": 1.1694269356499123e-07, |
|
"loss": 15.6297, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 0.7498753819813181, |
|
"grad_norm": 23.8125, |
|
"learning_rate": 1.1716845166067654e-07, |
|
"loss": 15.6828, |
|
"step": 25950 |
|
}, |
|
{ |
|
"epoch": 0.7513202285747311, |
|
"grad_norm": 23.953125, |
|
"learning_rate": 1.1739420975636186e-07, |
|
"loss": 15.5568, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.7527650751681441, |
|
"grad_norm": 25.421875, |
|
"learning_rate": 1.1761996785204717e-07, |
|
"loss": 15.6016, |
|
"step": 26050 |
|
}, |
|
{ |
|
"epoch": 0.754209921761557, |
|
"grad_norm": 22.15625, |
|
"learning_rate": 1.1784572594773249e-07, |
|
"loss": 15.5887, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 0.7556547683549699, |
|
"grad_norm": 21.5625, |
|
"learning_rate": 1.1807148404341779e-07, |
|
"loss": 15.6077, |
|
"step": 26150 |
|
}, |
|
{ |
|
"epoch": 0.7570996149483828, |
|
"grad_norm": 23.328125, |
|
"learning_rate": 1.1829724213910311e-07, |
|
"loss": 15.6592, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 0.7585444615417958, |
|
"grad_norm": 23.71875, |
|
"learning_rate": 1.1852300023478841e-07, |
|
"loss": 15.621, |
|
"step": 26250 |
|
}, |
|
{ |
|
"epoch": 0.7599893081352087, |
|
"grad_norm": 24.5625, |
|
"learning_rate": 1.1874875833047373e-07, |
|
"loss": 15.5728, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 0.7614341547286216, |
|
"grad_norm": 23.21875, |
|
"learning_rate": 1.1897451642615904e-07, |
|
"loss": 15.6658, |
|
"step": 26350 |
|
}, |
|
{ |
|
"epoch": 0.7628790013220347, |
|
"grad_norm": 24.96875, |
|
"learning_rate": 1.1920027452184434e-07, |
|
"loss": 15.4367, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 0.7643238479154476, |
|
"grad_norm": 23.296875, |
|
"learning_rate": 1.1942603261752967e-07, |
|
"loss": 15.6812, |
|
"step": 26450 |
|
}, |
|
{ |
|
"epoch": 0.7657686945088605, |
|
"grad_norm": 21.21875, |
|
"learning_rate": 1.1965179071321497e-07, |
|
"loss": 15.4966, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.7672135411022735, |
|
"grad_norm": 21.859375, |
|
"learning_rate": 1.1987754880890028e-07, |
|
"loss": 15.6969, |
|
"step": 26550 |
|
}, |
|
{ |
|
"epoch": 0.7686583876956864, |
|
"grad_norm": 21.234375, |
|
"learning_rate": 1.2010330690458558e-07, |
|
"loss": 15.5063, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 0.7701032342890993, |
|
"grad_norm": 30.65625, |
|
"learning_rate": 1.203290650002709e-07, |
|
"loss": 15.5682, |
|
"step": 26650 |
|
}, |
|
{ |
|
"epoch": 0.7715480808825123, |
|
"grad_norm": 23.0625, |
|
"learning_rate": 1.205548230959562e-07, |
|
"loss": 15.5967, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 0.7729929274759253, |
|
"grad_norm": 22.171875, |
|
"learning_rate": 1.2078058119164154e-07, |
|
"loss": 15.5911, |
|
"step": 26750 |
|
}, |
|
{ |
|
"epoch": 0.7744377740693382, |
|
"grad_norm": 25.8125, |
|
"learning_rate": 1.2100633928732684e-07, |
|
"loss": 15.6635, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 0.7758826206627512, |
|
"grad_norm": 23.40625, |
|
"learning_rate": 1.2123209738301214e-07, |
|
"loss": 15.5876, |
|
"step": 26850 |
|
}, |
|
{ |
|
"epoch": 0.7773274672561641, |
|
"grad_norm": 21.15625, |
|
"learning_rate": 1.2145785547869745e-07, |
|
"loss": 15.5192, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 0.778772313849577, |
|
"grad_norm": 23.5625, |
|
"learning_rate": 1.2168361357438277e-07, |
|
"loss": 15.5746, |
|
"step": 26950 |
|
}, |
|
{ |
|
"epoch": 0.7802171604429899, |
|
"grad_norm": 27.359375, |
|
"learning_rate": 1.2190937167006808e-07, |
|
"loss": 15.5407, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.7802171604429899, |
|
"eval_loss": 1.9437412023544312, |
|
"eval_runtime": 340.4, |
|
"eval_samples_per_second": 2739.524, |
|
"eval_steps_per_second": 42.806, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.7816620070364029, |
|
"grad_norm": 23.0625, |
|
"learning_rate": 1.221351297657534e-07, |
|
"loss": 15.609, |
|
"step": 27050 |
|
}, |
|
{ |
|
"epoch": 0.7831068536298158, |
|
"grad_norm": 25.40625, |
|
"learning_rate": 1.223608878614387e-07, |
|
"loss": 15.6637, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 0.7845517002232288, |
|
"grad_norm": 23.90625, |
|
"learning_rate": 1.22586645957124e-07, |
|
"loss": 15.6405, |
|
"step": 27150 |
|
}, |
|
{ |
|
"epoch": 0.7859965468166418, |
|
"grad_norm": 22.390625, |
|
"learning_rate": 1.2281240405280934e-07, |
|
"loss": 15.5515, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 0.7874413934100547, |
|
"grad_norm": 25.265625, |
|
"learning_rate": 1.2303816214849464e-07, |
|
"loss": 15.5254, |
|
"step": 27250 |
|
}, |
|
{ |
|
"epoch": 0.7888862400034676, |
|
"grad_norm": 22.125, |
|
"learning_rate": 1.2326392024417994e-07, |
|
"loss": 15.5474, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 0.7903310865968806, |
|
"grad_norm": 23.03125, |
|
"learning_rate": 1.2348967833986527e-07, |
|
"loss": 15.554, |
|
"step": 27350 |
|
}, |
|
{ |
|
"epoch": 0.7917759331902935, |
|
"grad_norm": 19.96875, |
|
"learning_rate": 1.2371543643555057e-07, |
|
"loss": 15.5717, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 0.7932207797837064, |
|
"grad_norm": 20.53125, |
|
"learning_rate": 1.2394119453123588e-07, |
|
"loss": 15.5454, |
|
"step": 27450 |
|
}, |
|
{ |
|
"epoch": 0.7946656263771195, |
|
"grad_norm": 21.34375, |
|
"learning_rate": 1.241669526269212e-07, |
|
"loss": 15.5759, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.7961104729705324, |
|
"grad_norm": 23.9375, |
|
"learning_rate": 1.243927107226065e-07, |
|
"loss": 15.5199, |
|
"step": 27550 |
|
}, |
|
{ |
|
"epoch": 0.7975553195639453, |
|
"grad_norm": 21.84375, |
|
"learning_rate": 1.246184688182918e-07, |
|
"loss": 15.4171, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 0.7990001661573582, |
|
"grad_norm": 22.234375, |
|
"learning_rate": 1.2484422691397714e-07, |
|
"loss": 15.5973, |
|
"step": 27650 |
|
}, |
|
{ |
|
"epoch": 0.8004450127507712, |
|
"grad_norm": 22.421875, |
|
"learning_rate": 1.2506998500966244e-07, |
|
"loss": 15.4923, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 0.8018898593441841, |
|
"grad_norm": 21.34375, |
|
"learning_rate": 1.2529574310534774e-07, |
|
"loss": 15.509, |
|
"step": 27750 |
|
}, |
|
{ |
|
"epoch": 0.803334705937597, |
|
"grad_norm": 24.359375, |
|
"learning_rate": 1.2552150120103305e-07, |
|
"loss": 15.5824, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 0.80477955253101, |
|
"grad_norm": 21.25, |
|
"learning_rate": 1.2574725929671838e-07, |
|
"loss": 15.5973, |
|
"step": 27850 |
|
}, |
|
{ |
|
"epoch": 0.806224399124423, |
|
"grad_norm": 27.984375, |
|
"learning_rate": 1.2597301739240368e-07, |
|
"loss": 15.564, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 0.8076692457178359, |
|
"grad_norm": 33.71875, |
|
"learning_rate": 1.26198775488089e-07, |
|
"loss": 15.439, |
|
"step": 27950 |
|
}, |
|
{ |
|
"epoch": 0.8091140923112489, |
|
"grad_norm": 24.09375, |
|
"learning_rate": 1.264245335837743e-07, |
|
"loss": 15.5574, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.8105589389046618, |
|
"grad_norm": 22.78125, |
|
"learning_rate": 1.266502916794596e-07, |
|
"loss": 15.5851, |
|
"step": 28050 |
|
}, |
|
{ |
|
"epoch": 0.8120037854980747, |
|
"grad_norm": 21.125, |
|
"learning_rate": 1.2687604977514494e-07, |
|
"loss": 15.5334, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 0.8134486320914877, |
|
"grad_norm": 21.5625, |
|
"learning_rate": 1.2710180787083024e-07, |
|
"loss": 15.5789, |
|
"step": 28150 |
|
}, |
|
{ |
|
"epoch": 0.8148934786849006, |
|
"grad_norm": 21.8125, |
|
"learning_rate": 1.2732756596651557e-07, |
|
"loss": 15.4793, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 0.8163383252783136, |
|
"grad_norm": 21.234375, |
|
"learning_rate": 1.2755332406220087e-07, |
|
"loss": 15.5015, |
|
"step": 28250 |
|
}, |
|
{ |
|
"epoch": 0.8177831718717266, |
|
"grad_norm": 21.203125, |
|
"learning_rate": 1.2777908215788618e-07, |
|
"loss": 15.3711, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 0.8192280184651395, |
|
"grad_norm": 23.84375, |
|
"learning_rate": 1.280048402535715e-07, |
|
"loss": 15.4745, |
|
"step": 28350 |
|
}, |
|
{ |
|
"epoch": 0.8206728650585524, |
|
"grad_norm": 22.03125, |
|
"learning_rate": 1.2823059834925678e-07, |
|
"loss": 15.5285, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 0.8221177116519653, |
|
"grad_norm": 20.9375, |
|
"learning_rate": 1.284563564449421e-07, |
|
"loss": 15.5214, |
|
"step": 28450 |
|
}, |
|
{ |
|
"epoch": 0.8235625582453783, |
|
"grad_norm": 25.546875, |
|
"learning_rate": 1.286821145406274e-07, |
|
"loss": 15.4168, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.8250074048387912, |
|
"grad_norm": 24.265625, |
|
"learning_rate": 1.2890787263631271e-07, |
|
"loss": 15.5043, |
|
"step": 28550 |
|
}, |
|
{ |
|
"epoch": 0.8264522514322041, |
|
"grad_norm": 23.265625, |
|
"learning_rate": 1.2913363073199804e-07, |
|
"loss": 15.4206, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 0.8278970980256172, |
|
"grad_norm": 22.0, |
|
"learning_rate": 1.2935938882768334e-07, |
|
"loss": 15.4444, |
|
"step": 28650 |
|
}, |
|
{ |
|
"epoch": 0.8293419446190301, |
|
"grad_norm": 25.09375, |
|
"learning_rate": 1.2958514692336867e-07, |
|
"loss": 15.4043, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 0.830786791212443, |
|
"grad_norm": 21.046875, |
|
"learning_rate": 1.2981090501905398e-07, |
|
"loss": 15.5465, |
|
"step": 28750 |
|
}, |
|
{ |
|
"epoch": 0.832231637805856, |
|
"grad_norm": 21.234375, |
|
"learning_rate": 1.300366631147393e-07, |
|
"loss": 15.4988, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 0.8336764843992689, |
|
"grad_norm": 21.046875, |
|
"learning_rate": 1.302624212104246e-07, |
|
"loss": 15.4368, |
|
"step": 28850 |
|
}, |
|
{ |
|
"epoch": 0.8351213309926818, |
|
"grad_norm": 23.46875, |
|
"learning_rate": 1.304881793061099e-07, |
|
"loss": 15.4251, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 0.8365661775860948, |
|
"grad_norm": 23.046875, |
|
"learning_rate": 1.3071393740179524e-07, |
|
"loss": 15.4271, |
|
"step": 28950 |
|
}, |
|
{ |
|
"epoch": 0.8380110241795078, |
|
"grad_norm": 21.0, |
|
"learning_rate": 1.3093969549748054e-07, |
|
"loss": 15.4439, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.8394558707729207, |
|
"grad_norm": 21.96875, |
|
"learning_rate": 1.3116545359316584e-07, |
|
"loss": 15.4197, |
|
"step": 29050 |
|
}, |
|
{ |
|
"epoch": 0.8409007173663336, |
|
"grad_norm": 21.109375, |
|
"learning_rate": 1.3139121168885115e-07, |
|
"loss": 15.428, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 0.8423455639597466, |
|
"grad_norm": 21.984375, |
|
"learning_rate": 1.3161696978453645e-07, |
|
"loss": 15.3989, |
|
"step": 29150 |
|
}, |
|
{ |
|
"epoch": 0.8437904105531595, |
|
"grad_norm": 21.921875, |
|
"learning_rate": 1.3184272788022178e-07, |
|
"loss": 15.4178, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 0.8452352571465724, |
|
"grad_norm": 20.765625, |
|
"learning_rate": 1.3206848597590708e-07, |
|
"loss": 15.3614, |
|
"step": 29250 |
|
}, |
|
{ |
|
"epoch": 0.8466801037399854, |
|
"grad_norm": 21.390625, |
|
"learning_rate": 1.322942440715924e-07, |
|
"loss": 15.4306, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 0.8481249503333983, |
|
"grad_norm": 28.84375, |
|
"learning_rate": 1.325200021672777e-07, |
|
"loss": 15.4706, |
|
"step": 29350 |
|
}, |
|
{ |
|
"epoch": 0.8495697969268113, |
|
"grad_norm": 22.921875, |
|
"learning_rate": 1.32745760262963e-07, |
|
"loss": 15.4703, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 0.8510146435202243, |
|
"grad_norm": 36.5625, |
|
"learning_rate": 1.3297151835864834e-07, |
|
"loss": 15.45, |
|
"step": 29450 |
|
}, |
|
{ |
|
"epoch": 0.8524594901136372, |
|
"grad_norm": 24.25, |
|
"learning_rate": 1.3319727645433364e-07, |
|
"loss": 15.4233, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.8539043367070501, |
|
"grad_norm": 34.96875, |
|
"learning_rate": 1.3342303455001897e-07, |
|
"loss": 15.4693, |
|
"step": 29550 |
|
}, |
|
{ |
|
"epoch": 0.8553491833004631, |
|
"grad_norm": 21.953125, |
|
"learning_rate": 1.3364879264570427e-07, |
|
"loss": 15.4402, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 0.856794029893876, |
|
"grad_norm": 22.40625, |
|
"learning_rate": 1.338745507413896e-07, |
|
"loss": 15.4761, |
|
"step": 29650 |
|
}, |
|
{ |
|
"epoch": 0.8582388764872889, |
|
"grad_norm": 20.625, |
|
"learning_rate": 1.341003088370749e-07, |
|
"loss": 15.3796, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 0.859683723080702, |
|
"grad_norm": 21.328125, |
|
"learning_rate": 1.3432606693276018e-07, |
|
"loss": 15.3857, |
|
"step": 29750 |
|
}, |
|
{ |
|
"epoch": 0.8611285696741149, |
|
"grad_norm": 23.375, |
|
"learning_rate": 1.345518250284455e-07, |
|
"loss": 15.414, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 0.8625734162675278, |
|
"grad_norm": 22.671875, |
|
"learning_rate": 1.347775831241308e-07, |
|
"loss": 15.3401, |
|
"step": 29850 |
|
}, |
|
{ |
|
"epoch": 0.8640182628609407, |
|
"grad_norm": 22.65625, |
|
"learning_rate": 1.3500334121981614e-07, |
|
"loss": 15.346, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 0.8654631094543537, |
|
"grad_norm": 23.890625, |
|
"learning_rate": 1.3522909931550144e-07, |
|
"loss": 15.42, |
|
"step": 29950 |
|
}, |
|
{ |
|
"epoch": 0.8669079560477666, |
|
"grad_norm": 20.515625, |
|
"learning_rate": 1.3545485741118675e-07, |
|
"loss": 15.389, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.8669079560477666, |
|
"eval_loss": 1.9219062328338623, |
|
"eval_runtime": 349.965, |
|
"eval_samples_per_second": 2664.65, |
|
"eval_steps_per_second": 41.636, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.8683528026411795, |
|
"grad_norm": 21.8125, |
|
"learning_rate": 1.3568061550687207e-07, |
|
"loss": 15.327, |
|
"step": 30050 |
|
}, |
|
{ |
|
"epoch": 0.8697976492345925, |
|
"grad_norm": 22.8125, |
|
"learning_rate": 1.3590637360255738e-07, |
|
"loss": 15.3229, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 0.8712424958280055, |
|
"grad_norm": 23.671875, |
|
"learning_rate": 1.361321316982427e-07, |
|
"loss": 15.3576, |
|
"step": 30150 |
|
}, |
|
{ |
|
"epoch": 0.8726873424214184, |
|
"grad_norm": 22.609375, |
|
"learning_rate": 1.36357889793928e-07, |
|
"loss": 15.3539, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 0.8741321890148314, |
|
"grad_norm": 20.65625, |
|
"learning_rate": 1.365836478896133e-07, |
|
"loss": 15.2693, |
|
"step": 30250 |
|
}, |
|
{ |
|
"epoch": 0.8755770356082443, |
|
"grad_norm": 22.5625, |
|
"learning_rate": 1.3680940598529864e-07, |
|
"loss": 15.4018, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 0.8770218822016572, |
|
"grad_norm": 19.875, |
|
"learning_rate": 1.3703516408098394e-07, |
|
"loss": 15.4242, |
|
"step": 30350 |
|
}, |
|
{ |
|
"epoch": 0.8784667287950702, |
|
"grad_norm": 27.234375, |
|
"learning_rate": 1.3726092217666924e-07, |
|
"loss": 15.3294, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 0.8799115753884831, |
|
"grad_norm": 23.375, |
|
"learning_rate": 1.3748668027235455e-07, |
|
"loss": 15.3841, |
|
"step": 30450 |
|
}, |
|
{ |
|
"epoch": 0.8813564219818961, |
|
"grad_norm": 23.125, |
|
"learning_rate": 1.3771243836803988e-07, |
|
"loss": 15.3368, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.882801268575309, |
|
"grad_norm": 23.171875, |
|
"learning_rate": 1.3793819646372518e-07, |
|
"loss": 15.343, |
|
"step": 30550 |
|
}, |
|
{ |
|
"epoch": 0.884246115168722, |
|
"grad_norm": 29.78125, |
|
"learning_rate": 1.3816395455941048e-07, |
|
"loss": 15.3782, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 0.8856909617621349, |
|
"grad_norm": 22.453125, |
|
"learning_rate": 1.383897126550958e-07, |
|
"loss": 15.4537, |
|
"step": 30650 |
|
}, |
|
{ |
|
"epoch": 0.8871358083555478, |
|
"grad_norm": 21.265625, |
|
"learning_rate": 1.386154707507811e-07, |
|
"loss": 15.4048, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 0.8885806549489608, |
|
"grad_norm": 24.25, |
|
"learning_rate": 1.3884122884646644e-07, |
|
"loss": 15.3433, |
|
"step": 30750 |
|
}, |
|
{ |
|
"epoch": 0.8900255015423737, |
|
"grad_norm": 25.1875, |
|
"learning_rate": 1.3906698694215174e-07, |
|
"loss": 15.3141, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 0.8914703481357866, |
|
"grad_norm": 24.25, |
|
"learning_rate": 1.3929274503783704e-07, |
|
"loss": 15.2703, |
|
"step": 30850 |
|
}, |
|
{ |
|
"epoch": 0.8929151947291997, |
|
"grad_norm": 22.3125, |
|
"learning_rate": 1.3951850313352237e-07, |
|
"loss": 15.4022, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 0.8943600413226126, |
|
"grad_norm": 19.859375, |
|
"learning_rate": 1.3974426122920768e-07, |
|
"loss": 15.2936, |
|
"step": 30950 |
|
}, |
|
{ |
|
"epoch": 0.8958048879160255, |
|
"grad_norm": 20.5, |
|
"learning_rate": 1.39970019324893e-07, |
|
"loss": 15.3219, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.8972497345094385, |
|
"grad_norm": 21.71875, |
|
"learning_rate": 1.4019577742057828e-07, |
|
"loss": 15.2468, |
|
"step": 31050 |
|
}, |
|
{ |
|
"epoch": 0.8986945811028514, |
|
"grad_norm": 23.421875, |
|
"learning_rate": 1.4042153551626358e-07, |
|
"loss": 15.2591, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 0.9001394276962643, |
|
"grad_norm": 23.09375, |
|
"learning_rate": 1.406472936119489e-07, |
|
"loss": 15.3318, |
|
"step": 31150 |
|
}, |
|
{ |
|
"epoch": 0.9015842742896772, |
|
"grad_norm": 24.09375, |
|
"learning_rate": 1.4087305170763421e-07, |
|
"loss": 15.2105, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 0.9030291208830903, |
|
"grad_norm": 22.671875, |
|
"learning_rate": 1.4109880980331954e-07, |
|
"loss": 15.2557, |
|
"step": 31250 |
|
}, |
|
{ |
|
"epoch": 0.9044739674765032, |
|
"grad_norm": 22.0625, |
|
"learning_rate": 1.4132456789900484e-07, |
|
"loss": 15.4014, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 0.9059188140699161, |
|
"grad_norm": 21.796875, |
|
"learning_rate": 1.4155032599469017e-07, |
|
"loss": 15.2382, |
|
"step": 31350 |
|
}, |
|
{ |
|
"epoch": 0.9073636606633291, |
|
"grad_norm": 24.5, |
|
"learning_rate": 1.4177608409037548e-07, |
|
"loss": 15.395, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 0.908808507256742, |
|
"grad_norm": 21.828125, |
|
"learning_rate": 1.4200184218606078e-07, |
|
"loss": 15.2785, |
|
"step": 31450 |
|
}, |
|
{ |
|
"epoch": 0.9102533538501549, |
|
"grad_norm": 22.5625, |
|
"learning_rate": 1.422276002817461e-07, |
|
"loss": 15.2983, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.9116982004435679, |
|
"grad_norm": 21.328125, |
|
"learning_rate": 1.424533583774314e-07, |
|
"loss": 15.382, |
|
"step": 31550 |
|
}, |
|
{ |
|
"epoch": 0.9131430470369809, |
|
"grad_norm": 21.3125, |
|
"learning_rate": 1.4267911647311674e-07, |
|
"loss": 15.2084, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 0.9145878936303938, |
|
"grad_norm": 22.6875, |
|
"learning_rate": 1.4290487456880204e-07, |
|
"loss": 15.2803, |
|
"step": 31650 |
|
}, |
|
{ |
|
"epoch": 0.9160327402238068, |
|
"grad_norm": 20.953125, |
|
"learning_rate": 1.4313063266448734e-07, |
|
"loss": 15.3734, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 0.9174775868172197, |
|
"grad_norm": 22.765625, |
|
"learning_rate": 1.4335639076017265e-07, |
|
"loss": 15.3248, |
|
"step": 31750 |
|
}, |
|
{ |
|
"epoch": 0.9189224334106326, |
|
"grad_norm": 21.640625, |
|
"learning_rate": 1.4358214885585795e-07, |
|
"loss": 15.2958, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 0.9203672800040456, |
|
"grad_norm": 21.53125, |
|
"learning_rate": 1.4380790695154328e-07, |
|
"loss": 15.2854, |
|
"step": 31850 |
|
}, |
|
{ |
|
"epoch": 0.9218121265974585, |
|
"grad_norm": 21.265625, |
|
"learning_rate": 1.4403366504722858e-07, |
|
"loss": 15.2983, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 0.9232569731908714, |
|
"grad_norm": 26.265625, |
|
"learning_rate": 1.4425942314291388e-07, |
|
"loss": 15.1967, |
|
"step": 31950 |
|
}, |
|
{ |
|
"epoch": 0.9247018197842845, |
|
"grad_norm": 21.078125, |
|
"learning_rate": 1.444851812385992e-07, |
|
"loss": 15.2861, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.9261466663776974, |
|
"grad_norm": 22.53125, |
|
"learning_rate": 1.447109393342845e-07, |
|
"loss": 15.203, |
|
"step": 32050 |
|
}, |
|
{ |
|
"epoch": 0.9275915129711103, |
|
"grad_norm": 20.46875, |
|
"learning_rate": 1.4493669742996984e-07, |
|
"loss": 15.3343, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 0.9290363595645232, |
|
"grad_norm": 21.5625, |
|
"learning_rate": 1.4516245552565514e-07, |
|
"loss": 15.1377, |
|
"step": 32150 |
|
}, |
|
{ |
|
"epoch": 0.9304812061579362, |
|
"grad_norm": 23.609375, |
|
"learning_rate": 1.4538821362134047e-07, |
|
"loss": 15.267, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 0.9319260527513491, |
|
"grad_norm": 22.59375, |
|
"learning_rate": 1.4561397171702577e-07, |
|
"loss": 15.3935, |
|
"step": 32250 |
|
}, |
|
{ |
|
"epoch": 0.933370899344762, |
|
"grad_norm": 23.90625, |
|
"learning_rate": 1.4583972981271108e-07, |
|
"loss": 15.2605, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 0.9348157459381751, |
|
"grad_norm": 23.171875, |
|
"learning_rate": 1.460654879083964e-07, |
|
"loss": 15.2479, |
|
"step": 32350 |
|
}, |
|
{ |
|
"epoch": 0.936260592531588, |
|
"grad_norm": 22.734375, |
|
"learning_rate": 1.4629124600408168e-07, |
|
"loss": 15.1586, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 0.9377054391250009, |
|
"grad_norm": 22.078125, |
|
"learning_rate": 1.46517004099767e-07, |
|
"loss": 15.2709, |
|
"step": 32450 |
|
}, |
|
{ |
|
"epoch": 0.9391502857184139, |
|
"grad_norm": 20.484375, |
|
"learning_rate": 1.467427621954523e-07, |
|
"loss": 15.3932, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.9405951323118268, |
|
"grad_norm": 20.171875, |
|
"learning_rate": 1.4696852029113762e-07, |
|
"loss": 15.3021, |
|
"step": 32550 |
|
}, |
|
{ |
|
"epoch": 0.9420399789052397, |
|
"grad_norm": 20.375, |
|
"learning_rate": 1.4719427838682294e-07, |
|
"loss": 15.3676, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 0.9434848254986526, |
|
"grad_norm": 18.40625, |
|
"learning_rate": 1.4742003648250825e-07, |
|
"loss": 15.225, |
|
"step": 32650 |
|
}, |
|
{ |
|
"epoch": 0.9449296720920656, |
|
"grad_norm": 22.921875, |
|
"learning_rate": 1.4764579457819357e-07, |
|
"loss": 15.2833, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 0.9463745186854786, |
|
"grad_norm": 21.359375, |
|
"learning_rate": 1.4787155267387888e-07, |
|
"loss": 15.2648, |
|
"step": 32750 |
|
}, |
|
{ |
|
"epoch": 0.9478193652788915, |
|
"grad_norm": 18.03125, |
|
"learning_rate": 1.480973107695642e-07, |
|
"loss": 15.1701, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 0.9492642118723045, |
|
"grad_norm": 20.734375, |
|
"learning_rate": 1.483230688652495e-07, |
|
"loss": 15.3169, |
|
"step": 32850 |
|
}, |
|
{ |
|
"epoch": 0.9507090584657174, |
|
"grad_norm": 30.40625, |
|
"learning_rate": 1.485488269609348e-07, |
|
"loss": 15.1193, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 0.9521539050591303, |
|
"grad_norm": 18.96875, |
|
"learning_rate": 1.4877458505662014e-07, |
|
"loss": 15.1477, |
|
"step": 32950 |
|
}, |
|
{ |
|
"epoch": 0.9535987516525433, |
|
"grad_norm": 22.125, |
|
"learning_rate": 1.4900034315230544e-07, |
|
"loss": 15.1363, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.9535987516525433, |
|
"eval_loss": 1.9005507230758667, |
|
"eval_runtime": 343.9939, |
|
"eval_samples_per_second": 2710.903, |
|
"eval_steps_per_second": 42.358, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.9550435982459562, |
|
"grad_norm": 20.171875, |
|
"learning_rate": 1.4922610124799077e-07, |
|
"loss": 15.2704, |
|
"step": 33050 |
|
}, |
|
{ |
|
"epoch": 0.9564884448393692, |
|
"grad_norm": 25.21875, |
|
"learning_rate": 1.4945185934367605e-07, |
|
"loss": 15.2413, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 0.9579332914327822, |
|
"grad_norm": 20.984375, |
|
"learning_rate": 1.4967761743936135e-07, |
|
"loss": 15.0358, |
|
"step": 33150 |
|
}, |
|
{ |
|
"epoch": 0.9593781380261951, |
|
"grad_norm": 22.65625, |
|
"learning_rate": 1.4990337553504668e-07, |
|
"loss": 15.148, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 0.960822984619608, |
|
"grad_norm": 24.671875, |
|
"learning_rate": 1.5012913363073198e-07, |
|
"loss": 15.0575, |
|
"step": 33250 |
|
}, |
|
{ |
|
"epoch": 0.962267831213021, |
|
"grad_norm": 21.28125, |
|
"learning_rate": 1.503548917264173e-07, |
|
"loss": 15.1119, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 0.9637126778064339, |
|
"grad_norm": 24.21875, |
|
"learning_rate": 1.505806498221026e-07, |
|
"loss": 15.21, |
|
"step": 33350 |
|
}, |
|
{ |
|
"epoch": 0.9651575243998468, |
|
"grad_norm": 21.6875, |
|
"learning_rate": 1.5080640791778791e-07, |
|
"loss": 15.1355, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 0.9666023709932597, |
|
"grad_norm": 24.390625, |
|
"learning_rate": 1.5103216601347324e-07, |
|
"loss": 15.2218, |
|
"step": 33450 |
|
}, |
|
{ |
|
"epoch": 0.9680472175866728, |
|
"grad_norm": 19.25, |
|
"learning_rate": 1.5125792410915854e-07, |
|
"loss": 15.1256, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.9694920641800857, |
|
"grad_norm": 19.984375, |
|
"learning_rate": 1.5148368220484387e-07, |
|
"loss": 15.1171, |
|
"step": 33550 |
|
}, |
|
{ |
|
"epoch": 0.9709369107734986, |
|
"grad_norm": 19.640625, |
|
"learning_rate": 1.5170944030052918e-07, |
|
"loss": 15.0999, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 0.9723817573669116, |
|
"grad_norm": 24.265625, |
|
"learning_rate": 1.519351983962145e-07, |
|
"loss": 15.2255, |
|
"step": 33650 |
|
}, |
|
{ |
|
"epoch": 0.9738266039603245, |
|
"grad_norm": 25.546875, |
|
"learning_rate": 1.521609564918998e-07, |
|
"loss": 15.0743, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 0.9752714505537374, |
|
"grad_norm": 21.578125, |
|
"learning_rate": 1.5238671458758508e-07, |
|
"loss": 15.145, |
|
"step": 33750 |
|
}, |
|
{ |
|
"epoch": 0.9767162971471504, |
|
"grad_norm": 24.46875, |
|
"learning_rate": 1.526124726832704e-07, |
|
"loss": 15.2408, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 0.9781611437405634, |
|
"grad_norm": 21.984375, |
|
"learning_rate": 1.5283823077895571e-07, |
|
"loss": 15.1413, |
|
"step": 33850 |
|
}, |
|
{ |
|
"epoch": 0.9796059903339763, |
|
"grad_norm": 21.828125, |
|
"learning_rate": 1.5306398887464104e-07, |
|
"loss": 15.1452, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 0.9810508369273893, |
|
"grad_norm": 22.125, |
|
"learning_rate": 1.5328974697032635e-07, |
|
"loss": 15.1786, |
|
"step": 33950 |
|
}, |
|
{ |
|
"epoch": 0.9824956835208022, |
|
"grad_norm": 27.046875, |
|
"learning_rate": 1.5351550506601165e-07, |
|
"loss": 15.069, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.9839405301142151, |
|
"grad_norm": 21.65625, |
|
"learning_rate": 1.5374126316169698e-07, |
|
"loss": 15.1776, |
|
"step": 34050 |
|
}, |
|
{ |
|
"epoch": 0.985385376707628, |
|
"grad_norm": 21.953125, |
|
"learning_rate": 1.5396702125738228e-07, |
|
"loss": 15.1561, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 0.986830223301041, |
|
"grad_norm": 25.75, |
|
"learning_rate": 1.541927793530676e-07, |
|
"loss": 15.2242, |
|
"step": 34150 |
|
}, |
|
{ |
|
"epoch": 0.9882750698944539, |
|
"grad_norm": 23.484375, |
|
"learning_rate": 1.544185374487529e-07, |
|
"loss": 15.1583, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 0.989719916487867, |
|
"grad_norm": 27.984375, |
|
"learning_rate": 1.546442955444382e-07, |
|
"loss": 15.1159, |
|
"step": 34250 |
|
}, |
|
{ |
|
"epoch": 0.9911647630812799, |
|
"grad_norm": 21.34375, |
|
"learning_rate": 1.5487005364012354e-07, |
|
"loss": 15.0641, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 0.9926096096746928, |
|
"grad_norm": 20.25, |
|
"learning_rate": 1.5509581173580884e-07, |
|
"loss": 15.139, |
|
"step": 34350 |
|
}, |
|
{ |
|
"epoch": 0.9940544562681057, |
|
"grad_norm": 21.640625, |
|
"learning_rate": 1.5532156983149415e-07, |
|
"loss": 15.0966, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 0.9954993028615187, |
|
"grad_norm": 22.125, |
|
"learning_rate": 1.5554732792717945e-07, |
|
"loss": 15.1072, |
|
"step": 34450 |
|
}, |
|
{ |
|
"epoch": 0.9969441494549316, |
|
"grad_norm": 21.859375, |
|
"learning_rate": 1.5577308602286478e-07, |
|
"loss": 15.1392, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.9983889960483445, |
|
"grad_norm": 21.703125, |
|
"learning_rate": 1.5599884411855008e-07, |
|
"loss": 15.2097, |
|
"step": 34550 |
|
}, |
|
{ |
|
"epoch": 0.9998338426417576, |
|
"grad_norm": 23.140625, |
|
"learning_rate": 1.5622460221423538e-07, |
|
"loss": 15.0667, |
|
"step": 34600 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 34605, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.042588809533587e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |