|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 8.807339449541285, |
|
"eval_steps": 500, |
|
"global_step": 1080, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.08154943934760449, |
|
"grad_norm": 13.197553634643555, |
|
"learning_rate": 9.259259259259259e-07, |
|
"loss": 1.1931, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.16309887869520898, |
|
"grad_norm": 15.166808128356934, |
|
"learning_rate": 1.8518518518518519e-06, |
|
"loss": 1.123, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.24464831804281345, |
|
"grad_norm": 16.447948455810547, |
|
"learning_rate": 2.7777777777777783e-06, |
|
"loss": 1.1451, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.32619775739041795, |
|
"grad_norm": 9.410321235656738, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 1.0174, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.4077471967380224, |
|
"grad_norm": 13.654789924621582, |
|
"learning_rate": 4.62962962962963e-06, |
|
"loss": 1.0555, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4892966360856269, |
|
"grad_norm": 8.732908248901367, |
|
"learning_rate": 5.555555555555557e-06, |
|
"loss": 1.1169, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5708460754332314, |
|
"grad_norm": 16.178895950317383, |
|
"learning_rate": 6.481481481481482e-06, |
|
"loss": 1.1143, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6523955147808359, |
|
"grad_norm": 20.069887161254883, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 0.9055, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7339449541284404, |
|
"grad_norm": 7.063018798828125, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.0901, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8154943934760448, |
|
"grad_norm": 10.402606964111328, |
|
"learning_rate": 9.25925925925926e-06, |
|
"loss": 0.98, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8970438328236493, |
|
"grad_norm": 11.173501014709473, |
|
"learning_rate": 9.999895536228031e-06, |
|
"loss": 1.1819, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.9785932721712538, |
|
"grad_norm": 9.713117599487305, |
|
"learning_rate": 9.996239762521152e-06, |
|
"loss": 1.019, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.0601427115188584, |
|
"grad_norm": 6.5643534660339355, |
|
"learning_rate": 9.987365164467767e-06, |
|
"loss": 0.8775, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.1416921508664628, |
|
"grad_norm": 7.599371910095215, |
|
"learning_rate": 9.973281012033009e-06, |
|
"loss": 0.7782, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.2232415902140672, |
|
"grad_norm": 7.510239601135254, |
|
"learning_rate": 9.954002016824226e-06, |
|
"loss": 0.7959, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.3047910295616718, |
|
"grad_norm": 5.231692790985107, |
|
"learning_rate": 9.929548316723983e-06, |
|
"loss": 0.7244, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.3863404689092762, |
|
"grad_norm": 15.204421043395996, |
|
"learning_rate": 9.899945454855007e-06, |
|
"loss": 0.6964, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.4678899082568808, |
|
"grad_norm": 6.420439720153809, |
|
"learning_rate": 9.86522435289912e-06, |
|
"loss": 0.6273, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.5494393476044852, |
|
"grad_norm": 5.670159816741943, |
|
"learning_rate": 9.825421278797984e-06, |
|
"loss": 0.6795, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.6309887869520896, |
|
"grad_norm": 7.807967185974121, |
|
"learning_rate": 9.7805778088694e-06, |
|
"loss": 0.7074, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.7125382262996942, |
|
"grad_norm": 5.227024078369141, |
|
"learning_rate": 9.730740784378755e-06, |
|
"loss": 0.7545, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.7940876656472988, |
|
"grad_norm": 10.772171020507812, |
|
"learning_rate": 9.67596226261095e-06, |
|
"loss": 0.7335, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.8756371049949032, |
|
"grad_norm": 5.900475025177002, |
|
"learning_rate": 9.616299462493952e-06, |
|
"loss": 0.7046, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.9571865443425076, |
|
"grad_norm": 8.141891479492188, |
|
"learning_rate": 9.551814704830734e-06, |
|
"loss": 0.6498, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.038735983690112, |
|
"grad_norm": 7.0482659339904785, |
|
"learning_rate": 9.482575347202047e-06, |
|
"loss": 0.6264, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.120285423037717, |
|
"grad_norm": 9.526334762573242, |
|
"learning_rate": 9.40865371360804e-06, |
|
"loss": 0.4823, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.2018348623853212, |
|
"grad_norm": 4.857308864593506, |
|
"learning_rate": 9.330127018922195e-06, |
|
"loss": 0.4391, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.2833843017329256, |
|
"grad_norm": 15.281082153320312, |
|
"learning_rate": 9.247077288236488e-06, |
|
"loss": 0.3913, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.36493374108053, |
|
"grad_norm": 9.266241073608398, |
|
"learning_rate": 9.159591271182058e-06, |
|
"loss": 0.4327, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.4464831804281344, |
|
"grad_norm": 4.62817907333374, |
|
"learning_rate": 9.067760351314838e-06, |
|
"loss": 0.4115, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.528032619775739, |
|
"grad_norm": 6.626443862915039, |
|
"learning_rate": 8.97168045066082e-06, |
|
"loss": 0.3849, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.6095820591233436, |
|
"grad_norm": 7.996133804321289, |
|
"learning_rate": 8.871451929520662e-06, |
|
"loss": 0.4089, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.691131498470948, |
|
"grad_norm": 7.03031587600708, |
|
"learning_rate": 8.767179481638303e-06, |
|
"loss": 0.451, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.7726809378185524, |
|
"grad_norm": 5.80501127243042, |
|
"learning_rate": 8.658972024843063e-06, |
|
"loss": 0.4384, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.8542303771661572, |
|
"grad_norm": 7.129272937774658, |
|
"learning_rate": 8.546942587279465e-06, |
|
"loss": 0.4124, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.9357798165137616, |
|
"grad_norm": 6.861177921295166, |
|
"learning_rate": 8.43120818934367e-06, |
|
"loss": 0.4244, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.017329255861366, |
|
"grad_norm": 4.206913948059082, |
|
"learning_rate": 8.31188972144974e-06, |
|
"loss": 0.3352, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.0988786952089704, |
|
"grad_norm": 6.867356300354004, |
|
"learning_rate": 8.18911181775353e-06, |
|
"loss": 0.2296, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.180428134556575, |
|
"grad_norm": 6.237820625305176, |
|
"learning_rate": 8.063002725966014e-06, |
|
"loss": 0.2352, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.261977573904179, |
|
"grad_norm": 5.08056640625, |
|
"learning_rate": 7.93369417339209e-06, |
|
"loss": 0.2583, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.343527013251784, |
|
"grad_norm": 15.075705528259277, |
|
"learning_rate": 7.801321229334764e-06, |
|
"loss": 0.2715, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.4250764525993884, |
|
"grad_norm": 3.862241744995117, |
|
"learning_rate": 7.666022164008458e-06, |
|
"loss": 0.2239, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.506625891946993, |
|
"grad_norm": 3.4025380611419678, |
|
"learning_rate": 7.527938304108795e-06, |
|
"loss": 0.2292, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.588175331294597, |
|
"grad_norm": 3.6665425300598145, |
|
"learning_rate": 7.387213885189746e-06, |
|
"loss": 0.2294, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.669724770642202, |
|
"grad_norm": 6.751482009887695, |
|
"learning_rate": 7.243995901002312e-06, |
|
"loss": 0.2348, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.7512742099898064, |
|
"grad_norm": 2.690695285797119, |
|
"learning_rate": 7.098433949952146e-06, |
|
"loss": 0.2471, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.832823649337411, |
|
"grad_norm": 3.8786795139312744, |
|
"learning_rate": 6.950680078836475e-06, |
|
"loss": 0.2518, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.914373088685015, |
|
"grad_norm": 4.775713920593262, |
|
"learning_rate": 6.800888624023552e-06, |
|
"loss": 0.2274, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.9959225280326196, |
|
"grad_norm": 2.959533929824829, |
|
"learning_rate": 6.649216050240539e-06, |
|
"loss": 0.245, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 4.077471967380224, |
|
"grad_norm": 1.844783067703247, |
|
"learning_rate": 6.495820787138209e-06, |
|
"loss": 0.1411, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.077471967380224, |
|
"eval_loss": 1.580026388168335, |
|
"eval_runtime": 3.9804, |
|
"eval_samples_per_second": 27.636, |
|
"eval_steps_per_second": 27.636, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.159021406727828, |
|
"grad_norm": 6.747076988220215, |
|
"learning_rate": 6.340863063803187e-06, |
|
"loss": 0.1365, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.240570846075434, |
|
"grad_norm": 2.562960624694824, |
|
"learning_rate": 6.184504741390596e-06, |
|
"loss": 0.1192, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.322120285423038, |
|
"grad_norm": 5.0884552001953125, |
|
"learning_rate": 6.02690914405191e-06, |
|
"loss": 0.1517, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 4.4036697247706424, |
|
"grad_norm": 3.955160617828369, |
|
"learning_rate": 5.8682408883346535e-06, |
|
"loss": 0.0783, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.485219164118247, |
|
"grad_norm": 6.160187721252441, |
|
"learning_rate": 5.708665711232103e-06, |
|
"loss": 0.1147, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.566768603465851, |
|
"grad_norm": 1.819947361946106, |
|
"learning_rate": 5.548350297062659e-06, |
|
"loss": 0.1061, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.648318042813456, |
|
"grad_norm": 5.095085620880127, |
|
"learning_rate": 5.387462103359655e-06, |
|
"loss": 0.1298, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.72986748216106, |
|
"grad_norm": 5.54387092590332, |
|
"learning_rate": 5.2261691859535325e-06, |
|
"loss": 0.1029, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.811416921508664, |
|
"grad_norm": 1.481745958328247, |
|
"learning_rate": 5.064640023429042e-06, |
|
"loss": 0.0996, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.892966360856269, |
|
"grad_norm": 3.6181232929229736, |
|
"learning_rate": 4.903043341140879e-06, |
|
"loss": 0.1103, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.974515800203873, |
|
"grad_norm": 5.307178497314453, |
|
"learning_rate": 4.741547934971528e-06, |
|
"loss": 0.126, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 5.0560652395514785, |
|
"grad_norm": 2.1945080757141113, |
|
"learning_rate": 4.580322495015466e-06, |
|
"loss": 0.0616, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 5.137614678899083, |
|
"grad_norm": 3.431023597717285, |
|
"learning_rate": 4.4195354293738484e-06, |
|
"loss": 0.049, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 5.219164118246687, |
|
"grad_norm": 4.72199010848999, |
|
"learning_rate": 4.259354688243758e-06, |
|
"loss": 0.0563, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 5.300713557594292, |
|
"grad_norm": 2.1474854946136475, |
|
"learning_rate": 4.099947588485744e-06, |
|
"loss": 0.0464, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 5.382262996941896, |
|
"grad_norm": 1.2775018215179443, |
|
"learning_rate": 3.941480638852948e-06, |
|
"loss": 0.0454, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 5.4638124362895, |
|
"grad_norm": 2.0239715576171875, |
|
"learning_rate": 3.784119366064293e-06, |
|
"loss": 0.0776, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 5.545361875637105, |
|
"grad_norm": 9.16295051574707, |
|
"learning_rate": 3.6280281419034934e-06, |
|
"loss": 0.0565, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 5.626911314984709, |
|
"grad_norm": 3.2734322547912598, |
|
"learning_rate": 3.473370011524435e-06, |
|
"loss": 0.0624, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 5.708460754332314, |
|
"grad_norm": 2.0991153717041016, |
|
"learning_rate": 3.3203065231422904e-06, |
|
"loss": 0.0506, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 5.790010193679919, |
|
"grad_norm": 2.030103921890259, |
|
"learning_rate": 3.1689975592882603e-06, |
|
"loss": 0.0482, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 5.871559633027523, |
|
"grad_norm": 3.2920758724212646, |
|
"learning_rate": 3.019601169804216e-06, |
|
"loss": 0.062, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 5.953109072375128, |
|
"grad_norm": 8.777363777160645, |
|
"learning_rate": 2.8722734067516637e-06, |
|
"loss": 0.0573, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 6.034658511722732, |
|
"grad_norm": 1.1958752870559692, |
|
"learning_rate": 2.7271681614074973e-06, |
|
"loss": 0.0389, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 6.116207951070336, |
|
"grad_norm": 5.468116283416748, |
|
"learning_rate": 2.5844370035168077e-06, |
|
"loss": 0.0331, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 6.197757390417941, |
|
"grad_norm": 0.45254144072532654, |
|
"learning_rate": 2.4442290229706344e-06, |
|
"loss": 0.0248, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 6.279306829765545, |
|
"grad_norm": 1.1617120504379272, |
|
"learning_rate": 2.3066906740740626e-06, |
|
"loss": 0.023, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 6.36085626911315, |
|
"grad_norm": 1.583760380744934, |
|
"learning_rate": 2.171965622567308e-06, |
|
"loss": 0.0167, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 6.442405708460754, |
|
"grad_norm": 0.8195653557777405, |
|
"learning_rate": 2.0401945955596206e-06, |
|
"loss": 0.0369, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 6.523955147808358, |
|
"grad_norm": 1.8565921783447266, |
|
"learning_rate": 1.9115152345327154e-06, |
|
"loss": 0.0183, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 6.605504587155964, |
|
"grad_norm": 1.3874293565750122, |
|
"learning_rate": 1.7860619515673034e-06, |
|
"loss": 0.0191, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 6.687054026503568, |
|
"grad_norm": 1.5542582273483276, |
|
"learning_rate": 1.6639657889429017e-06, |
|
"loss": 0.0183, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 6.7686034658511725, |
|
"grad_norm": 1.0783458948135376, |
|
"learning_rate": 1.5453542822575624e-06, |
|
"loss": 0.02, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 6.850152905198777, |
|
"grad_norm": 1.0901105403900146, |
|
"learning_rate": 1.4303513272105057e-06, |
|
"loss": 0.0197, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 6.931702344546381, |
|
"grad_norm": 1.554459810256958, |
|
"learning_rate": 1.3190770501868243e-06, |
|
"loss": 0.0211, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 7.013251783893986, |
|
"grad_norm": 3.050743818283081, |
|
"learning_rate": 1.2116476827794104e-06, |
|
"loss": 0.0148, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 7.09480122324159, |
|
"grad_norm": 0.37844064831733704, |
|
"learning_rate": 1.1081754403792e-06, |
|
"loss": 0.0063, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 7.176350662589194, |
|
"grad_norm": 0.4527202844619751, |
|
"learning_rate": 1.008768404960535e-06, |
|
"loss": 0.0087, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 7.257900101936799, |
|
"grad_norm": 0.7698076963424683, |
|
"learning_rate": 9.135304121840976e-07, |
|
"loss": 0.0086, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 7.339449541284404, |
|
"grad_norm": 0.1463124006986618, |
|
"learning_rate": 8.225609429353187e-07, |
|
"loss": 0.0064, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 7.4209989806320085, |
|
"grad_norm": 0.4381123483181, |
|
"learning_rate": 7.35955019411585e-07, |
|
"loss": 0.0105, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 7.502548419979613, |
|
"grad_norm": 1.877506971359253, |
|
"learning_rate": 6.53803105866761e-07, |
|
"loss": 0.008, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 7.584097859327217, |
|
"grad_norm": 1.034546971321106, |
|
"learning_rate": 5.76191014116711e-07, |
|
"loss": 0.0106, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 7.665647298674822, |
|
"grad_norm": 0.48700782656669617, |
|
"learning_rate": 5.031998139045352e-07, |
|
"loss": 0.0094, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 7.747196738022426, |
|
"grad_norm": 1.1844407320022583, |
|
"learning_rate": 4.349057482191299e-07, |
|
"loss": 0.0108, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 7.82874617737003, |
|
"grad_norm": 1.7279776334762573, |
|
"learning_rate": 3.7138015365554834e-07, |
|
"loss": 0.0143, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 7.910295616717635, |
|
"grad_norm": 0.49893057346343994, |
|
"learning_rate": 3.1268938590032495e-07, |
|
"loss": 0.0091, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 7.991845056065239, |
|
"grad_norm": 0.29836803674697876, |
|
"learning_rate": 2.5889475041961767e-07, |
|
"loss": 0.0063, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 8.073394495412844, |
|
"grad_norm": 0.30645838379859924, |
|
"learning_rate": 2.1005243842255552e-07, |
|
"loss": 0.015, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 8.154943934760448, |
|
"grad_norm": 0.2830670475959778, |
|
"learning_rate": 1.6621346816668993e-07, |
|
"loss": 0.0056, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 8.154943934760448, |
|
"eval_loss": 2.430070638656616, |
|
"eval_runtime": 3.8698, |
|
"eval_samples_per_second": 28.425, |
|
"eval_steps_per_second": 28.425, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 8.236493374108052, |
|
"grad_norm": 0.15629835426807404, |
|
"learning_rate": 1.2742363166685035e-07, |
|
"loss": 0.006, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 8.318042813455657, |
|
"grad_norm": 0.0809037983417511, |
|
"learning_rate": 9.372344686307655e-08, |
|
"loss": 0.0047, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 8.399592252803261, |
|
"grad_norm": 0.775462806224823, |
|
"learning_rate": 6.514811529758747e-08, |
|
"loss": 0.009, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 8.481141692150867, |
|
"grad_norm": 0.14626573026180267, |
|
"learning_rate": 4.172748534499449e-08, |
|
"loss": 0.008, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 8.562691131498472, |
|
"grad_norm": 0.44388526678085327, |
|
"learning_rate": 2.3486021034170857e-08, |
|
"loss": 0.0059, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 8.644240570846076, |
|
"grad_norm": 0.3372404873371124, |
|
"learning_rate": 1.044277649433989e-08, |
|
"loss": 0.006, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 8.72579001019368, |
|
"grad_norm": 0.5315991640090942, |
|
"learning_rate": 2.611376052073511e-09, |
|
"loss": 0.007, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 8.807339449541285, |
|
"grad_norm": 0.2758943736553192, |
|
"learning_rate": 0.0, |
|
"loss": 0.0048, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 8.807339449541285, |
|
"step": 1080, |
|
"total_flos": 1.2056852760477696e+16, |
|
"train_loss": 0.3034052159499239, |
|
"train_runtime": 1803.2161, |
|
"train_samples_per_second": 4.791, |
|
"train_steps_per_second": 0.599 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1080, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2056852760477696e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|