diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6920 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 200, + "global_step": 978, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002044989775051125, + "grad_norm": 2.8995674216310645, + "learning_rate": 9.999974203447434e-06, + "loss": 0.097, + "step": 1 + }, + { + "epoch": 0.00408997955010225, + "grad_norm": 2.0878590818900724, + "learning_rate": 9.999896814055916e-06, + "loss": 0.0793, + "step": 2 + }, + { + "epoch": 0.006134969325153374, + "grad_norm": 3.252004392065858, + "learning_rate": 9.999767832624e-06, + "loss": 0.1446, + "step": 3 + }, + { + "epoch": 0.0081799591002045, + "grad_norm": 2.1719657405930333, + "learning_rate": 9.999587260482597e-06, + "loss": 0.0606, + "step": 4 + }, + { + "epoch": 0.010224948875255624, + "grad_norm": 1.5951295112804458, + "learning_rate": 9.999355099494961e-06, + "loss": 0.0543, + "step": 5 + }, + { + "epoch": 0.012269938650306749, + "grad_norm": 2.0082268910826957, + "learning_rate": 9.999071352056676e-06, + "loss": 0.0752, + "step": 6 + }, + { + "epoch": 0.014314928425357873, + "grad_norm": 1.9536326911273243, + "learning_rate": 9.998736021095621e-06, + "loss": 0.0453, + "step": 7 + }, + { + "epoch": 0.016359918200409, + "grad_norm": 2.13634714300749, + "learning_rate": 9.99834911007195e-06, + "loss": 0.0732, + "step": 8 + }, + { + "epoch": 0.018404907975460124, + "grad_norm": 1.920732150945499, + "learning_rate": 9.99791062297805e-06, + "loss": 0.0541, + "step": 9 + }, + { + "epoch": 0.02044989775051125, + "grad_norm": 2.1324187216203034, + "learning_rate": 9.99742056433851e-06, + "loss": 0.0549, + "step": 10 + }, + { + "epoch": 0.022494887525562373, + "grad_norm": 2.919114524687416, + "learning_rate": 9.99687893921005e-06, + "loss": 0.0895, + "step": 11 + }, + { + "epoch": 0.024539877300613498, + "grad_norm": 1.899625115074746, + "learning_rate": 9.996285753181499e-06, + "loss": 0.0589, + "step": 12 + }, + { + "epoch": 0.026584867075664622, + "grad_norm": 2.5554509832362973, + "learning_rate": 9.99564101237372e-06, + "loss": 0.0785, + "step": 13 + }, + { + "epoch": 0.028629856850715747, + "grad_norm": 2.4318482065803666, + "learning_rate": 9.994944723439546e-06, + "loss": 0.0784, + "step": 14 + }, + { + "epoch": 0.03067484662576687, + "grad_norm": 3.583468004202154, + "learning_rate": 9.994196893563722e-06, + "loss": 0.1125, + "step": 15 + }, + { + "epoch": 0.032719836400818, + "grad_norm": 1.4181812641718199, + "learning_rate": 9.993397530462818e-06, + "loss": 0.0397, + "step": 16 + }, + { + "epoch": 0.034764826175869123, + "grad_norm": 1.8010048779280416, + "learning_rate": 9.99254664238516e-06, + "loss": 0.0575, + "step": 17 + }, + { + "epoch": 0.03680981595092025, + "grad_norm": 2.1503927037059385, + "learning_rate": 9.991644238110741e-06, + "loss": 0.0665, + "step": 18 + }, + { + "epoch": 0.03885480572597137, + "grad_norm": 1.8100049883218121, + "learning_rate": 9.990690326951126e-06, + "loss": 0.0682, + "step": 19 + }, + { + "epoch": 0.0408997955010225, + "grad_norm": 2.3966939056398266, + "learning_rate": 9.989684918749365e-06, + "loss": 0.0846, + "step": 20 + }, + { + "epoch": 0.04294478527607362, + "grad_norm": 1.918166279143656, + "learning_rate": 9.988628023879883e-06, + "loss": 0.0668, + "step": 21 + }, + { + "epoch": 0.044989775051124746, + "grad_norm": 1.7912977784419148, + "learning_rate": 9.98751965324838e-06, + "loss": 0.0729, + "step": 22 + }, + { + "epoch": 0.04703476482617587, + "grad_norm": 1.9098490695074073, + "learning_rate": 9.986359818291706e-06, + "loss": 0.0733, + "step": 23 + }, + { + "epoch": 0.049079754601226995, + "grad_norm": 2.2200718894862805, + "learning_rate": 9.985148530977767e-06, + "loss": 0.0723, + "step": 24 + }, + { + "epoch": 0.05112474437627812, + "grad_norm": 1.8085849304791404, + "learning_rate": 9.983885803805373e-06, + "loss": 0.0713, + "step": 25 + }, + { + "epoch": 0.053169734151329244, + "grad_norm": 2.5900909947296507, + "learning_rate": 9.982571649804126e-06, + "loss": 0.0805, + "step": 26 + }, + { + "epoch": 0.05521472392638037, + "grad_norm": 2.557173352737123, + "learning_rate": 9.981206082534287e-06, + "loss": 0.0849, + "step": 27 + }, + { + "epoch": 0.05725971370143149, + "grad_norm": 2.3095562915819503, + "learning_rate": 9.979789116086625e-06, + "loss": 0.0848, + "step": 28 + }, + { + "epoch": 0.05930470347648262, + "grad_norm": 1.652313462404793, + "learning_rate": 9.97832076508228e-06, + "loss": 0.057, + "step": 29 + }, + { + "epoch": 0.06134969325153374, + "grad_norm": 3.3750373556197752, + "learning_rate": 9.976801044672608e-06, + "loss": 0.1154, + "step": 30 + }, + { + "epoch": 0.06339468302658487, + "grad_norm": 2.7053744260152803, + "learning_rate": 9.97522997053903e-06, + "loss": 0.0841, + "step": 31 + }, + { + "epoch": 0.065439672801636, + "grad_norm": 2.1510005490299497, + "learning_rate": 9.973607558892864e-06, + "loss": 0.0732, + "step": 32 + }, + { + "epoch": 0.06748466257668712, + "grad_norm": 2.1823073488659324, + "learning_rate": 9.971933826475162e-06, + "loss": 0.0776, + "step": 33 + }, + { + "epoch": 0.06952965235173825, + "grad_norm": 2.0539979554320817, + "learning_rate": 9.970208790556531e-06, + "loss": 0.0688, + "step": 34 + }, + { + "epoch": 0.07157464212678936, + "grad_norm": 1.6876619685011311, + "learning_rate": 9.968432468936967e-06, + "loss": 0.0608, + "step": 35 + }, + { + "epoch": 0.0736196319018405, + "grad_norm": 3.0575087238752805, + "learning_rate": 9.966604879945659e-06, + "loss": 0.12, + "step": 36 + }, + { + "epoch": 0.07566462167689161, + "grad_norm": 2.414478148852492, + "learning_rate": 9.964726042440802e-06, + "loss": 0.0958, + "step": 37 + }, + { + "epoch": 0.07770961145194274, + "grad_norm": 2.173225061106067, + "learning_rate": 9.962795975809411e-06, + "loss": 0.084, + "step": 38 + }, + { + "epoch": 0.07975460122699386, + "grad_norm": 2.040698856807742, + "learning_rate": 9.960814699967112e-06, + "loss": 0.0794, + "step": 39 + }, + { + "epoch": 0.081799591002045, + "grad_norm": 2.249606373953477, + "learning_rate": 9.958782235357938e-06, + "loss": 0.0951, + "step": 40 + }, + { + "epoch": 0.08384458077709611, + "grad_norm": 2.5979000902419895, + "learning_rate": 9.956698602954124e-06, + "loss": 0.1029, + "step": 41 + }, + { + "epoch": 0.08588957055214724, + "grad_norm": 2.1602269446719644, + "learning_rate": 9.954563824255879e-06, + "loss": 0.0901, + "step": 42 + }, + { + "epoch": 0.08793456032719836, + "grad_norm": 1.8153325069101112, + "learning_rate": 9.952377921291179e-06, + "loss": 0.0623, + "step": 43 + }, + { + "epoch": 0.08997955010224949, + "grad_norm": 2.7967114830172615, + "learning_rate": 9.950140916615526e-06, + "loss": 0.1192, + "step": 44 + }, + { + "epoch": 0.09202453987730061, + "grad_norm": 2.0707153248622827, + "learning_rate": 9.947852833311725e-06, + "loss": 0.0846, + "step": 45 + }, + { + "epoch": 0.09406952965235174, + "grad_norm": 2.1452757583479474, + "learning_rate": 9.94551369498964e-06, + "loss": 0.0875, + "step": 46 + }, + { + "epoch": 0.09611451942740286, + "grad_norm": 2.3194318990073923, + "learning_rate": 9.943123525785952e-06, + "loss": 0.0921, + "step": 47 + }, + { + "epoch": 0.09815950920245399, + "grad_norm": 1.798820349857878, + "learning_rate": 9.940682350363913e-06, + "loss": 0.0592, + "step": 48 + }, + { + "epoch": 0.10020449897750511, + "grad_norm": 1.8591670519797276, + "learning_rate": 9.938190193913084e-06, + "loss": 0.0757, + "step": 49 + }, + { + "epoch": 0.10224948875255624, + "grad_norm": 1.8617586001231685, + "learning_rate": 9.935647082149088e-06, + "loss": 0.0677, + "step": 50 + }, + { + "epoch": 0.10429447852760736, + "grad_norm": 2.402863095839252, + "learning_rate": 9.933053041313325e-06, + "loss": 0.0873, + "step": 51 + }, + { + "epoch": 0.10633946830265849, + "grad_norm": 2.2249519855906756, + "learning_rate": 9.930408098172725e-06, + "loss": 0.0912, + "step": 52 + }, + { + "epoch": 0.1083844580777096, + "grad_norm": 2.1251826013803323, + "learning_rate": 9.92771228001945e-06, + "loss": 0.076, + "step": 53 + }, + { + "epoch": 0.11042944785276074, + "grad_norm": 1.9764253903583366, + "learning_rate": 9.924965614670629e-06, + "loss": 0.0784, + "step": 54 + }, + { + "epoch": 0.11247443762781185, + "grad_norm": 1.8078917942604569, + "learning_rate": 9.92216813046806e-06, + "loss": 0.0667, + "step": 55 + }, + { + "epoch": 0.11451942740286299, + "grad_norm": 2.5631523625105372, + "learning_rate": 9.919319856277921e-06, + "loss": 0.1003, + "step": 56 + }, + { + "epoch": 0.1165644171779141, + "grad_norm": 2.066653670325792, + "learning_rate": 9.916420821490474e-06, + "loss": 0.0756, + "step": 57 + }, + { + "epoch": 0.11860940695296524, + "grad_norm": 2.5780966602305693, + "learning_rate": 9.91347105601976e-06, + "loss": 0.0984, + "step": 58 + }, + { + "epoch": 0.12065439672801637, + "grad_norm": 2.219344023968354, + "learning_rate": 9.910470590303294e-06, + "loss": 0.0789, + "step": 59 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 2.642779106386566, + "learning_rate": 9.90741945530174e-06, + "loss": 0.078, + "step": 60 + }, + { + "epoch": 0.12474437627811862, + "grad_norm": 1.8439341873720778, + "learning_rate": 9.904317682498609e-06, + "loss": 0.0725, + "step": 61 + }, + { + "epoch": 0.12678936605316973, + "grad_norm": 2.1976218170570783, + "learning_rate": 9.901165303899916e-06, + "loss": 0.1094, + "step": 62 + }, + { + "epoch": 0.12883435582822086, + "grad_norm": 2.4577264483674166, + "learning_rate": 9.89796235203386e-06, + "loss": 0.0922, + "step": 63 + }, + { + "epoch": 0.130879345603272, + "grad_norm": 3.012519841445848, + "learning_rate": 9.89470885995049e-06, + "loss": 0.1109, + "step": 64 + }, + { + "epoch": 0.1329243353783231, + "grad_norm": 2.248540711936193, + "learning_rate": 9.891404861221356e-06, + "loss": 0.0892, + "step": 65 + }, + { + "epoch": 0.13496932515337423, + "grad_norm": 2.3347058109208825, + "learning_rate": 9.888050389939172e-06, + "loss": 0.0851, + "step": 66 + }, + { + "epoch": 0.13701431492842536, + "grad_norm": 2.460632130242845, + "learning_rate": 9.884645480717452e-06, + "loss": 0.0967, + "step": 67 + }, + { + "epoch": 0.1390593047034765, + "grad_norm": 1.8587061916271175, + "learning_rate": 9.881190168690164e-06, + "loss": 0.0661, + "step": 68 + }, + { + "epoch": 0.1411042944785276, + "grad_norm": 2.813362612221172, + "learning_rate": 9.877684489511367e-06, + "loss": 0.1079, + "step": 69 + }, + { + "epoch": 0.14314928425357873, + "grad_norm": 2.7724880857855085, + "learning_rate": 9.874128479354833e-06, + "loss": 0.0865, + "step": 70 + }, + { + "epoch": 0.14519427402862986, + "grad_norm": 2.0084192749000223, + "learning_rate": 9.870522174913683e-06, + "loss": 0.0811, + "step": 71 + }, + { + "epoch": 0.147239263803681, + "grad_norm": 1.901062419637755, + "learning_rate": 9.866865613400008e-06, + "loss": 0.0834, + "step": 72 + }, + { + "epoch": 0.1492842535787321, + "grad_norm": 2.143697771944517, + "learning_rate": 9.863158832544477e-06, + "loss": 0.0967, + "step": 73 + }, + { + "epoch": 0.15132924335378323, + "grad_norm": 1.8252931029432322, + "learning_rate": 9.859401870595959e-06, + "loss": 0.0725, + "step": 74 + }, + { + "epoch": 0.15337423312883436, + "grad_norm": 1.9307281956151774, + "learning_rate": 9.855594766321122e-06, + "loss": 0.077, + "step": 75 + }, + { + "epoch": 0.1554192229038855, + "grad_norm": 2.2429643925966993, + "learning_rate": 9.85173755900403e-06, + "loss": 0.0891, + "step": 76 + }, + { + "epoch": 0.1574642126789366, + "grad_norm": 1.8200761545917128, + "learning_rate": 9.847830288445745e-06, + "loss": 0.0785, + "step": 77 + }, + { + "epoch": 0.15950920245398773, + "grad_norm": 1.8916674815016423, + "learning_rate": 9.843872994963912e-06, + "loss": 0.0755, + "step": 78 + }, + { + "epoch": 0.16155419222903886, + "grad_norm": 2.0741375008009655, + "learning_rate": 9.83986571939234e-06, + "loss": 0.0744, + "step": 79 + }, + { + "epoch": 0.16359918200409, + "grad_norm": 1.7919605782077757, + "learning_rate": 9.835808503080586e-06, + "loss": 0.0757, + "step": 80 + }, + { + "epoch": 0.1656441717791411, + "grad_norm": 1.950729934719885, + "learning_rate": 9.831701387893533e-06, + "loss": 0.0815, + "step": 81 + }, + { + "epoch": 0.16768916155419222, + "grad_norm": 2.124785118083205, + "learning_rate": 9.82754441621094e-06, + "loss": 0.0807, + "step": 82 + }, + { + "epoch": 0.16973415132924335, + "grad_norm": 2.053195322602257, + "learning_rate": 9.823337630927027e-06, + "loss": 0.0902, + "step": 83 + }, + { + "epoch": 0.17177914110429449, + "grad_norm": 2.5090758861647826, + "learning_rate": 9.819081075450014e-06, + "loss": 0.0873, + "step": 84 + }, + { + "epoch": 0.1738241308793456, + "grad_norm": 2.137957503401185, + "learning_rate": 9.814774793701686e-06, + "loss": 0.092, + "step": 85 + }, + { + "epoch": 0.17586912065439672, + "grad_norm": 2.230490758825473, + "learning_rate": 9.810418830116933e-06, + "loss": 0.0833, + "step": 86 + }, + { + "epoch": 0.17791411042944785, + "grad_norm": 2.012709266353046, + "learning_rate": 9.80601322964329e-06, + "loss": 0.0877, + "step": 87 + }, + { + "epoch": 0.17995910020449898, + "grad_norm": 2.572752374501912, + "learning_rate": 9.80155803774048e-06, + "loss": 0.1141, + "step": 88 + }, + { + "epoch": 0.18200408997955012, + "grad_norm": 1.5628909847161165, + "learning_rate": 9.797053300379938e-06, + "loss": 0.0672, + "step": 89 + }, + { + "epoch": 0.18404907975460122, + "grad_norm": 1.8013050781356985, + "learning_rate": 9.792499064044343e-06, + "loss": 0.0804, + "step": 90 + }, + { + "epoch": 0.18609406952965235, + "grad_norm": 2.128417350277261, + "learning_rate": 9.787895375727137e-06, + "loss": 0.0903, + "step": 91 + }, + { + "epoch": 0.18813905930470348, + "grad_norm": 2.6231742831814255, + "learning_rate": 9.783242282932028e-06, + "loss": 0.0991, + "step": 92 + }, + { + "epoch": 0.1901840490797546, + "grad_norm": 2.14671431766684, + "learning_rate": 9.778539833672525e-06, + "loss": 0.0844, + "step": 93 + }, + { + "epoch": 0.19222903885480572, + "grad_norm": 1.668300942440577, + "learning_rate": 9.773788076471415e-06, + "loss": 0.0677, + "step": 94 + }, + { + "epoch": 0.19427402862985685, + "grad_norm": 1.6611049562639426, + "learning_rate": 9.76898706036028e-06, + "loss": 0.0815, + "step": 95 + }, + { + "epoch": 0.19631901840490798, + "grad_norm": 1.7467281372812702, + "learning_rate": 9.764136834878987e-06, + "loss": 0.0802, + "step": 96 + }, + { + "epoch": 0.1983640081799591, + "grad_norm": 2.0082876640493525, + "learning_rate": 9.759237450075174e-06, + "loss": 0.0845, + "step": 97 + }, + { + "epoch": 0.20040899795501022, + "grad_norm": 1.6218133242260213, + "learning_rate": 9.754288956503737e-06, + "loss": 0.0792, + "step": 98 + }, + { + "epoch": 0.20245398773006135, + "grad_norm": 1.8693374042253028, + "learning_rate": 9.749291405226304e-06, + "loss": 0.089, + "step": 99 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 2.3402858038101337, + "learning_rate": 9.744244847810716e-06, + "loss": 0.0945, + "step": 100 + }, + { + "epoch": 0.2065439672801636, + "grad_norm": 2.400216651654056, + "learning_rate": 9.739149336330482e-06, + "loss": 0.0994, + "step": 101 + }, + { + "epoch": 0.2085889570552147, + "grad_norm": 1.9932426008301034, + "learning_rate": 9.734004923364258e-06, + "loss": 0.0813, + "step": 102 + }, + { + "epoch": 0.21063394683026584, + "grad_norm": 1.8232352554241547, + "learning_rate": 9.728811661995287e-06, + "loss": 0.0833, + "step": 103 + }, + { + "epoch": 0.21267893660531698, + "grad_norm": 1.774918510432305, + "learning_rate": 9.72356960581087e-06, + "loss": 0.0853, + "step": 104 + }, + { + "epoch": 0.2147239263803681, + "grad_norm": 2.987329389159815, + "learning_rate": 9.718278808901797e-06, + "loss": 0.1114, + "step": 105 + }, + { + "epoch": 0.2167689161554192, + "grad_norm": 2.248351378515216, + "learning_rate": 9.712939325861794e-06, + "loss": 0.0826, + "step": 106 + }, + { + "epoch": 0.21881390593047034, + "grad_norm": 2.218767795388457, + "learning_rate": 9.707551211786966e-06, + "loss": 0.088, + "step": 107 + }, + { + "epoch": 0.22085889570552147, + "grad_norm": 2.3431433008509917, + "learning_rate": 9.702114522275216e-06, + "loss": 0.0897, + "step": 108 + }, + { + "epoch": 0.2229038854805726, + "grad_norm": 1.9166897788167856, + "learning_rate": 9.696629313425688e-06, + "loss": 0.088, + "step": 109 + }, + { + "epoch": 0.2249488752556237, + "grad_norm": 1.9440115291462636, + "learning_rate": 9.691095641838168e-06, + "loss": 0.0836, + "step": 110 + }, + { + "epoch": 0.22699386503067484, + "grad_norm": 1.813961610317634, + "learning_rate": 9.685513564612521e-06, + "loss": 0.078, + "step": 111 + }, + { + "epoch": 0.22903885480572597, + "grad_norm": 1.8809059426216883, + "learning_rate": 9.679883139348082e-06, + "loss": 0.0821, + "step": 112 + }, + { + "epoch": 0.2310838445807771, + "grad_norm": 2.2311254705001233, + "learning_rate": 9.674204424143079e-06, + "loss": 0.0883, + "step": 113 + }, + { + "epoch": 0.2331288343558282, + "grad_norm": 1.9295136215801372, + "learning_rate": 9.668477477594021e-06, + "loss": 0.0833, + "step": 114 + }, + { + "epoch": 0.23517382413087934, + "grad_norm": 1.8615614639144564, + "learning_rate": 9.662702358795098e-06, + "loss": 0.0822, + "step": 115 + }, + { + "epoch": 0.23721881390593047, + "grad_norm": 1.8761973618596817, + "learning_rate": 9.656879127337571e-06, + "loss": 0.0785, + "step": 116 + }, + { + "epoch": 0.2392638036809816, + "grad_norm": 2.017270471451727, + "learning_rate": 9.651007843309164e-06, + "loss": 0.0878, + "step": 117 + }, + { + "epoch": 0.24130879345603273, + "grad_norm": 2.1414773647169936, + "learning_rate": 9.645088567293426e-06, + "loss": 0.0932, + "step": 118 + }, + { + "epoch": 0.24335378323108384, + "grad_norm": 1.7284124634354323, + "learning_rate": 9.639121360369127e-06, + "loss": 0.0683, + "step": 119 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 2.3422614186852577, + "learning_rate": 9.633106284109612e-06, + "loss": 0.1061, + "step": 120 + }, + { + "epoch": 0.2474437627811861, + "grad_norm": 1.9680728218006462, + "learning_rate": 9.627043400582173e-06, + "loss": 0.0832, + "step": 121 + }, + { + "epoch": 0.24948875255623723, + "grad_norm": 1.744621659832594, + "learning_rate": 9.620932772347408e-06, + "loss": 0.0716, + "step": 122 + }, + { + "epoch": 0.25153374233128833, + "grad_norm": 2.003659281799268, + "learning_rate": 9.614774462458573e-06, + "loss": 0.0943, + "step": 123 + }, + { + "epoch": 0.25357873210633947, + "grad_norm": 1.9112829391643362, + "learning_rate": 9.608568534460938e-06, + "loss": 0.0791, + "step": 124 + }, + { + "epoch": 0.2556237218813906, + "grad_norm": 1.6018069748701698, + "learning_rate": 9.602315052391116e-06, + "loss": 0.0699, + "step": 125 + }, + { + "epoch": 0.25766871165644173, + "grad_norm": 1.9898564316497316, + "learning_rate": 9.596014080776424e-06, + "loss": 0.0868, + "step": 126 + }, + { + "epoch": 0.25971370143149286, + "grad_norm": 1.9062653706577775, + "learning_rate": 9.589665684634197e-06, + "loss": 0.0797, + "step": 127 + }, + { + "epoch": 0.261758691206544, + "grad_norm": 2.105685404483493, + "learning_rate": 9.583269929471129e-06, + "loss": 0.0802, + "step": 128 + }, + { + "epoch": 0.26380368098159507, + "grad_norm": 1.8889444529306618, + "learning_rate": 9.576826881282595e-06, + "loss": 0.0773, + "step": 129 + }, + { + "epoch": 0.2658486707566462, + "grad_norm": 1.89509366954467, + "learning_rate": 9.570336606551966e-06, + "loss": 0.0845, + "step": 130 + }, + { + "epoch": 0.26789366053169733, + "grad_norm": 2.5730619597875792, + "learning_rate": 9.56379917224993e-06, + "loss": 0.1218, + "step": 131 + }, + { + "epoch": 0.26993865030674846, + "grad_norm": 3.174335117295452, + "learning_rate": 9.557214645833792e-06, + "loss": 0.1396, + "step": 132 + }, + { + "epoch": 0.2719836400817996, + "grad_norm": 1.506901278245754, + "learning_rate": 9.550583095246786e-06, + "loss": 0.0631, + "step": 133 + }, + { + "epoch": 0.2740286298568507, + "grad_norm": 2.3300783174234887, + "learning_rate": 9.543904588917366e-06, + "loss": 0.109, + "step": 134 + }, + { + "epoch": 0.27607361963190186, + "grad_norm": 1.8554323699407922, + "learning_rate": 9.537179195758513e-06, + "loss": 0.0746, + "step": 135 + }, + { + "epoch": 0.278118609406953, + "grad_norm": 1.4907022435447066, + "learning_rate": 9.530406985167005e-06, + "loss": 0.0712, + "step": 136 + }, + { + "epoch": 0.28016359918200406, + "grad_norm": 1.7196544870819945, + "learning_rate": 9.523588027022721e-06, + "loss": 0.075, + "step": 137 + }, + { + "epoch": 0.2822085889570552, + "grad_norm": 1.7344914939658451, + "learning_rate": 9.516722391687903e-06, + "loss": 0.0856, + "step": 138 + }, + { + "epoch": 0.2842535787321063, + "grad_norm": 2.1773597101038087, + "learning_rate": 9.50981015000644e-06, + "loss": 0.0929, + "step": 139 + }, + { + "epoch": 0.28629856850715746, + "grad_norm": 2.0166181602910376, + "learning_rate": 9.502851373303137e-06, + "loss": 0.0892, + "step": 140 + }, + { + "epoch": 0.2883435582822086, + "grad_norm": 2.0996295005016483, + "learning_rate": 9.495846133382973e-06, + "loss": 0.085, + "step": 141 + }, + { + "epoch": 0.2903885480572597, + "grad_norm": 2.09058564013836, + "learning_rate": 9.488794502530361e-06, + "loss": 0.0872, + "step": 142 + }, + { + "epoch": 0.29243353783231085, + "grad_norm": 1.8321276625056864, + "learning_rate": 9.481696553508411e-06, + "loss": 0.0927, + "step": 143 + }, + { + "epoch": 0.294478527607362, + "grad_norm": 1.918438250366742, + "learning_rate": 9.474552359558167e-06, + "loss": 0.0744, + "step": 144 + }, + { + "epoch": 0.2965235173824131, + "grad_norm": 2.327981634380635, + "learning_rate": 9.46736199439786e-06, + "loss": 0.1025, + "step": 145 + }, + { + "epoch": 0.2985685071574642, + "grad_norm": 2.2135170524903995, + "learning_rate": 9.460125532222142e-06, + "loss": 0.09, + "step": 146 + }, + { + "epoch": 0.3006134969325153, + "grad_norm": 2.2539230814408073, + "learning_rate": 9.452843047701324e-06, + "loss": 0.1023, + "step": 147 + }, + { + "epoch": 0.30265848670756645, + "grad_norm": 2.104687258049424, + "learning_rate": 9.445514615980604e-06, + "loss": 0.0905, + "step": 148 + }, + { + "epoch": 0.3047034764826176, + "grad_norm": 1.7372025147408934, + "learning_rate": 9.438140312679292e-06, + "loss": 0.0849, + "step": 149 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 2.0671665965859662, + "learning_rate": 9.43072021389003e-06, + "loss": 0.0924, + "step": 150 + }, + { + "epoch": 0.30879345603271985, + "grad_norm": 1.6350351491282862, + "learning_rate": 9.423254396178003e-06, + "loss": 0.0769, + "step": 151 + }, + { + "epoch": 0.310838445807771, + "grad_norm": 2.878396608282762, + "learning_rate": 9.415742936580156e-06, + "loss": 0.1538, + "step": 152 + }, + { + "epoch": 0.3128834355828221, + "grad_norm": 1.4213578692087034, + "learning_rate": 9.408185912604395e-06, + "loss": 0.065, + "step": 153 + }, + { + "epoch": 0.3149284253578732, + "grad_norm": 2.0855996921354, + "learning_rate": 9.400583402228785e-06, + "loss": 0.0844, + "step": 154 + }, + { + "epoch": 0.3169734151329243, + "grad_norm": 1.7352864078553754, + "learning_rate": 9.39293548390075e-06, + "loss": 0.0853, + "step": 155 + }, + { + "epoch": 0.31901840490797545, + "grad_norm": 1.334038745461943, + "learning_rate": 9.385242236536259e-06, + "loss": 0.0656, + "step": 156 + }, + { + "epoch": 0.3210633946830266, + "grad_norm": 2.174575475791565, + "learning_rate": 9.377503739519019e-06, + "loss": 0.0991, + "step": 157 + }, + { + "epoch": 0.3231083844580777, + "grad_norm": 1.6357643314755432, + "learning_rate": 9.369720072699648e-06, + "loss": 0.0792, + "step": 158 + }, + { + "epoch": 0.32515337423312884, + "grad_norm": 2.316934261247635, + "learning_rate": 9.36189131639485e-06, + "loss": 0.1112, + "step": 159 + }, + { + "epoch": 0.32719836400818, + "grad_norm": 1.9234234290855614, + "learning_rate": 9.354017551386599e-06, + "loss": 0.0851, + "step": 160 + }, + { + "epoch": 0.3292433537832311, + "grad_norm": 2.475496525507223, + "learning_rate": 9.346098858921292e-06, + "loss": 0.1062, + "step": 161 + }, + { + "epoch": 0.3312883435582822, + "grad_norm": 2.3268380138649487, + "learning_rate": 9.338135320708912e-06, + "loss": 0.1035, + "step": 162 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.5336893905703746, + "learning_rate": 9.330127018922195e-06, + "loss": 0.0702, + "step": 163 + }, + { + "epoch": 0.33537832310838445, + "grad_norm": 2.8082604544179035, + "learning_rate": 9.32207403619577e-06, + "loss": 0.1209, + "step": 164 + }, + { + "epoch": 0.3374233128834356, + "grad_norm": 1.5750634984249117, + "learning_rate": 9.313976455625316e-06, + "loss": 0.0713, + "step": 165 + }, + { + "epoch": 0.3394683026584867, + "grad_norm": 2.2373522766525262, + "learning_rate": 9.305834360766695e-06, + "loss": 0.0969, + "step": 166 + }, + { + "epoch": 0.34151329243353784, + "grad_norm": 2.342451381996767, + "learning_rate": 9.297647835635102e-06, + "loss": 0.0934, + "step": 167 + }, + { + "epoch": 0.34355828220858897, + "grad_norm": 1.936610520437153, + "learning_rate": 9.289416964704186e-06, + "loss": 0.0883, + "step": 168 + }, + { + "epoch": 0.3456032719836401, + "grad_norm": 1.8338353993342575, + "learning_rate": 9.281141832905185e-06, + "loss": 0.0778, + "step": 169 + }, + { + "epoch": 0.3476482617586912, + "grad_norm": 1.9110066741814127, + "learning_rate": 9.272822525626047e-06, + "loss": 0.0735, + "step": 170 + }, + { + "epoch": 0.3496932515337423, + "grad_norm": 2.179479069452803, + "learning_rate": 9.26445912871055e-06, + "loss": 0.0843, + "step": 171 + }, + { + "epoch": 0.35173824130879344, + "grad_norm": 1.9177594380676963, + "learning_rate": 9.25605172845742e-06, + "loss": 0.0805, + "step": 172 + }, + { + "epoch": 0.3537832310838446, + "grad_norm": 2.1882619443952684, + "learning_rate": 9.247600411619434e-06, + "loss": 0.0965, + "step": 173 + }, + { + "epoch": 0.3558282208588957, + "grad_norm": 2.2176075779513824, + "learning_rate": 9.239105265402525e-06, + "loss": 0.0974, + "step": 174 + }, + { + "epoch": 0.35787321063394684, + "grad_norm": 1.5074567124767815, + "learning_rate": 9.23056637746489e-06, + "loss": 0.0735, + "step": 175 + }, + { + "epoch": 0.35991820040899797, + "grad_norm": 2.060069998365139, + "learning_rate": 9.221983835916074e-06, + "loss": 0.1022, + "step": 176 + }, + { + "epoch": 0.3619631901840491, + "grad_norm": 2.1165212064315235, + "learning_rate": 9.213357729316077e-06, + "loss": 0.0995, + "step": 177 + }, + { + "epoch": 0.36400817995910023, + "grad_norm": 2.1868849806726787, + "learning_rate": 9.204688146674418e-06, + "loss": 0.0939, + "step": 178 + }, + { + "epoch": 0.3660531697341513, + "grad_norm": 1.7544924490641574, + "learning_rate": 9.195975177449238e-06, + "loss": 0.0873, + "step": 179 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 1.838768964795654, + "learning_rate": 9.187218911546363e-06, + "loss": 0.0864, + "step": 180 + }, + { + "epoch": 0.37014314928425357, + "grad_norm": 1.9536263850909072, + "learning_rate": 9.178419439318382e-06, + "loss": 0.0828, + "step": 181 + }, + { + "epoch": 0.3721881390593047, + "grad_norm": 1.8125655303827894, + "learning_rate": 9.169576851563715e-06, + "loss": 0.0707, + "step": 182 + }, + { + "epoch": 0.37423312883435583, + "grad_norm": 1.5346489369821823, + "learning_rate": 9.160691239525675e-06, + "loss": 0.0707, + "step": 183 + }, + { + "epoch": 0.37627811860940696, + "grad_norm": 2.0774049712635745, + "learning_rate": 9.151762694891522e-06, + "loss": 0.0892, + "step": 184 + }, + { + "epoch": 0.3783231083844581, + "grad_norm": 1.6068313703103427, + "learning_rate": 9.142791309791528e-06, + "loss": 0.0737, + "step": 185 + }, + { + "epoch": 0.3803680981595092, + "grad_norm": 2.491559077597992, + "learning_rate": 9.133777176798013e-06, + "loss": 0.1063, + "step": 186 + }, + { + "epoch": 0.3824130879345603, + "grad_norm": 1.936364688582553, + "learning_rate": 9.124720388924403e-06, + "loss": 0.0879, + "step": 187 + }, + { + "epoch": 0.38445807770961143, + "grad_norm": 1.7501246261711056, + "learning_rate": 9.115621039624256e-06, + "loss": 0.0831, + "step": 188 + }, + { + "epoch": 0.38650306748466257, + "grad_norm": 1.9375047463204769, + "learning_rate": 9.106479222790312e-06, + "loss": 0.0798, + "step": 189 + }, + { + "epoch": 0.3885480572597137, + "grad_norm": 1.9799704235731947, + "learning_rate": 9.09729503275351e-06, + "loss": 0.0818, + "step": 190 + }, + { + "epoch": 0.39059304703476483, + "grad_norm": 2.1027233151637046, + "learning_rate": 9.08806856428203e-06, + "loss": 0.0737, + "step": 191 + }, + { + "epoch": 0.39263803680981596, + "grad_norm": 2.2130274217863377, + "learning_rate": 9.078799912580305e-06, + "loss": 0.1049, + "step": 192 + }, + { + "epoch": 0.3946830265848671, + "grad_norm": 1.8596492941083875, + "learning_rate": 9.069489173288037e-06, + "loss": 0.0788, + "step": 193 + }, + { + "epoch": 0.3967280163599182, + "grad_norm": 1.8220962906735956, + "learning_rate": 9.060136442479215e-06, + "loss": 0.0789, + "step": 194 + }, + { + "epoch": 0.3987730061349693, + "grad_norm": 2.1684932411419773, + "learning_rate": 9.050741816661128e-06, + "loss": 0.1101, + "step": 195 + }, + { + "epoch": 0.40081799591002043, + "grad_norm": 2.2585167924890674, + "learning_rate": 9.041305392773355e-06, + "loss": 0.0899, + "step": 196 + }, + { + "epoch": 0.40286298568507156, + "grad_norm": 2.2529963379779514, + "learning_rate": 9.03182726818678e-06, + "loss": 0.1001, + "step": 197 + }, + { + "epoch": 0.4049079754601227, + "grad_norm": 2.019146584665829, + "learning_rate": 9.022307540702576e-06, + "loss": 0.0889, + "step": 198 + }, + { + "epoch": 0.4069529652351738, + "grad_norm": 2.0147227938530214, + "learning_rate": 9.012746308551208e-06, + "loss": 0.0779, + "step": 199 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 1.6785890661043144, + "learning_rate": 9.003143670391403e-06, + "loss": 0.0714, + "step": 200 + }, + { + "epoch": 0.40899795501022496, + "eval_loss": 0.09443490207195282, + "eval_runtime": 1.6107, + "eval_samples_per_second": 24.835, + "eval_steps_per_second": 6.209, + "step": 200 + }, + { + "epoch": 0.4110429447852761, + "grad_norm": 1.7907653453087733, + "learning_rate": 8.993499725309148e-06, + "loss": 0.0644, + "step": 201 + }, + { + "epoch": 0.4130879345603272, + "grad_norm": 2.0499291659974572, + "learning_rate": 8.983814572816656e-06, + "loss": 0.0764, + "step": 202 + }, + { + "epoch": 0.41513292433537835, + "grad_norm": 2.027050105104232, + "learning_rate": 8.974088312851346e-06, + "loss": 0.0896, + "step": 203 + }, + { + "epoch": 0.4171779141104294, + "grad_norm": 1.8185300386254655, + "learning_rate": 8.964321045774808e-06, + "loss": 0.0904, + "step": 204 + }, + { + "epoch": 0.41922290388548056, + "grad_norm": 1.8351321980331647, + "learning_rate": 8.954512872371768e-06, + "loss": 0.0798, + "step": 205 + }, + { + "epoch": 0.4212678936605317, + "grad_norm": 2.2777878812250734, + "learning_rate": 8.944663893849053e-06, + "loss": 0.094, + "step": 206 + }, + { + "epoch": 0.4233128834355828, + "grad_norm": 2.078616561352449, + "learning_rate": 8.934774211834538e-06, + "loss": 0.097, + "step": 207 + }, + { + "epoch": 0.42535787321063395, + "grad_norm": 1.5026879665719408, + "learning_rate": 8.924843928376105e-06, + "loss": 0.0667, + "step": 208 + }, + { + "epoch": 0.4274028629856851, + "grad_norm": 2.031373760012224, + "learning_rate": 8.914873145940585e-06, + "loss": 0.0983, + "step": 209 + }, + { + "epoch": 0.4294478527607362, + "grad_norm": 1.7750919975425428, + "learning_rate": 8.904861967412702e-06, + "loss": 0.0832, + "step": 210 + }, + { + "epoch": 0.43149284253578735, + "grad_norm": 1.6859653025880537, + "learning_rate": 8.894810496094016e-06, + "loss": 0.0739, + "step": 211 + }, + { + "epoch": 0.4335378323108384, + "grad_norm": 2.4773597386512374, + "learning_rate": 8.88471883570185e-06, + "loss": 0.104, + "step": 212 + }, + { + "epoch": 0.43558282208588955, + "grad_norm": 1.7481062215506529, + "learning_rate": 8.874587090368221e-06, + "loss": 0.0685, + "step": 213 + }, + { + "epoch": 0.4376278118609407, + "grad_norm": 1.8687306127676215, + "learning_rate": 8.86441536463877e-06, + "loss": 0.0812, + "step": 214 + }, + { + "epoch": 0.4396728016359918, + "grad_norm": 2.7660751966702515, + "learning_rate": 8.85420376347168e-06, + "loss": 0.1228, + "step": 215 + }, + { + "epoch": 0.44171779141104295, + "grad_norm": 2.008073359861921, + "learning_rate": 8.843952392236595e-06, + "loss": 0.092, + "step": 216 + }, + { + "epoch": 0.4437627811860941, + "grad_norm": 1.9689667185293374, + "learning_rate": 8.833661356713528e-06, + "loss": 0.0918, + "step": 217 + }, + { + "epoch": 0.4458077709611452, + "grad_norm": 2.0550779883515844, + "learning_rate": 8.823330763091775e-06, + "loss": 0.0842, + "step": 218 + }, + { + "epoch": 0.44785276073619634, + "grad_norm": 2.1458614538975316, + "learning_rate": 8.81296071796882e-06, + "loss": 0.0955, + "step": 219 + }, + { + "epoch": 0.4498977505112474, + "grad_norm": 2.0801721508502173, + "learning_rate": 8.802551328349222e-06, + "loss": 0.0696, + "step": 220 + }, + { + "epoch": 0.45194274028629855, + "grad_norm": 1.6170897770649597, + "learning_rate": 8.792102701643532e-06, + "loss": 0.074, + "step": 221 + }, + { + "epoch": 0.4539877300613497, + "grad_norm": 1.6010742203809665, + "learning_rate": 8.78161494566717e-06, + "loss": 0.068, + "step": 222 + }, + { + "epoch": 0.4560327198364008, + "grad_norm": 1.8263013055696211, + "learning_rate": 8.771088168639312e-06, + "loss": 0.0785, + "step": 223 + }, + { + "epoch": 0.45807770961145194, + "grad_norm": 1.8074234496570727, + "learning_rate": 8.760522479181784e-06, + "loss": 0.0843, + "step": 224 + }, + { + "epoch": 0.4601226993865031, + "grad_norm": 1.9423241552319763, + "learning_rate": 8.74991798631793e-06, + "loss": 0.0902, + "step": 225 + }, + { + "epoch": 0.4621676891615542, + "grad_norm": 2.426636585412464, + "learning_rate": 8.739274799471492e-06, + "loss": 0.1147, + "step": 226 + }, + { + "epoch": 0.46421267893660534, + "grad_norm": 1.8764452830009553, + "learning_rate": 8.728593028465481e-06, + "loss": 0.088, + "step": 227 + }, + { + "epoch": 0.4662576687116564, + "grad_norm": 1.8742190983636138, + "learning_rate": 8.717872783521048e-06, + "loss": 0.0919, + "step": 228 + }, + { + "epoch": 0.46830265848670755, + "grad_norm": 1.9812429967202114, + "learning_rate": 8.707114175256335e-06, + "loss": 0.1032, + "step": 229 + }, + { + "epoch": 0.4703476482617587, + "grad_norm": 1.5710292326402762, + "learning_rate": 8.696317314685342e-06, + "loss": 0.0735, + "step": 230 + }, + { + "epoch": 0.4723926380368098, + "grad_norm": 2.135568048299338, + "learning_rate": 8.685482313216784e-06, + "loss": 0.1003, + "step": 231 + }, + { + "epoch": 0.47443762781186094, + "grad_norm": 1.8410190133874755, + "learning_rate": 8.674609282652936e-06, + "loss": 0.0805, + "step": 232 + }, + { + "epoch": 0.47648261758691207, + "grad_norm": 1.95093910503971, + "learning_rate": 8.663698335188477e-06, + "loss": 0.0799, + "step": 233 + }, + { + "epoch": 0.4785276073619632, + "grad_norm": 2.0656801774088582, + "learning_rate": 8.65274958340934e-06, + "loss": 0.0953, + "step": 234 + }, + { + "epoch": 0.48057259713701433, + "grad_norm": 1.7872037593524146, + "learning_rate": 8.641763140291546e-06, + "loss": 0.0702, + "step": 235 + }, + { + "epoch": 0.48261758691206547, + "grad_norm": 2.0351005102773634, + "learning_rate": 8.630739119200035e-06, + "loss": 0.0828, + "step": 236 + }, + { + "epoch": 0.48466257668711654, + "grad_norm": 1.966029733326491, + "learning_rate": 8.61967763388751e-06, + "loss": 0.0887, + "step": 237 + }, + { + "epoch": 0.4867075664621677, + "grad_norm": 2.2496225787645714, + "learning_rate": 8.608578798493237e-06, + "loss": 0.0921, + "step": 238 + }, + { + "epoch": 0.4887525562372188, + "grad_norm": 2.3703828414232935, + "learning_rate": 8.597442727541898e-06, + "loss": 0.1055, + "step": 239 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 2.072283129147399, + "learning_rate": 8.586269535942386e-06, + "loss": 0.096, + "step": 240 + }, + { + "epoch": 0.49284253578732107, + "grad_norm": 1.763736942283961, + "learning_rate": 8.575059338986632e-06, + "loss": 0.0851, + "step": 241 + }, + { + "epoch": 0.4948875255623722, + "grad_norm": 1.9418651840022931, + "learning_rate": 8.563812252348412e-06, + "loss": 0.0817, + "step": 242 + }, + { + "epoch": 0.49693251533742333, + "grad_norm": 1.4038177877319757, + "learning_rate": 8.552528392082147e-06, + "loss": 0.0692, + "step": 243 + }, + { + "epoch": 0.49897750511247446, + "grad_norm": 2.2775569689795225, + "learning_rate": 8.541207874621718e-06, + "loss": 0.1092, + "step": 244 + }, + { + "epoch": 0.5010224948875256, + "grad_norm": 2.5534087713100955, + "learning_rate": 8.529850816779252e-06, + "loss": 0.1033, + "step": 245 + }, + { + "epoch": 0.5030674846625767, + "grad_norm": 1.531811934175557, + "learning_rate": 8.518457335743927e-06, + "loss": 0.0761, + "step": 246 + }, + { + "epoch": 0.5051124744376279, + "grad_norm": 2.3960006081387974, + "learning_rate": 8.507027549080753e-06, + "loss": 0.0941, + "step": 247 + }, + { + "epoch": 0.5071574642126789, + "grad_norm": 2.245296156491926, + "learning_rate": 8.49556157472937e-06, + "loss": 0.0992, + "step": 248 + }, + { + "epoch": 0.50920245398773, + "grad_norm": 2.1662992544835467, + "learning_rate": 8.484059531002822e-06, + "loss": 0.1096, + "step": 249 + }, + { + "epoch": 0.5112474437627812, + "grad_norm": 1.9378805133589119, + "learning_rate": 8.472521536586336e-06, + "loss": 0.0884, + "step": 250 + }, + { + "epoch": 0.5132924335378323, + "grad_norm": 1.7472804645413123, + "learning_rate": 8.460947710536108e-06, + "loss": 0.0881, + "step": 251 + }, + { + "epoch": 0.5153374233128835, + "grad_norm": 1.8567960096830705, + "learning_rate": 8.44933817227806e-06, + "loss": 0.1041, + "step": 252 + }, + { + "epoch": 0.5173824130879345, + "grad_norm": 1.6639705835205088, + "learning_rate": 8.437693041606619e-06, + "loss": 0.0767, + "step": 253 + }, + { + "epoch": 0.5194274028629857, + "grad_norm": 1.7811045494491748, + "learning_rate": 8.426012438683472e-06, + "loss": 0.0795, + "step": 254 + }, + { + "epoch": 0.5214723926380368, + "grad_norm": 2.601937087112271, + "learning_rate": 8.41429648403634e-06, + "loss": 0.1157, + "step": 255 + }, + { + "epoch": 0.523517382413088, + "grad_norm": 2.2629417508652896, + "learning_rate": 8.402545298557712e-06, + "loss": 0.0965, + "step": 256 + }, + { + "epoch": 0.5255623721881391, + "grad_norm": 1.6219382198043681, + "learning_rate": 8.390759003503624e-06, + "loss": 0.0804, + "step": 257 + }, + { + "epoch": 0.5276073619631901, + "grad_norm": 1.6735037903910355, + "learning_rate": 8.378937720492384e-06, + "loss": 0.0708, + "step": 258 + }, + { + "epoch": 0.5296523517382413, + "grad_norm": 1.6949968905732045, + "learning_rate": 8.367081571503332e-06, + "loss": 0.0796, + "step": 259 + }, + { + "epoch": 0.5316973415132924, + "grad_norm": 1.5829034537038222, + "learning_rate": 8.355190678875577e-06, + "loss": 0.0685, + "step": 260 + }, + { + "epoch": 0.5337423312883436, + "grad_norm": 2.1474520860458814, + "learning_rate": 8.343265165306736e-06, + "loss": 0.0966, + "step": 261 + }, + { + "epoch": 0.5357873210633947, + "grad_norm": 2.685259620414307, + "learning_rate": 8.331305153851659e-06, + "loss": 0.1199, + "step": 262 + }, + { + "epoch": 0.5378323108384458, + "grad_norm": 1.5378328527936944, + "learning_rate": 8.319310767921174e-06, + "loss": 0.0746, + "step": 263 + }, + { + "epoch": 0.5398773006134969, + "grad_norm": 1.5728870201255574, + "learning_rate": 8.307282131280805e-06, + "loss": 0.0794, + "step": 264 + }, + { + "epoch": 0.5419222903885481, + "grad_norm": 1.9037474406992847, + "learning_rate": 8.295219368049494e-06, + "loss": 0.0831, + "step": 265 + }, + { + "epoch": 0.5439672801635992, + "grad_norm": 1.8713169547943331, + "learning_rate": 8.283122602698324e-06, + "loss": 0.0866, + "step": 266 + }, + { + "epoch": 0.5460122699386503, + "grad_norm": 2.0187272804624032, + "learning_rate": 8.270991960049231e-06, + "loss": 0.0953, + "step": 267 + }, + { + "epoch": 0.5480572597137015, + "grad_norm": 2.3890714658865857, + "learning_rate": 8.258827565273717e-06, + "loss": 0.0993, + "step": 268 + }, + { + "epoch": 0.5501022494887525, + "grad_norm": 1.4224265522394863, + "learning_rate": 8.24662954389157e-06, + "loss": 0.0685, + "step": 269 + }, + { + "epoch": 0.5521472392638037, + "grad_norm": 1.8253908241082366, + "learning_rate": 8.234398021769541e-06, + "loss": 0.0859, + "step": 270 + }, + { + "epoch": 0.5541922290388548, + "grad_norm": 1.8297687093456312, + "learning_rate": 8.222133125120076e-06, + "loss": 0.0842, + "step": 271 + }, + { + "epoch": 0.556237218813906, + "grad_norm": 1.7325614091536314, + "learning_rate": 8.209834980499995e-06, + "loss": 0.0664, + "step": 272 + }, + { + "epoch": 0.558282208588957, + "grad_norm": 1.8426658391443724, + "learning_rate": 8.19750371480919e-06, + "loss": 0.0823, + "step": 273 + }, + { + "epoch": 0.5603271983640081, + "grad_norm": 2.335513659237072, + "learning_rate": 8.185139455289322e-06, + "loss": 0.1004, + "step": 274 + }, + { + "epoch": 0.5623721881390593, + "grad_norm": 2.281382949923011, + "learning_rate": 8.172742329522493e-06, + "loss": 0.0923, + "step": 275 + }, + { + "epoch": 0.5644171779141104, + "grad_norm": 2.0875496660986586, + "learning_rate": 8.160312465429952e-06, + "loss": 0.1007, + "step": 276 + }, + { + "epoch": 0.5664621676891616, + "grad_norm": 1.6706016356250908, + "learning_rate": 8.147849991270753e-06, + "loss": 0.0749, + "step": 277 + }, + { + "epoch": 0.5685071574642127, + "grad_norm": 2.3348044470325586, + "learning_rate": 8.135355035640445e-06, + "loss": 0.1075, + "step": 278 + }, + { + "epoch": 0.5705521472392638, + "grad_norm": 1.9325325555725485, + "learning_rate": 8.122827727469737e-06, + "loss": 0.0847, + "step": 279 + }, + { + "epoch": 0.5725971370143149, + "grad_norm": 2.06473154517661, + "learning_rate": 8.110268196023179e-06, + "loss": 0.0923, + "step": 280 + }, + { + "epoch": 0.5746421267893661, + "grad_norm": 1.7347784233467545, + "learning_rate": 8.097676570897814e-06, + "loss": 0.0767, + "step": 281 + }, + { + "epoch": 0.5766871165644172, + "grad_norm": 1.7284531347044014, + "learning_rate": 8.085052982021849e-06, + "loss": 0.0822, + "step": 282 + }, + { + "epoch": 0.5787321063394683, + "grad_norm": 2.0234039627173863, + "learning_rate": 8.072397559653314e-06, + "loss": 0.0903, + "step": 283 + }, + { + "epoch": 0.5807770961145194, + "grad_norm": 1.8567076129812703, + "learning_rate": 8.059710434378717e-06, + "loss": 0.0829, + "step": 284 + }, + { + "epoch": 0.5828220858895705, + "grad_norm": 1.8280706428554012, + "learning_rate": 8.046991737111696e-06, + "loss": 0.0846, + "step": 285 + }, + { + "epoch": 0.5848670756646217, + "grad_norm": 1.6827693552674245, + "learning_rate": 8.034241599091666e-06, + "loss": 0.0744, + "step": 286 + }, + { + "epoch": 0.5869120654396728, + "grad_norm": 1.4276933688240632, + "learning_rate": 8.021460151882472e-06, + "loss": 0.0644, + "step": 287 + }, + { + "epoch": 0.588957055214724, + "grad_norm": 1.7054089254136917, + "learning_rate": 8.008647527371022e-06, + "loss": 0.0691, + "step": 288 + }, + { + "epoch": 0.591002044989775, + "grad_norm": 2.3943112344962616, + "learning_rate": 7.995803857765934e-06, + "loss": 0.1105, + "step": 289 + }, + { + "epoch": 0.5930470347648262, + "grad_norm": 2.025612566291375, + "learning_rate": 7.982929275596164e-06, + "loss": 0.0936, + "step": 290 + }, + { + "epoch": 0.5950920245398773, + "grad_norm": 2.0696844237753984, + "learning_rate": 7.970023913709652e-06, + "loss": 0.0916, + "step": 291 + }, + { + "epoch": 0.5971370143149284, + "grad_norm": 2.1125496705836184, + "learning_rate": 7.957087905271934e-06, + "loss": 0.0812, + "step": 292 + }, + { + "epoch": 0.5991820040899796, + "grad_norm": 1.9111826855162881, + "learning_rate": 7.944121383764775e-06, + "loss": 0.0878, + "step": 293 + }, + { + "epoch": 0.6012269938650306, + "grad_norm": 2.0166887475359507, + "learning_rate": 7.931124482984802e-06, + "loss": 0.088, + "step": 294 + }, + { + "epoch": 0.6032719836400818, + "grad_norm": 2.4597183492348145, + "learning_rate": 7.918097337042106e-06, + "loss": 0.1066, + "step": 295 + }, + { + "epoch": 0.6053169734151329, + "grad_norm": 1.7705184105320022, + "learning_rate": 7.905040080358869e-06, + "loss": 0.0784, + "step": 296 + }, + { + "epoch": 0.6073619631901841, + "grad_norm": 1.7246778829446732, + "learning_rate": 7.891952847667973e-06, + "loss": 0.0777, + "step": 297 + }, + { + "epoch": 0.6094069529652352, + "grad_norm": 2.1760471200028593, + "learning_rate": 7.878835774011615e-06, + "loss": 0.0983, + "step": 298 + }, + { + "epoch": 0.6114519427402862, + "grad_norm": 2.1592710885226327, + "learning_rate": 7.865688994739907e-06, + "loss": 0.0996, + "step": 299 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 1.7446253812062307, + "learning_rate": 7.85251264550948e-06, + "loss": 0.0767, + "step": 300 + }, + { + "epoch": 0.6155419222903885, + "grad_norm": 2.784714583612841, + "learning_rate": 7.83930686228209e-06, + "loss": 0.0871, + "step": 301 + }, + { + "epoch": 0.6175869120654397, + "grad_norm": 1.923087819950953, + "learning_rate": 7.826071781323208e-06, + "loss": 0.076, + "step": 302 + }, + { + "epoch": 0.6196319018404908, + "grad_norm": 1.78632914754461, + "learning_rate": 7.812807539200622e-06, + "loss": 0.0778, + "step": 303 + }, + { + "epoch": 0.621676891615542, + "grad_norm": 1.9376192118205642, + "learning_rate": 7.799514272783014e-06, + "loss": 0.0817, + "step": 304 + }, + { + "epoch": 0.623721881390593, + "grad_norm": 2.550158615394769, + "learning_rate": 7.786192119238568e-06, + "loss": 0.1057, + "step": 305 + }, + { + "epoch": 0.6257668711656442, + "grad_norm": 1.9711665467023245, + "learning_rate": 7.772841216033534e-06, + "loss": 0.0764, + "step": 306 + }, + { + "epoch": 0.6278118609406953, + "grad_norm": 1.5340501908307014, + "learning_rate": 7.759461700930824e-06, + "loss": 0.0637, + "step": 307 + }, + { + "epoch": 0.6298568507157464, + "grad_norm": 2.2338456267605005, + "learning_rate": 7.746053711988584e-06, + "loss": 0.1059, + "step": 308 + }, + { + "epoch": 0.6319018404907976, + "grad_norm": 1.7891397758115173, + "learning_rate": 7.732617387558769e-06, + "loss": 0.0824, + "step": 309 + }, + { + "epoch": 0.6339468302658486, + "grad_norm": 2.1234757737848287, + "learning_rate": 7.719152866285722e-06, + "loss": 0.0885, + "step": 310 + }, + { + "epoch": 0.6359918200408998, + "grad_norm": 2.4102510823654457, + "learning_rate": 7.70566028710473e-06, + "loss": 0.0996, + "step": 311 + }, + { + "epoch": 0.6380368098159509, + "grad_norm": 1.9375735772859437, + "learning_rate": 7.692139789240611e-06, + "loss": 0.091, + "step": 312 + }, + { + "epoch": 0.6400817995910021, + "grad_norm": 2.0158092912142824, + "learning_rate": 7.678591512206254e-06, + "loss": 0.088, + "step": 313 + }, + { + "epoch": 0.6421267893660532, + "grad_norm": 1.6480327933319945, + "learning_rate": 7.665015595801198e-06, + "loss": 0.0791, + "step": 314 + }, + { + "epoch": 0.6441717791411042, + "grad_norm": 1.8510030476483572, + "learning_rate": 7.651412180110176e-06, + "loss": 0.085, + "step": 315 + }, + { + "epoch": 0.6462167689161554, + "grad_norm": 1.592679706462086, + "learning_rate": 7.637781405501682e-06, + "loss": 0.0719, + "step": 316 + }, + { + "epoch": 0.6482617586912065, + "grad_norm": 1.871195454539005, + "learning_rate": 7.6241234126265115e-06, + "loss": 0.0935, + "step": 317 + }, + { + "epoch": 0.6503067484662577, + "grad_norm": 2.1635066751175978, + "learning_rate": 7.61043834241632e-06, + "loss": 0.0887, + "step": 318 + }, + { + "epoch": 0.6523517382413088, + "grad_norm": 1.7458256267250807, + "learning_rate": 7.596726336082158e-06, + "loss": 0.0784, + "step": 319 + }, + { + "epoch": 0.65439672801636, + "grad_norm": 1.9970410164681027, + "learning_rate": 7.5829875351130224e-06, + "loss": 0.0825, + "step": 320 + }, + { + "epoch": 0.656441717791411, + "grad_norm": 1.8581711995026613, + "learning_rate": 7.569222081274396e-06, + "loss": 0.074, + "step": 321 + }, + { + "epoch": 0.6584867075664622, + "grad_norm": 1.5023298192040886, + "learning_rate": 7.555430116606778e-06, + "loss": 0.0707, + "step": 322 + }, + { + "epoch": 0.6605316973415133, + "grad_norm": 1.9742828072984793, + "learning_rate": 7.5416117834242254e-06, + "loss": 0.0839, + "step": 323 + }, + { + "epoch": 0.6625766871165644, + "grad_norm": 1.7579407302668417, + "learning_rate": 7.527767224312883e-06, + "loss": 0.0802, + "step": 324 + }, + { + "epoch": 0.6646216768916156, + "grad_norm": 1.7128227508559022, + "learning_rate": 7.513896582129507e-06, + "loss": 0.0745, + "step": 325 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.9293198934120017, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0856, + "step": 326 + }, + { + "epoch": 0.6687116564417178, + "grad_norm": 2.0925311155648703, + "learning_rate": 7.4860776213179264e-06, + "loss": 0.0839, + "step": 327 + }, + { + "epoch": 0.6707566462167689, + "grad_norm": 2.082947312061181, + "learning_rate": 7.472129589743034e-06, + "loss": 0.0844, + "step": 328 + }, + { + "epoch": 0.6728016359918201, + "grad_norm": 2.0524639760050127, + "learning_rate": 7.458156049199775e-06, + "loss": 0.1008, + "step": 329 + }, + { + "epoch": 0.6748466257668712, + "grad_norm": 1.8254793507601215, + "learning_rate": 7.44415714387582e-06, + "loss": 0.0692, + "step": 330 + }, + { + "epoch": 0.6768916155419223, + "grad_norm": 1.9185120612100472, + "learning_rate": 7.430133018220567e-06, + "loss": 0.0902, + "step": 331 + }, + { + "epoch": 0.6789366053169734, + "grad_norm": 1.5528728788442376, + "learning_rate": 7.416083816943653e-06, + "loss": 0.0681, + "step": 332 + }, + { + "epoch": 0.6809815950920245, + "grad_norm": 1.8960655345457742, + "learning_rate": 7.4020096850134635e-06, + "loss": 0.0862, + "step": 333 + }, + { + "epoch": 0.6830265848670757, + "grad_norm": 1.8164525363712967, + "learning_rate": 7.38791076765563e-06, + "loss": 0.08, + "step": 334 + }, + { + "epoch": 0.6850715746421268, + "grad_norm": 1.8489841001332317, + "learning_rate": 7.37378721035154e-06, + "loss": 0.0863, + "step": 335 + }, + { + "epoch": 0.6871165644171779, + "grad_norm": 1.9227410779505356, + "learning_rate": 7.359639158836828e-06, + "loss": 0.0797, + "step": 336 + }, + { + "epoch": 0.689161554192229, + "grad_norm": 2.1782307041733855, + "learning_rate": 7.345466759099875e-06, + "loss": 0.0946, + "step": 337 + }, + { + "epoch": 0.6912065439672802, + "grad_norm": 2.1346962188887626, + "learning_rate": 7.331270157380304e-06, + "loss": 0.0953, + "step": 338 + }, + { + "epoch": 0.6932515337423313, + "grad_norm": 1.759960430802437, + "learning_rate": 7.317049500167466e-06, + "loss": 0.0969, + "step": 339 + }, + { + "epoch": 0.6952965235173824, + "grad_norm": 2.0404870097493646, + "learning_rate": 7.302804934198937e-06, + "loss": 0.0852, + "step": 340 + }, + { + "epoch": 0.6973415132924335, + "grad_norm": 2.3585223108650037, + "learning_rate": 7.28853660645899e-06, + "loss": 0.1054, + "step": 341 + }, + { + "epoch": 0.6993865030674846, + "grad_norm": 1.8518134360019116, + "learning_rate": 7.2742446641770985e-06, + "loss": 0.0942, + "step": 342 + }, + { + "epoch": 0.7014314928425358, + "grad_norm": 1.6802043170675642, + "learning_rate": 7.259929254826393e-06, + "loss": 0.0703, + "step": 343 + }, + { + "epoch": 0.7034764826175869, + "grad_norm": 2.3222003347544233, + "learning_rate": 7.2455905261221585e-06, + "loss": 0.0981, + "step": 344 + }, + { + "epoch": 0.7055214723926381, + "grad_norm": 1.7096656290299208, + "learning_rate": 7.231228626020303e-06, + "loss": 0.0686, + "step": 345 + }, + { + "epoch": 0.7075664621676891, + "grad_norm": 2.301527792978425, + "learning_rate": 7.216843702715831e-06, + "loss": 0.0806, + "step": 346 + }, + { + "epoch": 0.7096114519427403, + "grad_norm": 1.7573853731950437, + "learning_rate": 7.202435904641316e-06, + "loss": 0.0766, + "step": 347 + }, + { + "epoch": 0.7116564417177914, + "grad_norm": 1.882419627052227, + "learning_rate": 7.188005380465365e-06, + "loss": 0.0733, + "step": 348 + }, + { + "epoch": 0.7137014314928425, + "grad_norm": 2.470103268920824, + "learning_rate": 7.173552279091087e-06, + "loss": 0.1016, + "step": 349 + }, + { + "epoch": 0.7157464212678937, + "grad_norm": 1.4869158817717396, + "learning_rate": 7.159076749654559e-06, + "loss": 0.0624, + "step": 350 + }, + { + "epoch": 0.7177914110429447, + "grad_norm": 1.5968050085844632, + "learning_rate": 7.144578941523283e-06, + "loss": 0.0707, + "step": 351 + }, + { + "epoch": 0.7198364008179959, + "grad_norm": 1.6356481647041587, + "learning_rate": 7.130059004294647e-06, + "loss": 0.066, + "step": 352 + }, + { + "epoch": 0.721881390593047, + "grad_norm": 2.9392656768707504, + "learning_rate": 7.115517087794381e-06, + "loss": 0.1009, + "step": 353 + }, + { + "epoch": 0.7239263803680982, + "grad_norm": 2.2918804151158065, + "learning_rate": 7.10095334207501e-06, + "loss": 0.0962, + "step": 354 + }, + { + "epoch": 0.7259713701431493, + "grad_norm": 1.8475331071622312, + "learning_rate": 7.086367917414307e-06, + "loss": 0.082, + "step": 355 + }, + { + "epoch": 0.7280163599182005, + "grad_norm": 1.9726367085045817, + "learning_rate": 7.071760964313739e-06, + "loss": 0.0732, + "step": 356 + }, + { + "epoch": 0.7300613496932515, + "grad_norm": 2.1502810171764244, + "learning_rate": 7.057132633496924e-06, + "loss": 0.1049, + "step": 357 + }, + { + "epoch": 0.7321063394683026, + "grad_norm": 1.8592273232420053, + "learning_rate": 7.042483075908062e-06, + "loss": 0.0862, + "step": 358 + }, + { + "epoch": 0.7341513292433538, + "grad_norm": 2.355170511162385, + "learning_rate": 7.027812442710385e-06, + "loss": 0.0937, + "step": 359 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 1.6779561691380307, + "learning_rate": 7.013120885284599e-06, + "loss": 0.0675, + "step": 360 + }, + { + "epoch": 0.7382413087934561, + "grad_norm": 2.3918539767349762, + "learning_rate": 6.9984085552273136e-06, + "loss": 0.0964, + "step": 361 + }, + { + "epoch": 0.7402862985685071, + "grad_norm": 2.0029627191660087, + "learning_rate": 6.983675604349492e-06, + "loss": 0.0808, + "step": 362 + }, + { + "epoch": 0.7423312883435583, + "grad_norm": 2.361971189154723, + "learning_rate": 6.968922184674868e-06, + "loss": 0.0902, + "step": 363 + }, + { + "epoch": 0.7443762781186094, + "grad_norm": 1.7941380948957237, + "learning_rate": 6.954148448438389e-06, + "loss": 0.093, + "step": 364 + }, + { + "epoch": 0.7464212678936605, + "grad_norm": 1.8475776231124883, + "learning_rate": 6.9393545480846405e-06, + "loss": 0.0803, + "step": 365 + }, + { + "epoch": 0.7484662576687117, + "grad_norm": 1.391463463223123, + "learning_rate": 6.924540636266272e-06, + "loss": 0.0604, + "step": 366 + }, + { + "epoch": 0.7505112474437627, + "grad_norm": 1.4587955996368223, + "learning_rate": 6.909706865842429e-06, + "loss": 0.0707, + "step": 367 + }, + { + "epoch": 0.7525562372188139, + "grad_norm": 1.4497943658621633, + "learning_rate": 6.894853389877163e-06, + "loss": 0.0562, + "step": 368 + }, + { + "epoch": 0.754601226993865, + "grad_norm": 2.2816948101972474, + "learning_rate": 6.879980361637865e-06, + "loss": 0.0933, + "step": 369 + }, + { + "epoch": 0.7566462167689162, + "grad_norm": 2.2765511971102925, + "learning_rate": 6.86508793459368e-06, + "loss": 0.0799, + "step": 370 + }, + { + "epoch": 0.7586912065439673, + "grad_norm": 1.8489677964373195, + "learning_rate": 6.8501762624139125e-06, + "loss": 0.0828, + "step": 371 + }, + { + "epoch": 0.7607361963190185, + "grad_norm": 2.2599682805893244, + "learning_rate": 6.835245498966461e-06, + "loss": 0.1019, + "step": 372 + }, + { + "epoch": 0.7627811860940695, + "grad_norm": 1.7535048313819637, + "learning_rate": 6.820295798316214e-06, + "loss": 0.0877, + "step": 373 + }, + { + "epoch": 0.7648261758691206, + "grad_norm": 2.1348338096756962, + "learning_rate": 6.805327314723469e-06, + "loss": 0.0713, + "step": 374 + }, + { + "epoch": 0.7668711656441718, + "grad_norm": 1.471825773848477, + "learning_rate": 6.790340202642333e-06, + "loss": 0.0648, + "step": 375 + }, + { + "epoch": 0.7689161554192229, + "grad_norm": 1.9667987135525467, + "learning_rate": 6.775334616719136e-06, + "loss": 0.0933, + "step": 376 + }, + { + "epoch": 0.7709611451942741, + "grad_norm": 1.9656786527852497, + "learning_rate": 6.760310711790831e-06, + "loss": 0.0886, + "step": 377 + }, + { + "epoch": 0.7730061349693251, + "grad_norm": 1.7703569506269972, + "learning_rate": 6.7452686428834045e-06, + "loss": 0.0774, + "step": 378 + }, + { + "epoch": 0.7750511247443763, + "grad_norm": 2.247523798525931, + "learning_rate": 6.73020856521026e-06, + "loss": 0.1031, + "step": 379 + }, + { + "epoch": 0.7770961145194274, + "grad_norm": 1.872342874790795, + "learning_rate": 6.715130634170636e-06, + "loss": 0.0895, + "step": 380 + }, + { + "epoch": 0.7791411042944786, + "grad_norm": 2.070656684465323, + "learning_rate": 6.700035005347983e-06, + "loss": 0.0868, + "step": 381 + }, + { + "epoch": 0.7811860940695297, + "grad_norm": 2.2454799924898667, + "learning_rate": 6.6849218345083785e-06, + "loss": 0.0978, + "step": 382 + }, + { + "epoch": 0.7832310838445807, + "grad_norm": 1.891754000824279, + "learning_rate": 6.6697912775989045e-06, + "loss": 0.0785, + "step": 383 + }, + { + "epoch": 0.7852760736196319, + "grad_norm": 1.7579771296347333, + "learning_rate": 6.654643490746042e-06, + "loss": 0.0858, + "step": 384 + }, + { + "epoch": 0.787321063394683, + "grad_norm": 1.9641471370237964, + "learning_rate": 6.6394786302540645e-06, + "loss": 0.082, + "step": 385 + }, + { + "epoch": 0.7893660531697342, + "grad_norm": 1.8254808653521009, + "learning_rate": 6.624296852603419e-06, + "loss": 0.0882, + "step": 386 + }, + { + "epoch": 0.7914110429447853, + "grad_norm": 1.4088372814526477, + "learning_rate": 6.609098314449116e-06, + "loss": 0.0671, + "step": 387 + }, + { + "epoch": 0.7934560327198364, + "grad_norm": 1.9617841850743343, + "learning_rate": 6.593883172619111e-06, + "loss": 0.0933, + "step": 388 + }, + { + "epoch": 0.7955010224948875, + "grad_norm": 1.5767225526580313, + "learning_rate": 6.578651584112687e-06, + "loss": 0.0636, + "step": 389 + }, + { + "epoch": 0.7975460122699386, + "grad_norm": 2.2228834140058336, + "learning_rate": 6.563403706098833e-06, + "loss": 0.1077, + "step": 390 + }, + { + "epoch": 0.7995910020449898, + "grad_norm": 1.9792433955524278, + "learning_rate": 6.5481396959146225e-06, + "loss": 0.0891, + "step": 391 + }, + { + "epoch": 0.8016359918200409, + "grad_norm": 1.2215680463568073, + "learning_rate": 6.532859711063594e-06, + "loss": 0.0563, + "step": 392 + }, + { + "epoch": 0.803680981595092, + "grad_norm": 1.6824250107088006, + "learning_rate": 6.517563909214119e-06, + "loss": 0.0783, + "step": 393 + }, + { + "epoch": 0.8057259713701431, + "grad_norm": 1.7462647827998714, + "learning_rate": 6.502252448197782e-06, + "loss": 0.0814, + "step": 394 + }, + { + "epoch": 0.8077709611451943, + "grad_norm": 1.3887650073154911, + "learning_rate": 6.486925486007743e-06, + "loss": 0.0641, + "step": 395 + }, + { + "epoch": 0.8098159509202454, + "grad_norm": 2.0714118588443613, + "learning_rate": 6.471583180797121e-06, + "loss": 0.1055, + "step": 396 + }, + { + "epoch": 0.8118609406952966, + "grad_norm": 1.5588416633458682, + "learning_rate": 6.456225690877345e-06, + "loss": 0.0744, + "step": 397 + }, + { + "epoch": 0.8139059304703476, + "grad_norm": 1.6448175082442864, + "learning_rate": 6.440853174716535e-06, + "loss": 0.0679, + "step": 398 + }, + { + "epoch": 0.8159509202453987, + "grad_norm": 1.7938499571539583, + "learning_rate": 6.4254657909378615e-06, + "loss": 0.0701, + "step": 399 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 2.1584932014661606, + "learning_rate": 6.410063698317901e-06, + "loss": 0.0896, + "step": 400 + }, + { + "epoch": 0.8179959100204499, + "eval_loss": 0.08662194758653641, + "eval_runtime": 1.5943, + "eval_samples_per_second": 25.089, + "eval_steps_per_second": 6.272, + "step": 400 + }, + { + "epoch": 0.820040899795501, + "grad_norm": 1.6606377004284583, + "learning_rate": 6.394647055785017e-06, + "loss": 0.0699, + "step": 401 + }, + { + "epoch": 0.8220858895705522, + "grad_norm": 2.2914577704716113, + "learning_rate": 6.379216022417695e-06, + "loss": 0.0858, + "step": 402 + }, + { + "epoch": 0.8241308793456033, + "grad_norm": 1.7940636149724014, + "learning_rate": 6.363770757442927e-06, + "loss": 0.0838, + "step": 403 + }, + { + "epoch": 0.8261758691206544, + "grad_norm": 2.1090208330887363, + "learning_rate": 6.348311420234542e-06, + "loss": 0.0837, + "step": 404 + }, + { + "epoch": 0.8282208588957055, + "grad_norm": 1.761887269760676, + "learning_rate": 6.332838170311586e-06, + "loss": 0.0791, + "step": 405 + }, + { + "epoch": 0.8302658486707567, + "grad_norm": 2.0316688681749846, + "learning_rate": 6.31735116733666e-06, + "loss": 0.0762, + "step": 406 + }, + { + "epoch": 0.8323108384458078, + "grad_norm": 1.4824433767272704, + "learning_rate": 6.301850571114282e-06, + "loss": 0.0531, + "step": 407 + }, + { + "epoch": 0.8343558282208589, + "grad_norm": 1.9042239460112056, + "learning_rate": 6.286336541589224e-06, + "loss": 0.0685, + "step": 408 + }, + { + "epoch": 0.83640081799591, + "grad_norm": 1.631266470020269, + "learning_rate": 6.270809238844881e-06, + "loss": 0.0713, + "step": 409 + }, + { + "epoch": 0.8384458077709611, + "grad_norm": 1.8805596275114955, + "learning_rate": 6.255268823101604e-06, + "loss": 0.0751, + "step": 410 + }, + { + "epoch": 0.8404907975460123, + "grad_norm": 2.295370695981097, + "learning_rate": 6.239715454715054e-06, + "loss": 0.0984, + "step": 411 + }, + { + "epoch": 0.8425357873210634, + "grad_norm": 2.269325740615013, + "learning_rate": 6.224149294174549e-06, + "loss": 0.0966, + "step": 412 + }, + { + "epoch": 0.8445807770961146, + "grad_norm": 2.060132528646075, + "learning_rate": 6.208570502101393e-06, + "loss": 0.0817, + "step": 413 + }, + { + "epoch": 0.8466257668711656, + "grad_norm": 1.8016710838966334, + "learning_rate": 6.192979239247243e-06, + "loss": 0.0858, + "step": 414 + }, + { + "epoch": 0.8486707566462167, + "grad_norm": 1.9922284651178528, + "learning_rate": 6.177375666492431e-06, + "loss": 0.0735, + "step": 415 + }, + { + "epoch": 0.8507157464212679, + "grad_norm": 1.689681220388234, + "learning_rate": 6.161759944844308e-06, + "loss": 0.0756, + "step": 416 + }, + { + "epoch": 0.852760736196319, + "grad_norm": 2.618019309211191, + "learning_rate": 6.146132235435591e-06, + "loss": 0.0829, + "step": 417 + }, + { + "epoch": 0.8548057259713702, + "grad_norm": 2.0274624599323414, + "learning_rate": 6.1304926995226895e-06, + "loss": 0.0836, + "step": 418 + }, + { + "epoch": 0.8568507157464212, + "grad_norm": 2.0858291852426496, + "learning_rate": 6.114841498484049e-06, + "loss": 0.09, + "step": 419 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 1.656532684919004, + "learning_rate": 6.099178793818479e-06, + "loss": 0.0674, + "step": 420 + }, + { + "epoch": 0.8609406952965235, + "grad_norm": 1.781888769859481, + "learning_rate": 6.083504747143496e-06, + "loss": 0.0706, + "step": 421 + }, + { + "epoch": 0.8629856850715747, + "grad_norm": 2.2606057911008217, + "learning_rate": 6.0678195201936455e-06, + "loss": 0.0969, + "step": 422 + }, + { + "epoch": 0.8650306748466258, + "grad_norm": 2.3434090242083943, + "learning_rate": 6.0521232748188416e-06, + "loss": 0.1064, + "step": 423 + }, + { + "epoch": 0.8670756646216768, + "grad_norm": 2.064354269601007, + "learning_rate": 6.0364161729826905e-06, + "loss": 0.0896, + "step": 424 + }, + { + "epoch": 0.869120654396728, + "grad_norm": 1.7331387406948884, + "learning_rate": 6.020698376760824e-06, + "loss": 0.0753, + "step": 425 + }, + { + "epoch": 0.8711656441717791, + "grad_norm": 1.6248452960794957, + "learning_rate": 6.0049700483392256e-06, + "loss": 0.0683, + "step": 426 + }, + { + "epoch": 0.8732106339468303, + "grad_norm": 1.7788246413520943, + "learning_rate": 5.9892313500125545e-06, + "loss": 0.0808, + "step": 427 + }, + { + "epoch": 0.8752556237218814, + "grad_norm": 1.6403389067415772, + "learning_rate": 5.9734824441824745e-06, + "loss": 0.0763, + "step": 428 + }, + { + "epoch": 0.8773006134969326, + "grad_norm": 1.968967047123883, + "learning_rate": 5.957723493355977e-06, + "loss": 0.0946, + "step": 429 + }, + { + "epoch": 0.8793456032719836, + "grad_norm": 1.5050654888065231, + "learning_rate": 5.941954660143703e-06, + "loss": 0.0673, + "step": 430 + }, + { + "epoch": 0.8813905930470347, + "grad_norm": 1.5627708754572884, + "learning_rate": 5.926176107258265e-06, + "loss": 0.0662, + "step": 431 + }, + { + "epoch": 0.8834355828220859, + "grad_norm": 1.9429047212464141, + "learning_rate": 5.910387997512573e-06, + "loss": 0.0845, + "step": 432 + }, + { + "epoch": 0.885480572597137, + "grad_norm": 1.8862289067048144, + "learning_rate": 5.894590493818149e-06, + "loss": 0.074, + "step": 433 + }, + { + "epoch": 0.8875255623721882, + "grad_norm": 1.4871525287185456, + "learning_rate": 5.8787837591834415e-06, + "loss": 0.0642, + "step": 434 + }, + { + "epoch": 0.8895705521472392, + "grad_norm": 1.9230413221781277, + "learning_rate": 5.86296795671216e-06, + "loss": 0.0854, + "step": 435 + }, + { + "epoch": 0.8916155419222904, + "grad_norm": 1.8042936065902104, + "learning_rate": 5.847143249601575e-06, + "loss": 0.0733, + "step": 436 + }, + { + "epoch": 0.8936605316973415, + "grad_norm": 1.89659500750371, + "learning_rate": 5.831309801140841e-06, + "loss": 0.0717, + "step": 437 + }, + { + "epoch": 0.8957055214723927, + "grad_norm": 1.988875729296592, + "learning_rate": 5.815467774709314e-06, + "loss": 0.0901, + "step": 438 + }, + { + "epoch": 0.8977505112474438, + "grad_norm": 2.1651335543706365, + "learning_rate": 5.799617333774861e-06, + "loss": 0.0942, + "step": 439 + }, + { + "epoch": 0.8997955010224948, + "grad_norm": 1.694629036784553, + "learning_rate": 5.783758641892172e-06, + "loss": 0.0691, + "step": 440 + }, + { + "epoch": 0.901840490797546, + "grad_norm": 1.8724577454949232, + "learning_rate": 5.767891862701081e-06, + "loss": 0.0704, + "step": 441 + }, + { + "epoch": 0.9038854805725971, + "grad_norm": 2.1444156343749103, + "learning_rate": 5.7520171599248704e-06, + "loss": 0.0862, + "step": 442 + }, + { + "epoch": 0.9059304703476483, + "grad_norm": 1.6044981562664562, + "learning_rate": 5.73613469736858e-06, + "loss": 0.0695, + "step": 443 + }, + { + "epoch": 0.9079754601226994, + "grad_norm": 1.7887677604270025, + "learning_rate": 5.7202446389173225e-06, + "loss": 0.0776, + "step": 444 + }, + { + "epoch": 0.9100204498977505, + "grad_norm": 2.0623558286912487, + "learning_rate": 5.704347148534589e-06, + "loss": 0.0939, + "step": 445 + }, + { + "epoch": 0.9120654396728016, + "grad_norm": 1.7656943163705168, + "learning_rate": 5.688442390260559e-06, + "loss": 0.0699, + "step": 446 + }, + { + "epoch": 0.9141104294478528, + "grad_norm": 1.950808092154816, + "learning_rate": 5.672530528210405e-06, + "loss": 0.0764, + "step": 447 + }, + { + "epoch": 0.9161554192229039, + "grad_norm": 1.5958859437062274, + "learning_rate": 5.656611726572601e-06, + "loss": 0.0707, + "step": 448 + }, + { + "epoch": 0.918200408997955, + "grad_norm": 2.106375056034876, + "learning_rate": 5.640686149607228e-06, + "loss": 0.0884, + "step": 449 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 1.6944267542875595, + "learning_rate": 5.624753961644281e-06, + "loss": 0.0705, + "step": 450 + }, + { + "epoch": 0.9222903885480572, + "grad_norm": 1.7841030194649183, + "learning_rate": 5.608815327081969e-06, + "loss": 0.0765, + "step": 451 + }, + { + "epoch": 0.9243353783231084, + "grad_norm": 1.7712077615995716, + "learning_rate": 5.592870410385021e-06, + "loss": 0.0733, + "step": 452 + }, + { + "epoch": 0.9263803680981595, + "grad_norm": 2.1116933877527835, + "learning_rate": 5.57691937608299e-06, + "loss": 0.0913, + "step": 453 + }, + { + "epoch": 0.9284253578732107, + "grad_norm": 1.4163030121649893, + "learning_rate": 5.560962388768554e-06, + "loss": 0.0545, + "step": 454 + }, + { + "epoch": 0.9304703476482618, + "grad_norm": 1.810312240995325, + "learning_rate": 5.5449996130958185e-06, + "loss": 0.0754, + "step": 455 + }, + { + "epoch": 0.9325153374233128, + "grad_norm": 1.7804851440319986, + "learning_rate": 5.529031213778615e-06, + "loss": 0.0647, + "step": 456 + }, + { + "epoch": 0.934560327198364, + "grad_norm": 2.2045196131947624, + "learning_rate": 5.513057355588804e-06, + "loss": 0.0891, + "step": 457 + }, + { + "epoch": 0.9366053169734151, + "grad_norm": 1.9852749682627289, + "learning_rate": 5.497078203354577e-06, + "loss": 0.0775, + "step": 458 + }, + { + "epoch": 0.9386503067484663, + "grad_norm": 1.831470663502445, + "learning_rate": 5.481093921958749e-06, + "loss": 0.0845, + "step": 459 + }, + { + "epoch": 0.9406952965235174, + "grad_norm": 2.11329473791922, + "learning_rate": 5.4651046763370615e-06, + "loss": 0.0797, + "step": 460 + }, + { + "epoch": 0.9427402862985685, + "grad_norm": 1.936029472084334, + "learning_rate": 5.449110631476481e-06, + "loss": 0.0626, + "step": 461 + }, + { + "epoch": 0.9447852760736196, + "grad_norm": 2.8880649224481254, + "learning_rate": 5.433111952413496e-06, + "loss": 0.0876, + "step": 462 + }, + { + "epoch": 0.9468302658486708, + "grad_norm": 1.6788136591444187, + "learning_rate": 5.417108804232409e-06, + "loss": 0.0802, + "step": 463 + }, + { + "epoch": 0.9488752556237219, + "grad_norm": 1.7603381558531794, + "learning_rate": 5.4011013520636466e-06, + "loss": 0.0711, + "step": 464 + }, + { + "epoch": 0.950920245398773, + "grad_norm": 1.6546291038527539, + "learning_rate": 5.385089761082039e-06, + "loss": 0.0718, + "step": 465 + }, + { + "epoch": 0.9529652351738241, + "grad_norm": 1.7527461122946937, + "learning_rate": 5.3690741965051255e-06, + "loss": 0.0772, + "step": 466 + }, + { + "epoch": 0.9550102249488752, + "grad_norm": 2.153339872012431, + "learning_rate": 5.353054823591446e-06, + "loss": 0.0984, + "step": 467 + }, + { + "epoch": 0.9570552147239264, + "grad_norm": 1.663490695062259, + "learning_rate": 5.3370318076388405e-06, + "loss": 0.0719, + "step": 468 + }, + { + "epoch": 0.9591002044989775, + "grad_norm": 2.039791879502307, + "learning_rate": 5.3210053139827374e-06, + "loss": 0.0852, + "step": 469 + }, + { + "epoch": 0.9611451942740287, + "grad_norm": 1.5152660819257473, + "learning_rate": 5.304975507994453e-06, + "loss": 0.0705, + "step": 470 + }, + { + "epoch": 0.9631901840490797, + "grad_norm": 2.5741046076702485, + "learning_rate": 5.288942555079479e-06, + "loss": 0.0841, + "step": 471 + }, + { + "epoch": 0.9652351738241309, + "grad_norm": 1.9038985725819735, + "learning_rate": 5.27290662067578e-06, + "loss": 0.0852, + "step": 472 + }, + { + "epoch": 0.967280163599182, + "grad_norm": 2.287787910789673, + "learning_rate": 5.256867870252087e-06, + "loss": 0.0943, + "step": 473 + }, + { + "epoch": 0.9693251533742331, + "grad_norm": 2.001848621526479, + "learning_rate": 5.240826469306187e-06, + "loss": 0.0784, + "step": 474 + }, + { + "epoch": 0.9713701431492843, + "grad_norm": 2.1169709747380865, + "learning_rate": 5.224782583363215e-06, + "loss": 0.0841, + "step": 475 + }, + { + "epoch": 0.9734151329243353, + "grad_norm": 1.929731685506713, + "learning_rate": 5.208736377973954e-06, + "loss": 0.0749, + "step": 476 + }, + { + "epoch": 0.9754601226993865, + "grad_norm": 1.67776042592261, + "learning_rate": 5.1926880187131134e-06, + "loss": 0.0724, + "step": 477 + }, + { + "epoch": 0.9775051124744376, + "grad_norm": 2.298605193205057, + "learning_rate": 5.176637671177631e-06, + "loss": 0.1006, + "step": 478 + }, + { + "epoch": 0.9795501022494888, + "grad_norm": 1.6533628146778505, + "learning_rate": 5.160585500984962e-06, + "loss": 0.0646, + "step": 479 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 1.837841443310845, + "learning_rate": 5.144531673771364e-06, + "loss": 0.0735, + "step": 480 + }, + { + "epoch": 0.983640081799591, + "grad_norm": 2.0263842675819426, + "learning_rate": 5.1284763551901995e-06, + "loss": 0.0826, + "step": 481 + }, + { + "epoch": 0.9856850715746421, + "grad_norm": 1.7313226963602824, + "learning_rate": 5.112419710910213e-06, + "loss": 0.0672, + "step": 482 + }, + { + "epoch": 0.9877300613496932, + "grad_norm": 2.015542364940025, + "learning_rate": 5.096361906613836e-06, + "loss": 0.0782, + "step": 483 + }, + { + "epoch": 0.9897750511247444, + "grad_norm": 1.514253640731746, + "learning_rate": 5.080303107995461e-06, + "loss": 0.0737, + "step": 484 + }, + { + "epoch": 0.9918200408997955, + "grad_norm": 1.8395138697043611, + "learning_rate": 5.064243480759749e-06, + "loss": 0.0718, + "step": 485 + }, + { + "epoch": 0.9938650306748467, + "grad_norm": 1.884215459624302, + "learning_rate": 5.048183190619904e-06, + "loss": 0.0698, + "step": 486 + }, + { + "epoch": 0.9959100204498977, + "grad_norm": 2.146037958767005, + "learning_rate": 5.032122403295977e-06, + "loss": 0.0902, + "step": 487 + }, + { + "epoch": 0.9979550102249489, + "grad_norm": 2.050214345057994, + "learning_rate": 5.016061284513142e-06, + "loss": 0.0682, + "step": 488 + }, + { + "epoch": 1.0, + "grad_norm": 2.118455250575704, + "learning_rate": 5e-06, + "loss": 0.0774, + "step": 489 + }, + { + "epoch": 1.0020449897750512, + "grad_norm": 1.2462794649364155, + "learning_rate": 4.983938715486858e-06, + "loss": 0.033, + "step": 490 + }, + { + "epoch": 1.0040899795501022, + "grad_norm": 1.1459651928513004, + "learning_rate": 4.967877596704026e-06, + "loss": 0.0332, + "step": 491 + }, + { + "epoch": 1.0061349693251533, + "grad_norm": 1.2067420298473397, + "learning_rate": 4.951816809380098e-06, + "loss": 0.0286, + "step": 492 + }, + { + "epoch": 1.0081799591002045, + "grad_norm": 1.5512568893071932, + "learning_rate": 4.935756519240253e-06, + "loss": 0.0371, + "step": 493 + }, + { + "epoch": 1.0102249488752557, + "grad_norm": 1.0286360100738143, + "learning_rate": 4.919696892004539e-06, + "loss": 0.0302, + "step": 494 + }, + { + "epoch": 1.0122699386503067, + "grad_norm": 1.1516477911852547, + "learning_rate": 4.903638093386167e-06, + "loss": 0.0369, + "step": 495 + }, + { + "epoch": 1.0143149284253579, + "grad_norm": 1.245679943150789, + "learning_rate": 4.887580289089788e-06, + "loss": 0.0301, + "step": 496 + }, + { + "epoch": 1.016359918200409, + "grad_norm": 1.703967009754419, + "learning_rate": 4.871523644809802e-06, + "loss": 0.0466, + "step": 497 + }, + { + "epoch": 1.01840490797546, + "grad_norm": 1.546303848260575, + "learning_rate": 4.855468326228638e-06, + "loss": 0.0318, + "step": 498 + }, + { + "epoch": 1.0204498977505112, + "grad_norm": 1.251270953754372, + "learning_rate": 4.839414499015041e-06, + "loss": 0.0263, + "step": 499 + }, + { + "epoch": 1.0224948875255624, + "grad_norm": 1.3764180941343123, + "learning_rate": 4.82336232882237e-06, + "loss": 0.0345, + "step": 500 + }, + { + "epoch": 1.0245398773006136, + "grad_norm": 1.3372315078519637, + "learning_rate": 4.807311981286888e-06, + "loss": 0.0292, + "step": 501 + }, + { + "epoch": 1.0265848670756645, + "grad_norm": 1.544786930842698, + "learning_rate": 4.791263622026048e-06, + "loss": 0.0307, + "step": 502 + }, + { + "epoch": 1.0286298568507157, + "grad_norm": 1.3878895340607698, + "learning_rate": 4.775217416636786e-06, + "loss": 0.0326, + "step": 503 + }, + { + "epoch": 1.030674846625767, + "grad_norm": 1.6162202894342939, + "learning_rate": 4.7591735306938144e-06, + "loss": 0.0352, + "step": 504 + }, + { + "epoch": 1.032719836400818, + "grad_norm": 1.7805340361439255, + "learning_rate": 4.7431321297479135e-06, + "loss": 0.0372, + "step": 505 + }, + { + "epoch": 1.034764826175869, + "grad_norm": 1.758896549825897, + "learning_rate": 4.727093379324222e-06, + "loss": 0.0372, + "step": 506 + }, + { + "epoch": 1.0368098159509203, + "grad_norm": 1.431786070102582, + "learning_rate": 4.711057444920522e-06, + "loss": 0.0384, + "step": 507 + }, + { + "epoch": 1.0388548057259714, + "grad_norm": 1.4839513210400883, + "learning_rate": 4.6950244920055475e-06, + "loss": 0.0383, + "step": 508 + }, + { + "epoch": 1.0408997955010224, + "grad_norm": 1.4451001891145334, + "learning_rate": 4.678994686017263e-06, + "loss": 0.0352, + "step": 509 + }, + { + "epoch": 1.0429447852760736, + "grad_norm": 1.866286088536185, + "learning_rate": 4.662968192361161e-06, + "loss": 0.0395, + "step": 510 + }, + { + "epoch": 1.0449897750511248, + "grad_norm": 1.715211234345797, + "learning_rate": 4.646945176408555e-06, + "loss": 0.0313, + "step": 511 + }, + { + "epoch": 1.047034764826176, + "grad_norm": 1.466087161229122, + "learning_rate": 4.630925803494877e-06, + "loss": 0.0386, + "step": 512 + }, + { + "epoch": 1.049079754601227, + "grad_norm": 1.9408338143723025, + "learning_rate": 4.614910238917963e-06, + "loss": 0.042, + "step": 513 + }, + { + "epoch": 1.0511247443762781, + "grad_norm": 1.4266586107150059, + "learning_rate": 4.598898647936354e-06, + "loss": 0.0392, + "step": 514 + }, + { + "epoch": 1.0531697341513293, + "grad_norm": 1.2356672924767518, + "learning_rate": 4.582891195767591e-06, + "loss": 0.0263, + "step": 515 + }, + { + "epoch": 1.0552147239263803, + "grad_norm": 1.7715488824913876, + "learning_rate": 4.5668880475865074e-06, + "loss": 0.0405, + "step": 516 + }, + { + "epoch": 1.0572597137014315, + "grad_norm": 1.5415710251918855, + "learning_rate": 4.55088936852352e-06, + "loss": 0.0357, + "step": 517 + }, + { + "epoch": 1.0593047034764826, + "grad_norm": 1.4428168778927553, + "learning_rate": 4.534895323662939e-06, + "loss": 0.0317, + "step": 518 + }, + { + "epoch": 1.0613496932515338, + "grad_norm": 2.2250878381040953, + "learning_rate": 4.518906078041252e-06, + "loss": 0.0415, + "step": 519 + }, + { + "epoch": 1.0633946830265848, + "grad_norm": 1.9143713762008936, + "learning_rate": 4.502921796645424e-06, + "loss": 0.0525, + "step": 520 + }, + { + "epoch": 1.065439672801636, + "grad_norm": 1.3675839835879693, + "learning_rate": 4.486942644411197e-06, + "loss": 0.0308, + "step": 521 + }, + { + "epoch": 1.0674846625766872, + "grad_norm": 1.561173769749881, + "learning_rate": 4.4709687862213866e-06, + "loss": 0.0314, + "step": 522 + }, + { + "epoch": 1.0695296523517381, + "grad_norm": 1.7909630810420591, + "learning_rate": 4.455000386904185e-06, + "loss": 0.0434, + "step": 523 + }, + { + "epoch": 1.0715746421267893, + "grad_norm": 1.5523968118402804, + "learning_rate": 4.439037611231448e-06, + "loss": 0.0303, + "step": 524 + }, + { + "epoch": 1.0736196319018405, + "grad_norm": 1.4974277205131226, + "learning_rate": 4.423080623917012e-06, + "loss": 0.026, + "step": 525 + }, + { + "epoch": 1.0756646216768917, + "grad_norm": 1.4404941302278607, + "learning_rate": 4.40712958961498e-06, + "loss": 0.0439, + "step": 526 + }, + { + "epoch": 1.0777096114519427, + "grad_norm": 1.4209943648710208, + "learning_rate": 4.391184672918034e-06, + "loss": 0.033, + "step": 527 + }, + { + "epoch": 1.0797546012269938, + "grad_norm": 1.555014037990696, + "learning_rate": 4.3752460383557195e-06, + "loss": 0.0326, + "step": 528 + }, + { + "epoch": 1.081799591002045, + "grad_norm": 2.090310209970452, + "learning_rate": 4.3593138503927725e-06, + "loss": 0.0365, + "step": 529 + }, + { + "epoch": 1.0838445807770962, + "grad_norm": 1.4619403629973973, + "learning_rate": 4.3433882734274e-06, + "loss": 0.0317, + "step": 530 + }, + { + "epoch": 1.0858895705521472, + "grad_norm": 1.714317746744886, + "learning_rate": 4.327469471789597e-06, + "loss": 0.0384, + "step": 531 + }, + { + "epoch": 1.0879345603271984, + "grad_norm": 1.590590872486597, + "learning_rate": 4.311557609739442e-06, + "loss": 0.0259, + "step": 532 + }, + { + "epoch": 1.0899795501022496, + "grad_norm": 1.1141369910738055, + "learning_rate": 4.295652851465412e-06, + "loss": 0.0252, + "step": 533 + }, + { + "epoch": 1.0920245398773005, + "grad_norm": 1.4321694078456395, + "learning_rate": 4.27975536108268e-06, + "loss": 0.0266, + "step": 534 + }, + { + "epoch": 1.0940695296523517, + "grad_norm": 1.7443372340301369, + "learning_rate": 4.263865302631423e-06, + "loss": 0.04, + "step": 535 + }, + { + "epoch": 1.096114519427403, + "grad_norm": 1.569615321607395, + "learning_rate": 4.24798284007513e-06, + "loss": 0.0349, + "step": 536 + }, + { + "epoch": 1.098159509202454, + "grad_norm": 1.895924350646276, + "learning_rate": 4.2321081372989195e-06, + "loss": 0.0424, + "step": 537 + }, + { + "epoch": 1.100204498977505, + "grad_norm": 1.5923290036258984, + "learning_rate": 4.216241358107831e-06, + "loss": 0.0327, + "step": 538 + }, + { + "epoch": 1.1022494887525562, + "grad_norm": 1.6634363551936802, + "learning_rate": 4.200382666225141e-06, + "loss": 0.0486, + "step": 539 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 1.134137554773839, + "learning_rate": 4.184532225290687e-06, + "loss": 0.0223, + "step": 540 + }, + { + "epoch": 1.1063394683026584, + "grad_norm": 1.426415380744953, + "learning_rate": 4.16869019885916e-06, + "loss": 0.0312, + "step": 541 + }, + { + "epoch": 1.1083844580777096, + "grad_norm": 1.1710483257547355, + "learning_rate": 4.152856750398426e-06, + "loss": 0.0223, + "step": 542 + }, + { + "epoch": 1.1104294478527608, + "grad_norm": 1.5906880940400463, + "learning_rate": 4.137032043287841e-06, + "loss": 0.0343, + "step": 543 + }, + { + "epoch": 1.112474437627812, + "grad_norm": 1.4350816491421392, + "learning_rate": 4.121216240816559e-06, + "loss": 0.035, + "step": 544 + }, + { + "epoch": 1.114519427402863, + "grad_norm": 1.7558271509204728, + "learning_rate": 4.105409506181855e-06, + "loss": 0.0378, + "step": 545 + }, + { + "epoch": 1.116564417177914, + "grad_norm": 1.427323382586195, + "learning_rate": 4.089612002487428e-06, + "loss": 0.0312, + "step": 546 + }, + { + "epoch": 1.1186094069529653, + "grad_norm": 1.6522808692997277, + "learning_rate": 4.0738238927417354e-06, + "loss": 0.0359, + "step": 547 + }, + { + "epoch": 1.1206543967280163, + "grad_norm": 2.1347991414153986, + "learning_rate": 4.0580453398563005e-06, + "loss": 0.0336, + "step": 548 + }, + { + "epoch": 1.1226993865030674, + "grad_norm": 1.1279124304678423, + "learning_rate": 4.042276506644024e-06, + "loss": 0.0245, + "step": 549 + }, + { + "epoch": 1.1247443762781186, + "grad_norm": 1.1967369793368918, + "learning_rate": 4.026517555817527e-06, + "loss": 0.034, + "step": 550 + }, + { + "epoch": 1.1267893660531698, + "grad_norm": 1.543038828112648, + "learning_rate": 4.010768649987446e-06, + "loss": 0.0323, + "step": 551 + }, + { + "epoch": 1.1288343558282208, + "grad_norm": 2.005013019238116, + "learning_rate": 3.995029951660777e-06, + "loss": 0.0466, + "step": 552 + }, + { + "epoch": 1.130879345603272, + "grad_norm": 1.6139182636953708, + "learning_rate": 3.979301623239177e-06, + "loss": 0.0358, + "step": 553 + }, + { + "epoch": 1.1329243353783232, + "grad_norm": 1.3981520538108454, + "learning_rate": 3.963583827017311e-06, + "loss": 0.0377, + "step": 554 + }, + { + "epoch": 1.1349693251533743, + "grad_norm": 1.2494252534129648, + "learning_rate": 3.94787672518116e-06, + "loss": 0.0239, + "step": 555 + }, + { + "epoch": 1.1370143149284253, + "grad_norm": 2.204641878373284, + "learning_rate": 3.932180479806357e-06, + "loss": 0.0456, + "step": 556 + }, + { + "epoch": 1.1390593047034765, + "grad_norm": 1.565570789271408, + "learning_rate": 3.916495252856506e-06, + "loss": 0.0324, + "step": 557 + }, + { + "epoch": 1.1411042944785277, + "grad_norm": 1.6789711476896356, + "learning_rate": 3.900821206181521e-06, + "loss": 0.0368, + "step": 558 + }, + { + "epoch": 1.1431492842535786, + "grad_norm": 1.2216110777737839, + "learning_rate": 3.885158501515954e-06, + "loss": 0.0279, + "step": 559 + }, + { + "epoch": 1.1451942740286298, + "grad_norm": 1.3633350295226845, + "learning_rate": 3.869507300477311e-06, + "loss": 0.0328, + "step": 560 + }, + { + "epoch": 1.147239263803681, + "grad_norm": 1.4166924061545885, + "learning_rate": 3.853867764564409e-06, + "loss": 0.0329, + "step": 561 + }, + { + "epoch": 1.149284253578732, + "grad_norm": 1.6034718030946522, + "learning_rate": 3.838240055155692e-06, + "loss": 0.0334, + "step": 562 + }, + { + "epoch": 1.1513292433537832, + "grad_norm": 1.077423978891052, + "learning_rate": 3.8226243335075715e-06, + "loss": 0.0224, + "step": 563 + }, + { + "epoch": 1.1533742331288344, + "grad_norm": 1.3542581793543431, + "learning_rate": 3.8070207607527587e-06, + "loss": 0.0319, + "step": 564 + }, + { + "epoch": 1.1554192229038855, + "grad_norm": 1.6634879057734975, + "learning_rate": 3.7914294978986083e-06, + "loss": 0.0393, + "step": 565 + }, + { + "epoch": 1.1574642126789365, + "grad_norm": 1.628283419385412, + "learning_rate": 3.7758507058254547e-06, + "loss": 0.036, + "step": 566 + }, + { + "epoch": 1.1595092024539877, + "grad_norm": 1.5659369656664788, + "learning_rate": 3.760284545284947e-06, + "loss": 0.0277, + "step": 567 + }, + { + "epoch": 1.1615541922290389, + "grad_norm": 1.4921911012284916, + "learning_rate": 3.744731176898396e-06, + "loss": 0.0389, + "step": 568 + }, + { + "epoch": 1.16359918200409, + "grad_norm": 1.8188186488286036, + "learning_rate": 3.7291907611551197e-06, + "loss": 0.0521, + "step": 569 + }, + { + "epoch": 1.165644171779141, + "grad_norm": 1.7461144499706955, + "learning_rate": 3.7136634584107787e-06, + "loss": 0.0314, + "step": 570 + }, + { + "epoch": 1.1676891615541922, + "grad_norm": 1.5152391192451116, + "learning_rate": 3.69814942888572e-06, + "loss": 0.0395, + "step": 571 + }, + { + "epoch": 1.1697341513292434, + "grad_norm": 1.2278434943664795, + "learning_rate": 3.6826488326633393e-06, + "loss": 0.03, + "step": 572 + }, + { + "epoch": 1.1717791411042944, + "grad_norm": 1.3578782487107308, + "learning_rate": 3.6671618296884147e-06, + "loss": 0.0329, + "step": 573 + }, + { + "epoch": 1.1738241308793456, + "grad_norm": 1.0722909218685073, + "learning_rate": 3.6516885797654593e-06, + "loss": 0.024, + "step": 574 + }, + { + "epoch": 1.1758691206543967, + "grad_norm": 1.3302319522941752, + "learning_rate": 3.6362292425570754e-06, + "loss": 0.0281, + "step": 575 + }, + { + "epoch": 1.177914110429448, + "grad_norm": 1.574439290096253, + "learning_rate": 3.620783977582305e-06, + "loss": 0.0342, + "step": 576 + }, + { + "epoch": 1.179959100204499, + "grad_norm": 1.2172183727962829, + "learning_rate": 3.605352944214986e-06, + "loss": 0.026, + "step": 577 + }, + { + "epoch": 1.18200408997955, + "grad_norm": 1.4383648089060725, + "learning_rate": 3.5899363016821e-06, + "loss": 0.0265, + "step": 578 + }, + { + "epoch": 1.1840490797546013, + "grad_norm": 1.716222220393621, + "learning_rate": 3.5745342090621406e-06, + "loss": 0.0316, + "step": 579 + }, + { + "epoch": 1.1860940695296525, + "grad_norm": 1.3303476968967134, + "learning_rate": 3.5591468252834654e-06, + "loss": 0.0298, + "step": 580 + }, + { + "epoch": 1.1881390593047034, + "grad_norm": 1.226061907060063, + "learning_rate": 3.543774309122657e-06, + "loss": 0.0209, + "step": 581 + }, + { + "epoch": 1.1901840490797546, + "grad_norm": 1.656144699228516, + "learning_rate": 3.528416819202881e-06, + "loss": 0.0332, + "step": 582 + }, + { + "epoch": 1.1922290388548058, + "grad_norm": 1.5013453797233454, + "learning_rate": 3.5130745139922572e-06, + "loss": 0.0288, + "step": 583 + }, + { + "epoch": 1.1942740286298568, + "grad_norm": 1.9928513490408657, + "learning_rate": 3.497747551802221e-06, + "loss": 0.0521, + "step": 584 + }, + { + "epoch": 1.196319018404908, + "grad_norm": 1.2521586168450574, + "learning_rate": 3.4824360907858824e-06, + "loss": 0.0274, + "step": 585 + }, + { + "epoch": 1.1983640081799591, + "grad_norm": 1.2256691948629876, + "learning_rate": 3.467140288936407e-06, + "loss": 0.0282, + "step": 586 + }, + { + "epoch": 1.20040899795501, + "grad_norm": 1.6527331639972576, + "learning_rate": 3.4518603040853783e-06, + "loss": 0.0436, + "step": 587 + }, + { + "epoch": 1.2024539877300613, + "grad_norm": 1.368490830870422, + "learning_rate": 3.43659629390117e-06, + "loss": 0.0254, + "step": 588 + }, + { + "epoch": 1.2044989775051125, + "grad_norm": 1.8028951429047948, + "learning_rate": 3.421348415887315e-06, + "loss": 0.0408, + "step": 589 + }, + { + "epoch": 1.2065439672801637, + "grad_norm": 1.497550290975889, + "learning_rate": 3.4061168273808896e-06, + "loss": 0.0381, + "step": 590 + }, + { + "epoch": 1.2085889570552146, + "grad_norm": 1.6321033308692612, + "learning_rate": 3.390901685550887e-06, + "loss": 0.0383, + "step": 591 + }, + { + "epoch": 1.2106339468302658, + "grad_norm": 1.5159929907176852, + "learning_rate": 3.3757031473965827e-06, + "loss": 0.0304, + "step": 592 + }, + { + "epoch": 1.212678936605317, + "grad_norm": 2.4724108292496187, + "learning_rate": 3.360521369745937e-06, + "loss": 0.0518, + "step": 593 + }, + { + "epoch": 1.2147239263803682, + "grad_norm": 1.437590640293289, + "learning_rate": 3.3453565092539586e-06, + "loss": 0.0257, + "step": 594 + }, + { + "epoch": 1.2167689161554192, + "grad_norm": 1.169486503639217, + "learning_rate": 3.330208722401097e-06, + "loss": 0.0235, + "step": 595 + }, + { + "epoch": 1.2188139059304703, + "grad_norm": 1.1268592276904335, + "learning_rate": 3.315078165491622e-06, + "loss": 0.0279, + "step": 596 + }, + { + "epoch": 1.2208588957055215, + "grad_norm": 1.5683278793352897, + "learning_rate": 3.299964994652017e-06, + "loss": 0.0305, + "step": 597 + }, + { + "epoch": 1.2229038854805725, + "grad_norm": 1.9967429863243036, + "learning_rate": 3.2848693658293675e-06, + "loss": 0.0397, + "step": 598 + }, + { + "epoch": 1.2249488752556237, + "grad_norm": 1.4152854852084966, + "learning_rate": 3.269791434789741e-06, + "loss": 0.0256, + "step": 599 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 1.2653327623743533, + "learning_rate": 3.254731357116597e-06, + "loss": 0.029, + "step": 600 + }, + { + "epoch": 1.2269938650306749, + "eval_loss": 0.07708186656236649, + "eval_runtime": 1.5947, + "eval_samples_per_second": 25.083, + "eval_steps_per_second": 6.271, + "step": 600 + }, + { + "epoch": 1.229038854805726, + "grad_norm": 1.775392827404977, + "learning_rate": 3.2396892882091678e-06, + "loss": 0.0379, + "step": 601 + }, + { + "epoch": 1.231083844580777, + "grad_norm": 1.3830173529601073, + "learning_rate": 3.2246653832808674e-06, + "loss": 0.0288, + "step": 602 + }, + { + "epoch": 1.2331288343558282, + "grad_norm": 2.1155311480126264, + "learning_rate": 3.209659797357669e-06, + "loss": 0.0615, + "step": 603 + }, + { + "epoch": 1.2351738241308794, + "grad_norm": 2.4445073717181662, + "learning_rate": 3.1946726852765325e-06, + "loss": 0.0635, + "step": 604 + }, + { + "epoch": 1.2372188139059306, + "grad_norm": 2.110428549540264, + "learning_rate": 3.179704201683786e-06, + "loss": 0.0364, + "step": 605 + }, + { + "epoch": 1.2392638036809815, + "grad_norm": 1.7469628121389942, + "learning_rate": 3.16475450103354e-06, + "loss": 0.0372, + "step": 606 + }, + { + "epoch": 1.2413087934560327, + "grad_norm": 0.9152243306182197, + "learning_rate": 3.149823737586089e-06, + "loss": 0.0161, + "step": 607 + }, + { + "epoch": 1.243353783231084, + "grad_norm": 1.3056210800799668, + "learning_rate": 3.1349120654063224e-06, + "loss": 0.0266, + "step": 608 + }, + { + "epoch": 1.2453987730061349, + "grad_norm": 1.2879845659396767, + "learning_rate": 3.1200196383621363e-06, + "loss": 0.0274, + "step": 609 + }, + { + "epoch": 1.247443762781186, + "grad_norm": 1.382752868609299, + "learning_rate": 3.105146610122839e-06, + "loss": 0.0303, + "step": 610 + }, + { + "epoch": 1.2494887525562373, + "grad_norm": 1.433137069436945, + "learning_rate": 3.090293134157572e-06, + "loss": 0.0259, + "step": 611 + }, + { + "epoch": 1.2515337423312882, + "grad_norm": 1.2272571061501605, + "learning_rate": 3.0754593637337276e-06, + "loss": 0.0305, + "step": 612 + }, + { + "epoch": 1.2535787321063394, + "grad_norm": 1.8924881887997287, + "learning_rate": 3.0606454519153608e-06, + "loss": 0.0478, + "step": 613 + }, + { + "epoch": 1.2556237218813906, + "grad_norm": 1.5252407085003916, + "learning_rate": 3.0458515515616117e-06, + "loss": 0.0382, + "step": 614 + }, + { + "epoch": 1.2576687116564418, + "grad_norm": 1.2566622827157716, + "learning_rate": 3.0310778153251325e-06, + "loss": 0.0265, + "step": 615 + }, + { + "epoch": 1.259713701431493, + "grad_norm": 1.5092416262198107, + "learning_rate": 3.0163243956505093e-06, + "loss": 0.0313, + "step": 616 + }, + { + "epoch": 1.261758691206544, + "grad_norm": 2.0141840286367083, + "learning_rate": 3.001591444772687e-06, + "loss": 0.0373, + "step": 617 + }, + { + "epoch": 1.2638036809815951, + "grad_norm": 1.0802484572404294, + "learning_rate": 2.986879114715403e-06, + "loss": 0.0266, + "step": 618 + }, + { + "epoch": 1.2658486707566463, + "grad_norm": 1.359876254126494, + "learning_rate": 2.972187557289616e-06, + "loss": 0.0305, + "step": 619 + }, + { + "epoch": 1.2678936605316973, + "grad_norm": 1.3671926484976908, + "learning_rate": 2.95751692409194e-06, + "loss": 0.0296, + "step": 620 + }, + { + "epoch": 1.2699386503067485, + "grad_norm": 1.553369266205434, + "learning_rate": 2.9428673665030772e-06, + "loss": 0.0352, + "step": 621 + }, + { + "epoch": 1.2719836400817996, + "grad_norm": 1.9647938781064505, + "learning_rate": 2.9282390356862606e-06, + "loss": 0.0414, + "step": 622 + }, + { + "epoch": 1.2740286298568506, + "grad_norm": 1.7057696677799985, + "learning_rate": 2.9136320825856967e-06, + "loss": 0.0364, + "step": 623 + }, + { + "epoch": 1.2760736196319018, + "grad_norm": 1.6026279841764746, + "learning_rate": 2.899046657924992e-06, + "loss": 0.0411, + "step": 624 + }, + { + "epoch": 1.278118609406953, + "grad_norm": 1.5405330605244225, + "learning_rate": 2.884482912205621e-06, + "loss": 0.0358, + "step": 625 + }, + { + "epoch": 1.280163599182004, + "grad_norm": 1.4374105894350884, + "learning_rate": 2.8699409957053535e-06, + "loss": 0.0267, + "step": 626 + }, + { + "epoch": 1.2822085889570551, + "grad_norm": 1.4661224711168332, + "learning_rate": 2.8554210584767188e-06, + "loss": 0.0271, + "step": 627 + }, + { + "epoch": 1.2842535787321063, + "grad_norm": 2.5207979346685963, + "learning_rate": 2.840923250345442e-06, + "loss": 0.0481, + "step": 628 + }, + { + "epoch": 1.2862985685071575, + "grad_norm": 1.617876668968538, + "learning_rate": 2.8264477209089147e-06, + "loss": 0.0369, + "step": 629 + }, + { + "epoch": 1.2883435582822087, + "grad_norm": 1.808249674605706, + "learning_rate": 2.8119946195346375e-06, + "loss": 0.0391, + "step": 630 + }, + { + "epoch": 1.2903885480572597, + "grad_norm": 1.3734714216015984, + "learning_rate": 2.7975640953586846e-06, + "loss": 0.0294, + "step": 631 + }, + { + "epoch": 1.2924335378323109, + "grad_norm": 1.391392647849848, + "learning_rate": 2.78315629728417e-06, + "loss": 0.0304, + "step": 632 + }, + { + "epoch": 1.294478527607362, + "grad_norm": 1.4706567091432965, + "learning_rate": 2.7687713739796972e-06, + "loss": 0.0302, + "step": 633 + }, + { + "epoch": 1.296523517382413, + "grad_norm": 1.3791514521388877, + "learning_rate": 2.7544094738778436e-06, + "loss": 0.0338, + "step": 634 + }, + { + "epoch": 1.2985685071574642, + "grad_norm": 1.9205903741411616, + "learning_rate": 2.7400707451736103e-06, + "loss": 0.0352, + "step": 635 + }, + { + "epoch": 1.3006134969325154, + "grad_norm": 1.2860131091648965, + "learning_rate": 2.725755335822903e-06, + "loss": 0.0305, + "step": 636 + }, + { + "epoch": 1.3026584867075663, + "grad_norm": 2.0441358593719396, + "learning_rate": 2.7114633935410083e-06, + "loss": 0.0381, + "step": 637 + }, + { + "epoch": 1.3047034764826175, + "grad_norm": 1.606441709854988, + "learning_rate": 2.6971950658010666e-06, + "loss": 0.0343, + "step": 638 + }, + { + "epoch": 1.3067484662576687, + "grad_norm": 1.1232676032960067, + "learning_rate": 2.6829504998325352e-06, + "loss": 0.0223, + "step": 639 + }, + { + "epoch": 1.30879345603272, + "grad_norm": 2.0695929953679353, + "learning_rate": 2.6687298426196974e-06, + "loss": 0.0437, + "step": 640 + }, + { + "epoch": 1.310838445807771, + "grad_norm": 1.2094015200347492, + "learning_rate": 2.6545332409001267e-06, + "loss": 0.0251, + "step": 641 + }, + { + "epoch": 1.312883435582822, + "grad_norm": 1.3109256389089359, + "learning_rate": 2.6403608411631744e-06, + "loss": 0.0319, + "step": 642 + }, + { + "epoch": 1.3149284253578732, + "grad_norm": 1.3994815087983556, + "learning_rate": 2.62621278964846e-06, + "loss": 0.0281, + "step": 643 + }, + { + "epoch": 1.3169734151329244, + "grad_norm": 1.3379418428305758, + "learning_rate": 2.612089232344371e-06, + "loss": 0.0301, + "step": 644 + }, + { + "epoch": 1.3190184049079754, + "grad_norm": 1.2825425624367577, + "learning_rate": 2.5979903149865386e-06, + "loss": 0.016, + "step": 645 + }, + { + "epoch": 1.3210633946830266, + "grad_norm": 1.1904405038306072, + "learning_rate": 2.5839161830563475e-06, + "loss": 0.0282, + "step": 646 + }, + { + "epoch": 1.3231083844580778, + "grad_norm": 1.4411097446473085, + "learning_rate": 2.569866981779433e-06, + "loss": 0.0312, + "step": 647 + }, + { + "epoch": 1.3251533742331287, + "grad_norm": 1.5429385898328931, + "learning_rate": 2.555842856124182e-06, + "loss": 0.0288, + "step": 648 + }, + { + "epoch": 1.32719836400818, + "grad_norm": 1.6647644695086803, + "learning_rate": 2.541843950800226e-06, + "loss": 0.0345, + "step": 649 + }, + { + "epoch": 1.329243353783231, + "grad_norm": 1.6594984133499624, + "learning_rate": 2.527870410256966e-06, + "loss": 0.0355, + "step": 650 + }, + { + "epoch": 1.331288343558282, + "grad_norm": 1.4712049332054007, + "learning_rate": 2.513922378682075e-06, + "loss": 0.0326, + "step": 651 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.058004614790052, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.0497, + "step": 652 + }, + { + "epoch": 1.3353783231083844, + "grad_norm": 1.63215913934179, + "learning_rate": 2.486103417870493e-06, + "loss": 0.0407, + "step": 653 + }, + { + "epoch": 1.3374233128834356, + "grad_norm": 1.4235640655071966, + "learning_rate": 2.472232775687119e-06, + "loss": 0.0256, + "step": 654 + }, + { + "epoch": 1.3394683026584868, + "grad_norm": 1.4966832324121504, + "learning_rate": 2.4583882165757766e-06, + "loss": 0.0341, + "step": 655 + }, + { + "epoch": 1.3415132924335378, + "grad_norm": 1.579776697797337, + "learning_rate": 2.4445698833932236e-06, + "loss": 0.0318, + "step": 656 + }, + { + "epoch": 1.343558282208589, + "grad_norm": 1.5153710401247442, + "learning_rate": 2.4307779187256064e-06, + "loss": 0.041, + "step": 657 + }, + { + "epoch": 1.3456032719836402, + "grad_norm": 2.3410319215359343, + "learning_rate": 2.417012464886978e-06, + "loss": 0.0493, + "step": 658 + }, + { + "epoch": 1.3476482617586911, + "grad_norm": 1.7830705068876032, + "learning_rate": 2.4032736639178443e-06, + "loss": 0.038, + "step": 659 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 1.401656109282602, + "learning_rate": 2.389561657583681e-06, + "loss": 0.0314, + "step": 660 + }, + { + "epoch": 1.3517382413087935, + "grad_norm": 1.6005336710481401, + "learning_rate": 2.3758765873734897e-06, + "loss": 0.0339, + "step": 661 + }, + { + "epoch": 1.3537832310838445, + "grad_norm": 1.4717588678443954, + "learning_rate": 2.3622185944983187e-06, + "loss": 0.024, + "step": 662 + }, + { + "epoch": 1.3558282208588956, + "grad_norm": 1.4073986715417073, + "learning_rate": 2.3485878198898253e-06, + "loss": 0.0314, + "step": 663 + }, + { + "epoch": 1.3578732106339468, + "grad_norm": 1.1155919297375605, + "learning_rate": 2.3349844041988044e-06, + "loss": 0.0238, + "step": 664 + }, + { + "epoch": 1.359918200408998, + "grad_norm": 1.4447238413299701, + "learning_rate": 2.3214084877937464e-06, + "loss": 0.024, + "step": 665 + }, + { + "epoch": 1.3619631901840492, + "grad_norm": 1.4727637644326748, + "learning_rate": 2.30786021075939e-06, + "loss": 0.0352, + "step": 666 + }, + { + "epoch": 1.3640081799591002, + "grad_norm": 1.0917991783905898, + "learning_rate": 2.294339712895271e-06, + "loss": 0.02, + "step": 667 + }, + { + "epoch": 1.3660531697341514, + "grad_norm": 1.5740943427206295, + "learning_rate": 2.28084713371428e-06, + "loss": 0.0323, + "step": 668 + }, + { + "epoch": 1.3680981595092025, + "grad_norm": 1.4720527439260216, + "learning_rate": 2.2673826124412314e-06, + "loss": 0.0286, + "step": 669 + }, + { + "epoch": 1.3701431492842535, + "grad_norm": 1.4833939892417702, + "learning_rate": 2.253946288011419e-06, + "loss": 0.0342, + "step": 670 + }, + { + "epoch": 1.3721881390593047, + "grad_norm": 1.6876515961228076, + "learning_rate": 2.240538299069178e-06, + "loss": 0.0311, + "step": 671 + }, + { + "epoch": 1.3742331288343559, + "grad_norm": 2.1720167724269874, + "learning_rate": 2.2271587839664673e-06, + "loss": 0.0381, + "step": 672 + }, + { + "epoch": 1.3762781186094069, + "grad_norm": 1.5126928906252048, + "learning_rate": 2.213807880761434e-06, + "loss": 0.0332, + "step": 673 + }, + { + "epoch": 1.378323108384458, + "grad_norm": 1.6737538431685655, + "learning_rate": 2.2004857272169878e-06, + "loss": 0.0345, + "step": 674 + }, + { + "epoch": 1.3803680981595092, + "grad_norm": 1.426935375770983, + "learning_rate": 2.18719246079938e-06, + "loss": 0.0398, + "step": 675 + }, + { + "epoch": 1.3824130879345602, + "grad_norm": 1.4051149662672344, + "learning_rate": 2.173928218676792e-06, + "loss": 0.0232, + "step": 676 + }, + { + "epoch": 1.3844580777096114, + "grad_norm": 1.7917331547528335, + "learning_rate": 2.160693137717912e-06, + "loss": 0.0368, + "step": 677 + }, + { + "epoch": 1.3865030674846626, + "grad_norm": 1.8111522355910634, + "learning_rate": 2.1474873544905204e-06, + "loss": 0.0269, + "step": 678 + }, + { + "epoch": 1.3885480572597138, + "grad_norm": 1.6693031730647383, + "learning_rate": 2.134311005260093e-06, + "loss": 0.0362, + "step": 679 + }, + { + "epoch": 1.390593047034765, + "grad_norm": 1.4202013946415086, + "learning_rate": 2.121164225988387e-06, + "loss": 0.0298, + "step": 680 + }, + { + "epoch": 1.392638036809816, + "grad_norm": 1.3927664117864682, + "learning_rate": 2.108047152332028e-06, + "loss": 0.026, + "step": 681 + }, + { + "epoch": 1.394683026584867, + "grad_norm": 1.405359317118805, + "learning_rate": 2.0949599196411326e-06, + "loss": 0.0312, + "step": 682 + }, + { + "epoch": 1.3967280163599183, + "grad_norm": 1.2371988179013782, + "learning_rate": 2.081902662957895e-06, + "loss": 0.0214, + "step": 683 + }, + { + "epoch": 1.3987730061349692, + "grad_norm": 2.047236610352014, + "learning_rate": 2.0688755170152e-06, + "loss": 0.0421, + "step": 684 + }, + { + "epoch": 1.4008179959100204, + "grad_norm": 1.2055104899996096, + "learning_rate": 2.0558786162352245e-06, + "loss": 0.0218, + "step": 685 + }, + { + "epoch": 1.4028629856850716, + "grad_norm": 1.2042481090348163, + "learning_rate": 2.042912094728068e-06, + "loss": 0.0232, + "step": 686 + }, + { + "epoch": 1.4049079754601226, + "grad_norm": 1.965874166063246, + "learning_rate": 2.029976086290347e-06, + "loss": 0.0422, + "step": 687 + }, + { + "epoch": 1.4069529652351738, + "grad_norm": 1.7221753979316607, + "learning_rate": 2.017070724403835e-06, + "loss": 0.0315, + "step": 688 + }, + { + "epoch": 1.408997955010225, + "grad_norm": 1.319102902846999, + "learning_rate": 2.004196142234068e-06, + "loss": 0.0315, + "step": 689 + }, + { + "epoch": 1.4110429447852761, + "grad_norm": 0.9513064566229582, + "learning_rate": 1.9913524726289784e-06, + "loss": 0.0168, + "step": 690 + }, + { + "epoch": 1.4130879345603273, + "grad_norm": 1.9447952357011042, + "learning_rate": 1.9785398481175295e-06, + "loss": 0.0413, + "step": 691 + }, + { + "epoch": 1.4151329243353783, + "grad_norm": 1.5286895548644743, + "learning_rate": 1.965758400908334e-06, + "loss": 0.0274, + "step": 692 + }, + { + "epoch": 1.4171779141104295, + "grad_norm": 1.1539526277463092, + "learning_rate": 1.9530082628883058e-06, + "loss": 0.0239, + "step": 693 + }, + { + "epoch": 1.4192229038854807, + "grad_norm": 1.6908331934705023, + "learning_rate": 1.9402895656212834e-06, + "loss": 0.0342, + "step": 694 + }, + { + "epoch": 1.4212678936605316, + "grad_norm": 2.2914630227874886, + "learning_rate": 1.927602440346687e-06, + "loss": 0.0414, + "step": 695 + }, + { + "epoch": 1.4233128834355828, + "grad_norm": 1.4300685831945064, + "learning_rate": 1.914947017978153e-06, + "loss": 0.0272, + "step": 696 + }, + { + "epoch": 1.425357873210634, + "grad_norm": 1.119854466298958, + "learning_rate": 1.9023234291021875e-06, + "loss": 0.0237, + "step": 697 + }, + { + "epoch": 1.427402862985685, + "grad_norm": 2.157933270356651, + "learning_rate": 1.889731803976822e-06, + "loss": 0.0365, + "step": 698 + }, + { + "epoch": 1.4294478527607362, + "grad_norm": 2.1495827518419017, + "learning_rate": 1.8771722725302644e-06, + "loss": 0.0421, + "step": 699 + }, + { + "epoch": 1.4314928425357873, + "grad_norm": 1.4403502460755069, + "learning_rate": 1.8646449643595565e-06, + "loss": 0.0256, + "step": 700 + }, + { + "epoch": 1.4335378323108383, + "grad_norm": 1.612284239493657, + "learning_rate": 1.8521500087292466e-06, + "loss": 0.0314, + "step": 701 + }, + { + "epoch": 1.4355828220858895, + "grad_norm": 1.1017923126212417, + "learning_rate": 1.8396875345700498e-06, + "loss": 0.022, + "step": 702 + }, + { + "epoch": 1.4376278118609407, + "grad_norm": 1.6446468659290325, + "learning_rate": 1.8272576704775074e-06, + "loss": 0.0416, + "step": 703 + }, + { + "epoch": 1.4396728016359919, + "grad_norm": 1.3298795930204095, + "learning_rate": 1.81486054471068e-06, + "loss": 0.0269, + "step": 704 + }, + { + "epoch": 1.441717791411043, + "grad_norm": 1.0537263463371598, + "learning_rate": 1.8024962851908106e-06, + "loss": 0.022, + "step": 705 + }, + { + "epoch": 1.443762781186094, + "grad_norm": 1.569014799305421, + "learning_rate": 1.790165019500007e-06, + "loss": 0.027, + "step": 706 + }, + { + "epoch": 1.4458077709611452, + "grad_norm": 1.1169910353575982, + "learning_rate": 1.7778668748799244e-06, + "loss": 0.0214, + "step": 707 + }, + { + "epoch": 1.4478527607361964, + "grad_norm": 1.4170681283218884, + "learning_rate": 1.7656019782304602e-06, + "loss": 0.0241, + "step": 708 + }, + { + "epoch": 1.4498977505112474, + "grad_norm": 1.548818986804255, + "learning_rate": 1.7533704561084331e-06, + "loss": 0.0362, + "step": 709 + }, + { + "epoch": 1.4519427402862985, + "grad_norm": 2.0680503202924028, + "learning_rate": 1.7411724347262826e-06, + "loss": 0.0431, + "step": 710 + }, + { + "epoch": 1.4539877300613497, + "grad_norm": 1.4173336455080414, + "learning_rate": 1.729008039950772e-06, + "loss": 0.0279, + "step": 711 + }, + { + "epoch": 1.4560327198364007, + "grad_norm": 1.7820106072819453, + "learning_rate": 1.7168773973016779e-06, + "loss": 0.0353, + "step": 712 + }, + { + "epoch": 1.4580777096114519, + "grad_norm": 1.3988149854171141, + "learning_rate": 1.7047806319505079e-06, + "loss": 0.0271, + "step": 713 + }, + { + "epoch": 1.460122699386503, + "grad_norm": 1.390929649329335, + "learning_rate": 1.6927178687191953e-06, + "loss": 0.0256, + "step": 714 + }, + { + "epoch": 1.4621676891615543, + "grad_norm": 1.5277454496972025, + "learning_rate": 1.680689232078827e-06, + "loss": 0.0312, + "step": 715 + }, + { + "epoch": 1.4642126789366054, + "grad_norm": 1.9787662527459544, + "learning_rate": 1.6686948461483432e-06, + "loss": 0.0297, + "step": 716 + }, + { + "epoch": 1.4662576687116564, + "grad_norm": 1.3331939153009726, + "learning_rate": 1.656734834693266e-06, + "loss": 0.0269, + "step": 717 + }, + { + "epoch": 1.4683026584867076, + "grad_norm": 2.1133774298806784, + "learning_rate": 1.6448093211244232e-06, + "loss": 0.048, + "step": 718 + }, + { + "epoch": 1.4703476482617588, + "grad_norm": 1.9547321525275854, + "learning_rate": 1.6329184284966675e-06, + "loss": 0.0428, + "step": 719 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 1.5090987203091184, + "learning_rate": 1.621062279507617e-06, + "loss": 0.0305, + "step": 720 + }, + { + "epoch": 1.474437627811861, + "grad_norm": 0.9195367104860552, + "learning_rate": 1.6092409964963779e-06, + "loss": 0.0189, + "step": 721 + }, + { + "epoch": 1.4764826175869121, + "grad_norm": 1.9963374669225287, + "learning_rate": 1.597454701442288e-06, + "loss": 0.0385, + "step": 722 + }, + { + "epoch": 1.478527607361963, + "grad_norm": 1.5113737285476996, + "learning_rate": 1.5857035159636625e-06, + "loss": 0.033, + "step": 723 + }, + { + "epoch": 1.4805725971370143, + "grad_norm": 2.004845056871369, + "learning_rate": 1.5739875613165283e-06, + "loss": 0.0339, + "step": 724 + }, + { + "epoch": 1.4826175869120655, + "grad_norm": 0.9984169610067929, + "learning_rate": 1.5623069583933836e-06, + "loss": 0.02, + "step": 725 + }, + { + "epoch": 1.4846625766871164, + "grad_norm": 1.6259268058261294, + "learning_rate": 1.550661827721941e-06, + "loss": 0.0273, + "step": 726 + }, + { + "epoch": 1.4867075664621676, + "grad_norm": 1.6297643950263438, + "learning_rate": 1.5390522894638937e-06, + "loss": 0.028, + "step": 727 + }, + { + "epoch": 1.4887525562372188, + "grad_norm": 1.53638106009823, + "learning_rate": 1.5274784634136658e-06, + "loss": 0.0293, + "step": 728 + }, + { + "epoch": 1.49079754601227, + "grad_norm": 1.268974698538747, + "learning_rate": 1.5159404689971797e-06, + "loss": 0.0248, + "step": 729 + }, + { + "epoch": 1.4928425357873212, + "grad_norm": 1.427953166829002, + "learning_rate": 1.5044384252706312e-06, + "loss": 0.025, + "step": 730 + }, + { + "epoch": 1.4948875255623721, + "grad_norm": 1.0778297960602063, + "learning_rate": 1.492972450919249e-06, + "loss": 0.0196, + "step": 731 + }, + { + "epoch": 1.4969325153374233, + "grad_norm": 1.6048151777257864, + "learning_rate": 1.4815426642560753e-06, + "loss": 0.0254, + "step": 732 + }, + { + "epoch": 1.4989775051124745, + "grad_norm": 1.3837639161000226, + "learning_rate": 1.4701491832207481e-06, + "loss": 0.0234, + "step": 733 + }, + { + "epoch": 1.5010224948875255, + "grad_norm": 1.6210880071717662, + "learning_rate": 1.458792125378285e-06, + "loss": 0.0279, + "step": 734 + }, + { + "epoch": 1.5030674846625767, + "grad_norm": 1.6055051497727444, + "learning_rate": 1.4474716079178541e-06, + "loss": 0.047, + "step": 735 + }, + { + "epoch": 1.5051124744376279, + "grad_norm": 1.4164487131203813, + "learning_rate": 1.436187747651589e-06, + "loss": 0.0294, + "step": 736 + }, + { + "epoch": 1.5071574642126788, + "grad_norm": 1.404797134072682, + "learning_rate": 1.4249406610133686e-06, + "loss": 0.0333, + "step": 737 + }, + { + "epoch": 1.50920245398773, + "grad_norm": 1.5568137049723834, + "learning_rate": 1.4137304640576161e-06, + "loss": 0.0261, + "step": 738 + }, + { + "epoch": 1.5112474437627812, + "grad_norm": 1.4289478333095673, + "learning_rate": 1.4025572724581037e-06, + "loss": 0.0261, + "step": 739 + }, + { + "epoch": 1.5132924335378322, + "grad_norm": 2.5332634796920264, + "learning_rate": 1.3914212015067653e-06, + "loss": 0.0444, + "step": 740 + }, + { + "epoch": 1.5153374233128836, + "grad_norm": 1.788966871785357, + "learning_rate": 1.3803223661124938e-06, + "loss": 0.0283, + "step": 741 + }, + { + "epoch": 1.5173824130879345, + "grad_norm": 1.450672721983178, + "learning_rate": 1.3692608807999652e-06, + "loss": 0.0362, + "step": 742 + }, + { + "epoch": 1.5194274028629857, + "grad_norm": 1.2779026779976663, + "learning_rate": 1.3582368597084566e-06, + "loss": 0.0259, + "step": 743 + }, + { + "epoch": 1.521472392638037, + "grad_norm": 1.181583768603287, + "learning_rate": 1.3472504165906614e-06, + "loss": 0.0189, + "step": 744 + }, + { + "epoch": 1.5235173824130879, + "grad_norm": 0.9817943493019303, + "learning_rate": 1.3363016648115246e-06, + "loss": 0.0184, + "step": 745 + }, + { + "epoch": 1.525562372188139, + "grad_norm": 1.270037833596693, + "learning_rate": 1.325390717347065e-06, + "loss": 0.0268, + "step": 746 + }, + { + "epoch": 1.5276073619631902, + "grad_norm": 1.3472246238651557, + "learning_rate": 1.3145176867832165e-06, + "loss": 0.0262, + "step": 747 + }, + { + "epoch": 1.5296523517382412, + "grad_norm": 1.4783552939397928, + "learning_rate": 1.3036826853146601e-06, + "loss": 0.0256, + "step": 748 + }, + { + "epoch": 1.5316973415132924, + "grad_norm": 1.5785020479524052, + "learning_rate": 1.2928858247436672e-06, + "loss": 0.0303, + "step": 749 + }, + { + "epoch": 1.5337423312883436, + "grad_norm": 0.9545819980628849, + "learning_rate": 1.2821272164789544e-06, + "loss": 0.0154, + "step": 750 + }, + { + "epoch": 1.5357873210633946, + "grad_norm": 1.7853036227571542, + "learning_rate": 1.2714069715345195e-06, + "loss": 0.0366, + "step": 751 + }, + { + "epoch": 1.537832310838446, + "grad_norm": 1.2881320204863016, + "learning_rate": 1.2607252005285109e-06, + "loss": 0.0271, + "step": 752 + }, + { + "epoch": 1.539877300613497, + "grad_norm": 1.8402584593837081, + "learning_rate": 1.2500820136820735e-06, + "loss": 0.0397, + "step": 753 + }, + { + "epoch": 1.5419222903885481, + "grad_norm": 0.9104264280152901, + "learning_rate": 1.2394775208182175e-06, + "loss": 0.0185, + "step": 754 + }, + { + "epoch": 1.5439672801635993, + "grad_norm": 1.6576714713446372, + "learning_rate": 1.2289118313606895e-06, + "loss": 0.0329, + "step": 755 + }, + { + "epoch": 1.5460122699386503, + "grad_norm": 1.516510626114462, + "learning_rate": 1.2183850543328313e-06, + "loss": 0.029, + "step": 756 + }, + { + "epoch": 1.5480572597137015, + "grad_norm": 1.7170915167008158, + "learning_rate": 1.2078972983564686e-06, + "loss": 0.0281, + "step": 757 + }, + { + "epoch": 1.5501022494887526, + "grad_norm": 1.572147913277003, + "learning_rate": 1.1974486716507782e-06, + "loss": 0.0275, + "step": 758 + }, + { + "epoch": 1.5521472392638036, + "grad_norm": 1.6917430084108376, + "learning_rate": 1.187039282031182e-06, + "loss": 0.0357, + "step": 759 + }, + { + "epoch": 1.5541922290388548, + "grad_norm": 1.5988116947928293, + "learning_rate": 1.1766692369082255e-06, + "loss": 0.037, + "step": 760 + }, + { + "epoch": 1.556237218813906, + "grad_norm": 1.5739169249494382, + "learning_rate": 1.1663386432864725e-06, + "loss": 0.0323, + "step": 761 + }, + { + "epoch": 1.558282208588957, + "grad_norm": 0.8239355040656156, + "learning_rate": 1.156047607763407e-06, + "loss": 0.0153, + "step": 762 + }, + { + "epoch": 1.5603271983640081, + "grad_norm": 1.4324066370868447, + "learning_rate": 1.145796236528322e-06, + "loss": 0.0281, + "step": 763 + }, + { + "epoch": 1.5623721881390593, + "grad_norm": 1.167864770241578, + "learning_rate": 1.135584635361232e-06, + "loss": 0.0206, + "step": 764 + }, + { + "epoch": 1.5644171779141103, + "grad_norm": 1.2252633383184313, + "learning_rate": 1.1254129096317807e-06, + "loss": 0.0219, + "step": 765 + }, + { + "epoch": 1.5664621676891617, + "grad_norm": 1.2772246245687098, + "learning_rate": 1.115281164298153e-06, + "loss": 0.0228, + "step": 766 + }, + { + "epoch": 1.5685071574642127, + "grad_norm": 1.1793575214560597, + "learning_rate": 1.1051895039059851e-06, + "loss": 0.0239, + "step": 767 + }, + { + "epoch": 1.5705521472392638, + "grad_norm": 1.3979051592502238, + "learning_rate": 1.095138032587298e-06, + "loss": 0.0284, + "step": 768 + }, + { + "epoch": 1.572597137014315, + "grad_norm": 1.1554168176295245, + "learning_rate": 1.0851268540594168e-06, + "loss": 0.0233, + "step": 769 + }, + { + "epoch": 1.574642126789366, + "grad_norm": 1.1645512388718606, + "learning_rate": 1.0751560716238968e-06, + "loss": 0.0229, + "step": 770 + }, + { + "epoch": 1.5766871165644172, + "grad_norm": 1.7131522059742506, + "learning_rate": 1.0652257881654625e-06, + "loss": 0.0406, + "step": 771 + }, + { + "epoch": 1.5787321063394684, + "grad_norm": 1.2606812526165108, + "learning_rate": 1.0553361061509482e-06, + "loss": 0.0235, + "step": 772 + }, + { + "epoch": 1.5807770961145193, + "grad_norm": 1.1957626319021837, + "learning_rate": 1.0454871276282335e-06, + "loss": 0.0254, + "step": 773 + }, + { + "epoch": 1.5828220858895705, + "grad_norm": 1.221410722093273, + "learning_rate": 1.0356789542251939e-06, + "loss": 0.0285, + "step": 774 + }, + { + "epoch": 1.5848670756646217, + "grad_norm": 1.4005487946112367, + "learning_rate": 1.0259116871486557e-06, + "loss": 0.0237, + "step": 775 + }, + { + "epoch": 1.5869120654396727, + "grad_norm": 1.363179363127451, + "learning_rate": 1.0161854271833444e-06, + "loss": 0.023, + "step": 776 + }, + { + "epoch": 1.588957055214724, + "grad_norm": 1.3303699717121924, + "learning_rate": 1.0065002746908532e-06, + "loss": 0.0219, + "step": 777 + }, + { + "epoch": 1.591002044989775, + "grad_norm": 1.4319116309801472, + "learning_rate": 9.96856329608597e-07, + "loss": 0.031, + "step": 778 + }, + { + "epoch": 1.5930470347648262, + "grad_norm": 1.1984953249513992, + "learning_rate": 9.87253691448794e-07, + "loss": 0.0245, + "step": 779 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 1.2215565328948168, + "learning_rate": 9.776924592974257e-07, + "loss": 0.0248, + "step": 780 + }, + { + "epoch": 1.5971370143149284, + "grad_norm": 1.4160156872424536, + "learning_rate": 9.681727318132228e-07, + "loss": 0.0242, + "step": 781 + }, + { + "epoch": 1.5991820040899796, + "grad_norm": 1.1174710591294479, + "learning_rate": 9.586946072266479e-07, + "loss": 0.0191, + "step": 782 + }, + { + "epoch": 1.6012269938650308, + "grad_norm": 1.0676003932307012, + "learning_rate": 9.492581833388736e-07, + "loss": 0.0188, + "step": 783 + }, + { + "epoch": 1.6032719836400817, + "grad_norm": 1.0900550444215484, + "learning_rate": 9.398635575207854e-07, + "loss": 0.0218, + "step": 784 + }, + { + "epoch": 1.605316973415133, + "grad_norm": 1.2361313996180479, + "learning_rate": 9.305108267119645e-07, + "loss": 0.0207, + "step": 785 + }, + { + "epoch": 1.607361963190184, + "grad_norm": 1.218779379666619, + "learning_rate": 9.212000874196953e-07, + "loss": 0.0226, + "step": 786 + }, + { + "epoch": 1.609406952965235, + "grad_norm": 1.5316948706786864, + "learning_rate": 9.119314357179687e-07, + "loss": 0.0263, + "step": 787 + }, + { + "epoch": 1.6114519427402862, + "grad_norm": 1.3658846792851305, + "learning_rate": 9.027049672464916e-07, + "loss": 0.0207, + "step": 788 + }, + { + "epoch": 1.6134969325153374, + "grad_norm": 2.4597956315625455, + "learning_rate": 8.935207772096904e-07, + "loss": 0.0254, + "step": 789 + }, + { + "epoch": 1.6155419222903884, + "grad_norm": 1.3358397828434039, + "learning_rate": 8.843789603757446e-07, + "loss": 0.0265, + "step": 790 + }, + { + "epoch": 1.6175869120654398, + "grad_norm": 1.2481079015069951, + "learning_rate": 8.752796110755985e-07, + "loss": 0.02, + "step": 791 + }, + { + "epoch": 1.6196319018404908, + "grad_norm": 0.9661429436209987, + "learning_rate": 8.662228232019876e-07, + "loss": 0.0166, + "step": 792 + }, + { + "epoch": 1.621676891615542, + "grad_norm": 1.7556913252148523, + "learning_rate": 8.572086902084731e-07, + "loss": 0.0341, + "step": 793 + }, + { + "epoch": 1.6237218813905931, + "grad_norm": 1.418921330732568, + "learning_rate": 8.482373051084791e-07, + "loss": 0.0283, + "step": 794 + }, + { + "epoch": 1.6257668711656441, + "grad_norm": 2.369535130694504, + "learning_rate": 8.393087604743283e-07, + "loss": 0.0445, + "step": 795 + }, + { + "epoch": 1.6278118609406953, + "grad_norm": 1.6601126609364323, + "learning_rate": 8.304231484362868e-07, + "loss": 0.0293, + "step": 796 + }, + { + "epoch": 1.6298568507157465, + "grad_norm": 1.2796195343972467, + "learning_rate": 8.215805606816191e-07, + "loss": 0.0199, + "step": 797 + }, + { + "epoch": 1.6319018404907975, + "grad_norm": 1.207648315269951, + "learning_rate": 8.127810884536402e-07, + "loss": 0.0181, + "step": 798 + }, + { + "epoch": 1.6339468302658486, + "grad_norm": 2.1150186432662728, + "learning_rate": 8.040248225507641e-07, + "loss": 0.0473, + "step": 799 + }, + { + "epoch": 1.6359918200408998, + "grad_norm": 1.4200026666542498, + "learning_rate": 7.953118533255821e-07, + "loss": 0.0247, + "step": 800 + }, + { + "epoch": 1.6359918200408998, + "eval_loss": 0.07060948759317398, + "eval_runtime": 1.5943, + "eval_samples_per_second": 25.09, + "eval_steps_per_second": 6.272, + "step": 800 + }, + { + "epoch": 1.6380368098159508, + "grad_norm": 1.5772837122475736, + "learning_rate": 7.866422706839239e-07, + "loss": 0.0264, + "step": 801 + }, + { + "epoch": 1.6400817995910022, + "grad_norm": 1.1550918911272414, + "learning_rate": 7.780161640839257e-07, + "loss": 0.0224, + "step": 802 + }, + { + "epoch": 1.6421267893660532, + "grad_norm": 1.4676067465705516, + "learning_rate": 7.694336225351107e-07, + "loss": 0.0237, + "step": 803 + }, + { + "epoch": 1.6441717791411041, + "grad_norm": 1.4993385397429064, + "learning_rate": 7.60894734597476e-07, + "loss": 0.0295, + "step": 804 + }, + { + "epoch": 1.6462167689161555, + "grad_norm": 1.2385669586685766, + "learning_rate": 7.52399588380568e-07, + "loss": 0.0243, + "step": 805 + }, + { + "epoch": 1.6482617586912065, + "grad_norm": 1.4635374861697166, + "learning_rate": 7.439482715425806e-07, + "loss": 0.0252, + "step": 806 + }, + { + "epoch": 1.6503067484662577, + "grad_norm": 1.2402570999087212, + "learning_rate": 7.355408712894508e-07, + "loss": 0.0211, + "step": 807 + }, + { + "epoch": 1.6523517382413089, + "grad_norm": 1.5520153711347568, + "learning_rate": 7.271774743739546e-07, + "loss": 0.0303, + "step": 808 + }, + { + "epoch": 1.6543967280163598, + "grad_norm": 1.2762250260415324, + "learning_rate": 7.18858167094817e-07, + "loss": 0.0242, + "step": 809 + }, + { + "epoch": 1.656441717791411, + "grad_norm": 1.4244259857298884, + "learning_rate": 7.105830352958143e-07, + "loss": 0.0278, + "step": 810 + }, + { + "epoch": 1.6584867075664622, + "grad_norm": 1.4760993572706773, + "learning_rate": 7.023521643648984e-07, + "loss": 0.0292, + "step": 811 + }, + { + "epoch": 1.6605316973415132, + "grad_norm": 1.3443460519107557, + "learning_rate": 6.941656392333046e-07, + "loss": 0.0232, + "step": 812 + }, + { + "epoch": 1.6625766871165644, + "grad_norm": 1.3709203745792065, + "learning_rate": 6.86023544374686e-07, + "loss": 0.027, + "step": 813 + }, + { + "epoch": 1.6646216768916156, + "grad_norm": 1.4289920722764744, + "learning_rate": 6.779259638042318e-07, + "loss": 0.0231, + "step": 814 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.2467075238350902, + "learning_rate": 6.698729810778065e-07, + "loss": 0.0288, + "step": 815 + }, + { + "epoch": 1.668711656441718, + "grad_norm": 1.5823026933811752, + "learning_rate": 6.618646792910893e-07, + "loss": 0.0326, + "step": 816 + }, + { + "epoch": 1.670756646216769, + "grad_norm": 1.5584280269321396, + "learning_rate": 6.539011410787105e-07, + "loss": 0.0262, + "step": 817 + }, + { + "epoch": 1.67280163599182, + "grad_norm": 1.1208057763458479, + "learning_rate": 6.459824486134015e-07, + "loss": 0.0212, + "step": 818 + }, + { + "epoch": 1.6748466257668713, + "grad_norm": 1.3862339324803945, + "learning_rate": 6.381086836051498e-07, + "loss": 0.0258, + "step": 819 + }, + { + "epoch": 1.6768916155419222, + "grad_norm": 1.1160447785511467, + "learning_rate": 6.302799273003546e-07, + "loss": 0.0166, + "step": 820 + }, + { + "epoch": 1.6789366053169734, + "grad_norm": 1.3240491165501231, + "learning_rate": 6.22496260480982e-07, + "loss": 0.0248, + "step": 821 + }, + { + "epoch": 1.6809815950920246, + "grad_norm": 1.338838004083599, + "learning_rate": 6.147577634637413e-07, + "loss": 0.0262, + "step": 822 + }, + { + "epoch": 1.6830265848670756, + "grad_norm": 1.3968985445629194, + "learning_rate": 6.070645160992523e-07, + "loss": 0.0281, + "step": 823 + }, + { + "epoch": 1.6850715746421268, + "grad_norm": 1.171408977887829, + "learning_rate": 5.994165977712175e-07, + "loss": 0.0213, + "step": 824 + }, + { + "epoch": 1.687116564417178, + "grad_norm": 1.3360283784514455, + "learning_rate": 5.918140873956063e-07, + "loss": 0.0203, + "step": 825 + }, + { + "epoch": 1.689161554192229, + "grad_norm": 1.2733261388021238, + "learning_rate": 5.842570634198453e-07, + "loss": 0.0193, + "step": 826 + }, + { + "epoch": 1.6912065439672803, + "grad_norm": 1.6784098486146612, + "learning_rate": 5.767456038219987e-07, + "loss": 0.0262, + "step": 827 + }, + { + "epoch": 1.6932515337423313, + "grad_norm": 1.0355585556125833, + "learning_rate": 5.692797861099719e-07, + "loss": 0.0215, + "step": 828 + }, + { + "epoch": 1.6952965235173822, + "grad_norm": 1.4014112675195356, + "learning_rate": 5.618596873207083e-07, + "loss": 0.0225, + "step": 829 + }, + { + "epoch": 1.6973415132924337, + "grad_norm": 1.6204759478058526, + "learning_rate": 5.544853840193981e-07, + "loss": 0.0283, + "step": 830 + }, + { + "epoch": 1.6993865030674846, + "grad_norm": 1.1175326576111029, + "learning_rate": 5.471569522986775e-07, + "loss": 0.0197, + "step": 831 + }, + { + "epoch": 1.7014314928425358, + "grad_norm": 1.5156333961192319, + "learning_rate": 5.398744677778595e-07, + "loss": 0.0286, + "step": 832 + }, + { + "epoch": 1.703476482617587, + "grad_norm": 1.3492765083670422, + "learning_rate": 5.326380056021419e-07, + "loss": 0.0259, + "step": 833 + }, + { + "epoch": 1.705521472392638, + "grad_norm": 1.911784218966074, + "learning_rate": 5.254476404418341e-07, + "loss": 0.036, + "step": 834 + }, + { + "epoch": 1.7075664621676891, + "grad_norm": 1.3456317179935473, + "learning_rate": 5.183034464915898e-07, + "loss": 0.0248, + "step": 835 + }, + { + "epoch": 1.7096114519427403, + "grad_norm": 1.3465884976486044, + "learning_rate": 5.112054974696395e-07, + "loss": 0.0214, + "step": 836 + }, + { + "epoch": 1.7116564417177913, + "grad_norm": 1.2682146752514654, + "learning_rate": 5.041538666170282e-07, + "loss": 0.0245, + "step": 837 + }, + { + "epoch": 1.7137014314928425, + "grad_norm": 1.0732597160929007, + "learning_rate": 4.971486266968634e-07, + "loss": 0.0248, + "step": 838 + }, + { + "epoch": 1.7157464212678937, + "grad_norm": 1.2390245442361538, + "learning_rate": 4.901898499935609e-07, + "loss": 0.022, + "step": 839 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 1.1298732472922557, + "learning_rate": 4.832776083120983e-07, + "loss": 0.019, + "step": 840 + }, + { + "epoch": 1.719836400817996, + "grad_norm": 1.2513860400146173, + "learning_rate": 4.764119729772809e-07, + "loss": 0.0254, + "step": 841 + }, + { + "epoch": 1.721881390593047, + "grad_norm": 1.5812395858247674, + "learning_rate": 4.695930148329958e-07, + "loss": 0.0303, + "step": 842 + }, + { + "epoch": 1.7239263803680982, + "grad_norm": 1.2260179416900976, + "learning_rate": 4.628208042414889e-07, + "loss": 0.0231, + "step": 843 + }, + { + "epoch": 1.7259713701431494, + "grad_norm": 0.9260246632190309, + "learning_rate": 4.5609541108263377e-07, + "loss": 0.0191, + "step": 844 + }, + { + "epoch": 1.7280163599182004, + "grad_norm": 1.8092568351032716, + "learning_rate": 4.494169047532154e-07, + "loss": 0.0377, + "step": 845 + }, + { + "epoch": 1.7300613496932515, + "grad_norm": 1.4342896955808682, + "learning_rate": 4.4278535416620914e-07, + "loss": 0.0296, + "step": 846 + }, + { + "epoch": 1.7321063394683027, + "grad_norm": 1.411079843320368, + "learning_rate": 4.362008277500701e-07, + "loss": 0.0252, + "step": 847 + }, + { + "epoch": 1.7341513292433537, + "grad_norm": 1.4065270120904347, + "learning_rate": 4.2966339344803376e-07, + "loss": 0.0236, + "step": 848 + }, + { + "epoch": 1.7361963190184049, + "grad_norm": 2.637324684778294, + "learning_rate": 4.231731187174065e-07, + "loss": 0.0406, + "step": 849 + }, + { + "epoch": 1.738241308793456, + "grad_norm": 1.5036834826794743, + "learning_rate": 4.167300705288718e-07, + "loss": 0.0238, + "step": 850 + }, + { + "epoch": 1.740286298568507, + "grad_norm": 1.7305730073425691, + "learning_rate": 4.10334315365804e-07, + "loss": 0.03, + "step": 851 + }, + { + "epoch": 1.7423312883435584, + "grad_norm": 1.3670965259099597, + "learning_rate": 4.0398591922357787e-07, + "loss": 0.0244, + "step": 852 + }, + { + "epoch": 1.7443762781186094, + "grad_norm": 1.4873125793549382, + "learning_rate": 3.9768494760888455e-07, + "loss": 0.0281, + "step": 853 + }, + { + "epoch": 1.7464212678936604, + "grad_norm": 1.3256819619759466, + "learning_rate": 3.914314655390633e-07, + "loss": 0.018, + "step": 854 + }, + { + "epoch": 1.7484662576687118, + "grad_norm": 1.0528899986433782, + "learning_rate": 3.852255375414271e-07, + "loss": 0.0185, + "step": 855 + }, + { + "epoch": 1.7505112474437627, + "grad_norm": 1.5167752108851527, + "learning_rate": 3.7906722765259364e-07, + "loss": 0.0285, + "step": 856 + }, + { + "epoch": 1.752556237218814, + "grad_norm": 1.2661873569980087, + "learning_rate": 3.7295659941782856e-07, + "loss": 0.0229, + "step": 857 + }, + { + "epoch": 1.7546012269938651, + "grad_norm": 1.2713073653615368, + "learning_rate": 3.6689371589039013e-07, + "loss": 0.022, + "step": 858 + }, + { + "epoch": 1.756646216768916, + "grad_norm": 1.410691086480624, + "learning_rate": 3.60878639630875e-07, + "loss": 0.0296, + "step": 859 + }, + { + "epoch": 1.7586912065439673, + "grad_norm": 0.9920426356145646, + "learning_rate": 3.5491143270657445e-07, + "loss": 0.015, + "step": 860 + }, + { + "epoch": 1.7607361963190185, + "grad_norm": 1.5216849169101498, + "learning_rate": 3.489921566908372e-07, + "loss": 0.0271, + "step": 861 + }, + { + "epoch": 1.7627811860940694, + "grad_norm": 1.4674709021434214, + "learning_rate": 3.4312087266242964e-07, + "loss": 0.0263, + "step": 862 + }, + { + "epoch": 1.7648261758691206, + "grad_norm": 1.7675475614023826, + "learning_rate": 3.3729764120490447e-07, + "loss": 0.0384, + "step": 863 + }, + { + "epoch": 1.7668711656441718, + "grad_norm": 1.4676888698930726, + "learning_rate": 3.315225224059809e-07, + "loss": 0.0301, + "step": 864 + }, + { + "epoch": 1.7689161554192228, + "grad_norm": 1.4800320849283661, + "learning_rate": 3.25795575856922e-07, + "loss": 0.0283, + "step": 865 + }, + { + "epoch": 1.7709611451942742, + "grad_norm": 1.6806826350105444, + "learning_rate": 3.2011686065191894e-07, + "loss": 0.0391, + "step": 866 + }, + { + "epoch": 1.7730061349693251, + "grad_norm": 1.3249873571873563, + "learning_rate": 3.1448643538748045e-07, + "loss": 0.0203, + "step": 867 + }, + { + "epoch": 1.7750511247443763, + "grad_norm": 1.8551891141720298, + "learning_rate": 3.0890435816183226e-07, + "loss": 0.0393, + "step": 868 + }, + { + "epoch": 1.7770961145194275, + "grad_norm": 1.2327805158687992, + "learning_rate": 3.03370686574313e-07, + "loss": 0.0236, + "step": 869 + }, + { + "epoch": 1.7791411042944785, + "grad_norm": 1.3314203527215986, + "learning_rate": 2.9788547772478416e-07, + "loss": 0.0235, + "step": 870 + }, + { + "epoch": 1.7811860940695297, + "grad_norm": 1.1861648902004243, + "learning_rate": 2.9244878821303556e-07, + "loss": 0.0154, + "step": 871 + }, + { + "epoch": 1.7832310838445808, + "grad_norm": 1.3988331617040364, + "learning_rate": 2.870606741382059e-07, + "loss": 0.0349, + "step": 872 + }, + { + "epoch": 1.7852760736196318, + "grad_norm": 1.4786599382381074, + "learning_rate": 2.817211910982037e-07, + "loss": 0.0281, + "step": 873 + }, + { + "epoch": 1.787321063394683, + "grad_norm": 1.7984122833021066, + "learning_rate": 2.7643039418912996e-07, + "loss": 0.0291, + "step": 874 + }, + { + "epoch": 1.7893660531697342, + "grad_norm": 1.7454260608433505, + "learning_rate": 2.711883380047131e-07, + "loss": 0.0292, + "step": 875 + }, + { + "epoch": 1.7914110429447851, + "grad_norm": 1.435756459453004, + "learning_rate": 2.6599507663574387e-07, + "loss": 0.0293, + "step": 876 + }, + { + "epoch": 1.7934560327198366, + "grad_norm": 1.4644482699217904, + "learning_rate": 2.6085066366951907e-07, + "loss": 0.0245, + "step": 877 + }, + { + "epoch": 1.7955010224948875, + "grad_norm": 1.3875758357595886, + "learning_rate": 2.557551521892859e-07, + "loss": 0.0271, + "step": 878 + }, + { + "epoch": 1.7975460122699385, + "grad_norm": 0.9876574184926428, + "learning_rate": 2.5070859477369645e-07, + "loss": 0.0148, + "step": 879 + }, + { + "epoch": 1.79959100204499, + "grad_norm": 1.265878789206368, + "learning_rate": 2.457110434962645e-07, + "loss": 0.0216, + "step": 880 + }, + { + "epoch": 1.8016359918200409, + "grad_norm": 1.40284126411437, + "learning_rate": 2.407625499248273e-07, + "loss": 0.0249, + "step": 881 + }, + { + "epoch": 1.803680981595092, + "grad_norm": 1.1876694796769958, + "learning_rate": 2.3586316512101416e-07, + "loss": 0.018, + "step": 882 + }, + { + "epoch": 1.8057259713701432, + "grad_norm": 0.8367133769318583, + "learning_rate": 2.3101293963972094e-07, + "loss": 0.0178, + "step": 883 + }, + { + "epoch": 1.8077709611451942, + "grad_norm": 1.0995322120882318, + "learning_rate": 2.2621192352858702e-07, + "loss": 0.0198, + "step": 884 + }, + { + "epoch": 1.8098159509202454, + "grad_norm": 1.703675555853278, + "learning_rate": 2.2146016632747624e-07, + "loss": 0.0341, + "step": 885 + }, + { + "epoch": 1.8118609406952966, + "grad_norm": 1.6229142547220725, + "learning_rate": 2.1675771706797132e-07, + "loss": 0.0278, + "step": 886 + }, + { + "epoch": 1.8139059304703475, + "grad_norm": 1.6632882474057635, + "learning_rate": 2.1210462427286528e-07, + "loss": 0.0264, + "step": 887 + }, + { + "epoch": 1.8159509202453987, + "grad_norm": 1.696506311524546, + "learning_rate": 2.0750093595565735e-07, + "loss": 0.0315, + "step": 888 + }, + { + "epoch": 1.81799591002045, + "grad_norm": 1.783147077677834, + "learning_rate": 2.0294669962006352e-07, + "loss": 0.0306, + "step": 889 + }, + { + "epoch": 1.8200408997955009, + "grad_norm": 1.0803640055203296, + "learning_rate": 1.984419622595224e-07, + "loss": 0.0159, + "step": 890 + }, + { + "epoch": 1.8220858895705523, + "grad_norm": 1.549113936998901, + "learning_rate": 1.9398677035671222e-07, + "loss": 0.0356, + "step": 891 + }, + { + "epoch": 1.8241308793456033, + "grad_norm": 1.217663448407663, + "learning_rate": 1.8958116988306852e-07, + "loss": 0.0214, + "step": 892 + }, + { + "epoch": 1.8261758691206544, + "grad_norm": 1.2606236244237474, + "learning_rate": 1.8522520629831396e-07, + "loss": 0.0264, + "step": 893 + }, + { + "epoch": 1.8282208588957056, + "grad_norm": 1.1212441204936592, + "learning_rate": 1.8091892454998595e-07, + "loss": 0.017, + "step": 894 + }, + { + "epoch": 1.8302658486707566, + "grad_norm": 1.042748614877236, + "learning_rate": 1.7666236907297407e-07, + "loss": 0.0164, + "step": 895 + }, + { + "epoch": 1.8323108384458078, + "grad_norm": 1.3863959126170518, + "learning_rate": 1.7245558378906012e-07, + "loss": 0.0266, + "step": 896 + }, + { + "epoch": 1.834355828220859, + "grad_norm": 1.3029901304956657, + "learning_rate": 1.682986121064689e-07, + "loss": 0.025, + "step": 897 + }, + { + "epoch": 1.83640081799591, + "grad_norm": 0.8924861887554183, + "learning_rate": 1.641914969194147e-07, + "loss": 0.014, + "step": 898 + }, + { + "epoch": 1.8384458077709611, + "grad_norm": 1.0234983500191113, + "learning_rate": 1.6013428060766168e-07, + "loss": 0.019, + "step": 899 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.9136453201589728, + "learning_rate": 1.561270050360897e-07, + "loss": 0.0146, + "step": 900 + }, + { + "epoch": 1.8425357873210633, + "grad_norm": 1.8298008002925186, + "learning_rate": 1.5216971155425474e-07, + "loss": 0.0367, + "step": 901 + }, + { + "epoch": 1.8445807770961147, + "grad_norm": 0.9475181347283721, + "learning_rate": 1.4826244099596986e-07, + "loss": 0.0148, + "step": 902 + }, + { + "epoch": 1.8466257668711656, + "grad_norm": 0.9550648746556579, + "learning_rate": 1.444052336788787e-07, + "loss": 0.015, + "step": 903 + }, + { + "epoch": 1.8486707566462166, + "grad_norm": 1.4927311894076911, + "learning_rate": 1.4059812940404093e-07, + "loss": 0.0286, + "step": 904 + }, + { + "epoch": 1.850715746421268, + "grad_norm": 1.1696983318525789, + "learning_rate": 1.3684116745552423e-07, + "loss": 0.0212, + "step": 905 + }, + { + "epoch": 1.852760736196319, + "grad_norm": 1.2578768723641045, + "learning_rate": 1.33134386599994e-07, + "loss": 0.0244, + "step": 906 + }, + { + "epoch": 1.8548057259713702, + "grad_norm": 1.558622255405316, + "learning_rate": 1.2947782508631823e-07, + "loss": 0.0237, + "step": 907 + }, + { + "epoch": 1.8568507157464214, + "grad_norm": 1.52292583646827, + "learning_rate": 1.2587152064516828e-07, + "loss": 0.0246, + "step": 908 + }, + { + "epoch": 1.8588957055214723, + "grad_norm": 1.1936675481636827, + "learning_rate": 1.2231551048863421e-07, + "loss": 0.022, + "step": 909 + }, + { + "epoch": 1.8609406952965235, + "grad_norm": 1.4367150151701205, + "learning_rate": 1.1880983130983626e-07, + "loss": 0.0274, + "step": 910 + }, + { + "epoch": 1.8629856850715747, + "grad_norm": 1.4947550938995606, + "learning_rate": 1.1535451928254948e-07, + "loss": 0.0225, + "step": 911 + }, + { + "epoch": 1.8650306748466257, + "grad_norm": 1.6128578500137207, + "learning_rate": 1.1194961006082972e-07, + "loss": 0.0332, + "step": 912 + }, + { + "epoch": 1.8670756646216768, + "grad_norm": 1.1175499571067549, + "learning_rate": 1.0859513877864381e-07, + "loss": 0.0202, + "step": 913 + }, + { + "epoch": 1.869120654396728, + "grad_norm": 1.7323202236444266, + "learning_rate": 1.0529114004951047e-07, + "loss": 0.0423, + "step": 914 + }, + { + "epoch": 1.871165644171779, + "grad_norm": 1.1600018835452393, + "learning_rate": 1.0203764796614057e-07, + "loss": 0.0194, + "step": 915 + }, + { + "epoch": 1.8732106339468304, + "grad_norm": 1.3204191190409245, + "learning_rate": 9.883469610008578e-08, + "loss": 0.027, + "step": 916 + }, + { + "epoch": 1.8752556237218814, + "grad_norm": 1.5789271332032802, + "learning_rate": 9.568231750139212e-08, + "loss": 0.0381, + "step": 917 + }, + { + "epoch": 1.8773006134969326, + "grad_norm": 1.8636082047134532, + "learning_rate": 9.258054469825972e-08, + "loss": 0.0343, + "step": 918 + }, + { + "epoch": 1.8793456032719837, + "grad_norm": 1.729715689169104, + "learning_rate": 8.952940969670809e-08, + "loss": 0.0333, + "step": 919 + }, + { + "epoch": 1.8813905930470347, + "grad_norm": 1.075641909438574, + "learning_rate": 8.652894398024137e-08, + "loss": 0.0191, + "step": 920 + }, + { + "epoch": 1.883435582822086, + "grad_norm": 1.3038211172361949, + "learning_rate": 8.357917850952802e-08, + "loss": 0.0235, + "step": 921 + }, + { + "epoch": 1.885480572597137, + "grad_norm": 1.06035453866717, + "learning_rate": 8.06801437220811e-08, + "loss": 0.0191, + "step": 922 + }, + { + "epoch": 1.887525562372188, + "grad_norm": 1.2603710476757168, + "learning_rate": 7.783186953194189e-08, + "loss": 0.0246, + "step": 923 + }, + { + "epoch": 1.8895705521472392, + "grad_norm": 1.6288893230239516, + "learning_rate": 7.503438532937169e-08, + "loss": 0.036, + "step": 924 + }, + { + "epoch": 1.8916155419222904, + "grad_norm": 1.3517453504226422, + "learning_rate": 7.228771998054995e-08, + "loss": 0.0239, + "step": 925 + }, + { + "epoch": 1.8936605316973414, + "grad_norm": 1.1095416942390794, + "learning_rate": 6.959190182727616e-08, + "loss": 0.0165, + "step": 926 + }, + { + "epoch": 1.8957055214723928, + "grad_norm": 1.356854695078147, + "learning_rate": 6.694695868667556e-08, + "loss": 0.0236, + "step": 927 + }, + { + "epoch": 1.8977505112474438, + "grad_norm": 1.3824579801789842, + "learning_rate": 6.43529178509139e-08, + "loss": 0.0267, + "step": 928 + }, + { + "epoch": 1.8997955010224947, + "grad_norm": 1.5910728635319653, + "learning_rate": 6.180980608691656e-08, + "loss": 0.0269, + "step": 929 + }, + { + "epoch": 1.9018404907975461, + "grad_norm": 1.3467484522834312, + "learning_rate": 5.9317649636088656e-08, + "loss": 0.0272, + "step": 930 + }, + { + "epoch": 1.903885480572597, + "grad_norm": 1.404654837104888, + "learning_rate": 5.687647421404874e-08, + "loss": 0.0242, + "step": 931 + }, + { + "epoch": 1.9059304703476483, + "grad_norm": 1.2357699402907227, + "learning_rate": 5.4486305010361116e-08, + "loss": 0.0211, + "step": 932 + }, + { + "epoch": 1.9079754601226995, + "grad_norm": 1.1078419095507943, + "learning_rate": 5.214716668827558e-08, + "loss": 0.0174, + "step": 933 + }, + { + "epoch": 1.9100204498977504, + "grad_norm": 1.1455453685150343, + "learning_rate": 4.985908338447476e-08, + "loss": 0.0215, + "step": 934 + }, + { + "epoch": 1.9120654396728016, + "grad_norm": 1.5709389406578784, + "learning_rate": 4.7622078708822184e-08, + "loss": 0.0269, + "step": 935 + }, + { + "epoch": 1.9141104294478528, + "grad_norm": 1.1654623477471513, + "learning_rate": 4.543617574412185e-08, + "loss": 0.0207, + "step": 936 + }, + { + "epoch": 1.9161554192229038, + "grad_norm": 1.2721022886120923, + "learning_rate": 4.330139704587788e-08, + "loss": 0.0247, + "step": 937 + }, + { + "epoch": 1.918200408997955, + "grad_norm": 1.7353353754797876, + "learning_rate": 4.1217764642062505e-08, + "loss": 0.0325, + "step": 938 + }, + { + "epoch": 1.9202453987730062, + "grad_norm": 1.4523042929349372, + "learning_rate": 3.9185300032889005e-08, + "loss": 0.0245, + "step": 939 + }, + { + "epoch": 1.9222903885480571, + "grad_norm": 1.038008423835432, + "learning_rate": 3.720402419058966e-08, + "loss": 0.0172, + "step": 940 + }, + { + "epoch": 1.9243353783231085, + "grad_norm": 1.5830227670771397, + "learning_rate": 3.5273957559199265e-08, + "loss": 0.0363, + "step": 941 + }, + { + "epoch": 1.9263803680981595, + "grad_norm": 1.3333668595314416, + "learning_rate": 3.339512005434309e-08, + "loss": 0.0299, + "step": 942 + }, + { + "epoch": 1.9284253578732107, + "grad_norm": 1.013011546713111, + "learning_rate": 3.156753106303367e-08, + "loss": 0.0211, + "step": 943 + }, + { + "epoch": 1.9304703476482619, + "grad_norm": 1.0888245467200752, + "learning_rate": 2.979120944346936e-08, + "loss": 0.0197, + "step": 944 + }, + { + "epoch": 1.9325153374233128, + "grad_norm": 1.5607072199045122, + "learning_rate": 2.8066173524839978e-08, + "loss": 0.0254, + "step": 945 + }, + { + "epoch": 1.934560327198364, + "grad_norm": 1.299299607811048, + "learning_rate": 2.6392441107137013e-08, + "loss": 0.021, + "step": 946 + }, + { + "epoch": 1.9366053169734152, + "grad_norm": 1.6718512782375123, + "learning_rate": 2.4770029460970956e-08, + "loss": 0.0261, + "step": 947 + }, + { + "epoch": 1.9386503067484662, + "grad_norm": 1.5210112393820647, + "learning_rate": 2.319895532739369e-08, + "loss": 0.0301, + "step": 948 + }, + { + "epoch": 1.9406952965235174, + "grad_norm": 1.1866446525320051, + "learning_rate": 2.1679234917721946e-08, + "loss": 0.0219, + "step": 949 + }, + { + "epoch": 1.9427402862985685, + "grad_norm": 1.5585668068927292, + "learning_rate": 2.0210883913376334e-08, + "loss": 0.0271, + "step": 950 + }, + { + "epoch": 1.9447852760736195, + "grad_norm": 1.7577228374712899, + "learning_rate": 1.8793917465713686e-08, + "loss": 0.0368, + "step": 951 + }, + { + "epoch": 1.946830265848671, + "grad_norm": 1.122105721519514, + "learning_rate": 1.742835019587441e-08, + "loss": 0.0195, + "step": 952 + }, + { + "epoch": 1.9488752556237219, + "grad_norm": 1.0658375784177427, + "learning_rate": 1.6114196194628174e-08, + "loss": 0.017, + "step": 953 + }, + { + "epoch": 1.9509202453987728, + "grad_norm": 1.3923391468611417, + "learning_rate": 1.4851469022234e-08, + "loss": 0.0273, + "step": 954 + }, + { + "epoch": 1.9529652351738243, + "grad_norm": 1.3524898267742, + "learning_rate": 1.3640181708293731e-08, + "loss": 0.0259, + "step": 955 + }, + { + "epoch": 1.9550102249488752, + "grad_norm": 1.8660062595825002, + "learning_rate": 1.2480346751622686e-08, + "loss": 0.0324, + "step": 956 + }, + { + "epoch": 1.9570552147239264, + "grad_norm": 1.179664700215166, + "learning_rate": 1.137197612011809e-08, + "loss": 0.0259, + "step": 957 + }, + { + "epoch": 1.9591002044989776, + "grad_norm": 1.8058704230919458, + "learning_rate": 1.0315081250636405e-08, + "loss": 0.0265, + "step": 958 + }, + { + "epoch": 1.9611451942740286, + "grad_norm": 1.2051173078887687, + "learning_rate": 9.30967304887509e-09, + "loss": 0.0195, + "step": 959 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 1.4007351229371985, + "learning_rate": 8.35576188926046e-09, + "loss": 0.0304, + "step": 960 + }, + { + "epoch": 1.965235173824131, + "grad_norm": 1.1573585526709194, + "learning_rate": 7.453357614841116e-09, + "loss": 0.0202, + "step": 961 + }, + { + "epoch": 1.967280163599182, + "grad_norm": 1.3119134280852782, + "learning_rate": 6.60246953718302e-09, + "loss": 0.0284, + "step": 962 + }, + { + "epoch": 1.969325153374233, + "grad_norm": 2.16522611001059, + "learning_rate": 5.803106436279571e-09, + "loss": 0.039, + "step": 963 + }, + { + "epoch": 1.9713701431492843, + "grad_norm": 1.2979533184475112, + "learning_rate": 5.055276560454459e-09, + "loss": 0.025, + "step": 964 + }, + { + "epoch": 1.9734151329243352, + "grad_norm": 1.0802181068863799, + "learning_rate": 4.358987626281175e-09, + "loss": 0.0151, + "step": 965 + }, + { + "epoch": 1.9754601226993866, + "grad_norm": 1.7117951934036464, + "learning_rate": 3.71424681850141e-09, + "loss": 0.0355, + "step": 966 + }, + { + "epoch": 1.9775051124744376, + "grad_norm": 1.3376552108682394, + "learning_rate": 3.1210607899512244e-09, + "loss": 0.0251, + "step": 967 + }, + { + "epoch": 1.9795501022494888, + "grad_norm": 1.1725724434752407, + "learning_rate": 2.579435661492213e-09, + "loss": 0.0204, + "step": 968 + }, + { + "epoch": 1.98159509202454, + "grad_norm": 1.696450698444297, + "learning_rate": 2.0893770219493347e-09, + "loss": 0.0299, + "step": 969 + }, + { + "epoch": 1.983640081799591, + "grad_norm": 1.325923649921468, + "learning_rate": 1.6508899280515134e-09, + "loss": 0.0192, + "step": 970 + }, + { + "epoch": 1.9856850715746421, + "grad_norm": 1.1813786320601327, + "learning_rate": 1.2639789043805695e-09, + "loss": 0.0196, + "step": 971 + }, + { + "epoch": 1.9877300613496933, + "grad_norm": 0.9284331487465304, + "learning_rate": 9.286479433257e-10, + "loss": 0.0144, + "step": 972 + }, + { + "epoch": 1.9897750511247443, + "grad_norm": 1.7289144251248756, + "learning_rate": 6.4490050503907e-10, + "loss": 0.0365, + "step": 973 + }, + { + "epoch": 1.9918200408997955, + "grad_norm": 1.3982000044906164, + "learning_rate": 4.127395174036153e-10, + "loss": 0.0259, + "step": 974 + }, + { + "epoch": 1.9938650306748467, + "grad_norm": 1.1534216307337495, + "learning_rate": 2.321673760002918e-10, + "loss": 0.0185, + "step": 975 + }, + { + "epoch": 1.9959100204498976, + "grad_norm": 1.4226080527702496, + "learning_rate": 1.0318594408476045e-10, + "loss": 0.0227, + "step": 976 + }, + { + "epoch": 1.997955010224949, + "grad_norm": 2.2227043437975627, + "learning_rate": 2.57965525674031e-11, + "loss": 0.0355, + "step": 977 + }, + { + "epoch": 2.0, + "grad_norm": 1.7779216395225974, + "learning_rate": 0.0, + "loss": 0.029, + "step": 978 + }, + { + "epoch": 2.0, + "step": 978, + "total_flos": 4304231890944.0, + "train_loss": 0.056620436601515986, + "train_runtime": 754.4403, + "train_samples_per_second": 10.36, + "train_steps_per_second": 1.296 + } + ], + "logging_steps": 1, + "max_steps": 978, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4304231890944.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}