{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998983843105376, "eval_steps": 500, "global_step": 1230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000812925515699624, "grad_norm": 34.804351806640625, "learning_rate": 5.405405405405406e-07, "loss": 2.022, "step": 1 }, { "epoch": 0.001625851031399248, "grad_norm": 35.288848876953125, "learning_rate": 1.0810810810810812e-06, "loss": 2.1055, "step": 2 }, { "epoch": 0.002438776547098872, "grad_norm": 37.58893585205078, "learning_rate": 1.6216216216216219e-06, "loss": 2.0685, "step": 3 }, { "epoch": 0.003251702062798496, "grad_norm": 28.51118278503418, "learning_rate": 2.1621621621621623e-06, "loss": 2.0364, "step": 4 }, { "epoch": 0.00406462757849812, "grad_norm": 24.874475479125977, "learning_rate": 2.702702702702703e-06, "loss": 1.9688, "step": 5 }, { "epoch": 0.004877553094197744, "grad_norm": 12.156012535095215, "learning_rate": 3.2432432432432437e-06, "loss": 1.8677, "step": 6 }, { "epoch": 0.005690478609897368, "grad_norm": 7.017012119293213, "learning_rate": 3.7837837837837844e-06, "loss": 1.7774, "step": 7 }, { "epoch": 0.006503404125596992, "grad_norm": 11.024828910827637, "learning_rate": 4.324324324324325e-06, "loss": 1.8042, "step": 8 }, { "epoch": 0.007316329641296616, "grad_norm": 6.988280296325684, "learning_rate": 4.864864864864866e-06, "loss": 1.7973, "step": 9 }, { "epoch": 0.00812925515699624, "grad_norm": 8.541196823120117, "learning_rate": 5.405405405405406e-06, "loss": 1.7946, "step": 10 }, { "epoch": 0.008942180672695864, "grad_norm": 7.084593772888184, "learning_rate": 5.945945945945947e-06, "loss": 1.8178, "step": 11 }, { "epoch": 0.009755106188395488, "grad_norm": 5.755589962005615, "learning_rate": 6.486486486486487e-06, "loss": 1.7748, "step": 12 }, { "epoch": 0.010568031704095112, "grad_norm": 4.855886459350586, "learning_rate": 7.027027027027028e-06, "loss": 1.6665, "step": 13 }, { "epoch": 0.011380957219794737, "grad_norm": 5.280701160430908, "learning_rate": 7.567567567567569e-06, "loss": 1.7226, "step": 14 }, { "epoch": 0.01219388273549436, "grad_norm": 4.513389587402344, "learning_rate": 8.108108108108109e-06, "loss": 1.7219, "step": 15 }, { "epoch": 0.013006808251193984, "grad_norm": 4.92287015914917, "learning_rate": 8.64864864864865e-06, "loss": 1.697, "step": 16 }, { "epoch": 0.013819733766893608, "grad_norm": 4.488801002502441, "learning_rate": 9.189189189189191e-06, "loss": 1.6584, "step": 17 }, { "epoch": 0.014632659282593233, "grad_norm": 3.736851930618286, "learning_rate": 9.729729729729732e-06, "loss": 1.6752, "step": 18 }, { "epoch": 0.015445584798292857, "grad_norm": 3.7089431285858154, "learning_rate": 1.027027027027027e-05, "loss": 1.602, "step": 19 }, { "epoch": 0.01625851031399248, "grad_norm": 3.9821619987487793, "learning_rate": 1.0810810810810812e-05, "loss": 1.6492, "step": 20 }, { "epoch": 0.017071435829692106, "grad_norm": 3.72698974609375, "learning_rate": 1.1351351351351352e-05, "loss": 1.6893, "step": 21 }, { "epoch": 0.017884361345391727, "grad_norm": 3.0124993324279785, "learning_rate": 1.1891891891891894e-05, "loss": 1.5879, "step": 22 }, { "epoch": 0.01869728686109135, "grad_norm": 9.361907005310059, "learning_rate": 1.2432432432432433e-05, "loss": 1.6172, "step": 23 }, { "epoch": 0.019510212376790976, "grad_norm": 3.431147813796997, "learning_rate": 1.2972972972972975e-05, "loss": 1.6354, "step": 24 }, { "epoch": 0.0203231378924906, "grad_norm": 3.3041067123413086, "learning_rate": 1.3513513513513515e-05, "loss": 1.5998, "step": 25 }, { "epoch": 0.021136063408190225, "grad_norm": 3.4122121334075928, "learning_rate": 1.4054054054054055e-05, "loss": 1.5737, "step": 26 }, { "epoch": 0.02194898892388985, "grad_norm": 3.538844585418701, "learning_rate": 1.4594594594594596e-05, "loss": 1.5737, "step": 27 }, { "epoch": 0.022761914439589474, "grad_norm": 3.245887041091919, "learning_rate": 1.5135135135135138e-05, "loss": 1.5893, "step": 28 }, { "epoch": 0.023574839955289098, "grad_norm": 3.7787671089172363, "learning_rate": 1.5675675675675676e-05, "loss": 1.5923, "step": 29 }, { "epoch": 0.02438776547098872, "grad_norm": 3.557563066482544, "learning_rate": 1.6216216216216218e-05, "loss": 1.5906, "step": 30 }, { "epoch": 0.025200690986688343, "grad_norm": 3.1536169052124023, "learning_rate": 1.6756756756756757e-05, "loss": 1.5976, "step": 31 }, { "epoch": 0.026013616502387968, "grad_norm": 3.060678005218506, "learning_rate": 1.72972972972973e-05, "loss": 1.5239, "step": 32 }, { "epoch": 0.026826542018087592, "grad_norm": 3.0163331031799316, "learning_rate": 1.783783783783784e-05, "loss": 1.5703, "step": 33 }, { "epoch": 0.027639467533787217, "grad_norm": 3.0648066997528076, "learning_rate": 1.8378378378378383e-05, "loss": 1.5421, "step": 34 }, { "epoch": 0.02845239304948684, "grad_norm": 2.8359413146972656, "learning_rate": 1.891891891891892e-05, "loss": 1.5698, "step": 35 }, { "epoch": 0.029265318565186466, "grad_norm": 3.0256259441375732, "learning_rate": 1.9459459459459463e-05, "loss": 1.5258, "step": 36 }, { "epoch": 0.03007824408088609, "grad_norm": 3.305952548980713, "learning_rate": 2e-05, "loss": 1.5979, "step": 37 }, { "epoch": 0.030891169596585714, "grad_norm": 2.7958834171295166, "learning_rate": 1.99999653272242e-05, "loss": 1.5065, "step": 38 }, { "epoch": 0.031704095112285335, "grad_norm": 3.515479564666748, "learning_rate": 1.9999861309137232e-05, "loss": 1.4837, "step": 39 }, { "epoch": 0.03251702062798496, "grad_norm": 2.7845990657806396, "learning_rate": 1.999968794646042e-05, "loss": 1.5634, "step": 40 }, { "epoch": 0.033329946143684584, "grad_norm": 3.0540645122528076, "learning_rate": 1.9999445240395953e-05, "loss": 1.5001, "step": 41 }, { "epoch": 0.03414287165938421, "grad_norm": 3.059220790863037, "learning_rate": 1.9999133192626893e-05, "loss": 1.502, "step": 42 }, { "epoch": 0.03495579717508383, "grad_norm": 2.594452142715454, "learning_rate": 1.9998751805317152e-05, "loss": 1.5245, "step": 43 }, { "epoch": 0.035768722690783454, "grad_norm": 3.0076844692230225, "learning_rate": 1.999830108111148e-05, "loss": 1.5032, "step": 44 }, { "epoch": 0.03658164820648308, "grad_norm": 2.9521396160125732, "learning_rate": 1.999778102313545e-05, "loss": 1.5381, "step": 45 }, { "epoch": 0.0373945737221827, "grad_norm": 3.280303478240967, "learning_rate": 1.999719163499543e-05, "loss": 1.5478, "step": 46 }, { "epoch": 0.03820749923788233, "grad_norm": 2.9089877605438232, "learning_rate": 1.999653292077857e-05, "loss": 1.4783, "step": 47 }, { "epoch": 0.03902042475358195, "grad_norm": 5.1869635581970215, "learning_rate": 1.999580488505276e-05, "loss": 1.5067, "step": 48 }, { "epoch": 0.03983335026928158, "grad_norm": 3.053921699523926, "learning_rate": 1.9995007532866594e-05, "loss": 1.503, "step": 49 }, { "epoch": 0.0406462757849812, "grad_norm": 2.952059507369995, "learning_rate": 1.9994140869749366e-05, "loss": 1.4579, "step": 50 }, { "epoch": 0.04145920130068083, "grad_norm": 2.609379291534424, "learning_rate": 1.9993204901710995e-05, "loss": 1.4679, "step": 51 }, { "epoch": 0.04227212681638045, "grad_norm": 3.41717267036438, "learning_rate": 1.9992199635241997e-05, "loss": 1.5197, "step": 52 }, { "epoch": 0.04308505233208007, "grad_norm": 2.8707101345062256, "learning_rate": 1.999112507731346e-05, "loss": 1.5074, "step": 53 }, { "epoch": 0.0438979778477797, "grad_norm": 3.325697660446167, "learning_rate": 1.9989981235376956e-05, "loss": 1.427, "step": 54 }, { "epoch": 0.04471090336347932, "grad_norm": 2.7196686267852783, "learning_rate": 1.9988768117364526e-05, "loss": 1.4868, "step": 55 }, { "epoch": 0.04552382887917895, "grad_norm": 2.9488351345062256, "learning_rate": 1.9987485731688595e-05, "loss": 1.5011, "step": 56 }, { "epoch": 0.04633675439487857, "grad_norm": 2.7776849269866943, "learning_rate": 1.998613408724195e-05, "loss": 1.4664, "step": 57 }, { "epoch": 0.047149679910578196, "grad_norm": 2.719594717025757, "learning_rate": 1.998471319339763e-05, "loss": 1.4905, "step": 58 }, { "epoch": 0.04796260542627782, "grad_norm": 2.8028323650360107, "learning_rate": 1.9983223060008908e-05, "loss": 1.4754, "step": 59 }, { "epoch": 0.04877553094197744, "grad_norm": 3.1789817810058594, "learning_rate": 1.9981663697409203e-05, "loss": 1.4618, "step": 60 }, { "epoch": 0.049588456457677066, "grad_norm": 3.077449321746826, "learning_rate": 1.998003511641199e-05, "loss": 1.453, "step": 61 }, { "epoch": 0.05040138197337669, "grad_norm": 2.960418939590454, "learning_rate": 1.997833732831076e-05, "loss": 1.4564, "step": 62 }, { "epoch": 0.051214307489076315, "grad_norm": 5.316094875335693, "learning_rate": 1.9976570344878916e-05, "loss": 1.4711, "step": 63 }, { "epoch": 0.052027233004775936, "grad_norm": 3.257415771484375, "learning_rate": 1.9974734178369702e-05, "loss": 1.4606, "step": 64 }, { "epoch": 0.052840158520475564, "grad_norm": 3.0437912940979004, "learning_rate": 1.997282884151612e-05, "loss": 1.5075, "step": 65 }, { "epoch": 0.053653084036175185, "grad_norm": 3.3059332370758057, "learning_rate": 1.9970854347530828e-05, "loss": 1.484, "step": 66 }, { "epoch": 0.05446600955187481, "grad_norm": 4.510897636413574, "learning_rate": 1.9968810710106065e-05, "loss": 1.5091, "step": 67 }, { "epoch": 0.05527893506757443, "grad_norm": 3.3621528148651123, "learning_rate": 1.9966697943413548e-05, "loss": 1.4603, "step": 68 }, { "epoch": 0.056091860583274054, "grad_norm": 2.878563642501831, "learning_rate": 1.9964516062104377e-05, "loss": 1.4438, "step": 69 }, { "epoch": 0.05690478609897368, "grad_norm": 2.8587141036987305, "learning_rate": 1.996226508130892e-05, "loss": 1.441, "step": 70 }, { "epoch": 0.0577177116146733, "grad_norm": 3.2675728797912598, "learning_rate": 1.995994501663674e-05, "loss": 1.4515, "step": 71 }, { "epoch": 0.05853063713037293, "grad_norm": 3.018068790435791, "learning_rate": 1.995755588417644e-05, "loss": 1.4499, "step": 72 }, { "epoch": 0.05934356264607255, "grad_norm": 3.715628147125244, "learning_rate": 1.99550977004956e-05, "loss": 1.4624, "step": 73 }, { "epoch": 0.06015648816177218, "grad_norm": 2.7632699012756348, "learning_rate": 1.9952570482640628e-05, "loss": 1.4437, "step": 74 }, { "epoch": 0.0609694136774718, "grad_norm": 3.3581650257110596, "learning_rate": 1.9949974248136655e-05, "loss": 1.4865, "step": 75 }, { "epoch": 0.06178233919317143, "grad_norm": 3.090432643890381, "learning_rate": 1.9947309014987414e-05, "loss": 1.4416, "step": 76 }, { "epoch": 0.06259526470887104, "grad_norm": 3.3709418773651123, "learning_rate": 1.9944574801675106e-05, "loss": 1.4184, "step": 77 }, { "epoch": 0.06340819022457067, "grad_norm": 3.6959853172302246, "learning_rate": 1.9941771627160287e-05, "loss": 1.4694, "step": 78 }, { "epoch": 0.0642211157402703, "grad_norm": 3.2907724380493164, "learning_rate": 1.9938899510881732e-05, "loss": 1.4121, "step": 79 }, { "epoch": 0.06503404125596993, "grad_norm": 2.7885124683380127, "learning_rate": 1.9935958472756283e-05, "loss": 1.4033, "step": 80 }, { "epoch": 0.06584696677166954, "grad_norm": 2.8771262168884277, "learning_rate": 1.993294853317873e-05, "loss": 1.4466, "step": 81 }, { "epoch": 0.06665989228736917, "grad_norm": 3.572303056716919, "learning_rate": 1.9929869713021668e-05, "loss": 1.3854, "step": 82 }, { "epoch": 0.0674728178030688, "grad_norm": 2.636934757232666, "learning_rate": 1.9926722033635343e-05, "loss": 1.4186, "step": 83 }, { "epoch": 0.06828574331876842, "grad_norm": 3.1140427589416504, "learning_rate": 1.9923505516847514e-05, "loss": 1.424, "step": 84 }, { "epoch": 0.06909866883446804, "grad_norm": 2.808480739593506, "learning_rate": 1.9920220184963296e-05, "loss": 1.4744, "step": 85 }, { "epoch": 0.06991159435016767, "grad_norm": 3.118234872817993, "learning_rate": 1.9916866060764994e-05, "loss": 1.4277, "step": 86 }, { "epoch": 0.0707245198658673, "grad_norm": 4.0702033042907715, "learning_rate": 1.991344316751198e-05, "loss": 1.4236, "step": 87 }, { "epoch": 0.07153744538156691, "grad_norm": 2.938345193862915, "learning_rate": 1.9909951528940485e-05, "loss": 1.4119, "step": 88 }, { "epoch": 0.07235037089726654, "grad_norm": 2.960853338241577, "learning_rate": 1.990639116926348e-05, "loss": 1.471, "step": 89 }, { "epoch": 0.07316329641296616, "grad_norm": 3.146742343902588, "learning_rate": 1.9902762113170467e-05, "loss": 1.4751, "step": 90 }, { "epoch": 0.07397622192866579, "grad_norm": 3.3954169750213623, "learning_rate": 1.989906438582734e-05, "loss": 1.467, "step": 91 }, { "epoch": 0.0747891474443654, "grad_norm": 2.9790520668029785, "learning_rate": 1.9895298012876192e-05, "loss": 1.507, "step": 92 }, { "epoch": 0.07560207296006503, "grad_norm": 2.577925682067871, "learning_rate": 1.9891463020435144e-05, "loss": 1.4728, "step": 93 }, { "epoch": 0.07641499847576466, "grad_norm": 3.437133550643921, "learning_rate": 1.9887559435098162e-05, "loss": 1.4472, "step": 94 }, { "epoch": 0.07722792399146428, "grad_norm": 2.806886911392212, "learning_rate": 1.9883587283934875e-05, "loss": 1.4497, "step": 95 }, { "epoch": 0.0780408495071639, "grad_norm": 2.703793525695801, "learning_rate": 1.9879546594490383e-05, "loss": 1.4643, "step": 96 }, { "epoch": 0.07885377502286353, "grad_norm": 3.2830615043640137, "learning_rate": 1.987543739478507e-05, "loss": 1.4162, "step": 97 }, { "epoch": 0.07966670053856316, "grad_norm": 2.5376830101013184, "learning_rate": 1.987125971331441e-05, "loss": 1.494, "step": 98 }, { "epoch": 0.08047962605426277, "grad_norm": 2.532893180847168, "learning_rate": 1.9867013579048765e-05, "loss": 1.4575, "step": 99 }, { "epoch": 0.0812925515699624, "grad_norm": 2.838155508041382, "learning_rate": 1.9862699021433186e-05, "loss": 1.4007, "step": 100 }, { "epoch": 0.08210547708566203, "grad_norm": 2.5777368545532227, "learning_rate": 1.9858316070387208e-05, "loss": 1.4213, "step": 101 }, { "epoch": 0.08291840260136166, "grad_norm": 2.634209394454956, "learning_rate": 1.9853864756304654e-05, "loss": 1.4544, "step": 102 }, { "epoch": 0.08373132811706127, "grad_norm": 2.9893202781677246, "learning_rate": 1.9849345110053405e-05, "loss": 1.4361, "step": 103 }, { "epoch": 0.0845442536327609, "grad_norm": 2.668808698654175, "learning_rate": 1.984475716297519e-05, "loss": 1.4267, "step": 104 }, { "epoch": 0.08535717914846053, "grad_norm": 3.2199463844299316, "learning_rate": 1.984010094688539e-05, "loss": 1.4731, "step": 105 }, { "epoch": 0.08617010466416014, "grad_norm": 2.746006965637207, "learning_rate": 1.9835376494072788e-05, "loss": 1.385, "step": 106 }, { "epoch": 0.08698303017985977, "grad_norm": 2.955232620239258, "learning_rate": 1.9830583837299363e-05, "loss": 1.3984, "step": 107 }, { "epoch": 0.0877959556955594, "grad_norm": 5.357511520385742, "learning_rate": 1.9825723009800058e-05, "loss": 1.4562, "step": 108 }, { "epoch": 0.08860888121125902, "grad_norm": 2.5583655834198, "learning_rate": 1.9820794045282553e-05, "loss": 1.4222, "step": 109 }, { "epoch": 0.08942180672695864, "grad_norm": 2.6951992511749268, "learning_rate": 1.9815796977927015e-05, "loss": 1.4697, "step": 110 }, { "epoch": 0.09023473224265827, "grad_norm": 2.714019775390625, "learning_rate": 1.9810731842385892e-05, "loss": 1.4696, "step": 111 }, { "epoch": 0.0910476577583579, "grad_norm": 3.2998311519622803, "learning_rate": 1.9805598673783644e-05, "loss": 1.4034, "step": 112 }, { "epoch": 0.09186058327405751, "grad_norm": 7.7324652671813965, "learning_rate": 1.980039750771651e-05, "loss": 1.4697, "step": 113 }, { "epoch": 0.09267350878975714, "grad_norm": 2.6200242042541504, "learning_rate": 1.9795128380252263e-05, "loss": 1.451, "step": 114 }, { "epoch": 0.09348643430545676, "grad_norm": 2.937061071395874, "learning_rate": 1.978979132792996e-05, "loss": 1.4348, "step": 115 }, { "epoch": 0.09429935982115639, "grad_norm": 3.087085247039795, "learning_rate": 1.9784386387759684e-05, "loss": 1.4271, "step": 116 }, { "epoch": 0.095112285336856, "grad_norm": 2.6796271800994873, "learning_rate": 1.977891359722229e-05, "loss": 1.4933, "step": 117 }, { "epoch": 0.09592521085255563, "grad_norm": 3.4506633281707764, "learning_rate": 1.9773372994269147e-05, "loss": 1.427, "step": 118 }, { "epoch": 0.09673813636825526, "grad_norm": 2.532562732696533, "learning_rate": 1.976776461732187e-05, "loss": 1.436, "step": 119 }, { "epoch": 0.09755106188395488, "grad_norm": 3.619605541229248, "learning_rate": 1.976208850527206e-05, "loss": 1.4384, "step": 120 }, { "epoch": 0.0983639873996545, "grad_norm": 3.5245602130889893, "learning_rate": 1.9756344697481027e-05, "loss": 1.4303, "step": 121 }, { "epoch": 0.09917691291535413, "grad_norm": 2.649686336517334, "learning_rate": 1.975053323377952e-05, "loss": 1.4692, "step": 122 }, { "epoch": 0.09998983843105376, "grad_norm": 2.516016721725464, "learning_rate": 1.9744654154467468e-05, "loss": 1.4154, "step": 123 }, { "epoch": 0.10080276394675337, "grad_norm": 2.591168165206909, "learning_rate": 1.9738707500313655e-05, "loss": 1.403, "step": 124 }, { "epoch": 0.101615689462453, "grad_norm": 2.522486686706543, "learning_rate": 1.9732693312555492e-05, "loss": 1.4575, "step": 125 }, { "epoch": 0.10242861497815263, "grad_norm": 2.8282413482666016, "learning_rate": 1.9726611632898693e-05, "loss": 1.377, "step": 126 }, { "epoch": 0.10324154049385226, "grad_norm": 2.598076820373535, "learning_rate": 1.9720462503517e-05, "loss": 1.4382, "step": 127 }, { "epoch": 0.10405446600955187, "grad_norm": 2.688178777694702, "learning_rate": 1.971424596705189e-05, "loss": 1.4132, "step": 128 }, { "epoch": 0.1048673915252515, "grad_norm": 2.7512471675872803, "learning_rate": 1.9707962066612278e-05, "loss": 1.4193, "step": 129 }, { "epoch": 0.10568031704095113, "grad_norm": 2.8025805950164795, "learning_rate": 1.970161084577422e-05, "loss": 1.3829, "step": 130 }, { "epoch": 0.10649324255665074, "grad_norm": 2.6514623165130615, "learning_rate": 1.9695192348580606e-05, "loss": 1.4362, "step": 131 }, { "epoch": 0.10730616807235037, "grad_norm": 2.4559547901153564, "learning_rate": 1.9688706619540863e-05, "loss": 1.4357, "step": 132 }, { "epoch": 0.10811909358805, "grad_norm": 2.8258724212646484, "learning_rate": 1.968215370363063e-05, "loss": 1.4501, "step": 133 }, { "epoch": 0.10893201910374962, "grad_norm": 2.8553593158721924, "learning_rate": 1.9675533646291463e-05, "loss": 1.4841, "step": 134 }, { "epoch": 0.10974494461944924, "grad_norm": 3.625079870223999, "learning_rate": 1.9668846493430522e-05, "loss": 1.47, "step": 135 }, { "epoch": 0.11055787013514887, "grad_norm": 2.9793193340301514, "learning_rate": 1.9662092291420233e-05, "loss": 1.3969, "step": 136 }, { "epoch": 0.1113707956508485, "grad_norm": 2.5699939727783203, "learning_rate": 1.965527108709798e-05, "loss": 1.4258, "step": 137 }, { "epoch": 0.11218372116654811, "grad_norm": 2.7961106300354004, "learning_rate": 1.964838292776579e-05, "loss": 1.4637, "step": 138 }, { "epoch": 0.11299664668224774, "grad_norm": 3.3331451416015625, "learning_rate": 1.9641427861189973e-05, "loss": 1.3976, "step": 139 }, { "epoch": 0.11380957219794736, "grad_norm": 2.5645205974578857, "learning_rate": 1.963440593560083e-05, "loss": 1.409, "step": 140 }, { "epoch": 0.11462249771364699, "grad_norm": 2.5996487140655518, "learning_rate": 1.9627317199692287e-05, "loss": 1.4834, "step": 141 }, { "epoch": 0.1154354232293466, "grad_norm": 2.9811034202575684, "learning_rate": 1.962016170262157e-05, "loss": 1.4508, "step": 142 }, { "epoch": 0.11624834874504623, "grad_norm": 2.4133377075195312, "learning_rate": 1.961293949400888e-05, "loss": 1.4077, "step": 143 }, { "epoch": 0.11706127426074586, "grad_norm": 2.622091770172119, "learning_rate": 1.960565062393701e-05, "loss": 1.4046, "step": 144 }, { "epoch": 0.11787419977644548, "grad_norm": 2.857346534729004, "learning_rate": 1.9598295142951035e-05, "loss": 1.4217, "step": 145 }, { "epoch": 0.1186871252921451, "grad_norm": 2.600935220718384, "learning_rate": 1.9590873102057948e-05, "loss": 1.403, "step": 146 }, { "epoch": 0.11950005080784473, "grad_norm": 2.820359945297241, "learning_rate": 1.9583384552726294e-05, "loss": 1.4358, "step": 147 }, { "epoch": 0.12031297632354436, "grad_norm": 2.6272051334381104, "learning_rate": 1.957582954688584e-05, "loss": 1.4505, "step": 148 }, { "epoch": 0.12112590183924397, "grad_norm": 2.8003182411193848, "learning_rate": 1.9568208136927177e-05, "loss": 1.3977, "step": 149 }, { "epoch": 0.1219388273549436, "grad_norm": 3.560518264770508, "learning_rate": 1.9560520375701408e-05, "loss": 1.3992, "step": 150 }, { "epoch": 0.12275175287064323, "grad_norm": 2.6377906799316406, "learning_rate": 1.9552766316519726e-05, "loss": 1.4022, "step": 151 }, { "epoch": 0.12356467838634286, "grad_norm": 2.7982730865478516, "learning_rate": 1.9544946013153093e-05, "loss": 1.409, "step": 152 }, { "epoch": 0.12437760390204247, "grad_norm": 2.7179160118103027, "learning_rate": 1.9537059519831822e-05, "loss": 1.415, "step": 153 }, { "epoch": 0.12519052941774209, "grad_norm": 2.959397554397583, "learning_rate": 1.9529106891245244e-05, "loss": 1.4296, "step": 154 }, { "epoch": 0.12600345493344173, "grad_norm": 3.3228979110717773, "learning_rate": 1.9521088182541298e-05, "loss": 1.4282, "step": 155 }, { "epoch": 0.12681638044914134, "grad_norm": 2.763151168823242, "learning_rate": 1.951300344932616e-05, "loss": 1.3686, "step": 156 }, { "epoch": 0.12762930596484098, "grad_norm": 2.863190174102783, "learning_rate": 1.9504852747663862e-05, "loss": 1.4227, "step": 157 }, { "epoch": 0.1284422314805406, "grad_norm": 2.889604330062866, "learning_rate": 1.9496636134075894e-05, "loss": 1.4658, "step": 158 }, { "epoch": 0.1292551569962402, "grad_norm": 3.024122476577759, "learning_rate": 1.9488353665540813e-05, "loss": 1.4081, "step": 159 }, { "epoch": 0.13006808251193985, "grad_norm": 2.4810218811035156, "learning_rate": 1.9480005399493857e-05, "loss": 1.4296, "step": 160 }, { "epoch": 0.13088100802763947, "grad_norm": 2.643673896789551, "learning_rate": 1.9471591393826536e-05, "loss": 1.3652, "step": 161 }, { "epoch": 0.13169393354333908, "grad_norm": 2.888829231262207, "learning_rate": 1.9463111706886234e-05, "loss": 1.4003, "step": 162 }, { "epoch": 0.13250685905903872, "grad_norm": 2.7480149269104004, "learning_rate": 1.9454566397475813e-05, "loss": 1.4195, "step": 163 }, { "epoch": 0.13331978457473834, "grad_norm": 2.68972110748291, "learning_rate": 1.944595552485319e-05, "loss": 1.3848, "step": 164 }, { "epoch": 0.13413271009043795, "grad_norm": 2.8888440132141113, "learning_rate": 1.943727914873094e-05, "loss": 1.481, "step": 165 }, { "epoch": 0.1349456356061376, "grad_norm": 2.8409390449523926, "learning_rate": 1.9428537329275862e-05, "loss": 1.4176, "step": 166 }, { "epoch": 0.1357585611218372, "grad_norm": 2.4992098808288574, "learning_rate": 1.941973012710859e-05, "loss": 1.395, "step": 167 }, { "epoch": 0.13657148663753685, "grad_norm": 4.587447166442871, "learning_rate": 1.941085760330316e-05, "loss": 1.3905, "step": 168 }, { "epoch": 0.13738441215323646, "grad_norm": 2.4778833389282227, "learning_rate": 1.940191981938657e-05, "loss": 1.3707, "step": 169 }, { "epoch": 0.13819733766893608, "grad_norm": 2.7843387126922607, "learning_rate": 1.9392916837338376e-05, "loss": 1.3698, "step": 170 }, { "epoch": 0.13901026318463572, "grad_norm": 2.731437921524048, "learning_rate": 1.9383848719590257e-05, "loss": 1.4358, "step": 171 }, { "epoch": 0.13982318870033533, "grad_norm": 3.079371213912964, "learning_rate": 1.9374715529025575e-05, "loss": 1.4027, "step": 172 }, { "epoch": 0.14063611421603495, "grad_norm": 3.6557998657226562, "learning_rate": 1.9365517328978943e-05, "loss": 1.428, "step": 173 }, { "epoch": 0.1414490397317346, "grad_norm": 2.9291248321533203, "learning_rate": 1.9356254183235785e-05, "loss": 1.4039, "step": 174 }, { "epoch": 0.1422619652474342, "grad_norm": 2.498507499694824, "learning_rate": 1.93469261560319e-05, "loss": 1.3731, "step": 175 }, { "epoch": 0.14307489076313382, "grad_norm": 3.6117923259735107, "learning_rate": 1.9337533312053002e-05, "loss": 1.4263, "step": 176 }, { "epoch": 0.14388781627883346, "grad_norm": 2.490755319595337, "learning_rate": 1.9328075716434287e-05, "loss": 1.4215, "step": 177 }, { "epoch": 0.14470074179453307, "grad_norm": 2.9008986949920654, "learning_rate": 1.931855343475998e-05, "loss": 1.3968, "step": 178 }, { "epoch": 0.14551366731023269, "grad_norm": 5.284730911254883, "learning_rate": 1.930896653306286e-05, "loss": 1.418, "step": 179 }, { "epoch": 0.14632659282593233, "grad_norm": 2.826756000518799, "learning_rate": 1.929931507782383e-05, "loss": 1.3996, "step": 180 }, { "epoch": 0.14713951834163194, "grad_norm": 2.8084652423858643, "learning_rate": 1.9289599135971437e-05, "loss": 1.374, "step": 181 }, { "epoch": 0.14795244385733158, "grad_norm": 2.736046075820923, "learning_rate": 1.9279818774881418e-05, "loss": 1.3687, "step": 182 }, { "epoch": 0.1487653693730312, "grad_norm": 2.7098567485809326, "learning_rate": 1.9269974062376224e-05, "loss": 1.4059, "step": 183 }, { "epoch": 0.1495782948887308, "grad_norm": 2.7764878273010254, "learning_rate": 1.926006506672456e-05, "loss": 1.42, "step": 184 }, { "epoch": 0.15039122040443045, "grad_norm": 2.7715649604797363, "learning_rate": 1.9250091856640895e-05, "loss": 1.4549, "step": 185 }, { "epoch": 0.15120414592013007, "grad_norm": 2.4104158878326416, "learning_rate": 1.9240054501285015e-05, "loss": 1.4129, "step": 186 }, { "epoch": 0.15201707143582968, "grad_norm": 2.75614595413208, "learning_rate": 1.922995307026151e-05, "loss": 1.3959, "step": 187 }, { "epoch": 0.15282999695152932, "grad_norm": 2.813262939453125, "learning_rate": 1.921978763361931e-05, "loss": 1.4139, "step": 188 }, { "epoch": 0.15364292246722894, "grad_norm": 2.5106594562530518, "learning_rate": 1.9209558261851194e-05, "loss": 1.3683, "step": 189 }, { "epoch": 0.15445584798292855, "grad_norm": 2.9257330894470215, "learning_rate": 1.919926502589331e-05, "loss": 1.3387, "step": 190 }, { "epoch": 0.1552687734986282, "grad_norm": 2.5029993057250977, "learning_rate": 1.9188907997124666e-05, "loss": 1.3892, "step": 191 }, { "epoch": 0.1560816990143278, "grad_norm": 2.6917388439178467, "learning_rate": 1.9178487247366652e-05, "loss": 1.3946, "step": 192 }, { "epoch": 0.15689462453002745, "grad_norm": 2.7038626670837402, "learning_rate": 1.916800284888253e-05, "loss": 1.4082, "step": 193 }, { "epoch": 0.15770755004572706, "grad_norm": 2.586545467376709, "learning_rate": 1.915745487437694e-05, "loss": 1.3431, "step": 194 }, { "epoch": 0.15852047556142668, "grad_norm": 3.043938159942627, "learning_rate": 1.9146843396995396e-05, "loss": 1.3967, "step": 195 }, { "epoch": 0.15933340107712632, "grad_norm": 2.80709171295166, "learning_rate": 1.9136168490323772e-05, "loss": 1.3617, "step": 196 }, { "epoch": 0.16014632659282593, "grad_norm": 5.03334903717041, "learning_rate": 1.9125430228387794e-05, "loss": 1.4326, "step": 197 }, { "epoch": 0.16095925210852555, "grad_norm": 4.717489719390869, "learning_rate": 1.9114628685652535e-05, "loss": 1.3459, "step": 198 }, { "epoch": 0.1617721776242252, "grad_norm": 3.0668435096740723, "learning_rate": 1.9103763937021887e-05, "loss": 1.3763, "step": 199 }, { "epoch": 0.1625851031399248, "grad_norm": 2.712122678756714, "learning_rate": 1.909283605783805e-05, "loss": 1.3319, "step": 200 }, { "epoch": 0.16339802865562442, "grad_norm": 2.7631924152374268, "learning_rate": 1.9081845123881002e-05, "loss": 1.3641, "step": 201 }, { "epoch": 0.16421095417132406, "grad_norm": 3.499955654144287, "learning_rate": 1.9070791211367984e-05, "loss": 1.3259, "step": 202 }, { "epoch": 0.16502387968702367, "grad_norm": 2.913755416870117, "learning_rate": 1.9059674396952963e-05, "loss": 1.3386, "step": 203 }, { "epoch": 0.16583680520272331, "grad_norm": 2.5671772956848145, "learning_rate": 1.90484947577261e-05, "loss": 1.3301, "step": 204 }, { "epoch": 0.16664973071842293, "grad_norm": 3.3566508293151855, "learning_rate": 1.903725237121322e-05, "loss": 1.3375, "step": 205 }, { "epoch": 0.16746265623412254, "grad_norm": 2.3617210388183594, "learning_rate": 1.902594731537527e-05, "loss": 1.4476, "step": 206 }, { "epoch": 0.16827558174982218, "grad_norm": 2.8202669620513916, "learning_rate": 1.901457966860779e-05, "loss": 1.334, "step": 207 }, { "epoch": 0.1690885072655218, "grad_norm": 2.5990843772888184, "learning_rate": 1.9003149509740347e-05, "loss": 1.4321, "step": 208 }, { "epoch": 0.1699014327812214, "grad_norm": 5.9826507568359375, "learning_rate": 1.899165691803601e-05, "loss": 1.4338, "step": 209 }, { "epoch": 0.17071435829692105, "grad_norm": 3.9570019245147705, "learning_rate": 1.8980101973190787e-05, "loss": 1.3265, "step": 210 }, { "epoch": 0.17152728381262067, "grad_norm": 2.8985307216644287, "learning_rate": 1.896848475533309e-05, "loss": 1.3297, "step": 211 }, { "epoch": 0.17234020932832028, "grad_norm": 3.2575559616088867, "learning_rate": 1.8956805345023145e-05, "loss": 1.4086, "step": 212 }, { "epoch": 0.17315313484401992, "grad_norm": 3.264796733856201, "learning_rate": 1.894506382325248e-05, "loss": 1.391, "step": 213 }, { "epoch": 0.17396606035971954, "grad_norm": 2.767975330352783, "learning_rate": 1.8933260271443313e-05, "loss": 1.3731, "step": 214 }, { "epoch": 0.17477898587541915, "grad_norm": 2.3556087017059326, "learning_rate": 1.8921394771448032e-05, "loss": 1.3288, "step": 215 }, { "epoch": 0.1755919113911188, "grad_norm": 4.253211975097656, "learning_rate": 1.89094674055486e-05, "loss": 1.3776, "step": 216 }, { "epoch": 0.1764048369068184, "grad_norm": 3.0681605339050293, "learning_rate": 1.889747825645599e-05, "loss": 1.4169, "step": 217 }, { "epoch": 0.17721776242251805, "grad_norm": 2.3741588592529297, "learning_rate": 1.8885427407309627e-05, "loss": 1.3392, "step": 218 }, { "epoch": 0.17803068793821766, "grad_norm": 2.968780279159546, "learning_rate": 1.887331494167678e-05, "loss": 1.4019, "step": 219 }, { "epoch": 0.17884361345391728, "grad_norm": 2.3684914112091064, "learning_rate": 1.8861140943552014e-05, "loss": 1.3599, "step": 220 }, { "epoch": 0.17965653896961692, "grad_norm": 3.0405993461608887, "learning_rate": 1.884890549735659e-05, "loss": 1.4245, "step": 221 }, { "epoch": 0.18046946448531653, "grad_norm": 3.397047281265259, "learning_rate": 1.8836608687937883e-05, "loss": 1.392, "step": 222 }, { "epoch": 0.18128239000101615, "grad_norm": 2.4693644046783447, "learning_rate": 1.8824250600568798e-05, "loss": 1.3726, "step": 223 }, { "epoch": 0.1820953155167158, "grad_norm": 6.75039005279541, "learning_rate": 1.8811831320947177e-05, "loss": 1.3473, "step": 224 }, { "epoch": 0.1829082410324154, "grad_norm": 2.922574758529663, "learning_rate": 1.879935093519519e-05, "loss": 1.4221, "step": 225 }, { "epoch": 0.18372116654811502, "grad_norm": 5.6719136238098145, "learning_rate": 1.878680952985877e-05, "loss": 1.3844, "step": 226 }, { "epoch": 0.18453409206381466, "grad_norm": 2.6967201232910156, "learning_rate": 1.8774207191906976e-05, "loss": 1.344, "step": 227 }, { "epoch": 0.18534701757951427, "grad_norm": 3.049881935119629, "learning_rate": 1.8761544008731426e-05, "loss": 1.3912, "step": 228 }, { "epoch": 0.18615994309521391, "grad_norm": 3.1408843994140625, "learning_rate": 1.874882006814565e-05, "loss": 1.4048, "step": 229 }, { "epoch": 0.18697286861091353, "grad_norm": 2.6653666496276855, "learning_rate": 1.8736035458384528e-05, "loss": 1.3844, "step": 230 }, { "epoch": 0.18778579412661314, "grad_norm": 2.6866488456726074, "learning_rate": 1.8723190268103634e-05, "loss": 1.3586, "step": 231 }, { "epoch": 0.18859871964231278, "grad_norm": 3.2653231620788574, "learning_rate": 1.8710284586378645e-05, "loss": 1.3856, "step": 232 }, { "epoch": 0.1894116451580124, "grad_norm": 2.841388463973999, "learning_rate": 1.8697318502704734e-05, "loss": 1.3868, "step": 233 }, { "epoch": 0.190224570673712, "grad_norm": 2.797558307647705, "learning_rate": 1.8684292106995916e-05, "loss": 1.3885, "step": 234 }, { "epoch": 0.19103749618941165, "grad_norm": 2.915003776550293, "learning_rate": 1.8671205489584453e-05, "loss": 1.3434, "step": 235 }, { "epoch": 0.19185042170511127, "grad_norm": 3.2142281532287598, "learning_rate": 1.865805874122021e-05, "loss": 1.3975, "step": 236 }, { "epoch": 0.19266334722081088, "grad_norm": 3.0831453800201416, "learning_rate": 1.8644851953070045e-05, "loss": 1.367, "step": 237 }, { "epoch": 0.19347627273651052, "grad_norm": 3.2555181980133057, "learning_rate": 1.863158521671716e-05, "loss": 1.33, "step": 238 }, { "epoch": 0.19428919825221014, "grad_norm": 2.8768310546875, "learning_rate": 1.8618258624160465e-05, "loss": 1.3867, "step": 239 }, { "epoch": 0.19510212376790975, "grad_norm": 2.9737942218780518, "learning_rate": 1.8604872267813954e-05, "loss": 1.3726, "step": 240 }, { "epoch": 0.1959150492836094, "grad_norm": 2.5942904949188232, "learning_rate": 1.859142624050605e-05, "loss": 1.3704, "step": 241 }, { "epoch": 0.196727974799309, "grad_norm": 2.6901443004608154, "learning_rate": 1.8577920635478976e-05, "loss": 1.3523, "step": 242 }, { "epoch": 0.19754090031500865, "grad_norm": 2.4508392810821533, "learning_rate": 1.8564355546388094e-05, "loss": 1.3758, "step": 243 }, { "epoch": 0.19835382583070826, "grad_norm": 2.3041279315948486, "learning_rate": 1.855073106730126e-05, "loss": 1.3491, "step": 244 }, { "epoch": 0.19916675134640788, "grad_norm": 2.8388736248016357, "learning_rate": 1.8537047292698175e-05, "loss": 1.3578, "step": 245 }, { "epoch": 0.19997967686210752, "grad_norm": 3.058314085006714, "learning_rate": 1.852330431746973e-05, "loss": 1.3547, "step": 246 }, { "epoch": 0.20079260237780713, "grad_norm": 2.881788492202759, "learning_rate": 1.8509502236917353e-05, "loss": 1.3823, "step": 247 }, { "epoch": 0.20160552789350675, "grad_norm": 2.623408794403076, "learning_rate": 1.8495641146752322e-05, "loss": 1.4516, "step": 248 }, { "epoch": 0.2024184534092064, "grad_norm": 2.662614345550537, "learning_rate": 1.848172114309513e-05, "loss": 1.3924, "step": 249 }, { "epoch": 0.203231378924906, "grad_norm": 2.520263671875, "learning_rate": 1.8467742322474822e-05, "loss": 1.4097, "step": 250 }, { "epoch": 0.20404430444060562, "grad_norm": 4.465703964233398, "learning_rate": 1.845370478182829e-05, "loss": 1.3645, "step": 251 }, { "epoch": 0.20485722995630526, "grad_norm": 2.5109176635742188, "learning_rate": 1.8439608618499637e-05, "loss": 1.3238, "step": 252 }, { "epoch": 0.20567015547200487, "grad_norm": 2.703659772872925, "learning_rate": 1.842545393023949e-05, "loss": 1.4027, "step": 253 }, { "epoch": 0.20648308098770451, "grad_norm": 3.483933448791504, "learning_rate": 1.841124081520431e-05, "loss": 1.4167, "step": 254 }, { "epoch": 0.20729600650340413, "grad_norm": 2.7172889709472656, "learning_rate": 1.8396969371955724e-05, "loss": 1.3017, "step": 255 }, { "epoch": 0.20810893201910374, "grad_norm": 2.512045383453369, "learning_rate": 1.838263969945985e-05, "loss": 1.4112, "step": 256 }, { "epoch": 0.20892185753480338, "grad_norm": 2.4449141025543213, "learning_rate": 1.836825189708659e-05, "loss": 1.3396, "step": 257 }, { "epoch": 0.209734783050503, "grad_norm": 2.9280951023101807, "learning_rate": 1.8353806064608953e-05, "loss": 1.3461, "step": 258 }, { "epoch": 0.2105477085662026, "grad_norm": 3.962769031524658, "learning_rate": 1.833930230220236e-05, "loss": 1.3347, "step": 259 }, { "epoch": 0.21136063408190225, "grad_norm": 3.3168771266937256, "learning_rate": 1.8324740710443955e-05, "loss": 1.3264, "step": 260 }, { "epoch": 0.21217355959760187, "grad_norm": 2.754786252975464, "learning_rate": 1.831012139031189e-05, "loss": 1.3859, "step": 261 }, { "epoch": 0.21298648511330148, "grad_norm": 2.5179426670074463, "learning_rate": 1.829544444318466e-05, "loss": 1.3653, "step": 262 }, { "epoch": 0.21379941062900112, "grad_norm": 2.9228906631469727, "learning_rate": 1.8280709970840352e-05, "loss": 1.3929, "step": 263 }, { "epoch": 0.21461233614470074, "grad_norm": 2.732806921005249, "learning_rate": 1.8265918075455985e-05, "loss": 1.3197, "step": 264 }, { "epoch": 0.21542526166040035, "grad_norm": 2.7236287593841553, "learning_rate": 1.8251068859606777e-05, "loss": 1.3156, "step": 265 }, { "epoch": 0.2162381871761, "grad_norm": 3.677654504776001, "learning_rate": 1.823616242626542e-05, "loss": 1.3565, "step": 266 }, { "epoch": 0.2170511126917996, "grad_norm": 2.4574098587036133, "learning_rate": 1.8221198878801415e-05, "loss": 1.3802, "step": 267 }, { "epoch": 0.21786403820749925, "grad_norm": 3.2601144313812256, "learning_rate": 1.8206178320980295e-05, "loss": 1.3606, "step": 268 }, { "epoch": 0.21867696372319886, "grad_norm": 2.4183156490325928, "learning_rate": 1.819110085696295e-05, "loss": 1.3327, "step": 269 }, { "epoch": 0.21948988923889848, "grad_norm": 2.6820755004882812, "learning_rate": 1.817596659130489e-05, "loss": 1.3676, "step": 270 }, { "epoch": 0.22030281475459812, "grad_norm": 2.619580030441284, "learning_rate": 1.816077562895551e-05, "loss": 1.408, "step": 271 }, { "epoch": 0.22111574027029773, "grad_norm": 2.4499645233154297, "learning_rate": 1.814552807525738e-05, "loss": 1.3445, "step": 272 }, { "epoch": 0.22192866578599735, "grad_norm": 2.5966873168945312, "learning_rate": 1.81302240359455e-05, "loss": 1.3354, "step": 273 }, { "epoch": 0.222741591301697, "grad_norm": 8.227926254272461, "learning_rate": 1.8114863617146576e-05, "loss": 1.3495, "step": 274 }, { "epoch": 0.2235545168173966, "grad_norm": 5.334491729736328, "learning_rate": 1.8099446925378278e-05, "loss": 1.3845, "step": 275 }, { "epoch": 0.22436744233309622, "grad_norm": 2.436473846435547, "learning_rate": 1.8083974067548506e-05, "loss": 1.3152, "step": 276 }, { "epoch": 0.22518036784879586, "grad_norm": 2.4906110763549805, "learning_rate": 1.806844515095465e-05, "loss": 1.3213, "step": 277 }, { "epoch": 0.22599329336449547, "grad_norm": 2.627547264099121, "learning_rate": 1.8052860283282832e-05, "loss": 1.3394, "step": 278 }, { "epoch": 0.22680621888019512, "grad_norm": 3.9034616947174072, "learning_rate": 1.8037219572607177e-05, "loss": 1.2956, "step": 279 }, { "epoch": 0.22761914439589473, "grad_norm": 2.9307639598846436, "learning_rate": 1.8021523127389066e-05, "loss": 1.3507, "step": 280 }, { "epoch": 0.22843206991159434, "grad_norm": 2.6711225509643555, "learning_rate": 1.800577105647635e-05, "loss": 1.4043, "step": 281 }, { "epoch": 0.22924499542729398, "grad_norm": 2.9251246452331543, "learning_rate": 1.7989963469102643e-05, "loss": 1.3424, "step": 282 }, { "epoch": 0.2300579209429936, "grad_norm": 2.2818679809570312, "learning_rate": 1.797410047488653e-05, "loss": 1.334, "step": 283 }, { "epoch": 0.2308708464586932, "grad_norm": 2.6961264610290527, "learning_rate": 1.7958182183830816e-05, "loss": 1.3411, "step": 284 }, { "epoch": 0.23168377197439285, "grad_norm": 2.5082268714904785, "learning_rate": 1.794220870632177e-05, "loss": 1.3815, "step": 285 }, { "epoch": 0.23249669749009247, "grad_norm": 2.6569674015045166, "learning_rate": 1.7926180153128358e-05, "loss": 1.4037, "step": 286 }, { "epoch": 0.23330962300579208, "grad_norm": 2.559483289718628, "learning_rate": 1.791009663540146e-05, "loss": 1.333, "step": 287 }, { "epoch": 0.23412254852149172, "grad_norm": 2.6982040405273438, "learning_rate": 1.789395826467312e-05, "loss": 1.4168, "step": 288 }, { "epoch": 0.23493547403719134, "grad_norm": 2.414900541305542, "learning_rate": 1.7877765152855757e-05, "loss": 1.3583, "step": 289 }, { "epoch": 0.23574839955289095, "grad_norm": 2.465045928955078, "learning_rate": 1.78615174122414e-05, "loss": 1.44, "step": 290 }, { "epoch": 0.2365613250685906, "grad_norm": 2.306795597076416, "learning_rate": 1.78452151555009e-05, "loss": 1.3215, "step": 291 }, { "epoch": 0.2373742505842902, "grad_norm": 2.6841700077056885, "learning_rate": 1.7828858495683162e-05, "loss": 1.351, "step": 292 }, { "epoch": 0.23818717609998985, "grad_norm": 2.4231340885162354, "learning_rate": 1.781244754621434e-05, "loss": 1.3923, "step": 293 }, { "epoch": 0.23900010161568946, "grad_norm": 2.8300161361694336, "learning_rate": 1.779598242089707e-05, "loss": 1.3876, "step": 294 }, { "epoch": 0.23981302713138908, "grad_norm": 2.6287200450897217, "learning_rate": 1.7779463233909677e-05, "loss": 1.3609, "step": 295 }, { "epoch": 0.24062595264708872, "grad_norm": 2.656332015991211, "learning_rate": 1.7762890099805362e-05, "loss": 1.3538, "step": 296 }, { "epoch": 0.24143887816278833, "grad_norm": 2.5331099033355713, "learning_rate": 1.774626313351145e-05, "loss": 1.3154, "step": 297 }, { "epoch": 0.24225180367848795, "grad_norm": 2.8881306648254395, "learning_rate": 1.7729582450328547e-05, "loss": 1.3561, "step": 298 }, { "epoch": 0.2430647291941876, "grad_norm": 2.4491260051727295, "learning_rate": 1.771284816592978e-05, "loss": 1.3494, "step": 299 }, { "epoch": 0.2438776547098872, "grad_norm": 2.8161392211914062, "learning_rate": 1.7696060396359956e-05, "loss": 1.3125, "step": 300 }, { "epoch": 0.24469058022558682, "grad_norm": 2.788238048553467, "learning_rate": 1.7679219258034798e-05, "loss": 1.41, "step": 301 }, { "epoch": 0.24550350574128646, "grad_norm": 3.0948519706726074, "learning_rate": 1.7662324867740102e-05, "loss": 1.4138, "step": 302 }, { "epoch": 0.24631643125698607, "grad_norm": 3.617783308029175, "learning_rate": 1.7645377342630956e-05, "loss": 1.3995, "step": 303 }, { "epoch": 0.24712935677268572, "grad_norm": 2.713531255722046, "learning_rate": 1.76283768002309e-05, "loss": 1.354, "step": 304 }, { "epoch": 0.24794228228838533, "grad_norm": 3.9215407371520996, "learning_rate": 1.7611323358431145e-05, "loss": 1.3939, "step": 305 }, { "epoch": 0.24875520780408494, "grad_norm": 3.519932508468628, "learning_rate": 1.759421713548971e-05, "loss": 1.3311, "step": 306 }, { "epoch": 0.24956813331978459, "grad_norm": 3.0680055618286133, "learning_rate": 1.757705825003065e-05, "loss": 1.4131, "step": 307 }, { "epoch": 0.25038105883548417, "grad_norm": 2.456533908843994, "learning_rate": 1.7559846821043205e-05, "loss": 1.3132, "step": 308 }, { "epoch": 0.25119398435118384, "grad_norm": 2.6937081813812256, "learning_rate": 1.754258296788097e-05, "loss": 1.3041, "step": 309 }, { "epoch": 0.25200690986688346, "grad_norm": 5.319806098937988, "learning_rate": 1.7525266810261096e-05, "loss": 1.3544, "step": 310 }, { "epoch": 0.25281983538258307, "grad_norm": 2.9595742225646973, "learning_rate": 1.7507898468263422e-05, "loss": 1.3528, "step": 311 }, { "epoch": 0.2536327608982827, "grad_norm": 4.085862636566162, "learning_rate": 1.7490478062329686e-05, "loss": 1.3314, "step": 312 }, { "epoch": 0.2544456864139823, "grad_norm": 2.4585909843444824, "learning_rate": 1.7473005713262644e-05, "loss": 1.3622, "step": 313 }, { "epoch": 0.25525861192968197, "grad_norm": 2.4798450469970703, "learning_rate": 1.7455481542225272e-05, "loss": 1.3804, "step": 314 }, { "epoch": 0.2560715374453816, "grad_norm": 2.686068534851074, "learning_rate": 1.7437905670739893e-05, "loss": 1.2945, "step": 315 }, { "epoch": 0.2568844629610812, "grad_norm": 2.7424585819244385, "learning_rate": 1.7420278220687366e-05, "loss": 1.3561, "step": 316 }, { "epoch": 0.2576973884767808, "grad_norm": 2.964237928390503, "learning_rate": 1.7402599314306207e-05, "loss": 1.3701, "step": 317 }, { "epoch": 0.2585103139924804, "grad_norm": 2.7983458042144775, "learning_rate": 1.7384869074191777e-05, "loss": 1.3536, "step": 318 }, { "epoch": 0.25932323950818004, "grad_norm": 2.6008524894714355, "learning_rate": 1.7367087623295394e-05, "loss": 1.3394, "step": 319 }, { "epoch": 0.2601361650238797, "grad_norm": 2.4116249084472656, "learning_rate": 1.7349255084923517e-05, "loss": 1.3785, "step": 320 }, { "epoch": 0.2609490905395793, "grad_norm": 2.9649388790130615, "learning_rate": 1.7331371582736864e-05, "loss": 1.3779, "step": 321 }, { "epoch": 0.26176201605527893, "grad_norm": 2.692847490310669, "learning_rate": 1.731343724074957e-05, "loss": 1.3715, "step": 322 }, { "epoch": 0.26257494157097855, "grad_norm": 2.6246955394744873, "learning_rate": 1.7295452183328317e-05, "loss": 1.3856, "step": 323 }, { "epoch": 0.26338786708667816, "grad_norm": 2.822334051132202, "learning_rate": 1.7277416535191478e-05, "loss": 1.3289, "step": 324 }, { "epoch": 0.26420079260237783, "grad_norm": 2.703158378601074, "learning_rate": 1.7259330421408247e-05, "loss": 1.3447, "step": 325 }, { "epoch": 0.26501371811807745, "grad_norm": 2.5357322692871094, "learning_rate": 1.7241193967397784e-05, "loss": 1.3414, "step": 326 }, { "epoch": 0.26582664363377706, "grad_norm": 2.7839202880859375, "learning_rate": 1.7223007298928322e-05, "loss": 1.3725, "step": 327 }, { "epoch": 0.2666395691494767, "grad_norm": 2.6645684242248535, "learning_rate": 1.7204770542116326e-05, "loss": 1.3163, "step": 328 }, { "epoch": 0.2674524946651763, "grad_norm": 4.677945137023926, "learning_rate": 1.7186483823425582e-05, "loss": 1.3583, "step": 329 }, { "epoch": 0.2682654201808759, "grad_norm": 2.948094367980957, "learning_rate": 1.7168147269666357e-05, "loss": 1.3643, "step": 330 }, { "epoch": 0.26907834569657557, "grad_norm": 2.5047991275787354, "learning_rate": 1.714976100799449e-05, "loss": 1.3542, "step": 331 }, { "epoch": 0.2698912712122752, "grad_norm": 2.680239677429199, "learning_rate": 1.713132516591053e-05, "loss": 1.3204, "step": 332 }, { "epoch": 0.2707041967279748, "grad_norm": 2.703165054321289, "learning_rate": 1.7112839871258838e-05, "loss": 1.3467, "step": 333 }, { "epoch": 0.2715171222436744, "grad_norm": 2.5855846405029297, "learning_rate": 1.7094305252226713e-05, "loss": 1.3807, "step": 334 }, { "epoch": 0.272330047759374, "grad_norm": 2.8401761054992676, "learning_rate": 1.7075721437343488e-05, "loss": 1.4032, "step": 335 }, { "epoch": 0.2731429732750737, "grad_norm": 2.727287530899048, "learning_rate": 1.705708855547966e-05, "loss": 1.3416, "step": 336 }, { "epoch": 0.2739558987907733, "grad_norm": 2.9767589569091797, "learning_rate": 1.7038406735845967e-05, "loss": 1.3062, "step": 337 }, { "epoch": 0.2747688243064729, "grad_norm": 2.6532137393951416, "learning_rate": 1.7019676107992523e-05, "loss": 1.3717, "step": 338 }, { "epoch": 0.27558174982217254, "grad_norm": 5.618951797485352, "learning_rate": 1.70008968018079e-05, "loss": 1.4021, "step": 339 }, { "epoch": 0.27639467533787215, "grad_norm": 2.75219464302063, "learning_rate": 1.6982068947518235e-05, "loss": 1.3345, "step": 340 }, { "epoch": 0.27720760085357177, "grad_norm": 2.7771074771881104, "learning_rate": 1.6963192675686312e-05, "loss": 1.3613, "step": 341 }, { "epoch": 0.27802052636927144, "grad_norm": 2.4822003841400146, "learning_rate": 1.694426811721069e-05, "loss": 1.3465, "step": 342 }, { "epoch": 0.27883345188497105, "grad_norm": 2.684894323348999, "learning_rate": 1.6925295403324758e-05, "loss": 1.337, "step": 343 }, { "epoch": 0.27964637740067066, "grad_norm": 2.804255962371826, "learning_rate": 1.6906274665595854e-05, "loss": 1.2862, "step": 344 }, { "epoch": 0.2804593029163703, "grad_norm": 2.7327306270599365, "learning_rate": 1.688720603592432e-05, "loss": 1.3826, "step": 345 }, { "epoch": 0.2812722284320699, "grad_norm": 3.0967769622802734, "learning_rate": 1.6868089646542632e-05, "loss": 1.3406, "step": 346 }, { "epoch": 0.28208515394776956, "grad_norm": 2.4972376823425293, "learning_rate": 1.6848925630014445e-05, "loss": 1.3315, "step": 347 }, { "epoch": 0.2828980794634692, "grad_norm": 22.60991668701172, "learning_rate": 1.6829714119233688e-05, "loss": 1.3325, "step": 348 }, { "epoch": 0.2837110049791688, "grad_norm": 3.207625389099121, "learning_rate": 1.6810455247423634e-05, "loss": 1.3926, "step": 349 }, { "epoch": 0.2845239304948684, "grad_norm": 2.6568946838378906, "learning_rate": 1.6791149148136003e-05, "loss": 1.3464, "step": 350 }, { "epoch": 0.285336856010568, "grad_norm": 2.9483156204223633, "learning_rate": 1.677179595525e-05, "loss": 1.2875, "step": 351 }, { "epoch": 0.28614978152626763, "grad_norm": 2.841442584991455, "learning_rate": 1.675239580297141e-05, "loss": 1.3441, "step": 352 }, { "epoch": 0.2869627070419673, "grad_norm": 3.3877551555633545, "learning_rate": 1.6732948825831657e-05, "loss": 1.3662, "step": 353 }, { "epoch": 0.2877756325576669, "grad_norm": 2.9442946910858154, "learning_rate": 1.671345515868688e-05, "loss": 1.3075, "step": 354 }, { "epoch": 0.28858855807336653, "grad_norm": 2.672950029373169, "learning_rate": 1.6693914936716983e-05, "loss": 1.2982, "step": 355 }, { "epoch": 0.28940148358906614, "grad_norm": 2.7699198722839355, "learning_rate": 1.6674328295424723e-05, "loss": 1.3331, "step": 356 }, { "epoch": 0.29021440910476576, "grad_norm": 2.578444719314575, "learning_rate": 1.6654695370634738e-05, "loss": 1.3768, "step": 357 }, { "epoch": 0.29102733462046537, "grad_norm": 2.748466968536377, "learning_rate": 1.6635016298492628e-05, "loss": 1.3108, "step": 358 }, { "epoch": 0.29184026013616504, "grad_norm": 2.818321943283081, "learning_rate": 1.6615291215464005e-05, "loss": 1.2586, "step": 359 }, { "epoch": 0.29265318565186466, "grad_norm": 3.6742396354675293, "learning_rate": 1.6595520258333545e-05, "loss": 1.3112, "step": 360 }, { "epoch": 0.29346611116756427, "grad_norm": 2.999140977859497, "learning_rate": 1.657570356420404e-05, "loss": 1.2923, "step": 361 }, { "epoch": 0.2942790366832639, "grad_norm": 2.704463481903076, "learning_rate": 1.6555841270495456e-05, "loss": 1.3329, "step": 362 }, { "epoch": 0.2950919621989635, "grad_norm": 3.2639801502227783, "learning_rate": 1.6535933514943955e-05, "loss": 1.3215, "step": 363 }, { "epoch": 0.29590488771466317, "grad_norm": 3.2200841903686523, "learning_rate": 1.6515980435600965e-05, "loss": 1.3792, "step": 364 }, { "epoch": 0.2967178132303628, "grad_norm": 2.9226245880126953, "learning_rate": 1.6495982170832224e-05, "loss": 1.3565, "step": 365 }, { "epoch": 0.2975307387460624, "grad_norm": 3.096405029296875, "learning_rate": 1.6475938859316795e-05, "loss": 1.3857, "step": 366 }, { "epoch": 0.298343664261762, "grad_norm": 2.7694365978240967, "learning_rate": 1.6455850640046134e-05, "loss": 1.3782, "step": 367 }, { "epoch": 0.2991565897774616, "grad_norm": 3.011751890182495, "learning_rate": 1.6435717652323097e-05, "loss": 1.3426, "step": 368 }, { "epoch": 0.29996951529316124, "grad_norm": 2.7828853130340576, "learning_rate": 1.6415540035761008e-05, "loss": 1.3429, "step": 369 }, { "epoch": 0.3007824408088609, "grad_norm": 2.5543785095214844, "learning_rate": 1.639531793028265e-05, "loss": 1.3768, "step": 370 }, { "epoch": 0.3015953663245605, "grad_norm": 2.8462271690368652, "learning_rate": 1.637505147611934e-05, "loss": 1.3203, "step": 371 }, { "epoch": 0.30240829184026013, "grad_norm": 2.404257297515869, "learning_rate": 1.6354740813809917e-05, "loss": 1.3693, "step": 372 }, { "epoch": 0.30322121735595975, "grad_norm": 2.674553394317627, "learning_rate": 1.6334386084199787e-05, "loss": 1.3518, "step": 373 }, { "epoch": 0.30403414287165936, "grad_norm": 2.4954397678375244, "learning_rate": 1.631398742843995e-05, "loss": 1.3669, "step": 374 }, { "epoch": 0.30484706838735903, "grad_norm": 3.333721876144409, "learning_rate": 1.629354498798601e-05, "loss": 1.3358, "step": 375 }, { "epoch": 0.30565999390305865, "grad_norm": 2.859560966491699, "learning_rate": 1.627305890459719e-05, "loss": 1.3334, "step": 376 }, { "epoch": 0.30647291941875826, "grad_norm": 2.8346803188323975, "learning_rate": 1.625252932033538e-05, "loss": 1.3366, "step": 377 }, { "epoch": 0.3072858449344579, "grad_norm": 2.64909029006958, "learning_rate": 1.6231956377564095e-05, "loss": 1.3398, "step": 378 }, { "epoch": 0.3080987704501575, "grad_norm": 3.935067653656006, "learning_rate": 1.621134021894756e-05, "loss": 1.2953, "step": 379 }, { "epoch": 0.3089116959658571, "grad_norm": 5.056494235992432, "learning_rate": 1.619068098744965e-05, "loss": 1.3245, "step": 380 }, { "epoch": 0.30972462148155677, "grad_norm": 2.9668800830841064, "learning_rate": 1.6169978826332955e-05, "loss": 1.3199, "step": 381 }, { "epoch": 0.3105375469972564, "grad_norm": 2.6101276874542236, "learning_rate": 1.6149233879157747e-05, "loss": 1.3317, "step": 382 }, { "epoch": 0.311350472512956, "grad_norm": 2.677374839782715, "learning_rate": 1.6128446289781012e-05, "loss": 1.304, "step": 383 }, { "epoch": 0.3121633980286556, "grad_norm": 4.049331188201904, "learning_rate": 1.610761620235543e-05, "loss": 1.3241, "step": 384 }, { "epoch": 0.31297632354435523, "grad_norm": 2.566908836364746, "learning_rate": 1.60867437613284e-05, "loss": 1.3392, "step": 385 }, { "epoch": 0.3137892490600549, "grad_norm": 2.550367832183838, "learning_rate": 1.6065829111441e-05, "loss": 1.3274, "step": 386 }, { "epoch": 0.3146021745757545, "grad_norm": 4.543491363525391, "learning_rate": 1.6044872397727037e-05, "loss": 1.2993, "step": 387 }, { "epoch": 0.3154151000914541, "grad_norm": 2.8900489807128906, "learning_rate": 1.6023873765511993e-05, "loss": 1.3274, "step": 388 }, { "epoch": 0.31622802560715374, "grad_norm": 2.4930450916290283, "learning_rate": 1.6002833360412044e-05, "loss": 1.3074, "step": 389 }, { "epoch": 0.31704095112285335, "grad_norm": 3.0221235752105713, "learning_rate": 1.5981751328333036e-05, "loss": 1.3077, "step": 390 }, { "epoch": 0.31785387663855297, "grad_norm": 3.0569851398468018, "learning_rate": 1.5960627815469486e-05, "loss": 1.3705, "step": 391 }, { "epoch": 0.31866680215425264, "grad_norm": 7.261632442474365, "learning_rate": 1.5939462968303554e-05, "loss": 1.3564, "step": 392 }, { "epoch": 0.31947972766995225, "grad_norm": 3.0555789470672607, "learning_rate": 1.5918256933604047e-05, "loss": 1.3451, "step": 393 }, { "epoch": 0.32029265318565187, "grad_norm": 3.360779047012329, "learning_rate": 1.589700985842538e-05, "loss": 1.2764, "step": 394 }, { "epoch": 0.3211055787013515, "grad_norm": 2.9022507667541504, "learning_rate": 1.5875721890106574e-05, "loss": 1.3424, "step": 395 }, { "epoch": 0.3219185042170511, "grad_norm": 5.119380474090576, "learning_rate": 1.5854393176270205e-05, "loss": 1.3392, "step": 396 }, { "epoch": 0.32273142973275076, "grad_norm": 2.7554409503936768, "learning_rate": 1.5833023864821427e-05, "loss": 1.3762, "step": 397 }, { "epoch": 0.3235443552484504, "grad_norm": 2.553323984146118, "learning_rate": 1.5811614103946905e-05, "loss": 1.3066, "step": 398 }, { "epoch": 0.32435728076415, "grad_norm": 3.514381170272827, "learning_rate": 1.5790164042113805e-05, "loss": 1.3575, "step": 399 }, { "epoch": 0.3251702062798496, "grad_norm": 2.89054012298584, "learning_rate": 1.576867382806877e-05, "loss": 1.3106, "step": 400 }, { "epoch": 0.3259831317955492, "grad_norm": 2.9955763816833496, "learning_rate": 1.5747143610836873e-05, "loss": 1.3634, "step": 401 }, { "epoch": 0.32679605731124883, "grad_norm": 3.175438404083252, "learning_rate": 1.5725573539720592e-05, "loss": 1.2876, "step": 402 }, { "epoch": 0.3276089828269485, "grad_norm": 2.6269116401672363, "learning_rate": 1.570396376429877e-05, "loss": 1.342, "step": 403 }, { "epoch": 0.3284219083426481, "grad_norm": 2.900568962097168, "learning_rate": 1.5682314434425593e-05, "loss": 1.3133, "step": 404 }, { "epoch": 0.32923483385834773, "grad_norm": 2.6711323261260986, "learning_rate": 1.5660625700229526e-05, "loss": 1.2702, "step": 405 }, { "epoch": 0.33004775937404734, "grad_norm": 2.8045928478240967, "learning_rate": 1.5638897712112303e-05, "loss": 1.3336, "step": 406 }, { "epoch": 0.33086068488974696, "grad_norm": 2.9632303714752197, "learning_rate": 1.561713062074785e-05, "loss": 1.3546, "step": 407 }, { "epoch": 0.33167361040544663, "grad_norm": 2.5156984329223633, "learning_rate": 1.5595324577081265e-05, "loss": 1.3587, "step": 408 }, { "epoch": 0.33248653592114624, "grad_norm": 2.6634364128112793, "learning_rate": 1.5573479732327758e-05, "loss": 1.3317, "step": 409 }, { "epoch": 0.33329946143684586, "grad_norm": 4.38008451461792, "learning_rate": 1.555159623797161e-05, "loss": 1.3078, "step": 410 }, { "epoch": 0.33411238695254547, "grad_norm": 3.089078903198242, "learning_rate": 1.552967424576512e-05, "loss": 1.328, "step": 411 }, { "epoch": 0.3349253124682451, "grad_norm": 2.9011247158050537, "learning_rate": 1.5507713907727557e-05, "loss": 1.349, "step": 412 }, { "epoch": 0.3357382379839447, "grad_norm": 2.431152582168579, "learning_rate": 1.5485715376144087e-05, "loss": 1.383, "step": 413 }, { "epoch": 0.33655116349964437, "grad_norm": 2.6097633838653564, "learning_rate": 1.5463678803564753e-05, "loss": 1.3414, "step": 414 }, { "epoch": 0.337364089015344, "grad_norm": 2.9973533153533936, "learning_rate": 1.5441604342803374e-05, "loss": 1.3359, "step": 415 }, { "epoch": 0.3381770145310436, "grad_norm": 2.849950075149536, "learning_rate": 1.5419492146936518e-05, "loss": 1.3378, "step": 416 }, { "epoch": 0.3389899400467432, "grad_norm": 2.600947856903076, "learning_rate": 1.5397342369302425e-05, "loss": 1.3411, "step": 417 }, { "epoch": 0.3398028655624428, "grad_norm": 2.946190595626831, "learning_rate": 1.5375155163499953e-05, "loss": 1.2981, "step": 418 }, { "epoch": 0.34061579107814244, "grad_norm": 3.5300893783569336, "learning_rate": 1.5352930683387502e-05, "loss": 1.3717, "step": 419 }, { "epoch": 0.3414287165938421, "grad_norm": 2.342288017272949, "learning_rate": 1.5330669083081956e-05, "loss": 1.2734, "step": 420 }, { "epoch": 0.3422416421095417, "grad_norm": 3.7037856578826904, "learning_rate": 1.5308370516957617e-05, "loss": 1.3402, "step": 421 }, { "epoch": 0.34305456762524134, "grad_norm": 2.5814309120178223, "learning_rate": 1.528603513964511e-05, "loss": 1.3207, "step": 422 }, { "epoch": 0.34386749314094095, "grad_norm": 2.4542317390441895, "learning_rate": 1.5263663106030347e-05, "loss": 1.3257, "step": 423 }, { "epoch": 0.34468041865664056, "grad_norm": 2.689870595932007, "learning_rate": 1.5241254571253433e-05, "loss": 1.3105, "step": 424 }, { "epoch": 0.34549334417234023, "grad_norm": 2.900061845779419, "learning_rate": 1.5218809690707583e-05, "loss": 1.3113, "step": 425 }, { "epoch": 0.34630626968803985, "grad_norm": 2.7165238857269287, "learning_rate": 1.5196328620038059e-05, "loss": 1.335, "step": 426 }, { "epoch": 0.34711919520373946, "grad_norm": 2.3893747329711914, "learning_rate": 1.5173811515141083e-05, "loss": 1.3062, "step": 427 }, { "epoch": 0.3479321207194391, "grad_norm": 2.568575143814087, "learning_rate": 1.5151258532162771e-05, "loss": 1.3338, "step": 428 }, { "epoch": 0.3487450462351387, "grad_norm": 3.406301736831665, "learning_rate": 1.5128669827498024e-05, "loss": 1.3189, "step": 429 }, { "epoch": 0.3495579717508383, "grad_norm": 2.752307653427124, "learning_rate": 1.5106045557789453e-05, "loss": 1.331, "step": 430 }, { "epoch": 0.350370897266538, "grad_norm": 2.570742130279541, "learning_rate": 1.5083385879926309e-05, "loss": 1.2887, "step": 431 }, { "epoch": 0.3511838227822376, "grad_norm": 2.4754555225372314, "learning_rate": 1.5060690951043385e-05, "loss": 1.3432, "step": 432 }, { "epoch": 0.3519967482979372, "grad_norm": 3.853609561920166, "learning_rate": 1.5037960928519902e-05, "loss": 1.3625, "step": 433 }, { "epoch": 0.3528096738136368, "grad_norm": 2.6506130695343018, "learning_rate": 1.501519596997847e-05, "loss": 1.2797, "step": 434 }, { "epoch": 0.35362259932933643, "grad_norm": 2.8529601097106934, "learning_rate": 1.499239623328394e-05, "loss": 1.2868, "step": 435 }, { "epoch": 0.3544355248450361, "grad_norm": 4.091727256774902, "learning_rate": 1.4969561876542348e-05, "loss": 1.2648, "step": 436 }, { "epoch": 0.3552484503607357, "grad_norm": 2.5217483043670654, "learning_rate": 1.4946693058099802e-05, "loss": 1.2792, "step": 437 }, { "epoch": 0.3560613758764353, "grad_norm": 3.035297155380249, "learning_rate": 1.4923789936541378e-05, "loss": 1.3267, "step": 438 }, { "epoch": 0.35687430139213494, "grad_norm": 4.371755599975586, "learning_rate": 1.4900852670690044e-05, "loss": 1.3114, "step": 439 }, { "epoch": 0.35768722690783455, "grad_norm": 2.904101610183716, "learning_rate": 1.487788141960553e-05, "loss": 1.3716, "step": 440 }, { "epoch": 0.35850015242353417, "grad_norm": 2.663241147994995, "learning_rate": 1.4854876342583246e-05, "loss": 1.3269, "step": 441 }, { "epoch": 0.35931307793923384, "grad_norm": 2.626646041870117, "learning_rate": 1.4831837599153165e-05, "loss": 1.3077, "step": 442 }, { "epoch": 0.36012600345493345, "grad_norm": 2.8876073360443115, "learning_rate": 1.4808765349078729e-05, "loss": 1.2807, "step": 443 }, { "epoch": 0.36093892897063307, "grad_norm": 2.5428106784820557, "learning_rate": 1.4785659752355724e-05, "loss": 1.3242, "step": 444 }, { "epoch": 0.3617518544863327, "grad_norm": 2.7515244483947754, "learning_rate": 1.4762520969211186e-05, "loss": 1.3356, "step": 445 }, { "epoch": 0.3625647800020323, "grad_norm": 2.771684408187866, "learning_rate": 1.4739349160102285e-05, "loss": 1.3255, "step": 446 }, { "epoch": 0.36337770551773196, "grad_norm": 2.7270543575286865, "learning_rate": 1.4716144485715209e-05, "loss": 1.2797, "step": 447 }, { "epoch": 0.3641906310334316, "grad_norm": 3.5211868286132812, "learning_rate": 1.4692907106964051e-05, "loss": 1.3098, "step": 448 }, { "epoch": 0.3650035565491312, "grad_norm": 5.923196315765381, "learning_rate": 1.4669637184989696e-05, "loss": 1.3212, "step": 449 }, { "epoch": 0.3658164820648308, "grad_norm": 2.50697922706604, "learning_rate": 1.4646334881158704e-05, "loss": 1.3195, "step": 450 }, { "epoch": 0.3666294075805304, "grad_norm": 3.3721578121185303, "learning_rate": 1.4623000357062184e-05, "loss": 1.2747, "step": 451 }, { "epoch": 0.36744233309623003, "grad_norm": 2.429243803024292, "learning_rate": 1.459963377451468e-05, "loss": 1.3122, "step": 452 }, { "epoch": 0.3682552586119297, "grad_norm": 4.240250587463379, "learning_rate": 1.457623529555305e-05, "loss": 1.3447, "step": 453 }, { "epoch": 0.3690681841276293, "grad_norm": 2.631667137145996, "learning_rate": 1.4552805082435333e-05, "loss": 1.3171, "step": 454 }, { "epoch": 0.36988110964332893, "grad_norm": 2.906388521194458, "learning_rate": 1.4529343297639638e-05, "loss": 1.3193, "step": 455 }, { "epoch": 0.37069403515902855, "grad_norm": 3.047884464263916, "learning_rate": 1.4505850103863007e-05, "loss": 1.3181, "step": 456 }, { "epoch": 0.37150696067472816, "grad_norm": 2.3922433853149414, "learning_rate": 1.448232566402028e-05, "loss": 1.3203, "step": 457 }, { "epoch": 0.37231988619042783, "grad_norm": 3.278813123703003, "learning_rate": 1.4458770141242992e-05, "loss": 1.3309, "step": 458 }, { "epoch": 0.37313281170612744, "grad_norm": 2.7148866653442383, "learning_rate": 1.4435183698878212e-05, "loss": 1.3408, "step": 459 }, { "epoch": 0.37394573722182706, "grad_norm": 2.913823366165161, "learning_rate": 1.4411566500487425e-05, "loss": 1.3426, "step": 460 }, { "epoch": 0.37475866273752667, "grad_norm": 2.435643196105957, "learning_rate": 1.4387918709845395e-05, "loss": 1.3357, "step": 461 }, { "epoch": 0.3755715882532263, "grad_norm": 2.6099560260772705, "learning_rate": 1.4364240490939032e-05, "loss": 1.3013, "step": 462 }, { "epoch": 0.3763845137689259, "grad_norm": 2.7896599769592285, "learning_rate": 1.4340532007966252e-05, "loss": 1.3284, "step": 463 }, { "epoch": 0.37719743928462557, "grad_norm": 2.857205867767334, "learning_rate": 1.4316793425334836e-05, "loss": 1.2926, "step": 464 }, { "epoch": 0.3780103648003252, "grad_norm": 2.4580750465393066, "learning_rate": 1.4293024907661295e-05, "loss": 1.3926, "step": 465 }, { "epoch": 0.3788232903160248, "grad_norm": 2.6340065002441406, "learning_rate": 1.4269226619769727e-05, "loss": 1.3315, "step": 466 }, { "epoch": 0.3796362158317244, "grad_norm": 3.416398525238037, "learning_rate": 1.424539872669067e-05, "loss": 1.2822, "step": 467 }, { "epoch": 0.380449141347424, "grad_norm": 2.4222054481506348, "learning_rate": 1.4221541393659966e-05, "loss": 1.2894, "step": 468 }, { "epoch": 0.38126206686312364, "grad_norm": 2.797074794769287, "learning_rate": 1.4197654786117604e-05, "loss": 1.3519, "step": 469 }, { "epoch": 0.3820749923788233, "grad_norm": 2.563831329345703, "learning_rate": 1.4173739069706586e-05, "loss": 1.3474, "step": 470 }, { "epoch": 0.3828879178945229, "grad_norm": 2.4004971981048584, "learning_rate": 1.414979441027176e-05, "loss": 1.3007, "step": 471 }, { "epoch": 0.38370084341022254, "grad_norm": 2.532390594482422, "learning_rate": 1.4125820973858693e-05, "loss": 1.2613, "step": 472 }, { "epoch": 0.38451376892592215, "grad_norm": 2.5733683109283447, "learning_rate": 1.41018189267125e-05, "loss": 1.3212, "step": 473 }, { "epoch": 0.38532669444162176, "grad_norm": 2.710106134414673, "learning_rate": 1.4077788435276701e-05, "loss": 1.3235, "step": 474 }, { "epoch": 0.38613961995732143, "grad_norm": 2.996795892715454, "learning_rate": 1.4053729666192067e-05, "loss": 1.3722, "step": 475 }, { "epoch": 0.38695254547302105, "grad_norm": 2.4392545223236084, "learning_rate": 1.4029642786295452e-05, "loss": 1.3706, "step": 476 }, { "epoch": 0.38776547098872066, "grad_norm": 2.6843369007110596, "learning_rate": 1.400552796261866e-05, "loss": 1.3382, "step": 477 }, { "epoch": 0.3885783965044203, "grad_norm": 2.405515193939209, "learning_rate": 1.3981385362387268e-05, "loss": 1.316, "step": 478 }, { "epoch": 0.3893913220201199, "grad_norm": 2.425203800201416, "learning_rate": 1.3957215153019463e-05, "loss": 1.3578, "step": 479 }, { "epoch": 0.3902042475358195, "grad_norm": 2.5134634971618652, "learning_rate": 1.3933017502124897e-05, "loss": 1.3531, "step": 480 }, { "epoch": 0.3910171730515192, "grad_norm": 2.4274141788482666, "learning_rate": 1.3908792577503514e-05, "loss": 1.3705, "step": 481 }, { "epoch": 0.3918300985672188, "grad_norm": 2.881443500518799, "learning_rate": 1.3884540547144393e-05, "loss": 1.3196, "step": 482 }, { "epoch": 0.3926430240829184, "grad_norm": 2.5505170822143555, "learning_rate": 1.3860261579224574e-05, "loss": 1.3221, "step": 483 }, { "epoch": 0.393455949598618, "grad_norm": 2.5604939460754395, "learning_rate": 1.3835955842107897e-05, "loss": 1.2565, "step": 484 }, { "epoch": 0.39426887511431763, "grad_norm": 2.8203351497650146, "learning_rate": 1.3811623504343845e-05, "loss": 1.323, "step": 485 }, { "epoch": 0.3950818006300173, "grad_norm": 3.9116978645324707, "learning_rate": 1.378726473466635e-05, "loss": 1.3188, "step": 486 }, { "epoch": 0.3958947261457169, "grad_norm": 2.918548822402954, "learning_rate": 1.3762879701992642e-05, "loss": 1.337, "step": 487 }, { "epoch": 0.3967076516614165, "grad_norm": 3.048039674758911, "learning_rate": 1.373846857542208e-05, "loss": 1.3379, "step": 488 }, { "epoch": 0.39752057717711614, "grad_norm": 2.6825406551361084, "learning_rate": 1.3714031524234965e-05, "loss": 1.3096, "step": 489 }, { "epoch": 0.39833350269281576, "grad_norm": 2.5955066680908203, "learning_rate": 1.3689568717891381e-05, "loss": 1.2947, "step": 490 }, { "epoch": 0.39914642820851537, "grad_norm": 2.5204849243164062, "learning_rate": 1.3665080326029997e-05, "loss": 1.2852, "step": 491 }, { "epoch": 0.39995935372421504, "grad_norm": 3.158151865005493, "learning_rate": 1.364056651846693e-05, "loss": 1.3323, "step": 492 }, { "epoch": 0.40077227923991465, "grad_norm": 2.787951946258545, "learning_rate": 1.3616027465194525e-05, "loss": 1.325, "step": 493 }, { "epoch": 0.40158520475561427, "grad_norm": 3.462423324584961, "learning_rate": 1.35914633363802e-05, "loss": 1.2689, "step": 494 }, { "epoch": 0.4023981302713139, "grad_norm": 3.3612263202667236, "learning_rate": 1.356687430236526e-05, "loss": 1.2846, "step": 495 }, { "epoch": 0.4032110557870135, "grad_norm": 2.521135091781616, "learning_rate": 1.3542260533663723e-05, "loss": 1.2845, "step": 496 }, { "epoch": 0.40402398130271316, "grad_norm": 2.702359914779663, "learning_rate": 1.351762220096112e-05, "loss": 1.2982, "step": 497 }, { "epoch": 0.4048369068184128, "grad_norm": 2.928270101547241, "learning_rate": 1.3492959475113332e-05, "loss": 1.2878, "step": 498 }, { "epoch": 0.4056498323341124, "grad_norm": 2.491701126098633, "learning_rate": 1.3468272527145388e-05, "loss": 1.2913, "step": 499 }, { "epoch": 0.406462757849812, "grad_norm": 2.8777735233306885, "learning_rate": 1.3443561528250295e-05, "loss": 1.328, "step": 500 }, { "epoch": 0.4072756833655116, "grad_norm": 3.4918212890625, "learning_rate": 1.3418826649787834e-05, "loss": 1.3415, "step": 501 }, { "epoch": 0.40808860888121123, "grad_norm": 2.6940505504608154, "learning_rate": 1.3394068063283387e-05, "loss": 1.3017, "step": 502 }, { "epoch": 0.4089015343969109, "grad_norm": 3.9722023010253906, "learning_rate": 1.3369285940426737e-05, "loss": 1.3161, "step": 503 }, { "epoch": 0.4097144599126105, "grad_norm": 2.6105010509490967, "learning_rate": 1.334448045307088e-05, "loss": 1.2853, "step": 504 }, { "epoch": 0.41052738542831013, "grad_norm": 4.094304084777832, "learning_rate": 1.331965177323084e-05, "loss": 1.3059, "step": 505 }, { "epoch": 0.41134031094400975, "grad_norm": 2.5570600032806396, "learning_rate": 1.3294800073082464e-05, "loss": 1.2957, "step": 506 }, { "epoch": 0.41215323645970936, "grad_norm": 2.60870099067688, "learning_rate": 1.3269925524961237e-05, "loss": 1.2887, "step": 507 }, { "epoch": 0.41296616197540903, "grad_norm": 2.3958325386047363, "learning_rate": 1.3245028301361086e-05, "loss": 1.3207, "step": 508 }, { "epoch": 0.41377908749110864, "grad_norm": 2.718470811843872, "learning_rate": 1.3220108574933185e-05, "loss": 1.2884, "step": 509 }, { "epoch": 0.41459201300680826, "grad_norm": 2.9990408420562744, "learning_rate": 1.3195166518484748e-05, "loss": 1.3104, "step": 510 }, { "epoch": 0.41540493852250787, "grad_norm": 3.256333589553833, "learning_rate": 1.317020230497784e-05, "loss": 1.2586, "step": 511 }, { "epoch": 0.4162178640382075, "grad_norm": 3.0497708320617676, "learning_rate": 1.3145216107528178e-05, "loss": 1.2946, "step": 512 }, { "epoch": 0.4170307895539071, "grad_norm": 2.6696412563323975, "learning_rate": 1.3120208099403926e-05, "loss": 1.3413, "step": 513 }, { "epoch": 0.41784371506960677, "grad_norm": 2.592937469482422, "learning_rate": 1.3095178454024496e-05, "loss": 1.2827, "step": 514 }, { "epoch": 0.4186566405853064, "grad_norm": 2.450669288635254, "learning_rate": 1.3070127344959348e-05, "loss": 1.2505, "step": 515 }, { "epoch": 0.419469566101006, "grad_norm": 4.529777526855469, "learning_rate": 1.3045054945926775e-05, "loss": 1.3001, "step": 516 }, { "epoch": 0.4202824916167056, "grad_norm": 3.2491648197174072, "learning_rate": 1.3019961430792711e-05, "loss": 1.2932, "step": 517 }, { "epoch": 0.4210954171324052, "grad_norm": 3.3505818843841553, "learning_rate": 1.2994846973569524e-05, "loss": 1.3516, "step": 518 }, { "epoch": 0.4219083426481049, "grad_norm": 3.5476715564727783, "learning_rate": 1.2969711748414804e-05, "loss": 1.2834, "step": 519 }, { "epoch": 0.4227212681638045, "grad_norm": 2.738903522491455, "learning_rate": 1.2944555929630152e-05, "loss": 1.2978, "step": 520 }, { "epoch": 0.4235341936795041, "grad_norm": 2.5854766368865967, "learning_rate": 1.2919379691659979e-05, "loss": 1.293, "step": 521 }, { "epoch": 0.42434711919520374, "grad_norm": 3.76955246925354, "learning_rate": 1.2894183209090304e-05, "loss": 1.2517, "step": 522 }, { "epoch": 0.42516004471090335, "grad_norm": 2.566361904144287, "learning_rate": 1.2868966656647522e-05, "loss": 1.3295, "step": 523 }, { "epoch": 0.42597297022660296, "grad_norm": 2.7477164268493652, "learning_rate": 1.2843730209197203e-05, "loss": 1.3067, "step": 524 }, { "epoch": 0.42678589574230263, "grad_norm": 3.0560967922210693, "learning_rate": 1.2818474041742885e-05, "loss": 1.2951, "step": 525 }, { "epoch": 0.42759882125800225, "grad_norm": 2.9634625911712646, "learning_rate": 1.2793198329424858e-05, "loss": 1.268, "step": 526 }, { "epoch": 0.42841174677370186, "grad_norm": 2.8108301162719727, "learning_rate": 1.2767903247518945e-05, "loss": 1.3319, "step": 527 }, { "epoch": 0.4292246722894015, "grad_norm": 3.85799241065979, "learning_rate": 1.2742588971435276e-05, "loss": 1.3764, "step": 528 }, { "epoch": 0.4300375978051011, "grad_norm": 2.564434766769409, "learning_rate": 1.2717255676717106e-05, "loss": 1.2854, "step": 529 }, { "epoch": 0.4308505233208007, "grad_norm": 5.098544597625732, "learning_rate": 1.2691903539039563e-05, "loss": 1.3143, "step": 530 }, { "epoch": 0.4316634488365004, "grad_norm": 7.195343017578125, "learning_rate": 1.2666532734208437e-05, "loss": 1.3026, "step": 531 }, { "epoch": 0.4324763743522, "grad_norm": 2.743298053741455, "learning_rate": 1.264114343815898e-05, "loss": 1.3124, "step": 532 }, { "epoch": 0.4332892998678996, "grad_norm": 3.183859348297119, "learning_rate": 1.2615735826954664e-05, "loss": 1.3132, "step": 533 }, { "epoch": 0.4341022253835992, "grad_norm": 7.095142364501953, "learning_rate": 1.2590310076785974e-05, "loss": 1.2599, "step": 534 }, { "epoch": 0.43491515089929883, "grad_norm": 2.91894268989563, "learning_rate": 1.256486636396917e-05, "loss": 1.3251, "step": 535 }, { "epoch": 0.4357280764149985, "grad_norm": 2.931509494781494, "learning_rate": 1.2539404864945087e-05, "loss": 1.3347, "step": 536 }, { "epoch": 0.4365410019306981, "grad_norm": 2.4552268981933594, "learning_rate": 1.2513925756277894e-05, "loss": 1.3469, "step": 537 }, { "epoch": 0.43735392744639773, "grad_norm": 2.846196174621582, "learning_rate": 1.2488429214653871e-05, "loss": 1.2654, "step": 538 }, { "epoch": 0.43816685296209734, "grad_norm": 3.494403600692749, "learning_rate": 1.24629154168802e-05, "loss": 1.2688, "step": 539 }, { "epoch": 0.43897977847779696, "grad_norm": 3.00067138671875, "learning_rate": 1.2437384539883715e-05, "loss": 1.2865, "step": 540 }, { "epoch": 0.43979270399349657, "grad_norm": 3.0412096977233887, "learning_rate": 1.2411836760709686e-05, "loss": 1.269, "step": 541 }, { "epoch": 0.44060562950919624, "grad_norm": 2.3580715656280518, "learning_rate": 1.2386272256520606e-05, "loss": 1.2752, "step": 542 }, { "epoch": 0.44141855502489585, "grad_norm": 9.030720710754395, "learning_rate": 1.2360691204594937e-05, "loss": 1.3074, "step": 543 }, { "epoch": 0.44223148054059547, "grad_norm": 3.970172882080078, "learning_rate": 1.2335093782325889e-05, "loss": 1.3117, "step": 544 }, { "epoch": 0.4430444060562951, "grad_norm": 2.8179943561553955, "learning_rate": 1.2309480167220203e-05, "loss": 1.3196, "step": 545 }, { "epoch": 0.4438573315719947, "grad_norm": 2.9376232624053955, "learning_rate": 1.2283850536896907e-05, "loss": 1.2614, "step": 546 }, { "epoch": 0.44467025708769436, "grad_norm": 2.811709403991699, "learning_rate": 1.2258205069086082e-05, "loss": 1.2666, "step": 547 }, { "epoch": 0.445483182603394, "grad_norm": 3.060638427734375, "learning_rate": 1.2232543941627641e-05, "loss": 1.2891, "step": 548 }, { "epoch": 0.4462961081190936, "grad_norm": 2.581530809402466, "learning_rate": 1.2206867332470091e-05, "loss": 1.2875, "step": 549 }, { "epoch": 0.4471090336347932, "grad_norm": 2.588129997253418, "learning_rate": 1.2181175419669293e-05, "loss": 1.2964, "step": 550 }, { "epoch": 0.4479219591504928, "grad_norm": 3.0943429470062256, "learning_rate": 1.215546838138723e-05, "loss": 1.29, "step": 551 }, { "epoch": 0.44873488466619244, "grad_norm": 2.960190534591675, "learning_rate": 1.212974639589078e-05, "loss": 1.2812, "step": 552 }, { "epoch": 0.4495478101818921, "grad_norm": 2.7364282608032227, "learning_rate": 1.2104009641550472e-05, "loss": 1.2783, "step": 553 }, { "epoch": 0.4503607356975917, "grad_norm": 2.509277105331421, "learning_rate": 1.2078258296839245e-05, "loss": 1.2859, "step": 554 }, { "epoch": 0.45117366121329133, "grad_norm": 2.769371747970581, "learning_rate": 1.2052492540331218e-05, "loss": 1.2866, "step": 555 }, { "epoch": 0.45198658672899095, "grad_norm": 3.057968854904175, "learning_rate": 1.2026712550700457e-05, "loss": 1.3051, "step": 556 }, { "epoch": 0.45279951224469056, "grad_norm": 3.4182374477386475, "learning_rate": 1.200091850671972e-05, "loss": 1.3266, "step": 557 }, { "epoch": 0.45361243776039023, "grad_norm": 2.6871426105499268, "learning_rate": 1.1975110587259222e-05, "loss": 1.2596, "step": 558 }, { "epoch": 0.45442536327608984, "grad_norm": 3.463675022125244, "learning_rate": 1.1949288971285411e-05, "loss": 1.2767, "step": 559 }, { "epoch": 0.45523828879178946, "grad_norm": 2.8260090351104736, "learning_rate": 1.1923453837859706e-05, "loss": 1.2734, "step": 560 }, { "epoch": 0.4560512143074891, "grad_norm": 2.6161341667175293, "learning_rate": 1.1897605366137264e-05, "loss": 1.2377, "step": 561 }, { "epoch": 0.4568641398231887, "grad_norm": 2.847534418106079, "learning_rate": 1.1871743735365735e-05, "loss": 1.3128, "step": 562 }, { "epoch": 0.4576770653388883, "grad_norm": 3.116063117980957, "learning_rate": 1.1845869124884027e-05, "loss": 1.3114, "step": 563 }, { "epoch": 0.45848999085458797, "grad_norm": 3.2849061489105225, "learning_rate": 1.1819981714121054e-05, "loss": 1.2761, "step": 564 }, { "epoch": 0.4593029163702876, "grad_norm": 2.484531879425049, "learning_rate": 1.1794081682594491e-05, "loss": 1.2978, "step": 565 }, { "epoch": 0.4601158418859872, "grad_norm": 3.111940383911133, "learning_rate": 1.176816920990954e-05, "loss": 1.2928, "step": 566 }, { "epoch": 0.4609287674016868, "grad_norm": 3.063422918319702, "learning_rate": 1.174224447575767e-05, "loss": 1.3137, "step": 567 }, { "epoch": 0.4617416929173864, "grad_norm": 4.031757831573486, "learning_rate": 1.171630765991538e-05, "loss": 1.2986, "step": 568 }, { "epoch": 0.4625546184330861, "grad_norm": 2.650336980819702, "learning_rate": 1.169035894224295e-05, "loss": 1.3328, "step": 569 }, { "epoch": 0.4633675439487857, "grad_norm": 2.574526309967041, "learning_rate": 1.1664398502683194e-05, "loss": 1.3078, "step": 570 }, { "epoch": 0.4641804694644853, "grad_norm": 2.3674449920654297, "learning_rate": 1.1638426521260211e-05, "loss": 1.2819, "step": 571 }, { "epoch": 0.46499339498018494, "grad_norm": 2.8870980739593506, "learning_rate": 1.1612443178078138e-05, "loss": 1.2661, "step": 572 }, { "epoch": 0.46580632049588455, "grad_norm": 2.4961047172546387, "learning_rate": 1.1586448653319908e-05, "loss": 1.3042, "step": 573 }, { "epoch": 0.46661924601158417, "grad_norm": 2.6196508407592773, "learning_rate": 1.156044312724598e-05, "loss": 1.2306, "step": 574 }, { "epoch": 0.46743217152728384, "grad_norm": 2.7249913215637207, "learning_rate": 1.153442678019311e-05, "loss": 1.3095, "step": 575 }, { "epoch": 0.46824509704298345, "grad_norm": 2.9108643531799316, "learning_rate": 1.1508399792573095e-05, "loss": 1.2513, "step": 576 }, { "epoch": 0.46905802255868306, "grad_norm": 2.7690494060516357, "learning_rate": 1.1482362344871514e-05, "loss": 1.3445, "step": 577 }, { "epoch": 0.4698709480743827, "grad_norm": 3.629122734069824, "learning_rate": 1.1456314617646482e-05, "loss": 1.2616, "step": 578 }, { "epoch": 0.4706838735900823, "grad_norm": 2.6831417083740234, "learning_rate": 1.1430256791527406e-05, "loss": 1.2786, "step": 579 }, { "epoch": 0.4714967991057819, "grad_norm": 2.5316171646118164, "learning_rate": 1.1404189047213716e-05, "loss": 1.3195, "step": 580 }, { "epoch": 0.4723097246214816, "grad_norm": 4.602120399475098, "learning_rate": 1.137811156547362e-05, "loss": 1.2378, "step": 581 }, { "epoch": 0.4731226501371812, "grad_norm": 2.5073766708374023, "learning_rate": 1.1352024527142855e-05, "loss": 1.2426, "step": 582 }, { "epoch": 0.4739355756528808, "grad_norm": 2.5561444759368896, "learning_rate": 1.1325928113123431e-05, "loss": 1.318, "step": 583 }, { "epoch": 0.4747485011685804, "grad_norm": 2.8386447429656982, "learning_rate": 1.129982250438237e-05, "loss": 1.2529, "step": 584 }, { "epoch": 0.47556142668428003, "grad_norm": 2.3654778003692627, "learning_rate": 1.1273707881950445e-05, "loss": 1.2822, "step": 585 }, { "epoch": 0.4763743521999797, "grad_norm": 3.125446081161499, "learning_rate": 1.1247584426920962e-05, "loss": 1.3588, "step": 586 }, { "epoch": 0.4771872777156793, "grad_norm": 3.600827217102051, "learning_rate": 1.1221452320448449e-05, "loss": 1.3023, "step": 587 }, { "epoch": 0.47800020323137893, "grad_norm": 3.858783483505249, "learning_rate": 1.1195311743747445e-05, "loss": 1.2784, "step": 588 }, { "epoch": 0.47881312874707854, "grad_norm": 2.841679334640503, "learning_rate": 1.116916287809122e-05, "loss": 1.3084, "step": 589 }, { "epoch": 0.47962605426277816, "grad_norm": 2.9722323417663574, "learning_rate": 1.1143005904810527e-05, "loss": 1.2983, "step": 590 }, { "epoch": 0.48043897977847777, "grad_norm": 2.560037136077881, "learning_rate": 1.1116841005292339e-05, "loss": 1.3175, "step": 591 }, { "epoch": 0.48125190529417744, "grad_norm": 3.1770455837249756, "learning_rate": 1.1090668360978589e-05, "loss": 1.2603, "step": 592 }, { "epoch": 0.48206483080987705, "grad_norm": 2.4485607147216797, "learning_rate": 1.106448815336493e-05, "loss": 1.2792, "step": 593 }, { "epoch": 0.48287775632557667, "grad_norm": 3.7001748085021973, "learning_rate": 1.1038300563999455e-05, "loss": 1.2846, "step": 594 }, { "epoch": 0.4836906818412763, "grad_norm": 2.6942710876464844, "learning_rate": 1.1012105774481446e-05, "loss": 1.2864, "step": 595 }, { "epoch": 0.4845036073569759, "grad_norm": 2.5104377269744873, "learning_rate": 1.0985903966460115e-05, "loss": 1.256, "step": 596 }, { "epoch": 0.48531653287267557, "grad_norm": 2.4864704608917236, "learning_rate": 1.0959695321633346e-05, "loss": 1.2838, "step": 597 }, { "epoch": 0.4861294583883752, "grad_norm": 3.2645606994628906, "learning_rate": 1.0933480021746432e-05, "loss": 1.2966, "step": 598 }, { "epoch": 0.4869423839040748, "grad_norm": 28.041383743286133, "learning_rate": 1.0907258248590816e-05, "loss": 1.2513, "step": 599 }, { "epoch": 0.4877553094197744, "grad_norm": 2.736785888671875, "learning_rate": 1.0881030184002827e-05, "loss": 1.3217, "step": 600 }, { "epoch": 0.488568234935474, "grad_norm": 4.294330596923828, "learning_rate": 1.0854796009862434e-05, "loss": 1.3007, "step": 601 }, { "epoch": 0.48938116045117364, "grad_norm": 2.629371404647827, "learning_rate": 1.0828555908091958e-05, "loss": 1.2884, "step": 602 }, { "epoch": 0.4901940859668733, "grad_norm": 3.166304588317871, "learning_rate": 1.0802310060654832e-05, "loss": 1.3127, "step": 603 }, { "epoch": 0.4910070114825729, "grad_norm": 2.5344200134277344, "learning_rate": 1.0776058649554336e-05, "loss": 1.249, "step": 604 }, { "epoch": 0.49181993699827253, "grad_norm": 3.2902913093566895, "learning_rate": 1.0749801856832325e-05, "loss": 1.2341, "step": 605 }, { "epoch": 0.49263286251397215, "grad_norm": 2.5863964557647705, "learning_rate": 1.0723539864567983e-05, "loss": 1.3534, "step": 606 }, { "epoch": 0.49344578802967176, "grad_norm": 3.1407294273376465, "learning_rate": 1.0697272854876537e-05, "loss": 1.2452, "step": 607 }, { "epoch": 0.49425871354537143, "grad_norm": 2.339702844619751, "learning_rate": 1.0671001009908015e-05, "loss": 1.2597, "step": 608 }, { "epoch": 0.49507163906107104, "grad_norm": 2.5861027240753174, "learning_rate": 1.0644724511845976e-05, "loss": 1.304, "step": 609 }, { "epoch": 0.49588456457677066, "grad_norm": 2.6124143600463867, "learning_rate": 1.0618443542906251e-05, "loss": 1.2333, "step": 610 }, { "epoch": 0.4966974900924703, "grad_norm": 2.53468918800354, "learning_rate": 1.059215828533566e-05, "loss": 1.2587, "step": 611 }, { "epoch": 0.4975104156081699, "grad_norm": 5.205654621124268, "learning_rate": 1.0565868921410776e-05, "loss": 1.2758, "step": 612 }, { "epoch": 0.4983233411238695, "grad_norm": 3.3307433128356934, "learning_rate": 1.0539575633436645e-05, "loss": 1.3197, "step": 613 }, { "epoch": 0.49913626663956917, "grad_norm": 2.4654664993286133, "learning_rate": 1.0513278603745523e-05, "loss": 1.2733, "step": 614 }, { "epoch": 0.4999491921552688, "grad_norm": 2.5150272846221924, "learning_rate": 1.0486978014695606e-05, "loss": 1.2841, "step": 615 }, { "epoch": 0.5007621176709683, "grad_norm": 2.660186767578125, "learning_rate": 1.0460674048669783e-05, "loss": 1.3007, "step": 616 }, { "epoch": 0.501575043186668, "grad_norm": 2.7415716648101807, "learning_rate": 1.0434366888074363e-05, "loss": 1.2974, "step": 617 }, { "epoch": 0.5023879687023677, "grad_norm": 2.479142427444458, "learning_rate": 1.0408056715337797e-05, "loss": 1.301, "step": 618 }, { "epoch": 0.5032008942180672, "grad_norm": 2.4590210914611816, "learning_rate": 1.0381743712909424e-05, "loss": 1.2253, "step": 619 }, { "epoch": 0.5040138197337669, "grad_norm": 2.4704954624176025, "learning_rate": 1.0355428063258224e-05, "loss": 1.1927, "step": 620 }, { "epoch": 0.5048267452494665, "grad_norm": 3.5037641525268555, "learning_rate": 1.0329109948871512e-05, "loss": 1.2727, "step": 621 }, { "epoch": 0.5056396707651661, "grad_norm": 2.6537327766418457, "learning_rate": 1.0302789552253702e-05, "loss": 1.2295, "step": 622 }, { "epoch": 0.5064525962808658, "grad_norm": 3.4443886280059814, "learning_rate": 1.0276467055925044e-05, "loss": 1.2403, "step": 623 }, { "epoch": 0.5072655217965654, "grad_norm": 4.377493858337402, "learning_rate": 1.0250142642420335e-05, "loss": 1.2667, "step": 624 }, { "epoch": 0.508078447312265, "grad_norm": 2.712472677230835, "learning_rate": 1.0223816494287675e-05, "loss": 1.3323, "step": 625 }, { "epoch": 0.5088913728279646, "grad_norm": 2.922093152999878, "learning_rate": 1.0197488794087188e-05, "loss": 1.2713, "step": 626 }, { "epoch": 0.5097042983436643, "grad_norm": 11.951809883117676, "learning_rate": 1.0171159724389766e-05, "loss": 1.2997, "step": 627 }, { "epoch": 0.5105172238593639, "grad_norm": 2.5700554847717285, "learning_rate": 1.0144829467775794e-05, "loss": 1.261, "step": 628 }, { "epoch": 0.5113301493750635, "grad_norm": 2.6800413131713867, "learning_rate": 1.0118498206833886e-05, "loss": 1.3292, "step": 629 }, { "epoch": 0.5121430748907632, "grad_norm": 4.24453592300415, "learning_rate": 1.0092166124159628e-05, "loss": 1.3281, "step": 630 }, { "epoch": 0.5129560004064627, "grad_norm": 2.7513749599456787, "learning_rate": 1.0065833402354302e-05, "loss": 1.2944, "step": 631 }, { "epoch": 0.5137689259221624, "grad_norm": 2.610588788986206, "learning_rate": 1.003950022402361e-05, "loss": 1.3129, "step": 632 }, { "epoch": 0.5145818514378621, "grad_norm": 2.949564218521118, "learning_rate": 1.0013166771776441e-05, "loss": 1.2961, "step": 633 }, { "epoch": 0.5153947769535616, "grad_norm": 2.5617198944091797, "learning_rate": 9.986833228223562e-06, "loss": 1.2898, "step": 634 }, { "epoch": 0.5162077024692613, "grad_norm": 2.779733896255493, "learning_rate": 9.96049977597639e-06, "loss": 1.2988, "step": 635 }, { "epoch": 0.5170206279849608, "grad_norm": 2.8505136966705322, "learning_rate": 9.934166597645703e-06, "loss": 1.2652, "step": 636 }, { "epoch": 0.5178335535006605, "grad_norm": 2.847262144088745, "learning_rate": 9.907833875840374e-06, "loss": 1.3076, "step": 637 }, { "epoch": 0.5186464790163601, "grad_norm": 4.957255840301514, "learning_rate": 9.881501793166117e-06, "loss": 1.214, "step": 638 }, { "epoch": 0.5194594045320597, "grad_norm": 2.7829556465148926, "learning_rate": 9.85517053222421e-06, "loss": 1.2379, "step": 639 }, { "epoch": 0.5202723300477594, "grad_norm": 2.7060935497283936, "learning_rate": 9.82884027561024e-06, "loss": 1.3016, "step": 640 }, { "epoch": 0.521085255563459, "grad_norm": 6.336554527282715, "learning_rate": 9.802511205912815e-06, "loss": 1.269, "step": 641 }, { "epoch": 0.5218981810791586, "grad_norm": 3.0378448963165283, "learning_rate": 9.776183505712327e-06, "loss": 1.317, "step": 642 }, { "epoch": 0.5227111065948582, "grad_norm": 5.806065082550049, "learning_rate": 9.749857357579667e-06, "loss": 1.3165, "step": 643 }, { "epoch": 0.5235240321105579, "grad_norm": 2.7738869190216064, "learning_rate": 9.723532944074961e-06, "loss": 1.2835, "step": 644 }, { "epoch": 0.5243369576262575, "grad_norm": 2.6603453159332275, "learning_rate": 9.6972104477463e-06, "loss": 1.2673, "step": 645 }, { "epoch": 0.5251498831419571, "grad_norm": 2.9316189289093018, "learning_rate": 9.670890051128493e-06, "loss": 1.249, "step": 646 }, { "epoch": 0.5259628086576568, "grad_norm": 2.8541407585144043, "learning_rate": 9.644571936741778e-06, "loss": 1.2835, "step": 647 }, { "epoch": 0.5267757341733563, "grad_norm": 2.6935575008392334, "learning_rate": 9.618256287090576e-06, "loss": 1.2859, "step": 648 }, { "epoch": 0.527588659689056, "grad_norm": 3.057039260864258, "learning_rate": 9.591943284662206e-06, "loss": 1.2538, "step": 649 }, { "epoch": 0.5284015852047557, "grad_norm": 3.2430379390716553, "learning_rate": 9.56563311192564e-06, "loss": 1.294, "step": 650 }, { "epoch": 0.5292145107204552, "grad_norm": 2.378072500228882, "learning_rate": 9.53932595133022e-06, "loss": 1.2793, "step": 651 }, { "epoch": 0.5300274362361549, "grad_norm": 3.2185440063476562, "learning_rate": 9.513021985304399e-06, "loss": 1.2868, "step": 652 }, { "epoch": 0.5308403617518545, "grad_norm": 3.272632122039795, "learning_rate": 9.486721396254484e-06, "loss": 1.2128, "step": 653 }, { "epoch": 0.5316532872675541, "grad_norm": 3.163884401321411, "learning_rate": 9.460424366563355e-06, "loss": 1.2962, "step": 654 }, { "epoch": 0.5324662127832538, "grad_norm": 3.096857786178589, "learning_rate": 9.434131078589224e-06, "loss": 1.2575, "step": 655 }, { "epoch": 0.5332791382989533, "grad_norm": 2.711069107055664, "learning_rate": 9.407841714664343e-06, "loss": 1.2969, "step": 656 }, { "epoch": 0.534092063814653, "grad_norm": 4.4655866622924805, "learning_rate": 9.381556457093752e-06, "loss": 1.2229, "step": 657 }, { "epoch": 0.5349049893303526, "grad_norm": 2.7365305423736572, "learning_rate": 9.355275488154025e-06, "loss": 1.285, "step": 658 }, { "epoch": 0.5357179148460522, "grad_norm": 3.4264895915985107, "learning_rate": 9.32899899009199e-06, "loss": 1.3222, "step": 659 }, { "epoch": 0.5365308403617518, "grad_norm": 2.9572296142578125, "learning_rate": 9.30272714512347e-06, "loss": 1.2771, "step": 660 }, { "epoch": 0.5373437658774515, "grad_norm": 3.124464988708496, "learning_rate": 9.276460135432019e-06, "loss": 1.2362, "step": 661 }, { "epoch": 0.5381566913931511, "grad_norm": 3.484861373901367, "learning_rate": 9.250198143167675e-06, "loss": 1.2624, "step": 662 }, { "epoch": 0.5389696169088507, "grad_norm": 3.191455602645874, "learning_rate": 9.223941350445666e-06, "loss": 1.3271, "step": 663 }, { "epoch": 0.5397825424245504, "grad_norm": 3.055478572845459, "learning_rate": 9.19768993934517e-06, "loss": 1.2476, "step": 664 }, { "epoch": 0.5405954679402499, "grad_norm": 2.8661985397338867, "learning_rate": 9.171444091908046e-06, "loss": 1.2575, "step": 665 }, { "epoch": 0.5414083934559496, "grad_norm": 3.042300224304199, "learning_rate": 9.145203990137571e-06, "loss": 1.2472, "step": 666 }, { "epoch": 0.5422213189716493, "grad_norm": 3.324767827987671, "learning_rate": 9.118969815997174e-06, "loss": 1.2608, "step": 667 }, { "epoch": 0.5430342444873488, "grad_norm": 2.8374948501586914, "learning_rate": 9.092741751409186e-06, "loss": 1.2865, "step": 668 }, { "epoch": 0.5438471700030485, "grad_norm": 3.3593552112579346, "learning_rate": 9.06651997825357e-06, "loss": 1.2746, "step": 669 }, { "epoch": 0.544660095518748, "grad_norm": 3.2432382106781006, "learning_rate": 9.040304678366658e-06, "loss": 1.2864, "step": 670 }, { "epoch": 0.5454730210344477, "grad_norm": 2.890409469604492, "learning_rate": 9.014096033539889e-06, "loss": 1.2685, "step": 671 }, { "epoch": 0.5462859465501474, "grad_norm": 3.0769150257110596, "learning_rate": 8.987894225518556e-06, "loss": 1.2701, "step": 672 }, { "epoch": 0.547098872065847, "grad_norm": 3.453287363052368, "learning_rate": 8.961699436000548e-06, "loss": 1.2218, "step": 673 }, { "epoch": 0.5479117975815466, "grad_norm": 3.1950011253356934, "learning_rate": 8.93551184663507e-06, "loss": 1.2267, "step": 674 }, { "epoch": 0.5487247230972462, "grad_norm": 3.445006847381592, "learning_rate": 8.909331639021414e-06, "loss": 1.283, "step": 675 }, { "epoch": 0.5495376486129459, "grad_norm": 2.5453741550445557, "learning_rate": 8.883158994707666e-06, "loss": 1.3102, "step": 676 }, { "epoch": 0.5503505741286454, "grad_norm": 4.167499542236328, "learning_rate": 8.856994095189477e-06, "loss": 1.2881, "step": 677 }, { "epoch": 0.5511634996443451, "grad_norm": 2.6888363361358643, "learning_rate": 8.830837121908783e-06, "loss": 1.2332, "step": 678 }, { "epoch": 0.5519764251600447, "grad_norm": 2.9484667778015137, "learning_rate": 8.804688256252557e-06, "loss": 1.2676, "step": 679 }, { "epoch": 0.5527893506757443, "grad_norm": 2.5477519035339355, "learning_rate": 8.778547679551555e-06, "loss": 1.2956, "step": 680 }, { "epoch": 0.553602276191444, "grad_norm": 2.3307385444641113, "learning_rate": 8.75241557307904e-06, "loss": 1.3021, "step": 681 }, { "epoch": 0.5544152017071435, "grad_norm": 3.1104202270507812, "learning_rate": 8.726292118049555e-06, "loss": 1.2861, "step": 682 }, { "epoch": 0.5552281272228432, "grad_norm": 3.2731287479400635, "learning_rate": 8.700177495617635e-06, "loss": 1.33, "step": 683 }, { "epoch": 0.5560410527385429, "grad_norm": 2.923478364944458, "learning_rate": 8.674071886876572e-06, "loss": 1.2946, "step": 684 }, { "epoch": 0.5568539782542424, "grad_norm": 3.1030538082122803, "learning_rate": 8.647975472857148e-06, "loss": 1.2481, "step": 685 }, { "epoch": 0.5576669037699421, "grad_norm": 2.6904759407043457, "learning_rate": 8.621888434526382e-06, "loss": 1.2637, "step": 686 }, { "epoch": 0.5584798292856417, "grad_norm": 3.6781442165374756, "learning_rate": 8.595810952786289e-06, "loss": 1.2875, "step": 687 }, { "epoch": 0.5592927548013413, "grad_norm": 4.897818565368652, "learning_rate": 8.569743208472594e-06, "loss": 1.2804, "step": 688 }, { "epoch": 0.560105680317041, "grad_norm": 2.9090828895568848, "learning_rate": 8.543685382353518e-06, "loss": 1.2817, "step": 689 }, { "epoch": 0.5609186058327406, "grad_norm": 3.3284378051757812, "learning_rate": 8.51763765512849e-06, "loss": 1.2928, "step": 690 }, { "epoch": 0.5617315313484402, "grad_norm": 3.440209150314331, "learning_rate": 8.491600207426907e-06, "loss": 1.2667, "step": 691 }, { "epoch": 0.5625444568641398, "grad_norm": 3.1297762393951416, "learning_rate": 8.465573219806893e-06, "loss": 1.2752, "step": 692 }, { "epoch": 0.5633573823798395, "grad_norm": 3.460277795791626, "learning_rate": 8.439556872754025e-06, "loss": 1.2611, "step": 693 }, { "epoch": 0.5641703078955391, "grad_norm": 2.6390557289123535, "learning_rate": 8.413551346680095e-06, "loss": 1.2339, "step": 694 }, { "epoch": 0.5649832334112387, "grad_norm": 2.365945339202881, "learning_rate": 8.38755682192186e-06, "loss": 1.2333, "step": 695 }, { "epoch": 0.5657961589269384, "grad_norm": 3.140129804611206, "learning_rate": 8.36157347873979e-06, "loss": 1.2614, "step": 696 }, { "epoch": 0.5666090844426379, "grad_norm": 4.027166366577148, "learning_rate": 8.335601497316809e-06, "loss": 1.263, "step": 697 }, { "epoch": 0.5674220099583376, "grad_norm": 2.6872942447662354, "learning_rate": 8.309641057757052e-06, "loss": 1.2479, "step": 698 }, { "epoch": 0.5682349354740371, "grad_norm": 2.575493574142456, "learning_rate": 8.283692340084623e-06, "loss": 1.2818, "step": 699 }, { "epoch": 0.5690478609897368, "grad_norm": 2.6429176330566406, "learning_rate": 8.257755524242333e-06, "loss": 1.2921, "step": 700 }, { "epoch": 0.5698607865054365, "grad_norm": 4.695654392242432, "learning_rate": 8.231830790090461e-06, "loss": 1.2046, "step": 701 }, { "epoch": 0.570673712021136, "grad_norm": 2.4642715454101562, "learning_rate": 8.205918317405508e-06, "loss": 1.3013, "step": 702 }, { "epoch": 0.5714866375368357, "grad_norm": 2.567474842071533, "learning_rate": 8.18001828587895e-06, "loss": 1.3458, "step": 703 }, { "epoch": 0.5722995630525353, "grad_norm": 2.934668779373169, "learning_rate": 8.154130875115978e-06, "loss": 1.2804, "step": 704 }, { "epoch": 0.5731124885682349, "grad_norm": 2.669285297393799, "learning_rate": 8.12825626463427e-06, "loss": 1.2329, "step": 705 }, { "epoch": 0.5739254140839346, "grad_norm": 2.7390220165252686, "learning_rate": 8.102394633862743e-06, "loss": 1.2177, "step": 706 }, { "epoch": 0.5747383395996342, "grad_norm": 3.19964861869812, "learning_rate": 8.0765461621403e-06, "loss": 1.2625, "step": 707 }, { "epoch": 0.5755512651153338, "grad_norm": 2.753469705581665, "learning_rate": 8.050711028714589e-06, "loss": 1.2357, "step": 708 }, { "epoch": 0.5763641906310334, "grad_norm": 3.3288702964782715, "learning_rate": 8.02488941274078e-06, "loss": 1.217, "step": 709 }, { "epoch": 0.5771771161467331, "grad_norm": 2.808100700378418, "learning_rate": 7.999081493280283e-06, "loss": 1.3156, "step": 710 }, { "epoch": 0.5779900416624327, "grad_norm": 2.8736870288848877, "learning_rate": 7.973287449299545e-06, "loss": 1.3122, "step": 711 }, { "epoch": 0.5788029671781323, "grad_norm": 6.863023281097412, "learning_rate": 7.947507459668784e-06, "loss": 1.2218, "step": 712 }, { "epoch": 0.579615892693832, "grad_norm": 4.454842567443848, "learning_rate": 7.921741703160758e-06, "loss": 1.1918, "step": 713 }, { "epoch": 0.5804288182095315, "grad_norm": 2.4465959072113037, "learning_rate": 7.895990358449533e-06, "loss": 1.2705, "step": 714 }, { "epoch": 0.5812417437252312, "grad_norm": 3.3625428676605225, "learning_rate": 7.87025360410922e-06, "loss": 1.2644, "step": 715 }, { "epoch": 0.5820546692409307, "grad_norm": 2.846947431564331, "learning_rate": 7.844531618612772e-06, "loss": 1.2612, "step": 716 }, { "epoch": 0.5828675947566304, "grad_norm": 3.332118034362793, "learning_rate": 7.81882458033071e-06, "loss": 1.2597, "step": 717 }, { "epoch": 0.5836805202723301, "grad_norm": 2.646106719970703, "learning_rate": 7.79313266752991e-06, "loss": 1.2613, "step": 718 }, { "epoch": 0.5844934457880296, "grad_norm": 2.8592135906219482, "learning_rate": 7.767456058372362e-06, "loss": 1.282, "step": 719 }, { "epoch": 0.5853063713037293, "grad_norm": 2.748481035232544, "learning_rate": 7.741794930913922e-06, "loss": 1.2869, "step": 720 }, { "epoch": 0.5861192968194289, "grad_norm": 2.8134074211120605, "learning_rate": 7.7161494631031e-06, "loss": 1.3079, "step": 721 }, { "epoch": 0.5869322223351285, "grad_norm": 3.059119939804077, "learning_rate": 7.690519832779799e-06, "loss": 1.2705, "step": 722 }, { "epoch": 0.5877451478508282, "grad_norm": 2.6439130306243896, "learning_rate": 7.664906217674115e-06, "loss": 1.2413, "step": 723 }, { "epoch": 0.5885580733665278, "grad_norm": 2.812056303024292, "learning_rate": 7.639308795405066e-06, "loss": 1.2543, "step": 724 }, { "epoch": 0.5893709988822274, "grad_norm": 3.2603330612182617, "learning_rate": 7.613727743479395e-06, "loss": 1.2442, "step": 725 }, { "epoch": 0.590183924397927, "grad_norm": 2.544433116912842, "learning_rate": 7.588163239290316e-06, "loss": 1.3034, "step": 726 }, { "epoch": 0.5909968499136267, "grad_norm": 4.0246262550354, "learning_rate": 7.562615460116289e-06, "loss": 1.3188, "step": 727 }, { "epoch": 0.5918097754293263, "grad_norm": 4.249239444732666, "learning_rate": 7.537084583119802e-06, "loss": 1.3091, "step": 728 }, { "epoch": 0.5926227009450259, "grad_norm": 2.7686362266540527, "learning_rate": 7.511570785346129e-06, "loss": 1.2449, "step": 729 }, { "epoch": 0.5934356264607256, "grad_norm": 2.8529245853424072, "learning_rate": 7.486074243722109e-06, "loss": 1.2392, "step": 730 }, { "epoch": 0.5942485519764251, "grad_norm": 3.073486328125, "learning_rate": 7.460595135054916e-06, "loss": 1.2848, "step": 731 }, { "epoch": 0.5950614774921248, "grad_norm": 3.365366220474243, "learning_rate": 7.435133636030831e-06, "loss": 1.2912, "step": 732 }, { "epoch": 0.5958744030078245, "grad_norm": 2.4938106536865234, "learning_rate": 7.4096899232140295e-06, "loss": 1.2965, "step": 733 }, { "epoch": 0.596687328523524, "grad_norm": 2.9927473068237305, "learning_rate": 7.384264173045339e-06, "loss": 1.2748, "step": 734 }, { "epoch": 0.5975002540392237, "grad_norm": 7.3427205085754395, "learning_rate": 7.358856561841021e-06, "loss": 1.2457, "step": 735 }, { "epoch": 0.5983131795549232, "grad_norm": 3.274311065673828, "learning_rate": 7.333467265791563e-06, "loss": 1.2225, "step": 736 }, { "epoch": 0.5991261050706229, "grad_norm": 4.503856658935547, "learning_rate": 7.308096460960441e-06, "loss": 1.2603, "step": 737 }, { "epoch": 0.5999390305863225, "grad_norm": 3.6017913818359375, "learning_rate": 7.282744323282895e-06, "loss": 1.2278, "step": 738 }, { "epoch": 0.6007519561020221, "grad_norm": 3.0930585861206055, "learning_rate": 7.2574110285647244e-06, "loss": 1.2649, "step": 739 }, { "epoch": 0.6015648816177218, "grad_norm": 2.6793737411499023, "learning_rate": 7.232096752481061e-06, "loss": 1.215, "step": 740 }, { "epoch": 0.6023778071334214, "grad_norm": 3.0066819190979004, "learning_rate": 7.206801670575145e-06, "loss": 1.2953, "step": 741 }, { "epoch": 0.603190732649121, "grad_norm": 3.2586004734039307, "learning_rate": 7.181525958257116e-06, "loss": 1.1988, "step": 742 }, { "epoch": 0.6040036581648206, "grad_norm": 3.186267375946045, "learning_rate": 7.156269790802801e-06, "loss": 1.2425, "step": 743 }, { "epoch": 0.6048165836805203, "grad_norm": 3.919509172439575, "learning_rate": 7.131033343352483e-06, "loss": 1.3432, "step": 744 }, { "epoch": 0.6056295091962199, "grad_norm": 3.8313186168670654, "learning_rate": 7.105816790909699e-06, "loss": 1.2491, "step": 745 }, { "epoch": 0.6064424347119195, "grad_norm": 2.7689011096954346, "learning_rate": 7.080620308340024e-06, "loss": 1.2673, "step": 746 }, { "epoch": 0.6072553602276192, "grad_norm": 4.105691909790039, "learning_rate": 7.055444070369852e-06, "loss": 1.2688, "step": 747 }, { "epoch": 0.6080682857433187, "grad_norm": 3.336580276489258, "learning_rate": 7.0302882515852025e-06, "loss": 1.2613, "step": 748 }, { "epoch": 0.6088812112590184, "grad_norm": 3.7272021770477295, "learning_rate": 7.005153026430476e-06, "loss": 1.1882, "step": 749 }, { "epoch": 0.6096941367747181, "grad_norm": 4.220558166503906, "learning_rate": 6.980038569207291e-06, "loss": 1.1853, "step": 750 }, { "epoch": 0.6105070622904176, "grad_norm": 2.8943638801574707, "learning_rate": 6.954945054073228e-06, "loss": 1.2408, "step": 751 }, { "epoch": 0.6113199878061173, "grad_norm": 2.740449905395508, "learning_rate": 6.929872655040655e-06, "loss": 1.2233, "step": 752 }, { "epoch": 0.6121329133218169, "grad_norm": 3.1293320655822754, "learning_rate": 6.904821545975507e-06, "loss": 1.2362, "step": 753 }, { "epoch": 0.6129458388375165, "grad_norm": 2.9130334854125977, "learning_rate": 6.879791900596077e-06, "loss": 1.2525, "step": 754 }, { "epoch": 0.6137587643532162, "grad_norm": 2.6800663471221924, "learning_rate": 6.854783892471823e-06, "loss": 1.2811, "step": 755 }, { "epoch": 0.6145716898689157, "grad_norm": 2.7140908241271973, "learning_rate": 6.829797695022163e-06, "loss": 1.2693, "step": 756 }, { "epoch": 0.6153846153846154, "grad_norm": 2.687870740890503, "learning_rate": 6.804833481515256e-06, "loss": 1.2124, "step": 757 }, { "epoch": 0.616197540900315, "grad_norm": 3.170487880706787, "learning_rate": 6.7798914250668154e-06, "loss": 1.2373, "step": 758 }, { "epoch": 0.6170104664160146, "grad_norm": 2.6142961978912354, "learning_rate": 6.7549716986389146e-06, "loss": 1.2527, "step": 759 }, { "epoch": 0.6178233919317142, "grad_norm": 3.4092085361480713, "learning_rate": 6.730074475038766e-06, "loss": 1.2401, "step": 760 }, { "epoch": 0.6186363174474139, "grad_norm": 3.256838083267212, "learning_rate": 6.7051999269175405e-06, "loss": 1.1863, "step": 761 }, { "epoch": 0.6194492429631135, "grad_norm": 2.8312947750091553, "learning_rate": 6.680348226769162e-06, "loss": 1.241, "step": 762 }, { "epoch": 0.6202621684788131, "grad_norm": 2.799750804901123, "learning_rate": 6.655519546929121e-06, "loss": 1.2601, "step": 763 }, { "epoch": 0.6210750939945128, "grad_norm": 3.188913106918335, "learning_rate": 6.630714059573267e-06, "loss": 1.2719, "step": 764 }, { "epoch": 0.6218880195102123, "grad_norm": 5.547321796417236, "learning_rate": 6.6059319367166165e-06, "loss": 1.2307, "step": 765 }, { "epoch": 0.622700945025912, "grad_norm": 3.2380361557006836, "learning_rate": 6.581173350212169e-06, "loss": 1.2125, "step": 766 }, { "epoch": 0.6235138705416117, "grad_norm": 2.61883282661438, "learning_rate": 6.55643847174971e-06, "loss": 1.2556, "step": 767 }, { "epoch": 0.6243267960573112, "grad_norm": 3.0079920291900635, "learning_rate": 6.531727472854617e-06, "loss": 1.2761, "step": 768 }, { "epoch": 0.6251397215730109, "grad_norm": 3.995910882949829, "learning_rate": 6.507040524886672e-06, "loss": 1.302, "step": 769 }, { "epoch": 0.6259526470887105, "grad_norm": 2.7787578105926514, "learning_rate": 6.482377799038882e-06, "loss": 1.2249, "step": 770 }, { "epoch": 0.6267655726044101, "grad_norm": 3.6458895206451416, "learning_rate": 6.45773946633628e-06, "loss": 1.2833, "step": 771 }, { "epoch": 0.6275784981201098, "grad_norm": 2.9308435916900635, "learning_rate": 6.4331256976347434e-06, "loss": 1.309, "step": 772 }, { "epoch": 0.6283914236358094, "grad_norm": 3.7917234897613525, "learning_rate": 6.408536663619803e-06, "loss": 1.2996, "step": 773 }, { "epoch": 0.629204349151509, "grad_norm": 8.85531234741211, "learning_rate": 6.383972534805478e-06, "loss": 1.2499, "step": 774 }, { "epoch": 0.6300172746672086, "grad_norm": 4.16661262512207, "learning_rate": 6.359433481533074e-06, "loss": 1.1928, "step": 775 }, { "epoch": 0.6308302001829083, "grad_norm": 3.6679298877716064, "learning_rate": 6.3349196739700024e-06, "loss": 1.2917, "step": 776 }, { "epoch": 0.6316431256986078, "grad_norm": 3.2031593322753906, "learning_rate": 6.310431282108622e-06, "loss": 1.2926, "step": 777 }, { "epoch": 0.6324560512143075, "grad_norm": 2.7538363933563232, "learning_rate": 6.2859684757650365e-06, "loss": 1.2634, "step": 778 }, { "epoch": 0.6332689767300071, "grad_norm": 3.4906575679779053, "learning_rate": 6.261531424577923e-06, "loss": 1.2711, "step": 779 }, { "epoch": 0.6340819022457067, "grad_norm": 3.4287617206573486, "learning_rate": 6.2371202980073596e-06, "loss": 1.2412, "step": 780 }, { "epoch": 0.6348948277614064, "grad_norm": 3.5826241970062256, "learning_rate": 6.212735265333655e-06, "loss": 1.1782, "step": 781 }, { "epoch": 0.6357077532771059, "grad_norm": 3.369983673095703, "learning_rate": 6.188376495656156e-06, "loss": 1.2628, "step": 782 }, { "epoch": 0.6365206787928056, "grad_norm": 3.6163413524627686, "learning_rate": 6.164044157892102e-06, "loss": 1.3304, "step": 783 }, { "epoch": 0.6373336043085053, "grad_norm": 2.6903252601623535, "learning_rate": 6.13973842077543e-06, "loss": 1.2458, "step": 784 }, { "epoch": 0.6381465298242048, "grad_norm": 3.919074296951294, "learning_rate": 6.11545945285561e-06, "loss": 1.253, "step": 785 }, { "epoch": 0.6389594553399045, "grad_norm": 2.9155240058898926, "learning_rate": 6.091207422496489e-06, "loss": 1.2661, "step": 786 }, { "epoch": 0.6397723808556041, "grad_norm": 3.2426347732543945, "learning_rate": 6.066982497875109e-06, "loss": 1.2556, "step": 787 }, { "epoch": 0.6405853063713037, "grad_norm": 3.078899383544922, "learning_rate": 6.042784846980542e-06, "loss": 1.2572, "step": 788 }, { "epoch": 0.6413982318870034, "grad_norm": 3.3044381141662598, "learning_rate": 6.018614637612733e-06, "loss": 1.2301, "step": 789 }, { "epoch": 0.642211157402703, "grad_norm": 2.8474955558776855, "learning_rate": 5.99447203738134e-06, "loss": 1.2042, "step": 790 }, { "epoch": 0.6430240829184026, "grad_norm": 2.9787845611572266, "learning_rate": 5.9703572137045495e-06, "loss": 1.2608, "step": 791 }, { "epoch": 0.6438370084341022, "grad_norm": 3.380209445953369, "learning_rate": 5.946270333807937e-06, "loss": 1.2973, "step": 792 }, { "epoch": 0.6446499339498019, "grad_norm": 2.81736421585083, "learning_rate": 5.922211564723302e-06, "loss": 1.2791, "step": 793 }, { "epoch": 0.6454628594655015, "grad_norm": 2.9054102897644043, "learning_rate": 5.898181073287504e-06, "loss": 1.2692, "step": 794 }, { "epoch": 0.6462757849812011, "grad_norm": 3.2480154037475586, "learning_rate": 5.87417902614131e-06, "loss": 1.311, "step": 795 }, { "epoch": 0.6470887104969008, "grad_norm": 2.8822832107543945, "learning_rate": 5.850205589728239e-06, "loss": 1.2528, "step": 796 }, { "epoch": 0.6479016360126003, "grad_norm": 2.8832008838653564, "learning_rate": 5.826260930293417e-06, "loss": 1.2631, "step": 797 }, { "epoch": 0.6487145615283, "grad_norm": 3.547271490097046, "learning_rate": 5.802345213882396e-06, "loss": 1.2543, "step": 798 }, { "epoch": 0.6495274870439995, "grad_norm": 9.93248176574707, "learning_rate": 5.778458606340037e-06, "loss": 1.3218, "step": 799 }, { "epoch": 0.6503404125596992, "grad_norm": 4.664019584655762, "learning_rate": 5.754601273309333e-06, "loss": 1.2487, "step": 800 }, { "epoch": 0.6511533380753989, "grad_norm": 3.191390037536621, "learning_rate": 5.730773380230276e-06, "loss": 1.1966, "step": 801 }, { "epoch": 0.6519662635910984, "grad_norm": 3.228309392929077, "learning_rate": 5.70697509233871e-06, "loss": 1.2556, "step": 802 }, { "epoch": 0.6527791891067981, "grad_norm": 3.1456098556518555, "learning_rate": 5.683206574665165e-06, "loss": 1.2308, "step": 803 }, { "epoch": 0.6535921146224977, "grad_norm": 2.800039052963257, "learning_rate": 5.6594679920337514e-06, "loss": 1.2599, "step": 804 }, { "epoch": 0.6544050401381973, "grad_norm": 2.9048550128936768, "learning_rate": 5.635759509060969e-06, "loss": 1.2707, "step": 805 }, { "epoch": 0.655217965653897, "grad_norm": 4.015383720397949, "learning_rate": 5.612081290154607e-06, "loss": 1.1853, "step": 806 }, { "epoch": 0.6560308911695966, "grad_norm": 2.6166458129882812, "learning_rate": 5.58843349951258e-06, "loss": 1.2589, "step": 807 }, { "epoch": 0.6568438166852962, "grad_norm": 4.735121726989746, "learning_rate": 5.564816301121792e-06, "loss": 1.2395, "step": 808 }, { "epoch": 0.6576567422009958, "grad_norm": 3.5069589614868164, "learning_rate": 5.541229858757011e-06, "loss": 1.2888, "step": 809 }, { "epoch": 0.6584696677166955, "grad_norm": 2.354539394378662, "learning_rate": 5.517674335979721e-06, "loss": 1.1898, "step": 810 }, { "epoch": 0.6592825932323951, "grad_norm": 3.2337725162506104, "learning_rate": 5.494149896136998e-06, "loss": 1.311, "step": 811 }, { "epoch": 0.6600955187480947, "grad_norm": 2.6511757373809814, "learning_rate": 5.470656702360367e-06, "loss": 1.2788, "step": 812 }, { "epoch": 0.6609084442637944, "grad_norm": 3.772780179977417, "learning_rate": 5.447194917564671e-06, "loss": 1.2211, "step": 813 }, { "epoch": 0.6617213697794939, "grad_norm": 2.540316581726074, "learning_rate": 5.423764704446954e-06, "loss": 1.2647, "step": 814 }, { "epoch": 0.6625342952951936, "grad_norm": 2.792747735977173, "learning_rate": 5.400366225485326e-06, "loss": 1.2184, "step": 815 }, { "epoch": 0.6633472208108933, "grad_norm": 3.32261061668396, "learning_rate": 5.376999642937817e-06, "loss": 1.2727, "step": 816 }, { "epoch": 0.6641601463265928, "grad_norm": 4.128072738647461, "learning_rate": 5.353665118841296e-06, "loss": 1.2718, "step": 817 }, { "epoch": 0.6649730718422925, "grad_norm": 2.9913909435272217, "learning_rate": 5.330362815010306e-06, "loss": 1.2698, "step": 818 }, { "epoch": 0.665785997357992, "grad_norm": 2.9993457794189453, "learning_rate": 5.307092893035951e-06, "loss": 1.2447, "step": 819 }, { "epoch": 0.6665989228736917, "grad_norm": 2.801236629486084, "learning_rate": 5.2838555142847925e-06, "loss": 1.209, "step": 820 }, { "epoch": 0.6674118483893913, "grad_norm": 3.982821464538574, "learning_rate": 5.260650839897719e-06, "loss": 1.3099, "step": 821 }, { "epoch": 0.6682247739050909, "grad_norm": 2.9553382396698, "learning_rate": 5.237479030788817e-06, "loss": 1.2652, "step": 822 }, { "epoch": 0.6690376994207906, "grad_norm": 3.233414888381958, "learning_rate": 5.214340247644278e-06, "loss": 1.2256, "step": 823 }, { "epoch": 0.6698506249364902, "grad_norm": 3.1418299674987793, "learning_rate": 5.191234650921273e-06, "loss": 1.2225, "step": 824 }, { "epoch": 0.6706635504521898, "grad_norm": 2.8071773052215576, "learning_rate": 5.168162400846835e-06, "loss": 1.3381, "step": 825 }, { "epoch": 0.6714764759678894, "grad_norm": 3.2606897354125977, "learning_rate": 5.145123657416759e-06, "loss": 1.2671, "step": 826 }, { "epoch": 0.6722894014835891, "grad_norm": 2.5103461742401123, "learning_rate": 5.122118580394473e-06, "loss": 1.2349, "step": 827 }, { "epoch": 0.6731023269992887, "grad_norm": 2.882448196411133, "learning_rate": 5.099147329309959e-06, "loss": 1.2466, "step": 828 }, { "epoch": 0.6739152525149883, "grad_norm": 3.0320730209350586, "learning_rate": 5.076210063458622e-06, "loss": 1.2157, "step": 829 }, { "epoch": 0.674728178030688, "grad_norm": 3.285125970840454, "learning_rate": 5.0533069419002e-06, "loss": 1.3087, "step": 830 }, { "epoch": 0.6755411035463875, "grad_norm": 3.9807510375976562, "learning_rate": 5.030438123457655e-06, "loss": 1.2153, "step": 831 }, { "epoch": 0.6763540290620872, "grad_norm": 3.12975811958313, "learning_rate": 5.007603766716063e-06, "loss": 1.2064, "step": 832 }, { "epoch": 0.6771669545777869, "grad_norm": 2.9132258892059326, "learning_rate": 4.984804030021533e-06, "loss": 1.2132, "step": 833 }, { "epoch": 0.6779798800934864, "grad_norm": 2.872042417526245, "learning_rate": 4.962039071480102e-06, "loss": 1.2618, "step": 834 }, { "epoch": 0.6787928056091861, "grad_norm": 3.7190613746643066, "learning_rate": 4.939309048956622e-06, "loss": 1.2482, "step": 835 }, { "epoch": 0.6796057311248856, "grad_norm": 5.171625137329102, "learning_rate": 4.9166141200736885e-06, "loss": 1.2848, "step": 836 }, { "epoch": 0.6804186566405853, "grad_norm": 3.5912961959838867, "learning_rate": 4.89395444221055e-06, "loss": 1.2525, "step": 837 }, { "epoch": 0.6812315821562849, "grad_norm": 3.9113729000091553, "learning_rate": 4.871330172501979e-06, "loss": 1.2444, "step": 838 }, { "epoch": 0.6820445076719845, "grad_norm": 5.135432720184326, "learning_rate": 4.848741467837228e-06, "loss": 1.2189, "step": 839 }, { "epoch": 0.6828574331876842, "grad_norm": 3.0934841632843018, "learning_rate": 4.826188484858918e-06, "loss": 1.2357, "step": 840 }, { "epoch": 0.6836703587033838, "grad_norm": 3.951188325881958, "learning_rate": 4.803671379961945e-06, "loss": 1.2539, "step": 841 }, { "epoch": 0.6844832842190834, "grad_norm": 6.205260753631592, "learning_rate": 4.781190309292421e-06, "loss": 1.2537, "step": 842 }, { "epoch": 0.685296209734783, "grad_norm": 4.493546485900879, "learning_rate": 4.758745428746569e-06, "loss": 1.252, "step": 843 }, { "epoch": 0.6861091352504827, "grad_norm": 4.0202436447143555, "learning_rate": 4.736336893969652e-06, "loss": 1.1887, "step": 844 }, { "epoch": 0.6869220607661823, "grad_norm": 2.65285587310791, "learning_rate": 4.7139648603548925e-06, "loss": 1.2612, "step": 845 }, { "epoch": 0.6877349862818819, "grad_norm": 3.629551410675049, "learning_rate": 4.691629483042387e-06, "loss": 1.2411, "step": 846 }, { "epoch": 0.6885479117975816, "grad_norm": 3.20709228515625, "learning_rate": 4.669330916918043e-06, "loss": 1.1949, "step": 847 }, { "epoch": 0.6893608373132811, "grad_norm": 3.19427752494812, "learning_rate": 4.647069316612502e-06, "loss": 1.2134, "step": 848 }, { "epoch": 0.6901737628289808, "grad_norm": 3.6364243030548096, "learning_rate": 4.624844836500052e-06, "loss": 1.2915, "step": 849 }, { "epoch": 0.6909866883446805, "grad_norm": 3.5689237117767334, "learning_rate": 4.60265763069758e-06, "loss": 1.2234, "step": 850 }, { "epoch": 0.69179961386038, "grad_norm": 3.1175014972686768, "learning_rate": 4.580507853063487e-06, "loss": 1.1833, "step": 851 }, { "epoch": 0.6926125393760797, "grad_norm": 2.945756196975708, "learning_rate": 4.5583956571966295e-06, "loss": 1.2231, "step": 852 }, { "epoch": 0.6934254648917793, "grad_norm": 4.729986667633057, "learning_rate": 4.5363211964352524e-06, "loss": 1.2578, "step": 853 }, { "epoch": 0.6942383904074789, "grad_norm": 2.7775003910064697, "learning_rate": 4.514284623855915e-06, "loss": 1.2678, "step": 854 }, { "epoch": 0.6950513159231786, "grad_norm": 4.027686595916748, "learning_rate": 4.4922860922724466e-06, "loss": 1.1692, "step": 855 }, { "epoch": 0.6958642414388782, "grad_norm": 3.3442118167877197, "learning_rate": 4.470325754234881e-06, "loss": 1.2515, "step": 856 }, { "epoch": 0.6966771669545778, "grad_norm": 3.197281837463379, "learning_rate": 4.448403762028391e-06, "loss": 1.2789, "step": 857 }, { "epoch": 0.6974900924702774, "grad_norm": 3.1467063426971436, "learning_rate": 4.426520267672244e-06, "loss": 1.2498, "step": 858 }, { "epoch": 0.698303017985977, "grad_norm": 8.657835960388184, "learning_rate": 4.40467542291874e-06, "loss": 1.2149, "step": 859 }, { "epoch": 0.6991159435016766, "grad_norm": 5.045658111572266, "learning_rate": 4.382869379252152e-06, "loss": 1.2143, "step": 860 }, { "epoch": 0.6999288690173763, "grad_norm": 3.543026924133301, "learning_rate": 4.361102287887698e-06, "loss": 1.2727, "step": 861 }, { "epoch": 0.700741794533076, "grad_norm": 3.2592012882232666, "learning_rate": 4.339374299770477e-06, "loss": 1.2528, "step": 862 }, { "epoch": 0.7015547200487755, "grad_norm": 3.284749984741211, "learning_rate": 4.31768556557441e-06, "loss": 1.1814, "step": 863 }, { "epoch": 0.7023676455644752, "grad_norm": 2.9172427654266357, "learning_rate": 4.296036235701235e-06, "loss": 1.2536, "step": 864 }, { "epoch": 0.7031805710801747, "grad_norm": 8.07040023803711, "learning_rate": 4.274426460279412e-06, "loss": 1.2113, "step": 865 }, { "epoch": 0.7039934965958744, "grad_norm": 3.0349769592285156, "learning_rate": 4.252856389163128e-06, "loss": 1.2279, "step": 866 }, { "epoch": 0.7048064221115741, "grad_norm": 2.7983269691467285, "learning_rate": 4.231326171931231e-06, "loss": 1.2585, "step": 867 }, { "epoch": 0.7056193476272736, "grad_norm": 3.153099775314331, "learning_rate": 4.209835957886196e-06, "loss": 1.2576, "step": 868 }, { "epoch": 0.7064322731429733, "grad_norm": 3.4303712844848633, "learning_rate": 4.188385896053098e-06, "loss": 1.2569, "step": 869 }, { "epoch": 0.7072451986586729, "grad_norm": 3.310842990875244, "learning_rate": 4.166976135178575e-06, "loss": 1.2162, "step": 870 }, { "epoch": 0.7080581241743725, "grad_norm": 3.982365846633911, "learning_rate": 4.1456068237297964e-06, "loss": 1.2409, "step": 871 }, { "epoch": 0.7088710496900722, "grad_norm": 3.0641191005706787, "learning_rate": 4.124278109893432e-06, "loss": 1.2563, "step": 872 }, { "epoch": 0.7096839752057718, "grad_norm": 2.9682273864746094, "learning_rate": 4.10299014157462e-06, "loss": 1.1857, "step": 873 }, { "epoch": 0.7104969007214714, "grad_norm": 6.076914310455322, "learning_rate": 4.0817430663959536e-06, "loss": 1.2108, "step": 874 }, { "epoch": 0.711309826237171, "grad_norm": 8.528678894042969, "learning_rate": 4.06053703169645e-06, "loss": 1.2185, "step": 875 }, { "epoch": 0.7121227517528707, "grad_norm": 3.4424145221710205, "learning_rate": 4.039372184530521e-06, "loss": 1.2461, "step": 876 }, { "epoch": 0.7129356772685703, "grad_norm": 3.1624224185943604, "learning_rate": 4.0182486716669656e-06, "loss": 1.2282, "step": 877 }, { "epoch": 0.7137486027842699, "grad_norm": 4.986435890197754, "learning_rate": 3.9971666395879605e-06, "loss": 1.2048, "step": 878 }, { "epoch": 0.7145615282999696, "grad_norm": 3.537174701690674, "learning_rate": 3.9761262344880096e-06, "loss": 1.2752, "step": 879 }, { "epoch": 0.7153744538156691, "grad_norm": 2.7389779090881348, "learning_rate": 3.9551276022729644e-06, "loss": 1.2434, "step": 880 }, { "epoch": 0.7161873793313688, "grad_norm": 3.5238423347473145, "learning_rate": 3.9341708885590034e-06, "loss": 1.2409, "step": 881 }, { "epoch": 0.7170003048470683, "grad_norm": 3.9080941677093506, "learning_rate": 3.913256238671607e-06, "loss": 1.2019, "step": 882 }, { "epoch": 0.717813230362768, "grad_norm": 4.038003921508789, "learning_rate": 3.89238379764457e-06, "loss": 1.2212, "step": 883 }, { "epoch": 0.7186261558784677, "grad_norm": 3.344622850418091, "learning_rate": 3.871553710218988e-06, "loss": 1.2067, "step": 884 }, { "epoch": 0.7194390813941672, "grad_norm": 3.5090816020965576, "learning_rate": 3.850766120842252e-06, "loss": 1.2171, "step": 885 }, { "epoch": 0.7202520069098669, "grad_norm": 3.003899335861206, "learning_rate": 3.830021173667048e-06, "loss": 1.2371, "step": 886 }, { "epoch": 0.7210649324255665, "grad_norm": 3.3116228580474854, "learning_rate": 3.809319012550352e-06, "loss": 1.2123, "step": 887 }, { "epoch": 0.7218778579412661, "grad_norm": 3.532245397567749, "learning_rate": 3.788659781052444e-06, "loss": 1.2629, "step": 888 }, { "epoch": 0.7226907834569658, "grad_norm": 4.061065196990967, "learning_rate": 3.7680436224359084e-06, "loss": 1.174, "step": 889 }, { "epoch": 0.7235037089726654, "grad_norm": 3.3992788791656494, "learning_rate": 3.747470679664624e-06, "loss": 1.2209, "step": 890 }, { "epoch": 0.724316634488365, "grad_norm": 3.4010937213897705, "learning_rate": 3.7269410954028107e-06, "loss": 1.2426, "step": 891 }, { "epoch": 0.7251295600040646, "grad_norm": 2.854327917098999, "learning_rate": 3.706455012013994e-06, "loss": 1.1932, "step": 892 }, { "epoch": 0.7259424855197643, "grad_norm": 3.451002836227417, "learning_rate": 3.6860125715600513e-06, "loss": 1.253, "step": 893 }, { "epoch": 0.7267554110354639, "grad_norm": 3.123344898223877, "learning_rate": 3.665613915800217e-06, "loss": 1.2187, "step": 894 }, { "epoch": 0.7275683365511635, "grad_norm": 3.021973133087158, "learning_rate": 3.6452591861900886e-06, "loss": 1.2165, "step": 895 }, { "epoch": 0.7283812620668632, "grad_norm": 3.234985589981079, "learning_rate": 3.6249485238806637e-06, "loss": 1.212, "step": 896 }, { "epoch": 0.7291941875825627, "grad_norm": 3.7146785259246826, "learning_rate": 3.6046820697173514e-06, "loss": 1.2697, "step": 897 }, { "epoch": 0.7300071130982624, "grad_norm": 3.134507417678833, "learning_rate": 3.5844599642389965e-06, "loss": 1.2433, "step": 898 }, { "epoch": 0.7308200386139619, "grad_norm": 2.9155194759368896, "learning_rate": 3.564282347676903e-06, "loss": 1.2403, "step": 899 }, { "epoch": 0.7316329641296616, "grad_norm": 3.148232936859131, "learning_rate": 3.54414935995387e-06, "loss": 1.2575, "step": 900 }, { "epoch": 0.7324458896453613, "grad_norm": 2.685274124145508, "learning_rate": 3.524061140683206e-06, "loss": 1.2124, "step": 901 }, { "epoch": 0.7332588151610608, "grad_norm": 3.4557571411132812, "learning_rate": 3.5040178291677816e-06, "loss": 1.2105, "step": 902 }, { "epoch": 0.7340717406767605, "grad_norm": 2.8230202198028564, "learning_rate": 3.4840195643990383e-06, "loss": 1.1745, "step": 903 }, { "epoch": 0.7348846661924601, "grad_norm": 3.311697483062744, "learning_rate": 3.464066485056048e-06, "loss": 1.222, "step": 904 }, { "epoch": 0.7356975917081597, "grad_norm": 3.2953929901123047, "learning_rate": 3.444158729504549e-06, "loss": 1.2688, "step": 905 }, { "epoch": 0.7365105172238594, "grad_norm": 3.3319778442382812, "learning_rate": 3.4242964357959597e-06, "loss": 1.2539, "step": 906 }, { "epoch": 0.737323442739559, "grad_norm": 3.124361753463745, "learning_rate": 3.4044797416664564e-06, "loss": 1.2527, "step": 907 }, { "epoch": 0.7381363682552586, "grad_norm": 2.9690327644348145, "learning_rate": 3.3847087845359996e-06, "loss": 1.2722, "step": 908 }, { "epoch": 0.7389492937709582, "grad_norm": 5.119561672210693, "learning_rate": 3.364983701507376e-06, "loss": 1.2233, "step": 909 }, { "epoch": 0.7397622192866579, "grad_norm": 2.818423271179199, "learning_rate": 3.3453046293652657e-06, "loss": 1.2438, "step": 910 }, { "epoch": 0.7405751448023575, "grad_norm": 3.0988523960113525, "learning_rate": 3.3256717045752794e-06, "loss": 1.223, "step": 911 }, { "epoch": 0.7413880703180571, "grad_norm": 3.082066297531128, "learning_rate": 3.3060850632830167e-06, "loss": 1.244, "step": 912 }, { "epoch": 0.7422009958337568, "grad_norm": 2.944265127182007, "learning_rate": 3.286544841313126e-06, "loss": 1.2308, "step": 913 }, { "epoch": 0.7430139213494563, "grad_norm": 3.608762502670288, "learning_rate": 3.2670511741683475e-06, "loss": 1.2018, "step": 914 }, { "epoch": 0.743826846865156, "grad_norm": 3.958385705947876, "learning_rate": 3.2476041970285945e-06, "loss": 1.2136, "step": 915 }, { "epoch": 0.7446397723808557, "grad_norm": 2.9133267402648926, "learning_rate": 3.2282040447500063e-06, "loss": 1.2649, "step": 916 }, { "epoch": 0.7454526978965552, "grad_norm": 3.8698244094848633, "learning_rate": 3.208850851863998e-06, "loss": 1.2265, "step": 917 }, { "epoch": 0.7462656234122549, "grad_norm": 4.550247669219971, "learning_rate": 3.189544752576369e-06, "loss": 1.2046, "step": 918 }, { "epoch": 0.7470785489279544, "grad_norm": 2.9886014461517334, "learning_rate": 3.1702858807663175e-06, "loss": 1.2812, "step": 919 }, { "epoch": 0.7478914744436541, "grad_norm": 3.3736209869384766, "learning_rate": 3.151074369985556e-06, "loss": 1.2482, "step": 920 }, { "epoch": 0.7487043999593537, "grad_norm": 2.7061290740966797, "learning_rate": 3.131910353457369e-06, "loss": 1.2474, "step": 921 }, { "epoch": 0.7495173254750533, "grad_norm": 4.058886528015137, "learning_rate": 3.112793964075681e-06, "loss": 1.1897, "step": 922 }, { "epoch": 0.750330250990753, "grad_norm": 3.3311798572540283, "learning_rate": 3.0937253344041507e-06, "loss": 1.2129, "step": 923 }, { "epoch": 0.7511431765064526, "grad_norm": 3.2716569900512695, "learning_rate": 3.074704596675242e-06, "loss": 1.1763, "step": 924 }, { "epoch": 0.7519561020221522, "grad_norm": 3.360356569290161, "learning_rate": 3.055731882789311e-06, "loss": 1.2771, "step": 925 }, { "epoch": 0.7527690275378518, "grad_norm": 3.9494638442993164, "learning_rate": 3.0368073243136874e-06, "loss": 1.2551, "step": 926 }, { "epoch": 0.7535819530535515, "grad_norm": 3.3180434703826904, "learning_rate": 3.0179310524817707e-06, "loss": 1.245, "step": 927 }, { "epoch": 0.7543948785692511, "grad_norm": 4.963752746582031, "learning_rate": 2.9991031981921026e-06, "loss": 1.2266, "step": 928 }, { "epoch": 0.7552078040849507, "grad_norm": 3.1220555305480957, "learning_rate": 2.9803238920074784e-06, "loss": 1.2057, "step": 929 }, { "epoch": 0.7560207296006504, "grad_norm": 2.8764801025390625, "learning_rate": 2.961593264154038e-06, "loss": 1.2157, "step": 930 }, { "epoch": 0.7568336551163499, "grad_norm": 2.682791233062744, "learning_rate": 2.9429114445203423e-06, "loss": 1.1899, "step": 931 }, { "epoch": 0.7576465806320496, "grad_norm": 5.8080878257751465, "learning_rate": 2.924278562656514e-06, "loss": 1.1661, "step": 932 }, { "epoch": 0.7584595061477493, "grad_norm": 3.5146303176879883, "learning_rate": 2.90569474777329e-06, "loss": 1.2712, "step": 933 }, { "epoch": 0.7592724316634488, "grad_norm": 3.092174530029297, "learning_rate": 2.8871601287411634e-06, "loss": 1.2297, "step": 934 }, { "epoch": 0.7600853571791485, "grad_norm": 2.807847499847412, "learning_rate": 2.8686748340894744e-06, "loss": 1.2369, "step": 935 }, { "epoch": 0.760898282694848, "grad_norm": 2.8753178119659424, "learning_rate": 2.850238992005514e-06, "loss": 1.2812, "step": 936 }, { "epoch": 0.7617112082105477, "grad_norm": 4.227181434631348, "learning_rate": 2.8318527303336465e-06, "loss": 1.2143, "step": 937 }, { "epoch": 0.7625241337262473, "grad_norm": 3.921201229095459, "learning_rate": 2.81351617657442e-06, "loss": 1.2446, "step": 938 }, { "epoch": 0.763337059241947, "grad_norm": 3.164557695388794, "learning_rate": 2.795229457883678e-06, "loss": 1.2085, "step": 939 }, { "epoch": 0.7641499847576466, "grad_norm": 3.0904717445373535, "learning_rate": 2.7769927010716814e-06, "loss": 1.2436, "step": 940 }, { "epoch": 0.7649629102733462, "grad_norm": 9.615850448608398, "learning_rate": 2.7588060326022205e-06, "loss": 1.2179, "step": 941 }, { "epoch": 0.7657758357890458, "grad_norm": 7.9210357666015625, "learning_rate": 2.740669578591755e-06, "loss": 1.1704, "step": 942 }, { "epoch": 0.7665887613047454, "grad_norm": 3.03359055519104, "learning_rate": 2.7225834648085282e-06, "loss": 1.1919, "step": 943 }, { "epoch": 0.7674016868204451, "grad_norm": 3.331894636154175, "learning_rate": 2.7045478166716843e-06, "loss": 1.2297, "step": 944 }, { "epoch": 0.7682146123361447, "grad_norm": 2.9995782375335693, "learning_rate": 2.6865627592504295e-06, "loss": 1.1936, "step": 945 }, { "epoch": 0.7690275378518443, "grad_norm": 11.267196655273438, "learning_rate": 2.668628417263137e-06, "loss": 1.2385, "step": 946 }, { "epoch": 0.769840463367544, "grad_norm": 4.058920383453369, "learning_rate": 2.6507449150764852e-06, "loss": 1.2078, "step": 947 }, { "epoch": 0.7706533888832435, "grad_norm": 2.8774616718292236, "learning_rate": 2.632912376704607e-06, "loss": 1.2585, "step": 948 }, { "epoch": 0.7714663143989432, "grad_norm": 3.4053540229797363, "learning_rate": 2.615130925808228e-06, "loss": 1.2739, "step": 949 }, { "epoch": 0.7722792399146429, "grad_norm": 3.0022501945495605, "learning_rate": 2.597400685693795e-06, "loss": 1.2136, "step": 950 }, { "epoch": 0.7730921654303424, "grad_norm": 3.6466481685638428, "learning_rate": 2.5797217793126373e-06, "loss": 1.3104, "step": 951 }, { "epoch": 0.7739050909460421, "grad_norm": 4.021648406982422, "learning_rate": 2.5620943292601074e-06, "loss": 1.2621, "step": 952 }, { "epoch": 0.7747180164617417, "grad_norm": 2.996817111968994, "learning_rate": 2.5445184577747305e-06, "loss": 1.2194, "step": 953 }, { "epoch": 0.7755309419774413, "grad_norm": 3.8881189823150635, "learning_rate": 2.52699428673736e-06, "loss": 1.2516, "step": 954 }, { "epoch": 0.776343867493141, "grad_norm": 3.279557228088379, "learning_rate": 2.5095219376703183e-06, "loss": 1.2116, "step": 955 }, { "epoch": 0.7771567930088406, "grad_norm": 3.1030569076538086, "learning_rate": 2.4921015317365794e-06, "loss": 1.2902, "step": 956 }, { "epoch": 0.7779697185245402, "grad_norm": 3.7724967002868652, "learning_rate": 2.4747331897389103e-06, "loss": 1.2783, "step": 957 }, { "epoch": 0.7787826440402398, "grad_norm": 2.808138132095337, "learning_rate": 2.4574170321190305e-06, "loss": 1.2191, "step": 958 }, { "epoch": 0.7795955695559394, "grad_norm": 2.6033871173858643, "learning_rate": 2.440153178956798e-06, "loss": 1.2282, "step": 959 }, { "epoch": 0.780408495071639, "grad_norm": 2.870957612991333, "learning_rate": 2.42294174996935e-06, "loss": 1.2118, "step": 960 }, { "epoch": 0.7812214205873387, "grad_norm": 2.913543462753296, "learning_rate": 2.40578286451029e-06, "loss": 1.2352, "step": 961 }, { "epoch": 0.7820343461030383, "grad_norm": 3.7069716453552246, "learning_rate": 2.38867664156886e-06, "loss": 1.2218, "step": 962 }, { "epoch": 0.7828472716187379, "grad_norm": 4.073693752288818, "learning_rate": 2.3716231997691007e-06, "loss": 1.1997, "step": 963 }, { "epoch": 0.7836601971344376, "grad_norm": 2.7815756797790527, "learning_rate": 2.3546226573690444e-06, "loss": 1.1898, "step": 964 }, { "epoch": 0.7844731226501371, "grad_norm": 3.2033910751342773, "learning_rate": 2.3376751322599e-06, "loss": 1.2575, "step": 965 }, { "epoch": 0.7852860481658368, "grad_norm": 2.805227518081665, "learning_rate": 2.320780741965206e-06, "loss": 1.221, "step": 966 }, { "epoch": 0.7860989736815365, "grad_norm": 2.747638463973999, "learning_rate": 2.3039396036400463e-06, "loss": 1.2199, "step": 967 }, { "epoch": 0.786911899197236, "grad_norm": 2.758178234100342, "learning_rate": 2.287151834070226e-06, "loss": 1.1847, "step": 968 }, { "epoch": 0.7877248247129357, "grad_norm": 3.467595338821411, "learning_rate": 2.2704175496714552e-06, "loss": 1.2456, "step": 969 }, { "epoch": 0.7885377502286353, "grad_norm": 5.487158298492432, "learning_rate": 2.2537368664885527e-06, "loss": 1.2061, "step": 970 }, { "epoch": 0.7893506757443349, "grad_norm": 3.063075542449951, "learning_rate": 2.2371099001946385e-06, "loss": 1.264, "step": 971 }, { "epoch": 0.7901636012600346, "grad_norm": 2.6598317623138428, "learning_rate": 2.2205367660903267e-06, "loss": 1.1971, "step": 972 }, { "epoch": 0.7909765267757342, "grad_norm": 3.249379873275757, "learning_rate": 2.2040175791029305e-06, "loss": 1.2442, "step": 973 }, { "epoch": 0.7917894522914338, "grad_norm": 3.2312817573547363, "learning_rate": 2.187552453785662e-06, "loss": 1.1871, "step": 974 }, { "epoch": 0.7926023778071334, "grad_norm": 3.060171604156494, "learning_rate": 2.1711415043168395e-06, "loss": 1.2198, "step": 975 }, { "epoch": 0.793415303322833, "grad_norm": 3.2674033641815186, "learning_rate": 2.1547848444991025e-06, "loss": 1.2343, "step": 976 }, { "epoch": 0.7942282288385327, "grad_norm": 3.822357654571533, "learning_rate": 2.138482587758605e-06, "loss": 1.1876, "step": 977 }, { "epoch": 0.7950411543542323, "grad_norm": 3.4773342609405518, "learning_rate": 2.1222348471442477e-06, "loss": 1.1976, "step": 978 }, { "epoch": 0.795854079869932, "grad_norm": 3.8379478454589844, "learning_rate": 2.1060417353268845e-06, "loss": 1.198, "step": 979 }, { "epoch": 0.7966670053856315, "grad_norm": 4.963233470916748, "learning_rate": 2.0899033645985423e-06, "loss": 1.2991, "step": 980 }, { "epoch": 0.7974799309013312, "grad_norm": 3.4560701847076416, "learning_rate": 2.073819846871646e-06, "loss": 1.1936, "step": 981 }, { "epoch": 0.7982928564170307, "grad_norm": 2.69124698638916, "learning_rate": 2.0577912936782317e-06, "loss": 1.1708, "step": 982 }, { "epoch": 0.7991057819327304, "grad_norm": 2.973618268966675, "learning_rate": 2.041817816169187e-06, "loss": 1.2535, "step": 983 }, { "epoch": 0.7999187074484301, "grad_norm": 3.1709506511688232, "learning_rate": 2.025899525113474e-06, "loss": 1.2015, "step": 984 }, { "epoch": 0.8007316329641296, "grad_norm": 2.750272274017334, "learning_rate": 2.010036530897359e-06, "loss": 1.2677, "step": 985 }, { "epoch": 0.8015445584798293, "grad_norm": 2.7218148708343506, "learning_rate": 1.9942289435236506e-06, "loss": 1.2679, "step": 986 }, { "epoch": 0.8023574839955289, "grad_norm": 3.0237209796905518, "learning_rate": 1.978476872610939e-06, "loss": 1.2425, "step": 987 }, { "epoch": 0.8031704095112285, "grad_norm": 4.8593363761901855, "learning_rate": 1.962780427392823e-06, "loss": 1.2754, "step": 988 }, { "epoch": 0.8039833350269282, "grad_norm": 4.2402544021606445, "learning_rate": 1.9471397167171714e-06, "loss": 1.1841, "step": 989 }, { "epoch": 0.8047962605426278, "grad_norm": 2.8616418838500977, "learning_rate": 1.931554849045355e-06, "loss": 1.1712, "step": 990 }, { "epoch": 0.8056091860583274, "grad_norm": 3.0303030014038086, "learning_rate": 1.916025932451493e-06, "loss": 1.2217, "step": 991 }, { "epoch": 0.806422111574027, "grad_norm": 3.096165180206299, "learning_rate": 1.9005530746217238e-06, "loss": 1.1515, "step": 992 }, { "epoch": 0.8072350370897267, "grad_norm": 5.142411231994629, "learning_rate": 1.8851363828534253e-06, "loss": 1.167, "step": 993 }, { "epoch": 0.8080479626054263, "grad_norm": 3.1720876693725586, "learning_rate": 1.869775964054501e-06, "loss": 1.1896, "step": 994 }, { "epoch": 0.8088608881211259, "grad_norm": 3.833009719848633, "learning_rate": 1.8544719247426224e-06, "loss": 1.2517, "step": 995 }, { "epoch": 0.8096738136368256, "grad_norm": 3.188974618911743, "learning_rate": 1.8392243710444911e-06, "loss": 1.2795, "step": 996 }, { "epoch": 0.8104867391525251, "grad_norm": 3.601663589477539, "learning_rate": 1.8240334086951117e-06, "loss": 1.2366, "step": 997 }, { "epoch": 0.8112996646682248, "grad_norm": 3.1258544921875, "learning_rate": 1.8088991430370506e-06, "loss": 1.2002, "step": 998 }, { "epoch": 0.8121125901839243, "grad_norm": 2.71299409866333, "learning_rate": 1.7938216790197071e-06, "loss": 1.2609, "step": 999 }, { "epoch": 0.812925515699624, "grad_norm": 3.2866601943969727, "learning_rate": 1.77880112119859e-06, "loss": 1.2571, "step": 1000 }, { "epoch": 0.8137384412153237, "grad_norm": 3.1053292751312256, "learning_rate": 1.7638375737345804e-06, "loss": 1.2316, "step": 1001 }, { "epoch": 0.8145513667310232, "grad_norm": 2.839862823486328, "learning_rate": 1.7489311403932274e-06, "loss": 1.2464, "step": 1002 }, { "epoch": 0.8153642922467229, "grad_norm": 2.750040292739868, "learning_rate": 1.7340819245440166e-06, "loss": 1.2639, "step": 1003 }, { "epoch": 0.8161772177624225, "grad_norm": 3.918286085128784, "learning_rate": 1.7192900291596493e-06, "loss": 1.2379, "step": 1004 }, { "epoch": 0.8169901432781221, "grad_norm": 3.579942226409912, "learning_rate": 1.7045555568153415e-06, "loss": 1.1943, "step": 1005 }, { "epoch": 0.8178030687938218, "grad_norm": 3.2873690128326416, "learning_rate": 1.6898786096881104e-06, "loss": 1.2457, "step": 1006 }, { "epoch": 0.8186159943095214, "grad_norm": 2.721126079559326, "learning_rate": 1.6752592895560493e-06, "loss": 1.2681, "step": 1007 }, { "epoch": 0.819428919825221, "grad_norm": 2.9273929595947266, "learning_rate": 1.6606976977976408e-06, "loss": 1.1985, "step": 1008 }, { "epoch": 0.8202418453409206, "grad_norm": 3.6816606521606445, "learning_rate": 1.6461939353910494e-06, "loss": 1.2128, "step": 1009 }, { "epoch": 0.8210547708566203, "grad_norm": 2.8991682529449463, "learning_rate": 1.631748102913412e-06, "loss": 1.224, "step": 1010 }, { "epoch": 0.8218676963723199, "grad_norm": 3.2517406940460205, "learning_rate": 1.6173603005401505e-06, "loss": 1.1936, "step": 1011 }, { "epoch": 0.8226806218880195, "grad_norm": 3.0502426624298096, "learning_rate": 1.6030306280442764e-06, "loss": 1.2555, "step": 1012 }, { "epoch": 0.8234935474037192, "grad_norm": 3.2694664001464844, "learning_rate": 1.588759184795694e-06, "loss": 1.2643, "step": 1013 }, { "epoch": 0.8243064729194187, "grad_norm": 2.9429259300231934, "learning_rate": 1.574546069760514e-06, "loss": 1.2221, "step": 1014 }, { "epoch": 0.8251193984351184, "grad_norm": 3.2481369972229004, "learning_rate": 1.5603913815003634e-06, "loss": 1.1949, "step": 1015 }, { "epoch": 0.8259323239508181, "grad_norm": 3.006603717803955, "learning_rate": 1.5462952181717117e-06, "loss": 1.1593, "step": 1016 }, { "epoch": 0.8267452494665176, "grad_norm": 2.8126094341278076, "learning_rate": 1.532257677525183e-06, "loss": 1.2094, "step": 1017 }, { "epoch": 0.8275581749822173, "grad_norm": 3.258910894393921, "learning_rate": 1.5182788569048689e-06, "loss": 1.1524, "step": 1018 }, { "epoch": 0.8283711004979168, "grad_norm": 3.097121477127075, "learning_rate": 1.5043588532476827e-06, "loss": 1.2063, "step": 1019 }, { "epoch": 0.8291840260136165, "grad_norm": 3.5429606437683105, "learning_rate": 1.49049776308265e-06, "loss": 1.1579, "step": 1020 }, { "epoch": 0.8299969515293161, "grad_norm": 3.0676991939544678, "learning_rate": 1.476695682530268e-06, "loss": 1.2063, "step": 1021 }, { "epoch": 0.8308098770450157, "grad_norm": 3.191493272781372, "learning_rate": 1.4629527073018267e-06, "loss": 1.2724, "step": 1022 }, { "epoch": 0.8316228025607154, "grad_norm": 4.181521415710449, "learning_rate": 1.449268932698743e-06, "loss": 1.2627, "step": 1023 }, { "epoch": 0.832435728076415, "grad_norm": 3.7330870628356934, "learning_rate": 1.4356444536119085e-06, "loss": 1.1875, "step": 1024 }, { "epoch": 0.8332486535921146, "grad_norm": 3.5213124752044678, "learning_rate": 1.422079364521024e-06, "loss": 1.2345, "step": 1025 }, { "epoch": 0.8340615791078142, "grad_norm": 3.672848701477051, "learning_rate": 1.4085737594939497e-06, "loss": 1.2451, "step": 1026 }, { "epoch": 0.8348745046235139, "grad_norm": 3.2613043785095215, "learning_rate": 1.3951277321860468e-06, "loss": 1.261, "step": 1027 }, { "epoch": 0.8356874301392135, "grad_norm": 3.1444427967071533, "learning_rate": 1.381741375839537e-06, "loss": 1.2205, "step": 1028 }, { "epoch": 0.8365003556549131, "grad_norm": 3.7306652069091797, "learning_rate": 1.3684147832828409e-06, "loss": 1.2343, "step": 1029 }, { "epoch": 0.8373132811706128, "grad_norm": 3.6698615550994873, "learning_rate": 1.355148046929956e-06, "loss": 1.2195, "step": 1030 }, { "epoch": 0.8381262066863123, "grad_norm": 4.807132244110107, "learning_rate": 1.3419412587797908e-06, "loss": 1.1946, "step": 1031 }, { "epoch": 0.838939132202012, "grad_norm": 3.0877437591552734, "learning_rate": 1.3287945104155487e-06, "loss": 1.1901, "step": 1032 }, { "epoch": 0.8397520577177117, "grad_norm": 6.123032093048096, "learning_rate": 1.3157078930040856e-06, "loss": 1.2338, "step": 1033 }, { "epoch": 0.8405649832334112, "grad_norm": 3.8207807540893555, "learning_rate": 1.3026814972952674e-06, "loss": 1.2064, "step": 1034 }, { "epoch": 0.8413779087491109, "grad_norm": 3.591054916381836, "learning_rate": 1.2897154136213542e-06, "loss": 1.248, "step": 1035 }, { "epoch": 0.8421908342648105, "grad_norm": 3.14103364944458, "learning_rate": 1.2768097318963701e-06, "loss": 1.2247, "step": 1036 }, { "epoch": 0.8430037597805101, "grad_norm": 3.2605819702148438, "learning_rate": 1.2639645416154744e-06, "loss": 1.2265, "step": 1037 }, { "epoch": 0.8438166852962098, "grad_norm": 3.2860848903656006, "learning_rate": 1.2511799318543493e-06, "loss": 1.2083, "step": 1038 }, { "epoch": 0.8446296108119093, "grad_norm": 3.6271586418151855, "learning_rate": 1.2384559912685768e-06, "loss": 1.2562, "step": 1039 }, { "epoch": 0.845442536327609, "grad_norm": 3.0439271926879883, "learning_rate": 1.2257928080930236e-06, "loss": 1.1838, "step": 1040 }, { "epoch": 0.8462554618433086, "grad_norm": 2.9285664558410645, "learning_rate": 1.2131904701412345e-06, "loss": 1.2271, "step": 1041 }, { "epoch": 0.8470683873590082, "grad_norm": 4.422233581542969, "learning_rate": 1.2006490648048118e-06, "loss": 1.2218, "step": 1042 }, { "epoch": 0.8478813128747078, "grad_norm": 3.193469524383545, "learning_rate": 1.1881686790528279e-06, "loss": 1.2167, "step": 1043 }, { "epoch": 0.8486942383904075, "grad_norm": 2.9041225910186768, "learning_rate": 1.1757493994312052e-06, "loss": 1.1652, "step": 1044 }, { "epoch": 0.8495071639061071, "grad_norm": 2.902376890182495, "learning_rate": 1.1633913120621188e-06, "loss": 1.209, "step": 1045 }, { "epoch": 0.8503200894218067, "grad_norm": 2.7561545372009277, "learning_rate": 1.151094502643414e-06, "loss": 1.2105, "step": 1046 }, { "epoch": 0.8511330149375064, "grad_norm": 3.4532971382141113, "learning_rate": 1.1388590564479895e-06, "loss": 1.2457, "step": 1047 }, { "epoch": 0.8519459404532059, "grad_norm": 4.540160179138184, "learning_rate": 1.1266850583232224e-06, "loss": 1.1941, "step": 1048 }, { "epoch": 0.8527588659689056, "grad_norm": 2.99617075920105, "learning_rate": 1.1145725926903772e-06, "loss": 1.2138, "step": 1049 }, { "epoch": 0.8535717914846053, "grad_norm": 3.2309064865112305, "learning_rate": 1.1025217435440116e-06, "loss": 1.2373, "step": 1050 }, { "epoch": 0.8543847170003048, "grad_norm": 2.7454960346221924, "learning_rate": 1.0905325944514034e-06, "loss": 1.2473, "step": 1051 }, { "epoch": 0.8551976425160045, "grad_norm": 8.090238571166992, "learning_rate": 1.078605228551971e-06, "loss": 1.2342, "step": 1052 }, { "epoch": 0.8560105680317041, "grad_norm": 3.7213146686553955, "learning_rate": 1.0667397285566893e-06, "loss": 1.2232, "step": 1053 }, { "epoch": 0.8568234935474037, "grad_norm": 3.4427578449249268, "learning_rate": 1.0549361767475241e-06, "loss": 1.2474, "step": 1054 }, { "epoch": 0.8576364190631034, "grad_norm": 3.212726593017578, "learning_rate": 1.0431946549768567e-06, "loss": 1.2727, "step": 1055 }, { "epoch": 0.858449344578803, "grad_norm": 3.895224094390869, "learning_rate": 1.0315152446669142e-06, "loss": 1.2451, "step": 1056 }, { "epoch": 0.8592622700945026, "grad_norm": 2.8261964321136475, "learning_rate": 1.019898026809214e-06, "loss": 1.2416, "step": 1057 }, { "epoch": 0.8600751956102022, "grad_norm": 3.2146759033203125, "learning_rate": 1.0083430819639962e-06, "loss": 1.2258, "step": 1058 }, { "epoch": 0.8608881211259019, "grad_norm": 5.239031791687012, "learning_rate": 9.968504902596566e-07, "loss": 1.2089, "step": 1059 }, { "epoch": 0.8617010466416014, "grad_norm": 3.1487622261047363, "learning_rate": 9.85420331392214e-07, "loss": 1.2445, "step": 1060 }, { "epoch": 0.8625139721573011, "grad_norm": 3.5642974376678467, "learning_rate": 9.74052684624731e-07, "loss": 1.2724, "step": 1061 }, { "epoch": 0.8633268976730007, "grad_norm": 3.3064541816711426, "learning_rate": 9.62747628786782e-07, "loss": 1.235, "step": 1062 }, { "epoch": 0.8641398231887003, "grad_norm": 2.7583703994750977, "learning_rate": 9.515052422739035e-07, "loss": 1.1864, "step": 1063 }, { "epoch": 0.8649527487044, "grad_norm": 2.8002755641937256, "learning_rate": 9.403256030470386e-07, "loss": 1.1888, "step": 1064 }, { "epoch": 0.8657656742200995, "grad_norm": 4.0211710929870605, "learning_rate": 9.292087886320166e-07, "loss": 1.2513, "step": 1065 }, { "epoch": 0.8665785997357992, "grad_norm": 3.937668561935425, "learning_rate": 9.181548761189996e-07, "loss": 1.2111, "step": 1066 }, { "epoch": 0.8673915252514989, "grad_norm": 31.291566848754883, "learning_rate": 9.071639421619527e-07, "loss": 1.2234, "step": 1067 }, { "epoch": 0.8682044507671984, "grad_norm": 4.150018692016602, "learning_rate": 8.962360629781164e-07, "loss": 1.2205, "step": 1068 }, { "epoch": 0.8690173762828981, "grad_norm": 2.8017213344573975, "learning_rate": 8.853713143474685e-07, "loss": 1.27, "step": 1069 }, { "epoch": 0.8698303017985977, "grad_norm": 2.9798476696014404, "learning_rate": 8.745697716122081e-07, "loss": 1.2169, "step": 1070 }, { "epoch": 0.8706432273142973, "grad_norm": 4.344991683959961, "learning_rate": 8.638315096762318e-07, "loss": 1.2217, "step": 1071 }, { "epoch": 0.871456152829997, "grad_norm": 2.9421257972717285, "learning_rate": 8.531566030046035e-07, "loss": 1.2399, "step": 1072 }, { "epoch": 0.8722690783456966, "grad_norm": 3.4676921367645264, "learning_rate": 8.425451256230588e-07, "loss": 1.1957, "step": 1073 }, { "epoch": 0.8730820038613962, "grad_norm": 3.2855141162872314, "learning_rate": 8.319971511174718e-07, "loss": 1.2399, "step": 1074 }, { "epoch": 0.8738949293770958, "grad_norm": 2.990471839904785, "learning_rate": 8.215127526333499e-07, "loss": 1.2787, "step": 1075 }, { "epoch": 0.8747078548927955, "grad_norm": 3.183928966522217, "learning_rate": 8.110920028753355e-07, "loss": 1.1831, "step": 1076 }, { "epoch": 0.8755207804084951, "grad_norm": 2.8277997970581055, "learning_rate": 8.007349741066939e-07, "loss": 1.248, "step": 1077 }, { "epoch": 0.8763337059241947, "grad_norm": 2.7392983436584473, "learning_rate": 7.904417381488083e-07, "loss": 1.23, "step": 1078 }, { "epoch": 0.8771466314398944, "grad_norm": 5.617170333862305, "learning_rate": 7.802123663806938e-07, "loss": 1.2267, "step": 1079 }, { "epoch": 0.8779595569555939, "grad_norm": 2.906653642654419, "learning_rate": 7.700469297384927e-07, "loss": 1.2245, "step": 1080 }, { "epoch": 0.8787724824712936, "grad_norm": 2.7728428840637207, "learning_rate": 7.599454987149868e-07, "loss": 1.2131, "step": 1081 }, { "epoch": 0.8795854079869931, "grad_norm": 2.683861017227173, "learning_rate": 7.499081433591071e-07, "loss": 1.1936, "step": 1082 }, { "epoch": 0.8803983335026928, "grad_norm": 2.6362993717193604, "learning_rate": 7.399349332754458e-07, "loss": 1.2169, "step": 1083 }, { "epoch": 0.8812112590183925, "grad_norm": 3.3068742752075195, "learning_rate": 7.300259376237795e-07, "loss": 1.2098, "step": 1084 }, { "epoch": 0.882024184534092, "grad_norm": 2.825416326522827, "learning_rate": 7.201812251185869e-07, "loss": 1.2543, "step": 1085 }, { "epoch": 0.8828371100497917, "grad_norm": 3.172919750213623, "learning_rate": 7.104008640285642e-07, "loss": 1.1768, "step": 1086 }, { "epoch": 0.8836500355654913, "grad_norm": 3.052677869796753, "learning_rate": 7.006849221761736e-07, "loss": 1.2068, "step": 1087 }, { "epoch": 0.8844629610811909, "grad_norm": 2.8510589599609375, "learning_rate": 6.910334669371433e-07, "loss": 1.2043, "step": 1088 }, { "epoch": 0.8852758865968906, "grad_norm": 3.4369497299194336, "learning_rate": 6.814465652400237e-07, "loss": 1.2467, "step": 1089 }, { "epoch": 0.8860888121125902, "grad_norm": 2.667567491531372, "learning_rate": 6.719242835657147e-07, "loss": 1.2594, "step": 1090 }, { "epoch": 0.8869017376282898, "grad_norm": 2.983642816543579, "learning_rate": 6.62466687947001e-07, "loss": 1.2199, "step": 1091 }, { "epoch": 0.8877146631439894, "grad_norm": 3.583439350128174, "learning_rate": 6.530738439681017e-07, "loss": 1.1827, "step": 1092 }, { "epoch": 0.8885275886596891, "grad_norm": 4.706247806549072, "learning_rate": 6.437458167642164e-07, "loss": 1.2292, "step": 1093 }, { "epoch": 0.8893405141753887, "grad_norm": 4.394626140594482, "learning_rate": 6.344826710210584e-07, "loss": 1.2975, "step": 1094 }, { "epoch": 0.8901534396910883, "grad_norm": 4.5692572593688965, "learning_rate": 6.252844709744255e-07, "loss": 1.1853, "step": 1095 }, { "epoch": 0.890966365206788, "grad_norm": 3.4114434719085693, "learning_rate": 6.161512804097436e-07, "loss": 1.2067, "step": 1096 }, { "epoch": 0.8917792907224875, "grad_norm": 7.298144340515137, "learning_rate": 6.070831626616236e-07, "loss": 1.2149, "step": 1097 }, { "epoch": 0.8925922162381872, "grad_norm": 2.7437572479248047, "learning_rate": 5.980801806134318e-07, "loss": 1.2002, "step": 1098 }, { "epoch": 0.8934051417538869, "grad_norm": 3.101397752761841, "learning_rate": 5.891423966968413e-07, "loss": 1.2594, "step": 1099 }, { "epoch": 0.8942180672695864, "grad_norm": 3.186479091644287, "learning_rate": 5.80269872891408e-07, "loss": 1.1895, "step": 1100 }, { "epoch": 0.8950309927852861, "grad_norm": 3.5605878829956055, "learning_rate": 5.714626707241411e-07, "loss": 1.1804, "step": 1101 }, { "epoch": 0.8958439183009856, "grad_norm": 3.0213913917541504, "learning_rate": 5.627208512690641e-07, "loss": 1.2619, "step": 1102 }, { "epoch": 0.8966568438166853, "grad_norm": 3.0476791858673096, "learning_rate": 5.5404447514681e-07, "loss": 1.1429, "step": 1103 }, { "epoch": 0.8974697693323849, "grad_norm": 2.9802823066711426, "learning_rate": 5.45433602524188e-07, "loss": 1.2353, "step": 1104 }, { "epoch": 0.8982826948480845, "grad_norm": 3.168029308319092, "learning_rate": 5.368882931137675e-07, "loss": 1.1771, "step": 1105 }, { "epoch": 0.8990956203637842, "grad_norm": 2.8624963760375977, "learning_rate": 5.284086061734672e-07, "loss": 1.1929, "step": 1106 }, { "epoch": 0.8999085458794838, "grad_norm": 3.3826193809509277, "learning_rate": 5.199946005061462e-07, "loss": 1.1379, "step": 1107 }, { "epoch": 0.9007214713951834, "grad_norm": 3.2084782123565674, "learning_rate": 5.116463344591893e-07, "loss": 1.1694, "step": 1108 }, { "epoch": 0.901534396910883, "grad_norm": 3.6624932289123535, "learning_rate": 5.033638659241102e-07, "loss": 1.219, "step": 1109 }, { "epoch": 0.9023473224265827, "grad_norm": 3.2314536571502686, "learning_rate": 4.951472523361401e-07, "loss": 1.2457, "step": 1110 }, { "epoch": 0.9031602479422823, "grad_norm": 3.1179494857788086, "learning_rate": 4.869965506738416e-07, "loss": 1.232, "step": 1111 }, { "epoch": 0.9039731734579819, "grad_norm": 2.875725030899048, "learning_rate": 4.789118174587071e-07, "loss": 1.2515, "step": 1112 }, { "epoch": 0.9047860989736816, "grad_norm": 2.5742199420928955, "learning_rate": 4.7089310875475856e-07, "loss": 1.2554, "step": 1113 }, { "epoch": 0.9055990244893811, "grad_norm": 3.2250759601593018, "learning_rate": 4.6294048016817917e-07, "loss": 1.2281, "step": 1114 }, { "epoch": 0.9064119500050808, "grad_norm": 2.866562843322754, "learning_rate": 4.550539868469106e-07, "loss": 1.2559, "step": 1115 }, { "epoch": 0.9072248755207805, "grad_norm": 2.9703938961029053, "learning_rate": 4.4723368348027375e-07, "loss": 1.307, "step": 1116 }, { "epoch": 0.90803780103648, "grad_norm": 3.0078420639038086, "learning_rate": 4.394796242985933e-07, "loss": 1.2285, "step": 1117 }, { "epoch": 0.9088507265521797, "grad_norm": 3.0581750869750977, "learning_rate": 4.317918630728235e-07, "loss": 1.1751, "step": 1118 }, { "epoch": 0.9096636520678792, "grad_norm": 4.224788188934326, "learning_rate": 4.241704531141633e-07, "loss": 1.155, "step": 1119 }, { "epoch": 0.9104765775835789, "grad_norm": 3.2800920009613037, "learning_rate": 4.166154472737061e-07, "loss": 1.199, "step": 1120 }, { "epoch": 0.9112895030992785, "grad_norm": 5.579473495483398, "learning_rate": 4.091268979420537e-07, "loss": 1.1558, "step": 1121 }, { "epoch": 0.9121024286149781, "grad_norm": 3.660987615585327, "learning_rate": 4.0170485704896453e-07, "loss": 1.2258, "step": 1122 }, { "epoch": 0.9129153541306778, "grad_norm": 11.064430236816406, "learning_rate": 3.943493760629924e-07, "loss": 1.1699, "step": 1123 }, { "epoch": 0.9137282796463774, "grad_norm": 4.9747138023376465, "learning_rate": 3.8706050599112363e-07, "loss": 1.2415, "step": 1124 }, { "epoch": 0.914541205162077, "grad_norm": 3.7896888256073, "learning_rate": 3.798382973784298e-07, "loss": 1.2221, "step": 1125 }, { "epoch": 0.9153541306777766, "grad_norm": 3.383769989013672, "learning_rate": 3.7268280030771655e-07, "loss": 1.196, "step": 1126 }, { "epoch": 0.9161670561934763, "grad_norm": 3.491272211074829, "learning_rate": 3.655940643991718e-07, "loss": 1.1786, "step": 1127 }, { "epoch": 0.9169799817091759, "grad_norm": 3.1759097576141357, "learning_rate": 3.585721388100283e-07, "loss": 1.1696, "step": 1128 }, { "epoch": 0.9177929072248755, "grad_norm": 2.7568089962005615, "learning_rate": 3.516170722342127e-07, "loss": 1.1703, "step": 1129 }, { "epoch": 0.9186058327405752, "grad_norm": 2.992725372314453, "learning_rate": 3.4472891290201927e-07, "loss": 1.1739, "step": 1130 }, { "epoch": 0.9194187582562747, "grad_norm": 4.317306041717529, "learning_rate": 3.3790770857976995e-07, "loss": 1.184, "step": 1131 }, { "epoch": 0.9202316837719744, "grad_norm": 3.9048075675964355, "learning_rate": 3.3115350656948043e-07, "loss": 1.2651, "step": 1132 }, { "epoch": 0.9210446092876741, "grad_norm": 3.3990674018859863, "learning_rate": 3.2446635370853686e-07, "loss": 1.205, "step": 1133 }, { "epoch": 0.9218575348033736, "grad_norm": 4.0517754554748535, "learning_rate": 3.1784629636937404e-07, "loss": 1.1996, "step": 1134 }, { "epoch": 0.9226704603190733, "grad_norm": 3.340564489364624, "learning_rate": 3.1129338045914004e-07, "loss": 1.2215, "step": 1135 }, { "epoch": 0.9234833858347729, "grad_norm": 3.5760183334350586, "learning_rate": 3.0480765141939316e-07, "loss": 1.2191, "step": 1136 }, { "epoch": 0.9242963113504725, "grad_norm": 2.8496994972229004, "learning_rate": 2.9838915422578e-07, "loss": 1.2217, "step": 1137 }, { "epoch": 0.9251092368661722, "grad_norm": 3.025475025177002, "learning_rate": 2.920379333877221e-07, "loss": 1.2332, "step": 1138 }, { "epoch": 0.9259221623818717, "grad_norm": 4.238699436187744, "learning_rate": 2.8575403294811123e-07, "loss": 1.2223, "step": 1139 }, { "epoch": 0.9267350878975714, "grad_norm": 2.9650015830993652, "learning_rate": 2.795374964830022e-07, "loss": 1.2149, "step": 1140 }, { "epoch": 0.927548013413271, "grad_norm": 2.731064796447754, "learning_rate": 2.733883671013082e-07, "loss": 1.2116, "step": 1141 }, { "epoch": 0.9283609389289706, "grad_norm": 4.153676986694336, "learning_rate": 2.673066874445096e-07, "loss": 1.1189, "step": 1142 }, { "epoch": 0.9291738644446702, "grad_norm": 3.843541383743286, "learning_rate": 2.612924996863453e-07, "loss": 1.1933, "step": 1143 }, { "epoch": 0.9299867899603699, "grad_norm": 3.0720019340515137, "learning_rate": 2.5534584553253526e-07, "loss": 1.1859, "step": 1144 }, { "epoch": 0.9307997154760695, "grad_norm": 3.4368112087249756, "learning_rate": 2.494667662204797e-07, "loss": 1.22, "step": 1145 }, { "epoch": 0.9316126409917691, "grad_norm": 2.524754285812378, "learning_rate": 2.436553025189758e-07, "loss": 1.2561, "step": 1146 }, { "epoch": 0.9324255665074688, "grad_norm": 3.2625484466552734, "learning_rate": 2.3791149472794373e-07, "loss": 1.2026, "step": 1147 }, { "epoch": 0.9332384920231683, "grad_norm": 3.4842891693115234, "learning_rate": 2.3223538267813317e-07, "loss": 1.234, "step": 1148 }, { "epoch": 0.934051417538868, "grad_norm": 2.9896857738494873, "learning_rate": 2.2662700573085505e-07, "loss": 1.2008, "step": 1149 }, { "epoch": 0.9348643430545677, "grad_norm": 3.3465092182159424, "learning_rate": 2.2108640277771153e-07, "loss": 1.2392, "step": 1150 }, { "epoch": 0.9356772685702672, "grad_norm": 2.6980130672454834, "learning_rate": 2.156136122403174e-07, "loss": 1.2083, "step": 1151 }, { "epoch": 0.9364901940859669, "grad_norm": 3.4942784309387207, "learning_rate": 2.1020867207004026e-07, "loss": 1.2232, "step": 1152 }, { "epoch": 0.9373031196016665, "grad_norm": 2.874210834503174, "learning_rate": 2.048716197477374e-07, "loss": 1.2447, "step": 1153 }, { "epoch": 0.9381160451173661, "grad_norm": 3.429757833480835, "learning_rate": 1.996024922834905e-07, "loss": 1.1562, "step": 1154 }, { "epoch": 0.9389289706330658, "grad_norm": 2.96549654006958, "learning_rate": 1.9440132621635687e-07, "loss": 1.2543, "step": 1155 }, { "epoch": 0.9397418961487654, "grad_norm": 3.1660540103912354, "learning_rate": 1.8926815761410867e-07, "loss": 1.1931, "step": 1156 }, { "epoch": 0.940554821664465, "grad_norm": 2.848574161529541, "learning_rate": 1.8420302207298623e-07, "loss": 1.1837, "step": 1157 }, { "epoch": 0.9413677471801646, "grad_norm": 4.005343437194824, "learning_rate": 1.792059547174507e-07, "loss": 1.2423, "step": 1158 }, { "epoch": 0.9421806726958643, "grad_norm": 2.7809975147247314, "learning_rate": 1.7427699019994415e-07, "loss": 1.1665, "step": 1159 }, { "epoch": 0.9429935982115638, "grad_norm": 4.211681365966797, "learning_rate": 1.6941616270063854e-07, "loss": 1.2526, "step": 1160 }, { "epoch": 0.9438065237272635, "grad_norm": 4.117452144622803, "learning_rate": 1.6462350592721498e-07, "loss": 1.1957, "step": 1161 }, { "epoch": 0.9446194492429631, "grad_norm": 2.9959964752197266, "learning_rate": 1.5989905311461274e-07, "loss": 1.2342, "step": 1162 }, { "epoch": 0.9454323747586627, "grad_norm": 3.091280460357666, "learning_rate": 1.5524283702481158e-07, "loss": 1.2168, "step": 1163 }, { "epoch": 0.9462453002743624, "grad_norm": 4.000481128692627, "learning_rate": 1.5065488994659983e-07, "loss": 1.2206, "step": 1164 }, { "epoch": 0.9470582257900619, "grad_norm": 3.2974343299865723, "learning_rate": 1.461352436953478e-07, "loss": 1.1955, "step": 1165 }, { "epoch": 0.9478711513057616, "grad_norm": 3.589606285095215, "learning_rate": 1.4168392961279254e-07, "loss": 1.1277, "step": 1166 }, { "epoch": 0.9486840768214613, "grad_norm": 3.071859121322632, "learning_rate": 1.3730097856681668e-07, "loss": 1.1837, "step": 1167 }, { "epoch": 0.9494970023371608, "grad_norm": 3.4584462642669678, "learning_rate": 1.329864209512377e-07, "loss": 1.249, "step": 1168 }, { "epoch": 0.9503099278528605, "grad_norm": 4.1693434715271, "learning_rate": 1.2874028668559247e-07, "loss": 1.2234, "step": 1169 }, { "epoch": 0.9511228533685601, "grad_norm": 3.1776278018951416, "learning_rate": 1.245626052149318e-07, "loss": 1.2047, "step": 1170 }, { "epoch": 0.9519357788842597, "grad_norm": 3.347137689590454, "learning_rate": 1.2045340550961958e-07, "loss": 1.2995, "step": 1171 }, { "epoch": 0.9527487043999594, "grad_norm": 3.2806451320648193, "learning_rate": 1.164127160651285e-07, "loss": 1.1546, "step": 1172 }, { "epoch": 0.953561629915659, "grad_norm": 4.498492240905762, "learning_rate": 1.1244056490184008e-07, "loss": 1.2469, "step": 1173 }, { "epoch": 0.9543745554313586, "grad_norm": 3.0195493698120117, "learning_rate": 1.0853697956485942e-07, "loss": 1.2373, "step": 1174 }, { "epoch": 0.9551874809470582, "grad_norm": 4.176177501678467, "learning_rate": 1.0470198712381086e-07, "loss": 1.2486, "step": 1175 }, { "epoch": 0.9560004064627579, "grad_norm": 3.222987413406372, "learning_rate": 1.009356141726614e-07, "loss": 1.1905, "step": 1176 }, { "epoch": 0.9568133319784575, "grad_norm": 2.6555376052856445, "learning_rate": 9.723788682953539e-08, "loss": 1.1666, "step": 1177 }, { "epoch": 0.9576262574941571, "grad_norm": 4.015134334564209, "learning_rate": 9.360883073652238e-08, "loss": 1.2675, "step": 1178 }, { "epoch": 0.9584391830098568, "grad_norm": 3.029994487762451, "learning_rate": 9.004847105951509e-08, "loss": 1.1977, "step": 1179 }, { "epoch": 0.9592521085255563, "grad_norm": 2.7363007068634033, "learning_rate": 8.655683248802282e-08, "loss": 1.2359, "step": 1180 }, { "epoch": 0.960065034041256, "grad_norm": 4.360199451446533, "learning_rate": 8.313393923500613e-08, "loss": 1.2099, "step": 1181 }, { "epoch": 0.9608779595569555, "grad_norm": 2.9082043170928955, "learning_rate": 7.977981503670795e-08, "loss": 1.2632, "step": 1182 }, { "epoch": 0.9616908850726552, "grad_norm": 3.0049242973327637, "learning_rate": 7.64944831524872e-08, "loss": 1.2128, "step": 1183 }, { "epoch": 0.9625038105883549, "grad_norm": 2.9180142879486084, "learning_rate": 7.327796636465767e-08, "loss": 1.2075, "step": 1184 }, { "epoch": 0.9633167361040544, "grad_norm": 2.8545587062835693, "learning_rate": 7.01302869783338e-08, "loss": 1.1809, "step": 1185 }, { "epoch": 0.9641296616197541, "grad_norm": 3.2359890937805176, "learning_rate": 6.705146682127184e-08, "loss": 1.2404, "step": 1186 }, { "epoch": 0.9649425871354537, "grad_norm": 7.442730903625488, "learning_rate": 6.404152724371892e-08, "loss": 1.2081, "step": 1187 }, { "epoch": 0.9657555126511533, "grad_norm": 2.9155330657958984, "learning_rate": 6.110048911826871e-08, "loss": 1.1837, "step": 1188 }, { "epoch": 0.966568438166853, "grad_norm": 5.689270496368408, "learning_rate": 5.82283728397115e-08, "loss": 1.2039, "step": 1189 }, { "epoch": 0.9673813636825526, "grad_norm": 2.791161060333252, "learning_rate": 5.542519832489546e-08, "loss": 1.2032, "step": 1190 }, { "epoch": 0.9681942891982522, "grad_norm": 3.127793312072754, "learning_rate": 5.269098501259007e-08, "loss": 1.2016, "step": 1191 }, { "epoch": 0.9690072147139518, "grad_norm": 2.8209614753723145, "learning_rate": 5.002575186334735e-08, "loss": 1.1624, "step": 1192 }, { "epoch": 0.9698201402296515, "grad_norm": 3.3611080646514893, "learning_rate": 4.742951735937418e-08, "loss": 1.2068, "step": 1193 }, { "epoch": 0.9706330657453511, "grad_norm": 5.118293285369873, "learning_rate": 4.490229950440239e-08, "loss": 1.2398, "step": 1194 }, { "epoch": 0.9714459912610507, "grad_norm": 9.395883560180664, "learning_rate": 4.2444115823562226e-08, "loss": 1.3143, "step": 1195 }, { "epoch": 0.9722589167767504, "grad_norm": 3.1017065048217773, "learning_rate": 4.005498336326463e-08, "loss": 1.1918, "step": 1196 }, { "epoch": 0.9730718422924499, "grad_norm": 3.226966142654419, "learning_rate": 3.773491869108137e-08, "loss": 1.2046, "step": 1197 }, { "epoch": 0.9738847678081496, "grad_norm": 3.233693838119507, "learning_rate": 3.548393789562732e-08, "loss": 1.2325, "step": 1198 }, { "epoch": 0.9746976933238493, "grad_norm": 3.159299612045288, "learning_rate": 3.3302056586453916e-08, "loss": 1.1693, "step": 1199 }, { "epoch": 0.9755106188395488, "grad_norm": 2.7059924602508545, "learning_rate": 3.118928989393699e-08, "loss": 1.2422, "step": 1200 }, { "epoch": 0.9763235443552485, "grad_norm": 3.511061668395996, "learning_rate": 2.9145652469174666e-08, "loss": 1.2184, "step": 1201 }, { "epoch": 0.977136469870948, "grad_norm": 4.077070236206055, "learning_rate": 2.7171158483882963e-08, "loss": 1.2309, "step": 1202 }, { "epoch": 0.9779493953866477, "grad_norm": 3.434537887573242, "learning_rate": 2.5265821630298116e-08, "loss": 1.1943, "step": 1203 }, { "epoch": 0.9787623209023473, "grad_norm": 3.698641300201416, "learning_rate": 2.3429655121085525e-08, "loss": 1.2671, "step": 1204 }, { "epoch": 0.9795752464180469, "grad_norm": 6.674719333648682, "learning_rate": 2.1662671689242076e-08, "loss": 1.1961, "step": 1205 }, { "epoch": 0.9803881719337466, "grad_norm": 4.9146952629089355, "learning_rate": 1.996488358801174e-08, "loss": 1.2345, "step": 1206 }, { "epoch": 0.9812010974494462, "grad_norm": 2.7147114276885986, "learning_rate": 1.8336302590798992e-08, "loss": 1.2118, "step": 1207 }, { "epoch": 0.9820140229651458, "grad_norm": 2.809692859649658, "learning_rate": 1.677693999109109e-08, "loss": 1.2162, "step": 1208 }, { "epoch": 0.9828269484808454, "grad_norm": 3.857846975326538, "learning_rate": 1.5286806602372583e-08, "loss": 1.1792, "step": 1209 }, { "epoch": 0.9836398739965451, "grad_norm": 3.8911325931549072, "learning_rate": 1.3865912758054267e-08, "loss": 1.2332, "step": 1210 }, { "epoch": 0.9844527995122447, "grad_norm": 3.5572190284729004, "learning_rate": 1.2514268311405452e-08, "loss": 1.2174, "step": 1211 }, { "epoch": 0.9852657250279443, "grad_norm": 3.22208833694458, "learning_rate": 1.1231882635477364e-08, "loss": 1.2146, "step": 1212 }, { "epoch": 0.986078650543644, "grad_norm": 4.469923973083496, "learning_rate": 1.0018764623045407e-08, "loss": 1.2168, "step": 1213 }, { "epoch": 0.9868915760593435, "grad_norm": 3.1559510231018066, "learning_rate": 8.874922686541442e-09, "loss": 1.2074, "step": 1214 }, { "epoch": 0.9877045015750432, "grad_norm": 2.6890878677368164, "learning_rate": 7.800364758002721e-09, "loss": 1.2358, "step": 1215 }, { "epoch": 0.9885174270907429, "grad_norm": 3.4091622829437256, "learning_rate": 6.795098289008595e-09, "loss": 1.2484, "step": 1216 }, { "epoch": 0.9893303526064424, "grad_norm": 3.0762569904327393, "learning_rate": 5.859130250636113e-09, "loss": 1.1787, "step": 1217 }, { "epoch": 0.9901432781221421, "grad_norm": 2.616163492202759, "learning_rate": 4.992467133406731e-09, "loss": 1.2092, "step": 1218 }, { "epoch": 0.9909562036378416, "grad_norm": 3.0248591899871826, "learning_rate": 4.195114947244117e-09, "loss": 1.1998, "step": 1219 }, { "epoch": 0.9917691291535413, "grad_norm": 5.664068698883057, "learning_rate": 3.4670792214297476e-09, "loss": 1.2539, "step": 1220 }, { "epoch": 0.9925820546692409, "grad_norm": 3.449087619781494, "learning_rate": 2.808365004569602e-09, "loss": 1.2463, "step": 1221 }, { "epoch": 0.9933949801849405, "grad_norm": 2.958399534225464, "learning_rate": 2.2189768645519693e-09, "loss": 1.2076, "step": 1222 }, { "epoch": 0.9942079057006402, "grad_norm": 3.4361188411712646, "learning_rate": 1.6989188885219165e-09, "loss": 1.2436, "step": 1223 }, { "epoch": 0.9950208312163398, "grad_norm": 3.0529403686523438, "learning_rate": 1.2481946828502011e-09, "loss": 1.1955, "step": 1224 }, { "epoch": 0.9958337567320394, "grad_norm": 3.090090274810791, "learning_rate": 8.668073731088467e-10, "loss": 1.1455, "step": 1225 }, { "epoch": 0.996646682247739, "grad_norm": 3.2662580013275146, "learning_rate": 5.547596040489378e-10, "loss": 1.2283, "step": 1226 }, { "epoch": 0.9974596077634387, "grad_norm": 2.7874884605407715, "learning_rate": 3.1205353958285724e-10, "loss": 1.2011, "step": 1227 }, { "epoch": 0.9982725332791383, "grad_norm": 2.9483141899108887, "learning_rate": 1.3869086276985243e-10, "loss": 1.272, "step": 1228 }, { "epoch": 0.9990854587948379, "grad_norm": 3.550588607788086, "learning_rate": 3.467277580271322e-11, "loss": 1.1665, "step": 1229 }, { "epoch": 0.9998983843105376, "grad_norm": 3.500861406326294, "learning_rate": 0.0, "loss": 1.2382, "step": 1230 }, { "epoch": 0.9998983843105376, "step": 1230, "total_flos": 3.1215366383127757e+18, "train_loss": 1.3087712280149382, "train_runtime": 25084.8125, "train_samples_per_second": 6.277, "train_steps_per_second": 0.049 } ], "logging_steps": 1.0, "max_steps": 1230, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 7975, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1215366383127757e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }