{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8503401360544217, "eval_steps": 42, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017006802721088435, "eval_loss": 3.1163904666900635, "eval_runtime": 34.6465, "eval_samples_per_second": 28.574, "eval_steps_per_second": 3.579, "step": 1 }, { "epoch": 0.00510204081632653, "grad_norm": 4.4349870681762695, "learning_rate": 3e-05, "loss": 3.1357, "step": 3 }, { "epoch": 0.01020408163265306, "grad_norm": 2.734741687774658, "learning_rate": 6e-05, "loss": 2.8293, "step": 6 }, { "epoch": 0.015306122448979591, "grad_norm": 3.007612943649292, "learning_rate": 9e-05, "loss": 2.3471, "step": 9 }, { "epoch": 0.02040816326530612, "grad_norm": 2.0711071491241455, "learning_rate": 9.999588943391597e-05, "loss": 2.2856, "step": 12 }, { "epoch": 0.025510204081632654, "grad_norm": 1.5911531448364258, "learning_rate": 9.99743108100344e-05, "loss": 2.0994, "step": 15 }, { "epoch": 0.030612244897959183, "grad_norm": 1.4815939664840698, "learning_rate": 9.993424445916923e-05, "loss": 2.1158, "step": 18 }, { "epoch": 0.03571428571428571, "grad_norm": 1.3731520175933838, "learning_rate": 9.987570520365104e-05, "loss": 1.9549, "step": 21 }, { "epoch": 0.04081632653061224, "grad_norm": 1.3747628927230835, "learning_rate": 9.979871469976196e-05, "loss": 1.9406, "step": 24 }, { "epoch": 0.04591836734693878, "grad_norm": 1.5050859451293945, "learning_rate": 9.970330142972401e-05, "loss": 1.8567, "step": 27 }, { "epoch": 0.05102040816326531, "grad_norm": 1.4547606706619263, "learning_rate": 9.95895006911623e-05, "loss": 1.8046, "step": 30 }, { "epoch": 0.05612244897959184, "grad_norm": 1.4084906578063965, "learning_rate": 9.945735458404681e-05, "loss": 1.7448, "step": 33 }, { "epoch": 0.061224489795918366, "grad_norm": 1.402061939239502, "learning_rate": 9.930691199511775e-05, "loss": 1.6982, "step": 36 }, { "epoch": 0.0663265306122449, "grad_norm": 1.4800869226455688, "learning_rate": 9.91382285798002e-05, "loss": 1.8587, "step": 39 }, { "epoch": 0.07142857142857142, "grad_norm": 1.3802098035812378, "learning_rate": 9.895136674161465e-05, "loss": 1.7339, "step": 42 }, { "epoch": 0.07142857142857142, "eval_loss": 1.721724033355713, "eval_runtime": 35.1374, "eval_samples_per_second": 28.175, "eval_steps_per_second": 3.529, "step": 42 }, { "epoch": 0.07653061224489796, "grad_norm": 1.3696004152297974, "learning_rate": 9.874639560909117e-05, "loss": 1.7873, "step": 45 }, { "epoch": 0.08163265306122448, "grad_norm": 1.4703681468963623, "learning_rate": 9.852339101019574e-05, "loss": 1.6507, "step": 48 }, { "epoch": 0.08673469387755102, "grad_norm": 1.4257506132125854, "learning_rate": 9.828243544427796e-05, "loss": 1.7397, "step": 51 }, { "epoch": 0.09183673469387756, "grad_norm": 1.434550166130066, "learning_rate": 9.802361805155097e-05, "loss": 1.6817, "step": 54 }, { "epoch": 0.09693877551020408, "grad_norm": 1.4836714267730713, "learning_rate": 9.774703458011453e-05, "loss": 1.6046, "step": 57 }, { "epoch": 0.10204081632653061, "grad_norm": 1.401002287864685, "learning_rate": 9.745278735053343e-05, "loss": 1.7368, "step": 60 }, { "epoch": 0.10714285714285714, "grad_norm": 1.5878403186798096, "learning_rate": 9.714098521798465e-05, "loss": 1.623, "step": 63 }, { "epoch": 0.11224489795918367, "grad_norm": 1.8310233354568481, "learning_rate": 9.681174353198687e-05, "loss": 1.608, "step": 66 }, { "epoch": 0.11734693877551021, "grad_norm": 1.4681365489959717, "learning_rate": 9.64651840937276e-05, "loss": 1.6437, "step": 69 }, { "epoch": 0.12244897959183673, "grad_norm": 1.3459882736206055, "learning_rate": 9.610143511100354e-05, "loss": 1.5577, "step": 72 }, { "epoch": 0.12755102040816327, "grad_norm": 1.3524380922317505, "learning_rate": 9.572063115079063e-05, "loss": 1.5852, "step": 75 }, { "epoch": 0.1326530612244898, "grad_norm": 1.301050066947937, "learning_rate": 9.53229130894619e-05, "loss": 1.5688, "step": 78 }, { "epoch": 0.1377551020408163, "grad_norm": 1.4519400596618652, "learning_rate": 9.490842806067095e-05, "loss": 1.6067, "step": 81 }, { "epoch": 0.14285714285714285, "grad_norm": 1.2706557512283325, "learning_rate": 9.44773294009206e-05, "loss": 1.4983, "step": 84 }, { "epoch": 0.14285714285714285, "eval_loss": 1.5768018960952759, "eval_runtime": 35.1674, "eval_samples_per_second": 28.151, "eval_steps_per_second": 3.526, "step": 84 }, { "epoch": 0.14795918367346939, "grad_norm": 1.302090048789978, "learning_rate": 9.40297765928369e-05, "loss": 1.523, "step": 87 }, { "epoch": 0.15306122448979592, "grad_norm": 1.5333181619644165, "learning_rate": 9.356593520616948e-05, "loss": 1.6106, "step": 90 }, { "epoch": 0.15816326530612246, "grad_norm": 1.360306739807129, "learning_rate": 9.308597683653975e-05, "loss": 1.5755, "step": 93 }, { "epoch": 0.16326530612244897, "grad_norm": 1.4102718830108643, "learning_rate": 9.259007904196023e-05, "loss": 1.622, "step": 96 }, { "epoch": 0.1683673469387755, "grad_norm": 1.19460928440094, "learning_rate": 9.207842527714767e-05, "loss": 1.5597, "step": 99 }, { "epoch": 0.17346938775510204, "grad_norm": 1.374776005744934, "learning_rate": 9.155120482565521e-05, "loss": 1.6207, "step": 102 }, { "epoch": 0.17857142857142858, "grad_norm": 1.337540626525879, "learning_rate": 9.10086127298478e-05, "loss": 1.5364, "step": 105 }, { "epoch": 0.1836734693877551, "grad_norm": 1.4444150924682617, "learning_rate": 9.045084971874738e-05, "loss": 1.5978, "step": 108 }, { "epoch": 0.18877551020408162, "grad_norm": 1.2942811250686646, "learning_rate": 8.987812213377424e-05, "loss": 1.5385, "step": 111 }, { "epoch": 0.19387755102040816, "grad_norm": 1.314165711402893, "learning_rate": 8.929064185241213e-05, "loss": 1.4721, "step": 114 }, { "epoch": 0.1989795918367347, "grad_norm": 1.37766432762146, "learning_rate": 8.868862620982534e-05, "loss": 1.5167, "step": 117 }, { "epoch": 0.20408163265306123, "grad_norm": 1.3688915967941284, "learning_rate": 8.807229791845673e-05, "loss": 1.516, "step": 120 }, { "epoch": 0.20918367346938777, "grad_norm": 1.2788846492767334, "learning_rate": 8.744188498563641e-05, "loss": 1.4704, "step": 123 }, { "epoch": 0.21428571428571427, "grad_norm": 1.3125512599945068, "learning_rate": 8.679762062923175e-05, "loss": 1.4922, "step": 126 }, { "epoch": 0.21428571428571427, "eval_loss": 1.512025237083435, "eval_runtime": 35.1759, "eval_samples_per_second": 28.144, "eval_steps_per_second": 3.525, "step": 126 }, { "epoch": 0.2193877551020408, "grad_norm": 1.476131796836853, "learning_rate": 8.613974319136958e-05, "loss": 1.4581, "step": 129 }, { "epoch": 0.22448979591836735, "grad_norm": 1.3448469638824463, "learning_rate": 8.54684960502629e-05, "loss": 1.5486, "step": 132 }, { "epoch": 0.22959183673469388, "grad_norm": 1.416334629058838, "learning_rate": 8.478412753017433e-05, "loss": 1.5033, "step": 135 }, { "epoch": 0.23469387755102042, "grad_norm": 1.442419171333313, "learning_rate": 8.408689080954998e-05, "loss": 1.5536, "step": 138 }, { "epoch": 0.23979591836734693, "grad_norm": 1.3300379514694214, "learning_rate": 8.33770438273574e-05, "loss": 1.5355, "step": 141 }, { "epoch": 0.24489795918367346, "grad_norm": 1.331199288368225, "learning_rate": 8.265484918766243e-05, "loss": 1.5124, "step": 144 }, { "epoch": 0.25, "grad_norm": 1.507230281829834, "learning_rate": 8.192057406248028e-05, "loss": 1.5505, "step": 147 }, { "epoch": 0.25510204081632654, "grad_norm": 1.3450610637664795, "learning_rate": 8.117449009293668e-05, "loss": 1.5118, "step": 150 }, { "epoch": 0.2602040816326531, "grad_norm": 1.3266024589538574, "learning_rate": 8.041687328877567e-05, "loss": 1.4466, "step": 153 }, { "epoch": 0.2653061224489796, "grad_norm": 1.4242159128189087, "learning_rate": 7.964800392625129e-05, "loss": 1.4573, "step": 156 }, { "epoch": 0.27040816326530615, "grad_norm": 1.2786240577697754, "learning_rate": 7.886816644444098e-05, "loss": 1.4863, "step": 159 }, { "epoch": 0.2755102040816326, "grad_norm": 1.326027274131775, "learning_rate": 7.807764934001874e-05, "loss": 1.4627, "step": 162 }, { "epoch": 0.28061224489795916, "grad_norm": 1.366431713104248, "learning_rate": 7.727674506052743e-05, "loss": 1.3841, "step": 165 }, { "epoch": 0.2857142857142857, "grad_norm": 1.2552882432937622, "learning_rate": 7.646574989618938e-05, "loss": 1.5041, "step": 168 }, { "epoch": 0.2857142857142857, "eval_loss": 1.4554890394210815, "eval_runtime": 35.2045, "eval_samples_per_second": 28.121, "eval_steps_per_second": 3.522, "step": 168 }, { "epoch": 0.29081632653061223, "grad_norm": 1.4083505868911743, "learning_rate": 7.564496387029532e-05, "loss": 1.5224, "step": 171 }, { "epoch": 0.29591836734693877, "grad_norm": 1.2891498804092407, "learning_rate": 7.481469062821252e-05, "loss": 1.4262, "step": 174 }, { "epoch": 0.3010204081632653, "grad_norm": 1.413690209388733, "learning_rate": 7.39752373250527e-05, "loss": 1.5159, "step": 177 }, { "epoch": 0.30612244897959184, "grad_norm": 1.3442703485488892, "learning_rate": 7.312691451204178e-05, "loss": 1.4575, "step": 180 }, { "epoch": 0.3112244897959184, "grad_norm": 1.348726749420166, "learning_rate": 7.227003602163295e-05, "loss": 1.5348, "step": 183 }, { "epoch": 0.3163265306122449, "grad_norm": 1.338478922843933, "learning_rate": 7.14049188514063e-05, "loss": 1.4526, "step": 186 }, { "epoch": 0.32142857142857145, "grad_norm": 1.3987809419631958, "learning_rate": 7.05318830467969e-05, "loss": 1.3506, "step": 189 }, { "epoch": 0.32653061224489793, "grad_norm": 1.2463765144348145, "learning_rate": 6.965125158269619e-05, "loss": 1.3693, "step": 192 }, { "epoch": 0.33163265306122447, "grad_norm": 1.4282686710357666, "learning_rate": 6.876335024396872e-05, "loss": 1.5498, "step": 195 }, { "epoch": 0.336734693877551, "grad_norm": 1.2759164571762085, "learning_rate": 6.786850750493006e-05, "loss": 1.4268, "step": 198 }, { "epoch": 0.34183673469387754, "grad_norm": 1.4179458618164062, "learning_rate": 6.696705440782938e-05, "loss": 1.457, "step": 201 }, { "epoch": 0.3469387755102041, "grad_norm": 1.480843424797058, "learning_rate": 6.605932444038229e-05, "loss": 1.4279, "step": 204 }, { "epoch": 0.3520408163265306, "grad_norm": 1.2799674272537231, "learning_rate": 6.514565341239861e-05, "loss": 1.4034, "step": 207 }, { "epoch": 0.35714285714285715, "grad_norm": 1.3426426649093628, "learning_rate": 6.422637933155162e-05, "loss": 1.4177, "step": 210 }, { "epoch": 0.35714285714285715, "eval_loss": 1.4206241369247437, "eval_runtime": 35.183, "eval_samples_per_second": 28.139, "eval_steps_per_second": 3.524, "step": 210 }, { "epoch": 0.3622448979591837, "grad_norm": 1.2729734182357788, "learning_rate": 6.330184227833376e-05, "loss": 1.3404, "step": 213 }, { "epoch": 0.3673469387755102, "grad_norm": 1.3893803358078003, "learning_rate": 6.237238428024572e-05, "loss": 1.4067, "step": 216 }, { "epoch": 0.37244897959183676, "grad_norm": 1.3455451726913452, "learning_rate": 6.143834918526527e-05, "loss": 1.3871, "step": 219 }, { "epoch": 0.37755102040816324, "grad_norm": 1.4596010446548462, "learning_rate": 6.0500082534642464e-05, "loss": 1.4465, "step": 222 }, { "epoch": 0.3826530612244898, "grad_norm": 1.4347237348556519, "learning_rate": 5.955793143506863e-05, "loss": 1.4422, "step": 225 }, { "epoch": 0.3877551020408163, "grad_norm": 1.2898458242416382, "learning_rate": 5.861224443026595e-05, "loss": 1.3514, "step": 228 }, { "epoch": 0.39285714285714285, "grad_norm": 1.3033252954483032, "learning_rate": 5.766337137204579e-05, "loss": 1.3825, "step": 231 }, { "epoch": 0.3979591836734694, "grad_norm": 1.4201698303222656, "learning_rate": 5.6711663290882776e-05, "loss": 1.4662, "step": 234 }, { "epoch": 0.4030612244897959, "grad_norm": 1.376613974571228, "learning_rate": 5.575747226605298e-05, "loss": 1.4155, "step": 237 }, { "epoch": 0.40816326530612246, "grad_norm": 1.3725097179412842, "learning_rate": 5.480115129538409e-05, "loss": 1.3242, "step": 240 }, { "epoch": 0.413265306122449, "grad_norm": 1.3945027589797974, "learning_rate": 5.384305416466584e-05, "loss": 1.3763, "step": 243 }, { "epoch": 0.41836734693877553, "grad_norm": 1.3332866430282593, "learning_rate": 5.288353531676873e-05, "loss": 1.4492, "step": 246 }, { "epoch": 0.42346938775510207, "grad_norm": 1.439042091369629, "learning_rate": 5.192294972051992e-05, "loss": 1.4218, "step": 249 }, { "epoch": 0.42857142857142855, "grad_norm": 1.4441425800323486, "learning_rate": 5.0961652739384356e-05, "loss": 1.3857, "step": 252 }, { "epoch": 0.42857142857142855, "eval_loss": 1.3904343843460083, "eval_runtime": 35.1932, "eval_samples_per_second": 28.13, "eval_steps_per_second": 3.523, "step": 252 }, { "epoch": 0.4336734693877551, "grad_norm": 1.393947958946228, "learning_rate": 5e-05, "loss": 1.4025, "step": 255 }, { "epoch": 0.4387755102040816, "grad_norm": 1.2842843532562256, "learning_rate": 4.903834726061565e-05, "loss": 1.3806, "step": 258 }, { "epoch": 0.44387755102040816, "grad_norm": 1.4686455726623535, "learning_rate": 4.807705027948008e-05, "loss": 1.3954, "step": 261 }, { "epoch": 0.4489795918367347, "grad_norm": 1.3992563486099243, "learning_rate": 4.711646468323129e-05, "loss": 1.3417, "step": 264 }, { "epoch": 0.45408163265306123, "grad_norm": 1.3817628622055054, "learning_rate": 4.6156945835334184e-05, "loss": 1.4016, "step": 267 }, { "epoch": 0.45918367346938777, "grad_norm": 1.4665182828903198, "learning_rate": 4.5198848704615914e-05, "loss": 1.3706, "step": 270 }, { "epoch": 0.4642857142857143, "grad_norm": 1.3997013568878174, "learning_rate": 4.424252773394704e-05, "loss": 1.4081, "step": 273 }, { "epoch": 0.46938775510204084, "grad_norm": 1.4313600063323975, "learning_rate": 4.328833670911724e-05, "loss": 1.4345, "step": 276 }, { "epoch": 0.4744897959183674, "grad_norm": 1.2777554988861084, "learning_rate": 4.23366286279542e-05, "loss": 1.3368, "step": 279 }, { "epoch": 0.47959183673469385, "grad_norm": 1.297582745552063, "learning_rate": 4.138775556973406e-05, "loss": 1.411, "step": 282 }, { "epoch": 0.4846938775510204, "grad_norm": 1.409166693687439, "learning_rate": 4.04420685649314e-05, "loss": 1.4427, "step": 285 }, { "epoch": 0.4897959183673469, "grad_norm": 1.392500400543213, "learning_rate": 3.9499917465357534e-05, "loss": 1.4439, "step": 288 }, { "epoch": 0.49489795918367346, "grad_norm": 1.3200541734695435, "learning_rate": 3.856165081473474e-05, "loss": 1.39, "step": 291 }, { "epoch": 0.5, "grad_norm": 1.4324284791946411, "learning_rate": 3.762761571975429e-05, "loss": 1.3971, "step": 294 }, { "epoch": 0.5, "eval_loss": 1.3641271591186523, "eval_runtime": 35.1729, "eval_samples_per_second": 28.147, "eval_steps_per_second": 3.525, "step": 294 }, { "epoch": 0.5051020408163265, "grad_norm": 1.3399404287338257, "learning_rate": 3.6698157721666246e-05, "loss": 1.3819, "step": 297 }, { "epoch": 0.5102040816326531, "grad_norm": 1.3525382280349731, "learning_rate": 3.5773620668448384e-05, "loss": 1.3289, "step": 300 }, { "epoch": 0.5153061224489796, "grad_norm": 1.3362396955490112, "learning_rate": 3.48543465876014e-05, "loss": 1.3518, "step": 303 }, { "epoch": 0.5204081632653061, "grad_norm": 1.4100676774978638, "learning_rate": 3.3940675559617724e-05, "loss": 1.3504, "step": 306 }, { "epoch": 0.5255102040816326, "grad_norm": 1.3914427757263184, "learning_rate": 3.303294559217063e-05, "loss": 1.4025, "step": 309 }, { "epoch": 0.5306122448979592, "grad_norm": 1.3327820301055908, "learning_rate": 3.213149249506997e-05, "loss": 1.4395, "step": 312 }, { "epoch": 0.5357142857142857, "grad_norm": 1.44315505027771, "learning_rate": 3.12366497560313e-05, "loss": 1.3655, "step": 315 }, { "epoch": 0.5408163265306123, "grad_norm": 1.3102085590362549, "learning_rate": 3.0348748417303823e-05, "loss": 1.3711, "step": 318 }, { "epoch": 0.5459183673469388, "grad_norm": 1.31125009059906, "learning_rate": 2.9468116953203107e-05, "loss": 1.2972, "step": 321 }, { "epoch": 0.5510204081632653, "grad_norm": 1.3284008502960205, "learning_rate": 2.8595081148593738e-05, "loss": 1.3539, "step": 324 }, { "epoch": 0.5561224489795918, "grad_norm": 1.274868369102478, "learning_rate": 2.772996397836704e-05, "loss": 1.3535, "step": 327 }, { "epoch": 0.5612244897959183, "grad_norm": 1.3468469381332397, "learning_rate": 2.687308548795825e-05, "loss": 1.3703, "step": 330 }, { "epoch": 0.5663265306122449, "grad_norm": 1.4011824131011963, "learning_rate": 2.6024762674947313e-05, "loss": 1.3328, "step": 333 }, { "epoch": 0.5714285714285714, "grad_norm": 1.2769042253494263, "learning_rate": 2.5185309371787513e-05, "loss": 1.2754, "step": 336 }, { "epoch": 0.5714285714285714, "eval_loss": 1.3424915075302124, "eval_runtime": 35.205, "eval_samples_per_second": 28.121, "eval_steps_per_second": 3.522, "step": 336 }, { "epoch": 0.576530612244898, "grad_norm": 1.3609542846679688, "learning_rate": 2.43550361297047e-05, "loss": 1.3515, "step": 339 }, { "epoch": 0.5816326530612245, "grad_norm": 1.3642148971557617, "learning_rate": 2.353425010381063e-05, "loss": 1.3885, "step": 342 }, { "epoch": 0.5867346938775511, "grad_norm": 1.3652516603469849, "learning_rate": 2.272325493947257e-05, "loss": 1.4246, "step": 345 }, { "epoch": 0.5918367346938775, "grad_norm": 1.3225781917572021, "learning_rate": 2.192235065998126e-05, "loss": 1.3029, "step": 348 }, { "epoch": 0.5969387755102041, "grad_norm": 1.3334158658981323, "learning_rate": 2.1131833555559037e-05, "loss": 1.3672, "step": 351 }, { "epoch": 0.6020408163265306, "grad_norm": 1.4105125665664673, "learning_rate": 2.0351996073748713e-05, "loss": 1.3198, "step": 354 }, { "epoch": 0.6071428571428571, "grad_norm": 1.4976215362548828, "learning_rate": 1.9583126711224343e-05, "loss": 1.3367, "step": 357 }, { "epoch": 0.6122448979591837, "grad_norm": 1.3433520793914795, "learning_rate": 1.8825509907063327e-05, "loss": 1.3276, "step": 360 }, { "epoch": 0.6173469387755102, "grad_norm": 1.3432618379592896, "learning_rate": 1.807942593751973e-05, "loss": 1.366, "step": 363 }, { "epoch": 0.6224489795918368, "grad_norm": 1.4558722972869873, "learning_rate": 1.7345150812337564e-05, "loss": 1.3527, "step": 366 }, { "epoch": 0.6275510204081632, "grad_norm": 1.302122712135315, "learning_rate": 1.66229561726426e-05, "loss": 1.2592, "step": 369 }, { "epoch": 0.6326530612244898, "grad_norm": 1.3466747999191284, "learning_rate": 1.5913109190450032e-05, "loss": 1.3826, "step": 372 }, { "epoch": 0.6377551020408163, "grad_norm": 1.2870639562606812, "learning_rate": 1.5215872469825682e-05, "loss": 1.3761, "step": 375 }, { "epoch": 0.6428571428571429, "grad_norm": 1.2575714588165283, "learning_rate": 1.4531503949737108e-05, "loss": 1.3175, "step": 378 }, { "epoch": 0.6428571428571429, "eval_loss": 1.3251529932022095, "eval_runtime": 35.1821, "eval_samples_per_second": 28.139, "eval_steps_per_second": 3.525, "step": 378 }, { "epoch": 0.6479591836734694, "grad_norm": 1.3623194694519043, "learning_rate": 1.3860256808630428e-05, "loss": 1.2629, "step": 381 }, { "epoch": 0.6530612244897959, "grad_norm": 1.2920513153076172, "learning_rate": 1.3202379370768252e-05, "loss": 1.3591, "step": 384 }, { "epoch": 0.6581632653061225, "grad_norm": 1.3457062244415283, "learning_rate": 1.2558115014363592e-05, "loss": 1.3316, "step": 387 }, { "epoch": 0.6632653061224489, "grad_norm": 1.3173242807388306, "learning_rate": 1.1927702081543279e-05, "loss": 1.3149, "step": 390 }, { "epoch": 0.6683673469387755, "grad_norm": 1.3412364721298218, "learning_rate": 1.1311373790174657e-05, "loss": 1.3339, "step": 393 }, { "epoch": 0.673469387755102, "grad_norm": 1.3552829027175903, "learning_rate": 1.0709358147587884e-05, "loss": 1.383, "step": 396 }, { "epoch": 0.6785714285714286, "grad_norm": 1.2894299030303955, "learning_rate": 1.0121877866225781e-05, "loss": 1.3127, "step": 399 }, { "epoch": 0.6836734693877551, "grad_norm": 1.3170045614242554, "learning_rate": 9.549150281252633e-06, "loss": 1.2562, "step": 402 }, { "epoch": 0.6887755102040817, "grad_norm": 1.3335039615631104, "learning_rate": 8.991387270152201e-06, "loss": 1.3243, "step": 405 }, { "epoch": 0.6938775510204082, "grad_norm": 1.3929165601730347, "learning_rate": 8.448795174344804e-06, "loss": 1.3727, "step": 408 }, { "epoch": 0.6989795918367347, "grad_norm": 1.3173998594284058, "learning_rate": 7.921574722852343e-06, "loss": 1.2642, "step": 411 }, { "epoch": 0.7040816326530612, "grad_norm": 1.4041547775268555, "learning_rate": 7.409920958039795e-06, "loss": 1.3234, "step": 414 }, { "epoch": 0.7091836734693877, "grad_norm": 1.2738025188446045, "learning_rate": 6.9140231634602485e-06, "loss": 1.304, "step": 417 }, { "epoch": 0.7142857142857143, "grad_norm": 1.36890709400177, "learning_rate": 6.43406479383053e-06, "loss": 1.2925, "step": 420 }, { "epoch": 0.7142857142857143, "eval_loss": 1.3164913654327393, "eval_runtime": 35.232, "eval_samples_per_second": 28.099, "eval_steps_per_second": 3.52, "step": 420 }, { "epoch": 0.7193877551020408, "grad_norm": 1.349149227142334, "learning_rate": 5.9702234071631e-06, "loss": 1.2992, "step": 423 }, { "epoch": 0.7244897959183674, "grad_norm": 1.3962756395339966, "learning_rate": 5.5226705990794155e-06, "loss": 1.3636, "step": 426 }, { "epoch": 0.7295918367346939, "grad_norm": 1.3731638193130493, "learning_rate": 5.091571939329048e-06, "loss": 1.2886, "step": 429 }, { "epoch": 0.7346938775510204, "grad_norm": 1.2778152227401733, "learning_rate": 4.677086910538092e-06, "loss": 1.3175, "step": 432 }, { "epoch": 0.7397959183673469, "grad_norm": 1.3971977233886719, "learning_rate": 4.279368849209381e-06, "loss": 1.3454, "step": 435 }, { "epoch": 0.7448979591836735, "grad_norm": 1.2573140859603882, "learning_rate": 3.898564888996476e-06, "loss": 1.2447, "step": 438 }, { "epoch": 0.75, "grad_norm": 1.3133257627487183, "learning_rate": 3.534815906272404e-06, "loss": 1.2851, "step": 441 }, { "epoch": 0.7551020408163265, "grad_norm": 1.426760196685791, "learning_rate": 3.18825646801314e-06, "loss": 1.3463, "step": 444 }, { "epoch": 0.7602040816326531, "grad_norm": 1.2696373462677002, "learning_rate": 2.8590147820153513e-06, "loss": 1.2552, "step": 447 }, { "epoch": 0.7653061224489796, "grad_norm": 1.3841983079910278, "learning_rate": 2.547212649466568e-06, "loss": 1.36, "step": 450 }, { "epoch": 0.7704081632653061, "grad_norm": 1.462028980255127, "learning_rate": 2.2529654198854835e-06, "loss": 1.3425, "step": 453 }, { "epoch": 0.7755102040816326, "grad_norm": 1.3342539072036743, "learning_rate": 1.9763819484490355e-06, "loss": 1.3417, "step": 456 }, { "epoch": 0.7806122448979592, "grad_norm": 1.4027793407440186, "learning_rate": 1.7175645557220566e-06, "loss": 1.3303, "step": 459 }, { "epoch": 0.7857142857142857, "grad_norm": 1.4022960662841797, "learning_rate": 1.4766089898042678e-06, "loss": 1.3069, "step": 462 }, { "epoch": 0.7857142857142857, "eval_loss": 1.3126194477081299, "eval_runtime": 35.1877, "eval_samples_per_second": 28.135, "eval_steps_per_second": 3.524, "step": 462 }, { "epoch": 0.7908163265306123, "grad_norm": 1.4359699487686157, "learning_rate": 1.2536043909088191e-06, "loss": 1.3433, "step": 465 }, { "epoch": 0.7959183673469388, "grad_norm": 1.4439855813980103, "learning_rate": 1.0486332583853563e-06, "loss": 1.3676, "step": 468 }, { "epoch": 0.8010204081632653, "grad_norm": 1.370235800743103, "learning_rate": 8.617714201998084e-07, "loss": 1.2855, "step": 471 }, { "epoch": 0.8061224489795918, "grad_norm": 1.2090818881988525, "learning_rate": 6.93088004882253e-07, "loss": 1.3047, "step": 474 }, { "epoch": 0.8112244897959183, "grad_norm": 1.3970741033554077, "learning_rate": 5.426454159531913e-07, "loss": 1.3344, "step": 477 }, { "epoch": 0.8163265306122449, "grad_norm": 1.4051525592803955, "learning_rate": 4.104993088376974e-07, "loss": 1.3377, "step": 480 }, { "epoch": 0.8214285714285714, "grad_norm": 1.4082175493240356, "learning_rate": 2.966985702759828e-07, "loss": 1.2679, "step": 483 }, { "epoch": 0.826530612244898, "grad_norm": 1.3774138689041138, "learning_rate": 2.012853002380466e-07, "loss": 1.2442, "step": 486 }, { "epoch": 0.8316326530612245, "grad_norm": 1.3718559741973877, "learning_rate": 1.2429479634897267e-07, "loss": 1.3134, "step": 489 }, { "epoch": 0.8367346938775511, "grad_norm": 1.3869266510009766, "learning_rate": 6.575554083078084e-08, "loss": 1.365, "step": 492 }, { "epoch": 0.8418367346938775, "grad_norm": 1.3461697101593018, "learning_rate": 2.568918996560532e-08, "loss": 1.3113, "step": 495 }, { "epoch": 0.8469387755102041, "grad_norm": 1.4545960426330566, "learning_rate": 4.110566084036816e-09, "loss": 1.3021, "step": 498 } ], "logging_steps": 3, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 42, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.02538779410432e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }