{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0040160642570282,
  "eval_steps": 500,
  "global_step": 250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
"log_history": [ |
|
{ |
|
"epoch": 0.004016064257028112, |
|
"grad_norm": 0.7356438636779785, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 2.1205, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008032128514056224, |
|
"grad_norm": 0.6866766810417175, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 2.122, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.012048192771084338, |
|
"grad_norm": 0.7540773153305054, |
|
"learning_rate": 7.2e-06, |
|
"loss": 2.326, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01606425702811245, |
|
"grad_norm": 0.8841753005981445, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 1.9947, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.020080321285140562, |
|
"grad_norm": 0.7485767006874084, |
|
"learning_rate": 1.2e-05, |
|
"loss": 1.9986, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.024096385542168676, |
|
"grad_norm": 0.5316860675811768, |
|
"learning_rate": 1.44e-05, |
|
"loss": 1.9405, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.028112449799196786, |
|
"grad_norm": 0.2008056938648224, |
|
"learning_rate": 1.6800000000000002e-05, |
|
"loss": 1.7299, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0321285140562249, |
|
"grad_norm": 0.08951256424188614, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 1.8174, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.03614457831325301, |
|
"grad_norm": 0.19312520325183868, |
|
"learning_rate": 2.16e-05, |
|
"loss": 1.8569, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.040160642570281124, |
|
"grad_norm": 0.2667342722415924, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.8051, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04417670682730924, |
|
"grad_norm": 0.3407232165336609, |
|
"learning_rate": 2.64e-05, |
|
"loss": 1.8647, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.04819277108433735, |
|
"grad_norm": 0.4737996459007263, |
|
"learning_rate": 2.88e-05, |
|
"loss": 1.7965, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.05220883534136546, |
|
"grad_norm": 0.41904422640800476, |
|
"learning_rate": 3.12e-05, |
|
"loss": 1.8721, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.05622489959839357, |
|
"grad_norm": 0.23236039280891418, |
|
"learning_rate": 3.3600000000000004e-05, |
|
"loss": 2.0646, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.060240963855421686, |
|
"grad_norm": 0.26083624362945557, |
|
"learning_rate": 3.6e-05, |
|
"loss": 1.7836, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0642570281124498, |
|
"grad_norm": 0.35279905796051025, |
|
"learning_rate": 3.8400000000000005e-05, |
|
"loss": 1.8611, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.06827309236947791, |
|
"grad_norm": 0.30636098980903625, |
|
"learning_rate": 4.08e-05, |
|
"loss": 1.9236, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.07228915662650602, |
|
"grad_norm": 0.2782573997974396, |
|
"learning_rate": 4.32e-05, |
|
"loss": 1.6871, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.07630522088353414, |
|
"grad_norm": 0.16966235637664795, |
|
"learning_rate": 4.5600000000000004e-05, |
|
"loss": 1.7482, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.08032128514056225, |
|
"grad_norm": 0.20790238678455353, |
|
"learning_rate": 4.8e-05, |
|
"loss": 1.7494, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08433734939759036, |
|
"grad_norm": 0.1863771229982376, |
|
"learning_rate": 5.04e-05, |
|
"loss": 1.7022, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.08835341365461848, |
|
"grad_norm": 0.1544342190027237, |
|
"learning_rate": 5.28e-05, |
|
"loss": 1.9192, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.09236947791164658, |
|
"grad_norm": 0.1717195063829422, |
|
"learning_rate": 5.520000000000001e-05, |
|
"loss": 2.0797, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0963855421686747, |
|
"grad_norm": 0.13888715207576752, |
|
"learning_rate": 5.76e-05, |
|
"loss": 1.8933, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.10040160642570281, |
|
"grad_norm": 0.12953609228134155, |
|
"learning_rate": 6e-05, |
|
"loss": 1.8774, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.10441767068273092, |
|
"grad_norm": 0.1392662227153778, |
|
"learning_rate": 5.999933829087074e-05, |
|
"loss": 1.9031, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.10843373493975904, |
|
"grad_norm": 0.17272761464118958, |
|
"learning_rate": 5.999735319267354e-05, |
|
"loss": 1.8345, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.11244979919678715, |
|
"grad_norm": 0.17943339049816132, |
|
"learning_rate": 5.999404479297892e-05, |
|
"loss": 1.7537, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.11646586345381527, |
|
"grad_norm": 0.14608968794345856, |
|
"learning_rate": 5.998941323773343e-05, |
|
"loss": 1.8518, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.12048192771084337, |
|
"grad_norm": 0.12942039966583252, |
|
"learning_rate": 5.9983458731253224e-05, |
|
"loss": 1.8527, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12449799196787148, |
|
"grad_norm": 0.1064087525010109, |
|
"learning_rate": 5.9976181536215066e-05, |
|
"loss": 1.7588, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.1285140562248996, |
|
"grad_norm": 0.123220294713974, |
|
"learning_rate": 5.99675819736447e-05, |
|
"loss": 1.7637, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.13253012048192772, |
|
"grad_norm": 0.1729809194803238, |
|
"learning_rate": 5.9957660422902735e-05, |
|
"loss": 1.9683, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.13654618473895583, |
|
"grad_norm": 0.12830182909965515, |
|
"learning_rate": 5.9946417321667896e-05, |
|
"loss": 1.8534, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.14056224899598393, |
|
"grad_norm": 0.10919933766126633, |
|
"learning_rate": 5.993385316591769e-05, |
|
"loss": 1.801, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.14457831325301204, |
|
"grad_norm": 0.10252447426319122, |
|
"learning_rate": 5.991996850990655e-05, |
|
"loss": 1.7462, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.14859437751004015, |
|
"grad_norm": 0.10130741447210312, |
|
"learning_rate": 5.9904763966141394e-05, |
|
"loss": 1.7917, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.15261044176706828, |
|
"grad_norm": 0.14359734952449799, |
|
"learning_rate": 5.9888240205354576e-05, |
|
"loss": 1.5508, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.1566265060240964, |
|
"grad_norm": 0.13955214619636536, |
|
"learning_rate": 5.987039795647432e-05, |
|
"loss": 1.8419, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.1606425702811245, |
|
"grad_norm": 0.1250331997871399, |
|
"learning_rate": 5.985123800659256e-05, |
|
"loss": 1.8584, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1646586345381526, |
|
"grad_norm": 0.09694834798574448, |
|
"learning_rate": 5.983076120093022e-05, |
|
"loss": 1.7475, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.1686746987951807, |
|
"grad_norm": 0.10690533369779587, |
|
"learning_rate": 5.98089684427999e-05, |
|
"loss": 1.881, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.17269076305220885, |
|
"grad_norm": 0.10619474202394485, |
|
"learning_rate": 5.978586069356608e-05, |
|
"loss": 1.7596, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.17670682730923695, |
|
"grad_norm": 0.1079355850815773, |
|
"learning_rate": 5.9761438972602665e-05, |
|
"loss": 1.8796, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.18072289156626506, |
|
"grad_norm": 0.1343262791633606, |
|
"learning_rate": 5.973570435724803e-05, |
|
"loss": 1.8608, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.18473895582329317, |
|
"grad_norm": 0.08072712272405624, |
|
"learning_rate": 5.970865798275751e-05, |
|
"loss": 1.8684, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.18875502008032127, |
|
"grad_norm": 0.08719664812088013, |
|
"learning_rate": 5.968030104225331e-05, |
|
"loss": 1.7377, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1927710843373494, |
|
"grad_norm": 0.1065979152917862, |
|
"learning_rate": 5.9650634786671835e-05, |
|
"loss": 1.9423, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.19678714859437751, |
|
"grad_norm": 0.10489759594202042, |
|
"learning_rate": 5.961966052470857e-05, |
|
"loss": 1.7712, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.20080321285140562, |
|
"grad_norm": 0.08399316668510437, |
|
"learning_rate": 5.9587379622760314e-05, |
|
"loss": 1.9296, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.20481927710843373, |
|
"grad_norm": 0.08468279987573624, |
|
"learning_rate": 5.95537935048649e-05, |
|
"loss": 1.8012, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.20883534136546184, |
|
"grad_norm": 0.08074142038822174, |
|
"learning_rate": 5.9518903652638376e-05, |
|
"loss": 1.8227, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.21285140562248997, |
|
"grad_norm": 0.13968394696712494, |
|
"learning_rate": 5.948271160520967e-05, |
|
"loss": 1.8852, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.21686746987951808, |
|
"grad_norm": 0.14500737190246582, |
|
"learning_rate": 5.944521895915265e-05, |
|
"loss": 1.664, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.22088353413654618, |
|
"grad_norm": 0.15516269207000732, |
|
"learning_rate": 5.9406427368415735e-05, |
|
"loss": 1.8514, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.2248995983935743, |
|
"grad_norm": 0.07931994646787643, |
|
"learning_rate": 5.93663385442489e-05, |
|
"loss": 2.0106, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.2289156626506024, |
|
"grad_norm": 0.07893547415733337, |
|
"learning_rate": 5.932495425512823e-05, |
|
"loss": 1.7823, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.23293172690763053, |
|
"grad_norm": 0.10270941257476807, |
|
"learning_rate": 5.928227632667782e-05, |
|
"loss": 1.7455, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.23694779116465864, |
|
"grad_norm": 0.0862252414226532, |
|
"learning_rate": 5.923830664158934e-05, |
|
"loss": 1.8221, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.24096385542168675, |
|
"grad_norm": 0.09398238360881805, |
|
"learning_rate": 5.919304713953893e-05, |
|
"loss": 1.8726, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.24497991967871485, |
|
"grad_norm": 0.12090852856636047, |
|
"learning_rate": 5.914649981710164e-05, |
|
"loss": 1.9723, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.24899598393574296, |
|
"grad_norm": 0.10450229793787003, |
|
"learning_rate": 5.909866672766334e-05, |
|
"loss": 1.9845, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.25301204819277107, |
|
"grad_norm": 0.10860798507928848, |
|
"learning_rate": 5.904954998133017e-05, |
|
"loss": 1.9028, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.2570281124497992, |
|
"grad_norm": 0.13051600754261017, |
|
"learning_rate": 5.8999151744835415e-05, |
|
"loss": 1.7577, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.26104417670682734, |
|
"grad_norm": 0.15481741726398468, |
|
"learning_rate": 5.894747424144397e-05, |
|
"loss": 1.6495, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.26506024096385544, |
|
"grad_norm": 0.10475904494524002, |
|
"learning_rate": 5.8894519750854215e-05, |
|
"loss": 1.8233, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.26907630522088355, |
|
"grad_norm": 0.08438512682914734, |
|
"learning_rate": 5.8840290609097454e-05, |
|
"loss": 1.9323, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.27309236947791166, |
|
"grad_norm": 0.08830106258392334, |
|
"learning_rate": 5.878478920843492e-05, |
|
"loss": 1.8682, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.27710843373493976, |
|
"grad_norm": 0.09730786830186844, |
|
"learning_rate": 5.872801799725218e-05, |
|
"loss": 1.8755, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.28112449799196787, |
|
"grad_norm": 0.09512544423341751, |
|
"learning_rate": 5.866997947995114e-05, |
|
"loss": 1.6028, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.285140562248996, |
|
"grad_norm": 0.11627932637929916, |
|
"learning_rate": 5.8610676216839586e-05, |
|
"loss": 1.9151, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.2891566265060241, |
|
"grad_norm": 0.13668064773082733, |
|
"learning_rate": 5.855011082401822e-05, |
|
"loss": 1.664, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.2931726907630522, |
|
"grad_norm": 0.10782669484615326, |
|
"learning_rate": 5.848828597326528e-05, |
|
"loss": 1.7598, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.2971887550200803, |
|
"grad_norm": 0.09803726524114609, |
|
"learning_rate": 5.842520439191862e-05, |
|
"loss": 1.5882, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.30120481927710846, |
|
"grad_norm": 0.09667789936065674, |
|
"learning_rate": 5.836086886275548e-05, |
|
"loss": 2.0459, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.30522088353413657, |
|
"grad_norm": 0.10181078314781189, |
|
"learning_rate": 5.829528222386966e-05, |
|
"loss": 1.771, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.3092369477911647, |
|
"grad_norm": 0.08946649730205536, |
|
"learning_rate": 5.8228447368546314e-05, |
|
"loss": 1.7914, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.3132530120481928, |
|
"grad_norm": 0.0814245194196701, |
|
"learning_rate": 5.816036724513439e-05, |
|
"loss": 1.764, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.3172690763052209, |
|
"grad_norm": 0.07315659523010254, |
|
"learning_rate": 5.809104485691649e-05, |
|
"loss": 1.8567, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.321285140562249, |
|
"grad_norm": 0.07435451447963715, |
|
"learning_rate": 5.802048326197644e-05, |
|
"loss": 1.8489, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3253012048192771, |
|
"grad_norm": 0.13348247110843658, |
|
"learning_rate": 5.794868557306433e-05, |
|
"loss": 1.6286, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.3293172690763052, |
|
"grad_norm": 0.0857180655002594, |
|
"learning_rate": 5.787565495745924e-05, |
|
"loss": 1.8996, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.11295263469219208, |
|
"learning_rate": 5.780139463682952e-05, |
|
"loss": 1.5915, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.3373493975903614, |
|
"grad_norm": 0.08790762722492218, |
|
"learning_rate": 5.7725907887090634e-05, |
|
"loss": 1.7598, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.3413654618473896, |
|
"grad_norm": 0.08483127504587173, |
|
"learning_rate": 5.764919803826068e-05, |
|
"loss": 1.862, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3453815261044177, |
|
"grad_norm": 0.08203373849391937, |
|
"learning_rate": 5.757126847431347e-05, |
|
"loss": 1.8408, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.3493975903614458, |
|
"grad_norm": 0.0757412239909172, |
|
"learning_rate": 5.749212263302927e-05, |
|
"loss": 1.8627, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.3534136546184739, |
|
"grad_norm": 0.09138002246618271, |
|
"learning_rate": 5.741176400584313e-05, |
|
"loss": 1.6849, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.357429718875502, |
|
"grad_norm": 0.1019127294421196, |
|
"learning_rate": 5.733019613769086e-05, |
|
"loss": 1.7678, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.3614457831325301, |
|
"grad_norm": 0.09306428581476212, |
|
"learning_rate": 5.7247422626852666e-05, |
|
"loss": 1.7371, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3654618473895582, |
|
"grad_norm": 0.10577791184186935, |
|
"learning_rate": 5.7163447124794405e-05, |
|
"loss": 1.6762, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.36947791164658633, |
|
"grad_norm": 0.12335649132728577, |
|
"learning_rate": 5.707827333600648e-05, |
|
"loss": 1.7191, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.37349397590361444, |
|
"grad_norm": 0.10361435264348984, |
|
"learning_rate": 5.699190501784049e-05, |
|
"loss": 1.8953, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.37751004016064255, |
|
"grad_norm": 0.0855787992477417, |
|
"learning_rate": 5.690434598034338e-05, |
|
"loss": 1.884, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.3815261044176707, |
|
"grad_norm": 0.09389682114124298, |
|
"learning_rate": 5.6815600086089486e-05, |
|
"loss": 1.5828, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.3855421686746988, |
|
"grad_norm": 0.08702408522367477, |
|
"learning_rate": 5.672567125001e-05, |
|
"loss": 1.8452, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.3895582329317269, |
|
"grad_norm": 0.10916967689990997, |
|
"learning_rate": 5.663456343922039e-05, |
|
"loss": 1.6201, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.39357429718875503, |
|
"grad_norm": 0.081611767411232, |
|
"learning_rate": 5.654228067284534e-05, |
|
"loss": 1.9651, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.39759036144578314, |
|
"grad_norm": 0.12812922894954681, |
|
"learning_rate": 5.6448827021841444e-05, |
|
"loss": 1.7838, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.40160642570281124, |
|
"grad_norm": 0.08412948250770569, |
|
"learning_rate": 5.635420660881763e-05, |
|
"loss": 1.737, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.40562248995983935, |
|
"grad_norm": 0.09557431936264038, |
|
"learning_rate": 5.6258423607853306e-05, |
|
"loss": 1.8245, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.40963855421686746, |
|
"grad_norm": 0.10354951024055481, |
|
"learning_rate": 5.616148224431423e-05, |
|
"loss": 1.8694, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.41365461847389556, |
|
"grad_norm": 0.08835772424936295, |
|
"learning_rate": 5.6063386794666075e-05, |
|
"loss": 1.7883, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.41767068273092367, |
|
"grad_norm": 0.10105304419994354, |
|
"learning_rate": 5.596414158628581e-05, |
|
"loss": 1.6835, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.42168674698795183, |
|
"grad_norm": 0.13019445538520813, |
|
"learning_rate": 5.586375099727081e-05, |
|
"loss": 1.8853, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.42570281124497994, |
|
"grad_norm": 0.12298959493637085, |
|
"learning_rate": 5.5762219456245676e-05, |
|
"loss": 1.7237, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.42971887550200805, |
|
"grad_norm": 0.10004926472902298, |
|
"learning_rate": 5.5659551442166924e-05, |
|
"loss": 1.6178, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.43373493975903615, |
|
"grad_norm": 0.11601317673921585, |
|
"learning_rate": 5.555575148412536e-05, |
|
"loss": 1.6619, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.43775100401606426, |
|
"grad_norm": 0.09062661230564117, |
|
"learning_rate": 5.545082416114632e-05, |
|
"loss": 1.7387, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.44176706827309237, |
|
"grad_norm": 0.096687912940979, |
|
"learning_rate": 5.534477410198763e-05, |
|
"loss": 1.8114, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.4457831325301205, |
|
"grad_norm": 0.09439574927091599, |
|
"learning_rate": 5.5237605984935435e-05, |
|
"loss": 1.8035, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.4497991967871486, |
|
"grad_norm": 0.1305311769247055, |
|
"learning_rate": 5.5129324537597846e-05, |
|
"loss": 1.9107, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.4538152610441767, |
|
"grad_norm": 0.09376931935548782, |
|
"learning_rate": 5.5019934536696336e-05, |
|
"loss": 1.8721, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.4578313253012048, |
|
"grad_norm": 0.10925185680389404, |
|
"learning_rate": 5.490944080785507e-05, |
|
"loss": 1.8682, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.46184738955823296, |
|
"grad_norm": 0.1156877651810646, |
|
"learning_rate": 5.479784822538796e-05, |
|
"loss": 1.5833, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.46586345381526106, |
|
"grad_norm": 0.1127604991197586, |
|
"learning_rate": 5.468516171208373e-05, |
|
"loss": 1.6511, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.46987951807228917, |
|
"grad_norm": 0.08820759505033493, |
|
"learning_rate": 5.4571386238988685e-05, |
|
"loss": 1.569, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.4738955823293173, |
|
"grad_norm": 0.11805479973554611, |
|
"learning_rate": 5.445652682518744e-05, |
|
"loss": 1.7633, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.4779116465863454, |
|
"grad_norm": 0.10260842740535736, |
|
"learning_rate": 5.434058853758151e-05, |
|
"loss": 1.5363, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.4819277108433735, |
|
"grad_norm": 0.09707831591367722, |
|
"learning_rate": 5.422357649066577e-05, |
|
"loss": 1.8102, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4859437751004016, |
|
"grad_norm": 0.1374276876449585, |
|
"learning_rate": 5.4105495846302885e-05, |
|
"loss": 1.6286, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.4899598393574297, |
|
"grad_norm": 0.1383811980485916, |
|
"learning_rate": 5.398635181349553e-05, |
|
"loss": 1.8758, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.4939759036144578, |
|
"grad_norm": 0.1613747775554657, |
|
"learning_rate": 5.386614964815666e-05, |
|
"loss": 1.8786, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.4979919678714859, |
|
"grad_norm": 0.12664808332920074, |
|
"learning_rate": 5.3744894652877625e-05, |
|
"loss": 2.0399, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.5020080321285141, |
|
"grad_norm": 0.10178882628679276, |
|
"learning_rate": 5.362259217669424e-05, |
|
"loss": 1.9754, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5060240963855421, |
|
"grad_norm": 0.12219224870204926, |
|
"learning_rate": 5.349924761485084e-05, |
|
"loss": 1.6695, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.5100401606425703, |
|
"grad_norm": 0.11439970135688782, |
|
"learning_rate": 5.3374866408562266e-05, |
|
"loss": 1.9119, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.5140562248995983, |
|
"grad_norm": 0.09417600184679031, |
|
"learning_rate": 5.324945404477383e-05, |
|
"loss": 1.7473, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.5180722891566265, |
|
"grad_norm": 0.09558118879795074, |
|
"learning_rate": 5.3123016055919275e-05, |
|
"loss": 1.4051, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.5220883534136547, |
|
"grad_norm": 0.10001164674758911, |
|
"learning_rate": 5.29955580196767e-05, |
|
"loss": 1.7789, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5261044176706827, |
|
"grad_norm": 0.09575448930263519, |
|
"learning_rate": 5.2867085558722515e-05, |
|
"loss": 1.7881, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.5301204819277109, |
|
"grad_norm": 0.09888429939746857, |
|
"learning_rate": 5.27376043404834e-05, |
|
"loss": 1.7188, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.5341365461847389, |
|
"grad_norm": 0.11489002406597137, |
|
"learning_rate": 5.260712007688631e-05, |
|
"loss": 1.5928, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.5381526104417671, |
|
"grad_norm": 0.10772331058979034, |
|
"learning_rate": 5.247563852410647e-05, |
|
"loss": 1.7395, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.5421686746987951, |
|
"grad_norm": 0.12452363222837448, |
|
"learning_rate": 5.234316548231346e-05, |
|
"loss": 1.7145, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5461847389558233, |
|
"grad_norm": 0.162751242518425, |
|
"learning_rate": 5.220970679541537e-05, |
|
"loss": 1.93, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.5502008032128514, |
|
"grad_norm": 0.09236788004636765, |
|
"learning_rate": 5.207526835080096e-05, |
|
"loss": 1.7755, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.5542168674698795, |
|
"grad_norm": 0.09892769902944565, |
|
"learning_rate": 5.193985607907996e-05, |
|
"loss": 1.8125, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.5582329317269076, |
|
"grad_norm": 0.10891012102365494, |
|
"learning_rate": 5.180347595382148e-05, |
|
"loss": 1.7344, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.5622489959839357, |
|
"grad_norm": 0.10631144046783447, |
|
"learning_rate": 5.166613399129045e-05, |
|
"loss": 1.8008, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5662650602409639, |
|
"grad_norm": 0.10682453960180283, |
|
"learning_rate": 5.1527836250182216e-05, |
|
"loss": 1.8495, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.570281124497992, |
|
"grad_norm": 0.10644228011369705, |
|
"learning_rate": 5.138858883135532e-05, |
|
"loss": 1.7763, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.5742971887550201, |
|
"grad_norm": 0.10813590884208679, |
|
"learning_rate": 5.1248397877562306e-05, |
|
"loss": 1.7689, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.5783132530120482, |
|
"grad_norm": 0.1288304626941681, |
|
"learning_rate": 5.1107269573178765e-05, |
|
"loss": 1.5501, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.5823293172690763, |
|
"grad_norm": 0.09014229476451874, |
|
"learning_rate": 5.096521014393053e-05, |
|
"loss": 2.0159, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5863453815261044, |
|
"grad_norm": 0.11336657404899597, |
|
"learning_rate": 5.082222585661902e-05, |
|
"loss": 1.6987, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.5903614457831325, |
|
"grad_norm": 0.10714301466941833, |
|
"learning_rate": 5.0678323018844784e-05, |
|
"loss": 1.6675, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.5943775100401606, |
|
"grad_norm": 0.1200934424996376, |
|
"learning_rate": 5.0533507978729236e-05, |
|
"loss": 1.7993, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.5983935742971888, |
|
"grad_norm": 0.09276578575372696, |
|
"learning_rate": 5.0387787124634674e-05, |
|
"loss": 1.7931, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.6024096385542169, |
|
"grad_norm": 0.12235195189714432, |
|
"learning_rate": 5.0241166884882394e-05, |
|
"loss": 1.8777, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.606425702811245, |
|
"grad_norm": 0.10112816095352173, |
|
"learning_rate": 5.0093653727469127e-05, |
|
"loss": 1.6646, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.6104417670682731, |
|
"grad_norm": 0.10711057484149933, |
|
"learning_rate": 4.994525415978175e-05, |
|
"loss": 1.6652, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.6144578313253012, |
|
"grad_norm": 0.12113311886787415, |
|
"learning_rate": 4.979597472831016e-05, |
|
"loss": 1.8628, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.6184738955823293, |
|
"grad_norm": 0.10867751389741898, |
|
"learning_rate": 4.964582201835856e-05, |
|
"loss": 1.8879, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.6224899598393574, |
|
"grad_norm": 0.1424121856689453, |
|
"learning_rate": 4.9494802653754846e-05, |
|
"loss": 1.6689, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.6265060240963856, |
|
"grad_norm": 0.11042691767215729, |
|
"learning_rate": 4.934292329655852e-05, |
|
"loss": 1.5745, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.6305220883534136, |
|
"grad_norm": 0.11223538219928741, |
|
"learning_rate": 4.919019064676674e-05, |
|
"loss": 1.7756, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.6345381526104418, |
|
"grad_norm": 0.12498051673173904, |
|
"learning_rate": 4.903661144201873e-05, |
|
"loss": 1.6208, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.6385542168674698, |
|
"grad_norm": 0.11375685781240463, |
|
"learning_rate": 4.888219245729863e-05, |
|
"loss": 1.9195, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.642570281124498, |
|
"grad_norm": 0.11936648935079575, |
|
"learning_rate": 4.872694050463656e-05, |
|
"loss": 1.7615, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6465863453815262, |
|
"grad_norm": 0.14738889038562775, |
|
"learning_rate": 4.857086243280815e-05, |
|
"loss": 1.4813, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.6506024096385542, |
|
"grad_norm": 0.10601120442152023, |
|
"learning_rate": 4.8413965127032396e-05, |
|
"loss": 1.7443, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.6546184738955824, |
|
"grad_norm": 0.16903774440288544, |
|
"learning_rate": 4.825625550866795e-05, |
|
"loss": 1.7467, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.6586345381526104, |
|
"grad_norm": 0.1252775341272354, |
|
"learning_rate": 4.8097740534907745e-05, |
|
"loss": 1.7344, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.6626506024096386, |
|
"grad_norm": 0.09926458448171616, |
|
"learning_rate": 4.793842719847214e-05, |
|
"loss": 1.8299, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.11151187121868134, |
|
"learning_rate": 4.7778322527300415e-05, |
|
"loss": 1.9818, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.6706827309236948, |
|
"grad_norm": 0.09780837595462799, |
|
"learning_rate": 4.761743358424073e-05, |
|
"loss": 1.8249, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.6746987951807228, |
|
"grad_norm": 0.11365465819835663, |
|
"learning_rate": 4.745576746673859e-05, |
|
"loss": 1.7066, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.678714859437751, |
|
"grad_norm": 0.11123645305633545, |
|
"learning_rate": 4.72933313065237e-05, |
|
"loss": 1.7652, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.6827309236947792, |
|
"grad_norm": 0.10402809083461761, |
|
"learning_rate": 4.713013226929543e-05, |
|
"loss": 1.9567, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.6867469879518072, |
|
"grad_norm": 0.11607303470373154, |
|
"learning_rate": 4.696617755440661e-05, |
|
"loss": 1.7311, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.6907630522088354, |
|
"grad_norm": 0.11645929515361786, |
|
"learning_rate": 4.6801474394546034e-05, |
|
"loss": 1.8036, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.6947791164658634, |
|
"grad_norm": 0.1081853061914444, |
|
"learning_rate": 4.663603005541933e-05, |
|
"loss": 1.8126, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.6987951807228916, |
|
"grad_norm": 0.10500285774469376, |
|
"learning_rate": 4.646985183542845e-05, |
|
"loss": 1.7044, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.7028112449799196, |
|
"grad_norm": 0.11390309780836105, |
|
"learning_rate": 4.630294706534977e-05, |
|
"loss": 1.8124, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.7068273092369478, |
|
"grad_norm": 0.1308196634054184, |
|
"learning_rate": 4.613532310801062e-05, |
|
"loss": 1.6631, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.7108433734939759, |
|
"grad_norm": 0.11062182486057281, |
|
"learning_rate": 4.5966987357964526e-05, |
|
"loss": 1.6611, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.714859437751004, |
|
"grad_norm": 0.10045716166496277, |
|
"learning_rate": 4.5797947241164984e-05, |
|
"loss": 1.8025, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.7188755020080321, |
|
"grad_norm": 0.10317470133304596, |
|
"learning_rate": 4.562821021463791e-05, |
|
"loss": 1.7107, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.7228915662650602, |
|
"grad_norm": 0.11115267127752304, |
|
"learning_rate": 4.545778376615263e-05, |
|
"loss": 1.7796, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7269076305220884, |
|
"grad_norm": 0.11856064200401306, |
|
"learning_rate": 4.5286675413891584e-05, |
|
"loss": 1.8696, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.7309236947791165, |
|
"grad_norm": 0.1215963363647461, |
|
"learning_rate": 4.5114892706118725e-05, |
|
"loss": 1.8084, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.7349397590361446, |
|
"grad_norm": 0.09853583574295044, |
|
"learning_rate": 4.4942443220846415e-05, |
|
"loss": 1.8725, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.7389558232931727, |
|
"grad_norm": 0.10426657646894455, |
|
"learning_rate": 4.476933456550128e-05, |
|
"loss": 1.6489, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.7429718875502008, |
|
"grad_norm": 0.10910302400588989, |
|
"learning_rate": 4.459557437658844e-05, |
|
"loss": 1.9391, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.7469879518072289, |
|
"grad_norm": 0.09922551363706589, |
|
"learning_rate": 4.4421170319354825e-05, |
|
"loss": 1.8068, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.751004016064257, |
|
"grad_norm": 0.12102405726909637, |
|
"learning_rate": 4.4246130087450885e-05, |
|
"loss": 1.7633, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.7550200803212851, |
|
"grad_norm": 0.1103704422712326, |
|
"learning_rate": 4.407046140259123e-05, |
|
"loss": 1.6617, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.7590361445783133, |
|
"grad_norm": 0.13233919441699982, |
|
"learning_rate": 4.389417201421404e-05, |
|
"loss": 1.6365, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.7630522088353414, |
|
"grad_norm": 0.1326707899570465, |
|
"learning_rate": 4.3717269699139156e-05, |
|
"loss": 1.7263, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.7670682730923695, |
|
"grad_norm": 0.11567298322916031, |
|
"learning_rate": 4.3539762261225044e-05, |
|
"loss": 1.7935, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.7710843373493976, |
|
"grad_norm": 0.10680199414491653, |
|
"learning_rate": 4.336165753102451e-05, |
|
"loss": 1.7901, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.7751004016064257, |
|
"grad_norm": 0.21424338221549988, |
|
"learning_rate": 4.3182963365439275e-05, |
|
"loss": 1.8038, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.7791164658634538, |
|
"grad_norm": 0.11191921681165695, |
|
"learning_rate": 4.3003687647373405e-05, |
|
"loss": 1.6104, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.7831325301204819, |
|
"grad_norm": 0.12618285417556763, |
|
"learning_rate": 4.282383828538551e-05, |
|
"loss": 1.7352, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.7871485943775101, |
|
"grad_norm": 0.11540846526622772, |
|
"learning_rate": 4.264342321333989e-05, |
|
"loss": 1.5843, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.7911646586345381, |
|
"grad_norm": 0.2725273668766022, |
|
"learning_rate": 4.24624503900566e-05, |
|
"loss": 1.7595, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.7951807228915663, |
|
"grad_norm": 0.13503190875053406, |
|
"learning_rate": 4.228092779896021e-05, |
|
"loss": 1.7162, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.7991967871485943, |
|
"grad_norm": 0.32556626200675964, |
|
"learning_rate": 4.209886344772781e-05, |
|
"loss": 1.7433, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.8032128514056225, |
|
"grad_norm": 0.11719845980405807, |
|
"learning_rate": 4.191626536793561e-05, |
|
"loss": 1.7366, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8072289156626506, |
|
"grad_norm": 0.13130998611450195, |
|
"learning_rate": 4.173314161470468e-05, |
|
"loss": 1.8753, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.8112449799196787, |
|
"grad_norm": 0.14412151277065277, |
|
"learning_rate": 4.154950026634566e-05, |
|
"loss": 1.7503, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.8152610441767069, |
|
"grad_norm": 0.13965652883052826, |
|
"learning_rate": 4.136534942400234e-05, |
|
"loss": 1.6057, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.8192771084337349, |
|
"grad_norm": 0.1472691297531128, |
|
"learning_rate": 4.118069721129427e-05, |
|
"loss": 1.9009, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.8232931726907631, |
|
"grad_norm": 0.1335720419883728, |
|
"learning_rate": 4.099555177395845e-05, |
|
"loss": 1.7673, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.8273092369477911, |
|
"grad_norm": 0.1401488333940506, |
|
"learning_rate": 4.0809921279489954e-05, |
|
"loss": 1.7398, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.8313253012048193, |
|
"grad_norm": 0.12071944773197174, |
|
"learning_rate": 4.062381391678163e-05, |
|
"loss": 1.7446, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.8353413654618473, |
|
"grad_norm": 0.11511554569005966, |
|
"learning_rate": 4.04372378957629e-05, |
|
"loss": 1.6161, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.8393574297188755, |
|
"grad_norm": 0.1713258922100067, |
|
"learning_rate": 4.025020144703749e-05, |
|
"loss": 1.7174, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.8433734939759037, |
|
"grad_norm": 0.13804182410240173, |
|
"learning_rate": 4.006271282152048e-05, |
|
"loss": 1.5781, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.8473895582329317, |
|
"grad_norm": 0.17464035749435425, |
|
"learning_rate": 3.9874780290074176e-05, |
|
"loss": 1.9369, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.8514056224899599, |
|
"grad_norm": 0.11888068914413452, |
|
"learning_rate": 3.9686412143143384e-05, |
|
"loss": 1.8222, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.8554216867469879, |
|
"grad_norm": 0.14188675582408905, |
|
"learning_rate": 3.94976166903896e-05, |
|
"loss": 1.7729, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.8594377510040161, |
|
"grad_norm": 0.13000579178333282, |
|
"learning_rate": 3.930840226032446e-05, |
|
"loss": 1.7222, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.8634538152610441, |
|
"grad_norm": 0.14280284941196442, |
|
"learning_rate": 3.911877719994235e-05, |
|
"loss": 2.0587, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.8674698795180723, |
|
"grad_norm": 0.16663293540477753, |
|
"learning_rate": 3.8928749874352185e-05, |
|
"loss": 1.834, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.8714859437751004, |
|
"grad_norm": 0.09696398675441742, |
|
"learning_rate": 3.873832866640836e-05, |
|
"loss": 1.7275, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.8755020080321285, |
|
"grad_norm": 0.14156584441661835, |
|
"learning_rate": 3.8547521976341004e-05, |
|
"loss": 1.6058, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.8795180722891566, |
|
"grad_norm": 0.13290172815322876, |
|
"learning_rate": 3.835633822138535e-05, |
|
"loss": 1.6701, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.8835341365461847, |
|
"grad_norm": 0.13043315708637238, |
|
"learning_rate": 3.816478583541048e-05, |
|
"loss": 1.656, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.8875502008032129, |
|
"grad_norm": 0.1180863231420517, |
|
"learning_rate": 3.797287326854723e-05, |
|
"loss": 1.8315, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.891566265060241, |
|
"grad_norm": 0.12132333219051361, |
|
"learning_rate": 3.778060898681541e-05, |
|
"loss": 1.8825, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.8955823293172691, |
|
"grad_norm": 0.14479801058769226, |
|
"learning_rate": 3.758800147175042e-05, |
|
"loss": 1.6463, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.8995983935742972, |
|
"grad_norm": 0.11980578303337097, |
|
"learning_rate": 3.739505922002898e-05, |
|
"loss": 1.7547, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.9036144578313253, |
|
"grad_norm": 0.1616433560848236, |
|
"learning_rate": 3.7201790743094374e-05, |
|
"loss": 1.6393, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.9076305220883534, |
|
"grad_norm": 0.1393657922744751, |
|
"learning_rate": 3.7008204566781004e-05, |
|
"loss": 1.705, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.9116465863453815, |
|
"grad_norm": 0.13711820542812347, |
|
"learning_rate": 3.681430923093818e-05, |
|
"loss": 1.7637, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.9156626506024096, |
|
"grad_norm": 0.1211545392870903, |
|
"learning_rate": 3.662011328905351e-05, |
|
"loss": 1.8126, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.9196787148594378, |
|
"grad_norm": 0.11105578392744064, |
|
"learning_rate": 3.64256253078755e-05, |
|
"loss": 1.5691, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.9236947791164659, |
|
"grad_norm": 0.19085654616355896, |
|
"learning_rate": 3.6230853867035654e-05, |
|
"loss": 1.6751, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.927710843373494, |
|
"grad_norm": 0.12571419775485992, |
|
"learning_rate": 3.603580755867001e-05, |
|
"loss": 1.589, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.9317269076305221, |
|
"grad_norm": 0.1361204832792282, |
|
"learning_rate": 3.58404949870401e-05, |
|
"loss": 1.8733, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.9357429718875502, |
|
"grad_norm": 0.11016932129859924, |
|
"learning_rate": 3.5644924768153364e-05, |
|
"loss": 1.898, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.9397590361445783, |
|
"grad_norm": 0.12060891091823578, |
|
"learning_rate": 3.544910552938309e-05, |
|
"loss": 1.7406, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.9437751004016064, |
|
"grad_norm": 0.11152440309524536, |
|
"learning_rate": 3.5253045909087813e-05, |
|
"loss": 1.9775, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.9477911646586346, |
|
"grad_norm": 0.13362078368663788, |
|
"learning_rate": 3.505675455623023e-05, |
|
"loss": 1.5462, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.9518072289156626, |
|
"grad_norm": 0.17355649173259735, |
|
"learning_rate": 3.4860240129995696e-05, |
|
"loss": 1.9621, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.9558232931726908, |
|
"grad_norm": 0.10163702070713043, |
|
"learning_rate": 3.4663511299410203e-05, |
|
"loss": 1.9847, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.9598393574297188, |
|
"grad_norm": 0.12204550206661224, |
|
"learning_rate": 3.446657674295796e-05, |
|
"loss": 1.8851, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.963855421686747, |
|
"grad_norm": 0.14867813885211945, |
|
"learning_rate": 3.426944514819856e-05, |
|
"loss": 1.6835, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.9678714859437751, |
|
"grad_norm": 0.12139367312192917, |
|
"learning_rate": 3.407212521138372e-05, |
|
"loss": 1.8269, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.9718875502008032, |
|
"grad_norm": 0.2183162420988083, |
|
"learning_rate": 3.3874625637073704e-05, |
|
"loss": 1.6785, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.9759036144578314, |
|
"grad_norm": 0.1285049170255661, |
|
"learning_rate": 3.367695513775324e-05, |
|
"loss": 1.6806, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.9799196787148594, |
|
"grad_norm": 0.12188960611820221, |
|
"learning_rate": 3.347912243344727e-05, |
|
"loss": 1.7203, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.9839357429718876, |
|
"grad_norm": 0.15413041412830353, |
|
"learning_rate": 3.328113625133624e-05, |
|
"loss": 1.6022, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.9879518072289156, |
|
"grad_norm": 0.1414673924446106, |
|
"learning_rate": 3.308300532537107e-05, |
|
"loss": 1.741, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.9919678714859438, |
|
"grad_norm": 0.1399112194776535, |
|
"learning_rate": 3.288473839588796e-05, |
|
"loss": 1.5319, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.9959839357429718, |
|
"grad_norm": 0.13522355258464813, |
|
"learning_rate": 3.26863442092227e-05, |
|
"loss": 1.6901, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.11732550710439682, |
|
"learning_rate": 3.248783151732494e-05, |
|
"loss": 1.8542, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.0040160642570282, |
|
"grad_norm": 0.26141083240509033, |
|
"learning_rate": 3.2289209077372035e-05, |
|
"loss": 3.761, |
|
"step": 250 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 498, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.4115757240237425e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|