sciworld-qnet / trainer_state.json
xiaoxiaolin's picture
Upload folder using huggingface_hub
bf7d6af verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 5687,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008791981712678037,
"grad_norm": 10.053899765014648,
"learning_rate": 5.847953216374269e-07,
"loss": 0.1487,
"step": 5
},
{
"epoch": 0.0017583963425356075,
"grad_norm": 1371.18408203125,
"learning_rate": 1.1695906432748538e-06,
"loss": 0.154,
"step": 10
},
{
"epoch": 0.0026375945138034113,
"grad_norm": 4331.09814453125,
"learning_rate": 1.7543859649122807e-06,
"loss": 0.1485,
"step": 15
},
{
"epoch": 0.003516792685071215,
"grad_norm": 50.946231842041016,
"learning_rate": 2.3391812865497075e-06,
"loss": 0.082,
"step": 20
},
{
"epoch": 0.004395990856339019,
"grad_norm": 130.27598571777344,
"learning_rate": 2.9239766081871347e-06,
"loss": 0.1029,
"step": 25
},
{
"epoch": 0.005275189027606823,
"grad_norm": 312.9079895019531,
"learning_rate": 3.5087719298245615e-06,
"loss": 0.0983,
"step": 30
},
{
"epoch": 0.006154387198874627,
"grad_norm": 828.6588134765625,
"learning_rate": 4.093567251461989e-06,
"loss": 0.116,
"step": 35
},
{
"epoch": 0.00703358537014243,
"grad_norm": 95.5357666015625,
"learning_rate": 4.678362573099415e-06,
"loss": 0.1025,
"step": 40
},
{
"epoch": 0.007912783541410234,
"grad_norm": 46.62782669067383,
"learning_rate": 5.263157894736842e-06,
"loss": 0.0929,
"step": 45
},
{
"epoch": 0.008791981712678037,
"grad_norm": 24.908008575439453,
"learning_rate": 5.847953216374269e-06,
"loss": 0.0911,
"step": 50
},
{
"epoch": 0.009671179883945842,
"grad_norm": 42.72391891479492,
"learning_rate": 6.432748538011696e-06,
"loss": 0.1051,
"step": 55
},
{
"epoch": 0.010550378055213645,
"grad_norm": 1.5553064346313477,
"learning_rate": 7.017543859649123e-06,
"loss": 0.0939,
"step": 60
},
{
"epoch": 0.011429576226481448,
"grad_norm": 840.0651245117188,
"learning_rate": 7.60233918128655e-06,
"loss": 0.0739,
"step": 65
},
{
"epoch": 0.012308774397749253,
"grad_norm": 4780.78466796875,
"learning_rate": 8.187134502923977e-06,
"loss": 0.1114,
"step": 70
},
{
"epoch": 0.013187972569017057,
"grad_norm": 36.51594161987305,
"learning_rate": 8.771929824561405e-06,
"loss": 0.1356,
"step": 75
},
{
"epoch": 0.01406717074028486,
"grad_norm": 6.5139970779418945,
"learning_rate": 9.35672514619883e-06,
"loss": 0.0977,
"step": 80
},
{
"epoch": 0.014946368911552665,
"grad_norm": 63.903018951416016,
"learning_rate": 9.941520467836257e-06,
"loss": 0.1117,
"step": 85
},
{
"epoch": 0.015825567082820468,
"grad_norm": 18.261695861816406,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.0912,
"step": 90
},
{
"epoch": 0.01670476525408827,
"grad_norm": 246.80682373046875,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.1014,
"step": 95
},
{
"epoch": 0.017583963425356074,
"grad_norm": 432.3685607910156,
"learning_rate": 1.1695906432748539e-05,
"loss": 0.0915,
"step": 100
},
{
"epoch": 0.018463161596623878,
"grad_norm": 1106.1822509765625,
"learning_rate": 1.2280701754385966e-05,
"loss": 0.0993,
"step": 105
},
{
"epoch": 0.019342359767891684,
"grad_norm": 39.593421936035156,
"learning_rate": 1.2865497076023392e-05,
"loss": 0.157,
"step": 110
},
{
"epoch": 0.020221557939159487,
"grad_norm": 3.5536696910858154,
"learning_rate": 1.345029239766082e-05,
"loss": 0.1274,
"step": 115
},
{
"epoch": 0.02110075611042729,
"grad_norm": 1.2601370811462402,
"learning_rate": 1.4035087719298246e-05,
"loss": 0.0806,
"step": 120
},
{
"epoch": 0.021979954281695094,
"grad_norm": 1.1521034240722656,
"learning_rate": 1.4619883040935675e-05,
"loss": 0.0785,
"step": 125
},
{
"epoch": 0.022859152452962897,
"grad_norm": 0.7003596425056458,
"learning_rate": 1.52046783625731e-05,
"loss": 0.0819,
"step": 130
},
{
"epoch": 0.0237383506242307,
"grad_norm": 0.29950231313705444,
"learning_rate": 1.578947368421053e-05,
"loss": 0.0825,
"step": 135
},
{
"epoch": 0.024617548795498507,
"grad_norm": 1.1281251907348633,
"learning_rate": 1.6374269005847955e-05,
"loss": 0.105,
"step": 140
},
{
"epoch": 0.02549674696676631,
"grad_norm": 0.8757471442222595,
"learning_rate": 1.695906432748538e-05,
"loss": 0.1029,
"step": 145
},
{
"epoch": 0.026375945138034113,
"grad_norm": 1.9525190591812134,
"learning_rate": 1.754385964912281e-05,
"loss": 0.0708,
"step": 150
},
{
"epoch": 0.027255143309301916,
"grad_norm": 1.9676127433776855,
"learning_rate": 1.8128654970760235e-05,
"loss": 0.0863,
"step": 155
},
{
"epoch": 0.02813434148056972,
"grad_norm": 0.7372169494628906,
"learning_rate": 1.871345029239766e-05,
"loss": 0.0668,
"step": 160
},
{
"epoch": 0.029013539651837523,
"grad_norm": 0.6957089900970459,
"learning_rate": 1.929824561403509e-05,
"loss": 0.0688,
"step": 165
},
{
"epoch": 0.02989273782310533,
"grad_norm": 0.7962855696678162,
"learning_rate": 1.9883040935672515e-05,
"loss": 0.0619,
"step": 170
},
{
"epoch": 0.030771935994373133,
"grad_norm": 0.6682185530662537,
"learning_rate": 1.999997404978087e-05,
"loss": 0.0698,
"step": 175
},
{
"epoch": 0.031651134165640936,
"grad_norm": 0.7355603575706482,
"learning_rate": 1.999986862724647e-05,
"loss": 0.078,
"step": 180
},
{
"epoch": 0.03253033233690874,
"grad_norm": 0.4755054712295532,
"learning_rate": 1.9999682111362368e-05,
"loss": 0.0732,
"step": 185
},
{
"epoch": 0.03340953050817654,
"grad_norm": 0.4909241497516632,
"learning_rate": 1.9999414503641103e-05,
"loss": 0.0631,
"step": 190
},
{
"epoch": 0.034288728679444345,
"grad_norm": 1.0899882316589355,
"learning_rate": 1.9999065806252828e-05,
"loss": 0.0692,
"step": 195
},
{
"epoch": 0.03516792685071215,
"grad_norm": 0.263811320066452,
"learning_rate": 1.999863602202528e-05,
"loss": 0.0696,
"step": 200
},
{
"epoch": 0.03604712502197995,
"grad_norm": 1.9172215461730957,
"learning_rate": 1.999812515444377e-05,
"loss": 0.079,
"step": 205
},
{
"epoch": 0.036926323193247755,
"grad_norm": 1.79094660282135,
"learning_rate": 1.9997533207651147e-05,
"loss": 0.0627,
"step": 210
},
{
"epoch": 0.037805521364515565,
"grad_norm": 0.3693501651287079,
"learning_rate": 1.999686018644777e-05,
"loss": 0.0778,
"step": 215
},
{
"epoch": 0.03868471953578337,
"grad_norm": 0.18116237223148346,
"learning_rate": 1.999610609629147e-05,
"loss": 0.0643,
"step": 220
},
{
"epoch": 0.03956391770705117,
"grad_norm": 0.5909445881843567,
"learning_rate": 1.999527094329749e-05,
"loss": 0.0689,
"step": 225
},
{
"epoch": 0.040443115878318975,
"grad_norm": 0.4016267955303192,
"learning_rate": 1.9994354734238456e-05,
"loss": 0.0589,
"step": 230
},
{
"epoch": 0.04132231404958678,
"grad_norm": 0.8470014929771423,
"learning_rate": 1.9993357476544314e-05,
"loss": 0.0714,
"step": 235
},
{
"epoch": 0.04220151222085458,
"grad_norm": 1.2889784574508667,
"learning_rate": 1.9992279178302266e-05,
"loss": 0.0759,
"step": 240
},
{
"epoch": 0.043080710392122384,
"grad_norm": 1.695059061050415,
"learning_rate": 1.9991119848256708e-05,
"loss": 0.0582,
"step": 245
},
{
"epoch": 0.04395990856339019,
"grad_norm": 0.7226565480232239,
"learning_rate": 1.998987949580916e-05,
"loss": 0.0802,
"step": 250
},
{
"epoch": 0.04483910673465799,
"grad_norm": 0.513992965221405,
"learning_rate": 1.9988558131018188e-05,
"loss": 0.0747,
"step": 255
},
{
"epoch": 0.045718304905925794,
"grad_norm": 0.8010172247886658,
"learning_rate": 1.998715576459932e-05,
"loss": 0.0779,
"step": 260
},
{
"epoch": 0.0465975030771936,
"grad_norm": 0.6723889112472534,
"learning_rate": 1.9985672407924966e-05,
"loss": 0.0778,
"step": 265
},
{
"epoch": 0.0474767012484614,
"grad_norm": 0.5232120752334595,
"learning_rate": 1.998410807302432e-05,
"loss": 0.0606,
"step": 270
},
{
"epoch": 0.048355899419729204,
"grad_norm": 1.1310707330703735,
"learning_rate": 1.9982462772583267e-05,
"loss": 0.0786,
"step": 275
},
{
"epoch": 0.049235097590997014,
"grad_norm": 0.42932379245758057,
"learning_rate": 1.998073651994427e-05,
"loss": 0.0674,
"step": 280
},
{
"epoch": 0.05011429576226482,
"grad_norm": 0.30086904764175415,
"learning_rate": 1.997892932910628e-05,
"loss": 0.0662,
"step": 285
},
{
"epoch": 0.05099349393353262,
"grad_norm": 0.3778522312641144,
"learning_rate": 1.9977041214724594e-05,
"loss": 0.077,
"step": 290
},
{
"epoch": 0.05187269210480042,
"grad_norm": 1.5126148462295532,
"learning_rate": 1.997507219211078e-05,
"loss": 0.073,
"step": 295
},
{
"epoch": 0.052751890276068227,
"grad_norm": 0.4894915223121643,
"learning_rate": 1.99730222772325e-05,
"loss": 0.0705,
"step": 300
},
{
"epoch": 0.05363108844733603,
"grad_norm": 0.6623161435127258,
"learning_rate": 1.9970891486713423e-05,
"loss": 0.0583,
"step": 305
},
{
"epoch": 0.05451028661860383,
"grad_norm": 0.6160693764686584,
"learning_rate": 1.9968679837833075e-05,
"loss": 0.061,
"step": 310
},
{
"epoch": 0.055389484789871636,
"grad_norm": 1.3512332439422607,
"learning_rate": 1.9966387348526682e-05,
"loss": 0.0609,
"step": 315
},
{
"epoch": 0.05626868296113944,
"grad_norm": 0.7443258166313171,
"learning_rate": 1.9964014037385065e-05,
"loss": 0.0605,
"step": 320
},
{
"epoch": 0.05714788113240724,
"grad_norm": 0.47612714767456055,
"learning_rate": 1.996155992365444e-05,
"loss": 0.0631,
"step": 325
},
{
"epoch": 0.058027079303675046,
"grad_norm": 1.439274787902832,
"learning_rate": 1.9959025027236305e-05,
"loss": 0.0687,
"step": 330
},
{
"epoch": 0.05890627747494285,
"grad_norm": 0.7117618322372437,
"learning_rate": 1.9956409368687257e-05,
"loss": 0.0714,
"step": 335
},
{
"epoch": 0.05978547564621066,
"grad_norm": 0.6142310500144958,
"learning_rate": 1.995371296921882e-05,
"loss": 0.0672,
"step": 340
},
{
"epoch": 0.06066467381747846,
"grad_norm": 1.082131028175354,
"learning_rate": 1.9950935850697288e-05,
"loss": 0.0879,
"step": 345
},
{
"epoch": 0.061543871988746265,
"grad_norm": 0.35354727506637573,
"learning_rate": 1.9948078035643546e-05,
"loss": 0.0799,
"step": 350
},
{
"epoch": 0.06242307016001407,
"grad_norm": 0.2982726991176605,
"learning_rate": 1.9945139547232872e-05,
"loss": 0.0764,
"step": 355
},
{
"epoch": 0.06330226833128187,
"grad_norm": 1.1916660070419312,
"learning_rate": 1.9942120409294768e-05,
"loss": 0.0742,
"step": 360
},
{
"epoch": 0.06418146650254968,
"grad_norm": 1.0965343713760376,
"learning_rate": 1.9939020646312764e-05,
"loss": 0.0634,
"step": 365
},
{
"epoch": 0.06506066467381748,
"grad_norm": 0.46244287490844727,
"learning_rate": 1.9935840283424196e-05,
"loss": 0.0711,
"step": 370
},
{
"epoch": 0.06593986284508528,
"grad_norm": 0.1318541318178177,
"learning_rate": 1.993257934642004e-05,
"loss": 0.0591,
"step": 375
},
{
"epoch": 0.06681906101635308,
"grad_norm": 0.5300299525260925,
"learning_rate": 1.9929237861744663e-05,
"loss": 0.0712,
"step": 380
},
{
"epoch": 0.06769825918762089,
"grad_norm": 1.014757752418518,
"learning_rate": 1.9925815856495646e-05,
"loss": 0.0612,
"step": 385
},
{
"epoch": 0.06857745735888869,
"grad_norm": 0.24749091267585754,
"learning_rate": 1.992231335842354e-05,
"loss": 0.077,
"step": 390
},
{
"epoch": 0.0694566555301565,
"grad_norm": 0.5739014148712158,
"learning_rate": 1.9918730395931648e-05,
"loss": 0.0618,
"step": 395
},
{
"epoch": 0.0703358537014243,
"grad_norm": 0.23715724050998688,
"learning_rate": 1.9915066998075797e-05,
"loss": 0.0563,
"step": 400
},
{
"epoch": 0.0712150518726921,
"grad_norm": 0.5633426904678345,
"learning_rate": 1.9911323194564095e-05,
"loss": 0.054,
"step": 405
},
{
"epoch": 0.0720942500439599,
"grad_norm": 0.4382643401622772,
"learning_rate": 1.9907499015756696e-05,
"loss": 0.0561,
"step": 410
},
{
"epoch": 0.07297344821522771,
"grad_norm": 0.4218790829181671,
"learning_rate": 1.9903594492665557e-05,
"loss": 0.0466,
"step": 415
},
{
"epoch": 0.07385264638649551,
"grad_norm": 0.8700308203697205,
"learning_rate": 1.9899609656954183e-05,
"loss": 0.0652,
"step": 420
},
{
"epoch": 0.07473184455776331,
"grad_norm": 0.1704479455947876,
"learning_rate": 1.9895544540937358e-05,
"loss": 0.0494,
"step": 425
},
{
"epoch": 0.07561104272903113,
"grad_norm": 0.6057877540588379,
"learning_rate": 1.989139917758091e-05,
"loss": 0.0494,
"step": 430
},
{
"epoch": 0.07649024090029893,
"grad_norm": 1.0760382413864136,
"learning_rate": 1.9887173600501414e-05,
"loss": 0.0767,
"step": 435
},
{
"epoch": 0.07736943907156674,
"grad_norm": 0.42263808846473694,
"learning_rate": 1.988286784396594e-05,
"loss": 0.0666,
"step": 440
},
{
"epoch": 0.07824863724283454,
"grad_norm": 0.13608968257904053,
"learning_rate": 1.987848194289178e-05,
"loss": 0.0663,
"step": 445
},
{
"epoch": 0.07912783541410234,
"grad_norm": 0.20840178430080414,
"learning_rate": 1.987401593284613e-05,
"loss": 0.0814,
"step": 450
},
{
"epoch": 0.08000703358537015,
"grad_norm": 1.5564632415771484,
"learning_rate": 1.9869469850045845e-05,
"loss": 0.0733,
"step": 455
},
{
"epoch": 0.08088623175663795,
"grad_norm": 0.3628084063529968,
"learning_rate": 1.9864843731357108e-05,
"loss": 0.0668,
"step": 460
},
{
"epoch": 0.08176542992790575,
"grad_norm": 6.541281223297119,
"learning_rate": 1.986013761429517e-05,
"loss": 0.0575,
"step": 465
},
{
"epoch": 0.08264462809917356,
"grad_norm": 0.3698543310165405,
"learning_rate": 1.9855351537024004e-05,
"loss": 0.0686,
"step": 470
},
{
"epoch": 0.08352382627044136,
"grad_norm": 0.21145962178707123,
"learning_rate": 1.9850485538356026e-05,
"loss": 0.0693,
"step": 475
},
{
"epoch": 0.08440302444170916,
"grad_norm": 0.718197226524353,
"learning_rate": 1.9845539657751768e-05,
"loss": 0.0577,
"step": 480
},
{
"epoch": 0.08528222261297697,
"grad_norm": 1.4340827465057373,
"learning_rate": 1.9840513935319557e-05,
"loss": 0.056,
"step": 485
},
{
"epoch": 0.08616142078424477,
"grad_norm": 2.368858814239502,
"learning_rate": 1.98354084118152e-05,
"loss": 0.0674,
"step": 490
},
{
"epoch": 0.08704061895551257,
"grad_norm": 0.6914955973625183,
"learning_rate": 1.9830223128641636e-05,
"loss": 0.0646,
"step": 495
},
{
"epoch": 0.08791981712678038,
"grad_norm": 0.5653345584869385,
"learning_rate": 1.9824958127848618e-05,
"loss": 0.0868,
"step": 500
},
{
"epoch": 0.08879901529804818,
"grad_norm": 0.6143190860748291,
"learning_rate": 1.9819613452132365e-05,
"loss": 0.0524,
"step": 505
},
{
"epoch": 0.08967821346931598,
"grad_norm": 0.9025689363479614,
"learning_rate": 1.9814189144835205e-05,
"loss": 0.0646,
"step": 510
},
{
"epoch": 0.09055741164058378,
"grad_norm": 1.0996524095535278,
"learning_rate": 1.9808685249945245e-05,
"loss": 0.0686,
"step": 515
},
{
"epoch": 0.09143660981185159,
"grad_norm": 1.0614774227142334,
"learning_rate": 1.9803101812096e-05,
"loss": 0.0636,
"step": 520
},
{
"epoch": 0.09231580798311939,
"grad_norm": 0.67917799949646,
"learning_rate": 1.9797438876566027e-05,
"loss": 0.0623,
"step": 525
},
{
"epoch": 0.0931950061543872,
"grad_norm": 0.29005610942840576,
"learning_rate": 1.9791696489278578e-05,
"loss": 0.059,
"step": 530
},
{
"epoch": 0.094074204325655,
"grad_norm": 0.6829861402511597,
"learning_rate": 1.97858746968012e-05,
"loss": 0.0797,
"step": 535
},
{
"epoch": 0.0949534024969228,
"grad_norm": 0.22500386834144592,
"learning_rate": 1.9779973546345385e-05,
"loss": 0.0673,
"step": 540
},
{
"epoch": 0.0958326006681906,
"grad_norm": 1.5297006368637085,
"learning_rate": 1.9773993085766163e-05,
"loss": 0.062,
"step": 545
},
{
"epoch": 0.09671179883945841,
"grad_norm": 0.35818400979042053,
"learning_rate": 1.976793336356173e-05,
"loss": 0.0627,
"step": 550
},
{
"epoch": 0.09759099701072622,
"grad_norm": 1.0418643951416016,
"learning_rate": 1.976179442887305e-05,
"loss": 0.0756,
"step": 555
},
{
"epoch": 0.09847019518199403,
"grad_norm": 1.5865000486373901,
"learning_rate": 1.9755576331483453e-05,
"loss": 0.0577,
"step": 560
},
{
"epoch": 0.09934939335326183,
"grad_norm": 0.43606239557266235,
"learning_rate": 1.9749279121818235e-05,
"loss": 0.0642,
"step": 565
},
{
"epoch": 0.10022859152452963,
"grad_norm": 0.1087045669555664,
"learning_rate": 1.9742902850944257e-05,
"loss": 0.0667,
"step": 570
},
{
"epoch": 0.10110778969579744,
"grad_norm": 0.4932880103588104,
"learning_rate": 1.9736447570569503e-05,
"loss": 0.0643,
"step": 575
},
{
"epoch": 0.10198698786706524,
"grad_norm": 0.28585073351860046,
"learning_rate": 1.97299133330427e-05,
"loss": 0.0619,
"step": 580
},
{
"epoch": 0.10286618603833304,
"grad_norm": 0.1778407096862793,
"learning_rate": 1.9723300191352866e-05,
"loss": 0.0482,
"step": 585
},
{
"epoch": 0.10374538420960085,
"grad_norm": 0.35073766112327576,
"learning_rate": 1.971660819912888e-05,
"loss": 0.075,
"step": 590
},
{
"epoch": 0.10462458238086865,
"grad_norm": 0.19325245916843414,
"learning_rate": 1.9709837410639062e-05,
"loss": 0.0629,
"step": 595
},
{
"epoch": 0.10550378055213645,
"grad_norm": 0.5602083802223206,
"learning_rate": 1.9702987880790733e-05,
"loss": 0.0537,
"step": 600
},
{
"epoch": 0.10638297872340426,
"grad_norm": 0.6220573782920837,
"learning_rate": 1.969605966512975e-05,
"loss": 0.0735,
"step": 605
},
{
"epoch": 0.10726217689467206,
"grad_norm": 0.7392856478691101,
"learning_rate": 1.968905281984007e-05,
"loss": 0.0567,
"step": 610
},
{
"epoch": 0.10814137506593986,
"grad_norm": 0.4744727909564972,
"learning_rate": 1.9681967401743297e-05,
"loss": 0.0668,
"step": 615
},
{
"epoch": 0.10902057323720767,
"grad_norm": 1.1823076009750366,
"learning_rate": 1.9674803468298216e-05,
"loss": 0.0613,
"step": 620
},
{
"epoch": 0.10989977140847547,
"grad_norm": 0.7359253764152527,
"learning_rate": 1.9667561077600325e-05,
"loss": 0.0633,
"step": 625
},
{
"epoch": 0.11077896957974327,
"grad_norm": 0.49054092168807983,
"learning_rate": 1.966024028838137e-05,
"loss": 0.0571,
"step": 630
},
{
"epoch": 0.11165816775101108,
"grad_norm": 0.3266174793243408,
"learning_rate": 1.965284116000886e-05,
"loss": 0.0717,
"step": 635
},
{
"epoch": 0.11253736592227888,
"grad_norm": 0.2428748905658722,
"learning_rate": 1.9645363752485594e-05,
"loss": 0.0805,
"step": 640
},
{
"epoch": 0.11341656409354668,
"grad_norm": 0.27535462379455566,
"learning_rate": 1.963780812644917e-05,
"loss": 0.0632,
"step": 645
},
{
"epoch": 0.11429576226481448,
"grad_norm": 1.5982582569122314,
"learning_rate": 1.9630174343171498e-05,
"loss": 0.0574,
"step": 650
},
{
"epoch": 0.11517496043608229,
"grad_norm": 0.09966005384922028,
"learning_rate": 1.9622462464558296e-05,
"loss": 0.0505,
"step": 655
},
{
"epoch": 0.11605415860735009,
"grad_norm": 0.3889922797679901,
"learning_rate": 1.9614672553148592e-05,
"loss": 0.0598,
"step": 660
},
{
"epoch": 0.1169333567786179,
"grad_norm": 1.1012970209121704,
"learning_rate": 1.9606804672114217e-05,
"loss": 0.0618,
"step": 665
},
{
"epoch": 0.1178125549498857,
"grad_norm": 0.4842506945133209,
"learning_rate": 1.959885888525929e-05,
"loss": 0.065,
"step": 670
},
{
"epoch": 0.11869175312115351,
"grad_norm": 0.5499223470687866,
"learning_rate": 1.9590835257019715e-05,
"loss": 0.0575,
"step": 675
},
{
"epoch": 0.11957095129242132,
"grad_norm": 0.9636365175247192,
"learning_rate": 1.9582733852462623e-05,
"loss": 0.0565,
"step": 680
},
{
"epoch": 0.12045014946368912,
"grad_norm": 0.5249933004379272,
"learning_rate": 1.9574554737285885e-05,
"loss": 0.0594,
"step": 685
},
{
"epoch": 0.12132934763495692,
"grad_norm": 0.8125589489936829,
"learning_rate": 1.956629797781756e-05,
"loss": 0.0652,
"step": 690
},
{
"epoch": 0.12220854580622473,
"grad_norm": 0.3194701373577118,
"learning_rate": 1.955796364101535e-05,
"loss": 0.0634,
"step": 695
},
{
"epoch": 0.12308774397749253,
"grad_norm": 0.3122730851173401,
"learning_rate": 1.954955179446608e-05,
"loss": 0.0577,
"step": 700
},
{
"epoch": 0.12396694214876033,
"grad_norm": 0.546394407749176,
"learning_rate": 1.9541062506385116e-05,
"loss": 0.0635,
"step": 705
},
{
"epoch": 0.12484614032002814,
"grad_norm": 0.6376326680183411,
"learning_rate": 1.9532495845615854e-05,
"loss": 0.0702,
"step": 710
},
{
"epoch": 0.12572533849129594,
"grad_norm": 0.5695558786392212,
"learning_rate": 1.9523851881629124e-05,
"loss": 0.0613,
"step": 715
},
{
"epoch": 0.12660453666256374,
"grad_norm": 0.5965823531150818,
"learning_rate": 1.9515130684522647e-05,
"loss": 0.0652,
"step": 720
},
{
"epoch": 0.12748373483383155,
"grad_norm": 0.4935171604156494,
"learning_rate": 1.950633232502046e-05,
"loss": 0.063,
"step": 725
},
{
"epoch": 0.12836293300509935,
"grad_norm": 0.5116500854492188,
"learning_rate": 1.9497456874472346e-05,
"loss": 0.0552,
"step": 730
},
{
"epoch": 0.12924213117636715,
"grad_norm": 0.5821178555488586,
"learning_rate": 1.9488504404853247e-05,
"loss": 0.0591,
"step": 735
},
{
"epoch": 0.13012132934763496,
"grad_norm": 0.31190237402915955,
"learning_rate": 1.94794749887627e-05,
"loss": 0.0755,
"step": 740
},
{
"epoch": 0.13100052751890276,
"grad_norm": 0.6998320817947388,
"learning_rate": 1.947036869942422e-05,
"loss": 0.086,
"step": 745
},
{
"epoch": 0.13187972569017056,
"grad_norm": 0.5217974185943604,
"learning_rate": 1.9461185610684736e-05,
"loss": 0.0602,
"step": 750
},
{
"epoch": 0.13275892386143837,
"grad_norm": 1.050721287727356,
"learning_rate": 1.9451925797013955e-05,
"loss": 0.0698,
"step": 755
},
{
"epoch": 0.13363812203270617,
"grad_norm": 0.2586376965045929,
"learning_rate": 1.9442589333503806e-05,
"loss": 0.0622,
"step": 760
},
{
"epoch": 0.13451732020397397,
"grad_norm": 0.5251173377037048,
"learning_rate": 1.9433176295867792e-05,
"loss": 0.0567,
"step": 765
},
{
"epoch": 0.13539651837524178,
"grad_norm": 0.4066588878631592,
"learning_rate": 1.9423686760440386e-05,
"loss": 0.0548,
"step": 770
},
{
"epoch": 0.13627571654650958,
"grad_norm": 0.5858006477355957,
"learning_rate": 1.9414120804176427e-05,
"loss": 0.0607,
"step": 775
},
{
"epoch": 0.13715491471777738,
"grad_norm": 0.9163162112236023,
"learning_rate": 1.9404478504650473e-05,
"loss": 0.0575,
"step": 780
},
{
"epoch": 0.13803411288904519,
"grad_norm": 0.274795264005661,
"learning_rate": 1.939475994005619e-05,
"loss": 0.07,
"step": 785
},
{
"epoch": 0.138913311060313,
"grad_norm": 0.1485268473625183,
"learning_rate": 1.938496518920571e-05,
"loss": 0.067,
"step": 790
},
{
"epoch": 0.1397925092315808,
"grad_norm": 0.57244473695755,
"learning_rate": 1.937509433152899e-05,
"loss": 0.0742,
"step": 795
},
{
"epoch": 0.1406717074028486,
"grad_norm": 0.9547910690307617,
"learning_rate": 1.9365147447073172e-05,
"loss": 0.0655,
"step": 800
},
{
"epoch": 0.1415509055741164,
"grad_norm": 0.8948219418525696,
"learning_rate": 1.9355124616501936e-05,
"loss": 0.0714,
"step": 805
},
{
"epoch": 0.1424301037453842,
"grad_norm": 0.7073503136634827,
"learning_rate": 1.934502592109484e-05,
"loss": 0.0646,
"step": 810
},
{
"epoch": 0.143309301916652,
"grad_norm": 0.38196781277656555,
"learning_rate": 1.9334851442746665e-05,
"loss": 0.0606,
"step": 815
},
{
"epoch": 0.1441885000879198,
"grad_norm": 0.22767876088619232,
"learning_rate": 1.9324601263966746e-05,
"loss": 0.0586,
"step": 820
},
{
"epoch": 0.1450676982591876,
"grad_norm": 0.44821423292160034,
"learning_rate": 1.9314275467878304e-05,
"loss": 0.0671,
"step": 825
},
{
"epoch": 0.14594689643045541,
"grad_norm": 0.32358282804489136,
"learning_rate": 1.9303874138217788e-05,
"loss": 0.0535,
"step": 830
},
{
"epoch": 0.14682609460172322,
"grad_norm": 0.39932888746261597,
"learning_rate": 1.9293397359334167e-05,
"loss": 0.0553,
"step": 835
},
{
"epoch": 0.14770529277299102,
"grad_norm": 0.160264790058136,
"learning_rate": 1.9282845216188267e-05,
"loss": 0.0583,
"step": 840
},
{
"epoch": 0.14858449094425882,
"grad_norm": 0.5190912485122681,
"learning_rate": 1.9272217794352073e-05,
"loss": 0.0716,
"step": 845
},
{
"epoch": 0.14946368911552663,
"grad_norm": 0.5016174912452698,
"learning_rate": 1.9261515180008047e-05,
"loss": 0.0668,
"step": 850
},
{
"epoch": 0.15034288728679443,
"grad_norm": 0.12489809095859528,
"learning_rate": 1.9250737459948404e-05,
"loss": 0.0619,
"step": 855
},
{
"epoch": 0.15122208545806226,
"grad_norm": 1.153669834136963,
"learning_rate": 1.923988472157445e-05,
"loss": 0.0779,
"step": 860
},
{
"epoch": 0.15210128362933006,
"grad_norm": 0.2374788522720337,
"learning_rate": 1.9228957052895816e-05,
"loss": 0.0677,
"step": 865
},
{
"epoch": 0.15298048180059787,
"grad_norm": 1.060134768486023,
"learning_rate": 1.92179545425298e-05,
"loss": 0.0632,
"step": 870
},
{
"epoch": 0.15385967997186567,
"grad_norm": 0.3676360845565796,
"learning_rate": 1.9206877279700614e-05,
"loss": 0.0614,
"step": 875
},
{
"epoch": 0.15473887814313347,
"grad_norm": 0.3198089003562927,
"learning_rate": 1.9195725354238677e-05,
"loss": 0.0718,
"step": 880
},
{
"epoch": 0.15561807631440128,
"grad_norm": 0.2891201674938202,
"learning_rate": 1.918449885657987e-05,
"loss": 0.0586,
"step": 885
},
{
"epoch": 0.15649727448566908,
"grad_norm": 0.4054102897644043,
"learning_rate": 1.9173197877764824e-05,
"loss": 0.0523,
"step": 890
},
{
"epoch": 0.15737647265693688,
"grad_norm": 0.1266939640045166,
"learning_rate": 1.916182250943816e-05,
"loss": 0.0546,
"step": 895
},
{
"epoch": 0.1582556708282047,
"grad_norm": 0.7244488000869751,
"learning_rate": 1.915037284384777e-05,
"loss": 0.0634,
"step": 900
},
{
"epoch": 0.1591348689994725,
"grad_norm": 0.8754041790962219,
"learning_rate": 1.913884897384404e-05,
"loss": 0.0712,
"step": 905
},
{
"epoch": 0.1600140671707403,
"grad_norm": 0.7527337670326233,
"learning_rate": 1.9127250992879128e-05,
"loss": 0.0685,
"step": 910
},
{
"epoch": 0.1608932653420081,
"grad_norm": 0.8655832409858704,
"learning_rate": 1.9115578995006175e-05,
"loss": 0.0709,
"step": 915
},
{
"epoch": 0.1617724635132759,
"grad_norm": 0.5657609105110168,
"learning_rate": 1.9103833074878565e-05,
"loss": 0.0606,
"step": 920
},
{
"epoch": 0.1626516616845437,
"grad_norm": 0.5217536091804504,
"learning_rate": 1.909201332774916e-05,
"loss": 0.0577,
"step": 925
},
{
"epoch": 0.1635308598558115,
"grad_norm": 0.5291991233825684,
"learning_rate": 1.908011984946949e-05,
"loss": 0.0574,
"step": 930
},
{
"epoch": 0.1644100580270793,
"grad_norm": 0.16585160791873932,
"learning_rate": 1.9068152736489036e-05,
"loss": 0.0588,
"step": 935
},
{
"epoch": 0.1652892561983471,
"grad_norm": 0.5434625744819641,
"learning_rate": 1.9056112085854397e-05,
"loss": 0.0645,
"step": 940
},
{
"epoch": 0.16616845436961492,
"grad_norm": 0.685371458530426,
"learning_rate": 1.9043997995208525e-05,
"loss": 0.0452,
"step": 945
},
{
"epoch": 0.16704765254088272,
"grad_norm": 0.3393997251987457,
"learning_rate": 1.9031810562789927e-05,
"loss": 0.0569,
"step": 950
},
{
"epoch": 0.16792685071215052,
"grad_norm": 0.281892329454422,
"learning_rate": 1.901954988743188e-05,
"loss": 0.0686,
"step": 955
},
{
"epoch": 0.16880604888341832,
"grad_norm": 0.5703971982002258,
"learning_rate": 1.9007216068561605e-05,
"loss": 0.0667,
"step": 960
},
{
"epoch": 0.16968524705468613,
"grad_norm": 0.6741696000099182,
"learning_rate": 1.899480920619949e-05,
"loss": 0.0551,
"step": 965
},
{
"epoch": 0.17056444522595393,
"grad_norm": 1.3032324314117432,
"learning_rate": 1.8982329400958254e-05,
"loss": 0.066,
"step": 970
},
{
"epoch": 0.17144364339722173,
"grad_norm": 0.8134323954582214,
"learning_rate": 1.8969776754042157e-05,
"loss": 0.0704,
"step": 975
},
{
"epoch": 0.17232284156848954,
"grad_norm": 0.6495192646980286,
"learning_rate": 1.895715136724615e-05,
"loss": 0.0687,
"step": 980
},
{
"epoch": 0.17320203973975734,
"grad_norm": 0.49367162585258484,
"learning_rate": 1.8944453342955064e-05,
"loss": 0.0555,
"step": 985
},
{
"epoch": 0.17408123791102514,
"grad_norm": 0.2508549392223358,
"learning_rate": 1.8931682784142792e-05,
"loss": 0.0694,
"step": 990
},
{
"epoch": 0.17496043608229295,
"grad_norm": 0.3268815875053406,
"learning_rate": 1.891883979437143e-05,
"loss": 0.058,
"step": 995
},
{
"epoch": 0.17583963425356075,
"grad_norm": 0.6226515173912048,
"learning_rate": 1.8905924477790452e-05,
"loss": 0.0661,
"step": 1000
},
{
"epoch": 0.17671883242482855,
"grad_norm": 0.3348465859889984,
"learning_rate": 1.8892936939135863e-05,
"loss": 0.0651,
"step": 1005
},
{
"epoch": 0.17759803059609636,
"grad_norm": 0.7326810956001282,
"learning_rate": 1.887987728372935e-05,
"loss": 0.0695,
"step": 1010
},
{
"epoch": 0.17847722876736416,
"grad_norm": 0.6014009714126587,
"learning_rate": 1.8866745617477423e-05,
"loss": 0.063,
"step": 1015
},
{
"epoch": 0.17935642693863196,
"grad_norm": 1.2527378797531128,
"learning_rate": 1.8853542046870558e-05,
"loss": 0.0631,
"step": 1020
},
{
"epoch": 0.18023562510989977,
"grad_norm": 0.3176214396953583,
"learning_rate": 1.8840266678982343e-05,
"loss": 0.0562,
"step": 1025
},
{
"epoch": 0.18111482328116757,
"grad_norm": 0.26997071504592896,
"learning_rate": 1.8826919621468595e-05,
"loss": 0.0618,
"step": 1030
},
{
"epoch": 0.18199402145243537,
"grad_norm": 0.2553798258304596,
"learning_rate": 1.8813500982566498e-05,
"loss": 0.0622,
"step": 1035
},
{
"epoch": 0.18287321962370318,
"grad_norm": 0.9949320554733276,
"learning_rate": 1.8800010871093718e-05,
"loss": 0.0748,
"step": 1040
},
{
"epoch": 0.18375241779497098,
"grad_norm": 0.5384786128997803,
"learning_rate": 1.8786449396447528e-05,
"loss": 0.0757,
"step": 1045
},
{
"epoch": 0.18463161596623878,
"grad_norm": 0.14809344708919525,
"learning_rate": 1.8772816668603907e-05,
"loss": 0.0675,
"step": 1050
},
{
"epoch": 0.18551081413750659,
"grad_norm": 0.764203667640686,
"learning_rate": 1.8759112798116673e-05,
"loss": 0.0615,
"step": 1055
},
{
"epoch": 0.1863900123087744,
"grad_norm": 0.18247248232364655,
"learning_rate": 1.874533789611655e-05,
"loss": 0.061,
"step": 1060
},
{
"epoch": 0.1872692104800422,
"grad_norm": 0.9988198280334473,
"learning_rate": 1.873149207431031e-05,
"loss": 0.0591,
"step": 1065
},
{
"epoch": 0.18814840865131,
"grad_norm": 0.07756359130144119,
"learning_rate": 1.871757544497983e-05,
"loss": 0.0641,
"step": 1070
},
{
"epoch": 0.1890276068225778,
"grad_norm": 0.7131006121635437,
"learning_rate": 1.870358812098121e-05,
"loss": 0.0581,
"step": 1075
},
{
"epoch": 0.1899068049938456,
"grad_norm": 0.3485928177833557,
"learning_rate": 1.868953021574382e-05,
"loss": 0.0645,
"step": 1080
},
{
"epoch": 0.1907860031651134,
"grad_norm": 0.16775915026664734,
"learning_rate": 1.8675401843269438e-05,
"loss": 0.0644,
"step": 1085
},
{
"epoch": 0.1916652013363812,
"grad_norm": 0.3290361762046814,
"learning_rate": 1.866120311813126e-05,
"loss": 0.0619,
"step": 1090
},
{
"epoch": 0.192544399507649,
"grad_norm": 1.0206267833709717,
"learning_rate": 1.8646934155473025e-05,
"loss": 0.0854,
"step": 1095
},
{
"epoch": 0.19342359767891681,
"grad_norm": 0.6392635703086853,
"learning_rate": 1.8632595071008044e-05,
"loss": 0.0647,
"step": 1100
},
{
"epoch": 0.19430279585018465,
"grad_norm": 0.4575440287590027,
"learning_rate": 1.8618185981018292e-05,
"loss": 0.065,
"step": 1105
},
{
"epoch": 0.19518199402145245,
"grad_norm": 0.5402662754058838,
"learning_rate": 1.8603707002353436e-05,
"loss": 0.053,
"step": 1110
},
{
"epoch": 0.19606119219272025,
"grad_norm": 0.16452832520008087,
"learning_rate": 1.858915825242991e-05,
"loss": 0.0577,
"step": 1115
},
{
"epoch": 0.19694039036398805,
"grad_norm": 0.7707622647285461,
"learning_rate": 1.857453984922995e-05,
"loss": 0.0572,
"step": 1120
},
{
"epoch": 0.19781958853525586,
"grad_norm": 0.2900203466415405,
"learning_rate": 1.8559851911300638e-05,
"loss": 0.0534,
"step": 1125
},
{
"epoch": 0.19869878670652366,
"grad_norm": 0.2933928370475769,
"learning_rate": 1.854509455775295e-05,
"loss": 0.0534,
"step": 1130
},
{
"epoch": 0.19957798487779146,
"grad_norm": 1.2258199453353882,
"learning_rate": 1.8530267908260782e-05,
"loss": 0.0645,
"step": 1135
},
{
"epoch": 0.20045718304905927,
"grad_norm": 0.3073072135448456,
"learning_rate": 1.8515372083059982e-05,
"loss": 0.0672,
"step": 1140
},
{
"epoch": 0.20133638122032707,
"grad_norm": 0.23768655955791473,
"learning_rate": 1.850040720294737e-05,
"loss": 0.0573,
"step": 1145
},
{
"epoch": 0.20221557939159487,
"grad_norm": 0.6997068524360657,
"learning_rate": 1.8485373389279768e-05,
"loss": 0.0564,
"step": 1150
},
{
"epoch": 0.20309477756286268,
"grad_norm": 0.4729757308959961,
"learning_rate": 1.8470270763973004e-05,
"loss": 0.0588,
"step": 1155
},
{
"epoch": 0.20397397573413048,
"grad_norm": 0.19242537021636963,
"learning_rate": 1.845509944950094e-05,
"loss": 0.0532,
"step": 1160
},
{
"epoch": 0.20485317390539828,
"grad_norm": 0.1492680460214615,
"learning_rate": 1.8439859568894464e-05,
"loss": 0.0658,
"step": 1165
},
{
"epoch": 0.2057323720766661,
"grad_norm": 0.6383575201034546,
"learning_rate": 1.8424551245740493e-05,
"loss": 0.0563,
"step": 1170
},
{
"epoch": 0.2066115702479339,
"grad_norm": 0.9722626805305481,
"learning_rate": 1.8409174604180977e-05,
"loss": 0.0603,
"step": 1175
},
{
"epoch": 0.2074907684192017,
"grad_norm": 0.5511413812637329,
"learning_rate": 1.8393729768911894e-05,
"loss": 0.0534,
"step": 1180
},
{
"epoch": 0.2083699665904695,
"grad_norm": 0.3645865321159363,
"learning_rate": 1.837821686518223e-05,
"loss": 0.0601,
"step": 1185
},
{
"epoch": 0.2092491647617373,
"grad_norm": 0.43972018361091614,
"learning_rate": 1.8362636018792975e-05,
"loss": 0.049,
"step": 1190
},
{
"epoch": 0.2101283629330051,
"grad_norm": 0.34283024072647095,
"learning_rate": 1.8346987356096087e-05,
"loss": 0.0596,
"step": 1195
},
{
"epoch": 0.2110075611042729,
"grad_norm": 0.3272128701210022,
"learning_rate": 1.833127100399348e-05,
"loss": 0.0604,
"step": 1200
},
{
"epoch": 0.2118867592755407,
"grad_norm": 0.48746854066848755,
"learning_rate": 1.8315487089935995e-05,
"loss": 0.0505,
"step": 1205
},
{
"epoch": 0.2127659574468085,
"grad_norm": 0.21915239095687866,
"learning_rate": 1.8299635741922365e-05,
"loss": 0.0574,
"step": 1210
},
{
"epoch": 0.21364515561807632,
"grad_norm": 0.3507218360900879,
"learning_rate": 1.8283717088498157e-05,
"loss": 0.0651,
"step": 1215
},
{
"epoch": 0.21452435378934412,
"grad_norm": 0.712352454662323,
"learning_rate": 1.8267731258754765e-05,
"loss": 0.0564,
"step": 1220
},
{
"epoch": 0.21540355196061192,
"grad_norm": 0.3927139937877655,
"learning_rate": 1.8251678382328345e-05,
"loss": 0.0474,
"step": 1225
},
{
"epoch": 0.21628275013187973,
"grad_norm": 1.3225253820419312,
"learning_rate": 1.8235558589398756e-05,
"loss": 0.0826,
"step": 1230
},
{
"epoch": 0.21716194830314753,
"grad_norm": 1.0917742252349854,
"learning_rate": 1.8219372010688516e-05,
"loss": 0.0614,
"step": 1235
},
{
"epoch": 0.21804114647441533,
"grad_norm": 1.2095632553100586,
"learning_rate": 1.8203118777461735e-05,
"loss": 0.0569,
"step": 1240
},
{
"epoch": 0.21892034464568313,
"grad_norm": 0.1377822309732437,
"learning_rate": 1.8186799021523064e-05,
"loss": 0.063,
"step": 1245
},
{
"epoch": 0.21979954281695094,
"grad_norm": 1.2656950950622559,
"learning_rate": 1.81704128752166e-05,
"loss": 0.0664,
"step": 1250
},
{
"epoch": 0.22067874098821874,
"grad_norm": 0.21433009207248688,
"learning_rate": 1.815396047142485e-05,
"loss": 0.0636,
"step": 1255
},
{
"epoch": 0.22155793915948654,
"grad_norm": 0.34033942222595215,
"learning_rate": 1.8137441943567607e-05,
"loss": 0.0535,
"step": 1260
},
{
"epoch": 0.22243713733075435,
"grad_norm": 0.3627121150493622,
"learning_rate": 1.8120857425600914e-05,
"loss": 0.0596,
"step": 1265
},
{
"epoch": 0.22331633550202215,
"grad_norm": 0.6685848832130432,
"learning_rate": 1.8104207052015952e-05,
"loss": 0.0696,
"step": 1270
},
{
"epoch": 0.22419553367328995,
"grad_norm": 0.2353779375553131,
"learning_rate": 1.8087490957837947e-05,
"loss": 0.0536,
"step": 1275
},
{
"epoch": 0.22507473184455776,
"grad_norm": 0.403475821018219,
"learning_rate": 1.807070927862509e-05,
"loss": 0.0636,
"step": 1280
},
{
"epoch": 0.22595393001582556,
"grad_norm": 0.6222421526908875,
"learning_rate": 1.8053862150467417e-05,
"loss": 0.0558,
"step": 1285
},
{
"epoch": 0.22683312818709336,
"grad_norm": 0.3176823854446411,
"learning_rate": 1.803694970998574e-05,
"loss": 0.0566,
"step": 1290
},
{
"epoch": 0.22771232635836117,
"grad_norm": 0.7340067625045776,
"learning_rate": 1.8019972094330502e-05,
"loss": 0.0487,
"step": 1295
},
{
"epoch": 0.22859152452962897,
"grad_norm": 1.254428505897522,
"learning_rate": 1.8002929441180684e-05,
"loss": 0.0511,
"step": 1300
},
{
"epoch": 0.22947072270089677,
"grad_norm": 0.20499931275844574,
"learning_rate": 1.7985821888742687e-05,
"loss": 0.0648,
"step": 1305
},
{
"epoch": 0.23034992087216458,
"grad_norm": 0.4078265428543091,
"learning_rate": 1.7968649575749202e-05,
"loss": 0.047,
"step": 1310
},
{
"epoch": 0.23122911904343238,
"grad_norm": 0.15697845816612244,
"learning_rate": 1.79514126414581e-05,
"loss": 0.0488,
"step": 1315
},
{
"epoch": 0.23210831721470018,
"grad_norm": 0.6871252059936523,
"learning_rate": 1.7934111225651293e-05,
"loss": 0.0585,
"step": 1320
},
{
"epoch": 0.23298751538596799,
"grad_norm": 0.13073213398456573,
"learning_rate": 1.7916745468633593e-05,
"loss": 0.0576,
"step": 1325
},
{
"epoch": 0.2338667135572358,
"grad_norm": 0.150588259100914,
"learning_rate": 1.7899315511231598e-05,
"loss": 0.0572,
"step": 1330
},
{
"epoch": 0.2347459117285036,
"grad_norm": 0.40196287631988525,
"learning_rate": 1.7881821494792527e-05,
"loss": 0.0573,
"step": 1335
},
{
"epoch": 0.2356251098997714,
"grad_norm": 0.7040359973907471,
"learning_rate": 1.7864263561183085e-05,
"loss": 0.0653,
"step": 1340
},
{
"epoch": 0.2365043080710392,
"grad_norm": 1.1014829874038696,
"learning_rate": 1.78466418527883e-05,
"loss": 0.0509,
"step": 1345
},
{
"epoch": 0.23738350624230703,
"grad_norm": 0.2666812539100647,
"learning_rate": 1.782895651251039e-05,
"loss": 0.0585,
"step": 1350
},
{
"epoch": 0.23826270441357483,
"grad_norm": 0.9666887521743774,
"learning_rate": 1.781120768376759e-05,
"loss": 0.0588,
"step": 1355
},
{
"epoch": 0.23914190258484264,
"grad_norm": 0.7215674519538879,
"learning_rate": 1.7793395510492986e-05,
"loss": 0.0597,
"step": 1360
},
{
"epoch": 0.24002110075611044,
"grad_norm": 0.5669434070587158,
"learning_rate": 1.7775520137133354e-05,
"loss": 0.0516,
"step": 1365
},
{
"epoch": 0.24090029892737824,
"grad_norm": 0.38593825697898865,
"learning_rate": 1.775758170864799e-05,
"loss": 0.0485,
"step": 1370
},
{
"epoch": 0.24177949709864605,
"grad_norm": 0.5211871266365051,
"learning_rate": 1.7739580370507533e-05,
"loss": 0.0619,
"step": 1375
},
{
"epoch": 0.24265869526991385,
"grad_norm": 0.3265356719493866,
"learning_rate": 1.7721516268692776e-05,
"loss": 0.0593,
"step": 1380
},
{
"epoch": 0.24353789344118165,
"grad_norm": 0.5374659895896912,
"learning_rate": 1.77033895496935e-05,
"loss": 0.0666,
"step": 1385
},
{
"epoch": 0.24441709161244946,
"grad_norm": 0.26406246423721313,
"learning_rate": 1.768520036050727e-05,
"loss": 0.0493,
"step": 1390
},
{
"epoch": 0.24529628978371726,
"grad_norm": 0.38316601514816284,
"learning_rate": 1.7666948848638257e-05,
"loss": 0.0503,
"step": 1395
},
{
"epoch": 0.24617548795498506,
"grad_norm": 0.3633623719215393,
"learning_rate": 1.7648635162096022e-05,
"loss": 0.0569,
"step": 1400
},
{
"epoch": 0.24705468612625286,
"grad_norm": 0.941490650177002,
"learning_rate": 1.763025944939434e-05,
"loss": 0.058,
"step": 1405
},
{
"epoch": 0.24793388429752067,
"grad_norm": 0.21693024039268494,
"learning_rate": 1.7611821859549977e-05,
"loss": 0.0539,
"step": 1410
},
{
"epoch": 0.24881308246878847,
"grad_norm": 0.7575194835662842,
"learning_rate": 1.7593322542081486e-05,
"loss": 0.0705,
"step": 1415
},
{
"epoch": 0.24969228064005627,
"grad_norm": 0.3184313178062439,
"learning_rate": 1.7574761647008004e-05,
"loss": 0.0655,
"step": 1420
},
{
"epoch": 0.2505714788113241,
"grad_norm": 0.9622363448143005,
"learning_rate": 1.7556139324848024e-05,
"loss": 0.0653,
"step": 1425
},
{
"epoch": 0.2514506769825919,
"grad_norm": 0.3079875111579895,
"learning_rate": 1.753745572661817e-05,
"loss": 0.0497,
"step": 1430
},
{
"epoch": 0.2523298751538597,
"grad_norm": 0.1410188525915146,
"learning_rate": 1.7518711003832003e-05,
"loss": 0.0715,
"step": 1435
},
{
"epoch": 0.2532090733251275,
"grad_norm": 0.5498221516609192,
"learning_rate": 1.749990530849875e-05,
"loss": 0.0705,
"step": 1440
},
{
"epoch": 0.2540882714963953,
"grad_norm": 0.27683818340301514,
"learning_rate": 1.748103879312209e-05,
"loss": 0.06,
"step": 1445
},
{
"epoch": 0.2549674696676631,
"grad_norm": 1.3148400783538818,
"learning_rate": 1.7462111610698934e-05,
"loss": 0.0629,
"step": 1450
},
{
"epoch": 0.2558466678389309,
"grad_norm": 0.4923277199268341,
"learning_rate": 1.744312391471816e-05,
"loss": 0.0573,
"step": 1455
},
{
"epoch": 0.2567258660101987,
"grad_norm": 0.8244169354438782,
"learning_rate": 1.7424075859159376e-05,
"loss": 0.0561,
"step": 1460
},
{
"epoch": 0.2576050641814665,
"grad_norm": 0.2395920604467392,
"learning_rate": 1.7404967598491674e-05,
"loss": 0.0643,
"step": 1465
},
{
"epoch": 0.2584842623527343,
"grad_norm": 0.3752864897251129,
"learning_rate": 1.7385799287672375e-05,
"loss": 0.0634,
"step": 1470
},
{
"epoch": 0.2593634605240021,
"grad_norm": 1.0273178815841675,
"learning_rate": 1.736657108214578e-05,
"loss": 0.0613,
"step": 1475
},
{
"epoch": 0.2602426586952699,
"grad_norm": 0.9190396666526794,
"learning_rate": 1.734728313784189e-05,
"loss": 0.0623,
"step": 1480
},
{
"epoch": 0.2611218568665377,
"grad_norm": 0.9993478655815125,
"learning_rate": 1.732793561117517e-05,
"loss": 0.0421,
"step": 1485
},
{
"epoch": 0.2620010550378055,
"grad_norm": 0.4666178226470947,
"learning_rate": 1.7308528659043243e-05,
"loss": 0.0531,
"step": 1490
},
{
"epoch": 0.2628802532090733,
"grad_norm": 0.24554145336151123,
"learning_rate": 1.7289062438825665e-05,
"loss": 0.0514,
"step": 1495
},
{
"epoch": 0.2637594513803411,
"grad_norm": 0.29805853962898254,
"learning_rate": 1.7269537108382605e-05,
"loss": 0.0526,
"step": 1500
},
{
"epoch": 0.26463864955160893,
"grad_norm": 0.9100229144096375,
"learning_rate": 1.7249952826053582e-05,
"loss": 0.0653,
"step": 1505
},
{
"epoch": 0.26551784772287673,
"grad_norm": 0.2765738368034363,
"learning_rate": 1.72303097506562e-05,
"loss": 0.0694,
"step": 1510
},
{
"epoch": 0.26639704589414454,
"grad_norm": 0.12102984637022018,
"learning_rate": 1.721060804148482e-05,
"loss": 0.0619,
"step": 1515
},
{
"epoch": 0.26727624406541234,
"grad_norm": 0.2673247456550598,
"learning_rate": 1.7190847858309304e-05,
"loss": 0.0536,
"step": 1520
},
{
"epoch": 0.26815544223668014,
"grad_norm": 0.6815070509910583,
"learning_rate": 1.71710293613737e-05,
"loss": 0.0521,
"step": 1525
},
{
"epoch": 0.26903464040794794,
"grad_norm": 0.8095347881317139,
"learning_rate": 1.7151152711394954e-05,
"loss": 0.0628,
"step": 1530
},
{
"epoch": 0.26991383857921575,
"grad_norm": 0.7218222618103027,
"learning_rate": 1.7131218069561594e-05,
"loss": 0.0405,
"step": 1535
},
{
"epoch": 0.27079303675048355,
"grad_norm": 0.6927086710929871,
"learning_rate": 1.7111225597532428e-05,
"loss": 0.0647,
"step": 1540
},
{
"epoch": 0.27167223492175135,
"grad_norm": 0.8299700617790222,
"learning_rate": 1.7091175457435242e-05,
"loss": 0.0648,
"step": 1545
},
{
"epoch": 0.27255143309301916,
"grad_norm": 0.16689668595790863,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.0481,
"step": 1550
},
{
"epoch": 0.27343063126428696,
"grad_norm": 0.9474055767059326,
"learning_rate": 1.7050902823884904e-05,
"loss": 0.056,
"step": 1555
},
{
"epoch": 0.27430982943555476,
"grad_norm": 0.2540503740310669,
"learning_rate": 1.7030680657020314e-05,
"loss": 0.0642,
"step": 1560
},
{
"epoch": 0.27518902760682257,
"grad_norm": 0.24232099950313568,
"learning_rate": 1.701040147526219e-05,
"loss": 0.0531,
"step": 1565
},
{
"epoch": 0.27606822577809037,
"grad_norm": 1.1714109182357788,
"learning_rate": 1.6990065443063364e-05,
"loss": 0.0471,
"step": 1570
},
{
"epoch": 0.2769474239493582,
"grad_norm": 0.2802835702896118,
"learning_rate": 1.6969672725337706e-05,
"loss": 0.0678,
"step": 1575
},
{
"epoch": 0.277826622120626,
"grad_norm": 0.6939117312431335,
"learning_rate": 1.6949223487458764e-05,
"loss": 0.0576,
"step": 1580
},
{
"epoch": 0.2787058202918938,
"grad_norm": 0.558993935585022,
"learning_rate": 1.692871789525844e-05,
"loss": 0.0567,
"step": 1585
},
{
"epoch": 0.2795850184631616,
"grad_norm": 0.6370697617530823,
"learning_rate": 1.6908156115025626e-05,
"loss": 0.0578,
"step": 1590
},
{
"epoch": 0.2804642166344294,
"grad_norm": 0.22744275629520416,
"learning_rate": 1.6887538313504883e-05,
"loss": 0.0594,
"step": 1595
},
{
"epoch": 0.2813434148056972,
"grad_norm": 0.9439639449119568,
"learning_rate": 1.686686465789507e-05,
"loss": 0.0473,
"step": 1600
},
{
"epoch": 0.282222612976965,
"grad_norm": 0.5774978995323181,
"learning_rate": 1.6846135315847978e-05,
"loss": 0.0457,
"step": 1605
},
{
"epoch": 0.2831018111482328,
"grad_norm": 1.2410329580307007,
"learning_rate": 1.6825350455467e-05,
"loss": 0.0602,
"step": 1610
},
{
"epoch": 0.2839810093195006,
"grad_norm": 0.5602573752403259,
"learning_rate": 1.6804510245305745e-05,
"loss": 0.0533,
"step": 1615
},
{
"epoch": 0.2848602074907684,
"grad_norm": 0.6151893734931946,
"learning_rate": 1.678361485436668e-05,
"loss": 0.0549,
"step": 1620
},
{
"epoch": 0.2857394056620362,
"grad_norm": 0.17069809138774872,
"learning_rate": 1.676266445209975e-05,
"loss": 0.0548,
"step": 1625
},
{
"epoch": 0.286618603833304,
"grad_norm": 0.2259790599346161,
"learning_rate": 1.674165920840102e-05,
"loss": 0.056,
"step": 1630
},
{
"epoch": 0.2874978020045718,
"grad_norm": 0.17054542899131775,
"learning_rate": 1.6720599293611287e-05,
"loss": 0.0637,
"step": 1635
},
{
"epoch": 0.2883770001758396,
"grad_norm": 0.38651248812675476,
"learning_rate": 1.6699484878514693e-05,
"loss": 0.0517,
"step": 1640
},
{
"epoch": 0.2892561983471074,
"grad_norm": 0.3397147059440613,
"learning_rate": 1.6678316134337362e-05,
"loss": 0.0545,
"step": 1645
},
{
"epoch": 0.2901353965183752,
"grad_norm": 0.28409913182258606,
"learning_rate": 1.6657093232745973e-05,
"loss": 0.0562,
"step": 1650
},
{
"epoch": 0.291014594689643,
"grad_norm": 0.8544853925704956,
"learning_rate": 1.6635816345846413e-05,
"loss": 0.0641,
"step": 1655
},
{
"epoch": 0.29189379286091083,
"grad_norm": 0.35938528180122375,
"learning_rate": 1.661448564618235e-05,
"loss": 0.0589,
"step": 1660
},
{
"epoch": 0.29277299103217863,
"grad_norm": 0.3898780345916748,
"learning_rate": 1.6593101306733847e-05,
"loss": 0.059,
"step": 1665
},
{
"epoch": 0.29365218920344643,
"grad_norm": 0.34503045678138733,
"learning_rate": 1.6571663500915957e-05,
"loss": 0.0702,
"step": 1670
},
{
"epoch": 0.29453138737471424,
"grad_norm": 0.06647461652755737,
"learning_rate": 1.6550172402577304e-05,
"loss": 0.0618,
"step": 1675
},
{
"epoch": 0.29541058554598204,
"grad_norm": 0.3430112600326538,
"learning_rate": 1.6528628185998697e-05,
"loss": 0.0587,
"step": 1680
},
{
"epoch": 0.29628978371724984,
"grad_norm": 0.773381769657135,
"learning_rate": 1.65070310258917e-05,
"loss": 0.0608,
"step": 1685
},
{
"epoch": 0.29716898188851765,
"grad_norm": 0.6421689391136169,
"learning_rate": 1.6485381097397223e-05,
"loss": 0.0558,
"step": 1690
},
{
"epoch": 0.29804818005978545,
"grad_norm": 0.6023097634315491,
"learning_rate": 1.646367857608409e-05,
"loss": 0.0597,
"step": 1695
},
{
"epoch": 0.29892737823105325,
"grad_norm": 0.47872084379196167,
"learning_rate": 1.6441923637947627e-05,
"loss": 0.0647,
"step": 1700
},
{
"epoch": 0.29980657640232106,
"grad_norm": 1.3520686626434326,
"learning_rate": 1.6420116459408237e-05,
"loss": 0.0621,
"step": 1705
},
{
"epoch": 0.30068577457358886,
"grad_norm": 0.354427307844162,
"learning_rate": 1.6398257217309956e-05,
"loss": 0.0591,
"step": 1710
},
{
"epoch": 0.30156497274485666,
"grad_norm": 0.4515109658241272,
"learning_rate": 1.6376346088919032e-05,
"loss": 0.0444,
"step": 1715
},
{
"epoch": 0.3024441709161245,
"grad_norm": 0.8840310573577881,
"learning_rate": 1.6354383251922473e-05,
"loss": 0.069,
"step": 1720
},
{
"epoch": 0.3033233690873923,
"grad_norm": 0.13943177461624146,
"learning_rate": 1.633236888442663e-05,
"loss": 0.0621,
"step": 1725
},
{
"epoch": 0.30420256725866013,
"grad_norm": 0.2890174984931946,
"learning_rate": 1.631030316495572e-05,
"loss": 0.0676,
"step": 1730
},
{
"epoch": 0.30508176542992793,
"grad_norm": 0.18880678713321686,
"learning_rate": 1.6288186272450407e-05,
"loss": 0.0636,
"step": 1735
},
{
"epoch": 0.30596096360119573,
"grad_norm": 0.5174923539161682,
"learning_rate": 1.626601838626634e-05,
"loss": 0.0638,
"step": 1740
},
{
"epoch": 0.30684016177246354,
"grad_norm": 0.47316744923591614,
"learning_rate": 1.624379968617269e-05,
"loss": 0.0571,
"step": 1745
},
{
"epoch": 0.30771935994373134,
"grad_norm": 0.4440658986568451,
"learning_rate": 1.6221530352350713e-05,
"loss": 0.0551,
"step": 1750
},
{
"epoch": 0.30859855811499914,
"grad_norm": 0.5249147415161133,
"learning_rate": 1.619921056539226e-05,
"loss": 0.0559,
"step": 1755
},
{
"epoch": 0.30947775628626695,
"grad_norm": 0.6782397627830505,
"learning_rate": 1.6176840506298345e-05,
"loss": 0.0695,
"step": 1760
},
{
"epoch": 0.31035695445753475,
"grad_norm": 0.6498162150382996,
"learning_rate": 1.615442035647765e-05,
"loss": 0.0547,
"step": 1765
},
{
"epoch": 0.31123615262880255,
"grad_norm": 0.458238422870636,
"learning_rate": 1.6131950297745075e-05,
"loss": 0.0608,
"step": 1770
},
{
"epoch": 0.31211535080007036,
"grad_norm": 0.15339304506778717,
"learning_rate": 1.6109430512320235e-05,
"loss": 0.0583,
"step": 1775
},
{
"epoch": 0.31299454897133816,
"grad_norm": 0.3111644387245178,
"learning_rate": 1.6086861182826024e-05,
"loss": 0.051,
"step": 1780
},
{
"epoch": 0.31387374714260596,
"grad_norm": 1.0354893207550049,
"learning_rate": 1.6064242492287095e-05,
"loss": 0.065,
"step": 1785
},
{
"epoch": 0.31475294531387377,
"grad_norm": 0.2242657095193863,
"learning_rate": 1.6041574624128392e-05,
"loss": 0.0473,
"step": 1790
},
{
"epoch": 0.31563214348514157,
"grad_norm": 0.2808036208152771,
"learning_rate": 1.6018857762173672e-05,
"loss": 0.0537,
"step": 1795
},
{
"epoch": 0.3165113416564094,
"grad_norm": 0.3030805289745331,
"learning_rate": 1.5996092090643993e-05,
"loss": 0.0529,
"step": 1800
},
{
"epoch": 0.3173905398276772,
"grad_norm": 1.5149699449539185,
"learning_rate": 1.597327779415624e-05,
"loss": 0.0541,
"step": 1805
},
{
"epoch": 0.318269737998945,
"grad_norm": 1.2089002132415771,
"learning_rate": 1.595041505772162e-05,
"loss": 0.0748,
"step": 1810
},
{
"epoch": 0.3191489361702128,
"grad_norm": 0.28580719232559204,
"learning_rate": 1.5927504066744147e-05,
"loss": 0.0569,
"step": 1815
},
{
"epoch": 0.3200281343414806,
"grad_norm": 0.7665076851844788,
"learning_rate": 1.590454500701917e-05,
"loss": 0.0648,
"step": 1820
},
{
"epoch": 0.3209073325127484,
"grad_norm": 0.771531343460083,
"learning_rate": 1.5881538064731838e-05,
"loss": 0.0624,
"step": 1825
},
{
"epoch": 0.3217865306840162,
"grad_norm": 0.11554717272520065,
"learning_rate": 1.58584834264556e-05,
"loss": 0.0474,
"step": 1830
},
{
"epoch": 0.322665728855284,
"grad_norm": 0.13242988288402557,
"learning_rate": 1.5835381279150705e-05,
"loss": 0.055,
"step": 1835
},
{
"epoch": 0.3235449270265518,
"grad_norm": 1.2687695026397705,
"learning_rate": 1.5812231810162656e-05,
"loss": 0.0595,
"step": 1840
},
{
"epoch": 0.3244241251978196,
"grad_norm": 0.23642951250076294,
"learning_rate": 1.5789035207220725e-05,
"loss": 0.0433,
"step": 1845
},
{
"epoch": 0.3253033233690874,
"grad_norm": 0.6303196549415588,
"learning_rate": 1.5765791658436406e-05,
"loss": 0.0495,
"step": 1850
},
{
"epoch": 0.3261825215403552,
"grad_norm": 0.23932726681232452,
"learning_rate": 1.5742501352301894e-05,
"loss": 0.0558,
"step": 1855
},
{
"epoch": 0.327061719711623,
"grad_norm": 0.4368959069252014,
"learning_rate": 1.5719164477688566e-05,
"loss": 0.0666,
"step": 1860
},
{
"epoch": 0.3279409178828908,
"grad_norm": 0.2366788238286972,
"learning_rate": 1.5695781223845442e-05,
"loss": 0.0716,
"step": 1865
},
{
"epoch": 0.3288201160541586,
"grad_norm": 0.643233060836792,
"learning_rate": 1.5672351780397653e-05,
"loss": 0.0524,
"step": 1870
},
{
"epoch": 0.3296993142254264,
"grad_norm": 0.523089587688446,
"learning_rate": 1.5648876337344898e-05,
"loss": 0.0615,
"step": 1875
},
{
"epoch": 0.3305785123966942,
"grad_norm": 0.18103045225143433,
"learning_rate": 1.5625355085059907e-05,
"loss": 0.0622,
"step": 1880
},
{
"epoch": 0.331457710567962,
"grad_norm": 0.35785582661628723,
"learning_rate": 1.5601788214286905e-05,
"loss": 0.0578,
"step": 1885
},
{
"epoch": 0.33233690873922983,
"grad_norm": 0.586683988571167,
"learning_rate": 1.557817591614005e-05,
"loss": 0.059,
"step": 1890
},
{
"epoch": 0.33321610691049763,
"grad_norm": 0.43284872174263,
"learning_rate": 1.555451838210189e-05,
"loss": 0.0553,
"step": 1895
},
{
"epoch": 0.33409530508176544,
"grad_norm": 0.44119471311569214,
"learning_rate": 1.553081580402182e-05,
"loss": 0.0563,
"step": 1900
},
{
"epoch": 0.33497450325303324,
"grad_norm": 0.4126788377761841,
"learning_rate": 1.55070683741145e-05,
"loss": 0.0564,
"step": 1905
},
{
"epoch": 0.33585370142430104,
"grad_norm": 0.581628680229187,
"learning_rate": 1.548327628495833e-05,
"loss": 0.0528,
"step": 1910
},
{
"epoch": 0.33673289959556885,
"grad_norm": 0.49338245391845703,
"learning_rate": 1.5459439729493864e-05,
"loss": 0.046,
"step": 1915
},
{
"epoch": 0.33761209776683665,
"grad_norm": 0.43671730160713196,
"learning_rate": 1.543555890102226e-05,
"loss": 0.062,
"step": 1920
},
{
"epoch": 0.33849129593810445,
"grad_norm": 0.6600947976112366,
"learning_rate": 1.5411633993203695e-05,
"loss": 0.0616,
"step": 1925
},
{
"epoch": 0.33937049410937226,
"grad_norm": 0.6367527842521667,
"learning_rate": 1.538766520005581e-05,
"loss": 0.0621,
"step": 1930
},
{
"epoch": 0.34024969228064006,
"grad_norm": 0.39215588569641113,
"learning_rate": 1.536365271595212e-05,
"loss": 0.0659,
"step": 1935
},
{
"epoch": 0.34112889045190786,
"grad_norm": 1.8016176223754883,
"learning_rate": 1.5339596735620485e-05,
"loss": 0.0596,
"step": 1940
},
{
"epoch": 0.34200808862317567,
"grad_norm": 0.7933741807937622,
"learning_rate": 1.5315497454141446e-05,
"loss": 0.0602,
"step": 1945
},
{
"epoch": 0.34288728679444347,
"grad_norm": 0.2943550944328308,
"learning_rate": 1.529135506694673e-05,
"loss": 0.0514,
"step": 1950
},
{
"epoch": 0.34376648496571127,
"grad_norm": 0.20394988358020782,
"learning_rate": 1.526716976981761e-05,
"loss": 0.0613,
"step": 1955
},
{
"epoch": 0.3446456831369791,
"grad_norm": 0.3414583206176758,
"learning_rate": 1.5242941758883341e-05,
"loss": 0.0446,
"step": 1960
},
{
"epoch": 0.3455248813082469,
"grad_norm": 0.6178276538848877,
"learning_rate": 1.5218671230619558e-05,
"loss": 0.0586,
"step": 1965
},
{
"epoch": 0.3464040794795147,
"grad_norm": 0.8685587644577026,
"learning_rate": 1.5194358381846686e-05,
"loss": 0.0577,
"step": 1970
},
{
"epoch": 0.3472832776507825,
"grad_norm": 0.6029766201972961,
"learning_rate": 1.5170003409728358e-05,
"loss": 0.053,
"step": 1975
},
{
"epoch": 0.3481624758220503,
"grad_norm": 0.33781400322914124,
"learning_rate": 1.5145606511769788e-05,
"loss": 0.0625,
"step": 1980
},
{
"epoch": 0.3490416739933181,
"grad_norm": 0.541808009147644,
"learning_rate": 1.5121167885816202e-05,
"loss": 0.0505,
"step": 1985
},
{
"epoch": 0.3499208721645859,
"grad_norm": 1.0590039491653442,
"learning_rate": 1.50966877300512e-05,
"loss": 0.0698,
"step": 1990
},
{
"epoch": 0.3508000703358537,
"grad_norm": 0.4376682937145233,
"learning_rate": 1.5072166242995177e-05,
"loss": 0.066,
"step": 1995
},
{
"epoch": 0.3516792685071215,
"grad_norm": 0.44992053508758545,
"learning_rate": 1.5047603623503695e-05,
"loss": 0.074,
"step": 2000
},
{
"epoch": 0.3525584666783893,
"grad_norm": 0.5252031683921814,
"learning_rate": 1.5023000070765886e-05,
"loss": 0.0681,
"step": 2005
},
{
"epoch": 0.3534376648496571,
"grad_norm": 0.6418195366859436,
"learning_rate": 1.4998355784302816e-05,
"loss": 0.0655,
"step": 2010
},
{
"epoch": 0.3543168630209249,
"grad_norm": 1.0612142086029053,
"learning_rate": 1.4973670963965883e-05,
"loss": 0.0681,
"step": 2015
},
{
"epoch": 0.3551960611921927,
"grad_norm": 0.288524866104126,
"learning_rate": 1.49489458099352e-05,
"loss": 0.0472,
"step": 2020
},
{
"epoch": 0.3560752593634605,
"grad_norm": 0.44207853078842163,
"learning_rate": 1.4924180522717952e-05,
"loss": 0.0629,
"step": 2025
},
{
"epoch": 0.3569544575347283,
"grad_norm": 0.501887857913971,
"learning_rate": 1.4899375303146793e-05,
"loss": 0.0467,
"step": 2030
},
{
"epoch": 0.3578336557059961,
"grad_norm": 0.3725419044494629,
"learning_rate": 1.4874530352378193e-05,
"loss": 0.0592,
"step": 2035
},
{
"epoch": 0.3587128538772639,
"grad_norm": 0.3303128480911255,
"learning_rate": 1.4849645871890832e-05,
"loss": 0.0453,
"step": 2040
},
{
"epoch": 0.35959205204853173,
"grad_norm": 0.12348782271146774,
"learning_rate": 1.4824722063483944e-05,
"loss": 0.0434,
"step": 2045
},
{
"epoch": 0.36047125021979953,
"grad_norm": 0.381765753030777,
"learning_rate": 1.4799759129275703e-05,
"loss": 0.0497,
"step": 2050
},
{
"epoch": 0.36135044839106734,
"grad_norm": 0.24817384779453278,
"learning_rate": 1.477475727170156e-05,
"loss": 0.0495,
"step": 2055
},
{
"epoch": 0.36222964656233514,
"grad_norm": 0.3029944896697998,
"learning_rate": 1.4749716693512612e-05,
"loss": 0.0463,
"step": 2060
},
{
"epoch": 0.36310884473360294,
"grad_norm": 0.9858243465423584,
"learning_rate": 1.4724637597773969e-05,
"loss": 0.0769,
"step": 2065
},
{
"epoch": 0.36398804290487075,
"grad_norm": 0.38801249861717224,
"learning_rate": 1.469952018786309e-05,
"loss": 0.0472,
"step": 2070
},
{
"epoch": 0.36486724107613855,
"grad_norm": 0.15492072701454163,
"learning_rate": 1.467436466746814e-05,
"loss": 0.0574,
"step": 2075
},
{
"epoch": 0.36574643924740635,
"grad_norm": 0.8615152835845947,
"learning_rate": 1.464917124058634e-05,
"loss": 0.0651,
"step": 2080
},
{
"epoch": 0.36662563741867416,
"grad_norm": 1.6309150457382202,
"learning_rate": 1.4623940111522315e-05,
"loss": 0.0559,
"step": 2085
},
{
"epoch": 0.36750483558994196,
"grad_norm": 0.1200883612036705,
"learning_rate": 1.4598671484886423e-05,
"loss": 0.049,
"step": 2090
},
{
"epoch": 0.36838403376120976,
"grad_norm": 0.3056308627128601,
"learning_rate": 1.4573365565593121e-05,
"loss": 0.0514,
"step": 2095
},
{
"epoch": 0.36926323193247756,
"grad_norm": 0.3354267179965973,
"learning_rate": 1.4548022558859281e-05,
"loss": 0.0528,
"step": 2100
},
{
"epoch": 0.37014243010374537,
"grad_norm": 0.7310557961463928,
"learning_rate": 1.4522642670202528e-05,
"loss": 0.0676,
"step": 2105
},
{
"epoch": 0.37102162827501317,
"grad_norm": 0.6339288353919983,
"learning_rate": 1.4497226105439586e-05,
"loss": 0.0714,
"step": 2110
},
{
"epoch": 0.371900826446281,
"grad_norm": 0.2613738179206848,
"learning_rate": 1.44717730706846e-05,
"loss": 0.0586,
"step": 2115
},
{
"epoch": 0.3727800246175488,
"grad_norm": 0.4906211793422699,
"learning_rate": 1.4446283772347475e-05,
"loss": 0.0599,
"step": 2120
},
{
"epoch": 0.3736592227888166,
"grad_norm": 0.46116968989372253,
"learning_rate": 1.4420758417132177e-05,
"loss": 0.0537,
"step": 2125
},
{
"epoch": 0.3745384209600844,
"grad_norm": 0.5634986758232117,
"learning_rate": 1.4395197212035078e-05,
"loss": 0.0552,
"step": 2130
},
{
"epoch": 0.3754176191313522,
"grad_norm": 0.14335110783576965,
"learning_rate": 1.4369600364343286e-05,
"loss": 0.0587,
"step": 2135
},
{
"epoch": 0.37629681730262,
"grad_norm": 0.17126217484474182,
"learning_rate": 1.434396808163293e-05,
"loss": 0.066,
"step": 2140
},
{
"epoch": 0.3771760154738878,
"grad_norm": 0.5141481161117554,
"learning_rate": 1.4318300571767514e-05,
"loss": 0.058,
"step": 2145
},
{
"epoch": 0.3780552136451556,
"grad_norm": 0.19028577208518982,
"learning_rate": 1.4292598042896204e-05,
"loss": 0.0667,
"step": 2150
},
{
"epoch": 0.3789344118164234,
"grad_norm": 0.20005124807357788,
"learning_rate": 1.4266860703452156e-05,
"loss": 0.0519,
"step": 2155
},
{
"epoch": 0.3798136099876912,
"grad_norm": 0.36462101340293884,
"learning_rate": 1.4241088762150817e-05,
"loss": 0.0583,
"step": 2160
},
{
"epoch": 0.380692808158959,
"grad_norm": 0.26748377084732056,
"learning_rate": 1.4215282427988242e-05,
"loss": 0.0609,
"step": 2165
},
{
"epoch": 0.3815720063302268,
"grad_norm": 0.28044751286506653,
"learning_rate": 1.4189441910239383e-05,
"loss": 0.053,
"step": 2170
},
{
"epoch": 0.3824512045014946,
"grad_norm": 0.5757772326469421,
"learning_rate": 1.4163567418456408e-05,
"loss": 0.0651,
"step": 2175
},
{
"epoch": 0.3833304026727624,
"grad_norm": 0.6958622336387634,
"learning_rate": 1.4137659162466999e-05,
"loss": 0.0529,
"step": 2180
},
{
"epoch": 0.3842096008440302,
"grad_norm": 0.7717348337173462,
"learning_rate": 1.4111717352372635e-05,
"loss": 0.0498,
"step": 2185
},
{
"epoch": 0.385088799015298,
"grad_norm": 0.4615864157676697,
"learning_rate": 1.408574219854692e-05,
"loss": 0.0619,
"step": 2190
},
{
"epoch": 0.3859679971865658,
"grad_norm": 0.20736804604530334,
"learning_rate": 1.405973391163383e-05,
"loss": 0.0516,
"step": 2195
},
{
"epoch": 0.38684719535783363,
"grad_norm": 0.9234808087348938,
"learning_rate": 1.4033692702546056e-05,
"loss": 0.0553,
"step": 2200
},
{
"epoch": 0.38772639352910143,
"grad_norm": 0.9873343110084534,
"learning_rate": 1.4007618782463252e-05,
"loss": 0.0683,
"step": 2205
},
{
"epoch": 0.3886055917003693,
"grad_norm": 0.8674870133399963,
"learning_rate": 1.3981512362830359e-05,
"loss": 0.0553,
"step": 2210
},
{
"epoch": 0.3894847898716371,
"grad_norm": 0.3268803656101227,
"learning_rate": 1.3955373655355852e-05,
"loss": 0.0461,
"step": 2215
},
{
"epoch": 0.3903639880429049,
"grad_norm": 0.3193112909793854,
"learning_rate": 1.392920287201005e-05,
"loss": 0.0674,
"step": 2220
},
{
"epoch": 0.3912431862141727,
"grad_norm": 0.3444458842277527,
"learning_rate": 1.3903000225023393e-05,
"loss": 0.0471,
"step": 2225
},
{
"epoch": 0.3921223843854405,
"grad_norm": 0.6412308812141418,
"learning_rate": 1.3876765926884712e-05,
"loss": 0.0537,
"step": 2230
},
{
"epoch": 0.3930015825567083,
"grad_norm": 0.34802794456481934,
"learning_rate": 1.3850500190339515e-05,
"loss": 0.0627,
"step": 2235
},
{
"epoch": 0.3938807807279761,
"grad_norm": 0.3646249771118164,
"learning_rate": 1.3824203228388254e-05,
"loss": 0.0513,
"step": 2240
},
{
"epoch": 0.3947599788992439,
"grad_norm": 1.2918972969055176,
"learning_rate": 1.3797875254284605e-05,
"loss": 0.0782,
"step": 2245
},
{
"epoch": 0.3956391770705117,
"grad_norm": 0.2758769094944,
"learning_rate": 1.3771516481533733e-05,
"loss": 0.0479,
"step": 2250
},
{
"epoch": 0.3965183752417795,
"grad_norm": 0.1739499419927597,
"learning_rate": 1.3745127123890565e-05,
"loss": 0.0523,
"step": 2255
},
{
"epoch": 0.3973975734130473,
"grad_norm": 0.30399224162101746,
"learning_rate": 1.3718707395358053e-05,
"loss": 0.0604,
"step": 2260
},
{
"epoch": 0.3982767715843151,
"grad_norm": 0.3504631519317627,
"learning_rate": 1.3692257510185439e-05,
"loss": 0.0738,
"step": 2265
},
{
"epoch": 0.39915596975558293,
"grad_norm": 0.3818477988243103,
"learning_rate": 1.3665777682866521e-05,
"loss": 0.0572,
"step": 2270
},
{
"epoch": 0.40003516792685073,
"grad_norm": 0.8020049333572388,
"learning_rate": 1.3639268128137908e-05,
"loss": 0.0616,
"step": 2275
},
{
"epoch": 0.40091436609811854,
"grad_norm": 0.5802652835845947,
"learning_rate": 1.3612729060977287e-05,
"loss": 0.0647,
"step": 2280
},
{
"epoch": 0.40179356426938634,
"grad_norm": 0.1555125117301941,
"learning_rate": 1.3586160696601667e-05,
"loss": 0.0656,
"step": 2285
},
{
"epoch": 0.40267276244065414,
"grad_norm": 0.3963813781738281,
"learning_rate": 1.3559563250465645e-05,
"loss": 0.0555,
"step": 2290
},
{
"epoch": 0.40355196061192194,
"grad_norm": 0.31865325570106506,
"learning_rate": 1.3532936938259658e-05,
"loss": 0.0571,
"step": 2295
},
{
"epoch": 0.40443115878318975,
"grad_norm": 0.32378071546554565,
"learning_rate": 1.3506281975908224e-05,
"loss": 0.065,
"step": 2300
},
{
"epoch": 0.40531035695445755,
"grad_norm": 0.12490954995155334,
"learning_rate": 1.3479598579568205e-05,
"loss": 0.0529,
"step": 2305
},
{
"epoch": 0.40618955512572535,
"grad_norm": 0.3125900626182556,
"learning_rate": 1.3452886965627036e-05,
"loss": 0.0408,
"step": 2310
},
{
"epoch": 0.40706875329699316,
"grad_norm": 0.8287737369537354,
"learning_rate": 1.3426147350700995e-05,
"loss": 0.062,
"step": 2315
},
{
"epoch": 0.40794795146826096,
"grad_norm": 0.39330750703811646,
"learning_rate": 1.339937995163342e-05,
"loss": 0.0493,
"step": 2320
},
{
"epoch": 0.40882714963952876,
"grad_norm": 0.2195868045091629,
"learning_rate": 1.3372584985492972e-05,
"loss": 0.0545,
"step": 2325
},
{
"epoch": 0.40970634781079657,
"grad_norm": 0.15690335631370544,
"learning_rate": 1.3345762669571855e-05,
"loss": 0.0564,
"step": 2330
},
{
"epoch": 0.41058554598206437,
"grad_norm": 0.15210093557834625,
"learning_rate": 1.3318913221384078e-05,
"loss": 0.0501,
"step": 2335
},
{
"epoch": 0.4114647441533322,
"grad_norm": 0.08087150007486343,
"learning_rate": 1.3292036858663671e-05,
"loss": 0.0494,
"step": 2340
},
{
"epoch": 0.4123439423246,
"grad_norm": 0.17343612015247345,
"learning_rate": 1.3265133799362919e-05,
"loss": 0.0568,
"step": 2345
},
{
"epoch": 0.4132231404958678,
"grad_norm": 0.6120114922523499,
"learning_rate": 1.3238204261650613e-05,
"loss": 0.0819,
"step": 2350
},
{
"epoch": 0.4141023386671356,
"grad_norm": 0.5394456386566162,
"learning_rate": 1.3211248463910263e-05,
"loss": 0.0574,
"step": 2355
},
{
"epoch": 0.4149815368384034,
"grad_norm": 0.14764389395713806,
"learning_rate": 1.3184266624738333e-05,
"loss": 0.0588,
"step": 2360
},
{
"epoch": 0.4158607350096712,
"grad_norm": 0.664940595626831,
"learning_rate": 1.3157258962942468e-05,
"loss": 0.0499,
"step": 2365
},
{
"epoch": 0.416739933180939,
"grad_norm": 0.43065398931503296,
"learning_rate": 1.3130225697539725e-05,
"loss": 0.056,
"step": 2370
},
{
"epoch": 0.4176191313522068,
"grad_norm": 0.23071400821208954,
"learning_rate": 1.3103167047754786e-05,
"loss": 0.0505,
"step": 2375
},
{
"epoch": 0.4184983295234746,
"grad_norm": 0.2247258871793747,
"learning_rate": 1.3076083233018188e-05,
"loss": 0.0572,
"step": 2380
},
{
"epoch": 0.4193775276947424,
"grad_norm": 0.24306868016719818,
"learning_rate": 1.3048974472964547e-05,
"loss": 0.0751,
"step": 2385
},
{
"epoch": 0.4202567258660102,
"grad_norm": 0.6511367559432983,
"learning_rate": 1.3021840987430761e-05,
"loss": 0.0612,
"step": 2390
},
{
"epoch": 0.421135924037278,
"grad_norm": 0.5404173731803894,
"learning_rate": 1.2994682996454247e-05,
"loss": 0.0593,
"step": 2395
},
{
"epoch": 0.4220151222085458,
"grad_norm": 0.1526852548122406,
"learning_rate": 1.2967500720271142e-05,
"loss": 0.0557,
"step": 2400
},
{
"epoch": 0.4228943203798136,
"grad_norm": 0.2758397161960602,
"learning_rate": 1.2940294379314531e-05,
"loss": 0.0599,
"step": 2405
},
{
"epoch": 0.4237735185510814,
"grad_norm": 0.6543939709663391,
"learning_rate": 1.2913064194212634e-05,
"loss": 0.0579,
"step": 2410
},
{
"epoch": 0.4246527167223492,
"grad_norm": 0.5492807030677795,
"learning_rate": 1.2885810385787056e-05,
"loss": 0.0571,
"step": 2415
},
{
"epoch": 0.425531914893617,
"grad_norm": 0.200227290391922,
"learning_rate": 1.2858533175050955e-05,
"loss": 0.0514,
"step": 2420
},
{
"epoch": 0.42641111306488483,
"grad_norm": 0.5350441932678223,
"learning_rate": 1.2831232783207278e-05,
"loss": 0.0492,
"step": 2425
},
{
"epoch": 0.42729031123615263,
"grad_norm": 0.2634308636188507,
"learning_rate": 1.2803909431646952e-05,
"loss": 0.0511,
"step": 2430
},
{
"epoch": 0.42816950940742043,
"grad_norm": 0.1868433952331543,
"learning_rate": 1.2776563341947104e-05,
"loss": 0.0483,
"step": 2435
},
{
"epoch": 0.42904870757868824,
"grad_norm": 0.29465433955192566,
"learning_rate": 1.2749194735869246e-05,
"loss": 0.0543,
"step": 2440
},
{
"epoch": 0.42992790574995604,
"grad_norm": 0.13924354314804077,
"learning_rate": 1.2721803835357486e-05,
"loss": 0.0564,
"step": 2445
},
{
"epoch": 0.43080710392122384,
"grad_norm": 0.5867925882339478,
"learning_rate": 1.2694390862536736e-05,
"loss": 0.0651,
"step": 2450
},
{
"epoch": 0.43168630209249165,
"grad_norm": 0.0857938826084137,
"learning_rate": 1.2666956039710889e-05,
"loss": 0.049,
"step": 2455
},
{
"epoch": 0.43256550026375945,
"grad_norm": 0.5406795740127563,
"learning_rate": 1.2639499589361041e-05,
"loss": 0.0662,
"step": 2460
},
{
"epoch": 0.43344469843502725,
"grad_norm": 0.6612178087234497,
"learning_rate": 1.2612021734143667e-05,
"loss": 0.0634,
"step": 2465
},
{
"epoch": 0.43432389660629506,
"grad_norm": 0.9012327194213867,
"learning_rate": 1.2584522696888825e-05,
"loss": 0.0652,
"step": 2470
},
{
"epoch": 0.43520309477756286,
"grad_norm": 0.2964789569377899,
"learning_rate": 1.2557002700598353e-05,
"loss": 0.0511,
"step": 2475
},
{
"epoch": 0.43608229294883066,
"grad_norm": 0.22119061648845673,
"learning_rate": 1.2529461968444047e-05,
"loss": 0.0556,
"step": 2480
},
{
"epoch": 0.43696149112009847,
"grad_norm": 0.35610833764076233,
"learning_rate": 1.250190072376587e-05,
"loss": 0.0559,
"step": 2485
},
{
"epoch": 0.43784068929136627,
"grad_norm": 0.596125066280365,
"learning_rate": 1.2474319190070115e-05,
"loss": 0.0562,
"step": 2490
},
{
"epoch": 0.4387198874626341,
"grad_norm": 0.31192147731781006,
"learning_rate": 1.2446717591027624e-05,
"loss": 0.0581,
"step": 2495
},
{
"epoch": 0.4395990856339019,
"grad_norm": 0.6106126308441162,
"learning_rate": 1.2419096150471944e-05,
"loss": 0.0599,
"step": 2500
},
{
"epoch": 0.4404782838051697,
"grad_norm": 0.33565065264701843,
"learning_rate": 1.2391455092397535e-05,
"loss": 0.0627,
"step": 2505
},
{
"epoch": 0.4413574819764375,
"grad_norm": 0.3373861610889435,
"learning_rate": 1.236379464095794e-05,
"loss": 0.0605,
"step": 2510
},
{
"epoch": 0.4422366801477053,
"grad_norm": 0.2420874387025833,
"learning_rate": 1.233611502046397e-05,
"loss": 0.0584,
"step": 2515
},
{
"epoch": 0.4431158783189731,
"grad_norm": 0.17154277861118317,
"learning_rate": 1.2308416455381891e-05,
"loss": 0.0428,
"step": 2520
},
{
"epoch": 0.4439950764902409,
"grad_norm": 0.4789488613605499,
"learning_rate": 1.2280699170331593e-05,
"loss": 0.0512,
"step": 2525
},
{
"epoch": 0.4448742746615087,
"grad_norm": 0.2340506613254547,
"learning_rate": 1.2252963390084784e-05,
"loss": 0.0586,
"step": 2530
},
{
"epoch": 0.4457534728327765,
"grad_norm": 0.3497225046157837,
"learning_rate": 1.2225209339563144e-05,
"loss": 0.0597,
"step": 2535
},
{
"epoch": 0.4466326710040443,
"grad_norm": 0.22254657745361328,
"learning_rate": 1.2197437243836529e-05,
"loss": 0.0784,
"step": 2540
},
{
"epoch": 0.4475118691753121,
"grad_norm": 0.3253319561481476,
"learning_rate": 1.2169647328121119e-05,
"loss": 0.0575,
"step": 2545
},
{
"epoch": 0.4483910673465799,
"grad_norm": 0.455172061920166,
"learning_rate": 1.2141839817777616e-05,
"loss": 0.06,
"step": 2550
},
{
"epoch": 0.4492702655178477,
"grad_norm": 0.30848458409309387,
"learning_rate": 1.2114014938309393e-05,
"loss": 0.0583,
"step": 2555
},
{
"epoch": 0.4501494636891155,
"grad_norm": 0.38188642263412476,
"learning_rate": 1.2086172915360684e-05,
"loss": 0.0575,
"step": 2560
},
{
"epoch": 0.4510286618603833,
"grad_norm": 0.2141093611717224,
"learning_rate": 1.2058313974714746e-05,
"loss": 0.0678,
"step": 2565
},
{
"epoch": 0.4519078600316511,
"grad_norm": 0.37475940585136414,
"learning_rate": 1.2030438342292028e-05,
"loss": 0.0621,
"step": 2570
},
{
"epoch": 0.4527870582029189,
"grad_norm": 0.5214441418647766,
"learning_rate": 1.2002546244148345e-05,
"loss": 0.0559,
"step": 2575
},
{
"epoch": 0.4536662563741867,
"grad_norm": 0.5643457174301147,
"learning_rate": 1.197463790647303e-05,
"loss": 0.0551,
"step": 2580
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.2806403338909149,
"learning_rate": 1.1946713555587115e-05,
"loss": 0.059,
"step": 2585
},
{
"epoch": 0.45542465271672233,
"grad_norm": 0.7372429370880127,
"learning_rate": 1.1918773417941494e-05,
"loss": 0.0649,
"step": 2590
},
{
"epoch": 0.45630385088799014,
"grad_norm": 0.9332758188247681,
"learning_rate": 1.1890817720115075e-05,
"loss": 0.0519,
"step": 2595
},
{
"epoch": 0.45718304905925794,
"grad_norm": 0.36277708411216736,
"learning_rate": 1.1862846688812956e-05,
"loss": 0.0605,
"step": 2600
},
{
"epoch": 0.45806224723052574,
"grad_norm": 0.19159919023513794,
"learning_rate": 1.183486055086458e-05,
"loss": 0.0528,
"step": 2605
},
{
"epoch": 0.45894144540179355,
"grad_norm": 0.21766141057014465,
"learning_rate": 1.1806859533221896e-05,
"loss": 0.0587,
"step": 2610
},
{
"epoch": 0.45982064357306135,
"grad_norm": 0.8714537024497986,
"learning_rate": 1.1778843862957515e-05,
"loss": 0.0695,
"step": 2615
},
{
"epoch": 0.46069984174432915,
"grad_norm": 0.25225165486335754,
"learning_rate": 1.1750813767262879e-05,
"loss": 0.0551,
"step": 2620
},
{
"epoch": 0.46157903991559696,
"grad_norm": 0.2513810992240906,
"learning_rate": 1.1722769473446412e-05,
"loss": 0.0604,
"step": 2625
},
{
"epoch": 0.46245823808686476,
"grad_norm": 0.1506785750389099,
"learning_rate": 1.1694711208931668e-05,
"loss": 0.0562,
"step": 2630
},
{
"epoch": 0.46333743625813256,
"grad_norm": 0.20237652957439423,
"learning_rate": 1.1666639201255507e-05,
"loss": 0.0526,
"step": 2635
},
{
"epoch": 0.46421663442940037,
"grad_norm": 0.5076216459274292,
"learning_rate": 1.163855367806623e-05,
"loss": 0.0594,
"step": 2640
},
{
"epoch": 0.46509583260066817,
"grad_norm": 0.3569527268409729,
"learning_rate": 1.1610454867121747e-05,
"loss": 0.0594,
"step": 2645
},
{
"epoch": 0.46597503077193597,
"grad_norm": 0.19620120525360107,
"learning_rate": 1.158234299628772e-05,
"loss": 0.0658,
"step": 2650
},
{
"epoch": 0.4668542289432038,
"grad_norm": 0.46498578786849976,
"learning_rate": 1.1554218293535727e-05,
"loss": 0.0606,
"step": 2655
},
{
"epoch": 0.4677334271144716,
"grad_norm": 0.2302287071943283,
"learning_rate": 1.1526080986941389e-05,
"loss": 0.0589,
"step": 2660
},
{
"epoch": 0.4686126252857394,
"grad_norm": 0.39261671900749207,
"learning_rate": 1.1497931304682554e-05,
"loss": 0.0486,
"step": 2665
},
{
"epoch": 0.4694918234570072,
"grad_norm": 1.2169684171676636,
"learning_rate": 1.1469769475037427e-05,
"loss": 0.0666,
"step": 2670
},
{
"epoch": 0.470371021628275,
"grad_norm": 0.7795551419258118,
"learning_rate": 1.144159572638271e-05,
"loss": 0.0614,
"step": 2675
},
{
"epoch": 0.4712502197995428,
"grad_norm": 0.5861936211585999,
"learning_rate": 1.141341028719178e-05,
"loss": 0.0522,
"step": 2680
},
{
"epoch": 0.4721294179708106,
"grad_norm": 0.3658725917339325,
"learning_rate": 1.1385213386032797e-05,
"loss": 0.0506,
"step": 2685
},
{
"epoch": 0.4730086161420784,
"grad_norm": 0.5009949803352356,
"learning_rate": 1.1357005251566888e-05,
"loss": 0.0716,
"step": 2690
},
{
"epoch": 0.4738878143133462,
"grad_norm": 0.34323883056640625,
"learning_rate": 1.1328786112546268e-05,
"loss": 0.0772,
"step": 2695
},
{
"epoch": 0.47476701248461406,
"grad_norm": 0.5411744713783264,
"learning_rate": 1.1300556197812393e-05,
"loss": 0.0604,
"step": 2700
},
{
"epoch": 0.47564621065588186,
"grad_norm": 0.5839160680770874,
"learning_rate": 1.1272315736294108e-05,
"loss": 0.0575,
"step": 2705
},
{
"epoch": 0.47652540882714967,
"grad_norm": 0.2879360020160675,
"learning_rate": 1.1244064957005782e-05,
"loss": 0.0572,
"step": 2710
},
{
"epoch": 0.47740460699841747,
"grad_norm": 0.309555321931839,
"learning_rate": 1.121580408904546e-05,
"loss": 0.0679,
"step": 2715
},
{
"epoch": 0.47828380516968527,
"grad_norm": 0.31096187233924866,
"learning_rate": 1.1187533361592988e-05,
"loss": 0.0672,
"step": 2720
},
{
"epoch": 0.4791630033409531,
"grad_norm": 0.24858668446540833,
"learning_rate": 1.1159253003908188e-05,
"loss": 0.0604,
"step": 2725
},
{
"epoch": 0.4800422015122209,
"grad_norm": 0.38457539677619934,
"learning_rate": 1.113096324532896e-05,
"loss": 0.0656,
"step": 2730
},
{
"epoch": 0.4809213996834887,
"grad_norm": 0.20690588653087616,
"learning_rate": 1.1102664315269452e-05,
"loss": 0.0612,
"step": 2735
},
{
"epoch": 0.4818005978547565,
"grad_norm": 0.8975262641906738,
"learning_rate": 1.1074356443218175e-05,
"loss": 0.0552,
"step": 2740
},
{
"epoch": 0.4826797960260243,
"grad_norm": 0.31369662284851074,
"learning_rate": 1.1046039858736167e-05,
"loss": 0.0685,
"step": 2745
},
{
"epoch": 0.4835589941972921,
"grad_norm": 0.1889607012271881,
"learning_rate": 1.101771479145511e-05,
"loss": 0.0642,
"step": 2750
},
{
"epoch": 0.4844381923685599,
"grad_norm": 0.4452875256538391,
"learning_rate": 1.0989381471075481e-05,
"loss": 0.0643,
"step": 2755
},
{
"epoch": 0.4853173905398277,
"grad_norm": 0.9008978009223938,
"learning_rate": 1.0961040127364688e-05,
"loss": 0.0634,
"step": 2760
},
{
"epoch": 0.4861965887110955,
"grad_norm": 1.0086400508880615,
"learning_rate": 1.0932690990155195e-05,
"loss": 0.053,
"step": 2765
},
{
"epoch": 0.4870757868823633,
"grad_norm": 0.3842066526412964,
"learning_rate": 1.0904334289342675e-05,
"loss": 0.0548,
"step": 2770
},
{
"epoch": 0.4879549850536311,
"grad_norm": 0.23029272258281708,
"learning_rate": 1.087597025488413e-05,
"loss": 0.0417,
"step": 2775
},
{
"epoch": 0.4888341832248989,
"grad_norm": 0.9451369643211365,
"learning_rate": 1.0847599116796047e-05,
"loss": 0.0535,
"step": 2780
},
{
"epoch": 0.4897133813961667,
"grad_norm": 1.2543659210205078,
"learning_rate": 1.0819221105152504e-05,
"loss": 0.0644,
"step": 2785
},
{
"epoch": 0.4905925795674345,
"grad_norm": 0.2994709014892578,
"learning_rate": 1.0790836450083327e-05,
"loss": 0.053,
"step": 2790
},
{
"epoch": 0.4914717777387023,
"grad_norm": 0.43622636795043945,
"learning_rate": 1.0762445381772217e-05,
"loss": 0.0609,
"step": 2795
},
{
"epoch": 0.4923509759099701,
"grad_norm": 0.27832546830177307,
"learning_rate": 1.0734048130454882e-05,
"loss": 0.0642,
"step": 2800
},
{
"epoch": 0.4932301740812379,
"grad_norm": 1.0674911737442017,
"learning_rate": 1.0705644926417172e-05,
"loss": 0.0445,
"step": 2805
},
{
"epoch": 0.49410937225250573,
"grad_norm": 0.5751109719276428,
"learning_rate": 1.0677235999993205e-05,
"loss": 0.0482,
"step": 2810
},
{
"epoch": 0.49498857042377353,
"grad_norm": 0.15411067008972168,
"learning_rate": 1.0648821581563514e-05,
"loss": 0.0561,
"step": 2815
},
{
"epoch": 0.49586776859504134,
"grad_norm": 0.29986828565597534,
"learning_rate": 1.0620401901553155e-05,
"loss": 0.0655,
"step": 2820
},
{
"epoch": 0.49674696676630914,
"grad_norm": 0.6311131119728088,
"learning_rate": 1.0591977190429868e-05,
"loss": 0.0519,
"step": 2825
},
{
"epoch": 0.49762616493757694,
"grad_norm": 0.30874118208885193,
"learning_rate": 1.056354767870218e-05,
"loss": 0.0581,
"step": 2830
},
{
"epoch": 0.49850536310884475,
"grad_norm": 0.38604286313056946,
"learning_rate": 1.0535113596917556e-05,
"loss": 0.0627,
"step": 2835
},
{
"epoch": 0.49938456128011255,
"grad_norm": 0.16394232213497162,
"learning_rate": 1.0506675175660519e-05,
"loss": 0.0591,
"step": 2840
},
{
"epoch": 0.5002637594513804,
"grad_norm": 0.5202212929725647,
"learning_rate": 1.0478232645550784e-05,
"loss": 0.0585,
"step": 2845
},
{
"epoch": 0.5011429576226482,
"grad_norm": 0.18142499029636383,
"learning_rate": 1.0449786237241382e-05,
"loss": 0.0603,
"step": 2850
},
{
"epoch": 0.502022155793916,
"grad_norm": 0.38024476170539856,
"learning_rate": 1.0421336181416796e-05,
"loss": 0.0712,
"step": 2855
},
{
"epoch": 0.5029013539651838,
"grad_norm": 0.28926122188568115,
"learning_rate": 1.03928827087911e-05,
"loss": 0.0669,
"step": 2860
},
{
"epoch": 0.5037805521364516,
"grad_norm": 0.4077168405056,
"learning_rate": 1.036442605010605e-05,
"loss": 0.0616,
"step": 2865
},
{
"epoch": 0.5046597503077194,
"grad_norm": 0.4079400300979614,
"learning_rate": 1.0335966436129268e-05,
"loss": 0.058,
"step": 2870
},
{
"epoch": 0.5055389484789872,
"grad_norm": 0.5996482968330383,
"learning_rate": 1.0307504097652323e-05,
"loss": 0.0512,
"step": 2875
},
{
"epoch": 0.506418146650255,
"grad_norm": 0.09414978325366974,
"learning_rate": 1.0279039265488885e-05,
"loss": 0.0519,
"step": 2880
},
{
"epoch": 0.5072973448215228,
"grad_norm": 0.3783423602581024,
"learning_rate": 1.0250572170472848e-05,
"loss": 0.0599,
"step": 2885
},
{
"epoch": 0.5081765429927906,
"grad_norm": 0.7148971557617188,
"learning_rate": 1.0222103043456447e-05,
"loss": 0.0681,
"step": 2890
},
{
"epoch": 0.5090557411640584,
"grad_norm": 0.29532909393310547,
"learning_rate": 1.0193632115308412e-05,
"loss": 0.0628,
"step": 2895
},
{
"epoch": 0.5099349393353262,
"grad_norm": 0.231553316116333,
"learning_rate": 1.016515961691206e-05,
"loss": 0.0489,
"step": 2900
},
{
"epoch": 0.510814137506594,
"grad_norm": 0.6670016646385193,
"learning_rate": 1.0136685779163458e-05,
"loss": 0.0596,
"step": 2905
},
{
"epoch": 0.5116933356778618,
"grad_norm": 0.8102641105651855,
"learning_rate": 1.010821083296952e-05,
"loss": 0.0563,
"step": 2910
},
{
"epoch": 0.5125725338491296,
"grad_norm": 0.21613669395446777,
"learning_rate": 1.0079735009246168e-05,
"loss": 0.0571,
"step": 2915
},
{
"epoch": 0.5134517320203974,
"grad_norm": 0.34142959117889404,
"learning_rate": 1.0051258538916422e-05,
"loss": 0.0611,
"step": 2920
},
{
"epoch": 0.5143309301916652,
"grad_norm": 0.5886263847351074,
"learning_rate": 1.0022781652908549e-05,
"loss": 0.0596,
"step": 2925
},
{
"epoch": 0.515210128362933,
"grad_norm": 0.37875401973724365,
"learning_rate": 9.994304582154197e-06,
"loss": 0.045,
"step": 2930
},
{
"epoch": 0.5160893265342008,
"grad_norm": 0.6814181804656982,
"learning_rate": 9.9658275575865e-06,
"loss": 0.0399,
"step": 2935
},
{
"epoch": 0.5169685247054686,
"grad_norm": 0.6115921139717102,
"learning_rate": 9.93735081013823e-06,
"loss": 0.0586,
"step": 2940
},
{
"epoch": 0.5178477228767364,
"grad_norm": 0.7454824447631836,
"learning_rate": 9.908874570739899e-06,
"loss": 0.0623,
"step": 2945
},
{
"epoch": 0.5187269210480042,
"grad_norm": 0.16522662341594696,
"learning_rate": 9.880399070317907e-06,
"loss": 0.0578,
"step": 2950
},
{
"epoch": 0.519606119219272,
"grad_norm": 0.5657525658607483,
"learning_rate": 9.851924539792656e-06,
"loss": 0.0468,
"step": 2955
},
{
"epoch": 0.5204853173905398,
"grad_norm": 0.1268445998430252,
"learning_rate": 9.823451210076691e-06,
"loss": 0.0525,
"step": 2960
},
{
"epoch": 0.5213645155618076,
"grad_norm": 0.3536778390407562,
"learning_rate": 9.794979312072807e-06,
"loss": 0.0557,
"step": 2965
},
{
"epoch": 0.5222437137330754,
"grad_norm": 0.9080764651298523,
"learning_rate": 9.766509076672204e-06,
"loss": 0.0611,
"step": 2970
},
{
"epoch": 0.5231229119043432,
"grad_norm": 0.3755652904510498,
"learning_rate": 9.738040734752582e-06,
"loss": 0.0683,
"step": 2975
},
{
"epoch": 0.524002110075611,
"grad_norm": 0.6789637207984924,
"learning_rate": 9.709574517176301e-06,
"loss": 0.0475,
"step": 2980
},
{
"epoch": 0.5248813082468788,
"grad_norm": 0.7782059907913208,
"learning_rate": 9.681110654788483e-06,
"loss": 0.0521,
"step": 2985
},
{
"epoch": 0.5257605064181466,
"grad_norm": 0.20607317984104156,
"learning_rate": 9.65264937841516e-06,
"loss": 0.0517,
"step": 2990
},
{
"epoch": 0.5266397045894144,
"grad_norm": 0.18635804951190948,
"learning_rate": 9.62419091886138e-06,
"loss": 0.0545,
"step": 2995
},
{
"epoch": 0.5275189027606823,
"grad_norm": 0.26298439502716064,
"learning_rate": 9.595735506909365e-06,
"loss": 0.0529,
"step": 3000
},
{
"epoch": 0.52839810093195,
"grad_norm": 0.4759673774242401,
"learning_rate": 9.567283373316608e-06,
"loss": 0.0544,
"step": 3005
},
{
"epoch": 0.5292772991032179,
"grad_norm": 0.2273968905210495,
"learning_rate": 9.538834748814028e-06,
"loss": 0.0643,
"step": 3010
},
{
"epoch": 0.5301564972744857,
"grad_norm": 0.27050191164016724,
"learning_rate": 9.510389864104069e-06,
"loss": 0.057,
"step": 3015
},
{
"epoch": 0.5310356954457535,
"grad_norm": 0.23292891681194305,
"learning_rate": 9.481948949858876e-06,
"loss": 0.0656,
"step": 3020
},
{
"epoch": 0.5319148936170213,
"grad_norm": 0.7964295744895935,
"learning_rate": 9.453512236718365e-06,
"loss": 0.0506,
"step": 3025
},
{
"epoch": 0.5327940917882891,
"grad_norm": 0.46331024169921875,
"learning_rate": 9.42507995528841e-06,
"loss": 0.0612,
"step": 3030
},
{
"epoch": 0.5336732899595569,
"grad_norm": 0.5655067563056946,
"learning_rate": 9.396652336138923e-06,
"loss": 0.0585,
"step": 3035
},
{
"epoch": 0.5345524881308247,
"grad_norm": 0.18470239639282227,
"learning_rate": 9.368229609802028e-06,
"loss": 0.048,
"step": 3040
},
{
"epoch": 0.5354316863020925,
"grad_norm": 0.7371414303779602,
"learning_rate": 9.339812006770154e-06,
"loss": 0.0526,
"step": 3045
},
{
"epoch": 0.5363108844733603,
"grad_norm": 0.8986045122146606,
"learning_rate": 9.311399757494196e-06,
"loss": 0.0584,
"step": 3050
},
{
"epoch": 0.5371900826446281,
"grad_norm": 0.4300435185432434,
"learning_rate": 9.282993092381626e-06,
"loss": 0.0519,
"step": 3055
},
{
"epoch": 0.5380692808158959,
"grad_norm": 0.6866196393966675,
"learning_rate": 9.254592241794633e-06,
"loss": 0.0567,
"step": 3060
},
{
"epoch": 0.5389484789871637,
"grad_norm": 0.17242960631847382,
"learning_rate": 9.226197436048252e-06,
"loss": 0.0611,
"step": 3065
},
{
"epoch": 0.5398276771584315,
"grad_norm": 0.6466073989868164,
"learning_rate": 9.197808905408504e-06,
"loss": 0.0623,
"step": 3070
},
{
"epoch": 0.5407068753296993,
"grad_norm": 0.20931695401668549,
"learning_rate": 9.169426880090509e-06,
"loss": 0.0606,
"step": 3075
},
{
"epoch": 0.5415860735009671,
"grad_norm": 0.21108533442020416,
"learning_rate": 9.141051590256651e-06,
"loss": 0.0456,
"step": 3080
},
{
"epoch": 0.5424652716722349,
"grad_norm": 0.1871514916419983,
"learning_rate": 9.112683266014677e-06,
"loss": 0.0468,
"step": 3085
},
{
"epoch": 0.5433444698435027,
"grad_norm": 0.25235074758529663,
"learning_rate": 9.084322137415855e-06,
"loss": 0.0524,
"step": 3090
},
{
"epoch": 0.5442236680147705,
"grad_norm": 0.6398855447769165,
"learning_rate": 9.055968434453096e-06,
"loss": 0.0523,
"step": 3095
},
{
"epoch": 0.5451028661860383,
"grad_norm": 0.6872847676277161,
"learning_rate": 9.027622387059103e-06,
"loss": 0.0456,
"step": 3100
},
{
"epoch": 0.5459820643573061,
"grad_norm": 0.7495630383491516,
"learning_rate": 8.999284225104476e-06,
"loss": 0.0673,
"step": 3105
},
{
"epoch": 0.5468612625285739,
"grad_norm": 0.18154507875442505,
"learning_rate": 8.970954178395894e-06,
"loss": 0.0511,
"step": 3110
},
{
"epoch": 0.5477404606998417,
"grad_norm": 0.16267339885234833,
"learning_rate": 8.94263247667421e-06,
"loss": 0.0475,
"step": 3115
},
{
"epoch": 0.5486196588711095,
"grad_norm": 0.0940733551979065,
"learning_rate": 8.914319349612607e-06,
"loss": 0.0659,
"step": 3120
},
{
"epoch": 0.5494988570423773,
"grad_norm": 0.5106806755065918,
"learning_rate": 8.886015026814736e-06,
"loss": 0.0532,
"step": 3125
},
{
"epoch": 0.5503780552136451,
"grad_norm": 0.2427133470773697,
"learning_rate": 8.857719737812836e-06,
"loss": 0.0523,
"step": 3130
},
{
"epoch": 0.5512572533849129,
"grad_norm": 0.36954110860824585,
"learning_rate": 8.829433712065915e-06,
"loss": 0.061,
"step": 3135
},
{
"epoch": 0.5521364515561807,
"grad_norm": 0.892874538898468,
"learning_rate": 8.801157178957827e-06,
"loss": 0.0491,
"step": 3140
},
{
"epoch": 0.5530156497274485,
"grad_norm": 0.3820838928222656,
"learning_rate": 8.772890367795476e-06,
"loss": 0.048,
"step": 3145
},
{
"epoch": 0.5538948478987163,
"grad_norm": 1.087281346321106,
"learning_rate": 8.744633507806907e-06,
"loss": 0.0608,
"step": 3150
},
{
"epoch": 0.5547740460699842,
"grad_norm": 0.2828584909439087,
"learning_rate": 8.716386828139478e-06,
"loss": 0.0616,
"step": 3155
},
{
"epoch": 0.555653244241252,
"grad_norm": 0.6577861309051514,
"learning_rate": 8.688150557857979e-06,
"loss": 0.0586,
"step": 3160
},
{
"epoch": 0.5565324424125198,
"grad_norm": 0.45899686217308044,
"learning_rate": 8.659924925942798e-06,
"loss": 0.058,
"step": 3165
},
{
"epoch": 0.5574116405837876,
"grad_norm": 0.12640362977981567,
"learning_rate": 8.631710161288043e-06,
"loss": 0.0628,
"step": 3170
},
{
"epoch": 0.5582908387550554,
"grad_norm": 0.478767454624176,
"learning_rate": 8.603506492699698e-06,
"loss": 0.0636,
"step": 3175
},
{
"epoch": 0.5591700369263232,
"grad_norm": 0.22441603243350983,
"learning_rate": 8.575314148893765e-06,
"loss": 0.0461,
"step": 3180
},
{
"epoch": 0.560049235097591,
"grad_norm": 0.9642320275306702,
"learning_rate": 8.547133358494408e-06,
"loss": 0.0541,
"step": 3185
},
{
"epoch": 0.5609284332688588,
"grad_norm": 0.4824267029762268,
"learning_rate": 8.518964350032092e-06,
"loss": 0.0516,
"step": 3190
},
{
"epoch": 0.5618076314401266,
"grad_norm": 0.26887422800064087,
"learning_rate": 8.490807351941753e-06,
"loss": 0.0497,
"step": 3195
},
{
"epoch": 0.5626868296113944,
"grad_norm": 0.4273674488067627,
"learning_rate": 8.462662592560911e-06,
"loss": 0.0573,
"step": 3200
},
{
"epoch": 0.5635660277826622,
"grad_norm": 0.385642409324646,
"learning_rate": 8.434530300127853e-06,
"loss": 0.0592,
"step": 3205
},
{
"epoch": 0.56444522595393,
"grad_norm": 0.2365254908800125,
"learning_rate": 8.406410702779754e-06,
"loss": 0.0597,
"step": 3210
},
{
"epoch": 0.5653244241251978,
"grad_norm": 0.20545728504657745,
"learning_rate": 8.378304028550848e-06,
"loss": 0.0524,
"step": 3215
},
{
"epoch": 0.5662036222964656,
"grad_norm": 0.45571327209472656,
"learning_rate": 8.35021050537056e-06,
"loss": 0.0651,
"step": 3220
},
{
"epoch": 0.5670828204677334,
"grad_norm": 0.3450813591480255,
"learning_rate": 8.32213036106168e-06,
"loss": 0.0534,
"step": 3225
},
{
"epoch": 0.5679620186390012,
"grad_norm": 0.8915445804595947,
"learning_rate": 8.294063823338486e-06,
"loss": 0.0607,
"step": 3230
},
{
"epoch": 0.568841216810269,
"grad_norm": 0.3208938539028168,
"learning_rate": 8.266011119804937e-06,
"loss": 0.0487,
"step": 3235
},
{
"epoch": 0.5697204149815368,
"grad_norm": 0.3896248936653137,
"learning_rate": 8.237972477952779e-06,
"loss": 0.0534,
"step": 3240
},
{
"epoch": 0.5705996131528046,
"grad_norm": 0.28958839178085327,
"learning_rate": 8.209948125159745e-06,
"loss": 0.0564,
"step": 3245
},
{
"epoch": 0.5714788113240724,
"grad_norm": 0.8576890230178833,
"learning_rate": 8.181938288687683e-06,
"loss": 0.0602,
"step": 3250
},
{
"epoch": 0.5723580094953402,
"grad_norm": 1.088234305381775,
"learning_rate": 8.153943195680724e-06,
"loss": 0.0578,
"step": 3255
},
{
"epoch": 0.573237207666608,
"grad_norm": 0.2483123540878296,
"learning_rate": 8.125963073163435e-06,
"loss": 0.0588,
"step": 3260
},
{
"epoch": 0.5741164058378758,
"grad_norm": 0.21865952014923096,
"learning_rate": 8.097998148038986e-06,
"loss": 0.0587,
"step": 3265
},
{
"epoch": 0.5749956040091436,
"grad_norm": 0.44943463802337646,
"learning_rate": 8.070048647087298e-06,
"loss": 0.0542,
"step": 3270
},
{
"epoch": 0.5758748021804114,
"grad_norm": 0.1797959953546524,
"learning_rate": 8.042114796963219e-06,
"loss": 0.0541,
"step": 3275
},
{
"epoch": 0.5767540003516792,
"grad_norm": 0.7815648317337036,
"learning_rate": 8.014196824194668e-06,
"loss": 0.0466,
"step": 3280
},
{
"epoch": 0.577633198522947,
"grad_norm": 0.5617738366127014,
"learning_rate": 7.986294955180815e-06,
"loss": 0.0521,
"step": 3285
},
{
"epoch": 0.5785123966942148,
"grad_norm": 0.35597896575927734,
"learning_rate": 7.958409416190233e-06,
"loss": 0.0611,
"step": 3290
},
{
"epoch": 0.5793915948654826,
"grad_norm": 0.4443993866443634,
"learning_rate": 7.93054043335907e-06,
"loss": 0.0641,
"step": 3295
},
{
"epoch": 0.5802707930367504,
"grad_norm": 0.5022766590118408,
"learning_rate": 7.902688232689212e-06,
"loss": 0.0489,
"step": 3300
},
{
"epoch": 0.5811499912080182,
"grad_norm": 0.6365528702735901,
"learning_rate": 7.874853040046455e-06,
"loss": 0.0686,
"step": 3305
},
{
"epoch": 0.582029189379286,
"grad_norm": 0.3193661570549011,
"learning_rate": 7.847035081158654e-06,
"loss": 0.0552,
"step": 3310
},
{
"epoch": 0.5829083875505539,
"grad_norm": 0.3971255123615265,
"learning_rate": 7.819234581613934e-06,
"loss": 0.068,
"step": 3315
},
{
"epoch": 0.5837875857218217,
"grad_norm": 0.4976908564567566,
"learning_rate": 7.791451766858808e-06,
"loss": 0.0508,
"step": 3320
},
{
"epoch": 0.5846667838930895,
"grad_norm": 0.25572288036346436,
"learning_rate": 7.763686862196397e-06,
"loss": 0.0594,
"step": 3325
},
{
"epoch": 0.5855459820643573,
"grad_norm": 0.7076115012168884,
"learning_rate": 7.735940092784564e-06,
"loss": 0.0649,
"step": 3330
},
{
"epoch": 0.5864251802356251,
"grad_norm": 0.33293214440345764,
"learning_rate": 7.708211683634112e-06,
"loss": 0.0528,
"step": 3335
},
{
"epoch": 0.5873043784068929,
"grad_norm": 0.33272784948349,
"learning_rate": 7.680501859606961e-06,
"loss": 0.0505,
"step": 3340
},
{
"epoch": 0.5881835765781607,
"grad_norm": 0.4092160165309906,
"learning_rate": 7.652810845414297e-06,
"loss": 0.0437,
"step": 3345
},
{
"epoch": 0.5890627747494285,
"grad_norm": 0.24491731822490692,
"learning_rate": 7.625138865614795e-06,
"loss": 0.0635,
"step": 3350
},
{
"epoch": 0.5899419729206963,
"grad_norm": 0.6766128540039062,
"learning_rate": 7.597486144612741e-06,
"loss": 0.0473,
"step": 3355
},
{
"epoch": 0.5908211710919641,
"grad_norm": 0.27820366621017456,
"learning_rate": 7.569852906656269e-06,
"loss": 0.0521,
"step": 3360
},
{
"epoch": 0.5917003692632319,
"grad_norm": 0.8893203735351562,
"learning_rate": 7.542239375835499e-06,
"loss": 0.0644,
"step": 3365
},
{
"epoch": 0.5925795674344997,
"grad_norm": 0.4650630056858063,
"learning_rate": 7.514645776080747e-06,
"loss": 0.0694,
"step": 3370
},
{
"epoch": 0.5934587656057675,
"grad_norm": 0.42498430609703064,
"learning_rate": 7.487072331160696e-06,
"loss": 0.0588,
"step": 3375
},
{
"epoch": 0.5943379637770353,
"grad_norm": 0.5707778334617615,
"learning_rate": 7.459519264680586e-06,
"loss": 0.0655,
"step": 3380
},
{
"epoch": 0.5952171619483031,
"grad_norm": 0.4820030629634857,
"learning_rate": 7.431986800080394e-06,
"loss": 0.0765,
"step": 3385
},
{
"epoch": 0.5960963601195709,
"grad_norm": 1.1453475952148438,
"learning_rate": 7.4044751606330365e-06,
"loss": 0.062,
"step": 3390
},
{
"epoch": 0.5969755582908387,
"grad_norm": 0.18737494945526123,
"learning_rate": 7.37698456944254e-06,
"loss": 0.0495,
"step": 3395
},
{
"epoch": 0.5978547564621065,
"grad_norm": 0.49220606684684753,
"learning_rate": 7.349515249442248e-06,
"loss": 0.0575,
"step": 3400
},
{
"epoch": 0.5987339546333743,
"grad_norm": 0.1627589464187622,
"learning_rate": 7.322067423393002e-06,
"loss": 0.0556,
"step": 3405
},
{
"epoch": 0.5996131528046421,
"grad_norm": 0.5828942060470581,
"learning_rate": 7.294641313881348e-06,
"loss": 0.0597,
"step": 3410
},
{
"epoch": 0.6004923509759099,
"grad_norm": 0.5319778323173523,
"learning_rate": 7.267237143317707e-06,
"loss": 0.0579,
"step": 3415
},
{
"epoch": 0.6013715491471777,
"grad_norm": 0.14266423881053925,
"learning_rate": 7.239855133934608e-06,
"loss": 0.0591,
"step": 3420
},
{
"epoch": 0.6022507473184455,
"grad_norm": 0.13865630328655243,
"learning_rate": 7.212495507784843e-06,
"loss": 0.0589,
"step": 3425
},
{
"epoch": 0.6031299454897133,
"grad_norm": 0.7919948697090149,
"learning_rate": 7.185158486739712e-06,
"loss": 0.052,
"step": 3430
},
{
"epoch": 0.6040091436609812,
"grad_norm": 0.4793383777141571,
"learning_rate": 7.157844292487174e-06,
"loss": 0.0637,
"step": 3435
},
{
"epoch": 0.604888341832249,
"grad_norm": 0.2558303773403168,
"learning_rate": 7.130553146530105e-06,
"loss": 0.0724,
"step": 3440
},
{
"epoch": 0.6057675400035168,
"grad_norm": 0.33608704805374146,
"learning_rate": 7.103285270184446e-06,
"loss": 0.0502,
"step": 3445
},
{
"epoch": 0.6066467381747846,
"grad_norm": 0.31324923038482666,
"learning_rate": 7.076040884577449e-06,
"loss": 0.0559,
"step": 3450
},
{
"epoch": 0.6075259363460525,
"grad_norm": 0.13528025150299072,
"learning_rate": 7.048820210645862e-06,
"loss": 0.0579,
"step": 3455
},
{
"epoch": 0.6084051345173203,
"grad_norm": 0.12272872775793076,
"learning_rate": 7.021623469134156e-06,
"loss": 0.0573,
"step": 3460
},
{
"epoch": 0.6092843326885881,
"grad_norm": 0.2325238287448883,
"learning_rate": 6.994450880592706e-06,
"loss": 0.0698,
"step": 3465
},
{
"epoch": 0.6101635308598559,
"grad_norm": 0.27444854378700256,
"learning_rate": 6.967302665376037e-06,
"loss": 0.0605,
"step": 3470
},
{
"epoch": 0.6110427290311237,
"grad_norm": 0.7133885622024536,
"learning_rate": 6.940179043641005e-06,
"loss": 0.055,
"step": 3475
},
{
"epoch": 0.6119219272023915,
"grad_norm": 0.22960059344768524,
"learning_rate": 6.913080235345042e-06,
"loss": 0.0635,
"step": 3480
},
{
"epoch": 0.6128011253736593,
"grad_norm": 0.4592248201370239,
"learning_rate": 6.886006460244342e-06,
"loss": 0.0575,
"step": 3485
},
{
"epoch": 0.6136803235449271,
"grad_norm": 0.21212299168109894,
"learning_rate": 6.858957937892105e-06,
"loss": 0.0607,
"step": 3490
},
{
"epoch": 0.6145595217161949,
"grad_norm": 0.23819631338119507,
"learning_rate": 6.831934887636737e-06,
"loss": 0.0512,
"step": 3495
},
{
"epoch": 0.6154387198874627,
"grad_norm": 0.3616998493671417,
"learning_rate": 6.804937528620088e-06,
"loss": 0.0613,
"step": 3500
},
{
"epoch": 0.6163179180587305,
"grad_norm": 0.21602647006511688,
"learning_rate": 6.777966079775657e-06,
"loss": 0.0648,
"step": 3505
},
{
"epoch": 0.6171971162299983,
"grad_norm": 0.3276112973690033,
"learning_rate": 6.751020759826836e-06,
"loss": 0.0496,
"step": 3510
},
{
"epoch": 0.6180763144012661,
"grad_norm": 0.5276904106140137,
"learning_rate": 6.724101787285113e-06,
"loss": 0.057,
"step": 3515
},
{
"epoch": 0.6189555125725339,
"grad_norm": 0.3810863196849823,
"learning_rate": 6.697209380448333e-06,
"loss": 0.0584,
"step": 3520
},
{
"epoch": 0.6198347107438017,
"grad_norm": 0.2644079029560089,
"learning_rate": 6.670343757398882e-06,
"loss": 0.0657,
"step": 3525
},
{
"epoch": 0.6207139089150695,
"grad_norm": 0.24872681498527527,
"learning_rate": 6.643505136001972e-06,
"loss": 0.0435,
"step": 3530
},
{
"epoch": 0.6215931070863373,
"grad_norm": 0.15694132447242737,
"learning_rate": 6.616693733903823e-06,
"loss": 0.053,
"step": 3535
},
{
"epoch": 0.6224723052576051,
"grad_norm": 0.58054518699646,
"learning_rate": 6.5899097685299395e-06,
"loss": 0.0735,
"step": 3540
},
{
"epoch": 0.6233515034288729,
"grad_norm": 0.479245662689209,
"learning_rate": 6.563153457083315e-06,
"loss": 0.0588,
"step": 3545
},
{
"epoch": 0.6242307016001407,
"grad_norm": 0.28133952617645264,
"learning_rate": 6.5364250165427e-06,
"loss": 0.0573,
"step": 3550
},
{
"epoch": 0.6251098997714085,
"grad_norm": 0.5088506937026978,
"learning_rate": 6.509724663660813e-06,
"loss": 0.055,
"step": 3555
},
{
"epoch": 0.6259890979426763,
"grad_norm": 0.7142006158828735,
"learning_rate": 6.4830526149626064e-06,
"loss": 0.04,
"step": 3560
},
{
"epoch": 0.6268682961139441,
"grad_norm": 0.31554004549980164,
"learning_rate": 6.4564090867435e-06,
"loss": 0.0593,
"step": 3565
},
{
"epoch": 0.6277474942852119,
"grad_norm": 0.2784283757209778,
"learning_rate": 6.429794295067625e-06,
"loss": 0.046,
"step": 3570
},
{
"epoch": 0.6286266924564797,
"grad_norm": 0.44909903407096863,
"learning_rate": 6.403208455766081e-06,
"loss": 0.0563,
"step": 3575
},
{
"epoch": 0.6295058906277475,
"grad_norm": 0.21697324514389038,
"learning_rate": 6.376651784435174e-06,
"loss": 0.0527,
"step": 3580
},
{
"epoch": 0.6303850887990153,
"grad_norm": 0.5999769568443298,
"learning_rate": 6.350124496434677e-06,
"loss": 0.066,
"step": 3585
},
{
"epoch": 0.6312642869702831,
"grad_norm": 0.4327978193759918,
"learning_rate": 6.323626806886082e-06,
"loss": 0.0493,
"step": 3590
},
{
"epoch": 0.6321434851415509,
"grad_norm": 0.47847869992256165,
"learning_rate": 6.297158930670852e-06,
"loss": 0.0593,
"step": 3595
},
{
"epoch": 0.6330226833128187,
"grad_norm": 1.0836427211761475,
"learning_rate": 6.270721082428678e-06,
"loss": 0.0557,
"step": 3600
},
{
"epoch": 0.6339018814840865,
"grad_norm": 0.2735603153705597,
"learning_rate": 6.2443134765557475e-06,
"loss": 0.0662,
"step": 3605
},
{
"epoch": 0.6347810796553544,
"grad_norm": 0.3386712372303009,
"learning_rate": 6.2179363272029935e-06,
"loss": 0.0497,
"step": 3610
},
{
"epoch": 0.6356602778266222,
"grad_norm": 0.11221656948328018,
"learning_rate": 6.191589848274369e-06,
"loss": 0.0498,
"step": 3615
},
{
"epoch": 0.63653947599789,
"grad_norm": 0.5436195135116577,
"learning_rate": 6.1652742534251e-06,
"loss": 0.054,
"step": 3620
},
{
"epoch": 0.6374186741691578,
"grad_norm": 0.5452234148979187,
"learning_rate": 6.138989756059968e-06,
"loss": 0.0448,
"step": 3625
},
{
"epoch": 0.6382978723404256,
"grad_norm": 0.4096541404724121,
"learning_rate": 6.1127365693315566e-06,
"loss": 0.0556,
"step": 3630
},
{
"epoch": 0.6391770705116934,
"grad_norm": 0.4600813388824463,
"learning_rate": 6.086514906138563e-06,
"loss": 0.0562,
"step": 3635
},
{
"epoch": 0.6400562686829612,
"grad_norm": 0.8097591996192932,
"learning_rate": 6.060324979124016e-06,
"loss": 0.0534,
"step": 3640
},
{
"epoch": 0.640935466854229,
"grad_norm": 0.8001208305358887,
"learning_rate": 6.034167000673611e-06,
"loss": 0.0589,
"step": 3645
},
{
"epoch": 0.6418146650254968,
"grad_norm": 0.3946583867073059,
"learning_rate": 6.008041182913933e-06,
"loss": 0.0594,
"step": 3650
},
{
"epoch": 0.6426938631967646,
"grad_norm": 0.3865828216075897,
"learning_rate": 5.981947737710779e-06,
"loss": 0.0655,
"step": 3655
},
{
"epoch": 0.6435730613680324,
"grad_norm": 1.0482414960861206,
"learning_rate": 5.955886876667414e-06,
"loss": 0.0652,
"step": 3660
},
{
"epoch": 0.6444522595393002,
"grad_norm": 1.3280454874038696,
"learning_rate": 5.929858811122868e-06,
"loss": 0.0678,
"step": 3665
},
{
"epoch": 0.645331457710568,
"grad_norm": 0.1955081820487976,
"learning_rate": 5.903863752150212e-06,
"loss": 0.0565,
"step": 3670
},
{
"epoch": 0.6462106558818358,
"grad_norm": 0.37470242381095886,
"learning_rate": 5.877901910554862e-06,
"loss": 0.0558,
"step": 3675
},
{
"epoch": 0.6470898540531036,
"grad_norm": 0.8022100329399109,
"learning_rate": 5.851973496872849e-06,
"loss": 0.0498,
"step": 3680
},
{
"epoch": 0.6479690522243714,
"grad_norm": 0.23341333866119385,
"learning_rate": 5.82607872136913e-06,
"loss": 0.0637,
"step": 3685
},
{
"epoch": 0.6488482503956392,
"grad_norm": 0.40961551666259766,
"learning_rate": 5.800217794035872e-06,
"loss": 0.0463,
"step": 3690
},
{
"epoch": 0.649727448566907,
"grad_norm": 0.21275675296783447,
"learning_rate": 5.774390924590754e-06,
"loss": 0.0552,
"step": 3695
},
{
"epoch": 0.6506066467381748,
"grad_norm": 0.17066530883312225,
"learning_rate": 5.748598322475258e-06,
"loss": 0.0585,
"step": 3700
},
{
"epoch": 0.6514858449094426,
"grad_norm": 0.3308875262737274,
"learning_rate": 5.7228401968529836e-06,
"loss": 0.058,
"step": 3705
},
{
"epoch": 0.6523650430807104,
"grad_norm": 0.7064841389656067,
"learning_rate": 5.697116756607946e-06,
"loss": 0.0608,
"step": 3710
},
{
"epoch": 0.6532442412519782,
"grad_norm": 0.13864412903785706,
"learning_rate": 5.671428210342884e-06,
"loss": 0.0409,
"step": 3715
},
{
"epoch": 0.654123439423246,
"grad_norm": 0.49417930841445923,
"learning_rate": 5.64577476637755e-06,
"loss": 0.0633,
"step": 3720
},
{
"epoch": 0.6550026375945138,
"grad_norm": 0.5518152117729187,
"learning_rate": 5.620156632747053e-06,
"loss": 0.0522,
"step": 3725
},
{
"epoch": 0.6558818357657816,
"grad_norm": 0.157115638256073,
"learning_rate": 5.594574017200149e-06,
"loss": 0.0474,
"step": 3730
},
{
"epoch": 0.6567610339370494,
"grad_norm": 0.16834110021591187,
"learning_rate": 5.569027127197565e-06,
"loss": 0.0573,
"step": 3735
},
{
"epoch": 0.6576402321083172,
"grad_norm": 1.1739530563354492,
"learning_rate": 5.5435161699103055e-06,
"loss": 0.0531,
"step": 3740
},
{
"epoch": 0.658519430279585,
"grad_norm": 0.4163840115070343,
"learning_rate": 5.518041352217989e-06,
"loss": 0.0731,
"step": 3745
},
{
"epoch": 0.6593986284508528,
"grad_norm": 0.2617214322090149,
"learning_rate": 5.492602880707161e-06,
"loss": 0.0614,
"step": 3750
},
{
"epoch": 0.6602778266221206,
"grad_norm": 0.3267338275909424,
"learning_rate": 5.467200961669619e-06,
"loss": 0.0511,
"step": 3755
},
{
"epoch": 0.6611570247933884,
"grad_norm": 0.7266274094581604,
"learning_rate": 5.441835801100734e-06,
"loss": 0.0526,
"step": 3760
},
{
"epoch": 0.6620362229646563,
"grad_norm": 0.7042490243911743,
"learning_rate": 5.416507604697801e-06,
"loss": 0.0383,
"step": 3765
},
{
"epoch": 0.662915421135924,
"grad_norm": 0.5207750797271729,
"learning_rate": 5.391216577858331e-06,
"loss": 0.0561,
"step": 3770
},
{
"epoch": 0.6637946193071919,
"grad_norm": 0.7317136526107788,
"learning_rate": 5.365962925678443e-06,
"loss": 0.0609,
"step": 3775
},
{
"epoch": 0.6646738174784597,
"grad_norm": 0.47223329544067383,
"learning_rate": 5.340746852951151e-06,
"loss": 0.0661,
"step": 3780
},
{
"epoch": 0.6655530156497275,
"grad_norm": 0.5919240713119507,
"learning_rate": 5.315568564164713e-06,
"loss": 0.0591,
"step": 3785
},
{
"epoch": 0.6664322138209953,
"grad_norm": 0.16643045842647552,
"learning_rate": 5.290428263500996e-06,
"loss": 0.0512,
"step": 3790
},
{
"epoch": 0.6673114119922631,
"grad_norm": 0.6606490612030029,
"learning_rate": 5.26532615483379e-06,
"loss": 0.06,
"step": 3795
},
{
"epoch": 0.6681906101635309,
"grad_norm": 0.5036027431488037,
"learning_rate": 5.240262441727187e-06,
"loss": 0.0546,
"step": 3800
},
{
"epoch": 0.6690698083347987,
"grad_norm": 0.2331734448671341,
"learning_rate": 5.215237327433895e-06,
"loss": 0.0512,
"step": 3805
},
{
"epoch": 0.6699490065060665,
"grad_norm": 0.19755728542804718,
"learning_rate": 5.190251014893621e-06,
"loss": 0.047,
"step": 3810
},
{
"epoch": 0.6708282046773343,
"grad_norm": 0.788175642490387,
"learning_rate": 5.165303706731397e-06,
"loss": 0.0681,
"step": 3815
},
{
"epoch": 0.6717074028486021,
"grad_norm": 0.19185423851013184,
"learning_rate": 5.140395605255965e-06,
"loss": 0.0535,
"step": 3820
},
{
"epoch": 0.6725866010198699,
"grad_norm": 0.22378072142601013,
"learning_rate": 5.115526912458113e-06,
"loss": 0.0584,
"step": 3825
},
{
"epoch": 0.6734657991911377,
"grad_norm": 0.31949400901794434,
"learning_rate": 5.090697830009057e-06,
"loss": 0.059,
"step": 3830
},
{
"epoch": 0.6743449973624055,
"grad_norm": 0.31779804825782776,
"learning_rate": 5.065908559258782e-06,
"loss": 0.0541,
"step": 3835
},
{
"epoch": 0.6752241955336733,
"grad_norm": 0.9284188747406006,
"learning_rate": 5.0411593012344305e-06,
"loss": 0.0461,
"step": 3840
},
{
"epoch": 0.6761033937049411,
"grad_norm": 1.0573227405548096,
"learning_rate": 5.0164502566386655e-06,
"loss": 0.0529,
"step": 3845
},
{
"epoch": 0.6769825918762089,
"grad_norm": 0.8638303279876709,
"learning_rate": 4.991781625848039e-06,
"loss": 0.0652,
"step": 3850
},
{
"epoch": 0.6778617900474767,
"grad_norm": 0.3580770194530487,
"learning_rate": 4.967153608911366e-06,
"loss": 0.0456,
"step": 3855
},
{
"epoch": 0.6787409882187445,
"grad_norm": 0.1639026552438736,
"learning_rate": 4.942566405548109e-06,
"loss": 0.0624,
"step": 3860
},
{
"epoch": 0.6796201863900123,
"grad_norm": 0.22067619860172272,
"learning_rate": 4.918020215146759e-06,
"loss": 0.0586,
"step": 3865
},
{
"epoch": 0.6804993845612801,
"grad_norm": 0.24657614529132843,
"learning_rate": 4.8935152367632136e-06,
"loss": 0.0542,
"step": 3870
},
{
"epoch": 0.6813785827325479,
"grad_norm": 0.9170181751251221,
"learning_rate": 4.869051669119153e-06,
"loss": 0.0517,
"step": 3875
},
{
"epoch": 0.6822577809038157,
"grad_norm": 1.0674784183502197,
"learning_rate": 4.844629710600457e-06,
"loss": 0.0725,
"step": 3880
},
{
"epoch": 0.6831369790750835,
"grad_norm": 0.3955898880958557,
"learning_rate": 4.820249559255559e-06,
"loss": 0.0557,
"step": 3885
},
{
"epoch": 0.6840161772463513,
"grad_norm": 0.3524361252784729,
"learning_rate": 4.795911412793883e-06,
"loss": 0.0589,
"step": 3890
},
{
"epoch": 0.6848953754176191,
"grad_norm": 0.3493446111679077,
"learning_rate": 4.771615468584194e-06,
"loss": 0.0516,
"step": 3895
},
{
"epoch": 0.6857745735888869,
"grad_norm": 0.3028711676597595,
"learning_rate": 4.747361923653039e-06,
"loss": 0.0513,
"step": 3900
},
{
"epoch": 0.6866537717601547,
"grad_norm": 0.09052418917417526,
"learning_rate": 4.723150974683112e-06,
"loss": 0.0559,
"step": 3905
},
{
"epoch": 0.6875329699314225,
"grad_norm": 0.7868338823318481,
"learning_rate": 4.698982818011694e-06,
"loss": 0.0666,
"step": 3910
},
{
"epoch": 0.6884121681026903,
"grad_norm": 0.16918179392814636,
"learning_rate": 4.674857649629035e-06,
"loss": 0.0527,
"step": 3915
},
{
"epoch": 0.6892913662739582,
"grad_norm": 0.3652224540710449,
"learning_rate": 4.650775665176783e-06,
"loss": 0.0567,
"step": 3920
},
{
"epoch": 0.690170564445226,
"grad_norm": 0.5725377798080444,
"learning_rate": 4.626737059946375e-06,
"loss": 0.0632,
"step": 3925
},
{
"epoch": 0.6910497626164938,
"grad_norm": 0.4080544114112854,
"learning_rate": 4.602742028877475e-06,
"loss": 0.0485,
"step": 3930
},
{
"epoch": 0.6919289607877616,
"grad_norm": 0.13161161541938782,
"learning_rate": 4.578790766556386e-06,
"loss": 0.0661,
"step": 3935
},
{
"epoch": 0.6928081589590294,
"grad_norm": 0.28293490409851074,
"learning_rate": 4.554883467214472e-06,
"loss": 0.0572,
"step": 3940
},
{
"epoch": 0.6936873571302972,
"grad_norm": 0.4282214343547821,
"learning_rate": 4.53102032472657e-06,
"loss": 0.0519,
"step": 3945
},
{
"epoch": 0.694566555301565,
"grad_norm": 0.44754886627197266,
"learning_rate": 4.507201532609444e-06,
"loss": 0.056,
"step": 3950
},
{
"epoch": 0.6954457534728328,
"grad_norm": 0.1610032469034195,
"learning_rate": 4.4834272840201945e-06,
"loss": 0.0592,
"step": 3955
},
{
"epoch": 0.6963249516441006,
"grad_norm": 0.18849897384643555,
"learning_rate": 4.459697771754704e-06,
"loss": 0.0546,
"step": 3960
},
{
"epoch": 0.6972041498153684,
"grad_norm": 0.41627243161201477,
"learning_rate": 4.436013188246056e-06,
"loss": 0.0654,
"step": 3965
},
{
"epoch": 0.6980833479866362,
"grad_norm": 0.3271617889404297,
"learning_rate": 4.412373725563001e-06,
"loss": 0.0524,
"step": 3970
},
{
"epoch": 0.698962546157904,
"grad_norm": 0.231711283326149,
"learning_rate": 4.388779575408371e-06,
"loss": 0.0543,
"step": 3975
},
{
"epoch": 0.6998417443291718,
"grad_norm": 0.38711780309677124,
"learning_rate": 4.36523092911756e-06,
"loss": 0.0592,
"step": 3980
},
{
"epoch": 0.7007209425004396,
"grad_norm": 0.3789129853248596,
"learning_rate": 4.341727977656925e-06,
"loss": 0.0556,
"step": 3985
},
{
"epoch": 0.7016001406717074,
"grad_norm": 1.4185389280319214,
"learning_rate": 4.318270911622285e-06,
"loss": 0.0618,
"step": 3990
},
{
"epoch": 0.7024793388429752,
"grad_norm": 0.749077558517456,
"learning_rate": 4.2948599212373386e-06,
"loss": 0.0558,
"step": 3995
},
{
"epoch": 0.703358537014243,
"grad_norm": 0.23492804169654846,
"learning_rate": 4.271495196352141e-06,
"loss": 0.0614,
"step": 4000
},
{
"epoch": 0.7042377351855108,
"grad_norm": 0.2221526801586151,
"learning_rate": 4.248176926441574e-06,
"loss": 0.0592,
"step": 4005
},
{
"epoch": 0.7051169333567786,
"grad_norm": 0.2977111339569092,
"learning_rate": 4.224905300603772e-06,
"loss": 0.0449,
"step": 4010
},
{
"epoch": 0.7059961315280464,
"grad_norm": 0.3051705062389374,
"learning_rate": 4.2016805075586306e-06,
"loss": 0.0507,
"step": 4015
},
{
"epoch": 0.7068753296993142,
"grad_norm": 0.21050839126110077,
"learning_rate": 4.178502735646244e-06,
"loss": 0.0666,
"step": 4020
},
{
"epoch": 0.707754527870582,
"grad_norm": 0.37007981538772583,
"learning_rate": 4.1553721728254e-06,
"loss": 0.0565,
"step": 4025
},
{
"epoch": 0.7086337260418498,
"grad_norm": 0.8668897747993469,
"learning_rate": 4.1322890066720465e-06,
"loss": 0.0503,
"step": 4030
},
{
"epoch": 0.7095129242131176,
"grad_norm": 0.13824816048145294,
"learning_rate": 4.109253424377773e-06,
"loss": 0.0656,
"step": 4035
},
{
"epoch": 0.7103921223843854,
"grad_norm": 0.5541465282440186,
"learning_rate": 4.086265612748277e-06,
"loss": 0.0518,
"step": 4040
},
{
"epoch": 0.7112713205556532,
"grad_norm": 0.5307297706604004,
"learning_rate": 4.063325758201878e-06,
"loss": 0.0554,
"step": 4045
},
{
"epoch": 0.712150518726921,
"grad_norm": 0.6194763779640198,
"learning_rate": 4.040434046767984e-06,
"loss": 0.0648,
"step": 4050
},
{
"epoch": 0.7130297168981888,
"grad_norm": 0.08174088597297668,
"learning_rate": 4.017590664085593e-06,
"loss": 0.0512,
"step": 4055
},
{
"epoch": 0.7139089150694566,
"grad_norm": 0.3599446713924408,
"learning_rate": 3.994795795401774e-06,
"loss": 0.0466,
"step": 4060
},
{
"epoch": 0.7147881132407244,
"grad_norm": 0.7686516046524048,
"learning_rate": 3.9720496255701855e-06,
"loss": 0.0576,
"step": 4065
},
{
"epoch": 0.7156673114119922,
"grad_norm": 0.4762028157711029,
"learning_rate": 3.949352339049561e-06,
"loss": 0.0507,
"step": 4070
},
{
"epoch": 0.71654650958326,
"grad_norm": 0.37046587467193604,
"learning_rate": 3.926704119902219e-06,
"loss": 0.063,
"step": 4075
},
{
"epoch": 0.7174257077545279,
"grad_norm": 0.0833682268857956,
"learning_rate": 3.904105151792563e-06,
"loss": 0.0501,
"step": 4080
},
{
"epoch": 0.7183049059257957,
"grad_norm": 0.24260376393795013,
"learning_rate": 3.8815556179856106e-06,
"loss": 0.0531,
"step": 4085
},
{
"epoch": 0.7191841040970635,
"grad_norm": 0.28895097970962524,
"learning_rate": 3.859055701345477e-06,
"loss": 0.0558,
"step": 4090
},
{
"epoch": 0.7200633022683313,
"grad_norm": 0.3396805226802826,
"learning_rate": 3.8366055843339315e-06,
"loss": 0.0718,
"step": 4095
},
{
"epoch": 0.7209425004395991,
"grad_norm": 0.32812032103538513,
"learning_rate": 3.8142054490088752e-06,
"loss": 0.06,
"step": 4100
},
{
"epoch": 0.7218216986108669,
"grad_norm": 0.2770575284957886,
"learning_rate": 3.791855477022903e-06,
"loss": 0.0596,
"step": 4105
},
{
"epoch": 0.7227008967821347,
"grad_norm": 0.38149547576904297,
"learning_rate": 3.769555849621799e-06,
"loss": 0.0639,
"step": 4110
},
{
"epoch": 0.7235800949534025,
"grad_norm": 0.19870160520076752,
"learning_rate": 3.747306747643089e-06,
"loss": 0.0503,
"step": 4115
},
{
"epoch": 0.7244592931246703,
"grad_norm": 0.31559768319129944,
"learning_rate": 3.7251083515145658e-06,
"loss": 0.0546,
"step": 4120
},
{
"epoch": 0.7253384912959381,
"grad_norm": 0.4508054852485657,
"learning_rate": 3.7029608412528263e-06,
"loss": 0.0658,
"step": 4125
},
{
"epoch": 0.7262176894672059,
"grad_norm": 0.34780052304267883,
"learning_rate": 3.680864396461803e-06,
"loss": 0.0562,
"step": 4130
},
{
"epoch": 0.7270968876384737,
"grad_norm": 0.5677065849304199,
"learning_rate": 3.658819196331327e-06,
"loss": 0.0643,
"step": 4135
},
{
"epoch": 0.7279760858097415,
"grad_norm": 0.1682279258966446,
"learning_rate": 3.6368254196356576e-06,
"loss": 0.053,
"step": 4140
},
{
"epoch": 0.7288552839810093,
"grad_norm": 0.32326585054397583,
"learning_rate": 3.614883244732045e-06,
"loss": 0.0408,
"step": 4145
},
{
"epoch": 0.7297344821522771,
"grad_norm": 0.2262571007013321,
"learning_rate": 3.5929928495592657e-06,
"loss": 0.0552,
"step": 4150
},
{
"epoch": 0.7306136803235449,
"grad_norm": 0.26115310192108154,
"learning_rate": 3.5711544116362028e-06,
"loss": 0.0611,
"step": 4155
},
{
"epoch": 0.7314928784948127,
"grad_norm": 0.464222252368927,
"learning_rate": 3.5493681080603903e-06,
"loss": 0.055,
"step": 4160
},
{
"epoch": 0.7323720766660805,
"grad_norm": 0.35738715529441833,
"learning_rate": 3.5276341155065864e-06,
"loss": 0.0632,
"step": 4165
},
{
"epoch": 0.7332512748373483,
"grad_norm": 0.393279105424881,
"learning_rate": 3.505952610225327e-06,
"loss": 0.0529,
"step": 4170
},
{
"epoch": 0.7341304730086161,
"grad_norm": 0.5563225746154785,
"learning_rate": 3.4843237680415153e-06,
"loss": 0.0628,
"step": 4175
},
{
"epoch": 0.7350096711798839,
"grad_norm": 0.2666977047920227,
"learning_rate": 3.462747764352974e-06,
"loss": 0.0547,
"step": 4180
},
{
"epoch": 0.7358888693511517,
"grad_norm": 0.4078899919986725,
"learning_rate": 3.441224774129055e-06,
"loss": 0.0639,
"step": 4185
},
{
"epoch": 0.7367680675224195,
"grad_norm": 0.31975606083869934,
"learning_rate": 3.4197549719091794e-06,
"loss": 0.0628,
"step": 4190
},
{
"epoch": 0.7376472656936873,
"grad_norm": 0.295015424489975,
"learning_rate": 3.3983385318014573e-06,
"loss": 0.049,
"step": 4195
},
{
"epoch": 0.7385264638649551,
"grad_norm": 0.608062207698822,
"learning_rate": 3.3769756274812526e-06,
"loss": 0.047,
"step": 4200
},
{
"epoch": 0.7394056620362229,
"grad_norm": 0.20837105810642242,
"learning_rate": 3.3556664321897914e-06,
"loss": 0.0623,
"step": 4205
},
{
"epoch": 0.7402848602074907,
"grad_norm": 0.20374645292758942,
"learning_rate": 3.334411118732744e-06,
"loss": 0.0576,
"step": 4210
},
{
"epoch": 0.7411640583787585,
"grad_norm": 0.49013030529022217,
"learning_rate": 3.3132098594788385e-06,
"loss": 0.0632,
"step": 4215
},
{
"epoch": 0.7420432565500263,
"grad_norm": 0.3315906822681427,
"learning_rate": 3.2920628263584375e-06,
"loss": 0.0536,
"step": 4220
},
{
"epoch": 0.7429224547212941,
"grad_norm": 0.3624337613582611,
"learning_rate": 3.2709701908621726e-06,
"loss": 0.0542,
"step": 4225
},
{
"epoch": 0.743801652892562,
"grad_norm": 0.2681962549686432,
"learning_rate": 3.2499321240395387e-06,
"loss": 0.0581,
"step": 4230
},
{
"epoch": 0.7446808510638298,
"grad_norm": 0.11040078103542328,
"learning_rate": 3.2289487964975074e-06,
"loss": 0.0497,
"step": 4235
},
{
"epoch": 0.7455600492350976,
"grad_norm": 0.5186774730682373,
"learning_rate": 3.2080203783991504e-06,
"loss": 0.0594,
"step": 4240
},
{
"epoch": 0.7464392474063654,
"grad_norm": 0.4543941617012024,
"learning_rate": 3.1871470394622407e-06,
"loss": 0.0602,
"step": 4245
},
{
"epoch": 0.7473184455776332,
"grad_norm": 0.8331423997879028,
"learning_rate": 3.1663289489579054e-06,
"loss": 0.0453,
"step": 4250
},
{
"epoch": 0.748197643748901,
"grad_norm": 0.21638324856758118,
"learning_rate": 3.145566275709231e-06,
"loss": 0.0534,
"step": 4255
},
{
"epoch": 0.7490768419201688,
"grad_norm": 0.19550643861293793,
"learning_rate": 3.124859188089905e-06,
"loss": 0.0502,
"step": 4260
},
{
"epoch": 0.7499560400914366,
"grad_norm": 0.18126316368579865,
"learning_rate": 3.1042078540228358e-06,
"loss": 0.0542,
"step": 4265
},
{
"epoch": 0.7508352382627044,
"grad_norm": 0.4952530264854431,
"learning_rate": 3.0836124409788137e-06,
"loss": 0.0518,
"step": 4270
},
{
"epoch": 0.7517144364339722,
"grad_norm": 0.7046754956245422,
"learning_rate": 3.063073115975136e-06,
"loss": 0.0575,
"step": 4275
},
{
"epoch": 0.75259363460524,
"grad_norm": 0.13164618611335754,
"learning_rate": 3.0425900455742584e-06,
"loss": 0.0475,
"step": 4280
},
{
"epoch": 0.7534728327765078,
"grad_norm": 0.21266759932041168,
"learning_rate": 3.022163395882438e-06,
"loss": 0.0532,
"step": 4285
},
{
"epoch": 0.7543520309477756,
"grad_norm": 0.6557819247245789,
"learning_rate": 3.0017933325484028e-06,
"loss": 0.0501,
"step": 4290
},
{
"epoch": 0.7552312291190434,
"grad_norm": 0.253121554851532,
"learning_rate": 2.981480020761978e-06,
"loss": 0.0568,
"step": 4295
},
{
"epoch": 0.7561104272903112,
"grad_norm": 0.26722845435142517,
"learning_rate": 2.9612236252527904e-06,
"loss": 0.0564,
"step": 4300
},
{
"epoch": 0.756989625461579,
"grad_norm": 0.1494354009628296,
"learning_rate": 2.941024310288886e-06,
"loss": 0.0577,
"step": 4305
},
{
"epoch": 0.7578688236328468,
"grad_norm": 0.16053201258182526,
"learning_rate": 2.9208822396754333e-06,
"loss": 0.0604,
"step": 4310
},
{
"epoch": 0.7587480218041146,
"grad_norm": 0.5304045677185059,
"learning_rate": 2.9007975767533714e-06,
"loss": 0.0598,
"step": 4315
},
{
"epoch": 0.7596272199753824,
"grad_norm": 0.587253987789154,
"learning_rate": 2.8807704843981e-06,
"loss": 0.0596,
"step": 4320
},
{
"epoch": 0.7605064181466502,
"grad_norm": 0.6153156161308289,
"learning_rate": 2.8608011250181544e-06,
"loss": 0.052,
"step": 4325
},
{
"epoch": 0.761385616317918,
"grad_norm": 0.7715466618537903,
"learning_rate": 2.8408896605538905e-06,
"loss": 0.0501,
"step": 4330
},
{
"epoch": 0.7622648144891858,
"grad_norm": 0.24888671934604645,
"learning_rate": 2.8210362524761557e-06,
"loss": 0.0594,
"step": 4335
},
{
"epoch": 0.7631440126604536,
"grad_norm": 0.6384350061416626,
"learning_rate": 2.8012410617850083e-06,
"loss": 0.0491,
"step": 4340
},
{
"epoch": 0.7640232108317214,
"grad_norm": 0.5361153483390808,
"learning_rate": 2.7815042490083857e-06,
"loss": 0.053,
"step": 4345
},
{
"epoch": 0.7649024090029892,
"grad_norm": 0.3747937083244324,
"learning_rate": 2.7618259742008226e-06,
"loss": 0.0555,
"step": 4350
},
{
"epoch": 0.765781607174257,
"grad_norm": 0.2643256187438965,
"learning_rate": 2.7422063969421286e-06,
"loss": 0.0533,
"step": 4355
},
{
"epoch": 0.7666608053455248,
"grad_norm": 0.6822528839111328,
"learning_rate": 2.722645676336123e-06,
"loss": 0.057,
"step": 4360
},
{
"epoch": 0.7675400035167926,
"grad_norm": 0.2672845423221588,
"learning_rate": 2.7031439710093254e-06,
"loss": 0.058,
"step": 4365
},
{
"epoch": 0.7684192016880604,
"grad_norm": 0.1608499139547348,
"learning_rate": 2.683701439109676e-06,
"loss": 0.0573,
"step": 4370
},
{
"epoch": 0.7692983998593282,
"grad_norm": 0.2795965373516083,
"learning_rate": 2.6643182383052448e-06,
"loss": 0.0667,
"step": 4375
},
{
"epoch": 0.770177598030596,
"grad_norm": 0.11451299488544464,
"learning_rate": 2.644994525782971e-06,
"loss": 0.0527,
"step": 4380
},
{
"epoch": 0.7710567962018638,
"grad_norm": 0.48407652974128723,
"learning_rate": 2.625730458247362e-06,
"loss": 0.0572,
"step": 4385
},
{
"epoch": 0.7719359943731317,
"grad_norm": 0.5724853277206421,
"learning_rate": 2.606526191919259e-06,
"loss": 0.0432,
"step": 4390
},
{
"epoch": 0.7728151925443995,
"grad_norm": 0.6579432487487793,
"learning_rate": 2.5873818825345254e-06,
"loss": 0.0521,
"step": 4395
},
{
"epoch": 0.7736943907156673,
"grad_norm": 0.7850777506828308,
"learning_rate": 2.5682976853428264e-06,
"loss": 0.0563,
"step": 4400
},
{
"epoch": 0.7745735888869351,
"grad_norm": 0.20982079207897186,
"learning_rate": 2.5492737551063374e-06,
"loss": 0.0587,
"step": 4405
},
{
"epoch": 0.7754527870582029,
"grad_norm": 0.20126987993717194,
"learning_rate": 2.5303102460985098e-06,
"loss": 0.0585,
"step": 4410
},
{
"epoch": 0.7763319852294708,
"grad_norm": 0.16406214237213135,
"learning_rate": 2.511407312102809e-06,
"loss": 0.0641,
"step": 4415
},
{
"epoch": 0.7772111834007386,
"grad_norm": 0.8947567939758301,
"learning_rate": 2.4925651064114788e-06,
"loss": 0.0563,
"step": 4420
},
{
"epoch": 0.7780903815720064,
"grad_norm": 0.9534220099449158,
"learning_rate": 2.4737837818242747e-06,
"loss": 0.0472,
"step": 4425
},
{
"epoch": 0.7789695797432742,
"grad_norm": 0.24452577531337738,
"learning_rate": 2.455063490647257e-06,
"loss": 0.0545,
"step": 4430
},
{
"epoch": 0.779848777914542,
"grad_norm": 0.5126246213912964,
"learning_rate": 2.4364043846915273e-06,
"loss": 0.0502,
"step": 4435
},
{
"epoch": 0.7807279760858098,
"grad_norm": 0.3192353844642639,
"learning_rate": 2.4178066152720203e-06,
"loss": 0.0672,
"step": 4440
},
{
"epoch": 0.7816071742570776,
"grad_norm": 0.8498159050941467,
"learning_rate": 2.399270333206253e-06,
"loss": 0.0575,
"step": 4445
},
{
"epoch": 0.7824863724283454,
"grad_norm": 0.7275409698486328,
"learning_rate": 2.3807956888131213e-06,
"loss": 0.0623,
"step": 4450
},
{
"epoch": 0.7833655705996132,
"grad_norm": 0.6221399307250977,
"learning_rate": 2.362382831911675e-06,
"loss": 0.0554,
"step": 4455
},
{
"epoch": 0.784244768770881,
"grad_norm": 0.21380391716957092,
"learning_rate": 2.3440319118198997e-06,
"loss": 0.0551,
"step": 4460
},
{
"epoch": 0.7851239669421488,
"grad_norm": 0.8011379837989807,
"learning_rate": 2.3257430773535116e-06,
"loss": 0.051,
"step": 4465
},
{
"epoch": 0.7860031651134166,
"grad_norm": 0.5205139517784119,
"learning_rate": 2.307516476824738e-06,
"loss": 0.0615,
"step": 4470
},
{
"epoch": 0.7868823632846844,
"grad_norm": 0.40472784638404846,
"learning_rate": 2.289352258041133e-06,
"loss": 0.0515,
"step": 4475
},
{
"epoch": 0.7877615614559522,
"grad_norm": 0.11419567465782166,
"learning_rate": 2.271250568304366e-06,
"loss": 0.0511,
"step": 4480
},
{
"epoch": 0.78864075962722,
"grad_norm": 0.11799798905849457,
"learning_rate": 2.253211554409034e-06,
"loss": 0.0584,
"step": 4485
},
{
"epoch": 0.7895199577984878,
"grad_norm": 0.5825390219688416,
"learning_rate": 2.235235362641458e-06,
"loss": 0.052,
"step": 4490
},
{
"epoch": 0.7903991559697556,
"grad_norm": 0.14880183339118958,
"learning_rate": 2.2173221387785215e-06,
"loss": 0.0567,
"step": 4495
},
{
"epoch": 0.7912783541410234,
"grad_norm": 0.16404080390930176,
"learning_rate": 2.1994720280864567e-06,
"loss": 0.0555,
"step": 4500
},
{
"epoch": 0.7921575523122912,
"grad_norm": 0.13647328317165375,
"learning_rate": 2.1816851753197023e-06,
"loss": 0.0443,
"step": 4505
},
{
"epoch": 0.793036750483559,
"grad_norm": 0.3101443648338318,
"learning_rate": 2.163961724719693e-06,
"loss": 0.0568,
"step": 4510
},
{
"epoch": 0.7939159486548268,
"grad_norm": 0.5262371897697449,
"learning_rate": 2.1463018200137197e-06,
"loss": 0.0541,
"step": 4515
},
{
"epoch": 0.7947951468260946,
"grad_norm": 0.15383735299110413,
"learning_rate": 2.128705604413741e-06,
"loss": 0.057,
"step": 4520
},
{
"epoch": 0.7956743449973624,
"grad_norm": 0.33696064352989197,
"learning_rate": 2.1111732206152424e-06,
"loss": 0.0541,
"step": 4525
},
{
"epoch": 0.7965535431686303,
"grad_norm": 0.37046894431114197,
"learning_rate": 2.093704810796062e-06,
"loss": 0.0677,
"step": 4530
},
{
"epoch": 0.797432741339898,
"grad_norm": 0.2881315350532532,
"learning_rate": 2.076300516615252e-06,
"loss": 0.0516,
"step": 4535
},
{
"epoch": 0.7983119395111659,
"grad_norm": 0.21759085357189178,
"learning_rate": 2.0589604792119124e-06,
"loss": 0.0604,
"step": 4540
},
{
"epoch": 0.7991911376824337,
"grad_norm": 0.6279814839363098,
"learning_rate": 2.0416848392040647e-06,
"loss": 0.0618,
"step": 4545
},
{
"epoch": 0.8000703358537015,
"grad_norm": 0.06609740853309631,
"learning_rate": 2.024473736687501e-06,
"loss": 0.0478,
"step": 4550
},
{
"epoch": 0.8009495340249693,
"grad_norm": 0.3990488052368164,
"learning_rate": 2.0073273112346526e-06,
"loss": 0.0563,
"step": 4555
},
{
"epoch": 0.8018287321962371,
"grad_norm": 0.54508376121521,
"learning_rate": 1.9902457018934496e-06,
"loss": 0.0665,
"step": 4560
},
{
"epoch": 0.8027079303675049,
"grad_norm": 0.28929072618484497,
"learning_rate": 1.973229047186206e-06,
"loss": 0.0583,
"step": 4565
},
{
"epoch": 0.8035871285387727,
"grad_norm": 0.4705251157283783,
"learning_rate": 1.9562774851084865e-06,
"loss": 0.0639,
"step": 4570
},
{
"epoch": 0.8044663267100405,
"grad_norm": 0.10792229324579239,
"learning_rate": 1.9393911531279973e-06,
"loss": 0.0687,
"step": 4575
},
{
"epoch": 0.8053455248813083,
"grad_norm": 0.8370270729064941,
"learning_rate": 1.9225701881834524e-06,
"loss": 0.0616,
"step": 4580
},
{
"epoch": 0.8062247230525761,
"grad_norm": 0.20940972864627838,
"learning_rate": 1.9058147266834892e-06,
"loss": 0.0588,
"step": 4585
},
{
"epoch": 0.8071039212238439,
"grad_norm": 0.43674102425575256,
"learning_rate": 1.8891249045055349e-06,
"loss": 0.0424,
"step": 4590
},
{
"epoch": 0.8079831193951117,
"grad_norm": 0.4499530494213104,
"learning_rate": 1.8725008569947366e-06,
"loss": 0.0583,
"step": 4595
},
{
"epoch": 0.8088623175663795,
"grad_norm": 0.16668100655078888,
"learning_rate": 1.8559427189628277e-06,
"loss": 0.0604,
"step": 4600
},
{
"epoch": 0.8097415157376473,
"grad_norm": 0.20123635232448578,
"learning_rate": 1.8394506246870635e-06,
"loss": 0.0561,
"step": 4605
},
{
"epoch": 0.8106207139089151,
"grad_norm": 0.2664503753185272,
"learning_rate": 1.8230247079091146e-06,
"loss": 0.053,
"step": 4610
},
{
"epoch": 0.8114999120801829,
"grad_norm": 0.7137978076934814,
"learning_rate": 1.8066651018339943e-06,
"loss": 0.0572,
"step": 4615
},
{
"epoch": 0.8123791102514507,
"grad_norm": 0.5590645670890808,
"learning_rate": 1.790371939128972e-06,
"loss": 0.0616,
"step": 4620
},
{
"epoch": 0.8132583084227185,
"grad_norm": 0.08397898077964783,
"learning_rate": 1.7741453519224982e-06,
"loss": 0.058,
"step": 4625
},
{
"epoch": 0.8141375065939863,
"grad_norm": 0.33352193236351013,
"learning_rate": 1.7579854718031285e-06,
"loss": 0.0517,
"step": 4630
},
{
"epoch": 0.8150167047652541,
"grad_norm": 0.20153024792671204,
"learning_rate": 1.741892429818468e-06,
"loss": 0.0547,
"step": 4635
},
{
"epoch": 0.8158959029365219,
"grad_norm": 0.8683350086212158,
"learning_rate": 1.7258663564740996e-06,
"loss": 0.0618,
"step": 4640
},
{
"epoch": 0.8167751011077897,
"grad_norm": 0.19968271255493164,
"learning_rate": 1.7099073817325307e-06,
"loss": 0.0568,
"step": 4645
},
{
"epoch": 0.8176542992790575,
"grad_norm": 0.3933415412902832,
"learning_rate": 1.6940156350121273e-06,
"loss": 0.0622,
"step": 4650
},
{
"epoch": 0.8185334974503253,
"grad_norm": 0.8188037872314453,
"learning_rate": 1.6781912451860827e-06,
"loss": 0.0645,
"step": 4655
},
{
"epoch": 0.8194126956215931,
"grad_norm": 0.6926817893981934,
"learning_rate": 1.6624343405813615e-06,
"loss": 0.0561,
"step": 4660
},
{
"epoch": 0.8202918937928609,
"grad_norm": 0.5045862197875977,
"learning_rate": 1.6467450489776581e-06,
"loss": 0.0668,
"step": 4665
},
{
"epoch": 0.8211710919641287,
"grad_norm": 0.5935778617858887,
"learning_rate": 1.6311234976063694e-06,
"loss": 0.0575,
"step": 4670
},
{
"epoch": 0.8220502901353965,
"grad_norm": 0.3604169189929962,
"learning_rate": 1.6155698131495457e-06,
"loss": 0.0543,
"step": 4675
},
{
"epoch": 0.8229294883066643,
"grad_norm": 0.24668653309345245,
"learning_rate": 1.6000841217388864e-06,
"loss": 0.057,
"step": 4680
},
{
"epoch": 0.8238086864779322,
"grad_norm": 0.33264681696891785,
"learning_rate": 1.5846665489546964e-06,
"loss": 0.0572,
"step": 4685
},
{
"epoch": 0.8246878846492,
"grad_norm": 0.6432152986526489,
"learning_rate": 1.5693172198248863e-06,
"loss": 0.0604,
"step": 4690
},
{
"epoch": 0.8255670828204678,
"grad_norm": 0.4105263948440552,
"learning_rate": 1.5540362588239366e-06,
"loss": 0.0701,
"step": 4695
},
{
"epoch": 0.8264462809917356,
"grad_norm": 0.25226283073425293,
"learning_rate": 1.5388237898719105e-06,
"loss": 0.0534,
"step": 4700
},
{
"epoch": 0.8273254791630034,
"grad_norm": 0.2534274160861969,
"learning_rate": 1.5236799363334298e-06,
"loss": 0.0535,
"step": 4705
},
{
"epoch": 0.8282046773342712,
"grad_norm": 0.29621097445487976,
"learning_rate": 1.508604821016698e-06,
"loss": 0.0499,
"step": 4710
},
{
"epoch": 0.829083875505539,
"grad_norm": 0.38203129172325134,
"learning_rate": 1.4935985661724727e-06,
"loss": 0.0539,
"step": 4715
},
{
"epoch": 0.8299630736768068,
"grad_norm": 0.9345189929008484,
"learning_rate": 1.4786612934931055e-06,
"loss": 0.0578,
"step": 4720
},
{
"epoch": 0.8308422718480746,
"grad_norm": 0.21688897907733917,
"learning_rate": 1.463793124111531e-06,
"loss": 0.0427,
"step": 4725
},
{
"epoch": 0.8317214700193424,
"grad_norm": 0.22990085184574127,
"learning_rate": 1.4489941786003004e-06,
"loss": 0.0441,
"step": 4730
},
{
"epoch": 0.8326006681906102,
"grad_norm": 0.4832035005092621,
"learning_rate": 1.4342645769705977e-06,
"loss": 0.0588,
"step": 4735
},
{
"epoch": 0.833479866361878,
"grad_norm": 0.40238794684410095,
"learning_rate": 1.419604438671267e-06,
"loss": 0.0519,
"step": 4740
},
{
"epoch": 0.8343590645331458,
"grad_norm": 0.5739127397537231,
"learning_rate": 1.405013882587839e-06,
"loss": 0.0637,
"step": 4745
},
{
"epoch": 0.8352382627044136,
"grad_norm": 0.3309503495693207,
"learning_rate": 1.3904930270415763e-06,
"loss": 0.0506,
"step": 4750
},
{
"epoch": 0.8361174608756814,
"grad_norm": 0.09466574341058731,
"learning_rate": 1.376041989788508e-06,
"loss": 0.0524,
"step": 4755
},
{
"epoch": 0.8369966590469492,
"grad_norm": 0.27358055114746094,
"learning_rate": 1.3616608880184768e-06,
"loss": 0.0545,
"step": 4760
},
{
"epoch": 0.837875857218217,
"grad_norm": 0.26218411326408386,
"learning_rate": 1.3473498383541817e-06,
"loss": 0.0467,
"step": 4765
},
{
"epoch": 0.8387550553894848,
"grad_norm": 0.3839583992958069,
"learning_rate": 1.3331089568502465e-06,
"loss": 0.043,
"step": 4770
},
{
"epoch": 0.8396342535607526,
"grad_norm": 0.37673670053482056,
"learning_rate": 1.3189383589922667e-06,
"loss": 0.0634,
"step": 4775
},
{
"epoch": 0.8405134517320204,
"grad_norm": 0.3072827160358429,
"learning_rate": 1.304838159695877e-06,
"loss": 0.0684,
"step": 4780
},
{
"epoch": 0.8413926499032882,
"grad_norm": 0.7255908250808716,
"learning_rate": 1.290808473305817e-06,
"loss": 0.0545,
"step": 4785
},
{
"epoch": 0.842271848074556,
"grad_norm": 0.4259006381034851,
"learning_rate": 1.2768494135950093e-06,
"loss": 0.0516,
"step": 4790
},
{
"epoch": 0.8431510462458238,
"grad_norm": 0.6573328971862793,
"learning_rate": 1.2629610937636284e-06,
"loss": 0.0494,
"step": 4795
},
{
"epoch": 0.8440302444170916,
"grad_norm": 0.09125727415084839,
"learning_rate": 1.2491436264381984e-06,
"loss": 0.0621,
"step": 4800
},
{
"epoch": 0.8449094425883594,
"grad_norm": 0.4536990821361542,
"learning_rate": 1.2353971236706564e-06,
"loss": 0.0506,
"step": 4805
},
{
"epoch": 0.8457886407596272,
"grad_norm": 0.47011682391166687,
"learning_rate": 1.2217216969374669e-06,
"loss": 0.0582,
"step": 4810
},
{
"epoch": 0.846667838930895,
"grad_norm": 0.15771393477916718,
"learning_rate": 1.208117457138699e-06,
"loss": 0.0612,
"step": 4815
},
{
"epoch": 0.8475470371021628,
"grad_norm": 1.0707141160964966,
"learning_rate": 1.1945845145971414e-06,
"loss": 0.0597,
"step": 4820
},
{
"epoch": 0.8484262352734306,
"grad_norm": 0.381056547164917,
"learning_rate": 1.1811229790573996e-06,
"loss": 0.0678,
"step": 4825
},
{
"epoch": 0.8493054334446984,
"grad_norm": 0.33800312876701355,
"learning_rate": 1.1677329596850117e-06,
"loss": 0.0516,
"step": 4830
},
{
"epoch": 0.8501846316159662,
"grad_norm": 0.19112703204154968,
"learning_rate": 1.1544145650655514e-06,
"loss": 0.062,
"step": 4835
},
{
"epoch": 0.851063829787234,
"grad_norm": 0.4575953483581543,
"learning_rate": 1.1411679032037636e-06,
"loss": 0.0542,
"step": 4840
},
{
"epoch": 0.8519430279585019,
"grad_norm": 0.6570442914962769,
"learning_rate": 1.127993081522678e-06,
"loss": 0.0567,
"step": 4845
},
{
"epoch": 0.8528222261297697,
"grad_norm": 0.4207151234149933,
"learning_rate": 1.114890206862742e-06,
"loss": 0.0595,
"step": 4850
},
{
"epoch": 0.8537014243010375,
"grad_norm": 0.183942511677742,
"learning_rate": 1.1018593854809478e-06,
"loss": 0.0537,
"step": 4855
},
{
"epoch": 0.8545806224723053,
"grad_norm": 0.5328112840652466,
"learning_rate": 1.0889007230499805e-06,
"loss": 0.0598,
"step": 4860
},
{
"epoch": 0.8554598206435731,
"grad_norm": 0.1587759107351303,
"learning_rate": 1.0760143246573552e-06,
"loss": 0.0607,
"step": 4865
},
{
"epoch": 0.8563390188148409,
"grad_norm": 0.3230392336845398,
"learning_rate": 1.0632002948045672e-06,
"loss": 0.0434,
"step": 4870
},
{
"epoch": 0.8572182169861087,
"grad_norm": 0.6485795378684998,
"learning_rate": 1.0504587374062392e-06,
"loss": 0.0518,
"step": 4875
},
{
"epoch": 0.8580974151573765,
"grad_norm": 0.20032288134098053,
"learning_rate": 1.037789755789289e-06,
"loss": 0.047,
"step": 4880
},
{
"epoch": 0.8589766133286443,
"grad_norm": 0.2695741355419159,
"learning_rate": 1.025193452692076e-06,
"loss": 0.0544,
"step": 4885
},
{
"epoch": 0.8598558114999121,
"grad_norm": 0.4286085069179535,
"learning_rate": 1.0126699302635901e-06,
"loss": 0.0749,
"step": 4890
},
{
"epoch": 0.8607350096711799,
"grad_norm": 0.21349282562732697,
"learning_rate": 1.0002192900626028e-06,
"loss": 0.0598,
"step": 4895
},
{
"epoch": 0.8616142078424477,
"grad_norm": 0.16585257649421692,
"learning_rate": 9.878416330568486e-07,
"loss": 0.056,
"step": 4900
},
{
"epoch": 0.8624934060137155,
"grad_norm": 0.723551332950592,
"learning_rate": 9.75537059622218e-07,
"loss": 0.0531,
"step": 4905
},
{
"epoch": 0.8633726041849833,
"grad_norm": 0.7833985090255737,
"learning_rate": 9.633056695419229e-07,
"loss": 0.0499,
"step": 4910
},
{
"epoch": 0.8642518023562511,
"grad_norm": 0.18503841757774353,
"learning_rate": 9.511475620057132e-07,
"loss": 0.0432,
"step": 4915
},
{
"epoch": 0.8651310005275189,
"grad_norm": 0.25365808606147766,
"learning_rate": 9.390628356090459e-07,
"loss": 0.0677,
"step": 4920
},
{
"epoch": 0.8660101986987867,
"grad_norm": 0.6311047673225403,
"learning_rate": 9.270515883523057e-07,
"loss": 0.0642,
"step": 4925
},
{
"epoch": 0.8668893968700545,
"grad_norm": 0.13977408409118652,
"learning_rate": 9.15113917639997e-07,
"loss": 0.0538,
"step": 4930
},
{
"epoch": 0.8677685950413223,
"grad_norm": 0.3203240931034088,
"learning_rate": 9.032499202799627e-07,
"loss": 0.0535,
"step": 4935
},
{
"epoch": 0.8686477932125901,
"grad_norm": 0.5966960191726685,
"learning_rate": 8.914596924825958e-07,
"loss": 0.0485,
"step": 4940
},
{
"epoch": 0.8695269913838579,
"grad_norm": 0.4596826732158661,
"learning_rate": 8.797433298600622e-07,
"loss": 0.0659,
"step": 4945
},
{
"epoch": 0.8704061895551257,
"grad_norm": 0.757762610912323,
"learning_rate": 8.681009274255136e-07,
"loss": 0.0639,
"step": 4950
},
{
"epoch": 0.8712853877263935,
"grad_norm": 0.2202068269252777,
"learning_rate": 8.56532579592334e-07,
"loss": 0.0493,
"step": 4955
},
{
"epoch": 0.8721645858976613,
"grad_norm": 0.655022144317627,
"learning_rate": 8.450383801733642e-07,
"loss": 0.0631,
"step": 4960
},
{
"epoch": 0.8730437840689291,
"grad_norm": 0.5713546872138977,
"learning_rate": 8.336184223801424e-07,
"loss": 0.0592,
"step": 4965
},
{
"epoch": 0.8739229822401969,
"grad_norm": 0.4096655249595642,
"learning_rate": 8.222727988221469e-07,
"loss": 0.0644,
"step": 4970
},
{
"epoch": 0.8748021804114647,
"grad_norm": 0.3128865957260132,
"learning_rate": 8.110016015060484e-07,
"loss": 0.059,
"step": 4975
},
{
"epoch": 0.8756813785827325,
"grad_norm": 0.1819022297859192,
"learning_rate": 7.998049218349624e-07,
"loss": 0.0547,
"step": 4980
},
{
"epoch": 0.8765605767540003,
"grad_norm": 0.2281774878501892,
"learning_rate": 7.886828506077105e-07,
"loss": 0.0584,
"step": 4985
},
{
"epoch": 0.8774397749252681,
"grad_norm": 0.29507550597190857,
"learning_rate": 7.776354780180739e-07,
"loss": 0.0523,
"step": 4990
},
{
"epoch": 0.878318973096536,
"grad_norm": 0.1599227786064148,
"learning_rate": 7.666628936540776e-07,
"loss": 0.0597,
"step": 4995
},
{
"epoch": 0.8791981712678038,
"grad_norm": 0.33402958512306213,
"learning_rate": 7.557651864972504e-07,
"loss": 0.048,
"step": 5000
},
{
"epoch": 0.8800773694390716,
"grad_norm": 0.8994088172912598,
"learning_rate": 7.449424449219144e-07,
"loss": 0.0602,
"step": 5005
},
{
"epoch": 0.8809565676103394,
"grad_norm": 0.392220139503479,
"learning_rate": 7.341947566944563e-07,
"loss": 0.0438,
"step": 5010
},
{
"epoch": 0.8818357657816072,
"grad_norm": 0.3253031373023987,
"learning_rate": 7.23522208972628e-07,
"loss": 0.0568,
"step": 5015
},
{
"epoch": 0.882714963952875,
"grad_norm": 0.41497498750686646,
"learning_rate": 7.129248883048278e-07,
"loss": 0.0453,
"step": 5020
},
{
"epoch": 0.8835941621241428,
"grad_norm": 0.4564589560031891,
"learning_rate": 7.024028806294092e-07,
"loss": 0.0559,
"step": 5025
},
{
"epoch": 0.8844733602954106,
"grad_norm": 0.6449925303459167,
"learning_rate": 6.91956271273978e-07,
"loss": 0.047,
"step": 5030
},
{
"epoch": 0.8853525584666784,
"grad_norm": 0.3050267994403839,
"learning_rate": 6.815851449547029e-07,
"loss": 0.0583,
"step": 5035
},
{
"epoch": 0.8862317566379462,
"grad_norm": 0.13408750295639038,
"learning_rate": 6.712895857756229e-07,
"loss": 0.0434,
"step": 5040
},
{
"epoch": 0.887110954809214,
"grad_norm": 0.2986939549446106,
"learning_rate": 6.610696772279757e-07,
"loss": 0.0594,
"step": 5045
},
{
"epoch": 0.8879901529804818,
"grad_norm": 0.6257616877555847,
"learning_rate": 6.509255021895111e-07,
"loss": 0.0621,
"step": 5050
},
{
"epoch": 0.8888693511517496,
"grad_norm": 0.2986117899417877,
"learning_rate": 6.408571429238253e-07,
"loss": 0.0505,
"step": 5055
},
{
"epoch": 0.8897485493230174,
"grad_norm": 0.15408103168010712,
"learning_rate": 6.308646810796836e-07,
"loss": 0.0534,
"step": 5060
},
{
"epoch": 0.8906277474942852,
"grad_norm": 0.4512818157672882,
"learning_rate": 6.209481976903752e-07,
"loss": 0.0433,
"step": 5065
},
{
"epoch": 0.891506945665553,
"grad_norm": 0.41841113567352295,
"learning_rate": 6.111077731730408e-07,
"loss": 0.0697,
"step": 5070
},
{
"epoch": 0.8923861438368208,
"grad_norm": 0.472064346075058,
"learning_rate": 6.013434873280288e-07,
"loss": 0.058,
"step": 5075
},
{
"epoch": 0.8932653420080886,
"grad_norm": 0.20314988493919373,
"learning_rate": 5.916554193382418e-07,
"loss": 0.0456,
"step": 5080
},
{
"epoch": 0.8941445401793564,
"grad_norm": 0.1879410743713379,
"learning_rate": 5.820436477685021e-07,
"loss": 0.0506,
"step": 5085
},
{
"epoch": 0.8950237383506242,
"grad_norm": 0.4098430573940277,
"learning_rate": 5.72508250564906e-07,
"loss": 0.0528,
"step": 5090
},
{
"epoch": 0.895902936521892,
"grad_norm": 0.5645979642868042,
"learning_rate": 5.63049305054204e-07,
"loss": 0.0689,
"step": 5095
},
{
"epoch": 0.8967821346931598,
"grad_norm": 0.38696274161338806,
"learning_rate": 5.536668879431584e-07,
"loss": 0.0621,
"step": 5100
},
{
"epoch": 0.8976613328644276,
"grad_norm": 0.18984673917293549,
"learning_rate": 5.44361075317934e-07,
"loss": 0.0574,
"step": 5105
},
{
"epoch": 0.8985405310356954,
"grad_norm": 0.1238960400223732,
"learning_rate": 5.35131942643472e-07,
"loss": 0.0522,
"step": 5110
},
{
"epoch": 0.8994197292069632,
"grad_norm": 0.47579723596572876,
"learning_rate": 5.259795647628818e-07,
"loss": 0.0437,
"step": 5115
},
{
"epoch": 0.900298927378231,
"grad_norm": 0.0708310604095459,
"learning_rate": 5.169040158968431e-07,
"loss": 0.057,
"step": 5120
},
{
"epoch": 0.9011781255494988,
"grad_norm": 0.24418748915195465,
"learning_rate": 5.079053696429837e-07,
"loss": 0.054,
"step": 5125
},
{
"epoch": 0.9020573237207666,
"grad_norm": 0.4015823304653168,
"learning_rate": 4.989836989753005e-07,
"loss": 0.0472,
"step": 5130
},
{
"epoch": 0.9029365218920344,
"grad_norm": 0.09180589020252228,
"learning_rate": 4.901390762435588e-07,
"loss": 0.0565,
"step": 5135
},
{
"epoch": 0.9038157200633022,
"grad_norm": 0.23137515783309937,
"learning_rate": 4.813715731727098e-07,
"loss": 0.0594,
"step": 5140
},
{
"epoch": 0.90469491823457,
"grad_norm": 0.42869314551353455,
"learning_rate": 4.726812608623077e-07,
"loss": 0.0578,
"step": 5145
},
{
"epoch": 0.9055741164058378,
"grad_norm": 0.28821223974227905,
"learning_rate": 4.640682097859317e-07,
"loss": 0.0608,
"step": 5150
},
{
"epoch": 0.9064533145771057,
"grad_norm": 0.8055384755134583,
"learning_rate": 4.555324897906133e-07,
"loss": 0.0635,
"step": 5155
},
{
"epoch": 0.9073325127483735,
"grad_norm": 0.12101097404956818,
"learning_rate": 4.470741700962777e-07,
"loss": 0.0559,
"step": 5160
},
{
"epoch": 0.9082117109196413,
"grad_norm": 0.4471381902694702,
"learning_rate": 4.3869331929517144e-07,
"loss": 0.055,
"step": 5165
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.554818332195282,
"learning_rate": 4.303900053513166e-07,
"loss": 0.0565,
"step": 5170
},
{
"epoch": 0.9099701072621769,
"grad_norm": 0.6611088514328003,
"learning_rate": 4.2216429559994945e-07,
"loss": 0.0556,
"step": 5175
},
{
"epoch": 0.9108493054334447,
"grad_norm": 0.1738196760416031,
"learning_rate": 4.1401625674698186e-07,
"loss": 0.0574,
"step": 5180
},
{
"epoch": 0.9117285036047125,
"grad_norm": 0.19940215349197388,
"learning_rate": 4.0594595486845964e-07,
"loss": 0.0598,
"step": 5185
},
{
"epoch": 0.9126077017759803,
"grad_norm": 0.6054111123085022,
"learning_rate": 3.9795345541002395e-07,
"loss": 0.0467,
"step": 5190
},
{
"epoch": 0.9134868999472481,
"grad_norm": 0.5578285455703735,
"learning_rate": 3.9003882318638053e-07,
"loss": 0.0577,
"step": 5195
},
{
"epoch": 0.9143660981185159,
"grad_norm": 0.12331661581993103,
"learning_rate": 3.8220212238077703e-07,
"loss": 0.0632,
"step": 5200
},
{
"epoch": 0.9152452962897837,
"grad_norm": 0.4738437831401825,
"learning_rate": 3.744434165444788e-07,
"loss": 0.0619,
"step": 5205
},
{
"epoch": 0.9161244944610515,
"grad_norm": 0.35755178332328796,
"learning_rate": 3.667627685962605e-07,
"loss": 0.059,
"step": 5210
},
{
"epoch": 0.9170036926323193,
"grad_norm": 0.45652657747268677,
"learning_rate": 3.591602408218842e-07,
"loss": 0.0543,
"step": 5215
},
{
"epoch": 0.9178828908035871,
"grad_norm": 0.2660428583621979,
"learning_rate": 3.516358948736065e-07,
"loss": 0.0526,
"step": 5220
},
{
"epoch": 0.9187620889748549,
"grad_norm": 0.21268558502197266,
"learning_rate": 3.441897917696679e-07,
"loss": 0.0644,
"step": 5225
},
{
"epoch": 0.9196412871461227,
"grad_norm": 0.6168246865272522,
"learning_rate": 3.368219918938076e-07,
"loss": 0.0512,
"step": 5230
},
{
"epoch": 0.9205204853173905,
"grad_norm": 0.5585867762565613,
"learning_rate": 3.29532554994767e-07,
"loss": 0.0612,
"step": 5235
},
{
"epoch": 0.9213996834886583,
"grad_norm": 0.6709286570549011,
"learning_rate": 3.223215401858115e-07,
"loss": 0.047,
"step": 5240
},
{
"epoch": 0.9222788816599261,
"grad_norm": 0.2702469527721405,
"learning_rate": 3.151890059442386e-07,
"loss": 0.0445,
"step": 5245
},
{
"epoch": 0.9231580798311939,
"grad_norm": 0.6882309913635254,
"learning_rate": 3.081350101109215e-07,
"loss": 0.0513,
"step": 5250
},
{
"epoch": 0.9240372780024617,
"grad_norm": 0.3388536274433136,
"learning_rate": 3.0115960988982504e-07,
"loss": 0.0525,
"step": 5255
},
{
"epoch": 0.9249164761737295,
"grad_norm": 0.3622197210788727,
"learning_rate": 2.942628618475507e-07,
"loss": 0.0565,
"step": 5260
},
{
"epoch": 0.9257956743449973,
"grad_norm": 0.519477903842926,
"learning_rate": 2.8744482191287113e-07,
"loss": 0.0605,
"step": 5265
},
{
"epoch": 0.9266748725162651,
"grad_norm": 0.3441823422908783,
"learning_rate": 2.8070554537628413e-07,
"loss": 0.0545,
"step": 5270
},
{
"epoch": 0.9275540706875329,
"grad_norm": 0.2639375329017639,
"learning_rate": 2.7404508688955835e-07,
"loss": 0.0402,
"step": 5275
},
{
"epoch": 0.9284332688588007,
"grad_norm": 0.35200339555740356,
"learning_rate": 2.674635004652926e-07,
"loss": 0.057,
"step": 5280
},
{
"epoch": 0.9293124670300685,
"grad_norm": 0.30402621626853943,
"learning_rate": 2.609608394764751e-07,
"loss": 0.0508,
"step": 5285
},
{
"epoch": 0.9301916652013363,
"grad_norm": 0.6213688254356384,
"learning_rate": 2.5453715665605725e-07,
"loss": 0.0652,
"step": 5290
},
{
"epoch": 0.9310708633726041,
"grad_norm": 0.24661576747894287,
"learning_rate": 2.4819250409651605e-07,
"loss": 0.0398,
"step": 5295
},
{
"epoch": 0.9319500615438719,
"grad_norm": 0.13059848546981812,
"learning_rate": 2.419269332494434e-07,
"loss": 0.0586,
"step": 5300
},
{
"epoch": 0.9328292597151397,
"grad_norm": 0.4985171854496002,
"learning_rate": 2.3574049492511852e-07,
"loss": 0.0649,
"step": 5305
},
{
"epoch": 0.9337084578864076,
"grad_norm": 0.5534043312072754,
"learning_rate": 2.296332392921019e-07,
"loss": 0.0564,
"step": 5310
},
{
"epoch": 0.9345876560576754,
"grad_norm": 0.386444091796875,
"learning_rate": 2.2360521587682316e-07,
"loss": 0.0646,
"step": 5315
},
{
"epoch": 0.9354668542289432,
"grad_norm": 0.22402134537696838,
"learning_rate": 2.176564735631881e-07,
"loss": 0.0614,
"step": 5320
},
{
"epoch": 0.936346052400211,
"grad_norm": 0.6845077872276306,
"learning_rate": 2.1178706059217346e-07,
"loss": 0.0639,
"step": 5325
},
{
"epoch": 0.9372252505714788,
"grad_norm": 0.5488569736480713,
"learning_rate": 2.0599702456144178e-07,
"loss": 0.0518,
"step": 5330
},
{
"epoch": 0.9381044487427466,
"grad_norm": 0.6429914832115173,
"learning_rate": 2.002864124249504e-07,
"loss": 0.0618,
"step": 5335
},
{
"epoch": 0.9389836469140144,
"grad_norm": 0.8213487267494202,
"learning_rate": 1.9465527049257416e-07,
"loss": 0.0623,
"step": 5340
},
{
"epoch": 0.9398628450852822,
"grad_norm": 0.6064088940620422,
"learning_rate": 1.8910364442972896e-07,
"loss": 0.068,
"step": 5345
},
{
"epoch": 0.94074204325655,
"grad_norm": 0.9644356966018677,
"learning_rate": 1.8363157925700316e-07,
"loss": 0.0581,
"step": 5350
},
{
"epoch": 0.9416212414278178,
"grad_norm": 0.7088313102722168,
"learning_rate": 1.78239119349789e-07,
"loss": 0.0469,
"step": 5355
},
{
"epoch": 0.9425004395990856,
"grad_norm": 0.14932291209697723,
"learning_rate": 1.7292630843792292e-07,
"loss": 0.0584,
"step": 5360
},
{
"epoch": 0.9433796377703534,
"grad_norm": 0.18209953606128693,
"learning_rate": 1.6769318960533465e-07,
"loss": 0.0526,
"step": 5365
},
{
"epoch": 0.9442588359416212,
"grad_norm": 0.5487361550331116,
"learning_rate": 1.625398052896965e-07,
"loss": 0.0551,
"step": 5370
},
{
"epoch": 0.945138034112889,
"grad_norm": 0.1913604438304901,
"learning_rate": 1.574661972820779e-07,
"loss": 0.0556,
"step": 5375
},
{
"epoch": 0.9460172322841568,
"grad_norm": 0.44386959075927734,
"learning_rate": 1.5247240672660258e-07,
"loss": 0.0633,
"step": 5380
},
{
"epoch": 0.9468964304554246,
"grad_norm": 0.3759801685810089,
"learning_rate": 1.4755847412012635e-07,
"loss": 0.0557,
"step": 5385
},
{
"epoch": 0.9477756286266924,
"grad_norm": 0.19469892978668213,
"learning_rate": 1.427244393118965e-07,
"loss": 0.0591,
"step": 5390
},
{
"epoch": 0.9486548267979603,
"grad_norm": 0.5927003026008606,
"learning_rate": 1.379703415032374e-07,
"loss": 0.045,
"step": 5395
},
{
"epoch": 0.9495340249692281,
"grad_norm": 0.13490422070026398,
"learning_rate": 1.3329621924722536e-07,
"loss": 0.0437,
"step": 5400
},
{
"epoch": 0.9504132231404959,
"grad_norm": 0.3926790654659271,
"learning_rate": 1.287021104483821e-07,
"loss": 0.053,
"step": 5405
},
{
"epoch": 0.9512924213117637,
"grad_norm": 0.5249255299568176,
"learning_rate": 1.2418805236236287e-07,
"loss": 0.06,
"step": 5410
},
{
"epoch": 0.9521716194830315,
"grad_norm": 1.2005724906921387,
"learning_rate": 1.1975408159566105e-07,
"loss": 0.0479,
"step": 5415
},
{
"epoch": 0.9530508176542993,
"grad_norm": 0.2839159667491913,
"learning_rate": 1.1540023410529844e-07,
"loss": 0.0623,
"step": 5420
},
{
"epoch": 0.9539300158255671,
"grad_norm": 0.5383673906326294,
"learning_rate": 1.1112654519855104e-07,
"loss": 0.0596,
"step": 5425
},
{
"epoch": 0.9548092139968349,
"grad_norm": 0.595245361328125,
"learning_rate": 1.0693304953264705e-07,
"loss": 0.0661,
"step": 5430
},
{
"epoch": 0.9556884121681027,
"grad_norm": 0.5355046391487122,
"learning_rate": 1.0281978111449375e-07,
"loss": 0.0715,
"step": 5435
},
{
"epoch": 0.9565676103393705,
"grad_norm": 0.31870976090431213,
"learning_rate": 9.87867733004011e-08,
"loss": 0.053,
"step": 5440
},
{
"epoch": 0.9574468085106383,
"grad_norm": 0.6525245308876038,
"learning_rate": 9.483405879581187e-08,
"loss": 0.0519,
"step": 5445
},
{
"epoch": 0.9583260066819062,
"grad_norm": 0.6109974384307861,
"learning_rate": 9.096166965502972e-08,
"loss": 0.0583,
"step": 5450
},
{
"epoch": 0.959205204853174,
"grad_norm": 0.22431720793247223,
"learning_rate": 8.71696372809705e-08,
"loss": 0.0486,
"step": 5455
},
{
"epoch": 0.9600844030244418,
"grad_norm": 0.5329923033714294,
"learning_rate": 8.345799242489905e-08,
"loss": 0.0525,
"step": 5460
},
{
"epoch": 0.9609636011957096,
"grad_norm": 0.4853318929672241,
"learning_rate": 7.982676518618059e-08,
"loss": 0.0597,
"step": 5465
},
{
"epoch": 0.9618427993669774,
"grad_norm": 0.3214046061038971,
"learning_rate": 7.627598501204092e-08,
"loss": 0.05,
"step": 5470
},
{
"epoch": 0.9627219975382452,
"grad_norm": 0.18154819309711456,
"learning_rate": 7.28056806973243e-08,
"loss": 0.0484,
"step": 5475
},
{
"epoch": 0.963601195709513,
"grad_norm": 0.14486095309257507,
"learning_rate": 6.941588038426039e-08,
"loss": 0.0586,
"step": 5480
},
{
"epoch": 0.9644803938807808,
"grad_norm": 0.6655594706535339,
"learning_rate": 6.610661156223664e-08,
"loss": 0.0641,
"step": 5485
},
{
"epoch": 0.9653595920520486,
"grad_norm": 0.22004252672195435,
"learning_rate": 6.287790106757396e-08,
"loss": 0.0483,
"step": 5490
},
{
"epoch": 0.9662387902233164,
"grad_norm": 0.20186270773410797,
"learning_rate": 5.972977508331368e-08,
"loss": 0.0528,
"step": 5495
},
{
"epoch": 0.9671179883945842,
"grad_norm": 0.23505160212516785,
"learning_rate": 5.666225913899648e-08,
"loss": 0.0663,
"step": 5500
},
{
"epoch": 0.967997186565852,
"grad_norm": 0.423562616109848,
"learning_rate": 5.367537811046486e-08,
"loss": 0.0516,
"step": 5505
},
{
"epoch": 0.9688763847371198,
"grad_norm": 0.23930394649505615,
"learning_rate": 5.0769156219656614e-08,
"loss": 0.0572,
"step": 5510
},
{
"epoch": 0.9697555829083876,
"grad_norm": 0.1267559826374054,
"learning_rate": 4.7943617034407196e-08,
"loss": 0.0411,
"step": 5515
},
{
"epoch": 0.9706347810796554,
"grad_norm": 0.5073260068893433,
"learning_rate": 4.51987834682599e-08,
"loss": 0.052,
"step": 5520
},
{
"epoch": 0.9715139792509232,
"grad_norm": 0.3135651648044586,
"learning_rate": 4.253467778028486e-08,
"loss": 0.0547,
"step": 5525
},
{
"epoch": 0.972393177422191,
"grad_norm": 0.5157844424247742,
"learning_rate": 3.9951321574890345e-08,
"loss": 0.0463,
"step": 5530
},
{
"epoch": 0.9732723755934588,
"grad_norm": 0.16779236495494843,
"learning_rate": 3.744873580165176e-08,
"loss": 0.059,
"step": 5535
},
{
"epoch": 0.9741515737647266,
"grad_norm": 0.26359498500823975,
"learning_rate": 3.502694075514179e-08,
"loss": 0.0567,
"step": 5540
},
{
"epoch": 0.9750307719359944,
"grad_norm": 0.6399820446968079,
"learning_rate": 3.26859560747661e-08,
"loss": 0.0544,
"step": 5545
},
{
"epoch": 0.9759099701072622,
"grad_norm": 0.49394690990448,
"learning_rate": 3.042580074460344e-08,
"loss": 0.0649,
"step": 5550
},
{
"epoch": 0.97678916827853,
"grad_norm": 0.11140932142734528,
"learning_rate": 2.8246493093250226e-08,
"loss": 0.0572,
"step": 5555
},
{
"epoch": 0.9776683664497978,
"grad_norm": 0.803503155708313,
"learning_rate": 2.6148050793676217e-08,
"loss": 0.0583,
"step": 5560
},
{
"epoch": 0.9785475646210656,
"grad_norm": 0.20883022248744965,
"learning_rate": 2.4130490863075727e-08,
"loss": 0.0631,
"step": 5565
},
{
"epoch": 0.9794267627923334,
"grad_norm": 0.5355504155158997,
"learning_rate": 2.2193829662731093e-08,
"loss": 0.0548,
"step": 5570
},
{
"epoch": 0.9803059609636012,
"grad_norm": 0.5118641257286072,
"learning_rate": 2.033808289788608e-08,
"loss": 0.0551,
"step": 5575
},
{
"epoch": 0.981185159134869,
"grad_norm": 0.4112605154514313,
"learning_rate": 1.856326561760824e-08,
"loss": 0.0567,
"step": 5580
},
{
"epoch": 0.9820643573061368,
"grad_norm": 0.36727771162986755,
"learning_rate": 1.686939221467565e-08,
"loss": 0.0524,
"step": 5585
},
{
"epoch": 0.9829435554774046,
"grad_norm": 0.32696032524108887,
"learning_rate": 1.5256476425455912e-08,
"loss": 0.0546,
"step": 5590
},
{
"epoch": 0.9838227536486724,
"grad_norm": 0.14299306273460388,
"learning_rate": 1.37245313297929e-08,
"loss": 0.0603,
"step": 5595
},
{
"epoch": 0.9847019518199402,
"grad_norm": 0.1304006576538086,
"learning_rate": 1.2273569350909065e-08,
"loss": 0.048,
"step": 5600
},
{
"epoch": 0.985581149991208,
"grad_norm": 0.3255922496318817,
"learning_rate": 1.09036022552933e-08,
"loss": 0.0624,
"step": 5605
},
{
"epoch": 0.9864603481624759,
"grad_norm": 0.14127831161022186,
"learning_rate": 9.614641152615457e-09,
"loss": 0.0537,
"step": 5610
},
{
"epoch": 0.9873395463337437,
"grad_norm": 0.49166056513786316,
"learning_rate": 8.406696495627531e-09,
"loss": 0.0555,
"step": 5615
},
{
"epoch": 0.9882187445050115,
"grad_norm": 0.4225272238254547,
"learning_rate": 7.279778080089284e-09,
"loss": 0.0508,
"step": 5620
},
{
"epoch": 0.9890979426762793,
"grad_norm": 0.43666836619377136,
"learning_rate": 6.233895044677196e-09,
"loss": 0.0495,
"step": 5625
},
{
"epoch": 0.9899771408475471,
"grad_norm": 0.23192152380943298,
"learning_rate": 5.269055870920081e-09,
"loss": 0.0601,
"step": 5630
},
{
"epoch": 0.9908563390188149,
"grad_norm": 0.09601382911205292,
"learning_rate": 4.385268383123586e-09,
"loss": 0.0571,
"step": 5635
},
{
"epoch": 0.9917355371900827,
"grad_norm": 0.22825530171394348,
"learning_rate": 3.5825397483113532e-09,
"loss": 0.0647,
"step": 5640
},
{
"epoch": 0.9926147353613505,
"grad_norm": 0.18555951118469238,
"learning_rate": 2.8608764761639542e-09,
"loss": 0.0468,
"step": 5645
},
{
"epoch": 0.9934939335326183,
"grad_norm": 0.15407155454158783,
"learning_rate": 2.220284418968932e-09,
"loss": 0.0543,
"step": 5650
},
{
"epoch": 0.9943731317038861,
"grad_norm": 0.30971744656562805,
"learning_rate": 1.6607687715675113e-09,
"loss": 0.0788,
"step": 5655
},
{
"epoch": 0.9952523298751539,
"grad_norm": 0.4101414680480957,
"learning_rate": 1.1823340713212894e-09,
"loss": 0.0527,
"step": 5660
},
{
"epoch": 0.9961315280464217,
"grad_norm": 0.29319489002227783,
"learning_rate": 7.849841980667183e-10,
"loss": 0.0472,
"step": 5665
},
{
"epoch": 0.9970107262176895,
"grad_norm": 0.3551650941371918,
"learning_rate": 4.687223740917901e-10,
"loss": 0.0683,
"step": 5670
},
{
"epoch": 0.9978899243889573,
"grad_norm": 0.2584504783153534,
"learning_rate": 2.335511641005095e-10,
"loss": 0.0501,
"step": 5675
},
{
"epoch": 0.9987691225602251,
"grad_norm": 0.5419056415557861,
"learning_rate": 7.947247520179169e-11,
"loss": 0.0518,
"step": 5680
},
{
"epoch": 0.9996483207314929,
"grad_norm": 0.7458012700080872,
"learning_rate": 6.487556887257995e-12,
"loss": 0.0659,
"step": 5685
},
{
"epoch": 1.0,
"step": 5687,
"total_flos": 0.0,
"train_loss": 0.059674585497827885,
"train_runtime": 13149.4885,
"train_samples_per_second": 13.837,
"train_steps_per_second": 0.432
}
],
"logging_steps": 5,
"max_steps": 5687,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}