{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1549,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006455777921239509,
"grad_norm": 3.9650591213827973,
"learning_rate": 1.2903225806451614e-07,
"loss": 1.6036,
"step": 1
},
{
"epoch": 0.0032278889606197547,
"grad_norm": 3.8088274952823418,
"learning_rate": 6.451612903225807e-07,
"loss": 1.6307,
"step": 5
},
{
"epoch": 0.006455777921239509,
"grad_norm": 4.165879562455664,
"learning_rate": 1.2903225806451614e-06,
"loss": 1.6249,
"step": 10
},
{
"epoch": 0.009683666881859263,
"grad_norm": 2.990018728469599,
"learning_rate": 1.935483870967742e-06,
"loss": 1.5588,
"step": 15
},
{
"epoch": 0.012911555842479019,
"grad_norm": 2.7197323548375154,
"learning_rate": 2.580645161290323e-06,
"loss": 1.4927,
"step": 20
},
{
"epoch": 0.016139444803098774,
"grad_norm": 2.249170792509018,
"learning_rate": 3.225806451612903e-06,
"loss": 1.4732,
"step": 25
},
{
"epoch": 0.019367333763718526,
"grad_norm": 2.525470719551368,
"learning_rate": 3.870967741935484e-06,
"loss": 1.4238,
"step": 30
},
{
"epoch": 0.022595222724338282,
"grad_norm": 2.0358718107154865,
"learning_rate": 4.516129032258065e-06,
"loss": 1.3624,
"step": 35
},
{
"epoch": 0.025823111684958037,
"grad_norm": 1.992683800905439,
"learning_rate": 5.161290322580646e-06,
"loss": 1.4032,
"step": 40
},
{
"epoch": 0.029051000645577793,
"grad_norm": 2.013454166084203,
"learning_rate": 5.806451612903226e-06,
"loss": 1.331,
"step": 45
},
{
"epoch": 0.03227888960619755,
"grad_norm": 2.090725865373585,
"learning_rate": 6.451612903225806e-06,
"loss": 1.3562,
"step": 50
},
{
"epoch": 0.035506778566817304,
"grad_norm": 2.05356594398191,
"learning_rate": 7.096774193548388e-06,
"loss": 1.3419,
"step": 55
},
{
"epoch": 0.03873466752743705,
"grad_norm": 2.0581334238835094,
"learning_rate": 7.741935483870968e-06,
"loss": 1.3581,
"step": 60
},
{
"epoch": 0.04196255648805681,
"grad_norm": 1.8723945697068236,
"learning_rate": 8.387096774193549e-06,
"loss": 1.3274,
"step": 65
},
{
"epoch": 0.045190445448676564,
"grad_norm": 1.926329815796449,
"learning_rate": 9.03225806451613e-06,
"loss": 1.3388,
"step": 70
},
{
"epoch": 0.04841833440929632,
"grad_norm": 1.860663046947963,
"learning_rate": 9.67741935483871e-06,
"loss": 1.3248,
"step": 75
},
{
"epoch": 0.051646223369916075,
"grad_norm": 2.003718296473059,
"learning_rate": 1.0322580645161291e-05,
"loss": 1.3043,
"step": 80
},
{
"epoch": 0.05487411233053583,
"grad_norm": 2.067099708161208,
"learning_rate": 1.096774193548387e-05,
"loss": 1.2791,
"step": 85
},
{
"epoch": 0.058102001291155586,
"grad_norm": 1.912623762729298,
"learning_rate": 1.1612903225806453e-05,
"loss": 1.3056,
"step": 90
},
{
"epoch": 0.06132989025177534,
"grad_norm": 1.9330132101669817,
"learning_rate": 1.2258064516129034e-05,
"loss": 1.2744,
"step": 95
},
{
"epoch": 0.0645577792123951,
"grad_norm": 1.9526926931743687,
"learning_rate": 1.2903225806451613e-05,
"loss": 1.3269,
"step": 100
},
{
"epoch": 0.06778566817301485,
"grad_norm": 1.9107443115368858,
"learning_rate": 1.3548387096774194e-05,
"loss": 1.3216,
"step": 105
},
{
"epoch": 0.07101355713363461,
"grad_norm": 2.028122509274084,
"learning_rate": 1.4193548387096776e-05,
"loss": 1.3015,
"step": 110
},
{
"epoch": 0.07424144609425436,
"grad_norm": 1.9705558160434318,
"learning_rate": 1.4838709677419357e-05,
"loss": 1.3778,
"step": 115
},
{
"epoch": 0.0774693350548741,
"grad_norm": 1.9147018582759738,
"learning_rate": 1.5483870967741936e-05,
"loss": 1.2741,
"step": 120
},
{
"epoch": 0.08069722401549387,
"grad_norm": 1.9414619491995477,
"learning_rate": 1.6129032258064517e-05,
"loss": 1.2515,
"step": 125
},
{
"epoch": 0.08392511297611362,
"grad_norm": 1.9507407646646784,
"learning_rate": 1.6774193548387098e-05,
"loss": 1.3155,
"step": 130
},
{
"epoch": 0.08715300193673338,
"grad_norm": 2.079634842680955,
"learning_rate": 1.741935483870968e-05,
"loss": 1.3175,
"step": 135
},
{
"epoch": 0.09038089089735313,
"grad_norm": 1.9123604229377198,
"learning_rate": 1.806451612903226e-05,
"loss": 1.3449,
"step": 140
},
{
"epoch": 0.09360877985797289,
"grad_norm": 1.9246949261057293,
"learning_rate": 1.870967741935484e-05,
"loss": 1.3144,
"step": 145
},
{
"epoch": 0.09683666881859264,
"grad_norm": 1.906386143347907,
"learning_rate": 1.935483870967742e-05,
"loss": 1.378,
"step": 150
},
{
"epoch": 0.1000645577792124,
"grad_norm": 1.8601855072949223,
"learning_rate": 2e-05,
"loss": 1.343,
"step": 155
},
{
"epoch": 0.10329244673983215,
"grad_norm": 1.8098540840198305,
"learning_rate": 1.9999365137586883e-05,
"loss": 1.3381,
"step": 160
},
{
"epoch": 0.1065203357004519,
"grad_norm": 1.8706270922180774,
"learning_rate": 1.9997460630957587e-05,
"loss": 1.3223,
"step": 165
},
{
"epoch": 0.10974822466107166,
"grad_norm": 1.8608950873937071,
"learning_rate": 1.9994286721932043e-05,
"loss": 1.3626,
"step": 170
},
{
"epoch": 0.11297611362169141,
"grad_norm": 1.7900195911673324,
"learning_rate": 1.9989843813509366e-05,
"loss": 1.3378,
"step": 175
},
{
"epoch": 0.11620400258231117,
"grad_norm": 1.9009457846729683,
"learning_rate": 1.998413246981666e-05,
"loss": 1.3092,
"step": 180
},
{
"epoch": 0.11943189154293092,
"grad_norm": 1.817245920497017,
"learning_rate": 1.9977153416037424e-05,
"loss": 1.3178,
"step": 185
},
{
"epoch": 0.12265978050355068,
"grad_norm": 1.9248355064764926,
"learning_rate": 1.9968907538319433e-05,
"loss": 1.3679,
"step": 190
},
{
"epoch": 0.12588766946417043,
"grad_norm": 1.8232558588327803,
"learning_rate": 1.9959395883662257e-05,
"loss": 1.3209,
"step": 195
},
{
"epoch": 0.1291155584247902,
"grad_norm": 1.7891335229988536,
"learning_rate": 1.99486196597843e-05,
"loss": 1.3159,
"step": 200
},
{
"epoch": 0.13234344738540993,
"grad_norm": 1.8906319036499681,
"learning_rate": 1.9936580234969464e-05,
"loss": 1.3602,
"step": 205
},
{
"epoch": 0.1355713363460297,
"grad_norm": 1.7907546683009863,
"learning_rate": 1.9923279137893405e-05,
"loss": 1.3354,
"step": 210
},
{
"epoch": 0.13879922530664945,
"grad_norm": 1.700699011195732,
"learning_rate": 1.990871805742944e-05,
"loss": 1.3204,
"step": 215
},
{
"epoch": 0.14202711426726922,
"grad_norm": 1.6951471445484736,
"learning_rate": 1.9892898842434106e-05,
"loss": 1.3477,
"step": 220
},
{
"epoch": 0.14525500322788895,
"grad_norm": 1.7370980540115004,
"learning_rate": 1.98758235015124e-05,
"loss": 1.2897,
"step": 225
},
{
"epoch": 0.1484828921885087,
"grad_norm": 1.90980565221276,
"learning_rate": 1.9857494202762758e-05,
"loss": 1.3285,
"step": 230
},
{
"epoch": 0.15171078114912848,
"grad_norm": 1.8327559047783981,
"learning_rate": 1.983791327350174e-05,
"loss": 1.3393,
"step": 235
},
{
"epoch": 0.1549386701097482,
"grad_norm": 1.7088673595234396,
"learning_rate": 1.9817083199968552e-05,
"loss": 1.3366,
"step": 240
},
{
"epoch": 0.15816655907036797,
"grad_norm": 1.663460603889217,
"learning_rate": 1.979500662700934e-05,
"loss": 1.3397,
"step": 245
},
{
"epoch": 0.16139444803098774,
"grad_norm": 1.8442180798418375,
"learning_rate": 1.977168635774138e-05,
"loss": 1.3195,
"step": 250
},
{
"epoch": 0.1646223369916075,
"grad_norm": 1.643074105204939,
"learning_rate": 1.974712535319716e-05,
"loss": 1.3721,
"step": 255
},
{
"epoch": 0.16785022595222723,
"grad_norm": 1.6902951726350381,
"learning_rate": 1.97213267319484e-05,
"loss": 1.3117,
"step": 260
},
{
"epoch": 0.171078114912847,
"grad_norm": 1.7911120795907636,
"learning_rate": 1.969429376971009e-05,
"loss": 1.3133,
"step": 265
},
{
"epoch": 0.17430600387346676,
"grad_norm": 1.7123588730228843,
"learning_rate": 1.9666029898924558e-05,
"loss": 1.3363,
"step": 270
},
{
"epoch": 0.17753389283408652,
"grad_norm": 1.729255435009809,
"learning_rate": 1.9636538708325642e-05,
"loss": 1.2907,
"step": 275
},
{
"epoch": 0.18076178179470626,
"grad_norm": 1.7319264824190221,
"learning_rate": 1.9605823942483034e-05,
"loss": 1.3096,
"step": 280
},
{
"epoch": 0.18398967075532602,
"grad_norm": 1.741144359063169,
"learning_rate": 1.9573889501326803e-05,
"loss": 1.3652,
"step": 285
},
{
"epoch": 0.18721755971594578,
"grad_norm": 1.7151312053405912,
"learning_rate": 1.9540739439652222e-05,
"loss": 1.3049,
"step": 290
},
{
"epoch": 0.19044544867656552,
"grad_norm": 1.683212569724672,
"learning_rate": 1.9506377966604923e-05,
"loss": 1.3282,
"step": 295
},
{
"epoch": 0.19367333763718528,
"grad_norm": 1.7808644734695298,
"learning_rate": 1.9470809445146447e-05,
"loss": 1.3231,
"step": 300
},
{
"epoch": 0.19690122659780504,
"grad_norm": 1.7655574417528708,
"learning_rate": 1.9434038391500266e-05,
"loss": 1.3086,
"step": 305
},
{
"epoch": 0.2001291155584248,
"grad_norm": 1.832682966934851,
"learning_rate": 1.9396069474578348e-05,
"loss": 1.2703,
"step": 310
},
{
"epoch": 0.20335700451904454,
"grad_norm": 1.7313343256591136,
"learning_rate": 1.935690751538834e-05,
"loss": 1.3656,
"step": 315
},
{
"epoch": 0.2065848934796643,
"grad_norm": 1.6875459508805728,
"learning_rate": 1.9316557486421423e-05,
"loss": 1.3134,
"step": 320
},
{
"epoch": 0.20981278244028406,
"grad_norm": 1.8211446122910608,
"learning_rate": 1.927502451102095e-05,
"loss": 1.285,
"step": 325
},
{
"epoch": 0.2130406714009038,
"grad_norm": 1.7175166450700674,
"learning_rate": 1.9232313862731915e-05,
"loss": 1.2818,
"step": 330
},
{
"epoch": 0.21626856036152356,
"grad_norm": 1.7073399712647515,
"learning_rate": 1.918843096463137e-05,
"loss": 1.2797,
"step": 335
},
{
"epoch": 0.21949644932214332,
"grad_norm": 1.6282483524863707,
"learning_rate": 1.914338138863983e-05,
"loss": 1.2858,
"step": 340
},
{
"epoch": 0.22272433828276308,
"grad_norm": 1.6109035926257675,
"learning_rate": 1.9097170854813797e-05,
"loss": 1.3073,
"step": 345
},
{
"epoch": 0.22595222724338282,
"grad_norm": 1.6959193091313398,
"learning_rate": 1.904980523061948e-05,
"loss": 1.2859,
"step": 350
},
{
"epoch": 0.22918011620400258,
"grad_norm": 1.672830416845886,
"learning_rate": 1.9001290530187768e-05,
"loss": 1.3208,
"step": 355
},
{
"epoch": 0.23240800516462234,
"grad_norm": 1.6811479547049457,
"learning_rate": 1.8951632913550625e-05,
"loss": 1.3022,
"step": 360
},
{
"epoch": 0.23563589412524208,
"grad_norm": 1.7021754751138019,
"learning_rate": 1.8900838685858913e-05,
"loss": 1.3064,
"step": 365
},
{
"epoch": 0.23886378308586184,
"grad_norm": 1.648769003483825,
"learning_rate": 1.884891429658182e-05,
"loss": 1.3075,
"step": 370
},
{
"epoch": 0.2420916720464816,
"grad_norm": 1.6177276058253132,
"learning_rate": 1.8795866338687968e-05,
"loss": 1.3385,
"step": 375
},
{
"epoch": 0.24531956100710137,
"grad_norm": 1.7513420547011154,
"learning_rate": 1.8741701547808272e-05,
"loss": 1.2579,
"step": 380
},
{
"epoch": 0.2485474499677211,
"grad_norm": 1.7655791699049692,
"learning_rate": 1.868642680138069e-05,
"loss": 1.3426,
"step": 385
},
{
"epoch": 0.25177533892834086,
"grad_norm": 1.707720917863569,
"learning_rate": 1.863004911777701e-05,
"loss": 1.2992,
"step": 390
},
{
"epoch": 0.2550032278889606,
"grad_norm": 1.6805474301888417,
"learning_rate": 1.8572575655411683e-05,
"loss": 1.2762,
"step": 395
},
{
"epoch": 0.2582311168495804,
"grad_norm": 1.8396750556049477,
"learning_rate": 1.851401371183291e-05,
"loss": 1.324,
"step": 400
},
{
"epoch": 0.26145900581020015,
"grad_norm": 1.608188327745593,
"learning_rate": 1.8454370722796052e-05,
"loss": 1.2912,
"step": 405
},
{
"epoch": 0.26468689477081986,
"grad_norm": 1.740643541629945,
"learning_rate": 1.8393654261319504e-05,
"loss": 1.3491,
"step": 410
},
{
"epoch": 0.2679147837314396,
"grad_norm": 1.6745914246557183,
"learning_rate": 1.8331872036723103e-05,
"loss": 1.2991,
"step": 415
},
{
"epoch": 0.2711426726920594,
"grad_norm": 1.7639108707152669,
"learning_rate": 1.8269031893649306e-05,
"loss": 1.2739,
"step": 420
},
{
"epoch": 0.27437056165267915,
"grad_norm": 1.6200810138499548,
"learning_rate": 1.8205141811067073e-05,
"loss": 1.3004,
"step": 425
},
{
"epoch": 0.2775984506132989,
"grad_norm": 1.5828067553615806,
"learning_rate": 1.814020990125881e-05,
"loss": 1.2567,
"step": 430
},
{
"epoch": 0.28082633957391867,
"grad_norm": 1.5814677474743155,
"learning_rate": 1.807424440879031e-05,
"loss": 1.3042,
"step": 435
},
{
"epoch": 0.28405422853453843,
"grad_norm": 1.6470517664415398,
"learning_rate": 1.8007253709463915e-05,
"loss": 1.2653,
"step": 440
},
{
"epoch": 0.28728211749515814,
"grad_norm": 1.6071009098035933,
"learning_rate": 1.7939246309255028e-05,
"loss": 1.2751,
"step": 445
},
{
"epoch": 0.2905100064557779,
"grad_norm": 1.8030936098227344,
"learning_rate": 1.78702308432321e-05,
"loss": 1.3339,
"step": 450
},
{
"epoch": 0.29373789541639767,
"grad_norm": 1.595362038534452,
"learning_rate": 1.7800216074460183e-05,
"loss": 1.3132,
"step": 455
},
{
"epoch": 0.2969657843770174,
"grad_norm": 1.684190508104507,
"learning_rate": 1.772921089288829e-05,
"loss": 1.291,
"step": 460
},
{
"epoch": 0.3001936733376372,
"grad_norm": 1.7750127106851745,
"learning_rate": 1.7657224314220604e-05,
"loss": 1.3048,
"step": 465
},
{
"epoch": 0.30342156229825695,
"grad_norm": 1.7084489526934932,
"learning_rate": 1.7584265478771737e-05,
"loss": 1.2713,
"step": 470
},
{
"epoch": 0.3066494512588767,
"grad_norm": 1.6912636819694478,
"learning_rate": 1.7510343650306155e-05,
"loss": 1.3093,
"step": 475
},
{
"epoch": 0.3098773402194964,
"grad_norm": 1.7094409772505903,
"learning_rate": 1.7435468214861933e-05,
"loss": 1.2405,
"step": 480
},
{
"epoch": 0.3131052291801162,
"grad_norm": 1.717986074001015,
"learning_rate": 1.7359648679559006e-05,
"loss": 1.2455,
"step": 485
},
{
"epoch": 0.31633311814073595,
"grad_norm": 1.6087377811266534,
"learning_rate": 1.7282894671391996e-05,
"loss": 1.3034,
"step": 490
},
{
"epoch": 0.3195610071013557,
"grad_norm": 1.582277729471747,
"learning_rate": 1.720521593600787e-05,
"loss": 1.329,
"step": 495
},
{
"epoch": 0.32278889606197547,
"grad_norm": 1.6529121399219846,
"learning_rate": 1.7126622336468514e-05,
"loss": 1.2535,
"step": 500
},
{
"epoch": 0.32601678502259523,
"grad_norm": 1.6248137205027384,
"learning_rate": 1.7047123851998374e-05,
"loss": 1.2898,
"step": 505
},
{
"epoch": 0.329244673983215,
"grad_norm": 1.6149646818990504,
"learning_rate": 1.6966730576717388e-05,
"loss": 1.2932,
"step": 510
},
{
"epoch": 0.33247256294383476,
"grad_norm": 1.5229634789864221,
"learning_rate": 1.6885452718359306e-05,
"loss": 1.2456,
"step": 515
},
{
"epoch": 0.33570045190445447,
"grad_norm": 1.5590252870902126,
"learning_rate": 1.6803300596975586e-05,
"loss": 1.2706,
"step": 520
},
{
"epoch": 0.33892834086507423,
"grad_norm": 1.7252113088576104,
"learning_rate": 1.6720284643625035e-05,
"loss": 1.2845,
"step": 525
},
{
"epoch": 0.342156229825694,
"grad_norm": 1.7238035921002546,
"learning_rate": 1.6636415399049347e-05,
"loss": 1.2395,
"step": 530
},
{
"epoch": 0.34538411878631375,
"grad_norm": 1.7671097644574092,
"learning_rate": 1.6551703512334716e-05,
"loss": 1.2962,
"step": 535
},
{
"epoch": 0.3486120077469335,
"grad_norm": 1.7029022963737455,
"learning_rate": 1.6466159739559712e-05,
"loss": 1.2807,
"step": 540
},
{
"epoch": 0.3518398967075533,
"grad_norm": 1.675502013934829,
"learning_rate": 1.6379794942429534e-05,
"loss": 1.2592,
"step": 545
},
{
"epoch": 0.35506778566817304,
"grad_norm": 1.7914748074744888,
"learning_rate": 1.629262008689689e-05,
"loss": 1.2902,
"step": 550
},
{
"epoch": 0.35829567462879275,
"grad_norm": 1.682038467478413,
"learning_rate": 1.62046462417696e-05,
"loss": 1.265,
"step": 555
},
{
"epoch": 0.3615235635894125,
"grad_norm": 1.636775201747519,
"learning_rate": 1.611588457730519e-05,
"loss": 1.2939,
"step": 560
},
{
"epoch": 0.3647514525500323,
"grad_norm": 1.6464312559612535,
"learning_rate": 1.6026346363792565e-05,
"loss": 1.2933,
"step": 565
},
{
"epoch": 0.36797934151065204,
"grad_norm": 1.6581603778474074,
"learning_rate": 1.5936042970120976e-05,
"loss": 1.2823,
"step": 570
},
{
"epoch": 0.3712072304712718,
"grad_norm": 1.6321585067614632,
"learning_rate": 1.5844985862336516e-05,
"loss": 1.2793,
"step": 575
},
{
"epoch": 0.37443511943189156,
"grad_norm": 1.5366946270427047,
"learning_rate": 1.5753186602186207e-05,
"loss": 1.257,
"step": 580
},
{
"epoch": 0.3776630083925113,
"grad_norm": 1.604449837575112,
"learning_rate": 1.5660656845650027e-05,
"loss": 1.2136,
"step": 585
},
{
"epoch": 0.38089089735313103,
"grad_norm": 1.6084609331330948,
"learning_rate": 1.556740834146087e-05,
"loss": 1.3093,
"step": 590
},
{
"epoch": 0.3841187863137508,
"grad_norm": 1.7866690447019067,
"learning_rate": 1.547345292961282e-05,
"loss": 1.2652,
"step": 595
},
{
"epoch": 0.38734667527437056,
"grad_norm": 1.6397669249338218,
"learning_rate": 1.5378802539857775e-05,
"loss": 1.3328,
"step": 600
},
{
"epoch": 0.3905745642349903,
"grad_norm": 1.5711856318785005,
"learning_rate": 1.52834691901907e-05,
"loss": 1.268,
"step": 605
},
{
"epoch": 0.3938024531956101,
"grad_norm": 1.7458250460751268,
"learning_rate": 1.5187464985323681e-05,
"loss": 1.314,
"step": 610
},
{
"epoch": 0.39703034215622984,
"grad_norm": 1.6297232609827654,
"learning_rate": 1.5090802115148956e-05,
"loss": 1.2891,
"step": 615
},
{
"epoch": 0.4002582311168496,
"grad_norm": 1.618673004010897,
"learning_rate": 1.4993492853191118e-05,
"loss": 1.2679,
"step": 620
},
{
"epoch": 0.4034861200774693,
"grad_norm": 1.5618198023723413,
"learning_rate": 1.4895549555048751e-05,
"loss": 1.2596,
"step": 625
},
{
"epoch": 0.4067140090380891,
"grad_norm": 1.6384560883294799,
"learning_rate": 1.4796984656825572e-05,
"loss": 1.2735,
"step": 630
},
{
"epoch": 0.40994189799870884,
"grad_norm": 1.6127858175302265,
"learning_rate": 1.4697810673551408e-05,
"loss": 1.3048,
"step": 635
},
{
"epoch": 0.4131697869593286,
"grad_norm": 1.648004507456454,
"learning_rate": 1.4598040197593128e-05,
"loss": 1.326,
"step": 640
},
{
"epoch": 0.41639767591994836,
"grad_norm": 1.5828368380983484,
"learning_rate": 1.4497685897055758e-05,
"loss": 1.2148,
"step": 645
},
{
"epoch": 0.4196255648805681,
"grad_norm": 1.5711258927882057,
"learning_rate": 1.4396760514173976e-05,
"loss": 1.2806,
"step": 650
},
{
"epoch": 0.4228534538411879,
"grad_norm": 1.7260560292823433,
"learning_rate": 1.4295276863694205e-05,
"loss": 1.2374,
"step": 655
},
{
"epoch": 0.4260813428018076,
"grad_norm": 1.5876415474050052,
"learning_rate": 1.4193247831247499e-05,
"loss": 1.3225,
"step": 660
},
{
"epoch": 0.42930923176242736,
"grad_norm": 1.6505847676867402,
"learning_rate": 1.4090686371713403e-05,
"loss": 1.2948,
"step": 665
},
{
"epoch": 0.4325371207230471,
"grad_norm": 1.526921850992316,
"learning_rate": 1.3987605507575053e-05,
"loss": 1.262,
"step": 670
},
{
"epoch": 0.4357650096836669,
"grad_norm": 1.6535400875687096,
"learning_rate": 1.3884018327265683e-05,
"loss": 1.3213,
"step": 675
},
{
"epoch": 0.43899289864428664,
"grad_norm": 1.5791268753862373,
"learning_rate": 1.3779937983506746e-05,
"loss": 1.2948,
"step": 680
},
{
"epoch": 0.4422207876049064,
"grad_norm": 1.6917933114261559,
"learning_rate": 1.3675377691637879e-05,
"loss": 1.3094,
"step": 685
},
{
"epoch": 0.44544867656552617,
"grad_norm": 1.7180616960296269,
"learning_rate": 1.3570350727938925e-05,
"loss": 1.2872,
"step": 690
},
{
"epoch": 0.4486765655261459,
"grad_norm": 1.6479976846946962,
"learning_rate": 1.3464870427944208e-05,
"loss": 1.2492,
"step": 695
},
{
"epoch": 0.45190445448676564,
"grad_norm": 1.6294418357249567,
"learning_rate": 1.3358950184749284e-05,
"loss": 1.2867,
"step": 700
},
{
"epoch": 0.4551323434473854,
"grad_norm": 1.5878399440904527,
"learning_rate": 1.3252603447310396e-05,
"loss": 1.2246,
"step": 705
},
{
"epoch": 0.45836023240800516,
"grad_norm": 1.679514731044212,
"learning_rate": 1.3145843718736809e-05,
"loss": 1.2988,
"step": 710
},
{
"epoch": 0.4615881213686249,
"grad_norm": 1.7316167126803805,
"learning_rate": 1.3038684554576308e-05,
"loss": 1.2834,
"step": 715
},
{
"epoch": 0.4648160103292447,
"grad_norm": 1.6144875544271897,
"learning_rate": 1.2931139561094007e-05,
"loss": 1.2357,
"step": 720
},
{
"epoch": 0.46804389928986445,
"grad_norm": 1.6469776474272737,
"learning_rate": 1.2823222393544717e-05,
"loss": 1.2407,
"step": 725
},
{
"epoch": 0.47127178825048416,
"grad_norm": 1.517601534581415,
"learning_rate": 1.2714946754439117e-05,
"loss": 1.2584,
"step": 730
},
{
"epoch": 0.4744996772111039,
"grad_norm": 1.6683118073994527,
"learning_rate": 1.2606326391803915e-05,
"loss": 1.2449,
"step": 735
},
{
"epoch": 0.4777275661717237,
"grad_norm": 1.5690047869024784,
"learning_rate": 1.249737509743622e-05,
"loss": 1.2716,
"step": 740
},
{
"epoch": 0.48095545513234345,
"grad_norm": 1.7097192789221543,
"learning_rate": 1.2388106705152361e-05,
"loss": 1.2535,
"step": 745
},
{
"epoch": 0.4841833440929632,
"grad_norm": 1.57392054276293,
"learning_rate": 1.2278535089031377e-05,
"loss": 1.2778,
"step": 750
},
{
"epoch": 0.48741123305358297,
"grad_norm": 1.5836852821682212,
"learning_rate": 1.2168674161653395e-05,
"loss": 1.2386,
"step": 755
},
{
"epoch": 0.49063912201420273,
"grad_norm": 1.714465868583922,
"learning_rate": 1.2058537872333104e-05,
"loss": 1.2193,
"step": 760
},
{
"epoch": 0.49386701097482244,
"grad_norm": 1.4951779206694975,
"learning_rate": 1.1948140205348592e-05,
"loss": 1.2317,
"step": 765
},
{
"epoch": 0.4970948999354422,
"grad_norm": 1.666029933988235,
"learning_rate": 1.1837495178165706e-05,
"loss": 1.2316,
"step": 770
},
{
"epoch": 0.500322788896062,
"grad_norm": 1.5701604478724709,
"learning_rate": 1.1726616839658237e-05,
"loss": 1.259,
"step": 775
},
{
"epoch": 0.5035506778566817,
"grad_norm": 1.6465072436129509,
"learning_rate": 1.1615519268324101e-05,
"loss": 1.1986,
"step": 780
},
{
"epoch": 0.5067785668173015,
"grad_norm": 1.7270365055532542,
"learning_rate": 1.1504216570497737e-05,
"loss": 1.2149,
"step": 785
},
{
"epoch": 0.5100064557779213,
"grad_norm": 1.595916061722449,
"learning_rate": 1.1392722878559012e-05,
"loss": 1.2763,
"step": 790
},
{
"epoch": 0.513234344738541,
"grad_norm": 1.6329464108256435,
"learning_rate": 1.1281052349138793e-05,
"loss": 1.2441,
"step": 795
},
{
"epoch": 0.5164622336991608,
"grad_norm": 1.6630997767795845,
"learning_rate": 1.116921916132143e-05,
"loss": 1.2315,
"step": 800
},
{
"epoch": 0.5196901226597805,
"grad_norm": 1.6726472503428615,
"learning_rate": 1.1057237514844423e-05,
"loss": 1.289,
"step": 805
},
{
"epoch": 0.5229180116204003,
"grad_norm": 1.5376847707418333,
"learning_rate": 1.0945121628295437e-05,
"loss": 1.2573,
"step": 810
},
{
"epoch": 0.5261459005810201,
"grad_norm": 2.2493075416466146,
"learning_rate": 1.0832885737306922e-05,
"loss": 1.2879,
"step": 815
},
{
"epoch": 0.5293737895416397,
"grad_norm": 1.7048259030945168,
"learning_rate": 1.0720544092748599e-05,
"loss": 1.2093,
"step": 820
},
{
"epoch": 0.5326016785022595,
"grad_norm": 1.6052643185830913,
"learning_rate": 1.0608110958917982e-05,
"loss": 1.2787,
"step": 825
},
{
"epoch": 0.5358295674628792,
"grad_norm": 1.5286978908553153,
"learning_rate": 1.04956006117292e-05,
"loss": 1.2354,
"step": 830
},
{
"epoch": 0.539057456423499,
"grad_norm": 1.544930091476391,
"learning_rate": 1.0383027336900356e-05,
"loss": 1.2293,
"step": 835
},
{
"epoch": 0.5422853453841188,
"grad_norm": 1.5385671678921857,
"learning_rate": 1.0270405428139633e-05,
"loss": 1.2994,
"step": 840
},
{
"epoch": 0.5455132343447385,
"grad_norm": 1.5752108128614972,
"learning_rate": 1.0157749185330384e-05,
"loss": 1.2836,
"step": 845
},
{
"epoch": 0.5487411233053583,
"grad_norm": 1.5927488211766814,
"learning_rate": 1.0045072912715443e-05,
"loss": 1.2659,
"step": 850
},
{
"epoch": 0.551969012265978,
"grad_norm": 1.592292517508467,
"learning_rate": 9.932390917080874e-06,
"loss": 1.2447,
"step": 855
},
{
"epoch": 0.5551969012265978,
"grad_norm": 1.641667371840128,
"learning_rate": 9.81971750593941e-06,
"loss": 1.2588,
"step": 860
},
{
"epoch": 0.5584247901872176,
"grad_norm": 1.624109747232424,
"learning_rate": 9.707066985713795e-06,
"loss": 1.234,
"step": 865
},
{
"epoch": 0.5616526791478373,
"grad_norm": 1.5276862360143446,
"learning_rate": 9.594453659920241e-06,
"loss": 1.2018,
"step": 870
},
{
"epoch": 0.5648805681084571,
"grad_norm": 1.524996526315475,
"learning_rate": 9.48189182735231e-06,
"loss": 1.226,
"step": 875
},
{
"epoch": 0.5681084570690769,
"grad_norm": 1.593943477253438,
"learning_rate": 9.369395780265323e-06,
"loss": 1.2354,
"step": 880
},
{
"epoch": 0.5713363460296966,
"grad_norm": 1.6682262226100923,
"learning_rate": 9.256979802561675e-06,
"loss": 1.2574,
"step": 885
},
{
"epoch": 0.5745642349903163,
"grad_norm": 1.6115401523560124,
"learning_rate": 9.144658167977134e-06,
"loss": 1.2554,
"step": 890
},
{
"epoch": 0.577792123950936,
"grad_norm": 1.6664705944442975,
"learning_rate": 9.032445138268493e-06,
"loss": 1.2277,
"step": 895
},
{
"epoch": 0.5810200129115558,
"grad_norm": 1.6141614694279862,
"learning_rate": 8.920354961402723e-06,
"loss": 1.2327,
"step": 900
},
{
"epoch": 0.5842479018721756,
"grad_norm": 1.7041618253649367,
"learning_rate": 8.808401869747858e-06,
"loss": 1.2736,
"step": 905
},
{
"epoch": 0.5874757908327953,
"grad_norm": 1.5532217130112593,
"learning_rate": 8.696600078265876e-06,
"loss": 1.2057,
"step": 910
},
{
"epoch": 0.5907036797934151,
"grad_norm": 1.5645080700847085,
"learning_rate": 8.584963782707812e-06,
"loss": 1.2062,
"step": 915
},
{
"epoch": 0.5939315687540349,
"grad_norm": 1.539207505181734,
"learning_rate": 8.473507157811254e-06,
"loss": 1.2006,
"step": 920
},
{
"epoch": 0.5971594577146546,
"grad_norm": 1.6045881124979031,
"learning_rate": 8.362244355500583e-06,
"loss": 1.2519,
"step": 925
},
{
"epoch": 0.6003873466752744,
"grad_norm": 1.5717390602748484,
"learning_rate": 8.251189503090023e-06,
"loss": 1.2507,
"step": 930
},
{
"epoch": 0.6036152356358941,
"grad_norm": 1.6041786748836233,
"learning_rate": 8.140356701489892e-06,
"loss": 1.1995,
"step": 935
},
{
"epoch": 0.6068431245965139,
"grad_norm": 1.544159227072635,
"learning_rate": 8.029760023416168e-06,
"loss": 1.2159,
"step": 940
},
{
"epoch": 0.6100710135571337,
"grad_norm": 1.5674929554867256,
"learning_rate": 7.919413511603636e-06,
"loss": 1.2855,
"step": 945
},
{
"epoch": 0.6132989025177534,
"grad_norm": 1.6041034106028236,
"learning_rate": 7.809331177022847e-06,
"loss": 1.2285,
"step": 950
},
{
"epoch": 0.6165267914783732,
"grad_norm": 1.652846244359187,
"learning_rate": 7.699526997101121e-06,
"loss": 1.249,
"step": 955
},
{
"epoch": 0.6197546804389928,
"grad_norm": 1.6651647494538366,
"learning_rate": 7.590014913947778e-06,
"loss": 1.2591,
"step": 960
},
{
"epoch": 0.6229825693996126,
"grad_norm": 1.590193753660995,
"learning_rate": 7.4808088325839e-06,
"loss": 1.2256,
"step": 965
},
{
"epoch": 0.6262104583602324,
"grad_norm": 1.6722268348838438,
"learning_rate": 7.3719226191767526e-06,
"loss": 1.2612,
"step": 970
},
{
"epoch": 0.6294383473208521,
"grad_norm": 1.563385218810824,
"learning_rate": 7.263370099279173e-06,
"loss": 1.2219,
"step": 975
},
{
"epoch": 0.6326662362814719,
"grad_norm": 1.538051823490198,
"learning_rate": 7.155165056074111e-06,
"loss": 1.2142,
"step": 980
},
{
"epoch": 0.6358941252420917,
"grad_norm": 1.6083527642442943,
"learning_rate": 7.0473212286245316e-06,
"loss": 1.2812,
"step": 985
},
{
"epoch": 0.6391220142027114,
"grad_norm": 1.6678078232565665,
"learning_rate": 6.9398523101289475e-06,
"loss": 1.2625,
"step": 990
},
{
"epoch": 0.6423499031633312,
"grad_norm": 1.540993663270435,
"learning_rate": 6.832771946182741e-06,
"loss": 1.2522,
"step": 995
},
{
"epoch": 0.6455777921239509,
"grad_norm": 1.5968723128811868,
"learning_rate": 6.72609373304556e-06,
"loss": 1.1989,
"step": 1000
},
{
"epoch": 0.6488056810845707,
"grad_norm": 1.588868645829306,
"learning_rate": 6.619831215914974e-06,
"loss": 1.2157,
"step": 1005
},
{
"epoch": 0.6520335700451905,
"grad_norm": 1.6524103231111353,
"learning_rate": 6.5139978872065865e-06,
"loss": 1.2082,
"step": 1010
},
{
"epoch": 0.6552614590058102,
"grad_norm": 1.6245516939163356,
"learning_rate": 6.408607184840897e-06,
"loss": 1.2593,
"step": 1015
},
{
"epoch": 0.65848934796643,
"grad_norm": 1.5570333034084014,
"learning_rate": 6.303672490537022e-06,
"loss": 1.2099,
"step": 1020
},
{
"epoch": 0.6617172369270498,
"grad_norm": 1.5832778345200962,
"learning_rate": 6.199207128113614e-06,
"loss": 1.2631,
"step": 1025
},
{
"epoch": 0.6649451258876695,
"grad_norm": 1.5815865015835413,
"learning_rate": 6.095224361797087e-06,
"loss": 1.2945,
"step": 1030
},
{
"epoch": 0.6681730148482892,
"grad_norm": 1.6366177470747854,
"learning_rate": 5.991737394537434e-06,
"loss": 1.2463,
"step": 1035
},
{
"epoch": 0.6714009038089089,
"grad_norm": 1.6926432200133767,
"learning_rate": 5.888759366331798e-06,
"loss": 1.2356,
"step": 1040
},
{
"epoch": 0.6746287927695287,
"grad_norm": 1.62278769384498,
"learning_rate": 5.786303352556088e-06,
"loss": 1.1839,
"step": 1045
},
{
"epoch": 0.6778566817301485,
"grad_norm": 1.6754788350541943,
"learning_rate": 5.684382362304723e-06,
"loss": 1.2289,
"step": 1050
},
{
"epoch": 0.6810845706907682,
"grad_norm": 1.6388911381430855,
"learning_rate": 5.583009336738874e-06,
"loss": 1.2548,
"step": 1055
},
{
"epoch": 0.684312459651388,
"grad_norm": 1.6495374955760909,
"learning_rate": 5.48219714744326e-06,
"loss": 1.2328,
"step": 1060
},
{
"epoch": 0.6875403486120077,
"grad_norm": 1.6872246491519647,
"learning_rate": 5.381958594791843e-06,
"loss": 1.2096,
"step": 1065
},
{
"epoch": 0.6907682375726275,
"grad_norm": 1.5950519328952013,
"learning_rate": 5.282306406322502e-06,
"loss": 1.2502,
"step": 1070
},
{
"epoch": 0.6939961265332473,
"grad_norm": 1.6677034871070087,
"learning_rate": 5.1832532351210095e-06,
"loss": 1.2355,
"step": 1075
},
{
"epoch": 0.697224015493867,
"grad_norm": 1.6586625169970663,
"learning_rate": 5.084811658214421e-06,
"loss": 1.2264,
"step": 1080
},
{
"epoch": 0.7004519044544868,
"grad_norm": 1.6873770773693175,
"learning_rate": 4.986994174974155e-06,
"loss": 1.2508,
"step": 1085
},
{
"epoch": 0.7036797934151066,
"grad_norm": 1.5894542742895856,
"learning_rate": 4.889813205528895e-06,
"loss": 1.1727,
"step": 1090
},
{
"epoch": 0.7069076823757263,
"grad_norm": 1.6562699720711116,
"learning_rate": 4.793281089187603e-06,
"loss": 1.1841,
"step": 1095
},
{
"epoch": 0.7101355713363461,
"grad_norm": 1.5766425378140871,
"learning_rate": 4.697410082872741e-06,
"loss": 1.2142,
"step": 1100
},
{
"epoch": 0.7133634602969657,
"grad_norm": 1.6122901002424155,
"learning_rate": 4.602212359563988e-06,
"loss": 1.2304,
"step": 1105
},
{
"epoch": 0.7165913492575855,
"grad_norm": 1.8055415704300306,
"learning_rate": 4.50770000675262e-06,
"loss": 1.2243,
"step": 1110
},
{
"epoch": 0.7198192382182053,
"grad_norm": 1.705388366214554,
"learning_rate": 4.413885024906705e-06,
"loss": 1.2252,
"step": 1115
},
{
"epoch": 0.723047127178825,
"grad_norm": 1.690011241853718,
"learning_rate": 4.320779325947402e-06,
"loss": 1.1987,
"step": 1120
},
{
"epoch": 0.7262750161394448,
"grad_norm": 1.701641933857813,
"learning_rate": 4.228394731736451e-06,
"loss": 1.257,
"step": 1125
},
{
"epoch": 0.7295029051000645,
"grad_norm": 1.523477374915637,
"learning_rate": 4.136742972575132e-06,
"loss": 1.2409,
"step": 1130
},
{
"epoch": 0.7327307940606843,
"grad_norm": 1.6070065262090218,
"learning_rate": 4.045835685714848e-06,
"loss": 1.1725,
"step": 1135
},
{
"epoch": 0.7359586830213041,
"grad_norm": 1.6081728290175998,
"learning_rate": 3.955684413879499e-06,
"loss": 1.22,
"step": 1140
},
{
"epoch": 0.7391865719819238,
"grad_norm": 1.582549702759197,
"learning_rate": 3.866300603799876e-06,
"loss": 1.2019,
"step": 1145
},
{
"epoch": 0.7424144609425436,
"grad_norm": 1.7141546881146241,
"learning_rate": 3.77769560476026e-06,
"loss": 1.2075,
"step": 1150
},
{
"epoch": 0.7456423499031634,
"grad_norm": 1.6644378155563186,
"learning_rate": 3.689880667157344e-06,
"loss": 1.2735,
"step": 1155
},
{
"epoch": 0.7488702388637831,
"grad_norm": 1.7183053243234099,
"learning_rate": 3.602866941071773e-06,
"loss": 1.1943,
"step": 1160
},
{
"epoch": 0.7520981278244029,
"grad_norm": 1.5926230650949367,
"learning_rate": 3.516665474852369e-06,
"loss": 1.1967,
"step": 1165
},
{
"epoch": 0.7553260167850226,
"grad_norm": 1.6234706555802916,
"learning_rate": 3.431287213713299e-06,
"loss": 1.2206,
"step": 1170
},
{
"epoch": 0.7585539057456423,
"grad_norm": 1.7812533274829592,
"learning_rate": 3.3467429983443477e-06,
"loss": 1.1672,
"step": 1175
},
{
"epoch": 0.7617817947062621,
"grad_norm": 1.6294013276039931,
"learning_rate": 3.2630435635344283e-06,
"loss": 1.1823,
"step": 1180
},
{
"epoch": 0.7650096836668818,
"grad_norm": 1.7287640987665838,
"learning_rate": 3.180199536808576e-06,
"loss": 1.1738,
"step": 1185
},
{
"epoch": 0.7682375726275016,
"grad_norm": 1.600269379657349,
"learning_rate": 3.0982214370785335e-06,
"loss": 1.1996,
"step": 1190
},
{
"epoch": 0.7714654615881213,
"grad_norm": 1.5636282826440273,
"learning_rate": 3.0171196733071405e-06,
"loss": 1.2442,
"step": 1195
},
{
"epoch": 0.7746933505487411,
"grad_norm": 1.7137012337541018,
"learning_rate": 2.936904543186695e-06,
"loss": 1.2059,
"step": 1200
},
{
"epoch": 0.7779212395093609,
"grad_norm": 1.6511272127287338,
"learning_rate": 2.8575862318314087e-06,
"loss": 1.1886,
"step": 1205
},
{
"epoch": 0.7811491284699806,
"grad_norm": 1.6282736399632571,
"learning_rate": 2.7791748104841876e-06,
"loss": 1.199,
"step": 1210
},
{
"epoch": 0.7843770174306004,
"grad_norm": 1.6356472661837154,
"learning_rate": 2.7016802352378755e-06,
"loss": 1.1759,
"step": 1215
},
{
"epoch": 0.7876049063912202,
"grad_norm": 1.6120024552770273,
"learning_rate": 2.6251123457710726e-06,
"loss": 1.1921,
"step": 1220
},
{
"epoch": 0.7908327953518399,
"grad_norm": 1.5949781516189967,
"learning_rate": 2.5494808640988e-06,
"loss": 1.2128,
"step": 1225
},
{
"epoch": 0.7940606843124597,
"grad_norm": 1.6494400626843206,
"learning_rate": 2.4747953933380463e-06,
"loss": 1.2222,
"step": 1230
},
{
"epoch": 0.7972885732730794,
"grad_norm": 1.7679182098039068,
"learning_rate": 2.4010654164884527e-06,
"loss": 1.2308,
"step": 1235
},
{
"epoch": 0.8005164622336992,
"grad_norm": 1.567858838790038,
"learning_rate": 2.328300295228221e-06,
"loss": 1.2143,
"step": 1240
},
{
"epoch": 0.8037443511943189,
"grad_norm": 1.5218279025659718,
"learning_rate": 2.2565092687254486e-06,
"loss": 1.2266,
"step": 1245
},
{
"epoch": 0.8069722401549386,
"grad_norm": 1.591250607928255,
"learning_rate": 2.185701452464997e-06,
"loss": 1.1945,
"step": 1250
},
{
"epoch": 0.8102001291155584,
"grad_norm": 1.5033575415964024,
"learning_rate": 2.1158858370910905e-06,
"loss": 1.1877,
"step": 1255
},
{
"epoch": 0.8134280180761781,
"grad_norm": 1.556794336082307,
"learning_rate": 2.047071287265735e-06,
"loss": 1.1718,
"step": 1260
},
{
"epoch": 0.8166559070367979,
"grad_norm": 1.60369367023606,
"learning_rate": 1.9792665405431654e-06,
"loss": 1.1449,
"step": 1265
},
{
"epoch": 0.8198837959974177,
"grad_norm": 1.6751904466505534,
"learning_rate": 1.9124802062604066e-06,
"loss": 1.2021,
"step": 1270
},
{
"epoch": 0.8231116849580374,
"grad_norm": 1.711192714943588,
"learning_rate": 1.8467207644441243e-06,
"loss": 1.2269,
"step": 1275
},
{
"epoch": 0.8263395739186572,
"grad_norm": 1.772092654829715,
"learning_rate": 1.7819965647339065e-06,
"loss": 1.2432,
"step": 1280
},
{
"epoch": 0.829567462879277,
"grad_norm": 1.676701962809608,
"learning_rate": 1.718315825322071e-06,
"loss": 1.2017,
"step": 1285
},
{
"epoch": 0.8327953518398967,
"grad_norm": 1.5761976498566383,
"learning_rate": 1.6556866319102027e-06,
"loss": 1.2506,
"step": 1290
},
{
"epoch": 0.8360232408005165,
"grad_norm": 1.6807189675833636,
"learning_rate": 1.5941169366824705e-06,
"loss": 1.2133,
"step": 1295
},
{
"epoch": 0.8392511297611362,
"grad_norm": 1.646503425758142,
"learning_rate": 1.5336145572959283e-06,
"loss": 1.1936,
"step": 1300
},
{
"epoch": 0.842479018721756,
"grad_norm": 1.7026203537198492,
"learning_rate": 1.4741871758878978e-06,
"loss": 1.2032,
"step": 1305
},
{
"epoch": 0.8457069076823758,
"grad_norm": 1.6346183464452626,
"learning_rate": 1.4158423381005294e-06,
"loss": 1.2116,
"step": 1310
},
{
"epoch": 0.8489347966429954,
"grad_norm": 1.7297662015967483,
"learning_rate": 1.3585874521227226e-06,
"loss": 1.1539,
"step": 1315
},
{
"epoch": 0.8521626856036152,
"grad_norm": 1.5738539691074456,
"learning_rate": 1.3024297877494973e-06,
"loss": 1.2341,
"step": 1320
},
{
"epoch": 0.855390574564235,
"grad_norm": 1.665243874982246,
"learning_rate": 1.2473764754589123e-06,
"loss": 1.199,
"step": 1325
},
{
"epoch": 0.8586184635248547,
"grad_norm": 1.6694340869186672,
"learning_rate": 1.193434505506711e-06,
"loss": 1.1952,
"step": 1330
},
{
"epoch": 0.8618463524854745,
"grad_norm": 1.5537154219219604,
"learning_rate": 1.1406107270387323e-06,
"loss": 1.1915,
"step": 1335
},
{
"epoch": 0.8650742414460942,
"grad_norm": 1.725368998865333,
"learning_rate": 1.0889118472212702e-06,
"loss": 1.2092,
"step": 1340
},
{
"epoch": 0.868302130406714,
"grad_norm": 1.5428415313575736,
"learning_rate": 1.0383444303894453e-06,
"loss": 1.1772,
"step": 1345
},
{
"epoch": 0.8715300193673338,
"grad_norm": 1.5815501053581127,
"learning_rate": 9.889148972137097e-07,
"loss": 1.2139,
"step": 1350
},
{
"epoch": 0.8747579083279535,
"grad_norm": 1.7341001178375826,
"learning_rate": 9.406295238846108e-07,
"loss": 1.1595,
"step": 1355
},
{
"epoch": 0.8779857972885733,
"grad_norm": 1.4627233349193656,
"learning_rate": 8.934944413158708e-07,
"loss": 1.1948,
"step": 1360
},
{
"epoch": 0.881213686249193,
"grad_norm": 1.6770957968166575,
"learning_rate": 8.475156343659408e-07,
"loss": 1.168,
"step": 1365
},
{
"epoch": 0.8844415752098128,
"grad_norm": 1.7864490716701726,
"learning_rate": 8.026989410780917e-07,
"loss": 1.1725,
"step": 1370
},
{
"epoch": 0.8876694641704326,
"grad_norm": 1.6586604033600898,
"learning_rate": 7.590500519391309e-07,
"loss": 1.2569,
"step": 1375
},
{
"epoch": 0.8908973531310523,
"grad_norm": 1.5709797013859719,
"learning_rate": 7.165745091568743e-07,
"loss": 1.208,
"step": 1380
},
{
"epoch": 0.8941252420916721,
"grad_norm": 1.6100593510027281,
"learning_rate": 6.752777059564431e-07,
"loss": 1.173,
"step": 1385
},
{
"epoch": 0.8973531310522918,
"grad_norm": 1.5537220228932802,
"learning_rate": 6.351648858954618e-07,
"loss": 1.2228,
"step": 1390
},
{
"epoch": 0.9005810200129115,
"grad_norm": 1.6474724452037988,
"learning_rate": 5.962411421982805e-07,
"loss": 1.2409,
"step": 1395
},
{
"epoch": 0.9038089089735313,
"grad_norm": 1.79701983524165,
"learning_rate": 5.585114171092665e-07,
"loss": 1.2447,
"step": 1400
},
{
"epoch": 0.907036797934151,
"grad_norm": 1.6288904196875291,
"learning_rate": 5.219805012652867e-07,
"loss": 1.2474,
"step": 1405
},
{
"epoch": 0.9102646868947708,
"grad_norm": 1.6672222213895183,
"learning_rate": 4.866530330874153e-07,
"loss": 1.2176,
"step": 1410
},
{
"epoch": 0.9134925758553906,
"grad_norm": 1.646804223428847,
"learning_rate": 4.5253349819199375e-07,
"loss": 1.1707,
"step": 1415
},
{
"epoch": 0.9167204648160103,
"grad_norm": 1.604950832825935,
"learning_rate": 4.1962622882107174e-07,
"loss": 1.2503,
"step": 1420
},
{
"epoch": 0.9199483537766301,
"grad_norm": 1.5500232834639758,
"learning_rate": 3.8793540329233994e-07,
"loss": 1.202,
"step": 1425
},
{
"epoch": 0.9231762427372499,
"grad_norm": 1.706686354946513,
"learning_rate": 3.574650454685902e-07,
"loss": 1.1805,
"step": 1430
},
{
"epoch": 0.9264041316978696,
"grad_norm": 1.7851736282340886,
"learning_rate": 3.282190242468031e-07,
"loss": 1.209,
"step": 1435
},
{
"epoch": 0.9296320206584894,
"grad_norm": 1.6284062775954382,
"learning_rate": 3.0020105306689973e-07,
"loss": 1.185,
"step": 1440
},
{
"epoch": 0.9328599096191091,
"grad_norm": 1.5959442808129058,
"learning_rate": 2.7341468944023677e-07,
"loss": 1.1799,
"step": 1445
},
{
"epoch": 0.9360877985797289,
"grad_norm": 1.6373373190700566,
"learning_rate": 2.4786333449790753e-07,
"loss": 1.2108,
"step": 1450
},
{
"epoch": 0.9393156875403487,
"grad_norm": 1.5974098530281817,
"learning_rate": 2.235502325588823e-07,
"loss": 1.2309,
"step": 1455
},
{
"epoch": 0.9425435765009683,
"grad_norm": 1.6001479697293755,
"learning_rate": 2.0047847071807402e-07,
"loss": 1.2221,
"step": 1460
},
{
"epoch": 0.9457714654615881,
"grad_norm": 1.6270735112550445,
"learning_rate": 1.786509784543633e-07,
"loss": 1.2538,
"step": 1465
},
{
"epoch": 0.9489993544222078,
"grad_norm": 1.5793008113099312,
"learning_rate": 1.5807052725863025e-07,
"loss": 1.1756,
"step": 1470
},
{
"epoch": 0.9522272433828276,
"grad_norm": 1.6398931143884907,
"learning_rate": 1.3873973028185827e-07,
"loss": 1.1875,
"step": 1475
},
{
"epoch": 0.9554551323434474,
"grad_norm": 1.6178609914467408,
"learning_rate": 1.206610420033305e-07,
"loss": 1.189,
"step": 1480
},
{
"epoch": 0.9586830213040671,
"grad_norm": 1.697620840896172,
"learning_rate": 1.038367579189803e-07,
"loss": 1.1788,
"step": 1485
},
{
"epoch": 0.9619109102646869,
"grad_norm": 1.7239724713198632,
"learning_rate": 8.82690142499254e-08,
"loss": 1.2185,
"step": 1490
},
{
"epoch": 0.9651387992253067,
"grad_norm": 1.5330470110994838,
"learning_rate": 7.395978767122946e-08,
"loss": 1.1958,
"step": 1495
},
{
"epoch": 0.9683666881859264,
"grad_norm": 1.7081829292862492,
"learning_rate": 6.091089506091386e-08,
"loss": 1.1942,
"step": 1500
},
{
"epoch": 0.9715945771465462,
"grad_norm": 1.8325780163771659,
"learning_rate": 4.9123993269271084e-08,
"loss": 1.1928,
"step": 1505
},
{
"epoch": 0.9748224661071659,
"grad_norm": 1.6145771848502664,
"learning_rate": 3.860057890848201e-08,
"loss": 1.1975,
"step": 1510
},
{
"epoch": 0.9780503550677857,
"grad_norm": 1.6524344607725974,
"learning_rate": 2.9341988162595593e-08,
"loss": 1.1951,
"step": 1515
},
{
"epoch": 0.9812782440284055,
"grad_norm": 1.6975269723487802,
"learning_rate": 2.1349396617862395e-08,
"loss": 1.2068,
"step": 1520
},
{
"epoch": 0.9845061329890252,
"grad_norm": 1.6264164940688373,
"learning_rate": 1.4623819113475102e-08,
"loss": 1.1978,
"step": 1525
},
{
"epoch": 0.9877340219496449,
"grad_norm": 1.6407711137766725,
"learning_rate": 9.166109612706031e-09,
"loss": 1.2098,
"step": 1530
},
{
"epoch": 0.9909619109102646,
"grad_norm": 1.556515755815733,
"learning_rate": 4.976961094479427e-09,
"loss": 1.2264,
"step": 1535
},
{
"epoch": 0.9941897998708844,
"grad_norm": 1.6816771411617426,
"learning_rate": 2.0569054653840625e-09,
"loss": 1.1912,
"step": 1540
},
{
"epoch": 0.9974176888315042,
"grad_norm": 1.6707018085579257,
"learning_rate": 4.0631349213060555e-10,
"loss": 1.1822,
"step": 1545
},
{
"epoch": 1.0,
"eval_loss": 1.1962528228759766,
"eval_runtime": 267.2537,
"eval_samples_per_second": 31.206,
"eval_steps_per_second": 0.977,
"step": 1549
},
{
"epoch": 1.0,
"step": 1549,
"total_flos": 20529608785920.0,
"train_loss": 1.263828207016607,
"train_runtime": 6453.9435,
"train_samples_per_second": 7.677,
"train_steps_per_second": 0.24
}
],
"logging_steps": 5,
"max_steps": 1549,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 20529608785920.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}