MHGanainy's picture
MHGanainy/mgpt-lora-multi-switzerland-balanced-1024
86398f0 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 47974,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002084462417142619,
"grad_norm": 13.836400985717773,
"learning_rate": 5.2111831991453666e-08,
"loss": 6.7155,
"step": 100
},
{
"epoch": 0.004168924834285238,
"grad_norm": 8.658980369567871,
"learning_rate": 1.0422366398290733e-07,
"loss": 6.7017,
"step": 200
},
{
"epoch": 0.006253387251427857,
"grad_norm": 9.073185920715332,
"learning_rate": 1.5633549597436098e-07,
"loss": 6.7153,
"step": 300
},
{
"epoch": 0.008337849668570476,
"grad_norm": 7.736963748931885,
"learning_rate": 2.0844732796581466e-07,
"loss": 6.666,
"step": 400
},
{
"epoch": 0.010422312085713094,
"grad_norm": 6.6067938804626465,
"learning_rate": 2.605591599572683e-07,
"loss": 6.5606,
"step": 500
},
{
"epoch": 0.012506774502855714,
"grad_norm": 7.66089391708374,
"learning_rate": 3.1267099194872195e-07,
"loss": 6.4177,
"step": 600
},
{
"epoch": 0.014591236919998333,
"grad_norm": 6.463258266448975,
"learning_rate": 3.6478282394017564e-07,
"loss": 6.1661,
"step": 700
},
{
"epoch": 0.016675699337140953,
"grad_norm": 6.471975326538086,
"learning_rate": 4.168946559316293e-07,
"loss": 5.9347,
"step": 800
},
{
"epoch": 0.01876016175428357,
"grad_norm": 4.940770149230957,
"learning_rate": 4.6900648792308296e-07,
"loss": 5.5964,
"step": 900
},
{
"epoch": 0.02084462417142619,
"grad_norm": 4.507166385650635,
"learning_rate": 5.211183199145366e-07,
"loss": 5.3496,
"step": 1000
},
{
"epoch": 0.02292908658856881,
"grad_norm": 3.123079776763916,
"learning_rate": 5.732301519059903e-07,
"loss": 5.1377,
"step": 1100
},
{
"epoch": 0.025013549005711427,
"grad_norm": 2.5389294624328613,
"learning_rate": 6.253419838974439e-07,
"loss": 4.9738,
"step": 1200
},
{
"epoch": 0.027098011422854045,
"grad_norm": 3.0952858924865723,
"learning_rate": 6.774538158888977e-07,
"loss": 4.8748,
"step": 1300
},
{
"epoch": 0.029182473839996666,
"grad_norm": 2.792858362197876,
"learning_rate": 7.295656478803513e-07,
"loss": 4.7841,
"step": 1400
},
{
"epoch": 0.031266936257139284,
"grad_norm": 1.8279093503952026,
"learning_rate": 7.81677479871805e-07,
"loss": 4.741,
"step": 1500
},
{
"epoch": 0.033351398674281905,
"grad_norm": 5.27761173248291,
"learning_rate": 8.337893118632586e-07,
"loss": 4.6813,
"step": 1600
},
{
"epoch": 0.03543586109142452,
"grad_norm": 7.852522373199463,
"learning_rate": 8.859011438547123e-07,
"loss": 4.6384,
"step": 1700
},
{
"epoch": 0.03752032350856714,
"grad_norm": 1.914821982383728,
"learning_rate": 9.380129758461659e-07,
"loss": 4.5633,
"step": 1800
},
{
"epoch": 0.03960478592570976,
"grad_norm": 2.3500185012817383,
"learning_rate": 9.901248078376196e-07,
"loss": 4.5251,
"step": 1900
},
{
"epoch": 0.04168924834285238,
"grad_norm": 1.5976965427398682,
"learning_rate": 1.0422366398290733e-06,
"loss": 4.4963,
"step": 2000
},
{
"epoch": 0.043773710759995,
"grad_norm": 1.1844316720962524,
"learning_rate": 1.094348471820527e-06,
"loss": 4.4873,
"step": 2100
},
{
"epoch": 0.04585817317713762,
"grad_norm": 2.388838529586792,
"learning_rate": 1.1464603038119807e-06,
"loss": 4.4591,
"step": 2200
},
{
"epoch": 0.04794263559428023,
"grad_norm": 2.453315496444702,
"learning_rate": 1.1985721358034343e-06,
"loss": 4.4017,
"step": 2300
},
{
"epoch": 0.050027098011422855,
"grad_norm": 4.433303356170654,
"learning_rate": 1.2506839677948878e-06,
"loss": 4.3945,
"step": 2400
},
{
"epoch": 0.052111560428565476,
"grad_norm": 6.372536659240723,
"learning_rate": 1.3027957997863417e-06,
"loss": 4.379,
"step": 2500
},
{
"epoch": 0.05419602284570809,
"grad_norm": 1.4244662523269653,
"learning_rate": 1.3549076317777954e-06,
"loss": 4.3155,
"step": 2600
},
{
"epoch": 0.05628048526285071,
"grad_norm": 2.0939528942108154,
"learning_rate": 1.4070194637692489e-06,
"loss": 4.3134,
"step": 2700
},
{
"epoch": 0.05836494767999333,
"grad_norm": 3.5981059074401855,
"learning_rate": 1.4591312957607026e-06,
"loss": 4.2774,
"step": 2800
},
{
"epoch": 0.06044941009713595,
"grad_norm": 2.061983108520508,
"learning_rate": 1.5112431277521565e-06,
"loss": 4.2482,
"step": 2900
},
{
"epoch": 0.06253387251427857,
"grad_norm": 2.26055645942688,
"learning_rate": 1.56335495974361e-06,
"loss": 4.1851,
"step": 3000
},
{
"epoch": 0.06461833493142119,
"grad_norm": 3.287569999694824,
"learning_rate": 1.6154667917350636e-06,
"loss": 4.1233,
"step": 3100
},
{
"epoch": 0.06670279734856381,
"grad_norm": 10.731806755065918,
"learning_rate": 1.6675786237265173e-06,
"loss": 4.0644,
"step": 3200
},
{
"epoch": 0.06878725976570642,
"grad_norm": 5.914673805236816,
"learning_rate": 1.7196904557179708e-06,
"loss": 4.0184,
"step": 3300
},
{
"epoch": 0.07087172218284904,
"grad_norm": 4.28048038482666,
"learning_rate": 1.7718022877094247e-06,
"loss": 3.9232,
"step": 3400
},
{
"epoch": 0.07295618459999166,
"grad_norm": 3.385477304458618,
"learning_rate": 1.8239141197008784e-06,
"loss": 3.8465,
"step": 3500
},
{
"epoch": 0.07504064701713428,
"grad_norm": 4.852660655975342,
"learning_rate": 1.8760259516923318e-06,
"loss": 3.7294,
"step": 3600
},
{
"epoch": 0.0771251094342769,
"grad_norm": 8.669583320617676,
"learning_rate": 1.9281377836837857e-06,
"loss": 3.6384,
"step": 3700
},
{
"epoch": 0.07920957185141952,
"grad_norm": 4.202250003814697,
"learning_rate": 1.980249615675239e-06,
"loss": 3.602,
"step": 3800
},
{
"epoch": 0.08129403426856213,
"grad_norm": 4.960677623748779,
"learning_rate": 2.0323614476666927e-06,
"loss": 3.5076,
"step": 3900
},
{
"epoch": 0.08337849668570475,
"grad_norm": 6.4040632247924805,
"learning_rate": 2.0844732796581466e-06,
"loss": 3.4811,
"step": 4000
},
{
"epoch": 0.08546295910284737,
"grad_norm": 15.12913703918457,
"learning_rate": 2.1365851116496005e-06,
"loss": 3.4111,
"step": 4100
},
{
"epoch": 0.08754742151999,
"grad_norm": 3.1779630184173584,
"learning_rate": 2.188696943641054e-06,
"loss": 3.3648,
"step": 4200
},
{
"epoch": 0.08963188393713262,
"grad_norm": 3.345259428024292,
"learning_rate": 2.2408087756325074e-06,
"loss": 3.2919,
"step": 4300
},
{
"epoch": 0.09171634635427524,
"grad_norm": 2.6787526607513428,
"learning_rate": 2.2929206076239613e-06,
"loss": 3.232,
"step": 4400
},
{
"epoch": 0.09380080877141785,
"grad_norm": 4.2783942222595215,
"learning_rate": 2.3450324396154148e-06,
"loss": 3.1733,
"step": 4500
},
{
"epoch": 0.09588527118856047,
"grad_norm": 2.9659714698791504,
"learning_rate": 2.3971442716068687e-06,
"loss": 3.1213,
"step": 4600
},
{
"epoch": 0.09796973360570309,
"grad_norm": 2.596327543258667,
"learning_rate": 2.449256103598322e-06,
"loss": 2.9828,
"step": 4700
},
{
"epoch": 0.10005419602284571,
"grad_norm": 2.4865667819976807,
"learning_rate": 2.5013679355897756e-06,
"loss": 2.9395,
"step": 4800
},
{
"epoch": 0.10213865843998833,
"grad_norm": 2.3512156009674072,
"learning_rate": 2.5534797675812295e-06,
"loss": 2.9167,
"step": 4900
},
{
"epoch": 0.10422312085713095,
"grad_norm": 3.3229358196258545,
"learning_rate": 2.6055915995726834e-06,
"loss": 2.8808,
"step": 5000
},
{
"epoch": 0.10630758327427356,
"grad_norm": 1.6489683389663696,
"learning_rate": 2.657703431564137e-06,
"loss": 2.8482,
"step": 5100
},
{
"epoch": 0.10839204569141618,
"grad_norm": 1.825037956237793,
"learning_rate": 2.709815263555591e-06,
"loss": 2.7804,
"step": 5200
},
{
"epoch": 0.1104765081085588,
"grad_norm": 2.1118900775909424,
"learning_rate": 2.761927095547044e-06,
"loss": 2.7678,
"step": 5300
},
{
"epoch": 0.11256097052570142,
"grad_norm": 1.8719017505645752,
"learning_rate": 2.8140389275384977e-06,
"loss": 2.8172,
"step": 5400
},
{
"epoch": 0.11464543294284404,
"grad_norm": 2.1248514652252197,
"learning_rate": 2.8661507595299516e-06,
"loss": 2.7631,
"step": 5500
},
{
"epoch": 0.11672989535998667,
"grad_norm": 2.4080471992492676,
"learning_rate": 2.918262591521405e-06,
"loss": 2.7575,
"step": 5600
},
{
"epoch": 0.11881435777712927,
"grad_norm": 8.700263977050781,
"learning_rate": 2.970374423512859e-06,
"loss": 2.7149,
"step": 5700
},
{
"epoch": 0.1208988201942719,
"grad_norm": 3.667365550994873,
"learning_rate": 3.022486255504313e-06,
"loss": 2.6248,
"step": 5800
},
{
"epoch": 0.12298328261141452,
"grad_norm": 2.9087088108062744,
"learning_rate": 3.074598087495766e-06,
"loss": 2.6602,
"step": 5900
},
{
"epoch": 0.12506774502855714,
"grad_norm": 76.12700653076172,
"learning_rate": 3.12670991948722e-06,
"loss": 2.6869,
"step": 6000
},
{
"epoch": 0.12715220744569974,
"grad_norm": 5.0292158126831055,
"learning_rate": 3.1788217514786733e-06,
"loss": 2.681,
"step": 6100
},
{
"epoch": 0.12923666986284238,
"grad_norm": 1.637168526649475,
"learning_rate": 3.2309335834701272e-06,
"loss": 2.6457,
"step": 6200
},
{
"epoch": 0.131321132279985,
"grad_norm": 5.850615978240967,
"learning_rate": 3.283045415461581e-06,
"loss": 2.6134,
"step": 6300
},
{
"epoch": 0.13340559469712762,
"grad_norm": 1.638176441192627,
"learning_rate": 3.3351572474530346e-06,
"loss": 2.5828,
"step": 6400
},
{
"epoch": 0.13549005711427023,
"grad_norm": 2.268254280090332,
"learning_rate": 3.387269079444488e-06,
"loss": 2.5978,
"step": 6500
},
{
"epoch": 0.13757451953141284,
"grad_norm": 3.5241456031799316,
"learning_rate": 3.4393809114359415e-06,
"loss": 2.6069,
"step": 6600
},
{
"epoch": 0.13965898194855547,
"grad_norm": 1.5713567733764648,
"learning_rate": 3.4914927434273954e-06,
"loss": 2.6177,
"step": 6700
},
{
"epoch": 0.14174344436569808,
"grad_norm": 4.928940773010254,
"learning_rate": 3.5436045754188493e-06,
"loss": 2.5887,
"step": 6800
},
{
"epoch": 0.14382790678284071,
"grad_norm": 1.6744616031646729,
"learning_rate": 3.595716407410303e-06,
"loss": 2.5754,
"step": 6900
},
{
"epoch": 0.14591236919998332,
"grad_norm": 2.2109663486480713,
"learning_rate": 3.6478282394017567e-06,
"loss": 2.5368,
"step": 7000
},
{
"epoch": 0.14799683161712596,
"grad_norm": 4.187534332275391,
"learning_rate": 3.6999400713932098e-06,
"loss": 2.5559,
"step": 7100
},
{
"epoch": 0.15008129403426856,
"grad_norm": 1.7843375205993652,
"learning_rate": 3.7520519033846637e-06,
"loss": 2.5637,
"step": 7200
},
{
"epoch": 0.15216575645141117,
"grad_norm": 2.460526943206787,
"learning_rate": 3.8041637353761176e-06,
"loss": 2.5256,
"step": 7300
},
{
"epoch": 0.1542502188685538,
"grad_norm": 1.5797803401947021,
"learning_rate": 3.8562755673675715e-06,
"loss": 2.5141,
"step": 7400
},
{
"epoch": 0.15633468128569641,
"grad_norm": 4.426311492919922,
"learning_rate": 3.9083873993590245e-06,
"loss": 2.5137,
"step": 7500
},
{
"epoch": 0.15841914370283905,
"grad_norm": 1.332321286201477,
"learning_rate": 3.960499231350478e-06,
"loss": 2.5083,
"step": 7600
},
{
"epoch": 0.16050360611998166,
"grad_norm": 2.9853885173797607,
"learning_rate": 4.0126110633419315e-06,
"loss": 2.5184,
"step": 7700
},
{
"epoch": 0.16258806853712426,
"grad_norm": 1.614306092262268,
"learning_rate": 4.064722895333385e-06,
"loss": 2.4973,
"step": 7800
},
{
"epoch": 0.1646725309542669,
"grad_norm": 1.6364295482635498,
"learning_rate": 4.116834727324839e-06,
"loss": 2.4649,
"step": 7900
},
{
"epoch": 0.1667569933714095,
"grad_norm": 3.250443458557129,
"learning_rate": 4.168946559316293e-06,
"loss": 2.5036,
"step": 8000
},
{
"epoch": 0.16884145578855214,
"grad_norm": 6.634191989898682,
"learning_rate": 4.221058391307747e-06,
"loss": 2.4898,
"step": 8100
},
{
"epoch": 0.17092591820569475,
"grad_norm": 1.429456114768982,
"learning_rate": 4.273170223299201e-06,
"loss": 2.4417,
"step": 8200
},
{
"epoch": 0.17301038062283736,
"grad_norm": 7.4600629806518555,
"learning_rate": 4.325282055290654e-06,
"loss": 2.4756,
"step": 8300
},
{
"epoch": 0.17509484303998,
"grad_norm": 4.795893669128418,
"learning_rate": 4.377393887282108e-06,
"loss": 2.4688,
"step": 8400
},
{
"epoch": 0.1771793054571226,
"grad_norm": 1.5229403972625732,
"learning_rate": 4.429505719273561e-06,
"loss": 2.4711,
"step": 8500
},
{
"epoch": 0.17926376787426523,
"grad_norm": 3.5206642150878906,
"learning_rate": 4.481617551265015e-06,
"loss": 2.4717,
"step": 8600
},
{
"epoch": 0.18134823029140784,
"grad_norm": 3.3243954181671143,
"learning_rate": 4.533729383256469e-06,
"loss": 2.4083,
"step": 8700
},
{
"epoch": 0.18343269270855048,
"grad_norm": 1.5031596422195435,
"learning_rate": 4.585841215247923e-06,
"loss": 2.4354,
"step": 8800
},
{
"epoch": 0.18551715512569308,
"grad_norm": 1.401964545249939,
"learning_rate": 4.6379530472393765e-06,
"loss": 2.4274,
"step": 8900
},
{
"epoch": 0.1876016175428357,
"grad_norm": 2.375384569168091,
"learning_rate": 4.6900648792308296e-06,
"loss": 2.4012,
"step": 9000
},
{
"epoch": 0.18968607995997833,
"grad_norm": 1.4875749349594116,
"learning_rate": 4.7421767112222835e-06,
"loss": 2.3942,
"step": 9100
},
{
"epoch": 0.19177054237712093,
"grad_norm": 23.226573944091797,
"learning_rate": 4.794288543213737e-06,
"loss": 2.4138,
"step": 9200
},
{
"epoch": 0.19385500479426357,
"grad_norm": 1.3855865001678467,
"learning_rate": 4.84640037520519e-06,
"loss": 2.3718,
"step": 9300
},
{
"epoch": 0.19593946721140618,
"grad_norm": 1.4095040559768677,
"learning_rate": 4.898512207196644e-06,
"loss": 2.396,
"step": 9400
},
{
"epoch": 0.19802392962854878,
"grad_norm": 1.4427977800369263,
"learning_rate": 4.950624039188098e-06,
"loss": 2.387,
"step": 9500
},
{
"epoch": 0.20010839204569142,
"grad_norm": 2.55526065826416,
"learning_rate": 5.002735871179551e-06,
"loss": 2.4142,
"step": 9600
},
{
"epoch": 0.20219285446283403,
"grad_norm": 7.573841571807861,
"learning_rate": 5.054847703171005e-06,
"loss": 2.3758,
"step": 9700
},
{
"epoch": 0.20427731687997666,
"grad_norm": 4.039884567260742,
"learning_rate": 5.106959535162459e-06,
"loss": 2.4,
"step": 9800
},
{
"epoch": 0.20636177929711927,
"grad_norm": 2.0507349967956543,
"learning_rate": 5.159071367153913e-06,
"loss": 2.3814,
"step": 9900
},
{
"epoch": 0.2084462417142619,
"grad_norm": 2.276043176651001,
"learning_rate": 5.211183199145367e-06,
"loss": 2.3702,
"step": 10000
},
{
"epoch": 0.2105307041314045,
"grad_norm": 1.605398178100586,
"learning_rate": 5.26329503113682e-06,
"loss": 2.3945,
"step": 10100
},
{
"epoch": 0.21261516654854712,
"grad_norm": 4.99838399887085,
"learning_rate": 5.315406863128274e-06,
"loss": 2.3842,
"step": 10200
},
{
"epoch": 0.21469962896568975,
"grad_norm": 6.735813617706299,
"learning_rate": 5.367518695119728e-06,
"loss": 2.3526,
"step": 10300
},
{
"epoch": 0.21678409138283236,
"grad_norm": 1.6342523097991943,
"learning_rate": 5.419630527111182e-06,
"loss": 2.3472,
"step": 10400
},
{
"epoch": 0.218868553799975,
"grad_norm": 1.485580563545227,
"learning_rate": 5.471742359102634e-06,
"loss": 2.3572,
"step": 10500
},
{
"epoch": 0.2209530162171176,
"grad_norm": 2.2503373622894287,
"learning_rate": 5.523854191094088e-06,
"loss": 2.3738,
"step": 10600
},
{
"epoch": 0.2230374786342602,
"grad_norm": 1.6811813116073608,
"learning_rate": 5.575966023085542e-06,
"loss": 2.3432,
"step": 10700
},
{
"epoch": 0.22512194105140285,
"grad_norm": 7.358359336853027,
"learning_rate": 5.6280778550769955e-06,
"loss": 2.3529,
"step": 10800
},
{
"epoch": 0.22720640346854545,
"grad_norm": 1.528684377670288,
"learning_rate": 5.680189687068449e-06,
"loss": 2.3507,
"step": 10900
},
{
"epoch": 0.2292908658856881,
"grad_norm": 1.4262800216674805,
"learning_rate": 5.732301519059903e-06,
"loss": 2.3016,
"step": 11000
},
{
"epoch": 0.2313753283028307,
"grad_norm": 2.4375407695770264,
"learning_rate": 5.784413351051356e-06,
"loss": 2.3177,
"step": 11100
},
{
"epoch": 0.23345979071997333,
"grad_norm": 18.75299644470215,
"learning_rate": 5.83652518304281e-06,
"loss": 2.3324,
"step": 11200
},
{
"epoch": 0.23554425313711594,
"grad_norm": 2.372840642929077,
"learning_rate": 5.888637015034264e-06,
"loss": 2.3284,
"step": 11300
},
{
"epoch": 0.23762871555425855,
"grad_norm": 2.269366502761841,
"learning_rate": 5.940748847025718e-06,
"loss": 2.2799,
"step": 11400
},
{
"epoch": 0.23971317797140118,
"grad_norm": 1.4434478282928467,
"learning_rate": 5.992860679017172e-06,
"loss": 2.3036,
"step": 11500
},
{
"epoch": 0.2417976403885438,
"grad_norm": 1.3579438924789429,
"learning_rate": 6.044972511008626e-06,
"loss": 2.3045,
"step": 11600
},
{
"epoch": 0.24388210280568642,
"grad_norm": 4.852165699005127,
"learning_rate": 6.097084343000078e-06,
"loss": 2.3113,
"step": 11700
},
{
"epoch": 0.24596656522282903,
"grad_norm": 12.108736991882324,
"learning_rate": 6.149196174991532e-06,
"loss": 2.2867,
"step": 11800
},
{
"epoch": 0.24805102763997164,
"grad_norm": 2.931642532348633,
"learning_rate": 6.201308006982986e-06,
"loss": 2.2815,
"step": 11900
},
{
"epoch": 0.2501354900571143,
"grad_norm": 5.88316535949707,
"learning_rate": 6.25341983897444e-06,
"loss": 2.2789,
"step": 12000
},
{
"epoch": 0.2522199524742569,
"grad_norm": 2.041398048400879,
"learning_rate": 6.305531670965893e-06,
"loss": 2.2884,
"step": 12100
},
{
"epoch": 0.2543044148913995,
"grad_norm": 3.2297041416168213,
"learning_rate": 6.357643502957347e-06,
"loss": 2.2711,
"step": 12200
},
{
"epoch": 0.25638887730854215,
"grad_norm": 7.2022528648376465,
"learning_rate": 6.4097553349488006e-06,
"loss": 2.2823,
"step": 12300
},
{
"epoch": 0.25847333972568476,
"grad_norm": 2.5352327823638916,
"learning_rate": 6.4618671669402545e-06,
"loss": 2.2806,
"step": 12400
},
{
"epoch": 0.26055780214282737,
"grad_norm": 2.3590762615203857,
"learning_rate": 6.513978998931708e-06,
"loss": 2.2521,
"step": 12500
},
{
"epoch": 0.26264226455997,
"grad_norm": 7.908915042877197,
"learning_rate": 6.566090830923162e-06,
"loss": 2.2594,
"step": 12600
},
{
"epoch": 0.2647267269771126,
"grad_norm": 8.686135292053223,
"learning_rate": 6.618202662914615e-06,
"loss": 2.2563,
"step": 12700
},
{
"epoch": 0.26681118939425524,
"grad_norm": 2.0514814853668213,
"learning_rate": 6.670314494906069e-06,
"loss": 2.2479,
"step": 12800
},
{
"epoch": 0.26889565181139785,
"grad_norm": 4.685930252075195,
"learning_rate": 6.722426326897523e-06,
"loss": 2.2353,
"step": 12900
},
{
"epoch": 0.27098011422854046,
"grad_norm": 2.2005646228790283,
"learning_rate": 6.774538158888976e-06,
"loss": 2.2428,
"step": 13000
},
{
"epoch": 0.27306457664568307,
"grad_norm": 3.514885902404785,
"learning_rate": 6.826649990880429e-06,
"loss": 2.2545,
"step": 13100
},
{
"epoch": 0.2751490390628257,
"grad_norm": 4.6062541007995605,
"learning_rate": 6.878761822871883e-06,
"loss": 2.2369,
"step": 13200
},
{
"epoch": 0.27723350147996834,
"grad_norm": 7.698457717895508,
"learning_rate": 6.930873654863337e-06,
"loss": 2.2333,
"step": 13300
},
{
"epoch": 0.27931796389711094,
"grad_norm": 3.1448521614074707,
"learning_rate": 6.982985486854791e-06,
"loss": 2.2231,
"step": 13400
},
{
"epoch": 0.28140242631425355,
"grad_norm": 4.200199127197266,
"learning_rate": 7.035097318846245e-06,
"loss": 2.2481,
"step": 13500
},
{
"epoch": 0.28348688873139616,
"grad_norm": 6.762167453765869,
"learning_rate": 7.087209150837699e-06,
"loss": 2.2332,
"step": 13600
},
{
"epoch": 0.28557135114853877,
"grad_norm": 3.139690637588501,
"learning_rate": 7.139320982829152e-06,
"loss": 2.2327,
"step": 13700
},
{
"epoch": 0.28765581356568143,
"grad_norm": 1.887600064277649,
"learning_rate": 7.191432814820606e-06,
"loss": 2.2404,
"step": 13800
},
{
"epoch": 0.28974027598282404,
"grad_norm": 1.8191242218017578,
"learning_rate": 7.2435446468120595e-06,
"loss": 2.228,
"step": 13900
},
{
"epoch": 0.29182473839996664,
"grad_norm": 3.9372363090515137,
"learning_rate": 7.2956564788035134e-06,
"loss": 2.1995,
"step": 14000
},
{
"epoch": 0.29390920081710925,
"grad_norm": 1.8228873014450073,
"learning_rate": 7.347768310794967e-06,
"loss": 2.2253,
"step": 14100
},
{
"epoch": 0.2959936632342519,
"grad_norm": 3.99737811088562,
"learning_rate": 7.3998801427864195e-06,
"loss": 2.2307,
"step": 14200
},
{
"epoch": 0.2980781256513945,
"grad_norm": 1.8391298055648804,
"learning_rate": 7.451991974777873e-06,
"loss": 2.2049,
"step": 14300
},
{
"epoch": 0.30016258806853713,
"grad_norm": 10.315858840942383,
"learning_rate": 7.504103806769327e-06,
"loss": 2.2189,
"step": 14400
},
{
"epoch": 0.30224705048567974,
"grad_norm": 1.3917250633239746,
"learning_rate": 7.556215638760781e-06,
"loss": 2.2243,
"step": 14500
},
{
"epoch": 0.30433151290282234,
"grad_norm": 3.459721326828003,
"learning_rate": 7.608327470752235e-06,
"loss": 2.185,
"step": 14600
},
{
"epoch": 0.306415975319965,
"grad_norm": 29.758569717407227,
"learning_rate": 7.660439302743689e-06,
"loss": 2.2134,
"step": 14700
},
{
"epoch": 0.3085004377371076,
"grad_norm": 2.3059887886047363,
"learning_rate": 7.712551134735143e-06,
"loss": 2.205,
"step": 14800
},
{
"epoch": 0.3105849001542502,
"grad_norm": 1.9825279712677002,
"learning_rate": 7.764662966726597e-06,
"loss": 2.2075,
"step": 14900
},
{
"epoch": 0.31266936257139283,
"grad_norm": 2.2246243953704834,
"learning_rate": 7.816774798718049e-06,
"loss": 2.2076,
"step": 15000
},
{
"epoch": 0.31475382498853544,
"grad_norm": 2.0113601684570312,
"learning_rate": 7.868886630709503e-06,
"loss": 2.1915,
"step": 15100
},
{
"epoch": 0.3168382874056781,
"grad_norm": 2.44480037689209,
"learning_rate": 7.920998462700957e-06,
"loss": 2.1928,
"step": 15200
},
{
"epoch": 0.3189227498228207,
"grad_norm": 2.129798173904419,
"learning_rate": 7.97311029469241e-06,
"loss": 2.1883,
"step": 15300
},
{
"epoch": 0.3210072122399633,
"grad_norm": 1.7509479522705078,
"learning_rate": 8.025222126683863e-06,
"loss": 2.1919,
"step": 15400
},
{
"epoch": 0.3230916746571059,
"grad_norm": 2.4539670944213867,
"learning_rate": 8.077333958675317e-06,
"loss": 2.1617,
"step": 15500
},
{
"epoch": 0.32517613707424853,
"grad_norm": 3.520508289337158,
"learning_rate": 8.12944579066677e-06,
"loss": 2.1987,
"step": 15600
},
{
"epoch": 0.3272605994913912,
"grad_norm": 3.6062064170837402,
"learning_rate": 8.181557622658225e-06,
"loss": 2.1994,
"step": 15700
},
{
"epoch": 0.3293450619085338,
"grad_norm": 3.181687355041504,
"learning_rate": 8.233669454649678e-06,
"loss": 2.1792,
"step": 15800
},
{
"epoch": 0.3314295243256764,
"grad_norm": 1.296694278717041,
"learning_rate": 8.285781286641132e-06,
"loss": 2.1744,
"step": 15900
},
{
"epoch": 0.333513986742819,
"grad_norm": 2.883561849594116,
"learning_rate": 8.337893118632586e-06,
"loss": 2.1785,
"step": 16000
},
{
"epoch": 0.3355984491599616,
"grad_norm": 1.8239498138427734,
"learning_rate": 8.39000495062404e-06,
"loss": 2.1833,
"step": 16100
},
{
"epoch": 0.3376829115771043,
"grad_norm": 5.87345552444458,
"learning_rate": 8.442116782615494e-06,
"loss": 2.1665,
"step": 16200
},
{
"epoch": 0.3397673739942469,
"grad_norm": 5.876729965209961,
"learning_rate": 8.494228614606948e-06,
"loss": 2.1671,
"step": 16300
},
{
"epoch": 0.3418518364113895,
"grad_norm": 5.031464099884033,
"learning_rate": 8.546340446598402e-06,
"loss": 2.1877,
"step": 16400
},
{
"epoch": 0.3439362988285321,
"grad_norm": 3.857750415802002,
"learning_rate": 8.598452278589856e-06,
"loss": 2.1781,
"step": 16500
},
{
"epoch": 0.3460207612456747,
"grad_norm": 2.921267032623291,
"learning_rate": 8.650564110581308e-06,
"loss": 2.1584,
"step": 16600
},
{
"epoch": 0.3481052236628174,
"grad_norm": 2.5080487728118896,
"learning_rate": 8.702675942572762e-06,
"loss": 2.1608,
"step": 16700
},
{
"epoch": 0.35018968607996,
"grad_norm": 2.8279311656951904,
"learning_rate": 8.754787774564216e-06,
"loss": 2.1528,
"step": 16800
},
{
"epoch": 0.3522741484971026,
"grad_norm": 1.7604950666427612,
"learning_rate": 8.80689960655567e-06,
"loss": 2.1748,
"step": 16900
},
{
"epoch": 0.3543586109142452,
"grad_norm": 12.567907333374023,
"learning_rate": 8.859011438547122e-06,
"loss": 2.1604,
"step": 17000
},
{
"epoch": 0.35644307333138786,
"grad_norm": 1.497755527496338,
"learning_rate": 8.911123270538576e-06,
"loss": 2.147,
"step": 17100
},
{
"epoch": 0.35852753574853047,
"grad_norm": 1.711016058921814,
"learning_rate": 8.96323510253003e-06,
"loss": 2.1759,
"step": 17200
},
{
"epoch": 0.3606119981656731,
"grad_norm": 7.569229602813721,
"learning_rate": 9.015346934521484e-06,
"loss": 2.1666,
"step": 17300
},
{
"epoch": 0.3626964605828157,
"grad_norm": 1.858202576637268,
"learning_rate": 9.067458766512937e-06,
"loss": 2.1569,
"step": 17400
},
{
"epoch": 0.3647809229999583,
"grad_norm": 4.447068691253662,
"learning_rate": 9.119570598504391e-06,
"loss": 2.1773,
"step": 17500
},
{
"epoch": 0.36686538541710095,
"grad_norm": 2.1441404819488525,
"learning_rate": 9.171682430495845e-06,
"loss": 2.1504,
"step": 17600
},
{
"epoch": 0.36894984783424356,
"grad_norm": 1.575992465019226,
"learning_rate": 9.223794262487299e-06,
"loss": 2.1642,
"step": 17700
},
{
"epoch": 0.37103431025138617,
"grad_norm": 2.184628486633301,
"learning_rate": 9.275906094478753e-06,
"loss": 2.1454,
"step": 17800
},
{
"epoch": 0.3731187726685288,
"grad_norm": 5.227324485778809,
"learning_rate": 9.328017926470205e-06,
"loss": 2.1464,
"step": 17900
},
{
"epoch": 0.3752032350856714,
"grad_norm": 1.5199558734893799,
"learning_rate": 9.380129758461659e-06,
"loss": 2.1461,
"step": 18000
},
{
"epoch": 0.37728769750281405,
"grad_norm": 1.9436824321746826,
"learning_rate": 9.432241590453113e-06,
"loss": 2.1522,
"step": 18100
},
{
"epoch": 0.37937215991995665,
"grad_norm": 2.0353071689605713,
"learning_rate": 9.484353422444567e-06,
"loss": 2.1546,
"step": 18200
},
{
"epoch": 0.38145662233709926,
"grad_norm": 1.867530107498169,
"learning_rate": 9.53646525443602e-06,
"loss": 2.1511,
"step": 18300
},
{
"epoch": 0.38354108475424187,
"grad_norm": 7.113090515136719,
"learning_rate": 9.588577086427475e-06,
"loss": 2.1417,
"step": 18400
},
{
"epoch": 0.3856255471713845,
"grad_norm": 2.368337392807007,
"learning_rate": 9.640688918418929e-06,
"loss": 2.1423,
"step": 18500
},
{
"epoch": 0.38771000958852714,
"grad_norm": 3.9562978744506836,
"learning_rate": 9.69280075041038e-06,
"loss": 2.1608,
"step": 18600
},
{
"epoch": 0.38979447200566975,
"grad_norm": 1.4725730419158936,
"learning_rate": 9.744912582401835e-06,
"loss": 2.1409,
"step": 18700
},
{
"epoch": 0.39187893442281235,
"grad_norm": 2.8534648418426514,
"learning_rate": 9.797024414393289e-06,
"loss": 2.1328,
"step": 18800
},
{
"epoch": 0.39396339683995496,
"grad_norm": 1.781811237335205,
"learning_rate": 9.849136246384743e-06,
"loss": 2.1201,
"step": 18900
},
{
"epoch": 0.39604785925709757,
"grad_norm": 1.6135547161102295,
"learning_rate": 9.901248078376196e-06,
"loss": 2.1413,
"step": 19000
},
{
"epoch": 0.39813232167424023,
"grad_norm": 1.416398048400879,
"learning_rate": 9.953359910367649e-06,
"loss": 2.1419,
"step": 19100
},
{
"epoch": 0.40021678409138284,
"grad_norm": 7.111115455627441,
"learning_rate": 1.0005471742359103e-05,
"loss": 2.1386,
"step": 19200
},
{
"epoch": 0.40230124650852545,
"grad_norm": 2.906944990158081,
"learning_rate": 1.0057583574350558e-05,
"loss": 2.123,
"step": 19300
},
{
"epoch": 0.40438570892566805,
"grad_norm": 1.5303776264190674,
"learning_rate": 1.010969540634201e-05,
"loss": 2.1393,
"step": 19400
},
{
"epoch": 0.4064701713428107,
"grad_norm": 2.383948564529419,
"learning_rate": 1.0161807238333466e-05,
"loss": 2.1314,
"step": 19500
},
{
"epoch": 0.4085546337599533,
"grad_norm": 6.843499660491943,
"learning_rate": 1.0213919070324918e-05,
"loss": 2.118,
"step": 19600
},
{
"epoch": 0.41063909617709593,
"grad_norm": 1.690718412399292,
"learning_rate": 1.026603090231637e-05,
"loss": 2.1212,
"step": 19700
},
{
"epoch": 0.41272355859423854,
"grad_norm": 2.3637075424194336,
"learning_rate": 1.0318142734307826e-05,
"loss": 2.1133,
"step": 19800
},
{
"epoch": 0.41480802101138115,
"grad_norm": 1.7208225727081299,
"learning_rate": 1.0370254566299278e-05,
"loss": 2.1309,
"step": 19900
},
{
"epoch": 0.4168924834285238,
"grad_norm": 1.4891283512115479,
"learning_rate": 1.0422366398290734e-05,
"loss": 2.1078,
"step": 20000
},
{
"epoch": 0.4189769458456664,
"grad_norm": 4.39108943939209,
"learning_rate": 1.0474478230282186e-05,
"loss": 2.1268,
"step": 20100
},
{
"epoch": 0.421061408262809,
"grad_norm": 5.60664701461792,
"learning_rate": 1.052659006227364e-05,
"loss": 2.1055,
"step": 20200
},
{
"epoch": 0.42314587067995163,
"grad_norm": 2.8107378482818604,
"learning_rate": 1.0578701894265094e-05,
"loss": 2.1271,
"step": 20300
},
{
"epoch": 0.42523033309709424,
"grad_norm": 1.2925037145614624,
"learning_rate": 1.0630813726256548e-05,
"loss": 2.1145,
"step": 20400
},
{
"epoch": 0.4273147955142369,
"grad_norm": 1.4800461530685425,
"learning_rate": 1.0682925558248001e-05,
"loss": 2.1142,
"step": 20500
},
{
"epoch": 0.4293992579313795,
"grad_norm": 1.6407443284988403,
"learning_rate": 1.0735037390239455e-05,
"loss": 2.1102,
"step": 20600
},
{
"epoch": 0.4314837203485221,
"grad_norm": 7.9351396560668945,
"learning_rate": 1.0787149222230908e-05,
"loss": 2.1254,
"step": 20700
},
{
"epoch": 0.4335681827656647,
"grad_norm": 1.4959195852279663,
"learning_rate": 1.0839261054222363e-05,
"loss": 2.0962,
"step": 20800
},
{
"epoch": 0.43565264518280733,
"grad_norm": 3.0826125144958496,
"learning_rate": 1.0891372886213815e-05,
"loss": 2.1306,
"step": 20900
},
{
"epoch": 0.43773710759995,
"grad_norm": 2.9349727630615234,
"learning_rate": 1.0943484718205268e-05,
"loss": 2.1095,
"step": 21000
},
{
"epoch": 0.4398215700170926,
"grad_norm": 3.080888271331787,
"learning_rate": 1.0995596550196723e-05,
"loss": 2.1147,
"step": 21100
},
{
"epoch": 0.4419060324342352,
"grad_norm": 3.0811104774475098,
"learning_rate": 1.1047708382188175e-05,
"loss": 2.1217,
"step": 21200
},
{
"epoch": 0.4439904948513778,
"grad_norm": 2.106529712677002,
"learning_rate": 1.1099820214179631e-05,
"loss": 2.1215,
"step": 21300
},
{
"epoch": 0.4460749572685204,
"grad_norm": 1.6550434827804565,
"learning_rate": 1.1151932046171083e-05,
"loss": 2.1187,
"step": 21400
},
{
"epoch": 0.4481594196856631,
"grad_norm": 1.5209170579910278,
"learning_rate": 1.1204043878162539e-05,
"loss": 2.089,
"step": 21500
},
{
"epoch": 0.4502438821028057,
"grad_norm": 4.730747699737549,
"learning_rate": 1.1256155710153991e-05,
"loss": 2.1099,
"step": 21600
},
{
"epoch": 0.4523283445199483,
"grad_norm": 1.7369521856307983,
"learning_rate": 1.1308267542145447e-05,
"loss": 2.12,
"step": 21700
},
{
"epoch": 0.4544128069370909,
"grad_norm": 3.3095576763153076,
"learning_rate": 1.1360379374136899e-05,
"loss": 2.0933,
"step": 21800
},
{
"epoch": 0.45649726935423357,
"grad_norm": 1.7241291999816895,
"learning_rate": 1.1412491206128353e-05,
"loss": 2.1042,
"step": 21900
},
{
"epoch": 0.4585817317713762,
"grad_norm": 1.6191275119781494,
"learning_rate": 1.1464603038119807e-05,
"loss": 2.0997,
"step": 22000
},
{
"epoch": 0.4606661941885188,
"grad_norm": 1.364058256149292,
"learning_rate": 1.151671487011126e-05,
"loss": 2.0988,
"step": 22100
},
{
"epoch": 0.4627506566056614,
"grad_norm": 2.2130119800567627,
"learning_rate": 1.1568826702102713e-05,
"loss": 2.0974,
"step": 22200
},
{
"epoch": 0.464835119022804,
"grad_norm": 1.5549209117889404,
"learning_rate": 1.1620938534094167e-05,
"loss": 2.1039,
"step": 22300
},
{
"epoch": 0.46691958143994666,
"grad_norm": 2.114710807800293,
"learning_rate": 1.167305036608562e-05,
"loss": 2.1095,
"step": 22400
},
{
"epoch": 0.46900404385708927,
"grad_norm": 1.5596140623092651,
"learning_rate": 1.1725162198077074e-05,
"loss": 2.108,
"step": 22500
},
{
"epoch": 0.4710885062742319,
"grad_norm": 1.5475258827209473,
"learning_rate": 1.1777274030068528e-05,
"loss": 2.0928,
"step": 22600
},
{
"epoch": 0.4731729686913745,
"grad_norm": 1.2037123441696167,
"learning_rate": 1.182938586205998e-05,
"loss": 2.0732,
"step": 22700
},
{
"epoch": 0.4752574311085171,
"grad_norm": 3.852919340133667,
"learning_rate": 1.1881497694051436e-05,
"loss": 2.0803,
"step": 22800
},
{
"epoch": 0.47734189352565976,
"grad_norm": 2.0710220336914062,
"learning_rate": 1.1933609526042888e-05,
"loss": 2.0866,
"step": 22900
},
{
"epoch": 0.47942635594280236,
"grad_norm": 2.6531641483306885,
"learning_rate": 1.1985721358034344e-05,
"loss": 2.1082,
"step": 23000
},
{
"epoch": 0.48151081835994497,
"grad_norm": 2.664095878601074,
"learning_rate": 1.2037833190025796e-05,
"loss": 2.0914,
"step": 23100
},
{
"epoch": 0.4835952807770876,
"grad_norm": 2.249077081680298,
"learning_rate": 1.2089945022017252e-05,
"loss": 2.084,
"step": 23200
},
{
"epoch": 0.4856797431942302,
"grad_norm": 1.6785763502120972,
"learning_rate": 1.2142056854008704e-05,
"loss": 2.081,
"step": 23300
},
{
"epoch": 0.48776420561137285,
"grad_norm": 1.1575652360916138,
"learning_rate": 1.2194168686000156e-05,
"loss": 2.0958,
"step": 23400
},
{
"epoch": 0.48984866802851545,
"grad_norm": 1.2044528722763062,
"learning_rate": 1.2246280517991612e-05,
"loss": 2.0845,
"step": 23500
},
{
"epoch": 0.49193313044565806,
"grad_norm": 1.905847430229187,
"learning_rate": 1.2298392349983064e-05,
"loss": 2.0728,
"step": 23600
},
{
"epoch": 0.49401759286280067,
"grad_norm": 1.8150228261947632,
"learning_rate": 1.235050418197452e-05,
"loss": 2.1076,
"step": 23700
},
{
"epoch": 0.4961020552799433,
"grad_norm": 1.8528637886047363,
"learning_rate": 1.2402616013965972e-05,
"loss": 2.0719,
"step": 23800
},
{
"epoch": 0.49818651769708594,
"grad_norm": 2.793405532836914,
"learning_rate": 1.2454727845957426e-05,
"loss": 2.0775,
"step": 23900
},
{
"epoch": 0.5002709801142285,
"grad_norm": 1.9311914443969727,
"learning_rate": 1.250683967794888e-05,
"loss": 2.0741,
"step": 24000
},
{
"epoch": 0.5023554425313712,
"grad_norm": 1.2110543251037598,
"learning_rate": 1.2558951509940333e-05,
"loss": 2.0913,
"step": 24100
},
{
"epoch": 0.5044399049485138,
"grad_norm": 3.272123336791992,
"learning_rate": 1.2611063341931786e-05,
"loss": 2.0925,
"step": 24200
},
{
"epoch": 0.5065243673656564,
"grad_norm": 8.41162395477295,
"learning_rate": 1.2663175173923241e-05,
"loss": 2.077,
"step": 24300
},
{
"epoch": 0.508608829782799,
"grad_norm": 1.559525489807129,
"learning_rate": 1.2715287005914693e-05,
"loss": 2.0946,
"step": 24400
},
{
"epoch": 0.5106932921999416,
"grad_norm": 15.453913688659668,
"learning_rate": 1.2767398837906149e-05,
"loss": 2.0828,
"step": 24500
},
{
"epoch": 0.5127777546170843,
"grad_norm": 2.0540664196014404,
"learning_rate": 1.2819510669897601e-05,
"loss": 2.0717,
"step": 24600
},
{
"epoch": 0.5148622170342269,
"grad_norm": 2.6987600326538086,
"learning_rate": 1.2871622501889053e-05,
"loss": 2.071,
"step": 24700
},
{
"epoch": 0.5169466794513695,
"grad_norm": 2.532989025115967,
"learning_rate": 1.2923734333880509e-05,
"loss": 2.0601,
"step": 24800
},
{
"epoch": 0.5190311418685121,
"grad_norm": 2.437511444091797,
"learning_rate": 1.2975846165871961e-05,
"loss": 2.0705,
"step": 24900
},
{
"epoch": 0.5211156042856547,
"grad_norm": 1.8108067512512207,
"learning_rate": 1.3027957997863417e-05,
"loss": 2.0959,
"step": 25000
},
{
"epoch": 0.5232000667027974,
"grad_norm": 2.173082113265991,
"learning_rate": 1.3080069829854869e-05,
"loss": 2.0635,
"step": 25100
},
{
"epoch": 0.52528452911994,
"grad_norm": 1.5072120428085327,
"learning_rate": 1.3132181661846325e-05,
"loss": 2.0631,
"step": 25200
},
{
"epoch": 0.5273689915370826,
"grad_norm": 1.1707432270050049,
"learning_rate": 1.3184293493837777e-05,
"loss": 2.0819,
"step": 25300
},
{
"epoch": 0.5294534539542252,
"grad_norm": 1.753009557723999,
"learning_rate": 1.323640532582923e-05,
"loss": 2.0948,
"step": 25400
},
{
"epoch": 0.5315379163713678,
"grad_norm": 2.3794548511505127,
"learning_rate": 1.3288517157820685e-05,
"loss": 2.0517,
"step": 25500
},
{
"epoch": 0.5336223787885105,
"grad_norm": 2.1440374851226807,
"learning_rate": 1.3340628989812138e-05,
"loss": 2.0593,
"step": 25600
},
{
"epoch": 0.535706841205653,
"grad_norm": 2.6394147872924805,
"learning_rate": 1.3392740821803592e-05,
"loss": 2.07,
"step": 25700
},
{
"epoch": 0.5377913036227957,
"grad_norm": 6.6029229164123535,
"learning_rate": 1.3444852653795046e-05,
"loss": 2.0689,
"step": 25800
},
{
"epoch": 0.5398757660399383,
"grad_norm": 1.942874789237976,
"learning_rate": 1.3496964485786498e-05,
"loss": 2.0675,
"step": 25900
},
{
"epoch": 0.5419602284570809,
"grad_norm": 1.4140956401824951,
"learning_rate": 1.3549076317777952e-05,
"loss": 2.0671,
"step": 26000
},
{
"epoch": 0.5440446908742236,
"grad_norm": 1.1899230480194092,
"learning_rate": 1.3601188149769406e-05,
"loss": 2.0552,
"step": 26100
},
{
"epoch": 0.5461291532913661,
"grad_norm": 1.152624249458313,
"learning_rate": 1.3653299981760858e-05,
"loss": 2.0732,
"step": 26200
},
{
"epoch": 0.5482136157085088,
"grad_norm": 4.112461566925049,
"learning_rate": 1.3705411813752314e-05,
"loss": 2.0646,
"step": 26300
},
{
"epoch": 0.5502980781256513,
"grad_norm": 1.8623366355895996,
"learning_rate": 1.3757523645743766e-05,
"loss": 2.0434,
"step": 26400
},
{
"epoch": 0.552382540542794,
"grad_norm": 1.2094752788543701,
"learning_rate": 1.3809635477735222e-05,
"loss": 2.0554,
"step": 26500
},
{
"epoch": 0.5544670029599367,
"grad_norm": 2.651811361312866,
"learning_rate": 1.3861747309726674e-05,
"loss": 2.0483,
"step": 26600
},
{
"epoch": 0.5565514653770792,
"grad_norm": 1.336885690689087,
"learning_rate": 1.391385914171813e-05,
"loss": 2.0609,
"step": 26700
},
{
"epoch": 0.5586359277942219,
"grad_norm": 1.191576600074768,
"learning_rate": 1.3965970973709582e-05,
"loss": 2.0517,
"step": 26800
},
{
"epoch": 0.5607203902113644,
"grad_norm": 4.2899885177612305,
"learning_rate": 1.4018082805701037e-05,
"loss": 2.0407,
"step": 26900
},
{
"epoch": 0.5628048526285071,
"grad_norm": 1.5923058986663818,
"learning_rate": 1.407019463769249e-05,
"loss": 2.0518,
"step": 27000
},
{
"epoch": 0.5648893150456498,
"grad_norm": 1.543811559677124,
"learning_rate": 1.4122306469683942e-05,
"loss": 2.0483,
"step": 27100
},
{
"epoch": 0.5669737774627923,
"grad_norm": 1.221245288848877,
"learning_rate": 1.4174418301675397e-05,
"loss": 2.0434,
"step": 27200
},
{
"epoch": 0.569058239879935,
"grad_norm": 1.0497828722000122,
"learning_rate": 1.422653013366685e-05,
"loss": 2.0598,
"step": 27300
},
{
"epoch": 0.5711427022970775,
"grad_norm": 1.603359341621399,
"learning_rate": 1.4278641965658303e-05,
"loss": 2.0547,
"step": 27400
},
{
"epoch": 0.5732271647142202,
"grad_norm": 2.4841086864471436,
"learning_rate": 1.4330753797649757e-05,
"loss": 2.0493,
"step": 27500
},
{
"epoch": 0.5753116271313629,
"grad_norm": 2.896749496459961,
"learning_rate": 1.4382865629641211e-05,
"loss": 2.0595,
"step": 27600
},
{
"epoch": 0.5773960895485054,
"grad_norm": 1.320147156715393,
"learning_rate": 1.4434977461632665e-05,
"loss": 2.0455,
"step": 27700
},
{
"epoch": 0.5794805519656481,
"grad_norm": 2.056804895401001,
"learning_rate": 1.4487089293624119e-05,
"loss": 2.0473,
"step": 27800
},
{
"epoch": 0.5815650143827906,
"grad_norm": 1.9830574989318848,
"learning_rate": 1.4539201125615571e-05,
"loss": 2.0709,
"step": 27900
},
{
"epoch": 0.5836494767999333,
"grad_norm": 1.7711538076400757,
"learning_rate": 1.4591312957607027e-05,
"loss": 2.0381,
"step": 28000
},
{
"epoch": 0.585733939217076,
"grad_norm": 2.1394202709198,
"learning_rate": 1.4643424789598479e-05,
"loss": 2.0267,
"step": 28100
},
{
"epoch": 0.5878184016342185,
"grad_norm": 1.3552168607711792,
"learning_rate": 1.4695536621589935e-05,
"loss": 2.0718,
"step": 28200
},
{
"epoch": 0.5899028640513612,
"grad_norm": 1.2812672853469849,
"learning_rate": 1.4747648453581387e-05,
"loss": 2.0546,
"step": 28300
},
{
"epoch": 0.5919873264685038,
"grad_norm": 1.9260282516479492,
"learning_rate": 1.4799760285572839e-05,
"loss": 2.0402,
"step": 28400
},
{
"epoch": 0.5940717888856464,
"grad_norm": 1.829573392868042,
"learning_rate": 1.4851872117564295e-05,
"loss": 2.0429,
"step": 28500
},
{
"epoch": 0.596156251302789,
"grad_norm": 1.2826175689697266,
"learning_rate": 1.4903983949555747e-05,
"loss": 2.0297,
"step": 28600
},
{
"epoch": 0.5982407137199316,
"grad_norm": 1.3624961376190186,
"learning_rate": 1.4956095781547202e-05,
"loss": 2.0591,
"step": 28700
},
{
"epoch": 0.6003251761370743,
"grad_norm": 1.4224090576171875,
"learning_rate": 1.5008207613538655e-05,
"loss": 2.051,
"step": 28800
},
{
"epoch": 0.6024096385542169,
"grad_norm": 1.3509138822555542,
"learning_rate": 1.5060319445530109e-05,
"loss": 2.0488,
"step": 28900
},
{
"epoch": 0.6044941009713595,
"grad_norm": 2.7724287509918213,
"learning_rate": 1.5112431277521562e-05,
"loss": 2.0405,
"step": 29000
},
{
"epoch": 0.6065785633885021,
"grad_norm": 2.30072283744812,
"learning_rate": 1.5164543109513016e-05,
"loss": 2.0289,
"step": 29100
},
{
"epoch": 0.6086630258056447,
"grad_norm": 3.4419546127319336,
"learning_rate": 1.521665494150447e-05,
"loss": 2.0412,
"step": 29200
},
{
"epoch": 0.6107474882227873,
"grad_norm": 1.519679307937622,
"learning_rate": 1.5268766773495924e-05,
"loss": 2.0515,
"step": 29300
},
{
"epoch": 0.61283195063993,
"grad_norm": 2.3584392070770264,
"learning_rate": 1.5320878605487378e-05,
"loss": 2.057,
"step": 29400
},
{
"epoch": 0.6149164130570726,
"grad_norm": 1.6007986068725586,
"learning_rate": 1.5372990437478832e-05,
"loss": 2.0314,
"step": 29500
},
{
"epoch": 0.6170008754742152,
"grad_norm": 1.243009090423584,
"learning_rate": 1.5425102269470286e-05,
"loss": 2.0361,
"step": 29600
},
{
"epoch": 0.6190853378913578,
"grad_norm": 1.230760097503662,
"learning_rate": 1.5477214101461736e-05,
"loss": 2.0283,
"step": 29700
},
{
"epoch": 0.6211698003085004,
"grad_norm": 2.167958974838257,
"learning_rate": 1.5529325933453194e-05,
"loss": 2.0361,
"step": 29800
},
{
"epoch": 0.6232542627256431,
"grad_norm": 1.149268388748169,
"learning_rate": 1.5581437765444644e-05,
"loss": 2.0357,
"step": 29900
},
{
"epoch": 0.6253387251427857,
"grad_norm": 1.1137298345565796,
"learning_rate": 1.5633549597436098e-05,
"loss": 2.032,
"step": 30000
},
{
"epoch": 0.6274231875599283,
"grad_norm": 2.2308285236358643,
"learning_rate": 1.5685661429427552e-05,
"loss": 2.0435,
"step": 30100
},
{
"epoch": 0.6295076499770709,
"grad_norm": 1.1314657926559448,
"learning_rate": 1.5737773261419006e-05,
"loss": 2.038,
"step": 30200
},
{
"epoch": 0.6315921123942135,
"grad_norm": 1.2725504636764526,
"learning_rate": 1.578988509341046e-05,
"loss": 2.0499,
"step": 30300
},
{
"epoch": 0.6336765748113562,
"grad_norm": 1.6600308418273926,
"learning_rate": 1.5841996925401914e-05,
"loss": 2.0207,
"step": 30400
},
{
"epoch": 0.6357610372284987,
"grad_norm": 3.023804187774658,
"learning_rate": 1.5894108757393368e-05,
"loss": 2.0312,
"step": 30500
},
{
"epoch": 0.6378454996456414,
"grad_norm": 1.2331023216247559,
"learning_rate": 1.594622058938482e-05,
"loss": 2.023,
"step": 30600
},
{
"epoch": 0.639929962062784,
"grad_norm": 1.7768906354904175,
"learning_rate": 1.5998332421376275e-05,
"loss": 2.0373,
"step": 30700
},
{
"epoch": 0.6420144244799266,
"grad_norm": 2.608337879180908,
"learning_rate": 1.6050444253367726e-05,
"loss": 2.0293,
"step": 30800
},
{
"epoch": 0.6440988868970693,
"grad_norm": 1.0889259576797485,
"learning_rate": 1.6102556085359183e-05,
"loss": 2.0452,
"step": 30900
},
{
"epoch": 0.6461833493142118,
"grad_norm": 2.6050333976745605,
"learning_rate": 1.6154667917350634e-05,
"loss": 2.0249,
"step": 31000
},
{
"epoch": 0.6482678117313545,
"grad_norm": 4.261449813842773,
"learning_rate": 1.620677974934209e-05,
"loss": 2.0235,
"step": 31100
},
{
"epoch": 0.6503522741484971,
"grad_norm": 3.3580896854400635,
"learning_rate": 1.625889158133354e-05,
"loss": 2.0138,
"step": 31200
},
{
"epoch": 0.6524367365656397,
"grad_norm": 1.480364203453064,
"learning_rate": 1.6311003413325e-05,
"loss": 2.0214,
"step": 31300
},
{
"epoch": 0.6545211989827824,
"grad_norm": 1.1904412508010864,
"learning_rate": 1.636311524531645e-05,
"loss": 2.0299,
"step": 31400
},
{
"epoch": 0.6566056613999249,
"grad_norm": 2.2615163326263428,
"learning_rate": 1.6415227077307903e-05,
"loss": 2.0224,
"step": 31500
},
{
"epoch": 0.6586901238170676,
"grad_norm": 1.2494678497314453,
"learning_rate": 1.6467338909299357e-05,
"loss": 2.0184,
"step": 31600
},
{
"epoch": 0.6607745862342101,
"grad_norm": 1.1532669067382812,
"learning_rate": 1.651945074129081e-05,
"loss": 2.0345,
"step": 31700
},
{
"epoch": 0.6628590486513528,
"grad_norm": 1.1344257593154907,
"learning_rate": 1.6571562573282265e-05,
"loss": 2.0098,
"step": 31800
},
{
"epoch": 0.6649435110684955,
"grad_norm": 1.2463632822036743,
"learning_rate": 1.662367440527372e-05,
"loss": 1.9941,
"step": 31900
},
{
"epoch": 0.667027973485638,
"grad_norm": 1.452102541923523,
"learning_rate": 1.6675786237265173e-05,
"loss": 2.0176,
"step": 32000
},
{
"epoch": 0.6691124359027807,
"grad_norm": 7.941076278686523,
"learning_rate": 1.6727898069256626e-05,
"loss": 2.0026,
"step": 32100
},
{
"epoch": 0.6711968983199232,
"grad_norm": 1.1016764640808105,
"learning_rate": 1.678000990124808e-05,
"loss": 2.0111,
"step": 32200
},
{
"epoch": 0.6732813607370659,
"grad_norm": 1.4043099880218506,
"learning_rate": 1.6832121733239534e-05,
"loss": 2.0261,
"step": 32300
},
{
"epoch": 0.6753658231542086,
"grad_norm": 1.8107479810714722,
"learning_rate": 1.6884233565230988e-05,
"loss": 2.0145,
"step": 32400
},
{
"epoch": 0.6774502855713511,
"grad_norm": 2.062927007675171,
"learning_rate": 1.693634539722244e-05,
"loss": 2.0047,
"step": 32500
},
{
"epoch": 0.6795347479884938,
"grad_norm": 1.1044921875,
"learning_rate": 1.6988457229213896e-05,
"loss": 2.036,
"step": 32600
},
{
"epoch": 0.6816192104056363,
"grad_norm": 1.2338470220565796,
"learning_rate": 1.7040569061205346e-05,
"loss": 1.9982,
"step": 32700
},
{
"epoch": 0.683703672822779,
"grad_norm": 1.2513076066970825,
"learning_rate": 1.7092680893196804e-05,
"loss": 2.0106,
"step": 32800
},
{
"epoch": 0.6857881352399217,
"grad_norm": 2.0419886112213135,
"learning_rate": 1.7144792725188254e-05,
"loss": 1.9798,
"step": 32900
},
{
"epoch": 0.6878725976570642,
"grad_norm": 2.81988263130188,
"learning_rate": 1.719690455717971e-05,
"loss": 2.031,
"step": 33000
},
{
"epoch": 0.6899570600742069,
"grad_norm": 1.4116230010986328,
"learning_rate": 1.7249016389171162e-05,
"loss": 2.0427,
"step": 33100
},
{
"epoch": 0.6920415224913494,
"grad_norm": 1.2295700311660767,
"learning_rate": 1.7301128221162616e-05,
"loss": 2.0318,
"step": 33200
},
{
"epoch": 0.6941259849084921,
"grad_norm": 1.5113853216171265,
"learning_rate": 1.735324005315407e-05,
"loss": 2.0064,
"step": 33300
},
{
"epoch": 0.6962104473256348,
"grad_norm": 1.56009042263031,
"learning_rate": 1.7405351885145524e-05,
"loss": 1.9917,
"step": 33400
},
{
"epoch": 0.6982949097427773,
"grad_norm": 2.8100123405456543,
"learning_rate": 1.7457463717136978e-05,
"loss": 2.0319,
"step": 33500
},
{
"epoch": 0.70037937215992,
"grad_norm": 0.9532743096351624,
"learning_rate": 1.750957554912843e-05,
"loss": 1.9983,
"step": 33600
},
{
"epoch": 0.7024638345770626,
"grad_norm": 1.053802728652954,
"learning_rate": 1.7561687381119885e-05,
"loss": 1.9854,
"step": 33700
},
{
"epoch": 0.7045482969942052,
"grad_norm": 1.6877485513687134,
"learning_rate": 1.761379921311134e-05,
"loss": 1.9885,
"step": 33800
},
{
"epoch": 0.7066327594113478,
"grad_norm": 2.6555633544921875,
"learning_rate": 1.7665911045102793e-05,
"loss": 1.9824,
"step": 33900
},
{
"epoch": 0.7087172218284904,
"grad_norm": 1.2686346769332886,
"learning_rate": 1.7718022877094244e-05,
"loss": 1.9991,
"step": 34000
},
{
"epoch": 0.7108016842456331,
"grad_norm": 1.2505468130111694,
"learning_rate": 1.77701347090857e-05,
"loss": 2.013,
"step": 34100
},
{
"epoch": 0.7128861466627757,
"grad_norm": 1.2440173625946045,
"learning_rate": 1.782224654107715e-05,
"loss": 2.0002,
"step": 34200
},
{
"epoch": 0.7149706090799183,
"grad_norm": 1.239490270614624,
"learning_rate": 1.787435837306861e-05,
"loss": 2.0048,
"step": 34300
},
{
"epoch": 0.7170550714970609,
"grad_norm": 1.5791493654251099,
"learning_rate": 1.792647020506006e-05,
"loss": 1.9814,
"step": 34400
},
{
"epoch": 0.7191395339142035,
"grad_norm": 3.0702290534973145,
"learning_rate": 1.7978582037051513e-05,
"loss": 2.0036,
"step": 34500
},
{
"epoch": 0.7212239963313462,
"grad_norm": 1.0357660055160522,
"learning_rate": 1.8030693869042967e-05,
"loss": 1.9967,
"step": 34600
},
{
"epoch": 0.7233084587484888,
"grad_norm": 1.1576104164123535,
"learning_rate": 1.808280570103442e-05,
"loss": 1.9827,
"step": 34700
},
{
"epoch": 0.7253929211656314,
"grad_norm": 2.3989617824554443,
"learning_rate": 1.8134917533025875e-05,
"loss": 1.9849,
"step": 34800
},
{
"epoch": 0.727477383582774,
"grad_norm": 1.1362382173538208,
"learning_rate": 1.818702936501733e-05,
"loss": 1.9917,
"step": 34900
},
{
"epoch": 0.7295618459999166,
"grad_norm": 1.3115246295928955,
"learning_rate": 1.8239141197008783e-05,
"loss": 2.0011,
"step": 35000
},
{
"epoch": 0.7316463084170592,
"grad_norm": 1.0895345211029053,
"learning_rate": 1.8291253029000237e-05,
"loss": 2.0067,
"step": 35100
},
{
"epoch": 0.7337307708342019,
"grad_norm": 1.2841103076934814,
"learning_rate": 1.834336486099169e-05,
"loss": 1.9993,
"step": 35200
},
{
"epoch": 0.7358152332513445,
"grad_norm": 2.463681697845459,
"learning_rate": 1.8395476692983144e-05,
"loss": 1.9928,
"step": 35300
},
{
"epoch": 0.7378996956684871,
"grad_norm": 1.7267930507659912,
"learning_rate": 1.8447588524974598e-05,
"loss": 2.0095,
"step": 35400
},
{
"epoch": 0.7399841580856297,
"grad_norm": 1.6381916999816895,
"learning_rate": 1.849970035696605e-05,
"loss": 1.9895,
"step": 35500
},
{
"epoch": 0.7420686205027723,
"grad_norm": 1.1697516441345215,
"learning_rate": 1.8551812188957506e-05,
"loss": 1.9899,
"step": 35600
},
{
"epoch": 0.744153082919915,
"grad_norm": 1.7448146343231201,
"learning_rate": 1.8603924020948957e-05,
"loss": 1.9929,
"step": 35700
},
{
"epoch": 0.7462375453370576,
"grad_norm": 2.5885157585144043,
"learning_rate": 1.865603585294041e-05,
"loss": 1.9909,
"step": 35800
},
{
"epoch": 0.7483220077542002,
"grad_norm": 1.1813591718673706,
"learning_rate": 1.8708147684931864e-05,
"loss": 1.9859,
"step": 35900
},
{
"epoch": 0.7504064701713428,
"grad_norm": 1.0246374607086182,
"learning_rate": 1.8760259516923318e-05,
"loss": 1.979,
"step": 36000
},
{
"epoch": 0.7524909325884854,
"grad_norm": 1.160070538520813,
"learning_rate": 1.8812371348914772e-05,
"loss": 1.9839,
"step": 36100
},
{
"epoch": 0.7545753950056281,
"grad_norm": 1.8342937231063843,
"learning_rate": 1.8864483180906226e-05,
"loss": 2.0015,
"step": 36200
},
{
"epoch": 0.7566598574227706,
"grad_norm": 1.306425929069519,
"learning_rate": 1.891659501289768e-05,
"loss": 1.9988,
"step": 36300
},
{
"epoch": 0.7587443198399133,
"grad_norm": 1.4170417785644531,
"learning_rate": 1.8968706844889134e-05,
"loss": 1.9969,
"step": 36400
},
{
"epoch": 0.7608287822570559,
"grad_norm": 1.1687771081924438,
"learning_rate": 1.9020818676880588e-05,
"loss": 1.9492,
"step": 36500
},
{
"epoch": 0.7629132446741985,
"grad_norm": 1.840988039970398,
"learning_rate": 1.907293050887204e-05,
"loss": 1.9754,
"step": 36600
},
{
"epoch": 0.7649977070913412,
"grad_norm": 2.1230602264404297,
"learning_rate": 1.9125042340863496e-05,
"loss": 1.9562,
"step": 36700
},
{
"epoch": 0.7670821695084837,
"grad_norm": 1.1470071077346802,
"learning_rate": 1.917715417285495e-05,
"loss": 1.9794,
"step": 36800
},
{
"epoch": 0.7691666319256264,
"grad_norm": 2.209970712661743,
"learning_rate": 1.9229266004846403e-05,
"loss": 1.9639,
"step": 36900
},
{
"epoch": 0.771251094342769,
"grad_norm": 1.5943410396575928,
"learning_rate": 1.9281377836837857e-05,
"loss": 1.9853,
"step": 37000
},
{
"epoch": 0.7733355567599116,
"grad_norm": 2.3891971111297607,
"learning_rate": 1.9333489668829308e-05,
"loss": 1.9747,
"step": 37100
},
{
"epoch": 0.7754200191770543,
"grad_norm": 6.2893385887146,
"learning_rate": 1.938560150082076e-05,
"loss": 1.9744,
"step": 37200
},
{
"epoch": 0.7775044815941968,
"grad_norm": 1.0393949747085571,
"learning_rate": 1.9437713332812216e-05,
"loss": 1.975,
"step": 37300
},
{
"epoch": 0.7795889440113395,
"grad_norm": 1.9847209453582764,
"learning_rate": 1.948982516480367e-05,
"loss": 2.0046,
"step": 37400
},
{
"epoch": 0.781673406428482,
"grad_norm": 1.9294075965881348,
"learning_rate": 1.9541936996795123e-05,
"loss": 1.9717,
"step": 37500
},
{
"epoch": 0.7837578688456247,
"grad_norm": 1.2474915981292725,
"learning_rate": 1.9594048828786577e-05,
"loss": 1.975,
"step": 37600
},
{
"epoch": 0.7858423312627674,
"grad_norm": 1.1927696466445923,
"learning_rate": 1.964616066077803e-05,
"loss": 1.9759,
"step": 37700
},
{
"epoch": 0.7879267936799099,
"grad_norm": 1.0184850692749023,
"learning_rate": 1.9698272492769485e-05,
"loss": 1.9819,
"step": 37800
},
{
"epoch": 0.7900112560970526,
"grad_norm": 1.4298509359359741,
"learning_rate": 1.975038432476094e-05,
"loss": 1.9711,
"step": 37900
},
{
"epoch": 0.7920957185141951,
"grad_norm": 1.0294862985610962,
"learning_rate": 1.9802496156752393e-05,
"loss": 1.996,
"step": 38000
},
{
"epoch": 0.7941801809313378,
"grad_norm": 1.0123859643936157,
"learning_rate": 1.9854607988743847e-05,
"loss": 1.983,
"step": 38100
},
{
"epoch": 0.7962646433484805,
"grad_norm": 1.6565953493118286,
"learning_rate": 1.9906719820735297e-05,
"loss": 1.9865,
"step": 38200
},
{
"epoch": 0.798349105765623,
"grad_norm": 1.0507214069366455,
"learning_rate": 1.9958831652726755e-05,
"loss": 1.9808,
"step": 38300
},
{
"epoch": 0.8004335681827657,
"grad_norm": 1.4064078330993652,
"learning_rate": 1.9999763616765983e-05,
"loss": 1.9825,
"step": 38400
},
{
"epoch": 0.8025180305999083,
"grad_norm": 1.7041994333267212,
"learning_rate": 1.9992153179340215e-05,
"loss": 1.9614,
"step": 38500
},
{
"epoch": 0.8046024930170509,
"grad_norm": 2.012880802154541,
"learning_rate": 1.9973831740621872e-05,
"loss": 1.9654,
"step": 38600
},
{
"epoch": 0.8066869554341936,
"grad_norm": 1.9788340330123901,
"learning_rate": 1.9944818940117093e-05,
"loss": 1.98,
"step": 38700
},
{
"epoch": 0.8087714178513361,
"grad_norm": 1.257907748222351,
"learning_rate": 1.9905145877843883e-05,
"loss": 1.9782,
"step": 38800
},
{
"epoch": 0.8108558802684788,
"grad_norm": 1.1625444889068604,
"learning_rate": 1.9854855080994695e-05,
"loss": 1.9707,
"step": 38900
},
{
"epoch": 0.8129403426856214,
"grad_norm": 1.428716778755188,
"learning_rate": 1.9794000458349786e-05,
"loss": 1.9624,
"step": 39000
},
{
"epoch": 0.815024805102764,
"grad_norm": 1.1273753643035889,
"learning_rate": 1.9722647242490173e-05,
"loss": 1.9638,
"step": 39100
},
{
"epoch": 0.8171092675199066,
"grad_norm": 1.8102099895477295,
"learning_rate": 1.9640871919872137e-05,
"loss": 1.9671,
"step": 39200
},
{
"epoch": 0.8191937299370492,
"grad_norm": 1.0880202054977417,
"learning_rate": 1.9548762148838235e-05,
"loss": 1.9693,
"step": 39300
},
{
"epoch": 0.8212781923541919,
"grad_norm": 1.2022266387939453,
"learning_rate": 1.94464166656527e-05,
"loss": 1.9723,
"step": 39400
},
{
"epoch": 0.8233626547713345,
"grad_norm": 1.6763168573379517,
"learning_rate": 1.933394517866198e-05,
"loss": 1.9473,
"step": 39500
},
{
"epoch": 0.8254471171884771,
"grad_norm": 1.172217607498169,
"learning_rate": 1.9211468250693814e-05,
"loss": 1.9732,
"step": 39600
},
{
"epoch": 0.8275315796056197,
"grad_norm": 1.1193639039993286,
"learning_rate": 1.907911716982097e-05,
"loss": 1.9503,
"step": 39700
},
{
"epoch": 0.8296160420227623,
"grad_norm": 1.0473952293395996,
"learning_rate": 1.8937033808628157e-05,
"loss": 1.9676,
"step": 39800
},
{
"epoch": 0.831700504439905,
"grad_norm": 0.9671579599380493,
"learning_rate": 1.8785370472132925e-05,
"loss": 1.9703,
"step": 39900
},
{
"epoch": 0.8337849668570476,
"grad_norm": 2.828726291656494,
"learning_rate": 1.8624289734523655e-05,
"loss": 1.9747,
"step": 40000
},
{
"epoch": 0.8358694292741902,
"grad_norm": 1.1326168775558472,
"learning_rate": 1.8453964264889555e-05,
"loss": 1.9622,
"step": 40100
},
{
"epoch": 0.8379538916913328,
"grad_norm": 1.3057445287704468,
"learning_rate": 1.8274576642129552e-05,
"loss": 1.9578,
"step": 40200
},
{
"epoch": 0.8400383541084754,
"grad_norm": 1.1235833168029785,
"learning_rate": 1.8086319159238442e-05,
"loss": 1.9577,
"step": 40300
},
{
"epoch": 0.842122816525618,
"grad_norm": 1.126379132270813,
"learning_rate": 1.7889393617180074e-05,
"loss": 1.9582,
"step": 40400
},
{
"epoch": 0.8442072789427607,
"grad_norm": 1.1817550659179688,
"learning_rate": 1.7684011108568593e-05,
"loss": 1.9492,
"step": 40500
},
{
"epoch": 0.8462917413599033,
"grad_norm": 4.150036334991455,
"learning_rate": 1.747039179138954e-05,
"loss": 1.9373,
"step": 40600
},
{
"epoch": 0.8483762037770459,
"grad_norm": 1.1571305990219116,
"learning_rate": 1.7248764653003433e-05,
"loss": 1.9573,
"step": 40700
},
{
"epoch": 0.8504606661941885,
"grad_norm": 0.9550127387046814,
"learning_rate": 1.701936726468477e-05,
"loss": 1.9825,
"step": 40800
},
{
"epoch": 0.8525451286113311,
"grad_norm": 1.2614582777023315,
"learning_rate": 1.6782445526959553e-05,
"loss": 1.9543,
"step": 40900
},
{
"epoch": 0.8546295910284738,
"grad_norm": 1.3649072647094727,
"learning_rate": 1.65382534060144e-05,
"loss": 1.9612,
"step": 41000
},
{
"epoch": 0.8567140534456164,
"grad_norm": 1.1681647300720215,
"learning_rate": 1.628705266145969e-05,
"loss": 1.9598,
"step": 41100
},
{
"epoch": 0.858798515862759,
"grad_norm": 1.2847682237625122,
"learning_rate": 1.6029112565738647e-05,
"loss": 1.9581,
"step": 41200
},
{
"epoch": 0.8608829782799016,
"grad_norm": 1.1968718767166138,
"learning_rate": 1.5764709615483084e-05,
"loss": 1.9368,
"step": 41300
},
{
"epoch": 0.8629674406970442,
"grad_norm": 1.6897144317626953,
"learning_rate": 1.549412723512526e-05,
"loss": 1.9419,
"step": 41400
},
{
"epoch": 0.8650519031141869,
"grad_norm": 1.051757574081421,
"learning_rate": 1.521765547308355e-05,
"loss": 1.9491,
"step": 41500
},
{
"epoch": 0.8671363655313294,
"grad_norm": 1.1903842687606812,
"learning_rate": 1.4935590690847555e-05,
"loss": 1.947,
"step": 41600
},
{
"epoch": 0.8692208279484721,
"grad_norm": 1.3480621576309204,
"learning_rate": 1.464823524529601e-05,
"loss": 1.9539,
"step": 41700
},
{
"epoch": 0.8713052903656147,
"grad_norm": 1.382934808731079,
"learning_rate": 1.4355897164587957e-05,
"loss": 1.9607,
"step": 41800
},
{
"epoch": 0.8733897527827573,
"grad_norm": 1.0020027160644531,
"learning_rate": 1.4058889817974664e-05,
"loss": 1.9393,
"step": 41900
},
{
"epoch": 0.8754742151999,
"grad_norm": 0.9402817487716675,
"learning_rate": 1.3757531579886186e-05,
"loss": 1.9524,
"step": 42000
},
{
"epoch": 0.8775586776170425,
"grad_norm": 1.1450611352920532,
"learning_rate": 1.345214548865267e-05,
"loss": 1.9806,
"step": 42100
},
{
"epoch": 0.8796431400341852,
"grad_norm": 1.3311893939971924,
"learning_rate": 1.3143058900226262e-05,
"loss": 1.9735,
"step": 42200
},
{
"epoch": 0.8817276024513278,
"grad_norm": 1.2113134860992432,
"learning_rate": 1.2830603137274729e-05,
"loss": 1.9428,
"step": 42300
},
{
"epoch": 0.8838120648684704,
"grad_norm": 3.3376879692077637,
"learning_rate": 1.2515113134023035e-05,
"loss": 1.9595,
"step": 42400
},
{
"epoch": 0.8858965272856131,
"grad_norm": 41.23748016357422,
"learning_rate": 1.2196927077223523e-05,
"loss": 1.9483,
"step": 42500
},
{
"epoch": 0.8879809897027556,
"grad_norm": 2.1873326301574707,
"learning_rate": 1.187638604363958e-05,
"loss": 1.9469,
"step": 42600
},
{
"epoch": 0.8900654521198983,
"grad_norm": 1.1909996271133423,
"learning_rate": 1.1553833634431375e-05,
"loss": 1.956,
"step": 42700
},
{
"epoch": 0.8921499145370408,
"grad_norm": 0.9936875700950623,
"learning_rate": 1.1229615606835609e-05,
"loss": 1.9309,
"step": 42800
},
{
"epoch": 0.8942343769541835,
"grad_norm": 1.1771327257156372,
"learning_rate": 1.0904079503534057e-05,
"loss": 1.953,
"step": 42900
},
{
"epoch": 0.8963188393713262,
"grad_norm": 0.9900943636894226,
"learning_rate": 1.0577574280108226e-05,
"loss": 1.9461,
"step": 43000
},
{
"epoch": 0.8984033017884687,
"grad_norm": 1.4324864149093628,
"learning_rate": 1.0250449930979484e-05,
"loss": 1.9477,
"step": 43100
},
{
"epoch": 0.9004877642056114,
"grad_norm": 1.1139860153198242,
"learning_rate": 9.92305711423557e-06,
"loss": 1.9354,
"step": 43200
},
{
"epoch": 0.9025722266227539,
"grad_norm": 1.8465943336486816,
"learning_rate": 9.595746775745753e-06,
"loss": 1.9447,
"step": 43300
},
{
"epoch": 0.9046566890398966,
"grad_norm": 1.6876327991485596,
"learning_rate": 9.268869772967448e-06,
"loss": 1.9309,
"step": 43400
},
{
"epoch": 0.9067411514570393,
"grad_norm": 1.6580865383148193,
"learning_rate": 8.942776498847645e-06,
"loss": 1.9482,
"step": 43500
},
{
"epoch": 0.9088256138741818,
"grad_norm": 1.1271007061004639,
"learning_rate": 8.617816506222246e-06,
"loss": 1.9408,
"step": 43600
},
{
"epoch": 0.9109100762913245,
"grad_norm": 0.9445035457611084,
"learning_rate": 8.294338133115956e-06,
"loss": 1.935,
"step": 43700
},
{
"epoch": 0.9129945387084671,
"grad_norm": 1.051377534866333,
"learning_rate": 7.972688129344382e-06,
"loss": 1.9538,
"step": 43800
},
{
"epoch": 0.9150790011256097,
"grad_norm": 1.6551326513290405,
"learning_rate": 7.653211284818598e-06,
"loss": 1.9404,
"step": 43900
},
{
"epoch": 0.9171634635427524,
"grad_norm": 1.1971542835235596,
"learning_rate": 7.336250059950618e-06,
"loss": 1.9294,
"step": 44000
},
{
"epoch": 0.9192479259598949,
"grad_norm": 1.6140626668930054,
"learning_rate": 7.0221442185559375e-06,
"loss": 1.9401,
"step": 44100
},
{
"epoch": 0.9213323883770376,
"grad_norm": 1.212536096572876,
"learning_rate": 6.711230463646687e-06,
"loss": 1.9478,
"step": 44200
},
{
"epoch": 0.9234168507941802,
"grad_norm": 25.560195922851562,
"learning_rate": 6.4038420765057915e-06,
"loss": 1.9375,
"step": 44300
},
{
"epoch": 0.9255013132113228,
"grad_norm": 2.924628257751465,
"learning_rate": 6.100308559428989e-06,
"loss": 1.928,
"step": 44400
},
{
"epoch": 0.9275857756284654,
"grad_norm": 1.2299394607543945,
"learning_rate": 5.800955282517735e-06,
"loss": 1.9343,
"step": 44500
},
{
"epoch": 0.929670238045608,
"grad_norm": 2.705091714859009,
"learning_rate": 5.50610313490157e-06,
"loss": 1.9602,
"step": 44600
},
{
"epoch": 0.9317547004627507,
"grad_norm": 1.0268359184265137,
"learning_rate": 5.2160681807638155e-06,
"loss": 1.944,
"step": 44700
},
{
"epoch": 0.9338391628798933,
"grad_norm": 1.0274441242218018,
"learning_rate": 4.931161320539333e-06,
"loss": 1.9416,
"step": 44800
},
{
"epoch": 0.9359236252970359,
"grad_norm": 1.2513482570648193,
"learning_rate": 4.6516879576475195e-06,
"loss": 1.9414,
"step": 44900
},
{
"epoch": 0.9380080877141785,
"grad_norm": 1.1307276487350464,
"learning_rate": 4.377947671117785e-06,
"loss": 1.9272,
"step": 45000
},
{
"epoch": 0.9400925501313211,
"grad_norm": 1.0546061992645264,
"learning_rate": 4.110233894458426e-06,
"loss": 1.9381,
"step": 45100
},
{
"epoch": 0.9421770125484638,
"grad_norm": 1.4873950481414795,
"learning_rate": 3.848833601113141e-06,
"loss": 1.952,
"step": 45200
},
{
"epoch": 0.9442614749656064,
"grad_norm": 1.4963133335113525,
"learning_rate": 3.594026996842356e-06,
"loss": 1.9269,
"step": 45300
},
{
"epoch": 0.946345937382749,
"grad_norm": 1.6361576318740845,
"learning_rate": 3.3460872193591042e-06,
"loss": 1.9366,
"step": 45400
},
{
"epoch": 0.9484303997998916,
"grad_norm": 0.9263481497764587,
"learning_rate": 3.1052800455414456e-06,
"loss": 1.9368,
"step": 45500
},
{
"epoch": 0.9505148622170342,
"grad_norm": 5.031856060028076,
"learning_rate": 2.8718636065352667e-06,
"loss": 1.9247,
"step": 45600
},
{
"epoch": 0.9525993246341768,
"grad_norm": 1.3974571228027344,
"learning_rate": 2.646088111052847e-06,
"loss": 1.9392,
"step": 45700
},
{
"epoch": 0.9546837870513195,
"grad_norm": 1.053136944770813,
"learning_rate": 2.4281955771638177e-06,
"loss": 1.934,
"step": 45800
},
{
"epoch": 0.9567682494684621,
"grad_norm": 1.02759850025177,
"learning_rate": 2.218419572866011e-06,
"loss": 1.947,
"step": 45900
},
{
"epoch": 0.9588527118856047,
"grad_norm": 1.1901452541351318,
"learning_rate": 2.0169849657142825e-06,
"loss": 1.937,
"step": 46000
},
{
"epoch": 0.9609371743027473,
"grad_norm": 1.374199628829956,
"learning_rate": 1.8241076817757041e-06,
"loss": 1.941,
"step": 46100
},
{
"epoch": 0.9630216367198899,
"grad_norm": 0.9044139385223389,
"learning_rate": 1.6399944741694962e-06,
"loss": 1.9477,
"step": 46200
},
{
"epoch": 0.9651060991370326,
"grad_norm": 1.1857671737670898,
"learning_rate": 1.4648427014398336e-06,
"loss": 1.9561,
"step": 46300
},
{
"epoch": 0.9671905615541752,
"grad_norm": 1.1466737985610962,
"learning_rate": 1.2988401159990793e-06,
"loss": 1.9396,
"step": 46400
},
{
"epoch": 0.9692750239713178,
"grad_norm": 1.0394240617752075,
"learning_rate": 1.1421646628682215e-06,
"loss": 1.9217,
"step": 46500
},
{
"epoch": 0.9713594863884604,
"grad_norm": 1.18535315990448,
"learning_rate": 9.949842889302675e-07,
"loss": 1.9473,
"step": 46600
},
{
"epoch": 0.973443948805603,
"grad_norm": 0.9806298017501831,
"learning_rate": 8.574567629010489e-07,
"loss": 1.9415,
"step": 46700
},
{
"epoch": 0.9755284112227457,
"grad_norm": 1.027891993522644,
"learning_rate": 7.297295062104215e-07,
"loss": 1.9392,
"step": 46800
},
{
"epoch": 0.9776128736398882,
"grad_norm": 12.703132629394531,
"learning_rate": 6.119394349751628e-07,
"loss": 1.9455,
"step": 46900
},
{
"epoch": 0.9796973360570309,
"grad_norm": 1.4290614128112793,
"learning_rate": 5.042128132329205e-07,
"loss": 1.9533,
"step": 47000
},
{
"epoch": 0.9817817984741735,
"grad_norm": 1.15633225440979,
"learning_rate": 4.0666511759459457e-07,
"loss": 1.946,
"step": 47100
},
{
"epoch": 0.9838662608913161,
"grad_norm": 1.006787896156311,
"learning_rate": 3.19400913460185e-07,
"loss": 1.9255,
"step": 47200
},
{
"epoch": 0.9859507233084588,
"grad_norm": 17.578834533691406,
"learning_rate": 2.4251374293082885e-07,
"loss": 1.9303,
"step": 47300
},
{
"epoch": 0.9880351857256013,
"grad_norm": 3.2122883796691895,
"learning_rate": 1.7608602453716205e-07,
"loss": 1.9326,
"step": 47400
},
{
"epoch": 0.990119648142744,
"grad_norm": 1.0698961019515991,
"learning_rate": 1.2018896489149333e-07,
"loss": 1.9339,
"step": 47500
},
{
"epoch": 0.9922041105598866,
"grad_norm": 1.2213988304138184,
"learning_rate": 7.488248235850038e-08,
"loss": 1.9379,
"step": 47600
},
{
"epoch": 0.9942885729770292,
"grad_norm": 1.0943001508712769,
"learning_rate": 4.0215142826263155e-08,
"loss": 1.9595,
"step": 47700
},
{
"epoch": 0.9963730353941719,
"grad_norm": 1.1492195129394531,
"learning_rate": 1.6224107646479747e-08,
"loss": 1.9347,
"step": 47800
},
{
"epoch": 0.9984574978113144,
"grad_norm": 1.0283631086349487,
"learning_rate": 2.935093799688593e-09,
"loss": 1.9271,
"step": 47900
},
{
"epoch": 1.0,
"step": 47974,
"total_flos": 5.740611116623462e+18,
"train_loss": 2.363025343879811,
"train_runtime": 6339.5944,
"train_samples_per_second": 60.539,
"train_steps_per_second": 7.567
}
],
"logging_steps": 100,
"max_steps": 47974,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.740611116623462e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}