{
"best_metric": 6.867819786071777,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 0.023317678741502184,
"eval_steps": 25,
"global_step": 142,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00016420900522184638,
"grad_norm": 0.18168503046035767,
"learning_rate": 2e-05,
"loss": 6.9428,
"step": 1
},
{
"epoch": 0.00016420900522184638,
"eval_loss": 6.944418907165527,
"eval_runtime": 0.6891,
"eval_samples_per_second": 72.556,
"eval_steps_per_second": 13.06,
"step": 1
},
{
"epoch": 0.00032841801044369276,
"grad_norm": 0.1871073693037033,
"learning_rate": 4e-05,
"loss": 6.9417,
"step": 2
},
{
"epoch": 0.0004926270156655391,
"grad_norm": 0.18547073006629944,
"learning_rate": 6e-05,
"loss": 6.9422,
"step": 3
},
{
"epoch": 0.0006568360208873855,
"grad_norm": 0.19758914411067963,
"learning_rate": 8e-05,
"loss": 6.9364,
"step": 4
},
{
"epoch": 0.0008210450261092318,
"grad_norm": 0.1958891600370407,
"learning_rate": 0.0001,
"loss": 6.9404,
"step": 5
},
{
"epoch": 0.0009852540313310782,
"grad_norm": 0.20084300637245178,
"learning_rate": 9.99881689824633e-05,
"loss": 6.9411,
"step": 6
},
{
"epoch": 0.0011494630365529246,
"grad_norm": 0.19459381699562073,
"learning_rate": 9.995268215087426e-05,
"loss": 6.9339,
"step": 7
},
{
"epoch": 0.001313672041774771,
"grad_norm": 0.20113036036491394,
"learning_rate": 9.989355816502525e-05,
"loss": 6.9377,
"step": 8
},
{
"epoch": 0.0014778810469966172,
"grad_norm": 0.207706481218338,
"learning_rate": 9.981082811366797e-05,
"loss": 6.9359,
"step": 9
},
{
"epoch": 0.0016420900522184636,
"grad_norm": 0.17941485345363617,
"learning_rate": 9.970453549816632e-05,
"loss": 6.9367,
"step": 10
},
{
"epoch": 0.00180629905744031,
"grad_norm": 0.20235049724578857,
"learning_rate": 9.957473620962246e-05,
"loss": 6.9335,
"step": 11
},
{
"epoch": 0.0019705080626621564,
"grad_norm": 0.2030957043170929,
"learning_rate": 9.94214984994879e-05,
"loss": 6.9291,
"step": 12
},
{
"epoch": 0.0021347170678840026,
"grad_norm": 0.21488922834396362,
"learning_rate": 9.924490294367533e-05,
"loss": 6.9362,
"step": 13
},
{
"epoch": 0.0022989260731058492,
"grad_norm": 0.2119452953338623,
"learning_rate": 9.904504240019e-05,
"loss": 6.9315,
"step": 14
},
{
"epoch": 0.0024631350783276954,
"grad_norm": 0.2180822491645813,
"learning_rate": 9.88220219603028e-05,
"loss": 6.9296,
"step": 15
},
{
"epoch": 0.002627344083549542,
"grad_norm": 0.19745118916034698,
"learning_rate": 9.85759588932908e-05,
"loss": 6.9261,
"step": 16
},
{
"epoch": 0.0027915530887713882,
"grad_norm": 0.20321951806545258,
"learning_rate": 9.830698258477458e-05,
"loss": 6.9285,
"step": 17
},
{
"epoch": 0.0029557620939932344,
"grad_norm": 0.2177504599094391,
"learning_rate": 9.801523446868399e-05,
"loss": 6.9288,
"step": 18
},
{
"epoch": 0.003119971099215081,
"grad_norm": 0.21581852436065674,
"learning_rate": 9.770086795288913e-05,
"loss": 6.9266,
"step": 19
},
{
"epoch": 0.0032841801044369272,
"grad_norm": 0.22525547444820404,
"learning_rate": 9.736404833853502e-05,
"loss": 6.9243,
"step": 20
},
{
"epoch": 0.003448389109658774,
"grad_norm": 0.24186238646507263,
"learning_rate": 9.700495273312223e-05,
"loss": 6.92,
"step": 21
},
{
"epoch": 0.00361259811488062,
"grad_norm": 0.21517185866832733,
"learning_rate": 9.662376995737989e-05,
"loss": 6.9185,
"step": 22
},
{
"epoch": 0.0037768071201024662,
"grad_norm": 0.20967762172222137,
"learning_rate": 9.622070044597935e-05,
"loss": 6.9212,
"step": 23
},
{
"epoch": 0.003941016125324313,
"grad_norm": 0.23955151438713074,
"learning_rate": 9.579595614214087e-05,
"loss": 6.9187,
"step": 24
},
{
"epoch": 0.0041052251305461595,
"grad_norm": 0.228123739361763,
"learning_rate": 9.534976038618931e-05,
"loss": 6.9194,
"step": 25
},
{
"epoch": 0.0041052251305461595,
"eval_loss": 6.920280933380127,
"eval_runtime": 0.1615,
"eval_samples_per_second": 309.601,
"eval_steps_per_second": 55.728,
"step": 25
},
{
"epoch": 0.004269434135768005,
"grad_norm": 0.24925658106803894,
"learning_rate": 9.488234779811635e-05,
"loss": 6.9146,
"step": 26
},
{
"epoch": 0.004433643140989852,
"grad_norm": 0.24287429451942444,
"learning_rate": 9.439396415421204e-05,
"loss": 6.9166,
"step": 27
},
{
"epoch": 0.0045978521462116985,
"grad_norm": 0.2547374963760376,
"learning_rate": 9.388486625782995e-05,
"loss": 6.9139,
"step": 28
},
{
"epoch": 0.004762061151433544,
"grad_norm": 0.2530066668987274,
"learning_rate": 9.335532180435412e-05,
"loss": 6.9132,
"step": 29
},
{
"epoch": 0.004926270156655391,
"grad_norm": 0.2472325563430786,
"learning_rate": 9.280560924043858e-05,
"loss": 6.9146,
"step": 30
},
{
"epoch": 0.0050904791618772375,
"grad_norm": 0.25216802954673767,
"learning_rate": 9.223601761759367e-05,
"loss": 6.915,
"step": 31
},
{
"epoch": 0.005254688167099084,
"grad_norm": 0.2528752386569977,
"learning_rate": 9.164684644019624e-05,
"loss": 6.9113,
"step": 32
},
{
"epoch": 0.00541889717232093,
"grad_norm": 0.24980315566062927,
"learning_rate": 9.103840550800329e-05,
"loss": 6.9052,
"step": 33
},
{
"epoch": 0.0055831061775427765,
"grad_norm": 0.26271212100982666,
"learning_rate": 9.041101475325209e-05,
"loss": 6.9033,
"step": 34
},
{
"epoch": 0.005747315182764623,
"grad_norm": 0.2919352948665619,
"learning_rate": 8.976500407243247e-05,
"loss": 6.9013,
"step": 35
},
{
"epoch": 0.005911524187986469,
"grad_norm": 0.27319350838661194,
"learning_rate": 8.910071315281975e-05,
"loss": 6.9097,
"step": 36
},
{
"epoch": 0.0060757331932083155,
"grad_norm": 0.29328155517578125,
"learning_rate": 8.841849129385921e-05,
"loss": 6.904,
"step": 37
},
{
"epoch": 0.006239942198430162,
"grad_norm": 0.2633126676082611,
"learning_rate": 8.771869722349651e-05,
"loss": 6.9065,
"step": 38
},
{
"epoch": 0.006404151203652008,
"grad_norm": 0.2593594789505005,
"learning_rate": 8.700169890955027e-05,
"loss": 6.8937,
"step": 39
},
{
"epoch": 0.0065683602088738545,
"grad_norm": 0.2693818211555481,
"learning_rate": 8.626787336622607e-05,
"loss": 6.9007,
"step": 40
},
{
"epoch": 0.006732569214095701,
"grad_norm": 0.29676708579063416,
"learning_rate": 8.55176064558738e-05,
"loss": 6.8984,
"step": 41
},
{
"epoch": 0.006896778219317548,
"grad_norm": 0.3058428168296814,
"learning_rate": 8.475129268609227e-05,
"loss": 6.8938,
"step": 42
},
{
"epoch": 0.0070609872245393935,
"grad_norm": 0.322170615196228,
"learning_rate": 8.396933500228808e-05,
"loss": 6.8846,
"step": 43
},
{
"epoch": 0.00722519622976124,
"grad_norm": 0.34816184639930725,
"learning_rate": 8.317214457579773e-05,
"loss": 6.8835,
"step": 44
},
{
"epoch": 0.007389405234983087,
"grad_norm": 0.32875800132751465,
"learning_rate": 8.23601405876841e-05,
"loss": 6.8953,
"step": 45
},
{
"epoch": 0.0075536142402049325,
"grad_norm": 0.35221952199935913,
"learning_rate": 8.153375000832157e-05,
"loss": 6.8907,
"step": 46
},
{
"epoch": 0.007717823245426779,
"grad_norm": 0.363881915807724,
"learning_rate": 8.069340737288512e-05,
"loss": 6.8798,
"step": 47
},
{
"epoch": 0.007882032250648626,
"grad_norm": 0.44765737652778625,
"learning_rate": 7.98395545528617e-05,
"loss": 6.8804,
"step": 48
},
{
"epoch": 0.008046241255870471,
"grad_norm": 0.48432090878486633,
"learning_rate": 7.897264052370409e-05,
"loss": 6.8855,
"step": 49
},
{
"epoch": 0.008210450261092319,
"grad_norm": 0.6992037296295166,
"learning_rate": 7.809312112874924e-05,
"loss": 6.8687,
"step": 50
},
{
"epoch": 0.008210450261092319,
"eval_loss": 6.891801834106445,
"eval_runtime": 0.1484,
"eval_samples_per_second": 336.864,
"eval_steps_per_second": 60.635,
"step": 50
},
{
"epoch": 0.008374659266314165,
"grad_norm": 0.23850670456886292,
"learning_rate": 7.720145883952544e-05,
"loss": 6.8929,
"step": 51
},
{
"epoch": 0.00853886827153601,
"grad_norm": 0.23872490227222443,
"learning_rate": 7.629812251257401e-05,
"loss": 6.8878,
"step": 52
},
{
"epoch": 0.008703077276757858,
"grad_norm": 0.25407809019088745,
"learning_rate": 7.53835871429139e-05,
"loss": 6.8843,
"step": 53
},
{
"epoch": 0.008867286281979704,
"grad_norm": 0.23447373509407043,
"learning_rate": 7.445833361427828e-05,
"loss": 6.8832,
"step": 54
},
{
"epoch": 0.00903149528720155,
"grad_norm": 0.2190893590450287,
"learning_rate": 7.352284844625481e-05,
"loss": 6.8877,
"step": 55
},
{
"epoch": 0.009195704292423397,
"grad_norm": 0.22474221885204315,
"learning_rate": 7.257762353846257e-05,
"loss": 6.8882,
"step": 56
},
{
"epoch": 0.009359913297645243,
"grad_norm": 0.21934156119823456,
"learning_rate": 7.162315591189978e-05,
"loss": 6.884,
"step": 57
},
{
"epoch": 0.009524122302867088,
"grad_norm": 0.22677500545978546,
"learning_rate": 7.065994744759879e-05,
"loss": 6.888,
"step": 58
},
{
"epoch": 0.009688331308088936,
"grad_norm": 0.21878303587436676,
"learning_rate": 6.96885046227255e-05,
"loss": 6.8825,
"step": 59
},
{
"epoch": 0.009852540313310782,
"grad_norm": 0.21341657638549805,
"learning_rate": 6.8709338244262e-05,
"loss": 6.8831,
"step": 60
},
{
"epoch": 0.010016749318532627,
"grad_norm": 0.20267024636268616,
"learning_rate": 6.772296318041253e-05,
"loss": 6.884,
"step": 61
},
{
"epoch": 0.010180958323754475,
"grad_norm": 0.19920732080936432,
"learning_rate": 6.672989808987385e-05,
"loss": 6.8809,
"step": 62
},
{
"epoch": 0.01034516732897632,
"grad_norm": 0.2178788185119629,
"learning_rate": 6.573066514911273e-05,
"loss": 6.8755,
"step": 63
},
{
"epoch": 0.010509376334198168,
"grad_norm": 0.210996612906456,
"learning_rate": 6.472578977779339e-05,
"loss": 6.8735,
"step": 64
},
{
"epoch": 0.010673585339420014,
"grad_norm": 0.19956302642822266,
"learning_rate": 6.371580036249985e-05,
"loss": 6.8776,
"step": 65
},
{
"epoch": 0.01083779434464186,
"grad_norm": 0.20250411331653595,
"learning_rate": 6.270122797889806e-05,
"loss": 6.8728,
"step": 66
},
{
"epoch": 0.011002003349863707,
"grad_norm": 0.20616371929645538,
"learning_rate": 6.168260611248417e-05,
"loss": 6.8754,
"step": 67
},
{
"epoch": 0.011166212355085553,
"grad_norm": 0.2024230659008026,
"learning_rate": 6.066047037806549e-05,
"loss": 6.8729,
"step": 68
},
{
"epoch": 0.011330421360307399,
"grad_norm": 0.19448429346084595,
"learning_rate": 5.9635358238121954e-05,
"loss": 6.872,
"step": 69
},
{
"epoch": 0.011494630365529246,
"grad_norm": 0.18782764673233032,
"learning_rate": 5.860780872019601e-05,
"loss": 6.8785,
"step": 70
},
{
"epoch": 0.011658839370751092,
"grad_norm": 0.1987268328666687,
"learning_rate": 5.7578362133459494e-05,
"loss": 6.868,
"step": 71
},
{
"epoch": 0.011823048375972938,
"grad_norm": 0.1736784130334854,
"learning_rate": 5.6547559784606675e-05,
"loss": 6.8722,
"step": 72
},
{
"epoch": 0.011987257381194785,
"grad_norm": 0.17477966845035553,
"learning_rate": 5.551594369322271e-05,
"loss": 6.8697,
"step": 73
},
{
"epoch": 0.012151466386416631,
"grad_norm": 0.19487003982067108,
"learning_rate": 5.44840563067773e-05,
"loss": 6.8721,
"step": 74
},
{
"epoch": 0.012315675391638477,
"grad_norm": 0.20581433176994324,
"learning_rate": 5.3452440215393315e-05,
"loss": 6.8684,
"step": 75
},
{
"epoch": 0.012315675391638477,
"eval_loss": 6.876440048217773,
"eval_runtime": 0.1565,
"eval_samples_per_second": 319.512,
"eval_steps_per_second": 57.512,
"step": 75
},
{
"epoch": 0.012479884396860324,
"grad_norm": 0.18974201381206512,
"learning_rate": 5.242163786654051e-05,
"loss": 6.8705,
"step": 76
},
{
"epoch": 0.01264409340208217,
"grad_norm": 0.19225001335144043,
"learning_rate": 5.139219127980399e-05,
"loss": 6.866,
"step": 77
},
{
"epoch": 0.012808302407304016,
"grad_norm": 0.18675003945827484,
"learning_rate": 5.036464176187806e-05,
"loss": 6.8649,
"step": 78
},
{
"epoch": 0.012972511412525863,
"grad_norm": 0.19270944595336914,
"learning_rate": 4.933952962193452e-05,
"loss": 6.8649,
"step": 79
},
{
"epoch": 0.013136720417747709,
"grad_norm": 0.22745263576507568,
"learning_rate": 4.831739388751584e-05,
"loss": 6.8694,
"step": 80
},
{
"epoch": 0.013300929422969556,
"grad_norm": 0.2070324420928955,
"learning_rate": 4.729877202110195e-05,
"loss": 6.8627,
"step": 81
},
{
"epoch": 0.013465138428191402,
"grad_norm": 0.188340425491333,
"learning_rate": 4.628419963750016e-05,
"loss": 6.8661,
"step": 82
},
{
"epoch": 0.013629347433413248,
"grad_norm": 0.19918252527713776,
"learning_rate": 4.527421022220663e-05,
"loss": 6.8717,
"step": 83
},
{
"epoch": 0.013793556438635095,
"grad_norm": 0.20109033584594727,
"learning_rate": 4.426933485088729e-05,
"loss": 6.8634,
"step": 84
},
{
"epoch": 0.013957765443856941,
"grad_norm": 0.21436934173107147,
"learning_rate": 4.327010191012617e-05,
"loss": 6.8627,
"step": 85
},
{
"epoch": 0.014121974449078787,
"grad_norm": 0.2412048876285553,
"learning_rate": 4.227703681958749e-05,
"loss": 6.8715,
"step": 86
},
{
"epoch": 0.014286183454300634,
"grad_norm": 0.21019670367240906,
"learning_rate": 4.1290661755738e-05,
"loss": 6.8719,
"step": 87
},
{
"epoch": 0.01445039245952248,
"grad_norm": 0.23564065992832184,
"learning_rate": 4.03114953772745e-05,
"loss": 6.8576,
"step": 88
},
{
"epoch": 0.014614601464744326,
"grad_norm": 0.23380470275878906,
"learning_rate": 3.934005255240122e-05,
"loss": 6.8628,
"step": 89
},
{
"epoch": 0.014778810469966173,
"grad_norm": 0.2209361046552658,
"learning_rate": 3.837684408810023e-05,
"loss": 6.8548,
"step": 90
},
{
"epoch": 0.01494301947518802,
"grad_norm": 0.23763206601142883,
"learning_rate": 3.7422376461537435e-05,
"loss": 6.8656,
"step": 91
},
{
"epoch": 0.015107228480409865,
"grad_norm": 0.2792171537876129,
"learning_rate": 3.647715155374519e-05,
"loss": 6.8605,
"step": 92
},
{
"epoch": 0.015271437485631712,
"grad_norm": 0.2585715353488922,
"learning_rate": 3.554166638572175e-05,
"loss": 6.8548,
"step": 93
},
{
"epoch": 0.015435646490853558,
"grad_norm": 0.2927703559398651,
"learning_rate": 3.461641285708611e-05,
"loss": 6.8593,
"step": 94
},
{
"epoch": 0.015599855496075404,
"grad_norm": 0.31993693113327026,
"learning_rate": 3.370187748742601e-05,
"loss": 6.8542,
"step": 95
},
{
"epoch": 0.01576406450129725,
"grad_norm": 0.3359384834766388,
"learning_rate": 3.279854116047457e-05,
"loss": 6.8526,
"step": 96
},
{
"epoch": 0.015928273506519097,
"grad_norm": 0.3367615044116974,
"learning_rate": 3.190687887125077e-05,
"loss": 6.8617,
"step": 97
},
{
"epoch": 0.016092482511740943,
"grad_norm": 0.4395657181739807,
"learning_rate": 3.102735947629594e-05,
"loss": 6.8552,
"step": 98
},
{
"epoch": 0.01625669151696279,
"grad_norm": 0.5388174653053284,
"learning_rate": 3.0160445447138308e-05,
"loss": 6.8528,
"step": 99
},
{
"epoch": 0.016420900522184638,
"grad_norm": 0.9235198497772217,
"learning_rate": 2.9306592627114883e-05,
"loss": 6.8479,
"step": 100
},
{
"epoch": 0.016420900522184638,
"eval_loss": 6.867819786071777,
"eval_runtime": 0.1615,
"eval_samples_per_second": 309.663,
"eval_steps_per_second": 55.739,
"step": 100
},
{
"epoch": 0.016585109527406484,
"grad_norm": 0.15563268959522247,
"learning_rate": 2.846624999167843e-05,
"loss": 6.8711,
"step": 101
},
{
"epoch": 0.01674931853262833,
"grad_norm": 0.16060693562030792,
"learning_rate": 2.7639859412315917e-05,
"loss": 6.8678,
"step": 102
},
{
"epoch": 0.016913527537850175,
"grad_norm": 0.16681014001369476,
"learning_rate": 2.682785542420229e-05,
"loss": 6.869,
"step": 103
},
{
"epoch": 0.01707773654307202,
"grad_norm": 0.14798974990844727,
"learning_rate": 2.603066499771192e-05,
"loss": 6.8684,
"step": 104
},
{
"epoch": 0.017241945548293867,
"grad_norm": 0.15809090435504913,
"learning_rate": 2.5248707313907747e-05,
"loss": 6.8688,
"step": 105
},
{
"epoch": 0.017406154553515716,
"grad_norm": 0.15511666238307953,
"learning_rate": 2.4482393544126215e-05,
"loss": 6.8664,
"step": 106
},
{
"epoch": 0.01757036355873756,
"grad_norm": 0.1479611098766327,
"learning_rate": 2.3732126633773928e-05,
"loss": 6.8648,
"step": 107
},
{
"epoch": 0.017734572563959407,
"grad_norm": 0.14057576656341553,
"learning_rate": 2.2998301090449738e-05,
"loss": 6.8655,
"step": 108
},
{
"epoch": 0.017898781569181253,
"grad_norm": 0.1501767635345459,
"learning_rate": 2.2281302776503497e-05,
"loss": 6.8658,
"step": 109
},
{
"epoch": 0.0180629905744031,
"grad_norm": 0.148577019572258,
"learning_rate": 2.1581508706140802e-05,
"loss": 6.8669,
"step": 110
},
{
"epoch": 0.018227199579624948,
"grad_norm": 0.1595151275396347,
"learning_rate": 2.0899286847180243e-05,
"loss": 6.8665,
"step": 111
},
{
"epoch": 0.018391408584846794,
"grad_norm": 0.14874982833862305,
"learning_rate": 2.0234995927567523e-05,
"loss": 6.8682,
"step": 112
},
{
"epoch": 0.01855561759006864,
"grad_norm": 0.16214674711227417,
"learning_rate": 1.9588985246747925e-05,
"loss": 6.8665,
"step": 113
},
{
"epoch": 0.018719826595290485,
"grad_norm": 0.17474649846553802,
"learning_rate": 1.896159449199672e-05,
"loss": 6.8564,
"step": 114
},
{
"epoch": 0.01888403560051233,
"grad_norm": 0.17561936378479004,
"learning_rate": 1.835315355980376e-05,
"loss": 6.8664,
"step": 115
},
{
"epoch": 0.019048244605734177,
"grad_norm": 0.155584454536438,
"learning_rate": 1.7763982382406352e-05,
"loss": 6.8678,
"step": 116
},
{
"epoch": 0.019212453610956026,
"grad_norm": 0.161391481757164,
"learning_rate": 1.7194390759561453e-05,
"loss": 6.8642,
"step": 117
},
{
"epoch": 0.019376662616177872,
"grad_norm": 0.17117644846439362,
"learning_rate": 1.664467819564588e-05,
"loss": 6.8658,
"step": 118
},
{
"epoch": 0.019540871621399718,
"grad_norm": 0.15190771222114563,
"learning_rate": 1.6115133742170053e-05,
"loss": 6.8577,
"step": 119
},
{
"epoch": 0.019705080626621563,
"grad_norm": 0.15246935188770294,
"learning_rate": 1.5606035845787987e-05,
"loss": 6.8638,
"step": 120
},
{
"epoch": 0.01986928963184341,
"grad_norm": 0.1524462103843689,
"learning_rate": 1.511765220188367e-05,
"loss": 6.8575,
"step": 121
},
{
"epoch": 0.020033498637065255,
"grad_norm": 0.187980517745018,
"learning_rate": 1.4650239613810693e-05,
"loss": 6.8698,
"step": 122
},
{
"epoch": 0.020197707642287104,
"grad_norm": 0.17757758498191833,
"learning_rate": 1.4204043857859129e-05,
"loss": 6.8604,
"step": 123
},
{
"epoch": 0.02036191664750895,
"grad_norm": 0.15591758489608765,
"learning_rate": 1.3779299554020672e-05,
"loss": 6.8676,
"step": 124
},
{
"epoch": 0.020526125652730796,
"grad_norm": 0.16383862495422363,
"learning_rate": 1.3376230042620109e-05,
"loss": 6.8664,
"step": 125
},
{
"epoch": 0.020526125652730796,
"eval_loss": 6.865334510803223,
"eval_runtime": 0.1697,
"eval_samples_per_second": 294.637,
"eval_steps_per_second": 53.035,
"step": 125
},
{
"epoch": 0.02069033465795264,
"grad_norm": 0.1573963165283203,
"learning_rate": 1.2995047266877775e-05,
"loss": 6.8635,
"step": 126
},
{
"epoch": 0.020854543663174487,
"grad_norm": 0.17853564023971558,
"learning_rate": 1.2635951661464995e-05,
"loss": 6.8617,
"step": 127
},
{
"epoch": 0.021018752668396336,
"grad_norm": 0.1534070074558258,
"learning_rate": 1.2299132047110876e-05,
"loss": 6.8606,
"step": 128
},
{
"epoch": 0.021182961673618182,
"grad_norm": 0.16048799455165863,
"learning_rate": 1.1984765531316038e-05,
"loss": 6.8625,
"step": 129
},
{
"epoch": 0.021347170678840028,
"grad_norm": 0.19437247514724731,
"learning_rate": 1.1693017415225432e-05,
"loss": 6.8558,
"step": 130
},
{
"epoch": 0.021511379684061874,
"grad_norm": 0.1909008026123047,
"learning_rate": 1.1424041106709194e-05,
"loss": 6.862,
"step": 131
},
{
"epoch": 0.02167558868928372,
"grad_norm": 0.18675506114959717,
"learning_rate": 1.1177978039697217e-05,
"loss": 6.8617,
"step": 132
},
{
"epoch": 0.021839797694505565,
"grad_norm": 0.19926699995994568,
"learning_rate": 1.0954957599810003e-05,
"loss": 6.8532,
"step": 133
},
{
"epoch": 0.022004006699727414,
"grad_norm": 0.18085141479969025,
"learning_rate": 1.0755097056324672e-05,
"loss": 6.8539,
"step": 134
},
{
"epoch": 0.02216821570494926,
"grad_norm": 0.19364075362682343,
"learning_rate": 1.0578501500512109e-05,
"loss": 6.8556,
"step": 135
},
{
"epoch": 0.022332424710171106,
"grad_norm": 0.21459238231182098,
"learning_rate": 1.042526379037754e-05,
"loss": 6.8641,
"step": 136
},
{
"epoch": 0.02249663371539295,
"grad_norm": 0.1923208385705948,
"learning_rate": 1.0295464501833682e-05,
"loss": 6.8561,
"step": 137
},
{
"epoch": 0.022660842720614797,
"grad_norm": 0.18413542211055756,
"learning_rate": 1.0189171886332038e-05,
"loss": 6.8555,
"step": 138
},
{
"epoch": 0.022825051725836643,
"grad_norm": 0.21922001242637634,
"learning_rate": 1.0106441834974748e-05,
"loss": 6.8461,
"step": 139
},
{
"epoch": 0.022989260731058492,
"grad_norm": 0.24008332192897797,
"learning_rate": 1.0047317849125743e-05,
"loss": 6.863,
"step": 140
},
{
"epoch": 0.023153469736280338,
"grad_norm": 0.23309044539928436,
"learning_rate": 1.0011831017536722e-05,
"loss": 6.8683,
"step": 141
},
{
"epoch": 0.023317678741502184,
"grad_norm": 0.2427399456501007,
"learning_rate": 1e-05,
"loss": 6.8653,
"step": 142
}
],
"logging_steps": 1,
"max_steps": 142,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3803377213440.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}