{
"best_metric": 1.084336519241333,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.057406780861259075,
"eval_steps": 25,
"global_step": 228,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00025178412658446965,
"grad_norm": 0.2894246280193329,
"learning_rate": 1.1111111111111112e-05,
"loss": 1.3358,
"step": 1
},
{
"epoch": 0.00025178412658446965,
"eval_loss": 1.4041720628738403,
"eval_runtime": 1.5236,
"eval_samples_per_second": 32.818,
"eval_steps_per_second": 8.533,
"step": 1
},
{
"epoch": 0.0005035682531689393,
"grad_norm": 0.3336651027202606,
"learning_rate": 2.2222222222222223e-05,
"loss": 1.4653,
"step": 2
},
{
"epoch": 0.0007553523797534089,
"grad_norm": 0.36238351464271545,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.4747,
"step": 3
},
{
"epoch": 0.0010071365063378786,
"grad_norm": 0.38116249442100525,
"learning_rate": 4.4444444444444447e-05,
"loss": 1.4752,
"step": 4
},
{
"epoch": 0.0012589206329223482,
"grad_norm": 0.4064643979072571,
"learning_rate": 5.555555555555556e-05,
"loss": 1.4008,
"step": 5
},
{
"epoch": 0.0015107047595068178,
"grad_norm": 0.43084007501602173,
"learning_rate": 6.666666666666667e-05,
"loss": 1.383,
"step": 6
},
{
"epoch": 0.0017624888860912876,
"grad_norm": 0.46177080273628235,
"learning_rate": 7.777777777777778e-05,
"loss": 1.36,
"step": 7
},
{
"epoch": 0.002014273012675757,
"grad_norm": 0.37692713737487793,
"learning_rate": 8.888888888888889e-05,
"loss": 1.3537,
"step": 8
},
{
"epoch": 0.0022660571392602268,
"grad_norm": 0.25504958629608154,
"learning_rate": 0.0001,
"loss": 1.3239,
"step": 9
},
{
"epoch": 0.0025178412658446963,
"grad_norm": 0.23159584403038025,
"learning_rate": 9.999536994034917e-05,
"loss": 1.3255,
"step": 10
},
{
"epoch": 0.002769625392429166,
"grad_norm": 0.28963929414749146,
"learning_rate": 9.99814807141723e-05,
"loss": 1.3385,
"step": 11
},
{
"epoch": 0.0030214095190136355,
"grad_norm": 0.31955209374427795,
"learning_rate": 9.995833517960035e-05,
"loss": 1.3097,
"step": 12
},
{
"epoch": 0.003273193645598105,
"grad_norm": 0.23069429397583008,
"learning_rate": 9.992593809953133e-05,
"loss": 1.2359,
"step": 13
},
{
"epoch": 0.003524977772182575,
"grad_norm": 0.27882739901542664,
"learning_rate": 9.988429614065029e-05,
"loss": 1.3039,
"step": 14
},
{
"epoch": 0.0037767618987670447,
"grad_norm": 0.30989697575569153,
"learning_rate": 9.983341787205736e-05,
"loss": 1.3263,
"step": 15
},
{
"epoch": 0.004028546025351514,
"grad_norm": 0.25735488533973694,
"learning_rate": 9.97733137635045e-05,
"loss": 1.3418,
"step": 16
},
{
"epoch": 0.0042803301519359835,
"grad_norm": 0.18925218284130096,
"learning_rate": 9.970399618324092e-05,
"loss": 1.2892,
"step": 17
},
{
"epoch": 0.0045321142785204535,
"grad_norm": 0.15477243065834045,
"learning_rate": 9.962547939546805e-05,
"loss": 1.2695,
"step": 18
},
{
"epoch": 0.0047838984051049235,
"grad_norm": 0.1201956495642662,
"learning_rate": 9.953777955740415e-05,
"loss": 1.2515,
"step": 19
},
{
"epoch": 0.005035682531689393,
"grad_norm": 0.08931563794612885,
"learning_rate": 9.944091471595951e-05,
"loss": 1.2399,
"step": 20
},
{
"epoch": 0.005287466658273863,
"grad_norm": 0.10346703231334686,
"learning_rate": 9.933490480402273e-05,
"loss": 1.2807,
"step": 21
},
{
"epoch": 0.005539250784858332,
"grad_norm": 0.14112868905067444,
"learning_rate": 9.921977163635899e-05,
"loss": 1.2333,
"step": 22
},
{
"epoch": 0.005791034911442802,
"grad_norm": 0.1877644807100296,
"learning_rate": 9.90955389051209e-05,
"loss": 1.2533,
"step": 23
},
{
"epoch": 0.006042819038027271,
"grad_norm": 0.215586319565773,
"learning_rate": 9.89622321749732e-05,
"loss": 1.2433,
"step": 24
},
{
"epoch": 0.006294603164611741,
"grad_norm": 0.2447165846824646,
"learning_rate": 9.881987887783194e-05,
"loss": 1.1634,
"step": 25
},
{
"epoch": 0.006294603164611741,
"eval_loss": 1.2141509056091309,
"eval_runtime": 1.5236,
"eval_samples_per_second": 32.817,
"eval_steps_per_second": 8.532,
"step": 25
},
{
"epoch": 0.00654638729119621,
"grad_norm": 0.1898128092288971,
"learning_rate": 9.866850830721973e-05,
"loss": 1.2322,
"step": 26
},
{
"epoch": 0.00679817141778068,
"grad_norm": 0.16772133111953735,
"learning_rate": 9.85081516122375e-05,
"loss": 1.2542,
"step": 27
},
{
"epoch": 0.00704995554436515,
"grad_norm": 0.11975707113742828,
"learning_rate": 9.83388417911547e-05,
"loss": 1.2536,
"step": 28
},
{
"epoch": 0.0073017396709496194,
"grad_norm": 0.09228307008743286,
"learning_rate": 9.816061368461896e-05,
"loss": 1.2566,
"step": 29
},
{
"epoch": 0.0075535237975340895,
"grad_norm": 0.0792098417878151,
"learning_rate": 9.79735039684865e-05,
"loss": 1.226,
"step": 30
},
{
"epoch": 0.007805307924118559,
"grad_norm": 0.0848434716463089,
"learning_rate": 9.777755114627491e-05,
"loss": 1.1899,
"step": 31
},
{
"epoch": 0.008057092050703029,
"grad_norm": 0.09312910586595535,
"learning_rate": 9.757279554124004e-05,
"loss": 1.1568,
"step": 32
},
{
"epoch": 0.008308876177287499,
"grad_norm": 0.11826145648956299,
"learning_rate": 9.735927928807813e-05,
"loss": 1.2118,
"step": 33
},
{
"epoch": 0.008560660303871967,
"grad_norm": 0.1303245574235916,
"learning_rate": 9.713704632425529e-05,
"loss": 1.1613,
"step": 34
},
{
"epoch": 0.008812444430456437,
"grad_norm": 0.11749281734228134,
"learning_rate": 9.690614238096617e-05,
"loss": 1.2169,
"step": 35
},
{
"epoch": 0.009064228557040907,
"grad_norm": 0.11288820207118988,
"learning_rate": 9.666661497372324e-05,
"loss": 1.1986,
"step": 36
},
{
"epoch": 0.009316012683625377,
"grad_norm": 0.10559144616127014,
"learning_rate": 9.641851339257912e-05,
"loss": 1.2018,
"step": 37
},
{
"epoch": 0.009567796810209847,
"grad_norm": 0.10261145234107971,
"learning_rate": 9.616188869198361e-05,
"loss": 1.0372,
"step": 38
},
{
"epoch": 0.009819580936794315,
"grad_norm": 0.09313081204891205,
"learning_rate": 9.589679368027765e-05,
"loss": 1.2235,
"step": 39
},
{
"epoch": 0.010071365063378785,
"grad_norm": 0.08457491546869278,
"learning_rate": 9.562328290882643e-05,
"loss": 1.2094,
"step": 40
},
{
"epoch": 0.010323149189963255,
"grad_norm": 0.08044976741075516,
"learning_rate": 9.534141266079385e-05,
"loss": 1.2016,
"step": 41
},
{
"epoch": 0.010574933316547725,
"grad_norm": 0.08418776094913483,
"learning_rate": 9.505124093956045e-05,
"loss": 1.2094,
"step": 42
},
{
"epoch": 0.010826717443132194,
"grad_norm": 0.09928078949451447,
"learning_rate": 9.475282745678749e-05,
"loss": 1.1695,
"step": 43
},
{
"epoch": 0.011078501569716664,
"grad_norm": 0.09286114573478699,
"learning_rate": 9.444623362012944e-05,
"loss": 1.0966,
"step": 44
},
{
"epoch": 0.011330285696301134,
"grad_norm": 0.09147851914167404,
"learning_rate": 9.413152252059749e-05,
"loss": 1.1769,
"step": 45
},
{
"epoch": 0.011582069822885604,
"grad_norm": 0.09161302447319031,
"learning_rate": 9.380875891957674e-05,
"loss": 1.1761,
"step": 46
},
{
"epoch": 0.011833853949470074,
"grad_norm": 0.09320323169231415,
"learning_rate": 9.347800923549942e-05,
"loss": 1.1795,
"step": 47
},
{
"epoch": 0.012085638076054542,
"grad_norm": 0.08234869688749313,
"learning_rate": 9.313934153017741e-05,
"loss": 1.1745,
"step": 48
},
{
"epoch": 0.012337422202639012,
"grad_norm": 0.08729473501443863,
"learning_rate": 9.279282549479634e-05,
"loss": 1.1614,
"step": 49
},
{
"epoch": 0.012589206329223482,
"grad_norm": 0.11173596233129501,
"learning_rate": 9.243853243557462e-05,
"loss": 1.0938,
"step": 50
},
{
"epoch": 0.012589206329223482,
"eval_loss": 1.1531009674072266,
"eval_runtime": 1.5229,
"eval_samples_per_second": 32.832,
"eval_steps_per_second": 8.536,
"step": 50
},
{
"epoch": 0.012840990455807952,
"grad_norm": 0.10851637274026871,
"learning_rate": 9.207653525908994e-05,
"loss": 1.1919,
"step": 51
},
{
"epoch": 0.01309277458239242,
"grad_norm": 0.12280933558940887,
"learning_rate": 9.170690845727655e-05,
"loss": 1.2008,
"step": 52
},
{
"epoch": 0.01334455870897689,
"grad_norm": 0.11200592666864395,
"learning_rate": 9.132972809209626e-05,
"loss": 1.1743,
"step": 53
},
{
"epoch": 0.01359634283556136,
"grad_norm": 0.10003667324781418,
"learning_rate": 9.094507177988643e-05,
"loss": 1.1631,
"step": 54
},
{
"epoch": 0.01384812696214583,
"grad_norm": 0.09316058456897736,
"learning_rate": 9.055301867538794e-05,
"loss": 1.2185,
"step": 55
},
{
"epoch": 0.0140999110887303,
"grad_norm": 0.07689037919044495,
"learning_rate": 9.01536494554568e-05,
"loss": 1.1249,
"step": 56
},
{
"epoch": 0.014351695215314769,
"grad_norm": 0.0907893106341362,
"learning_rate": 8.974704630246239e-05,
"loss": 1.1772,
"step": 57
},
{
"epoch": 0.014603479341899239,
"grad_norm": 0.09316601604223251,
"learning_rate": 8.933329288737597e-05,
"loss": 1.1482,
"step": 58
},
{
"epoch": 0.014855263468483709,
"grad_norm": 0.09033600986003876,
"learning_rate": 8.89124743525527e-05,
"loss": 1.1258,
"step": 59
},
{
"epoch": 0.015107047595068179,
"grad_norm": 0.10970163345336914,
"learning_rate": 8.848467729421124e-05,
"loss": 1.1349,
"step": 60
},
{
"epoch": 0.015358831721652647,
"grad_norm": 0.10259843617677689,
"learning_rate": 8.804998974461371e-05,
"loss": 1.1576,
"step": 61
},
{
"epoch": 0.015610615848237117,
"grad_norm": 0.11174706369638443,
"learning_rate": 8.760850115395054e-05,
"loss": 1.1705,
"step": 62
},
{
"epoch": 0.015862399974821587,
"grad_norm": 0.08973610401153564,
"learning_rate": 8.716030237193325e-05,
"loss": 1.1515,
"step": 63
},
{
"epoch": 0.016114184101406057,
"grad_norm": 0.08280681818723679,
"learning_rate": 8.670548562909947e-05,
"loss": 1.1607,
"step": 64
},
{
"epoch": 0.016365968227990527,
"grad_norm": 0.09221348911523819,
"learning_rate": 8.624414451783364e-05,
"loss": 1.1482,
"step": 65
},
{
"epoch": 0.016617752354574997,
"grad_norm": 0.09574998915195465,
"learning_rate": 8.577637397310749e-05,
"loss": 1.241,
"step": 66
},
{
"epoch": 0.016869536481159467,
"grad_norm": 0.09795909374952316,
"learning_rate": 8.530227025294435e-05,
"loss": 1.1739,
"step": 67
},
{
"epoch": 0.017121320607743934,
"grad_norm": 0.09504640102386475,
"learning_rate": 8.482193091861112e-05,
"loss": 1.1334,
"step": 68
},
{
"epoch": 0.017373104734328404,
"grad_norm": 0.08822384476661682,
"learning_rate": 8.433545481454206e-05,
"loss": 1.137,
"step": 69
},
{
"epoch": 0.017624888860912874,
"grad_norm": 0.0878458246588707,
"learning_rate": 8.384294204799853e-05,
"loss": 1.1719,
"step": 70
},
{
"epoch": 0.017876672987497344,
"grad_norm": 0.08970195800065994,
"learning_rate": 8.334449396846886e-05,
"loss": 1.1298,
"step": 71
},
{
"epoch": 0.018128457114081814,
"grad_norm": 0.0950387567281723,
"learning_rate": 8.284021314681265e-05,
"loss": 1.1556,
"step": 72
},
{
"epoch": 0.018380241240666284,
"grad_norm": 0.09765107929706573,
"learning_rate": 8.233020335415371e-05,
"loss": 1.1732,
"step": 73
},
{
"epoch": 0.018632025367250754,
"grad_norm": 0.11165706068277359,
"learning_rate": 8.18145695405259e-05,
"loss": 1.1746,
"step": 74
},
{
"epoch": 0.018883809493835224,
"grad_norm": 0.15581081807613373,
"learning_rate": 8.129341781327658e-05,
"loss": 1.0963,
"step": 75
},
{
"epoch": 0.018883809493835224,
"eval_loss": 1.122991681098938,
"eval_runtime": 1.5233,
"eval_samples_per_second": 32.822,
"eval_steps_per_second": 8.534,
"step": 75
},
{
"epoch": 0.019135593620419694,
"grad_norm": 0.08741319179534912,
"learning_rate": 8.07668554152317e-05,
"loss": 1.1688,
"step": 76
},
{
"epoch": 0.01938737774700416,
"grad_norm": 0.08875293284654617,
"learning_rate": 8.02349907026274e-05,
"loss": 1.1829,
"step": 77
},
{
"epoch": 0.01963916187358863,
"grad_norm": 0.08861377090215683,
"learning_rate": 7.969793312281237e-05,
"loss": 1.1803,
"step": 78
},
{
"epoch": 0.0198909460001731,
"grad_norm": 0.09791383892297745,
"learning_rate": 7.915579319172573e-05,
"loss": 1.2001,
"step": 79
},
{
"epoch": 0.02014273012675757,
"grad_norm": 0.0991906151175499,
"learning_rate": 7.860868247115505e-05,
"loss": 1.1669,
"step": 80
},
{
"epoch": 0.02039451425334204,
"grad_norm": 0.09529578685760498,
"learning_rate": 7.805671354577908e-05,
"loss": 1.1522,
"step": 81
},
{
"epoch": 0.02064629837992651,
"grad_norm": 0.09793104231357574,
"learning_rate": 7.75e-05,
"loss": 1.1407,
"step": 82
},
{
"epoch": 0.02089808250651098,
"grad_norm": 0.09607716649770737,
"learning_rate": 7.693865639457011e-05,
"loss": 1.1291,
"step": 83
},
{
"epoch": 0.02114986663309545,
"grad_norm": 0.10056506842374802,
"learning_rate": 7.637279824301728e-05,
"loss": 1.1124,
"step": 84
},
{
"epoch": 0.02140165075967992,
"grad_norm": 0.10144215822219849,
"learning_rate": 7.580254198787463e-05,
"loss": 1.1022,
"step": 85
},
{
"epoch": 0.021653434886264387,
"grad_norm": 0.10326708108186722,
"learning_rate": 7.522800497671897e-05,
"loss": 1.073,
"step": 86
},
{
"epoch": 0.021905219012848857,
"grad_norm": 0.1260983794927597,
"learning_rate": 7.464930543802289e-05,
"loss": 1.1218,
"step": 87
},
{
"epoch": 0.022157003139433327,
"grad_norm": 0.10883725434541702,
"learning_rate": 7.406656245682565e-05,
"loss": 1.1167,
"step": 88
},
{
"epoch": 0.022408787266017798,
"grad_norm": 0.09835278987884521,
"learning_rate": 7.34798959502279e-05,
"loss": 1.1506,
"step": 89
},
{
"epoch": 0.022660571392602268,
"grad_norm": 0.10083261877298355,
"learning_rate": 7.288942664271503e-05,
"loss": 1.1792,
"step": 90
},
{
"epoch": 0.022912355519186738,
"grad_norm": 0.10250851511955261,
"learning_rate": 7.229527604131436e-05,
"loss": 1.1897,
"step": 91
},
{
"epoch": 0.023164139645771208,
"grad_norm": 0.1085507944226265,
"learning_rate": 7.16975664105915e-05,
"loss": 1.1402,
"step": 92
},
{
"epoch": 0.023415923772355678,
"grad_norm": 0.10734712332487106,
"learning_rate": 7.109642074749067e-05,
"loss": 1.0878,
"step": 93
},
{
"epoch": 0.023667707898940148,
"grad_norm": 0.10455948859453201,
"learning_rate": 7.049196275602421e-05,
"loss": 1.116,
"step": 94
},
{
"epoch": 0.023919492025524614,
"grad_norm": 0.10420308262109756,
"learning_rate": 6.988431682181693e-05,
"loss": 1.1243,
"step": 95
},
{
"epoch": 0.024171276152109084,
"grad_norm": 0.10482881218194962,
"learning_rate": 6.927360798650978e-05,
"loss": 1.1198,
"step": 96
},
{
"epoch": 0.024423060278693554,
"grad_norm": 0.11121159791946411,
"learning_rate": 6.865996192202884e-05,
"loss": 1.1097,
"step": 97
},
{
"epoch": 0.024674844405278024,
"grad_norm": 0.11902576684951782,
"learning_rate": 6.804350490472446e-05,
"loss": 1.1056,
"step": 98
},
{
"epoch": 0.024926628531862494,
"grad_norm": 0.12510505318641663,
"learning_rate": 6.742436378938612e-05,
"loss": 1.109,
"step": 99
},
{
"epoch": 0.025178412658446964,
"grad_norm": 0.1589539796113968,
"learning_rate": 6.680266598313802e-05,
"loss": 1.065,
"step": 100
},
{
"epoch": 0.025178412658446964,
"eval_loss": 1.106641411781311,
"eval_runtime": 1.5233,
"eval_samples_per_second": 32.824,
"eval_steps_per_second": 8.534,
"step": 100
},
{
"epoch": 0.025430196785031434,
"grad_norm": 0.09652504324913025,
"learning_rate": 6.617853941922146e-05,
"loss": 1.0807,
"step": 101
},
{
"epoch": 0.025681980911615904,
"grad_norm": 0.10196559131145477,
"learning_rate": 6.555211253066844e-05,
"loss": 1.1715,
"step": 102
},
{
"epoch": 0.025933765038200374,
"grad_norm": 0.10689554363489151,
"learning_rate": 6.49235142238728e-05,
"loss": 1.1938,
"step": 103
},
{
"epoch": 0.02618554916478484,
"grad_norm": 0.11235436052083969,
"learning_rate": 6.429287385206368e-05,
"loss": 1.2069,
"step": 104
},
{
"epoch": 0.02643733329136931,
"grad_norm": 0.11319782584905624,
"learning_rate": 6.366032118868734e-05,
"loss": 1.1022,
"step": 105
},
{
"epoch": 0.02668911741795378,
"grad_norm": 0.12285393476486206,
"learning_rate": 6.302598640070218e-05,
"loss": 1.131,
"step": 106
},
{
"epoch": 0.02694090154453825,
"grad_norm": 0.11379806697368622,
"learning_rate": 6.23900000217929e-05,
"loss": 1.0844,
"step": 107
},
{
"epoch": 0.02719268567112272,
"grad_norm": 0.1146794855594635,
"learning_rate": 6.175249292550937e-05,
"loss": 1.1122,
"step": 108
},
{
"epoch": 0.02744446979770719,
"grad_norm": 0.1138162612915039,
"learning_rate": 6.111359629833533e-05,
"loss": 1.1059,
"step": 109
},
{
"epoch": 0.02769625392429166,
"grad_norm": 0.13073591887950897,
"learning_rate": 6.0473441612692705e-05,
"loss": 1.1073,
"step": 110
},
{
"epoch": 0.02794803805087613,
"grad_norm": 0.13190491497516632,
"learning_rate": 5.9832160599887344e-05,
"loss": 1.1323,
"step": 111
},
{
"epoch": 0.0281998221774606,
"grad_norm": 0.1431579738855362,
"learning_rate": 5.9189885223001094e-05,
"loss": 1.1094,
"step": 112
},
{
"epoch": 0.028451606304045068,
"grad_norm": 0.11174172163009644,
"learning_rate": 5.85467476497365e-05,
"loss": 1.0261,
"step": 113
},
{
"epoch": 0.028703390430629538,
"grad_norm": 0.10128598660230637,
"learning_rate": 5.790288022521925e-05,
"loss": 1.1019,
"step": 114
},
{
"epoch": 0.028955174557214008,
"grad_norm": 0.10618976503610611,
"learning_rate": 5.725841544476413e-05,
"loss": 1.1591,
"step": 115
},
{
"epoch": 0.029206958683798478,
"grad_norm": 0.11264601349830627,
"learning_rate": 5.661348592661009e-05,
"loss": 1.1057,
"step": 116
},
{
"epoch": 0.029458742810382948,
"grad_norm": 0.11875821650028229,
"learning_rate": 5.596822438463001e-05,
"loss": 1.1278,
"step": 117
},
{
"epoch": 0.029710526936967418,
"grad_norm": 0.13417275249958038,
"learning_rate": 5.532276360102076e-05,
"loss": 1.0758,
"step": 118
},
{
"epoch": 0.029962311063551888,
"grad_norm": 0.12961986660957336,
"learning_rate": 5.467723639897926e-05,
"loss": 1.0888,
"step": 119
},
{
"epoch": 0.030214095190136358,
"grad_norm": 0.12340384721755981,
"learning_rate": 5.4031775615370004e-05,
"loss": 1.098,
"step": 120
},
{
"epoch": 0.030465879316720828,
"grad_norm": 0.11747000366449356,
"learning_rate": 5.3386514073389936e-05,
"loss": 1.1203,
"step": 121
},
{
"epoch": 0.030717663443305294,
"grad_norm": 0.11870179325342178,
"learning_rate": 5.274158455523588e-05,
"loss": 1.1258,
"step": 122
},
{
"epoch": 0.030969447569889764,
"grad_norm": 0.12783485651016235,
"learning_rate": 5.209711977478078e-05,
"loss": 1.097,
"step": 123
},
{
"epoch": 0.031221231696474235,
"grad_norm": 0.14884835481643677,
"learning_rate": 5.145325235026351e-05,
"loss": 1.103,
"step": 124
},
{
"epoch": 0.031473015823058705,
"grad_norm": 0.17791038751602173,
"learning_rate": 5.081011477699893e-05,
"loss": 1.0389,
"step": 125
},
{
"epoch": 0.031473015823058705,
"eval_loss": 1.096808671951294,
"eval_runtime": 1.5229,
"eval_samples_per_second": 32.831,
"eval_steps_per_second": 8.536,
"step": 125
},
{
"epoch": 0.031724799949643175,
"grad_norm": 0.11103296279907227,
"learning_rate": 5.016783940011267e-05,
"loss": 1.1911,
"step": 126
},
{
"epoch": 0.031976584076227645,
"grad_norm": 0.11420492827892303,
"learning_rate": 4.952655838730731e-05,
"loss": 1.1282,
"step": 127
},
{
"epoch": 0.032228368202812115,
"grad_norm": 0.11212627589702606,
"learning_rate": 4.888640370166469e-05,
"loss": 1.1621,
"step": 128
},
{
"epoch": 0.032480152329396585,
"grad_norm": 0.11292574554681778,
"learning_rate": 4.824750707449064e-05,
"loss": 1.1511,
"step": 129
},
{
"epoch": 0.032731936455981055,
"grad_norm": 0.12400747090578079,
"learning_rate": 4.760999997820711e-05,
"loss": 1.1402,
"step": 130
},
{
"epoch": 0.032983720582565525,
"grad_norm": 0.1370488405227661,
"learning_rate": 4.6974013599297837e-05,
"loss": 1.0905,
"step": 131
},
{
"epoch": 0.033235504709149995,
"grad_norm": 0.1564428210258484,
"learning_rate": 4.633967881131266e-05,
"loss": 1.1266,
"step": 132
},
{
"epoch": 0.033487288835734465,
"grad_norm": 0.12354940921068192,
"learning_rate": 4.570712614793633e-05,
"loss": 1.1013,
"step": 133
},
{
"epoch": 0.033739072962318935,
"grad_norm": 0.12833015620708466,
"learning_rate": 4.507648577612722e-05,
"loss": 1.1246,
"step": 134
},
{
"epoch": 0.033990857088903405,
"grad_norm": 0.12590007483959198,
"learning_rate": 4.4447887469331574e-05,
"loss": 1.1417,
"step": 135
},
{
"epoch": 0.03424264121548787,
"grad_norm": 0.13550645112991333,
"learning_rate": 4.382146058077855e-05,
"loss": 1.13,
"step": 136
},
{
"epoch": 0.03449442534207234,
"grad_norm": 0.14064566791057587,
"learning_rate": 4.319733401686199e-05,
"loss": 1.0597,
"step": 137
},
{
"epoch": 0.03474620946865681,
"grad_norm": 0.1269257515668869,
"learning_rate": 4.25756362106139e-05,
"loss": 1.1038,
"step": 138
},
{
"epoch": 0.03499799359524128,
"grad_norm": 0.11279810220003128,
"learning_rate": 4.195649509527555e-05,
"loss": 1.1551,
"step": 139
},
{
"epoch": 0.03524977772182575,
"grad_norm": 0.1206924095749855,
"learning_rate": 4.134003807797116e-05,
"loss": 1.1122,
"step": 140
},
{
"epoch": 0.03550156184841022,
"grad_norm": 0.12054243683815002,
"learning_rate": 4.0726392013490235e-05,
"loss": 1.188,
"step": 141
},
{
"epoch": 0.03575334597499469,
"grad_norm": 0.1204872876405716,
"learning_rate": 4.0115683178183084e-05,
"loss": 1.0985,
"step": 142
},
{
"epoch": 0.03600513010157916,
"grad_norm": 0.12411932647228241,
"learning_rate": 3.95080372439758e-05,
"loss": 1.1114,
"step": 143
},
{
"epoch": 0.03625691422816363,
"grad_norm": 0.12376714497804642,
"learning_rate": 3.8903579252509345e-05,
"loss": 1.0835,
"step": 144
},
{
"epoch": 0.0365086983547481,
"grad_norm": 0.12805478274822235,
"learning_rate": 3.8302433589408525e-05,
"loss": 1.1225,
"step": 145
},
{
"epoch": 0.03676048248133257,
"grad_norm": 0.12796953320503235,
"learning_rate": 3.770472395868566e-05,
"loss": 1.1206,
"step": 146
},
{
"epoch": 0.03701226660791704,
"grad_norm": 0.12267071008682251,
"learning_rate": 3.711057335728499e-05,
"loss": 1.1043,
"step": 147
},
{
"epoch": 0.03726405073450151,
"grad_norm": 0.12217283993959427,
"learning_rate": 3.65201040497721e-05,
"loss": 1.0964,
"step": 148
},
{
"epoch": 0.03751583486108598,
"grad_norm": 0.1348910927772522,
"learning_rate": 3.5933437543174363e-05,
"loss": 1.1044,
"step": 149
},
{
"epoch": 0.03776761898767045,
"grad_norm": 0.16145659983158112,
"learning_rate": 3.5350694561977125e-05,
"loss": 1.0322,
"step": 150
},
{
"epoch": 0.03776761898767045,
"eval_loss": 1.090155839920044,
"eval_runtime": 1.5223,
"eval_samples_per_second": 32.844,
"eval_steps_per_second": 8.539,
"step": 150
},
{
"epoch": 0.03801940311425492,
"grad_norm": 0.11269905418157578,
"learning_rate": 3.4771995023281044e-05,
"loss": 1.1413,
"step": 151
},
{
"epoch": 0.03827118724083939,
"grad_norm": 0.11654523015022278,
"learning_rate": 3.419745801212538e-05,
"loss": 1.1552,
"step": 152
},
{
"epoch": 0.03852297136742386,
"grad_norm": 0.12446989119052887,
"learning_rate": 3.362720175698275e-05,
"loss": 1.1491,
"step": 153
},
{
"epoch": 0.03877475549400832,
"grad_norm": 0.1188817173242569,
"learning_rate": 3.30613436054299e-05,
"loss": 1.1733,
"step": 154
},
{
"epoch": 0.03902653962059279,
"grad_norm": 0.12001082301139832,
"learning_rate": 3.250000000000001e-05,
"loss": 1.1019,
"step": 155
},
{
"epoch": 0.03927832374717726,
"grad_norm": 0.12011527270078659,
"learning_rate": 3.194328645422094e-05,
"loss": 1.1118,
"step": 156
},
{
"epoch": 0.03953010787376173,
"grad_norm": 0.1201213002204895,
"learning_rate": 3.1391317528844965e-05,
"loss": 1.1333,
"step": 157
},
{
"epoch": 0.0397818920003462,
"grad_norm": 0.1220758855342865,
"learning_rate": 3.0844206808274287e-05,
"loss": 1.096,
"step": 158
},
{
"epoch": 0.04003367612693067,
"grad_norm": 0.12169603258371353,
"learning_rate": 3.030206687718765e-05,
"loss": 1.1063,
"step": 159
},
{
"epoch": 0.04028546025351514,
"grad_norm": 0.1275063306093216,
"learning_rate": 2.9765009297372602e-05,
"loss": 1.1171,
"step": 160
},
{
"epoch": 0.04053724438009961,
"grad_norm": 0.13592584431171417,
"learning_rate": 2.92331445847683e-05,
"loss": 1.1148,
"step": 161
},
{
"epoch": 0.04078902850668408,
"grad_norm": 0.1522008776664734,
"learning_rate": 2.8706582186723417e-05,
"loss": 1.0848,
"step": 162
},
{
"epoch": 0.04104081263326855,
"grad_norm": 0.12438967078924179,
"learning_rate": 2.8185430459474105e-05,
"loss": 1.1159,
"step": 163
},
{
"epoch": 0.04129259675985302,
"grad_norm": 0.11593794822692871,
"learning_rate": 2.76697966458463e-05,
"loss": 1.1523,
"step": 164
},
{
"epoch": 0.04154438088643749,
"grad_norm": 0.1169944554567337,
"learning_rate": 2.7159786853187362e-05,
"loss": 1.1226,
"step": 165
},
{
"epoch": 0.04179616501302196,
"grad_norm": 0.1212480217218399,
"learning_rate": 2.6655506031531153e-05,
"loss": 1.1441,
"step": 166
},
{
"epoch": 0.04204794913960643,
"grad_norm": 0.12019576132297516,
"learning_rate": 2.6157057952001485e-05,
"loss": 1.1018,
"step": 167
},
{
"epoch": 0.0422997332661909,
"grad_norm": 0.12491545081138611,
"learning_rate": 2.5664545185457933e-05,
"loss": 1.1019,
"step": 168
},
{
"epoch": 0.04255151739277537,
"grad_norm": 0.13169804215431213,
"learning_rate": 2.517806908138888e-05,
"loss": 1.0623,
"step": 169
},
{
"epoch": 0.04280330151935984,
"grad_norm": 0.13421769440174103,
"learning_rate": 2.469772974705565e-05,
"loss": 1.0871,
"step": 170
},
{
"epoch": 0.04305508564594431,
"grad_norm": 0.12427503615617752,
"learning_rate": 2.422362602689254e-05,
"loss": 1.0794,
"step": 171
},
{
"epoch": 0.043306869772528775,
"grad_norm": 0.12563304603099823,
"learning_rate": 2.3755855482166378e-05,
"loss": 1.1098,
"step": 172
},
{
"epoch": 0.043558653899113245,
"grad_norm": 0.13072235882282257,
"learning_rate": 2.3294514370900542e-05,
"loss": 1.0911,
"step": 173
},
{
"epoch": 0.043810438025697715,
"grad_norm": 0.14273561537265778,
"learning_rate": 2.283969762806676e-05,
"loss": 1.109,
"step": 174
},
{
"epoch": 0.044062222152282185,
"grad_norm": 0.1899888515472412,
"learning_rate": 2.239149884604948e-05,
"loss": 0.9888,
"step": 175
},
{
"epoch": 0.044062222152282185,
"eval_loss": 1.0866016149520874,
"eval_runtime": 1.5236,
"eval_samples_per_second": 32.816,
"eval_steps_per_second": 8.532,
"step": 175
},
{
"epoch": 0.044314006278866655,
"grad_norm": 0.10499967634677887,
"learning_rate": 2.1950010255386297e-05,
"loss": 1.0887,
"step": 176
},
{
"epoch": 0.044565790405451125,
"grad_norm": 0.11740684509277344,
"learning_rate": 2.1515322705788788e-05,
"loss": 1.1305,
"step": 177
},
{
"epoch": 0.044817574532035595,
"grad_norm": 0.12669669091701508,
"learning_rate": 2.108752564744731e-05,
"loss": 1.1639,
"step": 178
},
{
"epoch": 0.045069358658620065,
"grad_norm": 0.12538516521453857,
"learning_rate": 2.0666707112624058e-05,
"loss": 1.1097,
"step": 179
},
{
"epoch": 0.045321142785204535,
"grad_norm": 0.12294058501720428,
"learning_rate": 2.025295369753761e-05,
"loss": 1.122,
"step": 180
},
{
"epoch": 0.045572926911789005,
"grad_norm": 0.12443839758634567,
"learning_rate": 1.9846350544543215e-05,
"loss": 1.0877,
"step": 181
},
{
"epoch": 0.045824711038373475,
"grad_norm": 0.12381776422262192,
"learning_rate": 1.944698132461207e-05,
"loss": 1.0927,
"step": 182
},
{
"epoch": 0.046076495164957945,
"grad_norm": 0.12740595638751984,
"learning_rate": 1.905492822011359e-05,
"loss": 1.1031,
"step": 183
},
{
"epoch": 0.046328279291542415,
"grad_norm": 0.12863051891326904,
"learning_rate": 1.8670271907903737e-05,
"loss": 1.1065,
"step": 184
},
{
"epoch": 0.046580063418126885,
"grad_norm": 0.1300704926252365,
"learning_rate": 1.829309154272346e-05,
"loss": 1.1169,
"step": 185
},
{
"epoch": 0.046831847544711355,
"grad_norm": 0.13567733764648438,
"learning_rate": 1.7923464740910063e-05,
"loss": 1.1257,
"step": 186
},
{
"epoch": 0.047083631671295825,
"grad_norm": 0.145668625831604,
"learning_rate": 1.756146756442539e-05,
"loss": 1.0824,
"step": 187
},
{
"epoch": 0.047335415797880295,
"grad_norm": 0.12303412705659866,
"learning_rate": 1.720717450520366e-05,
"loss": 1.0278,
"step": 188
},
{
"epoch": 0.047587199924464765,
"grad_norm": 0.11618991941213608,
"learning_rate": 1.686065846982261e-05,
"loss": 1.151,
"step": 189
},
{
"epoch": 0.04783898405104923,
"grad_norm": 0.11975349485874176,
"learning_rate": 1.6521990764500582e-05,
"loss": 1.1182,
"step": 190
},
{
"epoch": 0.0480907681776337,
"grad_norm": 0.12053832411766052,
"learning_rate": 1.619124108042327e-05,
"loss": 1.1471,
"step": 191
},
{
"epoch": 0.04834255230421817,
"grad_norm": 0.12356948852539062,
"learning_rate": 1.5868477479402504e-05,
"loss": 1.1284,
"step": 192
},
{
"epoch": 0.04859433643080264,
"grad_norm": 0.12398120015859604,
"learning_rate": 1.5553766379870584e-05,
"loss": 1.1457,
"step": 193
},
{
"epoch": 0.04884612055738711,
"grad_norm": 0.12426292151212692,
"learning_rate": 1.5247172543212521e-05,
"loss": 1.0898,
"step": 194
},
{
"epoch": 0.04909790468397158,
"grad_norm": 0.12630872428417206,
"learning_rate": 1.4948759060439551e-05,
"loss": 1.0753,
"step": 195
},
{
"epoch": 0.04934968881055605,
"grad_norm": 0.1257810890674591,
"learning_rate": 1.4658587339206153e-05,
"loss": 1.1022,
"step": 196
},
{
"epoch": 0.04960147293714052,
"grad_norm": 0.12942546606063843,
"learning_rate": 1.4376717091173584e-05,
"loss": 1.0655,
"step": 197
},
{
"epoch": 0.04985325706372499,
"grad_norm": 0.13958528637886047,
"learning_rate": 1.410320631972237e-05,
"loss": 1.0952,
"step": 198
},
{
"epoch": 0.05010504119030946,
"grad_norm": 0.14069540798664093,
"learning_rate": 1.38381113080164e-05,
"loss": 1.0969,
"step": 199
},
{
"epoch": 0.05035682531689393,
"grad_norm": 0.17611843347549438,
"learning_rate": 1.3581486607420874e-05,
"loss": 0.9935,
"step": 200
},
{
"epoch": 0.05035682531689393,
"eval_loss": 1.084336519241333,
"eval_runtime": 1.5232,
"eval_samples_per_second": 32.826,
"eval_steps_per_second": 8.535,
"step": 200
},
{
"epoch": 0.0506086094434784,
"grad_norm": 0.11260085552930832,
"learning_rate": 1.333338502627676e-05,
"loss": 1.139,
"step": 201
},
{
"epoch": 0.05086039357006287,
"grad_norm": 0.11994125694036484,
"learning_rate": 1.3093857619033844e-05,
"loss": 1.1337,
"step": 202
},
{
"epoch": 0.05111217769664734,
"grad_norm": 0.11982115358114243,
"learning_rate": 1.2862953675744722e-05,
"loss": 1.1353,
"step": 203
},
{
"epoch": 0.05136396182323181,
"grad_norm": 0.12615817785263062,
"learning_rate": 1.2640720711921882e-05,
"loss": 1.1105,
"step": 204
},
{
"epoch": 0.05161574594981628,
"grad_norm": 0.12592007219791412,
"learning_rate": 1.2427204458759955e-05,
"loss": 1.0824,
"step": 205
},
{
"epoch": 0.05186753007640075,
"grad_norm": 0.1267462968826294,
"learning_rate": 1.2222448853725088e-05,
"loss": 1.0621,
"step": 206
},
{
"epoch": 0.05211931420298522,
"grad_norm": 0.13077248632907867,
"learning_rate": 1.2026496031513518e-05,
"loss": 1.062,
"step": 207
},
{
"epoch": 0.05237109832956968,
"grad_norm": 0.12852057814598083,
"learning_rate": 1.1839386315381043e-05,
"loss": 1.0745,
"step": 208
},
{
"epoch": 0.05262288245615415,
"grad_norm": 0.12591220438480377,
"learning_rate": 1.1661158208845307e-05,
"loss": 1.072,
"step": 209
},
{
"epoch": 0.05287466658273862,
"grad_norm": 0.12850439548492432,
"learning_rate": 1.1491848387762514e-05,
"loss": 1.1041,
"step": 210
},
{
"epoch": 0.05312645070932309,
"grad_norm": 0.13350600004196167,
"learning_rate": 1.1331491692780279e-05,
"loss": 1.096,
"step": 211
},
{
"epoch": 0.05337823483590756,
"grad_norm": 0.14876709878444672,
"learning_rate": 1.1180121122168064e-05,
"loss": 1.0571,
"step": 212
},
{
"epoch": 0.05363001896249203,
"grad_norm": 0.12453413009643555,
"learning_rate": 1.1037767825026826e-05,
"loss": 1.0797,
"step": 213
},
{
"epoch": 0.0538818030890765,
"grad_norm": 0.1117461547255516,
"learning_rate": 1.0904461094879107e-05,
"loss": 1.0913,
"step": 214
},
{
"epoch": 0.05413358721566097,
"grad_norm": 0.11662810295820236,
"learning_rate": 1.0780228363641018e-05,
"loss": 1.1452,
"step": 215
},
{
"epoch": 0.05438537134224544,
"grad_norm": 0.12274463474750519,
"learning_rate": 1.0665095195977271e-05,
"loss": 1.1162,
"step": 216
},
{
"epoch": 0.05463715546882991,
"grad_norm": 0.12894800305366516,
"learning_rate": 1.0559085284040506e-05,
"loss": 1.0798,
"step": 217
},
{
"epoch": 0.05488893959541438,
"grad_norm": 0.12095481902360916,
"learning_rate": 1.0462220442595853e-05,
"loss": 1.0852,
"step": 218
},
{
"epoch": 0.05514072372199885,
"grad_norm": 0.12701094150543213,
"learning_rate": 1.0374520604531953e-05,
"loss": 1.0814,
"step": 219
},
{
"epoch": 0.05539250784858332,
"grad_norm": 0.12494029849767685,
"learning_rate": 1.0296003816759086e-05,
"loss": 1.086,
"step": 220
},
{
"epoch": 0.05564429197516779,
"grad_norm": 0.1267097145318985,
"learning_rate": 1.0226686236495517e-05,
"loss": 1.1057,
"step": 221
},
{
"epoch": 0.05589607610175226,
"grad_norm": 0.13075008988380432,
"learning_rate": 1.0166582127942649e-05,
"loss": 1.1317,
"step": 222
},
{
"epoch": 0.05614786022833673,
"grad_norm": 0.13261082768440247,
"learning_rate": 1.0115703859349725e-05,
"loss": 1.1039,
"step": 223
},
{
"epoch": 0.0563996443549212,
"grad_norm": 0.14049747586250305,
"learning_rate": 1.0074061900468672e-05,
"loss": 1.116,
"step": 224
},
{
"epoch": 0.05665142848150567,
"grad_norm": 0.19462983310222626,
"learning_rate": 1.0041664820399652e-05,
"loss": 0.9881,
"step": 225
},
{
"epoch": 0.05665142848150567,
"eval_loss": 1.0821517705917358,
"eval_runtime": 1.523,
"eval_samples_per_second": 32.83,
"eval_steps_per_second": 8.536,
"step": 225
},
{
"epoch": 0.056903212608090135,
"grad_norm": 0.10417667776346207,
"learning_rate": 1.0018519285827698e-05,
"loss": 1.0784,
"step": 226
},
{
"epoch": 0.057154996734674605,
"grad_norm": 0.1178843155503273,
"learning_rate": 1.000463005965084e-05,
"loss": 1.1522,
"step": 227
},
{
"epoch": 0.057406780861259075,
"grad_norm": 0.12229274213314056,
"learning_rate": 1e-05,
"loss": 1.1221,
"step": 228
}
],
"logging_steps": 1,
"max_steps": 228,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 30,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4583097096570143e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}