{
  "best_metric": 0.2740105390548706,
  "best_model_checkpoint": "miner_id_24/checkpoint-150",
  "epoch": 0.06143764079459349,
  "eval_steps": 25,
  "global_step": 150,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00040958427196395656,
      "grad_norm": 5.032626152038574,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 4.0382,
      "step": 1
    },
    {
      "epoch": 0.00040958427196395656,
      "eval_loss": 5.339817047119141,
      "eval_runtime": 6.9343,
      "eval_samples_per_second": 7.21,
      "eval_steps_per_second": 1.009,
      "step": 1
    },
    {
      "epoch": 0.0008191685439279131,
      "grad_norm": 5.231523036956787,
      "learning_rate": 5.9999999999999995e-05,
      "loss": 4.3188,
      "step": 2
    },
    {
      "epoch": 0.0012287528158918697,
      "grad_norm": 5.883774280548096,
      "learning_rate": 8.999999999999999e-05,
      "loss": 4.773,
      "step": 3
    },
    {
      "epoch": 0.0016383370878558263,
      "grad_norm": 4.877133846282959,
      "learning_rate": 0.00011999999999999999,
      "loss": 3.6963,
      "step": 4
    },
    {
      "epoch": 0.0020479213598197828,
      "grad_norm": 7.621021747589111,
      "learning_rate": 0.00015,
      "loss": 4.4737,
      "step": 5
    },
    {
      "epoch": 0.0024575056317837395,
      "grad_norm": 7.185449123382568,
      "learning_rate": 0.00017999999999999998,
      "loss": 3.2811,
      "step": 6
    },
    {
      "epoch": 0.0028670899037476962,
      "grad_norm": 6.503012180328369,
      "learning_rate": 0.00020999999999999998,
      "loss": 2.059,
      "step": 7
    },
    {
      "epoch": 0.0032766741757116525,
      "grad_norm": 4.8399810791015625,
      "learning_rate": 0.00023999999999999998,
      "loss": 0.9927,
      "step": 8
    },
    {
      "epoch": 0.0036862584476756092,
      "grad_norm": 11.872732162475586,
      "learning_rate": 0.00027,
      "loss": 1.1176,
      "step": 9
    },
    {
      "epoch": 0.0040958427196395655,
      "grad_norm": 5.976637363433838,
      "learning_rate": 0.0003,
      "loss": 0.944,
      "step": 10
    },
    {
      "epoch": 0.004505426991603522,
      "grad_norm": 4.395268440246582,
      "learning_rate": 0.0002999794957488703,
      "loss": 0.8581,
      "step": 11
    },
    {
      "epoch": 0.004915011263567479,
      "grad_norm": 3.741900682449341,
      "learning_rate": 0.0002999179886011389,
      "loss": 0.7349,
      "step": 12
    },
    {
      "epoch": 0.005324595535531436,
      "grad_norm": 2.950590133666992,
      "learning_rate": 0.0002998154953722457,
      "loss": 0.4204,
      "step": 13
    },
    {
      "epoch": 0.0057341798074953924,
      "grad_norm": 2.196455955505371,
      "learning_rate": 0.00029967204408281613,
      "loss": 0.3694,
      "step": 14
    },
    {
      "epoch": 0.006143764079459349,
      "grad_norm": 2.7689871788024902,
      "learning_rate": 0.00029948767395100045,
      "loss": 0.4119,
      "step": 15
    },
    {
      "epoch": 0.006553348351423305,
      "grad_norm": 3.9493329524993896,
      "learning_rate": 0.0002992624353817517,
      "loss": 0.6404,
      "step": 16
    },
    {
      "epoch": 0.006962932623387262,
      "grad_norm": 2.9368627071380615,
      "learning_rate": 0.0002989963899530457,
      "loss": 0.4964,
      "step": 17
    },
    {
      "epoch": 0.0073725168953512185,
      "grad_norm": 2.873953342437744,
      "learning_rate": 0.00029868961039904624,
      "loss": 0.467,
      "step": 18
    },
    {
      "epoch": 0.007782101167315175,
      "grad_norm": 4.046209812164307,
      "learning_rate": 0.00029834218059022024,
      "loss": 0.591,
      "step": 19
    },
    {
      "epoch": 0.008191685439279131,
      "grad_norm": 2.7434680461883545,
      "learning_rate": 0.00029795419551040833,
      "loss": 0.5402,
      "step": 20
    },
    {
      "epoch": 0.008601269711243089,
      "grad_norm": 2.1096296310424805,
      "learning_rate": 0.00029752576123085736,
      "loss": 0.3771,
      "step": 21
    },
    {
      "epoch": 0.009010853983207045,
      "grad_norm": 1.6251838207244873,
      "learning_rate": 0.0002970569948812214,
      "loss": 0.3872,
      "step": 22
    },
    {
      "epoch": 0.009420438255171002,
      "grad_norm": 2.345319986343384,
      "learning_rate": 0.0002965480246175399,
      "loss": 0.4481,
      "step": 23
    },
    {
      "epoch": 0.009830022527134958,
      "grad_norm": 2.1055288314819336,
      "learning_rate": 0.0002959989895872009,
      "loss": 0.4142,
      "step": 24
    },
    {
      "epoch": 0.010239606799098914,
      "grad_norm": 2.2987918853759766,
      "learning_rate": 0.0002954100398908995,
      "loss": 0.4101,
      "step": 25
    },
    {
      "epoch": 0.010239606799098914,
      "eval_loss": 0.391013503074646,
      "eval_runtime": 7.0436,
      "eval_samples_per_second": 7.099,
      "eval_steps_per_second": 0.994,
      "step": 25
    },
    {
      "epoch": 0.010649191071062871,
      "grad_norm": 1.5232685804367065,
      "learning_rate": 0.0002947813365416023,
      "loss": 0.3703,
      "step": 26
    },
    {
      "epoch": 0.011058775343026827,
      "grad_norm": 2.0562989711761475,
      "learning_rate": 0.0002941130514205272,
      "loss": 0.3834,
      "step": 27
    },
    {
      "epoch": 0.011468359614990785,
      "grad_norm": 1.3030779361724854,
      "learning_rate": 0.0002934053672301536,
      "loss": 0.2305,
      "step": 28
    },
    {
      "epoch": 0.01187794388695474,
      "grad_norm": 1.9921797513961792,
      "learning_rate": 0.00029265847744427303,
      "loss": 0.3723,
      "step": 29
    },
    {
      "epoch": 0.012287528158918698,
      "grad_norm": 1.2121187448501587,
      "learning_rate": 0.00029187258625509513,
      "loss": 0.302,
      "step": 30
    },
    {
      "epoch": 0.012697112430882654,
      "grad_norm": 2.0275485515594482,
      "learning_rate": 0.00029104790851742417,
      "loss": 0.4247,
      "step": 31
    },
    {
      "epoch": 0.01310669670284661,
      "grad_norm": 1.6529933214187622,
      "learning_rate": 0.0002901846696899191,
      "loss": 0.2814,
      "step": 32
    },
    {
      "epoch": 0.013516280974810568,
      "grad_norm": 1.8799407482147217,
      "learning_rate": 0.00028928310577345606,
      "loss": 0.3434,
      "step": 33
    },
    {
      "epoch": 0.013925865246774524,
      "grad_norm": 1.9642513990402222,
      "learning_rate": 0.0002883434632466077,
      "loss": 0.4427,
      "step": 34
    },
    {
      "epoch": 0.014335449518738481,
      "grad_norm": 1.1001381874084473,
      "learning_rate": 0.00028736599899825856,
      "loss": 0.3256,
      "step": 35
    },
    {
      "epoch": 0.014745033790702437,
      "grad_norm": 0.934917688369751,
      "learning_rate": 0.00028635098025737434,
      "loss": 0.2164,
      "step": 36
    },
    {
      "epoch": 0.015154618062666393,
      "grad_norm": 1.091238260269165,
      "learning_rate": 0.00028529868451994384,
      "loss": 0.3009,
      "step": 37
    },
    {
      "epoch": 0.01556420233463035,
      "grad_norm": 1.2641563415527344,
      "learning_rate": 0.0002842093994731145,
      "loss": 0.289,
      "step": 38
    },
    {
      "epoch": 0.015973786606594306,
      "grad_norm": 1.184543251991272,
      "learning_rate": 0.00028308342291654174,
      "loss": 0.3097,
      "step": 39
    },
    {
      "epoch": 0.016383370878558262,
      "grad_norm": 1.7306755781173706,
      "learning_rate": 0.00028192106268097334,
      "loss": 0.2913,
      "step": 40
    },
    {
      "epoch": 0.01679295515052222,
      "grad_norm": 1.6884584426879883,
      "learning_rate": 0.00028072263654409154,
      "loss": 0.3326,
      "step": 41
    },
    {
      "epoch": 0.017202539422486177,
      "grad_norm": 1.2175815105438232,
      "learning_rate": 0.0002794884721436361,
      "loss": 0.3437,
      "step": 42
    },
    {
      "epoch": 0.017612123694450133,
      "grad_norm": 1.8150399923324585,
      "learning_rate": 0.00027821890688783083,
      "loss": 0.3872,
      "step": 43
    },
    {
      "epoch": 0.01802170796641409,
      "grad_norm": 1.13820219039917,
      "learning_rate": 0.0002769142878631403,
      "loss": 0.2204,
      "step": 44
    },
    {
      "epoch": 0.018431292238378045,
      "grad_norm": 1.012037992477417,
      "learning_rate": 0.00027557497173937923,
      "loss": 0.2499,
      "step": 45
    },
    {
      "epoch": 0.018840876510342004,
      "grad_norm": 1.1927300691604614,
      "learning_rate": 0.000274201324672203,
      "loss": 0.2392,
      "step": 46
    },
    {
      "epoch": 0.01925046078230596,
      "grad_norm": 1.3892017602920532,
      "learning_rate": 0.00027279372220300385,
      "loss": 0.3714,
      "step": 47
    },
    {
      "epoch": 0.019660045054269916,
      "grad_norm": 1.2792326211929321,
      "learning_rate": 0.0002713525491562421,
      "loss": 0.3209,
      "step": 48
    },
    {
      "epoch": 0.020069629326233872,
      "grad_norm": 0.9809417128562927,
      "learning_rate": 0.00026987819953423867,
      "loss": 0.2435,
      "step": 49
    },
    {
      "epoch": 0.020479213598197828,
      "grad_norm": 1.2792021036148071,
      "learning_rate": 0.00026837107640945905,
      "loss": 0.276,
      "step": 50
    },
    {
      "epoch": 0.020479213598197828,
      "eval_loss": 0.3750147223472595,
      "eval_runtime": 7.0572,
      "eval_samples_per_second": 7.085,
      "eval_steps_per_second": 0.992,
      "step": 50
    },
    {
      "epoch": 0.020888797870161787,
      "grad_norm": 1.9061239957809448,
      "learning_rate": 0.0002668315918143169,
      "loss": 0.3689,
      "step": 51
    },
    {
      "epoch": 0.021298382142125743,
      "grad_norm": 1.625785231590271,
      "learning_rate": 0.00026526016662852886,
      "loss": 0.3729,
      "step": 52
    },
    {
      "epoch": 0.0217079664140897,
      "grad_norm": 1.5701191425323486,
      "learning_rate": 0.00026365723046405023,
      "loss": 0.3479,
      "step": 53
    },
    {
      "epoch": 0.022117550686053655,
      "grad_norm": 1.1037153005599976,
      "learning_rate": 0.0002620232215476231,
      "loss": 0.2747,
      "step": 54
    },
    {
      "epoch": 0.02252713495801761,
      "grad_norm": 1.2575494050979614,
      "learning_rate": 0.0002603585866009697,
      "loss": 0.2879,
      "step": 55
    },
    {
      "epoch": 0.02293671922998157,
      "grad_norm": 1.5166040658950806,
      "learning_rate": 0.00025866378071866334,
      "loss": 0.3112,
      "step": 56
    },
    {
      "epoch": 0.023346303501945526,
      "grad_norm": 1.4540307521820068,
      "learning_rate": 0.00025693926724370956,
      "loss": 0.4235,
      "step": 57
    },
    {
      "epoch": 0.02375588777390948,
      "grad_norm": 1.4702589511871338,
      "learning_rate": 0.00025518551764087326,
      "loss": 0.3107,
      "step": 58
    },
    {
      "epoch": 0.024165472045873437,
      "grad_norm": 2.018094539642334,
      "learning_rate": 0.00025340301136778483,
      "loss": 0.3377,
      "step": 59
    },
    {
      "epoch": 0.024575056317837397,
      "grad_norm": 1.072698950767517,
      "learning_rate": 0.00025159223574386114,
      "loss": 0.2737,
      "step": 60
    },
    {
      "epoch": 0.024984640589801353,
      "grad_norm": 2.5592000484466553,
      "learning_rate": 0.0002497536858170772,
      "loss": 0.2333,
      "step": 61
    },
    {
      "epoch": 0.02539422486176531,
      "grad_norm": 1.2713217735290527,
      "learning_rate": 0.00024788786422862526,
      "loss": 0.3628,
      "step": 62
    },
    {
      "epoch": 0.025803809133729264,
      "grad_norm": 0.9973049163818359,
      "learning_rate": 0.00024599528107549745,
      "loss": 0.2947,
      "step": 63
    },
    {
      "epoch": 0.02621339340569322,
      "grad_norm": 1.593890905380249,
      "learning_rate": 0.00024407645377103054,
      "loss": 0.3915,
      "step": 64
    },
    {
      "epoch": 0.02662297767765718,
      "grad_norm": 2.379255533218384,
      "learning_rate": 0.00024213190690345018,
      "loss": 0.3678,
      "step": 65
    },
    {
      "epoch": 0.027032561949621135,
      "grad_norm": 1.0315568447113037,
      "learning_rate": 0.00024016217209245374,
      "loss": 0.265,
      "step": 66
    },
    {
      "epoch": 0.02744214622158509,
      "grad_norm": 1.6524887084960938,
      "learning_rate": 0.00023816778784387094,
      "loss": 0.3766,
      "step": 67
    },
    {
      "epoch": 0.027851730493549047,
      "grad_norm": 1.5630675554275513,
      "learning_rate": 0.0002361492994024415,
      "loss": 0.3315,
      "step": 68
    },
    {
      "epoch": 0.028261314765513003,
      "grad_norm": 2.2701518535614014,
      "learning_rate": 0.0002341072586027509,
      "loss": 0.3104,
      "step": 69
    },
    {
      "epoch": 0.028670899037476962,
      "grad_norm": 1.7529304027557373,
      "learning_rate": 0.00023204222371836405,
      "loss": 0.4427,
      "step": 70
    },
    {
      "epoch": 0.029080483309440918,
      "grad_norm": 1.255183458328247,
      "learning_rate": 0.00022995475930919905,
      "loss": 0.2884,
      "step": 71
    },
    {
      "epoch": 0.029490067581404874,
      "grad_norm": 1.3396148681640625,
      "learning_rate": 0.00022784543606718227,
      "loss": 0.3427,
      "step": 72
    },
    {
      "epoch": 0.02989965185336883,
      "grad_norm": 1.2909964323043823,
      "learning_rate": 0.00022571483066022657,
      "loss": 0.3006,
      "step": 73
    },
    {
      "epoch": 0.030309236125332786,
      "grad_norm": 1.2211729288101196,
      "learning_rate": 0.0002235635255745762,
      "loss": 0.2923,
      "step": 74
    },
    {
      "epoch": 0.030718820397296745,
      "grad_norm": 1.8062753677368164,
      "learning_rate": 0.00022139210895556104,
      "loss": 0.4641,
      "step": 75
    },
    {
      "epoch": 0.030718820397296745,
      "eval_loss": 0.34614190459251404,
      "eval_runtime": 7.0556,
      "eval_samples_per_second": 7.087,
      "eval_steps_per_second": 0.992,
      "step": 75
    },
    {
      "epoch": 0.0311284046692607,
      "grad_norm": 1.9081662893295288,
      "learning_rate": 0.00021920117444680317,
      "loss": 0.4002,
      "step": 76
    },
    {
      "epoch": 0.03153798894122466,
      "grad_norm": 1.015621304512024,
      "learning_rate": 0.00021699132102792097,
      "loss": 0.2631,
      "step": 77
    },
    {
      "epoch": 0.03194757321318861,
      "grad_norm": 0.9762264490127563,
      "learning_rate": 0.0002147631528507739,
      "loss": 0.3077,
      "step": 78
    },
    {
      "epoch": 0.03235715748515257,
      "grad_norm": 0.9308927059173584,
      "learning_rate": 0.00021251727907429355,
      "loss": 0.317,
      "step": 79
    },
    {
      "epoch": 0.032766741757116524,
      "grad_norm": 1.0790940523147583,
      "learning_rate": 0.0002102543136979454,
      "loss": 0.3218,
      "step": 80
    },
    {
      "epoch": 0.03317632602908048,
      "grad_norm": 0.8803948163986206,
      "learning_rate": 0.0002079748753938678,
      "loss": 0.2791,
      "step": 81
    },
    {
      "epoch": 0.03358591030104444,
      "grad_norm": 0.9842001795768738,
      "learning_rate": 0.0002056795873377331,
      "loss": 0.2876,
      "step": 82
    },
    {
      "epoch": 0.0339954945730084,
      "grad_norm": 1.2645741701126099,
      "learning_rate": 0.00020336907703837748,
      "loss": 0.3469,
      "step": 83
    },
    {
      "epoch": 0.034405078844972355,
      "grad_norm": 1.3563826084136963,
      "learning_rate": 0.00020104397616624645,
      "loss": 0.2817,
      "step": 84
    },
    {
      "epoch": 0.03481466311693631,
      "grad_norm": 1.3064996004104614,
      "learning_rate": 0.00019870492038070252,
      "loss": 0.4047,
      "step": 85
    },
    {
      "epoch": 0.035224247388900266,
      "grad_norm": 1.1984425783157349,
      "learning_rate": 0.0001963525491562421,
      "loss": 0.2203,
      "step": 86
    },
    {
      "epoch": 0.03563383166086422,
      "grad_norm": 1.6247293949127197,
      "learning_rate": 0.0001939875056076697,
      "loss": 0.3285,
      "step": 87
    },
    {
      "epoch": 0.03604341593282818,
      "grad_norm": 1.0864574909210205,
      "learning_rate": 0.00019161043631427666,
      "loss": 0.3267,
      "step": 88
    },
    {
      "epoch": 0.036453000204792134,
      "grad_norm": 1.6605820655822754,
      "learning_rate": 0.00018922199114307294,
      "loss": 0.2532,
      "step": 89
    },
    {
      "epoch": 0.03686258447675609,
      "grad_norm": 1.0407780408859253,
      "learning_rate": 0.00018682282307111987,
      "loss": 0.27,
      "step": 90
    },
    {
      "epoch": 0.03727216874872005,
      "grad_norm": 1.0968496799468994,
      "learning_rate": 0.00018441358800701273,
      "loss": 0.3515,
      "step": 91
    },
    {
      "epoch": 0.03768175302068401,
      "grad_norm": 1.1806151866912842,
      "learning_rate": 0.00018199494461156203,
      "loss": 0.2406,
      "step": 92
    },
    {
      "epoch": 0.038091337292647964,
      "grad_norm": 1.2452479600906372,
      "learning_rate": 0.000179567554117722,
      "loss": 0.4016,
      "step": 93
    },
    {
      "epoch": 0.03850092156461192,
      "grad_norm": 0.8575266599655151,
      "learning_rate": 0.00017713208014981648,
      "loss": 0.2095,
      "step": 94
    },
    {
      "epoch": 0.038910505836575876,
      "grad_norm": 0.912777841091156,
      "learning_rate": 0.00017468918854211007,
      "loss": 0.2773,
      "step": 95
    },
    {
      "epoch": 0.03932009010853983,
      "grad_norm": 0.8512354493141174,
      "learning_rate": 0.00017223954715677627,
      "loss": 0.3243,
      "step": 96
    },
    {
      "epoch": 0.03972967438050379,
      "grad_norm": 0.9844990968704224,
      "learning_rate": 0.00016978382570131034,
      "loss": 0.2411,
      "step": 97
    },
    {
      "epoch": 0.040139258652467744,
      "grad_norm": 1.317114233970642,
      "learning_rate": 0.00016732269554543794,
      "loss": 0.2255,
      "step": 98
    },
    {
      "epoch": 0.0405488429244317,
      "grad_norm": 1.6339136362075806,
      "learning_rate": 0.00016485682953756942,
      "loss": 0.2351,
      "step": 99
    },
    {
      "epoch": 0.040958427196395655,
      "grad_norm": 0.9930921196937561,
      "learning_rate": 0.00016238690182084986,
      "loss": 0.2448,
      "step": 100
    },
    {
      "epoch": 0.040958427196395655,
      "eval_loss": 0.31713107228279114,
      "eval_runtime": 7.0547,
      "eval_samples_per_second": 7.087,
      "eval_steps_per_second": 0.992,
      "step": 100
    },
    {
      "epoch": 0.04136801146835962,
      "grad_norm": 0.9509855508804321,
      "learning_rate": 0.0001599135876488549,
      "loss": 0.2413,
      "step": 101
    },
    {
      "epoch": 0.041777595740323574,
      "grad_norm": 1.002145767211914,
      "learning_rate": 0.00015743756320098332,
      "loss": 0.2644,
      "step": 102
    },
    {
      "epoch": 0.04218718001228753,
      "grad_norm": 1.1608006954193115,
      "learning_rate": 0.0001549595053975962,
      "loss": 0.2308,
      "step": 103
    },
    {
      "epoch": 0.042596764284251486,
      "grad_norm": 1.313498616218567,
      "learning_rate": 0.00015248009171495378,
      "loss": 0.3244,
      "step": 104
    },
    {
      "epoch": 0.04300634855621544,
      "grad_norm": 1.3625378608703613,
      "learning_rate": 0.00015,
      "loss": 0.3126,
      "step": 105
    },
    {
      "epoch": 0.0434159328281794,
      "grad_norm": 0.931500256061554,
      "learning_rate": 0.00014751990828504622,
      "loss": 0.2741,
      "step": 106
    },
    {
      "epoch": 0.04382551710014335,
      "grad_norm": 1.7665846347808838,
      "learning_rate": 0.00014504049460240375,
      "loss": 0.332,
      "step": 107
    },
    {
      "epoch": 0.04423510137210731,
      "grad_norm": 1.2403225898742676,
      "learning_rate": 0.00014256243679901663,
      "loss": 0.3456,
      "step": 108
    },
    {
      "epoch": 0.044644685644071265,
      "grad_norm": 1.3050745725631714,
      "learning_rate": 0.00014008641235114508,
      "loss": 0.3015,
      "step": 109
    },
    {
      "epoch": 0.04505426991603522,
      "grad_norm": 1.056707501411438,
      "learning_rate": 0.00013761309817915014,
      "loss": 0.3497,
      "step": 110
    },
    {
      "epoch": 0.045463854187999184,
      "grad_norm": 1.468729019165039,
      "learning_rate": 0.00013514317046243058,
      "loss": 0.3546,
      "step": 111
    },
    {
      "epoch": 0.04587343845996314,
      "grad_norm": 1.686792254447937,
      "learning_rate": 0.00013267730445456208,
      "loss": 0.3862,
      "step": 112
    },
    {
      "epoch": 0.046283022731927095,
      "grad_norm": 1.143621563911438,
      "learning_rate": 0.00013021617429868963,
      "loss": 0.3013,
      "step": 113
    },
    {
      "epoch": 0.04669260700389105,
      "grad_norm": 1.3882373571395874,
      "learning_rate": 0.00012776045284322368,
      "loss": 0.3389,
      "step": 114
    },
    {
      "epoch": 0.04710219127585501,
      "grad_norm": 1.0089846849441528,
      "learning_rate": 0.00012531081145788987,
      "loss": 0.3643,
      "step": 115
    },
    {
      "epoch": 0.04751177554781896,
      "grad_norm": 1.1870588064193726,
      "learning_rate": 0.00012286791985018355,
      "loss": 0.3509,
      "step": 116
    },
    {
      "epoch": 0.04792135981978292,
      "grad_norm": 0.859807014465332,
      "learning_rate": 0.00012043244588227796,
      "loss": 0.2182,
      "step": 117
    },
    {
      "epoch": 0.048330944091746875,
      "grad_norm": 1.7744337320327759,
      "learning_rate": 0.00011800505538843798,
      "loss": 0.4168,
      "step": 118
    },
    {
      "epoch": 0.04874052836371083,
      "grad_norm": 1.268139123916626,
      "learning_rate": 0.00011558641199298727,
      "loss": 0.3958,
      "step": 119
    },
    {
      "epoch": 0.04915011263567479,
      "grad_norm": 1.4337314367294312,
      "learning_rate": 0.00011317717692888012,
      "loss": 0.4177,
      "step": 120
    },
    {
      "epoch": 0.04955969690763875,
      "grad_norm": 1.177187204360962,
      "learning_rate": 0.00011077800885692702,
      "loss": 0.2095,
      "step": 121
    },
    {
      "epoch": 0.049969281179602705,
      "grad_norm": 1.257442831993103,
      "learning_rate": 0.00010838956368572334,
      "loss": 0.3648,
      "step": 122
    },
    {
      "epoch": 0.05037886545156666,
      "grad_norm": 1.1356306076049805,
      "learning_rate": 0.0001060124943923303,
      "loss": 0.2471,
      "step": 123
    },
    {
      "epoch": 0.05078844972353062,
      "grad_norm": 1.0337142944335938,
      "learning_rate": 0.0001036474508437579,
      "loss": 0.2465,
      "step": 124
    },
    {
      "epoch": 0.05119803399549457,
      "grad_norm": 1.0032552480697632,
      "learning_rate": 0.00010129507961929748,
      "loss": 0.2275,
      "step": 125
    },
    {
      "epoch": 0.05119803399549457,
      "eval_loss": 0.27536189556121826,
      "eval_runtime": 7.0469,
      "eval_samples_per_second": 7.095,
      "eval_steps_per_second": 0.993,
      "step": 125
    },
    {
      "epoch": 0.05160761826745853,
      "grad_norm": 1.302047848701477,
      "learning_rate": 9.895602383375353e-05,
      "loss": 0.3416,
      "step": 126
    },
    {
      "epoch": 0.052017202539422484,
      "grad_norm": 0.766473650932312,
      "learning_rate": 9.663092296162251e-05,
      "loss": 0.2151,
      "step": 127
    },
    {
      "epoch": 0.05242678681138644,
      "grad_norm": 0.8692076802253723,
      "learning_rate": 9.432041266226686e-05,
      "loss": 0.1766,
      "step": 128
    },
    {
      "epoch": 0.052836371083350396,
      "grad_norm": 1.3593523502349854,
      "learning_rate": 9.202512460613219e-05,
      "loss": 0.3038,
      "step": 129
    },
    {
      "epoch": 0.05324595535531436,
      "grad_norm": 1.0448229312896729,
      "learning_rate": 8.97456863020546e-05,
      "loss": 0.2512,
      "step": 130
    },
    {
      "epoch": 0.053655539627278315,
      "grad_norm": 1.5594054460525513,
      "learning_rate": 8.748272092570646e-05,
      "loss": 0.2915,
      "step": 131
    },
    {
      "epoch": 0.05406512389924227,
      "grad_norm": 1.2313745021820068,
      "learning_rate": 8.523684714922608e-05,
      "loss": 0.2394,
      "step": 132
    },
    {
      "epoch": 0.054474708171206226,
      "grad_norm": 1.4682273864746094,
      "learning_rate": 8.300867897207903e-05,
      "loss": 0.3008,
      "step": 133
    },
    {
      "epoch": 0.05488429244317018,
      "grad_norm": 0.9318922758102417,
      "learning_rate": 8.079882555319684e-05,
      "loss": 0.2269,
      "step": 134
    },
    {
      "epoch": 0.05529387671513414,
      "grad_norm": 1.0974271297454834,
      "learning_rate": 7.860789104443896e-05,
      "loss": 0.3247,
      "step": 135
    },
    {
      "epoch": 0.055703460987098094,
      "grad_norm": 3.1087758541107178,
      "learning_rate": 7.643647442542382e-05,
      "loss": 0.3786,
      "step": 136
    },
    {
      "epoch": 0.05611304525906205,
      "grad_norm": 0.9446348547935486,
      "learning_rate": 7.428516933977347e-05,
      "loss": 0.252,
      "step": 137
    },
    {
      "epoch": 0.056522629531026006,
      "grad_norm": 1.5979207754135132,
      "learning_rate": 7.215456393281776e-05,
      "loss": 0.2715,
      "step": 138
    },
    {
      "epoch": 0.05693221380298997,
      "grad_norm": 2.039667844772339,
      "learning_rate": 7.004524069080096e-05,
      "loss": 0.3273,
      "step": 139
    },
    {
      "epoch": 0.057341798074953924,
      "grad_norm": 1.4651241302490234,
      "learning_rate": 6.795777628163599e-05,
      "loss": 0.3456,
      "step": 140
    },
    {
      "epoch": 0.05775138234691788,
      "grad_norm": 0.962695300579071,
      "learning_rate": 6.58927413972491e-05,
      "loss": 0.2002,
      "step": 141
    },
    {
      "epoch": 0.058160966618881836,
      "grad_norm": 0.79236900806427,
      "learning_rate": 6.385070059755846e-05,
      "loss": 0.275,
      "step": 142
    },
    {
      "epoch": 0.05857055089084579,
      "grad_norm": 1.0277087688446045,
      "learning_rate": 6.183221215612904e-05,
      "loss": 0.2996,
      "step": 143
    },
    {
      "epoch": 0.05898013516280975,
      "grad_norm": 0.698850154876709,
      "learning_rate": 5.983782790754623e-05,
      "loss": 0.2285,
      "step": 144
    },
    {
      "epoch": 0.059389719434773704,
      "grad_norm": 1.2999036312103271,
      "learning_rate": 5.786809309654982e-05,
      "loss": 0.2599,
      "step": 145
    },
    {
      "epoch": 0.05979930370673766,
      "grad_norm": 1.0272763967514038,
      "learning_rate": 5.592354622896944e-05,
      "loss": 0.3243,
      "step": 146
    },
    {
      "epoch": 0.060208887978701615,
      "grad_norm": 0.919711172580719,
      "learning_rate": 5.40047189245025e-05,
      "loss": 0.2842,
      "step": 147
    },
    {
      "epoch": 0.06061847225066557,
      "grad_norm": 0.9950523972511292,
      "learning_rate": 5.211213577137469e-05,
      "loss": 0.2792,
      "step": 148
    },
    {
      "epoch": 0.061028056522629534,
      "grad_norm": 0.7421355843544006,
      "learning_rate": 5.024631418292274e-05,
      "loss": 0.2375,
      "step": 149
    },
    {
      "epoch": 0.06143764079459349,
      "grad_norm": 0.952510416507721,
      "learning_rate": 4.840776425613886e-05,
      "loss": 0.3047,
      "step": 150
    },
    {
      "epoch": 0.06143764079459349,
      "eval_loss": 0.2740105390548706,
      "eval_runtime": 7.0485,
      "eval_samples_per_second": 7.094,
      "eval_steps_per_second": 0.993,
      "step": 150
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 1,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.625264977707008e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}