|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.934950385887541, |
|
"eval_steps": 50, |
|
"global_step": 3360, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01470047776552738, |
|
"grad_norm": 0.5667821168899536, |
|
"learning_rate": 0.0003, |
|
"loss": 14.1829, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02940095553105476, |
|
"grad_norm": 0.34381482005119324, |
|
"learning_rate": 0.0002995581737849779, |
|
"loss": 10.6392, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.044101433296582136, |
|
"grad_norm": 0.4359741806983948, |
|
"learning_rate": 0.00029911634756995577, |
|
"loss": 9.822, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05880191106210952, |
|
"grad_norm": 0.4812168478965759, |
|
"learning_rate": 0.0002986745213549337, |
|
"loss": 9.5543, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07350238882763689, |
|
"grad_norm": 0.5038645267486572, |
|
"learning_rate": 0.0002982326951399116, |
|
"loss": 9.2649, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07350238882763689, |
|
"eval_loss": 1.1481549739837646, |
|
"eval_runtime": 122.7544, |
|
"eval_samples_per_second": 7.079, |
|
"eval_steps_per_second": 0.888, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08820286659316427, |
|
"grad_norm": 0.5140169858932495, |
|
"learning_rate": 0.00029779086892488954, |
|
"loss": 9.0792, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.10290334435869165, |
|
"grad_norm": 0.5402693152427673, |
|
"learning_rate": 0.0002973490427098674, |
|
"loss": 8.9419, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.11760382212421903, |
|
"grad_norm": 0.5764286518096924, |
|
"learning_rate": 0.00029690721649484533, |
|
"loss": 8.8216, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.13230429988974643, |
|
"grad_norm": 0.5907140374183655, |
|
"learning_rate": 0.00029646539027982326, |
|
"loss": 8.7672, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.14700477765527378, |
|
"grad_norm": 0.6670350432395935, |
|
"learning_rate": 0.0002960235640648012, |
|
"loss": 8.6517, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.14700477765527378, |
|
"eval_loss": 1.0773104429244995, |
|
"eval_runtime": 122.6477, |
|
"eval_samples_per_second": 7.085, |
|
"eval_steps_per_second": 0.889, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16170525542080116, |
|
"grad_norm": 0.6163733601570129, |
|
"learning_rate": 0.00029558173784977905, |
|
"loss": 8.514, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.17640573318632854, |
|
"grad_norm": 0.6671579480171204, |
|
"learning_rate": 0.000295139911634757, |
|
"loss": 8.453, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.19110621095185593, |
|
"grad_norm": 0.6266494989395142, |
|
"learning_rate": 0.00029469808541973485, |
|
"loss": 8.432, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2058066887173833, |
|
"grad_norm": 0.6384584307670593, |
|
"learning_rate": 0.00029425625920471277, |
|
"loss": 8.4165, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2205071664829107, |
|
"grad_norm": 0.6454419493675232, |
|
"learning_rate": 0.0002938144329896907, |
|
"loss": 8.3296, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2205071664829107, |
|
"eval_loss": 1.0390021800994873, |
|
"eval_runtime": 122.6855, |
|
"eval_samples_per_second": 7.083, |
|
"eval_steps_per_second": 0.888, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.23520764424843807, |
|
"grad_norm": 0.7196301817893982, |
|
"learning_rate": 0.0002933726067746686, |
|
"loss": 8.3398, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.24990812201396545, |
|
"grad_norm": 0.6549277305603027, |
|
"learning_rate": 0.0002929307805596465, |
|
"loss": 8.2129, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.26460859977949286, |
|
"grad_norm": 0.7620148062705994, |
|
"learning_rate": 0.0002924889543446244, |
|
"loss": 8.1606, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2793090775450202, |
|
"grad_norm": 0.7137699723243713, |
|
"learning_rate": 0.00029204712812960234, |
|
"loss": 8.149, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.29400955531054757, |
|
"grad_norm": 0.8829444050788879, |
|
"learning_rate": 0.00029160530191458026, |
|
"loss": 8.1411, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.29400955531054757, |
|
"eval_loss": 1.013519048690796, |
|
"eval_runtime": 122.7693, |
|
"eval_samples_per_second": 7.078, |
|
"eval_steps_per_second": 0.888, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.308710033076075, |
|
"grad_norm": 0.7682989239692688, |
|
"learning_rate": 0.00029116347569955813, |
|
"loss": 8.0776, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.32341051084160233, |
|
"grad_norm": 0.7389448285102844, |
|
"learning_rate": 0.00029072164948453606, |
|
"loss": 8.0282, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.33811098860712974, |
|
"grad_norm": 0.6548839211463928, |
|
"learning_rate": 0.000290279823269514, |
|
"loss": 7.9301, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.3528114663726571, |
|
"grad_norm": 0.6590954065322876, |
|
"learning_rate": 0.0002898379970544919, |
|
"loss": 7.9751, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3675119441381845, |
|
"grad_norm": 0.6717875003814697, |
|
"learning_rate": 0.0002893961708394698, |
|
"loss": 7.9119, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3675119441381845, |
|
"eval_loss": 0.9925356507301331, |
|
"eval_runtime": 122.9342, |
|
"eval_samples_per_second": 7.069, |
|
"eval_steps_per_second": 0.887, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.38221242190371185, |
|
"grad_norm": 0.7246499061584473, |
|
"learning_rate": 0.0002889543446244477, |
|
"loss": 7.967, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.39691289966923926, |
|
"grad_norm": 0.6811556220054626, |
|
"learning_rate": 0.0002885125184094256, |
|
"loss": 7.8876, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4116133774347666, |
|
"grad_norm": 0.6982465386390686, |
|
"learning_rate": 0.0002880706921944035, |
|
"loss": 7.7926, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.426313855200294, |
|
"grad_norm": 0.7146357893943787, |
|
"learning_rate": 0.0002876288659793814, |
|
"loss": 7.8198, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4410143329658214, |
|
"grad_norm": 0.6905196309089661, |
|
"learning_rate": 0.00028718703976435934, |
|
"loss": 7.8613, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4410143329658214, |
|
"eval_loss": 0.9757246971130371, |
|
"eval_runtime": 122.6633, |
|
"eval_samples_per_second": 7.084, |
|
"eval_steps_per_second": 0.889, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4557148107313488, |
|
"grad_norm": 0.7458569407463074, |
|
"learning_rate": 0.00028674521354933726, |
|
"loss": 7.753, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.47041528849687614, |
|
"grad_norm": 0.6701686382293701, |
|
"learning_rate": 0.00028630338733431513, |
|
"loss": 7.7493, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.48511576626240355, |
|
"grad_norm": 0.6544702053070068, |
|
"learning_rate": 0.00028586156111929306, |
|
"loss": 7.7707, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.4998162440279309, |
|
"grad_norm": 0.7933269739151001, |
|
"learning_rate": 0.00028541973490427093, |
|
"loss": 7.6491, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5145167217934583, |
|
"grad_norm": 0.9241804480552673, |
|
"learning_rate": 0.00028497790868924885, |
|
"loss": 7.654, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5145167217934583, |
|
"eval_loss": 0.9614924788475037, |
|
"eval_runtime": 122.7436, |
|
"eval_samples_per_second": 7.08, |
|
"eval_steps_per_second": 0.888, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5292171995589857, |
|
"grad_norm": 0.7038714289665222, |
|
"learning_rate": 0.0002845360824742268, |
|
"loss": 7.6464, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.543917677324513, |
|
"grad_norm": 0.6981613636016846, |
|
"learning_rate": 0.0002840942562592047, |
|
"loss": 7.7035, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5586181550900404, |
|
"grad_norm": 0.7330546379089355, |
|
"learning_rate": 0.00028365243004418257, |
|
"loss": 7.5588, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5733186328555678, |
|
"grad_norm": 0.7032233476638794, |
|
"learning_rate": 0.0002832106038291605, |
|
"loss": 7.558, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5880191106210951, |
|
"grad_norm": 0.6604267358779907, |
|
"learning_rate": 0.0002827687776141384, |
|
"loss": 7.5529, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5880191106210951, |
|
"eval_loss": 0.9492942094802856, |
|
"eval_runtime": 122.7734, |
|
"eval_samples_per_second": 7.078, |
|
"eval_steps_per_second": 0.888, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6027195883866225, |
|
"grad_norm": 0.6671615839004517, |
|
"learning_rate": 0.00028232695139911634, |
|
"loss": 7.5444, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.61742006615215, |
|
"grad_norm": 0.8184302449226379, |
|
"learning_rate": 0.0002818851251840942, |
|
"loss": 7.55, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6321205439176774, |
|
"grad_norm": 0.7789945006370544, |
|
"learning_rate": 0.00028144329896907214, |
|
"loss": 7.5528, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6468210216832047, |
|
"grad_norm": 0.6673751473426819, |
|
"learning_rate": 0.00028100147275405006, |
|
"loss": 7.5363, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6615214994487321, |
|
"grad_norm": 0.6923938989639282, |
|
"learning_rate": 0.000280559646539028, |
|
"loss": 7.4971, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6615214994487321, |
|
"eval_loss": 0.9389350414276123, |
|
"eval_runtime": 122.7652, |
|
"eval_samples_per_second": 7.079, |
|
"eval_steps_per_second": 0.888, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6762219772142595, |
|
"grad_norm": 0.7583535313606262, |
|
"learning_rate": 0.00028011782032400586, |
|
"loss": 7.5674, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.6909224549797869, |
|
"grad_norm": 0.8926804065704346, |
|
"learning_rate": 0.0002796759941089838, |
|
"loss": 7.494, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7056229327453142, |
|
"grad_norm": 0.6774755716323853, |
|
"learning_rate": 0.0002792341678939617, |
|
"loss": 7.4429, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7203234105108416, |
|
"grad_norm": 0.7251358032226562, |
|
"learning_rate": 0.00027879234167893963, |
|
"loss": 7.4177, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.735023888276369, |
|
"grad_norm": 0.7453545928001404, |
|
"learning_rate": 0.0002783505154639175, |
|
"loss": 7.4214, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.735023888276369, |
|
"eval_loss": 0.9294928312301636, |
|
"eval_runtime": 122.745, |
|
"eval_samples_per_second": 7.08, |
|
"eval_steps_per_second": 0.888, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7497243660418964, |
|
"grad_norm": 0.7907696962356567, |
|
"learning_rate": 0.0002779086892488954, |
|
"loss": 7.4205, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.7644248438074237, |
|
"grad_norm": 0.8346834182739258, |
|
"learning_rate": 0.00027746686303387335, |
|
"loss": 7.3813, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.7791253215729511, |
|
"grad_norm": 0.6804636120796204, |
|
"learning_rate": 0.0002770250368188512, |
|
"loss": 7.381, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.7938257993384785, |
|
"grad_norm": 0.6866320371627808, |
|
"learning_rate": 0.00027658321060382914, |
|
"loss": 7.3764, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8085262771040059, |
|
"grad_norm": 0.669920027256012, |
|
"learning_rate": 0.000276141384388807, |
|
"loss": 7.3525, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8085262771040059, |
|
"eval_loss": 0.9208266735076904, |
|
"eval_runtime": 122.7608, |
|
"eval_samples_per_second": 7.079, |
|
"eval_steps_per_second": 0.888, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8232267548695332, |
|
"grad_norm": 0.6660575866699219, |
|
"learning_rate": 0.00027569955817378493, |
|
"loss": 7.3205, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8379272326350606, |
|
"grad_norm": 0.6585595011711121, |
|
"learning_rate": 0.00027525773195876286, |
|
"loss": 7.3378, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.852627710400588, |
|
"grad_norm": 0.6798352003097534, |
|
"learning_rate": 0.0002748159057437408, |
|
"loss": 7.3742, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.8673281881661153, |
|
"grad_norm": 0.6834390759468079, |
|
"learning_rate": 0.00027437407952871865, |
|
"loss": 7.3129, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.8820286659316428, |
|
"grad_norm": 0.6835374236106873, |
|
"learning_rate": 0.0002739322533136966, |
|
"loss": 7.3155, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8820286659316428, |
|
"eval_loss": 0.914364218711853, |
|
"eval_runtime": 122.6825, |
|
"eval_samples_per_second": 7.083, |
|
"eval_steps_per_second": 0.888, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8967291436971702, |
|
"grad_norm": 0.6754274368286133, |
|
"learning_rate": 0.0002734904270986745, |
|
"loss": 7.2469, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9114296214626976, |
|
"grad_norm": 0.685310959815979, |
|
"learning_rate": 0.0002730486008836524, |
|
"loss": 7.2842, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9261300992282249, |
|
"grad_norm": 0.6500253081321716, |
|
"learning_rate": 0.0002726067746686303, |
|
"loss": 7.2391, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9408305769937523, |
|
"grad_norm": 0.6883764863014221, |
|
"learning_rate": 0.0002721649484536082, |
|
"loss": 7.2422, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9555310547592797, |
|
"grad_norm": 0.6952683329582214, |
|
"learning_rate": 0.00027172312223858614, |
|
"loss": 7.2299, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9555310547592797, |
|
"eval_loss": 0.9077203273773193, |
|
"eval_runtime": 122.8456, |
|
"eval_samples_per_second": 7.074, |
|
"eval_steps_per_second": 0.887, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9702315325248071, |
|
"grad_norm": 0.689813494682312, |
|
"learning_rate": 0.00027128129602356407, |
|
"loss": 7.195, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.9849320102903344, |
|
"grad_norm": 0.7432010173797607, |
|
"learning_rate": 0.00027083946980854194, |
|
"loss": 7.2269, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.9996324880558618, |
|
"grad_norm": 0.6559237837791443, |
|
"learning_rate": 0.00027039764359351986, |
|
"loss": 7.2033, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.0132304299889747, |
|
"grad_norm": 0.6594626307487488, |
|
"learning_rate": 0.0002699558173784978, |
|
"loss": 6.3891, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.027930907754502, |
|
"grad_norm": 0.7604880928993225, |
|
"learning_rate": 0.0002695139911634757, |
|
"loss": 6.9323, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.027930907754502, |
|
"eval_loss": 0.9017989039421082, |
|
"eval_runtime": 122.7598, |
|
"eval_samples_per_second": 7.079, |
|
"eval_steps_per_second": 0.888, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0426313855200293, |
|
"grad_norm": 0.6518153548240662, |
|
"learning_rate": 0.0002690721649484536, |
|
"loss": 6.8683, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.0573318632855568, |
|
"grad_norm": 0.6659955978393555, |
|
"learning_rate": 0.0002686303387334315, |
|
"loss": 6.8752, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.0720323410510841, |
|
"grad_norm": 0.6533622145652771, |
|
"learning_rate": 0.00026818851251840943, |
|
"loss": 6.9249, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.0867328188166114, |
|
"grad_norm": 0.6731697916984558, |
|
"learning_rate": 0.00026774668630338735, |
|
"loss": 6.8896, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.101433296582139, |
|
"grad_norm": 0.6576088070869446, |
|
"learning_rate": 0.0002673048600883652, |
|
"loss": 6.9015, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.101433296582139, |
|
"eval_loss": 0.8985611200332642, |
|
"eval_runtime": 122.7501, |
|
"eval_samples_per_second": 7.079, |
|
"eval_steps_per_second": 0.888, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.1161337743476663, |
|
"grad_norm": 0.6596420407295227, |
|
"learning_rate": 0.0002668630338733431, |
|
"loss": 6.9081, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.1308342521131938, |
|
"grad_norm": 0.6679617762565613, |
|
"learning_rate": 0.000266421207658321, |
|
"loss": 6.8619, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.145534729878721, |
|
"grad_norm": 0.6485927700996399, |
|
"learning_rate": 0.00026597938144329894, |
|
"loss": 6.9068, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.1602352076442484, |
|
"grad_norm": 0.679757297039032, |
|
"learning_rate": 0.00026553755522827686, |
|
"loss": 6.9091, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.1749356854097759, |
|
"grad_norm": 0.7027860879898071, |
|
"learning_rate": 0.00026509572901325473, |
|
"loss": 6.9184, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.1749356854097759, |
|
"eval_loss": 0.8939267992973328, |
|
"eval_runtime": 122.7304, |
|
"eval_samples_per_second": 7.081, |
|
"eval_steps_per_second": 0.888, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.1896361631753032, |
|
"grad_norm": 0.651599645614624, |
|
"learning_rate": 0.00026465390279823266, |
|
"loss": 6.8758, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.2043366409408305, |
|
"grad_norm": 0.6500714421272278, |
|
"learning_rate": 0.0002642120765832106, |
|
"loss": 6.8177, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.219037118706358, |
|
"grad_norm": 0.7008342146873474, |
|
"learning_rate": 0.0002637702503681885, |
|
"loss": 6.8862, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.2337375964718853, |
|
"grad_norm": 0.6531012058258057, |
|
"learning_rate": 0.0002633284241531664, |
|
"loss": 6.8651, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.2484380742374128, |
|
"grad_norm": 0.6673538684844971, |
|
"learning_rate": 0.0002628865979381443, |
|
"loss": 6.8483, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.2484380742374128, |
|
"eval_loss": 0.8902646899223328, |
|
"eval_runtime": 122.8026, |
|
"eval_samples_per_second": 7.076, |
|
"eval_steps_per_second": 0.888, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.2631385520029401, |
|
"grad_norm": 0.6538046002388, |
|
"learning_rate": 0.0002624447717231222, |
|
"loss": 6.8312, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.2778390297684674, |
|
"grad_norm": 0.7091453671455383, |
|
"learning_rate": 0.00026200294550810015, |
|
"loss": 6.8254, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.292539507533995, |
|
"grad_norm": 0.6591873168945312, |
|
"learning_rate": 0.000261561119293078, |
|
"loss": 6.8611, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.3072399852995222, |
|
"grad_norm": 0.6869573593139648, |
|
"learning_rate": 0.00026111929307805594, |
|
"loss": 6.8559, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.3219404630650495, |
|
"grad_norm": 0.6487476229667664, |
|
"learning_rate": 0.00026067746686303387, |
|
"loss": 6.841, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.3219404630650495, |
|
"eval_loss": 0.8851333260536194, |
|
"eval_runtime": 122.7209, |
|
"eval_samples_per_second": 7.081, |
|
"eval_steps_per_second": 0.888, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.336640940830577, |
|
"grad_norm": 0.957991361618042, |
|
"learning_rate": 0.0002602356406480118, |
|
"loss": 6.8572, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.3513414185961043, |
|
"grad_norm": 0.6700526475906372, |
|
"learning_rate": 0.00025979381443298966, |
|
"loss": 6.8354, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.3660418963616316, |
|
"grad_norm": 0.6402295827865601, |
|
"learning_rate": 0.0002593519882179676, |
|
"loss": 6.7743, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.3807423741271592, |
|
"grad_norm": 0.6479661464691162, |
|
"learning_rate": 0.0002589101620029455, |
|
"loss": 6.8425, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.3954428518926865, |
|
"grad_norm": 0.638687789440155, |
|
"learning_rate": 0.00025846833578792343, |
|
"loss": 6.8481, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.3954428518926865, |
|
"eval_loss": 0.881273627281189, |
|
"eval_runtime": 122.7019, |
|
"eval_samples_per_second": 7.082, |
|
"eval_steps_per_second": 0.888, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.4101433296582138, |
|
"grad_norm": 0.6479136347770691, |
|
"learning_rate": 0.0002580265095729013, |
|
"loss": 6.851, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.4248438074237413, |
|
"grad_norm": 0.6393547654151917, |
|
"learning_rate": 0.0002575846833578792, |
|
"loss": 6.7861, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.4395442851892686, |
|
"grad_norm": 0.6324586272239685, |
|
"learning_rate": 0.0002571428571428571, |
|
"loss": 6.8184, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.454244762954796, |
|
"grad_norm": 0.6082057952880859, |
|
"learning_rate": 0.000256701030927835, |
|
"loss": 6.7677, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.4689452407203234, |
|
"grad_norm": 0.6476999521255493, |
|
"learning_rate": 0.00025625920471281295, |
|
"loss": 6.846, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.4689452407203234, |
|
"eval_loss": 0.877618134021759, |
|
"eval_runtime": 122.7086, |
|
"eval_samples_per_second": 7.082, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.483645718485851, |
|
"grad_norm": 0.6280727386474609, |
|
"learning_rate": 0.0002558173784977908, |
|
"loss": 6.785, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.4983461962513782, |
|
"grad_norm": 0.6497882604598999, |
|
"learning_rate": 0.00025537555228276874, |
|
"loss": 6.7953, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.5130466740169055, |
|
"grad_norm": 0.6304103136062622, |
|
"learning_rate": 0.00025493372606774666, |
|
"loss": 6.7844, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.527747151782433, |
|
"grad_norm": 0.6390955448150635, |
|
"learning_rate": 0.0002544918998527246, |
|
"loss": 6.8057, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.5424476295479603, |
|
"grad_norm": 0.6420630812644958, |
|
"learning_rate": 0.00025405007363770246, |
|
"loss": 6.772, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.5424476295479603, |
|
"eval_loss": 0.8740328550338745, |
|
"eval_runtime": 122.7668, |
|
"eval_samples_per_second": 7.078, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.5571481073134876, |
|
"grad_norm": 0.6527525186538696, |
|
"learning_rate": 0.0002536082474226804, |
|
"loss": 6.7597, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.5718485850790151, |
|
"grad_norm": 0.63811856508255, |
|
"learning_rate": 0.0002531664212076583, |
|
"loss": 6.7273, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.5865490628445424, |
|
"grad_norm": 0.6332401037216187, |
|
"learning_rate": 0.00025272459499263623, |
|
"loss": 6.7035, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.6012495406100697, |
|
"grad_norm": 0.631254255771637, |
|
"learning_rate": 0.0002522827687776141, |
|
"loss": 6.7374, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.6159500183755973, |
|
"grad_norm": 0.5980453491210938, |
|
"learning_rate": 0.000251840942562592, |
|
"loss": 6.7379, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.6159500183755973, |
|
"eval_loss": 0.8703284859657288, |
|
"eval_runtime": 122.7173, |
|
"eval_samples_per_second": 7.081, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.6306504961411246, |
|
"grad_norm": 0.6735087633132935, |
|
"learning_rate": 0.00025139911634756995, |
|
"loss": 6.729, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.6453509739066519, |
|
"grad_norm": 0.629978597164154, |
|
"learning_rate": 0.00025095729013254787, |
|
"loss": 6.7366, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.6600514516721794, |
|
"grad_norm": 0.6302997469902039, |
|
"learning_rate": 0.00025051546391752574, |
|
"loss": 6.743, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.6747519294377067, |
|
"grad_norm": 0.6438983082771301, |
|
"learning_rate": 0.00025007363770250367, |
|
"loss": 6.7066, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.689452407203234, |
|
"grad_norm": 0.6278811097145081, |
|
"learning_rate": 0.0002496318114874816, |
|
"loss": 6.7407, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.689452407203234, |
|
"eval_loss": 0.8671343922615051, |
|
"eval_runtime": 122.7608, |
|
"eval_samples_per_second": 7.079, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.7041528849687615, |
|
"grad_norm": 0.625558614730835, |
|
"learning_rate": 0.0002491899852724595, |
|
"loss": 6.7356, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.718853362734289, |
|
"grad_norm": 0.6096333861351013, |
|
"learning_rate": 0.0002487481590574374, |
|
"loss": 6.7057, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.733553840499816, |
|
"grad_norm": 0.619067370891571, |
|
"learning_rate": 0.00024830633284241525, |
|
"loss": 6.6987, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.7482543182653436, |
|
"grad_norm": 0.6105065941810608, |
|
"learning_rate": 0.0002478645066273932, |
|
"loss": 6.6905, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.7629547960308711, |
|
"grad_norm": 0.6466421484947205, |
|
"learning_rate": 0.0002474226804123711, |
|
"loss": 6.714, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.7629547960308711, |
|
"eval_loss": 0.8641793131828308, |
|
"eval_runtime": 122.7219, |
|
"eval_samples_per_second": 7.081, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.7776552737963984, |
|
"grad_norm": 0.6360054612159729, |
|
"learning_rate": 0.000246980854197349, |
|
"loss": 6.7256, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.7923557515619257, |
|
"grad_norm": 0.604234516620636, |
|
"learning_rate": 0.0002465390279823269, |
|
"loss": 6.7378, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.8070562293274532, |
|
"grad_norm": 0.5942307710647583, |
|
"learning_rate": 0.0002460972017673048, |
|
"loss": 6.6839, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.8217567070929805, |
|
"grad_norm": 0.6054413318634033, |
|
"learning_rate": 0.00024565537555228275, |
|
"loss": 6.6793, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.8364571848585078, |
|
"grad_norm": 0.6159862279891968, |
|
"learning_rate": 0.00024521354933726067, |
|
"loss": 6.7124, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.8364571848585078, |
|
"eval_loss": 0.8611465096473694, |
|
"eval_runtime": 122.7049, |
|
"eval_samples_per_second": 7.082, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.8511576626240354, |
|
"grad_norm": 0.6350423693656921, |
|
"learning_rate": 0.00024477172312223854, |
|
"loss": 6.6738, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.8658581403895627, |
|
"grad_norm": 0.6083235740661621, |
|
"learning_rate": 0.00024432989690721646, |
|
"loss": 6.6895, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.88055861815509, |
|
"grad_norm": 0.6661822199821472, |
|
"learning_rate": 0.0002438880706921944, |
|
"loss": 6.6935, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.8952590959206175, |
|
"grad_norm": 0.6287885308265686, |
|
"learning_rate": 0.00024344624447717228, |
|
"loss": 6.6507, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.9099595736861448, |
|
"grad_norm": 0.6344292759895325, |
|
"learning_rate": 0.0002430044182621502, |
|
"loss": 6.6706, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.9099595736861448, |
|
"eval_loss": 0.8582596778869629, |
|
"eval_runtime": 122.7802, |
|
"eval_samples_per_second": 7.078, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.924660051451672, |
|
"grad_norm": 0.6139386892318726, |
|
"learning_rate": 0.0002425625920471281, |
|
"loss": 6.6578, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.9393605292171996, |
|
"grad_norm": 0.6080993413925171, |
|
"learning_rate": 0.00024212076583210603, |
|
"loss": 6.6478, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.954061006982727, |
|
"grad_norm": 0.6097526550292969, |
|
"learning_rate": 0.00024167893961708393, |
|
"loss": 6.6307, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.9687614847482542, |
|
"grad_norm": 0.603585958480835, |
|
"learning_rate": 0.00024123711340206185, |
|
"loss": 6.6696, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.9834619625137817, |
|
"grad_norm": 0.6306211948394775, |
|
"learning_rate": 0.00024079528718703975, |
|
"loss": 6.6335, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.9834619625137817, |
|
"eval_loss": 0.8551880121231079, |
|
"eval_runtime": 122.671, |
|
"eval_samples_per_second": 7.084, |
|
"eval_steps_per_second": 0.889, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.9981624402793092, |
|
"grad_norm": 0.6002718806266785, |
|
"learning_rate": 0.00024035346097201767, |
|
"loss": 6.6681, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.011760382212422, |
|
"grad_norm": 0.609912097454071, |
|
"learning_rate": 0.00023991163475699557, |
|
"loss": 5.7921, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.0264608599779494, |
|
"grad_norm": 0.631758451461792, |
|
"learning_rate": 0.0002394698085419735, |
|
"loss": 6.2318, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.0411613377434765, |
|
"grad_norm": 0.6468050479888916, |
|
"learning_rate": 0.00023902798232695136, |
|
"loss": 6.2074, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.055861815509004, |
|
"grad_norm": 0.6332272887229919, |
|
"learning_rate": 0.00023858615611192926, |
|
"loss": 6.2245, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.055861815509004, |
|
"eval_loss": 0.8581995368003845, |
|
"eval_runtime": 122.7582, |
|
"eval_samples_per_second": 7.079, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.0705622932745316, |
|
"grad_norm": 0.6591159105300903, |
|
"learning_rate": 0.00023814432989690718, |
|
"loss": 6.1981, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.0852627710400586, |
|
"grad_norm": 0.634479820728302, |
|
"learning_rate": 0.00023770250368188508, |
|
"loss": 6.2003, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.099963248805586, |
|
"grad_norm": 0.6693385243415833, |
|
"learning_rate": 0.000237260677466863, |
|
"loss": 6.2305, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.1146637265711137, |
|
"grad_norm": 0.68646639585495, |
|
"learning_rate": 0.0002368188512518409, |
|
"loss": 6.2214, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.1293642043366408, |
|
"grad_norm": 0.6264299750328064, |
|
"learning_rate": 0.00023637702503681883, |
|
"loss": 6.1968, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.1293642043366408, |
|
"eval_loss": 0.8580149412155151, |
|
"eval_runtime": 122.7321, |
|
"eval_samples_per_second": 7.08, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.1440646821021683, |
|
"grad_norm": 0.6439588069915771, |
|
"learning_rate": 0.00023593519882179672, |
|
"loss": 6.2362, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.158765159867696, |
|
"grad_norm": 0.6767362952232361, |
|
"learning_rate": 0.00023549337260677465, |
|
"loss": 6.2137, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.173465637633223, |
|
"grad_norm": 0.6421840786933899, |
|
"learning_rate": 0.00023505154639175254, |
|
"loss": 6.2331, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.1881661153987504, |
|
"grad_norm": 0.6356670260429382, |
|
"learning_rate": 0.00023460972017673047, |
|
"loss": 6.1292, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.202866593164278, |
|
"grad_norm": 0.6405894756317139, |
|
"learning_rate": 0.00023416789396170837, |
|
"loss": 6.212, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.202866593164278, |
|
"eval_loss": 0.8558148741722107, |
|
"eval_runtime": 122.8325, |
|
"eval_samples_per_second": 7.075, |
|
"eval_steps_per_second": 0.887, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.217567070929805, |
|
"grad_norm": 0.640738308429718, |
|
"learning_rate": 0.0002337260677466863, |
|
"loss": 6.2344, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.2322675486953325, |
|
"grad_norm": 0.6253748536109924, |
|
"learning_rate": 0.0002332842415316642, |
|
"loss": 6.2409, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.24696802646086, |
|
"grad_norm": 0.6264599561691284, |
|
"learning_rate": 0.0002328424153166421, |
|
"loss": 6.2622, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.2616685042263875, |
|
"grad_norm": 0.6570128202438354, |
|
"learning_rate": 0.00023240058910162, |
|
"loss": 6.2325, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.2763689819919146, |
|
"grad_norm": 0.6414036750793457, |
|
"learning_rate": 0.00023195876288659793, |
|
"loss": 6.2025, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.2763689819919146, |
|
"eval_loss": 0.8544706702232361, |
|
"eval_runtime": 122.7389, |
|
"eval_samples_per_second": 7.08, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.291069459757442, |
|
"grad_norm": 0.6400018930435181, |
|
"learning_rate": 0.00023151693667157583, |
|
"loss": 6.2232, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.3057699375229697, |
|
"grad_norm": 0.657616376876831, |
|
"learning_rate": 0.00023107511045655375, |
|
"loss": 6.1867, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.3204704152884967, |
|
"grad_norm": 0.6157673001289368, |
|
"learning_rate": 0.00023063328424153165, |
|
"loss": 6.2029, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.3351708930540243, |
|
"grad_norm": 0.6498163342475891, |
|
"learning_rate": 0.00023019145802650957, |
|
"loss": 6.2112, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.3498713708195518, |
|
"grad_norm": 0.6593335866928101, |
|
"learning_rate": 0.00022974963181148747, |
|
"loss": 6.1991, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.3498713708195518, |
|
"eval_loss": 0.8528178930282593, |
|
"eval_runtime": 122.9309, |
|
"eval_samples_per_second": 7.069, |
|
"eval_steps_per_second": 0.887, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.364571848585079, |
|
"grad_norm": 0.6478355526924133, |
|
"learning_rate": 0.00022930780559646534, |
|
"loss": 6.2688, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.3792723263506064, |
|
"grad_norm": 0.6342290043830872, |
|
"learning_rate": 0.00022886597938144327, |
|
"loss": 6.2411, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.393972804116134, |
|
"grad_norm": 0.6611024737358093, |
|
"learning_rate": 0.00022842415316642116, |
|
"loss": 6.233, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.408673281881661, |
|
"grad_norm": 0.6265544295310974, |
|
"learning_rate": 0.0002279823269513991, |
|
"loss": 6.1916, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.4233737596471885, |
|
"grad_norm": 0.6478269696235657, |
|
"learning_rate": 0.00022754050073637698, |
|
"loss": 6.1738, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.4233737596471885, |
|
"eval_loss": 0.8510602712631226, |
|
"eval_runtime": 122.8125, |
|
"eval_samples_per_second": 7.076, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.438074237412716, |
|
"grad_norm": 0.6446574926376343, |
|
"learning_rate": 0.0002270986745213549, |
|
"loss": 6.2713, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.4527747151782435, |
|
"grad_norm": 0.6360475420951843, |
|
"learning_rate": 0.0002266568483063328, |
|
"loss": 6.2584, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.4674751929437706, |
|
"grad_norm": 0.6492727398872375, |
|
"learning_rate": 0.00022621502209131073, |
|
"loss": 6.2234, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.482175670709298, |
|
"grad_norm": 0.6489901542663574, |
|
"learning_rate": 0.00022577319587628863, |
|
"loss": 6.2155, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.4968761484748256, |
|
"grad_norm": 0.6320230960845947, |
|
"learning_rate": 0.00022533136966126655, |
|
"loss": 6.1839, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.4968761484748256, |
|
"eval_loss": 0.8486818075180054, |
|
"eval_runtime": 122.741, |
|
"eval_samples_per_second": 7.08, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.5115766262403527, |
|
"grad_norm": 0.642189621925354, |
|
"learning_rate": 0.00022488954344624445, |
|
"loss": 6.1757, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.5262771040058802, |
|
"grad_norm": 0.6360085010528564, |
|
"learning_rate": 0.00022444771723122237, |
|
"loss": 6.2302, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.5409775817714078, |
|
"grad_norm": 0.6216627359390259, |
|
"learning_rate": 0.00022400589101620027, |
|
"loss": 6.2138, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.555678059536935, |
|
"grad_norm": 0.6285056471824646, |
|
"learning_rate": 0.0002235640648011782, |
|
"loss": 6.2394, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.5703785373024624, |
|
"grad_norm": 0.6265233755111694, |
|
"learning_rate": 0.0002231222385861561, |
|
"loss": 6.2358, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.5703785373024624, |
|
"eval_loss": 0.846447765827179, |
|
"eval_runtime": 122.9508, |
|
"eval_samples_per_second": 7.068, |
|
"eval_steps_per_second": 0.887, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.58507901506799, |
|
"grad_norm": 0.6178449988365173, |
|
"learning_rate": 0.00022268041237113401, |
|
"loss": 6.2289, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.599779492833517, |
|
"grad_norm": 0.6336231827735901, |
|
"learning_rate": 0.0002222385861561119, |
|
"loss": 6.1892, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.6144799705990445, |
|
"grad_norm": 0.6177706718444824, |
|
"learning_rate": 0.00022179675994108984, |
|
"loss": 6.2211, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.629180448364572, |
|
"grad_norm": 0.6471459269523621, |
|
"learning_rate": 0.00022135493372606773, |
|
"loss": 6.2231, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.643880926130099, |
|
"grad_norm": 0.6517642140388489, |
|
"learning_rate": 0.00022091310751104566, |
|
"loss": 6.2072, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.643880926130099, |
|
"eval_loss": 0.8459606170654297, |
|
"eval_runtime": 122.7829, |
|
"eval_samples_per_second": 7.078, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.6585814038956266, |
|
"grad_norm": 0.6251035332679749, |
|
"learning_rate": 0.00022047128129602355, |
|
"loss": 6.2043, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.673281881661154, |
|
"grad_norm": 0.6282381415367126, |
|
"learning_rate": 0.00022002945508100145, |
|
"loss": 6.2106, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.687982359426681, |
|
"grad_norm": 0.6424158811569214, |
|
"learning_rate": 0.00021958762886597935, |
|
"loss": 6.1704, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.7026828371922087, |
|
"grad_norm": 0.6325989365577698, |
|
"learning_rate": 0.00021914580265095727, |
|
"loss": 6.2082, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.717383314957736, |
|
"grad_norm": 0.6277638673782349, |
|
"learning_rate": 0.00021870397643593517, |
|
"loss": 6.2104, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.717383314957736, |
|
"eval_loss": 0.8437173366546631, |
|
"eval_runtime": 122.7354, |
|
"eval_samples_per_second": 7.08, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.7320837927232633, |
|
"grad_norm": 0.59511798620224, |
|
"learning_rate": 0.00021826215022091307, |
|
"loss": 6.2104, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.746784270488791, |
|
"grad_norm": 0.6076375246047974, |
|
"learning_rate": 0.000217820324005891, |
|
"loss": 6.2272, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.7614847482543183, |
|
"grad_norm": 0.6336222290992737, |
|
"learning_rate": 0.0002173784977908689, |
|
"loss": 6.2629, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.7761852260198454, |
|
"grad_norm": 0.6477245092391968, |
|
"learning_rate": 0.0002169366715758468, |
|
"loss": 6.2086, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.790885703785373, |
|
"grad_norm": 0.63583904504776, |
|
"learning_rate": 0.0002164948453608247, |
|
"loss": 6.2289, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.790885703785373, |
|
"eval_loss": 0.8419505953788757, |
|
"eval_runtime": 122.7793, |
|
"eval_samples_per_second": 7.078, |
|
"eval_steps_per_second": 0.888, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.8055861815509004, |
|
"grad_norm": 0.6694473028182983, |
|
"learning_rate": 0.00021605301914580263, |
|
"loss": 6.2386, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.8202866593164275, |
|
"grad_norm": 0.6175634860992432, |
|
"learning_rate": 0.00021561119293078053, |
|
"loss": 6.1825, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.834987137081955, |
|
"grad_norm": 0.6627750992774963, |
|
"learning_rate": 0.00021516936671575845, |
|
"loss": 6.204, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.8496876148474826, |
|
"grad_norm": 0.6206278800964355, |
|
"learning_rate": 0.00021472754050073635, |
|
"loss": 6.1706, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.86438809261301, |
|
"grad_norm": 0.6408950686454773, |
|
"learning_rate": 0.00021428571428571427, |
|
"loss": 6.2532, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.86438809261301, |
|
"eval_loss": 0.8408024907112122, |
|
"eval_runtime": 122.8418, |
|
"eval_samples_per_second": 7.074, |
|
"eval_steps_per_second": 0.887, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.879088570378537, |
|
"grad_norm": 0.6331806778907776, |
|
"learning_rate": 0.00021384388807069217, |
|
"loss": 6.1977, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.8937890481440647, |
|
"grad_norm": 0.6263200640678406, |
|
"learning_rate": 0.0002134020618556701, |
|
"loss": 6.2267, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.908489525909592, |
|
"grad_norm": 0.6077616214752197, |
|
"learning_rate": 0.000212960235640648, |
|
"loss": 6.2016, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.9231900036751193, |
|
"grad_norm": 0.6300930976867676, |
|
"learning_rate": 0.00021251840942562592, |
|
"loss": 6.1878, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.937890481440647, |
|
"grad_norm": 0.6281541585922241, |
|
"learning_rate": 0.00021207658321060381, |
|
"loss": 6.2084, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.937890481440647, |
|
"eval_loss": 0.8391561508178711, |
|
"eval_runtime": 122.7198, |
|
"eval_samples_per_second": 7.081, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.9525909592061743, |
|
"grad_norm": 0.6204951405525208, |
|
"learning_rate": 0.00021163475699558174, |
|
"loss": 6.2271, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.967291436971702, |
|
"grad_norm": 0.6341344118118286, |
|
"learning_rate": 0.00021119293078055964, |
|
"loss": 6.2171, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.981991914737229, |
|
"grad_norm": 0.6163744330406189, |
|
"learning_rate": 0.00021075110456553753, |
|
"loss": 6.2194, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.9966923925027564, |
|
"grad_norm": 0.635434627532959, |
|
"learning_rate": 0.00021030927835051543, |
|
"loss": 6.1609, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.010290334435869, |
|
"grad_norm": 0.8156677484512329, |
|
"learning_rate": 0.00020986745213549335, |
|
"loss": 5.3662, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.010290334435869, |
|
"eval_loss": 0.8518180251121521, |
|
"eval_runtime": 122.7832, |
|
"eval_samples_per_second": 7.078, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.0249908122013967, |
|
"grad_norm": 0.6709501147270203, |
|
"learning_rate": 0.00020942562592047125, |
|
"loss": 5.657, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.0396912899669237, |
|
"grad_norm": 0.6364277601242065, |
|
"learning_rate": 0.00020898379970544917, |
|
"loss": 5.6556, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.0543917677324512, |
|
"grad_norm": 0.6877652406692505, |
|
"learning_rate": 0.00020854197349042707, |
|
"loss": 5.6971, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.0690922454979788, |
|
"grad_norm": 0.6691612005233765, |
|
"learning_rate": 0.000208100147275405, |
|
"loss": 5.6758, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.083792723263506, |
|
"grad_norm": 0.6702853441238403, |
|
"learning_rate": 0.0002076583210603829, |
|
"loss": 5.6567, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.083792723263506, |
|
"eval_loss": 0.8499862551689148, |
|
"eval_runtime": 122.6921, |
|
"eval_samples_per_second": 7.083, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.0984932010290334, |
|
"grad_norm": 0.6671494245529175, |
|
"learning_rate": 0.0002072164948453608, |
|
"loss": 5.6459, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.113193678794561, |
|
"grad_norm": 0.6615011692047119, |
|
"learning_rate": 0.00020677466863033871, |
|
"loss": 5.6901, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.1278941565600884, |
|
"grad_norm": 0.6596587300300598, |
|
"learning_rate": 0.0002063328424153166, |
|
"loss": 5.6881, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.1425946343256155, |
|
"grad_norm": 0.66860032081604, |
|
"learning_rate": 0.00020589101620029454, |
|
"loss": 5.6988, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.157295112091143, |
|
"grad_norm": 0.6529964208602905, |
|
"learning_rate": 0.00020544918998527243, |
|
"loss": 5.7, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.157295112091143, |
|
"eval_loss": 0.8492234945297241, |
|
"eval_runtime": 122.695, |
|
"eval_samples_per_second": 7.083, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.1719955898566705, |
|
"grad_norm": 0.6981674432754517, |
|
"learning_rate": 0.00020500736377025036, |
|
"loss": 5.6757, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.1866960676221976, |
|
"grad_norm": 0.6831721067428589, |
|
"learning_rate": 0.00020456553755522825, |
|
"loss": 5.7149, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 3.201396545387725, |
|
"grad_norm": 0.6813935041427612, |
|
"learning_rate": 0.00020412371134020618, |
|
"loss": 5.6557, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.2160970231532526, |
|
"grad_norm": 0.6628422737121582, |
|
"learning_rate": 0.00020368188512518407, |
|
"loss": 5.6812, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 3.2307975009187797, |
|
"grad_norm": 0.6785872578620911, |
|
"learning_rate": 0.000203240058910162, |
|
"loss": 5.74, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.2307975009187797, |
|
"eval_loss": 0.8484858870506287, |
|
"eval_runtime": 122.7176, |
|
"eval_samples_per_second": 7.081, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.2454979786843072, |
|
"grad_norm": 0.6826450228691101, |
|
"learning_rate": 0.0002027982326951399, |
|
"loss": 5.7303, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 3.2601984564498347, |
|
"grad_norm": 0.6782203912734985, |
|
"learning_rate": 0.00020235640648011782, |
|
"loss": 5.7306, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.274898934215362, |
|
"grad_norm": 0.7322261929512024, |
|
"learning_rate": 0.00020191458026509572, |
|
"loss": 5.7121, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 3.2895994119808893, |
|
"grad_norm": 0.676398754119873, |
|
"learning_rate": 0.00020147275405007361, |
|
"loss": 5.676, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.304299889746417, |
|
"grad_norm": 0.6796535849571228, |
|
"learning_rate": 0.0002010309278350515, |
|
"loss": 5.6793, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.304299889746417, |
|
"eval_loss": 0.8485282063484192, |
|
"eval_runtime": 122.7649, |
|
"eval_samples_per_second": 7.079, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.3190003675119444, |
|
"grad_norm": 0.6691915392875671, |
|
"learning_rate": 0.00020058910162002944, |
|
"loss": 5.7447, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 3.3337008452774715, |
|
"grad_norm": 0.6691219210624695, |
|
"learning_rate": 0.00020014727540500733, |
|
"loss": 5.7213, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 3.348401323042999, |
|
"grad_norm": 0.691150963306427, |
|
"learning_rate": 0.00019970544918998526, |
|
"loss": 5.7139, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.3631018008085265, |
|
"grad_norm": 0.6797978281974792, |
|
"learning_rate": 0.00019926362297496315, |
|
"loss": 5.7259, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 3.3778022785740536, |
|
"grad_norm": 0.680253803730011, |
|
"learning_rate": 0.00019882179675994108, |
|
"loss": 5.729, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.3778022785740536, |
|
"eval_loss": 0.8467084765434265, |
|
"eval_runtime": 122.7168, |
|
"eval_samples_per_second": 7.081, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.392502756339581, |
|
"grad_norm": 0.6667632460594177, |
|
"learning_rate": 0.00019837997054491897, |
|
"loss": 5.7414, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 3.4072032341051086, |
|
"grad_norm": 0.6695231795310974, |
|
"learning_rate": 0.0001979381443298969, |
|
"loss": 5.6706, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 3.4219037118706357, |
|
"grad_norm": 0.6954268217086792, |
|
"learning_rate": 0.0001974963181148748, |
|
"loss": 5.786, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 3.436604189636163, |
|
"grad_norm": 0.6876401901245117, |
|
"learning_rate": 0.00019705449189985272, |
|
"loss": 5.7293, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 3.4513046674016907, |
|
"grad_norm": 0.6845841407775879, |
|
"learning_rate": 0.00019661266568483062, |
|
"loss": 5.7367, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.4513046674016907, |
|
"eval_loss": 0.8462281823158264, |
|
"eval_runtime": 122.7084, |
|
"eval_samples_per_second": 7.082, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.466005145167218, |
|
"grad_norm": 0.6698250770568848, |
|
"learning_rate": 0.00019617083946980851, |
|
"loss": 5.7564, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 3.4807056229327453, |
|
"grad_norm": 0.691417396068573, |
|
"learning_rate": 0.00019572901325478644, |
|
"loss": 5.7239, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 3.495406100698273, |
|
"grad_norm": 0.69003826379776, |
|
"learning_rate": 0.00019528718703976433, |
|
"loss": 5.7359, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 3.5101065784638, |
|
"grad_norm": 0.665431797504425, |
|
"learning_rate": 0.00019484536082474226, |
|
"loss": 5.7405, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 3.5248070562293274, |
|
"grad_norm": 0.6713572144508362, |
|
"learning_rate": 0.00019440353460972016, |
|
"loss": 5.7417, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.5248070562293274, |
|
"eval_loss": 0.8447020053863525, |
|
"eval_runtime": 122.7287, |
|
"eval_samples_per_second": 7.081, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.539507533994855, |
|
"grad_norm": 0.6731536388397217, |
|
"learning_rate": 0.00019396170839469808, |
|
"loss": 5.7519, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 3.554208011760382, |
|
"grad_norm": 0.6862500905990601, |
|
"learning_rate": 0.00019351988217967598, |
|
"loss": 5.7698, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 3.5689084895259096, |
|
"grad_norm": 0.6730573773384094, |
|
"learning_rate": 0.0001930780559646539, |
|
"loss": 5.7444, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 3.583608967291437, |
|
"grad_norm": 0.7046417593955994, |
|
"learning_rate": 0.0001926362297496318, |
|
"loss": 5.7886, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 3.598309445056964, |
|
"grad_norm": 0.6981428265571594, |
|
"learning_rate": 0.00019219440353460972, |
|
"loss": 5.7708, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 3.598309445056964, |
|
"eval_loss": 0.8444002270698547, |
|
"eval_runtime": 122.7558, |
|
"eval_samples_per_second": 7.079, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 3.6130099228224917, |
|
"grad_norm": 0.6807316541671753, |
|
"learning_rate": 0.0001917525773195876, |
|
"loss": 5.7589, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 3.627710400588019, |
|
"grad_norm": 0.6898190975189209, |
|
"learning_rate": 0.00019131075110456552, |
|
"loss": 5.716, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 3.6424108783535463, |
|
"grad_norm": 0.6854293942451477, |
|
"learning_rate": 0.00019086892488954341, |
|
"loss": 5.7499, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 3.657111356119074, |
|
"grad_norm": 0.6629763245582581, |
|
"learning_rate": 0.00019042709867452134, |
|
"loss": 5.763, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 3.6718118338846013, |
|
"grad_norm": 0.6828572750091553, |
|
"learning_rate": 0.00018998527245949923, |
|
"loss": 5.748, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.6718118338846013, |
|
"eval_loss": 0.8433157801628113, |
|
"eval_runtime": 122.7268, |
|
"eval_samples_per_second": 7.081, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.6865123116501284, |
|
"grad_norm": 0.6758775115013123, |
|
"learning_rate": 0.00018954344624447716, |
|
"loss": 5.8063, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 3.701212789415656, |
|
"grad_norm": 0.699603796005249, |
|
"learning_rate": 0.00018910162002945506, |
|
"loss": 5.7323, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 3.7159132671811834, |
|
"grad_norm": 0.6797698140144348, |
|
"learning_rate": 0.00018865979381443298, |
|
"loss": 5.7159, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 3.7306137449467105, |
|
"grad_norm": 0.7055732011795044, |
|
"learning_rate": 0.00018821796759941088, |
|
"loss": 5.7788, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 3.745314222712238, |
|
"grad_norm": 0.6939615607261658, |
|
"learning_rate": 0.0001877761413843888, |
|
"loss": 5.8017, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.745314222712238, |
|
"eval_loss": 0.8415707349777222, |
|
"eval_runtime": 122.7864, |
|
"eval_samples_per_second": 7.077, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.7600147004777655, |
|
"grad_norm": 0.6803593635559082, |
|
"learning_rate": 0.0001873343151693667, |
|
"loss": 5.7542, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 3.774715178243293, |
|
"grad_norm": 0.6966648101806641, |
|
"learning_rate": 0.00018689248895434462, |
|
"loss": 5.7682, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 3.78941565600882, |
|
"grad_norm": 0.6992161870002747, |
|
"learning_rate": 0.00018645066273932252, |
|
"loss": 5.7216, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 3.8041161337743477, |
|
"grad_norm": 0.6834452152252197, |
|
"learning_rate": 0.00018600883652430044, |
|
"loss": 5.7347, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 3.818816611539875, |
|
"grad_norm": 0.6835057139396667, |
|
"learning_rate": 0.00018556701030927834, |
|
"loss": 5.7475, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.818816611539875, |
|
"eval_loss": 0.8405690789222717, |
|
"eval_runtime": 122.7131, |
|
"eval_samples_per_second": 7.082, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.8335170893054027, |
|
"grad_norm": 0.6868228316307068, |
|
"learning_rate": 0.00018512518409425626, |
|
"loss": 5.7805, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 3.8482175670709298, |
|
"grad_norm": 0.6829861402511597, |
|
"learning_rate": 0.00018468335787923416, |
|
"loss": 5.7193, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 3.8629180448364573, |
|
"grad_norm": 0.6857191920280457, |
|
"learning_rate": 0.00018424153166421206, |
|
"loss": 5.7893, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 3.877618522601985, |
|
"grad_norm": 0.6723675727844238, |
|
"learning_rate": 0.00018379970544918998, |
|
"loss": 5.7788, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 3.892319000367512, |
|
"grad_norm": 0.6871203780174255, |
|
"learning_rate": 0.00018335787923416788, |
|
"loss": 5.7881, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 3.892319000367512, |
|
"eval_loss": 0.8391252160072327, |
|
"eval_runtime": 122.6983, |
|
"eval_samples_per_second": 7.082, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 3.9070194781330394, |
|
"grad_norm": 0.6965262293815613, |
|
"learning_rate": 0.0001829160530191458, |
|
"loss": 5.7777, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 3.921719955898567, |
|
"grad_norm": 0.6634519696235657, |
|
"learning_rate": 0.00018247422680412367, |
|
"loss": 5.7385, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 3.936420433664094, |
|
"grad_norm": 0.658750593662262, |
|
"learning_rate": 0.0001820324005891016, |
|
"loss": 5.7514, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 3.9511209114296215, |
|
"grad_norm": 0.6765331029891968, |
|
"learning_rate": 0.0001815905743740795, |
|
"loss": 5.7902, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 3.965821389195149, |
|
"grad_norm": 0.684141993522644, |
|
"learning_rate": 0.00018114874815905742, |
|
"loss": 5.8105, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.965821389195149, |
|
"eval_loss": 0.8379185795783997, |
|
"eval_runtime": 122.8086, |
|
"eval_samples_per_second": 7.076, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.980521866960676, |
|
"grad_norm": 0.668866753578186, |
|
"learning_rate": 0.00018070692194403532, |
|
"loss": 5.7643, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 3.9952223447262036, |
|
"grad_norm": 0.6660596132278442, |
|
"learning_rate": 0.00018026509572901324, |
|
"loss": 5.757, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.008820286659317, |
|
"grad_norm": 1.04069983959198, |
|
"learning_rate": 0.00017982326951399114, |
|
"loss": 5.0052, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 4.023520764424844, |
|
"grad_norm": 0.7606070637702942, |
|
"learning_rate": 0.00017938144329896906, |
|
"loss": 5.2139, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.038221242190371, |
|
"grad_norm": 0.743680477142334, |
|
"learning_rate": 0.00017893961708394696, |
|
"loss": 5.1857, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.038221242190371, |
|
"eval_loss": 0.8622083067893982, |
|
"eval_runtime": 122.732, |
|
"eval_samples_per_second": 7.08, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.052921719955899, |
|
"grad_norm": 0.7183839678764343, |
|
"learning_rate": 0.00017849779086892488, |
|
"loss": 5.1678, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 4.067622197721426, |
|
"grad_norm": 0.7291563153266907, |
|
"learning_rate": 0.00017805596465390278, |
|
"loss": 5.2046, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 4.082322675486953, |
|
"grad_norm": 0.7452117204666138, |
|
"learning_rate": 0.0001776141384388807, |
|
"loss": 5.1908, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 4.097023153252481, |
|
"grad_norm": 0.730151355266571, |
|
"learning_rate": 0.0001771723122238586, |
|
"loss": 5.1827, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 4.111723631018008, |
|
"grad_norm": 0.7287917137145996, |
|
"learning_rate": 0.00017673048600883653, |
|
"loss": 5.2302, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.111723631018008, |
|
"eval_loss": 0.8607571125030518, |
|
"eval_runtime": 122.8437, |
|
"eval_samples_per_second": 7.074, |
|
"eval_steps_per_second": 0.887, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.126424108783535, |
|
"grad_norm": 0.7352184653282166, |
|
"learning_rate": 0.00017628865979381442, |
|
"loss": 5.2023, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 4.141124586549063, |
|
"grad_norm": 0.7560921311378479, |
|
"learning_rate": 0.00017584683357879235, |
|
"loss": 5.1905, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 4.15582506431459, |
|
"grad_norm": 0.7419695258140564, |
|
"learning_rate": 0.00017540500736377024, |
|
"loss": 5.187, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 4.170525542080117, |
|
"grad_norm": 0.7407976388931274, |
|
"learning_rate": 0.00017496318114874817, |
|
"loss": 5.2322, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 4.185226019845645, |
|
"grad_norm": 0.7482364773750305, |
|
"learning_rate": 0.00017452135493372606, |
|
"loss": 5.2617, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 4.185226019845645, |
|
"eval_loss": 0.8609734177589417, |
|
"eval_runtime": 122.7306, |
|
"eval_samples_per_second": 7.081, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 4.199926497611172, |
|
"grad_norm": 0.7562023997306824, |
|
"learning_rate": 0.000174079528718704, |
|
"loss": 5.2432, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 4.214626975376699, |
|
"grad_norm": 0.76356440782547, |
|
"learning_rate": 0.00017363770250368189, |
|
"loss": 5.2342, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 4.229327453142227, |
|
"grad_norm": 0.7754319906234741, |
|
"learning_rate": 0.00017319587628865976, |
|
"loss": 5.2736, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 4.244027930907754, |
|
"grad_norm": 0.7472639679908752, |
|
"learning_rate": 0.00017275405007363768, |
|
"loss": 5.1607, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 4.2587284086732815, |
|
"grad_norm": 0.759757936000824, |
|
"learning_rate": 0.00017231222385861558, |
|
"loss": 5.2299, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 4.2587284086732815, |
|
"eval_loss": 0.8612560629844666, |
|
"eval_runtime": 122.7459, |
|
"eval_samples_per_second": 7.08, |
|
"eval_steps_per_second": 0.888, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 4.2734288864388095, |
|
"grad_norm": 0.80708247423172, |
|
"learning_rate": 0.0001718703976435935, |
|
"loss": 5.2688, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 4.2881293642043365, |
|
"grad_norm": 0.7704065442085266, |
|
"learning_rate": 0.0001714285714285714, |
|
"loss": 5.2478, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 4.302829841969864, |
|
"grad_norm": 0.7821296453475952, |
|
"learning_rate": 0.00017098674521354932, |
|
"loss": 5.2805, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 4.317530319735392, |
|
"grad_norm": 0.7851975560188293, |
|
"learning_rate": 0.00017054491899852722, |
|
"loss": 5.246, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 4.332230797500919, |
|
"grad_norm": 0.7333672046661377, |
|
"learning_rate": 0.00017010309278350514, |
|
"loss": 5.2627, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 4.332230797500919, |
|
"eval_loss": 0.8597420454025269, |
|
"eval_runtime": 122.891, |
|
"eval_samples_per_second": 7.071, |
|
"eval_steps_per_second": 0.887, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 4.346931275266446, |
|
"grad_norm": 0.7573877573013306, |
|
"learning_rate": 0.00016966126656848304, |
|
"loss": 5.2539, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 4.361631753031974, |
|
"grad_norm": 0.7531468272209167, |
|
"learning_rate": 0.00016921944035346096, |
|
"loss": 5.2819, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 4.376332230797501, |
|
"grad_norm": 0.7768471837043762, |
|
"learning_rate": 0.00016877761413843886, |
|
"loss": 5.2684, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 4.391032708563028, |
|
"grad_norm": 0.7904699444770813, |
|
"learning_rate": 0.00016833578792341679, |
|
"loss": 5.2538, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 4.405733186328556, |
|
"grad_norm": 0.8030019402503967, |
|
"learning_rate": 0.00016789396170839468, |
|
"loss": 5.2828, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.405733186328556, |
|
"eval_loss": 0.8594609498977661, |
|
"eval_runtime": 122.7873, |
|
"eval_samples_per_second": 7.077, |
|
"eval_steps_per_second": 0.888, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.420433664094083, |
|
"grad_norm": 0.751737117767334, |
|
"learning_rate": 0.0001674521354933726, |
|
"loss": 5.3038, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 4.43513414185961, |
|
"grad_norm": 0.7465061545372009, |
|
"learning_rate": 0.0001670103092783505, |
|
"loss": 5.2605, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 4.449834619625138, |
|
"grad_norm": 0.759131669998169, |
|
"learning_rate": 0.00016656848306332843, |
|
"loss": 5.2886, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 4.464535097390665, |
|
"grad_norm": 0.7656858563423157, |
|
"learning_rate": 0.00016612665684830633, |
|
"loss": 5.2273, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 4.479235575156193, |
|
"grad_norm": 0.7533978223800659, |
|
"learning_rate": 0.00016568483063328425, |
|
"loss": 5.2777, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 4.479235575156193, |
|
"eval_loss": 0.8585497140884399, |
|
"eval_runtime": 122.7392, |
|
"eval_samples_per_second": 7.08, |
|
"eval_steps_per_second": 0.888, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 4.49393605292172, |
|
"grad_norm": 0.7555689811706543, |
|
"learning_rate": 0.00016524300441826215, |
|
"loss": 5.2478, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 4.508636530687247, |
|
"grad_norm": 0.7516697645187378, |
|
"learning_rate": 0.00016480117820324007, |
|
"loss": 5.2559, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 4.523337008452775, |
|
"grad_norm": 0.7887890934944153, |
|
"learning_rate": 0.00016435935198821797, |
|
"loss": 5.3158, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 4.538037486218302, |
|
"grad_norm": 0.7691967487335205, |
|
"learning_rate": 0.00016391752577319584, |
|
"loss": 5.2946, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 4.552737963983829, |
|
"grad_norm": 0.7757486701011658, |
|
"learning_rate": 0.00016347569955817376, |
|
"loss": 5.2849, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 4.552737963983829, |
|
"eval_loss": 0.8570435047149658, |
|
"eval_runtime": 122.7277, |
|
"eval_samples_per_second": 7.081, |
|
"eval_steps_per_second": 0.888, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 4.567438441749357, |
|
"grad_norm": 0.7805380821228027, |
|
"learning_rate": 0.00016303387334315166, |
|
"loss": 5.3412, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 4.582138919514884, |
|
"grad_norm": 0.7780531048774719, |
|
"learning_rate": 0.00016259204712812958, |
|
"loss": 5.2718, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 4.596839397280411, |
|
"grad_norm": 0.7380784749984741, |
|
"learning_rate": 0.00016215022091310748, |
|
"loss": 5.3036, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 4.611539875045939, |
|
"grad_norm": 0.7692278623580933, |
|
"learning_rate": 0.0001617083946980854, |
|
"loss": 5.3028, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 4.626240352811466, |
|
"grad_norm": 0.7445947527885437, |
|
"learning_rate": 0.0001612665684830633, |
|
"loss": 5.2811, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 4.626240352811466, |
|
"eval_loss": 0.8577845692634583, |
|
"eval_runtime": 122.7855, |
|
"eval_samples_per_second": 7.077, |
|
"eval_steps_per_second": 0.888, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 4.6409408305769935, |
|
"grad_norm": 0.7451578974723816, |
|
"learning_rate": 0.00016082474226804122, |
|
"loss": 5.3035, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 4.655641308342521, |
|
"grad_norm": 0.7448246479034424, |
|
"learning_rate": 0.00016038291605301912, |
|
"loss": 5.2956, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 4.6703417861080485, |
|
"grad_norm": 0.7711780071258545, |
|
"learning_rate": 0.00015994108983799705, |
|
"loss": 5.281, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 4.685042263873576, |
|
"grad_norm": 0.7698442339897156, |
|
"learning_rate": 0.00015949926362297494, |
|
"loss": 5.3382, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 4.6997427416391035, |
|
"grad_norm": 0.7665743231773376, |
|
"learning_rate": 0.00015905743740795287, |
|
"loss": 5.2781, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 4.6997427416391035, |
|
"eval_loss": 0.8565701842308044, |
|
"eval_runtime": 122.777, |
|
"eval_samples_per_second": 7.078, |
|
"eval_steps_per_second": 0.888, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 4.714443219404631, |
|
"grad_norm": 0.7319465279579163, |
|
"learning_rate": 0.00015861561119293076, |
|
"loss": 5.3044, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 4.729143697170158, |
|
"grad_norm": 0.7576644420623779, |
|
"learning_rate": 0.0001581737849779087, |
|
"loss": 5.3616, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 4.743844174935686, |
|
"grad_norm": 0.7484728693962097, |
|
"learning_rate": 0.00015773195876288659, |
|
"loss": 5.3121, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 4.758544652701213, |
|
"grad_norm": 0.7493030428886414, |
|
"learning_rate": 0.0001572901325478645, |
|
"loss": 5.2786, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 4.77324513046674, |
|
"grad_norm": 0.7527353763580322, |
|
"learning_rate": 0.0001568483063328424, |
|
"loss": 5.3367, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 4.77324513046674, |
|
"eval_loss": 0.8557420372962952, |
|
"eval_runtime": 122.7867, |
|
"eval_samples_per_second": 7.077, |
|
"eval_steps_per_second": 0.888, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 4.787945608232268, |
|
"grad_norm": 0.7508755326271057, |
|
"learning_rate": 0.00015640648011782033, |
|
"loss": 5.3221, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 4.802646085997795, |
|
"grad_norm": 0.7572493553161621, |
|
"learning_rate": 0.00015596465390279823, |
|
"loss": 5.2777, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 4.817346563763322, |
|
"grad_norm": 0.7531789541244507, |
|
"learning_rate": 0.00015552282768777615, |
|
"loss": 5.2999, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 4.83204704152885, |
|
"grad_norm": 0.7436636686325073, |
|
"learning_rate": 0.00015508100147275405, |
|
"loss": 5.2683, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 4.846747519294377, |
|
"grad_norm": 0.746624767780304, |
|
"learning_rate": 0.00015463917525773192, |
|
"loss": 5.3402, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 4.846747519294377, |
|
"eval_loss": 0.8542863726615906, |
|
"eval_runtime": 122.7376, |
|
"eval_samples_per_second": 7.08, |
|
"eval_steps_per_second": 0.888, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 4.861447997059905, |
|
"grad_norm": 0.7520425319671631, |
|
"learning_rate": 0.00015419734904270984, |
|
"loss": 5.3424, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 4.876148474825432, |
|
"grad_norm": 0.7883239388465881, |
|
"learning_rate": 0.00015375552282768774, |
|
"loss": 5.3252, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 4.890848952590959, |
|
"grad_norm": 0.7612333297729492, |
|
"learning_rate": 0.00015331369661266566, |
|
"loss": 5.314, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 4.905549430356487, |
|
"grad_norm": 0.7855140566825867, |
|
"learning_rate": 0.00015287187039764356, |
|
"loss": 5.2951, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 4.920249908122014, |
|
"grad_norm": 0.7624037861824036, |
|
"learning_rate": 0.00015243004418262149, |
|
"loss": 5.331, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 4.920249908122014, |
|
"eval_loss": 0.8540411591529846, |
|
"eval_runtime": 122.7574, |
|
"eval_samples_per_second": 7.079, |
|
"eval_steps_per_second": 0.888, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 4.934950385887541, |
|
"grad_norm": 0.7521130442619324, |
|
"learning_rate": 0.00015198821796759938, |
|
"loss": 5.2818, |
|
"step": 3360 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 6800, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 120, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7030313392487583e+20, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|