{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9997807498355624,
  "eval_steps": 500,
  "global_step": 1140,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0008770006577504934,
      "grad_norm": 0.1337890625,
      "learning_rate": 1.7543859649122807e-06,
      "loss": 1.3755,
      "step": 1
    },
    {
      "epoch": 0.0043850032887524665,
      "grad_norm": 0.11865234375,
      "learning_rate": 8.771929824561403e-06,
      "loss": 1.37,
      "step": 5
    },
    {
      "epoch": 0.008770006577504933,
      "grad_norm": 0.12060546875,
      "learning_rate": 1.7543859649122806e-05,
      "loss": 1.3508,
      "step": 10
    },
    {
      "epoch": 0.0131550098662574,
      "grad_norm": 0.1259765625,
      "learning_rate": 2.6315789473684212e-05,
      "loss": 1.3572,
      "step": 15
    },
    {
      "epoch": 0.017540013155009866,
      "grad_norm": 0.1162109375,
      "learning_rate": 3.508771929824561e-05,
      "loss": 1.3438,
      "step": 20
    },
    {
      "epoch": 0.021925016443762334,
      "grad_norm": 0.10986328125,
      "learning_rate": 4.3859649122807014e-05,
      "loss": 1.3356,
      "step": 25
    },
    {
      "epoch": 0.0263100197325148,
      "grad_norm": 0.09814453125,
      "learning_rate": 5.2631578947368424e-05,
      "loss": 1.3111,
      "step": 30
    },
    {
      "epoch": 0.030695023021267268,
      "grad_norm": 0.095703125,
      "learning_rate": 6.140350877192983e-05,
      "loss": 1.2578,
      "step": 35
    },
    {
      "epoch": 0.03508002631001973,
      "grad_norm": 0.07421875,
      "learning_rate": 7.017543859649122e-05,
      "loss": 1.2393,
      "step": 40
    },
    {
      "epoch": 0.0394650295987722,
      "grad_norm": 0.064453125,
      "learning_rate": 7.894736842105263e-05,
      "loss": 1.2206,
      "step": 45
    },
    {
      "epoch": 0.04385003288752467,
      "grad_norm": 0.049072265625,
      "learning_rate": 8.771929824561403e-05,
      "loss": 1.1976,
      "step": 50
    },
    {
      "epoch": 0.048235036176277134,
      "grad_norm": 0.044921875,
      "learning_rate": 9.649122807017544e-05,
      "loss": 1.1976,
      "step": 55
    },
    {
      "epoch": 0.0526200394650296,
      "grad_norm": 0.042724609375,
      "learning_rate": 0.00010526315789473685,
      "loss": 1.1789,
      "step": 60
    },
    {
      "epoch": 0.05700504275378206,
      "grad_norm": 0.042236328125,
      "learning_rate": 0.00011403508771929824,
      "loss": 1.1716,
      "step": 65
    },
    {
      "epoch": 0.061390046042534535,
      "grad_norm": 0.044677734375,
      "learning_rate": 0.00012280701754385965,
      "loss": 1.1691,
      "step": 70
    },
    {
      "epoch": 0.065775049331287,
      "grad_norm": 0.03857421875,
      "learning_rate": 0.00013157894736842108,
      "loss": 1.1533,
      "step": 75
    },
    {
      "epoch": 0.07016005262003946,
      "grad_norm": 0.041748046875,
      "learning_rate": 0.00014035087719298245,
      "loss": 1.1353,
      "step": 80
    },
    {
      "epoch": 0.07454505590879193,
      "grad_norm": 0.04833984375,
      "learning_rate": 0.00014912280701754387,
      "loss": 1.1404,
      "step": 85
    },
    {
      "epoch": 0.0789300591975444,
      "grad_norm": 0.046142578125,
      "learning_rate": 0.00015789473684210527,
      "loss": 1.1498,
      "step": 90
    },
    {
      "epoch": 0.08331506248629686,
      "grad_norm": 0.04443359375,
      "learning_rate": 0.0001666666666666667,
      "loss": 1.1125,
      "step": 95
    },
    {
      "epoch": 0.08770006577504934,
      "grad_norm": 0.049560546875,
      "learning_rate": 0.00017543859649122806,
      "loss": 1.1447,
      "step": 100
    },
    {
      "epoch": 0.0920850690638018,
      "grad_norm": 0.05712890625,
      "learning_rate": 0.00018421052631578948,
      "loss": 1.1351,
      "step": 105
    },
    {
      "epoch": 0.09647007235255427,
      "grad_norm": 0.04931640625,
      "learning_rate": 0.00019298245614035088,
      "loss": 1.1294,
      "step": 110
    },
    {
      "epoch": 0.10085507564130673,
      "grad_norm": 0.05029296875,
      "learning_rate": 0.00019999953121394002,
      "loss": 1.1315,
      "step": 115
    },
    {
      "epoch": 0.1052400789300592,
      "grad_norm": 0.060546875,
      "learning_rate": 0.00019998312416333227,
      "loss": 1.1284,
      "step": 120
    },
    {
      "epoch": 0.10962508221881166,
      "grad_norm": 0.04931640625,
      "learning_rate": 0.00019994328220474688,
      "loss": 1.136,
      "step": 125
    },
    {
      "epoch": 0.11401008550756413,
      "grad_norm": 0.052001953125,
      "learning_rate": 0.0001998800146766861,
      "loss": 1.1213,
      "step": 130
    },
    {
      "epoch": 0.11839508879631659,
      "grad_norm": 0.064453125,
      "learning_rate": 0.00019979333640833947,
      "loss": 1.1152,
      "step": 135
    },
    {
      "epoch": 0.12278009208506907,
      "grad_norm": 0.051513671875,
      "learning_rate": 0.00019968326771610797,
      "loss": 1.1193,
      "step": 140
    },
    {
      "epoch": 0.12716509537382154,
      "grad_norm": 0.05126953125,
      "learning_rate": 0.0001995498343988421,
      "loss": 1.121,
      "step": 145
    },
    {
      "epoch": 0.131550098662574,
      "grad_norm": 0.055419921875,
      "learning_rate": 0.00019939306773179497,
      "loss": 1.1291,
      "step": 150
    },
    {
      "epoch": 0.13593510195132646,
      "grad_norm": 0.04736328125,
      "learning_rate": 0.0001992130044592916,
      "loss": 1.122,
      "step": 155
    },
    {
      "epoch": 0.14032010524007893,
      "grad_norm": 0.05078125,
      "learning_rate": 0.00019900968678611666,
      "loss": 1.1174,
      "step": 160
    },
    {
      "epoch": 0.1447051085288314,
      "grad_norm": 0.06298828125,
      "learning_rate": 0.00019878316236762196,
      "loss": 1.1205,
      "step": 165
    },
    {
      "epoch": 0.14909011181758386,
      "grad_norm": 0.05615234375,
      "learning_rate": 0.00019853348429855672,
      "loss": 1.1086,
      "step": 170
    },
    {
      "epoch": 0.15347511510633632,
      "grad_norm": 0.053955078125,
      "learning_rate": 0.0001982607111006227,
      "loss": 1.1086,
      "step": 175
    },
    {
      "epoch": 0.1578601183950888,
      "grad_norm": 0.052978515625,
      "learning_rate": 0.0001979649067087574,
      "loss": 1.1202,
      "step": 180
    },
    {
      "epoch": 0.16224512168384125,
      "grad_norm": 0.0517578125,
      "learning_rate": 0.00019764614045614836,
      "loss": 1.1248,
      "step": 185
    },
    {
      "epoch": 0.16663012497259372,
      "grad_norm": 0.05615234375,
      "learning_rate": 0.00019730448705798239,
      "loss": 1.1231,
      "step": 190
    },
    {
      "epoch": 0.17101512826134618,
      "grad_norm": 0.05810546875,
      "learning_rate": 0.00019694002659393305,
      "loss": 1.102,
      "step": 195
    },
    {
      "epoch": 0.17540013155009868,
      "grad_norm": 0.056396484375,
      "learning_rate": 0.00019655284448939094,
      "loss": 1.1203,
      "step": 200
    },
    {
      "epoch": 0.17978513483885114,
      "grad_norm": 0.054931640625,
      "learning_rate": 0.00019614303149544102,
      "loss": 1.1073,
      "step": 205
    },
    {
      "epoch": 0.1841701381276036,
      "grad_norm": 0.05029296875,
      "learning_rate": 0.00019571068366759143,
      "loss": 1.1082,
      "step": 210
    },
    {
      "epoch": 0.18855514141635607,
      "grad_norm": 0.054443359375,
      "learning_rate": 0.00019525590234325933,
      "loss": 1.1114,
      "step": 215
    },
    {
      "epoch": 0.19294014470510853,
      "grad_norm": 0.0576171875,
      "learning_rate": 0.00019477879411801844,
      "loss": 1.1007,
      "step": 220
    },
    {
      "epoch": 0.197325147993861,
      "grad_norm": 0.051513671875,
      "learning_rate": 0.00019427947082061432,
      "loss": 1.0978,
      "step": 225
    },
    {
      "epoch": 0.20171015128261346,
      "grad_norm": 0.05419921875,
      "learning_rate": 0.00019375804948675306,
      "loss": 1.0876,
      "step": 230
    },
    {
      "epoch": 0.20609515457136593,
      "grad_norm": 0.050537109375,
      "learning_rate": 0.00019321465233166924,
      "loss": 1.1182,
      "step": 235
    },
    {
      "epoch": 0.2104801578601184,
      "grad_norm": 0.05224609375,
      "learning_rate": 0.00019264940672148018,
      "loss": 1.0853,
      "step": 240
    },
    {
      "epoch": 0.21486516114887086,
      "grad_norm": 0.05224609375,
      "learning_rate": 0.00019206244514333282,
      "loss": 1.1151,
      "step": 245
    },
    {
      "epoch": 0.21925016443762332,
      "grad_norm": 0.050537109375,
      "learning_rate": 0.00019145390517435012,
      "loss": 1.111,
      "step": 250
    },
    {
      "epoch": 0.2236351677263758,
      "grad_norm": 0.048828125,
      "learning_rate": 0.00019082392944938466,
      "loss": 1.1262,
      "step": 255
    },
    {
      "epoch": 0.22802017101512825,
      "grad_norm": 0.046875,
      "learning_rate": 0.00019017266562758659,
      "loss": 1.1036,
      "step": 260
    },
    {
      "epoch": 0.23240517430388072,
      "grad_norm": 0.048095703125,
      "learning_rate": 0.00018950026635779397,
      "loss": 1.1162,
      "step": 265
    },
    {
      "epoch": 0.23679017759263318,
      "grad_norm": 0.049560546875,
      "learning_rate": 0.00018880688924275378,
      "loss": 1.102,
      "step": 270
    },
    {
      "epoch": 0.24117518088138565,
      "grad_norm": 0.05419921875,
      "learning_rate": 0.00018809269680218136,
      "loss": 1.1,
      "step": 275
    },
    {
      "epoch": 0.24556018417013814,
      "grad_norm": 0.050537109375,
      "learning_rate": 0.00018735785643466784,
      "loss": 1.1037,
      "step": 280
    },
    {
      "epoch": 0.2499451874588906,
      "grad_norm": 0.04931640625,
      "learning_rate": 0.00018660254037844388,
      "loss": 1.0826,
      "step": 285
    },
    {
      "epoch": 0.25433019074764307,
      "grad_norm": 0.053466796875,
      "learning_rate": 0.00018582692567100867,
      "loss": 1.1046,
      "step": 290
    },
    {
      "epoch": 0.25871519403639553,
      "grad_norm": 0.05859375,
      "learning_rate": 0.0001850311941076346,
      "loss": 1.082,
      "step": 295
    },
    {
      "epoch": 0.263100197325148,
      "grad_norm": 0.053955078125,
      "learning_rate": 0.00018421553219875658,
      "loss": 1.0945,
      "step": 300
    },
    {
      "epoch": 0.26748520061390046,
      "grad_norm": 0.053955078125,
      "learning_rate": 0.00018338013112625587,
      "loss": 1.0985,
      "step": 305
    },
    {
      "epoch": 0.27187020390265293,
      "grad_norm": 0.051513671875,
      "learning_rate": 0.00018252518669864936,
      "loss": 1.1186,
      "step": 310
    },
    {
      "epoch": 0.2762552071914054,
      "grad_norm": 0.0625,
      "learning_rate": 0.0001816508993051943,
      "loss": 1.1253,
      "step": 315
    },
    {
      "epoch": 0.28064021048015786,
      "grad_norm": 0.05224609375,
      "learning_rate": 0.0001807574738689193,
      "loss": 1.0904,
      "step": 320
    },
    {
      "epoch": 0.2850252137689103,
      "grad_norm": 0.0498046875,
      "learning_rate": 0.00017984511979859263,
      "loss": 1.109,
      "step": 325
    },
    {
      "epoch": 0.2894102170576628,
      "grad_norm": 0.051513671875,
      "learning_rate": 0.00017891405093963938,
      "loss": 1.1041,
      "step": 330
    },
    {
      "epoch": 0.29379522034641525,
      "grad_norm": 0.052734375,
      "learning_rate": 0.00017796448552401825,
      "loss": 1.0927,
      "step": 335
    },
    {
      "epoch": 0.2981802236351677,
      "grad_norm": 0.04931640625,
      "learning_rate": 0.00017699664611907072,
      "loss": 1.1041,
      "step": 340
    },
    {
      "epoch": 0.3025652269239202,
      "grad_norm": 0.0537109375,
      "learning_rate": 0.00017601075957535364,
      "loss": 1.1115,
      "step": 345
    },
    {
      "epoch": 0.30695023021267265,
      "grad_norm": 0.047119140625,
      "learning_rate": 0.0001750070569734681,
      "loss": 1.1088,
      "step": 350
    },
    {
      "epoch": 0.3113352335014251,
      "grad_norm": 0.048583984375,
      "learning_rate": 0.00017398577356989665,
      "loss": 1.0905,
      "step": 355
    },
    {
      "epoch": 0.3157202367901776,
      "grad_norm": 0.04638671875,
      "learning_rate": 0.0001729471487418621,
      "loss": 1.1071,
      "step": 360
    },
    {
      "epoch": 0.32010524007893004,
      "grad_norm": 0.052734375,
      "learning_rate": 0.00017189142593121993,
      "loss": 1.0872,
      "step": 365
    },
    {
      "epoch": 0.3244902433676825,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.00017081885258739846,
      "loss": 1.1054,
      "step": 370
    },
    {
      "epoch": 0.32887524665643497,
      "grad_norm": 0.049072265625,
      "learning_rate": 0.00016972968010939954,
      "loss": 1.1035,
      "step": 375
    },
    {
      "epoch": 0.33326024994518744,
      "grad_norm": 0.05126953125,
      "learning_rate": 0.0001686241637868734,
      "loss": 1.0976,
      "step": 380
    },
    {
      "epoch": 0.3376452532339399,
      "grad_norm": 0.05126953125,
      "learning_rate": 0.00016750256274028152,
      "loss": 1.099,
      "step": 385
    },
    {
      "epoch": 0.34203025652269237,
      "grad_norm": 0.050048828125,
      "learning_rate": 0.00016636513986016213,
      "loss": 1.1267,
      "step": 390
    },
    {
      "epoch": 0.34641525981144483,
      "grad_norm": 0.05224609375,
      "learning_rate": 0.0001652121617455113,
      "loss": 1.101,
      "step": 395
    },
    {
      "epoch": 0.35080026310019735,
      "grad_norm": 0.050048828125,
      "learning_rate": 0.00016404389864129533,
      "loss": 1.1005,
      "step": 400
    },
    {
      "epoch": 0.3551852663889498,
      "grad_norm": 0.049072265625,
      "learning_rate": 0.0001628606243751082,
      "loss": 1.0973,
      "step": 405
    },
    {
      "epoch": 0.3595702696777023,
      "grad_norm": 0.04931640625,
      "learning_rate": 0.00016166261629298995,
      "loss": 1.1016,
      "step": 410
    },
    {
      "epoch": 0.36395527296645475,
      "grad_norm": 0.05517578125,
      "learning_rate": 0.0001604501551944193,
      "loss": 1.0863,
      "step": 415
    },
    {
      "epoch": 0.3683402762552072,
      "grad_norm": 0.05517578125,
      "learning_rate": 0.00015922352526649803,
      "loss": 1.1008,
      "step": 420
    },
    {
      "epoch": 0.3727252795439597,
      "grad_norm": 0.05517578125,
      "learning_rate": 0.0001579830140173403,
      "loss": 1.0999,
      "step": 425
    },
    {
      "epoch": 0.37711028283271214,
      "grad_norm": 0.049072265625,
      "learning_rate": 0.00015672891220868432,
      "loss": 1.0944,
      "step": 430
    },
    {
      "epoch": 0.3814952861214646,
      "grad_norm": 0.048095703125,
      "learning_rate": 0.00015546151378774086,
      "loss": 1.084,
      "step": 435
    },
    {
      "epoch": 0.38588028941021707,
      "grad_norm": 0.049560546875,
      "learning_rate": 0.00015418111581829574,
      "loss": 1.0957,
      "step": 440
    },
    {
      "epoch": 0.39026529269896953,
      "grad_norm": 0.047607421875,
      "learning_rate": 0.00015288801841108093,
      "loss": 1.0823,
      "step": 445
    },
    {
      "epoch": 0.394650295987722,
      "grad_norm": 0.051025390625,
      "learning_rate": 0.00015158252465343242,
      "loss": 1.0925,
      "step": 450
    },
    {
      "epoch": 0.39903529927647446,
      "grad_norm": 0.05029296875,
      "learning_rate": 0.00015026494053824982,
      "loss": 1.0917,
      "step": 455
    },
    {
      "epoch": 0.40342030256522693,
      "grad_norm": 0.05029296875,
      "learning_rate": 0.00014893557489227517,
      "loss": 1.0935,
      "step": 460
    },
    {
      "epoch": 0.4078053058539794,
      "grad_norm": 0.048095703125,
      "learning_rate": 0.00014759473930370736,
      "loss": 1.0795,
      "step": 465
    },
    {
      "epoch": 0.41219030914273186,
      "grad_norm": 0.051025390625,
      "learning_rate": 0.00014624274804916958,
      "loss": 1.0943,
      "step": 470
    },
    {
      "epoch": 0.4165753124314843,
      "grad_norm": 0.05322265625,
      "learning_rate": 0.00014487991802004623,
      "loss": 1.1022,
      "step": 475
    },
    {
      "epoch": 0.4209603157202368,
      "grad_norm": 0.055419921875,
      "learning_rate": 0.00014350656864820733,
      "loss": 1.0849,
      "step": 480
    },
    {
      "epoch": 0.42534531900898925,
      "grad_norm": 0.05224609375,
      "learning_rate": 0.00014212302183113732,
      "loss": 1.0865,
      "step": 485
    },
    {
      "epoch": 0.4297303222977417,
      "grad_norm": 0.05224609375,
      "learning_rate": 0.00014072960185648577,
      "loss": 1.106,
      "step": 490
    },
    {
      "epoch": 0.4341153255864942,
      "grad_norm": 0.04736328125,
      "learning_rate": 0.0001393266353260583,
      "loss": 1.0926,
      "step": 495
    },
    {
      "epoch": 0.43850032887524665,
      "grad_norm": 0.048583984375,
      "learning_rate": 0.00013791445107926478,
      "loss": 1.1091,
      "step": 500
    },
    {
      "epoch": 0.4428853321639991,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.0001364933801160428,
      "loss": 1.0858,
      "step": 505
    },
    {
      "epoch": 0.4472703354527516,
      "grad_norm": 0.048583984375,
      "learning_rate": 0.00013506375551927547,
      "loss": 1.0901,
      "step": 510
    },
    {
      "epoch": 0.45165533874150404,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.0001336259123767203,
      "loss": 1.0852,
      "step": 515
    },
    {
      "epoch": 0.4560403420302565,
      "grad_norm": 0.050048828125,
      "learning_rate": 0.00013218018770246858,
      "loss": 1.0838,
      "step": 520
    },
    {
      "epoch": 0.46042534531900897,
      "grad_norm": 0.052490234375,
      "learning_rate": 0.00013072692035795305,
      "loss": 1.0795,
      "step": 525
    },
    {
      "epoch": 0.46481034860776144,
      "grad_norm": 0.048828125,
      "learning_rate": 0.0001292664509725226,
      "loss": 1.1038,
      "step": 530
    },
    {
      "epoch": 0.4691953518965139,
      "grad_norm": 0.051513671875,
      "learning_rate": 0.00012779912186360268,
      "loss": 1.0937,
      "step": 535
    },
    {
      "epoch": 0.47358035518526637,
      "grad_norm": 0.05126953125,
      "learning_rate": 0.00012632527695645993,
      "loss": 1.08,
      "step": 540
    },
    {
      "epoch": 0.47796535847401883,
      "grad_norm": 0.0498046875,
      "learning_rate": 0.00012484526170359012,
      "loss": 1.0642,
      "step": 545
    },
    {
      "epoch": 0.4823503617627713,
      "grad_norm": 0.051513671875,
      "learning_rate": 0.00012335942300374788,
      "loss": 1.089,
      "step": 550
    },
    {
      "epoch": 0.4867353650515238,
      "grad_norm": 0.05126953125,
      "learning_rate": 0.0001218681091206376,
      "loss": 1.0893,
      "step": 555
    },
    {
      "epoch": 0.4911203683402763,
      "grad_norm": 0.049560546875,
      "learning_rate": 0.00012037166960128443,
      "loss": 1.0996,
      "step": 560
    },
    {
      "epoch": 0.49550537162902875,
      "grad_norm": 0.04736328125,
      "learning_rate": 0.00011887045519410442,
      "loss": 1.0955,
      "step": 565
    },
    {
      "epoch": 0.4998903749177812,
      "grad_norm": 0.050537109375,
      "learning_rate": 0.00011736481776669306,
      "loss": 1.0904,
      "step": 570
    },
    {
      "epoch": 0.5042753782065337,
      "grad_norm": 0.04833984375,
      "learning_rate": 0.00011585511022335142,
      "loss": 1.1074,
      "step": 575
    },
    {
      "epoch": 0.5086603814952861,
      "grad_norm": 0.0498046875,
      "learning_rate": 0.00011434168642236964,
      "loss": 1.0855,
      "step": 580
    },
    {
      "epoch": 0.5130453847840386,
      "grad_norm": 0.052734375,
      "learning_rate": 0.00011282490109308633,
      "loss": 1.0872,
      "step": 585
    },
    {
      "epoch": 0.5174303880727911,
      "grad_norm": 0.050048828125,
      "learning_rate": 0.00011130510975274409,
      "loss": 1.0824,
      "step": 590
    },
    {
      "epoch": 0.5218153913615435,
      "grad_norm": 0.049072265625,
      "learning_rate": 0.0001097826686231604,
      "loss": 1.1002,
      "step": 595
    },
    {
      "epoch": 0.526200394650296,
      "grad_norm": 0.050048828125,
      "learning_rate": 0.00010825793454723325,
      "loss": 1.083,
      "step": 600
    },
    {
      "epoch": 0.5305853979390485,
      "grad_norm": 0.05126953125,
      "learning_rate": 0.00010673126490530112,
      "loss": 1.1003,
      "step": 605
    },
    {
      "epoch": 0.5349704012278009,
      "grad_norm": 0.049072265625,
      "learning_rate": 0.00010520301753137724,
      "loss": 1.0852,
      "step": 610
    },
    {
      "epoch": 0.5393554045165534,
      "grad_norm": 0.050048828125,
      "learning_rate": 0.00010367355062927726,
      "loss": 1.0904,
      "step": 615
    },
    {
      "epoch": 0.5437404078053059,
      "grad_norm": 0.050048828125,
      "learning_rate": 0.00010214322268866032,
      "loss": 1.0839,
      "step": 620
    },
    {
      "epoch": 0.5481254110940583,
      "grad_norm": 0.053466796875,
      "learning_rate": 0.00010061239240100327,
      "loss": 1.079,
      "step": 625
    },
    {
      "epoch": 0.5525104143828108,
      "grad_norm": 0.048583984375,
      "learning_rate": 9.908141857552737e-05,
      "loss": 1.0987,
      "step": 630
    },
    {
      "epoch": 0.5568954176715633,
      "grad_norm": 0.04833984375,
      "learning_rate": 9.755066005509753e-05,
      "loss": 1.075,
      "step": 635
    },
    {
      "epoch": 0.5612804209603157,
      "grad_norm": 0.04931640625,
      "learning_rate": 9.602047563211359e-05,
      "loss": 1.0803,
      "step": 640
    },
    {
      "epoch": 0.5656654242490682,
      "grad_norm": 0.053466796875,
      "learning_rate": 9.449122396441345e-05,
      "loss": 1.1114,
      "step": 645
    },
    {
      "epoch": 0.5700504275378206,
      "grad_norm": 0.048828125,
      "learning_rate": 9.296326349120785e-05,
      "loss": 1.0771,
      "step": 650
    },
    {
      "epoch": 0.5744354308265731,
      "grad_norm": 0.049560546875,
      "learning_rate": 9.143695234906611e-05,
      "loss": 1.0917,
      "step": 655
    },
    {
      "epoch": 0.5788204341153256,
      "grad_norm": 0.049072265625,
      "learning_rate": 8.991264828797319e-05,
      "loss": 1.0843,
      "step": 660
    },
    {
      "epoch": 0.583205437404078,
      "grad_norm": 0.0498046875,
      "learning_rate": 8.839070858747697e-05,
      "loss": 1.0863,
      "step": 665
    },
    {
      "epoch": 0.5875904406928305,
      "grad_norm": 0.0517578125,
      "learning_rate": 8.687148997294621e-05,
      "loss": 1.086,
      "step": 670
    },
    {
      "epoch": 0.591975443981583,
      "grad_norm": 0.05029296875,
      "learning_rate": 8.535534853195786e-05,
      "loss": 1.08,
      "step": 675
    },
    {
      "epoch": 0.5963604472703354,
      "grad_norm": 0.05126953125,
      "learning_rate": 8.384263963083453e-05,
      "loss": 1.0686,
      "step": 680
    },
    {
      "epoch": 0.6007454505590879,
      "grad_norm": 0.051025390625,
      "learning_rate": 8.23337178313504e-05,
      "loss": 1.075,
      "step": 685
    },
    {
      "epoch": 0.6051304538478404,
      "grad_norm": 0.052490234375,
      "learning_rate": 8.082893680762619e-05,
      "loss": 1.0926,
      "step": 690
    },
    {
      "epoch": 0.6095154571365928,
      "grad_norm": 0.051025390625,
      "learning_rate": 7.932864926323161e-05,
      "loss": 1.079,
      "step": 695
    },
    {
      "epoch": 0.6139004604253453,
      "grad_norm": 0.049072265625,
      "learning_rate": 7.783320684851614e-05,
      "loss": 1.0844,
      "step": 700
    },
    {
      "epoch": 0.6182854637140978,
      "grad_norm": 0.04833984375,
      "learning_rate": 7.634296007818576e-05,
      "loss": 1.1056,
      "step": 705
    },
    {
      "epoch": 0.6226704670028502,
      "grad_norm": 0.049560546875,
      "learning_rate": 7.485825824914659e-05,
      "loss": 1.0851,
      "step": 710
    },
    {
      "epoch": 0.6270554702916027,
      "grad_norm": 0.048828125,
      "learning_rate": 7.337944935863333e-05,
      "loss": 1.0786,
      "step": 715
    },
    {
      "epoch": 0.6314404735803552,
      "grad_norm": 0.050048828125,
      "learning_rate": 7.190688002264308e-05,
      "loss": 1.089,
      "step": 720
    },
    {
      "epoch": 0.6358254768691076,
      "grad_norm": 0.05126953125,
      "learning_rate": 7.044089539469212e-05,
      "loss": 1.0826,
      "step": 725
    },
    {
      "epoch": 0.6402104801578601,
      "grad_norm": 0.0517578125,
      "learning_rate": 6.898183908491617e-05,
      "loss": 1.1004,
      "step": 730
    },
    {
      "epoch": 0.6445954834466125,
      "grad_norm": 0.050537109375,
      "learning_rate": 6.753005307953167e-05,
      "loss": 1.0722,
      "step": 735
    },
    {
      "epoch": 0.648980486735365,
      "grad_norm": 0.050048828125,
      "learning_rate": 6.608587766067852e-05,
      "loss": 1.0859,
      "step": 740
    },
    {
      "epoch": 0.6533654900241175,
      "grad_norm": 0.050048828125,
      "learning_rate": 6.464965132666163e-05,
      "loss": 1.1088,
      "step": 745
    },
    {
      "epoch": 0.6577504933128699,
      "grad_norm": 0.049560546875,
      "learning_rate": 6.322171071261071e-05,
      "loss": 1.0726,
      "step": 750
    },
    {
      "epoch": 0.6621354966016224,
      "grad_norm": 0.049560546875,
      "learning_rate": 6.180239051157681e-05,
      "loss": 1.0897,
      "step": 755
    },
    {
      "epoch": 0.6665204998903749,
      "grad_norm": 0.0478515625,
      "learning_rate": 6.039202339608432e-05,
      "loss": 1.0972,
      "step": 760
    },
    {
      "epoch": 0.6709055031791273,
      "grad_norm": 0.049560546875,
      "learning_rate": 5.8990939940156e-05,
      "loss": 1.0884,
      "step": 765
    },
    {
      "epoch": 0.6752905064678798,
      "grad_norm": 0.04833984375,
      "learning_rate": 5.7599468541830356e-05,
      "loss": 1.086,
      "step": 770
    },
    {
      "epoch": 0.6796755097566323,
      "grad_norm": 0.0478515625,
      "learning_rate": 5.62179353461888e-05,
      "loss": 1.0921,
      "step": 775
    },
    {
      "epoch": 0.6840605130453847,
      "grad_norm": 0.051025390625,
      "learning_rate": 5.484666416891109e-05,
      "loss": 1.0834,
      "step": 780
    },
    {
      "epoch": 0.6884455163341372,
      "grad_norm": 0.04833984375,
      "learning_rate": 5.3485976420376336e-05,
      "loss": 1.0827,
      "step": 785
    },
    {
      "epoch": 0.6928305196228897,
      "grad_norm": 0.050048828125,
      "learning_rate": 5.2136191030328455e-05,
      "loss": 1.0851,
      "step": 790
    },
    {
      "epoch": 0.6972155229116422,
      "grad_norm": 0.0517578125,
      "learning_rate": 5.079762437312219e-05,
      "loss": 1.0834,
      "step": 795
    },
    {
      "epoch": 0.7016005262003947,
      "grad_norm": 0.048583984375,
      "learning_rate": 4.9470590193569044e-05,
      "loss": 1.1016,
      "step": 800
    },
    {
      "epoch": 0.7059855294891472,
      "grad_norm": 0.049560546875,
      "learning_rate": 4.815539953339865e-05,
      "loss": 1.0686,
      "step": 805
    },
    {
      "epoch": 0.7103705327778996,
      "grad_norm": 0.047607421875,
      "learning_rate": 4.685236065835443e-05,
      "loss": 1.086,
      "step": 810
    },
    {
      "epoch": 0.7147555360666521,
      "grad_norm": 0.0498046875,
      "learning_rate": 4.5561778985939366e-05,
      "loss": 1.0817,
      "step": 815
    },
    {
      "epoch": 0.7191405393554046,
      "grad_norm": 0.048583984375,
      "learning_rate": 4.4283957013829846e-05,
      "loss": 1.0837,
      "step": 820
    },
    {
      "epoch": 0.723525542644157,
      "grad_norm": 0.0478515625,
      "learning_rate": 4.301919424897338e-05,
      "loss": 1.0791,
      "step": 825
    },
    {
      "epoch": 0.7279105459329095,
      "grad_norm": 0.0478515625,
      "learning_rate": 4.176778713738787e-05,
      "loss": 1.0865,
      "step": 830
    },
    {
      "epoch": 0.732295549221662,
      "grad_norm": 0.048583984375,
      "learning_rate": 4.053002899467774e-05,
      "loss": 1.0842,
      "step": 835
    },
    {
      "epoch": 0.7366805525104144,
      "grad_norm": 0.050048828125,
      "learning_rate": 3.9306209937284346e-05,
      "loss": 1.0939,
      "step": 840
    },
    {
      "epoch": 0.7410655557991669,
      "grad_norm": 0.048095703125,
      "learning_rate": 3.809661681448576e-05,
      "loss": 1.0941,
      "step": 845
    },
    {
      "epoch": 0.7454505590879194,
      "grad_norm": 0.048583984375,
      "learning_rate": 3.69015331411628e-05,
      "loss": 1.0664,
      "step": 850
    },
    {
      "epoch": 0.7498355623766718,
      "grad_norm": 0.05029296875,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 1.0844,
      "step": 855
    },
    {
      "epoch": 0.7542205656654243,
      "grad_norm": 0.047607421875,
      "learning_rate": 3.455601113256073e-05,
      "loss": 1.1036,
      "step": 860
    },
    {
      "epoch": 0.7586055689541767,
      "grad_norm": 0.048828125,
      "learning_rate": 3.340612256098316e-05,
      "loss": 1.0641,
      "step": 865
    },
    {
      "epoch": 0.7629905722429292,
      "grad_norm": 0.04833984375,
      "learning_rate": 3.227184283742591e-05,
      "loss": 1.087,
      "step": 870
    },
    {
      "epoch": 0.7673755755316817,
      "grad_norm": 0.04833984375,
      "learning_rate": 3.115343782416483e-05,
      "loss": 1.0992,
      "step": 875
    },
    {
      "epoch": 0.7717605788204341,
      "grad_norm": 0.047607421875,
      "learning_rate": 3.0051169662624225e-05,
      "loss": 1.0838,
      "step": 880
    },
    {
      "epoch": 0.7761455821091866,
      "grad_norm": 0.048583984375,
      "learning_rate": 2.89652967119336e-05,
      "loss": 1.0899,
      "step": 885
    },
    {
      "epoch": 0.7805305853979391,
      "grad_norm": 0.048583984375,
      "learning_rate": 2.789607348837153e-05,
      "loss": 1.0906,
      "step": 890
    },
    {
      "epoch": 0.7849155886866915,
      "grad_norm": 0.04736328125,
      "learning_rate": 2.684375060570965e-05,
      "loss": 1.0897,
      "step": 895
    },
    {
      "epoch": 0.789300591975444,
      "grad_norm": 0.04736328125,
      "learning_rate": 2.5808574716471856e-05,
      "loss": 1.0857,
      "step": 900
    },
    {
      "epoch": 0.7936855952641965,
      "grad_norm": 0.050537109375,
      "learning_rate": 2.4790788454121584e-05,
      "loss": 1.0973,
      "step": 905
    },
    {
      "epoch": 0.7980705985529489,
      "grad_norm": 0.04833984375,
      "learning_rate": 2.379063037619146e-05,
      "loss": 1.0714,
      "step": 910
    },
    {
      "epoch": 0.8024556018417014,
      "grad_norm": 0.048095703125,
      "learning_rate": 2.2808334908367914e-05,
      "loss": 1.0919,
      "step": 915
    },
    {
      "epoch": 0.8068406051304539,
      "grad_norm": 0.0478515625,
      "learning_rate": 2.184413228954468e-05,
      "loss": 1.082,
      "step": 920
    },
    {
      "epoch": 0.8112256084192063,
      "grad_norm": 0.0478515625,
      "learning_rate": 2.0898248517857256e-05,
      "loss": 1.091,
      "step": 925
    },
    {
      "epoch": 0.8156106117079588,
      "grad_norm": 0.04833984375,
      "learning_rate": 1.9970905297711606e-05,
      "loss": 1.0919,
      "step": 930
    },
    {
      "epoch": 0.8199956149967113,
      "grad_norm": 0.050048828125,
      "learning_rate": 1.9062319987819067e-05,
      "loss": 1.0668,
      "step": 935
    },
    {
      "epoch": 0.8243806182854637,
      "grad_norm": 0.04833984375,
      "learning_rate": 1.8172705550250092e-05,
      "loss": 1.0912,
      "step": 940
    },
    {
      "epoch": 0.8287656215742162,
      "grad_norm": 0.046630859375,
      "learning_rate": 1.7302270500518182e-05,
      "loss": 1.0886,
      "step": 945
    },
    {
      "epoch": 0.8331506248629686,
      "grad_norm": 0.047607421875,
      "learning_rate": 1.6451218858706374e-05,
      "loss": 1.0815,
      "step": 950
    },
    {
      "epoch": 0.8375356281517211,
      "grad_norm": 0.0498046875,
      "learning_rate": 1.5619750101647114e-05,
      "loss": 1.1055,
      "step": 955
    },
    {
      "epoch": 0.8419206314404736,
      "grad_norm": 0.04833984375,
      "learning_rate": 1.4808059116167305e-05,
      "loss": 1.0854,
      "step": 960
    },
    {
      "epoch": 0.846305634729226,
      "grad_norm": 0.048828125,
      "learning_rate": 1.4016336153408893e-05,
      "loss": 1.1044,
      "step": 965
    },
    {
      "epoch": 0.8506906380179785,
      "grad_norm": 0.047607421875,
      "learning_rate": 1.3244766784236307e-05,
      "loss": 1.103,
      "step": 970
    },
    {
      "epoch": 0.855075641306731,
      "grad_norm": 0.048583984375,
      "learning_rate": 1.2493531855740625e-05,
      "loss": 1.0638,
      "step": 975
    },
    {
      "epoch": 0.8594606445954834,
      "grad_norm": 0.047119140625,
      "learning_rate": 1.176280744885121e-05,
      "loss": 1.067,
      "step": 980
    },
    {
      "epoch": 0.8638456478842359,
      "grad_norm": 0.0478515625,
      "learning_rate": 1.1052764837064178e-05,
      "loss": 1.0787,
      "step": 985
    },
    {
      "epoch": 0.8682306511729884,
      "grad_norm": 0.04833984375,
      "learning_rate": 1.0363570446297999e-05,
      "loss": 1.0825,
      "step": 990
    },
    {
      "epoch": 0.8726156544617408,
      "grad_norm": 0.047607421875,
      "learning_rate": 9.695385815885016e-06,
      "loss": 1.0905,
      "step": 995
    },
    {
      "epoch": 0.8770006577504933,
      "grad_norm": 0.048095703125,
      "learning_rate": 9.048367560708604e-06,
      "loss": 1.0723,
      "step": 1000
    },
    {
      "epoch": 0.8813856610392458,
      "grad_norm": 0.0478515625,
      "learning_rate": 8.422667334494249e-06,
      "loss": 1.1059,
      "step": 1005
    },
    {
      "epoch": 0.8857706643279982,
      "grad_norm": 0.047607421875,
      "learning_rate": 7.818431794263836e-06,
      "loss": 1.1027,
      "step": 1010
    },
    {
      "epoch": 0.8901556676167507,
      "grad_norm": 0.048095703125,
      "learning_rate": 7.235802565960714e-06,
      "loss": 1.0733,
      "step": 1015
    },
    {
      "epoch": 0.8945406709055032,
      "grad_norm": 0.051025390625,
      "learning_rate": 6.674916211254289e-06,
      "loss": 1.0807,
      "step": 1020
    },
    {
      "epoch": 0.8989256741942556,
      "grad_norm": 0.05810546875,
      "learning_rate": 6.1359041955315725e-06,
      "loss": 1.0729,
      "step": 1025
    },
    {
      "epoch": 0.9033106774830081,
      "grad_norm": 0.0478515625,
      "learning_rate": 5.618892857083069e-06,
      "loss": 1.0816,
      "step": 1030
    },
    {
      "epoch": 0.9076956807717605,
      "grad_norm": 0.047607421875,
      "learning_rate": 5.124003377490582e-06,
      "loss": 1.0853,
      "step": 1035
    },
    {
      "epoch": 0.912080684060513,
      "grad_norm": 0.049560546875,
      "learning_rate": 4.65135175322361e-06,
      "loss": 1.0829,
      "step": 1040
    },
    {
      "epoch": 0.9164656873492655,
      "grad_norm": 0.047607421875,
      "learning_rate": 4.20104876845111e-06,
      "loss": 1.0797,
      "step": 1045
    },
    {
      "epoch": 0.9208506906380179,
      "grad_norm": 0.0478515625,
      "learning_rate": 3.7731999690749585e-06,
      "loss": 1.0908,
      "step": 1050
    },
    {
      "epoch": 0.9252356939267704,
      "grad_norm": 0.04736328125,
      "learning_rate": 3.367905637991142e-06,
      "loss": 1.0784,
      "step": 1055
    },
    {
      "epoch": 0.9296206972155229,
      "grad_norm": 0.0478515625,
      "learning_rate": 2.9852607715846193e-06,
      "loss": 1.101,
      "step": 1060
    },
    {
      "epoch": 0.9340057005042753,
      "grad_norm": 0.048095703125,
      "learning_rate": 2.6253550574632303e-06,
      "loss": 1.1063,
      "step": 1065
    },
    {
      "epoch": 0.9383907037930278,
      "grad_norm": 0.0478515625,
      "learning_rate": 2.288272853436013e-06,
      "loss": 1.0768,
      "step": 1070
    },
    {
      "epoch": 0.9427757070817803,
      "grad_norm": 0.049072265625,
      "learning_rate": 1.974093167740565e-06,
      "loss": 1.0755,
      "step": 1075
    },
    {
      "epoch": 0.9471607103705327,
      "grad_norm": 0.04833984375,
      "learning_rate": 1.6828896405244988e-06,
      "loss": 1.0714,
      "step": 1080
    },
    {
      "epoch": 0.9515457136592852,
      "grad_norm": 0.047119140625,
      "learning_rate": 1.4147305265850175e-06,
      "loss": 1.085,
      "step": 1085
    },
    {
      "epoch": 0.9559307169480377,
      "grad_norm": 0.047119140625,
      "learning_rate": 1.1696786793707781e-06,
      "loss": 1.0816,
      "step": 1090
    },
    {
      "epoch": 0.9603157202367901,
      "grad_norm": 0.046875,
      "learning_rate": 9.477915362496758e-07,
      "loss": 1.0925,
      "step": 1095
    },
    {
      "epoch": 0.9647007235255426,
      "grad_norm": 0.0478515625,
      "learning_rate": 7.491211050462798e-07,
      "loss": 1.0662,
      "step": 1100
    },
    {
      "epoch": 0.9690857268142951,
      "grad_norm": 0.048095703125,
      "learning_rate": 5.737139518517509e-07,
      "loss": 1.0941,
      "step": 1105
    },
    {
      "epoch": 0.9734707301030476,
      "grad_norm": 0.0498046875,
      "learning_rate": 4.216111901092501e-07,
      "loss": 1.086,
      "step": 1110
    },
    {
      "epoch": 0.9778557333918001,
      "grad_norm": 0.0478515625,
      "learning_rate": 2.9284847097746923e-07,
      "loss": 1.0768,
      "step": 1115
    },
    {
      "epoch": 0.9822407366805526,
      "grad_norm": 0.04736328125,
      "learning_rate": 1.8745597497433765e-07,
      "loss": 1.0733,
      "step": 1120
    },
    {
      "epoch": 0.986625739969305,
      "grad_norm": 0.04833984375,
      "learning_rate": 1.0545840490313596e-07,
      "loss": 1.0981,
      "step": 1125
    },
    {
      "epoch": 0.9910107432580575,
      "grad_norm": 0.05029296875,
      "learning_rate": 4.687498006236135e-08,
      "loss": 1.1037,
      "step": 1130
    },
    {
      "epoch": 0.99539574654681,
      "grad_norm": 0.0478515625,
      "learning_rate": 1.1719431740997433e-08,
      "loss": 1.0715,
      "step": 1135
    },
    {
      "epoch": 0.9997807498355624,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.0,
      "loss": 1.0883,
      "step": 1140
    },
    {
      "epoch": 0.9997807498355624,
      "eval_loss": 1.089297890663147,
      "eval_runtime": 2048.9326,
      "eval_samples_per_second": 7.883,
      "eval_steps_per_second": 7.883,
      "step": 1140
    },
    {
      "epoch": 0.9997807498355624,
      "step": 1140,
      "total_flos": 3.167597749864497e+18,
      "train_loss": 0.80264539467661,
      "train_runtime": 53655.2232,
      "train_samples_per_second": 2.72,
      "train_steps_per_second": 0.021
    }
  ],
  "logging_steps": 5,
  "max_steps": 1140,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "total_flos": 3.167597749864497e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}