RoyJoy's picture
Training in progress, step 111, checkpoint
f6a2eec verified
{
"best_metric": 1.2475459575653076,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 0.2748800495279369,
"eval_steps": 25,
"global_step": 111,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002476396842594026,
"grad_norm": 0.7329409122467041,
"learning_rate": 9.999999999999999e-05,
"loss": 2.4297,
"step": 1
},
{
"epoch": 0.002476396842594026,
"eval_loss": 2.370262622833252,
"eval_runtime": 1.0104,
"eval_samples_per_second": 49.483,
"eval_steps_per_second": 12.866,
"step": 1
},
{
"epoch": 0.004952793685188052,
"grad_norm": 0.7226835489273071,
"learning_rate": 0.00019999999999999998,
"loss": 2.0074,
"step": 2
},
{
"epoch": 0.007429190527782077,
"grad_norm": 0.7592707276344299,
"learning_rate": 0.0003,
"loss": 2.0726,
"step": 3
},
{
"epoch": 0.009905587370376103,
"grad_norm": 0.8533850312232971,
"learning_rate": 0.0002999428882610971,
"loss": 1.9296,
"step": 4
},
{
"epoch": 0.012381984212970128,
"grad_norm": 0.7311192750930786,
"learning_rate": 0.0002997716013666212,
"loss": 1.7437,
"step": 5
},
{
"epoch": 0.014858381055564154,
"grad_norm": 0.6385353803634644,
"learning_rate": 0.0002994862842423856,
"loss": 1.7528,
"step": 6
},
{
"epoch": 0.01733477789815818,
"grad_norm": 0.8438978791236877,
"learning_rate": 0.0002990871782951623,
"loss": 1.6075,
"step": 7
},
{
"epoch": 0.019811174740752207,
"grad_norm": 0.9431717395782471,
"learning_rate": 0.00029857462120842744,
"loss": 1.6422,
"step": 8
},
{
"epoch": 0.02228757158334623,
"grad_norm": 0.758976399898529,
"learning_rate": 0.0002979490466566481,
"loss": 1.6292,
"step": 9
},
{
"epoch": 0.024763968425940256,
"grad_norm": 0.6007578372955322,
"learning_rate": 0.0002972109839383494,
"loss": 1.5238,
"step": 10
},
{
"epoch": 0.027240365268534284,
"grad_norm": 0.4644957482814789,
"learning_rate": 0.0002963610575282762,
"loss": 1.4831,
"step": 11
},
{
"epoch": 0.02971676211112831,
"grad_norm": 0.5231449604034424,
"learning_rate": 0.0002953999865490242,
"loss": 1.4047,
"step": 12
},
{
"epoch": 0.03219315895372234,
"grad_norm": 0.6303499937057495,
"learning_rate": 0.00029432858416259097,
"loss": 1.8963,
"step": 13
},
{
"epoch": 0.03466955579631636,
"grad_norm": 0.7002948522567749,
"learning_rate": 0.0002931477568823596,
"loss": 1.5882,
"step": 14
},
{
"epoch": 0.037145952638910386,
"grad_norm": 0.48810043931007385,
"learning_rate": 0.00029185850380609757,
"loss": 1.5572,
"step": 15
},
{
"epoch": 0.039622349481504414,
"grad_norm": 0.4367665648460388,
"learning_rate": 0.000290461915770621,
"loss": 1.4534,
"step": 16
},
{
"epoch": 0.042098746324098435,
"grad_norm": 0.39929234981536865,
"learning_rate": 0.00028895917442883697,
"loss": 1.4147,
"step": 17
},
{
"epoch": 0.04457514316669246,
"grad_norm": 0.38216057419776917,
"learning_rate": 0.00028735155124994774,
"loss": 1.438,
"step": 18
},
{
"epoch": 0.04705154000928649,
"grad_norm": 0.3422548770904541,
"learning_rate": 0.0002856404064436606,
"loss": 1.3886,
"step": 19
},
{
"epoch": 0.04952793685188051,
"grad_norm": 0.4055199921131134,
"learning_rate": 0.000283827187809315,
"loss": 1.47,
"step": 20
},
{
"epoch": 0.05200433369447454,
"grad_norm": 0.42674520611763,
"learning_rate": 0.0002819134295108992,
"loss": 1.4333,
"step": 21
},
{
"epoch": 0.05448073053706857,
"grad_norm": 0.3820836544036865,
"learning_rate": 0.00027990075077899494,
"loss": 1.4092,
"step": 22
},
{
"epoch": 0.05695712737966259,
"grad_norm": 0.4357558786869049,
"learning_rate": 0.0002777908545407464,
"loss": 1.4545,
"step": 23
},
{
"epoch": 0.05943352422225662,
"grad_norm": 0.43220487236976624,
"learning_rate": 0.0002755855259790139,
"loss": 1.3423,
"step": 24
},
{
"epoch": 0.061909921064850645,
"grad_norm": 0.5457894206047058,
"learning_rate": 0.0002732866310219309,
"loss": 1.4758,
"step": 25
},
{
"epoch": 0.061909921064850645,
"eval_loss": 1.4271225929260254,
"eval_runtime": 1.0088,
"eval_samples_per_second": 49.566,
"eval_steps_per_second": 12.887,
"step": 25
},
{
"epoch": 0.06438631790744467,
"grad_norm": 0.6328326463699341,
"learning_rate": 0.0002708961147641427,
"loss": 1.6565,
"step": 26
},
{
"epoch": 0.06686271475003869,
"grad_norm": 0.39392906427383423,
"learning_rate": 0.000268415999821062,
"loss": 1.445,
"step": 27
},
{
"epoch": 0.06933911159263272,
"grad_norm": 0.40518683195114136,
"learning_rate": 0.00026584838461753444,
"loss": 1.4417,
"step": 28
},
{
"epoch": 0.07181550843522674,
"grad_norm": 0.40206319093704224,
"learning_rate": 0.00026319544161236156,
"loss": 1.3806,
"step": 29
},
{
"epoch": 0.07429190527782077,
"grad_norm": 0.427487313747406,
"learning_rate": 0.0002604594154601839,
"loss": 1.3721,
"step": 30
},
{
"epoch": 0.0767683021204148,
"grad_norm": 0.4002684950828552,
"learning_rate": 0.00025764262111227905,
"loss": 1.3487,
"step": 31
},
{
"epoch": 0.07924469896300883,
"grad_norm": 0.474567711353302,
"learning_rate": 0.00025474744185788155,
"loss": 1.3678,
"step": 32
},
{
"epoch": 0.08172109580560284,
"grad_norm": 0.40476325154304504,
"learning_rate": 0.0002517763273076828,
"loss": 1.3158,
"step": 33
},
{
"epoch": 0.08419749264819687,
"grad_norm": 0.3845173418521881,
"learning_rate": 0.00024873179132121507,
"loss": 1.3835,
"step": 34
},
{
"epoch": 0.0866738894907909,
"grad_norm": 0.3646795451641083,
"learning_rate": 0.0002456164098798761,
"loss": 1.3434,
"step": 35
},
{
"epoch": 0.08915028633338493,
"grad_norm": 0.449341356754303,
"learning_rate": 0.0002424328189073912,
"loss": 1.297,
"step": 36
},
{
"epoch": 0.09162668317597895,
"grad_norm": 0.44050559401512146,
"learning_rate": 0.0002391837120395588,
"loss": 1.2494,
"step": 37
},
{
"epoch": 0.09410308001857298,
"grad_norm": 0.5633939504623413,
"learning_rate": 0.00023587183834516558,
"loss": 1.6648,
"step": 38
},
{
"epoch": 0.096579476861167,
"grad_norm": 0.47090381383895874,
"learning_rate": 0.00023249999999999999,
"loss": 1.4137,
"step": 39
},
{
"epoch": 0.09905587370376102,
"grad_norm": 0.3539658486843109,
"learning_rate": 0.00022907104991593143,
"loss": 1.3808,
"step": 40
},
{
"epoch": 0.10153227054635505,
"grad_norm": 0.33691152930259705,
"learning_rate": 0.0002255878893270624,
"loss": 1.2999,
"step": 41
},
{
"epoch": 0.10400866738894908,
"grad_norm": 0.3699648976325989,
"learning_rate": 0.00022205346533499438,
"loss": 1.3159,
"step": 42
},
{
"epoch": 0.10648506423154311,
"grad_norm": 0.3926730155944824,
"learning_rate": 0.00021847076841528617,
"loss": 1.2978,
"step": 43
},
{
"epoch": 0.10896146107413714,
"grad_norm": 0.43968045711517334,
"learning_rate": 0.00021484282988721236,
"loss": 1.3076,
"step": 44
},
{
"epoch": 0.11143785791673115,
"grad_norm": 0.4484625458717346,
"learning_rate": 0.00021117271934896527,
"loss": 1.3052,
"step": 45
},
{
"epoch": 0.11391425475932518,
"grad_norm": 0.4811747670173645,
"learning_rate": 0.00020746354208046782,
"loss": 1.2317,
"step": 46
},
{
"epoch": 0.1163906516019192,
"grad_norm": 0.38913848996162415,
"learning_rate": 0.00020371843641599718,
"loss": 1.2534,
"step": 47
},
{
"epoch": 0.11886704844451323,
"grad_norm": 0.4041205048561096,
"learning_rate": 0.0001999405710888403,
"loss": 1.3195,
"step": 48
},
{
"epoch": 0.12134344528710726,
"grad_norm": 0.391659677028656,
"learning_rate": 0.0001961331425502294,
"loss": 1.2099,
"step": 49
},
{
"epoch": 0.12381984212970129,
"grad_norm": 0.4483822286128998,
"learning_rate": 0.0001922993722648251,
"loss": 1.2427,
"step": 50
},
{
"epoch": 0.12381984212970129,
"eval_loss": 1.3279614448547363,
"eval_runtime": 1.0087,
"eval_samples_per_second": 49.57,
"eval_steps_per_second": 12.888,
"step": 50
},
{
"epoch": 0.1262962389722953,
"grad_norm": 0.5131921768188477,
"learning_rate": 0.0001884425039850356,
"loss": 1.602,
"step": 51
},
{
"epoch": 0.12877263581488935,
"grad_norm": 0.588230550289154,
"learning_rate": 0.00018456580100647827,
"loss": 1.3518,
"step": 52
},
{
"epoch": 0.13124903265748336,
"grad_norm": 0.5344261527061462,
"learning_rate": 0.00018067254340690606,
"loss": 1.2842,
"step": 53
},
{
"epoch": 0.13372542950007738,
"grad_norm": 0.42600345611572266,
"learning_rate": 0.00017676602527093386,
"loss": 1.2732,
"step": 54
},
{
"epoch": 0.13620182634267142,
"grad_norm": 0.3716365396976471,
"learning_rate": 0.00017284955190291422,
"loss": 1.3104,
"step": 55
},
{
"epoch": 0.13867822318526543,
"grad_norm": 0.36169472336769104,
"learning_rate": 0.00016892643703032004,
"loss": 1.2831,
"step": 56
},
{
"epoch": 0.14115462002785947,
"grad_norm": 0.38514360785484314,
"learning_rate": 0.000165,
"loss": 1.1825,
"step": 57
},
{
"epoch": 0.1436310168704535,
"grad_norm": 0.46233615279197693,
"learning_rate": 0.00016107356296967993,
"loss": 1.3065,
"step": 58
},
{
"epoch": 0.14610741371304753,
"grad_norm": 0.49720239639282227,
"learning_rate": 0.00015715044809708577,
"loss": 1.2713,
"step": 59
},
{
"epoch": 0.14858381055564154,
"grad_norm": 0.4614707827568054,
"learning_rate": 0.0001532339747290661,
"loss": 1.261,
"step": 60
},
{
"epoch": 0.15106020739823556,
"grad_norm": 0.4578586220741272,
"learning_rate": 0.00014932745659309386,
"loss": 1.2485,
"step": 61
},
{
"epoch": 0.1535366042408296,
"grad_norm": 0.38602447509765625,
"learning_rate": 0.00014543419899352172,
"loss": 1.1659,
"step": 62
},
{
"epoch": 0.1560130010834236,
"grad_norm": 0.4391464293003082,
"learning_rate": 0.0001415574960149644,
"loss": 1.5451,
"step": 63
},
{
"epoch": 0.15848939792601766,
"grad_norm": 0.39024028182029724,
"learning_rate": 0.0001377006277351749,
"loss": 1.3855,
"step": 64
},
{
"epoch": 0.16096579476861167,
"grad_norm": 0.33916693925857544,
"learning_rate": 0.00013386685744977056,
"loss": 1.2976,
"step": 65
},
{
"epoch": 0.16344219161120568,
"grad_norm": 0.3237704634666443,
"learning_rate": 0.00013005942891115968,
"loss": 1.2733,
"step": 66
},
{
"epoch": 0.16591858845379973,
"grad_norm": 0.3709225654602051,
"learning_rate": 0.00012628156358400285,
"loss": 1.2839,
"step": 67
},
{
"epoch": 0.16839498529639374,
"grad_norm": 0.3892875611782074,
"learning_rate": 0.00012253645791953217,
"loss": 1.3312,
"step": 68
},
{
"epoch": 0.17087138213898778,
"grad_norm": 0.3935694992542267,
"learning_rate": 0.00011882728065103471,
"loss": 1.246,
"step": 69
},
{
"epoch": 0.1733477789815818,
"grad_norm": 0.3991238474845886,
"learning_rate": 0.0001151571701127876,
"loss": 1.2713,
"step": 70
},
{
"epoch": 0.17582417582417584,
"grad_norm": 0.39288103580474854,
"learning_rate": 0.00011152923158471383,
"loss": 1.228,
"step": 71
},
{
"epoch": 0.17830057266676985,
"grad_norm": 0.3925284743309021,
"learning_rate": 0.0001079465346650056,
"loss": 1.2817,
"step": 72
},
{
"epoch": 0.18077696950936387,
"grad_norm": 0.41174066066741943,
"learning_rate": 0.00010441211067293761,
"loss": 1.2499,
"step": 73
},
{
"epoch": 0.1832533663519579,
"grad_norm": 0.3850061595439911,
"learning_rate": 0.00010092895008406854,
"loss": 1.1749,
"step": 74
},
{
"epoch": 0.18572976319455192,
"grad_norm": 0.45076093077659607,
"learning_rate": 9.750000000000003e-05,
"loss": 1.2532,
"step": 75
},
{
"epoch": 0.18572976319455192,
"eval_loss": 1.2752010822296143,
"eval_runtime": 1.015,
"eval_samples_per_second": 49.259,
"eval_steps_per_second": 12.807,
"step": 75
},
{
"epoch": 0.18820616003714596,
"grad_norm": 0.44986942410469055,
"learning_rate": 9.412816165483439e-05,
"loss": 1.6073,
"step": 76
},
{
"epoch": 0.19068255687973998,
"grad_norm": 0.35847336053848267,
"learning_rate": 9.081628796044118e-05,
"loss": 1.2849,
"step": 77
},
{
"epoch": 0.193158953722334,
"grad_norm": 0.3534252643585205,
"learning_rate": 8.756718109260881e-05,
"loss": 1.3032,
"step": 78
},
{
"epoch": 0.19563535056492803,
"grad_norm": 0.3720012903213501,
"learning_rate": 8.438359012012389e-05,
"loss": 1.2459,
"step": 79
},
{
"epoch": 0.19811174740752205,
"grad_norm": 0.3475172817707062,
"learning_rate": 8.126820867878491e-05,
"loss": 1.2174,
"step": 80
},
{
"epoch": 0.2005881442501161,
"grad_norm": 0.3700581192970276,
"learning_rate": 7.822367269231717e-05,
"loss": 1.3342,
"step": 81
},
{
"epoch": 0.2030645410927101,
"grad_norm": 0.38341790437698364,
"learning_rate": 7.525255814211838e-05,
"loss": 1.2484,
"step": 82
},
{
"epoch": 0.20554093793530412,
"grad_norm": 0.3764133155345917,
"learning_rate": 7.235737888772097e-05,
"loss": 1.2419,
"step": 83
},
{
"epoch": 0.20801733477789816,
"grad_norm": 0.387975811958313,
"learning_rate": 6.954058453981609e-05,
"loss": 1.2377,
"step": 84
},
{
"epoch": 0.21049373162049217,
"grad_norm": 0.40386706590652466,
"learning_rate": 6.680455838763842e-05,
"loss": 1.206,
"step": 85
},
{
"epoch": 0.21297012846308622,
"grad_norm": 0.4010809063911438,
"learning_rate": 6.415161538246557e-05,
"loss": 1.2053,
"step": 86
},
{
"epoch": 0.21544652530568023,
"grad_norm": 0.420891135931015,
"learning_rate": 6.158400017893799e-05,
"loss": 1.2282,
"step": 87
},
{
"epoch": 0.21792292214827427,
"grad_norm": 0.44719812273979187,
"learning_rate": 5.9103885235857274e-05,
"loss": 1.5592,
"step": 88
},
{
"epoch": 0.2203993189908683,
"grad_norm": 0.36982178688049316,
"learning_rate": 5.671336897806908e-05,
"loss": 1.348,
"step": 89
},
{
"epoch": 0.2228757158334623,
"grad_norm": 0.3596123158931732,
"learning_rate": 5.441447402098609e-05,
"loss": 1.2514,
"step": 90
},
{
"epoch": 0.22535211267605634,
"grad_norm": 0.3666294515132904,
"learning_rate": 5.2209145459253604e-05,
"loss": 1.2815,
"step": 91
},
{
"epoch": 0.22782850951865036,
"grad_norm": 0.3626820743083954,
"learning_rate": 5.0099249221005035e-05,
"loss": 1.242,
"step": 92
},
{
"epoch": 0.2303049063612444,
"grad_norm": 0.3683141767978668,
"learning_rate": 4.808657048910077e-05,
"loss": 1.2619,
"step": 93
},
{
"epoch": 0.2327813032038384,
"grad_norm": 0.38167521357536316,
"learning_rate": 4.617281219068502e-05,
"loss": 1.2324,
"step": 94
},
{
"epoch": 0.23525770004643243,
"grad_norm": 0.386264443397522,
"learning_rate": 4.435959355633935e-05,
"loss": 1.2487,
"step": 95
},
{
"epoch": 0.23773409688902647,
"grad_norm": 0.3971124589443207,
"learning_rate": 4.264844875005226e-05,
"loss": 1.2667,
"step": 96
},
{
"epoch": 0.24021049373162048,
"grad_norm": 0.39618241786956787,
"learning_rate": 4.104082557116302e-05,
"loss": 1.2715,
"step": 97
},
{
"epoch": 0.24268689057421453,
"grad_norm": 0.4155503809452057,
"learning_rate": 3.953808422937896e-05,
"loss": 1.199,
"step": 98
},
{
"epoch": 0.24516328741680854,
"grad_norm": 0.40676349401474,
"learning_rate": 3.814149619390237e-05,
"loss": 1.1614,
"step": 99
},
{
"epoch": 0.24763968425940258,
"grad_norm": 0.4450746178627014,
"learning_rate": 3.685224311764042e-05,
"loss": 1.185,
"step": 100
},
{
"epoch": 0.24763968425940258,
"eval_loss": 1.2475459575653076,
"eval_runtime": 1.0194,
"eval_samples_per_second": 49.047,
"eval_steps_per_second": 12.752,
"step": 100
},
{
"epoch": 0.25011608110199657,
"grad_norm": 0.4321860671043396,
"learning_rate": 3.567141583740899e-05,
"loss": 1.5005,
"step": 101
},
{
"epoch": 0.2525924779445906,
"grad_norm": 0.3553882837295532,
"learning_rate": 3.4600013450975794e-05,
"loss": 1.2593,
"step": 102
},
{
"epoch": 0.25506887478718465,
"grad_norm": 0.3538253605365753,
"learning_rate": 3.3638942471723784e-05,
"loss": 1.2722,
"step": 103
},
{
"epoch": 0.2575452716297787,
"grad_norm": 0.36580267548561096,
"learning_rate": 3.2789016061650545e-05,
"loss": 1.2305,
"step": 104
},
{
"epoch": 0.2600216684723727,
"grad_norm": 0.36455708742141724,
"learning_rate": 3.205095334335192e-05,
"loss": 1.2613,
"step": 105
},
{
"epoch": 0.2624980653149667,
"grad_norm": 0.3923397660255432,
"learning_rate": 3.1425378791572495e-05,
"loss": 1.3292,
"step": 106
},
{
"epoch": 0.26497446215756076,
"grad_norm": 0.3707919120788574,
"learning_rate": 3.0912821704837695e-05,
"loss": 1.1993,
"step": 107
},
{
"epoch": 0.26745085900015475,
"grad_norm": 0.3740595281124115,
"learning_rate": 3.051371575761435e-05,
"loss": 1.1775,
"step": 108
},
{
"epoch": 0.2699272558427488,
"grad_norm": 0.3999345004558563,
"learning_rate": 3.0228398633378795e-05,
"loss": 1.198,
"step": 109
},
{
"epoch": 0.27240365268534283,
"grad_norm": 0.43420764803886414,
"learning_rate": 3.005711173890292e-05,
"loss": 1.2404,
"step": 110
},
{
"epoch": 0.2748800495279369,
"grad_norm": 0.45214489102363586,
"learning_rate": 2.9999999999999997e-05,
"loss": 1.2226,
"step": 111
}
],
"logging_steps": 1,
"max_steps": 111,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2015776107027497e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}