RoyJoy's picture
Training in progress, step 81, checkpoint
e4aa62f verified
{
"best_metric": 0.4470236301422119,
"best_model_checkpoint": "miner_id_24/checkpoint-50",
"epoch": 2.009298721425804,
"eval_steps": 25,
"global_step": 81,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02479659046881054,
"grad_norm": 9.463377952575684,
"learning_rate": 0.00015,
"loss": 8.2539,
"step": 1
},
{
"epoch": 0.02479659046881054,
"eval_loss": 8.061068534851074,
"eval_runtime": 1.5646,
"eval_samples_per_second": 31.957,
"eval_steps_per_second": 8.309,
"step": 1
},
{
"epoch": 0.04959318093762108,
"grad_norm": 10.420730590820312,
"learning_rate": 0.0003,
"loss": 8.1104,
"step": 2
},
{
"epoch": 0.07438977140643162,
"grad_norm": 9.304948806762695,
"learning_rate": 0.0002998932686256857,
"loss": 5.7079,
"step": 3
},
{
"epoch": 0.09918636187524216,
"grad_norm": 7.6371588706970215,
"learning_rate": 0.0002995732432669839,
"loss": 4.4867,
"step": 4
},
{
"epoch": 0.1239829523440527,
"grad_norm": 5.276372909545898,
"learning_rate": 0.00029904042994976636,
"loss": 4.1186,
"step": 5
},
{
"epoch": 0.14877954281286324,
"grad_norm": 7.107175827026367,
"learning_rate": 0.000298295671161405,
"loss": 3.336,
"step": 6
},
{
"epoch": 0.17357613328167376,
"grad_norm": 7.58524751663208,
"learning_rate": 0.00029734014451862585,
"loss": 2.9452,
"step": 7
},
{
"epoch": 0.19837272375048431,
"grad_norm": 4.276736259460449,
"learning_rate": 0.0002961753609054553,
"loss": 2.3813,
"step": 8
},
{
"epoch": 0.22316931421929484,
"grad_norm": 3.3326385021209717,
"learning_rate": 0.00029480316208420175,
"loss": 1.6502,
"step": 9
},
{
"epoch": 0.2479659046881054,
"grad_norm": 2.8657619953155518,
"learning_rate": 0.0002932257177832507,
"loss": 1.3725,
"step": 10
},
{
"epoch": 0.2727624951569159,
"grad_norm": 6.442175388336182,
"learning_rate": 0.0002914455222662782,
"loss": 2.0797,
"step": 11
},
{
"epoch": 0.29755908562572647,
"grad_norm": 4.634378910064697,
"learning_rate": 0.00028946539038830787,
"loss": 1.4848,
"step": 12
},
{
"epoch": 0.322355676094537,
"grad_norm": 3.6060268878936768,
"learning_rate": 0.0002872884531448456,
"loss": 1.1722,
"step": 13
},
{
"epoch": 0.3471522665633475,
"grad_norm": 3.540088176727295,
"learning_rate": 0.0002849181527211328,
"loss": 0.9732,
"step": 14
},
{
"epoch": 0.3719488570321581,
"grad_norm": 2.787266492843628,
"learning_rate": 0.0002823582370493436,
"loss": 0.8253,
"step": 15
},
{
"epoch": 0.39674544750096863,
"grad_norm": 2.26003360748291,
"learning_rate": 0.00027961275388233416,
"loss": 0.7514,
"step": 16
},
{
"epoch": 0.4215420379697792,
"grad_norm": 1.5791990756988525,
"learning_rate": 0.0002766860443933127,
"loss": 0.7033,
"step": 17
},
{
"epoch": 0.4463386284385897,
"grad_norm": 2.00994610786438,
"learning_rate": 0.00027358273631155326,
"loss": 0.8012,
"step": 18
},
{
"epoch": 0.47113521890740023,
"grad_norm": 1.9376633167266846,
"learning_rate": 0.0002703077366050036,
"loss": 0.7117,
"step": 19
},
{
"epoch": 0.4959318093762108,
"grad_norm": 0.8688062429428101,
"learning_rate": 0.00026686622372136103,
"loss": 0.6083,
"step": 20
},
{
"epoch": 0.5207283998450213,
"grad_norm": 2.4382097721099854,
"learning_rate": 0.0002632636393998817,
"loss": 0.7095,
"step": 21
},
{
"epoch": 0.5455249903138318,
"grad_norm": 1.1420600414276123,
"learning_rate": 0.0002595056800668724,
"loss": 0.5918,
"step": 22
},
{
"epoch": 0.5703215807826424,
"grad_norm": 1.245035171508789,
"learning_rate": 0.00025559828782846994,
"loss": 0.6538,
"step": 23
},
{
"epoch": 0.5951181712514529,
"grad_norm": 0.8991405367851257,
"learning_rate": 0.0002515476410749497,
"loss": 0.6102,
"step": 24
},
{
"epoch": 0.6199147617202635,
"grad_norm": 1.0999940633773804,
"learning_rate": 0.00024736014471142076,
"loss": 0.5558,
"step": 25
},
{
"epoch": 0.6199147617202635,
"eval_loss": 0.5675167441368103,
"eval_runtime": 1.566,
"eval_samples_per_second": 31.929,
"eval_steps_per_second": 8.302,
"step": 25
},
{
"epoch": 0.644711352189074,
"grad_norm": 0.8524123430252075,
"learning_rate": 0.0002430424200303545,
"loss": 0.5466,
"step": 26
},
{
"epoch": 0.6695079426578845,
"grad_norm": 0.7143929600715637,
"learning_rate": 0.00023860129424196008,
"loss": 0.5559,
"step": 27
},
{
"epoch": 0.694304533126695,
"grad_norm": 0.33965978026390076,
"learning_rate": 0.000234043789678962,
"loss": 0.5331,
"step": 28
},
{
"epoch": 0.7191011235955056,
"grad_norm": 0.8655706644058228,
"learning_rate": 0.00022937711269284834,
"loss": 0.6151,
"step": 29
},
{
"epoch": 0.7438977140643162,
"grad_norm": 0.618858277797699,
"learning_rate": 0.00022460864225914807,
"loss": 0.5515,
"step": 30
},
{
"epoch": 0.7686943045331267,
"grad_norm": 1.0988430976867676,
"learning_rate": 0.00021974591830975417,
"loss": 0.5621,
"step": 31
},
{
"epoch": 0.7934908950019373,
"grad_norm": 0.6801258325576782,
"learning_rate": 0.00021479662981074103,
"loss": 0.5204,
"step": 32
},
{
"epoch": 0.8182874854707478,
"grad_norm": 0.7187148928642273,
"learning_rate": 0.00020976860260452914,
"loss": 0.5153,
"step": 33
},
{
"epoch": 0.8430840759395584,
"grad_norm": 0.48717305064201355,
"learning_rate": 0.00020466978703561934,
"loss": 0.4948,
"step": 34
},
{
"epoch": 0.8678806664083688,
"grad_norm": 0.4217776656150818,
"learning_rate": 0.00019950824537946405,
"loss": 0.4804,
"step": 35
},
{
"epoch": 0.8926772568771794,
"grad_norm": 0.37866833806037903,
"learning_rate": 0.00019429213909435291,
"loss": 0.4657,
"step": 36
},
{
"epoch": 0.9174738473459899,
"grad_norm": 0.3701322674751282,
"learning_rate": 0.00018902971591646932,
"loss": 0.4642,
"step": 37
},
{
"epoch": 0.9422704378148005,
"grad_norm": 0.4386747181415558,
"learning_rate": 0.00018372929681852423,
"loss": 0.4875,
"step": 38
},
{
"epoch": 0.967067028283611,
"grad_norm": 0.5399623513221741,
"learning_rate": 0.0001783992628525883,
"loss": 0.5031,
"step": 39
},
{
"epoch": 0.9918636187524216,
"grad_norm": 0.3022868037223816,
"learning_rate": 0.00017304804189792567,
"loss": 0.4871,
"step": 40
},
{
"epoch": 1.0170476559473072,
"grad_norm": 1.213976502418518,
"learning_rate": 0.00016768409533478448,
"loss": 0.8302,
"step": 41
},
{
"epoch": 1.0418442464161177,
"grad_norm": 0.42101261019706726,
"learning_rate": 0.0001623159046652155,
"loss": 0.4438,
"step": 42
},
{
"epoch": 1.0666408368849283,
"grad_norm": 0.4285363554954529,
"learning_rate": 0.0001569519581020743,
"loss": 0.4617,
"step": 43
},
{
"epoch": 1.0914374273537388,
"grad_norm": 0.37666165828704834,
"learning_rate": 0.00015160073714741169,
"loss": 0.4486,
"step": 44
},
{
"epoch": 1.1162340178225494,
"grad_norm": 0.28406891226768494,
"learning_rate": 0.00014627070318147577,
"loss": 0.4323,
"step": 45
},
{
"epoch": 1.14103060829136,
"grad_norm": 0.28831884264945984,
"learning_rate": 0.0001409702840835307,
"loss": 0.4359,
"step": 46
},
{
"epoch": 1.1658271987601705,
"grad_norm": 0.35207369923591614,
"learning_rate": 0.00013570786090564706,
"loss": 0.4276,
"step": 47
},
{
"epoch": 1.190623789228981,
"grad_norm": 0.44619789719581604,
"learning_rate": 0.0001304917546205359,
"loss": 0.4528,
"step": 48
},
{
"epoch": 1.2154203796977916,
"grad_norm": 0.42749229073524475,
"learning_rate": 0.00012533021296438072,
"loss": 0.4688,
"step": 49
},
{
"epoch": 1.2402169701666022,
"grad_norm": 0.536782443523407,
"learning_rate": 0.00012023139739547084,
"loss": 0.4451,
"step": 50
},
{
"epoch": 1.2402169701666022,
"eval_loss": 0.4470236301422119,
"eval_runtime": 1.5686,
"eval_samples_per_second": 31.875,
"eval_steps_per_second": 8.287,
"step": 50
},
{
"epoch": 1.2650135606354127,
"grad_norm": 1.0695531368255615,
"learning_rate": 0.00011520337018925895,
"loss": 0.4534,
"step": 51
},
{
"epoch": 1.289810151104223,
"grad_norm": 0.326072633266449,
"learning_rate": 0.00011025408169024583,
"loss": 0.4319,
"step": 52
},
{
"epoch": 1.3146067415730336,
"grad_norm": 0.3365119993686676,
"learning_rate": 0.00010539135774085195,
"loss": 0.4362,
"step": 53
},
{
"epoch": 1.3394033320418441,
"grad_norm": 0.4222407042980194,
"learning_rate": 0.0001006228873071517,
"loss": 0.43,
"step": 54
},
{
"epoch": 1.3641999225106547,
"grad_norm": 0.35905617475509644,
"learning_rate": 9.595621032103801e-05,
"loss": 0.427,
"step": 55
},
{
"epoch": 1.3889965129794652,
"grad_norm": 0.26657891273498535,
"learning_rate": 9.139870575803991e-05,
"loss": 0.4208,
"step": 56
},
{
"epoch": 1.4137931034482758,
"grad_norm": 0.2459261119365692,
"learning_rate": 8.695757996964544e-05,
"loss": 0.4123,
"step": 57
},
{
"epoch": 1.4385896939170864,
"grad_norm": 0.3061677813529968,
"learning_rate": 8.263985528857921e-05,
"loss": 0.429,
"step": 58
},
{
"epoch": 1.463386284385897,
"grad_norm": 0.4884737432003021,
"learning_rate": 7.845235892505033e-05,
"loss": 0.4428,
"step": 59
},
{
"epoch": 1.4881828748547075,
"grad_norm": 0.3974491059780121,
"learning_rate": 7.440171217153001e-05,
"loss": 0.4357,
"step": 60
},
{
"epoch": 1.512979465323518,
"grad_norm": 0.31677502393722534,
"learning_rate": 7.049431993312759e-05,
"loss": 0.4063,
"step": 61
},
{
"epoch": 1.5377760557923286,
"grad_norm": 0.5034978985786438,
"learning_rate": 6.67363606001183e-05,
"loss": 0.416,
"step": 62
},
{
"epoch": 1.5625726462611391,
"grad_norm": 0.3409871757030487,
"learning_rate": 6.313377627863891e-05,
"loss": 0.406,
"step": 63
},
{
"epoch": 1.5873692367299497,
"grad_norm": 0.2753334939479828,
"learning_rate": 5.9692263394996376e-05,
"loss": 0.3993,
"step": 64
},
{
"epoch": 1.6121658271987602,
"grad_norm": 0.4815467596054077,
"learning_rate": 5.641726368844675e-05,
"loss": 0.4139,
"step": 65
},
{
"epoch": 1.6369624176675708,
"grad_norm": 0.42127665877342224,
"learning_rate": 5.3313955606687296e-05,
"loss": 0.4155,
"step": 66
},
{
"epoch": 1.6617590081363813,
"grad_norm": 0.29522252082824707,
"learning_rate": 5.038724611766585e-05,
"loss": 0.4038,
"step": 67
},
{
"epoch": 1.6865555986051919,
"grad_norm": 0.3209092319011688,
"learning_rate": 4.764176295065636e-05,
"loss": 0.4012,
"step": 68
},
{
"epoch": 1.7113521890740024,
"grad_norm": 0.4263613820075989,
"learning_rate": 4.508184727886723e-05,
"loss": 0.4195,
"step": 69
},
{
"epoch": 1.736148779542813,
"grad_norm": 0.42666929960250854,
"learning_rate": 4.271154685515435e-05,
"loss": 0.4198,
"step": 70
},
{
"epoch": 1.7609453700116235,
"grad_norm": 0.29947665333747864,
"learning_rate": 4.0534609611692115e-05,
"loss": 0.3954,
"step": 71
},
{
"epoch": 1.785741960480434,
"grad_norm": 0.3265860676765442,
"learning_rate": 3.855447773372175e-05,
"loss": 0.402,
"step": 72
},
{
"epoch": 1.8105385509492444,
"grad_norm": 0.4390391707420349,
"learning_rate": 3.6774282216749336e-05,
"loss": 0.3979,
"step": 73
},
{
"epoch": 1.835335141418055,
"grad_norm": 0.28207874298095703,
"learning_rate": 3.519683791579824e-05,
"loss": 0.3837,
"step": 74
},
{
"epoch": 1.8601317318868655,
"grad_norm": 0.3026869297027588,
"learning_rate": 3.382463909454464e-05,
"loss": 0.3973,
"step": 75
},
{
"epoch": 1.8601317318868655,
"eval_loss": 0.39713945984840393,
"eval_runtime": 1.5675,
"eval_samples_per_second": 31.898,
"eval_steps_per_second": 8.294,
"step": 75
},
{
"epoch": 1.884928322355676,
"grad_norm": 0.25843551754951477,
"learning_rate": 3.265985548137413e-05,
"loss": 0.3886,
"step": 76
},
{
"epoch": 1.9097249128244866,
"grad_norm": 0.23140764236450195,
"learning_rate": 3.1704328838595e-05,
"loss": 0.3856,
"step": 77
},
{
"epoch": 1.9345215032932972,
"grad_norm": 0.34332171082496643,
"learning_rate": 3.0959570050233586e-05,
"loss": 0.3945,
"step": 78
},
{
"epoch": 1.9593180937621077,
"grad_norm": 0.3156374990940094,
"learning_rate": 3.0426756733016063e-05,
"loss": 0.3956,
"step": 79
},
{
"epoch": 1.9841146842309183,
"grad_norm": 0.34009358286857605,
"learning_rate": 3.010673137431425e-05,
"loss": 0.4036,
"step": 80
},
{
"epoch": 2.009298721425804,
"grad_norm": 0.43867069482803345,
"learning_rate": 2.9999999999999997e-05,
"loss": 0.6494,
"step": 81
}
],
"logging_steps": 1,
"max_steps": 81,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.9339388490004562e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}