{
"best_metric": 2.227423667907715,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 0.01278935925310142,
"eval_steps": 50,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001278935925310142,
"grad_norm": 0.6531283259391785,
"learning_rate": 1.6666666666666668e-07,
"loss": 1.572,
"step": 1
},
{
"epoch": 0.0001278935925310142,
"eval_loss": 2.3208112716674805,
"eval_runtime": 141.1671,
"eval_samples_per_second": 23.32,
"eval_steps_per_second": 2.919,
"step": 1
},
{
"epoch": 0.0002557871850620284,
"grad_norm": 0.7421135902404785,
"learning_rate": 3.3333333333333335e-07,
"loss": 1.611,
"step": 2
},
{
"epoch": 0.0003836807775930426,
"grad_norm": 0.8427505493164062,
"learning_rate": 5.000000000000001e-07,
"loss": 1.7915,
"step": 3
},
{
"epoch": 0.0005115743701240568,
"grad_norm": 0.9320747256278992,
"learning_rate": 6.666666666666667e-07,
"loss": 1.8287,
"step": 4
},
{
"epoch": 0.000639467962655071,
"grad_norm": 0.9002219438552856,
"learning_rate": 8.333333333333333e-07,
"loss": 1.7165,
"step": 5
},
{
"epoch": 0.0007673615551860852,
"grad_norm": 0.9916430711746216,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.9549,
"step": 6
},
{
"epoch": 0.0008952551477170994,
"grad_norm": 1.2199214696884155,
"learning_rate": 1.1666666666666668e-06,
"loss": 1.8518,
"step": 7
},
{
"epoch": 0.0010231487402481137,
"grad_norm": 1.0491547584533691,
"learning_rate": 1.3333333333333334e-06,
"loss": 2.0224,
"step": 8
},
{
"epoch": 0.0011510423327791277,
"grad_norm": 0.9636436700820923,
"learning_rate": 1.5e-06,
"loss": 1.7608,
"step": 9
},
{
"epoch": 0.001278935925310142,
"grad_norm": 1.075803279876709,
"learning_rate": 1.6666666666666667e-06,
"loss": 2.024,
"step": 10
},
{
"epoch": 0.001406829517841156,
"grad_norm": 1.1795305013656616,
"learning_rate": 1.8333333333333333e-06,
"loss": 1.838,
"step": 11
},
{
"epoch": 0.0015347231103721704,
"grad_norm": 1.0322589874267578,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.1333,
"step": 12
},
{
"epoch": 0.0016626167029031844,
"grad_norm": 1.224502682685852,
"learning_rate": 2.166666666666667e-06,
"loss": 2.028,
"step": 13
},
{
"epoch": 0.0017905102954341987,
"grad_norm": 1.1498585939407349,
"learning_rate": 2.3333333333333336e-06,
"loss": 2.0027,
"step": 14
},
{
"epoch": 0.001918403887965213,
"grad_norm": 1.244385838508606,
"learning_rate": 2.5e-06,
"loss": 1.9931,
"step": 15
},
{
"epoch": 0.0020462974804962273,
"grad_norm": 1.036781668663025,
"learning_rate": 2.666666666666667e-06,
"loss": 1.7908,
"step": 16
},
{
"epoch": 0.002174191073027241,
"grad_norm": 1.1222697496414185,
"learning_rate": 2.8333333333333335e-06,
"loss": 2.1778,
"step": 17
},
{
"epoch": 0.0023020846655582555,
"grad_norm": 1.145642876625061,
"learning_rate": 3e-06,
"loss": 2.0192,
"step": 18
},
{
"epoch": 0.0024299782580892697,
"grad_norm": 1.0589532852172852,
"learning_rate": 3.1666666666666667e-06,
"loss": 2.1066,
"step": 19
},
{
"epoch": 0.002557871850620284,
"grad_norm": 1.317820429801941,
"learning_rate": 3.3333333333333333e-06,
"loss": 2.0042,
"step": 20
},
{
"epoch": 0.0026857654431512983,
"grad_norm": 1.223871111869812,
"learning_rate": 3.5e-06,
"loss": 2.012,
"step": 21
},
{
"epoch": 0.002813659035682312,
"grad_norm": 1.3080352544784546,
"learning_rate": 3.6666666666666666e-06,
"loss": 2.0961,
"step": 22
},
{
"epoch": 0.0029415526282133265,
"grad_norm": 1.4565980434417725,
"learning_rate": 3.833333333333334e-06,
"loss": 2.0999,
"step": 23
},
{
"epoch": 0.0030694462207443408,
"grad_norm": 1.2557575702667236,
"learning_rate": 4.000000000000001e-06,
"loss": 2.1239,
"step": 24
},
{
"epoch": 0.003197339813275355,
"grad_norm": 1.3740392923355103,
"learning_rate": 4.166666666666667e-06,
"loss": 2.354,
"step": 25
},
{
"epoch": 0.003325233405806369,
"grad_norm": 1.4395512342453003,
"learning_rate": 4.333333333333334e-06,
"loss": 2.6028,
"step": 26
},
{
"epoch": 0.003453126998337383,
"grad_norm": 1.3880198001861572,
"learning_rate": 4.5e-06,
"loss": 2.1235,
"step": 27
},
{
"epoch": 0.0035810205908683975,
"grad_norm": 1.6280770301818848,
"learning_rate": 4.666666666666667e-06,
"loss": 2.2612,
"step": 28
},
{
"epoch": 0.0037089141833994118,
"grad_norm": 1.5140023231506348,
"learning_rate": 4.833333333333333e-06,
"loss": 2.2578,
"step": 29
},
{
"epoch": 0.003836807775930426,
"grad_norm": 1.3038387298583984,
"learning_rate": 5e-06,
"loss": 2.0897,
"step": 30
},
{
"epoch": 0.00396470136846144,
"grad_norm": 1.3410831689834595,
"learning_rate": 4.997482666353287e-06,
"loss": 2.112,
"step": 31
},
{
"epoch": 0.004092594960992455,
"grad_norm": 1.4958971738815308,
"learning_rate": 4.989935734988098e-06,
"loss": 2.4475,
"step": 32
},
{
"epoch": 0.004220488553523468,
"grad_norm": 1.638153076171875,
"learning_rate": 4.977374404419838e-06,
"loss": 2.4451,
"step": 33
},
{
"epoch": 0.004348382146054482,
"grad_norm": 1.6533570289611816,
"learning_rate": 4.959823971496575e-06,
"loss": 2.5101,
"step": 34
},
{
"epoch": 0.004476275738585497,
"grad_norm": 1.6795432567596436,
"learning_rate": 4.937319780454559e-06,
"loss": 2.2065,
"step": 35
},
{
"epoch": 0.004604169331116511,
"grad_norm": 1.6957430839538574,
"learning_rate": 4.909907151739634e-06,
"loss": 2.6026,
"step": 36
},
{
"epoch": 0.004732062923647525,
"grad_norm": 1.6112371683120728,
"learning_rate": 4.8776412907378845e-06,
"loss": 2.5477,
"step": 37
},
{
"epoch": 0.0048599565161785395,
"grad_norm": 1.9008305072784424,
"learning_rate": 4.8405871765993435e-06,
"loss": 2.5082,
"step": 38
},
{
"epoch": 0.004987850108709554,
"grad_norm": 1.69205904006958,
"learning_rate": 4.7988194313786275e-06,
"loss": 2.6694,
"step": 39
},
{
"epoch": 0.005115743701240568,
"grad_norm": 1.9658174514770508,
"learning_rate": 4.752422169756048e-06,
"loss": 3.1494,
"step": 40
},
{
"epoch": 0.005243637293771582,
"grad_norm": 1.7338308095932007,
"learning_rate": 4.701488829641845e-06,
"loss": 2.5999,
"step": 41
},
{
"epoch": 0.005371530886302597,
"grad_norm": 1.9238468408584595,
"learning_rate": 4.646121984004666e-06,
"loss": 2.7518,
"step": 42
},
{
"epoch": 0.00549942447883361,
"grad_norm": 1.9128526449203491,
"learning_rate": 4.586433134303257e-06,
"loss": 2.4259,
"step": 43
},
{
"epoch": 0.005627318071364624,
"grad_norm": 1.9675108194351196,
"learning_rate": 4.522542485937369e-06,
"loss": 2.5403,
"step": 44
},
{
"epoch": 0.005755211663895639,
"grad_norm": 2.1037259101867676,
"learning_rate": 4.454578706170075e-06,
"loss": 3.0036,
"step": 45
},
{
"epoch": 0.005883105256426653,
"grad_norm": 2.1286771297454834,
"learning_rate": 4.382678665009028e-06,
"loss": 2.9692,
"step": 46
},
{
"epoch": 0.006010998848957667,
"grad_norm": 2.4412331581115723,
"learning_rate": 4.3069871595684795e-06,
"loss": 3.1744,
"step": 47
},
{
"epoch": 0.0061388924414886815,
"grad_norm": 2.340425491333008,
"learning_rate": 4.227656622467162e-06,
"loss": 3.1443,
"step": 48
},
{
"epoch": 0.006266786034019696,
"grad_norm": 2.878222703933716,
"learning_rate": 4.144846814849282e-06,
"loss": 3.6609,
"step": 49
},
{
"epoch": 0.00639467962655071,
"grad_norm": 4.124127388000488,
"learning_rate": 4.058724504646834e-06,
"loss": 4.5455,
"step": 50
},
{
"epoch": 0.00639467962655071,
"eval_loss": 2.273921012878418,
"eval_runtime": 141.3295,
"eval_samples_per_second": 23.293,
"eval_steps_per_second": 2.915,
"step": 50
},
{
"epoch": 0.006522573219081724,
"grad_norm": 0.824315071105957,
"learning_rate": 3.969463130731183e-06,
"loss": 1.7344,
"step": 51
},
{
"epoch": 0.006650466811612738,
"grad_norm": 0.8575537204742432,
"learning_rate": 3.8772424536302565e-06,
"loss": 1.6085,
"step": 52
},
{
"epoch": 0.006778360404143752,
"grad_norm": 1.030655860900879,
"learning_rate": 3.782248193514766e-06,
"loss": 1.6915,
"step": 53
},
{
"epoch": 0.006906253996674766,
"grad_norm": 0.9780040979385376,
"learning_rate": 3.684671656182497e-06,
"loss": 1.8391,
"step": 54
},
{
"epoch": 0.007034147589205781,
"grad_norm": 1.075355887413025,
"learning_rate": 3.5847093477938955e-06,
"loss": 1.7932,
"step": 55
},
{
"epoch": 0.007162041181736795,
"grad_norm": 1.0918196439743042,
"learning_rate": 3.4825625791348093e-06,
"loss": 1.7666,
"step": 56
},
{
"epoch": 0.007289934774267809,
"grad_norm": 1.0339546203613281,
"learning_rate": 3.3784370602033572e-06,
"loss": 1.8442,
"step": 57
},
{
"epoch": 0.0074178283667988235,
"grad_norm": 1.178619384765625,
"learning_rate": 3.272542485937369e-06,
"loss": 1.9234,
"step": 58
},
{
"epoch": 0.007545721959329838,
"grad_norm": 1.1075314283370972,
"learning_rate": 3.165092113916688e-06,
"loss": 1.9327,
"step": 59
},
{
"epoch": 0.007673615551860852,
"grad_norm": 1.2657276391983032,
"learning_rate": 3.056302334890786e-06,
"loss": 1.8388,
"step": 60
},
{
"epoch": 0.007801509144391866,
"grad_norm": 1.1688446998596191,
"learning_rate": 2.946392236996592e-06,
"loss": 1.7806,
"step": 61
},
{
"epoch": 0.00792940273692288,
"grad_norm": 1.1615688800811768,
"learning_rate": 2.835583164544139e-06,
"loss": 1.9443,
"step": 62
},
{
"epoch": 0.008057296329453895,
"grad_norm": 1.2162307500839233,
"learning_rate": 2.724098272258584e-06,
"loss": 1.7797,
"step": 63
},
{
"epoch": 0.00818518992198491,
"grad_norm": 1.4072235822677612,
"learning_rate": 2.6121620758762877e-06,
"loss": 1.7035,
"step": 64
},
{
"epoch": 0.008313083514515924,
"grad_norm": 1.484116792678833,
"learning_rate": 2.5e-06,
"loss": 2.0504,
"step": 65
},
{
"epoch": 0.008440977107046936,
"grad_norm": 1.3416109085083008,
"learning_rate": 2.3878379241237136e-06,
"loss": 1.9764,
"step": 66
},
{
"epoch": 0.00856887069957795,
"grad_norm": 1.3301259279251099,
"learning_rate": 2.2759017277414165e-06,
"loss": 1.951,
"step": 67
},
{
"epoch": 0.008696764292108965,
"grad_norm": 1.3805782794952393,
"learning_rate": 2.1644168354558623e-06,
"loss": 2.0396,
"step": 68
},
{
"epoch": 0.008824657884639979,
"grad_norm": 1.2760380506515503,
"learning_rate": 2.053607763003409e-06,
"loss": 1.9385,
"step": 69
},
{
"epoch": 0.008952551477170993,
"grad_norm": 1.4715478420257568,
"learning_rate": 1.9436976651092143e-06,
"loss": 2.2723,
"step": 70
},
{
"epoch": 0.009080445069702008,
"grad_norm": 1.3770544528961182,
"learning_rate": 1.8349078860833125e-06,
"loss": 1.888,
"step": 71
},
{
"epoch": 0.009208338662233022,
"grad_norm": 1.2823768854141235,
"learning_rate": 1.7274575140626318e-06,
"loss": 2.0006,
"step": 72
},
{
"epoch": 0.009336232254764036,
"grad_norm": 1.366890788078308,
"learning_rate": 1.6215629397966432e-06,
"loss": 1.8851,
"step": 73
},
{
"epoch": 0.00946412584729505,
"grad_norm": 1.4455935955047607,
"learning_rate": 1.5174374208651913e-06,
"loss": 2.0491,
"step": 74
},
{
"epoch": 0.009592019439826065,
"grad_norm": 1.420132040977478,
"learning_rate": 1.415290652206105e-06,
"loss": 1.9303,
"step": 75
},
{
"epoch": 0.009719913032357079,
"grad_norm": 1.3549017906188965,
"learning_rate": 1.3153283438175036e-06,
"loss": 1.97,
"step": 76
},
{
"epoch": 0.009847806624888093,
"grad_norm": 1.558439016342163,
"learning_rate": 1.217751806485235e-06,
"loss": 2.4078,
"step": 77
},
{
"epoch": 0.009975700217419108,
"grad_norm": 1.561328649520874,
"learning_rate": 1.122757546369744e-06,
"loss": 2.0117,
"step": 78
},
{
"epoch": 0.010103593809950122,
"grad_norm": 1.3918482065200806,
"learning_rate": 1.0305368692688175e-06,
"loss": 2.0692,
"step": 79
},
{
"epoch": 0.010231487402481136,
"grad_norm": 1.4925870895385742,
"learning_rate": 9.412754953531664e-07,
"loss": 2.1758,
"step": 80
},
{
"epoch": 0.01035938099501215,
"grad_norm": 1.8017629384994507,
"learning_rate": 8.551531851507186e-07,
"loss": 2.1219,
"step": 81
},
{
"epoch": 0.010487274587543165,
"grad_norm": 1.533422827720642,
"learning_rate": 7.723433775328385e-07,
"loss": 2.0952,
"step": 82
},
{
"epoch": 0.010615168180074179,
"grad_norm": 1.8227595090866089,
"learning_rate": 6.930128404315214e-07,
"loss": 2.258,
"step": 83
},
{
"epoch": 0.010743061772605193,
"grad_norm": 1.8445152044296265,
"learning_rate": 6.17321334990973e-07,
"loss": 2.3289,
"step": 84
},
{
"epoch": 0.010870955365136206,
"grad_norm": 1.7444809675216675,
"learning_rate": 5.454212938299256e-07,
"loss": 2.2124,
"step": 85
},
{
"epoch": 0.01099884895766722,
"grad_norm": 1.7502052783966064,
"learning_rate": 4.774575140626317e-07,
"loss": 2.2203,
"step": 86
},
{
"epoch": 0.011126742550198234,
"grad_norm": 1.734458327293396,
"learning_rate": 4.1356686569674344e-07,
"loss": 2.5345,
"step": 87
},
{
"epoch": 0.011254636142729249,
"grad_norm": 2.3257081508636475,
"learning_rate": 3.538780159953348e-07,
"loss": 2.403,
"step": 88
},
{
"epoch": 0.011382529735260263,
"grad_norm": 1.9918384552001953,
"learning_rate": 2.98511170358155e-07,
"loss": 2.5831,
"step": 89
},
{
"epoch": 0.011510423327791277,
"grad_norm": 2.127302885055542,
"learning_rate": 2.4757783024395244e-07,
"loss": 2.9882,
"step": 90
},
{
"epoch": 0.011638316920322292,
"grad_norm": 2.2728164196014404,
"learning_rate": 2.0118056862137358e-07,
"loss": 2.6982,
"step": 91
},
{
"epoch": 0.011766210512853306,
"grad_norm": 2.0041537284851074,
"learning_rate": 1.59412823400657e-07,
"loss": 2.5671,
"step": 92
},
{
"epoch": 0.01189410410538432,
"grad_norm": 2.0824832916259766,
"learning_rate": 1.223587092621162e-07,
"loss": 2.5581,
"step": 93
},
{
"epoch": 0.012021997697915334,
"grad_norm": 2.170213460922241,
"learning_rate": 9.00928482603669e-08,
"loss": 2.4461,
"step": 94
},
{
"epoch": 0.012149891290446349,
"grad_norm": 2.363558769226074,
"learning_rate": 6.268021954544095e-08,
"loss": 2.8194,
"step": 95
},
{
"epoch": 0.012277784882977363,
"grad_norm": 2.4575698375701904,
"learning_rate": 4.017602850342584e-08,
"loss": 3.1582,
"step": 96
},
{
"epoch": 0.012405678475508377,
"grad_norm": 2.772186517715454,
"learning_rate": 2.262559558016325e-08,
"loss": 3.2808,
"step": 97
},
{
"epoch": 0.012533572068039392,
"grad_norm": 3.1454925537109375,
"learning_rate": 1.006426501190233e-08,
"loss": 3.2904,
"step": 98
},
{
"epoch": 0.012661465660570406,
"grad_norm": 2.797976493835449,
"learning_rate": 2.5173336467135266e-09,
"loss": 3.0262,
"step": 99
},
{
"epoch": 0.01278935925310142,
"grad_norm": 3.6094884872436523,
"learning_rate": 0.0,
"loss": 4.3375,
"step": 100
},
{
"epoch": 0.01278935925310142,
"eval_loss": 2.227423667907715,
"eval_runtime": 141.3088,
"eval_samples_per_second": 23.296,
"eval_steps_per_second": 2.916,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5971447382016000.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}