{ "best_metric": 2.227423667907715, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.01278935925310142, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001278935925310142, "grad_norm": 0.6531283259391785, "learning_rate": 1.6666666666666668e-07, "loss": 1.572, "step": 1 }, { "epoch": 0.0001278935925310142, "eval_loss": 2.3208112716674805, "eval_runtime": 141.1671, "eval_samples_per_second": 23.32, "eval_steps_per_second": 2.919, "step": 1 }, { "epoch": 0.0002557871850620284, "grad_norm": 0.7421135902404785, "learning_rate": 3.3333333333333335e-07, "loss": 1.611, "step": 2 }, { "epoch": 0.0003836807775930426, "grad_norm": 0.8427505493164062, "learning_rate": 5.000000000000001e-07, "loss": 1.7915, "step": 3 }, { "epoch": 0.0005115743701240568, "grad_norm": 0.9320747256278992, "learning_rate": 6.666666666666667e-07, "loss": 1.8287, "step": 4 }, { "epoch": 0.000639467962655071, "grad_norm": 0.9002219438552856, "learning_rate": 8.333333333333333e-07, "loss": 1.7165, "step": 5 }, { "epoch": 0.0007673615551860852, "grad_norm": 0.9916430711746216, "learning_rate": 1.0000000000000002e-06, "loss": 1.9549, "step": 6 }, { "epoch": 0.0008952551477170994, "grad_norm": 1.2199214696884155, "learning_rate": 1.1666666666666668e-06, "loss": 1.8518, "step": 7 }, { "epoch": 0.0010231487402481137, "grad_norm": 1.0491547584533691, "learning_rate": 1.3333333333333334e-06, "loss": 2.0224, "step": 8 }, { "epoch": 0.0011510423327791277, "grad_norm": 0.9636436700820923, "learning_rate": 1.5e-06, "loss": 1.7608, "step": 9 }, { "epoch": 0.001278935925310142, "grad_norm": 1.075803279876709, "learning_rate": 1.6666666666666667e-06, "loss": 2.024, "step": 10 }, { "epoch": 0.001406829517841156, "grad_norm": 1.1795305013656616, "learning_rate": 1.8333333333333333e-06, "loss": 1.838, "step": 11 }, { "epoch": 0.0015347231103721704, "grad_norm": 1.0322589874267578, "learning_rate": 2.0000000000000003e-06, "loss": 2.1333, "step": 12 }, { "epoch": 0.0016626167029031844, "grad_norm": 1.224502682685852, "learning_rate": 2.166666666666667e-06, "loss": 2.028, "step": 13 }, { "epoch": 0.0017905102954341987, "grad_norm": 1.1498585939407349, "learning_rate": 2.3333333333333336e-06, "loss": 2.0027, "step": 14 }, { "epoch": 0.001918403887965213, "grad_norm": 1.244385838508606, "learning_rate": 2.5e-06, "loss": 1.9931, "step": 15 }, { "epoch": 0.0020462974804962273, "grad_norm": 1.036781668663025, "learning_rate": 2.666666666666667e-06, "loss": 1.7908, "step": 16 }, { "epoch": 0.002174191073027241, "grad_norm": 1.1222697496414185, "learning_rate": 2.8333333333333335e-06, "loss": 2.1778, "step": 17 }, { "epoch": 0.0023020846655582555, "grad_norm": 1.145642876625061, "learning_rate": 3e-06, "loss": 2.0192, "step": 18 }, { "epoch": 0.0024299782580892697, "grad_norm": 1.0589532852172852, "learning_rate": 3.1666666666666667e-06, "loss": 2.1066, "step": 19 }, { "epoch": 0.002557871850620284, "grad_norm": 1.317820429801941, "learning_rate": 3.3333333333333333e-06, "loss": 2.0042, "step": 20 }, { "epoch": 0.0026857654431512983, "grad_norm": 1.223871111869812, "learning_rate": 3.5e-06, "loss": 2.012, "step": 21 }, { "epoch": 0.002813659035682312, "grad_norm": 1.3080352544784546, "learning_rate": 3.6666666666666666e-06, "loss": 2.0961, "step": 22 }, { "epoch": 0.0029415526282133265, "grad_norm": 1.4565980434417725, "learning_rate": 3.833333333333334e-06, "loss": 2.0999, "step": 23 }, { "epoch": 0.0030694462207443408, "grad_norm": 1.2557575702667236, "learning_rate": 4.000000000000001e-06, "loss": 2.1239, "step": 24 }, { "epoch": 0.003197339813275355, "grad_norm": 1.3740392923355103, "learning_rate": 4.166666666666667e-06, "loss": 2.354, "step": 25 }, { "epoch": 0.003325233405806369, "grad_norm": 1.4395512342453003, "learning_rate": 4.333333333333334e-06, "loss": 2.6028, "step": 26 }, { "epoch": 0.003453126998337383, "grad_norm": 1.3880198001861572, "learning_rate": 4.5e-06, "loss": 2.1235, "step": 27 }, { "epoch": 0.0035810205908683975, "grad_norm": 1.6280770301818848, "learning_rate": 4.666666666666667e-06, "loss": 2.2612, "step": 28 }, { "epoch": 0.0037089141833994118, "grad_norm": 1.5140023231506348, "learning_rate": 4.833333333333333e-06, "loss": 2.2578, "step": 29 }, { "epoch": 0.003836807775930426, "grad_norm": 1.3038387298583984, "learning_rate": 5e-06, "loss": 2.0897, "step": 30 }, { "epoch": 0.00396470136846144, "grad_norm": 1.3410831689834595, "learning_rate": 4.997482666353287e-06, "loss": 2.112, "step": 31 }, { "epoch": 0.004092594960992455, "grad_norm": 1.4958971738815308, "learning_rate": 4.989935734988098e-06, "loss": 2.4475, "step": 32 }, { "epoch": 0.004220488553523468, "grad_norm": 1.638153076171875, "learning_rate": 4.977374404419838e-06, "loss": 2.4451, "step": 33 }, { "epoch": 0.004348382146054482, "grad_norm": 1.6533570289611816, "learning_rate": 4.959823971496575e-06, "loss": 2.5101, "step": 34 }, { "epoch": 0.004476275738585497, "grad_norm": 1.6795432567596436, "learning_rate": 4.937319780454559e-06, "loss": 2.2065, "step": 35 }, { "epoch": 0.004604169331116511, "grad_norm": 1.6957430839538574, "learning_rate": 4.909907151739634e-06, "loss": 2.6026, "step": 36 }, { "epoch": 0.004732062923647525, "grad_norm": 1.6112371683120728, "learning_rate": 4.8776412907378845e-06, "loss": 2.5477, "step": 37 }, { "epoch": 0.0048599565161785395, "grad_norm": 1.9008305072784424, "learning_rate": 4.8405871765993435e-06, "loss": 2.5082, "step": 38 }, { "epoch": 0.004987850108709554, "grad_norm": 1.69205904006958, "learning_rate": 4.7988194313786275e-06, "loss": 2.6694, "step": 39 }, { "epoch": 0.005115743701240568, "grad_norm": 1.9658174514770508, "learning_rate": 4.752422169756048e-06, "loss": 3.1494, "step": 40 }, { "epoch": 0.005243637293771582, "grad_norm": 1.7338308095932007, "learning_rate": 4.701488829641845e-06, "loss": 2.5999, "step": 41 }, { "epoch": 0.005371530886302597, "grad_norm": 1.9238468408584595, "learning_rate": 4.646121984004666e-06, "loss": 2.7518, "step": 42 }, { "epoch": 0.00549942447883361, "grad_norm": 1.9128526449203491, "learning_rate": 4.586433134303257e-06, "loss": 2.4259, "step": 43 }, { "epoch": 0.005627318071364624, "grad_norm": 1.9675108194351196, "learning_rate": 4.522542485937369e-06, "loss": 2.5403, "step": 44 }, { "epoch": 0.005755211663895639, "grad_norm": 2.1037259101867676, "learning_rate": 4.454578706170075e-06, "loss": 3.0036, "step": 45 }, { "epoch": 0.005883105256426653, "grad_norm": 2.1286771297454834, "learning_rate": 4.382678665009028e-06, "loss": 2.9692, "step": 46 }, { "epoch": 0.006010998848957667, "grad_norm": 2.4412331581115723, "learning_rate": 4.3069871595684795e-06, "loss": 3.1744, "step": 47 }, { "epoch": 0.0061388924414886815, "grad_norm": 2.340425491333008, "learning_rate": 4.227656622467162e-06, "loss": 3.1443, "step": 48 }, { "epoch": 0.006266786034019696, "grad_norm": 2.878222703933716, "learning_rate": 4.144846814849282e-06, "loss": 3.6609, "step": 49 }, { "epoch": 0.00639467962655071, "grad_norm": 4.124127388000488, "learning_rate": 4.058724504646834e-06, "loss": 4.5455, "step": 50 }, { "epoch": 0.00639467962655071, "eval_loss": 2.273921012878418, "eval_runtime": 141.3295, "eval_samples_per_second": 23.293, "eval_steps_per_second": 2.915, "step": 50 }, { "epoch": 0.006522573219081724, "grad_norm": 0.824315071105957, "learning_rate": 3.969463130731183e-06, "loss": 1.7344, "step": 51 }, { "epoch": 0.006650466811612738, "grad_norm": 0.8575537204742432, "learning_rate": 3.8772424536302565e-06, "loss": 1.6085, "step": 52 }, { "epoch": 0.006778360404143752, "grad_norm": 1.030655860900879, "learning_rate": 3.782248193514766e-06, "loss": 1.6915, "step": 53 }, { "epoch": 0.006906253996674766, "grad_norm": 0.9780040979385376, "learning_rate": 3.684671656182497e-06, "loss": 1.8391, "step": 54 }, { "epoch": 0.007034147589205781, "grad_norm": 1.075355887413025, "learning_rate": 3.5847093477938955e-06, "loss": 1.7932, "step": 55 }, { "epoch": 0.007162041181736795, "grad_norm": 1.0918196439743042, "learning_rate": 3.4825625791348093e-06, "loss": 1.7666, "step": 56 }, { "epoch": 0.007289934774267809, "grad_norm": 1.0339546203613281, "learning_rate": 3.3784370602033572e-06, "loss": 1.8442, "step": 57 }, { "epoch": 0.0074178283667988235, "grad_norm": 1.178619384765625, "learning_rate": 3.272542485937369e-06, "loss": 1.9234, "step": 58 }, { "epoch": 0.007545721959329838, "grad_norm": 1.1075314283370972, "learning_rate": 3.165092113916688e-06, "loss": 1.9327, "step": 59 }, { "epoch": 0.007673615551860852, "grad_norm": 1.2657276391983032, "learning_rate": 3.056302334890786e-06, "loss": 1.8388, "step": 60 }, { "epoch": 0.007801509144391866, "grad_norm": 1.1688446998596191, "learning_rate": 2.946392236996592e-06, "loss": 1.7806, "step": 61 }, { "epoch": 0.00792940273692288, "grad_norm": 1.1615688800811768, "learning_rate": 2.835583164544139e-06, "loss": 1.9443, "step": 62 }, { "epoch": 0.008057296329453895, "grad_norm": 1.2162307500839233, "learning_rate": 2.724098272258584e-06, "loss": 1.7797, "step": 63 }, { "epoch": 0.00818518992198491, "grad_norm": 1.4072235822677612, "learning_rate": 2.6121620758762877e-06, "loss": 1.7035, "step": 64 }, { "epoch": 0.008313083514515924, "grad_norm": 1.484116792678833, "learning_rate": 2.5e-06, "loss": 2.0504, "step": 65 }, { "epoch": 0.008440977107046936, "grad_norm": 1.3416109085083008, "learning_rate": 2.3878379241237136e-06, "loss": 1.9764, "step": 66 }, { "epoch": 0.00856887069957795, "grad_norm": 1.3301259279251099, "learning_rate": 2.2759017277414165e-06, "loss": 1.951, "step": 67 }, { "epoch": 0.008696764292108965, "grad_norm": 1.3805782794952393, "learning_rate": 2.1644168354558623e-06, "loss": 2.0396, "step": 68 }, { "epoch": 0.008824657884639979, "grad_norm": 1.2760380506515503, "learning_rate": 2.053607763003409e-06, "loss": 1.9385, "step": 69 }, { "epoch": 0.008952551477170993, "grad_norm": 1.4715478420257568, "learning_rate": 1.9436976651092143e-06, "loss": 2.2723, "step": 70 }, { "epoch": 0.009080445069702008, "grad_norm": 1.3770544528961182, "learning_rate": 1.8349078860833125e-06, "loss": 1.888, "step": 71 }, { "epoch": 0.009208338662233022, "grad_norm": 1.2823768854141235, "learning_rate": 1.7274575140626318e-06, "loss": 2.0006, "step": 72 }, { "epoch": 0.009336232254764036, "grad_norm": 1.366890788078308, "learning_rate": 1.6215629397966432e-06, "loss": 1.8851, "step": 73 }, { "epoch": 0.00946412584729505, "grad_norm": 1.4455935955047607, "learning_rate": 1.5174374208651913e-06, "loss": 2.0491, "step": 74 }, { "epoch": 0.009592019439826065, "grad_norm": 1.420132040977478, "learning_rate": 1.415290652206105e-06, "loss": 1.9303, "step": 75 }, { "epoch": 0.009719913032357079, "grad_norm": 1.3549017906188965, "learning_rate": 1.3153283438175036e-06, "loss": 1.97, "step": 76 }, { "epoch": 0.009847806624888093, "grad_norm": 1.558439016342163, "learning_rate": 1.217751806485235e-06, "loss": 2.4078, "step": 77 }, { "epoch": 0.009975700217419108, "grad_norm": 1.561328649520874, "learning_rate": 1.122757546369744e-06, "loss": 2.0117, "step": 78 }, { "epoch": 0.010103593809950122, "grad_norm": 1.3918482065200806, "learning_rate": 1.0305368692688175e-06, "loss": 2.0692, "step": 79 }, { "epoch": 0.010231487402481136, "grad_norm": 1.4925870895385742, "learning_rate": 9.412754953531664e-07, "loss": 2.1758, "step": 80 }, { "epoch": 0.01035938099501215, "grad_norm": 1.8017629384994507, "learning_rate": 8.551531851507186e-07, "loss": 2.1219, "step": 81 }, { "epoch": 0.010487274587543165, "grad_norm": 1.533422827720642, "learning_rate": 7.723433775328385e-07, "loss": 2.0952, "step": 82 }, { "epoch": 0.010615168180074179, "grad_norm": 1.8227595090866089, "learning_rate": 6.930128404315214e-07, "loss": 2.258, "step": 83 }, { "epoch": 0.010743061772605193, "grad_norm": 1.8445152044296265, "learning_rate": 6.17321334990973e-07, "loss": 2.3289, "step": 84 }, { "epoch": 0.010870955365136206, "grad_norm": 1.7444809675216675, "learning_rate": 5.454212938299256e-07, "loss": 2.2124, "step": 85 }, { "epoch": 0.01099884895766722, "grad_norm": 1.7502052783966064, "learning_rate": 4.774575140626317e-07, "loss": 2.2203, "step": 86 }, { "epoch": 0.011126742550198234, "grad_norm": 1.734458327293396, "learning_rate": 4.1356686569674344e-07, "loss": 2.5345, "step": 87 }, { "epoch": 0.011254636142729249, "grad_norm": 2.3257081508636475, "learning_rate": 3.538780159953348e-07, "loss": 2.403, "step": 88 }, { "epoch": 0.011382529735260263, "grad_norm": 1.9918384552001953, "learning_rate": 2.98511170358155e-07, "loss": 2.5831, "step": 89 }, { "epoch": 0.011510423327791277, "grad_norm": 2.127302885055542, "learning_rate": 2.4757783024395244e-07, "loss": 2.9882, "step": 90 }, { "epoch": 0.011638316920322292, "grad_norm": 2.2728164196014404, "learning_rate": 2.0118056862137358e-07, "loss": 2.6982, "step": 91 }, { "epoch": 0.011766210512853306, "grad_norm": 2.0041537284851074, "learning_rate": 1.59412823400657e-07, "loss": 2.5671, "step": 92 }, { "epoch": 0.01189410410538432, "grad_norm": 2.0824832916259766, "learning_rate": 1.223587092621162e-07, "loss": 2.5581, "step": 93 }, { "epoch": 0.012021997697915334, "grad_norm": 2.170213460922241, "learning_rate": 9.00928482603669e-08, "loss": 2.4461, "step": 94 }, { "epoch": 0.012149891290446349, "grad_norm": 2.363558769226074, "learning_rate": 6.268021954544095e-08, "loss": 2.8194, "step": 95 }, { "epoch": 0.012277784882977363, "grad_norm": 2.4575698375701904, "learning_rate": 4.017602850342584e-08, "loss": 3.1582, "step": 96 }, { "epoch": 0.012405678475508377, "grad_norm": 2.772186517715454, "learning_rate": 2.262559558016325e-08, "loss": 3.2808, "step": 97 }, { "epoch": 0.012533572068039392, "grad_norm": 3.1454925537109375, "learning_rate": 1.006426501190233e-08, "loss": 3.2904, "step": 98 }, { "epoch": 0.012661465660570406, "grad_norm": 2.797976493835449, "learning_rate": 2.5173336467135266e-09, "loss": 3.0262, "step": 99 }, { "epoch": 0.01278935925310142, "grad_norm": 3.6094884872436523, "learning_rate": 0.0, "loss": 4.3375, "step": 100 }, { "epoch": 0.01278935925310142, "eval_loss": 2.227423667907715, "eval_runtime": 141.3088, "eval_samples_per_second": 23.296, "eval_steps_per_second": 2.916, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5971447382016000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }