diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8281 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1353055842219929, + "eval_steps": 500, + "global_step": 1180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 6.7202, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 6.7739, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 6.7015, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 24.833694217574944, + "learning_rate": 3.816793893129771e-06, + "loss": 6.9159, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 24.089403116981405, + "learning_rate": 7.633587786259541e-06, + "loss": 6.8581, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 24.089403116981405, + "learning_rate": 7.633587786259541e-06, + "loss": 6.5905, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 30.730797406977864, + "learning_rate": 1.1450381679389314e-05, + "loss": 6.9473, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 39.80648644878442, + "learning_rate": 1.5267175572519083e-05, + "loss": 6.8672, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 58.83195470368333, + "learning_rate": 1.9083969465648855e-05, + "loss": 6.8771, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 37.76609702390963, + "learning_rate": 2.2900763358778628e-05, + "loss": 6.8123, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 16.80166973405484, + "learning_rate": 2.6717557251908397e-05, + "loss": 6.4488, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 25.54197945689145, + "learning_rate": 3.0534351145038166e-05, + "loss": 6.5151, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 39.55278632647528, + "learning_rate": 3.435114503816794e-05, + "loss": 6.5851, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 30.02274564294928, + "learning_rate": 3.816793893129771e-05, + "loss": 6.5235, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 26.152574346690436, + "learning_rate": 4.198473282442748e-05, + "loss": 6.4511, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 34.33162883163699, + "learning_rate": 4.5801526717557256e-05, + "loss": 6.3687, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 17.80393923246746, + "learning_rate": 4.9618320610687025e-05, + "loss": 6.4149, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 15.25400804429858, + "learning_rate": 5.3435114503816794e-05, + "loss": 6.0852, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 13.035795814576668, + "learning_rate": 5.725190839694656e-05, + "loss": 6.1313, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 14.788487589977752, + "learning_rate": 6.106870229007633e-05, + "loss": 6.0491, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 15.87894494754325, + "learning_rate": 6.488549618320611e-05, + "loss": 6.0371, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 19.93334663657767, + "learning_rate": 6.870229007633588e-05, + "loss": 5.9827, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 10.586689666709068, + "learning_rate": 7.251908396946565e-05, + "loss": 5.9068, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 10.586689666709068, + "learning_rate": 7.251908396946565e-05, + "loss": 5.8078, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 11.009282905400905, + "learning_rate": 7.633587786259542e-05, + "loss": 6.0599, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 103.37611984633007, + "learning_rate": 8.015267175572518e-05, + "loss": 6.0033, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 9.093408232269466, + "learning_rate": 8.396946564885496e-05, + "loss": 5.9485, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 13.409013384441376, + "learning_rate": 8.778625954198472e-05, + "loss": 5.747, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 20.723480886920953, + "learning_rate": 9.160305343511451e-05, + "loss": 5.9794, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 27.227889622861333, + "learning_rate": 9.541984732824429e-05, + "loss": 5.9006, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 26.139568621344836, + "learning_rate": 9.923664122137405e-05, + "loss": 5.5044, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 14.752230119326473, + "learning_rate": 0.00010305343511450383, + "loss": 5.4819, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 13.441487864353205, + "learning_rate": 0.00010687022900763359, + "loss": 5.76, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 19.896976107816972, + "learning_rate": 0.00011068702290076336, + "loss": 5.8905, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 16.411578435909693, + "learning_rate": 0.00011450381679389313, + "loss": 5.4038, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 16.411578435909693, + "learning_rate": 0.00011450381679389313, + "loss": 5.4958, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 16.013488491082, + "learning_rate": 0.0001183206106870229, + "loss": 5.6131, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 53.19172274365636, + "learning_rate": 0.00012213740458015266, + "loss": 5.324, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 36.15408253437268, + "learning_rate": 0.00012595419847328244, + "loss": 5.4318, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 19.271262304467943, + "learning_rate": 0.00012977099236641222, + "loss": 5.6865, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 7.0078367115137485, + "learning_rate": 0.000133587786259542, + "loss": 5.5618, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 12.801895272987867, + "learning_rate": 0.00013740458015267177, + "loss": 5.578, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 9.471273863525738, + "learning_rate": 0.00014122137404580154, + "loss": 5.4528, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 11.309015734374855, + "learning_rate": 0.0001450381679389313, + "loss": 5.4494, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 21.597093927953562, + "learning_rate": 0.00014885496183206107, + "loss": 5.5692, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 6.168922998526033, + "learning_rate": 0.00015267175572519084, + "loss": 5.5889, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 26.407797807852603, + "learning_rate": 0.00015648854961832062, + "loss": 5.5628, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 29.41099617504654, + "learning_rate": 0.00016030534351145037, + "loss": 5.6865, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 54.04515562049684, + "learning_rate": 0.00016412213740458014, + "loss": 5.3337, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 21.848816944482426, + "learning_rate": 0.00016793893129770992, + "loss": 5.455, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 21.77248137676844, + "learning_rate": 0.0001717557251908397, + "loss": 5.7747, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 56.97962669273316, + "learning_rate": 0.00017557251908396944, + "loss": 5.8369, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 191.66821782581252, + "learning_rate": 0.00017938931297709925, + "loss": 5.7323, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 29.622135366710424, + "learning_rate": 0.00018320610687022902, + "loss": 5.696, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 24.485759627294552, + "learning_rate": 0.0001870229007633588, + "loss": 5.5594, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 23.761324220206912, + "learning_rate": 0.00019083969465648857, + "loss": 5.4566, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 18.30283741545266, + "learning_rate": 0.00019465648854961832, + "loss": 5.5711, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 46.402196170850495, + "learning_rate": 0.0001984732824427481, + "loss": 5.584, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 27.29379203075219, + "learning_rate": 0.00020229007633587788, + "loss": 5.1017, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 25.72050010332052, + "learning_rate": 0.00020610687022900765, + "loss": 5.4472, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 11.90204071984756, + "learning_rate": 0.0002099236641221374, + "loss": 5.5801, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 10.964221676367517, + "learning_rate": 0.00021374045801526718, + "loss": 5.3488, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 26.717989647168515, + "learning_rate": 0.00021755725190839695, + "loss": 5.1711, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 9.600628953387018, + "learning_rate": 0.00022137404580152673, + "loss": 5.4042, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 22.344895453828535, + "learning_rate": 0.00022519083969465648, + "loss": 5.0538, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 12.190617761435066, + "learning_rate": 0.00022900763358778625, + "loss": 5.4556, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 7.053431452864492, + "learning_rate": 0.00023282442748091603, + "loss": 5.2617, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 49.09212426058433, + "learning_rate": 0.0002366412213740458, + "loss": 5.4076, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 38.22133703991094, + "learning_rate": 0.00024045801526717558, + "loss": 5.3227, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 11.852793468649201, + "learning_rate": 0.00024427480916030533, + "loss": 5.1096, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 11.55578534578585, + "learning_rate": 0.00024809160305343513, + "loss": 5.1777, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 28.44407093616628, + "learning_rate": 0.0002519083969465649, + "loss": 5.1845, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 12.945012628276542, + "learning_rate": 0.00025572519083969463, + "loss": 4.909, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 9.829628432755785, + "learning_rate": 0.00025954198473282443, + "loss": 5.1863, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 55.241813577801985, + "learning_rate": 0.0002633587786259542, + "loss": 5.0904, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 5.074212095135481, + "learning_rate": 0.000267175572519084, + "loss": 5.18, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 30.34910144604937, + "learning_rate": 0.00027099236641221373, + "loss": 5.5077, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 94.78455661102247, + "learning_rate": 0.00027480916030534353, + "loss": 5.2978, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 6.076099688665433, + "learning_rate": 0.0002786259541984733, + "loss": 5.1118, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 7.9946755435844095, + "learning_rate": 0.0002824427480916031, + "loss": 5.0875, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 5.431891860428228, + "learning_rate": 0.0002862595419847328, + "loss": 5.1588, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 4.047652245569124, + "learning_rate": 0.0002900763358778626, + "loss": 5.0777, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 9.660302962418715, + "learning_rate": 0.0002938931297709924, + "loss": 5.073, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 99.18039670794278, + "learning_rate": 0.00029770992366412214, + "loss": 5.2711, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 41.24087771516951, + "learning_rate": 0.00030152671755725194, + "loss": 5.2529, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 15.955748113178291, + "learning_rate": 0.0003053435114503817, + "loss": 5.1201, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 20.04512145951128, + "learning_rate": 0.0003091603053435115, + "loss": 5.5749, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 63.28989845481338, + "learning_rate": 0.00031297709923664124, + "loss": 5.3352, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 30.65392260684527, + "learning_rate": 0.000316793893129771, + "loss": 5.2994, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 19.570933105458376, + "learning_rate": 0.00032061068702290074, + "loss": 5.4428, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 33.27702030284148, + "learning_rate": 0.00032442748091603054, + "loss": 5.5179, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 143.06233640693532, + "learning_rate": 0.0003282442748091603, + "loss": 5.4859, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 17.652574355644063, + "learning_rate": 0.0003320610687022901, + "loss": 5.3578, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 99.14161014042287, + "learning_rate": 0.00033587786259541984, + "loss": 5.1511, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 108.46150115295116, + "learning_rate": 0.00033969465648854964, + "loss": 5.332, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 13.200674925756799, + "learning_rate": 0.0003435114503816794, + "loss": 5.2414, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 17.42613416743868, + "learning_rate": 0.0003473282442748092, + "loss": 5.0496, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 13.787168320397248, + "learning_rate": 0.0003511450381679389, + "loss": 5.2121, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 13.13064175563793, + "learning_rate": 0.0003549618320610687, + "loss": 5.3562, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 15.209194797874353, + "learning_rate": 0.0003587786259541985, + "loss": 5.1117, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 48.91476148699009, + "learning_rate": 0.00036259541984732824, + "loss": 5.3374, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 24.29356061884079, + "learning_rate": 0.00036641221374045805, + "loss": 5.3117, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 17.452060193355486, + "learning_rate": 0.0003702290076335878, + "loss": 5.27, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 12.408641021402929, + "learning_rate": 0.0003740458015267176, + "loss": 5.2371, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 11.216879534678895, + "learning_rate": 0.00037786259541984735, + "loss": 5.3758, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 14.122037288362222, + "learning_rate": 0.00038167938931297715, + "loss": 5.3233, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 7.779989122053916, + "learning_rate": 0.00038549618320610684, + "loss": 5.4026, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 11.626138250979961, + "learning_rate": 0.00038931297709923665, + "loss": 5.0434, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 7.385310006965435, + "learning_rate": 0.0003931297709923664, + "loss": 5.1588, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 6.111534750981958, + "learning_rate": 0.0003969465648854962, + "loss": 5.3004, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 13.108563585041713, + "learning_rate": 0.00040076335877862595, + "loss": 4.9844, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 12.463164343212153, + "learning_rate": 0.00040458015267175575, + "loss": 5.0383, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 26.111947637716288, + "learning_rate": 0.0004083969465648855, + "loss": 4.9475, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 8.324439787028272, + "learning_rate": 0.0004122137404580153, + "loss": 5.0533, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 9.234173217749362, + "learning_rate": 0.00041603053435114505, + "loss": 4.9712, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 5.525390780179457, + "learning_rate": 0.0004198473282442748, + "loss": 5.026, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 5.227610724744892, + "learning_rate": 0.00042366412213740455, + "loss": 5.2434, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 7.154430996699472, + "learning_rate": 0.00042748091603053435, + "loss": 5.0155, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 8.76941351196938, + "learning_rate": 0.00043129770992366415, + "loss": 4.86, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 3.4065374733856135, + "learning_rate": 0.0004351145038167939, + "loss": 4.7745, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 4.933753082082301, + "learning_rate": 0.0004389312977099237, + "loss": 5.1687, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 4.17356231729226, + "learning_rate": 0.00044274809160305345, + "loss": 4.8747, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 3.455904317777208, + "learning_rate": 0.00044656488549618326, + "loss": 4.6564, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 3.9064455284575668, + "learning_rate": 0.00045038167938931295, + "loss": 4.9689, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 2.85035028157288, + "learning_rate": 0.00045419847328244275, + "loss": 5.0042, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 42.12231453713657, + "learning_rate": 0.0004580152671755725, + "loss": 4.9134, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 33.33935309010155, + "learning_rate": 0.0004618320610687023, + "loss": 4.7722, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 3.264613929291729, + "learning_rate": 0.00046564885496183206, + "loss": 4.7236, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 6.284470635260648, + "learning_rate": 0.00046946564885496186, + "loss": 4.9849, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 2.701662592354433, + "learning_rate": 0.0004732824427480916, + "loss": 4.7388, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 17.156141458264692, + "learning_rate": 0.0004770992366412214, + "loss": 4.9292, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 11.896147445442196, + "learning_rate": 0.00048091603053435116, + "loss": 5.0198, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 3.9435670187222325, + "learning_rate": 0.0004847328244274809, + "loss": 4.6752, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 7.213241065570718, + "learning_rate": 0.0004885496183206107, + "loss": 4.6556, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 62.61093309333762, + "learning_rate": 0.0004923664122137404, + "loss": 5.0054, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 112.46264314904546, + "learning_rate": 0.0004961832061068703, + "loss": 5.1782, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 41.97202498980487, + "learning_rate": 0.0005, + "loss": 5.2959, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 11.493483729668776, + "learning_rate": 0.0005038167938931298, + "loss": 5.8508, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 11.021723732018117, + "learning_rate": 0.0005076335877862596, + "loss": 5.2118, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 7.006304671598398, + "learning_rate": 0.0005114503816793893, + "loss": 5.3601, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 7.289194721886362, + "learning_rate": 0.0005152671755725191, + "loss": 5.2464, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 7.311468819904173, + "learning_rate": 0.0005190839694656489, + "loss": 5.2711, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 26.398719072271508, + "learning_rate": 0.0005229007633587787, + "loss": 5.289, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 11.625422270433745, + "learning_rate": 0.0005267175572519084, + "loss": 5.2699, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 49.995882338347684, + "learning_rate": 0.0005305343511450382, + "loss": 5.295, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 38.177044422132425, + "learning_rate": 0.000534351145038168, + "loss": 5.1886, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 18.345238825731563, + "learning_rate": 0.0005381679389312977, + "loss": 5.1197, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 6.121211476002137, + "learning_rate": 0.0005419847328244275, + "loss": 5.1665, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 7.033090979387724, + "learning_rate": 0.0005458015267175572, + "loss": 5.2001, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 8.465524804046279, + "learning_rate": 0.0005496183206106871, + "loss": 5.1638, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 15.612237680230512, + "learning_rate": 0.0005534351145038168, + "loss": 4.7495, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 6.388943581119024, + "learning_rate": 0.0005572519083969466, + "loss": 4.9927, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 4.0182990972402575, + "learning_rate": 0.0005610687022900763, + "loss": 5.0069, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 4.445692947191389, + "learning_rate": 0.0005648854961832062, + "loss": 4.8021, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 14.423302455504722, + "learning_rate": 0.0005687022900763359, + "loss": 4.9944, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 3.863803180947149, + "learning_rate": 0.0005725190839694656, + "loss": 5.0374, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 4.675446873357981, + "learning_rate": 0.0005763358778625954, + "loss": 4.8162, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 7.031369179358917, + "learning_rate": 0.0005801526717557252, + "loss": 5.0332, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 37.0810100366357, + "learning_rate": 0.000583969465648855, + "loss": 4.7682, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 6.66547076623082, + "learning_rate": 0.0005877862595419848, + "loss": 4.8632, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 3.3275498864086015, + "learning_rate": 0.0005916030534351145, + "loss": 4.6909, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 26.188174306741093, + "learning_rate": 0.0005954198473282443, + "loss": 4.6418, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 10.37007904683383, + "learning_rate": 0.0005992366412213741, + "loss": 4.7376, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 5.244057220680342, + "learning_rate": 0.0006030534351145039, + "loss": 4.8339, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 2.5201069409738923, + "learning_rate": 0.0006068702290076335, + "loss": 4.6116, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 5.974803396981725, + "learning_rate": 0.0006106870229007634, + "loss": 4.5625, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 26.210473236143425, + "learning_rate": 0.0006145038167938931, + "loss": 4.7733, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 27.9365658071239, + "learning_rate": 0.000618320610687023, + "loss": 4.911, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 8.230381951592534, + "learning_rate": 0.0006221374045801526, + "loss": 4.7406, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 4.732268568832821, + "learning_rate": 0.0006259541984732825, + "loss": 4.6123, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 9.15487252616632, + "learning_rate": 0.0006297709923664122, + "loss": 4.7896, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 3.4653332544275686, + "learning_rate": 0.000633587786259542, + "loss": 4.7309, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 4.850780915920418, + "learning_rate": 0.0006374045801526717, + "loss": 4.7314, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 7.170622336197695, + "learning_rate": 0.0006412213740458015, + "loss": 4.4504, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 8.243144623116297, + "learning_rate": 0.0006450381679389313, + "loss": 4.804, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 2.966695490413193, + "learning_rate": 0.0006488549618320611, + "loss": 4.3688, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 3.1607265339983766, + "learning_rate": 0.0006526717557251909, + "loss": 4.4259, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 12.640808983517074, + "learning_rate": 0.0006564885496183206, + "loss": 4.7304, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 3.6523872935286272, + "learning_rate": 0.0006603053435114504, + "loss": 4.7386, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 8.30083609660972, + "learning_rate": 0.0006641221374045802, + "loss": 4.5598, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 3.0623057509893377, + "learning_rate": 0.0006679389312977099, + "loss": 4.5194, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 4.883454177185916, + "learning_rate": 0.0006717557251908397, + "loss": 4.5887, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 2.9525920582406457, + "learning_rate": 0.0006755725190839694, + "loss": 4.541, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 4.270008428820161, + "learning_rate": 0.0006793893129770993, + "loss": 4.8857, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 5.672713861411733, + "learning_rate": 0.000683206106870229, + "loss": 4.3204, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 3.761348480989461, + "learning_rate": 0.0006870229007633588, + "loss": 4.3439, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 2.4326020308075034, + "learning_rate": 0.0006908396946564885, + "loss": 4.7289, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 7.427959833506058, + "learning_rate": 0.0006946564885496184, + "loss": 4.5812, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 1.961163714438519, + "learning_rate": 0.0006984732824427481, + "loss": 4.5939, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 6.895535057551813, + "learning_rate": 0.0007022900763358778, + "loss": 4.5405, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 8.807431310414946, + "learning_rate": 0.0007061068702290076, + "loss": 4.5149, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 3.388905199194286, + "learning_rate": 0.0007099236641221374, + "loss": 4.5301, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 7.913993022689122, + "learning_rate": 0.0007137404580152672, + "loss": 4.3506, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 2.333063361221933, + "learning_rate": 0.000717557251908397, + "loss": 4.6727, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 3.2359830428102474, + "learning_rate": 0.0007213740458015267, + "loss": 4.6725, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 3.1482078742978143, + "learning_rate": 0.0007251908396946565, + "loss": 4.6814, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 2.593895050022191, + "learning_rate": 0.0007290076335877863, + "loss": 4.3763, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 2.942514833308484, + "learning_rate": 0.0007328244274809161, + "loss": 4.5224, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 2.252146175157487, + "learning_rate": 0.0007366412213740457, + "loss": 4.607, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 4.628967497586178, + "learning_rate": 0.0007404580152671756, + "loss": 4.6547, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 3.1214679416699287, + "learning_rate": 0.0007442748091603053, + "loss": 4.4936, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 3.2123452294111003, + "learning_rate": 0.0007480916030534352, + "loss": 4.5136, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 2.6031227255266622, + "learning_rate": 0.0007519083969465648, + "loss": 4.5817, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 4.501632369287978, + "learning_rate": 0.0007557251908396947, + "loss": 4.5266, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 3.3047686153645777, + "learning_rate": 0.0007595419847328244, + "loss": 4.6692, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 1.90038519113313, + "learning_rate": 0.0007633587786259543, + "loss": 4.6605, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 3.366071645507772, + "learning_rate": 0.0007671755725190839, + "loss": 4.6692, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 4.031546045036477, + "learning_rate": 0.0007709923664122137, + "loss": 4.5349, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 8.290888966118121, + "learning_rate": 0.0007748091603053435, + "loss": 4.5302, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 2.353501923149862, + "learning_rate": 0.0007786259541984733, + "loss": 4.5482, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 6.391124795162947, + "learning_rate": 0.000782442748091603, + "loss": 4.6502, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 2.000245415269884, + "learning_rate": 0.0007862595419847328, + "loss": 4.7538, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 6.0989354402413465, + "learning_rate": 0.0007900763358778626, + "loss": 4.5299, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 2.7640114509283844, + "learning_rate": 0.0007938931297709924, + "loss": 4.5933, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 2.2141741723409734, + "learning_rate": 0.0007977099236641223, + "loss": 4.6185, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 2.9889777111731024, + "learning_rate": 0.0008015267175572519, + "loss": 4.2585, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 3.8055064049034963, + "learning_rate": 0.0008053435114503816, + "loss": 4.6684, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 2.4440615330231577, + "learning_rate": 0.0008091603053435115, + "loss": 4.4236, + "step": 218 + }, + { + "epoch": 0.03, + "grad_norm": 2.67540754637004, + "learning_rate": 0.0008129770992366412, + "loss": 4.3685, + "step": 219 + }, + { + "epoch": 0.03, + "grad_norm": 5.793630358099666, + "learning_rate": 0.000816793893129771, + "loss": 4.48, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 2.483681384184715, + "learning_rate": 0.0008206106870229007, + "loss": 4.4383, + "step": 221 + }, + { + "epoch": 0.03, + "grad_norm": 2.073067619234986, + "learning_rate": 0.0008244274809160306, + "loss": 4.7067, + "step": 222 + }, + { + "epoch": 0.03, + "grad_norm": 5.512466666008747, + "learning_rate": 0.0008282442748091604, + "loss": 4.4971, + "step": 223 + }, + { + "epoch": 0.03, + "grad_norm": 2.4680754308671604, + "learning_rate": 0.0008320610687022901, + "loss": 4.7178, + "step": 224 + }, + { + "epoch": 0.03, + "grad_norm": 2.350356149063761, + "learning_rate": 0.0008358778625954198, + "loss": 4.5058, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 1.9235303889832627, + "learning_rate": 0.0008396946564885496, + "loss": 4.4643, + "step": 226 + }, + { + "epoch": 0.03, + "grad_norm": 3.4372031826778033, + "learning_rate": 0.0008435114503816795, + "loss": 4.5712, + "step": 227 + }, + { + "epoch": 0.03, + "grad_norm": 1.9601887476121664, + "learning_rate": 0.0008473282442748091, + "loss": 4.4648, + "step": 228 + }, + { + "epoch": 0.03, + "grad_norm": 2.45846639486102, + "learning_rate": 0.000851145038167939, + "loss": 4.5603, + "step": 229 + }, + { + "epoch": 0.03, + "grad_norm": 2.7887446648129437, + "learning_rate": 0.0008549618320610687, + "loss": 4.6925, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 2.2147831858211275, + "learning_rate": 0.0008587786259541986, + "loss": 4.4329, + "step": 231 + }, + { + "epoch": 0.03, + "grad_norm": 4.3706275121022315, + "learning_rate": 0.0008625954198473283, + "loss": 4.6871, + "step": 232 + }, + { + "epoch": 0.03, + "grad_norm": 3.7937170135720426, + "learning_rate": 0.0008664122137404581, + "loss": 4.4012, + "step": 233 + }, + { + "epoch": 0.03, + "grad_norm": 4.435956727807511, + "learning_rate": 0.0008702290076335878, + "loss": 4.3689, + "step": 234 + }, + { + "epoch": 0.03, + "grad_norm": 5.239393029537955, + "learning_rate": 0.0008740458015267176, + "loss": 4.4296, + "step": 235 + }, + { + "epoch": 0.03, + "grad_norm": 3.187055953738488, + "learning_rate": 0.0008778625954198474, + "loss": 4.681, + "step": 236 + }, + { + "epoch": 0.03, + "grad_norm": 2.9083664215185605, + "learning_rate": 0.000881679389312977, + "loss": 4.4031, + "step": 237 + }, + { + "epoch": 0.03, + "grad_norm": 3.1200850538271188, + "learning_rate": 0.0008854961832061069, + "loss": 4.5064, + "step": 238 + }, + { + "epoch": 0.03, + "grad_norm": 2.277426952151577, + "learning_rate": 0.0008893129770992367, + "loss": 4.4562, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 2.588541096844217, + "learning_rate": 0.0008931297709923665, + "loss": 4.4816, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 2.1713161104056526, + "learning_rate": 0.0008969465648854962, + "loss": 4.2547, + "step": 241 + }, + { + "epoch": 0.03, + "grad_norm": 1.9979300139791905, + "learning_rate": 0.0009007633587786259, + "loss": 4.4082, + "step": 242 + }, + { + "epoch": 0.03, + "grad_norm": 7.13986641828722, + "learning_rate": 0.0009045801526717558, + "loss": 4.3108, + "step": 243 + }, + { + "epoch": 0.03, + "grad_norm": 2.464791404148931, + "learning_rate": 0.0009083969465648855, + "loss": 4.4189, + "step": 244 + }, + { + "epoch": 0.03, + "grad_norm": 2.234666589131025, + "learning_rate": 0.0009122137404580153, + "loss": 4.4217, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 4.733346152783746, + "learning_rate": 0.000916030534351145, + "loss": 4.436, + "step": 246 + }, + { + "epoch": 0.03, + "grad_norm": 1.8815470156640648, + "learning_rate": 0.0009198473282442749, + "loss": 4.3699, + "step": 247 + }, + { + "epoch": 0.03, + "grad_norm": 2.985621573989371, + "learning_rate": 0.0009236641221374046, + "loss": 4.4429, + "step": 248 + }, + { + "epoch": 0.03, + "grad_norm": 2.9757159724836653, + "learning_rate": 0.0009274809160305345, + "loss": 4.4512, + "step": 249 + }, + { + "epoch": 0.03, + "grad_norm": 2.579476705353371, + "learning_rate": 0.0009312977099236641, + "loss": 4.4356, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 2.9285783053055527, + "learning_rate": 0.0009351145038167939, + "loss": 4.5585, + "step": 251 + }, + { + "epoch": 0.03, + "grad_norm": 3.65895208962216, + "learning_rate": 0.0009389312977099237, + "loss": 4.2691, + "step": 252 + }, + { + "epoch": 0.03, + "grad_norm": 1.9673319651551415, + "learning_rate": 0.0009427480916030535, + "loss": 4.5441, + "step": 253 + }, + { + "epoch": 0.03, + "grad_norm": 3.1238476750284234, + "learning_rate": 0.0009465648854961832, + "loss": 4.4657, + "step": 254 + }, + { + "epoch": 0.03, + "grad_norm": 1.9438137125018247, + "learning_rate": 0.000950381679389313, + "loss": 4.6594, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 2.450145616842104, + "learning_rate": 0.0009541984732824428, + "loss": 4.7328, + "step": 256 + }, + { + "epoch": 0.03, + "grad_norm": 4.407173830430405, + "learning_rate": 0.0009580152671755726, + "loss": 4.1906, + "step": 257 + }, + { + "epoch": 0.03, + "grad_norm": 4.436698880591971, + "learning_rate": 0.0009618320610687023, + "loss": 4.4834, + "step": 258 + }, + { + "epoch": 0.03, + "grad_norm": 5.230990693011375, + "learning_rate": 0.0009656488549618321, + "loss": 4.4125, + "step": 259 + }, + { + "epoch": 0.03, + "grad_norm": 3.3753204853522574, + "learning_rate": 0.0009694656488549618, + "loss": 4.1311, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 6.025454995306366, + "learning_rate": 0.0009732824427480917, + "loss": 4.5515, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 6.099556869358559, + "learning_rate": 0.0009770992366412213, + "loss": 4.3692, + "step": 262 + }, + { + "epoch": 0.03, + "grad_norm": 2.2532819690576127, + "learning_rate": 0.0009809160305343512, + "loss": 4.4647, + "step": 263 + }, + { + "epoch": 0.03, + "grad_norm": 2.9321564302319585, + "learning_rate": 0.0009847328244274808, + "loss": 4.5391, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 1.7877794819325101, + "learning_rate": 0.0009885496183206107, + "loss": 4.4656, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 1.7179336582935352, + "learning_rate": 0.0009923664122137405, + "loss": 4.4662, + "step": 266 + }, + { + "epoch": 0.03, + "grad_norm": 2.048454145524189, + "learning_rate": 0.0009961832061068704, + "loss": 4.247, + "step": 267 + }, + { + "epoch": 0.03, + "grad_norm": 2.0370693677273164, + "learning_rate": 0.001, + "loss": 4.4564, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 2.5222638452498476, + "learning_rate": 0.0009999999655172654, + "loss": 4.2618, + "step": 269 + }, + { + "epoch": 0.03, + "grad_norm": 2.1108937154982486, + "learning_rate": 0.0009999998620690664, + "loss": 4.5203, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 1.8392493558241325, + "learning_rate": 0.0009999996896554175, + "loss": 4.2112, + "step": 271 + }, + { + "epoch": 0.03, + "grad_norm": 2.2938832897333836, + "learning_rate": 0.0009999994482763422, + "loss": 4.3127, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 1.7529449127976362, + "learning_rate": 0.0009999991379318737, + "loss": 4.3054, + "step": 273 + }, + { + "epoch": 0.03, + "grad_norm": 1.9574686246354027, + "learning_rate": 0.000999998758622055, + "loss": 4.4122, + "step": 274 + }, + { + "epoch": 0.03, + "grad_norm": 5.821259062674746, + "learning_rate": 0.0009999983103469385, + "loss": 4.4032, + "step": 275 + }, + { + "epoch": 0.03, + "grad_norm": 2.4689544587787062, + "learning_rate": 0.0009999977931065857, + "loss": 4.3253, + "step": 276 + }, + { + "epoch": 0.03, + "grad_norm": 2.2321621725081267, + "learning_rate": 0.0009999972069010686, + "loss": 4.1706, + "step": 277 + }, + { + "epoch": 0.03, + "grad_norm": 3.4926949882090748, + "learning_rate": 0.0009999965517304673, + "loss": 4.3912, + "step": 278 + }, + { + "epoch": 0.03, + "grad_norm": 2.360900925114398, + "learning_rate": 0.0009999958275948725, + "loss": 4.3268, + "step": 279 + }, + { + "epoch": 0.03, + "grad_norm": 2.201926126895682, + "learning_rate": 0.0009999950344943842, + "loss": 4.4753, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 1.7171916658696942, + "learning_rate": 0.0009999941724291115, + "loss": 4.3981, + "step": 281 + }, + { + "epoch": 0.03, + "grad_norm": 2.169691498257731, + "learning_rate": 0.0009999932413991737, + "loss": 4.3607, + "step": 282 + }, + { + "epoch": 0.03, + "grad_norm": 1.6266513074005855, + "learning_rate": 0.0009999922414046986, + "loss": 4.4905, + "step": 283 + }, + { + "epoch": 0.03, + "grad_norm": 2.7426231842873903, + "learning_rate": 0.0009999911724458248, + "loss": 4.5396, + "step": 284 + }, + { + "epoch": 0.03, + "grad_norm": 1.5392942285058555, + "learning_rate": 0.0009999900345226994, + "loss": 4.4994, + "step": 285 + }, + { + "epoch": 0.03, + "grad_norm": 1.678356795108113, + "learning_rate": 0.0009999888276354795, + "loss": 4.4029, + "step": 286 + }, + { + "epoch": 0.03, + "grad_norm": 2.691200017633891, + "learning_rate": 0.0009999875517843315, + "loss": 4.2561, + "step": 287 + }, + { + "epoch": 0.03, + "grad_norm": 3.3950536951859154, + "learning_rate": 0.0009999862069694312, + "loss": 4.4023, + "step": 288 + }, + { + "epoch": 0.03, + "grad_norm": 1.8958335318394337, + "learning_rate": 0.0009999847931909645, + "loss": 4.5222, + "step": 289 + }, + { + "epoch": 0.03, + "grad_norm": 1.7171986816761662, + "learning_rate": 0.000999983310449126, + "loss": 4.2009, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 1.7441516633821998, + "learning_rate": 0.0009999817587441203, + "loss": 4.2292, + "step": 291 + }, + { + "epoch": 0.03, + "grad_norm": 2.8399045457207577, + "learning_rate": 0.0009999801380761615, + "loss": 4.3146, + "step": 292 + }, + { + "epoch": 0.03, + "grad_norm": 2.5530683875413502, + "learning_rate": 0.0009999784484454734, + "loss": 4.6165, + "step": 293 + }, + { + "epoch": 0.03, + "grad_norm": 6.893237896818217, + "learning_rate": 0.0009999766898522884, + "loss": 4.4322, + "step": 294 + }, + { + "epoch": 0.03, + "grad_norm": 1.7278460987247395, + "learning_rate": 0.0009999748622968496, + "loss": 4.246, + "step": 295 + }, + { + "epoch": 0.03, + "grad_norm": 4.980795698751086, + "learning_rate": 0.000999972965779409, + "loss": 4.257, + "step": 296 + }, + { + "epoch": 0.03, + "grad_norm": 2.7099296748788064, + "learning_rate": 0.000999971000300228, + "loss": 4.4146, + "step": 297 + }, + { + "epoch": 0.03, + "grad_norm": 4.015270426069638, + "learning_rate": 0.000999968965859578, + "loss": 4.3122, + "step": 298 + }, + { + "epoch": 0.03, + "grad_norm": 1.7048017849738346, + "learning_rate": 0.0009999668624577395, + "loss": 4.563, + "step": 299 + }, + { + "epoch": 0.03, + "grad_norm": 1.890124937059456, + "learning_rate": 0.0009999646900950023, + "loss": 4.6135, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 1.6240602186049096, + "learning_rate": 0.0009999624487716666, + "loss": 4.3446, + "step": 301 + }, + { + "epoch": 0.03, + "grad_norm": 1.900172258525495, + "learning_rate": 0.000999960138488041, + "loss": 4.3648, + "step": 302 + }, + { + "epoch": 0.03, + "grad_norm": 1.6425318766445263, + "learning_rate": 0.0009999577592444443, + "loss": 4.2827, + "step": 303 + }, + { + "epoch": 0.03, + "grad_norm": 2.413089673147163, + "learning_rate": 0.000999955311041205, + "loss": 4.2912, + "step": 304 + }, + { + "epoch": 0.03, + "grad_norm": 2.4031038367300814, + "learning_rate": 0.0009999527938786606, + "loss": 4.6092, + "step": 305 + }, + { + "epoch": 0.04, + "grad_norm": 2.957841858118663, + "learning_rate": 0.0009999502077571581, + "loss": 4.1404, + "step": 306 + }, + { + "epoch": 0.04, + "grad_norm": 2.9290732269664215, + "learning_rate": 0.0009999475526770545, + "loss": 4.4611, + "step": 307 + }, + { + "epoch": 0.04, + "grad_norm": 1.6682796108498466, + "learning_rate": 0.0009999448286387158, + "loss": 4.2804, + "step": 308 + }, + { + "epoch": 0.04, + "grad_norm": 2.8424514884370637, + "learning_rate": 0.0009999420356425178, + "loss": 4.5973, + "step": 309 + }, + { + "epoch": 0.04, + "grad_norm": 2.322467005072296, + "learning_rate": 0.0009999391736888457, + "loss": 4.2886, + "step": 310 + }, + { + "epoch": 0.04, + "grad_norm": 1.8235278372086394, + "learning_rate": 0.0009999362427780942, + "loss": 4.1618, + "step": 311 + }, + { + "epoch": 0.04, + "grad_norm": 2.369434094607981, + "learning_rate": 0.0009999332429106679, + "loss": 4.1849, + "step": 312 + }, + { + "epoch": 0.04, + "grad_norm": 2.0829626206004206, + "learning_rate": 0.00099993017408698, + "loss": 4.2619, + "step": 313 + }, + { + "epoch": 0.04, + "grad_norm": 1.562135665499809, + "learning_rate": 0.0009999270363074547, + "loss": 4.2468, + "step": 314 + }, + { + "epoch": 0.04, + "grad_norm": 1.9672135647360018, + "learning_rate": 0.0009999238295725237, + "loss": 4.2449, + "step": 315 + }, + { + "epoch": 0.04, + "grad_norm": 2.237390477781047, + "learning_rate": 0.00099992055388263, + "loss": 4.4515, + "step": 316 + }, + { + "epoch": 0.04, + "grad_norm": 1.7289822481385195, + "learning_rate": 0.0009999172092382252, + "loss": 4.4428, + "step": 317 + }, + { + "epoch": 0.04, + "grad_norm": 1.807361964324125, + "learning_rate": 0.0009999137956397707, + "loss": 4.2761, + "step": 318 + }, + { + "epoch": 0.04, + "grad_norm": 1.914227784366172, + "learning_rate": 0.0009999103130877373, + "loss": 4.3202, + "step": 319 + }, + { + "epoch": 0.04, + "grad_norm": 1.639973206740026, + "learning_rate": 0.0009999067615826054, + "loss": 4.4163, + "step": 320 + }, + { + "epoch": 0.04, + "grad_norm": 2.636950509972736, + "learning_rate": 0.000999903141124865, + "loss": 4.1639, + "step": 321 + }, + { + "epoch": 0.04, + "grad_norm": 2.92044342908847, + "learning_rate": 0.000999899451715015, + "loss": 4.2216, + "step": 322 + }, + { + "epoch": 0.04, + "grad_norm": 1.8259185649724194, + "learning_rate": 0.0009998956933535649, + "loss": 4.5109, + "step": 323 + }, + { + "epoch": 0.04, + "grad_norm": 4.089977847097526, + "learning_rate": 0.0009998918660410324, + "loss": 4.2523, + "step": 324 + }, + { + "epoch": 0.04, + "grad_norm": 2.035188344176966, + "learning_rate": 0.000999887969777946, + "loss": 4.4065, + "step": 325 + }, + { + "epoch": 0.04, + "grad_norm": 2.290073152608485, + "learning_rate": 0.000999884004564843, + "loss": 4.3425, + "step": 326 + }, + { + "epoch": 0.04, + "grad_norm": 1.9330517104682972, + "learning_rate": 0.00099987997040227, + "loss": 4.1753, + "step": 327 + }, + { + "epoch": 0.04, + "grad_norm": 11.357390832457739, + "learning_rate": 0.0009998758672907838, + "loss": 4.2634, + "step": 328 + }, + { + "epoch": 0.04, + "grad_norm": 2.9448695204114603, + "learning_rate": 0.0009998716952309501, + "loss": 4.3565, + "step": 329 + }, + { + "epoch": 0.04, + "grad_norm": 1.6938183308028358, + "learning_rate": 0.0009998674542233445, + "loss": 4.1042, + "step": 330 + }, + { + "epoch": 0.04, + "grad_norm": 1.710763084670905, + "learning_rate": 0.000999863144268552, + "loss": 4.4057, + "step": 331 + }, + { + "epoch": 0.04, + "grad_norm": 3.48367778651573, + "learning_rate": 0.000999858765367167, + "loss": 4.1762, + "step": 332 + }, + { + "epoch": 0.04, + "grad_norm": 2.771254612675359, + "learning_rate": 0.0009998543175197936, + "loss": 4.2629, + "step": 333 + }, + { + "epoch": 0.04, + "grad_norm": 2.890136265607465, + "learning_rate": 0.000999849800727045, + "loss": 4.3958, + "step": 334 + }, + { + "epoch": 0.04, + "grad_norm": 2.0315577935707196, + "learning_rate": 0.0009998452149895445, + "loss": 4.2555, + "step": 335 + }, + { + "epoch": 0.04, + "grad_norm": 4.67655098266027, + "learning_rate": 0.0009998405603079243, + "loss": 4.3867, + "step": 336 + }, + { + "epoch": 0.04, + "grad_norm": 2.2075607062272584, + "learning_rate": 0.0009998358366828269, + "loss": 4.3525, + "step": 337 + }, + { + "epoch": 0.04, + "grad_norm": 2.21224444669047, + "learning_rate": 0.0009998310441149034, + "loss": 4.4316, + "step": 338 + }, + { + "epoch": 0.04, + "grad_norm": 2.9320424085076833, + "learning_rate": 0.000999826182604815, + "loss": 4.353, + "step": 339 + }, + { + "epoch": 0.04, + "grad_norm": 1.8362671580047158, + "learning_rate": 0.0009998212521532325, + "loss": 4.2803, + "step": 340 + }, + { + "epoch": 0.04, + "grad_norm": 1.824304176840615, + "learning_rate": 0.0009998162527608354, + "loss": 4.2204, + "step": 341 + }, + { + "epoch": 0.04, + "grad_norm": 1.9692547275793184, + "learning_rate": 0.0009998111844283137, + "loss": 4.4642, + "step": 342 + }, + { + "epoch": 0.04, + "grad_norm": 1.9274740726597106, + "learning_rate": 0.0009998060471563665, + "loss": 4.4741, + "step": 343 + }, + { + "epoch": 0.04, + "grad_norm": 1.9171628146499389, + "learning_rate": 0.0009998008409457023, + "loss": 4.3747, + "step": 344 + }, + { + "epoch": 0.04, + "grad_norm": 1.8141376277252172, + "learning_rate": 0.000999795565797039, + "loss": 4.0678, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 2.4331340166834967, + "learning_rate": 0.0009997902217111045, + "loss": 4.1271, + "step": 346 + }, + { + "epoch": 0.04, + "grad_norm": 2.8169361835673388, + "learning_rate": 0.0009997848086886357, + "loss": 4.3644, + "step": 347 + }, + { + "epoch": 0.04, + "grad_norm": 1.5517213596259445, + "learning_rate": 0.0009997793267303792, + "loss": 4.2935, + "step": 348 + }, + { + "epoch": 0.04, + "grad_norm": 1.731354291279978, + "learning_rate": 0.0009997737758370914, + "loss": 4.3753, + "step": 349 + }, + { + "epoch": 0.04, + "grad_norm": 2.9214099135868503, + "learning_rate": 0.0009997681560095378, + "loss": 4.2818, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 2.3168053361047427, + "learning_rate": 0.0009997624672484933, + "loss": 4.243, + "step": 351 + }, + { + "epoch": 0.04, + "grad_norm": 2.608830425511979, + "learning_rate": 0.0009997567095547432, + "loss": 4.3503, + "step": 352 + }, + { + "epoch": 0.04, + "grad_norm": 1.5085659676460148, + "learning_rate": 0.000999750882929081, + "loss": 4.4028, + "step": 353 + }, + { + "epoch": 0.04, + "grad_norm": 3.3475479897967886, + "learning_rate": 0.0009997449873723105, + "loss": 4.3183, + "step": 354 + }, + { + "epoch": 0.04, + "grad_norm": 3.506348953965699, + "learning_rate": 0.000999739022885245, + "loss": 4.3996, + "step": 355 + }, + { + "epoch": 0.04, + "grad_norm": 2.174349927085039, + "learning_rate": 0.0009997329894687072, + "loss": 4.2434, + "step": 356 + }, + { + "epoch": 0.04, + "grad_norm": 5.1077458540111875, + "learning_rate": 0.0009997268871235296, + "loss": 4.3555, + "step": 357 + }, + { + "epoch": 0.04, + "grad_norm": 2.029820315403982, + "learning_rate": 0.0009997207158505533, + "loss": 4.3735, + "step": 358 + }, + { + "epoch": 0.04, + "grad_norm": 2.0327017578579363, + "learning_rate": 0.0009997144756506298, + "loss": 4.4029, + "step": 359 + }, + { + "epoch": 0.04, + "grad_norm": 1.4527800941913545, + "learning_rate": 0.00099970816652462, + "loss": 4.2341, + "step": 360 + }, + { + "epoch": 0.04, + "grad_norm": 1.860703957721792, + "learning_rate": 0.0009997017884733938, + "loss": 4.2853, + "step": 361 + }, + { + "epoch": 0.04, + "grad_norm": 2.0322573868389124, + "learning_rate": 0.000999695341497831, + "loss": 4.1132, + "step": 362 + }, + { + "epoch": 0.04, + "grad_norm": 3.8151951596454006, + "learning_rate": 0.0009996888255988207, + "loss": 4.264, + "step": 363 + }, + { + "epoch": 0.04, + "grad_norm": 1.4077202309359398, + "learning_rate": 0.0009996822407772623, + "loss": 4.2707, + "step": 364 + }, + { + "epoch": 0.04, + "grad_norm": 2.392880519936668, + "learning_rate": 0.0009996755870340633, + "loss": 4.2809, + "step": 365 + }, + { + "epoch": 0.04, + "grad_norm": 4.222587831656337, + "learning_rate": 0.0009996688643701419, + "loss": 4.3933, + "step": 366 + }, + { + "epoch": 0.04, + "grad_norm": 1.607045817504694, + "learning_rate": 0.0009996620727864252, + "loss": 4.3359, + "step": 367 + }, + { + "epoch": 0.04, + "grad_norm": 1.9762970613273254, + "learning_rate": 0.00099965521228385, + "loss": 4.132, + "step": 368 + }, + { + "epoch": 0.04, + "grad_norm": 2.4482652699405167, + "learning_rate": 0.0009996482828633624, + "loss": 4.4534, + "step": 369 + }, + { + "epoch": 0.04, + "grad_norm": 3.9466657139702224, + "learning_rate": 0.0009996412845259183, + "loss": 4.3283, + "step": 370 + }, + { + "epoch": 0.04, + "grad_norm": 2.3363606930209673, + "learning_rate": 0.0009996342172724833, + "loss": 4.3269, + "step": 371 + }, + { + "epoch": 0.04, + "grad_norm": 1.555438533816772, + "learning_rate": 0.0009996270811040318, + "loss": 4.1546, + "step": 372 + }, + { + "epoch": 0.04, + "grad_norm": 1.3799450891693201, + "learning_rate": 0.0009996198760215483, + "loss": 4.0072, + "step": 373 + }, + { + "epoch": 0.04, + "grad_norm": 2.789227331604965, + "learning_rate": 0.0009996126020260262, + "loss": 4.3284, + "step": 374 + }, + { + "epoch": 0.04, + "grad_norm": 5.995065064397545, + "learning_rate": 0.0009996052591184695, + "loss": 4.3202, + "step": 375 + }, + { + "epoch": 0.04, + "grad_norm": 3.1329861418074287, + "learning_rate": 0.0009995978472998905, + "loss": 4.3774, + "step": 376 + }, + { + "epoch": 0.04, + "grad_norm": 1.8037810973037383, + "learning_rate": 0.0009995903665713118, + "loss": 4.3896, + "step": 377 + }, + { + "epoch": 0.04, + "grad_norm": 2.5156038836115218, + "learning_rate": 0.000999582816933765, + "loss": 4.1237, + "step": 378 + }, + { + "epoch": 0.04, + "grad_norm": 2.4809106832995123, + "learning_rate": 0.0009995751983882914, + "loss": 4.3039, + "step": 379 + }, + { + "epoch": 0.04, + "grad_norm": 1.50925999240428, + "learning_rate": 0.000999567510935942, + "loss": 4.4198, + "step": 380 + }, + { + "epoch": 0.04, + "grad_norm": 1.8919292954802003, + "learning_rate": 0.0009995597545777771, + "loss": 4.3502, + "step": 381 + }, + { + "epoch": 0.04, + "grad_norm": 1.4631282964061914, + "learning_rate": 0.0009995519293148666, + "loss": 4.2612, + "step": 382 + }, + { + "epoch": 0.04, + "grad_norm": 2.7030133937626757, + "learning_rate": 0.0009995440351482897, + "loss": 4.4067, + "step": 383 + }, + { + "epoch": 0.04, + "grad_norm": 1.9711718644711285, + "learning_rate": 0.0009995360720791353, + "loss": 4.4199, + "step": 384 + }, + { + "epoch": 0.04, + "grad_norm": 2.054539931349483, + "learning_rate": 0.000999528040108502, + "loss": 4.3721, + "step": 385 + }, + { + "epoch": 0.04, + "grad_norm": 1.703097984938856, + "learning_rate": 0.0009995199392374972, + "loss": 4.4162, + "step": 386 + }, + { + "epoch": 0.04, + "grad_norm": 2.0142157133427907, + "learning_rate": 0.0009995117694672386, + "loss": 4.3357, + "step": 387 + }, + { + "epoch": 0.04, + "grad_norm": 1.8872273699246864, + "learning_rate": 0.000999503530798853, + "loss": 4.3758, + "step": 388 + }, + { + "epoch": 0.04, + "grad_norm": 2.711382463360124, + "learning_rate": 0.0009994952232334766, + "loss": 4.4434, + "step": 389 + }, + { + "epoch": 0.04, + "grad_norm": 2.7816815660153114, + "learning_rate": 0.0009994868467722556, + "loss": 4.2981, + "step": 390 + }, + { + "epoch": 0.04, + "grad_norm": 1.874592225183986, + "learning_rate": 0.0009994784014163449, + "loss": 4.2146, + "step": 391 + }, + { + "epoch": 0.04, + "grad_norm": 1.653423412607284, + "learning_rate": 0.0009994698871669098, + "loss": 4.4266, + "step": 392 + }, + { + "epoch": 0.05, + "grad_norm": 1.715946786925207, + "learning_rate": 0.0009994613040251246, + "loss": 4.3713, + "step": 393 + }, + { + "epoch": 0.05, + "grad_norm": 1.7543716989747122, + "learning_rate": 0.000999452651992173, + "loss": 4.4658, + "step": 394 + }, + { + "epoch": 0.05, + "grad_norm": 4.817152810934617, + "learning_rate": 0.0009994439310692486, + "loss": 4.0528, + "step": 395 + }, + { + "epoch": 0.05, + "grad_norm": 2.7384509728442397, + "learning_rate": 0.0009994351412575542, + "loss": 4.1503, + "step": 396 + }, + { + "epoch": 0.05, + "grad_norm": 2.954948826171897, + "learning_rate": 0.000999426282558302, + "loss": 4.335, + "step": 397 + }, + { + "epoch": 0.05, + "grad_norm": 2.305346089693677, + "learning_rate": 0.000999417354972714, + "loss": 4.1482, + "step": 398 + }, + { + "epoch": 0.05, + "grad_norm": 2.0779507356623155, + "learning_rate": 0.000999408358502022, + "loss": 4.2645, + "step": 399 + }, + { + "epoch": 0.05, + "grad_norm": 3.3992067111514417, + "learning_rate": 0.0009993992931474661, + "loss": 4.4103, + "step": 400 + }, + { + "epoch": 0.05, + "grad_norm": 1.8649210842050046, + "learning_rate": 0.0009993901589102974, + "loss": 4.4022, + "step": 401 + }, + { + "epoch": 0.05, + "grad_norm": 1.8994789460454895, + "learning_rate": 0.0009993809557917754, + "loss": 4.2069, + "step": 402 + }, + { + "epoch": 0.05, + "grad_norm": 4.7193723609231, + "learning_rate": 0.0009993716837931696, + "loss": 4.3625, + "step": 403 + }, + { + "epoch": 0.05, + "grad_norm": 3.0296104523406804, + "learning_rate": 0.000999362342915759, + "loss": 4.3738, + "step": 404 + }, + { + "epoch": 0.05, + "grad_norm": 2.5838482333449595, + "learning_rate": 0.0009993529331608318, + "loss": 4.3047, + "step": 405 + }, + { + "epoch": 0.05, + "grad_norm": 1.4792352344402693, + "learning_rate": 0.0009993434545296862, + "loss": 4.1021, + "step": 406 + }, + { + "epoch": 0.05, + "grad_norm": 3.167619525955671, + "learning_rate": 0.0009993339070236292, + "loss": 4.1586, + "step": 407 + }, + { + "epoch": 0.05, + "grad_norm": 1.6018565716773459, + "learning_rate": 0.000999324290643978, + "loss": 4.0348, + "step": 408 + }, + { + "epoch": 0.05, + "grad_norm": 1.427542081196093, + "learning_rate": 0.0009993146053920588, + "loss": 3.9817, + "step": 409 + }, + { + "epoch": 0.05, + "grad_norm": 1.4610839463941518, + "learning_rate": 0.0009993048512692078, + "loss": 3.9734, + "step": 410 + }, + { + "epoch": 0.05, + "grad_norm": 1.8807297775651135, + "learning_rate": 0.00099929502827677, + "loss": 4.3116, + "step": 411 + }, + { + "epoch": 0.05, + "grad_norm": 1.916794844204584, + "learning_rate": 0.0009992851364161006, + "loss": 4.3492, + "step": 412 + }, + { + "epoch": 0.05, + "grad_norm": 1.6042535763809467, + "learning_rate": 0.0009992751756885637, + "loss": 4.2904, + "step": 413 + }, + { + "epoch": 0.05, + "grad_norm": 3.2432575271957913, + "learning_rate": 0.0009992651460955335, + "loss": 4.4161, + "step": 414 + }, + { + "epoch": 0.05, + "grad_norm": 1.6664656283005956, + "learning_rate": 0.0009992550476383931, + "loss": 4.1874, + "step": 415 + }, + { + "epoch": 0.05, + "grad_norm": 2.399227905333266, + "learning_rate": 0.0009992448803185356, + "loss": 4.2845, + "step": 416 + }, + { + "epoch": 0.05, + "grad_norm": 1.8837521453313297, + "learning_rate": 0.0009992346441373633, + "loss": 4.0753, + "step": 417 + }, + { + "epoch": 0.05, + "grad_norm": 1.6459876908246365, + "learning_rate": 0.0009992243390962883, + "loss": 4.3527, + "step": 418 + }, + { + "epoch": 0.05, + "grad_norm": 1.3935154843005806, + "learning_rate": 0.0009992139651967319, + "loss": 4.2882, + "step": 419 + }, + { + "epoch": 0.05, + "grad_norm": 1.4413558791410421, + "learning_rate": 0.0009992035224401245, + "loss": 4.1771, + "step": 420 + }, + { + "epoch": 0.05, + "grad_norm": 1.7337219904992842, + "learning_rate": 0.0009991930108279074, + "loss": 4.2516, + "step": 421 + }, + { + "epoch": 0.05, + "grad_norm": 1.5745527404436643, + "learning_rate": 0.0009991824303615293, + "loss": 4.2974, + "step": 422 + }, + { + "epoch": 0.05, + "grad_norm": 1.7758463076457154, + "learning_rate": 0.0009991717810424506, + "loss": 4.247, + "step": 423 + }, + { + "epoch": 0.05, + "grad_norm": 1.6631677527762334, + "learning_rate": 0.0009991610628721397, + "loss": 4.4106, + "step": 424 + }, + { + "epoch": 0.05, + "grad_norm": 1.8672230363361033, + "learning_rate": 0.000999150275852075, + "loss": 4.266, + "step": 425 + }, + { + "epoch": 0.05, + "grad_norm": 1.9090522524430469, + "learning_rate": 0.0009991394199837444, + "loss": 4.1662, + "step": 426 + }, + { + "epoch": 0.05, + "grad_norm": 1.91133916486316, + "learning_rate": 0.0009991284952686455, + "loss": 3.9843, + "step": 427 + }, + { + "epoch": 0.05, + "grad_norm": 1.6146778810250457, + "learning_rate": 0.0009991175017082848, + "loss": 4.0937, + "step": 428 + }, + { + "epoch": 0.05, + "grad_norm": 2.200513031755401, + "learning_rate": 0.0009991064393041786, + "loss": 4.1786, + "step": 429 + }, + { + "epoch": 0.05, + "grad_norm": 1.588028511494425, + "learning_rate": 0.0009990953080578533, + "loss": 4.2679, + "step": 430 + }, + { + "epoch": 0.05, + "grad_norm": 1.8832431580890614, + "learning_rate": 0.0009990841079708435, + "loss": 4.1996, + "step": 431 + }, + { + "epoch": 0.05, + "grad_norm": 2.938964510419675, + "learning_rate": 0.0009990728390446946, + "loss": 4.0794, + "step": 432 + }, + { + "epoch": 0.05, + "grad_norm": 2.352905728010987, + "learning_rate": 0.0009990615012809608, + "loss": 4.0449, + "step": 433 + }, + { + "epoch": 0.05, + "grad_norm": 1.5626585709764087, + "learning_rate": 0.0009990500946812058, + "loss": 4.4967, + "step": 434 + }, + { + "epoch": 0.05, + "grad_norm": 2.750916997679828, + "learning_rate": 0.000999038619247003, + "loss": 4.2462, + "step": 435 + }, + { + "epoch": 0.05, + "grad_norm": 1.5449652538799465, + "learning_rate": 0.0009990270749799352, + "loss": 4.3279, + "step": 436 + }, + { + "epoch": 0.05, + "grad_norm": 1.9330843091419507, + "learning_rate": 0.0009990154618815948, + "loss": 4.0706, + "step": 437 + }, + { + "epoch": 0.05, + "grad_norm": 2.1510611600876746, + "learning_rate": 0.0009990037799535833, + "loss": 4.4807, + "step": 438 + }, + { + "epoch": 0.05, + "grad_norm": 2.612616012426449, + "learning_rate": 0.0009989920291975124, + "loss": 4.2772, + "step": 439 + }, + { + "epoch": 0.05, + "grad_norm": 1.5717237611199786, + "learning_rate": 0.0009989802096150029, + "loss": 4.3891, + "step": 440 + }, + { + "epoch": 0.05, + "grad_norm": 1.577107474934209, + "learning_rate": 0.0009989683212076848, + "loss": 4.1637, + "step": 441 + }, + { + "epoch": 0.05, + "grad_norm": 1.6478300244580233, + "learning_rate": 0.0009989563639771978, + "loss": 4.2522, + "step": 442 + }, + { + "epoch": 0.05, + "grad_norm": 1.6927587430572832, + "learning_rate": 0.0009989443379251916, + "loss": 4.3065, + "step": 443 + }, + { + "epoch": 0.05, + "grad_norm": 2.8891996311556603, + "learning_rate": 0.0009989322430533245, + "loss": 4.4178, + "step": 444 + }, + { + "epoch": 0.05, + "grad_norm": 1.8081540980021196, + "learning_rate": 0.0009989200793632652, + "loss": 4.1146, + "step": 445 + }, + { + "epoch": 0.05, + "grad_norm": 1.7554401129243336, + "learning_rate": 0.0009989078468566912, + "loss": 4.2285, + "step": 446 + }, + { + "epoch": 0.05, + "grad_norm": 1.8601166131951627, + "learning_rate": 0.0009988955455352898, + "loss": 4.313, + "step": 447 + }, + { + "epoch": 0.05, + "grad_norm": 1.3043767220073055, + "learning_rate": 0.0009988831754007576, + "loss": 4.2489, + "step": 448 + }, + { + "epoch": 0.05, + "grad_norm": 1.3446798904621964, + "learning_rate": 0.000998870736454801, + "loss": 4.0232, + "step": 449 + }, + { + "epoch": 0.05, + "grad_norm": 1.5138647439544766, + "learning_rate": 0.0009988582286991356, + "loss": 4.2073, + "step": 450 + }, + { + "epoch": 0.05, + "grad_norm": 4.3473933009612535, + "learning_rate": 0.0009988456521354868, + "loss": 4.1479, + "step": 451 + }, + { + "epoch": 0.05, + "grad_norm": 1.7081683867716282, + "learning_rate": 0.000998833006765589, + "loss": 4.1977, + "step": 452 + }, + { + "epoch": 0.05, + "grad_norm": 2.008695733739199, + "learning_rate": 0.0009988202925911864, + "loss": 4.3076, + "step": 453 + }, + { + "epoch": 0.05, + "grad_norm": 2.226676390416783, + "learning_rate": 0.000998807509614033, + "loss": 4.3455, + "step": 454 + }, + { + "epoch": 0.05, + "grad_norm": 2.4529790355120613, + "learning_rate": 0.0009987946578358918, + "loss": 4.2358, + "step": 455 + }, + { + "epoch": 0.05, + "grad_norm": 1.702687405469556, + "learning_rate": 0.0009987817372585355, + "loss": 4.0999, + "step": 456 + }, + { + "epoch": 0.05, + "grad_norm": 1.6464839082624225, + "learning_rate": 0.000998768747883746, + "loss": 4.1645, + "step": 457 + }, + { + "epoch": 0.05, + "grad_norm": 1.8219406274361665, + "learning_rate": 0.0009987556897133151, + "loss": 4.2115, + "step": 458 + }, + { + "epoch": 0.05, + "grad_norm": 2.0047366208483584, + "learning_rate": 0.0009987425627490441, + "loss": 4.3685, + "step": 459 + }, + { + "epoch": 0.05, + "grad_norm": 1.6433088845060244, + "learning_rate": 0.0009987293669927436, + "loss": 4.1339, + "step": 460 + }, + { + "epoch": 0.05, + "grad_norm": 1.7008352746961095, + "learning_rate": 0.0009987161024462333, + "loss": 4.4192, + "step": 461 + }, + { + "epoch": 0.05, + "grad_norm": 3.81429443488762, + "learning_rate": 0.0009987027691113432, + "loss": 4.3099, + "step": 462 + }, + { + "epoch": 0.05, + "grad_norm": 2.957255037549792, + "learning_rate": 0.0009986893669899123, + "loss": 4.0336, + "step": 463 + }, + { + "epoch": 0.05, + "grad_norm": 1.4487172164503406, + "learning_rate": 0.0009986758960837889, + "loss": 4.334, + "step": 464 + }, + { + "epoch": 0.05, + "grad_norm": 21.030796056337785, + "learning_rate": 0.0009986623563948314, + "loss": 4.2902, + "step": 465 + }, + { + "epoch": 0.05, + "grad_norm": 1.3780642257801548, + "learning_rate": 0.000998648747924907, + "loss": 4.3129, + "step": 466 + }, + { + "epoch": 0.05, + "grad_norm": 1.705575202553169, + "learning_rate": 0.0009986350706758934, + "loss": 4.3348, + "step": 467 + }, + { + "epoch": 0.05, + "grad_norm": 2.084448408644449, + "learning_rate": 0.0009986213246496762, + "loss": 4.3745, + "step": 468 + }, + { + "epoch": 0.05, + "grad_norm": 2.1588504930194965, + "learning_rate": 0.000998607509848152, + "loss": 4.2281, + "step": 469 + }, + { + "epoch": 0.05, + "grad_norm": 1.7234762989631718, + "learning_rate": 0.0009985936262732263, + "loss": 4.2508, + "step": 470 + }, + { + "epoch": 0.05, + "grad_norm": 4.0707378126056994, + "learning_rate": 0.0009985796739268138, + "loss": 4.2954, + "step": 471 + }, + { + "epoch": 0.05, + "grad_norm": 1.800964140352813, + "learning_rate": 0.000998565652810839, + "loss": 3.9769, + "step": 472 + }, + { + "epoch": 0.05, + "grad_norm": 1.4446475932540723, + "learning_rate": 0.000998551562927236, + "loss": 4.3583, + "step": 473 + }, + { + "epoch": 0.05, + "grad_norm": 2.523896449216337, + "learning_rate": 0.000998537404277948, + "loss": 4.1434, + "step": 474 + }, + { + "epoch": 0.05, + "grad_norm": 1.9069530193850508, + "learning_rate": 0.0009985231768649284, + "loss": 4.195, + "step": 475 + }, + { + "epoch": 0.05, + "grad_norm": 1.5923551803269194, + "learning_rate": 0.000998508880690139, + "loss": 4.3102, + "step": 476 + }, + { + "epoch": 0.05, + "grad_norm": 1.3656745062183016, + "learning_rate": 0.000998494515755552, + "loss": 4.1989, + "step": 477 + }, + { + "epoch": 0.05, + "grad_norm": 1.9344111813161828, + "learning_rate": 0.0009984800820631488, + "loss": 4.3079, + "step": 478 + }, + { + "epoch": 0.05, + "grad_norm": 1.8460016090634654, + "learning_rate": 0.0009984655796149201, + "loss": 4.3253, + "step": 479 + }, + { + "epoch": 0.06, + "grad_norm": 1.4443807170439855, + "learning_rate": 0.0009984510084128661, + "loss": 4.2087, + "step": 480 + }, + { + "epoch": 0.06, + "grad_norm": 2.504351563017082, + "learning_rate": 0.0009984363684589972, + "loss": 4.1932, + "step": 481 + }, + { + "epoch": 0.06, + "grad_norm": 1.875473736895432, + "learning_rate": 0.0009984216597553322, + "loss": 4.2492, + "step": 482 + }, + { + "epoch": 0.06, + "grad_norm": 1.782924005727873, + "learning_rate": 0.0009984068823039, + "loss": 4.2634, + "step": 483 + }, + { + "epoch": 0.06, + "grad_norm": 1.7377419834196606, + "learning_rate": 0.0009983920361067388, + "loss": 4.1939, + "step": 484 + }, + { + "epoch": 0.06, + "grad_norm": 1.49780906534707, + "learning_rate": 0.0009983771211658965, + "loss": 4.2586, + "step": 485 + }, + { + "epoch": 0.06, + "grad_norm": 5.212474395281273, + "learning_rate": 0.0009983621374834303, + "loss": 4.2255, + "step": 486 + }, + { + "epoch": 0.06, + "grad_norm": 1.3335553929577793, + "learning_rate": 0.0009983470850614068, + "loss": 4.0619, + "step": 487 + }, + { + "epoch": 0.06, + "grad_norm": 1.7155363566328607, + "learning_rate": 0.0009983319639019024, + "loss": 4.1229, + "step": 488 + }, + { + "epoch": 0.06, + "grad_norm": 1.78815904460192, + "learning_rate": 0.0009983167740070025, + "loss": 4.2626, + "step": 489 + }, + { + "epoch": 0.06, + "grad_norm": 4.786593652576243, + "learning_rate": 0.0009983015153788026, + "loss": 4.1652, + "step": 490 + }, + { + "epoch": 0.06, + "grad_norm": 2.03885457511657, + "learning_rate": 0.000998286188019407, + "loss": 4.2851, + "step": 491 + }, + { + "epoch": 0.06, + "grad_norm": 1.912137282293859, + "learning_rate": 0.00099827079193093, + "loss": 4.2247, + "step": 492 + }, + { + "epoch": 0.06, + "grad_norm": 1.5207324767271615, + "learning_rate": 0.0009982553271154953, + "loss": 4.1751, + "step": 493 + }, + { + "epoch": 0.06, + "grad_norm": 1.594172759661226, + "learning_rate": 0.0009982397935752356, + "loss": 4.1073, + "step": 494 + }, + { + "epoch": 0.06, + "grad_norm": 1.4192927756443627, + "learning_rate": 0.0009982241913122937, + "loss": 4.0173, + "step": 495 + }, + { + "epoch": 0.06, + "grad_norm": 1.5622376954998693, + "learning_rate": 0.000998208520328822, + "loss": 4.4546, + "step": 496 + }, + { + "epoch": 0.06, + "grad_norm": 1.7975572398456645, + "learning_rate": 0.0009981927806269812, + "loss": 4.0506, + "step": 497 + }, + { + "epoch": 0.06, + "grad_norm": 1.586452293418299, + "learning_rate": 0.0009981769722089428, + "loss": 4.3771, + "step": 498 + }, + { + "epoch": 0.06, + "grad_norm": 2.507141545274864, + "learning_rate": 0.0009981610950768873, + "loss": 4.0819, + "step": 499 + }, + { + "epoch": 0.06, + "grad_norm": 1.589922120273323, + "learning_rate": 0.0009981451492330046, + "loss": 4.3237, + "step": 500 + }, + { + "epoch": 0.06, + "grad_norm": 1.5099246398548452, + "learning_rate": 0.000998129134679494, + "loss": 4.2263, + "step": 501 + }, + { + "epoch": 0.06, + "grad_norm": 1.5278854374975164, + "learning_rate": 0.0009981130514185646, + "loss": 4.1423, + "step": 502 + }, + { + "epoch": 0.06, + "grad_norm": 2.071764368335239, + "learning_rate": 0.0009980968994524344, + "loss": 4.1247, + "step": 503 + }, + { + "epoch": 0.06, + "grad_norm": 4.360920871701456, + "learning_rate": 0.0009980806787833316, + "loss": 4.3393, + "step": 504 + }, + { + "epoch": 0.06, + "grad_norm": 1.4479058911134457, + "learning_rate": 0.0009980643894134935, + "loss": 4.2023, + "step": 505 + }, + { + "epoch": 0.06, + "grad_norm": 1.4282311467477264, + "learning_rate": 0.000998048031345167, + "loss": 4.1102, + "step": 506 + }, + { + "epoch": 0.06, + "grad_norm": 1.6012850105837713, + "learning_rate": 0.0009980316045806082, + "loss": 4.0026, + "step": 507 + }, + { + "epoch": 0.06, + "grad_norm": 1.5234065973771034, + "learning_rate": 0.0009980151091220826, + "loss": 4.1014, + "step": 508 + }, + { + "epoch": 0.06, + "grad_norm": 1.7079381420146302, + "learning_rate": 0.000997998544971866, + "loss": 4.4228, + "step": 509 + }, + { + "epoch": 0.06, + "grad_norm": 3.0879048988915425, + "learning_rate": 0.0009979819121322426, + "loss": 4.0952, + "step": 510 + }, + { + "epoch": 0.06, + "grad_norm": 1.2534923658055779, + "learning_rate": 0.000997965210605507, + "loss": 4.2086, + "step": 511 + }, + { + "epoch": 0.06, + "grad_norm": 2.3904070210883575, + "learning_rate": 0.0009979484403939626, + "loss": 3.9227, + "step": 512 + }, + { + "epoch": 0.06, + "grad_norm": 2.2013550010173057, + "learning_rate": 0.0009979316014999226, + "loss": 4.0698, + "step": 513 + }, + { + "epoch": 0.06, + "grad_norm": 1.874971330992358, + "learning_rate": 0.0009979146939257098, + "loss": 4.1274, + "step": 514 + }, + { + "epoch": 0.06, + "grad_norm": 1.2884000658407369, + "learning_rate": 0.000997897717673656, + "loss": 4.0722, + "step": 515 + }, + { + "epoch": 0.06, + "grad_norm": 1.4247112824547676, + "learning_rate": 0.0009978806727461028, + "loss": 4.1469, + "step": 516 + }, + { + "epoch": 0.06, + "grad_norm": 1.8338444652621828, + "learning_rate": 0.000997863559145401, + "loss": 4.2789, + "step": 517 + }, + { + "epoch": 0.06, + "grad_norm": 2.585322298329817, + "learning_rate": 0.0009978463768739118, + "loss": 4.1187, + "step": 518 + }, + { + "epoch": 0.06, + "grad_norm": 1.451768166506986, + "learning_rate": 0.0009978291259340045, + "loss": 3.8886, + "step": 519 + }, + { + "epoch": 0.06, + "grad_norm": 2.2135602121699254, + "learning_rate": 0.0009978118063280587, + "loss": 4.2749, + "step": 520 + }, + { + "epoch": 0.06, + "grad_norm": 7.7690976475861095, + "learning_rate": 0.0009977944180584637, + "loss": 3.9638, + "step": 521 + }, + { + "epoch": 0.06, + "grad_norm": 3.595326296207757, + "learning_rate": 0.0009977769611276173, + "loss": 4.134, + "step": 522 + }, + { + "epoch": 0.06, + "grad_norm": 2.1456430341295403, + "learning_rate": 0.0009977594355379275, + "loss": 4.0029, + "step": 523 + }, + { + "epoch": 0.06, + "grad_norm": 1.4701388696275608, + "learning_rate": 0.000997741841291812, + "loss": 4.1146, + "step": 524 + }, + { + "epoch": 0.06, + "grad_norm": 2.599532997688662, + "learning_rate": 0.000997724178391697, + "loss": 4.307, + "step": 525 + }, + { + "epoch": 0.06, + "grad_norm": 1.5914841650355807, + "learning_rate": 0.0009977064468400193, + "loss": 4.2285, + "step": 526 + }, + { + "epoch": 0.06, + "grad_norm": 2.20154652964313, + "learning_rate": 0.0009976886466392244, + "loss": 4.2076, + "step": 527 + }, + { + "epoch": 0.06, + "grad_norm": 2.8448031908182587, + "learning_rate": 0.0009976707777917676, + "loss": 4.086, + "step": 528 + }, + { + "epoch": 0.06, + "grad_norm": 2.333745987165948, + "learning_rate": 0.0009976528403001133, + "loss": 3.9884, + "step": 529 + }, + { + "epoch": 0.06, + "grad_norm": 1.9260832400646073, + "learning_rate": 0.0009976348341667358, + "loss": 4.2554, + "step": 530 + }, + { + "epoch": 0.06, + "grad_norm": 1.279482443540274, + "learning_rate": 0.0009976167593941188, + "loss": 4.3154, + "step": 531 + }, + { + "epoch": 0.06, + "grad_norm": 5.483665194492572, + "learning_rate": 0.000997598615984755, + "loss": 4.1684, + "step": 532 + }, + { + "epoch": 0.06, + "grad_norm": 1.4700802465594387, + "learning_rate": 0.0009975804039411475, + "loss": 4.2683, + "step": 533 + }, + { + "epoch": 0.06, + "grad_norm": 1.556617514531295, + "learning_rate": 0.0009975621232658082, + "loss": 4.2066, + "step": 534 + }, + { + "epoch": 0.06, + "grad_norm": 5.938519899910758, + "learning_rate": 0.000997543773961258, + "loss": 4.2524, + "step": 535 + }, + { + "epoch": 0.06, + "grad_norm": 1.2585559541296696, + "learning_rate": 0.0009975253560300283, + "loss": 3.9952, + "step": 536 + }, + { + "epoch": 0.06, + "grad_norm": 1.6792312392559359, + "learning_rate": 0.0009975068694746596, + "loss": 4.2522, + "step": 537 + }, + { + "epoch": 0.06, + "grad_norm": 1.6587467590262839, + "learning_rate": 0.0009974883142977015, + "loss": 4.3997, + "step": 538 + }, + { + "epoch": 0.06, + "grad_norm": 2.3202724469002165, + "learning_rate": 0.0009974696905017135, + "loss": 4.0084, + "step": 539 + }, + { + "epoch": 0.06, + "grad_norm": 2.4828249324301743, + "learning_rate": 0.0009974509980892642, + "loss": 4.1457, + "step": 540 + }, + { + "epoch": 0.06, + "grad_norm": 1.6082393164446473, + "learning_rate": 0.0009974322370629321, + "loss": 4.2472, + "step": 541 + }, + { + "epoch": 0.06, + "grad_norm": 1.6290195202216855, + "learning_rate": 0.000997413407425305, + "loss": 4.1145, + "step": 542 + }, + { + "epoch": 0.06, + "grad_norm": 1.523104617602103, + "learning_rate": 0.0009973945091789796, + "loss": 4.244, + "step": 543 + }, + { + "epoch": 0.06, + "grad_norm": 1.5844748833215803, + "learning_rate": 0.000997375542326563, + "loss": 4.1834, + "step": 544 + }, + { + "epoch": 0.06, + "grad_norm": 1.669913922464535, + "learning_rate": 0.0009973565068706711, + "loss": 3.9686, + "step": 545 + }, + { + "epoch": 0.06, + "grad_norm": 1.434144360816346, + "learning_rate": 0.0009973374028139296, + "loss": 4.1261, + "step": 546 + }, + { + "epoch": 0.06, + "grad_norm": 1.9903829897651333, + "learning_rate": 0.0009973182301589736, + "loss": 4.2357, + "step": 547 + }, + { + "epoch": 0.06, + "grad_norm": 1.3247511726465688, + "learning_rate": 0.0009972989889084473, + "loss": 4.0462, + "step": 548 + }, + { + "epoch": 0.06, + "grad_norm": 2.064275838901619, + "learning_rate": 0.000997279679065005, + "loss": 4.1727, + "step": 549 + }, + { + "epoch": 0.06, + "grad_norm": 3.6651841984017866, + "learning_rate": 0.0009972603006313098, + "loss": 4.35, + "step": 550 + }, + { + "epoch": 0.06, + "grad_norm": 2.0187157158460702, + "learning_rate": 0.000997240853610035, + "loss": 4.1029, + "step": 551 + }, + { + "epoch": 0.06, + "grad_norm": 1.4129647219336012, + "learning_rate": 0.0009972213380038627, + "loss": 4.0784, + "step": 552 + }, + { + "epoch": 0.06, + "grad_norm": 1.4235750312775746, + "learning_rate": 0.0009972017538154845, + "loss": 4.2098, + "step": 553 + }, + { + "epoch": 0.06, + "grad_norm": 1.7876370331041684, + "learning_rate": 0.000997182101047602, + "loss": 3.9709, + "step": 554 + }, + { + "epoch": 0.06, + "grad_norm": 1.8227111300640289, + "learning_rate": 0.0009971623797029258, + "loss": 4.0964, + "step": 555 + }, + { + "epoch": 0.06, + "grad_norm": 1.5843076877398035, + "learning_rate": 0.0009971425897841765, + "loss": 3.9849, + "step": 556 + }, + { + "epoch": 0.06, + "grad_norm": 2.132265876377265, + "learning_rate": 0.0009971227312940826, + "loss": 4.1936, + "step": 557 + }, + { + "epoch": 0.06, + "grad_norm": 1.6889622193517102, + "learning_rate": 0.0009971028042353844, + "loss": 4.1141, + "step": 558 + }, + { + "epoch": 0.06, + "grad_norm": 2.400131063705241, + "learning_rate": 0.00099708280861083, + "loss": 4.1785, + "step": 559 + }, + { + "epoch": 0.06, + "grad_norm": 2.9345285584843466, + "learning_rate": 0.0009970627444231776, + "loss": 4.1451, + "step": 560 + }, + { + "epoch": 0.06, + "grad_norm": 1.5176755910159163, + "learning_rate": 0.000997042611675194, + "loss": 3.8362, + "step": 561 + }, + { + "epoch": 0.06, + "grad_norm": 5.320823231381811, + "learning_rate": 0.0009970224103696568, + "loss": 4.0823, + "step": 562 + }, + { + "epoch": 0.06, + "grad_norm": 1.3897336441661774, + "learning_rate": 0.0009970021405093523, + "loss": 4.2072, + "step": 563 + }, + { + "epoch": 0.06, + "grad_norm": 3.4344538608757165, + "learning_rate": 0.0009969818020970761, + "loss": 3.993, + "step": 564 + }, + { + "epoch": 0.06, + "grad_norm": 1.9334547713916386, + "learning_rate": 0.0009969613951356338, + "loss": 4.1662, + "step": 565 + }, + { + "epoch": 0.06, + "grad_norm": 1.358149840169689, + "learning_rate": 0.0009969409196278398, + "loss": 4.0533, + "step": 566 + }, + { + "epoch": 0.07, + "grad_norm": 1.60757100887447, + "learning_rate": 0.0009969203755765186, + "loss": 4.2153, + "step": 567 + }, + { + "epoch": 0.07, + "grad_norm": 2.183863320900081, + "learning_rate": 0.0009968997629845038, + "loss": 4.1311, + "step": 568 + }, + { + "epoch": 0.07, + "grad_norm": 1.5471200211544884, + "learning_rate": 0.0009968790818546383, + "loss": 4.0602, + "step": 569 + }, + { + "epoch": 0.07, + "grad_norm": 1.9183073849495123, + "learning_rate": 0.000996858332189775, + "loss": 4.1157, + "step": 570 + }, + { + "epoch": 0.07, + "grad_norm": 1.894555648296673, + "learning_rate": 0.0009968375139927756, + "loss": 4.0403, + "step": 571 + }, + { + "epoch": 0.07, + "grad_norm": 1.476813880071536, + "learning_rate": 0.000996816627266512, + "loss": 4.4461, + "step": 572 + }, + { + "epoch": 0.07, + "grad_norm": 1.3078705247856606, + "learning_rate": 0.0009967956720138647, + "loss": 4.0242, + "step": 573 + }, + { + "epoch": 0.07, + "grad_norm": 1.4241452228625129, + "learning_rate": 0.0009967746482377243, + "loss": 4.1377, + "step": 574 + }, + { + "epoch": 0.07, + "grad_norm": 2.088606286674306, + "learning_rate": 0.0009967535559409905, + "loss": 4.1366, + "step": 575 + }, + { + "epoch": 0.07, + "grad_norm": 3.1183697030300097, + "learning_rate": 0.0009967323951265725, + "loss": 4.0718, + "step": 576 + }, + { + "epoch": 0.07, + "grad_norm": 2.8181021636795642, + "learning_rate": 0.0009967111657973892, + "loss": 3.9951, + "step": 577 + }, + { + "epoch": 0.07, + "grad_norm": 1.7985810211091366, + "learning_rate": 0.000996689867956369, + "loss": 4.1789, + "step": 578 + }, + { + "epoch": 0.07, + "grad_norm": 2.674205183704797, + "learning_rate": 0.0009966685016064491, + "loss": 4.264, + "step": 579 + }, + { + "epoch": 0.07, + "grad_norm": 1.433067428408238, + "learning_rate": 0.0009966470667505767, + "loss": 4.2296, + "step": 580 + }, + { + "epoch": 0.07, + "grad_norm": 1.9155876942205001, + "learning_rate": 0.0009966255633917086, + "loss": 4.1366, + "step": 581 + }, + { + "epoch": 0.07, + "grad_norm": 1.540915979669818, + "learning_rate": 0.0009966039915328105, + "loss": 4.2495, + "step": 582 + }, + { + "epoch": 0.07, + "grad_norm": 2.296715200614702, + "learning_rate": 0.0009965823511768578, + "loss": 4.1212, + "step": 583 + }, + { + "epoch": 0.07, + "grad_norm": 4.306543377455833, + "learning_rate": 0.0009965606423268355, + "loss": 4.2714, + "step": 584 + }, + { + "epoch": 0.07, + "grad_norm": 2.7273977277692967, + "learning_rate": 0.000996538864985738, + "loss": 4.2327, + "step": 585 + }, + { + "epoch": 0.07, + "grad_norm": 1.7551143785243801, + "learning_rate": 0.0009965170191565688, + "loss": 4.0823, + "step": 586 + }, + { + "epoch": 0.07, + "grad_norm": 1.893740375273181, + "learning_rate": 0.0009964951048423414, + "loss": 4.1585, + "step": 587 + }, + { + "epoch": 0.07, + "grad_norm": 1.9905641083956318, + "learning_rate": 0.0009964731220460784, + "loss": 4.1868, + "step": 588 + }, + { + "epoch": 0.07, + "grad_norm": 1.5449030665713914, + "learning_rate": 0.000996451070770812, + "loss": 4.2439, + "step": 589 + }, + { + "epoch": 0.07, + "grad_norm": 1.4462505162959032, + "learning_rate": 0.0009964289510195831, + "loss": 4.1768, + "step": 590 + }, + { + "epoch": 0.07, + "grad_norm": 1.5434352739544415, + "learning_rate": 0.0009964067627954436, + "loss": 4.1383, + "step": 591 + }, + { + "epoch": 0.07, + "grad_norm": 1.5851040024218337, + "learning_rate": 0.0009963845061014534, + "loss": 4.2143, + "step": 592 + }, + { + "epoch": 0.07, + "grad_norm": 2.9737392162317438, + "learning_rate": 0.0009963621809406826, + "loss": 4.3159, + "step": 593 + }, + { + "epoch": 0.07, + "grad_norm": 2.4643244274622575, + "learning_rate": 0.0009963397873162107, + "loss": 3.9931, + "step": 594 + }, + { + "epoch": 0.07, + "grad_norm": 1.7591523096500026, + "learning_rate": 0.0009963173252311257, + "loss": 4.4628, + "step": 595 + }, + { + "epoch": 0.07, + "grad_norm": 2.192205108599557, + "learning_rate": 0.0009962947946885268, + "loss": 4.0644, + "step": 596 + }, + { + "epoch": 0.07, + "grad_norm": 2.6868547989070533, + "learning_rate": 0.000996272195691521, + "loss": 4.0916, + "step": 597 + }, + { + "epoch": 0.07, + "grad_norm": 1.512219098700672, + "learning_rate": 0.0009962495282432255, + "loss": 4.0525, + "step": 598 + }, + { + "epoch": 0.07, + "grad_norm": 2.966362752698928, + "learning_rate": 0.0009962267923467672, + "loss": 4.1998, + "step": 599 + }, + { + "epoch": 0.07, + "grad_norm": 1.8545509706675711, + "learning_rate": 0.0009962039880052817, + "loss": 4.2738, + "step": 600 + }, + { + "epoch": 0.07, + "grad_norm": 1.612391865955191, + "learning_rate": 0.0009961811152219148, + "loss": 4.3363, + "step": 601 + }, + { + "epoch": 0.07, + "grad_norm": 2.215275682880689, + "learning_rate": 0.0009961581739998209, + "loss": 4.1228, + "step": 602 + }, + { + "epoch": 0.07, + "grad_norm": 1.5388436719900926, + "learning_rate": 0.0009961351643421646, + "loss": 4.2861, + "step": 603 + }, + { + "epoch": 0.07, + "grad_norm": 3.26413100231765, + "learning_rate": 0.0009961120862521195, + "loss": 4.1929, + "step": 604 + }, + { + "epoch": 0.07, + "grad_norm": 1.7755420053851467, + "learning_rate": 0.000996088939732869, + "loss": 4.0446, + "step": 605 + }, + { + "epoch": 0.07, + "grad_norm": 2.051055947769499, + "learning_rate": 0.0009960657247876056, + "loss": 4.2947, + "step": 606 + }, + { + "epoch": 0.07, + "grad_norm": 1.85108045901961, + "learning_rate": 0.000996042441419531, + "loss": 4.1702, + "step": 607 + }, + { + "epoch": 0.07, + "grad_norm": 1.7304917268419593, + "learning_rate": 0.0009960190896318572, + "loss": 4.0421, + "step": 608 + }, + { + "epoch": 0.07, + "grad_norm": 1.2556119399054029, + "learning_rate": 0.0009959956694278052, + "loss": 3.9978, + "step": 609 + }, + { + "epoch": 0.07, + "grad_norm": 1.6370960465991575, + "learning_rate": 0.000995972180810605, + "loss": 3.9639, + "step": 610 + }, + { + "epoch": 0.07, + "grad_norm": 1.5581173571942428, + "learning_rate": 0.0009959486237834964, + "loss": 3.897, + "step": 611 + }, + { + "epoch": 0.07, + "grad_norm": 2.3889100125210514, + "learning_rate": 0.0009959249983497289, + "loss": 4.269, + "step": 612 + }, + { + "epoch": 0.07, + "grad_norm": 1.4596146156794254, + "learning_rate": 0.0009959013045125612, + "loss": 4.1034, + "step": 613 + }, + { + "epoch": 0.07, + "grad_norm": 1.575489312922, + "learning_rate": 0.000995877542275261, + "loss": 4.0463, + "step": 614 + }, + { + "epoch": 0.07, + "grad_norm": 1.3724750625373647, + "learning_rate": 0.0009958537116411064, + "loss": 4.0892, + "step": 615 + }, + { + "epoch": 0.07, + "grad_norm": 1.4147832091311163, + "learning_rate": 0.000995829812613384, + "loss": 4.2942, + "step": 616 + }, + { + "epoch": 0.07, + "grad_norm": 1.545629033846108, + "learning_rate": 0.0009958058451953902, + "loss": 4.1335, + "step": 617 + }, + { + "epoch": 0.07, + "grad_norm": 1.382987493144587, + "learning_rate": 0.0009957818093904313, + "loss": 4.2096, + "step": 618 + }, + { + "epoch": 0.07, + "grad_norm": 1.3620340266808024, + "learning_rate": 0.000995757705201822, + "loss": 4.2155, + "step": 619 + }, + { + "epoch": 0.07, + "grad_norm": 2.087045663576736, + "learning_rate": 0.0009957335326328874, + "loss": 4.2768, + "step": 620 + }, + { + "epoch": 0.07, + "grad_norm": 1.5217177262847221, + "learning_rate": 0.0009957092916869613, + "loss": 4.1648, + "step": 621 + }, + { + "epoch": 0.07, + "grad_norm": 1.5085147186425738, + "learning_rate": 0.0009956849823673877, + "loss": 3.9862, + "step": 622 + }, + { + "epoch": 0.07, + "grad_norm": 1.2811490069695686, + "learning_rate": 0.0009956606046775192, + "loss": 4.1947, + "step": 623 + }, + { + "epoch": 0.07, + "grad_norm": 1.4784842952015558, + "learning_rate": 0.0009956361586207186, + "loss": 4.0288, + "step": 624 + }, + { + "epoch": 0.07, + "grad_norm": 3.001338900911761, + "learning_rate": 0.0009956116442003575, + "loss": 4.1915, + "step": 625 + }, + { + "epoch": 0.07, + "grad_norm": 1.7617320475361695, + "learning_rate": 0.0009955870614198174, + "loss": 3.9299, + "step": 626 + }, + { + "epoch": 0.07, + "grad_norm": 2.6332876781789483, + "learning_rate": 0.000995562410282489, + "loss": 4.2955, + "step": 627 + }, + { + "epoch": 0.07, + "grad_norm": 1.3639849609347612, + "learning_rate": 0.0009955376907917722, + "loss": 3.9406, + "step": 628 + }, + { + "epoch": 0.07, + "grad_norm": 1.5138907550520855, + "learning_rate": 0.0009955129029510768, + "loss": 4.2578, + "step": 629 + }, + { + "epoch": 0.07, + "grad_norm": 1.7896493816680665, + "learning_rate": 0.0009954880467638219, + "loss": 4.1592, + "step": 630 + }, + { + "epoch": 0.07, + "grad_norm": 1.235994667315824, + "learning_rate": 0.0009954631222334356, + "loss": 4.2443, + "step": 631 + }, + { + "epoch": 0.07, + "grad_norm": 2.707557780772975, + "learning_rate": 0.0009954381293633561, + "loss": 4.2409, + "step": 632 + }, + { + "epoch": 0.07, + "grad_norm": 1.4183097961636217, + "learning_rate": 0.0009954130681570305, + "loss": 4.1186, + "step": 633 + }, + { + "epoch": 0.07, + "grad_norm": 1.7298670716087088, + "learning_rate": 0.0009953879386179157, + "loss": 4.3454, + "step": 634 + }, + { + "epoch": 0.07, + "grad_norm": 3.949787643026631, + "learning_rate": 0.0009953627407494777, + "loss": 4.2464, + "step": 635 + }, + { + "epoch": 0.07, + "grad_norm": 2.532381439273023, + "learning_rate": 0.000995337474555192, + "loss": 4.0315, + "step": 636 + }, + { + "epoch": 0.07, + "grad_norm": 1.6807418462402264, + "learning_rate": 0.0009953121400385438, + "loss": 3.9328, + "step": 637 + }, + { + "epoch": 0.07, + "grad_norm": 1.8488274304785741, + "learning_rate": 0.0009952867372030273, + "loss": 4.2027, + "step": 638 + }, + { + "epoch": 0.07, + "grad_norm": 1.5004804954449449, + "learning_rate": 0.0009952612660521466, + "loss": 4.1245, + "step": 639 + }, + { + "epoch": 0.07, + "grad_norm": 1.5083268156226164, + "learning_rate": 0.0009952357265894146, + "loss": 4.0478, + "step": 640 + }, + { + "epoch": 0.07, + "grad_norm": 2.22439978004962, + "learning_rate": 0.000995210118818354, + "loss": 4.1608, + "step": 641 + }, + { + "epoch": 0.07, + "grad_norm": 1.9152396601119313, + "learning_rate": 0.0009951844427424973, + "loss": 4.1071, + "step": 642 + }, + { + "epoch": 0.07, + "grad_norm": 3.633611473231003, + "learning_rate": 0.0009951586983653858, + "loss": 4.1028, + "step": 643 + }, + { + "epoch": 0.07, + "grad_norm": 1.3858037616390062, + "learning_rate": 0.0009951328856905703, + "loss": 4.0111, + "step": 644 + }, + { + "epoch": 0.07, + "grad_norm": 1.446600371331332, + "learning_rate": 0.0009951070047216116, + "loss": 4.1573, + "step": 645 + }, + { + "epoch": 0.07, + "grad_norm": 1.454007103373636, + "learning_rate": 0.000995081055462079, + "loss": 4.1426, + "step": 646 + }, + { + "epoch": 0.07, + "grad_norm": 5.429911994205693, + "learning_rate": 0.0009950550379155519, + "loss": 4.0318, + "step": 647 + }, + { + "epoch": 0.07, + "grad_norm": 1.8483374877344385, + "learning_rate": 0.000995028952085619, + "loss": 4.0266, + "step": 648 + }, + { + "epoch": 0.07, + "grad_norm": 1.7012374213836405, + "learning_rate": 0.0009950027979758781, + "loss": 4.023, + "step": 649 + }, + { + "epoch": 0.07, + "grad_norm": 1.4842828652408195, + "learning_rate": 0.0009949765755899369, + "loss": 4.1377, + "step": 650 + }, + { + "epoch": 0.07, + "grad_norm": 5.12346651183532, + "learning_rate": 0.0009949502849314123, + "loss": 4.2203, + "step": 651 + }, + { + "epoch": 0.07, + "grad_norm": 1.9863785238513911, + "learning_rate": 0.0009949239260039304, + "loss": 4.1451, + "step": 652 + }, + { + "epoch": 0.07, + "grad_norm": 1.5733648107746758, + "learning_rate": 0.0009948974988111272, + "loss": 4.0476, + "step": 653 + }, + { + "epoch": 0.07, + "grad_norm": 1.3851246908349588, + "learning_rate": 0.0009948710033566475, + "loss": 3.9108, + "step": 654 + }, + { + "epoch": 0.08, + "grad_norm": 1.413087596958269, + "learning_rate": 0.000994844439644146, + "loss": 3.9836, + "step": 655 + }, + { + "epoch": 0.08, + "grad_norm": 1.5858218947834306, + "learning_rate": 0.0009948178076772867, + "loss": 4.0915, + "step": 656 + }, + { + "epoch": 0.08, + "grad_norm": 1.580882372405347, + "learning_rate": 0.0009947911074597428, + "loss": 4.2436, + "step": 657 + }, + { + "epoch": 0.08, + "grad_norm": 1.6617444315008543, + "learning_rate": 0.0009947643389951973, + "loss": 4.2557, + "step": 658 + }, + { + "epoch": 0.08, + "grad_norm": 1.4136657364986869, + "learning_rate": 0.0009947375022873422, + "loss": 4.1125, + "step": 659 + }, + { + "epoch": 0.08, + "grad_norm": 1.4282784659608772, + "learning_rate": 0.0009947105973398794, + "loss": 3.9847, + "step": 660 + }, + { + "epoch": 0.08, + "grad_norm": 1.802011690932386, + "learning_rate": 0.0009946836241565195, + "loss": 4.5018, + "step": 661 + }, + { + "epoch": 0.08, + "grad_norm": 1.762425890699875, + "learning_rate": 0.0009946565827409833, + "loss": 3.9913, + "step": 662 + }, + { + "epoch": 0.08, + "grad_norm": 1.4064618593143094, + "learning_rate": 0.0009946294730970005, + "loss": 4.24, + "step": 663 + }, + { + "epoch": 0.08, + "grad_norm": 2.057643590301231, + "learning_rate": 0.0009946022952283106, + "loss": 4.0534, + "step": 664 + }, + { + "epoch": 0.08, + "grad_norm": 1.3660268829451327, + "learning_rate": 0.0009945750491386616, + "loss": 4.1487, + "step": 665 + }, + { + "epoch": 0.08, + "grad_norm": 2.460997056810734, + "learning_rate": 0.0009945477348318123, + "loss": 4.351, + "step": 666 + }, + { + "epoch": 0.08, + "grad_norm": 1.5412447154686766, + "learning_rate": 0.00099452035231153, + "loss": 4.2104, + "step": 667 + }, + { + "epoch": 0.08, + "grad_norm": 3.3804350220699337, + "learning_rate": 0.0009944929015815913, + "loss": 4.1453, + "step": 668 + }, + { + "epoch": 0.08, + "grad_norm": 1.9147446832418717, + "learning_rate": 0.0009944653826457828, + "loss": 3.9929, + "step": 669 + }, + { + "epoch": 0.08, + "grad_norm": 1.3330460143249674, + "learning_rate": 0.0009944377955079004, + "loss": 4.0753, + "step": 670 + }, + { + "epoch": 0.08, + "grad_norm": 1.3134469374813962, + "learning_rate": 0.0009944101401717486, + "loss": 4.0162, + "step": 671 + }, + { + "epoch": 0.08, + "grad_norm": 1.486141109678482, + "learning_rate": 0.0009943824166411424, + "loss": 4.0119, + "step": 672 + }, + { + "epoch": 0.08, + "grad_norm": 1.5920173618340074, + "learning_rate": 0.0009943546249199056, + "loss": 4.0771, + "step": 673 + }, + { + "epoch": 0.08, + "grad_norm": 3.783770288620055, + "learning_rate": 0.0009943267650118716, + "loss": 3.8222, + "step": 674 + }, + { + "epoch": 0.08, + "grad_norm": 1.5883219744980457, + "learning_rate": 0.0009942988369208829, + "loss": 4.1278, + "step": 675 + }, + { + "epoch": 0.08, + "grad_norm": 4.252308145309663, + "learning_rate": 0.000994270840650792, + "loss": 3.8363, + "step": 676 + }, + { + "epoch": 0.08, + "grad_norm": 1.2879696726784864, + "learning_rate": 0.0009942427762054604, + "loss": 3.9805, + "step": 677 + }, + { + "epoch": 0.08, + "grad_norm": 2.1959768053912816, + "learning_rate": 0.0009942146435887589, + "loss": 3.9806, + "step": 678 + }, + { + "epoch": 0.08, + "grad_norm": 1.3530639059438625, + "learning_rate": 0.0009941864428045677, + "loss": 4.2631, + "step": 679 + }, + { + "epoch": 0.08, + "grad_norm": 1.234259697603252, + "learning_rate": 0.0009941581738567768, + "loss": 4.2267, + "step": 680 + }, + { + "epoch": 0.08, + "grad_norm": 1.5186806563407114, + "learning_rate": 0.0009941298367492854, + "loss": 4.3256, + "step": 681 + }, + { + "epoch": 0.08, + "grad_norm": 1.6869551109394108, + "learning_rate": 0.0009941014314860021, + "loss": 4.2521, + "step": 682 + }, + { + "epoch": 0.08, + "grad_norm": 1.399775753515481, + "learning_rate": 0.0009940729580708448, + "loss": 4.1332, + "step": 683 + }, + { + "epoch": 0.08, + "grad_norm": 1.4880893351267843, + "learning_rate": 0.0009940444165077408, + "loss": 4.1871, + "step": 684 + }, + { + "epoch": 0.08, + "grad_norm": 2.0846614972362167, + "learning_rate": 0.0009940158068006267, + "loss": 4.2293, + "step": 685 + }, + { + "epoch": 0.08, + "grad_norm": 1.4512658006094312, + "learning_rate": 0.0009939871289534488, + "loss": 4.0961, + "step": 686 + }, + { + "epoch": 0.08, + "grad_norm": 2.213945903257091, + "learning_rate": 0.0009939583829701628, + "loss": 4.074, + "step": 687 + }, + { + "epoch": 0.08, + "grad_norm": 1.8919105099835127, + "learning_rate": 0.0009939295688547337, + "loss": 4.1389, + "step": 688 + }, + { + "epoch": 0.08, + "grad_norm": 1.6391367835023194, + "learning_rate": 0.0009939006866111356, + "loss": 4.3763, + "step": 689 + }, + { + "epoch": 0.08, + "grad_norm": 1.5376709921483438, + "learning_rate": 0.0009938717362433524, + "loss": 4.1265, + "step": 690 + }, + { + "epoch": 0.08, + "grad_norm": 1.8821901951745228, + "learning_rate": 0.0009938427177553773, + "loss": 4.2587, + "step": 691 + }, + { + "epoch": 0.08, + "grad_norm": 1.9187971881871668, + "learning_rate": 0.0009938136311512127, + "loss": 4.1903, + "step": 692 + }, + { + "epoch": 0.08, + "grad_norm": 1.379118461132903, + "learning_rate": 0.0009937844764348707, + "loss": 3.9662, + "step": 693 + }, + { + "epoch": 0.08, + "grad_norm": 3.3290556733092025, + "learning_rate": 0.0009937552536103727, + "loss": 4.1497, + "step": 694 + }, + { + "epoch": 0.08, + "grad_norm": 2.2849707521158082, + "learning_rate": 0.000993725962681749, + "loss": 4.1624, + "step": 695 + }, + { + "epoch": 0.08, + "grad_norm": 3.731279486405896, + "learning_rate": 0.0009936966036530402, + "loss": 3.9731, + "step": 696 + }, + { + "epoch": 0.08, + "grad_norm": 1.7043471733458793, + "learning_rate": 0.0009936671765282956, + "loss": 4.3333, + "step": 697 + }, + { + "epoch": 0.08, + "grad_norm": 1.7853690135525797, + "learning_rate": 0.0009936376813115741, + "loss": 3.853, + "step": 698 + }, + { + "epoch": 0.08, + "grad_norm": 2.039520411297824, + "learning_rate": 0.000993608118006944, + "loss": 4.1309, + "step": 699 + }, + { + "epoch": 0.08, + "grad_norm": 1.5822555739936295, + "learning_rate": 0.0009935784866184833, + "loss": 4.1114, + "step": 700 + }, + { + "epoch": 0.08, + "grad_norm": 6.350469745437764, + "learning_rate": 0.0009935487871502787, + "loss": 4.1319, + "step": 701 + }, + { + "epoch": 0.08, + "grad_norm": 3.5098705682611695, + "learning_rate": 0.0009935190196064267, + "loss": 3.996, + "step": 702 + }, + { + "epoch": 0.08, + "grad_norm": 1.8037871243348724, + "learning_rate": 0.0009934891839910333, + "loss": 4.1872, + "step": 703 + }, + { + "epoch": 0.08, + "grad_norm": 4.420987942270726, + "learning_rate": 0.0009934592803082138, + "loss": 4.0095, + "step": 704 + }, + { + "epoch": 0.08, + "grad_norm": 2.3756940275583855, + "learning_rate": 0.0009934293085620929, + "loss": 4.4257, + "step": 705 + }, + { + "epoch": 0.08, + "grad_norm": 2.2349550904005206, + "learning_rate": 0.0009933992687568044, + "loss": 3.9381, + "step": 706 + }, + { + "epoch": 0.08, + "grad_norm": 1.2737657961091353, + "learning_rate": 0.0009933691608964917, + "loss": 4.2288, + "step": 707 + }, + { + "epoch": 0.08, + "grad_norm": 1.4877276508498172, + "learning_rate": 0.0009933389849853078, + "loss": 4.0222, + "step": 708 + }, + { + "epoch": 0.08, + "grad_norm": 1.9461136730740822, + "learning_rate": 0.0009933087410274148, + "loss": 4.1435, + "step": 709 + }, + { + "epoch": 0.08, + "grad_norm": 1.608162646122769, + "learning_rate": 0.0009932784290269843, + "loss": 4.2694, + "step": 710 + }, + { + "epoch": 0.08, + "grad_norm": 1.782466565429436, + "learning_rate": 0.0009932480489881974, + "loss": 4.1623, + "step": 711 + }, + { + "epoch": 0.08, + "grad_norm": 1.6479064806227908, + "learning_rate": 0.0009932176009152442, + "loss": 4.2787, + "step": 712 + }, + { + "epoch": 0.08, + "grad_norm": 1.6435651627528534, + "learning_rate": 0.0009931870848123245, + "loss": 4.2694, + "step": 713 + }, + { + "epoch": 0.08, + "grad_norm": 1.4703496447852837, + "learning_rate": 0.0009931565006836476, + "loss": 4.0805, + "step": 714 + }, + { + "epoch": 0.08, + "grad_norm": 1.4829289788949316, + "learning_rate": 0.0009931258485334315, + "loss": 4.2708, + "step": 715 + }, + { + "epoch": 0.08, + "grad_norm": 2.5900367960834214, + "learning_rate": 0.0009930951283659048, + "loss": 3.891, + "step": 716 + }, + { + "epoch": 0.08, + "grad_norm": 1.8875196289077798, + "learning_rate": 0.0009930643401853043, + "loss": 4.0185, + "step": 717 + }, + { + "epoch": 0.08, + "grad_norm": 1.461788156993926, + "learning_rate": 0.0009930334839958765, + "loss": 4.1196, + "step": 718 + }, + { + "epoch": 0.08, + "grad_norm": 1.5222977257512633, + "learning_rate": 0.000993002559801878, + "loss": 4.0555, + "step": 719 + }, + { + "epoch": 0.08, + "grad_norm": 3.8503979664643304, + "learning_rate": 0.0009929715676075736, + "loss": 3.9817, + "step": 720 + }, + { + "epoch": 0.08, + "grad_norm": 2.053695215129397, + "learning_rate": 0.0009929405074172383, + "loss": 4.4499, + "step": 721 + }, + { + "epoch": 0.08, + "grad_norm": 1.531752112452093, + "learning_rate": 0.0009929093792351567, + "loss": 3.9741, + "step": 722 + }, + { + "epoch": 0.08, + "grad_norm": 4.082202578649599, + "learning_rate": 0.0009928781830656215, + "loss": 4.3045, + "step": 723 + }, + { + "epoch": 0.08, + "grad_norm": 1.9048628451082397, + "learning_rate": 0.0009928469189129363, + "loss": 4.2997, + "step": 724 + }, + { + "epoch": 0.08, + "grad_norm": 1.3102157689427596, + "learning_rate": 0.0009928155867814131, + "loss": 4.1007, + "step": 725 + }, + { + "epoch": 0.08, + "grad_norm": 1.3166056446145378, + "learning_rate": 0.0009927841866753735, + "loss": 4.2061, + "step": 726 + }, + { + "epoch": 0.08, + "grad_norm": 1.712375209673574, + "learning_rate": 0.000992752718599149, + "loss": 4.3986, + "step": 727 + }, + { + "epoch": 0.08, + "grad_norm": 1.4959218245341306, + "learning_rate": 0.0009927211825570793, + "loss": 4.0061, + "step": 728 + }, + { + "epoch": 0.08, + "grad_norm": 2.137165697288484, + "learning_rate": 0.000992689578553515, + "loss": 4.0319, + "step": 729 + }, + { + "epoch": 0.08, + "grad_norm": 1.3762459943741774, + "learning_rate": 0.0009926579065928144, + "loss": 3.8738, + "step": 730 + }, + { + "epoch": 0.08, + "grad_norm": 2.5979882660965306, + "learning_rate": 0.000992626166679347, + "loss": 4.1582, + "step": 731 + }, + { + "epoch": 0.08, + "grad_norm": 1.7174436227796919, + "learning_rate": 0.0009925943588174897, + "loss": 4.0937, + "step": 732 + }, + { + "epoch": 0.08, + "grad_norm": 2.379061018713314, + "learning_rate": 0.0009925624830116305, + "loss": 4.3185, + "step": 733 + }, + { + "epoch": 0.08, + "grad_norm": 1.9172839175239496, + "learning_rate": 0.000992530539266166, + "loss": 4.1907, + "step": 734 + }, + { + "epoch": 0.08, + "grad_norm": 1.4845328136986566, + "learning_rate": 0.0009924985275855018, + "loss": 4.1901, + "step": 735 + }, + { + "epoch": 0.08, + "grad_norm": 1.9436011884517015, + "learning_rate": 0.000992466447974054, + "loss": 3.9915, + "step": 736 + }, + { + "epoch": 0.08, + "grad_norm": 1.5635948131574493, + "learning_rate": 0.0009924343004362466, + "loss": 4.004, + "step": 737 + }, + { + "epoch": 0.08, + "grad_norm": 1.6891859420652, + "learning_rate": 0.0009924020849765142, + "loss": 4.0188, + "step": 738 + }, + { + "epoch": 0.08, + "grad_norm": 1.8613933117912842, + "learning_rate": 0.0009923698015993003, + "loss": 3.8481, + "step": 739 + }, + { + "epoch": 0.08, + "grad_norm": 1.3842663601801606, + "learning_rate": 0.0009923374503090577, + "loss": 4.2203, + "step": 740 + }, + { + "epoch": 0.08, + "grad_norm": 2.09529312034581, + "learning_rate": 0.0009923050311102487, + "loss": 4.0311, + "step": 741 + }, + { + "epoch": 0.09, + "grad_norm": 1.6233330964871857, + "learning_rate": 0.0009922725440073446, + "loss": 4.2288, + "step": 742 + }, + { + "epoch": 0.09, + "grad_norm": 1.5915011985174687, + "learning_rate": 0.0009922399890048268, + "loss": 3.8865, + "step": 743 + }, + { + "epoch": 0.09, + "grad_norm": 1.3742105686216515, + "learning_rate": 0.0009922073661071855, + "loss": 4.0008, + "step": 744 + }, + { + "epoch": 0.09, + "grad_norm": 2.0608412808373853, + "learning_rate": 0.0009921746753189203, + "loss": 4.1446, + "step": 745 + }, + { + "epoch": 0.09, + "grad_norm": 1.5888780670070313, + "learning_rate": 0.0009921419166445404, + "loss": 4.0121, + "step": 746 + }, + { + "epoch": 0.09, + "grad_norm": 1.265541328055408, + "learning_rate": 0.0009921090900885641, + "loss": 4.1654, + "step": 747 + }, + { + "epoch": 0.09, + "grad_norm": 1.4874840245069292, + "learning_rate": 0.0009920761956555193, + "loss": 4.0256, + "step": 748 + }, + { + "epoch": 0.09, + "grad_norm": 2.23067104817785, + "learning_rate": 0.0009920432333499433, + "loss": 4.1285, + "step": 749 + }, + { + "epoch": 0.09, + "grad_norm": 1.803586401300367, + "learning_rate": 0.0009920102031763822, + "loss": 4.1161, + "step": 750 + }, + { + "epoch": 0.09, + "grad_norm": 1.2950900331867707, + "learning_rate": 0.0009919771051393922, + "loss": 4.0044, + "step": 751 + }, + { + "epoch": 0.09, + "grad_norm": 3.6537802139836675, + "learning_rate": 0.0009919439392435385, + "loss": 3.9011, + "step": 752 + }, + { + "epoch": 0.09, + "grad_norm": 1.6266253975019092, + "learning_rate": 0.0009919107054933956, + "loss": 4.2069, + "step": 753 + }, + { + "epoch": 0.09, + "grad_norm": 1.3100424463049318, + "learning_rate": 0.0009918774038935477, + "loss": 4.2516, + "step": 754 + }, + { + "epoch": 0.09, + "grad_norm": 1.568253036461531, + "learning_rate": 0.000991844034448588, + "loss": 3.9562, + "step": 755 + }, + { + "epoch": 0.09, + "grad_norm": 2.2350638813150208, + "learning_rate": 0.000991810597163119, + "loss": 4.0138, + "step": 756 + }, + { + "epoch": 0.09, + "grad_norm": 1.5601470900280976, + "learning_rate": 0.000991777092041753, + "loss": 3.9203, + "step": 757 + }, + { + "epoch": 0.09, + "grad_norm": 1.1019994123823387, + "learning_rate": 0.0009917435190891111, + "loss": 3.9658, + "step": 758 + }, + { + "epoch": 0.09, + "grad_norm": 4.436430897466474, + "learning_rate": 0.0009917098783098243, + "loss": 4.0674, + "step": 759 + }, + { + "epoch": 0.09, + "grad_norm": 1.5380130199940163, + "learning_rate": 0.0009916761697085327, + "loss": 4.2066, + "step": 760 + }, + { + "epoch": 0.09, + "grad_norm": 1.799214650424404, + "learning_rate": 0.0009916423932898857, + "loss": 4.0897, + "step": 761 + }, + { + "epoch": 0.09, + "grad_norm": 1.9883547786814795, + "learning_rate": 0.0009916085490585423, + "loss": 4.1879, + "step": 762 + }, + { + "epoch": 0.09, + "grad_norm": 1.8271889789986793, + "learning_rate": 0.0009915746370191701, + "loss": 4.1315, + "step": 763 + }, + { + "epoch": 0.09, + "grad_norm": 1.2990876773015267, + "learning_rate": 0.0009915406571764471, + "loss": 4.1213, + "step": 764 + }, + { + "epoch": 0.09, + "grad_norm": 2.312643638462174, + "learning_rate": 0.0009915066095350603, + "loss": 4.2784, + "step": 765 + }, + { + "epoch": 0.09, + "grad_norm": 3.437472453934804, + "learning_rate": 0.0009914724940997053, + "loss": 4.269, + "step": 766 + }, + { + "epoch": 0.09, + "grad_norm": 2.391918574393383, + "learning_rate": 0.0009914383108750883, + "loss": 3.9158, + "step": 767 + }, + { + "epoch": 0.09, + "grad_norm": 2.459397033169588, + "learning_rate": 0.000991404059865924, + "loss": 4.254, + "step": 768 + }, + { + "epoch": 0.09, + "grad_norm": 1.5056946961373214, + "learning_rate": 0.0009913697410769366, + "loss": 3.9395, + "step": 769 + }, + { + "epoch": 0.09, + "grad_norm": 1.588335241429291, + "learning_rate": 0.0009913353545128597, + "loss": 4.095, + "step": 770 + }, + { + "epoch": 0.09, + "grad_norm": 2.957160013007683, + "learning_rate": 0.0009913009001784364, + "loss": 4.3068, + "step": 771 + }, + { + "epoch": 0.09, + "grad_norm": 2.7564536672978432, + "learning_rate": 0.0009912663780784188, + "loss": 4.2376, + "step": 772 + }, + { + "epoch": 0.09, + "grad_norm": 1.9578620602619432, + "learning_rate": 0.000991231788217569, + "loss": 3.9008, + "step": 773 + }, + { + "epoch": 0.09, + "grad_norm": 2.0411401208523645, + "learning_rate": 0.0009911971306006575, + "loss": 4.148, + "step": 774 + }, + { + "epoch": 0.09, + "grad_norm": 1.3266303336156848, + "learning_rate": 0.000991162405232465, + "loss": 4.0964, + "step": 775 + }, + { + "epoch": 0.09, + "grad_norm": 1.7313435379031945, + "learning_rate": 0.0009911276121177812, + "loss": 4.4221, + "step": 776 + }, + { + "epoch": 0.09, + "grad_norm": 1.5773925852216288, + "learning_rate": 0.0009910927512614051, + "loss": 4.2298, + "step": 777 + }, + { + "epoch": 0.09, + "grad_norm": 1.7923144358760363, + "learning_rate": 0.000991057822668145, + "loss": 4.1448, + "step": 778 + }, + { + "epoch": 0.09, + "grad_norm": 1.9762495520882326, + "learning_rate": 0.0009910228263428186, + "loss": 4.053, + "step": 779 + }, + { + "epoch": 0.09, + "grad_norm": 1.364840414866272, + "learning_rate": 0.000990987762290253, + "loss": 3.9459, + "step": 780 + }, + { + "epoch": 0.09, + "grad_norm": 1.3170548492412446, + "learning_rate": 0.0009909526305152848, + "loss": 4.1839, + "step": 781 + }, + { + "epoch": 0.09, + "grad_norm": 1.4851904515030712, + "learning_rate": 0.0009909174310227596, + "loss": 3.9447, + "step": 782 + }, + { + "epoch": 0.09, + "grad_norm": 1.2615054080730894, + "learning_rate": 0.0009908821638175325, + "loss": 3.9618, + "step": 783 + }, + { + "epoch": 0.09, + "grad_norm": 1.4097100490736516, + "learning_rate": 0.000990846828904468, + "loss": 4.3071, + "step": 784 + }, + { + "epoch": 0.09, + "grad_norm": 2.360200741587504, + "learning_rate": 0.0009908114262884397, + "loss": 4.2358, + "step": 785 + }, + { + "epoch": 0.09, + "grad_norm": 17.20122497542913, + "learning_rate": 0.0009907759559743311, + "loss": 4.0454, + "step": 786 + }, + { + "epoch": 0.09, + "grad_norm": 1.6158886209071366, + "learning_rate": 0.0009907404179670342, + "loss": 4.074, + "step": 787 + }, + { + "epoch": 0.09, + "grad_norm": 2.2709370867141287, + "learning_rate": 0.000990704812271451, + "loss": 4.2261, + "step": 788 + }, + { + "epoch": 0.09, + "grad_norm": 1.255600981552734, + "learning_rate": 0.0009906691388924928, + "loss": 4.376, + "step": 789 + }, + { + "epoch": 0.09, + "grad_norm": 1.629476776989314, + "learning_rate": 0.0009906333978350799, + "loss": 3.9338, + "step": 790 + }, + { + "epoch": 0.09, + "grad_norm": 17.38085593199682, + "learning_rate": 0.000990597589104142, + "loss": 4.0696, + "step": 791 + }, + { + "epoch": 0.09, + "grad_norm": 8.036231504489686, + "learning_rate": 0.0009905617127046182, + "loss": 3.9258, + "step": 792 + }, + { + "epoch": 0.09, + "grad_norm": 1.4429522426425332, + "learning_rate": 0.0009905257686414573, + "loss": 4.0944, + "step": 793 + }, + { + "epoch": 0.09, + "grad_norm": 1.3579793736907337, + "learning_rate": 0.0009904897569196168, + "loss": 4.2673, + "step": 794 + }, + { + "epoch": 0.09, + "grad_norm": 1.3456311024088905, + "learning_rate": 0.0009904536775440641, + "loss": 3.9632, + "step": 795 + }, + { + "epoch": 0.09, + "grad_norm": 1.18416963357136, + "learning_rate": 0.0009904175305197752, + "loss": 4.0098, + "step": 796 + }, + { + "epoch": 0.09, + "grad_norm": 1.781172523127943, + "learning_rate": 0.0009903813158517363, + "loss": 4.0676, + "step": 797 + }, + { + "epoch": 0.09, + "grad_norm": 1.5266544065259735, + "learning_rate": 0.0009903450335449423, + "loss": 4.2542, + "step": 798 + }, + { + "epoch": 0.09, + "grad_norm": 1.744801038215624, + "learning_rate": 0.0009903086836043978, + "loss": 4.2541, + "step": 799 + }, + { + "epoch": 0.09, + "grad_norm": 1.4002368410495831, + "learning_rate": 0.0009902722660351166, + "loss": 4.1107, + "step": 800 + }, + { + "epoch": 0.09, + "grad_norm": 1.286801490908429, + "learning_rate": 0.0009902357808421218, + "loss": 3.9083, + "step": 801 + }, + { + "epoch": 0.09, + "grad_norm": 1.6759711857123307, + "learning_rate": 0.0009901992280304456, + "loss": 4.1167, + "step": 802 + }, + { + "epoch": 0.09, + "grad_norm": 1.3345841796770548, + "learning_rate": 0.00099016260760513, + "loss": 4.1082, + "step": 803 + }, + { + "epoch": 0.09, + "grad_norm": 2.1475747215420826, + "learning_rate": 0.000990125919571226, + "loss": 4.0635, + "step": 804 + }, + { + "epoch": 0.09, + "grad_norm": 2.2426641437880592, + "learning_rate": 0.000990089163933794, + "loss": 4.051, + "step": 805 + }, + { + "epoch": 0.09, + "grad_norm": 9.29529912028014, + "learning_rate": 0.000990052340697904, + "loss": 4.2248, + "step": 806 + }, + { + "epoch": 0.09, + "grad_norm": 1.441681225478933, + "learning_rate": 0.0009900154498686349, + "loss": 4.2893, + "step": 807 + }, + { + "epoch": 0.09, + "grad_norm": 1.3589736316717063, + "learning_rate": 0.0009899784914510748, + "loss": 3.811, + "step": 808 + }, + { + "epoch": 0.09, + "grad_norm": 1.5766839084839277, + "learning_rate": 0.0009899414654503216, + "loss": 4.1897, + "step": 809 + }, + { + "epoch": 0.09, + "grad_norm": 7.399332756469976, + "learning_rate": 0.0009899043718714826, + "loss": 3.8399, + "step": 810 + }, + { + "epoch": 0.09, + "grad_norm": 1.6649543075900002, + "learning_rate": 0.0009898672107196739, + "loss": 4.2661, + "step": 811 + }, + { + "epoch": 0.09, + "grad_norm": 1.8014402286618858, + "learning_rate": 0.000989829982000021, + "loss": 4.1699, + "step": 812 + }, + { + "epoch": 0.09, + "grad_norm": 1.56731257169305, + "learning_rate": 0.000989792685717659, + "loss": 3.8234, + "step": 813 + }, + { + "epoch": 0.09, + "grad_norm": 1.3844508086333362, + "learning_rate": 0.0009897553218777327, + "loss": 4.3748, + "step": 814 + }, + { + "epoch": 0.09, + "grad_norm": 1.4379101979071727, + "learning_rate": 0.000989717890485395, + "loss": 4.0049, + "step": 815 + }, + { + "epoch": 0.09, + "grad_norm": 2.569345715157095, + "learning_rate": 0.0009896803915458094, + "loss": 4.0613, + "step": 816 + }, + { + "epoch": 0.09, + "grad_norm": 1.4951417577470074, + "learning_rate": 0.0009896428250641479, + "loss": 4.0152, + "step": 817 + }, + { + "epoch": 0.09, + "grad_norm": 1.4142744268436234, + "learning_rate": 0.000989605191045592, + "loss": 4.1667, + "step": 818 + }, + { + "epoch": 0.09, + "grad_norm": 1.6652016973498989, + "learning_rate": 0.0009895674894953327, + "loss": 4.0481, + "step": 819 + }, + { + "epoch": 0.09, + "grad_norm": 2.129185343182342, + "learning_rate": 0.0009895297204185706, + "loss": 4.0972, + "step": 820 + }, + { + "epoch": 0.09, + "grad_norm": 1.4586831829493963, + "learning_rate": 0.0009894918838205145, + "loss": 4.176, + "step": 821 + }, + { + "epoch": 0.09, + "grad_norm": 1.7686321361429058, + "learning_rate": 0.0009894539797063837, + "loss": 4.0891, + "step": 822 + }, + { + "epoch": 0.09, + "grad_norm": 3.9028000496141932, + "learning_rate": 0.0009894160080814061, + "loss": 4.2762, + "step": 823 + }, + { + "epoch": 0.09, + "grad_norm": 1.6233650852294506, + "learning_rate": 0.0009893779689508194, + "loss": 4.2213, + "step": 824 + }, + { + "epoch": 0.09, + "grad_norm": 1.6995991967356299, + "learning_rate": 0.0009893398623198703, + "loss": 4.0469, + "step": 825 + }, + { + "epoch": 0.09, + "grad_norm": 1.7138268586315053, + "learning_rate": 0.0009893016881938148, + "loss": 4.4536, + "step": 826 + }, + { + "epoch": 0.09, + "grad_norm": 12.222807809258466, + "learning_rate": 0.0009892634465779185, + "loss": 4.0576, + "step": 827 + }, + { + "epoch": 0.09, + "grad_norm": 2.0979289498034968, + "learning_rate": 0.000989225137477456, + "loss": 4.099, + "step": 828 + }, + { + "epoch": 0.1, + "grad_norm": 1.7814119378572693, + "learning_rate": 0.000989186760897711, + "loss": 4.1562, + "step": 829 + }, + { + "epoch": 0.1, + "grad_norm": 1.478146420513788, + "learning_rate": 0.0009891483168439773, + "loss": 4.1868, + "step": 830 + }, + { + "epoch": 0.1, + "grad_norm": 3.714317827936107, + "learning_rate": 0.000989109805321557, + "loss": 4.1632, + "step": 831 + }, + { + "epoch": 0.1, + "grad_norm": 3.7841301342451863, + "learning_rate": 0.0009890712263357626, + "loss": 4.1657, + "step": 832 + }, + { + "epoch": 0.1, + "grad_norm": 1.5302849696184502, + "learning_rate": 0.000989032579891915, + "loss": 4.184, + "step": 833 + }, + { + "epoch": 0.1, + "grad_norm": 1.3819473192914429, + "learning_rate": 0.000988993865995345, + "loss": 4.1686, + "step": 834 + }, + { + "epoch": 0.1, + "grad_norm": 1.5227706756198385, + "learning_rate": 0.000988955084651392, + "loss": 4.1063, + "step": 835 + }, + { + "epoch": 0.1, + "grad_norm": 2.1564115222494316, + "learning_rate": 0.0009889162358654056, + "loss": 4.1291, + "step": 836 + }, + { + "epoch": 0.1, + "grad_norm": 4.230457294296131, + "learning_rate": 0.000988877319642744, + "loss": 4.2703, + "step": 837 + }, + { + "epoch": 0.1, + "grad_norm": 1.9964827995851897, + "learning_rate": 0.000988838335988775, + "loss": 4.1894, + "step": 838 + }, + { + "epoch": 0.1, + "grad_norm": 2.0406491401140276, + "learning_rate": 0.0009887992849088754, + "loss": 4.3152, + "step": 839 + }, + { + "epoch": 0.1, + "grad_norm": 1.6085421671755622, + "learning_rate": 0.000988760166408432, + "loss": 4.1769, + "step": 840 + }, + { + "epoch": 0.1, + "grad_norm": 1.8315553915979064, + "learning_rate": 0.0009887209804928404, + "loss": 4.1022, + "step": 841 + }, + { + "epoch": 0.1, + "grad_norm": 3.3794568203614013, + "learning_rate": 0.0009886817271675052, + "loss": 4.0518, + "step": 842 + }, + { + "epoch": 0.1, + "grad_norm": 2.1814928581227737, + "learning_rate": 0.000988642406437841, + "loss": 4.234, + "step": 843 + }, + { + "epoch": 0.1, + "grad_norm": 1.5869702261844612, + "learning_rate": 0.0009886030183092712, + "loss": 4.2866, + "step": 844 + }, + { + "epoch": 0.1, + "grad_norm": 1.5700726822071096, + "learning_rate": 0.0009885635627872285, + "loss": 4.2778, + "step": 845 + }, + { + "epoch": 0.1, + "grad_norm": 1.4491229621323376, + "learning_rate": 0.0009885240398771554, + "loss": 4.0254, + "step": 846 + }, + { + "epoch": 0.1, + "grad_norm": 2.8395167527650202, + "learning_rate": 0.0009884844495845029, + "loss": 3.837, + "step": 847 + }, + { + "epoch": 0.1, + "grad_norm": 2.0076759503758663, + "learning_rate": 0.000988444791914732, + "loss": 3.9375, + "step": 848 + }, + { + "epoch": 0.1, + "grad_norm": 4.3589814248214696, + "learning_rate": 0.0009884050668733126, + "loss": 3.9184, + "step": 849 + }, + { + "epoch": 0.1, + "grad_norm": 1.5590471550623999, + "learning_rate": 0.0009883652744657244, + "loss": 3.9813, + "step": 850 + }, + { + "epoch": 0.1, + "grad_norm": 2.4959186932592177, + "learning_rate": 0.0009883254146974554, + "loss": 4.064, + "step": 851 + }, + { + "epoch": 0.1, + "grad_norm": 1.9508550664621633, + "learning_rate": 0.0009882854875740037, + "loss": 3.9079, + "step": 852 + }, + { + "epoch": 0.1, + "grad_norm": 1.4567996861311256, + "learning_rate": 0.0009882454931008768, + "loss": 4.0238, + "step": 853 + }, + { + "epoch": 0.1, + "grad_norm": 1.4506773316372805, + "learning_rate": 0.0009882054312835907, + "loss": 4.0492, + "step": 854 + }, + { + "epoch": 0.1, + "grad_norm": 1.8847096976239732, + "learning_rate": 0.0009881653021276715, + "loss": 4.1245, + "step": 855 + }, + { + "epoch": 0.1, + "grad_norm": 1.4566847278233952, + "learning_rate": 0.0009881251056386541, + "loss": 4.1634, + "step": 856 + }, + { + "epoch": 0.1, + "grad_norm": 1.4388279201195637, + "learning_rate": 0.000988084841822083, + "loss": 4.0466, + "step": 857 + }, + { + "epoch": 0.1, + "grad_norm": 1.3152963008077352, + "learning_rate": 0.0009880445106835117, + "loss": 4.1887, + "step": 858 + }, + { + "epoch": 0.1, + "grad_norm": 1.829521836713052, + "learning_rate": 0.000988004112228503, + "loss": 3.9422, + "step": 859 + }, + { + "epoch": 0.1, + "grad_norm": 1.7683553482509518, + "learning_rate": 0.0009879636464626294, + "loss": 4.2288, + "step": 860 + }, + { + "epoch": 0.1, + "grad_norm": 1.5707412825756044, + "learning_rate": 0.0009879231133914721, + "loss": 4.0048, + "step": 861 + }, + { + "epoch": 0.1, + "grad_norm": 1.969529807325113, + "learning_rate": 0.000987882513020622, + "loss": 4.1848, + "step": 862 + }, + { + "epoch": 0.1, + "grad_norm": 1.3957050421234052, + "learning_rate": 0.000987841845355679, + "loss": 4.0344, + "step": 863 + }, + { + "epoch": 0.1, + "grad_norm": 1.3466275978937852, + "learning_rate": 0.0009878011104022526, + "loss": 4.2563, + "step": 864 + }, + { + "epoch": 0.1, + "grad_norm": 3.0018477168130997, + "learning_rate": 0.0009877603081659614, + "loss": 4.2019, + "step": 865 + }, + { + "epoch": 0.1, + "grad_norm": 3.6752736040736806, + "learning_rate": 0.0009877194386524334, + "loss": 4.0374, + "step": 866 + }, + { + "epoch": 0.1, + "grad_norm": 3.0727665360696284, + "learning_rate": 0.0009876785018673054, + "loss": 3.9671, + "step": 867 + }, + { + "epoch": 0.1, + "grad_norm": 1.7870156973958151, + "learning_rate": 0.0009876374978162242, + "loss": 4.1461, + "step": 868 + }, + { + "epoch": 0.1, + "grad_norm": 1.3735506606338912, + "learning_rate": 0.0009875964265048452, + "loss": 4.2198, + "step": 869 + }, + { + "epoch": 0.1, + "grad_norm": 1.3218542574864838, + "learning_rate": 0.0009875552879388336, + "loss": 3.9638, + "step": 870 + }, + { + "epoch": 0.1, + "grad_norm": 1.0943544952431903, + "learning_rate": 0.000987514082123864, + "loss": 4.0521, + "step": 871 + }, + { + "epoch": 0.1, + "grad_norm": 1.3252001668099544, + "learning_rate": 0.0009874728090656193, + "loss": 3.975, + "step": 872 + }, + { + "epoch": 0.1, + "grad_norm": 1.5014269548392112, + "learning_rate": 0.0009874314687697927, + "loss": 4.1303, + "step": 873 + }, + { + "epoch": 0.1, + "grad_norm": 1.5372271375653475, + "learning_rate": 0.0009873900612420866, + "loss": 3.867, + "step": 874 + }, + { + "epoch": 0.1, + "grad_norm": 1.2843747646745087, + "learning_rate": 0.0009873485864882116, + "loss": 4.1952, + "step": 875 + }, + { + "epoch": 0.1, + "grad_norm": 1.315868796542572, + "learning_rate": 0.000987307044513889, + "loss": 3.9562, + "step": 876 + }, + { + "epoch": 0.1, + "grad_norm": 1.6928113796533932, + "learning_rate": 0.0009872654353248486, + "loss": 4.1208, + "step": 877 + }, + { + "epoch": 0.1, + "grad_norm": 1.4173688015632502, + "learning_rate": 0.0009872237589268295, + "loss": 4.0893, + "step": 878 + }, + { + "epoch": 0.1, + "grad_norm": 2.2275560125875273, + "learning_rate": 0.00098718201532558, + "loss": 4.1059, + "step": 879 + }, + { + "epoch": 0.1, + "grad_norm": 5.122230574757189, + "learning_rate": 0.0009871402045268582, + "loss": 4.1319, + "step": 880 + }, + { + "epoch": 0.1, + "grad_norm": 1.476643187041516, + "learning_rate": 0.000987098326536431, + "loss": 4.0193, + "step": 881 + }, + { + "epoch": 0.1, + "grad_norm": 1.4040179738262668, + "learning_rate": 0.0009870563813600744, + "loss": 4.0415, + "step": 882 + }, + { + "epoch": 0.1, + "grad_norm": 2.886185077891304, + "learning_rate": 0.0009870143690035743, + "loss": 4.2009, + "step": 883 + }, + { + "epoch": 0.1, + "grad_norm": 1.9096872741562954, + "learning_rate": 0.0009869722894727251, + "loss": 3.9822, + "step": 884 + }, + { + "epoch": 0.1, + "grad_norm": 1.4362990754540939, + "learning_rate": 0.0009869301427733314, + "loss": 3.9824, + "step": 885 + }, + { + "epoch": 0.1, + "grad_norm": 3.007925592492388, + "learning_rate": 0.000986887928911206, + "loss": 4.1258, + "step": 886 + }, + { + "epoch": 0.1, + "grad_norm": 1.691359626621572, + "learning_rate": 0.0009868456478921719, + "loss": 4.0665, + "step": 887 + }, + { + "epoch": 0.1, + "grad_norm": 1.5900343477393342, + "learning_rate": 0.0009868032997220608, + "loss": 4.1633, + "step": 888 + }, + { + "epoch": 0.1, + "grad_norm": 2.105024800169, + "learning_rate": 0.0009867608844067136, + "loss": 4.0624, + "step": 889 + }, + { + "epoch": 0.1, + "grad_norm": 1.5609714372608663, + "learning_rate": 0.000986718401951981, + "loss": 4.0395, + "step": 890 + }, + { + "epoch": 0.1, + "grad_norm": 1.190038163136122, + "learning_rate": 0.0009866758523637228, + "loss": 4.1511, + "step": 891 + }, + { + "epoch": 0.1, + "grad_norm": 1.6345611315285546, + "learning_rate": 0.0009866332356478075, + "loss": 3.9277, + "step": 892 + }, + { + "epoch": 0.1, + "grad_norm": 1.4068003927564272, + "learning_rate": 0.000986590551810113, + "loss": 3.9804, + "step": 893 + }, + { + "epoch": 0.1, + "grad_norm": 2.8895820469451077, + "learning_rate": 0.0009865478008565275, + "loss": 4.2113, + "step": 894 + }, + { + "epoch": 0.1, + "grad_norm": 1.5962174824713355, + "learning_rate": 0.0009865049827929475, + "loss": 4.0425, + "step": 895 + }, + { + "epoch": 0.1, + "grad_norm": 2.110629365373305, + "learning_rate": 0.0009864620976252785, + "loss": 4.3759, + "step": 896 + }, + { + "epoch": 0.1, + "grad_norm": 1.6510518771710678, + "learning_rate": 0.000986419145359436, + "loss": 4.1769, + "step": 897 + }, + { + "epoch": 0.1, + "grad_norm": 1.644861439913146, + "learning_rate": 0.0009863761260013443, + "loss": 4.1194, + "step": 898 + }, + { + "epoch": 0.1, + "grad_norm": 1.3219321754614737, + "learning_rate": 0.0009863330395569374, + "loss": 4.114, + "step": 899 + }, + { + "epoch": 0.1, + "grad_norm": 1.721551214848858, + "learning_rate": 0.000986289886032158, + "loss": 4.1789, + "step": 900 + }, + { + "epoch": 0.1, + "grad_norm": 1.5994473048104454, + "learning_rate": 0.0009862466654329582, + "loss": 4.1776, + "step": 901 + }, + { + "epoch": 0.1, + "grad_norm": 1.5919192712676673, + "learning_rate": 0.0009862033777652997, + "loss": 4.2, + "step": 902 + }, + { + "epoch": 0.1, + "grad_norm": 1.6780732129117522, + "learning_rate": 0.000986160023035153, + "loss": 4.1056, + "step": 903 + }, + { + "epoch": 0.1, + "grad_norm": 1.2632961486485421, + "learning_rate": 0.0009861166012484982, + "loss": 3.9371, + "step": 904 + }, + { + "epoch": 0.1, + "grad_norm": 2.085402194580041, + "learning_rate": 0.0009860731124113247, + "loss": 3.8418, + "step": 905 + }, + { + "epoch": 0.1, + "grad_norm": 1.7248485021509143, + "learning_rate": 0.0009860295565296306, + "loss": 4.1345, + "step": 906 + }, + { + "epoch": 0.1, + "grad_norm": 1.2734828845705393, + "learning_rate": 0.000985985933609424, + "loss": 3.8753, + "step": 907 + }, + { + "epoch": 0.1, + "grad_norm": 1.6305859669201221, + "learning_rate": 0.0009859422436567212, + "loss": 4.0774, + "step": 908 + }, + { + "epoch": 0.1, + "grad_norm": 2.631588019075633, + "learning_rate": 0.000985898486677549, + "loss": 4.0552, + "step": 909 + }, + { + "epoch": 0.1, + "grad_norm": 5.146631533969775, + "learning_rate": 0.0009858546626779425, + "loss": 4.0444, + "step": 910 + }, + { + "epoch": 0.1, + "grad_norm": 1.4864078775176424, + "learning_rate": 0.0009858107716639464, + "loss": 4.1628, + "step": 911 + }, + { + "epoch": 0.1, + "grad_norm": 1.255878831726148, + "learning_rate": 0.000985766813641615, + "loss": 3.9995, + "step": 912 + }, + { + "epoch": 0.1, + "grad_norm": 1.3828357507560953, + "learning_rate": 0.0009857227886170112, + "loss": 3.9688, + "step": 913 + }, + { + "epoch": 0.1, + "grad_norm": 1.480334348267613, + "learning_rate": 0.0009856786965962074, + "loss": 3.9517, + "step": 914 + }, + { + "epoch": 0.1, + "grad_norm": 2.186957690755025, + "learning_rate": 0.0009856345375852853, + "loss": 4.1979, + "step": 915 + }, + { + "epoch": 0.11, + "grad_norm": 2.380674836048694, + "learning_rate": 0.0009855903115903357, + "loss": 4.1367, + "step": 916 + }, + { + "epoch": 0.11, + "grad_norm": 1.6111821549940448, + "learning_rate": 0.0009855460186174588, + "loss": 4.2874, + "step": 917 + }, + { + "epoch": 0.11, + "grad_norm": 2.2088280426213163, + "learning_rate": 0.000985501658672764, + "loss": 3.8984, + "step": 918 + }, + { + "epoch": 0.11, + "grad_norm": 2.1061845949570723, + "learning_rate": 0.0009854572317623698, + "loss": 4.0018, + "step": 919 + }, + { + "epoch": 0.11, + "grad_norm": 1.5826233210173768, + "learning_rate": 0.0009854127378924043, + "loss": 4.0688, + "step": 920 + }, + { + "epoch": 0.11, + "grad_norm": 2.319828940222645, + "learning_rate": 0.0009853681770690043, + "loss": 3.9957, + "step": 921 + }, + { + "epoch": 0.11, + "grad_norm": 2.639522521806413, + "learning_rate": 0.0009853235492983164, + "loss": 3.9084, + "step": 922 + }, + { + "epoch": 0.11, + "grad_norm": 1.2057356050329093, + "learning_rate": 0.000985278854586496, + "loss": 3.9021, + "step": 923 + }, + { + "epoch": 0.11, + "grad_norm": 1.415430097369293, + "learning_rate": 0.0009852340929397076, + "loss": 3.931, + "step": 924 + }, + { + "epoch": 0.11, + "grad_norm": 1.3068607128680707, + "learning_rate": 0.0009851892643641257, + "loss": 3.9395, + "step": 925 + }, + { + "epoch": 0.11, + "grad_norm": 1.4897963281173685, + "learning_rate": 0.000985144368865933, + "loss": 4.0867, + "step": 926 + }, + { + "epoch": 0.11, + "grad_norm": 1.2781927711086296, + "learning_rate": 0.0009850994064513226, + "loss": 4.2004, + "step": 927 + }, + { + "epoch": 0.11, + "grad_norm": 1.320457471366556, + "learning_rate": 0.000985054377126496, + "loss": 4.0297, + "step": 928 + }, + { + "epoch": 0.11, + "grad_norm": 1.5277842280122598, + "learning_rate": 0.0009850092808976639, + "loss": 4.0596, + "step": 929 + }, + { + "epoch": 0.11, + "grad_norm": 1.2518662497623179, + "learning_rate": 0.0009849641177710467, + "loss": 4.2665, + "step": 930 + }, + { + "epoch": 0.11, + "grad_norm": 1.5149067967894774, + "learning_rate": 0.0009849188877528736, + "loss": 4.2191, + "step": 931 + }, + { + "epoch": 0.11, + "grad_norm": 1.6905230949838688, + "learning_rate": 0.0009848735908493834, + "loss": 4.0743, + "step": 932 + }, + { + "epoch": 0.11, + "grad_norm": 1.1929701681584368, + "learning_rate": 0.0009848282270668238, + "loss": 4.1644, + "step": 933 + }, + { + "epoch": 0.11, + "grad_norm": 1.2257841469605364, + "learning_rate": 0.000984782796411452, + "loss": 3.9519, + "step": 934 + }, + { + "epoch": 0.11, + "grad_norm": 1.176879136105362, + "learning_rate": 0.0009847372988895343, + "loss": 3.9867, + "step": 935 + }, + { + "epoch": 0.11, + "grad_norm": 1.9992555455613272, + "learning_rate": 0.000984691734507346, + "loss": 4.0503, + "step": 936 + }, + { + "epoch": 0.11, + "grad_norm": 1.1602851654795971, + "learning_rate": 0.0009846461032711723, + "loss": 3.7602, + "step": 937 + }, + { + "epoch": 0.11, + "grad_norm": 1.8971186764060644, + "learning_rate": 0.0009846004051873066, + "loss": 4.0302, + "step": 938 + }, + { + "epoch": 0.11, + "grad_norm": 1.5633693369995434, + "learning_rate": 0.0009845546402620523, + "loss": 4.0715, + "step": 939 + }, + { + "epoch": 0.11, + "grad_norm": 1.1842397888352805, + "learning_rate": 0.0009845088085017218, + "loss": 4.1694, + "step": 940 + }, + { + "epoch": 0.11, + "grad_norm": 1.207349419360585, + "learning_rate": 0.000984462909912637, + "loss": 4.1562, + "step": 941 + }, + { + "epoch": 0.11, + "grad_norm": 3.1162379496479007, + "learning_rate": 0.0009844169445011282, + "loss": 4.2172, + "step": 942 + }, + { + "epoch": 0.11, + "grad_norm": 1.2323412620635488, + "learning_rate": 0.0009843709122735358, + "loss": 4.0566, + "step": 943 + }, + { + "epoch": 0.11, + "grad_norm": 1.422142142553048, + "learning_rate": 0.000984324813236209, + "loss": 3.9576, + "step": 944 + }, + { + "epoch": 0.11, + "grad_norm": 1.2023201557705174, + "learning_rate": 0.0009842786473955062, + "loss": 4.0906, + "step": 945 + }, + { + "epoch": 0.11, + "grad_norm": 1.285804898787631, + "learning_rate": 0.0009842324147577954, + "loss": 3.8996, + "step": 946 + }, + { + "epoch": 0.11, + "grad_norm": 1.3110230140133707, + "learning_rate": 0.0009841861153294534, + "loss": 4.1524, + "step": 947 + }, + { + "epoch": 0.11, + "grad_norm": 1.8117412704677454, + "learning_rate": 0.000984139749116866, + "loss": 4.1041, + "step": 948 + }, + { + "epoch": 0.11, + "grad_norm": 1.321714473979289, + "learning_rate": 0.0009840933161264288, + "loss": 4.2487, + "step": 949 + }, + { + "epoch": 0.11, + "grad_norm": 1.984874592434313, + "learning_rate": 0.0009840468163645462, + "loss": 4.0886, + "step": 950 + }, + { + "epoch": 0.11, + "grad_norm": 1.3566132759748373, + "learning_rate": 0.0009840002498376322, + "loss": 4.4746, + "step": 951 + }, + { + "epoch": 0.11, + "grad_norm": 1.317596157846536, + "learning_rate": 0.0009839536165521094, + "loss": 3.9685, + "step": 952 + }, + { + "epoch": 0.11, + "grad_norm": 2.4264478553663755, + "learning_rate": 0.0009839069165144103, + "loss": 4.1508, + "step": 953 + }, + { + "epoch": 0.11, + "grad_norm": 1.4048612046314342, + "learning_rate": 0.0009838601497309763, + "loss": 3.8894, + "step": 954 + }, + { + "epoch": 0.11, + "grad_norm": 4.379628338406131, + "learning_rate": 0.0009838133162082578, + "loss": 3.9963, + "step": 955 + }, + { + "epoch": 0.11, + "grad_norm": 1.3055416549723382, + "learning_rate": 0.0009837664159527146, + "loss": 4.0372, + "step": 956 + }, + { + "epoch": 0.11, + "grad_norm": 1.446338268413068, + "learning_rate": 0.0009837194489708157, + "loss": 4.1787, + "step": 957 + }, + { + "epoch": 0.11, + "grad_norm": 1.1405526985096115, + "learning_rate": 0.0009836724152690395, + "loss": 4.1131, + "step": 958 + }, + { + "epoch": 0.11, + "grad_norm": 1.2692440128786606, + "learning_rate": 0.0009836253148538731, + "loss": 4.0634, + "step": 959 + }, + { + "epoch": 0.11, + "grad_norm": 2.9207788332963034, + "learning_rate": 0.0009835781477318133, + "loss": 4.1055, + "step": 960 + }, + { + "epoch": 0.11, + "grad_norm": 1.7000128588792343, + "learning_rate": 0.000983530913909366, + "loss": 4.1679, + "step": 961 + }, + { + "epoch": 0.11, + "grad_norm": 1.2184849719349153, + "learning_rate": 0.0009834836133930458, + "loss": 4.0771, + "step": 962 + }, + { + "epoch": 0.11, + "grad_norm": 1.1866419847927028, + "learning_rate": 0.0009834362461893773, + "loss": 4.0561, + "step": 963 + }, + { + "epoch": 0.11, + "grad_norm": 1.1816064035871716, + "learning_rate": 0.0009833888123048937, + "loss": 4.0231, + "step": 964 + }, + { + "epoch": 0.11, + "grad_norm": 6.652066844511286, + "learning_rate": 0.0009833413117461378, + "loss": 4.0028, + "step": 965 + }, + { + "epoch": 0.11, + "grad_norm": 1.4053628013176662, + "learning_rate": 0.0009832937445196613, + "loss": 3.908, + "step": 966 + }, + { + "epoch": 0.11, + "grad_norm": 6.141565525936523, + "learning_rate": 0.000983246110632025, + "loss": 4.275, + "step": 967 + }, + { + "epoch": 0.11, + "grad_norm": 1.4278201820569127, + "learning_rate": 0.0009831984100897994, + "loss": 4.0542, + "step": 968 + }, + { + "epoch": 0.11, + "grad_norm": 1.199384260694025, + "learning_rate": 0.0009831506428995636, + "loss": 4.1086, + "step": 969 + }, + { + "epoch": 0.11, + "grad_norm": 1.2346590703393536, + "learning_rate": 0.0009831028090679064, + "loss": 3.9633, + "step": 970 + }, + { + "epoch": 0.11, + "grad_norm": 1.3517228515130117, + "learning_rate": 0.0009830549086014254, + "loss": 3.9808, + "step": 971 + }, + { + "epoch": 0.11, + "grad_norm": 1.926508834080503, + "learning_rate": 0.0009830069415067276, + "loss": 3.8177, + "step": 972 + }, + { + "epoch": 0.11, + "grad_norm": 4.397354890876198, + "learning_rate": 0.0009829589077904293, + "loss": 3.995, + "step": 973 + }, + { + "epoch": 0.11, + "grad_norm": 2.5506131629600852, + "learning_rate": 0.0009829108074591556, + "loss": 3.9053, + "step": 974 + }, + { + "epoch": 0.11, + "grad_norm": 1.4924478582790404, + "learning_rate": 0.0009828626405195412, + "loss": 3.6817, + "step": 975 + }, + { + "epoch": 0.11, + "grad_norm": 1.287765183688655, + "learning_rate": 0.0009828144069782296, + "loss": 4.0913, + "step": 976 + }, + { + "epoch": 0.11, + "grad_norm": 1.816691401337574, + "learning_rate": 0.0009827661068418738, + "loss": 4.1, + "step": 977 + }, + { + "epoch": 0.11, + "grad_norm": 1.1693005548358926, + "learning_rate": 0.0009827177401171361, + "loss": 4.0689, + "step": 978 + }, + { + "epoch": 0.11, + "grad_norm": 1.1144680941717817, + "learning_rate": 0.0009826693068106876, + "loss": 4.0424, + "step": 979 + }, + { + "epoch": 0.11, + "grad_norm": 1.2934458002590496, + "learning_rate": 0.0009826208069292086, + "loss": 4.0227, + "step": 980 + }, + { + "epoch": 0.11, + "grad_norm": 1.4835842495086131, + "learning_rate": 0.000982572240479389, + "loss": 4.0982, + "step": 981 + }, + { + "epoch": 0.11, + "grad_norm": 1.271483482139452, + "learning_rate": 0.0009825236074679274, + "loss": 4.0279, + "step": 982 + }, + { + "epoch": 0.11, + "grad_norm": 1.7214152289466564, + "learning_rate": 0.0009824749079015318, + "loss": 4.1749, + "step": 983 + }, + { + "epoch": 0.11, + "grad_norm": 1.5790129001826994, + "learning_rate": 0.0009824261417869197, + "loss": 3.902, + "step": 984 + }, + { + "epoch": 0.11, + "grad_norm": 1.2710077031760036, + "learning_rate": 0.000982377309130817, + "loss": 4.1015, + "step": 985 + }, + { + "epoch": 0.11, + "grad_norm": 1.079904617272973, + "learning_rate": 0.0009823284099399596, + "loss": 4.1324, + "step": 986 + }, + { + "epoch": 0.11, + "grad_norm": 1.3034347301445846, + "learning_rate": 0.000982279444221092, + "loss": 4.1291, + "step": 987 + }, + { + "epoch": 0.11, + "grad_norm": 1.2092294885850745, + "learning_rate": 0.0009822304119809682, + "loss": 4.0041, + "step": 988 + }, + { + "epoch": 0.11, + "grad_norm": 1.2730109767279272, + "learning_rate": 0.0009821813132263513, + "loss": 3.9237, + "step": 989 + }, + { + "epoch": 0.11, + "grad_norm": 1.5581751315706167, + "learning_rate": 0.0009821321479640134, + "loss": 4.2097, + "step": 990 + }, + { + "epoch": 0.11, + "grad_norm": 1.6129165168717297, + "learning_rate": 0.0009820829162007357, + "loss": 3.9833, + "step": 991 + }, + { + "epoch": 0.11, + "grad_norm": 1.2873737278633184, + "learning_rate": 0.0009820336179433091, + "loss": 4.171, + "step": 992 + }, + { + "epoch": 0.11, + "grad_norm": 1.4312967003286392, + "learning_rate": 0.0009819842531985337, + "loss": 4.0234, + "step": 993 + }, + { + "epoch": 0.11, + "grad_norm": 1.6915493531514212, + "learning_rate": 0.0009819348219732176, + "loss": 4.0339, + "step": 994 + }, + { + "epoch": 0.11, + "grad_norm": 1.5336906521936657, + "learning_rate": 0.0009818853242741796, + "loss": 4.0189, + "step": 995 + }, + { + "epoch": 0.11, + "grad_norm": 1.3149077305768606, + "learning_rate": 0.0009818357601082467, + "loss": 3.9543, + "step": 996 + }, + { + "epoch": 0.11, + "grad_norm": 1.5716224194463693, + "learning_rate": 0.0009817861294822551, + "loss": 4.0773, + "step": 997 + }, + { + "epoch": 0.11, + "grad_norm": 1.40519710985977, + "learning_rate": 0.0009817364324030506, + "loss": 3.9557, + "step": 998 + }, + { + "epoch": 0.11, + "grad_norm": 1.01794971224117, + "learning_rate": 0.0009816866688774882, + "loss": 4.1058, + "step": 999 + }, + { + "epoch": 0.11, + "grad_norm": 4.860652808748809, + "learning_rate": 0.0009816368389124314, + "loss": 4.2831, + "step": 1000 + }, + { + "epoch": 0.11, + "grad_norm": 4.402129028043682, + "learning_rate": 0.0009815869425147537, + "loss": 4.0437, + "step": 1001 + }, + { + "epoch": 0.11, + "grad_norm": 1.4823211432674894, + "learning_rate": 0.0009815369796913373, + "loss": 4.2357, + "step": 1002 + }, + { + "epoch": 0.12, + "grad_norm": 1.4790538387348315, + "learning_rate": 0.0009814869504490731, + "loss": 4.0382, + "step": 1003 + }, + { + "epoch": 0.12, + "grad_norm": 1.2730933337418775, + "learning_rate": 0.0009814368547948623, + "loss": 4.1144, + "step": 1004 + }, + { + "epoch": 0.12, + "grad_norm": 1.1489404794798845, + "learning_rate": 0.0009813866927356142, + "loss": 4.206, + "step": 1005 + }, + { + "epoch": 0.12, + "grad_norm": 2.718423900008804, + "learning_rate": 0.000981336464278248, + "loss": 4.0481, + "step": 1006 + }, + { + "epoch": 0.12, + "grad_norm": 1.9794000243327088, + "learning_rate": 0.0009812861694296917, + "loss": 4.195, + "step": 1007 + }, + { + "epoch": 0.12, + "grad_norm": 1.047499317782648, + "learning_rate": 0.0009812358081968825, + "loss": 3.8762, + "step": 1008 + }, + { + "epoch": 0.12, + "grad_norm": 1.312670767675802, + "learning_rate": 0.0009811853805867668, + "loss": 3.8914, + "step": 1009 + }, + { + "epoch": 0.12, + "grad_norm": 1.2490019835768826, + "learning_rate": 0.0009811348866063, + "loss": 4.0617, + "step": 1010 + }, + { + "epoch": 0.12, + "grad_norm": 1.4056700523809305, + "learning_rate": 0.0009810843262624467, + "loss": 4.3159, + "step": 1011 + }, + { + "epoch": 0.12, + "grad_norm": 1.6390340828956516, + "learning_rate": 0.000981033699562181, + "loss": 4.2921, + "step": 1012 + }, + { + "epoch": 0.12, + "grad_norm": 1.162608838536548, + "learning_rate": 0.0009809830065124858, + "loss": 4.1538, + "step": 1013 + }, + { + "epoch": 0.12, + "grad_norm": 1.6026138425946854, + "learning_rate": 0.0009809322471203534, + "loss": 4.0057, + "step": 1014 + }, + { + "epoch": 0.12, + "grad_norm": 1.7498607343990364, + "learning_rate": 0.0009808814213927847, + "loss": 4.1682, + "step": 1015 + }, + { + "epoch": 0.12, + "grad_norm": 1.3922144822159193, + "learning_rate": 0.0009808305293367904, + "loss": 4.1587, + "step": 1016 + }, + { + "epoch": 0.12, + "grad_norm": 26.277603721175055, + "learning_rate": 0.00098077957095939, + "loss": 4.0795, + "step": 1017 + }, + { + "epoch": 0.12, + "grad_norm": 3.8112385727503764, + "learning_rate": 0.0009807285462676122, + "loss": 4.066, + "step": 1018 + }, + { + "epoch": 0.12, + "grad_norm": 1.8959416289394118, + "learning_rate": 0.0009806774552684953, + "loss": 4.0906, + "step": 1019 + }, + { + "epoch": 0.12, + "grad_norm": 1.622234905733966, + "learning_rate": 0.0009806262979690857, + "loss": 3.919, + "step": 1020 + }, + { + "epoch": 0.12, + "grad_norm": 1.4925078199962707, + "learning_rate": 0.00098057507437644, + "loss": 3.8106, + "step": 1021 + }, + { + "epoch": 0.12, + "grad_norm": 1.4575498548230545, + "learning_rate": 0.0009805237844976234, + "loss": 4.0375, + "step": 1022 + }, + { + "epoch": 0.12, + "grad_norm": 3.7165346716241943, + "learning_rate": 0.00098047242833971, + "loss": 4.1195, + "step": 1023 + }, + { + "epoch": 0.12, + "grad_norm": 1.390624756790253, + "learning_rate": 0.0009804210059097841, + "loss": 4.0582, + "step": 1024 + }, + { + "epoch": 0.12, + "grad_norm": 1.627757532921064, + "learning_rate": 0.0009803695172149382, + "loss": 4.0557, + "step": 1025 + }, + { + "epoch": 0.12, + "grad_norm": 6.181473916727016, + "learning_rate": 0.0009803179622622738, + "loss": 3.9394, + "step": 1026 + }, + { + "epoch": 0.12, + "grad_norm": 1.9392090615767972, + "learning_rate": 0.0009802663410589023, + "loss": 4.2315, + "step": 1027 + }, + { + "epoch": 0.12, + "grad_norm": 1.3179995506591398, + "learning_rate": 0.0009802146536119437, + "loss": 3.9797, + "step": 1028 + }, + { + "epoch": 0.12, + "grad_norm": 1.45031357075047, + "learning_rate": 0.0009801628999285274, + "loss": 3.975, + "step": 1029 + }, + { + "epoch": 0.12, + "grad_norm": 5.921840222943073, + "learning_rate": 0.000980111080015792, + "loss": 4.1748, + "step": 1030 + }, + { + "epoch": 0.12, + "grad_norm": 1.2850071444042936, + "learning_rate": 0.0009800591938808846, + "loss": 3.877, + "step": 1031 + }, + { + "epoch": 0.12, + "grad_norm": 1.2793305392606493, + "learning_rate": 0.0009800072415309623, + "loss": 4.2125, + "step": 1032 + }, + { + "epoch": 0.12, + "grad_norm": 2.1523367727147753, + "learning_rate": 0.0009799552229731907, + "loss": 3.8641, + "step": 1033 + }, + { + "epoch": 0.12, + "grad_norm": 4.354578693520637, + "learning_rate": 0.0009799031382147448, + "loss": 4.1685, + "step": 1034 + }, + { + "epoch": 0.12, + "grad_norm": 3.4434103059761116, + "learning_rate": 0.000979850987262809, + "loss": 4.0627, + "step": 1035 + }, + { + "epoch": 0.12, + "grad_norm": 0.7087287464153761, + "learning_rate": 0.0009797987701245761, + "loss": 3.9373, + "step": 1036 + }, + { + "epoch": 0.12, + "grad_norm": 1.4322652783547312, + "learning_rate": 0.0009797464868072487, + "loss": 4.1694, + "step": 1037 + }, + { + "epoch": 0.12, + "grad_norm": 1.258691315160102, + "learning_rate": 0.0009796941373180384, + "loss": 4.2494, + "step": 1038 + }, + { + "epoch": 0.12, + "grad_norm": 1.3545471197187142, + "learning_rate": 0.0009796417216641653, + "loss": 4.1533, + "step": 1039 + }, + { + "epoch": 0.12, + "grad_norm": 1.0610387625266209, + "learning_rate": 0.00097958923985286, + "loss": 4.0507, + "step": 1040 + }, + { + "epoch": 0.12, + "grad_norm": 1.1441854336507373, + "learning_rate": 0.0009795366918913604, + "loss": 4.1602, + "step": 1041 + }, + { + "epoch": 0.12, + "grad_norm": 1.683186222995479, + "learning_rate": 0.0009794840777869152, + "loss": 4.1068, + "step": 1042 + }, + { + "epoch": 0.12, + "grad_norm": 1.3010141741874532, + "learning_rate": 0.0009794313975467813, + "loss": 4.3194, + "step": 1043 + }, + { + "epoch": 0.12, + "grad_norm": 1.401158710932506, + "learning_rate": 0.0009793786511782248, + "loss": 3.9999, + "step": 1044 + }, + { + "epoch": 0.12, + "grad_norm": 1.2228105604228117, + "learning_rate": 0.000979325838688521, + "loss": 4.0648, + "step": 1045 + }, + { + "epoch": 0.12, + "grad_norm": 1.3001665446288744, + "learning_rate": 0.000979272960084955, + "loss": 4.1342, + "step": 1046 + }, + { + "epoch": 0.12, + "grad_norm": 1.923309509834412, + "learning_rate": 0.0009792200153748195, + "loss": 4.0739, + "step": 1047 + }, + { + "epoch": 0.12, + "grad_norm": 3.6890518840189803, + "learning_rate": 0.0009791670045654177, + "loss": 4.0867, + "step": 1048 + }, + { + "epoch": 0.12, + "grad_norm": 1.5598404960921566, + "learning_rate": 0.0009791139276640614, + "loss": 3.8474, + "step": 1049 + }, + { + "epoch": 0.12, + "grad_norm": 1.747740107737911, + "learning_rate": 0.0009790607846780718, + "loss": 4.087, + "step": 1050 + }, + { + "epoch": 0.12, + "grad_norm": 3.5324063230961253, + "learning_rate": 0.0009790075756147783, + "loss": 4.0739, + "step": 1051 + }, + { + "epoch": 0.12, + "grad_norm": 2.171647229274092, + "learning_rate": 0.0009789543004815207, + "loss": 3.9563, + "step": 1052 + }, + { + "epoch": 0.12, + "grad_norm": 1.4985059033678876, + "learning_rate": 0.000978900959285647, + "loss": 4.2187, + "step": 1053 + }, + { + "epoch": 0.12, + "grad_norm": 6.064379861767049, + "learning_rate": 0.0009788475520345146, + "loss": 4.3221, + "step": 1054 + }, + { + "epoch": 0.12, + "grad_norm": 1.4971275840059683, + "learning_rate": 0.0009787940787354902, + "loss": 4.069, + "step": 1055 + }, + { + "epoch": 0.12, + "grad_norm": 2.263977554472576, + "learning_rate": 0.000978740539395949, + "loss": 4.4513, + "step": 1056 + }, + { + "epoch": 0.12, + "grad_norm": 1.4908730806011838, + "learning_rate": 0.0009786869340232761, + "loss": 4.3548, + "step": 1057 + }, + { + "epoch": 0.12, + "grad_norm": 1.4719282241238945, + "learning_rate": 0.0009786332626248655, + "loss": 3.9538, + "step": 1058 + }, + { + "epoch": 0.12, + "grad_norm": 1.340626012032056, + "learning_rate": 0.0009785795252081199, + "loss": 4.0992, + "step": 1059 + }, + { + "epoch": 0.12, + "grad_norm": 1.306660821928319, + "learning_rate": 0.000978525721780451, + "loss": 4.1287, + "step": 1060 + }, + { + "epoch": 0.12, + "grad_norm": 1.2300265414566451, + "learning_rate": 0.0009784718523492804, + "loss": 3.9858, + "step": 1061 + }, + { + "epoch": 0.12, + "grad_norm": 1.1588443166430544, + "learning_rate": 0.0009784179169220384, + "loss": 4.2258, + "step": 1062 + }, + { + "epoch": 0.12, + "grad_norm": 1.57047185802853, + "learning_rate": 0.0009783639155061643, + "loss": 3.7996, + "step": 1063 + }, + { + "epoch": 0.12, + "grad_norm": 1.2808171957774321, + "learning_rate": 0.0009783098481091063, + "loss": 4.0964, + "step": 1064 + }, + { + "epoch": 0.12, + "grad_norm": 1.3042816824374917, + "learning_rate": 0.0009782557147383225, + "loss": 3.9128, + "step": 1065 + }, + { + "epoch": 0.12, + "grad_norm": 1.1420537559137431, + "learning_rate": 0.0009782015154012789, + "loss": 4.1346, + "step": 1066 + }, + { + "epoch": 0.12, + "grad_norm": 1.327327290934473, + "learning_rate": 0.0009781472501054517, + "loss": 3.9409, + "step": 1067 + }, + { + "epoch": 0.12, + "grad_norm": 3.05505262983701, + "learning_rate": 0.0009780929188583256, + "loss": 4.0558, + "step": 1068 + }, + { + "epoch": 0.12, + "grad_norm": 1.1374303636099061, + "learning_rate": 0.000978038521667395, + "loss": 4.0334, + "step": 1069 + }, + { + "epoch": 0.12, + "grad_norm": 1.2314833517094605, + "learning_rate": 0.000977984058540162, + "loss": 4.0432, + "step": 1070 + }, + { + "epoch": 0.12, + "grad_norm": 1.103150152733436, + "learning_rate": 0.0009779295294841397, + "loss": 3.9199, + "step": 1071 + }, + { + "epoch": 0.12, + "grad_norm": 1.3462765793506142, + "learning_rate": 0.0009778749345068487, + "loss": 4.0583, + "step": 1072 + }, + { + "epoch": 0.12, + "grad_norm": 1.739819561788374, + "learning_rate": 0.00097782027361582, + "loss": 4.0955, + "step": 1073 + }, + { + "epoch": 0.12, + "grad_norm": 1.2288585735959598, + "learning_rate": 0.0009777655468185924, + "loss": 3.9591, + "step": 1074 + }, + { + "epoch": 0.12, + "grad_norm": 1.5945075402894981, + "learning_rate": 0.0009777107541227147, + "loss": 4.142, + "step": 1075 + }, + { + "epoch": 0.12, + "grad_norm": 2.389277968946144, + "learning_rate": 0.0009776558955357443, + "loss": 4.0779, + "step": 1076 + }, + { + "epoch": 0.12, + "grad_norm": 1.4375073059031855, + "learning_rate": 0.0009776009710652483, + "loss": 3.9904, + "step": 1077 + }, + { + "epoch": 0.12, + "grad_norm": 1.4348995213661113, + "learning_rate": 0.0009775459807188022, + "loss": 3.9242, + "step": 1078 + }, + { + "epoch": 0.12, + "grad_norm": 1.5934869452469487, + "learning_rate": 0.0009774909245039909, + "loss": 3.8806, + "step": 1079 + }, + { + "epoch": 0.12, + "grad_norm": 1.479845608471666, + "learning_rate": 0.0009774358024284082, + "loss": 4.0791, + "step": 1080 + }, + { + "epoch": 0.12, + "grad_norm": 1.117296021687194, + "learning_rate": 0.0009773806144996575, + "loss": 3.9955, + "step": 1081 + }, + { + "epoch": 0.12, + "grad_norm": 2.183772677672718, + "learning_rate": 0.0009773253607253507, + "loss": 3.9873, + "step": 1082 + }, + { + "epoch": 0.12, + "grad_norm": 1.5789962639928703, + "learning_rate": 0.000977270041113109, + "loss": 3.9276, + "step": 1083 + }, + { + "epoch": 0.12, + "grad_norm": 1.0193743656006615, + "learning_rate": 0.0009772146556705629, + "loss": 3.9963, + "step": 1084 + }, + { + "epoch": 0.12, + "grad_norm": 2.2295991421472445, + "learning_rate": 0.0009771592044053512, + "loss": 4.1109, + "step": 1085 + }, + { + "epoch": 0.12, + "grad_norm": 2.9581005964807026, + "learning_rate": 0.000977103687325123, + "loss": 4.2778, + "step": 1086 + }, + { + "epoch": 0.12, + "grad_norm": 1.2850283541503715, + "learning_rate": 0.0009770481044375356, + "loss": 3.7693, + "step": 1087 + }, + { + "epoch": 0.12, + "grad_norm": 1.1692506007914478, + "learning_rate": 0.0009769924557502553, + "loss": 4.0464, + "step": 1088 + }, + { + "epoch": 0.12, + "grad_norm": 1.360475215456834, + "learning_rate": 0.0009769367412709585, + "loss": 4.0362, + "step": 1089 + }, + { + "epoch": 0.12, + "grad_norm": 1.493610507999107, + "learning_rate": 0.0009768809610073291, + "loss": 3.9251, + "step": 1090 + }, + { + "epoch": 0.13, + "grad_norm": 2.0423129596254905, + "learning_rate": 0.0009768251149670614, + "loss": 4.0073, + "step": 1091 + }, + { + "epoch": 0.13, + "grad_norm": 1.126386563454413, + "learning_rate": 0.000976769203157858, + "loss": 4.0113, + "step": 1092 + }, + { + "epoch": 0.13, + "grad_norm": 1.6427277520739216, + "learning_rate": 0.0009767132255874315, + "loss": 4.0531, + "step": 1093 + }, + { + "epoch": 0.13, + "grad_norm": 1.4804944398674211, + "learning_rate": 0.0009766571822635022, + "loss": 3.9697, + "step": 1094 + }, + { + "epoch": 0.13, + "grad_norm": 1.4291578612082383, + "learning_rate": 0.0009766010731938007, + "loss": 4.094, + "step": 1095 + }, + { + "epoch": 0.13, + "grad_norm": 1.5141542606663225, + "learning_rate": 0.0009765448983860658, + "loss": 3.9599, + "step": 1096 + }, + { + "epoch": 0.13, + "grad_norm": 1.8248712042767385, + "learning_rate": 0.0009764886578480461, + "loss": 3.8438, + "step": 1097 + }, + { + "epoch": 0.13, + "grad_norm": 1.372364023592408, + "learning_rate": 0.0009764323515874986, + "loss": 4.222, + "step": 1098 + }, + { + "epoch": 0.13, + "grad_norm": 1.231832143212094, + "learning_rate": 0.00097637597961219, + "loss": 4.1938, + "step": 1099 + }, + { + "epoch": 0.13, + "grad_norm": 1.4374091428190032, + "learning_rate": 0.0009763195419298955, + "loss": 4.0506, + "step": 1100 + }, + { + "epoch": 0.13, + "grad_norm": 3.022287851742623, + "learning_rate": 0.0009762630385483997, + "loss": 4.0225, + "step": 1101 + }, + { + "epoch": 0.13, + "grad_norm": 1.3815036560974403, + "learning_rate": 0.000976206469475496, + "loss": 3.9241, + "step": 1102 + }, + { + "epoch": 0.13, + "grad_norm": 1.5642226838309352, + "learning_rate": 0.0009761498347189872, + "loss": 4.1569, + "step": 1103 + }, + { + "epoch": 0.13, + "grad_norm": 1.8100534976268143, + "learning_rate": 0.000976093134286685, + "loss": 4.1148, + "step": 1104 + }, + { + "epoch": 0.13, + "grad_norm": 1.1517123086389929, + "learning_rate": 0.0009760363681864102, + "loss": 4.0619, + "step": 1105 + }, + { + "epoch": 0.13, + "grad_norm": 1.612100174452912, + "learning_rate": 0.0009759795364259923, + "loss": 3.9537, + "step": 1106 + }, + { + "epoch": 0.13, + "grad_norm": 1.208072970379895, + "learning_rate": 0.0009759226390132704, + "loss": 4.0334, + "step": 1107 + }, + { + "epoch": 0.13, + "grad_norm": 1.3887279501098326, + "learning_rate": 0.0009758656759560923, + "loss": 3.9506, + "step": 1108 + }, + { + "epoch": 0.13, + "grad_norm": 1.3908901557035898, + "learning_rate": 0.0009758086472623151, + "loss": 3.9903, + "step": 1109 + }, + { + "epoch": 0.13, + "grad_norm": 5.983151859479315, + "learning_rate": 0.0009757515529398047, + "loss": 3.9413, + "step": 1110 + }, + { + "epoch": 0.13, + "grad_norm": 1.2815439165164777, + "learning_rate": 0.0009756943929964363, + "loss": 3.9637, + "step": 1111 + }, + { + "epoch": 0.13, + "grad_norm": 1.3602353462716779, + "learning_rate": 0.0009756371674400939, + "loss": 3.957, + "step": 1112 + }, + { + "epoch": 0.13, + "grad_norm": 1.3029872792102541, + "learning_rate": 0.0009755798762786707, + "loss": 4.0644, + "step": 1113 + }, + { + "epoch": 0.13, + "grad_norm": 1.1470586218455885, + "learning_rate": 0.0009755225195200689, + "loss": 3.7977, + "step": 1114 + }, + { + "epoch": 0.13, + "grad_norm": 2.3476550444761335, + "learning_rate": 0.0009754650971722, + "loss": 4.1035, + "step": 1115 + }, + { + "epoch": 0.13, + "grad_norm": 1.4704414796415541, + "learning_rate": 0.000975407609242984, + "loss": 4.0167, + "step": 1116 + }, + { + "epoch": 0.13, + "grad_norm": 3.768555022379108, + "learning_rate": 0.0009753500557403504, + "loss": 4.0987, + "step": 1117 + }, + { + "epoch": 0.13, + "grad_norm": 1.7362033927045515, + "learning_rate": 0.0009752924366722376, + "loss": 3.9892, + "step": 1118 + }, + { + "epoch": 0.13, + "grad_norm": 1.2465462081912857, + "learning_rate": 0.0009752347520465931, + "loss": 4.1808, + "step": 1119 + }, + { + "epoch": 0.13, + "grad_norm": 2.365849090450502, + "learning_rate": 0.0009751770018713734, + "loss": 4.0288, + "step": 1120 + }, + { + "epoch": 0.13, + "grad_norm": 1.2217654375116476, + "learning_rate": 0.0009751191861545439, + "loss": 4.0525, + "step": 1121 + }, + { + "epoch": 0.13, + "grad_norm": 4.082356606142538, + "learning_rate": 0.0009750613049040792, + "loss": 4.2771, + "step": 1122 + }, + { + "epoch": 0.13, + "grad_norm": 2.215685880262868, + "learning_rate": 0.0009750033581279632, + "loss": 3.9067, + "step": 1123 + }, + { + "epoch": 0.13, + "grad_norm": 1.9365207432276945, + "learning_rate": 0.0009749453458341882, + "loss": 4.2362, + "step": 1124 + }, + { + "epoch": 0.13, + "grad_norm": 1.243393173952921, + "learning_rate": 0.000974887268030756, + "loss": 4.1093, + "step": 1125 + }, + { + "epoch": 0.13, + "grad_norm": 1.113055690560134, + "learning_rate": 0.0009748291247256774, + "loss": 4.0303, + "step": 1126 + }, + { + "epoch": 0.13, + "grad_norm": 1.2195322565632427, + "learning_rate": 0.000974770915926972, + "loss": 4.2279, + "step": 1127 + }, + { + "epoch": 0.13, + "grad_norm": 1.2534133818251345, + "learning_rate": 0.0009747126416426688, + "loss": 4.1046, + "step": 1128 + }, + { + "epoch": 0.13, + "grad_norm": 2.2762379100531556, + "learning_rate": 0.0009746543018808057, + "loss": 3.9392, + "step": 1129 + }, + { + "epoch": 0.13, + "grad_norm": 1.32019351523326, + "learning_rate": 0.000974595896649429, + "loss": 4.0348, + "step": 1130 + }, + { + "epoch": 0.13, + "grad_norm": 1.2715962000111387, + "learning_rate": 0.0009745374259565953, + "loss": 3.9784, + "step": 1131 + }, + { + "epoch": 0.13, + "grad_norm": 1.037934731168759, + "learning_rate": 0.0009744788898103691, + "loss": 3.7759, + "step": 1132 + }, + { + "epoch": 0.13, + "grad_norm": 1.1862931088699729, + "learning_rate": 0.0009744202882188245, + "loss": 3.9936, + "step": 1133 + }, + { + "epoch": 0.13, + "grad_norm": 1.5470478998787935, + "learning_rate": 0.0009743616211900443, + "loss": 4.3491, + "step": 1134 + }, + { + "epoch": 0.13, + "grad_norm": 1.7282053763887202, + "learning_rate": 0.0009743028887321206, + "loss": 4.2604, + "step": 1135 + }, + { + "epoch": 0.13, + "grad_norm": 2.04449528113819, + "learning_rate": 0.0009742440908531545, + "loss": 3.8811, + "step": 1136 + }, + { + "epoch": 0.13, + "grad_norm": 1.240227069887739, + "learning_rate": 0.0009741852275612559, + "loss": 4.0271, + "step": 1137 + }, + { + "epoch": 0.13, + "grad_norm": 1.1097643794007848, + "learning_rate": 0.0009741262988645441, + "loss": 3.6696, + "step": 1138 + }, + { + "epoch": 0.13, + "grad_norm": 1.2398726318213478, + "learning_rate": 0.000974067304771147, + "loss": 4.2604, + "step": 1139 + }, + { + "epoch": 0.13, + "grad_norm": 1.0346633290848877, + "learning_rate": 0.0009740082452892017, + "loss": 3.9715, + "step": 1140 + }, + { + "epoch": 0.13, + "grad_norm": 1.1932297566003178, + "learning_rate": 0.0009739491204268545, + "loss": 3.9658, + "step": 1141 + }, + { + "epoch": 0.13, + "grad_norm": 1.205304594451489, + "learning_rate": 0.0009738899301922602, + "loss": 3.98, + "step": 1142 + }, + { + "epoch": 0.13, + "grad_norm": 1.2020454920984538, + "learning_rate": 0.0009738306745935833, + "loss": 3.7182, + "step": 1143 + }, + { + "epoch": 0.13, + "grad_norm": 1.260432300070686, + "learning_rate": 0.0009737713536389969, + "loss": 4.2419, + "step": 1144 + }, + { + "epoch": 0.13, + "grad_norm": 1.1133372525884535, + "learning_rate": 0.0009737119673366832, + "loss": 4.0668, + "step": 1145 + }, + { + "epoch": 0.13, + "grad_norm": 1.4353979020163097, + "learning_rate": 0.0009736525156948333, + "loss": 4.1236, + "step": 1146 + }, + { + "epoch": 0.13, + "grad_norm": 3.232836220065639, + "learning_rate": 0.0009735929987216476, + "loss": 3.9061, + "step": 1147 + }, + { + "epoch": 0.13, + "grad_norm": 1.5406095573245218, + "learning_rate": 0.0009735334164253351, + "loss": 4.0218, + "step": 1148 + }, + { + "epoch": 0.13, + "grad_norm": 1.9271397077839116, + "learning_rate": 0.0009734737688141142, + "loss": 4.0121, + "step": 1149 + }, + { + "epoch": 0.13, + "grad_norm": 1.8965940181337684, + "learning_rate": 0.0009734140558962123, + "loss": 3.9482, + "step": 1150 + }, + { + "epoch": 0.13, + "grad_norm": 2.617457047223621, + "learning_rate": 0.0009733542776798653, + "loss": 4.0872, + "step": 1151 + }, + { + "epoch": 0.13, + "grad_norm": 1.3860655864056333, + "learning_rate": 0.0009732944341733188, + "loss": 4.042, + "step": 1152 + }, + { + "epoch": 0.13, + "grad_norm": 1.747993380220601, + "learning_rate": 0.0009732345253848267, + "loss": 4.0696, + "step": 1153 + }, + { + "epoch": 0.13, + "grad_norm": 1.207376581682096, + "learning_rate": 0.0009731745513226526, + "loss": 3.9544, + "step": 1154 + }, + { + "epoch": 0.13, + "grad_norm": 1.2256604213238373, + "learning_rate": 0.0009731145119950686, + "loss": 4.0532, + "step": 1155 + }, + { + "epoch": 0.13, + "grad_norm": 1.0817362493268414, + "learning_rate": 0.0009730544074103562, + "loss": 4.1034, + "step": 1156 + }, + { + "epoch": 0.13, + "grad_norm": 1.1600626371391756, + "learning_rate": 0.0009729942375768055, + "loss": 4.191, + "step": 1157 + }, + { + "epoch": 0.13, + "grad_norm": 1.70580656369459, + "learning_rate": 0.0009729340025027158, + "loss": 4.1685, + "step": 1158 + }, + { + "epoch": 0.13, + "grad_norm": 3.0876482048800766, + "learning_rate": 0.0009728737021963954, + "loss": 4.1988, + "step": 1159 + }, + { + "epoch": 0.13, + "grad_norm": 1.046961117495001, + "learning_rate": 0.0009728133366661615, + "loss": 3.8947, + "step": 1160 + }, + { + "epoch": 0.13, + "grad_norm": 1.2109004698355041, + "learning_rate": 0.0009727529059203406, + "loss": 3.9896, + "step": 1161 + }, + { + "epoch": 0.13, + "grad_norm": 1.7573711158460956, + "learning_rate": 0.0009726924099672676, + "loss": 3.8891, + "step": 1162 + }, + { + "epoch": 0.13, + "grad_norm": 1.1794932763368144, + "learning_rate": 0.0009726318488152872, + "loss": 3.9238, + "step": 1163 + }, + { + "epoch": 0.13, + "grad_norm": 1.395301234022215, + "learning_rate": 0.0009725712224727523, + "loss": 3.9941, + "step": 1164 + }, + { + "epoch": 0.13, + "grad_norm": 1.1358170395882634, + "learning_rate": 0.0009725105309480253, + "loss": 3.9771, + "step": 1165 + }, + { + "epoch": 0.13, + "grad_norm": 1.2158345909415074, + "learning_rate": 0.0009724497742494776, + "loss": 3.9244, + "step": 1166 + }, + { + "epoch": 0.13, + "grad_norm": 1.2711099423714294, + "learning_rate": 0.000972388952385489, + "loss": 4.0551, + "step": 1167 + }, + { + "epoch": 0.13, + "grad_norm": 1.1051383784413746, + "learning_rate": 0.000972328065364449, + "loss": 3.8987, + "step": 1168 + }, + { + "epoch": 0.13, + "grad_norm": 1.1468378846132352, + "learning_rate": 0.0009722671131947559, + "loss": 3.819, + "step": 1169 + }, + { + "epoch": 0.13, + "grad_norm": 1.0203706772290229, + "learning_rate": 0.0009722060958848168, + "loss": 3.7945, + "step": 1170 + }, + { + "epoch": 0.13, + "grad_norm": 1.4775824263762234, + "learning_rate": 0.0009721450134430478, + "loss": 3.883, + "step": 1171 + }, + { + "epoch": 0.13, + "grad_norm": 1.3536524118876307, + "learning_rate": 0.000972083865877874, + "loss": 4.1824, + "step": 1172 + }, + { + "epoch": 0.13, + "grad_norm": 3.101880117571411, + "learning_rate": 0.0009720226531977296, + "loss": 4.1951, + "step": 1173 + }, + { + "epoch": 0.13, + "grad_norm": 1.3680630555380664, + "learning_rate": 0.0009719613754110578, + "loss": 4.136, + "step": 1174 + }, + { + "epoch": 0.13, + "grad_norm": 1.1442504024431006, + "learning_rate": 0.0009719000325263109, + "loss": 3.9422, + "step": 1175 + }, + { + "epoch": 0.13, + "grad_norm": 7.083179856231108, + "learning_rate": 0.0009718386245519495, + "loss": 4.1419, + "step": 1176 + }, + { + "epoch": 0.13, + "grad_norm": 1.1467413861101263, + "learning_rate": 0.0009717771514964439, + "loss": 3.9311, + "step": 1177 + }, + { + "epoch": 0.14, + "grad_norm": 4.848618204270367, + "learning_rate": 0.0009717156133682734, + "loss": 3.9715, + "step": 1178 + }, + { + "epoch": 0.14, + "grad_norm": 2.3302178796004167, + "learning_rate": 0.0009716540101759255, + "loss": 4.1595, + "step": 1179 + }, + { + "epoch": 0.14, + "grad_norm": 3.108320054341137, + "learning_rate": 0.0009715923419278976, + "loss": 4.1926, + "step": 1180 + } + ], + "logging_steps": 1.0, + "max_steps": 8721, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10, + "total_flos": 3861465440256.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}