{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 11187, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002681684097613301, "grad_norm": 11.546087741490192, "learning_rate": 8.936550491510278e-09, "loss": 0.3111, "step": 1 }, { "epoch": 0.0005363368195226602, "grad_norm": 9.953375608712667, "learning_rate": 1.7873100983020556e-08, "loss": 0.2913, "step": 2 }, { "epoch": 0.0008045052292839903, "grad_norm": 10.083574369824506, "learning_rate": 2.6809651474530834e-08, "loss": 0.3191, "step": 3 }, { "epoch": 0.0010726736390453205, "grad_norm": 9.447970830397251, "learning_rate": 3.574620196604111e-08, "loss": 0.2578, "step": 4 }, { "epoch": 0.0013408420488066506, "grad_norm": 11.251499396258808, "learning_rate": 4.4682752457551387e-08, "loss": 0.3124, "step": 5 }, { "epoch": 0.0016090104585679806, "grad_norm": 9.761419389834565, "learning_rate": 5.361930294906167e-08, "loss": 0.2643, "step": 6 }, { "epoch": 0.0018771788683293108, "grad_norm": 10.706098225922805, "learning_rate": 6.255585344057195e-08, "loss": 0.2931, "step": 7 }, { "epoch": 0.002145347278090641, "grad_norm": 8.634989096819421, "learning_rate": 7.149240393208222e-08, "loss": 0.2391, "step": 8 }, { "epoch": 0.002413515687851971, "grad_norm": 9.36594310115394, "learning_rate": 8.04289544235925e-08, "loss": 0.244, "step": 9 }, { "epoch": 0.0026816840976133013, "grad_norm": 9.897222332347056, "learning_rate": 8.936550491510277e-08, "loss": 0.2507, "step": 10 }, { "epoch": 0.0029498525073746312, "grad_norm": 10.265339217923621, "learning_rate": 9.830205540661306e-08, "loss": 0.3087, "step": 11 }, { "epoch": 0.003218020917135961, "grad_norm": 8.984086419805022, "learning_rate": 1.0723860589812334e-07, "loss": 0.2468, "step": 12 }, { "epoch": 0.0034861893268972916, "grad_norm": 8.435693892454692, "learning_rate": 1.1617515638963361e-07, "loss": 0.2275, "step": 13 }, { "epoch": 0.0037543577366586215, "grad_norm": 10.277637040079355, "learning_rate": 1.251117068811439e-07, "loss": 0.282, "step": 14 }, { "epoch": 0.004022526146419952, "grad_norm": 8.68011596079924, "learning_rate": 1.3404825737265417e-07, "loss": 0.2324, "step": 15 }, { "epoch": 0.004290694556181282, "grad_norm": 8.284776298222052, "learning_rate": 1.4298480786416445e-07, "loss": 0.2202, "step": 16 }, { "epoch": 0.004558862965942612, "grad_norm": 9.401693902327883, "learning_rate": 1.5192135835567472e-07, "loss": 0.2571, "step": 17 }, { "epoch": 0.004827031375703942, "grad_norm": 9.550271277538208, "learning_rate": 1.60857908847185e-07, "loss": 0.2842, "step": 18 }, { "epoch": 0.005095199785465273, "grad_norm": 9.133814633288498, "learning_rate": 1.6979445933869527e-07, "loss": 0.2666, "step": 19 }, { "epoch": 0.0053633681952266025, "grad_norm": 10.390099110752441, "learning_rate": 1.7873100983020555e-07, "loss": 0.3296, "step": 20 }, { "epoch": 0.0056315366049879325, "grad_norm": 7.785194449037081, "learning_rate": 1.8766756032171582e-07, "loss": 0.2268, "step": 21 }, { "epoch": 0.0058997050147492625, "grad_norm": 10.33638714203637, "learning_rate": 1.9660411081322612e-07, "loss": 0.2862, "step": 22 }, { "epoch": 0.006167873424510592, "grad_norm": 9.11443524442616, "learning_rate": 2.055406613047364e-07, "loss": 0.2795, "step": 23 }, { "epoch": 0.006436041834271922, "grad_norm": 8.59863373964623, "learning_rate": 2.1447721179624667e-07, "loss": 0.2628, "step": 24 }, { "epoch": 0.006704210244033253, "grad_norm": 8.049790757653193, "learning_rate": 2.2341376228775695e-07, "loss": 0.245, "step": 25 }, { "epoch": 0.006972378653794583, "grad_norm": 7.239513277027356, "learning_rate": 2.3235031277926722e-07, "loss": 0.2461, "step": 26 }, { "epoch": 0.007240547063555913, "grad_norm": 7.1298417211879235, "learning_rate": 2.412868632707775e-07, "loss": 0.2581, "step": 27 }, { "epoch": 0.007508715473317243, "grad_norm": 7.779228481865567, "learning_rate": 2.502234137622878e-07, "loss": 0.2421, "step": 28 }, { "epoch": 0.007776883883078573, "grad_norm": 6.470433619076548, "learning_rate": 2.5915996425379804e-07, "loss": 0.2052, "step": 29 }, { "epoch": 0.008045052292839904, "grad_norm": 6.236463756139538, "learning_rate": 2.6809651474530835e-07, "loss": 0.2009, "step": 30 }, { "epoch": 0.008313220702601234, "grad_norm": 4.587968342061977, "learning_rate": 2.770330652368186e-07, "loss": 0.1814, "step": 31 }, { "epoch": 0.008581389112362564, "grad_norm": 5.024911407808915, "learning_rate": 2.859696157283289e-07, "loss": 0.2327, "step": 32 }, { "epoch": 0.008849557522123894, "grad_norm": 4.59761757749979, "learning_rate": 2.9490616621983914e-07, "loss": 0.1625, "step": 33 }, { "epoch": 0.009117725931885224, "grad_norm": 4.065074858815684, "learning_rate": 3.0384271671134944e-07, "loss": 0.1453, "step": 34 }, { "epoch": 0.009385894341646554, "grad_norm": 4.078798101124644, "learning_rate": 3.127792672028597e-07, "loss": 0.1847, "step": 35 }, { "epoch": 0.009654062751407884, "grad_norm": 4.323295936444864, "learning_rate": 3.2171581769437e-07, "loss": 0.1868, "step": 36 }, { "epoch": 0.009922231161169213, "grad_norm": 3.949963666581816, "learning_rate": 3.3065236818588024e-07, "loss": 0.1906, "step": 37 }, { "epoch": 0.010190399570930545, "grad_norm": 3.2811446803161313, "learning_rate": 3.3958891867739054e-07, "loss": 0.1658, "step": 38 }, { "epoch": 0.010458567980691875, "grad_norm": 2.919088266268021, "learning_rate": 3.4852546916890084e-07, "loss": 0.1501, "step": 39 }, { "epoch": 0.010726736390453205, "grad_norm": 4.237774178797634, "learning_rate": 3.574620196604111e-07, "loss": 0.1772, "step": 40 }, { "epoch": 0.010994904800214535, "grad_norm": 3.09519794159859, "learning_rate": 3.663985701519214e-07, "loss": 0.2102, "step": 41 }, { "epoch": 0.011263073209975865, "grad_norm": 2.4541059356482333, "learning_rate": 3.7533512064343164e-07, "loss": 0.1405, "step": 42 }, { "epoch": 0.011531241619737195, "grad_norm": 2.312503120150902, "learning_rate": 3.8427167113494194e-07, "loss": 0.1433, "step": 43 }, { "epoch": 0.011799410029498525, "grad_norm": 2.3968643746407965, "learning_rate": 3.9320822162645224e-07, "loss": 0.1169, "step": 44 }, { "epoch": 0.012067578439259855, "grad_norm": 1.845564836703066, "learning_rate": 4.021447721179625e-07, "loss": 0.1158, "step": 45 }, { "epoch": 0.012335746849021185, "grad_norm": 2.211840397481718, "learning_rate": 4.110813226094728e-07, "loss": 0.1478, "step": 46 }, { "epoch": 0.012603915258782515, "grad_norm": 1.9718038161636013, "learning_rate": 4.2001787310098304e-07, "loss": 0.158, "step": 47 }, { "epoch": 0.012872083668543845, "grad_norm": 2.2523119193681467, "learning_rate": 4.2895442359249334e-07, "loss": 0.1632, "step": 48 }, { "epoch": 0.013140252078305176, "grad_norm": 2.059556702923991, "learning_rate": 4.378909740840036e-07, "loss": 0.1268, "step": 49 }, { "epoch": 0.013408420488066506, "grad_norm": 2.0221668076822907, "learning_rate": 4.468275245755139e-07, "loss": 0.1747, "step": 50 }, { "epoch": 0.013676588897827836, "grad_norm": 1.3183282774039613, "learning_rate": 4.557640750670242e-07, "loss": 0.0998, "step": 51 }, { "epoch": 0.013944757307589166, "grad_norm": 1.9226372556275977, "learning_rate": 4.6470062555853444e-07, "loss": 0.1229, "step": 52 }, { "epoch": 0.014212925717350496, "grad_norm": 1.6578946859244836, "learning_rate": 4.7363717605004474e-07, "loss": 0.125, "step": 53 }, { "epoch": 0.014481094127111826, "grad_norm": 2.00885525013515, "learning_rate": 4.82573726541555e-07, "loss": 0.1275, "step": 54 }, { "epoch": 0.014749262536873156, "grad_norm": 1.511464235405912, "learning_rate": 4.915102770330653e-07, "loss": 0.1131, "step": 55 }, { "epoch": 0.015017430946634486, "grad_norm": 1.9971433448480032, "learning_rate": 5.004468275245756e-07, "loss": 0.1147, "step": 56 }, { "epoch": 0.015285599356395816, "grad_norm": 1.5847978657117172, "learning_rate": 5.093833780160858e-07, "loss": 0.1201, "step": 57 }, { "epoch": 0.015553767766157146, "grad_norm": 2.0949312852003965, "learning_rate": 5.183199285075961e-07, "loss": 0.1002, "step": 58 }, { "epoch": 0.015821936175918476, "grad_norm": 2.166412250989985, "learning_rate": 5.272564789991064e-07, "loss": 0.1451, "step": 59 }, { "epoch": 0.016090104585679808, "grad_norm": 2.081983124826213, "learning_rate": 5.361930294906167e-07, "loss": 0.1474, "step": 60 }, { "epoch": 0.016358272995441136, "grad_norm": 2.0203000295104703, "learning_rate": 5.451295799821269e-07, "loss": 0.13, "step": 61 }, { "epoch": 0.016626441405202468, "grad_norm": 1.4814879197707982, "learning_rate": 5.540661304736372e-07, "loss": 0.1256, "step": 62 }, { "epoch": 0.016894609814963796, "grad_norm": 1.7368842792391574, "learning_rate": 5.630026809651475e-07, "loss": 0.1156, "step": 63 }, { "epoch": 0.017162778224725127, "grad_norm": 1.5395558579212352, "learning_rate": 5.719392314566578e-07, "loss": 0.121, "step": 64 }, { "epoch": 0.01743094663448646, "grad_norm": 1.5812943147268939, "learning_rate": 5.808757819481681e-07, "loss": 0.1199, "step": 65 }, { "epoch": 0.017699115044247787, "grad_norm": 2.2486492517524708, "learning_rate": 5.898123324396783e-07, "loss": 0.1176, "step": 66 }, { "epoch": 0.01796728345400912, "grad_norm": 1.5989523680967617, "learning_rate": 5.987488829311886e-07, "loss": 0.1054, "step": 67 }, { "epoch": 0.018235451863770447, "grad_norm": 1.7234831440536167, "learning_rate": 6.076854334226989e-07, "loss": 0.1416, "step": 68 }, { "epoch": 0.01850362027353178, "grad_norm": 2.158302389030971, "learning_rate": 6.166219839142092e-07, "loss": 0.1474, "step": 69 }, { "epoch": 0.018771788683293107, "grad_norm": 2.478784731943137, "learning_rate": 6.255585344057194e-07, "loss": 0.1171, "step": 70 }, { "epoch": 0.01903995709305444, "grad_norm": 1.360559845020528, "learning_rate": 6.344950848972298e-07, "loss": 0.0987, "step": 71 }, { "epoch": 0.019308125502815767, "grad_norm": 1.1204522642522026, "learning_rate": 6.4343163538874e-07, "loss": 0.0744, "step": 72 }, { "epoch": 0.0195762939125771, "grad_norm": 2.6753643379873027, "learning_rate": 6.523681858802503e-07, "loss": 0.1074, "step": 73 }, { "epoch": 0.019844462322338427, "grad_norm": 1.7157580160480599, "learning_rate": 6.613047363717605e-07, "loss": 0.0985, "step": 74 }, { "epoch": 0.02011263073209976, "grad_norm": 1.7968912989210934, "learning_rate": 6.702412868632709e-07, "loss": 0.1301, "step": 75 }, { "epoch": 0.02038079914186109, "grad_norm": 2.0129220271235435, "learning_rate": 6.791778373547811e-07, "loss": 0.093, "step": 76 }, { "epoch": 0.02064896755162242, "grad_norm": 2.089724817770505, "learning_rate": 6.881143878462914e-07, "loss": 0.0997, "step": 77 }, { "epoch": 0.02091713596138375, "grad_norm": 1.438308295940405, "learning_rate": 6.970509383378017e-07, "loss": 0.1016, "step": 78 }, { "epoch": 0.02118530437114508, "grad_norm": 2.154762741384592, "learning_rate": 7.05987488829312e-07, "loss": 0.1048, "step": 79 }, { "epoch": 0.02145347278090641, "grad_norm": 1.7224331562153552, "learning_rate": 7.149240393208222e-07, "loss": 0.1234, "step": 80 }, { "epoch": 0.02172164119066774, "grad_norm": 1.482899071115104, "learning_rate": 7.238605898123326e-07, "loss": 0.1352, "step": 81 }, { "epoch": 0.02198980960042907, "grad_norm": 3.6555661574975833, "learning_rate": 7.327971403038428e-07, "loss": 0.1377, "step": 82 }, { "epoch": 0.0222579780101904, "grad_norm": 1.50266018779977, "learning_rate": 7.417336907953531e-07, "loss": 0.1072, "step": 83 }, { "epoch": 0.02252614641995173, "grad_norm": 1.3726027929133422, "learning_rate": 7.506702412868633e-07, "loss": 0.1054, "step": 84 }, { "epoch": 0.022794314829713058, "grad_norm": 1.2830790779857988, "learning_rate": 7.596067917783737e-07, "loss": 0.074, "step": 85 }, { "epoch": 0.02306248323947439, "grad_norm": 1.465152769407914, "learning_rate": 7.685433422698839e-07, "loss": 0.1108, "step": 86 }, { "epoch": 0.02333065164923572, "grad_norm": 1.4158571606334536, "learning_rate": 7.774798927613941e-07, "loss": 0.09, "step": 87 }, { "epoch": 0.02359882005899705, "grad_norm": 1.509351898824975, "learning_rate": 7.864164432529045e-07, "loss": 0.1205, "step": 88 }, { "epoch": 0.02386698846875838, "grad_norm": 1.2708777586285185, "learning_rate": 7.953529937444148e-07, "loss": 0.1265, "step": 89 }, { "epoch": 0.02413515687851971, "grad_norm": 1.3011507433475809, "learning_rate": 8.04289544235925e-07, "loss": 0.0941, "step": 90 }, { "epoch": 0.02440332528828104, "grad_norm": 1.925125276184661, "learning_rate": 8.132260947274352e-07, "loss": 0.1126, "step": 91 }, { "epoch": 0.02467149369804237, "grad_norm": 2.163626181375269, "learning_rate": 8.221626452189456e-07, "loss": 0.1236, "step": 92 }, { "epoch": 0.0249396621078037, "grad_norm": 1.2094321310616027, "learning_rate": 8.310991957104558e-07, "loss": 0.1022, "step": 93 }, { "epoch": 0.02520783051756503, "grad_norm": 2.0119224304097574, "learning_rate": 8.400357462019661e-07, "loss": 0.1265, "step": 94 }, { "epoch": 0.02547599892732636, "grad_norm": 1.0132683770266218, "learning_rate": 8.489722966934765e-07, "loss": 0.0843, "step": 95 }, { "epoch": 0.02574416733708769, "grad_norm": 1.321449959342236, "learning_rate": 8.579088471849867e-07, "loss": 0.1177, "step": 96 }, { "epoch": 0.02601233574684902, "grad_norm": 1.2803596390894127, "learning_rate": 8.668453976764969e-07, "loss": 0.1053, "step": 97 }, { "epoch": 0.026280504156610353, "grad_norm": 1.3875316429294406, "learning_rate": 8.757819481680072e-07, "loss": 0.091, "step": 98 }, { "epoch": 0.02654867256637168, "grad_norm": 1.598445796830463, "learning_rate": 8.847184986595175e-07, "loss": 0.1115, "step": 99 }, { "epoch": 0.026816840976133013, "grad_norm": 1.3669246808889461, "learning_rate": 8.936550491510278e-07, "loss": 0.1133, "step": 100 }, { "epoch": 0.02708500938589434, "grad_norm": 1.1443200549299357, "learning_rate": 9.02591599642538e-07, "loss": 0.0865, "step": 101 }, { "epoch": 0.027353177795655673, "grad_norm": 1.01461740344635, "learning_rate": 9.115281501340484e-07, "loss": 0.0859, "step": 102 }, { "epoch": 0.027621346205417, "grad_norm": 1.6783856953916776, "learning_rate": 9.204647006255586e-07, "loss": 0.1044, "step": 103 }, { "epoch": 0.027889514615178333, "grad_norm": 1.7192371055136828, "learning_rate": 9.294012511170689e-07, "loss": 0.099, "step": 104 }, { "epoch": 0.02815768302493966, "grad_norm": 1.4339125746991326, "learning_rate": 9.383378016085791e-07, "loss": 0.0845, "step": 105 }, { "epoch": 0.028425851434700992, "grad_norm": 1.1823270250600704, "learning_rate": 9.472743521000895e-07, "loss": 0.0818, "step": 106 }, { "epoch": 0.02869401984446232, "grad_norm": 1.2149966227513322, "learning_rate": 9.562109025915998e-07, "loss": 0.1017, "step": 107 }, { "epoch": 0.028962188254223652, "grad_norm": 2.1827174599443433, "learning_rate": 9.6514745308311e-07, "loss": 0.0856, "step": 108 }, { "epoch": 0.029230356663984984, "grad_norm": 0.9497273289254222, "learning_rate": 9.740840035746204e-07, "loss": 0.0737, "step": 109 }, { "epoch": 0.029498525073746312, "grad_norm": 1.7965517534461946, "learning_rate": 9.830205540661306e-07, "loss": 0.1048, "step": 110 }, { "epoch": 0.029766693483507644, "grad_norm": 1.667039042581746, "learning_rate": 9.919571045576408e-07, "loss": 0.0842, "step": 111 }, { "epoch": 0.030034861893268972, "grad_norm": 1.673985178498018, "learning_rate": 1.0008936550491512e-06, "loss": 0.0994, "step": 112 }, { "epoch": 0.030303030303030304, "grad_norm": 1.1084966539963994, "learning_rate": 1.0098302055406614e-06, "loss": 0.088, "step": 113 }, { "epoch": 0.030571198712791632, "grad_norm": 0.9720597320902342, "learning_rate": 1.0187667560321716e-06, "loss": 0.0847, "step": 114 }, { "epoch": 0.030839367122552964, "grad_norm": 1.2788184490897927, "learning_rate": 1.027703306523682e-06, "loss": 0.082, "step": 115 }, { "epoch": 0.031107535532314292, "grad_norm": 1.5430443172288189, "learning_rate": 1.0366398570151922e-06, "loss": 0.102, "step": 116 }, { "epoch": 0.031375703942075624, "grad_norm": 1.473097911474061, "learning_rate": 1.0455764075067026e-06, "loss": 0.1184, "step": 117 }, { "epoch": 0.03164387235183695, "grad_norm": 1.2654661888914576, "learning_rate": 1.0545129579982128e-06, "loss": 0.079, "step": 118 }, { "epoch": 0.03191204076159829, "grad_norm": 1.1052864700478846, "learning_rate": 1.0634495084897232e-06, "loss": 0.0851, "step": 119 }, { "epoch": 0.032180209171359615, "grad_norm": 1.0968028766223177, "learning_rate": 1.0723860589812334e-06, "loss": 0.0802, "step": 120 }, { "epoch": 0.032448377581120944, "grad_norm": 2.119394522864081, "learning_rate": 1.0813226094727436e-06, "loss": 0.119, "step": 121 }, { "epoch": 0.03271654599088227, "grad_norm": 1.3754365860570354, "learning_rate": 1.0902591599642538e-06, "loss": 0.1094, "step": 122 }, { "epoch": 0.03298471440064361, "grad_norm": 1.0817566918879613, "learning_rate": 1.0991957104557642e-06, "loss": 0.0807, "step": 123 }, { "epoch": 0.033252882810404935, "grad_norm": 1.4660078891560506, "learning_rate": 1.1081322609472744e-06, "loss": 0.0955, "step": 124 }, { "epoch": 0.03352105122016626, "grad_norm": 1.322438350545607, "learning_rate": 1.1170688114387848e-06, "loss": 0.0868, "step": 125 }, { "epoch": 0.03378921962992759, "grad_norm": 1.6453669902239199, "learning_rate": 1.126005361930295e-06, "loss": 0.0896, "step": 126 }, { "epoch": 0.03405738803968893, "grad_norm": 1.3169906526374648, "learning_rate": 1.1349419124218054e-06, "loss": 0.1015, "step": 127 }, { "epoch": 0.034325556449450255, "grad_norm": 1.6763197171817381, "learning_rate": 1.1438784629133156e-06, "loss": 0.1004, "step": 128 }, { "epoch": 0.03459372485921158, "grad_norm": 1.3386567444919648, "learning_rate": 1.1528150134048258e-06, "loss": 0.1044, "step": 129 }, { "epoch": 0.03486189326897292, "grad_norm": 1.099399950219615, "learning_rate": 1.1617515638963362e-06, "loss": 0.1041, "step": 130 }, { "epoch": 0.035130061678734247, "grad_norm": 1.193225836465599, "learning_rate": 1.1706881143878464e-06, "loss": 0.087, "step": 131 }, { "epoch": 0.035398230088495575, "grad_norm": 1.198084681075304, "learning_rate": 1.1796246648793566e-06, "loss": 0.1094, "step": 132 }, { "epoch": 0.0356663984982569, "grad_norm": 1.6231020969071568, "learning_rate": 1.188561215370867e-06, "loss": 0.1303, "step": 133 }, { "epoch": 0.03593456690801824, "grad_norm": 1.1228555329730097, "learning_rate": 1.1974977658623772e-06, "loss": 0.0633, "step": 134 }, { "epoch": 0.036202735317779566, "grad_norm": 1.6275135486500187, "learning_rate": 1.2064343163538874e-06, "loss": 0.0975, "step": 135 }, { "epoch": 0.036470903727540895, "grad_norm": 1.0685846990212808, "learning_rate": 1.2153708668453978e-06, "loss": 0.0901, "step": 136 }, { "epoch": 0.03673907213730222, "grad_norm": 1.1412152534648226, "learning_rate": 1.2243074173369082e-06, "loss": 0.0677, "step": 137 }, { "epoch": 0.03700724054706356, "grad_norm": 1.2740147615387647, "learning_rate": 1.2332439678284184e-06, "loss": 0.1037, "step": 138 }, { "epoch": 0.037275408956824886, "grad_norm": 1.4248639957231923, "learning_rate": 1.2421805183199286e-06, "loss": 0.1157, "step": 139 }, { "epoch": 0.037543577366586214, "grad_norm": 1.4089851788721515, "learning_rate": 1.2511170688114388e-06, "loss": 0.0801, "step": 140 }, { "epoch": 0.03781174577634755, "grad_norm": 1.5692303894639283, "learning_rate": 1.2600536193029492e-06, "loss": 0.0938, "step": 141 }, { "epoch": 0.03807991418610888, "grad_norm": 1.0578937241746678, "learning_rate": 1.2689901697944596e-06, "loss": 0.0941, "step": 142 }, { "epoch": 0.038348082595870206, "grad_norm": 1.6745613942744022, "learning_rate": 1.2779267202859696e-06, "loss": 0.1175, "step": 143 }, { "epoch": 0.038616251005631534, "grad_norm": 1.3226994059594668, "learning_rate": 1.28686327077748e-06, "loss": 0.1159, "step": 144 }, { "epoch": 0.03888441941539287, "grad_norm": 1.9491993546160369, "learning_rate": 1.2957998212689904e-06, "loss": 0.1018, "step": 145 }, { "epoch": 0.0391525878251542, "grad_norm": 1.0454887522951368, "learning_rate": 1.3047363717605006e-06, "loss": 0.0888, "step": 146 }, { "epoch": 0.039420756234915526, "grad_norm": 1.1397291273859587, "learning_rate": 1.3136729222520108e-06, "loss": 0.1033, "step": 147 }, { "epoch": 0.039688924644676854, "grad_norm": 1.2775298474847148, "learning_rate": 1.322609472743521e-06, "loss": 0.0654, "step": 148 }, { "epoch": 0.03995709305443819, "grad_norm": 1.493806290236156, "learning_rate": 1.3315460232350314e-06, "loss": 0.0975, "step": 149 }, { "epoch": 0.04022526146419952, "grad_norm": 1.1898609187066904, "learning_rate": 1.3404825737265418e-06, "loss": 0.0814, "step": 150 }, { "epoch": 0.040493429873960846, "grad_norm": 2.3125461981097715, "learning_rate": 1.3494191242180518e-06, "loss": 0.1062, "step": 151 }, { "epoch": 0.04076159828372218, "grad_norm": 1.7080973010518823, "learning_rate": 1.3583556747095622e-06, "loss": 0.1411, "step": 152 }, { "epoch": 0.04102976669348351, "grad_norm": 1.4174747561664844, "learning_rate": 1.3672922252010726e-06, "loss": 0.0893, "step": 153 }, { "epoch": 0.04129793510324484, "grad_norm": 1.0231755913661127, "learning_rate": 1.3762287756925828e-06, "loss": 0.103, "step": 154 }, { "epoch": 0.041566103513006165, "grad_norm": 1.249424641538587, "learning_rate": 1.385165326184093e-06, "loss": 0.0987, "step": 155 }, { "epoch": 0.0418342719227675, "grad_norm": 1.4209494718223319, "learning_rate": 1.3941018766756034e-06, "loss": 0.097, "step": 156 }, { "epoch": 0.04210244033252883, "grad_norm": 1.0032072565441066, "learning_rate": 1.4030384271671136e-06, "loss": 0.0677, "step": 157 }, { "epoch": 0.04237060874229016, "grad_norm": 1.1314671181491947, "learning_rate": 1.411974977658624e-06, "loss": 0.0851, "step": 158 }, { "epoch": 0.042638777152051485, "grad_norm": 1.2415077124489695, "learning_rate": 1.4209115281501342e-06, "loss": 0.1071, "step": 159 }, { "epoch": 0.04290694556181282, "grad_norm": 1.3721044667011975, "learning_rate": 1.4298480786416444e-06, "loss": 0.0967, "step": 160 }, { "epoch": 0.04317511397157415, "grad_norm": 1.2972095895372564, "learning_rate": 1.4387846291331548e-06, "loss": 0.0826, "step": 161 }, { "epoch": 0.04344328238133548, "grad_norm": 1.3859470079002227, "learning_rate": 1.4477211796246652e-06, "loss": 0.0998, "step": 162 }, { "epoch": 0.04371145079109681, "grad_norm": 1.4393563901050674, "learning_rate": 1.4566577301161752e-06, "loss": 0.1162, "step": 163 }, { "epoch": 0.04397961920085814, "grad_norm": 1.188824644398946, "learning_rate": 1.4655942806076856e-06, "loss": 0.0964, "step": 164 }, { "epoch": 0.04424778761061947, "grad_norm": 1.2382403823881156, "learning_rate": 1.4745308310991958e-06, "loss": 0.1065, "step": 165 }, { "epoch": 0.0445159560203808, "grad_norm": 0.9862500206144987, "learning_rate": 1.4834673815907062e-06, "loss": 0.1199, "step": 166 }, { "epoch": 0.04478412443014213, "grad_norm": 1.4656954794170536, "learning_rate": 1.4924039320822164e-06, "loss": 0.1021, "step": 167 }, { "epoch": 0.04505229283990346, "grad_norm": 0.813784566619627, "learning_rate": 1.5013404825737266e-06, "loss": 0.061, "step": 168 }, { "epoch": 0.04532046124966479, "grad_norm": 1.0919041252310842, "learning_rate": 1.510277033065237e-06, "loss": 0.0804, "step": 169 }, { "epoch": 0.045588629659426116, "grad_norm": 1.090801767575039, "learning_rate": 1.5192135835567474e-06, "loss": 0.0658, "step": 170 }, { "epoch": 0.04585679806918745, "grad_norm": 1.7751565185330098, "learning_rate": 1.5281501340482574e-06, "loss": 0.12, "step": 171 }, { "epoch": 0.04612496647894878, "grad_norm": 1.3939004891517237, "learning_rate": 1.5370866845397678e-06, "loss": 0.0997, "step": 172 }, { "epoch": 0.04639313488871011, "grad_norm": 1.1556505204332022, "learning_rate": 1.5460232350312782e-06, "loss": 0.0853, "step": 173 }, { "epoch": 0.04666130329847144, "grad_norm": 1.8067880076074672, "learning_rate": 1.5549597855227882e-06, "loss": 0.1104, "step": 174 }, { "epoch": 0.04692947170823277, "grad_norm": 1.4818208702632059, "learning_rate": 1.5638963360142986e-06, "loss": 0.0993, "step": 175 }, { "epoch": 0.0471976401179941, "grad_norm": 1.060650239550061, "learning_rate": 1.572832886505809e-06, "loss": 0.0847, "step": 176 }, { "epoch": 0.04746580852775543, "grad_norm": 1.2781630339731105, "learning_rate": 1.5817694369973192e-06, "loss": 0.0995, "step": 177 }, { "epoch": 0.04773397693751676, "grad_norm": 1.2122131064061366, "learning_rate": 1.5907059874888296e-06, "loss": 0.0861, "step": 178 }, { "epoch": 0.04800214534727809, "grad_norm": 1.002858431869078, "learning_rate": 1.5996425379803396e-06, "loss": 0.0874, "step": 179 }, { "epoch": 0.04827031375703942, "grad_norm": 1.5415661235806746, "learning_rate": 1.60857908847185e-06, "loss": 0.1106, "step": 180 }, { "epoch": 0.04853848216680075, "grad_norm": 1.6895963052964655, "learning_rate": 1.6175156389633604e-06, "loss": 0.0772, "step": 181 }, { "epoch": 0.04880665057656208, "grad_norm": 1.906945789365188, "learning_rate": 1.6264521894548704e-06, "loss": 0.0942, "step": 182 }, { "epoch": 0.04907481898632341, "grad_norm": 1.3593115252223362, "learning_rate": 1.6353887399463808e-06, "loss": 0.1009, "step": 183 }, { "epoch": 0.04934298739608474, "grad_norm": 1.2707177701223717, "learning_rate": 1.6443252904378912e-06, "loss": 0.0809, "step": 184 }, { "epoch": 0.049611155805846074, "grad_norm": 1.0345907315663958, "learning_rate": 1.6532618409294014e-06, "loss": 0.0913, "step": 185 }, { "epoch": 0.0498793242156074, "grad_norm": 1.1327901025470064, "learning_rate": 1.6621983914209116e-06, "loss": 0.0717, "step": 186 }, { "epoch": 0.05014749262536873, "grad_norm": 1.617990122545335, "learning_rate": 1.671134941912422e-06, "loss": 0.1044, "step": 187 }, { "epoch": 0.05041566103513006, "grad_norm": 1.275293844844148, "learning_rate": 1.6800714924039322e-06, "loss": 0.1282, "step": 188 }, { "epoch": 0.050683829444891394, "grad_norm": 0.7745382686882211, "learning_rate": 1.6890080428954426e-06, "loss": 0.0549, "step": 189 }, { "epoch": 0.05095199785465272, "grad_norm": 0.9088949475908857, "learning_rate": 1.697944593386953e-06, "loss": 0.0783, "step": 190 }, { "epoch": 0.05122016626441405, "grad_norm": 1.1097357241011565, "learning_rate": 1.706881143878463e-06, "loss": 0.065, "step": 191 }, { "epoch": 0.05148833467417538, "grad_norm": 1.2961857805427983, "learning_rate": 1.7158176943699734e-06, "loss": 0.0784, "step": 192 }, { "epoch": 0.051756503083936714, "grad_norm": 1.6996578847145942, "learning_rate": 1.7247542448614838e-06, "loss": 0.0859, "step": 193 }, { "epoch": 0.05202467149369804, "grad_norm": 1.967996884334931, "learning_rate": 1.7336907953529938e-06, "loss": 0.0709, "step": 194 }, { "epoch": 0.05229283990345937, "grad_norm": 1.3595184169562722, "learning_rate": 1.7426273458445042e-06, "loss": 0.0859, "step": 195 }, { "epoch": 0.052561008313220706, "grad_norm": 1.135911814953798, "learning_rate": 1.7515638963360144e-06, "loss": 0.089, "step": 196 }, { "epoch": 0.052829176722982034, "grad_norm": 0.9422063898824449, "learning_rate": 1.7605004468275248e-06, "loss": 0.0729, "step": 197 }, { "epoch": 0.05309734513274336, "grad_norm": 0.9875862591713065, "learning_rate": 1.769436997319035e-06, "loss": 0.0754, "step": 198 }, { "epoch": 0.05336551354250469, "grad_norm": 1.3623897132632274, "learning_rate": 1.7783735478105452e-06, "loss": 0.091, "step": 199 }, { "epoch": 0.053633681952266025, "grad_norm": 0.9798723111197029, "learning_rate": 1.7873100983020556e-06, "loss": 0.0635, "step": 200 }, { "epoch": 0.053901850362027354, "grad_norm": 1.5508263393183908, "learning_rate": 1.796246648793566e-06, "loss": 0.0698, "step": 201 }, { "epoch": 0.05417001877178868, "grad_norm": 1.0606035220148973, "learning_rate": 1.805183199285076e-06, "loss": 0.0691, "step": 202 }, { "epoch": 0.05443818718155001, "grad_norm": 1.1805117048051683, "learning_rate": 1.8141197497765864e-06, "loss": 0.0949, "step": 203 }, { "epoch": 0.054706355591311345, "grad_norm": 1.1891551679992307, "learning_rate": 1.8230563002680968e-06, "loss": 0.0824, "step": 204 }, { "epoch": 0.054974524001072674, "grad_norm": 1.0004818694010413, "learning_rate": 1.831992850759607e-06, "loss": 0.0834, "step": 205 }, { "epoch": 0.055242692410834, "grad_norm": 1.7031573973935137, "learning_rate": 1.8409294012511172e-06, "loss": 0.0772, "step": 206 }, { "epoch": 0.05551086082059534, "grad_norm": 1.1022836761180141, "learning_rate": 1.8498659517426276e-06, "loss": 0.0727, "step": 207 }, { "epoch": 0.055779029230356665, "grad_norm": 0.97464431723532, "learning_rate": 1.8588025022341378e-06, "loss": 0.0799, "step": 208 }, { "epoch": 0.05604719764011799, "grad_norm": 1.1196614965663036, "learning_rate": 1.8677390527256482e-06, "loss": 0.0758, "step": 209 }, { "epoch": 0.05631536604987932, "grad_norm": 1.0214083368219748, "learning_rate": 1.8766756032171582e-06, "loss": 0.0804, "step": 210 }, { "epoch": 0.05658353445964066, "grad_norm": 1.7073239247139622, "learning_rate": 1.8856121537086686e-06, "loss": 0.0834, "step": 211 }, { "epoch": 0.056851702869401985, "grad_norm": 1.6994048032035973, "learning_rate": 1.894548704200179e-06, "loss": 0.1011, "step": 212 }, { "epoch": 0.05711987127916331, "grad_norm": 0.9450318226641397, "learning_rate": 1.903485254691689e-06, "loss": 0.0696, "step": 213 }, { "epoch": 0.05738803968892464, "grad_norm": 1.302625736882656, "learning_rate": 1.9124218051831996e-06, "loss": 0.0766, "step": 214 }, { "epoch": 0.057656208098685977, "grad_norm": 1.0940203942956976, "learning_rate": 1.9213583556747098e-06, "loss": 0.0813, "step": 215 }, { "epoch": 0.057924376508447305, "grad_norm": 1.0980544705207405, "learning_rate": 1.93029490616622e-06, "loss": 0.0975, "step": 216 }, { "epoch": 0.05819254491820863, "grad_norm": 1.003720133760256, "learning_rate": 1.93923145665773e-06, "loss": 0.0847, "step": 217 }, { "epoch": 0.05846071332796997, "grad_norm": 0.992449677103783, "learning_rate": 1.9481680071492408e-06, "loss": 0.0719, "step": 218 }, { "epoch": 0.058728881737731296, "grad_norm": 1.0841219777969402, "learning_rate": 1.9571045576407505e-06, "loss": 0.0828, "step": 219 }, { "epoch": 0.058997050147492625, "grad_norm": 1.1513634602915603, "learning_rate": 1.966041108132261e-06, "loss": 0.0795, "step": 220 }, { "epoch": 0.05926521855725395, "grad_norm": 1.0694947066944582, "learning_rate": 1.9749776586237714e-06, "loss": 0.0701, "step": 221 }, { "epoch": 0.05953338696701529, "grad_norm": 1.0703881862769697, "learning_rate": 1.9839142091152816e-06, "loss": 0.0688, "step": 222 }, { "epoch": 0.059801555376776616, "grad_norm": 1.1841394468171016, "learning_rate": 1.992850759606792e-06, "loss": 0.0657, "step": 223 }, { "epoch": 0.060069723786537944, "grad_norm": 1.1142825478669494, "learning_rate": 2.0017873100983024e-06, "loss": 0.0951, "step": 224 }, { "epoch": 0.06033789219629927, "grad_norm": 0.9169747298049504, "learning_rate": 2.0107238605898126e-06, "loss": 0.0838, "step": 225 }, { "epoch": 0.06060606060606061, "grad_norm": 1.0149479432400015, "learning_rate": 2.0196604110813228e-06, "loss": 0.0795, "step": 226 }, { "epoch": 0.060874229015821936, "grad_norm": 0.9592934458994421, "learning_rate": 2.028596961572833e-06, "loss": 0.0621, "step": 227 }, { "epoch": 0.061142397425583264, "grad_norm": 0.8364743216317536, "learning_rate": 2.037533512064343e-06, "loss": 0.0552, "step": 228 }, { "epoch": 0.0614105658353446, "grad_norm": 1.458384943068544, "learning_rate": 2.0464700625558538e-06, "loss": 0.0674, "step": 229 }, { "epoch": 0.06167873424510593, "grad_norm": 1.1258437427885113, "learning_rate": 2.055406613047364e-06, "loss": 0.0885, "step": 230 }, { "epoch": 0.061946902654867256, "grad_norm": 1.2821790247045186, "learning_rate": 2.064343163538874e-06, "loss": 0.0884, "step": 231 }, { "epoch": 0.062215071064628584, "grad_norm": 1.1703389731992098, "learning_rate": 2.0732797140303844e-06, "loss": 0.0953, "step": 232 }, { "epoch": 0.06248323947438992, "grad_norm": 1.1379252307096102, "learning_rate": 2.0822162645218946e-06, "loss": 0.0646, "step": 233 }, { "epoch": 0.06275140788415125, "grad_norm": 1.3492186294879762, "learning_rate": 2.091152815013405e-06, "loss": 0.0542, "step": 234 }, { "epoch": 0.06301957629391258, "grad_norm": 1.1813225197161794, "learning_rate": 2.1000893655049154e-06, "loss": 0.0794, "step": 235 }, { "epoch": 0.0632877447036739, "grad_norm": 1.3245581725610034, "learning_rate": 2.1090259159964256e-06, "loss": 0.1079, "step": 236 }, { "epoch": 0.06355591311343524, "grad_norm": 1.0003102403199535, "learning_rate": 2.1179624664879358e-06, "loss": 0.0865, "step": 237 }, { "epoch": 0.06382408152319657, "grad_norm": 1.2509886633142286, "learning_rate": 2.1268990169794464e-06, "loss": 0.0952, "step": 238 }, { "epoch": 0.0640922499329579, "grad_norm": 1.028817385796426, "learning_rate": 2.135835567470956e-06, "loss": 0.0844, "step": 239 }, { "epoch": 0.06436041834271923, "grad_norm": 1.0010745672393349, "learning_rate": 2.1447721179624668e-06, "loss": 0.0636, "step": 240 }, { "epoch": 0.06462858675248055, "grad_norm": 1.4693215373072745, "learning_rate": 2.153708668453977e-06, "loss": 0.0929, "step": 241 }, { "epoch": 0.06489675516224189, "grad_norm": 1.1922131321695308, "learning_rate": 2.162645218945487e-06, "loss": 0.0992, "step": 242 }, { "epoch": 0.06516492357200322, "grad_norm": 0.8317363838035656, "learning_rate": 2.1715817694369974e-06, "loss": 0.0686, "step": 243 }, { "epoch": 0.06543309198176454, "grad_norm": 0.9917164531210007, "learning_rate": 2.1805183199285075e-06, "loss": 0.0721, "step": 244 }, { "epoch": 0.06570126039152588, "grad_norm": 0.8756151194293332, "learning_rate": 2.189454870420018e-06, "loss": 0.0918, "step": 245 }, { "epoch": 0.06596942880128721, "grad_norm": 0.8521832570277006, "learning_rate": 2.1983914209115284e-06, "loss": 0.0624, "step": 246 }, { "epoch": 0.06623759721104854, "grad_norm": 1.1242564816638847, "learning_rate": 2.2073279714030386e-06, "loss": 0.0936, "step": 247 }, { "epoch": 0.06650576562080987, "grad_norm": 1.2103044921972406, "learning_rate": 2.2162645218945488e-06, "loss": 0.0618, "step": 248 }, { "epoch": 0.0667739340305712, "grad_norm": 0.9597814099444822, "learning_rate": 2.2252010723860594e-06, "loss": 0.089, "step": 249 }, { "epoch": 0.06704210244033253, "grad_norm": 2.5217819195007936, "learning_rate": 2.2341376228775696e-06, "loss": 0.089, "step": 250 }, { "epoch": 0.06731027085009386, "grad_norm": 1.1028408365990845, "learning_rate": 2.2430741733690798e-06, "loss": 0.0781, "step": 251 }, { "epoch": 0.06757843925985518, "grad_norm": 1.084549270598852, "learning_rate": 2.25201072386059e-06, "loss": 0.0782, "step": 252 }, { "epoch": 0.06784660766961652, "grad_norm": 1.1984779264748933, "learning_rate": 2.2609472743521e-06, "loss": 0.0795, "step": 253 }, { "epoch": 0.06811477607937785, "grad_norm": 1.0664352457383879, "learning_rate": 2.2698838248436108e-06, "loss": 0.0656, "step": 254 }, { "epoch": 0.06838294448913917, "grad_norm": 1.2451774925418897, "learning_rate": 2.278820375335121e-06, "loss": 0.0824, "step": 255 }, { "epoch": 0.06865111289890051, "grad_norm": 1.1133994485426701, "learning_rate": 2.287756925826631e-06, "loss": 0.0771, "step": 256 }, { "epoch": 0.06891928130866184, "grad_norm": 0.9239024095356093, "learning_rate": 2.2966934763181414e-06, "loss": 0.0782, "step": 257 }, { "epoch": 0.06918744971842317, "grad_norm": 1.0055580512747866, "learning_rate": 2.3056300268096516e-06, "loss": 0.0803, "step": 258 }, { "epoch": 0.0694556181281845, "grad_norm": 1.1608691448290145, "learning_rate": 2.3145665773011617e-06, "loss": 0.0586, "step": 259 }, { "epoch": 0.06972378653794584, "grad_norm": 1.2539259150831619, "learning_rate": 2.3235031277926724e-06, "loss": 0.0929, "step": 260 }, { "epoch": 0.06999195494770716, "grad_norm": 1.0651457458640197, "learning_rate": 2.3324396782841826e-06, "loss": 0.0864, "step": 261 }, { "epoch": 0.07026012335746849, "grad_norm": 0.6704416714824581, "learning_rate": 2.3413762287756928e-06, "loss": 0.0652, "step": 262 }, { "epoch": 0.07052829176722981, "grad_norm": 0.7900206228074174, "learning_rate": 2.350312779267203e-06, "loss": 0.0554, "step": 263 }, { "epoch": 0.07079646017699115, "grad_norm": 0.9597113694947739, "learning_rate": 2.359249329758713e-06, "loss": 0.0884, "step": 264 }, { "epoch": 0.07106462858675248, "grad_norm": 0.8488229827769775, "learning_rate": 2.3681858802502238e-06, "loss": 0.0663, "step": 265 }, { "epoch": 0.0713327969965138, "grad_norm": 0.7119387481319007, "learning_rate": 2.377122430741734e-06, "loss": 0.0637, "step": 266 }, { "epoch": 0.07160096540627514, "grad_norm": 1.6074978329230172, "learning_rate": 2.386058981233244e-06, "loss": 0.092, "step": 267 }, { "epoch": 0.07186913381603648, "grad_norm": 0.7059400238368266, "learning_rate": 2.3949955317247544e-06, "loss": 0.0553, "step": 268 }, { "epoch": 0.0721373022257978, "grad_norm": 0.8065024141029649, "learning_rate": 2.403932082216265e-06, "loss": 0.0661, "step": 269 }, { "epoch": 0.07240547063555913, "grad_norm": 1.627514462340302, "learning_rate": 2.4128686327077747e-06, "loss": 0.0793, "step": 270 }, { "epoch": 0.07267363904532047, "grad_norm": 0.8530652841457095, "learning_rate": 2.4218051831992854e-06, "loss": 0.0533, "step": 271 }, { "epoch": 0.07294180745508179, "grad_norm": 1.7280767664486572, "learning_rate": 2.4307417336907956e-06, "loss": 0.1086, "step": 272 }, { "epoch": 0.07320997586484312, "grad_norm": 1.1797278482687943, "learning_rate": 2.4396782841823058e-06, "loss": 0.0888, "step": 273 }, { "epoch": 0.07347814427460445, "grad_norm": 1.1738974422848678, "learning_rate": 2.4486148346738164e-06, "loss": 0.0805, "step": 274 }, { "epoch": 0.07374631268436578, "grad_norm": 1.2072795316038485, "learning_rate": 2.457551385165326e-06, "loss": 0.0908, "step": 275 }, { "epoch": 0.07401448109412712, "grad_norm": 1.4159990356977101, "learning_rate": 2.4664879356568368e-06, "loss": 0.0761, "step": 276 }, { "epoch": 0.07428264950388844, "grad_norm": 1.0466463023915136, "learning_rate": 2.475424486148347e-06, "loss": 0.1091, "step": 277 }, { "epoch": 0.07455081791364977, "grad_norm": 1.2665158322129189, "learning_rate": 2.484361036639857e-06, "loss": 0.0846, "step": 278 }, { "epoch": 0.07481898632341111, "grad_norm": 1.0631858461709345, "learning_rate": 2.4932975871313673e-06, "loss": 0.0821, "step": 279 }, { "epoch": 0.07508715473317243, "grad_norm": 1.1710981073280144, "learning_rate": 2.5022341376228775e-06, "loss": 0.1136, "step": 280 }, { "epoch": 0.07535532314293376, "grad_norm": 0.828022804384634, "learning_rate": 2.511170688114388e-06, "loss": 0.0624, "step": 281 }, { "epoch": 0.0756234915526951, "grad_norm": 1.508075132888285, "learning_rate": 2.5201072386058984e-06, "loss": 0.0821, "step": 282 }, { "epoch": 0.07589165996245642, "grad_norm": 0.9022038984688973, "learning_rate": 2.5290437890974086e-06, "loss": 0.0652, "step": 283 }, { "epoch": 0.07615982837221776, "grad_norm": 0.9393358799580087, "learning_rate": 2.537980339588919e-06, "loss": 0.0849, "step": 284 }, { "epoch": 0.07642799678197908, "grad_norm": 0.9690669281510286, "learning_rate": 2.5469168900804294e-06, "loss": 0.0882, "step": 285 }, { "epoch": 0.07669616519174041, "grad_norm": 1.003417487869948, "learning_rate": 2.555853440571939e-06, "loss": 0.0841, "step": 286 }, { "epoch": 0.07696433360150175, "grad_norm": 2.0460151772606867, "learning_rate": 2.5647899910634498e-06, "loss": 0.1027, "step": 287 }, { "epoch": 0.07723250201126307, "grad_norm": 0.7215361864674158, "learning_rate": 2.57372654155496e-06, "loss": 0.0618, "step": 288 }, { "epoch": 0.0775006704210244, "grad_norm": 0.9582439103566321, "learning_rate": 2.58266309204647e-06, "loss": 0.0747, "step": 289 }, { "epoch": 0.07776883883078574, "grad_norm": 1.1565622904192823, "learning_rate": 2.5915996425379808e-06, "loss": 0.0791, "step": 290 }, { "epoch": 0.07803700724054706, "grad_norm": 0.9954087740293288, "learning_rate": 2.600536193029491e-06, "loss": 0.077, "step": 291 }, { "epoch": 0.0783051756503084, "grad_norm": 1.1051014947767286, "learning_rate": 2.609472743521001e-06, "loss": 0.0694, "step": 292 }, { "epoch": 0.07857334406006973, "grad_norm": 1.4335081031224706, "learning_rate": 2.6184092940125118e-06, "loss": 0.0988, "step": 293 }, { "epoch": 0.07884151246983105, "grad_norm": 0.9157223238037455, "learning_rate": 2.6273458445040215e-06, "loss": 0.0687, "step": 294 }, { "epoch": 0.07910968087959239, "grad_norm": 0.9870856447241909, "learning_rate": 2.6362823949955317e-06, "loss": 0.0783, "step": 295 }, { "epoch": 0.07937784928935371, "grad_norm": 1.0884120462342197, "learning_rate": 2.645218945487042e-06, "loss": 0.0655, "step": 296 }, { "epoch": 0.07964601769911504, "grad_norm": 0.8691392738234774, "learning_rate": 2.6541554959785526e-06, "loss": 0.0605, "step": 297 }, { "epoch": 0.07991418610887638, "grad_norm": 1.0147108903119446, "learning_rate": 2.6630920464700628e-06, "loss": 0.0744, "step": 298 }, { "epoch": 0.0801823545186377, "grad_norm": 1.5883804469625813, "learning_rate": 2.672028596961573e-06, "loss": 0.068, "step": 299 }, { "epoch": 0.08045052292839903, "grad_norm": 1.2524670615500189, "learning_rate": 2.6809651474530836e-06, "loss": 0.0811, "step": 300 }, { "epoch": 0.08071869133816037, "grad_norm": 1.0479724605410745, "learning_rate": 2.6899016979445938e-06, "loss": 0.0561, "step": 301 }, { "epoch": 0.08098685974792169, "grad_norm": 1.0859122108575248, "learning_rate": 2.6988382484361035e-06, "loss": 0.079, "step": 302 }, { "epoch": 0.08125502815768303, "grad_norm": 0.9619983404050642, "learning_rate": 2.707774798927614e-06, "loss": 0.073, "step": 303 }, { "epoch": 0.08152319656744436, "grad_norm": 1.307535170062756, "learning_rate": 2.7167113494191243e-06, "loss": 0.0895, "step": 304 }, { "epoch": 0.08179136497720568, "grad_norm": 1.0701695250730898, "learning_rate": 2.7256478999106345e-06, "loss": 0.0787, "step": 305 }, { "epoch": 0.08205953338696702, "grad_norm": 1.0687106263316555, "learning_rate": 2.734584450402145e-06, "loss": 0.0626, "step": 306 }, { "epoch": 0.08232770179672834, "grad_norm": 1.002157793093474, "learning_rate": 2.7435210008936554e-06, "loss": 0.0781, "step": 307 }, { "epoch": 0.08259587020648967, "grad_norm": 0.9378859444023192, "learning_rate": 2.7524575513851655e-06, "loss": 0.056, "step": 308 }, { "epoch": 0.08286403861625101, "grad_norm": 1.685474229427092, "learning_rate": 2.761394101876676e-06, "loss": 0.0742, "step": 309 }, { "epoch": 0.08313220702601233, "grad_norm": 1.1208292505894206, "learning_rate": 2.770330652368186e-06, "loss": 0.0887, "step": 310 }, { "epoch": 0.08340037543577367, "grad_norm": 0.8097955547490224, "learning_rate": 2.779267202859696e-06, "loss": 0.0541, "step": 311 }, { "epoch": 0.083668543845535, "grad_norm": 1.0727064085873945, "learning_rate": 2.7882037533512068e-06, "loss": 0.0801, "step": 312 }, { "epoch": 0.08393671225529632, "grad_norm": 0.8981169038743971, "learning_rate": 2.797140303842717e-06, "loss": 0.0864, "step": 313 }, { "epoch": 0.08420488066505766, "grad_norm": 0.798231015805989, "learning_rate": 2.806076854334227e-06, "loss": 0.059, "step": 314 }, { "epoch": 0.08447304907481899, "grad_norm": 1.1440732517300938, "learning_rate": 2.8150134048257378e-06, "loss": 0.0903, "step": 315 }, { "epoch": 0.08474121748458031, "grad_norm": 1.3570227253510636, "learning_rate": 2.823949955317248e-06, "loss": 0.0903, "step": 316 }, { "epoch": 0.08500938589434165, "grad_norm": 1.3753091739520849, "learning_rate": 2.8328865058087577e-06, "loss": 0.0993, "step": 317 }, { "epoch": 0.08527755430410297, "grad_norm": 0.827931212541233, "learning_rate": 2.8418230563002683e-06, "loss": 0.0619, "step": 318 }, { "epoch": 0.0855457227138643, "grad_norm": 1.4903805164868298, "learning_rate": 2.8507596067917785e-06, "loss": 0.061, "step": 319 }, { "epoch": 0.08581389112362564, "grad_norm": 1.077024687984436, "learning_rate": 2.8596961572832887e-06, "loss": 0.0742, "step": 320 }, { "epoch": 0.08608205953338696, "grad_norm": 0.9191356525563837, "learning_rate": 2.8686327077747994e-06, "loss": 0.0778, "step": 321 }, { "epoch": 0.0863502279431483, "grad_norm": 1.0400915825906893, "learning_rate": 2.8775692582663096e-06, "loss": 0.0825, "step": 322 }, { "epoch": 0.08661839635290963, "grad_norm": 0.9202031895155263, "learning_rate": 2.8865058087578197e-06, "loss": 0.0686, "step": 323 }, { "epoch": 0.08688656476267095, "grad_norm": 1.0230903117879404, "learning_rate": 2.8954423592493304e-06, "loss": 0.0905, "step": 324 }, { "epoch": 0.08715473317243229, "grad_norm": 1.0574152487931676, "learning_rate": 2.9043789097408406e-06, "loss": 0.0832, "step": 325 }, { "epoch": 0.08742290158219362, "grad_norm": 0.8171027114437683, "learning_rate": 2.9133154602323503e-06, "loss": 0.0657, "step": 326 }, { "epoch": 0.08769106999195495, "grad_norm": 1.2485191655184722, "learning_rate": 2.9222520107238605e-06, "loss": 0.0769, "step": 327 }, { "epoch": 0.08795923840171628, "grad_norm": 1.3259720374834723, "learning_rate": 2.931188561215371e-06, "loss": 0.0891, "step": 328 }, { "epoch": 0.0882274068114776, "grad_norm": 2.811216326336966, "learning_rate": 2.9401251117068813e-06, "loss": 0.0842, "step": 329 }, { "epoch": 0.08849557522123894, "grad_norm": 1.1527833879102507, "learning_rate": 2.9490616621983915e-06, "loss": 0.0912, "step": 330 }, { "epoch": 0.08876374363100027, "grad_norm": 1.1658411322197932, "learning_rate": 2.957998212689902e-06, "loss": 0.0687, "step": 331 }, { "epoch": 0.0890319120407616, "grad_norm": 1.081588085132814, "learning_rate": 2.9669347631814124e-06, "loss": 0.0803, "step": 332 }, { "epoch": 0.08930008045052293, "grad_norm": 0.9446185208778421, "learning_rate": 2.975871313672922e-06, "loss": 0.0601, "step": 333 }, { "epoch": 0.08956824886028426, "grad_norm": 1.0997109244581862, "learning_rate": 2.9848078641644327e-06, "loss": 0.0751, "step": 334 }, { "epoch": 0.08983641727004558, "grad_norm": 0.8143012910624685, "learning_rate": 2.993744414655943e-06, "loss": 0.0738, "step": 335 }, { "epoch": 0.09010458567980692, "grad_norm": 1.406278919886631, "learning_rate": 3.002680965147453e-06, "loss": 0.0919, "step": 336 }, { "epoch": 0.09037275408956826, "grad_norm": 1.1249163229623687, "learning_rate": 3.0116175156389638e-06, "loss": 0.0579, "step": 337 }, { "epoch": 0.09064092249932958, "grad_norm": 0.6473890487621132, "learning_rate": 3.020554066130474e-06, "loss": 0.0499, "step": 338 }, { "epoch": 0.09090909090909091, "grad_norm": 1.2174147140056035, "learning_rate": 3.029490616621984e-06, "loss": 0.0961, "step": 339 }, { "epoch": 0.09117725931885223, "grad_norm": 1.2417750502005842, "learning_rate": 3.0384271671134948e-06, "loss": 0.0753, "step": 340 }, { "epoch": 0.09144542772861357, "grad_norm": 1.3856305404094686, "learning_rate": 3.0473637176050045e-06, "loss": 0.0747, "step": 341 }, { "epoch": 0.0917135961383749, "grad_norm": 0.8251725957137185, "learning_rate": 3.0563002680965147e-06, "loss": 0.0505, "step": 342 }, { "epoch": 0.09198176454813622, "grad_norm": 0.9690766891612745, "learning_rate": 3.0652368185880253e-06, "loss": 0.0779, "step": 343 }, { "epoch": 0.09224993295789756, "grad_norm": 0.9426468068913342, "learning_rate": 3.0741733690795355e-06, "loss": 0.0625, "step": 344 }, { "epoch": 0.0925181013676589, "grad_norm": 1.6635687048533288, "learning_rate": 3.0831099195710457e-06, "loss": 0.076, "step": 345 }, { "epoch": 0.09278626977742022, "grad_norm": 1.2042708166020768, "learning_rate": 3.0920464700625564e-06, "loss": 0.0942, "step": 346 }, { "epoch": 0.09305443818718155, "grad_norm": 0.918881700830368, "learning_rate": 3.1009830205540666e-06, "loss": 0.0557, "step": 347 }, { "epoch": 0.09332260659694289, "grad_norm": 1.1206222445934224, "learning_rate": 3.1099195710455763e-06, "loss": 0.0736, "step": 348 }, { "epoch": 0.09359077500670421, "grad_norm": 1.92039125119349, "learning_rate": 3.1188561215370874e-06, "loss": 0.107, "step": 349 }, { "epoch": 0.09385894341646554, "grad_norm": 1.4661266745846624, "learning_rate": 3.127792672028597e-06, "loss": 0.0917, "step": 350 }, { "epoch": 0.09412711182622686, "grad_norm": 1.0335237981658958, "learning_rate": 3.1367292225201073e-06, "loss": 0.0622, "step": 351 }, { "epoch": 0.0943952802359882, "grad_norm": 0.9000440806026753, "learning_rate": 3.145665773011618e-06, "loss": 0.0568, "step": 352 }, { "epoch": 0.09466344864574953, "grad_norm": 0.887741801012921, "learning_rate": 3.154602323503128e-06, "loss": 0.0502, "step": 353 }, { "epoch": 0.09493161705551086, "grad_norm": 0.903705476438776, "learning_rate": 3.1635388739946383e-06, "loss": 0.0509, "step": 354 }, { "epoch": 0.09519978546527219, "grad_norm": 1.1072642212309256, "learning_rate": 3.172475424486149e-06, "loss": 0.0734, "step": 355 }, { "epoch": 0.09546795387503353, "grad_norm": 1.1012522319067872, "learning_rate": 3.181411974977659e-06, "loss": 0.0731, "step": 356 }, { "epoch": 0.09573612228479485, "grad_norm": 0.8976382109675861, "learning_rate": 3.190348525469169e-06, "loss": 0.0844, "step": 357 }, { "epoch": 0.09600429069455618, "grad_norm": 1.6457634228322715, "learning_rate": 3.199285075960679e-06, "loss": 0.072, "step": 358 }, { "epoch": 0.09627245910431752, "grad_norm": 1.3141929485046302, "learning_rate": 3.2082216264521897e-06, "loss": 0.0797, "step": 359 }, { "epoch": 0.09654062751407884, "grad_norm": 0.6857987991249308, "learning_rate": 3.2171581769437e-06, "loss": 0.0592, "step": 360 }, { "epoch": 0.09680879592384017, "grad_norm": 0.9072021830930354, "learning_rate": 3.22609472743521e-06, "loss": 0.0716, "step": 361 }, { "epoch": 0.0970769643336015, "grad_norm": 0.9729303928487002, "learning_rate": 3.2350312779267208e-06, "loss": 0.0836, "step": 362 }, { "epoch": 0.09734513274336283, "grad_norm": 0.8535025577818581, "learning_rate": 3.243967828418231e-06, "loss": 0.0597, "step": 363 }, { "epoch": 0.09761330115312417, "grad_norm": 0.7185008385879719, "learning_rate": 3.2529043789097407e-06, "loss": 0.0567, "step": 364 }, { "epoch": 0.09788146956288549, "grad_norm": 1.4503499628007936, "learning_rate": 3.2618409294012513e-06, "loss": 0.0913, "step": 365 }, { "epoch": 0.09814963797264682, "grad_norm": 1.0550875866772251, "learning_rate": 3.2707774798927615e-06, "loss": 0.0653, "step": 366 }, { "epoch": 0.09841780638240816, "grad_norm": 1.0355247644516965, "learning_rate": 3.2797140303842717e-06, "loss": 0.091, "step": 367 }, { "epoch": 0.09868597479216948, "grad_norm": 0.869526429019359, "learning_rate": 3.2886505808757823e-06, "loss": 0.0663, "step": 368 }, { "epoch": 0.09895414320193081, "grad_norm": 1.0454422901891616, "learning_rate": 3.2975871313672925e-06, "loss": 0.0685, "step": 369 }, { "epoch": 0.09922231161169215, "grad_norm": 1.3344383792030077, "learning_rate": 3.3065236818588027e-06, "loss": 0.0837, "step": 370 }, { "epoch": 0.09949048002145347, "grad_norm": 0.8711190171305349, "learning_rate": 3.3154602323503134e-06, "loss": 0.0631, "step": 371 }, { "epoch": 0.0997586484312148, "grad_norm": 1.8034412169688117, "learning_rate": 3.324396782841823e-06, "loss": 0.0799, "step": 372 }, { "epoch": 0.10002681684097613, "grad_norm": 1.1544396747079537, "learning_rate": 3.3333333333333333e-06, "loss": 0.0658, "step": 373 }, { "epoch": 0.10029498525073746, "grad_norm": 1.0956568837305507, "learning_rate": 3.342269883824844e-06, "loss": 0.0971, "step": 374 }, { "epoch": 0.1005631536604988, "grad_norm": 0.9152152667154431, "learning_rate": 3.351206434316354e-06, "loss": 0.0699, "step": 375 }, { "epoch": 0.10083132207026012, "grad_norm": 1.2003730717751693, "learning_rate": 3.3601429848078643e-06, "loss": 0.0728, "step": 376 }, { "epoch": 0.10109949048002145, "grad_norm": 0.7839119427197051, "learning_rate": 3.369079535299375e-06, "loss": 0.0609, "step": 377 }, { "epoch": 0.10136765888978279, "grad_norm": 0.7857265623885019, "learning_rate": 3.378016085790885e-06, "loss": 0.0712, "step": 378 }, { "epoch": 0.10163582729954411, "grad_norm": 1.1896410448477541, "learning_rate": 3.386952636282395e-06, "loss": 0.0787, "step": 379 }, { "epoch": 0.10190399570930544, "grad_norm": 0.95025320751986, "learning_rate": 3.395889186773906e-06, "loss": 0.0584, "step": 380 }, { "epoch": 0.10217216411906678, "grad_norm": 1.007263048487301, "learning_rate": 3.4048257372654157e-06, "loss": 0.0589, "step": 381 }, { "epoch": 0.1024403325288281, "grad_norm": 1.0622571583865128, "learning_rate": 3.413762287756926e-06, "loss": 0.1023, "step": 382 }, { "epoch": 0.10270850093858944, "grad_norm": 1.5801401474760421, "learning_rate": 3.4226988382484365e-06, "loss": 0.0691, "step": 383 }, { "epoch": 0.10297666934835076, "grad_norm": 0.9180005102082369, "learning_rate": 3.4316353887399467e-06, "loss": 0.059, "step": 384 }, { "epoch": 0.10324483775811209, "grad_norm": 1.3006990037120956, "learning_rate": 3.440571939231457e-06, "loss": 0.1537, "step": 385 }, { "epoch": 0.10351300616787343, "grad_norm": 3.388616644982409, "learning_rate": 3.4495084897229676e-06, "loss": 0.072, "step": 386 }, { "epoch": 0.10378117457763475, "grad_norm": 1.6250064874481471, "learning_rate": 3.4584450402144778e-06, "loss": 0.0798, "step": 387 }, { "epoch": 0.10404934298739608, "grad_norm": 2.2028328340019545, "learning_rate": 3.4673815907059875e-06, "loss": 0.1086, "step": 388 }, { "epoch": 0.10431751139715742, "grad_norm": 0.8735149416116579, "learning_rate": 3.4763181411974977e-06, "loss": 0.085, "step": 389 }, { "epoch": 0.10458567980691874, "grad_norm": 0.8826399197314917, "learning_rate": 3.4852546916890083e-06, "loss": 0.0777, "step": 390 }, { "epoch": 0.10485384821668008, "grad_norm": 1.7772903501906536, "learning_rate": 3.4941912421805185e-06, "loss": 0.067, "step": 391 }, { "epoch": 0.10512201662644141, "grad_norm": 0.6612012304390023, "learning_rate": 3.5031277926720287e-06, "loss": 0.0555, "step": 392 }, { "epoch": 0.10539018503620273, "grad_norm": 1.0992829113760316, "learning_rate": 3.5120643431635393e-06, "loss": 0.085, "step": 393 }, { "epoch": 0.10565835344596407, "grad_norm": 0.9158163317418148, "learning_rate": 3.5210008936550495e-06, "loss": 0.072, "step": 394 }, { "epoch": 0.10592652185572539, "grad_norm": 1.1235386907017653, "learning_rate": 3.5299374441465593e-06, "loss": 0.078, "step": 395 }, { "epoch": 0.10619469026548672, "grad_norm": 0.9295731162362485, "learning_rate": 3.53887399463807e-06, "loss": 0.0769, "step": 396 }, { "epoch": 0.10646285867524806, "grad_norm": 0.8189493812836391, "learning_rate": 3.54781054512958e-06, "loss": 0.0766, "step": 397 }, { "epoch": 0.10673102708500938, "grad_norm": 0.7264464833157396, "learning_rate": 3.5567470956210903e-06, "loss": 0.0625, "step": 398 }, { "epoch": 0.10699919549477072, "grad_norm": 0.9336534372125695, "learning_rate": 3.565683646112601e-06, "loss": 0.0642, "step": 399 }, { "epoch": 0.10726736390453205, "grad_norm": 0.8082667028332312, "learning_rate": 3.574620196604111e-06, "loss": 0.0762, "step": 400 }, { "epoch": 0.10753553231429337, "grad_norm": 1.1965741254330413, "learning_rate": 3.5835567470956213e-06, "loss": 0.0851, "step": 401 }, { "epoch": 0.10780370072405471, "grad_norm": 0.7280924257260993, "learning_rate": 3.592493297587132e-06, "loss": 0.0592, "step": 402 }, { "epoch": 0.10807186913381604, "grad_norm": 0.720547204956332, "learning_rate": 3.6014298480786417e-06, "loss": 0.054, "step": 403 }, { "epoch": 0.10834003754357736, "grad_norm": 1.3765699071733941, "learning_rate": 3.610366398570152e-06, "loss": 0.0634, "step": 404 }, { "epoch": 0.1086082059533387, "grad_norm": 1.1563674924199965, "learning_rate": 3.6193029490616625e-06, "loss": 0.0729, "step": 405 }, { "epoch": 0.10887637436310002, "grad_norm": 0.8671488689492501, "learning_rate": 3.6282394995531727e-06, "loss": 0.0592, "step": 406 }, { "epoch": 0.10914454277286136, "grad_norm": 0.7925270272445619, "learning_rate": 3.637176050044683e-06, "loss": 0.068, "step": 407 }, { "epoch": 0.10941271118262269, "grad_norm": 2.7992708918294036, "learning_rate": 3.6461126005361935e-06, "loss": 0.091, "step": 408 }, { "epoch": 0.10968087959238401, "grad_norm": 0.8933122734105511, "learning_rate": 3.6550491510277037e-06, "loss": 0.0639, "step": 409 }, { "epoch": 0.10994904800214535, "grad_norm": 0.9735053304309693, "learning_rate": 3.663985701519214e-06, "loss": 0.0692, "step": 410 }, { "epoch": 0.11021721641190668, "grad_norm": 1.1329451443201408, "learning_rate": 3.6729222520107246e-06, "loss": 0.0608, "step": 411 }, { "epoch": 0.110485384821668, "grad_norm": 1.0833987010340869, "learning_rate": 3.6818588025022343e-06, "loss": 0.082, "step": 412 }, { "epoch": 0.11075355323142934, "grad_norm": 0.6075048864304109, "learning_rate": 3.6907953529937445e-06, "loss": 0.0572, "step": 413 }, { "epoch": 0.11102172164119067, "grad_norm": 0.8820252585041759, "learning_rate": 3.699731903485255e-06, "loss": 0.0735, "step": 414 }, { "epoch": 0.111289890050952, "grad_norm": 0.8103064957156563, "learning_rate": 3.7086684539767653e-06, "loss": 0.0769, "step": 415 }, { "epoch": 0.11155805846071333, "grad_norm": 0.9355080913163316, "learning_rate": 3.7176050044682755e-06, "loss": 0.0916, "step": 416 }, { "epoch": 0.11182622687047465, "grad_norm": 1.1030965188188988, "learning_rate": 3.726541554959786e-06, "loss": 0.0849, "step": 417 }, { "epoch": 0.11209439528023599, "grad_norm": 0.8955547579110715, "learning_rate": 3.7354781054512963e-06, "loss": 0.07, "step": 418 }, { "epoch": 0.11236256368999732, "grad_norm": 1.2804628803781006, "learning_rate": 3.744414655942806e-06, "loss": 0.0712, "step": 419 }, { "epoch": 0.11263073209975864, "grad_norm": 0.973257227307041, "learning_rate": 3.7533512064343163e-06, "loss": 0.0651, "step": 420 }, { "epoch": 0.11289890050951998, "grad_norm": 0.9144714758455673, "learning_rate": 3.762287756925827e-06, "loss": 0.0772, "step": 421 }, { "epoch": 0.11316706891928131, "grad_norm": 0.8811434862280502, "learning_rate": 3.771224307417337e-06, "loss": 0.0708, "step": 422 }, { "epoch": 0.11343523732904263, "grad_norm": 0.9275137033365234, "learning_rate": 3.7801608579088473e-06, "loss": 0.0734, "step": 423 }, { "epoch": 0.11370340573880397, "grad_norm": 0.5946467810079347, "learning_rate": 3.789097408400358e-06, "loss": 0.0509, "step": 424 }, { "epoch": 0.1139715741485653, "grad_norm": 1.0392142809650076, "learning_rate": 3.798033958891868e-06, "loss": 0.0669, "step": 425 }, { "epoch": 0.11423974255832663, "grad_norm": 0.9945808768720859, "learning_rate": 3.806970509383378e-06, "loss": 0.0813, "step": 426 }, { "epoch": 0.11450791096808796, "grad_norm": 1.252642635051418, "learning_rate": 3.815907059874889e-06, "loss": 0.0625, "step": 427 }, { "epoch": 0.11477607937784928, "grad_norm": 0.8750667824002176, "learning_rate": 3.824843610366399e-06, "loss": 0.0613, "step": 428 }, { "epoch": 0.11504424778761062, "grad_norm": 1.1042134673035502, "learning_rate": 3.833780160857909e-06, "loss": 0.0824, "step": 429 }, { "epoch": 0.11531241619737195, "grad_norm": 1.055246599454212, "learning_rate": 3.8427167113494195e-06, "loss": 0.0685, "step": 430 }, { "epoch": 0.11558058460713327, "grad_norm": 0.9600247518426471, "learning_rate": 3.85165326184093e-06, "loss": 0.0642, "step": 431 }, { "epoch": 0.11584875301689461, "grad_norm": 1.5056101596788642, "learning_rate": 3.86058981233244e-06, "loss": 0.0826, "step": 432 }, { "epoch": 0.11611692142665594, "grad_norm": 1.021655401782829, "learning_rate": 3.86952636282395e-06, "loss": 0.0774, "step": 433 }, { "epoch": 0.11638508983641727, "grad_norm": 1.6619518501251627, "learning_rate": 3.87846291331546e-06, "loss": 0.0971, "step": 434 }, { "epoch": 0.1166532582461786, "grad_norm": 1.1813479577991888, "learning_rate": 3.8873994638069705e-06, "loss": 0.07, "step": 435 }, { "epoch": 0.11692142665593994, "grad_norm": 0.786445758749673, "learning_rate": 3.8963360142984816e-06, "loss": 0.0585, "step": 436 }, { "epoch": 0.11718959506570126, "grad_norm": 0.6088598052153829, "learning_rate": 3.905272564789992e-06, "loss": 0.0428, "step": 437 }, { "epoch": 0.11745776347546259, "grad_norm": 1.468539965074711, "learning_rate": 3.914209115281501e-06, "loss": 0.0732, "step": 438 }, { "epoch": 0.11772593188522391, "grad_norm": 2.6555324733651844, "learning_rate": 3.923145665773012e-06, "loss": 0.0964, "step": 439 }, { "epoch": 0.11799410029498525, "grad_norm": 1.3817248719686481, "learning_rate": 3.932082216264522e-06, "loss": 0.0964, "step": 440 }, { "epoch": 0.11826226870474658, "grad_norm": 1.6721245521382098, "learning_rate": 3.9410187667560325e-06, "loss": 0.0679, "step": 441 }, { "epoch": 0.1185304371145079, "grad_norm": 0.8951246377469109, "learning_rate": 3.949955317247543e-06, "loss": 0.0733, "step": 442 }, { "epoch": 0.11879860552426924, "grad_norm": 0.9157102904250533, "learning_rate": 3.958891867739053e-06, "loss": 0.0537, "step": 443 }, { "epoch": 0.11906677393403058, "grad_norm": 0.9833019540438356, "learning_rate": 3.967828418230563e-06, "loss": 0.0861, "step": 444 }, { "epoch": 0.1193349423437919, "grad_norm": 0.9813063291336864, "learning_rate": 3.976764968722074e-06, "loss": 0.0858, "step": 445 }, { "epoch": 0.11960311075355323, "grad_norm": 1.0712971937769715, "learning_rate": 3.985701519213584e-06, "loss": 0.0908, "step": 446 }, { "epoch": 0.11987127916331457, "grad_norm": 1.0460466685790129, "learning_rate": 3.994638069705094e-06, "loss": 0.0655, "step": 447 }, { "epoch": 0.12013944757307589, "grad_norm": 0.9771270953154231, "learning_rate": 4.003574620196605e-06, "loss": 0.0674, "step": 448 }, { "epoch": 0.12040761598283722, "grad_norm": 1.1754193477088417, "learning_rate": 4.012511170688115e-06, "loss": 0.0776, "step": 449 }, { "epoch": 0.12067578439259855, "grad_norm": 0.8875512805066087, "learning_rate": 4.021447721179625e-06, "loss": 0.0824, "step": 450 }, { "epoch": 0.12094395280235988, "grad_norm": 1.4439920003823963, "learning_rate": 4.030384271671135e-06, "loss": 0.0794, "step": 451 }, { "epoch": 0.12121212121212122, "grad_norm": 1.0877772359601794, "learning_rate": 4.0393208221626455e-06, "loss": 0.0893, "step": 452 }, { "epoch": 0.12148028962188254, "grad_norm": 0.8399015261876391, "learning_rate": 4.048257372654156e-06, "loss": 0.0597, "step": 453 }, { "epoch": 0.12174845803164387, "grad_norm": 0.9154618737483778, "learning_rate": 4.057193923145666e-06, "loss": 0.0811, "step": 454 }, { "epoch": 0.12201662644140521, "grad_norm": 0.7294910575903597, "learning_rate": 4.066130473637176e-06, "loss": 0.0583, "step": 455 }, { "epoch": 0.12228479485116653, "grad_norm": 0.8192999514974959, "learning_rate": 4.075067024128686e-06, "loss": 0.0711, "step": 456 }, { "epoch": 0.12255296326092786, "grad_norm": 1.0804038523900057, "learning_rate": 4.0840035746201965e-06, "loss": 0.0799, "step": 457 }, { "epoch": 0.1228211316706892, "grad_norm": 1.18231349058715, "learning_rate": 4.0929401251117075e-06, "loss": 0.0759, "step": 458 }, { "epoch": 0.12308930008045052, "grad_norm": 1.0971368765685716, "learning_rate": 4.101876675603218e-06, "loss": 0.0921, "step": 459 }, { "epoch": 0.12335746849021186, "grad_norm": 0.992674402696658, "learning_rate": 4.110813226094728e-06, "loss": 0.0683, "step": 460 }, { "epoch": 0.12362563689997318, "grad_norm": 0.9673134781586926, "learning_rate": 4.119749776586238e-06, "loss": 0.0846, "step": 461 }, { "epoch": 0.12389380530973451, "grad_norm": 0.9742752411950797, "learning_rate": 4.128686327077748e-06, "loss": 0.0733, "step": 462 }, { "epoch": 0.12416197371949585, "grad_norm": 1.0635529993593695, "learning_rate": 4.1376228775692585e-06, "loss": 0.0775, "step": 463 }, { "epoch": 0.12443014212925717, "grad_norm": 1.0650182732480298, "learning_rate": 4.146559428060769e-06, "loss": 0.0642, "step": 464 }, { "epoch": 0.1246983105390185, "grad_norm": 0.9293667582643371, "learning_rate": 4.155495978552279e-06, "loss": 0.0573, "step": 465 }, { "epoch": 0.12496647894877984, "grad_norm": 1.0126793969591357, "learning_rate": 4.164432529043789e-06, "loss": 0.074, "step": 466 }, { "epoch": 0.12523464735854117, "grad_norm": 0.8200191311995798, "learning_rate": 4.1733690795353e-06, "loss": 0.0634, "step": 467 }, { "epoch": 0.1255028157683025, "grad_norm": 1.1820205903006376, "learning_rate": 4.18230563002681e-06, "loss": 0.0814, "step": 468 }, { "epoch": 0.12577098417806382, "grad_norm": 0.8882258356911072, "learning_rate": 4.19124218051832e-06, "loss": 0.0757, "step": 469 }, { "epoch": 0.12603915258782517, "grad_norm": 0.6749977100624934, "learning_rate": 4.200178731009831e-06, "loss": 0.0554, "step": 470 }, { "epoch": 0.1263073209975865, "grad_norm": 0.9624656027661049, "learning_rate": 4.209115281501341e-06, "loss": 0.0879, "step": 471 }, { "epoch": 0.1265754894073478, "grad_norm": 1.1434353268458874, "learning_rate": 4.218051831992851e-06, "loss": 0.092, "step": 472 }, { "epoch": 0.12684365781710916, "grad_norm": 0.9015544704914815, "learning_rate": 4.226988382484361e-06, "loss": 0.0686, "step": 473 }, { "epoch": 0.12711182622687048, "grad_norm": 1.3229621969737553, "learning_rate": 4.2359249329758715e-06, "loss": 0.0768, "step": 474 }, { "epoch": 0.1273799946366318, "grad_norm": 0.914331834017034, "learning_rate": 4.244861483467382e-06, "loss": 0.0763, "step": 475 }, { "epoch": 0.12764816304639315, "grad_norm": 0.8331861325707116, "learning_rate": 4.253798033958893e-06, "loss": 0.066, "step": 476 }, { "epoch": 0.12791633145615447, "grad_norm": 1.2165336558394069, "learning_rate": 4.262734584450403e-06, "loss": 0.0828, "step": 477 }, { "epoch": 0.1281844998659158, "grad_norm": 0.9932826529136056, "learning_rate": 4.271671134941912e-06, "loss": 0.0844, "step": 478 }, { "epoch": 0.1284526682756771, "grad_norm": 1.0675943501939562, "learning_rate": 4.280607685433423e-06, "loss": 0.1115, "step": 479 }, { "epoch": 0.12872083668543846, "grad_norm": 0.8987343897708248, "learning_rate": 4.2895442359249335e-06, "loss": 0.0697, "step": 480 }, { "epoch": 0.12898900509519978, "grad_norm": 0.815139970203552, "learning_rate": 4.298480786416444e-06, "loss": 0.0727, "step": 481 }, { "epoch": 0.1292571735049611, "grad_norm": 0.910883252574242, "learning_rate": 4.307417336907954e-06, "loss": 0.0628, "step": 482 }, { "epoch": 0.12952534191472245, "grad_norm": 1.3298315588571508, "learning_rate": 4.316353887399464e-06, "loss": 0.0998, "step": 483 }, { "epoch": 0.12979351032448377, "grad_norm": 0.5751987297929497, "learning_rate": 4.325290437890974e-06, "loss": 0.0514, "step": 484 }, { "epoch": 0.1300616787342451, "grad_norm": 0.9025309733786545, "learning_rate": 4.3342269883824845e-06, "loss": 0.0441, "step": 485 }, { "epoch": 0.13032984714400644, "grad_norm": 0.9592140461702673, "learning_rate": 4.343163538873995e-06, "loss": 0.0764, "step": 486 }, { "epoch": 0.13059801555376777, "grad_norm": 0.9915824231527102, "learning_rate": 4.352100089365505e-06, "loss": 0.0838, "step": 487 }, { "epoch": 0.1308661839635291, "grad_norm": 0.8148280108622022, "learning_rate": 4.361036639857015e-06, "loss": 0.0652, "step": 488 }, { "epoch": 0.13113435237329044, "grad_norm": 0.8110114865775023, "learning_rate": 4.369973190348526e-06, "loss": 0.0731, "step": 489 }, { "epoch": 0.13140252078305176, "grad_norm": 0.9382468491421662, "learning_rate": 4.378909740840036e-06, "loss": 0.0788, "step": 490 }, { "epoch": 0.13167068919281308, "grad_norm": 0.669865933683254, "learning_rate": 4.3878462913315465e-06, "loss": 0.0508, "step": 491 }, { "epoch": 0.13193885760257443, "grad_norm": 1.0548767545832722, "learning_rate": 4.396782841823057e-06, "loss": 0.0833, "step": 492 }, { "epoch": 0.13220702601233575, "grad_norm": 0.9310081694817219, "learning_rate": 4.405719392314567e-06, "loss": 0.0563, "step": 493 }, { "epoch": 0.13247519442209707, "grad_norm": 1.4873241864686493, "learning_rate": 4.414655942806077e-06, "loss": 0.0906, "step": 494 }, { "epoch": 0.13274336283185842, "grad_norm": 0.7466982007412089, "learning_rate": 4.423592493297587e-06, "loss": 0.066, "step": 495 }, { "epoch": 0.13301153124161974, "grad_norm": 6.549921233618365, "learning_rate": 4.4325290437890975e-06, "loss": 0.0849, "step": 496 }, { "epoch": 0.13327969965138106, "grad_norm": 1.1850852344843947, "learning_rate": 4.441465594280608e-06, "loss": 0.1006, "step": 497 }, { "epoch": 0.1335478680611424, "grad_norm": 0.9599501906156346, "learning_rate": 4.450402144772119e-06, "loss": 0.071, "step": 498 }, { "epoch": 0.13381603647090373, "grad_norm": 0.6613122372374796, "learning_rate": 4.459338695263629e-06, "loss": 0.0511, "step": 499 }, { "epoch": 0.13408420488066505, "grad_norm": 0.7274558616569075, "learning_rate": 4.468275245755139e-06, "loss": 0.0569, "step": 500 }, { "epoch": 0.13435237329042637, "grad_norm": 0.6553914130475342, "learning_rate": 4.477211796246649e-06, "loss": 0.0444, "step": 501 }, { "epoch": 0.13462054170018772, "grad_norm": 0.9214035952956035, "learning_rate": 4.4861483467381595e-06, "loss": 0.0952, "step": 502 }, { "epoch": 0.13488871010994904, "grad_norm": 0.6800480855024846, "learning_rate": 4.49508489722967e-06, "loss": 0.0586, "step": 503 }, { "epoch": 0.13515687851971037, "grad_norm": 0.9687928078495497, "learning_rate": 4.50402144772118e-06, "loss": 0.0818, "step": 504 }, { "epoch": 0.13542504692947172, "grad_norm": 1.0448640684308557, "learning_rate": 4.51295799821269e-06, "loss": 0.0924, "step": 505 }, { "epoch": 0.13569321533923304, "grad_norm": 0.858131542510142, "learning_rate": 4.5218945487042e-06, "loss": 0.0663, "step": 506 }, { "epoch": 0.13596138374899436, "grad_norm": 0.8436086147946624, "learning_rate": 4.530831099195711e-06, "loss": 0.0667, "step": 507 }, { "epoch": 0.1362295521587557, "grad_norm": 2.1066669524317843, "learning_rate": 4.5397676496872215e-06, "loss": 0.1133, "step": 508 }, { "epoch": 0.13649772056851703, "grad_norm": 0.9183759144866963, "learning_rate": 4.548704200178731e-06, "loss": 0.075, "step": 509 }, { "epoch": 0.13676588897827835, "grad_norm": 0.8226875570281615, "learning_rate": 4.557640750670242e-06, "loss": 0.0745, "step": 510 }, { "epoch": 0.1370340573880397, "grad_norm": 0.8715530458420914, "learning_rate": 4.566577301161752e-06, "loss": 0.0669, "step": 511 }, { "epoch": 0.13730222579780102, "grad_norm": 0.9925259115116932, "learning_rate": 4.575513851653262e-06, "loss": 0.0657, "step": 512 }, { "epoch": 0.13757039420756234, "grad_norm": 0.6546855538953195, "learning_rate": 4.5844504021447725e-06, "loss": 0.0513, "step": 513 }, { "epoch": 0.1378385626173237, "grad_norm": 1.350478732524976, "learning_rate": 4.593386952636283e-06, "loss": 0.075, "step": 514 }, { "epoch": 0.138106731027085, "grad_norm": 0.8933342582644004, "learning_rate": 4.602323503127793e-06, "loss": 0.0675, "step": 515 }, { "epoch": 0.13837489943684633, "grad_norm": 1.0202490110975357, "learning_rate": 4.611260053619303e-06, "loss": 0.0746, "step": 516 }, { "epoch": 0.13864306784660768, "grad_norm": 0.7005508485761252, "learning_rate": 4.620196604110813e-06, "loss": 0.0618, "step": 517 }, { "epoch": 0.138911236256369, "grad_norm": 0.7109384080894464, "learning_rate": 4.6291331546023235e-06, "loss": 0.0605, "step": 518 }, { "epoch": 0.13917940466613032, "grad_norm": 0.9563738280846782, "learning_rate": 4.638069705093834e-06, "loss": 0.0783, "step": 519 }, { "epoch": 0.13944757307589167, "grad_norm": 1.3636375020852676, "learning_rate": 4.647006255585345e-06, "loss": 0.0881, "step": 520 }, { "epoch": 0.139715741485653, "grad_norm": 0.7255393317443555, "learning_rate": 4.655942806076855e-06, "loss": 0.0593, "step": 521 }, { "epoch": 0.13998390989541432, "grad_norm": 0.6918089214278179, "learning_rate": 4.664879356568365e-06, "loss": 0.0733, "step": 522 }, { "epoch": 0.14025207830517564, "grad_norm": 0.8351970419627537, "learning_rate": 4.673815907059875e-06, "loss": 0.0842, "step": 523 }, { "epoch": 0.14052024671493699, "grad_norm": 1.0642601386011878, "learning_rate": 4.6827524575513855e-06, "loss": 0.0822, "step": 524 }, { "epoch": 0.1407884151246983, "grad_norm": 0.8818098211601748, "learning_rate": 4.691689008042896e-06, "loss": 0.0704, "step": 525 }, { "epoch": 0.14105658353445963, "grad_norm": 0.7943586043249317, "learning_rate": 4.700625558534406e-06, "loss": 0.0695, "step": 526 }, { "epoch": 0.14132475194422098, "grad_norm": 0.6536497460148595, "learning_rate": 4.709562109025916e-06, "loss": 0.056, "step": 527 }, { "epoch": 0.1415929203539823, "grad_norm": 1.5667849774609017, "learning_rate": 4.718498659517426e-06, "loss": 0.0924, "step": 528 }, { "epoch": 0.14186108876374362, "grad_norm": 0.7565027568868655, "learning_rate": 4.727435210008937e-06, "loss": 0.0701, "step": 529 }, { "epoch": 0.14212925717350497, "grad_norm": 1.0077783748022413, "learning_rate": 4.7363717605004475e-06, "loss": 0.0751, "step": 530 }, { "epoch": 0.1423974255832663, "grad_norm": 0.7372614052460006, "learning_rate": 4.745308310991958e-06, "loss": 0.0651, "step": 531 }, { "epoch": 0.1426655939930276, "grad_norm": 0.8179854580683265, "learning_rate": 4.754244861483468e-06, "loss": 0.0624, "step": 532 }, { "epoch": 0.14293376240278896, "grad_norm": 0.7921806357668186, "learning_rate": 4.763181411974978e-06, "loss": 0.0654, "step": 533 }, { "epoch": 0.14320193081255028, "grad_norm": 1.0094717121856216, "learning_rate": 4.772117962466488e-06, "loss": 0.0778, "step": 534 }, { "epoch": 0.1434700992223116, "grad_norm": 1.1648539047305473, "learning_rate": 4.7810545129579985e-06, "loss": 0.0986, "step": 535 }, { "epoch": 0.14373826763207295, "grad_norm": 0.6223026277966669, "learning_rate": 4.789991063449509e-06, "loss": 0.0519, "step": 536 }, { "epoch": 0.14400643604183427, "grad_norm": 0.6586799327475868, "learning_rate": 4.798927613941019e-06, "loss": 0.0488, "step": 537 }, { "epoch": 0.1442746044515956, "grad_norm": 0.709980287852735, "learning_rate": 4.80786416443253e-06, "loss": 0.0461, "step": 538 }, { "epoch": 0.14454277286135694, "grad_norm": 1.4837721203925192, "learning_rate": 4.81680071492404e-06, "loss": 0.0874, "step": 539 }, { "epoch": 0.14481094127111827, "grad_norm": 0.7866693482783739, "learning_rate": 4.8257372654155495e-06, "loss": 0.0575, "step": 540 }, { "epoch": 0.1450791096808796, "grad_norm": 0.7167748661541813, "learning_rate": 4.8346738159070605e-06, "loss": 0.0674, "step": 541 }, { "epoch": 0.14534727809064094, "grad_norm": 0.8462443866200521, "learning_rate": 4.843610366398571e-06, "loss": 0.0589, "step": 542 }, { "epoch": 0.14561544650040226, "grad_norm": 0.6643656764470212, "learning_rate": 4.852546916890081e-06, "loss": 0.056, "step": 543 }, { "epoch": 0.14588361491016358, "grad_norm": 0.8552434165874294, "learning_rate": 4.861483467381591e-06, "loss": 0.0823, "step": 544 }, { "epoch": 0.1461517833199249, "grad_norm": 0.8903376388832913, "learning_rate": 4.870420017873101e-06, "loss": 0.07, "step": 545 }, { "epoch": 0.14641995172968625, "grad_norm": 0.8353595510018919, "learning_rate": 4.8793565683646115e-06, "loss": 0.0658, "step": 546 }, { "epoch": 0.14668812013944757, "grad_norm": 0.8215287733778167, "learning_rate": 4.888293118856122e-06, "loss": 0.0593, "step": 547 }, { "epoch": 0.1469562885492089, "grad_norm": 1.000115285057022, "learning_rate": 4.897229669347633e-06, "loss": 0.0765, "step": 548 }, { "epoch": 0.14722445695897024, "grad_norm": 0.7405585700170938, "learning_rate": 4.906166219839142e-06, "loss": 0.0501, "step": 549 }, { "epoch": 0.14749262536873156, "grad_norm": 0.9142307742236662, "learning_rate": 4.915102770330652e-06, "loss": 0.0592, "step": 550 }, { "epoch": 0.14776079377849288, "grad_norm": 0.6160805294173842, "learning_rate": 4.924039320822163e-06, "loss": 0.0491, "step": 551 }, { "epoch": 0.14802896218825423, "grad_norm": 1.1391281691417314, "learning_rate": 4.9329758713136735e-06, "loss": 0.0614, "step": 552 }, { "epoch": 0.14829713059801555, "grad_norm": 0.616929305519688, "learning_rate": 4.941912421805184e-06, "loss": 0.0499, "step": 553 }, { "epoch": 0.14856529900777687, "grad_norm": 0.8509743989261174, "learning_rate": 4.950848972296694e-06, "loss": 0.0733, "step": 554 }, { "epoch": 0.14883346741753822, "grad_norm": 0.7293632113916422, "learning_rate": 4.959785522788204e-06, "loss": 0.0756, "step": 555 }, { "epoch": 0.14910163582729954, "grad_norm": 0.8819254466979063, "learning_rate": 4.968722073279714e-06, "loss": 0.0848, "step": 556 }, { "epoch": 0.14936980423706087, "grad_norm": 0.8080837803424797, "learning_rate": 4.9776586237712245e-06, "loss": 0.0812, "step": 557 }, { "epoch": 0.14963797264682221, "grad_norm": 0.7911275831913056, "learning_rate": 4.986595174262735e-06, "loss": 0.0644, "step": 558 }, { "epoch": 0.14990614105658354, "grad_norm": 0.47038585896765805, "learning_rate": 4.995531724754245e-06, "loss": 0.0395, "step": 559 }, { "epoch": 0.15017430946634486, "grad_norm": 0.8369747807835302, "learning_rate": 5.004468275245755e-06, "loss": 0.079, "step": 560 }, { "epoch": 0.1504424778761062, "grad_norm": 1.0874992943514181, "learning_rate": 5.013404825737266e-06, "loss": 0.0849, "step": 561 }, { "epoch": 0.15071064628586753, "grad_norm": 0.733063815926613, "learning_rate": 5.022341376228776e-06, "loss": 0.0822, "step": 562 }, { "epoch": 0.15097881469562885, "grad_norm": 0.7077567305270279, "learning_rate": 5.031277926720286e-06, "loss": 0.0612, "step": 563 }, { "epoch": 0.1512469831053902, "grad_norm": 1.2505369073509383, "learning_rate": 5.040214477211797e-06, "loss": 0.0811, "step": 564 }, { "epoch": 0.15151515151515152, "grad_norm": 0.9800800107631583, "learning_rate": 5.049151027703308e-06, "loss": 0.057, "step": 565 }, { "epoch": 0.15178331992491284, "grad_norm": 0.7504456722153466, "learning_rate": 5.058087578194817e-06, "loss": 0.0525, "step": 566 }, { "epoch": 0.15205148833467416, "grad_norm": 0.6270440486099853, "learning_rate": 5.067024128686327e-06, "loss": 0.0506, "step": 567 }, { "epoch": 0.1523196567444355, "grad_norm": 1.1128116960948156, "learning_rate": 5.075960679177838e-06, "loss": 0.0722, "step": 568 }, { "epoch": 0.15258782515419683, "grad_norm": 0.9071547221861076, "learning_rate": 5.084897229669348e-06, "loss": 0.0797, "step": 569 }, { "epoch": 0.15285599356395815, "grad_norm": 0.6287612256262585, "learning_rate": 5.093833780160859e-06, "loss": 0.0406, "step": 570 }, { "epoch": 0.1531241619737195, "grad_norm": 0.7376754946842811, "learning_rate": 5.102770330652369e-06, "loss": 0.0737, "step": 571 }, { "epoch": 0.15339233038348082, "grad_norm": 1.4511842740734502, "learning_rate": 5.111706881143878e-06, "loss": 0.05, "step": 572 }, { "epoch": 0.15366049879324215, "grad_norm": 1.9345293173757914, "learning_rate": 5.120643431635389e-06, "loss": 0.0573, "step": 573 }, { "epoch": 0.1539286672030035, "grad_norm": 0.8099357012864555, "learning_rate": 5.1295799821268995e-06, "loss": 0.0746, "step": 574 }, { "epoch": 0.15419683561276482, "grad_norm": 1.084405035264434, "learning_rate": 5.13851653261841e-06, "loss": 0.0801, "step": 575 }, { "epoch": 0.15446500402252614, "grad_norm": 0.875224172009727, "learning_rate": 5.14745308310992e-06, "loss": 0.0793, "step": 576 }, { "epoch": 0.15473317243228749, "grad_norm": 0.6845971696845544, "learning_rate": 5.156389633601431e-06, "loss": 0.0501, "step": 577 }, { "epoch": 0.1550013408420488, "grad_norm": 0.7015040398761847, "learning_rate": 5.16532618409294e-06, "loss": 0.0561, "step": 578 }, { "epoch": 0.15526950925181013, "grad_norm": 0.8101119698797669, "learning_rate": 5.174262734584451e-06, "loss": 0.0638, "step": 579 }, { "epoch": 0.15553767766157148, "grad_norm": 0.902501554293187, "learning_rate": 5.1831992850759615e-06, "loss": 0.0809, "step": 580 }, { "epoch": 0.1558058460713328, "grad_norm": 0.747158066441556, "learning_rate": 5.192135835567471e-06, "loss": 0.0438, "step": 581 }, { "epoch": 0.15607401448109412, "grad_norm": 0.644553316811843, "learning_rate": 5.201072386058982e-06, "loss": 0.0685, "step": 582 }, { "epoch": 0.15634218289085547, "grad_norm": 0.8887122120977885, "learning_rate": 5.210008936550492e-06, "loss": 0.0702, "step": 583 }, { "epoch": 0.1566103513006168, "grad_norm": 0.7918242279058249, "learning_rate": 5.218945487042002e-06, "loss": 0.0552, "step": 584 }, { "epoch": 0.1568785197103781, "grad_norm": 0.7173083180734886, "learning_rate": 5.2278820375335125e-06, "loss": 0.0592, "step": 585 }, { "epoch": 0.15714668812013946, "grad_norm": 0.7722190178674668, "learning_rate": 5.2368185880250235e-06, "loss": 0.0556, "step": 586 }, { "epoch": 0.15741485652990078, "grad_norm": 0.8365821825053458, "learning_rate": 5.245755138516533e-06, "loss": 0.0848, "step": 587 }, { "epoch": 0.1576830249396621, "grad_norm": 0.7757694062814907, "learning_rate": 5.254691689008043e-06, "loss": 0.0509, "step": 588 }, { "epoch": 0.15795119334942342, "grad_norm": 1.0987816124799554, "learning_rate": 5.263628239499554e-06, "loss": 0.0766, "step": 589 }, { "epoch": 0.15821936175918477, "grad_norm": 0.8239668683990621, "learning_rate": 5.2725647899910635e-06, "loss": 0.0701, "step": 590 }, { "epoch": 0.1584875301689461, "grad_norm": 1.0241210739973214, "learning_rate": 5.2815013404825745e-06, "loss": 0.0764, "step": 591 }, { "epoch": 0.15875569857870742, "grad_norm": 0.7479314447965554, "learning_rate": 5.290437890974084e-06, "loss": 0.0507, "step": 592 }, { "epoch": 0.15902386698846877, "grad_norm": 0.73417080951846, "learning_rate": 5.299374441465595e-06, "loss": 0.0582, "step": 593 }, { "epoch": 0.1592920353982301, "grad_norm": 0.7483484584095388, "learning_rate": 5.308310991957105e-06, "loss": 0.0686, "step": 594 }, { "epoch": 0.1595602038079914, "grad_norm": 1.206962169332511, "learning_rate": 5.3172475424486145e-06, "loss": 0.0887, "step": 595 }, { "epoch": 0.15982837221775276, "grad_norm": 0.7954937171928648, "learning_rate": 5.3261840929401255e-06, "loss": 0.0522, "step": 596 }, { "epoch": 0.16009654062751408, "grad_norm": 0.6674926580235067, "learning_rate": 5.335120643431636e-06, "loss": 0.0492, "step": 597 }, { "epoch": 0.1603647090372754, "grad_norm": 0.8918142975031488, "learning_rate": 5.344057193923146e-06, "loss": 0.0734, "step": 598 }, { "epoch": 0.16063287744703675, "grad_norm": 1.554050689585358, "learning_rate": 5.352993744414656e-06, "loss": 0.0704, "step": 599 }, { "epoch": 0.16090104585679807, "grad_norm": 0.6165248410471955, "learning_rate": 5.361930294906167e-06, "loss": 0.0717, "step": 600 }, { "epoch": 0.1611692142665594, "grad_norm": 1.2799547351650304, "learning_rate": 5.3708668453976765e-06, "loss": 0.0759, "step": 601 }, { "epoch": 0.16143738267632074, "grad_norm": 1.3544895398760768, "learning_rate": 5.3798033958891875e-06, "loss": 0.0675, "step": 602 }, { "epoch": 0.16170555108608206, "grad_norm": 0.7554679017626319, "learning_rate": 5.388739946380698e-06, "loss": 0.0626, "step": 603 }, { "epoch": 0.16197371949584338, "grad_norm": 0.6859086960854943, "learning_rate": 5.397676496872207e-06, "loss": 0.054, "step": 604 }, { "epoch": 0.16224188790560473, "grad_norm": 0.8028854363715127, "learning_rate": 5.406613047363718e-06, "loss": 0.0713, "step": 605 }, { "epoch": 0.16251005631536605, "grad_norm": 0.9924929945204021, "learning_rate": 5.415549597855228e-06, "loss": 0.08, "step": 606 }, { "epoch": 0.16277822472512737, "grad_norm": 0.9439529533222822, "learning_rate": 5.4244861483467385e-06, "loss": 0.0876, "step": 607 }, { "epoch": 0.16304639313488872, "grad_norm": 0.7868861895706132, "learning_rate": 5.433422698838249e-06, "loss": 0.0803, "step": 608 }, { "epoch": 0.16331456154465004, "grad_norm": 0.8302736765526366, "learning_rate": 5.44235924932976e-06, "loss": 0.0674, "step": 609 }, { "epoch": 0.16358272995441137, "grad_norm": 0.6945923024705025, "learning_rate": 5.451295799821269e-06, "loss": 0.0545, "step": 610 }, { "epoch": 0.1638508983641727, "grad_norm": 0.692563574807357, "learning_rate": 5.460232350312779e-06, "loss": 0.0506, "step": 611 }, { "epoch": 0.16411906677393404, "grad_norm": 0.5964248691787993, "learning_rate": 5.46916890080429e-06, "loss": 0.0494, "step": 612 }, { "epoch": 0.16438723518369536, "grad_norm": 0.6159780108053444, "learning_rate": 5.4781054512958e-06, "loss": 0.0594, "step": 613 }, { "epoch": 0.16465540359345668, "grad_norm": 0.6806279251009878, "learning_rate": 5.487042001787311e-06, "loss": 0.0525, "step": 614 }, { "epoch": 0.16492357200321803, "grad_norm": 0.8489523265868572, "learning_rate": 5.495978552278821e-06, "loss": 0.0746, "step": 615 }, { "epoch": 0.16519174041297935, "grad_norm": 0.9277273860675815, "learning_rate": 5.504915102770331e-06, "loss": 0.0762, "step": 616 }, { "epoch": 0.16545990882274067, "grad_norm": 0.7561222996460426, "learning_rate": 5.513851653261841e-06, "loss": 0.0504, "step": 617 }, { "epoch": 0.16572807723250202, "grad_norm": 0.8105741312570891, "learning_rate": 5.522788203753352e-06, "loss": 0.0742, "step": 618 }, { "epoch": 0.16599624564226334, "grad_norm": 0.986618312108674, "learning_rate": 5.531724754244862e-06, "loss": 0.0636, "step": 619 }, { "epoch": 0.16626441405202466, "grad_norm": 0.5174299889183016, "learning_rate": 5.540661304736372e-06, "loss": 0.0478, "step": 620 }, { "epoch": 0.166532582461786, "grad_norm": 0.7972667418516348, "learning_rate": 5.549597855227883e-06, "loss": 0.0599, "step": 621 }, { "epoch": 0.16680075087154733, "grad_norm": 0.7521932679692337, "learning_rate": 5.558534405719392e-06, "loss": 0.0636, "step": 622 }, { "epoch": 0.16706891928130865, "grad_norm": 0.7596575425805953, "learning_rate": 5.567470956210903e-06, "loss": 0.0626, "step": 623 }, { "epoch": 0.16733708769107, "grad_norm": 1.0278251859880065, "learning_rate": 5.5764075067024135e-06, "loss": 0.0842, "step": 624 }, { "epoch": 0.16760525610083132, "grad_norm": 0.9691655242783864, "learning_rate": 5.585344057193923e-06, "loss": 0.0736, "step": 625 }, { "epoch": 0.16787342451059264, "grad_norm": 0.7920761320230346, "learning_rate": 5.594280607685434e-06, "loss": 0.0645, "step": 626 }, { "epoch": 0.168141592920354, "grad_norm": 0.669562952121449, "learning_rate": 5.603217158176945e-06, "loss": 0.0528, "step": 627 }, { "epoch": 0.16840976133011532, "grad_norm": 0.8605030640451825, "learning_rate": 5.612153708668454e-06, "loss": 0.0572, "step": 628 }, { "epoch": 0.16867792973987664, "grad_norm": 0.7266176227579504, "learning_rate": 5.6210902591599645e-06, "loss": 0.0687, "step": 629 }, { "epoch": 0.16894609814963799, "grad_norm": 0.543948406582875, "learning_rate": 5.6300268096514755e-06, "loss": 0.0554, "step": 630 }, { "epoch": 0.1692142665593993, "grad_norm": 0.7049075660316708, "learning_rate": 5.638963360142985e-06, "loss": 0.0685, "step": 631 }, { "epoch": 0.16948243496916063, "grad_norm": 0.7040457753482281, "learning_rate": 5.647899910634496e-06, "loss": 0.0575, "step": 632 }, { "epoch": 0.16975060337892195, "grad_norm": 0.9803494851741242, "learning_rate": 5.656836461126006e-06, "loss": 0.0803, "step": 633 }, { "epoch": 0.1700187717886833, "grad_norm": 0.7701798894334154, "learning_rate": 5.6657730116175155e-06, "loss": 0.0591, "step": 634 }, { "epoch": 0.17028694019844462, "grad_norm": 1.0212495968892932, "learning_rate": 5.6747095621090265e-06, "loss": 0.0663, "step": 635 }, { "epoch": 0.17055510860820594, "grad_norm": 0.9483956706703315, "learning_rate": 5.683646112600537e-06, "loss": 0.061, "step": 636 }, { "epoch": 0.1708232770179673, "grad_norm": 0.6614724647899509, "learning_rate": 5.692582663092047e-06, "loss": 0.0582, "step": 637 }, { "epoch": 0.1710914454277286, "grad_norm": 0.9309092810402265, "learning_rate": 5.701519213583557e-06, "loss": 0.0678, "step": 638 }, { "epoch": 0.17135961383748993, "grad_norm": 0.6903336159620008, "learning_rate": 5.710455764075068e-06, "loss": 0.0763, "step": 639 }, { "epoch": 0.17162778224725128, "grad_norm": 0.7321010050308411, "learning_rate": 5.7193923145665775e-06, "loss": 0.0696, "step": 640 }, { "epoch": 0.1718959506570126, "grad_norm": 1.2930160472906331, "learning_rate": 5.7283288650580885e-06, "loss": 0.0878, "step": 641 }, { "epoch": 0.17216411906677392, "grad_norm": 0.9092015676328017, "learning_rate": 5.737265415549599e-06, "loss": 0.083, "step": 642 }, { "epoch": 0.17243228747653527, "grad_norm": 0.6640708112231667, "learning_rate": 5.746201966041108e-06, "loss": 0.0445, "step": 643 }, { "epoch": 0.1727004558862966, "grad_norm": 0.7783794510147659, "learning_rate": 5.755138516532619e-06, "loss": 0.0522, "step": 644 }, { "epoch": 0.17296862429605792, "grad_norm": 0.8009510229888075, "learning_rate": 5.764075067024129e-06, "loss": 0.0605, "step": 645 }, { "epoch": 0.17323679270581926, "grad_norm": 1.219838757150327, "learning_rate": 5.7730116175156395e-06, "loss": 0.0697, "step": 646 }, { "epoch": 0.17350496111558059, "grad_norm": 0.7651132017369591, "learning_rate": 5.78194816800715e-06, "loss": 0.067, "step": 647 }, { "epoch": 0.1737731295253419, "grad_norm": 0.6797202399923142, "learning_rate": 5.790884718498661e-06, "loss": 0.0548, "step": 648 }, { "epoch": 0.17404129793510326, "grad_norm": 0.743548123014249, "learning_rate": 5.79982126899017e-06, "loss": 0.0632, "step": 649 }, { "epoch": 0.17430946634486458, "grad_norm": 0.6220641344644409, "learning_rate": 5.808757819481681e-06, "loss": 0.0593, "step": 650 }, { "epoch": 0.1745776347546259, "grad_norm": 0.5767877867894173, "learning_rate": 5.817694369973191e-06, "loss": 0.0491, "step": 651 }, { "epoch": 0.17484580316438725, "grad_norm": 0.9321535693181042, "learning_rate": 5.826630920464701e-06, "loss": 0.0969, "step": 652 }, { "epoch": 0.17511397157414857, "grad_norm": 0.7302847182071687, "learning_rate": 5.835567470956212e-06, "loss": 0.0526, "step": 653 }, { "epoch": 0.1753821399839099, "grad_norm": 0.9198630289552819, "learning_rate": 5.844504021447721e-06, "loss": 0.0809, "step": 654 }, { "epoch": 0.1756503083936712, "grad_norm": 0.8641497488576019, "learning_rate": 5.853440571939232e-06, "loss": 0.0572, "step": 655 }, { "epoch": 0.17591847680343256, "grad_norm": 0.9951660257601543, "learning_rate": 5.862377122430742e-06, "loss": 0.0751, "step": 656 }, { "epoch": 0.17618664521319388, "grad_norm": 0.9342921859576737, "learning_rate": 5.871313672922252e-06, "loss": 0.0696, "step": 657 }, { "epoch": 0.1764548136229552, "grad_norm": 1.0568270426819426, "learning_rate": 5.880250223413763e-06, "loss": 0.0938, "step": 658 }, { "epoch": 0.17672298203271655, "grad_norm": 1.0442793889830606, "learning_rate": 5.889186773905273e-06, "loss": 0.0534, "step": 659 }, { "epoch": 0.17699115044247787, "grad_norm": 0.846547023328437, "learning_rate": 5.898123324396783e-06, "loss": 0.0589, "step": 660 }, { "epoch": 0.1772593188522392, "grad_norm": 0.9473986989715096, "learning_rate": 5.907059874888293e-06, "loss": 0.0622, "step": 661 }, { "epoch": 0.17752748726200054, "grad_norm": 0.6554661375896103, "learning_rate": 5.915996425379804e-06, "loss": 0.0488, "step": 662 }, { "epoch": 0.17779565567176187, "grad_norm": 1.6004788984951979, "learning_rate": 5.924932975871314e-06, "loss": 0.0898, "step": 663 }, { "epoch": 0.1780638240815232, "grad_norm": 0.6196855926162149, "learning_rate": 5.933869526362825e-06, "loss": 0.0427, "step": 664 }, { "epoch": 0.17833199249128454, "grad_norm": 2.1065193315449484, "learning_rate": 5.942806076854335e-06, "loss": 0.0796, "step": 665 }, { "epoch": 0.17860016090104586, "grad_norm": 0.7052395430019095, "learning_rate": 5.951742627345844e-06, "loss": 0.0526, "step": 666 }, { "epoch": 0.17886832931080718, "grad_norm": 1.6387407274136412, "learning_rate": 5.960679177837355e-06, "loss": 0.0875, "step": 667 }, { "epoch": 0.17913649772056853, "grad_norm": 0.8190896983147427, "learning_rate": 5.9696157283288655e-06, "loss": 0.0597, "step": 668 }, { "epoch": 0.17940466613032985, "grad_norm": 0.9239746877888685, "learning_rate": 5.978552278820376e-06, "loss": 0.0589, "step": 669 }, { "epoch": 0.17967283454009117, "grad_norm": 0.7258910140441152, "learning_rate": 5.987488829311886e-06, "loss": 0.0584, "step": 670 }, { "epoch": 0.17994100294985252, "grad_norm": 0.6944481489693888, "learning_rate": 5.996425379803397e-06, "loss": 0.0619, "step": 671 }, { "epoch": 0.18020917135961384, "grad_norm": 0.5876336060985152, "learning_rate": 6.005361930294906e-06, "loss": 0.0541, "step": 672 }, { "epoch": 0.18047733976937516, "grad_norm": 1.7155744326653621, "learning_rate": 6.0142984807864165e-06, "loss": 0.083, "step": 673 }, { "epoch": 0.1807455081791365, "grad_norm": 0.798476351806146, "learning_rate": 6.0232350312779275e-06, "loss": 0.062, "step": 674 }, { "epoch": 0.18101367658889783, "grad_norm": 0.7091761098575183, "learning_rate": 6.032171581769437e-06, "loss": 0.0582, "step": 675 }, { "epoch": 0.18128184499865915, "grad_norm": 1.2051898320378824, "learning_rate": 6.041108132260948e-06, "loss": 0.0861, "step": 676 }, { "epoch": 0.18155001340842047, "grad_norm": 0.6781611369231361, "learning_rate": 6.050044682752458e-06, "loss": 0.0487, "step": 677 }, { "epoch": 0.18181818181818182, "grad_norm": 0.6771110157218314, "learning_rate": 6.058981233243968e-06, "loss": 0.054, "step": 678 }, { "epoch": 0.18208635022794314, "grad_norm": 0.7362147440205621, "learning_rate": 6.0679177837354785e-06, "loss": 0.0501, "step": 679 }, { "epoch": 0.18235451863770447, "grad_norm": 0.7620786157266373, "learning_rate": 6.0768543342269895e-06, "loss": 0.0563, "step": 680 }, { "epoch": 0.18262268704746581, "grad_norm": 1.0385514676432708, "learning_rate": 6.085790884718499e-06, "loss": 0.083, "step": 681 }, { "epoch": 0.18289085545722714, "grad_norm": 0.6984213217944104, "learning_rate": 6.094727435210009e-06, "loss": 0.0655, "step": 682 }, { "epoch": 0.18315902386698846, "grad_norm": 0.7142706630671537, "learning_rate": 6.10366398570152e-06, "loss": 0.0698, "step": 683 }, { "epoch": 0.1834271922767498, "grad_norm": 0.7154365286170392, "learning_rate": 6.1126005361930295e-06, "loss": 0.0629, "step": 684 }, { "epoch": 0.18369536068651113, "grad_norm": 0.647465816127068, "learning_rate": 6.1215370866845405e-06, "loss": 0.0572, "step": 685 }, { "epoch": 0.18396352909627245, "grad_norm": 0.7181705009097983, "learning_rate": 6.130473637176051e-06, "loss": 0.0735, "step": 686 }, { "epoch": 0.1842316975060338, "grad_norm": 0.6845166979442948, "learning_rate": 6.139410187667561e-06, "loss": 0.0492, "step": 687 }, { "epoch": 0.18449986591579512, "grad_norm": 0.6513732441259296, "learning_rate": 6.148346738159071e-06, "loss": 0.0659, "step": 688 }, { "epoch": 0.18476803432555644, "grad_norm": 0.683061812339965, "learning_rate": 6.157283288650582e-06, "loss": 0.0562, "step": 689 }, { "epoch": 0.1850362027353178, "grad_norm": 0.7399919598847905, "learning_rate": 6.1662198391420915e-06, "loss": 0.0455, "step": 690 }, { "epoch": 0.1853043711450791, "grad_norm": 0.5870580379027058, "learning_rate": 6.175156389633602e-06, "loss": 0.0597, "step": 691 }, { "epoch": 0.18557253955484043, "grad_norm": 0.7438342058561855, "learning_rate": 6.184092940125113e-06, "loss": 0.0638, "step": 692 }, { "epoch": 0.18584070796460178, "grad_norm": 0.6849801703471039, "learning_rate": 6.193029490616622e-06, "loss": 0.0651, "step": 693 }, { "epoch": 0.1861088763743631, "grad_norm": 0.6160922392252735, "learning_rate": 6.201966041108133e-06, "loss": 0.0463, "step": 694 }, { "epoch": 0.18637704478412442, "grad_norm": 0.5765825815633243, "learning_rate": 6.210902591599643e-06, "loss": 0.0436, "step": 695 }, { "epoch": 0.18664521319388577, "grad_norm": 0.5969321503429132, "learning_rate": 6.219839142091153e-06, "loss": 0.0693, "step": 696 }, { "epoch": 0.1869133816036471, "grad_norm": 0.643916489809957, "learning_rate": 6.228775692582664e-06, "loss": 0.0814, "step": 697 }, { "epoch": 0.18718155001340842, "grad_norm": 0.7132738026241066, "learning_rate": 6.237712243074175e-06, "loss": 0.0628, "step": 698 }, { "epoch": 0.18744971842316976, "grad_norm": 0.6426709438117871, "learning_rate": 6.246648793565684e-06, "loss": 0.0568, "step": 699 }, { "epoch": 0.18771788683293109, "grad_norm": 0.5918446386191234, "learning_rate": 6.255585344057194e-06, "loss": 0.0484, "step": 700 }, { "epoch": 0.1879860552426924, "grad_norm": 0.5462704738042106, "learning_rate": 6.264521894548705e-06, "loss": 0.044, "step": 701 }, { "epoch": 0.18825422365245373, "grad_norm": 0.7314206212373209, "learning_rate": 6.273458445040215e-06, "loss": 0.0727, "step": 702 }, { "epoch": 0.18852239206221508, "grad_norm": 0.6659283042547272, "learning_rate": 6.282394995531726e-06, "loss": 0.0589, "step": 703 }, { "epoch": 0.1887905604719764, "grad_norm": 0.6684930273676993, "learning_rate": 6.291331546023236e-06, "loss": 0.0661, "step": 704 }, { "epoch": 0.18905872888173772, "grad_norm": 0.8904769710559233, "learning_rate": 6.300268096514745e-06, "loss": 0.0632, "step": 705 }, { "epoch": 0.18932689729149907, "grad_norm": 0.8789493229124882, "learning_rate": 6.309204647006256e-06, "loss": 0.0676, "step": 706 }, { "epoch": 0.1895950657012604, "grad_norm": 0.6501276177777463, "learning_rate": 6.3181411974977665e-06, "loss": 0.0545, "step": 707 }, { "epoch": 0.1898632341110217, "grad_norm": 1.0843691126438044, "learning_rate": 6.327077747989277e-06, "loss": 0.0551, "step": 708 }, { "epoch": 0.19013140252078306, "grad_norm": 1.0010513130523677, "learning_rate": 6.336014298480787e-06, "loss": 0.0754, "step": 709 }, { "epoch": 0.19039957093054438, "grad_norm": 1.049178684107961, "learning_rate": 6.344950848972298e-06, "loss": 0.088, "step": 710 }, { "epoch": 0.1906677393403057, "grad_norm": 0.7670234048871027, "learning_rate": 6.353887399463807e-06, "loss": 0.063, "step": 711 }, { "epoch": 0.19093590775006705, "grad_norm": 0.71542928973344, "learning_rate": 6.362823949955318e-06, "loss": 0.047, "step": 712 }, { "epoch": 0.19120407615982837, "grad_norm": 1.4447203202553363, "learning_rate": 6.3717605004468285e-06, "loss": 0.0461, "step": 713 }, { "epoch": 0.1914722445695897, "grad_norm": 0.7640657463427055, "learning_rate": 6.380697050938338e-06, "loss": 0.0546, "step": 714 }, { "epoch": 0.19174041297935104, "grad_norm": 0.9553917219481026, "learning_rate": 6.389633601429849e-06, "loss": 0.0729, "step": 715 }, { "epoch": 0.19200858138911236, "grad_norm": 0.5881144124533128, "learning_rate": 6.398570151921358e-06, "loss": 0.0454, "step": 716 }, { "epoch": 0.1922767497988737, "grad_norm": 0.7518000587182011, "learning_rate": 6.407506702412869e-06, "loss": 0.0704, "step": 717 }, { "epoch": 0.19254491820863504, "grad_norm": 0.743107153030265, "learning_rate": 6.4164432529043795e-06, "loss": 0.0679, "step": 718 }, { "epoch": 0.19281308661839636, "grad_norm": 2.0013645612435336, "learning_rate": 6.425379803395889e-06, "loss": 0.0759, "step": 719 }, { "epoch": 0.19308125502815768, "grad_norm": 0.6417405077349332, "learning_rate": 6.4343163538874e-06, "loss": 0.0649, "step": 720 }, { "epoch": 0.19334942343791903, "grad_norm": 0.6050146092617343, "learning_rate": 6.44325290437891e-06, "loss": 0.0438, "step": 721 }, { "epoch": 0.19361759184768035, "grad_norm": 0.6985273578197806, "learning_rate": 6.45218945487042e-06, "loss": 0.0472, "step": 722 }, { "epoch": 0.19388576025744167, "grad_norm": 0.7744006084837489, "learning_rate": 6.4611260053619305e-06, "loss": 0.0696, "step": 723 }, { "epoch": 0.194153928667203, "grad_norm": 1.0599279825449373, "learning_rate": 6.4700625558534415e-06, "loss": 0.066, "step": 724 }, { "epoch": 0.19442209707696434, "grad_norm": 0.6193995176176766, "learning_rate": 6.478999106344951e-06, "loss": 0.0626, "step": 725 }, { "epoch": 0.19469026548672566, "grad_norm": 0.680651661916281, "learning_rate": 6.487935656836462e-06, "loss": 0.0609, "step": 726 }, { "epoch": 0.19495843389648698, "grad_norm": 0.7131379200257753, "learning_rate": 6.496872207327972e-06, "loss": 0.0582, "step": 727 }, { "epoch": 0.19522660230624833, "grad_norm": 0.9252839433347055, "learning_rate": 6.5058087578194814e-06, "loss": 0.0648, "step": 728 }, { "epoch": 0.19549477071600965, "grad_norm": 0.6412540232830793, "learning_rate": 6.5147453083109925e-06, "loss": 0.0605, "step": 729 }, { "epoch": 0.19576293912577097, "grad_norm": 1.0658277131055782, "learning_rate": 6.523681858802503e-06, "loss": 0.0836, "step": 730 }, { "epoch": 0.19603110753553232, "grad_norm": 1.922141508857633, "learning_rate": 6.532618409294013e-06, "loss": 0.0904, "step": 731 }, { "epoch": 0.19629927594529364, "grad_norm": 0.5708354262791332, "learning_rate": 6.541554959785523e-06, "loss": 0.0552, "step": 732 }, { "epoch": 0.19656744435505497, "grad_norm": 0.6681504053636753, "learning_rate": 6.550491510277034e-06, "loss": 0.0491, "step": 733 }, { "epoch": 0.19683561276481631, "grad_norm": 0.588696101953411, "learning_rate": 6.5594280607685435e-06, "loss": 0.0457, "step": 734 }, { "epoch": 0.19710378117457764, "grad_norm": 0.7627814762672769, "learning_rate": 6.5683646112600545e-06, "loss": 0.0627, "step": 735 }, { "epoch": 0.19737194958433896, "grad_norm": 0.8569070314146487, "learning_rate": 6.577301161751565e-06, "loss": 0.0685, "step": 736 }, { "epoch": 0.1976401179941003, "grad_norm": 0.8085062704934447, "learning_rate": 6.586237712243074e-06, "loss": 0.0781, "step": 737 }, { "epoch": 0.19790828640386163, "grad_norm": 0.9934832587248366, "learning_rate": 6.595174262734585e-06, "loss": 0.063, "step": 738 }, { "epoch": 0.19817645481362295, "grad_norm": 0.7144092411098115, "learning_rate": 6.604110813226095e-06, "loss": 0.0632, "step": 739 }, { "epoch": 0.1984446232233843, "grad_norm": 0.45612866487212417, "learning_rate": 6.6130473637176055e-06, "loss": 0.0432, "step": 740 }, { "epoch": 0.19871279163314562, "grad_norm": 0.6463781410835909, "learning_rate": 6.621983914209116e-06, "loss": 0.0688, "step": 741 }, { "epoch": 0.19898096004290694, "grad_norm": 0.6339875479418067, "learning_rate": 6.630920464700627e-06, "loss": 0.0554, "step": 742 }, { "epoch": 0.1992491284526683, "grad_norm": 0.6039134839528961, "learning_rate": 6.639857015192136e-06, "loss": 0.0509, "step": 743 }, { "epoch": 0.1995172968624296, "grad_norm": 0.9204030128839711, "learning_rate": 6.648793565683646e-06, "loss": 0.0691, "step": 744 }, { "epoch": 0.19978546527219093, "grad_norm": 0.6527283096508315, "learning_rate": 6.657730116175157e-06, "loss": 0.0672, "step": 745 }, { "epoch": 0.20005363368195225, "grad_norm": 0.5656081531362551, "learning_rate": 6.666666666666667e-06, "loss": 0.0493, "step": 746 }, { "epoch": 0.2003218020917136, "grad_norm": 0.5388498726659108, "learning_rate": 6.675603217158178e-06, "loss": 0.0505, "step": 747 }, { "epoch": 0.20058997050147492, "grad_norm": 0.634389081117935, "learning_rate": 6.684539767649688e-06, "loss": 0.0598, "step": 748 }, { "epoch": 0.20085813891123624, "grad_norm": 0.7855810387645783, "learning_rate": 6.693476318141198e-06, "loss": 0.0577, "step": 749 }, { "epoch": 0.2011263073209976, "grad_norm": 0.9079828407382363, "learning_rate": 6.702412868632708e-06, "loss": 0.0572, "step": 750 }, { "epoch": 0.20139447573075892, "grad_norm": 0.670347649066555, "learning_rate": 6.711349419124219e-06, "loss": 0.0832, "step": 751 }, { "epoch": 0.20166264414052024, "grad_norm": 0.6314476472333589, "learning_rate": 6.720285969615729e-06, "loss": 0.064, "step": 752 }, { "epoch": 0.20193081255028159, "grad_norm": 0.6554007399542807, "learning_rate": 6.729222520107239e-06, "loss": 0.0595, "step": 753 }, { "epoch": 0.2021989809600429, "grad_norm": 0.9287464566943074, "learning_rate": 6.73815907059875e-06, "loss": 0.0673, "step": 754 }, { "epoch": 0.20246714936980423, "grad_norm": 1.0299474180112156, "learning_rate": 6.747095621090259e-06, "loss": 0.0768, "step": 755 }, { "epoch": 0.20273531777956558, "grad_norm": 0.6450382377010188, "learning_rate": 6.75603217158177e-06, "loss": 0.0467, "step": 756 }, { "epoch": 0.2030034861893269, "grad_norm": 0.9463199608458214, "learning_rate": 6.7649687220732805e-06, "loss": 0.0572, "step": 757 }, { "epoch": 0.20327165459908822, "grad_norm": 0.7266263781091684, "learning_rate": 6.77390527256479e-06, "loss": 0.0542, "step": 758 }, { "epoch": 0.20353982300884957, "grad_norm": 0.8903936647426972, "learning_rate": 6.782841823056301e-06, "loss": 0.073, "step": 759 }, { "epoch": 0.2038079914186109, "grad_norm": 1.032084666218354, "learning_rate": 6.791778373547812e-06, "loss": 0.0625, "step": 760 }, { "epoch": 0.2040761598283722, "grad_norm": 0.8230260287218641, "learning_rate": 6.800714924039321e-06, "loss": 0.08, "step": 761 }, { "epoch": 0.20434432823813356, "grad_norm": 0.7691609736623329, "learning_rate": 6.8096514745308315e-06, "loss": 0.0795, "step": 762 }, { "epoch": 0.20461249664789488, "grad_norm": 0.9735415188732415, "learning_rate": 6.8185880250223425e-06, "loss": 0.0563, "step": 763 }, { "epoch": 0.2048806650576562, "grad_norm": 0.6016653128156609, "learning_rate": 6.827524575513852e-06, "loss": 0.0581, "step": 764 }, { "epoch": 0.20514883346741755, "grad_norm": 1.0826231146782523, "learning_rate": 6.836461126005363e-06, "loss": 0.0632, "step": 765 }, { "epoch": 0.20541700187717887, "grad_norm": 0.9275693654000192, "learning_rate": 6.845397676496873e-06, "loss": 0.0575, "step": 766 }, { "epoch": 0.2056851702869402, "grad_norm": 0.8153801558858359, "learning_rate": 6.8543342269883824e-06, "loss": 0.0649, "step": 767 }, { "epoch": 0.20595333869670152, "grad_norm": 0.6880063379564347, "learning_rate": 6.8632707774798935e-06, "loss": 0.0616, "step": 768 }, { "epoch": 0.20622150710646286, "grad_norm": 0.9700171857880753, "learning_rate": 6.8722073279714045e-06, "loss": 0.0756, "step": 769 }, { "epoch": 0.20648967551622419, "grad_norm": 1.0846412565864332, "learning_rate": 6.881143878462914e-06, "loss": 0.0633, "step": 770 }, { "epoch": 0.2067578439259855, "grad_norm": 0.6348597667757498, "learning_rate": 6.890080428954424e-06, "loss": 0.0661, "step": 771 }, { "epoch": 0.20702601233574686, "grad_norm": 0.797333089257687, "learning_rate": 6.899016979445935e-06, "loss": 0.0672, "step": 772 }, { "epoch": 0.20729418074550818, "grad_norm": 0.5463356833367039, "learning_rate": 6.9079535299374445e-06, "loss": 0.0447, "step": 773 }, { "epoch": 0.2075623491552695, "grad_norm": 0.6191221910365377, "learning_rate": 6.9168900804289555e-06, "loss": 0.0571, "step": 774 }, { "epoch": 0.20783051756503085, "grad_norm": 0.5603309360612991, "learning_rate": 6.925826630920466e-06, "loss": 0.0518, "step": 775 }, { "epoch": 0.20809868597479217, "grad_norm": 0.9648687674676005, "learning_rate": 6.934763181411975e-06, "loss": 0.0646, "step": 776 }, { "epoch": 0.2083668543845535, "grad_norm": 0.45845159925062956, "learning_rate": 6.943699731903486e-06, "loss": 0.0405, "step": 777 }, { "epoch": 0.20863502279431484, "grad_norm": 0.4927102770396721, "learning_rate": 6.9526362823949954e-06, "loss": 0.0505, "step": 778 }, { "epoch": 0.20890319120407616, "grad_norm": 0.552962488159094, "learning_rate": 6.9615728328865065e-06, "loss": 0.0584, "step": 779 }, { "epoch": 0.20917135961383748, "grad_norm": 0.6149816687747405, "learning_rate": 6.970509383378017e-06, "loss": 0.0596, "step": 780 }, { "epoch": 0.20943952802359883, "grad_norm": 0.7737296116789633, "learning_rate": 6.979445933869526e-06, "loss": 0.0571, "step": 781 }, { "epoch": 0.20970769643336015, "grad_norm": 0.6835090103999594, "learning_rate": 6.988382484361037e-06, "loss": 0.065, "step": 782 }, { "epoch": 0.20997586484312147, "grad_norm": 0.7096155170503166, "learning_rate": 6.997319034852548e-06, "loss": 0.0529, "step": 783 }, { "epoch": 0.21024403325288282, "grad_norm": 0.6805209391388485, "learning_rate": 7.0062555853440575e-06, "loss": 0.056, "step": 784 }, { "epoch": 0.21051220166264414, "grad_norm": 0.7119773150022219, "learning_rate": 7.015192135835568e-06, "loss": 0.0707, "step": 785 }, { "epoch": 0.21078037007240547, "grad_norm": 0.554659736143505, "learning_rate": 7.024128686327079e-06, "loss": 0.048, "step": 786 }, { "epoch": 0.21104853848216681, "grad_norm": 0.6601577051403399, "learning_rate": 7.033065236818588e-06, "loss": 0.0462, "step": 787 }, { "epoch": 0.21131670689192814, "grad_norm": 0.9211809589193016, "learning_rate": 7.042001787310099e-06, "loss": 0.0677, "step": 788 }, { "epoch": 0.21158487530168946, "grad_norm": 0.6163791453478012, "learning_rate": 7.050938337801609e-06, "loss": 0.0521, "step": 789 }, { "epoch": 0.21185304371145078, "grad_norm": 0.40380838402768837, "learning_rate": 7.059874888293119e-06, "loss": 0.0384, "step": 790 }, { "epoch": 0.21212121212121213, "grad_norm": 0.833909629797656, "learning_rate": 7.06881143878463e-06, "loss": 0.0537, "step": 791 }, { "epoch": 0.21238938053097345, "grad_norm": 1.1530861121014904, "learning_rate": 7.07774798927614e-06, "loss": 0.0951, "step": 792 }, { "epoch": 0.21265754894073477, "grad_norm": 0.8188636801174013, "learning_rate": 7.08668453976765e-06, "loss": 0.0697, "step": 793 }, { "epoch": 0.21292571735049612, "grad_norm": 0.7751355457348119, "learning_rate": 7.09562109025916e-06, "loss": 0.0716, "step": 794 }, { "epoch": 0.21319388576025744, "grad_norm": 0.7196255635276604, "learning_rate": 7.104557640750671e-06, "loss": 0.0646, "step": 795 }, { "epoch": 0.21346205417001876, "grad_norm": 0.7736053275119991, "learning_rate": 7.113494191242181e-06, "loss": 0.0627, "step": 796 }, { "epoch": 0.2137302225797801, "grad_norm": 0.8514683615561239, "learning_rate": 7.122430741733692e-06, "loss": 0.0685, "step": 797 }, { "epoch": 0.21399839098954143, "grad_norm": 0.5222887146153904, "learning_rate": 7.131367292225202e-06, "loss": 0.0544, "step": 798 }, { "epoch": 0.21426655939930275, "grad_norm": 0.6663872399483806, "learning_rate": 7.140303842716711e-06, "loss": 0.0552, "step": 799 }, { "epoch": 0.2145347278090641, "grad_norm": 0.7370556435111065, "learning_rate": 7.149240393208222e-06, "loss": 0.0586, "step": 800 }, { "epoch": 0.21480289621882542, "grad_norm": 0.6546419429478776, "learning_rate": 7.1581769436997325e-06, "loss": 0.0605, "step": 801 }, { "epoch": 0.21507106462858674, "grad_norm": 0.8946085794405229, "learning_rate": 7.167113494191243e-06, "loss": 0.0752, "step": 802 }, { "epoch": 0.2153392330383481, "grad_norm": 0.541265191858049, "learning_rate": 7.176050044682753e-06, "loss": 0.0596, "step": 803 }, { "epoch": 0.21560740144810941, "grad_norm": 0.9304484664641347, "learning_rate": 7.184986595174264e-06, "loss": 0.0722, "step": 804 }, { "epoch": 0.21587556985787074, "grad_norm": 0.6079871798137215, "learning_rate": 7.193923145665773e-06, "loss": 0.0531, "step": 805 }, { "epoch": 0.21614373826763209, "grad_norm": 0.675408167444496, "learning_rate": 7.2028596961572834e-06, "loss": 0.0514, "step": 806 }, { "epoch": 0.2164119066773934, "grad_norm": 0.6361953625181196, "learning_rate": 7.2117962466487945e-06, "loss": 0.055, "step": 807 }, { "epoch": 0.21668007508715473, "grad_norm": 0.700218172568977, "learning_rate": 7.220732797140304e-06, "loss": 0.0734, "step": 808 }, { "epoch": 0.21694824349691608, "grad_norm": 0.7624824922129924, "learning_rate": 7.229669347631815e-06, "loss": 0.0519, "step": 809 }, { "epoch": 0.2172164119066774, "grad_norm": 0.7079601223869098, "learning_rate": 7.238605898123325e-06, "loss": 0.0584, "step": 810 }, { "epoch": 0.21748458031643872, "grad_norm": 0.5521740348722388, "learning_rate": 7.247542448614835e-06, "loss": 0.0448, "step": 811 }, { "epoch": 0.21775274872620004, "grad_norm": 0.778954048716299, "learning_rate": 7.2564789991063455e-06, "loss": 0.0553, "step": 812 }, { "epoch": 0.2180209171359614, "grad_norm": 0.7355032627907264, "learning_rate": 7.2654155495978565e-06, "loss": 0.061, "step": 813 }, { "epoch": 0.2182890855457227, "grad_norm": 0.7668956497469606, "learning_rate": 7.274352100089366e-06, "loss": 0.0505, "step": 814 }, { "epoch": 0.21855725395548403, "grad_norm": 0.4678948657869099, "learning_rate": 7.283288650580876e-06, "loss": 0.0388, "step": 815 }, { "epoch": 0.21882542236524538, "grad_norm": 0.6300577022663109, "learning_rate": 7.292225201072387e-06, "loss": 0.0629, "step": 816 }, { "epoch": 0.2190935907750067, "grad_norm": 0.8240611502389711, "learning_rate": 7.3011617515638964e-06, "loss": 0.0811, "step": 817 }, { "epoch": 0.21936175918476802, "grad_norm": 0.5334912625091875, "learning_rate": 7.3100983020554075e-06, "loss": 0.0597, "step": 818 }, { "epoch": 0.21962992759452937, "grad_norm": 0.9344188331640078, "learning_rate": 7.319034852546918e-06, "loss": 0.0584, "step": 819 }, { "epoch": 0.2198980960042907, "grad_norm": 0.6195094826631371, "learning_rate": 7.327971403038428e-06, "loss": 0.093, "step": 820 }, { "epoch": 0.22016626441405202, "grad_norm": 0.7350945497655491, "learning_rate": 7.336907953529938e-06, "loss": 0.056, "step": 821 }, { "epoch": 0.22043443282381336, "grad_norm": 0.7389468481777717, "learning_rate": 7.345844504021449e-06, "loss": 0.0609, "step": 822 }, { "epoch": 0.22070260123357469, "grad_norm": 0.5480749027899575, "learning_rate": 7.3547810545129585e-06, "loss": 0.0607, "step": 823 }, { "epoch": 0.220970769643336, "grad_norm": 0.6358348155909911, "learning_rate": 7.363717605004469e-06, "loss": 0.0579, "step": 824 }, { "epoch": 0.22123893805309736, "grad_norm": 0.9408099167799561, "learning_rate": 7.37265415549598e-06, "loss": 0.0679, "step": 825 }, { "epoch": 0.22150710646285868, "grad_norm": 0.5620699642124926, "learning_rate": 7.381590705987489e-06, "loss": 0.0449, "step": 826 }, { "epoch": 0.22177527487262, "grad_norm": 0.5964240925617175, "learning_rate": 7.390527256479e-06, "loss": 0.0489, "step": 827 }, { "epoch": 0.22204344328238135, "grad_norm": 0.6139468901504379, "learning_rate": 7.39946380697051e-06, "loss": 0.0548, "step": 828 }, { "epoch": 0.22231161169214267, "grad_norm": 0.6963174669383083, "learning_rate": 7.40840035746202e-06, "loss": 0.0489, "step": 829 }, { "epoch": 0.222579780101904, "grad_norm": 0.7612530912502669, "learning_rate": 7.417336907953531e-06, "loss": 0.0595, "step": 830 }, { "epoch": 0.22284794851166534, "grad_norm": 0.7539832432683371, "learning_rate": 7.426273458445042e-06, "loss": 0.0565, "step": 831 }, { "epoch": 0.22311611692142666, "grad_norm": 0.5610360101535783, "learning_rate": 7.435210008936551e-06, "loss": 0.0481, "step": 832 }, { "epoch": 0.22338428533118798, "grad_norm": 0.6708051272039379, "learning_rate": 7.444146559428061e-06, "loss": 0.0536, "step": 833 }, { "epoch": 0.2236524537409493, "grad_norm": 0.7948128165152702, "learning_rate": 7.453083109919572e-06, "loss": 0.0571, "step": 834 }, { "epoch": 0.22392062215071065, "grad_norm": 0.8119772632790323, "learning_rate": 7.462019660411082e-06, "loss": 0.0607, "step": 835 }, { "epoch": 0.22418879056047197, "grad_norm": 0.5938019231714236, "learning_rate": 7.470956210902593e-06, "loss": 0.0514, "step": 836 }, { "epoch": 0.2244569589702333, "grad_norm": 0.480339720088699, "learning_rate": 7.479892761394103e-06, "loss": 0.0572, "step": 837 }, { "epoch": 0.22472512737999464, "grad_norm": 0.7321419282499156, "learning_rate": 7.488829311885612e-06, "loss": 0.0614, "step": 838 }, { "epoch": 0.22499329578975596, "grad_norm": 0.7862174304581754, "learning_rate": 7.497765862377123e-06, "loss": 0.0597, "step": 839 }, { "epoch": 0.2252614641995173, "grad_norm": 0.6837691085703087, "learning_rate": 7.506702412868633e-06, "loss": 0.0571, "step": 840 }, { "epoch": 0.22552963260927864, "grad_norm": 0.5766328720356244, "learning_rate": 7.515638963360144e-06, "loss": 0.0612, "step": 841 }, { "epoch": 0.22579780101903996, "grad_norm": 0.8518640741431467, "learning_rate": 7.524575513851654e-06, "loss": 0.0652, "step": 842 }, { "epoch": 0.22606596942880128, "grad_norm": 0.6266510418612837, "learning_rate": 7.533512064343163e-06, "loss": 0.076, "step": 843 }, { "epoch": 0.22633413783856263, "grad_norm": 0.6071164995865543, "learning_rate": 7.542448614834674e-06, "loss": 0.0522, "step": 844 }, { "epoch": 0.22660230624832395, "grad_norm": 0.6121507959380281, "learning_rate": 7.551385165326185e-06, "loss": 0.0518, "step": 845 }, { "epoch": 0.22687047465808527, "grad_norm": 0.5731850553581667, "learning_rate": 7.560321715817695e-06, "loss": 0.0452, "step": 846 }, { "epoch": 0.22713864306784662, "grad_norm": 0.8535266943982462, "learning_rate": 7.569258266309205e-06, "loss": 0.0718, "step": 847 }, { "epoch": 0.22740681147760794, "grad_norm": 0.5806179329893834, "learning_rate": 7.578194816800716e-06, "loss": 0.0647, "step": 848 }, { "epoch": 0.22767497988736926, "grad_norm": 0.6007673294586532, "learning_rate": 7.587131367292225e-06, "loss": 0.0422, "step": 849 }, { "epoch": 0.2279431482971306, "grad_norm": 0.7635953689482369, "learning_rate": 7.596067917783736e-06, "loss": 0.0711, "step": 850 }, { "epoch": 0.22821131670689193, "grad_norm": 1.1178338935923162, "learning_rate": 7.6050044682752465e-06, "loss": 0.0692, "step": 851 }, { "epoch": 0.22847948511665325, "grad_norm": 0.5318873712770549, "learning_rate": 7.613941018766756e-06, "loss": 0.0521, "step": 852 }, { "epoch": 0.2287476535264146, "grad_norm": 0.8985212543648291, "learning_rate": 7.622877569258267e-06, "loss": 0.06, "step": 853 }, { "epoch": 0.22901582193617592, "grad_norm": 1.1797356361978766, "learning_rate": 7.631814119749778e-06, "loss": 0.0695, "step": 854 }, { "epoch": 0.22928399034593724, "grad_norm": 0.8184713676267756, "learning_rate": 7.640750670241287e-06, "loss": 0.0704, "step": 855 }, { "epoch": 0.22955215875569857, "grad_norm": 1.0384965914146926, "learning_rate": 7.649687220732798e-06, "loss": 0.0862, "step": 856 }, { "epoch": 0.22982032716545991, "grad_norm": 0.7610325618010567, "learning_rate": 7.658623771224308e-06, "loss": 0.0585, "step": 857 }, { "epoch": 0.23008849557522124, "grad_norm": 0.7263880463799376, "learning_rate": 7.667560321715819e-06, "loss": 0.0623, "step": 858 }, { "epoch": 0.23035666398498256, "grad_norm": 0.4807807023999299, "learning_rate": 7.676496872207328e-06, "loss": 0.0392, "step": 859 }, { "epoch": 0.2306248323947439, "grad_norm": 0.915275660737601, "learning_rate": 7.685433422698839e-06, "loss": 0.0793, "step": 860 }, { "epoch": 0.23089300080450523, "grad_norm": 0.6927968400855913, "learning_rate": 7.694369973190348e-06, "loss": 0.0564, "step": 861 }, { "epoch": 0.23116116921426655, "grad_norm": 0.6241870950750825, "learning_rate": 7.70330652368186e-06, "loss": 0.0479, "step": 862 }, { "epoch": 0.2314293376240279, "grad_norm": 0.7556368603029495, "learning_rate": 7.71224307417337e-06, "loss": 0.0559, "step": 863 }, { "epoch": 0.23169750603378922, "grad_norm": 0.7097977952793527, "learning_rate": 7.72117962466488e-06, "loss": 0.0659, "step": 864 }, { "epoch": 0.23196567444355054, "grad_norm": 0.5177822919638604, "learning_rate": 7.730116175156391e-06, "loss": 0.043, "step": 865 }, { "epoch": 0.2322338428533119, "grad_norm": 0.5847983833993663, "learning_rate": 7.7390527256479e-06, "loss": 0.0534, "step": 866 }, { "epoch": 0.2325020112630732, "grad_norm": 0.823173932051227, "learning_rate": 7.74798927613941e-06, "loss": 0.0745, "step": 867 }, { "epoch": 0.23277017967283453, "grad_norm": 0.5009681755231303, "learning_rate": 7.75692582663092e-06, "loss": 0.0528, "step": 868 }, { "epoch": 0.23303834808259588, "grad_norm": 1.0199982189387549, "learning_rate": 7.765862377122432e-06, "loss": 0.073, "step": 869 }, { "epoch": 0.2333065164923572, "grad_norm": 0.6737003312932051, "learning_rate": 7.774798927613941e-06, "loss": 0.0673, "step": 870 }, { "epoch": 0.23357468490211852, "grad_norm": 0.6752496192867327, "learning_rate": 7.783735478105452e-06, "loss": 0.041, "step": 871 }, { "epoch": 0.23384285331187987, "grad_norm": 0.5037915271983077, "learning_rate": 7.792672028596963e-06, "loss": 0.0401, "step": 872 }, { "epoch": 0.2341110217216412, "grad_norm": 0.8169239919037079, "learning_rate": 7.801608579088472e-06, "loss": 0.0685, "step": 873 }, { "epoch": 0.23437919013140252, "grad_norm": 0.8391500186631106, "learning_rate": 7.810545129579983e-06, "loss": 0.0545, "step": 874 }, { "epoch": 0.23464735854116386, "grad_norm": 0.8150667820952092, "learning_rate": 7.819481680071493e-06, "loss": 0.0652, "step": 875 }, { "epoch": 0.23491552695092519, "grad_norm": 0.6264426857860458, "learning_rate": 7.828418230563002e-06, "loss": 0.0547, "step": 876 }, { "epoch": 0.2351836953606865, "grad_norm": 0.8134112683694246, "learning_rate": 7.837354781054513e-06, "loss": 0.057, "step": 877 }, { "epoch": 0.23545186377044783, "grad_norm": 0.6083904370377538, "learning_rate": 7.846291331546024e-06, "loss": 0.0789, "step": 878 }, { "epoch": 0.23572003218020918, "grad_norm": 0.6001353311499135, "learning_rate": 7.855227882037534e-06, "loss": 0.0482, "step": 879 }, { "epoch": 0.2359882005899705, "grad_norm": 0.6620588703489341, "learning_rate": 7.864164432529045e-06, "loss": 0.0643, "step": 880 }, { "epoch": 0.23625636899973182, "grad_norm": 0.789291932526228, "learning_rate": 7.873100983020556e-06, "loss": 0.0633, "step": 881 }, { "epoch": 0.23652453740949317, "grad_norm": 0.5284387863787268, "learning_rate": 7.882037533512065e-06, "loss": 0.0502, "step": 882 }, { "epoch": 0.2367927058192545, "grad_norm": 0.6725341413669234, "learning_rate": 7.890974084003576e-06, "loss": 0.0453, "step": 883 }, { "epoch": 0.2370608742290158, "grad_norm": 0.524813557038549, "learning_rate": 7.899910634495085e-06, "loss": 0.0462, "step": 884 }, { "epoch": 0.23732904263877716, "grad_norm": 1.1488010368727157, "learning_rate": 7.908847184986595e-06, "loss": 0.0847, "step": 885 }, { "epoch": 0.23759721104853848, "grad_norm": 0.5537651629181534, "learning_rate": 7.917783735478106e-06, "loss": 0.0467, "step": 886 }, { "epoch": 0.2378653794582998, "grad_norm": 0.7749166298656239, "learning_rate": 7.926720285969617e-06, "loss": 0.0718, "step": 887 }, { "epoch": 0.23813354786806115, "grad_norm": 0.7645899977632732, "learning_rate": 7.935656836461126e-06, "loss": 0.069, "step": 888 }, { "epoch": 0.23840171627782247, "grad_norm": 0.6075901641135533, "learning_rate": 7.944593386952637e-06, "loss": 0.0655, "step": 889 }, { "epoch": 0.2386698846875838, "grad_norm": 0.8423462141079968, "learning_rate": 7.953529937444148e-06, "loss": 0.0801, "step": 890 }, { "epoch": 0.23893805309734514, "grad_norm": 0.6073224284984499, "learning_rate": 7.962466487935658e-06, "loss": 0.06, "step": 891 }, { "epoch": 0.23920622150710646, "grad_norm": 1.1648594726131332, "learning_rate": 7.971403038427169e-06, "loss": 0.0619, "step": 892 }, { "epoch": 0.23947438991686779, "grad_norm": 0.8909215026738083, "learning_rate": 7.980339588918678e-06, "loss": 0.0604, "step": 893 }, { "epoch": 0.23974255832662913, "grad_norm": 0.8066015283724232, "learning_rate": 7.989276139410187e-06, "loss": 0.0616, "step": 894 }, { "epoch": 0.24001072673639046, "grad_norm": 0.604755819715382, "learning_rate": 7.998212689901698e-06, "loss": 0.0557, "step": 895 }, { "epoch": 0.24027889514615178, "grad_norm": 0.7222841586906538, "learning_rate": 8.00714924039321e-06, "loss": 0.0585, "step": 896 }, { "epoch": 0.24054706355591313, "grad_norm": 0.7352634596043768, "learning_rate": 8.016085790884719e-06, "loss": 0.0494, "step": 897 }, { "epoch": 0.24081523196567445, "grad_norm": 0.7017740197602509, "learning_rate": 8.02502234137623e-06, "loss": 0.0526, "step": 898 }, { "epoch": 0.24108340037543577, "grad_norm": 0.6464193318091813, "learning_rate": 8.033958891867741e-06, "loss": 0.0443, "step": 899 }, { "epoch": 0.2413515687851971, "grad_norm": 0.7951271709068436, "learning_rate": 8.04289544235925e-06, "loss": 0.0574, "step": 900 }, { "epoch": 0.24161973719495844, "grad_norm": 0.6137734565500936, "learning_rate": 8.05183199285076e-06, "loss": 0.0438, "step": 901 }, { "epoch": 0.24188790560471976, "grad_norm": 0.7820226516609139, "learning_rate": 8.06076854334227e-06, "loss": 0.0545, "step": 902 }, { "epoch": 0.24215607401448108, "grad_norm": 0.563780104756091, "learning_rate": 8.06970509383378e-06, "loss": 0.0524, "step": 903 }, { "epoch": 0.24242424242424243, "grad_norm": 0.6931644539860238, "learning_rate": 8.078641644325291e-06, "loss": 0.0592, "step": 904 }, { "epoch": 0.24269241083400375, "grad_norm": 0.6859909114052662, "learning_rate": 8.0875781948168e-06, "loss": 0.0568, "step": 905 }, { "epoch": 0.24296057924376507, "grad_norm": 0.928319627995896, "learning_rate": 8.096514745308311e-06, "loss": 0.0869, "step": 906 }, { "epoch": 0.24322874765352642, "grad_norm": 0.691708928671463, "learning_rate": 8.105451295799822e-06, "loss": 0.0601, "step": 907 }, { "epoch": 0.24349691606328774, "grad_norm": 0.6665719493309162, "learning_rate": 8.114387846291332e-06, "loss": 0.0421, "step": 908 }, { "epoch": 0.24376508447304907, "grad_norm": 0.7438422072025176, "learning_rate": 8.123324396782843e-06, "loss": 0.0711, "step": 909 }, { "epoch": 0.24403325288281041, "grad_norm": 0.6446036821278024, "learning_rate": 8.132260947274352e-06, "loss": 0.0508, "step": 910 }, { "epoch": 0.24430142129257174, "grad_norm": 0.6005692800639328, "learning_rate": 8.141197497765863e-06, "loss": 0.0661, "step": 911 }, { "epoch": 0.24456958970233306, "grad_norm": 0.8938961327249468, "learning_rate": 8.150134048257373e-06, "loss": 0.0564, "step": 912 }, { "epoch": 0.2448377581120944, "grad_norm": 0.5968895734141487, "learning_rate": 8.159070598748884e-06, "loss": 0.0367, "step": 913 }, { "epoch": 0.24510592652185573, "grad_norm": 0.7871740531855175, "learning_rate": 8.168007149240393e-06, "loss": 0.0559, "step": 914 }, { "epoch": 0.24537409493161705, "grad_norm": 0.637551375796371, "learning_rate": 8.176943699731904e-06, "loss": 0.0501, "step": 915 }, { "epoch": 0.2456422633413784, "grad_norm": 0.8154387546175497, "learning_rate": 8.185880250223415e-06, "loss": 0.0687, "step": 916 }, { "epoch": 0.24591043175113972, "grad_norm": 0.586194381551852, "learning_rate": 8.194816800714924e-06, "loss": 0.0493, "step": 917 }, { "epoch": 0.24617860016090104, "grad_norm": 0.8807508292400504, "learning_rate": 8.203753351206435e-06, "loss": 0.0597, "step": 918 }, { "epoch": 0.2464467685706624, "grad_norm": 0.7654807499221806, "learning_rate": 8.212689901697945e-06, "loss": 0.0578, "step": 919 }, { "epoch": 0.2467149369804237, "grad_norm": 1.8894647306254642, "learning_rate": 8.221626452189456e-06, "loss": 0.0732, "step": 920 }, { "epoch": 0.24698310539018503, "grad_norm": 0.8042343108910344, "learning_rate": 8.230563002680965e-06, "loss": 0.0603, "step": 921 }, { "epoch": 0.24725127379994635, "grad_norm": 0.5974569485535882, "learning_rate": 8.239499553172476e-06, "loss": 0.0555, "step": 922 }, { "epoch": 0.2475194422097077, "grad_norm": 0.6081532476159442, "learning_rate": 8.248436103663986e-06, "loss": 0.0415, "step": 923 }, { "epoch": 0.24778761061946902, "grad_norm": 0.672627802864003, "learning_rate": 8.257372654155497e-06, "loss": 0.0601, "step": 924 }, { "epoch": 0.24805577902923034, "grad_norm": 0.7018590878081592, "learning_rate": 8.266309204647008e-06, "loss": 0.058, "step": 925 }, { "epoch": 0.2483239474389917, "grad_norm": 0.6022603429913677, "learning_rate": 8.275245755138517e-06, "loss": 0.0574, "step": 926 }, { "epoch": 0.24859211584875301, "grad_norm": 0.7660595830126783, "learning_rate": 8.284182305630028e-06, "loss": 0.0543, "step": 927 }, { "epoch": 0.24886028425851434, "grad_norm": 0.8697793367321248, "learning_rate": 8.293118856121537e-06, "loss": 0.0647, "step": 928 }, { "epoch": 0.24912845266827569, "grad_norm": 0.7249985205243299, "learning_rate": 8.302055406613048e-06, "loss": 0.0578, "step": 929 }, { "epoch": 0.249396621078037, "grad_norm": 0.790442464060151, "learning_rate": 8.310991957104558e-06, "loss": 0.0514, "step": 930 }, { "epoch": 0.24966478948779833, "grad_norm": 0.6139450930385338, "learning_rate": 8.319928507596069e-06, "loss": 0.0462, "step": 931 }, { "epoch": 0.24993295789755968, "grad_norm": 0.6896407577807269, "learning_rate": 8.328865058087578e-06, "loss": 0.0586, "step": 932 }, { "epoch": 0.250201126307321, "grad_norm": 0.7650698727407607, "learning_rate": 8.33780160857909e-06, "loss": 0.0634, "step": 933 }, { "epoch": 0.25046929471708235, "grad_norm": 0.4670828956333003, "learning_rate": 8.3467381590706e-06, "loss": 0.0362, "step": 934 }, { "epoch": 0.25073746312684364, "grad_norm": 0.6958426960844856, "learning_rate": 8.35567470956211e-06, "loss": 0.0573, "step": 935 }, { "epoch": 0.251005631536605, "grad_norm": 0.8764140587163858, "learning_rate": 8.36461126005362e-06, "loss": 0.0453, "step": 936 }, { "epoch": 0.25127379994636634, "grad_norm": 0.7673356156776654, "learning_rate": 8.37354781054513e-06, "loss": 0.0681, "step": 937 }, { "epoch": 0.25154196835612763, "grad_norm": 0.4478473814274472, "learning_rate": 8.38248436103664e-06, "loss": 0.0404, "step": 938 }, { "epoch": 0.251810136765889, "grad_norm": 0.668969094249436, "learning_rate": 8.39142091152815e-06, "loss": 0.0652, "step": 939 }, { "epoch": 0.25207830517565033, "grad_norm": 0.7538091479871053, "learning_rate": 8.400357462019661e-06, "loss": 0.0676, "step": 940 }, { "epoch": 0.2523464735854116, "grad_norm": 0.4333366253309145, "learning_rate": 8.40929401251117e-06, "loss": 0.0456, "step": 941 }, { "epoch": 0.252614641995173, "grad_norm": 0.7700579996556718, "learning_rate": 8.418230563002682e-06, "loss": 0.0641, "step": 942 }, { "epoch": 0.2528828104049343, "grad_norm": 0.5276077661127693, "learning_rate": 8.427167113494193e-06, "loss": 0.0492, "step": 943 }, { "epoch": 0.2531509788146956, "grad_norm": 0.6869835500427105, "learning_rate": 8.436103663985702e-06, "loss": 0.0503, "step": 944 }, { "epoch": 0.25341914722445696, "grad_norm": 1.2973263013390004, "learning_rate": 8.445040214477213e-06, "loss": 0.0574, "step": 945 }, { "epoch": 0.2536873156342183, "grad_norm": 0.6292400183413236, "learning_rate": 8.453976764968723e-06, "loss": 0.0494, "step": 946 }, { "epoch": 0.2539554840439796, "grad_norm": 0.8170201856506285, "learning_rate": 8.462913315460232e-06, "loss": 0.069, "step": 947 }, { "epoch": 0.25422365245374096, "grad_norm": 0.8883212693708068, "learning_rate": 8.471849865951743e-06, "loss": 0.0633, "step": 948 }, { "epoch": 0.2544918208635023, "grad_norm": 0.7630928207909858, "learning_rate": 8.480786416443254e-06, "loss": 0.0582, "step": 949 }, { "epoch": 0.2547599892732636, "grad_norm": 0.6126004445222877, "learning_rate": 8.489722966934763e-06, "loss": 0.0546, "step": 950 }, { "epoch": 0.25502815768302495, "grad_norm": 1.5122336589806007, "learning_rate": 8.498659517426274e-06, "loss": 0.0673, "step": 951 }, { "epoch": 0.2552963260927863, "grad_norm": 0.6899019681036999, "learning_rate": 8.507596067917786e-06, "loss": 0.0538, "step": 952 }, { "epoch": 0.2555644945025476, "grad_norm": 0.5130942519524846, "learning_rate": 8.516532618409295e-06, "loss": 0.0465, "step": 953 }, { "epoch": 0.25583266291230894, "grad_norm": 1.0530481008505155, "learning_rate": 8.525469168900806e-06, "loss": 0.0605, "step": 954 }, { "epoch": 0.25610083132207023, "grad_norm": 0.518302535721877, "learning_rate": 8.534405719392315e-06, "loss": 0.0421, "step": 955 }, { "epoch": 0.2563689997318316, "grad_norm": 0.7573322925643349, "learning_rate": 8.543342269883825e-06, "loss": 0.0699, "step": 956 }, { "epoch": 0.25663716814159293, "grad_norm": 1.2476866653178869, "learning_rate": 8.552278820375336e-06, "loss": 0.0683, "step": 957 }, { "epoch": 0.2569053365513542, "grad_norm": 0.4914396775913551, "learning_rate": 8.561215370866847e-06, "loss": 0.0543, "step": 958 }, { "epoch": 0.2571735049611156, "grad_norm": 0.8061720447165363, "learning_rate": 8.570151921358356e-06, "loss": 0.0685, "step": 959 }, { "epoch": 0.2574416733708769, "grad_norm": 0.5857652117623684, "learning_rate": 8.579088471849867e-06, "loss": 0.045, "step": 960 }, { "epoch": 0.2577098417806382, "grad_norm": 0.5909047883637759, "learning_rate": 8.588025022341378e-06, "loss": 0.0612, "step": 961 }, { "epoch": 0.25797801019039956, "grad_norm": 1.0393365434663968, "learning_rate": 8.596961572832887e-06, "loss": 0.0721, "step": 962 }, { "epoch": 0.2582461786001609, "grad_norm": 1.051358034922522, "learning_rate": 8.605898123324398e-06, "loss": 0.0649, "step": 963 }, { "epoch": 0.2585143470099222, "grad_norm": 0.9210418580735691, "learning_rate": 8.614834673815908e-06, "loss": 0.0687, "step": 964 }, { "epoch": 0.25878251541968356, "grad_norm": 0.7566778738587868, "learning_rate": 8.623771224307417e-06, "loss": 0.0476, "step": 965 }, { "epoch": 0.2590506838294449, "grad_norm": 0.721868408774214, "learning_rate": 8.632707774798928e-06, "loss": 0.0529, "step": 966 }, { "epoch": 0.2593188522392062, "grad_norm": 0.5634817468783463, "learning_rate": 8.641644325290438e-06, "loss": 0.0468, "step": 967 }, { "epoch": 0.25958702064896755, "grad_norm": 0.6019583463099332, "learning_rate": 8.650580875781949e-06, "loss": 0.0392, "step": 968 }, { "epoch": 0.2598551890587289, "grad_norm": 0.652480640490992, "learning_rate": 8.65951742627346e-06, "loss": 0.0551, "step": 969 }, { "epoch": 0.2601233574684902, "grad_norm": 0.6721375058000493, "learning_rate": 8.668453976764969e-06, "loss": 0.0465, "step": 970 }, { "epoch": 0.26039152587825154, "grad_norm": 0.9328464996839846, "learning_rate": 8.67739052725648e-06, "loss": 0.0801, "step": 971 }, { "epoch": 0.2606596942880129, "grad_norm": 0.7857595012352139, "learning_rate": 8.68632707774799e-06, "loss": 0.0739, "step": 972 }, { "epoch": 0.2609278626977742, "grad_norm": 0.6447579404961109, "learning_rate": 8.6952636282395e-06, "loss": 0.0555, "step": 973 }, { "epoch": 0.26119603110753553, "grad_norm": 0.5577057236734498, "learning_rate": 8.70420017873101e-06, "loss": 0.0451, "step": 974 }, { "epoch": 0.2614641995172969, "grad_norm": 1.2135674538071934, "learning_rate": 8.71313672922252e-06, "loss": 0.062, "step": 975 }, { "epoch": 0.2617323679270582, "grad_norm": 0.8310745766948165, "learning_rate": 8.72207327971403e-06, "loss": 0.0938, "step": 976 }, { "epoch": 0.2620005363368195, "grad_norm": 0.6158366022939997, "learning_rate": 8.731009830205541e-06, "loss": 0.0592, "step": 977 }, { "epoch": 0.26226870474658087, "grad_norm": 0.6359645763327832, "learning_rate": 8.739946380697052e-06, "loss": 0.0494, "step": 978 }, { "epoch": 0.26253687315634217, "grad_norm": 1.0502437345759257, "learning_rate": 8.748882931188562e-06, "loss": 0.0911, "step": 979 }, { "epoch": 0.2628050415661035, "grad_norm": 0.6175171276587017, "learning_rate": 8.757819481680073e-06, "loss": 0.0539, "step": 980 }, { "epoch": 0.26307320997586486, "grad_norm": 0.6749708959577044, "learning_rate": 8.766756032171582e-06, "loss": 0.0634, "step": 981 }, { "epoch": 0.26334137838562616, "grad_norm": 0.49555002188692787, "learning_rate": 8.775692582663093e-06, "loss": 0.0587, "step": 982 }, { "epoch": 0.2636095467953875, "grad_norm": 0.8687916311240435, "learning_rate": 8.784629133154602e-06, "loss": 0.0552, "step": 983 }, { "epoch": 0.26387771520514886, "grad_norm": 0.5543685744511956, "learning_rate": 8.793565683646113e-06, "loss": 0.0503, "step": 984 }, { "epoch": 0.26414588361491015, "grad_norm": 0.9107328064719851, "learning_rate": 8.802502234137623e-06, "loss": 0.0768, "step": 985 }, { "epoch": 0.2644140520246715, "grad_norm": 0.9027915874226776, "learning_rate": 8.811438784629134e-06, "loss": 0.0595, "step": 986 }, { "epoch": 0.26468222043443285, "grad_norm": 0.6522561716767531, "learning_rate": 8.820375335120645e-06, "loss": 0.0417, "step": 987 }, { "epoch": 0.26495038884419414, "grad_norm": 0.4962857610158182, "learning_rate": 8.829311885612154e-06, "loss": 0.0415, "step": 988 }, { "epoch": 0.2652185572539555, "grad_norm": 0.6728417980795768, "learning_rate": 8.838248436103665e-06, "loss": 0.0582, "step": 989 }, { "epoch": 0.26548672566371684, "grad_norm": 0.7842330537946006, "learning_rate": 8.847184986595175e-06, "loss": 0.064, "step": 990 }, { "epoch": 0.26575489407347813, "grad_norm": 0.6175498477543568, "learning_rate": 8.856121537086686e-06, "loss": 0.0411, "step": 991 }, { "epoch": 0.2660230624832395, "grad_norm": 0.658647518155521, "learning_rate": 8.865058087578195e-06, "loss": 0.0657, "step": 992 }, { "epoch": 0.26629123089300083, "grad_norm": 0.6864937394665676, "learning_rate": 8.873994638069706e-06, "loss": 0.0478, "step": 993 }, { "epoch": 0.2665593993027621, "grad_norm": 0.6185926792241573, "learning_rate": 8.882931188561215e-06, "loss": 0.0502, "step": 994 }, { "epoch": 0.2668275677125235, "grad_norm": 0.6393543357525548, "learning_rate": 8.891867739052726e-06, "loss": 0.0651, "step": 995 }, { "epoch": 0.2670957361222848, "grad_norm": 0.6194030557164245, "learning_rate": 8.900804289544237e-06, "loss": 0.0533, "step": 996 }, { "epoch": 0.2673639045320461, "grad_norm": 0.8608883766157296, "learning_rate": 8.909740840035747e-06, "loss": 0.0701, "step": 997 }, { "epoch": 0.26763207294180746, "grad_norm": 0.6945023804738137, "learning_rate": 8.918677390527258e-06, "loss": 0.0741, "step": 998 }, { "epoch": 0.26790024135156876, "grad_norm": 0.5843338370911121, "learning_rate": 8.927613941018767e-06, "loss": 0.0495, "step": 999 }, { "epoch": 0.2681684097613301, "grad_norm": 0.6206738894298448, "learning_rate": 8.936550491510278e-06, "loss": 0.061, "step": 1000 }, { "epoch": 0.26843657817109146, "grad_norm": 0.7964771863924608, "learning_rate": 8.945487042001788e-06, "loss": 0.0706, "step": 1001 }, { "epoch": 0.26870474658085275, "grad_norm": 0.8235197971875386, "learning_rate": 8.954423592493299e-06, "loss": 0.0656, "step": 1002 }, { "epoch": 0.2689729149906141, "grad_norm": 0.5037490913995571, "learning_rate": 8.963360142984808e-06, "loss": 0.0418, "step": 1003 }, { "epoch": 0.26924108340037545, "grad_norm": 1.048784315575701, "learning_rate": 8.972296693476319e-06, "loss": 0.0741, "step": 1004 }, { "epoch": 0.26950925181013674, "grad_norm": 0.5853159150447301, "learning_rate": 8.98123324396783e-06, "loss": 0.0531, "step": 1005 }, { "epoch": 0.2697774202198981, "grad_norm": 0.7691364112473073, "learning_rate": 8.99016979445934e-06, "loss": 0.048, "step": 1006 }, { "epoch": 0.27004558862965944, "grad_norm": 0.6306331783167517, "learning_rate": 8.99910634495085e-06, "loss": 0.0725, "step": 1007 }, { "epoch": 0.27031375703942073, "grad_norm": 0.5081907019308503, "learning_rate": 9.00804289544236e-06, "loss": 0.048, "step": 1008 }, { "epoch": 0.2705819254491821, "grad_norm": 0.7247969216222044, "learning_rate": 9.01697944593387e-06, "loss": 0.0626, "step": 1009 }, { "epoch": 0.27085009385894343, "grad_norm": 0.5182847795356663, "learning_rate": 9.02591599642538e-06, "loss": 0.0453, "step": 1010 }, { "epoch": 0.2711182622687047, "grad_norm": 0.621615755971977, "learning_rate": 9.034852546916891e-06, "loss": 0.0415, "step": 1011 }, { "epoch": 0.2713864306784661, "grad_norm": 0.6607938541868332, "learning_rate": 9.0437890974084e-06, "loss": 0.0448, "step": 1012 }, { "epoch": 0.2716545990882274, "grad_norm": 0.5557200661827993, "learning_rate": 9.052725647899912e-06, "loss": 0.0565, "step": 1013 }, { "epoch": 0.2719227674979887, "grad_norm": 0.5558571489023096, "learning_rate": 9.061662198391423e-06, "loss": 0.0483, "step": 1014 }, { "epoch": 0.27219093590775006, "grad_norm": 0.48628039931342637, "learning_rate": 9.070598748882932e-06, "loss": 0.0479, "step": 1015 }, { "epoch": 0.2724591043175114, "grad_norm": 0.4817534033592412, "learning_rate": 9.079535299374443e-06, "loss": 0.0536, "step": 1016 }, { "epoch": 0.2727272727272727, "grad_norm": 0.6634588409570141, "learning_rate": 9.088471849865952e-06, "loss": 0.0511, "step": 1017 }, { "epoch": 0.27299544113703406, "grad_norm": 0.5068500419797295, "learning_rate": 9.097408400357462e-06, "loss": 0.0526, "step": 1018 }, { "epoch": 0.2732636095467954, "grad_norm": 0.7080402718833271, "learning_rate": 9.106344950848973e-06, "loss": 0.0773, "step": 1019 }, { "epoch": 0.2735317779565567, "grad_norm": 0.7437439716392791, "learning_rate": 9.115281501340484e-06, "loss": 0.0772, "step": 1020 }, { "epoch": 0.27379994636631805, "grad_norm": 0.42275638927373943, "learning_rate": 9.124218051831993e-06, "loss": 0.0501, "step": 1021 }, { "epoch": 0.2740681147760794, "grad_norm": 0.4980086095522623, "learning_rate": 9.133154602323504e-06, "loss": 0.0508, "step": 1022 }, { "epoch": 0.2743362831858407, "grad_norm": 0.8111532213277497, "learning_rate": 9.142091152815015e-06, "loss": 0.0666, "step": 1023 }, { "epoch": 0.27460445159560204, "grad_norm": 0.6977891408528311, "learning_rate": 9.151027703306525e-06, "loss": 0.0462, "step": 1024 }, { "epoch": 0.2748726200053634, "grad_norm": 0.5020957373738434, "learning_rate": 9.159964253798036e-06, "loss": 0.0472, "step": 1025 }, { "epoch": 0.2751407884151247, "grad_norm": 0.7721213385558459, "learning_rate": 9.168900804289545e-06, "loss": 0.0758, "step": 1026 }, { "epoch": 0.27540895682488603, "grad_norm": 0.48250191303558965, "learning_rate": 9.177837354781054e-06, "loss": 0.0362, "step": 1027 }, { "epoch": 0.2756771252346474, "grad_norm": 0.9170517547245687, "learning_rate": 9.186773905272565e-06, "loss": 0.0562, "step": 1028 }, { "epoch": 0.2759452936444087, "grad_norm": 0.7780528461956794, "learning_rate": 9.195710455764075e-06, "loss": 0.0512, "step": 1029 }, { "epoch": 0.27621346205417, "grad_norm": 0.8568728483540402, "learning_rate": 9.204647006255586e-06, "loss": 0.065, "step": 1030 }, { "epoch": 0.27648163046393137, "grad_norm": 0.7231529764681266, "learning_rate": 9.213583556747097e-06, "loss": 0.0504, "step": 1031 }, { "epoch": 0.27674979887369267, "grad_norm": 1.4510960016837429, "learning_rate": 9.222520107238606e-06, "loss": 0.0678, "step": 1032 }, { "epoch": 0.277017967283454, "grad_norm": 0.653576965736248, "learning_rate": 9.231456657730117e-06, "loss": 0.0469, "step": 1033 }, { "epoch": 0.27728613569321536, "grad_norm": 1.0757416177249457, "learning_rate": 9.240393208221627e-06, "loss": 0.0756, "step": 1034 }, { "epoch": 0.27755430410297666, "grad_norm": 0.5098378843646046, "learning_rate": 9.249329758713138e-06, "loss": 0.0411, "step": 1035 }, { "epoch": 0.277822472512738, "grad_norm": 0.9646397874228843, "learning_rate": 9.258266309204647e-06, "loss": 0.0829, "step": 1036 }, { "epoch": 0.27809064092249935, "grad_norm": 0.649432006245567, "learning_rate": 9.267202859696158e-06, "loss": 0.047, "step": 1037 }, { "epoch": 0.27835880933226065, "grad_norm": 0.639605357977057, "learning_rate": 9.276139410187667e-06, "loss": 0.0719, "step": 1038 }, { "epoch": 0.278626977742022, "grad_norm": 0.7921143678019575, "learning_rate": 9.285075960679178e-06, "loss": 0.089, "step": 1039 }, { "epoch": 0.27889514615178335, "grad_norm": 0.5956386826367281, "learning_rate": 9.29401251117069e-06, "loss": 0.052, "step": 1040 }, { "epoch": 0.27916331456154464, "grad_norm": 0.9116127392221476, "learning_rate": 9.302949061662199e-06, "loss": 0.0648, "step": 1041 }, { "epoch": 0.279431482971306, "grad_norm": 0.7520329878990218, "learning_rate": 9.31188561215371e-06, "loss": 0.0464, "step": 1042 }, { "epoch": 0.27969965138106734, "grad_norm": 0.6022048551338887, "learning_rate": 9.32082216264522e-06, "loss": 0.0621, "step": 1043 }, { "epoch": 0.27996781979082863, "grad_norm": 1.393949141929977, "learning_rate": 9.32975871313673e-06, "loss": 0.0803, "step": 1044 }, { "epoch": 0.28023598820059, "grad_norm": 0.6495763689052004, "learning_rate": 9.33869526362824e-06, "loss": 0.0553, "step": 1045 }, { "epoch": 0.2805041566103513, "grad_norm": 0.6746904093844857, "learning_rate": 9.34763181411975e-06, "loss": 0.0553, "step": 1046 }, { "epoch": 0.2807723250201126, "grad_norm": 0.552107745573398, "learning_rate": 9.35656836461126e-06, "loss": 0.0529, "step": 1047 }, { "epoch": 0.28104049342987397, "grad_norm": 0.6497245971155569, "learning_rate": 9.365504915102771e-06, "loss": 0.0499, "step": 1048 }, { "epoch": 0.28130866183963527, "grad_norm": 0.7272730621428664, "learning_rate": 9.374441465594282e-06, "loss": 0.0448, "step": 1049 }, { "epoch": 0.2815768302493966, "grad_norm": 0.6484358015409448, "learning_rate": 9.383378016085791e-06, "loss": 0.0604, "step": 1050 }, { "epoch": 0.28184499865915796, "grad_norm": 0.7510489590503887, "learning_rate": 9.392314566577302e-06, "loss": 0.0514, "step": 1051 }, { "epoch": 0.28211316706891926, "grad_norm": 0.5371802431393131, "learning_rate": 9.401251117068812e-06, "loss": 0.0435, "step": 1052 }, { "epoch": 0.2823813354786806, "grad_norm": 0.47874966320861945, "learning_rate": 9.410187667560323e-06, "loss": 0.0542, "step": 1053 }, { "epoch": 0.28264950388844196, "grad_norm": 0.9621708383271833, "learning_rate": 9.419124218051832e-06, "loss": 0.0564, "step": 1054 }, { "epoch": 0.28291767229820325, "grad_norm": 0.6323614411408063, "learning_rate": 9.428060768543343e-06, "loss": 0.0453, "step": 1055 }, { "epoch": 0.2831858407079646, "grad_norm": 0.5484857293081461, "learning_rate": 9.436997319034853e-06, "loss": 0.0383, "step": 1056 }, { "epoch": 0.28345400911772595, "grad_norm": 0.5372153442667412, "learning_rate": 9.445933869526364e-06, "loss": 0.0519, "step": 1057 }, { "epoch": 0.28372217752748724, "grad_norm": 0.7419231869336939, "learning_rate": 9.454870420017875e-06, "loss": 0.0434, "step": 1058 }, { "epoch": 0.2839903459372486, "grad_norm": 0.59402491157313, "learning_rate": 9.463806970509384e-06, "loss": 0.0504, "step": 1059 }, { "epoch": 0.28425851434700994, "grad_norm": 0.930869987585662, "learning_rate": 9.472743521000895e-06, "loss": 0.0656, "step": 1060 }, { "epoch": 0.28452668275677123, "grad_norm": 0.6486607995316331, "learning_rate": 9.481680071492404e-06, "loss": 0.0564, "step": 1061 }, { "epoch": 0.2847948511665326, "grad_norm": 0.6712834828663943, "learning_rate": 9.490616621983915e-06, "loss": 0.0618, "step": 1062 }, { "epoch": 0.28506301957629393, "grad_norm": 1.3713518521465855, "learning_rate": 9.499553172475425e-06, "loss": 0.0797, "step": 1063 }, { "epoch": 0.2853311879860552, "grad_norm": 0.603819232680413, "learning_rate": 9.508489722966936e-06, "loss": 0.0531, "step": 1064 }, { "epoch": 0.2855993563958166, "grad_norm": 0.5956016266402101, "learning_rate": 9.517426273458445e-06, "loss": 0.0614, "step": 1065 }, { "epoch": 0.2858675248055779, "grad_norm": 0.46252523601941165, "learning_rate": 9.526362823949956e-06, "loss": 0.0421, "step": 1066 }, { "epoch": 0.2861356932153392, "grad_norm": 0.731060264461064, "learning_rate": 9.535299374441467e-06, "loss": 0.0626, "step": 1067 }, { "epoch": 0.28640386162510056, "grad_norm": 0.5824600238051167, "learning_rate": 9.544235924932977e-06, "loss": 0.0435, "step": 1068 }, { "epoch": 0.2866720300348619, "grad_norm": 0.5954843312453503, "learning_rate": 9.553172475424488e-06, "loss": 0.0489, "step": 1069 }, { "epoch": 0.2869401984446232, "grad_norm": 1.176109853012195, "learning_rate": 9.562109025915997e-06, "loss": 0.065, "step": 1070 }, { "epoch": 0.28720836685438456, "grad_norm": 0.5634419748716052, "learning_rate": 9.571045576407506e-06, "loss": 0.0649, "step": 1071 }, { "epoch": 0.2874765352641459, "grad_norm": 0.5857694096213143, "learning_rate": 9.579982126899017e-06, "loss": 0.0503, "step": 1072 }, { "epoch": 0.2877447036739072, "grad_norm": 0.6608186945724066, "learning_rate": 9.588918677390528e-06, "loss": 0.0578, "step": 1073 }, { "epoch": 0.28801287208366855, "grad_norm": 0.762027099211578, "learning_rate": 9.597855227882038e-06, "loss": 0.0654, "step": 1074 }, { "epoch": 0.2882810404934299, "grad_norm": 0.46947107649955794, "learning_rate": 9.606791778373549e-06, "loss": 0.051, "step": 1075 }, { "epoch": 0.2885492089031912, "grad_norm": 0.5221337872140267, "learning_rate": 9.61572832886506e-06, "loss": 0.0515, "step": 1076 }, { "epoch": 0.28881737731295254, "grad_norm": 0.9578724975879174, "learning_rate": 9.62466487935657e-06, "loss": 0.0443, "step": 1077 }, { "epoch": 0.2890855457227139, "grad_norm": 0.5763965641719264, "learning_rate": 9.63360142984808e-06, "loss": 0.0451, "step": 1078 }, { "epoch": 0.2893537141324752, "grad_norm": 0.7010837187212227, "learning_rate": 9.64253798033959e-06, "loss": 0.0521, "step": 1079 }, { "epoch": 0.28962188254223653, "grad_norm": 0.4707298750872888, "learning_rate": 9.651474530831099e-06, "loss": 0.0447, "step": 1080 }, { "epoch": 0.2898900509519979, "grad_norm": 0.8713811621183434, "learning_rate": 9.66041108132261e-06, "loss": 0.0606, "step": 1081 }, { "epoch": 0.2901582193617592, "grad_norm": 0.8172399257965689, "learning_rate": 9.669347631814121e-06, "loss": 0.0542, "step": 1082 }, { "epoch": 0.2904263877715205, "grad_norm": 0.8277305311428933, "learning_rate": 9.67828418230563e-06, "loss": 0.0465, "step": 1083 }, { "epoch": 0.29069455618128187, "grad_norm": 0.5957409430682489, "learning_rate": 9.687220732797141e-06, "loss": 0.0509, "step": 1084 }, { "epoch": 0.29096272459104316, "grad_norm": 0.7388381696095824, "learning_rate": 9.696157283288652e-06, "loss": 0.0627, "step": 1085 }, { "epoch": 0.2912308930008045, "grad_norm": 0.9208301495223172, "learning_rate": 9.705093833780162e-06, "loss": 0.0498, "step": 1086 }, { "epoch": 0.29149906141056586, "grad_norm": 0.6326590013322599, "learning_rate": 9.714030384271673e-06, "loss": 0.0441, "step": 1087 }, { "epoch": 0.29176722982032716, "grad_norm": 0.6967249536075673, "learning_rate": 9.722966934763182e-06, "loss": 0.0597, "step": 1088 }, { "epoch": 0.2920353982300885, "grad_norm": 1.1541930624146808, "learning_rate": 9.731903485254692e-06, "loss": 0.0584, "step": 1089 }, { "epoch": 0.2923035666398498, "grad_norm": 0.7784515320622181, "learning_rate": 9.740840035746203e-06, "loss": 0.0426, "step": 1090 }, { "epoch": 0.29257173504961115, "grad_norm": 0.7452996845248616, "learning_rate": 9.749776586237712e-06, "loss": 0.0697, "step": 1091 }, { "epoch": 0.2928399034593725, "grad_norm": 1.0257377184335728, "learning_rate": 9.758713136729223e-06, "loss": 0.0549, "step": 1092 }, { "epoch": 0.2931080718691338, "grad_norm": 0.7274660477799325, "learning_rate": 9.767649687220734e-06, "loss": 0.0596, "step": 1093 }, { "epoch": 0.29337624027889514, "grad_norm": 0.7357046563041781, "learning_rate": 9.776586237712243e-06, "loss": 0.0708, "step": 1094 }, { "epoch": 0.2936444086886565, "grad_norm": 0.9121530513103581, "learning_rate": 9.785522788203754e-06, "loss": 0.0643, "step": 1095 }, { "epoch": 0.2939125770984178, "grad_norm": 1.0739227451064046, "learning_rate": 9.794459338695265e-06, "loss": 0.0594, "step": 1096 }, { "epoch": 0.29418074550817913, "grad_norm": 0.9766897376558196, "learning_rate": 9.803395889186775e-06, "loss": 0.0699, "step": 1097 }, { "epoch": 0.2944489139179405, "grad_norm": 0.49341165796093067, "learning_rate": 9.812332439678284e-06, "loss": 0.0405, "step": 1098 }, { "epoch": 0.2947170823277018, "grad_norm": 0.7330904635525741, "learning_rate": 9.821268990169795e-06, "loss": 0.0592, "step": 1099 }, { "epoch": 0.2949852507374631, "grad_norm": 0.6662615996827013, "learning_rate": 9.830205540661305e-06, "loss": 0.0512, "step": 1100 }, { "epoch": 0.29525341914722447, "grad_norm": 0.4622151507107239, "learning_rate": 9.839142091152816e-06, "loss": 0.0577, "step": 1101 }, { "epoch": 0.29552158755698577, "grad_norm": 0.6377067812080595, "learning_rate": 9.848078641644327e-06, "loss": 0.058, "step": 1102 }, { "epoch": 0.2957897559667471, "grad_norm": 0.8201111139968565, "learning_rate": 9.857015192135836e-06, "loss": 0.0622, "step": 1103 }, { "epoch": 0.29605792437650846, "grad_norm": 0.6608748012257967, "learning_rate": 9.865951742627347e-06, "loss": 0.0583, "step": 1104 }, { "epoch": 0.29632609278626976, "grad_norm": 0.6701380246533643, "learning_rate": 9.874888293118856e-06, "loss": 0.0649, "step": 1105 }, { "epoch": 0.2965942611960311, "grad_norm": 0.6727116846777095, "learning_rate": 9.883824843610367e-06, "loss": 0.0574, "step": 1106 }, { "epoch": 0.29686242960579245, "grad_norm": 0.5562309106392723, "learning_rate": 9.892761394101877e-06, "loss": 0.0492, "step": 1107 }, { "epoch": 0.29713059801555375, "grad_norm": 0.6445083808230152, "learning_rate": 9.901697944593388e-06, "loss": 0.0449, "step": 1108 }, { "epoch": 0.2973987664253151, "grad_norm": 0.5978559151947976, "learning_rate": 9.910634495084897e-06, "loss": 0.0557, "step": 1109 }, { "epoch": 0.29766693483507645, "grad_norm": 0.6484753688220354, "learning_rate": 9.919571045576408e-06, "loss": 0.0569, "step": 1110 }, { "epoch": 0.29793510324483774, "grad_norm": 0.8588180038306833, "learning_rate": 9.92850759606792e-06, "loss": 0.0851, "step": 1111 }, { "epoch": 0.2982032716545991, "grad_norm": 0.9150265504351235, "learning_rate": 9.937444146559429e-06, "loss": 0.0727, "step": 1112 }, { "epoch": 0.29847144006436044, "grad_norm": 0.5880704450963565, "learning_rate": 9.94638069705094e-06, "loss": 0.0558, "step": 1113 }, { "epoch": 0.29873960847412173, "grad_norm": 0.7413578943581071, "learning_rate": 9.955317247542449e-06, "loss": 0.0558, "step": 1114 }, { "epoch": 0.2990077768838831, "grad_norm": 0.7418022916728801, "learning_rate": 9.96425379803396e-06, "loss": 0.044, "step": 1115 }, { "epoch": 0.29927594529364443, "grad_norm": 0.6806110597988253, "learning_rate": 9.97319034852547e-06, "loss": 0.0682, "step": 1116 }, { "epoch": 0.2995441137034057, "grad_norm": 0.5294203016313889, "learning_rate": 9.98212689901698e-06, "loss": 0.0497, "step": 1117 }, { "epoch": 0.2998122821131671, "grad_norm": 0.6555745503096044, "learning_rate": 9.99106344950849e-06, "loss": 0.0592, "step": 1118 }, { "epoch": 0.3000804505229284, "grad_norm": 0.6349988874894202, "learning_rate": 1e-05, "loss": 0.0453, "step": 1119 }, { "epoch": 0.3003486189326897, "grad_norm": 0.6338997340109369, "learning_rate": 9.999999756581638e-06, "loss": 0.0463, "step": 1120 }, { "epoch": 0.30061678734245106, "grad_norm": 0.8333822777464575, "learning_rate": 9.999999026326575e-06, "loss": 0.0682, "step": 1121 }, { "epoch": 0.3008849557522124, "grad_norm": 0.5896573059518729, "learning_rate": 9.999997809234879e-06, "loss": 0.0485, "step": 1122 }, { "epoch": 0.3011531241619737, "grad_norm": 0.6374461687270109, "learning_rate": 9.999996105306673e-06, "loss": 0.0538, "step": 1123 }, { "epoch": 0.30142129257173506, "grad_norm": 0.7762849683222337, "learning_rate": 9.99999391454212e-06, "loss": 0.0625, "step": 1124 }, { "epoch": 0.3016894609814964, "grad_norm": 0.7384412788392885, "learning_rate": 9.999991236941436e-06, "loss": 0.0582, "step": 1125 }, { "epoch": 0.3019576293912577, "grad_norm": 0.6880480519340163, "learning_rate": 9.999988072504878e-06, "loss": 0.0598, "step": 1126 }, { "epoch": 0.30222579780101905, "grad_norm": 0.5579098832408893, "learning_rate": 9.999984421232756e-06, "loss": 0.0616, "step": 1127 }, { "epoch": 0.3024939662107804, "grad_norm": 0.4298755433135871, "learning_rate": 9.999980283125427e-06, "loss": 0.041, "step": 1128 }, { "epoch": 0.3027621346205417, "grad_norm": 0.39961239646229213, "learning_rate": 9.999975658183292e-06, "loss": 0.0361, "step": 1129 }, { "epoch": 0.30303030303030304, "grad_norm": 0.7320445080181024, "learning_rate": 9.9999705464068e-06, "loss": 0.0584, "step": 1130 }, { "epoch": 0.3032984714400644, "grad_norm": 0.5745009853485151, "learning_rate": 9.999964947796453e-06, "loss": 0.0454, "step": 1131 }, { "epoch": 0.3035666398498257, "grad_norm": 0.5469506729601884, "learning_rate": 9.999958862352793e-06, "loss": 0.0593, "step": 1132 }, { "epoch": 0.30383480825958703, "grad_norm": 0.7118954016306777, "learning_rate": 9.999952290076414e-06, "loss": 0.0495, "step": 1133 }, { "epoch": 0.3041029766693483, "grad_norm": 0.4022427423278802, "learning_rate": 9.999945230967953e-06, "loss": 0.0397, "step": 1134 }, { "epoch": 0.3043711450791097, "grad_norm": 0.7483138244062656, "learning_rate": 9.999937685028101e-06, "loss": 0.0407, "step": 1135 }, { "epoch": 0.304639313488871, "grad_norm": 0.7307167930130598, "learning_rate": 9.999929652257592e-06, "loss": 0.0479, "step": 1136 }, { "epoch": 0.3049074818986323, "grad_norm": 0.653058552018561, "learning_rate": 9.999921132657206e-06, "loss": 0.0494, "step": 1137 }, { "epoch": 0.30517565030839366, "grad_norm": 0.4217059301508981, "learning_rate": 9.999912126227774e-06, "loss": 0.0403, "step": 1138 }, { "epoch": 0.305443818718155, "grad_norm": 0.5579569473617421, "learning_rate": 9.999902632970174e-06, "loss": 0.0706, "step": 1139 }, { "epoch": 0.3057119871279163, "grad_norm": 0.6524567103369456, "learning_rate": 9.999892652885325e-06, "loss": 0.0544, "step": 1140 }, { "epoch": 0.30598015553767766, "grad_norm": 0.6066458913716382, "learning_rate": 9.999882185974208e-06, "loss": 0.0473, "step": 1141 }, { "epoch": 0.306248323947439, "grad_norm": 0.6673392372739456, "learning_rate": 9.999871232237835e-06, "loss": 0.0606, "step": 1142 }, { "epoch": 0.3065164923572003, "grad_norm": 0.8725173658516677, "learning_rate": 9.999859791677274e-06, "loss": 0.0549, "step": 1143 }, { "epoch": 0.30678466076696165, "grad_norm": 0.552648634582255, "learning_rate": 9.99984786429364e-06, "loss": 0.0488, "step": 1144 }, { "epoch": 0.307052829176723, "grad_norm": 0.48734345870093704, "learning_rate": 9.999835450088093e-06, "loss": 0.0439, "step": 1145 }, { "epoch": 0.3073209975864843, "grad_norm": 0.4156890641948285, "learning_rate": 9.999822549061843e-06, "loss": 0.041, "step": 1146 }, { "epoch": 0.30758916599624564, "grad_norm": 0.610066849580524, "learning_rate": 9.999809161216145e-06, "loss": 0.0631, "step": 1147 }, { "epoch": 0.307857334406007, "grad_norm": 0.4806768380383877, "learning_rate": 9.999795286552305e-06, "loss": 0.0455, "step": 1148 }, { "epoch": 0.3081255028157683, "grad_norm": 1.3768498811039278, "learning_rate": 9.99978092507167e-06, "loss": 0.055, "step": 1149 }, { "epoch": 0.30839367122552963, "grad_norm": 0.5343046337991524, "learning_rate": 9.999766076775641e-06, "loss": 0.0445, "step": 1150 }, { "epoch": 0.308661839635291, "grad_norm": 0.53634179559045, "learning_rate": 9.999750741665664e-06, "loss": 0.0486, "step": 1151 }, { "epoch": 0.3089300080450523, "grad_norm": 0.6620571078765336, "learning_rate": 9.999734919743231e-06, "loss": 0.057, "step": 1152 }, { "epoch": 0.3091981764548136, "grad_norm": 0.5574830943974115, "learning_rate": 9.999718611009884e-06, "loss": 0.0501, "step": 1153 }, { "epoch": 0.30946634486457497, "grad_norm": 0.3974204726716722, "learning_rate": 9.999701815467209e-06, "loss": 0.0441, "step": 1154 }, { "epoch": 0.30973451327433627, "grad_norm": 0.6109708637738438, "learning_rate": 9.999684533116843e-06, "loss": 0.053, "step": 1155 }, { "epoch": 0.3100026816840976, "grad_norm": 0.5923972158843595, "learning_rate": 9.999666763960468e-06, "loss": 0.0519, "step": 1156 }, { "epoch": 0.31027085009385896, "grad_norm": 0.6207914836159666, "learning_rate": 9.99964850799981e-06, "loss": 0.0594, "step": 1157 }, { "epoch": 0.31053901850362026, "grad_norm": 0.6971266018166004, "learning_rate": 9.999629765236655e-06, "loss": 0.0606, "step": 1158 }, { "epoch": 0.3108071869133816, "grad_norm": 0.7627432317040791, "learning_rate": 9.999610535672822e-06, "loss": 0.0672, "step": 1159 }, { "epoch": 0.31107535532314295, "grad_norm": 1.0462355989146774, "learning_rate": 9.999590819310185e-06, "loss": 0.055, "step": 1160 }, { "epoch": 0.31134352373290425, "grad_norm": 0.505303486568486, "learning_rate": 9.999570616150664e-06, "loss": 0.0473, "step": 1161 }, { "epoch": 0.3116116921426656, "grad_norm": 0.6403514606799499, "learning_rate": 9.999549926196225e-06, "loss": 0.0495, "step": 1162 }, { "epoch": 0.31187986055242695, "grad_norm": 0.4974097145780926, "learning_rate": 9.999528749448883e-06, "loss": 0.0448, "step": 1163 }, { "epoch": 0.31214802896218824, "grad_norm": 1.1813164642164455, "learning_rate": 9.999507085910702e-06, "loss": 0.1163, "step": 1164 }, { "epoch": 0.3124161973719496, "grad_norm": 1.602332724454136, "learning_rate": 9.999484935583787e-06, "loss": 0.0749, "step": 1165 }, { "epoch": 0.31268436578171094, "grad_norm": 0.4867184233528548, "learning_rate": 9.9994622984703e-06, "loss": 0.0433, "step": 1166 }, { "epoch": 0.31295253419147223, "grad_norm": 0.8302492851395153, "learning_rate": 9.999439174572441e-06, "loss": 0.0656, "step": 1167 }, { "epoch": 0.3132207026012336, "grad_norm": 0.6085673534634279, "learning_rate": 9.999415563892461e-06, "loss": 0.0473, "step": 1168 }, { "epoch": 0.31348887101099493, "grad_norm": 0.48774901026953765, "learning_rate": 9.999391466432663e-06, "loss": 0.0492, "step": 1169 }, { "epoch": 0.3137570394207562, "grad_norm": 0.49479692933604774, "learning_rate": 9.99936688219539e-06, "loss": 0.0427, "step": 1170 }, { "epoch": 0.31402520783051757, "grad_norm": 1.0947714375321784, "learning_rate": 9.999341811183036e-06, "loss": 0.0689, "step": 1171 }, { "epoch": 0.3142933762402789, "grad_norm": 0.3917382750014761, "learning_rate": 9.999316253398046e-06, "loss": 0.0423, "step": 1172 }, { "epoch": 0.3145615446500402, "grad_norm": 0.5745478880230427, "learning_rate": 9.999290208842902e-06, "loss": 0.0692, "step": 1173 }, { "epoch": 0.31482971305980156, "grad_norm": 0.6362465976943918, "learning_rate": 9.999263677520146e-06, "loss": 0.0477, "step": 1174 }, { "epoch": 0.3150978814695629, "grad_norm": 0.6101117032982308, "learning_rate": 9.999236659432357e-06, "loss": 0.056, "step": 1175 }, { "epoch": 0.3153660498793242, "grad_norm": 0.6489249654946811, "learning_rate": 9.999209154582166e-06, "loss": 0.0569, "step": 1176 }, { "epoch": 0.31563421828908556, "grad_norm": 0.8783984427421814, "learning_rate": 9.999181162972253e-06, "loss": 0.0837, "step": 1177 }, { "epoch": 0.31590238669884685, "grad_norm": 0.5240421221217416, "learning_rate": 9.999152684605345e-06, "loss": 0.0508, "step": 1178 }, { "epoch": 0.3161705551086082, "grad_norm": 0.658332854238612, "learning_rate": 9.999123719484209e-06, "loss": 0.0522, "step": 1179 }, { "epoch": 0.31643872351836955, "grad_norm": 0.8568673677041208, "learning_rate": 9.999094267611671e-06, "loss": 0.0714, "step": 1180 }, { "epoch": 0.31670689192813084, "grad_norm": 0.7389052637372896, "learning_rate": 9.999064328990596e-06, "loss": 0.0541, "step": 1181 }, { "epoch": 0.3169750603378922, "grad_norm": 0.7070077968214553, "learning_rate": 9.9990339036239e-06, "loss": 0.0609, "step": 1182 }, { "epoch": 0.31724322874765354, "grad_norm": 0.670723418855042, "learning_rate": 9.999002991514544e-06, "loss": 0.0458, "step": 1183 }, { "epoch": 0.31751139715741483, "grad_norm": 0.8827426677524723, "learning_rate": 9.99897159266554e-06, "loss": 0.0648, "step": 1184 }, { "epoch": 0.3177795655671762, "grad_norm": 0.48443309147074826, "learning_rate": 9.998939707079943e-06, "loss": 0.0371, "step": 1185 }, { "epoch": 0.31804773397693753, "grad_norm": 0.47214541872025856, "learning_rate": 9.998907334760859e-06, "loss": 0.0526, "step": 1186 }, { "epoch": 0.3183159023866988, "grad_norm": 0.6325525109954492, "learning_rate": 9.998874475711439e-06, "loss": 0.0425, "step": 1187 }, { "epoch": 0.3185840707964602, "grad_norm": 0.5281246235985751, "learning_rate": 9.998841129934885e-06, "loss": 0.054, "step": 1188 }, { "epoch": 0.3188522392062215, "grad_norm": 0.4669553258907945, "learning_rate": 9.998807297434441e-06, "loss": 0.0422, "step": 1189 }, { "epoch": 0.3191204076159828, "grad_norm": 0.44491007764543106, "learning_rate": 9.9987729782134e-06, "loss": 0.0478, "step": 1190 }, { "epoch": 0.31938857602574416, "grad_norm": 0.574355405116833, "learning_rate": 9.99873817227511e-06, "loss": 0.0488, "step": 1191 }, { "epoch": 0.3196567444355055, "grad_norm": 0.5312550040991334, "learning_rate": 9.998702879622952e-06, "loss": 0.0407, "step": 1192 }, { "epoch": 0.3199249128452668, "grad_norm": 0.7391872260788728, "learning_rate": 9.998667100260367e-06, "loss": 0.0501, "step": 1193 }, { "epoch": 0.32019308125502816, "grad_norm": 0.7315235721401155, "learning_rate": 9.998630834190837e-06, "loss": 0.0721, "step": 1194 }, { "epoch": 0.3204612496647895, "grad_norm": 0.4374554188285498, "learning_rate": 9.998594081417894e-06, "loss": 0.0511, "step": 1195 }, { "epoch": 0.3207294180745508, "grad_norm": 0.7754960133985275, "learning_rate": 9.998556841945118e-06, "loss": 0.0611, "step": 1196 }, { "epoch": 0.32099758648431215, "grad_norm": 1.0268822408348215, "learning_rate": 9.99851911577613e-06, "loss": 0.0717, "step": 1197 }, { "epoch": 0.3212657548940735, "grad_norm": 0.5614775493313844, "learning_rate": 9.998480902914609e-06, "loss": 0.0431, "step": 1198 }, { "epoch": 0.3215339233038348, "grad_norm": 0.6282347904587501, "learning_rate": 9.998442203364271e-06, "loss": 0.0479, "step": 1199 }, { "epoch": 0.32180209171359614, "grad_norm": 0.7929797452066377, "learning_rate": 9.998403017128889e-06, "loss": 0.0557, "step": 1200 }, { "epoch": 0.3220702601233575, "grad_norm": 0.47353556889844406, "learning_rate": 9.998363344212275e-06, "loss": 0.0464, "step": 1201 }, { "epoch": 0.3223384285331188, "grad_norm": 0.5415896452679779, "learning_rate": 9.99832318461829e-06, "loss": 0.0424, "step": 1202 }, { "epoch": 0.32260659694288013, "grad_norm": 0.5802447111370217, "learning_rate": 9.998282538350849e-06, "loss": 0.0565, "step": 1203 }, { "epoch": 0.3228747653526415, "grad_norm": 0.5861409225922304, "learning_rate": 9.998241405413908e-06, "loss": 0.0454, "step": 1204 }, { "epoch": 0.3231429337624028, "grad_norm": 0.42444180614709814, "learning_rate": 9.998199785811472e-06, "loss": 0.0434, "step": 1205 }, { "epoch": 0.3234111021721641, "grad_norm": 0.7093359796241029, "learning_rate": 9.99815767954759e-06, "loss": 0.0498, "step": 1206 }, { "epoch": 0.32367927058192547, "grad_norm": 0.5168428815853503, "learning_rate": 9.998115086626367e-06, "loss": 0.0397, "step": 1207 }, { "epoch": 0.32394743899168676, "grad_norm": 0.48611064237149737, "learning_rate": 9.998072007051946e-06, "loss": 0.0442, "step": 1208 }, { "epoch": 0.3242156074014481, "grad_norm": 0.6441423210736196, "learning_rate": 9.998028440828524e-06, "loss": 0.0538, "step": 1209 }, { "epoch": 0.32448377581120946, "grad_norm": 0.5386982162337676, "learning_rate": 9.997984387960342e-06, "loss": 0.0548, "step": 1210 }, { "epoch": 0.32475194422097076, "grad_norm": 0.5447310769884404, "learning_rate": 9.99793984845169e-06, "loss": 0.0637, "step": 1211 }, { "epoch": 0.3250201126307321, "grad_norm": 0.5920110680526529, "learning_rate": 9.997894822306903e-06, "loss": 0.0422, "step": 1212 }, { "epoch": 0.32528828104049345, "grad_norm": 0.6131999907550065, "learning_rate": 9.997849309530369e-06, "loss": 0.0466, "step": 1213 }, { "epoch": 0.32555644945025475, "grad_norm": 0.5980967950419176, "learning_rate": 9.997803310126515e-06, "loss": 0.069, "step": 1214 }, { "epoch": 0.3258246178600161, "grad_norm": 0.6150266055109079, "learning_rate": 9.997756824099822e-06, "loss": 0.0469, "step": 1215 }, { "epoch": 0.32609278626977745, "grad_norm": 0.7447691919561921, "learning_rate": 9.997709851454815e-06, "loss": 0.0701, "step": 1216 }, { "epoch": 0.32636095467953874, "grad_norm": 1.0724543438698761, "learning_rate": 9.99766239219607e-06, "loss": 0.0652, "step": 1217 }, { "epoch": 0.3266291230893001, "grad_norm": 0.5885863722442383, "learning_rate": 9.997614446328205e-06, "loss": 0.0468, "step": 1218 }, { "epoch": 0.32689729149906144, "grad_norm": 0.5698531398006067, "learning_rate": 9.997566013855891e-06, "loss": 0.046, "step": 1219 }, { "epoch": 0.32716545990882273, "grad_norm": 0.6531407221404981, "learning_rate": 9.997517094783843e-06, "loss": 0.0568, "step": 1220 }, { "epoch": 0.3274336283185841, "grad_norm": 0.980803681536748, "learning_rate": 9.99746768911682e-06, "loss": 0.0551, "step": 1221 }, { "epoch": 0.3277017967283454, "grad_norm": 0.5509925108741643, "learning_rate": 9.997417796859638e-06, "loss": 0.0663, "step": 1222 }, { "epoch": 0.3279699651381067, "grad_norm": 0.4396352512594954, "learning_rate": 9.997367418017154e-06, "loss": 0.0437, "step": 1223 }, { "epoch": 0.32823813354786807, "grad_norm": 0.597977681561772, "learning_rate": 9.997316552594271e-06, "loss": 0.0484, "step": 1224 }, { "epoch": 0.32850630195762937, "grad_norm": 0.6206789935584044, "learning_rate": 9.997265200595944e-06, "loss": 0.065, "step": 1225 }, { "epoch": 0.3287744703673907, "grad_norm": 0.5457175018181399, "learning_rate": 9.997213362027171e-06, "loss": 0.0449, "step": 1226 }, { "epoch": 0.32904263877715206, "grad_norm": 1.4069740679095741, "learning_rate": 9.997161036893001e-06, "loss": 0.0496, "step": 1227 }, { "epoch": 0.32931080718691336, "grad_norm": 0.42957803259939653, "learning_rate": 9.997108225198528e-06, "loss": 0.0481, "step": 1228 }, { "epoch": 0.3295789755966747, "grad_norm": 0.689776942083758, "learning_rate": 9.997054926948895e-06, "loss": 0.0599, "step": 1229 }, { "epoch": 0.32984714400643605, "grad_norm": 0.6644283461062305, "learning_rate": 9.99700114214929e-06, "loss": 0.0847, "step": 1230 }, { "epoch": 0.33011531241619735, "grad_norm": 0.5652240018221643, "learning_rate": 9.99694687080495e-06, "loss": 0.0621, "step": 1231 }, { "epoch": 0.3303834808259587, "grad_norm": 0.6717250716810702, "learning_rate": 9.996892112921161e-06, "loss": 0.0627, "step": 1232 }, { "epoch": 0.33065164923572005, "grad_norm": 0.686197907154484, "learning_rate": 9.996836868503253e-06, "loss": 0.054, "step": 1233 }, { "epoch": 0.33091981764548134, "grad_norm": 0.4377274461909573, "learning_rate": 9.996781137556604e-06, "loss": 0.0412, "step": 1234 }, { "epoch": 0.3311879860552427, "grad_norm": 0.44317086271523176, "learning_rate": 9.996724920086643e-06, "loss": 0.0458, "step": 1235 }, { "epoch": 0.33145615446500404, "grad_norm": 0.6155713385364404, "learning_rate": 9.996668216098841e-06, "loss": 0.0627, "step": 1236 }, { "epoch": 0.33172432287476533, "grad_norm": 0.7323427849479378, "learning_rate": 9.996611025598723e-06, "loss": 0.0609, "step": 1237 }, { "epoch": 0.3319924912845267, "grad_norm": 0.8651074583277407, "learning_rate": 9.996553348591856e-06, "loss": 0.055, "step": 1238 }, { "epoch": 0.33226065969428803, "grad_norm": 0.7603584456656953, "learning_rate": 9.996495185083853e-06, "loss": 0.0594, "step": 1239 }, { "epoch": 0.3325288281040493, "grad_norm": 0.45835268560219034, "learning_rate": 9.996436535080379e-06, "loss": 0.0428, "step": 1240 }, { "epoch": 0.3327969965138107, "grad_norm": 0.5440838001305944, "learning_rate": 9.996377398587146e-06, "loss": 0.0442, "step": 1241 }, { "epoch": 0.333065164923572, "grad_norm": 0.7047541313901163, "learning_rate": 9.996317775609909e-06, "loss": 0.0612, "step": 1242 }, { "epoch": 0.3333333333333333, "grad_norm": 0.47697292098096294, "learning_rate": 9.996257666154477e-06, "loss": 0.0382, "step": 1243 }, { "epoch": 0.33360150174309466, "grad_norm": 1.2807016967998974, "learning_rate": 9.996197070226701e-06, "loss": 0.0595, "step": 1244 }, { "epoch": 0.333869670152856, "grad_norm": 0.874449200275916, "learning_rate": 9.996135987832481e-06, "loss": 0.0591, "step": 1245 }, { "epoch": 0.3341378385626173, "grad_norm": 0.47341171170424673, "learning_rate": 9.996074418977762e-06, "loss": 0.0458, "step": 1246 }, { "epoch": 0.33440600697237866, "grad_norm": 1.0609136534081791, "learning_rate": 9.996012363668544e-06, "loss": 0.0462, "step": 1247 }, { "epoch": 0.33467417538214, "grad_norm": 0.602144045036516, "learning_rate": 9.995949821910864e-06, "loss": 0.0612, "step": 1248 }, { "epoch": 0.3349423437919013, "grad_norm": 0.5365518375145811, "learning_rate": 9.995886793710816e-06, "loss": 0.0476, "step": 1249 }, { "epoch": 0.33521051220166265, "grad_norm": 0.5062873122293781, "learning_rate": 9.995823279074532e-06, "loss": 0.0479, "step": 1250 }, { "epoch": 0.335478680611424, "grad_norm": 0.6166015609290926, "learning_rate": 9.995759278008202e-06, "loss": 0.0557, "step": 1251 }, { "epoch": 0.3357468490211853, "grad_norm": 0.41152630913083493, "learning_rate": 9.995694790518052e-06, "loss": 0.0412, "step": 1252 }, { "epoch": 0.33601501743094664, "grad_norm": 0.6861178050546083, "learning_rate": 9.995629816610365e-06, "loss": 0.0377, "step": 1253 }, { "epoch": 0.336283185840708, "grad_norm": 0.6458812180499209, "learning_rate": 9.995564356291466e-06, "loss": 0.0504, "step": 1254 }, { "epoch": 0.3365513542504693, "grad_norm": 0.6029266492400795, "learning_rate": 9.995498409567727e-06, "loss": 0.0557, "step": 1255 }, { "epoch": 0.33681952266023063, "grad_norm": 0.5395568920510841, "learning_rate": 9.995431976445572e-06, "loss": 0.0476, "step": 1256 }, { "epoch": 0.337087691069992, "grad_norm": 0.6062976502019732, "learning_rate": 9.99536505693147e-06, "loss": 0.0601, "step": 1257 }, { "epoch": 0.3373558594797533, "grad_norm": 0.5870956103032208, "learning_rate": 9.995297651031931e-06, "loss": 0.0603, "step": 1258 }, { "epoch": 0.3376240278895146, "grad_norm": 0.5980065779499646, "learning_rate": 9.995229758753523e-06, "loss": 0.0637, "step": 1259 }, { "epoch": 0.33789219629927597, "grad_norm": 0.5176078545077247, "learning_rate": 9.995161380102857e-06, "loss": 0.0435, "step": 1260 }, { "epoch": 0.33816036470903726, "grad_norm": 0.5980781610960592, "learning_rate": 9.99509251508659e-06, "loss": 0.0465, "step": 1261 }, { "epoch": 0.3384285331187986, "grad_norm": 0.653243173535905, "learning_rate": 9.995023163711424e-06, "loss": 0.0527, "step": 1262 }, { "epoch": 0.33869670152855996, "grad_norm": 0.5851340216883456, "learning_rate": 9.994953325984116e-06, "loss": 0.0474, "step": 1263 }, { "epoch": 0.33896486993832126, "grad_norm": 0.7005734563979943, "learning_rate": 9.994883001911464e-06, "loss": 0.0691, "step": 1264 }, { "epoch": 0.3392330383480826, "grad_norm": 0.6783434105871767, "learning_rate": 9.994812191500313e-06, "loss": 0.0685, "step": 1265 }, { "epoch": 0.3395012067578439, "grad_norm": 0.6003280977215824, "learning_rate": 9.994740894757562e-06, "loss": 0.0511, "step": 1266 }, { "epoch": 0.33976937516760525, "grad_norm": 1.0974303054537655, "learning_rate": 9.99466911169015e-06, "loss": 0.0442, "step": 1267 }, { "epoch": 0.3400375435773666, "grad_norm": 0.6827668179140555, "learning_rate": 9.994596842305067e-06, "loss": 0.0585, "step": 1268 }, { "epoch": 0.3403057119871279, "grad_norm": 0.8259226948894844, "learning_rate": 9.994524086609349e-06, "loss": 0.0717, "step": 1269 }, { "epoch": 0.34057388039688924, "grad_norm": 0.7346163895712556, "learning_rate": 9.994450844610085e-06, "loss": 0.045, "step": 1270 }, { "epoch": 0.3408420488066506, "grad_norm": 0.39543826481643735, "learning_rate": 9.994377116314397e-06, "loss": 0.0344, "step": 1271 }, { "epoch": 0.3411102172164119, "grad_norm": 0.5800373328720898, "learning_rate": 9.994302901729472e-06, "loss": 0.0556, "step": 1272 }, { "epoch": 0.34137838562617323, "grad_norm": 0.5519684158816232, "learning_rate": 9.994228200862532e-06, "loss": 0.0438, "step": 1273 }, { "epoch": 0.3416465540359346, "grad_norm": 0.5751398448320688, "learning_rate": 9.994153013720853e-06, "loss": 0.0467, "step": 1274 }, { "epoch": 0.3419147224456959, "grad_norm": 0.642724842344229, "learning_rate": 9.994077340311751e-06, "loss": 0.0686, "step": 1275 }, { "epoch": 0.3421828908554572, "grad_norm": 0.49356457083655547, "learning_rate": 9.9940011806426e-06, "loss": 0.0505, "step": 1276 }, { "epoch": 0.34245105926521857, "grad_norm": 0.46694832922304197, "learning_rate": 9.993924534720814e-06, "loss": 0.0618, "step": 1277 }, { "epoch": 0.34271922767497986, "grad_norm": 0.36982061817828604, "learning_rate": 9.993847402553854e-06, "loss": 0.0501, "step": 1278 }, { "epoch": 0.3429873960847412, "grad_norm": 0.4995239486650329, "learning_rate": 9.993769784149229e-06, "loss": 0.0396, "step": 1279 }, { "epoch": 0.34325556449450256, "grad_norm": 0.6177456174818781, "learning_rate": 9.993691679514499e-06, "loss": 0.0542, "step": 1280 }, { "epoch": 0.34352373290426386, "grad_norm": 0.5107819308050413, "learning_rate": 9.993613088657267e-06, "loss": 0.0535, "step": 1281 }, { "epoch": 0.3437919013140252, "grad_norm": 0.5957240597566732, "learning_rate": 9.993534011585188e-06, "loss": 0.0703, "step": 1282 }, { "epoch": 0.34406006972378655, "grad_norm": 0.5658605239578426, "learning_rate": 9.993454448305961e-06, "loss": 0.0525, "step": 1283 }, { "epoch": 0.34432823813354785, "grad_norm": 0.5167561348024428, "learning_rate": 9.99337439882733e-06, "loss": 0.0515, "step": 1284 }, { "epoch": 0.3445964065433092, "grad_norm": 0.5499404042489819, "learning_rate": 9.993293863157092e-06, "loss": 0.058, "step": 1285 }, { "epoch": 0.34486457495307055, "grad_norm": 0.5381475485063885, "learning_rate": 9.993212841303087e-06, "loss": 0.038, "step": 1286 }, { "epoch": 0.34513274336283184, "grad_norm": 0.47875268790706615, "learning_rate": 9.993131333273203e-06, "loss": 0.0477, "step": 1287 }, { "epoch": 0.3454009117725932, "grad_norm": 0.9039109418458877, "learning_rate": 9.99304933907538e-06, "loss": 0.0627, "step": 1288 }, { "epoch": 0.34566908018235454, "grad_norm": 0.5251831747499652, "learning_rate": 9.992966858717597e-06, "loss": 0.0466, "step": 1289 }, { "epoch": 0.34593724859211583, "grad_norm": 0.42591907741104934, "learning_rate": 9.992883892207888e-06, "loss": 0.0379, "step": 1290 }, { "epoch": 0.3462054170018772, "grad_norm": 0.4219803045185595, "learning_rate": 9.992800439554331e-06, "loss": 0.055, "step": 1291 }, { "epoch": 0.34647358541163853, "grad_norm": 0.5276466399247531, "learning_rate": 9.992716500765049e-06, "loss": 0.056, "step": 1292 }, { "epoch": 0.3467417538213998, "grad_norm": 0.49990087461416416, "learning_rate": 9.992632075848218e-06, "loss": 0.0527, "step": 1293 }, { "epoch": 0.34700992223116117, "grad_norm": 0.44478894410907177, "learning_rate": 9.992547164812056e-06, "loss": 0.0521, "step": 1294 }, { "epoch": 0.3472780906409225, "grad_norm": 0.7637729685973763, "learning_rate": 9.992461767664833e-06, "loss": 0.0788, "step": 1295 }, { "epoch": 0.3475462590506838, "grad_norm": 0.522570047005033, "learning_rate": 9.992375884414862e-06, "loss": 0.0543, "step": 1296 }, { "epoch": 0.34781442746044516, "grad_norm": 0.4562174693155051, "learning_rate": 9.992289515070506e-06, "loss": 0.0554, "step": 1297 }, { "epoch": 0.3480825958702065, "grad_norm": 0.49980977283571265, "learning_rate": 9.992202659640173e-06, "loss": 0.0639, "step": 1298 }, { "epoch": 0.3483507642799678, "grad_norm": 0.4277720363213342, "learning_rate": 9.99211531813232e-06, "loss": 0.047, "step": 1299 }, { "epoch": 0.34861893268972916, "grad_norm": 0.4520260442436689, "learning_rate": 9.992027490555454e-06, "loss": 0.0455, "step": 1300 }, { "epoch": 0.3488871010994905, "grad_norm": 0.46186180773458396, "learning_rate": 9.991939176918127e-06, "loss": 0.053, "step": 1301 }, { "epoch": 0.3491552695092518, "grad_norm": 0.4178861470283483, "learning_rate": 9.991850377228932e-06, "loss": 0.0436, "step": 1302 }, { "epoch": 0.34942343791901315, "grad_norm": 0.6055802130945195, "learning_rate": 9.99176109149652e-06, "loss": 0.0669, "step": 1303 }, { "epoch": 0.3496916063287745, "grad_norm": 0.397151347765842, "learning_rate": 9.991671319729583e-06, "loss": 0.0366, "step": 1304 }, { "epoch": 0.3499597747385358, "grad_norm": 0.37708390837146133, "learning_rate": 9.991581061936864e-06, "loss": 0.0433, "step": 1305 }, { "epoch": 0.35022794314829714, "grad_norm": 1.0498601351046697, "learning_rate": 9.991490318127149e-06, "loss": 0.0585, "step": 1306 }, { "epoch": 0.3504961115580585, "grad_norm": 0.6765797780022859, "learning_rate": 9.991399088309273e-06, "loss": 0.0583, "step": 1307 }, { "epoch": 0.3507642799678198, "grad_norm": 0.4840836771083987, "learning_rate": 9.991307372492118e-06, "loss": 0.0379, "step": 1308 }, { "epoch": 0.35103244837758113, "grad_norm": 0.6389956233628771, "learning_rate": 9.991215170684618e-06, "loss": 0.0437, "step": 1309 }, { "epoch": 0.3513006167873424, "grad_norm": 0.8518253753262383, "learning_rate": 9.991122482895748e-06, "loss": 0.0663, "step": 1310 }, { "epoch": 0.3515687851971038, "grad_norm": 1.0277398900534278, "learning_rate": 9.991029309134533e-06, "loss": 0.0481, "step": 1311 }, { "epoch": 0.3518369536068651, "grad_norm": 0.4634799855311521, "learning_rate": 9.990935649410046e-06, "loss": 0.0492, "step": 1312 }, { "epoch": 0.3521051220166264, "grad_norm": 0.4825046604865742, "learning_rate": 9.990841503731403e-06, "loss": 0.0378, "step": 1313 }, { "epoch": 0.35237329042638776, "grad_norm": 0.6459396059372403, "learning_rate": 9.990746872107775e-06, "loss": 0.0617, "step": 1314 }, { "epoch": 0.3526414588361491, "grad_norm": 0.3842373939407726, "learning_rate": 9.990651754548374e-06, "loss": 0.0387, "step": 1315 }, { "epoch": 0.3529096272459104, "grad_norm": 0.7060714051663174, "learning_rate": 9.990556151062461e-06, "loss": 0.0532, "step": 1316 }, { "epoch": 0.35317779565567176, "grad_norm": 0.778506076799676, "learning_rate": 9.990460061659346e-06, "loss": 0.0528, "step": 1317 }, { "epoch": 0.3534459640654331, "grad_norm": 0.5069258753947526, "learning_rate": 9.990363486348384e-06, "loss": 0.0475, "step": 1318 }, { "epoch": 0.3537141324751944, "grad_norm": 0.9320111810595793, "learning_rate": 9.990266425138979e-06, "loss": 0.0567, "step": 1319 }, { "epoch": 0.35398230088495575, "grad_norm": 0.7319769166641872, "learning_rate": 9.990168878040581e-06, "loss": 0.0737, "step": 1320 }, { "epoch": 0.3542504692947171, "grad_norm": 0.48965015588875527, "learning_rate": 9.990070845062687e-06, "loss": 0.0405, "step": 1321 }, { "epoch": 0.3545186377044784, "grad_norm": 0.57053143327353, "learning_rate": 9.989972326214842e-06, "loss": 0.051, "step": 1322 }, { "epoch": 0.35478680611423974, "grad_norm": 0.4705955014165205, "learning_rate": 9.989873321506643e-06, "loss": 0.0441, "step": 1323 }, { "epoch": 0.3550549745240011, "grad_norm": 0.4422692632885619, "learning_rate": 9.989773830947724e-06, "loss": 0.0469, "step": 1324 }, { "epoch": 0.3553231429337624, "grad_norm": 0.6531553954413803, "learning_rate": 9.989673854547778e-06, "loss": 0.0734, "step": 1325 }, { "epoch": 0.35559131134352373, "grad_norm": 0.6493501769789799, "learning_rate": 9.989573392316531e-06, "loss": 0.0594, "step": 1326 }, { "epoch": 0.3558594797532851, "grad_norm": 0.6505359519446975, "learning_rate": 9.989472444263773e-06, "loss": 0.0526, "step": 1327 }, { "epoch": 0.3561276481630464, "grad_norm": 0.5554740860009785, "learning_rate": 9.989371010399329e-06, "loss": 0.0539, "step": 1328 }, { "epoch": 0.3563958165728077, "grad_norm": 0.6903193022984716, "learning_rate": 9.989269090733078e-06, "loss": 0.0741, "step": 1329 }, { "epoch": 0.35666398498256907, "grad_norm": 0.48923714967170945, "learning_rate": 9.98916668527494e-06, "loss": 0.0483, "step": 1330 }, { "epoch": 0.35693215339233036, "grad_norm": 0.6731720070292243, "learning_rate": 9.98906379403489e-06, "loss": 0.0521, "step": 1331 }, { "epoch": 0.3572003218020917, "grad_norm": 0.6401578741863662, "learning_rate": 9.988960417022942e-06, "loss": 0.0665, "step": 1332 }, { "epoch": 0.35746849021185306, "grad_norm": 0.559259783618805, "learning_rate": 9.988856554249163e-06, "loss": 0.0413, "step": 1333 }, { "epoch": 0.35773665862161436, "grad_norm": 0.580708091016445, "learning_rate": 9.988752205723668e-06, "loss": 0.0526, "step": 1334 }, { "epoch": 0.3580048270313757, "grad_norm": 0.5785587006161642, "learning_rate": 9.988647371456614e-06, "loss": 0.0651, "step": 1335 }, { "epoch": 0.35827299544113705, "grad_norm": 0.4559278963571047, "learning_rate": 9.988542051458211e-06, "loss": 0.0488, "step": 1336 }, { "epoch": 0.35854116385089835, "grad_norm": 0.5337252755996704, "learning_rate": 9.988436245738714e-06, "loss": 0.054, "step": 1337 }, { "epoch": 0.3588093322606597, "grad_norm": 0.5822690385648438, "learning_rate": 9.988329954308423e-06, "loss": 0.0543, "step": 1338 }, { "epoch": 0.35907750067042105, "grad_norm": 0.5339882584104336, "learning_rate": 9.988223177177688e-06, "loss": 0.0513, "step": 1339 }, { "epoch": 0.35934566908018234, "grad_norm": 0.5741821274216442, "learning_rate": 9.988115914356906e-06, "loss": 0.0552, "step": 1340 }, { "epoch": 0.3596138374899437, "grad_norm": 0.6667213116760979, "learning_rate": 9.98800816585652e-06, "loss": 0.053, "step": 1341 }, { "epoch": 0.35988200589970504, "grad_norm": 0.46428363199712525, "learning_rate": 9.987899931687021e-06, "loss": 0.0406, "step": 1342 }, { "epoch": 0.36015017430946633, "grad_norm": 0.6755381552305845, "learning_rate": 9.98779121185895e-06, "loss": 0.0517, "step": 1343 }, { "epoch": 0.3604183427192277, "grad_norm": 0.6398551206927541, "learning_rate": 9.987682006382891e-06, "loss": 0.0498, "step": 1344 }, { "epoch": 0.36068651112898903, "grad_norm": 0.47982944839862945, "learning_rate": 9.987572315269475e-06, "loss": 0.0491, "step": 1345 }, { "epoch": 0.3609546795387503, "grad_norm": 0.474606418957216, "learning_rate": 9.987462138529387e-06, "loss": 0.0401, "step": 1346 }, { "epoch": 0.36122284794851167, "grad_norm": 0.4887012362353088, "learning_rate": 9.987351476173352e-06, "loss": 0.0409, "step": 1347 }, { "epoch": 0.361491016358273, "grad_norm": 0.5209333140026641, "learning_rate": 9.987240328212144e-06, "loss": 0.078, "step": 1348 }, { "epoch": 0.3617591847680343, "grad_norm": 0.5177846902182508, "learning_rate": 9.987128694656585e-06, "loss": 0.0493, "step": 1349 }, { "epoch": 0.36202735317779566, "grad_norm": 0.34744880810299267, "learning_rate": 9.987016575517546e-06, "loss": 0.0429, "step": 1350 }, { "epoch": 0.362295521587557, "grad_norm": 0.41574452069130186, "learning_rate": 9.986903970805944e-06, "loss": 0.0377, "step": 1351 }, { "epoch": 0.3625636899973183, "grad_norm": 0.4276759332150814, "learning_rate": 9.986790880532741e-06, "loss": 0.066, "step": 1352 }, { "epoch": 0.36283185840707965, "grad_norm": 0.3752553111902337, "learning_rate": 9.986677304708952e-06, "loss": 0.0386, "step": 1353 }, { "epoch": 0.36310002681684095, "grad_norm": 0.4675732142592896, "learning_rate": 9.986563243345633e-06, "loss": 0.0472, "step": 1354 }, { "epoch": 0.3633681952266023, "grad_norm": 0.49895234443550995, "learning_rate": 9.986448696453887e-06, "loss": 0.0437, "step": 1355 }, { "epoch": 0.36363636363636365, "grad_norm": 0.4759400809143723, "learning_rate": 9.986333664044874e-06, "loss": 0.0442, "step": 1356 }, { "epoch": 0.36390453204612494, "grad_norm": 0.7809012490443222, "learning_rate": 9.986218146129789e-06, "loss": 0.0539, "step": 1357 }, { "epoch": 0.3641727004558863, "grad_norm": 0.4419410675720373, "learning_rate": 9.986102142719881e-06, "loss": 0.0415, "step": 1358 }, { "epoch": 0.36444086886564764, "grad_norm": 0.44661705689939796, "learning_rate": 9.985985653826444e-06, "loss": 0.0488, "step": 1359 }, { "epoch": 0.36470903727540893, "grad_norm": 0.4420119341665048, "learning_rate": 9.985868679460824e-06, "loss": 0.0375, "step": 1360 }, { "epoch": 0.3649772056851703, "grad_norm": 0.4735353599702838, "learning_rate": 9.985751219634406e-06, "loss": 0.0437, "step": 1361 }, { "epoch": 0.36524537409493163, "grad_norm": 0.6893757938273076, "learning_rate": 9.98563327435863e-06, "loss": 0.0712, "step": 1362 }, { "epoch": 0.3655135425046929, "grad_norm": 0.4074776746040115, "learning_rate": 9.985514843644978e-06, "loss": 0.0489, "step": 1363 }, { "epoch": 0.36578171091445427, "grad_norm": 0.4898356575508194, "learning_rate": 9.985395927504981e-06, "loss": 0.041, "step": 1364 }, { "epoch": 0.3660498793242156, "grad_norm": 0.5550625430428274, "learning_rate": 9.985276525950221e-06, "loss": 0.0404, "step": 1365 }, { "epoch": 0.3663180477339769, "grad_norm": 0.7342254774305252, "learning_rate": 9.98515663899232e-06, "loss": 0.0437, "step": 1366 }, { "epoch": 0.36658621614373826, "grad_norm": 0.47754662716095736, "learning_rate": 9.985036266642952e-06, "loss": 0.0416, "step": 1367 }, { "epoch": 0.3668543845534996, "grad_norm": 0.5840497009801184, "learning_rate": 9.984915408913837e-06, "loss": 0.0435, "step": 1368 }, { "epoch": 0.3671225529632609, "grad_norm": 0.490322351220109, "learning_rate": 9.984794065816744e-06, "loss": 0.0551, "step": 1369 }, { "epoch": 0.36739072137302226, "grad_norm": 0.43913971012292896, "learning_rate": 9.984672237363487e-06, "loss": 0.0431, "step": 1370 }, { "epoch": 0.3676588897827836, "grad_norm": 0.38038132117967227, "learning_rate": 9.98454992356593e-06, "loss": 0.0345, "step": 1371 }, { "epoch": 0.3679270581925449, "grad_norm": 0.5660507623698914, "learning_rate": 9.984427124435978e-06, "loss": 0.0426, "step": 1372 }, { "epoch": 0.36819522660230625, "grad_norm": 0.578623278729196, "learning_rate": 9.984303839985593e-06, "loss": 0.0534, "step": 1373 }, { "epoch": 0.3684633950120676, "grad_norm": 0.5730871549562503, "learning_rate": 9.984180070226774e-06, "loss": 0.0535, "step": 1374 }, { "epoch": 0.3687315634218289, "grad_norm": 0.9317908754052814, "learning_rate": 9.984055815171576e-06, "loss": 0.066, "step": 1375 }, { "epoch": 0.36899973183159024, "grad_norm": 0.44931134962916097, "learning_rate": 9.983931074832094e-06, "loss": 0.0431, "step": 1376 }, { "epoch": 0.3692679002413516, "grad_norm": 0.42163778082235204, "learning_rate": 9.983805849220475e-06, "loss": 0.0337, "step": 1377 }, { "epoch": 0.3695360686511129, "grad_norm": 0.5783753666495938, "learning_rate": 9.983680138348914e-06, "loss": 0.0419, "step": 1378 }, { "epoch": 0.36980423706087423, "grad_norm": 0.6042800248274317, "learning_rate": 9.983553942229649e-06, "loss": 0.0442, "step": 1379 }, { "epoch": 0.3700724054706356, "grad_norm": 0.5290235600246085, "learning_rate": 9.983427260874967e-06, "loss": 0.0503, "step": 1380 }, { "epoch": 0.3703405738803969, "grad_norm": 0.47350076586267714, "learning_rate": 9.983300094297205e-06, "loss": 0.0468, "step": 1381 }, { "epoch": 0.3706087422901582, "grad_norm": 0.5343609301608477, "learning_rate": 9.983172442508743e-06, "loss": 0.0471, "step": 1382 }, { "epoch": 0.37087691069991957, "grad_norm": 0.4194077126788963, "learning_rate": 9.983044305522007e-06, "loss": 0.027, "step": 1383 }, { "epoch": 0.37114507910968086, "grad_norm": 0.6802081209523023, "learning_rate": 9.98291568334948e-06, "loss": 0.0506, "step": 1384 }, { "epoch": 0.3714132475194422, "grad_norm": 0.5234856291234419, "learning_rate": 9.982786576003682e-06, "loss": 0.0514, "step": 1385 }, { "epoch": 0.37168141592920356, "grad_norm": 0.5926422302263622, "learning_rate": 9.982656983497183e-06, "loss": 0.0468, "step": 1386 }, { "epoch": 0.37194958433896486, "grad_norm": 0.5520763702431307, "learning_rate": 9.982526905842601e-06, "loss": 0.0417, "step": 1387 }, { "epoch": 0.3722177527487262, "grad_norm": 0.3956913429740374, "learning_rate": 9.982396343052606e-06, "loss": 0.0342, "step": 1388 }, { "epoch": 0.37248592115848755, "grad_norm": 0.5835380426673424, "learning_rate": 9.982265295139906e-06, "loss": 0.0524, "step": 1389 }, { "epoch": 0.37275408956824885, "grad_norm": 0.9411097239192372, "learning_rate": 9.982133762117262e-06, "loss": 0.0762, "step": 1390 }, { "epoch": 0.3730222579780102, "grad_norm": 0.7319551021174818, "learning_rate": 9.982001743997477e-06, "loss": 0.0498, "step": 1391 }, { "epoch": 0.37329042638777155, "grad_norm": 0.4576709008205559, "learning_rate": 9.981869240793414e-06, "loss": 0.0385, "step": 1392 }, { "epoch": 0.37355859479753284, "grad_norm": 0.42831650216039885, "learning_rate": 9.981736252517967e-06, "loss": 0.0297, "step": 1393 }, { "epoch": 0.3738267632072942, "grad_norm": 0.4726164380036066, "learning_rate": 9.981602779184089e-06, "loss": 0.0488, "step": 1394 }, { "epoch": 0.37409493161705554, "grad_norm": 0.4643216536315915, "learning_rate": 9.981468820804774e-06, "loss": 0.039, "step": 1395 }, { "epoch": 0.37436310002681683, "grad_norm": 0.6958952257581869, "learning_rate": 9.981334377393063e-06, "loss": 0.0559, "step": 1396 }, { "epoch": 0.3746312684365782, "grad_norm": 0.5852392534460145, "learning_rate": 9.98119944896205e-06, "loss": 0.0505, "step": 1397 }, { "epoch": 0.37489943684633953, "grad_norm": 0.3958672556069444, "learning_rate": 9.981064035524875e-06, "loss": 0.0366, "step": 1398 }, { "epoch": 0.3751676052561008, "grad_norm": 0.49623659509855444, "learning_rate": 9.980928137094716e-06, "loss": 0.0401, "step": 1399 }, { "epoch": 0.37543577366586217, "grad_norm": 0.7581586890739327, "learning_rate": 9.980791753684809e-06, "loss": 0.0793, "step": 1400 }, { "epoch": 0.37570394207562346, "grad_norm": 0.3510886591633425, "learning_rate": 9.980654885308433e-06, "loss": 0.0355, "step": 1401 }, { "epoch": 0.3759721104853848, "grad_norm": 0.7732746990894042, "learning_rate": 9.980517531978914e-06, "loss": 0.0637, "step": 1402 }, { "epoch": 0.37624027889514616, "grad_norm": 0.45666944170885615, "learning_rate": 9.980379693709626e-06, "loss": 0.0427, "step": 1403 }, { "epoch": 0.37650844730490746, "grad_norm": 0.4787066788974094, "learning_rate": 9.980241370513989e-06, "loss": 0.0568, "step": 1404 }, { "epoch": 0.3767766157146688, "grad_norm": 0.4654427281543683, "learning_rate": 9.980102562405474e-06, "loss": 0.0561, "step": 1405 }, { "epoch": 0.37704478412443015, "grad_norm": 0.5599225065243438, "learning_rate": 9.979963269397592e-06, "loss": 0.0484, "step": 1406 }, { "epoch": 0.37731295253419145, "grad_norm": 0.3823018157604615, "learning_rate": 9.979823491503909e-06, "loss": 0.0343, "step": 1407 }, { "epoch": 0.3775811209439528, "grad_norm": 0.6441600031989471, "learning_rate": 9.979683228738034e-06, "loss": 0.0428, "step": 1408 }, { "epoch": 0.37784928935371415, "grad_norm": 0.48365263685157994, "learning_rate": 9.979542481113625e-06, "loss": 0.042, "step": 1409 }, { "epoch": 0.37811745776347544, "grad_norm": 0.4933179841794089, "learning_rate": 9.979401248644383e-06, "loss": 0.042, "step": 1410 }, { "epoch": 0.3783856261732368, "grad_norm": 1.0468988150171847, "learning_rate": 9.979259531344062e-06, "loss": 0.0524, "step": 1411 }, { "epoch": 0.37865379458299814, "grad_norm": 0.46632216206808014, "learning_rate": 9.97911732922646e-06, "loss": 0.045, "step": 1412 }, { "epoch": 0.37892196299275943, "grad_norm": 0.46767908699365535, "learning_rate": 9.978974642305424e-06, "loss": 0.0727, "step": 1413 }, { "epoch": 0.3791901314025208, "grad_norm": 0.38075460376642184, "learning_rate": 9.978831470594846e-06, "loss": 0.0535, "step": 1414 }, { "epoch": 0.37945829981228213, "grad_norm": 0.4659791271109738, "learning_rate": 9.978687814108666e-06, "loss": 0.0416, "step": 1415 }, { "epoch": 0.3797264682220434, "grad_norm": 0.4240134075415706, "learning_rate": 9.97854367286087e-06, "loss": 0.0438, "step": 1416 }, { "epoch": 0.37999463663180477, "grad_norm": 0.4538111104862816, "learning_rate": 9.978399046865498e-06, "loss": 0.0717, "step": 1417 }, { "epoch": 0.3802628050415661, "grad_norm": 0.4100673087654287, "learning_rate": 9.978253936136624e-06, "loss": 0.0436, "step": 1418 }, { "epoch": 0.3805309734513274, "grad_norm": 0.4799623130181287, "learning_rate": 9.978108340688383e-06, "loss": 0.0458, "step": 1419 }, { "epoch": 0.38079914186108876, "grad_norm": 0.4443899332532248, "learning_rate": 9.97796226053495e-06, "loss": 0.0377, "step": 1420 }, { "epoch": 0.3810673102708501, "grad_norm": 0.5300463512289733, "learning_rate": 9.977815695690547e-06, "loss": 0.0431, "step": 1421 }, { "epoch": 0.3813354786806114, "grad_norm": 0.5234572709973816, "learning_rate": 9.977668646169447e-06, "loss": 0.061, "step": 1422 }, { "epoch": 0.38160364709037276, "grad_norm": 0.38089329821921597, "learning_rate": 9.977521111985965e-06, "loss": 0.041, "step": 1423 }, { "epoch": 0.3818718155001341, "grad_norm": 0.5109836358567956, "learning_rate": 9.977373093154468e-06, "loss": 0.0458, "step": 1424 }, { "epoch": 0.3821399839098954, "grad_norm": 0.3724071955563545, "learning_rate": 9.977224589689366e-06, "loss": 0.0368, "step": 1425 }, { "epoch": 0.38240815231965675, "grad_norm": 0.4487793140625272, "learning_rate": 9.977075601605123e-06, "loss": 0.0355, "step": 1426 }, { "epoch": 0.3826763207294181, "grad_norm": 0.5629154801988521, "learning_rate": 9.976926128916242e-06, "loss": 0.051, "step": 1427 }, { "epoch": 0.3829444891391794, "grad_norm": 0.5217214389959313, "learning_rate": 9.976776171637275e-06, "loss": 0.0726, "step": 1428 }, { "epoch": 0.38321265754894074, "grad_norm": 0.5314739590759987, "learning_rate": 9.976625729782827e-06, "loss": 0.0511, "step": 1429 }, { "epoch": 0.3834808259587021, "grad_norm": 0.4960070344457156, "learning_rate": 9.976474803367544e-06, "loss": 0.0439, "step": 1430 }, { "epoch": 0.3837489943684634, "grad_norm": 0.38587853940109235, "learning_rate": 9.976323392406122e-06, "loss": 0.0479, "step": 1431 }, { "epoch": 0.38401716277822473, "grad_norm": 0.5398012707240039, "learning_rate": 9.976171496913303e-06, "loss": 0.0401, "step": 1432 }, { "epoch": 0.3842853311879861, "grad_norm": 0.4346807089344981, "learning_rate": 9.976019116903875e-06, "loss": 0.0439, "step": 1433 }, { "epoch": 0.3845534995977474, "grad_norm": 0.6565075836443373, "learning_rate": 9.975866252392678e-06, "loss": 0.0615, "step": 1434 }, { "epoch": 0.3848216680075087, "grad_norm": 0.46036894970990455, "learning_rate": 9.975712903394597e-06, "loss": 0.0521, "step": 1435 }, { "epoch": 0.38508983641727007, "grad_norm": 0.38701226068556727, "learning_rate": 9.975559069924558e-06, "loss": 0.0415, "step": 1436 }, { "epoch": 0.38535800482703136, "grad_norm": 0.36945310510925883, "learning_rate": 9.975404751997543e-06, "loss": 0.0356, "step": 1437 }, { "epoch": 0.3856261732367927, "grad_norm": 0.5141870173591446, "learning_rate": 9.975249949628576e-06, "loss": 0.0449, "step": 1438 }, { "epoch": 0.38589434164655406, "grad_norm": 0.5391929841638313, "learning_rate": 9.975094662832732e-06, "loss": 0.0463, "step": 1439 }, { "epoch": 0.38616251005631536, "grad_norm": 0.5242149219576956, "learning_rate": 9.974938891625128e-06, "loss": 0.0547, "step": 1440 }, { "epoch": 0.3864306784660767, "grad_norm": 0.37537260138139494, "learning_rate": 9.974782636020933e-06, "loss": 0.0296, "step": 1441 }, { "epoch": 0.38669884687583805, "grad_norm": 0.5324900321642044, "learning_rate": 9.974625896035361e-06, "loss": 0.0421, "step": 1442 }, { "epoch": 0.38696701528559935, "grad_norm": 0.9984801550917732, "learning_rate": 9.974468671683673e-06, "loss": 0.0613, "step": 1443 }, { "epoch": 0.3872351836953607, "grad_norm": 0.4202347866884358, "learning_rate": 9.974310962981176e-06, "loss": 0.0504, "step": 1444 }, { "epoch": 0.387503352105122, "grad_norm": 0.49957061268931924, "learning_rate": 9.974152769943227e-06, "loss": 0.0517, "step": 1445 }, { "epoch": 0.38777152051488334, "grad_norm": 0.479976258927118, "learning_rate": 9.97399409258523e-06, "loss": 0.0566, "step": 1446 }, { "epoch": 0.3880396889246447, "grad_norm": 0.7692274486872865, "learning_rate": 9.973834930922634e-06, "loss": 0.0729, "step": 1447 }, { "epoch": 0.388307857334406, "grad_norm": 0.4691759416819609, "learning_rate": 9.973675284970936e-06, "loss": 0.0425, "step": 1448 }, { "epoch": 0.38857602574416733, "grad_norm": 0.6746321261388285, "learning_rate": 9.973515154745679e-06, "loss": 0.0627, "step": 1449 }, { "epoch": 0.3888441941539287, "grad_norm": 0.7383042564015457, "learning_rate": 9.973354540262456e-06, "loss": 0.0573, "step": 1450 }, { "epoch": 0.38911236256369, "grad_norm": 0.6949022748509741, "learning_rate": 9.973193441536905e-06, "loss": 0.0623, "step": 1451 }, { "epoch": 0.3893805309734513, "grad_norm": 0.6076571369328968, "learning_rate": 9.973031858584716e-06, "loss": 0.0536, "step": 1452 }, { "epoch": 0.38964869938321267, "grad_norm": 0.4937626334395001, "learning_rate": 9.972869791421615e-06, "loss": 0.0469, "step": 1453 }, { "epoch": 0.38991686779297396, "grad_norm": 0.6935468145804896, "learning_rate": 9.972707240063385e-06, "loss": 0.0492, "step": 1454 }, { "epoch": 0.3901850362027353, "grad_norm": 0.5307630703036336, "learning_rate": 9.972544204525853e-06, "loss": 0.0456, "step": 1455 }, { "epoch": 0.39045320461249666, "grad_norm": 0.4598150390437496, "learning_rate": 9.972380684824896e-06, "loss": 0.0522, "step": 1456 }, { "epoch": 0.39072137302225796, "grad_norm": 0.729621194188739, "learning_rate": 9.97221668097643e-06, "loss": 0.0689, "step": 1457 }, { "epoch": 0.3909895414320193, "grad_norm": 0.621999751544139, "learning_rate": 9.97205219299643e-06, "loss": 0.0578, "step": 1458 }, { "epoch": 0.39125770984178065, "grad_norm": 0.48484683547037954, "learning_rate": 9.971887220900907e-06, "loss": 0.0586, "step": 1459 }, { "epoch": 0.39152587825154195, "grad_norm": 0.7533900547061578, "learning_rate": 9.971721764705924e-06, "loss": 0.0674, "step": 1460 }, { "epoch": 0.3917940466613033, "grad_norm": 0.4832192375594444, "learning_rate": 9.971555824427594e-06, "loss": 0.0509, "step": 1461 }, { "epoch": 0.39206221507106465, "grad_norm": 0.5832665691678989, "learning_rate": 9.971389400082073e-06, "loss": 0.0585, "step": 1462 }, { "epoch": 0.39233038348082594, "grad_norm": 0.5559653938928762, "learning_rate": 9.971222491685564e-06, "loss": 0.0401, "step": 1463 }, { "epoch": 0.3925985518905873, "grad_norm": 0.40470074450263593, "learning_rate": 9.971055099254318e-06, "loss": 0.0357, "step": 1464 }, { "epoch": 0.39286672030034864, "grad_norm": 0.6743024056813168, "learning_rate": 9.970887222804637e-06, "loss": 0.0711, "step": 1465 }, { "epoch": 0.39313488871010993, "grad_norm": 1.0236978797170537, "learning_rate": 9.970718862352865e-06, "loss": 0.0513, "step": 1466 }, { "epoch": 0.3934030571198713, "grad_norm": 0.6387977085505682, "learning_rate": 9.970550017915393e-06, "loss": 0.0575, "step": 1467 }, { "epoch": 0.39367122552963263, "grad_norm": 0.6916774413735859, "learning_rate": 9.970380689508664e-06, "loss": 0.0488, "step": 1468 }, { "epoch": 0.3939393939393939, "grad_norm": 0.7584007830815458, "learning_rate": 9.97021087714916e-06, "loss": 0.0597, "step": 1469 }, { "epoch": 0.39420756234915527, "grad_norm": 0.3948690896021368, "learning_rate": 9.970040580853422e-06, "loss": 0.0398, "step": 1470 }, { "epoch": 0.3944757307589166, "grad_norm": 0.3631523775322716, "learning_rate": 9.969869800638026e-06, "loss": 0.032, "step": 1471 }, { "epoch": 0.3947438991686779, "grad_norm": 0.6044347965634627, "learning_rate": 9.969698536519603e-06, "loss": 0.0457, "step": 1472 }, { "epoch": 0.39501206757843926, "grad_norm": 0.4279184496651392, "learning_rate": 9.969526788514827e-06, "loss": 0.0351, "step": 1473 }, { "epoch": 0.3952802359882006, "grad_norm": 0.5568081474423444, "learning_rate": 9.969354556640423e-06, "loss": 0.049, "step": 1474 }, { "epoch": 0.3955484043979619, "grad_norm": 0.5232715356198936, "learning_rate": 9.969181840913159e-06, "loss": 0.0486, "step": 1475 }, { "epoch": 0.39581657280772325, "grad_norm": 0.6600744673638719, "learning_rate": 9.969008641349848e-06, "loss": 0.0397, "step": 1476 }, { "epoch": 0.3960847412174846, "grad_norm": 0.7289736681617289, "learning_rate": 9.968834957967363e-06, "loss": 0.0759, "step": 1477 }, { "epoch": 0.3963529096272459, "grad_norm": 0.48884858872863574, "learning_rate": 9.968660790782608e-06, "loss": 0.0411, "step": 1478 }, { "epoch": 0.39662107803700725, "grad_norm": 0.5627631949884025, "learning_rate": 9.968486139812544e-06, "loss": 0.0441, "step": 1479 }, { "epoch": 0.3968892464467686, "grad_norm": 0.6120268467122656, "learning_rate": 9.968311005074175e-06, "loss": 0.045, "step": 1480 }, { "epoch": 0.3971574148565299, "grad_norm": 0.6758895689277288, "learning_rate": 9.968135386584554e-06, "loss": 0.042, "step": 1481 }, { "epoch": 0.39742558326629124, "grad_norm": 0.5316117909588363, "learning_rate": 9.967959284360781e-06, "loss": 0.0453, "step": 1482 }, { "epoch": 0.3976937516760526, "grad_norm": 0.4211777878796908, "learning_rate": 9.967782698420004e-06, "loss": 0.0427, "step": 1483 }, { "epoch": 0.3979619200858139, "grad_norm": 0.6619456950817497, "learning_rate": 9.967605628779412e-06, "loss": 0.0422, "step": 1484 }, { "epoch": 0.39823008849557523, "grad_norm": 1.1739600897471947, "learning_rate": 9.967428075456248e-06, "loss": 0.0541, "step": 1485 }, { "epoch": 0.3984982569053366, "grad_norm": 0.511679976345095, "learning_rate": 9.967250038467803e-06, "loss": 0.0563, "step": 1486 }, { "epoch": 0.39876642531509787, "grad_norm": 0.6363511043776823, "learning_rate": 9.967071517831411e-06, "loss": 0.0537, "step": 1487 }, { "epoch": 0.3990345937248592, "grad_norm": 0.4250145312251278, "learning_rate": 9.96689251356445e-06, "loss": 0.0529, "step": 1488 }, { "epoch": 0.3993027621346205, "grad_norm": 0.7341479308550573, "learning_rate": 9.966713025684351e-06, "loss": 0.0545, "step": 1489 }, { "epoch": 0.39957093054438186, "grad_norm": 0.7234099827193285, "learning_rate": 9.966533054208593e-06, "loss": 0.0722, "step": 1490 }, { "epoch": 0.3998390989541432, "grad_norm": 0.5179774960511757, "learning_rate": 9.966352599154697e-06, "loss": 0.0404, "step": 1491 }, { "epoch": 0.4001072673639045, "grad_norm": 0.6020615267730577, "learning_rate": 9.966171660540233e-06, "loss": 0.0476, "step": 1492 }, { "epoch": 0.40037543577366586, "grad_norm": 0.4817371378206565, "learning_rate": 9.96599023838282e-06, "loss": 0.0506, "step": 1493 }, { "epoch": 0.4006436041834272, "grad_norm": 0.44723070637766194, "learning_rate": 9.965808332700122e-06, "loss": 0.045, "step": 1494 }, { "epoch": 0.4009117725931885, "grad_norm": 0.6039138079646764, "learning_rate": 9.965625943509851e-06, "loss": 0.0729, "step": 1495 }, { "epoch": 0.40117994100294985, "grad_norm": 0.3655912305770551, "learning_rate": 9.965443070829763e-06, "loss": 0.0348, "step": 1496 }, { "epoch": 0.4014481094127112, "grad_norm": 0.7820269867978273, "learning_rate": 9.965259714677668e-06, "loss": 0.0518, "step": 1497 }, { "epoch": 0.4017162778224725, "grad_norm": 0.3538558107447198, "learning_rate": 9.965075875071417e-06, "loss": 0.0375, "step": 1498 }, { "epoch": 0.40198444623223384, "grad_norm": 0.6178258602842533, "learning_rate": 9.964891552028911e-06, "loss": 0.0417, "step": 1499 }, { "epoch": 0.4022526146419952, "grad_norm": 0.4536696421404748, "learning_rate": 9.964706745568096e-06, "loss": 0.0479, "step": 1500 }, { "epoch": 0.4025207830517565, "grad_norm": 0.5099670672020854, "learning_rate": 9.964521455706965e-06, "loss": 0.0399, "step": 1501 }, { "epoch": 0.40278895146151783, "grad_norm": 0.5779780026066804, "learning_rate": 9.964335682463561e-06, "loss": 0.0558, "step": 1502 }, { "epoch": 0.4030571198712792, "grad_norm": 0.8210955795531958, "learning_rate": 9.964149425855971e-06, "loss": 0.0525, "step": 1503 }, { "epoch": 0.4033252882810405, "grad_norm": 1.0478277529182631, "learning_rate": 9.963962685902331e-06, "loss": 0.0611, "step": 1504 }, { "epoch": 0.4035934566908018, "grad_norm": 0.505294396513007, "learning_rate": 9.963775462620825e-06, "loss": 0.0454, "step": 1505 }, { "epoch": 0.40386162510056317, "grad_norm": 0.9066308955405175, "learning_rate": 9.963587756029679e-06, "loss": 0.0465, "step": 1506 }, { "epoch": 0.40412979351032446, "grad_norm": 0.5019543675031959, "learning_rate": 9.963399566147172e-06, "loss": 0.0572, "step": 1507 }, { "epoch": 0.4043979619200858, "grad_norm": 0.40156110640085935, "learning_rate": 9.963210892991626e-06, "loss": 0.0363, "step": 1508 }, { "epoch": 0.40466613032984716, "grad_norm": 0.5787893379514217, "learning_rate": 9.963021736581413e-06, "loss": 0.0494, "step": 1509 }, { "epoch": 0.40493429873960846, "grad_norm": 0.6864524043023535, "learning_rate": 9.962832096934952e-06, "loss": 0.0612, "step": 1510 }, { "epoch": 0.4052024671493698, "grad_norm": 0.5714415032227125, "learning_rate": 9.962641974070702e-06, "loss": 0.0628, "step": 1511 }, { "epoch": 0.40547063555913115, "grad_norm": 0.6093447704244753, "learning_rate": 9.962451368007182e-06, "loss": 0.0629, "step": 1512 }, { "epoch": 0.40573880396889245, "grad_norm": 0.5614457167971783, "learning_rate": 9.962260278762946e-06, "loss": 0.0363, "step": 1513 }, { "epoch": 0.4060069723786538, "grad_norm": 0.4806050014375334, "learning_rate": 9.9620687063566e-06, "loss": 0.0395, "step": 1514 }, { "epoch": 0.40627514078841515, "grad_norm": 0.3849303782813662, "learning_rate": 9.961876650806799e-06, "loss": 0.0419, "step": 1515 }, { "epoch": 0.40654330919817644, "grad_norm": 0.4952169567558052, "learning_rate": 9.961684112132242e-06, "loss": 0.0348, "step": 1516 }, { "epoch": 0.4068114776079378, "grad_norm": 0.49944364398352076, "learning_rate": 9.961491090351676e-06, "loss": 0.0545, "step": 1517 }, { "epoch": 0.40707964601769914, "grad_norm": 0.4481913069439499, "learning_rate": 9.961297585483895e-06, "loss": 0.0494, "step": 1518 }, { "epoch": 0.40734781442746043, "grad_norm": 0.48121855869425484, "learning_rate": 9.961103597547741e-06, "loss": 0.0516, "step": 1519 }, { "epoch": 0.4076159828372218, "grad_norm": 0.5555965516580667, "learning_rate": 9.960909126562102e-06, "loss": 0.0613, "step": 1520 }, { "epoch": 0.40788415124698313, "grad_norm": 0.7679943289847897, "learning_rate": 9.96071417254591e-06, "loss": 0.0658, "step": 1521 }, { "epoch": 0.4081523196567444, "grad_norm": 0.3603558260148338, "learning_rate": 9.96051873551815e-06, "loss": 0.039, "step": 1522 }, { "epoch": 0.40842048806650577, "grad_norm": 0.4828801955429522, "learning_rate": 9.960322815497852e-06, "loss": 0.0498, "step": 1523 }, { "epoch": 0.4086886564762671, "grad_norm": 0.5000190310469207, "learning_rate": 9.960126412504089e-06, "loss": 0.0495, "step": 1524 }, { "epoch": 0.4089568248860284, "grad_norm": 0.5593440535457198, "learning_rate": 9.959929526555987e-06, "loss": 0.0536, "step": 1525 }, { "epoch": 0.40922499329578976, "grad_norm": 0.371495004109536, "learning_rate": 9.959732157672715e-06, "loss": 0.0424, "step": 1526 }, { "epoch": 0.4094931617055511, "grad_norm": 0.42931403654557876, "learning_rate": 9.95953430587349e-06, "loss": 0.0441, "step": 1527 }, { "epoch": 0.4097613301153124, "grad_norm": 0.48393681157996615, "learning_rate": 9.95933597117758e-06, "loss": 0.0579, "step": 1528 }, { "epoch": 0.41002949852507375, "grad_norm": 0.4026835220522151, "learning_rate": 9.95913715360429e-06, "loss": 0.0451, "step": 1529 }, { "epoch": 0.4102976669348351, "grad_norm": 0.45003484204788097, "learning_rate": 9.958937853172982e-06, "loss": 0.0554, "step": 1530 }, { "epoch": 0.4105658353445964, "grad_norm": 0.4828875688079087, "learning_rate": 9.958738069903062e-06, "loss": 0.0384, "step": 1531 }, { "epoch": 0.41083400375435775, "grad_norm": 0.4112170299303783, "learning_rate": 9.958537803813981e-06, "loss": 0.0442, "step": 1532 }, { "epoch": 0.41110217216411904, "grad_norm": 0.5178106943809002, "learning_rate": 9.958337054925239e-06, "loss": 0.0574, "step": 1533 }, { "epoch": 0.4113703405738804, "grad_norm": 0.4614124403584122, "learning_rate": 9.958135823256381e-06, "loss": 0.0501, "step": 1534 }, { "epoch": 0.41163850898364174, "grad_norm": 0.47731808924633257, "learning_rate": 9.957934108827003e-06, "loss": 0.052, "step": 1535 }, { "epoch": 0.41190667739340303, "grad_norm": 0.3426622014631583, "learning_rate": 9.957731911656743e-06, "loss": 0.0368, "step": 1536 }, { "epoch": 0.4121748458031644, "grad_norm": 0.34897719156197654, "learning_rate": 9.957529231765289e-06, "loss": 0.0345, "step": 1537 }, { "epoch": 0.41244301421292573, "grad_norm": 0.4462920273034147, "learning_rate": 9.957326069172378e-06, "loss": 0.0528, "step": 1538 }, { "epoch": 0.412711182622687, "grad_norm": 0.41206181916970774, "learning_rate": 9.957122423897786e-06, "loss": 0.0429, "step": 1539 }, { "epoch": 0.41297935103244837, "grad_norm": 0.6963846821272254, "learning_rate": 9.956918295961347e-06, "loss": 0.0728, "step": 1540 }, { "epoch": 0.4132475194422097, "grad_norm": 0.34280354412948955, "learning_rate": 9.956713685382931e-06, "loss": 0.042, "step": 1541 }, { "epoch": 0.413515687851971, "grad_norm": 0.36298017184811465, "learning_rate": 9.956508592182465e-06, "loss": 0.0409, "step": 1542 }, { "epoch": 0.41378385626173236, "grad_norm": 0.7288312084774158, "learning_rate": 9.956303016379916e-06, "loss": 0.0566, "step": 1543 }, { "epoch": 0.4140520246714937, "grad_norm": 0.3622871056181386, "learning_rate": 9.956096957995302e-06, "loss": 0.0291, "step": 1544 }, { "epoch": 0.414320193081255, "grad_norm": 0.5026766141301878, "learning_rate": 9.955890417048686e-06, "loss": 0.0568, "step": 1545 }, { "epoch": 0.41458836149101636, "grad_norm": 0.5093066010614714, "learning_rate": 9.955683393560176e-06, "loss": 0.0657, "step": 1546 }, { "epoch": 0.4148565299007777, "grad_norm": 0.5802828379215412, "learning_rate": 9.955475887549933e-06, "loss": 0.0674, "step": 1547 }, { "epoch": 0.415124698310539, "grad_norm": 0.5230095098745715, "learning_rate": 9.955267899038157e-06, "loss": 0.0415, "step": 1548 }, { "epoch": 0.41539286672030035, "grad_norm": 0.517732704419722, "learning_rate": 9.955059428045102e-06, "loss": 0.0545, "step": 1549 }, { "epoch": 0.4156610351300617, "grad_norm": 0.5387615935592449, "learning_rate": 9.954850474591066e-06, "loss": 0.0471, "step": 1550 }, { "epoch": 0.415929203539823, "grad_norm": 0.3883734640432954, "learning_rate": 9.954641038696395e-06, "loss": 0.0452, "step": 1551 }, { "epoch": 0.41619737194958434, "grad_norm": 0.4776342629723951, "learning_rate": 9.954431120381482e-06, "loss": 0.0465, "step": 1552 }, { "epoch": 0.4164655403593457, "grad_norm": 0.39763965851013067, "learning_rate": 9.954220719666761e-06, "loss": 0.0385, "step": 1553 }, { "epoch": 0.416733708769107, "grad_norm": 0.3795763593740213, "learning_rate": 9.954009836572725e-06, "loss": 0.0373, "step": 1554 }, { "epoch": 0.41700187717886833, "grad_norm": 0.3210735210097809, "learning_rate": 9.953798471119902e-06, "loss": 0.038, "step": 1555 }, { "epoch": 0.4172700455886297, "grad_norm": 0.332998182005263, "learning_rate": 9.953586623328875e-06, "loss": 0.0291, "step": 1556 }, { "epoch": 0.417538213998391, "grad_norm": 0.5053798458218474, "learning_rate": 9.953374293220268e-06, "loss": 0.0673, "step": 1557 }, { "epoch": 0.4178063824081523, "grad_norm": 0.3886065554822884, "learning_rate": 9.95316148081476e-06, "loss": 0.0386, "step": 1558 }, { "epoch": 0.41807455081791367, "grad_norm": 0.6169379555091556, "learning_rate": 9.952948186133066e-06, "loss": 0.0533, "step": 1559 }, { "epoch": 0.41834271922767496, "grad_norm": 0.51392758421929, "learning_rate": 9.952734409195959e-06, "loss": 0.0504, "step": 1560 }, { "epoch": 0.4186108876374363, "grad_norm": 0.9874120763985208, "learning_rate": 9.952520150024251e-06, "loss": 0.0564, "step": 1561 }, { "epoch": 0.41887905604719766, "grad_norm": 0.457472097900679, "learning_rate": 9.952305408638806e-06, "loss": 0.0437, "step": 1562 }, { "epoch": 0.41914722445695896, "grad_norm": 0.39183715518657786, "learning_rate": 9.952090185060528e-06, "loss": 0.0379, "step": 1563 }, { "epoch": 0.4194153928667203, "grad_norm": 0.5822137581104712, "learning_rate": 9.951874479310379e-06, "loss": 0.0419, "step": 1564 }, { "epoch": 0.41968356127648165, "grad_norm": 0.7028137838978532, "learning_rate": 9.951658291409358e-06, "loss": 0.0492, "step": 1565 }, { "epoch": 0.41995172968624295, "grad_norm": 0.6141577153719346, "learning_rate": 9.951441621378516e-06, "loss": 0.0622, "step": 1566 }, { "epoch": 0.4202198980960043, "grad_norm": 0.546514140691345, "learning_rate": 9.951224469238949e-06, "loss": 0.0422, "step": 1567 }, { "epoch": 0.42048806650576565, "grad_norm": 0.4318719798859912, "learning_rate": 9.951006835011801e-06, "loss": 0.0328, "step": 1568 }, { "epoch": 0.42075623491552694, "grad_norm": 0.5127036067420944, "learning_rate": 9.950788718718262e-06, "loss": 0.0613, "step": 1569 }, { "epoch": 0.4210244033252883, "grad_norm": 0.5753542784949802, "learning_rate": 9.95057012037957e-06, "loss": 0.0585, "step": 1570 }, { "epoch": 0.42129257173504964, "grad_norm": 0.5743911053517858, "learning_rate": 9.950351040017007e-06, "loss": 0.0397, "step": 1571 }, { "epoch": 0.42156074014481093, "grad_norm": 0.3936589974756621, "learning_rate": 9.950131477651909e-06, "loss": 0.0377, "step": 1572 }, { "epoch": 0.4218289085545723, "grad_norm": 0.4549551754493138, "learning_rate": 9.94991143330565e-06, "loss": 0.0455, "step": 1573 }, { "epoch": 0.42209707696433363, "grad_norm": 0.4737514804531119, "learning_rate": 9.949690906999656e-06, "loss": 0.0546, "step": 1574 }, { "epoch": 0.4223652453740949, "grad_norm": 0.43066367347249324, "learning_rate": 9.9494698987554e-06, "loss": 0.0436, "step": 1575 }, { "epoch": 0.42263341378385627, "grad_norm": 0.5178512345061418, "learning_rate": 9.9492484085944e-06, "loss": 0.0434, "step": 1576 }, { "epoch": 0.42290158219361756, "grad_norm": 0.44720641002369177, "learning_rate": 9.949026436538223e-06, "loss": 0.0446, "step": 1577 }, { "epoch": 0.4231697506033789, "grad_norm": 0.44016599414656754, "learning_rate": 9.948803982608482e-06, "loss": 0.0335, "step": 1578 }, { "epoch": 0.42343791901314026, "grad_norm": 1.2207721154373405, "learning_rate": 9.948581046826835e-06, "loss": 0.0458, "step": 1579 }, { "epoch": 0.42370608742290156, "grad_norm": 0.4692588099437912, "learning_rate": 9.948357629214991e-06, "loss": 0.0356, "step": 1580 }, { "epoch": 0.4239742558326629, "grad_norm": 0.45585523027207786, "learning_rate": 9.9481337297947e-06, "loss": 0.055, "step": 1581 }, { "epoch": 0.42424242424242425, "grad_norm": 0.5481561511247544, "learning_rate": 9.947909348587766e-06, "loss": 0.0466, "step": 1582 }, { "epoch": 0.42451059265218555, "grad_norm": 0.3940284385551459, "learning_rate": 9.947684485616036e-06, "loss": 0.0459, "step": 1583 }, { "epoch": 0.4247787610619469, "grad_norm": 0.6957996115252733, "learning_rate": 9.947459140901402e-06, "loss": 0.0603, "step": 1584 }, { "epoch": 0.42504692947170825, "grad_norm": 0.36842237803172034, "learning_rate": 9.947233314465807e-06, "loss": 0.0417, "step": 1585 }, { "epoch": 0.42531509788146954, "grad_norm": 0.9956851587603684, "learning_rate": 9.94700700633124e-06, "loss": 0.0605, "step": 1586 }, { "epoch": 0.4255832662912309, "grad_norm": 0.4588119715929885, "learning_rate": 9.946780216519734e-06, "loss": 0.0577, "step": 1587 }, { "epoch": 0.42585143470099224, "grad_norm": 0.8052413969500355, "learning_rate": 9.946552945053371e-06, "loss": 0.074, "step": 1588 }, { "epoch": 0.42611960311075353, "grad_norm": 0.41446893683361724, "learning_rate": 9.946325191954283e-06, "loss": 0.0423, "step": 1589 }, { "epoch": 0.4263877715205149, "grad_norm": 0.6248109615112485, "learning_rate": 9.946096957244641e-06, "loss": 0.0503, "step": 1590 }, { "epoch": 0.42665593993027623, "grad_norm": 0.3754820451072399, "learning_rate": 9.94586824094667e-06, "loss": 0.0354, "step": 1591 }, { "epoch": 0.4269241083400375, "grad_norm": 0.5458444562760195, "learning_rate": 9.94563904308264e-06, "loss": 0.0422, "step": 1592 }, { "epoch": 0.42719227674979887, "grad_norm": 0.5745767930221711, "learning_rate": 9.945409363674867e-06, "loss": 0.0508, "step": 1593 }, { "epoch": 0.4274604451595602, "grad_norm": 0.4711404522052878, "learning_rate": 9.945179202745713e-06, "loss": 0.044, "step": 1594 }, { "epoch": 0.4277286135693215, "grad_norm": 0.6747971332877517, "learning_rate": 9.944948560317592e-06, "loss": 0.0412, "step": 1595 }, { "epoch": 0.42799678197908286, "grad_norm": 0.759244857477804, "learning_rate": 9.944717436412956e-06, "loss": 0.0508, "step": 1596 }, { "epoch": 0.4282649503888442, "grad_norm": 0.6738618283942054, "learning_rate": 9.944485831054311e-06, "loss": 0.0499, "step": 1597 }, { "epoch": 0.4285331187986055, "grad_norm": 0.4285162583446723, "learning_rate": 9.944253744264209e-06, "loss": 0.0374, "step": 1598 }, { "epoch": 0.42880128720836685, "grad_norm": 0.8442385233061325, "learning_rate": 9.944021176065247e-06, "loss": 0.0621, "step": 1599 }, { "epoch": 0.4290694556181282, "grad_norm": 0.42337405318536786, "learning_rate": 9.943788126480068e-06, "loss": 0.042, "step": 1600 }, { "epoch": 0.4293376240278895, "grad_norm": 0.4690496757975748, "learning_rate": 9.943554595531364e-06, "loss": 0.0389, "step": 1601 }, { "epoch": 0.42960579243765085, "grad_norm": 0.44661641944769725, "learning_rate": 9.943320583241876e-06, "loss": 0.037, "step": 1602 }, { "epoch": 0.4298739608474122, "grad_norm": 0.6738844918007444, "learning_rate": 9.943086089634387e-06, "loss": 0.0484, "step": 1603 }, { "epoch": 0.4301421292571735, "grad_norm": 0.5299063485742996, "learning_rate": 9.942851114731726e-06, "loss": 0.05, "step": 1604 }, { "epoch": 0.43041029766693484, "grad_norm": 0.6381442442925218, "learning_rate": 9.942615658556778e-06, "loss": 0.0485, "step": 1605 }, { "epoch": 0.4306784660766962, "grad_norm": 0.5640439949105651, "learning_rate": 9.942379721132464e-06, "loss": 0.0424, "step": 1606 }, { "epoch": 0.4309466344864575, "grad_norm": 0.8609338246697618, "learning_rate": 9.942143302481759e-06, "loss": 0.0514, "step": 1607 }, { "epoch": 0.43121480289621883, "grad_norm": 0.5061651565168902, "learning_rate": 9.941906402627684e-06, "loss": 0.0534, "step": 1608 }, { "epoch": 0.4314829713059802, "grad_norm": 0.415662373088618, "learning_rate": 9.9416690215933e-06, "loss": 0.0538, "step": 1609 }, { "epoch": 0.43175113971574147, "grad_norm": 0.6675060128771009, "learning_rate": 9.941431159401725e-06, "loss": 0.0453, "step": 1610 }, { "epoch": 0.4320193081255028, "grad_norm": 0.555454595793176, "learning_rate": 9.941192816076114e-06, "loss": 0.0719, "step": 1611 }, { "epoch": 0.43228747653526417, "grad_norm": 0.4434391905217857, "learning_rate": 9.94095399163968e-06, "loss": 0.0529, "step": 1612 }, { "epoch": 0.43255564494502546, "grad_norm": 0.4432154150312489, "learning_rate": 9.940714686115674e-06, "loss": 0.0479, "step": 1613 }, { "epoch": 0.4328238133547868, "grad_norm": 0.5696991475671687, "learning_rate": 9.940474899527397e-06, "loss": 0.0415, "step": 1614 }, { "epoch": 0.43309198176454816, "grad_norm": 0.5934137566000263, "learning_rate": 9.940234631898193e-06, "loss": 0.0444, "step": 1615 }, { "epoch": 0.43336015017430946, "grad_norm": 0.6787309663515398, "learning_rate": 9.939993883251462e-06, "loss": 0.0548, "step": 1616 }, { "epoch": 0.4336283185840708, "grad_norm": 0.5298444752480082, "learning_rate": 9.939752653610639e-06, "loss": 0.0413, "step": 1617 }, { "epoch": 0.43389648699383215, "grad_norm": 0.544173022269774, "learning_rate": 9.939510942999218e-06, "loss": 0.0529, "step": 1618 }, { "epoch": 0.43416465540359345, "grad_norm": 0.8399925652779808, "learning_rate": 9.939268751440728e-06, "loss": 0.053, "step": 1619 }, { "epoch": 0.4344328238133548, "grad_norm": 0.8372998831208353, "learning_rate": 9.939026078958755e-06, "loss": 0.0617, "step": 1620 }, { "epoch": 0.4347009922231161, "grad_norm": 0.5481772332624137, "learning_rate": 9.938782925576925e-06, "loss": 0.0489, "step": 1621 }, { "epoch": 0.43496916063287744, "grad_norm": 0.3957594716097898, "learning_rate": 9.938539291318913e-06, "loss": 0.0553, "step": 1622 }, { "epoch": 0.4352373290426388, "grad_norm": 0.5159704723397789, "learning_rate": 9.938295176208441e-06, "loss": 0.0525, "step": 1623 }, { "epoch": 0.4355054974524001, "grad_norm": 0.4112912373675489, "learning_rate": 9.93805058026928e-06, "loss": 0.0417, "step": 1624 }, { "epoch": 0.43577366586216143, "grad_norm": 0.7328767049079161, "learning_rate": 9.937805503525244e-06, "loss": 0.0394, "step": 1625 }, { "epoch": 0.4360418342719228, "grad_norm": 0.5292512846454595, "learning_rate": 9.937559946000196e-06, "loss": 0.0385, "step": 1626 }, { "epoch": 0.4363100026816841, "grad_norm": 0.6468614827634845, "learning_rate": 9.937313907718046e-06, "loss": 0.0485, "step": 1627 }, { "epoch": 0.4365781710914454, "grad_norm": 0.4505443998986794, "learning_rate": 9.937067388702748e-06, "loss": 0.061, "step": 1628 }, { "epoch": 0.43684633950120677, "grad_norm": 0.5597095803322025, "learning_rate": 9.936820388978306e-06, "loss": 0.043, "step": 1629 }, { "epoch": 0.43711450791096806, "grad_norm": 0.4728603467284444, "learning_rate": 9.936572908568768e-06, "loss": 0.0647, "step": 1630 }, { "epoch": 0.4373826763207294, "grad_norm": 0.4722589663682352, "learning_rate": 9.936324947498237e-06, "loss": 0.0386, "step": 1631 }, { "epoch": 0.43765084473049076, "grad_norm": 0.5969013106142882, "learning_rate": 9.936076505790848e-06, "loss": 0.0482, "step": 1632 }, { "epoch": 0.43791901314025206, "grad_norm": 0.6084121950394153, "learning_rate": 9.935827583470793e-06, "loss": 0.05, "step": 1633 }, { "epoch": 0.4381871815500134, "grad_norm": 0.7002806716948767, "learning_rate": 9.935578180562315e-06, "loss": 0.0389, "step": 1634 }, { "epoch": 0.43845534995977475, "grad_norm": 0.5758641398579936, "learning_rate": 9.93532829708969e-06, "loss": 0.0625, "step": 1635 }, { "epoch": 0.43872351836953605, "grad_norm": 0.44420550676278925, "learning_rate": 9.935077933077252e-06, "loss": 0.0355, "step": 1636 }, { "epoch": 0.4389916867792974, "grad_norm": 0.5035070446091369, "learning_rate": 9.93482708854938e-06, "loss": 0.0519, "step": 1637 }, { "epoch": 0.43925985518905875, "grad_norm": 0.7675657327100923, "learning_rate": 9.934575763530496e-06, "loss": 0.0627, "step": 1638 }, { "epoch": 0.43952802359882004, "grad_norm": 0.417443837502166, "learning_rate": 9.934323958045069e-06, "loss": 0.045, "step": 1639 }, { "epoch": 0.4397961920085814, "grad_norm": 0.4514709191998314, "learning_rate": 9.93407167211762e-06, "loss": 0.0402, "step": 1640 }, { "epoch": 0.44006436041834274, "grad_norm": 0.3617548510653869, "learning_rate": 9.933818905772713e-06, "loss": 0.0372, "step": 1641 }, { "epoch": 0.44033252882810403, "grad_norm": 0.5013704477166068, "learning_rate": 9.933565659034955e-06, "loss": 0.0592, "step": 1642 }, { "epoch": 0.4406006972378654, "grad_norm": 1.1095673450338919, "learning_rate": 9.933311931929008e-06, "loss": 0.0652, "step": 1643 }, { "epoch": 0.44086886564762673, "grad_norm": 0.44111484308396137, "learning_rate": 9.933057724479578e-06, "loss": 0.0456, "step": 1644 }, { "epoch": 0.441137034057388, "grad_norm": 0.6813778350113352, "learning_rate": 9.932803036711413e-06, "loss": 0.0533, "step": 1645 }, { "epoch": 0.44140520246714937, "grad_norm": 0.6524824535315231, "learning_rate": 9.932547868649313e-06, "loss": 0.071, "step": 1646 }, { "epoch": 0.4416733708769107, "grad_norm": 0.40175275304674, "learning_rate": 9.932292220318121e-06, "loss": 0.0481, "step": 1647 }, { "epoch": 0.441941539286672, "grad_norm": 0.4050965128307958, "learning_rate": 9.932036091742732e-06, "loss": 0.0392, "step": 1648 }, { "epoch": 0.44220970769643336, "grad_norm": 0.5467004541457118, "learning_rate": 9.931779482948083e-06, "loss": 0.0491, "step": 1649 }, { "epoch": 0.4424778761061947, "grad_norm": 0.44115926393665356, "learning_rate": 9.93152239395916e-06, "loss": 0.0383, "step": 1650 }, { "epoch": 0.442746044515956, "grad_norm": 0.4863668581054675, "learning_rate": 9.931264824800995e-06, "loss": 0.0452, "step": 1651 }, { "epoch": 0.44301421292571735, "grad_norm": 0.6932025326781432, "learning_rate": 9.931006775498664e-06, "loss": 0.0543, "step": 1652 }, { "epoch": 0.4432823813354787, "grad_norm": 0.629272932778374, "learning_rate": 9.930748246077296e-06, "loss": 0.0622, "step": 1653 }, { "epoch": 0.44355054974524, "grad_norm": 0.5376602941608818, "learning_rate": 9.930489236562063e-06, "loss": 0.0486, "step": 1654 }, { "epoch": 0.44381871815500135, "grad_norm": 0.46613971273142707, "learning_rate": 9.930229746978181e-06, "loss": 0.0504, "step": 1655 }, { "epoch": 0.4440868865647627, "grad_norm": 0.49500407773854394, "learning_rate": 9.92996977735092e-06, "loss": 0.052, "step": 1656 }, { "epoch": 0.444355054974524, "grad_norm": 0.6253522537145251, "learning_rate": 9.92970932770559e-06, "loss": 0.0706, "step": 1657 }, { "epoch": 0.44462322338428534, "grad_norm": 0.5504296330605859, "learning_rate": 9.92944839806755e-06, "loss": 0.0502, "step": 1658 }, { "epoch": 0.4448913917940467, "grad_norm": 0.4766863316440304, "learning_rate": 9.929186988462208e-06, "loss": 0.0368, "step": 1659 }, { "epoch": 0.445159560203808, "grad_norm": 0.5014563876664402, "learning_rate": 9.928925098915016e-06, "loss": 0.0375, "step": 1660 }, { "epoch": 0.44542772861356933, "grad_norm": 0.49760274952258365, "learning_rate": 9.928662729451472e-06, "loss": 0.041, "step": 1661 }, { "epoch": 0.4456958970233307, "grad_norm": 0.5118654949774181, "learning_rate": 9.928399880097124e-06, "loss": 0.0444, "step": 1662 }, { "epoch": 0.44596406543309197, "grad_norm": 0.7442468080500084, "learning_rate": 9.928136550877565e-06, "loss": 0.0526, "step": 1663 }, { "epoch": 0.4462322338428533, "grad_norm": 0.3719104078439192, "learning_rate": 9.927872741818432e-06, "loss": 0.0351, "step": 1664 }, { "epoch": 0.4465004022526146, "grad_norm": 0.39610326639199905, "learning_rate": 9.927608452945413e-06, "loss": 0.0425, "step": 1665 }, { "epoch": 0.44676857066237596, "grad_norm": 0.44652217705156433, "learning_rate": 9.927343684284245e-06, "loss": 0.0341, "step": 1666 }, { "epoch": 0.4470367390721373, "grad_norm": 0.42568090636904093, "learning_rate": 9.927078435860702e-06, "loss": 0.0352, "step": 1667 }, { "epoch": 0.4473049074818986, "grad_norm": 0.4157220147822246, "learning_rate": 9.926812707700614e-06, "loss": 0.0349, "step": 1668 }, { "epoch": 0.44757307589165995, "grad_norm": 0.6036301514601518, "learning_rate": 9.926546499829853e-06, "loss": 0.0417, "step": 1669 }, { "epoch": 0.4478412443014213, "grad_norm": 0.40777734814952366, "learning_rate": 9.926279812274338e-06, "loss": 0.0468, "step": 1670 }, { "epoch": 0.4481094127111826, "grad_norm": 0.4870265793774108, "learning_rate": 9.926012645060037e-06, "loss": 0.0462, "step": 1671 }, { "epoch": 0.44837758112094395, "grad_norm": 0.4426130121868837, "learning_rate": 9.925744998212962e-06, "loss": 0.0386, "step": 1672 }, { "epoch": 0.4486457495307053, "grad_norm": 0.48820723774934593, "learning_rate": 9.925476871759177e-06, "loss": 0.0551, "step": 1673 }, { "epoch": 0.4489139179404666, "grad_norm": 0.5068056004251991, "learning_rate": 9.925208265724782e-06, "loss": 0.0474, "step": 1674 }, { "epoch": 0.44918208635022794, "grad_norm": 0.46199564110468133, "learning_rate": 9.924939180135938e-06, "loss": 0.0339, "step": 1675 }, { "epoch": 0.4494502547599893, "grad_norm": 0.6675978079712932, "learning_rate": 9.92466961501884e-06, "loss": 0.0448, "step": 1676 }, { "epoch": 0.4497184231697506, "grad_norm": 0.3429677272426827, "learning_rate": 9.924399570399737e-06, "loss": 0.0273, "step": 1677 }, { "epoch": 0.44998659157951193, "grad_norm": 0.4598651121029453, "learning_rate": 9.92412904630492e-06, "loss": 0.0404, "step": 1678 }, { "epoch": 0.4502547599892733, "grad_norm": 0.6511549196432105, "learning_rate": 9.923858042760732e-06, "loss": 0.0507, "step": 1679 }, { "epoch": 0.4505229283990346, "grad_norm": 0.4503890355111746, "learning_rate": 9.923586559793559e-06, "loss": 0.0368, "step": 1680 }, { "epoch": 0.4507910968087959, "grad_norm": 0.4385224459183911, "learning_rate": 9.923314597429834e-06, "loss": 0.0411, "step": 1681 }, { "epoch": 0.45105926521855727, "grad_norm": 0.5908317341630881, "learning_rate": 9.92304215569604e-06, "loss": 0.0487, "step": 1682 }, { "epoch": 0.45132743362831856, "grad_norm": 0.4569852608040678, "learning_rate": 9.9227692346187e-06, "loss": 0.0428, "step": 1683 }, { "epoch": 0.4515956020380799, "grad_norm": 0.4224631141995175, "learning_rate": 9.922495834224391e-06, "loss": 0.0368, "step": 1684 }, { "epoch": 0.45186377044784126, "grad_norm": 0.4727641560899038, "learning_rate": 9.922221954539728e-06, "loss": 0.0378, "step": 1685 }, { "epoch": 0.45213193885760256, "grad_norm": 0.3404200421747889, "learning_rate": 9.921947595591385e-06, "loss": 0.0368, "step": 1686 }, { "epoch": 0.4524001072673639, "grad_norm": 0.4576762270549373, "learning_rate": 9.92167275740607e-06, "loss": 0.0344, "step": 1687 }, { "epoch": 0.45266827567712525, "grad_norm": 0.5073019169766373, "learning_rate": 9.921397440010548e-06, "loss": 0.045, "step": 1688 }, { "epoch": 0.45293644408688655, "grad_norm": 0.6825397825776557, "learning_rate": 9.92112164343162e-06, "loss": 0.0527, "step": 1689 }, { "epoch": 0.4532046124966479, "grad_norm": 0.6321986630393275, "learning_rate": 9.920845367696145e-06, "loss": 0.0512, "step": 1690 }, { "epoch": 0.45347278090640925, "grad_norm": 0.43329048651791213, "learning_rate": 9.920568612831021e-06, "loss": 0.0489, "step": 1691 }, { "epoch": 0.45374094931617054, "grad_norm": 0.3764795309768748, "learning_rate": 9.920291378863193e-06, "loss": 0.0422, "step": 1692 }, { "epoch": 0.4540091177259319, "grad_norm": 0.5019906052233006, "learning_rate": 9.92001366581966e-06, "loss": 0.0385, "step": 1693 }, { "epoch": 0.45427728613569324, "grad_norm": 1.4052075032240339, "learning_rate": 9.919735473727457e-06, "loss": 0.0491, "step": 1694 }, { "epoch": 0.45454545454545453, "grad_norm": 0.4529497522787487, "learning_rate": 9.919456802613672e-06, "loss": 0.0521, "step": 1695 }, { "epoch": 0.4548136229552159, "grad_norm": 0.6191353712941574, "learning_rate": 9.91917765250544e-06, "loss": 0.051, "step": 1696 }, { "epoch": 0.45508179136497723, "grad_norm": 0.3268028784558863, "learning_rate": 9.91889802342994e-06, "loss": 0.0313, "step": 1697 }, { "epoch": 0.4553499597747385, "grad_norm": 0.4291915712611819, "learning_rate": 9.918617915414397e-06, "loss": 0.0418, "step": 1698 }, { "epoch": 0.45561812818449987, "grad_norm": 0.6413031865675974, "learning_rate": 9.918337328486088e-06, "loss": 0.0703, "step": 1699 }, { "epoch": 0.4558862965942612, "grad_norm": 0.5361991268706133, "learning_rate": 9.91805626267233e-06, "loss": 0.0595, "step": 1700 }, { "epoch": 0.4561544650040225, "grad_norm": 0.45859914276293234, "learning_rate": 9.917774718000494e-06, "loss": 0.0363, "step": 1701 }, { "epoch": 0.45642263341378386, "grad_norm": 0.40210384626960993, "learning_rate": 9.917492694497988e-06, "loss": 0.0366, "step": 1702 }, { "epoch": 0.4566908018235452, "grad_norm": 0.3537240608016449, "learning_rate": 9.917210192192273e-06, "loss": 0.0334, "step": 1703 }, { "epoch": 0.4569589702333065, "grad_norm": 0.6405893452238196, "learning_rate": 9.916927211110858e-06, "loss": 0.0643, "step": 1704 }, { "epoch": 0.45722713864306785, "grad_norm": 0.591703100228027, "learning_rate": 9.916643751281294e-06, "loss": 0.0439, "step": 1705 }, { "epoch": 0.4574953070528292, "grad_norm": 0.5620820011368417, "learning_rate": 9.916359812731183e-06, "loss": 0.0408, "step": 1706 }, { "epoch": 0.4577634754625905, "grad_norm": 0.6006057309355749, "learning_rate": 9.916075395488167e-06, "loss": 0.0484, "step": 1707 }, { "epoch": 0.45803164387235185, "grad_norm": 0.5364452630668544, "learning_rate": 9.915790499579944e-06, "loss": 0.0477, "step": 1708 }, { "epoch": 0.4582998122821132, "grad_norm": 1.2791693088141756, "learning_rate": 9.91550512503425e-06, "loss": 0.0504, "step": 1709 }, { "epoch": 0.4585679806918745, "grad_norm": 0.5055088373475854, "learning_rate": 9.915219271878873e-06, "loss": 0.0491, "step": 1710 }, { "epoch": 0.45883614910163584, "grad_norm": 0.7445202228130572, "learning_rate": 9.914932940141644e-06, "loss": 0.0515, "step": 1711 }, { "epoch": 0.45910431751139713, "grad_norm": 0.39566929014430885, "learning_rate": 9.914646129850445e-06, "loss": 0.042, "step": 1712 }, { "epoch": 0.4593724859211585, "grad_norm": 0.3038402514697219, "learning_rate": 9.9143588410332e-06, "loss": 0.0258, "step": 1713 }, { "epoch": 0.45964065433091983, "grad_norm": 0.5075452566281609, "learning_rate": 9.914071073717882e-06, "loss": 0.0496, "step": 1714 }, { "epoch": 0.4599088227406811, "grad_norm": 0.8474723888530709, "learning_rate": 9.913782827932509e-06, "loss": 0.0457, "step": 1715 }, { "epoch": 0.46017699115044247, "grad_norm": 0.42358759439897153, "learning_rate": 9.91349410370515e-06, "loss": 0.0398, "step": 1716 }, { "epoch": 0.4604451595602038, "grad_norm": 0.4176654890436293, "learning_rate": 9.913204901063915e-06, "loss": 0.0488, "step": 1717 }, { "epoch": 0.4607133279699651, "grad_norm": 0.34955004699368303, "learning_rate": 9.912915220036961e-06, "loss": 0.0415, "step": 1718 }, { "epoch": 0.46098149637972646, "grad_norm": 0.4963476647141926, "learning_rate": 9.912625060652496e-06, "loss": 0.0667, "step": 1719 }, { "epoch": 0.4612496647894878, "grad_norm": 0.6313162827924645, "learning_rate": 9.912334422938773e-06, "loss": 0.0465, "step": 1720 }, { "epoch": 0.4615178331992491, "grad_norm": 0.5378640728108217, "learning_rate": 9.912043306924088e-06, "loss": 0.0635, "step": 1721 }, { "epoch": 0.46178600160901045, "grad_norm": 0.35823355752758573, "learning_rate": 9.911751712636789e-06, "loss": 0.0355, "step": 1722 }, { "epoch": 0.4620541700187718, "grad_norm": 0.9441209544404643, "learning_rate": 9.911459640105266e-06, "loss": 0.0554, "step": 1723 }, { "epoch": 0.4623223384285331, "grad_norm": 0.49998182992054657, "learning_rate": 9.911167089357957e-06, "loss": 0.0484, "step": 1724 }, { "epoch": 0.46259050683829445, "grad_norm": 0.46891903368928706, "learning_rate": 9.910874060423345e-06, "loss": 0.0441, "step": 1725 }, { "epoch": 0.4628586752480558, "grad_norm": 0.49538066604237285, "learning_rate": 9.910580553329966e-06, "loss": 0.0354, "step": 1726 }, { "epoch": 0.4631268436578171, "grad_norm": 0.6188115327983452, "learning_rate": 9.910286568106398e-06, "loss": 0.0744, "step": 1727 }, { "epoch": 0.46339501206757844, "grad_norm": 0.6378434879759454, "learning_rate": 9.90999210478126e-06, "loss": 0.0448, "step": 1728 }, { "epoch": 0.4636631804773398, "grad_norm": 0.3240291798483186, "learning_rate": 9.909697163383228e-06, "loss": 0.0379, "step": 1729 }, { "epoch": 0.4639313488871011, "grad_norm": 0.4050012435184163, "learning_rate": 9.909401743941018e-06, "loss": 0.0399, "step": 1730 }, { "epoch": 0.46419951729686243, "grad_norm": 0.5529102795659663, "learning_rate": 9.909105846483394e-06, "loss": 0.045, "step": 1731 }, { "epoch": 0.4644676857066238, "grad_norm": 0.48061242299094764, "learning_rate": 9.908809471039168e-06, "loss": 0.0313, "step": 1732 }, { "epoch": 0.46473585411638507, "grad_norm": 0.35225216044219293, "learning_rate": 9.908512617637195e-06, "loss": 0.0337, "step": 1733 }, { "epoch": 0.4650040225261464, "grad_norm": 0.33861235300653747, "learning_rate": 9.908215286306381e-06, "loss": 0.0372, "step": 1734 }, { "epoch": 0.46527219093590777, "grad_norm": 0.7720102132987724, "learning_rate": 9.907917477075677e-06, "loss": 0.0591, "step": 1735 }, { "epoch": 0.46554035934566906, "grad_norm": 0.6628331823992449, "learning_rate": 9.907619189974078e-06, "loss": 0.056, "step": 1736 }, { "epoch": 0.4658085277554304, "grad_norm": 0.4207837347947557, "learning_rate": 9.907320425030627e-06, "loss": 0.0334, "step": 1737 }, { "epoch": 0.46607669616519176, "grad_norm": 0.3858327041238357, "learning_rate": 9.907021182274415e-06, "loss": 0.0425, "step": 1738 }, { "epoch": 0.46634486457495306, "grad_norm": 0.6451169304631398, "learning_rate": 9.906721461734579e-06, "loss": 0.0741, "step": 1739 }, { "epoch": 0.4666130329847144, "grad_norm": 1.058262038340549, "learning_rate": 9.9064212634403e-06, "loss": 0.0477, "step": 1740 }, { "epoch": 0.46688120139447575, "grad_norm": 0.72372869651805, "learning_rate": 9.906120587420811e-06, "loss": 0.0474, "step": 1741 }, { "epoch": 0.46714936980423705, "grad_norm": 0.39987774915037994, "learning_rate": 9.905819433705383e-06, "loss": 0.0329, "step": 1742 }, { "epoch": 0.4674175382139984, "grad_norm": 0.6041263065761336, "learning_rate": 9.905517802323345e-06, "loss": 0.0593, "step": 1743 }, { "epoch": 0.46768570662375974, "grad_norm": 0.6718618450696869, "learning_rate": 9.90521569330406e-06, "loss": 0.0611, "step": 1744 }, { "epoch": 0.46795387503352104, "grad_norm": 0.5251679506324479, "learning_rate": 9.904913106676948e-06, "loss": 0.0389, "step": 1745 }, { "epoch": 0.4682220434432824, "grad_norm": 0.4968956157918924, "learning_rate": 9.904610042471468e-06, "loss": 0.0379, "step": 1746 }, { "epoch": 0.46849021185304374, "grad_norm": 0.4786245883249044, "learning_rate": 9.904306500717129e-06, "loss": 0.0606, "step": 1747 }, { "epoch": 0.46875838026280503, "grad_norm": 0.33940635055566865, "learning_rate": 9.904002481443487e-06, "loss": 0.0289, "step": 1748 }, { "epoch": 0.4690265486725664, "grad_norm": 0.5693703898372524, "learning_rate": 9.903697984680143e-06, "loss": 0.0434, "step": 1749 }, { "epoch": 0.46929471708232773, "grad_norm": 0.8198444706976998, "learning_rate": 9.903393010456745e-06, "loss": 0.0429, "step": 1750 }, { "epoch": 0.469562885492089, "grad_norm": 0.6508211644006137, "learning_rate": 9.90308755880299e-06, "loss": 0.0654, "step": 1751 }, { "epoch": 0.46983105390185037, "grad_norm": 0.6950013915623617, "learning_rate": 9.902781629748614e-06, "loss": 0.0502, "step": 1752 }, { "epoch": 0.4700992223116117, "grad_norm": 0.4375822632606668, "learning_rate": 9.902475223323408e-06, "loss": 0.0378, "step": 1753 }, { "epoch": 0.470367390721373, "grad_norm": 0.5975537899050268, "learning_rate": 9.902168339557208e-06, "loss": 0.0352, "step": 1754 }, { "epoch": 0.47063555913113436, "grad_norm": 0.6701800057022425, "learning_rate": 9.901860978479889e-06, "loss": 0.0491, "step": 1755 }, { "epoch": 0.47090372754089566, "grad_norm": 0.6375782576353912, "learning_rate": 9.901553140121382e-06, "loss": 0.0466, "step": 1756 }, { "epoch": 0.471171895950657, "grad_norm": 0.7874985228200917, "learning_rate": 9.90124482451166e-06, "loss": 0.0387, "step": 1757 }, { "epoch": 0.47144006436041835, "grad_norm": 0.5944394117621088, "learning_rate": 9.900936031680739e-06, "loss": 0.0492, "step": 1758 }, { "epoch": 0.47170823277017965, "grad_norm": 0.517331602287784, "learning_rate": 9.900626761658691e-06, "loss": 0.0391, "step": 1759 }, { "epoch": 0.471976401179941, "grad_norm": 0.48643856120626966, "learning_rate": 9.900317014475624e-06, "loss": 0.0432, "step": 1760 }, { "epoch": 0.47224456958970235, "grad_norm": 0.4547546331936923, "learning_rate": 9.900006790161701e-06, "loss": 0.0434, "step": 1761 }, { "epoch": 0.47251273799946364, "grad_norm": 0.5777305007089715, "learning_rate": 9.899696088747127e-06, "loss": 0.0532, "step": 1762 }, { "epoch": 0.472780906409225, "grad_norm": 0.4327120754997299, "learning_rate": 9.899384910262151e-06, "loss": 0.0457, "step": 1763 }, { "epoch": 0.47304907481898634, "grad_norm": 0.5010195261192465, "learning_rate": 9.899073254737076e-06, "loss": 0.0445, "step": 1764 }, { "epoch": 0.47331724322874763, "grad_norm": 0.9244833450364123, "learning_rate": 9.898761122202242e-06, "loss": 0.0436, "step": 1765 }, { "epoch": 0.473585411638509, "grad_norm": 0.6626612406826882, "learning_rate": 9.898448512688046e-06, "loss": 0.0615, "step": 1766 }, { "epoch": 0.47385358004827033, "grad_norm": 0.32932030147486296, "learning_rate": 9.898135426224923e-06, "loss": 0.0296, "step": 1767 }, { "epoch": 0.4741217484580316, "grad_norm": 0.48407560467866817, "learning_rate": 9.897821862843357e-06, "loss": 0.0539, "step": 1768 }, { "epoch": 0.47438991686779297, "grad_norm": 0.54509267539949, "learning_rate": 9.89750782257388e-06, "loss": 0.0461, "step": 1769 }, { "epoch": 0.4746580852775543, "grad_norm": 0.596702990208063, "learning_rate": 9.89719330544707e-06, "loss": 0.0402, "step": 1770 }, { "epoch": 0.4749262536873156, "grad_norm": 0.3935959320536733, "learning_rate": 9.896878311493549e-06, "loss": 0.0387, "step": 1771 }, { "epoch": 0.47519442209707696, "grad_norm": 0.44863894068749766, "learning_rate": 9.896562840743989e-06, "loss": 0.0506, "step": 1772 }, { "epoch": 0.4754625905068383, "grad_norm": 0.6138091048140234, "learning_rate": 9.896246893229102e-06, "loss": 0.07, "step": 1773 }, { "epoch": 0.4757307589165996, "grad_norm": 0.3417915825115119, "learning_rate": 9.895930468979658e-06, "loss": 0.0299, "step": 1774 }, { "epoch": 0.47599892732636095, "grad_norm": 0.43282816637646726, "learning_rate": 9.89561356802646e-06, "loss": 0.0368, "step": 1775 }, { "epoch": 0.4762670957361223, "grad_norm": 0.45485755304282816, "learning_rate": 9.895296190400368e-06, "loss": 0.0378, "step": 1776 }, { "epoch": 0.4765352641458836, "grad_norm": 0.4769666416615768, "learning_rate": 9.894978336132281e-06, "loss": 0.0399, "step": 1777 }, { "epoch": 0.47680343255564495, "grad_norm": 0.5523558325377964, "learning_rate": 9.894660005253152e-06, "loss": 0.052, "step": 1778 }, { "epoch": 0.4770716009654063, "grad_norm": 0.6889942355286874, "learning_rate": 9.89434119779397e-06, "loss": 0.046, "step": 1779 }, { "epoch": 0.4773397693751676, "grad_norm": 0.3732427447516459, "learning_rate": 9.894021913785782e-06, "loss": 0.0313, "step": 1780 }, { "epoch": 0.47760793778492894, "grad_norm": 0.5736278985682072, "learning_rate": 9.893702153259674e-06, "loss": 0.0395, "step": 1781 }, { "epoch": 0.4778761061946903, "grad_norm": 0.4799894240219493, "learning_rate": 9.893381916246778e-06, "loss": 0.0405, "step": 1782 }, { "epoch": 0.4781442746044516, "grad_norm": 0.6258538662251879, "learning_rate": 9.893061202778278e-06, "loss": 0.0656, "step": 1783 }, { "epoch": 0.47841244301421293, "grad_norm": 0.5109226239200082, "learning_rate": 9.8927400128854e-06, "loss": 0.0441, "step": 1784 }, { "epoch": 0.4786806114239743, "grad_norm": 0.492922324867543, "learning_rate": 9.892418346599415e-06, "loss": 0.0364, "step": 1785 }, { "epoch": 0.47894877983373557, "grad_norm": 0.40374312419019354, "learning_rate": 9.892096203951646e-06, "loss": 0.0314, "step": 1786 }, { "epoch": 0.4792169482434969, "grad_norm": 0.4159149521602461, "learning_rate": 9.891773584973457e-06, "loss": 0.0311, "step": 1787 }, { "epoch": 0.47948511665325827, "grad_norm": 0.4636132528323613, "learning_rate": 9.891450489696262e-06, "loss": 0.0376, "step": 1788 }, { "epoch": 0.47975328506301956, "grad_norm": 0.6025106729415012, "learning_rate": 9.89112691815152e-06, "loss": 0.0374, "step": 1789 }, { "epoch": 0.4800214534727809, "grad_norm": 0.4393233777855714, "learning_rate": 9.890802870370735e-06, "loss": 0.0448, "step": 1790 }, { "epoch": 0.48028962188254226, "grad_norm": 0.5201045352328328, "learning_rate": 9.89047834638546e-06, "loss": 0.0532, "step": 1791 }, { "epoch": 0.48055779029230355, "grad_norm": 0.5664281761949992, "learning_rate": 9.890153346227291e-06, "loss": 0.0479, "step": 1792 }, { "epoch": 0.4808259587020649, "grad_norm": 0.36924989986991813, "learning_rate": 9.889827869927875e-06, "loss": 0.0383, "step": 1793 }, { "epoch": 0.48109412711182625, "grad_norm": 0.3925287220558661, "learning_rate": 9.889501917518903e-06, "loss": 0.038, "step": 1794 }, { "epoch": 0.48136229552158755, "grad_norm": 0.4753518632399133, "learning_rate": 9.889175489032107e-06, "loss": 0.0447, "step": 1795 }, { "epoch": 0.4816304639313489, "grad_norm": 0.8029924703751461, "learning_rate": 9.888848584499275e-06, "loss": 0.0505, "step": 1796 }, { "epoch": 0.48189863234111024, "grad_norm": 0.4922593191622795, "learning_rate": 9.888521203952238e-06, "loss": 0.0435, "step": 1797 }, { "epoch": 0.48216680075087154, "grad_norm": 0.39693139987327125, "learning_rate": 9.88819334742287e-06, "loss": 0.0341, "step": 1798 }, { "epoch": 0.4824349691606329, "grad_norm": 0.46920216394789727, "learning_rate": 9.887865014943093e-06, "loss": 0.0515, "step": 1799 }, { "epoch": 0.4827031375703942, "grad_norm": 1.2314364810929457, "learning_rate": 9.887536206544876e-06, "loss": 0.0496, "step": 1800 }, { "epoch": 0.48297130598015553, "grad_norm": 0.47841482873956864, "learning_rate": 9.887206922260234e-06, "loss": 0.0635, "step": 1801 }, { "epoch": 0.4832394743899169, "grad_norm": 0.3660634008024165, "learning_rate": 9.886877162121231e-06, "loss": 0.0337, "step": 1802 }, { "epoch": 0.4835076427996782, "grad_norm": 0.6591664315646486, "learning_rate": 9.886546926159972e-06, "loss": 0.0659, "step": 1803 }, { "epoch": 0.4837758112094395, "grad_norm": 0.5540911662550497, "learning_rate": 9.886216214408612e-06, "loss": 0.0466, "step": 1804 }, { "epoch": 0.48404397961920087, "grad_norm": 0.49402038189082687, "learning_rate": 9.885885026899353e-06, "loss": 0.0588, "step": 1805 }, { "epoch": 0.48431214802896216, "grad_norm": 0.6205308999880106, "learning_rate": 9.88555336366444e-06, "loss": 0.0556, "step": 1806 }, { "epoch": 0.4845803164387235, "grad_norm": 0.39004279198349906, "learning_rate": 9.885221224736168e-06, "loss": 0.0413, "step": 1807 }, { "epoch": 0.48484848484848486, "grad_norm": 0.5759200245907066, "learning_rate": 9.884888610146875e-06, "loss": 0.0482, "step": 1808 }, { "epoch": 0.48511665325824616, "grad_norm": 0.3489770836013558, "learning_rate": 9.884555519928946e-06, "loss": 0.0399, "step": 1809 }, { "epoch": 0.4853848216680075, "grad_norm": 0.3878163578482397, "learning_rate": 9.884221954114814e-06, "loss": 0.0386, "step": 1810 }, { "epoch": 0.48565299007776885, "grad_norm": 0.7810601410169329, "learning_rate": 9.883887912736959e-06, "loss": 0.0428, "step": 1811 }, { "epoch": 0.48592115848753015, "grad_norm": 0.5659781609761343, "learning_rate": 9.883553395827905e-06, "loss": 0.0363, "step": 1812 }, { "epoch": 0.4861893268972915, "grad_norm": 0.44707113790463116, "learning_rate": 9.88321840342022e-06, "loss": 0.0417, "step": 1813 }, { "epoch": 0.48645749530705285, "grad_norm": 0.822240914556162, "learning_rate": 9.882882935546525e-06, "loss": 0.0379, "step": 1814 }, { "epoch": 0.48672566371681414, "grad_norm": 0.4147258047315414, "learning_rate": 9.882546992239483e-06, "loss": 0.0399, "step": 1815 }, { "epoch": 0.4869938321265755, "grad_norm": 0.4615974285102183, "learning_rate": 9.882210573531803e-06, "loss": 0.0423, "step": 1816 }, { "epoch": 0.48726200053633684, "grad_norm": 0.5103617488938941, "learning_rate": 9.881873679456243e-06, "loss": 0.0392, "step": 1817 }, { "epoch": 0.48753016894609813, "grad_norm": 0.3251048453102637, "learning_rate": 9.8815363100456e-06, "loss": 0.0311, "step": 1818 }, { "epoch": 0.4877983373558595, "grad_norm": 0.5939575424655739, "learning_rate": 9.88119846533273e-06, "loss": 0.0624, "step": 1819 }, { "epoch": 0.48806650576562083, "grad_norm": 0.48243322721865456, "learning_rate": 9.880860145350525e-06, "loss": 0.0485, "step": 1820 }, { "epoch": 0.4883346741753821, "grad_norm": 0.5092356182948604, "learning_rate": 9.880521350131925e-06, "loss": 0.0406, "step": 1821 }, { "epoch": 0.48860284258514347, "grad_norm": 0.48925062079129156, "learning_rate": 9.880182079709917e-06, "loss": 0.0384, "step": 1822 }, { "epoch": 0.4888710109949048, "grad_norm": 0.5668046082032859, "learning_rate": 9.879842334117538e-06, "loss": 0.0424, "step": 1823 }, { "epoch": 0.4891391794046661, "grad_norm": 0.439398472948795, "learning_rate": 9.879502113387867e-06, "loss": 0.0493, "step": 1824 }, { "epoch": 0.48940734781442746, "grad_norm": 0.36057799895253606, "learning_rate": 9.879161417554031e-06, "loss": 0.0302, "step": 1825 }, { "epoch": 0.4896755162241888, "grad_norm": 0.6085256437879389, "learning_rate": 9.878820246649198e-06, "loss": 0.0392, "step": 1826 }, { "epoch": 0.4899436846339501, "grad_norm": 0.4552925694922547, "learning_rate": 9.878478600706595e-06, "loss": 0.0496, "step": 1827 }, { "epoch": 0.49021185304371145, "grad_norm": 0.34872506471865444, "learning_rate": 9.87813647975948e-06, "loss": 0.0364, "step": 1828 }, { "epoch": 0.4904800214534728, "grad_norm": 0.4826648926075266, "learning_rate": 9.877793883841169e-06, "loss": 0.0437, "step": 1829 }, { "epoch": 0.4907481898632341, "grad_norm": 0.42159502591897136, "learning_rate": 9.877450812985016e-06, "loss": 0.0388, "step": 1830 }, { "epoch": 0.49101635827299545, "grad_norm": 0.391279464763829, "learning_rate": 9.877107267224429e-06, "loss": 0.0385, "step": 1831 }, { "epoch": 0.4912845266827568, "grad_norm": 0.6040449871499166, "learning_rate": 9.876763246592854e-06, "loss": 0.05, "step": 1832 }, { "epoch": 0.4915526950925181, "grad_norm": 0.5017377787875782, "learning_rate": 9.876418751123788e-06, "loss": 0.0396, "step": 1833 }, { "epoch": 0.49182086350227944, "grad_norm": 0.4467692897043541, "learning_rate": 9.876073780850776e-06, "loss": 0.036, "step": 1834 }, { "epoch": 0.4920890319120408, "grad_norm": 0.5069890223066578, "learning_rate": 9.875728335807407e-06, "loss": 0.0301, "step": 1835 }, { "epoch": 0.4923572003218021, "grad_norm": 0.5293469628442357, "learning_rate": 9.875382416027314e-06, "loss": 0.0345, "step": 1836 }, { "epoch": 0.49262536873156343, "grad_norm": 0.4030484546203759, "learning_rate": 9.87503602154418e-06, "loss": 0.0427, "step": 1837 }, { "epoch": 0.4928935371413248, "grad_norm": 0.38772730450355747, "learning_rate": 9.874689152391729e-06, "loss": 0.0374, "step": 1838 }, { "epoch": 0.49316170555108607, "grad_norm": 0.5485050135750426, "learning_rate": 9.87434180860374e-06, "loss": 0.0427, "step": 1839 }, { "epoch": 0.4934298739608474, "grad_norm": 0.895425894542109, "learning_rate": 9.873993990214028e-06, "loss": 0.0552, "step": 1840 }, { "epoch": 0.49369804237060877, "grad_norm": 0.4356913690541344, "learning_rate": 9.873645697256463e-06, "loss": 0.0489, "step": 1841 }, { "epoch": 0.49396621078037006, "grad_norm": 0.6255653046576253, "learning_rate": 9.873296929764956e-06, "loss": 0.0561, "step": 1842 }, { "epoch": 0.4942343791901314, "grad_norm": 0.41068129417844157, "learning_rate": 9.872947687773464e-06, "loss": 0.0514, "step": 1843 }, { "epoch": 0.4945025475998927, "grad_norm": 0.4759611275145552, "learning_rate": 9.872597971315994e-06, "loss": 0.0395, "step": 1844 }, { "epoch": 0.49477071600965405, "grad_norm": 1.8774257778098693, "learning_rate": 9.872247780426595e-06, "loss": 0.0504, "step": 1845 }, { "epoch": 0.4950388844194154, "grad_norm": 0.4792237876346564, "learning_rate": 9.871897115139367e-06, "loss": 0.0345, "step": 1846 }, { "epoch": 0.4953070528291767, "grad_norm": 0.5397751928617246, "learning_rate": 9.871545975488448e-06, "loss": 0.0406, "step": 1847 }, { "epoch": 0.49557522123893805, "grad_norm": 0.5254123324079035, "learning_rate": 9.871194361508034e-06, "loss": 0.0494, "step": 1848 }, { "epoch": 0.4958433896486994, "grad_norm": 0.5963120818612687, "learning_rate": 9.870842273232358e-06, "loss": 0.049, "step": 1849 }, { "epoch": 0.4961115580584607, "grad_norm": 0.41331531584818376, "learning_rate": 9.8704897106957e-06, "loss": 0.042, "step": 1850 }, { "epoch": 0.49637972646822204, "grad_norm": 0.42699918316620317, "learning_rate": 9.87013667393239e-06, "loss": 0.0261, "step": 1851 }, { "epoch": 0.4966478948779834, "grad_norm": 0.469610714399729, "learning_rate": 9.869783162976803e-06, "loss": 0.0445, "step": 1852 }, { "epoch": 0.4969160632877447, "grad_norm": 0.8678444719875582, "learning_rate": 9.869429177863358e-06, "loss": 0.0674, "step": 1853 }, { "epoch": 0.49718423169750603, "grad_norm": 0.4383735925094647, "learning_rate": 9.86907471862652e-06, "loss": 0.0397, "step": 1854 }, { "epoch": 0.4974524001072674, "grad_norm": 0.3869606055517455, "learning_rate": 9.868719785300808e-06, "loss": 0.0349, "step": 1855 }, { "epoch": 0.49772056851702867, "grad_norm": 0.5074712119960155, "learning_rate": 9.868364377920773e-06, "loss": 0.0426, "step": 1856 }, { "epoch": 0.49798873692679, "grad_norm": 0.3480526457978337, "learning_rate": 9.868008496521027e-06, "loss": 0.0308, "step": 1857 }, { "epoch": 0.49825690533655137, "grad_norm": 0.5862240516067672, "learning_rate": 9.867652141136217e-06, "loss": 0.0509, "step": 1858 }, { "epoch": 0.49852507374631266, "grad_norm": 0.9092458268113179, "learning_rate": 9.86729531180104e-06, "loss": 0.0491, "step": 1859 }, { "epoch": 0.498793242156074, "grad_norm": 0.5614843788500468, "learning_rate": 9.866938008550241e-06, "loss": 0.044, "step": 1860 }, { "epoch": 0.49906141056583536, "grad_norm": 0.4485914744424546, "learning_rate": 9.866580231418612e-06, "loss": 0.0425, "step": 1861 }, { "epoch": 0.49932957897559666, "grad_norm": 0.40731958733621093, "learning_rate": 9.866221980440983e-06, "loss": 0.0376, "step": 1862 }, { "epoch": 0.499597747385358, "grad_norm": 0.5380937722838296, "learning_rate": 9.865863255652242e-06, "loss": 0.0318, "step": 1863 }, { "epoch": 0.49986591579511935, "grad_norm": 0.7654845250567571, "learning_rate": 9.865504057087313e-06, "loss": 0.057, "step": 1864 }, { "epoch": 0.5001340842048807, "grad_norm": 0.5566431707799038, "learning_rate": 9.865144384781171e-06, "loss": 0.0537, "step": 1865 }, { "epoch": 0.500402252614642, "grad_norm": 0.41807511265658265, "learning_rate": 9.864784238768839e-06, "loss": 0.0365, "step": 1866 }, { "epoch": 0.5006704210244033, "grad_norm": 0.7394143471398104, "learning_rate": 9.86442361908538e-06, "loss": 0.0544, "step": 1867 }, { "epoch": 0.5009385894341647, "grad_norm": 0.31639467889448114, "learning_rate": 9.864062525765909e-06, "loss": 0.0286, "step": 1868 }, { "epoch": 0.501206757843926, "grad_norm": 0.5109002128410765, "learning_rate": 9.863700958845582e-06, "loss": 0.0518, "step": 1869 }, { "epoch": 0.5014749262536873, "grad_norm": 0.507758032128084, "learning_rate": 9.863338918359606e-06, "loss": 0.0478, "step": 1870 }, { "epoch": 0.5017430946634487, "grad_norm": 0.41086170239629327, "learning_rate": 9.862976404343234e-06, "loss": 0.0492, "step": 1871 }, { "epoch": 0.50201126307321, "grad_norm": 0.4581922378723714, "learning_rate": 9.862613416831757e-06, "loss": 0.0346, "step": 1872 }, { "epoch": 0.5022794314829713, "grad_norm": 0.464125572725527, "learning_rate": 9.862249955860524e-06, "loss": 0.0358, "step": 1873 }, { "epoch": 0.5025475998927327, "grad_norm": 0.47219795030124384, "learning_rate": 9.86188602146492e-06, "loss": 0.0522, "step": 1874 }, { "epoch": 0.502815768302494, "grad_norm": 1.0254670954831642, "learning_rate": 9.861521613680384e-06, "loss": 0.037, "step": 1875 }, { "epoch": 0.5030839367122553, "grad_norm": 0.7118083427317898, "learning_rate": 9.861156732542393e-06, "loss": 0.0509, "step": 1876 }, { "epoch": 0.5033521051220167, "grad_norm": 1.005742751152509, "learning_rate": 9.86079137808648e-06, "loss": 0.0553, "step": 1877 }, { "epoch": 0.503620273531778, "grad_norm": 0.43639645321982073, "learning_rate": 9.860425550348214e-06, "loss": 0.0322, "step": 1878 }, { "epoch": 0.5038884419415393, "grad_norm": 0.39540392914836325, "learning_rate": 9.860059249363217e-06, "loss": 0.0349, "step": 1879 }, { "epoch": 0.5041566103513007, "grad_norm": 0.30716063712663144, "learning_rate": 9.859692475167153e-06, "loss": 0.0414, "step": 1880 }, { "epoch": 0.504424778761062, "grad_norm": 0.42916293937995925, "learning_rate": 9.859325227795736e-06, "loss": 0.0463, "step": 1881 }, { "epoch": 0.5046929471708232, "grad_norm": 0.42059838880493067, "learning_rate": 9.858957507284723e-06, "loss": 0.0541, "step": 1882 }, { "epoch": 0.5049611155805847, "grad_norm": 0.6780767624438282, "learning_rate": 9.858589313669918e-06, "loss": 0.0446, "step": 1883 }, { "epoch": 0.505229283990346, "grad_norm": 0.6026634806555805, "learning_rate": 9.858220646987168e-06, "loss": 0.048, "step": 1884 }, { "epoch": 0.5054974524001072, "grad_norm": 0.5533851408987825, "learning_rate": 9.857851507272376e-06, "loss": 0.0608, "step": 1885 }, { "epoch": 0.5057656208098686, "grad_norm": 0.6812708088287186, "learning_rate": 9.857481894561478e-06, "loss": 0.0376, "step": 1886 }, { "epoch": 0.5060337892196299, "grad_norm": 0.6507469169042163, "learning_rate": 9.857111808890465e-06, "loss": 0.0385, "step": 1887 }, { "epoch": 0.5063019576293912, "grad_norm": 0.3843719663581745, "learning_rate": 9.85674125029537e-06, "loss": 0.0428, "step": 1888 }, { "epoch": 0.5065701260391526, "grad_norm": 0.7375810130340558, "learning_rate": 9.856370218812274e-06, "loss": 0.0622, "step": 1889 }, { "epoch": 0.5068382944489139, "grad_norm": 0.4450931021728746, "learning_rate": 9.855998714477302e-06, "loss": 0.0434, "step": 1890 }, { "epoch": 0.5071064628586752, "grad_norm": 0.3779130332249112, "learning_rate": 9.85562673732663e-06, "loss": 0.0384, "step": 1891 }, { "epoch": 0.5073746312684366, "grad_norm": 0.49767523085243587, "learning_rate": 9.855254287396473e-06, "loss": 0.0591, "step": 1892 }, { "epoch": 0.5076427996781979, "grad_norm": 0.3987772224031206, "learning_rate": 9.854881364723094e-06, "loss": 0.0451, "step": 1893 }, { "epoch": 0.5079109680879592, "grad_norm": 0.44421769020093776, "learning_rate": 9.854507969342808e-06, "loss": 0.0512, "step": 1894 }, { "epoch": 0.5081791364977206, "grad_norm": 0.4481980708263646, "learning_rate": 9.854134101291972e-06, "loss": 0.0516, "step": 1895 }, { "epoch": 0.5084473049074819, "grad_norm": 0.5109881101545968, "learning_rate": 9.853759760606982e-06, "loss": 0.0393, "step": 1896 }, { "epoch": 0.5087154733172432, "grad_norm": 0.3534620215630057, "learning_rate": 9.853384947324293e-06, "loss": 0.0328, "step": 1897 }, { "epoch": 0.5089836417270046, "grad_norm": 0.3811009870491632, "learning_rate": 9.853009661480397e-06, "loss": 0.0409, "step": 1898 }, { "epoch": 0.5092518101367659, "grad_norm": 0.4789581048963292, "learning_rate": 9.852633903111834e-06, "loss": 0.0441, "step": 1899 }, { "epoch": 0.5095199785465272, "grad_norm": 0.4601266127667206, "learning_rate": 9.852257672255193e-06, "loss": 0.0409, "step": 1900 }, { "epoch": 0.5097881469562886, "grad_norm": 0.5089488473649736, "learning_rate": 9.851880968947102e-06, "loss": 0.0428, "step": 1901 }, { "epoch": 0.5100563153660499, "grad_norm": 0.4878199552228014, "learning_rate": 9.851503793224244e-06, "loss": 0.033, "step": 1902 }, { "epoch": 0.5103244837758112, "grad_norm": 0.4493974613474292, "learning_rate": 9.851126145123341e-06, "loss": 0.0495, "step": 1903 }, { "epoch": 0.5105926521855726, "grad_norm": 0.3125771381799123, "learning_rate": 9.850748024681168e-06, "loss": 0.0314, "step": 1904 }, { "epoch": 0.5108608205953339, "grad_norm": 0.40382139296307484, "learning_rate": 9.850369431934536e-06, "loss": 0.0359, "step": 1905 }, { "epoch": 0.5111289890050952, "grad_norm": 0.7241205269506823, "learning_rate": 9.84999036692031e-06, "loss": 0.0526, "step": 1906 }, { "epoch": 0.5113971574148566, "grad_norm": 0.3107369658748318, "learning_rate": 9.849610829675398e-06, "loss": 0.0307, "step": 1907 }, { "epoch": 0.5116653258246179, "grad_norm": 0.33351763330345985, "learning_rate": 9.849230820236756e-06, "loss": 0.0445, "step": 1908 }, { "epoch": 0.5119334942343792, "grad_norm": 0.5749430815443008, "learning_rate": 9.848850338641382e-06, "loss": 0.0421, "step": 1909 }, { "epoch": 0.5122016626441405, "grad_norm": 0.4833725106367499, "learning_rate": 9.848469384926328e-06, "loss": 0.0456, "step": 1910 }, { "epoch": 0.5124698310539019, "grad_norm": 0.5156372110936757, "learning_rate": 9.848087959128679e-06, "loss": 0.0498, "step": 1911 }, { "epoch": 0.5127379994636632, "grad_norm": 0.6176247978298052, "learning_rate": 9.847706061285575e-06, "loss": 0.0357, "step": 1912 }, { "epoch": 0.5130061678734245, "grad_norm": 0.43171110707880106, "learning_rate": 9.847323691434205e-06, "loss": 0.0448, "step": 1913 }, { "epoch": 0.5132743362831859, "grad_norm": 0.3990706233838123, "learning_rate": 9.846940849611797e-06, "loss": 0.0366, "step": 1914 }, { "epoch": 0.5135425046929472, "grad_norm": 0.404395381243116, "learning_rate": 9.846557535855625e-06, "loss": 0.0294, "step": 1915 }, { "epoch": 0.5138106731027084, "grad_norm": 0.42657056067898025, "learning_rate": 9.846173750203014e-06, "loss": 0.027, "step": 1916 }, { "epoch": 0.5140788415124699, "grad_norm": 0.5766397342309836, "learning_rate": 9.845789492691332e-06, "loss": 0.0402, "step": 1917 }, { "epoch": 0.5143470099222311, "grad_norm": 0.3729853346069816, "learning_rate": 9.845404763357991e-06, "loss": 0.0387, "step": 1918 }, { "epoch": 0.5146151783319924, "grad_norm": 0.5106127211076743, "learning_rate": 9.845019562240455e-06, "loss": 0.0481, "step": 1919 }, { "epoch": 0.5148833467417538, "grad_norm": 0.3897415463160873, "learning_rate": 9.844633889376225e-06, "loss": 0.0368, "step": 1920 }, { "epoch": 0.5151515151515151, "grad_norm": 0.5860622588651289, "learning_rate": 9.844247744802857e-06, "loss": 0.0553, "step": 1921 }, { "epoch": 0.5154196835612764, "grad_norm": 0.37246291251504515, "learning_rate": 9.843861128557945e-06, "loss": 0.0373, "step": 1922 }, { "epoch": 0.5156878519710378, "grad_norm": 0.5745569495626512, "learning_rate": 9.843474040679137e-06, "loss": 0.036, "step": 1923 }, { "epoch": 0.5159560203807991, "grad_norm": 0.6707355482823639, "learning_rate": 9.84308648120412e-06, "loss": 0.0559, "step": 1924 }, { "epoch": 0.5162241887905604, "grad_norm": 0.5100638170327184, "learning_rate": 9.842698450170631e-06, "loss": 0.0453, "step": 1925 }, { "epoch": 0.5164923572003218, "grad_norm": 0.4907191306181842, "learning_rate": 9.842309947616451e-06, "loss": 0.0474, "step": 1926 }, { "epoch": 0.5167605256100831, "grad_norm": 0.42121510319549144, "learning_rate": 9.841920973579408e-06, "loss": 0.0482, "step": 1927 }, { "epoch": 0.5170286940198444, "grad_norm": 0.9185573422712652, "learning_rate": 9.841531528097374e-06, "loss": 0.0771, "step": 1928 }, { "epoch": 0.5172968624296058, "grad_norm": 0.5518653802424296, "learning_rate": 9.84114161120827e-06, "loss": 0.053, "step": 1929 }, { "epoch": 0.5175650308393671, "grad_norm": 0.6237041078165506, "learning_rate": 9.84075122295006e-06, "loss": 0.045, "step": 1930 }, { "epoch": 0.5178331992491284, "grad_norm": 0.35822703425892305, "learning_rate": 9.840360363360755e-06, "loss": 0.036, "step": 1931 }, { "epoch": 0.5181013676588898, "grad_norm": 0.3286410773732383, "learning_rate": 9.839969032478412e-06, "loss": 0.0385, "step": 1932 }, { "epoch": 0.5183695360686511, "grad_norm": 0.6133069238667195, "learning_rate": 9.839577230341135e-06, "loss": 0.0411, "step": 1933 }, { "epoch": 0.5186377044784124, "grad_norm": 0.6423314141769016, "learning_rate": 9.839184956987072e-06, "loss": 0.0531, "step": 1934 }, { "epoch": 0.5189058728881738, "grad_norm": 0.4309056236846783, "learning_rate": 9.838792212454416e-06, "loss": 0.043, "step": 1935 }, { "epoch": 0.5191740412979351, "grad_norm": 0.7235472835134716, "learning_rate": 9.838398996781411e-06, "loss": 0.048, "step": 1936 }, { "epoch": 0.5194422097076964, "grad_norm": 0.5042307332850442, "learning_rate": 9.83800531000634e-06, "loss": 0.0468, "step": 1937 }, { "epoch": 0.5197103781174578, "grad_norm": 0.44779758949145526, "learning_rate": 9.837611152167537e-06, "loss": 0.0455, "step": 1938 }, { "epoch": 0.5199785465272191, "grad_norm": 0.6389948644499553, "learning_rate": 9.83721652330338e-06, "loss": 0.0431, "step": 1939 }, { "epoch": 0.5202467149369804, "grad_norm": 0.46670359205564366, "learning_rate": 9.836821423452293e-06, "loss": 0.0344, "step": 1940 }, { "epoch": 0.5205148833467418, "grad_norm": 0.4901862331847499, "learning_rate": 9.836425852652744e-06, "loss": 0.0593, "step": 1941 }, { "epoch": 0.5207830517565031, "grad_norm": 0.5864491269815173, "learning_rate": 9.836029810943252e-06, "loss": 0.0706, "step": 1942 }, { "epoch": 0.5210512201662644, "grad_norm": 0.33757569663805637, "learning_rate": 9.835633298362377e-06, "loss": 0.0337, "step": 1943 }, { "epoch": 0.5213193885760258, "grad_norm": 0.43250520818770744, "learning_rate": 9.835236314948724e-06, "loss": 0.0417, "step": 1944 }, { "epoch": 0.5215875569857871, "grad_norm": 0.6846583128704155, "learning_rate": 9.834838860740949e-06, "loss": 0.0429, "step": 1945 }, { "epoch": 0.5218557253955484, "grad_norm": 0.3503436281744212, "learning_rate": 9.834440935777751e-06, "loss": 0.03, "step": 1946 }, { "epoch": 0.5221238938053098, "grad_norm": 0.41595731129623564, "learning_rate": 9.834042540097875e-06, "loss": 0.0393, "step": 1947 }, { "epoch": 0.5223920622150711, "grad_norm": 0.3496022950857249, "learning_rate": 9.833643673740109e-06, "loss": 0.0336, "step": 1948 }, { "epoch": 0.5226602306248324, "grad_norm": 0.6312545212138936, "learning_rate": 9.833244336743293e-06, "loss": 0.0391, "step": 1949 }, { "epoch": 0.5229283990345938, "grad_norm": 0.4021598407955603, "learning_rate": 9.832844529146307e-06, "loss": 0.0403, "step": 1950 }, { "epoch": 0.523196567444355, "grad_norm": 0.38071431845456905, "learning_rate": 9.83244425098808e-06, "loss": 0.0406, "step": 1951 }, { "epoch": 0.5234647358541163, "grad_norm": 0.39321990686693775, "learning_rate": 9.832043502307586e-06, "loss": 0.046, "step": 1952 }, { "epoch": 0.5237329042638778, "grad_norm": 0.6248491736936698, "learning_rate": 9.831642283143847e-06, "loss": 0.0474, "step": 1953 }, { "epoch": 0.524001072673639, "grad_norm": 0.3378895479291673, "learning_rate": 9.831240593535926e-06, "loss": 0.036, "step": 1954 }, { "epoch": 0.5242692410834003, "grad_norm": 0.9819261143089283, "learning_rate": 9.830838433522934e-06, "loss": 0.0521, "step": 1955 }, { "epoch": 0.5245374094931617, "grad_norm": 0.681714251541869, "learning_rate": 9.83043580314403e-06, "loss": 0.0693, "step": 1956 }, { "epoch": 0.524805577902923, "grad_norm": 1.5256861818451446, "learning_rate": 9.830032702438416e-06, "loss": 0.0541, "step": 1957 }, { "epoch": 0.5250737463126843, "grad_norm": 0.329938399394509, "learning_rate": 9.829629131445342e-06, "loss": 0.0281, "step": 1958 }, { "epoch": 0.5253419147224457, "grad_norm": 0.47282152029280494, "learning_rate": 9.829225090204102e-06, "loss": 0.0417, "step": 1959 }, { "epoch": 0.525610083132207, "grad_norm": 0.41137365489454303, "learning_rate": 9.828820578754036e-06, "loss": 0.0437, "step": 1960 }, { "epoch": 0.5258782515419683, "grad_norm": 0.5122239422513344, "learning_rate": 9.82841559713453e-06, "loss": 0.0597, "step": 1961 }, { "epoch": 0.5261464199517297, "grad_norm": 0.48809842965384104, "learning_rate": 9.828010145385017e-06, "loss": 0.0514, "step": 1962 }, { "epoch": 0.526414588361491, "grad_norm": 0.44217222530563616, "learning_rate": 9.827604223544973e-06, "loss": 0.046, "step": 1963 }, { "epoch": 0.5266827567712523, "grad_norm": 0.7315258258220281, "learning_rate": 9.827197831653926e-06, "loss": 0.0607, "step": 1964 }, { "epoch": 0.5269509251810137, "grad_norm": 0.3509288366635582, "learning_rate": 9.826790969751438e-06, "loss": 0.0393, "step": 1965 }, { "epoch": 0.527219093590775, "grad_norm": 0.3528880637316147, "learning_rate": 9.826383637877131e-06, "loss": 0.0317, "step": 1966 }, { "epoch": 0.5274872620005363, "grad_norm": 0.5689149692030911, "learning_rate": 9.825975836070662e-06, "loss": 0.0381, "step": 1967 }, { "epoch": 0.5277554304102977, "grad_norm": 0.6289910115824681, "learning_rate": 9.825567564371738e-06, "loss": 0.0572, "step": 1968 }, { "epoch": 0.528023598820059, "grad_norm": 0.3384676975157835, "learning_rate": 9.825158822820113e-06, "loss": 0.0307, "step": 1969 }, { "epoch": 0.5282917672298203, "grad_norm": 0.6597062114764977, "learning_rate": 9.824749611455583e-06, "loss": 0.0424, "step": 1970 }, { "epoch": 0.5285599356395817, "grad_norm": 0.4474356497119647, "learning_rate": 9.824339930317994e-06, "loss": 0.04, "step": 1971 }, { "epoch": 0.528828104049343, "grad_norm": 0.4425259584233051, "learning_rate": 9.823929779447235e-06, "loss": 0.0423, "step": 1972 }, { "epoch": 0.5290962724591043, "grad_norm": 0.39778137771121064, "learning_rate": 9.823519158883237e-06, "loss": 0.0372, "step": 1973 }, { "epoch": 0.5293644408688657, "grad_norm": 0.3458657863402165, "learning_rate": 9.823108068665987e-06, "loss": 0.0337, "step": 1974 }, { "epoch": 0.529632609278627, "grad_norm": 0.4328796269012874, "learning_rate": 9.822696508835511e-06, "loss": 0.0452, "step": 1975 }, { "epoch": 0.5299007776883883, "grad_norm": 2.090740379595086, "learning_rate": 9.822284479431878e-06, "loss": 0.0495, "step": 1976 }, { "epoch": 0.5301689460981497, "grad_norm": 0.41427382342182656, "learning_rate": 9.821871980495208e-06, "loss": 0.0465, "step": 1977 }, { "epoch": 0.530437114507911, "grad_norm": 0.6721291614225887, "learning_rate": 9.821459012065666e-06, "loss": 0.0364, "step": 1978 }, { "epoch": 0.5307052829176723, "grad_norm": 0.42888362415783904, "learning_rate": 9.821045574183461e-06, "loss": 0.0319, "step": 1979 }, { "epoch": 0.5309734513274337, "grad_norm": 0.6734723561228295, "learning_rate": 9.820631666888847e-06, "loss": 0.0495, "step": 1980 }, { "epoch": 0.531241619737195, "grad_norm": 0.3538424065302501, "learning_rate": 9.820217290222127e-06, "loss": 0.0298, "step": 1981 }, { "epoch": 0.5315097881469563, "grad_norm": 0.5062514779579452, "learning_rate": 9.819802444223647e-06, "loss": 0.039, "step": 1982 }, { "epoch": 0.5317779565567177, "grad_norm": 0.5016167979760771, "learning_rate": 9.819387128933799e-06, "loss": 0.05, "step": 1983 }, { "epoch": 0.532046124966479, "grad_norm": 0.537226155910277, "learning_rate": 9.81897134439302e-06, "loss": 0.0479, "step": 1984 }, { "epoch": 0.5323142933762403, "grad_norm": 0.5413373632027572, "learning_rate": 9.818555090641797e-06, "loss": 0.042, "step": 1985 }, { "epoch": 0.5325824617860017, "grad_norm": 0.6882519895178072, "learning_rate": 9.818138367720657e-06, "loss": 0.0511, "step": 1986 }, { "epoch": 0.532850630195763, "grad_norm": 0.5362041863974267, "learning_rate": 9.817721175670176e-06, "loss": 0.0458, "step": 1987 }, { "epoch": 0.5331187986055242, "grad_norm": 0.46323507083640103, "learning_rate": 9.817303514530975e-06, "loss": 0.0358, "step": 1988 }, { "epoch": 0.5333869670152857, "grad_norm": 0.46288558438143196, "learning_rate": 9.816885384343722e-06, "loss": 0.0412, "step": 1989 }, { "epoch": 0.533655135425047, "grad_norm": 0.4645334753230899, "learning_rate": 9.816466785149127e-06, "loss": 0.0427, "step": 1990 }, { "epoch": 0.5339233038348082, "grad_norm": 0.3971151377830419, "learning_rate": 9.81604771698795e-06, "loss": 0.0387, "step": 1991 }, { "epoch": 0.5341914722445696, "grad_norm": 0.4267761556992693, "learning_rate": 9.815628179900988e-06, "loss": 0.0416, "step": 1992 }, { "epoch": 0.5344596406543309, "grad_norm": 0.6567419634869496, "learning_rate": 9.8152081739291e-06, "loss": 0.0358, "step": 1993 }, { "epoch": 0.5347278090640922, "grad_norm": 0.45082081992977835, "learning_rate": 9.814787699113175e-06, "loss": 0.0466, "step": 1994 }, { "epoch": 0.5349959774738536, "grad_norm": 0.5310439962914222, "learning_rate": 9.814366755494155e-06, "loss": 0.0435, "step": 1995 }, { "epoch": 0.5352641458836149, "grad_norm": 0.553501840553156, "learning_rate": 9.813945343113026e-06, "loss": 0.0436, "step": 1996 }, { "epoch": 0.5355323142933762, "grad_norm": 0.4259732765278402, "learning_rate": 9.813523462010819e-06, "loss": 0.0377, "step": 1997 }, { "epoch": 0.5358004827031375, "grad_norm": 1.5511140657561304, "learning_rate": 9.813101112228613e-06, "loss": 0.0411, "step": 1998 }, { "epoch": 0.5360686511128989, "grad_norm": 0.47376425038306474, "learning_rate": 9.81267829380753e-06, "loss": 0.0416, "step": 1999 }, { "epoch": 0.5363368195226602, "grad_norm": 0.6550935872750709, "learning_rate": 9.812255006788738e-06, "loss": 0.0469, "step": 2000 }, { "epoch": 0.5366049879324215, "grad_norm": 0.6607356353045636, "learning_rate": 9.811831251213453e-06, "loss": 0.0452, "step": 2001 }, { "epoch": 0.5368731563421829, "grad_norm": 0.4446414671365038, "learning_rate": 9.811407027122935e-06, "loss": 0.0375, "step": 2002 }, { "epoch": 0.5371413247519442, "grad_norm": 0.3949614749945395, "learning_rate": 9.810982334558487e-06, "loss": 0.0397, "step": 2003 }, { "epoch": 0.5374094931617055, "grad_norm": 0.4427131013536356, "learning_rate": 9.810557173561464e-06, "loss": 0.0414, "step": 2004 }, { "epoch": 0.5376776615714669, "grad_norm": 0.35130823272872097, "learning_rate": 9.810131544173258e-06, "loss": 0.0367, "step": 2005 }, { "epoch": 0.5379458299812282, "grad_norm": 0.8107306647559662, "learning_rate": 9.809705446435317e-06, "loss": 0.061, "step": 2006 }, { "epoch": 0.5382139983909895, "grad_norm": 0.5106172543697388, "learning_rate": 9.809278880389126e-06, "loss": 0.0445, "step": 2007 }, { "epoch": 0.5384821668007509, "grad_norm": 0.5370949472531165, "learning_rate": 9.808851846076218e-06, "loss": 0.05, "step": 2008 }, { "epoch": 0.5387503352105122, "grad_norm": 0.4537942000808546, "learning_rate": 9.808424343538172e-06, "loss": 0.0471, "step": 2009 }, { "epoch": 0.5390185036202735, "grad_norm": 0.47430974170344364, "learning_rate": 9.807996372816614e-06, "loss": 0.0398, "step": 2010 }, { "epoch": 0.5392866720300349, "grad_norm": 0.3367586635058746, "learning_rate": 9.807567933953217e-06, "loss": 0.0364, "step": 2011 }, { "epoch": 0.5395548404397962, "grad_norm": 0.504305868405743, "learning_rate": 9.807139026989691e-06, "loss": 0.0465, "step": 2012 }, { "epoch": 0.5398230088495575, "grad_norm": 0.3781998561433705, "learning_rate": 9.806709651967802e-06, "loss": 0.0379, "step": 2013 }, { "epoch": 0.5400911772593189, "grad_norm": 0.45412341800651856, "learning_rate": 9.806279808929355e-06, "loss": 0.0675, "step": 2014 }, { "epoch": 0.5403593456690802, "grad_norm": 0.8238655463195337, "learning_rate": 9.805849497916205e-06, "loss": 0.0449, "step": 2015 }, { "epoch": 0.5406275140788415, "grad_norm": 0.44307730767133374, "learning_rate": 9.805418718970248e-06, "loss": 0.0473, "step": 2016 }, { "epoch": 0.5408956824886029, "grad_norm": 0.6771360137143813, "learning_rate": 9.804987472133427e-06, "loss": 0.0474, "step": 2017 }, { "epoch": 0.5411638508983642, "grad_norm": 0.4298039556547534, "learning_rate": 9.804555757447734e-06, "loss": 0.0354, "step": 2018 }, { "epoch": 0.5414320193081255, "grad_norm": 0.4922060094158956, "learning_rate": 9.804123574955202e-06, "loss": 0.048, "step": 2019 }, { "epoch": 0.5417001877178869, "grad_norm": 0.38747284837817864, "learning_rate": 9.803690924697913e-06, "loss": 0.0363, "step": 2020 }, { "epoch": 0.5419683561276482, "grad_norm": 0.5680746067080892, "learning_rate": 9.803257806717993e-06, "loss": 0.0425, "step": 2021 }, { "epoch": 0.5422365245374094, "grad_norm": 0.30652112214799315, "learning_rate": 9.802824221057613e-06, "loss": 0.03, "step": 2022 }, { "epoch": 0.5425046929471709, "grad_norm": 0.6247593189557609, "learning_rate": 9.802390167758987e-06, "loss": 0.0371, "step": 2023 }, { "epoch": 0.5427728613569321, "grad_norm": 0.39695721378562643, "learning_rate": 9.801955646864383e-06, "loss": 0.0338, "step": 2024 }, { "epoch": 0.5430410297666934, "grad_norm": 0.5010609172765217, "learning_rate": 9.801520658416105e-06, "loss": 0.0407, "step": 2025 }, { "epoch": 0.5433091981764548, "grad_norm": 0.6081915900344399, "learning_rate": 9.80108520245651e-06, "loss": 0.0433, "step": 2026 }, { "epoch": 0.5435773665862161, "grad_norm": 0.36573294103600473, "learning_rate": 9.800649279027994e-06, "loss": 0.0321, "step": 2027 }, { "epoch": 0.5438455349959774, "grad_norm": 2.2304779803443497, "learning_rate": 9.800212888173005e-06, "loss": 0.0385, "step": 2028 }, { "epoch": 0.5441137034057388, "grad_norm": 0.32192436541585506, "learning_rate": 9.799776029934029e-06, "loss": 0.0331, "step": 2029 }, { "epoch": 0.5443818718155001, "grad_norm": 0.40096091621010693, "learning_rate": 9.799338704353605e-06, "loss": 0.0418, "step": 2030 }, { "epoch": 0.5446500402252614, "grad_norm": 0.575553366921704, "learning_rate": 9.798900911474315e-06, "loss": 0.0563, "step": 2031 }, { "epoch": 0.5449182086350228, "grad_norm": 0.5158739079181425, "learning_rate": 9.798462651338782e-06, "loss": 0.0487, "step": 2032 }, { "epoch": 0.5451863770447841, "grad_norm": 0.36719436387320414, "learning_rate": 9.79802392398968e-06, "loss": 0.0301, "step": 2033 }, { "epoch": 0.5454545454545454, "grad_norm": 0.6538025558989532, "learning_rate": 9.797584729469728e-06, "loss": 0.0655, "step": 2034 }, { "epoch": 0.5457227138643068, "grad_norm": 0.8195894836188615, "learning_rate": 9.797145067821689e-06, "loss": 0.0488, "step": 2035 }, { "epoch": 0.5459908822740681, "grad_norm": 0.35510011274434927, "learning_rate": 9.79670493908837e-06, "loss": 0.0306, "step": 2036 }, { "epoch": 0.5462590506838294, "grad_norm": 0.5362784949062973, "learning_rate": 9.796264343312626e-06, "loss": 0.0528, "step": 2037 }, { "epoch": 0.5465272190935908, "grad_norm": 0.29060534024733253, "learning_rate": 9.795823280537358e-06, "loss": 0.026, "step": 2038 }, { "epoch": 0.5467953875033521, "grad_norm": 0.6519799557829945, "learning_rate": 9.795381750805508e-06, "loss": 0.0512, "step": 2039 }, { "epoch": 0.5470635559131134, "grad_norm": 0.5900074585378169, "learning_rate": 9.794939754160069e-06, "loss": 0.0586, "step": 2040 }, { "epoch": 0.5473317243228748, "grad_norm": 0.43756819270212416, "learning_rate": 9.794497290644076e-06, "loss": 0.0427, "step": 2041 }, { "epoch": 0.5475998927326361, "grad_norm": 0.4906247245972443, "learning_rate": 9.794054360300612e-06, "loss": 0.0328, "step": 2042 }, { "epoch": 0.5478680611423974, "grad_norm": 0.5929987261893424, "learning_rate": 9.793610963172802e-06, "loss": 0.0462, "step": 2043 }, { "epoch": 0.5481362295521588, "grad_norm": 0.4337172730837812, "learning_rate": 9.793167099303821e-06, "loss": 0.0406, "step": 2044 }, { "epoch": 0.5484043979619201, "grad_norm": 0.4838839310179972, "learning_rate": 9.792722768736885e-06, "loss": 0.0561, "step": 2045 }, { "epoch": 0.5486725663716814, "grad_norm": 0.4786640842564696, "learning_rate": 9.792277971515255e-06, "loss": 0.0435, "step": 2046 }, { "epoch": 0.5489407347814428, "grad_norm": 0.4166148286841388, "learning_rate": 9.791832707682242e-06, "loss": 0.0369, "step": 2047 }, { "epoch": 0.5492089031912041, "grad_norm": 0.36192826392679633, "learning_rate": 9.791386977281203e-06, "loss": 0.0493, "step": 2048 }, { "epoch": 0.5494770716009654, "grad_norm": 0.4842276034441402, "learning_rate": 9.790940780355534e-06, "loss": 0.0409, "step": 2049 }, { "epoch": 0.5497452400107268, "grad_norm": 0.4699337757958404, "learning_rate": 9.790494116948681e-06, "loss": 0.0406, "step": 2050 }, { "epoch": 0.5500134084204881, "grad_norm": 0.5737432895283681, "learning_rate": 9.790046987104133e-06, "loss": 0.0589, "step": 2051 }, { "epoch": 0.5502815768302494, "grad_norm": 0.3615656440886661, "learning_rate": 9.78959939086543e-06, "loss": 0.0359, "step": 2052 }, { "epoch": 0.5505497452400108, "grad_norm": 0.6521552816726939, "learning_rate": 9.789151328276147e-06, "loss": 0.0387, "step": 2053 }, { "epoch": 0.5508179136497721, "grad_norm": 0.3515190780840208, "learning_rate": 9.788702799379916e-06, "loss": 0.0316, "step": 2054 }, { "epoch": 0.5510860820595334, "grad_norm": 0.45202218978759595, "learning_rate": 9.78825380422041e-06, "loss": 0.0429, "step": 2055 }, { "epoch": 0.5513542504692948, "grad_norm": 0.5250935924423318, "learning_rate": 9.787804342841338e-06, "loss": 0.0565, "step": 2056 }, { "epoch": 0.551622418879056, "grad_norm": 0.47788448557594704, "learning_rate": 9.787354415286472e-06, "loss": 0.053, "step": 2057 }, { "epoch": 0.5518905872888173, "grad_norm": 0.3251582506228371, "learning_rate": 9.786904021599617e-06, "loss": 0.0301, "step": 2058 }, { "epoch": 0.5521587556985788, "grad_norm": 0.3484218443039387, "learning_rate": 9.786453161824625e-06, "loss": 0.0319, "step": 2059 }, { "epoch": 0.55242692410834, "grad_norm": 0.47340406708614097, "learning_rate": 9.786001836005397e-06, "loss": 0.0462, "step": 2060 }, { "epoch": 0.5526950925181013, "grad_norm": 0.48626099584110305, "learning_rate": 9.785550044185877e-06, "loss": 0.0433, "step": 2061 }, { "epoch": 0.5529632609278627, "grad_norm": 0.3167950381107025, "learning_rate": 9.785097786410055e-06, "loss": 0.0312, "step": 2062 }, { "epoch": 0.553231429337624, "grad_norm": 0.6022408890720156, "learning_rate": 9.784645062721962e-06, "loss": 0.0547, "step": 2063 }, { "epoch": 0.5534995977473853, "grad_norm": 0.35470841267040665, "learning_rate": 9.784191873165686e-06, "loss": 0.0367, "step": 2064 }, { "epoch": 0.5537677661571467, "grad_norm": 0.424495311964531, "learning_rate": 9.783738217785349e-06, "loss": 0.0325, "step": 2065 }, { "epoch": 0.554035934566908, "grad_norm": 0.36146023364684426, "learning_rate": 9.78328409662512e-06, "loss": 0.0409, "step": 2066 }, { "epoch": 0.5543041029766693, "grad_norm": 0.5301766890345284, "learning_rate": 9.78282950972922e-06, "loss": 0.053, "step": 2067 }, { "epoch": 0.5545722713864307, "grad_norm": 0.5017354808164451, "learning_rate": 9.782374457141908e-06, "loss": 0.0544, "step": 2068 }, { "epoch": 0.554840439796192, "grad_norm": 0.5374461758811344, "learning_rate": 9.781918938907494e-06, "loss": 0.0506, "step": 2069 }, { "epoch": 0.5551086082059533, "grad_norm": 0.43022143865597107, "learning_rate": 9.781462955070326e-06, "loss": 0.0339, "step": 2070 }, { "epoch": 0.5553767766157147, "grad_norm": 0.5169420354861711, "learning_rate": 9.781006505674807e-06, "loss": 0.0387, "step": 2071 }, { "epoch": 0.555644945025476, "grad_norm": 0.4581194811481187, "learning_rate": 9.780549590765375e-06, "loss": 0.05, "step": 2072 }, { "epoch": 0.5559131134352373, "grad_norm": 0.3565631334832981, "learning_rate": 9.780092210386522e-06, "loss": 0.0421, "step": 2073 }, { "epoch": 0.5561812818449987, "grad_norm": 0.5216370615825768, "learning_rate": 9.779634364582784e-06, "loss": 0.0518, "step": 2074 }, { "epoch": 0.55644945025476, "grad_norm": 0.3122264947393512, "learning_rate": 9.779176053398736e-06, "loss": 0.0213, "step": 2075 }, { "epoch": 0.5567176186645213, "grad_norm": 1.579424762978187, "learning_rate": 9.778717276879004e-06, "loss": 0.0754, "step": 2076 }, { "epoch": 0.5569857870742827, "grad_norm": 0.3053673373031889, "learning_rate": 9.778258035068258e-06, "loss": 0.0281, "step": 2077 }, { "epoch": 0.557253955484044, "grad_norm": 0.5593653090136709, "learning_rate": 9.777798328011213e-06, "loss": 0.0513, "step": 2078 }, { "epoch": 0.5575221238938053, "grad_norm": 0.5753492890240071, "learning_rate": 9.77733815575263e-06, "loss": 0.0358, "step": 2079 }, { "epoch": 0.5577902923035667, "grad_norm": 0.4172494707571354, "learning_rate": 9.776877518337313e-06, "loss": 0.0382, "step": 2080 }, { "epoch": 0.558058460713328, "grad_norm": 0.4039088483929679, "learning_rate": 9.776416415810116e-06, "loss": 0.0309, "step": 2081 }, { "epoch": 0.5583266291230893, "grad_norm": 0.2979210038766568, "learning_rate": 9.775954848215934e-06, "loss": 0.0284, "step": 2082 }, { "epoch": 0.5585947975328507, "grad_norm": 0.3216565089918019, "learning_rate": 9.775492815599708e-06, "loss": 0.0308, "step": 2083 }, { "epoch": 0.558862965942612, "grad_norm": 0.4367729345593559, "learning_rate": 9.775030318006424e-06, "loss": 0.0467, "step": 2084 }, { "epoch": 0.5591311343523733, "grad_norm": 0.3212750754072431, "learning_rate": 9.774567355481116e-06, "loss": 0.035, "step": 2085 }, { "epoch": 0.5593993027621347, "grad_norm": 0.42474297370472514, "learning_rate": 9.77410392806886e-06, "loss": 0.0441, "step": 2086 }, { "epoch": 0.559667471171896, "grad_norm": 0.29477129658167295, "learning_rate": 9.773640035814783e-06, "loss": 0.0216, "step": 2087 }, { "epoch": 0.5599356395816573, "grad_norm": 0.3762342046061444, "learning_rate": 9.773175678764047e-06, "loss": 0.0355, "step": 2088 }, { "epoch": 0.5602038079914186, "grad_norm": 0.43503288944396545, "learning_rate": 9.772710856961867e-06, "loss": 0.0424, "step": 2089 }, { "epoch": 0.56047197640118, "grad_norm": 0.35264929411733437, "learning_rate": 9.772245570453502e-06, "loss": 0.0417, "step": 2090 }, { "epoch": 0.5607401448109413, "grad_norm": 0.5080284736308984, "learning_rate": 9.771779819284257e-06, "loss": 0.0464, "step": 2091 }, { "epoch": 0.5610083132207025, "grad_norm": 0.39316649675191195, "learning_rate": 9.77131360349948e-06, "loss": 0.0352, "step": 2092 }, { "epoch": 0.561276481630464, "grad_norm": 0.38818583247822613, "learning_rate": 9.770846923144565e-06, "loss": 0.047, "step": 2093 }, { "epoch": 0.5615446500402252, "grad_norm": 0.5377635772988183, "learning_rate": 9.77037977826495e-06, "loss": 0.051, "step": 2094 }, { "epoch": 0.5618128184499865, "grad_norm": 0.44354737977605535, "learning_rate": 9.769912168906124e-06, "loss": 0.0484, "step": 2095 }, { "epoch": 0.5620809868597479, "grad_norm": 0.45223325705951223, "learning_rate": 9.769444095113611e-06, "loss": 0.041, "step": 2096 }, { "epoch": 0.5623491552695092, "grad_norm": 0.4838352388714414, "learning_rate": 9.768975556932991e-06, "loss": 0.0442, "step": 2097 }, { "epoch": 0.5626173236792705, "grad_norm": 0.40456575148472995, "learning_rate": 9.768506554409882e-06, "loss": 0.0362, "step": 2098 }, { "epoch": 0.5628854920890319, "grad_norm": 0.5774510742223516, "learning_rate": 9.76803708758995e-06, "loss": 0.0483, "step": 2099 }, { "epoch": 0.5631536604987932, "grad_norm": 0.4171752223131799, "learning_rate": 9.767567156518904e-06, "loss": 0.0447, "step": 2100 }, { "epoch": 0.5634218289085545, "grad_norm": 0.4476448468888343, "learning_rate": 9.767096761242503e-06, "loss": 0.0463, "step": 2101 }, { "epoch": 0.5636899973183159, "grad_norm": 0.45290939533458785, "learning_rate": 9.766625901806549e-06, "loss": 0.0389, "step": 2102 }, { "epoch": 0.5639581657280772, "grad_norm": 0.7304028395868565, "learning_rate": 9.766154578256883e-06, "loss": 0.0443, "step": 2103 }, { "epoch": 0.5642263341378385, "grad_norm": 0.4765604495993669, "learning_rate": 9.765682790639399e-06, "loss": 0.0365, "step": 2104 }, { "epoch": 0.5644945025475999, "grad_norm": 0.7336690976584431, "learning_rate": 9.765210539000036e-06, "loss": 0.0438, "step": 2105 }, { "epoch": 0.5647626709573612, "grad_norm": 0.3560323954515163, "learning_rate": 9.764737823384774e-06, "loss": 0.0414, "step": 2106 }, { "epoch": 0.5650308393671225, "grad_norm": 0.45287341106154827, "learning_rate": 9.76426464383964e-06, "loss": 0.0384, "step": 2107 }, { "epoch": 0.5652990077768839, "grad_norm": 0.41152190747814515, "learning_rate": 9.763791000410708e-06, "loss": 0.0348, "step": 2108 }, { "epoch": 0.5655671761866452, "grad_norm": 0.285392034075322, "learning_rate": 9.763316893144092e-06, "loss": 0.0278, "step": 2109 }, { "epoch": 0.5658353445964065, "grad_norm": 0.3535447222610612, "learning_rate": 9.762842322085958e-06, "loss": 0.037, "step": 2110 }, { "epoch": 0.5661035130061679, "grad_norm": 0.4524967452957079, "learning_rate": 9.76236728728251e-06, "loss": 0.0427, "step": 2111 }, { "epoch": 0.5663716814159292, "grad_norm": 0.5218421110550597, "learning_rate": 9.761891788780005e-06, "loss": 0.037, "step": 2112 }, { "epoch": 0.5666398498256905, "grad_norm": 1.0863645734291743, "learning_rate": 9.761415826624737e-06, "loss": 0.0629, "step": 2113 }, { "epoch": 0.5669080182354519, "grad_norm": 0.325169515071217, "learning_rate": 9.760939400863054e-06, "loss": 0.0349, "step": 2114 }, { "epoch": 0.5671761866452132, "grad_norm": 0.36504534353636786, "learning_rate": 9.76046251154134e-06, "loss": 0.0397, "step": 2115 }, { "epoch": 0.5674443550549745, "grad_norm": 0.2962652617410509, "learning_rate": 9.759985158706031e-06, "loss": 0.0279, "step": 2116 }, { "epoch": 0.5677125234647359, "grad_norm": 0.4082815963454779, "learning_rate": 9.759507342403604e-06, "loss": 0.0305, "step": 2117 }, { "epoch": 0.5679806918744972, "grad_norm": 0.40428969729654063, "learning_rate": 9.759029062680582e-06, "loss": 0.0462, "step": 2118 }, { "epoch": 0.5682488602842585, "grad_norm": 0.3753737286300627, "learning_rate": 9.758550319583537e-06, "loss": 0.0421, "step": 2119 }, { "epoch": 0.5685170286940199, "grad_norm": 0.43614003710445937, "learning_rate": 9.758071113159082e-06, "loss": 0.0413, "step": 2120 }, { "epoch": 0.5687851971037812, "grad_norm": 0.31396105168948363, "learning_rate": 9.757591443453872e-06, "loss": 0.0446, "step": 2121 }, { "epoch": 0.5690533655135425, "grad_norm": 0.3462105378169937, "learning_rate": 9.757111310514616e-06, "loss": 0.0395, "step": 2122 }, { "epoch": 0.5693215339233039, "grad_norm": 0.8338544591156548, "learning_rate": 9.756630714388062e-06, "loss": 0.0456, "step": 2123 }, { "epoch": 0.5695897023330652, "grad_norm": 0.5701323783020232, "learning_rate": 9.756149655121003e-06, "loss": 0.0508, "step": 2124 }, { "epoch": 0.5698578707428265, "grad_norm": 0.34827120079679913, "learning_rate": 9.755668132760281e-06, "loss": 0.0306, "step": 2125 }, { "epoch": 0.5701260391525879, "grad_norm": 0.38347429166130564, "learning_rate": 9.755186147352778e-06, "loss": 0.0263, "step": 2126 }, { "epoch": 0.5703942075623492, "grad_norm": 0.588490483151567, "learning_rate": 9.754703698945425e-06, "loss": 0.0383, "step": 2127 }, { "epoch": 0.5706623759721104, "grad_norm": 0.27296474551765554, "learning_rate": 9.754220787585195e-06, "loss": 0.0289, "step": 2128 }, { "epoch": 0.5709305443818719, "grad_norm": 0.5444461516683837, "learning_rate": 9.753737413319109e-06, "loss": 0.0527, "step": 2129 }, { "epoch": 0.5711987127916331, "grad_norm": 0.6572488267723118, "learning_rate": 9.753253576194234e-06, "loss": 0.0491, "step": 2130 }, { "epoch": 0.5714668812013944, "grad_norm": 0.48119970459292355, "learning_rate": 9.752769276257676e-06, "loss": 0.0448, "step": 2131 }, { "epoch": 0.5717350496111558, "grad_norm": 0.38797727277170296, "learning_rate": 9.752284513556594e-06, "loss": 0.0372, "step": 2132 }, { "epoch": 0.5720032180209171, "grad_norm": 0.4276980331548487, "learning_rate": 9.751799288138184e-06, "loss": 0.0454, "step": 2133 }, { "epoch": 0.5722713864306784, "grad_norm": 0.43575611990419155, "learning_rate": 9.751313600049695e-06, "loss": 0.0544, "step": 2134 }, { "epoch": 0.5725395548404398, "grad_norm": 0.491247967210608, "learning_rate": 9.750827449338414e-06, "loss": 0.038, "step": 2135 }, { "epoch": 0.5728077232502011, "grad_norm": 0.5282192663398635, "learning_rate": 9.750340836051677e-06, "loss": 0.0615, "step": 2136 }, { "epoch": 0.5730758916599624, "grad_norm": 0.37085949609241015, "learning_rate": 9.749853760236866e-06, "loss": 0.0349, "step": 2137 }, { "epoch": 0.5733440600697238, "grad_norm": 0.44860013365532597, "learning_rate": 9.749366221941403e-06, "loss": 0.0384, "step": 2138 }, { "epoch": 0.5736122284794851, "grad_norm": 0.4542119002071712, "learning_rate": 9.748878221212763e-06, "loss": 0.0377, "step": 2139 }, { "epoch": 0.5738803968892464, "grad_norm": 0.5271416867196699, "learning_rate": 9.748389758098457e-06, "loss": 0.0396, "step": 2140 }, { "epoch": 0.5741485652990078, "grad_norm": 0.34991141371662426, "learning_rate": 9.747900832646047e-06, "loss": 0.0378, "step": 2141 }, { "epoch": 0.5744167337087691, "grad_norm": 0.3447358137381132, "learning_rate": 9.747411444903138e-06, "loss": 0.0264, "step": 2142 }, { "epoch": 0.5746849021185304, "grad_norm": 0.3728515318322614, "learning_rate": 9.746921594917382e-06, "loss": 0.0332, "step": 2143 }, { "epoch": 0.5749530705282918, "grad_norm": 0.5298190664493202, "learning_rate": 9.746431282736472e-06, "loss": 0.0413, "step": 2144 }, { "epoch": 0.5752212389380531, "grad_norm": 0.44507945510516694, "learning_rate": 9.74594050840815e-06, "loss": 0.0464, "step": 2145 }, { "epoch": 0.5754894073478144, "grad_norm": 0.49002151818755324, "learning_rate": 9.745449271980201e-06, "loss": 0.0552, "step": 2146 }, { "epoch": 0.5757575757575758, "grad_norm": 0.4441600986230594, "learning_rate": 9.744957573500455e-06, "loss": 0.0506, "step": 2147 }, { "epoch": 0.5760257441673371, "grad_norm": 0.5764105257002559, "learning_rate": 9.744465413016788e-06, "loss": 0.0397, "step": 2148 }, { "epoch": 0.5762939125770984, "grad_norm": 0.2855476292579802, "learning_rate": 9.74397279057712e-06, "loss": 0.0287, "step": 2149 }, { "epoch": 0.5765620809868598, "grad_norm": 0.390194945143855, "learning_rate": 9.743479706229417e-06, "loss": 0.0394, "step": 2150 }, { "epoch": 0.5768302493966211, "grad_norm": 0.5366491903500469, "learning_rate": 9.742986160021688e-06, "loss": 0.0443, "step": 2151 }, { "epoch": 0.5770984178063824, "grad_norm": 0.3072158253636778, "learning_rate": 9.742492152001989e-06, "loss": 0.0285, "step": 2152 }, { "epoch": 0.5773665862161438, "grad_norm": 0.467524127366552, "learning_rate": 9.741997682218421e-06, "loss": 0.0446, "step": 2153 }, { "epoch": 0.5776347546259051, "grad_norm": 0.48528206664672324, "learning_rate": 9.741502750719127e-06, "loss": 0.0452, "step": 2154 }, { "epoch": 0.5779029230356664, "grad_norm": 0.3451612756623971, "learning_rate": 9.741007357552298e-06, "loss": 0.0363, "step": 2155 }, { "epoch": 0.5781710914454278, "grad_norm": 0.43844220443100984, "learning_rate": 9.740511502766172e-06, "loss": 0.0413, "step": 2156 }, { "epoch": 0.5784392598551891, "grad_norm": 0.4696624759029428, "learning_rate": 9.740015186409027e-06, "loss": 0.0368, "step": 2157 }, { "epoch": 0.5787074282649504, "grad_norm": 0.4185091162817286, "learning_rate": 9.739518408529184e-06, "loss": 0.0359, "step": 2158 }, { "epoch": 0.5789755966747118, "grad_norm": 0.3941101034875836, "learning_rate": 9.739021169175021e-06, "loss": 0.0456, "step": 2159 }, { "epoch": 0.5792437650844731, "grad_norm": 0.8054524078002959, "learning_rate": 9.738523468394947e-06, "loss": 0.0406, "step": 2160 }, { "epoch": 0.5795119334942344, "grad_norm": 0.454688565025983, "learning_rate": 9.738025306237424e-06, "loss": 0.0382, "step": 2161 }, { "epoch": 0.5797801019039958, "grad_norm": 0.4507206044020744, "learning_rate": 9.737526682750955e-06, "loss": 0.043, "step": 2162 }, { "epoch": 0.580048270313757, "grad_norm": 0.460623106564885, "learning_rate": 9.73702759798409e-06, "loss": 0.0402, "step": 2163 }, { "epoch": 0.5803164387235183, "grad_norm": 0.47627129821737607, "learning_rate": 9.736528051985428e-06, "loss": 0.0458, "step": 2164 }, { "epoch": 0.5805846071332798, "grad_norm": 0.3166286003648832, "learning_rate": 9.736028044803602e-06, "loss": 0.0348, "step": 2165 }, { "epoch": 0.580852775543041, "grad_norm": 0.37749916470278194, "learning_rate": 9.7355275764873e-06, "loss": 0.0344, "step": 2166 }, { "epoch": 0.5811209439528023, "grad_norm": 0.6153093354304093, "learning_rate": 9.73502664708525e-06, "loss": 0.0418, "step": 2167 }, { "epoch": 0.5813891123625637, "grad_norm": 0.5107566190026994, "learning_rate": 9.734525256646226e-06, "loss": 0.034, "step": 2168 }, { "epoch": 0.581657280772325, "grad_norm": 0.4121734087539575, "learning_rate": 9.734023405219049e-06, "loss": 0.032, "step": 2169 }, { "epoch": 0.5819254491820863, "grad_norm": 0.43666703629430087, "learning_rate": 9.73352109285258e-06, "loss": 0.048, "step": 2170 }, { "epoch": 0.5821936175918477, "grad_norm": 0.3134956978568538, "learning_rate": 9.73301831959573e-06, "loss": 0.0383, "step": 2171 }, { "epoch": 0.582461786001609, "grad_norm": 0.4474145598439904, "learning_rate": 9.732515085497454e-06, "loss": 0.0372, "step": 2172 }, { "epoch": 0.5827299544113703, "grad_norm": 0.5343621344166201, "learning_rate": 9.732011390606748e-06, "loss": 0.0437, "step": 2173 }, { "epoch": 0.5829981228211317, "grad_norm": 0.5079619718393555, "learning_rate": 9.731507234972654e-06, "loss": 0.0453, "step": 2174 }, { "epoch": 0.583266291230893, "grad_norm": 0.9096753109290278, "learning_rate": 9.731002618644265e-06, "loss": 0.0359, "step": 2175 }, { "epoch": 0.5835344596406543, "grad_norm": 0.4280373228754013, "learning_rate": 9.73049754167071e-06, "loss": 0.0388, "step": 2176 }, { "epoch": 0.5838026280504156, "grad_norm": 0.43719637585384824, "learning_rate": 9.729992004101169e-06, "loss": 0.033, "step": 2177 }, { "epoch": 0.584070796460177, "grad_norm": 0.48033573099564386, "learning_rate": 9.729486005984864e-06, "loss": 0.0574, "step": 2178 }, { "epoch": 0.5843389648699383, "grad_norm": 1.0212384826506733, "learning_rate": 9.728979547371065e-06, "loss": 0.0527, "step": 2179 }, { "epoch": 0.5846071332796996, "grad_norm": 1.0403255456417992, "learning_rate": 9.728472628309081e-06, "loss": 0.0477, "step": 2180 }, { "epoch": 0.584875301689461, "grad_norm": 0.5244813618126417, "learning_rate": 9.727965248848273e-06, "loss": 0.0392, "step": 2181 }, { "epoch": 0.5851434700992223, "grad_norm": 0.3572887513832535, "learning_rate": 9.727457409038038e-06, "loss": 0.0356, "step": 2182 }, { "epoch": 0.5854116385089836, "grad_norm": 0.36221084727651665, "learning_rate": 9.726949108927828e-06, "loss": 0.0334, "step": 2183 }, { "epoch": 0.585679806918745, "grad_norm": 0.4966681747415277, "learning_rate": 9.726440348567133e-06, "loss": 0.04, "step": 2184 }, { "epoch": 0.5859479753285063, "grad_norm": 0.44885949706985573, "learning_rate": 9.72593112800549e-06, "loss": 0.0449, "step": 2185 }, { "epoch": 0.5862161437382676, "grad_norm": 0.3802783362425603, "learning_rate": 9.725421447292481e-06, "loss": 0.0354, "step": 2186 }, { "epoch": 0.586484312148029, "grad_norm": 0.41588836895012404, "learning_rate": 9.724911306477729e-06, "loss": 0.0363, "step": 2187 }, { "epoch": 0.5867524805577903, "grad_norm": 0.6468364941165152, "learning_rate": 9.72440070561091e-06, "loss": 0.05, "step": 2188 }, { "epoch": 0.5870206489675516, "grad_norm": 0.8598087938002239, "learning_rate": 9.723889644741735e-06, "loss": 0.0635, "step": 2189 }, { "epoch": 0.587288817377313, "grad_norm": 0.3545117807646672, "learning_rate": 9.723378123919969e-06, "loss": 0.0436, "step": 2190 }, { "epoch": 0.5875569857870743, "grad_norm": 0.47836329028824426, "learning_rate": 9.722866143195412e-06, "loss": 0.0413, "step": 2191 }, { "epoch": 0.5878251541968356, "grad_norm": 0.34704835376357335, "learning_rate": 9.722353702617921e-06, "loss": 0.036, "step": 2192 }, { "epoch": 0.588093322606597, "grad_norm": 0.4768755230370726, "learning_rate": 9.721840802237388e-06, "loss": 0.0395, "step": 2193 }, { "epoch": 0.5883614910163583, "grad_norm": 0.5389125022348678, "learning_rate": 9.72132744210375e-06, "loss": 0.0437, "step": 2194 }, { "epoch": 0.5886296594261196, "grad_norm": 0.41198521475630223, "learning_rate": 9.720813622266996e-06, "loss": 0.0485, "step": 2195 }, { "epoch": 0.588897827835881, "grad_norm": 0.470765776709888, "learning_rate": 9.72029934277715e-06, "loss": 0.0434, "step": 2196 }, { "epoch": 0.5891659962456423, "grad_norm": 0.5384187008665143, "learning_rate": 9.719784603684292e-06, "loss": 0.0446, "step": 2197 }, { "epoch": 0.5894341646554035, "grad_norm": 0.3969894997788225, "learning_rate": 9.719269405038537e-06, "loss": 0.033, "step": 2198 }, { "epoch": 0.589702333065165, "grad_norm": 0.5838373076782158, "learning_rate": 9.71875374689005e-06, "loss": 0.047, "step": 2199 }, { "epoch": 0.5899705014749262, "grad_norm": 0.4063369736458305, "learning_rate": 9.718237629289038e-06, "loss": 0.0401, "step": 2200 }, { "epoch": 0.5902386698846875, "grad_norm": 0.41504110835775054, "learning_rate": 9.717721052285754e-06, "loss": 0.0413, "step": 2201 }, { "epoch": 0.5905068382944489, "grad_norm": 0.39772463748917164, "learning_rate": 9.717204015930498e-06, "loss": 0.0427, "step": 2202 }, { "epoch": 0.5907750067042102, "grad_norm": 0.3346179999534236, "learning_rate": 9.716686520273611e-06, "loss": 0.0404, "step": 2203 }, { "epoch": 0.5910431751139715, "grad_norm": 0.4103858188853257, "learning_rate": 9.716168565365477e-06, "loss": 0.0437, "step": 2204 }, { "epoch": 0.5913113435237329, "grad_norm": 0.3527285845368918, "learning_rate": 9.715650151256534e-06, "loss": 0.0286, "step": 2205 }, { "epoch": 0.5915795119334942, "grad_norm": 0.3807566350942936, "learning_rate": 9.715131277997256e-06, "loss": 0.0359, "step": 2206 }, { "epoch": 0.5918476803432555, "grad_norm": 0.4000130132722489, "learning_rate": 9.714611945638162e-06, "loss": 0.0436, "step": 2207 }, { "epoch": 0.5921158487530169, "grad_norm": 0.47314414916876274, "learning_rate": 9.714092154229821e-06, "loss": 0.0379, "step": 2208 }, { "epoch": 0.5923840171627782, "grad_norm": 0.24046492937674482, "learning_rate": 9.713571903822842e-06, "loss": 0.0265, "step": 2209 }, { "epoch": 0.5926521855725395, "grad_norm": 0.6198286297498614, "learning_rate": 9.713051194467882e-06, "loss": 0.035, "step": 2210 }, { "epoch": 0.5929203539823009, "grad_norm": 0.4993532950411199, "learning_rate": 9.71253002621564e-06, "loss": 0.0432, "step": 2211 }, { "epoch": 0.5931885223920622, "grad_norm": 0.3145678969510866, "learning_rate": 9.712008399116861e-06, "loss": 0.032, "step": 2212 }, { "epoch": 0.5934566908018235, "grad_norm": 0.32512092579945895, "learning_rate": 9.711486313222332e-06, "loss": 0.0265, "step": 2213 }, { "epoch": 0.5937248592115849, "grad_norm": 0.4596823544846739, "learning_rate": 9.710963768582891e-06, "loss": 0.0392, "step": 2214 }, { "epoch": 0.5939930276213462, "grad_norm": 0.3044444403104068, "learning_rate": 9.710440765249416e-06, "loss": 0.0245, "step": 2215 }, { "epoch": 0.5942611960311075, "grad_norm": 0.5105664294956231, "learning_rate": 9.709917303272828e-06, "loss": 0.0382, "step": 2216 }, { "epoch": 0.5945293644408689, "grad_norm": 0.3442700076277357, "learning_rate": 9.709393382704098e-06, "loss": 0.0358, "step": 2217 }, { "epoch": 0.5947975328506302, "grad_norm": 0.6048828723248941, "learning_rate": 9.708869003594238e-06, "loss": 0.0563, "step": 2218 }, { "epoch": 0.5950657012603915, "grad_norm": 0.4216081745123837, "learning_rate": 9.708344165994304e-06, "loss": 0.0477, "step": 2219 }, { "epoch": 0.5953338696701529, "grad_norm": 0.29586953307035435, "learning_rate": 9.707818869955398e-06, "loss": 0.0347, "step": 2220 }, { "epoch": 0.5956020380799142, "grad_norm": 0.469085796136734, "learning_rate": 9.70729311552867e-06, "loss": 0.0475, "step": 2221 }, { "epoch": 0.5958702064896755, "grad_norm": 0.34934826031390986, "learning_rate": 9.706766902765308e-06, "loss": 0.0337, "step": 2222 }, { "epoch": 0.5961383748994369, "grad_norm": 0.4507792359312918, "learning_rate": 9.706240231716549e-06, "loss": 0.0386, "step": 2223 }, { "epoch": 0.5964065433091982, "grad_norm": 0.43696323476873283, "learning_rate": 9.705713102433673e-06, "loss": 0.0421, "step": 2224 }, { "epoch": 0.5966747117189595, "grad_norm": 0.4822237783343997, "learning_rate": 9.705185514968006e-06, "loss": 0.0563, "step": 2225 }, { "epoch": 0.5969428801287209, "grad_norm": 0.3733374702583346, "learning_rate": 9.704657469370917e-06, "loss": 0.0356, "step": 2226 }, { "epoch": 0.5972110485384822, "grad_norm": 0.39314583754393495, "learning_rate": 9.70412896569382e-06, "loss": 0.0365, "step": 2227 }, { "epoch": 0.5974792169482435, "grad_norm": 0.38806069218131134, "learning_rate": 9.703600003988176e-06, "loss": 0.0338, "step": 2228 }, { "epoch": 0.5977473853580049, "grad_norm": 0.3754476111476056, "learning_rate": 9.703070584305487e-06, "loss": 0.0302, "step": 2229 }, { "epoch": 0.5980155537677662, "grad_norm": 0.5970809307581462, "learning_rate": 9.702540706697303e-06, "loss": 0.0601, "step": 2230 }, { "epoch": 0.5982837221775275, "grad_norm": 0.6346509039914741, "learning_rate": 9.702010371215213e-06, "loss": 0.0335, "step": 2231 }, { "epoch": 0.5985518905872889, "grad_norm": 0.4036233679298006, "learning_rate": 9.701479577910859e-06, "loss": 0.0477, "step": 2232 }, { "epoch": 0.5988200589970502, "grad_norm": 0.4439968651954235, "learning_rate": 9.70094832683592e-06, "loss": 0.0407, "step": 2233 }, { "epoch": 0.5990882274068114, "grad_norm": 0.37095619925690465, "learning_rate": 9.700416618042123e-06, "loss": 0.0412, "step": 2234 }, { "epoch": 0.5993563958165729, "grad_norm": 0.41861742846672156, "learning_rate": 9.699884451581238e-06, "loss": 0.0314, "step": 2235 }, { "epoch": 0.5996245642263341, "grad_norm": 0.44463773300016574, "learning_rate": 9.699351827505084e-06, "loss": 0.0456, "step": 2236 }, { "epoch": 0.5998927326360954, "grad_norm": 0.3693821852752085, "learning_rate": 9.698818745865518e-06, "loss": 0.0296, "step": 2237 }, { "epoch": 0.6001609010458568, "grad_norm": 0.28920306561067777, "learning_rate": 9.698285206714446e-06, "loss": 0.0279, "step": 2238 }, { "epoch": 0.6004290694556181, "grad_norm": 0.5730761223565658, "learning_rate": 9.697751210103817e-06, "loss": 0.0418, "step": 2239 }, { "epoch": 0.6006972378653794, "grad_norm": 0.42848234973724597, "learning_rate": 9.697216756085625e-06, "loss": 0.0361, "step": 2240 }, { "epoch": 0.6009654062751408, "grad_norm": 0.3231561664789962, "learning_rate": 9.696681844711909e-06, "loss": 0.0304, "step": 2241 }, { "epoch": 0.6012335746849021, "grad_norm": 0.5239270405028139, "learning_rate": 9.696146476034752e-06, "loss": 0.035, "step": 2242 }, { "epoch": 0.6015017430946634, "grad_norm": 0.7148736205821443, "learning_rate": 9.695610650106278e-06, "loss": 0.0666, "step": 2243 }, { "epoch": 0.6017699115044248, "grad_norm": 0.2620489523356466, "learning_rate": 9.695074366978663e-06, "loss": 0.0269, "step": 2244 }, { "epoch": 0.6020380799141861, "grad_norm": 0.43710790784223125, "learning_rate": 9.69453762670412e-06, "loss": 0.0477, "step": 2245 }, { "epoch": 0.6023062483239474, "grad_norm": 0.4288880193649563, "learning_rate": 9.694000429334915e-06, "loss": 0.0426, "step": 2246 }, { "epoch": 0.6025744167337088, "grad_norm": 0.5896139340292553, "learning_rate": 9.693462774923351e-06, "loss": 0.0476, "step": 2247 }, { "epoch": 0.6028425851434701, "grad_norm": 0.3110674263352271, "learning_rate": 9.692924663521776e-06, "loss": 0.0339, "step": 2248 }, { "epoch": 0.6031107535532314, "grad_norm": 0.5414130537588965, "learning_rate": 9.692386095182585e-06, "loss": 0.0414, "step": 2249 }, { "epoch": 0.6033789219629928, "grad_norm": 1.1355091141009677, "learning_rate": 9.691847069958221e-06, "loss": 0.0366, "step": 2250 }, { "epoch": 0.6036470903727541, "grad_norm": 0.5677678615109082, "learning_rate": 9.691307587901161e-06, "loss": 0.0361, "step": 2251 }, { "epoch": 0.6039152587825154, "grad_norm": 0.3910756127631531, "learning_rate": 9.690767649063938e-06, "loss": 0.0437, "step": 2252 }, { "epoch": 0.6041834271922768, "grad_norm": 0.3594163077118075, "learning_rate": 9.690227253499122e-06, "loss": 0.0333, "step": 2253 }, { "epoch": 0.6044515956020381, "grad_norm": 0.3372276024627769, "learning_rate": 9.689686401259331e-06, "loss": 0.0325, "step": 2254 }, { "epoch": 0.6047197640117994, "grad_norm": 0.2571848951828705, "learning_rate": 9.689145092397227e-06, "loss": 0.0268, "step": 2255 }, { "epoch": 0.6049879324215608, "grad_norm": 0.5801177810993481, "learning_rate": 9.688603326965513e-06, "loss": 0.0489, "step": 2256 }, { "epoch": 0.6052561008313221, "grad_norm": 0.3058781308045926, "learning_rate": 9.688061105016942e-06, "loss": 0.0288, "step": 2257 }, { "epoch": 0.6055242692410834, "grad_norm": 0.4697661910113497, "learning_rate": 9.687518426604308e-06, "loss": 0.0331, "step": 2258 }, { "epoch": 0.6057924376508448, "grad_norm": 0.5441314074030762, "learning_rate": 9.686975291780449e-06, "loss": 0.0503, "step": 2259 }, { "epoch": 0.6060606060606061, "grad_norm": 0.5737967022001436, "learning_rate": 9.686431700598251e-06, "loss": 0.0492, "step": 2260 }, { "epoch": 0.6063287744703674, "grad_norm": 0.6483603804329836, "learning_rate": 9.68588765311064e-06, "loss": 0.0541, "step": 2261 }, { "epoch": 0.6065969428801288, "grad_norm": 0.3304615518027629, "learning_rate": 9.68534314937059e-06, "loss": 0.0332, "step": 2262 }, { "epoch": 0.6068651112898901, "grad_norm": 0.42468857667842247, "learning_rate": 9.684798189431114e-06, "loss": 0.0449, "step": 2263 }, { "epoch": 0.6071332796996514, "grad_norm": 0.4399345744885095, "learning_rate": 9.684252773345279e-06, "loss": 0.0451, "step": 2264 }, { "epoch": 0.6074014481094127, "grad_norm": 1.1710580615219952, "learning_rate": 9.683706901166186e-06, "loss": 0.0482, "step": 2265 }, { "epoch": 0.6076696165191741, "grad_norm": 0.5256092821455982, "learning_rate": 9.683160572946988e-06, "loss": 0.0628, "step": 2266 }, { "epoch": 0.6079377849289354, "grad_norm": 0.42811551374286155, "learning_rate": 9.682613788740878e-06, "loss": 0.0444, "step": 2267 }, { "epoch": 0.6082059533386966, "grad_norm": 0.5378938284618875, "learning_rate": 9.682066548601097e-06, "loss": 0.046, "step": 2268 }, { "epoch": 0.608474121748458, "grad_norm": 0.3245020121384951, "learning_rate": 9.681518852580925e-06, "loss": 0.033, "step": 2269 }, { "epoch": 0.6087422901582193, "grad_norm": 0.4649481392805461, "learning_rate": 9.680970700733692e-06, "loss": 0.0374, "step": 2270 }, { "epoch": 0.6090104585679806, "grad_norm": 0.6778754461735137, "learning_rate": 9.68042209311277e-06, "loss": 0.0403, "step": 2271 }, { "epoch": 0.609278626977742, "grad_norm": 0.46378905290317013, "learning_rate": 9.679873029771575e-06, "loss": 0.0403, "step": 2272 }, { "epoch": 0.6095467953875033, "grad_norm": 0.4552230860160537, "learning_rate": 9.679323510763568e-06, "loss": 0.0419, "step": 2273 }, { "epoch": 0.6098149637972646, "grad_norm": 0.5471325484675843, "learning_rate": 9.678773536142254e-06, "loss": 0.047, "step": 2274 }, { "epoch": 0.610083132207026, "grad_norm": 0.5233573422025999, "learning_rate": 9.678223105961182e-06, "loss": 0.0599, "step": 2275 }, { "epoch": 0.6103513006167873, "grad_norm": 0.2911582456311716, "learning_rate": 9.677672220273948e-06, "loss": 0.0227, "step": 2276 }, { "epoch": 0.6106194690265486, "grad_norm": 0.3375568328231506, "learning_rate": 9.677120879134187e-06, "loss": 0.0454, "step": 2277 }, { "epoch": 0.61088763743631, "grad_norm": 0.4487487839547596, "learning_rate": 9.676569082595585e-06, "loss": 0.0361, "step": 2278 }, { "epoch": 0.6111558058460713, "grad_norm": 0.6009020436948649, "learning_rate": 9.676016830711868e-06, "loss": 0.0529, "step": 2279 }, { "epoch": 0.6114239742558326, "grad_norm": 0.5289667287982411, "learning_rate": 9.675464123536803e-06, "loss": 0.0477, "step": 2280 }, { "epoch": 0.611692142665594, "grad_norm": 0.46202018559487407, "learning_rate": 9.674910961124215e-06, "loss": 0.0454, "step": 2281 }, { "epoch": 0.6119603110753553, "grad_norm": 0.408030503770942, "learning_rate": 9.674357343527955e-06, "loss": 0.0394, "step": 2282 }, { "epoch": 0.6122284794851166, "grad_norm": 0.478802591725159, "learning_rate": 9.67380327080193e-06, "loss": 0.0398, "step": 2283 }, { "epoch": 0.612496647894878, "grad_norm": 0.5665621214222397, "learning_rate": 9.673248743000092e-06, "loss": 0.0562, "step": 2284 }, { "epoch": 0.6127648163046393, "grad_norm": 0.3295307071238235, "learning_rate": 9.67269376017643e-06, "loss": 0.0439, "step": 2285 }, { "epoch": 0.6130329847144006, "grad_norm": 0.5155569113910213, "learning_rate": 9.67213832238498e-06, "loss": 0.0353, "step": 2286 }, { "epoch": 0.613301153124162, "grad_norm": 0.495477585095254, "learning_rate": 9.671582429679829e-06, "loss": 0.0363, "step": 2287 }, { "epoch": 0.6135693215339233, "grad_norm": 0.41075967198482366, "learning_rate": 9.671026082115098e-06, "loss": 0.0335, "step": 2288 }, { "epoch": 0.6138374899436846, "grad_norm": 0.3036951598571639, "learning_rate": 9.670469279744959e-06, "loss": 0.0273, "step": 2289 }, { "epoch": 0.614105658353446, "grad_norm": 0.4304101878353346, "learning_rate": 9.669912022623626e-06, "loss": 0.0332, "step": 2290 }, { "epoch": 0.6143738267632073, "grad_norm": 0.47724570133383293, "learning_rate": 9.669354310805359e-06, "loss": 0.0372, "step": 2291 }, { "epoch": 0.6146419951729686, "grad_norm": 0.501098182017171, "learning_rate": 9.668796144344459e-06, "loss": 0.0332, "step": 2292 }, { "epoch": 0.61491016358273, "grad_norm": 0.4038034464914963, "learning_rate": 9.668237523295272e-06, "loss": 0.0374, "step": 2293 }, { "epoch": 0.6151783319924913, "grad_norm": 0.9153982800595213, "learning_rate": 9.667678447712194e-06, "loss": 0.0453, "step": 2294 }, { "epoch": 0.6154465004022526, "grad_norm": 0.5609581816682323, "learning_rate": 9.667118917649656e-06, "loss": 0.0532, "step": 2295 }, { "epoch": 0.615714668812014, "grad_norm": 0.3613140997722086, "learning_rate": 9.66655893316214e-06, "loss": 0.025, "step": 2296 }, { "epoch": 0.6159828372217753, "grad_norm": 0.5666033023724834, "learning_rate": 9.665998494304172e-06, "loss": 0.0434, "step": 2297 }, { "epoch": 0.6162510056315366, "grad_norm": 0.7711228743063872, "learning_rate": 9.665437601130318e-06, "loss": 0.0433, "step": 2298 }, { "epoch": 0.616519174041298, "grad_norm": 0.6214107150343847, "learning_rate": 9.66487625369519e-06, "loss": 0.0393, "step": 2299 }, { "epoch": 0.6167873424510593, "grad_norm": 0.5390563830457505, "learning_rate": 9.664314452053447e-06, "loss": 0.062, "step": 2300 }, { "epoch": 0.6170555108608206, "grad_norm": 0.3856033220879983, "learning_rate": 9.66375219625979e-06, "loss": 0.0391, "step": 2301 }, { "epoch": 0.617323679270582, "grad_norm": 0.5878638617531717, "learning_rate": 9.663189486368962e-06, "loss": 0.0601, "step": 2302 }, { "epoch": 0.6175918476803433, "grad_norm": 0.4563286618497218, "learning_rate": 9.662626322435755e-06, "loss": 0.0331, "step": 2303 }, { "epoch": 0.6178600160901045, "grad_norm": 0.7963332856275146, "learning_rate": 9.662062704515003e-06, "loss": 0.0453, "step": 2304 }, { "epoch": 0.618128184499866, "grad_norm": 0.3389645185938771, "learning_rate": 9.661498632661582e-06, "loss": 0.0279, "step": 2305 }, { "epoch": 0.6183963529096272, "grad_norm": 0.4686813651433305, "learning_rate": 9.660934106930416e-06, "loss": 0.0473, "step": 2306 }, { "epoch": 0.6186645213193885, "grad_norm": 0.46239118270460333, "learning_rate": 9.660369127376469e-06, "loss": 0.0418, "step": 2307 }, { "epoch": 0.6189326897291499, "grad_norm": 0.3264683836244514, "learning_rate": 9.659803694054753e-06, "loss": 0.0394, "step": 2308 }, { "epoch": 0.6192008581389112, "grad_norm": 0.43244190268315214, "learning_rate": 9.659237807020326e-06, "loss": 0.039, "step": 2309 }, { "epoch": 0.6194690265486725, "grad_norm": 0.47067277918301836, "learning_rate": 9.65867146632828e-06, "loss": 0.0468, "step": 2310 }, { "epoch": 0.6197371949584339, "grad_norm": 0.5989890643605386, "learning_rate": 9.658104672033763e-06, "loss": 0.0425, "step": 2311 }, { "epoch": 0.6200053633681952, "grad_norm": 0.293211081035094, "learning_rate": 9.657537424191964e-06, "loss": 0.0327, "step": 2312 }, { "epoch": 0.6202735317779565, "grad_norm": 0.4091336108375734, "learning_rate": 9.656969722858108e-06, "loss": 0.0367, "step": 2313 }, { "epoch": 0.6205417001877179, "grad_norm": 0.46586720171971074, "learning_rate": 9.656401568087475e-06, "loss": 0.0461, "step": 2314 }, { "epoch": 0.6208098685974792, "grad_norm": 0.49094714545753565, "learning_rate": 9.655832959935382e-06, "loss": 0.0485, "step": 2315 }, { "epoch": 0.6210780370072405, "grad_norm": 0.43078934717974526, "learning_rate": 9.6552638984572e-06, "loss": 0.0398, "step": 2316 }, { "epoch": 0.6213462054170019, "grad_norm": 0.46902333369633836, "learning_rate": 9.654694383708327e-06, "loss": 0.0559, "step": 2317 }, { "epoch": 0.6216143738267632, "grad_norm": 0.3883938870513487, "learning_rate": 9.654124415744222e-06, "loss": 0.0353, "step": 2318 }, { "epoch": 0.6218825422365245, "grad_norm": 0.5478775503864985, "learning_rate": 9.653553994620378e-06, "loss": 0.0468, "step": 2319 }, { "epoch": 0.6221507106462859, "grad_norm": 0.44792210367963803, "learning_rate": 9.652983120392336e-06, "loss": 0.035, "step": 2320 }, { "epoch": 0.6224188790560472, "grad_norm": 0.5006488537645952, "learning_rate": 9.652411793115681e-06, "loss": 0.0448, "step": 2321 }, { "epoch": 0.6226870474658085, "grad_norm": 0.3595507458608492, "learning_rate": 9.651840012846043e-06, "loss": 0.0322, "step": 2322 }, { "epoch": 0.6229552158755699, "grad_norm": 0.30493458510324334, "learning_rate": 9.651267779639093e-06, "loss": 0.0295, "step": 2323 }, { "epoch": 0.6232233842853312, "grad_norm": 0.2718145354244287, "learning_rate": 9.650695093550549e-06, "loss": 0.0322, "step": 2324 }, { "epoch": 0.6234915526950925, "grad_norm": 0.4115545924262125, "learning_rate": 9.650121954636169e-06, "loss": 0.0376, "step": 2325 }, { "epoch": 0.6237597211048539, "grad_norm": 0.48162397063309037, "learning_rate": 9.649548362951762e-06, "loss": 0.0389, "step": 2326 }, { "epoch": 0.6240278895146152, "grad_norm": 0.2898629326587395, "learning_rate": 9.648974318553173e-06, "loss": 0.0256, "step": 2327 }, { "epoch": 0.6242960579243765, "grad_norm": 0.3617439375559836, "learning_rate": 9.6483998214963e-06, "loss": 0.0338, "step": 2328 }, { "epoch": 0.6245642263341379, "grad_norm": 0.3574358550245054, "learning_rate": 9.647824871837076e-06, "loss": 0.0385, "step": 2329 }, { "epoch": 0.6248323947438992, "grad_norm": 0.38540020261967123, "learning_rate": 9.647249469631484e-06, "loss": 0.0456, "step": 2330 }, { "epoch": 0.6251005631536605, "grad_norm": 0.3896947160111264, "learning_rate": 9.64667361493555e-06, "loss": 0.0423, "step": 2331 }, { "epoch": 0.6253687315634219, "grad_norm": 0.31638908917071545, "learning_rate": 9.646097307805341e-06, "loss": 0.0307, "step": 2332 }, { "epoch": 0.6256368999731832, "grad_norm": 0.6441690799074042, "learning_rate": 9.645520548296975e-06, "loss": 0.043, "step": 2333 }, { "epoch": 0.6259050683829445, "grad_norm": 0.47773262502237623, "learning_rate": 9.644943336466605e-06, "loss": 0.0502, "step": 2334 }, { "epoch": 0.6261732367927059, "grad_norm": 0.42448614977277965, "learning_rate": 9.644365672370437e-06, "loss": 0.0481, "step": 2335 }, { "epoch": 0.6264414052024672, "grad_norm": 0.3902042113230199, "learning_rate": 9.643787556064713e-06, "loss": 0.0392, "step": 2336 }, { "epoch": 0.6267095736122285, "grad_norm": 0.4046381474320068, "learning_rate": 9.643208987605722e-06, "loss": 0.0364, "step": 2337 }, { "epoch": 0.6269777420219899, "grad_norm": 0.5577086271643042, "learning_rate": 9.642629967049799e-06, "loss": 0.0595, "step": 2338 }, { "epoch": 0.6272459104317512, "grad_norm": 0.4059179032915396, "learning_rate": 9.642050494453323e-06, "loss": 0.0377, "step": 2339 }, { "epoch": 0.6275140788415124, "grad_norm": 0.3274461308138133, "learning_rate": 9.641470569872715e-06, "loss": 0.0334, "step": 2340 }, { "epoch": 0.6277822472512739, "grad_norm": 0.4533377206356322, "learning_rate": 9.64089019336444e-06, "loss": 0.0379, "step": 2341 }, { "epoch": 0.6280504156610351, "grad_norm": 0.41142488777789143, "learning_rate": 9.64030936498501e-06, "loss": 0.0417, "step": 2342 }, { "epoch": 0.6283185840707964, "grad_norm": 0.31353325891357314, "learning_rate": 9.639728084790976e-06, "loss": 0.031, "step": 2343 }, { "epoch": 0.6285867524805578, "grad_norm": 0.43536443553142146, "learning_rate": 9.639146352838935e-06, "loss": 0.0533, "step": 2344 }, { "epoch": 0.6288549208903191, "grad_norm": 0.4717141453954006, "learning_rate": 9.638564169185532e-06, "loss": 0.0461, "step": 2345 }, { "epoch": 0.6291230893000804, "grad_norm": 0.3298703550734682, "learning_rate": 9.63798153388745e-06, "loss": 0.0295, "step": 2346 }, { "epoch": 0.6293912577098418, "grad_norm": 0.44540558681983444, "learning_rate": 9.63739844700142e-06, "loss": 0.0378, "step": 2347 }, { "epoch": 0.6296594261196031, "grad_norm": 0.35193675516462675, "learning_rate": 9.636814908584215e-06, "loss": 0.0448, "step": 2348 }, { "epoch": 0.6299275945293644, "grad_norm": 0.5073069239342581, "learning_rate": 9.636230918692654e-06, "loss": 0.0565, "step": 2349 }, { "epoch": 0.6301957629391258, "grad_norm": 0.41555359739675146, "learning_rate": 9.635646477383597e-06, "loss": 0.0304, "step": 2350 }, { "epoch": 0.6304639313488871, "grad_norm": 0.5570800204635162, "learning_rate": 9.635061584713949e-06, "loss": 0.0433, "step": 2351 }, { "epoch": 0.6307320997586484, "grad_norm": 0.333977143755449, "learning_rate": 9.634476240740662e-06, "loss": 0.0305, "step": 2352 }, { "epoch": 0.6310002681684097, "grad_norm": 0.4597261511904927, "learning_rate": 9.633890445520726e-06, "loss": 0.0322, "step": 2353 }, { "epoch": 0.6312684365781711, "grad_norm": 0.34734635337566555, "learning_rate": 9.63330419911118e-06, "loss": 0.0407, "step": 2354 }, { "epoch": 0.6315366049879324, "grad_norm": 0.37833481662675106, "learning_rate": 9.632717501569106e-06, "loss": 0.0442, "step": 2355 }, { "epoch": 0.6318047733976937, "grad_norm": 0.38784359330636803, "learning_rate": 9.63213035295163e-06, "loss": 0.0359, "step": 2356 }, { "epoch": 0.6320729418074551, "grad_norm": 0.30656933615245535, "learning_rate": 9.631542753315917e-06, "loss": 0.0297, "step": 2357 }, { "epoch": 0.6323411102172164, "grad_norm": 0.4015790915141212, "learning_rate": 9.630954702719184e-06, "loss": 0.0418, "step": 2358 }, { "epoch": 0.6326092786269777, "grad_norm": 0.4126321499975713, "learning_rate": 9.630366201218686e-06, "loss": 0.0443, "step": 2359 }, { "epoch": 0.6328774470367391, "grad_norm": 0.48963380843404836, "learning_rate": 9.629777248871725e-06, "loss": 0.0345, "step": 2360 }, { "epoch": 0.6331456154465004, "grad_norm": 0.46095669428663266, "learning_rate": 9.629187845735645e-06, "loss": 0.0335, "step": 2361 }, { "epoch": 0.6334137838562617, "grad_norm": 0.5838893428481031, "learning_rate": 9.628597991867834e-06, "loss": 0.0575, "step": 2362 }, { "epoch": 0.6336819522660231, "grad_norm": 0.325218286175406, "learning_rate": 9.628007687325726e-06, "loss": 0.0372, "step": 2363 }, { "epoch": 0.6339501206757844, "grad_norm": 0.9756261661285838, "learning_rate": 9.627416932166795e-06, "loss": 0.0524, "step": 2364 }, { "epoch": 0.6342182890855457, "grad_norm": 0.36053462758323435, "learning_rate": 9.626825726448565e-06, "loss": 0.0361, "step": 2365 }, { "epoch": 0.6344864574953071, "grad_norm": 0.3325114396441304, "learning_rate": 9.626234070228597e-06, "loss": 0.0325, "step": 2366 }, { "epoch": 0.6347546259050684, "grad_norm": 0.46119397884936003, "learning_rate": 9.6256419635645e-06, "loss": 0.0376, "step": 2367 }, { "epoch": 0.6350227943148297, "grad_norm": 0.4507052840812779, "learning_rate": 9.625049406513926e-06, "loss": 0.0444, "step": 2368 }, { "epoch": 0.6352909627245911, "grad_norm": 0.3856328897999926, "learning_rate": 9.624456399134571e-06, "loss": 0.0396, "step": 2369 }, { "epoch": 0.6355591311343524, "grad_norm": 0.3756979633762363, "learning_rate": 9.623862941484174e-06, "loss": 0.039, "step": 2370 }, { "epoch": 0.6358272995441137, "grad_norm": 0.2936440553209829, "learning_rate": 9.623269033620518e-06, "loss": 0.0219, "step": 2371 }, { "epoch": 0.6360954679538751, "grad_norm": 0.8162721629582562, "learning_rate": 9.622674675601431e-06, "loss": 0.0483, "step": 2372 }, { "epoch": 0.6363636363636364, "grad_norm": 0.3916503536588522, "learning_rate": 9.622079867484785e-06, "loss": 0.032, "step": 2373 }, { "epoch": 0.6366318047733976, "grad_norm": 0.5746142914165883, "learning_rate": 9.621484609328491e-06, "loss": 0.0417, "step": 2374 }, { "epoch": 0.636899973183159, "grad_norm": 0.4322147575797061, "learning_rate": 9.620888901190513e-06, "loss": 0.0442, "step": 2375 }, { "epoch": 0.6371681415929203, "grad_norm": 0.3742842450561866, "learning_rate": 9.62029274312885e-06, "loss": 0.0447, "step": 2376 }, { "epoch": 0.6374363100026816, "grad_norm": 0.3680071447793632, "learning_rate": 9.619696135201549e-06, "loss": 0.0313, "step": 2377 }, { "epoch": 0.637704478412443, "grad_norm": 0.3939743367736881, "learning_rate": 9.619099077466699e-06, "loss": 0.0406, "step": 2378 }, { "epoch": 0.6379726468222043, "grad_norm": 0.40012194755094194, "learning_rate": 9.618501569982437e-06, "loss": 0.0445, "step": 2379 }, { "epoch": 0.6382408152319656, "grad_norm": 0.2973338726410971, "learning_rate": 9.617903612806938e-06, "loss": 0.0328, "step": 2380 }, { "epoch": 0.638508983641727, "grad_norm": 0.6488117433440108, "learning_rate": 9.617305205998427e-06, "loss": 0.0536, "step": 2381 }, { "epoch": 0.6387771520514883, "grad_norm": 0.49315862534546395, "learning_rate": 9.616706349615163e-06, "loss": 0.0461, "step": 2382 }, { "epoch": 0.6390453204612496, "grad_norm": 0.45762092630839146, "learning_rate": 9.616107043715462e-06, "loss": 0.0364, "step": 2383 }, { "epoch": 0.639313488871011, "grad_norm": 0.5471956231718901, "learning_rate": 9.615507288357671e-06, "loss": 0.0438, "step": 2384 }, { "epoch": 0.6395816572807723, "grad_norm": 0.5165387190817277, "learning_rate": 9.614907083600191e-06, "loss": 0.0594, "step": 2385 }, { "epoch": 0.6398498256905336, "grad_norm": 0.2879088575027302, "learning_rate": 9.614306429501461e-06, "loss": 0.026, "step": 2386 }, { "epoch": 0.640117994100295, "grad_norm": 0.33367566019930045, "learning_rate": 9.613705326119964e-06, "loss": 0.0354, "step": 2387 }, { "epoch": 0.6403861625100563, "grad_norm": 0.47695440721622656, "learning_rate": 9.613103773514229e-06, "loss": 0.0505, "step": 2388 }, { "epoch": 0.6406543309198176, "grad_norm": 0.39381038539484803, "learning_rate": 9.612501771742825e-06, "loss": 0.04, "step": 2389 }, { "epoch": 0.640922499329579, "grad_norm": 0.30907986913105073, "learning_rate": 9.611899320864374e-06, "loss": 0.0294, "step": 2390 }, { "epoch": 0.6411906677393403, "grad_norm": 0.39142191760284184, "learning_rate": 9.611296420937526e-06, "loss": 0.0461, "step": 2391 }, { "epoch": 0.6414588361491016, "grad_norm": 0.6549380522117229, "learning_rate": 9.61069307202099e-06, "loss": 0.0641, "step": 2392 }, { "epoch": 0.641727004558863, "grad_norm": 0.4241414654650714, "learning_rate": 9.61008927417351e-06, "loss": 0.0418, "step": 2393 }, { "epoch": 0.6419951729686243, "grad_norm": 0.6335894993721941, "learning_rate": 9.609485027453878e-06, "loss": 0.0361, "step": 2394 }, { "epoch": 0.6422633413783856, "grad_norm": 0.3288830427939089, "learning_rate": 9.608880331920927e-06, "loss": 0.0347, "step": 2395 }, { "epoch": 0.642531509788147, "grad_norm": 0.5051519359924269, "learning_rate": 9.608275187633533e-06, "loss": 0.0322, "step": 2396 }, { "epoch": 0.6427996781979083, "grad_norm": 0.47400700635569226, "learning_rate": 9.60766959465062e-06, "loss": 0.0334, "step": 2397 }, { "epoch": 0.6430678466076696, "grad_norm": 0.3763986125445419, "learning_rate": 9.607063553031152e-06, "loss": 0.0339, "step": 2398 }, { "epoch": 0.643336015017431, "grad_norm": 0.5680003205967475, "learning_rate": 9.606457062834137e-06, "loss": 0.0495, "step": 2399 }, { "epoch": 0.6436041834271923, "grad_norm": 0.4451727428090002, "learning_rate": 9.605850124118627e-06, "loss": 0.034, "step": 2400 }, { "epoch": 0.6438723518369536, "grad_norm": 0.9832018030631693, "learning_rate": 9.605242736943718e-06, "loss": 0.037, "step": 2401 }, { "epoch": 0.644140520246715, "grad_norm": 0.3035305558820535, "learning_rate": 9.604634901368553e-06, "loss": 0.0343, "step": 2402 }, { "epoch": 0.6444086886564763, "grad_norm": 0.6354074868499225, "learning_rate": 9.60402661745231e-06, "loss": 0.0471, "step": 2403 }, { "epoch": 0.6446768570662376, "grad_norm": 0.34136416753254806, "learning_rate": 9.60341788525422e-06, "loss": 0.049, "step": 2404 }, { "epoch": 0.644945025475999, "grad_norm": 0.626553087598181, "learning_rate": 9.602808704833553e-06, "loss": 0.0467, "step": 2405 }, { "epoch": 0.6452131938857603, "grad_norm": 0.5093310108556991, "learning_rate": 9.602199076249621e-06, "loss": 0.0457, "step": 2406 }, { "epoch": 0.6454813622955216, "grad_norm": 0.33155382181522636, "learning_rate": 9.601588999561784e-06, "loss": 0.0368, "step": 2407 }, { "epoch": 0.645749530705283, "grad_norm": 0.6191788648658846, "learning_rate": 9.600978474829443e-06, "loss": 0.0336, "step": 2408 }, { "epoch": 0.6460176991150443, "grad_norm": 0.33634917035568584, "learning_rate": 9.600367502112044e-06, "loss": 0.0296, "step": 2409 }, { "epoch": 0.6462858675248055, "grad_norm": 0.5203711763493594, "learning_rate": 9.599756081469076e-06, "loss": 0.0415, "step": 2410 }, { "epoch": 0.646554035934567, "grad_norm": 0.4494497834554676, "learning_rate": 9.59914421296007e-06, "loss": 0.0398, "step": 2411 }, { "epoch": 0.6468222043443282, "grad_norm": 0.42143423700300287, "learning_rate": 9.5985318966446e-06, "loss": 0.0389, "step": 2412 }, { "epoch": 0.6470903727540895, "grad_norm": 0.42023154114947653, "learning_rate": 9.59791913258229e-06, "loss": 0.0462, "step": 2413 }, { "epoch": 0.6473585411638509, "grad_norm": 0.35897589537607527, "learning_rate": 9.597305920832801e-06, "loss": 0.0313, "step": 2414 }, { "epoch": 0.6476267095736122, "grad_norm": 0.3570337438373847, "learning_rate": 9.59669226145584e-06, "loss": 0.0304, "step": 2415 }, { "epoch": 0.6478948779833735, "grad_norm": 0.510795561905092, "learning_rate": 9.596078154511159e-06, "loss": 0.0473, "step": 2416 }, { "epoch": 0.6481630463931349, "grad_norm": 1.0685936705529246, "learning_rate": 9.595463600058547e-06, "loss": 0.0446, "step": 2417 }, { "epoch": 0.6484312148028962, "grad_norm": 0.32259851236522114, "learning_rate": 9.594848598157848e-06, "loss": 0.0376, "step": 2418 }, { "epoch": 0.6486993832126575, "grad_norm": 0.6224002037008342, "learning_rate": 9.59423314886894e-06, "loss": 0.0524, "step": 2419 }, { "epoch": 0.6489675516224189, "grad_norm": 0.43694412247903197, "learning_rate": 9.593617252251747e-06, "loss": 0.0465, "step": 2420 }, { "epoch": 0.6492357200321802, "grad_norm": 0.5181856402056153, "learning_rate": 9.593000908366238e-06, "loss": 0.0516, "step": 2421 }, { "epoch": 0.6495038884419415, "grad_norm": 0.36826459683129004, "learning_rate": 9.592384117272424e-06, "loss": 0.0291, "step": 2422 }, { "epoch": 0.6497720568517029, "grad_norm": 0.43393607236531845, "learning_rate": 9.591766879030363e-06, "loss": 0.041, "step": 2423 }, { "epoch": 0.6500402252614642, "grad_norm": 0.3962791427376134, "learning_rate": 9.59114919370015e-06, "loss": 0.0341, "step": 2424 }, { "epoch": 0.6503083936712255, "grad_norm": 0.6295076291848843, "learning_rate": 9.59053106134193e-06, "loss": 0.053, "step": 2425 }, { "epoch": 0.6505765620809869, "grad_norm": 0.39380011469837295, "learning_rate": 9.589912482015888e-06, "loss": 0.0383, "step": 2426 }, { "epoch": 0.6508447304907482, "grad_norm": 0.2598954492813354, "learning_rate": 9.589293455782253e-06, "loss": 0.0239, "step": 2427 }, { "epoch": 0.6511128989005095, "grad_norm": 0.817868949073019, "learning_rate": 9.5886739827013e-06, "loss": 0.0425, "step": 2428 }, { "epoch": 0.6513810673102709, "grad_norm": 0.44105717206507045, "learning_rate": 9.588054062833343e-06, "loss": 0.0447, "step": 2429 }, { "epoch": 0.6516492357200322, "grad_norm": 0.42351482218457953, "learning_rate": 9.587433696238742e-06, "loss": 0.0361, "step": 2430 }, { "epoch": 0.6519174041297935, "grad_norm": 0.3201253240795514, "learning_rate": 9.586812882977902e-06, "loss": 0.035, "step": 2431 }, { "epoch": 0.6521855725395549, "grad_norm": 0.31174656582466975, "learning_rate": 9.586191623111272e-06, "loss": 0.0301, "step": 2432 }, { "epoch": 0.6524537409493162, "grad_norm": 0.2379491745497701, "learning_rate": 9.585569916699337e-06, "loss": 0.0258, "step": 2433 }, { "epoch": 0.6527219093590775, "grad_norm": 0.3597768311397914, "learning_rate": 9.584947763802635e-06, "loss": 0.0375, "step": 2434 }, { "epoch": 0.6529900777688389, "grad_norm": 0.8436306951206857, "learning_rate": 9.58432516448174e-06, "loss": 0.0551, "step": 2435 }, { "epoch": 0.6532582461786002, "grad_norm": 0.4146036138808878, "learning_rate": 9.583702118797277e-06, "loss": 0.0432, "step": 2436 }, { "epoch": 0.6535264145883615, "grad_norm": 0.5496255201024303, "learning_rate": 9.583078626809907e-06, "loss": 0.0525, "step": 2437 }, { "epoch": 0.6537945829981229, "grad_norm": 0.3729896976666874, "learning_rate": 9.58245468858034e-06, "loss": 0.0284, "step": 2438 }, { "epoch": 0.6540627514078842, "grad_norm": 1.127738702096665, "learning_rate": 9.581830304169325e-06, "loss": 0.038, "step": 2439 }, { "epoch": 0.6543309198176455, "grad_norm": 0.3253717241871863, "learning_rate": 9.581205473637659e-06, "loss": 0.0343, "step": 2440 }, { "epoch": 0.6545990882274069, "grad_norm": 1.0452832246653085, "learning_rate": 9.580580197046177e-06, "loss": 0.0434, "step": 2441 }, { "epoch": 0.6548672566371682, "grad_norm": 0.35309634759056413, "learning_rate": 9.579954474455763e-06, "loss": 0.0361, "step": 2442 }, { "epoch": 0.6551354250469295, "grad_norm": 0.47579145737988227, "learning_rate": 9.579328305927343e-06, "loss": 0.0384, "step": 2443 }, { "epoch": 0.6554035934566907, "grad_norm": 0.8231036525009132, "learning_rate": 9.578701691521884e-06, "loss": 0.0434, "step": 2444 }, { "epoch": 0.6556717618664522, "grad_norm": 0.4654866188529236, "learning_rate": 9.578074631300396e-06, "loss": 0.0361, "step": 2445 }, { "epoch": 0.6559399302762134, "grad_norm": 0.36001860117085316, "learning_rate": 9.577447125323935e-06, "loss": 0.031, "step": 2446 }, { "epoch": 0.6562080986859747, "grad_norm": 0.2925098776649432, "learning_rate": 9.576819173653602e-06, "loss": 0.0295, "step": 2447 }, { "epoch": 0.6564762670957361, "grad_norm": 0.3427065794250314, "learning_rate": 9.576190776350536e-06, "loss": 0.0261, "step": 2448 }, { "epoch": 0.6567444355054974, "grad_norm": 0.9873337287243593, "learning_rate": 9.575561933475926e-06, "loss": 0.0388, "step": 2449 }, { "epoch": 0.6570126039152587, "grad_norm": 0.4922538569310218, "learning_rate": 9.574932645090995e-06, "loss": 0.0472, "step": 2450 }, { "epoch": 0.6572807723250201, "grad_norm": 0.5722429065472527, "learning_rate": 9.574302911257021e-06, "loss": 0.04, "step": 2451 }, { "epoch": 0.6575489407347814, "grad_norm": 0.64442187215386, "learning_rate": 9.573672732035315e-06, "loss": 0.0429, "step": 2452 }, { "epoch": 0.6578171091445427, "grad_norm": 0.43402451605292736, "learning_rate": 9.57304210748724e-06, "loss": 0.0388, "step": 2453 }, { "epoch": 0.6580852775543041, "grad_norm": 0.3536492387021235, "learning_rate": 9.572411037674196e-06, "loss": 0.0304, "step": 2454 }, { "epoch": 0.6583534459640654, "grad_norm": 0.4402933639571913, "learning_rate": 9.571779522657628e-06, "loss": 0.0378, "step": 2455 }, { "epoch": 0.6586216143738267, "grad_norm": 0.3869416724247881, "learning_rate": 9.571147562499026e-06, "loss": 0.0429, "step": 2456 }, { "epoch": 0.6588897827835881, "grad_norm": 0.49193432809463833, "learning_rate": 9.57051515725992e-06, "loss": 0.057, "step": 2457 }, { "epoch": 0.6591579511933494, "grad_norm": 0.5806472762725997, "learning_rate": 9.569882307001891e-06, "loss": 0.0451, "step": 2458 }, { "epoch": 0.6594261196031107, "grad_norm": 0.46980681800654517, "learning_rate": 9.569249011786552e-06, "loss": 0.0455, "step": 2459 }, { "epoch": 0.6596942880128721, "grad_norm": 0.44967370251054656, "learning_rate": 9.56861527167557e-06, "loss": 0.0456, "step": 2460 }, { "epoch": 0.6599624564226334, "grad_norm": 0.5928011679265865, "learning_rate": 9.567981086730646e-06, "loss": 0.0416, "step": 2461 }, { "epoch": 0.6602306248323947, "grad_norm": 0.6376397959798353, "learning_rate": 9.567346457013533e-06, "loss": 0.0569, "step": 2462 }, { "epoch": 0.6604987932421561, "grad_norm": 0.21152571402982312, "learning_rate": 9.566711382586022e-06, "loss": 0.0237, "step": 2463 }, { "epoch": 0.6607669616519174, "grad_norm": 0.4689455073504569, "learning_rate": 9.566075863509947e-06, "loss": 0.0394, "step": 2464 }, { "epoch": 0.6610351300616787, "grad_norm": 0.4584965534072919, "learning_rate": 9.565439899847188e-06, "loss": 0.0519, "step": 2465 }, { "epoch": 0.6613032984714401, "grad_norm": 0.49901060559888427, "learning_rate": 9.564803491659668e-06, "loss": 0.0446, "step": 2466 }, { "epoch": 0.6615714668812014, "grad_norm": 0.34127402888108216, "learning_rate": 9.564166639009351e-06, "loss": 0.0357, "step": 2467 }, { "epoch": 0.6618396352909627, "grad_norm": 0.31564006057320915, "learning_rate": 9.563529341958245e-06, "loss": 0.0334, "step": 2468 }, { "epoch": 0.6621078037007241, "grad_norm": 0.2805183913596248, "learning_rate": 9.562891600568403e-06, "loss": 0.0298, "step": 2469 }, { "epoch": 0.6623759721104854, "grad_norm": 0.2861276794650215, "learning_rate": 9.562253414901921e-06, "loss": 0.0302, "step": 2470 }, { "epoch": 0.6626441405202467, "grad_norm": 0.4707432008524316, "learning_rate": 9.561614785020937e-06, "loss": 0.0467, "step": 2471 }, { "epoch": 0.6629123089300081, "grad_norm": 0.38525907337681276, "learning_rate": 9.56097571098763e-06, "loss": 0.0478, "step": 2472 }, { "epoch": 0.6631804773397694, "grad_norm": 0.455098753017548, "learning_rate": 9.560336192864227e-06, "loss": 0.0396, "step": 2473 }, { "epoch": 0.6634486457495307, "grad_norm": 0.4910643876533477, "learning_rate": 9.559696230713e-06, "loss": 0.0338, "step": 2474 }, { "epoch": 0.6637168141592921, "grad_norm": 0.3463796325396375, "learning_rate": 9.559055824596252e-06, "loss": 0.025, "step": 2475 }, { "epoch": 0.6639849825690534, "grad_norm": 0.3507585873572776, "learning_rate": 9.558414974576346e-06, "loss": 0.0351, "step": 2476 }, { "epoch": 0.6642531509788147, "grad_norm": 0.3371623164827828, "learning_rate": 9.557773680715676e-06, "loss": 0.0315, "step": 2477 }, { "epoch": 0.6645213193885761, "grad_norm": 0.3495124463857507, "learning_rate": 9.557131943076683e-06, "loss": 0.0342, "step": 2478 }, { "epoch": 0.6647894877983374, "grad_norm": 0.591054972580119, "learning_rate": 9.55648976172185e-06, "loss": 0.0467, "step": 2479 }, { "epoch": 0.6650576562080986, "grad_norm": 0.4820148096233884, "learning_rate": 9.555847136713707e-06, "loss": 0.0512, "step": 2480 }, { "epoch": 0.66532582461786, "grad_norm": 0.27898432702346293, "learning_rate": 9.555204068114826e-06, "loss": 0.0321, "step": 2481 }, { "epoch": 0.6655939930276213, "grad_norm": 1.0538362878660053, "learning_rate": 9.554560555987816e-06, "loss": 0.0505, "step": 2482 }, { "epoch": 0.6658621614373826, "grad_norm": 0.3751456903140579, "learning_rate": 9.55391660039534e-06, "loss": 0.0309, "step": 2483 }, { "epoch": 0.666130329847144, "grad_norm": 0.38022316589501687, "learning_rate": 9.55327220140009e-06, "loss": 0.0367, "step": 2484 }, { "epoch": 0.6663984982569053, "grad_norm": 0.44799752919908253, "learning_rate": 9.55262735906482e-06, "loss": 0.0455, "step": 2485 }, { "epoch": 0.6666666666666666, "grad_norm": 0.5767863237603938, "learning_rate": 9.551982073452305e-06, "loss": 0.0463, "step": 2486 }, { "epoch": 0.666934835076428, "grad_norm": 0.25303247890909913, "learning_rate": 9.551336344625387e-06, "loss": 0.0286, "step": 2487 }, { "epoch": 0.6672030034861893, "grad_norm": 0.29930611475809443, "learning_rate": 9.550690172646928e-06, "loss": 0.0284, "step": 2488 }, { "epoch": 0.6674711718959506, "grad_norm": 0.502576244389439, "learning_rate": 9.55004355757985e-06, "loss": 0.0388, "step": 2489 }, { "epoch": 0.667739340305712, "grad_norm": 0.5509057566369913, "learning_rate": 9.549396499487112e-06, "loss": 0.0502, "step": 2490 }, { "epoch": 0.6680075087154733, "grad_norm": 0.4595956673690557, "learning_rate": 9.548748998431715e-06, "loss": 0.0479, "step": 2491 }, { "epoch": 0.6682756771252346, "grad_norm": 0.792796775270736, "learning_rate": 9.548101054476704e-06, "loss": 0.0645, "step": 2492 }, { "epoch": 0.668543845534996, "grad_norm": 0.3158969278603121, "learning_rate": 9.54745266768517e-06, "loss": 0.038, "step": 2493 }, { "epoch": 0.6688120139447573, "grad_norm": 0.40803514424811943, "learning_rate": 9.546803838120241e-06, "loss": 0.039, "step": 2494 }, { "epoch": 0.6690801823545186, "grad_norm": 0.5313483773031777, "learning_rate": 9.546154565845095e-06, "loss": 0.042, "step": 2495 }, { "epoch": 0.66934835076428, "grad_norm": 0.4605502204898016, "learning_rate": 9.545504850922947e-06, "loss": 0.0415, "step": 2496 }, { "epoch": 0.6696165191740413, "grad_norm": 0.45128250519507584, "learning_rate": 9.544854693417061e-06, "loss": 0.0364, "step": 2497 }, { "epoch": 0.6698846875838026, "grad_norm": 0.47598216038976343, "learning_rate": 9.544204093390741e-06, "loss": 0.0495, "step": 2498 }, { "epoch": 0.670152855993564, "grad_norm": 0.4048113685517227, "learning_rate": 9.543553050907332e-06, "loss": 0.0369, "step": 2499 }, { "epoch": 0.6704210244033253, "grad_norm": 0.35790476949815786, "learning_rate": 9.542901566030225e-06, "loss": 0.0333, "step": 2500 }, { "epoch": 0.6706891928130866, "grad_norm": 0.35151156933088024, "learning_rate": 9.542249638822854e-06, "loss": 0.0283, "step": 2501 }, { "epoch": 0.670957361222848, "grad_norm": 0.8957338686836287, "learning_rate": 9.541597269348695e-06, "loss": 0.0649, "step": 2502 }, { "epoch": 0.6712255296326093, "grad_norm": 0.3778752702840763, "learning_rate": 9.540944457671267e-06, "loss": 0.0339, "step": 2503 }, { "epoch": 0.6714936980423706, "grad_norm": 0.4435685329985762, "learning_rate": 9.540291203854135e-06, "loss": 0.0342, "step": 2504 }, { "epoch": 0.671761866452132, "grad_norm": 0.3457214893318813, "learning_rate": 9.5396375079609e-06, "loss": 0.0354, "step": 2505 }, { "epoch": 0.6720300348618933, "grad_norm": 0.582011908779313, "learning_rate": 9.538983370055215e-06, "loss": 0.0331, "step": 2506 }, { "epoch": 0.6722982032716546, "grad_norm": 0.4103006624082808, "learning_rate": 9.53832879020077e-06, "loss": 0.0459, "step": 2507 }, { "epoch": 0.672566371681416, "grad_norm": 0.2693647014282593, "learning_rate": 9.5376737684613e-06, "loss": 0.0247, "step": 2508 }, { "epoch": 0.6728345400911773, "grad_norm": 0.4183588272621983, "learning_rate": 9.537018304900583e-06, "loss": 0.0393, "step": 2509 }, { "epoch": 0.6731027085009386, "grad_norm": 0.5369666746817175, "learning_rate": 9.536362399582438e-06, "loss": 0.0633, "step": 2510 }, { "epoch": 0.6733708769107, "grad_norm": 0.36759454430928884, "learning_rate": 9.53570605257073e-06, "loss": 0.0367, "step": 2511 }, { "epoch": 0.6736390453204613, "grad_norm": 0.3846405856360957, "learning_rate": 9.535049263929366e-06, "loss": 0.043, "step": 2512 }, { "epoch": 0.6739072137302226, "grad_norm": 0.41733956583088194, "learning_rate": 9.534392033722295e-06, "loss": 0.0409, "step": 2513 }, { "epoch": 0.674175382139984, "grad_norm": 0.6748139333251381, "learning_rate": 9.53373436201351e-06, "loss": 0.0425, "step": 2514 }, { "epoch": 0.6744435505497453, "grad_norm": 0.406760941920907, "learning_rate": 9.533076248867048e-06, "loss": 0.0331, "step": 2515 }, { "epoch": 0.6747117189595065, "grad_norm": 0.6370082990594801, "learning_rate": 9.532417694346987e-06, "loss": 0.0427, "step": 2516 }, { "epoch": 0.674979887369268, "grad_norm": 0.40993807671236726, "learning_rate": 9.531758698517449e-06, "loss": 0.0395, "step": 2517 }, { "epoch": 0.6752480557790292, "grad_norm": 0.43755432883047446, "learning_rate": 9.531099261442596e-06, "loss": 0.0328, "step": 2518 }, { "epoch": 0.6755162241887905, "grad_norm": 0.690390566912873, "learning_rate": 9.530439383186637e-06, "loss": 0.0687, "step": 2519 }, { "epoch": 0.6757843925985519, "grad_norm": 0.5754759549920266, "learning_rate": 9.529779063813826e-06, "loss": 0.0339, "step": 2520 }, { "epoch": 0.6760525610083132, "grad_norm": 0.7550415783642387, "learning_rate": 9.529118303388454e-06, "loss": 0.039, "step": 2521 }, { "epoch": 0.6763207294180745, "grad_norm": 0.35245417283232505, "learning_rate": 9.528457101974855e-06, "loss": 0.034, "step": 2522 }, { "epoch": 0.6765888978278359, "grad_norm": 0.7118386755008984, "learning_rate": 9.527795459637413e-06, "loss": 0.0458, "step": 2523 }, { "epoch": 0.6768570662375972, "grad_norm": 0.49006175605253444, "learning_rate": 9.527133376440547e-06, "loss": 0.0333, "step": 2524 }, { "epoch": 0.6771252346473585, "grad_norm": 0.2789279502086122, "learning_rate": 9.526470852448724e-06, "loss": 0.0243, "step": 2525 }, { "epoch": 0.6773934030571199, "grad_norm": 0.39351651684997646, "learning_rate": 9.52580788772645e-06, "loss": 0.0411, "step": 2526 }, { "epoch": 0.6776615714668812, "grad_norm": 0.4855281613138422, "learning_rate": 9.52514448233828e-06, "loss": 0.0293, "step": 2527 }, { "epoch": 0.6779297398766425, "grad_norm": 0.44212379171255034, "learning_rate": 9.524480636348804e-06, "loss": 0.0369, "step": 2528 }, { "epoch": 0.6781979082864039, "grad_norm": 0.44747876160053385, "learning_rate": 9.523816349822663e-06, "loss": 0.042, "step": 2529 }, { "epoch": 0.6784660766961652, "grad_norm": 0.6072242268782345, "learning_rate": 9.523151622824534e-06, "loss": 0.0462, "step": 2530 }, { "epoch": 0.6787342451059265, "grad_norm": 0.3222438521891234, "learning_rate": 9.522486455419137e-06, "loss": 0.0356, "step": 2531 }, { "epoch": 0.6790024135156878, "grad_norm": 0.3472575731295562, "learning_rate": 9.521820847671243e-06, "loss": 0.0355, "step": 2532 }, { "epoch": 0.6792705819254492, "grad_norm": 0.35892499406226547, "learning_rate": 9.52115479964566e-06, "loss": 0.0317, "step": 2533 }, { "epoch": 0.6795387503352105, "grad_norm": 0.4112503167761095, "learning_rate": 9.520488311407237e-06, "loss": 0.0458, "step": 2534 }, { "epoch": 0.6798069187449718, "grad_norm": 0.5881966467078341, "learning_rate": 9.519821383020866e-06, "loss": 0.0479, "step": 2535 }, { "epoch": 0.6800750871547332, "grad_norm": 0.4325103237279451, "learning_rate": 9.51915401455149e-06, "loss": 0.0387, "step": 2536 }, { "epoch": 0.6803432555644945, "grad_norm": 0.2996825749001986, "learning_rate": 9.518486206064084e-06, "loss": 0.0327, "step": 2537 }, { "epoch": 0.6806114239742558, "grad_norm": 0.6776460505679925, "learning_rate": 9.517817957623673e-06, "loss": 0.0326, "step": 2538 }, { "epoch": 0.6808795923840172, "grad_norm": 0.39865282048201456, "learning_rate": 9.517149269295321e-06, "loss": 0.0385, "step": 2539 }, { "epoch": 0.6811477607937785, "grad_norm": 0.40569190739321653, "learning_rate": 9.516480141144139e-06, "loss": 0.0403, "step": 2540 }, { "epoch": 0.6814159292035398, "grad_norm": 0.3684866639240183, "learning_rate": 9.515810573235275e-06, "loss": 0.0393, "step": 2541 }, { "epoch": 0.6816840976133012, "grad_norm": 0.3154899506669576, "learning_rate": 9.515140565633927e-06, "loss": 0.039, "step": 2542 }, { "epoch": 0.6819522660230625, "grad_norm": 0.3803868759522678, "learning_rate": 9.514470118405328e-06, "loss": 0.0381, "step": 2543 }, { "epoch": 0.6822204344328238, "grad_norm": 0.3177215285600402, "learning_rate": 9.51379923161476e-06, "loss": 0.039, "step": 2544 }, { "epoch": 0.6824886028425852, "grad_norm": 0.29728491779977984, "learning_rate": 9.513127905327545e-06, "loss": 0.0254, "step": 2545 }, { "epoch": 0.6827567712523465, "grad_norm": 0.4799023322003075, "learning_rate": 9.512456139609045e-06, "loss": 0.0449, "step": 2546 }, { "epoch": 0.6830249396621078, "grad_norm": 0.6179982179369042, "learning_rate": 9.511783934524674e-06, "loss": 0.0486, "step": 2547 }, { "epoch": 0.6832931080718692, "grad_norm": 0.3190895603249532, "learning_rate": 9.51111129013988e-06, "loss": 0.0353, "step": 2548 }, { "epoch": 0.6835612764816305, "grad_norm": 0.28988687997146284, "learning_rate": 9.510438206520155e-06, "loss": 0.0258, "step": 2549 }, { "epoch": 0.6838294448913917, "grad_norm": 0.4381121592562018, "learning_rate": 9.509764683731038e-06, "loss": 0.0324, "step": 2550 }, { "epoch": 0.6840976133011532, "grad_norm": 0.5883931011333319, "learning_rate": 9.509090721838106e-06, "loss": 0.0434, "step": 2551 }, { "epoch": 0.6843657817109144, "grad_norm": 0.4591180277738051, "learning_rate": 9.508416320906981e-06, "loss": 0.0376, "step": 2552 }, { "epoch": 0.6846339501206757, "grad_norm": 0.2808601787284627, "learning_rate": 9.50774148100333e-06, "loss": 0.0279, "step": 2553 }, { "epoch": 0.6849021185304371, "grad_norm": 0.33379721928923084, "learning_rate": 9.507066202192858e-06, "loss": 0.0374, "step": 2554 }, { "epoch": 0.6851702869401984, "grad_norm": 0.333905454460441, "learning_rate": 9.506390484541317e-06, "loss": 0.0326, "step": 2555 }, { "epoch": 0.6854384553499597, "grad_norm": 0.351033039229521, "learning_rate": 9.5057143281145e-06, "loss": 0.0503, "step": 2556 }, { "epoch": 0.6857066237597211, "grad_norm": 0.3489231349528481, "learning_rate": 9.505037732978237e-06, "loss": 0.0295, "step": 2557 }, { "epoch": 0.6859747921694824, "grad_norm": 0.38039790413061136, "learning_rate": 9.504360699198414e-06, "loss": 0.0512, "step": 2558 }, { "epoch": 0.6862429605792437, "grad_norm": 0.4325365751610192, "learning_rate": 9.503683226840948e-06, "loss": 0.048, "step": 2559 }, { "epoch": 0.6865111289890051, "grad_norm": 0.3766428564210721, "learning_rate": 9.503005315971803e-06, "loss": 0.0444, "step": 2560 }, { "epoch": 0.6867792973987664, "grad_norm": 0.267400591958884, "learning_rate": 9.502326966656984e-06, "loss": 0.0345, "step": 2561 }, { "epoch": 0.6870474658085277, "grad_norm": 0.36314328183183203, "learning_rate": 9.501648178962544e-06, "loss": 0.0396, "step": 2562 }, { "epoch": 0.6873156342182891, "grad_norm": 0.4466959766130045, "learning_rate": 9.500968952954572e-06, "loss": 0.0357, "step": 2563 }, { "epoch": 0.6875838026280504, "grad_norm": 0.5250677968669969, "learning_rate": 9.500289288699202e-06, "loss": 0.0372, "step": 2564 }, { "epoch": 0.6878519710378117, "grad_norm": 0.3719551829815669, "learning_rate": 9.499609186262612e-06, "loss": 0.0254, "step": 2565 }, { "epoch": 0.6881201394475731, "grad_norm": 0.5587475972831016, "learning_rate": 9.498928645711023e-06, "loss": 0.0392, "step": 2566 }, { "epoch": 0.6883883078573344, "grad_norm": 0.5663351284849398, "learning_rate": 9.498247667110696e-06, "loss": 0.0551, "step": 2567 }, { "epoch": 0.6886564762670957, "grad_norm": 0.30181982505256044, "learning_rate": 9.497566250527935e-06, "loss": 0.0312, "step": 2568 }, { "epoch": 0.6889246446768571, "grad_norm": 0.32836183299663185, "learning_rate": 9.49688439602909e-06, "loss": 0.0277, "step": 2569 }, { "epoch": 0.6891928130866184, "grad_norm": 0.45476675360163143, "learning_rate": 9.496202103680549e-06, "loss": 0.036, "step": 2570 }, { "epoch": 0.6894609814963797, "grad_norm": 0.3346050811291732, "learning_rate": 9.495519373548748e-06, "loss": 0.0361, "step": 2571 }, { "epoch": 0.6897291499061411, "grad_norm": 0.430181423539497, "learning_rate": 9.49483620570016e-06, "loss": 0.0379, "step": 2572 }, { "epoch": 0.6899973183159024, "grad_norm": 0.4251714909760362, "learning_rate": 9.494152600201306e-06, "loss": 0.0321, "step": 2573 }, { "epoch": 0.6902654867256637, "grad_norm": 0.5993517567355114, "learning_rate": 9.493468557118744e-06, "loss": 0.0458, "step": 2574 }, { "epoch": 0.6905336551354251, "grad_norm": 0.3365545433534593, "learning_rate": 9.492784076519077e-06, "loss": 0.0377, "step": 2575 }, { "epoch": 0.6908018235451864, "grad_norm": 0.4712980170300561, "learning_rate": 9.492099158468955e-06, "loss": 0.0532, "step": 2576 }, { "epoch": 0.6910699919549477, "grad_norm": 0.3653586012199385, "learning_rate": 9.491413803035062e-06, "loss": 0.0381, "step": 2577 }, { "epoch": 0.6913381603647091, "grad_norm": 0.42926558411140125, "learning_rate": 9.490728010284132e-06, "loss": 0.0418, "step": 2578 }, { "epoch": 0.6916063287744704, "grad_norm": 0.4791043152210209, "learning_rate": 9.490041780282939e-06, "loss": 0.0321, "step": 2579 }, { "epoch": 0.6918744971842317, "grad_norm": 0.45313370537527153, "learning_rate": 9.489355113098298e-06, "loss": 0.0516, "step": 2580 }, { "epoch": 0.6921426655939931, "grad_norm": 0.435819599578046, "learning_rate": 9.488668008797071e-06, "loss": 0.0415, "step": 2581 }, { "epoch": 0.6924108340037544, "grad_norm": 0.3933139340371445, "learning_rate": 9.487980467446153e-06, "loss": 0.0428, "step": 2582 }, { "epoch": 0.6926790024135157, "grad_norm": 0.4604090914208074, "learning_rate": 9.487292489112497e-06, "loss": 0.047, "step": 2583 }, { "epoch": 0.6929471708232771, "grad_norm": 0.28180255787866837, "learning_rate": 9.486604073863081e-06, "loss": 0.0338, "step": 2584 }, { "epoch": 0.6932153392330384, "grad_norm": 0.459239400187049, "learning_rate": 9.48591522176494e-06, "loss": 0.0413, "step": 2585 }, { "epoch": 0.6934835076427996, "grad_norm": 0.4225224386795257, "learning_rate": 9.485225932885143e-06, "loss": 0.0343, "step": 2586 }, { "epoch": 0.693751676052561, "grad_norm": 0.42121334035161756, "learning_rate": 9.484536207290804e-06, "loss": 0.0406, "step": 2587 }, { "epoch": 0.6940198444623223, "grad_norm": 0.3290552017152997, "learning_rate": 9.483846045049083e-06, "loss": 0.0406, "step": 2588 }, { "epoch": 0.6942880128720836, "grad_norm": 0.344510815942509, "learning_rate": 9.483155446227176e-06, "loss": 0.0283, "step": 2589 }, { "epoch": 0.694556181281845, "grad_norm": 0.38528408477262127, "learning_rate": 9.482464410892324e-06, "loss": 0.03, "step": 2590 }, { "epoch": 0.6948243496916063, "grad_norm": 0.30466968099348896, "learning_rate": 9.481772939111815e-06, "loss": 0.0294, "step": 2591 }, { "epoch": 0.6950925181013676, "grad_norm": 0.4508906122509676, "learning_rate": 9.481081030952974e-06, "loss": 0.0394, "step": 2592 }, { "epoch": 0.695360686511129, "grad_norm": 0.3462477316840472, "learning_rate": 9.48038868648317e-06, "loss": 0.0351, "step": 2593 }, { "epoch": 0.6956288549208903, "grad_norm": 0.4053748803448256, "learning_rate": 9.479695905769813e-06, "loss": 0.0321, "step": 2594 }, { "epoch": 0.6958970233306516, "grad_norm": 0.6797731725806727, "learning_rate": 9.479002688880362e-06, "loss": 0.0286, "step": 2595 }, { "epoch": 0.696165191740413, "grad_norm": 0.4001893269467368, "learning_rate": 9.478309035882308e-06, "loss": 0.0435, "step": 2596 }, { "epoch": 0.6964333601501743, "grad_norm": 0.5297141001294885, "learning_rate": 9.477614946843194e-06, "loss": 0.0394, "step": 2597 }, { "epoch": 0.6967015285599356, "grad_norm": 0.5739944481572751, "learning_rate": 9.4769204218306e-06, "loss": 0.0376, "step": 2598 }, { "epoch": 0.696969696969697, "grad_norm": 0.37926399791468157, "learning_rate": 9.47622546091215e-06, "loss": 0.0385, "step": 2599 }, { "epoch": 0.6972378653794583, "grad_norm": 0.33394294276152053, "learning_rate": 9.475530064155512e-06, "loss": 0.0293, "step": 2600 }, { "epoch": 0.6975060337892196, "grad_norm": 0.4481808270233917, "learning_rate": 9.474834231628394e-06, "loss": 0.0437, "step": 2601 }, { "epoch": 0.697774202198981, "grad_norm": 0.4214258811414788, "learning_rate": 9.474137963398546e-06, "loss": 0.0348, "step": 2602 }, { "epoch": 0.6980423706087423, "grad_norm": 0.7041863271410196, "learning_rate": 9.473441259533765e-06, "loss": 0.0533, "step": 2603 }, { "epoch": 0.6983105390185036, "grad_norm": 0.45993421469224777, "learning_rate": 9.472744120101884e-06, "loss": 0.0574, "step": 2604 }, { "epoch": 0.698578707428265, "grad_norm": 0.48370580541406893, "learning_rate": 9.472046545170784e-06, "loss": 0.0362, "step": 2605 }, { "epoch": 0.6988468758380263, "grad_norm": 0.5008560265974089, "learning_rate": 9.471348534808385e-06, "loss": 0.0408, "step": 2606 }, { "epoch": 0.6991150442477876, "grad_norm": 0.4458965308879168, "learning_rate": 9.470650089082649e-06, "loss": 0.029, "step": 2607 }, { "epoch": 0.699383212657549, "grad_norm": 0.3500305270414859, "learning_rate": 9.469951208061584e-06, "loss": 0.0386, "step": 2608 }, { "epoch": 0.6996513810673103, "grad_norm": 0.36297712015513944, "learning_rate": 9.469251891813238e-06, "loss": 0.0332, "step": 2609 }, { "epoch": 0.6999195494770716, "grad_norm": 0.4247977316549169, "learning_rate": 9.4685521404057e-06, "loss": 0.0324, "step": 2610 }, { "epoch": 0.700187717886833, "grad_norm": 0.6117997400671085, "learning_rate": 9.467851953907103e-06, "loss": 0.043, "step": 2611 }, { "epoch": 0.7004558862965943, "grad_norm": 0.3059310892043086, "learning_rate": 9.467151332385624e-06, "loss": 0.0312, "step": 2612 }, { "epoch": 0.7007240547063556, "grad_norm": 0.31896964285004903, "learning_rate": 9.466450275909481e-06, "loss": 0.0261, "step": 2613 }, { "epoch": 0.700992223116117, "grad_norm": 0.5542071309897022, "learning_rate": 9.465748784546932e-06, "loss": 0.043, "step": 2614 }, { "epoch": 0.7012603915258783, "grad_norm": 0.2990252918943228, "learning_rate": 9.46504685836628e-06, "loss": 0.0289, "step": 2615 }, { "epoch": 0.7015285599356396, "grad_norm": 0.44788446770739, "learning_rate": 9.46434449743587e-06, "loss": 0.0283, "step": 2616 }, { "epoch": 0.701796728345401, "grad_norm": 0.30622799118200966, "learning_rate": 9.46364170182409e-06, "loss": 0.0385, "step": 2617 }, { "epoch": 0.7020648967551623, "grad_norm": 0.3486997893903188, "learning_rate": 9.462938471599366e-06, "loss": 0.0429, "step": 2618 }, { "epoch": 0.7023330651649236, "grad_norm": 0.42453664983518774, "learning_rate": 9.462234806830172e-06, "loss": 0.0398, "step": 2619 }, { "epoch": 0.7026012335746848, "grad_norm": 0.5372338962115267, "learning_rate": 9.461530707585023e-06, "loss": 0.0435, "step": 2620 }, { "epoch": 0.7028694019844463, "grad_norm": 0.46297375941689933, "learning_rate": 9.460826173932475e-06, "loss": 0.057, "step": 2621 }, { "epoch": 0.7031375703942075, "grad_norm": 0.3846557121024482, "learning_rate": 9.460121205941125e-06, "loss": 0.0437, "step": 2622 }, { "epoch": 0.7034057388039688, "grad_norm": 0.289497351076226, "learning_rate": 9.459415803679613e-06, "loss": 0.0271, "step": 2623 }, { "epoch": 0.7036739072137302, "grad_norm": 0.3689500869142178, "learning_rate": 9.458709967216625e-06, "loss": 0.0393, "step": 2624 }, { "epoch": 0.7039420756234915, "grad_norm": 0.4058195670260156, "learning_rate": 9.458003696620885e-06, "loss": 0.0329, "step": 2625 }, { "epoch": 0.7042102440332528, "grad_norm": 0.37015538351102995, "learning_rate": 9.457296991961163e-06, "loss": 0.0357, "step": 2626 }, { "epoch": 0.7044784124430142, "grad_norm": 0.3943897197560825, "learning_rate": 9.456589853306263e-06, "loss": 0.031, "step": 2627 }, { "epoch": 0.7047465808527755, "grad_norm": 0.3557154907479563, "learning_rate": 9.455882280725045e-06, "loss": 0.0282, "step": 2628 }, { "epoch": 0.7050147492625368, "grad_norm": 0.31914531507869204, "learning_rate": 9.455174274286396e-06, "loss": 0.0344, "step": 2629 }, { "epoch": 0.7052829176722982, "grad_norm": 0.4410708893157704, "learning_rate": 9.454465834059257e-06, "loss": 0.0581, "step": 2630 }, { "epoch": 0.7055510860820595, "grad_norm": 0.3467163725262633, "learning_rate": 9.453756960112605e-06, "loss": 0.0347, "step": 2631 }, { "epoch": 0.7058192544918208, "grad_norm": 0.33880922694137566, "learning_rate": 9.453047652515466e-06, "loss": 0.034, "step": 2632 }, { "epoch": 0.7060874229015822, "grad_norm": 0.365933099742316, "learning_rate": 9.452337911336896e-06, "loss": 0.0338, "step": 2633 }, { "epoch": 0.7063555913113435, "grad_norm": 0.32131246842023076, "learning_rate": 9.451627736646006e-06, "loss": 0.0352, "step": 2634 }, { "epoch": 0.7066237597211048, "grad_norm": 0.3735523107297418, "learning_rate": 9.450917128511942e-06, "loss": 0.0275, "step": 2635 }, { "epoch": 0.7068919281308662, "grad_norm": 0.4353006311922661, "learning_rate": 9.450206087003895e-06, "loss": 0.0397, "step": 2636 }, { "epoch": 0.7071600965406275, "grad_norm": 0.5263977724241677, "learning_rate": 9.449494612191094e-06, "loss": 0.044, "step": 2637 }, { "epoch": 0.7074282649503888, "grad_norm": 0.6886391063866727, "learning_rate": 9.448782704142818e-06, "loss": 0.0623, "step": 2638 }, { "epoch": 0.7076964333601502, "grad_norm": 0.24988749434997662, "learning_rate": 9.44807036292838e-06, "loss": 0.0255, "step": 2639 }, { "epoch": 0.7079646017699115, "grad_norm": 0.563986417897695, "learning_rate": 9.447357588617142e-06, "loss": 0.0387, "step": 2640 }, { "epoch": 0.7082327701796728, "grad_norm": 0.38306834556808345, "learning_rate": 9.4466443812785e-06, "loss": 0.0453, "step": 2641 }, { "epoch": 0.7085009385894342, "grad_norm": 0.3083146974228588, "learning_rate": 9.445930740981904e-06, "loss": 0.0358, "step": 2642 }, { "epoch": 0.7087691069991955, "grad_norm": 0.31657131910368547, "learning_rate": 9.445216667796833e-06, "loss": 0.042, "step": 2643 }, { "epoch": 0.7090372754089568, "grad_norm": 0.3983298516828245, "learning_rate": 9.444502161792817e-06, "loss": 0.0382, "step": 2644 }, { "epoch": 0.7093054438187182, "grad_norm": 0.257400812724172, "learning_rate": 9.443787223039425e-06, "loss": 0.0208, "step": 2645 }, { "epoch": 0.7095736122284795, "grad_norm": 0.4662173454064492, "learning_rate": 9.443071851606271e-06, "loss": 0.0307, "step": 2646 }, { "epoch": 0.7098417806382408, "grad_norm": 0.3823289015294413, "learning_rate": 9.442356047563007e-06, "loss": 0.0322, "step": 2647 }, { "epoch": 0.7101099490480022, "grad_norm": 0.45729757992449044, "learning_rate": 9.441639810979327e-06, "loss": 0.0396, "step": 2648 }, { "epoch": 0.7103781174577635, "grad_norm": 0.4920484398909539, "learning_rate": 9.44092314192497e-06, "loss": 0.0368, "step": 2649 }, { "epoch": 0.7106462858675248, "grad_norm": 0.42011224625710414, "learning_rate": 9.44020604046972e-06, "loss": 0.0409, "step": 2650 }, { "epoch": 0.7109144542772862, "grad_norm": 0.4732314867817539, "learning_rate": 9.439488506683393e-06, "loss": 0.0423, "step": 2651 }, { "epoch": 0.7111826226870475, "grad_norm": 0.4227496017019246, "learning_rate": 9.43877054063586e-06, "loss": 0.0448, "step": 2652 }, { "epoch": 0.7114507910968088, "grad_norm": 0.4386308568902799, "learning_rate": 9.438052142397024e-06, "loss": 0.0336, "step": 2653 }, { "epoch": 0.7117189595065702, "grad_norm": 0.4675238932937317, "learning_rate": 9.437333312036832e-06, "loss": 0.0434, "step": 2654 }, { "epoch": 0.7119871279163315, "grad_norm": 0.30064764201401006, "learning_rate": 9.436614049625277e-06, "loss": 0.034, "step": 2655 }, { "epoch": 0.7122552963260927, "grad_norm": 0.3873299831428463, "learning_rate": 9.435894355232392e-06, "loss": 0.0476, "step": 2656 }, { "epoch": 0.7125234647358542, "grad_norm": 0.3059157820894288, "learning_rate": 9.435174228928248e-06, "loss": 0.0312, "step": 2657 }, { "epoch": 0.7127916331456154, "grad_norm": 0.6277293936392747, "learning_rate": 9.434453670782968e-06, "loss": 0.0529, "step": 2658 }, { "epoch": 0.7130598015553767, "grad_norm": 0.29561977553876284, "learning_rate": 9.433732680866704e-06, "loss": 0.0237, "step": 2659 }, { "epoch": 0.7133279699651381, "grad_norm": 0.40827700251984134, "learning_rate": 9.433011259249663e-06, "loss": 0.0462, "step": 2660 }, { "epoch": 0.7135961383748994, "grad_norm": 0.39495441024244804, "learning_rate": 9.432289406002085e-06, "loss": 0.0413, "step": 2661 }, { "epoch": 0.7138643067846607, "grad_norm": 0.5363938821784745, "learning_rate": 9.431567121194251e-06, "loss": 0.0403, "step": 2662 }, { "epoch": 0.7141324751944221, "grad_norm": 0.31257287055808247, "learning_rate": 9.430844404896496e-06, "loss": 0.0248, "step": 2663 }, { "epoch": 0.7144006436041834, "grad_norm": 0.832348316463663, "learning_rate": 9.430121257179183e-06, "loss": 0.0526, "step": 2664 }, { "epoch": 0.7146688120139447, "grad_norm": 0.506238721951936, "learning_rate": 9.429397678112726e-06, "loss": 0.0421, "step": 2665 }, { "epoch": 0.7149369804237061, "grad_norm": 0.387817490729793, "learning_rate": 9.428673667767577e-06, "loss": 0.0368, "step": 2666 }, { "epoch": 0.7152051488334674, "grad_norm": 0.3816904342879657, "learning_rate": 9.42794922621423e-06, "loss": 0.0344, "step": 2667 }, { "epoch": 0.7154733172432287, "grad_norm": 0.32147696479265375, "learning_rate": 9.427224353523223e-06, "loss": 0.0248, "step": 2668 }, { "epoch": 0.7157414856529901, "grad_norm": 0.36970052678357146, "learning_rate": 9.426499049765136e-06, "loss": 0.0358, "step": 2669 }, { "epoch": 0.7160096540627514, "grad_norm": 0.3789213281709949, "learning_rate": 9.425773315010587e-06, "loss": 0.0331, "step": 2670 }, { "epoch": 0.7162778224725127, "grad_norm": 0.5757118573519422, "learning_rate": 9.425047149330242e-06, "loss": 0.0581, "step": 2671 }, { "epoch": 0.7165459908822741, "grad_norm": 0.3777858458474212, "learning_rate": 9.424320552794803e-06, "loss": 0.0475, "step": 2672 }, { "epoch": 0.7168141592920354, "grad_norm": 0.32835603840106786, "learning_rate": 9.423593525475019e-06, "loss": 0.0329, "step": 2673 }, { "epoch": 0.7170823277017967, "grad_norm": 0.43375540459549766, "learning_rate": 9.422866067441678e-06, "loss": 0.0335, "step": 2674 }, { "epoch": 0.7173504961115581, "grad_norm": 0.5776245092895724, "learning_rate": 9.42213817876561e-06, "loss": 0.0457, "step": 2675 }, { "epoch": 0.7176186645213194, "grad_norm": 0.3716675522805873, "learning_rate": 9.421409859517688e-06, "loss": 0.0347, "step": 2676 }, { "epoch": 0.7178868329310807, "grad_norm": 0.32693660346552744, "learning_rate": 9.420681109768827e-06, "loss": 0.0306, "step": 2677 }, { "epoch": 0.7181550013408421, "grad_norm": 0.54620231822749, "learning_rate": 9.419951929589983e-06, "loss": 0.036, "step": 2678 }, { "epoch": 0.7184231697506034, "grad_norm": 0.34161193262352446, "learning_rate": 9.419222319052154e-06, "loss": 0.035, "step": 2679 }, { "epoch": 0.7186913381603647, "grad_norm": 0.49071864076508265, "learning_rate": 9.41849227822638e-06, "loss": 0.0469, "step": 2680 }, { "epoch": 0.7189595065701261, "grad_norm": 0.47220287395251737, "learning_rate": 9.417761807183745e-06, "loss": 0.0351, "step": 2681 }, { "epoch": 0.7192276749798874, "grad_norm": 0.346906855876665, "learning_rate": 9.417030905995372e-06, "loss": 0.0371, "step": 2682 }, { "epoch": 0.7194958433896487, "grad_norm": 0.36879488049620507, "learning_rate": 9.416299574732425e-06, "loss": 0.0388, "step": 2683 }, { "epoch": 0.7197640117994101, "grad_norm": 0.41172394744974267, "learning_rate": 9.415567813466114e-06, "loss": 0.0361, "step": 2684 }, { "epoch": 0.7200321802091714, "grad_norm": 0.4304818958839077, "learning_rate": 9.414835622267689e-06, "loss": 0.0278, "step": 2685 }, { "epoch": 0.7203003486189327, "grad_norm": 0.2956703087356682, "learning_rate": 9.41410300120844e-06, "loss": 0.0348, "step": 2686 }, { "epoch": 0.7205685170286941, "grad_norm": 0.28843451323726627, "learning_rate": 9.4133699503597e-06, "loss": 0.029, "step": 2687 }, { "epoch": 0.7208366854384554, "grad_norm": 0.46630114862955235, "learning_rate": 9.412636469792847e-06, "loss": 0.0347, "step": 2688 }, { "epoch": 0.7211048538482167, "grad_norm": 0.3173721013535111, "learning_rate": 9.411902559579295e-06, "loss": 0.0287, "step": 2689 }, { "epoch": 0.7213730222579781, "grad_norm": 0.5303554957669326, "learning_rate": 9.411168219790506e-06, "loss": 0.0403, "step": 2690 }, { "epoch": 0.7216411906677394, "grad_norm": 0.27369982832533374, "learning_rate": 9.410433450497977e-06, "loss": 0.0335, "step": 2691 }, { "epoch": 0.7219093590775006, "grad_norm": 0.5513954865903511, "learning_rate": 9.409698251773254e-06, "loss": 0.038, "step": 2692 }, { "epoch": 0.722177527487262, "grad_norm": 0.3215182094325023, "learning_rate": 9.408962623687918e-06, "loss": 0.0293, "step": 2693 }, { "epoch": 0.7224456958970233, "grad_norm": 0.4354235663623081, "learning_rate": 9.408226566313598e-06, "loss": 0.034, "step": 2694 }, { "epoch": 0.7227138643067846, "grad_norm": 0.45467759149282094, "learning_rate": 9.407490079721959e-06, "loss": 0.0387, "step": 2695 }, { "epoch": 0.722982032716546, "grad_norm": 0.3572642249436466, "learning_rate": 9.406753163984714e-06, "loss": 0.0359, "step": 2696 }, { "epoch": 0.7232502011263073, "grad_norm": 0.43167678135881854, "learning_rate": 9.406015819173614e-06, "loss": 0.0266, "step": 2697 }, { "epoch": 0.7235183695360686, "grad_norm": 0.22929703376388086, "learning_rate": 9.40527804536045e-06, "loss": 0.0271, "step": 2698 }, { "epoch": 0.72378653794583, "grad_norm": 0.4762434226247083, "learning_rate": 9.404539842617059e-06, "loss": 0.0353, "step": 2699 }, { "epoch": 0.7240547063555913, "grad_norm": 0.43229494341064917, "learning_rate": 9.403801211015317e-06, "loss": 0.0342, "step": 2700 }, { "epoch": 0.7243228747653526, "grad_norm": 0.5582270287177813, "learning_rate": 9.403062150627145e-06, "loss": 0.0429, "step": 2701 }, { "epoch": 0.724591043175114, "grad_norm": 0.43457849315388136, "learning_rate": 9.402322661524499e-06, "loss": 0.0333, "step": 2702 }, { "epoch": 0.7248592115848753, "grad_norm": 0.47674628828301996, "learning_rate": 9.401582743779384e-06, "loss": 0.0384, "step": 2703 }, { "epoch": 0.7251273799946366, "grad_norm": 0.4836546715157746, "learning_rate": 9.400842397463842e-06, "loss": 0.0311, "step": 2704 }, { "epoch": 0.725395548404398, "grad_norm": 0.35911375326587447, "learning_rate": 9.400101622649962e-06, "loss": 0.0445, "step": 2705 }, { "epoch": 0.7256637168141593, "grad_norm": 0.4726544443293054, "learning_rate": 9.399360419409869e-06, "loss": 0.0358, "step": 2706 }, { "epoch": 0.7259318852239206, "grad_norm": 0.4743025316039749, "learning_rate": 9.398618787815731e-06, "loss": 0.0387, "step": 2707 }, { "epoch": 0.7262000536336819, "grad_norm": 0.6972954834740794, "learning_rate": 9.39787672793976e-06, "loss": 0.0411, "step": 2708 }, { "epoch": 0.7264682220434433, "grad_norm": 1.2269764745242044, "learning_rate": 9.39713423985421e-06, "loss": 0.0327, "step": 2709 }, { "epoch": 0.7267363904532046, "grad_norm": 0.37503187748723465, "learning_rate": 9.39639132363137e-06, "loss": 0.0326, "step": 2710 }, { "epoch": 0.7270045588629659, "grad_norm": 0.5397512962600839, "learning_rate": 9.39564797934358e-06, "loss": 0.0535, "step": 2711 }, { "epoch": 0.7272727272727273, "grad_norm": 0.31694373979239815, "learning_rate": 9.394904207063216e-06, "loss": 0.0289, "step": 2712 }, { "epoch": 0.7275408956824886, "grad_norm": 0.5704152848423665, "learning_rate": 9.394160006862698e-06, "loss": 0.0351, "step": 2713 }, { "epoch": 0.7278090640922499, "grad_norm": 0.3729889250766108, "learning_rate": 9.393415378814488e-06, "loss": 0.0376, "step": 2714 }, { "epoch": 0.7280772325020113, "grad_norm": 0.4198426567186146, "learning_rate": 9.392670322991085e-06, "loss": 0.0388, "step": 2715 }, { "epoch": 0.7283454009117726, "grad_norm": 0.3223314182164308, "learning_rate": 9.391924839465036e-06, "loss": 0.0296, "step": 2716 }, { "epoch": 0.7286135693215339, "grad_norm": 0.7235493059670693, "learning_rate": 9.391178928308925e-06, "loss": 0.0302, "step": 2717 }, { "epoch": 0.7288817377312953, "grad_norm": 0.3672472666522542, "learning_rate": 9.390432589595379e-06, "loss": 0.0407, "step": 2718 }, { "epoch": 0.7291499061410566, "grad_norm": 0.20448967956359898, "learning_rate": 9.38968582339707e-06, "loss": 0.0253, "step": 2719 }, { "epoch": 0.7294180745508179, "grad_norm": 0.3373537464993408, "learning_rate": 9.388938629786708e-06, "loss": 0.0405, "step": 2720 }, { "epoch": 0.7296862429605793, "grad_norm": 0.3108173757712986, "learning_rate": 9.388191008837042e-06, "loss": 0.0349, "step": 2721 }, { "epoch": 0.7299544113703406, "grad_norm": 0.518795718010647, "learning_rate": 9.387442960620868e-06, "loss": 0.0353, "step": 2722 }, { "epoch": 0.7302225797801019, "grad_norm": 0.4120483815959848, "learning_rate": 9.386694485211021e-06, "loss": 0.0343, "step": 2723 }, { "epoch": 0.7304907481898633, "grad_norm": 0.6108624511819352, "learning_rate": 9.385945582680379e-06, "loss": 0.0408, "step": 2724 }, { "epoch": 0.7307589165996246, "grad_norm": 0.4656012118755314, "learning_rate": 9.385196253101861e-06, "loss": 0.0462, "step": 2725 }, { "epoch": 0.7310270850093858, "grad_norm": 0.28342164680600157, "learning_rate": 9.384446496548425e-06, "loss": 0.0267, "step": 2726 }, { "epoch": 0.7312952534191473, "grad_norm": 0.264307583114337, "learning_rate": 9.383696313093073e-06, "loss": 0.027, "step": 2727 }, { "epoch": 0.7315634218289085, "grad_norm": 0.6074153635124294, "learning_rate": 9.382945702808852e-06, "loss": 0.0429, "step": 2728 }, { "epoch": 0.7318315902386698, "grad_norm": 0.5174273502949047, "learning_rate": 9.382194665768844e-06, "loss": 0.051, "step": 2729 }, { "epoch": 0.7320997586484312, "grad_norm": 0.6265495632451057, "learning_rate": 9.381443202046175e-06, "loss": 0.0358, "step": 2730 }, { "epoch": 0.7323679270581925, "grad_norm": 0.32620497314101726, "learning_rate": 9.380691311714013e-06, "loss": 0.0336, "step": 2731 }, { "epoch": 0.7326360954679538, "grad_norm": 0.35064993515706444, "learning_rate": 9.37993899484557e-06, "loss": 0.0257, "step": 2732 }, { "epoch": 0.7329042638777152, "grad_norm": 0.4175418038425128, "learning_rate": 9.379186251514096e-06, "loss": 0.039, "step": 2733 }, { "epoch": 0.7331724322874765, "grad_norm": 0.6179551406430485, "learning_rate": 9.378433081792883e-06, "loss": 0.0471, "step": 2734 }, { "epoch": 0.7334406006972378, "grad_norm": 0.4189199048181353, "learning_rate": 9.377679485755265e-06, "loss": 0.03, "step": 2735 }, { "epoch": 0.7337087691069992, "grad_norm": 0.334562590479166, "learning_rate": 9.376925463474618e-06, "loss": 0.0312, "step": 2736 }, { "epoch": 0.7339769375167605, "grad_norm": 0.33658092472066803, "learning_rate": 9.376171015024358e-06, "loss": 0.0358, "step": 2737 }, { "epoch": 0.7342451059265218, "grad_norm": 0.292881163076582, "learning_rate": 9.375416140477945e-06, "loss": 0.0353, "step": 2738 }, { "epoch": 0.7345132743362832, "grad_norm": 0.4485549853790626, "learning_rate": 9.374660839908881e-06, "loss": 0.0434, "step": 2739 }, { "epoch": 0.7347814427460445, "grad_norm": 0.4348298214180735, "learning_rate": 9.373905113390704e-06, "loss": 0.0377, "step": 2740 }, { "epoch": 0.7350496111558058, "grad_norm": 0.34086553822572935, "learning_rate": 9.373148960996998e-06, "loss": 0.0316, "step": 2741 }, { "epoch": 0.7353177795655672, "grad_norm": 0.45340390928171354, "learning_rate": 9.372392382801389e-06, "loss": 0.042, "step": 2742 }, { "epoch": 0.7355859479753285, "grad_norm": 0.41290457730171065, "learning_rate": 9.371635378877542e-06, "loss": 0.0387, "step": 2743 }, { "epoch": 0.7358541163850898, "grad_norm": 0.6464675423412379, "learning_rate": 9.370877949299164e-06, "loss": 0.0339, "step": 2744 }, { "epoch": 0.7361222847948512, "grad_norm": 0.34479109473592995, "learning_rate": 9.370120094140006e-06, "loss": 0.0278, "step": 2745 }, { "epoch": 0.7363904532046125, "grad_norm": 0.3870503407858689, "learning_rate": 9.369361813473856e-06, "loss": 0.0437, "step": 2746 }, { "epoch": 0.7366586216143738, "grad_norm": 0.38155919561310914, "learning_rate": 9.368603107374547e-06, "loss": 0.0394, "step": 2747 }, { "epoch": 0.7369267900241352, "grad_norm": 0.26727591386995, "learning_rate": 9.36784397591595e-06, "loss": 0.0359, "step": 2748 }, { "epoch": 0.7371949584338965, "grad_norm": 0.5343887998039686, "learning_rate": 9.367084419171984e-06, "loss": 0.0477, "step": 2749 }, { "epoch": 0.7374631268436578, "grad_norm": 0.3355150329619468, "learning_rate": 9.366324437216602e-06, "loss": 0.0309, "step": 2750 }, { "epoch": 0.7377312952534192, "grad_norm": 0.5286632485501943, "learning_rate": 9.365564030123802e-06, "loss": 0.0492, "step": 2751 }, { "epoch": 0.7379994636631805, "grad_norm": 0.314639637029734, "learning_rate": 9.364803197967623e-06, "loss": 0.0311, "step": 2752 }, { "epoch": 0.7382676320729418, "grad_norm": 0.4540144994912781, "learning_rate": 9.364041940822143e-06, "loss": 0.0383, "step": 2753 }, { "epoch": 0.7385358004827032, "grad_norm": 0.4032548200009073, "learning_rate": 9.363280258761487e-06, "loss": 0.0382, "step": 2754 }, { "epoch": 0.7388039688924645, "grad_norm": 0.5072680538948661, "learning_rate": 9.362518151859816e-06, "loss": 0.0418, "step": 2755 }, { "epoch": 0.7390721373022258, "grad_norm": 0.3879804826519922, "learning_rate": 9.361755620191335e-06, "loss": 0.031, "step": 2756 }, { "epoch": 0.7393403057119872, "grad_norm": 0.34329708817200605, "learning_rate": 9.360992663830289e-06, "loss": 0.0335, "step": 2757 }, { "epoch": 0.7396084741217485, "grad_norm": 0.35043694286614124, "learning_rate": 9.360229282850966e-06, "loss": 0.0374, "step": 2758 }, { "epoch": 0.7398766425315098, "grad_norm": 0.328290188830268, "learning_rate": 9.359465477327694e-06, "loss": 0.0373, "step": 2759 }, { "epoch": 0.7401448109412712, "grad_norm": 0.2892081286213167, "learning_rate": 9.358701247334842e-06, "loss": 0.037, "step": 2760 }, { "epoch": 0.7404129793510325, "grad_norm": 0.4259384745948188, "learning_rate": 9.357936592946823e-06, "loss": 0.0403, "step": 2761 }, { "epoch": 0.7406811477607937, "grad_norm": 0.33539921631282366, "learning_rate": 9.357171514238086e-06, "loss": 0.0348, "step": 2762 }, { "epoch": 0.7409493161705552, "grad_norm": 0.4534777268325014, "learning_rate": 9.356406011283128e-06, "loss": 0.0339, "step": 2763 }, { "epoch": 0.7412174845803164, "grad_norm": 0.4445702179297352, "learning_rate": 9.355640084156483e-06, "loss": 0.0346, "step": 2764 }, { "epoch": 0.7414856529900777, "grad_norm": 0.4973435423297838, "learning_rate": 9.354873732932727e-06, "loss": 0.0464, "step": 2765 }, { "epoch": 0.7417538213998391, "grad_norm": 0.46215667235157587, "learning_rate": 9.354106957686478e-06, "loss": 0.0325, "step": 2766 }, { "epoch": 0.7420219898096004, "grad_norm": 0.3586736475885427, "learning_rate": 9.353339758492394e-06, "loss": 0.0391, "step": 2767 }, { "epoch": 0.7422901582193617, "grad_norm": 0.35818587554594095, "learning_rate": 9.352572135425175e-06, "loss": 0.0289, "step": 2768 }, { "epoch": 0.7425583266291231, "grad_norm": 0.3472660612561635, "learning_rate": 9.351804088559565e-06, "loss": 0.0387, "step": 2769 }, { "epoch": 0.7428264950388844, "grad_norm": 0.38512994460302025, "learning_rate": 9.35103561797034e-06, "loss": 0.0315, "step": 2770 }, { "epoch": 0.7430946634486457, "grad_norm": 0.32312852819656157, "learning_rate": 9.350266723732333e-06, "loss": 0.0377, "step": 2771 }, { "epoch": 0.7433628318584071, "grad_norm": 0.3025608674208243, "learning_rate": 9.349497405920405e-06, "loss": 0.0297, "step": 2772 }, { "epoch": 0.7436310002681684, "grad_norm": 0.3983460176911991, "learning_rate": 9.34872766460946e-06, "loss": 0.0483, "step": 2773 }, { "epoch": 0.7438991686779297, "grad_norm": 0.36466344321223576, "learning_rate": 9.347957499874449e-06, "loss": 0.0463, "step": 2774 }, { "epoch": 0.7441673370876911, "grad_norm": 0.35713408458877244, "learning_rate": 9.34718691179036e-06, "loss": 0.0324, "step": 2775 }, { "epoch": 0.7444355054974524, "grad_norm": 0.45480867301502875, "learning_rate": 9.346415900432222e-06, "loss": 0.039, "step": 2776 }, { "epoch": 0.7447036739072137, "grad_norm": 0.3379154057008416, "learning_rate": 9.345644465875108e-06, "loss": 0.0386, "step": 2777 }, { "epoch": 0.7449718423169751, "grad_norm": 0.3398445033643358, "learning_rate": 9.34487260819413e-06, "loss": 0.0436, "step": 2778 }, { "epoch": 0.7452400107267364, "grad_norm": 0.5005377681793879, "learning_rate": 9.344100327464441e-06, "loss": 0.0589, "step": 2779 }, { "epoch": 0.7455081791364977, "grad_norm": 0.31331370622173227, "learning_rate": 9.343327623761237e-06, "loss": 0.0354, "step": 2780 }, { "epoch": 0.7457763475462591, "grad_norm": 0.31095828425191324, "learning_rate": 9.342554497159753e-06, "loss": 0.039, "step": 2781 }, { "epoch": 0.7460445159560204, "grad_norm": 0.2672237072998254, "learning_rate": 9.34178094773527e-06, "loss": 0.0282, "step": 2782 }, { "epoch": 0.7463126843657817, "grad_norm": 0.34134367428360257, "learning_rate": 9.341006975563098e-06, "loss": 0.0309, "step": 2783 }, { "epoch": 0.7465808527755431, "grad_norm": 0.44398683839981, "learning_rate": 9.340232580718604e-06, "loss": 0.032, "step": 2784 }, { "epoch": 0.7468490211853044, "grad_norm": 0.29778815373743833, "learning_rate": 9.339457763277188e-06, "loss": 0.0317, "step": 2785 }, { "epoch": 0.7471171895950657, "grad_norm": 0.4981136466879834, "learning_rate": 9.33868252331429e-06, "loss": 0.0391, "step": 2786 }, { "epoch": 0.7473853580048271, "grad_norm": 0.6912211320297063, "learning_rate": 9.337906860905394e-06, "loss": 0.0389, "step": 2787 }, { "epoch": 0.7476535264145884, "grad_norm": 0.46057263005209936, "learning_rate": 9.337130776126021e-06, "loss": 0.0391, "step": 2788 }, { "epoch": 0.7479216948243497, "grad_norm": 0.240339964203105, "learning_rate": 9.336354269051744e-06, "loss": 0.0274, "step": 2789 }, { "epoch": 0.7481898632341111, "grad_norm": 0.34925017912672046, "learning_rate": 9.335577339758162e-06, "loss": 0.0322, "step": 2790 }, { "epoch": 0.7484580316438724, "grad_norm": 0.3467047405252644, "learning_rate": 9.334799988320926e-06, "loss": 0.0254, "step": 2791 }, { "epoch": 0.7487262000536337, "grad_norm": 0.4209284572408516, "learning_rate": 9.334022214815722e-06, "loss": 0.0309, "step": 2792 }, { "epoch": 0.7489943684633951, "grad_norm": 0.32821203004969746, "learning_rate": 9.333244019318283e-06, "loss": 0.0404, "step": 2793 }, { "epoch": 0.7492625368731564, "grad_norm": 0.28862734723977007, "learning_rate": 9.332465401904379e-06, "loss": 0.0255, "step": 2794 }, { "epoch": 0.7495307052829177, "grad_norm": 0.36497909507937576, "learning_rate": 9.331686362649819e-06, "loss": 0.04, "step": 2795 }, { "epoch": 0.7497988736926791, "grad_norm": 0.4421134454525723, "learning_rate": 9.33090690163046e-06, "loss": 0.0481, "step": 2796 }, { "epoch": 0.7500670421024404, "grad_norm": 0.40687626672383576, "learning_rate": 9.330127018922195e-06, "loss": 0.0422, "step": 2797 }, { "epoch": 0.7503352105122016, "grad_norm": 0.3488669750301508, "learning_rate": 9.329346714600957e-06, "loss": 0.0323, "step": 2798 }, { "epoch": 0.7506033789219629, "grad_norm": 0.4117735162237764, "learning_rate": 9.328565988742723e-06, "loss": 0.0429, "step": 2799 }, { "epoch": 0.7508715473317243, "grad_norm": 0.30377496881316224, "learning_rate": 9.327784841423512e-06, "loss": 0.0253, "step": 2800 }, { "epoch": 0.7511397157414856, "grad_norm": 0.38558397181756543, "learning_rate": 9.327003272719382e-06, "loss": 0.0385, "step": 2801 }, { "epoch": 0.7514078841512469, "grad_norm": 0.5658615480261258, "learning_rate": 9.32622128270643e-06, "loss": 0.0391, "step": 2802 }, { "epoch": 0.7516760525610083, "grad_norm": 0.38644785287456734, "learning_rate": 9.325438871460798e-06, "loss": 0.0402, "step": 2803 }, { "epoch": 0.7519442209707696, "grad_norm": 0.31169179554676274, "learning_rate": 9.324656039058666e-06, "loss": 0.0232, "step": 2804 }, { "epoch": 0.7522123893805309, "grad_norm": 0.4454915867544068, "learning_rate": 9.323872785576257e-06, "loss": 0.0421, "step": 2805 }, { "epoch": 0.7524805577902923, "grad_norm": 0.3671060804392625, "learning_rate": 9.323089111089835e-06, "loss": 0.0293, "step": 2806 }, { "epoch": 0.7527487262000536, "grad_norm": 0.2748163698812082, "learning_rate": 9.322305015675706e-06, "loss": 0.0308, "step": 2807 }, { "epoch": 0.7530168946098149, "grad_norm": 0.38387044672063336, "learning_rate": 9.32152049941021e-06, "loss": 0.0342, "step": 2808 }, { "epoch": 0.7532850630195763, "grad_norm": 0.35711667070284353, "learning_rate": 9.320735562369739e-06, "loss": 0.0386, "step": 2809 }, { "epoch": 0.7535532314293376, "grad_norm": 0.39585449874355755, "learning_rate": 9.319950204630716e-06, "loss": 0.0311, "step": 2810 }, { "epoch": 0.7538213998390989, "grad_norm": 0.2594219980914141, "learning_rate": 9.31916442626961e-06, "loss": 0.03, "step": 2811 }, { "epoch": 0.7540895682488603, "grad_norm": 0.29246613793869797, "learning_rate": 9.318378227362934e-06, "loss": 0.0295, "step": 2812 }, { "epoch": 0.7543577366586216, "grad_norm": 0.3386397756663265, "learning_rate": 9.317591607987233e-06, "loss": 0.0264, "step": 2813 }, { "epoch": 0.7546259050683829, "grad_norm": 0.5574377174150268, "learning_rate": 9.3168045682191e-06, "loss": 0.0345, "step": 2814 }, { "epoch": 0.7548940734781443, "grad_norm": 0.29644893730102756, "learning_rate": 9.316017108135167e-06, "loss": 0.0304, "step": 2815 }, { "epoch": 0.7551622418879056, "grad_norm": 0.30848909279110376, "learning_rate": 9.315229227812108e-06, "loss": 0.0323, "step": 2816 }, { "epoch": 0.7554304102976669, "grad_norm": 0.3686131786249299, "learning_rate": 9.314440927326635e-06, "loss": 0.0579, "step": 2817 }, { "epoch": 0.7556985787074283, "grad_norm": 0.3451032885483723, "learning_rate": 9.313652206755503e-06, "loss": 0.0387, "step": 2818 }, { "epoch": 0.7559667471171896, "grad_norm": 0.4316139119185835, "learning_rate": 9.31286306617551e-06, "loss": 0.0305, "step": 2819 }, { "epoch": 0.7562349155269509, "grad_norm": 0.3896054839897764, "learning_rate": 9.312073505663489e-06, "loss": 0.0331, "step": 2820 }, { "epoch": 0.7565030839367123, "grad_norm": 0.2630097832827242, "learning_rate": 9.31128352529632e-06, "loss": 0.0284, "step": 2821 }, { "epoch": 0.7567712523464736, "grad_norm": 0.4975251258421063, "learning_rate": 9.31049312515092e-06, "loss": 0.0375, "step": 2822 }, { "epoch": 0.7570394207562349, "grad_norm": 0.43725150047313127, "learning_rate": 9.30970230530425e-06, "loss": 0.0326, "step": 2823 }, { "epoch": 0.7573075891659963, "grad_norm": 0.4395012434604193, "learning_rate": 9.308911065833307e-06, "loss": 0.0349, "step": 2824 }, { "epoch": 0.7575757575757576, "grad_norm": 0.2948723105878521, "learning_rate": 9.308119406815132e-06, "loss": 0.0313, "step": 2825 }, { "epoch": 0.7578439259855189, "grad_norm": 0.32737556273358553, "learning_rate": 9.30732732832681e-06, "loss": 0.0306, "step": 2826 }, { "epoch": 0.7581120943952803, "grad_norm": 0.29765855399117036, "learning_rate": 9.306534830445462e-06, "loss": 0.0311, "step": 2827 }, { "epoch": 0.7583802628050416, "grad_norm": 0.43533171977478574, "learning_rate": 9.30574191324825e-06, "loss": 0.0392, "step": 2828 }, { "epoch": 0.7586484312148029, "grad_norm": 0.34123353046946786, "learning_rate": 9.30494857681238e-06, "loss": 0.0391, "step": 2829 }, { "epoch": 0.7589165996245643, "grad_norm": 0.28412177952417883, "learning_rate": 9.304154821215097e-06, "loss": 0.0256, "step": 2830 }, { "epoch": 0.7591847680343256, "grad_norm": 0.3952978871819059, "learning_rate": 9.303360646533686e-06, "loss": 0.0373, "step": 2831 }, { "epoch": 0.7594529364440868, "grad_norm": 0.31200664206565537, "learning_rate": 9.302566052845474e-06, "loss": 0.0347, "step": 2832 }, { "epoch": 0.7597211048538483, "grad_norm": 0.28754811845098593, "learning_rate": 9.30177104022783e-06, "loss": 0.0252, "step": 2833 }, { "epoch": 0.7599892732636095, "grad_norm": 0.3062300046971535, "learning_rate": 9.300975608758159e-06, "loss": 0.0258, "step": 2834 }, { "epoch": 0.7602574416733708, "grad_norm": 0.46971055749416557, "learning_rate": 9.300179758513912e-06, "loss": 0.0338, "step": 2835 }, { "epoch": 0.7605256100831322, "grad_norm": 0.44441042175321327, "learning_rate": 9.29938348957258e-06, "loss": 0.0441, "step": 2836 }, { "epoch": 0.7607937784928935, "grad_norm": 0.31396063959115716, "learning_rate": 9.29858680201169e-06, "loss": 0.0338, "step": 2837 }, { "epoch": 0.7610619469026548, "grad_norm": 0.4217592096793415, "learning_rate": 9.297789695908816e-06, "loss": 0.0251, "step": 2838 }, { "epoch": 0.7613301153124162, "grad_norm": 0.3895764660852341, "learning_rate": 9.296992171341573e-06, "loss": 0.0438, "step": 2839 }, { "epoch": 0.7615982837221775, "grad_norm": 0.5054486100633101, "learning_rate": 9.296194228387607e-06, "loss": 0.0303, "step": 2840 }, { "epoch": 0.7618664521319388, "grad_norm": 0.3170490172278557, "learning_rate": 9.295395867124617e-06, "loss": 0.0312, "step": 2841 }, { "epoch": 0.7621346205417002, "grad_norm": 0.3836661595493773, "learning_rate": 9.294597087630333e-06, "loss": 0.0505, "step": 2842 }, { "epoch": 0.7624027889514615, "grad_norm": 0.3457173638208979, "learning_rate": 9.293797889982535e-06, "loss": 0.028, "step": 2843 }, { "epoch": 0.7626709573612228, "grad_norm": 0.4499570372614781, "learning_rate": 9.292998274259035e-06, "loss": 0.0268, "step": 2844 }, { "epoch": 0.7629391257709842, "grad_norm": 0.3683629565418978, "learning_rate": 9.292198240537692e-06, "loss": 0.023, "step": 2845 }, { "epoch": 0.7632072941807455, "grad_norm": 0.2631422748647027, "learning_rate": 9.291397788896401e-06, "loss": 0.0211, "step": 2846 }, { "epoch": 0.7634754625905068, "grad_norm": 0.3721415811454178, "learning_rate": 9.290596919413101e-06, "loss": 0.0385, "step": 2847 }, { "epoch": 0.7637436310002682, "grad_norm": 0.38618573361904773, "learning_rate": 9.28979563216577e-06, "loss": 0.036, "step": 2848 }, { "epoch": 0.7640117994100295, "grad_norm": 0.32117983868967254, "learning_rate": 9.288993927232428e-06, "loss": 0.0308, "step": 2849 }, { "epoch": 0.7642799678197908, "grad_norm": 0.4286102768522441, "learning_rate": 9.288191804691133e-06, "loss": 0.0551, "step": 2850 }, { "epoch": 0.7645481362295522, "grad_norm": 0.27393345841022004, "learning_rate": 9.287389264619988e-06, "loss": 0.0324, "step": 2851 }, { "epoch": 0.7648163046393135, "grad_norm": 1.9631919697935445, "learning_rate": 9.286586307097133e-06, "loss": 0.0369, "step": 2852 }, { "epoch": 0.7650844730490748, "grad_norm": 0.47120762936962785, "learning_rate": 9.28578293220075e-06, "loss": 0.047, "step": 2853 }, { "epoch": 0.7653526414588362, "grad_norm": 0.35379517799937205, "learning_rate": 9.284979140009063e-06, "loss": 0.0302, "step": 2854 }, { "epoch": 0.7656208098685975, "grad_norm": 0.3077324276445034, "learning_rate": 9.28417493060033e-06, "loss": 0.0328, "step": 2855 }, { "epoch": 0.7658889782783588, "grad_norm": 0.4032159073727949, "learning_rate": 9.28337030405286e-06, "loss": 0.0349, "step": 2856 }, { "epoch": 0.7661571466881202, "grad_norm": 0.32622453273250657, "learning_rate": 9.282565260444997e-06, "loss": 0.0315, "step": 2857 }, { "epoch": 0.7664253150978815, "grad_norm": 0.24278069794108856, "learning_rate": 9.281759799855124e-06, "loss": 0.0215, "step": 2858 }, { "epoch": 0.7666934835076428, "grad_norm": 0.4207401007821895, "learning_rate": 9.280953922361667e-06, "loss": 0.0413, "step": 2859 }, { "epoch": 0.7669616519174042, "grad_norm": 0.37167423437554475, "learning_rate": 9.28014762804309e-06, "loss": 0.0447, "step": 2860 }, { "epoch": 0.7672298203271655, "grad_norm": 0.38849035610185767, "learning_rate": 9.279340916977905e-06, "loss": 0.0276, "step": 2861 }, { "epoch": 0.7674979887369268, "grad_norm": 0.33990446055648604, "learning_rate": 9.278533789244654e-06, "loss": 0.0411, "step": 2862 }, { "epoch": 0.7677661571466882, "grad_norm": 0.44579152880493045, "learning_rate": 9.27772624492193e-06, "loss": 0.03, "step": 2863 }, { "epoch": 0.7680343255564495, "grad_norm": 0.5594183542152563, "learning_rate": 9.276918284088357e-06, "loss": 0.0453, "step": 2864 }, { "epoch": 0.7683024939662108, "grad_norm": 0.28616054293230125, "learning_rate": 9.276109906822606e-06, "loss": 0.0278, "step": 2865 }, { "epoch": 0.7685706623759722, "grad_norm": 0.3998977683121425, "learning_rate": 9.275301113203385e-06, "loss": 0.0445, "step": 2866 }, { "epoch": 0.7688388307857335, "grad_norm": 0.2906678388234518, "learning_rate": 9.274491903309446e-06, "loss": 0.0177, "step": 2867 }, { "epoch": 0.7691069991954947, "grad_norm": 0.32125109508961996, "learning_rate": 9.27368227721958e-06, "loss": 0.0321, "step": 2868 }, { "epoch": 0.7693751676052561, "grad_norm": 0.29516684440327334, "learning_rate": 9.272872235012616e-06, "loss": 0.0184, "step": 2869 }, { "epoch": 0.7696433360150174, "grad_norm": 0.39368941622323006, "learning_rate": 9.272061776767426e-06, "loss": 0.0354, "step": 2870 }, { "epoch": 0.7699115044247787, "grad_norm": 0.30777440585080384, "learning_rate": 9.271250902562925e-06, "loss": 0.0391, "step": 2871 }, { "epoch": 0.7701796728345401, "grad_norm": 0.6774327998503527, "learning_rate": 9.270439612478062e-06, "loss": 0.0359, "step": 2872 }, { "epoch": 0.7704478412443014, "grad_norm": 0.4441090177897583, "learning_rate": 9.269627906591832e-06, "loss": 0.0316, "step": 2873 }, { "epoch": 0.7707160096540627, "grad_norm": 0.33288132099469747, "learning_rate": 9.268815784983269e-06, "loss": 0.0296, "step": 2874 }, { "epoch": 0.7709841780638241, "grad_norm": 0.4206354089356336, "learning_rate": 9.268003247731446e-06, "loss": 0.0306, "step": 2875 }, { "epoch": 0.7712523464735854, "grad_norm": 0.3168111800525939, "learning_rate": 9.267190294915477e-06, "loss": 0.0314, "step": 2876 }, { "epoch": 0.7715205148833467, "grad_norm": 0.3264549854941881, "learning_rate": 9.266376926614521e-06, "loss": 0.033, "step": 2877 }, { "epoch": 0.7717886832931081, "grad_norm": 0.3753908491147173, "learning_rate": 9.26556314290777e-06, "loss": 0.0336, "step": 2878 }, { "epoch": 0.7720568517028694, "grad_norm": 0.3618555930468025, "learning_rate": 9.264748943874459e-06, "loss": 0.0274, "step": 2879 }, { "epoch": 0.7723250201126307, "grad_norm": 0.40818383519936635, "learning_rate": 9.263934329593867e-06, "loss": 0.0305, "step": 2880 }, { "epoch": 0.7725931885223921, "grad_norm": 0.5633948549489048, "learning_rate": 9.26311930014531e-06, "loss": 0.0416, "step": 2881 }, { "epoch": 0.7728613569321534, "grad_norm": 0.4888058442782244, "learning_rate": 9.262303855608144e-06, "loss": 0.0325, "step": 2882 }, { "epoch": 0.7731295253419147, "grad_norm": 0.31288107955832184, "learning_rate": 9.26148799606177e-06, "loss": 0.0262, "step": 2883 }, { "epoch": 0.7733976937516761, "grad_norm": 0.2663125418483868, "learning_rate": 9.260671721585621e-06, "loss": 0.0259, "step": 2884 }, { "epoch": 0.7736658621614374, "grad_norm": 0.3425978119906183, "learning_rate": 9.25985503225918e-06, "loss": 0.0328, "step": 2885 }, { "epoch": 0.7739340305711987, "grad_norm": 0.2477388835190887, "learning_rate": 9.259037928161963e-06, "loss": 0.0242, "step": 2886 }, { "epoch": 0.77420219898096, "grad_norm": 0.47817972581460266, "learning_rate": 9.258220409373531e-06, "loss": 0.0376, "step": 2887 }, { "epoch": 0.7744703673907214, "grad_norm": 0.4591811733774961, "learning_rate": 9.257402475973484e-06, "loss": 0.0442, "step": 2888 }, { "epoch": 0.7747385358004827, "grad_norm": 0.36702684523285767, "learning_rate": 9.256584128041461e-06, "loss": 0.0301, "step": 2889 }, { "epoch": 0.775006704210244, "grad_norm": 0.2607986350619122, "learning_rate": 9.255765365657142e-06, "loss": 0.023, "step": 2890 }, { "epoch": 0.7752748726200054, "grad_norm": 0.38115942224921817, "learning_rate": 9.254946188900247e-06, "loss": 0.027, "step": 2891 }, { "epoch": 0.7755430410297667, "grad_norm": 0.4750067878282839, "learning_rate": 9.25412659785054e-06, "loss": 0.0469, "step": 2892 }, { "epoch": 0.775811209439528, "grad_norm": 0.38435579051399765, "learning_rate": 9.253306592587819e-06, "loss": 0.03, "step": 2893 }, { "epoch": 0.7760793778492894, "grad_norm": 0.42991710155314566, "learning_rate": 9.25248617319193e-06, "loss": 0.0369, "step": 2894 }, { "epoch": 0.7763475462590507, "grad_norm": 0.291274152886589, "learning_rate": 9.251665339742751e-06, "loss": 0.0274, "step": 2895 }, { "epoch": 0.776615714668812, "grad_norm": 0.3959025467719945, "learning_rate": 9.250844092320205e-06, "loss": 0.0343, "step": 2896 }, { "epoch": 0.7768838830785734, "grad_norm": 0.2963606879208201, "learning_rate": 9.250022431004256e-06, "loss": 0.0348, "step": 2897 }, { "epoch": 0.7771520514883347, "grad_norm": 0.32210228850944017, "learning_rate": 9.249200355874907e-06, "loss": 0.0253, "step": 2898 }, { "epoch": 0.777420219898096, "grad_norm": 0.3874885696876033, "learning_rate": 9.248377867012201e-06, "loss": 0.0317, "step": 2899 }, { "epoch": 0.7776883883078574, "grad_norm": 0.29586802768376946, "learning_rate": 9.247554964496219e-06, "loss": 0.035, "step": 2900 }, { "epoch": 0.7779565567176187, "grad_norm": 0.2857833803016495, "learning_rate": 9.24673164840709e-06, "loss": 0.0266, "step": 2901 }, { "epoch": 0.77822472512738, "grad_norm": 0.39239095191745055, "learning_rate": 9.245907918824975e-06, "loss": 0.0432, "step": 2902 }, { "epoch": 0.7784928935371414, "grad_norm": 0.7872422186360707, "learning_rate": 9.245083775830078e-06, "loss": 0.037, "step": 2903 }, { "epoch": 0.7787610619469026, "grad_norm": 0.35325558197097995, "learning_rate": 9.244259219502642e-06, "loss": 0.0467, "step": 2904 }, { "epoch": 0.7790292303566639, "grad_norm": 0.42416244258953445, "learning_rate": 9.243434249922956e-06, "loss": 0.0341, "step": 2905 }, { "epoch": 0.7792973987664253, "grad_norm": 0.2851745504245758, "learning_rate": 9.242608867171344e-06, "loss": 0.0337, "step": 2906 }, { "epoch": 0.7795655671761866, "grad_norm": 0.3285099770515234, "learning_rate": 9.24178307132817e-06, "loss": 0.0304, "step": 2907 }, { "epoch": 0.7798337355859479, "grad_norm": 0.3120850566213279, "learning_rate": 9.24095686247384e-06, "loss": 0.0252, "step": 2908 }, { "epoch": 0.7801019039957093, "grad_norm": 0.3715484083170873, "learning_rate": 9.2401302406888e-06, "loss": 0.0312, "step": 2909 }, { "epoch": 0.7803700724054706, "grad_norm": 0.4724172755880104, "learning_rate": 9.239303206053536e-06, "loss": 0.0393, "step": 2910 }, { "epoch": 0.7806382408152319, "grad_norm": 0.24490762360874477, "learning_rate": 9.238475758648572e-06, "loss": 0.0241, "step": 2911 }, { "epoch": 0.7809064092249933, "grad_norm": 0.43182627428023196, "learning_rate": 9.237647898554478e-06, "loss": 0.0364, "step": 2912 }, { "epoch": 0.7811745776347546, "grad_norm": 0.2983818565572143, "learning_rate": 9.236819625851858e-06, "loss": 0.0341, "step": 2913 }, { "epoch": 0.7814427460445159, "grad_norm": 0.3156349225491049, "learning_rate": 9.23599094062136e-06, "loss": 0.041, "step": 2914 }, { "epoch": 0.7817109144542773, "grad_norm": 0.32192952501813255, "learning_rate": 9.235161842943671e-06, "loss": 0.028, "step": 2915 }, { "epoch": 0.7819790828640386, "grad_norm": 0.31853734732326805, "learning_rate": 9.234332332899518e-06, "loss": 0.0279, "step": 2916 }, { "epoch": 0.7822472512737999, "grad_norm": 0.2877768595970916, "learning_rate": 9.233502410569666e-06, "loss": 0.0286, "step": 2917 }, { "epoch": 0.7825154196835613, "grad_norm": 0.6719638037032413, "learning_rate": 9.232672076034924e-06, "loss": 0.046, "step": 2918 }, { "epoch": 0.7827835880933226, "grad_norm": 0.42557318495665974, "learning_rate": 9.231841329376142e-06, "loss": 0.0292, "step": 2919 }, { "epoch": 0.7830517565030839, "grad_norm": 0.47262458431062365, "learning_rate": 9.231010170674202e-06, "loss": 0.0453, "step": 2920 }, { "epoch": 0.7833199249128453, "grad_norm": 0.25954311224914733, "learning_rate": 9.230178600010039e-06, "loss": 0.0284, "step": 2921 }, { "epoch": 0.7835880933226066, "grad_norm": 0.2921434926455236, "learning_rate": 9.229346617464612e-06, "loss": 0.03, "step": 2922 }, { "epoch": 0.7838562617323679, "grad_norm": 0.2462859847809662, "learning_rate": 9.228514223118936e-06, "loss": 0.022, "step": 2923 }, { "epoch": 0.7841244301421293, "grad_norm": 0.71145255409983, "learning_rate": 9.227681417054056e-06, "loss": 0.0266, "step": 2924 }, { "epoch": 0.7843925985518906, "grad_norm": 0.4334291185190317, "learning_rate": 9.22684819935106e-06, "loss": 0.0348, "step": 2925 }, { "epoch": 0.7846607669616519, "grad_norm": 0.3735825386362639, "learning_rate": 9.226014570091078e-06, "loss": 0.0352, "step": 2926 }, { "epoch": 0.7849289353714133, "grad_norm": 0.6750603457745463, "learning_rate": 9.225180529355275e-06, "loss": 0.038, "step": 2927 }, { "epoch": 0.7851971037811746, "grad_norm": 0.5312029882760894, "learning_rate": 9.224346077224864e-06, "loss": 0.0378, "step": 2928 }, { "epoch": 0.7854652721909359, "grad_norm": 0.34456646067107993, "learning_rate": 9.223511213781091e-06, "loss": 0.0325, "step": 2929 }, { "epoch": 0.7857334406006973, "grad_norm": 0.2864240529300436, "learning_rate": 9.222675939105243e-06, "loss": 0.0273, "step": 2930 }, { "epoch": 0.7860016090104586, "grad_norm": 0.5657466333074619, "learning_rate": 9.22184025327865e-06, "loss": 0.0299, "step": 2931 }, { "epoch": 0.7862697774202199, "grad_norm": 0.34960080141117295, "learning_rate": 9.22100415638268e-06, "loss": 0.0397, "step": 2932 }, { "epoch": 0.7865379458299813, "grad_norm": 0.4910558216783106, "learning_rate": 9.220167648498743e-06, "loss": 0.045, "step": 2933 }, { "epoch": 0.7868061142397426, "grad_norm": 0.39766018944907, "learning_rate": 9.219330729708286e-06, "loss": 0.0303, "step": 2934 }, { "epoch": 0.7870742826495039, "grad_norm": 0.3346817620662647, "learning_rate": 9.218493400092796e-06, "loss": 0.0332, "step": 2935 }, { "epoch": 0.7873424510592653, "grad_norm": 0.40600667941086627, "learning_rate": 9.217655659733807e-06, "loss": 0.0326, "step": 2936 }, { "epoch": 0.7876106194690266, "grad_norm": 0.2918206742183915, "learning_rate": 9.216817508712882e-06, "loss": 0.0311, "step": 2937 }, { "epoch": 0.7878787878787878, "grad_norm": 0.25306235681448896, "learning_rate": 9.21597894711163e-06, "loss": 0.0236, "step": 2938 }, { "epoch": 0.7881469562885492, "grad_norm": 0.3642812214651202, "learning_rate": 9.215139975011705e-06, "loss": 0.039, "step": 2939 }, { "epoch": 0.7884151246983105, "grad_norm": 0.286237346956489, "learning_rate": 9.214300592494789e-06, "loss": 0.0275, "step": 2940 }, { "epoch": 0.7886832931080718, "grad_norm": 0.4393501334033114, "learning_rate": 9.213460799642612e-06, "loss": 0.0399, "step": 2941 }, { "epoch": 0.7889514615178332, "grad_norm": 0.45173118327172107, "learning_rate": 9.212620596536948e-06, "loss": 0.0334, "step": 2942 }, { "epoch": 0.7892196299275945, "grad_norm": 0.434343633182564, "learning_rate": 9.211779983259597e-06, "loss": 0.046, "step": 2943 }, { "epoch": 0.7894877983373558, "grad_norm": 0.29258152473573695, "learning_rate": 9.21093895989241e-06, "loss": 0.03, "step": 2944 }, { "epoch": 0.7897559667471172, "grad_norm": 0.29418289324012686, "learning_rate": 9.210097526517281e-06, "loss": 0.0348, "step": 2945 }, { "epoch": 0.7900241351568785, "grad_norm": 0.3852292607319929, "learning_rate": 9.20925568321613e-06, "loss": 0.0349, "step": 2946 }, { "epoch": 0.7902923035666398, "grad_norm": 0.4279837255662349, "learning_rate": 9.20841343007093e-06, "loss": 0.0391, "step": 2947 }, { "epoch": 0.7905604719764012, "grad_norm": 0.3371197227831816, "learning_rate": 9.207570767163686e-06, "loss": 0.0286, "step": 2948 }, { "epoch": 0.7908286403861625, "grad_norm": 1.2247903958194084, "learning_rate": 9.206727694576448e-06, "loss": 0.0413, "step": 2949 }, { "epoch": 0.7910968087959238, "grad_norm": 0.32768764007714346, "learning_rate": 9.205884212391303e-06, "loss": 0.0372, "step": 2950 }, { "epoch": 0.7913649772056852, "grad_norm": 0.2957693033485463, "learning_rate": 9.205040320690379e-06, "loss": 0.0348, "step": 2951 }, { "epoch": 0.7916331456154465, "grad_norm": 0.25105726877522855, "learning_rate": 9.204196019555842e-06, "loss": 0.0286, "step": 2952 }, { "epoch": 0.7919013140252078, "grad_norm": 0.846676857505455, "learning_rate": 9.203351309069902e-06, "loss": 0.0285, "step": 2953 }, { "epoch": 0.7921694824349692, "grad_norm": 0.3317145111184569, "learning_rate": 9.202506189314803e-06, "loss": 0.0405, "step": 2954 }, { "epoch": 0.7924376508447305, "grad_norm": 0.3337868633636277, "learning_rate": 9.201660660372835e-06, "loss": 0.0421, "step": 2955 }, { "epoch": 0.7927058192544918, "grad_norm": 0.21465959803285392, "learning_rate": 9.200814722326323e-06, "loss": 0.0234, "step": 2956 }, { "epoch": 0.7929739876642532, "grad_norm": 0.3155521926615943, "learning_rate": 9.199968375257635e-06, "loss": 0.0374, "step": 2957 }, { "epoch": 0.7932421560740145, "grad_norm": 0.4547267880282319, "learning_rate": 9.199121619249178e-06, "loss": 0.0389, "step": 2958 }, { "epoch": 0.7935103244837758, "grad_norm": 0.5314352221096742, "learning_rate": 9.198274454383396e-06, "loss": 0.047, "step": 2959 }, { "epoch": 0.7937784928935372, "grad_norm": 0.29606600019463253, "learning_rate": 9.197426880742777e-06, "loss": 0.03, "step": 2960 }, { "epoch": 0.7940466613032985, "grad_norm": 0.3829191972952014, "learning_rate": 9.196578898409847e-06, "loss": 0.0358, "step": 2961 }, { "epoch": 0.7943148297130598, "grad_norm": 0.6983473976847496, "learning_rate": 9.19573050746717e-06, "loss": 0.035, "step": 2962 }, { "epoch": 0.7945829981228212, "grad_norm": 0.48353845768653064, "learning_rate": 9.194881707997355e-06, "loss": 0.0355, "step": 2963 }, { "epoch": 0.7948511665325825, "grad_norm": 0.34744082322204334, "learning_rate": 9.194032500083044e-06, "loss": 0.0286, "step": 2964 }, { "epoch": 0.7951193349423438, "grad_norm": 0.31098731922195244, "learning_rate": 9.193182883806921e-06, "loss": 0.0299, "step": 2965 }, { "epoch": 0.7953875033521052, "grad_norm": 0.5097885362172042, "learning_rate": 9.192332859251719e-06, "loss": 0.0296, "step": 2966 }, { "epoch": 0.7956556717618665, "grad_norm": 0.325767160332601, "learning_rate": 9.191482426500192e-06, "loss": 0.0356, "step": 2967 }, { "epoch": 0.7959238401716278, "grad_norm": 0.4658919716997365, "learning_rate": 9.19063158563515e-06, "loss": 0.03, "step": 2968 }, { "epoch": 0.7961920085813892, "grad_norm": 0.31387209183212905, "learning_rate": 9.189780336739437e-06, "loss": 0.0277, "step": 2969 }, { "epoch": 0.7964601769911505, "grad_norm": 0.34838515313085294, "learning_rate": 9.188928679895938e-06, "loss": 0.0377, "step": 2970 }, { "epoch": 0.7967283454009118, "grad_norm": 0.40982727028896004, "learning_rate": 9.188076615187573e-06, "loss": 0.0379, "step": 2971 }, { "epoch": 0.7969965138106732, "grad_norm": 0.4140543624781629, "learning_rate": 9.18722414269731e-06, "loss": 0.0426, "step": 2972 }, { "epoch": 0.7972646822204345, "grad_norm": 0.3852544884138152, "learning_rate": 9.186371262508144e-06, "loss": 0.0358, "step": 2973 }, { "epoch": 0.7975328506301957, "grad_norm": 0.3271135808761734, "learning_rate": 9.185517974703127e-06, "loss": 0.0253, "step": 2974 }, { "epoch": 0.797801019039957, "grad_norm": 0.39152692049923954, "learning_rate": 9.184664279365334e-06, "loss": 0.0315, "step": 2975 }, { "epoch": 0.7980691874497184, "grad_norm": 0.6384409622919172, "learning_rate": 9.183810176577891e-06, "loss": 0.0483, "step": 2976 }, { "epoch": 0.7983373558594797, "grad_norm": 0.49404448480020785, "learning_rate": 9.182955666423961e-06, "loss": 0.0473, "step": 2977 }, { "epoch": 0.798605524269241, "grad_norm": 0.5881588925218225, "learning_rate": 9.182100748986742e-06, "loss": 0.0569, "step": 2978 }, { "epoch": 0.7988736926790024, "grad_norm": 0.4696415108203217, "learning_rate": 9.181245424349477e-06, "loss": 0.0458, "step": 2979 }, { "epoch": 0.7991418610887637, "grad_norm": 0.3106350506183395, "learning_rate": 9.180389692595444e-06, "loss": 0.0293, "step": 2980 }, { "epoch": 0.799410029498525, "grad_norm": 0.40883137381720336, "learning_rate": 9.179533553807967e-06, "loss": 0.0283, "step": 2981 }, { "epoch": 0.7996781979082864, "grad_norm": 0.29495381881447597, "learning_rate": 9.178677008070404e-06, "loss": 0.0265, "step": 2982 }, { "epoch": 0.7999463663180477, "grad_norm": 0.3299318535064028, "learning_rate": 9.177820055466155e-06, "loss": 0.0267, "step": 2983 }, { "epoch": 0.800214534727809, "grad_norm": 0.5106851715958182, "learning_rate": 9.17696269607866e-06, "loss": 0.0396, "step": 2984 }, { "epoch": 0.8004827031375704, "grad_norm": 0.37280014915346493, "learning_rate": 9.176104929991396e-06, "loss": 0.0408, "step": 2985 }, { "epoch": 0.8007508715473317, "grad_norm": 0.38444075467190897, "learning_rate": 9.175246757287881e-06, "loss": 0.0425, "step": 2986 }, { "epoch": 0.801019039957093, "grad_norm": 0.3502836852687659, "learning_rate": 9.174388178051676e-06, "loss": 0.0269, "step": 2987 }, { "epoch": 0.8012872083668544, "grad_norm": 0.40021572399901684, "learning_rate": 9.173529192366377e-06, "loss": 0.0362, "step": 2988 }, { "epoch": 0.8015553767766157, "grad_norm": 0.30812843746569213, "learning_rate": 9.172669800315619e-06, "loss": 0.0295, "step": 2989 }, { "epoch": 0.801823545186377, "grad_norm": 0.5901777184890611, "learning_rate": 9.171810001983082e-06, "loss": 0.0645, "step": 2990 }, { "epoch": 0.8020917135961384, "grad_norm": 0.3689056433248043, "learning_rate": 9.170949797452481e-06, "loss": 0.0278, "step": 2991 }, { "epoch": 0.8023598820058997, "grad_norm": 0.35330117841691916, "learning_rate": 9.170089186807574e-06, "loss": 0.0413, "step": 2992 }, { "epoch": 0.802628050415661, "grad_norm": 0.30306525495539743, "learning_rate": 9.169228170132151e-06, "loss": 0.0254, "step": 2993 }, { "epoch": 0.8028962188254224, "grad_norm": 0.47532318464789686, "learning_rate": 9.168366747510051e-06, "loss": 0.0406, "step": 2994 }, { "epoch": 0.8031643872351837, "grad_norm": 0.30685209833731675, "learning_rate": 9.16750491902515e-06, "loss": 0.0236, "step": 2995 }, { "epoch": 0.803432555644945, "grad_norm": 0.9273462803496283, "learning_rate": 9.166642684761358e-06, "loss": 0.044, "step": 2996 }, { "epoch": 0.8037007240547064, "grad_norm": 0.591294086170457, "learning_rate": 9.165780044802629e-06, "loss": 0.0362, "step": 2997 }, { "epoch": 0.8039688924644677, "grad_norm": 0.4094740124956683, "learning_rate": 9.164916999232958e-06, "loss": 0.0295, "step": 2998 }, { "epoch": 0.804237060874229, "grad_norm": 0.3992424506482337, "learning_rate": 9.164053548136376e-06, "loss": 0.0321, "step": 2999 }, { "epoch": 0.8045052292839904, "grad_norm": 0.6171502261179619, "learning_rate": 9.163189691596954e-06, "loss": 0.0371, "step": 3000 }, { "epoch": 0.8047733976937517, "grad_norm": 0.272662381707896, "learning_rate": 9.162325429698807e-06, "loss": 0.0228, "step": 3001 }, { "epoch": 0.805041566103513, "grad_norm": 0.45704383560185574, "learning_rate": 9.161460762526082e-06, "loss": 0.0349, "step": 3002 }, { "epoch": 0.8053097345132744, "grad_norm": 0.39471126398923945, "learning_rate": 9.160595690162974e-06, "loss": 0.0351, "step": 3003 }, { "epoch": 0.8055779029230357, "grad_norm": 0.36693805310353855, "learning_rate": 9.159730212693707e-06, "loss": 0.0331, "step": 3004 }, { "epoch": 0.805846071332797, "grad_norm": 0.3495793064558823, "learning_rate": 9.158864330202553e-06, "loss": 0.0367, "step": 3005 }, { "epoch": 0.8061142397425584, "grad_norm": 0.6080548003635315, "learning_rate": 9.157998042773823e-06, "loss": 0.0383, "step": 3006 }, { "epoch": 0.8063824081523197, "grad_norm": 0.42982678015602827, "learning_rate": 9.15713135049186e-06, "loss": 0.0327, "step": 3007 }, { "epoch": 0.806650576562081, "grad_norm": 0.3839392069262633, "learning_rate": 9.156264253441056e-06, "loss": 0.0411, "step": 3008 }, { "epoch": 0.8069187449718423, "grad_norm": 0.5147548557781123, "learning_rate": 9.155396751705836e-06, "loss": 0.0415, "step": 3009 }, { "epoch": 0.8071869133816036, "grad_norm": 0.27929030668574467, "learning_rate": 9.154528845370667e-06, "loss": 0.0304, "step": 3010 }, { "epoch": 0.8074550817913649, "grad_norm": 0.5720679358707791, "learning_rate": 9.153660534520054e-06, "loss": 0.0423, "step": 3011 }, { "epoch": 0.8077232502011263, "grad_norm": 0.39670541702155265, "learning_rate": 9.152791819238542e-06, "loss": 0.0406, "step": 3012 }, { "epoch": 0.8079914186108876, "grad_norm": 0.33911535696143075, "learning_rate": 9.151922699610718e-06, "loss": 0.0391, "step": 3013 }, { "epoch": 0.8082595870206489, "grad_norm": 0.25756459819393707, "learning_rate": 9.151053175721202e-06, "loss": 0.0288, "step": 3014 }, { "epoch": 0.8085277554304103, "grad_norm": 0.25873870855606657, "learning_rate": 9.15018324765466e-06, "loss": 0.0307, "step": 3015 }, { "epoch": 0.8087959238401716, "grad_norm": 0.2824723327497554, "learning_rate": 9.149312915495793e-06, "loss": 0.0312, "step": 3016 }, { "epoch": 0.8090640922499329, "grad_norm": 0.3470452490431448, "learning_rate": 9.148442179329344e-06, "loss": 0.0303, "step": 3017 }, { "epoch": 0.8093322606596943, "grad_norm": 0.3040693577018474, "learning_rate": 9.147571039240095e-06, "loss": 0.0325, "step": 3018 }, { "epoch": 0.8096004290694556, "grad_norm": 0.2810814305780686, "learning_rate": 9.146699495312862e-06, "loss": 0.0329, "step": 3019 }, { "epoch": 0.8098685974792169, "grad_norm": 0.31857804514296867, "learning_rate": 9.145827547632512e-06, "loss": 0.0264, "step": 3020 }, { "epoch": 0.8101367658889783, "grad_norm": 0.32349264980699577, "learning_rate": 9.14495519628394e-06, "loss": 0.028, "step": 3021 }, { "epoch": 0.8104049342987396, "grad_norm": 0.2674122395201621, "learning_rate": 9.144082441352084e-06, "loss": 0.0273, "step": 3022 }, { "epoch": 0.8106731027085009, "grad_norm": 0.2604662455400156, "learning_rate": 9.143209282921924e-06, "loss": 0.0251, "step": 3023 }, { "epoch": 0.8109412711182623, "grad_norm": 0.3456059845859436, "learning_rate": 9.142335721078475e-06, "loss": 0.04, "step": 3024 }, { "epoch": 0.8112094395280236, "grad_norm": 0.31504184947130953, "learning_rate": 9.141461755906795e-06, "loss": 0.0271, "step": 3025 }, { "epoch": 0.8114776079377849, "grad_norm": 0.2742502967078285, "learning_rate": 9.140587387491979e-06, "loss": 0.0261, "step": 3026 }, { "epoch": 0.8117457763475463, "grad_norm": 0.33228275138759356, "learning_rate": 9.139712615919163e-06, "loss": 0.0289, "step": 3027 }, { "epoch": 0.8120139447573076, "grad_norm": 0.28690398005725837, "learning_rate": 9.13883744127352e-06, "loss": 0.0339, "step": 3028 }, { "epoch": 0.8122821131670689, "grad_norm": 0.36197665868251, "learning_rate": 9.137961863640263e-06, "loss": 0.0318, "step": 3029 }, { "epoch": 0.8125502815768303, "grad_norm": 0.5392236751847658, "learning_rate": 9.137085883104648e-06, "loss": 0.0551, "step": 3030 }, { "epoch": 0.8128184499865916, "grad_norm": 0.4478948689781035, "learning_rate": 9.136209499751962e-06, "loss": 0.042, "step": 3031 }, { "epoch": 0.8130866183963529, "grad_norm": 0.3211957503723893, "learning_rate": 9.135332713667538e-06, "loss": 0.0257, "step": 3032 }, { "epoch": 0.8133547868061143, "grad_norm": 0.26994167133789987, "learning_rate": 9.134455524936749e-06, "loss": 0.0303, "step": 3033 }, { "epoch": 0.8136229552158756, "grad_norm": 0.5568876154616585, "learning_rate": 9.133577933644999e-06, "loss": 0.0278, "step": 3034 }, { "epoch": 0.8138911236256369, "grad_norm": 0.3306401385621047, "learning_rate": 9.132699939877744e-06, "loss": 0.0228, "step": 3035 }, { "epoch": 0.8141592920353983, "grad_norm": 0.39528198039001566, "learning_rate": 9.131821543720464e-06, "loss": 0.0436, "step": 3036 }, { "epoch": 0.8144274604451596, "grad_norm": 0.31628972788774434, "learning_rate": 9.130942745258694e-06, "loss": 0.0293, "step": 3037 }, { "epoch": 0.8146956288549209, "grad_norm": 0.30613185330806464, "learning_rate": 9.130063544577993e-06, "loss": 0.0279, "step": 3038 }, { "epoch": 0.8149637972646823, "grad_norm": 0.374751125623404, "learning_rate": 9.129183941763971e-06, "loss": 0.037, "step": 3039 }, { "epoch": 0.8152319656744436, "grad_norm": 0.4063969657160524, "learning_rate": 9.128303936902272e-06, "loss": 0.0338, "step": 3040 }, { "epoch": 0.8155001340842049, "grad_norm": 0.28772467941568475, "learning_rate": 9.127423530078578e-06, "loss": 0.0268, "step": 3041 }, { "epoch": 0.8157683024939663, "grad_norm": 0.3127625889624516, "learning_rate": 9.126542721378613e-06, "loss": 0.0304, "step": 3042 }, { "epoch": 0.8160364709037276, "grad_norm": 0.3429209130916632, "learning_rate": 9.12566151088814e-06, "loss": 0.0375, "step": 3043 }, { "epoch": 0.8163046393134888, "grad_norm": 0.2551717373100411, "learning_rate": 9.124779898692959e-06, "loss": 0.0253, "step": 3044 }, { "epoch": 0.8165728077232502, "grad_norm": 0.30956001862151933, "learning_rate": 9.123897884878909e-06, "loss": 0.0356, "step": 3045 }, { "epoch": 0.8168409761330115, "grad_norm": 0.6829366931363557, "learning_rate": 9.123015469531872e-06, "loss": 0.0454, "step": 3046 }, { "epoch": 0.8171091445427728, "grad_norm": 0.4532034604499228, "learning_rate": 9.122132652737765e-06, "loss": 0.0397, "step": 3047 }, { "epoch": 0.8173773129525342, "grad_norm": 0.29280328713506043, "learning_rate": 9.121249434582545e-06, "loss": 0.0296, "step": 3048 }, { "epoch": 0.8176454813622955, "grad_norm": 0.30626780408156906, "learning_rate": 9.12036581515221e-06, "loss": 0.0434, "step": 3049 }, { "epoch": 0.8179136497720568, "grad_norm": 0.30361495489996665, "learning_rate": 9.119481794532794e-06, "loss": 0.0328, "step": 3050 }, { "epoch": 0.8181818181818182, "grad_norm": 0.3683483553884046, "learning_rate": 9.118597372810374e-06, "loss": 0.0328, "step": 3051 }, { "epoch": 0.8184499865915795, "grad_norm": 0.3869111290161808, "learning_rate": 9.11771255007106e-06, "loss": 0.0364, "step": 3052 }, { "epoch": 0.8187181550013408, "grad_norm": 0.30593473785642084, "learning_rate": 9.11682732640101e-06, "loss": 0.0359, "step": 3053 }, { "epoch": 0.8189863234111022, "grad_norm": 1.1506624742367686, "learning_rate": 9.115941701886412e-06, "loss": 0.0372, "step": 3054 }, { "epoch": 0.8192544918208635, "grad_norm": 0.38895242082064607, "learning_rate": 9.115055676613498e-06, "loss": 0.036, "step": 3055 }, { "epoch": 0.8195226602306248, "grad_norm": 0.40728702812150686, "learning_rate": 9.114169250668539e-06, "loss": 0.0309, "step": 3056 }, { "epoch": 0.8197908286403862, "grad_norm": 0.30913350372350334, "learning_rate": 9.113282424137842e-06, "loss": 0.0454, "step": 3057 }, { "epoch": 0.8200589970501475, "grad_norm": 0.608336766134122, "learning_rate": 9.112395197107757e-06, "loss": 0.0313, "step": 3058 }, { "epoch": 0.8203271654599088, "grad_norm": 0.3376312625315893, "learning_rate": 9.111507569664668e-06, "loss": 0.0322, "step": 3059 }, { "epoch": 0.8205953338696702, "grad_norm": 0.45436284478093475, "learning_rate": 9.110619541895006e-06, "loss": 0.0385, "step": 3060 }, { "epoch": 0.8208635022794315, "grad_norm": 0.36910999613327167, "learning_rate": 9.10973111388523e-06, "loss": 0.025, "step": 3061 }, { "epoch": 0.8211316706891928, "grad_norm": 0.5460036612796674, "learning_rate": 9.108842285721846e-06, "loss": 0.0383, "step": 3062 }, { "epoch": 0.8213998390989542, "grad_norm": 0.3670920836595705, "learning_rate": 9.107953057491399e-06, "loss": 0.0375, "step": 3063 }, { "epoch": 0.8216680075087155, "grad_norm": 0.37793424664600395, "learning_rate": 9.10706342928047e-06, "loss": 0.0308, "step": 3064 }, { "epoch": 0.8219361759184768, "grad_norm": 0.5431083712142541, "learning_rate": 9.106173401175677e-06, "loss": 0.052, "step": 3065 }, { "epoch": 0.8222043443282381, "grad_norm": 0.42426652187567626, "learning_rate": 9.105282973263682e-06, "loss": 0.0393, "step": 3066 }, { "epoch": 0.8224725127379995, "grad_norm": 0.5028644869569973, "learning_rate": 9.104392145631183e-06, "loss": 0.0322, "step": 3067 }, { "epoch": 0.8227406811477608, "grad_norm": 0.6119837149316172, "learning_rate": 9.103500918364917e-06, "loss": 0.0469, "step": 3068 }, { "epoch": 0.8230088495575221, "grad_norm": 0.38931155191318556, "learning_rate": 9.102609291551664e-06, "loss": 0.0384, "step": 3069 }, { "epoch": 0.8232770179672835, "grad_norm": 0.36962275632493946, "learning_rate": 9.101717265278235e-06, "loss": 0.0344, "step": 3070 }, { "epoch": 0.8235451863770448, "grad_norm": 0.30944197089227476, "learning_rate": 9.100824839631485e-06, "loss": 0.0314, "step": 3071 }, { "epoch": 0.8238133547868061, "grad_norm": 0.3783513842794875, "learning_rate": 9.09993201469831e-06, "loss": 0.0364, "step": 3072 }, { "epoch": 0.8240815231965675, "grad_norm": 0.4389371095230446, "learning_rate": 9.099038790565638e-06, "loss": 0.0418, "step": 3073 }, { "epoch": 0.8243496916063288, "grad_norm": 0.32554529847844593, "learning_rate": 9.098145167320442e-06, "loss": 0.0262, "step": 3074 }, { "epoch": 0.82461786001609, "grad_norm": 0.33860790537922875, "learning_rate": 9.09725114504973e-06, "loss": 0.0364, "step": 3075 }, { "epoch": 0.8248860284258515, "grad_norm": 0.32978994764763964, "learning_rate": 9.096356723840555e-06, "loss": 0.0332, "step": 3076 }, { "epoch": 0.8251541968356128, "grad_norm": 0.5680882997051623, "learning_rate": 9.095461903779999e-06, "loss": 0.0276, "step": 3077 }, { "epoch": 0.825422365245374, "grad_norm": 0.3331176452849128, "learning_rate": 9.094566684955191e-06, "loss": 0.0311, "step": 3078 }, { "epoch": 0.8256905336551355, "grad_norm": 0.3278046495070281, "learning_rate": 9.093671067453296e-06, "loss": 0.0351, "step": 3079 }, { "epoch": 0.8259587020648967, "grad_norm": 0.25079030749389003, "learning_rate": 9.092775051361519e-06, "loss": 0.0329, "step": 3080 }, { "epoch": 0.826226870474658, "grad_norm": 0.4086511740591196, "learning_rate": 9.0918786367671e-06, "loss": 0.0333, "step": 3081 }, { "epoch": 0.8264950388844194, "grad_norm": 0.3127012071838921, "learning_rate": 9.090981823757322e-06, "loss": 0.0241, "step": 3082 }, { "epoch": 0.8267632072941807, "grad_norm": 0.4489518033135561, "learning_rate": 9.090084612419506e-06, "loss": 0.044, "step": 3083 }, { "epoch": 0.827031375703942, "grad_norm": 0.5340948978724632, "learning_rate": 9.089187002841008e-06, "loss": 0.038, "step": 3084 }, { "epoch": 0.8272995441137034, "grad_norm": 0.2978179656184194, "learning_rate": 9.08828899510923e-06, "loss": 0.018, "step": 3085 }, { "epoch": 0.8275677125234647, "grad_norm": 0.40182384092960544, "learning_rate": 9.087390589311606e-06, "loss": 0.0357, "step": 3086 }, { "epoch": 0.827835880933226, "grad_norm": 0.3523924782387512, "learning_rate": 9.086491785535613e-06, "loss": 0.0371, "step": 3087 }, { "epoch": 0.8281040493429874, "grad_norm": 0.33564387106327137, "learning_rate": 9.085592583868765e-06, "loss": 0.0244, "step": 3088 }, { "epoch": 0.8283722177527487, "grad_norm": 0.5159426099321865, "learning_rate": 9.084692984398612e-06, "loss": 0.0344, "step": 3089 }, { "epoch": 0.82864038616251, "grad_norm": 0.37687601396302756, "learning_rate": 9.083792987212749e-06, "loss": 0.0325, "step": 3090 }, { "epoch": 0.8289085545722714, "grad_norm": 0.5465975755904534, "learning_rate": 9.082892592398804e-06, "loss": 0.033, "step": 3091 }, { "epoch": 0.8291767229820327, "grad_norm": 0.2717473455166893, "learning_rate": 9.08199180004445e-06, "loss": 0.03, "step": 3092 }, { "epoch": 0.829444891391794, "grad_norm": 0.5581666388459535, "learning_rate": 9.081090610237388e-06, "loss": 0.0315, "step": 3093 }, { "epoch": 0.8297130598015554, "grad_norm": 0.2894183770250827, "learning_rate": 9.080189023065371e-06, "loss": 0.0224, "step": 3094 }, { "epoch": 0.8299812282113167, "grad_norm": 0.3082148665308753, "learning_rate": 9.07928703861618e-06, "loss": 0.0389, "step": 3095 }, { "epoch": 0.830249396621078, "grad_norm": 0.22991995262418755, "learning_rate": 9.078384656977642e-06, "loss": 0.0257, "step": 3096 }, { "epoch": 0.8305175650308394, "grad_norm": 0.7300450657758292, "learning_rate": 9.077481878237618e-06, "loss": 0.0577, "step": 3097 }, { "epoch": 0.8307857334406007, "grad_norm": 0.30654389943120847, "learning_rate": 9.076578702484007e-06, "loss": 0.0274, "step": 3098 }, { "epoch": 0.831053901850362, "grad_norm": 0.24124307359018687, "learning_rate": 9.07567512980475e-06, "loss": 0.0218, "step": 3099 }, { "epoch": 0.8313220702601234, "grad_norm": 0.3875661642554389, "learning_rate": 9.07477116028783e-06, "loss": 0.0381, "step": 3100 }, { "epoch": 0.8315902386698847, "grad_norm": 0.4334348219983835, "learning_rate": 9.073866794021258e-06, "loss": 0.0414, "step": 3101 }, { "epoch": 0.831858407079646, "grad_norm": 0.8260876271711652, "learning_rate": 9.072962031093093e-06, "loss": 0.0374, "step": 3102 }, { "epoch": 0.8321265754894074, "grad_norm": 0.34637611007844676, "learning_rate": 9.072056871591428e-06, "loss": 0.0285, "step": 3103 }, { "epoch": 0.8323947438991687, "grad_norm": 0.2868896701001528, "learning_rate": 9.071151315604396e-06, "loss": 0.029, "step": 3104 }, { "epoch": 0.83266291230893, "grad_norm": 0.4918340253657461, "learning_rate": 9.070245363220172e-06, "loss": 0.0319, "step": 3105 }, { "epoch": 0.8329310807186914, "grad_norm": 0.3190149869348753, "learning_rate": 9.06933901452696e-06, "loss": 0.034, "step": 3106 }, { "epoch": 0.8331992491284527, "grad_norm": 0.44155784116958563, "learning_rate": 9.068432269613012e-06, "loss": 0.0392, "step": 3107 }, { "epoch": 0.833467417538214, "grad_norm": 0.4072642396779156, "learning_rate": 9.067525128566617e-06, "loss": 0.0341, "step": 3108 }, { "epoch": 0.8337355859479754, "grad_norm": 0.30112994993753417, "learning_rate": 9.066617591476099e-06, "loss": 0.0289, "step": 3109 }, { "epoch": 0.8340037543577367, "grad_norm": 0.44499814112431435, "learning_rate": 9.065709658429823e-06, "loss": 0.0412, "step": 3110 }, { "epoch": 0.834271922767498, "grad_norm": 0.28479295369719443, "learning_rate": 9.064801329516192e-06, "loss": 0.03, "step": 3111 }, { "epoch": 0.8345400911772594, "grad_norm": 0.39129826086620084, "learning_rate": 9.063892604823647e-06, "loss": 0.0249, "step": 3112 }, { "epoch": 0.8348082595870207, "grad_norm": 0.38430350964101617, "learning_rate": 9.062983484440668e-06, "loss": 0.0386, "step": 3113 }, { "epoch": 0.835076427996782, "grad_norm": 0.37062951028795926, "learning_rate": 9.062073968455773e-06, "loss": 0.0378, "step": 3114 }, { "epoch": 0.8353445964065433, "grad_norm": 0.4369090690362323, "learning_rate": 9.061164056957523e-06, "loss": 0.0406, "step": 3115 }, { "epoch": 0.8356127648163046, "grad_norm": 0.26945041050326907, "learning_rate": 9.06025375003451e-06, "loss": 0.0236, "step": 3116 }, { "epoch": 0.8358809332260659, "grad_norm": 0.43478029030002374, "learning_rate": 9.059343047775368e-06, "loss": 0.032, "step": 3117 }, { "epoch": 0.8361491016358273, "grad_norm": 0.2865719118338404, "learning_rate": 9.058431950268773e-06, "loss": 0.0304, "step": 3118 }, { "epoch": 0.8364172700455886, "grad_norm": 0.34997029572149246, "learning_rate": 9.057520457603433e-06, "loss": 0.0301, "step": 3119 }, { "epoch": 0.8366854384553499, "grad_norm": 0.4465879469737071, "learning_rate": 9.056608569868098e-06, "loss": 0.0383, "step": 3120 }, { "epoch": 0.8369536068651113, "grad_norm": 0.6074705756134123, "learning_rate": 9.055696287151556e-06, "loss": 0.0455, "step": 3121 }, { "epoch": 0.8372217752748726, "grad_norm": 0.3705844393107477, "learning_rate": 9.054783609542634e-06, "loss": 0.0365, "step": 3122 }, { "epoch": 0.8374899436846339, "grad_norm": 0.3909780785040752, "learning_rate": 9.053870537130198e-06, "loss": 0.0384, "step": 3123 }, { "epoch": 0.8377581120943953, "grad_norm": 0.3485549622796398, "learning_rate": 9.052957070003152e-06, "loss": 0.0251, "step": 3124 }, { "epoch": 0.8380262805041566, "grad_norm": 0.43383363917469075, "learning_rate": 9.052043208250435e-06, "loss": 0.0468, "step": 3125 }, { "epoch": 0.8382944489139179, "grad_norm": 0.3426200470566251, "learning_rate": 9.05112895196103e-06, "loss": 0.0377, "step": 3126 }, { "epoch": 0.8385626173236793, "grad_norm": 0.31915652098840464, "learning_rate": 9.050214301223952e-06, "loss": 0.0361, "step": 3127 }, { "epoch": 0.8388307857334406, "grad_norm": 0.2967358238652847, "learning_rate": 9.049299256128263e-06, "loss": 0.0199, "step": 3128 }, { "epoch": 0.8390989541432019, "grad_norm": 0.37109728605334674, "learning_rate": 9.048383816763056e-06, "loss": 0.0332, "step": 3129 }, { "epoch": 0.8393671225529633, "grad_norm": 0.4029886474786045, "learning_rate": 9.047467983217464e-06, "loss": 0.0305, "step": 3130 }, { "epoch": 0.8396352909627246, "grad_norm": 0.4449371363808408, "learning_rate": 9.04655175558066e-06, "loss": 0.0318, "step": 3131 }, { "epoch": 0.8399034593724859, "grad_norm": 0.32005270812870085, "learning_rate": 9.045635133941855e-06, "loss": 0.0364, "step": 3132 }, { "epoch": 0.8401716277822473, "grad_norm": 0.49584681897023297, "learning_rate": 9.044718118390299e-06, "loss": 0.0269, "step": 3133 }, { "epoch": 0.8404397961920086, "grad_norm": 0.31728975496481265, "learning_rate": 9.043800709015278e-06, "loss": 0.0357, "step": 3134 }, { "epoch": 0.8407079646017699, "grad_norm": 0.3453864283996984, "learning_rate": 9.042882905906118e-06, "loss": 0.0332, "step": 3135 }, { "epoch": 0.8409761330115313, "grad_norm": 0.4074679565986499, "learning_rate": 9.041964709152183e-06, "loss": 0.0369, "step": 3136 }, { "epoch": 0.8412443014212926, "grad_norm": 0.2910588243536259, "learning_rate": 9.041046118842874e-06, "loss": 0.0283, "step": 3137 }, { "epoch": 0.8415124698310539, "grad_norm": 0.44745177748835996, "learning_rate": 9.040127135067636e-06, "loss": 0.0365, "step": 3138 }, { "epoch": 0.8417806382408153, "grad_norm": 0.29617703208644147, "learning_rate": 9.039207757915942e-06, "loss": 0.0342, "step": 3139 }, { "epoch": 0.8420488066505766, "grad_norm": 0.362173470727667, "learning_rate": 9.038287987477314e-06, "loss": 0.0358, "step": 3140 }, { "epoch": 0.8423169750603379, "grad_norm": 0.341053417337464, "learning_rate": 9.037367823841308e-06, "loss": 0.0275, "step": 3141 }, { "epoch": 0.8425851434700993, "grad_norm": 0.39031860077077973, "learning_rate": 9.036447267097514e-06, "loss": 0.0362, "step": 3142 }, { "epoch": 0.8428533118798606, "grad_norm": 0.3492842547118297, "learning_rate": 9.035526317335567e-06, "loss": 0.0364, "step": 3143 }, { "epoch": 0.8431214802896219, "grad_norm": 0.38848250292252945, "learning_rate": 9.034604974645136e-06, "loss": 0.0314, "step": 3144 }, { "epoch": 0.8433896486993833, "grad_norm": 0.3153718121194499, "learning_rate": 9.03368323911593e-06, "loss": 0.0336, "step": 3145 }, { "epoch": 0.8436578171091446, "grad_norm": 0.3795468649491991, "learning_rate": 9.032761110837696e-06, "loss": 0.04, "step": 3146 }, { "epoch": 0.8439259855189059, "grad_norm": 0.25117989891886827, "learning_rate": 9.03183858990022e-06, "loss": 0.0316, "step": 3147 }, { "epoch": 0.8441941539286673, "grad_norm": 0.4281617229993968, "learning_rate": 9.030915676393326e-06, "loss": 0.0426, "step": 3148 }, { "epoch": 0.8444623223384286, "grad_norm": 0.30421827082506114, "learning_rate": 9.029992370406872e-06, "loss": 0.0294, "step": 3149 }, { "epoch": 0.8447304907481898, "grad_norm": 0.3227374630994128, "learning_rate": 9.029068672030764e-06, "loss": 0.0268, "step": 3150 }, { "epoch": 0.8449986591579512, "grad_norm": 0.7705866503023938, "learning_rate": 9.028144581354932e-06, "loss": 0.0591, "step": 3151 }, { "epoch": 0.8452668275677125, "grad_norm": 0.3443625661788152, "learning_rate": 9.02722009846936e-06, "loss": 0.0317, "step": 3152 }, { "epoch": 0.8455349959774738, "grad_norm": 0.5854392657362937, "learning_rate": 9.026295223464056e-06, "loss": 0.044, "step": 3153 }, { "epoch": 0.8458031643872351, "grad_norm": 0.24733972724772027, "learning_rate": 9.025369956429077e-06, "loss": 0.0252, "step": 3154 }, { "epoch": 0.8460713327969965, "grad_norm": 0.3728444055790479, "learning_rate": 9.024444297454515e-06, "loss": 0.0325, "step": 3155 }, { "epoch": 0.8463395012067578, "grad_norm": 0.22894489727878467, "learning_rate": 9.023518246630494e-06, "loss": 0.0264, "step": 3156 }, { "epoch": 0.8466076696165191, "grad_norm": 0.6619200610442779, "learning_rate": 9.022591804047183e-06, "loss": 0.0435, "step": 3157 }, { "epoch": 0.8468758380262805, "grad_norm": 0.3090323040461443, "learning_rate": 9.02166496979479e-06, "loss": 0.0282, "step": 3158 }, { "epoch": 0.8471440064360418, "grad_norm": 0.4461665208429402, "learning_rate": 9.020737743963555e-06, "loss": 0.0437, "step": 3159 }, { "epoch": 0.8474121748458031, "grad_norm": 0.4745944533188282, "learning_rate": 9.01981012664376e-06, "loss": 0.0554, "step": 3160 }, { "epoch": 0.8476803432555645, "grad_norm": 0.34101599772660124, "learning_rate": 9.018882117925726e-06, "loss": 0.0243, "step": 3161 }, { "epoch": 0.8479485116653258, "grad_norm": 0.25307496336010993, "learning_rate": 9.01795371789981e-06, "loss": 0.0214, "step": 3162 }, { "epoch": 0.8482166800750871, "grad_norm": 0.4041079798718042, "learning_rate": 9.017024926656408e-06, "loss": 0.0346, "step": 3163 }, { "epoch": 0.8484848484848485, "grad_norm": 0.3346664683517816, "learning_rate": 9.016095744285955e-06, "loss": 0.0352, "step": 3164 }, { "epoch": 0.8487530168946098, "grad_norm": 0.3183958910192475, "learning_rate": 9.015166170878919e-06, "loss": 0.0281, "step": 3165 }, { "epoch": 0.8490211853043711, "grad_norm": 0.31849158753590223, "learning_rate": 9.014236206525815e-06, "loss": 0.0319, "step": 3166 }, { "epoch": 0.8492893537141325, "grad_norm": 0.3657656980748535, "learning_rate": 9.01330585131719e-06, "loss": 0.0369, "step": 3167 }, { "epoch": 0.8495575221238938, "grad_norm": 0.39188994611326783, "learning_rate": 9.012375105343628e-06, "loss": 0.0379, "step": 3168 }, { "epoch": 0.8498256905336551, "grad_norm": 0.3985568203003311, "learning_rate": 9.011443968695756e-06, "loss": 0.0492, "step": 3169 }, { "epoch": 0.8500938589434165, "grad_norm": 0.31146790713106864, "learning_rate": 9.010512441464233e-06, "loss": 0.031, "step": 3170 }, { "epoch": 0.8503620273531778, "grad_norm": 0.3813254373964755, "learning_rate": 9.009580523739763e-06, "loss": 0.0362, "step": 3171 }, { "epoch": 0.8506301957629391, "grad_norm": 0.43182819013375173, "learning_rate": 9.008648215613082e-06, "loss": 0.0401, "step": 3172 }, { "epoch": 0.8508983641727005, "grad_norm": 0.3577038071168452, "learning_rate": 9.007715517174966e-06, "loss": 0.0325, "step": 3173 }, { "epoch": 0.8511665325824618, "grad_norm": 0.29550644536017023, "learning_rate": 9.006782428516232e-06, "loss": 0.0364, "step": 3174 }, { "epoch": 0.8514347009922231, "grad_norm": 0.28672215170160686, "learning_rate": 9.005848949727729e-06, "loss": 0.0343, "step": 3175 }, { "epoch": 0.8517028694019845, "grad_norm": 0.5555534676425643, "learning_rate": 9.004915080900352e-06, "loss": 0.0394, "step": 3176 }, { "epoch": 0.8519710378117458, "grad_norm": 0.33193332563266886, "learning_rate": 9.003980822125024e-06, "loss": 0.0287, "step": 3177 }, { "epoch": 0.8522392062215071, "grad_norm": 0.5095974674079591, "learning_rate": 9.003046173492713e-06, "loss": 0.0311, "step": 3178 }, { "epoch": 0.8525073746312685, "grad_norm": 0.47380994656224745, "learning_rate": 9.002111135094426e-06, "loss": 0.0596, "step": 3179 }, { "epoch": 0.8527755430410298, "grad_norm": 0.4185313007132331, "learning_rate": 9.001175707021202e-06, "loss": 0.0344, "step": 3180 }, { "epoch": 0.853043711450791, "grad_norm": 0.5477696917652317, "learning_rate": 9.000239889364123e-06, "loss": 0.0342, "step": 3181 }, { "epoch": 0.8533118798605525, "grad_norm": 0.34776106384848265, "learning_rate": 8.999303682214307e-06, "loss": 0.0315, "step": 3182 }, { "epoch": 0.8535800482703138, "grad_norm": 0.32848918843377956, "learning_rate": 8.998367085662908e-06, "loss": 0.0292, "step": 3183 }, { "epoch": 0.853848216680075, "grad_norm": 0.37073682237609096, "learning_rate": 8.997430099801122e-06, "loss": 0.0406, "step": 3184 }, { "epoch": 0.8541163850898364, "grad_norm": 0.3436111221285223, "learning_rate": 8.996492724720181e-06, "loss": 0.0393, "step": 3185 }, { "epoch": 0.8543845534995977, "grad_norm": 0.3792804269836349, "learning_rate": 8.995554960511352e-06, "loss": 0.0365, "step": 3186 }, { "epoch": 0.854652721909359, "grad_norm": 0.3466226889155668, "learning_rate": 8.994616807265945e-06, "loss": 0.0322, "step": 3187 }, { "epoch": 0.8549208903191204, "grad_norm": 0.43391486593982026, "learning_rate": 8.993678265075305e-06, "loss": 0.0263, "step": 3188 }, { "epoch": 0.8551890587288817, "grad_norm": 0.3492127444934099, "learning_rate": 8.992739334030815e-06, "loss": 0.0392, "step": 3189 }, { "epoch": 0.855457227138643, "grad_norm": 0.24623676316287096, "learning_rate": 8.9918000142239e-06, "loss": 0.0274, "step": 3190 }, { "epoch": 0.8557253955484044, "grad_norm": 0.3669496130756555, "learning_rate": 8.990860305746012e-06, "loss": 0.0375, "step": 3191 }, { "epoch": 0.8559935639581657, "grad_norm": 0.3500109449136609, "learning_rate": 8.989920208688652e-06, "loss": 0.0302, "step": 3192 }, { "epoch": 0.856261732367927, "grad_norm": 0.32621100491343347, "learning_rate": 8.988979723143354e-06, "loss": 0.0288, "step": 3193 }, { "epoch": 0.8565299007776884, "grad_norm": 0.6814033336907981, "learning_rate": 8.988038849201692e-06, "loss": 0.0335, "step": 3194 }, { "epoch": 0.8567980691874497, "grad_norm": 0.358976994800179, "learning_rate": 8.987097586955276e-06, "loss": 0.0271, "step": 3195 }, { "epoch": 0.857066237597211, "grad_norm": 0.2608819323564406, "learning_rate": 8.986155936495751e-06, "loss": 0.0255, "step": 3196 }, { "epoch": 0.8573344060069724, "grad_norm": 0.41455994412143765, "learning_rate": 8.985213897914808e-06, "loss": 0.0467, "step": 3197 }, { "epoch": 0.8576025744167337, "grad_norm": 0.3229304118858955, "learning_rate": 8.984271471304167e-06, "loss": 0.0239, "step": 3198 }, { "epoch": 0.857870742826495, "grad_norm": 0.7222250436027328, "learning_rate": 8.98332865675559e-06, "loss": 0.052, "step": 3199 }, { "epoch": 0.8581389112362564, "grad_norm": 0.35502807835787314, "learning_rate": 8.98238545436088e-06, "loss": 0.0317, "step": 3200 }, { "epoch": 0.8584070796460177, "grad_norm": 0.39044392043991466, "learning_rate": 8.981441864211869e-06, "loss": 0.0438, "step": 3201 }, { "epoch": 0.858675248055779, "grad_norm": 0.3282572212354986, "learning_rate": 8.980497886400435e-06, "loss": 0.0311, "step": 3202 }, { "epoch": 0.8589434164655404, "grad_norm": 0.3019406058203457, "learning_rate": 8.97955352101849e-06, "loss": 0.0319, "step": 3203 }, { "epoch": 0.8592115848753017, "grad_norm": 0.27288360170442766, "learning_rate": 8.978608768157984e-06, "loss": 0.0296, "step": 3204 }, { "epoch": 0.859479753285063, "grad_norm": 0.38901088771041836, "learning_rate": 8.977663627910903e-06, "loss": 0.0358, "step": 3205 }, { "epoch": 0.8597479216948244, "grad_norm": 0.28495683773255065, "learning_rate": 8.97671810036928e-06, "loss": 0.0287, "step": 3206 }, { "epoch": 0.8600160901045857, "grad_norm": 0.7244943475156721, "learning_rate": 8.97577218562517e-06, "loss": 0.0557, "step": 3207 }, { "epoch": 0.860284258514347, "grad_norm": 0.26255045452274306, "learning_rate": 8.974825883770678e-06, "loss": 0.0285, "step": 3208 }, { "epoch": 0.8605524269241084, "grad_norm": 0.4006075356604569, "learning_rate": 8.973879194897942e-06, "loss": 0.0418, "step": 3209 }, { "epoch": 0.8608205953338697, "grad_norm": 0.32337104089515767, "learning_rate": 8.972932119099142e-06, "loss": 0.0354, "step": 3210 }, { "epoch": 0.861088763743631, "grad_norm": 0.372123180523371, "learning_rate": 8.971984656466488e-06, "loss": 0.041, "step": 3211 }, { "epoch": 0.8613569321533924, "grad_norm": 0.36017765223260895, "learning_rate": 8.971036807092233e-06, "loss": 0.0393, "step": 3212 }, { "epoch": 0.8616251005631537, "grad_norm": 0.2943371140137503, "learning_rate": 8.970088571068667e-06, "loss": 0.026, "step": 3213 }, { "epoch": 0.861893268972915, "grad_norm": 0.4696854845032073, "learning_rate": 8.969139948488116e-06, "loss": 0.0324, "step": 3214 }, { "epoch": 0.8621614373826764, "grad_norm": 0.2966347142680746, "learning_rate": 8.968190939442948e-06, "loss": 0.0258, "step": 3215 }, { "epoch": 0.8624296057924377, "grad_norm": 0.33896000040923613, "learning_rate": 8.967241544025564e-06, "loss": 0.0261, "step": 3216 }, { "epoch": 0.862697774202199, "grad_norm": 0.321391771592232, "learning_rate": 8.966291762328404e-06, "loss": 0.0314, "step": 3217 }, { "epoch": 0.8629659426119604, "grad_norm": 0.382194289813668, "learning_rate": 8.965341594443944e-06, "loss": 0.0443, "step": 3218 }, { "epoch": 0.8632341110217217, "grad_norm": 0.3550964040323414, "learning_rate": 8.964391040464699e-06, "loss": 0.0358, "step": 3219 }, { "epoch": 0.8635022794314829, "grad_norm": 0.4411124751548579, "learning_rate": 8.963440100483228e-06, "loss": 0.04, "step": 3220 }, { "epoch": 0.8637704478412443, "grad_norm": 0.631411787320404, "learning_rate": 8.962488774592113e-06, "loss": 0.0326, "step": 3221 }, { "epoch": 0.8640386162510056, "grad_norm": 0.28770758140320707, "learning_rate": 8.961537062883989e-06, "loss": 0.0368, "step": 3222 }, { "epoch": 0.8643067846607669, "grad_norm": 0.5407600269692727, "learning_rate": 8.960584965451517e-06, "loss": 0.0417, "step": 3223 }, { "epoch": 0.8645749530705283, "grad_norm": 0.28969312791378926, "learning_rate": 8.959632482387404e-06, "loss": 0.0286, "step": 3224 }, { "epoch": 0.8648431214802896, "grad_norm": 0.34774294214349366, "learning_rate": 8.958679613784387e-06, "loss": 0.04, "step": 3225 }, { "epoch": 0.8651112898900509, "grad_norm": 0.2809542274150366, "learning_rate": 8.957726359735245e-06, "loss": 0.0304, "step": 3226 }, { "epoch": 0.8653794582998123, "grad_norm": 0.3701205745189522, "learning_rate": 8.956772720332799e-06, "loss": 0.0481, "step": 3227 }, { "epoch": 0.8656476267095736, "grad_norm": 0.38449005232171146, "learning_rate": 8.955818695669895e-06, "loss": 0.0375, "step": 3228 }, { "epoch": 0.8659157951193349, "grad_norm": 0.3150166236925231, "learning_rate": 8.954864285839428e-06, "loss": 0.0289, "step": 3229 }, { "epoch": 0.8661839635290963, "grad_norm": 0.5767512780363198, "learning_rate": 8.953909490934327e-06, "loss": 0.0586, "step": 3230 }, { "epoch": 0.8664521319388576, "grad_norm": 0.2885328817364179, "learning_rate": 8.952954311047554e-06, "loss": 0.0366, "step": 3231 }, { "epoch": 0.8667203003486189, "grad_norm": 0.4288043198371314, "learning_rate": 8.951998746272115e-06, "loss": 0.0413, "step": 3232 }, { "epoch": 0.8669884687583803, "grad_norm": 0.4649732287926261, "learning_rate": 8.951042796701051e-06, "loss": 0.0377, "step": 3233 }, { "epoch": 0.8672566371681416, "grad_norm": 0.378922569249171, "learning_rate": 8.95008646242744e-06, "loss": 0.0399, "step": 3234 }, { "epoch": 0.8675248055779029, "grad_norm": 0.33603473191199146, "learning_rate": 8.949129743544396e-06, "loss": 0.0322, "step": 3235 }, { "epoch": 0.8677929739876643, "grad_norm": 0.3940563696723601, "learning_rate": 8.948172640145076e-06, "loss": 0.0378, "step": 3236 }, { "epoch": 0.8680611423974256, "grad_norm": 0.3213987291497469, "learning_rate": 8.947215152322666e-06, "loss": 0.0326, "step": 3237 }, { "epoch": 0.8683293108071869, "grad_norm": 0.29584201504898233, "learning_rate": 8.946257280170398e-06, "loss": 0.0234, "step": 3238 }, { "epoch": 0.8685974792169483, "grad_norm": 0.3463067154635824, "learning_rate": 8.945299023781534e-06, "loss": 0.0347, "step": 3239 }, { "epoch": 0.8688656476267096, "grad_norm": 0.377574910832234, "learning_rate": 8.944340383249379e-06, "loss": 0.0382, "step": 3240 }, { "epoch": 0.8691338160364709, "grad_norm": 0.4535462750940223, "learning_rate": 8.943381358667273e-06, "loss": 0.0504, "step": 3241 }, { "epoch": 0.8694019844462322, "grad_norm": 0.4041825913856687, "learning_rate": 8.942421950128595e-06, "loss": 0.0326, "step": 3242 }, { "epoch": 0.8696701528559936, "grad_norm": 0.26440936377405416, "learning_rate": 8.941462157726757e-06, "loss": 0.0264, "step": 3243 }, { "epoch": 0.8699383212657549, "grad_norm": 0.3270436586550396, "learning_rate": 8.940501981555215e-06, "loss": 0.0356, "step": 3244 }, { "epoch": 0.8702064896755162, "grad_norm": 0.30588659881944097, "learning_rate": 8.939541421707454e-06, "loss": 0.0294, "step": 3245 }, { "epoch": 0.8704746580852776, "grad_norm": 0.8022143408534514, "learning_rate": 8.938580478277006e-06, "loss": 0.0319, "step": 3246 }, { "epoch": 0.8707428264950389, "grad_norm": 0.4480963503906354, "learning_rate": 8.937619151357433e-06, "loss": 0.0317, "step": 3247 }, { "epoch": 0.8710109949048002, "grad_norm": 0.48739118396863423, "learning_rate": 8.936657441042339e-06, "loss": 0.0374, "step": 3248 }, { "epoch": 0.8712791633145616, "grad_norm": 0.2952567352809718, "learning_rate": 8.935695347425359e-06, "loss": 0.029, "step": 3249 }, { "epoch": 0.8715473317243229, "grad_norm": 0.5099873586801738, "learning_rate": 8.934732870600174e-06, "loss": 0.0313, "step": 3250 }, { "epoch": 0.8718155001340842, "grad_norm": 0.34996629164151183, "learning_rate": 8.933770010660494e-06, "loss": 0.0322, "step": 3251 }, { "epoch": 0.8720836685438456, "grad_norm": 0.46288528816219054, "learning_rate": 8.932806767700074e-06, "loss": 0.0406, "step": 3252 }, { "epoch": 0.8723518369536069, "grad_norm": 0.2612171339194805, "learning_rate": 8.9318431418127e-06, "loss": 0.023, "step": 3253 }, { "epoch": 0.8726200053633681, "grad_norm": 0.5499180765106316, "learning_rate": 8.930879133092197e-06, "loss": 0.0286, "step": 3254 }, { "epoch": 0.8728881737731295, "grad_norm": 0.4750467431180505, "learning_rate": 8.92991474163243e-06, "loss": 0.0395, "step": 3255 }, { "epoch": 0.8731563421828908, "grad_norm": 0.36273049877686386, "learning_rate": 8.928949967527298e-06, "loss": 0.0324, "step": 3256 }, { "epoch": 0.8734245105926521, "grad_norm": 0.5923851200452512, "learning_rate": 8.927984810870739e-06, "loss": 0.0428, "step": 3257 }, { "epoch": 0.8736926790024135, "grad_norm": 0.9218356995878388, "learning_rate": 8.927019271756728e-06, "loss": 0.0424, "step": 3258 }, { "epoch": 0.8739608474121748, "grad_norm": 0.38520121678613917, "learning_rate": 8.926053350279276e-06, "loss": 0.0358, "step": 3259 }, { "epoch": 0.8742290158219361, "grad_norm": 0.5619199815534117, "learning_rate": 8.925087046532432e-06, "loss": 0.0424, "step": 3260 }, { "epoch": 0.8744971842316975, "grad_norm": 0.3592836946330606, "learning_rate": 8.924120360610284e-06, "loss": 0.0257, "step": 3261 }, { "epoch": 0.8747653526414588, "grad_norm": 0.5075692318255762, "learning_rate": 8.923153292606955e-06, "loss": 0.0459, "step": 3262 }, { "epoch": 0.8750335210512201, "grad_norm": 0.3932397047440339, "learning_rate": 8.922185842616607e-06, "loss": 0.0425, "step": 3263 }, { "epoch": 0.8753016894609815, "grad_norm": 0.36168841369329974, "learning_rate": 8.921218010733435e-06, "loss": 0.0339, "step": 3264 }, { "epoch": 0.8755698578707428, "grad_norm": 0.3686775604349555, "learning_rate": 8.920249797051675e-06, "loss": 0.023, "step": 3265 }, { "epoch": 0.8758380262805041, "grad_norm": 0.3090428418666465, "learning_rate": 8.919281201665601e-06, "loss": 0.0269, "step": 3266 }, { "epoch": 0.8761061946902655, "grad_norm": 0.34827805528932293, "learning_rate": 8.918312224669523e-06, "loss": 0.0249, "step": 3267 }, { "epoch": 0.8763743631000268, "grad_norm": 0.3664979037303756, "learning_rate": 8.917342866157786e-06, "loss": 0.0321, "step": 3268 }, { "epoch": 0.8766425315097881, "grad_norm": 0.36225470137596133, "learning_rate": 8.916373126224775e-06, "loss": 0.0363, "step": 3269 }, { "epoch": 0.8769106999195495, "grad_norm": 0.28954922673491484, "learning_rate": 8.915403004964908e-06, "loss": 0.0412, "step": 3270 }, { "epoch": 0.8771788683293108, "grad_norm": 0.41905707171959417, "learning_rate": 8.914432502472648e-06, "loss": 0.0307, "step": 3271 }, { "epoch": 0.8774470367390721, "grad_norm": 0.31273456641517294, "learning_rate": 8.913461618842488e-06, "loss": 0.0423, "step": 3272 }, { "epoch": 0.8777152051488335, "grad_norm": 0.5431537286675475, "learning_rate": 8.912490354168959e-06, "loss": 0.0251, "step": 3273 }, { "epoch": 0.8779833735585948, "grad_norm": 0.2959314442421235, "learning_rate": 8.911518708546633e-06, "loss": 0.0369, "step": 3274 }, { "epoch": 0.8782515419683561, "grad_norm": 0.35466975607671825, "learning_rate": 8.910546682070114e-06, "loss": 0.0349, "step": 3275 }, { "epoch": 0.8785197103781175, "grad_norm": 0.3232715242869063, "learning_rate": 8.909574274834047e-06, "loss": 0.0274, "step": 3276 }, { "epoch": 0.8787878787878788, "grad_norm": 0.31891309892168923, "learning_rate": 8.908601486933113e-06, "loss": 0.035, "step": 3277 }, { "epoch": 0.8790560471976401, "grad_norm": 0.3313864187110509, "learning_rate": 8.907628318462029e-06, "loss": 0.0303, "step": 3278 }, { "epoch": 0.8793242156074015, "grad_norm": 0.31161800667180134, "learning_rate": 8.906654769515551e-06, "loss": 0.0278, "step": 3279 }, { "epoch": 0.8795923840171628, "grad_norm": 0.45887522355644644, "learning_rate": 8.905680840188469e-06, "loss": 0.0334, "step": 3280 }, { "epoch": 0.8798605524269241, "grad_norm": 0.2942099822667207, "learning_rate": 8.904706530575613e-06, "loss": 0.0236, "step": 3281 }, { "epoch": 0.8801287208366855, "grad_norm": 0.2882601686904067, "learning_rate": 8.90373184077185e-06, "loss": 0.0273, "step": 3282 }, { "epoch": 0.8803968892464468, "grad_norm": 0.31027336066663547, "learning_rate": 8.902756770872082e-06, "loss": 0.0249, "step": 3283 }, { "epoch": 0.8806650576562081, "grad_norm": 0.45038515932523937, "learning_rate": 8.901781320971248e-06, "loss": 0.0324, "step": 3284 }, { "epoch": 0.8809332260659695, "grad_norm": 0.8601207000566916, "learning_rate": 8.900805491164326e-06, "loss": 0.037, "step": 3285 }, { "epoch": 0.8812013944757308, "grad_norm": 0.2614037160607683, "learning_rate": 8.89982928154633e-06, "loss": 0.0307, "step": 3286 }, { "epoch": 0.881469562885492, "grad_norm": 0.3785975348403741, "learning_rate": 8.89885269221231e-06, "loss": 0.0307, "step": 3287 }, { "epoch": 0.8817377312952535, "grad_norm": 0.4236452411071127, "learning_rate": 8.897875723257354e-06, "loss": 0.0429, "step": 3288 }, { "epoch": 0.8820058997050148, "grad_norm": 0.3723546714413032, "learning_rate": 8.89689837477659e-06, "loss": 0.0376, "step": 3289 }, { "epoch": 0.882274068114776, "grad_norm": 0.47873485227272405, "learning_rate": 8.895920646865174e-06, "loss": 0.0368, "step": 3290 }, { "epoch": 0.8825422365245374, "grad_norm": 0.4302309795989242, "learning_rate": 8.89494253961831e-06, "loss": 0.0343, "step": 3291 }, { "epoch": 0.8828104049342987, "grad_norm": 0.6503964904763161, "learning_rate": 8.893964053131232e-06, "loss": 0.0449, "step": 3292 }, { "epoch": 0.88307857334406, "grad_norm": 0.5351174767158696, "learning_rate": 8.892985187499212e-06, "loss": 0.0315, "step": 3293 }, { "epoch": 0.8833467417538214, "grad_norm": 0.30873965425267474, "learning_rate": 8.892005942817562e-06, "loss": 0.0294, "step": 3294 }, { "epoch": 0.8836149101635827, "grad_norm": 0.5722857051970847, "learning_rate": 8.891026319181625e-06, "loss": 0.0371, "step": 3295 }, { "epoch": 0.883883078573344, "grad_norm": 0.2791279025873966, "learning_rate": 8.890046316686783e-06, "loss": 0.031, "step": 3296 }, { "epoch": 0.8841512469831054, "grad_norm": 0.340300266612849, "learning_rate": 8.889065935428462e-06, "loss": 0.0377, "step": 3297 }, { "epoch": 0.8844194153928667, "grad_norm": 0.37073189110358273, "learning_rate": 8.888085175502114e-06, "loss": 0.0301, "step": 3298 }, { "epoch": 0.884687583802628, "grad_norm": 0.39141817682558183, "learning_rate": 8.887104037003238e-06, "loss": 0.0476, "step": 3299 }, { "epoch": 0.8849557522123894, "grad_norm": 0.4228666881111964, "learning_rate": 8.88612252002736e-06, "loss": 0.0349, "step": 3300 }, { "epoch": 0.8852239206221507, "grad_norm": 0.4339392215255841, "learning_rate": 8.88514062467005e-06, "loss": 0.0311, "step": 3301 }, { "epoch": 0.885492089031912, "grad_norm": 0.34311183664319495, "learning_rate": 8.884158351026912e-06, "loss": 0.0394, "step": 3302 }, { "epoch": 0.8857602574416734, "grad_norm": 0.3506717474607233, "learning_rate": 8.883175699193589e-06, "loss": 0.028, "step": 3303 }, { "epoch": 0.8860284258514347, "grad_norm": 0.4117176940344392, "learning_rate": 8.882192669265757e-06, "loss": 0.0336, "step": 3304 }, { "epoch": 0.886296594261196, "grad_norm": 0.3948742260453566, "learning_rate": 8.881209261339133e-06, "loss": 0.0395, "step": 3305 }, { "epoch": 0.8865647626709574, "grad_norm": 0.34122840519738834, "learning_rate": 8.880225475509465e-06, "loss": 0.032, "step": 3306 }, { "epoch": 0.8868329310807187, "grad_norm": 0.4820425898133421, "learning_rate": 8.879241311872546e-06, "loss": 0.0347, "step": 3307 }, { "epoch": 0.88710109949048, "grad_norm": 0.900549644164003, "learning_rate": 8.878256770524199e-06, "loss": 0.0288, "step": 3308 }, { "epoch": 0.8873692679002414, "grad_norm": 0.49876627362592213, "learning_rate": 8.877271851560287e-06, "loss": 0.0378, "step": 3309 }, { "epoch": 0.8876374363100027, "grad_norm": 0.38837102569002036, "learning_rate": 8.876286555076708e-06, "loss": 0.0375, "step": 3310 }, { "epoch": 0.887905604719764, "grad_norm": 0.6987808592398832, "learning_rate": 8.8753008811694e-06, "loss": 0.0476, "step": 3311 }, { "epoch": 0.8881737731295254, "grad_norm": 0.3703855496726898, "learning_rate": 8.874314829934333e-06, "loss": 0.0367, "step": 3312 }, { "epoch": 0.8884419415392867, "grad_norm": 0.30861846324516695, "learning_rate": 8.873328401467517e-06, "loss": 0.0429, "step": 3313 }, { "epoch": 0.888710109949048, "grad_norm": 0.324905712302882, "learning_rate": 8.872341595864998e-06, "loss": 0.0344, "step": 3314 }, { "epoch": 0.8889782783588094, "grad_norm": 0.3144290213595244, "learning_rate": 8.871354413222859e-06, "loss": 0.0245, "step": 3315 }, { "epoch": 0.8892464467685707, "grad_norm": 0.3865801837851577, "learning_rate": 8.870366853637218e-06, "loss": 0.0381, "step": 3316 }, { "epoch": 0.889514615178332, "grad_norm": 0.2726863307668217, "learning_rate": 8.869378917204235e-06, "loss": 0.0264, "step": 3317 }, { "epoch": 0.8897827835880934, "grad_norm": 0.31691792052155127, "learning_rate": 8.868390604020097e-06, "loss": 0.0279, "step": 3318 }, { "epoch": 0.8900509519978547, "grad_norm": 0.4878425369129178, "learning_rate": 8.867401914181037e-06, "loss": 0.0336, "step": 3319 }, { "epoch": 0.890319120407616, "grad_norm": 0.36781826793960626, "learning_rate": 8.866412847783322e-06, "loss": 0.0272, "step": 3320 }, { "epoch": 0.8905872888173774, "grad_norm": 0.7612783888577035, "learning_rate": 8.86542340492325e-06, "loss": 0.0418, "step": 3321 }, { "epoch": 0.8908554572271387, "grad_norm": 0.2927581106778206, "learning_rate": 8.864433585697165e-06, "loss": 0.0298, "step": 3322 }, { "epoch": 0.8911236256369, "grad_norm": 0.3719632434211825, "learning_rate": 8.863443390201443e-06, "loss": 0.035, "step": 3323 }, { "epoch": 0.8913917940466614, "grad_norm": 0.5549574892921412, "learning_rate": 8.862452818532495e-06, "loss": 0.0299, "step": 3324 }, { "epoch": 0.8916599624564227, "grad_norm": 0.5077451261812873, "learning_rate": 8.86146187078677e-06, "loss": 0.0385, "step": 3325 }, { "epoch": 0.8919281308661839, "grad_norm": 0.42890304998150874, "learning_rate": 8.860470547060753e-06, "loss": 0.0405, "step": 3326 }, { "epoch": 0.8921962992759453, "grad_norm": 0.3861114806017171, "learning_rate": 8.85947884745097e-06, "loss": 0.0281, "step": 3327 }, { "epoch": 0.8924644676857066, "grad_norm": 0.3594894988476645, "learning_rate": 8.858486772053979e-06, "loss": 0.0292, "step": 3328 }, { "epoch": 0.8927326360954679, "grad_norm": 0.3106272669394502, "learning_rate": 8.857494320966374e-06, "loss": 0.0255, "step": 3329 }, { "epoch": 0.8930008045052292, "grad_norm": 0.6409873548751294, "learning_rate": 8.856501494284789e-06, "loss": 0.0492, "step": 3330 }, { "epoch": 0.8932689729149906, "grad_norm": 0.52757966250911, "learning_rate": 8.85550829210589e-06, "loss": 0.0456, "step": 3331 }, { "epoch": 0.8935371413247519, "grad_norm": 0.4850196871129454, "learning_rate": 8.854514714526387e-06, "loss": 0.0535, "step": 3332 }, { "epoch": 0.8938053097345132, "grad_norm": 0.3396470589916414, "learning_rate": 8.85352076164302e-06, "loss": 0.03, "step": 3333 }, { "epoch": 0.8940734781442746, "grad_norm": 0.31826293536107214, "learning_rate": 8.852526433552567e-06, "loss": 0.037, "step": 3334 }, { "epoch": 0.8943416465540359, "grad_norm": 0.33806972384301787, "learning_rate": 8.851531730351843e-06, "loss": 0.0385, "step": 3335 }, { "epoch": 0.8946098149637972, "grad_norm": 0.33337727025388797, "learning_rate": 8.8505366521377e-06, "loss": 0.0243, "step": 3336 }, { "epoch": 0.8948779833735586, "grad_norm": 0.40307956983732934, "learning_rate": 8.849541199007028e-06, "loss": 0.0374, "step": 3337 }, { "epoch": 0.8951461517833199, "grad_norm": 0.31194524328963236, "learning_rate": 8.848545371056747e-06, "loss": 0.0327, "step": 3338 }, { "epoch": 0.8954143201930812, "grad_norm": 0.29791350784274095, "learning_rate": 8.847549168383823e-06, "loss": 0.0294, "step": 3339 }, { "epoch": 0.8956824886028426, "grad_norm": 0.35764332892848805, "learning_rate": 8.84655259108525e-06, "loss": 0.0266, "step": 3340 }, { "epoch": 0.8959506570126039, "grad_norm": 0.5887583962089283, "learning_rate": 8.845555639258065e-06, "loss": 0.0441, "step": 3341 }, { "epoch": 0.8962188254223652, "grad_norm": 0.3677823172074353, "learning_rate": 8.844558312999337e-06, "loss": 0.0382, "step": 3342 }, { "epoch": 0.8964869938321266, "grad_norm": 0.30989777476938146, "learning_rate": 8.843560612406171e-06, "loss": 0.0344, "step": 3343 }, { "epoch": 0.8967551622418879, "grad_norm": 0.31029728861607003, "learning_rate": 8.842562537575715e-06, "loss": 0.0266, "step": 3344 }, { "epoch": 0.8970233306516492, "grad_norm": 0.28901366386189237, "learning_rate": 8.841564088605145e-06, "loss": 0.0208, "step": 3345 }, { "epoch": 0.8972914990614106, "grad_norm": 0.44138477391091824, "learning_rate": 8.840565265591679e-06, "loss": 0.0378, "step": 3346 }, { "epoch": 0.8975596674711719, "grad_norm": 0.41458639723773355, "learning_rate": 8.83956606863257e-06, "loss": 0.0369, "step": 3347 }, { "epoch": 0.8978278358809332, "grad_norm": 0.3479715390778149, "learning_rate": 8.838566497825108e-06, "loss": 0.0237, "step": 3348 }, { "epoch": 0.8980960042906946, "grad_norm": 0.30128962440086493, "learning_rate": 8.837566553266617e-06, "loss": 0.0294, "step": 3349 }, { "epoch": 0.8983641727004559, "grad_norm": 0.258723872046188, "learning_rate": 8.836566235054458e-06, "loss": 0.0312, "step": 3350 }, { "epoch": 0.8986323411102172, "grad_norm": 0.31627960368159475, "learning_rate": 8.835565543286031e-06, "loss": 0.0323, "step": 3351 }, { "epoch": 0.8989005095199786, "grad_norm": 0.28015933918117214, "learning_rate": 8.83456447805877e-06, "loss": 0.0255, "step": 3352 }, { "epoch": 0.8991686779297399, "grad_norm": 0.47879418858873796, "learning_rate": 8.833563039470146e-06, "loss": 0.0408, "step": 3353 }, { "epoch": 0.8994368463395012, "grad_norm": 0.32973095511955236, "learning_rate": 8.832561227617667e-06, "loss": 0.0266, "step": 3354 }, { "epoch": 0.8997050147492626, "grad_norm": 0.7922216973115608, "learning_rate": 8.831559042598879e-06, "loss": 0.0447, "step": 3355 }, { "epoch": 0.8999731831590239, "grad_norm": 0.30996261247351614, "learning_rate": 8.830556484511356e-06, "loss": 0.0337, "step": 3356 }, { "epoch": 0.9002413515687852, "grad_norm": 0.324663332546099, "learning_rate": 8.82955355345272e-06, "loss": 0.0298, "step": 3357 }, { "epoch": 0.9005095199785466, "grad_norm": 0.3067072650808897, "learning_rate": 8.828550249520623e-06, "loss": 0.0398, "step": 3358 }, { "epoch": 0.9007776883883079, "grad_norm": 0.3793997395327298, "learning_rate": 8.82754657281275e-06, "loss": 0.0332, "step": 3359 }, { "epoch": 0.9010458567980691, "grad_norm": 0.3051300451510978, "learning_rate": 8.82654252342683e-06, "loss": 0.0266, "step": 3360 }, { "epoch": 0.9013140252078305, "grad_norm": 0.4698608248360729, "learning_rate": 8.825538101460625e-06, "loss": 0.0343, "step": 3361 }, { "epoch": 0.9015821936175918, "grad_norm": 0.4392734641558583, "learning_rate": 8.82453330701193e-06, "loss": 0.0344, "step": 3362 }, { "epoch": 0.9018503620273531, "grad_norm": 0.5054275725233702, "learning_rate": 8.82352814017858e-06, "loss": 0.0418, "step": 3363 }, { "epoch": 0.9021185304371145, "grad_norm": 0.42184616039625833, "learning_rate": 8.82252260105845e-06, "loss": 0.0325, "step": 3364 }, { "epoch": 0.9023866988468758, "grad_norm": 0.3165308049364458, "learning_rate": 8.82151668974944e-06, "loss": 0.0258, "step": 3365 }, { "epoch": 0.9026548672566371, "grad_norm": 0.29336659180468483, "learning_rate": 8.820510406349496e-06, "loss": 0.0329, "step": 3366 }, { "epoch": 0.9029230356663985, "grad_norm": 0.3014416750605149, "learning_rate": 8.819503750956598e-06, "loss": 0.0321, "step": 3367 }, { "epoch": 0.9031912040761598, "grad_norm": 0.4039786332984564, "learning_rate": 8.81849672366876e-06, "loss": 0.0338, "step": 3368 }, { "epoch": 0.9034593724859211, "grad_norm": 0.3363985523291303, "learning_rate": 8.817489324584035e-06, "loss": 0.0209, "step": 3369 }, { "epoch": 0.9037275408956825, "grad_norm": 0.5406397513027865, "learning_rate": 8.816481553800508e-06, "loss": 0.0385, "step": 3370 }, { "epoch": 0.9039957093054438, "grad_norm": 0.4226334837778787, "learning_rate": 8.815473411416305e-06, "loss": 0.0256, "step": 3371 }, { "epoch": 0.9042638777152051, "grad_norm": 0.3296580986693705, "learning_rate": 8.814464897529587e-06, "loss": 0.0317, "step": 3372 }, { "epoch": 0.9045320461249665, "grad_norm": 0.3171868429539967, "learning_rate": 8.813456012238548e-06, "loss": 0.0285, "step": 3373 }, { "epoch": 0.9048002145347278, "grad_norm": 0.35259781151707476, "learning_rate": 8.812446755641422e-06, "loss": 0.0328, "step": 3374 }, { "epoch": 0.9050683829444891, "grad_norm": 0.4107802128087662, "learning_rate": 8.811437127836477e-06, "loss": 0.0422, "step": 3375 }, { "epoch": 0.9053365513542505, "grad_norm": 0.9102784371618802, "learning_rate": 8.810427128922016e-06, "loss": 0.0549, "step": 3376 }, { "epoch": 0.9056047197640118, "grad_norm": 0.2976432925411997, "learning_rate": 8.809416758996386e-06, "loss": 0.0286, "step": 3377 }, { "epoch": 0.9058728881737731, "grad_norm": 0.4101456242582264, "learning_rate": 8.808406018157957e-06, "loss": 0.0375, "step": 3378 }, { "epoch": 0.9061410565835345, "grad_norm": 0.7880858506767984, "learning_rate": 8.807394906505146e-06, "loss": 0.0479, "step": 3379 }, { "epoch": 0.9064092249932958, "grad_norm": 0.4130148510419459, "learning_rate": 8.806383424136403e-06, "loss": 0.0319, "step": 3380 }, { "epoch": 0.9066773934030571, "grad_norm": 0.34816711663518507, "learning_rate": 8.80537157115021e-06, "loss": 0.0342, "step": 3381 }, { "epoch": 0.9069455618128185, "grad_norm": 0.704650126882243, "learning_rate": 8.804359347645091e-06, "loss": 0.0655, "step": 3382 }, { "epoch": 0.9072137302225798, "grad_norm": 0.498878898192836, "learning_rate": 8.803346753719602e-06, "loss": 0.0355, "step": 3383 }, { "epoch": 0.9074818986323411, "grad_norm": 0.358983957861865, "learning_rate": 8.802333789472338e-06, "loss": 0.044, "step": 3384 }, { "epoch": 0.9077500670421025, "grad_norm": 0.5446881841330066, "learning_rate": 8.801320455001928e-06, "loss": 0.0308, "step": 3385 }, { "epoch": 0.9080182354518638, "grad_norm": 0.4286690498701049, "learning_rate": 8.800306750407034e-06, "loss": 0.0349, "step": 3386 }, { "epoch": 0.9082864038616251, "grad_norm": 0.3838394017896056, "learning_rate": 8.799292675786365e-06, "loss": 0.0329, "step": 3387 }, { "epoch": 0.9085545722713865, "grad_norm": 0.4076643387926635, "learning_rate": 8.798278231238655e-06, "loss": 0.0301, "step": 3388 }, { "epoch": 0.9088227406811478, "grad_norm": 0.42922836932543895, "learning_rate": 8.797263416862678e-06, "loss": 0.036, "step": 3389 }, { "epoch": 0.9090909090909091, "grad_norm": 0.3852969741160308, "learning_rate": 8.796248232757243e-06, "loss": 0.0385, "step": 3390 }, { "epoch": 0.9093590775006705, "grad_norm": 0.826275552472805, "learning_rate": 8.795232679021197e-06, "loss": 0.0468, "step": 3391 }, { "epoch": 0.9096272459104318, "grad_norm": 0.3395180810836074, "learning_rate": 8.79421675575342e-06, "loss": 0.0341, "step": 3392 }, { "epoch": 0.909895414320193, "grad_norm": 0.5598328947810235, "learning_rate": 8.793200463052832e-06, "loss": 0.0542, "step": 3393 }, { "epoch": 0.9101635827299545, "grad_norm": 0.3010822094662956, "learning_rate": 8.792183801018384e-06, "loss": 0.0324, "step": 3394 }, { "epoch": 0.9104317511397158, "grad_norm": 0.299682936763229, "learning_rate": 8.791166769749068e-06, "loss": 0.027, "step": 3395 }, { "epoch": 0.910699919549477, "grad_norm": 0.3549950679173996, "learning_rate": 8.79014936934391e-06, "loss": 0.0306, "step": 3396 }, { "epoch": 0.9109680879592384, "grad_norm": 0.3216899509017038, "learning_rate": 8.78913159990197e-06, "loss": 0.0262, "step": 3397 }, { "epoch": 0.9112362563689997, "grad_norm": 0.3896680482040204, "learning_rate": 8.788113461522346e-06, "loss": 0.0346, "step": 3398 }, { "epoch": 0.911504424778761, "grad_norm": 0.3242145948279843, "learning_rate": 8.787094954304172e-06, "loss": 0.0294, "step": 3399 }, { "epoch": 0.9117725931885224, "grad_norm": 0.44211969895353237, "learning_rate": 8.786076078346615e-06, "loss": 0.0458, "step": 3400 }, { "epoch": 0.9120407615982837, "grad_norm": 0.4171091732377363, "learning_rate": 8.785056833748883e-06, "loss": 0.0461, "step": 3401 }, { "epoch": 0.912308930008045, "grad_norm": 0.3025890450589386, "learning_rate": 8.78403722061022e-06, "loss": 0.0303, "step": 3402 }, { "epoch": 0.9125770984178064, "grad_norm": 0.39468709179311234, "learning_rate": 8.783017239029894e-06, "loss": 0.0406, "step": 3403 }, { "epoch": 0.9128452668275677, "grad_norm": 0.23899144352990173, "learning_rate": 8.781996889107226e-06, "loss": 0.0226, "step": 3404 }, { "epoch": 0.913113435237329, "grad_norm": 0.4167370601959788, "learning_rate": 8.780976170941561e-06, "loss": 0.0424, "step": 3405 }, { "epoch": 0.9133816036470904, "grad_norm": 0.6560744760719583, "learning_rate": 8.779955084632284e-06, "loss": 0.039, "step": 3406 }, { "epoch": 0.9136497720568517, "grad_norm": 0.4032588849662207, "learning_rate": 8.778933630278817e-06, "loss": 0.032, "step": 3407 }, { "epoch": 0.913917940466613, "grad_norm": 0.3298897366921061, "learning_rate": 8.777911807980615e-06, "loss": 0.027, "step": 3408 }, { "epoch": 0.9141861088763744, "grad_norm": 0.3313719192088538, "learning_rate": 8.77688961783717e-06, "loss": 0.0358, "step": 3409 }, { "epoch": 0.9144542772861357, "grad_norm": 0.3052567224309724, "learning_rate": 8.775867059948011e-06, "loss": 0.0372, "step": 3410 }, { "epoch": 0.914722445695897, "grad_norm": 0.49090418032028404, "learning_rate": 8.7748441344127e-06, "loss": 0.0396, "step": 3411 }, { "epoch": 0.9149906141056584, "grad_norm": 0.535214036038146, "learning_rate": 8.77382084133084e-06, "loss": 0.0279, "step": 3412 }, { "epoch": 0.9152587825154197, "grad_norm": 0.7593444634931664, "learning_rate": 8.772797180802061e-06, "loss": 0.0404, "step": 3413 }, { "epoch": 0.915526950925181, "grad_norm": 0.694009308072015, "learning_rate": 8.77177315292604e-06, "loss": 0.0447, "step": 3414 }, { "epoch": 0.9157951193349424, "grad_norm": 0.46289340729487094, "learning_rate": 8.770748757802478e-06, "loss": 0.0488, "step": 3415 }, { "epoch": 0.9160632877447037, "grad_norm": 0.3729024557999195, "learning_rate": 8.769723995531123e-06, "loss": 0.043, "step": 3416 }, { "epoch": 0.916331456154465, "grad_norm": 0.282382262049196, "learning_rate": 8.76869886621175e-06, "loss": 0.0288, "step": 3417 }, { "epoch": 0.9165996245642264, "grad_norm": 0.3647329979242271, "learning_rate": 8.767673369944174e-06, "loss": 0.036, "step": 3418 }, { "epoch": 0.9168677929739877, "grad_norm": 0.5492621290030115, "learning_rate": 8.766647506828246e-06, "loss": 0.0381, "step": 3419 }, { "epoch": 0.917135961383749, "grad_norm": 0.42478480924233625, "learning_rate": 8.76562127696385e-06, "loss": 0.0276, "step": 3420 }, { "epoch": 0.9174041297935103, "grad_norm": 0.3516947363251873, "learning_rate": 8.764594680450907e-06, "loss": 0.0262, "step": 3421 }, { "epoch": 0.9176722982032717, "grad_norm": 0.6767930162581993, "learning_rate": 8.763567717389374e-06, "loss": 0.0362, "step": 3422 }, { "epoch": 0.917940466613033, "grad_norm": 0.5268727666848313, "learning_rate": 8.762540387879245e-06, "loss": 0.0263, "step": 3423 }, { "epoch": 0.9182086350227943, "grad_norm": 0.3294366556338562, "learning_rate": 8.76151269202055e-06, "loss": 0.0395, "step": 3424 }, { "epoch": 0.9184768034325557, "grad_norm": 0.412935295743048, "learning_rate": 8.760484629913349e-06, "loss": 0.0353, "step": 3425 }, { "epoch": 0.918744971842317, "grad_norm": 0.41242589394257234, "learning_rate": 8.759456201657743e-06, "loss": 0.0283, "step": 3426 }, { "epoch": 0.9190131402520783, "grad_norm": 0.6378129400959122, "learning_rate": 8.758427407353868e-06, "loss": 0.0399, "step": 3427 }, { "epoch": 0.9192813086618397, "grad_norm": 0.36620448421197715, "learning_rate": 8.757398247101896e-06, "loss": 0.0289, "step": 3428 }, { "epoch": 0.919549477071601, "grad_norm": 0.44196634307598237, "learning_rate": 8.756368721002031e-06, "loss": 0.0422, "step": 3429 }, { "epoch": 0.9198176454813622, "grad_norm": 0.820781953925549, "learning_rate": 8.755338829154517e-06, "loss": 0.0532, "step": 3430 }, { "epoch": 0.9200858138911236, "grad_norm": 0.4193592014543343, "learning_rate": 8.754308571659633e-06, "loss": 0.0439, "step": 3431 }, { "epoch": 0.9203539823008849, "grad_norm": 0.44182622778399266, "learning_rate": 8.753277948617688e-06, "loss": 0.0381, "step": 3432 }, { "epoch": 0.9206221507106462, "grad_norm": 0.48822212540505355, "learning_rate": 8.752246960129037e-06, "loss": 0.0339, "step": 3433 }, { "epoch": 0.9208903191204076, "grad_norm": 0.28524736457012784, "learning_rate": 8.75121560629406e-06, "loss": 0.0322, "step": 3434 }, { "epoch": 0.9211584875301689, "grad_norm": 0.36135301850127477, "learning_rate": 8.75018388721318e-06, "loss": 0.0374, "step": 3435 }, { "epoch": 0.9214266559399302, "grad_norm": 0.5109162939722136, "learning_rate": 8.749151802986848e-06, "loss": 0.0289, "step": 3436 }, { "epoch": 0.9216948243496916, "grad_norm": 0.24387996746349797, "learning_rate": 8.748119353715562e-06, "loss": 0.0292, "step": 3437 }, { "epoch": 0.9219629927594529, "grad_norm": 0.2721041202750463, "learning_rate": 8.747086539499844e-06, "loss": 0.0297, "step": 3438 }, { "epoch": 0.9222311611692142, "grad_norm": 0.5447837286688044, "learning_rate": 8.746053360440258e-06, "loss": 0.037, "step": 3439 }, { "epoch": 0.9224993295789756, "grad_norm": 0.61975082715662, "learning_rate": 8.745019816637404e-06, "loss": 0.0493, "step": 3440 }, { "epoch": 0.9227674979887369, "grad_norm": 0.44113927796891955, "learning_rate": 8.74398590819191e-06, "loss": 0.0362, "step": 3441 }, { "epoch": 0.9230356663984982, "grad_norm": 0.45648420759892344, "learning_rate": 8.742951635204451e-06, "loss": 0.0467, "step": 3442 }, { "epoch": 0.9233038348082596, "grad_norm": 0.40932278296883784, "learning_rate": 8.741916997775727e-06, "loss": 0.0339, "step": 3443 }, { "epoch": 0.9235720032180209, "grad_norm": 0.3230665570246833, "learning_rate": 8.74088199600648e-06, "loss": 0.0367, "step": 3444 }, { "epoch": 0.9238401716277822, "grad_norm": 0.33853428956400516, "learning_rate": 8.739846629997483e-06, "loss": 0.0399, "step": 3445 }, { "epoch": 0.9241083400375436, "grad_norm": 0.34244815286203717, "learning_rate": 8.738810899849552e-06, "loss": 0.0379, "step": 3446 }, { "epoch": 0.9243765084473049, "grad_norm": 0.38652611343802634, "learning_rate": 8.73777480566353e-06, "loss": 0.0371, "step": 3447 }, { "epoch": 0.9246446768570662, "grad_norm": 0.3622205138979121, "learning_rate": 8.736738347540296e-06, "loss": 0.0356, "step": 3448 }, { "epoch": 0.9249128452668276, "grad_norm": 0.2430869320434499, "learning_rate": 8.73570152558077e-06, "loss": 0.023, "step": 3449 }, { "epoch": 0.9251810136765889, "grad_norm": 0.32667291740777665, "learning_rate": 8.734664339885908e-06, "loss": 0.0274, "step": 3450 }, { "epoch": 0.9254491820863502, "grad_norm": 0.5214866564969718, "learning_rate": 8.733626790556692e-06, "loss": 0.0341, "step": 3451 }, { "epoch": 0.9257173504961116, "grad_norm": 0.4031660735112484, "learning_rate": 8.732588877694148e-06, "loss": 0.044, "step": 3452 }, { "epoch": 0.9259855189058729, "grad_norm": 0.5090607295337447, "learning_rate": 8.731550601399336e-06, "loss": 0.04, "step": 3453 }, { "epoch": 0.9262536873156342, "grad_norm": 0.2754062649743409, "learning_rate": 8.730511961773347e-06, "loss": 0.0331, "step": 3454 }, { "epoch": 0.9265218557253956, "grad_norm": 0.34410703272911203, "learning_rate": 8.729472958917316e-06, "loss": 0.0283, "step": 3455 }, { "epoch": 0.9267900241351569, "grad_norm": 0.28724429531507517, "learning_rate": 8.728433592932402e-06, "loss": 0.0293, "step": 3456 }, { "epoch": 0.9270581925449182, "grad_norm": 0.4618735599154895, "learning_rate": 8.727393863919809e-06, "loss": 0.0383, "step": 3457 }, { "epoch": 0.9273263609546796, "grad_norm": 0.32350893327829805, "learning_rate": 8.72635377198077e-06, "loss": 0.0288, "step": 3458 }, { "epoch": 0.9275945293644409, "grad_norm": 0.41559622051719486, "learning_rate": 8.725313317216558e-06, "loss": 0.0341, "step": 3459 }, { "epoch": 0.9278626977742022, "grad_norm": 0.4024445185653892, "learning_rate": 8.724272499728479e-06, "loss": 0.0321, "step": 3460 }, { "epoch": 0.9281308661839636, "grad_norm": 0.6736470771994812, "learning_rate": 8.723231319617875e-06, "loss": 0.0308, "step": 3461 }, { "epoch": 0.9283990345937249, "grad_norm": 0.4698535114534645, "learning_rate": 8.72218977698612e-06, "loss": 0.0445, "step": 3462 }, { "epoch": 0.9286672030034862, "grad_norm": 0.4191570139774224, "learning_rate": 8.721147871934632e-06, "loss": 0.0299, "step": 3463 }, { "epoch": 0.9289353714132476, "grad_norm": 0.2745301507101826, "learning_rate": 8.720105604564853e-06, "loss": 0.0312, "step": 3464 }, { "epoch": 0.9292035398230089, "grad_norm": 0.5027932031584817, "learning_rate": 8.71906297497827e-06, "loss": 0.0348, "step": 3465 }, { "epoch": 0.9294717082327701, "grad_norm": 0.3461598921463335, "learning_rate": 8.718019983276398e-06, "loss": 0.0283, "step": 3466 }, { "epoch": 0.9297398766425315, "grad_norm": 0.4111005201466307, "learning_rate": 8.716976629560791e-06, "loss": 0.0446, "step": 3467 }, { "epoch": 0.9300080450522928, "grad_norm": 0.22090084559272372, "learning_rate": 8.715932913933039e-06, "loss": 0.0235, "step": 3468 }, { "epoch": 0.9302762134620541, "grad_norm": 0.29757557387345096, "learning_rate": 8.714888836494762e-06, "loss": 0.0316, "step": 3469 }, { "epoch": 0.9305443818718155, "grad_norm": 0.3756043675996844, "learning_rate": 8.713844397347624e-06, "loss": 0.0285, "step": 3470 }, { "epoch": 0.9308125502815768, "grad_norm": 0.3420941353744393, "learning_rate": 8.712799596593317e-06, "loss": 0.0322, "step": 3471 }, { "epoch": 0.9310807186913381, "grad_norm": 0.38742701931654105, "learning_rate": 8.71175443433357e-06, "loss": 0.0269, "step": 3472 }, { "epoch": 0.9313488871010995, "grad_norm": 0.3039553662019852, "learning_rate": 8.71070891067015e-06, "loss": 0.0321, "step": 3473 }, { "epoch": 0.9316170555108608, "grad_norm": 0.2603762774187866, "learning_rate": 8.709663025704852e-06, "loss": 0.0257, "step": 3474 }, { "epoch": 0.9318852239206221, "grad_norm": 0.4011365345342904, "learning_rate": 8.708616779539518e-06, "loss": 0.0433, "step": 3475 }, { "epoch": 0.9321533923303835, "grad_norm": 0.357826966875023, "learning_rate": 8.707570172276013e-06, "loss": 0.0374, "step": 3476 }, { "epoch": 0.9324215607401448, "grad_norm": 0.5352465010550165, "learning_rate": 8.706523204016242e-06, "loss": 0.0348, "step": 3477 }, { "epoch": 0.9326897291499061, "grad_norm": 0.4656758186023748, "learning_rate": 8.705475874862149e-06, "loss": 0.0418, "step": 3478 }, { "epoch": 0.9329578975596675, "grad_norm": 0.2819884277921083, "learning_rate": 8.704428184915706e-06, "loss": 0.0291, "step": 3479 }, { "epoch": 0.9332260659694288, "grad_norm": 0.3788414876227959, "learning_rate": 8.703380134278928e-06, "loss": 0.0397, "step": 3480 }, { "epoch": 0.9334942343791901, "grad_norm": 0.35115623110238153, "learning_rate": 8.702331723053856e-06, "loss": 0.0287, "step": 3481 }, { "epoch": 0.9337624027889515, "grad_norm": 0.3830012409872812, "learning_rate": 8.701282951342576e-06, "loss": 0.0275, "step": 3482 }, { "epoch": 0.9340305711987128, "grad_norm": 0.38353914155370866, "learning_rate": 8.7002338192472e-06, "loss": 0.0297, "step": 3483 }, { "epoch": 0.9342987396084741, "grad_norm": 0.3548797181718097, "learning_rate": 8.699184326869883e-06, "loss": 0.0306, "step": 3484 }, { "epoch": 0.9345669080182355, "grad_norm": 0.46752604753855237, "learning_rate": 8.698134474312808e-06, "loss": 0.0293, "step": 3485 }, { "epoch": 0.9348350764279968, "grad_norm": 0.5277310151457333, "learning_rate": 8.697084261678198e-06, "loss": 0.028, "step": 3486 }, { "epoch": 0.9351032448377581, "grad_norm": 0.5228027595623886, "learning_rate": 8.696033689068309e-06, "loss": 0.0269, "step": 3487 }, { "epoch": 0.9353714132475195, "grad_norm": 0.2855481317424195, "learning_rate": 8.69498275658543e-06, "loss": 0.0282, "step": 3488 }, { "epoch": 0.9356395816572808, "grad_norm": 0.46545715257434983, "learning_rate": 8.693931464331893e-06, "loss": 0.0429, "step": 3489 }, { "epoch": 0.9359077500670421, "grad_norm": 0.33826932897132, "learning_rate": 8.692879812410056e-06, "loss": 0.0365, "step": 3490 }, { "epoch": 0.9361759184768035, "grad_norm": 0.27034163115876086, "learning_rate": 8.691827800922314e-06, "loss": 0.018, "step": 3491 }, { "epoch": 0.9364440868865648, "grad_norm": 0.2966105529902935, "learning_rate": 8.690775429971103e-06, "loss": 0.0299, "step": 3492 }, { "epoch": 0.9367122552963261, "grad_norm": 0.5051992027831839, "learning_rate": 8.689722699658887e-06, "loss": 0.0501, "step": 3493 }, { "epoch": 0.9369804237060875, "grad_norm": 0.5659243650659339, "learning_rate": 8.688669610088168e-06, "loss": 0.0337, "step": 3494 }, { "epoch": 0.9372485921158488, "grad_norm": 0.5983067254823169, "learning_rate": 8.68761616136148e-06, "loss": 0.0476, "step": 3495 }, { "epoch": 0.9375167605256101, "grad_norm": 0.3767861591315911, "learning_rate": 8.686562353581398e-06, "loss": 0.0356, "step": 3496 }, { "epoch": 0.9377849289353715, "grad_norm": 0.5100621616522427, "learning_rate": 8.685508186850529e-06, "loss": 0.0537, "step": 3497 }, { "epoch": 0.9380530973451328, "grad_norm": 0.2526780218401074, "learning_rate": 8.684453661271511e-06, "loss": 0.0244, "step": 3498 }, { "epoch": 0.938321265754894, "grad_norm": 0.414386692575756, "learning_rate": 8.683398776947022e-06, "loss": 0.0294, "step": 3499 }, { "epoch": 0.9385894341646555, "grad_norm": 0.36250574876565156, "learning_rate": 8.682343533979774e-06, "loss": 0.0343, "step": 3500 }, { "epoch": 0.9388576025744167, "grad_norm": 0.3349926664960653, "learning_rate": 8.681287932472512e-06, "loss": 0.0254, "step": 3501 }, { "epoch": 0.939125770984178, "grad_norm": 0.4367327587110873, "learning_rate": 8.680231972528019e-06, "loss": 0.0312, "step": 3502 }, { "epoch": 0.9393939393939394, "grad_norm": 0.3143020796655468, "learning_rate": 8.679175654249107e-06, "loss": 0.0347, "step": 3503 }, { "epoch": 0.9396621078037007, "grad_norm": 0.2827420421407842, "learning_rate": 8.678118977738632e-06, "loss": 0.0323, "step": 3504 }, { "epoch": 0.939930276213462, "grad_norm": 0.7881596764792689, "learning_rate": 8.677061943099476e-06, "loss": 0.0229, "step": 3505 }, { "epoch": 0.9401984446232234, "grad_norm": 0.22302746717536126, "learning_rate": 8.676004550434563e-06, "loss": 0.026, "step": 3506 }, { "epoch": 0.9404666130329847, "grad_norm": 0.3731899474737974, "learning_rate": 8.674946799846844e-06, "loss": 0.031, "step": 3507 }, { "epoch": 0.940734781442746, "grad_norm": 0.33740896175859675, "learning_rate": 8.673888691439314e-06, "loss": 0.0293, "step": 3508 }, { "epoch": 0.9410029498525073, "grad_norm": 0.26470976805274643, "learning_rate": 8.672830225314994e-06, "loss": 0.0232, "step": 3509 }, { "epoch": 0.9412711182622687, "grad_norm": 0.3627384719239412, "learning_rate": 8.671771401576948e-06, "loss": 0.0356, "step": 3510 }, { "epoch": 0.94153928667203, "grad_norm": 0.2843430303576068, "learning_rate": 8.670712220328268e-06, "loss": 0.0332, "step": 3511 }, { "epoch": 0.9418074550817913, "grad_norm": 0.3531038305328944, "learning_rate": 8.669652681672085e-06, "loss": 0.038, "step": 3512 }, { "epoch": 0.9420756234915527, "grad_norm": 0.5380350044943221, "learning_rate": 8.668592785711562e-06, "loss": 0.0383, "step": 3513 }, { "epoch": 0.942343791901314, "grad_norm": 0.3625632128159095, "learning_rate": 8.667532532549901e-06, "loss": 0.0389, "step": 3514 }, { "epoch": 0.9426119603110753, "grad_norm": 0.5321597287764244, "learning_rate": 8.666471922290334e-06, "loss": 0.0321, "step": 3515 }, { "epoch": 0.9428801287208367, "grad_norm": 0.2629506409769947, "learning_rate": 8.665410955036129e-06, "loss": 0.0262, "step": 3516 }, { "epoch": 0.943148297130598, "grad_norm": 0.6597690825186551, "learning_rate": 8.664349630890592e-06, "loss": 0.0359, "step": 3517 }, { "epoch": 0.9434164655403593, "grad_norm": 0.3440416485674077, "learning_rate": 8.663287949957059e-06, "loss": 0.0284, "step": 3518 }, { "epoch": 0.9436846339501207, "grad_norm": 0.5020157659771631, "learning_rate": 8.662225912338906e-06, "loss": 0.0456, "step": 3519 }, { "epoch": 0.943952802359882, "grad_norm": 0.3237051835938411, "learning_rate": 8.661163518139537e-06, "loss": 0.0376, "step": 3520 }, { "epoch": 0.9442209707696433, "grad_norm": 0.6234433385623344, "learning_rate": 8.660100767462398e-06, "loss": 0.047, "step": 3521 }, { "epoch": 0.9444891391794047, "grad_norm": 0.3018094850498469, "learning_rate": 8.659037660410965e-06, "loss": 0.0221, "step": 3522 }, { "epoch": 0.944757307589166, "grad_norm": 0.4534977464659579, "learning_rate": 8.657974197088747e-06, "loss": 0.0286, "step": 3523 }, { "epoch": 0.9450254759989273, "grad_norm": 0.4638779444898991, "learning_rate": 8.656910377599294e-06, "loss": 0.0519, "step": 3524 }, { "epoch": 0.9452936444086887, "grad_norm": 0.3722790087068143, "learning_rate": 8.65584620204619e-06, "loss": 0.038, "step": 3525 }, { "epoch": 0.94556181281845, "grad_norm": 0.4024395799425294, "learning_rate": 8.654781670533043e-06, "loss": 0.0262, "step": 3526 }, { "epoch": 0.9458299812282113, "grad_norm": 0.6244448385619118, "learning_rate": 8.65371678316351e-06, "loss": 0.041, "step": 3527 }, { "epoch": 0.9460981496379727, "grad_norm": 0.5481710451507913, "learning_rate": 8.652651540041272e-06, "loss": 0.0333, "step": 3528 }, { "epoch": 0.946366318047734, "grad_norm": 0.40051380102013634, "learning_rate": 8.651585941270056e-06, "loss": 0.0366, "step": 3529 }, { "epoch": 0.9466344864574953, "grad_norm": 0.33217248672407196, "learning_rate": 8.650519986953607e-06, "loss": 0.0245, "step": 3530 }, { "epoch": 0.9469026548672567, "grad_norm": 0.27813446074695647, "learning_rate": 8.64945367719572e-06, "loss": 0.0239, "step": 3531 }, { "epoch": 0.947170823277018, "grad_norm": 0.3921792402048546, "learning_rate": 8.64838701210022e-06, "loss": 0.0327, "step": 3532 }, { "epoch": 0.9474389916867793, "grad_norm": 0.32849280702804706, "learning_rate": 8.647319991770959e-06, "loss": 0.0371, "step": 3533 }, { "epoch": 0.9477071600965407, "grad_norm": 0.34770606469846554, "learning_rate": 8.646252616311836e-06, "loss": 0.0304, "step": 3534 }, { "epoch": 0.947975328506302, "grad_norm": 0.28908639654540036, "learning_rate": 8.645184885826776e-06, "loss": 0.0386, "step": 3535 }, { "epoch": 0.9482434969160632, "grad_norm": 0.23792373829134256, "learning_rate": 8.644116800419741e-06, "loss": 0.0316, "step": 3536 }, { "epoch": 0.9485116653258246, "grad_norm": 0.3191023292441791, "learning_rate": 8.64304836019473e-06, "loss": 0.0365, "step": 3537 }, { "epoch": 0.9487798337355859, "grad_norm": 0.6874460414117828, "learning_rate": 8.64197956525577e-06, "loss": 0.029, "step": 3538 }, { "epoch": 0.9490480021453472, "grad_norm": 0.43920993916557255, "learning_rate": 8.640910415706931e-06, "loss": 0.0431, "step": 3539 }, { "epoch": 0.9493161705551086, "grad_norm": 0.45930704010464496, "learning_rate": 8.639840911652309e-06, "loss": 0.0302, "step": 3540 }, { "epoch": 0.9495843389648699, "grad_norm": 0.36442804312876265, "learning_rate": 8.638771053196043e-06, "loss": 0.0341, "step": 3541 }, { "epoch": 0.9498525073746312, "grad_norm": 0.4167081809713381, "learning_rate": 8.6377008404423e-06, "loss": 0.0343, "step": 3542 }, { "epoch": 0.9501206757843926, "grad_norm": 0.5302044618722979, "learning_rate": 8.636630273495284e-06, "loss": 0.0259, "step": 3543 }, { "epoch": 0.9503888441941539, "grad_norm": 0.3762756929809467, "learning_rate": 8.635559352459232e-06, "loss": 0.0306, "step": 3544 }, { "epoch": 0.9506570126039152, "grad_norm": 0.3277359014067011, "learning_rate": 8.634488077438419e-06, "loss": 0.0304, "step": 3545 }, { "epoch": 0.9509251810136766, "grad_norm": 0.3348328328896239, "learning_rate": 8.63341644853715e-06, "loss": 0.0269, "step": 3546 }, { "epoch": 0.9511933494234379, "grad_norm": 0.40890586199432016, "learning_rate": 8.63234446585977e-06, "loss": 0.0315, "step": 3547 }, { "epoch": 0.9514615178331992, "grad_norm": 0.5167310419009069, "learning_rate": 8.631272129510653e-06, "loss": 0.0425, "step": 3548 }, { "epoch": 0.9517296862429606, "grad_norm": 0.24842406418806748, "learning_rate": 8.630199439594209e-06, "loss": 0.0278, "step": 3549 }, { "epoch": 0.9519978546527219, "grad_norm": 0.3142024869226592, "learning_rate": 8.629126396214884e-06, "loss": 0.0303, "step": 3550 }, { "epoch": 0.9522660230624832, "grad_norm": 0.6790042026125757, "learning_rate": 8.628052999477156e-06, "loss": 0.0459, "step": 3551 }, { "epoch": 0.9525341914722446, "grad_norm": 0.6341699008332742, "learning_rate": 8.626979249485538e-06, "loss": 0.0265, "step": 3552 }, { "epoch": 0.9528023598820059, "grad_norm": 0.40537804865859745, "learning_rate": 8.625905146344585e-06, "loss": 0.0323, "step": 3553 }, { "epoch": 0.9530705282917672, "grad_norm": 0.7606901496524828, "learning_rate": 8.624830690158869e-06, "loss": 0.0447, "step": 3554 }, { "epoch": 0.9533386967015286, "grad_norm": 0.24681365532420824, "learning_rate": 8.623755881033016e-06, "loss": 0.0249, "step": 3555 }, { "epoch": 0.9536068651112899, "grad_norm": 0.3451250793849473, "learning_rate": 8.622680719071673e-06, "loss": 0.0301, "step": 3556 }, { "epoch": 0.9538750335210512, "grad_norm": 0.45135034503313176, "learning_rate": 8.621605204379525e-06, "loss": 0.0298, "step": 3557 }, { "epoch": 0.9541432019308126, "grad_norm": 0.3253546747082488, "learning_rate": 8.620529337061295e-06, "loss": 0.0338, "step": 3558 }, { "epoch": 0.9544113703405739, "grad_norm": 0.4531375412289693, "learning_rate": 8.619453117221737e-06, "loss": 0.0386, "step": 3559 }, { "epoch": 0.9546795387503352, "grad_norm": 0.31633192625082573, "learning_rate": 8.618376544965636e-06, "loss": 0.0359, "step": 3560 }, { "epoch": 0.9549477071600966, "grad_norm": 0.3033622808161688, "learning_rate": 8.617299620397817e-06, "loss": 0.0286, "step": 3561 }, { "epoch": 0.9552158755698579, "grad_norm": 0.8627661596775185, "learning_rate": 8.61622234362314e-06, "loss": 0.0397, "step": 3562 }, { "epoch": 0.9554840439796192, "grad_norm": 0.4169885695082447, "learning_rate": 8.615144714746493e-06, "loss": 0.0367, "step": 3563 }, { "epoch": 0.9557522123893806, "grad_norm": 0.26010141517055063, "learning_rate": 8.614066733872803e-06, "loss": 0.0236, "step": 3564 }, { "epoch": 0.9560203807991419, "grad_norm": 0.3150500922750423, "learning_rate": 8.61298840110703e-06, "loss": 0.0315, "step": 3565 }, { "epoch": 0.9562885492089032, "grad_norm": 0.44559932471656305, "learning_rate": 8.61190971655417e-06, "loss": 0.0316, "step": 3566 }, { "epoch": 0.9565567176186646, "grad_norm": 0.43338971782395724, "learning_rate": 8.61083068031925e-06, "loss": 0.0434, "step": 3567 }, { "epoch": 0.9568248860284259, "grad_norm": 0.4800334563938576, "learning_rate": 8.609751292507332e-06, "loss": 0.0334, "step": 3568 }, { "epoch": 0.9570930544381872, "grad_norm": 0.6113471916596498, "learning_rate": 8.608671553223515e-06, "loss": 0.0446, "step": 3569 }, { "epoch": 0.9573612228479486, "grad_norm": 0.4067172924078759, "learning_rate": 8.60759146257293e-06, "loss": 0.0283, "step": 3570 }, { "epoch": 0.9576293912577098, "grad_norm": 0.24974496459760726, "learning_rate": 8.606511020660743e-06, "loss": 0.0351, "step": 3571 }, { "epoch": 0.9578975596674711, "grad_norm": 0.23015473531430836, "learning_rate": 8.605430227592152e-06, "loss": 0.0197, "step": 3572 }, { "epoch": 0.9581657280772325, "grad_norm": 0.5053490005188851, "learning_rate": 8.604349083472392e-06, "loss": 0.0304, "step": 3573 }, { "epoch": 0.9584338964869938, "grad_norm": 0.4218803396825629, "learning_rate": 8.603267588406732e-06, "loss": 0.033, "step": 3574 }, { "epoch": 0.9587020648967551, "grad_norm": 0.3185953959767205, "learning_rate": 8.602185742500473e-06, "loss": 0.0322, "step": 3575 }, { "epoch": 0.9589702333065165, "grad_norm": 0.2783640410490793, "learning_rate": 8.601103545858951e-06, "loss": 0.0307, "step": 3576 }, { "epoch": 0.9592384017162778, "grad_norm": 0.3495298034297191, "learning_rate": 8.600020998587537e-06, "loss": 0.0264, "step": 3577 }, { "epoch": 0.9595065701260391, "grad_norm": 0.28682301255627823, "learning_rate": 8.59893810079164e-06, "loss": 0.0283, "step": 3578 }, { "epoch": 0.9597747385358005, "grad_norm": 0.28204027645970686, "learning_rate": 8.59785485257669e-06, "loss": 0.0245, "step": 3579 }, { "epoch": 0.9600429069455618, "grad_norm": 0.3151392967216016, "learning_rate": 8.596771254048167e-06, "loss": 0.031, "step": 3580 }, { "epoch": 0.9603110753553231, "grad_norm": 0.49012244798563714, "learning_rate": 8.595687305311578e-06, "loss": 0.0422, "step": 3581 }, { "epoch": 0.9605792437650845, "grad_norm": 0.6291615130829359, "learning_rate": 8.59460300647246e-06, "loss": 0.0417, "step": 3582 }, { "epoch": 0.9608474121748458, "grad_norm": 0.7142187479303892, "learning_rate": 8.593518357636391e-06, "loss": 0.0328, "step": 3583 }, { "epoch": 0.9611155805846071, "grad_norm": 0.3275506431842715, "learning_rate": 8.592433358908983e-06, "loss": 0.031, "step": 3584 }, { "epoch": 0.9613837489943685, "grad_norm": 0.2988858656577797, "learning_rate": 8.591348010395874e-06, "loss": 0.0275, "step": 3585 }, { "epoch": 0.9616519174041298, "grad_norm": 0.3757247528570066, "learning_rate": 8.590262312202745e-06, "loss": 0.0306, "step": 3586 }, { "epoch": 0.9619200858138911, "grad_norm": 0.273062698872959, "learning_rate": 8.589176264435306e-06, "loss": 0.0317, "step": 3587 }, { "epoch": 0.9621882542236525, "grad_norm": 0.4989624237172853, "learning_rate": 8.588089867199302e-06, "loss": 0.0285, "step": 3588 }, { "epoch": 0.9624564226334138, "grad_norm": 0.3460364337520605, "learning_rate": 8.587003120600516e-06, "loss": 0.0303, "step": 3589 }, { "epoch": 0.9627245910431751, "grad_norm": 0.6028209878297713, "learning_rate": 8.58591602474476e-06, "loss": 0.035, "step": 3590 }, { "epoch": 0.9629927594529365, "grad_norm": 0.3627544850398372, "learning_rate": 8.58482857973788e-06, "loss": 0.0306, "step": 3591 }, { "epoch": 0.9632609278626978, "grad_norm": 0.5036391619583225, "learning_rate": 8.58374078568576e-06, "loss": 0.0286, "step": 3592 }, { "epoch": 0.9635290962724591, "grad_norm": 0.4005179781099706, "learning_rate": 8.582652642694311e-06, "loss": 0.037, "step": 3593 }, { "epoch": 0.9637972646822205, "grad_norm": 0.3949093922850974, "learning_rate": 8.581564150869488e-06, "loss": 0.0326, "step": 3594 }, { "epoch": 0.9640654330919818, "grad_norm": 0.28835274838047764, "learning_rate": 8.580475310317273e-06, "loss": 0.0326, "step": 3595 }, { "epoch": 0.9643336015017431, "grad_norm": 0.3104035876542182, "learning_rate": 8.579386121143682e-06, "loss": 0.0288, "step": 3596 }, { "epoch": 0.9646017699115044, "grad_norm": 0.34041386153468695, "learning_rate": 8.578296583454767e-06, "loss": 0.0333, "step": 3597 }, { "epoch": 0.9648699383212658, "grad_norm": 0.500109460322815, "learning_rate": 8.577206697356614e-06, "loss": 0.0441, "step": 3598 }, { "epoch": 0.9651381067310271, "grad_norm": 0.3013686309288792, "learning_rate": 8.576116462955345e-06, "loss": 0.0184, "step": 3599 }, { "epoch": 0.9654062751407884, "grad_norm": 0.5428248015501635, "learning_rate": 8.575025880357109e-06, "loss": 0.0324, "step": 3600 }, { "epoch": 0.9656744435505498, "grad_norm": 0.30118618146595383, "learning_rate": 8.573934949668093e-06, "loss": 0.0341, "step": 3601 }, { "epoch": 0.9659426119603111, "grad_norm": 0.32486728670072945, "learning_rate": 8.572843670994521e-06, "loss": 0.0462, "step": 3602 }, { "epoch": 0.9662107803700724, "grad_norm": 0.39848629559963616, "learning_rate": 8.571752044442645e-06, "loss": 0.041, "step": 3603 }, { "epoch": 0.9664789487798338, "grad_norm": 0.3016615814992247, "learning_rate": 8.570660070118758e-06, "loss": 0.0232, "step": 3604 }, { "epoch": 0.966747117189595, "grad_norm": 0.3903766925165771, "learning_rate": 8.569567748129177e-06, "loss": 0.0304, "step": 3605 }, { "epoch": 0.9670152855993563, "grad_norm": 0.39932121786725394, "learning_rate": 8.568475078580262e-06, "loss": 0.0293, "step": 3606 }, { "epoch": 0.9672834540091177, "grad_norm": 0.39562142160648417, "learning_rate": 8.567382061578404e-06, "loss": 0.0415, "step": 3607 }, { "epoch": 0.967551622418879, "grad_norm": 0.26051531713373094, "learning_rate": 8.566288697230024e-06, "loss": 0.0256, "step": 3608 }, { "epoch": 0.9678197908286403, "grad_norm": 0.7105109594981944, "learning_rate": 8.565194985641582e-06, "loss": 0.0333, "step": 3609 }, { "epoch": 0.9680879592384017, "grad_norm": 0.3720483660672912, "learning_rate": 8.564100926919572e-06, "loss": 0.0451, "step": 3610 }, { "epoch": 0.968356127648163, "grad_norm": 0.784727960567823, "learning_rate": 8.563006521170516e-06, "loss": 0.034, "step": 3611 }, { "epoch": 0.9686242960579243, "grad_norm": 0.6062223015771004, "learning_rate": 8.561911768500974e-06, "loss": 0.032, "step": 3612 }, { "epoch": 0.9688924644676857, "grad_norm": 0.39504268843170937, "learning_rate": 8.56081666901754e-06, "loss": 0.0364, "step": 3613 }, { "epoch": 0.969160632877447, "grad_norm": 0.3030056471490794, "learning_rate": 8.559721222826841e-06, "loss": 0.0237, "step": 3614 }, { "epoch": 0.9694288012872083, "grad_norm": 0.7599004933675892, "learning_rate": 8.558625430035537e-06, "loss": 0.0305, "step": 3615 }, { "epoch": 0.9696969696969697, "grad_norm": 0.343388948770909, "learning_rate": 8.557529290750324e-06, "loss": 0.0311, "step": 3616 }, { "epoch": 0.969965138106731, "grad_norm": 0.3402353120384854, "learning_rate": 8.556432805077927e-06, "loss": 0.0476, "step": 3617 }, { "epoch": 0.9702333065164923, "grad_norm": 0.4781030988722, "learning_rate": 8.555335973125111e-06, "loss": 0.0382, "step": 3618 }, { "epoch": 0.9705014749262537, "grad_norm": 0.4992659874134966, "learning_rate": 8.554238794998672e-06, "loss": 0.0397, "step": 3619 }, { "epoch": 0.970769643336015, "grad_norm": 0.31974148482946496, "learning_rate": 8.553141270805435e-06, "loss": 0.0257, "step": 3620 }, { "epoch": 0.9710378117457763, "grad_norm": 0.2328888531367756, "learning_rate": 8.552043400652269e-06, "loss": 0.0261, "step": 3621 }, { "epoch": 0.9713059801555377, "grad_norm": 0.3657806404679391, "learning_rate": 8.550945184646065e-06, "loss": 0.0373, "step": 3622 }, { "epoch": 0.971574148565299, "grad_norm": 0.32882856077966904, "learning_rate": 8.549846622893758e-06, "loss": 0.0345, "step": 3623 }, { "epoch": 0.9718423169750603, "grad_norm": 0.4022086541233728, "learning_rate": 8.548747715502309e-06, "loss": 0.0268, "step": 3624 }, { "epoch": 0.9721104853848217, "grad_norm": 0.39835162949469044, "learning_rate": 8.547648462578717e-06, "loss": 0.0363, "step": 3625 }, { "epoch": 0.972378653794583, "grad_norm": 0.330564160035514, "learning_rate": 8.546548864230011e-06, "loss": 0.0328, "step": 3626 }, { "epoch": 0.9726468222043443, "grad_norm": 0.3309434377910914, "learning_rate": 8.54544892056326e-06, "loss": 0.0231, "step": 3627 }, { "epoch": 0.9729149906141057, "grad_norm": 0.39382364885742527, "learning_rate": 8.544348631685561e-06, "loss": 0.0369, "step": 3628 }, { "epoch": 0.973183159023867, "grad_norm": 0.34978941019018345, "learning_rate": 8.543247997704045e-06, "loss": 0.0343, "step": 3629 }, { "epoch": 0.9734513274336283, "grad_norm": 0.2924714109476883, "learning_rate": 8.542147018725878e-06, "loss": 0.0248, "step": 3630 }, { "epoch": 0.9737194958433897, "grad_norm": 0.4231194015496994, "learning_rate": 8.54104569485826e-06, "loss": 0.0429, "step": 3631 }, { "epoch": 0.973987664253151, "grad_norm": 0.30205461411936785, "learning_rate": 8.539944026208426e-06, "loss": 0.0287, "step": 3632 }, { "epoch": 0.9742558326629123, "grad_norm": 0.4436855941384288, "learning_rate": 8.538842012883639e-06, "loss": 0.0253, "step": 3633 }, { "epoch": 0.9745240010726737, "grad_norm": 0.3042032110696812, "learning_rate": 8.537739654991201e-06, "loss": 0.0284, "step": 3634 }, { "epoch": 0.974792169482435, "grad_norm": 0.3740067423378039, "learning_rate": 8.536636952638444e-06, "loss": 0.0305, "step": 3635 }, { "epoch": 0.9750603378921963, "grad_norm": 0.4220913053207132, "learning_rate": 8.535533905932739e-06, "loss": 0.0402, "step": 3636 }, { "epoch": 0.9753285063019577, "grad_norm": 0.4331822385387482, "learning_rate": 8.534430514981483e-06, "loss": 0.0373, "step": 3637 }, { "epoch": 0.975596674711719, "grad_norm": 0.39750156887269056, "learning_rate": 8.533326779892111e-06, "loss": 0.0277, "step": 3638 }, { "epoch": 0.9758648431214803, "grad_norm": 1.3201139789972984, "learning_rate": 8.53222270077209e-06, "loss": 0.0479, "step": 3639 }, { "epoch": 0.9761330115312417, "grad_norm": 0.40423632278071825, "learning_rate": 8.531118277728925e-06, "loss": 0.042, "step": 3640 }, { "epoch": 0.976401179941003, "grad_norm": 0.364463374774072, "learning_rate": 8.530013510870148e-06, "loss": 0.0367, "step": 3641 }, { "epoch": 0.9766693483507642, "grad_norm": 0.2982704885536568, "learning_rate": 8.528908400303326e-06, "loss": 0.0359, "step": 3642 }, { "epoch": 0.9769375167605256, "grad_norm": 0.36406833969195906, "learning_rate": 8.527802946136063e-06, "loss": 0.036, "step": 3643 }, { "epoch": 0.9772056851702869, "grad_norm": 0.2926632869322191, "learning_rate": 8.526697148475993e-06, "loss": 0.0232, "step": 3644 }, { "epoch": 0.9774738535800482, "grad_norm": 0.3104761628676533, "learning_rate": 8.525591007430785e-06, "loss": 0.0244, "step": 3645 }, { "epoch": 0.9777420219898096, "grad_norm": 0.7557963682686898, "learning_rate": 8.52448452310814e-06, "loss": 0.0355, "step": 3646 }, { "epoch": 0.9780101903995709, "grad_norm": 0.35280727238867876, "learning_rate": 8.523377695615797e-06, "loss": 0.0334, "step": 3647 }, { "epoch": 0.9782783588093322, "grad_norm": 0.36715952591403095, "learning_rate": 8.52227052506152e-06, "loss": 0.0309, "step": 3648 }, { "epoch": 0.9785465272190936, "grad_norm": 0.3839059885291677, "learning_rate": 8.521163011553114e-06, "loss": 0.0254, "step": 3649 }, { "epoch": 0.9788146956288549, "grad_norm": 0.3918615767690337, "learning_rate": 8.520055155198413e-06, "loss": 0.0333, "step": 3650 }, { "epoch": 0.9790828640386162, "grad_norm": 0.32140123859303615, "learning_rate": 8.518946956105288e-06, "loss": 0.0289, "step": 3651 }, { "epoch": 0.9793510324483776, "grad_norm": 0.31017674200415773, "learning_rate": 8.51783841438164e-06, "loss": 0.0234, "step": 3652 }, { "epoch": 0.9796192008581389, "grad_norm": 0.29975746247711027, "learning_rate": 8.516729530135406e-06, "loss": 0.0298, "step": 3653 }, { "epoch": 0.9798873692679002, "grad_norm": 0.3268654142759821, "learning_rate": 8.515620303474554e-06, "loss": 0.0272, "step": 3654 }, { "epoch": 0.9801555376776616, "grad_norm": 0.3142866388604586, "learning_rate": 8.514510734507087e-06, "loss": 0.0227, "step": 3655 }, { "epoch": 0.9804237060874229, "grad_norm": 0.34433421524887536, "learning_rate": 8.513400823341042e-06, "loss": 0.0324, "step": 3656 }, { "epoch": 0.9806918744971842, "grad_norm": 0.26529099639928977, "learning_rate": 8.512290570084486e-06, "loss": 0.0224, "step": 3657 }, { "epoch": 0.9809600429069456, "grad_norm": 0.36606696508016767, "learning_rate": 8.51117997484552e-06, "loss": 0.0349, "step": 3658 }, { "epoch": 0.9812282113167069, "grad_norm": 0.31790399298441924, "learning_rate": 8.510069037732284e-06, "loss": 0.0314, "step": 3659 }, { "epoch": 0.9814963797264682, "grad_norm": 0.5595156414326738, "learning_rate": 8.508957758852943e-06, "loss": 0.0407, "step": 3660 }, { "epoch": 0.9817645481362296, "grad_norm": 0.28438168158365773, "learning_rate": 8.507846138315702e-06, "loss": 0.0268, "step": 3661 }, { "epoch": 0.9820327165459909, "grad_norm": 0.41442313395237773, "learning_rate": 8.506734176228797e-06, "loss": 0.0357, "step": 3662 }, { "epoch": 0.9823008849557522, "grad_norm": 0.30821055404697556, "learning_rate": 8.505621872700493e-06, "loss": 0.0373, "step": 3663 }, { "epoch": 0.9825690533655136, "grad_norm": 0.4322438902686305, "learning_rate": 8.504509227839097e-06, "loss": 0.0309, "step": 3664 }, { "epoch": 0.9828372217752749, "grad_norm": 0.24182023383172518, "learning_rate": 8.503396241752942e-06, "loss": 0.0234, "step": 3665 }, { "epoch": 0.9831053901850362, "grad_norm": 0.3329531361639742, "learning_rate": 8.502282914550395e-06, "loss": 0.0294, "step": 3666 }, { "epoch": 0.9833735585947976, "grad_norm": 0.30467797241152317, "learning_rate": 8.501169246339857e-06, "loss": 0.0271, "step": 3667 }, { "epoch": 0.9836417270045589, "grad_norm": 2.0568412835692036, "learning_rate": 8.500055237229768e-06, "loss": 0.0427, "step": 3668 }, { "epoch": 0.9839098954143202, "grad_norm": 0.298483773343428, "learning_rate": 8.498940887328592e-06, "loss": 0.0369, "step": 3669 }, { "epoch": 0.9841780638240816, "grad_norm": 0.4765706278252705, "learning_rate": 8.497826196744832e-06, "loss": 0.0252, "step": 3670 }, { "epoch": 0.9844462322338429, "grad_norm": 0.32576955375356076, "learning_rate": 8.496711165587021e-06, "loss": 0.0315, "step": 3671 }, { "epoch": 0.9847144006436042, "grad_norm": 0.2771121438909427, "learning_rate": 8.495595793963726e-06, "loss": 0.0189, "step": 3672 }, { "epoch": 0.9849825690533656, "grad_norm": 0.5073905161745529, "learning_rate": 8.494480081983552e-06, "loss": 0.0306, "step": 3673 }, { "epoch": 0.9852507374631269, "grad_norm": 0.2921129310714333, "learning_rate": 8.493364029755126e-06, "loss": 0.0303, "step": 3674 }, { "epoch": 0.9855189058728882, "grad_norm": 0.302137244795995, "learning_rate": 8.492247637387123e-06, "loss": 0.0201, "step": 3675 }, { "epoch": 0.9857870742826496, "grad_norm": 0.31201247644287644, "learning_rate": 8.491130904988239e-06, "loss": 0.0238, "step": 3676 }, { "epoch": 0.9860552426924108, "grad_norm": 0.33907238930914346, "learning_rate": 8.490013832667205e-06, "loss": 0.0333, "step": 3677 }, { "epoch": 0.9863234111021721, "grad_norm": 0.3367281057477326, "learning_rate": 8.488896420532793e-06, "loss": 0.0256, "step": 3678 }, { "epoch": 0.9865915795119335, "grad_norm": 0.4660794227660885, "learning_rate": 8.487778668693797e-06, "loss": 0.0303, "step": 3679 }, { "epoch": 0.9868597479216948, "grad_norm": 0.4623525994882106, "learning_rate": 8.486660577259054e-06, "loss": 0.0289, "step": 3680 }, { "epoch": 0.9871279163314561, "grad_norm": 0.3131601269450747, "learning_rate": 8.485542146337424e-06, "loss": 0.0285, "step": 3681 }, { "epoch": 0.9873960847412175, "grad_norm": 0.3414207166407291, "learning_rate": 8.484423376037813e-06, "loss": 0.034, "step": 3682 }, { "epoch": 0.9876642531509788, "grad_norm": 0.33735721905316585, "learning_rate": 8.483304266469148e-06, "loss": 0.0295, "step": 3683 }, { "epoch": 0.9879324215607401, "grad_norm": 0.21880821900721562, "learning_rate": 8.482184817740393e-06, "loss": 0.0258, "step": 3684 }, { "epoch": 0.9882005899705014, "grad_norm": 0.4774625732927812, "learning_rate": 8.481065029960548e-06, "loss": 0.0388, "step": 3685 }, { "epoch": 0.9884687583802628, "grad_norm": 0.3485597208300633, "learning_rate": 8.479944903238644e-06, "loss": 0.0302, "step": 3686 }, { "epoch": 0.9887369267900241, "grad_norm": 0.32281229103472747, "learning_rate": 8.478824437683742e-06, "loss": 0.0291, "step": 3687 }, { "epoch": 0.9890050951997854, "grad_norm": 0.2962120415204118, "learning_rate": 8.477703633404943e-06, "loss": 0.0257, "step": 3688 }, { "epoch": 0.9892732636095468, "grad_norm": 0.2295530027038438, "learning_rate": 8.476582490511373e-06, "loss": 0.0206, "step": 3689 }, { "epoch": 0.9895414320193081, "grad_norm": 0.27983002006826563, "learning_rate": 8.475461009112196e-06, "loss": 0.0357, "step": 3690 }, { "epoch": 0.9898096004290694, "grad_norm": 0.485469828273517, "learning_rate": 8.474339189316607e-06, "loss": 0.0256, "step": 3691 }, { "epoch": 0.9900777688388308, "grad_norm": 0.47746549093288887, "learning_rate": 8.473217031233835e-06, "loss": 0.0394, "step": 3692 }, { "epoch": 0.9903459372485921, "grad_norm": 0.35861345590939137, "learning_rate": 8.472094534973143e-06, "loss": 0.0325, "step": 3693 }, { "epoch": 0.9906141056583534, "grad_norm": 0.6419213139539572, "learning_rate": 8.470971700643826e-06, "loss": 0.0335, "step": 3694 }, { "epoch": 0.9908822740681148, "grad_norm": 0.31569654672949454, "learning_rate": 8.469848528355208e-06, "loss": 0.0342, "step": 3695 }, { "epoch": 0.9911504424778761, "grad_norm": 0.39403525569129794, "learning_rate": 8.46872501821665e-06, "loss": 0.0409, "step": 3696 }, { "epoch": 0.9914186108876374, "grad_norm": 0.29754479097228625, "learning_rate": 8.467601170337548e-06, "loss": 0.0373, "step": 3697 }, { "epoch": 0.9916867792973988, "grad_norm": 0.274223177068339, "learning_rate": 8.466476984827326e-06, "loss": 0.0267, "step": 3698 }, { "epoch": 0.9919549477071601, "grad_norm": 0.48924391873126055, "learning_rate": 8.465352461795443e-06, "loss": 0.0355, "step": 3699 }, { "epoch": 0.9922231161169214, "grad_norm": 0.4956224694892889, "learning_rate": 8.464227601351391e-06, "loss": 0.0529, "step": 3700 }, { "epoch": 0.9924912845266828, "grad_norm": 0.40193231427195764, "learning_rate": 8.463102403604697e-06, "loss": 0.0377, "step": 3701 }, { "epoch": 0.9927594529364441, "grad_norm": 0.3688838728014438, "learning_rate": 8.461976868664912e-06, "loss": 0.0303, "step": 3702 }, { "epoch": 0.9930276213462054, "grad_norm": 0.8738824779200808, "learning_rate": 8.460850996641635e-06, "loss": 0.0664, "step": 3703 }, { "epoch": 0.9932957897559668, "grad_norm": 0.3243956347912429, "learning_rate": 8.459724787644481e-06, "loss": 0.0306, "step": 3704 }, { "epoch": 0.9935639581657281, "grad_norm": 0.35510414528393397, "learning_rate": 8.458598241783112e-06, "loss": 0.0326, "step": 3705 }, { "epoch": 0.9938321265754894, "grad_norm": 0.2966461560419278, "learning_rate": 8.457471359167215e-06, "loss": 0.032, "step": 3706 }, { "epoch": 0.9941002949852508, "grad_norm": 0.4555609553132581, "learning_rate": 8.45634413990651e-06, "loss": 0.0399, "step": 3707 }, { "epoch": 0.9943684633950121, "grad_norm": 0.23939977132081122, "learning_rate": 8.455216584110753e-06, "loss": 0.0269, "step": 3708 }, { "epoch": 0.9946366318047734, "grad_norm": 0.36194181468132897, "learning_rate": 8.45408869188973e-06, "loss": 0.0414, "step": 3709 }, { "epoch": 0.9949048002145348, "grad_norm": 0.31030928597820945, "learning_rate": 8.452960463353262e-06, "loss": 0.027, "step": 3710 }, { "epoch": 0.995172968624296, "grad_norm": 0.4404306219599595, "learning_rate": 8.451831898611202e-06, "loss": 0.0377, "step": 3711 }, { "epoch": 0.9954411370340573, "grad_norm": 0.43689770072349654, "learning_rate": 8.450702997773435e-06, "loss": 0.0464, "step": 3712 }, { "epoch": 0.9957093054438187, "grad_norm": 0.35043458295306473, "learning_rate": 8.449573760949876e-06, "loss": 0.0268, "step": 3713 }, { "epoch": 0.99597747385358, "grad_norm": 0.9409836069390604, "learning_rate": 8.44844418825048e-06, "loss": 0.0427, "step": 3714 }, { "epoch": 0.9962456422633413, "grad_norm": 0.2992776504581052, "learning_rate": 8.447314279785228e-06, "loss": 0.0305, "step": 3715 }, { "epoch": 0.9965138106731027, "grad_norm": 0.320474019308063, "learning_rate": 8.446184035664136e-06, "loss": 0.0282, "step": 3716 }, { "epoch": 0.996781979082864, "grad_norm": 0.22390935275976268, "learning_rate": 8.445053455997256e-06, "loss": 0.0291, "step": 3717 }, { "epoch": 0.9970501474926253, "grad_norm": 0.30609713309834785, "learning_rate": 8.443922540894666e-06, "loss": 0.0262, "step": 3718 }, { "epoch": 0.9973183159023867, "grad_norm": 0.2915056754638032, "learning_rate": 8.442791290466483e-06, "loss": 0.0251, "step": 3719 }, { "epoch": 0.997586484312148, "grad_norm": 0.5315435716537974, "learning_rate": 8.441659704822853e-06, "loss": 0.0501, "step": 3720 }, { "epoch": 0.9978546527219093, "grad_norm": 0.2797431325832765, "learning_rate": 8.440527784073953e-06, "loss": 0.0313, "step": 3721 }, { "epoch": 0.9981228211316707, "grad_norm": 0.28667142303323095, "learning_rate": 8.439395528329998e-06, "loss": 0.028, "step": 3722 }, { "epoch": 0.998390989541432, "grad_norm": 0.3869771203519002, "learning_rate": 8.438262937701232e-06, "loss": 0.0372, "step": 3723 }, { "epoch": 0.9986591579511933, "grad_norm": 0.48439110013001574, "learning_rate": 8.437130012297932e-06, "loss": 0.0397, "step": 3724 }, { "epoch": 0.9989273263609547, "grad_norm": 0.3479838437239292, "learning_rate": 8.435996752230408e-06, "loss": 0.0364, "step": 3725 }, { "epoch": 0.999195494770716, "grad_norm": 0.3807811851975286, "learning_rate": 8.434863157609003e-06, "loss": 0.0371, "step": 3726 }, { "epoch": 0.9994636631804773, "grad_norm": 0.25942704416765444, "learning_rate": 8.433729228544092e-06, "loss": 0.0262, "step": 3727 }, { "epoch": 0.9997318315902387, "grad_norm": 0.5239163375832191, "learning_rate": 8.43259496514608e-06, "loss": 0.0325, "step": 3728 }, { "epoch": 1.0, "grad_norm": 0.40559477338466393, "learning_rate": 8.431460367525412e-06, "loss": 0.0378, "step": 3729 }, { "epoch": 1.0, "eval_loss": 0.0330052524805069, "eval_runtime": 300.123, "eval_samples_per_second": 83.702, "eval_steps_per_second": 1.309, "step": 3729 }, { "epoch": 1.0002681684097614, "grad_norm": 0.4085592416947716, "learning_rate": 8.430325435792557e-06, "loss": 0.0299, "step": 3730 }, { "epoch": 1.0005363368195226, "grad_norm": 0.46192065977986485, "learning_rate": 8.429190170058024e-06, "loss": 0.0277, "step": 3731 }, { "epoch": 1.000804505229284, "grad_norm": 0.27098473567763354, "learning_rate": 8.428054570432345e-06, "loss": 0.0214, "step": 3732 }, { "epoch": 1.0010726736390454, "grad_norm": 0.2669374902576558, "learning_rate": 8.426918637026097e-06, "loss": 0.0221, "step": 3733 }, { "epoch": 1.0013408420488066, "grad_norm": 0.3660660784561016, "learning_rate": 8.425782369949877e-06, "loss": 0.0319, "step": 3734 }, { "epoch": 1.001609010458568, "grad_norm": 0.3476119667558623, "learning_rate": 8.424645769314324e-06, "loss": 0.0219, "step": 3735 }, { "epoch": 1.0018771788683294, "grad_norm": 0.3795493525156423, "learning_rate": 8.423508835230106e-06, "loss": 0.0296, "step": 3736 }, { "epoch": 1.0021453472780906, "grad_norm": 0.24058794037465714, "learning_rate": 8.422371567807921e-06, "loss": 0.025, "step": 3737 }, { "epoch": 1.002413515687852, "grad_norm": 0.3015092909201774, "learning_rate": 8.421233967158503e-06, "loss": 0.0283, "step": 3738 }, { "epoch": 1.0026816840976134, "grad_norm": 0.40630201578035013, "learning_rate": 8.420096033392615e-06, "loss": 0.0315, "step": 3739 }, { "epoch": 1.0029498525073746, "grad_norm": 0.3218054422630295, "learning_rate": 8.418957766621058e-06, "loss": 0.0254, "step": 3740 }, { "epoch": 1.003218020917136, "grad_norm": 0.32459839233578236, "learning_rate": 8.417819166954659e-06, "loss": 0.0234, "step": 3741 }, { "epoch": 1.0034861893268974, "grad_norm": 0.42755527210206806, "learning_rate": 8.416680234504284e-06, "loss": 0.0332, "step": 3742 }, { "epoch": 1.0037543577366586, "grad_norm": 0.3576850275051218, "learning_rate": 8.415540969380824e-06, "loss": 0.0294, "step": 3743 }, { "epoch": 1.00402252614642, "grad_norm": 0.272713424689016, "learning_rate": 8.41440137169521e-06, "loss": 0.019, "step": 3744 }, { "epoch": 1.0042906945561814, "grad_norm": 0.3670898597024907, "learning_rate": 8.413261441558397e-06, "loss": 0.0323, "step": 3745 }, { "epoch": 1.0045588629659425, "grad_norm": 0.4390344580459956, "learning_rate": 8.41212117908138e-06, "loss": 0.0335, "step": 3746 }, { "epoch": 1.004827031375704, "grad_norm": 0.40433151403062706, "learning_rate": 8.410980584375184e-06, "loss": 0.0241, "step": 3747 }, { "epoch": 1.0050951997854654, "grad_norm": 0.26900384574546155, "learning_rate": 8.409839657550864e-06, "loss": 0.0235, "step": 3748 }, { "epoch": 1.0053633681952265, "grad_norm": 0.3193739196282925, "learning_rate": 8.408698398719509e-06, "loss": 0.0322, "step": 3749 }, { "epoch": 1.005631536604988, "grad_norm": 0.6659207090847489, "learning_rate": 8.40755680799224e-06, "loss": 0.0394, "step": 3750 }, { "epoch": 1.0058997050147493, "grad_norm": 0.4990929017232432, "learning_rate": 8.406414885480215e-06, "loss": 0.0326, "step": 3751 }, { "epoch": 1.0061678734245105, "grad_norm": 0.28010703533005055, "learning_rate": 8.405272631294612e-06, "loss": 0.0235, "step": 3752 }, { "epoch": 1.006436041834272, "grad_norm": 0.2762779329761583, "learning_rate": 8.404130045546657e-06, "loss": 0.0225, "step": 3753 }, { "epoch": 1.0067042102440333, "grad_norm": 0.30014224071003875, "learning_rate": 8.402987128347596e-06, "loss": 0.0254, "step": 3754 }, { "epoch": 1.0069723786537945, "grad_norm": 0.3498981961975498, "learning_rate": 8.401843879808713e-06, "loss": 0.032, "step": 3755 }, { "epoch": 1.007240547063556, "grad_norm": 0.32819334380532583, "learning_rate": 8.400700300041322e-06, "loss": 0.0348, "step": 3756 }, { "epoch": 1.0075087154733173, "grad_norm": 0.23572749407751117, "learning_rate": 8.399556389156773e-06, "loss": 0.0199, "step": 3757 }, { "epoch": 1.0077768838830785, "grad_norm": 0.40191951510931007, "learning_rate": 8.398412147266444e-06, "loss": 0.031, "step": 3758 }, { "epoch": 1.00804505229284, "grad_norm": 0.2771216370970497, "learning_rate": 8.397267574481746e-06, "loss": 0.0245, "step": 3759 }, { "epoch": 1.0083132207026013, "grad_norm": 0.41700642889115425, "learning_rate": 8.396122670914124e-06, "loss": 0.0279, "step": 3760 }, { "epoch": 1.0085813891123625, "grad_norm": 0.32155443627374597, "learning_rate": 8.394977436675056e-06, "loss": 0.0301, "step": 3761 }, { "epoch": 1.008849557522124, "grad_norm": 0.35877587432304964, "learning_rate": 8.393831871876046e-06, "loss": 0.0246, "step": 3762 }, { "epoch": 1.0091177259318853, "grad_norm": 0.47724872482040814, "learning_rate": 8.392685976628638e-06, "loss": 0.0325, "step": 3763 }, { "epoch": 1.0093858943416465, "grad_norm": 0.2064204727749518, "learning_rate": 8.391539751044405e-06, "loss": 0.0186, "step": 3764 }, { "epoch": 1.009654062751408, "grad_norm": 0.35911668532173563, "learning_rate": 8.39039319523495e-06, "loss": 0.0265, "step": 3765 }, { "epoch": 1.0099222311611693, "grad_norm": 0.3409964516919302, "learning_rate": 8.389246309311909e-06, "loss": 0.0283, "step": 3766 }, { "epoch": 1.0101903995709305, "grad_norm": 0.27576876707319053, "learning_rate": 8.388099093386957e-06, "loss": 0.0221, "step": 3767 }, { "epoch": 1.010458567980692, "grad_norm": 0.3094779445128216, "learning_rate": 8.38695154757179e-06, "loss": 0.0216, "step": 3768 }, { "epoch": 1.0107267363904533, "grad_norm": 0.30393419031360175, "learning_rate": 8.38580367197814e-06, "loss": 0.0307, "step": 3769 }, { "epoch": 1.0109949048002145, "grad_norm": 0.2738097178406625, "learning_rate": 8.384655466717778e-06, "loss": 0.0245, "step": 3770 }, { "epoch": 1.0112630732099759, "grad_norm": 0.7482951141666783, "learning_rate": 8.3835069319025e-06, "loss": 0.0231, "step": 3771 }, { "epoch": 1.0115312416197373, "grad_norm": 0.5591449305281037, "learning_rate": 8.382358067644132e-06, "loss": 0.0443, "step": 3772 }, { "epoch": 1.0117994100294985, "grad_norm": 0.22570568957075818, "learning_rate": 8.381208874054542e-06, "loss": 0.0195, "step": 3773 }, { "epoch": 1.0120675784392599, "grad_norm": 0.31093540589036334, "learning_rate": 8.380059351245621e-06, "loss": 0.0317, "step": 3774 }, { "epoch": 1.0123357468490213, "grad_norm": 0.30725587911981944, "learning_rate": 8.378909499329294e-06, "loss": 0.0228, "step": 3775 }, { "epoch": 1.0126039152587825, "grad_norm": 0.42345997836301325, "learning_rate": 8.377759318417519e-06, "loss": 0.0348, "step": 3776 }, { "epoch": 1.0128720836685439, "grad_norm": 0.28414294566052456, "learning_rate": 8.376608808622287e-06, "loss": 0.0289, "step": 3777 }, { "epoch": 1.0131402520783053, "grad_norm": 0.4654614677584702, "learning_rate": 8.375457970055622e-06, "loss": 0.0391, "step": 3778 }, { "epoch": 1.0134084204880665, "grad_norm": 0.2791578766369954, "learning_rate": 8.374306802829574e-06, "loss": 0.027, "step": 3779 }, { "epoch": 1.0136765888978279, "grad_norm": 0.3448044939649245, "learning_rate": 8.373155307056233e-06, "loss": 0.0352, "step": 3780 }, { "epoch": 1.0139447573075893, "grad_norm": 0.47713641216543373, "learning_rate": 8.372003482847715e-06, "loss": 0.0402, "step": 3781 }, { "epoch": 1.0142129257173504, "grad_norm": 0.4443335553917476, "learning_rate": 8.370851330316169e-06, "loss": 0.0331, "step": 3782 }, { "epoch": 1.0144810941271118, "grad_norm": 0.269267544885955, "learning_rate": 8.369698849573778e-06, "loss": 0.0253, "step": 3783 }, { "epoch": 1.0147492625368733, "grad_norm": 0.4586000036646455, "learning_rate": 8.36854604073276e-06, "loss": 0.028, "step": 3784 }, { "epoch": 1.0150174309466344, "grad_norm": 0.42019985201543586, "learning_rate": 8.367392903905354e-06, "loss": 0.0356, "step": 3785 }, { "epoch": 1.0152855993563958, "grad_norm": 0.38361208979041245, "learning_rate": 8.366239439203842e-06, "loss": 0.0271, "step": 3786 }, { "epoch": 1.0155537677661572, "grad_norm": 0.2922204364996332, "learning_rate": 8.365085646740533e-06, "loss": 0.0342, "step": 3787 }, { "epoch": 1.0158219361759184, "grad_norm": 0.35345300203593727, "learning_rate": 8.363931526627769e-06, "loss": 0.0365, "step": 3788 }, { "epoch": 1.0160901045856798, "grad_norm": 0.37501489821123735, "learning_rate": 8.362777078977922e-06, "loss": 0.0293, "step": 3789 }, { "epoch": 1.0163582729954412, "grad_norm": 0.30180584712850295, "learning_rate": 8.3616223039034e-06, "loss": 0.0254, "step": 3790 }, { "epoch": 1.0166264414052024, "grad_norm": 0.28679505730314925, "learning_rate": 8.36046720151664e-06, "loss": 0.0255, "step": 3791 }, { "epoch": 1.0168946098149638, "grad_norm": 0.2970591186403319, "learning_rate": 8.35931177193011e-06, "loss": 0.028, "step": 3792 }, { "epoch": 1.0171627782247252, "grad_norm": 0.2543144193869907, "learning_rate": 8.35815601525631e-06, "loss": 0.0232, "step": 3793 }, { "epoch": 1.0174309466344864, "grad_norm": 0.3018916098642797, "learning_rate": 8.356999931607776e-06, "loss": 0.0212, "step": 3794 }, { "epoch": 1.0176991150442478, "grad_norm": 0.33552176408734113, "learning_rate": 8.355843521097071e-06, "loss": 0.0389, "step": 3795 }, { "epoch": 1.0179672834540092, "grad_norm": 0.283386370700812, "learning_rate": 8.354686783836791e-06, "loss": 0.0345, "step": 3796 }, { "epoch": 1.0182354518637704, "grad_norm": 0.322363003285709, "learning_rate": 8.353529719939568e-06, "loss": 0.025, "step": 3797 }, { "epoch": 1.0185036202735318, "grad_norm": 0.3169390203133281, "learning_rate": 8.352372329518058e-06, "loss": 0.0315, "step": 3798 }, { "epoch": 1.0187717886832932, "grad_norm": 0.2941212797810095, "learning_rate": 8.351214612684954e-06, "loss": 0.0217, "step": 3799 }, { "epoch": 1.0190399570930544, "grad_norm": 0.4832800729816997, "learning_rate": 8.35005656955298e-06, "loss": 0.0327, "step": 3800 }, { "epoch": 1.0193081255028158, "grad_norm": 0.2730801793906435, "learning_rate": 8.348898200234894e-06, "loss": 0.0185, "step": 3801 }, { "epoch": 1.0195762939125772, "grad_norm": 0.3334436308894534, "learning_rate": 8.34773950484348e-06, "loss": 0.0265, "step": 3802 }, { "epoch": 1.0198444623223384, "grad_norm": 0.6072727480769846, "learning_rate": 8.34658048349156e-06, "loss": 0.0381, "step": 3803 }, { "epoch": 1.0201126307320998, "grad_norm": 0.3565312767690091, "learning_rate": 8.345421136291983e-06, "loss": 0.0309, "step": 3804 }, { "epoch": 1.0203807991418612, "grad_norm": 0.5155837498721744, "learning_rate": 8.34426146335763e-06, "loss": 0.0383, "step": 3805 }, { "epoch": 1.0206489675516224, "grad_norm": 0.2794960117448267, "learning_rate": 8.343101464801418e-06, "loss": 0.0252, "step": 3806 }, { "epoch": 1.0209171359613838, "grad_norm": 0.3065205659296354, "learning_rate": 8.341941140736292e-06, "loss": 0.0214, "step": 3807 }, { "epoch": 1.0211853043711452, "grad_norm": 0.29031315599694557, "learning_rate": 8.340780491275232e-06, "loss": 0.0256, "step": 3808 }, { "epoch": 1.0214534727809064, "grad_norm": 0.3335494786164975, "learning_rate": 8.339619516531243e-06, "loss": 0.0302, "step": 3809 }, { "epoch": 1.0217216411906678, "grad_norm": 0.3518610473281327, "learning_rate": 8.338458216617368e-06, "loss": 0.0315, "step": 3810 }, { "epoch": 1.0219898096004292, "grad_norm": 0.2766770290407197, "learning_rate": 8.33729659164668e-06, "loss": 0.0289, "step": 3811 }, { "epoch": 1.0222579780101904, "grad_norm": 0.3273592234950132, "learning_rate": 8.336134641732284e-06, "loss": 0.0206, "step": 3812 }, { "epoch": 1.0225261464199518, "grad_norm": 0.30383067605075476, "learning_rate": 8.334972366987315e-06, "loss": 0.0254, "step": 3813 }, { "epoch": 1.0227943148297132, "grad_norm": 0.26127846155456075, "learning_rate": 8.33380976752494e-06, "loss": 0.0199, "step": 3814 }, { "epoch": 1.0230624832394744, "grad_norm": 0.28937918653128086, "learning_rate": 8.332646843458361e-06, "loss": 0.0381, "step": 3815 }, { "epoch": 1.0233306516492358, "grad_norm": 0.4017441387816911, "learning_rate": 8.331483594900807e-06, "loss": 0.0292, "step": 3816 }, { "epoch": 1.023598820058997, "grad_norm": 0.7819605173276373, "learning_rate": 8.330320021965538e-06, "loss": 0.0401, "step": 3817 }, { "epoch": 1.0238669884687583, "grad_norm": 0.27368879784032657, "learning_rate": 8.329156124765852e-06, "loss": 0.0299, "step": 3818 }, { "epoch": 1.0241351568785197, "grad_norm": 0.3752563169493104, "learning_rate": 8.327991903415071e-06, "loss": 0.0348, "step": 3819 }, { "epoch": 1.0244033252882812, "grad_norm": 0.34666768827658945, "learning_rate": 8.326827358026554e-06, "loss": 0.0286, "step": 3820 }, { "epoch": 1.0246714936980423, "grad_norm": 0.28213166251876426, "learning_rate": 8.325662488713692e-06, "loss": 0.0259, "step": 3821 }, { "epoch": 1.0249396621078037, "grad_norm": 0.4862306698248169, "learning_rate": 8.324497295589902e-06, "loss": 0.035, "step": 3822 }, { "epoch": 1.025207830517565, "grad_norm": 0.35464146836772437, "learning_rate": 8.323331778768636e-06, "loss": 0.0302, "step": 3823 }, { "epoch": 1.0254759989273263, "grad_norm": 0.3762974267368971, "learning_rate": 8.322165938363378e-06, "loss": 0.026, "step": 3824 }, { "epoch": 1.0257441673370877, "grad_norm": 0.3214660876082944, "learning_rate": 8.320999774487644e-06, "loss": 0.0254, "step": 3825 }, { "epoch": 1.026012335746849, "grad_norm": 0.22857557781654275, "learning_rate": 8.319833287254978e-06, "loss": 0.0207, "step": 3826 }, { "epoch": 1.0262805041566103, "grad_norm": 0.337835605928234, "learning_rate": 8.31866647677896e-06, "loss": 0.0217, "step": 3827 }, { "epoch": 1.0265486725663717, "grad_norm": 0.30305957386391386, "learning_rate": 8.317499343173196e-06, "loss": 0.0294, "step": 3828 }, { "epoch": 1.026816840976133, "grad_norm": 0.2996183089214558, "learning_rate": 8.31633188655133e-06, "loss": 0.0233, "step": 3829 }, { "epoch": 1.0270850093858943, "grad_norm": 0.3157018484871345, "learning_rate": 8.315164107027032e-06, "loss": 0.027, "step": 3830 }, { "epoch": 1.0273531777956557, "grad_norm": 0.3795048687387756, "learning_rate": 8.313996004714007e-06, "loss": 0.0287, "step": 3831 }, { "epoch": 1.027621346205417, "grad_norm": 0.3330762033868233, "learning_rate": 8.31282757972599e-06, "loss": 0.0334, "step": 3832 }, { "epoch": 1.0278895146151783, "grad_norm": 0.4211117371737151, "learning_rate": 8.311658832176747e-06, "loss": 0.0373, "step": 3833 }, { "epoch": 1.0281576830249397, "grad_norm": 0.23944118013247687, "learning_rate": 8.310489762180076e-06, "loss": 0.0225, "step": 3834 }, { "epoch": 1.0284258514347009, "grad_norm": 0.47375174348866994, "learning_rate": 8.309320369849804e-06, "loss": 0.0248, "step": 3835 }, { "epoch": 1.0286940198444623, "grad_norm": 0.3568185149623637, "learning_rate": 8.308150655299795e-06, "loss": 0.0292, "step": 3836 }, { "epoch": 1.0289621882542237, "grad_norm": 0.29398404465578026, "learning_rate": 8.30698061864394e-06, "loss": 0.024, "step": 3837 }, { "epoch": 1.0292303566639849, "grad_norm": 0.25828520343396716, "learning_rate": 8.305810259996163e-06, "loss": 0.0225, "step": 3838 }, { "epoch": 1.0294985250737463, "grad_norm": 0.4260845095853011, "learning_rate": 8.304639579470416e-06, "loss": 0.0223, "step": 3839 }, { "epoch": 1.0297666934835077, "grad_norm": 0.32662873648817065, "learning_rate": 8.303468577180688e-06, "loss": 0.0207, "step": 3840 }, { "epoch": 1.0300348618932689, "grad_norm": 0.2474136480743761, "learning_rate": 8.302297253240995e-06, "loss": 0.0221, "step": 3841 }, { "epoch": 1.0303030303030303, "grad_norm": 0.29506853028474905, "learning_rate": 8.301125607765385e-06, "loss": 0.0342, "step": 3842 }, { "epoch": 1.0305711987127917, "grad_norm": 0.22539710793886594, "learning_rate": 8.29995364086794e-06, "loss": 0.0226, "step": 3843 }, { "epoch": 1.0308393671225529, "grad_norm": 0.19963115176027918, "learning_rate": 8.298781352662768e-06, "loss": 0.0203, "step": 3844 }, { "epoch": 1.0311075355323143, "grad_norm": 0.2855574254135013, "learning_rate": 8.297608743264017e-06, "loss": 0.0286, "step": 3845 }, { "epoch": 1.0313757039420757, "grad_norm": 0.28464176338512165, "learning_rate": 8.296435812785855e-06, "loss": 0.0305, "step": 3846 }, { "epoch": 1.0316438723518369, "grad_norm": 0.3489872617769799, "learning_rate": 8.295262561342493e-06, "loss": 0.0279, "step": 3847 }, { "epoch": 1.0319120407615983, "grad_norm": 0.3102699032514512, "learning_rate": 8.29408898904816e-06, "loss": 0.0212, "step": 3848 }, { "epoch": 1.0321802091713597, "grad_norm": 0.3235451019302266, "learning_rate": 8.292915096017128e-06, "loss": 0.0374, "step": 3849 }, { "epoch": 1.0324483775811208, "grad_norm": 0.5010230408849221, "learning_rate": 8.2917408823637e-06, "loss": 0.0439, "step": 3850 }, { "epoch": 1.0327165459908823, "grad_norm": 0.394496270455239, "learning_rate": 8.290566348202199e-06, "loss": 0.0286, "step": 3851 }, { "epoch": 1.0329847144006437, "grad_norm": 0.2985601829037895, "learning_rate": 8.289391493646988e-06, "loss": 0.0325, "step": 3852 }, { "epoch": 1.0332528828104048, "grad_norm": 0.30639940628632806, "learning_rate": 8.288216318812461e-06, "loss": 0.0282, "step": 3853 }, { "epoch": 1.0335210512201662, "grad_norm": 0.38466733970420747, "learning_rate": 8.287040823813041e-06, "loss": 0.0479, "step": 3854 }, { "epoch": 1.0337892196299276, "grad_norm": 0.28080142910781136, "learning_rate": 8.285865008763185e-06, "loss": 0.0251, "step": 3855 }, { "epoch": 1.0340573880396888, "grad_norm": 0.3422093961001927, "learning_rate": 8.284688873777375e-06, "loss": 0.0305, "step": 3856 }, { "epoch": 1.0343255564494502, "grad_norm": 0.39872738116836304, "learning_rate": 8.28351241897013e-06, "loss": 0.0289, "step": 3857 }, { "epoch": 1.0345937248592116, "grad_norm": 0.2538782521239731, "learning_rate": 8.282335644455998e-06, "loss": 0.0268, "step": 3858 }, { "epoch": 1.0348618932689728, "grad_norm": 0.3766756735196799, "learning_rate": 8.281158550349559e-06, "loss": 0.0267, "step": 3859 }, { "epoch": 1.0351300616787342, "grad_norm": 0.2872915280957078, "learning_rate": 8.279981136765424e-06, "loss": 0.0222, "step": 3860 }, { "epoch": 1.0353982300884956, "grad_norm": 0.32415337756830676, "learning_rate": 8.278803403818234e-06, "loss": 0.0334, "step": 3861 }, { "epoch": 1.0356663984982568, "grad_norm": 0.3623314879061271, "learning_rate": 8.277625351622662e-06, "loss": 0.0323, "step": 3862 }, { "epoch": 1.0359345669080182, "grad_norm": 0.39562800800957365, "learning_rate": 8.27644698029341e-06, "loss": 0.0371, "step": 3863 }, { "epoch": 1.0362027353177796, "grad_norm": 0.3041544310230412, "learning_rate": 8.275268289945215e-06, "loss": 0.0329, "step": 3864 }, { "epoch": 1.0364709037275408, "grad_norm": 0.30754787519176163, "learning_rate": 8.274089280692842e-06, "loss": 0.0229, "step": 3865 }, { "epoch": 1.0367390721373022, "grad_norm": 0.36251501627351373, "learning_rate": 8.27290995265109e-06, "loss": 0.0377, "step": 3866 }, { "epoch": 1.0370072405470636, "grad_norm": 0.29145796904724314, "learning_rate": 8.271730305934781e-06, "loss": 0.0363, "step": 3867 }, { "epoch": 1.0372754089568248, "grad_norm": 0.29271289674244216, "learning_rate": 8.270550340658782e-06, "loss": 0.0315, "step": 3868 }, { "epoch": 1.0375435773665862, "grad_norm": 0.4601282488362476, "learning_rate": 8.26937005693798e-06, "loss": 0.0259, "step": 3869 }, { "epoch": 1.0378117457763476, "grad_norm": 0.33055012338726836, "learning_rate": 8.268189454887292e-06, "loss": 0.0287, "step": 3870 }, { "epoch": 1.0380799141861088, "grad_norm": 0.3116507009776977, "learning_rate": 8.267008534621676e-06, "loss": 0.0287, "step": 3871 }, { "epoch": 1.0383480825958702, "grad_norm": 0.539901879925059, "learning_rate": 8.265827296256114e-06, "loss": 0.0358, "step": 3872 }, { "epoch": 1.0386162510056316, "grad_norm": 0.2676443502395585, "learning_rate": 8.264645739905618e-06, "loss": 0.0285, "step": 3873 }, { "epoch": 1.0388844194153928, "grad_norm": 0.2784292362651536, "learning_rate": 8.263463865685231e-06, "loss": 0.0273, "step": 3874 }, { "epoch": 1.0391525878251542, "grad_norm": 0.2592479966043478, "learning_rate": 8.262281673710035e-06, "loss": 0.0212, "step": 3875 }, { "epoch": 1.0394207562349156, "grad_norm": 0.25196195681290257, "learning_rate": 8.261099164095132e-06, "loss": 0.0243, "step": 3876 }, { "epoch": 1.0396889246446768, "grad_norm": 0.449428140724111, "learning_rate": 8.259916336955662e-06, "loss": 0.0286, "step": 3877 }, { "epoch": 1.0399570930544382, "grad_norm": 0.3084548153357562, "learning_rate": 8.258733192406793e-06, "loss": 0.0202, "step": 3878 }, { "epoch": 1.0402252614641996, "grad_norm": 0.3944415065709748, "learning_rate": 8.257549730563726e-06, "loss": 0.0285, "step": 3879 }, { "epoch": 1.0404934298739608, "grad_norm": 0.4391545746233972, "learning_rate": 8.256365951541689e-06, "loss": 0.0312, "step": 3880 }, { "epoch": 1.0407615982837222, "grad_norm": 0.30001147859520644, "learning_rate": 8.255181855455944e-06, "loss": 0.0205, "step": 3881 }, { "epoch": 1.0410297666934836, "grad_norm": 0.39110410990857647, "learning_rate": 8.253997442421786e-06, "loss": 0.0322, "step": 3882 }, { "epoch": 1.0412979351032448, "grad_norm": 0.2723746850971545, "learning_rate": 8.252812712554535e-06, "loss": 0.0267, "step": 3883 }, { "epoch": 1.0415661035130062, "grad_norm": 0.41680566928831214, "learning_rate": 8.251627665969548e-06, "loss": 0.033, "step": 3884 }, { "epoch": 1.0418342719227676, "grad_norm": 0.32525191155937244, "learning_rate": 8.250442302782206e-06, "loss": 0.0233, "step": 3885 }, { "epoch": 1.0421024403325287, "grad_norm": 0.33399947802976115, "learning_rate": 8.249256623107928e-06, "loss": 0.0411, "step": 3886 }, { "epoch": 1.0423706087422902, "grad_norm": 0.4595065460795739, "learning_rate": 8.248070627062156e-06, "loss": 0.0234, "step": 3887 }, { "epoch": 1.0426387771520516, "grad_norm": 0.3864656176156845, "learning_rate": 8.246884314760376e-06, "loss": 0.0311, "step": 3888 }, { "epoch": 1.0429069455618127, "grad_norm": 0.3578768444335209, "learning_rate": 8.245697686318086e-06, "loss": 0.0247, "step": 3889 }, { "epoch": 1.0431751139715741, "grad_norm": 0.2482538524658289, "learning_rate": 8.244510741850831e-06, "loss": 0.0207, "step": 3890 }, { "epoch": 1.0434432823813355, "grad_norm": 0.37534496980449944, "learning_rate": 8.24332348147418e-06, "loss": 0.0316, "step": 3891 }, { "epoch": 1.0437114507910967, "grad_norm": 0.2907304945498778, "learning_rate": 8.242135905303731e-06, "loss": 0.0201, "step": 3892 }, { "epoch": 1.0439796192008581, "grad_norm": 0.3257092226721669, "learning_rate": 8.240948013455117e-06, "loss": 0.0278, "step": 3893 }, { "epoch": 1.0442477876106195, "grad_norm": 0.35073029390515487, "learning_rate": 8.239759806044001e-06, "loss": 0.0295, "step": 3894 }, { "epoch": 1.0445159560203807, "grad_norm": 0.30462722276583604, "learning_rate": 8.238571283186074e-06, "loss": 0.0271, "step": 3895 }, { "epoch": 1.0447841244301421, "grad_norm": 0.4259263989554586, "learning_rate": 8.237382444997057e-06, "loss": 0.0476, "step": 3896 }, { "epoch": 1.0450522928399035, "grad_norm": 0.42279144990955564, "learning_rate": 8.236193291592707e-06, "loss": 0.0245, "step": 3897 }, { "epoch": 1.0453204612496647, "grad_norm": 0.5125161513850827, "learning_rate": 8.23500382308881e-06, "loss": 0.0343, "step": 3898 }, { "epoch": 1.0455886296594261, "grad_norm": 0.3807787884102021, "learning_rate": 8.233814039601178e-06, "loss": 0.0322, "step": 3899 }, { "epoch": 1.0458567980691875, "grad_norm": 0.3259210123790663, "learning_rate": 8.232623941245658e-06, "loss": 0.0269, "step": 3900 }, { "epoch": 1.0461249664789487, "grad_norm": 0.4489841358960552, "learning_rate": 8.231433528138128e-06, "loss": 0.0322, "step": 3901 }, { "epoch": 1.04639313488871, "grad_norm": 0.3891554092911375, "learning_rate": 8.230242800394496e-06, "loss": 0.0232, "step": 3902 }, { "epoch": 1.0466613032984715, "grad_norm": 0.3236736418976952, "learning_rate": 8.229051758130697e-06, "loss": 0.0413, "step": 3903 }, { "epoch": 1.0469294717082327, "grad_norm": 0.4731956143790975, "learning_rate": 8.227860401462702e-06, "loss": 0.0428, "step": 3904 }, { "epoch": 1.047197640117994, "grad_norm": 0.3034438191866115, "learning_rate": 8.226668730506509e-06, "loss": 0.0243, "step": 3905 }, { "epoch": 1.0474658085277555, "grad_norm": 0.23666139585077156, "learning_rate": 8.225476745378149e-06, "loss": 0.0186, "step": 3906 }, { "epoch": 1.0477339769375167, "grad_norm": 0.26601601814055414, "learning_rate": 8.224284446193682e-06, "loss": 0.0221, "step": 3907 }, { "epoch": 1.048002145347278, "grad_norm": 0.2708272749856384, "learning_rate": 8.223091833069197e-06, "loss": 0.0351, "step": 3908 }, { "epoch": 1.0482703137570395, "grad_norm": 0.29135360701756563, "learning_rate": 8.22189890612082e-06, "loss": 0.0242, "step": 3909 }, { "epoch": 1.0485384821668007, "grad_norm": 0.28996393847561214, "learning_rate": 8.220705665464699e-06, "loss": 0.0243, "step": 3910 }, { "epoch": 1.048806650576562, "grad_norm": 0.3175549122744191, "learning_rate": 8.219512111217016e-06, "loss": 0.0243, "step": 3911 }, { "epoch": 1.0490748189863235, "grad_norm": 0.2598688258498686, "learning_rate": 8.21831824349399e-06, "loss": 0.0268, "step": 3912 }, { "epoch": 1.0493429873960847, "grad_norm": 0.3497872500920714, "learning_rate": 8.21712406241186e-06, "loss": 0.0266, "step": 3913 }, { "epoch": 1.049611155805846, "grad_norm": 0.41673824946410626, "learning_rate": 8.2159295680869e-06, "loss": 0.0337, "step": 3914 }, { "epoch": 1.0498793242156075, "grad_norm": 0.36579571937782474, "learning_rate": 8.214734760635418e-06, "loss": 0.0274, "step": 3915 }, { "epoch": 1.0501474926253687, "grad_norm": 0.4705266609663164, "learning_rate": 8.213539640173746e-06, "loss": 0.0334, "step": 3916 }, { "epoch": 1.05041566103513, "grad_norm": 0.24524251519608842, "learning_rate": 8.21234420681825e-06, "loss": 0.0209, "step": 3917 }, { "epoch": 1.0506838294448915, "grad_norm": 0.2800023852075173, "learning_rate": 8.21114846068533e-06, "loss": 0.0321, "step": 3918 }, { "epoch": 1.0509519978546527, "grad_norm": 0.842326507784826, "learning_rate": 8.209952401891409e-06, "loss": 0.0506, "step": 3919 }, { "epoch": 1.051220166264414, "grad_norm": 0.2850190962926409, "learning_rate": 8.208756030552944e-06, "loss": 0.0218, "step": 3920 }, { "epoch": 1.0514883346741755, "grad_norm": 0.3038562543432837, "learning_rate": 8.207559346786424e-06, "loss": 0.0325, "step": 3921 }, { "epoch": 1.0517565030839366, "grad_norm": 0.46708199822661717, "learning_rate": 8.206362350708365e-06, "loss": 0.0282, "step": 3922 }, { "epoch": 1.052024671493698, "grad_norm": 0.7569133649961993, "learning_rate": 8.205165042435317e-06, "loss": 0.0336, "step": 3923 }, { "epoch": 1.0522928399034595, "grad_norm": 0.3111984150983337, "learning_rate": 8.20396742208386e-06, "loss": 0.0252, "step": 3924 }, { "epoch": 1.0525610083132206, "grad_norm": 0.4982732376998536, "learning_rate": 8.202769489770602e-06, "loss": 0.029, "step": 3925 }, { "epoch": 1.052829176722982, "grad_norm": 0.3474166551775235, "learning_rate": 8.201571245612179e-06, "loss": 0.0231, "step": 3926 }, { "epoch": 1.0530973451327434, "grad_norm": 0.3632753671134533, "learning_rate": 8.200372689725265e-06, "loss": 0.0309, "step": 3927 }, { "epoch": 1.0533655135425046, "grad_norm": 0.24294925159143982, "learning_rate": 8.19917382222656e-06, "loss": 0.0227, "step": 3928 }, { "epoch": 1.053633681952266, "grad_norm": 0.3093868874005352, "learning_rate": 8.197974643232794e-06, "loss": 0.0192, "step": 3929 }, { "epoch": 1.0539018503620274, "grad_norm": 0.34198263219409675, "learning_rate": 8.196775152860725e-06, "loss": 0.0233, "step": 3930 }, { "epoch": 1.0541700187717886, "grad_norm": 0.2904204092953673, "learning_rate": 8.195575351227148e-06, "loss": 0.0226, "step": 3931 }, { "epoch": 1.05443818718155, "grad_norm": 0.28397167378299093, "learning_rate": 8.194375238448884e-06, "loss": 0.027, "step": 3932 }, { "epoch": 1.0547063555913114, "grad_norm": 0.35042399503223015, "learning_rate": 8.193174814642782e-06, "loss": 0.0235, "step": 3933 }, { "epoch": 1.0549745240010726, "grad_norm": 0.349350310943682, "learning_rate": 8.191974079925727e-06, "loss": 0.0233, "step": 3934 }, { "epoch": 1.055242692410834, "grad_norm": 0.38937277649520335, "learning_rate": 8.190773034414631e-06, "loss": 0.0317, "step": 3935 }, { "epoch": 1.0555108608205954, "grad_norm": 0.2714228418371403, "learning_rate": 8.189571678226436e-06, "loss": 0.0242, "step": 3936 }, { "epoch": 1.0557790292303566, "grad_norm": 0.3033725002359017, "learning_rate": 8.188370011478114e-06, "loss": 0.0328, "step": 3937 }, { "epoch": 1.056047197640118, "grad_norm": 0.25273246606998884, "learning_rate": 8.187168034286668e-06, "loss": 0.0221, "step": 3938 }, { "epoch": 1.0563153660498794, "grad_norm": 0.367957599015788, "learning_rate": 8.185965746769134e-06, "loss": 0.0297, "step": 3939 }, { "epoch": 1.0565835344596406, "grad_norm": 0.3640407952586068, "learning_rate": 8.184763149042574e-06, "loss": 0.0314, "step": 3940 }, { "epoch": 1.056851702869402, "grad_norm": 0.310981058659115, "learning_rate": 8.18356024122408e-06, "loss": 0.025, "step": 3941 }, { "epoch": 1.0571198712791634, "grad_norm": 0.4646813103790253, "learning_rate": 8.182357023430778e-06, "loss": 0.031, "step": 3942 }, { "epoch": 1.0573880396889246, "grad_norm": 0.28785398293696013, "learning_rate": 8.181153495779823e-06, "loss": 0.031, "step": 3943 }, { "epoch": 1.057656208098686, "grad_norm": 0.27760908368375986, "learning_rate": 8.179949658388398e-06, "loss": 0.0277, "step": 3944 }, { "epoch": 1.0579243765084474, "grad_norm": 0.2571912854244579, "learning_rate": 8.178745511373714e-06, "loss": 0.0253, "step": 3945 }, { "epoch": 1.0581925449182086, "grad_norm": 0.34089477172819593, "learning_rate": 8.177541054853022e-06, "loss": 0.0294, "step": 3946 }, { "epoch": 1.05846071332797, "grad_norm": 0.34671190931635476, "learning_rate": 8.176336288943592e-06, "loss": 0.0336, "step": 3947 }, { "epoch": 1.0587288817377314, "grad_norm": 0.45043297039886504, "learning_rate": 8.175131213762732e-06, "loss": 0.0351, "step": 3948 }, { "epoch": 1.0589970501474926, "grad_norm": 0.25241412512169514, "learning_rate": 8.173925829427776e-06, "loss": 0.02, "step": 3949 }, { "epoch": 1.059265218557254, "grad_norm": 0.36921106447317537, "learning_rate": 8.172720136056087e-06, "loss": 0.0262, "step": 3950 }, { "epoch": 1.0595333869670154, "grad_norm": 0.29990404493953793, "learning_rate": 8.171514133765062e-06, "loss": 0.0255, "step": 3951 }, { "epoch": 1.0598015553767766, "grad_norm": 0.25368062851736867, "learning_rate": 8.170307822672127e-06, "loss": 0.0196, "step": 3952 }, { "epoch": 1.060069723786538, "grad_norm": 0.2653707732623665, "learning_rate": 8.169101202894735e-06, "loss": 0.0232, "step": 3953 }, { "epoch": 1.0603378921962994, "grad_norm": 0.2749175424405886, "learning_rate": 8.167894274550374e-06, "loss": 0.026, "step": 3954 }, { "epoch": 1.0606060606060606, "grad_norm": 0.29656461527362604, "learning_rate": 8.166687037756558e-06, "loss": 0.0298, "step": 3955 }, { "epoch": 1.060874229015822, "grad_norm": 0.2764124505783319, "learning_rate": 8.165479492630831e-06, "loss": 0.0315, "step": 3956 }, { "epoch": 1.0611423974255834, "grad_norm": 0.30121642778279656, "learning_rate": 8.164271639290771e-06, "loss": 0.0201, "step": 3957 }, { "epoch": 1.0614105658353445, "grad_norm": 0.22477680343907117, "learning_rate": 8.163063477853983e-06, "loss": 0.019, "step": 3958 }, { "epoch": 1.061678734245106, "grad_norm": 0.34861880902156456, "learning_rate": 8.1618550084381e-06, "loss": 0.0258, "step": 3959 }, { "epoch": 1.0619469026548674, "grad_norm": 0.39784897431861116, "learning_rate": 8.160646231160792e-06, "loss": 0.0359, "step": 3960 }, { "epoch": 1.0622150710646285, "grad_norm": 0.3638237368655141, "learning_rate": 8.159437146139751e-06, "loss": 0.0279, "step": 3961 }, { "epoch": 1.06248323947439, "grad_norm": 0.2640167172934997, "learning_rate": 8.158227753492702e-06, "loss": 0.0221, "step": 3962 }, { "epoch": 1.0627514078841513, "grad_norm": 0.3637955799726516, "learning_rate": 8.157018053337401e-06, "loss": 0.0356, "step": 3963 }, { "epoch": 1.0630195762939125, "grad_norm": 0.35121023919622213, "learning_rate": 8.155808045791635e-06, "loss": 0.0418, "step": 3964 }, { "epoch": 1.063287744703674, "grad_norm": 0.33740026660793343, "learning_rate": 8.15459773097322e-06, "loss": 0.0202, "step": 3965 }, { "epoch": 1.0635559131134353, "grad_norm": 0.33070590433530217, "learning_rate": 8.153387108999998e-06, "loss": 0.0201, "step": 3966 }, { "epoch": 1.0638240815231965, "grad_norm": 0.2767549144089937, "learning_rate": 8.152176179989843e-06, "loss": 0.0268, "step": 3967 }, { "epoch": 1.064092249932958, "grad_norm": 0.3318908117189271, "learning_rate": 8.150964944060665e-06, "loss": 0.0263, "step": 3968 }, { "epoch": 1.0643604183427193, "grad_norm": 0.296243539509169, "learning_rate": 8.149753401330395e-06, "loss": 0.0333, "step": 3969 }, { "epoch": 1.0646285867524805, "grad_norm": 0.3618070580911903, "learning_rate": 8.148541551917e-06, "loss": 0.037, "step": 3970 }, { "epoch": 1.064896755162242, "grad_norm": 0.2422412686978263, "learning_rate": 8.14732939593847e-06, "loss": 0.0283, "step": 3971 }, { "epoch": 1.0651649235720033, "grad_norm": 0.43128567606317986, "learning_rate": 8.146116933512836e-06, "loss": 0.0282, "step": 3972 }, { "epoch": 1.0654330919817645, "grad_norm": 0.3552815259027964, "learning_rate": 8.144904164758147e-06, "loss": 0.0266, "step": 3973 }, { "epoch": 1.065701260391526, "grad_norm": 0.40991021268771566, "learning_rate": 8.14369108979249e-06, "loss": 0.0373, "step": 3974 }, { "epoch": 1.0659694288012873, "grad_norm": 0.4585034175360701, "learning_rate": 8.142477708733977e-06, "loss": 0.0325, "step": 3975 }, { "epoch": 1.0662375972110485, "grad_norm": 0.2914855856042394, "learning_rate": 8.141264021700755e-06, "loss": 0.0193, "step": 3976 }, { "epoch": 1.06650576562081, "grad_norm": 0.28897058798545794, "learning_rate": 8.140050028810992e-06, "loss": 0.0287, "step": 3977 }, { "epoch": 1.0667739340305713, "grad_norm": 0.3306548149138281, "learning_rate": 8.138835730182897e-06, "loss": 0.0249, "step": 3978 }, { "epoch": 1.0670421024403325, "grad_norm": 0.39512041903544326, "learning_rate": 8.1376211259347e-06, "loss": 0.0329, "step": 3979 }, { "epoch": 1.067310270850094, "grad_norm": 0.2811496613617414, "learning_rate": 8.136406216184662e-06, "loss": 0.0217, "step": 3980 }, { "epoch": 1.0675784392598553, "grad_norm": 0.2565723393861113, "learning_rate": 8.13519100105108e-06, "loss": 0.024, "step": 3981 }, { "epoch": 1.0678466076696165, "grad_norm": 0.2789733120821343, "learning_rate": 8.133975480652273e-06, "loss": 0.0255, "step": 3982 }, { "epoch": 1.0681147760793779, "grad_norm": 0.24664027471438452, "learning_rate": 8.132759655106595e-06, "loss": 0.0222, "step": 3983 }, { "epoch": 1.068382944489139, "grad_norm": 0.282852694286996, "learning_rate": 8.131543524532426e-06, "loss": 0.0176, "step": 3984 }, { "epoch": 1.0686511128989005, "grad_norm": 0.2660293135939434, "learning_rate": 8.130327089048179e-06, "loss": 0.0253, "step": 3985 }, { "epoch": 1.0689192813086619, "grad_norm": 0.22955810939491159, "learning_rate": 8.129110348772291e-06, "loss": 0.0191, "step": 3986 }, { "epoch": 1.0691874497184233, "grad_norm": 0.23306371231669049, "learning_rate": 8.127893303823237e-06, "loss": 0.019, "step": 3987 }, { "epoch": 1.0694556181281845, "grad_norm": 0.6135945724220748, "learning_rate": 8.126675954319519e-06, "loss": 0.0275, "step": 3988 }, { "epoch": 1.0697237865379459, "grad_norm": 0.36718940422748597, "learning_rate": 8.125458300379663e-06, "loss": 0.0346, "step": 3989 }, { "epoch": 1.069991954947707, "grad_norm": 0.29321727987001633, "learning_rate": 8.12424034212223e-06, "loss": 0.0292, "step": 3990 }, { "epoch": 1.0702601233574685, "grad_norm": 0.22561550980456965, "learning_rate": 8.12302207966581e-06, "loss": 0.0184, "step": 3991 }, { "epoch": 1.0705282917672299, "grad_norm": 0.2852455716702975, "learning_rate": 8.121803513129021e-06, "loss": 0.0274, "step": 3992 }, { "epoch": 1.0707964601769913, "grad_norm": 0.2751951925624213, "learning_rate": 8.120584642630513e-06, "loss": 0.0173, "step": 3993 }, { "epoch": 1.0710646285867524, "grad_norm": 0.32032060637510307, "learning_rate": 8.119365468288963e-06, "loss": 0.0267, "step": 3994 }, { "epoch": 1.0713327969965138, "grad_norm": 0.3583189484360853, "learning_rate": 8.118145990223078e-06, "loss": 0.0425, "step": 3995 }, { "epoch": 1.071600965406275, "grad_norm": 0.2937272559409034, "learning_rate": 8.116926208551598e-06, "loss": 0.0193, "step": 3996 }, { "epoch": 1.0718691338160364, "grad_norm": 0.2382120323371079, "learning_rate": 8.115706123393287e-06, "loss": 0.0261, "step": 3997 }, { "epoch": 1.0721373022257978, "grad_norm": 0.378921276441055, "learning_rate": 8.114485734866944e-06, "loss": 0.0283, "step": 3998 }, { "epoch": 1.0724054706355592, "grad_norm": 0.27047055759150507, "learning_rate": 8.113265043091393e-06, "loss": 0.0229, "step": 3999 }, { "epoch": 1.0726736390453204, "grad_norm": 0.25535675650451845, "learning_rate": 8.112044048185492e-06, "loss": 0.0286, "step": 4000 }, { "epoch": 1.0729418074550818, "grad_norm": 0.19169275795802673, "learning_rate": 8.110822750268124e-06, "loss": 0.0154, "step": 4001 }, { "epoch": 1.073209975864843, "grad_norm": 0.538203409576204, "learning_rate": 8.109601149458202e-06, "loss": 0.0471, "step": 4002 }, { "epoch": 1.0734781442746044, "grad_norm": 0.29095760048083846, "learning_rate": 8.108379245874673e-06, "loss": 0.0356, "step": 4003 }, { "epoch": 1.0737463126843658, "grad_norm": 0.35789636388950097, "learning_rate": 8.107157039636509e-06, "loss": 0.0348, "step": 4004 }, { "epoch": 1.0740144810941272, "grad_norm": 0.3621126745542607, "learning_rate": 8.105934530862714e-06, "loss": 0.0274, "step": 4005 }, { "epoch": 1.0742826495038884, "grad_norm": 0.32130110514766413, "learning_rate": 8.104711719672318e-06, "loss": 0.0279, "step": 4006 }, { "epoch": 1.0745508179136498, "grad_norm": 0.38446401161196014, "learning_rate": 8.103488606184387e-06, "loss": 0.0272, "step": 4007 }, { "epoch": 1.074818986323411, "grad_norm": 0.3013428203215738, "learning_rate": 8.10226519051801e-06, "loss": 0.0253, "step": 4008 }, { "epoch": 1.0750871547331724, "grad_norm": 0.28859934631746204, "learning_rate": 8.101041472792307e-06, "loss": 0.0202, "step": 4009 }, { "epoch": 1.0753553231429338, "grad_norm": 0.3038656480376863, "learning_rate": 8.099817453126428e-06, "loss": 0.0254, "step": 4010 }, { "epoch": 1.0756234915526952, "grad_norm": 0.36494270594228884, "learning_rate": 8.098593131639555e-06, "loss": 0.0245, "step": 4011 }, { "epoch": 1.0758916599624564, "grad_norm": 0.30542772176140154, "learning_rate": 8.097368508450894e-06, "loss": 0.028, "step": 4012 }, { "epoch": 1.0761598283722178, "grad_norm": 0.326473523512798, "learning_rate": 8.096143583679685e-06, "loss": 0.0248, "step": 4013 }, { "epoch": 1.076427996781979, "grad_norm": 0.27202087299970124, "learning_rate": 8.094918357445197e-06, "loss": 0.0211, "step": 4014 }, { "epoch": 1.0766961651917404, "grad_norm": 0.3484324401446948, "learning_rate": 8.093692829866723e-06, "loss": 0.0247, "step": 4015 }, { "epoch": 1.0769643336015018, "grad_norm": 0.5177219208661106, "learning_rate": 8.092467001063593e-06, "loss": 0.0398, "step": 4016 }, { "epoch": 1.077232502011263, "grad_norm": 0.3729813655612874, "learning_rate": 8.091240871155162e-06, "loss": 0.0395, "step": 4017 }, { "epoch": 1.0775006704210244, "grad_norm": 0.23546090939524478, "learning_rate": 8.090014440260814e-06, "loss": 0.0222, "step": 4018 }, { "epoch": 1.0777688388307858, "grad_norm": 0.34919507426610485, "learning_rate": 8.088787708499964e-06, "loss": 0.0355, "step": 4019 }, { "epoch": 1.078037007240547, "grad_norm": 0.42953376691963135, "learning_rate": 8.087560675992053e-06, "loss": 0.0221, "step": 4020 }, { "epoch": 1.0783051756503084, "grad_norm": 0.41296775415768683, "learning_rate": 8.086333342856561e-06, "loss": 0.0317, "step": 4021 }, { "epoch": 1.0785733440600698, "grad_norm": 0.30368758400624407, "learning_rate": 8.08510570921298e-06, "loss": 0.0274, "step": 4022 }, { "epoch": 1.078841512469831, "grad_norm": 0.3544727514387742, "learning_rate": 8.083877775180851e-06, "loss": 0.0257, "step": 4023 }, { "epoch": 1.0791096808795924, "grad_norm": 0.28199482312265, "learning_rate": 8.08264954087973e-06, "loss": 0.0261, "step": 4024 }, { "epoch": 1.0793778492893538, "grad_norm": 0.3426246650758932, "learning_rate": 8.081421006429205e-06, "loss": 0.0372, "step": 4025 }, { "epoch": 1.079646017699115, "grad_norm": 0.3440385616870937, "learning_rate": 8.0801921719489e-06, "loss": 0.0241, "step": 4026 }, { "epoch": 1.0799141861088764, "grad_norm": 0.30631584242231974, "learning_rate": 8.078963037558461e-06, "loss": 0.0278, "step": 4027 }, { "epoch": 1.0801823545186378, "grad_norm": 0.23438062454742695, "learning_rate": 8.077733603377565e-06, "loss": 0.0217, "step": 4028 }, { "epoch": 1.080450522928399, "grad_norm": 0.2534919756097813, "learning_rate": 8.076503869525919e-06, "loss": 0.0257, "step": 4029 }, { "epoch": 1.0807186913381603, "grad_norm": 0.2536373087260301, "learning_rate": 8.07527383612326e-06, "loss": 0.0209, "step": 4030 }, { "epoch": 1.0809868597479217, "grad_norm": 0.44563928327614527, "learning_rate": 8.074043503289349e-06, "loss": 0.0363, "step": 4031 }, { "epoch": 1.081255028157683, "grad_norm": 0.34378620511302005, "learning_rate": 8.072812871143987e-06, "loss": 0.0282, "step": 4032 }, { "epoch": 1.0815231965674443, "grad_norm": 0.30719081251400204, "learning_rate": 8.071581939806991e-06, "loss": 0.0342, "step": 4033 }, { "epoch": 1.0817913649772057, "grad_norm": 0.2635018480871943, "learning_rate": 8.07035070939822e-06, "loss": 0.0303, "step": 4034 }, { "epoch": 1.082059533386967, "grad_norm": 0.2991762134067907, "learning_rate": 8.06911918003755e-06, "loss": 0.0255, "step": 4035 }, { "epoch": 1.0823277017967283, "grad_norm": 0.5953932006014163, "learning_rate": 8.067887351844895e-06, "loss": 0.0347, "step": 4036 }, { "epoch": 1.0825958702064897, "grad_norm": 0.26123024138162865, "learning_rate": 8.066655224940193e-06, "loss": 0.0227, "step": 4037 }, { "epoch": 1.082864038616251, "grad_norm": 0.25532011411982336, "learning_rate": 8.065422799443412e-06, "loss": 0.0212, "step": 4038 }, { "epoch": 1.0831322070260123, "grad_norm": 0.2595992096164906, "learning_rate": 8.064190075474553e-06, "loss": 0.0216, "step": 4039 }, { "epoch": 1.0834003754357737, "grad_norm": 0.3168796294216165, "learning_rate": 8.062957053153641e-06, "loss": 0.0235, "step": 4040 }, { "epoch": 1.083668543845535, "grad_norm": 0.31844684525508077, "learning_rate": 8.061723732600733e-06, "loss": 0.0323, "step": 4041 }, { "epoch": 1.0839367122552963, "grad_norm": 0.3486797883559125, "learning_rate": 8.060490113935916e-06, "loss": 0.027, "step": 4042 }, { "epoch": 1.0842048806650577, "grad_norm": 0.26238202735324084, "learning_rate": 8.0592561972793e-06, "loss": 0.0234, "step": 4043 }, { "epoch": 1.084473049074819, "grad_norm": 0.25955016335902786, "learning_rate": 8.05802198275103e-06, "loss": 0.0239, "step": 4044 }, { "epoch": 1.0847412174845803, "grad_norm": 0.3794019931453181, "learning_rate": 8.056787470471279e-06, "loss": 0.0317, "step": 4045 }, { "epoch": 1.0850093858943417, "grad_norm": 0.27177213588586335, "learning_rate": 8.055552660560246e-06, "loss": 0.0265, "step": 4046 }, { "epoch": 1.0852775543041029, "grad_norm": 0.2719606560556994, "learning_rate": 8.054317553138164e-06, "loss": 0.0231, "step": 4047 }, { "epoch": 1.0855457227138643, "grad_norm": 0.35985091577195283, "learning_rate": 8.053082148325293e-06, "loss": 0.024, "step": 4048 }, { "epoch": 1.0858138911236257, "grad_norm": 0.3224255478113859, "learning_rate": 8.051846446241916e-06, "loss": 0.0253, "step": 4049 }, { "epoch": 1.0860820595333869, "grad_norm": 0.2576902214272502, "learning_rate": 8.050610447008354e-06, "loss": 0.0188, "step": 4050 }, { "epoch": 1.0863502279431483, "grad_norm": 0.29356023393777875, "learning_rate": 8.049374150744953e-06, "loss": 0.0268, "step": 4051 }, { "epoch": 1.0866183963529097, "grad_norm": 0.28297118725891, "learning_rate": 8.048137557572087e-06, "loss": 0.0263, "step": 4052 }, { "epoch": 1.0868865647626709, "grad_norm": 0.3422632096740693, "learning_rate": 8.046900667610158e-06, "loss": 0.024, "step": 4053 }, { "epoch": 1.0871547331724323, "grad_norm": 0.38670197076052765, "learning_rate": 8.0456634809796e-06, "loss": 0.0378, "step": 4054 }, { "epoch": 1.0874229015821937, "grad_norm": 0.28359859242672236, "learning_rate": 8.044425997800875e-06, "loss": 0.0251, "step": 4055 }, { "epoch": 1.0876910699919549, "grad_norm": 0.42216898171422046, "learning_rate": 8.043188218194476e-06, "loss": 0.0284, "step": 4056 }, { "epoch": 1.0879592384017163, "grad_norm": 0.2782266862935743, "learning_rate": 8.041950142280918e-06, "loss": 0.0256, "step": 4057 }, { "epoch": 1.0882274068114777, "grad_norm": 0.3419740042546062, "learning_rate": 8.040711770180751e-06, "loss": 0.03, "step": 4058 }, { "epoch": 1.0884955752212389, "grad_norm": 0.30344508289312155, "learning_rate": 8.039473102014552e-06, "loss": 0.027, "step": 4059 }, { "epoch": 1.0887637436310003, "grad_norm": 0.4485768283632113, "learning_rate": 8.038234137902925e-06, "loss": 0.0281, "step": 4060 }, { "epoch": 1.0890319120407617, "grad_norm": 0.36335700574738167, "learning_rate": 8.036994877966509e-06, "loss": 0.0272, "step": 4061 }, { "epoch": 1.0893000804505228, "grad_norm": 0.25781821185777726, "learning_rate": 8.035755322325964e-06, "loss": 0.0191, "step": 4062 }, { "epoch": 1.0895682488602842, "grad_norm": 0.3929097040945974, "learning_rate": 8.034515471101982e-06, "loss": 0.0296, "step": 4063 }, { "epoch": 1.0898364172700457, "grad_norm": 0.26604554681248765, "learning_rate": 8.033275324415286e-06, "loss": 0.0213, "step": 4064 }, { "epoch": 1.0901045856798068, "grad_norm": 0.38964493018400487, "learning_rate": 8.032034882386625e-06, "loss": 0.0353, "step": 4065 }, { "epoch": 1.0903727540895682, "grad_norm": 0.3098233263762199, "learning_rate": 8.030794145136776e-06, "loss": 0.0246, "step": 4066 }, { "epoch": 1.0906409224993296, "grad_norm": 0.3479751544175198, "learning_rate": 8.02955311278655e-06, "loss": 0.033, "step": 4067 }, { "epoch": 1.0909090909090908, "grad_norm": 0.26280201990364777, "learning_rate": 8.028311785456778e-06, "loss": 0.0253, "step": 4068 }, { "epoch": 1.0911772593188522, "grad_norm": 0.3485378301828578, "learning_rate": 8.02707016326833e-06, "loss": 0.0318, "step": 4069 }, { "epoch": 1.0914454277286136, "grad_norm": 0.32296027816786327, "learning_rate": 8.025828246342096e-06, "loss": 0.028, "step": 4070 }, { "epoch": 1.0917135961383748, "grad_norm": 0.3752148376626047, "learning_rate": 8.024586034798998e-06, "loss": 0.0248, "step": 4071 }, { "epoch": 1.0919817645481362, "grad_norm": 0.2916201458555858, "learning_rate": 8.02334352875999e-06, "loss": 0.0279, "step": 4072 }, { "epoch": 1.0922499329578976, "grad_norm": 0.30133250812374757, "learning_rate": 8.022100728346048e-06, "loss": 0.0226, "step": 4073 }, { "epoch": 1.0925181013676588, "grad_norm": 0.2970744858560541, "learning_rate": 8.02085763367818e-06, "loss": 0.0236, "step": 4074 }, { "epoch": 1.0927862697774202, "grad_norm": 0.35383393670432006, "learning_rate": 8.019614244877429e-06, "loss": 0.0388, "step": 4075 }, { "epoch": 1.0930544381871816, "grad_norm": 0.19300881920988497, "learning_rate": 8.018370562064852e-06, "loss": 0.0199, "step": 4076 }, { "epoch": 1.0933226065969428, "grad_norm": 0.31978837121061177, "learning_rate": 8.017126585361548e-06, "loss": 0.0266, "step": 4077 }, { "epoch": 1.0935907750067042, "grad_norm": 0.30849703002597245, "learning_rate": 8.015882314888638e-06, "loss": 0.0299, "step": 4078 }, { "epoch": 1.0938589434164656, "grad_norm": 0.32214237088160264, "learning_rate": 8.014637750767276e-06, "loss": 0.0335, "step": 4079 }, { "epoch": 1.0941271118262268, "grad_norm": 0.27209436953459176, "learning_rate": 8.01339289311864e-06, "loss": 0.0221, "step": 4080 }, { "epoch": 1.0943952802359882, "grad_norm": 0.2563874563309693, "learning_rate": 8.012147742063936e-06, "loss": 0.026, "step": 4081 }, { "epoch": 1.0946634486457496, "grad_norm": 0.3253618024823035, "learning_rate": 8.010902297724404e-06, "loss": 0.0225, "step": 4082 }, { "epoch": 1.0949316170555108, "grad_norm": 0.34270618754671317, "learning_rate": 8.00965656022131e-06, "loss": 0.0296, "step": 4083 }, { "epoch": 1.0951997854652722, "grad_norm": 0.280581188469629, "learning_rate": 8.008410529675947e-06, "loss": 0.0277, "step": 4084 }, { "epoch": 1.0954679538750336, "grad_norm": 0.410099538397887, "learning_rate": 8.007164206209637e-06, "loss": 0.027, "step": 4085 }, { "epoch": 1.0957361222847948, "grad_norm": 0.28806532649586924, "learning_rate": 8.005917589943734e-06, "loss": 0.0277, "step": 4086 }, { "epoch": 1.0960042906945562, "grad_norm": 0.3224546854164573, "learning_rate": 8.004670680999615e-06, "loss": 0.0251, "step": 4087 }, { "epoch": 1.0962724591043176, "grad_norm": 0.2703998670068158, "learning_rate": 8.003423479498689e-06, "loss": 0.0176, "step": 4088 }, { "epoch": 1.0965406275140788, "grad_norm": 0.2318160085559615, "learning_rate": 8.002175985562392e-06, "loss": 0.0262, "step": 4089 }, { "epoch": 1.0968087959238402, "grad_norm": 0.44641462746047506, "learning_rate": 8.00092819931219e-06, "loss": 0.0263, "step": 4090 }, { "epoch": 1.0970769643336016, "grad_norm": 0.2891948519662771, "learning_rate": 7.999680120869578e-06, "loss": 0.0218, "step": 4091 }, { "epoch": 1.0973451327433628, "grad_norm": 0.2839405783514393, "learning_rate": 7.998431750356074e-06, "loss": 0.0248, "step": 4092 }, { "epoch": 1.0976133011531242, "grad_norm": 0.26092561202577574, "learning_rate": 7.997183087893233e-06, "loss": 0.0228, "step": 4093 }, { "epoch": 1.0978814695628856, "grad_norm": 0.2977862387545977, "learning_rate": 7.995934133602633e-06, "loss": 0.0293, "step": 4094 }, { "epoch": 1.0981496379726468, "grad_norm": 0.3074935005603619, "learning_rate": 7.994684887605877e-06, "loss": 0.0311, "step": 4095 }, { "epoch": 1.0984178063824082, "grad_norm": 0.2916319802357935, "learning_rate": 7.993435350024608e-06, "loss": 0.03, "step": 4096 }, { "epoch": 1.0986859747921696, "grad_norm": 0.2693427729525745, "learning_rate": 7.992185520980484e-06, "loss": 0.0217, "step": 4097 }, { "epoch": 1.0989541432019307, "grad_norm": 0.2746059920526003, "learning_rate": 7.9909354005952e-06, "loss": 0.0257, "step": 4098 }, { "epoch": 1.0992223116116921, "grad_norm": 0.30469054896273656, "learning_rate": 7.98968498899048e-06, "loss": 0.0282, "step": 4099 }, { "epoch": 1.0994904800214536, "grad_norm": 0.3051123006979975, "learning_rate": 7.988434286288067e-06, "loss": 0.0342, "step": 4100 }, { "epoch": 1.0997586484312147, "grad_norm": 0.24741231267079192, "learning_rate": 7.987183292609743e-06, "loss": 0.0185, "step": 4101 }, { "epoch": 1.1000268168409761, "grad_norm": 0.2352936467387726, "learning_rate": 7.985932008077311e-06, "loss": 0.0235, "step": 4102 }, { "epoch": 1.1002949852507375, "grad_norm": 0.29503384550294093, "learning_rate": 7.984680432812608e-06, "loss": 0.0266, "step": 4103 }, { "epoch": 1.1005631536604987, "grad_norm": 0.40973989250210113, "learning_rate": 7.983428566937495e-06, "loss": 0.0282, "step": 4104 }, { "epoch": 1.1008313220702601, "grad_norm": 0.3206852796346588, "learning_rate": 7.982176410573865e-06, "loss": 0.0321, "step": 4105 }, { "epoch": 1.1010994904800215, "grad_norm": 0.27259908659708826, "learning_rate": 7.980923963843634e-06, "loss": 0.0266, "step": 4106 }, { "epoch": 1.1013676588897827, "grad_norm": 0.2880521717002182, "learning_rate": 7.97967122686875e-06, "loss": 0.0228, "step": 4107 }, { "epoch": 1.1016358272995441, "grad_norm": 0.29752434595387006, "learning_rate": 7.97841819977119e-06, "loss": 0.0307, "step": 4108 }, { "epoch": 1.1019039957093055, "grad_norm": 0.3522697399587234, "learning_rate": 7.977164882672958e-06, "loss": 0.0293, "step": 4109 }, { "epoch": 1.1021721641190667, "grad_norm": 0.47534931460561736, "learning_rate": 7.975911275696086e-06, "loss": 0.0296, "step": 4110 }, { "epoch": 1.1024403325288281, "grad_norm": 0.2962036124613042, "learning_rate": 7.974657378962633e-06, "loss": 0.023, "step": 4111 }, { "epoch": 1.1027085009385895, "grad_norm": 0.3441166925427178, "learning_rate": 7.97340319259469e-06, "loss": 0.0332, "step": 4112 }, { "epoch": 1.1029766693483507, "grad_norm": 0.305179966381331, "learning_rate": 7.972148716714371e-06, "loss": 0.0232, "step": 4113 }, { "epoch": 1.103244837758112, "grad_norm": 0.34390807825084896, "learning_rate": 7.970893951443822e-06, "loss": 0.037, "step": 4114 }, { "epoch": 1.1035130061678735, "grad_norm": 0.35274026351897547, "learning_rate": 7.969638896905217e-06, "loss": 0.024, "step": 4115 }, { "epoch": 1.1037811745776347, "grad_norm": 0.2815630527758902, "learning_rate": 7.968383553220757e-06, "loss": 0.025, "step": 4116 }, { "epoch": 1.104049342987396, "grad_norm": 0.4146906240158036, "learning_rate": 7.967127920512672e-06, "loss": 0.0315, "step": 4117 }, { "epoch": 1.1043175113971575, "grad_norm": 0.26493684010792395, "learning_rate": 7.965871998903219e-06, "loss": 0.0213, "step": 4118 }, { "epoch": 1.1045856798069187, "grad_norm": 0.23645294919797766, "learning_rate": 7.964615788514683e-06, "loss": 0.0176, "step": 4119 }, { "epoch": 1.10485384821668, "grad_norm": 1.3698124769199465, "learning_rate": 7.963359289469378e-06, "loss": 0.0339, "step": 4120 }, { "epoch": 1.1051220166264415, "grad_norm": 0.238507584414975, "learning_rate": 7.962102501889647e-06, "loss": 0.0268, "step": 4121 }, { "epoch": 1.1053901850362027, "grad_norm": 0.41823921052502794, "learning_rate": 7.96084542589786e-06, "loss": 0.0326, "step": 4122 }, { "epoch": 1.105658353445964, "grad_norm": 0.41418909189497843, "learning_rate": 7.959588061616415e-06, "loss": 0.0346, "step": 4123 }, { "epoch": 1.1059265218557255, "grad_norm": 0.35998392424701786, "learning_rate": 7.958330409167737e-06, "loss": 0.0224, "step": 4124 }, { "epoch": 1.1061946902654867, "grad_norm": 0.2666897462502224, "learning_rate": 7.957072468674282e-06, "loss": 0.0233, "step": 4125 }, { "epoch": 1.106462858675248, "grad_norm": 0.26985241281361594, "learning_rate": 7.955814240258532e-06, "loss": 0.0187, "step": 4126 }, { "epoch": 1.1067310270850095, "grad_norm": 0.35953815483619417, "learning_rate": 7.954555724042996e-06, "loss": 0.0291, "step": 4127 }, { "epoch": 1.1069991954947707, "grad_norm": 0.41332380527038576, "learning_rate": 7.953296920150213e-06, "loss": 0.0287, "step": 4128 }, { "epoch": 1.107267363904532, "grad_norm": 0.370337189715217, "learning_rate": 7.952037828702752e-06, "loss": 0.0224, "step": 4129 }, { "epoch": 1.1075355323142935, "grad_norm": 0.4690741665195384, "learning_rate": 7.950778449823203e-06, "loss": 0.0259, "step": 4130 }, { "epoch": 1.1078037007240547, "grad_norm": 0.2835463083034622, "learning_rate": 7.949518783634191e-06, "loss": 0.0241, "step": 4131 }, { "epoch": 1.108071869133816, "grad_norm": 0.5299333996966844, "learning_rate": 7.948258830258366e-06, "loss": 0.0204, "step": 4132 }, { "epoch": 1.1083400375435775, "grad_norm": 0.4309818192664088, "learning_rate": 7.946998589818406e-06, "loss": 0.0314, "step": 4133 }, { "epoch": 1.1086082059533386, "grad_norm": 0.42446611482525437, "learning_rate": 7.945738062437018e-06, "loss": 0.0383, "step": 4134 }, { "epoch": 1.1088763743631, "grad_norm": 0.5847952278898373, "learning_rate": 7.944477248236934e-06, "loss": 0.0278, "step": 4135 }, { "epoch": 1.1091445427728615, "grad_norm": 0.28954735959107586, "learning_rate": 7.94321614734092e-06, "loss": 0.0219, "step": 4136 }, { "epoch": 1.1094127111826226, "grad_norm": 0.26011725131205254, "learning_rate": 7.941954759871763e-06, "loss": 0.0236, "step": 4137 }, { "epoch": 1.109680879592384, "grad_norm": 0.30336296060694157, "learning_rate": 7.940693085952281e-06, "loss": 0.0324, "step": 4138 }, { "epoch": 1.1099490480021454, "grad_norm": 0.2322930076356446, "learning_rate": 7.93943112570532e-06, "loss": 0.0273, "step": 4139 }, { "epoch": 1.1102172164119066, "grad_norm": 0.29883359732806297, "learning_rate": 7.938168879253757e-06, "loss": 0.0293, "step": 4140 }, { "epoch": 1.110485384821668, "grad_norm": 0.24570490921669041, "learning_rate": 7.93690634672049e-06, "loss": 0.0241, "step": 4141 }, { "epoch": 1.1107535532314294, "grad_norm": 0.3376069571332474, "learning_rate": 7.935643528228448e-06, "loss": 0.0363, "step": 4142 }, { "epoch": 1.1110217216411906, "grad_norm": 0.2663791489009323, "learning_rate": 7.934380423900591e-06, "loss": 0.0256, "step": 4143 }, { "epoch": 1.111289890050952, "grad_norm": 0.2670221037078257, "learning_rate": 7.933117033859903e-06, "loss": 0.0278, "step": 4144 }, { "epoch": 1.1115580584607134, "grad_norm": 0.3265573900272089, "learning_rate": 7.931853358229396e-06, "loss": 0.0247, "step": 4145 }, { "epoch": 1.1118262268704746, "grad_norm": 0.2944649524168005, "learning_rate": 7.93058939713211e-06, "loss": 0.0304, "step": 4146 }, { "epoch": 1.112094395280236, "grad_norm": 0.2429980744319957, "learning_rate": 7.929325150691116e-06, "loss": 0.024, "step": 4147 }, { "epoch": 1.1123625636899974, "grad_norm": 0.30571124497970364, "learning_rate": 7.92806061902951e-06, "loss": 0.0255, "step": 4148 }, { "epoch": 1.1126307320997586, "grad_norm": 0.35103142715837277, "learning_rate": 7.926795802270415e-06, "loss": 0.0306, "step": 4149 }, { "epoch": 1.11289890050952, "grad_norm": 0.34649021647851863, "learning_rate": 7.925530700536982e-06, "loss": 0.0263, "step": 4150 }, { "epoch": 1.1131670689192814, "grad_norm": 0.28368951759623695, "learning_rate": 7.924265313952393e-06, "loss": 0.0258, "step": 4151 }, { "epoch": 1.1134352373290426, "grad_norm": 0.347437968716451, "learning_rate": 7.922999642639854e-06, "loss": 0.037, "step": 4152 }, { "epoch": 1.113703405738804, "grad_norm": 0.335328594290532, "learning_rate": 7.921733686722602e-06, "loss": 0.0208, "step": 4153 }, { "epoch": 1.1139715741485654, "grad_norm": 0.49890591533761997, "learning_rate": 7.920467446323896e-06, "loss": 0.0225, "step": 4154 }, { "epoch": 1.1142397425583266, "grad_norm": 0.1888072012916062, "learning_rate": 7.919200921567029e-06, "loss": 0.0177, "step": 4155 }, { "epoch": 1.114507910968088, "grad_norm": 0.2428824590146001, "learning_rate": 7.917934112575318e-06, "loss": 0.0244, "step": 4156 }, { "epoch": 1.1147760793778492, "grad_norm": 0.6292839446651106, "learning_rate": 7.916667019472111e-06, "loss": 0.0253, "step": 4157 }, { "epoch": 1.1150442477876106, "grad_norm": 0.2797895336527246, "learning_rate": 7.915399642380778e-06, "loss": 0.022, "step": 4158 }, { "epoch": 1.115312416197372, "grad_norm": 0.3709033263265123, "learning_rate": 7.914131981424722e-06, "loss": 0.0245, "step": 4159 }, { "epoch": 1.1155805846071334, "grad_norm": 0.37350488573930757, "learning_rate": 7.912864036727374e-06, "loss": 0.0277, "step": 4160 }, { "epoch": 1.1158487530168946, "grad_norm": 0.35591135124029405, "learning_rate": 7.911595808412186e-06, "loss": 0.0293, "step": 4161 }, { "epoch": 1.116116921426656, "grad_norm": 0.3323658415209433, "learning_rate": 7.910327296602644e-06, "loss": 0.0274, "step": 4162 }, { "epoch": 1.1163850898364172, "grad_norm": 0.33674838702454474, "learning_rate": 7.909058501422262e-06, "loss": 0.0296, "step": 4163 }, { "epoch": 1.1166532582461786, "grad_norm": 0.30186607148297506, "learning_rate": 7.907789422994577e-06, "loss": 0.0361, "step": 4164 }, { "epoch": 1.11692142665594, "grad_norm": 0.32897186080708757, "learning_rate": 7.906520061443154e-06, "loss": 0.0368, "step": 4165 }, { "epoch": 1.1171895950657014, "grad_norm": 0.394905437197485, "learning_rate": 7.90525041689159e-06, "loss": 0.0277, "step": 4166 }, { "epoch": 1.1174577634754626, "grad_norm": 0.40044966362355716, "learning_rate": 7.903980489463507e-06, "loss": 0.0254, "step": 4167 }, { "epoch": 1.117725931885224, "grad_norm": 0.35285803009717864, "learning_rate": 7.902710279282554e-06, "loss": 0.0293, "step": 4168 }, { "epoch": 1.1179941002949851, "grad_norm": 0.2549185645416134, "learning_rate": 7.901439786472408e-06, "loss": 0.0209, "step": 4169 }, { "epoch": 1.1182622687047465, "grad_norm": 0.2965982372943463, "learning_rate": 7.90016901115677e-06, "loss": 0.032, "step": 4170 }, { "epoch": 1.118530437114508, "grad_norm": 0.27237683647084293, "learning_rate": 7.898897953459376e-06, "loss": 0.0232, "step": 4171 }, { "epoch": 1.1187986055242694, "grad_norm": 0.3157510762942235, "learning_rate": 7.897626613503988e-06, "loss": 0.0315, "step": 4172 }, { "epoch": 1.1190667739340305, "grad_norm": 0.3819584516368457, "learning_rate": 7.896354991414387e-06, "loss": 0.0405, "step": 4173 }, { "epoch": 1.119334942343792, "grad_norm": 0.3813124136664552, "learning_rate": 7.895083087314391e-06, "loss": 0.0289, "step": 4174 }, { "epoch": 1.1196031107535531, "grad_norm": 0.2559914631645647, "learning_rate": 7.89381090132784e-06, "loss": 0.0261, "step": 4175 }, { "epoch": 1.1198712791633145, "grad_norm": 0.3851260813916714, "learning_rate": 7.892538433578607e-06, "loss": 0.0205, "step": 4176 }, { "epoch": 1.120139447573076, "grad_norm": 0.32815564214621246, "learning_rate": 7.891265684190583e-06, "loss": 0.0326, "step": 4177 }, { "epoch": 1.1204076159828373, "grad_norm": 0.40285146442820446, "learning_rate": 7.889992653287698e-06, "loss": 0.0267, "step": 4178 }, { "epoch": 1.1206757843925985, "grad_norm": 0.2797457296293269, "learning_rate": 7.8887193409939e-06, "loss": 0.0252, "step": 4179 }, { "epoch": 1.12094395280236, "grad_norm": 0.27093535622584985, "learning_rate": 7.887445747433169e-06, "loss": 0.0224, "step": 4180 }, { "epoch": 1.121212121212121, "grad_norm": 0.3803487139516584, "learning_rate": 7.886171872729513e-06, "loss": 0.029, "step": 4181 }, { "epoch": 1.1214802896218825, "grad_norm": 0.8751256647126664, "learning_rate": 7.884897717006962e-06, "loss": 0.0249, "step": 4182 }, { "epoch": 1.121748458031644, "grad_norm": 0.3298758675712611, "learning_rate": 7.883623280389581e-06, "loss": 0.0307, "step": 4183 }, { "epoch": 1.1220166264414053, "grad_norm": 0.26013915217697675, "learning_rate": 7.882348563001459e-06, "loss": 0.0193, "step": 4184 }, { "epoch": 1.1222847948511665, "grad_norm": 0.31239487604244787, "learning_rate": 7.881073564966706e-06, "loss": 0.0314, "step": 4185 }, { "epoch": 1.122552963260928, "grad_norm": 0.29606126443292674, "learning_rate": 7.879798286409471e-06, "loss": 0.0256, "step": 4186 }, { "epoch": 1.122821131670689, "grad_norm": 0.25036759598590835, "learning_rate": 7.878522727453924e-06, "loss": 0.021, "step": 4187 }, { "epoch": 1.1230893000804505, "grad_norm": 0.34537980384278044, "learning_rate": 7.87724688822426e-06, "loss": 0.0244, "step": 4188 }, { "epoch": 1.123357468490212, "grad_norm": 0.3848214154466663, "learning_rate": 7.875970768844703e-06, "loss": 0.024, "step": 4189 }, { "epoch": 1.123625636899973, "grad_norm": 0.32802128320292184, "learning_rate": 7.874694369439511e-06, "loss": 0.026, "step": 4190 }, { "epoch": 1.1238938053097345, "grad_norm": 0.29409589510706435, "learning_rate": 7.87341769013296e-06, "loss": 0.0256, "step": 4191 }, { "epoch": 1.1241619737194959, "grad_norm": 0.39505588215971155, "learning_rate": 7.872140731049356e-06, "loss": 0.0311, "step": 4192 }, { "epoch": 1.124430142129257, "grad_norm": 0.30949360256394914, "learning_rate": 7.870863492313035e-06, "loss": 0.0251, "step": 4193 }, { "epoch": 1.1246983105390185, "grad_norm": 0.4524253177076165, "learning_rate": 7.869585974048359e-06, "loss": 0.0249, "step": 4194 }, { "epoch": 1.1249664789487799, "grad_norm": 0.2907674867665826, "learning_rate": 7.868308176379713e-06, "loss": 0.0293, "step": 4195 }, { "epoch": 1.1252346473585413, "grad_norm": 0.2804988109618617, "learning_rate": 7.867030099431517e-06, "loss": 0.0238, "step": 4196 }, { "epoch": 1.1255028157683025, "grad_norm": 0.322025921058777, "learning_rate": 7.865751743328211e-06, "loss": 0.0286, "step": 4197 }, { "epoch": 1.1257709841780639, "grad_norm": 0.24311528037573407, "learning_rate": 7.864473108194268e-06, "loss": 0.0257, "step": 4198 }, { "epoch": 1.126039152587825, "grad_norm": 0.283552249667885, "learning_rate": 7.86319419415418e-06, "loss": 0.0213, "step": 4199 }, { "epoch": 1.1263073209975865, "grad_norm": 0.34624401723656195, "learning_rate": 7.861915001332479e-06, "loss": 0.0385, "step": 4200 }, { "epoch": 1.1265754894073479, "grad_norm": 0.34788546625631106, "learning_rate": 7.860635529853708e-06, "loss": 0.026, "step": 4201 }, { "epoch": 1.1268436578171093, "grad_norm": 0.2502238275382981, "learning_rate": 7.859355779842455e-06, "loss": 0.0239, "step": 4202 }, { "epoch": 1.1271118262268705, "grad_norm": 0.25643857948199306, "learning_rate": 7.858075751423319e-06, "loss": 0.024, "step": 4203 }, { "epoch": 1.1273799946366319, "grad_norm": 0.3768920264595995, "learning_rate": 7.856795444720934e-06, "loss": 0.031, "step": 4204 }, { "epoch": 1.127648163046393, "grad_norm": 0.32855784639250407, "learning_rate": 7.855514859859963e-06, "loss": 0.0313, "step": 4205 }, { "epoch": 1.1279163314561544, "grad_norm": 0.31682520285739896, "learning_rate": 7.854233996965091e-06, "loss": 0.0252, "step": 4206 }, { "epoch": 1.1281844998659158, "grad_norm": 0.30662700997080405, "learning_rate": 7.852952856161031e-06, "loss": 0.0187, "step": 4207 }, { "epoch": 1.128452668275677, "grad_norm": 0.39238111491725974, "learning_rate": 7.851671437572528e-06, "loss": 0.0235, "step": 4208 }, { "epoch": 1.1287208366854384, "grad_norm": 0.35376105569710337, "learning_rate": 7.850389741324347e-06, "loss": 0.0294, "step": 4209 }, { "epoch": 1.1289890050951998, "grad_norm": 0.511336622821663, "learning_rate": 7.849107767541284e-06, "loss": 0.0313, "step": 4210 }, { "epoch": 1.129257173504961, "grad_norm": 0.2961101797776158, "learning_rate": 7.847825516348164e-06, "loss": 0.0255, "step": 4211 }, { "epoch": 1.1295253419147224, "grad_norm": 0.4827423105268493, "learning_rate": 7.846542987869832e-06, "loss": 0.0201, "step": 4212 }, { "epoch": 1.1297935103244838, "grad_norm": 0.5004546710916487, "learning_rate": 7.845260182231166e-06, "loss": 0.0305, "step": 4213 }, { "epoch": 1.130061678734245, "grad_norm": 0.35102034172555613, "learning_rate": 7.843977099557075e-06, "loss": 0.0272, "step": 4214 }, { "epoch": 1.1303298471440064, "grad_norm": 0.3400291502830289, "learning_rate": 7.84269373997248e-06, "loss": 0.0283, "step": 4215 }, { "epoch": 1.1305980155537678, "grad_norm": 0.4477580913977664, "learning_rate": 7.841410103602345e-06, "loss": 0.052, "step": 4216 }, { "epoch": 1.130866183963529, "grad_norm": 0.5295502076277343, "learning_rate": 7.84012619057165e-06, "loss": 0.0282, "step": 4217 }, { "epoch": 1.1311343523732904, "grad_norm": 0.2537692994848566, "learning_rate": 7.83884200100541e-06, "loss": 0.0206, "step": 4218 }, { "epoch": 1.1314025207830518, "grad_norm": 0.3237797465001269, "learning_rate": 7.83755753502866e-06, "loss": 0.0286, "step": 4219 }, { "epoch": 1.131670689192813, "grad_norm": 0.2368752717365191, "learning_rate": 7.836272792766467e-06, "loss": 0.0211, "step": 4220 }, { "epoch": 1.1319388576025744, "grad_norm": 0.2620768265194458, "learning_rate": 7.83498777434392e-06, "loss": 0.0258, "step": 4221 }, { "epoch": 1.1322070260123358, "grad_norm": 0.5178884195798888, "learning_rate": 7.833702479886143e-06, "loss": 0.0266, "step": 4222 }, { "epoch": 1.132475194422097, "grad_norm": 0.24665835650074486, "learning_rate": 7.832416909518278e-06, "loss": 0.0179, "step": 4223 }, { "epoch": 1.1327433628318584, "grad_norm": 0.2936674418331212, "learning_rate": 7.8311310633655e-06, "loss": 0.0295, "step": 4224 }, { "epoch": 1.1330115312416198, "grad_norm": 0.21644312525985643, "learning_rate": 7.829844941553004e-06, "loss": 0.0204, "step": 4225 }, { "epoch": 1.133279699651381, "grad_norm": 0.6281995662689923, "learning_rate": 7.828558544206022e-06, "loss": 0.0376, "step": 4226 }, { "epoch": 1.1335478680611424, "grad_norm": 0.32516705681810687, "learning_rate": 7.827271871449803e-06, "loss": 0.0302, "step": 4227 }, { "epoch": 1.1338160364709038, "grad_norm": 0.369947823275506, "learning_rate": 7.825984923409627e-06, "loss": 0.0326, "step": 4228 }, { "epoch": 1.134084204880665, "grad_norm": 0.3660119726816325, "learning_rate": 7.824697700210804e-06, "loss": 0.0354, "step": 4229 }, { "epoch": 1.1343523732904264, "grad_norm": 0.2944373871083325, "learning_rate": 7.823410201978663e-06, "loss": 0.0276, "step": 4230 }, { "epoch": 1.1346205417001878, "grad_norm": 0.29410204219220454, "learning_rate": 7.822122428838567e-06, "loss": 0.0231, "step": 4231 }, { "epoch": 1.134888710109949, "grad_norm": 0.3461791241750489, "learning_rate": 7.820834380915905e-06, "loss": 0.0272, "step": 4232 }, { "epoch": 1.1351568785197104, "grad_norm": 0.6258491007417226, "learning_rate": 7.819546058336087e-06, "loss": 0.0258, "step": 4233 }, { "epoch": 1.1354250469294718, "grad_norm": 0.23974911076342684, "learning_rate": 7.818257461224556e-06, "loss": 0.0196, "step": 4234 }, { "epoch": 1.135693215339233, "grad_norm": 0.28849904997014614, "learning_rate": 7.816968589706776e-06, "loss": 0.0256, "step": 4235 }, { "epoch": 1.1359613837489944, "grad_norm": 0.2745885167132827, "learning_rate": 7.815679443908246e-06, "loss": 0.0204, "step": 4236 }, { "epoch": 1.1362295521587558, "grad_norm": 0.3612156690860062, "learning_rate": 7.814390023954482e-06, "loss": 0.0298, "step": 4237 }, { "epoch": 1.136497720568517, "grad_norm": 0.2634264638326276, "learning_rate": 7.813100329971034e-06, "loss": 0.0285, "step": 4238 }, { "epoch": 1.1367658889782783, "grad_norm": 0.24617403448572123, "learning_rate": 7.811810362083476e-06, "loss": 0.0188, "step": 4239 }, { "epoch": 1.1370340573880398, "grad_norm": 0.33102241663509985, "learning_rate": 7.810520120417408e-06, "loss": 0.0286, "step": 4240 }, { "epoch": 1.137302225797801, "grad_norm": 0.2654474689221968, "learning_rate": 7.809229605098458e-06, "loss": 0.0256, "step": 4241 }, { "epoch": 1.1375703942075623, "grad_norm": 0.3962137473269484, "learning_rate": 7.807938816252279e-06, "loss": 0.0324, "step": 4242 }, { "epoch": 1.1378385626173237, "grad_norm": 0.32330450061901295, "learning_rate": 7.806647754004553e-06, "loss": 0.0325, "step": 4243 }, { "epoch": 1.138106731027085, "grad_norm": 0.678484345510273, "learning_rate": 7.805356418480988e-06, "loss": 0.0329, "step": 4244 }, { "epoch": 1.1383748994368463, "grad_norm": 0.5607088944147768, "learning_rate": 7.804064809807315e-06, "loss": 0.032, "step": 4245 }, { "epoch": 1.1386430678466077, "grad_norm": 0.31684433010752583, "learning_rate": 7.802772928109295e-06, "loss": 0.0234, "step": 4246 }, { "epoch": 1.138911236256369, "grad_norm": 0.27692348125251015, "learning_rate": 7.801480773512717e-06, "loss": 0.0224, "step": 4247 }, { "epoch": 1.1391794046661303, "grad_norm": 0.36685219250353557, "learning_rate": 7.800188346143394e-06, "loss": 0.0262, "step": 4248 }, { "epoch": 1.1394475730758917, "grad_norm": 0.39491953956763987, "learning_rate": 7.798895646127166e-06, "loss": 0.0294, "step": 4249 }, { "epoch": 1.139715741485653, "grad_norm": 0.3346128325556546, "learning_rate": 7.797602673589902e-06, "loss": 0.022, "step": 4250 }, { "epoch": 1.1399839098954143, "grad_norm": 0.30610599247694353, "learning_rate": 7.79630942865749e-06, "loss": 0.0345, "step": 4251 }, { "epoch": 1.1402520783051757, "grad_norm": 0.4330202243592825, "learning_rate": 7.795015911455854e-06, "loss": 0.0316, "step": 4252 }, { "epoch": 1.140520246714937, "grad_norm": 0.28897876681884577, "learning_rate": 7.793722122110941e-06, "loss": 0.0203, "step": 4253 }, { "epoch": 1.1407884151246983, "grad_norm": 0.5172339099949775, "learning_rate": 7.79242806074872e-06, "loss": 0.0315, "step": 4254 }, { "epoch": 1.1410565835344597, "grad_norm": 0.33746761284792126, "learning_rate": 7.791133727495193e-06, "loss": 0.0266, "step": 4255 }, { "epoch": 1.141324751944221, "grad_norm": 0.2816554852681798, "learning_rate": 7.789839122476386e-06, "loss": 0.0173, "step": 4256 }, { "epoch": 1.1415929203539823, "grad_norm": 0.5013424642812735, "learning_rate": 7.78854424581835e-06, "loss": 0.0332, "step": 4257 }, { "epoch": 1.1418610887637437, "grad_norm": 0.3536130326308502, "learning_rate": 7.787249097647166e-06, "loss": 0.0306, "step": 4258 }, { "epoch": 1.1421292571735049, "grad_norm": 0.3199859963214248, "learning_rate": 7.785953678088934e-06, "loss": 0.021, "step": 4259 }, { "epoch": 1.1423974255832663, "grad_norm": 0.26422165616646365, "learning_rate": 7.784657987269791e-06, "loss": 0.0215, "step": 4260 }, { "epoch": 1.1426655939930277, "grad_norm": 0.2649532672309566, "learning_rate": 7.783362025315892e-06, "loss": 0.0216, "step": 4261 }, { "epoch": 1.1429337624027889, "grad_norm": 0.5290362084315056, "learning_rate": 7.782065792353424e-06, "loss": 0.0347, "step": 4262 }, { "epoch": 1.1432019308125503, "grad_norm": 0.2466631279295337, "learning_rate": 7.780769288508594e-06, "loss": 0.0171, "step": 4263 }, { "epoch": 1.1434700992223117, "grad_norm": 0.26756269310407144, "learning_rate": 7.779472513907643e-06, "loss": 0.0258, "step": 4264 }, { "epoch": 1.1437382676320729, "grad_norm": 0.31464546983450736, "learning_rate": 7.778175468676833e-06, "loss": 0.0259, "step": 4265 }, { "epoch": 1.1440064360418343, "grad_norm": 0.9465559199681399, "learning_rate": 7.77687815294245e-06, "loss": 0.0325, "step": 4266 }, { "epoch": 1.1442746044515957, "grad_norm": 0.29915735079750944, "learning_rate": 7.775580566830815e-06, "loss": 0.0231, "step": 4267 }, { "epoch": 1.1445427728613569, "grad_norm": 0.2986760320703412, "learning_rate": 7.774282710468272e-06, "loss": 0.0297, "step": 4268 }, { "epoch": 1.1448109412711183, "grad_norm": 0.36049965750516594, "learning_rate": 7.772984583981185e-06, "loss": 0.0365, "step": 4269 }, { "epoch": 1.1450791096808797, "grad_norm": 0.26777655277023116, "learning_rate": 7.771686187495951e-06, "loss": 0.0192, "step": 4270 }, { "epoch": 1.1453472780906409, "grad_norm": 0.37497343502051117, "learning_rate": 7.770387521138993e-06, "loss": 0.033, "step": 4271 }, { "epoch": 1.1456154465004023, "grad_norm": 0.2677634811512745, "learning_rate": 7.769088585036756e-06, "loss": 0.0193, "step": 4272 }, { "epoch": 1.1458836149101637, "grad_norm": 0.47689982404128367, "learning_rate": 7.767789379315715e-06, "loss": 0.0264, "step": 4273 }, { "epoch": 1.1461517833199248, "grad_norm": 0.3201366203193023, "learning_rate": 7.76648990410237e-06, "loss": 0.0255, "step": 4274 }, { "epoch": 1.1464199517296862, "grad_norm": 0.24855301247333264, "learning_rate": 7.76519015952325e-06, "loss": 0.0234, "step": 4275 }, { "epoch": 1.1466881201394477, "grad_norm": 0.6443500196832805, "learning_rate": 7.763890145704905e-06, "loss": 0.0444, "step": 4276 }, { "epoch": 1.1469562885492088, "grad_norm": 0.38965367623426517, "learning_rate": 7.762589862773915e-06, "loss": 0.0299, "step": 4277 }, { "epoch": 1.1472244569589702, "grad_norm": 0.3033499072089395, "learning_rate": 7.761289310856883e-06, "loss": 0.0213, "step": 4278 }, { "epoch": 1.1474926253687316, "grad_norm": 0.2439365103139888, "learning_rate": 7.759988490080442e-06, "loss": 0.021, "step": 4279 }, { "epoch": 1.1477607937784928, "grad_norm": 0.39021049452889756, "learning_rate": 7.758687400571249e-06, "loss": 0.0372, "step": 4280 }, { "epoch": 1.1480289621882542, "grad_norm": 0.41446979545727886, "learning_rate": 7.75738604245599e-06, "loss": 0.0268, "step": 4281 }, { "epoch": 1.1482971305980156, "grad_norm": 0.23676277417677782, "learning_rate": 7.756084415861372e-06, "loss": 0.0177, "step": 4282 }, { "epoch": 1.1485652990077768, "grad_norm": 0.3458059722111906, "learning_rate": 7.754782520914132e-06, "loss": 0.0262, "step": 4283 }, { "epoch": 1.1488334674175382, "grad_norm": 0.43022796192827195, "learning_rate": 7.753480357741031e-06, "loss": 0.0318, "step": 4284 }, { "epoch": 1.1491016358272996, "grad_norm": 0.3685181160512033, "learning_rate": 7.752177926468858e-06, "loss": 0.0309, "step": 4285 }, { "epoch": 1.1493698042370608, "grad_norm": 0.29869264451200384, "learning_rate": 7.750875227224427e-06, "loss": 0.0265, "step": 4286 }, { "epoch": 1.1496379726468222, "grad_norm": 0.20817491676799332, "learning_rate": 7.749572260134578e-06, "loss": 0.0195, "step": 4287 }, { "epoch": 1.1499061410565836, "grad_norm": 0.3010388268156244, "learning_rate": 7.74826902532618e-06, "loss": 0.0313, "step": 4288 }, { "epoch": 1.1501743094663448, "grad_norm": 0.29954934878178924, "learning_rate": 7.746965522926122e-06, "loss": 0.0272, "step": 4289 }, { "epoch": 1.1504424778761062, "grad_norm": 0.39997731380915796, "learning_rate": 7.745661753061324e-06, "loss": 0.0331, "step": 4290 }, { "epoch": 1.1507106462858676, "grad_norm": 0.3652876056699885, "learning_rate": 7.744357715858732e-06, "loss": 0.0269, "step": 4291 }, { "epoch": 1.1509788146956288, "grad_norm": 0.25785314584095237, "learning_rate": 7.743053411445314e-06, "loss": 0.0291, "step": 4292 }, { "epoch": 1.1512469831053902, "grad_norm": 0.368211670063683, "learning_rate": 7.74174883994807e-06, "loss": 0.0297, "step": 4293 }, { "epoch": 1.1515151515151516, "grad_norm": 0.258032882044038, "learning_rate": 7.740444001494019e-06, "loss": 0.0196, "step": 4294 }, { "epoch": 1.1517833199249128, "grad_norm": 0.28749854983365075, "learning_rate": 7.73913889621021e-06, "loss": 0.0255, "step": 4295 }, { "epoch": 1.1520514883346742, "grad_norm": 0.29263897225893865, "learning_rate": 7.73783352422372e-06, "loss": 0.0239, "step": 4296 }, { "epoch": 1.1523196567444356, "grad_norm": 0.34683086193244417, "learning_rate": 7.73652788566165e-06, "loss": 0.0345, "step": 4297 }, { "epoch": 1.1525878251541968, "grad_norm": 0.3342812025673864, "learning_rate": 7.735221980651123e-06, "loss": 0.0334, "step": 4298 }, { "epoch": 1.1528559935639582, "grad_norm": 0.6164938357874172, "learning_rate": 7.733915809319295e-06, "loss": 0.0332, "step": 4299 }, { "epoch": 1.1531241619737196, "grad_norm": 0.2654121422922887, "learning_rate": 7.732609371793343e-06, "loss": 0.0194, "step": 4300 }, { "epoch": 1.1533923303834808, "grad_norm": 0.2086328259464695, "learning_rate": 7.73130266820047e-06, "loss": 0.016, "step": 4301 }, { "epoch": 1.1536604987932422, "grad_norm": 0.4448641003453191, "learning_rate": 7.72999569866791e-06, "loss": 0.0273, "step": 4302 }, { "epoch": 1.1539286672030036, "grad_norm": 0.45066491909822026, "learning_rate": 7.728688463322916e-06, "loss": 0.0329, "step": 4303 }, { "epoch": 1.1541968356127648, "grad_norm": 0.29239175973703524, "learning_rate": 7.72738096229277e-06, "loss": 0.0228, "step": 4304 }, { "epoch": 1.1544650040225262, "grad_norm": 0.38554594269411363, "learning_rate": 7.72607319570478e-06, "loss": 0.0331, "step": 4305 }, { "epoch": 1.1547331724322876, "grad_norm": 0.3886670459018736, "learning_rate": 7.724765163686283e-06, "loss": 0.0404, "step": 4306 }, { "epoch": 1.1550013408420488, "grad_norm": 0.2082667714461019, "learning_rate": 7.723456866364634e-06, "loss": 0.0207, "step": 4307 }, { "epoch": 1.1552695092518102, "grad_norm": 0.2710229342194994, "learning_rate": 7.722148303867222e-06, "loss": 0.027, "step": 4308 }, { "epoch": 1.1555376776615716, "grad_norm": 0.3792971961133131, "learning_rate": 7.720839476321455e-06, "loss": 0.0334, "step": 4309 }, { "epoch": 1.1558058460713327, "grad_norm": 0.2832787782592128, "learning_rate": 7.719530383854774e-06, "loss": 0.0243, "step": 4310 }, { "epoch": 1.1560740144810941, "grad_norm": 0.2635144206908196, "learning_rate": 7.718221026594638e-06, "loss": 0.029, "step": 4311 }, { "epoch": 1.1563421828908556, "grad_norm": 0.30802942333593264, "learning_rate": 7.71691140466854e-06, "loss": 0.037, "step": 4312 }, { "epoch": 1.1566103513006167, "grad_norm": 0.3353185715368533, "learning_rate": 7.71560151820399e-06, "loss": 0.0235, "step": 4313 }, { "epoch": 1.1568785197103781, "grad_norm": 0.3060210952740613, "learning_rate": 7.71429136732853e-06, "loss": 0.0298, "step": 4314 }, { "epoch": 1.1571466881201395, "grad_norm": 0.39684939927741403, "learning_rate": 7.712980952169723e-06, "loss": 0.0366, "step": 4315 }, { "epoch": 1.1574148565299007, "grad_norm": 0.3570417885446842, "learning_rate": 7.711670272855168e-06, "loss": 0.0269, "step": 4316 }, { "epoch": 1.1576830249396621, "grad_norm": 0.30741797389464814, "learning_rate": 7.710359329512477e-06, "loss": 0.0255, "step": 4317 }, { "epoch": 1.1579511933494233, "grad_norm": 0.30854503925735277, "learning_rate": 7.709048122269294e-06, "loss": 0.033, "step": 4318 }, { "epoch": 1.1582193617591847, "grad_norm": 0.27007192500181315, "learning_rate": 7.707736651253287e-06, "loss": 0.0295, "step": 4319 }, { "epoch": 1.1584875301689461, "grad_norm": 0.3753033877696913, "learning_rate": 7.70642491659215e-06, "loss": 0.0274, "step": 4320 }, { "epoch": 1.1587556985787075, "grad_norm": 0.4200284521883131, "learning_rate": 7.705112918413604e-06, "loss": 0.0238, "step": 4321 }, { "epoch": 1.1590238669884687, "grad_norm": 0.3153348811269925, "learning_rate": 7.703800656845398e-06, "loss": 0.0268, "step": 4322 }, { "epoch": 1.1592920353982301, "grad_norm": 0.24869822085959417, "learning_rate": 7.7024881320153e-06, "loss": 0.0245, "step": 4323 }, { "epoch": 1.1595602038079913, "grad_norm": 0.35995238034039806, "learning_rate": 7.701175344051104e-06, "loss": 0.038, "step": 4324 }, { "epoch": 1.1598283722177527, "grad_norm": 0.2828943298263581, "learning_rate": 7.69986229308064e-06, "loss": 0.0249, "step": 4325 }, { "epoch": 1.160096540627514, "grad_norm": 0.3066586834331653, "learning_rate": 7.69854897923175e-06, "loss": 0.0221, "step": 4326 }, { "epoch": 1.1603647090372755, "grad_norm": 0.30243658433683224, "learning_rate": 7.697235402632313e-06, "loss": 0.0241, "step": 4327 }, { "epoch": 1.1606328774470367, "grad_norm": 0.2563730687665492, "learning_rate": 7.695921563410224e-06, "loss": 0.0288, "step": 4328 }, { "epoch": 1.160901045856798, "grad_norm": 0.3556467999462351, "learning_rate": 7.69460746169341e-06, "loss": 0.0267, "step": 4329 }, { "epoch": 1.1611692142665593, "grad_norm": 0.2889553548937902, "learning_rate": 7.693293097609823e-06, "loss": 0.0245, "step": 4330 }, { "epoch": 1.1614373826763207, "grad_norm": 0.29564191329218553, "learning_rate": 7.691978471287436e-06, "loss": 0.029, "step": 4331 }, { "epoch": 1.161705551086082, "grad_norm": 0.45360432971456777, "learning_rate": 7.690663582854252e-06, "loss": 0.0375, "step": 4332 }, { "epoch": 1.1619737194958435, "grad_norm": 0.27273409170922563, "learning_rate": 7.6893484324383e-06, "loss": 0.0333, "step": 4333 }, { "epoch": 1.1622418879056047, "grad_norm": 0.3239549806534504, "learning_rate": 7.68803302016763e-06, "loss": 0.0286, "step": 4334 }, { "epoch": 1.162510056315366, "grad_norm": 0.2299537341531845, "learning_rate": 7.686717346170323e-06, "loss": 0.0254, "step": 4335 }, { "epoch": 1.1627782247251273, "grad_norm": 0.3460722858313576, "learning_rate": 7.68540141057448e-06, "loss": 0.0328, "step": 4336 }, { "epoch": 1.1630463931348887, "grad_norm": 0.2444923379843955, "learning_rate": 7.684085213508231e-06, "loss": 0.0271, "step": 4337 }, { "epoch": 1.16331456154465, "grad_norm": 0.37404422052135783, "learning_rate": 7.682768755099731e-06, "loss": 0.0357, "step": 4338 }, { "epoch": 1.1635827299544115, "grad_norm": 0.2986704937362207, "learning_rate": 7.68145203547716e-06, "loss": 0.0257, "step": 4339 }, { "epoch": 1.1638508983641727, "grad_norm": 0.25364652538116367, "learning_rate": 7.680135054768722e-06, "loss": 0.0207, "step": 4340 }, { "epoch": 1.164119066773934, "grad_norm": 0.40862688252688634, "learning_rate": 7.678817813102652e-06, "loss": 0.0367, "step": 4341 }, { "epoch": 1.1643872351836952, "grad_norm": 0.2538291278555848, "learning_rate": 7.677500310607203e-06, "loss": 0.0288, "step": 4342 }, { "epoch": 1.1646554035934567, "grad_norm": 0.29825961844099835, "learning_rate": 7.676182547410658e-06, "loss": 0.0254, "step": 4343 }, { "epoch": 1.164923572003218, "grad_norm": 0.2999762143903309, "learning_rate": 7.674864523641321e-06, "loss": 0.0215, "step": 4344 }, { "epoch": 1.1651917404129795, "grad_norm": 0.31912144705543466, "learning_rate": 7.673546239427528e-06, "loss": 0.0282, "step": 4345 }, { "epoch": 1.1654599088227406, "grad_norm": 0.2816485283465777, "learning_rate": 7.672227694897635e-06, "loss": 0.0256, "step": 4346 }, { "epoch": 1.165728077232502, "grad_norm": 0.359348848690921, "learning_rate": 7.67090889018003e-06, "loss": 0.0329, "step": 4347 }, { "epoch": 1.1659962456422632, "grad_norm": 0.29257703652306855, "learning_rate": 7.669589825403114e-06, "loss": 0.0282, "step": 4348 }, { "epoch": 1.1662644140520246, "grad_norm": 0.3359729538724772, "learning_rate": 7.668270500695324e-06, "loss": 0.0295, "step": 4349 }, { "epoch": 1.166532582461786, "grad_norm": 0.40956587777007475, "learning_rate": 7.666950916185121e-06, "loss": 0.0342, "step": 4350 }, { "epoch": 1.1668007508715474, "grad_norm": 0.328078990890433, "learning_rate": 7.665631072000987e-06, "loss": 0.0216, "step": 4351 }, { "epoch": 1.1670689192813086, "grad_norm": 0.9930732297452027, "learning_rate": 7.664310968271434e-06, "loss": 0.0381, "step": 4352 }, { "epoch": 1.16733708769107, "grad_norm": 0.26823805239047616, "learning_rate": 7.662990605124995e-06, "loss": 0.0282, "step": 4353 }, { "epoch": 1.1676052561008312, "grad_norm": 0.2691268578480586, "learning_rate": 7.661669982690229e-06, "loss": 0.0208, "step": 4354 }, { "epoch": 1.1678734245105926, "grad_norm": 0.33584675795374064, "learning_rate": 7.660349101095726e-06, "loss": 0.0363, "step": 4355 }, { "epoch": 1.168141592920354, "grad_norm": 0.502758116647226, "learning_rate": 7.659027960470091e-06, "loss": 0.0249, "step": 4356 }, { "epoch": 1.1684097613301154, "grad_norm": 0.31744495919719334, "learning_rate": 7.657706560941966e-06, "loss": 0.0228, "step": 4357 }, { "epoch": 1.1686779297398766, "grad_norm": 0.2799809329660122, "learning_rate": 7.656384902640008e-06, "loss": 0.021, "step": 4358 }, { "epoch": 1.168946098149638, "grad_norm": 0.3288606356169136, "learning_rate": 7.655062985692905e-06, "loss": 0.0276, "step": 4359 }, { "epoch": 1.1692142665593992, "grad_norm": 0.36575651073610055, "learning_rate": 7.65374081022937e-06, "loss": 0.0342, "step": 4360 }, { "epoch": 1.1694824349691606, "grad_norm": 0.3518942901763917, "learning_rate": 7.652418376378135e-06, "loss": 0.0265, "step": 4361 }, { "epoch": 1.169750603378922, "grad_norm": 0.3897395421248616, "learning_rate": 7.651095684267965e-06, "loss": 0.0293, "step": 4362 }, { "epoch": 1.1700187717886834, "grad_norm": 0.3658907444140491, "learning_rate": 7.649772734027647e-06, "loss": 0.034, "step": 4363 }, { "epoch": 1.1702869401984446, "grad_norm": 0.27193956589434276, "learning_rate": 7.648449525785994e-06, "loss": 0.023, "step": 4364 }, { "epoch": 1.170555108608206, "grad_norm": 0.332709353640698, "learning_rate": 7.647126059671841e-06, "loss": 0.0235, "step": 4365 }, { "epoch": 1.1708232770179672, "grad_norm": 0.2478024448506725, "learning_rate": 7.645802335814051e-06, "loss": 0.0208, "step": 4366 }, { "epoch": 1.1710914454277286, "grad_norm": 0.2821467696834224, "learning_rate": 7.644478354341513e-06, "loss": 0.0233, "step": 4367 }, { "epoch": 1.17135961383749, "grad_norm": 0.44911085329501554, "learning_rate": 7.64315411538314e-06, "loss": 0.024, "step": 4368 }, { "epoch": 1.1716277822472514, "grad_norm": 0.25506199678025704, "learning_rate": 7.641829619067866e-06, "loss": 0.0252, "step": 4369 }, { "epoch": 1.1718959506570126, "grad_norm": 0.7530798808426693, "learning_rate": 7.640504865524657e-06, "loss": 0.0402, "step": 4370 }, { "epoch": 1.172164119066774, "grad_norm": 0.24143648447291616, "learning_rate": 7.639179854882499e-06, "loss": 0.0268, "step": 4371 }, { "epoch": 1.1724322874765352, "grad_norm": 0.3008868251377793, "learning_rate": 7.637854587270405e-06, "loss": 0.0239, "step": 4372 }, { "epoch": 1.1727004558862966, "grad_norm": 0.252131024309555, "learning_rate": 7.636529062817415e-06, "loss": 0.0238, "step": 4373 }, { "epoch": 1.172968624296058, "grad_norm": 0.25597351374811295, "learning_rate": 7.63520328165259e-06, "loss": 0.0256, "step": 4374 }, { "epoch": 1.1732367927058194, "grad_norm": 0.3256072693131043, "learning_rate": 7.633877243905016e-06, "loss": 0.0339, "step": 4375 }, { "epoch": 1.1735049611155806, "grad_norm": 0.30354561756683646, "learning_rate": 7.632550949703808e-06, "loss": 0.0291, "step": 4376 }, { "epoch": 1.173773129525342, "grad_norm": 0.374452967729698, "learning_rate": 7.631224399178103e-06, "loss": 0.028, "step": 4377 }, { "epoch": 1.1740412979351031, "grad_norm": 0.40644174553048573, "learning_rate": 7.629897592457066e-06, "loss": 0.0379, "step": 4378 }, { "epoch": 1.1743094663448645, "grad_norm": 0.26270674276326994, "learning_rate": 7.6285705296698805e-06, "loss": 0.0265, "step": 4379 }, { "epoch": 1.174577634754626, "grad_norm": 0.24152654671526466, "learning_rate": 7.6272432109457625e-06, "loss": 0.0214, "step": 4380 }, { "epoch": 1.1748458031643874, "grad_norm": 0.24944512630579888, "learning_rate": 7.625915636413949e-06, "loss": 0.0237, "step": 4381 }, { "epoch": 1.1751139715741485, "grad_norm": 0.3856902614856428, "learning_rate": 7.624587806203699e-06, "loss": 0.0345, "step": 4382 }, { "epoch": 1.17538213998391, "grad_norm": 0.3433893841318947, "learning_rate": 7.623259720444305e-06, "loss": 0.0357, "step": 4383 }, { "epoch": 1.1756503083936711, "grad_norm": 0.5225586607053644, "learning_rate": 7.621931379265075e-06, "loss": 0.0235, "step": 4384 }, { "epoch": 1.1759184768034325, "grad_norm": 0.2643009982410421, "learning_rate": 7.62060278279535e-06, "loss": 0.0238, "step": 4385 }, { "epoch": 1.176186645213194, "grad_norm": 0.4332736693601954, "learning_rate": 7.619273931164487e-06, "loss": 0.0388, "step": 4386 }, { "epoch": 1.1764548136229551, "grad_norm": 0.3554110575019689, "learning_rate": 7.617944824501875e-06, "loss": 0.032, "step": 4387 }, { "epoch": 1.1767229820327165, "grad_norm": 0.40561517274220243, "learning_rate": 7.616615462936927e-06, "loss": 0.0432, "step": 4388 }, { "epoch": 1.176991150442478, "grad_norm": 0.4039621670163445, "learning_rate": 7.615285846599079e-06, "loss": 0.0493, "step": 4389 }, { "epoch": 1.1772593188522391, "grad_norm": 0.35421278904572157, "learning_rate": 7.613955975617793e-06, "loss": 0.0224, "step": 4390 }, { "epoch": 1.1775274872620005, "grad_norm": 0.3701044926323328, "learning_rate": 7.612625850122551e-06, "loss": 0.0247, "step": 4391 }, { "epoch": 1.177795655671762, "grad_norm": 0.287184922853217, "learning_rate": 7.611295470242867e-06, "loss": 0.0262, "step": 4392 }, { "epoch": 1.178063824081523, "grad_norm": 0.222661566241732, "learning_rate": 7.609964836108275e-06, "loss": 0.0202, "step": 4393 }, { "epoch": 1.1783319924912845, "grad_norm": 0.2559458709126972, "learning_rate": 7.608633947848338e-06, "loss": 0.0251, "step": 4394 }, { "epoch": 1.178600160901046, "grad_norm": 0.37746487354964414, "learning_rate": 7.6073028055926375e-06, "loss": 0.0276, "step": 4395 }, { "epoch": 1.178868329310807, "grad_norm": 0.20469225204007865, "learning_rate": 7.605971409470787e-06, "loss": 0.0211, "step": 4396 }, { "epoch": 1.1791364977205685, "grad_norm": 0.3975510855032446, "learning_rate": 7.604639759612419e-06, "loss": 0.0392, "step": 4397 }, { "epoch": 1.17940466613033, "grad_norm": 0.25485730713009525, "learning_rate": 7.603307856147191e-06, "loss": 0.0196, "step": 4398 }, { "epoch": 1.179672834540091, "grad_norm": 0.2707617827444718, "learning_rate": 7.601975699204789e-06, "loss": 0.0228, "step": 4399 }, { "epoch": 1.1799410029498525, "grad_norm": 0.4794470341019494, "learning_rate": 7.600643288914922e-06, "loss": 0.0399, "step": 4400 }, { "epoch": 1.180209171359614, "grad_norm": 0.39914905167803033, "learning_rate": 7.599310625407323e-06, "loss": 0.0263, "step": 4401 }, { "epoch": 1.180477339769375, "grad_norm": 0.2633384193625362, "learning_rate": 7.59797770881175e-06, "loss": 0.018, "step": 4402 }, { "epoch": 1.1807455081791365, "grad_norm": 0.40450352183191624, "learning_rate": 7.596644539257983e-06, "loss": 0.0437, "step": 4403 }, { "epoch": 1.1810136765888979, "grad_norm": 0.3453283301426952, "learning_rate": 7.595311116875832e-06, "loss": 0.0253, "step": 4404 }, { "epoch": 1.181281844998659, "grad_norm": 0.2774001802931696, "learning_rate": 7.593977441795129e-06, "loss": 0.0269, "step": 4405 }, { "epoch": 1.1815500134084205, "grad_norm": 0.41555957218667217, "learning_rate": 7.592643514145728e-06, "loss": 0.0328, "step": 4406 }, { "epoch": 1.1818181818181819, "grad_norm": 0.295926535130485, "learning_rate": 7.591309334057511e-06, "loss": 0.0235, "step": 4407 }, { "epoch": 1.182086350227943, "grad_norm": 0.31182175251646144, "learning_rate": 7.5899749016603855e-06, "loss": 0.0291, "step": 4408 }, { "epoch": 1.1823545186377045, "grad_norm": 0.35616126381957625, "learning_rate": 7.588640217084279e-06, "loss": 0.0321, "step": 4409 }, { "epoch": 1.1826226870474659, "grad_norm": 0.18489561105003466, "learning_rate": 7.587305280459148e-06, "loss": 0.0188, "step": 4410 }, { "epoch": 1.182890855457227, "grad_norm": 0.3006398542060778, "learning_rate": 7.585970091914969e-06, "loss": 0.0326, "step": 4411 }, { "epoch": 1.1831590238669885, "grad_norm": 0.2562784750773555, "learning_rate": 7.5846346515817505e-06, "loss": 0.0246, "step": 4412 }, { "epoch": 1.1834271922767499, "grad_norm": 0.28981603742122397, "learning_rate": 7.583298959589517e-06, "loss": 0.0202, "step": 4413 }, { "epoch": 1.183695360686511, "grad_norm": 0.31245897061298017, "learning_rate": 7.581963016068322e-06, "loss": 0.0276, "step": 4414 }, { "epoch": 1.1839635290962724, "grad_norm": 0.5325014254939602, "learning_rate": 7.580626821148242e-06, "loss": 0.0364, "step": 4415 }, { "epoch": 1.1842316975060339, "grad_norm": 0.3605253847379521, "learning_rate": 7.579290374959383e-06, "loss": 0.0303, "step": 4416 }, { "epoch": 1.184499865915795, "grad_norm": 0.3178142240579125, "learning_rate": 7.577953677631866e-06, "loss": 0.0355, "step": 4417 }, { "epoch": 1.1847680343255564, "grad_norm": 0.30090543514016804, "learning_rate": 7.576616729295844e-06, "loss": 0.023, "step": 4418 }, { "epoch": 1.1850362027353178, "grad_norm": 0.33870155754602516, "learning_rate": 7.5752795300814915e-06, "loss": 0.0287, "step": 4419 }, { "epoch": 1.185304371145079, "grad_norm": 0.30032008142654054, "learning_rate": 7.573942080119009e-06, "loss": 0.0226, "step": 4420 }, { "epoch": 1.1855725395548404, "grad_norm": 0.28028849556239216, "learning_rate": 7.57260437953862e-06, "loss": 0.0215, "step": 4421 }, { "epoch": 1.1858407079646018, "grad_norm": 0.2904747056658521, "learning_rate": 7.571266428470571e-06, "loss": 0.0262, "step": 4422 }, { "epoch": 1.186108876374363, "grad_norm": 0.24052403319642113, "learning_rate": 7.569928227045138e-06, "loss": 0.0178, "step": 4423 }, { "epoch": 1.1863770447841244, "grad_norm": 0.272311129063299, "learning_rate": 7.568589775392616e-06, "loss": 0.0287, "step": 4424 }, { "epoch": 1.1866452131938858, "grad_norm": 0.23113936682361502, "learning_rate": 7.567251073643327e-06, "loss": 0.0221, "step": 4425 }, { "epoch": 1.186913381603647, "grad_norm": 0.4685161826481084, "learning_rate": 7.565912121927619e-06, "loss": 0.0304, "step": 4426 }, { "epoch": 1.1871815500134084, "grad_norm": 0.3322442878531938, "learning_rate": 7.564572920375857e-06, "loss": 0.0348, "step": 4427 }, { "epoch": 1.1874497184231698, "grad_norm": 0.37726724738671846, "learning_rate": 7.563233469118441e-06, "loss": 0.0322, "step": 4428 }, { "epoch": 1.187717886832931, "grad_norm": 0.45879219758321593, "learning_rate": 7.561893768285786e-06, "loss": 0.0276, "step": 4429 }, { "epoch": 1.1879860552426924, "grad_norm": 0.33563006918116495, "learning_rate": 7.560553818008336e-06, "loss": 0.0272, "step": 4430 }, { "epoch": 1.1882542236524538, "grad_norm": 0.35362311137597124, "learning_rate": 7.5592136184165586e-06, "loss": 0.0251, "step": 4431 }, { "epoch": 1.188522392062215, "grad_norm": 0.294115552078573, "learning_rate": 7.557873169640948e-06, "loss": 0.024, "step": 4432 }, { "epoch": 1.1887905604719764, "grad_norm": 0.26852106883543686, "learning_rate": 7.556532471812017e-06, "loss": 0.0187, "step": 4433 }, { "epoch": 1.1890587288817378, "grad_norm": 0.24276099576075158, "learning_rate": 7.555191525060306e-06, "loss": 0.0212, "step": 4434 }, { "epoch": 1.189326897291499, "grad_norm": 0.4012440513094016, "learning_rate": 7.553850329516379e-06, "loss": 0.0359, "step": 4435 }, { "epoch": 1.1895950657012604, "grad_norm": 0.36456691742290903, "learning_rate": 7.552508885310827e-06, "loss": 0.0272, "step": 4436 }, { "epoch": 1.1898632341110218, "grad_norm": 0.31289299973777823, "learning_rate": 7.551167192574262e-06, "loss": 0.0263, "step": 4437 }, { "epoch": 1.190131402520783, "grad_norm": 0.2828939854146892, "learning_rate": 7.549825251437322e-06, "loss": 0.0241, "step": 4438 }, { "epoch": 1.1903995709305444, "grad_norm": 0.27400089339526057, "learning_rate": 7.548483062030665e-06, "loss": 0.0221, "step": 4439 }, { "epoch": 1.1906677393403058, "grad_norm": 0.4142505118260145, "learning_rate": 7.547140624484979e-06, "loss": 0.0301, "step": 4440 }, { "epoch": 1.190935907750067, "grad_norm": 0.30637732035402343, "learning_rate": 7.545797938930971e-06, "loss": 0.0241, "step": 4441 }, { "epoch": 1.1912040761598284, "grad_norm": 0.27809901510567464, "learning_rate": 7.544455005499378e-06, "loss": 0.0214, "step": 4442 }, { "epoch": 1.1914722445695898, "grad_norm": 0.24565749813099186, "learning_rate": 7.543111824320956e-06, "loss": 0.0267, "step": 4443 }, { "epoch": 1.191740412979351, "grad_norm": 0.5846477988826597, "learning_rate": 7.54176839552649e-06, "loss": 0.0404, "step": 4444 }, { "epoch": 1.1920085813891124, "grad_norm": 0.27620285388453714, "learning_rate": 7.540424719246782e-06, "loss": 0.0251, "step": 4445 }, { "epoch": 1.1922767497988738, "grad_norm": 0.3664908823665164, "learning_rate": 7.5390807956126634e-06, "loss": 0.0291, "step": 4446 }, { "epoch": 1.192544918208635, "grad_norm": 0.40985888784764357, "learning_rate": 7.537736624754989e-06, "loss": 0.0422, "step": 4447 }, { "epoch": 1.1928130866183964, "grad_norm": 0.2398115965487242, "learning_rate": 7.536392206804637e-06, "loss": 0.0152, "step": 4448 }, { "epoch": 1.1930812550281578, "grad_norm": 0.44417929945724893, "learning_rate": 7.5350475418925096e-06, "loss": 0.0474, "step": 4449 }, { "epoch": 1.193349423437919, "grad_norm": 0.42945619165428184, "learning_rate": 7.533702630149535e-06, "loss": 0.0348, "step": 4450 }, { "epoch": 1.1936175918476803, "grad_norm": 0.28148322073536697, "learning_rate": 7.53235747170666e-06, "loss": 0.0193, "step": 4451 }, { "epoch": 1.1938857602574418, "grad_norm": 0.26205175546746634, "learning_rate": 7.5310120666948625e-06, "loss": 0.0206, "step": 4452 }, { "epoch": 1.194153928667203, "grad_norm": 0.33307252113023333, "learning_rate": 7.529666415245138e-06, "loss": 0.0328, "step": 4453 }, { "epoch": 1.1944220970769643, "grad_norm": 0.2596800382767961, "learning_rate": 7.528320517488513e-06, "loss": 0.0213, "step": 4454 }, { "epoch": 1.1946902654867257, "grad_norm": 0.3286835578712297, "learning_rate": 7.526974373556031e-06, "loss": 0.0272, "step": 4455 }, { "epoch": 1.194958433896487, "grad_norm": 0.24580953173462058, "learning_rate": 7.525627983578762e-06, "loss": 0.0215, "step": 4456 }, { "epoch": 1.1952266023062483, "grad_norm": 0.3415685635723789, "learning_rate": 7.5242813476878055e-06, "loss": 0.0345, "step": 4457 }, { "epoch": 1.1954947707160097, "grad_norm": 0.27019503769639147, "learning_rate": 7.522934466014272e-06, "loss": 0.0271, "step": 4458 }, { "epoch": 1.195762939125771, "grad_norm": 0.3123228896911459, "learning_rate": 7.521587338689309e-06, "loss": 0.0311, "step": 4459 }, { "epoch": 1.1960311075355323, "grad_norm": 0.3274366681518503, "learning_rate": 7.520239965844083e-06, "loss": 0.0356, "step": 4460 }, { "epoch": 1.1962992759452937, "grad_norm": 0.3695126843300847, "learning_rate": 7.518892347609781e-06, "loss": 0.0274, "step": 4461 }, { "epoch": 1.196567444355055, "grad_norm": 0.2725941007248382, "learning_rate": 7.517544484117618e-06, "loss": 0.0207, "step": 4462 }, { "epoch": 1.1968356127648163, "grad_norm": 0.2735553490169365, "learning_rate": 7.516196375498834e-06, "loss": 0.0252, "step": 4463 }, { "epoch": 1.1971037811745777, "grad_norm": 0.27285384194398116, "learning_rate": 7.514848021884689e-06, "loss": 0.0245, "step": 4464 }, { "epoch": 1.197371949584339, "grad_norm": 0.41638126080756105, "learning_rate": 7.513499423406469e-06, "loss": 0.034, "step": 4465 }, { "epoch": 1.1976401179941003, "grad_norm": 0.3445200035163466, "learning_rate": 7.512150580195483e-06, "loss": 0.0328, "step": 4466 }, { "epoch": 1.1979082864038617, "grad_norm": 0.4052549094238427, "learning_rate": 7.510801492383064e-06, "loss": 0.0267, "step": 4467 }, { "epoch": 1.198176454813623, "grad_norm": 0.22580605361727296, "learning_rate": 7.509452160100572e-06, "loss": 0.0248, "step": 4468 }, { "epoch": 1.1984446232233843, "grad_norm": 0.2814304603046185, "learning_rate": 7.508102583479384e-06, "loss": 0.0268, "step": 4469 }, { "epoch": 1.1987127916331457, "grad_norm": 0.22678302130361858, "learning_rate": 7.506752762650906e-06, "loss": 0.0182, "step": 4470 }, { "epoch": 1.1989809600429069, "grad_norm": 0.36248691912287667, "learning_rate": 7.5054026977465665e-06, "loss": 0.0252, "step": 4471 }, { "epoch": 1.1992491284526683, "grad_norm": 0.2996751589194444, "learning_rate": 7.50405238889782e-06, "loss": 0.0355, "step": 4472 }, { "epoch": 1.1995172968624297, "grad_norm": 0.3204499027331999, "learning_rate": 7.502701836236137e-06, "loss": 0.0344, "step": 4473 }, { "epoch": 1.1997854652721909, "grad_norm": 0.4295427047837426, "learning_rate": 7.501351039893026e-06, "loss": 0.0287, "step": 4474 }, { "epoch": 1.2000536336819523, "grad_norm": 0.2911949205971258, "learning_rate": 7.500000000000001e-06, "loss": 0.0303, "step": 4475 }, { "epoch": 1.2003218020917137, "grad_norm": 0.26688931046396713, "learning_rate": 7.498648716688615e-06, "loss": 0.0251, "step": 4476 }, { "epoch": 1.2005899705014749, "grad_norm": 0.2993977733387709, "learning_rate": 7.497297190090436e-06, "loss": 0.0271, "step": 4477 }, { "epoch": 1.2008581389112363, "grad_norm": 0.27422796698902213, "learning_rate": 7.49594542033706e-06, "loss": 0.0276, "step": 4478 }, { "epoch": 1.2011263073209977, "grad_norm": 0.26991343548144825, "learning_rate": 7.494593407560105e-06, "loss": 0.0252, "step": 4479 }, { "epoch": 1.2013944757307589, "grad_norm": 0.6588222432973042, "learning_rate": 7.4932411518912154e-06, "loss": 0.0275, "step": 4480 }, { "epoch": 1.2016626441405203, "grad_norm": 0.2982332264880145, "learning_rate": 7.491888653462053e-06, "loss": 0.0238, "step": 4481 }, { "epoch": 1.2019308125502817, "grad_norm": 0.2791956788274975, "learning_rate": 7.490535912404308e-06, "loss": 0.0181, "step": 4482 }, { "epoch": 1.2021989809600429, "grad_norm": 0.3863429670376822, "learning_rate": 7.489182928849692e-06, "loss": 0.0327, "step": 4483 }, { "epoch": 1.2024671493698043, "grad_norm": 0.43156336730434414, "learning_rate": 7.487829702929944e-06, "loss": 0.0216, "step": 4484 }, { "epoch": 1.2027353177795657, "grad_norm": 0.36499746409086237, "learning_rate": 7.486476234776823e-06, "loss": 0.031, "step": 4485 }, { "epoch": 1.2030034861893268, "grad_norm": 0.42075494464808405, "learning_rate": 7.4851225245221125e-06, "loss": 0.0238, "step": 4486 }, { "epoch": 1.2032716545990882, "grad_norm": 0.617076950599668, "learning_rate": 7.4837685722976204e-06, "loss": 0.0218, "step": 4487 }, { "epoch": 1.2035398230088497, "grad_norm": 0.345412137591531, "learning_rate": 7.482414378235175e-06, "loss": 0.0252, "step": 4488 }, { "epoch": 1.2038079914186108, "grad_norm": 0.2238204369740869, "learning_rate": 7.481059942466633e-06, "loss": 0.0217, "step": 4489 }, { "epoch": 1.2040761598283722, "grad_norm": 0.2510821633393369, "learning_rate": 7.4797052651238715e-06, "loss": 0.0184, "step": 4490 }, { "epoch": 1.2043443282381336, "grad_norm": 0.23434387586905642, "learning_rate": 7.4783503463387915e-06, "loss": 0.021, "step": 4491 }, { "epoch": 1.2046124966478948, "grad_norm": 0.2822910293440983, "learning_rate": 7.4769951862433185e-06, "loss": 0.0301, "step": 4492 }, { "epoch": 1.2048806650576562, "grad_norm": 0.46807449784524263, "learning_rate": 7.475639784969402e-06, "loss": 0.0484, "step": 4493 }, { "epoch": 1.2051488334674176, "grad_norm": 0.28437476842421794, "learning_rate": 7.474284142649011e-06, "loss": 0.0284, "step": 4494 }, { "epoch": 1.2054170018771788, "grad_norm": 0.4746364532318406, "learning_rate": 7.472928259414142e-06, "loss": 0.0253, "step": 4495 }, { "epoch": 1.2056851702869402, "grad_norm": 0.2781041061733415, "learning_rate": 7.471572135396814e-06, "loss": 0.018, "step": 4496 }, { "epoch": 1.2059533386967014, "grad_norm": 0.2564956521564478, "learning_rate": 7.470215770729069e-06, "loss": 0.0206, "step": 4497 }, { "epoch": 1.2062215071064628, "grad_norm": 0.4304370575022056, "learning_rate": 7.4688591655429724e-06, "loss": 0.0285, "step": 4498 }, { "epoch": 1.2064896755162242, "grad_norm": 0.2643710428644127, "learning_rate": 7.467502319970614e-06, "loss": 0.0239, "step": 4499 }, { "epoch": 1.2067578439259856, "grad_norm": 0.2964106533306818, "learning_rate": 7.466145234144106e-06, "loss": 0.0312, "step": 4500 }, { "epoch": 1.2070260123357468, "grad_norm": 0.42816573319818774, "learning_rate": 7.464787908195583e-06, "loss": 0.04, "step": 4501 }, { "epoch": 1.2072941807455082, "grad_norm": 0.37638317872667626, "learning_rate": 7.463430342257206e-06, "loss": 0.0191, "step": 4502 }, { "epoch": 1.2075623491552694, "grad_norm": 0.2968712282540018, "learning_rate": 7.462072536461158e-06, "loss": 0.0258, "step": 4503 }, { "epoch": 1.2078305175650308, "grad_norm": 0.3130575085906102, "learning_rate": 7.460714490939643e-06, "loss": 0.0236, "step": 4504 }, { "epoch": 1.2080986859747922, "grad_norm": 0.3744932419549456, "learning_rate": 7.45935620582489e-06, "loss": 0.0406, "step": 4505 }, { "epoch": 1.2083668543845536, "grad_norm": 0.24225816007294187, "learning_rate": 7.457997681249154e-06, "loss": 0.0265, "step": 4506 }, { "epoch": 1.2086350227943148, "grad_norm": 0.32717352290677904, "learning_rate": 7.456638917344709e-06, "loss": 0.0333, "step": 4507 }, { "epoch": 1.2089031912040762, "grad_norm": 0.2969234501248619, "learning_rate": 7.455279914243853e-06, "loss": 0.0291, "step": 4508 }, { "epoch": 1.2091713596138374, "grad_norm": 0.407933700823053, "learning_rate": 7.453920672078913e-06, "loss": 0.021, "step": 4509 }, { "epoch": 1.2094395280235988, "grad_norm": 0.3237137334269245, "learning_rate": 7.452561190982233e-06, "loss": 0.0266, "step": 4510 }, { "epoch": 1.2097076964333602, "grad_norm": 0.35074633031756647, "learning_rate": 7.451201471086179e-06, "loss": 0.0212, "step": 4511 }, { "epoch": 1.2099758648431216, "grad_norm": 0.2685436054914016, "learning_rate": 7.449841512523146e-06, "loss": 0.0205, "step": 4512 }, { "epoch": 1.2102440332528828, "grad_norm": 0.3897514979162601, "learning_rate": 7.448481315425549e-06, "loss": 0.0306, "step": 4513 }, { "epoch": 1.2105122016626442, "grad_norm": 0.24786361108061764, "learning_rate": 7.4471208799258285e-06, "loss": 0.0182, "step": 4514 }, { "epoch": 1.2107803700724054, "grad_norm": 0.3313627505795268, "learning_rate": 7.445760206156443e-06, "loss": 0.0266, "step": 4515 }, { "epoch": 1.2110485384821668, "grad_norm": 0.30752886224655723, "learning_rate": 7.44439929424988e-06, "loss": 0.0229, "step": 4516 }, { "epoch": 1.2113167068919282, "grad_norm": 0.3239327785399324, "learning_rate": 7.443038144338649e-06, "loss": 0.0335, "step": 4517 }, { "epoch": 1.2115848753016896, "grad_norm": 0.34242355856629536, "learning_rate": 7.4416767565552794e-06, "loss": 0.0258, "step": 4518 }, { "epoch": 1.2118530437114508, "grad_norm": 0.26026967410999, "learning_rate": 7.440315131032325e-06, "loss": 0.0257, "step": 4519 }, { "epoch": 1.2121212121212122, "grad_norm": 0.30302898912181625, "learning_rate": 7.4389532679023665e-06, "loss": 0.0256, "step": 4520 }, { "epoch": 1.2123893805309733, "grad_norm": 0.5469247536028212, "learning_rate": 7.437591167298003e-06, "loss": 0.0348, "step": 4521 }, { "epoch": 1.2126575489407347, "grad_norm": 0.3323778659087132, "learning_rate": 7.4362288293518595e-06, "loss": 0.0278, "step": 4522 }, { "epoch": 1.2129257173504961, "grad_norm": 0.24202459602360218, "learning_rate": 7.434866254196584e-06, "loss": 0.0205, "step": 4523 }, { "epoch": 1.2131938857602576, "grad_norm": 0.2909708979538222, "learning_rate": 7.433503441964844e-06, "loss": 0.0328, "step": 4524 }, { "epoch": 1.2134620541700187, "grad_norm": 0.29589874835613944, "learning_rate": 7.432140392789337e-06, "loss": 0.0266, "step": 4525 }, { "epoch": 1.2137302225797801, "grad_norm": 0.3191294950057547, "learning_rate": 7.430777106802774e-06, "loss": 0.0299, "step": 4526 }, { "epoch": 1.2139983909895413, "grad_norm": 0.25099079978991795, "learning_rate": 7.429413584137899e-06, "loss": 0.0215, "step": 4527 }, { "epoch": 1.2142665593993027, "grad_norm": 0.6211934731640977, "learning_rate": 7.428049824927475e-06, "loss": 0.0347, "step": 4528 }, { "epoch": 1.2145347278090641, "grad_norm": 0.3451770304916984, "learning_rate": 7.4266858293042844e-06, "loss": 0.034, "step": 4529 }, { "epoch": 1.2148028962188255, "grad_norm": 0.2620606434819824, "learning_rate": 7.425321597401137e-06, "loss": 0.0202, "step": 4530 }, { "epoch": 1.2150710646285867, "grad_norm": 0.2826778209621178, "learning_rate": 7.423957129350866e-06, "loss": 0.0221, "step": 4531 }, { "epoch": 1.2153392330383481, "grad_norm": 0.2700643306128579, "learning_rate": 7.422592425286323e-06, "loss": 0.0282, "step": 4532 }, { "epoch": 1.2156074014481093, "grad_norm": 0.3257281546326731, "learning_rate": 7.421227485340388e-06, "loss": 0.025, "step": 4533 }, { "epoch": 1.2158755698578707, "grad_norm": 0.40094345596617353, "learning_rate": 7.419862309645962e-06, "loss": 0.0363, "step": 4534 }, { "epoch": 1.2161437382676321, "grad_norm": 0.20590312002635133, "learning_rate": 7.418496898335967e-06, "loss": 0.0212, "step": 4535 }, { "epoch": 1.2164119066773935, "grad_norm": 0.27211492806021564, "learning_rate": 7.41713125154335e-06, "loss": 0.0323, "step": 4536 }, { "epoch": 1.2166800750871547, "grad_norm": 0.29660303903446456, "learning_rate": 7.41576536940108e-06, "loss": 0.0225, "step": 4537 }, { "epoch": 1.216948243496916, "grad_norm": 0.259929367511306, "learning_rate": 7.41439925204215e-06, "loss": 0.021, "step": 4538 }, { "epoch": 1.2172164119066773, "grad_norm": 0.16006148487536237, "learning_rate": 7.413032899599575e-06, "loss": 0.014, "step": 4539 }, { "epoch": 1.2174845803164387, "grad_norm": 0.3174993289379245, "learning_rate": 7.411666312206393e-06, "loss": 0.0254, "step": 4540 }, { "epoch": 1.2177527487262, "grad_norm": 0.35129032622558476, "learning_rate": 7.410299489995666e-06, "loss": 0.0232, "step": 4541 }, { "epoch": 1.2180209171359615, "grad_norm": 0.29660953603191165, "learning_rate": 7.408932433100476e-06, "loss": 0.0228, "step": 4542 }, { "epoch": 1.2182890855457227, "grad_norm": 0.28627306779046874, "learning_rate": 7.4075651416539294e-06, "loss": 0.0287, "step": 4543 }, { "epoch": 1.218557253955484, "grad_norm": 0.3630584496434604, "learning_rate": 7.4061976157891576e-06, "loss": 0.0324, "step": 4544 }, { "epoch": 1.2188254223652453, "grad_norm": 0.2644223970293549, "learning_rate": 7.404829855639313e-06, "loss": 0.016, "step": 4545 }, { "epoch": 1.2190935907750067, "grad_norm": 0.3018082301801645, "learning_rate": 7.40346186133757e-06, "loss": 0.0284, "step": 4546 }, { "epoch": 1.219361759184768, "grad_norm": 0.3992535574654829, "learning_rate": 7.402093633017127e-06, "loss": 0.041, "step": 4547 }, { "epoch": 1.2196299275945295, "grad_norm": 0.6832877437170843, "learning_rate": 7.400725170811202e-06, "loss": 0.0259, "step": 4548 }, { "epoch": 1.2198980960042907, "grad_norm": 0.3001789431762235, "learning_rate": 7.399356474853044e-06, "loss": 0.0215, "step": 4549 }, { "epoch": 1.220166264414052, "grad_norm": 0.37194912519700635, "learning_rate": 7.397987545275915e-06, "loss": 0.0347, "step": 4550 }, { "epoch": 1.2204344328238133, "grad_norm": 0.35476468429301733, "learning_rate": 7.3966183822131055e-06, "loss": 0.0297, "step": 4551 }, { "epoch": 1.2207026012335747, "grad_norm": 0.36480357319359435, "learning_rate": 7.395248985797926e-06, "loss": 0.0299, "step": 4552 }, { "epoch": 1.220970769643336, "grad_norm": 0.3462489979509227, "learning_rate": 7.393879356163713e-06, "loss": 0.0287, "step": 4553 }, { "epoch": 1.2212389380530975, "grad_norm": 0.3041588042929789, "learning_rate": 7.392509493443823e-06, "loss": 0.0218, "step": 4554 }, { "epoch": 1.2215071064628586, "grad_norm": 0.3626684397157777, "learning_rate": 7.391139397771634e-06, "loss": 0.0293, "step": 4555 }, { "epoch": 1.22177527487262, "grad_norm": 0.49884822193535594, "learning_rate": 7.389769069280551e-06, "loss": 0.0255, "step": 4556 }, { "epoch": 1.2220434432823812, "grad_norm": 0.5867673052748384, "learning_rate": 7.388398508103999e-06, "loss": 0.0359, "step": 4557 }, { "epoch": 1.2223116116921426, "grad_norm": 0.39207681714803233, "learning_rate": 7.3870277143754254e-06, "loss": 0.0294, "step": 4558 }, { "epoch": 1.222579780101904, "grad_norm": 0.23645364262797505, "learning_rate": 7.3856566882283e-06, "loss": 0.0179, "step": 4559 }, { "epoch": 1.2228479485116655, "grad_norm": 0.2672458401084975, "learning_rate": 7.384285429796116e-06, "loss": 0.028, "step": 4560 }, { "epoch": 1.2231161169214266, "grad_norm": 0.44498423871638726, "learning_rate": 7.382913939212392e-06, "loss": 0.0356, "step": 4561 }, { "epoch": 1.223384285331188, "grad_norm": 0.29408750385160326, "learning_rate": 7.3815422166106635e-06, "loss": 0.0252, "step": 4562 }, { "epoch": 1.2236524537409492, "grad_norm": 0.3337315396152775, "learning_rate": 7.380170262124491e-06, "loss": 0.0313, "step": 4563 }, { "epoch": 1.2239206221507106, "grad_norm": 0.4550946550495223, "learning_rate": 7.378798075887459e-06, "loss": 0.039, "step": 4564 }, { "epoch": 1.224188790560472, "grad_norm": 0.8652703006448639, "learning_rate": 7.377425658033177e-06, "loss": 0.0448, "step": 4565 }, { "epoch": 1.2244569589702332, "grad_norm": 0.33737324005927144, "learning_rate": 7.376053008695268e-06, "loss": 0.0197, "step": 4566 }, { "epoch": 1.2247251273799946, "grad_norm": 0.30680406156451767, "learning_rate": 7.374680128007387e-06, "loss": 0.0268, "step": 4567 }, { "epoch": 1.224993295789756, "grad_norm": 0.24723886685433172, "learning_rate": 7.373307016103206e-06, "loss": 0.0218, "step": 4568 }, { "epoch": 1.2252614641995172, "grad_norm": 0.3706334551808108, "learning_rate": 7.371933673116421e-06, "loss": 0.0202, "step": 4569 }, { "epoch": 1.2255296326092786, "grad_norm": 0.4038018117910843, "learning_rate": 7.370560099180752e-06, "loss": 0.0377, "step": 4570 }, { "epoch": 1.22579780101904, "grad_norm": 0.403778743997774, "learning_rate": 7.369186294429942e-06, "loss": 0.029, "step": 4571 }, { "epoch": 1.2260659694288012, "grad_norm": 0.2259859789392059, "learning_rate": 7.367812258997751e-06, "loss": 0.0219, "step": 4572 }, { "epoch": 1.2263341378385626, "grad_norm": 0.3607258061778454, "learning_rate": 7.366437993017966e-06, "loss": 0.0351, "step": 4573 }, { "epoch": 1.226602306248324, "grad_norm": 0.33638584466644306, "learning_rate": 7.365063496624396e-06, "loss": 0.0301, "step": 4574 }, { "epoch": 1.2268704746580852, "grad_norm": 0.35178627517505556, "learning_rate": 7.363688769950874e-06, "loss": 0.0281, "step": 4575 }, { "epoch": 1.2271386430678466, "grad_norm": 0.7159865188908692, "learning_rate": 7.362313813131252e-06, "loss": 0.0283, "step": 4576 }, { "epoch": 1.227406811477608, "grad_norm": 0.3495736027349909, "learning_rate": 7.360938626299406e-06, "loss": 0.0271, "step": 4577 }, { "epoch": 1.2276749798873692, "grad_norm": 0.3359739105792775, "learning_rate": 7.359563209589233e-06, "loss": 0.0279, "step": 4578 }, { "epoch": 1.2279431482971306, "grad_norm": 0.27424014791979473, "learning_rate": 7.358187563134655e-06, "loss": 0.0202, "step": 4579 }, { "epoch": 1.228211316706892, "grad_norm": 0.2873198249334603, "learning_rate": 7.356811687069615e-06, "loss": 0.0272, "step": 4580 }, { "epoch": 1.2284794851166532, "grad_norm": 0.43816873826968705, "learning_rate": 7.355435581528079e-06, "loss": 0.0377, "step": 4581 }, { "epoch": 1.2287476535264146, "grad_norm": 0.3038780411671199, "learning_rate": 7.354059246644033e-06, "loss": 0.0318, "step": 4582 }, { "epoch": 1.229015821936176, "grad_norm": 0.3368851360339834, "learning_rate": 7.35268268255149e-06, "loss": 0.0208, "step": 4583 }, { "epoch": 1.2292839903459372, "grad_norm": 0.29585244573474934, "learning_rate": 7.351305889384478e-06, "loss": 0.0299, "step": 4584 }, { "epoch": 1.2295521587556986, "grad_norm": 0.2547671520203688, "learning_rate": 7.349928867277054e-06, "loss": 0.0195, "step": 4585 }, { "epoch": 1.22982032716546, "grad_norm": 0.30120602150452863, "learning_rate": 7.348551616363297e-06, "loss": 0.0345, "step": 4586 }, { "epoch": 1.2300884955752212, "grad_norm": 0.37004673592601683, "learning_rate": 7.347174136777303e-06, "loss": 0.0334, "step": 4587 }, { "epoch": 1.2303566639849826, "grad_norm": 0.3331548158373684, "learning_rate": 7.345796428653196e-06, "loss": 0.0233, "step": 4588 }, { "epoch": 1.230624832394744, "grad_norm": 0.3475126666687277, "learning_rate": 7.3444184921251196e-06, "loss": 0.0356, "step": 4589 }, { "epoch": 1.2308930008045051, "grad_norm": 0.18770027717112514, "learning_rate": 7.343040327327235e-06, "loss": 0.0161, "step": 4590 }, { "epoch": 1.2311611692142665, "grad_norm": 0.33711089348844175, "learning_rate": 7.341661934393738e-06, "loss": 0.033, "step": 4591 }, { "epoch": 1.231429337624028, "grad_norm": 0.7302487037283778, "learning_rate": 7.340283313458833e-06, "loss": 0.0216, "step": 4592 }, { "epoch": 1.2316975060337891, "grad_norm": 0.30724595639708796, "learning_rate": 7.338904464656757e-06, "loss": 0.0284, "step": 4593 }, { "epoch": 1.2319656744435505, "grad_norm": 0.31163989899019334, "learning_rate": 7.337525388121763e-06, "loss": 0.0291, "step": 4594 }, { "epoch": 1.232233842853312, "grad_norm": 0.3763300405663219, "learning_rate": 7.336146083988127e-06, "loss": 0.0287, "step": 4595 }, { "epoch": 1.2325020112630731, "grad_norm": 0.2897800690428736, "learning_rate": 7.3347665523901504e-06, "loss": 0.0225, "step": 4596 }, { "epoch": 1.2327701796728345, "grad_norm": 0.3689517733787715, "learning_rate": 7.333386793462152e-06, "loss": 0.0255, "step": 4597 }, { "epoch": 1.233038348082596, "grad_norm": 0.2981821533118846, "learning_rate": 7.332006807338476e-06, "loss": 0.0315, "step": 4598 }, { "epoch": 1.2333065164923571, "grad_norm": 0.35341023508648384, "learning_rate": 7.33062659415349e-06, "loss": 0.0419, "step": 4599 }, { "epoch": 1.2335746849021185, "grad_norm": 1.3760849648356483, "learning_rate": 7.32924615404158e-06, "loss": 0.0384, "step": 4600 }, { "epoch": 1.23384285331188, "grad_norm": 0.37688422670872324, "learning_rate": 7.327865487137155e-06, "loss": 0.0296, "step": 4601 }, { "epoch": 1.234111021721641, "grad_norm": 0.32672402525907873, "learning_rate": 7.32648459357465e-06, "loss": 0.0252, "step": 4602 }, { "epoch": 1.2343791901314025, "grad_norm": 0.3413727808984585, "learning_rate": 7.325103473488515e-06, "loss": 0.0273, "step": 4603 }, { "epoch": 1.234647358541164, "grad_norm": 0.24895310750367244, "learning_rate": 7.323722127013228e-06, "loss": 0.0212, "step": 4604 }, { "epoch": 1.234915526950925, "grad_norm": 0.25689965838596335, "learning_rate": 7.322340554283287e-06, "loss": 0.0261, "step": 4605 }, { "epoch": 1.2351836953606865, "grad_norm": 0.28516955771968505, "learning_rate": 7.320958755433213e-06, "loss": 0.0257, "step": 4606 }, { "epoch": 1.235451863770448, "grad_norm": 0.6609218660796218, "learning_rate": 7.319576730597544e-06, "loss": 0.0344, "step": 4607 }, { "epoch": 1.235720032180209, "grad_norm": 0.3641844729896522, "learning_rate": 7.318194479910851e-06, "loss": 0.0253, "step": 4608 }, { "epoch": 1.2359882005899705, "grad_norm": 0.30597029063321235, "learning_rate": 7.316812003507714e-06, "loss": 0.024, "step": 4609 }, { "epoch": 1.236256368999732, "grad_norm": 0.4364275448858784, "learning_rate": 7.315429301522743e-06, "loss": 0.0256, "step": 4610 }, { "epoch": 1.236524537409493, "grad_norm": 0.2198695416951886, "learning_rate": 7.314046374090569e-06, "loss": 0.0206, "step": 4611 }, { "epoch": 1.2367927058192545, "grad_norm": 0.4380974927149838, "learning_rate": 7.3126632213458425e-06, "loss": 0.0293, "step": 4612 }, { "epoch": 1.237060874229016, "grad_norm": 0.43322391953863987, "learning_rate": 7.311279843423238e-06, "loss": 0.029, "step": 4613 }, { "epoch": 1.237329042638777, "grad_norm": 0.3036743520680998, "learning_rate": 7.309896240457454e-06, "loss": 0.0222, "step": 4614 }, { "epoch": 1.2375972110485385, "grad_norm": 0.26627289630811046, "learning_rate": 7.308512412583204e-06, "loss": 0.0235, "step": 4615 }, { "epoch": 1.2378653794582999, "grad_norm": 0.43643661559758146, "learning_rate": 7.307128359935229e-06, "loss": 0.0308, "step": 4616 }, { "epoch": 1.238133547868061, "grad_norm": 0.3114275083496713, "learning_rate": 7.305744082648289e-06, "loss": 0.0276, "step": 4617 }, { "epoch": 1.2384017162778225, "grad_norm": 0.5339547706820242, "learning_rate": 7.304359580857172e-06, "loss": 0.0376, "step": 4618 }, { "epoch": 1.2386698846875839, "grad_norm": 0.27663588284588975, "learning_rate": 7.30297485469668e-06, "loss": 0.0277, "step": 4619 }, { "epoch": 1.238938053097345, "grad_norm": 0.3652557962957018, "learning_rate": 7.301589904301642e-06, "loss": 0.03, "step": 4620 }, { "epoch": 1.2392062215071065, "grad_norm": 0.47877058991234983, "learning_rate": 7.3002047298069044e-06, "loss": 0.0395, "step": 4621 }, { "epoch": 1.2394743899168679, "grad_norm": 0.277284796560363, "learning_rate": 7.298819331347338e-06, "loss": 0.021, "step": 4622 }, { "epoch": 1.239742558326629, "grad_norm": 0.2523358160662548, "learning_rate": 7.297433709057837e-06, "loss": 0.0175, "step": 4623 }, { "epoch": 1.2400107267363905, "grad_norm": 0.22755489918793204, "learning_rate": 7.296047863073316e-06, "loss": 0.0184, "step": 4624 }, { "epoch": 1.2402788951461519, "grad_norm": 0.3388627071382217, "learning_rate": 7.294661793528711e-06, "loss": 0.0254, "step": 4625 }, { "epoch": 1.240547063555913, "grad_norm": 0.49282530997591506, "learning_rate": 7.2932755005589805e-06, "loss": 0.0261, "step": 4626 }, { "epoch": 1.2408152319656744, "grad_norm": 0.3107196778430525, "learning_rate": 7.2918889842991e-06, "loss": 0.025, "step": 4627 }, { "epoch": 1.2410834003754359, "grad_norm": 0.22908325456793002, "learning_rate": 7.2905022448840745e-06, "loss": 0.014, "step": 4628 }, { "epoch": 1.241351568785197, "grad_norm": 0.23730692454528507, "learning_rate": 7.289115282448929e-06, "loss": 0.0193, "step": 4629 }, { "epoch": 1.2416197371949584, "grad_norm": 0.3477707378928443, "learning_rate": 7.2877280971287036e-06, "loss": 0.0347, "step": 4630 }, { "epoch": 1.2418879056047198, "grad_norm": 0.2745608744036304, "learning_rate": 7.286340689058468e-06, "loss": 0.0167, "step": 4631 }, { "epoch": 1.242156074014481, "grad_norm": 0.28831988965138905, "learning_rate": 7.28495305837331e-06, "loss": 0.0264, "step": 4632 }, { "epoch": 1.2424242424242424, "grad_norm": 0.32660945354746357, "learning_rate": 7.283565205208339e-06, "loss": 0.0327, "step": 4633 }, { "epoch": 1.2426924108340038, "grad_norm": 0.32502254225438143, "learning_rate": 7.282177129698685e-06, "loss": 0.0297, "step": 4634 }, { "epoch": 1.242960579243765, "grad_norm": 0.33248051898369596, "learning_rate": 7.280788831979504e-06, "loss": 0.0371, "step": 4635 }, { "epoch": 1.2432287476535264, "grad_norm": 0.37914241985462177, "learning_rate": 7.279400312185972e-06, "loss": 0.0316, "step": 4636 }, { "epoch": 1.2434969160632878, "grad_norm": 0.2701833421937976, "learning_rate": 7.27801157045328e-06, "loss": 0.025, "step": 4637 }, { "epoch": 1.243765084473049, "grad_norm": 1.249076515219052, "learning_rate": 7.276622606916651e-06, "loss": 0.0298, "step": 4638 }, { "epoch": 1.2440332528828104, "grad_norm": 0.3235058584659483, "learning_rate": 7.275233421711323e-06, "loss": 0.0232, "step": 4639 }, { "epoch": 1.2443014212925718, "grad_norm": 0.3999710440453591, "learning_rate": 7.273844014972558e-06, "loss": 0.041, "step": 4640 }, { "epoch": 1.244569589702333, "grad_norm": 0.3393825397974193, "learning_rate": 7.272454386835637e-06, "loss": 0.026, "step": 4641 }, { "epoch": 1.2448377581120944, "grad_norm": 0.21443882538740036, "learning_rate": 7.2710645374358655e-06, "loss": 0.0177, "step": 4642 }, { "epoch": 1.2451059265218558, "grad_norm": 0.30189557592707156, "learning_rate": 7.269674466908569e-06, "loss": 0.0329, "step": 4643 }, { "epoch": 1.245374094931617, "grad_norm": 0.32027811783203153, "learning_rate": 7.2682841753890975e-06, "loss": 0.0261, "step": 4644 }, { "epoch": 1.2456422633413784, "grad_norm": 0.4002705779720046, "learning_rate": 7.2668936630128175e-06, "loss": 0.0255, "step": 4645 }, { "epoch": 1.2459104317511398, "grad_norm": 0.2987993236283025, "learning_rate": 7.265502929915119e-06, "loss": 0.0303, "step": 4646 }, { "epoch": 1.246178600160901, "grad_norm": 0.5364409399423652, "learning_rate": 7.264111976231416e-06, "loss": 0.0375, "step": 4647 }, { "epoch": 1.2464467685706624, "grad_norm": 0.31461749483185736, "learning_rate": 7.262720802097141e-06, "loss": 0.0238, "step": 4648 }, { "epoch": 1.2467149369804238, "grad_norm": 0.4286401391247154, "learning_rate": 7.261329407647748e-06, "loss": 0.0354, "step": 4649 }, { "epoch": 1.246983105390185, "grad_norm": 0.2384308556558068, "learning_rate": 7.259937793018715e-06, "loss": 0.019, "step": 4650 }, { "epoch": 1.2472512737999464, "grad_norm": 0.41438447323786604, "learning_rate": 7.2585459583455396e-06, "loss": 0.0403, "step": 4651 }, { "epoch": 1.2475194422097078, "grad_norm": 0.22818140660695477, "learning_rate": 7.25715390376374e-06, "loss": 0.0194, "step": 4652 }, { "epoch": 1.247787610619469, "grad_norm": 0.3729362434083917, "learning_rate": 7.255761629408857e-06, "loss": 0.0297, "step": 4653 }, { "epoch": 1.2480557790292304, "grad_norm": 0.32139561431725594, "learning_rate": 7.254369135416454e-06, "loss": 0.0309, "step": 4654 }, { "epoch": 1.2483239474389918, "grad_norm": 0.4147638591567828, "learning_rate": 7.252976421922111e-06, "loss": 0.0291, "step": 4655 }, { "epoch": 1.248592115848753, "grad_norm": 0.2829048146044312, "learning_rate": 7.251583489061438e-06, "loss": 0.0197, "step": 4656 }, { "epoch": 1.2488602842585144, "grad_norm": 0.33685684552610673, "learning_rate": 7.250190336970058e-06, "loss": 0.0241, "step": 4657 }, { "epoch": 1.2491284526682758, "grad_norm": 0.35808295937540635, "learning_rate": 7.248796965783619e-06, "loss": 0.0266, "step": 4658 }, { "epoch": 1.249396621078037, "grad_norm": 0.4008921060217825, "learning_rate": 7.247403375637789e-06, "loss": 0.0309, "step": 4659 }, { "epoch": 1.2496647894877984, "grad_norm": 0.27462655259806784, "learning_rate": 7.24600956666826e-06, "loss": 0.0255, "step": 4660 }, { "epoch": 1.2499329578975598, "grad_norm": 0.3264078120027483, "learning_rate": 7.244615539010742e-06, "loss": 0.0241, "step": 4661 }, { "epoch": 1.250201126307321, "grad_norm": 0.36509909705706384, "learning_rate": 7.24322129280097e-06, "loss": 0.0364, "step": 4662 }, { "epoch": 1.2504692947170823, "grad_norm": 0.4725937058839055, "learning_rate": 7.241826828174694e-06, "loss": 0.0344, "step": 4663 }, { "epoch": 1.2507374631268435, "grad_norm": 0.38102819470968535, "learning_rate": 7.240432145267693e-06, "loss": 0.0248, "step": 4664 }, { "epoch": 1.251005631536605, "grad_norm": 0.2560207691158638, "learning_rate": 7.239037244215761e-06, "loss": 0.0292, "step": 4665 }, { "epoch": 1.2512737999463663, "grad_norm": 0.2981043092727063, "learning_rate": 7.237642125154718e-06, "loss": 0.0263, "step": 4666 }, { "epoch": 1.2515419683561277, "grad_norm": 0.3361091275044608, "learning_rate": 7.2362467882204015e-06, "loss": 0.0296, "step": 4667 }, { "epoch": 1.251810136765889, "grad_norm": 0.24405378773863384, "learning_rate": 7.2348512335486745e-06, "loss": 0.0236, "step": 4668 }, { "epoch": 1.2520783051756503, "grad_norm": 0.6029162396416538, "learning_rate": 7.233455461275414e-06, "loss": 0.0279, "step": 4669 }, { "epoch": 1.2523464735854115, "grad_norm": 0.3099823442534736, "learning_rate": 7.232059471536524e-06, "loss": 0.0317, "step": 4670 }, { "epoch": 1.252614641995173, "grad_norm": 0.33266854661579565, "learning_rate": 7.230663264467932e-06, "loss": 0.0165, "step": 4671 }, { "epoch": 1.2528828104049343, "grad_norm": 0.5203541189118916, "learning_rate": 7.229266840205579e-06, "loss": 0.0386, "step": 4672 }, { "epoch": 1.2531509788146957, "grad_norm": 0.5928478618930845, "learning_rate": 7.227870198885432e-06, "loss": 0.0262, "step": 4673 }, { "epoch": 1.253419147224457, "grad_norm": 0.2593291101208747, "learning_rate": 7.22647334064348e-06, "loss": 0.0211, "step": 4674 }, { "epoch": 1.2536873156342183, "grad_norm": 0.2702138730288383, "learning_rate": 7.225076265615729e-06, "loss": 0.0239, "step": 4675 }, { "epoch": 1.2539554840439795, "grad_norm": 0.4475458546222948, "learning_rate": 7.223678973938208e-06, "loss": 0.0311, "step": 4676 }, { "epoch": 1.254223652453741, "grad_norm": 0.32122534444233874, "learning_rate": 7.222281465746972e-06, "loss": 0.0296, "step": 4677 }, { "epoch": 1.2544918208635023, "grad_norm": 0.3399582262355133, "learning_rate": 7.220883741178088e-06, "loss": 0.0317, "step": 4678 }, { "epoch": 1.2547599892732637, "grad_norm": 0.23935242514832128, "learning_rate": 7.219485800367651e-06, "loss": 0.0185, "step": 4679 }, { "epoch": 1.255028157683025, "grad_norm": 0.36079681100413075, "learning_rate": 7.218087643451774e-06, "loss": 0.0388, "step": 4680 }, { "epoch": 1.2552963260927863, "grad_norm": 0.32140297862881134, "learning_rate": 7.2166892705665915e-06, "loss": 0.0268, "step": 4681 }, { "epoch": 1.2555644945025475, "grad_norm": 0.2154650001459716, "learning_rate": 7.21529068184826e-06, "loss": 0.0216, "step": 4682 }, { "epoch": 1.2558326629123089, "grad_norm": 0.28701630817367546, "learning_rate": 7.213891877432957e-06, "loss": 0.0276, "step": 4683 }, { "epoch": 1.2561008313220703, "grad_norm": 0.43070471072192734, "learning_rate": 7.21249285745688e-06, "loss": 0.0249, "step": 4684 }, { "epoch": 1.2563689997318317, "grad_norm": 1.1330304007504517, "learning_rate": 7.211093622056247e-06, "loss": 0.0281, "step": 4685 }, { "epoch": 1.2566371681415929, "grad_norm": 0.42332563304315374, "learning_rate": 7.209694171367299e-06, "loss": 0.0256, "step": 4686 }, { "epoch": 1.2569053365513543, "grad_norm": 0.30346302512607193, "learning_rate": 7.2082945055262946e-06, "loss": 0.024, "step": 4687 }, { "epoch": 1.2571735049611155, "grad_norm": 0.283264992493608, "learning_rate": 7.206894624669518e-06, "loss": 0.0302, "step": 4688 }, { "epoch": 1.2574416733708769, "grad_norm": 0.26438175654965845, "learning_rate": 7.205494528933271e-06, "loss": 0.0244, "step": 4689 }, { "epoch": 1.2577098417806383, "grad_norm": 0.3113243744283048, "learning_rate": 7.204094218453878e-06, "loss": 0.0256, "step": 4690 }, { "epoch": 1.2579780101903997, "grad_norm": 0.2773209875687245, "learning_rate": 7.202693693367681e-06, "loss": 0.0213, "step": 4691 }, { "epoch": 1.2582461786001609, "grad_norm": 0.3050636885955443, "learning_rate": 7.201292953811049e-06, "loss": 0.0239, "step": 4692 }, { "epoch": 1.2585143470099223, "grad_norm": 0.40974868699938893, "learning_rate": 7.1998919999203654e-06, "loss": 0.0272, "step": 4693 }, { "epoch": 1.2587825154196834, "grad_norm": 0.2866837714009931, "learning_rate": 7.198490831832038e-06, "loss": 0.0222, "step": 4694 }, { "epoch": 1.2590506838294448, "grad_norm": 0.2720715218580527, "learning_rate": 7.197089449682495e-06, "loss": 0.0228, "step": 4695 }, { "epoch": 1.2593188522392063, "grad_norm": 0.2585326345434907, "learning_rate": 7.195687853608186e-06, "loss": 0.0221, "step": 4696 }, { "epoch": 1.2595870206489677, "grad_norm": 0.17888864913340874, "learning_rate": 7.194286043745581e-06, "loss": 0.0176, "step": 4697 }, { "epoch": 1.2598551890587288, "grad_norm": 0.24775708536258945, "learning_rate": 7.192884020231168e-06, "loss": 0.0234, "step": 4698 }, { "epoch": 1.2601233574684902, "grad_norm": 0.38607211094775334, "learning_rate": 7.191481783201462e-06, "loss": 0.0401, "step": 4699 }, { "epoch": 1.2603915258782514, "grad_norm": 0.36877156749304657, "learning_rate": 7.1900793327929905e-06, "loss": 0.0224, "step": 4700 }, { "epoch": 1.2606596942880128, "grad_norm": 0.28847864979838256, "learning_rate": 7.18867666914231e-06, "loss": 0.0304, "step": 4701 }, { "epoch": 1.2609278626977742, "grad_norm": 0.23471396173447606, "learning_rate": 7.187273792385993e-06, "loss": 0.0205, "step": 4702 }, { "epoch": 1.2611960311075356, "grad_norm": 0.3763317305198127, "learning_rate": 7.185870702660635e-06, "loss": 0.032, "step": 4703 }, { "epoch": 1.2614641995172968, "grad_norm": 0.3355533906189248, "learning_rate": 7.184467400102849e-06, "loss": 0.0234, "step": 4704 }, { "epoch": 1.2617323679270582, "grad_norm": 0.3416500229416669, "learning_rate": 7.183063884849273e-06, "loss": 0.0362, "step": 4705 }, { "epoch": 1.2620005363368194, "grad_norm": 0.42009334967075956, "learning_rate": 7.181660157036561e-06, "loss": 0.0433, "step": 4706 }, { "epoch": 1.2622687047465808, "grad_norm": 0.5272575632300055, "learning_rate": 7.180256216801392e-06, "loss": 0.0198, "step": 4707 }, { "epoch": 1.2625368731563422, "grad_norm": 0.3038540837713627, "learning_rate": 7.178852064280463e-06, "loss": 0.0289, "step": 4708 }, { "epoch": 1.2628050415661036, "grad_norm": 0.5595897114225376, "learning_rate": 7.177447699610494e-06, "loss": 0.0354, "step": 4709 }, { "epoch": 1.2630732099758648, "grad_norm": 0.27253750989891773, "learning_rate": 7.176043122928226e-06, "loss": 0.0313, "step": 4710 }, { "epoch": 1.2633413783856262, "grad_norm": 0.2449812345445171, "learning_rate": 7.174638334370414e-06, "loss": 0.0202, "step": 4711 }, { "epoch": 1.2636095467953874, "grad_norm": 0.24259820431462986, "learning_rate": 7.17323333407384e-06, "loss": 0.0183, "step": 4712 }, { "epoch": 1.2638777152051488, "grad_norm": 0.28687155189711966, "learning_rate": 7.171828122175306e-06, "loss": 0.0285, "step": 4713 }, { "epoch": 1.2641458836149102, "grad_norm": 0.5550916667869823, "learning_rate": 7.170422698811635e-06, "loss": 0.0342, "step": 4714 }, { "epoch": 1.2644140520246716, "grad_norm": 0.3129796676803681, "learning_rate": 7.169017064119669e-06, "loss": 0.0272, "step": 4715 }, { "epoch": 1.2646822204344328, "grad_norm": 0.23766870449714858, "learning_rate": 7.167611218236269e-06, "loss": 0.0215, "step": 4716 }, { "epoch": 1.2649503888441942, "grad_norm": 0.3050900711430389, "learning_rate": 7.1662051612983204e-06, "loss": 0.0335, "step": 4717 }, { "epoch": 1.2652185572539554, "grad_norm": 0.3561590115918739, "learning_rate": 7.164798893442725e-06, "loss": 0.03, "step": 4718 }, { "epoch": 1.2654867256637168, "grad_norm": 0.2635287448350578, "learning_rate": 7.163392414806409e-06, "loss": 0.0244, "step": 4719 }, { "epoch": 1.2657548940734782, "grad_norm": 0.21757609324070415, "learning_rate": 7.161985725526318e-06, "loss": 0.0177, "step": 4720 }, { "epoch": 1.2660230624832396, "grad_norm": 0.4295325132318656, "learning_rate": 7.160578825739417e-06, "loss": 0.0424, "step": 4721 }, { "epoch": 1.2662912308930008, "grad_norm": 0.3442957616473958, "learning_rate": 7.159171715582691e-06, "loss": 0.0247, "step": 4722 }, { "epoch": 1.2665593993027622, "grad_norm": 0.4787361140962861, "learning_rate": 7.15776439519315e-06, "loss": 0.0337, "step": 4723 }, { "epoch": 1.2668275677125234, "grad_norm": 0.2826131328346021, "learning_rate": 7.156356864707816e-06, "loss": 0.0316, "step": 4724 }, { "epoch": 1.2670957361222848, "grad_norm": 0.24194820661125024, "learning_rate": 7.15494912426374e-06, "loss": 0.0234, "step": 4725 }, { "epoch": 1.2673639045320462, "grad_norm": 0.2504561737977258, "learning_rate": 7.153541173997989e-06, "loss": 0.0215, "step": 4726 }, { "epoch": 1.2676320729418076, "grad_norm": 0.2624578576635326, "learning_rate": 7.152133014047651e-06, "loss": 0.0277, "step": 4727 }, { "epoch": 1.2679002413515688, "grad_norm": 0.20817334817586813, "learning_rate": 7.150724644549835e-06, "loss": 0.023, "step": 4728 }, { "epoch": 1.2681684097613302, "grad_norm": 0.27974233361435313, "learning_rate": 7.149316065641672e-06, "loss": 0.0196, "step": 4729 }, { "epoch": 1.2684365781710913, "grad_norm": 0.2452883345373778, "learning_rate": 7.147907277460309e-06, "loss": 0.0229, "step": 4730 }, { "epoch": 1.2687047465808527, "grad_norm": 0.22663814677122504, "learning_rate": 7.146498280142917e-06, "loss": 0.0216, "step": 4731 }, { "epoch": 1.2689729149906142, "grad_norm": 0.31882531265751635, "learning_rate": 7.1450890738266864e-06, "loss": 0.0246, "step": 4732 }, { "epoch": 1.2692410834003756, "grad_norm": 0.305931639343806, "learning_rate": 7.143679658648828e-06, "loss": 0.028, "step": 4733 }, { "epoch": 1.2695092518101367, "grad_norm": 0.22779022938319887, "learning_rate": 7.142270034746572e-06, "loss": 0.0228, "step": 4734 }, { "epoch": 1.2697774202198981, "grad_norm": 0.3377587138873126, "learning_rate": 7.14086020225717e-06, "loss": 0.0439, "step": 4735 }, { "epoch": 1.2700455886296593, "grad_norm": 0.3092986307040245, "learning_rate": 7.139450161317895e-06, "loss": 0.0215, "step": 4736 }, { "epoch": 1.2703137570394207, "grad_norm": 0.34411196779703523, "learning_rate": 7.138039912066037e-06, "loss": 0.0295, "step": 4737 }, { "epoch": 1.2705819254491821, "grad_norm": 0.2608856831695081, "learning_rate": 7.13662945463891e-06, "loss": 0.0214, "step": 4738 }, { "epoch": 1.2708500938589435, "grad_norm": 0.25474214353535524, "learning_rate": 7.135218789173845e-06, "loss": 0.0201, "step": 4739 }, { "epoch": 1.2711182622687047, "grad_norm": 0.3418477350425559, "learning_rate": 7.133807915808197e-06, "loss": 0.0271, "step": 4740 }, { "epoch": 1.2713864306784661, "grad_norm": 0.39139052752025294, "learning_rate": 7.132396834679336e-06, "loss": 0.0273, "step": 4741 }, { "epoch": 1.2716545990882273, "grad_norm": 0.27457020042631014, "learning_rate": 7.1309855459246554e-06, "loss": 0.0203, "step": 4742 }, { "epoch": 1.2719227674979887, "grad_norm": 0.3814701703593124, "learning_rate": 7.1295740496815715e-06, "loss": 0.0259, "step": 4743 }, { "epoch": 1.2721909359077501, "grad_norm": 0.30643678065442015, "learning_rate": 7.128162346087517e-06, "loss": 0.027, "step": 4744 }, { "epoch": 1.2724591043175115, "grad_norm": 0.333129129835628, "learning_rate": 7.1267504352799435e-06, "loss": 0.0261, "step": 4745 }, { "epoch": 1.2727272727272727, "grad_norm": 0.5725631708459238, "learning_rate": 7.125338317396326e-06, "loss": 0.0379, "step": 4746 }, { "epoch": 1.2729954411370341, "grad_norm": 0.4652776249926086, "learning_rate": 7.123925992574161e-06, "loss": 0.0182, "step": 4747 }, { "epoch": 1.2732636095467953, "grad_norm": 0.24973758622134376, "learning_rate": 7.122513460950961e-06, "loss": 0.0245, "step": 4748 }, { "epoch": 1.2735317779565567, "grad_norm": 0.2786046932153775, "learning_rate": 7.1211007226642595e-06, "loss": 0.0274, "step": 4749 }, { "epoch": 1.273799946366318, "grad_norm": 0.6252322990247723, "learning_rate": 7.119687777851612e-06, "loss": 0.0307, "step": 4750 }, { "epoch": 1.2740681147760795, "grad_norm": 0.34541225053432, "learning_rate": 7.118274626650592e-06, "loss": 0.0298, "step": 4751 }, { "epoch": 1.2743362831858407, "grad_norm": 0.24688049491691627, "learning_rate": 7.116861269198797e-06, "loss": 0.0258, "step": 4752 }, { "epoch": 1.274604451595602, "grad_norm": 0.36296936998738577, "learning_rate": 7.115447705633843e-06, "loss": 0.032, "step": 4753 }, { "epoch": 1.2748726200053633, "grad_norm": 0.29465779582373297, "learning_rate": 7.1140339360933585e-06, "loss": 0.0268, "step": 4754 }, { "epoch": 1.2751407884151247, "grad_norm": 0.2858302671808592, "learning_rate": 7.112619960715004e-06, "loss": 0.0247, "step": 4755 }, { "epoch": 1.275408956824886, "grad_norm": 0.978837405781421, "learning_rate": 7.111205779636451e-06, "loss": 0.0328, "step": 4756 }, { "epoch": 1.2756771252346475, "grad_norm": 0.3678102723854502, "learning_rate": 7.109791392995398e-06, "loss": 0.0377, "step": 4757 }, { "epoch": 1.2759452936444087, "grad_norm": 0.30400460572480026, "learning_rate": 7.108376800929557e-06, "loss": 0.0244, "step": 4758 }, { "epoch": 1.27621346205417, "grad_norm": 0.2862356258847062, "learning_rate": 7.106962003576667e-06, "loss": 0.0218, "step": 4759 }, { "epoch": 1.2764816304639313, "grad_norm": 0.3866617766850235, "learning_rate": 7.10554700107448e-06, "loss": 0.0332, "step": 4760 }, { "epoch": 1.2767497988736927, "grad_norm": 0.9133371131334812, "learning_rate": 7.104131793560769e-06, "loss": 0.0265, "step": 4761 }, { "epoch": 1.277017967283454, "grad_norm": 0.38227843211387863, "learning_rate": 7.102716381173333e-06, "loss": 0.0276, "step": 4762 }, { "epoch": 1.2772861356932155, "grad_norm": 0.31344339500752677, "learning_rate": 7.1013007640499876e-06, "loss": 0.0326, "step": 4763 }, { "epoch": 1.2775543041029767, "grad_norm": 0.2587135210242422, "learning_rate": 7.0998849423285634e-06, "loss": 0.0238, "step": 4764 }, { "epoch": 1.277822472512738, "grad_norm": 0.4377231954297647, "learning_rate": 7.098468916146919e-06, "loss": 0.0229, "step": 4765 }, { "epoch": 1.2780906409224992, "grad_norm": 0.32452490064993916, "learning_rate": 7.097052685642926e-06, "loss": 0.0237, "step": 4766 }, { "epoch": 1.2783588093322606, "grad_norm": 0.28134442810637317, "learning_rate": 7.095636250954481e-06, "loss": 0.021, "step": 4767 }, { "epoch": 1.278626977742022, "grad_norm": 0.36106217262579693, "learning_rate": 7.094219612219499e-06, "loss": 0.0385, "step": 4768 }, { "epoch": 1.2788951461517835, "grad_norm": 0.28043151031439695, "learning_rate": 7.092802769575912e-06, "loss": 0.025, "step": 4769 }, { "epoch": 1.2791633145615446, "grad_norm": 0.24466535471147616, "learning_rate": 7.091385723161677e-06, "loss": 0.0192, "step": 4770 }, { "epoch": 1.279431482971306, "grad_norm": 0.29842425166775094, "learning_rate": 7.089968473114766e-06, "loss": 0.0255, "step": 4771 }, { "epoch": 1.2796996513810672, "grad_norm": 0.3872752327369388, "learning_rate": 7.088551019573173e-06, "loss": 0.0235, "step": 4772 }, { "epoch": 1.2799678197908286, "grad_norm": 0.268444105498665, "learning_rate": 7.0871333626749124e-06, "loss": 0.0245, "step": 4773 }, { "epoch": 1.28023598820059, "grad_norm": 0.2841865000366025, "learning_rate": 7.08571550255802e-06, "loss": 0.0281, "step": 4774 }, { "epoch": 1.2805041566103512, "grad_norm": 0.2904410037739432, "learning_rate": 7.084297439360544e-06, "loss": 0.0272, "step": 4775 }, { "epoch": 1.2807723250201126, "grad_norm": 0.3339511697779544, "learning_rate": 7.082879173220559e-06, "loss": 0.0272, "step": 4776 }, { "epoch": 1.281040493429874, "grad_norm": 0.33707742489089143, "learning_rate": 7.08146070427616e-06, "loss": 0.027, "step": 4777 }, { "epoch": 1.2813086618396352, "grad_norm": 0.33320785171339967, "learning_rate": 7.0800420326654596e-06, "loss": 0.0235, "step": 4778 }, { "epoch": 1.2815768302493966, "grad_norm": 0.3066933512206115, "learning_rate": 7.078623158526588e-06, "loss": 0.0232, "step": 4779 }, { "epoch": 1.281844998659158, "grad_norm": 0.4262843644960294, "learning_rate": 7.0772040819976986e-06, "loss": 0.0199, "step": 4780 }, { "epoch": 1.2821131670689192, "grad_norm": 0.27367984142949514, "learning_rate": 7.075784803216962e-06, "loss": 0.0327, "step": 4781 }, { "epoch": 1.2823813354786806, "grad_norm": 0.23137538533687435, "learning_rate": 7.074365322322569e-06, "loss": 0.0233, "step": 4782 }, { "epoch": 1.282649503888442, "grad_norm": 0.2850597057186108, "learning_rate": 7.072945639452734e-06, "loss": 0.0251, "step": 4783 }, { "epoch": 1.2829176722982032, "grad_norm": 0.25481635694712884, "learning_rate": 7.071525754745685e-06, "loss": 0.0185, "step": 4784 }, { "epoch": 1.2831858407079646, "grad_norm": 0.4018834688385554, "learning_rate": 7.070105668339673e-06, "loss": 0.027, "step": 4785 }, { "epoch": 1.283454009117726, "grad_norm": 0.24589730498200643, "learning_rate": 7.068685380372969e-06, "loss": 0.0206, "step": 4786 }, { "epoch": 1.2837221775274872, "grad_norm": 0.24766180555617603, "learning_rate": 7.0672648909838605e-06, "loss": 0.0218, "step": 4787 }, { "epoch": 1.2839903459372486, "grad_norm": 0.2783606320911617, "learning_rate": 7.065844200310657e-06, "loss": 0.0238, "step": 4788 }, { "epoch": 1.28425851434701, "grad_norm": 0.2477553004410436, "learning_rate": 7.06442330849169e-06, "loss": 0.0233, "step": 4789 }, { "epoch": 1.2845266827567712, "grad_norm": 0.28577697983506045, "learning_rate": 7.063002215665306e-06, "loss": 0.0184, "step": 4790 }, { "epoch": 1.2847948511665326, "grad_norm": 0.2996777420822747, "learning_rate": 7.061580921969875e-06, "loss": 0.0303, "step": 4791 }, { "epoch": 1.285063019576294, "grad_norm": 0.31173212762765723, "learning_rate": 7.060159427543782e-06, "loss": 0.0223, "step": 4792 }, { "epoch": 1.2853311879860552, "grad_norm": 0.37916466301197804, "learning_rate": 7.058737732525434e-06, "loss": 0.0271, "step": 4793 }, { "epoch": 1.2855993563958166, "grad_norm": 0.32937563128442965, "learning_rate": 7.057315837053258e-06, "loss": 0.0346, "step": 4794 }, { "epoch": 1.285867524805578, "grad_norm": 0.24912591263883926, "learning_rate": 7.055893741265704e-06, "loss": 0.0211, "step": 4795 }, { "epoch": 1.2861356932153392, "grad_norm": 0.26247988664809824, "learning_rate": 7.054471445301233e-06, "loss": 0.0232, "step": 4796 }, { "epoch": 1.2864038616251006, "grad_norm": 0.22760702276672298, "learning_rate": 7.053048949298332e-06, "loss": 0.0207, "step": 4797 }, { "epoch": 1.286672030034862, "grad_norm": 0.36051326131093536, "learning_rate": 7.051626253395506e-06, "loss": 0.0313, "step": 4798 }, { "epoch": 1.2869401984446232, "grad_norm": 0.30355714404803946, "learning_rate": 7.0502033577312775e-06, "loss": 0.0298, "step": 4799 }, { "epoch": 1.2872083668543846, "grad_norm": 0.277884193966726, "learning_rate": 7.0487802624441925e-06, "loss": 0.0239, "step": 4800 }, { "epoch": 1.287476535264146, "grad_norm": 0.2892237300536667, "learning_rate": 7.0473569676728134e-06, "loss": 0.0257, "step": 4801 }, { "epoch": 1.2877447036739071, "grad_norm": 0.2660815314578183, "learning_rate": 7.045933473555721e-06, "loss": 0.0249, "step": 4802 }, { "epoch": 1.2880128720836685, "grad_norm": 0.5281454350174769, "learning_rate": 7.044509780231517e-06, "loss": 0.035, "step": 4803 }, { "epoch": 1.28828104049343, "grad_norm": 0.33086197563769065, "learning_rate": 7.043085887838825e-06, "loss": 0.0286, "step": 4804 }, { "epoch": 1.2885492089031911, "grad_norm": 0.2447160752965827, "learning_rate": 7.041661796516283e-06, "loss": 0.0257, "step": 4805 }, { "epoch": 1.2888173773129525, "grad_norm": 0.21796824603750267, "learning_rate": 7.040237506402554e-06, "loss": 0.0232, "step": 4806 }, { "epoch": 1.289085545722714, "grad_norm": 0.27237410007160057, "learning_rate": 7.038813017636317e-06, "loss": 0.0352, "step": 4807 }, { "epoch": 1.2893537141324751, "grad_norm": 0.24921526916866615, "learning_rate": 7.0373883303562675e-06, "loss": 0.0242, "step": 4808 }, { "epoch": 1.2896218825422365, "grad_norm": 0.39418199912321283, "learning_rate": 7.035963444701125e-06, "loss": 0.0348, "step": 4809 }, { "epoch": 1.289890050951998, "grad_norm": 0.2047648246584857, "learning_rate": 7.034538360809628e-06, "loss": 0.0161, "step": 4810 }, { "epoch": 1.2901582193617591, "grad_norm": 0.29033463282902877, "learning_rate": 7.0331130788205325e-06, "loss": 0.0274, "step": 4811 }, { "epoch": 1.2904263877715205, "grad_norm": 0.2928477146104081, "learning_rate": 7.031687598872614e-06, "loss": 0.0207, "step": 4812 }, { "epoch": 1.290694556181282, "grad_norm": 0.28932778345600285, "learning_rate": 7.03026192110467e-06, "loss": 0.0242, "step": 4813 }, { "epoch": 1.290962724591043, "grad_norm": 0.42108012916388293, "learning_rate": 7.028836045655509e-06, "loss": 0.0284, "step": 4814 }, { "epoch": 1.2912308930008045, "grad_norm": 0.35370972371715004, "learning_rate": 7.027409972663972e-06, "loss": 0.0289, "step": 4815 }, { "epoch": 1.291499061410566, "grad_norm": 0.3058040764888948, "learning_rate": 7.025983702268907e-06, "loss": 0.0208, "step": 4816 }, { "epoch": 1.291767229820327, "grad_norm": 0.3268480988852991, "learning_rate": 7.024557234609189e-06, "loss": 0.0355, "step": 4817 }, { "epoch": 1.2920353982300885, "grad_norm": 0.3528283882842599, "learning_rate": 7.023130569823707e-06, "loss": 0.0207, "step": 4818 }, { "epoch": 1.2923035666398497, "grad_norm": 0.43276314773533986, "learning_rate": 7.021703708051375e-06, "loss": 0.0251, "step": 4819 }, { "epoch": 1.292571735049611, "grad_norm": 0.25370146613199024, "learning_rate": 7.020276649431117e-06, "loss": 0.0184, "step": 4820 }, { "epoch": 1.2928399034593725, "grad_norm": 0.31952983852406536, "learning_rate": 7.018849394101888e-06, "loss": 0.015, "step": 4821 }, { "epoch": 1.293108071869134, "grad_norm": 0.3064726012251518, "learning_rate": 7.017421942202651e-06, "loss": 0.0289, "step": 4822 }, { "epoch": 1.293376240278895, "grad_norm": 0.297163986002648, "learning_rate": 7.015994293872398e-06, "loss": 0.024, "step": 4823 }, { "epoch": 1.2936444086886565, "grad_norm": 0.3367729279902189, "learning_rate": 7.014566449250132e-06, "loss": 0.0361, "step": 4824 }, { "epoch": 1.2939125770984177, "grad_norm": 0.28463481177564703, "learning_rate": 7.01313840847488e-06, "loss": 0.021, "step": 4825 }, { "epoch": 1.294180745508179, "grad_norm": 0.31739381243300907, "learning_rate": 7.011710171685685e-06, "loss": 0.0261, "step": 4826 }, { "epoch": 1.2944489139179405, "grad_norm": 0.26269494074434335, "learning_rate": 7.010281739021612e-06, "loss": 0.0254, "step": 4827 }, { "epoch": 1.2947170823277019, "grad_norm": 0.3114438789220039, "learning_rate": 7.008853110621744e-06, "loss": 0.0242, "step": 4828 }, { "epoch": 1.294985250737463, "grad_norm": 0.2765408180816259, "learning_rate": 7.007424286625181e-06, "loss": 0.0246, "step": 4829 }, { "epoch": 1.2952534191472245, "grad_norm": 0.31736257463918727, "learning_rate": 7.005995267171045e-06, "loss": 0.0201, "step": 4830 }, { "epoch": 1.2955215875569857, "grad_norm": 0.4419308689434764, "learning_rate": 7.004566052398476e-06, "loss": 0.0199, "step": 4831 }, { "epoch": 1.295789755966747, "grad_norm": 0.28619185258921215, "learning_rate": 7.003136642446632e-06, "loss": 0.0198, "step": 4832 }, { "epoch": 1.2960579243765085, "grad_norm": 0.24417459606865535, "learning_rate": 7.001707037454693e-06, "loss": 0.023, "step": 4833 }, { "epoch": 1.2963260927862699, "grad_norm": 0.2782980026858033, "learning_rate": 7.000277237561852e-06, "loss": 0.0241, "step": 4834 }, { "epoch": 1.296594261196031, "grad_norm": 0.24062819725398713, "learning_rate": 6.998847242907327e-06, "loss": 0.0244, "step": 4835 }, { "epoch": 1.2968624296057925, "grad_norm": 0.28502146271686457, "learning_rate": 6.997417053630354e-06, "loss": 0.0284, "step": 4836 }, { "epoch": 1.2971305980155536, "grad_norm": 0.4186347837248692, "learning_rate": 6.995986669870185e-06, "loss": 0.0337, "step": 4837 }, { "epoch": 1.297398766425315, "grad_norm": 0.4421422998790123, "learning_rate": 6.994556091766094e-06, "loss": 0.0292, "step": 4838 }, { "epoch": 1.2976669348350764, "grad_norm": 0.31127006541185676, "learning_rate": 6.993125319457371e-06, "loss": 0.0234, "step": 4839 }, { "epoch": 1.2979351032448379, "grad_norm": 0.38932711669783254, "learning_rate": 6.991694353083327e-06, "loss": 0.023, "step": 4840 }, { "epoch": 1.298203271654599, "grad_norm": 0.25418419706694056, "learning_rate": 6.990263192783293e-06, "loss": 0.025, "step": 4841 }, { "epoch": 1.2984714400643604, "grad_norm": 0.30014918209266206, "learning_rate": 6.988831838696614e-06, "loss": 0.0341, "step": 4842 }, { "epoch": 1.2987396084741216, "grad_norm": 0.20695649062811497, "learning_rate": 6.987400290962661e-06, "loss": 0.0184, "step": 4843 }, { "epoch": 1.299007776883883, "grad_norm": 0.3194793690667802, "learning_rate": 6.985968549720817e-06, "loss": 0.0238, "step": 4844 }, { "epoch": 1.2992759452936444, "grad_norm": 0.38394850866529884, "learning_rate": 6.984536615110489e-06, "loss": 0.0248, "step": 4845 }, { "epoch": 1.2995441137034058, "grad_norm": 0.2834082241347149, "learning_rate": 6.983104487271099e-06, "loss": 0.0252, "step": 4846 }, { "epoch": 1.299812282113167, "grad_norm": 0.21470481357818075, "learning_rate": 6.981672166342088e-06, "loss": 0.0215, "step": 4847 }, { "epoch": 1.3000804505229284, "grad_norm": 0.22390070392440534, "learning_rate": 6.980239652462921e-06, "loss": 0.0203, "step": 4848 }, { "epoch": 1.3003486189326896, "grad_norm": 0.25660961695523843, "learning_rate": 6.978806945773078e-06, "loss": 0.0236, "step": 4849 }, { "epoch": 1.300616787342451, "grad_norm": 0.27222541444735276, "learning_rate": 6.977374046412056e-06, "loss": 0.0286, "step": 4850 }, { "epoch": 1.3008849557522124, "grad_norm": 0.810680475093095, "learning_rate": 6.975940954519372e-06, "loss": 0.0356, "step": 4851 }, { "epoch": 1.3011531241619738, "grad_norm": 0.2744870102858102, "learning_rate": 6.974507670234563e-06, "loss": 0.0293, "step": 4852 }, { "epoch": 1.301421292571735, "grad_norm": 0.20740687727674478, "learning_rate": 6.9730741936971844e-06, "loss": 0.0177, "step": 4853 }, { "epoch": 1.3016894609814964, "grad_norm": 0.26604847943190046, "learning_rate": 6.97164052504681e-06, "loss": 0.0238, "step": 4854 }, { "epoch": 1.3019576293912576, "grad_norm": 0.433969866889402, "learning_rate": 6.970206664423034e-06, "loss": 0.0193, "step": 4855 }, { "epoch": 1.302225797801019, "grad_norm": 0.2929816513194438, "learning_rate": 6.968772611965464e-06, "loss": 0.0285, "step": 4856 }, { "epoch": 1.3024939662107804, "grad_norm": 0.2465491294944875, "learning_rate": 6.967338367813732e-06, "loss": 0.0266, "step": 4857 }, { "epoch": 1.3027621346205418, "grad_norm": 0.23244167731757573, "learning_rate": 6.965903932107488e-06, "loss": 0.0219, "step": 4858 }, { "epoch": 1.303030303030303, "grad_norm": 0.584477894418307, "learning_rate": 6.964469304986394e-06, "loss": 0.0425, "step": 4859 }, { "epoch": 1.3032984714400644, "grad_norm": 1.03064282865046, "learning_rate": 6.963034486590142e-06, "loss": 0.0305, "step": 4860 }, { "epoch": 1.3035666398498256, "grad_norm": 0.2492180453331263, "learning_rate": 6.9615994770584335e-06, "loss": 0.0209, "step": 4861 }, { "epoch": 1.303834808259587, "grad_norm": 0.323259893857889, "learning_rate": 6.9601642765309925e-06, "loss": 0.0218, "step": 4862 }, { "epoch": 1.3041029766693484, "grad_norm": 0.3059534688000298, "learning_rate": 6.958728885147559e-06, "loss": 0.0361, "step": 4863 }, { "epoch": 1.3043711450791098, "grad_norm": 0.2697071480144899, "learning_rate": 6.957293303047894e-06, "loss": 0.0251, "step": 4864 }, { "epoch": 1.304639313488871, "grad_norm": 0.7178545448976204, "learning_rate": 6.955857530371777e-06, "loss": 0.0391, "step": 4865 }, { "epoch": 1.3049074818986324, "grad_norm": 0.29814923602884263, "learning_rate": 6.954421567259004e-06, "loss": 0.0297, "step": 4866 }, { "epoch": 1.3051756503083936, "grad_norm": 0.20432820919085684, "learning_rate": 6.952985413849393e-06, "loss": 0.0171, "step": 4867 }, { "epoch": 1.305443818718155, "grad_norm": 0.4457507368037086, "learning_rate": 6.951549070282778e-06, "loss": 0.042, "step": 4868 }, { "epoch": 1.3057119871279164, "grad_norm": 0.32892171814189264, "learning_rate": 6.95011253669901e-06, "loss": 0.0379, "step": 4869 }, { "epoch": 1.3059801555376778, "grad_norm": 0.23763358060658146, "learning_rate": 6.948675813237963e-06, "loss": 0.0186, "step": 4870 }, { "epoch": 1.306248323947439, "grad_norm": 0.29450754689905423, "learning_rate": 6.9472389000395245e-06, "loss": 0.024, "step": 4871 }, { "epoch": 1.3065164923572004, "grad_norm": 0.41974692404292063, "learning_rate": 6.945801797243604e-06, "loss": 0.0203, "step": 4872 }, { "epoch": 1.3067846607669615, "grad_norm": 0.3398192797197089, "learning_rate": 6.94436450499013e-06, "loss": 0.0245, "step": 4873 }, { "epoch": 1.307052829176723, "grad_norm": 0.2937893670893608, "learning_rate": 6.9429270234190445e-06, "loss": 0.0258, "step": 4874 }, { "epoch": 1.3073209975864843, "grad_norm": 0.3502837389274702, "learning_rate": 6.941489352670315e-06, "loss": 0.0294, "step": 4875 }, { "epoch": 1.3075891659962458, "grad_norm": 0.29098717465034046, "learning_rate": 6.94005149288392e-06, "loss": 0.0253, "step": 4876 }, { "epoch": 1.307857334406007, "grad_norm": 0.461387272545984, "learning_rate": 6.938613444199863e-06, "loss": 0.0281, "step": 4877 }, { "epoch": 1.3081255028157683, "grad_norm": 0.4530933206181577, "learning_rate": 6.937175206758162e-06, "loss": 0.0352, "step": 4878 }, { "epoch": 1.3083936712255295, "grad_norm": 0.32241022629790644, "learning_rate": 6.935736780698852e-06, "loss": 0.0294, "step": 4879 }, { "epoch": 1.308661839635291, "grad_norm": 0.2906357516500285, "learning_rate": 6.934298166161994e-06, "loss": 0.0235, "step": 4880 }, { "epoch": 1.3089300080450523, "grad_norm": 1.0161698066131566, "learning_rate": 6.932859363287658e-06, "loss": 0.0263, "step": 4881 }, { "epoch": 1.3091981764548137, "grad_norm": 0.30313904851121987, "learning_rate": 6.931420372215937e-06, "loss": 0.025, "step": 4882 }, { "epoch": 1.309466344864575, "grad_norm": 0.2533592012422497, "learning_rate": 6.9299811930869424e-06, "loss": 0.0204, "step": 4883 }, { "epoch": 1.3097345132743363, "grad_norm": 0.4603647696553685, "learning_rate": 6.928541826040802e-06, "loss": 0.0278, "step": 4884 }, { "epoch": 1.3100026816840975, "grad_norm": 0.41317225891731385, "learning_rate": 6.927102271217665e-06, "loss": 0.0342, "step": 4885 }, { "epoch": 1.310270850093859, "grad_norm": 0.39563307264148795, "learning_rate": 6.925662528757697e-06, "loss": 0.027, "step": 4886 }, { "epoch": 1.3105390185036203, "grad_norm": 0.33441073436271396, "learning_rate": 6.92422259880108e-06, "loss": 0.039, "step": 4887 }, { "epoch": 1.3108071869133817, "grad_norm": 0.2699155256210237, "learning_rate": 6.922782481488018e-06, "loss": 0.0237, "step": 4888 }, { "epoch": 1.311075355323143, "grad_norm": 0.304075316409533, "learning_rate": 6.921342176958731e-06, "loss": 0.0275, "step": 4889 }, { "epoch": 1.3113435237329043, "grad_norm": 0.30637682322971943, "learning_rate": 6.919901685353456e-06, "loss": 0.0262, "step": 4890 }, { "epoch": 1.3116116921426655, "grad_norm": 0.3259472298533724, "learning_rate": 6.918461006812451e-06, "loss": 0.0218, "step": 4891 }, { "epoch": 1.311879860552427, "grad_norm": 0.3099504722722003, "learning_rate": 6.917020141475993e-06, "loss": 0.0243, "step": 4892 }, { "epoch": 1.3121480289621883, "grad_norm": 0.297677554356472, "learning_rate": 6.915579089484373e-06, "loss": 0.0218, "step": 4893 }, { "epoch": 1.3124161973719497, "grad_norm": 0.2747069678237856, "learning_rate": 6.914137850977902e-06, "loss": 0.0233, "step": 4894 }, { "epoch": 1.3126843657817109, "grad_norm": 0.6239502987725213, "learning_rate": 6.91269642609691e-06, "loss": 0.0304, "step": 4895 }, { "epoch": 1.3129525341914723, "grad_norm": 0.25087415344624836, "learning_rate": 6.911254814981745e-06, "loss": 0.024, "step": 4896 }, { "epoch": 1.3132207026012335, "grad_norm": 0.38573212091099346, "learning_rate": 6.909813017772774e-06, "loss": 0.0273, "step": 4897 }, { "epoch": 1.3134888710109949, "grad_norm": 0.2707328617433765, "learning_rate": 6.908371034610381e-06, "loss": 0.0249, "step": 4898 }, { "epoch": 1.3137570394207563, "grad_norm": 0.22384755211100074, "learning_rate": 6.9069288656349654e-06, "loss": 0.0244, "step": 4899 }, { "epoch": 1.3140252078305177, "grad_norm": 0.2420820051508984, "learning_rate": 6.905486510986947e-06, "loss": 0.0207, "step": 4900 }, { "epoch": 1.3142933762402789, "grad_norm": 0.34842368181048305, "learning_rate": 6.904043970806769e-06, "loss": 0.0231, "step": 4901 }, { "epoch": 1.3145615446500403, "grad_norm": 0.41518191732115567, "learning_rate": 6.902601245234883e-06, "loss": 0.0285, "step": 4902 }, { "epoch": 1.3148297130598015, "grad_norm": 0.19781740071146248, "learning_rate": 6.901158334411767e-06, "loss": 0.0177, "step": 4903 }, { "epoch": 1.3150978814695629, "grad_norm": 0.2886746586953782, "learning_rate": 6.899715238477911e-06, "loss": 0.0299, "step": 4904 }, { "epoch": 1.3153660498793243, "grad_norm": 0.3185701982662068, "learning_rate": 6.8982719575738245e-06, "loss": 0.0259, "step": 4905 }, { "epoch": 1.3156342182890857, "grad_norm": 0.34932236817918927, "learning_rate": 6.896828491840037e-06, "loss": 0.0436, "step": 4906 }, { "epoch": 1.3159023866988468, "grad_norm": 0.26564991458347226, "learning_rate": 6.895384841417097e-06, "loss": 0.0183, "step": 4907 }, { "epoch": 1.3161705551086083, "grad_norm": 0.723695264267591, "learning_rate": 6.893941006445567e-06, "loss": 0.0201, "step": 4908 }, { "epoch": 1.3164387235183694, "grad_norm": 0.38622558453268685, "learning_rate": 6.892496987066029e-06, "loss": 0.0209, "step": 4909 }, { "epoch": 1.3167068919281308, "grad_norm": 0.25010421752593626, "learning_rate": 6.891052783419084e-06, "loss": 0.0231, "step": 4910 }, { "epoch": 1.3169750603378922, "grad_norm": 0.3222507198631701, "learning_rate": 6.8896083956453495e-06, "loss": 0.0261, "step": 4911 }, { "epoch": 1.3172432287476536, "grad_norm": 0.3208914690053035, "learning_rate": 6.888163823885463e-06, "loss": 0.0187, "step": 4912 }, { "epoch": 1.3175113971574148, "grad_norm": 0.4486125192031171, "learning_rate": 6.886719068280076e-06, "loss": 0.0351, "step": 4913 }, { "epoch": 1.3177795655671762, "grad_norm": 0.5377007631316899, "learning_rate": 6.8852741289698656e-06, "loss": 0.0328, "step": 4914 }, { "epoch": 1.3180477339769374, "grad_norm": 0.24357504741930416, "learning_rate": 6.883829006095517e-06, "loss": 0.0186, "step": 4915 }, { "epoch": 1.3183159023866988, "grad_norm": 0.38088382229333384, "learning_rate": 6.882383699797739e-06, "loss": 0.0324, "step": 4916 }, { "epoch": 1.3185840707964602, "grad_norm": 0.3394026464087262, "learning_rate": 6.880938210217258e-06, "loss": 0.0308, "step": 4917 }, { "epoch": 1.3188522392062216, "grad_norm": 0.29869246423031914, "learning_rate": 6.879492537494818e-06, "loss": 0.0281, "step": 4918 }, { "epoch": 1.3191204076159828, "grad_norm": 0.2598201693530205, "learning_rate": 6.87804668177118e-06, "loss": 0.0269, "step": 4919 }, { "epoch": 1.3193885760257442, "grad_norm": 0.3646353847608352, "learning_rate": 6.876600643187122e-06, "loss": 0.0293, "step": 4920 }, { "epoch": 1.3196567444355054, "grad_norm": 0.49146531165977025, "learning_rate": 6.875154421883441e-06, "loss": 0.0233, "step": 4921 }, { "epoch": 1.3199249128452668, "grad_norm": 0.216996068430698, "learning_rate": 6.873708018000953e-06, "loss": 0.0193, "step": 4922 }, { "epoch": 1.3201930812550282, "grad_norm": 0.300845406352333, "learning_rate": 6.87226143168049e-06, "loss": 0.0224, "step": 4923 }, { "epoch": 1.3204612496647896, "grad_norm": 0.2947035824608659, "learning_rate": 6.870814663062902e-06, "loss": 0.0322, "step": 4924 }, { "epoch": 1.3207294180745508, "grad_norm": 0.276578861351353, "learning_rate": 6.869367712289058e-06, "loss": 0.0218, "step": 4925 }, { "epoch": 1.3209975864843122, "grad_norm": 0.31121376375453613, "learning_rate": 6.867920579499842e-06, "loss": 0.0346, "step": 4926 }, { "epoch": 1.3212657548940734, "grad_norm": 0.6949039369640413, "learning_rate": 6.866473264836158e-06, "loss": 0.0245, "step": 4927 }, { "epoch": 1.3215339233038348, "grad_norm": 0.30704677991537715, "learning_rate": 6.865025768438929e-06, "loss": 0.032, "step": 4928 }, { "epoch": 1.3218020917135962, "grad_norm": 0.371050280348174, "learning_rate": 6.8635780904490915e-06, "loss": 0.0399, "step": 4929 }, { "epoch": 1.3220702601233576, "grad_norm": 0.3013306753539082, "learning_rate": 6.862130231007603e-06, "loss": 0.0281, "step": 4930 }, { "epoch": 1.3223384285331188, "grad_norm": 0.33183434083088414, "learning_rate": 6.860682190255437e-06, "loss": 0.0305, "step": 4931 }, { "epoch": 1.3226065969428802, "grad_norm": 0.2310559487393208, "learning_rate": 6.8592339683335886e-06, "loss": 0.0238, "step": 4932 }, { "epoch": 1.3228747653526414, "grad_norm": 0.36213826723591064, "learning_rate": 6.857785565383062e-06, "loss": 0.0375, "step": 4933 }, { "epoch": 1.3231429337624028, "grad_norm": 0.25825111427282366, "learning_rate": 6.856336981544889e-06, "loss": 0.0251, "step": 4934 }, { "epoch": 1.3234111021721642, "grad_norm": 0.28790813158603973, "learning_rate": 6.8548882169601125e-06, "loss": 0.0228, "step": 4935 }, { "epoch": 1.3236792705819256, "grad_norm": 0.3759520759105815, "learning_rate": 6.853439271769795e-06, "loss": 0.0239, "step": 4936 }, { "epoch": 1.3239474389916868, "grad_norm": 0.37512221341993857, "learning_rate": 6.851990146115016e-06, "loss": 0.0281, "step": 4937 }, { "epoch": 1.3242156074014482, "grad_norm": 0.31777879749238375, "learning_rate": 6.850540840136872e-06, "loss": 0.0229, "step": 4938 }, { "epoch": 1.3244837758112094, "grad_norm": 0.17559967255584505, "learning_rate": 6.849091353976481e-06, "loss": 0.0145, "step": 4939 }, { "epoch": 1.3247519442209708, "grad_norm": 0.3259172329794027, "learning_rate": 6.8476416877749754e-06, "loss": 0.0327, "step": 4940 }, { "epoch": 1.3250201126307322, "grad_norm": 0.2907116942161748, "learning_rate": 6.846191841673503e-06, "loss": 0.0329, "step": 4941 }, { "epoch": 1.3252882810404936, "grad_norm": 0.25019848212929, "learning_rate": 6.844741815813232e-06, "loss": 0.0218, "step": 4942 }, { "epoch": 1.3255564494502547, "grad_norm": 0.258364293260714, "learning_rate": 6.843291610335347e-06, "loss": 0.0233, "step": 4943 }, { "epoch": 1.3258246178600162, "grad_norm": 0.3563590288885514, "learning_rate": 6.841841225381053e-06, "loss": 0.0146, "step": 4944 }, { "epoch": 1.3260927862697773, "grad_norm": 0.3420528503957618, "learning_rate": 6.840390661091569e-06, "loss": 0.022, "step": 4945 }, { "epoch": 1.3263609546795387, "grad_norm": 0.41597688452723464, "learning_rate": 6.838939917608134e-06, "loss": 0.027, "step": 4946 }, { "epoch": 1.3266291230893001, "grad_norm": 0.2802760455006701, "learning_rate": 6.837488995071999e-06, "loss": 0.0209, "step": 4947 }, { "epoch": 1.3268972914990615, "grad_norm": 0.4047945261340898, "learning_rate": 6.836037893624439e-06, "loss": 0.0292, "step": 4948 }, { "epoch": 1.3271654599088227, "grad_norm": 0.30388158283999817, "learning_rate": 6.8345866134067454e-06, "loss": 0.0252, "step": 4949 }, { "epoch": 1.3274336283185841, "grad_norm": 0.2265875521441694, "learning_rate": 6.8331351545602235e-06, "loss": 0.0165, "step": 4950 }, { "epoch": 1.3277017967283453, "grad_norm": 0.37970021307672275, "learning_rate": 6.831683517226197e-06, "loss": 0.0331, "step": 4951 }, { "epoch": 1.3279699651381067, "grad_norm": 0.2763960190806142, "learning_rate": 6.830231701546013e-06, "loss": 0.0297, "step": 4952 }, { "epoch": 1.3282381335478681, "grad_norm": 0.29118190580593933, "learning_rate": 6.8287797076610244e-06, "loss": 0.0301, "step": 4953 }, { "epoch": 1.3285063019576293, "grad_norm": 0.20484567452707492, "learning_rate": 6.827327535712611e-06, "loss": 0.0179, "step": 4954 }, { "epoch": 1.3287744703673907, "grad_norm": 0.34291610280074086, "learning_rate": 6.825875185842168e-06, "loss": 0.0292, "step": 4955 }, { "epoch": 1.3290426387771521, "grad_norm": 0.28051924251426985, "learning_rate": 6.824422658191105e-06, "loss": 0.0203, "step": 4956 }, { "epoch": 1.3293108071869133, "grad_norm": 0.3801734970185959, "learning_rate": 6.822969952900852e-06, "loss": 0.0418, "step": 4957 }, { "epoch": 1.3295789755966747, "grad_norm": 0.3048450416270671, "learning_rate": 6.821517070112855e-06, "loss": 0.0275, "step": 4958 }, { "epoch": 1.329847144006436, "grad_norm": 0.2976863771417503, "learning_rate": 6.820064009968577e-06, "loss": 0.0262, "step": 4959 }, { "epoch": 1.3301153124161973, "grad_norm": 0.29337800063249153, "learning_rate": 6.818610772609498e-06, "loss": 0.0328, "step": 4960 }, { "epoch": 1.3303834808259587, "grad_norm": 0.28890172261264035, "learning_rate": 6.817157358177116e-06, "loss": 0.0301, "step": 4961 }, { "epoch": 1.33065164923572, "grad_norm": 0.31893099145563, "learning_rate": 6.815703766812947e-06, "loss": 0.0293, "step": 4962 }, { "epoch": 1.3309198176454813, "grad_norm": 0.5345769181535043, "learning_rate": 6.814249998658522e-06, "loss": 0.0233, "step": 4963 }, { "epoch": 1.3311879860552427, "grad_norm": 0.3467316316980195, "learning_rate": 6.8127960538553925e-06, "loss": 0.0275, "step": 4964 }, { "epoch": 1.331456154465004, "grad_norm": 0.22988188256488568, "learning_rate": 6.811341932545124e-06, "loss": 0.0259, "step": 4965 }, { "epoch": 1.3317243228747653, "grad_norm": 0.2787744974049355, "learning_rate": 6.8098876348693e-06, "loss": 0.0274, "step": 4966 }, { "epoch": 1.3319924912845267, "grad_norm": 0.27861630160438033, "learning_rate": 6.808433160969522e-06, "loss": 0.0263, "step": 4967 }, { "epoch": 1.332260659694288, "grad_norm": 0.3944310291759222, "learning_rate": 6.806978510987409e-06, "loss": 0.0257, "step": 4968 }, { "epoch": 1.3325288281040493, "grad_norm": 0.2868600141692782, "learning_rate": 6.805523685064595e-06, "loss": 0.0246, "step": 4969 }, { "epoch": 1.3327969965138107, "grad_norm": 0.3465793229177891, "learning_rate": 6.804068683342733e-06, "loss": 0.0291, "step": 4970 }, { "epoch": 1.333065164923572, "grad_norm": 0.28155141041966797, "learning_rate": 6.802613505963496e-06, "loss": 0.0257, "step": 4971 }, { "epoch": 1.3333333333333333, "grad_norm": 0.41168379309458086, "learning_rate": 6.801158153068565e-06, "loss": 0.0304, "step": 4972 }, { "epoch": 1.3336015017430947, "grad_norm": 0.34263938319443127, "learning_rate": 6.799702624799648e-06, "loss": 0.0313, "step": 4973 }, { "epoch": 1.333869670152856, "grad_norm": 0.29541717819596386, "learning_rate": 6.798246921298464e-06, "loss": 0.0235, "step": 4974 }, { "epoch": 1.3341378385626173, "grad_norm": 0.25406444689351265, "learning_rate": 6.796791042706751e-06, "loss": 0.0224, "step": 4975 }, { "epoch": 1.3344060069723787, "grad_norm": 0.25384342543674693, "learning_rate": 6.795334989166264e-06, "loss": 0.0264, "step": 4976 }, { "epoch": 1.33467417538214, "grad_norm": 0.2567595734509517, "learning_rate": 6.793878760818777e-06, "loss": 0.0236, "step": 4977 }, { "epoch": 1.3349423437919012, "grad_norm": 0.32582796434854466, "learning_rate": 6.792422357806078e-06, "loss": 0.0308, "step": 4978 }, { "epoch": 1.3352105122016626, "grad_norm": 0.31064508302798366, "learning_rate": 6.790965780269971e-06, "loss": 0.0312, "step": 4979 }, { "epoch": 1.335478680611424, "grad_norm": 0.4741136829846071, "learning_rate": 6.789509028352284e-06, "loss": 0.024, "step": 4980 }, { "epoch": 1.3357468490211852, "grad_norm": 0.2688833349750044, "learning_rate": 6.78805210219485e-06, "loss": 0.0254, "step": 4981 }, { "epoch": 1.3360150174309466, "grad_norm": 0.2606080248073186, "learning_rate": 6.786595001939532e-06, "loss": 0.0229, "step": 4982 }, { "epoch": 1.336283185840708, "grad_norm": 0.3916911675434629, "learning_rate": 6.7851377277282025e-06, "loss": 0.0312, "step": 4983 }, { "epoch": 1.3365513542504692, "grad_norm": 0.3213175131824702, "learning_rate": 6.783680279702751e-06, "loss": 0.0254, "step": 4984 }, { "epoch": 1.3368195226602306, "grad_norm": 1.5219608260964959, "learning_rate": 6.782222658005088e-06, "loss": 0.034, "step": 4985 }, { "epoch": 1.337087691069992, "grad_norm": 0.28192536345419233, "learning_rate": 6.780764862777134e-06, "loss": 0.0205, "step": 4986 }, { "epoch": 1.3373558594797532, "grad_norm": 0.2576976473293279, "learning_rate": 6.779306894160834e-06, "loss": 0.0298, "step": 4987 }, { "epoch": 1.3376240278895146, "grad_norm": 0.2448813601073709, "learning_rate": 6.777848752298146e-06, "loss": 0.0213, "step": 4988 }, { "epoch": 1.337892196299276, "grad_norm": 0.21347433366229293, "learning_rate": 6.776390437331047e-06, "loss": 0.0218, "step": 4989 }, { "epoch": 1.3381603647090372, "grad_norm": 0.2865009017799368, "learning_rate": 6.774931949401524e-06, "loss": 0.0274, "step": 4990 }, { "epoch": 1.3384285331187986, "grad_norm": 0.3584167033879194, "learning_rate": 6.773473288651591e-06, "loss": 0.029, "step": 4991 }, { "epoch": 1.33869670152856, "grad_norm": 0.3428683375564989, "learning_rate": 6.772014455223271e-06, "loss": 0.0189, "step": 4992 }, { "epoch": 1.3389648699383212, "grad_norm": 0.43496325510121936, "learning_rate": 6.770555449258609e-06, "loss": 0.0308, "step": 4993 }, { "epoch": 1.3392330383480826, "grad_norm": 0.3300736792516876, "learning_rate": 6.7690962708996645e-06, "loss": 0.0319, "step": 4994 }, { "epoch": 1.339501206757844, "grad_norm": 0.41014148844154846, "learning_rate": 6.767636920288514e-06, "loss": 0.0275, "step": 4995 }, { "epoch": 1.3397693751676052, "grad_norm": 0.2818767179010466, "learning_rate": 6.766177397567246e-06, "loss": 0.0227, "step": 4996 }, { "epoch": 1.3400375435773666, "grad_norm": 0.5531159672986595, "learning_rate": 6.764717702877977e-06, "loss": 0.0307, "step": 4997 }, { "epoch": 1.3403057119871278, "grad_norm": 0.28902114755061614, "learning_rate": 6.763257836362829e-06, "loss": 0.0293, "step": 4998 }, { "epoch": 1.3405738803968892, "grad_norm": 0.3377310159301404, "learning_rate": 6.7617977981639475e-06, "loss": 0.0251, "step": 4999 }, { "epoch": 1.3408420488066506, "grad_norm": 0.25864389816161665, "learning_rate": 6.760337588423491e-06, "loss": 0.0255, "step": 5000 }, { "epoch": 1.341110217216412, "grad_norm": 0.2794139874470606, "learning_rate": 6.758877207283638e-06, "loss": 0.0233, "step": 5001 }, { "epoch": 1.3413783856261732, "grad_norm": 0.27218271806755717, "learning_rate": 6.7574166548865804e-06, "loss": 0.0228, "step": 5002 }, { "epoch": 1.3416465540359346, "grad_norm": 0.2867806012271734, "learning_rate": 6.75595593137453e-06, "loss": 0.0198, "step": 5003 }, { "epoch": 1.3419147224456958, "grad_norm": 0.2133792441275049, "learning_rate": 6.754495036889711e-06, "loss": 0.0226, "step": 5004 }, { "epoch": 1.3421828908554572, "grad_norm": 0.2932134827669461, "learning_rate": 6.75303397157437e-06, "loss": 0.029, "step": 5005 }, { "epoch": 1.3424510592652186, "grad_norm": 0.28411200108564105, "learning_rate": 6.751572735570764e-06, "loss": 0.0238, "step": 5006 }, { "epoch": 1.34271922767498, "grad_norm": 0.34661277051320044, "learning_rate": 6.7501113290211715e-06, "loss": 0.0329, "step": 5007 }, { "epoch": 1.3429873960847412, "grad_norm": 0.386897762415121, "learning_rate": 6.7486497520678864e-06, "loss": 0.035, "step": 5008 }, { "epoch": 1.3432555644945026, "grad_norm": 0.23075269916957455, "learning_rate": 6.747188004853216e-06, "loss": 0.0274, "step": 5009 }, { "epoch": 1.3435237329042637, "grad_norm": 0.2465671540890511, "learning_rate": 6.745726087519489e-06, "loss": 0.0278, "step": 5010 }, { "epoch": 1.3437919013140252, "grad_norm": 0.30167976493206206, "learning_rate": 6.7442640002090475e-06, "loss": 0.027, "step": 5011 }, { "epoch": 1.3440600697237866, "grad_norm": 0.5502437565311378, "learning_rate": 6.742801743064251e-06, "loss": 0.0273, "step": 5012 }, { "epoch": 1.344328238133548, "grad_norm": 0.3252126651516387, "learning_rate": 6.741339316227477e-06, "loss": 0.0225, "step": 5013 }, { "epoch": 1.3445964065433091, "grad_norm": 0.2553623964033162, "learning_rate": 6.739876719841115e-06, "loss": 0.0194, "step": 5014 }, { "epoch": 1.3448645749530705, "grad_norm": 0.38137221270285687, "learning_rate": 6.738413954047578e-06, "loss": 0.0277, "step": 5015 }, { "epoch": 1.3451327433628317, "grad_norm": 0.4981605090645751, "learning_rate": 6.736951018989289e-06, "loss": 0.0273, "step": 5016 }, { "epoch": 1.3454009117725931, "grad_norm": 0.27367997014703416, "learning_rate": 6.7354879148086904e-06, "loss": 0.0255, "step": 5017 }, { "epoch": 1.3456690801823545, "grad_norm": 0.3641032804386525, "learning_rate": 6.73402464164824e-06, "loss": 0.023, "step": 5018 }, { "epoch": 1.345937248592116, "grad_norm": 0.3156356141243637, "learning_rate": 6.732561199650417e-06, "loss": 0.034, "step": 5019 }, { "epoch": 1.3462054170018771, "grad_norm": 0.2805549338188656, "learning_rate": 6.731097588957708e-06, "loss": 0.0208, "step": 5020 }, { "epoch": 1.3464735854116385, "grad_norm": 0.19540225284971263, "learning_rate": 6.729633809712623e-06, "loss": 0.0157, "step": 5021 }, { "epoch": 1.3467417538213997, "grad_norm": 0.25919978245756614, "learning_rate": 6.728169862057687e-06, "loss": 0.0233, "step": 5022 }, { "epoch": 1.3470099222311611, "grad_norm": 0.2887090582260099, "learning_rate": 6.726705746135438e-06, "loss": 0.0298, "step": 5023 }, { "epoch": 1.3472780906409225, "grad_norm": 0.3042642899693252, "learning_rate": 6.725241462088434e-06, "loss": 0.0306, "step": 5024 }, { "epoch": 1.347546259050684, "grad_norm": 0.29991265501159897, "learning_rate": 6.723777010059253e-06, "loss": 0.022, "step": 5025 }, { "epoch": 1.347814427460445, "grad_norm": 0.2394799332537473, "learning_rate": 6.7223123901904785e-06, "loss": 0.0212, "step": 5026 }, { "epoch": 1.3480825958702065, "grad_norm": 0.24444992425515327, "learning_rate": 6.72084760262472e-06, "loss": 0.02, "step": 5027 }, { "epoch": 1.3483507642799677, "grad_norm": 0.3206941383547955, "learning_rate": 6.719382647504598e-06, "loss": 0.0274, "step": 5028 }, { "epoch": 1.348618932689729, "grad_norm": 0.35043114424613836, "learning_rate": 6.717917524972752e-06, "loss": 0.0359, "step": 5029 }, { "epoch": 1.3488871010994905, "grad_norm": 0.3900173049273321, "learning_rate": 6.71645223517184e-06, "loss": 0.0314, "step": 5030 }, { "epoch": 1.349155269509252, "grad_norm": 0.23186204819174158, "learning_rate": 6.71498677824453e-06, "loss": 0.0245, "step": 5031 }, { "epoch": 1.349423437919013, "grad_norm": 0.302822575564756, "learning_rate": 6.713521154333511e-06, "loss": 0.0302, "step": 5032 }, { "epoch": 1.3496916063287745, "grad_norm": 0.3158300891581313, "learning_rate": 6.712055363581485e-06, "loss": 0.0291, "step": 5033 }, { "epoch": 1.3499597747385357, "grad_norm": 0.30361361557921546, "learning_rate": 6.710589406131174e-06, "loss": 0.019, "step": 5034 }, { "epoch": 1.350227943148297, "grad_norm": 0.28954453278253367, "learning_rate": 6.7091232821253145e-06, "loss": 0.0244, "step": 5035 }, { "epoch": 1.3504961115580585, "grad_norm": 0.20660424090738022, "learning_rate": 6.707656991706659e-06, "loss": 0.0234, "step": 5036 }, { "epoch": 1.35076427996782, "grad_norm": 0.28698718794474076, "learning_rate": 6.706190535017977e-06, "loss": 0.023, "step": 5037 }, { "epoch": 1.351032448377581, "grad_norm": 0.49103456427724895, "learning_rate": 6.704723912202051e-06, "loss": 0.0317, "step": 5038 }, { "epoch": 1.3513006167873425, "grad_norm": 0.2869390023519569, "learning_rate": 6.703257123401683e-06, "loss": 0.0209, "step": 5039 }, { "epoch": 1.3515687851971037, "grad_norm": 0.28363044008807875, "learning_rate": 6.701790168759692e-06, "loss": 0.0226, "step": 5040 }, { "epoch": 1.351836953606865, "grad_norm": 0.2972507729032398, "learning_rate": 6.700323048418911e-06, "loss": 0.024, "step": 5041 }, { "epoch": 1.3521051220166265, "grad_norm": 0.6309358069669206, "learning_rate": 6.698855762522187e-06, "loss": 0.0296, "step": 5042 }, { "epoch": 1.3523732904263879, "grad_norm": 0.32758714252897797, "learning_rate": 6.69738831121239e-06, "loss": 0.0214, "step": 5043 }, { "epoch": 1.352641458836149, "grad_norm": 0.27862056562215365, "learning_rate": 6.695920694632399e-06, "loss": 0.0317, "step": 5044 }, { "epoch": 1.3529096272459105, "grad_norm": 0.2583153668257148, "learning_rate": 6.694452912925111e-06, "loss": 0.0223, "step": 5045 }, { "epoch": 1.3531777956556716, "grad_norm": 0.36701797301508854, "learning_rate": 6.692984966233443e-06, "loss": 0.0228, "step": 5046 }, { "epoch": 1.353445964065433, "grad_norm": 0.2713090792345699, "learning_rate": 6.691516854700324e-06, "loss": 0.0191, "step": 5047 }, { "epoch": 1.3537141324751945, "grad_norm": 0.2614201343082274, "learning_rate": 6.690048578468698e-06, "loss": 0.0187, "step": 5048 }, { "epoch": 1.3539823008849559, "grad_norm": 0.29326486334863117, "learning_rate": 6.68858013768153e-06, "loss": 0.0326, "step": 5049 }, { "epoch": 1.354250469294717, "grad_norm": 0.2952834898180767, "learning_rate": 6.687111532481798e-06, "loss": 0.0272, "step": 5050 }, { "epoch": 1.3545186377044784, "grad_norm": 0.27522460560402734, "learning_rate": 6.685642763012494e-06, "loss": 0.0196, "step": 5051 }, { "epoch": 1.3547868061142396, "grad_norm": 0.23217384018522005, "learning_rate": 6.68417382941663e-06, "loss": 0.0209, "step": 5052 }, { "epoch": 1.355054974524001, "grad_norm": 0.25616569109118725, "learning_rate": 6.682704731837232e-06, "loss": 0.017, "step": 5053 }, { "epoch": 1.3553231429337624, "grad_norm": 0.2546297647693074, "learning_rate": 6.681235470417341e-06, "loss": 0.0217, "step": 5054 }, { "epoch": 1.3555913113435238, "grad_norm": 0.3291739391909463, "learning_rate": 6.679766045300017e-06, "loss": 0.024, "step": 5055 }, { "epoch": 1.355859479753285, "grad_norm": 0.34111097541759494, "learning_rate": 6.6782964566283315e-06, "loss": 0.0297, "step": 5056 }, { "epoch": 1.3561276481630464, "grad_norm": 0.3192875922138327, "learning_rate": 6.676826704545376e-06, "loss": 0.0171, "step": 5057 }, { "epoch": 1.3563958165728076, "grad_norm": 0.3534537385551134, "learning_rate": 6.675356789194256e-06, "loss": 0.0327, "step": 5058 }, { "epoch": 1.356663984982569, "grad_norm": 0.3061867979262889, "learning_rate": 6.673886710718095e-06, "loss": 0.0268, "step": 5059 }, { "epoch": 1.3569321533923304, "grad_norm": 0.30448102013977585, "learning_rate": 6.672416469260028e-06, "loss": 0.0341, "step": 5060 }, { "epoch": 1.3572003218020918, "grad_norm": 0.28534281318190696, "learning_rate": 6.670946064963209e-06, "loss": 0.0238, "step": 5061 }, { "epoch": 1.357468490211853, "grad_norm": 0.21015625975559304, "learning_rate": 6.669475497970809e-06, "loss": 0.0223, "step": 5062 }, { "epoch": 1.3577366586216144, "grad_norm": 0.31317725447923994, "learning_rate": 6.668004768426011e-06, "loss": 0.0273, "step": 5063 }, { "epoch": 1.3580048270313756, "grad_norm": 0.2362236238183627, "learning_rate": 6.666533876472018e-06, "loss": 0.0254, "step": 5064 }, { "epoch": 1.358272995441137, "grad_norm": 0.24699963504566408, "learning_rate": 6.665062822252046e-06, "loss": 0.024, "step": 5065 }, { "epoch": 1.3585411638508984, "grad_norm": 0.326434897125347, "learning_rate": 6.663591605909325e-06, "loss": 0.0327, "step": 5066 }, { "epoch": 1.3588093322606598, "grad_norm": 0.19945791465968243, "learning_rate": 6.66212022758711e-06, "loss": 0.0293, "step": 5067 }, { "epoch": 1.359077500670421, "grad_norm": 0.2708259302081907, "learning_rate": 6.66064868742866e-06, "loss": 0.0224, "step": 5068 }, { "epoch": 1.3593456690801824, "grad_norm": 0.21620411722317956, "learning_rate": 6.659176985577256e-06, "loss": 0.0229, "step": 5069 }, { "epoch": 1.3596138374899436, "grad_norm": 0.27788741044009674, "learning_rate": 6.657705122176194e-06, "loss": 0.0241, "step": 5070 }, { "epoch": 1.359882005899705, "grad_norm": 0.18041673717614787, "learning_rate": 6.656233097368785e-06, "loss": 0.0203, "step": 5071 }, { "epoch": 1.3601501743094664, "grad_norm": 0.2991962375235837, "learning_rate": 6.654760911298357e-06, "loss": 0.0237, "step": 5072 }, { "epoch": 1.3604183427192278, "grad_norm": 0.24732961148700947, "learning_rate": 6.653288564108252e-06, "loss": 0.0354, "step": 5073 }, { "epoch": 1.360686511128989, "grad_norm": 0.2259752050825254, "learning_rate": 6.65181605594183e-06, "loss": 0.0218, "step": 5074 }, { "epoch": 1.3609546795387504, "grad_norm": 0.4170065395801382, "learning_rate": 6.650343386942462e-06, "loss": 0.0349, "step": 5075 }, { "epoch": 1.3612228479485116, "grad_norm": 0.30123259997765317, "learning_rate": 6.648870557253541e-06, "loss": 0.0344, "step": 5076 }, { "epoch": 1.361491016358273, "grad_norm": 0.2482891797060849, "learning_rate": 6.647397567018472e-06, "loss": 0.0203, "step": 5077 }, { "epoch": 1.3617591847680344, "grad_norm": 0.30567748975754927, "learning_rate": 6.645924416380676e-06, "loss": 0.0317, "step": 5078 }, { "epoch": 1.3620273531777958, "grad_norm": 0.24767753062552794, "learning_rate": 6.644451105483588e-06, "loss": 0.0218, "step": 5079 }, { "epoch": 1.362295521587557, "grad_norm": 0.23412280814432446, "learning_rate": 6.642977634470665e-06, "loss": 0.0204, "step": 5080 }, { "epoch": 1.3625636899973184, "grad_norm": 0.21496519906965245, "learning_rate": 6.641504003485369e-06, "loss": 0.0216, "step": 5081 }, { "epoch": 1.3628318584070795, "grad_norm": 0.24889239029486848, "learning_rate": 6.6400302126711845e-06, "loss": 0.0187, "step": 5082 }, { "epoch": 1.363100026816841, "grad_norm": 0.40078029198581455, "learning_rate": 6.638556262171615e-06, "loss": 0.0224, "step": 5083 }, { "epoch": 1.3633681952266024, "grad_norm": 0.2905008394101974, "learning_rate": 6.637082152130172e-06, "loss": 0.0239, "step": 5084 }, { "epoch": 1.3636363636363638, "grad_norm": 0.31030734336727234, "learning_rate": 6.635607882690387e-06, "loss": 0.0273, "step": 5085 }, { "epoch": 1.363904532046125, "grad_norm": 0.34070729252319876, "learning_rate": 6.634133453995805e-06, "loss": 0.024, "step": 5086 }, { "epoch": 1.3641727004558863, "grad_norm": 0.28896385082043063, "learning_rate": 6.632658866189985e-06, "loss": 0.0213, "step": 5087 }, { "epoch": 1.3644408688656475, "grad_norm": 0.23465225002551524, "learning_rate": 6.631184119416507e-06, "loss": 0.0193, "step": 5088 }, { "epoch": 1.364709037275409, "grad_norm": 0.34667035711358335, "learning_rate": 6.629709213818962e-06, "loss": 0.0349, "step": 5089 }, { "epoch": 1.3649772056851703, "grad_norm": 0.21791769530711644, "learning_rate": 6.628234149540957e-06, "loss": 0.0216, "step": 5090 }, { "epoch": 1.3652453740949317, "grad_norm": 0.2739570789065874, "learning_rate": 6.626758926726118e-06, "loss": 0.0275, "step": 5091 }, { "epoch": 1.365513542504693, "grad_norm": 0.21960668466380617, "learning_rate": 6.62528354551808e-06, "loss": 0.0209, "step": 5092 }, { "epoch": 1.3657817109144543, "grad_norm": 0.21993935213671614, "learning_rate": 6.623808006060497e-06, "loss": 0.0189, "step": 5093 }, { "epoch": 1.3660498793242155, "grad_norm": 0.2758263814019537, "learning_rate": 6.6223323084970415e-06, "loss": 0.0207, "step": 5094 }, { "epoch": 1.366318047733977, "grad_norm": 0.2284052009677256, "learning_rate": 6.620856452971395e-06, "loss": 0.0194, "step": 5095 }, { "epoch": 1.3665862161437383, "grad_norm": 0.2812677323751444, "learning_rate": 6.61938043962726e-06, "loss": 0.0273, "step": 5096 }, { "epoch": 1.3668543845534997, "grad_norm": 0.3668572195161334, "learning_rate": 6.6179042686083505e-06, "loss": 0.0215, "step": 5097 }, { "epoch": 1.367122552963261, "grad_norm": 0.4838162027145904, "learning_rate": 6.616427940058398e-06, "loss": 0.0258, "step": 5098 }, { "epoch": 1.3673907213730223, "grad_norm": 0.3505781867486169, "learning_rate": 6.614951454121148e-06, "loss": 0.0286, "step": 5099 }, { "epoch": 1.3676588897827835, "grad_norm": 0.28022479874431183, "learning_rate": 6.613474810940363e-06, "loss": 0.0252, "step": 5100 }, { "epoch": 1.367927058192545, "grad_norm": 0.6008600469381709, "learning_rate": 6.611998010659818e-06, "loss": 0.0271, "step": 5101 }, { "epoch": 1.3681952266023063, "grad_norm": 0.2967687825135098, "learning_rate": 6.610521053423308e-06, "loss": 0.0296, "step": 5102 }, { "epoch": 1.3684633950120677, "grad_norm": 0.21958961292984855, "learning_rate": 6.609043939374638e-06, "loss": 0.0204, "step": 5103 }, { "epoch": 1.368731563421829, "grad_norm": 0.3159632223460224, "learning_rate": 6.607566668657632e-06, "loss": 0.024, "step": 5104 }, { "epoch": 1.3689997318315903, "grad_norm": 0.30005556663436195, "learning_rate": 6.606089241416127e-06, "loss": 0.0271, "step": 5105 }, { "epoch": 1.3692679002413515, "grad_norm": 0.25760099062866715, "learning_rate": 6.604611657793978e-06, "loss": 0.0235, "step": 5106 }, { "epoch": 1.3695360686511129, "grad_norm": 0.3204697494909888, "learning_rate": 6.60313391793505e-06, "loss": 0.0295, "step": 5107 }, { "epoch": 1.3698042370608743, "grad_norm": 0.2727651145898471, "learning_rate": 6.60165602198323e-06, "loss": 0.029, "step": 5108 }, { "epoch": 1.3700724054706357, "grad_norm": 0.20786565898512394, "learning_rate": 6.600177970082416e-06, "loss": 0.0198, "step": 5109 }, { "epoch": 1.3703405738803969, "grad_norm": 0.2700042646917856, "learning_rate": 6.59869976237652e-06, "loss": 0.0184, "step": 5110 }, { "epoch": 1.3706087422901583, "grad_norm": 0.3866713630628548, "learning_rate": 6.597221399009474e-06, "loss": 0.0252, "step": 5111 }, { "epoch": 1.3708769106999195, "grad_norm": 0.28361676049979284, "learning_rate": 6.59574288012522e-06, "loss": 0.0222, "step": 5112 }, { "epoch": 1.3711450791096809, "grad_norm": 0.3996774438131774, "learning_rate": 6.5942642058677185e-06, "loss": 0.0331, "step": 5113 }, { "epoch": 1.3714132475194423, "grad_norm": 0.2227104512568194, "learning_rate": 6.592785376380944e-06, "loss": 0.0169, "step": 5114 }, { "epoch": 1.3716814159292037, "grad_norm": 0.7584157525301737, "learning_rate": 6.591306391808886e-06, "loss": 0.0286, "step": 5115 }, { "epoch": 1.3719495843389649, "grad_norm": 0.5069549852650557, "learning_rate": 6.589827252295551e-06, "loss": 0.0201, "step": 5116 }, { "epoch": 1.3722177527487263, "grad_norm": 0.28620843722007233, "learning_rate": 6.588347957984955e-06, "loss": 0.0248, "step": 5117 }, { "epoch": 1.3724859211584874, "grad_norm": 0.6020014541029284, "learning_rate": 6.586868509021137e-06, "loss": 0.0248, "step": 5118 }, { "epoch": 1.3727540895682488, "grad_norm": 0.2866972865999652, "learning_rate": 6.585388905548144e-06, "loss": 0.025, "step": 5119 }, { "epoch": 1.3730222579780103, "grad_norm": 0.3861905887095278, "learning_rate": 6.583909147710042e-06, "loss": 0.0209, "step": 5120 }, { "epoch": 1.3732904263877717, "grad_norm": 0.2416065141345552, "learning_rate": 6.582429235650914e-06, "loss": 0.0208, "step": 5121 }, { "epoch": 1.3735585947975328, "grad_norm": 0.2825162122475292, "learning_rate": 6.580949169514852e-06, "loss": 0.0254, "step": 5122 }, { "epoch": 1.3738267632072942, "grad_norm": 0.23836590871192379, "learning_rate": 6.579468949445965e-06, "loss": 0.0176, "step": 5123 }, { "epoch": 1.3740949316170554, "grad_norm": 0.2217462910104824, "learning_rate": 6.577988575588381e-06, "loss": 0.0201, "step": 5124 }, { "epoch": 1.3743631000268168, "grad_norm": 0.3829205152823053, "learning_rate": 6.576508048086237e-06, "loss": 0.0274, "step": 5125 }, { "epoch": 1.3746312684365782, "grad_norm": 0.2668024941088268, "learning_rate": 6.5750273670836915e-06, "loss": 0.0182, "step": 5126 }, { "epoch": 1.3748994368463396, "grad_norm": 0.33111035300024017, "learning_rate": 6.5735465327249125e-06, "loss": 0.0228, "step": 5127 }, { "epoch": 1.3751676052561008, "grad_norm": 0.37058450124543657, "learning_rate": 6.572065545154087e-06, "loss": 0.0366, "step": 5128 }, { "epoch": 1.3754357736658622, "grad_norm": 0.3549421173434805, "learning_rate": 6.570584404515412e-06, "loss": 0.0208, "step": 5129 }, { "epoch": 1.3757039420756234, "grad_norm": 0.2699091902228763, "learning_rate": 6.569103110953103e-06, "loss": 0.0197, "step": 5130 }, { "epoch": 1.3759721104853848, "grad_norm": 0.3762589093209011, "learning_rate": 6.567621664611389e-06, "loss": 0.0316, "step": 5131 }, { "epoch": 1.3762402788951462, "grad_norm": 0.2351667055258288, "learning_rate": 6.566140065634516e-06, "loss": 0.0243, "step": 5132 }, { "epoch": 1.3765084473049074, "grad_norm": 0.2893426605052858, "learning_rate": 6.5646583141667446e-06, "loss": 0.0259, "step": 5133 }, { "epoch": 1.3767766157146688, "grad_norm": 0.2701150698004994, "learning_rate": 6.563176410352347e-06, "loss": 0.0277, "step": 5134 }, { "epoch": 1.3770447841244302, "grad_norm": 0.3352044360474642, "learning_rate": 6.56169435433561e-06, "loss": 0.0279, "step": 5135 }, { "epoch": 1.3773129525341914, "grad_norm": 0.29523620558317343, "learning_rate": 6.560212146260841e-06, "loss": 0.0271, "step": 5136 }, { "epoch": 1.3775811209439528, "grad_norm": 0.49138982849964424, "learning_rate": 6.558729786272359e-06, "loss": 0.0333, "step": 5137 }, { "epoch": 1.3778492893537142, "grad_norm": 0.33503344411027447, "learning_rate": 6.5572472745144944e-06, "loss": 0.0275, "step": 5138 }, { "epoch": 1.3781174577634754, "grad_norm": 0.2416364582209329, "learning_rate": 6.555764611131599e-06, "loss": 0.0135, "step": 5139 }, { "epoch": 1.3783856261732368, "grad_norm": 0.26789059717793146, "learning_rate": 6.554281796268031e-06, "loss": 0.0325, "step": 5140 }, { "epoch": 1.3786537945829982, "grad_norm": 0.32317284330765095, "learning_rate": 6.552798830068173e-06, "loss": 0.0192, "step": 5141 }, { "epoch": 1.3789219629927594, "grad_norm": 0.35869690065737037, "learning_rate": 6.551315712676416e-06, "loss": 0.0353, "step": 5142 }, { "epoch": 1.3791901314025208, "grad_norm": 0.36553281445631164, "learning_rate": 6.549832444237165e-06, "loss": 0.0287, "step": 5143 }, { "epoch": 1.3794582998122822, "grad_norm": 0.30364053577404354, "learning_rate": 6.548349024894846e-06, "loss": 0.03, "step": 5144 }, { "epoch": 1.3797264682220434, "grad_norm": 0.30298155570238244, "learning_rate": 6.546865454793891e-06, "loss": 0.0256, "step": 5145 }, { "epoch": 1.3799946366318048, "grad_norm": 0.36046815855821246, "learning_rate": 6.545381734078754e-06, "loss": 0.0299, "step": 5146 }, { "epoch": 1.3802628050415662, "grad_norm": 0.27389916574120127, "learning_rate": 6.543897862893901e-06, "loss": 0.027, "step": 5147 }, { "epoch": 1.3805309734513274, "grad_norm": 0.25778395382622, "learning_rate": 6.542413841383813e-06, "loss": 0.0269, "step": 5148 }, { "epoch": 1.3807991418610888, "grad_norm": 0.316177238561347, "learning_rate": 6.5409296696929825e-06, "loss": 0.0355, "step": 5149 }, { "epoch": 1.3810673102708502, "grad_norm": 0.2877127953208787, "learning_rate": 6.5394453479659215e-06, "loss": 0.0178, "step": 5150 }, { "epoch": 1.3813354786806114, "grad_norm": 0.23668135190631837, "learning_rate": 6.537960876347155e-06, "loss": 0.0171, "step": 5151 }, { "epoch": 1.3816036470903728, "grad_norm": 0.21441462891255378, "learning_rate": 6.5364762549812195e-06, "loss": 0.0239, "step": 5152 }, { "epoch": 1.3818718155001342, "grad_norm": 0.30021628193401917, "learning_rate": 6.534991484012672e-06, "loss": 0.0266, "step": 5153 }, { "epoch": 1.3821399839098953, "grad_norm": 0.2585604144668049, "learning_rate": 6.533506563586079e-06, "loss": 0.0199, "step": 5154 }, { "epoch": 1.3824081523196567, "grad_norm": 0.4001361624400113, "learning_rate": 6.532021493846021e-06, "loss": 0.0302, "step": 5155 }, { "epoch": 1.3826763207294182, "grad_norm": 0.260373697212655, "learning_rate": 6.5305362749371e-06, "loss": 0.022, "step": 5156 }, { "epoch": 1.3829444891391793, "grad_norm": 0.37664659813509344, "learning_rate": 6.529050907003924e-06, "loss": 0.0255, "step": 5157 }, { "epoch": 1.3832126575489407, "grad_norm": 0.2806128700654724, "learning_rate": 6.527565390191122e-06, "loss": 0.0211, "step": 5158 }, { "epoch": 1.3834808259587021, "grad_norm": 0.20592256332628703, "learning_rate": 6.526079724643334e-06, "loss": 0.0168, "step": 5159 }, { "epoch": 1.3837489943684633, "grad_norm": 0.26195819425124944, "learning_rate": 6.524593910505214e-06, "loss": 0.0249, "step": 5160 }, { "epoch": 1.3840171627782247, "grad_norm": 0.32691775010066143, "learning_rate": 6.5231079479214325e-06, "loss": 0.0345, "step": 5161 }, { "epoch": 1.3842853311879861, "grad_norm": 0.21849658606767608, "learning_rate": 6.521621837036672e-06, "loss": 0.019, "step": 5162 }, { "epoch": 1.3845534995977473, "grad_norm": 0.34732717083683623, "learning_rate": 6.520135577995636e-06, "loss": 0.0373, "step": 5163 }, { "epoch": 1.3848216680075087, "grad_norm": 0.19546418682449748, "learning_rate": 6.518649170943035e-06, "loss": 0.015, "step": 5164 }, { "epoch": 1.3850898364172701, "grad_norm": 0.23112305909493605, "learning_rate": 6.517162616023593e-06, "loss": 0.0181, "step": 5165 }, { "epoch": 1.3853580048270313, "grad_norm": 0.23798546114377456, "learning_rate": 6.515675913382058e-06, "loss": 0.0218, "step": 5166 }, { "epoch": 1.3856261732367927, "grad_norm": 0.45282538591777605, "learning_rate": 6.514189063163182e-06, "loss": 0.0271, "step": 5167 }, { "epoch": 1.3858943416465541, "grad_norm": 0.2970928429852598, "learning_rate": 6.5127020655117355e-06, "loss": 0.0313, "step": 5168 }, { "epoch": 1.3861625100563153, "grad_norm": 0.23505047994155392, "learning_rate": 6.511214920572507e-06, "loss": 0.0202, "step": 5169 }, { "epoch": 1.3864306784660767, "grad_norm": 0.47194009371941376, "learning_rate": 6.509727628490294e-06, "loss": 0.0195, "step": 5170 }, { "epoch": 1.386698846875838, "grad_norm": 0.27713698068059905, "learning_rate": 6.508240189409911e-06, "loss": 0.0249, "step": 5171 }, { "epoch": 1.3869670152855993, "grad_norm": 0.19444911249917674, "learning_rate": 6.506752603476183e-06, "loss": 0.0175, "step": 5172 }, { "epoch": 1.3872351836953607, "grad_norm": 0.3400962721634422, "learning_rate": 6.505264870833954e-06, "loss": 0.0274, "step": 5173 }, { "epoch": 1.387503352105122, "grad_norm": 0.2822669872235687, "learning_rate": 6.503776991628082e-06, "loss": 0.0191, "step": 5174 }, { "epoch": 1.3877715205148833, "grad_norm": 0.22144025490236657, "learning_rate": 6.502288966003437e-06, "loss": 0.0156, "step": 5175 }, { "epoch": 1.3880396889246447, "grad_norm": 0.2796820310291233, "learning_rate": 6.500800794104905e-06, "loss": 0.0199, "step": 5176 }, { "epoch": 1.3883078573344059, "grad_norm": 0.3575901827382534, "learning_rate": 6.499312476077383e-06, "loss": 0.0279, "step": 5177 }, { "epoch": 1.3885760257441673, "grad_norm": 0.35267346214958295, "learning_rate": 6.497824012065784e-06, "loss": 0.0326, "step": 5178 }, { "epoch": 1.3888441941539287, "grad_norm": 0.28469498710799546, "learning_rate": 6.496335402215039e-06, "loss": 0.0278, "step": 5179 }, { "epoch": 1.38911236256369, "grad_norm": 0.32922009994379825, "learning_rate": 6.4948466466700895e-06, "loss": 0.0253, "step": 5180 }, { "epoch": 1.3893805309734513, "grad_norm": 0.43433414287820016, "learning_rate": 6.49335774557589e-06, "loss": 0.0241, "step": 5181 }, { "epoch": 1.3896486993832127, "grad_norm": 0.22944106146363083, "learning_rate": 6.4918686990774126e-06, "loss": 0.0171, "step": 5182 }, { "epoch": 1.3899168677929739, "grad_norm": 0.35259992801980905, "learning_rate": 6.49037950731964e-06, "loss": 0.0254, "step": 5183 }, { "epoch": 1.3901850362027353, "grad_norm": 0.37116440513520066, "learning_rate": 6.48889017044757e-06, "loss": 0.0231, "step": 5184 }, { "epoch": 1.3904532046124967, "grad_norm": 0.3408909605440221, "learning_rate": 6.4874006886062205e-06, "loss": 0.033, "step": 5185 }, { "epoch": 1.390721373022258, "grad_norm": 0.32272411590045763, "learning_rate": 6.4859110619406125e-06, "loss": 0.0275, "step": 5186 }, { "epoch": 1.3909895414320192, "grad_norm": 0.2877175369892691, "learning_rate": 6.48442129059579e-06, "loss": 0.0274, "step": 5187 }, { "epoch": 1.3912577098417807, "grad_norm": 0.20653224324990985, "learning_rate": 6.4829313747168075e-06, "loss": 0.019, "step": 5188 }, { "epoch": 1.3915258782515418, "grad_norm": 0.37871118074958016, "learning_rate": 6.481441314448734e-06, "loss": 0.0374, "step": 5189 }, { "epoch": 1.3917940466613032, "grad_norm": 0.18229658447990152, "learning_rate": 6.479951109936653e-06, "loss": 0.0144, "step": 5190 }, { "epoch": 1.3920622150710646, "grad_norm": 0.3074400030242003, "learning_rate": 6.4784607613256615e-06, "loss": 0.021, "step": 5191 }, { "epoch": 1.392330383480826, "grad_norm": 0.3109855953044543, "learning_rate": 6.476970268760871e-06, "loss": 0.0236, "step": 5192 }, { "epoch": 1.3925985518905872, "grad_norm": 0.2473880043851488, "learning_rate": 6.475479632387407e-06, "loss": 0.0179, "step": 5193 }, { "epoch": 1.3928667203003486, "grad_norm": 0.22139972218326306, "learning_rate": 6.473988852350408e-06, "loss": 0.019, "step": 5194 }, { "epoch": 1.3931348887101098, "grad_norm": 0.2159896545649916, "learning_rate": 6.472497928795027e-06, "loss": 0.0175, "step": 5195 }, { "epoch": 1.3934030571198712, "grad_norm": 0.2222372696612467, "learning_rate": 6.471006861866433e-06, "loss": 0.0226, "step": 5196 }, { "epoch": 1.3936712255296326, "grad_norm": 0.3106854918072371, "learning_rate": 6.469515651709808e-06, "loss": 0.0303, "step": 5197 }, { "epoch": 1.393939393939394, "grad_norm": 0.39325850247664007, "learning_rate": 6.468024298470343e-06, "loss": 0.0174, "step": 5198 }, { "epoch": 1.3942075623491552, "grad_norm": 0.4710431167945331, "learning_rate": 6.4665328022932505e-06, "loss": 0.0307, "step": 5199 }, { "epoch": 1.3944757307589166, "grad_norm": 0.22877078657714775, "learning_rate": 6.465041163323752e-06, "loss": 0.0181, "step": 5200 }, { "epoch": 1.3947438991686778, "grad_norm": 0.26515572725263986, "learning_rate": 6.463549381707087e-06, "loss": 0.0188, "step": 5201 }, { "epoch": 1.3950120675784392, "grad_norm": 0.27943431328503227, "learning_rate": 6.462057457588503e-06, "loss": 0.0207, "step": 5202 }, { "epoch": 1.3952802359882006, "grad_norm": 0.2785532796197089, "learning_rate": 6.460565391113266e-06, "loss": 0.0212, "step": 5203 }, { "epoch": 1.395548404397962, "grad_norm": 0.4291464036396191, "learning_rate": 6.459073182426654e-06, "loss": 0.0223, "step": 5204 }, { "epoch": 1.3958165728077232, "grad_norm": 0.27142747503558823, "learning_rate": 6.45758083167396e-06, "loss": 0.0171, "step": 5205 }, { "epoch": 1.3960847412174846, "grad_norm": 0.21253071722607556, "learning_rate": 6.4560883390004905e-06, "loss": 0.0201, "step": 5206 }, { "epoch": 1.3963529096272458, "grad_norm": 0.27426484295345344, "learning_rate": 6.454595704551567e-06, "loss": 0.023, "step": 5207 }, { "epoch": 1.3966210780370072, "grad_norm": 0.2801770565473622, "learning_rate": 6.45310292847252e-06, "loss": 0.0325, "step": 5208 }, { "epoch": 1.3968892464467686, "grad_norm": 0.3291962380521347, "learning_rate": 6.4516100109087e-06, "loss": 0.0274, "step": 5209 }, { "epoch": 1.39715741485653, "grad_norm": 0.2301296610068382, "learning_rate": 6.450116952005464e-06, "loss": 0.0207, "step": 5210 }, { "epoch": 1.3974255832662912, "grad_norm": 0.43305789779332676, "learning_rate": 6.448623751908193e-06, "loss": 0.0265, "step": 5211 }, { "epoch": 1.3976937516760526, "grad_norm": 0.6956404534281433, "learning_rate": 6.4471304107622725e-06, "loss": 0.0407, "step": 5212 }, { "epoch": 1.3979619200858138, "grad_norm": 0.349718616042425, "learning_rate": 6.4456369287131084e-06, "loss": 0.0302, "step": 5213 }, { "epoch": 1.3982300884955752, "grad_norm": 0.33105537887205494, "learning_rate": 6.444143305906112e-06, "loss": 0.0204, "step": 5214 }, { "epoch": 1.3984982569053366, "grad_norm": 0.264735580058941, "learning_rate": 6.4426495424867164e-06, "loss": 0.0224, "step": 5215 }, { "epoch": 1.398766425315098, "grad_norm": 0.35248519644064036, "learning_rate": 6.441155638600365e-06, "loss": 0.0259, "step": 5216 }, { "epoch": 1.3990345937248592, "grad_norm": 0.3713305696021956, "learning_rate": 6.439661594392515e-06, "loss": 0.0251, "step": 5217 }, { "epoch": 1.3993027621346206, "grad_norm": 0.28178920462796453, "learning_rate": 6.438167410008638e-06, "loss": 0.0212, "step": 5218 }, { "epoch": 1.3995709305443818, "grad_norm": 0.6224282570483649, "learning_rate": 6.436673085594221e-06, "loss": 0.0251, "step": 5219 }, { "epoch": 1.3998390989541432, "grad_norm": 0.2858509706628988, "learning_rate": 6.4351786212947565e-06, "loss": 0.0226, "step": 5220 }, { "epoch": 1.4001072673639046, "grad_norm": 0.2914970784605602, "learning_rate": 6.433684017255761e-06, "loss": 0.0269, "step": 5221 }, { "epoch": 1.400375435773666, "grad_norm": 0.304243040205742, "learning_rate": 6.4321892736227585e-06, "loss": 0.0392, "step": 5222 }, { "epoch": 1.4006436041834271, "grad_norm": 0.24467194358553385, "learning_rate": 6.43069439054129e-06, "loss": 0.0227, "step": 5223 }, { "epoch": 1.4009117725931886, "grad_norm": 0.23797578224929014, "learning_rate": 6.4291993681569065e-06, "loss": 0.0252, "step": 5224 }, { "epoch": 1.4011799410029497, "grad_norm": 0.23607386543837883, "learning_rate": 6.427704206615176e-06, "loss": 0.0214, "step": 5225 }, { "epoch": 1.4014481094127111, "grad_norm": 0.3347313654782488, "learning_rate": 6.426208906061676e-06, "loss": 0.0345, "step": 5226 }, { "epoch": 1.4017162778224725, "grad_norm": 0.2561400006699048, "learning_rate": 6.424713466642002e-06, "loss": 0.0189, "step": 5227 }, { "epoch": 1.401984446232234, "grad_norm": 0.3299244174657613, "learning_rate": 6.4232178885017606e-06, "loss": 0.0275, "step": 5228 }, { "epoch": 1.4022526146419951, "grad_norm": 0.3099302222251416, "learning_rate": 6.421722171786571e-06, "loss": 0.0291, "step": 5229 }, { "epoch": 1.4025207830517565, "grad_norm": 0.2866042133617756, "learning_rate": 6.420226316642069e-06, "loss": 0.0224, "step": 5230 }, { "epoch": 1.4027889514615177, "grad_norm": 0.3153110884215908, "learning_rate": 6.418730323213901e-06, "loss": 0.0228, "step": 5231 }, { "epoch": 1.4030571198712791, "grad_norm": 0.43223590951397056, "learning_rate": 6.417234191647728e-06, "loss": 0.0233, "step": 5232 }, { "epoch": 1.4033252882810405, "grad_norm": 0.2854848003877294, "learning_rate": 6.415737922089224e-06, "loss": 0.0275, "step": 5233 }, { "epoch": 1.403593456690802, "grad_norm": 0.2693254431737801, "learning_rate": 6.414241514684077e-06, "loss": 0.0238, "step": 5234 }, { "epoch": 1.4038616251005631, "grad_norm": 0.2882725970015527, "learning_rate": 6.4127449695779894e-06, "loss": 0.0199, "step": 5235 }, { "epoch": 1.4041297935103245, "grad_norm": 0.2821050582967342, "learning_rate": 6.411248286916675e-06, "loss": 0.0188, "step": 5236 }, { "epoch": 1.4043979619200857, "grad_norm": 0.4002808657214984, "learning_rate": 6.4097514668458614e-06, "loss": 0.0295, "step": 5237 }, { "epoch": 1.404666130329847, "grad_norm": 0.22753981501253998, "learning_rate": 6.4082545095112904e-06, "loss": 0.025, "step": 5238 }, { "epoch": 1.4049342987396085, "grad_norm": 0.2826434960327498, "learning_rate": 6.406757415058717e-06, "loss": 0.0252, "step": 5239 }, { "epoch": 1.40520246714937, "grad_norm": 0.3243913881501028, "learning_rate": 6.405260183633908e-06, "loss": 0.0317, "step": 5240 }, { "epoch": 1.405470635559131, "grad_norm": 0.38825089959778947, "learning_rate": 6.403762815382646e-06, "loss": 0.0351, "step": 5241 }, { "epoch": 1.4057388039688925, "grad_norm": 0.3005784717623661, "learning_rate": 6.402265310450726e-06, "loss": 0.0243, "step": 5242 }, { "epoch": 1.4060069723786537, "grad_norm": 0.2820356864044271, "learning_rate": 6.400767668983955e-06, "loss": 0.0315, "step": 5243 }, { "epoch": 1.406275140788415, "grad_norm": 0.32030767616135214, "learning_rate": 6.399269891128156e-06, "loss": 0.0242, "step": 5244 }, { "epoch": 1.4065433091981765, "grad_norm": 0.2640864191029548, "learning_rate": 6.397771977029161e-06, "loss": 0.026, "step": 5245 }, { "epoch": 1.406811477607938, "grad_norm": 0.449536447638611, "learning_rate": 6.396273926832821e-06, "loss": 0.034, "step": 5246 }, { "epoch": 1.407079646017699, "grad_norm": 0.2624313155718272, "learning_rate": 6.394775740684996e-06, "loss": 0.0265, "step": 5247 }, { "epoch": 1.4073478144274605, "grad_norm": 0.23758192821345253, "learning_rate": 6.39327741873156e-06, "loss": 0.0228, "step": 5248 }, { "epoch": 1.4076159828372217, "grad_norm": 0.2667669305291097, "learning_rate": 6.3917789611184e-06, "loss": 0.0219, "step": 5249 }, { "epoch": 1.407884151246983, "grad_norm": 0.43251985750390337, "learning_rate": 6.390280367991419e-06, "loss": 0.0296, "step": 5250 }, { "epoch": 1.4081523196567445, "grad_norm": 0.6050776657347895, "learning_rate": 6.388781639496529e-06, "loss": 0.0258, "step": 5251 }, { "epoch": 1.4084204880665059, "grad_norm": 0.3003150905683207, "learning_rate": 6.387282775779657e-06, "loss": 0.0225, "step": 5252 }, { "epoch": 1.408688656476267, "grad_norm": 0.2683298170080757, "learning_rate": 6.385783776986745e-06, "loss": 0.0198, "step": 5253 }, { "epoch": 1.4089568248860285, "grad_norm": 0.268678344864896, "learning_rate": 6.384284643263745e-06, "loss": 0.0216, "step": 5254 }, { "epoch": 1.4092249932957897, "grad_norm": 0.3731545596970325, "learning_rate": 6.382785374756627e-06, "loss": 0.032, "step": 5255 }, { "epoch": 1.409493161705551, "grad_norm": 0.3426301584764138, "learning_rate": 6.3812859716113675e-06, "loss": 0.0248, "step": 5256 }, { "epoch": 1.4097613301153125, "grad_norm": 0.285326095267181, "learning_rate": 6.379786433973959e-06, "loss": 0.0274, "step": 5257 }, { "epoch": 1.4100294985250739, "grad_norm": 0.4712938969360387, "learning_rate": 6.378286761990408e-06, "loss": 0.0202, "step": 5258 }, { "epoch": 1.410297666934835, "grad_norm": 0.4685059918605246, "learning_rate": 6.376786955806735e-06, "loss": 0.0381, "step": 5259 }, { "epoch": 1.4105658353445965, "grad_norm": 0.22491126398403954, "learning_rate": 6.375287015568971e-06, "loss": 0.0189, "step": 5260 }, { "epoch": 1.4108340037543576, "grad_norm": 0.387916355012814, "learning_rate": 6.373786941423165e-06, "loss": 0.0295, "step": 5261 }, { "epoch": 1.411102172164119, "grad_norm": 0.28576208946203746, "learning_rate": 6.37228673351537e-06, "loss": 0.021, "step": 5262 }, { "epoch": 1.4113703405738804, "grad_norm": 0.2885125546012685, "learning_rate": 6.3707863919916565e-06, "loss": 0.0221, "step": 5263 }, { "epoch": 1.4116385089836418, "grad_norm": 0.2869403262283209, "learning_rate": 6.369285916998113e-06, "loss": 0.0288, "step": 5264 }, { "epoch": 1.411906677393403, "grad_norm": 0.28287860071558796, "learning_rate": 6.367785308680835e-06, "loss": 0.0234, "step": 5265 }, { "epoch": 1.4121748458031644, "grad_norm": 0.5990039101555422, "learning_rate": 6.366284567185934e-06, "loss": 0.028, "step": 5266 }, { "epoch": 1.4124430142129256, "grad_norm": 0.39137981904099617, "learning_rate": 6.364783692659532e-06, "loss": 0.0255, "step": 5267 }, { "epoch": 1.412711182622687, "grad_norm": 0.34672424828636894, "learning_rate": 6.363282685247765e-06, "loss": 0.0347, "step": 5268 }, { "epoch": 1.4129793510324484, "grad_norm": 0.2955189419664367, "learning_rate": 6.36178154509678e-06, "loss": 0.0211, "step": 5269 }, { "epoch": 1.4132475194422098, "grad_norm": 0.30517366119817585, "learning_rate": 6.360280272352744e-06, "loss": 0.0266, "step": 5270 }, { "epoch": 1.413515687851971, "grad_norm": 0.29938145815087114, "learning_rate": 6.358778867161829e-06, "loss": 0.0251, "step": 5271 }, { "epoch": 1.4137838562617324, "grad_norm": 0.24257819976122985, "learning_rate": 6.357277329670223e-06, "loss": 0.0209, "step": 5272 }, { "epoch": 1.4140520246714936, "grad_norm": 0.23794818770066298, "learning_rate": 6.355775660024128e-06, "loss": 0.0169, "step": 5273 }, { "epoch": 1.414320193081255, "grad_norm": 0.32894626930039006, "learning_rate": 6.354273858369754e-06, "loss": 0.0231, "step": 5274 }, { "epoch": 1.4145883614910164, "grad_norm": 0.29096812876719, "learning_rate": 6.352771924853331e-06, "loss": 0.0232, "step": 5275 }, { "epoch": 1.4148565299007778, "grad_norm": 0.38797090038222903, "learning_rate": 6.3512698596210985e-06, "loss": 0.0253, "step": 5276 }, { "epoch": 1.415124698310539, "grad_norm": 0.2774409407450422, "learning_rate": 6.349767662819306e-06, "loss": 0.0226, "step": 5277 }, { "epoch": 1.4153928667203004, "grad_norm": 0.2958631893308465, "learning_rate": 6.34826533459422e-06, "loss": 0.0238, "step": 5278 }, { "epoch": 1.4156610351300616, "grad_norm": 0.2816345864410328, "learning_rate": 6.346762875092118e-06, "loss": 0.0208, "step": 5279 }, { "epoch": 1.415929203539823, "grad_norm": 0.26791417253659155, "learning_rate": 6.3452602844592905e-06, "loss": 0.0201, "step": 5280 }, { "epoch": 1.4161973719495844, "grad_norm": 0.2740727744967442, "learning_rate": 6.3437575628420415e-06, "loss": 0.0199, "step": 5281 }, { "epoch": 1.4164655403593458, "grad_norm": 0.29438001216708637, "learning_rate": 6.342254710386685e-06, "loss": 0.0271, "step": 5282 }, { "epoch": 1.416733708769107, "grad_norm": 0.33155585095414847, "learning_rate": 6.340751727239551e-06, "loss": 0.0274, "step": 5283 }, { "epoch": 1.4170018771788684, "grad_norm": 0.4094600522951844, "learning_rate": 6.339248613546981e-06, "loss": 0.0321, "step": 5284 }, { "epoch": 1.4172700455886296, "grad_norm": 0.23928278020833182, "learning_rate": 6.33774536945533e-06, "loss": 0.023, "step": 5285 }, { "epoch": 1.417538213998391, "grad_norm": 0.25189107980082537, "learning_rate": 6.336241995110963e-06, "loss": 0.0202, "step": 5286 }, { "epoch": 1.4178063824081524, "grad_norm": 0.23110726629153236, "learning_rate": 6.3347384906602615e-06, "loss": 0.0215, "step": 5287 }, { "epoch": 1.4180745508179138, "grad_norm": 0.2477437644769945, "learning_rate": 6.333234856249616e-06, "loss": 0.0214, "step": 5288 }, { "epoch": 1.418342719227675, "grad_norm": 0.2452979555821637, "learning_rate": 6.331731092025433e-06, "loss": 0.0228, "step": 5289 }, { "epoch": 1.4186108876374364, "grad_norm": 0.33282911104338675, "learning_rate": 6.330227198134129e-06, "loss": 0.0279, "step": 5290 }, { "epoch": 1.4188790560471976, "grad_norm": 0.21087370513202716, "learning_rate": 6.328723174722133e-06, "loss": 0.0196, "step": 5291 }, { "epoch": 1.419147224456959, "grad_norm": 0.6791959302081488, "learning_rate": 6.3272190219358905e-06, "loss": 0.0381, "step": 5292 }, { "epoch": 1.4194153928667204, "grad_norm": 0.29527479029750564, "learning_rate": 6.325714739921857e-06, "loss": 0.0266, "step": 5293 }, { "epoch": 1.4196835612764818, "grad_norm": 0.29092415678884226, "learning_rate": 6.324210328826496e-06, "loss": 0.0252, "step": 5294 }, { "epoch": 1.419951729686243, "grad_norm": 0.17590664031497444, "learning_rate": 6.322705788796293e-06, "loss": 0.0143, "step": 5295 }, { "epoch": 1.4202198980960044, "grad_norm": 0.1954030057545939, "learning_rate": 6.321201119977738e-06, "loss": 0.0162, "step": 5296 }, { "epoch": 1.4204880665057655, "grad_norm": 0.5147688847097033, "learning_rate": 6.319696322517339e-06, "loss": 0.0418, "step": 5297 }, { "epoch": 1.420756234915527, "grad_norm": 0.28811284829650596, "learning_rate": 6.318191396561613e-06, "loss": 0.0298, "step": 5298 }, { "epoch": 1.4210244033252883, "grad_norm": 0.3596284113754197, "learning_rate": 6.316686342257088e-06, "loss": 0.0317, "step": 5299 }, { "epoch": 1.4212925717350497, "grad_norm": 0.26525356528217353, "learning_rate": 6.315181159750311e-06, "loss": 0.0235, "step": 5300 }, { "epoch": 1.421560740144811, "grad_norm": 0.19579851057319878, "learning_rate": 6.313675849187836e-06, "loss": 0.0166, "step": 5301 }, { "epoch": 1.4218289085545723, "grad_norm": 0.23903943648080928, "learning_rate": 6.312170410716231e-06, "loss": 0.0212, "step": 5302 }, { "epoch": 1.4220970769643335, "grad_norm": 0.2662655635446794, "learning_rate": 6.310664844482078e-06, "loss": 0.022, "step": 5303 }, { "epoch": 1.422365245374095, "grad_norm": 0.296284990351937, "learning_rate": 6.3091591506319695e-06, "loss": 0.0222, "step": 5304 }, { "epoch": 1.4226334137838563, "grad_norm": 0.3513022918152601, "learning_rate": 6.307653329312509e-06, "loss": 0.024, "step": 5305 }, { "epoch": 1.4229015821936175, "grad_norm": 0.2912085737295004, "learning_rate": 6.306147380670313e-06, "loss": 0.0178, "step": 5306 }, { "epoch": 1.423169750603379, "grad_norm": 0.2431643508910642, "learning_rate": 6.304641304852017e-06, "loss": 0.0209, "step": 5307 }, { "epoch": 1.4234379190131403, "grad_norm": 0.26397467506014016, "learning_rate": 6.303135102004261e-06, "loss": 0.029, "step": 5308 }, { "epoch": 1.4237060874229015, "grad_norm": 0.2451141481956949, "learning_rate": 6.3016287722737015e-06, "loss": 0.021, "step": 5309 }, { "epoch": 1.423974255832663, "grad_norm": 0.31323563700490503, "learning_rate": 6.300122315807003e-06, "loss": 0.022, "step": 5310 }, { "epoch": 1.4242424242424243, "grad_norm": 0.2689697616355054, "learning_rate": 6.298615732750845e-06, "loss": 0.0345, "step": 5311 }, { "epoch": 1.4245105926521855, "grad_norm": 0.2515621802688394, "learning_rate": 6.297109023251919e-06, "loss": 0.0243, "step": 5312 }, { "epoch": 1.424778761061947, "grad_norm": 0.287115642051253, "learning_rate": 6.295602187456935e-06, "loss": 0.0317, "step": 5313 }, { "epoch": 1.4250469294717083, "grad_norm": 0.30084846365717655, "learning_rate": 6.294095225512604e-06, "loss": 0.0246, "step": 5314 }, { "epoch": 1.4253150978814695, "grad_norm": 0.5250332356935385, "learning_rate": 6.292588137565657e-06, "loss": 0.0161, "step": 5315 }, { "epoch": 1.4255832662912309, "grad_norm": 0.2009372908100997, "learning_rate": 6.291080923762836e-06, "loss": 0.0171, "step": 5316 }, { "epoch": 1.4258514347009923, "grad_norm": 0.25315568040806347, "learning_rate": 6.289573584250892e-06, "loss": 0.0165, "step": 5317 }, { "epoch": 1.4261196031107535, "grad_norm": 0.34238494980140943, "learning_rate": 6.2880661191765916e-06, "loss": 0.0242, "step": 5318 }, { "epoch": 1.4263877715205149, "grad_norm": 0.29577184021758646, "learning_rate": 6.286558528686713e-06, "loss": 0.0235, "step": 5319 }, { "epoch": 1.4266559399302763, "grad_norm": 0.33966007520380315, "learning_rate": 6.285050812928047e-06, "loss": 0.0239, "step": 5320 }, { "epoch": 1.4269241083400375, "grad_norm": 0.22092185642465098, "learning_rate": 6.283542972047394e-06, "loss": 0.019, "step": 5321 }, { "epoch": 1.4271922767497989, "grad_norm": 0.31381530525257817, "learning_rate": 6.28203500619157e-06, "loss": 0.0324, "step": 5322 }, { "epoch": 1.4274604451595603, "grad_norm": 0.22676412223518147, "learning_rate": 6.280526915507401e-06, "loss": 0.0187, "step": 5323 }, { "epoch": 1.4277286135693215, "grad_norm": 0.3388827535947665, "learning_rate": 6.279018700141727e-06, "loss": 0.0262, "step": 5324 }, { "epoch": 1.4279967819790829, "grad_norm": 0.2531809595948373, "learning_rate": 6.277510360241397e-06, "loss": 0.0251, "step": 5325 }, { "epoch": 1.4282649503888443, "grad_norm": 0.2514353170350491, "learning_rate": 6.276001895953276e-06, "loss": 0.017, "step": 5326 }, { "epoch": 1.4285331187986055, "grad_norm": 0.34551389676483873, "learning_rate": 6.274493307424237e-06, "loss": 0.0353, "step": 5327 }, { "epoch": 1.4288012872083669, "grad_norm": 0.27627166393599467, "learning_rate": 6.272984594801168e-06, "loss": 0.0265, "step": 5328 }, { "epoch": 1.4290694556181283, "grad_norm": 0.3529716224944672, "learning_rate": 6.271475758230969e-06, "loss": 0.0266, "step": 5329 }, { "epoch": 1.4293376240278894, "grad_norm": 0.238325768865046, "learning_rate": 6.2699667978605514e-06, "loss": 0.0202, "step": 5330 }, { "epoch": 1.4296057924376508, "grad_norm": 0.24948170084339144, "learning_rate": 6.268457713836839e-06, "loss": 0.0266, "step": 5331 }, { "epoch": 1.4298739608474123, "grad_norm": 0.24790634538282338, "learning_rate": 6.266948506306767e-06, "loss": 0.0228, "step": 5332 }, { "epoch": 1.4301421292571734, "grad_norm": 0.1966884924329599, "learning_rate": 6.265439175417282e-06, "loss": 0.0163, "step": 5333 }, { "epoch": 1.4304102976669348, "grad_norm": 0.2746508998224272, "learning_rate": 6.263929721315343e-06, "loss": 0.0257, "step": 5334 }, { "epoch": 1.4306784660766962, "grad_norm": 0.2831131848300995, "learning_rate": 6.262420144147924e-06, "loss": 0.0315, "step": 5335 }, { "epoch": 1.4309466344864574, "grad_norm": 0.27531355947121033, "learning_rate": 6.260910444062007e-06, "loss": 0.0216, "step": 5336 }, { "epoch": 1.4312148028962188, "grad_norm": 0.2284117946181391, "learning_rate": 6.2594006212045874e-06, "loss": 0.0259, "step": 5337 }, { "epoch": 1.4314829713059802, "grad_norm": 0.3635535174231931, "learning_rate": 6.257890675722673e-06, "loss": 0.0332, "step": 5338 }, { "epoch": 1.4317511397157414, "grad_norm": 0.2597549147439196, "learning_rate": 6.256380607763283e-06, "loss": 0.0301, "step": 5339 }, { "epoch": 1.4320193081255028, "grad_norm": 0.30154118391154877, "learning_rate": 6.25487041747345e-06, "loss": 0.0256, "step": 5340 }, { "epoch": 1.4322874765352642, "grad_norm": 0.25394083036931275, "learning_rate": 6.253360105000215e-06, "loss": 0.0278, "step": 5341 }, { "epoch": 1.4325556449450254, "grad_norm": 0.32571866178191444, "learning_rate": 6.251849670490634e-06, "loss": 0.0316, "step": 5342 }, { "epoch": 1.4328238133547868, "grad_norm": 0.2573370619177963, "learning_rate": 6.250339114091775e-06, "loss": 0.0208, "step": 5343 }, { "epoch": 1.4330919817645482, "grad_norm": 0.27500914381349895, "learning_rate": 6.248828435950714e-06, "loss": 0.0222, "step": 5344 }, { "epoch": 1.4333601501743094, "grad_norm": 0.34246373123089185, "learning_rate": 6.247317636214546e-06, "loss": 0.0307, "step": 5345 }, { "epoch": 1.4336283185840708, "grad_norm": 0.2243444086184642, "learning_rate": 6.2458067150303715e-06, "loss": 0.0171, "step": 5346 }, { "epoch": 1.4338964869938322, "grad_norm": 0.37676339920252094, "learning_rate": 6.244295672545304e-06, "loss": 0.0244, "step": 5347 }, { "epoch": 1.4341646554035934, "grad_norm": 0.4287692486266675, "learning_rate": 6.24278450890647e-06, "loss": 0.0254, "step": 5348 }, { "epoch": 1.4344328238133548, "grad_norm": 0.3084207538886438, "learning_rate": 6.241273224261007e-06, "loss": 0.0192, "step": 5349 }, { "epoch": 1.434700992223116, "grad_norm": 0.3092660479953778, "learning_rate": 6.239761818756067e-06, "loss": 0.0288, "step": 5350 }, { "epoch": 1.4349691606328774, "grad_norm": 0.2897819533030703, "learning_rate": 6.23825029253881e-06, "loss": 0.0301, "step": 5351 }, { "epoch": 1.4352373290426388, "grad_norm": 0.38790822378766804, "learning_rate": 6.236738645756412e-06, "loss": 0.028, "step": 5352 }, { "epoch": 1.4355054974524002, "grad_norm": 0.312685517382252, "learning_rate": 6.235226878556052e-06, "loss": 0.0223, "step": 5353 }, { "epoch": 1.4357736658621614, "grad_norm": 0.4029924728266074, "learning_rate": 6.233714991084931e-06, "loss": 0.0321, "step": 5354 }, { "epoch": 1.4360418342719228, "grad_norm": 0.2589231604220812, "learning_rate": 6.2322029834902565e-06, "loss": 0.0253, "step": 5355 }, { "epoch": 1.436310002681684, "grad_norm": 0.18906847993576714, "learning_rate": 6.23069085591925e-06, "loss": 0.0172, "step": 5356 }, { "epoch": 1.4365781710914454, "grad_norm": 0.26724489482962327, "learning_rate": 6.229178608519142e-06, "loss": 0.0223, "step": 5357 }, { "epoch": 1.4368463395012068, "grad_norm": 0.31589357740095964, "learning_rate": 6.227666241437177e-06, "loss": 0.0266, "step": 5358 }, { "epoch": 1.4371145079109682, "grad_norm": 0.23280446922185238, "learning_rate": 6.2261537548206084e-06, "loss": 0.0168, "step": 5359 }, { "epoch": 1.4373826763207294, "grad_norm": 0.24234337781626092, "learning_rate": 6.2246411488167035e-06, "loss": 0.0226, "step": 5360 }, { "epoch": 1.4376508447304908, "grad_norm": 0.2947696170753836, "learning_rate": 6.2231284235727426e-06, "loss": 0.0258, "step": 5361 }, { "epoch": 1.437919013140252, "grad_norm": 0.2898646835357457, "learning_rate": 6.2216155792360144e-06, "loss": 0.026, "step": 5362 }, { "epoch": 1.4381871815500133, "grad_norm": 0.34950747799872084, "learning_rate": 6.22010261595382e-06, "loss": 0.0287, "step": 5363 }, { "epoch": 1.4384553499597748, "grad_norm": 0.2823247912419688, "learning_rate": 6.218589533873474e-06, "loss": 0.02, "step": 5364 }, { "epoch": 1.4387235183695362, "grad_norm": 0.37375612660157514, "learning_rate": 6.217076333142299e-06, "loss": 0.0267, "step": 5365 }, { "epoch": 1.4389916867792973, "grad_norm": 0.3170426325065319, "learning_rate": 6.215563013907633e-06, "loss": 0.0259, "step": 5366 }, { "epoch": 1.4392598551890587, "grad_norm": 0.32428165033529943, "learning_rate": 6.214049576316824e-06, "loss": 0.0266, "step": 5367 }, { "epoch": 1.43952802359882, "grad_norm": 0.39350757506337875, "learning_rate": 6.212536020517231e-06, "loss": 0.0208, "step": 5368 }, { "epoch": 1.4397961920085813, "grad_norm": 0.2894035195973142, "learning_rate": 6.211022346656226e-06, "loss": 0.0254, "step": 5369 }, { "epoch": 1.4400643604183427, "grad_norm": 0.3397095198647925, "learning_rate": 6.209508554881189e-06, "loss": 0.0308, "step": 5370 }, { "epoch": 1.4403325288281041, "grad_norm": 0.3856211192662421, "learning_rate": 6.207994645339514e-06, "loss": 0.034, "step": 5371 }, { "epoch": 1.4406006972378653, "grad_norm": 0.5271103929955762, "learning_rate": 6.20648061817861e-06, "loss": 0.0444, "step": 5372 }, { "epoch": 1.4408688656476267, "grad_norm": 0.22365516816898437, "learning_rate": 6.20496647354589e-06, "loss": 0.0188, "step": 5373 }, { "epoch": 1.441137034057388, "grad_norm": 0.28126830149709386, "learning_rate": 6.203452211588784e-06, "loss": 0.0218, "step": 5374 }, { "epoch": 1.4414052024671493, "grad_norm": 0.30106082794817834, "learning_rate": 6.201937832454732e-06, "loss": 0.0266, "step": 5375 }, { "epoch": 1.4416733708769107, "grad_norm": 0.3926781714014268, "learning_rate": 6.200423336291184e-06, "loss": 0.0229, "step": 5376 }, { "epoch": 1.4419415392866721, "grad_norm": 0.35126430458160074, "learning_rate": 6.198908723245601e-06, "loss": 0.0294, "step": 5377 }, { "epoch": 1.4422097076964333, "grad_norm": 0.277438551264075, "learning_rate": 6.1973939934654606e-06, "loss": 0.026, "step": 5378 }, { "epoch": 1.4424778761061947, "grad_norm": 0.2719039380982307, "learning_rate": 6.195879147098246e-06, "loss": 0.0277, "step": 5379 }, { "epoch": 1.442746044515956, "grad_norm": 0.2488959067645933, "learning_rate": 6.194364184291452e-06, "loss": 0.0219, "step": 5380 }, { "epoch": 1.4430142129257173, "grad_norm": 0.28226131451840397, "learning_rate": 6.192849105192591e-06, "loss": 0.0257, "step": 5381 }, { "epoch": 1.4432823813354787, "grad_norm": 2.656061093999901, "learning_rate": 6.191333909949178e-06, "loss": 0.0329, "step": 5382 }, { "epoch": 1.44355054974524, "grad_norm": 0.3199529356714819, "learning_rate": 6.189818598708746e-06, "loss": 0.0265, "step": 5383 }, { "epoch": 1.4438187181550013, "grad_norm": 0.31335336995284435, "learning_rate": 6.188303171618835e-06, "loss": 0.0277, "step": 5384 }, { "epoch": 1.4440868865647627, "grad_norm": 0.2001070001128666, "learning_rate": 6.186787628826999e-06, "loss": 0.0163, "step": 5385 }, { "epoch": 1.4443550549745239, "grad_norm": 0.3590382991931159, "learning_rate": 6.185271970480803e-06, "loss": 0.0401, "step": 5386 }, { "epoch": 1.4446232233842853, "grad_norm": 0.4490398866323869, "learning_rate": 6.183756196727821e-06, "loss": 0.0276, "step": 5387 }, { "epoch": 1.4448913917940467, "grad_norm": 0.49620282453467407, "learning_rate": 6.1822403077156425e-06, "loss": 0.0281, "step": 5388 }, { "epoch": 1.445159560203808, "grad_norm": 0.3056842327650162, "learning_rate": 6.180724303591863e-06, "loss": 0.0194, "step": 5389 }, { "epoch": 1.4454277286135693, "grad_norm": 0.25076202895367433, "learning_rate": 6.179208184504094e-06, "loss": 0.0242, "step": 5390 }, { "epoch": 1.4456958970233307, "grad_norm": 0.3380564454144418, "learning_rate": 6.177691950599953e-06, "loss": 0.0288, "step": 5391 }, { "epoch": 1.4459640654330919, "grad_norm": 0.2661658336252339, "learning_rate": 6.176175602027075e-06, "loss": 0.0249, "step": 5392 }, { "epoch": 1.4462322338428533, "grad_norm": 0.30951523977936357, "learning_rate": 6.1746591389331e-06, "loss": 0.0292, "step": 5393 }, { "epoch": 1.4465004022526147, "grad_norm": 0.35093535702746337, "learning_rate": 6.173142561465685e-06, "loss": 0.0229, "step": 5394 }, { "epoch": 1.446768570662376, "grad_norm": 0.2905202282933159, "learning_rate": 6.171625869772493e-06, "loss": 0.0252, "step": 5395 }, { "epoch": 1.4470367390721373, "grad_norm": 0.28793645631755854, "learning_rate": 6.1701090640012e-06, "loss": 0.0169, "step": 5396 }, { "epoch": 1.4473049074818987, "grad_norm": 0.2918871024965385, "learning_rate": 6.168592144299493e-06, "loss": 0.0185, "step": 5397 }, { "epoch": 1.4475730758916598, "grad_norm": 0.3992701886046388, "learning_rate": 6.167075110815073e-06, "loss": 0.0236, "step": 5398 }, { "epoch": 1.4478412443014212, "grad_norm": 0.5432146900149479, "learning_rate": 6.165557963695648e-06, "loss": 0.0324, "step": 5399 }, { "epoch": 1.4481094127111827, "grad_norm": 0.22214525160955245, "learning_rate": 6.164040703088938e-06, "loss": 0.0171, "step": 5400 }, { "epoch": 1.448377581120944, "grad_norm": 0.27235723109764803, "learning_rate": 6.162523329142675e-06, "loss": 0.0177, "step": 5401 }, { "epoch": 1.4486457495307052, "grad_norm": 0.5863557922518207, "learning_rate": 6.161005842004603e-06, "loss": 0.0238, "step": 5402 }, { "epoch": 1.4489139179404666, "grad_norm": 0.3458640342663173, "learning_rate": 6.159488241822473e-06, "loss": 0.036, "step": 5403 }, { "epoch": 1.4491820863502278, "grad_norm": 0.44968478529302625, "learning_rate": 6.157970528744052e-06, "loss": 0.0386, "step": 5404 }, { "epoch": 1.4494502547599892, "grad_norm": 0.2481024175759677, "learning_rate": 6.156452702917115e-06, "loss": 0.0224, "step": 5405 }, { "epoch": 1.4497184231697506, "grad_norm": 0.2767639939105911, "learning_rate": 6.154934764489451e-06, "loss": 0.0227, "step": 5406 }, { "epoch": 1.449986591579512, "grad_norm": 0.28716338790752344, "learning_rate": 6.1534167136088526e-06, "loss": 0.0179, "step": 5407 }, { "epoch": 1.4502547599892732, "grad_norm": 0.256288954001623, "learning_rate": 6.15189855042313e-06, "loss": 0.0194, "step": 5408 }, { "epoch": 1.4505229283990346, "grad_norm": 0.33969281969695686, "learning_rate": 6.150380275080106e-06, "loss": 0.0334, "step": 5409 }, { "epoch": 1.4507910968087958, "grad_norm": 0.2768410126681915, "learning_rate": 6.148861887727607e-06, "loss": 0.0252, "step": 5410 }, { "epoch": 1.4510592652185572, "grad_norm": 0.8544553061361833, "learning_rate": 6.147343388513478e-06, "loss": 0.0331, "step": 5411 }, { "epoch": 1.4513274336283186, "grad_norm": 0.2713558423907453, "learning_rate": 6.145824777585569e-06, "loss": 0.0281, "step": 5412 }, { "epoch": 1.45159560203808, "grad_norm": 0.22835967825886871, "learning_rate": 6.144306055091743e-06, "loss": 0.0255, "step": 5413 }, { "epoch": 1.4518637704478412, "grad_norm": 0.29043065678398794, "learning_rate": 6.142787221179874e-06, "loss": 0.0245, "step": 5414 }, { "epoch": 1.4521319388576026, "grad_norm": 0.3105668017941954, "learning_rate": 6.141268275997848e-06, "loss": 0.0218, "step": 5415 }, { "epoch": 1.4524001072673638, "grad_norm": 0.20940118728885287, "learning_rate": 6.1397492196935605e-06, "loss": 0.0166, "step": 5416 }, { "epoch": 1.4526682756771252, "grad_norm": 0.27948167936148255, "learning_rate": 6.1382300524149175e-06, "loss": 0.0307, "step": 5417 }, { "epoch": 1.4529364440868866, "grad_norm": 0.2369622337780405, "learning_rate": 6.136710774309836e-06, "loss": 0.0262, "step": 5418 }, { "epoch": 1.453204612496648, "grad_norm": 0.24471813881081952, "learning_rate": 6.1351913855262435e-06, "loss": 0.0176, "step": 5419 }, { "epoch": 1.4534727809064092, "grad_norm": 0.251004682865238, "learning_rate": 6.13367188621208e-06, "loss": 0.0183, "step": 5420 }, { "epoch": 1.4537409493161706, "grad_norm": 0.36163705295938214, "learning_rate": 6.1321522765152955e-06, "loss": 0.0347, "step": 5421 }, { "epoch": 1.4540091177259318, "grad_norm": 0.3215894335482965, "learning_rate": 6.130632556583849e-06, "loss": 0.0252, "step": 5422 }, { "epoch": 1.4542772861356932, "grad_norm": 0.3460147679776799, "learning_rate": 6.129112726565711e-06, "loss": 0.0295, "step": 5423 }, { "epoch": 1.4545454545454546, "grad_norm": 0.48694449343876706, "learning_rate": 6.127592786608867e-06, "loss": 0.0321, "step": 5424 }, { "epoch": 1.454813622955216, "grad_norm": 0.37432867130095054, "learning_rate": 6.1260727368613045e-06, "loss": 0.0321, "step": 5425 }, { "epoch": 1.4550817913649772, "grad_norm": 0.23627940977878795, "learning_rate": 6.124552577471032e-06, "loss": 0.0254, "step": 5426 }, { "epoch": 1.4553499597747386, "grad_norm": 0.27414333772204913, "learning_rate": 6.123032308586059e-06, "loss": 0.0164, "step": 5427 }, { "epoch": 1.4556181281844998, "grad_norm": 0.2512431065160774, "learning_rate": 6.121511930354413e-06, "loss": 0.023, "step": 5428 }, { "epoch": 1.4558862965942612, "grad_norm": 0.481374583153334, "learning_rate": 6.119991442924126e-06, "loss": 0.0291, "step": 5429 }, { "epoch": 1.4561544650040226, "grad_norm": 0.29528617302655935, "learning_rate": 6.118470846443247e-06, "loss": 0.0186, "step": 5430 }, { "epoch": 1.456422633413784, "grad_norm": 0.2643976611022459, "learning_rate": 6.11695014105983e-06, "loss": 0.0254, "step": 5431 }, { "epoch": 1.4566908018235452, "grad_norm": 0.19463257263029335, "learning_rate": 6.115429326921944e-06, "loss": 0.0251, "step": 5432 }, { "epoch": 1.4569589702333066, "grad_norm": 0.2890816914521631, "learning_rate": 6.1139084041776665e-06, "loss": 0.0192, "step": 5433 }, { "epoch": 1.4572271386430677, "grad_norm": 0.34849557744052806, "learning_rate": 6.112387372975084e-06, "loss": 0.0287, "step": 5434 }, { "epoch": 1.4574953070528291, "grad_norm": 0.2816684641889638, "learning_rate": 6.110866233462296e-06, "loss": 0.0262, "step": 5435 }, { "epoch": 1.4577634754625906, "grad_norm": 0.3456625339070116, "learning_rate": 6.109344985787413e-06, "loss": 0.0246, "step": 5436 }, { "epoch": 1.458031643872352, "grad_norm": 0.28936340843726444, "learning_rate": 6.1078236300985546e-06, "loss": 0.021, "step": 5437 }, { "epoch": 1.4582998122821131, "grad_norm": 0.4005875167768937, "learning_rate": 6.10630216654385e-06, "loss": 0.0326, "step": 5438 }, { "epoch": 1.4585679806918745, "grad_norm": 0.2900700953682369, "learning_rate": 6.10478059527144e-06, "loss": 0.0265, "step": 5439 }, { "epoch": 1.4588361491016357, "grad_norm": 0.24161069917715003, "learning_rate": 6.103258916429476e-06, "loss": 0.0196, "step": 5440 }, { "epoch": 1.4591043175113971, "grad_norm": 0.24877202089521064, "learning_rate": 6.101737130166122e-06, "loss": 0.0218, "step": 5441 }, { "epoch": 1.4593724859211585, "grad_norm": 0.3376080151510017, "learning_rate": 6.100215236629549e-06, "loss": 0.0262, "step": 5442 }, { "epoch": 1.45964065433092, "grad_norm": 0.30126873974226254, "learning_rate": 6.098693235967938e-06, "loss": 0.0257, "step": 5443 }, { "epoch": 1.4599088227406811, "grad_norm": 0.40290923525559247, "learning_rate": 6.097171128329486e-06, "loss": 0.0301, "step": 5444 }, { "epoch": 1.4601769911504425, "grad_norm": 0.21616161716749552, "learning_rate": 6.095648913862391e-06, "loss": 0.0166, "step": 5445 }, { "epoch": 1.4604451595602037, "grad_norm": 0.2706419875647318, "learning_rate": 6.094126592714871e-06, "loss": 0.0231, "step": 5446 }, { "epoch": 1.4607133279699651, "grad_norm": 0.28580643931958816, "learning_rate": 6.09260416503515e-06, "loss": 0.0225, "step": 5447 }, { "epoch": 1.4609814963797265, "grad_norm": 0.29414059145657817, "learning_rate": 6.091081630971462e-06, "loss": 0.0248, "step": 5448 }, { "epoch": 1.461249664789488, "grad_norm": 0.3957696831297019, "learning_rate": 6.0895589906720535e-06, "loss": 0.0206, "step": 5449 }, { "epoch": 1.461517833199249, "grad_norm": 0.2622825582085395, "learning_rate": 6.088036244285178e-06, "loss": 0.0212, "step": 5450 }, { "epoch": 1.4617860016090105, "grad_norm": 0.41511455823834637, "learning_rate": 6.086513391959101e-06, "loss": 0.0263, "step": 5451 }, { "epoch": 1.4620541700187717, "grad_norm": 0.24889719476700797, "learning_rate": 6.0849904338421e-06, "loss": 0.0207, "step": 5452 }, { "epoch": 1.462322338428533, "grad_norm": 0.22872603468769862, "learning_rate": 6.083467370082462e-06, "loss": 0.0209, "step": 5453 }, { "epoch": 1.4625905068382945, "grad_norm": 0.29901523655128687, "learning_rate": 6.0819442008284825e-06, "loss": 0.0257, "step": 5454 }, { "epoch": 1.462858675248056, "grad_norm": 0.1790766409342946, "learning_rate": 6.0804209262284695e-06, "loss": 0.0158, "step": 5455 }, { "epoch": 1.463126843657817, "grad_norm": 0.2933995115633074, "learning_rate": 6.078897546430738e-06, "loss": 0.0267, "step": 5456 }, { "epoch": 1.4633950120675785, "grad_norm": 0.5929806845358147, "learning_rate": 6.077374061583618e-06, "loss": 0.03, "step": 5457 }, { "epoch": 1.4636631804773397, "grad_norm": 0.23504409614451224, "learning_rate": 6.075850471835445e-06, "loss": 0.0193, "step": 5458 }, { "epoch": 1.463931348887101, "grad_norm": 0.289579617116702, "learning_rate": 6.0743267773345675e-06, "loss": 0.0262, "step": 5459 }, { "epoch": 1.4641995172968625, "grad_norm": 0.2889018562724464, "learning_rate": 6.072802978229345e-06, "loss": 0.0242, "step": 5460 }, { "epoch": 1.464467685706624, "grad_norm": 0.22262130688063334, "learning_rate": 6.071279074668144e-06, "loss": 0.0212, "step": 5461 }, { "epoch": 1.464735854116385, "grad_norm": 0.2735253346267087, "learning_rate": 6.069755066799344e-06, "loss": 0.0253, "step": 5462 }, { "epoch": 1.4650040225261465, "grad_norm": 0.2665191785600146, "learning_rate": 6.068230954771334e-06, "loss": 0.024, "step": 5463 }, { "epoch": 1.4652721909359077, "grad_norm": 0.6899917675183308, "learning_rate": 6.066706738732511e-06, "loss": 0.0252, "step": 5464 }, { "epoch": 1.465540359345669, "grad_norm": 0.34312010097158707, "learning_rate": 6.065182418831286e-06, "loss": 0.0256, "step": 5465 }, { "epoch": 1.4658085277554305, "grad_norm": 0.5130515776071124, "learning_rate": 6.063657995216076e-06, "loss": 0.0245, "step": 5466 }, { "epoch": 1.4660766961651919, "grad_norm": 4.286859314422194, "learning_rate": 6.062133468035311e-06, "loss": 0.031, "step": 5467 }, { "epoch": 1.466344864574953, "grad_norm": 0.7128010720138436, "learning_rate": 6.06060883743743e-06, "loss": 0.0231, "step": 5468 }, { "epoch": 1.4666130329847145, "grad_norm": 0.17554618634067015, "learning_rate": 6.059084103570882e-06, "loss": 0.0132, "step": 5469 }, { "epoch": 1.4668812013944756, "grad_norm": 2.7022875264643766, "learning_rate": 6.057559266584128e-06, "loss": 0.0235, "step": 5470 }, { "epoch": 1.467149369804237, "grad_norm": 0.3911127454031509, "learning_rate": 6.056034326625635e-06, "loss": 0.0329, "step": 5471 }, { "epoch": 1.4674175382139985, "grad_norm": 0.41996558270268564, "learning_rate": 6.054509283843884e-06, "loss": 0.0345, "step": 5472 }, { "epoch": 1.4676857066237599, "grad_norm": 3.7022715726398485, "learning_rate": 6.052984138387364e-06, "loss": 0.0402, "step": 5473 }, { "epoch": 1.467953875033521, "grad_norm": 0.2835899790683567, "learning_rate": 6.051458890404573e-06, "loss": 0.0259, "step": 5474 }, { "epoch": 1.4682220434432824, "grad_norm": 0.24990812161338463, "learning_rate": 6.0499335400440216e-06, "loss": 0.0215, "step": 5475 }, { "epoch": 1.4684902118530436, "grad_norm": 0.23447965147836516, "learning_rate": 6.048408087454228e-06, "loss": 0.0231, "step": 5476 }, { "epoch": 1.468758380262805, "grad_norm": 0.8289793989645695, "learning_rate": 6.046882532783724e-06, "loss": 0.031, "step": 5477 }, { "epoch": 1.4690265486725664, "grad_norm": 0.2534794667819891, "learning_rate": 6.045356876181045e-06, "loss": 0.0289, "step": 5478 }, { "epoch": 1.4692947170823278, "grad_norm": 0.24948837704737659, "learning_rate": 6.043831117794746e-06, "loss": 0.026, "step": 5479 }, { "epoch": 1.469562885492089, "grad_norm": 0.20815076741510852, "learning_rate": 6.04230525777338e-06, "loss": 0.0175, "step": 5480 }, { "epoch": 1.4698310539018504, "grad_norm": 0.4398409332815767, "learning_rate": 6.040779296265519e-06, "loss": 0.0241, "step": 5481 }, { "epoch": 1.4700992223116116, "grad_norm": 0.23037170449282124, "learning_rate": 6.03925323341974e-06, "loss": 0.0186, "step": 5482 }, { "epoch": 1.470367390721373, "grad_norm": 0.2108856480755226, "learning_rate": 6.037727069384633e-06, "loss": 0.0152, "step": 5483 }, { "epoch": 1.4706355591311344, "grad_norm": 0.2675989913269997, "learning_rate": 6.036200804308798e-06, "loss": 0.0282, "step": 5484 }, { "epoch": 1.4709037275408956, "grad_norm": 0.3024248068631541, "learning_rate": 6.034674438340842e-06, "loss": 0.0313, "step": 5485 }, { "epoch": 1.471171895950657, "grad_norm": 0.2725665857246259, "learning_rate": 6.033147971629381e-06, "loss": 0.0254, "step": 5486 }, { "epoch": 1.4714400643604184, "grad_norm": 0.21102537000935545, "learning_rate": 6.031621404323046e-06, "loss": 0.0159, "step": 5487 }, { "epoch": 1.4717082327701796, "grad_norm": 0.3055529882076157, "learning_rate": 6.030094736570472e-06, "loss": 0.0264, "step": 5488 }, { "epoch": 1.471976401179941, "grad_norm": 0.2306060368839079, "learning_rate": 6.02856796852031e-06, "loss": 0.0185, "step": 5489 }, { "epoch": 1.4722445695897024, "grad_norm": 0.2957048053302528, "learning_rate": 6.027041100321216e-06, "loss": 0.0224, "step": 5490 }, { "epoch": 1.4725127379994636, "grad_norm": 0.22119621726835992, "learning_rate": 6.025514132121857e-06, "loss": 0.0141, "step": 5491 }, { "epoch": 1.472780906409225, "grad_norm": 0.3291149477952848, "learning_rate": 6.0239870640709084e-06, "loss": 0.0305, "step": 5492 }, { "epoch": 1.4730490748189864, "grad_norm": 0.48902882079114485, "learning_rate": 6.022459896317058e-06, "loss": 0.0341, "step": 5493 }, { "epoch": 1.4733172432287476, "grad_norm": 0.5681868465548964, "learning_rate": 6.020932629009003e-06, "loss": 0.0236, "step": 5494 }, { "epoch": 1.473585411638509, "grad_norm": 0.3127715697611482, "learning_rate": 6.019405262295448e-06, "loss": 0.0243, "step": 5495 }, { "epoch": 1.4738535800482704, "grad_norm": 0.31903810291132906, "learning_rate": 6.017877796325109e-06, "loss": 0.0233, "step": 5496 }, { "epoch": 1.4741217484580316, "grad_norm": 0.23545837119695515, "learning_rate": 6.016350231246713e-06, "loss": 0.019, "step": 5497 }, { "epoch": 1.474389916867793, "grad_norm": 0.3950414886658713, "learning_rate": 6.014822567208991e-06, "loss": 0.0265, "step": 5498 }, { "epoch": 1.4746580852775544, "grad_norm": 0.26107162474385875, "learning_rate": 6.013294804360689e-06, "loss": 0.0276, "step": 5499 }, { "epoch": 1.4749262536873156, "grad_norm": 0.2726101585297778, "learning_rate": 6.011766942850565e-06, "loss": 0.0278, "step": 5500 }, { "epoch": 1.475194422097077, "grad_norm": 0.49028381985058334, "learning_rate": 6.010238982827379e-06, "loss": 0.0233, "step": 5501 }, { "epoch": 1.4754625905068384, "grad_norm": 0.2728249185938734, "learning_rate": 6.008710924439906e-06, "loss": 0.0201, "step": 5502 }, { "epoch": 1.4757307589165995, "grad_norm": 0.3941689004766452, "learning_rate": 6.0071827678369275e-06, "loss": 0.0324, "step": 5503 }, { "epoch": 1.475998927326361, "grad_norm": 0.2826319405596248, "learning_rate": 6.005654513167236e-06, "loss": 0.0234, "step": 5504 }, { "epoch": 1.4762670957361224, "grad_norm": 0.26923757130186193, "learning_rate": 6.004126160579636e-06, "loss": 0.0285, "step": 5505 }, { "epoch": 1.4765352641458835, "grad_norm": 1.1364890871828919, "learning_rate": 6.002597710222937e-06, "loss": 0.027, "step": 5506 }, { "epoch": 1.476803432555645, "grad_norm": 0.32964184896064597, "learning_rate": 6.001069162245961e-06, "loss": 0.0233, "step": 5507 }, { "epoch": 1.4770716009654064, "grad_norm": 0.37049448029540216, "learning_rate": 5.99954051679754e-06, "loss": 0.0385, "step": 5508 }, { "epoch": 1.4773397693751675, "grad_norm": 0.3298870980573362, "learning_rate": 5.99801177402651e-06, "loss": 0.0272, "step": 5509 }, { "epoch": 1.477607937784929, "grad_norm": 0.27298646530649634, "learning_rate": 5.9964829340817245e-06, "loss": 0.0226, "step": 5510 }, { "epoch": 1.4778761061946903, "grad_norm": 0.2075410674078432, "learning_rate": 5.9949539971120405e-06, "loss": 0.0232, "step": 5511 }, { "epoch": 1.4781442746044515, "grad_norm": 0.25537790265889376, "learning_rate": 5.9934249632663284e-06, "loss": 0.022, "step": 5512 }, { "epoch": 1.478412443014213, "grad_norm": 0.2730132910531634, "learning_rate": 5.991895832693465e-06, "loss": 0.0247, "step": 5513 }, { "epoch": 1.4786806114239743, "grad_norm": 0.3094383438004259, "learning_rate": 5.990366605542337e-06, "loss": 0.0264, "step": 5514 }, { "epoch": 1.4789487798337355, "grad_norm": 0.29112896158730067, "learning_rate": 5.988837281961843e-06, "loss": 0.0238, "step": 5515 }, { "epoch": 1.479216948243497, "grad_norm": 0.29883686047691954, "learning_rate": 5.987307862100889e-06, "loss": 0.0229, "step": 5516 }, { "epoch": 1.4794851166532583, "grad_norm": 0.2555849103254526, "learning_rate": 5.9857783461083876e-06, "loss": 0.0228, "step": 5517 }, { "epoch": 1.4797532850630195, "grad_norm": 0.24502050290903835, "learning_rate": 5.9842487341332664e-06, "loss": 0.0179, "step": 5518 }, { "epoch": 1.480021453472781, "grad_norm": 0.28595293924555465, "learning_rate": 5.982719026324459e-06, "loss": 0.0247, "step": 5519 }, { "epoch": 1.4802896218825423, "grad_norm": 0.3072219668312463, "learning_rate": 5.98118922283091e-06, "loss": 0.0286, "step": 5520 }, { "epoch": 1.4805577902923035, "grad_norm": 0.2894312890121414, "learning_rate": 5.97965932380157e-06, "loss": 0.0299, "step": 5521 }, { "epoch": 1.480825958702065, "grad_norm": 0.2665255298877472, "learning_rate": 5.978129329385404e-06, "loss": 0.024, "step": 5522 }, { "epoch": 1.4810941271118263, "grad_norm": 0.2317873720728504, "learning_rate": 5.976599239731381e-06, "loss": 0.0193, "step": 5523 }, { "epoch": 1.4813622955215875, "grad_norm": 0.3793191612637848, "learning_rate": 5.975069054988484e-06, "loss": 0.0393, "step": 5524 }, { "epoch": 1.481630463931349, "grad_norm": 0.23471341090092424, "learning_rate": 5.9735387753057e-06, "loss": 0.0191, "step": 5525 }, { "epoch": 1.4818986323411103, "grad_norm": 0.20109425842193432, "learning_rate": 5.972008400832031e-06, "loss": 0.0195, "step": 5526 }, { "epoch": 1.4821668007508715, "grad_norm": 0.2399398648672672, "learning_rate": 5.970477931716486e-06, "loss": 0.0172, "step": 5527 }, { "epoch": 1.4824349691606329, "grad_norm": 0.2544584432268763, "learning_rate": 5.96894736810808e-06, "loss": 0.024, "step": 5528 }, { "epoch": 1.482703137570394, "grad_norm": 0.23286021521581438, "learning_rate": 5.967416710155842e-06, "loss": 0.0256, "step": 5529 }, { "epoch": 1.4829713059801555, "grad_norm": 0.29833414152397497, "learning_rate": 5.965885958008807e-06, "loss": 0.0241, "step": 5530 }, { "epoch": 1.4832394743899169, "grad_norm": 0.27925084043506393, "learning_rate": 5.964355111816019e-06, "loss": 0.0221, "step": 5531 }, { "epoch": 1.4835076427996783, "grad_norm": 0.22182479224317272, "learning_rate": 5.962824171726535e-06, "loss": 0.0134, "step": 5532 }, { "epoch": 1.4837758112094395, "grad_norm": 0.28203517020552366, "learning_rate": 5.961293137889421e-06, "loss": 0.0293, "step": 5533 }, { "epoch": 1.4840439796192009, "grad_norm": 0.23097937484173658, "learning_rate": 5.959762010453744e-06, "loss": 0.0153, "step": 5534 }, { "epoch": 1.484312148028962, "grad_norm": 0.40418503266438394, "learning_rate": 5.9582307895685876e-06, "loss": 0.0228, "step": 5535 }, { "epoch": 1.4845803164387235, "grad_norm": 0.3130813403330914, "learning_rate": 5.956699475383042e-06, "loss": 0.0226, "step": 5536 }, { "epoch": 1.4848484848484849, "grad_norm": 0.30087008475163485, "learning_rate": 5.95516806804621e-06, "loss": 0.0235, "step": 5537 }, { "epoch": 1.4851166532582463, "grad_norm": 0.2687948291377881, "learning_rate": 5.953636567707199e-06, "loss": 0.0194, "step": 5538 }, { "epoch": 1.4853848216680074, "grad_norm": 0.38209765853558325, "learning_rate": 5.952104974515128e-06, "loss": 0.0297, "step": 5539 }, { "epoch": 1.4856529900777689, "grad_norm": 0.2511443524149898, "learning_rate": 5.950573288619123e-06, "loss": 0.0205, "step": 5540 }, { "epoch": 1.48592115848753, "grad_norm": 0.44062224813241213, "learning_rate": 5.94904151016832e-06, "loss": 0.0325, "step": 5541 }, { "epoch": 1.4861893268972914, "grad_norm": 0.3974100542224825, "learning_rate": 5.947509639311865e-06, "loss": 0.0217, "step": 5542 }, { "epoch": 1.4864574953070528, "grad_norm": 0.27743962244039516, "learning_rate": 5.9459776761989115e-06, "loss": 0.0274, "step": 5543 }, { "epoch": 1.4867256637168142, "grad_norm": 0.2989258273654116, "learning_rate": 5.944445620978625e-06, "loss": 0.0262, "step": 5544 }, { "epoch": 1.4869938321265754, "grad_norm": 0.3357574042713301, "learning_rate": 5.942913473800176e-06, "loss": 0.0297, "step": 5545 }, { "epoch": 1.4872620005363368, "grad_norm": 0.22832780119863322, "learning_rate": 5.941381234812744e-06, "loss": 0.0216, "step": 5546 }, { "epoch": 1.487530168946098, "grad_norm": 0.2660018556288038, "learning_rate": 5.939848904165519e-06, "loss": 0.0237, "step": 5547 }, { "epoch": 1.4877983373558594, "grad_norm": 0.23718718397951402, "learning_rate": 5.938316482007704e-06, "loss": 0.0237, "step": 5548 }, { "epoch": 1.4880665057656208, "grad_norm": 0.285674650376869, "learning_rate": 5.936783968488503e-06, "loss": 0.0239, "step": 5549 }, { "epoch": 1.4883346741753822, "grad_norm": 0.2964334132191494, "learning_rate": 5.935251363757135e-06, "loss": 0.0257, "step": 5550 }, { "epoch": 1.4886028425851434, "grad_norm": 0.3013285505051643, "learning_rate": 5.9337186679628246e-06, "loss": 0.0339, "step": 5551 }, { "epoch": 1.4888710109949048, "grad_norm": 0.2467066730204887, "learning_rate": 5.932185881254806e-06, "loss": 0.0197, "step": 5552 }, { "epoch": 1.489139179404666, "grad_norm": 0.3065028837999952, "learning_rate": 5.930653003782323e-06, "loss": 0.0245, "step": 5553 }, { "epoch": 1.4894073478144274, "grad_norm": 0.27385106738870474, "learning_rate": 5.929120035694628e-06, "loss": 0.0235, "step": 5554 }, { "epoch": 1.4896755162241888, "grad_norm": 0.29229846149576494, "learning_rate": 5.927586977140982e-06, "loss": 0.0242, "step": 5555 }, { "epoch": 1.4899436846339502, "grad_norm": 0.2472474677187293, "learning_rate": 5.926053828270655e-06, "loss": 0.0204, "step": 5556 }, { "epoch": 1.4902118530437114, "grad_norm": 0.3041712883849561, "learning_rate": 5.9245205892329246e-06, "loss": 0.0202, "step": 5557 }, { "epoch": 1.4904800214534728, "grad_norm": 0.2523814381303282, "learning_rate": 5.92298726017708e-06, "loss": 0.0239, "step": 5558 }, { "epoch": 1.490748189863234, "grad_norm": 0.449100213482074, "learning_rate": 5.9214538412524155e-06, "loss": 0.0232, "step": 5559 }, { "epoch": 1.4910163582729954, "grad_norm": 0.2878046923559746, "learning_rate": 5.919920332608238e-06, "loss": 0.0253, "step": 5560 }, { "epoch": 1.4912845266827568, "grad_norm": 0.3028003999994938, "learning_rate": 5.918386734393858e-06, "loss": 0.021, "step": 5561 }, { "epoch": 1.4915526950925182, "grad_norm": 0.333529303522482, "learning_rate": 5.916853046758601e-06, "loss": 0.0281, "step": 5562 }, { "epoch": 1.4918208635022794, "grad_norm": 0.27108622047871817, "learning_rate": 5.915319269851798e-06, "loss": 0.0239, "step": 5563 }, { "epoch": 1.4920890319120408, "grad_norm": 0.34430232790234094, "learning_rate": 5.913785403822787e-06, "loss": 0.023, "step": 5564 }, { "epoch": 1.492357200321802, "grad_norm": 0.3878949200427505, "learning_rate": 5.912251448820917e-06, "loss": 0.0337, "step": 5565 }, { "epoch": 1.4926253687315634, "grad_norm": 0.27612430231844265, "learning_rate": 5.910717404995547e-06, "loss": 0.027, "step": 5566 }, { "epoch": 1.4928935371413248, "grad_norm": 0.26461288588866205, "learning_rate": 5.90918327249604e-06, "loss": 0.0209, "step": 5567 }, { "epoch": 1.4931617055510862, "grad_norm": 0.27116742805347177, "learning_rate": 5.907649051471771e-06, "loss": 0.0218, "step": 5568 }, { "epoch": 1.4934298739608474, "grad_norm": 0.2963346536338028, "learning_rate": 5.9061147420721236e-06, "loss": 0.0268, "step": 5569 }, { "epoch": 1.4936980423706088, "grad_norm": 0.24782413001509243, "learning_rate": 5.90458034444649e-06, "loss": 0.0283, "step": 5570 }, { "epoch": 1.49396621078037, "grad_norm": 0.25275120235781356, "learning_rate": 5.903045858744271e-06, "loss": 0.02, "step": 5571 }, { "epoch": 1.4942343791901314, "grad_norm": 0.27009394332073783, "learning_rate": 5.901511285114872e-06, "loss": 0.028, "step": 5572 }, { "epoch": 1.4945025475998928, "grad_norm": 0.2028918152272486, "learning_rate": 5.899976623707714e-06, "loss": 0.0197, "step": 5573 }, { "epoch": 1.4947707160096542, "grad_norm": 0.30533550779263857, "learning_rate": 5.898441874672221e-06, "loss": 0.0185, "step": 5574 }, { "epoch": 1.4950388844194153, "grad_norm": 0.2579388636731747, "learning_rate": 5.896907038157827e-06, "loss": 0.0239, "step": 5575 }, { "epoch": 1.4953070528291768, "grad_norm": 0.3906252456478026, "learning_rate": 5.89537211431398e-06, "loss": 0.0283, "step": 5576 }, { "epoch": 1.495575221238938, "grad_norm": 0.2818911707506467, "learning_rate": 5.893837103290124e-06, "loss": 0.0232, "step": 5577 }, { "epoch": 1.4958433896486993, "grad_norm": 0.33289947928629654, "learning_rate": 5.892302005235722e-06, "loss": 0.0215, "step": 5578 }, { "epoch": 1.4961115580584607, "grad_norm": 0.24732012346739682, "learning_rate": 5.8907668203002434e-06, "loss": 0.0249, "step": 5579 }, { "epoch": 1.4963797264682221, "grad_norm": 0.3905034457715331, "learning_rate": 5.8892315486331656e-06, "loss": 0.0235, "step": 5580 }, { "epoch": 1.4966478948779833, "grad_norm": 0.2378470667444129, "learning_rate": 5.887696190383971e-06, "loss": 0.021, "step": 5581 }, { "epoch": 1.4969160632877447, "grad_norm": 0.30400457357558186, "learning_rate": 5.886160745702157e-06, "loss": 0.0297, "step": 5582 }, { "epoch": 1.497184231697506, "grad_norm": 0.26324130492614334, "learning_rate": 5.884625214737224e-06, "loss": 0.0252, "step": 5583 }, { "epoch": 1.4974524001072673, "grad_norm": 0.24174751675002068, "learning_rate": 5.8830895976386805e-06, "loss": 0.0198, "step": 5584 }, { "epoch": 1.4977205685170287, "grad_norm": 0.8439808531499249, "learning_rate": 5.881553894556048e-06, "loss": 0.0202, "step": 5585 }, { "epoch": 1.4979887369267901, "grad_norm": 0.4142554070989203, "learning_rate": 5.880018105638854e-06, "loss": 0.026, "step": 5586 }, { "epoch": 1.4982569053365513, "grad_norm": 0.2762681565180677, "learning_rate": 5.878482231036633e-06, "loss": 0.0247, "step": 5587 }, { "epoch": 1.4985250737463127, "grad_norm": 0.25716566321710965, "learning_rate": 5.8769462708989314e-06, "loss": 0.0203, "step": 5588 }, { "epoch": 1.498793242156074, "grad_norm": 0.2709806663682435, "learning_rate": 5.875410225375296e-06, "loss": 0.0238, "step": 5589 }, { "epoch": 1.4990614105658353, "grad_norm": 0.42818817145258176, "learning_rate": 5.873874094615292e-06, "loss": 0.0392, "step": 5590 }, { "epoch": 1.4993295789755967, "grad_norm": 0.2681138253961643, "learning_rate": 5.872337878768489e-06, "loss": 0.0217, "step": 5591 }, { "epoch": 1.4995977473853581, "grad_norm": 0.2571445264272858, "learning_rate": 5.870801577984463e-06, "loss": 0.0203, "step": 5592 }, { "epoch": 1.4998659157951193, "grad_norm": 0.6055661381954486, "learning_rate": 5.869265192412799e-06, "loss": 0.0233, "step": 5593 }, { "epoch": 1.5001340842048807, "grad_norm": 0.315042041722011, "learning_rate": 5.867728722203091e-06, "loss": 0.0313, "step": 5594 }, { "epoch": 1.5004022526146419, "grad_norm": 0.23790638095399344, "learning_rate": 5.866192167504941e-06, "loss": 0.0239, "step": 5595 }, { "epoch": 1.5006704210244033, "grad_norm": 0.3875392155026543, "learning_rate": 5.864655528467958e-06, "loss": 0.0266, "step": 5596 }, { "epoch": 1.5009385894341647, "grad_norm": 0.38238177545662455, "learning_rate": 5.863118805241763e-06, "loss": 0.0188, "step": 5597 }, { "epoch": 1.501206757843926, "grad_norm": 0.26042034341708187, "learning_rate": 5.861581997975982e-06, "loss": 0.0227, "step": 5598 }, { "epoch": 1.5014749262536873, "grad_norm": 0.2799455269219071, "learning_rate": 5.860045106820248e-06, "loss": 0.0184, "step": 5599 }, { "epoch": 1.5017430946634487, "grad_norm": 0.24066719478373227, "learning_rate": 5.858508131924205e-06, "loss": 0.0203, "step": 5600 }, { "epoch": 1.5020112630732099, "grad_norm": 0.32610117000517014, "learning_rate": 5.8569710734375065e-06, "loss": 0.0235, "step": 5601 }, { "epoch": 1.5022794314829713, "grad_norm": 1.3333399260113405, "learning_rate": 5.855433931509808e-06, "loss": 0.0417, "step": 5602 }, { "epoch": 1.5025475998927327, "grad_norm": 0.3980826970751715, "learning_rate": 5.85389670629078e-06, "loss": 0.0229, "step": 5603 }, { "epoch": 1.502815768302494, "grad_norm": 0.20671233516502335, "learning_rate": 5.8523593979300965e-06, "loss": 0.0162, "step": 5604 }, { "epoch": 1.5030839367122553, "grad_norm": 0.23829169576732914, "learning_rate": 5.850822006577441e-06, "loss": 0.0213, "step": 5605 }, { "epoch": 1.5033521051220167, "grad_norm": 0.23039151914578046, "learning_rate": 5.849284532382507e-06, "loss": 0.018, "step": 5606 }, { "epoch": 1.5036202735317779, "grad_norm": 0.4364328325200387, "learning_rate": 5.84774697549499e-06, "loss": 0.0196, "step": 5607 }, { "epoch": 1.5038884419415393, "grad_norm": 0.3058393083448035, "learning_rate": 5.846209336064603e-06, "loss": 0.0214, "step": 5608 }, { "epoch": 1.5041566103513007, "grad_norm": 0.31919261816242034, "learning_rate": 5.844671614241059e-06, "loss": 0.0253, "step": 5609 }, { "epoch": 1.504424778761062, "grad_norm": 0.26337489246626966, "learning_rate": 5.843133810174082e-06, "loss": 0.0217, "step": 5610 }, { "epoch": 1.5046929471708232, "grad_norm": 0.19861726623475384, "learning_rate": 5.841595924013405e-06, "loss": 0.0142, "step": 5611 }, { "epoch": 1.5049611155805847, "grad_norm": 0.6083739116517318, "learning_rate": 5.840057955908767e-06, "loss": 0.0379, "step": 5612 }, { "epoch": 1.5052292839903458, "grad_norm": 0.2535710292704241, "learning_rate": 5.838519906009916e-06, "loss": 0.0351, "step": 5613 }, { "epoch": 1.5054974524001072, "grad_norm": 0.28833669027022435, "learning_rate": 5.836981774466609e-06, "loss": 0.0272, "step": 5614 }, { "epoch": 1.5057656208098686, "grad_norm": 0.2435681258709255, "learning_rate": 5.835443561428608e-06, "loss": 0.0283, "step": 5615 }, { "epoch": 1.50603378921963, "grad_norm": 0.2210469103699519, "learning_rate": 5.8339052670456855e-06, "loss": 0.0278, "step": 5616 }, { "epoch": 1.5063019576293912, "grad_norm": 0.2943264715787047, "learning_rate": 5.83236689146762e-06, "loss": 0.0297, "step": 5617 }, { "epoch": 1.5065701260391526, "grad_norm": 0.40096061083794754, "learning_rate": 5.830828434844203e-06, "loss": 0.0354, "step": 5618 }, { "epoch": 1.5068382944489138, "grad_norm": 0.21254842549853487, "learning_rate": 5.8292898973252246e-06, "loss": 0.0207, "step": 5619 }, { "epoch": 1.5071064628586752, "grad_norm": 0.22271247414305018, "learning_rate": 5.827751279060492e-06, "loss": 0.0222, "step": 5620 }, { "epoch": 1.5073746312684366, "grad_norm": 0.37835364344017663, "learning_rate": 5.826212580199814e-06, "loss": 0.0341, "step": 5621 }, { "epoch": 1.507642799678198, "grad_norm": 0.415501864027961, "learning_rate": 5.824673800893009e-06, "loss": 0.0192, "step": 5622 }, { "epoch": 1.5079109680879592, "grad_norm": 0.3331994448702936, "learning_rate": 5.823134941289908e-06, "loss": 0.0362, "step": 5623 }, { "epoch": 1.5081791364977206, "grad_norm": 0.22254154787461786, "learning_rate": 5.821596001540344e-06, "loss": 0.0276, "step": 5624 }, { "epoch": 1.5084473049074818, "grad_norm": 0.2868007354623883, "learning_rate": 5.820056981794156e-06, "loss": 0.025, "step": 5625 }, { "epoch": 1.5087154733172432, "grad_norm": 0.2787331439782924, "learning_rate": 5.818517882201199e-06, "loss": 0.023, "step": 5626 }, { "epoch": 1.5089836417270046, "grad_norm": 0.2369457390400773, "learning_rate": 5.8169787029113265e-06, "loss": 0.0195, "step": 5627 }, { "epoch": 1.509251810136766, "grad_norm": 0.2816889932192299, "learning_rate": 5.815439444074409e-06, "loss": 0.021, "step": 5628 }, { "epoch": 1.5095199785465272, "grad_norm": 0.3021655693505944, "learning_rate": 5.813900105840317e-06, "loss": 0.036, "step": 5629 }, { "epoch": 1.5097881469562886, "grad_norm": 0.21144757656709084, "learning_rate": 5.812360688358934e-06, "loss": 0.0143, "step": 5630 }, { "epoch": 1.5100563153660498, "grad_norm": 0.313374960497439, "learning_rate": 5.810821191780146e-06, "loss": 0.0241, "step": 5631 }, { "epoch": 1.5103244837758112, "grad_norm": 0.2551709234817503, "learning_rate": 5.8092816162538505e-06, "loss": 0.0253, "step": 5632 }, { "epoch": 1.5105926521855726, "grad_norm": 0.3686113233130328, "learning_rate": 5.807741961929954e-06, "loss": 0.0336, "step": 5633 }, { "epoch": 1.510860820595334, "grad_norm": 0.23682743778153306, "learning_rate": 5.8062022289583666e-06, "loss": 0.0202, "step": 5634 }, { "epoch": 1.5111289890050952, "grad_norm": 0.30746025824082746, "learning_rate": 5.804662417489009e-06, "loss": 0.0277, "step": 5635 }, { "epoch": 1.5113971574148566, "grad_norm": 0.4616300480713767, "learning_rate": 5.8031225276718086e-06, "loss": 0.0416, "step": 5636 }, { "epoch": 1.5116653258246178, "grad_norm": 0.25173439425613714, "learning_rate": 5.8015825596566975e-06, "loss": 0.0223, "step": 5637 }, { "epoch": 1.5119334942343792, "grad_norm": 0.3323393409565662, "learning_rate": 5.800042513593623e-06, "loss": 0.0335, "step": 5638 }, { "epoch": 1.5122016626441406, "grad_norm": 0.3322455504314431, "learning_rate": 5.798502389632532e-06, "loss": 0.0271, "step": 5639 }, { "epoch": 1.512469831053902, "grad_norm": 0.21080474798170887, "learning_rate": 5.7969621879233835e-06, "loss": 0.0179, "step": 5640 }, { "epoch": 1.5127379994636632, "grad_norm": 0.2691875061443105, "learning_rate": 5.7954219086161435e-06, "loss": 0.0262, "step": 5641 }, { "epoch": 1.5130061678734243, "grad_norm": 0.32874860876100154, "learning_rate": 5.793881551860785e-06, "loss": 0.034, "step": 5642 }, { "epoch": 1.5132743362831858, "grad_norm": 0.4384530856719144, "learning_rate": 5.792341117807284e-06, "loss": 0.0288, "step": 5643 }, { "epoch": 1.5135425046929472, "grad_norm": 0.23250615703674113, "learning_rate": 5.790800606605634e-06, "loss": 0.0194, "step": 5644 }, { "epoch": 1.5138106731027086, "grad_norm": 0.7509029240452141, "learning_rate": 5.789260018405829e-06, "loss": 0.0282, "step": 5645 }, { "epoch": 1.51407884151247, "grad_norm": 0.23355063075276608, "learning_rate": 5.7877193533578705e-06, "loss": 0.0237, "step": 5646 }, { "epoch": 1.5143470099222311, "grad_norm": 0.27121096490274155, "learning_rate": 5.786178611611771e-06, "loss": 0.0211, "step": 5647 }, { "epoch": 1.5146151783319923, "grad_norm": 0.24604209972002095, "learning_rate": 5.784637793317546e-06, "loss": 0.0262, "step": 5648 }, { "epoch": 1.5148833467417537, "grad_norm": 0.5360331439528703, "learning_rate": 5.783096898625223e-06, "loss": 0.0307, "step": 5649 }, { "epoch": 1.5151515151515151, "grad_norm": 0.2704959263174878, "learning_rate": 5.781555927684834e-06, "loss": 0.0253, "step": 5650 }, { "epoch": 1.5154196835612765, "grad_norm": 0.2884320982644645, "learning_rate": 5.7800148806464195e-06, "loss": 0.0246, "step": 5651 }, { "epoch": 1.515687851971038, "grad_norm": 0.921487710000508, "learning_rate": 5.778473757660027e-06, "loss": 0.0301, "step": 5652 }, { "epoch": 1.5159560203807991, "grad_norm": 0.25673545365398687, "learning_rate": 5.776932558875711e-06, "loss": 0.018, "step": 5653 }, { "epoch": 1.5162241887905603, "grad_norm": 0.24008968301534545, "learning_rate": 5.775391284443534e-06, "loss": 0.0177, "step": 5654 }, { "epoch": 1.5164923572003217, "grad_norm": 0.3521025599568721, "learning_rate": 5.773849934513568e-06, "loss": 0.0242, "step": 5655 }, { "epoch": 1.5167605256100831, "grad_norm": 0.27916171552616187, "learning_rate": 5.772308509235887e-06, "loss": 0.0189, "step": 5656 }, { "epoch": 1.5170286940198445, "grad_norm": 0.2960768244730077, "learning_rate": 5.770767008760577e-06, "loss": 0.0193, "step": 5657 }, { "epoch": 1.517296862429606, "grad_norm": 0.24117817444012207, "learning_rate": 5.769225433237731e-06, "loss": 0.0235, "step": 5658 }, { "epoch": 1.5175650308393671, "grad_norm": 0.2324261884603527, "learning_rate": 5.767683782817445e-06, "loss": 0.0221, "step": 5659 }, { "epoch": 1.5178331992491283, "grad_norm": 0.2751068012591845, "learning_rate": 5.7661420576498286e-06, "loss": 0.0239, "step": 5660 }, { "epoch": 1.5181013676588897, "grad_norm": 0.292053080152648, "learning_rate": 5.764600257884994e-06, "loss": 0.0289, "step": 5661 }, { "epoch": 1.518369536068651, "grad_norm": 0.28062267121928614, "learning_rate": 5.763058383673061e-06, "loss": 0.0202, "step": 5662 }, { "epoch": 1.5186377044784125, "grad_norm": 0.7502887509083108, "learning_rate": 5.7615164351641595e-06, "loss": 0.0267, "step": 5663 }, { "epoch": 1.518905872888174, "grad_norm": 0.30115760173570805, "learning_rate": 5.7599744125084254e-06, "loss": 0.0226, "step": 5664 }, { "epoch": 1.519174041297935, "grad_norm": 0.2997687071542531, "learning_rate": 5.758432315855998e-06, "loss": 0.0284, "step": 5665 }, { "epoch": 1.5194422097076963, "grad_norm": 0.2581725741934641, "learning_rate": 5.756890145357034e-06, "loss": 0.0217, "step": 5666 }, { "epoch": 1.5197103781174577, "grad_norm": 0.2123244143198249, "learning_rate": 5.755347901161683e-06, "loss": 0.0157, "step": 5667 }, { "epoch": 1.519978546527219, "grad_norm": 0.4653688411725856, "learning_rate": 5.7538055834201126e-06, "loss": 0.0399, "step": 5668 }, { "epoch": 1.5202467149369805, "grad_norm": 0.2281316463828835, "learning_rate": 5.752263192282494e-06, "loss": 0.0144, "step": 5669 }, { "epoch": 1.520514883346742, "grad_norm": 0.27445750167100974, "learning_rate": 5.750720727899005e-06, "loss": 0.0257, "step": 5670 }, { "epoch": 1.520783051756503, "grad_norm": 0.4169519438922934, "learning_rate": 5.7491781904198325e-06, "loss": 0.0208, "step": 5671 }, { "epoch": 1.5210512201662643, "grad_norm": 0.19831896585244732, "learning_rate": 5.74763557999517e-06, "loss": 0.0207, "step": 5672 }, { "epoch": 1.5213193885760257, "grad_norm": 0.3035046033768232, "learning_rate": 5.746092896775215e-06, "loss": 0.0263, "step": 5673 }, { "epoch": 1.521587556985787, "grad_norm": 0.2726092511200985, "learning_rate": 5.744550140910174e-06, "loss": 0.0278, "step": 5674 }, { "epoch": 1.5218557253955485, "grad_norm": 0.3546267777063928, "learning_rate": 5.743007312550262e-06, "loss": 0.0308, "step": 5675 }, { "epoch": 1.5221238938053099, "grad_norm": 0.2283742120629325, "learning_rate": 5.741464411845703e-06, "loss": 0.0212, "step": 5676 }, { "epoch": 1.522392062215071, "grad_norm": 0.21529046878764158, "learning_rate": 5.739921438946723e-06, "loss": 0.019, "step": 5677 }, { "epoch": 1.5226602306248322, "grad_norm": 0.23543639597915197, "learning_rate": 5.738378394003555e-06, "loss": 0.0233, "step": 5678 }, { "epoch": 1.5229283990345936, "grad_norm": 0.24227341971156127, "learning_rate": 5.736835277166446e-06, "loss": 0.0309, "step": 5679 }, { "epoch": 1.523196567444355, "grad_norm": 0.22815138996685275, "learning_rate": 5.735292088585639e-06, "loss": 0.0295, "step": 5680 }, { "epoch": 1.5234647358541165, "grad_norm": 0.27172423193096673, "learning_rate": 5.733748828411395e-06, "loss": 0.0207, "step": 5681 }, { "epoch": 1.5237329042638779, "grad_norm": 0.3760921854553939, "learning_rate": 5.732205496793975e-06, "loss": 0.0295, "step": 5682 }, { "epoch": 1.524001072673639, "grad_norm": 0.27021373041046626, "learning_rate": 5.730662093883651e-06, "loss": 0.025, "step": 5683 }, { "epoch": 1.5242692410834002, "grad_norm": 0.24832106154854647, "learning_rate": 5.729118619830698e-06, "loss": 0.0181, "step": 5684 }, { "epoch": 1.5245374094931616, "grad_norm": 0.2657935891027441, "learning_rate": 5.727575074785402e-06, "loss": 0.0245, "step": 5685 }, { "epoch": 1.524805577902923, "grad_norm": 0.2979300222517417, "learning_rate": 5.726031458898052e-06, "loss": 0.0208, "step": 5686 }, { "epoch": 1.5250737463126844, "grad_norm": 0.31210219315826204, "learning_rate": 5.724487772318945e-06, "loss": 0.0195, "step": 5687 }, { "epoch": 1.5253419147224458, "grad_norm": 0.2204749999575347, "learning_rate": 5.722944015198389e-06, "loss": 0.0214, "step": 5688 }, { "epoch": 1.525610083132207, "grad_norm": 0.30798072545184324, "learning_rate": 5.721400187686692e-06, "loss": 0.0193, "step": 5689 }, { "epoch": 1.5258782515419682, "grad_norm": 0.2463888221385075, "learning_rate": 5.719856289934175e-06, "loss": 0.0253, "step": 5690 }, { "epoch": 1.5261464199517296, "grad_norm": 0.2604091964240948, "learning_rate": 5.7183123220911615e-06, "loss": 0.0264, "step": 5691 }, { "epoch": 1.526414588361491, "grad_norm": 0.3154452037396957, "learning_rate": 5.716768284307984e-06, "loss": 0.0323, "step": 5692 }, { "epoch": 1.5266827567712524, "grad_norm": 0.25747267193062306, "learning_rate": 5.715224176734983e-06, "loss": 0.0278, "step": 5693 }, { "epoch": 1.5269509251810138, "grad_norm": 0.27490496500600864, "learning_rate": 5.7136799995225024e-06, "loss": 0.0221, "step": 5694 }, { "epoch": 1.527219093590775, "grad_norm": 0.38894697480927437, "learning_rate": 5.712135752820894e-06, "loss": 0.0254, "step": 5695 }, { "epoch": 1.5274872620005362, "grad_norm": 0.307632248326358, "learning_rate": 5.7105914367805195e-06, "loss": 0.0267, "step": 5696 }, { "epoch": 1.5277554304102976, "grad_norm": 0.23844044215938706, "learning_rate": 5.7090470515517424e-06, "loss": 0.0194, "step": 5697 }, { "epoch": 1.528023598820059, "grad_norm": 0.36420267485545255, "learning_rate": 5.707502597284936e-06, "loss": 0.0211, "step": 5698 }, { "epoch": 1.5282917672298204, "grad_norm": 0.44312382618731344, "learning_rate": 5.705958074130481e-06, "loss": 0.0364, "step": 5699 }, { "epoch": 1.5285599356395818, "grad_norm": 0.28332776399373194, "learning_rate": 5.704413482238763e-06, "loss": 0.0245, "step": 5700 }, { "epoch": 1.528828104049343, "grad_norm": 0.3052448684382652, "learning_rate": 5.7028688217601735e-06, "loss": 0.0358, "step": 5701 }, { "epoch": 1.5290962724591042, "grad_norm": 0.24420424355556966, "learning_rate": 5.7013240928451115e-06, "loss": 0.0184, "step": 5702 }, { "epoch": 1.5293644408688656, "grad_norm": 0.5802673732170867, "learning_rate": 5.699779295643988e-06, "loss": 0.0319, "step": 5703 }, { "epoch": 1.529632609278627, "grad_norm": 0.3134531794316103, "learning_rate": 5.69823443030721e-06, "loss": 0.0214, "step": 5704 }, { "epoch": 1.5299007776883884, "grad_norm": 0.2663682275767274, "learning_rate": 5.6966894969852e-06, "loss": 0.0223, "step": 5705 }, { "epoch": 1.5301689460981498, "grad_norm": 0.2835338997274253, "learning_rate": 5.695144495828384e-06, "loss": 0.0198, "step": 5706 }, { "epoch": 1.530437114507911, "grad_norm": 0.2458120205201112, "learning_rate": 5.6935994269871934e-06, "loss": 0.0235, "step": 5707 }, { "epoch": 1.5307052829176722, "grad_norm": 0.449432823421994, "learning_rate": 5.692054290612067e-06, "loss": 0.0234, "step": 5708 }, { "epoch": 1.5309734513274336, "grad_norm": 0.21637138911498027, "learning_rate": 5.690509086853453e-06, "loss": 0.0179, "step": 5709 }, { "epoch": 1.531241619737195, "grad_norm": 0.25692754519217453, "learning_rate": 5.6889638158618025e-06, "loss": 0.0201, "step": 5710 }, { "epoch": 1.5315097881469564, "grad_norm": 0.3676727015795023, "learning_rate": 5.687418477787574e-06, "loss": 0.0242, "step": 5711 }, { "epoch": 1.5317779565567178, "grad_norm": 0.2868699579114758, "learning_rate": 5.685873072781233e-06, "loss": 0.0314, "step": 5712 }, { "epoch": 1.532046124966479, "grad_norm": 0.31423446852688425, "learning_rate": 5.684327600993251e-06, "loss": 0.0265, "step": 5713 }, { "epoch": 1.5323142933762401, "grad_norm": 0.22972983571210082, "learning_rate": 5.682782062574109e-06, "loss": 0.0223, "step": 5714 }, { "epoch": 1.5325824617860015, "grad_norm": 0.3799780691609827, "learning_rate": 5.68123645767429e-06, "loss": 0.0249, "step": 5715 }, { "epoch": 1.532850630195763, "grad_norm": 0.2944892685817284, "learning_rate": 5.679690786444286e-06, "loss": 0.023, "step": 5716 }, { "epoch": 1.5331187986055244, "grad_norm": 0.2873578055352119, "learning_rate": 5.678145049034595e-06, "loss": 0.027, "step": 5717 }, { "epoch": 1.5333869670152858, "grad_norm": 0.5063012146416181, "learning_rate": 5.67659924559572e-06, "loss": 0.0336, "step": 5718 }, { "epoch": 1.533655135425047, "grad_norm": 0.2884435442890642, "learning_rate": 5.675053376278172e-06, "loss": 0.0228, "step": 5719 }, { "epoch": 1.5339233038348081, "grad_norm": 0.2529400235046711, "learning_rate": 5.673507441232471e-06, "loss": 0.0243, "step": 5720 }, { "epoch": 1.5341914722445695, "grad_norm": 0.32425283569712937, "learning_rate": 5.671961440609139e-06, "loss": 0.0304, "step": 5721 }, { "epoch": 1.534459640654331, "grad_norm": 0.17700257998097885, "learning_rate": 5.670415374558703e-06, "loss": 0.0167, "step": 5722 }, { "epoch": 1.5347278090640923, "grad_norm": 0.20182553022854352, "learning_rate": 5.668869243231703e-06, "loss": 0.0215, "step": 5723 }, { "epoch": 1.5349959774738537, "grad_norm": 0.25712961024746317, "learning_rate": 5.66732304677868e-06, "loss": 0.022, "step": 5724 }, { "epoch": 1.535264145883615, "grad_norm": 0.3313476328880328, "learning_rate": 5.665776785350185e-06, "loss": 0.0235, "step": 5725 }, { "epoch": 1.535532314293376, "grad_norm": 0.2842534625051747, "learning_rate": 5.664230459096771e-06, "loss": 0.0181, "step": 5726 }, { "epoch": 1.5358004827031375, "grad_norm": 0.30080164173995977, "learning_rate": 5.662684068169002e-06, "loss": 0.0264, "step": 5727 }, { "epoch": 1.536068651112899, "grad_norm": 0.3369639033568627, "learning_rate": 5.661137612717443e-06, "loss": 0.0309, "step": 5728 }, { "epoch": 1.5363368195226603, "grad_norm": 0.3455896171981691, "learning_rate": 5.659591092892671e-06, "loss": 0.0255, "step": 5729 }, { "epoch": 1.5366049879324215, "grad_norm": 0.25439846952598577, "learning_rate": 5.658044508845265e-06, "loss": 0.0266, "step": 5730 }, { "epoch": 1.536873156342183, "grad_norm": 0.2645720142203955, "learning_rate": 5.656497860725813e-06, "loss": 0.0202, "step": 5731 }, { "epoch": 1.537141324751944, "grad_norm": 0.33940589095765233, "learning_rate": 5.654951148684906e-06, "loss": 0.0273, "step": 5732 }, { "epoch": 1.5374094931617055, "grad_norm": 0.21455109519307344, "learning_rate": 5.653404372873147e-06, "loss": 0.0226, "step": 5733 }, { "epoch": 1.537677661571467, "grad_norm": 0.22834359695674483, "learning_rate": 5.651857533441135e-06, "loss": 0.0253, "step": 5734 }, { "epoch": 1.5379458299812283, "grad_norm": 0.2387426409412802, "learning_rate": 5.6503106305394886e-06, "loss": 0.02, "step": 5735 }, { "epoch": 1.5382139983909895, "grad_norm": 0.28209534713580514, "learning_rate": 5.648763664318822e-06, "loss": 0.0254, "step": 5736 }, { "epoch": 1.538482166800751, "grad_norm": 0.2779312710459575, "learning_rate": 5.64721663492976e-06, "loss": 0.0182, "step": 5737 }, { "epoch": 1.538750335210512, "grad_norm": 0.22084880881381772, "learning_rate": 5.645669542522932e-06, "loss": 0.0255, "step": 5738 }, { "epoch": 1.5390185036202735, "grad_norm": 0.2351995122580828, "learning_rate": 5.644122387248975e-06, "loss": 0.02, "step": 5739 }, { "epoch": 1.5392866720300349, "grad_norm": 0.26840350213046954, "learning_rate": 5.6425751692585306e-06, "loss": 0.0232, "step": 5740 }, { "epoch": 1.5395548404397963, "grad_norm": 0.22445596732401996, "learning_rate": 5.64102788870225e-06, "loss": 0.0197, "step": 5741 }, { "epoch": 1.5398230088495575, "grad_norm": 0.2872718551966616, "learning_rate": 5.6394805457307845e-06, "loss": 0.0222, "step": 5742 }, { "epoch": 1.5400911772593189, "grad_norm": 0.27616924281798305, "learning_rate": 5.637933140494795e-06, "loss": 0.0249, "step": 5743 }, { "epoch": 1.54035934566908, "grad_norm": 0.28644014535857876, "learning_rate": 5.636385673144951e-06, "loss": 0.0207, "step": 5744 }, { "epoch": 1.5406275140788415, "grad_norm": 0.23731642144448914, "learning_rate": 5.634838143831923e-06, "loss": 0.0183, "step": 5745 }, { "epoch": 1.5408956824886029, "grad_norm": 0.2661190331253653, "learning_rate": 5.6332905527063894e-06, "loss": 0.0246, "step": 5746 }, { "epoch": 1.5411638508983643, "grad_norm": 0.2175919733529343, "learning_rate": 5.631742899919038e-06, "loss": 0.02, "step": 5747 }, { "epoch": 1.5414320193081255, "grad_norm": 0.21254866486342017, "learning_rate": 5.630195185620556e-06, "loss": 0.0179, "step": 5748 }, { "epoch": 1.5417001877178869, "grad_norm": 0.2884630363663017, "learning_rate": 5.6286474099616426e-06, "loss": 0.0224, "step": 5749 }, { "epoch": 1.541968356127648, "grad_norm": 0.270084307882223, "learning_rate": 5.627099573092999e-06, "loss": 0.0236, "step": 5750 }, { "epoch": 1.5422365245374094, "grad_norm": 0.23498673770513662, "learning_rate": 5.6255516751653376e-06, "loss": 0.0294, "step": 5751 }, { "epoch": 1.5425046929471709, "grad_norm": 0.3956886363237127, "learning_rate": 5.624003716329368e-06, "loss": 0.0259, "step": 5752 }, { "epoch": 1.5427728613569323, "grad_norm": 0.27750588818098615, "learning_rate": 5.622455696735814e-06, "loss": 0.0334, "step": 5753 }, { "epoch": 1.5430410297666934, "grad_norm": 0.2332957865599169, "learning_rate": 5.620907616535401e-06, "loss": 0.0234, "step": 5754 }, { "epoch": 1.5433091981764548, "grad_norm": 0.2049568464963472, "learning_rate": 5.619359475878863e-06, "loss": 0.0173, "step": 5755 }, { "epoch": 1.543577366586216, "grad_norm": 0.21491316073123515, "learning_rate": 5.617811274916934e-06, "loss": 0.0227, "step": 5756 }, { "epoch": 1.5438455349959774, "grad_norm": 0.2252422932601072, "learning_rate": 5.616263013800365e-06, "loss": 0.029, "step": 5757 }, { "epoch": 1.5441137034057388, "grad_norm": 0.20270289684955514, "learning_rate": 5.614714692679902e-06, "loss": 0.0212, "step": 5758 }, { "epoch": 1.5443818718155002, "grad_norm": 0.2855415760518405, "learning_rate": 5.6131663117063e-06, "loss": 0.0246, "step": 5759 }, { "epoch": 1.5446500402252614, "grad_norm": 0.3296676413496741, "learning_rate": 5.611617871030323e-06, "loss": 0.0276, "step": 5760 }, { "epoch": 1.5449182086350228, "grad_norm": 0.23198940027234094, "learning_rate": 5.610069370802737e-06, "loss": 0.019, "step": 5761 }, { "epoch": 1.545186377044784, "grad_norm": 0.18675903922071008, "learning_rate": 5.608520811174316e-06, "loss": 0.0175, "step": 5762 }, { "epoch": 1.5454545454545454, "grad_norm": 0.5241161496963793, "learning_rate": 5.606972192295841e-06, "loss": 0.0286, "step": 5763 }, { "epoch": 1.5457227138643068, "grad_norm": 0.35228370447127927, "learning_rate": 5.605423514318093e-06, "loss": 0.0184, "step": 5764 }, { "epoch": 1.5459908822740682, "grad_norm": 0.26790111357365193, "learning_rate": 5.603874777391866e-06, "loss": 0.0293, "step": 5765 }, { "epoch": 1.5462590506838294, "grad_norm": 0.2450524478404383, "learning_rate": 5.602325981667954e-06, "loss": 0.0188, "step": 5766 }, { "epoch": 1.5465272190935908, "grad_norm": 0.2432962404596528, "learning_rate": 5.6007771272971616e-06, "loss": 0.0264, "step": 5767 }, { "epoch": 1.546795387503352, "grad_norm": 0.25385082226850986, "learning_rate": 5.599228214430294e-06, "loss": 0.0242, "step": 5768 }, { "epoch": 1.5470635559131134, "grad_norm": 0.26047603848496403, "learning_rate": 5.597679243218168e-06, "loss": 0.0242, "step": 5769 }, { "epoch": 1.5473317243228748, "grad_norm": 0.29797685700009646, "learning_rate": 5.596130213811601e-06, "loss": 0.0276, "step": 5770 }, { "epoch": 1.5475998927326362, "grad_norm": 0.3054756105588386, "learning_rate": 5.594581126361415e-06, "loss": 0.0289, "step": 5771 }, { "epoch": 1.5478680611423974, "grad_norm": 0.22135795426264612, "learning_rate": 5.593031981018445e-06, "loss": 0.0206, "step": 5772 }, { "epoch": 1.5481362295521588, "grad_norm": 0.1848947166486402, "learning_rate": 5.5914827779335254e-06, "loss": 0.0178, "step": 5773 }, { "epoch": 1.54840439796192, "grad_norm": 0.283183884330318, "learning_rate": 5.589933517257497e-06, "loss": 0.0272, "step": 5774 }, { "epoch": 1.5486725663716814, "grad_norm": 0.36648650992183907, "learning_rate": 5.588384199141211e-06, "loss": 0.0224, "step": 5775 }, { "epoch": 1.5489407347814428, "grad_norm": 0.2775681882105126, "learning_rate": 5.586834823735515e-06, "loss": 0.0303, "step": 5776 }, { "epoch": 1.5492089031912042, "grad_norm": 0.3798152430855655, "learning_rate": 5.585285391191272e-06, "loss": 0.0399, "step": 5777 }, { "epoch": 1.5494770716009654, "grad_norm": 0.44720095353450573, "learning_rate": 5.583735901659343e-06, "loss": 0.0275, "step": 5778 }, { "epoch": 1.5497452400107268, "grad_norm": 0.2443377044649307, "learning_rate": 5.5821863552905995e-06, "loss": 0.0211, "step": 5779 }, { "epoch": 1.550013408420488, "grad_norm": 0.43419135380242213, "learning_rate": 5.580636752235917e-06, "loss": 0.028, "step": 5780 }, { "epoch": 1.5502815768302494, "grad_norm": 0.3313408070125449, "learning_rate": 5.579087092646176e-06, "loss": 0.0267, "step": 5781 }, { "epoch": 1.5505497452400108, "grad_norm": 0.265268100258148, "learning_rate": 5.57753737667226e-06, "loss": 0.0221, "step": 5782 }, { "epoch": 1.5508179136497722, "grad_norm": 0.18036619365878084, "learning_rate": 5.575987604465064e-06, "loss": 0.0203, "step": 5783 }, { "epoch": 1.5510860820595334, "grad_norm": 0.31588799872982937, "learning_rate": 5.574437776175484e-06, "loss": 0.0145, "step": 5784 }, { "epoch": 1.5513542504692948, "grad_norm": 0.2515203888713182, "learning_rate": 5.572887891954424e-06, "loss": 0.0222, "step": 5785 }, { "epoch": 1.551622418879056, "grad_norm": 0.2888236038985623, "learning_rate": 5.57133795195279e-06, "loss": 0.0287, "step": 5786 }, { "epoch": 1.5518905872888173, "grad_norm": 0.5375321952717957, "learning_rate": 5.569787956321496e-06, "loss": 0.0386, "step": 5787 }, { "epoch": 1.5521587556985788, "grad_norm": 0.20647806258075907, "learning_rate": 5.568237905211462e-06, "loss": 0.0191, "step": 5788 }, { "epoch": 1.5524269241083402, "grad_norm": 0.2964841610412619, "learning_rate": 5.5666877987736125e-06, "loss": 0.0283, "step": 5789 }, { "epoch": 1.5526950925181013, "grad_norm": 0.23126085349292808, "learning_rate": 5.565137637158875e-06, "loss": 0.0189, "step": 5790 }, { "epoch": 1.5529632609278627, "grad_norm": 0.26927983158941593, "learning_rate": 5.563587420518187e-06, "loss": 0.0179, "step": 5791 }, { "epoch": 1.553231429337624, "grad_norm": 0.2558603066314946, "learning_rate": 5.562037149002488e-06, "loss": 0.0275, "step": 5792 }, { "epoch": 1.5534995977473853, "grad_norm": 0.30940498194298244, "learning_rate": 5.5604868227627254e-06, "loss": 0.0195, "step": 5793 }, { "epoch": 1.5537677661571467, "grad_norm": 0.2872279563516604, "learning_rate": 5.558936441949847e-06, "loss": 0.0181, "step": 5794 }, { "epoch": 1.5540359345669081, "grad_norm": 0.2611642610619393, "learning_rate": 5.557386006714813e-06, "loss": 0.0167, "step": 5795 }, { "epoch": 1.5543041029766693, "grad_norm": 0.20751634319980033, "learning_rate": 5.555835517208581e-06, "loss": 0.0159, "step": 5796 }, { "epoch": 1.5545722713864307, "grad_norm": 0.36702568854315043, "learning_rate": 5.5542849735821225e-06, "loss": 0.0379, "step": 5797 }, { "epoch": 1.554840439796192, "grad_norm": 0.24810260757580488, "learning_rate": 5.552734375986406e-06, "loss": 0.0228, "step": 5798 }, { "epoch": 1.5551086082059533, "grad_norm": 0.22295872824604504, "learning_rate": 5.551183724572411e-06, "loss": 0.024, "step": 5799 }, { "epoch": 1.5553767766157147, "grad_norm": 0.3411361468541338, "learning_rate": 5.549633019491122e-06, "loss": 0.0387, "step": 5800 }, { "epoch": 1.5556449450254761, "grad_norm": 0.2553148853813788, "learning_rate": 5.548082260893524e-06, "loss": 0.0249, "step": 5801 }, { "epoch": 1.5559131134352373, "grad_norm": 0.25435255963987563, "learning_rate": 5.546531448930611e-06, "loss": 0.024, "step": 5802 }, { "epoch": 1.5561812818449987, "grad_norm": 0.34857332406692054, "learning_rate": 5.544980583753382e-06, "loss": 0.0291, "step": 5803 }, { "epoch": 1.55644945025476, "grad_norm": 0.29514801438758986, "learning_rate": 5.543429665512841e-06, "loss": 0.0276, "step": 5804 }, { "epoch": 1.5567176186645213, "grad_norm": 0.2171926709747748, "learning_rate": 5.541878694359994e-06, "loss": 0.0154, "step": 5805 }, { "epoch": 1.5569857870742827, "grad_norm": 0.36382482094701857, "learning_rate": 5.540327670445861e-06, "loss": 0.0272, "step": 5806 }, { "epoch": 1.557253955484044, "grad_norm": 0.2789946163644132, "learning_rate": 5.538776593921455e-06, "loss": 0.025, "step": 5807 }, { "epoch": 1.5575221238938053, "grad_norm": 0.2648631296091051, "learning_rate": 5.537225464937803e-06, "loss": 0.0216, "step": 5808 }, { "epoch": 1.5577902923035667, "grad_norm": 0.2010730329586277, "learning_rate": 5.535674283645933e-06, "loss": 0.0147, "step": 5809 }, { "epoch": 1.5580584607133279, "grad_norm": 0.27928125347365207, "learning_rate": 5.53412305019688e-06, "loss": 0.0248, "step": 5810 }, { "epoch": 1.5583266291230893, "grad_norm": 0.26745939779367256, "learning_rate": 5.532571764741686e-06, "loss": 0.0236, "step": 5811 }, { "epoch": 1.5585947975328507, "grad_norm": 0.2544960304997697, "learning_rate": 5.531020427431392e-06, "loss": 0.0221, "step": 5812 }, { "epoch": 1.558862965942612, "grad_norm": 0.2561873102987595, "learning_rate": 5.5294690384170476e-06, "loss": 0.0232, "step": 5813 }, { "epoch": 1.5591311343523733, "grad_norm": 0.3007071286914221, "learning_rate": 5.527917597849709e-06, "loss": 0.0234, "step": 5814 }, { "epoch": 1.5593993027621347, "grad_norm": 0.3262051417004512, "learning_rate": 5.526366105880436e-06, "loss": 0.0297, "step": 5815 }, { "epoch": 1.5596674711718959, "grad_norm": 0.46757410464441185, "learning_rate": 5.524814562660292e-06, "loss": 0.0359, "step": 5816 }, { "epoch": 1.5599356395816573, "grad_norm": 0.2715996016485874, "learning_rate": 5.523262968340349e-06, "loss": 0.022, "step": 5817 }, { "epoch": 1.5602038079914187, "grad_norm": 0.2541366514131034, "learning_rate": 5.52171132307168e-06, "loss": 0.02, "step": 5818 }, { "epoch": 1.56047197640118, "grad_norm": 0.2666978913274109, "learning_rate": 5.520159627005363e-06, "loss": 0.0161, "step": 5819 }, { "epoch": 1.5607401448109413, "grad_norm": 0.31924283760853445, "learning_rate": 5.5186078802924845e-06, "loss": 0.0265, "step": 5820 }, { "epoch": 1.5610083132207024, "grad_norm": 0.2679156846724079, "learning_rate": 5.517056083084133e-06, "loss": 0.024, "step": 5821 }, { "epoch": 1.5612764816304638, "grad_norm": 0.3940345499201356, "learning_rate": 5.515504235531406e-06, "loss": 0.0291, "step": 5822 }, { "epoch": 1.5615446500402252, "grad_norm": 0.31645967781516077, "learning_rate": 5.513952337785398e-06, "loss": 0.0267, "step": 5823 }, { "epoch": 1.5618128184499867, "grad_norm": 0.401432240405321, "learning_rate": 5.512400389997216e-06, "loss": 0.0369, "step": 5824 }, { "epoch": 1.562080986859748, "grad_norm": 0.24132351316335632, "learning_rate": 5.5108483923179686e-06, "loss": 0.0219, "step": 5825 }, { "epoch": 1.5623491552695092, "grad_norm": 0.23293792904271826, "learning_rate": 5.509296344898769e-06, "loss": 0.0192, "step": 5826 }, { "epoch": 1.5626173236792704, "grad_norm": 0.2757454661697072, "learning_rate": 5.507744247890737e-06, "loss": 0.0202, "step": 5827 }, { "epoch": 1.5628854920890318, "grad_norm": 0.25516461802360024, "learning_rate": 5.506192101444996e-06, "loss": 0.0277, "step": 5828 }, { "epoch": 1.5631536604987932, "grad_norm": 0.15995263879387953, "learning_rate": 5.504639905712673e-06, "loss": 0.016, "step": 5829 }, { "epoch": 1.5634218289085546, "grad_norm": 0.24089787510113514, "learning_rate": 5.503087660844902e-06, "loss": 0.023, "step": 5830 }, { "epoch": 1.563689997318316, "grad_norm": 0.2034493903208532, "learning_rate": 5.501535366992822e-06, "loss": 0.0174, "step": 5831 }, { "epoch": 1.5639581657280772, "grad_norm": 0.2655021878270429, "learning_rate": 5.499983024307575e-06, "loss": 0.0225, "step": 5832 }, { "epoch": 1.5642263341378384, "grad_norm": 0.2549957199837677, "learning_rate": 5.498430632940308e-06, "loss": 0.0272, "step": 5833 }, { "epoch": 1.5644945025475998, "grad_norm": 0.3743131073677069, "learning_rate": 5.496878193042174e-06, "loss": 0.0218, "step": 5834 }, { "epoch": 1.5647626709573612, "grad_norm": 0.29935273657104855, "learning_rate": 5.4953257047643284e-06, "loss": 0.0258, "step": 5835 }, { "epoch": 1.5650308393671226, "grad_norm": 0.26077473137465346, "learning_rate": 5.493773168257934e-06, "loss": 0.0212, "step": 5836 }, { "epoch": 1.565299007776884, "grad_norm": 0.320467895905032, "learning_rate": 5.492220583674158e-06, "loss": 0.0236, "step": 5837 }, { "epoch": 1.5655671761866452, "grad_norm": 0.2527012922901406, "learning_rate": 5.49066795116417e-06, "loss": 0.021, "step": 5838 }, { "epoch": 1.5658353445964064, "grad_norm": 0.2645461569879721, "learning_rate": 5.4891152708791476e-06, "loss": 0.0212, "step": 5839 }, { "epoch": 1.5661035130061678, "grad_norm": 0.34149655158928116, "learning_rate": 5.487562542970268e-06, "loss": 0.0251, "step": 5840 }, { "epoch": 1.5663716814159292, "grad_norm": 0.27729191409247045, "learning_rate": 5.4860097675887184e-06, "loss": 0.0226, "step": 5841 }, { "epoch": 1.5666398498256906, "grad_norm": 0.27102400836669815, "learning_rate": 5.48445694488569e-06, "loss": 0.0202, "step": 5842 }, { "epoch": 1.566908018235452, "grad_norm": 0.21191969622596857, "learning_rate": 5.482904075012373e-06, "loss": 0.0155, "step": 5843 }, { "epoch": 1.5671761866452132, "grad_norm": 0.3332278755523665, "learning_rate": 5.481351158119969e-06, "loss": 0.0393, "step": 5844 }, { "epoch": 1.5674443550549744, "grad_norm": 0.2294410548586722, "learning_rate": 5.479798194359681e-06, "loss": 0.0169, "step": 5845 }, { "epoch": 1.5677125234647358, "grad_norm": 0.45667269274626, "learning_rate": 5.478245183882716e-06, "loss": 0.0281, "step": 5846 }, { "epoch": 1.5679806918744972, "grad_norm": 0.2274918709890738, "learning_rate": 5.476692126840287e-06, "loss": 0.0136, "step": 5847 }, { "epoch": 1.5682488602842586, "grad_norm": 0.2894774191830741, "learning_rate": 5.475139023383613e-06, "loss": 0.0248, "step": 5848 }, { "epoch": 1.56851702869402, "grad_norm": 0.4493864804380648, "learning_rate": 5.473585873663912e-06, "loss": 0.0207, "step": 5849 }, { "epoch": 1.5687851971037812, "grad_norm": 0.22222687959419377, "learning_rate": 5.472032677832413e-06, "loss": 0.018, "step": 5850 }, { "epoch": 1.5690533655135424, "grad_norm": 0.2726627808842648, "learning_rate": 5.470479436040345e-06, "loss": 0.0268, "step": 5851 }, { "epoch": 1.5693215339233038, "grad_norm": 0.23237193897466998, "learning_rate": 5.468926148438941e-06, "loss": 0.0174, "step": 5852 }, { "epoch": 1.5695897023330652, "grad_norm": 0.35849497184247436, "learning_rate": 5.4673728151794445e-06, "loss": 0.0412, "step": 5853 }, { "epoch": 1.5698578707428266, "grad_norm": 0.21967626194382844, "learning_rate": 5.4658194364131e-06, "loss": 0.0165, "step": 5854 }, { "epoch": 1.570126039152588, "grad_norm": 0.2336329812437427, "learning_rate": 5.464266012291152e-06, "loss": 0.0215, "step": 5855 }, { "epoch": 1.5703942075623492, "grad_norm": 0.2315289778832945, "learning_rate": 5.4627125429648554e-06, "loss": 0.0194, "step": 5856 }, { "epoch": 1.5706623759721103, "grad_norm": 0.2970776996570779, "learning_rate": 5.461159028585465e-06, "loss": 0.0269, "step": 5857 }, { "epoch": 1.5709305443818717, "grad_norm": 0.3559462998382894, "learning_rate": 5.459605469304246e-06, "loss": 0.0191, "step": 5858 }, { "epoch": 1.5711987127916331, "grad_norm": 0.2229350753484545, "learning_rate": 5.458051865272462e-06, "loss": 0.0221, "step": 5859 }, { "epoch": 1.5714668812013945, "grad_norm": 0.22543193089532465, "learning_rate": 5.456498216641386e-06, "loss": 0.0167, "step": 5860 }, { "epoch": 1.571735049611156, "grad_norm": 0.4031050755784418, "learning_rate": 5.45494452356229e-06, "loss": 0.0299, "step": 5861 }, { "epoch": 1.5720032180209171, "grad_norm": 0.24041060338818002, "learning_rate": 5.453390786186451e-06, "loss": 0.0173, "step": 5862 }, { "epoch": 1.5722713864306783, "grad_norm": 0.2667667689363464, "learning_rate": 5.451837004665157e-06, "loss": 0.0237, "step": 5863 }, { "epoch": 1.5725395548404397, "grad_norm": 0.4133156903539484, "learning_rate": 5.450283179149694e-06, "loss": 0.031, "step": 5864 }, { "epoch": 1.5728077232502011, "grad_norm": 0.4072169726708982, "learning_rate": 5.448729309791352e-06, "loss": 0.0303, "step": 5865 }, { "epoch": 1.5730758916599625, "grad_norm": 0.24392834757441723, "learning_rate": 5.44717539674143e-06, "loss": 0.0269, "step": 5866 }, { "epoch": 1.573344060069724, "grad_norm": 0.24925391065428767, "learning_rate": 5.445621440151224e-06, "loss": 0.0219, "step": 5867 }, { "epoch": 1.5736122284794851, "grad_norm": 0.23223701915434392, "learning_rate": 5.444067440172044e-06, "loss": 0.0204, "step": 5868 }, { "epoch": 1.5738803968892463, "grad_norm": 0.2391676909841526, "learning_rate": 5.4425133969551955e-06, "loss": 0.0206, "step": 5869 }, { "epoch": 1.5741485652990077, "grad_norm": 0.2414102179528656, "learning_rate": 5.440959310651992e-06, "loss": 0.0224, "step": 5870 }, { "epoch": 1.5744167337087691, "grad_norm": 0.3047875178474884, "learning_rate": 5.439405181413752e-06, "loss": 0.0309, "step": 5871 }, { "epoch": 1.5746849021185305, "grad_norm": 0.28948959480008596, "learning_rate": 5.437851009391796e-06, "loss": 0.0279, "step": 5872 }, { "epoch": 1.574953070528292, "grad_norm": 0.27862069444331405, "learning_rate": 5.436296794737449e-06, "loss": 0.0255, "step": 5873 }, { "epoch": 1.575221238938053, "grad_norm": 0.30435352952368366, "learning_rate": 5.4347425376020425e-06, "loss": 0.0392, "step": 5874 }, { "epoch": 1.5754894073478143, "grad_norm": 1.40333211171616, "learning_rate": 5.433188238136908e-06, "loss": 0.0336, "step": 5875 }, { "epoch": 1.5757575757575757, "grad_norm": 0.3040604626568992, "learning_rate": 5.431633896493386e-06, "loss": 0.0248, "step": 5876 }, { "epoch": 1.576025744167337, "grad_norm": 0.20880730600122188, "learning_rate": 5.430079512822816e-06, "loss": 0.016, "step": 5877 }, { "epoch": 1.5762939125770985, "grad_norm": 0.2617828668570294, "learning_rate": 5.428525087276547e-06, "loss": 0.0234, "step": 5878 }, { "epoch": 1.57656208098686, "grad_norm": 1.0351001448133075, "learning_rate": 5.426970620005928e-06, "loss": 0.0259, "step": 5879 }, { "epoch": 1.576830249396621, "grad_norm": 0.35742636956913065, "learning_rate": 5.425416111162313e-06, "loss": 0.0336, "step": 5880 }, { "epoch": 1.5770984178063823, "grad_norm": 0.39148249161812765, "learning_rate": 5.423861560897061e-06, "loss": 0.0272, "step": 5881 }, { "epoch": 1.5773665862161437, "grad_norm": 0.26025513724092664, "learning_rate": 5.422306969361534e-06, "loss": 0.0241, "step": 5882 }, { "epoch": 1.577634754625905, "grad_norm": 0.24563358992123274, "learning_rate": 5.420752336707098e-06, "loss": 0.0249, "step": 5883 }, { "epoch": 1.5779029230356665, "grad_norm": 0.26113091987661613, "learning_rate": 5.419197663085124e-06, "loss": 0.0189, "step": 5884 }, { "epoch": 1.5781710914454279, "grad_norm": 0.19755526317127137, "learning_rate": 5.4176429486469874e-06, "loss": 0.0205, "step": 5885 }, { "epoch": 1.578439259855189, "grad_norm": 0.27211830904507195, "learning_rate": 5.416088193544065e-06, "loss": 0.0237, "step": 5886 }, { "epoch": 1.5787074282649503, "grad_norm": 0.40750040005999355, "learning_rate": 5.41453339792774e-06, "loss": 0.0313, "step": 5887 }, { "epoch": 1.5789755966747117, "grad_norm": 0.30350027107092764, "learning_rate": 5.412978561949399e-06, "loss": 0.0206, "step": 5888 }, { "epoch": 1.579243765084473, "grad_norm": 0.2441571527278083, "learning_rate": 5.41142368576043e-06, "loss": 0.0239, "step": 5889 }, { "epoch": 1.5795119334942345, "grad_norm": 0.29404984651520594, "learning_rate": 5.409868769512232e-06, "loss": 0.0253, "step": 5890 }, { "epoch": 1.5797801019039959, "grad_norm": 0.39778103027842027, "learning_rate": 5.408313813356199e-06, "loss": 0.0276, "step": 5891 }, { "epoch": 1.580048270313757, "grad_norm": 0.271169906933484, "learning_rate": 5.406758817443734e-06, "loss": 0.0284, "step": 5892 }, { "epoch": 1.5803164387235182, "grad_norm": 0.23557018880994973, "learning_rate": 5.405203781926243e-06, "loss": 0.0243, "step": 5893 }, { "epoch": 1.5805846071332796, "grad_norm": 0.20517366119919953, "learning_rate": 5.403648706955134e-06, "loss": 0.0148, "step": 5894 }, { "epoch": 1.580852775543041, "grad_norm": 0.27899723205633437, "learning_rate": 5.402093592681823e-06, "loss": 0.0252, "step": 5895 }, { "epoch": 1.5811209439528024, "grad_norm": 0.3467633689642137, "learning_rate": 5.400538439257728e-06, "loss": 0.0239, "step": 5896 }, { "epoch": 1.5813891123625639, "grad_norm": 0.38836358272094407, "learning_rate": 5.398983246834266e-06, "loss": 0.0274, "step": 5897 }, { "epoch": 1.581657280772325, "grad_norm": 0.36891585054561965, "learning_rate": 5.397428015562867e-06, "loss": 0.0285, "step": 5898 }, { "epoch": 1.5819254491820862, "grad_norm": 0.2175745974120072, "learning_rate": 5.395872745594956e-06, "loss": 0.0198, "step": 5899 }, { "epoch": 1.5821936175918476, "grad_norm": 0.18597927012916518, "learning_rate": 5.3943174370819655e-06, "loss": 0.0156, "step": 5900 }, { "epoch": 1.582461786001609, "grad_norm": 0.20825942034196168, "learning_rate": 5.392762090175334e-06, "loss": 0.0281, "step": 5901 }, { "epoch": 1.5827299544113704, "grad_norm": 0.23259600966112637, "learning_rate": 5.391206705026503e-06, "loss": 0.0168, "step": 5902 }, { "epoch": 1.5829981228211318, "grad_norm": 0.401115360485174, "learning_rate": 5.3896512817869106e-06, "loss": 0.0292, "step": 5903 }, { "epoch": 1.583266291230893, "grad_norm": 0.2973006326095244, "learning_rate": 5.388095820608009e-06, "loss": 0.0286, "step": 5904 }, { "epoch": 1.5835344596406542, "grad_norm": 0.2709043594758099, "learning_rate": 5.386540321641246e-06, "loss": 0.0248, "step": 5905 }, { "epoch": 1.5838026280504156, "grad_norm": 0.2349072638507586, "learning_rate": 5.3849847850380795e-06, "loss": 0.0229, "step": 5906 }, { "epoch": 1.584070796460177, "grad_norm": 0.20478047669248844, "learning_rate": 5.383429210949967e-06, "loss": 0.0209, "step": 5907 }, { "epoch": 1.5843389648699384, "grad_norm": 0.4071892187392045, "learning_rate": 5.3818735995283696e-06, "loss": 0.0276, "step": 5908 }, { "epoch": 1.5846071332796996, "grad_norm": 0.37477060557175523, "learning_rate": 5.380317950924754e-06, "loss": 0.0287, "step": 5909 }, { "epoch": 1.584875301689461, "grad_norm": 0.1873676279645749, "learning_rate": 5.378762265290588e-06, "loss": 0.0166, "step": 5910 }, { "epoch": 1.5851434700992222, "grad_norm": 0.2326070774761628, "learning_rate": 5.377206542777347e-06, "loss": 0.0201, "step": 5911 }, { "epoch": 1.5854116385089836, "grad_norm": 0.41848050492964683, "learning_rate": 5.3756507835365055e-06, "loss": 0.0221, "step": 5912 }, { "epoch": 1.585679806918745, "grad_norm": 0.22848219656747337, "learning_rate": 5.374094987719546e-06, "loss": 0.0194, "step": 5913 }, { "epoch": 1.5859479753285064, "grad_norm": 0.27227213121114097, "learning_rate": 5.372539155477949e-06, "loss": 0.025, "step": 5914 }, { "epoch": 1.5862161437382676, "grad_norm": 0.2627599685784755, "learning_rate": 5.370983286963205e-06, "loss": 0.0197, "step": 5915 }, { "epoch": 1.586484312148029, "grad_norm": 0.3673819508367893, "learning_rate": 5.3694273823268025e-06, "loss": 0.0365, "step": 5916 }, { "epoch": 1.5867524805577902, "grad_norm": 0.24787909188307017, "learning_rate": 5.367871441720237e-06, "loss": 0.0221, "step": 5917 }, { "epoch": 1.5870206489675516, "grad_norm": 0.256494757800115, "learning_rate": 5.366315465295007e-06, "loss": 0.0273, "step": 5918 }, { "epoch": 1.587288817377313, "grad_norm": 0.22657997176062997, "learning_rate": 5.36475945320261e-06, "loss": 0.0203, "step": 5919 }, { "epoch": 1.5875569857870744, "grad_norm": 0.2571208789377835, "learning_rate": 5.363203405594556e-06, "loss": 0.0229, "step": 5920 }, { "epoch": 1.5878251541968356, "grad_norm": 0.24036200495959897, "learning_rate": 5.361647322622349e-06, "loss": 0.0199, "step": 5921 }, { "epoch": 1.588093322606597, "grad_norm": 0.29267885056346876, "learning_rate": 5.360091204437503e-06, "loss": 0.0302, "step": 5922 }, { "epoch": 1.5883614910163582, "grad_norm": 0.27554752034193536, "learning_rate": 5.358535051191533e-06, "loss": 0.0317, "step": 5923 }, { "epoch": 1.5886296594261196, "grad_norm": 0.36860529882388043, "learning_rate": 5.3569788630359555e-06, "loss": 0.0271, "step": 5924 }, { "epoch": 1.588897827835881, "grad_norm": 0.2001404589355475, "learning_rate": 5.355422640122295e-06, "loss": 0.0189, "step": 5925 }, { "epoch": 1.5891659962456424, "grad_norm": 0.30703510503116255, "learning_rate": 5.353866382602075e-06, "loss": 0.0296, "step": 5926 }, { "epoch": 1.5894341646554035, "grad_norm": 0.19684585323053286, "learning_rate": 5.352310090626825e-06, "loss": 0.0175, "step": 5927 }, { "epoch": 1.589702333065165, "grad_norm": 0.24399467623586565, "learning_rate": 5.350753764348078e-06, "loss": 0.0235, "step": 5928 }, { "epoch": 1.5899705014749261, "grad_norm": 0.22616408827129386, "learning_rate": 5.349197403917367e-06, "loss": 0.0167, "step": 5929 }, { "epoch": 1.5902386698846875, "grad_norm": 0.30793300379073, "learning_rate": 5.347641009486232e-06, "loss": 0.0359, "step": 5930 }, { "epoch": 1.590506838294449, "grad_norm": 0.22846964947760046, "learning_rate": 5.346084581206215e-06, "loss": 0.0203, "step": 5931 }, { "epoch": 1.5907750067042103, "grad_norm": 0.3042662760094639, "learning_rate": 5.34452811922886e-06, "loss": 0.0286, "step": 5932 }, { "epoch": 1.5910431751139715, "grad_norm": 0.3454143053370181, "learning_rate": 5.3429716237057195e-06, "loss": 0.0261, "step": 5933 }, { "epoch": 1.591311343523733, "grad_norm": 0.6230807884794777, "learning_rate": 5.3414150947883405e-06, "loss": 0.0348, "step": 5934 }, { "epoch": 1.5915795119334941, "grad_norm": 0.26295699529041155, "learning_rate": 5.33985853262828e-06, "loss": 0.0229, "step": 5935 }, { "epoch": 1.5918476803432555, "grad_norm": 0.28842054638345643, "learning_rate": 5.338301937377098e-06, "loss": 0.022, "step": 5936 }, { "epoch": 1.592115848753017, "grad_norm": 0.27344782957928476, "learning_rate": 5.336745309186352e-06, "loss": 0.0253, "step": 5937 }, { "epoch": 1.5923840171627783, "grad_norm": 0.27272408498609274, "learning_rate": 5.335188648207611e-06, "loss": 0.0217, "step": 5938 }, { "epoch": 1.5926521855725395, "grad_norm": 0.2515661389659537, "learning_rate": 5.333631954592443e-06, "loss": 0.0242, "step": 5939 }, { "epoch": 1.592920353982301, "grad_norm": 0.2772289401029188, "learning_rate": 5.332075228492415e-06, "loss": 0.0254, "step": 5940 }, { "epoch": 1.593188522392062, "grad_norm": 0.2964947650971952, "learning_rate": 5.330518470059104e-06, "loss": 0.0269, "step": 5941 }, { "epoch": 1.5934566908018235, "grad_norm": 0.25682096151794853, "learning_rate": 5.328961679444088e-06, "loss": 0.0181, "step": 5942 }, { "epoch": 1.593724859211585, "grad_norm": 0.2346649369431299, "learning_rate": 5.327404856798944e-06, "loss": 0.0237, "step": 5943 }, { "epoch": 1.5939930276213463, "grad_norm": 0.3113577938375682, "learning_rate": 5.3258480022752605e-06, "loss": 0.0358, "step": 5944 }, { "epoch": 1.5942611960311075, "grad_norm": 0.28413354263208634, "learning_rate": 5.324291116024624e-06, "loss": 0.0238, "step": 5945 }, { "epoch": 1.594529364440869, "grad_norm": 0.35919195492328093, "learning_rate": 5.3227341981986195e-06, "loss": 0.0274, "step": 5946 }, { "epoch": 1.59479753285063, "grad_norm": 0.29814673508237627, "learning_rate": 5.321177248948843e-06, "loss": 0.0269, "step": 5947 }, { "epoch": 1.5950657012603915, "grad_norm": 0.5370517814596732, "learning_rate": 5.31962026842689e-06, "loss": 0.0325, "step": 5948 }, { "epoch": 1.595333869670153, "grad_norm": 0.21727628818489103, "learning_rate": 5.318063256784362e-06, "loss": 0.0143, "step": 5949 }, { "epoch": 1.5956020380799143, "grad_norm": 0.31274065648392296, "learning_rate": 5.316506214172859e-06, "loss": 0.0256, "step": 5950 }, { "epoch": 1.5958702064896755, "grad_norm": 0.299232386328162, "learning_rate": 5.314949140743987e-06, "loss": 0.0393, "step": 5951 }, { "epoch": 1.5961383748994369, "grad_norm": 0.28199596439634944, "learning_rate": 5.313392036649352e-06, "loss": 0.0232, "step": 5952 }, { "epoch": 1.596406543309198, "grad_norm": 0.7740250048378449, "learning_rate": 5.311834902040565e-06, "loss": 0.0265, "step": 5953 }, { "epoch": 1.5966747117189595, "grad_norm": 0.2339104992962593, "learning_rate": 5.310277737069243e-06, "loss": 0.0199, "step": 5954 }, { "epoch": 1.5969428801287209, "grad_norm": 0.18196098600313304, "learning_rate": 5.3087205418870014e-06, "loss": 0.0183, "step": 5955 }, { "epoch": 1.5972110485384823, "grad_norm": 0.32928732196638727, "learning_rate": 5.30716331664546e-06, "loss": 0.0267, "step": 5956 }, { "epoch": 1.5974792169482435, "grad_norm": 0.21388404808914088, "learning_rate": 5.305606061496245e-06, "loss": 0.0206, "step": 5957 }, { "epoch": 1.5977473853580049, "grad_norm": 0.2699802934236477, "learning_rate": 5.304048776590974e-06, "loss": 0.0262, "step": 5958 }, { "epoch": 1.598015553767766, "grad_norm": 0.25632449599985513, "learning_rate": 5.302491462081284e-06, "loss": 0.0209, "step": 5959 }, { "epoch": 1.5982837221775275, "grad_norm": 0.18273788289269718, "learning_rate": 5.300934118118802e-06, "loss": 0.0168, "step": 5960 }, { "epoch": 1.5985518905872889, "grad_norm": 0.35557764025654975, "learning_rate": 5.299376744855164e-06, "loss": 0.0186, "step": 5961 }, { "epoch": 1.5988200589970503, "grad_norm": 0.2835202399023816, "learning_rate": 5.2978193424420085e-06, "loss": 0.0273, "step": 5962 }, { "epoch": 1.5990882274068114, "grad_norm": 0.23649122288407107, "learning_rate": 5.296261911030974e-06, "loss": 0.0194, "step": 5963 }, { "epoch": 1.5993563958165729, "grad_norm": 0.36240407725143836, "learning_rate": 5.294704450773703e-06, "loss": 0.0253, "step": 5964 }, { "epoch": 1.599624564226334, "grad_norm": 0.188571532012262, "learning_rate": 5.293146961821844e-06, "loss": 0.0171, "step": 5965 }, { "epoch": 1.5998927326360954, "grad_norm": 0.24442335426769185, "learning_rate": 5.291589444327041e-06, "loss": 0.0222, "step": 5966 }, { "epoch": 1.6001609010458568, "grad_norm": 0.41593302365873674, "learning_rate": 5.29003189844095e-06, "loss": 0.035, "step": 5967 }, { "epoch": 1.6004290694556182, "grad_norm": 0.32286049397605465, "learning_rate": 5.2884743243152215e-06, "loss": 0.0186, "step": 5968 }, { "epoch": 1.6006972378653794, "grad_norm": 0.32079716822399607, "learning_rate": 5.286916722101515e-06, "loss": 0.029, "step": 5969 }, { "epoch": 1.6009654062751408, "grad_norm": 0.3842356553383585, "learning_rate": 5.285359091951488e-06, "loss": 0.023, "step": 5970 }, { "epoch": 1.601233574684902, "grad_norm": 0.32706843211427317, "learning_rate": 5.283801434016804e-06, "loss": 0.0188, "step": 5971 }, { "epoch": 1.6015017430946634, "grad_norm": 0.22574854414977236, "learning_rate": 5.282243748449128e-06, "loss": 0.0235, "step": 5972 }, { "epoch": 1.6017699115044248, "grad_norm": 0.28283305260577324, "learning_rate": 5.280686035400129e-06, "loss": 0.0288, "step": 5973 }, { "epoch": 1.6020380799141862, "grad_norm": 0.22885691873132874, "learning_rate": 5.279128295021475e-06, "loss": 0.0174, "step": 5974 }, { "epoch": 1.6023062483239474, "grad_norm": 0.35956557902849146, "learning_rate": 5.277570527464839e-06, "loss": 0.0201, "step": 5975 }, { "epoch": 1.6025744167337088, "grad_norm": 0.2377556920890842, "learning_rate": 5.2760127328818985e-06, "loss": 0.0223, "step": 5976 }, { "epoch": 1.60284258514347, "grad_norm": 0.3101119214428328, "learning_rate": 5.274454911424332e-06, "loss": 0.0175, "step": 5977 }, { "epoch": 1.6031107535532314, "grad_norm": 0.3094101265218406, "learning_rate": 5.272897063243818e-06, "loss": 0.0265, "step": 5978 }, { "epoch": 1.6033789219629928, "grad_norm": 0.27841471904500864, "learning_rate": 5.2713391884920415e-06, "loss": 0.026, "step": 5979 }, { "epoch": 1.6036470903727542, "grad_norm": 0.21221480698318357, "learning_rate": 5.269781287320688e-06, "loss": 0.0204, "step": 5980 }, { "epoch": 1.6039152587825154, "grad_norm": 0.25175395440423437, "learning_rate": 5.268223359881449e-06, "loss": 0.0196, "step": 5981 }, { "epoch": 1.6041834271922768, "grad_norm": 0.18821356695325567, "learning_rate": 5.266665406326013e-06, "loss": 0.0157, "step": 5982 }, { "epoch": 1.604451595602038, "grad_norm": 0.39938492382497204, "learning_rate": 5.265107426806073e-06, "loss": 0.0345, "step": 5983 }, { "epoch": 1.6047197640117994, "grad_norm": 0.2870471474656807, "learning_rate": 5.263549421473327e-06, "loss": 0.024, "step": 5984 }, { "epoch": 1.6049879324215608, "grad_norm": 0.2572883142093176, "learning_rate": 5.261991390479473e-06, "loss": 0.0314, "step": 5985 }, { "epoch": 1.6052561008313222, "grad_norm": 0.2942134032355729, "learning_rate": 5.260433333976214e-06, "loss": 0.0214, "step": 5986 }, { "epoch": 1.6055242692410834, "grad_norm": 0.326256260329595, "learning_rate": 5.258875252115253e-06, "loss": 0.027, "step": 5987 }, { "epoch": 1.6057924376508448, "grad_norm": 0.26644813713302845, "learning_rate": 5.2573171450482964e-06, "loss": 0.0254, "step": 5988 }, { "epoch": 1.606060606060606, "grad_norm": 0.26920740438061475, "learning_rate": 5.255759012927051e-06, "loss": 0.0242, "step": 5989 }, { "epoch": 1.6063287744703674, "grad_norm": 0.2917397965364728, "learning_rate": 5.254200855903231e-06, "loss": 0.025, "step": 5990 }, { "epoch": 1.6065969428801288, "grad_norm": 0.22476125679757147, "learning_rate": 5.2526426741285465e-06, "loss": 0.0224, "step": 5991 }, { "epoch": 1.6068651112898902, "grad_norm": 0.24064636953706056, "learning_rate": 5.251084467754717e-06, "loss": 0.0265, "step": 5992 }, { "epoch": 1.6071332796996514, "grad_norm": 0.18621273956615175, "learning_rate": 5.24952623693346e-06, "loss": 0.0147, "step": 5993 }, { "epoch": 1.6074014481094125, "grad_norm": 0.2606553170068972, "learning_rate": 5.247967981816495e-06, "loss": 0.0234, "step": 5994 }, { "epoch": 1.607669616519174, "grad_norm": 0.27529611039384405, "learning_rate": 5.246409702555546e-06, "loss": 0.0321, "step": 5995 }, { "epoch": 1.6079377849289354, "grad_norm": 0.24850101839979494, "learning_rate": 5.244851399302337e-06, "loss": 0.0214, "step": 5996 }, { "epoch": 1.6082059533386968, "grad_norm": 0.280861568674063, "learning_rate": 5.243293072208599e-06, "loss": 0.0197, "step": 5997 }, { "epoch": 1.6084741217484582, "grad_norm": 0.26195068069906485, "learning_rate": 5.24173472142606e-06, "loss": 0.0263, "step": 5998 }, { "epoch": 1.6087422901582193, "grad_norm": 0.21410842407406058, "learning_rate": 5.240176347106455e-06, "loss": 0.0185, "step": 5999 }, { "epoch": 1.6090104585679805, "grad_norm": 0.2162034235587646, "learning_rate": 5.2386179494015155e-06, "loss": 0.0161, "step": 6000 }, { "epoch": 1.609278626977742, "grad_norm": 0.2556408572684341, "learning_rate": 5.237059528462978e-06, "loss": 0.0213, "step": 6001 }, { "epoch": 1.6095467953875033, "grad_norm": 0.2433063478818582, "learning_rate": 5.235501084442584e-06, "loss": 0.0206, "step": 6002 }, { "epoch": 1.6098149637972647, "grad_norm": 0.18704173480783193, "learning_rate": 5.233942617492077e-06, "loss": 0.0166, "step": 6003 }, { "epoch": 1.6100831322070261, "grad_norm": 0.3414662461042351, "learning_rate": 5.232384127763197e-06, "loss": 0.0286, "step": 6004 }, { "epoch": 1.6103513006167873, "grad_norm": 0.30623816719307223, "learning_rate": 5.230825615407692e-06, "loss": 0.0325, "step": 6005 }, { "epoch": 1.6106194690265485, "grad_norm": 0.30049527260513015, "learning_rate": 5.229267080577308e-06, "loss": 0.0245, "step": 6006 }, { "epoch": 1.61088763743631, "grad_norm": 0.3291948578205075, "learning_rate": 5.227708523423799e-06, "loss": 0.022, "step": 6007 }, { "epoch": 1.6111558058460713, "grad_norm": 0.24892906175819682, "learning_rate": 5.2261499440989145e-06, "loss": 0.018, "step": 6008 }, { "epoch": 1.6114239742558327, "grad_norm": 0.30053619352184857, "learning_rate": 5.22459134275441e-06, "loss": 0.0312, "step": 6009 }, { "epoch": 1.6116921426655941, "grad_norm": 0.2800944378429468, "learning_rate": 5.223032719542044e-06, "loss": 0.0251, "step": 6010 }, { "epoch": 1.6119603110753553, "grad_norm": 0.37663210803717057, "learning_rate": 5.221474074613574e-06, "loss": 0.0455, "step": 6011 }, { "epoch": 1.6122284794851165, "grad_norm": 0.29887326190554997, "learning_rate": 5.21991540812076e-06, "loss": 0.0202, "step": 6012 }, { "epoch": 1.612496647894878, "grad_norm": 0.18280999941730336, "learning_rate": 5.218356720215369e-06, "loss": 0.0187, "step": 6013 }, { "epoch": 1.6127648163046393, "grad_norm": 0.5243515358027353, "learning_rate": 5.216798011049162e-06, "loss": 0.037, "step": 6014 }, { "epoch": 1.6130329847144007, "grad_norm": 0.35393016554438766, "learning_rate": 5.215239280773908e-06, "loss": 0.0252, "step": 6015 }, { "epoch": 1.6133011531241621, "grad_norm": 0.2677728232561824, "learning_rate": 5.213680529541378e-06, "loss": 0.0149, "step": 6016 }, { "epoch": 1.6135693215339233, "grad_norm": 0.27862803882646314, "learning_rate": 5.212121757503341e-06, "loss": 0.0261, "step": 6017 }, { "epoch": 1.6138374899436845, "grad_norm": 0.27686407190652823, "learning_rate": 5.210562964811573e-06, "loss": 0.0211, "step": 6018 }, { "epoch": 1.6141056583534459, "grad_norm": 0.2561234659475586, "learning_rate": 5.209004151617847e-06, "loss": 0.0254, "step": 6019 }, { "epoch": 1.6143738267632073, "grad_norm": 0.4276827291126018, "learning_rate": 5.2074453180739415e-06, "loss": 0.0302, "step": 6020 }, { "epoch": 1.6146419951729687, "grad_norm": 0.3012334452437274, "learning_rate": 5.2058864643316376e-06, "loss": 0.0283, "step": 6021 }, { "epoch": 1.61491016358273, "grad_norm": 0.24279073294681833, "learning_rate": 5.204327590542714e-06, "loss": 0.0283, "step": 6022 }, { "epoch": 1.6151783319924913, "grad_norm": 0.7858266002805424, "learning_rate": 5.202768696858955e-06, "loss": 0.0272, "step": 6023 }, { "epoch": 1.6154465004022525, "grad_norm": 0.23956895246764823, "learning_rate": 5.201209783432148e-06, "loss": 0.0258, "step": 6024 }, { "epoch": 1.6157146688120139, "grad_norm": 0.27379602379957696, "learning_rate": 5.199650850414078e-06, "loss": 0.0217, "step": 6025 }, { "epoch": 1.6159828372217753, "grad_norm": 0.29975127098184795, "learning_rate": 5.198091897956533e-06, "loss": 0.0307, "step": 6026 }, { "epoch": 1.6162510056315367, "grad_norm": 0.29321911292415415, "learning_rate": 5.196532926211307e-06, "loss": 0.024, "step": 6027 }, { "epoch": 1.616519174041298, "grad_norm": 0.34472867113858013, "learning_rate": 5.194973935330192e-06, "loss": 0.0197, "step": 6028 }, { "epoch": 1.6167873424510593, "grad_norm": 0.29235380736397093, "learning_rate": 5.193414925464984e-06, "loss": 0.0217, "step": 6029 }, { "epoch": 1.6170555108608204, "grad_norm": 0.23448410003232165, "learning_rate": 5.191855896767476e-06, "loss": 0.0239, "step": 6030 }, { "epoch": 1.6173236792705818, "grad_norm": 0.27416450321143826, "learning_rate": 5.190296849389469e-06, "loss": 0.0215, "step": 6031 }, { "epoch": 1.6175918476803433, "grad_norm": 0.2892588302737628, "learning_rate": 5.188737783482763e-06, "loss": 0.0267, "step": 6032 }, { "epoch": 1.6178600160901047, "grad_norm": 0.2994178332088737, "learning_rate": 5.18717869919916e-06, "loss": 0.0297, "step": 6033 }, { "epoch": 1.618128184499866, "grad_norm": 0.3097190474461079, "learning_rate": 5.185619596690465e-06, "loss": 0.0278, "step": 6034 }, { "epoch": 1.6183963529096272, "grad_norm": 0.4767684147008618, "learning_rate": 5.184060476108482e-06, "loss": 0.0489, "step": 6035 }, { "epoch": 1.6186645213193884, "grad_norm": 0.23188519277538727, "learning_rate": 5.18250133760502e-06, "loss": 0.0232, "step": 6036 }, { "epoch": 1.6189326897291498, "grad_norm": 0.30037075235176075, "learning_rate": 5.180942181331887e-06, "loss": 0.0253, "step": 6037 }, { "epoch": 1.6192008581389112, "grad_norm": 0.266405387697413, "learning_rate": 5.179383007440895e-06, "loss": 0.0183, "step": 6038 }, { "epoch": 1.6194690265486726, "grad_norm": 0.2564585842780013, "learning_rate": 5.177823816083853e-06, "loss": 0.0306, "step": 6039 }, { "epoch": 1.619737194958434, "grad_norm": 0.3627470151918637, "learning_rate": 5.17626460741258e-06, "loss": 0.0257, "step": 6040 }, { "epoch": 1.6200053633681952, "grad_norm": 0.29836857151728524, "learning_rate": 5.174705381578889e-06, "loss": 0.0248, "step": 6041 }, { "epoch": 1.6202735317779564, "grad_norm": 0.2341975065759829, "learning_rate": 5.173146138734602e-06, "loss": 0.0225, "step": 6042 }, { "epoch": 1.6205417001877178, "grad_norm": 0.33041769120531833, "learning_rate": 5.171586879031533e-06, "loss": 0.0267, "step": 6043 }, { "epoch": 1.6208098685974792, "grad_norm": 0.2330067745350485, "learning_rate": 5.170027602621503e-06, "loss": 0.0193, "step": 6044 }, { "epoch": 1.6210780370072406, "grad_norm": 0.26148962792699865, "learning_rate": 5.1684683096563394e-06, "loss": 0.0289, "step": 6045 }, { "epoch": 1.621346205417002, "grad_norm": 0.20458666207828188, "learning_rate": 5.166909000287863e-06, "loss": 0.0204, "step": 6046 }, { "epoch": 1.6216143738267632, "grad_norm": 0.2363615157684955, "learning_rate": 5.1653496746679015e-06, "loss": 0.0249, "step": 6047 }, { "epoch": 1.6218825422365244, "grad_norm": 0.2530554438559414, "learning_rate": 5.163790332948281e-06, "loss": 0.0191, "step": 6048 }, { "epoch": 1.6221507106462858, "grad_norm": 0.2456789262000578, "learning_rate": 5.162230975280828e-06, "loss": 0.0216, "step": 6049 }, { "epoch": 1.6224188790560472, "grad_norm": 0.23074711947451249, "learning_rate": 5.160671601817378e-06, "loss": 0.0187, "step": 6050 }, { "epoch": 1.6226870474658086, "grad_norm": 0.23768296132007963, "learning_rate": 5.15911221270976e-06, "loss": 0.0207, "step": 6051 }, { "epoch": 1.62295521587557, "grad_norm": 0.25552137649982226, "learning_rate": 5.157552808109808e-06, "loss": 0.0245, "step": 6052 }, { "epoch": 1.6232233842853312, "grad_norm": 0.17883270148671665, "learning_rate": 5.155993388169358e-06, "loss": 0.015, "step": 6053 }, { "epoch": 1.6234915526950924, "grad_norm": 0.23532704641393742, "learning_rate": 5.154433953040247e-06, "loss": 0.0204, "step": 6054 }, { "epoch": 1.6237597211048538, "grad_norm": 0.25588525341142726, "learning_rate": 5.15287450287431e-06, "loss": 0.027, "step": 6055 }, { "epoch": 1.6240278895146152, "grad_norm": 0.4809724210214293, "learning_rate": 5.151315037823389e-06, "loss": 0.0234, "step": 6056 }, { "epoch": 1.6242960579243766, "grad_norm": 0.20666663597947887, "learning_rate": 5.149755558039325e-06, "loss": 0.0181, "step": 6057 }, { "epoch": 1.624564226334138, "grad_norm": 0.28275606776408146, "learning_rate": 5.14819606367396e-06, "loss": 0.0233, "step": 6058 }, { "epoch": 1.6248323947438992, "grad_norm": 0.3301984013941858, "learning_rate": 5.1466365548791374e-06, "loss": 0.0221, "step": 6059 }, { "epoch": 1.6251005631536604, "grad_norm": 0.2621536530004677, "learning_rate": 5.1450770318067035e-06, "loss": 0.0245, "step": 6060 }, { "epoch": 1.6253687315634218, "grad_norm": 0.24970437149897817, "learning_rate": 5.1435174946085035e-06, "loss": 0.0262, "step": 6061 }, { "epoch": 1.6256368999731832, "grad_norm": 0.2373632645904136, "learning_rate": 5.141957943436385e-06, "loss": 0.0181, "step": 6062 }, { "epoch": 1.6259050683829446, "grad_norm": 0.5757162828613352, "learning_rate": 5.140398378442201e-06, "loss": 0.0278, "step": 6063 }, { "epoch": 1.626173236792706, "grad_norm": 0.2624991113408737, "learning_rate": 5.138838799777798e-06, "loss": 0.026, "step": 6064 }, { "epoch": 1.6264414052024672, "grad_norm": 0.23832560544425088, "learning_rate": 5.137279207595029e-06, "loss": 0.0227, "step": 6065 }, { "epoch": 1.6267095736122283, "grad_norm": 0.24601496639083625, "learning_rate": 5.135719602045749e-06, "loss": 0.0255, "step": 6066 }, { "epoch": 1.6269777420219897, "grad_norm": 0.33635008573275843, "learning_rate": 5.134159983281812e-06, "loss": 0.0336, "step": 6067 }, { "epoch": 1.6272459104317512, "grad_norm": 0.3007709447918642, "learning_rate": 5.1326003514550734e-06, "loss": 0.0224, "step": 6068 }, { "epoch": 1.6275140788415126, "grad_norm": 0.3210673007629223, "learning_rate": 5.131040706717391e-06, "loss": 0.0287, "step": 6069 }, { "epoch": 1.627782247251274, "grad_norm": 0.29417267098866273, "learning_rate": 5.1294810492206215e-06, "loss": 0.021, "step": 6070 }, { "epoch": 1.6280504156610351, "grad_norm": 0.2301582234612842, "learning_rate": 5.127921379116626e-06, "loss": 0.018, "step": 6071 }, { "epoch": 1.6283185840707963, "grad_norm": 0.22663413201006224, "learning_rate": 5.126361696557267e-06, "loss": 0.0218, "step": 6072 }, { "epoch": 1.6285867524805577, "grad_norm": 0.18462084472413337, "learning_rate": 5.124802001694405e-06, "loss": 0.019, "step": 6073 }, { "epoch": 1.6288549208903191, "grad_norm": 0.33562067442140076, "learning_rate": 5.123242294679902e-06, "loss": 0.0193, "step": 6074 }, { "epoch": 1.6291230893000805, "grad_norm": 0.31308721531222267, "learning_rate": 5.121682575665625e-06, "loss": 0.0204, "step": 6075 }, { "epoch": 1.629391257709842, "grad_norm": 0.29623547599765365, "learning_rate": 5.120122844803438e-06, "loss": 0.0269, "step": 6076 }, { "epoch": 1.6296594261196031, "grad_norm": 0.2233984822872891, "learning_rate": 5.118563102245209e-06, "loss": 0.0191, "step": 6077 }, { "epoch": 1.6299275945293643, "grad_norm": 0.31272444422733586, "learning_rate": 5.117003348142806e-06, "loss": 0.0319, "step": 6078 }, { "epoch": 1.6301957629391257, "grad_norm": 0.17813320955010767, "learning_rate": 5.115443582648097e-06, "loss": 0.0152, "step": 6079 }, { "epoch": 1.6304639313488871, "grad_norm": 0.19825503189207636, "learning_rate": 5.113883805912954e-06, "loss": 0.0154, "step": 6080 }, { "epoch": 1.6307320997586485, "grad_norm": 0.26589363251148396, "learning_rate": 5.112324018089244e-06, "loss": 0.0206, "step": 6081 }, { "epoch": 1.6310002681684097, "grad_norm": 0.40941421157692737, "learning_rate": 5.1107642193288455e-06, "loss": 0.0221, "step": 6082 }, { "epoch": 1.631268436578171, "grad_norm": 0.3381186719260909, "learning_rate": 5.109204409783628e-06, "loss": 0.0248, "step": 6083 }, { "epoch": 1.6315366049879323, "grad_norm": 0.39045215737394795, "learning_rate": 5.107644589605468e-06, "loss": 0.028, "step": 6084 }, { "epoch": 1.6318047733976937, "grad_norm": 0.22200185977850678, "learning_rate": 5.106084758946239e-06, "loss": 0.0193, "step": 6085 }, { "epoch": 1.632072941807455, "grad_norm": 0.29156947592748633, "learning_rate": 5.1045249179578194e-06, "loss": 0.0192, "step": 6086 }, { "epoch": 1.6323411102172165, "grad_norm": 0.37698898476503057, "learning_rate": 5.102965066792085e-06, "loss": 0.0249, "step": 6087 }, { "epoch": 1.6326092786269777, "grad_norm": 0.1756982931067425, "learning_rate": 5.101405205600916e-06, "loss": 0.0175, "step": 6088 }, { "epoch": 1.632877447036739, "grad_norm": 0.394779058511191, "learning_rate": 5.099845334536192e-06, "loss": 0.0216, "step": 6089 }, { "epoch": 1.6331456154465003, "grad_norm": 0.22842017782334087, "learning_rate": 5.098285453749793e-06, "loss": 0.0173, "step": 6090 }, { "epoch": 1.6334137838562617, "grad_norm": 0.2359118837383206, "learning_rate": 5.0967255633936e-06, "loss": 0.0159, "step": 6091 }, { "epoch": 1.633681952266023, "grad_norm": 0.2970209266940588, "learning_rate": 5.095165663619493e-06, "loss": 0.0249, "step": 6092 }, { "epoch": 1.6339501206757845, "grad_norm": 0.22417536970352475, "learning_rate": 5.093605754579361e-06, "loss": 0.0238, "step": 6093 }, { "epoch": 1.6342182890855457, "grad_norm": 0.20927671610944049, "learning_rate": 5.092045836425085e-06, "loss": 0.0165, "step": 6094 }, { "epoch": 1.634486457495307, "grad_norm": 0.27103763245266027, "learning_rate": 5.09048590930855e-06, "loss": 0.0265, "step": 6095 }, { "epoch": 1.6347546259050683, "grad_norm": 0.24024158869159878, "learning_rate": 5.088925973381643e-06, "loss": 0.0202, "step": 6096 }, { "epoch": 1.6350227943148297, "grad_norm": 0.21876169914772395, "learning_rate": 5.087366028796248e-06, "loss": 0.0153, "step": 6097 }, { "epoch": 1.635290962724591, "grad_norm": 0.2845222612648831, "learning_rate": 5.085806075704256e-06, "loss": 0.0237, "step": 6098 }, { "epoch": 1.6355591311343525, "grad_norm": 0.19497967804508182, "learning_rate": 5.084246114257554e-06, "loss": 0.0173, "step": 6099 }, { "epoch": 1.6358272995441137, "grad_norm": 0.34828636169363086, "learning_rate": 5.082686144608031e-06, "loss": 0.0287, "step": 6100 }, { "epoch": 1.636095467953875, "grad_norm": 0.34117679103592735, "learning_rate": 5.081126166907579e-06, "loss": 0.0235, "step": 6101 }, { "epoch": 1.6363636363636362, "grad_norm": 0.3326581253294498, "learning_rate": 5.079566181308087e-06, "loss": 0.02, "step": 6102 }, { "epoch": 1.6366318047733976, "grad_norm": 0.42181084124198487, "learning_rate": 5.078006187961446e-06, "loss": 0.0312, "step": 6103 }, { "epoch": 1.636899973183159, "grad_norm": 0.316096892378578, "learning_rate": 5.076446187019551e-06, "loss": 0.0211, "step": 6104 }, { "epoch": 1.6371681415929205, "grad_norm": 0.2625254049210759, "learning_rate": 5.074886178634293e-06, "loss": 0.0241, "step": 6105 }, { "epoch": 1.6374363100026816, "grad_norm": 0.244530541594946, "learning_rate": 5.073326162957567e-06, "loss": 0.0275, "step": 6106 }, { "epoch": 1.637704478412443, "grad_norm": 0.23704026229732905, "learning_rate": 5.071766140141266e-06, "loss": 0.0242, "step": 6107 }, { "epoch": 1.6379726468222042, "grad_norm": 0.22107967236641637, "learning_rate": 5.070206110337289e-06, "loss": 0.0224, "step": 6108 }, { "epoch": 1.6382408152319656, "grad_norm": 0.25770741226244026, "learning_rate": 5.068646073697527e-06, "loss": 0.0218, "step": 6109 }, { "epoch": 1.638508983641727, "grad_norm": 0.24259167656555394, "learning_rate": 5.06708603037388e-06, "loss": 0.0185, "step": 6110 }, { "epoch": 1.6387771520514884, "grad_norm": 0.32348092987774324, "learning_rate": 5.065525980518244e-06, "loss": 0.0241, "step": 6111 }, { "epoch": 1.6390453204612496, "grad_norm": 0.3842292736350233, "learning_rate": 5.063965924282518e-06, "loss": 0.0211, "step": 6112 }, { "epoch": 1.639313488871011, "grad_norm": 0.2519085217158739, "learning_rate": 5.0624058618186e-06, "loss": 0.0227, "step": 6113 }, { "epoch": 1.6395816572807722, "grad_norm": 0.26504645280621525, "learning_rate": 5.060845793278389e-06, "loss": 0.0186, "step": 6114 }, { "epoch": 1.6398498256905336, "grad_norm": 0.2551925233916949, "learning_rate": 5.059285718813784e-06, "loss": 0.0275, "step": 6115 }, { "epoch": 1.640117994100295, "grad_norm": 0.3274824054181548, "learning_rate": 5.0577256385766845e-06, "loss": 0.019, "step": 6116 }, { "epoch": 1.6403861625100564, "grad_norm": 0.27776774778885893, "learning_rate": 5.056165552718995e-06, "loss": 0.0181, "step": 6117 }, { "epoch": 1.6406543309198176, "grad_norm": 0.24007962023790153, "learning_rate": 5.054605461392613e-06, "loss": 0.0193, "step": 6118 }, { "epoch": 1.640922499329579, "grad_norm": 0.2668968467847279, "learning_rate": 5.0530453647494424e-06, "loss": 0.0227, "step": 6119 }, { "epoch": 1.6411906677393402, "grad_norm": 0.35086458967795603, "learning_rate": 5.051485262941388e-06, "loss": 0.022, "step": 6120 }, { "epoch": 1.6414588361491016, "grad_norm": 0.27549829296499784, "learning_rate": 5.049925156120348e-06, "loss": 0.0239, "step": 6121 }, { "epoch": 1.641727004558863, "grad_norm": 0.2756624507691886, "learning_rate": 5.048365044438228e-06, "loss": 0.0238, "step": 6122 }, { "epoch": 1.6419951729686244, "grad_norm": 0.2528537300404635, "learning_rate": 5.046804928046933e-06, "loss": 0.0208, "step": 6123 }, { "epoch": 1.6422633413783856, "grad_norm": 0.2678735306392268, "learning_rate": 5.045244807098365e-06, "loss": 0.0228, "step": 6124 }, { "epoch": 1.642531509788147, "grad_norm": 0.25113900214577967, "learning_rate": 5.0436846817444305e-06, "loss": 0.0288, "step": 6125 }, { "epoch": 1.6427996781979082, "grad_norm": 0.23566121392342365, "learning_rate": 5.042124552137037e-06, "loss": 0.0186, "step": 6126 }, { "epoch": 1.6430678466076696, "grad_norm": 0.1996452594743113, "learning_rate": 5.040564418428087e-06, "loss": 0.0195, "step": 6127 }, { "epoch": 1.643336015017431, "grad_norm": 0.26773308708002486, "learning_rate": 5.039004280769485e-06, "loss": 0.0228, "step": 6128 }, { "epoch": 1.6436041834271924, "grad_norm": 0.2967392689698117, "learning_rate": 5.03744413931314e-06, "loss": 0.0338, "step": 6129 }, { "epoch": 1.6438723518369536, "grad_norm": 0.27226092342885233, "learning_rate": 5.035883994210959e-06, "loss": 0.0238, "step": 6130 }, { "epoch": 1.644140520246715, "grad_norm": 0.21322430627109468, "learning_rate": 5.0343238456148504e-06, "loss": 0.0215, "step": 6131 }, { "epoch": 1.6444086886564762, "grad_norm": 0.22294413621127496, "learning_rate": 5.03276369367672e-06, "loss": 0.0266, "step": 6132 }, { "epoch": 1.6446768570662376, "grad_norm": 0.30823491889454835, "learning_rate": 5.031203538548475e-06, "loss": 0.026, "step": 6133 }, { "epoch": 1.644945025475999, "grad_norm": 0.2518707478035237, "learning_rate": 5.029643380382023e-06, "loss": 0.0222, "step": 6134 }, { "epoch": 1.6452131938857604, "grad_norm": 0.2771416620034353, "learning_rate": 5.028083219329274e-06, "loss": 0.0277, "step": 6135 }, { "epoch": 1.6454813622955216, "grad_norm": 0.21601617624630448, "learning_rate": 5.026523055542137e-06, "loss": 0.0196, "step": 6136 }, { "epoch": 1.645749530705283, "grad_norm": 0.22492964582708713, "learning_rate": 5.024962889172521e-06, "loss": 0.0195, "step": 6137 }, { "epoch": 1.6460176991150441, "grad_norm": 0.21458198278153567, "learning_rate": 5.023402720372336e-06, "loss": 0.0166, "step": 6138 }, { "epoch": 1.6462858675248055, "grad_norm": 0.3791692395576329, "learning_rate": 5.0218425492934885e-06, "loss": 0.0257, "step": 6139 }, { "epoch": 1.646554035934567, "grad_norm": 0.1906638900915563, "learning_rate": 5.0202823760878895e-06, "loss": 0.021, "step": 6140 }, { "epoch": 1.6468222043443284, "grad_norm": 0.22461107377310655, "learning_rate": 5.01872220090745e-06, "loss": 0.0199, "step": 6141 }, { "epoch": 1.6470903727540895, "grad_norm": 0.2854405400539301, "learning_rate": 5.01716202390408e-06, "loss": 0.0223, "step": 6142 }, { "epoch": 1.647358541163851, "grad_norm": 0.24648452362140388, "learning_rate": 5.015601845229689e-06, "loss": 0.0214, "step": 6143 }, { "epoch": 1.6476267095736121, "grad_norm": 0.2719032134933166, "learning_rate": 5.014041665036188e-06, "loss": 0.0245, "step": 6144 }, { "epoch": 1.6478948779833735, "grad_norm": 0.25971498493430406, "learning_rate": 5.012481483475487e-06, "loss": 0.023, "step": 6145 }, { "epoch": 1.648163046393135, "grad_norm": 0.24443304645049677, "learning_rate": 5.010921300699496e-06, "loss": 0.0237, "step": 6146 }, { "epoch": 1.6484312148028963, "grad_norm": 1.4825258691671008, "learning_rate": 5.009361116860129e-06, "loss": 0.0298, "step": 6147 }, { "epoch": 1.6486993832126575, "grad_norm": 0.23378382024070332, "learning_rate": 5.007800932109294e-06, "loss": 0.0199, "step": 6148 }, { "epoch": 1.648967551622419, "grad_norm": 0.29686400432411814, "learning_rate": 5.006240746598903e-06, "loss": 0.0224, "step": 6149 }, { "epoch": 1.64923572003218, "grad_norm": 0.2778656312914394, "learning_rate": 5.004680560480867e-06, "loss": 0.0207, "step": 6150 }, { "epoch": 1.6495038884419415, "grad_norm": 0.24278804216190678, "learning_rate": 5.003120373907097e-06, "loss": 0.0238, "step": 6151 }, { "epoch": 1.649772056851703, "grad_norm": 0.25119924973138547, "learning_rate": 5.001560187029504e-06, "loss": 0.0211, "step": 6152 }, { "epoch": 1.6500402252614643, "grad_norm": 0.24211970386266002, "learning_rate": 5e-06, "loss": 0.0181, "step": 6153 }, { "epoch": 1.6503083936712255, "grad_norm": 0.5013893010478379, "learning_rate": 4.998439812970497e-06, "loss": 0.0249, "step": 6154 }, { "epoch": 1.650576562080987, "grad_norm": 0.2558768253063386, "learning_rate": 4.996879626092905e-06, "loss": 0.0199, "step": 6155 }, { "epoch": 1.650844730490748, "grad_norm": 0.2303959086260235, "learning_rate": 4.995319439519135e-06, "loss": 0.0197, "step": 6156 }, { "epoch": 1.6511128989005095, "grad_norm": 0.29805525445199327, "learning_rate": 4.993759253401098e-06, "loss": 0.0219, "step": 6157 }, { "epoch": 1.651381067310271, "grad_norm": 0.277233716207636, "learning_rate": 4.992199067890706e-06, "loss": 0.0294, "step": 6158 }, { "epoch": 1.6516492357200323, "grad_norm": 0.3356773507598051, "learning_rate": 4.990638883139872e-06, "loss": 0.0266, "step": 6159 }, { "epoch": 1.6519174041297935, "grad_norm": 0.24076851670671096, "learning_rate": 4.989078699300506e-06, "loss": 0.0202, "step": 6160 }, { "epoch": 1.652185572539555, "grad_norm": 0.7107944079502402, "learning_rate": 4.987518516524516e-06, "loss": 0.031, "step": 6161 }, { "epoch": 1.652453740949316, "grad_norm": 0.20879493738803492, "learning_rate": 4.985958334963814e-06, "loss": 0.0189, "step": 6162 }, { "epoch": 1.6527219093590775, "grad_norm": 1.0488750835186145, "learning_rate": 4.984398154770313e-06, "loss": 0.0287, "step": 6163 }, { "epoch": 1.6529900777688389, "grad_norm": 0.25427660525306484, "learning_rate": 4.982837976095922e-06, "loss": 0.0231, "step": 6164 }, { "epoch": 1.6532582461786003, "grad_norm": 0.2541122716445635, "learning_rate": 4.981277799092552e-06, "loss": 0.0219, "step": 6165 }, { "epoch": 1.6535264145883615, "grad_norm": 0.25981135939691236, "learning_rate": 4.979717623912112e-06, "loss": 0.0232, "step": 6166 }, { "epoch": 1.6537945829981229, "grad_norm": 0.34772785923116034, "learning_rate": 4.978157450706513e-06, "loss": 0.0338, "step": 6167 }, { "epoch": 1.654062751407884, "grad_norm": 0.23742788465566605, "learning_rate": 4.976597279627666e-06, "loss": 0.022, "step": 6168 }, { "epoch": 1.6543309198176455, "grad_norm": 0.3400946105740804, "learning_rate": 4.975037110827479e-06, "loss": 0.0351, "step": 6169 }, { "epoch": 1.6545990882274069, "grad_norm": 0.2267906729363588, "learning_rate": 4.973476944457863e-06, "loss": 0.0181, "step": 6170 }, { "epoch": 1.6548672566371683, "grad_norm": 0.39808383379433115, "learning_rate": 4.9719167806707265e-06, "loss": 0.0308, "step": 6171 }, { "epoch": 1.6551354250469295, "grad_norm": 0.3118057576019345, "learning_rate": 4.970356619617979e-06, "loss": 0.019, "step": 6172 }, { "epoch": 1.6554035934566906, "grad_norm": 0.28529835824501487, "learning_rate": 4.968796461451528e-06, "loss": 0.0186, "step": 6173 }, { "epoch": 1.655671761866452, "grad_norm": 0.25359484548992256, "learning_rate": 4.967236306323283e-06, "loss": 0.0214, "step": 6174 }, { "epoch": 1.6559399302762134, "grad_norm": 0.20736358203270808, "learning_rate": 4.965676154385151e-06, "loss": 0.0238, "step": 6175 }, { "epoch": 1.6562080986859748, "grad_norm": 0.19689528295429204, "learning_rate": 4.964116005789042e-06, "loss": 0.0192, "step": 6176 }, { "epoch": 1.6564762670957363, "grad_norm": 0.2600759461141454, "learning_rate": 4.962555860686861e-06, "loss": 0.0271, "step": 6177 }, { "epoch": 1.6567444355054974, "grad_norm": 0.3364115725642357, "learning_rate": 4.960995719230516e-06, "loss": 0.0388, "step": 6178 }, { "epoch": 1.6570126039152586, "grad_norm": 0.22916523317337809, "learning_rate": 4.9594355815719155e-06, "loss": 0.0216, "step": 6179 }, { "epoch": 1.65728077232502, "grad_norm": 0.22515940129017173, "learning_rate": 4.957875447862964e-06, "loss": 0.0224, "step": 6180 }, { "epoch": 1.6575489407347814, "grad_norm": 0.6924541070110336, "learning_rate": 4.956315318255569e-06, "loss": 0.0227, "step": 6181 }, { "epoch": 1.6578171091445428, "grad_norm": 0.25949980571586984, "learning_rate": 4.954755192901635e-06, "loss": 0.0216, "step": 6182 }, { "epoch": 1.6580852775543042, "grad_norm": 0.3409524754679679, "learning_rate": 4.953195071953069e-06, "loss": 0.0359, "step": 6183 }, { "epoch": 1.6583534459640654, "grad_norm": 0.32242139702372924, "learning_rate": 4.951634955561774e-06, "loss": 0.0206, "step": 6184 }, { "epoch": 1.6586216143738266, "grad_norm": 0.250284433218115, "learning_rate": 4.9500748438796545e-06, "loss": 0.0228, "step": 6185 }, { "epoch": 1.658889782783588, "grad_norm": 0.2947322972473216, "learning_rate": 4.948514737058615e-06, "loss": 0.0212, "step": 6186 }, { "epoch": 1.6591579511933494, "grad_norm": 0.6010174902729412, "learning_rate": 4.946954635250558e-06, "loss": 0.0245, "step": 6187 }, { "epoch": 1.6594261196031108, "grad_norm": 0.2703662506341328, "learning_rate": 4.945394538607388e-06, "loss": 0.0198, "step": 6188 }, { "epoch": 1.6596942880128722, "grad_norm": 0.21735997454713535, "learning_rate": 4.943834447281007e-06, "loss": 0.027, "step": 6189 }, { "epoch": 1.6599624564226334, "grad_norm": 0.24119598812394463, "learning_rate": 4.942274361423316e-06, "loss": 0.0264, "step": 6190 }, { "epoch": 1.6602306248323946, "grad_norm": 0.3858199920346831, "learning_rate": 4.940714281186219e-06, "loss": 0.0309, "step": 6191 }, { "epoch": 1.660498793242156, "grad_norm": 0.4388666048354723, "learning_rate": 4.939154206721614e-06, "loss": 0.0203, "step": 6192 }, { "epoch": 1.6607669616519174, "grad_norm": 0.31200839155819476, "learning_rate": 4.937594138181402e-06, "loss": 0.0273, "step": 6193 }, { "epoch": 1.6610351300616788, "grad_norm": 0.18783997027436997, "learning_rate": 4.936034075717482e-06, "loss": 0.0174, "step": 6194 }, { "epoch": 1.6613032984714402, "grad_norm": 0.2299260346666292, "learning_rate": 4.934474019481755e-06, "loss": 0.0158, "step": 6195 }, { "epoch": 1.6615714668812014, "grad_norm": 0.17236641844227232, "learning_rate": 4.932913969626122e-06, "loss": 0.0168, "step": 6196 }, { "epoch": 1.6618396352909626, "grad_norm": 0.30771126941958554, "learning_rate": 4.931353926302475e-06, "loss": 0.0331, "step": 6197 }, { "epoch": 1.662107803700724, "grad_norm": 1.2097126361748678, "learning_rate": 4.929793889662713e-06, "loss": 0.0228, "step": 6198 }, { "epoch": 1.6623759721104854, "grad_norm": 0.23614330205139014, "learning_rate": 4.9282338598587345e-06, "loss": 0.0229, "step": 6199 }, { "epoch": 1.6626441405202468, "grad_norm": 0.35045185449640587, "learning_rate": 4.926673837042434e-06, "loss": 0.0228, "step": 6200 }, { "epoch": 1.6629123089300082, "grad_norm": 0.27953354378675693, "learning_rate": 4.925113821365709e-06, "loss": 0.0242, "step": 6201 }, { "epoch": 1.6631804773397694, "grad_norm": 0.34948293206496317, "learning_rate": 4.92355381298045e-06, "loss": 0.0313, "step": 6202 }, { "epoch": 1.6634486457495306, "grad_norm": 0.30250952809969756, "learning_rate": 4.921993812038555e-06, "loss": 0.0262, "step": 6203 }, { "epoch": 1.663716814159292, "grad_norm": 0.29357577438494004, "learning_rate": 4.920433818691915e-06, "loss": 0.0216, "step": 6204 }, { "epoch": 1.6639849825690534, "grad_norm": 0.2280377785960035, "learning_rate": 4.918873833092421e-06, "loss": 0.0173, "step": 6205 }, { "epoch": 1.6642531509788148, "grad_norm": 0.3207970297352996, "learning_rate": 4.917313855391968e-06, "loss": 0.0259, "step": 6206 }, { "epoch": 1.6645213193885762, "grad_norm": 0.31177439601896406, "learning_rate": 4.915753885742446e-06, "loss": 0.0276, "step": 6207 }, { "epoch": 1.6647894877983374, "grad_norm": 0.3124497039447132, "learning_rate": 4.9141939242957465e-06, "loss": 0.0297, "step": 6208 }, { "epoch": 1.6650576562080985, "grad_norm": 0.2898099151039896, "learning_rate": 4.912633971203754e-06, "loss": 0.0209, "step": 6209 }, { "epoch": 1.66532582461786, "grad_norm": 0.34800484303163226, "learning_rate": 4.91107402661836e-06, "loss": 0.0216, "step": 6210 }, { "epoch": 1.6655939930276213, "grad_norm": 0.26100236297065904, "learning_rate": 4.909514090691452e-06, "loss": 0.0204, "step": 6211 }, { "epoch": 1.6658621614373827, "grad_norm": 0.23967511082758283, "learning_rate": 4.907954163574917e-06, "loss": 0.0149, "step": 6212 }, { "epoch": 1.6661303298471442, "grad_norm": 0.2766161703389561, "learning_rate": 4.90639424542064e-06, "loss": 0.0278, "step": 6213 }, { "epoch": 1.6663984982569053, "grad_norm": 0.40412787301831055, "learning_rate": 4.904834336380508e-06, "loss": 0.0285, "step": 6214 }, { "epoch": 1.6666666666666665, "grad_norm": 0.33513385336195617, "learning_rate": 4.903274436606402e-06, "loss": 0.0238, "step": 6215 }, { "epoch": 1.666934835076428, "grad_norm": 0.5253592480616325, "learning_rate": 4.9017145462502085e-06, "loss": 0.0271, "step": 6216 }, { "epoch": 1.6672030034861893, "grad_norm": 0.3573544908279431, "learning_rate": 4.9001546654638084e-06, "loss": 0.0357, "step": 6217 }, { "epoch": 1.6674711718959507, "grad_norm": 0.3010404230565133, "learning_rate": 4.898594794399084e-06, "loss": 0.0217, "step": 6218 }, { "epoch": 1.6677393403057121, "grad_norm": 0.47455039877559896, "learning_rate": 4.8970349332079155e-06, "loss": 0.0225, "step": 6219 }, { "epoch": 1.6680075087154733, "grad_norm": 0.3136234408321886, "learning_rate": 4.895475082042182e-06, "loss": 0.023, "step": 6220 }, { "epoch": 1.6682756771252345, "grad_norm": 0.32937213070687044, "learning_rate": 4.8939152410537624e-06, "loss": 0.027, "step": 6221 }, { "epoch": 1.668543845534996, "grad_norm": 0.22725253011225463, "learning_rate": 4.892355410394534e-06, "loss": 0.0178, "step": 6222 }, { "epoch": 1.6688120139447573, "grad_norm": 0.24566533284389136, "learning_rate": 4.890795590216373e-06, "loss": 0.0187, "step": 6223 }, { "epoch": 1.6690801823545187, "grad_norm": 0.2392447815990188, "learning_rate": 4.889235780671156e-06, "loss": 0.0204, "step": 6224 }, { "epoch": 1.6693483507642801, "grad_norm": 0.3777099239953422, "learning_rate": 4.887675981910758e-06, "loss": 0.0175, "step": 6225 }, { "epoch": 1.6696165191740413, "grad_norm": 0.6188552460311392, "learning_rate": 4.8861161940870485e-06, "loss": 0.028, "step": 6226 }, { "epoch": 1.6698846875838025, "grad_norm": 0.2882985003771321, "learning_rate": 4.884556417351904e-06, "loss": 0.0262, "step": 6227 }, { "epoch": 1.670152855993564, "grad_norm": 0.28665360160732434, "learning_rate": 4.882996651857195e-06, "loss": 0.0299, "step": 6228 }, { "epoch": 1.6704210244033253, "grad_norm": 0.23200212474144313, "learning_rate": 4.881436897754792e-06, "loss": 0.0239, "step": 6229 }, { "epoch": 1.6706891928130867, "grad_norm": 0.25939978062292507, "learning_rate": 4.879877155196563e-06, "loss": 0.0239, "step": 6230 }, { "epoch": 1.670957361222848, "grad_norm": 0.3825557905496175, "learning_rate": 4.8783174243343765e-06, "loss": 0.0389, "step": 6231 }, { "epoch": 1.6712255296326093, "grad_norm": 0.3196018341383587, "learning_rate": 4.8767577053200995e-06, "loss": 0.03, "step": 6232 }, { "epoch": 1.6714936980423705, "grad_norm": 0.2611207693747397, "learning_rate": 4.875197998305598e-06, "loss": 0.0235, "step": 6233 }, { "epoch": 1.6717618664521319, "grad_norm": 0.31331817672865775, "learning_rate": 4.873638303442736e-06, "loss": 0.0216, "step": 6234 }, { "epoch": 1.6720300348618933, "grad_norm": 0.34434648020018555, "learning_rate": 4.872078620883375e-06, "loss": 0.0238, "step": 6235 }, { "epoch": 1.6722982032716547, "grad_norm": 0.298754656036478, "learning_rate": 4.87051895077938e-06, "loss": 0.0336, "step": 6236 }, { "epoch": 1.672566371681416, "grad_norm": 0.41354456536356643, "learning_rate": 4.868959293282612e-06, "loss": 0.0345, "step": 6237 }, { "epoch": 1.6728345400911773, "grad_norm": 0.5907229727097764, "learning_rate": 4.867399648544928e-06, "loss": 0.0503, "step": 6238 }, { "epoch": 1.6731027085009385, "grad_norm": 0.33502611699016016, "learning_rate": 4.8658400167181895e-06, "loss": 0.0355, "step": 6239 }, { "epoch": 1.6733708769106999, "grad_norm": 0.22305208419511505, "learning_rate": 4.864280397954252e-06, "loss": 0.0183, "step": 6240 }, { "epoch": 1.6736390453204613, "grad_norm": 0.2571871582865664, "learning_rate": 4.862720792404972e-06, "loss": 0.024, "step": 6241 }, { "epoch": 1.6739072137302227, "grad_norm": 0.24332121626100825, "learning_rate": 4.861161200222202e-06, "loss": 0.0204, "step": 6242 }, { "epoch": 1.674175382139984, "grad_norm": 0.21537279266052337, "learning_rate": 4.8596016215578e-06, "loss": 0.0166, "step": 6243 }, { "epoch": 1.6744435505497453, "grad_norm": 0.21715795610663519, "learning_rate": 4.8580420565636166e-06, "loss": 0.0234, "step": 6244 }, { "epoch": 1.6747117189595064, "grad_norm": 0.31109998743971046, "learning_rate": 4.856482505391499e-06, "loss": 0.0299, "step": 6245 }, { "epoch": 1.6749798873692678, "grad_norm": 0.33321261418044074, "learning_rate": 4.854922968193298e-06, "loss": 0.0276, "step": 6246 }, { "epoch": 1.6752480557790292, "grad_norm": 0.3354433762144696, "learning_rate": 4.853363445120864e-06, "loss": 0.0346, "step": 6247 }, { "epoch": 1.6755162241887906, "grad_norm": 0.23717027549408892, "learning_rate": 4.851803936326041e-06, "loss": 0.0179, "step": 6248 }, { "epoch": 1.675784392598552, "grad_norm": 0.21892196581109968, "learning_rate": 4.850244441960676e-06, "loss": 0.0224, "step": 6249 }, { "epoch": 1.6760525610083132, "grad_norm": 0.28582952271418016, "learning_rate": 4.848684962176612e-06, "loss": 0.023, "step": 6250 }, { "epoch": 1.6763207294180744, "grad_norm": 0.30536437021563817, "learning_rate": 4.847125497125691e-06, "loss": 0.0219, "step": 6251 }, { "epoch": 1.6765888978278358, "grad_norm": 0.6540955027331902, "learning_rate": 4.845566046959756e-06, "loss": 0.0301, "step": 6252 }, { "epoch": 1.6768570662375972, "grad_norm": 0.19642702873777676, "learning_rate": 4.844006611830642e-06, "loss": 0.0187, "step": 6253 }, { "epoch": 1.6771252346473586, "grad_norm": 0.20567296040165778, "learning_rate": 4.842447191890192e-06, "loss": 0.021, "step": 6254 }, { "epoch": 1.67739340305712, "grad_norm": 0.2289053020925815, "learning_rate": 4.8408877872902404e-06, "loss": 0.0185, "step": 6255 }, { "epoch": 1.6776615714668812, "grad_norm": 0.8811787205644825, "learning_rate": 4.839328398182625e-06, "loss": 0.0303, "step": 6256 }, { "epoch": 1.6779297398766424, "grad_norm": 0.2869935013096887, "learning_rate": 4.837769024719173e-06, "loss": 0.0184, "step": 6257 }, { "epoch": 1.6781979082864038, "grad_norm": 0.2534741648671677, "learning_rate": 4.836209667051722e-06, "loss": 0.0289, "step": 6258 }, { "epoch": 1.6784660766961652, "grad_norm": 0.2522350860456307, "learning_rate": 4.834650325332101e-06, "loss": 0.0181, "step": 6259 }, { "epoch": 1.6787342451059266, "grad_norm": 0.36905823241810504, "learning_rate": 4.833090999712138e-06, "loss": 0.0209, "step": 6260 }, { "epoch": 1.6790024135156878, "grad_norm": 0.2143740879804254, "learning_rate": 4.831531690343662e-06, "loss": 0.0157, "step": 6261 }, { "epoch": 1.6792705819254492, "grad_norm": 0.3116936920628131, "learning_rate": 4.829972397378498e-06, "loss": 0.0242, "step": 6262 }, { "epoch": 1.6795387503352104, "grad_norm": 0.23565216066290012, "learning_rate": 4.828413120968469e-06, "loss": 0.0205, "step": 6263 }, { "epoch": 1.6798069187449718, "grad_norm": 0.2869115052076514, "learning_rate": 4.8268538612654005e-06, "loss": 0.0285, "step": 6264 }, { "epoch": 1.6800750871547332, "grad_norm": 0.34037826905564667, "learning_rate": 4.825294618421111e-06, "loss": 0.028, "step": 6265 }, { "epoch": 1.6803432555644946, "grad_norm": 0.38029819499718504, "learning_rate": 4.8237353925874205e-06, "loss": 0.0171, "step": 6266 }, { "epoch": 1.6806114239742558, "grad_norm": 0.26708579044907993, "learning_rate": 4.822176183916147e-06, "loss": 0.0244, "step": 6267 }, { "epoch": 1.6808795923840172, "grad_norm": 0.24329263095444462, "learning_rate": 4.8206169925591085e-06, "loss": 0.0252, "step": 6268 }, { "epoch": 1.6811477607937784, "grad_norm": 0.25872516581485283, "learning_rate": 4.819057818668115e-06, "loss": 0.0186, "step": 6269 }, { "epoch": 1.6814159292035398, "grad_norm": 0.23702091638054273, "learning_rate": 4.817498662394982e-06, "loss": 0.0167, "step": 6270 }, { "epoch": 1.6816840976133012, "grad_norm": 0.2387810229128028, "learning_rate": 4.815939523891519e-06, "loss": 0.0238, "step": 6271 }, { "epoch": 1.6819522660230626, "grad_norm": 0.2869172819536674, "learning_rate": 4.8143804033095365e-06, "loss": 0.0241, "step": 6272 }, { "epoch": 1.6822204344328238, "grad_norm": 0.2295049722018516, "learning_rate": 4.812821300800841e-06, "loss": 0.0151, "step": 6273 }, { "epoch": 1.6824886028425852, "grad_norm": 0.2249664855227666, "learning_rate": 4.8112622165172374e-06, "loss": 0.0218, "step": 6274 }, { "epoch": 1.6827567712523464, "grad_norm": 0.24728665860098115, "learning_rate": 4.809703150610532e-06, "loss": 0.0198, "step": 6275 }, { "epoch": 1.6830249396621078, "grad_norm": 0.3121812560598261, "learning_rate": 4.808144103232525e-06, "loss": 0.0199, "step": 6276 }, { "epoch": 1.6832931080718692, "grad_norm": 0.3468699691864694, "learning_rate": 4.806585074535018e-06, "loss": 0.0289, "step": 6277 }, { "epoch": 1.6835612764816306, "grad_norm": 0.2578331554775838, "learning_rate": 4.8050260646698085e-06, "loss": 0.0222, "step": 6278 }, { "epoch": 1.6838294448913917, "grad_norm": 0.35315272489694033, "learning_rate": 4.803467073788694e-06, "loss": 0.0239, "step": 6279 }, { "epoch": 1.6840976133011532, "grad_norm": 0.3836240289682844, "learning_rate": 4.801908102043467e-06, "loss": 0.0298, "step": 6280 }, { "epoch": 1.6843657817109143, "grad_norm": 0.295912208442318, "learning_rate": 4.800349149585926e-06, "loss": 0.0311, "step": 6281 }, { "epoch": 1.6846339501206757, "grad_norm": 0.2918125801933405, "learning_rate": 4.798790216567855e-06, "loss": 0.0218, "step": 6282 }, { "epoch": 1.6849021185304371, "grad_norm": 0.2276152979012104, "learning_rate": 4.797231303141046e-06, "loss": 0.0218, "step": 6283 }, { "epoch": 1.6851702869401985, "grad_norm": 0.3169084240024691, "learning_rate": 4.7956724094572875e-06, "loss": 0.0234, "step": 6284 }, { "epoch": 1.6854384553499597, "grad_norm": 0.2657307743035358, "learning_rate": 4.794113535668364e-06, "loss": 0.0277, "step": 6285 }, { "epoch": 1.6857066237597211, "grad_norm": 0.5833083127092006, "learning_rate": 4.792554681926059e-06, "loss": 0.0295, "step": 6286 }, { "epoch": 1.6859747921694823, "grad_norm": 0.2898574922016076, "learning_rate": 4.790995848382155e-06, "loss": 0.0353, "step": 6287 }, { "epoch": 1.6862429605792437, "grad_norm": 0.3071474411402569, "learning_rate": 4.789437035188429e-06, "loss": 0.0298, "step": 6288 }, { "epoch": 1.6865111289890051, "grad_norm": 0.22853000189798037, "learning_rate": 4.78787824249666e-06, "loss": 0.0213, "step": 6289 }, { "epoch": 1.6867792973987665, "grad_norm": 0.2888919265737888, "learning_rate": 4.786319470458623e-06, "loss": 0.0201, "step": 6290 }, { "epoch": 1.6870474658085277, "grad_norm": 0.5774403861127113, "learning_rate": 4.7847607192260916e-06, "loss": 0.0231, "step": 6291 }, { "epoch": 1.6873156342182891, "grad_norm": 0.24846526012438114, "learning_rate": 4.783201988950839e-06, "loss": 0.0206, "step": 6292 }, { "epoch": 1.6875838026280503, "grad_norm": 0.23008040308026992, "learning_rate": 4.781643279784634e-06, "loss": 0.0216, "step": 6293 }, { "epoch": 1.6878519710378117, "grad_norm": 0.23492649226686793, "learning_rate": 4.78008459187924e-06, "loss": 0.0221, "step": 6294 }, { "epoch": 1.688120139447573, "grad_norm": 0.2479433293250395, "learning_rate": 4.778525925386428e-06, "loss": 0.0214, "step": 6295 }, { "epoch": 1.6883883078573345, "grad_norm": 0.29170408003263393, "learning_rate": 4.776967280457958e-06, "loss": 0.0218, "step": 6296 }, { "epoch": 1.6886564762670957, "grad_norm": 0.35125627257341807, "learning_rate": 4.775408657245591e-06, "loss": 0.0245, "step": 6297 }, { "epoch": 1.688924644676857, "grad_norm": 0.31770460569189707, "learning_rate": 4.773850055901087e-06, "loss": 0.0275, "step": 6298 }, { "epoch": 1.6891928130866183, "grad_norm": 0.23659505127107036, "learning_rate": 4.772291476576203e-06, "loss": 0.0153, "step": 6299 }, { "epoch": 1.6894609814963797, "grad_norm": 0.21211417595374113, "learning_rate": 4.770732919422693e-06, "loss": 0.0192, "step": 6300 }, { "epoch": 1.689729149906141, "grad_norm": 0.27809539007935324, "learning_rate": 4.769174384592309e-06, "loss": 0.0195, "step": 6301 }, { "epoch": 1.6899973183159025, "grad_norm": 0.2790284931813096, "learning_rate": 4.767615872236804e-06, "loss": 0.0268, "step": 6302 }, { "epoch": 1.6902654867256637, "grad_norm": 0.34085502751266017, "learning_rate": 4.766057382507924e-06, "loss": 0.0253, "step": 6303 }, { "epoch": 1.690533655135425, "grad_norm": 0.5369143384605487, "learning_rate": 4.764498915557415e-06, "loss": 0.0318, "step": 6304 }, { "epoch": 1.6908018235451863, "grad_norm": 0.3432143055774044, "learning_rate": 4.762940471537023e-06, "loss": 0.0266, "step": 6305 }, { "epoch": 1.6910699919549477, "grad_norm": 0.27977175914961383, "learning_rate": 4.761382050598487e-06, "loss": 0.021, "step": 6306 }, { "epoch": 1.691338160364709, "grad_norm": 0.48627656837281513, "learning_rate": 4.759823652893547e-06, "loss": 0.0176, "step": 6307 }, { "epoch": 1.6916063287744705, "grad_norm": 0.28772215548885527, "learning_rate": 4.7582652785739405e-06, "loss": 0.0175, "step": 6308 }, { "epoch": 1.6918744971842317, "grad_norm": 0.688231324568534, "learning_rate": 4.7567069277914015e-06, "loss": 0.0235, "step": 6309 }, { "epoch": 1.692142665593993, "grad_norm": 0.19922517745761928, "learning_rate": 4.7551486006976635e-06, "loss": 0.0194, "step": 6310 }, { "epoch": 1.6924108340037542, "grad_norm": 0.24667781412463077, "learning_rate": 4.7535902974444555e-06, "loss": 0.0202, "step": 6311 }, { "epoch": 1.6926790024135157, "grad_norm": 0.25207793101525133, "learning_rate": 4.752032018183506e-06, "loss": 0.0199, "step": 6312 }, { "epoch": 1.692947170823277, "grad_norm": 0.2776264193532967, "learning_rate": 4.7504737630665415e-06, "loss": 0.0264, "step": 6313 }, { "epoch": 1.6932153392330385, "grad_norm": 0.32088960937767774, "learning_rate": 4.748915532245284e-06, "loss": 0.0257, "step": 6314 }, { "epoch": 1.6934835076427996, "grad_norm": 0.2432793248746285, "learning_rate": 4.747357325871454e-06, "loss": 0.0202, "step": 6315 }, { "epoch": 1.693751676052561, "grad_norm": 0.3049679032671894, "learning_rate": 4.745799144096771e-06, "loss": 0.0291, "step": 6316 }, { "epoch": 1.6940198444623222, "grad_norm": 0.3171388912523261, "learning_rate": 4.7442409870729515e-06, "loss": 0.0319, "step": 6317 }, { "epoch": 1.6942880128720836, "grad_norm": 0.2812546901113973, "learning_rate": 4.742682854951707e-06, "loss": 0.0282, "step": 6318 }, { "epoch": 1.694556181281845, "grad_norm": 0.19141470144703276, "learning_rate": 4.741124747884749e-06, "loss": 0.0163, "step": 6319 }, { "epoch": 1.6948243496916064, "grad_norm": 0.20716836513579462, "learning_rate": 4.7395666660237875e-06, "loss": 0.0163, "step": 6320 }, { "epoch": 1.6950925181013676, "grad_norm": 0.25896220227710987, "learning_rate": 4.738008609520528e-06, "loss": 0.0264, "step": 6321 }, { "epoch": 1.695360686511129, "grad_norm": 0.2965498330808392, "learning_rate": 4.7364505785266745e-06, "loss": 0.031, "step": 6322 }, { "epoch": 1.6956288549208902, "grad_norm": 0.27264077191870656, "learning_rate": 4.7348925731939284e-06, "loss": 0.0222, "step": 6323 }, { "epoch": 1.6958970233306516, "grad_norm": 0.8778555907344555, "learning_rate": 4.73333459367399e-06, "loss": 0.0435, "step": 6324 }, { "epoch": 1.696165191740413, "grad_norm": 0.3052699430561915, "learning_rate": 4.7317766401185526e-06, "loss": 0.0213, "step": 6325 }, { "epoch": 1.6964333601501744, "grad_norm": 0.4084887913366384, "learning_rate": 4.730218712679313e-06, "loss": 0.0302, "step": 6326 }, { "epoch": 1.6967015285599356, "grad_norm": 0.3794880910073861, "learning_rate": 4.72866081150796e-06, "loss": 0.0271, "step": 6327 }, { "epoch": 1.696969696969697, "grad_norm": 0.23805193813393927, "learning_rate": 4.727102936756182e-06, "loss": 0.0256, "step": 6328 }, { "epoch": 1.6972378653794582, "grad_norm": 0.27476507243719384, "learning_rate": 4.7255450885756715e-06, "loss": 0.0341, "step": 6329 }, { "epoch": 1.6975060337892196, "grad_norm": 0.2544707302982708, "learning_rate": 4.723987267118104e-06, "loss": 0.0265, "step": 6330 }, { "epoch": 1.697774202198981, "grad_norm": 0.28900254412485393, "learning_rate": 4.722429472535163e-06, "loss": 0.0226, "step": 6331 }, { "epoch": 1.6980423706087424, "grad_norm": 0.22219667525937103, "learning_rate": 4.720871704978527e-06, "loss": 0.0169, "step": 6332 }, { "epoch": 1.6983105390185036, "grad_norm": 0.6221915190987494, "learning_rate": 4.719313964599872e-06, "loss": 0.0193, "step": 6333 }, { "epoch": 1.698578707428265, "grad_norm": 0.22977728301444644, "learning_rate": 4.717756251550873e-06, "loss": 0.0197, "step": 6334 }, { "epoch": 1.6988468758380262, "grad_norm": 0.1999820524106122, "learning_rate": 4.7161985659831965e-06, "loss": 0.0197, "step": 6335 }, { "epoch": 1.6991150442477876, "grad_norm": 0.21939446591698422, "learning_rate": 4.714640908048513e-06, "loss": 0.0205, "step": 6336 }, { "epoch": 1.699383212657549, "grad_norm": 0.45029681448276454, "learning_rate": 4.713083277898487e-06, "loss": 0.0272, "step": 6337 }, { "epoch": 1.6996513810673104, "grad_norm": 0.24615787353965504, "learning_rate": 4.71152567568478e-06, "loss": 0.0184, "step": 6338 }, { "epoch": 1.6999195494770716, "grad_norm": 0.2642746308775313, "learning_rate": 4.709968101559051e-06, "loss": 0.0292, "step": 6339 }, { "epoch": 1.700187717886833, "grad_norm": 0.3734068638883106, "learning_rate": 4.708410555672959e-06, "loss": 0.0286, "step": 6340 }, { "epoch": 1.7004558862965942, "grad_norm": 0.23210075879332442, "learning_rate": 4.706853038178159e-06, "loss": 0.0175, "step": 6341 }, { "epoch": 1.7007240547063556, "grad_norm": 0.24502943848092656, "learning_rate": 4.705295549226298e-06, "loss": 0.0287, "step": 6342 }, { "epoch": 1.700992223116117, "grad_norm": 0.5181188086924858, "learning_rate": 4.703738088969027e-06, "loss": 0.0161, "step": 6343 }, { "epoch": 1.7012603915258784, "grad_norm": 0.2543993043994865, "learning_rate": 4.702180657557992e-06, "loss": 0.0248, "step": 6344 }, { "epoch": 1.7015285599356396, "grad_norm": 0.39582768305361116, "learning_rate": 4.700623255144836e-06, "loss": 0.0212, "step": 6345 }, { "epoch": 1.701796728345401, "grad_norm": 0.27506846570551235, "learning_rate": 4.699065881881199e-06, "loss": 0.0263, "step": 6346 }, { "epoch": 1.7020648967551621, "grad_norm": 0.20851567791630915, "learning_rate": 4.697508537918718e-06, "loss": 0.0207, "step": 6347 }, { "epoch": 1.7023330651649236, "grad_norm": 0.2284271920304712, "learning_rate": 4.695951223409027e-06, "loss": 0.0202, "step": 6348 }, { "epoch": 1.702601233574685, "grad_norm": 0.2884981162959601, "learning_rate": 4.694393938503758e-06, "loss": 0.0346, "step": 6349 }, { "epoch": 1.7028694019844464, "grad_norm": 0.2290169105839306, "learning_rate": 4.69283668335454e-06, "loss": 0.0231, "step": 6350 }, { "epoch": 1.7031375703942075, "grad_norm": 0.26134288237591163, "learning_rate": 4.6912794581129985e-06, "loss": 0.023, "step": 6351 }, { "epoch": 1.7034057388039687, "grad_norm": 0.3025765356927052, "learning_rate": 4.689722262930757e-06, "loss": 0.0337, "step": 6352 }, { "epoch": 1.7036739072137301, "grad_norm": 0.6042970536707212, "learning_rate": 4.688165097959436e-06, "loss": 0.0292, "step": 6353 }, { "epoch": 1.7039420756234915, "grad_norm": 0.351462532966335, "learning_rate": 4.686607963350651e-06, "loss": 0.0196, "step": 6354 }, { "epoch": 1.704210244033253, "grad_norm": 0.21502543418359332, "learning_rate": 4.685050859256016e-06, "loss": 0.0172, "step": 6355 }, { "epoch": 1.7044784124430143, "grad_norm": 0.6167146726303089, "learning_rate": 4.683493785827142e-06, "loss": 0.027, "step": 6356 }, { "epoch": 1.7047465808527755, "grad_norm": 0.30325661538503634, "learning_rate": 4.681936743215639e-06, "loss": 0.0223, "step": 6357 }, { "epoch": 1.7050147492625367, "grad_norm": 0.480340114941563, "learning_rate": 4.6803797315731105e-06, "loss": 0.0183, "step": 6358 }, { "epoch": 1.7052829176722981, "grad_norm": 0.3265955608107701, "learning_rate": 4.6788227510511574e-06, "loss": 0.0292, "step": 6359 }, { "epoch": 1.7055510860820595, "grad_norm": 0.3203826259528443, "learning_rate": 4.677265801801381e-06, "loss": 0.0209, "step": 6360 }, { "epoch": 1.705819254491821, "grad_norm": 0.29200179442589275, "learning_rate": 4.675708883975378e-06, "loss": 0.0244, "step": 6361 }, { "epoch": 1.7060874229015823, "grad_norm": 0.3419050571252299, "learning_rate": 4.6741519977247395e-06, "loss": 0.0239, "step": 6362 }, { "epoch": 1.7063555913113435, "grad_norm": 0.3888286137005385, "learning_rate": 4.672595143201056e-06, "loss": 0.0214, "step": 6363 }, { "epoch": 1.7066237597211047, "grad_norm": 0.35690950348030837, "learning_rate": 4.671038320555914e-06, "loss": 0.0209, "step": 6364 }, { "epoch": 1.706891928130866, "grad_norm": 0.25258727286638094, "learning_rate": 4.669481529940898e-06, "loss": 0.0199, "step": 6365 }, { "epoch": 1.7071600965406275, "grad_norm": 0.20964608752150893, "learning_rate": 4.667924771507588e-06, "loss": 0.0201, "step": 6366 }, { "epoch": 1.707428264950389, "grad_norm": 0.28034749108572227, "learning_rate": 4.66636804540756e-06, "loss": 0.0263, "step": 6367 }, { "epoch": 1.7076964333601503, "grad_norm": 0.28981534077986654, "learning_rate": 4.66481135179239e-06, "loss": 0.0269, "step": 6368 }, { "epoch": 1.7079646017699115, "grad_norm": 0.2918889591186924, "learning_rate": 4.6632546908136485e-06, "loss": 0.0276, "step": 6369 }, { "epoch": 1.7082327701796727, "grad_norm": 0.3392958239757785, "learning_rate": 4.6616980626229035e-06, "loss": 0.0231, "step": 6370 }, { "epoch": 1.708500938589434, "grad_norm": 0.21715828398995005, "learning_rate": 4.660141467371721e-06, "loss": 0.0203, "step": 6371 }, { "epoch": 1.7087691069991955, "grad_norm": 0.27390462159956414, "learning_rate": 4.65858490521166e-06, "loss": 0.0321, "step": 6372 }, { "epoch": 1.709037275408957, "grad_norm": 0.2144885215041506, "learning_rate": 4.657028376294282e-06, "loss": 0.0243, "step": 6373 }, { "epoch": 1.7093054438187183, "grad_norm": 0.2569661132113024, "learning_rate": 4.65547188077114e-06, "loss": 0.0201, "step": 6374 }, { "epoch": 1.7095736122284795, "grad_norm": 0.6061871034696534, "learning_rate": 4.653915418793786e-06, "loss": 0.0282, "step": 6375 }, { "epoch": 1.7098417806382407, "grad_norm": 0.2208418442614102, "learning_rate": 4.652358990513768e-06, "loss": 0.02, "step": 6376 }, { "epoch": 1.710109949048002, "grad_norm": 0.21293640868509356, "learning_rate": 4.650802596082635e-06, "loss": 0.0152, "step": 6377 }, { "epoch": 1.7103781174577635, "grad_norm": 0.36145123522996625, "learning_rate": 4.649246235651924e-06, "loss": 0.0237, "step": 6378 }, { "epoch": 1.7106462858675249, "grad_norm": 0.2004306247805123, "learning_rate": 4.647689909373177e-06, "loss": 0.0172, "step": 6379 }, { "epoch": 1.7109144542772863, "grad_norm": 0.25057747641026273, "learning_rate": 4.646133617397926e-06, "loss": 0.0182, "step": 6380 }, { "epoch": 1.7111826226870475, "grad_norm": 0.2404041274416568, "learning_rate": 4.644577359877707e-06, "loss": 0.0195, "step": 6381 }, { "epoch": 1.7114507910968086, "grad_norm": 0.24961038717265038, "learning_rate": 4.643021136964045e-06, "loss": 0.0163, "step": 6382 }, { "epoch": 1.71171895950657, "grad_norm": 0.6088510004065834, "learning_rate": 4.641464948808469e-06, "loss": 0.0276, "step": 6383 }, { "epoch": 1.7119871279163315, "grad_norm": 0.4054811489487194, "learning_rate": 4.6399087955624986e-06, "loss": 0.0342, "step": 6384 }, { "epoch": 1.7122552963260929, "grad_norm": 0.3511439766043749, "learning_rate": 4.638352677377652e-06, "loss": 0.0268, "step": 6385 }, { "epoch": 1.7125234647358543, "grad_norm": 0.47862065454059155, "learning_rate": 4.636796594405446e-06, "loss": 0.0234, "step": 6386 }, { "epoch": 1.7127916331456154, "grad_norm": 0.2562139056172085, "learning_rate": 4.63524054679739e-06, "loss": 0.0181, "step": 6387 }, { "epoch": 1.7130598015553766, "grad_norm": 0.23886231636990105, "learning_rate": 4.633684534704994e-06, "loss": 0.0232, "step": 6388 }, { "epoch": 1.713327969965138, "grad_norm": 0.27953331527246256, "learning_rate": 4.632128558279765e-06, "loss": 0.0265, "step": 6389 }, { "epoch": 1.7135961383748994, "grad_norm": 0.30283132133069074, "learning_rate": 4.630572617673198e-06, "loss": 0.0253, "step": 6390 }, { "epoch": 1.7138643067846608, "grad_norm": 0.3458181467931392, "learning_rate": 4.629016713036796e-06, "loss": 0.0233, "step": 6391 }, { "epoch": 1.7141324751944222, "grad_norm": 0.24099334684317164, "learning_rate": 4.627460844522052e-06, "loss": 0.0209, "step": 6392 }, { "epoch": 1.7144006436041834, "grad_norm": 0.297409239774494, "learning_rate": 4.625905012280455e-06, "loss": 0.0225, "step": 6393 }, { "epoch": 1.7146688120139446, "grad_norm": 0.36283418601111095, "learning_rate": 4.624349216463495e-06, "loss": 0.0219, "step": 6394 }, { "epoch": 1.714936980423706, "grad_norm": 0.2132015368426918, "learning_rate": 4.622793457222655e-06, "loss": 0.0192, "step": 6395 }, { "epoch": 1.7152051488334674, "grad_norm": 0.2185737026319438, "learning_rate": 4.621237734709414e-06, "loss": 0.0199, "step": 6396 }, { "epoch": 1.7154733172432288, "grad_norm": 0.21405495661617283, "learning_rate": 4.619682049075247e-06, "loss": 0.0201, "step": 6397 }, { "epoch": 1.7157414856529902, "grad_norm": 0.22595533867447962, "learning_rate": 4.618126400471631e-06, "loss": 0.0156, "step": 6398 }, { "epoch": 1.7160096540627514, "grad_norm": 0.24579561336061392, "learning_rate": 4.616570789050034e-06, "loss": 0.0244, "step": 6399 }, { "epoch": 1.7162778224725126, "grad_norm": 0.2006160706291946, "learning_rate": 4.615015214961921e-06, "loss": 0.0245, "step": 6400 }, { "epoch": 1.716545990882274, "grad_norm": 0.292840726954951, "learning_rate": 4.6134596783587545e-06, "loss": 0.0242, "step": 6401 }, { "epoch": 1.7168141592920354, "grad_norm": 0.2227734680946169, "learning_rate": 4.611904179391994e-06, "loss": 0.0224, "step": 6402 }, { "epoch": 1.7170823277017968, "grad_norm": 0.2782502528780826, "learning_rate": 4.610348718213091e-06, "loss": 0.0201, "step": 6403 }, { "epoch": 1.7173504961115582, "grad_norm": 0.26251425414886725, "learning_rate": 4.6087932949735e-06, "loss": 0.0248, "step": 6404 }, { "epoch": 1.7176186645213194, "grad_norm": 0.3159578346490633, "learning_rate": 4.607237909824667e-06, "loss": 0.0264, "step": 6405 }, { "epoch": 1.7178868329310806, "grad_norm": 0.22926726184711335, "learning_rate": 4.605682562918036e-06, "loss": 0.0234, "step": 6406 }, { "epoch": 1.718155001340842, "grad_norm": 0.4190480875532073, "learning_rate": 4.604127254405046e-06, "loss": 0.0209, "step": 6407 }, { "epoch": 1.7184231697506034, "grad_norm": 0.5877977492782358, "learning_rate": 4.602571984437135e-06, "loss": 0.0314, "step": 6408 }, { "epoch": 1.7186913381603648, "grad_norm": 0.31774377563547274, "learning_rate": 4.6010167531657345e-06, "loss": 0.0197, "step": 6409 }, { "epoch": 1.7189595065701262, "grad_norm": 0.29960669808198426, "learning_rate": 4.599461560742274e-06, "loss": 0.0302, "step": 6410 }, { "epoch": 1.7192276749798874, "grad_norm": 0.20843969555383424, "learning_rate": 4.5979064073181775e-06, "loss": 0.0217, "step": 6411 }, { "epoch": 1.7194958433896486, "grad_norm": 0.2516746683809809, "learning_rate": 4.5963512930448665e-06, "loss": 0.0236, "step": 6412 }, { "epoch": 1.71976401179941, "grad_norm": 0.18467585946167467, "learning_rate": 4.59479621807376e-06, "loss": 0.0147, "step": 6413 }, { "epoch": 1.7200321802091714, "grad_norm": 0.2976623812388055, "learning_rate": 4.5932411825562686e-06, "loss": 0.0265, "step": 6414 }, { "epoch": 1.7203003486189328, "grad_norm": 0.29087501312431563, "learning_rate": 4.591686186643804e-06, "loss": 0.0264, "step": 6415 }, { "epoch": 1.7205685170286942, "grad_norm": 0.1936673758862618, "learning_rate": 4.59013123048777e-06, "loss": 0.018, "step": 6416 }, { "epoch": 1.7208366854384554, "grad_norm": 0.23319293221166942, "learning_rate": 4.588576314239571e-06, "loss": 0.0172, "step": 6417 }, { "epoch": 1.7211048538482165, "grad_norm": 0.27132871628213684, "learning_rate": 4.587021438050602e-06, "loss": 0.0224, "step": 6418 }, { "epoch": 1.721373022257978, "grad_norm": 0.4189080153188926, "learning_rate": 4.585466602072261e-06, "loss": 0.0387, "step": 6419 }, { "epoch": 1.7216411906677394, "grad_norm": 0.2714348760447238, "learning_rate": 4.583911806455936e-06, "loss": 0.0215, "step": 6420 }, { "epoch": 1.7219093590775008, "grad_norm": 0.25303363834760845, "learning_rate": 4.582357051353014e-06, "loss": 0.026, "step": 6421 }, { "epoch": 1.7221775274872622, "grad_norm": 0.282215305120375, "learning_rate": 4.580802336914877e-06, "loss": 0.0172, "step": 6422 }, { "epoch": 1.7224456958970233, "grad_norm": 0.2904703912765796, "learning_rate": 4.579247663292903e-06, "loss": 0.0131, "step": 6423 }, { "epoch": 1.7227138643067845, "grad_norm": 0.8511263241464649, "learning_rate": 4.577693030638467e-06, "loss": 0.0318, "step": 6424 }, { "epoch": 1.722982032716546, "grad_norm": 0.1899432557181762, "learning_rate": 4.576138439102939e-06, "loss": 0.0209, "step": 6425 }, { "epoch": 1.7232502011263073, "grad_norm": 0.44431252992603865, "learning_rate": 4.5745838888376895e-06, "loss": 0.0236, "step": 6426 }, { "epoch": 1.7235183695360687, "grad_norm": 0.2693359998028363, "learning_rate": 4.5730293799940745e-06, "loss": 0.0219, "step": 6427 }, { "epoch": 1.7237865379458301, "grad_norm": 0.2011968930253362, "learning_rate": 4.571474912723454e-06, "loss": 0.015, "step": 6428 }, { "epoch": 1.7240547063555913, "grad_norm": 0.2170750760879635, "learning_rate": 4.5699204871771844e-06, "loss": 0.0207, "step": 6429 }, { "epoch": 1.7243228747653525, "grad_norm": 0.36953092681657856, "learning_rate": 4.568366103506616e-06, "loss": 0.0183, "step": 6430 }, { "epoch": 1.724591043175114, "grad_norm": 0.3232965925714277, "learning_rate": 4.566811761863093e-06, "loss": 0.0336, "step": 6431 }, { "epoch": 1.7248592115848753, "grad_norm": 0.212676947473235, "learning_rate": 4.565257462397959e-06, "loss": 0.0163, "step": 6432 }, { "epoch": 1.7251273799946367, "grad_norm": 0.3558702979747367, "learning_rate": 4.563703205262553e-06, "loss": 0.0261, "step": 6433 }, { "epoch": 1.7253955484043981, "grad_norm": 0.4435637556975014, "learning_rate": 4.562148990608205e-06, "loss": 0.0211, "step": 6434 }, { "epoch": 1.7256637168141593, "grad_norm": 0.3438820831116945, "learning_rate": 4.560594818586248e-06, "loss": 0.0227, "step": 6435 }, { "epoch": 1.7259318852239205, "grad_norm": 0.3603280861203436, "learning_rate": 4.559040689348007e-06, "loss": 0.0217, "step": 6436 }, { "epoch": 1.726200053633682, "grad_norm": 0.4490564042589232, "learning_rate": 4.557486603044805e-06, "loss": 0.0253, "step": 6437 }, { "epoch": 1.7264682220434433, "grad_norm": 0.24312458985997554, "learning_rate": 4.555932559827958e-06, "loss": 0.0225, "step": 6438 }, { "epoch": 1.7267363904532047, "grad_norm": 0.3166950316706425, "learning_rate": 4.554378559848777e-06, "loss": 0.0228, "step": 6439 }, { "epoch": 1.7270045588629659, "grad_norm": 0.3054955419968058, "learning_rate": 4.552824603258572e-06, "loss": 0.0319, "step": 6440 }, { "epoch": 1.7272727272727273, "grad_norm": 0.24554769717001518, "learning_rate": 4.55127069020865e-06, "loss": 0.0218, "step": 6441 }, { "epoch": 1.7275408956824885, "grad_norm": 0.2870202611499414, "learning_rate": 4.549716820850308e-06, "loss": 0.0269, "step": 6442 }, { "epoch": 1.7278090640922499, "grad_norm": 0.22628678079288225, "learning_rate": 4.5481629953348445e-06, "loss": 0.0179, "step": 6443 }, { "epoch": 1.7280772325020113, "grad_norm": 0.25315916559438045, "learning_rate": 4.54660921381355e-06, "loss": 0.0215, "step": 6444 }, { "epoch": 1.7283454009117727, "grad_norm": 0.334683159670059, "learning_rate": 4.545055476437712e-06, "loss": 0.0287, "step": 6445 }, { "epoch": 1.7286135693215339, "grad_norm": 0.31759386592107286, "learning_rate": 4.543501783358615e-06, "loss": 0.0216, "step": 6446 }, { "epoch": 1.7288817377312953, "grad_norm": 0.29661195932179135, "learning_rate": 4.541948134727538e-06, "loss": 0.0202, "step": 6447 }, { "epoch": 1.7291499061410565, "grad_norm": 0.2825993214530079, "learning_rate": 4.540394530695755e-06, "loss": 0.0263, "step": 6448 }, { "epoch": 1.7294180745508179, "grad_norm": 0.29270493692302146, "learning_rate": 4.538840971414535e-06, "loss": 0.0241, "step": 6449 }, { "epoch": 1.7296862429605793, "grad_norm": 0.26100436728179294, "learning_rate": 4.537287457035147e-06, "loss": 0.0216, "step": 6450 }, { "epoch": 1.7299544113703407, "grad_norm": 0.35005063879838255, "learning_rate": 4.53573398770885e-06, "loss": 0.0447, "step": 6451 }, { "epoch": 1.7302225797801019, "grad_norm": 0.2489727720738474, "learning_rate": 4.534180563586902e-06, "loss": 0.0252, "step": 6452 }, { "epoch": 1.7304907481898633, "grad_norm": 0.19111093427962636, "learning_rate": 4.532627184820556e-06, "loss": 0.0159, "step": 6453 }, { "epoch": 1.7307589165996244, "grad_norm": 0.2530464899545907, "learning_rate": 4.53107385156106e-06, "loss": 0.0209, "step": 6454 }, { "epoch": 1.7310270850093858, "grad_norm": 0.23644032975061133, "learning_rate": 4.5295205639596576e-06, "loss": 0.0199, "step": 6455 }, { "epoch": 1.7312952534191473, "grad_norm": 1.058208200639574, "learning_rate": 4.527967322167589e-06, "loss": 0.0294, "step": 6456 }, { "epoch": 1.7315634218289087, "grad_norm": 0.2596593704390698, "learning_rate": 4.526414126336089e-06, "loss": 0.0188, "step": 6457 }, { "epoch": 1.7318315902386698, "grad_norm": 0.25836639971373576, "learning_rate": 4.524860976616389e-06, "loss": 0.0253, "step": 6458 }, { "epoch": 1.7320997586484312, "grad_norm": 0.27490766985711773, "learning_rate": 4.523307873159714e-06, "loss": 0.025, "step": 6459 }, { "epoch": 1.7323679270581924, "grad_norm": 0.24715561741475772, "learning_rate": 4.5217548161172845e-06, "loss": 0.0223, "step": 6460 }, { "epoch": 1.7326360954679538, "grad_norm": 0.28262608764647656, "learning_rate": 4.5202018056403205e-06, "loss": 0.0232, "step": 6461 }, { "epoch": 1.7329042638777152, "grad_norm": 0.25568283633105937, "learning_rate": 4.518648841880033e-06, "loss": 0.0257, "step": 6462 }, { "epoch": 1.7331724322874766, "grad_norm": 0.9030726392649076, "learning_rate": 4.517095924987629e-06, "loss": 0.0247, "step": 6463 }, { "epoch": 1.7334406006972378, "grad_norm": 0.2569839859577526, "learning_rate": 4.515543055114313e-06, "loss": 0.0272, "step": 6464 }, { "epoch": 1.7337087691069992, "grad_norm": 0.24626860998587086, "learning_rate": 4.513990232411282e-06, "loss": 0.0196, "step": 6465 }, { "epoch": 1.7339769375167604, "grad_norm": 0.293138551042103, "learning_rate": 4.512437457029733e-06, "loss": 0.0244, "step": 6466 }, { "epoch": 1.7342451059265218, "grad_norm": 0.3069112491509141, "learning_rate": 4.510884729120854e-06, "loss": 0.0235, "step": 6467 }, { "epoch": 1.7345132743362832, "grad_norm": 0.2099436163462238, "learning_rate": 4.509332048835831e-06, "loss": 0.0166, "step": 6468 }, { "epoch": 1.7347814427460446, "grad_norm": 0.30581200350003007, "learning_rate": 4.5077794163258436e-06, "loss": 0.0222, "step": 6469 }, { "epoch": 1.7350496111558058, "grad_norm": 0.21905588396488787, "learning_rate": 4.506226831742067e-06, "loss": 0.0204, "step": 6470 }, { "epoch": 1.7353177795655672, "grad_norm": 0.22406103961009521, "learning_rate": 4.504674295235673e-06, "loss": 0.0216, "step": 6471 }, { "epoch": 1.7355859479753284, "grad_norm": 0.24113953060861756, "learning_rate": 4.503121806957827e-06, "loss": 0.0228, "step": 6472 }, { "epoch": 1.7358541163850898, "grad_norm": 0.2375838482667167, "learning_rate": 4.501569367059693e-06, "loss": 0.0206, "step": 6473 }, { "epoch": 1.7361222847948512, "grad_norm": 0.25021050566314373, "learning_rate": 4.500016975692428e-06, "loss": 0.0286, "step": 6474 }, { "epoch": 1.7363904532046126, "grad_norm": 0.2841938934805119, "learning_rate": 4.4984646330071804e-06, "loss": 0.0276, "step": 6475 }, { "epoch": 1.7366586216143738, "grad_norm": 0.27697248619501563, "learning_rate": 4.4969123391550986e-06, "loss": 0.0227, "step": 6476 }, { "epoch": 1.7369267900241352, "grad_norm": 0.8943681941849546, "learning_rate": 4.4953600942873285e-06, "loss": 0.0193, "step": 6477 }, { "epoch": 1.7371949584338964, "grad_norm": 0.3538927603089793, "learning_rate": 4.493807898555006e-06, "loss": 0.0211, "step": 6478 }, { "epoch": 1.7374631268436578, "grad_norm": 0.31957073225375754, "learning_rate": 4.4922557521092645e-06, "loss": 0.0236, "step": 6479 }, { "epoch": 1.7377312952534192, "grad_norm": 0.291148075935853, "learning_rate": 4.490703655101233e-06, "loss": 0.0239, "step": 6480 }, { "epoch": 1.7379994636631806, "grad_norm": 0.20784384933869116, "learning_rate": 4.489151607682033e-06, "loss": 0.0139, "step": 6481 }, { "epoch": 1.7382676320729418, "grad_norm": 0.358811818471631, "learning_rate": 4.487599610002785e-06, "loss": 0.0347, "step": 6482 }, { "epoch": 1.7385358004827032, "grad_norm": 0.2540495473011931, "learning_rate": 4.486047662214602e-06, "loss": 0.0193, "step": 6483 }, { "epoch": 1.7388039688924644, "grad_norm": 0.3714001253272932, "learning_rate": 4.484495764468595e-06, "loss": 0.0227, "step": 6484 }, { "epoch": 1.7390721373022258, "grad_norm": 0.3346176000637883, "learning_rate": 4.482943916915866e-06, "loss": 0.0267, "step": 6485 }, { "epoch": 1.7393403057119872, "grad_norm": 0.19609119227542604, "learning_rate": 4.481392119707516e-06, "loss": 0.0164, "step": 6486 }, { "epoch": 1.7396084741217486, "grad_norm": 0.25685979813932897, "learning_rate": 4.479840372994639e-06, "loss": 0.0272, "step": 6487 }, { "epoch": 1.7398766425315098, "grad_norm": 0.21554669626296594, "learning_rate": 4.478288676928323e-06, "loss": 0.0178, "step": 6488 }, { "epoch": 1.7401448109412712, "grad_norm": 0.29392994696640956, "learning_rate": 4.476737031659652e-06, "loss": 0.0243, "step": 6489 }, { "epoch": 1.7404129793510323, "grad_norm": 0.2652690122044581, "learning_rate": 4.475185437339709e-06, "loss": 0.019, "step": 6490 }, { "epoch": 1.7406811477607937, "grad_norm": 0.21638443378919595, "learning_rate": 4.473633894119566e-06, "loss": 0.0195, "step": 6491 }, { "epoch": 1.7409493161705552, "grad_norm": 0.2414075086187707, "learning_rate": 4.472082402150293e-06, "loss": 0.0165, "step": 6492 }, { "epoch": 1.7412174845803166, "grad_norm": 0.29728451527539523, "learning_rate": 4.470530961582953e-06, "loss": 0.0209, "step": 6493 }, { "epoch": 1.7414856529900777, "grad_norm": 0.29730796283734645, "learning_rate": 4.46897957256861e-06, "loss": 0.0275, "step": 6494 }, { "epoch": 1.7417538213998391, "grad_norm": 0.23686197439919526, "learning_rate": 4.467428235258315e-06, "loss": 0.0241, "step": 6495 }, { "epoch": 1.7420219898096003, "grad_norm": 0.2457365495072019, "learning_rate": 4.465876949803119e-06, "loss": 0.0183, "step": 6496 }, { "epoch": 1.7422901582193617, "grad_norm": 0.3807462059794738, "learning_rate": 4.464325716354068e-06, "loss": 0.0253, "step": 6497 }, { "epoch": 1.7425583266291231, "grad_norm": 0.30192227098828495, "learning_rate": 4.4627745350622e-06, "loss": 0.0196, "step": 6498 }, { "epoch": 1.7428264950388845, "grad_norm": 0.2820641896269207, "learning_rate": 4.461223406078547e-06, "loss": 0.0258, "step": 6499 }, { "epoch": 1.7430946634486457, "grad_norm": 0.22543569607692057, "learning_rate": 4.459672329554141e-06, "loss": 0.0169, "step": 6500 }, { "epoch": 1.7433628318584071, "grad_norm": 0.21931643050261612, "learning_rate": 4.4581213056400065e-06, "loss": 0.0197, "step": 6501 }, { "epoch": 1.7436310002681683, "grad_norm": 0.2256126373119399, "learning_rate": 4.456570334487162e-06, "loss": 0.0183, "step": 6502 }, { "epoch": 1.7438991686779297, "grad_norm": 0.21283835312360114, "learning_rate": 4.4550194162466186e-06, "loss": 0.0164, "step": 6503 }, { "epoch": 1.7441673370876911, "grad_norm": 0.29344583959686266, "learning_rate": 4.45346855106939e-06, "loss": 0.0273, "step": 6504 }, { "epoch": 1.7444355054974525, "grad_norm": 0.36467223590148035, "learning_rate": 4.451917739106477e-06, "loss": 0.0215, "step": 6505 }, { "epoch": 1.7447036739072137, "grad_norm": 0.2808819641689337, "learning_rate": 4.4503669805088795e-06, "loss": 0.0231, "step": 6506 }, { "epoch": 1.744971842316975, "grad_norm": 0.25427657464387626, "learning_rate": 4.448816275427589e-06, "loss": 0.018, "step": 6507 }, { "epoch": 1.7452400107267363, "grad_norm": 0.4088444785638285, "learning_rate": 4.4472656240135944e-06, "loss": 0.0279, "step": 6508 }, { "epoch": 1.7455081791364977, "grad_norm": 0.30612194341089866, "learning_rate": 4.445715026417879e-06, "loss": 0.0241, "step": 6509 }, { "epoch": 1.745776347546259, "grad_norm": 0.30212596918895857, "learning_rate": 4.444164482791421e-06, "loss": 0.0315, "step": 6510 }, { "epoch": 1.7460445159560205, "grad_norm": 0.25555227875874187, "learning_rate": 4.44261399328519e-06, "loss": 0.0215, "step": 6511 }, { "epoch": 1.7463126843657817, "grad_norm": 0.4050417566582068, "learning_rate": 4.441063558050155e-06, "loss": 0.0378, "step": 6512 }, { "epoch": 1.746580852775543, "grad_norm": 0.25937021127220256, "learning_rate": 4.439513177237278e-06, "loss": 0.0208, "step": 6513 }, { "epoch": 1.7468490211853043, "grad_norm": 0.2984237796435717, "learning_rate": 4.437962850997513e-06, "loss": 0.0233, "step": 6514 }, { "epoch": 1.7471171895950657, "grad_norm": 0.27073554464521254, "learning_rate": 4.436412579481814e-06, "loss": 0.0244, "step": 6515 }, { "epoch": 1.747385358004827, "grad_norm": 0.21261532557665283, "learning_rate": 4.434862362841127e-06, "loss": 0.0182, "step": 6516 }, { "epoch": 1.7476535264145885, "grad_norm": 0.25845891614663813, "learning_rate": 4.43331220122639e-06, "loss": 0.02, "step": 6517 }, { "epoch": 1.7479216948243497, "grad_norm": 0.408710817489067, "learning_rate": 4.431762094788539e-06, "loss": 0.0194, "step": 6518 }, { "epoch": 1.748189863234111, "grad_norm": 0.360335422346512, "learning_rate": 4.430212043678506e-06, "loss": 0.0234, "step": 6519 }, { "epoch": 1.7484580316438723, "grad_norm": 0.24712552031865853, "learning_rate": 4.428662048047211e-06, "loss": 0.0169, "step": 6520 }, { "epoch": 1.7487262000536337, "grad_norm": 0.25596790137806275, "learning_rate": 4.427112108045577e-06, "loss": 0.0242, "step": 6521 }, { "epoch": 1.748994368463395, "grad_norm": 0.2696038836553065, "learning_rate": 4.425562223824516e-06, "loss": 0.0279, "step": 6522 }, { "epoch": 1.7492625368731565, "grad_norm": 0.3816278598487673, "learning_rate": 4.424012395534938e-06, "loss": 0.0304, "step": 6523 }, { "epoch": 1.7495307052829177, "grad_norm": 0.25264010398938386, "learning_rate": 4.422462623327741e-06, "loss": 0.0188, "step": 6524 }, { "epoch": 1.749798873692679, "grad_norm": 0.38698846448960067, "learning_rate": 4.420912907353827e-06, "loss": 0.0216, "step": 6525 }, { "epoch": 1.7500670421024402, "grad_norm": 0.2397148260481921, "learning_rate": 4.4193632477640845e-06, "loss": 0.02, "step": 6526 }, { "epoch": 1.7503352105122016, "grad_norm": 0.2354028156650949, "learning_rate": 4.417813644709401e-06, "loss": 0.0153, "step": 6527 }, { "epoch": 1.750603378921963, "grad_norm": 0.31743057499584115, "learning_rate": 4.4162640983406576e-06, "loss": 0.0289, "step": 6528 }, { "epoch": 1.7508715473317245, "grad_norm": 0.22589095778137178, "learning_rate": 4.41471460880873e-06, "loss": 0.0182, "step": 6529 }, { "epoch": 1.7511397157414856, "grad_norm": 0.24397849366700827, "learning_rate": 4.413165176264486e-06, "loss": 0.0146, "step": 6530 }, { "epoch": 1.7514078841512468, "grad_norm": 0.3887415229182343, "learning_rate": 4.41161580085879e-06, "loss": 0.0227, "step": 6531 }, { "epoch": 1.7516760525610082, "grad_norm": 0.22014004856332256, "learning_rate": 4.410066482742502e-06, "loss": 0.0173, "step": 6532 }, { "epoch": 1.7519442209707696, "grad_norm": 0.21221940604558215, "learning_rate": 4.408517222066475e-06, "loss": 0.0226, "step": 6533 }, { "epoch": 1.752212389380531, "grad_norm": 0.23280535262751043, "learning_rate": 4.4069680189815555e-06, "loss": 0.0242, "step": 6534 }, { "epoch": 1.7524805577902924, "grad_norm": 0.28071235175100595, "learning_rate": 4.405418873638587e-06, "loss": 0.0248, "step": 6535 }, { "epoch": 1.7527487262000536, "grad_norm": 0.3019807266059885, "learning_rate": 4.403869786188402e-06, "loss": 0.0227, "step": 6536 }, { "epoch": 1.7530168946098148, "grad_norm": 0.449080921453452, "learning_rate": 4.402320756781834e-06, "loss": 0.0237, "step": 6537 }, { "epoch": 1.7532850630195762, "grad_norm": 0.40570327690455504, "learning_rate": 4.400771785569707e-06, "loss": 0.0225, "step": 6538 }, { "epoch": 1.7535532314293376, "grad_norm": 0.22825364680278976, "learning_rate": 4.39922287270284e-06, "loss": 0.0193, "step": 6539 }, { "epoch": 1.753821399839099, "grad_norm": 0.40499817569131374, "learning_rate": 4.397674018332048e-06, "loss": 0.0323, "step": 6540 }, { "epoch": 1.7540895682488604, "grad_norm": 0.22502654489241705, "learning_rate": 4.396125222608135e-06, "loss": 0.0181, "step": 6541 }, { "epoch": 1.7543577366586216, "grad_norm": 0.32987327685191936, "learning_rate": 4.394576485681908e-06, "loss": 0.027, "step": 6542 }, { "epoch": 1.7546259050683828, "grad_norm": 0.2901487910018218, "learning_rate": 4.39302780770416e-06, "loss": 0.0256, "step": 6543 }, { "epoch": 1.7548940734781442, "grad_norm": 0.3162872961713637, "learning_rate": 4.391479188825685e-06, "loss": 0.015, "step": 6544 }, { "epoch": 1.7551622418879056, "grad_norm": 0.23497007261101452, "learning_rate": 4.389930629197264e-06, "loss": 0.0216, "step": 6545 }, { "epoch": 1.755430410297667, "grad_norm": 0.3313479747763374, "learning_rate": 4.388382128969678e-06, "loss": 0.0238, "step": 6546 }, { "epoch": 1.7556985787074284, "grad_norm": 0.2122541864861825, "learning_rate": 4.386833688293702e-06, "loss": 0.0162, "step": 6547 }, { "epoch": 1.7559667471171896, "grad_norm": 0.2730854040466608, "learning_rate": 4.385285307320101e-06, "loss": 0.0203, "step": 6548 }, { "epoch": 1.7562349155269508, "grad_norm": 0.3223586988120385, "learning_rate": 4.383736986199637e-06, "loss": 0.0307, "step": 6549 }, { "epoch": 1.7565030839367122, "grad_norm": 0.22277545145014743, "learning_rate": 4.382188725083067e-06, "loss": 0.0242, "step": 6550 }, { "epoch": 1.7567712523464736, "grad_norm": 0.24326848272924081, "learning_rate": 4.380640524121139e-06, "loss": 0.0235, "step": 6551 }, { "epoch": 1.757039420756235, "grad_norm": 0.2276587514413136, "learning_rate": 4.3790923834646e-06, "loss": 0.0263, "step": 6552 }, { "epoch": 1.7573075891659964, "grad_norm": 0.4418116757750436, "learning_rate": 4.377544303264187e-06, "loss": 0.0212, "step": 6553 }, { "epoch": 1.7575757575757576, "grad_norm": 0.2728853888795205, "learning_rate": 4.3759962836706335e-06, "loss": 0.0215, "step": 6554 }, { "epoch": 1.7578439259855188, "grad_norm": 0.35453506518115274, "learning_rate": 4.374448324834664e-06, "loss": 0.0187, "step": 6555 }, { "epoch": 1.7581120943952802, "grad_norm": 0.2388154121816742, "learning_rate": 4.372900426907001e-06, "loss": 0.0199, "step": 6556 }, { "epoch": 1.7583802628050416, "grad_norm": 0.2304444919425137, "learning_rate": 4.371352590038358e-06, "loss": 0.0194, "step": 6557 }, { "epoch": 1.758648431214803, "grad_norm": 0.3046340493735745, "learning_rate": 4.369804814379444e-06, "loss": 0.0236, "step": 6558 }, { "epoch": 1.7589165996245644, "grad_norm": 0.4868312479756422, "learning_rate": 4.3682571000809645e-06, "loss": 0.0277, "step": 6559 }, { "epoch": 1.7591847680343256, "grad_norm": 0.24376103363638615, "learning_rate": 4.366709447293612e-06, "loss": 0.0251, "step": 6560 }, { "epoch": 1.7594529364440867, "grad_norm": 0.2939820559270936, "learning_rate": 4.36516185616808e-06, "loss": 0.0326, "step": 6561 }, { "epoch": 1.7597211048538481, "grad_norm": 0.42947584252657406, "learning_rate": 4.363614326855051e-06, "loss": 0.0215, "step": 6562 }, { "epoch": 1.7599892732636095, "grad_norm": 0.22593132605632218, "learning_rate": 4.362066859505206e-06, "loss": 0.0157, "step": 6563 }, { "epoch": 1.760257441673371, "grad_norm": 0.28119489178972057, "learning_rate": 4.360519454269217e-06, "loss": 0.0199, "step": 6564 }, { "epoch": 1.7605256100831324, "grad_norm": 0.4091881239249858, "learning_rate": 4.358972111297753e-06, "loss": 0.0186, "step": 6565 }, { "epoch": 1.7607937784928935, "grad_norm": 0.20849350309215653, "learning_rate": 4.35742483074147e-06, "loss": 0.0204, "step": 6566 }, { "epoch": 1.7610619469026547, "grad_norm": 0.21730521025640218, "learning_rate": 4.355877612751027e-06, "loss": 0.0206, "step": 6567 }, { "epoch": 1.7613301153124161, "grad_norm": 0.26825021426511597, "learning_rate": 4.354330457477068e-06, "loss": 0.0245, "step": 6568 }, { "epoch": 1.7615982837221775, "grad_norm": 0.2727637142938813, "learning_rate": 4.352783365070241e-06, "loss": 0.0274, "step": 6569 }, { "epoch": 1.761866452131939, "grad_norm": 0.2769239225359918, "learning_rate": 4.351236335681179e-06, "loss": 0.0164, "step": 6570 }, { "epoch": 1.7621346205417003, "grad_norm": 0.22370329671912081, "learning_rate": 4.349689369460514e-06, "loss": 0.0198, "step": 6571 }, { "epoch": 1.7624027889514615, "grad_norm": 0.2637611985272846, "learning_rate": 4.348142466558866e-06, "loss": 0.0196, "step": 6572 }, { "epoch": 1.7626709573612227, "grad_norm": 0.3104679890912792, "learning_rate": 4.346595627126856e-06, "loss": 0.0164, "step": 6573 }, { "epoch": 1.762939125770984, "grad_norm": 0.2658216491005964, "learning_rate": 4.345048851315095e-06, "loss": 0.0205, "step": 6574 }, { "epoch": 1.7632072941807455, "grad_norm": 0.20642752954177654, "learning_rate": 4.343502139274189e-06, "loss": 0.0183, "step": 6575 }, { "epoch": 1.763475462590507, "grad_norm": 0.22056467714912686, "learning_rate": 4.3419554911547365e-06, "loss": 0.0225, "step": 6576 }, { "epoch": 1.7637436310002683, "grad_norm": 0.2085381637499523, "learning_rate": 4.340408907107331e-06, "loss": 0.0206, "step": 6577 }, { "epoch": 1.7640117994100295, "grad_norm": 0.30350482707148163, "learning_rate": 4.3388623872825585e-06, "loss": 0.0233, "step": 6578 }, { "epoch": 1.7642799678197907, "grad_norm": 0.2849709172782828, "learning_rate": 4.337315931830999e-06, "loss": 0.0227, "step": 6579 }, { "epoch": 1.764548136229552, "grad_norm": 0.2542651365145202, "learning_rate": 4.3357695409032295e-06, "loss": 0.0216, "step": 6580 }, { "epoch": 1.7648163046393135, "grad_norm": 0.2826327438094655, "learning_rate": 4.334223214649815e-06, "loss": 0.0256, "step": 6581 }, { "epoch": 1.765084473049075, "grad_norm": 0.2689120885388809, "learning_rate": 4.33267695322132e-06, "loss": 0.0289, "step": 6582 }, { "epoch": 1.7653526414588363, "grad_norm": 0.23496053556283258, "learning_rate": 4.331130756768299e-06, "loss": 0.0169, "step": 6583 }, { "epoch": 1.7656208098685975, "grad_norm": 0.301231505837828, "learning_rate": 4.329584625441299e-06, "loss": 0.0346, "step": 6584 }, { "epoch": 1.7658889782783587, "grad_norm": 0.24854590129953075, "learning_rate": 4.328038559390864e-06, "loss": 0.0216, "step": 6585 }, { "epoch": 1.76615714668812, "grad_norm": 0.22083538140857897, "learning_rate": 4.326492558767531e-06, "loss": 0.0202, "step": 6586 }, { "epoch": 1.7664253150978815, "grad_norm": 0.2897861630179568, "learning_rate": 4.324946623721829e-06, "loss": 0.0229, "step": 6587 }, { "epoch": 1.7666934835076429, "grad_norm": 0.2857102913831269, "learning_rate": 4.323400754404282e-06, "loss": 0.025, "step": 6588 }, { "epoch": 1.7669616519174043, "grad_norm": 0.25771446990462094, "learning_rate": 4.321854950965407e-06, "loss": 0.0197, "step": 6589 }, { "epoch": 1.7672298203271655, "grad_norm": 0.26539367910330064, "learning_rate": 4.320309213555715e-06, "loss": 0.025, "step": 6590 }, { "epoch": 1.7674979887369267, "grad_norm": 0.2264616827536108, "learning_rate": 4.318763542325711e-06, "loss": 0.0394, "step": 6591 }, { "epoch": 1.767766157146688, "grad_norm": 0.31945189718486294, "learning_rate": 4.317217937425892e-06, "loss": 0.0204, "step": 6592 }, { "epoch": 1.7680343255564495, "grad_norm": 0.2142969217486577, "learning_rate": 4.315672399006749e-06, "loss": 0.0166, "step": 6593 }, { "epoch": 1.7683024939662109, "grad_norm": 0.23465187644585472, "learning_rate": 4.314126927218768e-06, "loss": 0.0227, "step": 6594 }, { "epoch": 1.7685706623759723, "grad_norm": 0.28450310807171997, "learning_rate": 4.312581522212429e-06, "loss": 0.029, "step": 6595 }, { "epoch": 1.7688388307857335, "grad_norm": 0.25405642049352506, "learning_rate": 4.3110361841382e-06, "loss": 0.0201, "step": 6596 }, { "epoch": 1.7691069991954946, "grad_norm": 0.23905117818461016, "learning_rate": 4.30949091314655e-06, "loss": 0.0223, "step": 6597 }, { "epoch": 1.769375167605256, "grad_norm": 0.2567288657088796, "learning_rate": 4.307945709387935e-06, "loss": 0.0224, "step": 6598 }, { "epoch": 1.7696433360150174, "grad_norm": 0.2422462014906323, "learning_rate": 4.306400573012808e-06, "loss": 0.019, "step": 6599 }, { "epoch": 1.7699115044247788, "grad_norm": 0.24886571120796347, "learning_rate": 4.304855504171618e-06, "loss": 0.0267, "step": 6600 }, { "epoch": 1.7701796728345403, "grad_norm": 0.3151559384829323, "learning_rate": 4.303310503014801e-06, "loss": 0.0326, "step": 6601 }, { "epoch": 1.7704478412443014, "grad_norm": 0.31400754102830536, "learning_rate": 4.301765569692791e-06, "loss": 0.037, "step": 6602 }, { "epoch": 1.7707160096540626, "grad_norm": 0.3000391082581648, "learning_rate": 4.3002207043560135e-06, "loss": 0.0171, "step": 6603 }, { "epoch": 1.770984178063824, "grad_norm": 0.3805458777459763, "learning_rate": 4.298675907154888e-06, "loss": 0.0224, "step": 6604 }, { "epoch": 1.7712523464735854, "grad_norm": 0.2891592412137003, "learning_rate": 4.297131178239828e-06, "loss": 0.0215, "step": 6605 }, { "epoch": 1.7715205148833468, "grad_norm": 0.33864301742074604, "learning_rate": 4.295586517761238e-06, "loss": 0.0271, "step": 6606 }, { "epoch": 1.7717886832931082, "grad_norm": 0.2190997532747725, "learning_rate": 4.294041925869521e-06, "loss": 0.0167, "step": 6607 }, { "epoch": 1.7720568517028694, "grad_norm": 0.28211744425414365, "learning_rate": 4.292497402715066e-06, "loss": 0.0183, "step": 6608 }, { "epoch": 1.7723250201126306, "grad_norm": 0.2837826267334727, "learning_rate": 4.29095294844826e-06, "loss": 0.0221, "step": 6609 }, { "epoch": 1.772593188522392, "grad_norm": 0.29304468394839417, "learning_rate": 4.289408563219482e-06, "loss": 0.0299, "step": 6610 }, { "epoch": 1.7728613569321534, "grad_norm": 0.2835078102077901, "learning_rate": 4.287864247179107e-06, "loss": 0.026, "step": 6611 }, { "epoch": 1.7731295253419148, "grad_norm": 0.3458443714615249, "learning_rate": 4.286320000477499e-06, "loss": 0.0351, "step": 6612 }, { "epoch": 1.7733976937516762, "grad_norm": 0.7772405112241315, "learning_rate": 4.284775823265018e-06, "loss": 0.039, "step": 6613 }, { "epoch": 1.7736658621614374, "grad_norm": 0.285768535386129, "learning_rate": 4.2832317156920165e-06, "loss": 0.0213, "step": 6614 }, { "epoch": 1.7739340305711986, "grad_norm": 0.2853992890874127, "learning_rate": 4.28168767790884e-06, "loss": 0.0248, "step": 6615 }, { "epoch": 1.77420219898096, "grad_norm": 0.2700787323201239, "learning_rate": 4.280143710065826e-06, "loss": 0.0202, "step": 6616 }, { "epoch": 1.7744703673907214, "grad_norm": 0.3455657918272816, "learning_rate": 4.278599812313309e-06, "loss": 0.0261, "step": 6617 }, { "epoch": 1.7747385358004828, "grad_norm": 0.2698580209550594, "learning_rate": 4.277055984801612e-06, "loss": 0.0282, "step": 6618 }, { "epoch": 1.775006704210244, "grad_norm": 0.30556582883085825, "learning_rate": 4.275512227681058e-06, "loss": 0.026, "step": 6619 }, { "epoch": 1.7752748726200054, "grad_norm": 0.2438454443865886, "learning_rate": 4.273968541101951e-06, "loss": 0.0239, "step": 6620 }, { "epoch": 1.7755430410297666, "grad_norm": 0.2694373769326546, "learning_rate": 4.2724249252146e-06, "loss": 0.0239, "step": 6621 }, { "epoch": 1.775811209439528, "grad_norm": 0.20522869808468355, "learning_rate": 4.270881380169303e-06, "loss": 0.0182, "step": 6622 }, { "epoch": 1.7760793778492894, "grad_norm": 0.2054025402508133, "learning_rate": 4.26933790611635e-06, "loss": 0.0182, "step": 6623 }, { "epoch": 1.7763475462590508, "grad_norm": 0.23689554537916407, "learning_rate": 4.267794503206026e-06, "loss": 0.0185, "step": 6624 }, { "epoch": 1.776615714668812, "grad_norm": 0.27403239441667726, "learning_rate": 4.266251171588606e-06, "loss": 0.0284, "step": 6625 }, { "epoch": 1.7768838830785734, "grad_norm": 0.40455277045562754, "learning_rate": 4.264707911414362e-06, "loss": 0.0243, "step": 6626 }, { "epoch": 1.7771520514883345, "grad_norm": 0.319652170605408, "learning_rate": 4.263164722833556e-06, "loss": 0.0242, "step": 6627 }, { "epoch": 1.777420219898096, "grad_norm": 0.43047627434277724, "learning_rate": 4.261621605996444e-06, "loss": 0.0271, "step": 6628 }, { "epoch": 1.7776883883078574, "grad_norm": 0.2223414149876421, "learning_rate": 4.260078561053278e-06, "loss": 0.0227, "step": 6629 }, { "epoch": 1.7779565567176188, "grad_norm": 0.24430550881236943, "learning_rate": 4.258535588154296e-06, "loss": 0.0216, "step": 6630 }, { "epoch": 1.77822472512738, "grad_norm": 0.22457773850604412, "learning_rate": 4.256992687449738e-06, "loss": 0.0173, "step": 6631 }, { "epoch": 1.7784928935371414, "grad_norm": 0.2982168046736609, "learning_rate": 4.255449859089828e-06, "loss": 0.0238, "step": 6632 }, { "epoch": 1.7787610619469025, "grad_norm": 0.3127617701661512, "learning_rate": 4.253907103224788e-06, "loss": 0.0249, "step": 6633 }, { "epoch": 1.779029230356664, "grad_norm": 0.3291656367646268, "learning_rate": 4.252364420004833e-06, "loss": 0.0251, "step": 6634 }, { "epoch": 1.7792973987664253, "grad_norm": 0.3225332072314181, "learning_rate": 4.250821809580169e-06, "loss": 0.0352, "step": 6635 }, { "epoch": 1.7795655671761867, "grad_norm": 0.24731290273540263, "learning_rate": 4.249279272100997e-06, "loss": 0.0227, "step": 6636 }, { "epoch": 1.779833735585948, "grad_norm": 0.183353280575414, "learning_rate": 4.2477368077175074e-06, "loss": 0.0166, "step": 6637 }, { "epoch": 1.7801019039957093, "grad_norm": 0.2156822787505547, "learning_rate": 4.246194416579888e-06, "loss": 0.0213, "step": 6638 }, { "epoch": 1.7803700724054705, "grad_norm": 0.19218463468083016, "learning_rate": 4.2446520988383185e-06, "loss": 0.0258, "step": 6639 }, { "epoch": 1.780638240815232, "grad_norm": 0.23637738317656248, "learning_rate": 4.243109854642968e-06, "loss": 0.0196, "step": 6640 }, { "epoch": 1.7809064092249933, "grad_norm": 0.27330496861106013, "learning_rate": 4.241567684144002e-06, "loss": 0.024, "step": 6641 }, { "epoch": 1.7811745776347547, "grad_norm": 0.23886036500790972, "learning_rate": 4.240025587491576e-06, "loss": 0.0175, "step": 6642 }, { "epoch": 1.781442746044516, "grad_norm": 0.27525386060529294, "learning_rate": 4.238483564835841e-06, "loss": 0.0257, "step": 6643 }, { "epoch": 1.7817109144542773, "grad_norm": 0.2882858717851429, "learning_rate": 4.236941616326941e-06, "loss": 0.0272, "step": 6644 }, { "epoch": 1.7819790828640385, "grad_norm": 0.2347400865077592, "learning_rate": 4.2353997421150095e-06, "loss": 0.0249, "step": 6645 }, { "epoch": 1.7822472512738, "grad_norm": 0.21453441943170148, "learning_rate": 4.233857942350175e-06, "loss": 0.0147, "step": 6646 }, { "epoch": 1.7825154196835613, "grad_norm": 0.2371919650673341, "learning_rate": 4.232316217182556e-06, "loss": 0.0163, "step": 6647 }, { "epoch": 1.7827835880933227, "grad_norm": 0.2607805258384071, "learning_rate": 4.230774566762271e-06, "loss": 0.0228, "step": 6648 }, { "epoch": 1.783051756503084, "grad_norm": 0.2734213446793632, "learning_rate": 4.229232991239424e-06, "loss": 0.0215, "step": 6649 }, { "epoch": 1.7833199249128453, "grad_norm": 0.355790570257276, "learning_rate": 4.227691490764114e-06, "loss": 0.0188, "step": 6650 }, { "epoch": 1.7835880933226065, "grad_norm": 0.1677779038204809, "learning_rate": 4.226150065486434e-06, "loss": 0.015, "step": 6651 }, { "epoch": 1.7838562617323679, "grad_norm": 0.2695579857489374, "learning_rate": 4.224608715556467e-06, "loss": 0.0313, "step": 6652 }, { "epoch": 1.7841244301421293, "grad_norm": 0.252829111384665, "learning_rate": 4.2230674411242904e-06, "loss": 0.0194, "step": 6653 }, { "epoch": 1.7843925985518907, "grad_norm": 0.4937866565790345, "learning_rate": 4.221526242339974e-06, "loss": 0.0206, "step": 6654 }, { "epoch": 1.7846607669616519, "grad_norm": 0.21933337991268634, "learning_rate": 4.219985119353581e-06, "loss": 0.0218, "step": 6655 }, { "epoch": 1.7849289353714133, "grad_norm": 0.3400303813794031, "learning_rate": 4.218444072315169e-06, "loss": 0.0254, "step": 6656 }, { "epoch": 1.7851971037811745, "grad_norm": 0.28531163506413326, "learning_rate": 4.216903101374779e-06, "loss": 0.0205, "step": 6657 }, { "epoch": 1.7854652721909359, "grad_norm": 0.21210337302631257, "learning_rate": 4.215362206682455e-06, "loss": 0.0215, "step": 6658 }, { "epoch": 1.7857334406006973, "grad_norm": 0.2690984888102947, "learning_rate": 4.213821388388231e-06, "loss": 0.0254, "step": 6659 }, { "epoch": 1.7860016090104587, "grad_norm": 0.2554775458014713, "learning_rate": 4.21228064664213e-06, "loss": 0.0267, "step": 6660 }, { "epoch": 1.7862697774202199, "grad_norm": 0.44127575643964523, "learning_rate": 4.210739981594173e-06, "loss": 0.0242, "step": 6661 }, { "epoch": 1.7865379458299813, "grad_norm": 0.4805196866925903, "learning_rate": 4.209199393394367e-06, "loss": 0.034, "step": 6662 }, { "epoch": 1.7868061142397424, "grad_norm": 0.18466641479117393, "learning_rate": 4.207658882192717e-06, "loss": 0.0145, "step": 6663 }, { "epoch": 1.7870742826495039, "grad_norm": 0.24062821369102835, "learning_rate": 4.206118448139217e-06, "loss": 0.0165, "step": 6664 }, { "epoch": 1.7873424510592653, "grad_norm": 0.4057592044932434, "learning_rate": 4.2045780913838565e-06, "loss": 0.0323, "step": 6665 }, { "epoch": 1.7876106194690267, "grad_norm": 0.24449401586288344, "learning_rate": 4.2030378120766164e-06, "loss": 0.0331, "step": 6666 }, { "epoch": 1.7878787878787878, "grad_norm": 0.29611562893005733, "learning_rate": 4.201497610367468e-06, "loss": 0.0303, "step": 6667 }, { "epoch": 1.7881469562885492, "grad_norm": 0.3123916389928919, "learning_rate": 4.199957486406378e-06, "loss": 0.0263, "step": 6668 }, { "epoch": 1.7884151246983104, "grad_norm": 0.23309353967094745, "learning_rate": 4.198417440343303e-06, "loss": 0.0182, "step": 6669 }, { "epoch": 1.7886832931080718, "grad_norm": 0.49380835765925324, "learning_rate": 4.196877472328195e-06, "loss": 0.0213, "step": 6670 }, { "epoch": 1.7889514615178332, "grad_norm": 0.3222763372938126, "learning_rate": 4.1953375825109934e-06, "loss": 0.0283, "step": 6671 }, { "epoch": 1.7892196299275946, "grad_norm": 0.22420052426870496, "learning_rate": 4.193797771041636e-06, "loss": 0.0192, "step": 6672 }, { "epoch": 1.7894877983373558, "grad_norm": 0.25966430477193336, "learning_rate": 4.192258038070048e-06, "loss": 0.018, "step": 6673 }, { "epoch": 1.7897559667471172, "grad_norm": 0.24389508586166533, "learning_rate": 4.190718383746151e-06, "loss": 0.0245, "step": 6674 }, { "epoch": 1.7900241351568784, "grad_norm": 0.2627313193531553, "learning_rate": 4.189178808219856e-06, "loss": 0.0259, "step": 6675 }, { "epoch": 1.7902923035666398, "grad_norm": 0.2022195252926253, "learning_rate": 4.187639311641067e-06, "loss": 0.0125, "step": 6676 }, { "epoch": 1.7905604719764012, "grad_norm": 0.24133090625837397, "learning_rate": 4.186099894159683e-06, "loss": 0.0201, "step": 6677 }, { "epoch": 1.7908286403861626, "grad_norm": 0.28689997135300455, "learning_rate": 4.184560555925592e-06, "loss": 0.0329, "step": 6678 }, { "epoch": 1.7910968087959238, "grad_norm": 0.3400058649605173, "learning_rate": 4.1830212970886726e-06, "loss": 0.0243, "step": 6679 }, { "epoch": 1.7913649772056852, "grad_norm": 0.34549925543862436, "learning_rate": 4.181482117798804e-06, "loss": 0.0255, "step": 6680 }, { "epoch": 1.7916331456154464, "grad_norm": 0.5544224351823214, "learning_rate": 4.1799430182058455e-06, "loss": 0.0241, "step": 6681 }, { "epoch": 1.7919013140252078, "grad_norm": 0.20483333931320563, "learning_rate": 4.178403998459659e-06, "loss": 0.0189, "step": 6682 }, { "epoch": 1.7921694824349692, "grad_norm": 0.24882463791480655, "learning_rate": 4.176865058710093e-06, "loss": 0.0301, "step": 6683 }, { "epoch": 1.7924376508447306, "grad_norm": 0.24334359663094082, "learning_rate": 4.1753261991069925e-06, "loss": 0.0168, "step": 6684 }, { "epoch": 1.7927058192544918, "grad_norm": 0.29999370509245216, "learning_rate": 4.173787419800188e-06, "loss": 0.019, "step": 6685 }, { "epoch": 1.7929739876642532, "grad_norm": 0.27181358265214045, "learning_rate": 4.1722487209395105e-06, "loss": 0.0281, "step": 6686 }, { "epoch": 1.7932421560740144, "grad_norm": 0.25289061369119714, "learning_rate": 4.170710102674777e-06, "loss": 0.0233, "step": 6687 }, { "epoch": 1.7935103244837758, "grad_norm": 0.33237193067746496, "learning_rate": 4.1691715651558e-06, "loss": 0.0257, "step": 6688 }, { "epoch": 1.7937784928935372, "grad_norm": 0.19949116673577705, "learning_rate": 4.1676331085323805e-06, "loss": 0.0162, "step": 6689 }, { "epoch": 1.7940466613032986, "grad_norm": 0.2598115223396874, "learning_rate": 4.166094732954316e-06, "loss": 0.0166, "step": 6690 }, { "epoch": 1.7943148297130598, "grad_norm": 0.2468456822813418, "learning_rate": 4.164556438571394e-06, "loss": 0.0267, "step": 6691 }, { "epoch": 1.7945829981228212, "grad_norm": 0.2723044786984433, "learning_rate": 4.163018225533394e-06, "loss": 0.0231, "step": 6692 }, { "epoch": 1.7948511665325824, "grad_norm": 0.2048621594059903, "learning_rate": 4.161480093990085e-06, "loss": 0.0263, "step": 6693 }, { "epoch": 1.7951193349423438, "grad_norm": 0.27400691103231534, "learning_rate": 4.159942044091235e-06, "loss": 0.027, "step": 6694 }, { "epoch": 1.7953875033521052, "grad_norm": 0.21666605568021075, "learning_rate": 4.158404075986596e-06, "loss": 0.0194, "step": 6695 }, { "epoch": 1.7956556717618666, "grad_norm": 0.23443305710684675, "learning_rate": 4.156866189825919e-06, "loss": 0.0223, "step": 6696 }, { "epoch": 1.7959238401716278, "grad_norm": 0.20601941173937266, "learning_rate": 4.155328385758943e-06, "loss": 0.0196, "step": 6697 }, { "epoch": 1.7961920085813892, "grad_norm": 0.24045230340890567, "learning_rate": 4.1537906639353984e-06, "loss": 0.0235, "step": 6698 }, { "epoch": 1.7964601769911503, "grad_norm": 0.18517583947282584, "learning_rate": 4.152253024505011e-06, "loss": 0.0139, "step": 6699 }, { "epoch": 1.7967283454009118, "grad_norm": 1.227824663229762, "learning_rate": 4.150715467617496e-06, "loss": 0.0469, "step": 6700 }, { "epoch": 1.7969965138106732, "grad_norm": 0.20497978050103396, "learning_rate": 4.14917799342256e-06, "loss": 0.018, "step": 6701 }, { "epoch": 1.7972646822204346, "grad_norm": 0.2305021146981779, "learning_rate": 4.1476406020699035e-06, "loss": 0.0172, "step": 6702 }, { "epoch": 1.7975328506301957, "grad_norm": 0.28063696584716963, "learning_rate": 4.14610329370922e-06, "loss": 0.018, "step": 6703 }, { "epoch": 1.797801019039957, "grad_norm": 0.28669868868680687, "learning_rate": 4.144566068490194e-06, "loss": 0.0244, "step": 6704 }, { "epoch": 1.7980691874497183, "grad_norm": 0.1998920811528924, "learning_rate": 4.143028926562496e-06, "loss": 0.0206, "step": 6705 }, { "epoch": 1.7983373558594797, "grad_norm": 0.6375603109420804, "learning_rate": 4.141491868075796e-06, "loss": 0.0308, "step": 6706 }, { "epoch": 1.7986055242692411, "grad_norm": 0.2803581789714871, "learning_rate": 4.139954893179753e-06, "loss": 0.0246, "step": 6707 }, { "epoch": 1.7988736926790025, "grad_norm": 0.2553709709015671, "learning_rate": 4.13841800202402e-06, "loss": 0.0257, "step": 6708 }, { "epoch": 1.7991418610887637, "grad_norm": 0.30195671463843493, "learning_rate": 4.136881194758239e-06, "loss": 0.0332, "step": 6709 }, { "epoch": 1.799410029498525, "grad_norm": 0.2709945461966601, "learning_rate": 4.1353444715320435e-06, "loss": 0.027, "step": 6710 }, { "epoch": 1.7996781979082863, "grad_norm": 0.3727611576541873, "learning_rate": 4.133807832495062e-06, "loss": 0.0239, "step": 6711 }, { "epoch": 1.7999463663180477, "grad_norm": 0.25567542171079866, "learning_rate": 4.13227127779691e-06, "loss": 0.0271, "step": 6712 }, { "epoch": 1.8002145347278091, "grad_norm": 0.204581720476797, "learning_rate": 4.130734807587202e-06, "loss": 0.0236, "step": 6713 }, { "epoch": 1.8004827031375705, "grad_norm": 0.4480581152810066, "learning_rate": 4.129198422015538e-06, "loss": 0.0259, "step": 6714 }, { "epoch": 1.8007508715473317, "grad_norm": 0.7823187443641172, "learning_rate": 4.127662121231511e-06, "loss": 0.0265, "step": 6715 }, { "epoch": 1.801019039957093, "grad_norm": 0.23277483801871576, "learning_rate": 4.126125905384708e-06, "loss": 0.0194, "step": 6716 }, { "epoch": 1.8012872083668543, "grad_norm": 0.2596039986991663, "learning_rate": 4.124589774624705e-06, "loss": 0.0157, "step": 6717 }, { "epoch": 1.8015553767766157, "grad_norm": 0.2944941310907993, "learning_rate": 4.123053729101072e-06, "loss": 0.0162, "step": 6718 }, { "epoch": 1.801823545186377, "grad_norm": 0.2154380049989296, "learning_rate": 4.121517768963368e-06, "loss": 0.0193, "step": 6719 }, { "epoch": 1.8020917135961385, "grad_norm": 0.23528207213000266, "learning_rate": 4.119981894361148e-06, "loss": 0.0152, "step": 6720 }, { "epoch": 1.8023598820058997, "grad_norm": 0.2279863941059893, "learning_rate": 4.118446105443954e-06, "loss": 0.021, "step": 6721 }, { "epoch": 1.8026280504156609, "grad_norm": 0.3406190347016827, "learning_rate": 4.116910402361321e-06, "loss": 0.0247, "step": 6722 }, { "epoch": 1.8028962188254223, "grad_norm": 0.21943475353075056, "learning_rate": 4.1153747852627775e-06, "loss": 0.0222, "step": 6723 }, { "epoch": 1.8031643872351837, "grad_norm": 0.21148308557683881, "learning_rate": 4.113839254297844e-06, "loss": 0.023, "step": 6724 }, { "epoch": 1.803432555644945, "grad_norm": 0.2800476033652728, "learning_rate": 4.112303809616029e-06, "loss": 0.0203, "step": 6725 }, { "epoch": 1.8037007240547065, "grad_norm": 0.22080487358032752, "learning_rate": 4.110768451366835e-06, "loss": 0.0208, "step": 6726 }, { "epoch": 1.8039688924644677, "grad_norm": 0.232841307215043, "learning_rate": 4.1092331796997565e-06, "loss": 0.0166, "step": 6727 }, { "epoch": 1.8042370608742289, "grad_norm": 0.5051797158104181, "learning_rate": 4.107697994764279e-06, "loss": 0.0209, "step": 6728 }, { "epoch": 1.8045052292839903, "grad_norm": 0.275117312626732, "learning_rate": 4.106162896709879e-06, "loss": 0.022, "step": 6729 }, { "epoch": 1.8047733976937517, "grad_norm": 0.24154150452115744, "learning_rate": 4.104627885686024e-06, "loss": 0.0229, "step": 6730 }, { "epoch": 1.805041566103513, "grad_norm": 0.217497691804683, "learning_rate": 4.1030929618421734e-06, "loss": 0.0193, "step": 6731 }, { "epoch": 1.8053097345132745, "grad_norm": 0.2199335744863198, "learning_rate": 4.101558125327781e-06, "loss": 0.0215, "step": 6732 }, { "epoch": 1.8055779029230357, "grad_norm": 0.20939191927980935, "learning_rate": 4.100023376292287e-06, "loss": 0.0181, "step": 6733 }, { "epoch": 1.8058460713327968, "grad_norm": 0.2195496023332162, "learning_rate": 4.098488714885128e-06, "loss": 0.0257, "step": 6734 }, { "epoch": 1.8061142397425582, "grad_norm": 0.27899400976937916, "learning_rate": 4.096954141255731e-06, "loss": 0.0291, "step": 6735 }, { "epoch": 1.8063824081523197, "grad_norm": 0.2230556743039783, "learning_rate": 4.095419655553511e-06, "loss": 0.0183, "step": 6736 }, { "epoch": 1.806650576562081, "grad_norm": 0.2873994415925082, "learning_rate": 4.093885257927877e-06, "loss": 0.026, "step": 6737 }, { "epoch": 1.8069187449718425, "grad_norm": 0.28089788550910544, "learning_rate": 4.092350948528231e-06, "loss": 0.0253, "step": 6738 }, { "epoch": 1.8071869133816036, "grad_norm": 0.2418715010373118, "learning_rate": 4.090816727503963e-06, "loss": 0.0177, "step": 6739 }, { "epoch": 1.8074550817913648, "grad_norm": 0.2971758679036104, "learning_rate": 4.089282595004456e-06, "loss": 0.0296, "step": 6740 }, { "epoch": 1.8077232502011262, "grad_norm": 0.39536350742392024, "learning_rate": 4.087748551179085e-06, "loss": 0.028, "step": 6741 }, { "epoch": 1.8079914186108876, "grad_norm": 0.32338214288373573, "learning_rate": 4.086214596177215e-06, "loss": 0.0325, "step": 6742 }, { "epoch": 1.808259587020649, "grad_norm": 0.23977566774657652, "learning_rate": 4.084680730148203e-06, "loss": 0.022, "step": 6743 }, { "epoch": 1.8085277554304104, "grad_norm": 0.3316673780638422, "learning_rate": 4.0831469532414e-06, "loss": 0.0312, "step": 6744 }, { "epoch": 1.8087959238401716, "grad_norm": 0.2626437346661217, "learning_rate": 4.081613265606143e-06, "loss": 0.0246, "step": 6745 }, { "epoch": 1.8090640922499328, "grad_norm": 0.19738911239542084, "learning_rate": 4.080079667391764e-06, "loss": 0.0238, "step": 6746 }, { "epoch": 1.8093322606596942, "grad_norm": 0.18392613338760166, "learning_rate": 4.078546158747586e-06, "loss": 0.0157, "step": 6747 }, { "epoch": 1.8096004290694556, "grad_norm": 0.3463968639828081, "learning_rate": 4.077012739822922e-06, "loss": 0.029, "step": 6748 }, { "epoch": 1.809868597479217, "grad_norm": 0.299426391573603, "learning_rate": 4.075479410767077e-06, "loss": 0.0206, "step": 6749 }, { "epoch": 1.8101367658889784, "grad_norm": 0.2548237033694923, "learning_rate": 4.073946171729345e-06, "loss": 0.017, "step": 6750 }, { "epoch": 1.8104049342987396, "grad_norm": 0.36604643494492584, "learning_rate": 4.072413022859019e-06, "loss": 0.0313, "step": 6751 }, { "epoch": 1.8106731027085008, "grad_norm": 0.22311092784413067, "learning_rate": 4.070879964305375e-06, "loss": 0.0156, "step": 6752 }, { "epoch": 1.8109412711182622, "grad_norm": 0.2864596293380473, "learning_rate": 4.06934699621768e-06, "loss": 0.0242, "step": 6753 }, { "epoch": 1.8112094395280236, "grad_norm": 0.3757063814748383, "learning_rate": 4.067814118745196e-06, "loss": 0.0258, "step": 6754 }, { "epoch": 1.811477607937785, "grad_norm": 0.34204519093042285, "learning_rate": 4.066281332037177e-06, "loss": 0.0268, "step": 6755 }, { "epoch": 1.8117457763475464, "grad_norm": 0.2621929833168547, "learning_rate": 4.064748636242866e-06, "loss": 0.0195, "step": 6756 }, { "epoch": 1.8120139447573076, "grad_norm": 0.19095296990533725, "learning_rate": 4.063216031511498e-06, "loss": 0.0158, "step": 6757 }, { "epoch": 1.8122821131670688, "grad_norm": 0.3425119346371788, "learning_rate": 4.061683517992298e-06, "loss": 0.0393, "step": 6758 }, { "epoch": 1.8125502815768302, "grad_norm": 0.21724192638083836, "learning_rate": 4.060151095834482e-06, "loss": 0.0269, "step": 6759 }, { "epoch": 1.8128184499865916, "grad_norm": 0.28678584327931267, "learning_rate": 4.0586187651872576e-06, "loss": 0.0189, "step": 6760 }, { "epoch": 1.813086618396353, "grad_norm": 0.22954393230681405, "learning_rate": 4.057086526199826e-06, "loss": 0.023, "step": 6761 }, { "epoch": 1.8133547868061144, "grad_norm": 0.2629698468416312, "learning_rate": 4.055554379021376e-06, "loss": 0.017, "step": 6762 }, { "epoch": 1.8136229552158756, "grad_norm": 0.29882126979966056, "learning_rate": 4.054022323801088e-06, "loss": 0.0293, "step": 6763 }, { "epoch": 1.8138911236256368, "grad_norm": 0.24581747260462916, "learning_rate": 4.052490360688136e-06, "loss": 0.0168, "step": 6764 }, { "epoch": 1.8141592920353982, "grad_norm": 0.3093360289318342, "learning_rate": 4.050958489831682e-06, "loss": 0.0259, "step": 6765 }, { "epoch": 1.8144274604451596, "grad_norm": 0.2657436843714907, "learning_rate": 4.049426711380879e-06, "loss": 0.0169, "step": 6766 }, { "epoch": 1.814695628854921, "grad_norm": 0.2238164247076444, "learning_rate": 4.047895025484874e-06, "loss": 0.0189, "step": 6767 }, { "epoch": 1.8149637972646824, "grad_norm": 0.22517291718914573, "learning_rate": 4.046363432292803e-06, "loss": 0.0165, "step": 6768 }, { "epoch": 1.8152319656744436, "grad_norm": 0.1938959493010603, "learning_rate": 4.044831931953792e-06, "loss": 0.0168, "step": 6769 }, { "epoch": 1.8155001340842047, "grad_norm": 0.2539273665245295, "learning_rate": 4.043300524616959e-06, "loss": 0.0197, "step": 6770 }, { "epoch": 1.8157683024939661, "grad_norm": 0.23172378738021313, "learning_rate": 4.041769210431414e-06, "loss": 0.0177, "step": 6771 }, { "epoch": 1.8160364709037276, "grad_norm": 0.2717901896887012, "learning_rate": 4.040237989546258e-06, "loss": 0.0195, "step": 6772 }, { "epoch": 1.816304639313489, "grad_norm": 0.2998323827895008, "learning_rate": 4.038706862110581e-06, "loss": 0.0258, "step": 6773 }, { "epoch": 1.8165728077232504, "grad_norm": 0.2037346963684397, "learning_rate": 4.037175828273464e-06, "loss": 0.0188, "step": 6774 }, { "epoch": 1.8168409761330115, "grad_norm": 0.2507450837111786, "learning_rate": 4.035644888183981e-06, "loss": 0.0195, "step": 6775 }, { "epoch": 1.8171091445427727, "grad_norm": 0.25337597882070734, "learning_rate": 4.034114041991194e-06, "loss": 0.0244, "step": 6776 }, { "epoch": 1.8173773129525341, "grad_norm": 0.2504672897396629, "learning_rate": 4.032583289844161e-06, "loss": 0.0191, "step": 6777 }, { "epoch": 1.8176454813622955, "grad_norm": 0.2747984583872415, "learning_rate": 4.0310526318919215e-06, "loss": 0.0309, "step": 6778 }, { "epoch": 1.817913649772057, "grad_norm": 0.2980035015276697, "learning_rate": 4.0295220682835165e-06, "loss": 0.0349, "step": 6779 }, { "epoch": 1.8181818181818183, "grad_norm": 0.24208563459654803, "learning_rate": 4.02799159916797e-06, "loss": 0.0144, "step": 6780 }, { "epoch": 1.8184499865915795, "grad_norm": 0.203754230039691, "learning_rate": 4.0264612246943005e-06, "loss": 0.0178, "step": 6781 }, { "epoch": 1.8187181550013407, "grad_norm": 0.24106893433842919, "learning_rate": 4.024930945011517e-06, "loss": 0.0214, "step": 6782 }, { "epoch": 1.8189863234111021, "grad_norm": 0.1775754813791592, "learning_rate": 4.02340076026862e-06, "loss": 0.0139, "step": 6783 }, { "epoch": 1.8192544918208635, "grad_norm": 0.37858519432593574, "learning_rate": 4.021870670614598e-06, "loss": 0.0344, "step": 6784 }, { "epoch": 1.819522660230625, "grad_norm": 0.18031010900343794, "learning_rate": 4.020340676198431e-06, "loss": 0.0173, "step": 6785 }, { "epoch": 1.8197908286403863, "grad_norm": 0.2840417034112351, "learning_rate": 4.0188107771690915e-06, "loss": 0.0243, "step": 6786 }, { "epoch": 1.8200589970501475, "grad_norm": 0.2757190835114367, "learning_rate": 4.017280973675542e-06, "loss": 0.019, "step": 6787 }, { "epoch": 1.8203271654599087, "grad_norm": 0.28824303360994463, "learning_rate": 4.015751265866734e-06, "loss": 0.0237, "step": 6788 }, { "epoch": 1.82059533386967, "grad_norm": 0.29892945555152645, "learning_rate": 4.014221653891615e-06, "loss": 0.0281, "step": 6789 }, { "epoch": 1.8208635022794315, "grad_norm": 0.3756929818448466, "learning_rate": 4.012692137899115e-06, "loss": 0.0256, "step": 6790 }, { "epoch": 1.821131670689193, "grad_norm": 0.2056648016688155, "learning_rate": 4.011162718038158e-06, "loss": 0.0163, "step": 6791 }, { "epoch": 1.8213998390989543, "grad_norm": 0.29864134874987647, "learning_rate": 4.009633394457664e-06, "loss": 0.0168, "step": 6792 }, { "epoch": 1.8216680075087155, "grad_norm": 0.3009766078845872, "learning_rate": 4.008104167306536e-06, "loss": 0.0225, "step": 6793 }, { "epoch": 1.8219361759184767, "grad_norm": 0.29875943916046327, "learning_rate": 4.006575036733672e-06, "loss": 0.0363, "step": 6794 }, { "epoch": 1.822204344328238, "grad_norm": 0.2742650673904684, "learning_rate": 4.00504600288796e-06, "loss": 0.0237, "step": 6795 }, { "epoch": 1.8224725127379995, "grad_norm": 0.20830622921579933, "learning_rate": 4.003517065918276e-06, "loss": 0.0183, "step": 6796 }, { "epoch": 1.8227406811477609, "grad_norm": 0.3679081977175839, "learning_rate": 4.001988225973491e-06, "loss": 0.0203, "step": 6797 }, { "epoch": 1.823008849557522, "grad_norm": 0.22142711152058328, "learning_rate": 4.000459483202461e-06, "loss": 0.0182, "step": 6798 }, { "epoch": 1.8232770179672835, "grad_norm": 0.22516563897539243, "learning_rate": 3.998930837754039e-06, "loss": 0.0168, "step": 6799 }, { "epoch": 1.8235451863770447, "grad_norm": 0.35351982823343914, "learning_rate": 3.997402289777062e-06, "loss": 0.0323, "step": 6800 }, { "epoch": 1.823813354786806, "grad_norm": 0.20877106570325116, "learning_rate": 3.995873839420366e-06, "loss": 0.0197, "step": 6801 }, { "epoch": 1.8240815231965675, "grad_norm": 0.21577659199885574, "learning_rate": 3.994345486832765e-06, "loss": 0.0191, "step": 6802 }, { "epoch": 1.8243496916063289, "grad_norm": 0.20263713304583783, "learning_rate": 3.992817232163074e-06, "loss": 0.0198, "step": 6803 }, { "epoch": 1.82461786001609, "grad_norm": 0.2824159740656318, "learning_rate": 3.991289075560096e-06, "loss": 0.0245, "step": 6804 }, { "epoch": 1.8248860284258515, "grad_norm": 0.216934186634219, "learning_rate": 3.989761017172622e-06, "loss": 0.0214, "step": 6805 }, { "epoch": 1.8251541968356126, "grad_norm": 0.43027848493971427, "learning_rate": 3.988233057149436e-06, "loss": 0.0284, "step": 6806 }, { "epoch": 1.825422365245374, "grad_norm": 0.23051301465692267, "learning_rate": 3.9867051956393114e-06, "loss": 0.0196, "step": 6807 }, { "epoch": 1.8256905336551355, "grad_norm": 0.1988892980410211, "learning_rate": 3.98517743279101e-06, "loss": 0.0158, "step": 6808 }, { "epoch": 1.8259587020648969, "grad_norm": 0.35709483893954613, "learning_rate": 3.983649768753289e-06, "loss": 0.0243, "step": 6809 }, { "epoch": 1.826226870474658, "grad_norm": 0.2125081941853152, "learning_rate": 3.982122203674891e-06, "loss": 0.0195, "step": 6810 }, { "epoch": 1.8264950388844194, "grad_norm": 0.2688770451637419, "learning_rate": 3.980594737704552e-06, "loss": 0.0234, "step": 6811 }, { "epoch": 1.8267632072941806, "grad_norm": 0.24043044830035892, "learning_rate": 3.979067370990998e-06, "loss": 0.0215, "step": 6812 }, { "epoch": 1.827031375703942, "grad_norm": 0.2598625661582212, "learning_rate": 3.977540103682943e-06, "loss": 0.018, "step": 6813 }, { "epoch": 1.8272995441137034, "grad_norm": 0.23956972153666029, "learning_rate": 3.976012935929093e-06, "loss": 0.0247, "step": 6814 }, { "epoch": 1.8275677125234648, "grad_norm": 0.23936322454548165, "learning_rate": 3.974485867878145e-06, "loss": 0.0195, "step": 6815 }, { "epoch": 1.827835880933226, "grad_norm": 0.22774332461626628, "learning_rate": 3.972958899678786e-06, "loss": 0.0161, "step": 6816 }, { "epoch": 1.8281040493429874, "grad_norm": 0.4217813254027379, "learning_rate": 3.971432031479691e-06, "loss": 0.0194, "step": 6817 }, { "epoch": 1.8283722177527486, "grad_norm": 0.23866481546410268, "learning_rate": 3.969905263429529e-06, "loss": 0.0261, "step": 6818 }, { "epoch": 1.82864038616251, "grad_norm": 0.17992612296186683, "learning_rate": 3.968378595676956e-06, "loss": 0.017, "step": 6819 }, { "epoch": 1.8289085545722714, "grad_norm": 0.19829157259462094, "learning_rate": 3.96685202837062e-06, "loss": 0.0177, "step": 6820 }, { "epoch": 1.8291767229820328, "grad_norm": 0.22727604953837693, "learning_rate": 3.96532556165916e-06, "loss": 0.0159, "step": 6821 }, { "epoch": 1.829444891391794, "grad_norm": 0.3051590683026175, "learning_rate": 3.963799195691202e-06, "loss": 0.0189, "step": 6822 }, { "epoch": 1.8297130598015554, "grad_norm": 0.20057192286069714, "learning_rate": 3.962272930615367e-06, "loss": 0.0185, "step": 6823 }, { "epoch": 1.8299812282113166, "grad_norm": 0.186962097277256, "learning_rate": 3.96074676658026e-06, "loss": 0.0126, "step": 6824 }, { "epoch": 1.830249396621078, "grad_norm": 0.25601154824880307, "learning_rate": 3.959220703734483e-06, "loss": 0.0219, "step": 6825 }, { "epoch": 1.8305175650308394, "grad_norm": 0.3674955839651876, "learning_rate": 3.957694742226622e-06, "loss": 0.0191, "step": 6826 }, { "epoch": 1.8307857334406008, "grad_norm": 0.26116638812738735, "learning_rate": 3.956168882205257e-06, "loss": 0.0236, "step": 6827 }, { "epoch": 1.831053901850362, "grad_norm": 0.21275240977905405, "learning_rate": 3.9546431238189556e-06, "loss": 0.017, "step": 6828 }, { "epoch": 1.8313220702601234, "grad_norm": 0.3226555855535195, "learning_rate": 3.953117467216277e-06, "loss": 0.0232, "step": 6829 }, { "epoch": 1.8315902386698846, "grad_norm": 0.3442536500584343, "learning_rate": 3.951591912545773e-06, "loss": 0.0317, "step": 6830 }, { "epoch": 1.831858407079646, "grad_norm": 0.19991822717616334, "learning_rate": 3.95006645995598e-06, "loss": 0.0157, "step": 6831 }, { "epoch": 1.8321265754894074, "grad_norm": 0.5828679670967211, "learning_rate": 3.948541109595429e-06, "loss": 0.0224, "step": 6832 }, { "epoch": 1.8323947438991688, "grad_norm": 0.31329316749037833, "learning_rate": 3.947015861612639e-06, "loss": 0.0243, "step": 6833 }, { "epoch": 1.83266291230893, "grad_norm": 0.3191192436104779, "learning_rate": 3.945490716156117e-06, "loss": 0.0216, "step": 6834 }, { "epoch": 1.8329310807186914, "grad_norm": 0.22617607198159442, "learning_rate": 3.943965673374367e-06, "loss": 0.0131, "step": 6835 }, { "epoch": 1.8331992491284526, "grad_norm": 0.24348389254197622, "learning_rate": 3.942440733415873e-06, "loss": 0.0219, "step": 6836 }, { "epoch": 1.833467417538214, "grad_norm": 0.21359357406744642, "learning_rate": 3.94091589642912e-06, "loss": 0.0139, "step": 6837 }, { "epoch": 1.8337355859479754, "grad_norm": 0.33468017246055964, "learning_rate": 3.939391162562573e-06, "loss": 0.0224, "step": 6838 }, { "epoch": 1.8340037543577368, "grad_norm": 0.25589175427869687, "learning_rate": 3.937866531964691e-06, "loss": 0.0242, "step": 6839 }, { "epoch": 1.834271922767498, "grad_norm": 0.24865337370624824, "learning_rate": 3.936342004783926e-06, "loss": 0.0195, "step": 6840 }, { "epoch": 1.8345400911772594, "grad_norm": 0.27366062312724504, "learning_rate": 3.934817581168716e-06, "loss": 0.0135, "step": 6841 }, { "epoch": 1.8348082595870205, "grad_norm": 0.40039656771597726, "learning_rate": 3.93329326126749e-06, "loss": 0.0238, "step": 6842 }, { "epoch": 1.835076427996782, "grad_norm": 0.22879054148184938, "learning_rate": 3.931769045228668e-06, "loss": 0.0179, "step": 6843 }, { "epoch": 1.8353445964065433, "grad_norm": 0.25529385671504007, "learning_rate": 3.930244933200657e-06, "loss": 0.0245, "step": 6844 }, { "epoch": 1.8356127648163048, "grad_norm": 0.28914893650218615, "learning_rate": 3.928720925331857e-06, "loss": 0.0194, "step": 6845 }, { "epoch": 1.835880933226066, "grad_norm": 0.32301510600306277, "learning_rate": 3.9271970217706555e-06, "loss": 0.023, "step": 6846 }, { "epoch": 1.8361491016358273, "grad_norm": 0.21494747235990944, "learning_rate": 3.9256732226654325e-06, "loss": 0.0202, "step": 6847 }, { "epoch": 1.8364172700455885, "grad_norm": 0.23488292999177632, "learning_rate": 3.924149528164556e-06, "loss": 0.0177, "step": 6848 }, { "epoch": 1.83668543845535, "grad_norm": 0.3131259614361377, "learning_rate": 3.9226259384163855e-06, "loss": 0.0229, "step": 6849 }, { "epoch": 1.8369536068651113, "grad_norm": 0.2783242141847321, "learning_rate": 3.921102453569264e-06, "loss": 0.0184, "step": 6850 }, { "epoch": 1.8372217752748727, "grad_norm": 0.229044604854004, "learning_rate": 3.919579073771532e-06, "loss": 0.0221, "step": 6851 }, { "epoch": 1.837489943684634, "grad_norm": 0.21689194706259882, "learning_rate": 3.918055799171518e-06, "loss": 0.02, "step": 6852 }, { "epoch": 1.8377581120943953, "grad_norm": 0.2758134826921344, "learning_rate": 3.9165326299175385e-06, "loss": 0.0245, "step": 6853 }, { "epoch": 1.8380262805041565, "grad_norm": 0.3092085392029247, "learning_rate": 3.9150095661579005e-06, "loss": 0.0276, "step": 6854 }, { "epoch": 1.838294448913918, "grad_norm": 0.1773941236689758, "learning_rate": 3.9134866080409e-06, "loss": 0.0173, "step": 6855 }, { "epoch": 1.8385626173236793, "grad_norm": 0.2930888952838018, "learning_rate": 3.911963755714823e-06, "loss": 0.0223, "step": 6856 }, { "epoch": 1.8388307857334407, "grad_norm": 0.24267788426849224, "learning_rate": 3.910441009327947e-06, "loss": 0.0217, "step": 6857 }, { "epoch": 1.839098954143202, "grad_norm": 0.2880526324637451, "learning_rate": 3.908918369028538e-06, "loss": 0.0219, "step": 6858 }, { "epoch": 1.8393671225529633, "grad_norm": 0.19992320634992655, "learning_rate": 3.90739583496485e-06, "loss": 0.0152, "step": 6859 }, { "epoch": 1.8396352909627245, "grad_norm": 0.22795088920161416, "learning_rate": 3.90587340728513e-06, "loss": 0.0132, "step": 6860 }, { "epoch": 1.839903459372486, "grad_norm": 0.38167979539488595, "learning_rate": 3.904351086137611e-06, "loss": 0.0193, "step": 6861 }, { "epoch": 1.8401716277822473, "grad_norm": 0.25737225202146746, "learning_rate": 3.902828871670518e-06, "loss": 0.0224, "step": 6862 }, { "epoch": 1.8404397961920087, "grad_norm": 0.30221183184085193, "learning_rate": 3.901306764032063e-06, "loss": 0.0221, "step": 6863 }, { "epoch": 1.8407079646017699, "grad_norm": 0.2529562720173654, "learning_rate": 3.8997847633704535e-06, "loss": 0.0272, "step": 6864 }, { "epoch": 1.8409761330115313, "grad_norm": 0.2397132250066566, "learning_rate": 3.898262869833879e-06, "loss": 0.0202, "step": 6865 }, { "epoch": 1.8412443014212925, "grad_norm": 0.2809433690288611, "learning_rate": 3.896741083570525e-06, "loss": 0.0268, "step": 6866 }, { "epoch": 1.8415124698310539, "grad_norm": 0.20066394068501658, "learning_rate": 3.895219404728561e-06, "loss": 0.0178, "step": 6867 }, { "epoch": 1.8417806382408153, "grad_norm": 0.17565602349233542, "learning_rate": 3.8936978334561516e-06, "loss": 0.0145, "step": 6868 }, { "epoch": 1.8420488066505767, "grad_norm": 0.4268600800034085, "learning_rate": 3.892176369901446e-06, "loss": 0.0271, "step": 6869 }, { "epoch": 1.8423169750603379, "grad_norm": 0.19654079281168427, "learning_rate": 3.890655014212588e-06, "loss": 0.021, "step": 6870 }, { "epoch": 1.8425851434700993, "grad_norm": 0.2041703701901406, "learning_rate": 3.889133766537705e-06, "loss": 0.0154, "step": 6871 }, { "epoch": 1.8428533118798605, "grad_norm": 0.23275420660408408, "learning_rate": 3.887612627024917e-06, "loss": 0.0199, "step": 6872 }, { "epoch": 1.8431214802896219, "grad_norm": 0.19357632367642727, "learning_rate": 3.8860915958223335e-06, "loss": 0.0191, "step": 6873 }, { "epoch": 1.8433896486993833, "grad_norm": 0.3224278366745231, "learning_rate": 3.884570673078058e-06, "loss": 0.027, "step": 6874 }, { "epoch": 1.8436578171091447, "grad_norm": 0.19959528199234783, "learning_rate": 3.883049858940172e-06, "loss": 0.0143, "step": 6875 }, { "epoch": 1.8439259855189059, "grad_norm": 0.15741918000248914, "learning_rate": 3.881529153556756e-06, "loss": 0.0162, "step": 6876 }, { "epoch": 1.8441941539286673, "grad_norm": 0.4075349315689493, "learning_rate": 3.880008557075875e-06, "loss": 0.0261, "step": 6877 }, { "epoch": 1.8444623223384284, "grad_norm": 0.31178568204909296, "learning_rate": 3.87848806964559e-06, "loss": 0.0207, "step": 6878 }, { "epoch": 1.8447304907481898, "grad_norm": 0.21134883750081107, "learning_rate": 3.8769676914139426e-06, "loss": 0.0188, "step": 6879 }, { "epoch": 1.8449986591579512, "grad_norm": 0.22902306980986897, "learning_rate": 3.87544742252897e-06, "loss": 0.0217, "step": 6880 }, { "epoch": 1.8452668275677127, "grad_norm": 0.240613679407034, "learning_rate": 3.873927263138696e-06, "loss": 0.027, "step": 6881 }, { "epoch": 1.8455349959774738, "grad_norm": 0.3063596773488676, "learning_rate": 3.872407213391135e-06, "loss": 0.029, "step": 6882 }, { "epoch": 1.845803164387235, "grad_norm": 0.2771864551051406, "learning_rate": 3.87088727343429e-06, "loss": 0.0279, "step": 6883 }, { "epoch": 1.8460713327969964, "grad_norm": 0.24117336067420936, "learning_rate": 3.869367443416152e-06, "loss": 0.0201, "step": 6884 }, { "epoch": 1.8463395012067578, "grad_norm": 0.8634318098260425, "learning_rate": 3.867847723484705e-06, "loss": 0.0431, "step": 6885 }, { "epoch": 1.8466076696165192, "grad_norm": 0.19649903365244265, "learning_rate": 3.866328113787923e-06, "loss": 0.0168, "step": 6886 }, { "epoch": 1.8468758380262806, "grad_norm": 0.3700210095681029, "learning_rate": 3.86480861447376e-06, "loss": 0.0287, "step": 6887 }, { "epoch": 1.8471440064360418, "grad_norm": 0.3337350163514652, "learning_rate": 3.863289225690166e-06, "loss": 0.013, "step": 6888 }, { "epoch": 1.847412174845803, "grad_norm": 0.2373124555373982, "learning_rate": 3.861769947585084e-06, "loss": 0.023, "step": 6889 }, { "epoch": 1.8476803432555644, "grad_norm": 0.22158872838084537, "learning_rate": 3.860250780306441e-06, "loss": 0.0237, "step": 6890 }, { "epoch": 1.8479485116653258, "grad_norm": 0.2453803896127285, "learning_rate": 3.858731724002153e-06, "loss": 0.0235, "step": 6891 }, { "epoch": 1.8482166800750872, "grad_norm": 0.2268365167463081, "learning_rate": 3.857212778820127e-06, "loss": 0.0219, "step": 6892 }, { "epoch": 1.8484848484848486, "grad_norm": 0.22553892905562506, "learning_rate": 3.8556939449082584e-06, "loss": 0.0257, "step": 6893 }, { "epoch": 1.8487530168946098, "grad_norm": 0.4539107942943889, "learning_rate": 3.854175222414432e-06, "loss": 0.0262, "step": 6894 }, { "epoch": 1.849021185304371, "grad_norm": 0.1796585338016776, "learning_rate": 3.852656611486522e-06, "loss": 0.0096, "step": 6895 }, { "epoch": 1.8492893537141324, "grad_norm": 0.27183537425558546, "learning_rate": 3.851138112272393e-06, "loss": 0.0282, "step": 6896 }, { "epoch": 1.8495575221238938, "grad_norm": 0.2463449117573995, "learning_rate": 3.849619724919895e-06, "loss": 0.0364, "step": 6897 }, { "epoch": 1.8498256905336552, "grad_norm": 0.19304805188821675, "learning_rate": 3.848101449576871e-06, "loss": 0.0201, "step": 6898 }, { "epoch": 1.8500938589434166, "grad_norm": 0.17187817169568295, "learning_rate": 3.84658328639115e-06, "loss": 0.0134, "step": 6899 }, { "epoch": 1.8503620273531778, "grad_norm": 0.17592823579543396, "learning_rate": 3.8450652355105525e-06, "loss": 0.0158, "step": 6900 }, { "epoch": 1.850630195762939, "grad_norm": 0.2616139902425976, "learning_rate": 3.843547297082886e-06, "loss": 0.0226, "step": 6901 }, { "epoch": 1.8508983641727004, "grad_norm": 0.30199025938468294, "learning_rate": 3.842029471255949e-06, "loss": 0.0246, "step": 6902 }, { "epoch": 1.8511665325824618, "grad_norm": 0.40386045452013797, "learning_rate": 3.840511758177528e-06, "loss": 0.0179, "step": 6903 }, { "epoch": 1.8514347009922232, "grad_norm": 0.22174483233751435, "learning_rate": 3.838994157995398e-06, "loss": 0.0309, "step": 6904 }, { "epoch": 1.8517028694019846, "grad_norm": 0.541906728761193, "learning_rate": 3.837476670857326e-06, "loss": 0.0354, "step": 6905 }, { "epoch": 1.8519710378117458, "grad_norm": 0.2746545247641939, "learning_rate": 3.835959296911063e-06, "loss": 0.0288, "step": 6906 }, { "epoch": 1.852239206221507, "grad_norm": 0.2634335445212595, "learning_rate": 3.834442036304353e-06, "loss": 0.0248, "step": 6907 }, { "epoch": 1.8525073746312684, "grad_norm": 0.1854678347013182, "learning_rate": 3.832924889184928e-06, "loss": 0.0137, "step": 6908 }, { "epoch": 1.8527755430410298, "grad_norm": 0.21435738902581836, "learning_rate": 3.831407855700508e-06, "loss": 0.0209, "step": 6909 }, { "epoch": 1.8530437114507912, "grad_norm": 0.5612920963719109, "learning_rate": 3.829890935998802e-06, "loss": 0.0212, "step": 6910 }, { "epoch": 1.8533118798605526, "grad_norm": 0.19901517860855003, "learning_rate": 3.82837413022751e-06, "loss": 0.0152, "step": 6911 }, { "epoch": 1.8535800482703138, "grad_norm": 0.2122049907532062, "learning_rate": 3.826857438534317e-06, "loss": 0.0172, "step": 6912 }, { "epoch": 1.853848216680075, "grad_norm": 0.29029020089342544, "learning_rate": 3.8253408610669015e-06, "loss": 0.0287, "step": 6913 }, { "epoch": 1.8541163850898363, "grad_norm": 0.21149485758919676, "learning_rate": 3.8238243979729275e-06, "loss": 0.0196, "step": 6914 }, { "epoch": 1.8543845534995977, "grad_norm": 0.35089621676942184, "learning_rate": 3.822308049400047e-06, "loss": 0.0215, "step": 6915 }, { "epoch": 1.8546527219093591, "grad_norm": 0.6843409134261673, "learning_rate": 3.820791815495908e-06, "loss": 0.0194, "step": 6916 }, { "epoch": 1.8549208903191206, "grad_norm": 0.32253935023872665, "learning_rate": 3.819275696408138e-06, "loss": 0.0286, "step": 6917 }, { "epoch": 1.8551890587288817, "grad_norm": 0.2526319835967422, "learning_rate": 3.817759692284358e-06, "loss": 0.0215, "step": 6918 }, { "epoch": 1.855457227138643, "grad_norm": 0.2588629061232179, "learning_rate": 3.816243803272179e-06, "loss": 0.0225, "step": 6919 }, { "epoch": 1.8557253955484043, "grad_norm": 0.22739556193210014, "learning_rate": 3.814728029519198e-06, "loss": 0.0187, "step": 6920 }, { "epoch": 1.8559935639581657, "grad_norm": 0.23295706503447042, "learning_rate": 3.8132123711730012e-06, "loss": 0.0176, "step": 6921 }, { "epoch": 1.8562617323679271, "grad_norm": 0.2347224468116582, "learning_rate": 3.811696828381168e-06, "loss": 0.0175, "step": 6922 }, { "epoch": 1.8565299007776885, "grad_norm": 0.2994484563086511, "learning_rate": 3.8101814012912575e-06, "loss": 0.0226, "step": 6923 }, { "epoch": 1.8567980691874497, "grad_norm": 0.27877140233863795, "learning_rate": 3.808666090050824e-06, "loss": 0.0203, "step": 6924 }, { "epoch": 1.857066237597211, "grad_norm": 0.23736863197131516, "learning_rate": 3.807150894807411e-06, "loss": 0.0241, "step": 6925 }, { "epoch": 1.8573344060069723, "grad_norm": 0.25013429411361215, "learning_rate": 3.805635815708548e-06, "loss": 0.0305, "step": 6926 }, { "epoch": 1.8576025744167337, "grad_norm": 0.26634989133974035, "learning_rate": 3.804120852901756e-06, "loss": 0.0237, "step": 6927 }, { "epoch": 1.8578707428264951, "grad_norm": 0.28896852925463723, "learning_rate": 3.8026060065345407e-06, "loss": 0.0189, "step": 6928 }, { "epoch": 1.8581389112362565, "grad_norm": 0.2528866207024745, "learning_rate": 3.8010912767543993e-06, "loss": 0.0241, "step": 6929 }, { "epoch": 1.8584070796460177, "grad_norm": 0.24553085521626805, "learning_rate": 3.799576663708818e-06, "loss": 0.0206, "step": 6930 }, { "epoch": 1.8586752480557789, "grad_norm": 0.4310048778041812, "learning_rate": 3.79806216754527e-06, "loss": 0.0257, "step": 6931 }, { "epoch": 1.8589434164655403, "grad_norm": 0.2862437213992107, "learning_rate": 3.7965477884112156e-06, "loss": 0.0207, "step": 6932 }, { "epoch": 1.8592115848753017, "grad_norm": 0.23133394353864725, "learning_rate": 3.79503352645411e-06, "loss": 0.0264, "step": 6933 }, { "epoch": 1.859479753285063, "grad_norm": 0.20907174962762276, "learning_rate": 3.793519381821392e-06, "loss": 0.0202, "step": 6934 }, { "epoch": 1.8597479216948245, "grad_norm": 0.23161348247079522, "learning_rate": 3.7920053546604875e-06, "loss": 0.0223, "step": 6935 }, { "epoch": 1.8600160901045857, "grad_norm": 0.19155366008762223, "learning_rate": 3.790491445118812e-06, "loss": 0.0179, "step": 6936 }, { "epoch": 1.8602842585143469, "grad_norm": 0.4005368812297641, "learning_rate": 3.788977653343776e-06, "loss": 0.0251, "step": 6937 }, { "epoch": 1.8605524269241083, "grad_norm": 0.25845495421528586, "learning_rate": 3.78746397948277e-06, "loss": 0.0193, "step": 6938 }, { "epoch": 1.8608205953338697, "grad_norm": 0.44631407335768314, "learning_rate": 3.7859504236831766e-06, "loss": 0.0179, "step": 6939 }, { "epoch": 1.861088763743631, "grad_norm": 0.27091810661515703, "learning_rate": 3.784436986092368e-06, "loss": 0.0165, "step": 6940 }, { "epoch": 1.8613569321533925, "grad_norm": 0.2500962624375777, "learning_rate": 3.7829236668577025e-06, "loss": 0.0283, "step": 6941 }, { "epoch": 1.8616251005631537, "grad_norm": 0.24769954266746042, "learning_rate": 3.7814104661265272e-06, "loss": 0.0194, "step": 6942 }, { "epoch": 1.8618932689729148, "grad_norm": 0.2648395917941033, "learning_rate": 3.7798973840461805e-06, "loss": 0.0246, "step": 6943 }, { "epoch": 1.8621614373826763, "grad_norm": 0.25525958801830617, "learning_rate": 3.7783844207639864e-06, "loss": 0.0256, "step": 6944 }, { "epoch": 1.8624296057924377, "grad_norm": 0.2447714175444359, "learning_rate": 3.776871576427258e-06, "loss": 0.0189, "step": 6945 }, { "epoch": 1.862697774202199, "grad_norm": 0.527679564382456, "learning_rate": 3.7753588511832973e-06, "loss": 0.0264, "step": 6946 }, { "epoch": 1.8629659426119605, "grad_norm": 0.19437480485350803, "learning_rate": 3.7738462451793932e-06, "loss": 0.0222, "step": 6947 }, { "epoch": 1.8632341110217217, "grad_norm": 0.24861708777404487, "learning_rate": 3.772333758562825e-06, "loss": 0.0224, "step": 6948 }, { "epoch": 1.8635022794314828, "grad_norm": 0.20935678403845923, "learning_rate": 3.7708213914808595e-06, "loss": 0.0155, "step": 6949 }, { "epoch": 1.8637704478412442, "grad_norm": 0.20856999955869213, "learning_rate": 3.7693091440807517e-06, "loss": 0.0212, "step": 6950 }, { "epoch": 1.8640386162510056, "grad_norm": 0.23998810518278219, "learning_rate": 3.7677970165097444e-06, "loss": 0.0199, "step": 6951 }, { "epoch": 1.864306784660767, "grad_norm": 0.23121167793853545, "learning_rate": 3.7662850089150693e-06, "loss": 0.0281, "step": 6952 }, { "epoch": 1.8645749530705285, "grad_norm": 0.32107159270748564, "learning_rate": 3.764773121443949e-06, "loss": 0.021, "step": 6953 }, { "epoch": 1.8648431214802896, "grad_norm": 0.16394119503769808, "learning_rate": 3.7632613542435904e-06, "loss": 0.0171, "step": 6954 }, { "epoch": 1.8651112898900508, "grad_norm": 0.22308678785144828, "learning_rate": 3.76174970746119e-06, "loss": 0.0166, "step": 6955 }, { "epoch": 1.8653794582998122, "grad_norm": 0.2607107652943991, "learning_rate": 3.760238181243933e-06, "loss": 0.0184, "step": 6956 }, { "epoch": 1.8656476267095736, "grad_norm": 0.2805988845489507, "learning_rate": 3.758726775738993e-06, "loss": 0.0258, "step": 6957 }, { "epoch": 1.865915795119335, "grad_norm": 0.2794543660113636, "learning_rate": 3.757215491093532e-06, "loss": 0.0224, "step": 6958 }, { "epoch": 1.8661839635290964, "grad_norm": 0.1937611782214648, "learning_rate": 3.7557043274546983e-06, "loss": 0.0167, "step": 6959 }, { "epoch": 1.8664521319388576, "grad_norm": 0.23307789623691946, "learning_rate": 3.754193284969631e-06, "loss": 0.0183, "step": 6960 }, { "epoch": 1.8667203003486188, "grad_norm": 0.18633256546762245, "learning_rate": 3.752682363785456e-06, "loss": 0.0249, "step": 6961 }, { "epoch": 1.8669884687583802, "grad_norm": 0.21827672780996532, "learning_rate": 3.7511715640492867e-06, "loss": 0.018, "step": 6962 }, { "epoch": 1.8672566371681416, "grad_norm": 0.197992716588563, "learning_rate": 3.749660885908226e-06, "loss": 0.0208, "step": 6963 }, { "epoch": 1.867524805577903, "grad_norm": 0.35439091852439303, "learning_rate": 3.7481503295093667e-06, "loss": 0.0342, "step": 6964 }, { "epoch": 1.8677929739876644, "grad_norm": 0.218978327780633, "learning_rate": 3.7466398949997864e-06, "loss": 0.0236, "step": 6965 }, { "epoch": 1.8680611423974256, "grad_norm": 0.34114200973506364, "learning_rate": 3.745129582526552e-06, "loss": 0.0192, "step": 6966 }, { "epoch": 1.8683293108071868, "grad_norm": 0.23455429643048675, "learning_rate": 3.7436193922367183e-06, "loss": 0.022, "step": 6967 }, { "epoch": 1.8685974792169482, "grad_norm": 0.19155953919726126, "learning_rate": 3.7421093242773284e-06, "loss": 0.0108, "step": 6968 }, { "epoch": 1.8688656476267096, "grad_norm": 0.22619788885285666, "learning_rate": 3.740599378795413e-06, "loss": 0.0169, "step": 6969 }, { "epoch": 1.869133816036471, "grad_norm": 0.1958097300640288, "learning_rate": 3.7390895559379956e-06, "loss": 0.019, "step": 6970 }, { "epoch": 1.8694019844462322, "grad_norm": 0.2888452949974741, "learning_rate": 3.737579855852078e-06, "loss": 0.0231, "step": 6971 }, { "epoch": 1.8696701528559936, "grad_norm": 0.2187344420831675, "learning_rate": 3.7360702786846594e-06, "loss": 0.0182, "step": 6972 }, { "epoch": 1.8699383212657548, "grad_norm": 0.19178801091015069, "learning_rate": 3.73456082458272e-06, "loss": 0.0174, "step": 6973 }, { "epoch": 1.8702064896755162, "grad_norm": 0.2697875391590228, "learning_rate": 3.733051493693235e-06, "loss": 0.0271, "step": 6974 }, { "epoch": 1.8704746580852776, "grad_norm": 0.16702355312191433, "learning_rate": 3.7315422861631623e-06, "loss": 0.0125, "step": 6975 }, { "epoch": 1.870742826495039, "grad_norm": 0.2753655706747481, "learning_rate": 3.7300332021394494e-06, "loss": 0.0284, "step": 6976 }, { "epoch": 1.8710109949048002, "grad_norm": 0.29307470070896974, "learning_rate": 3.7285242417690315e-06, "loss": 0.0239, "step": 6977 }, { "epoch": 1.8712791633145616, "grad_norm": 0.6665346557541083, "learning_rate": 3.727015405198833e-06, "loss": 0.0315, "step": 6978 }, { "epoch": 1.8715473317243227, "grad_norm": 0.28816724856558723, "learning_rate": 3.725506692575764e-06, "loss": 0.0222, "step": 6979 }, { "epoch": 1.8718155001340842, "grad_norm": 0.2638693850188307, "learning_rate": 3.723998104046725e-06, "loss": 0.0266, "step": 6980 }, { "epoch": 1.8720836685438456, "grad_norm": 0.18862376917066837, "learning_rate": 3.722489639758603e-06, "loss": 0.0147, "step": 6981 }, { "epoch": 1.872351836953607, "grad_norm": 0.24689150124554218, "learning_rate": 3.7209812998582753e-06, "loss": 0.0172, "step": 6982 }, { "epoch": 1.8726200053633681, "grad_norm": 0.292614102732543, "learning_rate": 3.719473084492601e-06, "loss": 0.0243, "step": 6983 }, { "epoch": 1.8728881737731295, "grad_norm": 0.20445945568691337, "learning_rate": 3.7179649938084305e-06, "loss": 0.0142, "step": 6984 }, { "epoch": 1.8731563421828907, "grad_norm": 0.27538521797114024, "learning_rate": 3.7164570279526067e-06, "loss": 0.025, "step": 6985 }, { "epoch": 1.8734245105926521, "grad_norm": 0.21288748774711, "learning_rate": 3.7149491870719546e-06, "loss": 0.0201, "step": 6986 }, { "epoch": 1.8736926790024135, "grad_norm": 0.3490647517692959, "learning_rate": 3.7134414713132883e-06, "loss": 0.0195, "step": 6987 }, { "epoch": 1.873960847412175, "grad_norm": 0.2699833440703194, "learning_rate": 3.71193388082341e-06, "loss": 0.0271, "step": 6988 }, { "epoch": 1.8742290158219361, "grad_norm": 0.21326003424104192, "learning_rate": 3.71042641574911e-06, "loss": 0.0197, "step": 6989 }, { "epoch": 1.8744971842316975, "grad_norm": 0.22691999511398894, "learning_rate": 3.7089190762371653e-06, "loss": 0.0153, "step": 6990 }, { "epoch": 1.8747653526414587, "grad_norm": 0.1958500585735867, "learning_rate": 3.707411862434343e-06, "loss": 0.0164, "step": 6991 }, { "epoch": 1.8750335210512201, "grad_norm": 0.21554136204973157, "learning_rate": 3.705904774487396e-06, "loss": 0.0186, "step": 6992 }, { "epoch": 1.8753016894609815, "grad_norm": 0.2693994121691686, "learning_rate": 3.7043978125430656e-06, "loss": 0.021, "step": 6993 }, { "epoch": 1.875569857870743, "grad_norm": 0.3652888290555059, "learning_rate": 3.7028909767480805e-06, "loss": 0.0295, "step": 6994 }, { "epoch": 1.8758380262805041, "grad_norm": 0.2914650508213058, "learning_rate": 3.7013842672491577e-06, "loss": 0.0342, "step": 6995 }, { "epoch": 1.8761061946902655, "grad_norm": 0.21938719293537903, "learning_rate": 3.6998776841930006e-06, "loss": 0.0215, "step": 6996 }, { "epoch": 1.8763743631000267, "grad_norm": 0.4920219307855389, "learning_rate": 3.698371227726302e-06, "loss": 0.0265, "step": 6997 }, { "epoch": 1.876642531509788, "grad_norm": 0.3262475262146318, "learning_rate": 3.69686489799574e-06, "loss": 0.0206, "step": 6998 }, { "epoch": 1.8769106999195495, "grad_norm": 0.27683135678380505, "learning_rate": 3.6953586951479834e-06, "loss": 0.0178, "step": 6999 }, { "epoch": 1.877178868329311, "grad_norm": 0.21968624320647417, "learning_rate": 3.6938526193296873e-06, "loss": 0.0185, "step": 7000 }, { "epoch": 1.877447036739072, "grad_norm": 0.4556426087693223, "learning_rate": 3.6923466706874933e-06, "loss": 0.0159, "step": 7001 }, { "epoch": 1.8777152051488335, "grad_norm": 0.29466512473219314, "learning_rate": 3.6908408493680326e-06, "loss": 0.0295, "step": 7002 }, { "epoch": 1.8779833735585947, "grad_norm": 0.7259532545691287, "learning_rate": 3.689335155517923e-06, "loss": 0.0219, "step": 7003 }, { "epoch": 1.878251541968356, "grad_norm": 0.24147129949229526, "learning_rate": 3.687829589283769e-06, "loss": 0.0335, "step": 7004 }, { "epoch": 1.8785197103781175, "grad_norm": 0.31874545322073783, "learning_rate": 3.686324150812165e-06, "loss": 0.0441, "step": 7005 }, { "epoch": 1.878787878787879, "grad_norm": 0.23483730295668898, "learning_rate": 3.6848188402496897e-06, "loss": 0.0244, "step": 7006 }, { "epoch": 1.87905604719764, "grad_norm": 0.2111676055028106, "learning_rate": 3.6833136577429142e-06, "loss": 0.0144, "step": 7007 }, { "epoch": 1.8793242156074015, "grad_norm": 0.26821349236064757, "learning_rate": 3.6818086034383904e-06, "loss": 0.0196, "step": 7008 }, { "epoch": 1.8795923840171627, "grad_norm": 0.3146965854347319, "learning_rate": 3.6803036774826633e-06, "loss": 0.0236, "step": 7009 }, { "epoch": 1.879860552426924, "grad_norm": 0.33443018022702936, "learning_rate": 3.678798880022264e-06, "loss": 0.0437, "step": 7010 }, { "epoch": 1.8801287208366855, "grad_norm": 0.452089724738561, "learning_rate": 3.677294211203708e-06, "loss": 0.026, "step": 7011 }, { "epoch": 1.8803968892464469, "grad_norm": 0.48043155991685377, "learning_rate": 3.675789671173504e-06, "loss": 0.0253, "step": 7012 }, { "epoch": 1.880665057656208, "grad_norm": 0.17162679145973483, "learning_rate": 3.674285260078145e-06, "loss": 0.0147, "step": 7013 }, { "epoch": 1.8809332260659695, "grad_norm": 0.20991801599544427, "learning_rate": 3.67278097806411e-06, "loss": 0.0198, "step": 7014 }, { "epoch": 1.8812013944757306, "grad_norm": 0.24309580096810576, "learning_rate": 3.6712768252778675e-06, "loss": 0.0177, "step": 7015 }, { "epoch": 1.881469562885492, "grad_norm": 0.2082945742775015, "learning_rate": 3.669772801865873e-06, "loss": 0.018, "step": 7016 }, { "epoch": 1.8817377312952535, "grad_norm": 0.2499281183632208, "learning_rate": 3.668268907974567e-06, "loss": 0.0201, "step": 7017 }, { "epoch": 1.8820058997050149, "grad_norm": 0.22758519362615992, "learning_rate": 3.6667651437503836e-06, "loss": 0.0234, "step": 7018 }, { "epoch": 1.882274068114776, "grad_norm": 0.2714685689520811, "learning_rate": 3.6652615093397414e-06, "loss": 0.0242, "step": 7019 }, { "epoch": 1.8825422365245374, "grad_norm": 0.21201332235644643, "learning_rate": 3.663758004889039e-06, "loss": 0.0142, "step": 7020 }, { "epoch": 1.8828104049342986, "grad_norm": 0.370982055867651, "learning_rate": 3.6622546305446714e-06, "loss": 0.0387, "step": 7021 }, { "epoch": 1.88307857334406, "grad_norm": 0.32092564669169865, "learning_rate": 3.6607513864530197e-06, "loss": 0.0189, "step": 7022 }, { "epoch": 1.8833467417538214, "grad_norm": 0.2446124574540049, "learning_rate": 3.6592482727604508e-06, "loss": 0.0238, "step": 7023 }, { "epoch": 1.8836149101635828, "grad_norm": 0.2457979077141817, "learning_rate": 3.657745289613317e-06, "loss": 0.0197, "step": 7024 }, { "epoch": 1.883883078573344, "grad_norm": 0.2129551794920912, "learning_rate": 3.656242437157961e-06, "loss": 0.0143, "step": 7025 }, { "epoch": 1.8841512469831054, "grad_norm": 0.20616959603395404, "learning_rate": 3.6547397155407104e-06, "loss": 0.0179, "step": 7026 }, { "epoch": 1.8844194153928666, "grad_norm": 0.3634533808171303, "learning_rate": 3.6532371249078834e-06, "loss": 0.0198, "step": 7027 }, { "epoch": 1.884687583802628, "grad_norm": 0.25091978318769564, "learning_rate": 3.6517346654057804e-06, "loss": 0.0191, "step": 7028 }, { "epoch": 1.8849557522123894, "grad_norm": 0.26318757034769874, "learning_rate": 3.6502323371806947e-06, "loss": 0.0174, "step": 7029 }, { "epoch": 1.8852239206221508, "grad_norm": 0.19227605880187817, "learning_rate": 3.6487301403789024e-06, "loss": 0.017, "step": 7030 }, { "epoch": 1.885492089031912, "grad_norm": 0.25021704873425865, "learning_rate": 3.647228075146671e-06, "loss": 0.0204, "step": 7031 }, { "epoch": 1.8857602574416734, "grad_norm": 0.2698683286019005, "learning_rate": 3.645726141630247e-06, "loss": 0.031, "step": 7032 }, { "epoch": 1.8860284258514346, "grad_norm": 0.2286795448611874, "learning_rate": 3.6442243399758748e-06, "loss": 0.0264, "step": 7033 }, { "epoch": 1.886296594261196, "grad_norm": 0.2954885386849269, "learning_rate": 3.6427226703297785e-06, "loss": 0.0328, "step": 7034 }, { "epoch": 1.8865647626709574, "grad_norm": 0.34084282901186963, "learning_rate": 3.641221132838173e-06, "loss": 0.0244, "step": 7035 }, { "epoch": 1.8868329310807188, "grad_norm": 0.33870234616780215, "learning_rate": 3.6397197276472574e-06, "loss": 0.0276, "step": 7036 }, { "epoch": 1.88710109949048, "grad_norm": 0.2178317180176426, "learning_rate": 3.638218454903221e-06, "loss": 0.0191, "step": 7037 }, { "epoch": 1.8873692679002414, "grad_norm": 0.2569043496159182, "learning_rate": 3.636717314752237e-06, "loss": 0.0293, "step": 7038 }, { "epoch": 1.8876374363100026, "grad_norm": 0.2465427753762059, "learning_rate": 3.6352163073404695e-06, "loss": 0.0178, "step": 7039 }, { "epoch": 1.887905604719764, "grad_norm": 0.3143826234022342, "learning_rate": 3.6337154328140665e-06, "loss": 0.0251, "step": 7040 }, { "epoch": 1.8881737731295254, "grad_norm": 0.2935050413340353, "learning_rate": 3.632214691319165e-06, "loss": 0.036, "step": 7041 }, { "epoch": 1.8884419415392868, "grad_norm": 0.23619078363731105, "learning_rate": 3.6307140830018874e-06, "loss": 0.0186, "step": 7042 }, { "epoch": 1.888710109949048, "grad_norm": 0.18216337134079702, "learning_rate": 3.629213608008345e-06, "loss": 0.0143, "step": 7043 }, { "epoch": 1.8889782783588094, "grad_norm": 0.2933549512345838, "learning_rate": 3.6277132664846338e-06, "loss": 0.0215, "step": 7044 }, { "epoch": 1.8892464467685706, "grad_norm": 0.32323796260027526, "learning_rate": 3.6262130585768377e-06, "loss": 0.024, "step": 7045 }, { "epoch": 1.889514615178332, "grad_norm": 0.3519092971036347, "learning_rate": 3.624712984431029e-06, "loss": 0.0277, "step": 7046 }, { "epoch": 1.8897827835880934, "grad_norm": 0.22728157075217575, "learning_rate": 3.623213044193266e-06, "loss": 0.0158, "step": 7047 }, { "epoch": 1.8900509519978548, "grad_norm": 0.23834144482277606, "learning_rate": 3.621713238009593e-06, "loss": 0.0181, "step": 7048 }, { "epoch": 1.890319120407616, "grad_norm": 0.27202791484022565, "learning_rate": 3.6202135660260424e-06, "loss": 0.0242, "step": 7049 }, { "epoch": 1.8905872888173774, "grad_norm": 0.28648044472606465, "learning_rate": 3.6187140283886346e-06, "loss": 0.0272, "step": 7050 }, { "epoch": 1.8908554572271385, "grad_norm": 0.21874665828440795, "learning_rate": 3.6172146252433736e-06, "loss": 0.0184, "step": 7051 }, { "epoch": 1.8911236256369, "grad_norm": 0.24518167430022472, "learning_rate": 3.6157153567362548e-06, "loss": 0.025, "step": 7052 }, { "epoch": 1.8913917940466614, "grad_norm": 0.22912996093580784, "learning_rate": 3.6142162230132554e-06, "loss": 0.024, "step": 7053 }, { "epoch": 1.8916599624564228, "grad_norm": 0.3031929104786631, "learning_rate": 3.6127172242203433e-06, "loss": 0.0154, "step": 7054 }, { "epoch": 1.891928130866184, "grad_norm": 0.24333704048729993, "learning_rate": 3.6112183605034734e-06, "loss": 0.0181, "step": 7055 }, { "epoch": 1.8921962992759453, "grad_norm": 0.26608979603609817, "learning_rate": 3.609719632008584e-06, "loss": 0.0177, "step": 7056 }, { "epoch": 1.8924644676857065, "grad_norm": 0.2819566043907426, "learning_rate": 3.608221038881602e-06, "loss": 0.0265, "step": 7057 }, { "epoch": 1.892732636095468, "grad_norm": 0.3278746292151866, "learning_rate": 3.6067225812684427e-06, "loss": 0.0224, "step": 7058 }, { "epoch": 1.8930008045052293, "grad_norm": 0.25462131932433885, "learning_rate": 3.605224259315005e-06, "loss": 0.0279, "step": 7059 }, { "epoch": 1.8932689729149907, "grad_norm": 0.24428452901546202, "learning_rate": 3.6037260731671797e-06, "loss": 0.0216, "step": 7060 }, { "epoch": 1.893537141324752, "grad_norm": 0.2336104719904052, "learning_rate": 3.6022280229708393e-06, "loss": 0.0187, "step": 7061 }, { "epoch": 1.893805309734513, "grad_norm": 0.25492555326375443, "learning_rate": 3.600730108871846e-06, "loss": 0.0252, "step": 7062 }, { "epoch": 1.8940734781442745, "grad_norm": 0.20146735109100866, "learning_rate": 3.599232331016046e-06, "loss": 0.0134, "step": 7063 }, { "epoch": 1.894341646554036, "grad_norm": 0.19210063985976, "learning_rate": 3.597734689549276e-06, "loss": 0.0153, "step": 7064 }, { "epoch": 1.8946098149637973, "grad_norm": 0.35925790424416315, "learning_rate": 3.5962371846173542e-06, "loss": 0.0238, "step": 7065 }, { "epoch": 1.8948779833735587, "grad_norm": 0.25860596809124786, "learning_rate": 3.594739816366093e-06, "loss": 0.0214, "step": 7066 }, { "epoch": 1.89514615178332, "grad_norm": 0.18475219782595212, "learning_rate": 3.593242584941286e-06, "loss": 0.0145, "step": 7067 }, { "epoch": 1.895414320193081, "grad_norm": 0.19765040423060004, "learning_rate": 3.591745490488712e-06, "loss": 0.013, "step": 7068 }, { "epoch": 1.8956824886028425, "grad_norm": 0.2685459568541738, "learning_rate": 3.59024853315414e-06, "loss": 0.0266, "step": 7069 }, { "epoch": 1.895950657012604, "grad_norm": 0.19851357402496145, "learning_rate": 3.588751713083326e-06, "loss": 0.0174, "step": 7070 }, { "epoch": 1.8962188254223653, "grad_norm": 0.3022403495807125, "learning_rate": 3.587255030422011e-06, "loss": 0.0292, "step": 7071 }, { "epoch": 1.8964869938321267, "grad_norm": 0.19896067677700732, "learning_rate": 3.5857584853159235e-06, "loss": 0.0182, "step": 7072 }, { "epoch": 1.896755162241888, "grad_norm": 0.3307489908240372, "learning_rate": 3.5842620779107774e-06, "loss": 0.0203, "step": 7073 }, { "epoch": 1.897023330651649, "grad_norm": 0.2472748316509056, "learning_rate": 3.5827658083522737e-06, "loss": 0.0243, "step": 7074 }, { "epoch": 1.8972914990614105, "grad_norm": 0.19046092220160468, "learning_rate": 3.5812696767861012e-06, "loss": 0.0148, "step": 7075 }, { "epoch": 1.8975596674711719, "grad_norm": 0.24328357820071297, "learning_rate": 3.5797736833579315e-06, "loss": 0.0289, "step": 7076 }, { "epoch": 1.8978278358809333, "grad_norm": 0.2859728910214367, "learning_rate": 3.578277828213429e-06, "loss": 0.0272, "step": 7077 }, { "epoch": 1.8980960042906947, "grad_norm": 0.24579901952801445, "learning_rate": 3.5767821114982402e-06, "loss": 0.0224, "step": 7078 }, { "epoch": 1.8983641727004559, "grad_norm": 0.36692602768068056, "learning_rate": 3.5752865333580007e-06, "loss": 0.0219, "step": 7079 }, { "epoch": 1.898632341110217, "grad_norm": 0.18376942084347292, "learning_rate": 3.573791093938326e-06, "loss": 0.0148, "step": 7080 }, { "epoch": 1.8989005095199785, "grad_norm": 0.20212165538605545, "learning_rate": 3.572295793384826e-06, "loss": 0.0218, "step": 7081 }, { "epoch": 1.8991686779297399, "grad_norm": 0.22785970856467372, "learning_rate": 3.5708006318430947e-06, "loss": 0.0182, "step": 7082 }, { "epoch": 1.8994368463395013, "grad_norm": 0.3583667876106721, "learning_rate": 3.569305609458712e-06, "loss": 0.0271, "step": 7083 }, { "epoch": 1.8997050147492627, "grad_norm": 0.22805695506938622, "learning_rate": 3.5678107263772428e-06, "loss": 0.0193, "step": 7084 }, { "epoch": 1.8999731831590239, "grad_norm": 0.20571963827933853, "learning_rate": 3.566315982744241e-06, "loss": 0.0194, "step": 7085 }, { "epoch": 1.900241351568785, "grad_norm": 0.27578171672999596, "learning_rate": 3.5648213787052443e-06, "loss": 0.0212, "step": 7086 }, { "epoch": 1.9005095199785464, "grad_norm": 0.2469544742118166, "learning_rate": 3.5633269144057816e-06, "loss": 0.0213, "step": 7087 }, { "epoch": 1.9007776883883079, "grad_norm": 0.23025679927357307, "learning_rate": 3.5618325899913617e-06, "loss": 0.0214, "step": 7088 }, { "epoch": 1.9010458567980693, "grad_norm": 0.24061436892342547, "learning_rate": 3.560338405607485e-06, "loss": 0.0233, "step": 7089 }, { "epoch": 1.9013140252078307, "grad_norm": 0.24674169746577665, "learning_rate": 3.558844361399636e-06, "loss": 0.0211, "step": 7090 }, { "epoch": 1.9015821936175918, "grad_norm": 0.2478521225693802, "learning_rate": 3.5573504575132856e-06, "loss": 0.0223, "step": 7091 }, { "epoch": 1.901850362027353, "grad_norm": 0.2501985639402288, "learning_rate": 3.5558566940938906e-06, "loss": 0.019, "step": 7092 }, { "epoch": 1.9021185304371144, "grad_norm": 0.23993871837771222, "learning_rate": 3.554363071286895e-06, "loss": 0.025, "step": 7093 }, { "epoch": 1.9023866988468758, "grad_norm": 0.3510888308825505, "learning_rate": 3.5528695892377287e-06, "loss": 0.0269, "step": 7094 }, { "epoch": 1.9026548672566372, "grad_norm": 0.31294959490497587, "learning_rate": 3.5513762480918084e-06, "loss": 0.0175, "step": 7095 }, { "epoch": 1.9029230356663986, "grad_norm": 0.2816340358035628, "learning_rate": 3.5498830479945367e-06, "loss": 0.0239, "step": 7096 }, { "epoch": 1.9031912040761598, "grad_norm": 0.17500505971658042, "learning_rate": 3.548389989091302e-06, "loss": 0.0166, "step": 7097 }, { "epoch": 1.903459372485921, "grad_norm": 0.29381925972440326, "learning_rate": 3.546897071527481e-06, "loss": 0.0223, "step": 7098 }, { "epoch": 1.9037275408956824, "grad_norm": 0.33782898747479334, "learning_rate": 3.5454042954484345e-06, "loss": 0.0202, "step": 7099 }, { "epoch": 1.9039957093054438, "grad_norm": 0.25456411633782733, "learning_rate": 3.5439116609995095e-06, "loss": 0.0244, "step": 7100 }, { "epoch": 1.9042638777152052, "grad_norm": 0.29473023530956094, "learning_rate": 3.54241916832604e-06, "loss": 0.0223, "step": 7101 }, { "epoch": 1.9045320461249666, "grad_norm": 0.27121348732917167, "learning_rate": 3.540926817573347e-06, "loss": 0.0314, "step": 7102 }, { "epoch": 1.9048002145347278, "grad_norm": 0.29320278577891506, "learning_rate": 3.539434608886737e-06, "loss": 0.0236, "step": 7103 }, { "epoch": 1.905068382944489, "grad_norm": 0.25023252016743847, "learning_rate": 3.5379425424115e-06, "loss": 0.0163, "step": 7104 }, { "epoch": 1.9053365513542504, "grad_norm": 0.23036393735880645, "learning_rate": 3.536450618292916e-06, "loss": 0.0204, "step": 7105 }, { "epoch": 1.9056047197640118, "grad_norm": 0.33599322456956554, "learning_rate": 3.5349588366762493e-06, "loss": 0.0275, "step": 7106 }, { "epoch": 1.9058728881737732, "grad_norm": 0.2749105394421017, "learning_rate": 3.5334671977067504e-06, "loss": 0.0206, "step": 7107 }, { "epoch": 1.9061410565835346, "grad_norm": 0.2873911744369586, "learning_rate": 3.531975701529658e-06, "loss": 0.019, "step": 7108 }, { "epoch": 1.9064092249932958, "grad_norm": 0.3030079154593814, "learning_rate": 3.5304843482901942e-06, "loss": 0.0234, "step": 7109 }, { "epoch": 1.906677393403057, "grad_norm": 0.24472037067753993, "learning_rate": 3.5289931381335672e-06, "loss": 0.0209, "step": 7110 }, { "epoch": 1.9069455618128184, "grad_norm": 0.32036583600200136, "learning_rate": 3.5275020712049736e-06, "loss": 0.0259, "step": 7111 }, { "epoch": 1.9072137302225798, "grad_norm": 0.2928462312796386, "learning_rate": 3.5260111476495935e-06, "loss": 0.0218, "step": 7112 }, { "epoch": 1.9074818986323412, "grad_norm": 0.21290740136466552, "learning_rate": 3.524520367612595e-06, "loss": 0.0163, "step": 7113 }, { "epoch": 1.9077500670421026, "grad_norm": 0.28203123533284463, "learning_rate": 3.523029731239129e-06, "loss": 0.0175, "step": 7114 }, { "epoch": 1.9080182354518638, "grad_norm": 0.3867072447200513, "learning_rate": 3.5215392386743385e-06, "loss": 0.0224, "step": 7115 }, { "epoch": 1.908286403861625, "grad_norm": 0.2224486449250359, "learning_rate": 3.52004889006335e-06, "loss": 0.0205, "step": 7116 }, { "epoch": 1.9085545722713864, "grad_norm": 0.22106275986429766, "learning_rate": 3.5185586855512677e-06, "loss": 0.0174, "step": 7117 }, { "epoch": 1.9088227406811478, "grad_norm": 0.25210774165645616, "learning_rate": 3.5170686252831942e-06, "loss": 0.0157, "step": 7118 }, { "epoch": 1.9090909090909092, "grad_norm": 0.3298744388389556, "learning_rate": 3.5155787094042113e-06, "loss": 0.0285, "step": 7119 }, { "epoch": 1.9093590775006706, "grad_norm": 0.2592173864842358, "learning_rate": 3.514088938059389e-06, "loss": 0.0246, "step": 7120 }, { "epoch": 1.9096272459104318, "grad_norm": 0.23042473043732348, "learning_rate": 3.5125993113937816e-06, "loss": 0.0195, "step": 7121 }, { "epoch": 1.909895414320193, "grad_norm": 0.23237998100882748, "learning_rate": 3.5111098295524303e-06, "loss": 0.018, "step": 7122 }, { "epoch": 1.9101635827299543, "grad_norm": 0.26747933204721447, "learning_rate": 3.5096204926803623e-06, "loss": 0.0253, "step": 7123 }, { "epoch": 1.9104317511397158, "grad_norm": 0.2990659677619925, "learning_rate": 3.5081313009225882e-06, "loss": 0.0205, "step": 7124 }, { "epoch": 1.9106999195494772, "grad_norm": 0.2168266502087794, "learning_rate": 3.5066422544241096e-06, "loss": 0.019, "step": 7125 }, { "epoch": 1.9109680879592386, "grad_norm": 0.2390449036304791, "learning_rate": 3.505153353329911e-06, "loss": 0.0226, "step": 7126 }, { "epoch": 1.9112362563689997, "grad_norm": 0.24053754421376564, "learning_rate": 3.5036645977849605e-06, "loss": 0.0226, "step": 7127 }, { "epoch": 1.911504424778761, "grad_norm": 0.3254712317406596, "learning_rate": 3.5021759879342167e-06, "loss": 0.0271, "step": 7128 }, { "epoch": 1.9117725931885223, "grad_norm": 0.1958839737995743, "learning_rate": 3.5006875239226196e-06, "loss": 0.0182, "step": 7129 }, { "epoch": 1.9120407615982837, "grad_norm": 0.2412095689354165, "learning_rate": 3.4991992058950973e-06, "loss": 0.02, "step": 7130 }, { "epoch": 1.9123089300080451, "grad_norm": 0.3022310793339784, "learning_rate": 3.497711033996564e-06, "loss": 0.0172, "step": 7131 }, { "epoch": 1.9125770984178065, "grad_norm": 0.2527084266574049, "learning_rate": 3.496223008371919e-06, "loss": 0.0149, "step": 7132 }, { "epoch": 1.9128452668275677, "grad_norm": 0.25208367710569185, "learning_rate": 3.494735129166047e-06, "loss": 0.0261, "step": 7133 }, { "epoch": 1.913113435237329, "grad_norm": 0.4008542264004503, "learning_rate": 3.4932473965238177e-06, "loss": 0.0267, "step": 7134 }, { "epoch": 1.9133816036470903, "grad_norm": 0.1965505746694254, "learning_rate": 3.491759810590091e-06, "loss": 0.0131, "step": 7135 }, { "epoch": 1.9136497720568517, "grad_norm": 0.38812085596641105, "learning_rate": 3.4902723715097064e-06, "loss": 0.0303, "step": 7136 }, { "epoch": 1.9139179404666131, "grad_norm": 0.33912552162151005, "learning_rate": 3.4887850794274934e-06, "loss": 0.0316, "step": 7137 }, { "epoch": 1.9141861088763745, "grad_norm": 0.26935010123984887, "learning_rate": 3.487297934488264e-06, "loss": 0.0218, "step": 7138 }, { "epoch": 1.9144542772861357, "grad_norm": 0.37203557446645347, "learning_rate": 3.48581093683682e-06, "loss": 0.0221, "step": 7139 }, { "epoch": 1.914722445695897, "grad_norm": 0.22235062689065008, "learning_rate": 3.484324086617945e-06, "loss": 0.018, "step": 7140 }, { "epoch": 1.9149906141056583, "grad_norm": 0.2279918453222769, "learning_rate": 3.4828373839764084e-06, "loss": 0.0177, "step": 7141 }, { "epoch": 1.9152587825154197, "grad_norm": 0.22661893251707593, "learning_rate": 3.4813508290569685e-06, "loss": 0.0208, "step": 7142 }, { "epoch": 1.915526950925181, "grad_norm": 0.27673828332223194, "learning_rate": 3.4798644220043663e-06, "loss": 0.0189, "step": 7143 }, { "epoch": 1.9157951193349425, "grad_norm": 0.3117087018257065, "learning_rate": 3.4783781629633295e-06, "loss": 0.0231, "step": 7144 }, { "epoch": 1.9160632877447037, "grad_norm": 0.2831047371764861, "learning_rate": 3.47689205207857e-06, "loss": 0.0254, "step": 7145 }, { "epoch": 1.9163314561544649, "grad_norm": 0.27731941056345627, "learning_rate": 3.475406089494788e-06, "loss": 0.0207, "step": 7146 }, { "epoch": 1.9165996245642263, "grad_norm": 0.32552537424055394, "learning_rate": 3.473920275356668e-06, "loss": 0.033, "step": 7147 }, { "epoch": 1.9168677929739877, "grad_norm": 0.32894881999074593, "learning_rate": 3.472434609808879e-06, "loss": 0.0243, "step": 7148 }, { "epoch": 1.917135961383749, "grad_norm": 0.17805837741423605, "learning_rate": 3.4709490929960763e-06, "loss": 0.0138, "step": 7149 }, { "epoch": 1.9174041297935103, "grad_norm": 0.3966218260711857, "learning_rate": 3.469463725062901e-06, "loss": 0.0372, "step": 7150 }, { "epoch": 1.9176722982032717, "grad_norm": 0.27974986981162187, "learning_rate": 3.4679785061539786e-06, "loss": 0.0245, "step": 7151 }, { "epoch": 1.9179404666130329, "grad_norm": 0.25413126519346585, "learning_rate": 3.466493436413924e-06, "loss": 0.0231, "step": 7152 }, { "epoch": 1.9182086350227943, "grad_norm": 0.21716194407860884, "learning_rate": 3.4650085159873303e-06, "loss": 0.0212, "step": 7153 }, { "epoch": 1.9184768034325557, "grad_norm": 0.26567614042245846, "learning_rate": 3.4635237450187825e-06, "loss": 0.0261, "step": 7154 }, { "epoch": 1.918744971842317, "grad_norm": 0.4032409840009527, "learning_rate": 3.462039123652847e-06, "loss": 0.0229, "step": 7155 }, { "epoch": 1.9190131402520783, "grad_norm": 0.2961964304892529, "learning_rate": 3.4605546520340797e-06, "loss": 0.0257, "step": 7156 }, { "epoch": 1.9192813086618397, "grad_norm": 0.24278901595220345, "learning_rate": 3.4590703303070196e-06, "loss": 0.0189, "step": 7157 }, { "epoch": 1.9195494770716008, "grad_norm": 0.2304473108782152, "learning_rate": 3.4575861586161895e-06, "loss": 0.0159, "step": 7158 }, { "epoch": 1.9198176454813622, "grad_norm": 0.18761957800106538, "learning_rate": 3.4561021371061e-06, "loss": 0.0142, "step": 7159 }, { "epoch": 1.9200858138911236, "grad_norm": 0.38824301838507186, "learning_rate": 3.454618265921247e-06, "loss": 0.0226, "step": 7160 }, { "epoch": 1.920353982300885, "grad_norm": 0.2276031532765234, "learning_rate": 3.4531345452061105e-06, "loss": 0.0224, "step": 7161 }, { "epoch": 1.9206221507106462, "grad_norm": 0.2540901591824478, "learning_rate": 3.4516509751051554e-06, "loss": 0.0215, "step": 7162 }, { "epoch": 1.9208903191204076, "grad_norm": 0.28058980858823557, "learning_rate": 3.4501675557628347e-06, "loss": 0.0237, "step": 7163 }, { "epoch": 1.9211584875301688, "grad_norm": 0.17620710019289818, "learning_rate": 3.448684287323587e-06, "loss": 0.0186, "step": 7164 }, { "epoch": 1.9214266559399302, "grad_norm": 0.2075668144541961, "learning_rate": 3.4472011699318274e-06, "loss": 0.0155, "step": 7165 }, { "epoch": 1.9216948243496916, "grad_norm": 0.29413577551278464, "learning_rate": 3.44571820373197e-06, "loss": 0.0223, "step": 7166 }, { "epoch": 1.921962992759453, "grad_norm": 0.2803808805642251, "learning_rate": 3.444235388868403e-06, "loss": 0.023, "step": 7167 }, { "epoch": 1.9222311611692142, "grad_norm": 0.3094673471903735, "learning_rate": 3.4427527254855064e-06, "loss": 0.019, "step": 7168 }, { "epoch": 1.9224993295789756, "grad_norm": 0.2814274784983791, "learning_rate": 3.4412702137276426e-06, "loss": 0.0157, "step": 7169 }, { "epoch": 1.9227674979887368, "grad_norm": 0.28985489314384305, "learning_rate": 3.43978785373916e-06, "loss": 0.0243, "step": 7170 }, { "epoch": 1.9230356663984982, "grad_norm": 0.24281668017009025, "learning_rate": 3.4383056456643917e-06, "loss": 0.0197, "step": 7171 }, { "epoch": 1.9233038348082596, "grad_norm": 0.2497423913142394, "learning_rate": 3.4368235896476547e-06, "loss": 0.0243, "step": 7172 }, { "epoch": 1.923572003218021, "grad_norm": 0.2671980177168584, "learning_rate": 3.4353416858332562e-06, "loss": 0.0283, "step": 7173 }, { "epoch": 1.9238401716277822, "grad_norm": 0.29430809419339904, "learning_rate": 3.4338599343654833e-06, "loss": 0.0264, "step": 7174 }, { "epoch": 1.9241083400375436, "grad_norm": 0.24272818818406977, "learning_rate": 3.432378335388611e-06, "loss": 0.0236, "step": 7175 }, { "epoch": 1.9243765084473048, "grad_norm": 0.30702434915932736, "learning_rate": 3.430896889046899e-06, "loss": 0.0238, "step": 7176 }, { "epoch": 1.9246446768570662, "grad_norm": 0.24330676953622418, "learning_rate": 3.42941559548459e-06, "loss": 0.0209, "step": 7177 }, { "epoch": 1.9249128452668276, "grad_norm": 0.24873215986675828, "learning_rate": 3.427934454845915e-06, "loss": 0.0232, "step": 7178 }, { "epoch": 1.925181013676589, "grad_norm": 0.24657372971640107, "learning_rate": 3.4264534672750884e-06, "loss": 0.0255, "step": 7179 }, { "epoch": 1.9254491820863502, "grad_norm": 0.2188551181062495, "learning_rate": 3.4249726329163097e-06, "loss": 0.0185, "step": 7180 }, { "epoch": 1.9257173504961116, "grad_norm": 0.32082288628458633, "learning_rate": 3.4234919519137644e-06, "loss": 0.0218, "step": 7181 }, { "epoch": 1.9259855189058728, "grad_norm": 0.2394315828054108, "learning_rate": 3.422011424411621e-06, "loss": 0.0281, "step": 7182 }, { "epoch": 1.9262536873156342, "grad_norm": 0.30388306820669747, "learning_rate": 3.420531050554036e-06, "loss": 0.0221, "step": 7183 }, { "epoch": 1.9265218557253956, "grad_norm": 0.2618962040986585, "learning_rate": 3.41905083048515e-06, "loss": 0.0274, "step": 7184 }, { "epoch": 1.926790024135157, "grad_norm": 0.227393532735514, "learning_rate": 3.417570764349087e-06, "loss": 0.018, "step": 7185 }, { "epoch": 1.9270581925449182, "grad_norm": 0.2678705892558985, "learning_rate": 3.4160908522899576e-06, "loss": 0.023, "step": 7186 }, { "epoch": 1.9273263609546796, "grad_norm": 0.3614885657950307, "learning_rate": 3.4146110944518566e-06, "loss": 0.0378, "step": 7187 }, { "epoch": 1.9275945293644408, "grad_norm": 0.2132148171696766, "learning_rate": 3.413131490978866e-06, "loss": 0.0209, "step": 7188 }, { "epoch": 1.9278626977742022, "grad_norm": 0.21854427691784456, "learning_rate": 3.411652042015047e-06, "loss": 0.014, "step": 7189 }, { "epoch": 1.9281308661839636, "grad_norm": 0.2494560387101364, "learning_rate": 3.4101727477044522e-06, "loss": 0.0202, "step": 7190 }, { "epoch": 1.928399034593725, "grad_norm": 0.24560861689224098, "learning_rate": 3.408693608191116e-06, "loss": 0.0187, "step": 7191 }, { "epoch": 1.9286672030034862, "grad_norm": 0.3342363343579039, "learning_rate": 3.4072146236190582e-06, "loss": 0.0204, "step": 7192 }, { "epoch": 1.9289353714132476, "grad_norm": 0.28751513464329487, "learning_rate": 3.4057357941322823e-06, "loss": 0.0222, "step": 7193 }, { "epoch": 1.9292035398230087, "grad_norm": 0.2585111833717884, "learning_rate": 3.404257119874781e-06, "loss": 0.0184, "step": 7194 }, { "epoch": 1.9294717082327701, "grad_norm": 0.22808487868361046, "learning_rate": 3.4027786009905272e-06, "loss": 0.0153, "step": 7195 }, { "epoch": 1.9297398766425315, "grad_norm": 0.19831924775167226, "learning_rate": 3.4013002376234805e-06, "loss": 0.0195, "step": 7196 }, { "epoch": 1.930008045052293, "grad_norm": 0.322005055277257, "learning_rate": 3.3998220299175855e-06, "loss": 0.0258, "step": 7197 }, { "epoch": 1.9302762134620541, "grad_norm": 0.23385855106823822, "learning_rate": 3.39834397801677e-06, "loss": 0.0288, "step": 7198 }, { "epoch": 1.9305443818718155, "grad_norm": 0.41744507805904746, "learning_rate": 3.3968660820649492e-06, "loss": 0.0648, "step": 7199 }, { "epoch": 1.9308125502815767, "grad_norm": 0.311327903330626, "learning_rate": 3.3953883422060247e-06, "loss": 0.0218, "step": 7200 }, { "epoch": 1.9310807186913381, "grad_norm": 0.27739334145761296, "learning_rate": 3.3939107585838747e-06, "loss": 0.0338, "step": 7201 }, { "epoch": 1.9313488871010995, "grad_norm": 0.2541434844328274, "learning_rate": 3.39243333134237e-06, "loss": 0.023, "step": 7202 }, { "epoch": 1.931617055510861, "grad_norm": 0.23381246788016055, "learning_rate": 3.3909560606253632e-06, "loss": 0.0172, "step": 7203 }, { "epoch": 1.9318852239206221, "grad_norm": 0.2968184767314714, "learning_rate": 3.3894789465766935e-06, "loss": 0.0261, "step": 7204 }, { "epoch": 1.9321533923303835, "grad_norm": 0.2658141494372896, "learning_rate": 3.388001989340183e-06, "loss": 0.0295, "step": 7205 }, { "epoch": 1.9324215607401447, "grad_norm": 0.2074266215311971, "learning_rate": 3.3865251890596395e-06, "loss": 0.0162, "step": 7206 }, { "epoch": 1.932689729149906, "grad_norm": 0.22734488754899393, "learning_rate": 3.385048545878854e-06, "loss": 0.0206, "step": 7207 }, { "epoch": 1.9329578975596675, "grad_norm": 0.24505164404169855, "learning_rate": 3.3835720599416042e-06, "loss": 0.0246, "step": 7208 }, { "epoch": 1.933226065969429, "grad_norm": 0.29462017521221207, "learning_rate": 3.3820957313916516e-06, "loss": 0.0245, "step": 7209 }, { "epoch": 1.93349423437919, "grad_norm": 0.30073703615901964, "learning_rate": 3.380619560372741e-06, "loss": 0.0193, "step": 7210 }, { "epoch": 1.9337624027889515, "grad_norm": 0.29354234197600404, "learning_rate": 3.379143547028605e-06, "loss": 0.022, "step": 7211 }, { "epoch": 1.9340305711987127, "grad_norm": 0.20948275570957947, "learning_rate": 3.3776676915029615e-06, "loss": 0.0168, "step": 7212 }, { "epoch": 1.934298739608474, "grad_norm": 0.2468844181637911, "learning_rate": 3.376191993939504e-06, "loss": 0.0243, "step": 7213 }, { "epoch": 1.9345669080182355, "grad_norm": 0.18626674621039455, "learning_rate": 3.374716454481922e-06, "loss": 0.0175, "step": 7214 }, { "epoch": 1.934835076427997, "grad_norm": 0.24787908005964948, "learning_rate": 3.3732410732738843e-06, "loss": 0.0187, "step": 7215 }, { "epoch": 1.935103244837758, "grad_norm": 0.36552051754551834, "learning_rate": 3.371765850459043e-06, "loss": 0.0342, "step": 7216 }, { "epoch": 1.9353714132475195, "grad_norm": 0.26048597673486495, "learning_rate": 3.3702907861810393e-06, "loss": 0.0251, "step": 7217 }, { "epoch": 1.9356395816572807, "grad_norm": 0.22184261423279286, "learning_rate": 3.368815880583494e-06, "loss": 0.026, "step": 7218 }, { "epoch": 1.935907750067042, "grad_norm": 0.17910308693188762, "learning_rate": 3.3673411338100163e-06, "loss": 0.0135, "step": 7219 }, { "epoch": 1.9361759184768035, "grad_norm": 0.29436803644453036, "learning_rate": 3.365866546004197e-06, "loss": 0.0275, "step": 7220 }, { "epoch": 1.9364440868865649, "grad_norm": 0.2113852297753171, "learning_rate": 3.364392117309614e-06, "loss": 0.0221, "step": 7221 }, { "epoch": 1.936712255296326, "grad_norm": 0.2139667079589505, "learning_rate": 3.3629178478698284e-06, "loss": 0.0116, "step": 7222 }, { "epoch": 1.9369804237060875, "grad_norm": 0.24418555802654415, "learning_rate": 3.3614437378283854e-06, "loss": 0.0256, "step": 7223 }, { "epoch": 1.9372485921158487, "grad_norm": 0.2961938992212667, "learning_rate": 3.3599697873288163e-06, "loss": 0.0288, "step": 7224 }, { "epoch": 1.93751676052561, "grad_norm": 0.21618108159324634, "learning_rate": 3.358495996514634e-06, "loss": 0.0224, "step": 7225 }, { "epoch": 1.9377849289353715, "grad_norm": 0.3339499853342691, "learning_rate": 3.3570223655293388e-06, "loss": 0.0227, "step": 7226 }, { "epoch": 1.9380530973451329, "grad_norm": 0.23482922100773249, "learning_rate": 3.3555488945164127e-06, "loss": 0.0189, "step": 7227 }, { "epoch": 1.938321265754894, "grad_norm": 0.21011744169434862, "learning_rate": 3.354075583619326e-06, "loss": 0.0171, "step": 7228 }, { "epoch": 1.9385894341646555, "grad_norm": 0.21357533378241214, "learning_rate": 3.3526024329815293e-06, "loss": 0.0237, "step": 7229 }, { "epoch": 1.9388576025744166, "grad_norm": 0.33637065603835076, "learning_rate": 3.351129442746459e-06, "loss": 0.0237, "step": 7230 }, { "epoch": 1.939125770984178, "grad_norm": 0.3036592755976574, "learning_rate": 3.349656613057538e-06, "loss": 0.0303, "step": 7231 }, { "epoch": 1.9393939393939394, "grad_norm": 0.24233432752626646, "learning_rate": 3.3481839440581714e-06, "loss": 0.0164, "step": 7232 }, { "epoch": 1.9396621078037009, "grad_norm": 0.22201470923065794, "learning_rate": 3.3467114358917484e-06, "loss": 0.0159, "step": 7233 }, { "epoch": 1.939930276213462, "grad_norm": 0.24328650220008333, "learning_rate": 3.345239088701644e-06, "loss": 0.0177, "step": 7234 }, { "epoch": 1.9401984446232234, "grad_norm": 0.19824929278595427, "learning_rate": 3.343766902631216e-06, "loss": 0.02, "step": 7235 }, { "epoch": 1.9404666130329846, "grad_norm": 0.28296271181276195, "learning_rate": 3.3422948778238074e-06, "loss": 0.0251, "step": 7236 }, { "epoch": 1.940734781442746, "grad_norm": 1.7159774157584902, "learning_rate": 3.340823014422746e-06, "loss": 0.0228, "step": 7237 }, { "epoch": 1.9410029498525074, "grad_norm": 0.4766270752010277, "learning_rate": 3.339351312571342e-06, "loss": 0.0188, "step": 7238 }, { "epoch": 1.9412711182622688, "grad_norm": 0.6164525604350836, "learning_rate": 3.337879772412892e-06, "loss": 0.0278, "step": 7239 }, { "epoch": 1.94153928667203, "grad_norm": 0.24052239713881798, "learning_rate": 3.3364083940906754e-06, "loss": 0.0219, "step": 7240 }, { "epoch": 1.9418074550817912, "grad_norm": 0.20742263430130703, "learning_rate": 3.3349371777479556e-06, "loss": 0.0189, "step": 7241 }, { "epoch": 1.9420756234915526, "grad_norm": 0.28813426158630956, "learning_rate": 3.3334661235279836e-06, "loss": 0.0189, "step": 7242 }, { "epoch": 1.942343791901314, "grad_norm": 0.2601999809015, "learning_rate": 3.33199523157399e-06, "loss": 0.0246, "step": 7243 }, { "epoch": 1.9426119603110754, "grad_norm": 0.45000488648307824, "learning_rate": 3.3305245020291933e-06, "loss": 0.021, "step": 7244 }, { "epoch": 1.9428801287208368, "grad_norm": 0.2791839331735888, "learning_rate": 3.3290539350367925e-06, "loss": 0.0238, "step": 7245 }, { "epoch": 1.943148297130598, "grad_norm": 0.936226692962895, "learning_rate": 3.327583530739974e-06, "loss": 0.027, "step": 7246 }, { "epoch": 1.9434164655403592, "grad_norm": 0.23492671749295022, "learning_rate": 3.3261132892819057e-06, "loss": 0.0204, "step": 7247 }, { "epoch": 1.9436846339501206, "grad_norm": 0.27128458575834713, "learning_rate": 3.3246432108057436e-06, "loss": 0.0245, "step": 7248 }, { "epoch": 1.943952802359882, "grad_norm": 0.6157341757689855, "learning_rate": 3.323173295454626e-06, "loss": 0.0285, "step": 7249 }, { "epoch": 1.9442209707696434, "grad_norm": 0.28910675168210076, "learning_rate": 3.321703543371671e-06, "loss": 0.0163, "step": 7250 }, { "epoch": 1.9444891391794048, "grad_norm": 0.3279579902447013, "learning_rate": 3.320233954699985e-06, "loss": 0.0241, "step": 7251 }, { "epoch": 1.944757307589166, "grad_norm": 0.3275048533410263, "learning_rate": 3.31876452958266e-06, "loss": 0.0211, "step": 7252 }, { "epoch": 1.9450254759989272, "grad_norm": 0.3711006868317802, "learning_rate": 3.317295268162769e-06, "loss": 0.0229, "step": 7253 }, { "epoch": 1.9452936444086886, "grad_norm": 0.2550440119806674, "learning_rate": 3.3158261705833706e-06, "loss": 0.0244, "step": 7254 }, { "epoch": 1.94556181281845, "grad_norm": 0.225557914235331, "learning_rate": 3.3143572369875064e-06, "loss": 0.0231, "step": 7255 }, { "epoch": 1.9458299812282114, "grad_norm": 0.23567582128649975, "learning_rate": 3.312888467518203e-06, "loss": 0.0241, "step": 7256 }, { "epoch": 1.9460981496379728, "grad_norm": 0.3077048960104107, "learning_rate": 3.31141986231847e-06, "loss": 0.0225, "step": 7257 }, { "epoch": 1.946366318047734, "grad_norm": 0.4478250638408225, "learning_rate": 3.309951421531301e-06, "loss": 0.0284, "step": 7258 }, { "epoch": 1.9466344864574952, "grad_norm": 0.25523355166992107, "learning_rate": 3.3084831452996768e-06, "loss": 0.0165, "step": 7259 }, { "epoch": 1.9469026548672566, "grad_norm": 0.1947316463659574, "learning_rate": 3.3070150337665573e-06, "loss": 0.019, "step": 7260 }, { "epoch": 1.947170823277018, "grad_norm": 0.24770198588914882, "learning_rate": 3.3055470870748898e-06, "loss": 0.02, "step": 7261 }, { "epoch": 1.9474389916867794, "grad_norm": 0.2500610610571737, "learning_rate": 3.3040793053676035e-06, "loss": 0.0196, "step": 7262 }, { "epoch": 1.9477071600965408, "grad_norm": 0.29429674818239904, "learning_rate": 3.302611688787612e-06, "loss": 0.0238, "step": 7263 }, { "epoch": 1.947975328506302, "grad_norm": 0.3142482582260968, "learning_rate": 3.3011442374778137e-06, "loss": 0.0225, "step": 7264 }, { "epoch": 1.9482434969160631, "grad_norm": 0.26214611116363046, "learning_rate": 3.299676951581091e-06, "loss": 0.0162, "step": 7265 }, { "epoch": 1.9485116653258245, "grad_norm": 0.25188292884309677, "learning_rate": 3.2982098312403094e-06, "loss": 0.0237, "step": 7266 }, { "epoch": 1.948779833735586, "grad_norm": 0.27074429397873917, "learning_rate": 3.2967428765983185e-06, "loss": 0.0157, "step": 7267 }, { "epoch": 1.9490480021453473, "grad_norm": 0.267728122834913, "learning_rate": 3.29527608779795e-06, "loss": 0.0209, "step": 7268 }, { "epoch": 1.9493161705551088, "grad_norm": 0.2827254870826069, "learning_rate": 3.293809464982024e-06, "loss": 0.0176, "step": 7269 }, { "epoch": 1.94958433896487, "grad_norm": 0.22768053765300514, "learning_rate": 3.292343008293341e-06, "loss": 0.0177, "step": 7270 }, { "epoch": 1.9498525073746311, "grad_norm": 0.19869803234542954, "learning_rate": 3.290876717874686e-06, "loss": 0.0157, "step": 7271 }, { "epoch": 1.9501206757843925, "grad_norm": 0.23543443144286302, "learning_rate": 3.289410593868826e-06, "loss": 0.0193, "step": 7272 }, { "epoch": 1.950388844194154, "grad_norm": 0.26468877854466416, "learning_rate": 3.287944636418517e-06, "loss": 0.0262, "step": 7273 }, { "epoch": 1.9506570126039153, "grad_norm": 1.3406117055371682, "learning_rate": 3.286478845666492e-06, "loss": 0.0204, "step": 7274 }, { "epoch": 1.9509251810136767, "grad_norm": 0.22614158940232104, "learning_rate": 3.285013221755472e-06, "loss": 0.0207, "step": 7275 }, { "epoch": 1.951193349423438, "grad_norm": 0.28261212982529965, "learning_rate": 3.2835477648281624e-06, "loss": 0.026, "step": 7276 }, { "epoch": 1.951461517833199, "grad_norm": 0.19577555641572983, "learning_rate": 3.282082475027249e-06, "loss": 0.016, "step": 7277 }, { "epoch": 1.9517296862429605, "grad_norm": 0.21446217015446803, "learning_rate": 3.280617352495403e-06, "loss": 0.0245, "step": 7278 }, { "epoch": 1.951997854652722, "grad_norm": 0.5695721712053765, "learning_rate": 3.279152397375282e-06, "loss": 0.031, "step": 7279 }, { "epoch": 1.9522660230624833, "grad_norm": 0.22887071708210222, "learning_rate": 3.277687609809523e-06, "loss": 0.0242, "step": 7280 }, { "epoch": 1.9525341914722447, "grad_norm": 0.17226352459104693, "learning_rate": 3.2762229899407494e-06, "loss": 0.0153, "step": 7281 }, { "epoch": 1.952802359882006, "grad_norm": 1.95857696256105, "learning_rate": 3.274758537911566e-06, "loss": 0.0288, "step": 7282 }, { "epoch": 1.953070528291767, "grad_norm": 0.23704344986946801, "learning_rate": 3.273294253864564e-06, "loss": 0.0228, "step": 7283 }, { "epoch": 1.9533386967015285, "grad_norm": 0.2599262793136385, "learning_rate": 3.271830137942316e-06, "loss": 0.0227, "step": 7284 }, { "epoch": 1.95360686511129, "grad_norm": 0.2406656249609967, "learning_rate": 3.2703661902873797e-06, "loss": 0.0226, "step": 7285 }, { "epoch": 1.9538750335210513, "grad_norm": 0.2951081278937298, "learning_rate": 3.2689024110422946e-06, "loss": 0.0237, "step": 7286 }, { "epoch": 1.9541432019308127, "grad_norm": 0.2986252436390067, "learning_rate": 3.267438800349586e-06, "loss": 0.0189, "step": 7287 }, { "epoch": 1.9544113703405739, "grad_norm": 0.2089620551078566, "learning_rate": 3.2659753583517613e-06, "loss": 0.0153, "step": 7288 }, { "epoch": 1.954679538750335, "grad_norm": 0.2374683369847728, "learning_rate": 3.2645120851913116e-06, "loss": 0.0163, "step": 7289 }, { "epoch": 1.9549477071600965, "grad_norm": 0.2110732831376364, "learning_rate": 3.2630489810107125e-06, "loss": 0.0194, "step": 7290 }, { "epoch": 1.9552158755698579, "grad_norm": 0.3688689667573004, "learning_rate": 3.261586045952424e-06, "loss": 0.0284, "step": 7291 }, { "epoch": 1.9554840439796193, "grad_norm": 0.24806684077869232, "learning_rate": 3.2601232801588857e-06, "loss": 0.0217, "step": 7292 }, { "epoch": 1.9557522123893807, "grad_norm": 0.2438056003496756, "learning_rate": 3.258660683772525e-06, "loss": 0.0159, "step": 7293 }, { "epoch": 1.9560203807991419, "grad_norm": 0.4361051095382746, "learning_rate": 3.25719825693575e-06, "loss": 0.0209, "step": 7294 }, { "epoch": 1.956288549208903, "grad_norm": 0.21364374597346794, "learning_rate": 3.255735999790953e-06, "loss": 0.0175, "step": 7295 }, { "epoch": 1.9565567176186645, "grad_norm": 0.3588876401576768, "learning_rate": 3.2542739124805113e-06, "loss": 0.022, "step": 7296 }, { "epoch": 1.9568248860284259, "grad_norm": 0.2799114898841924, "learning_rate": 3.252811995146786e-06, "loss": 0.0203, "step": 7297 }, { "epoch": 1.9570930544381873, "grad_norm": 0.2814971297534194, "learning_rate": 3.251350247932117e-06, "loss": 0.0265, "step": 7298 }, { "epoch": 1.9573612228479487, "grad_norm": 0.24009168386660548, "learning_rate": 3.2498886709788298e-06, "loss": 0.0203, "step": 7299 }, { "epoch": 1.9576293912577098, "grad_norm": 0.17919401403269225, "learning_rate": 3.248427264429237e-06, "loss": 0.015, "step": 7300 }, { "epoch": 1.957897559667471, "grad_norm": 0.3011371564775238, "learning_rate": 3.246966028425632e-06, "loss": 0.0242, "step": 7301 }, { "epoch": 1.9581657280772324, "grad_norm": 0.2544898288661922, "learning_rate": 3.2455049631102896e-06, "loss": 0.0187, "step": 7302 }, { "epoch": 1.9584338964869938, "grad_norm": 0.21456869475937265, "learning_rate": 3.244044068625472e-06, "loss": 0.0187, "step": 7303 }, { "epoch": 1.9587020648967552, "grad_norm": 0.26101091975300716, "learning_rate": 3.242583345113421e-06, "loss": 0.0209, "step": 7304 }, { "epoch": 1.9589702333065167, "grad_norm": 0.2598864629403866, "learning_rate": 3.2411227927163636e-06, "loss": 0.0202, "step": 7305 }, { "epoch": 1.9592384017162778, "grad_norm": 0.2660274408710632, "learning_rate": 3.2396624115765095e-06, "loss": 0.02, "step": 7306 }, { "epoch": 1.959506570126039, "grad_norm": 0.2783187593611475, "learning_rate": 3.2382022018360537e-06, "loss": 0.0306, "step": 7307 }, { "epoch": 1.9597747385358004, "grad_norm": 0.26086595271569946, "learning_rate": 3.2367421636371717e-06, "loss": 0.0237, "step": 7308 }, { "epoch": 1.9600429069455618, "grad_norm": 0.19809928884422928, "learning_rate": 3.2352822971220255e-06, "loss": 0.0136, "step": 7309 }, { "epoch": 1.9603110753553232, "grad_norm": 0.272623021485699, "learning_rate": 3.2338226024327545e-06, "loss": 0.0284, "step": 7310 }, { "epoch": 1.9605792437650846, "grad_norm": 0.19195028280333934, "learning_rate": 3.2323630797114892e-06, "loss": 0.0163, "step": 7311 }, { "epoch": 1.9608474121748458, "grad_norm": 0.2516825566785902, "learning_rate": 3.2309037291003363e-06, "loss": 0.0191, "step": 7312 }, { "epoch": 1.961115580584607, "grad_norm": 0.20860736884877254, "learning_rate": 3.229444550741392e-06, "loss": 0.0164, "step": 7313 }, { "epoch": 1.9613837489943684, "grad_norm": 0.2018496953655276, "learning_rate": 3.22798554477673e-06, "loss": 0.017, "step": 7314 }, { "epoch": 1.9616519174041298, "grad_norm": 0.18254981433131096, "learning_rate": 3.2265267113484113e-06, "loss": 0.017, "step": 7315 }, { "epoch": 1.9619200858138912, "grad_norm": 0.22958193195765997, "learning_rate": 3.2250680505984773e-06, "loss": 0.0167, "step": 7316 }, { "epoch": 1.9621882542236526, "grad_norm": 0.3633120850043984, "learning_rate": 3.223609562668956e-06, "loss": 0.0234, "step": 7317 }, { "epoch": 1.9624564226334138, "grad_norm": 0.254500331645394, "learning_rate": 3.2221512477018546e-06, "loss": 0.0169, "step": 7318 }, { "epoch": 1.962724591043175, "grad_norm": 0.24183995972691102, "learning_rate": 3.2206931058391668e-06, "loss": 0.0248, "step": 7319 }, { "epoch": 1.9629927594529364, "grad_norm": 0.22707204737752154, "learning_rate": 3.219235137222867e-06, "loss": 0.0163, "step": 7320 }, { "epoch": 1.9632609278626978, "grad_norm": 0.2720860932807066, "learning_rate": 3.217777341994915e-06, "loss": 0.0287, "step": 7321 }, { "epoch": 1.9635290962724592, "grad_norm": 0.22002086700853982, "learning_rate": 3.2163197202972505e-06, "loss": 0.0196, "step": 7322 }, { "epoch": 1.9637972646822206, "grad_norm": 0.22239090285014979, "learning_rate": 3.214862272271799e-06, "loss": 0.0151, "step": 7323 }, { "epoch": 1.9640654330919818, "grad_norm": 0.20407577512504013, "learning_rate": 3.21340499806047e-06, "loss": 0.0185, "step": 7324 }, { "epoch": 1.964333601501743, "grad_norm": 0.20539263802490418, "learning_rate": 3.211947897805151e-06, "loss": 0.0163, "step": 7325 }, { "epoch": 1.9646017699115044, "grad_norm": 0.24540685676415408, "learning_rate": 3.2104909716477184e-06, "loss": 0.0199, "step": 7326 }, { "epoch": 1.9648699383212658, "grad_norm": 0.23768470831320426, "learning_rate": 3.2090342197300296e-06, "loss": 0.0179, "step": 7327 }, { "epoch": 1.9651381067310272, "grad_norm": 0.22811968805251903, "learning_rate": 3.2075776421939235e-06, "loss": 0.0231, "step": 7328 }, { "epoch": 1.9654062751407884, "grad_norm": 0.35688940079785403, "learning_rate": 3.2061212391812237e-06, "loss": 0.0299, "step": 7329 }, { "epoch": 1.9656744435505498, "grad_norm": 0.25489483982162836, "learning_rate": 3.204665010833736e-06, "loss": 0.0188, "step": 7330 }, { "epoch": 1.965942611960311, "grad_norm": 0.3011087616456042, "learning_rate": 3.2032089572932507e-06, "loss": 0.0205, "step": 7331 }, { "epoch": 1.9662107803700724, "grad_norm": 0.22885799414865518, "learning_rate": 3.2017530787015383e-06, "loss": 0.024, "step": 7332 }, { "epoch": 1.9664789487798338, "grad_norm": 0.378775383877279, "learning_rate": 3.200297375200355e-06, "loss": 0.0157, "step": 7333 }, { "epoch": 1.9667471171895952, "grad_norm": 0.17746184834781112, "learning_rate": 3.1988418469314374e-06, "loss": 0.0129, "step": 7334 }, { "epoch": 1.9670152855993563, "grad_norm": 0.2993150294588135, "learning_rate": 3.1973864940365076e-06, "loss": 0.034, "step": 7335 }, { "epoch": 1.9672834540091177, "grad_norm": 0.26434196326787734, "learning_rate": 3.195931316657268e-06, "loss": 0.0196, "step": 7336 }, { "epoch": 1.967551622418879, "grad_norm": 0.20504594226240752, "learning_rate": 3.1944763149354064e-06, "loss": 0.0192, "step": 7337 }, { "epoch": 1.9678197908286403, "grad_norm": 0.2653934118817414, "learning_rate": 3.1930214890125925e-06, "loss": 0.0203, "step": 7338 }, { "epoch": 1.9680879592384017, "grad_norm": 0.24862668972346125, "learning_rate": 3.1915668390304794e-06, "loss": 0.0202, "step": 7339 }, { "epoch": 1.9683561276481631, "grad_norm": 0.22975807044828392, "learning_rate": 3.190112365130702e-06, "loss": 0.019, "step": 7340 }, { "epoch": 1.9686242960579243, "grad_norm": 0.18577627817299944, "learning_rate": 3.1886580674548784e-06, "loss": 0.0165, "step": 7341 }, { "epoch": 1.9688924644676857, "grad_norm": 0.3053345260670494, "learning_rate": 3.187203946144609e-06, "loss": 0.0186, "step": 7342 }, { "epoch": 1.969160632877447, "grad_norm": 0.6893799695770465, "learning_rate": 3.1857500013414782e-06, "loss": 0.0313, "step": 7343 }, { "epoch": 1.9694288012872083, "grad_norm": 0.27379994105932004, "learning_rate": 3.184296233187054e-06, "loss": 0.022, "step": 7344 }, { "epoch": 1.9696969696969697, "grad_norm": 0.3314645121203494, "learning_rate": 3.182842641822887e-06, "loss": 0.0328, "step": 7345 }, { "epoch": 1.9699651381067311, "grad_norm": 0.18876612622861563, "learning_rate": 3.1813892273905056e-06, "loss": 0.0149, "step": 7346 }, { "epoch": 1.9702333065164923, "grad_norm": 0.32912056657911964, "learning_rate": 3.179935990031425e-06, "loss": 0.0257, "step": 7347 }, { "epoch": 1.9705014749262537, "grad_norm": 0.21852652622625018, "learning_rate": 3.1784829298871467e-06, "loss": 0.0186, "step": 7348 }, { "epoch": 1.970769643336015, "grad_norm": 0.33535109451713574, "learning_rate": 3.1770300470991487e-06, "loss": 0.0231, "step": 7349 }, { "epoch": 1.9710378117457763, "grad_norm": 0.2788020002932741, "learning_rate": 3.1755773418088966e-06, "loss": 0.0232, "step": 7350 }, { "epoch": 1.9713059801555377, "grad_norm": 0.20622793888609375, "learning_rate": 3.1741248141578334e-06, "loss": 0.016, "step": 7351 }, { "epoch": 1.9715741485652991, "grad_norm": 0.2550309528468716, "learning_rate": 3.17267246428739e-06, "loss": 0.0166, "step": 7352 }, { "epoch": 1.9718423169750603, "grad_norm": 0.2029493381118636, "learning_rate": 3.171220292338978e-06, "loss": 0.0178, "step": 7353 }, { "epoch": 1.9721104853848217, "grad_norm": 0.3143055064180271, "learning_rate": 3.169768298453989e-06, "loss": 0.0174, "step": 7354 }, { "epoch": 1.9723786537945829, "grad_norm": 0.24823623314562687, "learning_rate": 3.1683164827738024e-06, "loss": 0.0215, "step": 7355 }, { "epoch": 1.9726468222043443, "grad_norm": 0.2916915396643583, "learning_rate": 3.1668648454397778e-06, "loss": 0.0254, "step": 7356 }, { "epoch": 1.9729149906141057, "grad_norm": 0.24807135993276522, "learning_rate": 3.165413386593256e-06, "loss": 0.0206, "step": 7357 }, { "epoch": 1.973183159023867, "grad_norm": 0.38789111520190983, "learning_rate": 3.1639621063755622e-06, "loss": 0.0429, "step": 7358 }, { "epoch": 1.9734513274336283, "grad_norm": 0.21587589980216482, "learning_rate": 3.162511004928003e-06, "loss": 0.015, "step": 7359 }, { "epoch": 1.9737194958433897, "grad_norm": 0.21760216320504983, "learning_rate": 3.161060082391869e-06, "loss": 0.0201, "step": 7360 }, { "epoch": 1.9739876642531509, "grad_norm": 0.24406782437877078, "learning_rate": 3.1596093389084327e-06, "loss": 0.0203, "step": 7361 }, { "epoch": 1.9742558326629123, "grad_norm": 0.22483130290143524, "learning_rate": 3.158158774618948e-06, "loss": 0.0163, "step": 7362 }, { "epoch": 1.9745240010726737, "grad_norm": 0.36283461802211503, "learning_rate": 3.1567083896646545e-06, "loss": 0.0239, "step": 7363 }, { "epoch": 1.974792169482435, "grad_norm": 0.27661984566535774, "learning_rate": 3.1552581841867695e-06, "loss": 0.0271, "step": 7364 }, { "epoch": 1.9750603378921963, "grad_norm": 0.21704552032261018, "learning_rate": 3.153808158326499e-06, "loss": 0.0177, "step": 7365 }, { "epoch": 1.9753285063019577, "grad_norm": 0.32349575102684636, "learning_rate": 3.1523583122250262e-06, "loss": 0.0265, "step": 7366 }, { "epoch": 1.9755966747117188, "grad_norm": 0.2065823253197649, "learning_rate": 3.1509086460235196e-06, "loss": 0.0146, "step": 7367 }, { "epoch": 1.9758648431214803, "grad_norm": 0.2576949465385535, "learning_rate": 3.149459159863128e-06, "loss": 0.023, "step": 7368 }, { "epoch": 1.9761330115312417, "grad_norm": 0.23996351615262054, "learning_rate": 3.1480098538849858e-06, "loss": 0.0261, "step": 7369 }, { "epoch": 1.976401179941003, "grad_norm": 0.2528034941796443, "learning_rate": 3.146560728230208e-06, "loss": 0.0269, "step": 7370 }, { "epoch": 1.9766693483507642, "grad_norm": 0.26831565550418635, "learning_rate": 3.1451117830398896e-06, "loss": 0.0272, "step": 7371 }, { "epoch": 1.9769375167605256, "grad_norm": 0.3226853905969348, "learning_rate": 3.1436630184551133e-06, "loss": 0.0162, "step": 7372 }, { "epoch": 1.9772056851702868, "grad_norm": 0.2878301407997294, "learning_rate": 3.1422144346169392e-06, "loss": 0.0291, "step": 7373 }, { "epoch": 1.9774738535800482, "grad_norm": 0.23692783745681056, "learning_rate": 3.1407660316664135e-06, "loss": 0.02, "step": 7374 }, { "epoch": 1.9777420219898096, "grad_norm": 0.26298198501097647, "learning_rate": 3.139317809744563e-06, "loss": 0.0235, "step": 7375 }, { "epoch": 1.978010190399571, "grad_norm": 0.2416055364972395, "learning_rate": 3.1378697689923987e-06, "loss": 0.0198, "step": 7376 }, { "epoch": 1.9782783588093322, "grad_norm": 0.2944780137913131, "learning_rate": 3.1364219095509097e-06, "loss": 0.0375, "step": 7377 }, { "epoch": 1.9785465272190936, "grad_norm": 0.2031424635839962, "learning_rate": 3.1349742315610728e-06, "loss": 0.0217, "step": 7378 }, { "epoch": 1.9788146956288548, "grad_norm": 0.22271753917188494, "learning_rate": 3.133526735163843e-06, "loss": 0.0205, "step": 7379 }, { "epoch": 1.9790828640386162, "grad_norm": 0.2136198449560959, "learning_rate": 3.1320794205001593e-06, "loss": 0.0171, "step": 7380 }, { "epoch": 1.9793510324483776, "grad_norm": 0.282087360678035, "learning_rate": 3.1306322877109428e-06, "loss": 0.0176, "step": 7381 }, { "epoch": 1.979619200858139, "grad_norm": 0.222004588244296, "learning_rate": 3.1291853369370994e-06, "loss": 0.0131, "step": 7382 }, { "epoch": 1.9798873692679002, "grad_norm": 0.3151243301339527, "learning_rate": 3.1277385683195117e-06, "loss": 0.0201, "step": 7383 }, { "epoch": 1.9801555376776616, "grad_norm": 0.4881756423051212, "learning_rate": 3.1262919819990487e-06, "loss": 0.0227, "step": 7384 }, { "epoch": 1.9804237060874228, "grad_norm": 0.22106597372028725, "learning_rate": 3.12484557811656e-06, "loss": 0.0208, "step": 7385 }, { "epoch": 1.9806918744971842, "grad_norm": 0.34861999846441566, "learning_rate": 3.1233993568128795e-06, "loss": 0.038, "step": 7386 }, { "epoch": 1.9809600429069456, "grad_norm": 0.2140444409961593, "learning_rate": 3.1219533182288215e-06, "loss": 0.0151, "step": 7387 }, { "epoch": 1.981228211316707, "grad_norm": 0.23096658649425703, "learning_rate": 3.120507462505183e-06, "loss": 0.0194, "step": 7388 }, { "epoch": 1.9814963797264682, "grad_norm": 0.2571300662907828, "learning_rate": 3.1190617897827424e-06, "loss": 0.0205, "step": 7389 }, { "epoch": 1.9817645481362296, "grad_norm": 0.26626853620922364, "learning_rate": 3.117616300202262e-06, "loss": 0.0178, "step": 7390 }, { "epoch": 1.9820327165459908, "grad_norm": 0.2347314821306523, "learning_rate": 3.116170993904484e-06, "loss": 0.018, "step": 7391 }, { "epoch": 1.9823008849557522, "grad_norm": 0.2665582790794659, "learning_rate": 3.1147258710301353e-06, "loss": 0.0253, "step": 7392 }, { "epoch": 1.9825690533655136, "grad_norm": 0.2235231577765956, "learning_rate": 3.1132809317199232e-06, "loss": 0.0163, "step": 7393 }, { "epoch": 1.982837221775275, "grad_norm": 0.291832044055794, "learning_rate": 3.11183617611454e-06, "loss": 0.0276, "step": 7394 }, { "epoch": 1.9831053901850362, "grad_norm": 0.2565445659398976, "learning_rate": 3.110391604354652e-06, "loss": 0.0182, "step": 7395 }, { "epoch": 1.9833735585947976, "grad_norm": 0.24574814995802347, "learning_rate": 3.1089472165809182e-06, "loss": 0.022, "step": 7396 }, { "epoch": 1.9836417270045588, "grad_norm": 0.21008965169423827, "learning_rate": 3.1075030129339726e-06, "loss": 0.0194, "step": 7397 }, { "epoch": 1.9839098954143202, "grad_norm": 0.27711553277624235, "learning_rate": 3.106058993554435e-06, "loss": 0.024, "step": 7398 }, { "epoch": 1.9841780638240816, "grad_norm": 0.2827563800443053, "learning_rate": 3.1046151585829044e-06, "loss": 0.019, "step": 7399 }, { "epoch": 1.984446232233843, "grad_norm": 0.24667839000918576, "learning_rate": 3.103171508159963e-06, "loss": 0.0213, "step": 7400 }, { "epoch": 1.9847144006436042, "grad_norm": 0.2594698001185999, "learning_rate": 3.101728042426177e-06, "loss": 0.0143, "step": 7401 }, { "epoch": 1.9849825690533656, "grad_norm": 0.3020645267810662, "learning_rate": 3.10028476152209e-06, "loss": 0.0224, "step": 7402 }, { "epoch": 1.9852507374631267, "grad_norm": 0.17948683487701536, "learning_rate": 3.0988416655882336e-06, "loss": 0.0181, "step": 7403 }, { "epoch": 1.9855189058728882, "grad_norm": 0.2313641352568494, "learning_rate": 3.0973987547651165e-06, "loss": 0.0181, "step": 7404 }, { "epoch": 1.9857870742826496, "grad_norm": 0.3050315924544107, "learning_rate": 3.0959560291932312e-06, "loss": 0.0348, "step": 7405 }, { "epoch": 1.986055242692411, "grad_norm": 0.20137844016622877, "learning_rate": 3.0945134890130535e-06, "loss": 0.0188, "step": 7406 }, { "epoch": 1.9863234111021721, "grad_norm": 0.19319960525229268, "learning_rate": 3.093071134365037e-06, "loss": 0.0188, "step": 7407 }, { "epoch": 1.9865915795119335, "grad_norm": 0.28920204216037787, "learning_rate": 3.0916289653896213e-06, "loss": 0.0228, "step": 7408 }, { "epoch": 1.9868597479216947, "grad_norm": 0.4729750825144965, "learning_rate": 3.0901869822272273e-06, "loss": 0.0249, "step": 7409 }, { "epoch": 1.9871279163314561, "grad_norm": 0.2444558524642889, "learning_rate": 3.0887451850182558e-06, "loss": 0.0204, "step": 7410 }, { "epoch": 1.9873960847412175, "grad_norm": 0.3185866682215508, "learning_rate": 3.087303573903092e-06, "loss": 0.0258, "step": 7411 }, { "epoch": 1.987664253150979, "grad_norm": 0.21494902143895483, "learning_rate": 3.0858621490220997e-06, "loss": 0.0192, "step": 7412 }, { "epoch": 1.9879324215607401, "grad_norm": 0.3666635706981367, "learning_rate": 3.0844209105156287e-06, "loss": 0.0265, "step": 7413 }, { "epoch": 1.9882005899705013, "grad_norm": 0.28634963661309604, "learning_rate": 3.082979858524008e-06, "loss": 0.0381, "step": 7414 }, { "epoch": 1.9884687583802627, "grad_norm": 0.1712615015464958, "learning_rate": 3.0815389931875487e-06, "loss": 0.015, "step": 7415 }, { "epoch": 1.9887369267900241, "grad_norm": 0.3530635406178965, "learning_rate": 3.0800983146465445e-06, "loss": 0.0223, "step": 7416 }, { "epoch": 1.9890050951997855, "grad_norm": 0.2845667185918553, "learning_rate": 3.0786578230412707e-06, "loss": 0.0332, "step": 7417 }, { "epoch": 1.989273263609547, "grad_norm": 0.21285679255801424, "learning_rate": 3.077217518511984e-06, "loss": 0.0178, "step": 7418 }, { "epoch": 1.989541432019308, "grad_norm": 0.24307945681723617, "learning_rate": 3.075777401198922e-06, "loss": 0.0187, "step": 7419 }, { "epoch": 1.9898096004290693, "grad_norm": 0.21302391386330244, "learning_rate": 3.0743374712423052e-06, "loss": 0.012, "step": 7420 }, { "epoch": 1.9900777688388307, "grad_norm": 0.1799656239805504, "learning_rate": 3.072897728782336e-06, "loss": 0.0148, "step": 7421 }, { "epoch": 1.990345937248592, "grad_norm": 0.24642616148258245, "learning_rate": 3.0714581739591987e-06, "loss": 0.019, "step": 7422 }, { "epoch": 1.9906141056583535, "grad_norm": 0.21716787213132804, "learning_rate": 3.0700188069130592e-06, "loss": 0.0163, "step": 7423 }, { "epoch": 1.990882274068115, "grad_norm": 0.2968506622592472, "learning_rate": 3.0685796277840646e-06, "loss": 0.0226, "step": 7424 }, { "epoch": 1.991150442477876, "grad_norm": 0.24817357913670118, "learning_rate": 3.0671406367123435e-06, "loss": 0.0229, "step": 7425 }, { "epoch": 1.9914186108876373, "grad_norm": 0.2665521491486088, "learning_rate": 3.0657018338380073e-06, "loss": 0.0262, "step": 7426 }, { "epoch": 1.9916867792973987, "grad_norm": 0.18469737353500937, "learning_rate": 3.064263219301148e-06, "loss": 0.0197, "step": 7427 }, { "epoch": 1.99195494770716, "grad_norm": 0.24652167297969219, "learning_rate": 3.06282479324184e-06, "loss": 0.019, "step": 7428 }, { "epoch": 1.9922231161169215, "grad_norm": 0.24431141321946612, "learning_rate": 3.0613865558001375e-06, "loss": 0.0219, "step": 7429 }, { "epoch": 1.992491284526683, "grad_norm": 0.2665662830426719, "learning_rate": 3.0599485071160816e-06, "loss": 0.027, "step": 7430 }, { "epoch": 1.992759452936444, "grad_norm": 0.27019070076379764, "learning_rate": 3.058510647329688e-06, "loss": 0.0226, "step": 7431 }, { "epoch": 1.9930276213462053, "grad_norm": 0.24214299235672676, "learning_rate": 3.057072976580957e-06, "loss": 0.0175, "step": 7432 }, { "epoch": 1.9932957897559667, "grad_norm": 0.20842572650993418, "learning_rate": 3.0556354950098723e-06, "loss": 0.0117, "step": 7433 }, { "epoch": 1.993563958165728, "grad_norm": 0.19974925738073906, "learning_rate": 3.0541982027563966e-06, "loss": 0.0186, "step": 7434 }, { "epoch": 1.9938321265754895, "grad_norm": 0.2588081528180662, "learning_rate": 3.052761099960477e-06, "loss": 0.0243, "step": 7435 }, { "epoch": 1.9941002949852509, "grad_norm": 0.25642999768080527, "learning_rate": 3.0513241867620393e-06, "loss": 0.0207, "step": 7436 }, { "epoch": 1.994368463395012, "grad_norm": 0.3141689587911263, "learning_rate": 3.049887463300991e-06, "loss": 0.0239, "step": 7437 }, { "epoch": 1.9946366318047732, "grad_norm": 0.18123282776401012, "learning_rate": 3.0484509297172238e-06, "loss": 0.0148, "step": 7438 }, { "epoch": 1.9949048002145346, "grad_norm": 0.2051447647572944, "learning_rate": 3.0470145861506067e-06, "loss": 0.0113, "step": 7439 }, { "epoch": 1.995172968624296, "grad_norm": 0.2906063319378687, "learning_rate": 3.0455784327409952e-06, "loss": 0.0171, "step": 7440 }, { "epoch": 1.9954411370340575, "grad_norm": 0.18450219791355704, "learning_rate": 3.044142469628224e-06, "loss": 0.0211, "step": 7441 }, { "epoch": 1.9957093054438189, "grad_norm": 0.2906210929336202, "learning_rate": 3.042706696952109e-06, "loss": 0.0241, "step": 7442 }, { "epoch": 1.99597747385358, "grad_norm": 0.310680163917282, "learning_rate": 3.041271114852443e-06, "loss": 0.0241, "step": 7443 }, { "epoch": 1.9962456422633412, "grad_norm": 0.2281328746717507, "learning_rate": 3.03983572346901e-06, "loss": 0.0244, "step": 7444 }, { "epoch": 1.9965138106731026, "grad_norm": 0.253398596561827, "learning_rate": 3.038400522941568e-06, "loss": 0.0271, "step": 7445 }, { "epoch": 1.996781979082864, "grad_norm": 0.27319636185335416, "learning_rate": 3.036965513409859e-06, "loss": 0.0226, "step": 7446 }, { "epoch": 1.9970501474926254, "grad_norm": 0.19227077943847015, "learning_rate": 3.0355306950136064e-06, "loss": 0.0148, "step": 7447 }, { "epoch": 1.9973183159023868, "grad_norm": 0.18355091154511627, "learning_rate": 3.0340960678925145e-06, "loss": 0.0178, "step": 7448 }, { "epoch": 1.997586484312148, "grad_norm": 0.23418098390373618, "learning_rate": 3.0326616321862697e-06, "loss": 0.0214, "step": 7449 }, { "epoch": 1.9978546527219092, "grad_norm": 0.199396815002417, "learning_rate": 3.0312273880345366e-06, "loss": 0.0164, "step": 7450 }, { "epoch": 1.9981228211316706, "grad_norm": 0.2767245716782409, "learning_rate": 3.029793335576967e-06, "loss": 0.0191, "step": 7451 }, { "epoch": 1.998390989541432, "grad_norm": 0.2053903300568752, "learning_rate": 3.0283594749531897e-06, "loss": 0.0162, "step": 7452 }, { "epoch": 1.9986591579511934, "grad_norm": 0.38045886851341454, "learning_rate": 3.0269258063028164e-06, "loss": 0.0265, "step": 7453 }, { "epoch": 1.9989273263609548, "grad_norm": 0.20079491219064619, "learning_rate": 3.025492329765439e-06, "loss": 0.0151, "step": 7454 }, { "epoch": 1.999195494770716, "grad_norm": 0.23426590368769132, "learning_rate": 3.02405904548063e-06, "loss": 0.019, "step": 7455 }, { "epoch": 1.9994636631804772, "grad_norm": 0.20517232366711743, "learning_rate": 3.0226259535879464e-06, "loss": 0.0159, "step": 7456 }, { "epoch": 1.9997318315902386, "grad_norm": 0.22793208301656498, "learning_rate": 3.0211930542269235e-06, "loss": 0.016, "step": 7457 }, { "epoch": 2.0, "grad_norm": 0.39354771443243336, "learning_rate": 3.0197603475370796e-06, "loss": 0.0234, "step": 7458 }, { "epoch": 2.0, "eval_loss": 0.02269063889980316, "eval_runtime": 292.4648, "eval_samples_per_second": 85.894, "eval_steps_per_second": 1.344, "step": 7458 }, { "epoch": 2.0002681684097614, "grad_norm": 0.23489950279988983, "learning_rate": 3.018327833657913e-06, "loss": 0.0139, "step": 7459 }, { "epoch": 2.000536336819523, "grad_norm": 0.2166443793372242, "learning_rate": 3.016895512728903e-06, "loss": 0.0233, "step": 7460 }, { "epoch": 2.0008045052292838, "grad_norm": 0.23708360077148635, "learning_rate": 3.0154633848895133e-06, "loss": 0.0226, "step": 7461 }, { "epoch": 2.001072673639045, "grad_norm": 0.19538134775717597, "learning_rate": 3.0140314502791844e-06, "loss": 0.0188, "step": 7462 }, { "epoch": 2.0013408420488066, "grad_norm": 0.1601255417953357, "learning_rate": 3.0125997090373405e-06, "loss": 0.0122, "step": 7463 }, { "epoch": 2.001609010458568, "grad_norm": 0.16805485886862115, "learning_rate": 3.0111681613033865e-06, "loss": 0.0104, "step": 7464 }, { "epoch": 2.0018771788683294, "grad_norm": 0.1656221174760972, "learning_rate": 3.0097368072167087e-06, "loss": 0.0169, "step": 7465 }, { "epoch": 2.002145347278091, "grad_norm": 0.2600000150978306, "learning_rate": 3.0083056469166747e-06, "loss": 0.0209, "step": 7466 }, { "epoch": 2.0024135156878518, "grad_norm": 0.19028199662574155, "learning_rate": 3.0068746805426318e-06, "loss": 0.0114, "step": 7467 }, { "epoch": 2.002681684097613, "grad_norm": 0.25343507668890985, "learning_rate": 3.0054439082339087e-06, "loss": 0.0198, "step": 7468 }, { "epoch": 2.0029498525073746, "grad_norm": 0.21261757869788384, "learning_rate": 3.004013330129817e-06, "loss": 0.0175, "step": 7469 }, { "epoch": 2.003218020917136, "grad_norm": 0.1839270582563166, "learning_rate": 3.002582946369647e-06, "loss": 0.0125, "step": 7470 }, { "epoch": 2.0034861893268974, "grad_norm": 0.2037284747646333, "learning_rate": 3.0011527570926737e-06, "loss": 0.0151, "step": 7471 }, { "epoch": 2.0037543577366588, "grad_norm": 0.19902822935756748, "learning_rate": 2.99972276243815e-06, "loss": 0.0151, "step": 7472 }, { "epoch": 2.0040225261464197, "grad_norm": 0.1973807850271639, "learning_rate": 2.9982929625453095e-06, "loss": 0.0141, "step": 7473 }, { "epoch": 2.004290694556181, "grad_norm": 0.42438275880131476, "learning_rate": 2.996863357553369e-06, "loss": 0.0216, "step": 7474 }, { "epoch": 2.0045588629659425, "grad_norm": 0.2076909436157284, "learning_rate": 2.995433947601526e-06, "loss": 0.0108, "step": 7475 }, { "epoch": 2.004827031375704, "grad_norm": 0.31569897887903686, "learning_rate": 2.9940047328289567e-06, "loss": 0.0187, "step": 7476 }, { "epoch": 2.0050951997854654, "grad_norm": 0.26309934735449786, "learning_rate": 2.9925757133748195e-06, "loss": 0.017, "step": 7477 }, { "epoch": 2.0053633681952268, "grad_norm": 0.2644244493175591, "learning_rate": 2.9911468893782567e-06, "loss": 0.0165, "step": 7478 }, { "epoch": 2.0056315366049877, "grad_norm": 0.32883418637146883, "learning_rate": 2.9897182609783905e-06, "loss": 0.0154, "step": 7479 }, { "epoch": 2.005899705014749, "grad_norm": 0.18981880730016132, "learning_rate": 2.9882898283143168e-06, "loss": 0.0096, "step": 7480 }, { "epoch": 2.0061678734245105, "grad_norm": 0.22745313225750138, "learning_rate": 2.986861591525122e-06, "loss": 0.0133, "step": 7481 }, { "epoch": 2.006436041834272, "grad_norm": 0.23484915996811426, "learning_rate": 2.9854335507498687e-06, "loss": 0.016, "step": 7482 }, { "epoch": 2.0067042102440333, "grad_norm": 0.220663042121539, "learning_rate": 2.9840057061276033e-06, "loss": 0.0142, "step": 7483 }, { "epoch": 2.0069723786537947, "grad_norm": 0.16448025834135904, "learning_rate": 2.9825780577973493e-06, "loss": 0.0097, "step": 7484 }, { "epoch": 2.0072405470635557, "grad_norm": 0.16562107674097515, "learning_rate": 2.9811506058981143e-06, "loss": 0.01, "step": 7485 }, { "epoch": 2.007508715473317, "grad_norm": 0.25094263465801914, "learning_rate": 2.9797233505688837e-06, "loss": 0.0143, "step": 7486 }, { "epoch": 2.0077768838830785, "grad_norm": 0.24932291826034217, "learning_rate": 2.978296291948627e-06, "loss": 0.0294, "step": 7487 }, { "epoch": 2.00804505229284, "grad_norm": 0.19548318427479167, "learning_rate": 2.9768694301762935e-06, "loss": 0.0158, "step": 7488 }, { "epoch": 2.0083132207026013, "grad_norm": 0.17589784508019232, "learning_rate": 2.975442765390811e-06, "loss": 0.0155, "step": 7489 }, { "epoch": 2.0085813891123627, "grad_norm": 0.23841810665647767, "learning_rate": 2.9740162977310927e-06, "loss": 0.0167, "step": 7490 }, { "epoch": 2.0088495575221237, "grad_norm": 0.19605529369839467, "learning_rate": 2.97259002733603e-06, "loss": 0.0143, "step": 7491 }, { "epoch": 2.009117725931885, "grad_norm": 0.5871476747400058, "learning_rate": 2.9711639543444916e-06, "loss": 0.0404, "step": 7492 }, { "epoch": 2.0093858943416465, "grad_norm": 0.26888065896842267, "learning_rate": 2.969738078895333e-06, "loss": 0.0184, "step": 7493 }, { "epoch": 2.009654062751408, "grad_norm": 0.2823101081848726, "learning_rate": 2.9683124011273866e-06, "loss": 0.0206, "step": 7494 }, { "epoch": 2.0099222311611693, "grad_norm": 0.28155577329191805, "learning_rate": 2.9668869211794687e-06, "loss": 0.0181, "step": 7495 }, { "epoch": 2.0101903995709307, "grad_norm": 0.48348232181364204, "learning_rate": 2.9654616391903733e-06, "loss": 0.017, "step": 7496 }, { "epoch": 2.0104585679806917, "grad_norm": 0.25450549495080865, "learning_rate": 2.964036555298876e-06, "loss": 0.0167, "step": 7497 }, { "epoch": 2.010726736390453, "grad_norm": 0.20343532503522083, "learning_rate": 2.9626116696437333e-06, "loss": 0.0127, "step": 7498 }, { "epoch": 2.0109949048002145, "grad_norm": 0.19794852314342581, "learning_rate": 2.961186982363684e-06, "loss": 0.0151, "step": 7499 }, { "epoch": 2.011263073209976, "grad_norm": 0.2279525975982331, "learning_rate": 2.9597624935974456e-06, "loss": 0.0112, "step": 7500 }, { "epoch": 2.0115312416197373, "grad_norm": 0.22218260609266904, "learning_rate": 2.958338203483716e-06, "loss": 0.0133, "step": 7501 }, { "epoch": 2.0117994100294987, "grad_norm": 0.23456234024098258, "learning_rate": 2.9569141121611755e-06, "loss": 0.0112, "step": 7502 }, { "epoch": 2.0120675784392597, "grad_norm": 0.2313246901210944, "learning_rate": 2.9554902197684843e-06, "loss": 0.0125, "step": 7503 }, { "epoch": 2.012335746849021, "grad_norm": 0.23036932680523303, "learning_rate": 2.954066526444281e-06, "loss": 0.0148, "step": 7504 }, { "epoch": 2.0126039152587825, "grad_norm": 0.2679572720914199, "learning_rate": 2.9526430323271895e-06, "loss": 0.0135, "step": 7505 }, { "epoch": 2.012872083668544, "grad_norm": 0.24741511811374528, "learning_rate": 2.951219737555809e-06, "loss": 0.0145, "step": 7506 }, { "epoch": 2.0131402520783053, "grad_norm": 0.4171189661729088, "learning_rate": 2.949796642268724e-06, "loss": 0.027, "step": 7507 }, { "epoch": 2.0134084204880667, "grad_norm": 0.1830619943929899, "learning_rate": 2.9483737466044947e-06, "loss": 0.0086, "step": 7508 }, { "epoch": 2.0136765888978276, "grad_norm": 0.22385108941513912, "learning_rate": 2.9469510507016685e-06, "loss": 0.014, "step": 7509 }, { "epoch": 2.013944757307589, "grad_norm": 0.21351214165674365, "learning_rate": 2.945528554698768e-06, "loss": 0.0111, "step": 7510 }, { "epoch": 2.0142129257173504, "grad_norm": 0.2433055974705479, "learning_rate": 2.944106258734297e-06, "loss": 0.0149, "step": 7511 }, { "epoch": 2.014481094127112, "grad_norm": 0.387752912025201, "learning_rate": 2.9426841629467416e-06, "loss": 0.0166, "step": 7512 }, { "epoch": 2.0147492625368733, "grad_norm": 0.20330569283308908, "learning_rate": 2.9412622674745673e-06, "loss": 0.0089, "step": 7513 }, { "epoch": 2.0150174309466347, "grad_norm": 0.25700264546417606, "learning_rate": 2.939840572456221e-06, "loss": 0.0158, "step": 7514 }, { "epoch": 2.0152855993563956, "grad_norm": 0.25914163364973664, "learning_rate": 2.938419078030128e-06, "loss": 0.0135, "step": 7515 }, { "epoch": 2.015553767766157, "grad_norm": 0.2063836713738416, "learning_rate": 2.9369977843346953e-06, "loss": 0.0137, "step": 7516 }, { "epoch": 2.0158219361759184, "grad_norm": 0.21921724638019818, "learning_rate": 2.935576691508312e-06, "loss": 0.0145, "step": 7517 }, { "epoch": 2.01609010458568, "grad_norm": 0.24915627274791488, "learning_rate": 2.9341557996893434e-06, "loss": 0.0196, "step": 7518 }, { "epoch": 2.0163582729954412, "grad_norm": 0.2083609775697057, "learning_rate": 2.9327351090161416e-06, "loss": 0.0149, "step": 7519 }, { "epoch": 2.0166264414052026, "grad_norm": 0.26257055507353083, "learning_rate": 2.931314619627033e-06, "loss": 0.0142, "step": 7520 }, { "epoch": 2.0168946098149636, "grad_norm": 0.1936169381201401, "learning_rate": 2.929894331660328e-06, "loss": 0.0103, "step": 7521 }, { "epoch": 2.017162778224725, "grad_norm": 0.24293578218180717, "learning_rate": 2.9284742452543158e-06, "loss": 0.0148, "step": 7522 }, { "epoch": 2.0174309466344864, "grad_norm": 0.6438504353893504, "learning_rate": 2.9270543605472666e-06, "loss": 0.0175, "step": 7523 }, { "epoch": 2.017699115044248, "grad_norm": 0.22089208552804374, "learning_rate": 2.9256346776774314e-06, "loss": 0.017, "step": 7524 }, { "epoch": 2.017967283454009, "grad_norm": 0.2349678535373348, "learning_rate": 2.9242151967830383e-06, "loss": 0.0143, "step": 7525 }, { "epoch": 2.0182354518637706, "grad_norm": 0.21829784135254585, "learning_rate": 2.9227959180023022e-06, "loss": 0.0138, "step": 7526 }, { "epoch": 2.0185036202735316, "grad_norm": 0.22976061677915935, "learning_rate": 2.9213768414734146e-06, "loss": 0.0127, "step": 7527 }, { "epoch": 2.018771788683293, "grad_norm": 0.29345166398674855, "learning_rate": 2.9199579673345425e-06, "loss": 0.0179, "step": 7528 }, { "epoch": 2.0190399570930544, "grad_norm": 0.21773320080179898, "learning_rate": 2.91853929572384e-06, "loss": 0.0174, "step": 7529 }, { "epoch": 2.019308125502816, "grad_norm": 0.22800539812649076, "learning_rate": 2.9171208267794426e-06, "loss": 0.0127, "step": 7530 }, { "epoch": 2.019576293912577, "grad_norm": 0.17651883533777846, "learning_rate": 2.9157025606394585e-06, "loss": 0.0125, "step": 7531 }, { "epoch": 2.0198444623223386, "grad_norm": 0.2341419831339138, "learning_rate": 2.9142844974419816e-06, "loss": 0.0119, "step": 7532 }, { "epoch": 2.0201126307320996, "grad_norm": 0.27717449494580126, "learning_rate": 2.912866637325088e-06, "loss": 0.0178, "step": 7533 }, { "epoch": 2.020380799141861, "grad_norm": 0.2600731613202258, "learning_rate": 2.911448980426827e-06, "loss": 0.0131, "step": 7534 }, { "epoch": 2.0206489675516224, "grad_norm": 0.311333670750166, "learning_rate": 2.9100315268852353e-06, "loss": 0.0185, "step": 7535 }, { "epoch": 2.020917135961384, "grad_norm": 0.47447332089608946, "learning_rate": 2.9086142768383234e-06, "loss": 0.0149, "step": 7536 }, { "epoch": 2.021185304371145, "grad_norm": 0.3262803253519475, "learning_rate": 2.9071972304240893e-06, "loss": 0.0182, "step": 7537 }, { "epoch": 2.0214534727809066, "grad_norm": 0.2638230073706627, "learning_rate": 2.9057803877805014e-06, "loss": 0.0132, "step": 7538 }, { "epoch": 2.0217216411906676, "grad_norm": 0.28829189656336746, "learning_rate": 2.90436374904552e-06, "loss": 0.0208, "step": 7539 }, { "epoch": 2.021989809600429, "grad_norm": 0.1812612058895704, "learning_rate": 2.902947314357076e-06, "loss": 0.0086, "step": 7540 }, { "epoch": 2.0222579780101904, "grad_norm": 0.255219537459475, "learning_rate": 2.901531083853083e-06, "loss": 0.0115, "step": 7541 }, { "epoch": 2.0225261464199518, "grad_norm": 0.29334351375240375, "learning_rate": 2.9001150576714365e-06, "loss": 0.0203, "step": 7542 }, { "epoch": 2.022794314829713, "grad_norm": 0.2524550255023345, "learning_rate": 2.8986992359500145e-06, "loss": 0.0167, "step": 7543 }, { "epoch": 2.0230624832394746, "grad_norm": 0.25868206463526977, "learning_rate": 2.8972836188266663e-06, "loss": 0.0142, "step": 7544 }, { "epoch": 2.0233306516492355, "grad_norm": 0.25746762839820775, "learning_rate": 2.8958682064392316e-06, "loss": 0.0162, "step": 7545 }, { "epoch": 2.023598820058997, "grad_norm": 0.2501647148560753, "learning_rate": 2.894452998925522e-06, "loss": 0.0118, "step": 7546 }, { "epoch": 2.0238669884687583, "grad_norm": 0.29233433862903013, "learning_rate": 2.893037996423335e-06, "loss": 0.0148, "step": 7547 }, { "epoch": 2.0241351568785197, "grad_norm": 0.20507037645838974, "learning_rate": 2.8916231990704426e-06, "loss": 0.011, "step": 7548 }, { "epoch": 2.024403325288281, "grad_norm": 0.29959654755109144, "learning_rate": 2.8902086070046022e-06, "loss": 0.0169, "step": 7549 }, { "epoch": 2.0246714936980426, "grad_norm": 0.20979613095539784, "learning_rate": 2.88879422036355e-06, "loss": 0.0133, "step": 7550 }, { "epoch": 2.0249396621078035, "grad_norm": 0.24080328660394865, "learning_rate": 2.887380039284999e-06, "loss": 0.0161, "step": 7551 }, { "epoch": 2.025207830517565, "grad_norm": 0.25259137629101686, "learning_rate": 2.885966063906645e-06, "loss": 0.0152, "step": 7552 }, { "epoch": 2.0254759989273263, "grad_norm": 0.20275738222937442, "learning_rate": 2.8845522943661607e-06, "loss": 0.0092, "step": 7553 }, { "epoch": 2.0257441673370877, "grad_norm": 0.41217218894728186, "learning_rate": 2.8831387308012026e-06, "loss": 0.0147, "step": 7554 }, { "epoch": 2.026012335746849, "grad_norm": 0.530517225164426, "learning_rate": 2.881725373349409e-06, "loss": 0.0103, "step": 7555 }, { "epoch": 2.0262805041566105, "grad_norm": 0.21636164293780694, "learning_rate": 2.8803122221483894e-06, "loss": 0.0149, "step": 7556 }, { "epoch": 2.0265486725663715, "grad_norm": 0.19866528391209431, "learning_rate": 2.8788992773357434e-06, "loss": 0.0126, "step": 7557 }, { "epoch": 2.026816840976133, "grad_norm": 0.25466899694722617, "learning_rate": 2.877486539049041e-06, "loss": 0.0132, "step": 7558 }, { "epoch": 2.0270850093858943, "grad_norm": 0.3411201873121345, "learning_rate": 2.876074007425839e-06, "loss": 0.0238, "step": 7559 }, { "epoch": 2.0273531777956557, "grad_norm": 0.30859737679804833, "learning_rate": 2.8746616826036743e-06, "loss": 0.02, "step": 7560 }, { "epoch": 2.027621346205417, "grad_norm": 0.19501590285559692, "learning_rate": 2.873249564720057e-06, "loss": 0.013, "step": 7561 }, { "epoch": 2.0278895146151785, "grad_norm": 0.1990772040455571, "learning_rate": 2.8718376539124847e-06, "loss": 0.0099, "step": 7562 }, { "epoch": 2.0281576830249395, "grad_norm": 0.22436812306168474, "learning_rate": 2.8704259503184306e-06, "loss": 0.0157, "step": 7563 }, { "epoch": 2.028425851434701, "grad_norm": 0.19783852265454183, "learning_rate": 2.869014454075345e-06, "loss": 0.0116, "step": 7564 }, { "epoch": 2.0286940198444623, "grad_norm": 0.33222482334949044, "learning_rate": 2.867603165320667e-06, "loss": 0.0189, "step": 7565 }, { "epoch": 2.0289621882542237, "grad_norm": 0.28303302603205105, "learning_rate": 2.866192084191805e-06, "loss": 0.0184, "step": 7566 }, { "epoch": 2.029230356663985, "grad_norm": 0.3262104097521295, "learning_rate": 2.864781210826157e-06, "loss": 0.0155, "step": 7567 }, { "epoch": 2.0294985250737465, "grad_norm": 0.2378418028714297, "learning_rate": 2.863370545361091e-06, "loss": 0.0142, "step": 7568 }, { "epoch": 2.0297666934835075, "grad_norm": 0.28313730795896697, "learning_rate": 2.861960087933965e-06, "loss": 0.0115, "step": 7569 }, { "epoch": 2.030034861893269, "grad_norm": 0.44450501048029917, "learning_rate": 2.860549838682106e-06, "loss": 0.0193, "step": 7570 }, { "epoch": 2.0303030303030303, "grad_norm": 0.3046844779196827, "learning_rate": 2.8591397977428302e-06, "loss": 0.0143, "step": 7571 }, { "epoch": 2.0305711987127917, "grad_norm": 0.2293520704442382, "learning_rate": 2.8577299652534296e-06, "loss": 0.0128, "step": 7572 }, { "epoch": 2.030839367122553, "grad_norm": 0.2959090792850215, "learning_rate": 2.856320341351173e-06, "loss": 0.0203, "step": 7573 }, { "epoch": 2.0311075355323145, "grad_norm": 0.26044329440030056, "learning_rate": 2.8549109261733157e-06, "loss": 0.017, "step": 7574 }, { "epoch": 2.0313757039420755, "grad_norm": 0.20129799051204625, "learning_rate": 2.853501719857086e-06, "loss": 0.0126, "step": 7575 }, { "epoch": 2.031643872351837, "grad_norm": 0.20048314134540815, "learning_rate": 2.852092722539693e-06, "loss": 0.0111, "step": 7576 }, { "epoch": 2.0319120407615983, "grad_norm": 0.2376770429565458, "learning_rate": 2.8506839343583305e-06, "loss": 0.0142, "step": 7577 }, { "epoch": 2.0321802091713597, "grad_norm": 0.32710284887905805, "learning_rate": 2.8492753554501654e-06, "loss": 0.0129, "step": 7578 }, { "epoch": 2.032448377581121, "grad_norm": 0.22974977356813775, "learning_rate": 2.847866985952351e-06, "loss": 0.011, "step": 7579 }, { "epoch": 2.0327165459908825, "grad_norm": 0.27761784279976204, "learning_rate": 2.8464588260020127e-06, "loss": 0.0184, "step": 7580 }, { "epoch": 2.0329847144006434, "grad_norm": 0.3002205915429074, "learning_rate": 2.8450508757362604e-06, "loss": 0.0207, "step": 7581 }, { "epoch": 2.033252882810405, "grad_norm": 0.2364849171722325, "learning_rate": 2.8436431352921858e-06, "loss": 0.0168, "step": 7582 }, { "epoch": 2.0335210512201662, "grad_norm": 0.18787496974897674, "learning_rate": 2.842235604806851e-06, "loss": 0.0117, "step": 7583 }, { "epoch": 2.0337892196299276, "grad_norm": 0.19239178388055117, "learning_rate": 2.8408282844173095e-06, "loss": 0.0113, "step": 7584 }, { "epoch": 2.034057388039689, "grad_norm": 0.18366943066791078, "learning_rate": 2.839421174260583e-06, "loss": 0.0095, "step": 7585 }, { "epoch": 2.0343255564494505, "grad_norm": 0.45739077596746874, "learning_rate": 2.8380142744736825e-06, "loss": 0.0141, "step": 7586 }, { "epoch": 2.0345937248592114, "grad_norm": 0.1678249315102992, "learning_rate": 2.8366075851935927e-06, "loss": 0.01, "step": 7587 }, { "epoch": 2.034861893268973, "grad_norm": 0.3544836929280356, "learning_rate": 2.8352011065572765e-06, "loss": 0.0216, "step": 7588 }, { "epoch": 2.0351300616787342, "grad_norm": 0.2493553050579759, "learning_rate": 2.833794838701683e-06, "loss": 0.0157, "step": 7589 }, { "epoch": 2.0353982300884956, "grad_norm": 0.34303654641597986, "learning_rate": 2.832388781763733e-06, "loss": 0.0199, "step": 7590 }, { "epoch": 2.035666398498257, "grad_norm": 0.21021688702140712, "learning_rate": 2.830982935880332e-06, "loss": 0.0131, "step": 7591 }, { "epoch": 2.0359345669080184, "grad_norm": 0.2813391094914667, "learning_rate": 2.829577301188366e-06, "loss": 0.0222, "step": 7592 }, { "epoch": 2.0362027353177794, "grad_norm": 0.2761304039721216, "learning_rate": 2.8281718778246935e-06, "loss": 0.0168, "step": 7593 }, { "epoch": 2.036470903727541, "grad_norm": 0.24069181603053547, "learning_rate": 2.8267666659261618e-06, "loss": 0.0139, "step": 7594 }, { "epoch": 2.036739072137302, "grad_norm": 0.6661830368658052, "learning_rate": 2.825361665629588e-06, "loss": 0.0195, "step": 7595 }, { "epoch": 2.0370072405470636, "grad_norm": 0.2277700657493533, "learning_rate": 2.8239568770717773e-06, "loss": 0.015, "step": 7596 }, { "epoch": 2.037275408956825, "grad_norm": 0.34759315231499377, "learning_rate": 2.822552300389506e-06, "loss": 0.0187, "step": 7597 }, { "epoch": 2.0375435773665864, "grad_norm": 0.23356602970661425, "learning_rate": 2.8211479357195358e-06, "loss": 0.0127, "step": 7598 }, { "epoch": 2.0378117457763474, "grad_norm": 0.25408002371108773, "learning_rate": 2.8197437831986085e-06, "loss": 0.0152, "step": 7599 }, { "epoch": 2.038079914186109, "grad_norm": 0.31036162616530666, "learning_rate": 2.818339842963441e-06, "loss": 0.0163, "step": 7600 }, { "epoch": 2.03834808259587, "grad_norm": 0.19402082880136062, "learning_rate": 2.816936115150729e-06, "loss": 0.0109, "step": 7601 }, { "epoch": 2.0386162510056316, "grad_norm": 0.25002141296901886, "learning_rate": 2.815532599897153e-06, "loss": 0.0148, "step": 7602 }, { "epoch": 2.038884419415393, "grad_norm": 0.21040791446249388, "learning_rate": 2.8141292973393665e-06, "loss": 0.011, "step": 7603 }, { "epoch": 2.0391525878251544, "grad_norm": 0.20722636571663264, "learning_rate": 2.8127262076140083e-06, "loss": 0.0178, "step": 7604 }, { "epoch": 2.0394207562349154, "grad_norm": 0.2634016195371481, "learning_rate": 2.8113233308576905e-06, "loss": 0.0176, "step": 7605 }, { "epoch": 2.0396889246446768, "grad_norm": 0.19717885553385983, "learning_rate": 2.809920667207011e-06, "loss": 0.0117, "step": 7606 }, { "epoch": 2.039957093054438, "grad_norm": 0.22116492212406716, "learning_rate": 2.80851821679854e-06, "loss": 0.0126, "step": 7607 }, { "epoch": 2.0402252614641996, "grad_norm": 0.23852568685772516, "learning_rate": 2.8071159797688317e-06, "loss": 0.0128, "step": 7608 }, { "epoch": 2.040493429873961, "grad_norm": 0.21021542830938886, "learning_rate": 2.8057139562544206e-06, "loss": 0.0133, "step": 7609 }, { "epoch": 2.0407615982837224, "grad_norm": 0.2883124516518787, "learning_rate": 2.804312146391814e-06, "loss": 0.0188, "step": 7610 }, { "epoch": 2.0410297666934833, "grad_norm": 0.3689620261501219, "learning_rate": 2.802910550317506e-06, "loss": 0.0341, "step": 7611 }, { "epoch": 2.0412979351032448, "grad_norm": 0.2067491822100232, "learning_rate": 2.801509168167964e-06, "loss": 0.0137, "step": 7612 }, { "epoch": 2.041566103513006, "grad_norm": 0.2905114627154889, "learning_rate": 2.8001080000796367e-06, "loss": 0.0164, "step": 7613 }, { "epoch": 2.0418342719227676, "grad_norm": 0.21047354864477175, "learning_rate": 2.798707046188954e-06, "loss": 0.0122, "step": 7614 }, { "epoch": 2.042102440332529, "grad_norm": 0.2917454901723074, "learning_rate": 2.7973063066323204e-06, "loss": 0.0157, "step": 7615 }, { "epoch": 2.0423706087422904, "grad_norm": 0.4873833906904073, "learning_rate": 2.795905781546125e-06, "loss": 0.0178, "step": 7616 }, { "epoch": 2.0426387771520513, "grad_norm": 0.31929872031208717, "learning_rate": 2.79450547106673e-06, "loss": 0.0193, "step": 7617 }, { "epoch": 2.0429069455618127, "grad_norm": 0.21727485611875952, "learning_rate": 2.7931053753304823e-06, "loss": 0.011, "step": 7618 }, { "epoch": 2.043175113971574, "grad_norm": 0.194203062363349, "learning_rate": 2.7917054944737067e-06, "loss": 0.0095, "step": 7619 }, { "epoch": 2.0434432823813355, "grad_norm": 0.3394330226034791, "learning_rate": 2.7903058286327024e-06, "loss": 0.0151, "step": 7620 }, { "epoch": 2.043711450791097, "grad_norm": 0.2620041953314781, "learning_rate": 2.788906377943754e-06, "loss": 0.0141, "step": 7621 }, { "epoch": 2.0439796192008584, "grad_norm": 0.21341633109770938, "learning_rate": 2.78750714254312e-06, "loss": 0.0118, "step": 7622 }, { "epoch": 2.0442477876106193, "grad_norm": 0.2529716269629954, "learning_rate": 2.786108122567044e-06, "loss": 0.015, "step": 7623 }, { "epoch": 2.0445159560203807, "grad_norm": 0.25797681493688296, "learning_rate": 2.7847093181517416e-06, "loss": 0.0156, "step": 7624 }, { "epoch": 2.044784124430142, "grad_norm": 0.2288560546611802, "learning_rate": 2.7833107294334093e-06, "loss": 0.0154, "step": 7625 }, { "epoch": 2.0450522928399035, "grad_norm": 0.2591306875180103, "learning_rate": 2.781912356548229e-06, "loss": 0.012, "step": 7626 }, { "epoch": 2.045320461249665, "grad_norm": 0.3645268065715245, "learning_rate": 2.78051419963235e-06, "loss": 0.0122, "step": 7627 }, { "epoch": 2.0455886296594263, "grad_norm": 0.39408770236896157, "learning_rate": 2.779116258821912e-06, "loss": 0.0191, "step": 7628 }, { "epoch": 2.0458567980691873, "grad_norm": 0.39490895944363624, "learning_rate": 2.77771853425303e-06, "loss": 0.0205, "step": 7629 }, { "epoch": 2.0461249664789487, "grad_norm": 0.320728686999571, "learning_rate": 2.7763210260617913e-06, "loss": 0.0176, "step": 7630 }, { "epoch": 2.04639313488871, "grad_norm": 0.2222840622875859, "learning_rate": 2.774923734384273e-06, "loss": 0.0204, "step": 7631 }, { "epoch": 2.0466613032984715, "grad_norm": 0.27130286422010474, "learning_rate": 2.7735266593565213e-06, "loss": 0.0167, "step": 7632 }, { "epoch": 2.046929471708233, "grad_norm": 0.29069233423556323, "learning_rate": 2.7721298011145694e-06, "loss": 0.0121, "step": 7633 }, { "epoch": 2.047197640117994, "grad_norm": 0.30051265335161315, "learning_rate": 2.770733159794422e-06, "loss": 0.0196, "step": 7634 }, { "epoch": 2.0474658085277553, "grad_norm": 0.656495921034612, "learning_rate": 2.769336735532068e-06, "loss": 0.0228, "step": 7635 }, { "epoch": 2.0477339769375167, "grad_norm": 0.29592878171203374, "learning_rate": 2.7679405284634776e-06, "loss": 0.0147, "step": 7636 }, { "epoch": 2.048002145347278, "grad_norm": 0.21991749633550534, "learning_rate": 2.766544538724588e-06, "loss": 0.0121, "step": 7637 }, { "epoch": 2.0482703137570395, "grad_norm": 0.29130025046941943, "learning_rate": 2.7651487664513276e-06, "loss": 0.021, "step": 7638 }, { "epoch": 2.048538482166801, "grad_norm": 0.3385503778050744, "learning_rate": 2.7637532117795997e-06, "loss": 0.0177, "step": 7639 }, { "epoch": 2.0488066505765623, "grad_norm": 0.2012070288436613, "learning_rate": 2.762357874845283e-06, "loss": 0.012, "step": 7640 }, { "epoch": 2.0490748189863233, "grad_norm": 0.2794097486466313, "learning_rate": 2.760962755784241e-06, "loss": 0.0133, "step": 7641 }, { "epoch": 2.0493429873960847, "grad_norm": 0.2128953119863023, "learning_rate": 2.7595678547323086e-06, "loss": 0.012, "step": 7642 }, { "epoch": 2.049611155805846, "grad_norm": 0.2769894902868293, "learning_rate": 2.758173171825308e-06, "loss": 0.0198, "step": 7643 }, { "epoch": 2.0498793242156075, "grad_norm": 0.2266302090848311, "learning_rate": 2.756778707199032e-06, "loss": 0.0133, "step": 7644 }, { "epoch": 2.050147492625369, "grad_norm": 0.21090134476753206, "learning_rate": 2.7553844609892577e-06, "loss": 0.0112, "step": 7645 }, { "epoch": 2.05041566103513, "grad_norm": 0.29409961820948194, "learning_rate": 2.7539904333317414e-06, "loss": 0.0238, "step": 7646 }, { "epoch": 2.0506838294448912, "grad_norm": 0.1719398691378149, "learning_rate": 2.7525966243622105e-06, "loss": 0.0085, "step": 7647 }, { "epoch": 2.0509519978546527, "grad_norm": 0.2694756083312958, "learning_rate": 2.751203034216384e-06, "loss": 0.02, "step": 7648 }, { "epoch": 2.051220166264414, "grad_norm": 0.21639886610137676, "learning_rate": 2.7498096630299442e-06, "loss": 0.0178, "step": 7649 }, { "epoch": 2.0514883346741755, "grad_norm": 0.25489676965159835, "learning_rate": 2.748416510938563e-06, "loss": 0.0128, "step": 7650 }, { "epoch": 2.051756503083937, "grad_norm": 0.2179017758461054, "learning_rate": 2.7470235780778905e-06, "loss": 0.0107, "step": 7651 }, { "epoch": 2.052024671493698, "grad_norm": 0.2214242766125523, "learning_rate": 2.7456308645835483e-06, "loss": 0.0169, "step": 7652 }, { "epoch": 2.0522928399034592, "grad_norm": 0.21097792498119794, "learning_rate": 2.7442383705911455e-06, "loss": 0.0133, "step": 7653 }, { "epoch": 2.0525610083132206, "grad_norm": 0.22408887807779684, "learning_rate": 2.7428460962362618e-06, "loss": 0.0133, "step": 7654 }, { "epoch": 2.052829176722982, "grad_norm": 0.30138300679382246, "learning_rate": 2.7414540416544612e-06, "loss": 0.0206, "step": 7655 }, { "epoch": 2.0530973451327434, "grad_norm": 0.22421951874922283, "learning_rate": 2.7400622069812866e-06, "loss": 0.0232, "step": 7656 }, { "epoch": 2.053365513542505, "grad_norm": 0.2119899403753158, "learning_rate": 2.7386705923522515e-06, "loss": 0.0161, "step": 7657 }, { "epoch": 2.053633681952266, "grad_norm": 0.2262296328539605, "learning_rate": 2.7372791979028607e-06, "loss": 0.015, "step": 7658 }, { "epoch": 2.053901850362027, "grad_norm": 0.31485860418057443, "learning_rate": 2.7358880237685844e-06, "loss": 0.0179, "step": 7659 }, { "epoch": 2.0541700187717886, "grad_norm": 0.252466256203146, "learning_rate": 2.7344970700848816e-06, "loss": 0.0152, "step": 7660 }, { "epoch": 2.05443818718155, "grad_norm": 0.1868073987475624, "learning_rate": 2.7331063369871845e-06, "loss": 0.0102, "step": 7661 }, { "epoch": 2.0547063555913114, "grad_norm": 0.2619938090491766, "learning_rate": 2.7317158246109037e-06, "loss": 0.0254, "step": 7662 }, { "epoch": 2.054974524001073, "grad_norm": 0.31282357805329125, "learning_rate": 2.7303255330914322e-06, "loss": 0.0203, "step": 7663 }, { "epoch": 2.055242692410834, "grad_norm": 0.18330107006077012, "learning_rate": 2.728935462564136e-06, "loss": 0.007, "step": 7664 }, { "epoch": 2.055510860820595, "grad_norm": 0.26481260107731464, "learning_rate": 2.7275456131643662e-06, "loss": 0.0141, "step": 7665 }, { "epoch": 2.0557790292303566, "grad_norm": 0.23886668223133986, "learning_rate": 2.7261559850274444e-06, "loss": 0.0122, "step": 7666 }, { "epoch": 2.056047197640118, "grad_norm": 0.2709176131999788, "learning_rate": 2.724766578288678e-06, "loss": 0.0138, "step": 7667 }, { "epoch": 2.0563153660498794, "grad_norm": 0.19414494778516486, "learning_rate": 2.7233773930833507e-06, "loss": 0.0104, "step": 7668 }, { "epoch": 2.056583534459641, "grad_norm": 0.28984598341842055, "learning_rate": 2.72198842954672e-06, "loss": 0.0236, "step": 7669 }, { "epoch": 2.0568517028694018, "grad_norm": 0.2601846943539223, "learning_rate": 2.7205996878140305e-06, "loss": 0.016, "step": 7670 }, { "epoch": 2.057119871279163, "grad_norm": 0.7737512536214434, "learning_rate": 2.7192111680204957e-06, "loss": 0.0156, "step": 7671 }, { "epoch": 2.0573880396889246, "grad_norm": 0.3452347217588337, "learning_rate": 2.7178228703013165e-06, "loss": 0.0109, "step": 7672 }, { "epoch": 2.057656208098686, "grad_norm": 0.2894236150007278, "learning_rate": 2.7164347947916647e-06, "loss": 0.0179, "step": 7673 }, { "epoch": 2.0579243765084474, "grad_norm": 0.2723769176509527, "learning_rate": 2.7150469416266923e-06, "loss": 0.017, "step": 7674 }, { "epoch": 2.058192544918209, "grad_norm": 0.9211103277351875, "learning_rate": 2.7136593109415342e-06, "loss": 0.0263, "step": 7675 }, { "epoch": 2.0584607133279698, "grad_norm": 0.29048002427244957, "learning_rate": 2.712271902871298e-06, "loss": 0.0197, "step": 7676 }, { "epoch": 2.058728881737731, "grad_norm": 0.3654829854584317, "learning_rate": 2.710884717551072e-06, "loss": 0.0113, "step": 7677 }, { "epoch": 2.0589970501474926, "grad_norm": 0.18818697340074578, "learning_rate": 2.7094977551159263e-06, "loss": 0.0118, "step": 7678 }, { "epoch": 2.059265218557254, "grad_norm": 0.32981962789332275, "learning_rate": 2.7081110157009005e-06, "loss": 0.0257, "step": 7679 }, { "epoch": 2.0595333869670154, "grad_norm": 0.30327205015097103, "learning_rate": 2.7067244994410224e-06, "loss": 0.0196, "step": 7680 }, { "epoch": 2.059801555376777, "grad_norm": 0.30993515510548025, "learning_rate": 2.705338206471289e-06, "loss": 0.017, "step": 7681 }, { "epoch": 2.0600697237865377, "grad_norm": 0.2639664835435476, "learning_rate": 2.703952136926684e-06, "loss": 0.0128, "step": 7682 }, { "epoch": 2.060337892196299, "grad_norm": 0.24405637754774448, "learning_rate": 2.7025662909421625e-06, "loss": 0.0148, "step": 7683 }, { "epoch": 2.0606060606060606, "grad_norm": 0.2474789652189972, "learning_rate": 2.7011806686526633e-06, "loss": 0.0126, "step": 7684 }, { "epoch": 2.060874229015822, "grad_norm": 0.280570317037517, "learning_rate": 2.6997952701930985e-06, "loss": 0.0169, "step": 7685 }, { "epoch": 2.0611423974255834, "grad_norm": 0.30008611115585604, "learning_rate": 2.6984100956983606e-06, "loss": 0.0137, "step": 7686 }, { "epoch": 2.0614105658353448, "grad_norm": 0.2377929929992578, "learning_rate": 2.69702514530332e-06, "loss": 0.0132, "step": 7687 }, { "epoch": 2.0616787342451057, "grad_norm": 0.40781943512293817, "learning_rate": 2.6956404191428287e-06, "loss": 0.0139, "step": 7688 }, { "epoch": 2.061946902654867, "grad_norm": 0.2248374944241652, "learning_rate": 2.6942559173517104e-06, "loss": 0.012, "step": 7689 }, { "epoch": 2.0622150710646285, "grad_norm": 0.22354030588739984, "learning_rate": 2.6928716400647736e-06, "loss": 0.0118, "step": 7690 }, { "epoch": 2.06248323947439, "grad_norm": 0.2739315495654821, "learning_rate": 2.6914875874167978e-06, "loss": 0.0177, "step": 7691 }, { "epoch": 2.0627514078841513, "grad_norm": 0.26515409843676874, "learning_rate": 2.6901037595425484e-06, "loss": 0.0176, "step": 7692 }, { "epoch": 2.0630195762939127, "grad_norm": 0.282707264927694, "learning_rate": 2.6887201565767617e-06, "loss": 0.013, "step": 7693 }, { "epoch": 2.0632877447036737, "grad_norm": 0.260192818733073, "learning_rate": 2.687336778654157e-06, "loss": 0.0165, "step": 7694 }, { "epoch": 2.063555913113435, "grad_norm": 0.20039845935710288, "learning_rate": 2.685953625909432e-06, "loss": 0.0139, "step": 7695 }, { "epoch": 2.0638240815231965, "grad_norm": 0.27477559579936767, "learning_rate": 2.684570698477259e-06, "loss": 0.0165, "step": 7696 }, { "epoch": 2.064092249932958, "grad_norm": 0.19862559956921383, "learning_rate": 2.6831879964922876e-06, "loss": 0.0109, "step": 7697 }, { "epoch": 2.0643604183427193, "grad_norm": 0.3618500579923991, "learning_rate": 2.6818055200891515e-06, "loss": 0.0109, "step": 7698 }, { "epoch": 2.0646285867524807, "grad_norm": 0.26668207173788555, "learning_rate": 2.6804232694024557e-06, "loss": 0.015, "step": 7699 }, { "epoch": 2.0648967551622417, "grad_norm": 0.23372684348939288, "learning_rate": 2.67904124456679e-06, "loss": 0.0107, "step": 7700 }, { "epoch": 2.065164923572003, "grad_norm": 0.19300226494686745, "learning_rate": 2.677659445716714e-06, "loss": 0.0078, "step": 7701 }, { "epoch": 2.0654330919817645, "grad_norm": 0.17021524042968345, "learning_rate": 2.6762778729867738e-06, "loss": 0.0122, "step": 7702 }, { "epoch": 2.065701260391526, "grad_norm": 0.43643968464307215, "learning_rate": 2.6748965265114862e-06, "loss": 0.0166, "step": 7703 }, { "epoch": 2.0659694288012873, "grad_norm": 0.494705108252885, "learning_rate": 2.673515406425351e-06, "loss": 0.0166, "step": 7704 }, { "epoch": 2.0662375972110487, "grad_norm": 0.2141935557451624, "learning_rate": 2.672134512862845e-06, "loss": 0.0141, "step": 7705 }, { "epoch": 2.0665057656208097, "grad_norm": 0.36310639989559573, "learning_rate": 2.6707538459584204e-06, "loss": 0.0259, "step": 7706 }, { "epoch": 2.066773934030571, "grad_norm": 0.24215238697875252, "learning_rate": 2.6693734058465105e-06, "loss": 0.0174, "step": 7707 }, { "epoch": 2.0670421024403325, "grad_norm": 0.21232332047147068, "learning_rate": 2.6679931926615255e-06, "loss": 0.012, "step": 7708 }, { "epoch": 2.067310270850094, "grad_norm": 0.23199012496554883, "learning_rate": 2.6666132065378493e-06, "loss": 0.0167, "step": 7709 }, { "epoch": 2.0675784392598553, "grad_norm": 0.1840605866346855, "learning_rate": 2.6652334476098517e-06, "loss": 0.0113, "step": 7710 }, { "epoch": 2.0678466076696167, "grad_norm": 0.33176599349519953, "learning_rate": 2.663853916011874e-06, "loss": 0.0133, "step": 7711 }, { "epoch": 2.0681147760793777, "grad_norm": 0.21128316165982455, "learning_rate": 2.662474611878239e-06, "loss": 0.0117, "step": 7712 }, { "epoch": 2.068382944489139, "grad_norm": 0.2357375298379638, "learning_rate": 2.6610955353432434e-06, "loss": 0.0137, "step": 7713 }, { "epoch": 2.0686511128989005, "grad_norm": 0.2154175164718772, "learning_rate": 2.6597166865411663e-06, "loss": 0.0155, "step": 7714 }, { "epoch": 2.068919281308662, "grad_norm": 0.23370564422429574, "learning_rate": 2.6583380656062633e-06, "loss": 0.014, "step": 7715 }, { "epoch": 2.0691874497184233, "grad_norm": 0.2846005711439057, "learning_rate": 2.6569596726727644e-06, "loss": 0.016, "step": 7716 }, { "epoch": 2.0694556181281847, "grad_norm": 0.2309099498552967, "learning_rate": 2.655581507874884e-06, "loss": 0.0117, "step": 7717 }, { "epoch": 2.0697237865379456, "grad_norm": 0.2481727782034555, "learning_rate": 2.6542035713468044e-06, "loss": 0.0168, "step": 7718 }, { "epoch": 2.069991954947707, "grad_norm": 0.21572005422211615, "learning_rate": 2.652825863222698e-06, "loss": 0.0119, "step": 7719 }, { "epoch": 2.0702601233574685, "grad_norm": 0.4252085788635779, "learning_rate": 2.6514483836367032e-06, "loss": 0.0162, "step": 7720 }, { "epoch": 2.07052829176723, "grad_norm": 0.25702842832019757, "learning_rate": 2.6500711327229466e-06, "loss": 0.0124, "step": 7721 }, { "epoch": 2.0707964601769913, "grad_norm": 0.2369580040043926, "learning_rate": 2.648694110615525e-06, "loss": 0.0158, "step": 7722 }, { "epoch": 2.0710646285867527, "grad_norm": 0.25925912573718557, "learning_rate": 2.647317317448513e-06, "loss": 0.0142, "step": 7723 }, { "epoch": 2.0713327969965136, "grad_norm": 0.26507247157478286, "learning_rate": 2.645940753355967e-06, "loss": 0.0156, "step": 7724 }, { "epoch": 2.071600965406275, "grad_norm": 0.28349726846616613, "learning_rate": 2.6445644184719227e-06, "loss": 0.0168, "step": 7725 }, { "epoch": 2.0718691338160364, "grad_norm": 0.24756852377775093, "learning_rate": 2.6431883129303847e-06, "loss": 0.0155, "step": 7726 }, { "epoch": 2.072137302225798, "grad_norm": 0.22453952784582623, "learning_rate": 2.6418124368653463e-06, "loss": 0.018, "step": 7727 }, { "epoch": 2.0724054706355592, "grad_norm": 0.3895212167507296, "learning_rate": 2.6404367904107674e-06, "loss": 0.0206, "step": 7728 }, { "epoch": 2.0726736390453206, "grad_norm": 0.20973319491954764, "learning_rate": 2.6390613737005964e-06, "loss": 0.0101, "step": 7729 }, { "epoch": 2.0729418074550816, "grad_norm": 0.2724216918795566, "learning_rate": 2.6376861868687486e-06, "loss": 0.0182, "step": 7730 }, { "epoch": 2.073209975864843, "grad_norm": 0.241944237983217, "learning_rate": 2.636311230049125e-06, "loss": 0.0142, "step": 7731 }, { "epoch": 2.0734781442746044, "grad_norm": 0.2454953485204207, "learning_rate": 2.6349365033756036e-06, "loss": 0.0159, "step": 7732 }, { "epoch": 2.073746312684366, "grad_norm": 0.30023736463301626, "learning_rate": 2.6335620069820357e-06, "loss": 0.0141, "step": 7733 }, { "epoch": 2.0740144810941272, "grad_norm": 0.22403473353155562, "learning_rate": 2.632187741002252e-06, "loss": 0.0108, "step": 7734 }, { "epoch": 2.0742826495038886, "grad_norm": 0.2822306552643744, "learning_rate": 2.6308137055700607e-06, "loss": 0.0153, "step": 7735 }, { "epoch": 2.0745508179136496, "grad_norm": 0.2082721746204847, "learning_rate": 2.6294399008192482e-06, "loss": 0.0143, "step": 7736 }, { "epoch": 2.074818986323411, "grad_norm": 0.23018017849928601, "learning_rate": 2.6280663268835806e-06, "loss": 0.0136, "step": 7737 }, { "epoch": 2.0750871547331724, "grad_norm": 0.2164380704003625, "learning_rate": 2.626692983896796e-06, "loss": 0.0159, "step": 7738 }, { "epoch": 2.075355323142934, "grad_norm": 0.24478738378930645, "learning_rate": 2.6253198719926153e-06, "loss": 0.019, "step": 7739 }, { "epoch": 2.075623491552695, "grad_norm": 0.22315854515723202, "learning_rate": 2.623946991304733e-06, "loss": 0.0243, "step": 7740 }, { "epoch": 2.0758916599624566, "grad_norm": 0.2656852748846713, "learning_rate": 2.622574341966824e-06, "loss": 0.0224, "step": 7741 }, { "epoch": 2.0761598283722176, "grad_norm": 0.3456167092166858, "learning_rate": 2.621201924112541e-06, "loss": 0.0208, "step": 7742 }, { "epoch": 2.076427996781979, "grad_norm": 0.20223844046148842, "learning_rate": 2.619829737875509e-06, "loss": 0.0141, "step": 7743 }, { "epoch": 2.0766961651917404, "grad_norm": 0.2576686887612501, "learning_rate": 2.6184577833893385e-06, "loss": 0.015, "step": 7744 }, { "epoch": 2.076964333601502, "grad_norm": 0.885705167326344, "learning_rate": 2.6170860607876103e-06, "loss": 0.0327, "step": 7745 }, { "epoch": 2.077232502011263, "grad_norm": 0.27918864192851117, "learning_rate": 2.6157145702038846e-06, "loss": 0.0192, "step": 7746 }, { "epoch": 2.0775006704210246, "grad_norm": 0.21790265963339858, "learning_rate": 2.6143433117717023e-06, "loss": 0.0108, "step": 7747 }, { "epoch": 2.0777688388307856, "grad_norm": 0.20373973622903138, "learning_rate": 2.6129722856245766e-06, "loss": 0.0114, "step": 7748 }, { "epoch": 2.078037007240547, "grad_norm": 0.2104928684702517, "learning_rate": 2.611601491896003e-06, "loss": 0.0179, "step": 7749 }, { "epoch": 2.0783051756503084, "grad_norm": 0.21137726954524697, "learning_rate": 2.6102309307194496e-06, "loss": 0.0119, "step": 7750 }, { "epoch": 2.0785733440600698, "grad_norm": 0.2918565734931733, "learning_rate": 2.6088606022283657e-06, "loss": 0.0162, "step": 7751 }, { "epoch": 2.078841512469831, "grad_norm": 0.24273231733085882, "learning_rate": 2.6074905065561797e-06, "loss": 0.0109, "step": 7752 }, { "epoch": 2.0791096808795926, "grad_norm": 0.23991044131939565, "learning_rate": 2.6061206438362874e-06, "loss": 0.0177, "step": 7753 }, { "epoch": 2.0793778492893535, "grad_norm": 0.250030478586736, "learning_rate": 2.604751014202075e-06, "loss": 0.0106, "step": 7754 }, { "epoch": 2.079646017699115, "grad_norm": 0.23984327350975468, "learning_rate": 2.6033816177868954e-06, "loss": 0.0112, "step": 7755 }, { "epoch": 2.0799141861088764, "grad_norm": 0.31751966435235784, "learning_rate": 2.6020124547240864e-06, "loss": 0.0141, "step": 7756 }, { "epoch": 2.0801823545186378, "grad_norm": 0.2576445124129311, "learning_rate": 2.6006435251469585e-06, "loss": 0.0154, "step": 7757 }, { "epoch": 2.080450522928399, "grad_norm": 0.24406682761798773, "learning_rate": 2.599274829188798e-06, "loss": 0.0141, "step": 7758 }, { "epoch": 2.0807186913381606, "grad_norm": 0.2435202725609098, "learning_rate": 2.5979063669828764e-06, "loss": 0.0157, "step": 7759 }, { "epoch": 2.0809868597479215, "grad_norm": 0.2598945965659765, "learning_rate": 2.596538138662431e-06, "loss": 0.0156, "step": 7760 }, { "epoch": 2.081255028157683, "grad_norm": 0.24044787656469926, "learning_rate": 2.5951701443606894e-06, "loss": 0.0151, "step": 7761 }, { "epoch": 2.0815231965674443, "grad_norm": 0.2493977258919132, "learning_rate": 2.5938023842108432e-06, "loss": 0.0128, "step": 7762 }, { "epoch": 2.0817913649772057, "grad_norm": 0.23491541652871945, "learning_rate": 2.5924348583460714e-06, "loss": 0.0118, "step": 7763 }, { "epoch": 2.082059533386967, "grad_norm": 0.3087070291815123, "learning_rate": 2.5910675668995265e-06, "loss": 0.0188, "step": 7764 }, { "epoch": 2.0823277017967285, "grad_norm": 0.33557035765790544, "learning_rate": 2.5897005100043354e-06, "loss": 0.0173, "step": 7765 }, { "epoch": 2.0825958702064895, "grad_norm": 0.18503835081425637, "learning_rate": 2.5883336877936082e-06, "loss": 0.0126, "step": 7766 }, { "epoch": 2.082864038616251, "grad_norm": 0.20954249333395397, "learning_rate": 2.5869671004004256e-06, "loss": 0.0136, "step": 7767 }, { "epoch": 2.0831322070260123, "grad_norm": 0.2704965951378242, "learning_rate": 2.585600747957849e-06, "loss": 0.0176, "step": 7768 }, { "epoch": 2.0834003754357737, "grad_norm": 0.41926793949021407, "learning_rate": 2.5842346305989224e-06, "loss": 0.0149, "step": 7769 }, { "epoch": 2.083668543845535, "grad_norm": 0.2375903577275654, "learning_rate": 2.5828687484566516e-06, "loss": 0.0159, "step": 7770 }, { "epoch": 2.0839367122552965, "grad_norm": 0.19387852836069325, "learning_rate": 2.5815031016640356e-06, "loss": 0.011, "step": 7771 }, { "epoch": 2.0842048806650575, "grad_norm": 0.2839735062556969, "learning_rate": 2.5801376903540397e-06, "loss": 0.015, "step": 7772 }, { "epoch": 2.084473049074819, "grad_norm": 0.2775165273328617, "learning_rate": 2.578772514659612e-06, "loss": 0.0157, "step": 7773 }, { "epoch": 2.0847412174845803, "grad_norm": 0.27001353630556, "learning_rate": 2.5774075747136784e-06, "loss": 0.0203, "step": 7774 }, { "epoch": 2.0850093858943417, "grad_norm": 0.23540319952856875, "learning_rate": 2.576042870649136e-06, "loss": 0.0144, "step": 7775 }, { "epoch": 2.085277554304103, "grad_norm": 0.2499355909973123, "learning_rate": 2.574678402598865e-06, "loss": 0.0147, "step": 7776 }, { "epoch": 2.0855457227138645, "grad_norm": 0.22177443038973987, "learning_rate": 2.5733141706957164e-06, "loss": 0.0195, "step": 7777 }, { "epoch": 2.0858138911236255, "grad_norm": 0.24356244243561812, "learning_rate": 2.5719501750725275e-06, "loss": 0.0097, "step": 7778 }, { "epoch": 2.086082059533387, "grad_norm": 0.18332421961323714, "learning_rate": 2.5705864158621008e-06, "loss": 0.0103, "step": 7779 }, { "epoch": 2.0863502279431483, "grad_norm": 0.19608620605403743, "learning_rate": 2.569222893197225e-06, "loss": 0.011, "step": 7780 }, { "epoch": 2.0866183963529097, "grad_norm": 0.22955382461034374, "learning_rate": 2.567859607210667e-06, "loss": 0.0155, "step": 7781 }, { "epoch": 2.086886564762671, "grad_norm": 0.2637592287841738, "learning_rate": 2.5664965580351577e-06, "loss": 0.0139, "step": 7782 }, { "epoch": 2.0871547331724325, "grad_norm": 0.3404513739504032, "learning_rate": 2.5651337458034177e-06, "loss": 0.0243, "step": 7783 }, { "epoch": 2.0874229015821935, "grad_norm": 0.24505780048582096, "learning_rate": 2.5637711706481417e-06, "loss": 0.0173, "step": 7784 }, { "epoch": 2.087691069991955, "grad_norm": 0.2653853423336899, "learning_rate": 2.5624088327019973e-06, "loss": 0.0178, "step": 7785 }, { "epoch": 2.0879592384017163, "grad_norm": 0.28839193322153894, "learning_rate": 2.561046732097635e-06, "loss": 0.0205, "step": 7786 }, { "epoch": 2.0882274068114777, "grad_norm": 0.21391527893694753, "learning_rate": 2.559684868967676e-06, "loss": 0.0115, "step": 7787 }, { "epoch": 2.088495575221239, "grad_norm": 0.3067244003732284, "learning_rate": 2.5583232434447235e-06, "loss": 0.0233, "step": 7788 }, { "epoch": 2.0887637436310005, "grad_norm": 0.19904461998871822, "learning_rate": 2.5569618556613524e-06, "loss": 0.0128, "step": 7789 }, { "epoch": 2.0890319120407614, "grad_norm": 0.23003269012907868, "learning_rate": 2.5556007057501193e-06, "loss": 0.014, "step": 7790 }, { "epoch": 2.089300080450523, "grad_norm": 0.20781771652649791, "learning_rate": 2.5542397938435574e-06, "loss": 0.0108, "step": 7791 }, { "epoch": 2.0895682488602842, "grad_norm": 0.19135770209049782, "learning_rate": 2.552879120074172e-06, "loss": 0.0091, "step": 7792 }, { "epoch": 2.0898364172700457, "grad_norm": 0.20109474538050728, "learning_rate": 2.551518684574451e-06, "loss": 0.0147, "step": 7793 }, { "epoch": 2.090104585679807, "grad_norm": 0.29519919017980856, "learning_rate": 2.550158487476856e-06, "loss": 0.0203, "step": 7794 }, { "epoch": 2.0903727540895685, "grad_norm": 0.20840732632833808, "learning_rate": 2.548798528913822e-06, "loss": 0.0129, "step": 7795 }, { "epoch": 2.0906409224993294, "grad_norm": 0.26036703547348683, "learning_rate": 2.5474388090177703e-06, "loss": 0.017, "step": 7796 }, { "epoch": 2.090909090909091, "grad_norm": 0.4551663720834099, "learning_rate": 2.5460793279210875e-06, "loss": 0.0184, "step": 7797 }, { "epoch": 2.0911772593188522, "grad_norm": 0.23990134021228154, "learning_rate": 2.5447200857561483e-06, "loss": 0.0142, "step": 7798 }, { "epoch": 2.0914454277286136, "grad_norm": 0.2780961309644883, "learning_rate": 2.5433610826552928e-06, "loss": 0.0159, "step": 7799 }, { "epoch": 2.091713596138375, "grad_norm": 0.41872528793108194, "learning_rate": 2.542002318750847e-06, "loss": 0.018, "step": 7800 }, { "epoch": 2.0919817645481364, "grad_norm": 0.23145321134317562, "learning_rate": 2.5406437941751115e-06, "loss": 0.014, "step": 7801 }, { "epoch": 2.0922499329578974, "grad_norm": 0.4889226539412063, "learning_rate": 2.5392855090603585e-06, "loss": 0.0174, "step": 7802 }, { "epoch": 2.092518101367659, "grad_norm": 0.20524264001478548, "learning_rate": 2.537927463538844e-06, "loss": 0.0147, "step": 7803 }, { "epoch": 2.09278626977742, "grad_norm": 0.2479107714634576, "learning_rate": 2.536569657742794e-06, "loss": 0.0177, "step": 7804 }, { "epoch": 2.0930544381871816, "grad_norm": 0.22561911625782316, "learning_rate": 2.5352120918044177e-06, "loss": 0.0131, "step": 7805 }, { "epoch": 2.093322606596943, "grad_norm": 0.20841409737777744, "learning_rate": 2.5338547658558964e-06, "loss": 0.0133, "step": 7806 }, { "epoch": 2.093590775006704, "grad_norm": 0.28581054212687634, "learning_rate": 2.5324976800293878e-06, "loss": 0.0138, "step": 7807 }, { "epoch": 2.0938589434164654, "grad_norm": 0.25722424408873795, "learning_rate": 2.5311408344570296e-06, "loss": 0.0154, "step": 7808 }, { "epoch": 2.094127111826227, "grad_norm": 0.1909763448951231, "learning_rate": 2.529784229270933e-06, "loss": 0.012, "step": 7809 }, { "epoch": 2.094395280235988, "grad_norm": 0.2764486631129896, "learning_rate": 2.5284278646031866e-06, "loss": 0.0121, "step": 7810 }, { "epoch": 2.0946634486457496, "grad_norm": 0.2563672268585981, "learning_rate": 2.5270717405858595e-06, "loss": 0.0113, "step": 7811 }, { "epoch": 2.094931617055511, "grad_norm": 0.3380069394499706, "learning_rate": 2.5257158573509897e-06, "loss": 0.0164, "step": 7812 }, { "epoch": 2.0951997854652724, "grad_norm": 0.27771938916306876, "learning_rate": 2.5243602150306e-06, "loss": 0.0208, "step": 7813 }, { "epoch": 2.0954679538750334, "grad_norm": 0.21363875538458896, "learning_rate": 2.523004813756681e-06, "loss": 0.015, "step": 7814 }, { "epoch": 2.0957361222847948, "grad_norm": 0.25004011044349933, "learning_rate": 2.521649653661209e-06, "loss": 0.0169, "step": 7815 }, { "epoch": 2.096004290694556, "grad_norm": 0.26408537920125097, "learning_rate": 2.5202947348761285e-06, "loss": 0.0154, "step": 7816 }, { "epoch": 2.0962724591043176, "grad_norm": 0.19232667758573357, "learning_rate": 2.5189400575333683e-06, "loss": 0.0112, "step": 7817 }, { "epoch": 2.096540627514079, "grad_norm": 0.2784143252662543, "learning_rate": 2.5175856217648276e-06, "loss": 0.0133, "step": 7818 }, { "epoch": 2.09680879592384, "grad_norm": 0.30414013804970524, "learning_rate": 2.516231427702382e-06, "loss": 0.0148, "step": 7819 }, { "epoch": 2.0970769643336014, "grad_norm": 0.2594960721784442, "learning_rate": 2.5148774754778883e-06, "loss": 0.0121, "step": 7820 }, { "epoch": 2.0973451327433628, "grad_norm": 0.2412482390136821, "learning_rate": 2.5135237652231786e-06, "loss": 0.0141, "step": 7821 }, { "epoch": 2.097613301153124, "grad_norm": 0.29111093189466325, "learning_rate": 2.512170297070057e-06, "loss": 0.0187, "step": 7822 }, { "epoch": 2.0978814695628856, "grad_norm": 0.24866882541305857, "learning_rate": 2.5108170711503098e-06, "loss": 0.0196, "step": 7823 }, { "epoch": 2.098149637972647, "grad_norm": 0.21974691579932207, "learning_rate": 2.509464087595694e-06, "loss": 0.0129, "step": 7824 }, { "epoch": 2.0984178063824084, "grad_norm": 0.29530791030069214, "learning_rate": 2.5081113465379494e-06, "loss": 0.017, "step": 7825 }, { "epoch": 2.0986859747921693, "grad_norm": 0.2932522976070235, "learning_rate": 2.506758848108786e-06, "loss": 0.0158, "step": 7826 }, { "epoch": 2.0989541432019307, "grad_norm": 0.21374569472312432, "learning_rate": 2.5054065924398934e-06, "loss": 0.0117, "step": 7827 }, { "epoch": 2.099222311611692, "grad_norm": 0.2687163209847595, "learning_rate": 2.50405457966294e-06, "loss": 0.015, "step": 7828 }, { "epoch": 2.0994904800214536, "grad_norm": 0.222392673636387, "learning_rate": 2.5027028099095657e-06, "loss": 0.0123, "step": 7829 }, { "epoch": 2.099758648431215, "grad_norm": 0.238935249020314, "learning_rate": 2.5013512833113882e-06, "loss": 0.0152, "step": 7830 }, { "epoch": 2.100026816840976, "grad_norm": 0.3125364967092152, "learning_rate": 2.5000000000000015e-06, "loss": 0.0166, "step": 7831 }, { "epoch": 2.1002949852507373, "grad_norm": 0.3790112076625781, "learning_rate": 2.4986489601069763e-06, "loss": 0.02, "step": 7832 }, { "epoch": 2.1005631536604987, "grad_norm": 0.20425784330227806, "learning_rate": 2.4972981637638636e-06, "loss": 0.0108, "step": 7833 }, { "epoch": 2.10083132207026, "grad_norm": 0.2963836894867957, "learning_rate": 2.495947611102182e-06, "loss": 0.0139, "step": 7834 }, { "epoch": 2.1010994904800215, "grad_norm": 0.1973590109006194, "learning_rate": 2.494597302253435e-06, "loss": 0.0143, "step": 7835 }, { "epoch": 2.101367658889783, "grad_norm": 0.2516538588973656, "learning_rate": 2.4932472373490956e-06, "loss": 0.0167, "step": 7836 }, { "epoch": 2.101635827299544, "grad_norm": 0.19969133623237156, "learning_rate": 2.4918974165206167e-06, "loss": 0.0093, "step": 7837 }, { "epoch": 2.1019039957093053, "grad_norm": 0.32134704318520835, "learning_rate": 2.4905478398994297e-06, "loss": 0.0241, "step": 7838 }, { "epoch": 2.1021721641190667, "grad_norm": 1.138671433871642, "learning_rate": 2.4891985076169356e-06, "loss": 0.0146, "step": 7839 }, { "epoch": 2.102440332528828, "grad_norm": 0.2859464090871284, "learning_rate": 2.487849419804518e-06, "loss": 0.0153, "step": 7840 }, { "epoch": 2.1027085009385895, "grad_norm": 0.2745663932216737, "learning_rate": 2.4865005765935315e-06, "loss": 0.014, "step": 7841 }, { "epoch": 2.102976669348351, "grad_norm": 0.22531938428063855, "learning_rate": 2.485151978115312e-06, "loss": 0.0106, "step": 7842 }, { "epoch": 2.103244837758112, "grad_norm": 0.29499935625440626, "learning_rate": 2.483803624501168e-06, "loss": 0.0132, "step": 7843 }, { "epoch": 2.1035130061678733, "grad_norm": 0.23111056666336333, "learning_rate": 2.4824555158823826e-06, "loss": 0.0127, "step": 7844 }, { "epoch": 2.1037811745776347, "grad_norm": 0.447364076529288, "learning_rate": 2.481107652390222e-06, "loss": 0.0177, "step": 7845 }, { "epoch": 2.104049342987396, "grad_norm": 0.2742038452840805, "learning_rate": 2.4797600341559193e-06, "loss": 0.014, "step": 7846 }, { "epoch": 2.1043175113971575, "grad_norm": 0.24256478834088863, "learning_rate": 2.478412661310693e-06, "loss": 0.0188, "step": 7847 }, { "epoch": 2.104585679806919, "grad_norm": 0.3775046509980866, "learning_rate": 2.4770655339857293e-06, "loss": 0.027, "step": 7848 }, { "epoch": 2.10485384821668, "grad_norm": 0.19176612309853314, "learning_rate": 2.475718652312196e-06, "loss": 0.0178, "step": 7849 }, { "epoch": 2.1051220166264413, "grad_norm": 0.19988670578350395, "learning_rate": 2.4743720164212376e-06, "loss": 0.0099, "step": 7850 }, { "epoch": 2.1053901850362027, "grad_norm": 0.2910376339324989, "learning_rate": 2.473025626443969e-06, "loss": 0.0147, "step": 7851 }, { "epoch": 2.105658353445964, "grad_norm": 0.2557734102761934, "learning_rate": 2.4716794825114884e-06, "loss": 0.0177, "step": 7852 }, { "epoch": 2.1059265218557255, "grad_norm": 0.4139985571789397, "learning_rate": 2.4703335847548615e-06, "loss": 0.0163, "step": 7853 }, { "epoch": 2.106194690265487, "grad_norm": 0.21542528515995693, "learning_rate": 2.468987933305139e-06, "loss": 0.013, "step": 7854 }, { "epoch": 2.106462858675248, "grad_norm": 0.2672350685822383, "learning_rate": 2.4676425282933423e-06, "loss": 0.014, "step": 7855 }, { "epoch": 2.1067310270850093, "grad_norm": 0.3782202885623261, "learning_rate": 2.4662973698504678e-06, "loss": 0.0194, "step": 7856 }, { "epoch": 2.1069991954947707, "grad_norm": 0.20681462266289063, "learning_rate": 2.464952458107493e-06, "loss": 0.0114, "step": 7857 }, { "epoch": 2.107267363904532, "grad_norm": 0.2659457043971076, "learning_rate": 2.4636077931953646e-06, "loss": 0.0155, "step": 7858 }, { "epoch": 2.1075355323142935, "grad_norm": 0.24802988596543613, "learning_rate": 2.462263375245012e-06, "loss": 0.0156, "step": 7859 }, { "epoch": 2.107803700724055, "grad_norm": 0.22603785920846087, "learning_rate": 2.4609192043873387e-06, "loss": 0.0149, "step": 7860 }, { "epoch": 2.108071869133816, "grad_norm": 0.20330812952321403, "learning_rate": 2.4595752807532192e-06, "loss": 0.0102, "step": 7861 }, { "epoch": 2.1083400375435772, "grad_norm": 0.2908241200696346, "learning_rate": 2.458231604473512e-06, "loss": 0.0195, "step": 7862 }, { "epoch": 2.1086082059533386, "grad_norm": 0.22060566708912435, "learning_rate": 2.4568881756790436e-06, "loss": 0.0143, "step": 7863 }, { "epoch": 2.1088763743631, "grad_norm": 0.2576106355817076, "learning_rate": 2.455544994500621e-06, "loss": 0.0153, "step": 7864 }, { "epoch": 2.1091445427728615, "grad_norm": 0.19877036564384692, "learning_rate": 2.4542020610690293e-06, "loss": 0.0096, "step": 7865 }, { "epoch": 2.109412711182623, "grad_norm": 0.36707721080367367, "learning_rate": 2.4528593755150235e-06, "loss": 0.0164, "step": 7866 }, { "epoch": 2.109680879592384, "grad_norm": 0.20635686105251694, "learning_rate": 2.4515169379693387e-06, "loss": 0.0097, "step": 7867 }, { "epoch": 2.1099490480021452, "grad_norm": 0.2171341227359243, "learning_rate": 2.4501747485626808e-06, "loss": 0.0141, "step": 7868 }, { "epoch": 2.1102172164119066, "grad_norm": 0.24348098058460246, "learning_rate": 2.4488328074257385e-06, "loss": 0.0175, "step": 7869 }, { "epoch": 2.110485384821668, "grad_norm": 0.25065814847701606, "learning_rate": 2.447491114689174e-06, "loss": 0.0147, "step": 7870 }, { "epoch": 2.1107535532314294, "grad_norm": 0.2540897462396622, "learning_rate": 2.4461496704836212e-06, "loss": 0.0135, "step": 7871 }, { "epoch": 2.111021721641191, "grad_norm": 0.29617889437252837, "learning_rate": 2.4448084749396966e-06, "loss": 0.0158, "step": 7872 }, { "epoch": 2.111289890050952, "grad_norm": 0.2407685159018953, "learning_rate": 2.443467528187985e-06, "loss": 0.011, "step": 7873 }, { "epoch": 2.111558058460713, "grad_norm": 0.23089395471355048, "learning_rate": 2.4421268303590543e-06, "loss": 0.0145, "step": 7874 }, { "epoch": 2.1118262268704746, "grad_norm": 0.23309342069539835, "learning_rate": 2.4407863815834414e-06, "loss": 0.0188, "step": 7875 }, { "epoch": 2.112094395280236, "grad_norm": 0.3329747006629458, "learning_rate": 2.4394461819916643e-06, "loss": 0.0212, "step": 7876 }, { "epoch": 2.1123625636899974, "grad_norm": 0.2214556082068569, "learning_rate": 2.4381062317142155e-06, "loss": 0.0119, "step": 7877 }, { "epoch": 2.112630732099759, "grad_norm": 0.1957792309757757, "learning_rate": 2.436766530881562e-06, "loss": 0.0103, "step": 7878 }, { "epoch": 2.11289890050952, "grad_norm": 0.2780324052074412, "learning_rate": 2.4354270796241435e-06, "loss": 0.012, "step": 7879 }, { "epoch": 2.113167068919281, "grad_norm": 0.23415876658599571, "learning_rate": 2.434087878072384e-06, "loss": 0.0149, "step": 7880 }, { "epoch": 2.1134352373290426, "grad_norm": 0.2830567756037706, "learning_rate": 2.4327489263566735e-06, "loss": 0.0183, "step": 7881 }, { "epoch": 2.113703405738804, "grad_norm": 0.2626538161238261, "learning_rate": 2.4314102246073857e-06, "loss": 0.0139, "step": 7882 }, { "epoch": 2.1139715741485654, "grad_norm": 0.24968245809550474, "learning_rate": 2.4300717729548627e-06, "loss": 0.0125, "step": 7883 }, { "epoch": 2.114239742558327, "grad_norm": 0.27107280847669807, "learning_rate": 2.428733571529431e-06, "loss": 0.0147, "step": 7884 }, { "epoch": 2.1145079109680878, "grad_norm": 0.29881299769198505, "learning_rate": 2.427395620461382e-06, "loss": 0.017, "step": 7885 }, { "epoch": 2.114776079377849, "grad_norm": 0.2792013123844687, "learning_rate": 2.4260579198809912e-06, "loss": 0.0158, "step": 7886 }, { "epoch": 2.1150442477876106, "grad_norm": 0.25540705407203057, "learning_rate": 2.42472046991851e-06, "loss": 0.0164, "step": 7887 }, { "epoch": 2.115312416197372, "grad_norm": 0.25699479520029667, "learning_rate": 2.4233832707041567e-06, "loss": 0.0197, "step": 7888 }, { "epoch": 2.1155805846071334, "grad_norm": 0.45805036653003944, "learning_rate": 2.4220463223681357e-06, "loss": 0.0285, "step": 7889 }, { "epoch": 2.115848753016895, "grad_norm": 0.20084173007950878, "learning_rate": 2.42070962504062e-06, "loss": 0.011, "step": 7890 }, { "epoch": 2.1161169214266558, "grad_norm": 0.2367780411226924, "learning_rate": 2.419373178851758e-06, "loss": 0.014, "step": 7891 }, { "epoch": 2.116385089836417, "grad_norm": 0.2179794871607164, "learning_rate": 2.4180369839316803e-06, "loss": 0.0154, "step": 7892 }, { "epoch": 2.1166532582461786, "grad_norm": 0.18526240429238738, "learning_rate": 2.4167010404104845e-06, "loss": 0.0144, "step": 7893 }, { "epoch": 2.11692142665594, "grad_norm": 0.2225525873483801, "learning_rate": 2.415365348418252e-06, "loss": 0.0103, "step": 7894 }, { "epoch": 2.1171895950657014, "grad_norm": 0.2237395291312261, "learning_rate": 2.4140299080850317e-06, "loss": 0.0174, "step": 7895 }, { "epoch": 2.1174577634754628, "grad_norm": 0.20714817735279512, "learning_rate": 2.4126947195408528e-06, "loss": 0.0108, "step": 7896 }, { "epoch": 2.1177259318852237, "grad_norm": 0.21782107388237565, "learning_rate": 2.4113597829157226e-06, "loss": 0.0112, "step": 7897 }, { "epoch": 2.117994100294985, "grad_norm": 0.23074007462769786, "learning_rate": 2.4100250983396157e-06, "loss": 0.0127, "step": 7898 }, { "epoch": 2.1182622687047465, "grad_norm": 0.27758486052621467, "learning_rate": 2.4086906659424904e-06, "loss": 0.0089, "step": 7899 }, { "epoch": 2.118530437114508, "grad_norm": 0.3587687430402201, "learning_rate": 2.407356485854273e-06, "loss": 0.0125, "step": 7900 }, { "epoch": 2.1187986055242694, "grad_norm": 0.2513200682101335, "learning_rate": 2.4060225582048734e-06, "loss": 0.0154, "step": 7901 }, { "epoch": 2.1190667739340308, "grad_norm": 0.27459538461571376, "learning_rate": 2.4046888831241704e-06, "loss": 0.0201, "step": 7902 }, { "epoch": 2.1193349423437917, "grad_norm": 0.18576660592158561, "learning_rate": 2.4033554607420186e-06, "loss": 0.0097, "step": 7903 }, { "epoch": 2.119603110753553, "grad_norm": 0.2996524996532301, "learning_rate": 2.4020222911882536e-06, "loss": 0.0148, "step": 7904 }, { "epoch": 2.1198712791633145, "grad_norm": 0.222826924939333, "learning_rate": 2.4006893745926786e-06, "loss": 0.0113, "step": 7905 }, { "epoch": 2.120139447573076, "grad_norm": 0.20557119178983438, "learning_rate": 2.399356711085078e-06, "loss": 0.0127, "step": 7906 }, { "epoch": 2.1204076159828373, "grad_norm": 0.24452158468071297, "learning_rate": 2.398024300795212e-06, "loss": 0.015, "step": 7907 }, { "epoch": 2.1206757843925987, "grad_norm": 0.32798380747471023, "learning_rate": 2.3966921438528095e-06, "loss": 0.0179, "step": 7908 }, { "epoch": 2.1209439528023597, "grad_norm": 0.2233215998080106, "learning_rate": 2.395360240387584e-06, "loss": 0.0127, "step": 7909 }, { "epoch": 2.121212121212121, "grad_norm": 0.28499317142519504, "learning_rate": 2.394028590529214e-06, "loss": 0.0156, "step": 7910 }, { "epoch": 2.1214802896218825, "grad_norm": 0.21155333048068273, "learning_rate": 2.392697194407363e-06, "loss": 0.0088, "step": 7911 }, { "epoch": 2.121748458031644, "grad_norm": 0.19665835875003398, "learning_rate": 2.391366052151663e-06, "loss": 0.0112, "step": 7912 }, { "epoch": 2.1220166264414053, "grad_norm": 0.17652852110713374, "learning_rate": 2.3900351638917243e-06, "loss": 0.008, "step": 7913 }, { "epoch": 2.1222847948511667, "grad_norm": 0.24171013199667543, "learning_rate": 2.388704529757136e-06, "loss": 0.012, "step": 7914 }, { "epoch": 2.1225529632609277, "grad_norm": 0.29826887274469777, "learning_rate": 2.387374149877451e-06, "loss": 0.0152, "step": 7915 }, { "epoch": 2.122821131670689, "grad_norm": 0.3003719743003063, "learning_rate": 2.3860440243822085e-06, "loss": 0.0174, "step": 7916 }, { "epoch": 2.1230893000804505, "grad_norm": 0.2832756450107226, "learning_rate": 2.384714153400922e-06, "loss": 0.019, "step": 7917 }, { "epoch": 2.123357468490212, "grad_norm": 0.30385719804939426, "learning_rate": 2.3833845370630727e-06, "loss": 0.0197, "step": 7918 }, { "epoch": 2.1236256368999733, "grad_norm": 0.2564785470875464, "learning_rate": 2.382055175498126e-06, "loss": 0.0175, "step": 7919 }, { "epoch": 2.1238938053097347, "grad_norm": 0.27910507181896177, "learning_rate": 2.380726068835514e-06, "loss": 0.0164, "step": 7920 }, { "epoch": 2.1241619737194957, "grad_norm": 0.44421601596509386, "learning_rate": 2.379397217204653e-06, "loss": 0.0133, "step": 7921 }, { "epoch": 2.124430142129257, "grad_norm": 0.1992449718667111, "learning_rate": 2.378068620734926e-06, "loss": 0.0104, "step": 7922 }, { "epoch": 2.1246983105390185, "grad_norm": 0.25798960941600607, "learning_rate": 2.3767402795556953e-06, "loss": 0.0111, "step": 7923 }, { "epoch": 2.12496647894878, "grad_norm": 0.3322222649260662, "learning_rate": 2.375412193796301e-06, "loss": 0.0142, "step": 7924 }, { "epoch": 2.1252346473585413, "grad_norm": 0.33046065382713263, "learning_rate": 2.3740843635860515e-06, "loss": 0.0148, "step": 7925 }, { "epoch": 2.1255028157683027, "grad_norm": 0.2904105108956896, "learning_rate": 2.3727567890542396e-06, "loss": 0.018, "step": 7926 }, { "epoch": 2.1257709841780636, "grad_norm": 0.3750946399741476, "learning_rate": 2.3714294703301203e-06, "loss": 0.015, "step": 7927 }, { "epoch": 2.126039152587825, "grad_norm": 0.22133139515784714, "learning_rate": 2.3701024075429353e-06, "loss": 0.0136, "step": 7928 }, { "epoch": 2.1263073209975865, "grad_norm": 0.2962988690057557, "learning_rate": 2.3687756008218978e-06, "loss": 0.0176, "step": 7929 }, { "epoch": 2.126575489407348, "grad_norm": 0.23685636651246786, "learning_rate": 2.367449050296193e-06, "loss": 0.0148, "step": 7930 }, { "epoch": 2.1268436578171093, "grad_norm": 0.2779211132296686, "learning_rate": 2.3661227560949855e-06, "loss": 0.0193, "step": 7931 }, { "epoch": 2.1271118262268707, "grad_norm": 0.28342744533710484, "learning_rate": 2.3647967183474118e-06, "loss": 0.0194, "step": 7932 }, { "epoch": 2.1273799946366316, "grad_norm": 0.31928777118279994, "learning_rate": 2.363470937182585e-06, "loss": 0.0117, "step": 7933 }, { "epoch": 2.127648163046393, "grad_norm": 0.28861081928734145, "learning_rate": 2.362145412729595e-06, "loss": 0.0129, "step": 7934 }, { "epoch": 2.1279163314561544, "grad_norm": 0.20055476340124134, "learning_rate": 2.3608201451175004e-06, "loss": 0.0091, "step": 7935 }, { "epoch": 2.128184499865916, "grad_norm": 0.16959510091002783, "learning_rate": 2.3594951344753443e-06, "loss": 0.008, "step": 7936 }, { "epoch": 2.1284526682756773, "grad_norm": 0.32126133917708066, "learning_rate": 2.3581703809321338e-06, "loss": 0.0175, "step": 7937 }, { "epoch": 2.1287208366854387, "grad_norm": 0.4955649652298568, "learning_rate": 2.356845884616862e-06, "loss": 0.0217, "step": 7938 }, { "epoch": 2.1289890050951996, "grad_norm": 0.5623919626805461, "learning_rate": 2.3555216456584883e-06, "loss": 0.0334, "step": 7939 }, { "epoch": 2.129257173504961, "grad_norm": 0.2709677742092666, "learning_rate": 2.3541976641859494e-06, "loss": 0.0162, "step": 7940 }, { "epoch": 2.1295253419147224, "grad_norm": 0.30091657535775834, "learning_rate": 2.3528739403281616e-06, "loss": 0.0172, "step": 7941 }, { "epoch": 2.129793510324484, "grad_norm": 0.3937953634404663, "learning_rate": 2.3515504742140077e-06, "loss": 0.0209, "step": 7942 }, { "epoch": 2.1300616787342452, "grad_norm": 0.2676133334986401, "learning_rate": 2.3502272659723554e-06, "loss": 0.0142, "step": 7943 }, { "epoch": 2.1303298471440066, "grad_norm": 0.2446688247433252, "learning_rate": 2.348904315732036e-06, "loss": 0.0103, "step": 7944 }, { "epoch": 2.1305980155537676, "grad_norm": 0.351231810117543, "learning_rate": 2.3475816236218663e-06, "loss": 0.0201, "step": 7945 }, { "epoch": 2.130866183963529, "grad_norm": 0.32394632780185834, "learning_rate": 2.3462591897706327e-06, "loss": 0.0155, "step": 7946 }, { "epoch": 2.1311343523732904, "grad_norm": 0.2429997740011695, "learning_rate": 2.3449370143070948e-06, "loss": 0.0095, "step": 7947 }, { "epoch": 2.131402520783052, "grad_norm": 0.25634774073929834, "learning_rate": 2.343615097359993e-06, "loss": 0.0145, "step": 7948 }, { "epoch": 2.131670689192813, "grad_norm": 0.2167753492891867, "learning_rate": 2.3422934390580345e-06, "loss": 0.0109, "step": 7949 }, { "epoch": 2.1319388576025746, "grad_norm": 0.3045759647870103, "learning_rate": 2.340972039529909e-06, "loss": 0.0173, "step": 7950 }, { "epoch": 2.1322070260123356, "grad_norm": 0.5835128277045868, "learning_rate": 2.339650898904277e-06, "loss": 0.0188, "step": 7951 }, { "epoch": 2.132475194422097, "grad_norm": 0.4367412297104279, "learning_rate": 2.3383300173097723e-06, "loss": 0.0235, "step": 7952 }, { "epoch": 2.1327433628318584, "grad_norm": 0.25357733879432376, "learning_rate": 2.337009394875009e-06, "loss": 0.0162, "step": 7953 }, { "epoch": 2.13301153124162, "grad_norm": 0.22250726293856296, "learning_rate": 2.3356890317285687e-06, "loss": 0.0129, "step": 7954 }, { "epoch": 2.133279699651381, "grad_norm": 0.3782824619425818, "learning_rate": 2.334368927999013e-06, "loss": 0.0194, "step": 7955 }, { "epoch": 2.1335478680611426, "grad_norm": 0.3218885402518952, "learning_rate": 2.3330490838148806e-06, "loss": 0.0148, "step": 7956 }, { "epoch": 2.1338160364709036, "grad_norm": 0.2479602933161318, "learning_rate": 2.3317294993046765e-06, "loss": 0.0129, "step": 7957 }, { "epoch": 2.134084204880665, "grad_norm": 0.22774505833059555, "learning_rate": 2.330410174596888e-06, "loss": 0.0103, "step": 7958 }, { "epoch": 2.1343523732904264, "grad_norm": 0.3026791635196307, "learning_rate": 2.329091109819972e-06, "loss": 0.0291, "step": 7959 }, { "epoch": 2.134620541700188, "grad_norm": 0.648516341782001, "learning_rate": 2.327772305102365e-06, "loss": 0.0197, "step": 7960 }, { "epoch": 2.134888710109949, "grad_norm": 0.3180973921918388, "learning_rate": 2.3264537605724724e-06, "loss": 0.0109, "step": 7961 }, { "epoch": 2.1351568785197106, "grad_norm": 0.19322295606814613, "learning_rate": 2.3251354763586786e-06, "loss": 0.0105, "step": 7962 }, { "epoch": 2.1354250469294715, "grad_norm": 0.5242891891270779, "learning_rate": 2.323817452589346e-06, "loss": 0.0172, "step": 7963 }, { "epoch": 2.135693215339233, "grad_norm": 0.2889423331338352, "learning_rate": 2.322499689392799e-06, "loss": 0.0149, "step": 7964 }, { "epoch": 2.1359613837489944, "grad_norm": 0.27990676759463123, "learning_rate": 2.3211821868973487e-06, "loss": 0.0138, "step": 7965 }, { "epoch": 2.1362295521587558, "grad_norm": 0.30288348465110804, "learning_rate": 2.3198649452312783e-06, "loss": 0.0179, "step": 7966 }, { "epoch": 2.136497720568517, "grad_norm": 0.16490793946641785, "learning_rate": 2.318547964522841e-06, "loss": 0.0116, "step": 7967 }, { "epoch": 2.136765888978278, "grad_norm": 0.21618514503702513, "learning_rate": 2.317231244900271e-06, "loss": 0.014, "step": 7968 }, { "epoch": 2.1370340573880395, "grad_norm": 0.2245955543113481, "learning_rate": 2.31591478649177e-06, "loss": 0.0118, "step": 7969 }, { "epoch": 2.137302225797801, "grad_norm": 0.20190763900114359, "learning_rate": 2.3145985894255228e-06, "loss": 0.0116, "step": 7970 }, { "epoch": 2.1375703942075623, "grad_norm": 0.24431367920414004, "learning_rate": 2.313282653829679e-06, "loss": 0.0088, "step": 7971 }, { "epoch": 2.1378385626173237, "grad_norm": 0.225336851625098, "learning_rate": 2.31196697983237e-06, "loss": 0.0162, "step": 7972 }, { "epoch": 2.138106731027085, "grad_norm": 0.24235146202362276, "learning_rate": 2.3106515675617014e-06, "loss": 0.0148, "step": 7973 }, { "epoch": 2.1383748994368466, "grad_norm": 0.2595781778976322, "learning_rate": 2.3093364171457476e-06, "loss": 0.014, "step": 7974 }, { "epoch": 2.1386430678466075, "grad_norm": 0.20727620067291733, "learning_rate": 2.3080215287125655e-06, "loss": 0.0129, "step": 7975 }, { "epoch": 2.138911236256369, "grad_norm": 0.5119276331929072, "learning_rate": 2.30670690239018e-06, "loss": 0.0163, "step": 7976 }, { "epoch": 2.1391794046661303, "grad_norm": 0.2443581168711156, "learning_rate": 2.305392538306591e-06, "loss": 0.0142, "step": 7977 }, { "epoch": 2.1394475730758917, "grad_norm": 0.20444266188957452, "learning_rate": 2.3040784365897783e-06, "loss": 0.0133, "step": 7978 }, { "epoch": 2.139715741485653, "grad_norm": 0.3315969333253959, "learning_rate": 2.3027645973676894e-06, "loss": 0.0221, "step": 7979 }, { "epoch": 2.139983909895414, "grad_norm": 0.24608245859256378, "learning_rate": 2.301451020768252e-06, "loss": 0.0135, "step": 7980 }, { "epoch": 2.1402520783051755, "grad_norm": 0.21084460797230892, "learning_rate": 2.300137706919362e-06, "loss": 0.0146, "step": 7981 }, { "epoch": 2.140520246714937, "grad_norm": 0.39647172003924136, "learning_rate": 2.298824655948896e-06, "loss": 0.0199, "step": 7982 }, { "epoch": 2.1407884151246983, "grad_norm": 0.23181260270623663, "learning_rate": 2.297511867984703e-06, "loss": 0.0123, "step": 7983 }, { "epoch": 2.1410565835344597, "grad_norm": 0.2510461195579083, "learning_rate": 2.296199343154603e-06, "loss": 0.0134, "step": 7984 }, { "epoch": 2.141324751944221, "grad_norm": 0.255312553623265, "learning_rate": 2.2948870815863965e-06, "loss": 0.0176, "step": 7985 }, { "epoch": 2.1415929203539825, "grad_norm": 0.3179942836296037, "learning_rate": 2.293575083407851e-06, "loss": 0.0244, "step": 7986 }, { "epoch": 2.1418610887637435, "grad_norm": 0.2797568873496874, "learning_rate": 2.292263348746716e-06, "loss": 0.0211, "step": 7987 }, { "epoch": 2.142129257173505, "grad_norm": 0.23380182424407198, "learning_rate": 2.29095187773071e-06, "loss": 0.016, "step": 7988 }, { "epoch": 2.1423974255832663, "grad_norm": 0.22688390934712127, "learning_rate": 2.289640670487525e-06, "loss": 0.0129, "step": 7989 }, { "epoch": 2.1426655939930277, "grad_norm": 0.29219539544329115, "learning_rate": 2.288329727144834e-06, "loss": 0.0241, "step": 7990 }, { "epoch": 2.142933762402789, "grad_norm": 0.3458336592294186, "learning_rate": 2.2870190478302766e-06, "loss": 0.0188, "step": 7991 }, { "epoch": 2.14320193081255, "grad_norm": 0.34207450175899656, "learning_rate": 2.2857086326714718e-06, "loss": 0.0154, "step": 7992 }, { "epoch": 2.1434700992223115, "grad_norm": 0.2231714773757065, "learning_rate": 2.2843984817960123e-06, "loss": 0.0145, "step": 7993 }, { "epoch": 2.143738267632073, "grad_norm": 0.27455225666629635, "learning_rate": 2.2830885953314614e-06, "loss": 0.0165, "step": 7994 }, { "epoch": 2.1440064360418343, "grad_norm": 0.23481039828751385, "learning_rate": 2.2817789734053626e-06, "loss": 0.0128, "step": 7995 }, { "epoch": 2.1442746044515957, "grad_norm": 0.2691035594236558, "learning_rate": 2.2804696161452267e-06, "loss": 0.0129, "step": 7996 }, { "epoch": 2.144542772861357, "grad_norm": 0.24946058396114074, "learning_rate": 2.279160523678546e-06, "loss": 0.0166, "step": 7997 }, { "epoch": 2.1448109412711185, "grad_norm": 0.23980744038236382, "learning_rate": 2.277851696132779e-06, "loss": 0.0111, "step": 7998 }, { "epoch": 2.1450791096808794, "grad_norm": 0.2706526655985602, "learning_rate": 2.2765431336353673e-06, "loss": 0.0157, "step": 7999 }, { "epoch": 2.145347278090641, "grad_norm": 0.23442760280382924, "learning_rate": 2.27523483631372e-06, "loss": 0.0133, "step": 8000 }, { "epoch": 2.1456154465004023, "grad_norm": 0.24755454079045205, "learning_rate": 2.273926804295221e-06, "loss": 0.0122, "step": 8001 }, { "epoch": 2.1458836149101637, "grad_norm": 0.346717729662209, "learning_rate": 2.272619037707231e-06, "loss": 0.016, "step": 8002 }, { "epoch": 2.146151783319925, "grad_norm": 0.20501631531767148, "learning_rate": 2.2713115366770865e-06, "loss": 0.0125, "step": 8003 }, { "epoch": 2.146419951729686, "grad_norm": 0.23392242412954342, "learning_rate": 2.2700043013320912e-06, "loss": 0.0155, "step": 8004 }, { "epoch": 2.1466881201394474, "grad_norm": 0.26199723562550337, "learning_rate": 2.268697331799531e-06, "loss": 0.0184, "step": 8005 }, { "epoch": 2.146956288549209, "grad_norm": 0.29942856671827195, "learning_rate": 2.2673906282066582e-06, "loss": 0.0149, "step": 8006 }, { "epoch": 2.1472244569589702, "grad_norm": 0.1804427056572445, "learning_rate": 2.266084190680707e-06, "loss": 0.0115, "step": 8007 }, { "epoch": 2.1474926253687316, "grad_norm": 0.20302925191029045, "learning_rate": 2.264778019348878e-06, "loss": 0.0119, "step": 8008 }, { "epoch": 2.147760793778493, "grad_norm": 0.2621412986424994, "learning_rate": 2.263472114338351e-06, "loss": 0.0134, "step": 8009 }, { "epoch": 2.1480289621882545, "grad_norm": 0.21007018715899442, "learning_rate": 2.262166475776281e-06, "loss": 0.0133, "step": 8010 }, { "epoch": 2.1482971305980154, "grad_norm": 0.2674813162364021, "learning_rate": 2.260861103789792e-06, "loss": 0.0103, "step": 8011 }, { "epoch": 2.148565299007777, "grad_norm": 0.20911402696491696, "learning_rate": 2.2595559985059834e-06, "loss": 0.0136, "step": 8012 }, { "epoch": 2.1488334674175382, "grad_norm": 0.14423736370677825, "learning_rate": 2.2582511600519336e-06, "loss": 0.008, "step": 8013 }, { "epoch": 2.1491016358272996, "grad_norm": 0.2528889306427548, "learning_rate": 2.256946588554687e-06, "loss": 0.0152, "step": 8014 }, { "epoch": 2.149369804237061, "grad_norm": 0.3306481744455614, "learning_rate": 2.25564228414127e-06, "loss": 0.0161, "step": 8015 }, { "epoch": 2.149637972646822, "grad_norm": 0.2721051172761682, "learning_rate": 2.2543382469386766e-06, "loss": 0.0163, "step": 8016 }, { "epoch": 2.1499061410565834, "grad_norm": 0.2645572583329774, "learning_rate": 2.25303447707388e-06, "loss": 0.0148, "step": 8017 }, { "epoch": 2.150174309466345, "grad_norm": 0.21385219535839486, "learning_rate": 2.2517309746738214e-06, "loss": 0.0153, "step": 8018 }, { "epoch": 2.150442477876106, "grad_norm": 0.2792873192621192, "learning_rate": 2.250427739865421e-06, "loss": 0.015, "step": 8019 }, { "epoch": 2.1507106462858676, "grad_norm": 0.25895326669984053, "learning_rate": 2.2491247727755737e-06, "loss": 0.0143, "step": 8020 }, { "epoch": 2.150978814695629, "grad_norm": 0.25932999380094524, "learning_rate": 2.2478220735311424e-06, "loss": 0.013, "step": 8021 }, { "epoch": 2.1512469831053904, "grad_norm": 0.2593156958550851, "learning_rate": 2.24651964225897e-06, "loss": 0.0174, "step": 8022 }, { "epoch": 2.1515151515151514, "grad_norm": 0.2604807961204911, "learning_rate": 2.2452174790858706e-06, "loss": 0.0164, "step": 8023 }, { "epoch": 2.151783319924913, "grad_norm": 0.2493312177837043, "learning_rate": 2.243915584138629e-06, "loss": 0.0184, "step": 8024 }, { "epoch": 2.152051488334674, "grad_norm": 0.21246497462815678, "learning_rate": 2.2426139575440116e-06, "loss": 0.0082, "step": 8025 }, { "epoch": 2.1523196567444356, "grad_norm": 0.2339212278506901, "learning_rate": 2.2413125994287504e-06, "loss": 0.0144, "step": 8026 }, { "epoch": 2.152587825154197, "grad_norm": 0.17725549221880427, "learning_rate": 2.2400115099195596e-06, "loss": 0.0087, "step": 8027 }, { "epoch": 2.152855993563958, "grad_norm": 0.22399268231120895, "learning_rate": 2.238710689143118e-06, "loss": 0.0093, "step": 8028 }, { "epoch": 2.1531241619737194, "grad_norm": 0.3159453661187273, "learning_rate": 2.237410137226086e-06, "loss": 0.0153, "step": 8029 }, { "epoch": 2.1533923303834808, "grad_norm": 0.2663617673880538, "learning_rate": 2.236109854295096e-06, "loss": 0.0187, "step": 8030 }, { "epoch": 2.153660498793242, "grad_norm": 0.30266677446731355, "learning_rate": 2.23480984047675e-06, "loss": 0.0142, "step": 8031 }, { "epoch": 2.1539286672030036, "grad_norm": 0.2333768598900261, "learning_rate": 2.23351009589763e-06, "loss": 0.0142, "step": 8032 }, { "epoch": 2.154196835612765, "grad_norm": 0.271393479351756, "learning_rate": 2.232210620684285e-06, "loss": 0.0136, "step": 8033 }, { "epoch": 2.154465004022526, "grad_norm": 0.2277806590665096, "learning_rate": 2.2309114149632457e-06, "loss": 0.0127, "step": 8034 }, { "epoch": 2.1547331724322873, "grad_norm": 0.2914395079232499, "learning_rate": 2.22961247886101e-06, "loss": 0.0248, "step": 8035 }, { "epoch": 2.1550013408420488, "grad_norm": 0.3188716537596488, "learning_rate": 2.22831381250405e-06, "loss": 0.0172, "step": 8036 }, { "epoch": 2.15526950925181, "grad_norm": 0.30413745488996374, "learning_rate": 2.2270154160188173e-06, "loss": 0.0184, "step": 8037 }, { "epoch": 2.1555376776615716, "grad_norm": 0.2674757566824522, "learning_rate": 2.2257172895317297e-06, "loss": 0.0144, "step": 8038 }, { "epoch": 2.155805846071333, "grad_norm": 0.279516692796293, "learning_rate": 2.224419433169186e-06, "loss": 0.0137, "step": 8039 }, { "epoch": 2.156074014481094, "grad_norm": 0.292076871831746, "learning_rate": 2.223121847057551e-06, "loss": 0.0139, "step": 8040 }, { "epoch": 2.1563421828908553, "grad_norm": 0.27152074566572293, "learning_rate": 2.2218245313231695e-06, "loss": 0.0154, "step": 8041 }, { "epoch": 2.1566103513006167, "grad_norm": 0.2611595643577145, "learning_rate": 2.2205274860923593e-06, "loss": 0.0144, "step": 8042 }, { "epoch": 2.156878519710378, "grad_norm": 0.36880324313185925, "learning_rate": 2.219230711491406e-06, "loss": 0.013, "step": 8043 }, { "epoch": 2.1571466881201395, "grad_norm": 0.24430947261024766, "learning_rate": 2.217934207646578e-06, "loss": 0.0127, "step": 8044 }, { "epoch": 2.157414856529901, "grad_norm": 0.24704968214928016, "learning_rate": 2.2166379746841083e-06, "loss": 0.0169, "step": 8045 }, { "epoch": 2.157683024939662, "grad_norm": 0.46221212444762366, "learning_rate": 2.215342012730209e-06, "loss": 0.0193, "step": 8046 }, { "epoch": 2.1579511933494233, "grad_norm": 0.3173248004275007, "learning_rate": 2.214046321911068e-06, "loss": 0.0198, "step": 8047 }, { "epoch": 2.1582193617591847, "grad_norm": 0.2107823871214829, "learning_rate": 2.212750902352837e-06, "loss": 0.0112, "step": 8048 }, { "epoch": 2.158487530168946, "grad_norm": 0.24004260338399516, "learning_rate": 2.2114557541816523e-06, "loss": 0.018, "step": 8049 }, { "epoch": 2.1587556985787075, "grad_norm": 0.23036593168895655, "learning_rate": 2.2101608775236153e-06, "loss": 0.0147, "step": 8050 }, { "epoch": 2.159023866988469, "grad_norm": 0.2747771137780415, "learning_rate": 2.2088662725048073e-06, "loss": 0.0261, "step": 8051 }, { "epoch": 2.15929203539823, "grad_norm": 0.3259847910062458, "learning_rate": 2.207571939251281e-06, "loss": 0.0171, "step": 8052 }, { "epoch": 2.1595602038079913, "grad_norm": 0.24175376801123527, "learning_rate": 2.2062778778890602e-06, "loss": 0.019, "step": 8053 }, { "epoch": 2.1598283722177527, "grad_norm": 0.24278409968790635, "learning_rate": 2.204984088544147e-06, "loss": 0.014, "step": 8054 }, { "epoch": 2.160096540627514, "grad_norm": 0.19992885679243982, "learning_rate": 2.2036905713425104e-06, "loss": 0.0115, "step": 8055 }, { "epoch": 2.1603647090372755, "grad_norm": 0.18310941914294204, "learning_rate": 2.2023973264101004e-06, "loss": 0.009, "step": 8056 }, { "epoch": 2.160632877447037, "grad_norm": 0.22197581903653207, "learning_rate": 2.2011043538728344e-06, "loss": 0.0131, "step": 8057 }, { "epoch": 2.160901045856798, "grad_norm": 0.16658008590452722, "learning_rate": 2.1998116538566056e-06, "loss": 0.0099, "step": 8058 }, { "epoch": 2.1611692142665593, "grad_norm": 0.3532977027193072, "learning_rate": 2.1985192264872856e-06, "loss": 0.0163, "step": 8059 }, { "epoch": 2.1614373826763207, "grad_norm": 0.21228090810477732, "learning_rate": 2.197227071890707e-06, "loss": 0.0114, "step": 8060 }, { "epoch": 2.161705551086082, "grad_norm": 0.18768895141799835, "learning_rate": 2.1959351901926873e-06, "loss": 0.0077, "step": 8061 }, { "epoch": 2.1619737194958435, "grad_norm": 0.19056166787115558, "learning_rate": 2.1946435815190153e-06, "loss": 0.0091, "step": 8062 }, { "epoch": 2.162241887905605, "grad_norm": 0.2591324893431378, "learning_rate": 2.1933522459954483e-06, "loss": 0.0116, "step": 8063 }, { "epoch": 2.162510056315366, "grad_norm": 0.2882499909428108, "learning_rate": 2.192061183747723e-06, "loss": 0.0179, "step": 8064 }, { "epoch": 2.1627782247251273, "grad_norm": 0.24574804368498632, "learning_rate": 2.190770394901543e-06, "loss": 0.017, "step": 8065 }, { "epoch": 2.1630463931348887, "grad_norm": 0.23303605949339473, "learning_rate": 2.189479879582594e-06, "loss": 0.0153, "step": 8066 }, { "epoch": 2.16331456154465, "grad_norm": 0.3211862668699343, "learning_rate": 2.1881896379165253e-06, "loss": 0.0123, "step": 8067 }, { "epoch": 2.1635827299544115, "grad_norm": 0.4703572603519183, "learning_rate": 2.1868996700289656e-06, "loss": 0.0162, "step": 8068 }, { "epoch": 2.163850898364173, "grad_norm": 0.254921749153243, "learning_rate": 2.185609976045519e-06, "loss": 0.0145, "step": 8069 }, { "epoch": 2.164119066773934, "grad_norm": 0.22625737318032926, "learning_rate": 2.1843205560917544e-06, "loss": 0.0116, "step": 8070 }, { "epoch": 2.1643872351836952, "grad_norm": 0.2526296210606797, "learning_rate": 2.1830314102932245e-06, "loss": 0.0142, "step": 8071 }, { "epoch": 2.1646554035934567, "grad_norm": 0.23607886916562654, "learning_rate": 2.181742538775447e-06, "loss": 0.0134, "step": 8072 }, { "epoch": 2.164923572003218, "grad_norm": 0.25202856715482796, "learning_rate": 2.1804539416639144e-06, "loss": 0.0149, "step": 8073 }, { "epoch": 2.1651917404129795, "grad_norm": 0.20994674657045104, "learning_rate": 2.179165619084097e-06, "loss": 0.0122, "step": 8074 }, { "epoch": 2.165459908822741, "grad_norm": 0.269923987967757, "learning_rate": 2.177877571161433e-06, "loss": 0.0125, "step": 8075 }, { "epoch": 2.165728077232502, "grad_norm": 0.2191229608400655, "learning_rate": 2.1765897980213386e-06, "loss": 0.0083, "step": 8076 }, { "epoch": 2.1659962456422632, "grad_norm": 0.2612854820787141, "learning_rate": 2.175302299789198e-06, "loss": 0.0283, "step": 8077 }, { "epoch": 2.1662644140520246, "grad_norm": 0.2165316065735465, "learning_rate": 2.174015076590373e-06, "loss": 0.0103, "step": 8078 }, { "epoch": 2.166532582461786, "grad_norm": 0.15655966155581802, "learning_rate": 2.172728128550199e-06, "loss": 0.0078, "step": 8079 }, { "epoch": 2.1668007508715474, "grad_norm": 0.3866620479020673, "learning_rate": 2.171441455793979e-06, "loss": 0.0118, "step": 8080 }, { "epoch": 2.167068919281309, "grad_norm": 0.2909887317256784, "learning_rate": 2.1701550584469965e-06, "loss": 0.0131, "step": 8081 }, { "epoch": 2.16733708769107, "grad_norm": 0.27286103094111597, "learning_rate": 2.168868936634501e-06, "loss": 0.0139, "step": 8082 }, { "epoch": 2.167605256100831, "grad_norm": 0.2843428855868537, "learning_rate": 2.1675830904817225e-06, "loss": 0.021, "step": 8083 }, { "epoch": 2.1678734245105926, "grad_norm": 0.22651028098052212, "learning_rate": 2.166297520113859e-06, "loss": 0.0142, "step": 8084 }, { "epoch": 2.168141592920354, "grad_norm": 0.22142482329587104, "learning_rate": 2.16501222565608e-06, "loss": 0.0099, "step": 8085 }, { "epoch": 2.1684097613301154, "grad_norm": 0.582591718091082, "learning_rate": 2.1637272072335366e-06, "loss": 0.0111, "step": 8086 }, { "epoch": 2.168677929739877, "grad_norm": 0.24074231755190728, "learning_rate": 2.162442464971342e-06, "loss": 0.0119, "step": 8087 }, { "epoch": 2.168946098149638, "grad_norm": 0.31821855727512416, "learning_rate": 2.161157998994592e-06, "loss": 0.0174, "step": 8088 }, { "epoch": 2.169214266559399, "grad_norm": 0.2907407892989868, "learning_rate": 2.1598738094283522e-06, "loss": 0.0162, "step": 8089 }, { "epoch": 2.1694824349691606, "grad_norm": 0.35207504529931855, "learning_rate": 2.158589896397657e-06, "loss": 0.0117, "step": 8090 }, { "epoch": 2.169750603378922, "grad_norm": 0.20976647789842587, "learning_rate": 2.1573062600275217e-06, "loss": 0.011, "step": 8091 }, { "epoch": 2.1700187717886834, "grad_norm": 0.24300854195882127, "learning_rate": 2.1560229004429273e-06, "loss": 0.0156, "step": 8092 }, { "epoch": 2.170286940198445, "grad_norm": 0.2050031651045972, "learning_rate": 2.154739817768834e-06, "loss": 0.0138, "step": 8093 }, { "epoch": 2.1705551086082058, "grad_norm": 0.24676933215122548, "learning_rate": 2.1534570121301683e-06, "loss": 0.0133, "step": 8094 }, { "epoch": 2.170823277017967, "grad_norm": 0.2226896173331888, "learning_rate": 2.1521744836518365e-06, "loss": 0.0088, "step": 8095 }, { "epoch": 2.1710914454277286, "grad_norm": 0.29122906277162347, "learning_rate": 2.1508922324587177e-06, "loss": 0.0197, "step": 8096 }, { "epoch": 2.17135961383749, "grad_norm": 0.22331543555203767, "learning_rate": 2.149610258675655e-06, "loss": 0.0149, "step": 8097 }, { "epoch": 2.1716277822472514, "grad_norm": 0.2484241043775842, "learning_rate": 2.148328562427473e-06, "loss": 0.0139, "step": 8098 }, { "epoch": 2.171895950657013, "grad_norm": 0.29761822904626356, "learning_rate": 2.14704714383897e-06, "loss": 0.0194, "step": 8099 }, { "epoch": 2.1721641190667738, "grad_norm": 0.29306204871437325, "learning_rate": 2.1457660030349103e-06, "loss": 0.0216, "step": 8100 }, { "epoch": 2.172432287476535, "grad_norm": 0.21978483611076352, "learning_rate": 2.144485140140039e-06, "loss": 0.0106, "step": 8101 }, { "epoch": 2.1727004558862966, "grad_norm": 0.22035098992198182, "learning_rate": 2.1432045552790666e-06, "loss": 0.0154, "step": 8102 }, { "epoch": 2.172968624296058, "grad_norm": 0.3790008117227063, "learning_rate": 2.1419242485766834e-06, "loss": 0.0276, "step": 8103 }, { "epoch": 2.1732367927058194, "grad_norm": 0.24058145481387644, "learning_rate": 2.1406442201575466e-06, "loss": 0.0147, "step": 8104 }, { "epoch": 2.173504961115581, "grad_norm": 0.19604162265155758, "learning_rate": 2.1393644701462905e-06, "loss": 0.0141, "step": 8105 }, { "epoch": 2.1737731295253417, "grad_norm": 0.3256522277797495, "learning_rate": 2.1380849986675233e-06, "loss": 0.0221, "step": 8106 }, { "epoch": 2.174041297935103, "grad_norm": 0.2363901596272482, "learning_rate": 2.136805805845819e-06, "loss": 0.0164, "step": 8107 }, { "epoch": 2.1743094663448645, "grad_norm": 0.23017513138042064, "learning_rate": 2.135526891805734e-06, "loss": 0.0133, "step": 8108 }, { "epoch": 2.174577634754626, "grad_norm": 0.20859311870996733, "learning_rate": 2.134248256671791e-06, "loss": 0.0098, "step": 8109 }, { "epoch": 2.1748458031643874, "grad_norm": 0.22378126404571747, "learning_rate": 2.132969900568484e-06, "loss": 0.0106, "step": 8110 }, { "epoch": 2.1751139715741488, "grad_norm": 0.24085432496567885, "learning_rate": 2.131691823620288e-06, "loss": 0.0135, "step": 8111 }, { "epoch": 2.1753821399839097, "grad_norm": 0.25274668792987276, "learning_rate": 2.1304140259516427e-06, "loss": 0.0158, "step": 8112 }, { "epoch": 2.175650308393671, "grad_norm": 0.34328649584052745, "learning_rate": 2.129136507686967e-06, "loss": 0.0134, "step": 8113 }, { "epoch": 2.1759184768034325, "grad_norm": 0.23421274126571387, "learning_rate": 2.127859268950645e-06, "loss": 0.0139, "step": 8114 }, { "epoch": 2.176186645213194, "grad_norm": 0.20872766785013253, "learning_rate": 2.126582309867041e-06, "loss": 0.0127, "step": 8115 }, { "epoch": 2.1764548136229553, "grad_norm": 0.22089091539277672, "learning_rate": 2.1253056305604903e-06, "loss": 0.0121, "step": 8116 }, { "epoch": 2.1767229820327167, "grad_norm": 0.2025494617496406, "learning_rate": 2.124029231155297e-06, "loss": 0.0114, "step": 8117 }, { "epoch": 2.1769911504424777, "grad_norm": 0.2348462942934537, "learning_rate": 2.122753111775743e-06, "loss": 0.0127, "step": 8118 }, { "epoch": 2.177259318852239, "grad_norm": 0.31586737831969236, "learning_rate": 2.1214772725460776e-06, "loss": 0.0217, "step": 8119 }, { "epoch": 2.1775274872620005, "grad_norm": 0.27694363575770603, "learning_rate": 2.12020171359053e-06, "loss": 0.0169, "step": 8120 }, { "epoch": 2.177795655671762, "grad_norm": 0.2528695585686696, "learning_rate": 2.118926435033296e-06, "loss": 0.0147, "step": 8121 }, { "epoch": 2.1780638240815233, "grad_norm": 0.21992149851146608, "learning_rate": 2.117651436998544e-06, "loss": 0.0137, "step": 8122 }, { "epoch": 2.1783319924912847, "grad_norm": 0.7633392282861222, "learning_rate": 2.1163767196104208e-06, "loss": 0.0204, "step": 8123 }, { "epoch": 2.1786001609010457, "grad_norm": 0.23054099712098766, "learning_rate": 2.115102282993039e-06, "loss": 0.0091, "step": 8124 }, { "epoch": 2.178868329310807, "grad_norm": 0.24569170686263575, "learning_rate": 2.1138281272704886e-06, "loss": 0.0142, "step": 8125 }, { "epoch": 2.1791364977205685, "grad_norm": 0.213678830825019, "learning_rate": 2.1125542525668323e-06, "loss": 0.011, "step": 8126 }, { "epoch": 2.17940466613033, "grad_norm": 0.41499181336326546, "learning_rate": 2.1112806590061006e-06, "loss": 0.0136, "step": 8127 }, { "epoch": 2.1796728345400913, "grad_norm": 0.22999885824234215, "learning_rate": 2.1100073467123038e-06, "loss": 0.0153, "step": 8128 }, { "epoch": 2.1799410029498527, "grad_norm": 0.26293442359681457, "learning_rate": 2.108734315809417e-06, "loss": 0.0131, "step": 8129 }, { "epoch": 2.1802091713596137, "grad_norm": 0.23219623234530773, "learning_rate": 2.1074615664213954e-06, "loss": 0.0133, "step": 8130 }, { "epoch": 2.180477339769375, "grad_norm": 0.26524067834774817, "learning_rate": 2.10618909867216e-06, "loss": 0.0096, "step": 8131 }, { "epoch": 2.1807455081791365, "grad_norm": 0.19115672063442035, "learning_rate": 2.104916912685611e-06, "loss": 0.0084, "step": 8132 }, { "epoch": 2.181013676588898, "grad_norm": 0.20204860780242606, "learning_rate": 2.103645008585615e-06, "loss": 0.0086, "step": 8133 }, { "epoch": 2.1812818449986593, "grad_norm": 0.36618569586391453, "learning_rate": 2.102373386496014e-06, "loss": 0.0253, "step": 8134 }, { "epoch": 2.1815500134084207, "grad_norm": 0.1988264697649137, "learning_rate": 2.1011020465406252e-06, "loss": 0.0089, "step": 8135 }, { "epoch": 2.1818181818181817, "grad_norm": 0.28269542428198746, "learning_rate": 2.099830988843231e-06, "loss": 0.0198, "step": 8136 }, { "epoch": 2.182086350227943, "grad_norm": 0.2660115767027406, "learning_rate": 2.0985602135275945e-06, "loss": 0.0136, "step": 8137 }, { "epoch": 2.1823545186377045, "grad_norm": 0.2717872721489681, "learning_rate": 2.097289720717448e-06, "loss": 0.0112, "step": 8138 }, { "epoch": 2.182622687047466, "grad_norm": 0.1925056333761443, "learning_rate": 2.0960195105364935e-06, "loss": 0.0098, "step": 8139 }, { "epoch": 2.1828908554572273, "grad_norm": 0.31810596385654494, "learning_rate": 2.094749583108411e-06, "loss": 0.0232, "step": 8140 }, { "epoch": 2.1831590238669882, "grad_norm": 0.24952955307448485, "learning_rate": 2.0934799385568465e-06, "loss": 0.015, "step": 8141 }, { "epoch": 2.1834271922767496, "grad_norm": 0.2425271445499934, "learning_rate": 2.0922105770054233e-06, "loss": 0.0164, "step": 8142 }, { "epoch": 2.183695360686511, "grad_norm": 0.32548553308322087, "learning_rate": 2.090941498577739e-06, "loss": 0.0148, "step": 8143 }, { "epoch": 2.1839635290962724, "grad_norm": 0.2211522211170737, "learning_rate": 2.089672703397357e-06, "loss": 0.014, "step": 8144 }, { "epoch": 2.184231697506034, "grad_norm": 0.22841761011608766, "learning_rate": 2.0884041915878166e-06, "loss": 0.0149, "step": 8145 }, { "epoch": 2.1844998659157953, "grad_norm": 0.23570868723918062, "learning_rate": 2.0871359632726286e-06, "loss": 0.0118, "step": 8146 }, { "epoch": 2.1847680343255567, "grad_norm": 0.26331158046353786, "learning_rate": 2.085868018575278e-06, "loss": 0.0165, "step": 8147 }, { "epoch": 2.1850362027353176, "grad_norm": 0.20156220052446783, "learning_rate": 2.084600357619224e-06, "loss": 0.0126, "step": 8148 }, { "epoch": 2.185304371145079, "grad_norm": 0.23608420576128006, "learning_rate": 2.0833329805278906e-06, "loss": 0.0165, "step": 8149 }, { "epoch": 2.1855725395548404, "grad_norm": 0.32920183448389395, "learning_rate": 2.0820658874246836e-06, "loss": 0.0149, "step": 8150 }, { "epoch": 2.185840707964602, "grad_norm": 0.26123006141187294, "learning_rate": 2.080799078432972e-06, "loss": 0.0151, "step": 8151 }, { "epoch": 2.1861088763743632, "grad_norm": 0.2734987516414429, "learning_rate": 2.079532553676106e-06, "loss": 0.0178, "step": 8152 }, { "epoch": 2.186377044784124, "grad_norm": 0.20487937864806954, "learning_rate": 2.0782663132773996e-06, "loss": 0.0107, "step": 8153 }, { "epoch": 2.1866452131938856, "grad_norm": 0.23746548338986337, "learning_rate": 2.077000357360145e-06, "loss": 0.0192, "step": 8154 }, { "epoch": 2.186913381603647, "grad_norm": 0.2048027419257131, "learning_rate": 2.0757346860476068e-06, "loss": 0.011, "step": 8155 }, { "epoch": 2.1871815500134084, "grad_norm": 0.2603045271583022, "learning_rate": 2.074469299463019e-06, "loss": 0.0154, "step": 8156 }, { "epoch": 2.18744971842317, "grad_norm": 0.46841210225369795, "learning_rate": 2.0732041977295857e-06, "loss": 0.0216, "step": 8157 }, { "epoch": 2.1877178868329312, "grad_norm": 0.23584240544035354, "learning_rate": 2.071939380970492e-06, "loss": 0.0177, "step": 8158 }, { "epoch": 2.1879860552426926, "grad_norm": 0.17859593871673468, "learning_rate": 2.0706748493088846e-06, "loss": 0.0102, "step": 8159 }, { "epoch": 2.1882542236524536, "grad_norm": 0.21592963910514304, "learning_rate": 2.0694106028678917e-06, "loss": 0.0137, "step": 8160 }, { "epoch": 2.188522392062215, "grad_norm": 0.43396943060621457, "learning_rate": 2.0681466417706063e-06, "loss": 0.0221, "step": 8161 }, { "epoch": 2.1887905604719764, "grad_norm": 0.2513472478430232, "learning_rate": 2.0668829661401e-06, "loss": 0.0165, "step": 8162 }, { "epoch": 2.189058728881738, "grad_norm": 0.27404586715450113, "learning_rate": 2.0656195760994104e-06, "loss": 0.0145, "step": 8163 }, { "epoch": 2.189326897291499, "grad_norm": 0.3061694564566703, "learning_rate": 2.064356471771552e-06, "loss": 0.0297, "step": 8164 }, { "epoch": 2.18959506570126, "grad_norm": 0.29982641693707146, "learning_rate": 2.0630936532795115e-06, "loss": 0.0169, "step": 8165 }, { "epoch": 2.1898632341110216, "grad_norm": 0.25865829064076146, "learning_rate": 2.0618311207462434e-06, "loss": 0.0135, "step": 8166 }, { "epoch": 2.190131402520783, "grad_norm": 0.22751474621199932, "learning_rate": 2.06056887429468e-06, "loss": 0.013, "step": 8167 }, { "epoch": 2.1903995709305444, "grad_norm": 0.2813881018832031, "learning_rate": 2.0593069140477208e-06, "loss": 0.0142, "step": 8168 }, { "epoch": 2.190667739340306, "grad_norm": 0.2420377068392333, "learning_rate": 2.0580452401282385e-06, "loss": 0.0151, "step": 8169 }, { "epoch": 2.190935907750067, "grad_norm": 0.21646894112325993, "learning_rate": 2.056783852659082e-06, "loss": 0.011, "step": 8170 }, { "epoch": 2.1912040761598286, "grad_norm": 0.37148941040044847, "learning_rate": 2.0555227517630667e-06, "loss": 0.0208, "step": 8171 }, { "epoch": 2.1914722445695896, "grad_norm": 0.34676088293175944, "learning_rate": 2.0542619375629845e-06, "loss": 0.0181, "step": 8172 }, { "epoch": 2.191740412979351, "grad_norm": 0.19305063787602153, "learning_rate": 2.0530014101815953e-06, "loss": 0.011, "step": 8173 }, { "epoch": 2.1920085813891124, "grad_norm": 0.2598414045646956, "learning_rate": 2.0517411697416344e-06, "loss": 0.0164, "step": 8174 }, { "epoch": 2.1922767497988738, "grad_norm": 0.2321866106729343, "learning_rate": 2.0504812163658104e-06, "loss": 0.0107, "step": 8175 }, { "epoch": 2.192544918208635, "grad_norm": 0.2170104636866185, "learning_rate": 2.0492215501767976e-06, "loss": 0.0141, "step": 8176 }, { "epoch": 2.192813086618396, "grad_norm": 0.3740561090199242, "learning_rate": 2.0479621712972503e-06, "loss": 0.0101, "step": 8177 }, { "epoch": 2.1930812550281575, "grad_norm": 0.5604265433835057, "learning_rate": 2.046703079849787e-06, "loss": 0.0309, "step": 8178 }, { "epoch": 2.193349423437919, "grad_norm": 0.2056957541594268, "learning_rate": 2.045444275957006e-06, "loss": 0.0128, "step": 8179 }, { "epoch": 2.1936175918476803, "grad_norm": 0.3069211144298621, "learning_rate": 2.044185759741471e-06, "loss": 0.0119, "step": 8180 }, { "epoch": 2.1938857602574418, "grad_norm": 0.26603444018427486, "learning_rate": 2.0429275313257197e-06, "loss": 0.0171, "step": 8181 }, { "epoch": 2.194153928667203, "grad_norm": 0.2556508232537799, "learning_rate": 2.0416695908322652e-06, "loss": 0.014, "step": 8182 }, { "epoch": 2.1944220970769646, "grad_norm": 0.3488252778296563, "learning_rate": 2.040411938383587e-06, "loss": 0.0202, "step": 8183 }, { "epoch": 2.1946902654867255, "grad_norm": 0.2972577514889578, "learning_rate": 2.039154574102141e-06, "loss": 0.0163, "step": 8184 }, { "epoch": 2.194958433896487, "grad_norm": 0.21881010199856296, "learning_rate": 2.0378974981103545e-06, "loss": 0.0117, "step": 8185 }, { "epoch": 2.1952266023062483, "grad_norm": 0.27077214245101255, "learning_rate": 2.036640710530623e-06, "loss": 0.0194, "step": 8186 }, { "epoch": 2.1954947707160097, "grad_norm": 0.1998819404367838, "learning_rate": 2.0353842114853194e-06, "loss": 0.01, "step": 8187 }, { "epoch": 2.195762939125771, "grad_norm": 0.22260520974842166, "learning_rate": 2.0341280010967824e-06, "loss": 0.0141, "step": 8188 }, { "epoch": 2.196031107535532, "grad_norm": 0.2853771752389982, "learning_rate": 2.0328720794873295e-06, "loss": 0.0115, "step": 8189 }, { "epoch": 2.1962992759452935, "grad_norm": 0.1785541231902277, "learning_rate": 2.031616446779243e-06, "loss": 0.0098, "step": 8190 }, { "epoch": 2.196567444355055, "grad_norm": 0.2972388292447769, "learning_rate": 2.0303611030947823e-06, "loss": 0.0141, "step": 8191 }, { "epoch": 2.1968356127648163, "grad_norm": 0.37714232502248174, "learning_rate": 2.0291060485561785e-06, "loss": 0.016, "step": 8192 }, { "epoch": 2.1971037811745777, "grad_norm": 0.19681231608108196, "learning_rate": 2.027851283285631e-06, "loss": 0.0114, "step": 8193 }, { "epoch": 2.197371949584339, "grad_norm": 0.2280910180781128, "learning_rate": 2.026596807405312e-06, "loss": 0.0145, "step": 8194 }, { "epoch": 2.1976401179941005, "grad_norm": 0.2715471987553624, "learning_rate": 2.025342621037369e-06, "loss": 0.0161, "step": 8195 }, { "epoch": 2.1979082864038615, "grad_norm": 0.2412949872358672, "learning_rate": 2.0240887243039154e-06, "loss": 0.0196, "step": 8196 }, { "epoch": 2.198176454813623, "grad_norm": 0.22691132220098353, "learning_rate": 2.0228351173270434e-06, "loss": 0.0136, "step": 8197 }, { "epoch": 2.1984446232233843, "grad_norm": 0.36719817122546394, "learning_rate": 2.0215818002288106e-06, "loss": 0.0136, "step": 8198 }, { "epoch": 2.1987127916331457, "grad_norm": 0.3220336399465674, "learning_rate": 2.020328773131252e-06, "loss": 0.016, "step": 8199 }, { "epoch": 2.198980960042907, "grad_norm": 0.2644067356268142, "learning_rate": 2.0190760361563675e-06, "loss": 0.0144, "step": 8200 }, { "epoch": 2.199249128452668, "grad_norm": 0.2469330683723476, "learning_rate": 2.0178235894261362e-06, "loss": 0.014, "step": 8201 }, { "epoch": 2.1995172968624295, "grad_norm": 0.2519095389504245, "learning_rate": 2.016571433062506e-06, "loss": 0.0156, "step": 8202 }, { "epoch": 2.199785465272191, "grad_norm": 0.3208467273914004, "learning_rate": 2.015319567187392e-06, "loss": 0.0187, "step": 8203 }, { "epoch": 2.2000536336819523, "grad_norm": 0.19666452044294888, "learning_rate": 2.0140679919226895e-06, "loss": 0.0121, "step": 8204 }, { "epoch": 2.2003218020917137, "grad_norm": 0.24004880719370658, "learning_rate": 2.01281670739026e-06, "loss": 0.0146, "step": 8205 }, { "epoch": 2.200589970501475, "grad_norm": 0.2228543806889746, "learning_rate": 2.011565713711934e-06, "loss": 0.0106, "step": 8206 }, { "epoch": 2.200858138911236, "grad_norm": 0.25683697306301495, "learning_rate": 2.010315011009523e-06, "loss": 0.0123, "step": 8207 }, { "epoch": 2.2011263073209975, "grad_norm": 0.29328489642980393, "learning_rate": 2.0090645994047996e-06, "loss": 0.0121, "step": 8208 }, { "epoch": 2.201394475730759, "grad_norm": 0.352048233486218, "learning_rate": 2.0078144790195175e-06, "loss": 0.0206, "step": 8209 }, { "epoch": 2.2016626441405203, "grad_norm": 0.22909367432900038, "learning_rate": 2.006564649975393e-06, "loss": 0.0109, "step": 8210 }, { "epoch": 2.2019308125502817, "grad_norm": 0.30855616002058456, "learning_rate": 2.005315112394122e-06, "loss": 0.017, "step": 8211 }, { "epoch": 2.202198980960043, "grad_norm": 0.19029501986501723, "learning_rate": 2.0040658663973695e-06, "loss": 0.0099, "step": 8212 }, { "epoch": 2.202467149369804, "grad_norm": 0.26692658344886994, "learning_rate": 2.0028169121067677e-06, "loss": 0.0128, "step": 8213 }, { "epoch": 2.2027353177795654, "grad_norm": 0.2360485003742221, "learning_rate": 2.001568249643927e-06, "loss": 0.0116, "step": 8214 }, { "epoch": 2.203003486189327, "grad_norm": 0.2737590515103898, "learning_rate": 2.0003198791304235e-06, "loss": 0.0112, "step": 8215 }, { "epoch": 2.2032716545990882, "grad_norm": 0.24667496606758174, "learning_rate": 1.9990718006878113e-06, "loss": 0.0112, "step": 8216 }, { "epoch": 2.2035398230088497, "grad_norm": 0.917697910845092, "learning_rate": 1.997824014437611e-06, "loss": 0.0244, "step": 8217 }, { "epoch": 2.203807991418611, "grad_norm": 0.23291443638213938, "learning_rate": 1.996576520501313e-06, "loss": 0.0126, "step": 8218 }, { "epoch": 2.204076159828372, "grad_norm": 0.294224745701501, "learning_rate": 1.995329319000388e-06, "loss": 0.0151, "step": 8219 }, { "epoch": 2.2043443282381334, "grad_norm": 0.18915102883293755, "learning_rate": 1.9940824100562676e-06, "loss": 0.0121, "step": 8220 }, { "epoch": 2.204612496647895, "grad_norm": 0.22905938352199806, "learning_rate": 1.992835793790363e-06, "loss": 0.0111, "step": 8221 }, { "epoch": 2.2048806650576562, "grad_norm": 0.26532166813197205, "learning_rate": 1.9915894703240547e-06, "loss": 0.0163, "step": 8222 }, { "epoch": 2.2051488334674176, "grad_norm": 0.21107062451266365, "learning_rate": 1.990343439778691e-06, "loss": 0.0127, "step": 8223 }, { "epoch": 2.205417001877179, "grad_norm": 0.24248473672811346, "learning_rate": 1.9890977022755975e-06, "loss": 0.0108, "step": 8224 }, { "epoch": 2.20568517028694, "grad_norm": 0.2726595704197026, "learning_rate": 1.987852257936065e-06, "loss": 0.015, "step": 8225 }, { "epoch": 2.2059533386967014, "grad_norm": 0.2174774113329928, "learning_rate": 1.986607106881363e-06, "loss": 0.0118, "step": 8226 }, { "epoch": 2.206221507106463, "grad_norm": 0.2482659082055337, "learning_rate": 1.985362249232725e-06, "loss": 0.0165, "step": 8227 }, { "epoch": 2.206489675516224, "grad_norm": 0.4659567327914753, "learning_rate": 1.984117685111361e-06, "loss": 0.0206, "step": 8228 }, { "epoch": 2.2067578439259856, "grad_norm": 0.2656340354421756, "learning_rate": 1.982873414638455e-06, "loss": 0.0122, "step": 8229 }, { "epoch": 2.207026012335747, "grad_norm": 0.2864654563733647, "learning_rate": 1.9816294379351495e-06, "loss": 0.0172, "step": 8230 }, { "epoch": 2.207294180745508, "grad_norm": 0.16663532732096026, "learning_rate": 1.980385755122575e-06, "loss": 0.0088, "step": 8231 }, { "epoch": 2.2075623491552694, "grad_norm": 0.35957101558993265, "learning_rate": 1.97914236632182e-06, "loss": 0.0244, "step": 8232 }, { "epoch": 2.207830517565031, "grad_norm": 0.24452708773953669, "learning_rate": 1.9778992716539532e-06, "loss": 0.0131, "step": 8233 }, { "epoch": 2.208098685974792, "grad_norm": 0.22462888779702833, "learning_rate": 1.976656471240012e-06, "loss": 0.0132, "step": 8234 }, { "epoch": 2.2083668543845536, "grad_norm": 0.21915421301990007, "learning_rate": 1.9754139652010025e-06, "loss": 0.0135, "step": 8235 }, { "epoch": 2.208635022794315, "grad_norm": 0.30582069285245794, "learning_rate": 1.9741717536579057e-06, "loss": 0.0173, "step": 8236 }, { "epoch": 2.208903191204076, "grad_norm": 0.20515263502991268, "learning_rate": 1.9729298367316713e-06, "loss": 0.0113, "step": 8237 }, { "epoch": 2.2091713596138374, "grad_norm": 0.24090092209809588, "learning_rate": 1.9716882145432208e-06, "loss": 0.0127, "step": 8238 }, { "epoch": 2.2094395280235988, "grad_norm": 0.2078462199163122, "learning_rate": 1.970446887213451e-06, "loss": 0.0118, "step": 8239 }, { "epoch": 2.20970769643336, "grad_norm": 0.21345577677834673, "learning_rate": 1.969205854863223e-06, "loss": 0.0133, "step": 8240 }, { "epoch": 2.2099758648431216, "grad_norm": 0.215015818637565, "learning_rate": 1.967965117613378e-06, "loss": 0.0129, "step": 8241 }, { "epoch": 2.210244033252883, "grad_norm": 0.2564102862523625, "learning_rate": 1.9667246755847157e-06, "loss": 0.0158, "step": 8242 }, { "epoch": 2.210512201662644, "grad_norm": 0.2982154800674518, "learning_rate": 1.9654845288980183e-06, "loss": 0.0167, "step": 8243 }, { "epoch": 2.2107803700724054, "grad_norm": 0.35845882568888393, "learning_rate": 1.9642446776740383e-06, "loss": 0.0147, "step": 8244 }, { "epoch": 2.2110485384821668, "grad_norm": 0.1894928467332662, "learning_rate": 1.9630051220334915e-06, "loss": 0.0103, "step": 8245 }, { "epoch": 2.211316706891928, "grad_norm": 0.3650781574872496, "learning_rate": 1.961765862097076e-06, "loss": 0.0192, "step": 8246 }, { "epoch": 2.2115848753016896, "grad_norm": 0.2778592857967054, "learning_rate": 1.9605268979854493e-06, "loss": 0.0165, "step": 8247 }, { "epoch": 2.211853043711451, "grad_norm": 0.22881872251585364, "learning_rate": 1.959288229819251e-06, "loss": 0.0142, "step": 8248 }, { "epoch": 2.212121212121212, "grad_norm": 0.22489033690033597, "learning_rate": 1.958049857719083e-06, "loss": 0.012, "step": 8249 }, { "epoch": 2.2123893805309733, "grad_norm": 0.223428197727709, "learning_rate": 1.9568117818055244e-06, "loss": 0.0122, "step": 8250 }, { "epoch": 2.2126575489407347, "grad_norm": 0.3002490564547751, "learning_rate": 1.955574002199125e-06, "loss": 0.0167, "step": 8251 }, { "epoch": 2.212925717350496, "grad_norm": 0.2954094443479846, "learning_rate": 1.9543365190204e-06, "loss": 0.0187, "step": 8252 }, { "epoch": 2.2131938857602576, "grad_norm": 0.2878556904645038, "learning_rate": 1.953099332389844e-06, "loss": 0.0191, "step": 8253 }, { "epoch": 2.213462054170019, "grad_norm": 0.18114150611054838, "learning_rate": 1.9518624424279166e-06, "loss": 0.0081, "step": 8254 }, { "epoch": 2.21373022257978, "grad_norm": 0.18329833849845795, "learning_rate": 1.9506258492550485e-06, "loss": 0.0107, "step": 8255 }, { "epoch": 2.2139983909895413, "grad_norm": 0.18942554235284997, "learning_rate": 1.9493895529916474e-06, "loss": 0.011, "step": 8256 }, { "epoch": 2.2142665593993027, "grad_norm": 0.19741336802141043, "learning_rate": 1.948153553758085e-06, "loss": 0.0104, "step": 8257 }, { "epoch": 2.214534727809064, "grad_norm": 0.17645697159424503, "learning_rate": 1.94691785167471e-06, "loss": 0.0105, "step": 8258 }, { "epoch": 2.2148028962188255, "grad_norm": 0.22691199148149038, "learning_rate": 1.9456824468618365e-06, "loss": 0.0131, "step": 8259 }, { "epoch": 2.215071064628587, "grad_norm": 0.24612984871556332, "learning_rate": 1.9444473394397535e-06, "loss": 0.0146, "step": 8260 }, { "epoch": 2.215339233038348, "grad_norm": 0.2614212515859232, "learning_rate": 1.943212529528723e-06, "loss": 0.0125, "step": 8261 }, { "epoch": 2.2156074014481093, "grad_norm": 0.2975679419960307, "learning_rate": 1.941978017248971e-06, "loss": 0.0156, "step": 8262 }, { "epoch": 2.2158755698578707, "grad_norm": 0.19591205733018469, "learning_rate": 1.940743802720702e-06, "loss": 0.0121, "step": 8263 }, { "epoch": 2.216143738267632, "grad_norm": 0.5660046526438169, "learning_rate": 1.9395098860640856e-06, "loss": 0.0495, "step": 8264 }, { "epoch": 2.2164119066773935, "grad_norm": 0.25615785085624326, "learning_rate": 1.9382762673992673e-06, "loss": 0.0148, "step": 8265 }, { "epoch": 2.216680075087155, "grad_norm": 0.2860856280241198, "learning_rate": 1.937042946846361e-06, "loss": 0.0105, "step": 8266 }, { "epoch": 2.216948243496916, "grad_norm": 0.20781741558592928, "learning_rate": 1.935809924525448e-06, "loss": 0.0108, "step": 8267 }, { "epoch": 2.2172164119066773, "grad_norm": 0.2593300934729011, "learning_rate": 1.93457720055659e-06, "loss": 0.018, "step": 8268 }, { "epoch": 2.2174845803164387, "grad_norm": 0.30742140921589506, "learning_rate": 1.9333447750598095e-06, "loss": 0.019, "step": 8269 }, { "epoch": 2.2177527487262, "grad_norm": 0.2561469110538905, "learning_rate": 1.932112648155107e-06, "loss": 0.0156, "step": 8270 }, { "epoch": 2.2180209171359615, "grad_norm": 0.27682924356337, "learning_rate": 1.9308808199624518e-06, "loss": 0.0164, "step": 8271 }, { "epoch": 2.218289085545723, "grad_norm": 0.2036876804075994, "learning_rate": 1.929649290601781e-06, "loss": 0.0118, "step": 8272 }, { "epoch": 2.218557253955484, "grad_norm": 0.21986702651922632, "learning_rate": 1.928418060193009e-06, "loss": 0.0121, "step": 8273 }, { "epoch": 2.2188254223652453, "grad_norm": 0.3104317301641841, "learning_rate": 1.927187128856014e-06, "loss": 0.0233, "step": 8274 }, { "epoch": 2.2190935907750067, "grad_norm": 0.2479554695197249, "learning_rate": 1.925956496710652e-06, "loss": 0.0135, "step": 8275 }, { "epoch": 2.219361759184768, "grad_norm": 0.2922416719432511, "learning_rate": 1.9247261638767427e-06, "loss": 0.0225, "step": 8276 }, { "epoch": 2.2196299275945295, "grad_norm": 0.23988489862933485, "learning_rate": 1.923496130474084e-06, "loss": 0.0151, "step": 8277 }, { "epoch": 2.219898096004291, "grad_norm": 0.2335797090472106, "learning_rate": 1.9222663966224386e-06, "loss": 0.0125, "step": 8278 }, { "epoch": 2.220166264414052, "grad_norm": 0.3362073759996765, "learning_rate": 1.9210369624415413e-06, "loss": 0.0193, "step": 8279 }, { "epoch": 2.2204344328238133, "grad_norm": 0.2321747879687417, "learning_rate": 1.9198078280511013e-06, "loss": 0.0194, "step": 8280 }, { "epoch": 2.2207026012335747, "grad_norm": 0.2524216600750648, "learning_rate": 1.918578993570796e-06, "loss": 0.0202, "step": 8281 }, { "epoch": 2.220970769643336, "grad_norm": 0.27983711289265273, "learning_rate": 1.917350459120272e-06, "loss": 0.0174, "step": 8282 }, { "epoch": 2.2212389380530975, "grad_norm": 0.2452037039069059, "learning_rate": 1.9161222248191515e-06, "loss": 0.0156, "step": 8283 }, { "epoch": 2.221507106462859, "grad_norm": 0.2364984564688047, "learning_rate": 1.91489429078702e-06, "loss": 0.0114, "step": 8284 }, { "epoch": 2.22177527487262, "grad_norm": 0.21775613990598863, "learning_rate": 1.9136666571434425e-06, "loss": 0.0135, "step": 8285 }, { "epoch": 2.2220434432823812, "grad_norm": 0.23605524853830348, "learning_rate": 1.912439324007947e-06, "loss": 0.0152, "step": 8286 }, { "epoch": 2.2223116116921426, "grad_norm": 0.25674699993789263, "learning_rate": 1.911212291500037e-06, "loss": 0.0173, "step": 8287 }, { "epoch": 2.222579780101904, "grad_norm": 0.2052191202252737, "learning_rate": 1.9099855597391876e-06, "loss": 0.0114, "step": 8288 }, { "epoch": 2.2228479485116655, "grad_norm": 0.26539626889977636, "learning_rate": 1.90875912884484e-06, "loss": 0.0144, "step": 8289 }, { "epoch": 2.223116116921427, "grad_norm": 0.2513821837348073, "learning_rate": 1.9075329989364076e-06, "loss": 0.0146, "step": 8290 }, { "epoch": 2.223384285331188, "grad_norm": 0.2310165207486927, "learning_rate": 1.9063071701332785e-06, "loss": 0.0176, "step": 8291 }, { "epoch": 2.223652453740949, "grad_norm": 0.24337359694745206, "learning_rate": 1.9050816425548046e-06, "loss": 0.0104, "step": 8292 }, { "epoch": 2.2239206221507106, "grad_norm": 0.32868182284790426, "learning_rate": 1.903856416320316e-06, "loss": 0.0174, "step": 8293 }, { "epoch": 2.224188790560472, "grad_norm": 0.18983830065859716, "learning_rate": 1.9026314915491067e-06, "loss": 0.0111, "step": 8294 }, { "epoch": 2.2244569589702334, "grad_norm": 0.2261355359206126, "learning_rate": 1.9014068683604475e-06, "loss": 0.0097, "step": 8295 }, { "epoch": 2.224725127379995, "grad_norm": 0.2719942161409358, "learning_rate": 1.9001825468735729e-06, "loss": 0.0161, "step": 8296 }, { "epoch": 2.224993295789756, "grad_norm": 0.27268868535095575, "learning_rate": 1.8989585272076937e-06, "loss": 0.0176, "step": 8297 }, { "epoch": 2.225261464199517, "grad_norm": 0.2729738206201802, "learning_rate": 1.8977348094819914e-06, "loss": 0.0157, "step": 8298 }, { "epoch": 2.2255296326092786, "grad_norm": 0.20276190528928145, "learning_rate": 1.8965113938156132e-06, "loss": 0.0118, "step": 8299 }, { "epoch": 2.22579780101904, "grad_norm": 0.30885195790218806, "learning_rate": 1.8952882803276822e-06, "loss": 0.0203, "step": 8300 }, { "epoch": 2.2260659694288014, "grad_norm": 0.26372368002897234, "learning_rate": 1.8940654691372884e-06, "loss": 0.0164, "step": 8301 }, { "epoch": 2.226334137838563, "grad_norm": 0.38067120211407274, "learning_rate": 1.8928429603634924e-06, "loss": 0.0157, "step": 8302 }, { "epoch": 2.226602306248324, "grad_norm": 0.24902300390960355, "learning_rate": 1.8916207541253295e-06, "loss": 0.0151, "step": 8303 }, { "epoch": 2.226870474658085, "grad_norm": 0.24593242992777087, "learning_rate": 1.8903988505417998e-06, "loss": 0.013, "step": 8304 }, { "epoch": 2.2271386430678466, "grad_norm": 0.29686637895574625, "learning_rate": 1.8891772497318795e-06, "loss": 0.0123, "step": 8305 }, { "epoch": 2.227406811477608, "grad_norm": 0.2765502319911622, "learning_rate": 1.8879559518145096e-06, "loss": 0.0135, "step": 8306 }, { "epoch": 2.2276749798873694, "grad_norm": 0.3273650551803932, "learning_rate": 1.8867349569086064e-06, "loss": 0.0145, "step": 8307 }, { "epoch": 2.227943148297131, "grad_norm": 0.3585962002939518, "learning_rate": 1.8855142651330566e-06, "loss": 0.0327, "step": 8308 }, { "epoch": 2.2282113167068918, "grad_norm": 0.23510924218910548, "learning_rate": 1.8842938766067127e-06, "loss": 0.0169, "step": 8309 }, { "epoch": 2.228479485116653, "grad_norm": 0.32140421696187305, "learning_rate": 1.8830737914484033e-06, "loss": 0.0172, "step": 8310 }, { "epoch": 2.2287476535264146, "grad_norm": 0.2639823590373743, "learning_rate": 1.8818540097769218e-06, "loss": 0.0187, "step": 8311 }, { "epoch": 2.229015821936176, "grad_norm": 0.22021570393667517, "learning_rate": 1.8806345317110387e-06, "loss": 0.0128, "step": 8312 }, { "epoch": 2.2292839903459374, "grad_norm": 0.2528034563400032, "learning_rate": 1.879415357369488e-06, "loss": 0.0113, "step": 8313 }, { "epoch": 2.2295521587556983, "grad_norm": 0.2159421474713139, "learning_rate": 1.87819648687098e-06, "loss": 0.0112, "step": 8314 }, { "epoch": 2.2298203271654597, "grad_norm": 0.25896997214402545, "learning_rate": 1.8769779203341925e-06, "loss": 0.014, "step": 8315 }, { "epoch": 2.230088495575221, "grad_norm": 0.20804231297209, "learning_rate": 1.8757596578777715e-06, "loss": 0.0113, "step": 8316 }, { "epoch": 2.2303566639849826, "grad_norm": 0.3032048472512723, "learning_rate": 1.8745416996203375e-06, "loss": 0.0179, "step": 8317 }, { "epoch": 2.230624832394744, "grad_norm": 0.439151122404767, "learning_rate": 1.8733240456804823e-06, "loss": 0.013, "step": 8318 }, { "epoch": 2.2308930008045054, "grad_norm": 0.22085748003204903, "learning_rate": 1.8721066961767626e-06, "loss": 0.0115, "step": 8319 }, { "epoch": 2.2311611692142668, "grad_norm": 0.2832245193928407, "learning_rate": 1.8708896512277102e-06, "loss": 0.0162, "step": 8320 }, { "epoch": 2.2314293376240277, "grad_norm": 0.2526843371098077, "learning_rate": 1.8696729109518235e-06, "loss": 0.0137, "step": 8321 }, { "epoch": 2.231697506033789, "grad_norm": 0.25348089304872545, "learning_rate": 1.8684564754675765e-06, "loss": 0.0119, "step": 8322 }, { "epoch": 2.2319656744435505, "grad_norm": 0.2002955813734136, "learning_rate": 1.8672403448934069e-06, "loss": 0.0092, "step": 8323 }, { "epoch": 2.232233842853312, "grad_norm": 0.1947161156217244, "learning_rate": 1.8660245193477266e-06, "loss": 0.0104, "step": 8324 }, { "epoch": 2.2325020112630733, "grad_norm": 0.29650823424436046, "learning_rate": 1.864808998948921e-06, "loss": 0.0154, "step": 8325 }, { "epoch": 2.2327701796728343, "grad_norm": 0.21422757560315422, "learning_rate": 1.863593783815339e-06, "loss": 0.0107, "step": 8326 }, { "epoch": 2.2330383480825957, "grad_norm": 0.28149428245118413, "learning_rate": 1.8623788740653032e-06, "loss": 0.017, "step": 8327 }, { "epoch": 2.233306516492357, "grad_norm": 0.24903829824693105, "learning_rate": 1.8611642698171045e-06, "loss": 0.012, "step": 8328 }, { "epoch": 2.2335746849021185, "grad_norm": 0.407864035131312, "learning_rate": 1.8599499711890079e-06, "loss": 0.0216, "step": 8329 }, { "epoch": 2.23384285331188, "grad_norm": 0.25157355224235883, "learning_rate": 1.858735978299247e-06, "loss": 0.0143, "step": 8330 }, { "epoch": 2.2341110217216413, "grad_norm": 0.46909416865291176, "learning_rate": 1.8575222912660224e-06, "loss": 0.0111, "step": 8331 }, { "epoch": 2.2343791901314027, "grad_norm": 0.30403674282606535, "learning_rate": 1.856308910207511e-06, "loss": 0.029, "step": 8332 }, { "epoch": 2.2346473585411637, "grad_norm": 0.3014268162125151, "learning_rate": 1.855095835241853e-06, "loss": 0.0183, "step": 8333 }, { "epoch": 2.234915526950925, "grad_norm": 0.2245817312371467, "learning_rate": 1.8538830664871644e-06, "loss": 0.0125, "step": 8334 }, { "epoch": 2.2351836953606865, "grad_norm": 0.24011462910908046, "learning_rate": 1.85267060406153e-06, "loss": 0.0153, "step": 8335 }, { "epoch": 2.235451863770448, "grad_norm": 0.23971004533726453, "learning_rate": 1.8514584480830012e-06, "loss": 0.0131, "step": 8336 }, { "epoch": 2.2357200321802093, "grad_norm": 0.18248817492219066, "learning_rate": 1.8502465986696062e-06, "loss": 0.0153, "step": 8337 }, { "epoch": 2.2359882005899703, "grad_norm": 0.3221824040820477, "learning_rate": 1.8490350559393367e-06, "loss": 0.0175, "step": 8338 }, { "epoch": 2.2362563689997317, "grad_norm": 0.3242498264003011, "learning_rate": 1.8478238200101573e-06, "loss": 0.0143, "step": 8339 }, { "epoch": 2.236524537409493, "grad_norm": 0.24828076698751095, "learning_rate": 1.8466128910000048e-06, "loss": 0.0121, "step": 8340 }, { "epoch": 2.2367927058192545, "grad_norm": 0.22528977817329923, "learning_rate": 1.8454022690267815e-06, "loss": 0.0122, "step": 8341 }, { "epoch": 2.237060874229016, "grad_norm": 0.24623456950600076, "learning_rate": 1.8441919542083658e-06, "loss": 0.0133, "step": 8342 }, { "epoch": 2.2373290426387773, "grad_norm": 0.2312126761601223, "learning_rate": 1.8429819466625993e-06, "loss": 0.0173, "step": 8343 }, { "epoch": 2.2375972110485387, "grad_norm": 0.28949892537224914, "learning_rate": 1.8417722465073e-06, "loss": 0.015, "step": 8344 }, { "epoch": 2.2378653794582997, "grad_norm": 0.31800230655328876, "learning_rate": 1.840562853860251e-06, "loss": 0.0113, "step": 8345 }, { "epoch": 2.238133547868061, "grad_norm": 0.28330561279559063, "learning_rate": 1.8393537688392088e-06, "loss": 0.0108, "step": 8346 }, { "epoch": 2.2384017162778225, "grad_norm": 0.41487225225241675, "learning_rate": 1.8381449915619003e-06, "loss": 0.0193, "step": 8347 }, { "epoch": 2.238669884687584, "grad_norm": 0.22765504947372817, "learning_rate": 1.8369365221460178e-06, "loss": 0.012, "step": 8348 }, { "epoch": 2.2389380530973453, "grad_norm": 0.2116448219753899, "learning_rate": 1.8357283607092302e-06, "loss": 0.009, "step": 8349 }, { "epoch": 2.2392062215071062, "grad_norm": 0.4486320902391489, "learning_rate": 1.8345205073691708e-06, "loss": 0.0236, "step": 8350 }, { "epoch": 2.2394743899168676, "grad_norm": 0.2427698266046734, "learning_rate": 1.8333129622434441e-06, "loss": 0.0129, "step": 8351 }, { "epoch": 2.239742558326629, "grad_norm": 0.2639546069728409, "learning_rate": 1.8321057254496283e-06, "loss": 0.0146, "step": 8352 }, { "epoch": 2.2400107267363905, "grad_norm": 0.35224524366660154, "learning_rate": 1.8308987971052662e-06, "loss": 0.0221, "step": 8353 }, { "epoch": 2.240278895146152, "grad_norm": 0.1962515099187816, "learning_rate": 1.8296921773278754e-06, "loss": 0.0123, "step": 8354 }, { "epoch": 2.2405470635559133, "grad_norm": 0.29625539643446247, "learning_rate": 1.8284858662349391e-06, "loss": 0.0127, "step": 8355 }, { "epoch": 2.2408152319656747, "grad_norm": 0.3294326863685458, "learning_rate": 1.8272798639439132e-06, "loss": 0.0173, "step": 8356 }, { "epoch": 2.2410834003754356, "grad_norm": 0.22753157609650246, "learning_rate": 1.8260741705722257e-06, "loss": 0.0104, "step": 8357 }, { "epoch": 2.241351568785197, "grad_norm": 0.2541140642483386, "learning_rate": 1.8248687862372683e-06, "loss": 0.0119, "step": 8358 }, { "epoch": 2.2416197371949584, "grad_norm": 0.2155614256869364, "learning_rate": 1.8236637110564081e-06, "loss": 0.0111, "step": 8359 }, { "epoch": 2.24188790560472, "grad_norm": 0.24373517582836976, "learning_rate": 1.8224589451469788e-06, "loss": 0.0141, "step": 8360 }, { "epoch": 2.2421560740144812, "grad_norm": 0.29753368477557174, "learning_rate": 1.8212544886262867e-06, "loss": 0.0184, "step": 8361 }, { "epoch": 2.242424242424242, "grad_norm": 0.2775801036783729, "learning_rate": 1.8200503416116056e-06, "loss": 0.0169, "step": 8362 }, { "epoch": 2.2426924108340036, "grad_norm": 0.2495354974777359, "learning_rate": 1.8188465042201786e-06, "loss": 0.0102, "step": 8363 }, { "epoch": 2.242960579243765, "grad_norm": 0.24388044664602188, "learning_rate": 1.817642976569224e-06, "loss": 0.0162, "step": 8364 }, { "epoch": 2.2432287476535264, "grad_norm": 0.2828977941671306, "learning_rate": 1.8164397587759208e-06, "loss": 0.0159, "step": 8365 }, { "epoch": 2.243496916063288, "grad_norm": 0.23560104623648043, "learning_rate": 1.815236850957427e-06, "loss": 0.0116, "step": 8366 }, { "epoch": 2.2437650844730492, "grad_norm": 0.2780074544742447, "learning_rate": 1.8140342532308675e-06, "loss": 0.0156, "step": 8367 }, { "epoch": 2.2440332528828106, "grad_norm": 0.287518522201214, "learning_rate": 1.812831965713332e-06, "loss": 0.0139, "step": 8368 }, { "epoch": 2.2443014212925716, "grad_norm": 0.22456352279964373, "learning_rate": 1.811629988521888e-06, "loss": 0.0146, "step": 8369 }, { "epoch": 2.244569589702333, "grad_norm": 0.3462136169085823, "learning_rate": 1.8104283217735651e-06, "loss": 0.018, "step": 8370 }, { "epoch": 2.2448377581120944, "grad_norm": 0.2360960588022182, "learning_rate": 1.8092269655853706e-06, "loss": 0.0142, "step": 8371 }, { "epoch": 2.245105926521856, "grad_norm": 0.20852790876816335, "learning_rate": 1.8080259200742735e-06, "loss": 0.0127, "step": 8372 }, { "epoch": 2.245374094931617, "grad_norm": 0.2910053259722999, "learning_rate": 1.8068251853572177e-06, "loss": 0.0153, "step": 8373 }, { "epoch": 2.245642263341378, "grad_norm": 0.2834484633699396, "learning_rate": 1.805624761551119e-06, "loss": 0.0167, "step": 8374 }, { "epoch": 2.2459104317511396, "grad_norm": 0.203046579914804, "learning_rate": 1.8044246487728535e-06, "loss": 0.0096, "step": 8375 }, { "epoch": 2.246178600160901, "grad_norm": 0.22024456760589958, "learning_rate": 1.8032248471392755e-06, "loss": 0.0118, "step": 8376 }, { "epoch": 2.2464467685706624, "grad_norm": 0.25249396309268335, "learning_rate": 1.8020253567672085e-06, "loss": 0.0154, "step": 8377 }, { "epoch": 2.246714936980424, "grad_norm": 0.38038584475962667, "learning_rate": 1.800826177773441e-06, "loss": 0.0207, "step": 8378 }, { "epoch": 2.246983105390185, "grad_norm": 0.33561147818734655, "learning_rate": 1.7996273102747363e-06, "loss": 0.0143, "step": 8379 }, { "epoch": 2.247251273799946, "grad_norm": 0.24747128263755447, "learning_rate": 1.7984287543878216e-06, "loss": 0.0156, "step": 8380 }, { "epoch": 2.2475194422097076, "grad_norm": 0.3108950197889388, "learning_rate": 1.7972305102294007e-06, "loss": 0.0151, "step": 8381 }, { "epoch": 2.247787610619469, "grad_norm": 0.2875559243437829, "learning_rate": 1.7960325779161408e-06, "loss": 0.0135, "step": 8382 }, { "epoch": 2.2480557790292304, "grad_norm": 0.22202555936019988, "learning_rate": 1.7948349575646817e-06, "loss": 0.0103, "step": 8383 }, { "epoch": 2.2483239474389918, "grad_norm": 0.2519429999289391, "learning_rate": 1.793637649291635e-06, "loss": 0.0175, "step": 8384 }, { "epoch": 2.248592115848753, "grad_norm": 0.28930475977184633, "learning_rate": 1.7924406532135764e-06, "loss": 0.0167, "step": 8385 }, { "epoch": 2.248860284258514, "grad_norm": 0.2354980726770266, "learning_rate": 1.7912439694470575e-06, "loss": 0.0127, "step": 8386 }, { "epoch": 2.2491284526682755, "grad_norm": 0.2518805655968204, "learning_rate": 1.7900475981085935e-06, "loss": 0.013, "step": 8387 }, { "epoch": 2.249396621078037, "grad_norm": 0.3403962671344334, "learning_rate": 1.788851539314671e-06, "loss": 0.0198, "step": 8388 }, { "epoch": 2.2496647894877984, "grad_norm": 0.26899681165811345, "learning_rate": 1.7876557931817507e-06, "loss": 0.0154, "step": 8389 }, { "epoch": 2.2499329578975598, "grad_norm": 0.27006601080909026, "learning_rate": 1.7864603598262549e-06, "loss": 0.0172, "step": 8390 }, { "epoch": 2.250201126307321, "grad_norm": 0.26206043491755426, "learning_rate": 1.7852652393645842e-06, "loss": 0.0168, "step": 8391 }, { "epoch": 2.2504692947170826, "grad_norm": 0.26925943706433847, "learning_rate": 1.7840704319131009e-06, "loss": 0.0168, "step": 8392 }, { "epoch": 2.2507374631268435, "grad_norm": 0.35960691979775616, "learning_rate": 1.782875937588141e-06, "loss": 0.016, "step": 8393 }, { "epoch": 2.251005631536605, "grad_norm": 0.3016779320410955, "learning_rate": 1.7816817565060113e-06, "loss": 0.0154, "step": 8394 }, { "epoch": 2.2512737999463663, "grad_norm": 0.202546778624593, "learning_rate": 1.780487888782983e-06, "loss": 0.0108, "step": 8395 }, { "epoch": 2.2515419683561277, "grad_norm": 0.19196112769990653, "learning_rate": 1.779294334535303e-06, "loss": 0.0119, "step": 8396 }, { "epoch": 2.251810136765889, "grad_norm": 0.20738712280280852, "learning_rate": 1.7781010938791815e-06, "loss": 0.0107, "step": 8397 }, { "epoch": 2.25207830517565, "grad_norm": 0.26993265985758114, "learning_rate": 1.7769081669308035e-06, "loss": 0.0127, "step": 8398 }, { "epoch": 2.2523464735854115, "grad_norm": 0.3232650437046919, "learning_rate": 1.775715553806321e-06, "loss": 0.0149, "step": 8399 }, { "epoch": 2.252614641995173, "grad_norm": 0.20868487809385922, "learning_rate": 1.7745232546218522e-06, "loss": 0.0118, "step": 8400 }, { "epoch": 2.2528828104049343, "grad_norm": 0.15214608610035332, "learning_rate": 1.7733312694934929e-06, "loss": 0.0071, "step": 8401 }, { "epoch": 2.2531509788146957, "grad_norm": 0.23290841853588823, "learning_rate": 1.7721395985372997e-06, "loss": 0.0129, "step": 8402 }, { "epoch": 2.253419147224457, "grad_norm": 0.3342736360530436, "learning_rate": 1.7709482418693036e-06, "loss": 0.0243, "step": 8403 }, { "epoch": 2.2536873156342185, "grad_norm": 0.23649168326749187, "learning_rate": 1.7697571996055058e-06, "loss": 0.0107, "step": 8404 }, { "epoch": 2.2539554840439795, "grad_norm": 0.21544977778078253, "learning_rate": 1.7685664718618716e-06, "loss": 0.0093, "step": 8405 }, { "epoch": 2.254223652453741, "grad_norm": 0.21530024485611202, "learning_rate": 1.7673760587543425e-06, "loss": 0.0088, "step": 8406 }, { "epoch": 2.2544918208635023, "grad_norm": 0.24820648766956802, "learning_rate": 1.766185960398823e-06, "loss": 0.0169, "step": 8407 }, { "epoch": 2.2547599892732637, "grad_norm": 0.19072420963277156, "learning_rate": 1.7649961769111922e-06, "loss": 0.0144, "step": 8408 }, { "epoch": 2.255028157683025, "grad_norm": 0.19346045257770214, "learning_rate": 1.763806708407293e-06, "loss": 0.0119, "step": 8409 }, { "epoch": 2.255296326092786, "grad_norm": 0.26910231680635227, "learning_rate": 1.7626175550029446e-06, "loss": 0.0139, "step": 8410 }, { "epoch": 2.2555644945025475, "grad_norm": 0.2779314829663427, "learning_rate": 1.7614287168139298e-06, "loss": 0.0189, "step": 8411 }, { "epoch": 2.255832662912309, "grad_norm": 0.2443602650053386, "learning_rate": 1.760240193956001e-06, "loss": 0.0101, "step": 8412 }, { "epoch": 2.2561008313220703, "grad_norm": 0.1945014191963831, "learning_rate": 1.7590519865448829e-06, "loss": 0.0086, "step": 8413 }, { "epoch": 2.2563689997318317, "grad_norm": 0.2162503105149135, "learning_rate": 1.7578640946962699e-06, "loss": 0.013, "step": 8414 }, { "epoch": 2.256637168141593, "grad_norm": 0.2962144829908356, "learning_rate": 1.7566765185258205e-06, "loss": 0.0169, "step": 8415 }, { "epoch": 2.256905336551354, "grad_norm": 0.18888010985306866, "learning_rate": 1.75548925814917e-06, "loss": 0.0103, "step": 8416 }, { "epoch": 2.2571735049611155, "grad_norm": 0.19608193591592452, "learning_rate": 1.7543023136819144e-06, "loss": 0.0097, "step": 8417 }, { "epoch": 2.257441673370877, "grad_norm": 0.29292365143455823, "learning_rate": 1.753115685239627e-06, "loss": 0.0125, "step": 8418 }, { "epoch": 2.2577098417806383, "grad_norm": 0.27850452980986307, "learning_rate": 1.7519293729378429e-06, "loss": 0.0143, "step": 8419 }, { "epoch": 2.2579780101903997, "grad_norm": 0.24212283742353258, "learning_rate": 1.750743376892073e-06, "loss": 0.0098, "step": 8420 }, { "epoch": 2.258246178600161, "grad_norm": 0.217117261100985, "learning_rate": 1.7495576972177952e-06, "loss": 0.0111, "step": 8421 }, { "epoch": 2.258514347009922, "grad_norm": 0.19667290683446428, "learning_rate": 1.7483723340304547e-06, "loss": 0.011, "step": 8422 }, { "epoch": 2.2587825154196834, "grad_norm": 0.4082059929411152, "learning_rate": 1.747187287445467e-06, "loss": 0.0186, "step": 8423 }, { "epoch": 2.259050683829445, "grad_norm": 0.18226474843049434, "learning_rate": 1.7460025575782158e-06, "loss": 0.0093, "step": 8424 }, { "epoch": 2.2593188522392063, "grad_norm": 0.2087185802700878, "learning_rate": 1.744818144544056e-06, "loss": 0.0108, "step": 8425 }, { "epoch": 2.2595870206489677, "grad_norm": 0.24860243094253248, "learning_rate": 1.743634048458313e-06, "loss": 0.0173, "step": 8426 }, { "epoch": 2.259855189058729, "grad_norm": 0.37384976359409355, "learning_rate": 1.7424502694362755e-06, "loss": 0.0206, "step": 8427 }, { "epoch": 2.26012335746849, "grad_norm": 0.21340279001700604, "learning_rate": 1.7412668075932082e-06, "loss": 0.0129, "step": 8428 }, { "epoch": 2.2603915258782514, "grad_norm": 0.26589111252871844, "learning_rate": 1.7400836630443386e-06, "loss": 0.0204, "step": 8429 }, { "epoch": 2.260659694288013, "grad_norm": 0.2154091391984328, "learning_rate": 1.7389008359048677e-06, "loss": 0.0119, "step": 8430 }, { "epoch": 2.2609278626977742, "grad_norm": 0.2795333436193349, "learning_rate": 1.7377183262899662e-06, "loss": 0.0138, "step": 8431 }, { "epoch": 2.2611960311075356, "grad_norm": 0.4037599656868992, "learning_rate": 1.736536134314768e-06, "loss": 0.0163, "step": 8432 }, { "epoch": 2.261464199517297, "grad_norm": 0.22674526989891602, "learning_rate": 1.735354260094384e-06, "loss": 0.0124, "step": 8433 }, { "epoch": 2.261732367927058, "grad_norm": 0.327041357828816, "learning_rate": 1.7341727037438866e-06, "loss": 0.0133, "step": 8434 }, { "epoch": 2.2620005363368194, "grad_norm": 0.2737326461304354, "learning_rate": 1.7329914653783241e-06, "loss": 0.0146, "step": 8435 }, { "epoch": 2.262268704746581, "grad_norm": 0.27410649926241926, "learning_rate": 1.7318105451127087e-06, "loss": 0.0141, "step": 8436 }, { "epoch": 2.262536873156342, "grad_norm": 0.19795372617334916, "learning_rate": 1.7306299430620222e-06, "loss": 0.0119, "step": 8437 }, { "epoch": 2.2628050415661036, "grad_norm": 0.4027353533822718, "learning_rate": 1.7294496593412198e-06, "loss": 0.0203, "step": 8438 }, { "epoch": 2.263073209975865, "grad_norm": 0.2637329834433796, "learning_rate": 1.7282696940652188e-06, "loss": 0.018, "step": 8439 }, { "epoch": 2.263341378385626, "grad_norm": 0.22831583786030787, "learning_rate": 1.7270900473489138e-06, "loss": 0.0104, "step": 8440 }, { "epoch": 2.2636095467953874, "grad_norm": 0.23113302301453473, "learning_rate": 1.7259107193071594e-06, "loss": 0.0168, "step": 8441 }, { "epoch": 2.263877715205149, "grad_norm": 0.20839182480001603, "learning_rate": 1.7247317100547855e-06, "loss": 0.0121, "step": 8442 }, { "epoch": 2.26414588361491, "grad_norm": 0.3121122235370847, "learning_rate": 1.7235530197065913e-06, "loss": 0.0245, "step": 8443 }, { "epoch": 2.2644140520246716, "grad_norm": 0.2634070448337121, "learning_rate": 1.722374648377339e-06, "loss": 0.0136, "step": 8444 }, { "epoch": 2.264682220434433, "grad_norm": 0.24115031838263482, "learning_rate": 1.7211965961817672e-06, "loss": 0.0119, "step": 8445 }, { "epoch": 2.264950388844194, "grad_norm": 0.22925340229063987, "learning_rate": 1.720018863234576e-06, "loss": 0.0122, "step": 8446 }, { "epoch": 2.2652185572539554, "grad_norm": 0.19698281015379107, "learning_rate": 1.7188414496504413e-06, "loss": 0.0134, "step": 8447 }, { "epoch": 2.265486725663717, "grad_norm": 0.20768577143007977, "learning_rate": 1.7176643555440036e-06, "loss": 0.0128, "step": 8448 }, { "epoch": 2.265754894073478, "grad_norm": 0.22495302651168775, "learning_rate": 1.7164875810298714e-06, "loss": 0.0105, "step": 8449 }, { "epoch": 2.2660230624832396, "grad_norm": 0.46937709758612356, "learning_rate": 1.715311126222628e-06, "loss": 0.013, "step": 8450 }, { "epoch": 2.266291230893001, "grad_norm": 0.23662344246469108, "learning_rate": 1.714134991236817e-06, "loss": 0.0165, "step": 8451 }, { "epoch": 2.266559399302762, "grad_norm": 0.3666947739177268, "learning_rate": 1.7129591761869585e-06, "loss": 0.014, "step": 8452 }, { "epoch": 2.2668275677125234, "grad_norm": 0.1530048273735077, "learning_rate": 1.71178368118754e-06, "loss": 0.0063, "step": 8453 }, { "epoch": 2.2670957361222848, "grad_norm": 0.24259126111725363, "learning_rate": 1.7106085063530125e-06, "loss": 0.0168, "step": 8454 }, { "epoch": 2.267363904532046, "grad_norm": 0.1991659782595108, "learning_rate": 1.7094336517978033e-06, "loss": 0.0126, "step": 8455 }, { "epoch": 2.2676320729418076, "grad_norm": 0.2532396945980159, "learning_rate": 1.7082591176363018e-06, "loss": 0.0111, "step": 8456 }, { "epoch": 2.267900241351569, "grad_norm": 0.30459714695482926, "learning_rate": 1.707084903982872e-06, "loss": 0.0267, "step": 8457 }, { "epoch": 2.26816840976133, "grad_norm": 0.2433995567309358, "learning_rate": 1.7059110109518407e-06, "loss": 0.0106, "step": 8458 }, { "epoch": 2.2684365781710913, "grad_norm": 0.28978017889972923, "learning_rate": 1.7047374386575104e-06, "loss": 0.0162, "step": 8459 }, { "epoch": 2.2687047465808527, "grad_norm": 0.29282937259234343, "learning_rate": 1.7035641872141474e-06, "loss": 0.023, "step": 8460 }, { "epoch": 2.268972914990614, "grad_norm": 0.2201059834557375, "learning_rate": 1.702391256735985e-06, "loss": 0.0093, "step": 8461 }, { "epoch": 2.2692410834003756, "grad_norm": 0.22789475962591008, "learning_rate": 1.7012186473372317e-06, "loss": 0.0113, "step": 8462 }, { "epoch": 2.2695092518101365, "grad_norm": 0.273189725525553, "learning_rate": 1.7000463591320621e-06, "loss": 0.0178, "step": 8463 }, { "epoch": 2.269777420219898, "grad_norm": 0.20273513360110934, "learning_rate": 1.6988743922346156e-06, "loss": 0.0099, "step": 8464 }, { "epoch": 2.2700455886296593, "grad_norm": 0.29250991970413154, "learning_rate": 1.6977027467590068e-06, "loss": 0.0216, "step": 8465 }, { "epoch": 2.2703137570394207, "grad_norm": 0.2723107381632108, "learning_rate": 1.6965314228193126e-06, "loss": 0.0192, "step": 8466 }, { "epoch": 2.270581925449182, "grad_norm": 0.20941681535076634, "learning_rate": 1.6953604205295848e-06, "loss": 0.0133, "step": 8467 }, { "epoch": 2.2708500938589435, "grad_norm": 0.27709954285805866, "learning_rate": 1.6941897400038376e-06, "loss": 0.015, "step": 8468 }, { "epoch": 2.271118262268705, "grad_norm": 0.23605061823897994, "learning_rate": 1.6930193813560586e-06, "loss": 0.0127, "step": 8469 }, { "epoch": 2.271386430678466, "grad_norm": 0.2826037160527682, "learning_rate": 1.6918493447002043e-06, "loss": 0.0145, "step": 8470 }, { "epoch": 2.2716545990882273, "grad_norm": 0.26291507719180107, "learning_rate": 1.6906796301501965e-06, "loss": 0.0135, "step": 8471 }, { "epoch": 2.2719227674979887, "grad_norm": 0.24340630299311553, "learning_rate": 1.6895102378199252e-06, "loss": 0.0151, "step": 8472 }, { "epoch": 2.27219093590775, "grad_norm": 0.2568415515716202, "learning_rate": 1.6883411678232548e-06, "loss": 0.0202, "step": 8473 }, { "epoch": 2.2724591043175115, "grad_norm": 0.16282139438450577, "learning_rate": 1.6871724202740103e-06, "loss": 0.0083, "step": 8474 }, { "epoch": 2.2727272727272725, "grad_norm": 0.2775997368887086, "learning_rate": 1.6860039952859941e-06, "loss": 0.0132, "step": 8475 }, { "epoch": 2.272995441137034, "grad_norm": 0.23970074685438542, "learning_rate": 1.6848358929729687e-06, "loss": 0.0114, "step": 8476 }, { "epoch": 2.2732636095467953, "grad_norm": 0.25299634110264096, "learning_rate": 1.683668113448672e-06, "loss": 0.0178, "step": 8477 }, { "epoch": 2.2735317779565567, "grad_norm": 0.24066257414749712, "learning_rate": 1.6825006568268048e-06, "loss": 0.0135, "step": 8478 }, { "epoch": 2.273799946366318, "grad_norm": 0.25926075723699976, "learning_rate": 1.6813335232210416e-06, "loss": 0.0129, "step": 8479 }, { "epoch": 2.2740681147760795, "grad_norm": 0.3643012234394694, "learning_rate": 1.6801667127450233e-06, "loss": 0.0142, "step": 8480 }, { "epoch": 2.274336283185841, "grad_norm": 0.22886255552419288, "learning_rate": 1.6790002255123567e-06, "loss": 0.0122, "step": 8481 }, { "epoch": 2.274604451595602, "grad_norm": 0.2621957705359423, "learning_rate": 1.6778340616366228e-06, "loss": 0.0124, "step": 8482 }, { "epoch": 2.2748726200053633, "grad_norm": 0.3351720043750013, "learning_rate": 1.6766682212313662e-06, "loss": 0.0142, "step": 8483 }, { "epoch": 2.2751407884151247, "grad_norm": 0.22377702206473182, "learning_rate": 1.6755027044100996e-06, "loss": 0.0128, "step": 8484 }, { "epoch": 2.275408956824886, "grad_norm": 0.29362666190855174, "learning_rate": 1.6743375112863103e-06, "loss": 0.0193, "step": 8485 }, { "epoch": 2.2756771252346475, "grad_norm": 0.3482797014707081, "learning_rate": 1.6731726419734461e-06, "loss": 0.0231, "step": 8486 }, { "epoch": 2.2759452936444085, "grad_norm": 0.2406188999772896, "learning_rate": 1.672008096584931e-06, "loss": 0.0177, "step": 8487 }, { "epoch": 2.27621346205417, "grad_norm": 0.24758160982945265, "learning_rate": 1.6708438752341506e-06, "loss": 0.015, "step": 8488 }, { "epoch": 2.2764816304639313, "grad_norm": 0.27442449128690044, "learning_rate": 1.669679978034463e-06, "loss": 0.0132, "step": 8489 }, { "epoch": 2.2767497988736927, "grad_norm": 0.2841118250840289, "learning_rate": 1.668516405099196e-06, "loss": 0.0144, "step": 8490 }, { "epoch": 2.277017967283454, "grad_norm": 0.1854022618083752, "learning_rate": 1.6673531565416395e-06, "loss": 0.008, "step": 8491 }, { "epoch": 2.2772861356932155, "grad_norm": 0.22814781678986457, "learning_rate": 1.66619023247506e-06, "loss": 0.015, "step": 8492 }, { "epoch": 2.277554304102977, "grad_norm": 0.20524171405366431, "learning_rate": 1.6650276330126853e-06, "loss": 0.0143, "step": 8493 }, { "epoch": 2.277822472512738, "grad_norm": 0.21054793591251186, "learning_rate": 1.6638653582677166e-06, "loss": 0.009, "step": 8494 }, { "epoch": 2.2780906409224992, "grad_norm": 0.2718074338655996, "learning_rate": 1.6627034083533216e-06, "loss": 0.0167, "step": 8495 }, { "epoch": 2.2783588093322606, "grad_norm": 0.28732617270328803, "learning_rate": 1.661541783382633e-06, "loss": 0.0123, "step": 8496 }, { "epoch": 2.278626977742022, "grad_norm": 0.4803803441162348, "learning_rate": 1.6603804834687597e-06, "loss": 0.0255, "step": 8497 }, { "epoch": 2.2788951461517835, "grad_norm": 0.24658436804267175, "learning_rate": 1.6592195087247703e-06, "loss": 0.0136, "step": 8498 }, { "epoch": 2.2791633145615444, "grad_norm": 0.2600345584599022, "learning_rate": 1.658058859263708e-06, "loss": 0.0148, "step": 8499 }, { "epoch": 2.279431482971306, "grad_norm": 0.2806213527335778, "learning_rate": 1.6568985351985833e-06, "loss": 0.0182, "step": 8500 }, { "epoch": 2.2796996513810672, "grad_norm": 0.21882891356023207, "learning_rate": 1.6557385366423706e-06, "loss": 0.0124, "step": 8501 }, { "epoch": 2.2799678197908286, "grad_norm": 0.2590529117257352, "learning_rate": 1.6545788637080195e-06, "loss": 0.0152, "step": 8502 }, { "epoch": 2.28023598820059, "grad_norm": 0.32070816632245663, "learning_rate": 1.6534195165084417e-06, "loss": 0.0209, "step": 8503 }, { "epoch": 2.2805041566103514, "grad_norm": 0.2245510799972336, "learning_rate": 1.6522604951565213e-06, "loss": 0.0127, "step": 8504 }, { "epoch": 2.280772325020113, "grad_norm": 0.2627587646071397, "learning_rate": 1.6511017997651069e-06, "loss": 0.0114, "step": 8505 }, { "epoch": 2.281040493429874, "grad_norm": 0.24922569063116717, "learning_rate": 1.6499434304470192e-06, "loss": 0.0147, "step": 8506 }, { "epoch": 2.281308661839635, "grad_norm": 0.28996286727518145, "learning_rate": 1.6487853873150489e-06, "loss": 0.0164, "step": 8507 }, { "epoch": 2.2815768302493966, "grad_norm": 0.28964302425281013, "learning_rate": 1.6476276704819443e-06, "loss": 0.0173, "step": 8508 }, { "epoch": 2.281844998659158, "grad_norm": 0.21446319948636575, "learning_rate": 1.6464702800604337e-06, "loss": 0.0077, "step": 8509 }, { "epoch": 2.2821131670689194, "grad_norm": 0.26190773480088075, "learning_rate": 1.6453132161632095e-06, "loss": 0.0159, "step": 8510 }, { "epoch": 2.2823813354786804, "grad_norm": 0.20840110109380103, "learning_rate": 1.64415647890293e-06, "loss": 0.0133, "step": 8511 }, { "epoch": 2.282649503888442, "grad_norm": 0.20761017796542586, "learning_rate": 1.6430000683922258e-06, "loss": 0.0088, "step": 8512 }, { "epoch": 2.282917672298203, "grad_norm": 0.21272298851348512, "learning_rate": 1.6418439847436907e-06, "loss": 0.0148, "step": 8513 }, { "epoch": 2.2831858407079646, "grad_norm": 0.23052055012735603, "learning_rate": 1.6406882280698926e-06, "loss": 0.0131, "step": 8514 }, { "epoch": 2.283454009117726, "grad_norm": 0.21278876835311675, "learning_rate": 1.6395327984833614e-06, "loss": 0.0115, "step": 8515 }, { "epoch": 2.2837221775274874, "grad_norm": 0.27951236105264277, "learning_rate": 1.6383776960965997e-06, "loss": 0.0145, "step": 8516 }, { "epoch": 2.283990345937249, "grad_norm": 0.39106713856218195, "learning_rate": 1.6372229210220785e-06, "loss": 0.0292, "step": 8517 }, { "epoch": 2.2842585143470098, "grad_norm": 0.27699602435211956, "learning_rate": 1.636068473372232e-06, "loss": 0.0139, "step": 8518 }, { "epoch": 2.284526682756771, "grad_norm": 0.21526888536503336, "learning_rate": 1.6349143532594702e-06, "loss": 0.0084, "step": 8519 }, { "epoch": 2.2847948511665326, "grad_norm": 0.26269675252639224, "learning_rate": 1.6337605607961604e-06, "loss": 0.0166, "step": 8520 }, { "epoch": 2.285063019576294, "grad_norm": 0.21093303678308473, "learning_rate": 1.6326070960946478e-06, "loss": 0.0142, "step": 8521 }, { "epoch": 2.2853311879860554, "grad_norm": 0.22947905171018737, "learning_rate": 1.6314539592672435e-06, "loss": 0.0155, "step": 8522 }, { "epoch": 2.2855993563958164, "grad_norm": 0.3304083339666778, "learning_rate": 1.6303011504262223e-06, "loss": 0.0179, "step": 8523 }, { "epoch": 2.2858675248055778, "grad_norm": 0.2898016325597004, "learning_rate": 1.6291486696838332e-06, "loss": 0.0145, "step": 8524 }, { "epoch": 2.286135693215339, "grad_norm": 0.2439958636102099, "learning_rate": 1.627996517152287e-06, "loss": 0.0105, "step": 8525 }, { "epoch": 2.2864038616251006, "grad_norm": 0.2693337846878799, "learning_rate": 1.6268446929437677e-06, "loss": 0.0143, "step": 8526 }, { "epoch": 2.286672030034862, "grad_norm": 0.19800695752480713, "learning_rate": 1.6256931971704266e-06, "loss": 0.0091, "step": 8527 }, { "epoch": 2.2869401984446234, "grad_norm": 0.2736790571679425, "learning_rate": 1.6245420299443788e-06, "loss": 0.0152, "step": 8528 }, { "epoch": 2.2872083668543848, "grad_norm": 0.2796611359984947, "learning_rate": 1.6233911913777135e-06, "loss": 0.015, "step": 8529 }, { "epoch": 2.2874765352641457, "grad_norm": 0.2698950595977366, "learning_rate": 1.622240681582481e-06, "loss": 0.0122, "step": 8530 }, { "epoch": 2.287744703673907, "grad_norm": 0.23332941311845956, "learning_rate": 1.6210905006707079e-06, "loss": 0.0142, "step": 8531 }, { "epoch": 2.2880128720836685, "grad_norm": 0.23352858651534042, "learning_rate": 1.6199406487543817e-06, "loss": 0.0106, "step": 8532 }, { "epoch": 2.28828104049343, "grad_norm": 0.3628079292751081, "learning_rate": 1.6187911259454586e-06, "loss": 0.0232, "step": 8533 }, { "epoch": 2.2885492089031914, "grad_norm": 0.21606384361387734, "learning_rate": 1.6176419323558689e-06, "loss": 0.0104, "step": 8534 }, { "epoch": 2.2888173773129523, "grad_norm": 0.23478407077827018, "learning_rate": 1.6164930680975021e-06, "loss": 0.0117, "step": 8535 }, { "epoch": 2.2890855457227137, "grad_norm": 0.27461262288016375, "learning_rate": 1.6153445332822243e-06, "loss": 0.0193, "step": 8536 }, { "epoch": 2.289353714132475, "grad_norm": 0.27065145211918784, "learning_rate": 1.6141963280218615e-06, "loss": 0.0155, "step": 8537 }, { "epoch": 2.2896218825422365, "grad_norm": 0.5295796368434448, "learning_rate": 1.6130484524282125e-06, "loss": 0.0219, "step": 8538 }, { "epoch": 2.289890050951998, "grad_norm": 0.16323762444684314, "learning_rate": 1.611900906613046e-06, "loss": 0.0098, "step": 8539 }, { "epoch": 2.2901582193617593, "grad_norm": 0.2892312485584017, "learning_rate": 1.6107536906880904e-06, "loss": 0.0188, "step": 8540 }, { "epoch": 2.2904263877715207, "grad_norm": 0.2436998962697862, "learning_rate": 1.6096068047650521e-06, "loss": 0.0136, "step": 8541 }, { "epoch": 2.2906945561812817, "grad_norm": 0.1988970918027792, "learning_rate": 1.6084602489555963e-06, "loss": 0.0098, "step": 8542 }, { "epoch": 2.290962724591043, "grad_norm": 0.2960631853201012, "learning_rate": 1.607314023371363e-06, "loss": 0.0148, "step": 8543 }, { "epoch": 2.2912308930008045, "grad_norm": 0.24372428053667514, "learning_rate": 1.6061681281239554e-06, "loss": 0.0108, "step": 8544 }, { "epoch": 2.291499061410566, "grad_norm": 0.24917036764393638, "learning_rate": 1.605022563324946e-06, "loss": 0.0146, "step": 8545 }, { "epoch": 2.2917672298203273, "grad_norm": 0.4487712075695106, "learning_rate": 1.6038773290858773e-06, "loss": 0.015, "step": 8546 }, { "epoch": 2.2920353982300883, "grad_norm": 0.17495563739516928, "learning_rate": 1.6027324255182547e-06, "loss": 0.0092, "step": 8547 }, { "epoch": 2.2923035666398497, "grad_norm": 0.2066927858919382, "learning_rate": 1.6015878527335566e-06, "loss": 0.0117, "step": 8548 }, { "epoch": 2.292571735049611, "grad_norm": 0.23330762011233286, "learning_rate": 1.600443610843228e-06, "loss": 0.0127, "step": 8549 }, { "epoch": 2.2928399034593725, "grad_norm": 0.2403612175731807, "learning_rate": 1.599299699958678e-06, "loss": 0.0139, "step": 8550 }, { "epoch": 2.293108071869134, "grad_norm": 0.2096230403278886, "learning_rate": 1.5981561201912893e-06, "loss": 0.0107, "step": 8551 }, { "epoch": 2.2933762402788953, "grad_norm": 0.18043890025319256, "learning_rate": 1.5970128716524052e-06, "loss": 0.0098, "step": 8552 }, { "epoch": 2.2936444086886567, "grad_norm": 0.28130873570064524, "learning_rate": 1.595869954453345e-06, "loss": 0.0125, "step": 8553 }, { "epoch": 2.2939125770984177, "grad_norm": 0.2985108464714816, "learning_rate": 1.5947273687053882e-06, "loss": 0.0196, "step": 8554 }, { "epoch": 2.294180745508179, "grad_norm": 0.19109555128876005, "learning_rate": 1.5935851145197867e-06, "loss": 0.0111, "step": 8555 }, { "epoch": 2.2944489139179405, "grad_norm": 0.2799930477074702, "learning_rate": 1.5924431920077615e-06, "loss": 0.0168, "step": 8556 }, { "epoch": 2.294717082327702, "grad_norm": 0.2061159324620164, "learning_rate": 1.5913016012804927e-06, "loss": 0.0114, "step": 8557 }, { "epoch": 2.2949852507374633, "grad_norm": 0.29094974679386515, "learning_rate": 1.5901603424491375e-06, "loss": 0.0157, "step": 8558 }, { "epoch": 2.2952534191472242, "grad_norm": 0.2748932428405859, "learning_rate": 1.5890194156248178e-06, "loss": 0.0185, "step": 8559 }, { "epoch": 2.2955215875569857, "grad_norm": 0.2340296208759539, "learning_rate": 1.5878788209186203e-06, "loss": 0.011, "step": 8560 }, { "epoch": 2.295789755966747, "grad_norm": 0.30880371525182687, "learning_rate": 1.5867385584416052e-06, "loss": 0.0146, "step": 8561 }, { "epoch": 2.2960579243765085, "grad_norm": 0.24634459825291338, "learning_rate": 1.585598628304792e-06, "loss": 0.0141, "step": 8562 }, { "epoch": 2.29632609278627, "grad_norm": 0.23238923951575596, "learning_rate": 1.5844590306191776e-06, "loss": 0.012, "step": 8563 }, { "epoch": 2.2965942611960313, "grad_norm": 0.22994407720841128, "learning_rate": 1.583319765495717e-06, "loss": 0.0108, "step": 8564 }, { "epoch": 2.2968624296057927, "grad_norm": 0.22202755640472555, "learning_rate": 1.5821808330453403e-06, "loss": 0.0138, "step": 8565 }, { "epoch": 2.2971305980155536, "grad_norm": 0.2306430729146389, "learning_rate": 1.5810422333789432e-06, "loss": 0.0096, "step": 8566 }, { "epoch": 2.297398766425315, "grad_norm": 0.2519542834760024, "learning_rate": 1.5799039666073852e-06, "loss": 0.0148, "step": 8567 }, { "epoch": 2.2976669348350764, "grad_norm": 0.20352706729632353, "learning_rate": 1.5787660328414993e-06, "loss": 0.0136, "step": 8568 }, { "epoch": 2.297935103244838, "grad_norm": 0.19152395116537818, "learning_rate": 1.5776284321920814e-06, "loss": 0.0113, "step": 8569 }, { "epoch": 2.2982032716545993, "grad_norm": 0.2470719405727053, "learning_rate": 1.5764911647698956e-06, "loss": 0.0135, "step": 8570 }, { "epoch": 2.29847144006436, "grad_norm": 0.24031869693112035, "learning_rate": 1.5753542306856774e-06, "loss": 0.0119, "step": 8571 }, { "epoch": 2.2987396084741216, "grad_norm": 0.2096520176333196, "learning_rate": 1.574217630050124e-06, "loss": 0.0126, "step": 8572 }, { "epoch": 2.299007776883883, "grad_norm": 0.477953625726046, "learning_rate": 1.5730813629739055e-06, "loss": 0.0121, "step": 8573 }, { "epoch": 2.2992759452936444, "grad_norm": 0.2057460610459564, "learning_rate": 1.5719454295676555e-06, "loss": 0.0104, "step": 8574 }, { "epoch": 2.299544113703406, "grad_norm": 0.18764641461850087, "learning_rate": 1.5708098299419778e-06, "loss": 0.0114, "step": 8575 }, { "epoch": 2.2998122821131672, "grad_norm": 0.23646356723862264, "learning_rate": 1.569674564207444e-06, "loss": 0.011, "step": 8576 }, { "epoch": 2.3000804505229286, "grad_norm": 0.25307598388373187, "learning_rate": 1.5685396324745888e-06, "loss": 0.0124, "step": 8577 }, { "epoch": 2.3003486189326896, "grad_norm": 0.25461928705345865, "learning_rate": 1.567405034853921e-06, "loss": 0.0116, "step": 8578 }, { "epoch": 2.300616787342451, "grad_norm": 0.2671985608749617, "learning_rate": 1.56627077145591e-06, "loss": 0.0144, "step": 8579 }, { "epoch": 2.3008849557522124, "grad_norm": 0.2755625660026213, "learning_rate": 1.565136842390999e-06, "loss": 0.0199, "step": 8580 }, { "epoch": 2.301153124161974, "grad_norm": 0.2247513374118995, "learning_rate": 1.5640032477695942e-06, "loss": 0.0135, "step": 8581 }, { "epoch": 2.3014212925717352, "grad_norm": 0.2494479077495726, "learning_rate": 1.5628699877020698e-06, "loss": 0.0145, "step": 8582 }, { "epoch": 2.301689460981496, "grad_norm": 0.3995341164074962, "learning_rate": 1.5617370622987703e-06, "loss": 0.018, "step": 8583 }, { "epoch": 2.3019576293912576, "grad_norm": 0.17771201803975975, "learning_rate": 1.560604471670003e-06, "loss": 0.0082, "step": 8584 }, { "epoch": 2.302225797801019, "grad_norm": 0.262371696980084, "learning_rate": 1.5594722159260473e-06, "loss": 0.0127, "step": 8585 }, { "epoch": 2.3024939662107804, "grad_norm": 0.2607488845230623, "learning_rate": 1.5583402951771492e-06, "loss": 0.0096, "step": 8586 }, { "epoch": 2.302762134620542, "grad_norm": 0.2438573230479588, "learning_rate": 1.557208709533517e-06, "loss": 0.0115, "step": 8587 }, { "epoch": 2.303030303030303, "grad_norm": 0.27221245642541997, "learning_rate": 1.556077459105334e-06, "loss": 0.0155, "step": 8588 }, { "epoch": 2.3032984714400646, "grad_norm": 0.1935712054688396, "learning_rate": 1.5549465440027444e-06, "loss": 0.0113, "step": 8589 }, { "epoch": 2.3035666398498256, "grad_norm": 0.2583407282458997, "learning_rate": 1.5538159643358642e-06, "loss": 0.0153, "step": 8590 }, { "epoch": 2.303834808259587, "grad_norm": 0.20017089471870775, "learning_rate": 1.5526857202147727e-06, "loss": 0.0121, "step": 8591 }, { "epoch": 2.3041029766693484, "grad_norm": 0.24224659808717244, "learning_rate": 1.551555811749522e-06, "loss": 0.014, "step": 8592 }, { "epoch": 2.30437114507911, "grad_norm": 0.251945168325294, "learning_rate": 1.5504262390501268e-06, "loss": 0.0154, "step": 8593 }, { "epoch": 2.304639313488871, "grad_norm": 0.18602493445541723, "learning_rate": 1.549297002226568e-06, "loss": 0.0091, "step": 8594 }, { "epoch": 2.304907481898632, "grad_norm": 0.21581372215885616, "learning_rate": 1.548168101388799e-06, "loss": 0.0161, "step": 8595 }, { "epoch": 2.3051756503083936, "grad_norm": 0.3066193808260382, "learning_rate": 1.547039536646739e-06, "loss": 0.0156, "step": 8596 }, { "epoch": 2.305443818718155, "grad_norm": 0.15517015279682161, "learning_rate": 1.54591130811027e-06, "loss": 0.0068, "step": 8597 }, { "epoch": 2.3057119871279164, "grad_norm": 0.3332669783422645, "learning_rate": 1.5447834158892483e-06, "loss": 0.0229, "step": 8598 }, { "epoch": 2.3059801555376778, "grad_norm": 0.18656299848711694, "learning_rate": 1.543655860093491e-06, "loss": 0.0119, "step": 8599 }, { "epoch": 2.306248323947439, "grad_norm": 0.2690538790953686, "learning_rate": 1.542528640832787e-06, "loss": 0.018, "step": 8600 }, { "epoch": 2.3065164923572, "grad_norm": 0.24971500485351675, "learning_rate": 1.5414017582168883e-06, "loss": 0.0186, "step": 8601 }, { "epoch": 2.3067846607669615, "grad_norm": 0.3341652588727936, "learning_rate": 1.5402752123555181e-06, "loss": 0.0156, "step": 8602 }, { "epoch": 2.307052829176723, "grad_norm": 0.26926704644825905, "learning_rate": 1.5391490033583674e-06, "loss": 0.0152, "step": 8603 }, { "epoch": 2.3073209975864843, "grad_norm": 0.23616964014569905, "learning_rate": 1.538023131335089e-06, "loss": 0.0151, "step": 8604 }, { "epoch": 2.3075891659962458, "grad_norm": 0.21770847723912887, "learning_rate": 1.536897596395307e-06, "loss": 0.0168, "step": 8605 }, { "epoch": 2.307857334406007, "grad_norm": 0.19345158612519578, "learning_rate": 1.53577239864861e-06, "loss": 0.0127, "step": 8606 }, { "epoch": 2.308125502815768, "grad_norm": 0.47884970037923785, "learning_rate": 1.5346475382045578e-06, "loss": 0.0199, "step": 8607 }, { "epoch": 2.3083936712255295, "grad_norm": 0.7507193422746499, "learning_rate": 1.5335230151726755e-06, "loss": 0.0177, "step": 8608 }, { "epoch": 2.308661839635291, "grad_norm": 0.23263385954289825, "learning_rate": 1.5323988296624526e-06, "loss": 0.0141, "step": 8609 }, { "epoch": 2.3089300080450523, "grad_norm": 0.24689395138637943, "learning_rate": 1.5312749817833511e-06, "loss": 0.0123, "step": 8610 }, { "epoch": 2.3091981764548137, "grad_norm": 0.20075668765169405, "learning_rate": 1.5301514716447935e-06, "loss": 0.0142, "step": 8611 }, { "epoch": 2.309466344864575, "grad_norm": 0.2737769088693984, "learning_rate": 1.5290282993561745e-06, "loss": 0.0163, "step": 8612 }, { "epoch": 2.309734513274336, "grad_norm": 0.2264524107132518, "learning_rate": 1.5279054650268571e-06, "loss": 0.0137, "step": 8613 }, { "epoch": 2.3100026816840975, "grad_norm": 0.26502545491268126, "learning_rate": 1.526782968766164e-06, "loss": 0.0213, "step": 8614 }, { "epoch": 2.310270850093859, "grad_norm": 0.3013048384046786, "learning_rate": 1.5256608106833942e-06, "loss": 0.018, "step": 8615 }, { "epoch": 2.3105390185036203, "grad_norm": 0.4271957220770806, "learning_rate": 1.5245389908878067e-06, "loss": 0.0137, "step": 8616 }, { "epoch": 2.3108071869133817, "grad_norm": 0.2765798965047091, "learning_rate": 1.523417509488629e-06, "loss": 0.014, "step": 8617 }, { "epoch": 2.311075355323143, "grad_norm": 0.21569703900886006, "learning_rate": 1.5222963665950597e-06, "loss": 0.0114, "step": 8618 }, { "epoch": 2.311343523732904, "grad_norm": 0.4386624662922243, "learning_rate": 1.5211755623162588e-06, "loss": 0.0151, "step": 8619 }, { "epoch": 2.3116116921426655, "grad_norm": 0.2995373571648426, "learning_rate": 1.5200550967613585e-06, "loss": 0.0185, "step": 8620 }, { "epoch": 2.311879860552427, "grad_norm": 0.28069588176177623, "learning_rate": 1.5189349700394529e-06, "loss": 0.0117, "step": 8621 }, { "epoch": 2.3121480289621883, "grad_norm": 0.30552362939894545, "learning_rate": 1.5178151822596077e-06, "loss": 0.0174, "step": 8622 }, { "epoch": 2.3124161973719497, "grad_norm": 0.23695501271099612, "learning_rate": 1.516695733530854e-06, "loss": 0.009, "step": 8623 }, { "epoch": 2.312684365781711, "grad_norm": 0.2830322535231299, "learning_rate": 1.5155766239621877e-06, "loss": 0.0167, "step": 8624 }, { "epoch": 2.312952534191472, "grad_norm": 0.2555145572284112, "learning_rate": 1.5144578536625758e-06, "loss": 0.0143, "step": 8625 }, { "epoch": 2.3132207026012335, "grad_norm": 0.2345941727158562, "learning_rate": 1.5133394227409476e-06, "loss": 0.016, "step": 8626 }, { "epoch": 2.313488871010995, "grad_norm": 0.30258556998965835, "learning_rate": 1.512221331306204e-06, "loss": 0.0215, "step": 8627 }, { "epoch": 2.3137570394207563, "grad_norm": 0.2281024957062922, "learning_rate": 1.5111035794672096e-06, "loss": 0.0086, "step": 8628 }, { "epoch": 2.3140252078305177, "grad_norm": 0.22569010762628666, "learning_rate": 1.5099861673327953e-06, "loss": 0.0124, "step": 8629 }, { "epoch": 2.314293376240279, "grad_norm": 0.25352845493748566, "learning_rate": 1.5088690950117641e-06, "loss": 0.011, "step": 8630 }, { "epoch": 2.31456154465004, "grad_norm": 0.25123579416985403, "learning_rate": 1.507752362612878e-06, "loss": 0.0153, "step": 8631 }, { "epoch": 2.3148297130598015, "grad_norm": 0.2758104481989522, "learning_rate": 1.5066359702448746e-06, "loss": 0.016, "step": 8632 }, { "epoch": 2.315097881469563, "grad_norm": 0.27401065533755614, "learning_rate": 1.5055199180164503e-06, "loss": 0.0138, "step": 8633 }, { "epoch": 2.3153660498793243, "grad_norm": 0.2266539853038789, "learning_rate": 1.504404206036274e-06, "loss": 0.0112, "step": 8634 }, { "epoch": 2.3156342182890857, "grad_norm": 0.200067612866986, "learning_rate": 1.5032888344129815e-06, "loss": 0.0068, "step": 8635 }, { "epoch": 2.3159023866988466, "grad_norm": 0.2388457188268222, "learning_rate": 1.5021738032551697e-06, "loss": 0.0119, "step": 8636 }, { "epoch": 2.316170555108608, "grad_norm": 0.2645702212712605, "learning_rate": 1.5010591126714096e-06, "loss": 0.0148, "step": 8637 }, { "epoch": 2.3164387235183694, "grad_norm": 0.2908803805266686, "learning_rate": 1.4999447627702329e-06, "loss": 0.0188, "step": 8638 }, { "epoch": 2.316706891928131, "grad_norm": 0.24610175218367125, "learning_rate": 1.4988307536601421e-06, "loss": 0.011, "step": 8639 }, { "epoch": 2.3169750603378922, "grad_norm": 0.24572047607030081, "learning_rate": 1.4977170854496082e-06, "loss": 0.0144, "step": 8640 }, { "epoch": 2.3172432287476536, "grad_norm": 0.19903839603043907, "learning_rate": 1.4966037582470605e-06, "loss": 0.0105, "step": 8641 }, { "epoch": 2.317511397157415, "grad_norm": 0.25698726669715444, "learning_rate": 1.4954907721609046e-06, "loss": 0.0125, "step": 8642 }, { "epoch": 2.317779565567176, "grad_norm": 0.2505192152892468, "learning_rate": 1.4943781272995073e-06, "loss": 0.0139, "step": 8643 }, { "epoch": 2.3180477339769374, "grad_norm": 0.33031862154365244, "learning_rate": 1.493265823771204e-06, "loss": 0.0137, "step": 8644 }, { "epoch": 2.318315902386699, "grad_norm": 0.5223775938305318, "learning_rate": 1.4921538616842983e-06, "loss": 0.0235, "step": 8645 }, { "epoch": 2.3185840707964602, "grad_norm": 0.24126975842564075, "learning_rate": 1.491042241147057e-06, "loss": 0.0114, "step": 8646 }, { "epoch": 2.3188522392062216, "grad_norm": 0.180003647600566, "learning_rate": 1.4899309622677177e-06, "loss": 0.0093, "step": 8647 }, { "epoch": 2.3191204076159826, "grad_norm": 0.23500087383834684, "learning_rate": 1.4888200251544803e-06, "loss": 0.0113, "step": 8648 }, { "epoch": 2.319388576025744, "grad_norm": 0.25511855656872956, "learning_rate": 1.487709429915517e-06, "loss": 0.0164, "step": 8649 }, { "epoch": 2.3196567444355054, "grad_norm": 0.2983068090196593, "learning_rate": 1.4865991766589593e-06, "loss": 0.0142, "step": 8650 }, { "epoch": 2.319924912845267, "grad_norm": 0.26417925216705035, "learning_rate": 1.4854892654929121e-06, "loss": 0.0153, "step": 8651 }, { "epoch": 2.320193081255028, "grad_norm": 0.3135512194675503, "learning_rate": 1.4843796965254475e-06, "loss": 0.0162, "step": 8652 }, { "epoch": 2.3204612496647896, "grad_norm": 0.340245926696201, "learning_rate": 1.483270469864595e-06, "loss": 0.0199, "step": 8653 }, { "epoch": 2.320729418074551, "grad_norm": 0.27072191202557566, "learning_rate": 1.4821615856183597e-06, "loss": 0.018, "step": 8654 }, { "epoch": 2.320997586484312, "grad_norm": 0.27330535554225177, "learning_rate": 1.481053043894713e-06, "loss": 0.0127, "step": 8655 }, { "epoch": 2.3212657548940734, "grad_norm": 0.23340843466556335, "learning_rate": 1.4799448448015874e-06, "loss": 0.0125, "step": 8656 }, { "epoch": 2.321533923303835, "grad_norm": 0.5090462783103876, "learning_rate": 1.4788369884468879e-06, "loss": 0.0202, "step": 8657 }, { "epoch": 2.321802091713596, "grad_norm": 0.20328981688919176, "learning_rate": 1.4777294749384812e-06, "loss": 0.0105, "step": 8658 }, { "epoch": 2.3220702601233576, "grad_norm": 0.2681476936830618, "learning_rate": 1.4766223043842055e-06, "loss": 0.0145, "step": 8659 }, { "epoch": 2.3223384285331186, "grad_norm": 0.2405206618204203, "learning_rate": 1.4755154768918595e-06, "loss": 0.0136, "step": 8660 }, { "epoch": 2.32260659694288, "grad_norm": 0.2585878013461934, "learning_rate": 1.4744089925692146e-06, "loss": 0.0117, "step": 8661 }, { "epoch": 2.3228747653526414, "grad_norm": 0.21775794137735757, "learning_rate": 1.4733028515240078e-06, "loss": 0.0127, "step": 8662 }, { "epoch": 2.3231429337624028, "grad_norm": 0.256179844960534, "learning_rate": 1.4721970538639368e-06, "loss": 0.0144, "step": 8663 }, { "epoch": 2.323411102172164, "grad_norm": 0.2557355753713361, "learning_rate": 1.4710915996966745e-06, "loss": 0.0138, "step": 8664 }, { "epoch": 2.3236792705819256, "grad_norm": 0.27140538148902027, "learning_rate": 1.4699864891298543e-06, "loss": 0.014, "step": 8665 }, { "epoch": 2.323947438991687, "grad_norm": 0.2261401722738106, "learning_rate": 1.468881722271076e-06, "loss": 0.0133, "step": 8666 }, { "epoch": 2.324215607401448, "grad_norm": 0.28234224604889957, "learning_rate": 1.467777299227911e-06, "loss": 0.0192, "step": 8667 }, { "epoch": 2.3244837758112094, "grad_norm": 0.24259925820230865, "learning_rate": 1.4666732201078903e-06, "loss": 0.0172, "step": 8668 }, { "epoch": 2.3247519442209708, "grad_norm": 0.22426983024369054, "learning_rate": 1.46556948501852e-06, "loss": 0.0142, "step": 8669 }, { "epoch": 2.325020112630732, "grad_norm": 0.2409633800842328, "learning_rate": 1.4644660940672628e-06, "loss": 0.0148, "step": 8670 }, { "epoch": 2.3252882810404936, "grad_norm": 0.20695553915390003, "learning_rate": 1.4633630473615557e-06, "loss": 0.0112, "step": 8671 }, { "epoch": 2.3255564494502545, "grad_norm": 0.2786438906176772, "learning_rate": 1.4622603450088007e-06, "loss": 0.0155, "step": 8672 }, { "epoch": 2.325824617860016, "grad_norm": 0.22704071141338902, "learning_rate": 1.4611579871163616e-06, "loss": 0.0102, "step": 8673 }, { "epoch": 2.3260927862697773, "grad_norm": 0.27249013841703296, "learning_rate": 1.4600559737915754e-06, "loss": 0.0147, "step": 8674 }, { "epoch": 2.3263609546795387, "grad_norm": 0.53912171181824, "learning_rate": 1.4589543051417394e-06, "loss": 0.0189, "step": 8675 }, { "epoch": 2.3266291230893, "grad_norm": 0.21709447295229015, "learning_rate": 1.4578529812741226e-06, "loss": 0.0134, "step": 8676 }, { "epoch": 2.3268972914990615, "grad_norm": 0.3740660350860136, "learning_rate": 1.4567520022959575e-06, "loss": 0.0195, "step": 8677 }, { "epoch": 2.327165459908823, "grad_norm": 0.23147447070010047, "learning_rate": 1.4556513683144403e-06, "loss": 0.0139, "step": 8678 }, { "epoch": 2.327433628318584, "grad_norm": 0.2503128661520003, "learning_rate": 1.4545510794367413e-06, "loss": 0.0145, "step": 8679 }, { "epoch": 2.3277017967283453, "grad_norm": 0.2210827532651394, "learning_rate": 1.4534511357699894e-06, "loss": 0.0137, "step": 8680 }, { "epoch": 2.3279699651381067, "grad_norm": 0.2590759723543358, "learning_rate": 1.4523515374212842e-06, "loss": 0.0122, "step": 8681 }, { "epoch": 2.328238133547868, "grad_norm": 0.196989088260634, "learning_rate": 1.4512522844976927e-06, "loss": 0.0078, "step": 8682 }, { "epoch": 2.3285063019576295, "grad_norm": 0.22556322722783412, "learning_rate": 1.4501533771062426e-06, "loss": 0.0126, "step": 8683 }, { "epoch": 2.3287744703673905, "grad_norm": 0.2604605854809054, "learning_rate": 1.4490548153539358e-06, "loss": 0.0129, "step": 8684 }, { "epoch": 2.329042638777152, "grad_norm": 0.45427552336200566, "learning_rate": 1.4479565993477317e-06, "loss": 0.0153, "step": 8685 }, { "epoch": 2.3293108071869133, "grad_norm": 0.2349043096559891, "learning_rate": 1.4468587291945651e-06, "loss": 0.0148, "step": 8686 }, { "epoch": 2.3295789755966747, "grad_norm": 0.26467804452173194, "learning_rate": 1.4457612050013288e-06, "loss": 0.0165, "step": 8687 }, { "epoch": 2.329847144006436, "grad_norm": 0.28971480344462647, "learning_rate": 1.4446640268748875e-06, "loss": 0.0169, "step": 8688 }, { "epoch": 2.3301153124161975, "grad_norm": 0.30258972850569443, "learning_rate": 1.4435671949220742e-06, "loss": 0.0165, "step": 8689 }, { "epoch": 2.330383480825959, "grad_norm": 0.1887149669584492, "learning_rate": 1.4424707092496777e-06, "loss": 0.0101, "step": 8690 }, { "epoch": 2.33065164923572, "grad_norm": 0.1861058472565932, "learning_rate": 1.4413745699644633e-06, "loss": 0.0096, "step": 8691 }, { "epoch": 2.3309198176454813, "grad_norm": 0.2149779785766391, "learning_rate": 1.4402787771731602e-06, "loss": 0.0112, "step": 8692 }, { "epoch": 2.3311879860552427, "grad_norm": 0.29153411132394547, "learning_rate": 1.4391833309824604e-06, "loss": 0.0154, "step": 8693 }, { "epoch": 2.331456154465004, "grad_norm": 0.3105750627854313, "learning_rate": 1.4380882314990274e-06, "loss": 0.0116, "step": 8694 }, { "epoch": 2.3317243228747655, "grad_norm": 0.22455057006404402, "learning_rate": 1.4369934788294854e-06, "loss": 0.0105, "step": 8695 }, { "epoch": 2.3319924912845265, "grad_norm": 0.26440743961439617, "learning_rate": 1.43589907308043e-06, "loss": 0.0195, "step": 8696 }, { "epoch": 2.332260659694288, "grad_norm": 0.22502744550643716, "learning_rate": 1.4348050143584175e-06, "loss": 0.0157, "step": 8697 }, { "epoch": 2.3325288281040493, "grad_norm": 0.23474988374050096, "learning_rate": 1.4337113027699757e-06, "loss": 0.0136, "step": 8698 }, { "epoch": 2.3327969965138107, "grad_norm": 0.2563789742052498, "learning_rate": 1.4326179384215976e-06, "loss": 0.0152, "step": 8699 }, { "epoch": 2.333065164923572, "grad_norm": 0.29205139396130114, "learning_rate": 1.431524921419738e-06, "loss": 0.019, "step": 8700 }, { "epoch": 2.3333333333333335, "grad_norm": 0.3274340907847781, "learning_rate": 1.4304322518708258e-06, "loss": 0.0222, "step": 8701 }, { "epoch": 2.333601501743095, "grad_norm": 0.19762588085895882, "learning_rate": 1.429339929881245e-06, "loss": 0.0145, "step": 8702 }, { "epoch": 2.333869670152856, "grad_norm": 0.22875533841975274, "learning_rate": 1.4282479555573559e-06, "loss": 0.011, "step": 8703 }, { "epoch": 2.3341378385626173, "grad_norm": 0.24973670746102064, "learning_rate": 1.4271563290054813e-06, "loss": 0.0185, "step": 8704 }, { "epoch": 2.3344060069723787, "grad_norm": 0.24010374259600617, "learning_rate": 1.4260650503319078e-06, "loss": 0.0149, "step": 8705 }, { "epoch": 2.33467417538214, "grad_norm": 0.2249552374506792, "learning_rate": 1.4249741196428935e-06, "loss": 0.016, "step": 8706 }, { "epoch": 2.3349423437919015, "grad_norm": 0.2938645353662435, "learning_rate": 1.4238835370446564e-06, "loss": 0.024, "step": 8707 }, { "epoch": 2.3352105122016624, "grad_norm": 0.21096546303225347, "learning_rate": 1.4227933026433844e-06, "loss": 0.0108, "step": 8708 }, { "epoch": 2.335478680611424, "grad_norm": 0.24438435878892928, "learning_rate": 1.4217034165452332e-06, "loss": 0.0121, "step": 8709 }, { "epoch": 2.3357468490211852, "grad_norm": 0.21412421639257484, "learning_rate": 1.4206138788563185e-06, "loss": 0.0116, "step": 8710 }, { "epoch": 2.3360150174309466, "grad_norm": 0.24530523152328118, "learning_rate": 1.4195246896827287e-06, "loss": 0.0212, "step": 8711 }, { "epoch": 2.336283185840708, "grad_norm": 0.18449482618449511, "learning_rate": 1.4184358491305123e-06, "loss": 0.0098, "step": 8712 }, { "epoch": 2.3365513542504694, "grad_norm": 0.24340499543134247, "learning_rate": 1.4173473573056894e-06, "loss": 0.0141, "step": 8713 }, { "epoch": 2.336819522660231, "grad_norm": 0.15628529029745822, "learning_rate": 1.4162592143142435e-06, "loss": 0.0069, "step": 8714 }, { "epoch": 2.337087691069992, "grad_norm": 0.22775841085893547, "learning_rate": 1.4151714202621214e-06, "loss": 0.0131, "step": 8715 }, { "epoch": 2.337355859479753, "grad_norm": 0.20057119975694843, "learning_rate": 1.414083975255242e-06, "loss": 0.0113, "step": 8716 }, { "epoch": 2.3376240278895146, "grad_norm": 0.20190872508930574, "learning_rate": 1.412996879399484e-06, "loss": 0.0126, "step": 8717 }, { "epoch": 2.337892196299276, "grad_norm": 0.2661059827505163, "learning_rate": 1.4119101328006984e-06, "loss": 0.0164, "step": 8718 }, { "epoch": 2.3381603647090374, "grad_norm": 0.26761286583020666, "learning_rate": 1.4108237355646954e-06, "loss": 0.0129, "step": 8719 }, { "epoch": 2.3384285331187984, "grad_norm": 0.26475320826499826, "learning_rate": 1.4097376877972556e-06, "loss": 0.0134, "step": 8720 }, { "epoch": 2.33869670152856, "grad_norm": 0.24252686662367595, "learning_rate": 1.4086519896041274e-06, "loss": 0.0159, "step": 8721 }, { "epoch": 2.338964869938321, "grad_norm": 0.23551756950743247, "learning_rate": 1.4075666410910178e-06, "loss": 0.0103, "step": 8722 }, { "epoch": 2.3392330383480826, "grad_norm": 0.22981124267325467, "learning_rate": 1.4064816423636091e-06, "loss": 0.0135, "step": 8723 }, { "epoch": 2.339501206757844, "grad_norm": 0.24957913105906604, "learning_rate": 1.4053969935275403e-06, "loss": 0.0149, "step": 8724 }, { "epoch": 2.3397693751676054, "grad_norm": 0.3834226660751616, "learning_rate": 1.4043126946884238e-06, "loss": 0.0142, "step": 8725 }, { "epoch": 2.340037543577367, "grad_norm": 0.2233007333266325, "learning_rate": 1.4032287459518346e-06, "loss": 0.0133, "step": 8726 }, { "epoch": 2.340305711987128, "grad_norm": 0.21591740359232955, "learning_rate": 1.4021451474233111e-06, "loss": 0.0109, "step": 8727 }, { "epoch": 2.340573880396889, "grad_norm": 0.22120398769901672, "learning_rate": 1.4010618992083646e-06, "loss": 0.0149, "step": 8728 }, { "epoch": 2.3408420488066506, "grad_norm": 0.22924108163063697, "learning_rate": 1.3999790014124636e-06, "loss": 0.0089, "step": 8729 }, { "epoch": 2.341110217216412, "grad_norm": 0.23076648069175332, "learning_rate": 1.39889645414105e-06, "loss": 0.0125, "step": 8730 }, { "epoch": 2.3413783856261734, "grad_norm": 0.20233388140574485, "learning_rate": 1.3978142574995296e-06, "loss": 0.0127, "step": 8731 }, { "epoch": 2.3416465540359344, "grad_norm": 0.21478124204734136, "learning_rate": 1.3967324115932696e-06, "loss": 0.0096, "step": 8732 }, { "epoch": 2.3419147224456958, "grad_norm": 0.37381161905392574, "learning_rate": 1.3956509165276095e-06, "loss": 0.0157, "step": 8733 }, { "epoch": 2.342182890855457, "grad_norm": 0.21164052749809473, "learning_rate": 1.394569772407849e-06, "loss": 0.0084, "step": 8734 }, { "epoch": 2.3424510592652186, "grad_norm": 0.2580662464620983, "learning_rate": 1.3934889793392575e-06, "loss": 0.0139, "step": 8735 }, { "epoch": 2.34271922767498, "grad_norm": 0.4764464248250427, "learning_rate": 1.3924085374270701e-06, "loss": 0.0232, "step": 8736 }, { "epoch": 2.3429873960847414, "grad_norm": 0.21738121815564201, "learning_rate": 1.3913284467764859e-06, "loss": 0.0134, "step": 8737 }, { "epoch": 2.343255564494503, "grad_norm": 0.24962066159399496, "learning_rate": 1.3902487074926702e-06, "loss": 0.0135, "step": 8738 }, { "epoch": 2.3435237329042637, "grad_norm": 0.24091110165201918, "learning_rate": 1.389169319680752e-06, "loss": 0.0157, "step": 8739 }, { "epoch": 2.343791901314025, "grad_norm": 0.3273392600999916, "learning_rate": 1.388090283445831e-06, "loss": 0.016, "step": 8740 }, { "epoch": 2.3440600697237866, "grad_norm": 0.20532480895440475, "learning_rate": 1.387011598892971e-06, "loss": 0.0121, "step": 8741 }, { "epoch": 2.344328238133548, "grad_norm": 0.25469043473168734, "learning_rate": 1.385933266127198e-06, "loss": 0.0139, "step": 8742 }, { "epoch": 2.3445964065433094, "grad_norm": 0.24838213309250026, "learning_rate": 1.384855285253509e-06, "loss": 0.0201, "step": 8743 }, { "epoch": 2.3448645749530703, "grad_norm": 0.3622427493343319, "learning_rate": 1.3837776563768613e-06, "loss": 0.0137, "step": 8744 }, { "epoch": 2.3451327433628317, "grad_norm": 0.23862192449280178, "learning_rate": 1.3827003796021838e-06, "loss": 0.0075, "step": 8745 }, { "epoch": 2.345400911772593, "grad_norm": 0.22864217964395311, "learning_rate": 1.3816234550343655e-06, "loss": 0.011, "step": 8746 }, { "epoch": 2.3456690801823545, "grad_norm": 0.2614900229516161, "learning_rate": 1.3805468827782642e-06, "loss": 0.0149, "step": 8747 }, { "epoch": 2.345937248592116, "grad_norm": 0.28310098599203615, "learning_rate": 1.3794706629387056e-06, "loss": 0.0196, "step": 8748 }, { "epoch": 2.3462054170018773, "grad_norm": 0.2840282487576676, "learning_rate": 1.3783947956204756e-06, "loss": 0.0139, "step": 8749 }, { "epoch": 2.3464735854116388, "grad_norm": 0.29103374726116016, "learning_rate": 1.3773192809283282e-06, "loss": 0.0143, "step": 8750 }, { "epoch": 2.3467417538213997, "grad_norm": 0.25237074371255347, "learning_rate": 1.3762441189669855e-06, "loss": 0.014, "step": 8751 }, { "epoch": 2.347009922231161, "grad_norm": 0.2563584933085516, "learning_rate": 1.3751693098411312e-06, "loss": 0.0134, "step": 8752 }, { "epoch": 2.3472780906409225, "grad_norm": 0.2812388417712608, "learning_rate": 1.3740948536554182e-06, "loss": 0.0164, "step": 8753 }, { "epoch": 2.347546259050684, "grad_norm": 0.2543324919796652, "learning_rate": 1.3730207505144617e-06, "loss": 0.0133, "step": 8754 }, { "epoch": 2.3478144274604453, "grad_norm": 0.24450088550378657, "learning_rate": 1.3719470005228463e-06, "loss": 0.0152, "step": 8755 }, { "epoch": 2.3480825958702063, "grad_norm": 0.24091163965857523, "learning_rate": 1.3708736037851177e-06, "loss": 0.0131, "step": 8756 }, { "epoch": 2.3483507642799677, "grad_norm": 0.26512613416436936, "learning_rate": 1.3698005604057918e-06, "loss": 0.0135, "step": 8757 }, { "epoch": 2.348618932689729, "grad_norm": 0.2392380160697273, "learning_rate": 1.368727870489348e-06, "loss": 0.0106, "step": 8758 }, { "epoch": 2.3488871010994905, "grad_norm": 0.3020065200032434, "learning_rate": 1.3676555341402298e-06, "loss": 0.0147, "step": 8759 }, { "epoch": 2.349155269509252, "grad_norm": 0.19561372886869396, "learning_rate": 1.36658355146285e-06, "loss": 0.0096, "step": 8760 }, { "epoch": 2.3494234379190133, "grad_norm": 0.2249178552632823, "learning_rate": 1.3655119225615832e-06, "loss": 0.0122, "step": 8761 }, { "epoch": 2.3496916063287747, "grad_norm": 0.23630131147452707, "learning_rate": 1.3644406475407689e-06, "loss": 0.0115, "step": 8762 }, { "epoch": 2.3499597747385357, "grad_norm": 0.17393779661894931, "learning_rate": 1.363369726504719e-06, "loss": 0.0091, "step": 8763 }, { "epoch": 2.350227943148297, "grad_norm": 0.2834533902175608, "learning_rate": 1.3622991595577018e-06, "loss": 0.011, "step": 8764 }, { "epoch": 2.3504961115580585, "grad_norm": 0.23481106304763782, "learning_rate": 1.3612289468039592e-06, "loss": 0.0134, "step": 8765 }, { "epoch": 2.35076427996782, "grad_norm": 0.31338252159442564, "learning_rate": 1.3601590883476917e-06, "loss": 0.0226, "step": 8766 }, { "epoch": 2.3510324483775813, "grad_norm": 0.32927384652862585, "learning_rate": 1.3590895842930702e-06, "loss": 0.0138, "step": 8767 }, { "epoch": 2.3513006167873423, "grad_norm": 0.2919477831314323, "learning_rate": 1.3580204347442304e-06, "loss": 0.018, "step": 8768 }, { "epoch": 2.3515687851971037, "grad_norm": 0.24778176043977437, "learning_rate": 1.3569516398052707e-06, "loss": 0.0133, "step": 8769 }, { "epoch": 2.351836953606865, "grad_norm": 0.2753183559944742, "learning_rate": 1.3558831995802597e-06, "loss": 0.0156, "step": 8770 }, { "epoch": 2.3521051220166265, "grad_norm": 0.22321299850041598, "learning_rate": 1.354815114173224e-06, "loss": 0.0122, "step": 8771 }, { "epoch": 2.352373290426388, "grad_norm": 0.2744825607349537, "learning_rate": 1.3537473836881655e-06, "loss": 0.0182, "step": 8772 }, { "epoch": 2.3526414588361493, "grad_norm": 0.2550076113894432, "learning_rate": 1.3526800082290425e-06, "loss": 0.0167, "step": 8773 }, { "epoch": 2.3529096272459102, "grad_norm": 0.27622058249258497, "learning_rate": 1.351612987899783e-06, "loss": 0.0143, "step": 8774 }, { "epoch": 2.3531777956556716, "grad_norm": 0.28347254876238986, "learning_rate": 1.3505463228042814e-06, "loss": 0.0187, "step": 8775 }, { "epoch": 2.353445964065433, "grad_norm": 0.2851610549181346, "learning_rate": 1.3494800130463943e-06, "loss": 0.0144, "step": 8776 }, { "epoch": 2.3537141324751945, "grad_norm": 0.306055209688671, "learning_rate": 1.348414058729946e-06, "loss": 0.0116, "step": 8777 }, { "epoch": 2.353982300884956, "grad_norm": 0.26858599344716333, "learning_rate": 1.3473484599587277e-06, "loss": 0.0161, "step": 8778 }, { "epoch": 2.3542504692947173, "grad_norm": 0.7184244746239631, "learning_rate": 1.3462832168364908e-06, "loss": 0.0169, "step": 8779 }, { "epoch": 2.3545186377044782, "grad_norm": 0.26973482776098645, "learning_rate": 1.3452183294669586e-06, "loss": 0.0156, "step": 8780 }, { "epoch": 2.3547868061142396, "grad_norm": 0.2332269645852738, "learning_rate": 1.3441537979538122e-06, "loss": 0.0148, "step": 8781 }, { "epoch": 2.355054974524001, "grad_norm": 0.2457826690013042, "learning_rate": 1.3430896224007062e-06, "loss": 0.0127, "step": 8782 }, { "epoch": 2.3553231429337624, "grad_norm": 0.2023712817341855, "learning_rate": 1.342025802911253e-06, "loss": 0.0128, "step": 8783 }, { "epoch": 2.355591311343524, "grad_norm": 0.29414674598457763, "learning_rate": 1.3409623395890358e-06, "loss": 0.0196, "step": 8784 }, { "epoch": 2.3558594797532852, "grad_norm": 0.24553793483254582, "learning_rate": 1.3398992325376042e-06, "loss": 0.0124, "step": 8785 }, { "epoch": 2.356127648163046, "grad_norm": 0.2736543228878304, "learning_rate": 1.3388364818604638e-06, "loss": 0.0205, "step": 8786 }, { "epoch": 2.3563958165728076, "grad_norm": 0.24233062247216783, "learning_rate": 1.337774087661095e-06, "loss": 0.0158, "step": 8787 }, { "epoch": 2.356663984982569, "grad_norm": 0.22995513994594335, "learning_rate": 1.336712050042942e-06, "loss": 0.0138, "step": 8788 }, { "epoch": 2.3569321533923304, "grad_norm": 0.3001133876697425, "learning_rate": 1.3356503691094092e-06, "loss": 0.0191, "step": 8789 }, { "epoch": 2.357200321802092, "grad_norm": 0.23956923387351903, "learning_rate": 1.3345890449638726e-06, "loss": 0.0142, "step": 8790 }, { "epoch": 2.3574684902118532, "grad_norm": 0.29043022241683497, "learning_rate": 1.3335280777096676e-06, "loss": 0.0191, "step": 8791 }, { "epoch": 2.357736658621614, "grad_norm": 0.25988682334278884, "learning_rate": 1.332467467450101e-06, "loss": 0.0129, "step": 8792 }, { "epoch": 2.3580048270313756, "grad_norm": 0.22658708464542276, "learning_rate": 1.3314072142884388e-06, "loss": 0.0122, "step": 8793 }, { "epoch": 2.358272995441137, "grad_norm": 0.2797839439996834, "learning_rate": 1.3303473183279158e-06, "loss": 0.0163, "step": 8794 }, { "epoch": 2.3585411638508984, "grad_norm": 0.24922688482613706, "learning_rate": 1.3292877796717335e-06, "loss": 0.0124, "step": 8795 }, { "epoch": 2.35880933226066, "grad_norm": 0.2962999612129481, "learning_rate": 1.3282285984230532e-06, "loss": 0.0201, "step": 8796 }, { "epoch": 2.359077500670421, "grad_norm": 0.3000251792134309, "learning_rate": 1.3271697746850066e-06, "loss": 0.0159, "step": 8797 }, { "epoch": 2.359345669080182, "grad_norm": 0.19414631907929364, "learning_rate": 1.3261113085606886e-06, "loss": 0.0132, "step": 8798 }, { "epoch": 2.3596138374899436, "grad_norm": 0.2034764083800131, "learning_rate": 1.3250532001531568e-06, "loss": 0.0107, "step": 8799 }, { "epoch": 2.359882005899705, "grad_norm": 0.290166773619518, "learning_rate": 1.3239954495654394e-06, "loss": 0.0141, "step": 8800 }, { "epoch": 2.3601501743094664, "grad_norm": 0.2962176899656806, "learning_rate": 1.322938056900524e-06, "loss": 0.0225, "step": 8801 }, { "epoch": 2.360418342719228, "grad_norm": 0.271721296400545, "learning_rate": 1.3218810222613694e-06, "loss": 0.0131, "step": 8802 }, { "epoch": 2.360686511128989, "grad_norm": 0.2017296912438486, "learning_rate": 1.3208243457508928e-06, "loss": 0.0104, "step": 8803 }, { "epoch": 2.36095467953875, "grad_norm": 0.33463941482740056, "learning_rate": 1.3197680274719816e-06, "loss": 0.024, "step": 8804 }, { "epoch": 2.3612228479485116, "grad_norm": 0.2824241849845443, "learning_rate": 1.3187120675274883e-06, "loss": 0.0142, "step": 8805 }, { "epoch": 2.361491016358273, "grad_norm": 0.26892092478512397, "learning_rate": 1.3176564660202256e-06, "loss": 0.0124, "step": 8806 }, { "epoch": 2.3617591847680344, "grad_norm": 0.34652967835942994, "learning_rate": 1.3166012230529784e-06, "loss": 0.0224, "step": 8807 }, { "epoch": 2.3620273531777958, "grad_norm": 0.2029031056031269, "learning_rate": 1.3155463387284889e-06, "loss": 0.0125, "step": 8808 }, { "epoch": 2.362295521587557, "grad_norm": 0.2623270919671144, "learning_rate": 1.3144918131494722e-06, "loss": 0.0133, "step": 8809 }, { "epoch": 2.362563689997318, "grad_norm": 0.22182874828442717, "learning_rate": 1.3134376464186028e-06, "loss": 0.0096, "step": 8810 }, { "epoch": 2.3628318584070795, "grad_norm": 0.2190420058664251, "learning_rate": 1.31238383863852e-06, "loss": 0.0173, "step": 8811 }, { "epoch": 2.363100026816841, "grad_norm": 0.29008216219437244, "learning_rate": 1.3113303899118346e-06, "loss": 0.0155, "step": 8812 }, { "epoch": 2.3633681952266024, "grad_norm": 0.22984360179359528, "learning_rate": 1.310277300341114e-06, "loss": 0.0135, "step": 8813 }, { "epoch": 2.3636363636363638, "grad_norm": 0.20017662413180887, "learning_rate": 1.3092245700288985e-06, "loss": 0.0131, "step": 8814 }, { "epoch": 2.363904532046125, "grad_norm": 0.265157379183699, "learning_rate": 1.3081721990776863e-06, "loss": 0.0122, "step": 8815 }, { "epoch": 2.364172700455886, "grad_norm": 0.3100513096766328, "learning_rate": 1.3071201875899453e-06, "loss": 0.0157, "step": 8816 }, { "epoch": 2.3644408688656475, "grad_norm": 0.2659911663172562, "learning_rate": 1.3060685356681084e-06, "loss": 0.0142, "step": 8817 }, { "epoch": 2.364709037275409, "grad_norm": 0.2160980153538925, "learning_rate": 1.3050172434145697e-06, "loss": 0.0134, "step": 8818 }, { "epoch": 2.3649772056851703, "grad_norm": 0.255943968011093, "learning_rate": 1.3039663109316936e-06, "loss": 0.0128, "step": 8819 }, { "epoch": 2.3652453740949317, "grad_norm": 0.33857607067218165, "learning_rate": 1.3029157383218039e-06, "loss": 0.0156, "step": 8820 }, { "epoch": 2.3655135425046927, "grad_norm": 0.21608554484041517, "learning_rate": 1.3018655256871925e-06, "loss": 0.012, "step": 8821 }, { "epoch": 2.365781710914454, "grad_norm": 0.3184395345812074, "learning_rate": 1.3008156731301202e-06, "loss": 0.0157, "step": 8822 }, { "epoch": 2.3660498793242155, "grad_norm": 0.25402450737454285, "learning_rate": 1.2997661807528011e-06, "loss": 0.0101, "step": 8823 }, { "epoch": 2.366318047733977, "grad_norm": 0.19659072128945074, "learning_rate": 1.298717048657427e-06, "loss": 0.0104, "step": 8824 }, { "epoch": 2.3665862161437383, "grad_norm": 0.2158454993872614, "learning_rate": 1.2976682769461451e-06, "loss": 0.0105, "step": 8825 }, { "epoch": 2.3668543845534997, "grad_norm": 0.2389747279511932, "learning_rate": 1.2966198657210738e-06, "loss": 0.0137, "step": 8826 }, { "epoch": 2.367122552963261, "grad_norm": 0.2700955864688488, "learning_rate": 1.2955718150842955e-06, "loss": 0.0136, "step": 8827 }, { "epoch": 2.367390721373022, "grad_norm": 0.2336387971828409, "learning_rate": 1.2945241251378531e-06, "loss": 0.014, "step": 8828 }, { "epoch": 2.3676588897827835, "grad_norm": 0.2981731815514084, "learning_rate": 1.2934767959837601e-06, "loss": 0.019, "step": 8829 }, { "epoch": 2.367927058192545, "grad_norm": 0.23071617343697035, "learning_rate": 1.2924298277239893e-06, "loss": 0.0129, "step": 8830 }, { "epoch": 2.3681952266023063, "grad_norm": 0.30180035200700134, "learning_rate": 1.2913832204604843e-06, "loss": 0.0151, "step": 8831 }, { "epoch": 2.3684633950120677, "grad_norm": 0.19731641864175456, "learning_rate": 1.2903369742951476e-06, "loss": 0.0113, "step": 8832 }, { "epoch": 2.3687315634218287, "grad_norm": 0.23816816914990033, "learning_rate": 1.2892910893298506e-06, "loss": 0.0136, "step": 8833 }, { "epoch": 2.36899973183159, "grad_norm": 0.2713304562654815, "learning_rate": 1.2882455656664316e-06, "loss": 0.0131, "step": 8834 }, { "epoch": 2.3692679002413515, "grad_norm": 0.2696318193781521, "learning_rate": 1.2872004034066843e-06, "loss": 0.0177, "step": 8835 }, { "epoch": 2.369536068651113, "grad_norm": 0.5978011517919262, "learning_rate": 1.2861556026523765e-06, "loss": 0.0295, "step": 8836 }, { "epoch": 2.3698042370608743, "grad_norm": 0.24738344049179034, "learning_rate": 1.2851111635052388e-06, "loss": 0.0156, "step": 8837 }, { "epoch": 2.3700724054706357, "grad_norm": 0.21850065252202885, "learning_rate": 1.284067086066963e-06, "loss": 0.0099, "step": 8838 }, { "epoch": 2.370340573880397, "grad_norm": 0.23101054793043033, "learning_rate": 1.2830233704392108e-06, "loss": 0.0091, "step": 8839 }, { "epoch": 2.370608742290158, "grad_norm": 0.2907302675671745, "learning_rate": 1.2819800167236034e-06, "loss": 0.0156, "step": 8840 }, { "epoch": 2.3708769106999195, "grad_norm": 0.21538506819431902, "learning_rate": 1.2809370250217324e-06, "loss": 0.012, "step": 8841 }, { "epoch": 2.371145079109681, "grad_norm": 0.20854084638980305, "learning_rate": 1.2798943954351472e-06, "loss": 0.0121, "step": 8842 }, { "epoch": 2.3714132475194423, "grad_norm": 0.24099820408389627, "learning_rate": 1.2788521280653682e-06, "loss": 0.0095, "step": 8843 }, { "epoch": 2.3716814159292037, "grad_norm": 0.1694392024663969, "learning_rate": 1.2778102230138795e-06, "loss": 0.0085, "step": 8844 }, { "epoch": 2.3719495843389646, "grad_norm": 0.2509983520713743, "learning_rate": 1.276768680382126e-06, "loss": 0.0136, "step": 8845 }, { "epoch": 2.372217752748726, "grad_norm": 0.2690816501472869, "learning_rate": 1.2757275002715219e-06, "loss": 0.0154, "step": 8846 }, { "epoch": 2.3724859211584874, "grad_norm": 0.2233873316102051, "learning_rate": 1.2746866827834443e-06, "loss": 0.01, "step": 8847 }, { "epoch": 2.372754089568249, "grad_norm": 0.2623754794963019, "learning_rate": 1.2736462280192318e-06, "loss": 0.0148, "step": 8848 }, { "epoch": 2.3730222579780103, "grad_norm": 0.2960986759332379, "learning_rate": 1.2726061360801945e-06, "loss": 0.0198, "step": 8849 }, { "epoch": 2.3732904263877717, "grad_norm": 0.24959970144102056, "learning_rate": 1.2715664070675998e-06, "loss": 0.0162, "step": 8850 }, { "epoch": 2.373558594797533, "grad_norm": 0.22214111939214695, "learning_rate": 1.270527041082687e-06, "loss": 0.014, "step": 8851 }, { "epoch": 2.373826763207294, "grad_norm": 0.23426650578312616, "learning_rate": 1.2694880382266534e-06, "loss": 0.0117, "step": 8852 }, { "epoch": 2.3740949316170554, "grad_norm": 0.33907043249678986, "learning_rate": 1.2684493986006646e-06, "loss": 0.019, "step": 8853 }, { "epoch": 2.374363100026817, "grad_norm": 0.24163505567158972, "learning_rate": 1.2674111223058526e-06, "loss": 0.011, "step": 8854 }, { "epoch": 2.3746312684365782, "grad_norm": 0.25438166133148776, "learning_rate": 1.2663732094433084e-06, "loss": 0.0143, "step": 8855 }, { "epoch": 2.3748994368463396, "grad_norm": 0.26791961151219207, "learning_rate": 1.2653356601140936e-06, "loss": 0.0234, "step": 8856 }, { "epoch": 2.3751676052561006, "grad_norm": 0.290136182835645, "learning_rate": 1.2642984744192289e-06, "loss": 0.0139, "step": 8857 }, { "epoch": 2.375435773665862, "grad_norm": 0.2653624783908322, "learning_rate": 1.2632616524597052e-06, "loss": 0.0128, "step": 8858 }, { "epoch": 2.3757039420756234, "grad_norm": 0.22840591599123197, "learning_rate": 1.2622251943364733e-06, "loss": 0.0117, "step": 8859 }, { "epoch": 2.375972110485385, "grad_norm": 0.2407417968247365, "learning_rate": 1.2611891001504489e-06, "loss": 0.0118, "step": 8860 }, { "epoch": 2.376240278895146, "grad_norm": 0.26844069939663145, "learning_rate": 1.2601533700025177e-06, "loss": 0.0159, "step": 8861 }, { "epoch": 2.3765084473049076, "grad_norm": 0.2932895122780787, "learning_rate": 1.259118003993522e-06, "loss": 0.0161, "step": 8862 }, { "epoch": 2.376776615714669, "grad_norm": 0.2978579334284393, "learning_rate": 1.258083002224274e-06, "loss": 0.0136, "step": 8863 }, { "epoch": 2.37704478412443, "grad_norm": 0.24840312389348312, "learning_rate": 1.2570483647955512e-06, "loss": 0.0103, "step": 8864 }, { "epoch": 2.3773129525341914, "grad_norm": 0.23327446557141393, "learning_rate": 1.2560140918080905e-06, "loss": 0.0149, "step": 8865 }, { "epoch": 2.377581120943953, "grad_norm": 0.2182026620305688, "learning_rate": 1.2549801833625986e-06, "loss": 0.0092, "step": 8866 }, { "epoch": 2.377849289353714, "grad_norm": 0.2452548195302081, "learning_rate": 1.253946639559742e-06, "loss": 0.0141, "step": 8867 }, { "epoch": 2.3781174577634756, "grad_norm": 0.25182169487687767, "learning_rate": 1.2529134605001575e-06, "loss": 0.0215, "step": 8868 }, { "epoch": 2.3783856261732366, "grad_norm": 0.23953267623160315, "learning_rate": 1.251880646284439e-06, "loss": 0.0149, "step": 8869 }, { "epoch": 2.378653794582998, "grad_norm": 0.2866394907189888, "learning_rate": 1.2508481970131525e-06, "loss": 0.0169, "step": 8870 }, { "epoch": 2.3789219629927594, "grad_norm": 0.30830857756430546, "learning_rate": 1.2498161127868236e-06, "loss": 0.0181, "step": 8871 }, { "epoch": 2.379190131402521, "grad_norm": 0.2159241675216704, "learning_rate": 1.2487843937059419e-06, "loss": 0.011, "step": 8872 }, { "epoch": 2.379458299812282, "grad_norm": 0.33188015865712117, "learning_rate": 1.2477530398709637e-06, "loss": 0.0189, "step": 8873 }, { "epoch": 2.3797264682220436, "grad_norm": 0.2307720202914826, "learning_rate": 1.246722051382312e-06, "loss": 0.0147, "step": 8874 }, { "epoch": 2.379994636631805, "grad_norm": 0.25960144181444483, "learning_rate": 1.2456914283403681e-06, "loss": 0.0114, "step": 8875 }, { "epoch": 2.380262805041566, "grad_norm": 0.2567508372623785, "learning_rate": 1.2446611708454836e-06, "loss": 0.0128, "step": 8876 }, { "epoch": 2.3805309734513274, "grad_norm": 0.27431585564137334, "learning_rate": 1.2436312789979687e-06, "loss": 0.0174, "step": 8877 }, { "epoch": 2.3807991418610888, "grad_norm": 0.29117160031955036, "learning_rate": 1.2426017528981054e-06, "loss": 0.0158, "step": 8878 }, { "epoch": 2.38106731027085, "grad_norm": 0.22451579205646643, "learning_rate": 1.241572592646132e-06, "loss": 0.0122, "step": 8879 }, { "epoch": 2.3813354786806116, "grad_norm": 0.3350332403248255, "learning_rate": 1.2405437983422569e-06, "loss": 0.0152, "step": 8880 }, { "epoch": 2.3816036470903725, "grad_norm": 0.2547375728447541, "learning_rate": 1.239515370086652e-06, "loss": 0.012, "step": 8881 }, { "epoch": 2.381871815500134, "grad_norm": 0.21113936705633257, "learning_rate": 1.2384873079794518e-06, "loss": 0.0105, "step": 8882 }, { "epoch": 2.3821399839098953, "grad_norm": 0.23284560499567805, "learning_rate": 1.237459612120755e-06, "loss": 0.0141, "step": 8883 }, { "epoch": 2.3824081523196567, "grad_norm": 0.22330282190267034, "learning_rate": 1.2364322826106273e-06, "loss": 0.0127, "step": 8884 }, { "epoch": 2.382676320729418, "grad_norm": 0.22630918655180907, "learning_rate": 1.2354053195490944e-06, "loss": 0.0105, "step": 8885 }, { "epoch": 2.3829444891391796, "grad_norm": 0.21455447557134635, "learning_rate": 1.2343787230361526e-06, "loss": 0.0099, "step": 8886 }, { "epoch": 2.383212657548941, "grad_norm": 0.35751760158608464, "learning_rate": 1.2333524931717555e-06, "loss": 0.0173, "step": 8887 }, { "epoch": 2.383480825958702, "grad_norm": 0.2994492926919677, "learning_rate": 1.2323266300558274e-06, "loss": 0.0104, "step": 8888 }, { "epoch": 2.3837489943684633, "grad_norm": 0.26217533467522347, "learning_rate": 1.2313011337882507e-06, "loss": 0.015, "step": 8889 }, { "epoch": 2.3840171627782247, "grad_norm": 0.40953940755675033, "learning_rate": 1.230276004468877e-06, "loss": 0.0153, "step": 8890 }, { "epoch": 2.384285331187986, "grad_norm": 0.29969263474363145, "learning_rate": 1.2292512421975221e-06, "loss": 0.0165, "step": 8891 }, { "epoch": 2.3845534995977475, "grad_norm": 0.49915587275499396, "learning_rate": 1.228226847073961e-06, "loss": 0.0171, "step": 8892 }, { "epoch": 2.3848216680075085, "grad_norm": 0.23819661281975443, "learning_rate": 1.2272028191979391e-06, "loss": 0.0141, "step": 8893 }, { "epoch": 2.38508983641727, "grad_norm": 0.2981933504414854, "learning_rate": 1.2261791586691624e-06, "loss": 0.0155, "step": 8894 }, { "epoch": 2.3853580048270313, "grad_norm": 0.231977298884141, "learning_rate": 1.2251558655873003e-06, "loss": 0.0112, "step": 8895 }, { "epoch": 2.3856261732367927, "grad_norm": 0.4240465431319797, "learning_rate": 1.2241329400519908e-06, "loss": 0.0146, "step": 8896 }, { "epoch": 2.385894341646554, "grad_norm": 0.2734949785986456, "learning_rate": 1.223110382162831e-06, "loss": 0.0168, "step": 8897 }, { "epoch": 2.3861625100563155, "grad_norm": 0.288096114709897, "learning_rate": 1.2220881920193867e-06, "loss": 0.015, "step": 8898 }, { "epoch": 2.386430678466077, "grad_norm": 0.310926460675642, "learning_rate": 1.2210663697211839e-06, "loss": 0.0172, "step": 8899 }, { "epoch": 2.386698846875838, "grad_norm": 0.34860188484869303, "learning_rate": 1.2200449153677158e-06, "loss": 0.0119, "step": 8900 }, { "epoch": 2.3869670152855993, "grad_norm": 0.22544749190147256, "learning_rate": 1.2190238290584404e-06, "loss": 0.0109, "step": 8901 }, { "epoch": 2.3872351836953607, "grad_norm": 0.23962491533368455, "learning_rate": 1.2180031108927747e-06, "loss": 0.0145, "step": 8902 }, { "epoch": 2.387503352105122, "grad_norm": 0.238123442608474, "learning_rate": 1.2169827609701067e-06, "loss": 0.0168, "step": 8903 }, { "epoch": 2.3877715205148835, "grad_norm": 0.2504086348923343, "learning_rate": 1.2159627793897822e-06, "loss": 0.0151, "step": 8904 }, { "epoch": 2.3880396889246445, "grad_norm": 0.24208487719736688, "learning_rate": 1.2149431662511164e-06, "loss": 0.0125, "step": 8905 }, { "epoch": 2.388307857334406, "grad_norm": 0.2726884796493773, "learning_rate": 1.213923921653386e-06, "loss": 0.017, "step": 8906 }, { "epoch": 2.3885760257441673, "grad_norm": 0.24761661896238568, "learning_rate": 1.2129050456958296e-06, "loss": 0.0149, "step": 8907 }, { "epoch": 2.3888441941539287, "grad_norm": 0.2903650345166876, "learning_rate": 1.2118865384776557e-06, "loss": 0.0146, "step": 8908 }, { "epoch": 2.38911236256369, "grad_norm": 0.2355149650340627, "learning_rate": 1.2108684000980314e-06, "loss": 0.0155, "step": 8909 }, { "epoch": 2.3893805309734515, "grad_norm": 0.17379595049950194, "learning_rate": 1.209850630656092e-06, "loss": 0.0095, "step": 8910 }, { "epoch": 2.389648699383213, "grad_norm": 0.20716854153297726, "learning_rate": 1.2088332302509331e-06, "loss": 0.0109, "step": 8911 }, { "epoch": 2.389916867792974, "grad_norm": 0.2976507143272806, "learning_rate": 1.2078161989816167e-06, "loss": 0.0148, "step": 8912 }, { "epoch": 2.3901850362027353, "grad_norm": 0.19630014860374387, "learning_rate": 1.2067995369471707e-06, "loss": 0.0091, "step": 8913 }, { "epoch": 2.3904532046124967, "grad_norm": 0.2708699642449018, "learning_rate": 1.2057832442465812e-06, "loss": 0.0152, "step": 8914 }, { "epoch": 2.390721373022258, "grad_norm": 0.2450899109755606, "learning_rate": 1.2047673209788058e-06, "loss": 0.0138, "step": 8915 }, { "epoch": 2.3909895414320195, "grad_norm": 0.2498755211362414, "learning_rate": 1.2037517672427584e-06, "loss": 0.0153, "step": 8916 }, { "epoch": 2.3912577098417804, "grad_norm": 0.24208444461348191, "learning_rate": 1.2027365831373229e-06, "loss": 0.0114, "step": 8917 }, { "epoch": 2.391525878251542, "grad_norm": 0.27464987038746524, "learning_rate": 1.2017217687613459e-06, "loss": 0.0119, "step": 8918 }, { "epoch": 2.3917940466613032, "grad_norm": 0.19308007987759734, "learning_rate": 1.2007073242136358e-06, "loss": 0.0079, "step": 8919 }, { "epoch": 2.3920622150710646, "grad_norm": 0.27478667978946936, "learning_rate": 1.199693249592967e-06, "loss": 0.0129, "step": 8920 }, { "epoch": 2.392330383480826, "grad_norm": 0.3724189446196056, "learning_rate": 1.1986795449980754e-06, "loss": 0.0092, "step": 8921 }, { "epoch": 2.3925985518905875, "grad_norm": 0.28723330298404476, "learning_rate": 1.1976662105276636e-06, "loss": 0.0133, "step": 8922 }, { "epoch": 2.392866720300349, "grad_norm": 0.28292753178080493, "learning_rate": 1.1966532462804004e-06, "loss": 0.0181, "step": 8923 }, { "epoch": 2.39313488871011, "grad_norm": 0.1728750320496948, "learning_rate": 1.1956406523549107e-06, "loss": 0.008, "step": 8924 }, { "epoch": 2.3934030571198712, "grad_norm": 0.27196377618620465, "learning_rate": 1.1946284288497917e-06, "loss": 0.0121, "step": 8925 }, { "epoch": 2.3936712255296326, "grad_norm": 0.2575641529535603, "learning_rate": 1.1936165758635988e-06, "loss": 0.0132, "step": 8926 }, { "epoch": 2.393939393939394, "grad_norm": 0.26288909062894616, "learning_rate": 1.1926050934948546e-06, "loss": 0.0151, "step": 8927 }, { "epoch": 2.3942075623491554, "grad_norm": 0.2775865845145447, "learning_rate": 1.1915939818420435e-06, "loss": 0.0165, "step": 8928 }, { "epoch": 2.3944757307589164, "grad_norm": 0.24843238587928054, "learning_rate": 1.190583241003615e-06, "loss": 0.0115, "step": 8929 }, { "epoch": 2.394743899168678, "grad_norm": 0.2084481830677716, "learning_rate": 1.1895728710779835e-06, "loss": 0.011, "step": 8930 }, { "epoch": 2.395012067578439, "grad_norm": 0.2303314316177401, "learning_rate": 1.1885628721635256e-06, "loss": 0.0146, "step": 8931 }, { "epoch": 2.3952802359882006, "grad_norm": 0.348939984788385, "learning_rate": 1.1875532443585796e-06, "loss": 0.0194, "step": 8932 }, { "epoch": 2.395548404397962, "grad_norm": 0.2714156748144347, "learning_rate": 1.1865439877614542e-06, "loss": 0.0118, "step": 8933 }, { "epoch": 2.3958165728077234, "grad_norm": 0.21323958876179125, "learning_rate": 1.1855351024704148e-06, "loss": 0.0116, "step": 8934 }, { "epoch": 2.396084741217485, "grad_norm": 0.2824618230103708, "learning_rate": 1.1845265885836971e-06, "loss": 0.0148, "step": 8935 }, { "epoch": 2.396352909627246, "grad_norm": 0.2558652677011885, "learning_rate": 1.1835184461994936e-06, "loss": 0.0184, "step": 8936 }, { "epoch": 2.396621078037007, "grad_norm": 0.21179821311893277, "learning_rate": 1.1825106754159677e-06, "loss": 0.02, "step": 8937 }, { "epoch": 2.3968892464467686, "grad_norm": 0.22470867529926203, "learning_rate": 1.1815032763312412e-06, "loss": 0.0127, "step": 8938 }, { "epoch": 2.39715741485653, "grad_norm": 0.4285237613611845, "learning_rate": 1.1804962490434024e-06, "loss": 0.0163, "step": 8939 }, { "epoch": 2.3974255832662914, "grad_norm": 0.23467361244108367, "learning_rate": 1.1794895936505047e-06, "loss": 0.0088, "step": 8940 }, { "epoch": 2.3976937516760524, "grad_norm": 0.21376197820355894, "learning_rate": 1.1784833102505604e-06, "loss": 0.0112, "step": 8941 }, { "epoch": 2.3979619200858138, "grad_norm": 0.2841502083921377, "learning_rate": 1.1774773989415522e-06, "loss": 0.0146, "step": 8942 }, { "epoch": 2.398230088495575, "grad_norm": 0.29752993559893237, "learning_rate": 1.176471859821421e-06, "loss": 0.0137, "step": 8943 }, { "epoch": 2.3984982569053366, "grad_norm": 0.2300866100203763, "learning_rate": 1.1754666929880714e-06, "loss": 0.0129, "step": 8944 }, { "epoch": 2.398766425315098, "grad_norm": 0.20249095947609028, "learning_rate": 1.1744618985393779e-06, "loss": 0.0127, "step": 8945 }, { "epoch": 2.3990345937248594, "grad_norm": 0.23461288041588463, "learning_rate": 1.1734574765731715e-06, "loss": 0.012, "step": 8946 }, { "epoch": 2.3993027621346203, "grad_norm": 0.2571935265214276, "learning_rate": 1.1724534271872522e-06, "loss": 0.0145, "step": 8947 }, { "epoch": 2.3995709305443818, "grad_norm": 0.19539378077179323, "learning_rate": 1.1714497504793797e-06, "loss": 0.0088, "step": 8948 }, { "epoch": 2.399839098954143, "grad_norm": 0.20966524970058859, "learning_rate": 1.1704464465472802e-06, "loss": 0.0118, "step": 8949 }, { "epoch": 2.4001072673639046, "grad_norm": 0.24338336768991495, "learning_rate": 1.1694435154886446e-06, "loss": 0.011, "step": 8950 }, { "epoch": 2.400375435773666, "grad_norm": 0.2625569150316832, "learning_rate": 1.168440957401123e-06, "loss": 0.0154, "step": 8951 }, { "epoch": 2.4006436041834274, "grad_norm": 0.26753006701296095, "learning_rate": 1.1674387723823332e-06, "loss": 0.0153, "step": 8952 }, { "epoch": 2.4009117725931883, "grad_norm": 0.22908348152004687, "learning_rate": 1.1664369605298543e-06, "loss": 0.0139, "step": 8953 }, { "epoch": 2.4011799410029497, "grad_norm": 0.34288830887703775, "learning_rate": 1.1654355219412316e-06, "loss": 0.0216, "step": 8954 }, { "epoch": 2.401448109412711, "grad_norm": 0.2993738772339097, "learning_rate": 1.1644344567139716e-06, "loss": 0.0227, "step": 8955 }, { "epoch": 2.4017162778224725, "grad_norm": 0.2540094089158068, "learning_rate": 1.163433764945544e-06, "loss": 0.0179, "step": 8956 }, { "epoch": 2.401984446232234, "grad_norm": 0.3824701523030589, "learning_rate": 1.162433446733386e-06, "loss": 0.013, "step": 8957 }, { "epoch": 2.4022526146419954, "grad_norm": 0.29860198097002477, "learning_rate": 1.1614335021748934e-06, "loss": 0.0159, "step": 8958 }, { "epoch": 2.4025207830517563, "grad_norm": 0.23830131060735016, "learning_rate": 1.16043393136743e-06, "loss": 0.0207, "step": 8959 }, { "epoch": 2.4027889514615177, "grad_norm": 0.2081124445473722, "learning_rate": 1.1594347344083218e-06, "loss": 0.0113, "step": 8960 }, { "epoch": 2.403057119871279, "grad_norm": 0.20663840139198966, "learning_rate": 1.1584359113948558e-06, "loss": 0.0131, "step": 8961 }, { "epoch": 2.4033252882810405, "grad_norm": 0.2665210583092656, "learning_rate": 1.1574374624242874e-06, "loss": 0.0133, "step": 8962 }, { "epoch": 2.403593456690802, "grad_norm": 0.2749575755414612, "learning_rate": 1.1564393875938295e-06, "loss": 0.0175, "step": 8963 }, { "epoch": 2.4038616251005633, "grad_norm": 0.25949339672251504, "learning_rate": 1.1554416870006657e-06, "loss": 0.0119, "step": 8964 }, { "epoch": 2.4041297935103243, "grad_norm": 0.2043555872963266, "learning_rate": 1.1544443607419364e-06, "loss": 0.0111, "step": 8965 }, { "epoch": 2.4043979619200857, "grad_norm": 0.22938426922413468, "learning_rate": 1.1534474089147501e-06, "loss": 0.0171, "step": 8966 }, { "epoch": 2.404666130329847, "grad_norm": 0.2442580051163986, "learning_rate": 1.1524508316161799e-06, "loss": 0.0111, "step": 8967 }, { "epoch": 2.4049342987396085, "grad_norm": 0.2676534371400079, "learning_rate": 1.1514546289432543e-06, "loss": 0.0148, "step": 8968 }, { "epoch": 2.40520246714937, "grad_norm": 0.24368749708155377, "learning_rate": 1.150458800992974e-06, "loss": 0.0134, "step": 8969 }, { "epoch": 2.4054706355591313, "grad_norm": 0.2455797409295501, "learning_rate": 1.1494633478623008e-06, "loss": 0.017, "step": 8970 }, { "epoch": 2.4057388039688923, "grad_norm": 0.21105909614053148, "learning_rate": 1.1484682696481575e-06, "loss": 0.0133, "step": 8971 }, { "epoch": 2.4060069723786537, "grad_norm": 0.2456015665349908, "learning_rate": 1.1474735664474346e-06, "loss": 0.0119, "step": 8972 }, { "epoch": 2.406275140788415, "grad_norm": 0.2373576103175216, "learning_rate": 1.1464792383569806e-06, "loss": 0.0115, "step": 8973 }, { "epoch": 2.4065433091981765, "grad_norm": 0.264275129918367, "learning_rate": 1.1454852854736138e-06, "loss": 0.0131, "step": 8974 }, { "epoch": 2.406811477607938, "grad_norm": 0.2188753446663117, "learning_rate": 1.1444917078941099e-06, "loss": 0.0112, "step": 8975 }, { "epoch": 2.4070796460176993, "grad_norm": 0.362218174027092, "learning_rate": 1.143498505715212e-06, "loss": 0.0227, "step": 8976 }, { "epoch": 2.4073478144274603, "grad_norm": 0.2568216875750802, "learning_rate": 1.1425056790336274e-06, "loss": 0.0147, "step": 8977 }, { "epoch": 2.4076159828372217, "grad_norm": 0.257097226202802, "learning_rate": 1.1415132279460217e-06, "loss": 0.015, "step": 8978 }, { "epoch": 2.407884151246983, "grad_norm": 0.267637388761784, "learning_rate": 1.1405211525490307e-06, "loss": 0.0153, "step": 8979 }, { "epoch": 2.4081523196567445, "grad_norm": 0.23261555592433777, "learning_rate": 1.1395294529392481e-06, "loss": 0.0113, "step": 8980 }, { "epoch": 2.408420488066506, "grad_norm": 0.2088945230170517, "learning_rate": 1.138538129213232e-06, "loss": 0.0135, "step": 8981 }, { "epoch": 2.4086886564762673, "grad_norm": 0.26757456713187644, "learning_rate": 1.1375471814675076e-06, "loss": 0.0172, "step": 8982 }, { "epoch": 2.4089568248860282, "grad_norm": 0.30707992635246745, "learning_rate": 1.136556609798558e-06, "loss": 0.0178, "step": 8983 }, { "epoch": 2.4092249932957897, "grad_norm": 0.18689621928705544, "learning_rate": 1.1355664143028356e-06, "loss": 0.0108, "step": 8984 }, { "epoch": 2.409493161705551, "grad_norm": 0.2204720872926068, "learning_rate": 1.1345765950767507e-06, "loss": 0.0138, "step": 8985 }, { "epoch": 2.4097613301153125, "grad_norm": 0.2252101783296155, "learning_rate": 1.1335871522166797e-06, "loss": 0.0099, "step": 8986 }, { "epoch": 2.410029498525074, "grad_norm": 0.3447113958971166, "learning_rate": 1.1325980858189639e-06, "loss": 0.0247, "step": 8987 }, { "epoch": 2.4102976669348353, "grad_norm": 0.20796525719621287, "learning_rate": 1.1316093959799034e-06, "loss": 0.0091, "step": 8988 }, { "epoch": 2.4105658353445962, "grad_norm": 0.2349923552570824, "learning_rate": 1.130621082795767e-06, "loss": 0.0135, "step": 8989 }, { "epoch": 2.4108340037543576, "grad_norm": 0.2330563868819737, "learning_rate": 1.1296331463627814e-06, "loss": 0.0152, "step": 8990 }, { "epoch": 2.411102172164119, "grad_norm": 0.221006971603871, "learning_rate": 1.1286455867771422e-06, "loss": 0.0118, "step": 8991 }, { "epoch": 2.4113703405738804, "grad_norm": 0.2586053294090216, "learning_rate": 1.1276584041350036e-06, "loss": 0.0259, "step": 8992 }, { "epoch": 2.411638508983642, "grad_norm": 0.35759300409319444, "learning_rate": 1.1266715985324843e-06, "loss": 0.0157, "step": 8993 }, { "epoch": 2.411906677393403, "grad_norm": 0.36323005406335307, "learning_rate": 1.1256851700656695e-06, "loss": 0.0151, "step": 8994 }, { "epoch": 2.412174845803164, "grad_norm": 0.20950723757364045, "learning_rate": 1.1246991188306018e-06, "loss": 0.0105, "step": 8995 }, { "epoch": 2.4124430142129256, "grad_norm": 0.29567405787535755, "learning_rate": 1.1237134449232922e-06, "loss": 0.0158, "step": 8996 }, { "epoch": 2.412711182622687, "grad_norm": 0.2931943115932395, "learning_rate": 1.122728148439715e-06, "loss": 0.0165, "step": 8997 }, { "epoch": 2.4129793510324484, "grad_norm": 0.2265225399155894, "learning_rate": 1.1217432294758023e-06, "loss": 0.0109, "step": 8998 }, { "epoch": 2.41324751944221, "grad_norm": 0.30373948040003457, "learning_rate": 1.1207586881274562e-06, "loss": 0.014, "step": 8999 }, { "epoch": 2.4135156878519712, "grad_norm": 0.2965281453965157, "learning_rate": 1.1197745244905362e-06, "loss": 0.0127, "step": 9000 }, { "epoch": 2.413783856261732, "grad_norm": 0.5315404855850755, "learning_rate": 1.1187907386608698e-06, "loss": 0.0174, "step": 9001 }, { "epoch": 2.4140520246714936, "grad_norm": 0.2215528231046045, "learning_rate": 1.117807330734244e-06, "loss": 0.0116, "step": 9002 }, { "epoch": 2.414320193081255, "grad_norm": 0.24203933761863441, "learning_rate": 1.1168243008064123e-06, "loss": 0.0104, "step": 9003 }, { "epoch": 2.4145883614910164, "grad_norm": 0.3103980599632524, "learning_rate": 1.1158416489730895e-06, "loss": 0.0187, "step": 9004 }, { "epoch": 2.414856529900778, "grad_norm": 0.3142940712897623, "learning_rate": 1.114859375329951e-06, "loss": 0.0164, "step": 9005 }, { "epoch": 2.4151246983105388, "grad_norm": 0.22536701560534933, "learning_rate": 1.113877479972642e-06, "loss": 0.0164, "step": 9006 }, { "epoch": 2.4153928667203, "grad_norm": 0.20302589638445268, "learning_rate": 1.112895962996764e-06, "loss": 0.0108, "step": 9007 }, { "epoch": 2.4156610351300616, "grad_norm": 0.3340456792769076, "learning_rate": 1.1119148244978856e-06, "loss": 0.0166, "step": 9008 }, { "epoch": 2.415929203539823, "grad_norm": 0.2820654035153035, "learning_rate": 1.1109340645715394e-06, "loss": 0.0246, "step": 9009 }, { "epoch": 2.4161973719495844, "grad_norm": 0.2615254818623496, "learning_rate": 1.1099536833132173e-06, "loss": 0.0161, "step": 9010 }, { "epoch": 2.416465540359346, "grad_norm": 0.21645441418326358, "learning_rate": 1.1089736808183782e-06, "loss": 0.0093, "step": 9011 }, { "epoch": 2.416733708769107, "grad_norm": 0.3305579819847133, "learning_rate": 1.1079940571824398e-06, "loss": 0.0255, "step": 9012 }, { "epoch": 2.417001877178868, "grad_norm": 0.2151647832925846, "learning_rate": 1.1070148125007874e-06, "loss": 0.0124, "step": 9013 }, { "epoch": 2.4172700455886296, "grad_norm": 0.3030285629924302, "learning_rate": 1.106035946868768e-06, "loss": 0.0202, "step": 9014 }, { "epoch": 2.417538213998391, "grad_norm": 0.2673913289134322, "learning_rate": 1.1050574603816905e-06, "loss": 0.0145, "step": 9015 }, { "epoch": 2.4178063824081524, "grad_norm": 0.25235668734710826, "learning_rate": 1.104079353134827e-06, "loss": 0.0134, "step": 9016 }, { "epoch": 2.418074550817914, "grad_norm": 0.26143734782930905, "learning_rate": 1.1031016252234123e-06, "loss": 0.0147, "step": 9017 }, { "epoch": 2.4183427192276747, "grad_norm": 0.32754674050455207, "learning_rate": 1.102124276742646e-06, "loss": 0.0237, "step": 9018 }, { "epoch": 2.418610887637436, "grad_norm": 0.21230924975657184, "learning_rate": 1.1011473077876916e-06, "loss": 0.0109, "step": 9019 }, { "epoch": 2.4188790560471976, "grad_norm": 0.2203401196432714, "learning_rate": 1.1001707184536714e-06, "loss": 0.0161, "step": 9020 }, { "epoch": 2.419147224456959, "grad_norm": 0.31127551350874344, "learning_rate": 1.099194508835676e-06, "loss": 0.0129, "step": 9021 }, { "epoch": 2.4194153928667204, "grad_norm": 0.40680028382634426, "learning_rate": 1.098218679028753e-06, "loss": 0.0189, "step": 9022 }, { "epoch": 2.4196835612764818, "grad_norm": 0.28671617662431553, "learning_rate": 1.09724322912792e-06, "loss": 0.0198, "step": 9023 }, { "epoch": 2.419951729686243, "grad_norm": 0.27722171814325464, "learning_rate": 1.0962681592281504e-06, "loss": 0.0175, "step": 9024 }, { "epoch": 2.420219898096004, "grad_norm": 0.21173922240727902, "learning_rate": 1.095293469424386e-06, "loss": 0.0114, "step": 9025 }, { "epoch": 2.4204880665057655, "grad_norm": 0.23040888032505438, "learning_rate": 1.094319159811531e-06, "loss": 0.0205, "step": 9026 }, { "epoch": 2.420756234915527, "grad_norm": 0.258905304366605, "learning_rate": 1.0933452304844505e-06, "loss": 0.0142, "step": 9027 }, { "epoch": 2.4210244033252883, "grad_norm": 0.22784106824899747, "learning_rate": 1.0923716815379714e-06, "loss": 0.0189, "step": 9028 }, { "epoch": 2.4212925717350497, "grad_norm": 0.17988995878797667, "learning_rate": 1.0913985130668881e-06, "loss": 0.0115, "step": 9029 }, { "epoch": 2.4215607401448107, "grad_norm": 0.526037192996088, "learning_rate": 1.0904257251659535e-06, "loss": 0.0221, "step": 9030 }, { "epoch": 2.421828908554572, "grad_norm": 0.21122668235110634, "learning_rate": 1.0894533179298877e-06, "loss": 0.0116, "step": 9031 }, { "epoch": 2.4220970769643335, "grad_norm": 0.25921485026370983, "learning_rate": 1.0884812914533683e-06, "loss": 0.0153, "step": 9032 }, { "epoch": 2.422365245374095, "grad_norm": 0.1783664618367453, "learning_rate": 1.0875096458310426e-06, "loss": 0.0103, "step": 9033 }, { "epoch": 2.4226334137838563, "grad_norm": 0.22760849341335523, "learning_rate": 1.086538381157513e-06, "loss": 0.0148, "step": 9034 }, { "epoch": 2.4229015821936177, "grad_norm": 0.20163762952074663, "learning_rate": 1.0855674975273517e-06, "loss": 0.0134, "step": 9035 }, { "epoch": 2.423169750603379, "grad_norm": 0.23594968666092236, "learning_rate": 1.0845969950350921e-06, "loss": 0.0135, "step": 9036 }, { "epoch": 2.42343791901314, "grad_norm": 0.2496630910614985, "learning_rate": 1.083626873775226e-06, "loss": 0.0109, "step": 9037 }, { "epoch": 2.4237060874229015, "grad_norm": 0.24237573922825387, "learning_rate": 1.0826571338422153e-06, "loss": 0.0129, "step": 9038 }, { "epoch": 2.423974255832663, "grad_norm": 0.35377429247880243, "learning_rate": 1.0816877753304777e-06, "loss": 0.0242, "step": 9039 }, { "epoch": 2.4242424242424243, "grad_norm": 0.2545030341173104, "learning_rate": 1.0807187983343998e-06, "loss": 0.0138, "step": 9040 }, { "epoch": 2.4245105926521857, "grad_norm": 0.27251373935521084, "learning_rate": 1.079750202948327e-06, "loss": 0.0203, "step": 9041 }, { "epoch": 2.4247787610619467, "grad_norm": 0.24023004743675008, "learning_rate": 1.0787819892665673e-06, "loss": 0.0167, "step": 9042 }, { "epoch": 2.425046929471708, "grad_norm": 0.2151738081314473, "learning_rate": 1.0778141573833962e-06, "loss": 0.0139, "step": 9043 }, { "epoch": 2.4253150978814695, "grad_norm": 0.23397875991047445, "learning_rate": 1.076846707393046e-06, "loss": 0.0118, "step": 9044 }, { "epoch": 2.425583266291231, "grad_norm": 0.2596426155955461, "learning_rate": 1.0758796393897163e-06, "loss": 0.0164, "step": 9045 }, { "epoch": 2.4258514347009923, "grad_norm": 0.3022000146567599, "learning_rate": 1.074912953467569e-06, "loss": 0.0182, "step": 9046 }, { "epoch": 2.4261196031107537, "grad_norm": 0.27741510558731025, "learning_rate": 1.0739466497207252e-06, "loss": 0.0182, "step": 9047 }, { "epoch": 2.426387771520515, "grad_norm": 0.22197595237692136, "learning_rate": 1.0729807282432737e-06, "loss": 0.0157, "step": 9048 }, { "epoch": 2.426655939930276, "grad_norm": 0.3109968939512372, "learning_rate": 1.0720151891292619e-06, "loss": 0.0186, "step": 9049 }, { "epoch": 2.4269241083400375, "grad_norm": 0.21830328781493383, "learning_rate": 1.0710500324727035e-06, "loss": 0.0141, "step": 9050 }, { "epoch": 2.427192276749799, "grad_norm": 0.2418846349771064, "learning_rate": 1.0700852583675708e-06, "loss": 0.0123, "step": 9051 }, { "epoch": 2.4274604451595603, "grad_norm": 0.25278427112603036, "learning_rate": 1.0691208669078045e-06, "loss": 0.0198, "step": 9052 }, { "epoch": 2.4277286135693217, "grad_norm": 0.24640597456561303, "learning_rate": 1.0681568581873031e-06, "loss": 0.0125, "step": 9053 }, { "epoch": 2.4279967819790826, "grad_norm": 0.2405237812770578, "learning_rate": 1.067193232299928e-06, "loss": 0.0137, "step": 9054 }, { "epoch": 2.428264950388844, "grad_norm": 0.1954389754531959, "learning_rate": 1.0662299893395062e-06, "loss": 0.0109, "step": 9055 }, { "epoch": 2.4285331187986055, "grad_norm": 0.21554416138708013, "learning_rate": 1.065267129399828e-06, "loss": 0.0103, "step": 9056 }, { "epoch": 2.428801287208367, "grad_norm": 0.2350961010780986, "learning_rate": 1.0643046525746414e-06, "loss": 0.0141, "step": 9057 }, { "epoch": 2.4290694556181283, "grad_norm": 0.26794093868997493, "learning_rate": 1.0633425589576636e-06, "loss": 0.0186, "step": 9058 }, { "epoch": 2.4293376240278897, "grad_norm": 0.2753119583072202, "learning_rate": 1.0623808486425674e-06, "loss": 0.0147, "step": 9059 }, { "epoch": 2.429605792437651, "grad_norm": 0.3086129590470031, "learning_rate": 1.0614195217229957e-06, "loss": 0.0234, "step": 9060 }, { "epoch": 2.429873960847412, "grad_norm": 0.20758031101122348, "learning_rate": 1.0604585782925465e-06, "loss": 0.013, "step": 9061 }, { "epoch": 2.4301421292571734, "grad_norm": 0.23980291178912524, "learning_rate": 1.059498018444786e-06, "loss": 0.0112, "step": 9062 }, { "epoch": 2.430410297666935, "grad_norm": 0.24558960006564906, "learning_rate": 1.0585378422732435e-06, "loss": 0.0131, "step": 9063 }, { "epoch": 2.4306784660766962, "grad_norm": 0.24156801034628894, "learning_rate": 1.0575780498714067e-06, "loss": 0.0137, "step": 9064 }, { "epoch": 2.4309466344864576, "grad_norm": 0.22565744533826532, "learning_rate": 1.0566186413327274e-06, "loss": 0.0134, "step": 9065 }, { "epoch": 2.4312148028962186, "grad_norm": 0.2812350174966112, "learning_rate": 1.0556596167506223e-06, "loss": 0.0109, "step": 9066 }, { "epoch": 2.43148297130598, "grad_norm": 0.28176904720687085, "learning_rate": 1.0547009762184667e-06, "loss": 0.0122, "step": 9067 }, { "epoch": 2.4317511397157414, "grad_norm": 0.3192953045772836, "learning_rate": 1.0537427198296046e-06, "loss": 0.0172, "step": 9068 }, { "epoch": 2.432019308125503, "grad_norm": 0.1968057586420274, "learning_rate": 1.052784847677335e-06, "loss": 0.0119, "step": 9069 }, { "epoch": 2.4322874765352642, "grad_norm": 0.24941351032200065, "learning_rate": 1.0518273598549267e-06, "loss": 0.015, "step": 9070 }, { "epoch": 2.4325556449450256, "grad_norm": 0.19255185992171772, "learning_rate": 1.0508702564556045e-06, "loss": 0.01, "step": 9071 }, { "epoch": 2.432823813354787, "grad_norm": 0.20205885516707056, "learning_rate": 1.0499135375725606e-06, "loss": 0.0131, "step": 9072 }, { "epoch": 2.433091981764548, "grad_norm": 0.2938080741917749, "learning_rate": 1.0489572032989498e-06, "loss": 0.0184, "step": 9073 }, { "epoch": 2.4333601501743094, "grad_norm": 0.23834134069507926, "learning_rate": 1.048001253727885e-06, "loss": 0.0119, "step": 9074 }, { "epoch": 2.433628318584071, "grad_norm": 0.2765386961751854, "learning_rate": 1.0470456889524473e-06, "loss": 0.0175, "step": 9075 }, { "epoch": 2.433896486993832, "grad_norm": 0.2646075931019891, "learning_rate": 1.046090509065676e-06, "loss": 0.0166, "step": 9076 }, { "epoch": 2.4341646554035936, "grad_norm": 0.25916736941990226, "learning_rate": 1.0451357141605728e-06, "loss": 0.0154, "step": 9077 }, { "epoch": 2.4344328238133546, "grad_norm": 0.21120882958734571, "learning_rate": 1.0441813043301064e-06, "loss": 0.0091, "step": 9078 }, { "epoch": 2.434700992223116, "grad_norm": 0.2941808092924196, "learning_rate": 1.0432272796672028e-06, "loss": 0.0121, "step": 9079 }, { "epoch": 2.4349691606328774, "grad_norm": 0.916290487157292, "learning_rate": 1.0422736402647548e-06, "loss": 0.0156, "step": 9080 }, { "epoch": 2.435237329042639, "grad_norm": 0.26048983439635126, "learning_rate": 1.041320386215614e-06, "loss": 0.0157, "step": 9081 }, { "epoch": 2.4355054974524, "grad_norm": 0.21262406827303099, "learning_rate": 1.0403675176125971e-06, "loss": 0.0125, "step": 9082 }, { "epoch": 2.4357736658621616, "grad_norm": 0.26436806913504657, "learning_rate": 1.0394150345484838e-06, "loss": 0.0135, "step": 9083 }, { "epoch": 2.436041834271923, "grad_norm": 0.25486126802829767, "learning_rate": 1.0384629371160122e-06, "loss": 0.0137, "step": 9084 }, { "epoch": 2.436310002681684, "grad_norm": 0.25583689636177115, "learning_rate": 1.0375112254078878e-06, "loss": 0.012, "step": 9085 }, { "epoch": 2.4365781710914454, "grad_norm": 0.20381927491149499, "learning_rate": 1.0365598995167736e-06, "loss": 0.0111, "step": 9086 }, { "epoch": 2.4368463395012068, "grad_norm": 0.20003463016584686, "learning_rate": 1.0356089595353008e-06, "loss": 0.0103, "step": 9087 }, { "epoch": 2.437114507910968, "grad_norm": 0.23715222517157653, "learning_rate": 1.0346584055560587e-06, "loss": 0.0128, "step": 9088 }, { "epoch": 2.4373826763207296, "grad_norm": 0.28597333888507026, "learning_rate": 1.0337082376715985e-06, "loss": 0.0159, "step": 9089 }, { "epoch": 2.4376508447304905, "grad_norm": 0.25194055810146543, "learning_rate": 1.032758455974438e-06, "loss": 0.0124, "step": 9090 }, { "epoch": 2.437919013140252, "grad_norm": 0.268752749077263, "learning_rate": 1.0318090605570523e-06, "loss": 0.0165, "step": 9091 }, { "epoch": 2.4381871815500133, "grad_norm": 0.6863306637929121, "learning_rate": 1.0308600515118834e-06, "loss": 0.0257, "step": 9092 }, { "epoch": 2.4384553499597748, "grad_norm": 0.22817493977227393, "learning_rate": 1.0299114289313344e-06, "loss": 0.0107, "step": 9093 }, { "epoch": 2.438723518369536, "grad_norm": 0.19288011275645034, "learning_rate": 1.028963192907768e-06, "loss": 0.0062, "step": 9094 }, { "epoch": 2.4389916867792976, "grad_norm": 0.21612729415146228, "learning_rate": 1.0280153435335139e-06, "loss": 0.0117, "step": 9095 }, { "epoch": 2.439259855189059, "grad_norm": 0.277536117826134, "learning_rate": 1.0270678809008594e-06, "loss": 0.0164, "step": 9096 }, { "epoch": 2.43952802359882, "grad_norm": 0.2918040960978186, "learning_rate": 1.0261208051020584e-06, "loss": 0.0137, "step": 9097 }, { "epoch": 2.4397961920085813, "grad_norm": 0.29771386613234274, "learning_rate": 1.0251741162293228e-06, "loss": 0.0165, "step": 9098 }, { "epoch": 2.4400643604183427, "grad_norm": 0.25496286299464455, "learning_rate": 1.0242278143748307e-06, "loss": 0.0108, "step": 9099 }, { "epoch": 2.440332528828104, "grad_norm": 0.20296479843569679, "learning_rate": 1.023281899630723e-06, "loss": 0.0106, "step": 9100 }, { "epoch": 2.4406006972378655, "grad_norm": 0.2729067625042114, "learning_rate": 1.0223363720890966e-06, "loss": 0.0113, "step": 9101 }, { "epoch": 2.4408688656476265, "grad_norm": 0.21198395923082908, "learning_rate": 1.0213912318420189e-06, "loss": 0.0111, "step": 9102 }, { "epoch": 2.441137034057388, "grad_norm": 0.2614452534793551, "learning_rate": 1.0204464789815122e-06, "loss": 0.0235, "step": 9103 }, { "epoch": 2.4414052024671493, "grad_norm": 0.2878785904861613, "learning_rate": 1.0195021135995658e-06, "loss": 0.0196, "step": 9104 }, { "epoch": 2.4416733708769107, "grad_norm": 0.23574699285158554, "learning_rate": 1.0185581357881325e-06, "loss": 0.0119, "step": 9105 }, { "epoch": 2.441941539286672, "grad_norm": 0.32303435496754435, "learning_rate": 1.0176145456391219e-06, "loss": 0.0128, "step": 9106 }, { "epoch": 2.4422097076964335, "grad_norm": 0.28010618977728075, "learning_rate": 1.0166713432444108e-06, "loss": 0.015, "step": 9107 }, { "epoch": 2.442477876106195, "grad_norm": 0.5614975562030432, "learning_rate": 1.015728528695834e-06, "loss": 0.0194, "step": 9108 }, { "epoch": 2.442746044515956, "grad_norm": 0.22129816851623307, "learning_rate": 1.0147861020851929e-06, "loss": 0.0138, "step": 9109 }, { "epoch": 2.4430142129257173, "grad_norm": 0.2623286495545508, "learning_rate": 1.0138440635042495e-06, "loss": 0.0115, "step": 9110 }, { "epoch": 2.4432823813354787, "grad_norm": 0.32372058895317196, "learning_rate": 1.012902413044725e-06, "loss": 0.0127, "step": 9111 }, { "epoch": 2.44355054974524, "grad_norm": 0.1925924332488762, "learning_rate": 1.0119611507983102e-06, "loss": 0.0111, "step": 9112 }, { "epoch": 2.4438187181550015, "grad_norm": 0.2288066064517032, "learning_rate": 1.0110202768566474e-06, "loss": 0.0097, "step": 9113 }, { "epoch": 2.4440868865647625, "grad_norm": 0.26058351563234644, "learning_rate": 1.0100797913113491e-06, "loss": 0.0143, "step": 9114 }, { "epoch": 2.444355054974524, "grad_norm": 0.3050554251588954, "learning_rate": 1.0091396942539904e-06, "loss": 0.0157, "step": 9115 }, { "epoch": 2.4446232233842853, "grad_norm": 0.22465701055104542, "learning_rate": 1.0081999857761026e-06, "loss": 0.0135, "step": 9116 }, { "epoch": 2.4448913917940467, "grad_norm": 0.18604000115387828, "learning_rate": 1.007260665969186e-06, "loss": 0.0089, "step": 9117 }, { "epoch": 2.445159560203808, "grad_norm": 0.3226109923379357, "learning_rate": 1.0063217349246956e-06, "loss": 0.0176, "step": 9118 }, { "epoch": 2.4454277286135695, "grad_norm": 0.23008113046334286, "learning_rate": 1.005383192734057e-06, "loss": 0.0101, "step": 9119 }, { "epoch": 2.445695897023331, "grad_norm": 0.22023505625016718, "learning_rate": 1.0044450394886496e-06, "loss": 0.0106, "step": 9120 }, { "epoch": 2.445964065433092, "grad_norm": 0.2294634620987983, "learning_rate": 1.0035072752798208e-06, "loss": 0.0118, "step": 9121 }, { "epoch": 2.4462322338428533, "grad_norm": 0.16497875276822027, "learning_rate": 1.0025699001988793e-06, "loss": 0.0085, "step": 9122 }, { "epoch": 2.4465004022526147, "grad_norm": 0.2968024056540365, "learning_rate": 1.0016329143370929e-06, "loss": 0.0206, "step": 9123 }, { "epoch": 2.446768570662376, "grad_norm": 0.27301448401906914, "learning_rate": 1.000696317785695e-06, "loss": 0.0165, "step": 9124 }, { "epoch": 2.4470367390721375, "grad_norm": 0.40771136280637615, "learning_rate": 9.997601106358785e-07, "loss": 0.0192, "step": 9125 }, { "epoch": 2.4473049074818984, "grad_norm": 0.2600030053596715, "learning_rate": 9.98824292978799e-07, "loss": 0.0145, "step": 9126 }, { "epoch": 2.44757307589166, "grad_norm": 0.23410809789546616, "learning_rate": 9.978888649055757e-07, "loss": 0.0099, "step": 9127 }, { "epoch": 2.4478412443014212, "grad_norm": 0.34575040072198565, "learning_rate": 9.969538265072876e-07, "loss": 0.0169, "step": 9128 }, { "epoch": 2.4481094127111827, "grad_norm": 0.34864998962584937, "learning_rate": 9.960191778749783e-07, "loss": 0.0175, "step": 9129 }, { "epoch": 2.448377581120944, "grad_norm": 0.24647498095747095, "learning_rate": 9.950849190996503e-07, "loss": 0.0124, "step": 9130 }, { "epoch": 2.4486457495307055, "grad_norm": 0.2783685073756563, "learning_rate": 9.941510502722707e-07, "loss": 0.0211, "step": 9131 }, { "epoch": 2.4489139179404664, "grad_norm": 0.29366357305368285, "learning_rate": 9.932175714837695e-07, "loss": 0.0193, "step": 9132 }, { "epoch": 2.449182086350228, "grad_norm": 0.23399124493062584, "learning_rate": 9.922844828250339e-07, "loss": 0.0129, "step": 9133 }, { "epoch": 2.4494502547599892, "grad_norm": 0.3924118652029773, "learning_rate": 9.913517843869197e-07, "loss": 0.0213, "step": 9134 }, { "epoch": 2.4497184231697506, "grad_norm": 0.25679192881433266, "learning_rate": 9.904194762602382e-07, "loss": 0.0117, "step": 9135 }, { "epoch": 2.449986591579512, "grad_norm": 0.31301070974190776, "learning_rate": 9.894875585357678e-07, "loss": 0.0205, "step": 9136 }, { "epoch": 2.4502547599892734, "grad_norm": 0.3004325418934879, "learning_rate": 9.885560313042464e-07, "loss": 0.0169, "step": 9137 }, { "epoch": 2.4505229283990344, "grad_norm": 0.17051049485653105, "learning_rate": 9.876248946563732e-07, "loss": 0.0076, "step": 9138 }, { "epoch": 2.450791096808796, "grad_norm": 0.2517870537487761, "learning_rate": 9.866941486828119e-07, "loss": 0.0175, "step": 9139 }, { "epoch": 2.451059265218557, "grad_norm": 0.20846544256639823, "learning_rate": 9.857637934741854e-07, "loss": 0.0141, "step": 9140 }, { "epoch": 2.4513274336283186, "grad_norm": 0.26846130221518455, "learning_rate": 9.848338291210807e-07, "loss": 0.0184, "step": 9141 }, { "epoch": 2.45159560203808, "grad_norm": 0.30482564251824906, "learning_rate": 9.839042557140472e-07, "loss": 0.0137, "step": 9142 }, { "epoch": 2.4518637704478414, "grad_norm": 0.18919862392541226, "learning_rate": 9.829750733435928e-07, "loss": 0.0095, "step": 9143 }, { "epoch": 2.4521319388576024, "grad_norm": 0.20190728646544717, "learning_rate": 9.820462821001913e-07, "loss": 0.0093, "step": 9144 }, { "epoch": 2.452400107267364, "grad_norm": 0.22804541036683662, "learning_rate": 9.811178820742751e-07, "loss": 0.0103, "step": 9145 }, { "epoch": 2.452668275677125, "grad_norm": 0.24552412353800554, "learning_rate": 9.801898733562415e-07, "loss": 0.012, "step": 9146 }, { "epoch": 2.4529364440868866, "grad_norm": 0.24316902822533418, "learning_rate": 9.792622560364467e-07, "loss": 0.0131, "step": 9147 }, { "epoch": 2.453204612496648, "grad_norm": 0.27237942060933457, "learning_rate": 9.783350302052124e-07, "loss": 0.0169, "step": 9148 }, { "epoch": 2.4534727809064094, "grad_norm": 0.23938322070577253, "learning_rate": 9.774081959528186e-07, "loss": 0.013, "step": 9149 }, { "epoch": 2.4537409493161704, "grad_norm": 0.26649978161389626, "learning_rate": 9.76481753369508e-07, "loss": 0.0166, "step": 9150 }, { "epoch": 2.4540091177259318, "grad_norm": 0.2607979516496919, "learning_rate": 9.755557025454865e-07, "loss": 0.0125, "step": 9151 }, { "epoch": 2.454277286135693, "grad_norm": 0.2703144633188623, "learning_rate": 9.746300435709233e-07, "loss": 0.0115, "step": 9152 }, { "epoch": 2.4545454545454546, "grad_norm": 0.26188508396072446, "learning_rate": 9.737047765359443e-07, "loss": 0.0142, "step": 9153 }, { "epoch": 2.454813622955216, "grad_norm": 0.2435200020645698, "learning_rate": 9.727799015306427e-07, "loss": 0.0173, "step": 9154 }, { "epoch": 2.4550817913649774, "grad_norm": 0.234523563020842, "learning_rate": 9.718554186450686e-07, "loss": 0.011, "step": 9155 }, { "epoch": 2.4553499597747384, "grad_norm": 0.23507239376410358, "learning_rate": 9.70931327969239e-07, "loss": 0.0106, "step": 9156 }, { "epoch": 2.4556181281844998, "grad_norm": 0.28793634651708705, "learning_rate": 9.700076295931281e-07, "loss": 0.0168, "step": 9157 }, { "epoch": 2.455886296594261, "grad_norm": 0.20399842206377147, "learning_rate": 9.69084323606675e-07, "loss": 0.0126, "step": 9158 }, { "epoch": 2.4561544650040226, "grad_norm": 0.2059213327754596, "learning_rate": 9.681614100997806e-07, "loss": 0.0123, "step": 9159 }, { "epoch": 2.456422633413784, "grad_norm": 0.31585661275508586, "learning_rate": 9.672388891623035e-07, "loss": 0.0135, "step": 9160 }, { "epoch": 2.4566908018235454, "grad_norm": 0.24657097062572503, "learning_rate": 9.663167608840711e-07, "loss": 0.014, "step": 9161 }, { "epoch": 2.4569589702333063, "grad_norm": 0.26050884219817527, "learning_rate": 9.65395025354866e-07, "loss": 0.0139, "step": 9162 }, { "epoch": 2.4572271386430677, "grad_norm": 0.3087878877371412, "learning_rate": 9.644736826644347e-07, "loss": 0.019, "step": 9163 }, { "epoch": 2.457495307052829, "grad_norm": 0.2777293432405128, "learning_rate": 9.63552732902488e-07, "loss": 0.0151, "step": 9164 }, { "epoch": 2.4577634754625906, "grad_norm": 0.2003181352156441, "learning_rate": 9.626321761586937e-07, "loss": 0.0116, "step": 9165 }, { "epoch": 2.458031643872352, "grad_norm": 0.22121233788334033, "learning_rate": 9.617120125226864e-07, "loss": 0.0111, "step": 9166 }, { "epoch": 2.4582998122821134, "grad_norm": 0.2084194033245839, "learning_rate": 9.607922420840583e-07, "loss": 0.0109, "step": 9167 }, { "epoch": 2.4585679806918743, "grad_norm": 2.658693526867989, "learning_rate": 9.59872864932365e-07, "loss": 0.0242, "step": 9168 }, { "epoch": 2.4588361491016357, "grad_norm": 0.358666355856082, "learning_rate": 9.589538811571265e-07, "loss": 0.0237, "step": 9169 }, { "epoch": 2.459104317511397, "grad_norm": 0.22743863409401832, "learning_rate": 9.580352908478184e-07, "loss": 0.0094, "step": 9170 }, { "epoch": 2.4593724859211585, "grad_norm": 0.3505622264692479, "learning_rate": 9.57117094093884e-07, "loss": 0.0129, "step": 9171 }, { "epoch": 2.45964065433092, "grad_norm": 0.24496157889107473, "learning_rate": 9.561992909847235e-07, "loss": 0.0129, "step": 9172 }, { "epoch": 2.4599088227406813, "grad_norm": 0.18498047063961395, "learning_rate": 9.552818816097026e-07, "loss": 0.0094, "step": 9173 }, { "epoch": 2.4601769911504423, "grad_norm": 0.364705860626398, "learning_rate": 9.543648660581466e-07, "loss": 0.024, "step": 9174 }, { "epoch": 2.4604451595602037, "grad_norm": 0.5617333184932559, "learning_rate": 9.534482444193416e-07, "loss": 0.0141, "step": 9175 }, { "epoch": 2.460713327969965, "grad_norm": 0.23894138948062665, "learning_rate": 9.525320167825386e-07, "loss": 0.0108, "step": 9176 }, { "epoch": 2.4609814963797265, "grad_norm": 0.2524481967325974, "learning_rate": 9.516161832369464e-07, "loss": 0.0134, "step": 9177 }, { "epoch": 2.461249664789488, "grad_norm": 0.2651150093550674, "learning_rate": 9.50700743871738e-07, "loss": 0.0117, "step": 9178 }, { "epoch": 2.461517833199249, "grad_norm": 0.28577626381068877, "learning_rate": 9.497856987760484e-07, "loss": 0.0138, "step": 9179 }, { "epoch": 2.4617860016090103, "grad_norm": 0.23908365808011442, "learning_rate": 9.488710480389718e-07, "loss": 0.0115, "step": 9180 }, { "epoch": 2.4620541700187717, "grad_norm": 0.2416333645910043, "learning_rate": 9.479567917495664e-07, "loss": 0.0169, "step": 9181 }, { "epoch": 2.462322338428533, "grad_norm": 0.2708626835498375, "learning_rate": 9.470429299968487e-07, "loss": 0.0166, "step": 9182 }, { "epoch": 2.4625905068382945, "grad_norm": 0.25508976425393765, "learning_rate": 9.46129462869802e-07, "loss": 0.0142, "step": 9183 }, { "epoch": 2.462858675248056, "grad_norm": 0.23599914868631805, "learning_rate": 9.452163904573658e-07, "loss": 0.0198, "step": 9184 }, { "epoch": 2.4631268436578173, "grad_norm": 0.25178989538739915, "learning_rate": 9.443037128484456e-07, "loss": 0.0107, "step": 9185 }, { "epoch": 2.4633950120675783, "grad_norm": 0.23814964649319523, "learning_rate": 9.433914301319047e-07, "loss": 0.0183, "step": 9186 }, { "epoch": 2.4636631804773397, "grad_norm": 0.24601129678752232, "learning_rate": 9.424795423965694e-07, "loss": 0.0145, "step": 9187 }, { "epoch": 2.463931348887101, "grad_norm": 0.28716767317147235, "learning_rate": 9.415680497312286e-07, "loss": 0.0167, "step": 9188 }, { "epoch": 2.4641995172968625, "grad_norm": 0.2866392488354706, "learning_rate": 9.406569522246328e-07, "loss": 0.018, "step": 9189 }, { "epoch": 2.464467685706624, "grad_norm": 0.197146445862933, "learning_rate": 9.397462499654907e-07, "loss": 0.0091, "step": 9190 }, { "epoch": 2.464735854116385, "grad_norm": 0.26896943471059503, "learning_rate": 9.388359430424781e-07, "loss": 0.011, "step": 9191 }, { "epoch": 2.4650040225261463, "grad_norm": 0.24824809911805282, "learning_rate": 9.379260315442262e-07, "loss": 0.0121, "step": 9192 }, { "epoch": 2.4652721909359077, "grad_norm": 0.21176101340558656, "learning_rate": 9.370165155593336e-07, "loss": 0.0104, "step": 9193 }, { "epoch": 2.465540359345669, "grad_norm": 0.2340141063259027, "learning_rate": 9.361073951763544e-07, "loss": 0.0146, "step": 9194 }, { "epoch": 2.4658085277554305, "grad_norm": 0.25249987942683655, "learning_rate": 9.351986704838084e-07, "loss": 0.015, "step": 9195 }, { "epoch": 2.466076696165192, "grad_norm": 0.3391748678436711, "learning_rate": 9.342903415701776e-07, "loss": 0.0311, "step": 9196 }, { "epoch": 2.4663448645749533, "grad_norm": 0.24056745241319946, "learning_rate": 9.333824085239018e-07, "loss": 0.0108, "step": 9197 }, { "epoch": 2.4666130329847142, "grad_norm": 0.2807332882703198, "learning_rate": 9.324748714333843e-07, "loss": 0.0177, "step": 9198 }, { "epoch": 2.4668812013944756, "grad_norm": 0.28351082786320636, "learning_rate": 9.315677303869885e-07, "loss": 0.0229, "step": 9199 }, { "epoch": 2.467149369804237, "grad_norm": 0.24293759022917852, "learning_rate": 9.306609854730409e-07, "loss": 0.0157, "step": 9200 }, { "epoch": 2.4674175382139985, "grad_norm": 0.20421934596844485, "learning_rate": 9.29754636779831e-07, "loss": 0.0142, "step": 9201 }, { "epoch": 2.46768570662376, "grad_norm": 0.20201085911995098, "learning_rate": 9.288486843956041e-07, "loss": 0.0127, "step": 9202 }, { "epoch": 2.467953875033521, "grad_norm": 0.24016654529758918, "learning_rate": 9.279431284085739e-07, "loss": 0.011, "step": 9203 }, { "epoch": 2.468222043443282, "grad_norm": 0.3278515204587565, "learning_rate": 9.270379689069081e-07, "loss": 0.014, "step": 9204 }, { "epoch": 2.4684902118530436, "grad_norm": 0.5102940585112304, "learning_rate": 9.261332059787425e-07, "loss": 0.0214, "step": 9205 }, { "epoch": 2.468758380262805, "grad_norm": 0.3695663672414119, "learning_rate": 9.252288397121717e-07, "loss": 0.0305, "step": 9206 }, { "epoch": 2.4690265486725664, "grad_norm": 0.21050008625065145, "learning_rate": 9.243248701952489e-07, "loss": 0.0107, "step": 9207 }, { "epoch": 2.469294717082328, "grad_norm": 0.23246668435878182, "learning_rate": 9.234212975159946e-07, "loss": 0.0134, "step": 9208 }, { "epoch": 2.4695628854920892, "grad_norm": 0.22608056272490695, "learning_rate": 9.225181217623852e-07, "loss": 0.0099, "step": 9209 }, { "epoch": 2.46983105390185, "grad_norm": 0.2893370296497248, "learning_rate": 9.216153430223595e-07, "loss": 0.0124, "step": 9210 }, { "epoch": 2.4700992223116116, "grad_norm": 0.1873890362573931, "learning_rate": 9.207129613838212e-07, "loss": 0.0098, "step": 9211 }, { "epoch": 2.470367390721373, "grad_norm": 0.24998210174784793, "learning_rate": 9.198109769346303e-07, "loss": 0.0113, "step": 9212 }, { "epoch": 2.4706355591311344, "grad_norm": 0.3439947767713896, "learning_rate": 9.189093897626134e-07, "loss": 0.013, "step": 9213 }, { "epoch": 2.470903727540896, "grad_norm": 0.21664700900986422, "learning_rate": 9.180081999555523e-07, "loss": 0.0115, "step": 9214 }, { "epoch": 2.471171895950657, "grad_norm": 0.23912641517332364, "learning_rate": 9.171074076011971e-07, "loss": 0.0096, "step": 9215 }, { "epoch": 2.471440064360418, "grad_norm": 0.32880356919331805, "learning_rate": 9.162070127872525e-07, "loss": 0.0236, "step": 9216 }, { "epoch": 2.4717082327701796, "grad_norm": 0.34252432156570595, "learning_rate": 9.153070156013883e-07, "loss": 0.0184, "step": 9217 }, { "epoch": 2.471976401179941, "grad_norm": 0.22622283317221914, "learning_rate": 9.144074161312372e-07, "loss": 0.0136, "step": 9218 }, { "epoch": 2.4722445695897024, "grad_norm": 0.24698397580815412, "learning_rate": 9.135082144643869e-07, "loss": 0.0114, "step": 9219 }, { "epoch": 2.472512737999464, "grad_norm": 0.25438089315939016, "learning_rate": 9.126094106883943e-07, "loss": 0.0118, "step": 9220 }, { "epoch": 2.472780906409225, "grad_norm": 0.24273703591156764, "learning_rate": 9.117110048907713e-07, "loss": 0.0154, "step": 9221 }, { "epoch": 2.473049074818986, "grad_norm": 0.327189171879408, "learning_rate": 9.108129971589924e-07, "loss": 0.0138, "step": 9222 }, { "epoch": 2.4733172432287476, "grad_norm": 0.2516938584466438, "learning_rate": 9.099153875804966e-07, "loss": 0.0167, "step": 9223 }, { "epoch": 2.473585411638509, "grad_norm": 0.16446938910255401, "learning_rate": 9.090181762426791e-07, "loss": 0.0074, "step": 9224 }, { "epoch": 2.4738535800482704, "grad_norm": 0.4072454825073082, "learning_rate": 9.081213632329022e-07, "loss": 0.0163, "step": 9225 }, { "epoch": 2.474121748458032, "grad_norm": 0.2856723753127558, "learning_rate": 9.072249486384827e-07, "loss": 0.0161, "step": 9226 }, { "epoch": 2.4743899168677927, "grad_norm": 0.2083062180906677, "learning_rate": 9.06328932546704e-07, "loss": 0.0092, "step": 9227 }, { "epoch": 2.474658085277554, "grad_norm": 0.20889370099236584, "learning_rate": 9.054333150448097e-07, "loss": 0.0115, "step": 9228 }, { "epoch": 2.4749262536873156, "grad_norm": 0.25967862634241773, "learning_rate": 9.045380962200018e-07, "loss": 0.012, "step": 9229 }, { "epoch": 2.475194422097077, "grad_norm": 0.19075519532576385, "learning_rate": 9.036432761594472e-07, "loss": 0.0129, "step": 9230 }, { "epoch": 2.4754625905068384, "grad_norm": 0.22766674300648002, "learning_rate": 9.0274885495027e-07, "loss": 0.0122, "step": 9231 }, { "epoch": 2.4757307589165998, "grad_norm": 0.2341301723430741, "learning_rate": 9.018548326795601e-07, "loss": 0.0159, "step": 9232 }, { "epoch": 2.475998927326361, "grad_norm": 0.23740926127419162, "learning_rate": 9.009612094343651e-07, "loss": 0.0134, "step": 9233 }, { "epoch": 2.476267095736122, "grad_norm": 0.2599515312830257, "learning_rate": 9.000679853016925e-07, "loss": 0.0158, "step": 9234 }, { "epoch": 2.4765352641458835, "grad_norm": 0.32379834884783953, "learning_rate": 8.991751603685167e-07, "loss": 0.0155, "step": 9235 }, { "epoch": 2.476803432555645, "grad_norm": 0.2880043763263693, "learning_rate": 8.982827347217665e-07, "loss": 0.0145, "step": 9236 }, { "epoch": 2.4770716009654064, "grad_norm": 0.3659978534304251, "learning_rate": 8.973907084483369e-07, "loss": 0.021, "step": 9237 }, { "epoch": 2.4773397693751678, "grad_norm": 0.3387240982597907, "learning_rate": 8.964990816350827e-07, "loss": 0.0193, "step": 9238 }, { "epoch": 2.4776079377849287, "grad_norm": 0.3092535010534687, "learning_rate": 8.956078543688174e-07, "loss": 0.0189, "step": 9239 }, { "epoch": 2.47787610619469, "grad_norm": 0.30021676895668187, "learning_rate": 8.947170267363197e-07, "loss": 0.0141, "step": 9240 }, { "epoch": 2.4781442746044515, "grad_norm": 0.24434914794978602, "learning_rate": 8.938265988243245e-07, "loss": 0.0118, "step": 9241 }, { "epoch": 2.478412443014213, "grad_norm": 0.2696306055270258, "learning_rate": 8.929365707195325e-07, "loss": 0.0149, "step": 9242 }, { "epoch": 2.4786806114239743, "grad_norm": 0.29524018147253095, "learning_rate": 8.92046942508602e-07, "loss": 0.016, "step": 9243 }, { "epoch": 2.4789487798337357, "grad_norm": 0.24036427029742488, "learning_rate": 8.911577142781536e-07, "loss": 0.0108, "step": 9244 }, { "epoch": 2.479216948243497, "grad_norm": 0.24997744630048055, "learning_rate": 8.902688861147729e-07, "loss": 0.0116, "step": 9245 }, { "epoch": 2.479485116653258, "grad_norm": 0.2661673124839509, "learning_rate": 8.89380458104997e-07, "loss": 0.0129, "step": 9246 }, { "epoch": 2.4797532850630195, "grad_norm": 0.22783306806154455, "learning_rate": 8.884924303353321e-07, "loss": 0.0131, "step": 9247 }, { "epoch": 2.480021453472781, "grad_norm": 0.25749362446005863, "learning_rate": 8.876048028922446e-07, "loss": 0.0147, "step": 9248 }, { "epoch": 2.4802896218825423, "grad_norm": 0.270717435769717, "learning_rate": 8.867175758621582e-07, "loss": 0.0132, "step": 9249 }, { "epoch": 2.4805577902923037, "grad_norm": 0.2869659336814672, "learning_rate": 8.858307493314622e-07, "loss": 0.0185, "step": 9250 }, { "epoch": 2.4808259587020647, "grad_norm": 0.24640016829378278, "learning_rate": 8.849443233865018e-07, "loss": 0.0093, "step": 9251 }, { "epoch": 2.481094127111826, "grad_norm": 0.22487455978537646, "learning_rate": 8.840582981135892e-07, "loss": 0.0119, "step": 9252 }, { "epoch": 2.4813622955215875, "grad_norm": 0.27228414469981654, "learning_rate": 8.831726735989909e-07, "loss": 0.0144, "step": 9253 }, { "epoch": 2.481630463931349, "grad_norm": 0.21755255587860597, "learning_rate": 8.822874499289391e-07, "loss": 0.0114, "step": 9254 }, { "epoch": 2.4818986323411103, "grad_norm": 0.4066055997903778, "learning_rate": 8.814026271896275e-07, "loss": 0.016, "step": 9255 }, { "epoch": 2.4821668007508717, "grad_norm": 0.24698757050547154, "learning_rate": 8.805182054672063e-07, "loss": 0.0142, "step": 9256 }, { "epoch": 2.482434969160633, "grad_norm": 0.3579199690811656, "learning_rate": 8.796341848477913e-07, "loss": 0.0218, "step": 9257 }, { "epoch": 2.482703137570394, "grad_norm": 0.31044684013343465, "learning_rate": 8.787505654174566e-07, "loss": 0.0124, "step": 9258 }, { "epoch": 2.4829713059801555, "grad_norm": 0.3383822012926626, "learning_rate": 8.778673472622362e-07, "loss": 0.0097, "step": 9259 }, { "epoch": 2.483239474389917, "grad_norm": 0.36223422557075685, "learning_rate": 8.769845304681296e-07, "loss": 0.0174, "step": 9260 }, { "epoch": 2.4835076427996783, "grad_norm": 0.25985621401960435, "learning_rate": 8.761021151210918e-07, "loss": 0.0145, "step": 9261 }, { "epoch": 2.4837758112094397, "grad_norm": 0.28789622563206746, "learning_rate": 8.752201013070433e-07, "loss": 0.0226, "step": 9262 }, { "epoch": 2.4840439796192006, "grad_norm": 0.2135333928111125, "learning_rate": 8.743384891118612e-07, "loss": 0.0112, "step": 9263 }, { "epoch": 2.484312148028962, "grad_norm": 0.2836690329154222, "learning_rate": 8.734572786213869e-07, "loss": 0.0179, "step": 9264 }, { "epoch": 2.4845803164387235, "grad_norm": 0.19701853884599754, "learning_rate": 8.725764699214234e-07, "loss": 0.0116, "step": 9265 }, { "epoch": 2.484848484848485, "grad_norm": 0.21933328660018325, "learning_rate": 8.716960630977289e-07, "loss": 0.0155, "step": 9266 }, { "epoch": 2.4851166532582463, "grad_norm": 0.2479011197687232, "learning_rate": 8.708160582360303e-07, "loss": 0.0134, "step": 9267 }, { "epoch": 2.4853848216680077, "grad_norm": 0.2787807893459746, "learning_rate": 8.699364554220074e-07, "loss": 0.0169, "step": 9268 }, { "epoch": 2.485652990077769, "grad_norm": 0.28692719094539865, "learning_rate": 8.690572547413084e-07, "loss": 0.0226, "step": 9269 }, { "epoch": 2.48592115848753, "grad_norm": 0.22605099456488412, "learning_rate": 8.68178456279537e-07, "loss": 0.0122, "step": 9270 }, { "epoch": 2.4861893268972914, "grad_norm": 0.24256574286868346, "learning_rate": 8.673000601222581e-07, "loss": 0.0169, "step": 9271 }, { "epoch": 2.486457495307053, "grad_norm": 0.1973340432413498, "learning_rate": 8.664220663550022e-07, "loss": 0.0096, "step": 9272 }, { "epoch": 2.4867256637168142, "grad_norm": 0.1851371195999561, "learning_rate": 8.65544475063253e-07, "loss": 0.009, "step": 9273 }, { "epoch": 2.4869938321265757, "grad_norm": 0.31312061246284895, "learning_rate": 8.646672863324623e-07, "loss": 0.0196, "step": 9274 }, { "epoch": 2.4872620005363366, "grad_norm": 0.3134208618134148, "learning_rate": 8.637905002480396e-07, "loss": 0.0127, "step": 9275 }, { "epoch": 2.487530168946098, "grad_norm": 0.24500803620328263, "learning_rate": 8.629141168953531e-07, "loss": 0.0117, "step": 9276 }, { "epoch": 2.4877983373558594, "grad_norm": 0.26815995287821287, "learning_rate": 8.620381363597369e-07, "loss": 0.0169, "step": 9277 }, { "epoch": 2.488066505765621, "grad_norm": 0.21817183086192776, "learning_rate": 8.611625587264799e-07, "loss": 0.0113, "step": 9278 }, { "epoch": 2.4883346741753822, "grad_norm": 0.20983831249169235, "learning_rate": 8.602873840808379e-07, "loss": 0.0138, "step": 9279 }, { "epoch": 2.4886028425851436, "grad_norm": 0.17894457823455856, "learning_rate": 8.594126125080204e-07, "loss": 0.0118, "step": 9280 }, { "epoch": 2.488871010994905, "grad_norm": 0.24750343311940268, "learning_rate": 8.585382440932044e-07, "loss": 0.011, "step": 9281 }, { "epoch": 2.489139179404666, "grad_norm": 0.20045463358672191, "learning_rate": 8.576642789215267e-07, "loss": 0.013, "step": 9282 }, { "epoch": 2.4894073478144274, "grad_norm": 0.28351680342072455, "learning_rate": 8.567907170780776e-07, "loss": 0.012, "step": 9283 }, { "epoch": 2.489675516224189, "grad_norm": 0.5307186204540704, "learning_rate": 8.559175586479168e-07, "loss": 0.0222, "step": 9284 }, { "epoch": 2.48994368463395, "grad_norm": 0.26590827075639856, "learning_rate": 8.550448037160619e-07, "loss": 0.0165, "step": 9285 }, { "epoch": 2.4902118530437116, "grad_norm": 0.1769189717905138, "learning_rate": 8.541724523674888e-07, "loss": 0.0089, "step": 9286 }, { "epoch": 2.4904800214534726, "grad_norm": 0.21917642471272678, "learning_rate": 8.533005046871384e-07, "loss": 0.0079, "step": 9287 }, { "epoch": 2.490748189863234, "grad_norm": 0.20868106367121747, "learning_rate": 8.524289607599068e-07, "loss": 0.0133, "step": 9288 }, { "epoch": 2.4910163582729954, "grad_norm": 0.23126503666288667, "learning_rate": 8.515578206706576e-07, "loss": 0.0063, "step": 9289 }, { "epoch": 2.491284526682757, "grad_norm": 0.2236184936481097, "learning_rate": 8.506870845042081e-07, "loss": 0.0135, "step": 9290 }, { "epoch": 2.491552695092518, "grad_norm": 0.18326375751710228, "learning_rate": 8.498167523453404e-07, "loss": 0.0115, "step": 9291 }, { "epoch": 2.4918208635022796, "grad_norm": 0.23731584688930557, "learning_rate": 8.48946824278799e-07, "loss": 0.0112, "step": 9292 }, { "epoch": 2.492089031912041, "grad_norm": 0.3549126874754087, "learning_rate": 8.480773003892828e-07, "loss": 0.0171, "step": 9293 }, { "epoch": 2.492357200321802, "grad_norm": 0.47497774591491004, "learning_rate": 8.47208180761459e-07, "loss": 0.0168, "step": 9294 }, { "epoch": 2.4926253687315634, "grad_norm": 0.17641447437470914, "learning_rate": 8.463394654799472e-07, "loss": 0.0075, "step": 9295 }, { "epoch": 2.4928935371413248, "grad_norm": 0.22724606909252956, "learning_rate": 8.454711546293337e-07, "loss": 0.0126, "step": 9296 }, { "epoch": 2.493161705551086, "grad_norm": 0.2295144701976218, "learning_rate": 8.446032482941652e-07, "loss": 0.0121, "step": 9297 }, { "epoch": 2.4934298739608476, "grad_norm": 0.2568841451754485, "learning_rate": 8.437357465589446e-07, "loss": 0.0135, "step": 9298 }, { "epoch": 2.4936980423706085, "grad_norm": 0.21031944016170767, "learning_rate": 8.428686495081412e-07, "loss": 0.0107, "step": 9299 }, { "epoch": 2.49396621078037, "grad_norm": 0.24915948410325814, "learning_rate": 8.420019572261789e-07, "loss": 0.0145, "step": 9300 }, { "epoch": 2.4942343791901314, "grad_norm": 0.24648888331787241, "learning_rate": 8.411356697974471e-07, "loss": 0.0132, "step": 9301 }, { "epoch": 2.4945025475998928, "grad_norm": 0.22145390383905986, "learning_rate": 8.402697873062942e-07, "loss": 0.0111, "step": 9302 }, { "epoch": 2.494770716009654, "grad_norm": 0.22356017805573006, "learning_rate": 8.394043098370275e-07, "loss": 0.0124, "step": 9303 }, { "epoch": 2.4950388844194156, "grad_norm": 0.4012839689174112, "learning_rate": 8.385392374739181e-07, "loss": 0.0164, "step": 9304 }, { "epoch": 2.4953070528291765, "grad_norm": 0.2434705140636726, "learning_rate": 8.376745703011935e-07, "loss": 0.0117, "step": 9305 }, { "epoch": 2.495575221238938, "grad_norm": 0.26533543321722064, "learning_rate": 8.36810308403046e-07, "loss": 0.0106, "step": 9306 }, { "epoch": 2.4958433896486993, "grad_norm": 0.23742776434874427, "learning_rate": 8.359464518636262e-07, "loss": 0.0116, "step": 9307 }, { "epoch": 2.4961115580584607, "grad_norm": 0.21951364812744953, "learning_rate": 8.350830007670435e-07, "loss": 0.0137, "step": 9308 }, { "epoch": 2.496379726468222, "grad_norm": 0.2317982391694231, "learning_rate": 8.342199551973728e-07, "loss": 0.0103, "step": 9309 }, { "epoch": 2.4966478948779836, "grad_norm": 0.20102024595668322, "learning_rate": 8.333573152386442e-07, "loss": 0.0086, "step": 9310 }, { "epoch": 2.4969160632877445, "grad_norm": 0.23566939560104486, "learning_rate": 8.324950809748528e-07, "loss": 0.0098, "step": 9311 }, { "epoch": 2.497184231697506, "grad_norm": 0.27893858250160447, "learning_rate": 8.31633252489949e-07, "loss": 0.0107, "step": 9312 }, { "epoch": 2.4974524001072673, "grad_norm": 0.36002322684128896, "learning_rate": 8.307718298678491e-07, "loss": 0.0172, "step": 9313 }, { "epoch": 2.4977205685170287, "grad_norm": 0.2926912626210707, "learning_rate": 8.299108131924282e-07, "loss": 0.014, "step": 9314 }, { "epoch": 2.49798873692679, "grad_norm": 0.26282003915860813, "learning_rate": 8.290502025475183e-07, "loss": 0.0126, "step": 9315 }, { "epoch": 2.4982569053365515, "grad_norm": 0.29328890442854455, "learning_rate": 8.281899980169183e-07, "loss": 0.0124, "step": 9316 }, { "epoch": 2.4985250737463125, "grad_norm": 0.2571295982446838, "learning_rate": 8.273301996843813e-07, "loss": 0.0112, "step": 9317 }, { "epoch": 2.498793242156074, "grad_norm": 0.2600465160491636, "learning_rate": 8.264708076336254e-07, "loss": 0.0162, "step": 9318 }, { "epoch": 2.4990614105658353, "grad_norm": 0.2609114514398319, "learning_rate": 8.256118219483261e-07, "loss": 0.0152, "step": 9319 }, { "epoch": 2.4993295789755967, "grad_norm": 0.26227824765205804, "learning_rate": 8.247532427121202e-07, "loss": 0.0132, "step": 9320 }, { "epoch": 2.499597747385358, "grad_norm": 0.190664796141924, "learning_rate": 8.238950700086068e-07, "loss": 0.0087, "step": 9321 }, { "epoch": 2.4998659157951195, "grad_norm": 0.20752542479626115, "learning_rate": 8.230373039213424e-07, "loss": 0.0135, "step": 9322 }, { "epoch": 2.5001340842048805, "grad_norm": 0.2609615414901576, "learning_rate": 8.221799445338457e-07, "loss": 0.0148, "step": 9323 }, { "epoch": 2.500402252614642, "grad_norm": 0.31585381704870674, "learning_rate": 8.213229919295973e-07, "loss": 0.0151, "step": 9324 }, { "epoch": 2.5006704210244033, "grad_norm": 0.19384376235332715, "learning_rate": 8.204664461920337e-07, "loss": 0.0098, "step": 9325 }, { "epoch": 2.5009385894341647, "grad_norm": 0.3019653416114766, "learning_rate": 8.19610307404557e-07, "loss": 0.018, "step": 9326 }, { "epoch": 2.501206757843926, "grad_norm": 0.24584219875330476, "learning_rate": 8.187545756505244e-07, "loss": 0.0125, "step": 9327 }, { "epoch": 2.501474926253687, "grad_norm": 0.32739421611320624, "learning_rate": 8.178992510132594e-07, "loss": 0.0211, "step": 9328 }, { "epoch": 2.501743094663449, "grad_norm": 0.3184702794939725, "learning_rate": 8.170443335760397e-07, "loss": 0.0147, "step": 9329 }, { "epoch": 2.50201126307321, "grad_norm": 0.3402417217515435, "learning_rate": 8.161898234221094e-07, "loss": 0.016, "step": 9330 }, { "epoch": 2.5022794314829713, "grad_norm": 0.2031215416719073, "learning_rate": 8.153357206346674e-07, "loss": 0.0113, "step": 9331 }, { "epoch": 2.5025475998927327, "grad_norm": 0.25098818373195, "learning_rate": 8.144820252968755e-07, "loss": 0.0116, "step": 9332 }, { "epoch": 2.502815768302494, "grad_norm": 0.31156939742557493, "learning_rate": 8.136287374918561e-07, "loss": 0.0133, "step": 9333 }, { "epoch": 2.5030839367122555, "grad_norm": 0.2578483384234698, "learning_rate": 8.12775857302694e-07, "loss": 0.0112, "step": 9334 }, { "epoch": 2.5033521051220164, "grad_norm": 0.3211914835844652, "learning_rate": 8.119233848124274e-07, "loss": 0.0116, "step": 9335 }, { "epoch": 2.503620273531778, "grad_norm": 0.32893121473117226, "learning_rate": 8.110713201040637e-07, "loss": 0.0223, "step": 9336 }, { "epoch": 2.5038884419415393, "grad_norm": 0.23734762545189533, "learning_rate": 8.10219663260563e-07, "loss": 0.0146, "step": 9337 }, { "epoch": 2.5041566103513007, "grad_norm": 0.2653282914067344, "learning_rate": 8.093684143648506e-07, "loss": 0.0138, "step": 9338 }, { "epoch": 2.504424778761062, "grad_norm": 0.20349547032176826, "learning_rate": 8.085175734998091e-07, "loss": 0.0094, "step": 9339 }, { "epoch": 2.504692947170823, "grad_norm": 0.17062330220112448, "learning_rate": 8.076671407482833e-07, "loss": 0.0082, "step": 9340 }, { "epoch": 2.504961115580585, "grad_norm": 0.2553387714102383, "learning_rate": 8.068171161930788e-07, "loss": 0.0134, "step": 9341 }, { "epoch": 2.505229283990346, "grad_norm": 0.23201569039276287, "learning_rate": 8.059674999169586e-07, "loss": 0.0146, "step": 9342 }, { "epoch": 2.5054974524001072, "grad_norm": 0.3484873723411901, "learning_rate": 8.051182920026474e-07, "loss": 0.0122, "step": 9343 }, { "epoch": 2.5057656208098686, "grad_norm": 0.23480805073852964, "learning_rate": 8.042694925328315e-07, "loss": 0.0112, "step": 9344 }, { "epoch": 2.50603378921963, "grad_norm": 0.2893715867129857, "learning_rate": 8.03421101590155e-07, "loss": 0.0152, "step": 9345 }, { "epoch": 2.5063019576293915, "grad_norm": 0.2644184628217205, "learning_rate": 8.02573119257225e-07, "loss": 0.0178, "step": 9346 }, { "epoch": 2.5065701260391524, "grad_norm": 0.258811159761651, "learning_rate": 8.01725545616605e-07, "loss": 0.0136, "step": 9347 }, { "epoch": 2.506838294448914, "grad_norm": 0.2776946895700495, "learning_rate": 8.008783807508241e-07, "loss": 0.0117, "step": 9348 }, { "epoch": 2.5071064628586752, "grad_norm": 0.19193524629280395, "learning_rate": 8.000316247423656e-07, "loss": 0.0126, "step": 9349 }, { "epoch": 2.5073746312684366, "grad_norm": 0.21740318014120313, "learning_rate": 7.991852776736769e-07, "loss": 0.0122, "step": 9350 }, { "epoch": 2.507642799678198, "grad_norm": 0.25071331519248574, "learning_rate": 7.98339339627166e-07, "loss": 0.0182, "step": 9351 }, { "epoch": 2.507910968087959, "grad_norm": 0.25425966438531805, "learning_rate": 7.97493810685197e-07, "loss": 0.0167, "step": 9352 }, { "epoch": 2.508179136497721, "grad_norm": 0.26998562688679706, "learning_rate": 7.966486909300997e-07, "loss": 0.0135, "step": 9353 }, { "epoch": 2.508447304907482, "grad_norm": 0.20898533458711333, "learning_rate": 7.958039804441597e-07, "loss": 0.0125, "step": 9354 }, { "epoch": 2.508715473317243, "grad_norm": 0.33041257815675745, "learning_rate": 7.949596793096226e-07, "loss": 0.0252, "step": 9355 }, { "epoch": 2.5089836417270046, "grad_norm": 0.28926202113459537, "learning_rate": 7.941157876086991e-07, "loss": 0.0161, "step": 9356 }, { "epoch": 2.509251810136766, "grad_norm": 0.23697618016860766, "learning_rate": 7.932723054235536e-07, "loss": 0.014, "step": 9357 }, { "epoch": 2.5095199785465274, "grad_norm": 0.3353980650390598, "learning_rate": 7.924292328363159e-07, "loss": 0.0167, "step": 9358 }, { "epoch": 2.5097881469562884, "grad_norm": 0.2148554768827169, "learning_rate": 7.915865699290721e-07, "loss": 0.0106, "step": 9359 }, { "epoch": 2.51005631536605, "grad_norm": 0.3206924056062531, "learning_rate": 7.907443167838708e-07, "loss": 0.0154, "step": 9360 }, { "epoch": 2.510324483775811, "grad_norm": 0.25481679779760774, "learning_rate": 7.89902473482721e-07, "loss": 0.0126, "step": 9361 }, { "epoch": 2.5105926521855726, "grad_norm": 0.25797670236067866, "learning_rate": 7.890610401075888e-07, "loss": 0.014, "step": 9362 }, { "epoch": 2.510860820595334, "grad_norm": 0.27637027720668916, "learning_rate": 7.882200167404047e-07, "loss": 0.0125, "step": 9363 }, { "epoch": 2.511128989005095, "grad_norm": 0.2440645985470306, "learning_rate": 7.873794034630539e-07, "loss": 0.0143, "step": 9364 }, { "epoch": 2.511397157414857, "grad_norm": 0.23538576769010774, "learning_rate": 7.865392003573874e-07, "loss": 0.0119, "step": 9365 }, { "epoch": 2.5116653258246178, "grad_norm": 0.27382243909041265, "learning_rate": 7.856994075052127e-07, "loss": 0.0188, "step": 9366 }, { "epoch": 2.511933494234379, "grad_norm": 0.28099861951984345, "learning_rate": 7.84860024988297e-07, "loss": 0.0116, "step": 9367 }, { "epoch": 2.5122016626441406, "grad_norm": 0.2560651309373312, "learning_rate": 7.840210528883708e-07, "loss": 0.0131, "step": 9368 }, { "epoch": 2.512469831053902, "grad_norm": 0.27970997941371256, "learning_rate": 7.8318249128712e-07, "loss": 0.0167, "step": 9369 }, { "epoch": 2.5127379994636634, "grad_norm": 0.31692455915245166, "learning_rate": 7.823443402661945e-07, "loss": 0.0165, "step": 9370 }, { "epoch": 2.5130061678734243, "grad_norm": 0.27795471323836, "learning_rate": 7.815065999072041e-07, "loss": 0.0132, "step": 9371 }, { "epoch": 2.5132743362831858, "grad_norm": 0.284347118224109, "learning_rate": 7.806692702917151e-07, "loss": 0.015, "step": 9372 }, { "epoch": 2.513542504692947, "grad_norm": 0.2949177890169353, "learning_rate": 7.798323515012585e-07, "loss": 0.0154, "step": 9373 }, { "epoch": 2.5138106731027086, "grad_norm": 0.23814751524051783, "learning_rate": 7.789958436173206e-07, "loss": 0.0129, "step": 9374 }, { "epoch": 2.51407884151247, "grad_norm": 0.20623561549637223, "learning_rate": 7.781597467213514e-07, "loss": 0.0101, "step": 9375 }, { "epoch": 2.514347009922231, "grad_norm": 0.3604045484507002, "learning_rate": 7.773240608947579e-07, "loss": 0.0196, "step": 9376 }, { "epoch": 2.5146151783319923, "grad_norm": 0.1923246582803911, "learning_rate": 7.764887862189097e-07, "loss": 0.0131, "step": 9377 }, { "epoch": 2.5148833467417537, "grad_norm": 0.24649372823669893, "learning_rate": 7.756539227751375e-07, "loss": 0.0101, "step": 9378 }, { "epoch": 2.515151515151515, "grad_norm": 0.26552433923361396, "learning_rate": 7.748194706447254e-07, "loss": 0.0128, "step": 9379 }, { "epoch": 2.5154196835612765, "grad_norm": 0.2718384037056347, "learning_rate": 7.739854299089233e-07, "loss": 0.0145, "step": 9380 }, { "epoch": 2.515687851971038, "grad_norm": 0.21685449209594948, "learning_rate": 7.731518006489414e-07, "loss": 0.012, "step": 9381 }, { "epoch": 2.5159560203807994, "grad_norm": 0.24099970576014063, "learning_rate": 7.723185829459456e-07, "loss": 0.0091, "step": 9382 }, { "epoch": 2.5162241887905603, "grad_norm": 0.4328528253769127, "learning_rate": 7.714857768810663e-07, "loss": 0.0169, "step": 9383 }, { "epoch": 2.5164923572003217, "grad_norm": 0.3075128121775694, "learning_rate": 7.706533825353891e-07, "loss": 0.0178, "step": 9384 }, { "epoch": 2.516760525610083, "grad_norm": 0.1879235576849328, "learning_rate": 7.698213999899645e-07, "loss": 0.0087, "step": 9385 }, { "epoch": 2.5170286940198445, "grad_norm": 0.31382938300899316, "learning_rate": 7.68989829325798e-07, "loss": 0.0237, "step": 9386 }, { "epoch": 2.517296862429606, "grad_norm": 0.2753799636498773, "learning_rate": 7.681586706238586e-07, "loss": 0.0139, "step": 9387 }, { "epoch": 2.517565030839367, "grad_norm": 0.2468139398245865, "learning_rate": 7.673279239650755e-07, "loss": 0.0145, "step": 9388 }, { "epoch": 2.5178331992491283, "grad_norm": 0.2799748470692375, "learning_rate": 7.66497589430334e-07, "loss": 0.0147, "step": 9389 }, { "epoch": 2.5181013676588897, "grad_norm": 0.22264963965353896, "learning_rate": 7.656676671004832e-07, "loss": 0.0137, "step": 9390 }, { "epoch": 2.518369536068651, "grad_norm": 0.26735823741414494, "learning_rate": 7.648381570563301e-07, "loss": 0.0149, "step": 9391 }, { "epoch": 2.5186377044784125, "grad_norm": 0.26516554334318954, "learning_rate": 7.640090593786398e-07, "loss": 0.0084, "step": 9392 }, { "epoch": 2.518905872888174, "grad_norm": 0.23098576365259924, "learning_rate": 7.631803741481425e-07, "loss": 0.0126, "step": 9393 }, { "epoch": 2.5191740412979353, "grad_norm": 0.28602044558464945, "learning_rate": 7.623521014455226e-07, "loss": 0.0198, "step": 9394 }, { "epoch": 2.5194422097076963, "grad_norm": 0.26324239958421525, "learning_rate": 7.61524241351429e-07, "loss": 0.0139, "step": 9395 }, { "epoch": 2.5197103781174577, "grad_norm": 0.23311793363772013, "learning_rate": 7.60696793946466e-07, "loss": 0.0133, "step": 9396 }, { "epoch": 2.519978546527219, "grad_norm": 0.21656357026413522, "learning_rate": 7.598697593112009e-07, "loss": 0.0113, "step": 9397 }, { "epoch": 2.5202467149369805, "grad_norm": 0.2075177922573687, "learning_rate": 7.590431375261615e-07, "loss": 0.01, "step": 9398 }, { "epoch": 2.520514883346742, "grad_norm": 0.21962244917120785, "learning_rate": 7.582169286718305e-07, "loss": 0.0119, "step": 9399 }, { "epoch": 2.520783051756503, "grad_norm": 0.28319058939515795, "learning_rate": 7.573911328286571e-07, "loss": 0.0231, "step": 9400 }, { "epoch": 2.5210512201662643, "grad_norm": 0.30751509893707457, "learning_rate": 7.565657500770435e-07, "loss": 0.0134, "step": 9401 }, { "epoch": 2.5213193885760257, "grad_norm": 0.1973453962088212, "learning_rate": 7.557407804973582e-07, "loss": 0.01, "step": 9402 }, { "epoch": 2.521587556985787, "grad_norm": 0.15641184525651589, "learning_rate": 7.549162241699248e-07, "loss": 0.0065, "step": 9403 }, { "epoch": 2.5218557253955485, "grad_norm": 0.16853363977514846, "learning_rate": 7.540920811750268e-07, "loss": 0.0088, "step": 9404 }, { "epoch": 2.52212389380531, "grad_norm": 0.31081775012674273, "learning_rate": 7.532683515929118e-07, "loss": 0.0179, "step": 9405 }, { "epoch": 2.5223920622150713, "grad_norm": 0.24935578779087167, "learning_rate": 7.52445035503781e-07, "loss": 0.0136, "step": 9406 }, { "epoch": 2.5226602306248322, "grad_norm": 0.24258004924281448, "learning_rate": 7.516221329878015e-07, "loss": 0.01, "step": 9407 }, { "epoch": 2.5229283990345936, "grad_norm": 0.2616340551514378, "learning_rate": 7.507996441250942e-07, "loss": 0.0163, "step": 9408 }, { "epoch": 2.523196567444355, "grad_norm": 0.4975836370875494, "learning_rate": 7.499775689957445e-07, "loss": 0.0231, "step": 9409 }, { "epoch": 2.5234647358541165, "grad_norm": 0.2680221311585629, "learning_rate": 7.491559076797967e-07, "loss": 0.0142, "step": 9410 }, { "epoch": 2.523732904263878, "grad_norm": 0.250994177820671, "learning_rate": 7.483346602572505e-07, "loss": 0.0125, "step": 9411 }, { "epoch": 2.524001072673639, "grad_norm": 0.22794402855872178, "learning_rate": 7.475138268080723e-07, "loss": 0.0118, "step": 9412 }, { "epoch": 2.5242692410834002, "grad_norm": 0.2976494447402904, "learning_rate": 7.466934074121806e-07, "loss": 0.017, "step": 9413 }, { "epoch": 2.5245374094931616, "grad_norm": 0.22547765459773292, "learning_rate": 7.458734021494601e-07, "loss": 0.0162, "step": 9414 }, { "epoch": 2.524805577902923, "grad_norm": 0.2624689756441355, "learning_rate": 7.450538110997546e-07, "loss": 0.0155, "step": 9415 }, { "epoch": 2.5250737463126844, "grad_norm": 0.24479507072800955, "learning_rate": 7.442346343428603e-07, "loss": 0.014, "step": 9416 }, { "epoch": 2.525341914722446, "grad_norm": 0.2583250532445407, "learning_rate": 7.434158719585416e-07, "loss": 0.0128, "step": 9417 }, { "epoch": 2.5256100831322073, "grad_norm": 0.3927372753334839, "learning_rate": 7.425975240265176e-07, "loss": 0.0144, "step": 9418 }, { "epoch": 2.525878251541968, "grad_norm": 0.27421540196886646, "learning_rate": 7.417795906264696e-07, "loss": 0.0117, "step": 9419 }, { "epoch": 2.5261464199517296, "grad_norm": 0.26677716246507216, "learning_rate": 7.409620718380383e-07, "loss": 0.0145, "step": 9420 }, { "epoch": 2.526414588361491, "grad_norm": 0.2321151408605178, "learning_rate": 7.401449677408218e-07, "loss": 0.0132, "step": 9421 }, { "epoch": 2.5266827567712524, "grad_norm": 0.25042192965746063, "learning_rate": 7.393282784143807e-07, "loss": 0.011, "step": 9422 }, { "epoch": 2.526950925181014, "grad_norm": 0.22564207907228678, "learning_rate": 7.385120039382326e-07, "loss": 0.014, "step": 9423 }, { "epoch": 2.527219093590775, "grad_norm": 0.1910721005379831, "learning_rate": 7.376961443918579e-07, "loss": 0.0111, "step": 9424 }, { "epoch": 2.527487262000536, "grad_norm": 0.2335135646519822, "learning_rate": 7.368806998546918e-07, "loss": 0.0142, "step": 9425 }, { "epoch": 2.5277554304102976, "grad_norm": 0.21520466697185064, "learning_rate": 7.360656704061337e-07, "loss": 0.0117, "step": 9426 }, { "epoch": 2.528023598820059, "grad_norm": 0.23402909635821295, "learning_rate": 7.352510561255433e-07, "loss": 0.0116, "step": 9427 }, { "epoch": 2.5282917672298204, "grad_norm": 0.3832516148622252, "learning_rate": 7.344368570922322e-07, "loss": 0.0201, "step": 9428 }, { "epoch": 2.528559935639582, "grad_norm": 0.21102974373105857, "learning_rate": 7.336230733854799e-07, "loss": 0.0085, "step": 9429 }, { "epoch": 2.528828104049343, "grad_norm": 0.2529114956554202, "learning_rate": 7.328097050845228e-07, "loss": 0.0088, "step": 9430 }, { "epoch": 2.529096272459104, "grad_norm": 0.27659341176767804, "learning_rate": 7.319967522685545e-07, "loss": 0.0175, "step": 9431 }, { "epoch": 2.5293644408688656, "grad_norm": 0.23515939399462932, "learning_rate": 7.31184215016732e-07, "loss": 0.0198, "step": 9432 }, { "epoch": 2.529632609278627, "grad_norm": 0.23230097262032512, "learning_rate": 7.303720934081681e-07, "loss": 0.009, "step": 9433 }, { "epoch": 2.5299007776883884, "grad_norm": 0.26441923470861695, "learning_rate": 7.295603875219392e-07, "loss": 0.0149, "step": 9434 }, { "epoch": 2.53016894609815, "grad_norm": 0.26770246842684875, "learning_rate": 7.287490974370759e-07, "loss": 0.0133, "step": 9435 }, { "epoch": 2.5304371145079108, "grad_norm": 0.33799200115558714, "learning_rate": 7.279382232325732e-07, "loss": 0.0202, "step": 9436 }, { "epoch": 2.530705282917672, "grad_norm": 0.23124341573191828, "learning_rate": 7.271277649873854e-07, "loss": 0.0131, "step": 9437 }, { "epoch": 2.5309734513274336, "grad_norm": 0.2536148812618344, "learning_rate": 7.263177227804207e-07, "loss": 0.0119, "step": 9438 }, { "epoch": 2.531241619737195, "grad_norm": 0.2470937402080855, "learning_rate": 7.255080966905547e-07, "loss": 0.0111, "step": 9439 }, { "epoch": 2.5315097881469564, "grad_norm": 0.37213692936665455, "learning_rate": 7.246988867966165e-07, "loss": 0.0169, "step": 9440 }, { "epoch": 2.531777956556718, "grad_norm": 0.24438838159388515, "learning_rate": 7.238900931773963e-07, "loss": 0.0188, "step": 9441 }, { "epoch": 2.532046124966479, "grad_norm": 0.31372434197498117, "learning_rate": 7.230817159116454e-07, "loss": 0.0185, "step": 9442 }, { "epoch": 2.53231429337624, "grad_norm": 0.2341786170493035, "learning_rate": 7.222737550780717e-07, "loss": 0.0129, "step": 9443 }, { "epoch": 2.5325824617860015, "grad_norm": 0.21083641700181532, "learning_rate": 7.214662107553472e-07, "loss": 0.0134, "step": 9444 }, { "epoch": 2.532850630195763, "grad_norm": 0.21339680792678886, "learning_rate": 7.206590830220966e-07, "loss": 0.0105, "step": 9445 }, { "epoch": 2.5331187986055244, "grad_norm": 0.22705720005814198, "learning_rate": 7.198523719569101e-07, "loss": 0.0114, "step": 9446 }, { "epoch": 2.5333869670152858, "grad_norm": 0.23861263153662646, "learning_rate": 7.190460776383351e-07, "loss": 0.0096, "step": 9447 }, { "epoch": 2.5336551354250467, "grad_norm": 0.2606539697455914, "learning_rate": 7.182402001448774e-07, "loss": 0.0118, "step": 9448 }, { "epoch": 2.533923303834808, "grad_norm": 0.24296670682648355, "learning_rate": 7.174347395550041e-07, "loss": 0.019, "step": 9449 }, { "epoch": 2.5341914722445695, "grad_norm": 0.19807502987992218, "learning_rate": 7.166296959471397e-07, "loss": 0.0131, "step": 9450 }, { "epoch": 2.534459640654331, "grad_norm": 0.3362818930677023, "learning_rate": 7.158250693996705e-07, "loss": 0.0252, "step": 9451 }, { "epoch": 2.5347278090640923, "grad_norm": 0.2311696274986188, "learning_rate": 7.150208599909397e-07, "loss": 0.0138, "step": 9452 }, { "epoch": 2.5349959774738537, "grad_norm": 0.17533233544582852, "learning_rate": 7.142170677992505e-07, "loss": 0.0078, "step": 9453 }, { "epoch": 2.535264145883615, "grad_norm": 0.2572256906544112, "learning_rate": 7.134136929028679e-07, "loss": 0.0174, "step": 9454 }, { "epoch": 2.535532314293376, "grad_norm": 0.2885086991592223, "learning_rate": 7.126107353800127e-07, "loss": 0.0116, "step": 9455 }, { "epoch": 2.5358004827031375, "grad_norm": 0.24484515519660538, "learning_rate": 7.118081953088668e-07, "loss": 0.0173, "step": 9456 }, { "epoch": 2.536068651112899, "grad_norm": 0.18998392629644956, "learning_rate": 7.110060727675733e-07, "loss": 0.013, "step": 9457 }, { "epoch": 2.5363368195226603, "grad_norm": 0.36590432450628585, "learning_rate": 7.102043678342307e-07, "loss": 0.0124, "step": 9458 }, { "epoch": 2.5366049879324217, "grad_norm": 0.28302041295818803, "learning_rate": 7.094030805869001e-07, "loss": 0.011, "step": 9459 }, { "epoch": 2.5368731563421827, "grad_norm": 0.35273529280911353, "learning_rate": 7.086022111035995e-07, "loss": 0.0317, "step": 9460 }, { "epoch": 2.537141324751944, "grad_norm": 0.20147569637295443, "learning_rate": 7.078017594623094e-07, "loss": 0.01, "step": 9461 }, { "epoch": 2.5374094931617055, "grad_norm": 0.20480887869424974, "learning_rate": 7.07001725740965e-07, "loss": 0.0106, "step": 9462 }, { "epoch": 2.537677661571467, "grad_norm": 0.22712988068062642, "learning_rate": 7.06202110017466e-07, "loss": 0.0102, "step": 9463 }, { "epoch": 2.5379458299812283, "grad_norm": 0.34864594075019645, "learning_rate": 7.054029123696682e-07, "loss": 0.0147, "step": 9464 }, { "epoch": 2.5382139983909893, "grad_norm": 0.417098312524783, "learning_rate": 7.04604132875385e-07, "loss": 0.012, "step": 9465 }, { "epoch": 2.538482166800751, "grad_norm": 0.18970005179386235, "learning_rate": 7.038057716123941e-07, "loss": 0.0096, "step": 9466 }, { "epoch": 2.538750335210512, "grad_norm": 0.17280985877681193, "learning_rate": 7.030078286584296e-07, "loss": 0.0076, "step": 9467 }, { "epoch": 2.5390185036202735, "grad_norm": 0.29175975028967266, "learning_rate": 7.022103040911838e-07, "loss": 0.0195, "step": 9468 }, { "epoch": 2.539286672030035, "grad_norm": 0.2586930911462124, "learning_rate": 7.014131979883109e-07, "loss": 0.0161, "step": 9469 }, { "epoch": 2.5395548404397963, "grad_norm": 0.19148936002369674, "learning_rate": 7.006165104274215e-07, "loss": 0.0102, "step": 9470 }, { "epoch": 2.5398230088495577, "grad_norm": 0.26466346090085807, "learning_rate": 6.998202414860894e-07, "loss": 0.022, "step": 9471 }, { "epoch": 2.5400911772593187, "grad_norm": 0.20814060088270436, "learning_rate": 6.990243912418421e-07, "loss": 0.0109, "step": 9472 }, { "epoch": 2.54035934566908, "grad_norm": 0.24725164465675215, "learning_rate": 6.982289597721709e-07, "loss": 0.0118, "step": 9473 }, { "epoch": 2.5406275140788415, "grad_norm": 0.20601353897899788, "learning_rate": 6.97433947154526e-07, "loss": 0.0089, "step": 9474 }, { "epoch": 2.540895682488603, "grad_norm": 0.2312719764486107, "learning_rate": 6.966393534663146e-07, "loss": 0.0142, "step": 9475 }, { "epoch": 2.5411638508983643, "grad_norm": 0.21179140604480123, "learning_rate": 6.958451787849046e-07, "loss": 0.0097, "step": 9476 }, { "epoch": 2.5414320193081252, "grad_norm": 0.22056481497299052, "learning_rate": 6.950514231876204e-07, "loss": 0.0113, "step": 9477 }, { "epoch": 2.541700187717887, "grad_norm": 0.21054747415008213, "learning_rate": 6.942580867517501e-07, "loss": 0.0123, "step": 9478 }, { "epoch": 2.541968356127648, "grad_norm": 0.2650449733202439, "learning_rate": 6.934651695545396e-07, "loss": 0.016, "step": 9479 }, { "epoch": 2.5422365245374094, "grad_norm": 0.23960924225287497, "learning_rate": 6.926726716731908e-07, "loss": 0.0104, "step": 9480 }, { "epoch": 2.542504692947171, "grad_norm": 0.2529858288042599, "learning_rate": 6.918805931848693e-07, "loss": 0.0155, "step": 9481 }, { "epoch": 2.5427728613569323, "grad_norm": 0.22624243306150238, "learning_rate": 6.910889341666955e-07, "loss": 0.0138, "step": 9482 }, { "epoch": 2.5430410297666937, "grad_norm": 0.2525850824082361, "learning_rate": 6.902976946957518e-07, "loss": 0.0161, "step": 9483 }, { "epoch": 2.5433091981764546, "grad_norm": 0.30318854664762496, "learning_rate": 6.895068748490807e-07, "loss": 0.0136, "step": 9484 }, { "epoch": 2.543577366586216, "grad_norm": 0.3446667585344443, "learning_rate": 6.887164747036807e-07, "loss": 0.0203, "step": 9485 }, { "epoch": 2.5438455349959774, "grad_norm": 0.21134220208256024, "learning_rate": 6.879264943365116e-07, "loss": 0.0137, "step": 9486 }, { "epoch": 2.544113703405739, "grad_norm": 0.24559824053085458, "learning_rate": 6.871369338244921e-07, "loss": 0.0161, "step": 9487 }, { "epoch": 2.5443818718155002, "grad_norm": 0.26401556026005163, "learning_rate": 6.863477932444973e-07, "loss": 0.012, "step": 9488 }, { "epoch": 2.544650040225261, "grad_norm": 0.2725076540183263, "learning_rate": 6.855590726733669e-07, "loss": 0.0104, "step": 9489 }, { "epoch": 2.544918208635023, "grad_norm": 0.2607824114444577, "learning_rate": 6.847707721878932e-07, "loss": 0.014, "step": 9490 }, { "epoch": 2.545186377044784, "grad_norm": 0.2516989925210422, "learning_rate": 6.839828918648344e-07, "loss": 0.0162, "step": 9491 }, { "epoch": 2.5454545454545454, "grad_norm": 0.19844603010961745, "learning_rate": 6.831954317809014e-07, "loss": 0.0105, "step": 9492 }, { "epoch": 2.545722713864307, "grad_norm": 0.20831928752433285, "learning_rate": 6.824083920127684e-07, "loss": 0.0124, "step": 9493 }, { "epoch": 2.5459908822740682, "grad_norm": 0.22225699525235018, "learning_rate": 6.816217726370677e-07, "loss": 0.0182, "step": 9494 }, { "epoch": 2.5462590506838296, "grad_norm": 0.22338885344206338, "learning_rate": 6.808355737303895e-07, "loss": 0.0134, "step": 9495 }, { "epoch": 2.5465272190935906, "grad_norm": 0.23310416661245775, "learning_rate": 6.800497953692852e-07, "loss": 0.0097, "step": 9496 }, { "epoch": 2.546795387503352, "grad_norm": 0.15456995314946473, "learning_rate": 6.79264437630262e-07, "loss": 0.007, "step": 9497 }, { "epoch": 2.5470635559131134, "grad_norm": 0.2588603927048196, "learning_rate": 6.784795005897904e-07, "loss": 0.0243, "step": 9498 }, { "epoch": 2.547331724322875, "grad_norm": 0.35351844221798034, "learning_rate": 6.776949843242964e-07, "loss": 0.0181, "step": 9499 }, { "epoch": 2.547599892732636, "grad_norm": 0.23335516098660394, "learning_rate": 6.76910888910165e-07, "loss": 0.0136, "step": 9500 }, { "epoch": 2.547868061142397, "grad_norm": 0.27165045123093073, "learning_rate": 6.76127214423744e-07, "loss": 0.0146, "step": 9501 }, { "epoch": 2.548136229552159, "grad_norm": 0.2554521551155583, "learning_rate": 6.753439609413354e-07, "loss": 0.0116, "step": 9502 }, { "epoch": 2.54840439796192, "grad_norm": 0.2906989885042511, "learning_rate": 6.745611285392045e-07, "loss": 0.013, "step": 9503 }, { "epoch": 2.5486725663716814, "grad_norm": 0.21810406691905312, "learning_rate": 6.737787172935717e-07, "loss": 0.0172, "step": 9504 }, { "epoch": 2.548940734781443, "grad_norm": 0.27675763821843874, "learning_rate": 6.729967272806193e-07, "loss": 0.0133, "step": 9505 }, { "epoch": 2.549208903191204, "grad_norm": 0.23657425124578915, "learning_rate": 6.722151585764891e-07, "loss": 0.0157, "step": 9506 }, { "epoch": 2.5494770716009656, "grad_norm": 0.19730935926404106, "learning_rate": 6.71434011257277e-07, "loss": 0.0098, "step": 9507 }, { "epoch": 2.5497452400107266, "grad_norm": 0.24848905590579023, "learning_rate": 6.70653285399045e-07, "loss": 0.0125, "step": 9508 }, { "epoch": 2.550013408420488, "grad_norm": 0.35235564818107734, "learning_rate": 6.698729810778065e-07, "loss": 0.0219, "step": 9509 }, { "epoch": 2.5502815768302494, "grad_norm": 0.30984199488128766, "learning_rate": 6.690930983695404e-07, "loss": 0.0198, "step": 9510 }, { "epoch": 2.5505497452400108, "grad_norm": 0.22742442133669358, "learning_rate": 6.683136373501814e-07, "loss": 0.0138, "step": 9511 }, { "epoch": 2.550817913649772, "grad_norm": 0.2378994277247094, "learning_rate": 6.675345980956232e-07, "loss": 0.0146, "step": 9512 }, { "epoch": 2.551086082059533, "grad_norm": 0.21513504156454763, "learning_rate": 6.667559806817192e-07, "loss": 0.0125, "step": 9513 }, { "epoch": 2.551354250469295, "grad_norm": 0.24894058685684417, "learning_rate": 6.659777851842792e-07, "loss": 0.0122, "step": 9514 }, { "epoch": 2.551622418879056, "grad_norm": 0.28221802343009905, "learning_rate": 6.652000116790758e-07, "loss": 0.0195, "step": 9515 }, { "epoch": 2.5518905872888173, "grad_norm": 0.22316113318038344, "learning_rate": 6.644226602418396e-07, "loss": 0.0122, "step": 9516 }, { "epoch": 2.5521587556985788, "grad_norm": 0.252378479072891, "learning_rate": 6.636457309482575e-07, "loss": 0.0102, "step": 9517 }, { "epoch": 2.55242692410834, "grad_norm": 0.27991927000859523, "learning_rate": 6.628692238739786e-07, "loss": 0.0092, "step": 9518 }, { "epoch": 2.5526950925181016, "grad_norm": 0.20249451341450286, "learning_rate": 6.620931390946078e-07, "loss": 0.0142, "step": 9519 }, { "epoch": 2.5529632609278625, "grad_norm": 0.2026432694820056, "learning_rate": 6.613174766857117e-07, "loss": 0.0144, "step": 9520 }, { "epoch": 2.553231429337624, "grad_norm": 0.25899302965105037, "learning_rate": 6.605422367228131e-07, "loss": 0.0151, "step": 9521 }, { "epoch": 2.5534995977473853, "grad_norm": 0.2653073779569596, "learning_rate": 6.597674192813958e-07, "loss": 0.0145, "step": 9522 }, { "epoch": 2.5537677661571467, "grad_norm": 0.35344767780103065, "learning_rate": 6.589930244369025e-07, "loss": 0.0125, "step": 9523 }, { "epoch": 2.554035934566908, "grad_norm": 0.326646743422972, "learning_rate": 6.582190522647336e-07, "loss": 0.0176, "step": 9524 }, { "epoch": 2.554304102976669, "grad_norm": 0.24722480993901944, "learning_rate": 6.574455028402476e-07, "loss": 0.0132, "step": 9525 }, { "epoch": 2.554572271386431, "grad_norm": 0.2193019539577276, "learning_rate": 6.566723762387645e-07, "loss": 0.0131, "step": 9526 }, { "epoch": 2.554840439796192, "grad_norm": 0.30719213146715, "learning_rate": 6.558996725355599e-07, "loss": 0.0217, "step": 9527 }, { "epoch": 2.5551086082059533, "grad_norm": 0.2581147632892664, "learning_rate": 6.551273918058715e-07, "loss": 0.0086, "step": 9528 }, { "epoch": 2.5553767766157147, "grad_norm": 0.23906684604652964, "learning_rate": 6.543555341248931e-07, "loss": 0.0115, "step": 9529 }, { "epoch": 2.555644945025476, "grad_norm": 0.2372846652298987, "learning_rate": 6.535840995677794e-07, "loss": 0.0126, "step": 9530 }, { "epoch": 2.5559131134352375, "grad_norm": 0.21851502494245809, "learning_rate": 6.528130882096418e-07, "loss": 0.0128, "step": 9531 }, { "epoch": 2.5561812818449985, "grad_norm": 0.2700136571282098, "learning_rate": 6.520425001255514e-07, "loss": 0.0091, "step": 9532 }, { "epoch": 2.55644945025476, "grad_norm": 0.2420919389649872, "learning_rate": 6.512723353905409e-07, "loss": 0.0109, "step": 9533 }, { "epoch": 2.5567176186645213, "grad_norm": 0.3193657751309933, "learning_rate": 6.505025940795962e-07, "loss": 0.0152, "step": 9534 }, { "epoch": 2.5569857870742827, "grad_norm": 0.25078611699879266, "learning_rate": 6.497332762676678e-07, "loss": 0.0135, "step": 9535 }, { "epoch": 2.557253955484044, "grad_norm": 0.3148107452211699, "learning_rate": 6.489643820296598e-07, "loss": 0.0174, "step": 9536 }, { "epoch": 2.557522123893805, "grad_norm": 0.2147691272104118, "learning_rate": 6.481959114404373e-07, "loss": 0.0096, "step": 9537 }, { "epoch": 2.557790292303567, "grad_norm": 0.1916516684301197, "learning_rate": 6.474278645748266e-07, "loss": 0.0091, "step": 9538 }, { "epoch": 2.558058460713328, "grad_norm": 0.27664828487899623, "learning_rate": 6.466602415076073e-07, "loss": 0.0194, "step": 9539 }, { "epoch": 2.5583266291230893, "grad_norm": 0.2632584254404309, "learning_rate": 6.458930423135234e-07, "loss": 0.0174, "step": 9540 }, { "epoch": 2.5585947975328507, "grad_norm": 0.1999741147358491, "learning_rate": 6.451262670672737e-07, "loss": 0.0095, "step": 9541 }, { "epoch": 2.558862965942612, "grad_norm": 0.23631347840402372, "learning_rate": 6.443599158435166e-07, "loss": 0.0171, "step": 9542 }, { "epoch": 2.5591311343523735, "grad_norm": 0.20368476714047465, "learning_rate": 6.435939887168718e-07, "loss": 0.0094, "step": 9543 }, { "epoch": 2.5593993027621345, "grad_norm": 0.24072949296536492, "learning_rate": 6.428284857619132e-07, "loss": 0.0103, "step": 9544 }, { "epoch": 2.559667471171896, "grad_norm": 0.24252462716023684, "learning_rate": 6.420634070531784e-07, "loss": 0.0172, "step": 9545 }, { "epoch": 2.5599356395816573, "grad_norm": 0.29879442948606977, "learning_rate": 6.412987526651582e-07, "loss": 0.0162, "step": 9546 }, { "epoch": 2.5602038079914187, "grad_norm": 0.2620983020359227, "learning_rate": 6.40534522672307e-07, "loss": 0.0159, "step": 9547 }, { "epoch": 2.56047197640118, "grad_norm": 0.29115664494466725, "learning_rate": 6.397707171490358e-07, "loss": 0.0109, "step": 9548 }, { "epoch": 2.560740144810941, "grad_norm": 0.2127764916180148, "learning_rate": 6.390073361697119e-07, "loss": 0.0134, "step": 9549 }, { "epoch": 2.5610083132207024, "grad_norm": 0.3353091940086364, "learning_rate": 6.382443798086668e-07, "loss": 0.0186, "step": 9550 }, { "epoch": 2.561276481630464, "grad_norm": 0.2809564606216137, "learning_rate": 6.374818481401857e-07, "loss": 0.0162, "step": 9551 }, { "epoch": 2.5615446500402252, "grad_norm": 0.1879216351288949, "learning_rate": 6.36719741238514e-07, "loss": 0.01, "step": 9552 }, { "epoch": 2.5618128184499867, "grad_norm": 0.24220223472299166, "learning_rate": 6.359580591778585e-07, "loss": 0.0127, "step": 9553 }, { "epoch": 2.562080986859748, "grad_norm": 0.2540490223487067, "learning_rate": 6.35196802032379e-07, "loss": 0.0138, "step": 9554 }, { "epoch": 2.5623491552695095, "grad_norm": 0.24445472446380354, "learning_rate": 6.344359698761998e-07, "loss": 0.0128, "step": 9555 }, { "epoch": 2.5626173236792704, "grad_norm": 0.251579212033659, "learning_rate": 6.33675562783399e-07, "loss": 0.0143, "step": 9556 }, { "epoch": 2.562885492089032, "grad_norm": 0.27640563444558186, "learning_rate": 6.329155808280168e-07, "loss": 0.0155, "step": 9557 }, { "epoch": 2.5631536604987932, "grad_norm": 0.3559963231355134, "learning_rate": 6.321560240840496e-07, "loss": 0.0138, "step": 9558 }, { "epoch": 2.5634218289085546, "grad_norm": 0.19415935577380275, "learning_rate": 6.313968926254532e-07, "loss": 0.0132, "step": 9559 }, { "epoch": 2.563689997318316, "grad_norm": 0.27803548424007435, "learning_rate": 6.306381865261457e-07, "loss": 0.0135, "step": 9560 }, { "epoch": 2.563958165728077, "grad_norm": 0.26510429029863436, "learning_rate": 6.298799058599953e-07, "loss": 0.0127, "step": 9561 }, { "epoch": 2.5642263341378384, "grad_norm": 0.23846288009430872, "learning_rate": 6.291220507008361e-07, "loss": 0.0116, "step": 9562 }, { "epoch": 2.5644945025476, "grad_norm": 0.2606124495259926, "learning_rate": 6.283646211224592e-07, "loss": 0.0173, "step": 9563 }, { "epoch": 2.564762670957361, "grad_norm": 0.21843678254155185, "learning_rate": 6.276076171986118e-07, "loss": 0.0096, "step": 9564 }, { "epoch": 2.5650308393671226, "grad_norm": 0.22313136152370336, "learning_rate": 6.268510390030036e-07, "loss": 0.0113, "step": 9565 }, { "epoch": 2.565299007776884, "grad_norm": 0.3313233812192762, "learning_rate": 6.260948866092975e-07, "loss": 0.0125, "step": 9566 }, { "epoch": 2.5655671761866454, "grad_norm": 0.31481594902030086, "learning_rate": 6.253391600911213e-07, "loss": 0.0114, "step": 9567 }, { "epoch": 2.5658353445964064, "grad_norm": 0.24279749017298913, "learning_rate": 6.245838595220554e-07, "loss": 0.0112, "step": 9568 }, { "epoch": 2.566103513006168, "grad_norm": 0.2111661157942914, "learning_rate": 6.238289849756424e-07, "loss": 0.0155, "step": 9569 }, { "epoch": 2.566371681415929, "grad_norm": 0.23318030760868128, "learning_rate": 6.230745365253843e-07, "loss": 0.0125, "step": 9570 }, { "epoch": 2.5666398498256906, "grad_norm": 0.6204347673957621, "learning_rate": 6.223205142447363e-07, "loss": 0.0107, "step": 9571 }, { "epoch": 2.566908018235452, "grad_norm": 0.19156784436037003, "learning_rate": 6.215669182071198e-07, "loss": 0.0105, "step": 9572 }, { "epoch": 2.567176186645213, "grad_norm": 0.33584013822487757, "learning_rate": 6.208137484859056e-07, "loss": 0.0138, "step": 9573 }, { "epoch": 2.5674443550549744, "grad_norm": 0.24788162282324636, "learning_rate": 6.200610051544303e-07, "loss": 0.012, "step": 9574 }, { "epoch": 2.5677125234647358, "grad_norm": 0.2138360215253765, "learning_rate": 6.193086882859878e-07, "loss": 0.0116, "step": 9575 }, { "epoch": 2.567980691874497, "grad_norm": 0.2635645696162309, "learning_rate": 6.185567979538265e-07, "loss": 0.0136, "step": 9576 }, { "epoch": 2.5682488602842586, "grad_norm": 0.27091264336285575, "learning_rate": 6.17805334231158e-07, "loss": 0.0195, "step": 9577 }, { "epoch": 2.56851702869402, "grad_norm": 0.3002674157313949, "learning_rate": 6.170542971911492e-07, "loss": 0.0183, "step": 9578 }, { "epoch": 2.5687851971037814, "grad_norm": 0.21573304830102366, "learning_rate": 6.163036869069267e-07, "loss": 0.0102, "step": 9579 }, { "epoch": 2.5690533655135424, "grad_norm": 0.22981437936207094, "learning_rate": 6.155535034515764e-07, "loss": 0.0144, "step": 9580 }, { "epoch": 2.5693215339233038, "grad_norm": 0.23787000209345202, "learning_rate": 6.148037468981404e-07, "loss": 0.0112, "step": 9581 }, { "epoch": 2.569589702333065, "grad_norm": 0.1886392946651145, "learning_rate": 6.140544173196223e-07, "loss": 0.0121, "step": 9582 }, { "epoch": 2.5698578707428266, "grad_norm": 0.2633728951485469, "learning_rate": 6.133055147889793e-07, "loss": 0.0179, "step": 9583 }, { "epoch": 2.570126039152588, "grad_norm": 0.31318794485473944, "learning_rate": 6.125570393791336e-07, "loss": 0.0174, "step": 9584 }, { "epoch": 2.570394207562349, "grad_norm": 0.31138280012968056, "learning_rate": 6.118089911629605e-07, "loss": 0.0184, "step": 9585 }, { "epoch": 2.5706623759721103, "grad_norm": 0.3231881498323299, "learning_rate": 6.110613702132939e-07, "loss": 0.0149, "step": 9586 }, { "epoch": 2.5709305443818717, "grad_norm": 0.20943652379313257, "learning_rate": 6.103141766029308e-07, "loss": 0.0109, "step": 9587 }, { "epoch": 2.571198712791633, "grad_norm": 0.27374095689920713, "learning_rate": 6.095674104046212e-07, "loss": 0.0147, "step": 9588 }, { "epoch": 2.5714668812013945, "grad_norm": 0.2889686610070686, "learning_rate": 6.088210716910769e-07, "loss": 0.0138, "step": 9589 }, { "epoch": 2.571735049611156, "grad_norm": 0.2273466533306436, "learning_rate": 6.080751605349655e-07, "loss": 0.0111, "step": 9590 }, { "epoch": 2.5720032180209174, "grad_norm": 0.2842319237140273, "learning_rate": 6.073296770089159e-07, "loss": 0.0146, "step": 9591 }, { "epoch": 2.5722713864306783, "grad_norm": 0.40721174345267563, "learning_rate": 6.065846211855136e-07, "loss": 0.0283, "step": 9592 }, { "epoch": 2.5725395548404397, "grad_norm": 0.3877247192794935, "learning_rate": 6.058399931373021e-07, "loss": 0.029, "step": 9593 }, { "epoch": 2.572807723250201, "grad_norm": 0.21692353197004266, "learning_rate": 6.050957929367851e-07, "loss": 0.0121, "step": 9594 }, { "epoch": 2.5730758916599625, "grad_norm": 0.22672274884597615, "learning_rate": 6.043520206564213e-07, "loss": 0.01, "step": 9595 }, { "epoch": 2.573344060069724, "grad_norm": 0.24027634185062474, "learning_rate": 6.036086763686322e-07, "loss": 0.0102, "step": 9596 }, { "epoch": 2.573612228479485, "grad_norm": 0.21531197569606886, "learning_rate": 6.028657601457938e-07, "loss": 0.0099, "step": 9597 }, { "epoch": 2.5738803968892463, "grad_norm": 0.29345009192828747, "learning_rate": 6.021232720602416e-07, "loss": 0.0157, "step": 9598 }, { "epoch": 2.5741485652990077, "grad_norm": 0.2323772356443803, "learning_rate": 6.013812121842711e-07, "loss": 0.0128, "step": 9599 }, { "epoch": 2.574416733708769, "grad_norm": 0.23008210507945906, "learning_rate": 6.006395805901327e-07, "loss": 0.0128, "step": 9600 }, { "epoch": 2.5746849021185305, "grad_norm": 0.24257049294277327, "learning_rate": 5.998983773500383e-07, "loss": 0.012, "step": 9601 }, { "epoch": 2.574953070528292, "grad_norm": 0.2896981451613895, "learning_rate": 5.991576025361578e-07, "loss": 0.0145, "step": 9602 }, { "epoch": 2.5752212389380533, "grad_norm": 0.26838698337079375, "learning_rate": 5.984172562206164e-07, "loss": 0.02, "step": 9603 }, { "epoch": 2.5754894073478143, "grad_norm": 0.3490655922217611, "learning_rate": 5.976773384755019e-07, "loss": 0.0147, "step": 9604 }, { "epoch": 2.5757575757575757, "grad_norm": 0.28232549831605586, "learning_rate": 5.969378493728561e-07, "loss": 0.0134, "step": 9605 }, { "epoch": 2.576025744167337, "grad_norm": 0.24480418125553421, "learning_rate": 5.961987889846821e-07, "loss": 0.0103, "step": 9606 }, { "epoch": 2.5762939125770985, "grad_norm": 0.22141500648385665, "learning_rate": 5.95460157382941e-07, "loss": 0.0095, "step": 9607 }, { "epoch": 2.57656208098686, "grad_norm": 0.2329942240550676, "learning_rate": 5.947219546395511e-07, "loss": 0.0098, "step": 9608 }, { "epoch": 2.576830249396621, "grad_norm": 0.21964374022417452, "learning_rate": 5.939841808263879e-07, "loss": 0.0152, "step": 9609 }, { "epoch": 2.5770984178063823, "grad_norm": 0.26982686832705055, "learning_rate": 5.932468360152866e-07, "loss": 0.0175, "step": 9610 }, { "epoch": 2.5773665862161437, "grad_norm": 0.20154045021809255, "learning_rate": 5.92509920278041e-07, "loss": 0.0109, "step": 9611 }, { "epoch": 2.577634754625905, "grad_norm": 0.2672357147931063, "learning_rate": 5.917734336864039e-07, "loss": 0.0125, "step": 9612 }, { "epoch": 2.5779029230356665, "grad_norm": 0.2083126523550803, "learning_rate": 5.91037376312083e-07, "loss": 0.0165, "step": 9613 }, { "epoch": 2.578171091445428, "grad_norm": 0.28786657670822385, "learning_rate": 5.903017482267487e-07, "loss": 0.0139, "step": 9614 }, { "epoch": 2.5784392598551893, "grad_norm": 0.32505621117056294, "learning_rate": 5.895665495020242e-07, "loss": 0.0165, "step": 9615 }, { "epoch": 2.5787074282649503, "grad_norm": 0.3021716616317763, "learning_rate": 5.888317802094962e-07, "loss": 0.0152, "step": 9616 }, { "epoch": 2.5789755966747117, "grad_norm": 0.2651405060361694, "learning_rate": 5.880974404207051e-07, "loss": 0.0139, "step": 9617 }, { "epoch": 2.579243765084473, "grad_norm": 0.2030589685361221, "learning_rate": 5.873635302071534e-07, "loss": 0.0109, "step": 9618 }, { "epoch": 2.5795119334942345, "grad_norm": 0.21084956334317448, "learning_rate": 5.866300496402999e-07, "loss": 0.0145, "step": 9619 }, { "epoch": 2.579780101903996, "grad_norm": 0.2919547969973172, "learning_rate": 5.858969987915614e-07, "loss": 0.0096, "step": 9620 }, { "epoch": 2.580048270313757, "grad_norm": 0.22073103686810105, "learning_rate": 5.851643777323124e-07, "loss": 0.0138, "step": 9621 }, { "epoch": 2.5803164387235182, "grad_norm": 0.2527187893110945, "learning_rate": 5.84432186533887e-07, "loss": 0.022, "step": 9622 }, { "epoch": 2.5805846071332796, "grad_norm": 0.25948060660323835, "learning_rate": 5.83700425267576e-07, "loss": 0.0129, "step": 9623 }, { "epoch": 2.580852775543041, "grad_norm": 0.21376553059459844, "learning_rate": 5.829690940046306e-07, "loss": 0.0113, "step": 9624 }, { "epoch": 2.5811209439528024, "grad_norm": 0.27229237607580975, "learning_rate": 5.822381928162563e-07, "loss": 0.0145, "step": 9625 }, { "epoch": 2.581389112362564, "grad_norm": 0.30076006379980436, "learning_rate": 5.815077217736209e-07, "loss": 0.0175, "step": 9626 }, { "epoch": 2.5816572807723253, "grad_norm": 0.20260742850851007, "learning_rate": 5.807776809478472e-07, "loss": 0.0104, "step": 9627 }, { "epoch": 2.581925449182086, "grad_norm": 0.2645360511042723, "learning_rate": 5.800480704100181e-07, "loss": 0.0147, "step": 9628 }, { "epoch": 2.5821936175918476, "grad_norm": 0.2295458897162766, "learning_rate": 5.793188902311742e-07, "loss": 0.0101, "step": 9629 }, { "epoch": 2.582461786001609, "grad_norm": 0.20310056674740432, "learning_rate": 5.785901404823124e-07, "loss": 0.0086, "step": 9630 }, { "epoch": 2.5827299544113704, "grad_norm": 0.3042658023438824, "learning_rate": 5.778618212343912e-07, "loss": 0.0196, "step": 9631 }, { "epoch": 2.582998122821132, "grad_norm": 0.27037584734066117, "learning_rate": 5.771339325583225e-07, "loss": 0.0127, "step": 9632 }, { "epoch": 2.583266291230893, "grad_norm": 0.24557460540419615, "learning_rate": 5.764064745249815e-07, "loss": 0.013, "step": 9633 }, { "epoch": 2.583534459640654, "grad_norm": 0.2531419830928739, "learning_rate": 5.75679447205198e-07, "loss": 0.013, "step": 9634 }, { "epoch": 2.5838026280504156, "grad_norm": 0.2676886021499712, "learning_rate": 5.749528506697589e-07, "loss": 0.0127, "step": 9635 }, { "epoch": 2.584070796460177, "grad_norm": 0.1817620486900637, "learning_rate": 5.742266849894141e-07, "loss": 0.0084, "step": 9636 }, { "epoch": 2.5843389648699384, "grad_norm": 0.29775432667213264, "learning_rate": 5.735009502348648e-07, "loss": 0.016, "step": 9637 }, { "epoch": 2.5846071332796994, "grad_norm": 0.25247249985868303, "learning_rate": 5.727756464767764e-07, "loss": 0.0109, "step": 9638 }, { "epoch": 2.5848753016894612, "grad_norm": 0.3198382469094153, "learning_rate": 5.720507737857706e-07, "loss": 0.012, "step": 9639 }, { "epoch": 2.585143470099222, "grad_norm": 0.22017368222906075, "learning_rate": 5.713263322324236e-07, "loss": 0.0117, "step": 9640 }, { "epoch": 2.5854116385089836, "grad_norm": 0.27690764653361155, "learning_rate": 5.706023218872747e-07, "loss": 0.0134, "step": 9641 }, { "epoch": 2.585679806918745, "grad_norm": 0.2509699069697176, "learning_rate": 5.698787428208169e-07, "loss": 0.0122, "step": 9642 }, { "epoch": 2.5859479753285064, "grad_norm": 0.20979573883287267, "learning_rate": 5.691555951035049e-07, "loss": 0.0104, "step": 9643 }, { "epoch": 2.586216143738268, "grad_norm": 0.2323767133558872, "learning_rate": 5.684328788057486e-07, "loss": 0.0089, "step": 9644 }, { "epoch": 2.5864843121480288, "grad_norm": 0.3283898502354258, "learning_rate": 5.677105939979178e-07, "loss": 0.0247, "step": 9645 }, { "epoch": 2.58675248055779, "grad_norm": 1.0638469810586986, "learning_rate": 5.669887407503394e-07, "loss": 0.0293, "step": 9646 }, { "epoch": 2.5870206489675516, "grad_norm": 0.2308026553970437, "learning_rate": 5.662673191332962e-07, "loss": 0.011, "step": 9647 }, { "epoch": 2.587288817377313, "grad_norm": 0.24368376355800547, "learning_rate": 5.655463292170332e-07, "loss": 0.0101, "step": 9648 }, { "epoch": 2.5875569857870744, "grad_norm": 0.30766176908353654, "learning_rate": 5.648257710717519e-07, "loss": 0.0152, "step": 9649 }, { "epoch": 2.5878251541968353, "grad_norm": 0.26915879885965305, "learning_rate": 5.641056447676091e-07, "loss": 0.0127, "step": 9650 }, { "epoch": 2.588093322606597, "grad_norm": 0.25638270898165233, "learning_rate": 5.633859503747241e-07, "loss": 0.0157, "step": 9651 }, { "epoch": 2.588361491016358, "grad_norm": 0.2558605318508441, "learning_rate": 5.626666879631681e-07, "loss": 0.0111, "step": 9652 }, { "epoch": 2.5886296594261196, "grad_norm": 0.26018949190115487, "learning_rate": 5.619478576029774e-07, "loss": 0.0114, "step": 9653 }, { "epoch": 2.588897827835881, "grad_norm": 0.21605831991327562, "learning_rate": 5.612294593641404e-07, "loss": 0.0139, "step": 9654 }, { "epoch": 2.5891659962456424, "grad_norm": 0.20357054983985248, "learning_rate": 5.605114933166056e-07, "loss": 0.012, "step": 9655 }, { "epoch": 2.5894341646554038, "grad_norm": 0.24555160558933845, "learning_rate": 5.597939595302815e-07, "loss": 0.0107, "step": 9656 }, { "epoch": 2.5897023330651647, "grad_norm": 0.2336453408079745, "learning_rate": 5.590768580750306e-07, "loss": 0.0116, "step": 9657 }, { "epoch": 2.589970501474926, "grad_norm": 0.24621125913539527, "learning_rate": 5.583601890206747e-07, "loss": 0.0136, "step": 9658 }, { "epoch": 2.5902386698846875, "grad_norm": 0.2437413621318941, "learning_rate": 5.576439524369959e-07, "loss": 0.0107, "step": 9659 }, { "epoch": 2.590506838294449, "grad_norm": 0.2704377603389716, "learning_rate": 5.569281483937305e-07, "loss": 0.0136, "step": 9660 }, { "epoch": 2.5907750067042103, "grad_norm": 0.27356030927549896, "learning_rate": 5.562127769605757e-07, "loss": 0.0155, "step": 9661 }, { "epoch": 2.5910431751139713, "grad_norm": 0.24653301104326364, "learning_rate": 5.554978382071841e-07, "loss": 0.0115, "step": 9662 }, { "epoch": 2.591311343523733, "grad_norm": 0.28260482023118916, "learning_rate": 5.547833322031693e-07, "loss": 0.0133, "step": 9663 }, { "epoch": 2.591579511933494, "grad_norm": 0.24103360051292708, "learning_rate": 5.540692590180979e-07, "loss": 0.0114, "step": 9664 }, { "epoch": 2.5918476803432555, "grad_norm": 0.25460054495610895, "learning_rate": 5.533556187214995e-07, "loss": 0.0124, "step": 9665 }, { "epoch": 2.592115848753017, "grad_norm": 0.262079400128003, "learning_rate": 5.526424113828599e-07, "loss": 0.0145, "step": 9666 }, { "epoch": 2.5923840171627783, "grad_norm": 0.27859602085568136, "learning_rate": 5.519296370716204e-07, "loss": 0.0132, "step": 9667 }, { "epoch": 2.5926521855725397, "grad_norm": 0.29989967859772665, "learning_rate": 5.512172958571832e-07, "loss": 0.0189, "step": 9668 }, { "epoch": 2.5929203539823007, "grad_norm": 0.2887292198785437, "learning_rate": 5.505053878089073e-07, "loss": 0.017, "step": 9669 }, { "epoch": 2.593188522392062, "grad_norm": 0.3029981092500451, "learning_rate": 5.497939129961072e-07, "loss": 0.0174, "step": 9670 }, { "epoch": 2.5934566908018235, "grad_norm": 0.2727914934286261, "learning_rate": 5.490828714880597e-07, "loss": 0.0125, "step": 9671 }, { "epoch": 2.593724859211585, "grad_norm": 0.23084753037087935, "learning_rate": 5.483722633539951e-07, "loss": 0.0112, "step": 9672 }, { "epoch": 2.5939930276213463, "grad_norm": 0.22356395308273508, "learning_rate": 5.476620886631051e-07, "loss": 0.0116, "step": 9673 }, { "epoch": 2.5942611960311073, "grad_norm": 0.24075710144226334, "learning_rate": 5.469523474845361e-07, "loss": 0.0111, "step": 9674 }, { "epoch": 2.594529364440869, "grad_norm": 0.22660721133326864, "learning_rate": 5.462430398873947e-07, "loss": 0.009, "step": 9675 }, { "epoch": 2.59479753285063, "grad_norm": 0.21824044277904792, "learning_rate": 5.455341659407443e-07, "loss": 0.0128, "step": 9676 }, { "epoch": 2.5950657012603915, "grad_norm": 0.23477599301938293, "learning_rate": 5.448257257136052e-07, "loss": 0.0134, "step": 9677 }, { "epoch": 2.595333869670153, "grad_norm": 0.20682349158116947, "learning_rate": 5.44117719274958e-07, "loss": 0.0102, "step": 9678 }, { "epoch": 2.5956020380799143, "grad_norm": 0.23453939734887239, "learning_rate": 5.434101466937375e-07, "loss": 0.0106, "step": 9679 }, { "epoch": 2.5958702064896757, "grad_norm": 0.23646830457939813, "learning_rate": 5.427030080388395e-07, "loss": 0.0112, "step": 9680 }, { "epoch": 2.5961383748994367, "grad_norm": 0.2426102653629967, "learning_rate": 5.41996303379116e-07, "loss": 0.012, "step": 9681 }, { "epoch": 2.596406543309198, "grad_norm": 0.3552421422788036, "learning_rate": 5.412900327833753e-07, "loss": 0.0099, "step": 9682 }, { "epoch": 2.5966747117189595, "grad_norm": 0.2189658202513733, "learning_rate": 5.405841963203878e-07, "loss": 0.0124, "step": 9683 }, { "epoch": 2.596942880128721, "grad_norm": 0.2683571119736287, "learning_rate": 5.398787940588768e-07, "loss": 0.018, "step": 9684 }, { "epoch": 2.5972110485384823, "grad_norm": 0.3097451674196923, "learning_rate": 5.391738260675266e-07, "loss": 0.0159, "step": 9685 }, { "epoch": 2.5974792169482432, "grad_norm": 0.2690517626914785, "learning_rate": 5.384692924149775e-07, "loss": 0.0093, "step": 9686 }, { "epoch": 2.597747385358005, "grad_norm": 0.2809748167695715, "learning_rate": 5.377651931698275e-07, "loss": 0.0212, "step": 9687 }, { "epoch": 2.598015553767766, "grad_norm": 0.291855495196349, "learning_rate": 5.370615284006353e-07, "loss": 0.0174, "step": 9688 }, { "epoch": 2.5982837221775275, "grad_norm": 0.27157849635993314, "learning_rate": 5.363582981759119e-07, "loss": 0.0133, "step": 9689 }, { "epoch": 2.598551890587289, "grad_norm": 0.2089887409921479, "learning_rate": 5.356555025641313e-07, "loss": 0.0131, "step": 9690 }, { "epoch": 2.5988200589970503, "grad_norm": 0.18097505551600332, "learning_rate": 5.349531416337206e-07, "loss": 0.0087, "step": 9691 }, { "epoch": 2.5990882274068117, "grad_norm": 0.30237609791212455, "learning_rate": 5.342512154530683e-07, "loss": 0.0139, "step": 9692 }, { "epoch": 2.5993563958165726, "grad_norm": 0.29363668357562717, "learning_rate": 5.335497240905207e-07, "loss": 0.0148, "step": 9693 }, { "epoch": 2.599624564226334, "grad_norm": 0.21664268671829232, "learning_rate": 5.328486676143762e-07, "loss": 0.0106, "step": 9694 }, { "epoch": 2.5998927326360954, "grad_norm": 0.2734728020378876, "learning_rate": 5.32148046092898e-07, "loss": 0.0113, "step": 9695 }, { "epoch": 2.600160901045857, "grad_norm": 0.2720700245107791, "learning_rate": 5.314478595943013e-07, "loss": 0.0134, "step": 9696 }, { "epoch": 2.6004290694556182, "grad_norm": 0.23089031090501963, "learning_rate": 5.307481081867632e-07, "loss": 0.0115, "step": 9697 }, { "epoch": 2.600697237865379, "grad_norm": 0.5793820527992269, "learning_rate": 5.300487919384168e-07, "loss": 0.0206, "step": 9698 }, { "epoch": 2.600965406275141, "grad_norm": 0.24691556390373193, "learning_rate": 5.293499109173517e-07, "loss": 0.0137, "step": 9699 }, { "epoch": 2.601233574684902, "grad_norm": 0.23243860541206107, "learning_rate": 5.286514651916169e-07, "loss": 0.011, "step": 9700 }, { "epoch": 2.6015017430946634, "grad_norm": 0.25151078223917833, "learning_rate": 5.279534548292165e-07, "loss": 0.0102, "step": 9701 }, { "epoch": 2.601769911504425, "grad_norm": 0.21992478956209324, "learning_rate": 5.272558798981164e-07, "loss": 0.0087, "step": 9702 }, { "epoch": 2.6020380799141862, "grad_norm": 0.30193752471653157, "learning_rate": 5.265587404662353e-07, "loss": 0.0223, "step": 9703 }, { "epoch": 2.6023062483239476, "grad_norm": 0.24956749217169652, "learning_rate": 5.258620366014533e-07, "loss": 0.0134, "step": 9704 }, { "epoch": 2.6025744167337086, "grad_norm": 0.18258629712663962, "learning_rate": 5.251657683716077e-07, "loss": 0.0095, "step": 9705 }, { "epoch": 2.60284258514347, "grad_norm": 0.30160196510325177, "learning_rate": 5.244699358444893e-07, "loss": 0.0109, "step": 9706 }, { "epoch": 2.6031107535532314, "grad_norm": 0.3065779431922009, "learning_rate": 5.237745390878502e-07, "loss": 0.0151, "step": 9707 }, { "epoch": 2.603378921962993, "grad_norm": 0.2521336028682725, "learning_rate": 5.230795781694015e-07, "loss": 0.0112, "step": 9708 }, { "epoch": 2.603647090372754, "grad_norm": 0.2269350614112126, "learning_rate": 5.223850531568076e-07, "loss": 0.0144, "step": 9709 }, { "epoch": 2.603915258782515, "grad_norm": 0.26521534861182117, "learning_rate": 5.216909641176937e-07, "loss": 0.0142, "step": 9710 }, { "epoch": 2.604183427192277, "grad_norm": 0.2341468526713637, "learning_rate": 5.209973111196404e-07, "loss": 0.0116, "step": 9711 }, { "epoch": 2.604451595602038, "grad_norm": 0.21313511438846597, "learning_rate": 5.203040942301879e-07, "loss": 0.0104, "step": 9712 }, { "epoch": 2.6047197640117994, "grad_norm": 0.31020807837126285, "learning_rate": 5.196113135168318e-07, "loss": 0.0173, "step": 9713 }, { "epoch": 2.604987932421561, "grad_norm": 0.2648750758505074, "learning_rate": 5.189189690470264e-07, "loss": 0.0125, "step": 9714 }, { "epoch": 2.605256100831322, "grad_norm": 0.27298593211255817, "learning_rate": 5.182270608881856e-07, "loss": 0.0143, "step": 9715 }, { "epoch": 2.6055242692410836, "grad_norm": 0.29689128774433354, "learning_rate": 5.175355891076755e-07, "loss": 0.0184, "step": 9716 }, { "epoch": 2.6057924376508446, "grad_norm": 0.29084685734435906, "learning_rate": 5.168445537728256e-07, "loss": 0.016, "step": 9717 }, { "epoch": 2.606060606060606, "grad_norm": 0.2365977274093258, "learning_rate": 5.161539549509186e-07, "loss": 0.0122, "step": 9718 }, { "epoch": 2.6063287744703674, "grad_norm": 0.35227329569852367, "learning_rate": 5.15463792709196e-07, "loss": 0.0258, "step": 9719 }, { "epoch": 2.6065969428801288, "grad_norm": 0.2518926627281299, "learning_rate": 5.147740671148588e-07, "loss": 0.0145, "step": 9720 }, { "epoch": 2.60686511128989, "grad_norm": 0.2557949679326179, "learning_rate": 5.140847782350616e-07, "loss": 0.0153, "step": 9721 }, { "epoch": 2.607133279699651, "grad_norm": 0.23683905531626048, "learning_rate": 5.133959261369203e-07, "loss": 0.0107, "step": 9722 }, { "epoch": 2.6074014481094125, "grad_norm": 0.2893682387866785, "learning_rate": 5.127075108875051e-07, "loss": 0.0181, "step": 9723 }, { "epoch": 2.607669616519174, "grad_norm": 0.3407715789685843, "learning_rate": 5.120195325538463e-07, "loss": 0.0166, "step": 9724 }, { "epoch": 2.6079377849289354, "grad_norm": 0.3188956049957169, "learning_rate": 5.113319912029313e-07, "loss": 0.0147, "step": 9725 }, { "epoch": 2.6082059533386968, "grad_norm": 0.20626689403722412, "learning_rate": 5.106448869017016e-07, "loss": 0.0111, "step": 9726 }, { "epoch": 2.608474121748458, "grad_norm": 0.20975218998156636, "learning_rate": 5.099582197170616e-07, "loss": 0.0095, "step": 9727 }, { "epoch": 2.6087422901582196, "grad_norm": 0.20807072565918291, "learning_rate": 5.092719897158682e-07, "loss": 0.0114, "step": 9728 }, { "epoch": 2.6090104585679805, "grad_norm": 0.23385151501162746, "learning_rate": 5.085861969649392e-07, "loss": 0.0122, "step": 9729 }, { "epoch": 2.609278626977742, "grad_norm": 0.18421212266994, "learning_rate": 5.079008415310476e-07, "loss": 0.0138, "step": 9730 }, { "epoch": 2.6095467953875033, "grad_norm": 0.22885200669836125, "learning_rate": 5.072159234809238e-07, "loss": 0.0117, "step": 9731 }, { "epoch": 2.6098149637972647, "grad_norm": 0.1938808652433685, "learning_rate": 5.065314428812584e-07, "loss": 0.0134, "step": 9732 }, { "epoch": 2.610083132207026, "grad_norm": 0.26879376980762076, "learning_rate": 5.058473997986957e-07, "loss": 0.0148, "step": 9733 }, { "epoch": 2.610351300616787, "grad_norm": 0.275230619608116, "learning_rate": 5.051637942998399e-07, "loss": 0.0131, "step": 9734 }, { "epoch": 2.6106194690265485, "grad_norm": 0.26023515323108526, "learning_rate": 5.044806264512525e-07, "loss": 0.0121, "step": 9735 }, { "epoch": 2.61088763743631, "grad_norm": 0.29313993264115074, "learning_rate": 5.037978963194507e-07, "loss": 0.0123, "step": 9736 }, { "epoch": 2.6111558058460713, "grad_norm": 0.22652109434881057, "learning_rate": 5.031156039709112e-07, "loss": 0.0127, "step": 9737 }, { "epoch": 2.6114239742558327, "grad_norm": 0.21490142795791686, "learning_rate": 5.024337494720655e-07, "loss": 0.0154, "step": 9738 }, { "epoch": 2.611692142665594, "grad_norm": 0.20401144903952398, "learning_rate": 5.017523328893059e-07, "loss": 0.0109, "step": 9739 }, { "epoch": 2.6119603110753555, "grad_norm": 0.2383892539758945, "learning_rate": 5.010713542889778e-07, "loss": 0.011, "step": 9740 }, { "epoch": 2.6122284794851165, "grad_norm": 0.26005931513439423, "learning_rate": 5.003908137373886e-07, "loss": 0.0117, "step": 9741 }, { "epoch": 2.612496647894878, "grad_norm": 0.197820296499835, "learning_rate": 4.997107113007992e-07, "loss": 0.0093, "step": 9742 }, { "epoch": 2.6127648163046393, "grad_norm": 0.21914521514595675, "learning_rate": 4.990310470454296e-07, "loss": 0.0141, "step": 9743 }, { "epoch": 2.6130329847144007, "grad_norm": 0.23043093774937187, "learning_rate": 4.983518210374566e-07, "loss": 0.0113, "step": 9744 }, { "epoch": 2.613301153124162, "grad_norm": 0.22347172797773107, "learning_rate": 4.976730333430163e-07, "loss": 0.0103, "step": 9745 }, { "epoch": 2.613569321533923, "grad_norm": 0.23695588244843113, "learning_rate": 4.969946840281986e-07, "loss": 0.0132, "step": 9746 }, { "epoch": 2.6138374899436845, "grad_norm": 0.26457191869328045, "learning_rate": 4.963167731590535e-07, "loss": 0.016, "step": 9747 }, { "epoch": 2.614105658353446, "grad_norm": 0.33477402094724973, "learning_rate": 4.95639300801587e-07, "loss": 0.0129, "step": 9748 }, { "epoch": 2.6143738267632073, "grad_norm": 0.27179647606986823, "learning_rate": 4.949622670217635e-07, "loss": 0.0147, "step": 9749 }, { "epoch": 2.6146419951729687, "grad_norm": 0.21369291600714949, "learning_rate": 4.942856718855021e-07, "loss": 0.0096, "step": 9750 }, { "epoch": 2.61491016358273, "grad_norm": 0.25316670681347625, "learning_rate": 4.936095154586829e-07, "loss": 0.0195, "step": 9751 }, { "epoch": 2.6151783319924915, "grad_norm": 0.19662728340224148, "learning_rate": 4.929337978071419e-07, "loss": 0.0083, "step": 9752 }, { "epoch": 2.6154465004022525, "grad_norm": 0.32034527896082726, "learning_rate": 4.9225851899667e-07, "loss": 0.0192, "step": 9753 }, { "epoch": 2.615714668812014, "grad_norm": 0.26566078713506464, "learning_rate": 4.91583679093019e-07, "loss": 0.0166, "step": 9754 }, { "epoch": 2.6159828372217753, "grad_norm": 0.2691305512811401, "learning_rate": 4.909092781618958e-07, "loss": 0.0204, "step": 9755 }, { "epoch": 2.6162510056315367, "grad_norm": 0.16020694821660136, "learning_rate": 4.902353162689632e-07, "loss": 0.0066, "step": 9756 }, { "epoch": 2.616519174041298, "grad_norm": 0.26211268997858794, "learning_rate": 4.895617934798463e-07, "loss": 0.0111, "step": 9757 }, { "epoch": 2.616787342451059, "grad_norm": 0.2144471290943061, "learning_rate": 4.888887098601214e-07, "loss": 0.0087, "step": 9758 }, { "epoch": 2.6170555108608204, "grad_norm": 0.2716440737664424, "learning_rate": 4.88216065475327e-07, "loss": 0.0155, "step": 9759 }, { "epoch": 2.617323679270582, "grad_norm": 0.27763575556640296, "learning_rate": 4.875438603909549e-07, "loss": 0.0138, "step": 9760 }, { "epoch": 2.6175918476803433, "grad_norm": 0.21515967624195942, "learning_rate": 4.86872094672457e-07, "loss": 0.0148, "step": 9761 }, { "epoch": 2.6178600160901047, "grad_norm": 0.2580595934868905, "learning_rate": 4.862007683852415e-07, "loss": 0.0115, "step": 9762 }, { "epoch": 2.618128184499866, "grad_norm": 0.24667810299490367, "learning_rate": 4.855298815946724e-07, "loss": 0.0138, "step": 9763 }, { "epoch": 2.6183963529096275, "grad_norm": 0.20610451413219072, "learning_rate": 4.848594343660745e-07, "loss": 0.009, "step": 9764 }, { "epoch": 2.6186645213193884, "grad_norm": 0.2741111709438867, "learning_rate": 4.841894267647246e-07, "loss": 0.0143, "step": 9765 }, { "epoch": 2.61893268972915, "grad_norm": 0.23834094004737522, "learning_rate": 4.83519858855862e-07, "loss": 0.0103, "step": 9766 }, { "epoch": 2.6192008581389112, "grad_norm": 0.26081984697330396, "learning_rate": 4.828507307046798e-07, "loss": 0.0132, "step": 9767 }, { "epoch": 2.6194690265486726, "grad_norm": 0.17148281182853317, "learning_rate": 4.821820423763285e-07, "loss": 0.0076, "step": 9768 }, { "epoch": 2.619737194958434, "grad_norm": 0.24628890803544362, "learning_rate": 4.815137939359176e-07, "loss": 0.0115, "step": 9769 }, { "epoch": 2.620005363368195, "grad_norm": 0.32299478922866015, "learning_rate": 4.808459854485114e-07, "loss": 0.0154, "step": 9770 }, { "epoch": 2.6202735317779564, "grad_norm": 0.2376250155801046, "learning_rate": 4.801786169791339e-07, "loss": 0.0139, "step": 9771 }, { "epoch": 2.620541700187718, "grad_norm": 0.23268253835744912, "learning_rate": 4.795116885927653e-07, "loss": 0.0141, "step": 9772 }, { "epoch": 2.620809868597479, "grad_norm": 0.41334527447785374, "learning_rate": 4.788452003543409e-07, "loss": 0.0239, "step": 9773 }, { "epoch": 2.6210780370072406, "grad_norm": 0.26891582381501156, "learning_rate": 4.781791523287566e-07, "loss": 0.0127, "step": 9774 }, { "epoch": 2.621346205417002, "grad_norm": 0.21680444918960082, "learning_rate": 4.77513544580862e-07, "loss": 0.0134, "step": 9775 }, { "epoch": 2.6216143738267634, "grad_norm": 0.27399730141778095, "learning_rate": 4.768483771754684e-07, "loss": 0.0153, "step": 9776 }, { "epoch": 2.6218825422365244, "grad_norm": 0.3548706812323244, "learning_rate": 4.761836501773376e-07, "loss": 0.014, "step": 9777 }, { "epoch": 2.622150710646286, "grad_norm": 0.21717215107681162, "learning_rate": 4.755193636511957e-07, "loss": 0.0069, "step": 9778 }, { "epoch": 2.622418879056047, "grad_norm": 0.2603111082292883, "learning_rate": 4.7485551766172146e-07, "loss": 0.0149, "step": 9779 }, { "epoch": 2.6226870474658086, "grad_norm": 0.2722953301480247, "learning_rate": 4.741921122735499e-07, "loss": 0.0166, "step": 9780 }, { "epoch": 2.62295521587557, "grad_norm": 0.21845657531053783, "learning_rate": 4.7352914755127764e-07, "loss": 0.0159, "step": 9781 }, { "epoch": 2.623223384285331, "grad_norm": 0.3281756138935632, "learning_rate": 4.7286662355945425e-07, "loss": 0.0139, "step": 9782 }, { "epoch": 2.6234915526950924, "grad_norm": 0.3985935920349267, "learning_rate": 4.7220454036258803e-07, "loss": 0.0131, "step": 9783 }, { "epoch": 2.623759721104854, "grad_norm": 0.2804698960480728, "learning_rate": 4.7154289802514574e-07, "loss": 0.0175, "step": 9784 }, { "epoch": 2.624027889514615, "grad_norm": 0.2511606691421615, "learning_rate": 4.708816966115476e-07, "loss": 0.013, "step": 9785 }, { "epoch": 2.6242960579243766, "grad_norm": 0.25376220842293135, "learning_rate": 4.702209361861748e-07, "loss": 0.011, "step": 9786 }, { "epoch": 2.624564226334138, "grad_norm": 0.20517549592946643, "learning_rate": 4.6956061681336263e-07, "loss": 0.009, "step": 9787 }, { "epoch": 2.6248323947438994, "grad_norm": 0.2343920918830164, "learning_rate": 4.689007385574046e-07, "loss": 0.0139, "step": 9788 }, { "epoch": 2.6251005631536604, "grad_norm": 0.33759617571009437, "learning_rate": 4.682413014825532e-07, "loss": 0.0167, "step": 9789 }, { "epoch": 2.6253687315634218, "grad_norm": 0.20875591195913856, "learning_rate": 4.6758230565301487e-07, "loss": 0.0133, "step": 9790 }, { "epoch": 2.625636899973183, "grad_norm": 0.20393149873070954, "learning_rate": 4.669237511329533e-07, "loss": 0.0119, "step": 9791 }, { "epoch": 2.6259050683829446, "grad_norm": 0.21029820252477016, "learning_rate": 4.6626563798649105e-07, "loss": 0.0118, "step": 9792 }, { "epoch": 2.626173236792706, "grad_norm": 0.30791521940225175, "learning_rate": 4.6560796627770577e-07, "loss": 0.0142, "step": 9793 }, { "epoch": 2.626441405202467, "grad_norm": 0.23836795309601663, "learning_rate": 4.649507360706362e-07, "loss": 0.019, "step": 9794 }, { "epoch": 2.6267095736122283, "grad_norm": 0.2723769139136709, "learning_rate": 4.642939474292713e-07, "loss": 0.0171, "step": 9795 }, { "epoch": 2.6269777420219897, "grad_norm": 0.2520573065321896, "learning_rate": 4.636376004175641e-07, "loss": 0.0139, "step": 9796 }, { "epoch": 2.627245910431751, "grad_norm": 0.25201012070466955, "learning_rate": 4.629816950994187e-07, "loss": 0.0138, "step": 9797 }, { "epoch": 2.6275140788415126, "grad_norm": 0.23223372355504587, "learning_rate": 4.623262315387017e-07, "loss": 0.0101, "step": 9798 }, { "epoch": 2.627782247251274, "grad_norm": 0.1977085873796665, "learning_rate": 4.616712097992304e-07, "loss": 0.0113, "step": 9799 }, { "epoch": 2.6280504156610354, "grad_norm": 0.2658375286942348, "learning_rate": 4.610166299447849e-07, "loss": 0.0131, "step": 9800 }, { "epoch": 2.6283185840707963, "grad_norm": 0.2848537874996271, "learning_rate": 4.603624920391003e-07, "loss": 0.0152, "step": 9801 }, { "epoch": 2.6285867524805577, "grad_norm": 0.2395367492887739, "learning_rate": 4.597087961458674e-07, "loss": 0.0136, "step": 9802 }, { "epoch": 2.628854920890319, "grad_norm": 0.24939657547570745, "learning_rate": 4.590555423287335e-07, "loss": 0.0085, "step": 9803 }, { "epoch": 2.6291230893000805, "grad_norm": 0.3005423809678072, "learning_rate": 4.584027306513067e-07, "loss": 0.0236, "step": 9804 }, { "epoch": 2.629391257709842, "grad_norm": 0.2680296278567531, "learning_rate": 4.577503611771472e-07, "loss": 0.0157, "step": 9805 }, { "epoch": 2.629659426119603, "grad_norm": 0.30993228623597807, "learning_rate": 4.5709843396977704e-07, "loss": 0.0188, "step": 9806 }, { "epoch": 2.6299275945293643, "grad_norm": 0.24181180880188047, "learning_rate": 4.5644694909266984e-07, "loss": 0.012, "step": 9807 }, { "epoch": 2.6301957629391257, "grad_norm": 0.33592756844666455, "learning_rate": 4.5579590660926097e-07, "loss": 0.0124, "step": 9808 }, { "epoch": 2.630463931348887, "grad_norm": 0.30988412959605455, "learning_rate": 4.551453065829392e-07, "loss": 0.0152, "step": 9809 }, { "epoch": 2.6307320997586485, "grad_norm": 0.2917006963761313, "learning_rate": 4.544951490770527e-07, "loss": 0.0184, "step": 9810 }, { "epoch": 2.6310002681684095, "grad_norm": 0.3244891526455222, "learning_rate": 4.5384543415490646e-07, "loss": 0.0206, "step": 9811 }, { "epoch": 2.6312684365781713, "grad_norm": 0.3140541456906654, "learning_rate": 4.5319616187975976e-07, "loss": 0.0217, "step": 9812 }, { "epoch": 2.6315366049879323, "grad_norm": 0.3144144139788163, "learning_rate": 4.5254733231483216e-07, "loss": 0.0177, "step": 9813 }, { "epoch": 2.6318047733976937, "grad_norm": 0.25031555801201266, "learning_rate": 4.5189894552329696e-07, "loss": 0.0111, "step": 9814 }, { "epoch": 2.632072941807455, "grad_norm": 0.297418032946646, "learning_rate": 4.5125100156828637e-07, "loss": 0.0162, "step": 9815 }, { "epoch": 2.6323411102172165, "grad_norm": 0.20373063294885077, "learning_rate": 4.506035005128895e-07, "loss": 0.0105, "step": 9816 }, { "epoch": 2.632609278626978, "grad_norm": 0.2437427539335146, "learning_rate": 4.4995644242015026e-07, "loss": 0.0119, "step": 9817 }, { "epoch": 2.632877447036739, "grad_norm": 0.22189272201829352, "learning_rate": 4.493098273530733e-07, "loss": 0.0111, "step": 9818 }, { "epoch": 2.6331456154465003, "grad_norm": 0.19138236566282554, "learning_rate": 4.4866365537461543e-07, "loss": 0.0124, "step": 9819 }, { "epoch": 2.6334137838562617, "grad_norm": 0.2233555139810297, "learning_rate": 4.4801792654769416e-07, "loss": 0.0121, "step": 9820 }, { "epoch": 2.633681952266023, "grad_norm": 0.22108736176406157, "learning_rate": 4.473726409351831e-07, "loss": 0.0098, "step": 9821 }, { "epoch": 2.6339501206757845, "grad_norm": 0.2953773818524073, "learning_rate": 4.467277985999097e-07, "loss": 0.0133, "step": 9822 }, { "epoch": 2.6342182890855455, "grad_norm": 0.4509975397616075, "learning_rate": 4.460833996046632e-07, "loss": 0.0127, "step": 9823 }, { "epoch": 2.6344864574953073, "grad_norm": 0.44251804461912303, "learning_rate": 4.454394440121845e-07, "loss": 0.0132, "step": 9824 }, { "epoch": 2.6347546259050683, "grad_norm": 0.2461787367495413, "learning_rate": 4.4479593188517576e-07, "loss": 0.0158, "step": 9825 }, { "epoch": 2.6350227943148297, "grad_norm": 0.3643113212431365, "learning_rate": 4.4415286328629346e-07, "loss": 0.0256, "step": 9826 }, { "epoch": 2.635290962724591, "grad_norm": 0.2777108198795846, "learning_rate": 4.435102382781503e-07, "loss": 0.015, "step": 9827 }, { "epoch": 2.6355591311343525, "grad_norm": 0.23314741268608813, "learning_rate": 4.4286805692331905e-07, "loss": 0.0151, "step": 9828 }, { "epoch": 2.635827299544114, "grad_norm": 0.26955267809173183, "learning_rate": 4.4222631928432566e-07, "loss": 0.0114, "step": 9829 }, { "epoch": 2.636095467953875, "grad_norm": 0.33377537973207594, "learning_rate": 4.415850254236542e-07, "loss": 0.0189, "step": 9830 }, { "epoch": 2.6363636363636362, "grad_norm": 0.24738662950534235, "learning_rate": 4.4094417540374745e-07, "loss": 0.0171, "step": 9831 }, { "epoch": 2.6366318047733976, "grad_norm": 0.2643976740493939, "learning_rate": 4.4030376928700156e-07, "loss": 0.0161, "step": 9832 }, { "epoch": 2.636899973183159, "grad_norm": 0.22302477916559263, "learning_rate": 4.396638071357728e-07, "loss": 0.0114, "step": 9833 }, { "epoch": 2.6371681415929205, "grad_norm": 0.23656257630278724, "learning_rate": 4.3902428901237083e-07, "loss": 0.0109, "step": 9834 }, { "epoch": 2.6374363100026814, "grad_norm": 0.25217439672653874, "learning_rate": 4.3838521497906574e-07, "loss": 0.0104, "step": 9835 }, { "epoch": 2.6377044784124433, "grad_norm": 0.2515161105036834, "learning_rate": 4.3774658509808055e-07, "loss": 0.009, "step": 9836 }, { "epoch": 2.6379726468222042, "grad_norm": 0.1979322847341452, "learning_rate": 4.371083994315972e-07, "loss": 0.012, "step": 9837 }, { "epoch": 2.6382408152319656, "grad_norm": 0.2135186068143742, "learning_rate": 4.3647065804175713e-07, "loss": 0.013, "step": 9838 }, { "epoch": 2.638508983641727, "grad_norm": 0.3826903203181729, "learning_rate": 4.358333609906512e-07, "loss": 0.0242, "step": 9839 }, { "epoch": 2.6387771520514884, "grad_norm": 0.22563769890435414, "learning_rate": 4.351965083403331e-07, "loss": 0.0134, "step": 9840 }, { "epoch": 2.63904532046125, "grad_norm": 0.2094758044403681, "learning_rate": 4.3456010015281267e-07, "loss": 0.0176, "step": 9841 }, { "epoch": 2.639313488871011, "grad_norm": 0.2957474311337995, "learning_rate": 4.339241364900537e-07, "loss": 0.0157, "step": 9842 }, { "epoch": 2.639581657280772, "grad_norm": 0.2944733433078101, "learning_rate": 4.332886174139794e-07, "loss": 0.0178, "step": 9843 }, { "epoch": 2.6398498256905336, "grad_norm": 0.22228963181782915, "learning_rate": 4.32653542986467e-07, "loss": 0.0113, "step": 9844 }, { "epoch": 2.640117994100295, "grad_norm": 0.24811066486274366, "learning_rate": 4.320189132693542e-07, "loss": 0.0119, "step": 9845 }, { "epoch": 2.6403861625100564, "grad_norm": 0.21756295545833926, "learning_rate": 4.31384728324431e-07, "loss": 0.0105, "step": 9846 }, { "epoch": 2.6406543309198174, "grad_norm": 0.29373081143815316, "learning_rate": 4.3075098821344754e-07, "loss": 0.0199, "step": 9847 }, { "epoch": 2.6409224993295792, "grad_norm": 0.2799336473878965, "learning_rate": 4.3011769299811047e-07, "loss": 0.0205, "step": 9848 }, { "epoch": 2.64119066773934, "grad_norm": 0.23688884869691956, "learning_rate": 4.294848427400794e-07, "loss": 0.013, "step": 9849 }, { "epoch": 2.6414588361491016, "grad_norm": 0.18740649651446645, "learning_rate": 4.288524375009756e-07, "loss": 0.0092, "step": 9850 }, { "epoch": 2.641727004558863, "grad_norm": 0.23925814990333194, "learning_rate": 4.282204773423737e-07, "loss": 0.0113, "step": 9851 }, { "epoch": 2.6419951729686244, "grad_norm": 0.2186495537695046, "learning_rate": 4.275889623258056e-07, "loss": 0.0112, "step": 9852 }, { "epoch": 2.642263341378386, "grad_norm": 0.22989413921777618, "learning_rate": 4.2695789251276154e-07, "loss": 0.0098, "step": 9853 }, { "epoch": 2.6425315097881468, "grad_norm": 0.2155083106354327, "learning_rate": 4.2632726796468515e-07, "loss": 0.0085, "step": 9854 }, { "epoch": 2.642799678197908, "grad_norm": 0.3364542638720638, "learning_rate": 4.2569708874298123e-07, "loss": 0.0197, "step": 9855 }, { "epoch": 2.6430678466076696, "grad_norm": 0.25283141312927687, "learning_rate": 4.2506735490900565e-07, "loss": 0.0185, "step": 9856 }, { "epoch": 2.643336015017431, "grad_norm": 0.23132672763976034, "learning_rate": 4.2443806652407605e-07, "loss": 0.0109, "step": 9857 }, { "epoch": 2.6436041834271924, "grad_norm": 0.25488881497009275, "learning_rate": 4.2380922364946455e-07, "loss": 0.0146, "step": 9858 }, { "epoch": 2.6438723518369533, "grad_norm": 0.2321640255945172, "learning_rate": 4.2318082634639877e-07, "loss": 0.0155, "step": 9859 }, { "epoch": 2.644140520246715, "grad_norm": 0.20403663081272436, "learning_rate": 4.225528746760654e-07, "loss": 0.0116, "step": 9860 }, { "epoch": 2.644408688656476, "grad_norm": 0.23332878834091897, "learning_rate": 4.21925368699605e-07, "loss": 0.0166, "step": 9861 }, { "epoch": 2.6446768570662376, "grad_norm": 0.24009026344660306, "learning_rate": 4.212983084781175e-07, "loss": 0.0126, "step": 9862 }, { "epoch": 2.644945025475999, "grad_norm": 0.24662279747164562, "learning_rate": 4.2067169407265793e-07, "loss": 0.012, "step": 9863 }, { "epoch": 2.6452131938857604, "grad_norm": 0.25749865654823295, "learning_rate": 4.200455255442365e-07, "loss": 0.0183, "step": 9864 }, { "epoch": 2.6454813622955218, "grad_norm": 0.2518921503901912, "learning_rate": 4.194198029538238e-07, "loss": 0.0166, "step": 9865 }, { "epoch": 2.6457495307052827, "grad_norm": 0.25365812183381214, "learning_rate": 4.187945263623422e-07, "loss": 0.0118, "step": 9866 }, { "epoch": 2.646017699115044, "grad_norm": 0.24524542855479292, "learning_rate": 4.1816969583067526e-07, "loss": 0.0171, "step": 9867 }, { "epoch": 2.6462858675248055, "grad_norm": 0.24311296221316883, "learning_rate": 4.175453114196615e-07, "loss": 0.0118, "step": 9868 }, { "epoch": 2.646554035934567, "grad_norm": 0.23222526506686028, "learning_rate": 4.169213731900934e-07, "loss": 0.014, "step": 9869 }, { "epoch": 2.6468222043443284, "grad_norm": 0.33713511906599675, "learning_rate": 4.162978812027241e-07, "loss": 0.0211, "step": 9870 }, { "epoch": 2.6470903727540893, "grad_norm": 0.31605268287240623, "learning_rate": 4.1567483551825995e-07, "loss": 0.0141, "step": 9871 }, { "epoch": 2.647358541163851, "grad_norm": 0.26787611519938204, "learning_rate": 4.150522361973669e-07, "loss": 0.013, "step": 9872 }, { "epoch": 2.647626709573612, "grad_norm": 0.2985614483839486, "learning_rate": 4.144300833006637e-07, "loss": 0.0143, "step": 9873 }, { "epoch": 2.6478948779833735, "grad_norm": 0.25781079857726485, "learning_rate": 4.138083768887291e-07, "loss": 0.0135, "step": 9874 }, { "epoch": 2.648163046393135, "grad_norm": 0.2764480029935245, "learning_rate": 4.13187117022098e-07, "loss": 0.0142, "step": 9875 }, { "epoch": 2.6484312148028963, "grad_norm": 0.24696743261577228, "learning_rate": 4.125663037612582e-07, "loss": 0.0171, "step": 9876 }, { "epoch": 2.6486993832126577, "grad_norm": 0.271617375629913, "learning_rate": 4.11945937166659e-07, "loss": 0.0225, "step": 9877 }, { "epoch": 2.6489675516224187, "grad_norm": 0.29396545343999836, "learning_rate": 4.113260172987016e-07, "loss": 0.0187, "step": 9878 }, { "epoch": 2.64923572003218, "grad_norm": 0.5704337946212032, "learning_rate": 4.1070654421774767e-07, "loss": 0.0102, "step": 9879 }, { "epoch": 2.6495038884419415, "grad_norm": 0.2939756820643174, "learning_rate": 4.100875179841135e-07, "loss": 0.0174, "step": 9880 }, { "epoch": 2.649772056851703, "grad_norm": 0.2531099141512188, "learning_rate": 4.0946893865807137e-07, "loss": 0.0137, "step": 9881 }, { "epoch": 2.6500402252614643, "grad_norm": 0.2415155202745602, "learning_rate": 4.0885080629985143e-07, "loss": 0.0122, "step": 9882 }, { "epoch": 2.6503083936712253, "grad_norm": 0.20514956260947687, "learning_rate": 4.0823312096963883e-07, "loss": 0.0118, "step": 9883 }, { "epoch": 2.650576562080987, "grad_norm": 0.23098217548736805, "learning_rate": 4.0761588272757614e-07, "loss": 0.0114, "step": 9884 }, { "epoch": 2.650844730490748, "grad_norm": 0.2800739071676577, "learning_rate": 4.0699909163376293e-07, "loss": 0.014, "step": 9885 }, { "epoch": 2.6511128989005095, "grad_norm": 0.27524636518215334, "learning_rate": 4.063827477482535e-07, "loss": 0.0153, "step": 9886 }, { "epoch": 2.651381067310271, "grad_norm": 0.19079089821662834, "learning_rate": 4.0576685113106196e-07, "loss": 0.0109, "step": 9887 }, { "epoch": 2.6516492357200323, "grad_norm": 0.2543506694931669, "learning_rate": 4.051514018421526e-07, "loss": 0.0153, "step": 9888 }, { "epoch": 2.6519174041297937, "grad_norm": 0.2010846590263328, "learning_rate": 4.0453639994145257e-07, "loss": 0.0126, "step": 9889 }, { "epoch": 2.6521855725395547, "grad_norm": 0.4028046531956334, "learning_rate": 4.0392184548884326e-07, "loss": 0.0228, "step": 9890 }, { "epoch": 2.652453740949316, "grad_norm": 0.2584006098997406, "learning_rate": 4.0330773854416025e-07, "loss": 0.0138, "step": 9891 }, { "epoch": 2.6527219093590775, "grad_norm": 0.3381562416319299, "learning_rate": 4.0269407916720016e-07, "loss": 0.0253, "step": 9892 }, { "epoch": 2.652990077768839, "grad_norm": 0.2820720644139439, "learning_rate": 4.020808674177107e-07, "loss": 0.0241, "step": 9893 }, { "epoch": 2.6532582461786003, "grad_norm": 0.29792382192122213, "learning_rate": 4.0146810335540074e-07, "loss": 0.015, "step": 9894 }, { "epoch": 2.6535264145883612, "grad_norm": 0.2398124551563542, "learning_rate": 4.008557870399321e-07, "loss": 0.0106, "step": 9895 }, { "epoch": 2.653794582998123, "grad_norm": 0.21783553808927528, "learning_rate": 4.002439185309248e-07, "loss": 0.0115, "step": 9896 }, { "epoch": 2.654062751407884, "grad_norm": 0.2614448084487735, "learning_rate": 3.996324978879562e-07, "loss": 0.0143, "step": 9897 }, { "epoch": 2.6543309198176455, "grad_norm": 0.3224886744583689, "learning_rate": 3.990215251705565e-07, "loss": 0.0193, "step": 9898 }, { "epoch": 2.654599088227407, "grad_norm": 0.24490896523396652, "learning_rate": 3.9841100043821637e-07, "loss": 0.0175, "step": 9899 }, { "epoch": 2.6548672566371683, "grad_norm": 0.21172278300241065, "learning_rate": 3.978009237503805e-07, "loss": 0.0115, "step": 9900 }, { "epoch": 2.6551354250469297, "grad_norm": 0.260501669870005, "learning_rate": 3.971912951664486e-07, "loss": 0.0148, "step": 9901 }, { "epoch": 2.6554035934566906, "grad_norm": 0.2383566029409828, "learning_rate": 3.9658211474578155e-07, "loss": 0.0131, "step": 9902 }, { "epoch": 2.655671761866452, "grad_norm": 0.2862660002085008, "learning_rate": 3.959733825476908e-07, "loss": 0.0136, "step": 9903 }, { "epoch": 2.6559399302762134, "grad_norm": 0.24669327237790925, "learning_rate": 3.9536509863144944e-07, "loss": 0.015, "step": 9904 }, { "epoch": 2.656208098685975, "grad_norm": 0.24262496711855822, "learning_rate": 3.9475726305628237e-07, "loss": 0.0139, "step": 9905 }, { "epoch": 2.6564762670957363, "grad_norm": 0.31579566406836074, "learning_rate": 3.941498758813739e-07, "loss": 0.0142, "step": 9906 }, { "epoch": 2.656744435505497, "grad_norm": 0.33095626950689366, "learning_rate": 3.9354293716586443e-07, "loss": 0.0182, "step": 9907 }, { "epoch": 2.6570126039152586, "grad_norm": 0.24178644664050422, "learning_rate": 3.9293644696884904e-07, "loss": 0.0159, "step": 9908 }, { "epoch": 2.65728077232502, "grad_norm": 0.282206204568637, "learning_rate": 3.923304053493804e-07, "loss": 0.012, "step": 9909 }, { "epoch": 2.6575489407347814, "grad_norm": 0.3021979266117166, "learning_rate": 3.9172481236646685e-07, "loss": 0.0195, "step": 9910 }, { "epoch": 2.657817109144543, "grad_norm": 0.208181351757412, "learning_rate": 3.911196680790741e-07, "loss": 0.0117, "step": 9911 }, { "epoch": 2.6580852775543042, "grad_norm": 0.2351755871462643, "learning_rate": 3.905149725461227e-07, "loss": 0.0168, "step": 9912 }, { "epoch": 2.6583534459640656, "grad_norm": 0.20800535440184445, "learning_rate": 3.8991072582649003e-07, "loss": 0.0108, "step": 9913 }, { "epoch": 2.6586216143738266, "grad_norm": 0.2209868783984825, "learning_rate": 3.893069279790113e-07, "loss": 0.0161, "step": 9914 }, { "epoch": 2.658889782783588, "grad_norm": 0.21700817440257628, "learning_rate": 3.8870357906247434e-07, "loss": 0.009, "step": 9915 }, { "epoch": 2.6591579511933494, "grad_norm": 0.25151635547065454, "learning_rate": 3.881006791356279e-07, "loss": 0.015, "step": 9916 }, { "epoch": 2.659426119603111, "grad_norm": 0.21806750130938918, "learning_rate": 3.874982282571743e-07, "loss": 0.0095, "step": 9917 }, { "epoch": 2.659694288012872, "grad_norm": 0.2104150451838226, "learning_rate": 3.868962264857717e-07, "loss": 0.0105, "step": 9918 }, { "epoch": 2.659962456422633, "grad_norm": 0.24542319231381937, "learning_rate": 3.8629467388003717e-07, "loss": 0.0163, "step": 9919 }, { "epoch": 2.6602306248323946, "grad_norm": 0.25043667118971946, "learning_rate": 3.8569357049853986e-07, "loss": 0.0136, "step": 9920 }, { "epoch": 2.660498793242156, "grad_norm": 0.20232480405205847, "learning_rate": 3.850929163998096e-07, "loss": 0.009, "step": 9921 }, { "epoch": 2.6607669616519174, "grad_norm": 0.2579181932483036, "learning_rate": 3.844927116423286e-07, "loss": 0.0125, "step": 9922 }, { "epoch": 2.661035130061679, "grad_norm": 0.27956837188709976, "learning_rate": 3.8389295628454003e-07, "loss": 0.0155, "step": 9923 }, { "epoch": 2.66130329847144, "grad_norm": 0.23746760177657622, "learning_rate": 3.8329365038483776e-07, "loss": 0.0126, "step": 9924 }, { "epoch": 2.6615714668812016, "grad_norm": 0.25289643184437777, "learning_rate": 3.826947940015757e-07, "loss": 0.0098, "step": 9925 }, { "epoch": 2.6618396352909626, "grad_norm": 0.24140113000033572, "learning_rate": 3.820963871930622e-07, "loss": 0.013, "step": 9926 }, { "epoch": 2.662107803700724, "grad_norm": 0.2744140528814186, "learning_rate": 3.814984300175645e-07, "loss": 0.0092, "step": 9927 }, { "epoch": 2.6623759721104854, "grad_norm": 0.28154436701574337, "learning_rate": 3.8090092253330156e-07, "loss": 0.0163, "step": 9928 }, { "epoch": 2.662644140520247, "grad_norm": 0.18346045057850063, "learning_rate": 3.803038647984536e-07, "loss": 0.0088, "step": 9929 }, { "epoch": 2.662912308930008, "grad_norm": 0.2980954695695256, "learning_rate": 3.7970725687115184e-07, "loss": 0.0127, "step": 9930 }, { "epoch": 2.663180477339769, "grad_norm": 0.2217523005011196, "learning_rate": 3.7911109880948924e-07, "loss": 0.0139, "step": 9931 }, { "epoch": 2.6634486457495306, "grad_norm": 0.25559064548276506, "learning_rate": 3.7851539067150944e-07, "loss": 0.0155, "step": 9932 }, { "epoch": 2.663716814159292, "grad_norm": 0.250660682316538, "learning_rate": 3.779201325152165e-07, "loss": 0.0128, "step": 9933 }, { "epoch": 2.6639849825690534, "grad_norm": 0.2774146533516982, "learning_rate": 3.7732532439856975e-07, "loss": 0.0145, "step": 9934 }, { "epoch": 2.6642531509788148, "grad_norm": 0.21089144504948068, "learning_rate": 3.767309663794833e-07, "loss": 0.0093, "step": 9935 }, { "epoch": 2.664521319388576, "grad_norm": 0.21204652663002455, "learning_rate": 3.761370585158269e-07, "loss": 0.0118, "step": 9936 }, { "epoch": 2.6647894877983376, "grad_norm": 0.6118608132707983, "learning_rate": 3.7554360086543005e-07, "loss": 0.0143, "step": 9937 }, { "epoch": 2.6650576562080985, "grad_norm": 0.2279560594254162, "learning_rate": 3.749505934860742e-07, "loss": 0.0131, "step": 9938 }, { "epoch": 2.66532582461786, "grad_norm": 0.20383163567640913, "learning_rate": 3.743580364355004e-07, "loss": 0.011, "step": 9939 }, { "epoch": 2.6655939930276213, "grad_norm": 0.2467789087161366, "learning_rate": 3.737659297714036e-07, "loss": 0.0151, "step": 9940 }, { "epoch": 2.6658621614373827, "grad_norm": 0.2474587250821761, "learning_rate": 3.7317427355143596e-07, "loss": 0.0149, "step": 9941 }, { "epoch": 2.666130329847144, "grad_norm": 0.31730773645879573, "learning_rate": 3.7258306783320484e-07, "loss": 0.0176, "step": 9942 }, { "epoch": 2.666398498256905, "grad_norm": 0.21477088936597902, "learning_rate": 3.719923126742747e-07, "loss": 0.0097, "step": 9943 }, { "epoch": 2.6666666666666665, "grad_norm": 0.21231930654647876, "learning_rate": 3.7140200813216677e-07, "loss": 0.011, "step": 9944 }, { "epoch": 2.666934835076428, "grad_norm": 0.3015072503116844, "learning_rate": 3.708121542643561e-07, "loss": 0.0218, "step": 9945 }, { "epoch": 2.6672030034861893, "grad_norm": 0.27169801782034875, "learning_rate": 3.7022275112827574e-07, "loss": 0.0144, "step": 9946 }, { "epoch": 2.6674711718959507, "grad_norm": 0.26881184706848005, "learning_rate": 3.6963379878131534e-07, "loss": 0.0169, "step": 9947 }, { "epoch": 2.667739340305712, "grad_norm": 0.3009968688747181, "learning_rate": 3.690452972808173e-07, "loss": 0.0191, "step": 9948 }, { "epoch": 2.6680075087154735, "grad_norm": 0.22071975309575997, "learning_rate": 3.684572466840841e-07, "loss": 0.012, "step": 9949 }, { "epoch": 2.6682756771252345, "grad_norm": 0.28707466484422095, "learning_rate": 3.678696470483717e-07, "loss": 0.0122, "step": 9950 }, { "epoch": 2.668543845534996, "grad_norm": 0.33301111582288656, "learning_rate": 3.672824984308948e-07, "loss": 0.0152, "step": 9951 }, { "epoch": 2.6688120139447573, "grad_norm": 0.21585744630120418, "learning_rate": 3.666958008888205e-07, "loss": 0.0111, "step": 9952 }, { "epoch": 2.6690801823545187, "grad_norm": 0.2069907174205666, "learning_rate": 3.6610955447927476e-07, "loss": 0.0112, "step": 9953 }, { "epoch": 2.66934835076428, "grad_norm": 0.2766982426267025, "learning_rate": 3.655237592593397e-07, "loss": 0.0103, "step": 9954 }, { "epoch": 2.669616519174041, "grad_norm": 0.22229575511824387, "learning_rate": 3.6493841528605134e-07, "loss": 0.0123, "step": 9955 }, { "epoch": 2.6698846875838025, "grad_norm": 0.207746300194602, "learning_rate": 3.643535226164041e-07, "loss": 0.0117, "step": 9956 }, { "epoch": 2.670152855993564, "grad_norm": 0.22165786019957717, "learning_rate": 3.637690813073469e-07, "loss": 0.011, "step": 9957 }, { "epoch": 2.6704210244033253, "grad_norm": 0.2308280488204009, "learning_rate": 3.631850914157853e-07, "loss": 0.0129, "step": 9958 }, { "epoch": 2.6706891928130867, "grad_norm": 0.2471966523237142, "learning_rate": 3.62601552998581e-07, "loss": 0.0129, "step": 9959 }, { "epoch": 2.670957361222848, "grad_norm": 0.2732889250894578, "learning_rate": 3.620184661125509e-07, "loss": 0.0139, "step": 9960 }, { "epoch": 2.6712255296326095, "grad_norm": 0.2131520220396545, "learning_rate": 3.6143583081447e-07, "loss": 0.0087, "step": 9961 }, { "epoch": 2.6714936980423705, "grad_norm": 0.2578249048950083, "learning_rate": 3.6085364716106575e-07, "loss": 0.0155, "step": 9962 }, { "epoch": 2.671761866452132, "grad_norm": 0.1970000339008575, "learning_rate": 3.602719152090256e-07, "loss": 0.0086, "step": 9963 }, { "epoch": 2.6720300348618933, "grad_norm": 0.2909319937868586, "learning_rate": 3.5969063501499146e-07, "loss": 0.0156, "step": 9964 }, { "epoch": 2.6722982032716547, "grad_norm": 0.34994095477867876, "learning_rate": 3.591098066355597e-07, "loss": 0.0227, "step": 9965 }, { "epoch": 2.672566371681416, "grad_norm": 0.23894680303076332, "learning_rate": 3.5852943012728567e-07, "loss": 0.0098, "step": 9966 }, { "epoch": 2.672834540091177, "grad_norm": 0.20276268045719037, "learning_rate": 3.579495055466775e-07, "loss": 0.0103, "step": 9967 }, { "epoch": 2.6731027085009385, "grad_norm": 0.3392770049157492, "learning_rate": 3.5737003295020165e-07, "loss": 0.014, "step": 9968 }, { "epoch": 2.6733708769107, "grad_norm": 0.32768824229427923, "learning_rate": 3.5679101239427973e-07, "loss": 0.0242, "step": 9969 }, { "epoch": 2.6736390453204613, "grad_norm": 0.27250459274638705, "learning_rate": 3.562124439352893e-07, "loss": 0.0131, "step": 9970 }, { "epoch": 2.6739072137302227, "grad_norm": 0.2701576415798605, "learning_rate": 3.556343276295654e-07, "loss": 0.0156, "step": 9971 }, { "epoch": 2.674175382139984, "grad_norm": 0.24944676155894252, "learning_rate": 3.550566635333952e-07, "loss": 0.0194, "step": 9972 }, { "epoch": 2.6744435505497455, "grad_norm": 0.2739204898929503, "learning_rate": 3.544794517030259e-07, "loss": 0.0177, "step": 9973 }, { "epoch": 2.6747117189595064, "grad_norm": 0.28043078030697444, "learning_rate": 3.539026921946587e-07, "loss": 0.016, "step": 9974 }, { "epoch": 2.674979887369268, "grad_norm": 0.30813056278004913, "learning_rate": 3.533263850644508e-07, "loss": 0.02, "step": 9975 }, { "epoch": 2.6752480557790292, "grad_norm": 0.20532239152065507, "learning_rate": 3.527505303685169e-07, "loss": 0.0102, "step": 9976 }, { "epoch": 2.6755162241887906, "grad_norm": 0.27545291431470365, "learning_rate": 3.5217512816292486e-07, "loss": 0.014, "step": 9977 }, { "epoch": 2.675784392598552, "grad_norm": 0.2306366449208292, "learning_rate": 3.5160017850370154e-07, "loss": 0.012, "step": 9978 }, { "epoch": 2.676052561008313, "grad_norm": 0.23143912620176887, "learning_rate": 3.5102568144682713e-07, "loss": 0.0094, "step": 9979 }, { "epoch": 2.6763207294180744, "grad_norm": 0.21756483303717467, "learning_rate": 3.5045163704823914e-07, "loss": 0.0123, "step": 9980 }, { "epoch": 2.676588897827836, "grad_norm": 0.2217713308136457, "learning_rate": 3.4987804536383174e-07, "loss": 0.0102, "step": 9981 }, { "epoch": 2.6768570662375972, "grad_norm": 0.19865529999219103, "learning_rate": 3.4930490644945246e-07, "loss": 0.0102, "step": 9982 }, { "epoch": 2.6771252346473586, "grad_norm": 0.2311762957651549, "learning_rate": 3.487322203609084e-07, "loss": 0.0128, "step": 9983 }, { "epoch": 2.67739340305712, "grad_norm": 0.20409204229367964, "learning_rate": 3.4815998715395816e-07, "loss": 0.0128, "step": 9984 }, { "epoch": 2.6776615714668814, "grad_norm": 0.23685875086382915, "learning_rate": 3.475882068843189e-07, "loss": 0.011, "step": 9985 }, { "epoch": 2.6779297398766424, "grad_norm": 0.20661139493652383, "learning_rate": 3.4701687960766504e-07, "loss": 0.0149, "step": 9986 }, { "epoch": 2.678197908286404, "grad_norm": 0.25825783266007146, "learning_rate": 3.464460053796237e-07, "loss": 0.0115, "step": 9987 }, { "epoch": 2.678466076696165, "grad_norm": 0.2449742580834682, "learning_rate": 3.458755842557804e-07, "loss": 0.0121, "step": 9988 }, { "epoch": 2.6787342451059266, "grad_norm": 0.22563721455622898, "learning_rate": 3.4530561629167457e-07, "loss": 0.0159, "step": 9989 }, { "epoch": 2.679002413515688, "grad_norm": 0.17115396014121265, "learning_rate": 3.44736101542803e-07, "loss": 0.0078, "step": 9990 }, { "epoch": 2.679270581925449, "grad_norm": 0.2981313032681861, "learning_rate": 3.4416704006461744e-07, "loss": 0.0169, "step": 9991 }, { "epoch": 2.6795387503352104, "grad_norm": 0.18473567185976986, "learning_rate": 3.435984319125263e-07, "loss": 0.0089, "step": 9992 }, { "epoch": 2.679806918744972, "grad_norm": 0.22467329248265597, "learning_rate": 3.4303027714189366e-07, "loss": 0.0092, "step": 9993 }, { "epoch": 2.680075087154733, "grad_norm": 0.20197785419558836, "learning_rate": 3.4246257580803855e-07, "loss": 0.0088, "step": 9994 }, { "epoch": 2.6803432555644946, "grad_norm": 0.2750637967870523, "learning_rate": 3.418953279662374e-07, "loss": 0.0113, "step": 9995 }, { "epoch": 2.6806114239742556, "grad_norm": 0.27544182920746385, "learning_rate": 3.4132853367172145e-07, "loss": 0.0133, "step": 9996 }, { "epoch": 2.6808795923840174, "grad_norm": 0.21877867924015149, "learning_rate": 3.407621929796767e-07, "loss": 0.0133, "step": 9997 }, { "epoch": 2.6811477607937784, "grad_norm": 0.3302875489444598, "learning_rate": 3.401963059452479e-07, "loss": 0.0139, "step": 9998 }, { "epoch": 2.6814159292035398, "grad_norm": 0.20531504165831338, "learning_rate": 3.396308726235326e-07, "loss": 0.0123, "step": 9999 }, { "epoch": 2.681684097613301, "grad_norm": 0.27873231096842677, "learning_rate": 3.3906589306958673e-07, "loss": 0.0153, "step": 10000 }, { "epoch": 2.6819522660230626, "grad_norm": 0.2673263275754631, "learning_rate": 3.3850136733841963e-07, "loss": 0.0133, "step": 10001 }, { "epoch": 2.682220434432824, "grad_norm": 0.22488874198961106, "learning_rate": 3.379372954849985e-07, "loss": 0.0117, "step": 10002 }, { "epoch": 2.682488602842585, "grad_norm": 0.21001963703954551, "learning_rate": 3.373736775642461e-07, "loss": 0.0103, "step": 10003 }, { "epoch": 2.6827567712523464, "grad_norm": 0.18567183311076185, "learning_rate": 3.3681051363103845e-07, "loss": 0.01, "step": 10004 }, { "epoch": 2.6830249396621078, "grad_norm": 0.2653104643175722, "learning_rate": 3.362478037402123e-07, "loss": 0.0131, "step": 10005 }, { "epoch": 2.683293108071869, "grad_norm": 0.28200697192598273, "learning_rate": 3.356855479465537e-07, "loss": 0.0144, "step": 10006 }, { "epoch": 2.6835612764816306, "grad_norm": 0.20752226266738377, "learning_rate": 3.3512374630481125e-07, "loss": 0.0105, "step": 10007 }, { "epoch": 2.6838294448913915, "grad_norm": 0.28352396377818323, "learning_rate": 3.345623988696844e-07, "loss": 0.0156, "step": 10008 }, { "epoch": 2.6840976133011534, "grad_norm": 0.2614648820818917, "learning_rate": 3.340015056958296e-07, "loss": 0.0143, "step": 10009 }, { "epoch": 2.6843657817109143, "grad_norm": 0.26206296025218345, "learning_rate": 3.334410668378607e-07, "loss": 0.0115, "step": 10010 }, { "epoch": 2.6846339501206757, "grad_norm": 0.23777404811317623, "learning_rate": 3.328810823503448e-07, "loss": 0.0191, "step": 10011 }, { "epoch": 2.684902118530437, "grad_norm": 0.23120094870085794, "learning_rate": 3.3232155228780704e-07, "loss": 0.0115, "step": 10012 }, { "epoch": 2.6851702869401985, "grad_norm": 0.27478399693499433, "learning_rate": 3.317624767047284e-07, "loss": 0.0175, "step": 10013 }, { "epoch": 2.68543845534996, "grad_norm": 0.2052603196988996, "learning_rate": 3.3120385565554247e-07, "loss": 0.0102, "step": 10014 }, { "epoch": 2.685706623759721, "grad_norm": 0.2042457132019438, "learning_rate": 3.3064568919464245e-07, "loss": 0.0122, "step": 10015 }, { "epoch": 2.6859747921694823, "grad_norm": 0.2208740827732014, "learning_rate": 3.300879773763743e-07, "loss": 0.0125, "step": 10016 }, { "epoch": 2.6862429605792437, "grad_norm": 0.260993393374754, "learning_rate": 3.2953072025504186e-07, "loss": 0.014, "step": 10017 }, { "epoch": 2.686511128989005, "grad_norm": 0.23594209576416675, "learning_rate": 3.289739178849027e-07, "loss": 0.0105, "step": 10018 }, { "epoch": 2.6867792973987665, "grad_norm": 0.2117900211064076, "learning_rate": 3.284175703201714e-07, "loss": 0.0125, "step": 10019 }, { "epoch": 2.6870474658085275, "grad_norm": 0.23049379191986713, "learning_rate": 3.2786167761502064e-07, "loss": 0.0172, "step": 10020 }, { "epoch": 2.6873156342182893, "grad_norm": 0.23672297894629166, "learning_rate": 3.2730623982357215e-07, "loss": 0.0133, "step": 10021 }, { "epoch": 2.6875838026280503, "grad_norm": 0.24690973843148198, "learning_rate": 3.267512569999093e-07, "loss": 0.0128, "step": 10022 }, { "epoch": 2.6878519710378117, "grad_norm": 0.3355428205491794, "learning_rate": 3.2619672919807054e-07, "loss": 0.0186, "step": 10023 }, { "epoch": 2.688120139447573, "grad_norm": 0.2436189415662277, "learning_rate": 3.2564265647204606e-07, "loss": 0.0151, "step": 10024 }, { "epoch": 2.6883883078573345, "grad_norm": 0.26350063222866427, "learning_rate": 3.2508903887578713e-07, "loss": 0.0171, "step": 10025 }, { "epoch": 2.688656476267096, "grad_norm": 0.2084271886703435, "learning_rate": 3.245358764631962e-07, "loss": 0.0164, "step": 10026 }, { "epoch": 2.688924644676857, "grad_norm": 0.28702299916448015, "learning_rate": 3.2398316928813454e-07, "loss": 0.0147, "step": 10027 }, { "epoch": 2.6891928130866183, "grad_norm": 0.29750609729810684, "learning_rate": 3.234309174044159e-07, "loss": 0.0116, "step": 10028 }, { "epoch": 2.6894609814963797, "grad_norm": 0.23142944801048917, "learning_rate": 3.228791208658133e-07, "loss": 0.0139, "step": 10029 }, { "epoch": 2.689729149906141, "grad_norm": 0.29508585384150066, "learning_rate": 3.2232777972605376e-07, "loss": 0.0204, "step": 10030 }, { "epoch": 2.6899973183159025, "grad_norm": 0.3598191051320857, "learning_rate": 3.2177689403881885e-07, "loss": 0.0115, "step": 10031 }, { "epoch": 2.6902654867256635, "grad_norm": 0.21732454575062218, "learning_rate": 3.212264638577478e-07, "loss": 0.0121, "step": 10032 }, { "epoch": 2.6905336551354253, "grad_norm": 0.21952435319814462, "learning_rate": 3.206764892364339e-07, "loss": 0.0109, "step": 10033 }, { "epoch": 2.6908018235451863, "grad_norm": 0.31091766598726395, "learning_rate": 3.2012697022842654e-07, "loss": 0.0155, "step": 10034 }, { "epoch": 2.6910699919549477, "grad_norm": 0.38660893547458436, "learning_rate": 3.195779068872318e-07, "loss": 0.018, "step": 10035 }, { "epoch": 2.691338160364709, "grad_norm": 0.25447225161005715, "learning_rate": 3.190292992663091e-07, "loss": 0.0148, "step": 10036 }, { "epoch": 2.6916063287744705, "grad_norm": 0.2376391543941223, "learning_rate": 3.1848114741907634e-07, "loss": 0.0114, "step": 10037 }, { "epoch": 2.691874497184232, "grad_norm": 0.21192542979570933, "learning_rate": 3.179334513989046e-07, "loss": 0.0118, "step": 10038 }, { "epoch": 2.692142665593993, "grad_norm": 0.23090341896737088, "learning_rate": 3.173862112591225e-07, "loss": 0.0116, "step": 10039 }, { "epoch": 2.6924108340037542, "grad_norm": 0.2543475828338398, "learning_rate": 3.1683942705301284e-07, "loss": 0.0114, "step": 10040 }, { "epoch": 2.6926790024135157, "grad_norm": 0.21631872674818173, "learning_rate": 3.162930988338148e-07, "loss": 0.0119, "step": 10041 }, { "epoch": 2.692947170823277, "grad_norm": 0.26669441135239735, "learning_rate": 3.157472266547229e-07, "loss": 0.0146, "step": 10042 }, { "epoch": 2.6932153392330385, "grad_norm": 0.1977208948752991, "learning_rate": 3.152018105688859e-07, "loss": 0.0121, "step": 10043 }, { "epoch": 2.6934835076427994, "grad_norm": 0.24658282883748822, "learning_rate": 3.1465685062941234e-07, "loss": 0.0141, "step": 10044 }, { "epoch": 2.6937516760525613, "grad_norm": 0.2806043403238001, "learning_rate": 3.1411234688936145e-07, "loss": 0.0143, "step": 10045 }, { "epoch": 2.6940198444623222, "grad_norm": 0.21415535357378834, "learning_rate": 3.1356829940174973e-07, "loss": 0.0117, "step": 10046 }, { "epoch": 2.6942880128720836, "grad_norm": 0.2992335510910722, "learning_rate": 3.1302470821955143e-07, "loss": 0.0148, "step": 10047 }, { "epoch": 2.694556181281845, "grad_norm": 0.21903298530033663, "learning_rate": 3.1248157339569265e-07, "loss": 0.0122, "step": 10048 }, { "epoch": 2.6948243496916064, "grad_norm": 0.2383975082075318, "learning_rate": 3.1193889498305816e-07, "loss": 0.0171, "step": 10049 }, { "epoch": 2.695092518101368, "grad_norm": 0.24396938238478916, "learning_rate": 3.113966730344875e-07, "loss": 0.0098, "step": 10050 }, { "epoch": 2.695360686511129, "grad_norm": 0.26109541547491394, "learning_rate": 3.108549076027739e-07, "loss": 0.0119, "step": 10051 }, { "epoch": 2.69562885492089, "grad_norm": 0.3039410893985119, "learning_rate": 3.103135987406697e-07, "loss": 0.0178, "step": 10052 }, { "epoch": 2.6958970233306516, "grad_norm": 0.19690200991629034, "learning_rate": 3.0977274650087817e-07, "loss": 0.0105, "step": 10053 }, { "epoch": 2.696165191740413, "grad_norm": 0.24034828942485903, "learning_rate": 3.092323509360634e-07, "loss": 0.0122, "step": 10054 }, { "epoch": 2.6964333601501744, "grad_norm": 0.44102258950536, "learning_rate": 3.086924120988399e-07, "loss": 0.0174, "step": 10055 }, { "epoch": 2.6967015285599354, "grad_norm": 0.2572631577938057, "learning_rate": 3.081529300417818e-07, "loss": 0.0125, "step": 10056 }, { "epoch": 2.6969696969696972, "grad_norm": 0.3642157881540682, "learning_rate": 3.076139048174159e-07, "loss": 0.0168, "step": 10057 }, { "epoch": 2.697237865379458, "grad_norm": 0.2522319538096927, "learning_rate": 3.0707533647822585e-07, "loss": 0.0106, "step": 10058 }, { "epoch": 2.6975060337892196, "grad_norm": 0.20718773525558293, "learning_rate": 3.0653722507665016e-07, "loss": 0.0082, "step": 10059 }, { "epoch": 2.697774202198981, "grad_norm": 0.3382134690946729, "learning_rate": 3.0599957066508534e-07, "loss": 0.0133, "step": 10060 }, { "epoch": 2.6980423706087424, "grad_norm": 0.24576192144405865, "learning_rate": 3.054623732958789e-07, "loss": 0.0119, "step": 10061 }, { "epoch": 2.698310539018504, "grad_norm": 0.24708451892766228, "learning_rate": 3.0492563302133846e-07, "loss": 0.0134, "step": 10062 }, { "epoch": 2.6985787074282648, "grad_norm": 0.2765739914259692, "learning_rate": 3.0438934989372284e-07, "loss": 0.0142, "step": 10063 }, { "epoch": 2.698846875838026, "grad_norm": 0.6960080105926635, "learning_rate": 3.038535239652507e-07, "loss": 0.0213, "step": 10064 }, { "epoch": 2.6991150442477876, "grad_norm": 0.19112631706057032, "learning_rate": 3.033181552880921e-07, "loss": 0.0124, "step": 10065 }, { "epoch": 2.699383212657549, "grad_norm": 0.349942462571877, "learning_rate": 3.027832439143752e-07, "loss": 0.0218, "step": 10066 }, { "epoch": 2.6996513810673104, "grad_norm": 0.3573100008294919, "learning_rate": 3.0224878989618346e-07, "loss": 0.0179, "step": 10067 }, { "epoch": 2.6999195494770714, "grad_norm": 0.22694198368133903, "learning_rate": 3.017147932855552e-07, "loss": 0.0146, "step": 10068 }, { "epoch": 2.700187717886833, "grad_norm": 0.19966955893551813, "learning_rate": 3.011812541344833e-07, "loss": 0.009, "step": 10069 }, { "epoch": 2.700455886296594, "grad_norm": 0.34611026006222856, "learning_rate": 3.006481724949173e-07, "loss": 0.0134, "step": 10070 }, { "epoch": 2.7007240547063556, "grad_norm": 0.2671818457342193, "learning_rate": 3.0011554841876236e-07, "loss": 0.0196, "step": 10071 }, { "epoch": 2.700992223116117, "grad_norm": 0.19766176087642223, "learning_rate": 2.995833819578786e-07, "loss": 0.0118, "step": 10072 }, { "epoch": 2.7012603915258784, "grad_norm": 0.2615912236081865, "learning_rate": 2.9905167316408135e-07, "loss": 0.0168, "step": 10073 }, { "epoch": 2.70152855993564, "grad_norm": 0.21245520264461423, "learning_rate": 2.9852042208914243e-07, "loss": 0.0096, "step": 10074 }, { "epoch": 2.7017967283454007, "grad_norm": 0.2195460463557986, "learning_rate": 2.979896287847872e-07, "loss": 0.0132, "step": 10075 }, { "epoch": 2.702064896755162, "grad_norm": 0.23276012930541176, "learning_rate": 2.974592933026982e-07, "loss": 0.0125, "step": 10076 }, { "epoch": 2.7023330651649236, "grad_norm": 0.22904691303340635, "learning_rate": 2.9692941569451296e-07, "loss": 0.0098, "step": 10077 }, { "epoch": 2.702601233574685, "grad_norm": 0.3002638794391005, "learning_rate": 2.963999960118241e-07, "loss": 0.0187, "step": 10078 }, { "epoch": 2.7028694019844464, "grad_norm": 0.9046273681117346, "learning_rate": 2.9587103430618045e-07, "loss": 0.0208, "step": 10079 }, { "epoch": 2.7031375703942073, "grad_norm": 0.2590412293383899, "learning_rate": 2.953425306290847e-07, "loss": 0.012, "step": 10080 }, { "epoch": 2.7034057388039687, "grad_norm": 0.33100045991560356, "learning_rate": 2.948144850319956e-07, "loss": 0.0148, "step": 10081 }, { "epoch": 2.70367390721373, "grad_norm": 0.23652284870884083, "learning_rate": 2.9428689756632876e-07, "loss": 0.013, "step": 10082 }, { "epoch": 2.7039420756234915, "grad_norm": 0.22557738497141674, "learning_rate": 2.9375976828345254e-07, "loss": 0.0105, "step": 10083 }, { "epoch": 2.704210244033253, "grad_norm": 0.21076126642046356, "learning_rate": 2.932330972346936e-07, "loss": 0.0145, "step": 10084 }, { "epoch": 2.7044784124430143, "grad_norm": 0.2591797880298394, "learning_rate": 2.92706884471331e-07, "loss": 0.0179, "step": 10085 }, { "epoch": 2.7047465808527758, "grad_norm": 0.2444132936226118, "learning_rate": 2.9218113004460204e-07, "loss": 0.0154, "step": 10086 }, { "epoch": 2.7050147492625367, "grad_norm": 0.3003104042494332, "learning_rate": 2.9165583400569686e-07, "loss": 0.0127, "step": 10087 }, { "epoch": 2.705282917672298, "grad_norm": 0.2469168704697331, "learning_rate": 2.9113099640576293e-07, "loss": 0.0114, "step": 10088 }, { "epoch": 2.7055510860820595, "grad_norm": 0.2807323904151426, "learning_rate": 2.906066172959021e-07, "loss": 0.013, "step": 10089 }, { "epoch": 2.705819254491821, "grad_norm": 0.2595370286899436, "learning_rate": 2.900826967271719e-07, "loss": 0.0119, "step": 10090 }, { "epoch": 2.7060874229015823, "grad_norm": 0.32656626934844674, "learning_rate": 2.895592347505849e-07, "loss": 0.0186, "step": 10091 }, { "epoch": 2.7063555913113433, "grad_norm": 0.20923410802601933, "learning_rate": 2.8903623141710966e-07, "loss": 0.0133, "step": 10092 }, { "epoch": 2.7066237597211047, "grad_norm": 0.24107661944610095, "learning_rate": 2.885136867776683e-07, "loss": 0.0096, "step": 10093 }, { "epoch": 2.706891928130866, "grad_norm": 0.21413002392570152, "learning_rate": 2.879916008831413e-07, "loss": 0.0146, "step": 10094 }, { "epoch": 2.7071600965406275, "grad_norm": 0.28638359566117866, "learning_rate": 2.8746997378436117e-07, "loss": 0.016, "step": 10095 }, { "epoch": 2.707428264950389, "grad_norm": 0.36549008060942845, "learning_rate": 2.8694880553211903e-07, "loss": 0.0106, "step": 10096 }, { "epoch": 2.7076964333601503, "grad_norm": 0.3131668139488774, "learning_rate": 2.864280961771582e-07, "loss": 0.0129, "step": 10097 }, { "epoch": 2.7079646017699117, "grad_norm": 0.2804486231139374, "learning_rate": 2.859078457701792e-07, "loss": 0.0128, "step": 10098 }, { "epoch": 2.7082327701796727, "grad_norm": 0.23309043282431077, "learning_rate": 2.8538805436183815e-07, "loss": 0.0112, "step": 10099 }, { "epoch": 2.708500938589434, "grad_norm": 0.2570046630685327, "learning_rate": 2.848687220027446e-07, "loss": 0.0132, "step": 10100 }, { "epoch": 2.7087691069991955, "grad_norm": 0.1885343389458234, "learning_rate": 2.843498487434659e-07, "loss": 0.0086, "step": 10101 }, { "epoch": 2.709037275408957, "grad_norm": 0.25947297903197725, "learning_rate": 2.838314346345217e-07, "loss": 0.0189, "step": 10102 }, { "epoch": 2.7093054438187183, "grad_norm": 0.2522663419739868, "learning_rate": 2.8331347972639034e-07, "loss": 0.0122, "step": 10103 }, { "epoch": 2.7095736122284793, "grad_norm": 0.3346132572757494, "learning_rate": 2.8279598406950324e-07, "loss": 0.0193, "step": 10104 }, { "epoch": 2.7098417806382407, "grad_norm": 0.18673409938462998, "learning_rate": 2.8227894771424624e-07, "loss": 0.0125, "step": 10105 }, { "epoch": 2.710109949048002, "grad_norm": 0.2208312631162379, "learning_rate": 2.817623707109635e-07, "loss": 0.0139, "step": 10106 }, { "epoch": 2.7103781174577635, "grad_norm": 0.20773393432408196, "learning_rate": 2.8124625310995136e-07, "loss": 0.0119, "step": 10107 }, { "epoch": 2.710646285867525, "grad_norm": 0.22906634615160432, "learning_rate": 2.807305949614636e-07, "loss": 0.0124, "step": 10108 }, { "epoch": 2.7109144542772863, "grad_norm": 0.28832244296912496, "learning_rate": 2.8021539631570896e-07, "loss": 0.0139, "step": 10109 }, { "epoch": 2.7111826226870477, "grad_norm": 0.20823692963071677, "learning_rate": 2.7970065722284944e-07, "loss": 0.0117, "step": 10110 }, { "epoch": 2.7114507910968086, "grad_norm": 0.409505423065092, "learning_rate": 2.7918637773300615e-07, "loss": 0.0173, "step": 10111 }, { "epoch": 2.71171895950657, "grad_norm": 0.26316303728179363, "learning_rate": 2.786725578962507e-07, "loss": 0.0142, "step": 10112 }, { "epoch": 2.7119871279163315, "grad_norm": 0.22194101728311771, "learning_rate": 2.7815919776261415e-07, "loss": 0.0148, "step": 10113 }, { "epoch": 2.712255296326093, "grad_norm": 0.28920723421608957, "learning_rate": 2.7764629738207927e-07, "loss": 0.0137, "step": 10114 }, { "epoch": 2.7125234647358543, "grad_norm": 0.2973945618538858, "learning_rate": 2.771338568045867e-07, "loss": 0.0157, "step": 10115 }, { "epoch": 2.7127916331456152, "grad_norm": 0.1694951523636434, "learning_rate": 2.766218760800327e-07, "loss": 0.0052, "step": 10116 }, { "epoch": 2.7130598015553766, "grad_norm": 0.22499071681850838, "learning_rate": 2.7611035525826614e-07, "loss": 0.0129, "step": 10117 }, { "epoch": 2.713327969965138, "grad_norm": 0.21686949477638687, "learning_rate": 2.755992943890923e-07, "loss": 0.0137, "step": 10118 }, { "epoch": 2.7135961383748994, "grad_norm": 0.19787165624272043, "learning_rate": 2.750886935222724e-07, "loss": 0.0109, "step": 10119 }, { "epoch": 2.713864306784661, "grad_norm": 0.27834288394583495, "learning_rate": 2.745785527075212e-07, "loss": 0.0135, "step": 10120 }, { "epoch": 2.7141324751944222, "grad_norm": 0.25931741692890964, "learning_rate": 2.740688719945112e-07, "loss": 0.0142, "step": 10121 }, { "epoch": 2.7144006436041836, "grad_norm": 0.22433002100203742, "learning_rate": 2.7355965143286753e-07, "loss": 0.0133, "step": 10122 }, { "epoch": 2.7146688120139446, "grad_norm": 0.2193055930782264, "learning_rate": 2.7305089107217295e-07, "loss": 0.015, "step": 10123 }, { "epoch": 2.714936980423706, "grad_norm": 0.22021854904232677, "learning_rate": 2.725425909619628e-07, "loss": 0.0123, "step": 10124 }, { "epoch": 2.7152051488334674, "grad_norm": 0.27222485997377344, "learning_rate": 2.720347511517291e-07, "loss": 0.014, "step": 10125 }, { "epoch": 2.715473317243229, "grad_norm": 0.2774959107938613, "learning_rate": 2.7152737169092005e-07, "loss": 0.0153, "step": 10126 }, { "epoch": 2.7157414856529902, "grad_norm": 0.23089316599624216, "learning_rate": 2.710204526289356e-07, "loss": 0.015, "step": 10127 }, { "epoch": 2.716009654062751, "grad_norm": 0.22850734753539792, "learning_rate": 2.705139940151358e-07, "loss": 0.0113, "step": 10128 }, { "epoch": 2.7162778224725126, "grad_norm": 0.21382124220516613, "learning_rate": 2.7000799589883166e-07, "loss": 0.0117, "step": 10129 }, { "epoch": 2.716545990882274, "grad_norm": 0.26662564640278064, "learning_rate": 2.6950245832929046e-07, "loss": 0.0163, "step": 10130 }, { "epoch": 2.7168141592920354, "grad_norm": 0.2660463732809721, "learning_rate": 2.689973813557367e-07, "loss": 0.0143, "step": 10131 }, { "epoch": 2.717082327701797, "grad_norm": 0.28877367165669693, "learning_rate": 2.684927650273461e-07, "loss": 0.017, "step": 10132 }, { "epoch": 2.717350496111558, "grad_norm": 0.23269313473078823, "learning_rate": 2.679886093932543e-07, "loss": 0.0131, "step": 10133 }, { "epoch": 2.7176186645213196, "grad_norm": 0.4221372247463844, "learning_rate": 2.674849145025471e-07, "loss": 0.0115, "step": 10134 }, { "epoch": 2.7178868329310806, "grad_norm": 0.272509445753347, "learning_rate": 2.6698168040426964e-07, "loss": 0.0143, "step": 10135 }, { "epoch": 2.718155001340842, "grad_norm": 0.25765841801038875, "learning_rate": 2.664789071474205e-07, "loss": 0.0129, "step": 10136 }, { "epoch": 2.7184231697506034, "grad_norm": 0.2262280714403418, "learning_rate": 2.659765947809523e-07, "loss": 0.0117, "step": 10137 }, { "epoch": 2.718691338160365, "grad_norm": 0.16487515671626357, "learning_rate": 2.6547474335377523e-07, "loss": 0.0065, "step": 10138 }, { "epoch": 2.718959506570126, "grad_norm": 0.25622958538467194, "learning_rate": 2.6497335291475133e-07, "loss": 0.0105, "step": 10139 }, { "epoch": 2.719227674979887, "grad_norm": 0.24826793676375233, "learning_rate": 2.644724235127016e-07, "loss": 0.0143, "step": 10140 }, { "epoch": 2.7194958433896486, "grad_norm": 0.2567674688866037, "learning_rate": 2.6397195519639983e-07, "loss": 0.0135, "step": 10141 }, { "epoch": 2.71976401179941, "grad_norm": 0.1691342962338418, "learning_rate": 2.634719480145742e-07, "loss": 0.0093, "step": 10142 }, { "epoch": 2.7200321802091714, "grad_norm": 0.28165669379629904, "learning_rate": 2.6297240201591025e-07, "loss": 0.0134, "step": 10143 }, { "epoch": 2.7203003486189328, "grad_norm": 0.27800160821264563, "learning_rate": 2.624733172490462e-07, "loss": 0.0142, "step": 10144 }, { "epoch": 2.720568517028694, "grad_norm": 0.23164947532696567, "learning_rate": 2.6197469376257724e-07, "loss": 0.0096, "step": 10145 }, { "epoch": 2.7208366854384556, "grad_norm": 0.2152848189530246, "learning_rate": 2.614765316050544e-07, "loss": 0.0173, "step": 10146 }, { "epoch": 2.7211048538482165, "grad_norm": 0.23289168459585774, "learning_rate": 2.6097883082498e-07, "loss": 0.0124, "step": 10147 }, { "epoch": 2.721373022257978, "grad_norm": 0.2218983801521475, "learning_rate": 2.604815914708153e-07, "loss": 0.0138, "step": 10148 }, { "epoch": 2.7216411906677394, "grad_norm": 0.2635082937707218, "learning_rate": 2.5998481359097496e-07, "loss": 0.0143, "step": 10149 }, { "epoch": 2.7219093590775008, "grad_norm": 0.28151299970223026, "learning_rate": 2.5948849723382906e-07, "loss": 0.0234, "step": 10150 }, { "epoch": 2.722177527487262, "grad_norm": 0.24786388950432012, "learning_rate": 2.589926424477018e-07, "loss": 0.016, "step": 10151 }, { "epoch": 2.722445695897023, "grad_norm": 0.25488643545589035, "learning_rate": 2.584972492808735e-07, "loss": 0.0165, "step": 10152 }, { "epoch": 2.7227138643067845, "grad_norm": 0.2717890470224508, "learning_rate": 2.580023177815816e-07, "loss": 0.011, "step": 10153 }, { "epoch": 2.722982032716546, "grad_norm": 0.27751522142135937, "learning_rate": 2.5750784799801255e-07, "loss": 0.0159, "step": 10154 }, { "epoch": 2.7232502011263073, "grad_norm": 0.2170607179433834, "learning_rate": 2.5701383997831284e-07, "loss": 0.0119, "step": 10155 }, { "epoch": 2.7235183695360687, "grad_norm": 0.2817797760124718, "learning_rate": 2.5652029377058465e-07, "loss": 0.0142, "step": 10156 }, { "epoch": 2.72378653794583, "grad_norm": 0.232999674125874, "learning_rate": 2.560272094228805e-07, "loss": 0.0119, "step": 10157 }, { "epoch": 2.7240547063555915, "grad_norm": 0.3220650106553259, "learning_rate": 2.5553458698321263e-07, "loss": 0.0161, "step": 10158 }, { "epoch": 2.7243228747653525, "grad_norm": 0.2558991183839685, "learning_rate": 2.5504242649954546e-07, "loss": 0.014, "step": 10159 }, { "epoch": 2.724591043175114, "grad_norm": 0.21497126511920372, "learning_rate": 2.545507280198001e-07, "loss": 0.0105, "step": 10160 }, { "epoch": 2.7248592115848753, "grad_norm": 0.3251135723498152, "learning_rate": 2.540594915918504e-07, "loss": 0.0274, "step": 10161 }, { "epoch": 2.7251273799946367, "grad_norm": 0.20574726414035674, "learning_rate": 2.5356871726352817e-07, "loss": 0.009, "step": 10162 }, { "epoch": 2.725395548404398, "grad_norm": 0.23393065244442354, "learning_rate": 2.5307840508261904e-07, "loss": 0.0116, "step": 10163 }, { "epoch": 2.725663716814159, "grad_norm": 0.2144140296155032, "learning_rate": 2.525885550968621e-07, "loss": 0.011, "step": 10164 }, { "epoch": 2.7259318852239205, "grad_norm": 0.22094296365370902, "learning_rate": 2.5209916735395467e-07, "loss": 0.012, "step": 10165 }, { "epoch": 2.726200053633682, "grad_norm": 0.1940292829081444, "learning_rate": 2.5161024190154427e-07, "loss": 0.009, "step": 10166 }, { "epoch": 2.7264682220434433, "grad_norm": 0.2154080547700454, "learning_rate": 2.5112177878723833e-07, "loss": 0.0127, "step": 10167 }, { "epoch": 2.7267363904532047, "grad_norm": 0.2538117030047245, "learning_rate": 2.506337780585971e-07, "loss": 0.0145, "step": 10168 }, { "epoch": 2.7270045588629657, "grad_norm": 0.2372653039010175, "learning_rate": 2.5014623976313546e-07, "loss": 0.0234, "step": 10169 }, { "epoch": 2.7272727272727275, "grad_norm": 0.3209603181066786, "learning_rate": 2.496591639483242e-07, "loss": 0.0149, "step": 10170 }, { "epoch": 2.7275408956824885, "grad_norm": 0.28903420557869525, "learning_rate": 2.491725506615872e-07, "loss": 0.0138, "step": 10171 }, { "epoch": 2.72780906409225, "grad_norm": 0.2319112836457248, "learning_rate": 2.4868639995030643e-07, "loss": 0.0112, "step": 10172 }, { "epoch": 2.7280772325020113, "grad_norm": 0.1765857314299491, "learning_rate": 2.4820071186181626e-07, "loss": 0.0084, "step": 10173 }, { "epoch": 2.7283454009117727, "grad_norm": 0.2847146295715286, "learning_rate": 2.477154864434067e-07, "loss": 0.0165, "step": 10174 }, { "epoch": 2.728613569321534, "grad_norm": 0.3082077564415258, "learning_rate": 2.472307237423244e-07, "loss": 0.015, "step": 10175 }, { "epoch": 2.728881737731295, "grad_norm": 0.32325184162292425, "learning_rate": 2.467464238057665e-07, "loss": 0.0095, "step": 10176 }, { "epoch": 2.7291499061410565, "grad_norm": 0.2070781417671562, "learning_rate": 2.46262586680891e-07, "loss": 0.0135, "step": 10177 }, { "epoch": 2.729418074550818, "grad_norm": 0.3202936888724756, "learning_rate": 2.457792124148062e-07, "loss": 0.0172, "step": 10178 }, { "epoch": 2.7296862429605793, "grad_norm": 0.26757917313394114, "learning_rate": 2.452963010545767e-07, "loss": 0.0131, "step": 10179 }, { "epoch": 2.7299544113703407, "grad_norm": 0.19102831234702866, "learning_rate": 2.448138526472238e-07, "loss": 0.0093, "step": 10180 }, { "epoch": 2.7302225797801016, "grad_norm": 0.22118426493052626, "learning_rate": 2.4433186723972045e-07, "loss": 0.0112, "step": 10181 }, { "epoch": 2.7304907481898635, "grad_norm": 0.18941046825797625, "learning_rate": 2.438503448789975e-07, "loss": 0.0111, "step": 10182 }, { "epoch": 2.7307589165996244, "grad_norm": 0.48831203652584326, "learning_rate": 2.4336928561193905e-07, "loss": 0.0137, "step": 10183 }, { "epoch": 2.731027085009386, "grad_norm": 0.23001800833565597, "learning_rate": 2.428886894853838e-07, "loss": 0.0137, "step": 10184 }, { "epoch": 2.7312952534191473, "grad_norm": 0.22151576935831416, "learning_rate": 2.4240855654612816e-07, "loss": 0.0121, "step": 10185 }, { "epoch": 2.7315634218289087, "grad_norm": 0.2636365617702101, "learning_rate": 2.4192888684091973e-07, "loss": 0.0158, "step": 10186 }, { "epoch": 2.73183159023867, "grad_norm": 0.25730794354940206, "learning_rate": 2.4144968041646343e-07, "loss": 0.0131, "step": 10187 }, { "epoch": 2.732099758648431, "grad_norm": 0.2266705224171433, "learning_rate": 2.40970937319418e-07, "loss": 0.0111, "step": 10188 }, { "epoch": 2.7323679270581924, "grad_norm": 0.1923055710247471, "learning_rate": 2.404926575963973e-07, "loss": 0.0082, "step": 10189 }, { "epoch": 2.732636095467954, "grad_norm": 0.2572208066024229, "learning_rate": 2.400148412939707e-07, "loss": 0.0137, "step": 10190 }, { "epoch": 2.7329042638777152, "grad_norm": 0.20868694979313432, "learning_rate": 2.3953748845866096e-07, "loss": 0.0114, "step": 10191 }, { "epoch": 2.7331724322874766, "grad_norm": 0.15256566167067673, "learning_rate": 2.390605991369477e-07, "loss": 0.0077, "step": 10192 }, { "epoch": 2.7334406006972376, "grad_norm": 0.26927236153892187, "learning_rate": 2.385841733752631e-07, "loss": 0.015, "step": 10193 }, { "epoch": 2.7337087691069994, "grad_norm": 0.2368625325813369, "learning_rate": 2.3810821121999616e-07, "loss": 0.0187, "step": 10194 }, { "epoch": 2.7339769375167604, "grad_norm": 0.2289195945449989, "learning_rate": 2.37632712717491e-07, "loss": 0.0128, "step": 10195 }, { "epoch": 2.734245105926522, "grad_norm": 0.22975656614025008, "learning_rate": 2.3715767791404387e-07, "loss": 0.016, "step": 10196 }, { "epoch": 2.734513274336283, "grad_norm": 0.2525111323513369, "learning_rate": 2.3668310685590945e-07, "loss": 0.0162, "step": 10197 }, { "epoch": 2.7347814427460446, "grad_norm": 0.3531149250183119, "learning_rate": 2.3620899958929354e-07, "loss": 0.0123, "step": 10198 }, { "epoch": 2.735049611155806, "grad_norm": 0.2363860754779539, "learning_rate": 2.3573535616036036e-07, "loss": 0.0124, "step": 10199 }, { "epoch": 2.735317779565567, "grad_norm": 0.33596924191149186, "learning_rate": 2.3526217661522632e-07, "loss": 0.0128, "step": 10200 }, { "epoch": 2.7355859479753284, "grad_norm": 0.23131245148846483, "learning_rate": 2.347894609999646e-07, "loss": 0.0113, "step": 10201 }, { "epoch": 2.73585411638509, "grad_norm": 0.19865156117447463, "learning_rate": 2.3431720936060166e-07, "loss": 0.009, "step": 10202 }, { "epoch": 2.736122284794851, "grad_norm": 0.27214901144528864, "learning_rate": 2.3384542174311908e-07, "loss": 0.012, "step": 10203 }, { "epoch": 2.7363904532046126, "grad_norm": 0.2697743120573721, "learning_rate": 2.3337409819345348e-07, "loss": 0.016, "step": 10204 }, { "epoch": 2.7366586216143736, "grad_norm": 0.2697348045195244, "learning_rate": 2.3290323875749754e-07, "loss": 0.0162, "step": 10205 }, { "epoch": 2.7369267900241354, "grad_norm": 0.22591343896685726, "learning_rate": 2.324328434810963e-07, "loss": 0.0125, "step": 10206 }, { "epoch": 2.7371949584338964, "grad_norm": 0.2908205132514924, "learning_rate": 2.3196291241005198e-07, "loss": 0.0195, "step": 10207 }, { "epoch": 2.737463126843658, "grad_norm": 0.2867602777133908, "learning_rate": 2.314934455901191e-07, "loss": 0.0151, "step": 10208 }, { "epoch": 2.737731295253419, "grad_norm": 0.1901515658051912, "learning_rate": 2.3102444306701e-07, "loss": 0.0124, "step": 10209 }, { "epoch": 2.7379994636631806, "grad_norm": 0.17653524232123627, "learning_rate": 2.3055590488638923e-07, "loss": 0.0102, "step": 10210 }, { "epoch": 2.738267632072942, "grad_norm": 0.28221149013141034, "learning_rate": 2.30087831093877e-07, "loss": 0.012, "step": 10211 }, { "epoch": 2.738535800482703, "grad_norm": 0.39051086649300387, "learning_rate": 2.296202217350496e-07, "loss": 0.0245, "step": 10212 }, { "epoch": 2.7388039688924644, "grad_norm": 0.295938389878031, "learning_rate": 2.291530768554362e-07, "loss": 0.014, "step": 10213 }, { "epoch": 2.7390721373022258, "grad_norm": 0.2392009215987486, "learning_rate": 2.286863965005204e-07, "loss": 0.0139, "step": 10214 }, { "epoch": 2.739340305711987, "grad_norm": 0.20829768416617755, "learning_rate": 2.282201807157436e-07, "loss": 0.0104, "step": 10215 }, { "epoch": 2.7396084741217486, "grad_norm": 0.20632633331738573, "learning_rate": 2.2775442954649786e-07, "loss": 0.0105, "step": 10216 }, { "epoch": 2.7398766425315095, "grad_norm": 0.2724529903820612, "learning_rate": 2.272891430381341e-07, "loss": 0.0142, "step": 10217 }, { "epoch": 2.7401448109412714, "grad_norm": 0.23027998567315175, "learning_rate": 2.26824321235955e-07, "loss": 0.0098, "step": 10218 }, { "epoch": 2.7404129793510323, "grad_norm": 0.3177990679203518, "learning_rate": 2.2635996418521932e-07, "loss": 0.0184, "step": 10219 }, { "epoch": 2.7406811477607937, "grad_norm": 0.18400364856294238, "learning_rate": 2.2589607193113982e-07, "loss": 0.014, "step": 10220 }, { "epoch": 2.740949316170555, "grad_norm": 0.1740607695303438, "learning_rate": 2.2543264451888424e-07, "loss": 0.0099, "step": 10221 }, { "epoch": 2.7412174845803166, "grad_norm": 0.36787119496236165, "learning_rate": 2.2496968199357705e-07, "loss": 0.0132, "step": 10222 }, { "epoch": 2.741485652990078, "grad_norm": 0.25222622983786247, "learning_rate": 2.2450718440029384e-07, "loss": 0.0114, "step": 10223 }, { "epoch": 2.741753821399839, "grad_norm": 0.23327250994105983, "learning_rate": 2.240451517840675e-07, "loss": 0.011, "step": 10224 }, { "epoch": 2.7420219898096003, "grad_norm": 0.2015999879553685, "learning_rate": 2.2358358418988536e-07, "loss": 0.0099, "step": 10225 }, { "epoch": 2.7422901582193617, "grad_norm": 0.25925054014848964, "learning_rate": 2.2312248166268758e-07, "loss": 0.0136, "step": 10226 }, { "epoch": 2.742558326629123, "grad_norm": 0.22678978116344337, "learning_rate": 2.2266184424737214e-07, "loss": 0.0125, "step": 10227 }, { "epoch": 2.7428264950388845, "grad_norm": 0.37391953056572996, "learning_rate": 2.2220167198878872e-07, "loss": 0.0158, "step": 10228 }, { "epoch": 2.7430946634486455, "grad_norm": 0.20504321962445998, "learning_rate": 2.2174196493174427e-07, "loss": 0.0104, "step": 10229 }, { "epoch": 2.7433628318584073, "grad_norm": 0.33702282905633757, "learning_rate": 2.2128272312099798e-07, "loss": 0.0128, "step": 10230 }, { "epoch": 2.7436310002681683, "grad_norm": 0.22776614844497967, "learning_rate": 2.208239466012657e-07, "loss": 0.0127, "step": 10231 }, { "epoch": 2.7438991686779297, "grad_norm": 0.3371623245606038, "learning_rate": 2.2036563541721789e-07, "loss": 0.024, "step": 10232 }, { "epoch": 2.744167337087691, "grad_norm": 0.22644136497706224, "learning_rate": 2.1990778961347768e-07, "loss": 0.0103, "step": 10233 }, { "epoch": 2.7444355054974525, "grad_norm": 0.28102350803875015, "learning_rate": 2.1945040923462614e-07, "loss": 0.0153, "step": 10234 }, { "epoch": 2.744703673907214, "grad_norm": 0.29960356666261806, "learning_rate": 2.1899349432519535e-07, "loss": 0.0222, "step": 10235 }, { "epoch": 2.744971842316975, "grad_norm": 0.296130382429142, "learning_rate": 2.1853704492967533e-07, "loss": 0.0128, "step": 10236 }, { "epoch": 2.7452400107267363, "grad_norm": 0.29711706491116263, "learning_rate": 2.180810610925077e-07, "loss": 0.0117, "step": 10237 }, { "epoch": 2.7455081791364977, "grad_norm": 0.2738283388550113, "learning_rate": 2.1762554285809258e-07, "loss": 0.0164, "step": 10238 }, { "epoch": 2.745776347546259, "grad_norm": 0.31280942182568827, "learning_rate": 2.1717049027078106e-07, "loss": 0.0181, "step": 10239 }, { "epoch": 2.7460445159560205, "grad_norm": 0.2682370193407817, "learning_rate": 2.1671590337488e-07, "loss": 0.0088, "step": 10240 }, { "epoch": 2.7463126843657815, "grad_norm": 0.200177021042343, "learning_rate": 2.1626178221465232e-07, "loss": 0.0113, "step": 10241 }, { "epoch": 2.7465808527755433, "grad_norm": 0.23299220982992275, "learning_rate": 2.1580812683431486e-07, "loss": 0.0121, "step": 10242 }, { "epoch": 2.7468490211853043, "grad_norm": 0.26113559828095206, "learning_rate": 2.153549372780378e-07, "loss": 0.0131, "step": 10243 }, { "epoch": 2.7471171895950657, "grad_norm": 0.30596950527762934, "learning_rate": 2.149022135899481e-07, "loss": 0.0112, "step": 10244 }, { "epoch": 2.747385358004827, "grad_norm": 0.2535548164256236, "learning_rate": 2.14449955814125e-07, "loss": 0.0158, "step": 10245 }, { "epoch": 2.7476535264145885, "grad_norm": 0.2241292329254599, "learning_rate": 2.1399816399460482e-07, "loss": 0.0125, "step": 10246 }, { "epoch": 2.74792169482435, "grad_norm": 0.29028886493608785, "learning_rate": 2.1354683817537637e-07, "loss": 0.0147, "step": 10247 }, { "epoch": 2.748189863234111, "grad_norm": 0.29754240709631247, "learning_rate": 2.1309597840038442e-07, "loss": 0.0134, "step": 10248 }, { "epoch": 2.7484580316438723, "grad_norm": 0.23200396880854102, "learning_rate": 2.1264558471352892e-07, "loss": 0.0134, "step": 10249 }, { "epoch": 2.7487262000536337, "grad_norm": 0.20940793849411454, "learning_rate": 2.1219565715866254e-07, "loss": 0.0093, "step": 10250 }, { "epoch": 2.748994368463395, "grad_norm": 0.24840800744348698, "learning_rate": 2.1174619577959355e-07, "loss": 0.0112, "step": 10251 }, { "epoch": 2.7492625368731565, "grad_norm": 0.23828654577646663, "learning_rate": 2.1129720062008418e-07, "loss": 0.0118, "step": 10252 }, { "epoch": 2.7495307052829174, "grad_norm": 0.22373162259808743, "learning_rate": 2.1084867172385282e-07, "loss": 0.0168, "step": 10253 }, { "epoch": 2.7497988736926793, "grad_norm": 0.23456597607422838, "learning_rate": 2.1040060913457171e-07, "loss": 0.0097, "step": 10254 }, { "epoch": 2.7500670421024402, "grad_norm": 0.2851256361029505, "learning_rate": 2.0995301289586656e-07, "loss": 0.0191, "step": 10255 }, { "epoch": 2.7503352105122016, "grad_norm": 0.19421952339888432, "learning_rate": 2.0950588305132024e-07, "loss": 0.0096, "step": 10256 }, { "epoch": 2.750603378921963, "grad_norm": 0.2081009268081321, "learning_rate": 2.0905921964446628e-07, "loss": 0.0071, "step": 10257 }, { "epoch": 2.7508715473317245, "grad_norm": 0.226446561290927, "learning_rate": 2.0861302271879713e-07, "loss": 0.0115, "step": 10258 }, { "epoch": 2.751139715741486, "grad_norm": 0.2510517821505098, "learning_rate": 2.0816729231775746e-07, "loss": 0.0134, "step": 10259 }, { "epoch": 2.751407884151247, "grad_norm": 0.3237875328770131, "learning_rate": 2.0772202848474587e-07, "loss": 0.0171, "step": 10260 }, { "epoch": 2.7516760525610082, "grad_norm": 0.23965919450221349, "learning_rate": 2.0727723126311715e-07, "loss": 0.0141, "step": 10261 }, { "epoch": 2.7519442209707696, "grad_norm": 0.2827376277757224, "learning_rate": 2.0683290069618055e-07, "loss": 0.0156, "step": 10262 }, { "epoch": 2.752212389380531, "grad_norm": 0.2506367706107894, "learning_rate": 2.0638903682719814e-07, "loss": 0.0131, "step": 10263 }, { "epoch": 2.7524805577902924, "grad_norm": 0.2766255687350105, "learning_rate": 2.059456396993892e-07, "loss": 0.0121, "step": 10264 }, { "epoch": 2.7527487262000534, "grad_norm": 0.2311344240626549, "learning_rate": 2.055027093559242e-07, "loss": 0.0122, "step": 10265 }, { "epoch": 2.753016894609815, "grad_norm": 0.2205222699875854, "learning_rate": 2.0506024583993256e-07, "loss": 0.0096, "step": 10266 }, { "epoch": 2.753285063019576, "grad_norm": 0.17159448343696415, "learning_rate": 2.0461824919449313e-07, "loss": 0.0078, "step": 10267 }, { "epoch": 2.7535532314293376, "grad_norm": 0.26279692574572383, "learning_rate": 2.0417671946264373e-07, "loss": 0.0125, "step": 10268 }, { "epoch": 2.753821399839099, "grad_norm": 0.6925207219733114, "learning_rate": 2.0373565668737493e-07, "loss": 0.0154, "step": 10269 }, { "epoch": 2.7540895682488604, "grad_norm": 0.22817350756203078, "learning_rate": 2.032950609116313e-07, "loss": 0.0127, "step": 10270 }, { "epoch": 2.754357736658622, "grad_norm": 0.48832810313717717, "learning_rate": 2.0285493217831242e-07, "loss": 0.0143, "step": 10271 }, { "epoch": 2.754625905068383, "grad_norm": 0.32622837516723335, "learning_rate": 2.0241527053027233e-07, "loss": 0.0257, "step": 10272 }, { "epoch": 2.754894073478144, "grad_norm": 0.24955000156224624, "learning_rate": 2.0197607601032067e-07, "loss": 0.0111, "step": 10273 }, { "epoch": 2.7551622418879056, "grad_norm": 0.3205979609858217, "learning_rate": 2.015373486612199e-07, "loss": 0.026, "step": 10274 }, { "epoch": 2.755430410297667, "grad_norm": 0.22140605952717737, "learning_rate": 2.010990885256875e-07, "loss": 0.0126, "step": 10275 }, { "epoch": 2.7556985787074284, "grad_norm": 0.2612654470878207, "learning_rate": 2.0066129564639658e-07, "loss": 0.0126, "step": 10276 }, { "epoch": 2.7559667471171894, "grad_norm": 0.379459263528785, "learning_rate": 2.002239700659725e-07, "loss": 0.0199, "step": 10277 }, { "epoch": 2.7562349155269508, "grad_norm": 0.36599950663530945, "learning_rate": 1.9978711182699783e-07, "loss": 0.0122, "step": 10278 }, { "epoch": 2.756503083936712, "grad_norm": 0.45018369586827184, "learning_rate": 1.9935072097200693e-07, "loss": 0.0165, "step": 10279 }, { "epoch": 2.7567712523464736, "grad_norm": 0.31162531621030426, "learning_rate": 1.989147975434913e-07, "loss": 0.0161, "step": 10280 }, { "epoch": 2.757039420756235, "grad_norm": 0.24638270124983994, "learning_rate": 1.9847934158389537e-07, "loss": 0.013, "step": 10281 }, { "epoch": 2.7573075891659964, "grad_norm": 0.2881351775983531, "learning_rate": 1.9804435313561798e-07, "loss": 0.0152, "step": 10282 }, { "epoch": 2.757575757575758, "grad_norm": 0.2815664557017304, "learning_rate": 1.976098322410136e-07, "loss": 0.0147, "step": 10283 }, { "epoch": 2.7578439259855188, "grad_norm": 0.4014799892547454, "learning_rate": 1.9717577894238893e-07, "loss": 0.0247, "step": 10284 }, { "epoch": 2.75811209439528, "grad_norm": 0.2727684225006125, "learning_rate": 1.9674219328200738e-07, "loss": 0.0113, "step": 10285 }, { "epoch": 2.7583802628050416, "grad_norm": 0.2464718439555285, "learning_rate": 1.9630907530208741e-07, "loss": 0.0125, "step": 10286 }, { "epoch": 2.758648431214803, "grad_norm": 0.25411832706798865, "learning_rate": 1.958764250447981e-07, "loss": 0.0173, "step": 10287 }, { "epoch": 2.7589165996245644, "grad_norm": 0.28624250104542454, "learning_rate": 1.9544424255226735e-07, "loss": 0.0166, "step": 10288 }, { "epoch": 2.7591847680343253, "grad_norm": 0.2241679073275532, "learning_rate": 1.9501252786657376e-07, "loss": 0.0114, "step": 10289 }, { "epoch": 2.7594529364440867, "grad_norm": 0.2624428034302116, "learning_rate": 1.945812810297537e-07, "loss": 0.0132, "step": 10290 }, { "epoch": 2.759721104853848, "grad_norm": 0.2370167165045738, "learning_rate": 1.9415050208379638e-07, "loss": 0.0138, "step": 10291 }, { "epoch": 2.7599892732636095, "grad_norm": 0.23993197510487468, "learning_rate": 1.9372019107064487e-07, "loss": 0.0141, "step": 10292 }, { "epoch": 2.760257441673371, "grad_norm": 0.19245829680014842, "learning_rate": 1.9329034803219903e-07, "loss": 0.0099, "step": 10293 }, { "epoch": 2.7605256100831324, "grad_norm": 0.23027509944270383, "learning_rate": 1.9286097301030927e-07, "loss": 0.0123, "step": 10294 }, { "epoch": 2.7607937784928938, "grad_norm": 0.2075698026182642, "learning_rate": 1.9243206604678487e-07, "loss": 0.0108, "step": 10295 }, { "epoch": 2.7610619469026547, "grad_norm": 0.17967105219804216, "learning_rate": 1.9200362718338528e-07, "loss": 0.0085, "step": 10296 }, { "epoch": 2.761330115312416, "grad_norm": 0.17285989310868857, "learning_rate": 1.915756564618282e-07, "loss": 0.0095, "step": 10297 }, { "epoch": 2.7615982837221775, "grad_norm": 0.2515371419058884, "learning_rate": 1.9114815392378362e-07, "loss": 0.0141, "step": 10298 }, { "epoch": 2.761866452131939, "grad_norm": 0.3648060177137761, "learning_rate": 1.9072111961087546e-07, "loss": 0.0265, "step": 10299 }, { "epoch": 2.7621346205417003, "grad_norm": 0.2574701574198003, "learning_rate": 1.902945535646833e-07, "loss": 0.0153, "step": 10300 }, { "epoch": 2.7624027889514613, "grad_norm": 0.2412810247703087, "learning_rate": 1.8986845582674164e-07, "loss": 0.0132, "step": 10301 }, { "epoch": 2.7626709573612227, "grad_norm": 0.20926607655787768, "learning_rate": 1.894428264385373e-07, "loss": 0.0099, "step": 10302 }, { "epoch": 2.762939125770984, "grad_norm": 0.1819578201015022, "learning_rate": 1.890176654415138e-07, "loss": 0.0094, "step": 10303 }, { "epoch": 2.7632072941807455, "grad_norm": 0.2701035941216311, "learning_rate": 1.8859297287706636e-07, "loss": 0.0136, "step": 10304 }, { "epoch": 2.763475462590507, "grad_norm": 0.20569477944370845, "learning_rate": 1.88168748786548e-07, "loss": 0.0102, "step": 10305 }, { "epoch": 2.7637436310002683, "grad_norm": 0.23322955056802552, "learning_rate": 1.8774499321126293e-07, "loss": 0.01, "step": 10306 }, { "epoch": 2.7640117994100297, "grad_norm": 0.23871855309907736, "learning_rate": 1.8732170619247093e-07, "loss": 0.0191, "step": 10307 }, { "epoch": 2.7642799678197907, "grad_norm": 0.2149647640690599, "learning_rate": 1.8689888777138843e-07, "loss": 0.015, "step": 10308 }, { "epoch": 2.764548136229552, "grad_norm": 0.3155170795165717, "learning_rate": 1.864765379891814e-07, "loss": 0.0124, "step": 10309 }, { "epoch": 2.7648163046393135, "grad_norm": 0.21049758398944282, "learning_rate": 1.8605465688697532e-07, "loss": 0.0105, "step": 10310 }, { "epoch": 2.765084473049075, "grad_norm": 0.2863906521356286, "learning_rate": 1.856332445058462e-07, "loss": 0.0164, "step": 10311 }, { "epoch": 2.7653526414588363, "grad_norm": 0.2270089325208188, "learning_rate": 1.8521230088682564e-07, "loss": 0.0127, "step": 10312 }, { "epoch": 2.7656208098685973, "grad_norm": 0.27355287480396784, "learning_rate": 1.8479182607090085e-07, "loss": 0.0147, "step": 10313 }, { "epoch": 2.7658889782783587, "grad_norm": 0.17183495368980595, "learning_rate": 1.8437182009901134e-07, "loss": 0.0077, "step": 10314 }, { "epoch": 2.76615714668812, "grad_norm": 0.5598178295342714, "learning_rate": 1.8395228301205325e-07, "loss": 0.0142, "step": 10315 }, { "epoch": 2.7664253150978815, "grad_norm": 0.43847833037313527, "learning_rate": 1.8353321485087394e-07, "loss": 0.0163, "step": 10316 }, { "epoch": 2.766693483507643, "grad_norm": 0.21230966128304207, "learning_rate": 1.8311461565627852e-07, "loss": 0.0119, "step": 10317 }, { "epoch": 2.7669616519174043, "grad_norm": 0.28864930293786856, "learning_rate": 1.826964854690244e-07, "loss": 0.0129, "step": 10318 }, { "epoch": 2.7672298203271657, "grad_norm": 0.2991216039914053, "learning_rate": 1.8227882432982347e-07, "loss": 0.0146, "step": 10319 }, { "epoch": 2.7674979887369267, "grad_norm": 0.2704800470352621, "learning_rate": 1.8186163227934316e-07, "loss": 0.0143, "step": 10320 }, { "epoch": 2.767766157146688, "grad_norm": 0.204038761943951, "learning_rate": 1.814449093582038e-07, "loss": 0.0098, "step": 10321 }, { "epoch": 2.7680343255564495, "grad_norm": 0.3319624950140885, "learning_rate": 1.8102865560698068e-07, "loss": 0.0122, "step": 10322 }, { "epoch": 2.768302493966211, "grad_norm": 0.3257821344791158, "learning_rate": 1.8061287106620308e-07, "loss": 0.0181, "step": 10323 }, { "epoch": 2.7685706623759723, "grad_norm": 0.20001357546567808, "learning_rate": 1.8019755577635468e-07, "loss": 0.0109, "step": 10324 }, { "epoch": 2.7688388307857332, "grad_norm": 0.23585049574639466, "learning_rate": 1.7978270977787426e-07, "loss": 0.01, "step": 10325 }, { "epoch": 2.7691069991954946, "grad_norm": 0.20227424915313286, "learning_rate": 1.7936833311115398e-07, "loss": 0.0094, "step": 10326 }, { "epoch": 2.769375167605256, "grad_norm": 0.2320709775885682, "learning_rate": 1.7895442581653988e-07, "loss": 0.0118, "step": 10327 }, { "epoch": 2.7696433360150174, "grad_norm": 0.19016772083065708, "learning_rate": 1.7854098793433472e-07, "loss": 0.0078, "step": 10328 }, { "epoch": 2.769911504424779, "grad_norm": 0.25879980862355717, "learning_rate": 1.7812801950479186e-07, "loss": 0.0111, "step": 10329 }, { "epoch": 2.7701796728345403, "grad_norm": 0.22364016472566298, "learning_rate": 1.7771552056812303e-07, "loss": 0.0112, "step": 10330 }, { "epoch": 2.7704478412443017, "grad_norm": 0.24812615777637864, "learning_rate": 1.7730349116448998e-07, "loss": 0.0133, "step": 10331 }, { "epoch": 2.7707160096540626, "grad_norm": 0.32595556281030735, "learning_rate": 1.7689193133401284e-07, "loss": 0.0201, "step": 10332 }, { "epoch": 2.770984178063824, "grad_norm": 0.335697145560089, "learning_rate": 1.764808411167629e-07, "loss": 0.0201, "step": 10333 }, { "epoch": 2.7712523464735854, "grad_norm": 0.23592271550726876, "learning_rate": 1.7607022055276758e-07, "loss": 0.0101, "step": 10334 }, { "epoch": 2.771520514883347, "grad_norm": 0.24657334384179988, "learning_rate": 1.7566006968200712e-07, "loss": 0.0118, "step": 10335 }, { "epoch": 2.7717886832931082, "grad_norm": 0.24704361849458073, "learning_rate": 1.7525038854441734e-07, "loss": 0.0177, "step": 10336 }, { "epoch": 2.772056851702869, "grad_norm": 0.265406988641684, "learning_rate": 1.7484117717988746e-07, "loss": 0.0135, "step": 10337 }, { "epoch": 2.7723250201126306, "grad_norm": 0.181827170597123, "learning_rate": 1.7443243562826228e-07, "loss": 0.0082, "step": 10338 }, { "epoch": 2.772593188522392, "grad_norm": 0.2827419545725674, "learning_rate": 1.7402416392933885e-07, "loss": 0.0112, "step": 10339 }, { "epoch": 2.7728613569321534, "grad_norm": 0.29644413744040143, "learning_rate": 1.7361636212287036e-07, "loss": 0.0124, "step": 10340 }, { "epoch": 2.773129525341915, "grad_norm": 0.2218596425328855, "learning_rate": 1.7320903024856227e-07, "loss": 0.013, "step": 10341 }, { "epoch": 2.773397693751676, "grad_norm": 0.21982000024457082, "learning_rate": 1.728021683460762e-07, "loss": 0.0117, "step": 10342 }, { "epoch": 2.7736658621614376, "grad_norm": 0.2526653705505683, "learning_rate": 1.7239577645502714e-07, "loss": 0.0112, "step": 10343 }, { "epoch": 2.7739340305711986, "grad_norm": 0.20579035471585921, "learning_rate": 1.7198985461498398e-07, "loss": 0.0105, "step": 10344 }, { "epoch": 2.77420219898096, "grad_norm": 0.18570489694528705, "learning_rate": 1.7158440286547117e-07, "loss": 0.0097, "step": 10345 }, { "epoch": 2.7744703673907214, "grad_norm": 0.24743635872723702, "learning_rate": 1.7117942124596555e-07, "loss": 0.0102, "step": 10346 }, { "epoch": 2.774738535800483, "grad_norm": 0.27089635845612203, "learning_rate": 1.7077490979589996e-07, "loss": 0.016, "step": 10347 }, { "epoch": 2.775006704210244, "grad_norm": 0.2658242080535775, "learning_rate": 1.7037086855465902e-07, "loss": 0.0147, "step": 10348 }, { "epoch": 2.775274872620005, "grad_norm": 0.2420758360240541, "learning_rate": 1.6996729756158403e-07, "loss": 0.0112, "step": 10349 }, { "epoch": 2.7755430410297666, "grad_norm": 0.1666718013045961, "learning_rate": 1.695641968559708e-07, "loss": 0.0122, "step": 10350 }, { "epoch": 2.775811209439528, "grad_norm": 0.24409090648180884, "learning_rate": 1.6916156647706683e-07, "loss": 0.0173, "step": 10351 }, { "epoch": 2.7760793778492894, "grad_norm": 0.23448958354029764, "learning_rate": 1.687594064640752e-07, "loss": 0.0103, "step": 10352 }, { "epoch": 2.776347546259051, "grad_norm": 0.351794842035431, "learning_rate": 1.6835771685615343e-07, "loss": 0.0256, "step": 10353 }, { "epoch": 2.7766157146688117, "grad_norm": 0.2847824787892679, "learning_rate": 1.6795649769241307e-07, "loss": 0.0152, "step": 10354 }, { "epoch": 2.7768838830785736, "grad_norm": 0.21984114751109476, "learning_rate": 1.6755574901192007e-07, "loss": 0.0121, "step": 10355 }, { "epoch": 2.7771520514883345, "grad_norm": 0.2768227547924538, "learning_rate": 1.6715547085369378e-07, "loss": 0.0149, "step": 10356 }, { "epoch": 2.777420219898096, "grad_norm": 0.27439159070393127, "learning_rate": 1.6675566325670856e-07, "loss": 0.0172, "step": 10357 }, { "epoch": 2.7776883883078574, "grad_norm": 0.31181356151709716, "learning_rate": 1.6635632625989163e-07, "loss": 0.0136, "step": 10358 }, { "epoch": 2.7779565567176188, "grad_norm": 0.24751533358959624, "learning_rate": 1.6595745990212686e-07, "loss": 0.0095, "step": 10359 }, { "epoch": 2.77822472512738, "grad_norm": 0.18923389868414806, "learning_rate": 1.6555906422225043e-07, "loss": 0.0102, "step": 10360 }, { "epoch": 2.778492893537141, "grad_norm": 0.24941852697524652, "learning_rate": 1.6516113925905185e-07, "loss": 0.0125, "step": 10361 }, { "epoch": 2.7787610619469025, "grad_norm": 0.35410922350237184, "learning_rate": 1.647636850512774e-07, "loss": 0.0288, "step": 10362 }, { "epoch": 2.779029230356664, "grad_norm": 0.2262450056972405, "learning_rate": 1.6436670163762492e-07, "loss": 0.0118, "step": 10363 }, { "epoch": 2.7792973987664253, "grad_norm": 0.29306291273751933, "learning_rate": 1.6397018905674912e-07, "loss": 0.0147, "step": 10364 }, { "epoch": 2.7795655671761867, "grad_norm": 0.32147155883807826, "learning_rate": 1.635741473472563e-07, "loss": 0.0145, "step": 10365 }, { "epoch": 2.7798337355859477, "grad_norm": 0.2551642323049274, "learning_rate": 1.6317857654770787e-07, "loss": 0.011, "step": 10366 }, { "epoch": 2.7801019039957096, "grad_norm": 0.3308911839901085, "learning_rate": 1.6278347669662077e-07, "loss": 0.0187, "step": 10367 }, { "epoch": 2.7803700724054705, "grad_norm": 0.3146301942924829, "learning_rate": 1.6238884783246312e-07, "loss": 0.0187, "step": 10368 }, { "epoch": 2.780638240815232, "grad_norm": 0.3258296054709916, "learning_rate": 1.6199468999366086e-07, "loss": 0.0263, "step": 10369 }, { "epoch": 2.7809064092249933, "grad_norm": 0.203573694491409, "learning_rate": 1.6160100321858996e-07, "loss": 0.0131, "step": 10370 }, { "epoch": 2.7811745776347547, "grad_norm": 0.30626492319516063, "learning_rate": 1.6120778754558418e-07, "loss": 0.0107, "step": 10371 }, { "epoch": 2.781442746044516, "grad_norm": 0.17102970758164596, "learning_rate": 1.608150430129296e-07, "loss": 0.0088, "step": 10372 }, { "epoch": 2.781710914454277, "grad_norm": 0.260349658145614, "learning_rate": 1.604227696588656e-07, "loss": 0.0154, "step": 10373 }, { "epoch": 2.7819790828640385, "grad_norm": 0.27792940635653424, "learning_rate": 1.6003096752158886e-07, "loss": 0.0154, "step": 10374 }, { "epoch": 2.7822472512738, "grad_norm": 0.20612069696434848, "learning_rate": 1.596396366392461e-07, "loss": 0.0143, "step": 10375 }, { "epoch": 2.7825154196835613, "grad_norm": 0.22436612925722413, "learning_rate": 1.5924877704994068e-07, "loss": 0.0125, "step": 10376 }, { "epoch": 2.7827835880933227, "grad_norm": 0.21744654798322047, "learning_rate": 1.588583887917311e-07, "loss": 0.0122, "step": 10377 }, { "epoch": 2.7830517565030837, "grad_norm": 0.3061791582866806, "learning_rate": 1.584684719026258e-07, "loss": 0.0132, "step": 10378 }, { "epoch": 2.7833199249128455, "grad_norm": 0.243636010690925, "learning_rate": 1.5807902642059282e-07, "loss": 0.0096, "step": 10379 }, { "epoch": 2.7835880933226065, "grad_norm": 0.2261238031706784, "learning_rate": 1.5769005238354894e-07, "loss": 0.0147, "step": 10380 }, { "epoch": 2.783856261732368, "grad_norm": 0.2553476864221084, "learning_rate": 1.573015498293684e-07, "loss": 0.0222, "step": 10381 }, { "epoch": 2.7841244301421293, "grad_norm": 0.6847567360528842, "learning_rate": 1.569135187958798e-07, "loss": 0.0154, "step": 10382 }, { "epoch": 2.7843925985518907, "grad_norm": 0.2406006914957366, "learning_rate": 1.5652595932086346e-07, "loss": 0.0115, "step": 10383 }, { "epoch": 2.784660766961652, "grad_norm": 0.1809979877173751, "learning_rate": 1.561388714420553e-07, "loss": 0.0098, "step": 10384 }, { "epoch": 2.784928935371413, "grad_norm": 0.267644877138911, "learning_rate": 1.5575225519714464e-07, "loss": 0.0101, "step": 10385 }, { "epoch": 2.7851971037811745, "grad_norm": 0.2444677341742721, "learning_rate": 1.5536611062377583e-07, "loss": 0.0094, "step": 10386 }, { "epoch": 2.785465272190936, "grad_norm": 0.2893272278301197, "learning_rate": 1.5498043775954707e-07, "loss": 0.0149, "step": 10387 }, { "epoch": 2.7857334406006973, "grad_norm": 0.2326711247304831, "learning_rate": 1.5459523664200894e-07, "loss": 0.0158, "step": 10388 }, { "epoch": 2.7860016090104587, "grad_norm": 0.5013544426240463, "learning_rate": 1.5421050730866914e-07, "loss": 0.0156, "step": 10389 }, { "epoch": 2.7862697774202196, "grad_norm": 0.19445489988059045, "learning_rate": 1.5382624979698613e-07, "loss": 0.0107, "step": 10390 }, { "epoch": 2.7865379458299815, "grad_norm": 0.23395153437828692, "learning_rate": 1.5344246414437546e-07, "loss": 0.0125, "step": 10391 }, { "epoch": 2.7868061142397424, "grad_norm": 0.22559644842053048, "learning_rate": 1.5305915038820397e-07, "loss": 0.0138, "step": 10392 }, { "epoch": 2.787074282649504, "grad_norm": 0.22746131041139123, "learning_rate": 1.5267630856579508e-07, "loss": 0.0085, "step": 10393 }, { "epoch": 2.7873424510592653, "grad_norm": 0.27648221746913276, "learning_rate": 1.522939387144251e-07, "loss": 0.0124, "step": 10394 }, { "epoch": 2.7876106194690267, "grad_norm": 0.19119714348072736, "learning_rate": 1.519120408713237e-07, "loss": 0.0094, "step": 10395 }, { "epoch": 2.787878787878788, "grad_norm": 0.22126481607399828, "learning_rate": 1.515306150736745e-07, "loss": 0.0124, "step": 10396 }, { "epoch": 2.788146956288549, "grad_norm": 0.2074502966363226, "learning_rate": 1.511496613586183e-07, "loss": 0.0092, "step": 10397 }, { "epoch": 2.7884151246983104, "grad_norm": 0.2194114895785131, "learning_rate": 1.507691797632449e-07, "loss": 0.0134, "step": 10398 }, { "epoch": 2.788683293108072, "grad_norm": 0.18321003190076166, "learning_rate": 1.5038917032460298e-07, "loss": 0.0078, "step": 10399 }, { "epoch": 2.7889514615178332, "grad_norm": 0.2879240989180353, "learning_rate": 1.500096330796913e-07, "loss": 0.0199, "step": 10400 }, { "epoch": 2.7892196299275946, "grad_norm": 0.2804627396054786, "learning_rate": 1.4963056806546582e-07, "loss": 0.0169, "step": 10401 }, { "epoch": 2.7894877983373556, "grad_norm": 0.2515257040649049, "learning_rate": 1.4925197531883374e-07, "loss": 0.0133, "step": 10402 }, { "epoch": 2.7897559667471175, "grad_norm": 0.28994729813499437, "learning_rate": 1.4887385487665828e-07, "loss": 0.0126, "step": 10403 }, { "epoch": 2.7900241351568784, "grad_norm": 0.20802634294868103, "learning_rate": 1.484962067757567e-07, "loss": 0.0121, "step": 10404 }, { "epoch": 2.79029230356664, "grad_norm": 0.24242197527195733, "learning_rate": 1.4811903105289904e-07, "loss": 0.0112, "step": 10405 }, { "epoch": 2.7905604719764012, "grad_norm": 0.2480661374254817, "learning_rate": 1.4774232774480978e-07, "loss": 0.0117, "step": 10406 }, { "epoch": 2.7908286403861626, "grad_norm": 0.24038377504023176, "learning_rate": 1.4736609688816738e-07, "loss": 0.0168, "step": 10407 }, { "epoch": 2.791096808795924, "grad_norm": 0.2560227372163319, "learning_rate": 1.469903385196042e-07, "loss": 0.0141, "step": 10408 }, { "epoch": 2.791364977205685, "grad_norm": 0.22674291457777546, "learning_rate": 1.466150526757082e-07, "loss": 0.0108, "step": 10409 }, { "epoch": 2.7916331456154464, "grad_norm": 0.2887108267352762, "learning_rate": 1.4624023939301845e-07, "loss": 0.0135, "step": 10410 }, { "epoch": 2.791901314025208, "grad_norm": 0.2622124591864205, "learning_rate": 1.4586589870803025e-07, "loss": 0.0129, "step": 10411 }, { "epoch": 2.792169482434969, "grad_norm": 0.2562398963460012, "learning_rate": 1.4549203065719163e-07, "loss": 0.0098, "step": 10412 }, { "epoch": 2.7924376508447306, "grad_norm": 0.30412303316149014, "learning_rate": 1.4511863527690573e-07, "loss": 0.0142, "step": 10413 }, { "epoch": 2.7927058192544916, "grad_norm": 0.31076244578127227, "learning_rate": 1.4474571260352898e-07, "loss": 0.0212, "step": 10414 }, { "epoch": 2.7929739876642534, "grad_norm": 0.3657686841788158, "learning_rate": 1.443732626733718e-07, "loss": 0.0211, "step": 10415 }, { "epoch": 2.7932421560740144, "grad_norm": 0.24214448364605679, "learning_rate": 1.4400128552269855e-07, "loss": 0.0107, "step": 10416 }, { "epoch": 2.793510324483776, "grad_norm": 0.23584827647361203, "learning_rate": 1.436297811877274e-07, "loss": 0.0115, "step": 10417 }, { "epoch": 2.793778492893537, "grad_norm": 0.28820192373450965, "learning_rate": 1.4325874970463172e-07, "loss": 0.015, "step": 10418 }, { "epoch": 2.7940466613032986, "grad_norm": 0.2237656459835972, "learning_rate": 1.42888191109537e-07, "loss": 0.0109, "step": 10419 }, { "epoch": 2.79431482971306, "grad_norm": 0.24680954567322236, "learning_rate": 1.425181054385233e-07, "loss": 0.012, "step": 10420 }, { "epoch": 2.794582998122821, "grad_norm": 0.232064208063594, "learning_rate": 1.4214849272762565e-07, "loss": 0.0105, "step": 10421 }, { "epoch": 2.7948511665325824, "grad_norm": 0.2785547264126087, "learning_rate": 1.417793530128314e-07, "loss": 0.0184, "step": 10422 }, { "epoch": 2.7951193349423438, "grad_norm": 0.26641413334934144, "learning_rate": 1.414106863300835e-07, "loss": 0.0162, "step": 10423 }, { "epoch": 2.795387503352105, "grad_norm": 0.21820182422666062, "learning_rate": 1.4104249271527815e-07, "loss": 0.0167, "step": 10424 }, { "epoch": 2.7956556717618666, "grad_norm": 0.2559928207004348, "learning_rate": 1.406747722042645e-07, "loss": 0.0155, "step": 10425 }, { "epoch": 2.7959238401716275, "grad_norm": 0.2977755717115644, "learning_rate": 1.4030752483284727e-07, "loss": 0.0178, "step": 10426 }, { "epoch": 2.7961920085813894, "grad_norm": 0.2220591626514631, "learning_rate": 1.3994075063678393e-07, "loss": 0.0097, "step": 10427 }, { "epoch": 2.7964601769911503, "grad_norm": 0.19230928808945955, "learning_rate": 1.3957444965178703e-07, "loss": 0.0086, "step": 10428 }, { "epoch": 2.7967283454009118, "grad_norm": 0.9443152836682657, "learning_rate": 1.392086219135208e-07, "loss": 0.0083, "step": 10429 }, { "epoch": 2.796996513810673, "grad_norm": 0.21101384917978036, "learning_rate": 1.3884326745760622e-07, "loss": 0.0123, "step": 10430 }, { "epoch": 2.7972646822204346, "grad_norm": 0.22971679317096116, "learning_rate": 1.3847838631961764e-07, "loss": 0.013, "step": 10431 }, { "epoch": 2.797532850630196, "grad_norm": 0.28036280132388014, "learning_rate": 1.3811397853508047e-07, "loss": 0.0106, "step": 10432 }, { "epoch": 2.797801019039957, "grad_norm": 0.2576678558789783, "learning_rate": 1.377500441394769e-07, "loss": 0.0133, "step": 10433 }, { "epoch": 2.7980691874497183, "grad_norm": 0.4126172021834964, "learning_rate": 1.373865831682436e-07, "loss": 0.0333, "step": 10434 }, { "epoch": 2.7983373558594797, "grad_norm": 0.32729245952100927, "learning_rate": 1.370235956567678e-07, "loss": 0.0374, "step": 10435 }, { "epoch": 2.798605524269241, "grad_norm": 0.3309749441799219, "learning_rate": 1.3666108164039406e-07, "loss": 0.0167, "step": 10436 }, { "epoch": 2.7988736926790025, "grad_norm": 0.2787860629229127, "learning_rate": 1.362990411544185e-07, "loss": 0.0148, "step": 10437 }, { "epoch": 2.7991418610887635, "grad_norm": 0.2478465003549631, "learning_rate": 1.35937474234093e-07, "loss": 0.0133, "step": 10438 }, { "epoch": 2.799410029498525, "grad_norm": 0.2194953338305714, "learning_rate": 1.3557638091462099e-07, "loss": 0.0096, "step": 10439 }, { "epoch": 2.7996781979082863, "grad_norm": 0.2449033353376612, "learning_rate": 1.352157612311622e-07, "loss": 0.0139, "step": 10440 }, { "epoch": 2.7999463663180477, "grad_norm": 0.28415259940058196, "learning_rate": 1.3485561521882906e-07, "loss": 0.0145, "step": 10441 }, { "epoch": 2.800214534727809, "grad_norm": 0.2637487643923356, "learning_rate": 1.3449594291268797e-07, "loss": 0.0171, "step": 10442 }, { "epoch": 2.8004827031375705, "grad_norm": 0.33749852024804883, "learning_rate": 1.341367443477598e-07, "loss": 0.0139, "step": 10443 }, { "epoch": 2.800750871547332, "grad_norm": 0.20998618750802264, "learning_rate": 1.337780195590177e-07, "loss": 0.0105, "step": 10444 }, { "epoch": 2.801019039957093, "grad_norm": 0.23861127514112854, "learning_rate": 1.3341976858139038e-07, "loss": 0.0173, "step": 10445 }, { "epoch": 2.8012872083668543, "grad_norm": 0.2455684868640165, "learning_rate": 1.330619914497594e-07, "loss": 0.0111, "step": 10446 }, { "epoch": 2.8015553767766157, "grad_norm": 0.25615967860042893, "learning_rate": 1.3270468819896076e-07, "loss": 0.0112, "step": 10447 }, { "epoch": 2.801823545186377, "grad_norm": 0.2703378774023744, "learning_rate": 1.3234785886378499e-07, "loss": 0.0155, "step": 10448 }, { "epoch": 2.8020917135961385, "grad_norm": 0.26591268382646427, "learning_rate": 1.3199150347897427e-07, "loss": 0.0164, "step": 10449 }, { "epoch": 2.8023598820058995, "grad_norm": 0.2619501554580531, "learning_rate": 1.316356220792264e-07, "loss": 0.0152, "step": 10450 }, { "epoch": 2.802628050415661, "grad_norm": 0.22985707092005225, "learning_rate": 1.3128021469919317e-07, "loss": 0.0117, "step": 10451 }, { "epoch": 2.8028962188254223, "grad_norm": 0.2450159160562203, "learning_rate": 1.3092528137347848e-07, "loss": 0.011, "step": 10452 }, { "epoch": 2.8031643872351837, "grad_norm": 0.32912648627599583, "learning_rate": 1.3057082213664307e-07, "loss": 0.0137, "step": 10453 }, { "epoch": 2.803432555644945, "grad_norm": 0.2278205689066198, "learning_rate": 1.3021683702319766e-07, "loss": 0.0138, "step": 10454 }, { "epoch": 2.8037007240547065, "grad_norm": 0.253014305577512, "learning_rate": 1.2986332606761077e-07, "loss": 0.017, "step": 10455 }, { "epoch": 2.803968892464468, "grad_norm": 0.42745883173923505, "learning_rate": 1.2951028930430155e-07, "loss": 0.0166, "step": 10456 }, { "epoch": 2.804237060874229, "grad_norm": 0.325634582241657, "learning_rate": 1.2915772676764416e-07, "loss": 0.0173, "step": 10457 }, { "epoch": 2.8045052292839903, "grad_norm": 0.3077726016166347, "learning_rate": 1.2880563849196725e-07, "loss": 0.0178, "step": 10458 }, { "epoch": 2.8047733976937517, "grad_norm": 0.2282917340907862, "learning_rate": 1.2845402451155231e-07, "loss": 0.0117, "step": 10459 }, { "epoch": 2.805041566103513, "grad_norm": 0.26700993642872733, "learning_rate": 1.2810288486063582e-07, "loss": 0.0115, "step": 10460 }, { "epoch": 2.8053097345132745, "grad_norm": 0.20885392443741777, "learning_rate": 1.2775221957340654e-07, "loss": 0.0109, "step": 10461 }, { "epoch": 2.8055779029230354, "grad_norm": 0.18612941625924867, "learning_rate": 1.2740202868400776e-07, "loss": 0.0116, "step": 10462 }, { "epoch": 2.805846071332797, "grad_norm": 0.3363810787023966, "learning_rate": 1.2705231222653768e-07, "loss": 0.0263, "step": 10463 }, { "epoch": 2.8061142397425582, "grad_norm": 0.2539166688734785, "learning_rate": 1.267030702350458e-07, "loss": 0.0147, "step": 10464 }, { "epoch": 2.8063824081523197, "grad_norm": 0.22304391928935519, "learning_rate": 1.263543027435382e-07, "loss": 0.0092, "step": 10465 }, { "epoch": 2.806650576562081, "grad_norm": 0.2653321377312349, "learning_rate": 1.2600600978597222e-07, "loss": 0.0191, "step": 10466 }, { "epoch": 2.8069187449718425, "grad_norm": 0.2287778530073877, "learning_rate": 1.2565819139626123e-07, "loss": 0.0139, "step": 10467 }, { "epoch": 2.807186913381604, "grad_norm": 0.2986165970589626, "learning_rate": 1.253108476082715e-07, "loss": 0.0177, "step": 10468 }, { "epoch": 2.807455081791365, "grad_norm": 0.2389811203872515, "learning_rate": 1.2496397845582152e-07, "loss": 0.0172, "step": 10469 }, { "epoch": 2.8077232502011262, "grad_norm": 0.2313254011790213, "learning_rate": 1.246175839726871e-07, "loss": 0.0093, "step": 10470 }, { "epoch": 2.8079914186108876, "grad_norm": 0.2540078090779224, "learning_rate": 1.242716641925934e-07, "loss": 0.0134, "step": 10471 }, { "epoch": 2.808259587020649, "grad_norm": 0.23870018947824648, "learning_rate": 1.2392621914922354e-07, "loss": 0.0121, "step": 10472 }, { "epoch": 2.8085277554304104, "grad_norm": 0.20133152690268585, "learning_rate": 1.2358124887621225e-07, "loss": 0.0081, "step": 10473 }, { "epoch": 2.8087959238401714, "grad_norm": 0.2951443022709948, "learning_rate": 1.2323675340714768e-07, "loss": 0.0155, "step": 10474 }, { "epoch": 2.809064092249933, "grad_norm": 0.20949239363459768, "learning_rate": 1.2289273277557356e-07, "loss": 0.0105, "step": 10475 }, { "epoch": 2.809332260659694, "grad_norm": 0.25121939776565294, "learning_rate": 1.2254918701498474e-07, "loss": 0.0095, "step": 10476 }, { "epoch": 2.8096004290694556, "grad_norm": 0.25489599586964196, "learning_rate": 1.222061161588328e-07, "loss": 0.0142, "step": 10477 }, { "epoch": 2.809868597479217, "grad_norm": 0.3713140867020353, "learning_rate": 1.218635202405205e-07, "loss": 0.0146, "step": 10478 }, { "epoch": 2.8101367658889784, "grad_norm": 0.26517108704676373, "learning_rate": 1.215213992934061e-07, "loss": 0.0121, "step": 10479 }, { "epoch": 2.81040493429874, "grad_norm": 0.23486400485530132, "learning_rate": 1.211797533508019e-07, "loss": 0.0104, "step": 10480 }, { "epoch": 2.810673102708501, "grad_norm": 0.22152003535505607, "learning_rate": 1.2083858244597124e-07, "loss": 0.0121, "step": 10481 }, { "epoch": 2.810941271118262, "grad_norm": 0.21521823373193297, "learning_rate": 1.2049788661213367e-07, "loss": 0.0139, "step": 10482 }, { "epoch": 2.8112094395280236, "grad_norm": 0.24934471274133665, "learning_rate": 1.2015766588246268e-07, "loss": 0.0168, "step": 10483 }, { "epoch": 2.811477607937785, "grad_norm": 0.18632973811217737, "learning_rate": 1.1981792029008342e-07, "loss": 0.011, "step": 10484 }, { "epoch": 2.8117457763475464, "grad_norm": 0.24445712819532772, "learning_rate": 1.1947864986807722e-07, "loss": 0.0133, "step": 10485 }, { "epoch": 2.8120139447573074, "grad_norm": 0.3464566346990064, "learning_rate": 1.1913985464947652e-07, "loss": 0.0162, "step": 10486 }, { "epoch": 2.8122821131670688, "grad_norm": 0.2157737623394709, "learning_rate": 1.1880153466727107e-07, "loss": 0.0086, "step": 10487 }, { "epoch": 2.81255028157683, "grad_norm": 0.221391901442086, "learning_rate": 1.184636899543995e-07, "loss": 0.0111, "step": 10488 }, { "epoch": 2.8128184499865916, "grad_norm": 0.3390963824654852, "learning_rate": 1.1812632054375883e-07, "loss": 0.0166, "step": 10489 }, { "epoch": 2.813086618396353, "grad_norm": 0.24287132858084956, "learning_rate": 1.1778942646819724e-07, "loss": 0.0114, "step": 10490 }, { "epoch": 2.8133547868061144, "grad_norm": 0.34294226074080464, "learning_rate": 1.1745300776051683e-07, "loss": 0.0164, "step": 10491 }, { "epoch": 2.813622955215876, "grad_norm": 0.23270887858535036, "learning_rate": 1.171170644534747e-07, "loss": 0.0089, "step": 10492 }, { "epoch": 2.8138911236256368, "grad_norm": 0.23224269257798463, "learning_rate": 1.1678159657978027e-07, "loss": 0.0144, "step": 10493 }, { "epoch": 2.814159292035398, "grad_norm": 0.18104815265243018, "learning_rate": 1.164466041720963e-07, "loss": 0.0073, "step": 10494 }, { "epoch": 2.8144274604451596, "grad_norm": 0.21350395185096221, "learning_rate": 1.1611208726304169e-07, "loss": 0.0094, "step": 10495 }, { "epoch": 2.814695628854921, "grad_norm": 0.2486309338339223, "learning_rate": 1.1577804588518593e-07, "loss": 0.0174, "step": 10496 }, { "epoch": 2.8149637972646824, "grad_norm": 0.28649282558673866, "learning_rate": 1.1544448007105524e-07, "loss": 0.0117, "step": 10497 }, { "epoch": 2.8152319656744433, "grad_norm": 0.2431961914125149, "learning_rate": 1.1511138985312698e-07, "loss": 0.0089, "step": 10498 }, { "epoch": 2.8155001340842047, "grad_norm": 0.3257855778533044, "learning_rate": 1.1477877526383296e-07, "loss": 0.0126, "step": 10499 }, { "epoch": 2.815768302493966, "grad_norm": 0.18654397407433732, "learning_rate": 1.1444663633556063e-07, "loss": 0.0073, "step": 10500 }, { "epoch": 2.8160364709037276, "grad_norm": 0.21329784236664553, "learning_rate": 1.1411497310064745e-07, "loss": 0.0112, "step": 10501 }, { "epoch": 2.816304639313489, "grad_norm": 0.2330606826695789, "learning_rate": 1.1378378559138814e-07, "loss": 0.0086, "step": 10502 }, { "epoch": 2.8165728077232504, "grad_norm": 0.3100593980493565, "learning_rate": 1.1345307384002857e-07, "loss": 0.024, "step": 10503 }, { "epoch": 2.8168409761330118, "grad_norm": 0.2350491979618477, "learning_rate": 1.131228378787702e-07, "loss": 0.0147, "step": 10504 }, { "epoch": 2.8171091445427727, "grad_norm": 0.2775991165155188, "learning_rate": 1.1279307773976677e-07, "loss": 0.0136, "step": 10505 }, { "epoch": 2.817377312952534, "grad_norm": 0.2707360254237531, "learning_rate": 1.1246379345512593e-07, "loss": 0.0122, "step": 10506 }, { "epoch": 2.8176454813622955, "grad_norm": 0.227220094865465, "learning_rate": 1.1213498505690924e-07, "loss": 0.0086, "step": 10507 }, { "epoch": 2.817913649772057, "grad_norm": 0.22817919608034942, "learning_rate": 1.1180665257713164e-07, "loss": 0.01, "step": 10508 }, { "epoch": 2.8181818181818183, "grad_norm": 0.28890061337198, "learning_rate": 1.1147879604776258e-07, "loss": 0.018, "step": 10509 }, { "epoch": 2.8184499865915793, "grad_norm": 0.3070013579283648, "learning_rate": 1.1115141550072483e-07, "loss": 0.0104, "step": 10510 }, { "epoch": 2.8187181550013407, "grad_norm": 0.2435290939246613, "learning_rate": 1.1082451096789348e-07, "loss": 0.0134, "step": 10511 }, { "epoch": 2.818986323411102, "grad_norm": 0.27000591558114306, "learning_rate": 1.1049808248109973e-07, "loss": 0.0139, "step": 10512 }, { "epoch": 2.8192544918208635, "grad_norm": 0.1944386742758072, "learning_rate": 1.1017213007212536e-07, "loss": 0.0102, "step": 10513 }, { "epoch": 2.819522660230625, "grad_norm": 0.2396153728456049, "learning_rate": 1.0984665377270942e-07, "loss": 0.0097, "step": 10514 }, { "epoch": 2.8197908286403863, "grad_norm": 0.18574527659173157, "learning_rate": 1.0952165361454103e-07, "loss": 0.0096, "step": 10515 }, { "epoch": 2.8200589970501477, "grad_norm": 0.27264315456353905, "learning_rate": 1.0919712962926543e-07, "loss": 0.0178, "step": 10516 }, { "epoch": 2.8203271654599087, "grad_norm": 0.23467576324785558, "learning_rate": 1.0887308184848122e-07, "loss": 0.0166, "step": 10517 }, { "epoch": 2.82059533386967, "grad_norm": 0.23237151439835077, "learning_rate": 1.0854951030373817e-07, "loss": 0.0126, "step": 10518 }, { "epoch": 2.8208635022794315, "grad_norm": 0.2265132858856544, "learning_rate": 1.082264150265433e-07, "loss": 0.012, "step": 10519 }, { "epoch": 2.821131670689193, "grad_norm": 0.2392742797338822, "learning_rate": 1.0790379604835477e-07, "loss": 0.0132, "step": 10520 }, { "epoch": 2.8213998390989543, "grad_norm": 0.2795524375002084, "learning_rate": 1.0758165340058524e-07, "loss": 0.0178, "step": 10521 }, { "epoch": 2.8216680075087153, "grad_norm": 0.2661763907993042, "learning_rate": 1.0725998711460184e-07, "loss": 0.0144, "step": 10522 }, { "epoch": 2.8219361759184767, "grad_norm": 0.22888801914804655, "learning_rate": 1.0693879722172285e-07, "loss": 0.0136, "step": 10523 }, { "epoch": 2.822204344328238, "grad_norm": 0.32246337040229756, "learning_rate": 1.0661808375322269e-07, "loss": 0.0181, "step": 10524 }, { "epoch": 2.8224725127379995, "grad_norm": 0.2958465744163367, "learning_rate": 1.0629784674032751e-07, "loss": 0.0149, "step": 10525 }, { "epoch": 2.822740681147761, "grad_norm": 0.24649871652539673, "learning_rate": 1.0597808621421846e-07, "loss": 0.0169, "step": 10526 }, { "epoch": 2.823008849557522, "grad_norm": 0.3134110662921053, "learning_rate": 1.0565880220603009e-07, "loss": 0.0105, "step": 10527 }, { "epoch": 2.8232770179672837, "grad_norm": 0.17443324557004608, "learning_rate": 1.053399947468503e-07, "loss": 0.008, "step": 10528 }, { "epoch": 2.8235451863770447, "grad_norm": 0.2187714883265498, "learning_rate": 1.0502166386771929e-07, "loss": 0.0199, "step": 10529 }, { "epoch": 2.823813354786806, "grad_norm": 0.22886551935926505, "learning_rate": 1.0470380959963388e-07, "loss": 0.0122, "step": 10530 }, { "epoch": 2.8240815231965675, "grad_norm": 0.2484388046265019, "learning_rate": 1.0438643197354103e-07, "loss": 0.0153, "step": 10531 }, { "epoch": 2.824349691606329, "grad_norm": 0.29250340854280743, "learning_rate": 1.0406953102034434e-07, "loss": 0.0129, "step": 10532 }, { "epoch": 2.8246178600160903, "grad_norm": 0.2011810687082208, "learning_rate": 1.03753106770898e-07, "loss": 0.0092, "step": 10533 }, { "epoch": 2.8248860284258512, "grad_norm": 0.25207777330616665, "learning_rate": 1.034371592560135e-07, "loss": 0.0131, "step": 10534 }, { "epoch": 2.8251541968356126, "grad_norm": 0.23081352811138797, "learning_rate": 1.0312168850645177e-07, "loss": 0.0116, "step": 10535 }, { "epoch": 2.825422365245374, "grad_norm": 0.2778166475281058, "learning_rate": 1.0280669455293046e-07, "loss": 0.0152, "step": 10536 }, { "epoch": 2.8256905336551355, "grad_norm": 0.19949269306913872, "learning_rate": 1.0249217742612005e-07, "loss": 0.0092, "step": 10537 }, { "epoch": 2.825958702064897, "grad_norm": 0.3158668509886965, "learning_rate": 1.0217813715664327e-07, "loss": 0.0226, "step": 10538 }, { "epoch": 2.826226870474658, "grad_norm": 0.20996423286752994, "learning_rate": 1.0186457377507786e-07, "loss": 0.009, "step": 10539 }, { "epoch": 2.8264950388844197, "grad_norm": 0.208105971675455, "learning_rate": 1.0155148731195497e-07, "loss": 0.0084, "step": 10540 }, { "epoch": 2.8267632072941806, "grad_norm": 0.2774483154697275, "learning_rate": 1.0123887779775798e-07, "loss": 0.0125, "step": 10541 }, { "epoch": 2.827031375703942, "grad_norm": 0.24656591178592982, "learning_rate": 1.009267452629259e-07, "loss": 0.0135, "step": 10542 }, { "epoch": 2.8272995441137034, "grad_norm": 0.19404473740136532, "learning_rate": 1.0061508973784995e-07, "loss": 0.0114, "step": 10543 }, { "epoch": 2.827567712523465, "grad_norm": 0.23654619646755948, "learning_rate": 1.0030391125287476e-07, "loss": 0.015, "step": 10544 }, { "epoch": 2.8278358809332262, "grad_norm": 0.23251931017808042, "learning_rate": 9.999320983829941e-08, "loss": 0.0118, "step": 10545 }, { "epoch": 2.828104049342987, "grad_norm": 0.22423685520950068, "learning_rate": 9.968298552437583e-08, "loss": 0.0111, "step": 10546 }, { "epoch": 2.8283722177527486, "grad_norm": 0.2283074961901126, "learning_rate": 9.937323834130985e-08, "loss": 0.0112, "step": 10547 }, { "epoch": 2.82864038616251, "grad_norm": 0.1998370102471561, "learning_rate": 9.906396831926069e-08, "loss": 0.011, "step": 10548 }, { "epoch": 2.8289085545722714, "grad_norm": 0.2712724812963372, "learning_rate": 9.875517548834201e-08, "loss": 0.0151, "step": 10549 }, { "epoch": 2.829176722982033, "grad_norm": 0.2497759348634584, "learning_rate": 9.844685987861813e-08, "loss": 0.0126, "step": 10550 }, { "epoch": 2.829444891391794, "grad_norm": 0.2854105222814435, "learning_rate": 9.813902152011112e-08, "loss": 0.017, "step": 10551 }, { "epoch": 2.8297130598015556, "grad_norm": 0.22328515175330568, "learning_rate": 9.783166044279313e-08, "loss": 0.0151, "step": 10552 }, { "epoch": 2.8299812282113166, "grad_norm": 0.27221734064435466, "learning_rate": 9.752477667659133e-08, "loss": 0.0126, "step": 10553 }, { "epoch": 2.830249396621078, "grad_norm": 0.26142095298663853, "learning_rate": 9.721837025138625e-08, "loss": 0.0131, "step": 10554 }, { "epoch": 2.8305175650308394, "grad_norm": 0.29516283162324486, "learning_rate": 9.691244119701126e-08, "loss": 0.0106, "step": 10555 }, { "epoch": 2.830785733440601, "grad_norm": 0.29419935012549886, "learning_rate": 9.660698954325531e-08, "loss": 0.0151, "step": 10556 }, { "epoch": 2.831053901850362, "grad_norm": 0.2644848867063579, "learning_rate": 9.630201531985794e-08, "loss": 0.0156, "step": 10557 }, { "epoch": 2.831322070260123, "grad_norm": 0.3452619118433279, "learning_rate": 9.599751855651429e-08, "loss": 0.0198, "step": 10558 }, { "epoch": 2.8315902386698846, "grad_norm": 0.2911449097193264, "learning_rate": 9.569349928287231e-08, "loss": 0.0132, "step": 10559 }, { "epoch": 2.831858407079646, "grad_norm": 0.2733649681675306, "learning_rate": 9.538995752853387e-08, "loss": 0.0138, "step": 10560 }, { "epoch": 2.8321265754894074, "grad_norm": 0.2712613627377445, "learning_rate": 9.508689332305365e-08, "loss": 0.0092, "step": 10561 }, { "epoch": 2.832394743899169, "grad_norm": 0.3491678286581166, "learning_rate": 9.47843066959403e-08, "loss": 0.0234, "step": 10562 }, { "epoch": 2.8326629123089297, "grad_norm": 0.1939961486760394, "learning_rate": 9.448219767665579e-08, "loss": 0.0094, "step": 10563 }, { "epoch": 2.8329310807186916, "grad_norm": 0.2276638070120413, "learning_rate": 9.418056629461658e-08, "loss": 0.0103, "step": 10564 }, { "epoch": 2.8331992491284526, "grad_norm": 0.24073772279406377, "learning_rate": 9.38794125791903e-08, "loss": 0.0181, "step": 10565 }, { "epoch": 2.833467417538214, "grad_norm": 0.2871623719892741, "learning_rate": 9.357873655970018e-08, "loss": 0.0121, "step": 10566 }, { "epoch": 2.8337355859479754, "grad_norm": 0.2167964775352199, "learning_rate": 9.327853826542221e-08, "loss": 0.0097, "step": 10567 }, { "epoch": 2.8340037543577368, "grad_norm": 0.2450520580095833, "learning_rate": 9.297881772558581e-08, "loss": 0.0148, "step": 10568 }, { "epoch": 2.834271922767498, "grad_norm": 0.246440215240128, "learning_rate": 9.267957496937429e-08, "loss": 0.0129, "step": 10569 }, { "epoch": 2.834540091177259, "grad_norm": 0.28089009870122456, "learning_rate": 9.238081002592326e-08, "loss": 0.011, "step": 10570 }, { "epoch": 2.8348082595870205, "grad_norm": 0.20602307036459055, "learning_rate": 9.208252292432385e-08, "loss": 0.0096, "step": 10571 }, { "epoch": 2.835076427996782, "grad_norm": 0.2528165779942726, "learning_rate": 9.178471369361841e-08, "loss": 0.0133, "step": 10572 }, { "epoch": 2.8353445964065433, "grad_norm": 0.31999369787628684, "learning_rate": 9.148738236280541e-08, "loss": 0.0235, "step": 10573 }, { "epoch": 2.8356127648163048, "grad_norm": 0.20547858624632034, "learning_rate": 9.11905289608328e-08, "loss": 0.0096, "step": 10574 }, { "epoch": 2.8358809332260657, "grad_norm": 0.22721597528761336, "learning_rate": 9.089415351660635e-08, "loss": 0.0099, "step": 10575 }, { "epoch": 2.8361491016358276, "grad_norm": 0.1888936459211925, "learning_rate": 9.059825605898354e-08, "loss": 0.0086, "step": 10576 }, { "epoch": 2.8364172700455885, "grad_norm": 0.20703990127078115, "learning_rate": 9.030283661677297e-08, "loss": 0.0115, "step": 10577 }, { "epoch": 2.83668543845535, "grad_norm": 0.29628996574905, "learning_rate": 9.000789521874054e-08, "loss": 0.0148, "step": 10578 }, { "epoch": 2.8369536068651113, "grad_norm": 0.25111679464742376, "learning_rate": 8.971343189360437e-08, "loss": 0.0122, "step": 10579 }, { "epoch": 2.8372217752748727, "grad_norm": 0.3115563093804111, "learning_rate": 8.941944667003377e-08, "loss": 0.014, "step": 10580 }, { "epoch": 2.837489943684634, "grad_norm": 0.2266011230599328, "learning_rate": 8.912593957665527e-08, "loss": 0.0096, "step": 10581 }, { "epoch": 2.837758112094395, "grad_norm": 0.2559349868780593, "learning_rate": 8.883291064204492e-08, "loss": 0.0184, "step": 10582 }, { "epoch": 2.8380262805041565, "grad_norm": 0.2190602977356901, "learning_rate": 8.854035989473597e-08, "loss": 0.0121, "step": 10583 }, { "epoch": 2.838294448913918, "grad_norm": 0.38782373003621023, "learning_rate": 8.824828736321178e-08, "loss": 0.021, "step": 10584 }, { "epoch": 2.8385626173236793, "grad_norm": 0.2365477169937749, "learning_rate": 8.795669307591182e-08, "loss": 0.0156, "step": 10585 }, { "epoch": 2.8388307857334407, "grad_norm": 0.23297278833207105, "learning_rate": 8.76655770612278e-08, "loss": 0.0115, "step": 10586 }, { "epoch": 2.8390989541432017, "grad_norm": 0.2454591733971592, "learning_rate": 8.737493934750374e-08, "loss": 0.0088, "step": 10587 }, { "epoch": 2.8393671225529635, "grad_norm": 0.3079366078069074, "learning_rate": 8.708477996303977e-08, "loss": 0.0187, "step": 10588 }, { "epoch": 2.8396352909627245, "grad_norm": 0.259339823241484, "learning_rate": 8.679509893608717e-08, "loss": 0.0122, "step": 10589 }, { "epoch": 2.839903459372486, "grad_norm": 0.2650153092394716, "learning_rate": 8.650589629485118e-08, "loss": 0.0171, "step": 10590 }, { "epoch": 2.8401716277822473, "grad_norm": 0.2797122545855559, "learning_rate": 8.621717206749092e-08, "loss": 0.0147, "step": 10591 }, { "epoch": 2.8404397961920087, "grad_norm": 0.24417525891297182, "learning_rate": 8.59289262821189e-08, "loss": 0.014, "step": 10592 }, { "epoch": 2.84070796460177, "grad_norm": 0.18874377319686708, "learning_rate": 8.564115896680103e-08, "loss": 0.0088, "step": 10593 }, { "epoch": 2.840976133011531, "grad_norm": 0.2573554419635773, "learning_rate": 8.535387014955544e-08, "loss": 0.011, "step": 10594 }, { "epoch": 2.8412443014212925, "grad_norm": 0.23311093464832702, "learning_rate": 8.506705985835584e-08, "loss": 0.0133, "step": 10595 }, { "epoch": 2.841512469831054, "grad_norm": 0.2679061021423933, "learning_rate": 8.47807281211277e-08, "loss": 0.0163, "step": 10596 }, { "epoch": 2.8417806382408153, "grad_norm": 0.19947671978766254, "learning_rate": 8.449487496575037e-08, "loss": 0.0149, "step": 10597 }, { "epoch": 2.8420488066505767, "grad_norm": 0.2926554532419335, "learning_rate": 8.420950042005716e-08, "loss": 0.0197, "step": 10598 }, { "epoch": 2.8423169750603376, "grad_norm": 0.2313980388441465, "learning_rate": 8.392460451183304e-08, "loss": 0.0101, "step": 10599 }, { "epoch": 2.8425851434700995, "grad_norm": 0.22247884604244084, "learning_rate": 8.364018726881861e-08, "loss": 0.0146, "step": 10600 }, { "epoch": 2.8428533118798605, "grad_norm": 0.22514501709865536, "learning_rate": 8.335624871870618e-08, "loss": 0.0126, "step": 10601 }, { "epoch": 2.843121480289622, "grad_norm": 0.2792580500414105, "learning_rate": 8.307278888914249e-08, "loss": 0.0141, "step": 10602 }, { "epoch": 2.8433896486993833, "grad_norm": 0.2445241118676122, "learning_rate": 8.278980780772717e-08, "loss": 0.0091, "step": 10603 }, { "epoch": 2.8436578171091447, "grad_norm": 0.3082495604497813, "learning_rate": 8.250730550201314e-08, "loss": 0.0242, "step": 10604 }, { "epoch": 2.843925985518906, "grad_norm": 0.1942839327026283, "learning_rate": 8.22252819995073e-08, "loss": 0.0081, "step": 10605 }, { "epoch": 2.844194153928667, "grad_norm": 0.3192349591541725, "learning_rate": 8.194373732766936e-08, "loss": 0.0146, "step": 10606 }, { "epoch": 2.8444623223384284, "grad_norm": 0.2933682104098624, "learning_rate": 8.166267151391238e-08, "loss": 0.0191, "step": 10607 }, { "epoch": 2.84473049074819, "grad_norm": 0.2531452625630813, "learning_rate": 8.138208458560337e-08, "loss": 0.0151, "step": 10608 }, { "epoch": 2.8449986591579512, "grad_norm": 0.3110276138444187, "learning_rate": 8.110197657006158e-08, "loss": 0.0179, "step": 10609 }, { "epoch": 2.8452668275677127, "grad_norm": 0.19220130459381576, "learning_rate": 8.082234749456186e-08, "loss": 0.0096, "step": 10610 }, { "epoch": 2.8455349959774736, "grad_norm": 0.2628642329696975, "learning_rate": 8.05431973863291e-08, "loss": 0.011, "step": 10611 }, { "epoch": 2.845803164387235, "grad_norm": 0.22975694864832294, "learning_rate": 8.026452627254432e-08, "loss": 0.0111, "step": 10612 }, { "epoch": 2.8460713327969964, "grad_norm": 0.30306437937548053, "learning_rate": 7.998633418034196e-08, "loss": 0.0197, "step": 10613 }, { "epoch": 2.846339501206758, "grad_norm": 0.244860342718741, "learning_rate": 7.9708621136807e-08, "loss": 0.0126, "step": 10614 }, { "epoch": 2.8466076696165192, "grad_norm": 0.19685454132794, "learning_rate": 7.943138716898058e-08, "loss": 0.0107, "step": 10615 }, { "epoch": 2.8468758380262806, "grad_norm": 0.3121269895594631, "learning_rate": 7.915463230385611e-08, "loss": 0.02, "step": 10616 }, { "epoch": 2.847144006436042, "grad_norm": 0.2642243776815256, "learning_rate": 7.887835656838038e-08, "loss": 0.0127, "step": 10617 }, { "epoch": 2.847412174845803, "grad_norm": 0.25861735624457133, "learning_rate": 7.86025599894541e-08, "loss": 0.0129, "step": 10618 }, { "epoch": 2.8476803432555644, "grad_norm": 0.25801462181122226, "learning_rate": 7.832724259393021e-08, "loss": 0.0129, "step": 10619 }, { "epoch": 2.847948511665326, "grad_norm": 0.23044693288051027, "learning_rate": 7.805240440861673e-08, "loss": 0.0111, "step": 10620 }, { "epoch": 2.848216680075087, "grad_norm": 0.26756974928200905, "learning_rate": 7.777804546027224e-08, "loss": 0.0133, "step": 10621 }, { "epoch": 2.8484848484848486, "grad_norm": 0.27012373730553907, "learning_rate": 7.750416577561092e-08, "loss": 0.0174, "step": 10622 }, { "epoch": 2.8487530168946096, "grad_norm": 0.268666903106677, "learning_rate": 7.723076538130093e-08, "loss": 0.0121, "step": 10623 }, { "epoch": 2.849021185304371, "grad_norm": 0.25512591981013816, "learning_rate": 7.695784430396092e-08, "loss": 0.0115, "step": 10624 }, { "epoch": 2.8492893537141324, "grad_norm": 0.19976715728678632, "learning_rate": 7.668540257016577e-08, "loss": 0.0082, "step": 10625 }, { "epoch": 2.849557522123894, "grad_norm": 0.3158987901622547, "learning_rate": 7.641344020644148e-08, "loss": 0.0186, "step": 10626 }, { "epoch": 2.849825690533655, "grad_norm": 0.24445990152292052, "learning_rate": 7.614195723926854e-08, "loss": 0.0125, "step": 10627 }, { "epoch": 2.8500938589434166, "grad_norm": 0.23900855271479524, "learning_rate": 7.58709536950808e-08, "loss": 0.0089, "step": 10628 }, { "epoch": 2.850362027353178, "grad_norm": 0.2994783391640885, "learning_rate": 7.560042960026493e-08, "loss": 0.0228, "step": 10629 }, { "epoch": 2.850630195762939, "grad_norm": 0.2339018765302426, "learning_rate": 7.533038498116152e-08, "loss": 0.0156, "step": 10630 }, { "epoch": 2.8508983641727004, "grad_norm": 0.18157292859375423, "learning_rate": 7.506081986406344e-08, "loss": 0.0089, "step": 10631 }, { "epoch": 2.8511665325824618, "grad_norm": 0.23329198027324297, "learning_rate": 7.479173427521747e-08, "loss": 0.013, "step": 10632 }, { "epoch": 2.851434700992223, "grad_norm": 0.1953503453061723, "learning_rate": 7.452312824082486e-08, "loss": 0.0082, "step": 10633 }, { "epoch": 2.8517028694019846, "grad_norm": 0.17172112633992723, "learning_rate": 7.425500178703804e-08, "loss": 0.0074, "step": 10634 }, { "epoch": 2.8519710378117455, "grad_norm": 0.22745110833554352, "learning_rate": 7.398735493996445e-08, "loss": 0.0093, "step": 10635 }, { "epoch": 2.852239206221507, "grad_norm": 0.22617404795611154, "learning_rate": 7.372018772566325e-08, "loss": 0.0087, "step": 10636 }, { "epoch": 2.8525073746312684, "grad_norm": 0.21104348577197363, "learning_rate": 7.345350017014863e-08, "loss": 0.0113, "step": 10637 }, { "epoch": 2.8527755430410298, "grad_norm": 0.20690144836158955, "learning_rate": 7.318729229938759e-08, "loss": 0.0094, "step": 10638 }, { "epoch": 2.853043711450791, "grad_norm": 0.31499414877338716, "learning_rate": 7.292156413929829e-08, "loss": 0.012, "step": 10639 }, { "epoch": 2.8533118798605526, "grad_norm": 0.25110972579920054, "learning_rate": 7.265631571575616e-08, "loss": 0.0098, "step": 10640 }, { "epoch": 2.853580048270314, "grad_norm": 0.29669863099853355, "learning_rate": 7.239154705458606e-08, "loss": 0.0148, "step": 10641 }, { "epoch": 2.853848216680075, "grad_norm": 0.20904327978688195, "learning_rate": 7.212725818156852e-08, "loss": 0.0128, "step": 10642 }, { "epoch": 2.8541163850898363, "grad_norm": 0.24825748832240835, "learning_rate": 7.186344912243681e-08, "loss": 0.0131, "step": 10643 }, { "epoch": 2.8543845534995977, "grad_norm": 0.30219151793847576, "learning_rate": 7.160011990287708e-08, "loss": 0.0154, "step": 10644 }, { "epoch": 2.854652721909359, "grad_norm": 0.29647750132174816, "learning_rate": 7.133727054852935e-08, "loss": 0.0224, "step": 10645 }, { "epoch": 2.8549208903191206, "grad_norm": 0.29879632130258627, "learning_rate": 7.107490108498539e-08, "loss": 0.014, "step": 10646 }, { "epoch": 2.8551890587288815, "grad_norm": 0.34309410047854594, "learning_rate": 7.081301153779308e-08, "loss": 0.0119, "step": 10647 }, { "epoch": 2.855457227138643, "grad_norm": 0.23692651617330582, "learning_rate": 7.055160193245036e-08, "loss": 0.0113, "step": 10648 }, { "epoch": 2.8557253955484043, "grad_norm": 0.25867194544708405, "learning_rate": 7.029067229441133e-08, "loss": 0.0095, "step": 10649 }, { "epoch": 2.8559935639581657, "grad_norm": 0.23751708220075085, "learning_rate": 7.003022264908177e-08, "loss": 0.0143, "step": 10650 }, { "epoch": 2.856261732367927, "grad_norm": 0.24329228810271727, "learning_rate": 6.977025302181973e-08, "loss": 0.0118, "step": 10651 }, { "epoch": 2.8565299007776885, "grad_norm": 0.25372366632987153, "learning_rate": 6.951076343793939e-08, "loss": 0.0129, "step": 10652 }, { "epoch": 2.85679806918745, "grad_norm": 0.2449690571642807, "learning_rate": 6.9251753922705e-08, "loss": 0.0135, "step": 10653 }, { "epoch": 2.857066237597211, "grad_norm": 0.2269502125287009, "learning_rate": 6.899322450133694e-08, "loss": 0.0161, "step": 10654 }, { "epoch": 2.8573344060069723, "grad_norm": 0.23648397133003965, "learning_rate": 6.873517519900729e-08, "loss": 0.0114, "step": 10655 }, { "epoch": 2.8576025744167337, "grad_norm": 0.18302301346135239, "learning_rate": 6.847760604084098e-08, "loss": 0.0102, "step": 10656 }, { "epoch": 2.857870742826495, "grad_norm": 0.21852575731600501, "learning_rate": 6.822051705191734e-08, "loss": 0.01, "step": 10657 }, { "epoch": 2.8581389112362565, "grad_norm": 0.27364786077113185, "learning_rate": 6.796390825726807e-08, "loss": 0.0136, "step": 10658 }, { "epoch": 2.8584070796460175, "grad_norm": 0.2099567349482354, "learning_rate": 6.77077796818787e-08, "loss": 0.0102, "step": 10659 }, { "epoch": 2.858675248055779, "grad_norm": 0.18837511541531632, "learning_rate": 6.745213135068818e-08, "loss": 0.0091, "step": 10660 }, { "epoch": 2.8589434164655403, "grad_norm": 0.23162943414378795, "learning_rate": 6.719696328858827e-08, "loss": 0.0097, "step": 10661 }, { "epoch": 2.8592115848753017, "grad_norm": 0.2685984372048917, "learning_rate": 6.694227552042354e-08, "loss": 0.0147, "step": 10662 }, { "epoch": 2.859479753285063, "grad_norm": 0.23250500136628197, "learning_rate": 6.668806807099193e-08, "loss": 0.013, "step": 10663 }, { "epoch": 2.8597479216948245, "grad_norm": 0.23767922398000396, "learning_rate": 6.643434096504587e-08, "loss": 0.0138, "step": 10664 }, { "epoch": 2.860016090104586, "grad_norm": 0.27494063792007367, "learning_rate": 6.618109422728947e-08, "loss": 0.0113, "step": 10665 }, { "epoch": 2.860284258514347, "grad_norm": 0.22921451603165832, "learning_rate": 6.59283278823808e-08, "loss": 0.014, "step": 10666 }, { "epoch": 2.8605524269241083, "grad_norm": 0.290705815372379, "learning_rate": 6.567604195493182e-08, "loss": 0.012, "step": 10667 }, { "epoch": 2.8608205953338697, "grad_norm": 0.26276898336995486, "learning_rate": 6.542423646950569e-08, "loss": 0.0181, "step": 10668 }, { "epoch": 2.861088763743631, "grad_norm": 0.23743219836329005, "learning_rate": 6.517291145062054e-08, "loss": 0.0092, "step": 10669 }, { "epoch": 2.8613569321533925, "grad_norm": 0.24654596737261228, "learning_rate": 6.492206692274739e-08, "loss": 0.0135, "step": 10670 }, { "epoch": 2.8616251005631534, "grad_norm": 0.3248934213139609, "learning_rate": 6.467170291030999e-08, "loss": 0.0222, "step": 10671 }, { "epoch": 2.861893268972915, "grad_norm": 0.22896643140062836, "learning_rate": 6.44218194376861e-08, "loss": 0.0113, "step": 10672 }, { "epoch": 2.8621614373826763, "grad_norm": 0.256494872984811, "learning_rate": 6.417241652920625e-08, "loss": 0.016, "step": 10673 }, { "epoch": 2.8624296057924377, "grad_norm": 0.2714888626904641, "learning_rate": 6.392349420915323e-08, "loss": 0.0136, "step": 10674 }, { "epoch": 2.862697774202199, "grad_norm": 0.28316952574319487, "learning_rate": 6.367505250176543e-08, "loss": 0.0117, "step": 10675 }, { "epoch": 2.8629659426119605, "grad_norm": 0.24094608967141434, "learning_rate": 6.342709143123126e-08, "loss": 0.0148, "step": 10676 }, { "epoch": 2.863234111021722, "grad_norm": 0.21256041140210563, "learning_rate": 6.317961102169534e-08, "loss": 0.0128, "step": 10677 }, { "epoch": 2.863502279431483, "grad_norm": 0.2500259222631522, "learning_rate": 6.293261129725281e-08, "loss": 0.0112, "step": 10678 }, { "epoch": 2.8637704478412442, "grad_norm": 0.23845455651228067, "learning_rate": 6.268609228195499e-08, "loss": 0.0117, "step": 10679 }, { "epoch": 2.8640386162510056, "grad_norm": 0.18522798161035695, "learning_rate": 6.24400539998038e-08, "loss": 0.0122, "step": 10680 }, { "epoch": 2.864306784660767, "grad_norm": 0.2958123940959807, "learning_rate": 6.219449647475562e-08, "loss": 0.0179, "step": 10681 }, { "epoch": 2.8645749530705285, "grad_norm": 0.2652717578234367, "learning_rate": 6.194941973072021e-08, "loss": 0.0124, "step": 10682 }, { "epoch": 2.8648431214802894, "grad_norm": 0.2524821898966749, "learning_rate": 6.170482379155907e-08, "loss": 0.018, "step": 10683 }, { "epoch": 2.865111289890051, "grad_norm": 0.22064700791201303, "learning_rate": 6.146070868108811e-08, "loss": 0.0121, "step": 10684 }, { "epoch": 2.865379458299812, "grad_norm": 0.2808869203904827, "learning_rate": 6.121707442307722e-08, "loss": 0.0129, "step": 10685 }, { "epoch": 2.8656476267095736, "grad_norm": 0.27018142674042606, "learning_rate": 6.097392104124633e-08, "loss": 0.0134, "step": 10686 }, { "epoch": 2.865915795119335, "grad_norm": 0.26159361132421055, "learning_rate": 6.073124855927315e-08, "loss": 0.0136, "step": 10687 }, { "epoch": 2.8661839635290964, "grad_norm": 0.22421743481175047, "learning_rate": 6.048905700078378e-08, "loss": 0.0095, "step": 10688 }, { "epoch": 2.866452131938858, "grad_norm": 0.30935380844032195, "learning_rate": 6.024734638936159e-08, "loss": 0.0152, "step": 10689 }, { "epoch": 2.866720300348619, "grad_norm": 0.4071782032792132, "learning_rate": 6.000611674853995e-08, "loss": 0.0167, "step": 10690 }, { "epoch": 2.86698846875838, "grad_norm": 0.20571066499942672, "learning_rate": 5.976536810180677e-08, "loss": 0.0103, "step": 10691 }, { "epoch": 2.8672566371681416, "grad_norm": 0.3646466863011554, "learning_rate": 5.9525100472604955e-08, "loss": 0.0169, "step": 10692 }, { "epoch": 2.867524805577903, "grad_norm": 0.2568548624802712, "learning_rate": 5.928531388432635e-08, "loss": 0.0101, "step": 10693 }, { "epoch": 2.8677929739876644, "grad_norm": 0.19578701561309472, "learning_rate": 5.904600836032004e-08, "loss": 0.0086, "step": 10694 }, { "epoch": 2.8680611423974254, "grad_norm": 0.2379866992449065, "learning_rate": 5.880718392388518e-08, "loss": 0.0113, "step": 10695 }, { "epoch": 2.868329310807187, "grad_norm": 0.22682817567827396, "learning_rate": 5.8568840598277054e-08, "loss": 0.009, "step": 10696 }, { "epoch": 2.868597479216948, "grad_norm": 0.24001908118358514, "learning_rate": 5.833097840670155e-08, "loss": 0.013, "step": 10697 }, { "epoch": 2.8688656476267096, "grad_norm": 0.19874948778386625, "learning_rate": 5.809359737231846e-08, "loss": 0.0091, "step": 10698 }, { "epoch": 2.869133816036471, "grad_norm": 0.23576132917668016, "learning_rate": 5.7856697518241524e-08, "loss": 0.0084, "step": 10699 }, { "epoch": 2.869401984446232, "grad_norm": 0.18163783281297288, "learning_rate": 5.7620278867536185e-08, "loss": 0.0099, "step": 10700 }, { "epoch": 2.869670152855994, "grad_norm": 0.2369892362471996, "learning_rate": 5.738434144322291e-08, "loss": 0.0138, "step": 10701 }, { "epoch": 2.8699383212657548, "grad_norm": 0.23892673665658076, "learning_rate": 5.714888526827444e-08, "loss": 0.0137, "step": 10702 }, { "epoch": 2.870206489675516, "grad_norm": 0.2926573374691794, "learning_rate": 5.691391036561522e-08, "loss": 0.017, "step": 10703 }, { "epoch": 2.8704746580852776, "grad_norm": 0.21972722960786198, "learning_rate": 5.667941675812527e-08, "loss": 0.0132, "step": 10704 }, { "epoch": 2.870742826495039, "grad_norm": 0.24287825711764252, "learning_rate": 5.6445404468636334e-08, "loss": 0.0135, "step": 10705 }, { "epoch": 2.8710109949048004, "grad_norm": 0.28202639186980843, "learning_rate": 5.6211873519933516e-08, "loss": 0.0132, "step": 10706 }, { "epoch": 2.8712791633145613, "grad_norm": 0.19571743792785304, "learning_rate": 5.597882393475473e-08, "loss": 0.0121, "step": 10707 }, { "epoch": 2.8715473317243227, "grad_norm": 0.2081721081017145, "learning_rate": 5.5746255735791846e-08, "loss": 0.0088, "step": 10708 }, { "epoch": 2.871815500134084, "grad_norm": 0.2672145414492506, "learning_rate": 5.5514168945689506e-08, "loss": 0.0112, "step": 10709 }, { "epoch": 2.8720836685438456, "grad_norm": 0.23324105151108748, "learning_rate": 5.52825635870452e-08, "loss": 0.0091, "step": 10710 }, { "epoch": 2.872351836953607, "grad_norm": 0.22366036685290314, "learning_rate": 5.505143968240923e-08, "loss": 0.0172, "step": 10711 }, { "epoch": 2.872620005363368, "grad_norm": 0.2159042028953645, "learning_rate": 5.4820797254286904e-08, "loss": 0.0088, "step": 10712 }, { "epoch": 2.8728881737731298, "grad_norm": 0.23077847823432823, "learning_rate": 5.4590636325133615e-08, "loss": 0.0134, "step": 10713 }, { "epoch": 2.8731563421828907, "grad_norm": 0.2030900613957855, "learning_rate": 5.436095691736032e-08, "loss": 0.0109, "step": 10714 }, { "epoch": 2.873424510592652, "grad_norm": 0.22615386980389088, "learning_rate": 5.413175905333024e-08, "loss": 0.0132, "step": 10715 }, { "epoch": 2.8736926790024135, "grad_norm": 0.26292300237158145, "learning_rate": 5.390304275535996e-08, "loss": 0.0177, "step": 10716 }, { "epoch": 2.873960847412175, "grad_norm": 0.3110479559754243, "learning_rate": 5.367480804571834e-08, "loss": 0.0212, "step": 10717 }, { "epoch": 2.8742290158219364, "grad_norm": 0.2147478759117049, "learning_rate": 5.3447054946628715e-08, "loss": 0.0092, "step": 10718 }, { "epoch": 2.8744971842316973, "grad_norm": 0.2323908150578418, "learning_rate": 5.3219783480266685e-08, "loss": 0.0124, "step": 10719 }, { "epoch": 2.8747653526414587, "grad_norm": 0.2282675714810508, "learning_rate": 5.2992993668760654e-08, "loss": 0.0118, "step": 10720 }, { "epoch": 2.87503352105122, "grad_norm": 0.18849140745619442, "learning_rate": 5.276668553419295e-08, "loss": 0.0107, "step": 10721 }, { "epoch": 2.8753016894609815, "grad_norm": 0.3035223877169693, "learning_rate": 5.254085909859874e-08, "loss": 0.0141, "step": 10722 }, { "epoch": 2.875569857870743, "grad_norm": 0.25746366177247537, "learning_rate": 5.231551438396543e-08, "loss": 0.0163, "step": 10723 }, { "epoch": 2.875838026280504, "grad_norm": 0.22775690719760497, "learning_rate": 5.2090651412234906e-08, "loss": 0.0121, "step": 10724 }, { "epoch": 2.8761061946902657, "grad_norm": 0.2014942840297933, "learning_rate": 5.186627020530077e-08, "loss": 0.0103, "step": 10725 }, { "epoch": 2.8763743631000267, "grad_norm": 0.29869823714863425, "learning_rate": 5.164237078501111e-08, "loss": 0.0217, "step": 10726 }, { "epoch": 2.876642531509788, "grad_norm": 0.2490907457747659, "learning_rate": 5.1418953173165695e-08, "loss": 0.0164, "step": 10727 }, { "epoch": 2.8769106999195495, "grad_norm": 0.19742446996431975, "learning_rate": 5.1196017391518804e-08, "loss": 0.0078, "step": 10728 }, { "epoch": 2.877178868329311, "grad_norm": 0.19184811122076945, "learning_rate": 5.097356346177751e-08, "loss": 0.0103, "step": 10729 }, { "epoch": 2.8774470367390723, "grad_norm": 0.20822349845271776, "learning_rate": 5.075159140560004e-08, "loss": 0.0092, "step": 10730 }, { "epoch": 2.8777152051488333, "grad_norm": 0.23490334002865088, "learning_rate": 5.053010124460078e-08, "loss": 0.0162, "step": 10731 }, { "epoch": 2.8779833735585947, "grad_norm": 0.32736689171025674, "learning_rate": 5.03090930003447e-08, "loss": 0.0134, "step": 10732 }, { "epoch": 2.878251541968356, "grad_norm": 0.2676807397876617, "learning_rate": 5.008856669435125e-08, "loss": 0.0152, "step": 10733 }, { "epoch": 2.8785197103781175, "grad_norm": 0.3198441360541822, "learning_rate": 4.986852234809214e-08, "loss": 0.0155, "step": 10734 }, { "epoch": 2.878787878787879, "grad_norm": 0.2427503049227703, "learning_rate": 4.964895998299246e-08, "loss": 0.0104, "step": 10735 }, { "epoch": 2.87905604719764, "grad_norm": 0.25758445885932013, "learning_rate": 4.9429879620431223e-08, "loss": 0.0132, "step": 10736 }, { "epoch": 2.8793242156074017, "grad_norm": 0.38261435675620703, "learning_rate": 4.9211281281738574e-08, "loss": 0.0093, "step": 10737 }, { "epoch": 2.8795923840171627, "grad_norm": 0.21337977691120827, "learning_rate": 4.8993164988199174e-08, "loss": 0.0122, "step": 10738 }, { "epoch": 2.879860552426924, "grad_norm": 0.27265397310305844, "learning_rate": 4.877553076105157e-08, "loss": 0.0151, "step": 10739 }, { "epoch": 2.8801287208366855, "grad_norm": 0.24466895999626964, "learning_rate": 4.8558378621484384e-08, "loss": 0.0136, "step": 10740 }, { "epoch": 2.880396889246447, "grad_norm": 0.1634364335481162, "learning_rate": 4.834170859064236e-08, "loss": 0.0064, "step": 10741 }, { "epoch": 2.8806650576562083, "grad_norm": 0.22797094254589756, "learning_rate": 4.812552068962195e-08, "loss": 0.0109, "step": 10742 }, { "epoch": 2.8809332260659692, "grad_norm": 0.20989060110905947, "learning_rate": 4.790981493947244e-08, "loss": 0.0132, "step": 10743 }, { "epoch": 2.8812013944757306, "grad_norm": 0.232073402958727, "learning_rate": 4.769459136119647e-08, "loss": 0.0113, "step": 10744 }, { "epoch": 2.881469562885492, "grad_norm": 0.39347502436718024, "learning_rate": 4.747984997575006e-08, "loss": 0.0216, "step": 10745 }, { "epoch": 2.8817377312952535, "grad_norm": 0.2418846290147638, "learning_rate": 4.726559080404258e-08, "loss": 0.0132, "step": 10746 }, { "epoch": 2.882005899705015, "grad_norm": 0.5233123498295074, "learning_rate": 4.705181386693458e-08, "loss": 0.0157, "step": 10747 }, { "epoch": 2.882274068114776, "grad_norm": 0.2476118016700826, "learning_rate": 4.6838519185242184e-08, "loss": 0.0159, "step": 10748 }, { "epoch": 2.8825422365245377, "grad_norm": 0.23716619977480552, "learning_rate": 4.662570677973266e-08, "loss": 0.0164, "step": 10749 }, { "epoch": 2.8828104049342986, "grad_norm": 0.3396500767859851, "learning_rate": 4.6413376671126665e-08, "loss": 0.0134, "step": 10750 }, { "epoch": 2.88307857334406, "grad_norm": 0.18867047756361685, "learning_rate": 4.620152888009932e-08, "loss": 0.01, "step": 10751 }, { "epoch": 2.8833467417538214, "grad_norm": 0.28418149568434675, "learning_rate": 4.599016342727636e-08, "loss": 0.02, "step": 10752 }, { "epoch": 2.883614910163583, "grad_norm": 0.19196365125978784, "learning_rate": 4.5779280333239086e-08, "loss": 0.0105, "step": 10753 }, { "epoch": 2.8838830785733442, "grad_norm": 0.33045287147534574, "learning_rate": 4.556887961851941e-08, "loss": 0.028, "step": 10754 }, { "epoch": 2.884151246983105, "grad_norm": 0.21781141614604496, "learning_rate": 4.5358961303604845e-08, "loss": 0.0101, "step": 10755 }, { "epoch": 2.8844194153928666, "grad_norm": 0.25561495282876745, "learning_rate": 4.514952540893347e-08, "loss": 0.012, "step": 10756 }, { "epoch": 2.884687583802628, "grad_norm": 0.24450844934152924, "learning_rate": 4.494057195489843e-08, "loss": 0.0142, "step": 10757 }, { "epoch": 2.8849557522123894, "grad_norm": 0.22941195540687426, "learning_rate": 4.473210096184455e-08, "loss": 0.0113, "step": 10758 }, { "epoch": 2.885223920622151, "grad_norm": 0.22881737929831394, "learning_rate": 4.452411245006949e-08, "loss": 0.0149, "step": 10759 }, { "epoch": 2.885492089031912, "grad_norm": 0.30357110617921057, "learning_rate": 4.431660643982483e-08, "loss": 0.013, "step": 10760 }, { "epoch": 2.8857602574416736, "grad_norm": 0.20301544718138229, "learning_rate": 4.410958295131551e-08, "loss": 0.0089, "step": 10761 }, { "epoch": 2.8860284258514346, "grad_norm": 0.34250606222542357, "learning_rate": 4.390304200469875e-08, "loss": 0.0168, "step": 10762 }, { "epoch": 2.886296594261196, "grad_norm": 0.2811058101704099, "learning_rate": 4.3696983620084566e-08, "loss": 0.0134, "step": 10763 }, { "epoch": 2.8865647626709574, "grad_norm": 0.23686941892929175, "learning_rate": 4.34914078175358e-08, "loss": 0.0113, "step": 10764 }, { "epoch": 2.886832931080719, "grad_norm": 0.2530079734053365, "learning_rate": 4.328631461706978e-08, "loss": 0.0144, "step": 10765 }, { "epoch": 2.88710109949048, "grad_norm": 0.284550594947064, "learning_rate": 4.308170403865497e-08, "loss": 0.0152, "step": 10766 }, { "epoch": 2.887369267900241, "grad_norm": 0.2586890597539836, "learning_rate": 4.287757610221488e-08, "loss": 0.0185, "step": 10767 }, { "epoch": 2.8876374363100026, "grad_norm": 0.24639457734596898, "learning_rate": 4.2673930827624166e-08, "loss": 0.0085, "step": 10768 }, { "epoch": 2.887905604719764, "grad_norm": 0.2833016823495599, "learning_rate": 4.247076823471141e-08, "loss": 0.014, "step": 10769 }, { "epoch": 2.8881737731295254, "grad_norm": 0.21993450036100445, "learning_rate": 4.226808834325802e-08, "loss": 0.0133, "step": 10770 }, { "epoch": 2.888441941539287, "grad_norm": 0.18793939372181975, "learning_rate": 4.20658911729982e-08, "loss": 0.0079, "step": 10771 }, { "epoch": 2.8887101099490478, "grad_norm": 0.24432389409290467, "learning_rate": 4.1864176743619535e-08, "loss": 0.0116, "step": 10772 }, { "epoch": 2.8889782783588096, "grad_norm": 0.23823909659121906, "learning_rate": 4.166294507476243e-08, "loss": 0.0111, "step": 10773 }, { "epoch": 2.8892464467685706, "grad_norm": 0.26261729110912074, "learning_rate": 4.1462196186019546e-08, "loss": 0.0133, "step": 10774 }, { "epoch": 2.889514615178332, "grad_norm": 0.29790486687422574, "learning_rate": 4.126193009693913e-08, "loss": 0.0191, "step": 10775 }, { "epoch": 2.8897827835880934, "grad_norm": 0.3491887357239462, "learning_rate": 4.106214682701837e-08, "loss": 0.0116, "step": 10776 }, { "epoch": 2.8900509519978548, "grad_norm": 0.24965399030364568, "learning_rate": 4.0862846395711145e-08, "loss": 0.0155, "step": 10777 }, { "epoch": 2.890319120407616, "grad_norm": 0.20786935512612767, "learning_rate": 4.0664028822422486e-08, "loss": 0.0102, "step": 10778 }, { "epoch": 2.890587288817377, "grad_norm": 0.24718692303597062, "learning_rate": 4.046569412651025e-08, "loss": 0.0177, "step": 10779 }, { "epoch": 2.8908554572271385, "grad_norm": 0.34224180609098015, "learning_rate": 4.02678423272862e-08, "loss": 0.0152, "step": 10780 }, { "epoch": 2.8911236256369, "grad_norm": 0.16358025461594786, "learning_rate": 4.007047344401438e-08, "loss": 0.0078, "step": 10781 }, { "epoch": 2.8913917940466614, "grad_norm": 0.2062651264482024, "learning_rate": 3.987358749591219e-08, "loss": 0.009, "step": 10782 }, { "epoch": 2.8916599624564228, "grad_norm": 0.2775558503949398, "learning_rate": 3.9677184502149855e-08, "loss": 0.0225, "step": 10783 }, { "epoch": 2.8919281308661837, "grad_norm": 0.284556850669693, "learning_rate": 3.948126448185097e-08, "loss": 0.0119, "step": 10784 }, { "epoch": 2.8921962992759456, "grad_norm": 0.9538459684149021, "learning_rate": 3.928582745409137e-08, "loss": 0.0128, "step": 10785 }, { "epoch": 2.8924644676857065, "grad_norm": 0.25046198133831027, "learning_rate": 3.90908734379003e-08, "loss": 0.0122, "step": 10786 }, { "epoch": 2.892732636095468, "grad_norm": 0.2113598638935938, "learning_rate": 3.889640245225923e-08, "loss": 0.0115, "step": 10787 }, { "epoch": 2.8930008045052293, "grad_norm": 0.2302661518063801, "learning_rate": 3.870241451610468e-08, "loss": 0.0129, "step": 10788 }, { "epoch": 2.8932689729149907, "grad_norm": 0.2448927180305606, "learning_rate": 3.850890964832377e-08, "loss": 0.0109, "step": 10789 }, { "epoch": 2.893537141324752, "grad_norm": 0.3047814027105091, "learning_rate": 3.8315887867758637e-08, "loss": 0.0187, "step": 10790 }, { "epoch": 2.893805309734513, "grad_norm": 0.21818109995242277, "learning_rate": 3.8123349193201484e-08, "loss": 0.0119, "step": 10791 }, { "epoch": 2.8940734781442745, "grad_norm": 0.19917567444670137, "learning_rate": 3.793129364340064e-08, "loss": 0.0066, "step": 10792 }, { "epoch": 2.894341646554036, "grad_norm": 0.17103409023366323, "learning_rate": 3.77397212370556e-08, "loss": 0.0084, "step": 10793 }, { "epoch": 2.8946098149637973, "grad_norm": 0.2780854285939121, "learning_rate": 3.754863199281977e-08, "loss": 0.0133, "step": 10794 }, { "epoch": 2.8948779833735587, "grad_norm": 0.2683967631065976, "learning_rate": 3.735802592929827e-08, "loss": 0.0183, "step": 10795 }, { "epoch": 2.8951461517833197, "grad_norm": 0.2780625250176772, "learning_rate": 3.7167903065050136e-08, "loss": 0.0106, "step": 10796 }, { "epoch": 2.895414320193081, "grad_norm": 0.3111365881263348, "learning_rate": 3.6978263418587235e-08, "loss": 0.0194, "step": 10797 }, { "epoch": 2.8956824886028425, "grad_norm": 0.33059402853027936, "learning_rate": 3.6789107008374235e-08, "loss": 0.0138, "step": 10798 }, { "epoch": 2.895950657012604, "grad_norm": 0.22064836189162967, "learning_rate": 3.6600433852829186e-08, "loss": 0.0132, "step": 10799 }, { "epoch": 2.8962188254223653, "grad_norm": 0.24614463949527357, "learning_rate": 3.641224397032184e-08, "loss": 0.0131, "step": 10800 }, { "epoch": 2.8964869938321267, "grad_norm": 0.227065095793287, "learning_rate": 3.622453737917642e-08, "loss": 0.0101, "step": 10801 }, { "epoch": 2.896755162241888, "grad_norm": 0.32632311494254657, "learning_rate": 3.603731409766942e-08, "loss": 0.0256, "step": 10802 }, { "epoch": 2.897023330651649, "grad_norm": 0.21838724873055596, "learning_rate": 3.585057414402959e-08, "loss": 0.0113, "step": 10803 }, { "epoch": 2.8972914990614105, "grad_norm": 0.27074453620563943, "learning_rate": 3.566431753644017e-08, "loss": 0.0134, "step": 10804 }, { "epoch": 2.897559667471172, "grad_norm": 0.30515959920856817, "learning_rate": 3.54785442930361e-08, "loss": 0.0163, "step": 10805 }, { "epoch": 2.8978278358809333, "grad_norm": 0.26337186650335676, "learning_rate": 3.529325443190568e-08, "loss": 0.0142, "step": 10806 }, { "epoch": 2.8980960042906947, "grad_norm": 0.19879095868834398, "learning_rate": 3.5108447971090034e-08, "loss": 0.0101, "step": 10807 }, { "epoch": 2.8983641727004557, "grad_norm": 0.17335105053057231, "learning_rate": 3.4924124928583104e-08, "loss": 0.0071, "step": 10808 }, { "epoch": 2.898632341110217, "grad_norm": 0.23356749881765212, "learning_rate": 3.47402853223322e-08, "loss": 0.0128, "step": 10809 }, { "epoch": 2.8989005095199785, "grad_norm": 0.3764161792507979, "learning_rate": 3.45569291702369e-08, "loss": 0.018, "step": 10810 }, { "epoch": 2.89916867792974, "grad_norm": 0.3370187586851141, "learning_rate": 3.437405649015069e-08, "loss": 0.0169, "step": 10811 }, { "epoch": 2.8994368463395013, "grad_norm": 0.3002358114283504, "learning_rate": 3.4191667299879325e-08, "loss": 0.0199, "step": 10812 }, { "epoch": 2.8997050147492627, "grad_norm": 0.29005365081811896, "learning_rate": 3.4009761617180834e-08, "loss": 0.0163, "step": 10813 }, { "epoch": 2.899973183159024, "grad_norm": 0.2410781304595417, "learning_rate": 3.382833945976771e-08, "loss": 0.0142, "step": 10814 }, { "epoch": 2.900241351568785, "grad_norm": 0.2224664254228026, "learning_rate": 3.364740084530416e-08, "loss": 0.0115, "step": 10815 }, { "epoch": 2.9005095199785464, "grad_norm": 0.28782405765399677, "learning_rate": 3.346694579140775e-08, "loss": 0.0141, "step": 10816 }, { "epoch": 2.900777688388308, "grad_norm": 0.22109900581821154, "learning_rate": 3.3286974315649426e-08, "loss": 0.0139, "step": 10817 }, { "epoch": 2.9010458567980693, "grad_norm": 0.27806366631288737, "learning_rate": 3.310748643555184e-08, "loss": 0.0177, "step": 10818 }, { "epoch": 2.9013140252078307, "grad_norm": 0.2773686658991523, "learning_rate": 3.292848216859101e-08, "loss": 0.0119, "step": 10819 }, { "epoch": 2.9015821936175916, "grad_norm": 0.1564384369181996, "learning_rate": 3.274996153219689e-08, "loss": 0.0069, "step": 10820 }, { "epoch": 2.901850362027353, "grad_norm": 0.272630106108349, "learning_rate": 3.257192454375113e-08, "loss": 0.0164, "step": 10821 }, { "epoch": 2.9021185304371144, "grad_norm": 0.22926231072659004, "learning_rate": 3.239437122058875e-08, "loss": 0.0149, "step": 10822 }, { "epoch": 2.902386698846876, "grad_norm": 0.17336560515504248, "learning_rate": 3.221730157999758e-08, "loss": 0.0087, "step": 10823 }, { "epoch": 2.9026548672566372, "grad_norm": 0.3225435136491405, "learning_rate": 3.204071563921885e-08, "loss": 0.0205, "step": 10824 }, { "epoch": 2.9029230356663986, "grad_norm": 0.2517478398980715, "learning_rate": 3.1864613415446e-08, "loss": 0.0127, "step": 10825 }, { "epoch": 2.90319120407616, "grad_norm": 0.2671185153747408, "learning_rate": 3.168899492582533e-08, "loss": 0.015, "step": 10826 }, { "epoch": 2.903459372485921, "grad_norm": 0.21847278256285665, "learning_rate": 3.1513860187457055e-08, "loss": 0.0119, "step": 10827 }, { "epoch": 2.9037275408956824, "grad_norm": 0.35113862020970865, "learning_rate": 3.133920921739253e-08, "loss": 0.0203, "step": 10828 }, { "epoch": 2.903995709305444, "grad_norm": 0.3425059752471659, "learning_rate": 3.1165042032638146e-08, "loss": 0.0202, "step": 10829 }, { "epoch": 2.9042638777152052, "grad_norm": 0.24174884084499454, "learning_rate": 3.099135865015146e-08, "loss": 0.0112, "step": 10830 }, { "epoch": 2.9045320461249666, "grad_norm": 0.23329382308367128, "learning_rate": 3.081815908684338e-08, "loss": 0.0113, "step": 10831 }, { "epoch": 2.9048002145347276, "grad_norm": 0.34952511048494866, "learning_rate": 3.0645443359578754e-08, "loss": 0.0192, "step": 10832 }, { "epoch": 2.905068382944489, "grad_norm": 0.24255461837975373, "learning_rate": 3.047321148517357e-08, "loss": 0.0097, "step": 10833 }, { "epoch": 2.9053365513542504, "grad_norm": 0.31755283098000875, "learning_rate": 3.030146348039831e-08, "loss": 0.027, "step": 10834 }, { "epoch": 2.905604719764012, "grad_norm": 0.2387300282402434, "learning_rate": 3.013019936197514e-08, "loss": 0.0131, "step": 10835 }, { "epoch": 2.905872888173773, "grad_norm": 0.29123761729236414, "learning_rate": 2.995941914657907e-08, "loss": 0.0204, "step": 10836 }, { "epoch": 2.9061410565835346, "grad_norm": 0.20668891821579455, "learning_rate": 2.9789122850840124e-08, "loss": 0.0088, "step": 10837 }, { "epoch": 2.906409224993296, "grad_norm": 0.19315548946213973, "learning_rate": 2.9619310491337814e-08, "loss": 0.0093, "step": 10838 }, { "epoch": 2.906677393403057, "grad_norm": 0.2291021266978299, "learning_rate": 2.9449982084607808e-08, "loss": 0.0119, "step": 10839 }, { "epoch": 2.9069455618128184, "grad_norm": 0.2940044090814101, "learning_rate": 2.9281137647135805e-08, "loss": 0.0158, "step": 10840 }, { "epoch": 2.90721373022258, "grad_norm": 0.2647232116255512, "learning_rate": 2.9112777195363096e-08, "loss": 0.0141, "step": 10841 }, { "epoch": 2.907481898632341, "grad_norm": 0.30093540365204746, "learning_rate": 2.894490074568157e-08, "loss": 0.0176, "step": 10842 }, { "epoch": 2.9077500670421026, "grad_norm": 0.5440335392178978, "learning_rate": 2.8777508314437043e-08, "loss": 0.0142, "step": 10843 }, { "epoch": 2.9080182354518636, "grad_norm": 0.27057289290393954, "learning_rate": 2.8610599917928694e-08, "loss": 0.0126, "step": 10844 }, { "epoch": 2.908286403861625, "grad_norm": 0.23844736325954108, "learning_rate": 2.8444175572406862e-08, "loss": 0.0111, "step": 10845 }, { "epoch": 2.9085545722713864, "grad_norm": 0.21932302395090356, "learning_rate": 2.8278235294076362e-08, "loss": 0.0114, "step": 10846 }, { "epoch": 2.9088227406811478, "grad_norm": 0.29483093832254936, "learning_rate": 2.8112779099094822e-08, "loss": 0.0138, "step": 10847 }, { "epoch": 2.909090909090909, "grad_norm": 0.3706939161842536, "learning_rate": 2.7947807003571582e-08, "loss": 0.0163, "step": 10848 }, { "epoch": 2.9093590775006706, "grad_norm": 0.23162637491168364, "learning_rate": 2.77833190235699e-08, "loss": 0.0148, "step": 10849 }, { "epoch": 2.909627245910432, "grad_norm": 0.21350993233796112, "learning_rate": 2.7619315175105856e-08, "loss": 0.0113, "step": 10850 }, { "epoch": 2.909895414320193, "grad_norm": 0.22518951800138112, "learning_rate": 2.7455795474147228e-08, "loss": 0.0131, "step": 10851 }, { "epoch": 2.9101635827299543, "grad_norm": 0.2963970757400302, "learning_rate": 2.7292759936615732e-08, "loss": 0.0173, "step": 10852 }, { "epoch": 2.9104317511397158, "grad_norm": 0.2506301399678004, "learning_rate": 2.713020857838644e-08, "loss": 0.0121, "step": 10853 }, { "epoch": 2.910699919549477, "grad_norm": 0.22642725002159597, "learning_rate": 2.696814141528614e-08, "loss": 0.0131, "step": 10854 }, { "epoch": 2.9109680879592386, "grad_norm": 0.2610499124594803, "learning_rate": 2.6806558463094433e-08, "loss": 0.0084, "step": 10855 }, { "epoch": 2.9112362563689995, "grad_norm": 0.23949194261336368, "learning_rate": 2.664545973754429e-08, "loss": 0.0121, "step": 10856 }, { "epoch": 2.911504424778761, "grad_norm": 0.21246806624092346, "learning_rate": 2.6484845254322045e-08, "loss": 0.0076, "step": 10857 }, { "epoch": 2.9117725931885223, "grad_norm": 0.20250600957057463, "learning_rate": 2.6324715029066307e-08, "loss": 0.0105, "step": 10858 }, { "epoch": 2.9120407615982837, "grad_norm": 0.23388056370842047, "learning_rate": 2.6165069077367932e-08, "loss": 0.0152, "step": 10859 }, { "epoch": 2.912308930008045, "grad_norm": 0.2653973236994618, "learning_rate": 2.6005907414771158e-08, "loss": 0.0123, "step": 10860 }, { "epoch": 2.9125770984178065, "grad_norm": 0.2628941638195506, "learning_rate": 2.584723005677414e-08, "loss": 0.0145, "step": 10861 }, { "epoch": 2.912845266827568, "grad_norm": 0.2157743822158722, "learning_rate": 2.5689037018825636e-08, "loss": 0.0118, "step": 10862 }, { "epoch": 2.913113435237329, "grad_norm": 0.25503115763105455, "learning_rate": 2.5531328316328875e-08, "loss": 0.0135, "step": 10863 }, { "epoch": 2.9133816036470903, "grad_norm": 0.24056010411234718, "learning_rate": 2.5374103964640464e-08, "loss": 0.0109, "step": 10864 }, { "epoch": 2.9136497720568517, "grad_norm": 0.26611635378582005, "learning_rate": 2.52173639790676e-08, "loss": 0.0162, "step": 10865 }, { "epoch": 2.913917940466613, "grad_norm": 0.6677466696572626, "learning_rate": 2.5061108374872523e-08, "loss": 0.0093, "step": 10866 }, { "epoch": 2.9141861088763745, "grad_norm": 0.24808190944493533, "learning_rate": 2.4905337167269173e-08, "loss": 0.0139, "step": 10867 }, { "epoch": 2.9144542772861355, "grad_norm": 0.27476715797803086, "learning_rate": 2.4750050371424304e-08, "loss": 0.0143, "step": 10868 }, { "epoch": 2.914722445695897, "grad_norm": 0.34318240794436666, "learning_rate": 2.459524800245805e-08, "loss": 0.019, "step": 10869 }, { "epoch": 2.9149906141056583, "grad_norm": 0.2696767831988475, "learning_rate": 2.4440930075443346e-08, "loss": 0.0159, "step": 10870 }, { "epoch": 2.9152587825154197, "grad_norm": 0.24835967669051537, "learning_rate": 2.4287096605404846e-08, "loss": 0.0144, "step": 10871 }, { "epoch": 2.915526950925181, "grad_norm": 0.26940980209753834, "learning_rate": 2.413374760732168e-08, "loss": 0.0121, "step": 10872 }, { "epoch": 2.9157951193349425, "grad_norm": 0.29826746230982826, "learning_rate": 2.398088309612523e-08, "loss": 0.0181, "step": 10873 }, { "epoch": 2.916063287744704, "grad_norm": 0.25385356638676215, "learning_rate": 2.3828503086698595e-08, "loss": 0.0134, "step": 10874 }, { "epoch": 2.916331456154465, "grad_norm": 0.22594750749665615, "learning_rate": 2.367660759387935e-08, "loss": 0.0102, "step": 10875 }, { "epoch": 2.9165996245642263, "grad_norm": 0.20442983103087703, "learning_rate": 2.3525196632457337e-08, "loss": 0.013, "step": 10876 }, { "epoch": 2.9168677929739877, "grad_norm": 0.19229739889544645, "learning_rate": 2.3374270217174087e-08, "loss": 0.0133, "step": 10877 }, { "epoch": 2.917135961383749, "grad_norm": 0.26002567860369435, "learning_rate": 2.3223828362725632e-08, "loss": 0.013, "step": 10878 }, { "epoch": 2.9174041297935105, "grad_norm": 0.24462049493545407, "learning_rate": 2.3073871083760247e-08, "loss": 0.0134, "step": 10879 }, { "epoch": 2.9176722982032715, "grad_norm": 0.2036830402708642, "learning_rate": 2.292439839487792e-08, "loss": 0.0115, "step": 10880 }, { "epoch": 2.917940466613033, "grad_norm": 0.26515560051268705, "learning_rate": 2.2775410310633682e-08, "loss": 0.0142, "step": 10881 }, { "epoch": 2.9182086350227943, "grad_norm": 0.25299142825611, "learning_rate": 2.2626906845533148e-08, "loss": 0.0124, "step": 10882 }, { "epoch": 2.9184768034325557, "grad_norm": 0.17581443722947993, "learning_rate": 2.2478888014035862e-08, "loss": 0.0119, "step": 10883 }, { "epoch": 2.918744971842317, "grad_norm": 0.40795838706052545, "learning_rate": 2.2331353830554183e-08, "loss": 0.0137, "step": 10884 }, { "epoch": 2.919013140252078, "grad_norm": 0.18418322698913572, "learning_rate": 2.2184304309453286e-08, "loss": 0.0088, "step": 10885 }, { "epoch": 2.91928130866184, "grad_norm": 0.27709397402148384, "learning_rate": 2.2037739465051166e-08, "loss": 0.0098, "step": 10886 }, { "epoch": 2.919549477071601, "grad_norm": 0.2802932302015498, "learning_rate": 2.189165931161752e-08, "loss": 0.0147, "step": 10887 }, { "epoch": 2.9198176454813622, "grad_norm": 0.179176681958024, "learning_rate": 2.1746063863377075e-08, "loss": 0.0086, "step": 10888 }, { "epoch": 2.9200858138911236, "grad_norm": 0.24416693066820558, "learning_rate": 2.160095313450461e-08, "loss": 0.012, "step": 10889 }, { "epoch": 2.920353982300885, "grad_norm": 0.18674075151576883, "learning_rate": 2.1456327139130484e-08, "loss": 0.0095, "step": 10890 }, { "epoch": 2.9206221507106465, "grad_norm": 0.23776588630764686, "learning_rate": 2.1312185891335658e-08, "loss": 0.0139, "step": 10891 }, { "epoch": 2.9208903191204074, "grad_norm": 0.2751537833052374, "learning_rate": 2.116852940515557e-08, "loss": 0.0187, "step": 10892 }, { "epoch": 2.921158487530169, "grad_norm": 0.21451154460290375, "learning_rate": 2.102535769457681e-08, "loss": 0.0118, "step": 10893 }, { "epoch": 2.9214266559399302, "grad_norm": 0.4785150139906985, "learning_rate": 2.0882670773540448e-08, "loss": 0.0188, "step": 10894 }, { "epoch": 2.9216948243496916, "grad_norm": 0.19762954433430227, "learning_rate": 2.0740468655938707e-08, "loss": 0.0102, "step": 10895 }, { "epoch": 2.921962992759453, "grad_norm": 0.2849525008582507, "learning_rate": 2.0598751355618284e-08, "loss": 0.0135, "step": 10896 }, { "epoch": 2.922231161169214, "grad_norm": 0.1794722805759241, "learning_rate": 2.0457518886377038e-08, "loss": 0.009, "step": 10897 }, { "epoch": 2.922499329578976, "grad_norm": 0.2756025572020325, "learning_rate": 2.031677126196674e-08, "loss": 0.0135, "step": 10898 }, { "epoch": 2.922767497988737, "grad_norm": 0.20799350811844905, "learning_rate": 2.017650849609143e-08, "loss": 0.0098, "step": 10899 }, { "epoch": 2.923035666398498, "grad_norm": 0.22144873139322496, "learning_rate": 2.0036730602408518e-08, "loss": 0.0155, "step": 10900 }, { "epoch": 2.9233038348082596, "grad_norm": 0.2513534599302159, "learning_rate": 1.9897437594528223e-08, "loss": 0.009, "step": 10901 }, { "epoch": 2.923572003218021, "grad_norm": 0.24681839379200368, "learning_rate": 1.975862948601137e-08, "loss": 0.0161, "step": 10902 }, { "epoch": 2.9238401716277824, "grad_norm": 0.2671882114915564, "learning_rate": 1.962030629037548e-08, "loss": 0.0122, "step": 10903 }, { "epoch": 2.9241083400375434, "grad_norm": 0.2627288788966217, "learning_rate": 1.9482468021087553e-08, "loss": 0.0124, "step": 10904 }, { "epoch": 2.924376508447305, "grad_norm": 0.2660958897987377, "learning_rate": 1.9345114691567966e-08, "loss": 0.0124, "step": 10905 }, { "epoch": 2.924644676857066, "grad_norm": 0.23112443387305495, "learning_rate": 1.9208246315192135e-08, "loss": 0.0148, "step": 10906 }, { "epoch": 2.9249128452668276, "grad_norm": 0.24064417362208038, "learning_rate": 1.9071862905284954e-08, "loss": 0.0134, "step": 10907 }, { "epoch": 2.925181013676589, "grad_norm": 0.2940959148897896, "learning_rate": 1.8935964475126912e-08, "loss": 0.0124, "step": 10908 }, { "epoch": 2.92544918208635, "grad_norm": 0.19432312501761773, "learning_rate": 1.8800551037949088e-08, "loss": 0.0079, "step": 10909 }, { "epoch": 2.925717350496112, "grad_norm": 0.2884075534348664, "learning_rate": 1.8665622606936497e-08, "loss": 0.0143, "step": 10910 }, { "epoch": 2.9259855189058728, "grad_norm": 0.22942145243531906, "learning_rate": 1.8531179195227512e-08, "loss": 0.0085, "step": 10911 }, { "epoch": 2.926253687315634, "grad_norm": 0.21690642565860213, "learning_rate": 1.839722081591222e-08, "loss": 0.0141, "step": 10912 }, { "epoch": 2.9265218557253956, "grad_norm": 0.24019350793914943, "learning_rate": 1.8263747482033522e-08, "loss": 0.0143, "step": 10913 }, { "epoch": 2.926790024135157, "grad_norm": 0.23615444280569983, "learning_rate": 1.8130759206587135e-08, "loss": 0.0128, "step": 10914 }, { "epoch": 2.9270581925449184, "grad_norm": 0.21250310265643144, "learning_rate": 1.7998256002522695e-08, "loss": 0.0128, "step": 10915 }, { "epoch": 2.9273263609546794, "grad_norm": 0.4299236312907545, "learning_rate": 1.7866237882740444e-08, "loss": 0.0198, "step": 10916 }, { "epoch": 2.9275945293644408, "grad_norm": 0.334468239435671, "learning_rate": 1.7734704860095652e-08, "loss": 0.019, "step": 10917 }, { "epoch": 2.927862697774202, "grad_norm": 0.23012294254895008, "learning_rate": 1.7603656947395297e-08, "loss": 0.0139, "step": 10918 }, { "epoch": 2.9281308661839636, "grad_norm": 0.2256914893620224, "learning_rate": 1.747309415739806e-08, "loss": 0.0147, "step": 10919 }, { "epoch": 2.928399034593725, "grad_norm": 0.3197902013077106, "learning_rate": 1.7343016502817667e-08, "loss": 0.0141, "step": 10920 }, { "epoch": 2.928667203003486, "grad_norm": 0.22949222162755775, "learning_rate": 1.7213423996319532e-08, "loss": 0.0094, "step": 10921 }, { "epoch": 2.928935371413248, "grad_norm": 0.2601219426063491, "learning_rate": 1.7084316650520795e-08, "loss": 0.015, "step": 10922 }, { "epoch": 2.9292035398230087, "grad_norm": 0.27709697234096875, "learning_rate": 1.6955694477993055e-08, "loss": 0.015, "step": 10923 }, { "epoch": 2.92947170823277, "grad_norm": 0.35969542260521226, "learning_rate": 1.6827557491259083e-08, "loss": 0.0136, "step": 10924 }, { "epoch": 2.9297398766425315, "grad_norm": 0.2777014455834459, "learning_rate": 1.6699905702796114e-08, "loss": 0.0183, "step": 10925 }, { "epoch": 2.930008045052293, "grad_norm": 0.2942994911345897, "learning_rate": 1.65727391250331e-08, "loss": 0.0203, "step": 10926 }, { "epoch": 2.9302762134620544, "grad_norm": 0.24350234493024012, "learning_rate": 1.6446057770351798e-08, "loss": 0.0113, "step": 10927 }, { "epoch": 2.9305443818718153, "grad_norm": 0.25727247506285983, "learning_rate": 1.6319861651086233e-08, "loss": 0.015, "step": 10928 }, { "epoch": 2.9308125502815767, "grad_norm": 0.21657136194624046, "learning_rate": 1.6194150779524353e-08, "loss": 0.0105, "step": 10929 }, { "epoch": 2.931080718691338, "grad_norm": 0.23798903100143778, "learning_rate": 1.6068925167906923e-08, "loss": 0.0127, "step": 10930 }, { "epoch": 2.9313488871010995, "grad_norm": 0.23611045165197303, "learning_rate": 1.5944184828425856e-08, "loss": 0.0133, "step": 10931 }, { "epoch": 2.931617055510861, "grad_norm": 0.28977469091433855, "learning_rate": 1.581992977322644e-08, "loss": 0.0132, "step": 10932 }, { "epoch": 2.931885223920622, "grad_norm": 0.3281394587405311, "learning_rate": 1.5696160014408436e-08, "loss": 0.0149, "step": 10933 }, { "epoch": 2.9321533923303837, "grad_norm": 0.34621639839528223, "learning_rate": 1.5572875564022206e-08, "loss": 0.0129, "step": 10934 }, { "epoch": 2.9324215607401447, "grad_norm": 0.20540521067679937, "learning_rate": 1.545007643407148e-08, "loss": 0.008, "step": 10935 }, { "epoch": 2.932689729149906, "grad_norm": 0.26270790769104435, "learning_rate": 1.5327762636513367e-08, "loss": 0.0118, "step": 10936 }, { "epoch": 2.9329578975596675, "grad_norm": 0.20646058657565122, "learning_rate": 1.5205934183256666e-08, "loss": 0.01, "step": 10937 }, { "epoch": 2.933226065969429, "grad_norm": 0.2165060735920053, "learning_rate": 1.5084591086164113e-08, "loss": 0.01, "step": 10938 }, { "epoch": 2.9334942343791903, "grad_norm": 0.2731529205126739, "learning_rate": 1.4963733357050147e-08, "loss": 0.012, "step": 10939 }, { "epoch": 2.9337624027889513, "grad_norm": 0.24638938825315598, "learning_rate": 1.4843361007682022e-08, "loss": 0.0154, "step": 10940 }, { "epoch": 2.9340305711987127, "grad_norm": 0.21061866867507942, "learning_rate": 1.4723474049780917e-08, "loss": 0.013, "step": 10941 }, { "epoch": 2.934298739608474, "grad_norm": 0.23984387495920148, "learning_rate": 1.4604072495019161e-08, "loss": 0.0112, "step": 10942 }, { "epoch": 2.9345669080182355, "grad_norm": 0.24114268966738003, "learning_rate": 1.4485156355023567e-08, "loss": 0.0127, "step": 10943 }, { "epoch": 2.934835076427997, "grad_norm": 0.30193331098843473, "learning_rate": 1.4366725641370982e-08, "loss": 0.0182, "step": 10944 }, { "epoch": 2.935103244837758, "grad_norm": 0.29895661705761783, "learning_rate": 1.4248780365594405e-08, "loss": 0.0167, "step": 10945 }, { "epoch": 2.9353714132475197, "grad_norm": 0.3266412504164508, "learning_rate": 1.4131320539177428e-08, "loss": 0.0146, "step": 10946 }, { "epoch": 2.9356395816572807, "grad_norm": 0.41820298922911775, "learning_rate": 1.4014346173555904e-08, "loss": 0.0244, "step": 10947 }, { "epoch": 2.935907750067042, "grad_norm": 0.25497468860597533, "learning_rate": 1.3897857280120164e-08, "loss": 0.0137, "step": 10948 }, { "epoch": 2.9361759184768035, "grad_norm": 0.26745734399606585, "learning_rate": 1.3781853870212247e-08, "loss": 0.013, "step": 10949 }, { "epoch": 2.936444086886565, "grad_norm": 0.25223436541181116, "learning_rate": 1.3666335955127008e-08, "loss": 0.0136, "step": 10950 }, { "epoch": 2.9367122552963263, "grad_norm": 0.21549469422500683, "learning_rate": 1.3551303546112671e-08, "loss": 0.0125, "step": 10951 }, { "epoch": 2.9369804237060873, "grad_norm": 0.5091830238050425, "learning_rate": 1.343675665436861e-08, "loss": 0.0169, "step": 10952 }, { "epoch": 2.9372485921158487, "grad_norm": 0.2150053677097419, "learning_rate": 1.3322695291048682e-08, "loss": 0.0106, "step": 10953 }, { "epoch": 2.93751676052561, "grad_norm": 0.24212300712390375, "learning_rate": 1.3209119467258447e-08, "loss": 0.0125, "step": 10954 }, { "epoch": 2.9377849289353715, "grad_norm": 0.24003109370612688, "learning_rate": 1.3096029194056836e-08, "loss": 0.0125, "step": 10955 }, { "epoch": 2.938053097345133, "grad_norm": 0.29986676349867614, "learning_rate": 1.2983424482454488e-08, "loss": 0.0212, "step": 10956 }, { "epoch": 2.938321265754894, "grad_norm": 0.23085016194647184, "learning_rate": 1.2871305343415964e-08, "loss": 0.0105, "step": 10957 }, { "epoch": 2.9385894341646557, "grad_norm": 0.22637526893915144, "learning_rate": 1.275967178785753e-08, "loss": 0.0124, "step": 10958 }, { "epoch": 2.9388576025744166, "grad_norm": 0.2821899239517504, "learning_rate": 1.2648523826649384e-08, "loss": 0.014, "step": 10959 }, { "epoch": 2.939125770984178, "grad_norm": 0.22929435166813625, "learning_rate": 1.2537861470613421e-08, "loss": 0.012, "step": 10960 }, { "epoch": 2.9393939393939394, "grad_norm": 0.22523382883723653, "learning_rate": 1.2427684730524358e-08, "loss": 0.0123, "step": 10961 }, { "epoch": 2.939662107803701, "grad_norm": 0.23359015417517265, "learning_rate": 1.2317993617109724e-08, "loss": 0.0106, "step": 10962 }, { "epoch": 2.9399302762134623, "grad_norm": 0.3021731993023363, "learning_rate": 1.2208788141049866e-08, "loss": 0.0157, "step": 10963 }, { "epoch": 2.940198444623223, "grad_norm": 0.20889239751945007, "learning_rate": 1.2100068312978496e-08, "loss": 0.0143, "step": 10964 }, { "epoch": 2.9404666130329846, "grad_norm": 0.30111804872513614, "learning_rate": 1.1991834143480485e-08, "loss": 0.0181, "step": 10965 }, { "epoch": 2.940734781442746, "grad_norm": 0.3000648374294634, "learning_rate": 1.1884085643094623e-08, "loss": 0.0187, "step": 10966 }, { "epoch": 2.9410029498525074, "grad_norm": 0.2225868173738058, "learning_rate": 1.1776822822312517e-08, "loss": 0.0097, "step": 10967 }, { "epoch": 2.941271118262269, "grad_norm": 0.21643714942560044, "learning_rate": 1.1670045691577481e-08, "loss": 0.0129, "step": 10968 }, { "epoch": 2.94153928667203, "grad_norm": 0.2557999911459049, "learning_rate": 1.1563754261286198e-08, "loss": 0.0165, "step": 10969 }, { "epoch": 2.941807455081791, "grad_norm": 0.2465186903940971, "learning_rate": 1.1457948541788166e-08, "loss": 0.0118, "step": 10970 }, { "epoch": 2.9420756234915526, "grad_norm": 0.252442814789678, "learning_rate": 1.1352628543385702e-08, "loss": 0.0126, "step": 10971 }, { "epoch": 2.942343791901314, "grad_norm": 0.1883836758101972, "learning_rate": 1.1247794276332824e-08, "loss": 0.0091, "step": 10972 }, { "epoch": 2.9426119603110754, "grad_norm": 0.19540019219023644, "learning_rate": 1.1143445750837479e-08, "loss": 0.01, "step": 10973 }, { "epoch": 2.942880128720837, "grad_norm": 0.19583168607066823, "learning_rate": 1.103958297705987e-08, "loss": 0.0075, "step": 10974 }, { "epoch": 2.9431482971305982, "grad_norm": 0.23055118245359696, "learning_rate": 1.0936205965111913e-08, "loss": 0.0096, "step": 10975 }, { "epoch": 2.943416465540359, "grad_norm": 0.19322218084880466, "learning_rate": 1.0833314725060551e-08, "loss": 0.0111, "step": 10976 }, { "epoch": 2.9436846339501206, "grad_norm": 0.23140029168761792, "learning_rate": 1.0730909266923328e-08, "loss": 0.0154, "step": 10977 }, { "epoch": 2.943952802359882, "grad_norm": 0.2212576074103909, "learning_rate": 1.0628989600671158e-08, "loss": 0.012, "step": 10978 }, { "epoch": 2.9442209707696434, "grad_norm": 0.27311872120108416, "learning_rate": 1.0527555736227213e-08, "loss": 0.015, "step": 10979 }, { "epoch": 2.944489139179405, "grad_norm": 0.2643609744650085, "learning_rate": 1.0426607683469148e-08, "loss": 0.0155, "step": 10980 }, { "epoch": 2.9447573075891658, "grad_norm": 0.16364578439557606, "learning_rate": 1.0326145452224656e-08, "loss": 0.0066, "step": 10981 }, { "epoch": 2.945025475998927, "grad_norm": 0.2218760926935012, "learning_rate": 1.0226169052276469e-08, "loss": 0.0151, "step": 10982 }, { "epoch": 2.9452936444086886, "grad_norm": 0.21742735518511308, "learning_rate": 1.0126678493358466e-08, "loss": 0.0118, "step": 10983 }, { "epoch": 2.94556181281845, "grad_norm": 0.2341012132103654, "learning_rate": 1.0027673785157899e-08, "loss": 0.0118, "step": 10984 }, { "epoch": 2.9458299812282114, "grad_norm": 0.29852071709328143, "learning_rate": 9.929154937314278e-09, "loss": 0.0143, "step": 10985 }, { "epoch": 2.946098149637973, "grad_norm": 0.26944244971715664, "learning_rate": 9.831121959421042e-09, "loss": 0.012, "step": 10986 }, { "epoch": 2.946366318047734, "grad_norm": 0.268586108013912, "learning_rate": 9.733574861022222e-09, "loss": 0.0149, "step": 10987 }, { "epoch": 2.946634486457495, "grad_norm": 0.2593955399371402, "learning_rate": 9.636513651616885e-09, "loss": 0.0197, "step": 10988 }, { "epoch": 2.9469026548672566, "grad_norm": 0.22545787927317099, "learning_rate": 9.539938340654698e-09, "loss": 0.0134, "step": 10989 }, { "epoch": 2.947170823277018, "grad_norm": 0.2043549721156079, "learning_rate": 9.443848937539246e-09, "loss": 0.012, "step": 10990 }, { "epoch": 2.9474389916867794, "grad_norm": 0.39302996078005364, "learning_rate": 9.348245451626935e-09, "loss": 0.0093, "step": 10991 }, { "epoch": 2.9477071600965408, "grad_norm": 0.29771201896700183, "learning_rate": 9.253127892225877e-09, "loss": 0.0121, "step": 10992 }, { "epoch": 2.9479753285063017, "grad_norm": 0.20806591628705107, "learning_rate": 9.15849626859755e-09, "loss": 0.0107, "step": 10993 }, { "epoch": 2.948243496916063, "grad_norm": 0.2608632504168459, "learning_rate": 9.064350589955695e-09, "loss": 0.0121, "step": 10994 }, { "epoch": 2.9485116653258245, "grad_norm": 0.21976282886729137, "learning_rate": 8.97069086546798e-09, "loss": 0.0125, "step": 10995 }, { "epoch": 2.948779833735586, "grad_norm": 0.2243083650198348, "learning_rate": 8.877517104252665e-09, "loss": 0.0129, "step": 10996 }, { "epoch": 2.9490480021453473, "grad_norm": 0.3116338752155259, "learning_rate": 8.784829315382493e-09, "loss": 0.0172, "step": 10997 }, { "epoch": 2.9493161705551088, "grad_norm": 0.1813790332835123, "learning_rate": 8.692627507881912e-09, "loss": 0.0084, "step": 10998 }, { "epoch": 2.94958433896487, "grad_norm": 0.20252986270411516, "learning_rate": 8.600911690728742e-09, "loss": 0.0123, "step": 10999 }, { "epoch": 2.949852507374631, "grad_norm": 0.28292403732637544, "learning_rate": 8.50968187285306e-09, "loss": 0.014, "step": 11000 }, { "epoch": 2.9501206757843925, "grad_norm": 0.2920487919435454, "learning_rate": 8.418938063137205e-09, "loss": 0.0118, "step": 11001 }, { "epoch": 2.950388844194154, "grad_norm": 0.32111722081115657, "learning_rate": 8.328680270416888e-09, "loss": 0.0126, "step": 11002 }, { "epoch": 2.9506570126039153, "grad_norm": 0.26180813662921765, "learning_rate": 8.238908503480636e-09, "loss": 0.0139, "step": 11003 }, { "epoch": 2.9509251810136767, "grad_norm": 0.3055862399151533, "learning_rate": 8.149622771068677e-09, "loss": 0.0139, "step": 11004 }, { "epoch": 2.9511933494234377, "grad_norm": 0.18615916869483817, "learning_rate": 8.06082308187517e-09, "loss": 0.0102, "step": 11005 }, { "epoch": 2.951461517833199, "grad_norm": 0.18563356252299068, "learning_rate": 7.972509444545973e-09, "loss": 0.0091, "step": 11006 }, { "epoch": 2.9517296862429605, "grad_norm": 0.23022551686459858, "learning_rate": 7.884681867679766e-09, "loss": 0.0134, "step": 11007 }, { "epoch": 2.951997854652722, "grad_norm": 0.21354139216914916, "learning_rate": 7.797340359828043e-09, "loss": 0.0181, "step": 11008 }, { "epoch": 2.9522660230624833, "grad_norm": 0.24139668495791866, "learning_rate": 7.710484929495666e-09, "loss": 0.0139, "step": 11009 }, { "epoch": 2.9525341914722447, "grad_norm": 0.22657338054194212, "learning_rate": 7.624115585139202e-09, "loss": 0.011, "step": 11010 }, { "epoch": 2.952802359882006, "grad_norm": 0.29407794008679367, "learning_rate": 7.538232335167483e-09, "loss": 0.012, "step": 11011 }, { "epoch": 2.953070528291767, "grad_norm": 0.23532974648279006, "learning_rate": 7.452835187943819e-09, "loss": 0.0116, "step": 11012 }, { "epoch": 2.9533386967015285, "grad_norm": 0.23972060940016243, "learning_rate": 7.3679241517826685e-09, "loss": 0.0075, "step": 11013 }, { "epoch": 2.95360686511129, "grad_norm": 0.26350688155962315, "learning_rate": 7.2834992349513075e-09, "loss": 0.0142, "step": 11014 }, { "epoch": 2.9538750335210513, "grad_norm": 0.29370899842408016, "learning_rate": 7.199560445670384e-09, "loss": 0.0141, "step": 11015 }, { "epoch": 2.9541432019308127, "grad_norm": 0.3539727969652111, "learning_rate": 7.116107792112803e-09, "loss": 0.0188, "step": 11016 }, { "epoch": 2.9544113703405737, "grad_norm": 0.2068094474418927, "learning_rate": 7.033141282403733e-09, "loss": 0.0109, "step": 11017 }, { "epoch": 2.954679538750335, "grad_norm": 0.27576145864803947, "learning_rate": 6.950660924621156e-09, "loss": 0.0167, "step": 11018 }, { "epoch": 2.9549477071600965, "grad_norm": 0.2496861556096474, "learning_rate": 6.86866672679698e-09, "loss": 0.014, "step": 11019 }, { "epoch": 2.955215875569858, "grad_norm": 0.261069895226711, "learning_rate": 6.787158696914265e-09, "loss": 0.0145, "step": 11020 }, { "epoch": 2.9554840439796193, "grad_norm": 0.1885906194524536, "learning_rate": 6.706136842909439e-09, "loss": 0.0087, "step": 11021 }, { "epoch": 2.9557522123893807, "grad_norm": 0.2320540735531457, "learning_rate": 6.625601172670637e-09, "loss": 0.0102, "step": 11022 }, { "epoch": 2.956020380799142, "grad_norm": 0.3001526439293878, "learning_rate": 6.54555169403992e-09, "loss": 0.0179, "step": 11023 }, { "epoch": 2.956288549208903, "grad_norm": 0.29231508712076265, "learning_rate": 6.465988414811608e-09, "loss": 0.0164, "step": 11024 }, { "epoch": 2.9565567176186645, "grad_norm": 0.2089858622079921, "learning_rate": 6.386911342732283e-09, "loss": 0.0111, "step": 11025 }, { "epoch": 2.956824886028426, "grad_norm": 0.456812526477083, "learning_rate": 6.308320485501895e-09, "loss": 0.0122, "step": 11026 }, { "epoch": 2.9570930544381873, "grad_norm": 0.22100851311188455, "learning_rate": 6.230215850772103e-09, "loss": 0.0105, "step": 11027 }, { "epoch": 2.9573612228479487, "grad_norm": 0.2692792636774736, "learning_rate": 6.1525974461479345e-09, "loss": 0.0155, "step": 11028 }, { "epoch": 2.9576293912577096, "grad_norm": 0.16411345040323266, "learning_rate": 6.075465279187232e-09, "loss": 0.0077, "step": 11029 }, { "epoch": 2.957897559667471, "grad_norm": 0.22520368159005086, "learning_rate": 5.998819357399543e-09, "loss": 0.0129, "step": 11030 }, { "epoch": 2.9581657280772324, "grad_norm": 0.28934349194706493, "learning_rate": 5.9226596882483445e-09, "loss": 0.0164, "step": 11031 }, { "epoch": 2.958433896486994, "grad_norm": 0.2595500065451148, "learning_rate": 5.846986279148259e-09, "loss": 0.0142, "step": 11032 }, { "epoch": 2.9587020648967552, "grad_norm": 0.15325889819975147, "learning_rate": 5.771799137468392e-09, "loss": 0.0073, "step": 11033 }, { "epoch": 2.9589702333065167, "grad_norm": 0.2484839551954366, "learning_rate": 5.697098270529e-09, "loss": 0.0147, "step": 11034 }, { "epoch": 2.959238401716278, "grad_norm": 0.327708875680215, "learning_rate": 5.622883685603153e-09, "loss": 0.0367, "step": 11035 }, { "epoch": 2.959506570126039, "grad_norm": 0.30876315799378196, "learning_rate": 5.549155389917848e-09, "loss": 0.0153, "step": 11036 }, { "epoch": 2.9597747385358004, "grad_norm": 0.23619310520640383, "learning_rate": 5.475913390650678e-09, "loss": 0.0144, "step": 11037 }, { "epoch": 2.960042906945562, "grad_norm": 0.2035711117694188, "learning_rate": 5.403157694933714e-09, "loss": 0.0117, "step": 11038 }, { "epoch": 2.9603110753553232, "grad_norm": 0.26458460359750696, "learning_rate": 5.33088830985129e-09, "loss": 0.014, "step": 11039 }, { "epoch": 2.9605792437650846, "grad_norm": 0.2328340881568737, "learning_rate": 5.259105242439444e-09, "loss": 0.0104, "step": 11040 }, { "epoch": 2.9608474121748456, "grad_norm": 0.25267594743758226, "learning_rate": 5.1878084996875855e-09, "loss": 0.013, "step": 11041 }, { "epoch": 2.961115580584607, "grad_norm": 0.22293967678766832, "learning_rate": 5.116998088537939e-09, "loss": 0.0113, "step": 11042 }, { "epoch": 2.9613837489943684, "grad_norm": 0.21258786732186968, "learning_rate": 5.0466740158849895e-09, "loss": 0.013, "step": 11043 }, { "epoch": 2.96165191740413, "grad_norm": 0.2267627508780837, "learning_rate": 4.976836288576592e-09, "loss": 0.0094, "step": 11044 }, { "epoch": 2.961920085813891, "grad_norm": 0.2795388000207788, "learning_rate": 4.907484913411198e-09, "loss": 0.0129, "step": 11045 }, { "epoch": 2.9621882542236526, "grad_norm": 0.18654059103612475, "learning_rate": 4.838619897142849e-09, "loss": 0.0109, "step": 11046 }, { "epoch": 2.962456422633414, "grad_norm": 0.22663611981306914, "learning_rate": 4.7702412464761815e-09, "loss": 0.0099, "step": 11047 }, { "epoch": 2.962724591043175, "grad_norm": 0.23270388261886116, "learning_rate": 4.702348968069204e-09, "loss": 0.0144, "step": 11048 }, { "epoch": 2.9629927594529364, "grad_norm": 0.2219330575825363, "learning_rate": 4.634943068531627e-09, "loss": 0.0129, "step": 11049 }, { "epoch": 2.963260927862698, "grad_norm": 0.261189059244269, "learning_rate": 4.568023554427647e-09, "loss": 0.0154, "step": 11050 }, { "epoch": 2.963529096272459, "grad_norm": 0.25025273335962905, "learning_rate": 4.501590432272607e-09, "loss": 0.0133, "step": 11051 }, { "epoch": 2.9637972646822206, "grad_norm": 0.2554766721300398, "learning_rate": 4.435643708534665e-09, "loss": 0.014, "step": 11052 }, { "epoch": 2.9640654330919816, "grad_norm": 0.20598332245553708, "learning_rate": 4.370183389635352e-09, "loss": 0.0137, "step": 11053 }, { "epoch": 2.964333601501743, "grad_norm": 0.24057108564672777, "learning_rate": 4.305209481948458e-09, "loss": 0.0119, "step": 11054 }, { "epoch": 2.9646017699115044, "grad_norm": 0.19577140392189143, "learning_rate": 4.240721991799479e-09, "loss": 0.0116, "step": 11055 }, { "epoch": 2.9648699383212658, "grad_norm": 0.28393335385980295, "learning_rate": 4.176720925467837e-09, "loss": 0.0126, "step": 11056 }, { "epoch": 2.965138106731027, "grad_norm": 0.27124130629002036, "learning_rate": 4.113206289185767e-09, "loss": 0.0151, "step": 11057 }, { "epoch": 2.965406275140788, "grad_norm": 0.2548893394015401, "learning_rate": 4.050178089136658e-09, "loss": 0.0115, "step": 11058 }, { "epoch": 2.96567444355055, "grad_norm": 0.20846308078470122, "learning_rate": 3.987636331457822e-09, "loss": 0.0115, "step": 11059 }, { "epoch": 2.965942611960311, "grad_norm": 0.2724776133684185, "learning_rate": 3.925581022238279e-09, "loss": 0.0154, "step": 11060 }, { "epoch": 2.9662107803700724, "grad_norm": 0.22127555164722354, "learning_rate": 3.86401216752097e-09, "loss": 0.0119, "step": 11061 }, { "epoch": 2.9664789487798338, "grad_norm": 0.23543738384679128, "learning_rate": 3.802929773300546e-09, "loss": 0.0162, "step": 11062 }, { "epoch": 2.966747117189595, "grad_norm": 0.23469447189839013, "learning_rate": 3.742333845523916e-09, "loss": 0.012, "step": 11063 }, { "epoch": 2.9670152855993566, "grad_norm": 0.3869669479271844, "learning_rate": 3.68222439009136e-09, "loss": 0.0221, "step": 11064 }, { "epoch": 2.9672834540091175, "grad_norm": 0.3842178041331062, "learning_rate": 3.6226014128554198e-09, "loss": 0.0092, "step": 11065 }, { "epoch": 2.967551622418879, "grad_norm": 0.3102696991711149, "learning_rate": 3.563464919622006e-09, "loss": 0.0165, "step": 11066 }, { "epoch": 2.9678197908286403, "grad_norm": 0.26073251863966534, "learning_rate": 3.5048149161487356e-09, "loss": 0.0158, "step": 11067 }, { "epoch": 2.9680879592384017, "grad_norm": 0.25194678460377523, "learning_rate": 3.44665140814604e-09, "loss": 0.0139, "step": 11068 }, { "epoch": 2.968356127648163, "grad_norm": 0.23923012932064258, "learning_rate": 3.3889744012771676e-09, "loss": 0.011, "step": 11069 }, { "epoch": 2.968624296057924, "grad_norm": 0.2595424809639399, "learning_rate": 3.331783901158736e-09, "loss": 0.0148, "step": 11070 }, { "epoch": 2.968892464467686, "grad_norm": 0.21031684967431186, "learning_rate": 3.275079913357959e-09, "loss": 0.0107, "step": 11071 }, { "epoch": 2.969160632877447, "grad_norm": 0.23175539703954384, "learning_rate": 3.218862443397086e-09, "loss": 0.0091, "step": 11072 }, { "epoch": 2.9694288012872083, "grad_norm": 0.280362620482897, "learning_rate": 3.163131496748961e-09, "loss": 0.0106, "step": 11073 }, { "epoch": 2.9696969696969697, "grad_norm": 0.3304426194496598, "learning_rate": 3.1078870788403547e-09, "loss": 0.0177, "step": 11074 }, { "epoch": 2.969965138106731, "grad_norm": 0.23580989752216705, "learning_rate": 3.053129195050852e-09, "loss": 0.0161, "step": 11075 }, { "epoch": 2.9702333065164925, "grad_norm": 0.21336601368907335, "learning_rate": 2.99885785071119e-09, "loss": 0.0077, "step": 11076 }, { "epoch": 2.9705014749262535, "grad_norm": 0.3206062621438976, "learning_rate": 2.9450730511060288e-09, "loss": 0.0171, "step": 11077 }, { "epoch": 2.970769643336015, "grad_norm": 0.2933543724305954, "learning_rate": 2.891774801471736e-09, "loss": 0.0136, "step": 11078 }, { "epoch": 2.9710378117457763, "grad_norm": 0.2986064974373111, "learning_rate": 2.8389631069986044e-09, "loss": 0.0174, "step": 11079 }, { "epoch": 2.9713059801555377, "grad_norm": 0.2198478181512586, "learning_rate": 2.786637972828632e-09, "loss": 0.012, "step": 11080 }, { "epoch": 2.971574148565299, "grad_norm": 0.23523966564909787, "learning_rate": 2.7347994040560767e-09, "loss": 0.0127, "step": 11081 }, { "epoch": 2.97184231697506, "grad_norm": 0.24498553157980932, "learning_rate": 2.6834474057285676e-09, "loss": 0.0187, "step": 11082 }, { "epoch": 2.972110485384822, "grad_norm": 0.2637130222516912, "learning_rate": 2.6325819828465495e-09, "loss": 0.0152, "step": 11083 }, { "epoch": 2.972378653794583, "grad_norm": 0.2110131468880044, "learning_rate": 2.5822031403621715e-09, "loss": 0.0109, "step": 11084 }, { "epoch": 2.9726468222043443, "grad_norm": 0.2585096548134443, "learning_rate": 2.5323108831804e-09, "loss": 0.0097, "step": 11085 }, { "epoch": 2.9729149906141057, "grad_norm": 0.30902473013626225, "learning_rate": 2.4829052161601253e-09, "loss": 0.014, "step": 11086 }, { "epoch": 2.973183159023867, "grad_norm": 0.3101559214986143, "learning_rate": 2.4339861441108337e-09, "loss": 0.018, "step": 11087 }, { "epoch": 2.9734513274336285, "grad_norm": 0.2098092126192229, "learning_rate": 2.3855536717959372e-09, "loss": 0.0116, "step": 11088 }, { "epoch": 2.9737194958433895, "grad_norm": 0.20397358814268166, "learning_rate": 2.337607803931108e-09, "loss": 0.009, "step": 11089 }, { "epoch": 2.973987664253151, "grad_norm": 0.23881556088138067, "learning_rate": 2.290148545185389e-09, "loss": 0.0119, "step": 11090 }, { "epoch": 2.9742558326629123, "grad_norm": 0.18642913363217348, "learning_rate": 2.2431759001789734e-09, "loss": 0.0085, "step": 11091 }, { "epoch": 2.9745240010726737, "grad_norm": 0.24466709529192943, "learning_rate": 2.1966898734854246e-09, "loss": 0.0113, "step": 11092 }, { "epoch": 2.974792169482435, "grad_norm": 0.23209169646385097, "learning_rate": 2.1506904696316775e-09, "loss": 0.0173, "step": 11093 }, { "epoch": 2.975060337892196, "grad_norm": 0.24040530436982996, "learning_rate": 2.1051776930963718e-09, "loss": 0.0109, "step": 11094 }, { "epoch": 2.975328506301958, "grad_norm": 0.2215895372181935, "learning_rate": 2.0601515483104073e-09, "loss": 0.0125, "step": 11095 }, { "epoch": 2.975596674711719, "grad_norm": 0.219227633530607, "learning_rate": 2.015612039658055e-09, "loss": 0.0106, "step": 11096 }, { "epoch": 2.9758648431214803, "grad_norm": 0.3057921284664526, "learning_rate": 1.9715591714764003e-09, "loss": 0.0139, "step": 11097 }, { "epoch": 2.9761330115312417, "grad_norm": 0.2557013325661048, "learning_rate": 1.927992948054791e-09, "loss": 0.0126, "step": 11098 }, { "epoch": 2.976401179941003, "grad_norm": 0.3541306906266848, "learning_rate": 1.8849133736342786e-09, "loss": 0.0136, "step": 11099 }, { "epoch": 2.9766693483507645, "grad_norm": 0.22237571207545903, "learning_rate": 1.8423204524103955e-09, "loss": 0.0157, "step": 11100 }, { "epoch": 2.9769375167605254, "grad_norm": 0.2434901625028527, "learning_rate": 1.8002141885298252e-09, "loss": 0.0103, "step": 11101 }, { "epoch": 2.977205685170287, "grad_norm": 0.21593467113912282, "learning_rate": 1.7585945860926213e-09, "loss": 0.0178, "step": 11102 }, { "epoch": 2.9774738535800482, "grad_norm": 0.26323112928104564, "learning_rate": 1.7174616491510975e-09, "loss": 0.016, "step": 11103 }, { "epoch": 2.9777420219898096, "grad_norm": 0.26082567390149924, "learning_rate": 1.6768153817098288e-09, "loss": 0.0095, "step": 11104 }, { "epoch": 2.978010190399571, "grad_norm": 0.22808575341026444, "learning_rate": 1.6366557877267597e-09, "loss": 0.0104, "step": 11105 }, { "epoch": 2.978278358809332, "grad_norm": 0.2130544878242677, "learning_rate": 1.5969828711120961e-09, "loss": 0.0086, "step": 11106 }, { "epoch": 2.978546527219094, "grad_norm": 0.2432737700232307, "learning_rate": 1.5577966357288588e-09, "loss": 0.0128, "step": 11107 }, { "epoch": 2.978814695628855, "grad_norm": 0.30025075504350274, "learning_rate": 1.5190970853923293e-09, "loss": 0.0137, "step": 11108 }, { "epoch": 2.979082864038616, "grad_norm": 0.2591173124550683, "learning_rate": 1.4808842238706045e-09, "loss": 0.015, "step": 11109 }, { "epoch": 2.9793510324483776, "grad_norm": 0.2618365627730751, "learning_rate": 1.4431580548840418e-09, "loss": 0.017, "step": 11110 }, { "epoch": 2.979619200858139, "grad_norm": 0.26814887701328494, "learning_rate": 1.4059185821069243e-09, "loss": 0.0115, "step": 11111 }, { "epoch": 2.9798873692679004, "grad_norm": 0.25802258121965493, "learning_rate": 1.3691658091641302e-09, "loss": 0.011, "step": 11112 }, { "epoch": 2.9801555376776614, "grad_norm": 0.2920627452502052, "learning_rate": 1.3328997396344634e-09, "loss": 0.0142, "step": 11113 }, { "epoch": 2.980423706087423, "grad_norm": 0.21314065808199437, "learning_rate": 1.2971203770489883e-09, "loss": 0.0105, "step": 11114 }, { "epoch": 2.980691874497184, "grad_norm": 0.2178413280274995, "learning_rate": 1.2618277248921397e-09, "loss": 0.0152, "step": 11115 }, { "epoch": 2.9809600429069456, "grad_norm": 0.21505661295541822, "learning_rate": 1.2270217865995026e-09, "loss": 0.0164, "step": 11116 }, { "epoch": 2.981228211316707, "grad_norm": 0.23383493051656698, "learning_rate": 1.1927025655600333e-09, "loss": 0.0093, "step": 11117 }, { "epoch": 2.981496379726468, "grad_norm": 0.3195386338478377, "learning_rate": 1.1588700651155027e-09, "loss": 0.0144, "step": 11118 }, { "epoch": 2.98176454813623, "grad_norm": 0.23018117007013592, "learning_rate": 1.1255242885604978e-09, "loss": 0.0137, "step": 11119 }, { "epoch": 2.982032716545991, "grad_norm": 0.26294794627346524, "learning_rate": 1.0926652391413106e-09, "loss": 0.0097, "step": 11120 }, { "epoch": 2.982300884955752, "grad_norm": 0.2206135189688444, "learning_rate": 1.060292920057604e-09, "loss": 0.013, "step": 11121 }, { "epoch": 2.9825690533655136, "grad_norm": 0.21675469562138897, "learning_rate": 1.0284073344607458e-09, "loss": 0.0088, "step": 11122 }, { "epoch": 2.982837221775275, "grad_norm": 0.18564725714428187, "learning_rate": 9.970084854565853e-10, "loss": 0.0098, "step": 11123 }, { "epoch": 2.9831053901850364, "grad_norm": 0.5889375437527694, "learning_rate": 9.660963761010112e-10, "loss": 0.0193, "step": 11124 }, { "epoch": 2.9833735585947974, "grad_norm": 0.2978600648543535, "learning_rate": 9.35671009404948e-10, "loss": 0.0129, "step": 11125 }, { "epoch": 2.9836417270045588, "grad_norm": 0.19947924721086488, "learning_rate": 9.057323883299163e-10, "loss": 0.0114, "step": 11126 }, { "epoch": 2.98390989541432, "grad_norm": 0.23424007650263012, "learning_rate": 8.762805157913612e-10, "loss": 0.0131, "step": 11127 }, { "epoch": 2.9841780638240816, "grad_norm": 0.19782068621730642, "learning_rate": 8.47315394656989e-10, "loss": 0.0096, "step": 11128 }, { "epoch": 2.984446232233843, "grad_norm": 0.23739222908975008, "learning_rate": 8.188370277473212e-10, "loss": 0.0149, "step": 11129 }, { "epoch": 2.984714400643604, "grad_norm": 0.2001292391251265, "learning_rate": 7.908454178345848e-10, "loss": 0.0094, "step": 11130 }, { "epoch": 2.984982569053366, "grad_norm": 0.23650733090340254, "learning_rate": 7.633405676449323e-10, "loss": 0.0142, "step": 11131 }, { "epoch": 2.9852507374631267, "grad_norm": 0.19164698734944594, "learning_rate": 7.363224798556668e-10, "loss": 0.0085, "step": 11132 }, { "epoch": 2.985518905872888, "grad_norm": 0.23934634512458913, "learning_rate": 7.097911570985717e-10, "loss": 0.0177, "step": 11133 }, { "epoch": 2.9857870742826496, "grad_norm": 0.24602476640544266, "learning_rate": 6.837466019554706e-10, "loss": 0.0131, "step": 11134 }, { "epoch": 2.986055242692411, "grad_norm": 0.2836286349639682, "learning_rate": 6.581888169637784e-10, "loss": 0.0196, "step": 11135 }, { "epoch": 2.9863234111021724, "grad_norm": 0.2358314743971647, "learning_rate": 6.331178046103948e-10, "loss": 0.0124, "step": 11136 }, { "epoch": 2.9865915795119333, "grad_norm": 0.25059738817762284, "learning_rate": 6.085335673378101e-10, "loss": 0.0113, "step": 11137 }, { "epoch": 2.9868597479216947, "grad_norm": 0.2377480780758983, "learning_rate": 5.844361075391103e-10, "loss": 0.0149, "step": 11138 }, { "epoch": 2.987127916331456, "grad_norm": 0.2823192037429361, "learning_rate": 5.608254275607516e-10, "loss": 0.0141, "step": 11139 }, { "epoch": 2.9873960847412175, "grad_norm": 0.2354067392717898, "learning_rate": 5.37701529701451e-10, "loss": 0.0163, "step": 11140 }, { "epoch": 2.987664253150979, "grad_norm": 0.27142116577533837, "learning_rate": 5.150644162127405e-10, "loss": 0.0145, "step": 11141 }, { "epoch": 2.98793242156074, "grad_norm": 0.23774416641152707, "learning_rate": 4.929140892989681e-10, "loss": 0.0176, "step": 11142 }, { "epoch": 2.9882005899705013, "grad_norm": 0.22655035337014107, "learning_rate": 4.71250551116742e-10, "loss": 0.0133, "step": 11143 }, { "epoch": 2.9884687583802627, "grad_norm": 0.24267125735223366, "learning_rate": 4.5007380377548594e-10, "loss": 0.0169, "step": 11144 }, { "epoch": 2.988736926790024, "grad_norm": 0.3861421649057795, "learning_rate": 4.2938384933688405e-10, "loss": 0.0123, "step": 11145 }, { "epoch": 2.9890050951997855, "grad_norm": 0.27172508949621915, "learning_rate": 4.09180689815436e-10, "loss": 0.0151, "step": 11146 }, { "epoch": 2.989273263609547, "grad_norm": 0.20851391388169851, "learning_rate": 3.8946432717845707e-10, "loss": 0.0097, "step": 11147 }, { "epoch": 2.9895414320193083, "grad_norm": 0.1904651841812917, "learning_rate": 3.7023476334552276e-10, "loss": 0.0116, "step": 11148 }, { "epoch": 2.9898096004290693, "grad_norm": 0.31894540289041023, "learning_rate": 3.514920001895794e-10, "loss": 0.0191, "step": 11149 }, { "epoch": 2.9900777688388307, "grad_norm": 0.24232342545378996, "learning_rate": 3.332360395341683e-10, "loss": 0.0163, "step": 11150 }, { "epoch": 2.990345937248592, "grad_norm": 0.27627238640933655, "learning_rate": 3.1546688315842177e-10, "loss": 0.0171, "step": 11151 }, { "epoch": 2.9906141056583535, "grad_norm": 0.2852730314315949, "learning_rate": 2.981845327915123e-10, "loss": 0.0172, "step": 11152 }, { "epoch": 2.990882274068115, "grad_norm": 0.19117218788983623, "learning_rate": 2.8138899011653785e-10, "loss": 0.0094, "step": 11153 }, { "epoch": 2.991150442477876, "grad_norm": 0.31070996350426516, "learning_rate": 2.6508025676885706e-10, "loss": 0.0178, "step": 11154 }, { "epoch": 2.9914186108876373, "grad_norm": 0.267172109353608, "learning_rate": 2.492583343360888e-10, "loss": 0.0125, "step": 11155 }, { "epoch": 2.9916867792973987, "grad_norm": 0.2784276436201992, "learning_rate": 2.3392322435866753e-10, "loss": 0.018, "step": 11156 }, { "epoch": 2.99195494770716, "grad_norm": 0.2577698577895211, "learning_rate": 2.1907492833039835e-10, "loss": 0.0147, "step": 11157 }, { "epoch": 2.9922231161169215, "grad_norm": 0.20364807102104018, "learning_rate": 2.0471344769623646e-10, "loss": 0.0141, "step": 11158 }, { "epoch": 2.992491284526683, "grad_norm": 0.20838888270026573, "learning_rate": 1.9083878385561806e-10, "loss": 0.0137, "step": 11159 }, { "epoch": 2.9927594529364443, "grad_norm": 0.23332824344910325, "learning_rate": 1.7745093815801916e-10, "loss": 0.0124, "step": 11160 }, { "epoch": 2.9930276213462053, "grad_norm": 0.2757713952020089, "learning_rate": 1.6454991190795188e-10, "loss": 0.0117, "step": 11161 }, { "epoch": 2.9932957897559667, "grad_norm": 0.2587955182055354, "learning_rate": 1.5213570636163356e-10, "loss": 0.0162, "step": 11162 }, { "epoch": 2.993563958165728, "grad_norm": 0.23051791135146446, "learning_rate": 1.4020832272754193e-10, "loss": 0.0103, "step": 11163 }, { "epoch": 2.9938321265754895, "grad_norm": 0.183293732768527, "learning_rate": 1.2876776216697028e-10, "loss": 0.0092, "step": 11164 }, { "epoch": 2.994100294985251, "grad_norm": 0.21153636109007778, "learning_rate": 1.1781402579347235e-10, "loss": 0.012, "step": 11165 }, { "epoch": 2.994368463395012, "grad_norm": 0.29251776124803863, "learning_rate": 1.0734711467452752e-10, "loss": 0.0138, "step": 11166 }, { "epoch": 2.9946366318047732, "grad_norm": 0.25994329029003777, "learning_rate": 9.736702982821033e-11, "loss": 0.0122, "step": 11167 }, { "epoch": 2.9949048002145346, "grad_norm": 0.27612524144044037, "learning_rate": 8.787377222707616e-11, "loss": 0.0172, "step": 11168 }, { "epoch": 2.995172968624296, "grad_norm": 0.3315308896947658, "learning_rate": 7.886734279538566e-11, "loss": 0.0312, "step": 11169 }, { "epoch": 2.9954411370340575, "grad_norm": 0.2510527447873307, "learning_rate": 7.034774240965992e-11, "loss": 0.0128, "step": 11170 }, { "epoch": 2.995709305443819, "grad_norm": 0.2284363134102265, "learning_rate": 6.231497189979063e-11, "loss": 0.0103, "step": 11171 }, { "epoch": 2.9959774738535803, "grad_norm": 0.3081636017108401, "learning_rate": 5.4769032047374823e-11, "loss": 0.0122, "step": 11172 }, { "epoch": 2.9962456422633412, "grad_norm": 0.25084693586701395, "learning_rate": 4.7709923587380136e-11, "loss": 0.0148, "step": 11173 }, { "epoch": 2.9965138106731026, "grad_norm": 0.21931056938745797, "learning_rate": 4.113764720758972e-11, "loss": 0.0115, "step": 11174 }, { "epoch": 2.996781979082864, "grad_norm": 0.24404822227757195, "learning_rate": 3.505220354749206e-11, "loss": 0.0112, "step": 11175 }, { "epoch": 2.9970501474926254, "grad_norm": 0.17974322241819385, "learning_rate": 2.9453593199946226e-11, "loss": 0.0108, "step": 11176 }, { "epoch": 2.997318315902387, "grad_norm": 0.34380747076878193, "learning_rate": 2.434181670951663e-11, "loss": 0.0164, "step": 11177 }, { "epoch": 2.997586484312148, "grad_norm": 0.23731527116957574, "learning_rate": 1.9716874574138288e-11, "loss": 0.0092, "step": 11178 }, { "epoch": 2.997854652721909, "grad_norm": 0.25534359720347416, "learning_rate": 1.5578767244561753e-11, "loss": 0.0119, "step": 11179 }, { "epoch": 2.9981228211316706, "grad_norm": 0.21976533655679964, "learning_rate": 1.1927495123242872e-11, "loss": 0.0121, "step": 11180 }, { "epoch": 2.998390989541432, "grad_norm": 0.21196187585790638, "learning_rate": 8.76305856600812e-12, "loss": 0.0146, "step": 11181 }, { "epoch": 2.9986591579511934, "grad_norm": 0.1956700201041784, "learning_rate": 6.085457880389278e-12, "loss": 0.0105, "step": 11182 }, { "epoch": 2.998927326360955, "grad_norm": 0.29091444988670095, "learning_rate": 3.8946933278438685e-12, "loss": 0.0146, "step": 11183 }, { "epoch": 2.9991954947707162, "grad_norm": 0.21513102373085805, "learning_rate": 2.19076512153471e-12, "loss": 0.018, "step": 11184 }, { "epoch": 2.999463663180477, "grad_norm": 0.6836501830867728, "learning_rate": 9.736734268850356e-13, "loss": 0.0203, "step": 11185 }, { "epoch": 2.9997318315902386, "grad_norm": 0.2272897283105736, "learning_rate": 2.4341836268870765e-13, "loss": 0.0096, "step": 11186 }, { "epoch": 3.0, "grad_norm": 0.2688336217588715, "learning_rate": 0.0, "loss": 0.0116, "step": 11187 }, { "epoch": 3.0, "eval_loss": 0.02185099571943283, "eval_runtime": 292.629, "eval_samples_per_second": 85.846, "eval_steps_per_second": 1.343, "step": 11187 }, { "epoch": 3.0, "step": 11187, "total_flos": 4.3304182632184545e+18, "train_loss": 0.030125039980372262, "train_runtime": 67465.0545, "train_samples_per_second": 21.222, "train_steps_per_second": 0.166 } ], "logging_steps": 1, "max_steps": 11187, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.3304182632184545e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }