{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0064516129032257, "eval_steps": 32, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0064516129032258064, "grad_norm": 1.8022076952539137, "learning_rate": 2.5e-06, "loss": 1.477, "step": 1 }, { "epoch": 0.0064516129032258064, "eval_loss": 1.3211307525634766, "eval_runtime": 61.2704, "eval_samples_per_second": 1.632, "eval_steps_per_second": 0.065, "step": 1 }, { "epoch": 0.012903225806451613, "grad_norm": 1.830164465281067, "learning_rate": 5e-06, "loss": 1.4299, "step": 2 }, { "epoch": 0.01935483870967742, "grad_norm": 1.7257186746629198, "learning_rate": 7.5e-06, "loss": 1.4482, "step": 3 }, { "epoch": 0.025806451612903226, "grad_norm": 1.7165970873170038, "learning_rate": 1e-05, "loss": 1.3717, "step": 4 }, { "epoch": 0.03225806451612903, "grad_norm": 1.1486983653469711, "learning_rate": 1.25e-05, "loss": 1.4594, "step": 5 }, { "epoch": 0.03870967741935484, "grad_norm": 0.6868389172099673, "learning_rate": 1.5e-05, "loss": 1.3619, "step": 6 }, { "epoch": 0.04516129032258064, "grad_norm": 0.8100802487851951, "learning_rate": 1.75e-05, "loss": 1.1633, "step": 7 }, { "epoch": 0.05161290322580645, "grad_norm": 1.1308634221406137, "learning_rate": 2e-05, "loss": 1.43, "step": 8 }, { "epoch": 0.05806451612903226, "grad_norm": 1.1032001609285251, "learning_rate": 2.25e-05, "loss": 1.4583, "step": 9 }, { "epoch": 0.06451612903225806, "grad_norm": 0.8673535874558637, "learning_rate": 2.5e-05, "loss": 1.3725, "step": 10 }, { "epoch": 0.07096774193548387, "grad_norm": 0.5856780577487628, "learning_rate": 2.7500000000000004e-05, "loss": 1.2705, "step": 11 }, { "epoch": 0.07741935483870968, "grad_norm": 0.5407118925923696, "learning_rate": 3e-05, "loss": 1.1978, "step": 12 }, { "epoch": 0.08387096774193549, "grad_norm": 21.5243092057336, "learning_rate": 3.2500000000000004e-05, "loss": 1.2099, "step": 13 }, { "epoch": 0.09032258064516129, "grad_norm": 0.5975726437730494, "learning_rate": 3.5e-05, "loss": 1.3989, "step": 14 }, { "epoch": 0.0967741935483871, "grad_norm": 0.632641768960225, "learning_rate": 3.7500000000000003e-05, "loss": 1.3262, "step": 15 }, { "epoch": 0.1032258064516129, "grad_norm": 0.5237190218830238, "learning_rate": 4e-05, "loss": 1.2497, "step": 16 }, { "epoch": 0.10967741935483871, "grad_norm": 0.532634703510973, "learning_rate": 4.25e-05, "loss": 1.2812, "step": 17 }, { "epoch": 0.11612903225806452, "grad_norm": 0.5230723189678446, "learning_rate": 4.5e-05, "loss": 1.3475, "step": 18 }, { "epoch": 0.12258064516129032, "grad_norm": 1.1606813044713529, "learning_rate": 4.75e-05, "loss": 1.372, "step": 19 }, { "epoch": 0.12903225806451613, "grad_norm": 0.4510232796621864, "learning_rate": 5e-05, "loss": 1.4073, "step": 20 }, { "epoch": 0.13548387096774195, "grad_norm": 0.5205928609096978, "learning_rate": 4.9993997182511844e-05, "loss": 1.2543, "step": 21 }, { "epoch": 0.14193548387096774, "grad_norm": 0.4663591464063973, "learning_rate": 4.9975991933053384e-05, "loss": 1.3796, "step": 22 }, { "epoch": 0.14838709677419354, "grad_norm": 0.5389896900391323, "learning_rate": 4.994599385893363e-05, "loss": 1.4438, "step": 23 }, { "epoch": 0.15483870967741936, "grad_norm": 0.5670034687541796, "learning_rate": 4.990401896663828e-05, "loss": 1.264, "step": 24 }, { "epoch": 0.16129032258064516, "grad_norm": 0.42454937922862174, "learning_rate": 4.985008965328888e-05, "loss": 1.2944, "step": 25 }, { "epoch": 0.16774193548387098, "grad_norm": 0.5194572848655961, "learning_rate": 4.9784234694692117e-05, "loss": 1.4043, "step": 26 }, { "epoch": 0.17419354838709677, "grad_norm": 0.5484146080851298, "learning_rate": 4.9706489229985524e-05, "loss": 1.4735, "step": 27 }, { "epoch": 0.18064516129032257, "grad_norm": 0.7784686106719038, "learning_rate": 4.961689474288779e-05, "loss": 1.3291, "step": 28 }, { "epoch": 0.1870967741935484, "grad_norm": 0.5710341199937055, "learning_rate": 4.9515499039563704e-05, "loss": 1.3314, "step": 29 }, { "epoch": 0.1935483870967742, "grad_norm": 0.4173913286385054, "learning_rate": 4.940235622311559e-05, "loss": 1.1272, "step": 30 }, { "epoch": 0.2, "grad_norm": 0.371163408997987, "learning_rate": 4.9277526664714765e-05, "loss": 1.2884, "step": 31 }, { "epoch": 0.2064516129032258, "grad_norm": 0.6894186561505441, "learning_rate": 4.914107697138843e-05, "loss": 1.2338, "step": 32 }, { "epoch": 0.2064516129032258, "eval_loss": 1.115579605102539, "eval_runtime": 61.6455, "eval_samples_per_second": 1.622, "eval_steps_per_second": 0.065, "step": 32 }, { "epoch": 0.2129032258064516, "grad_norm": 0.44925759222205974, "learning_rate": 4.8993079950479305e-05, "loss": 1.1968, "step": 33 }, { "epoch": 0.21935483870967742, "grad_norm": 0.4304874702516527, "learning_rate": 4.883361457079673e-05, "loss": 1.1706, "step": 34 }, { "epoch": 0.22580645161290322, "grad_norm": 0.3646005763538535, "learning_rate": 4.8662765920480274e-05, "loss": 1.2714, "step": 35 }, { "epoch": 0.23225806451612904, "grad_norm": 0.3568923781906157, "learning_rate": 4.8480625161598e-05, "loss": 1.175, "step": 36 }, { "epoch": 0.23870967741935484, "grad_norm": 0.37794009122137584, "learning_rate": 4.8287289481503954e-05, "loss": 1.2415, "step": 37 }, { "epoch": 0.24516129032258063, "grad_norm": 0.35188090283951096, "learning_rate": 4.808286204098047e-05, "loss": 1.3385, "step": 38 }, { "epoch": 0.25161290322580643, "grad_norm": 0.3863530195041696, "learning_rate": 4.7867451919193346e-05, "loss": 1.2419, "step": 39 }, { "epoch": 0.25806451612903225, "grad_norm": 0.3746461776891942, "learning_rate": 4.764117405548891e-05, "loss": 1.2624, "step": 40 }, { "epoch": 0.2645161290322581, "grad_norm": 0.34991236825502203, "learning_rate": 4.740414918806425e-05, "loss": 1.307, "step": 41 }, { "epoch": 0.2709677419354839, "grad_norm": 0.4011683523629512, "learning_rate": 4.715650378954331e-05, "loss": 1.357, "step": 42 }, { "epoch": 0.27741935483870966, "grad_norm": 0.3457706023597712, "learning_rate": 4.689836999949314e-05, "loss": 1.3757, "step": 43 }, { "epoch": 0.2838709677419355, "grad_norm": 0.2938536081434768, "learning_rate": 4.662988555391632e-05, "loss": 1.3486, "step": 44 }, { "epoch": 0.2903225806451613, "grad_norm": 0.33377012573731674, "learning_rate": 4.635119371175731e-05, "loss": 1.1417, "step": 45 }, { "epoch": 0.2967741935483871, "grad_norm": 0.5099149644951235, "learning_rate": 4.60624431784618e-05, "loss": 1.2184, "step": 46 }, { "epoch": 0.3032258064516129, "grad_norm": 0.4651969988328567, "learning_rate": 4.576378802662989e-05, "loss": 1.2518, "step": 47 }, { "epoch": 0.3096774193548387, "grad_norm": 0.3453144809110264, "learning_rate": 4.5455387613805396e-05, "loss": 1.4111, "step": 48 }, { "epoch": 0.3161290322580645, "grad_norm": 0.3338451350111794, "learning_rate": 4.513740649744536e-05, "loss": 1.2129, "step": 49 }, { "epoch": 0.3225806451612903, "grad_norm": 0.36687031408807524, "learning_rate": 4.4810014347114784e-05, "loss": 1.2703, "step": 50 }, { "epoch": 0.32903225806451614, "grad_norm": 0.38881877032764534, "learning_rate": 4.4473385853953693e-05, "loss": 1.2997, "step": 51 }, { "epoch": 0.33548387096774196, "grad_norm": 0.34976100346234607, "learning_rate": 4.4127700637464834e-05, "loss": 1.0796, "step": 52 }, { "epoch": 0.3419354838709677, "grad_norm": 0.3696068140674995, "learning_rate": 4.3773143149671576e-05, "loss": 1.3098, "step": 53 }, { "epoch": 0.34838709677419355, "grad_norm": 0.3168263761372888, "learning_rate": 4.340990257669732e-05, "loss": 1.2894, "step": 54 }, { "epoch": 0.3548387096774194, "grad_norm": 0.3442048604805361, "learning_rate": 4.303817273781886e-05, "loss": 1.3385, "step": 55 }, { "epoch": 0.36129032258064514, "grad_norm": 0.38003699951922426, "learning_rate": 4.2658151982047536e-05, "loss": 1.2548, "step": 56 }, { "epoch": 0.36774193548387096, "grad_norm": 0.5065610455915895, "learning_rate": 4.2270043082293463e-05, "loss": 1.2496, "step": 57 }, { "epoch": 0.3741935483870968, "grad_norm": 0.3259412922642011, "learning_rate": 4.1874053127169126e-05, "loss": 1.1696, "step": 58 }, { "epoch": 0.38064516129032255, "grad_norm": 0.33390560015814696, "learning_rate": 4.147039341049036e-05, "loss": 1.3276, "step": 59 }, { "epoch": 0.3870967741935484, "grad_norm": 0.3161062426181486, "learning_rate": 4.105927931853327e-05, "loss": 1.258, "step": 60 }, { "epoch": 0.3935483870967742, "grad_norm": 0.37939625630824414, "learning_rate": 4.0640930215107725e-05, "loss": 1.3119, "step": 61 }, { "epoch": 0.4, "grad_norm": 0.3447410882628537, "learning_rate": 4.021556932450832e-05, "loss": 1.1337, "step": 62 }, { "epoch": 0.4064516129032258, "grad_norm": 0.33262503192755777, "learning_rate": 3.978342361240553e-05, "loss": 1.3684, "step": 63 }, { "epoch": 0.4129032258064516, "grad_norm": 0.4082167906853653, "learning_rate": 3.9344723664740506e-05, "loss": 1.1973, "step": 64 }, { "epoch": 0.4129032258064516, "eval_loss": 1.0707180500030518, "eval_runtime": 62.1751, "eval_samples_per_second": 1.608, "eval_steps_per_second": 0.064, "step": 64 }, { "epoch": 0.41935483870967744, "grad_norm": 0.42898014496026954, "learning_rate": 3.8899703564688187e-05, "loss": 1.3098, "step": 65 }, { "epoch": 0.4258064516129032, "grad_norm": 0.3729718619879595, "learning_rate": 3.8448600767754265e-05, "loss": 1.3267, "step": 66 }, { "epoch": 0.432258064516129, "grad_norm": 0.5652836221912215, "learning_rate": 3.7991655975072834e-05, "loss": 1.3008, "step": 67 }, { "epoch": 0.43870967741935485, "grad_norm": 0.3611571783806379, "learning_rate": 3.752911300497212e-05, "loss": 1.2365, "step": 68 }, { "epoch": 0.44516129032258067, "grad_norm": 0.4101622999668487, "learning_rate": 3.706121866287699e-05, "loss": 1.2805, "step": 69 }, { "epoch": 0.45161290322580644, "grad_norm": 0.4194502800160711, "learning_rate": 3.658822260961763e-05, "loss": 1.2627, "step": 70 }, { "epoch": 0.45806451612903226, "grad_norm": 0.4464572963409143, "learning_rate": 3.611037722821452e-05, "loss": 1.3269, "step": 71 }, { "epoch": 0.4645161290322581, "grad_norm": 0.43900384749780696, "learning_rate": 3.562793748921095e-05, "loss": 1.0625, "step": 72 }, { "epoch": 0.47096774193548385, "grad_norm": 0.3492561062627179, "learning_rate": 3.514116081462488e-05, "loss": 1.2854, "step": 73 }, { "epoch": 0.4774193548387097, "grad_norm": 1.004303081481083, "learning_rate": 3.4650306940592784e-05, "loss": 1.3114, "step": 74 }, { "epoch": 0.4838709677419355, "grad_norm": 0.372149762179685, "learning_rate": 3.415563777877859e-05, "loss": 1.1604, "step": 75 }, { "epoch": 0.49032258064516127, "grad_norm": 0.36620109818968666, "learning_rate": 3.365741727662187e-05, "loss": 1.2055, "step": 76 }, { "epoch": 0.4967741935483871, "grad_norm": 0.3209403988829257, "learning_rate": 3.315591127649981e-05, "loss": 1.2652, "step": 77 }, { "epoch": 0.5032258064516129, "grad_norm": 0.6268869630058581, "learning_rate": 3.265138737387802e-05, "loss": 1.3451, "step": 78 }, { "epoch": 0.5096774193548387, "grad_norm": 0.37710251621094776, "learning_rate": 3.214411477452589e-05, "loss": 1.1998, "step": 79 }, { "epoch": 0.5161290322580645, "grad_norm": 0.3965119239115867, "learning_rate": 3.1634364150872836e-05, "loss": 1.198, "step": 80 }, { "epoch": 0.5225806451612903, "grad_norm": 0.38914331784636286, "learning_rate": 3.112240749758179e-05, "loss": 1.3164, "step": 81 }, { "epoch": 0.5290322580645161, "grad_norm": 0.4854967858248665, "learning_rate": 3.060851798641735e-05, "loss": 1.1669, "step": 82 }, { "epoch": 0.535483870967742, "grad_norm": 0.4486571105935308, "learning_rate": 3.00929698204857e-05, "loss": 1.3611, "step": 83 }, { "epoch": 0.5419354838709678, "grad_norm": 0.5816885351466946, "learning_rate": 2.9576038087924297e-05, "loss": 1.2272, "step": 84 }, { "epoch": 0.5483870967741935, "grad_norm": 0.3242743003758612, "learning_rate": 2.905799861511932e-05, "loss": 1.1925, "step": 85 }, { "epoch": 0.5548387096774193, "grad_norm": 0.3110545851314829, "learning_rate": 2.8539127819529143e-05, "loss": 0.9746, "step": 86 }, { "epoch": 0.5612903225806452, "grad_norm": 0.3102061641971853, "learning_rate": 2.801970256219253e-05, "loss": 1.352, "step": 87 }, { "epoch": 0.567741935483871, "grad_norm": 0.30361763618294724, "learning_rate": 2.7500000000000004e-05, "loss": 1.2039, "step": 88 }, { "epoch": 0.5741935483870968, "grad_norm": 0.5030242942383549, "learning_rate": 2.698029743780748e-05, "loss": 1.2757, "step": 89 }, { "epoch": 0.5806451612903226, "grad_norm": 0.5902079797954521, "learning_rate": 2.6460872180470865e-05, "loss": 1.1542, "step": 90 }, { "epoch": 0.5870967741935483, "grad_norm": 0.4650188539079032, "learning_rate": 2.594200138488069e-05, "loss": 1.1455, "step": 91 }, { "epoch": 0.5935483870967742, "grad_norm": 0.6953375177526994, "learning_rate": 2.5423961912075712e-05, "loss": 1.2476, "step": 92 }, { "epoch": 0.6, "grad_norm": 0.324295911329268, "learning_rate": 2.4907030179514307e-05, "loss": 1.0578, "step": 93 }, { "epoch": 0.6064516129032258, "grad_norm": 0.36056444973850205, "learning_rate": 2.4391482013582657e-05, "loss": 1.3128, "step": 94 }, { "epoch": 0.6129032258064516, "grad_norm": 0.31638336845784404, "learning_rate": 2.387759250241821e-05, "loss": 1.1412, "step": 95 }, { "epoch": 0.6193548387096774, "grad_norm": 0.3807737813278726, "learning_rate": 2.3365635849127166e-05, "loss": 1.301, "step": 96 }, { "epoch": 0.6193548387096774, "eval_loss": 1.0401562452316284, "eval_runtime": 62.5349, "eval_samples_per_second": 1.599, "eval_steps_per_second": 0.064, "step": 96 }, { "epoch": 0.6258064516129033, "grad_norm": 0.36219529568521813, "learning_rate": 2.285588522547411e-05, "loss": 1.2681, "step": 97 }, { "epoch": 0.632258064516129, "grad_norm": 0.4601161674119361, "learning_rate": 2.234861262612199e-05, "loss": 1.2387, "step": 98 }, { "epoch": 0.6387096774193548, "grad_norm": 0.6207212832715766, "learning_rate": 2.184408872350019e-05, "loss": 1.2087, "step": 99 }, { "epoch": 0.6451612903225806, "grad_norm": 0.3655891991096712, "learning_rate": 2.134258272337814e-05, "loss": 1.2769, "step": 100 }, { "epoch": 0.6516129032258065, "grad_norm": 0.4394265602792923, "learning_rate": 2.084436222122142e-05, "loss": 1.0799, "step": 101 }, { "epoch": 0.6580645161290323, "grad_norm": 0.5059663574517834, "learning_rate": 2.0349693059407215e-05, "loss": 1.0953, "step": 102 }, { "epoch": 0.6645161290322581, "grad_norm": 0.34732606007316424, "learning_rate": 1.9858839185375123e-05, "loss": 1.224, "step": 103 }, { "epoch": 0.6709677419354839, "grad_norm": 0.5464551769086812, "learning_rate": 1.9372062510789063e-05, "loss": 1.2413, "step": 104 }, { "epoch": 0.6774193548387096, "grad_norm": 0.977742231459624, "learning_rate": 1.888962277178548e-05, "loss": 1.2118, "step": 105 }, { "epoch": 0.6838709677419355, "grad_norm": 2.537109489591264, "learning_rate": 1.8411777390382367e-05, "loss": 1.2513, "step": 106 }, { "epoch": 0.6903225806451613, "grad_norm": 0.35948844839880034, "learning_rate": 1.7938781337123016e-05, "loss": 1.1404, "step": 107 }, { "epoch": 0.6967741935483871, "grad_norm": 0.457105884170092, "learning_rate": 1.747088699502789e-05, "loss": 1.1514, "step": 108 }, { "epoch": 0.7032258064516129, "grad_norm": 1.1486002566265734, "learning_rate": 1.7008344024927168e-05, "loss": 1.3249, "step": 109 }, { "epoch": 0.7096774193548387, "grad_norm": 0.36043342663778255, "learning_rate": 1.6551399232245737e-05, "loss": 1.1239, "step": 110 }, { "epoch": 0.7161290322580646, "grad_norm": 0.46594876338109426, "learning_rate": 1.610029643531182e-05, "loss": 1.2918, "step": 111 }, { "epoch": 0.7225806451612903, "grad_norm": 0.32990660251070025, "learning_rate": 1.5655276335259493e-05, "loss": 1.2266, "step": 112 }, { "epoch": 0.7290322580645161, "grad_norm": 0.30010478660077256, "learning_rate": 1.5216576387594481e-05, "loss": 1.2114, "step": 113 }, { "epoch": 0.7354838709677419, "grad_norm": 0.49532244626831723, "learning_rate": 1.4784430675491685e-05, "loss": 1.2457, "step": 114 }, { "epoch": 0.7419354838709677, "grad_norm": 0.5191609185311767, "learning_rate": 1.4359069784892282e-05, "loss": 1.2862, "step": 115 }, { "epoch": 0.7483870967741936, "grad_norm": 0.3826327354484767, "learning_rate": 1.3940720681466734e-05, "loss": 1.1351, "step": 116 }, { "epoch": 0.7548387096774194, "grad_norm": 0.330074625162551, "learning_rate": 1.3529606589509647e-05, "loss": 1.1871, "step": 117 }, { "epoch": 0.7612903225806451, "grad_norm": 0.34233269430078184, "learning_rate": 1.3125946872830877e-05, "loss": 1.1411, "step": 118 }, { "epoch": 0.7677419354838709, "grad_norm": 0.31326296304705775, "learning_rate": 1.2729956917706545e-05, "loss": 1.2387, "step": 119 }, { "epoch": 0.7741935483870968, "grad_norm": 0.3176809107580838, "learning_rate": 1.2341848017952464e-05, "loss": 1.2451, "step": 120 }, { "epoch": 0.7806451612903226, "grad_norm": 0.31420402228609556, "learning_rate": 1.1961827262181141e-05, "loss": 1.1766, "step": 121 }, { "epoch": 0.7870967741935484, "grad_norm": 0.4637761844099348, "learning_rate": 1.1590097423302684e-05, "loss": 1.1542, "step": 122 }, { "epoch": 0.7935483870967742, "grad_norm": 0.36159367839677437, "learning_rate": 1.1226856850328434e-05, "loss": 1.3127, "step": 123 }, { "epoch": 0.8, "grad_norm": 0.5010806704980222, "learning_rate": 1.0872299362535173e-05, "loss": 1.2729, "step": 124 }, { "epoch": 0.8064516129032258, "grad_norm": 0.3461696613483525, "learning_rate": 1.0526614146046312e-05, "loss": 1.2425, "step": 125 }, { "epoch": 0.8129032258064516, "grad_norm": 0.35751217338851793, "learning_rate": 1.0189985652885225e-05, "loss": 1.2222, "step": 126 }, { "epoch": 0.8193548387096774, "grad_norm": 0.43059544412165696, "learning_rate": 9.862593502554648e-06, "loss": 1.1938, "step": 127 }, { "epoch": 0.8258064516129032, "grad_norm": 0.7260092938036656, "learning_rate": 9.544612386194612e-06, "loss": 1.1063, "step": 128 }, { "epoch": 0.8258064516129032, "eval_loss": 1.0231536626815796, "eval_runtime": 62.2556, "eval_samples_per_second": 1.606, "eval_steps_per_second": 0.064, "step": 128 }, { "epoch": 0.832258064516129, "grad_norm": 0.2930692349121967, "learning_rate": 9.236211973370124e-06, "loss": 1.2804, "step": 129 }, { "epoch": 0.8387096774193549, "grad_norm": 0.3514011035647982, "learning_rate": 8.937556821538201e-06, "loss": 1.3527, "step": 130 }, { "epoch": 0.8451612903225807, "grad_norm": 0.3509271601664881, "learning_rate": 8.64880628824269e-06, "loss": 1.2336, "step": 131 }, { "epoch": 0.8516129032258064, "grad_norm": 0.369286535470622, "learning_rate": 8.370114446083686e-06, "loss": 1.2204, "step": 132 }, { "epoch": 0.8580645161290322, "grad_norm": 0.3376899684032205, "learning_rate": 8.101630000506864e-06, "loss": 1.114, "step": 133 }, { "epoch": 0.864516129032258, "grad_norm": 0.34528372468606205, "learning_rate": 7.843496210456687e-06, "loss": 1.2915, "step": 134 }, { "epoch": 0.8709677419354839, "grad_norm": 0.3271748537414322, "learning_rate": 7.595850811935759e-06, "loss": 1.2242, "step": 135 }, { "epoch": 0.8774193548387097, "grad_norm": 0.34552044795509895, "learning_rate": 7.358825944511101e-06, "loss": 1.2238, "step": 136 }, { "epoch": 0.8838709677419355, "grad_norm": 0.3645405834936748, "learning_rate": 7.132548080806653e-06, "loss": 1.1925, "step": 137 }, { "epoch": 0.8903225806451613, "grad_norm": 0.39117823625181364, "learning_rate": 6.917137959019528e-06, "loss": 1.1295, "step": 138 }, { "epoch": 0.896774193548387, "grad_norm": 0.3256830351093455, "learning_rate": 6.712710518496049e-06, "loss": 1.2506, "step": 139 }, { "epoch": 0.9032258064516129, "grad_norm": 0.4262467981624931, "learning_rate": 6.519374838401997e-06, "loss": 1.1759, "step": 140 }, { "epoch": 0.9096774193548387, "grad_norm": 0.35503437951993716, "learning_rate": 6.337234079519728e-06, "loss": 1.1777, "step": 141 }, { "epoch": 0.9161290322580645, "grad_norm": 0.3897540509188695, "learning_rate": 6.166385429203269e-06, "loss": 1.1239, "step": 142 }, { "epoch": 0.9225806451612903, "grad_norm": 0.36016445939620884, "learning_rate": 6.006920049520701e-06, "loss": 1.2692, "step": 143 }, { "epoch": 0.9290322580645162, "grad_norm": 0.4413576798023392, "learning_rate": 5.858923028611572e-06, "loss": 1.1879, "step": 144 }, { "epoch": 0.9354838709677419, "grad_norm": 0.37955599088497055, "learning_rate": 5.722473335285244e-06, "loss": 1.205, "step": 145 }, { "epoch": 0.9419354838709677, "grad_norm": 0.35919500181972724, "learning_rate": 5.597643776884412e-06, "loss": 1.1617, "step": 146 }, { "epoch": 0.9483870967741935, "grad_norm": 0.3022686971058462, "learning_rate": 5.4845009604363e-06, "loss": 1.2059, "step": 147 }, { "epoch": 0.9548387096774194, "grad_norm": 0.30291369490101205, "learning_rate": 5.38310525711221e-06, "loss": 1.2672, "step": 148 }, { "epoch": 0.9612903225806452, "grad_norm": 0.33599320279905975, "learning_rate": 5.293510770014475e-06, "loss": 1.2755, "step": 149 }, { "epoch": 0.967741935483871, "grad_norm": 0.2903929279243622, "learning_rate": 5.215765305307886e-06, "loss": 1.1675, "step": 150 }, { "epoch": 0.9741935483870968, "grad_norm": 0.3305110382050327, "learning_rate": 5.149910346711126e-06, "loss": 1.2342, "step": 151 }, { "epoch": 0.9806451612903225, "grad_norm": 0.33304378208594904, "learning_rate": 5.095981033361725e-06, "loss": 1.1312, "step": 152 }, { "epoch": 0.9870967741935484, "grad_norm": 0.3479102720763047, "learning_rate": 5.05400614106637e-06, "loss": 1.1753, "step": 153 }, { "epoch": 0.9935483870967742, "grad_norm": 0.31384042987234395, "learning_rate": 5.024008066946621e-06, "loss": 1.2077, "step": 154 }, { "epoch": 1.0, "grad_norm": 0.5248637716000059, "learning_rate": 5.006002817488162e-06, "loss": 1.1639, "step": 155 }, { "epoch": 1.0064516129032257, "grad_norm": 0.359683648131272, "learning_rate": 5e-06, "loss": 1.2093, "step": 156 } ], "logging_steps": 1, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 32, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 319408664739840.0, "train_batch_size": 5, "trial_name": null, "trial_params": null }