{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.057189878625797, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016457519029006377, "grad_norm": 1.7265625, "learning_rate": 4e-05, "loss": 3.4409, "step": 20 }, { "epoch": 0.032915038058012755, "grad_norm": 1.125, "learning_rate": 8e-05, "loss": 2.834, "step": 40 }, { "epoch": 0.04937255708701913, "grad_norm": 2.203125, "learning_rate": 0.00012, "loss": 2.0422, "step": 60 }, { "epoch": 0.06583007611602551, "grad_norm": 1.734375, "learning_rate": 0.00016, "loss": 1.6617, "step": 80 }, { "epoch": 0.08228759514503188, "grad_norm": 1.4765625, "learning_rate": 0.0002, "loss": 1.5035, "step": 100 }, { "epoch": 0.09874511417403826, "grad_norm": 1.6328125, "learning_rate": 0.0001988716502115656, "loss": 1.4449, "step": 120 }, { "epoch": 0.11520263320304464, "grad_norm": 2.015625, "learning_rate": 0.00019774330042313118, "loss": 1.3653, "step": 140 }, { "epoch": 0.13166015223205102, "grad_norm": 1.640625, "learning_rate": 0.00019661495063469676, "loss": 1.3634, "step": 160 }, { "epoch": 0.1481176712610574, "grad_norm": 1.2734375, "learning_rate": 0.00019548660084626237, "loss": 1.3116, "step": 180 }, { "epoch": 0.16457519029006376, "grad_norm": 1.1796875, "learning_rate": 0.00019435825105782795, "loss": 1.2961, "step": 200 }, { "epoch": 0.18103270931907015, "grad_norm": 1.359375, "learning_rate": 0.00019322990126939354, "loss": 1.2868, "step": 220 }, { "epoch": 0.19749022834807653, "grad_norm": 1.2421875, "learning_rate": 0.0001921015514809591, "loss": 1.2902, "step": 240 }, { "epoch": 0.21394774737708291, "grad_norm": 1.5859375, "learning_rate": 0.00019097320169252468, "loss": 1.209, "step": 260 }, { "epoch": 0.23040526640608927, "grad_norm": 1.5859375, "learning_rate": 0.00018984485190409026, "loss": 1.2912, "step": 280 }, { "epoch": 0.24686278543509566, "grad_norm": 1.4296875, "learning_rate": 0.00018871650211565587, "loss": 1.2733, "step": 300 }, { "epoch": 0.26332030446410204, "grad_norm": 1.4140625, "learning_rate": 0.00018758815232722145, "loss": 1.1895, "step": 320 }, { "epoch": 0.2797778234931084, "grad_norm": 1.25, "learning_rate": 0.00018645980253878704, "loss": 1.2259, "step": 340 }, { "epoch": 0.2962353425221148, "grad_norm": 1.4921875, "learning_rate": 0.00018533145275035262, "loss": 1.2636, "step": 360 }, { "epoch": 0.3126928615511212, "grad_norm": 1.796875, "learning_rate": 0.0001842031029619182, "loss": 1.2748, "step": 380 }, { "epoch": 0.3291503805801275, "grad_norm": 1.4296875, "learning_rate": 0.0001830747531734838, "loss": 1.1335, "step": 400 }, { "epoch": 0.3456078996091339, "grad_norm": 1.6640625, "learning_rate": 0.00018194640338504937, "loss": 1.1985, "step": 420 }, { "epoch": 0.3620654186381403, "grad_norm": 1.5078125, "learning_rate": 0.00018081805359661496, "loss": 1.2015, "step": 440 }, { "epoch": 0.3785229376671467, "grad_norm": 1.53125, "learning_rate": 0.00017968970380818057, "loss": 1.1548, "step": 460 }, { "epoch": 0.39498045669615306, "grad_norm": 1.359375, "learning_rate": 0.00017856135401974612, "loss": 1.1249, "step": 480 }, { "epoch": 0.41143797572515944, "grad_norm": 1.7109375, "learning_rate": 0.0001774330042313117, "loss": 1.1734, "step": 500 }, { "epoch": 0.42789549475416583, "grad_norm": 1.71875, "learning_rate": 0.0001763046544428773, "loss": 1.1772, "step": 520 }, { "epoch": 0.4443530137831722, "grad_norm": 1.2578125, "learning_rate": 0.00017517630465444287, "loss": 1.1092, "step": 540 }, { "epoch": 0.46081053281217854, "grad_norm": 1.234375, "learning_rate": 0.00017404795486600846, "loss": 1.1306, "step": 560 }, { "epoch": 0.4772680518411849, "grad_norm": 1.5234375, "learning_rate": 0.00017291960507757407, "loss": 1.1457, "step": 580 }, { "epoch": 0.4937255708701913, "grad_norm": 1.4375, "learning_rate": 0.00017179125528913965, "loss": 1.1501, "step": 600 }, { "epoch": 0.5101830898991977, "grad_norm": 1.1796875, "learning_rate": 0.00017066290550070523, "loss": 1.1539, "step": 620 }, { "epoch": 0.5266406089282041, "grad_norm": 1.3984375, "learning_rate": 0.00016953455571227082, "loss": 1.1242, "step": 640 }, { "epoch": 0.5430981279572105, "grad_norm": 1.6015625, "learning_rate": 0.0001684062059238364, "loss": 1.1551, "step": 660 }, { "epoch": 0.5595556469862168, "grad_norm": 1.0625, "learning_rate": 0.00016727785613540198, "loss": 1.1252, "step": 680 }, { "epoch": 0.5760131660152232, "grad_norm": 1.0078125, "learning_rate": 0.00016614950634696757, "loss": 1.1023, "step": 700 }, { "epoch": 0.5924706850442296, "grad_norm": 1.28125, "learning_rate": 0.00016502115655853315, "loss": 1.1136, "step": 720 }, { "epoch": 0.608928204073236, "grad_norm": 1.34375, "learning_rate": 0.00016389280677009873, "loss": 1.1363, "step": 740 }, { "epoch": 0.6253857231022424, "grad_norm": 1.3671875, "learning_rate": 0.00016276445698166432, "loss": 1.0763, "step": 760 }, { "epoch": 0.6418432421312487, "grad_norm": 1.34375, "learning_rate": 0.0001616361071932299, "loss": 1.1558, "step": 780 }, { "epoch": 0.658300761160255, "grad_norm": 1.140625, "learning_rate": 0.00016050775740479548, "loss": 1.0374, "step": 800 }, { "epoch": 0.6747582801892614, "grad_norm": 1.40625, "learning_rate": 0.00015937940761636107, "loss": 1.1492, "step": 820 }, { "epoch": 0.6912157992182678, "grad_norm": 1.5234375, "learning_rate": 0.00015825105782792665, "loss": 1.1261, "step": 840 }, { "epoch": 0.7076733182472742, "grad_norm": 1.46875, "learning_rate": 0.00015712270803949226, "loss": 1.1236, "step": 860 }, { "epoch": 0.7241308372762806, "grad_norm": 1.5, "learning_rate": 0.00015599435825105785, "loss": 1.0795, "step": 880 }, { "epoch": 0.740588356305287, "grad_norm": 1.28125, "learning_rate": 0.00015486600846262343, "loss": 1.0866, "step": 900 }, { "epoch": 0.7570458753342934, "grad_norm": 1.640625, "learning_rate": 0.000153737658674189, "loss": 1.0635, "step": 920 }, { "epoch": 0.7735033943632997, "grad_norm": 1.453125, "learning_rate": 0.0001526093088857546, "loss": 1.1106, "step": 940 }, { "epoch": 0.7899609133923061, "grad_norm": 1.0390625, "learning_rate": 0.00015148095909732018, "loss": 1.1241, "step": 960 }, { "epoch": 0.8064184324213125, "grad_norm": 1.296875, "learning_rate": 0.00015035260930888576, "loss": 1.078, "step": 980 }, { "epoch": 0.8228759514503189, "grad_norm": 1.109375, "learning_rate": 0.00014922425952045135, "loss": 1.1545, "step": 1000 }, { "epoch": 0.8393334704793253, "grad_norm": 1.5703125, "learning_rate": 0.00014809590973201693, "loss": 1.1102, "step": 1020 }, { "epoch": 0.8557909895083317, "grad_norm": 1.2265625, "learning_rate": 0.0001469675599435825, "loss": 1.0258, "step": 1040 }, { "epoch": 0.872248508537338, "grad_norm": 1.4609375, "learning_rate": 0.0001458392101551481, "loss": 1.1205, "step": 1060 }, { "epoch": 0.8887060275663444, "grad_norm": 1.40625, "learning_rate": 0.00014471086036671368, "loss": 1.0562, "step": 1080 }, { "epoch": 0.9051635465953507, "grad_norm": 1.203125, "learning_rate": 0.00014358251057827926, "loss": 1.1234, "step": 1100 }, { "epoch": 0.9216210656243571, "grad_norm": 1.3046875, "learning_rate": 0.00014245416078984485, "loss": 1.0511, "step": 1120 }, { "epoch": 0.9380785846533635, "grad_norm": 1.0546875, "learning_rate": 0.00014132581100141046, "loss": 1.0451, "step": 1140 }, { "epoch": 0.9545361036823699, "grad_norm": 1.3046875, "learning_rate": 0.00014019746121297604, "loss": 1.0539, "step": 1160 }, { "epoch": 0.9709936227113762, "grad_norm": 1.1328125, "learning_rate": 0.00013906911142454162, "loss": 1.0957, "step": 1180 }, { "epoch": 0.9874511417403826, "grad_norm": 1.7109375, "learning_rate": 0.0001379407616361072, "loss": 1.1191, "step": 1200 }, { "epoch": 1.003908660769389, "grad_norm": 1.5625, "learning_rate": 0.0001368124118476728, "loss": 1.0942, "step": 1220 }, { "epoch": 1.0203661797983954, "grad_norm": 1.359375, "learning_rate": 0.00013568406205923835, "loss": 0.9808, "step": 1240 }, { "epoch": 1.0368236988274018, "grad_norm": 1.359375, "learning_rate": 0.00013455571227080396, "loss": 0.9606, "step": 1260 }, { "epoch": 1.0532812178564082, "grad_norm": 1.40625, "learning_rate": 0.00013342736248236954, "loss": 0.9658, "step": 1280 }, { "epoch": 1.0697387368854145, "grad_norm": 1.3828125, "learning_rate": 0.00013229901269393512, "loss": 0.8955, "step": 1300 }, { "epoch": 1.086196255914421, "grad_norm": 1.34375, "learning_rate": 0.0001311706629055007, "loss": 0.9449, "step": 1320 }, { "epoch": 1.1026537749434273, "grad_norm": 1.4296875, "learning_rate": 0.0001300423131170663, "loss": 0.936, "step": 1340 }, { "epoch": 1.1191112939724337, "grad_norm": 1.6015625, "learning_rate": 0.00012891396332863187, "loss": 0.8886, "step": 1360 }, { "epoch": 1.13556881300144, "grad_norm": 1.8125, "learning_rate": 0.00012778561354019746, "loss": 0.9193, "step": 1380 }, { "epoch": 1.1520263320304465, "grad_norm": 2.09375, "learning_rate": 0.00012665726375176307, "loss": 0.8895, "step": 1400 }, { "epoch": 1.1684838510594528, "grad_norm": 1.5703125, "learning_rate": 0.00012552891396332865, "loss": 0.9278, "step": 1420 }, { "epoch": 1.1849413700884592, "grad_norm": 1.5546875, "learning_rate": 0.00012440056417489424, "loss": 0.9096, "step": 1440 }, { "epoch": 1.2013988891174656, "grad_norm": 1.703125, "learning_rate": 0.00012327221438645982, "loss": 0.9084, "step": 1460 }, { "epoch": 1.217856408146472, "grad_norm": 1.1640625, "learning_rate": 0.0001221438645980254, "loss": 0.971, "step": 1480 }, { "epoch": 1.2343139271754784, "grad_norm": 1.4375, "learning_rate": 0.00012101551480959097, "loss": 0.9663, "step": 1500 }, { "epoch": 1.2507714462044848, "grad_norm": 1.2578125, "learning_rate": 0.00011988716502115656, "loss": 0.8331, "step": 1520 }, { "epoch": 1.2672289652334912, "grad_norm": 1.65625, "learning_rate": 0.00011875881523272214, "loss": 0.9695, "step": 1540 }, { "epoch": 1.2836864842624975, "grad_norm": 1.609375, "learning_rate": 0.00011763046544428774, "loss": 0.9584, "step": 1560 }, { "epoch": 1.300144003291504, "grad_norm": 1.546875, "learning_rate": 0.00011650211565585332, "loss": 0.975, "step": 1580 }, { "epoch": 1.3166015223205103, "grad_norm": 1.5234375, "learning_rate": 0.0001153737658674189, "loss": 0.9561, "step": 1600 }, { "epoch": 1.3330590413495167, "grad_norm": 1.609375, "learning_rate": 0.00011424541607898449, "loss": 0.9068, "step": 1620 }, { "epoch": 1.3495165603785229, "grad_norm": 1.375, "learning_rate": 0.00011311706629055008, "loss": 0.9088, "step": 1640 }, { "epoch": 1.3659740794075292, "grad_norm": 1.7734375, "learning_rate": 0.00011198871650211567, "loss": 0.8766, "step": 1660 }, { "epoch": 1.3824315984365356, "grad_norm": 1.5234375, "learning_rate": 0.00011086036671368125, "loss": 0.9034, "step": 1680 }, { "epoch": 1.398889117465542, "grad_norm": 1.921875, "learning_rate": 0.00010973201692524683, "loss": 0.9532, "step": 1700 }, { "epoch": 1.4153466364945484, "grad_norm": 1.765625, "learning_rate": 0.00010860366713681243, "loss": 0.8913, "step": 1720 }, { "epoch": 1.4318041555235548, "grad_norm": 1.578125, "learning_rate": 0.00010747531734837801, "loss": 0.886, "step": 1740 }, { "epoch": 1.4482616745525612, "grad_norm": 2.015625, "learning_rate": 0.00010634696755994358, "loss": 0.9405, "step": 1760 }, { "epoch": 1.4647191935815675, "grad_norm": 1.5859375, "learning_rate": 0.00010521861777150917, "loss": 0.9094, "step": 1780 }, { "epoch": 1.481176712610574, "grad_norm": 1.8984375, "learning_rate": 0.00010409026798307475, "loss": 0.9985, "step": 1800 }, { "epoch": 1.4976342316395803, "grad_norm": 1.5546875, "learning_rate": 0.00010296191819464033, "loss": 0.9577, "step": 1820 }, { "epoch": 1.5140917506685867, "grad_norm": 1.671875, "learning_rate": 0.00010183356840620593, "loss": 0.9125, "step": 1840 }, { "epoch": 1.530549269697593, "grad_norm": 1.390625, "learning_rate": 0.00010070521861777152, "loss": 0.9305, "step": 1860 }, { "epoch": 1.5470067887265995, "grad_norm": 1.1953125, "learning_rate": 9.95768688293371e-05, "loss": 0.9372, "step": 1880 }, { "epoch": 1.5634643077556059, "grad_norm": 1.15625, "learning_rate": 9.844851904090268e-05, "loss": 0.9658, "step": 1900 }, { "epoch": 1.5799218267846122, "grad_norm": 1.78125, "learning_rate": 9.732016925246828e-05, "loss": 0.8637, "step": 1920 }, { "epoch": 1.5963793458136186, "grad_norm": 1.515625, "learning_rate": 9.619181946403385e-05, "loss": 0.8523, "step": 1940 }, { "epoch": 1.612836864842625, "grad_norm": 1.28125, "learning_rate": 9.506346967559943e-05, "loss": 0.8689, "step": 1960 }, { "epoch": 1.6292943838716314, "grad_norm": 1.09375, "learning_rate": 9.393511988716503e-05, "loss": 0.8898, "step": 1980 }, { "epoch": 1.6457519029006378, "grad_norm": 1.7421875, "learning_rate": 9.280677009873061e-05, "loss": 0.9299, "step": 2000 }, { "epoch": 1.6622094219296442, "grad_norm": 1.84375, "learning_rate": 9.16784203102962e-05, "loss": 0.9165, "step": 2020 }, { "epoch": 1.6786669409586503, "grad_norm": 1.6875, "learning_rate": 9.055007052186178e-05, "loss": 0.897, "step": 2040 }, { "epoch": 1.6951244599876567, "grad_norm": 1.734375, "learning_rate": 8.942172073342738e-05, "loss": 0.9132, "step": 2060 }, { "epoch": 1.711581979016663, "grad_norm": 1.6875, "learning_rate": 8.829337094499295e-05, "loss": 0.8897, "step": 2080 }, { "epoch": 1.7280394980456695, "grad_norm": 1.6640625, "learning_rate": 8.716502115655853e-05, "loss": 0.9079, "step": 2100 }, { "epoch": 1.7444970170746759, "grad_norm": 1.234375, "learning_rate": 8.603667136812413e-05, "loss": 0.8468, "step": 2120 }, { "epoch": 1.7609545361036822, "grad_norm": 1.46875, "learning_rate": 8.490832157968971e-05, "loss": 0.862, "step": 2140 }, { "epoch": 1.7774120551326886, "grad_norm": 1.671875, "learning_rate": 8.37799717912553e-05, "loss": 0.901, "step": 2160 }, { "epoch": 1.793869574161695, "grad_norm": 1.5546875, "learning_rate": 8.265162200282088e-05, "loss": 0.864, "step": 2180 }, { "epoch": 1.8103270931907014, "grad_norm": 1.953125, "learning_rate": 8.152327221438646e-05, "loss": 0.8773, "step": 2200 }, { "epoch": 1.8267846122197078, "grad_norm": 1.421875, "learning_rate": 8.039492242595204e-05, "loss": 0.8714, "step": 2220 }, { "epoch": 1.8432421312487142, "grad_norm": 1.9375, "learning_rate": 7.926657263751763e-05, "loss": 0.9346, "step": 2240 }, { "epoch": 1.8596996502777206, "grad_norm": 1.5546875, "learning_rate": 7.813822284908322e-05, "loss": 0.877, "step": 2260 }, { "epoch": 1.876157169306727, "grad_norm": 1.4140625, "learning_rate": 7.700987306064881e-05, "loss": 0.9007, "step": 2280 }, { "epoch": 1.8926146883357333, "grad_norm": 1.734375, "learning_rate": 7.588152327221439e-05, "loss": 0.9481, "step": 2300 }, { "epoch": 1.9090722073647397, "grad_norm": 1.7109375, "learning_rate": 7.475317348377997e-05, "loss": 0.8854, "step": 2320 }, { "epoch": 1.925529726393746, "grad_norm": 1.6484375, "learning_rate": 7.362482369534556e-05, "loss": 0.9206, "step": 2340 }, { "epoch": 1.9419872454227525, "grad_norm": 1.8046875, "learning_rate": 7.249647390691114e-05, "loss": 0.8936, "step": 2360 }, { "epoch": 1.9584447644517589, "grad_norm": 1.6484375, "learning_rate": 7.136812411847673e-05, "loss": 0.8942, "step": 2380 }, { "epoch": 1.9749022834807652, "grad_norm": 1.390625, "learning_rate": 7.023977433004232e-05, "loss": 0.881, "step": 2400 }, { "epoch": 1.9913598025097716, "grad_norm": 1.5625, "learning_rate": 6.91114245416079e-05, "loss": 0.9034, "step": 2420 }, { "epoch": 2.007817321538778, "grad_norm": 1.515625, "learning_rate": 6.798307475317349e-05, "loss": 0.8102, "step": 2440 }, { "epoch": 2.0242748405677844, "grad_norm": 1.5078125, "learning_rate": 6.685472496473907e-05, "loss": 0.7575, "step": 2460 }, { "epoch": 2.040732359596791, "grad_norm": 1.671875, "learning_rate": 6.572637517630466e-05, "loss": 0.7636, "step": 2480 }, { "epoch": 2.057189878625797, "grad_norm": 1.5859375, "learning_rate": 6.459802538787024e-05, "loss": 0.8552, "step": 2500 } ], "logging_steps": 20, "max_steps": 3645, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.33909657102336e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }