{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9676393598144797, "eval_steps": 500, "global_step": 28000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007027283427908856, "grad_norm": 0.45293834805488586, "learning_rate": 0.0001995548787630315, "loss": 0.8897, "step": 100 }, { "epoch": 0.014054566855817713, "grad_norm": 0.5474397540092468, "learning_rate": 0.00019908633009253836, "loss": 0.7035, "step": 200 }, { "epoch": 0.02108185028372657, "grad_norm": 0.38776981830596924, "learning_rate": 0.00019861778142204523, "loss": 0.6916, "step": 300 }, { "epoch": 0.028109133711635426, "grad_norm": 0.39987438917160034, "learning_rate": 0.00019814923275155208, "loss": 0.6724, "step": 400 }, { "epoch": 0.03513641713954428, "grad_norm": 0.4090868830680847, "learning_rate": 0.00019768068408105893, "loss": 0.6842, "step": 500 }, { "epoch": 0.03513641713954428, "eval_loss": 0.6561594605445862, "eval_runtime": 192.4196, "eval_samples_per_second": 65.737, "eval_steps_per_second": 8.222, "step": 500 }, { "epoch": 0.04216370056745314, "grad_norm": 0.46690303087234497, "learning_rate": 0.00019721213541056578, "loss": 0.6738, "step": 600 }, { "epoch": 0.049190983995361995, "grad_norm": 0.48969191312789917, "learning_rate": 0.00019674358674007263, "loss": 0.662, "step": 700 }, { "epoch": 0.05621826742327085, "grad_norm": 0.3667367100715637, "learning_rate": 0.00019627503806957949, "loss": 0.6646, "step": 800 }, { "epoch": 0.06324555085117971, "grad_norm": 0.40032121539115906, "learning_rate": 0.00019580648939908634, "loss": 0.6564, "step": 900 }, { "epoch": 0.07027283427908856, "grad_norm": 0.3847985565662384, "learning_rate": 0.00019533794072859319, "loss": 0.6511, "step": 1000 }, { "epoch": 0.07027283427908856, "eval_loss": 0.6426891684532166, "eval_runtime": 192.091, "eval_samples_per_second": 65.849, "eval_steps_per_second": 8.236, "step": 1000 }, { "epoch": 0.07730011770699742, "grad_norm": 0.40991073846817017, "learning_rate": 0.00019486939205810004, "loss": 0.6495, "step": 1100 }, { "epoch": 0.08432740113490628, "grad_norm": 0.44681239128112793, "learning_rate": 0.00019440084338760689, "loss": 0.6516, "step": 1200 }, { "epoch": 0.09135468456281513, "grad_norm": 0.4842969477176666, "learning_rate": 0.00019393229471711376, "loss": 0.6614, "step": 1300 }, { "epoch": 0.09838196799072399, "grad_norm": 0.4123075008392334, "learning_rate": 0.00019346374604662061, "loss": 0.636, "step": 1400 }, { "epoch": 0.10540925141863285, "grad_norm": 0.45350509881973267, "learning_rate": 0.00019299519737612746, "loss": 0.6429, "step": 1500 }, { "epoch": 0.10540925141863285, "eval_loss": 0.6345575451850891, "eval_runtime": 192.5238, "eval_samples_per_second": 65.701, "eval_steps_per_second": 8.217, "step": 1500 }, { "epoch": 0.1124365348465417, "grad_norm": 0.36109960079193115, "learning_rate": 0.00019252664870563431, "loss": 0.6542, "step": 1600 }, { "epoch": 0.11946381827445056, "grad_norm": 0.3524057865142822, "learning_rate": 0.00019205810003514116, "loss": 0.6287, "step": 1700 }, { "epoch": 0.12649110170235942, "grad_norm": 0.42591190338134766, "learning_rate": 0.00019158955136464801, "loss": 0.645, "step": 1800 }, { "epoch": 0.13351838513026826, "grad_norm": 0.43938347697257996, "learning_rate": 0.00019112100269415486, "loss": 0.6444, "step": 1900 }, { "epoch": 0.14054566855817713, "grad_norm": 0.44198718667030334, "learning_rate": 0.00019065245402366172, "loss": 0.643, "step": 2000 }, { "epoch": 0.14054566855817713, "eval_loss": 0.6290879249572754, "eval_runtime": 192.8097, "eval_samples_per_second": 65.604, "eval_steps_per_second": 8.205, "step": 2000 }, { "epoch": 0.14757295198608597, "grad_norm": 0.39245936274528503, "learning_rate": 0.00019018390535316857, "loss": 0.65, "step": 2100 }, { "epoch": 0.15460023541399484, "grad_norm": 0.46238473057746887, "learning_rate": 0.00018971535668267542, "loss": 0.6557, "step": 2200 }, { "epoch": 0.16162751884190368, "grad_norm": 0.5030249357223511, "learning_rate": 0.00018924680801218227, "loss": 0.6384, "step": 2300 }, { "epoch": 0.16865480226981255, "grad_norm": 0.4010857343673706, "learning_rate": 0.00018877825934168914, "loss": 0.6424, "step": 2400 }, { "epoch": 0.1756820856977214, "grad_norm": 0.43222537636756897, "learning_rate": 0.000188309710671196, "loss": 0.6217, "step": 2500 }, { "epoch": 0.1756820856977214, "eval_loss": 0.6259664297103882, "eval_runtime": 196.2284, "eval_samples_per_second": 64.461, "eval_steps_per_second": 8.062, "step": 2500 }, { "epoch": 0.18270936912563027, "grad_norm": 0.5106024742126465, "learning_rate": 0.00018784116200070284, "loss": 0.6367, "step": 2600 }, { "epoch": 0.1897366525535391, "grad_norm": 0.463365763425827, "learning_rate": 0.00018737261333020967, "loss": 0.6392, "step": 2700 }, { "epoch": 0.19676393598144798, "grad_norm": 0.46747493743896484, "learning_rate": 0.00018690406465971652, "loss": 0.6221, "step": 2800 }, { "epoch": 0.20379121940935682, "grad_norm": 0.5577239394187927, "learning_rate": 0.0001864355159892234, "loss": 0.6313, "step": 2900 }, { "epoch": 0.2108185028372657, "grad_norm": 0.3764529228210449, "learning_rate": 0.00018596696731873024, "loss": 0.6309, "step": 3000 }, { "epoch": 0.2108185028372657, "eval_loss": 0.61882084608078, "eval_runtime": 191.698, "eval_samples_per_second": 65.984, "eval_steps_per_second": 8.253, "step": 3000 }, { "epoch": 0.21784578626517453, "grad_norm": 0.45881420373916626, "learning_rate": 0.0001854984186482371, "loss": 0.6177, "step": 3100 }, { "epoch": 0.2248730696930834, "grad_norm": 0.366277277469635, "learning_rate": 0.00018502986997774394, "loss": 0.6217, "step": 3200 }, { "epoch": 0.23190035312099225, "grad_norm": 0.5218221545219421, "learning_rate": 0.0001845613213072508, "loss": 0.6085, "step": 3300 }, { "epoch": 0.23892763654890112, "grad_norm": 0.42059990763664246, "learning_rate": 0.00018409277263675767, "loss": 0.624, "step": 3400 }, { "epoch": 0.24595491997680996, "grad_norm": 0.4570040702819824, "learning_rate": 0.0001836242239662645, "loss": 0.6151, "step": 3500 }, { "epoch": 0.24595491997680996, "eval_loss": 0.6193254590034485, "eval_runtime": 191.8558, "eval_samples_per_second": 65.93, "eval_steps_per_second": 8.246, "step": 3500 }, { "epoch": 0.25298220340471883, "grad_norm": 0.42353853583335876, "learning_rate": 0.00018315567529577135, "loss": 0.6245, "step": 3600 }, { "epoch": 0.2600094868326277, "grad_norm": 0.4249242842197418, "learning_rate": 0.0001826871266252782, "loss": 0.6323, "step": 3700 }, { "epoch": 0.2670367702605365, "grad_norm": 0.48913854360580444, "learning_rate": 0.00018221857795478505, "loss": 0.6393, "step": 3800 }, { "epoch": 0.2740640536884454, "grad_norm": 0.40342265367507935, "learning_rate": 0.00018175002928429192, "loss": 0.6241, "step": 3900 }, { "epoch": 0.28109133711635426, "grad_norm": 0.46278470754623413, "learning_rate": 0.00018128148061379877, "loss": 0.6235, "step": 4000 }, { "epoch": 0.28109133711635426, "eval_loss": 0.6143574714660645, "eval_runtime": 192.2015, "eval_samples_per_second": 65.811, "eval_steps_per_second": 8.231, "step": 4000 }, { "epoch": 0.2881186205442631, "grad_norm": 0.5402960777282715, "learning_rate": 0.00018081293194330562, "loss": 0.6106, "step": 4100 }, { "epoch": 0.29514590397217194, "grad_norm": 0.49978354573249817, "learning_rate": 0.00018034438327281247, "loss": 0.6316, "step": 4200 }, { "epoch": 0.3021731874000808, "grad_norm": 0.4171115458011627, "learning_rate": 0.00017987583460231932, "loss": 0.6329, "step": 4300 }, { "epoch": 0.3092004708279897, "grad_norm": 0.48519599437713623, "learning_rate": 0.00017940728593182617, "loss": 0.6204, "step": 4400 }, { "epoch": 0.31622775425589855, "grad_norm": 0.4046633243560791, "learning_rate": 0.00017893873726133302, "loss": 0.6291, "step": 4500 }, { "epoch": 0.31622775425589855, "eval_loss": 0.6102128028869629, "eval_runtime": 194.9353, "eval_samples_per_second": 64.888, "eval_steps_per_second": 8.116, "step": 4500 }, { "epoch": 0.32325503768380737, "grad_norm": 0.5373474359512329, "learning_rate": 0.00017847018859083988, "loss": 0.6137, "step": 4600 }, { "epoch": 0.33028232111171624, "grad_norm": 0.45230546593666077, "learning_rate": 0.00017800163992034673, "loss": 0.6254, "step": 4700 }, { "epoch": 0.3373096045396251, "grad_norm": 0.5114957094192505, "learning_rate": 0.00017753309124985358, "loss": 0.6256, "step": 4800 }, { "epoch": 0.344336887967534, "grad_norm": 0.5685564279556274, "learning_rate": 0.00017706454257936045, "loss": 0.6055, "step": 4900 }, { "epoch": 0.3513641713954428, "grad_norm": 0.47926437854766846, "learning_rate": 0.0001765959939088673, "loss": 0.6151, "step": 5000 }, { "epoch": 0.3513641713954428, "eval_loss": 0.6096778512001038, "eval_runtime": 191.152, "eval_samples_per_second": 66.172, "eval_steps_per_second": 8.276, "step": 5000 }, { "epoch": 0.35839145482335166, "grad_norm": 0.4746282696723938, "learning_rate": 0.00017612744523837415, "loss": 0.6122, "step": 5100 }, { "epoch": 0.36541873825126053, "grad_norm": 0.43606746196746826, "learning_rate": 0.000175658896567881, "loss": 0.6087, "step": 5200 }, { "epoch": 0.37244602167916935, "grad_norm": 0.6463629603385925, "learning_rate": 0.00017519034789738785, "loss": 0.6341, "step": 5300 }, { "epoch": 0.3794733051070782, "grad_norm": 0.5722381472587585, "learning_rate": 0.0001747217992268947, "loss": 0.6224, "step": 5400 }, { "epoch": 0.3865005885349871, "grad_norm": 0.45890772342681885, "learning_rate": 0.00017425325055640155, "loss": 0.611, "step": 5500 }, { "epoch": 0.3865005885349871, "eval_loss": 0.6061792969703674, "eval_runtime": 191.541, "eval_samples_per_second": 66.038, "eval_steps_per_second": 8.259, "step": 5500 }, { "epoch": 0.39352787196289596, "grad_norm": 0.32096725702285767, "learning_rate": 0.0001737847018859084, "loss": 0.6031, "step": 5600 }, { "epoch": 0.4005551553908048, "grad_norm": 0.46782708168029785, "learning_rate": 0.00017331615321541525, "loss": 0.5948, "step": 5700 }, { "epoch": 0.40758243881871364, "grad_norm": 0.5056571364402771, "learning_rate": 0.0001728476045449221, "loss": 0.6142, "step": 5800 }, { "epoch": 0.4146097222466225, "grad_norm": 0.5338990688323975, "learning_rate": 0.00017237905587442898, "loss": 0.6235, "step": 5900 }, { "epoch": 0.4216370056745314, "grad_norm": 0.5611251592636108, "learning_rate": 0.00017191050720393583, "loss": 0.6064, "step": 6000 }, { "epoch": 0.4216370056745314, "eval_loss": 0.6039415001869202, "eval_runtime": 192.9342, "eval_samples_per_second": 65.561, "eval_steps_per_second": 8.2, "step": 6000 }, { "epoch": 0.4286642891024402, "grad_norm": 0.34663501381874084, "learning_rate": 0.00017144195853344268, "loss": 0.619, "step": 6100 }, { "epoch": 0.43569157253034907, "grad_norm": 0.5357178449630737, "learning_rate": 0.0001709734098629495, "loss": 0.6148, "step": 6200 }, { "epoch": 0.44271885595825794, "grad_norm": 0.49474236369132996, "learning_rate": 0.00017050486119245636, "loss": 0.6021, "step": 6300 }, { "epoch": 0.4497461393861668, "grad_norm": 0.4276539981365204, "learning_rate": 0.00017003631252196323, "loss": 0.619, "step": 6400 }, { "epoch": 0.4567734228140756, "grad_norm": 0.4469217360019684, "learning_rate": 0.00016956776385147008, "loss": 0.6044, "step": 6500 }, { "epoch": 0.4567734228140756, "eval_loss": 0.6005575656890869, "eval_runtime": 191.475, "eval_samples_per_second": 66.061, "eval_steps_per_second": 8.262, "step": 6500 }, { "epoch": 0.4638007062419845, "grad_norm": 0.4944908916950226, "learning_rate": 0.00016909921518097693, "loss": 0.5871, "step": 6600 }, { "epoch": 0.47082798966989337, "grad_norm": 0.6753395795822144, "learning_rate": 0.00016863066651048378, "loss": 0.6058, "step": 6700 }, { "epoch": 0.47785527309780224, "grad_norm": 0.4985552132129669, "learning_rate": 0.00016816211783999063, "loss": 0.5935, "step": 6800 }, { "epoch": 0.48488255652571105, "grad_norm": 0.47422492504119873, "learning_rate": 0.0001676935691694975, "loss": 0.6138, "step": 6900 }, { "epoch": 0.4919098399536199, "grad_norm": 0.4696919620037079, "learning_rate": 0.00016722502049900433, "loss": 0.595, "step": 7000 }, { "epoch": 0.4919098399536199, "eval_loss": 0.5999007225036621, "eval_runtime": 192.1728, "eval_samples_per_second": 65.821, "eval_steps_per_second": 8.232, "step": 7000 }, { "epoch": 0.4989371233815288, "grad_norm": 0.5600836873054504, "learning_rate": 0.00016675647182851119, "loss": 0.6183, "step": 7100 }, { "epoch": 0.5059644068094377, "grad_norm": 0.600051760673523, "learning_rate": 0.00016628792315801804, "loss": 0.6035, "step": 7200 }, { "epoch": 0.5129916902373465, "grad_norm": 0.4427150785923004, "learning_rate": 0.00016581937448752489, "loss": 0.5925, "step": 7300 }, { "epoch": 0.5200189736652554, "grad_norm": 0.5799112319946289, "learning_rate": 0.00016535082581703176, "loss": 0.5938, "step": 7400 }, { "epoch": 0.5270462570931642, "grad_norm": 0.5303583741188049, "learning_rate": 0.0001648822771465386, "loss": 0.6041, "step": 7500 }, { "epoch": 0.5270462570931642, "eval_loss": 0.5995006561279297, "eval_runtime": 192.153, "eval_samples_per_second": 65.828, "eval_steps_per_second": 8.233, "step": 7500 }, { "epoch": 0.534073540521073, "grad_norm": 0.5901041030883789, "learning_rate": 0.00016441372847604546, "loss": 0.6089, "step": 7600 }, { "epoch": 0.541100823948982, "grad_norm": 0.4662562608718872, "learning_rate": 0.00016394517980555231, "loss": 0.6041, "step": 7700 }, { "epoch": 0.5481281073768908, "grad_norm": 0.578330934047699, "learning_rate": 0.00016347663113505916, "loss": 0.6042, "step": 7800 }, { "epoch": 0.5551553908047996, "grad_norm": 0.4467080533504486, "learning_rate": 0.00016300808246456601, "loss": 0.6106, "step": 7900 }, { "epoch": 0.5621826742327085, "grad_norm": 0.5268563628196716, "learning_rate": 0.00016253953379407286, "loss": 0.6064, "step": 8000 }, { "epoch": 0.5621826742327085, "eval_loss": 0.5974312424659729, "eval_runtime": 191.7777, "eval_samples_per_second": 65.957, "eval_steps_per_second": 8.249, "step": 8000 }, { "epoch": 0.5692099576606173, "grad_norm": 0.5392386317253113, "learning_rate": 0.00016207098512357971, "loss": 0.5957, "step": 8100 }, { "epoch": 0.5762372410885263, "grad_norm": 0.4071332812309265, "learning_rate": 0.00016160243645308656, "loss": 0.594, "step": 8200 }, { "epoch": 0.5832645245164351, "grad_norm": 0.49810290336608887, "learning_rate": 0.00016113388778259341, "loss": 0.6087, "step": 8300 }, { "epoch": 0.5902918079443439, "grad_norm": 0.5486442446708679, "learning_rate": 0.0001606653391121003, "loss": 0.6068, "step": 8400 }, { "epoch": 0.5973190913722528, "grad_norm": 0.4476531147956848, "learning_rate": 0.00016019679044160714, "loss": 0.6008, "step": 8500 }, { "epoch": 0.5973190913722528, "eval_loss": 0.5947303175926208, "eval_runtime": 191.9868, "eval_samples_per_second": 65.885, "eval_steps_per_second": 8.24, "step": 8500 }, { "epoch": 0.6043463748001616, "grad_norm": 0.46748480200767517, "learning_rate": 0.000159728241771114, "loss": 0.6073, "step": 8600 }, { "epoch": 0.6113736582280704, "grad_norm": 0.3768276870250702, "learning_rate": 0.00015925969310062084, "loss": 0.6025, "step": 8700 }, { "epoch": 0.6184009416559794, "grad_norm": 0.4775266647338867, "learning_rate": 0.0001587911444301277, "loss": 0.6139, "step": 8800 }, { "epoch": 0.6254282250838882, "grad_norm": 0.595317542552948, "learning_rate": 0.00015832259575963452, "loss": 0.606, "step": 8900 }, { "epoch": 0.6324555085117971, "grad_norm": 0.5021831393241882, "learning_rate": 0.0001578540470891414, "loss": 0.6141, "step": 9000 }, { "epoch": 0.6324555085117971, "eval_loss": 0.5933237671852112, "eval_runtime": 191.7573, "eval_samples_per_second": 65.964, "eval_steps_per_second": 8.25, "step": 9000 }, { "epoch": 0.6394827919397059, "grad_norm": 0.4338054955005646, "learning_rate": 0.00015738549841864824, "loss": 0.6081, "step": 9100 }, { "epoch": 0.6465100753676147, "grad_norm": 0.5167454481124878, "learning_rate": 0.0001569169497481551, "loss": 0.5881, "step": 9200 }, { "epoch": 0.6535373587955237, "grad_norm": 0.4964050054550171, "learning_rate": 0.00015644840107766194, "loss": 0.6227, "step": 9300 }, { "epoch": 0.6605646422234325, "grad_norm": 0.5711501240730286, "learning_rate": 0.0001559798524071688, "loss": 0.5927, "step": 9400 }, { "epoch": 0.6675919256513413, "grad_norm": 0.4459337890148163, "learning_rate": 0.00015551130373667567, "loss": 0.5921, "step": 9500 }, { "epoch": 0.6675919256513413, "eval_loss": 0.593291163444519, "eval_runtime": 191.3053, "eval_samples_per_second": 66.119, "eval_steps_per_second": 8.27, "step": 9500 }, { "epoch": 0.6746192090792502, "grad_norm": 0.5358896255493164, "learning_rate": 0.00015504275506618252, "loss": 0.595, "step": 9600 }, { "epoch": 0.681646492507159, "grad_norm": 0.5215060114860535, "learning_rate": 0.00015457420639568935, "loss": 0.6073, "step": 9700 }, { "epoch": 0.688673775935068, "grad_norm": 0.43110188841819763, "learning_rate": 0.0001541056577251962, "loss": 0.5947, "step": 9800 }, { "epoch": 0.6957010593629768, "grad_norm": 0.7894725799560547, "learning_rate": 0.00015363710905470305, "loss": 0.6036, "step": 9900 }, { "epoch": 0.7027283427908856, "grad_norm": 0.40468016266822815, "learning_rate": 0.00015316856038420992, "loss": 0.5662, "step": 10000 }, { "epoch": 0.7027283427908856, "eval_loss": 0.5914621949195862, "eval_runtime": 191.9924, "eval_samples_per_second": 65.883, "eval_steps_per_second": 8.24, "step": 10000 }, { "epoch": 0.7097556262187945, "grad_norm": 0.520021915435791, "learning_rate": 0.00015270001171371677, "loss": 0.6024, "step": 10100 }, { "epoch": 0.7167829096467033, "grad_norm": 0.5001341700553894, "learning_rate": 0.00015223146304322362, "loss": 0.6039, "step": 10200 }, { "epoch": 0.7238101930746121, "grad_norm": 0.5081097483634949, "learning_rate": 0.00015176291437273047, "loss": 0.5995, "step": 10300 }, { "epoch": 0.7308374765025211, "grad_norm": 0.7181329131126404, "learning_rate": 0.00015129436570223732, "loss": 0.5885, "step": 10400 }, { "epoch": 0.7378647599304299, "grad_norm": 0.5525663495063782, "learning_rate": 0.00015082581703174417, "loss": 0.6117, "step": 10500 }, { "epoch": 0.7378647599304299, "eval_loss": 0.5893301963806152, "eval_runtime": 191.2328, "eval_samples_per_second": 66.145, "eval_steps_per_second": 8.273, "step": 10500 }, { "epoch": 0.7448920433583387, "grad_norm": 0.41122502088546753, "learning_rate": 0.00015035726836125102, "loss": 0.6135, "step": 10600 }, { "epoch": 0.7519193267862476, "grad_norm": 0.501287043094635, "learning_rate": 0.00014988871969075787, "loss": 0.6045, "step": 10700 }, { "epoch": 0.7589466102141564, "grad_norm": 0.4683501720428467, "learning_rate": 0.00014942017102026472, "loss": 0.6044, "step": 10800 }, { "epoch": 0.7659738936420654, "grad_norm": 0.5879314541816711, "learning_rate": 0.00014895162234977158, "loss": 0.5979, "step": 10900 }, { "epoch": 0.7730011770699742, "grad_norm": 0.4201727509498596, "learning_rate": 0.00014848307367927845, "loss": 0.6133, "step": 11000 }, { "epoch": 0.7730011770699742, "eval_loss": 0.5890333652496338, "eval_runtime": 191.871, "eval_samples_per_second": 65.925, "eval_steps_per_second": 8.245, "step": 11000 }, { "epoch": 0.780028460497883, "grad_norm": 0.4720204770565033, "learning_rate": 0.0001480145250087853, "loss": 0.5926, "step": 11100 }, { "epoch": 0.7870557439257919, "grad_norm": 0.4714803695678711, "learning_rate": 0.00014754597633829215, "loss": 0.5936, "step": 11200 }, { "epoch": 0.7940830273537007, "grad_norm": 0.44766145944595337, "learning_rate": 0.000147077427667799, "loss": 0.5898, "step": 11300 }, { "epoch": 0.8011103107816095, "grad_norm": 0.5793395638465881, "learning_rate": 0.00014660887899730585, "loss": 0.5792, "step": 11400 }, { "epoch": 0.8081375942095185, "grad_norm": 0.5724961757659912, "learning_rate": 0.0001461403303268127, "loss": 0.5976, "step": 11500 }, { "epoch": 0.8081375942095185, "eval_loss": 0.5884692072868347, "eval_runtime": 192.9015, "eval_samples_per_second": 65.572, "eval_steps_per_second": 8.201, "step": 11500 }, { "epoch": 0.8151648776374273, "grad_norm": 0.4588712751865387, "learning_rate": 0.00014567178165631955, "loss": 0.5876, "step": 11600 }, { "epoch": 0.8221921610653362, "grad_norm": 0.5474816560745239, "learning_rate": 0.0001452032329858264, "loss": 0.6004, "step": 11700 }, { "epoch": 0.829219444493245, "grad_norm": 0.480760395526886, "learning_rate": 0.00014473468431533325, "loss": 0.6027, "step": 11800 }, { "epoch": 0.8362467279211538, "grad_norm": 0.44635719060897827, "learning_rate": 0.0001442661356448401, "loss": 0.5955, "step": 11900 }, { "epoch": 0.8432740113490628, "grad_norm": 0.370127409696579, "learning_rate": 0.00014379758697434698, "loss": 0.5873, "step": 12000 }, { "epoch": 0.8432740113490628, "eval_loss": 0.5867401361465454, "eval_runtime": 191.0689, "eval_samples_per_second": 66.201, "eval_steps_per_second": 8.28, "step": 12000 }, { "epoch": 0.8503012947769716, "grad_norm": 0.5330467224121094, "learning_rate": 0.00014332903830385383, "loss": 0.6036, "step": 12100 }, { "epoch": 0.8573285782048804, "grad_norm": 0.516147255897522, "learning_rate": 0.00014286048963336068, "loss": 0.5808, "step": 12200 }, { "epoch": 0.8643558616327893, "grad_norm": 0.5745691061019897, "learning_rate": 0.00014239194096286753, "loss": 0.6026, "step": 12300 }, { "epoch": 0.8713831450606981, "grad_norm": 0.5167753100395203, "learning_rate": 0.00014192339229237436, "loss": 0.5858, "step": 12400 }, { "epoch": 0.8784104284886071, "grad_norm": 0.644183874130249, "learning_rate": 0.00014145484362188123, "loss": 0.6009, "step": 12500 }, { "epoch": 0.8784104284886071, "eval_loss": 0.585637629032135, "eval_runtime": 192.0709, "eval_samples_per_second": 65.856, "eval_steps_per_second": 8.237, "step": 12500 }, { "epoch": 0.8854377119165159, "grad_norm": 0.5942461490631104, "learning_rate": 0.00014098629495138808, "loss": 0.5872, "step": 12600 }, { "epoch": 0.8924649953444247, "grad_norm": 0.5303371548652649, "learning_rate": 0.00014051774628089493, "loss": 0.603, "step": 12700 }, { "epoch": 0.8994922787723336, "grad_norm": 0.6340320706367493, "learning_rate": 0.00014004919761040178, "loss": 0.5967, "step": 12800 }, { "epoch": 0.9065195622002424, "grad_norm": 0.3721257150173187, "learning_rate": 0.00013958064893990863, "loss": 0.5951, "step": 12900 }, { "epoch": 0.9135468456281512, "grad_norm": 0.5798757672309875, "learning_rate": 0.0001391121002694155, "loss": 0.5872, "step": 13000 }, { "epoch": 0.9135468456281512, "eval_loss": 0.5851631164550781, "eval_runtime": 191.7607, "eval_samples_per_second": 65.962, "eval_steps_per_second": 8.25, "step": 13000 }, { "epoch": 0.9205741290560602, "grad_norm": 0.49079081416130066, "learning_rate": 0.00013864355159892236, "loss": 0.5945, "step": 13100 }, { "epoch": 0.927601412483969, "grad_norm": 0.6511490941047668, "learning_rate": 0.00013817500292842918, "loss": 0.5967, "step": 13200 }, { "epoch": 0.9346286959118779, "grad_norm": 0.6914525032043457, "learning_rate": 0.00013770645425793603, "loss": 0.595, "step": 13300 }, { "epoch": 0.9416559793397867, "grad_norm": 0.5525299310684204, "learning_rate": 0.00013723790558744289, "loss": 0.5994, "step": 13400 }, { "epoch": 0.9486832627676955, "grad_norm": 0.5471107363700867, "learning_rate": 0.00013676935691694976, "loss": 0.5778, "step": 13500 }, { "epoch": 0.9486832627676955, "eval_loss": 0.5845098495483398, "eval_runtime": 192.0133, "eval_samples_per_second": 65.876, "eval_steps_per_second": 8.239, "step": 13500 }, { "epoch": 0.9557105461956045, "grad_norm": 0.6058303117752075, "learning_rate": 0.0001363008082464566, "loss": 0.6011, "step": 13600 }, { "epoch": 0.9627378296235133, "grad_norm": 0.5132064819335938, "learning_rate": 0.00013583225957596346, "loss": 0.5814, "step": 13700 }, { "epoch": 0.9697651130514221, "grad_norm": 0.42012500762939453, "learning_rate": 0.0001353637109054703, "loss": 0.5834, "step": 13800 }, { "epoch": 0.976792396479331, "grad_norm": 0.5156047344207764, "learning_rate": 0.00013489516223497716, "loss": 0.5912, "step": 13900 }, { "epoch": 0.9838196799072398, "grad_norm": 0.578301727771759, "learning_rate": 0.00013442661356448404, "loss": 0.5888, "step": 14000 }, { "epoch": 0.9838196799072398, "eval_loss": 0.5840687155723572, "eval_runtime": 191.6947, "eval_samples_per_second": 65.985, "eval_steps_per_second": 8.253, "step": 14000 }, { "epoch": 0.9908469633351488, "grad_norm": 0.5532825589179993, "learning_rate": 0.00013395806489399086, "loss": 0.5951, "step": 14100 }, { "epoch": 0.9978742467630576, "grad_norm": 0.7117305994033813, "learning_rate": 0.00013348951622349771, "loss": 0.6017, "step": 14200 }, { "epoch": 1.0049015301909665, "grad_norm": 0.4331432580947876, "learning_rate": 0.00013302096755300456, "loss": 0.535, "step": 14300 }, { "epoch": 1.0119288136188753, "grad_norm": 0.5378111004829407, "learning_rate": 0.00013255241888251141, "loss": 0.5451, "step": 14400 }, { "epoch": 1.0189560970467841, "grad_norm": 0.5767349600791931, "learning_rate": 0.0001320838702120183, "loss": 0.5469, "step": 14500 }, { "epoch": 1.0189560970467841, "eval_loss": 0.5859206318855286, "eval_runtime": 192.2055, "eval_samples_per_second": 65.81, "eval_steps_per_second": 8.231, "step": 14500 }, { "epoch": 1.025983380474693, "grad_norm": 0.5575023889541626, "learning_rate": 0.00013161532154152514, "loss": 0.5449, "step": 14600 }, { "epoch": 1.0330106639026018, "grad_norm": 0.5510827302932739, "learning_rate": 0.000131146772871032, "loss": 0.5339, "step": 14700 }, { "epoch": 1.0400379473305108, "grad_norm": 0.5780773758888245, "learning_rate": 0.00013067822420053884, "loss": 0.5334, "step": 14800 }, { "epoch": 1.0470652307584196, "grad_norm": 0.6217513084411621, "learning_rate": 0.0001302096755300457, "loss": 0.5404, "step": 14900 }, { "epoch": 1.0540925141863284, "grad_norm": 0.5290889739990234, "learning_rate": 0.00012974112685955254, "loss": 0.5368, "step": 15000 }, { "epoch": 1.0540925141863284, "eval_loss": 0.5861626863479614, "eval_runtime": 192.0995, "eval_samples_per_second": 65.846, "eval_steps_per_second": 8.235, "step": 15000 }, { "epoch": 1.0611197976142372, "grad_norm": 0.5673081278800964, "learning_rate": 0.0001292725781890594, "loss": 0.5381, "step": 15100 }, { "epoch": 1.068147081042146, "grad_norm": 0.4198685884475708, "learning_rate": 0.00012880402951856624, "loss": 0.5379, "step": 15200 }, { "epoch": 1.075174364470055, "grad_norm": 0.45610734820365906, "learning_rate": 0.0001283354808480731, "loss": 0.5349, "step": 15300 }, { "epoch": 1.082201647897964, "grad_norm": 0.5499436259269714, "learning_rate": 0.00012786693217757994, "loss": 0.5411, "step": 15400 }, { "epoch": 1.0892289313258727, "grad_norm": 0.6300696730613708, "learning_rate": 0.0001273983835070868, "loss": 0.546, "step": 15500 }, { "epoch": 1.0892289313258727, "eval_loss": 0.5862005352973938, "eval_runtime": 191.9358, "eval_samples_per_second": 65.902, "eval_steps_per_second": 8.242, "step": 15500 }, { "epoch": 1.0962562147537815, "grad_norm": 0.6075917482376099, "learning_rate": 0.00012692983483659367, "loss": 0.5417, "step": 15600 }, { "epoch": 1.1032834981816904, "grad_norm": 0.46201905608177185, "learning_rate": 0.00012646128616610052, "loss": 0.5283, "step": 15700 }, { "epoch": 1.1103107816095992, "grad_norm": 0.4050121605396271, "learning_rate": 0.00012599273749560737, "loss": 0.5479, "step": 15800 }, { "epoch": 1.1173380650375082, "grad_norm": 0.4577913284301758, "learning_rate": 0.0001255241888251142, "loss": 0.5452, "step": 15900 }, { "epoch": 1.124365348465417, "grad_norm": 0.4985063672065735, "learning_rate": 0.00012505564015462105, "loss": 0.5444, "step": 16000 }, { "epoch": 1.124365348465417, "eval_loss": 0.5855303406715393, "eval_runtime": 191.4632, "eval_samples_per_second": 66.065, "eval_steps_per_second": 8.263, "step": 16000 }, { "epoch": 1.1313926318933258, "grad_norm": 0.4636366665363312, "learning_rate": 0.00012458709148412792, "loss": 0.5352, "step": 16100 }, { "epoch": 1.1384199153212347, "grad_norm": 0.5733815431594849, "learning_rate": 0.00012411854281363477, "loss": 0.5339, "step": 16200 }, { "epoch": 1.1454471987491435, "grad_norm": 0.5225148797035217, "learning_rate": 0.00012364999414314162, "loss": 0.5356, "step": 16300 }, { "epoch": 1.1524744821770523, "grad_norm": 0.5425903797149658, "learning_rate": 0.00012318144547264847, "loss": 0.544, "step": 16400 }, { "epoch": 1.1595017656049613, "grad_norm": 0.49336591362953186, "learning_rate": 0.00012271289680215532, "loss": 0.5386, "step": 16500 }, { "epoch": 1.1595017656049613, "eval_loss": 0.5844515562057495, "eval_runtime": 195.2478, "eval_samples_per_second": 64.784, "eval_steps_per_second": 8.103, "step": 16500 }, { "epoch": 1.1665290490328701, "grad_norm": 0.45879772305488586, "learning_rate": 0.0001222443481316622, "loss": 0.5334, "step": 16600 }, { "epoch": 1.173556332460779, "grad_norm": 0.4931933879852295, "learning_rate": 0.00012177579946116904, "loss": 0.5362, "step": 16700 }, { "epoch": 1.1805836158886878, "grad_norm": 0.7223409414291382, "learning_rate": 0.00012130725079067589, "loss": 0.5331, "step": 16800 }, { "epoch": 1.1876108993165966, "grad_norm": 0.4814956784248352, "learning_rate": 0.00012083870212018274, "loss": 0.5279, "step": 16900 }, { "epoch": 1.1946381827445056, "grad_norm": 0.5712910294532776, "learning_rate": 0.00012037015344968959, "loss": 0.5269, "step": 17000 }, { "epoch": 1.1946381827445056, "eval_loss": 0.5852189660072327, "eval_runtime": 191.4648, "eval_samples_per_second": 66.064, "eval_steps_per_second": 8.263, "step": 17000 }, { "epoch": 1.2016654661724144, "grad_norm": 0.5194661021232605, "learning_rate": 0.00011990160477919645, "loss": 0.5456, "step": 17100 }, { "epoch": 1.2086927496003232, "grad_norm": 0.42699483036994934, "learning_rate": 0.0001194330561087033, "loss": 0.5519, "step": 17200 }, { "epoch": 1.215720033028232, "grad_norm": 0.49879905581474304, "learning_rate": 0.00011896450743821015, "loss": 0.5327, "step": 17300 }, { "epoch": 1.2227473164561409, "grad_norm": 0.7226296663284302, "learning_rate": 0.000118495958767717, "loss": 0.545, "step": 17400 }, { "epoch": 1.22977459988405, "grad_norm": 0.5727146863937378, "learning_rate": 0.00011802741009722384, "loss": 0.5428, "step": 17500 }, { "epoch": 1.22977459988405, "eval_loss": 0.5834037065505981, "eval_runtime": 191.7102, "eval_samples_per_second": 65.98, "eval_steps_per_second": 8.252, "step": 17500 }, { "epoch": 1.2368018833119587, "grad_norm": 0.6738116145133972, "learning_rate": 0.00011755886142673072, "loss": 0.5494, "step": 17600 }, { "epoch": 1.2438291667398675, "grad_norm": 0.5708216428756714, "learning_rate": 0.00011709031275623757, "loss": 0.5321, "step": 17700 }, { "epoch": 1.2508564501677764, "grad_norm": 0.7585426568984985, "learning_rate": 0.00011662176408574442, "loss": 0.5405, "step": 17800 }, { "epoch": 1.2578837335956852, "grad_norm": 0.48215654492378235, "learning_rate": 0.00011615321541525125, "loss": 0.5317, "step": 17900 }, { "epoch": 1.2649110170235942, "grad_norm": 0.7234981060028076, "learning_rate": 0.0001156846667447581, "loss": 0.5198, "step": 18000 }, { "epoch": 1.2649110170235942, "eval_loss": 0.583666980266571, "eval_runtime": 191.0164, "eval_samples_per_second": 66.219, "eval_steps_per_second": 8.282, "step": 18000 }, { "epoch": 1.271938300451503, "grad_norm": 0.6166390180587769, "learning_rate": 0.00011521611807426498, "loss": 0.5391, "step": 18100 }, { "epoch": 1.2789655838794118, "grad_norm": 0.5026915669441223, "learning_rate": 0.00011474756940377183, "loss": 0.5453, "step": 18200 }, { "epoch": 1.2859928673073207, "grad_norm": 0.44040101766586304, "learning_rate": 0.00011427902073327867, "loss": 0.5459, "step": 18300 }, { "epoch": 1.2930201507352295, "grad_norm": 0.4429463744163513, "learning_rate": 0.00011381047206278552, "loss": 0.5302, "step": 18400 }, { "epoch": 1.3000474341631385, "grad_norm": 0.5252630710601807, "learning_rate": 0.00011334192339229237, "loss": 0.5373, "step": 18500 }, { "epoch": 1.3000474341631385, "eval_loss": 0.5821723341941833, "eval_runtime": 191.389, "eval_samples_per_second": 66.091, "eval_steps_per_second": 8.266, "step": 18500 }, { "epoch": 1.3070747175910473, "grad_norm": 0.590726912021637, "learning_rate": 0.00011287337472179925, "loss": 0.5382, "step": 18600 }, { "epoch": 1.3141020010189561, "grad_norm": 0.6306003928184509, "learning_rate": 0.00011240482605130608, "loss": 0.5199, "step": 18700 }, { "epoch": 1.321129284446865, "grad_norm": 0.5457239151000977, "learning_rate": 0.00011193627738081293, "loss": 0.5318, "step": 18800 }, { "epoch": 1.3281565678747738, "grad_norm": 0.5287586450576782, "learning_rate": 0.00011146772871031978, "loss": 0.545, "step": 18900 }, { "epoch": 1.3351838513026828, "grad_norm": 0.5725194811820984, "learning_rate": 0.00011099918003982663, "loss": 0.5439, "step": 19000 }, { "epoch": 1.3351838513026828, "eval_loss": 0.5815557241439819, "eval_runtime": 191.8001, "eval_samples_per_second": 65.949, "eval_steps_per_second": 8.248, "step": 19000 }, { "epoch": 1.3422111347305914, "grad_norm": 0.6155562400817871, "learning_rate": 0.0001105306313693335, "loss": 0.5337, "step": 19100 }, { "epoch": 1.3492384181585004, "grad_norm": 0.45534420013427734, "learning_rate": 0.00011006208269884035, "loss": 0.5437, "step": 19200 }, { "epoch": 1.3562657015864092, "grad_norm": 0.5221378803253174, "learning_rate": 0.0001095935340283472, "loss": 0.5433, "step": 19300 }, { "epoch": 1.363292985014318, "grad_norm": 0.6740151047706604, "learning_rate": 0.00010912498535785405, "loss": 0.5315, "step": 19400 }, { "epoch": 1.3703202684422269, "grad_norm": 0.5568748116493225, "learning_rate": 0.0001086564366873609, "loss": 0.5442, "step": 19500 }, { "epoch": 1.3703202684422269, "eval_loss": 0.5810661911964417, "eval_runtime": 193.8958, "eval_samples_per_second": 65.236, "eval_steps_per_second": 8.159, "step": 19500 }, { "epoch": 1.3773475518701357, "grad_norm": 0.6538209915161133, "learning_rate": 0.00010818788801686776, "loss": 0.5485, "step": 19600 }, { "epoch": 1.3843748352980447, "grad_norm": 0.3977202773094177, "learning_rate": 0.00010771933934637461, "loss": 0.551, "step": 19700 }, { "epoch": 1.3914021187259535, "grad_norm": 0.6462295651435852, "learning_rate": 0.00010725079067588146, "loss": 0.535, "step": 19800 }, { "epoch": 1.3984294021538624, "grad_norm": 0.6669672131538391, "learning_rate": 0.00010678224200538831, "loss": 0.5372, "step": 19900 }, { "epoch": 1.4054566855817712, "grad_norm": 0.672690749168396, "learning_rate": 0.00010631369333489516, "loss": 0.5562, "step": 20000 }, { "epoch": 1.4054566855817712, "eval_loss": 0.5803663730621338, "eval_runtime": 191.2217, "eval_samples_per_second": 66.148, "eval_steps_per_second": 8.273, "step": 20000 }, { "epoch": 1.41248396900968, "grad_norm": 0.5990594029426575, "learning_rate": 0.00010584514466440203, "loss": 0.5402, "step": 20100 }, { "epoch": 1.419511252437589, "grad_norm": 0.7736939787864685, "learning_rate": 0.00010537659599390888, "loss": 0.5514, "step": 20200 }, { "epoch": 1.4265385358654978, "grad_norm": 0.7517827749252319, "learning_rate": 0.00010490804732341573, "loss": 0.5199, "step": 20300 }, { "epoch": 1.4335658192934067, "grad_norm": 0.40587854385375977, "learning_rate": 0.00010443949865292258, "loss": 0.5453, "step": 20400 }, { "epoch": 1.4405931027213155, "grad_norm": 0.5752481818199158, "learning_rate": 0.00010397094998242943, "loss": 0.5327, "step": 20500 }, { "epoch": 1.4405931027213155, "eval_loss": 0.5793671607971191, "eval_runtime": 191.4019, "eval_samples_per_second": 66.086, "eval_steps_per_second": 8.265, "step": 20500 }, { "epoch": 1.4476203861492243, "grad_norm": 0.5109778642654419, "learning_rate": 0.00010350240131193629, "loss": 0.534, "step": 20600 }, { "epoch": 1.4546476695771333, "grad_norm": 0.5415691137313843, "learning_rate": 0.00010303385264144314, "loss": 0.5346, "step": 20700 }, { "epoch": 1.4616749530050421, "grad_norm": 0.6798147559165955, "learning_rate": 0.00010256530397094999, "loss": 0.5389, "step": 20800 }, { "epoch": 1.468702236432951, "grad_norm": 0.5930722951889038, "learning_rate": 0.00010209675530045684, "loss": 0.5553, "step": 20900 }, { "epoch": 1.4757295198608598, "grad_norm": 0.46035194396972656, "learning_rate": 0.00010162820662996368, "loss": 0.5488, "step": 21000 }, { "epoch": 1.4757295198608598, "eval_loss": 0.5797436833381653, "eval_runtime": 191.1985, "eval_samples_per_second": 66.156, "eval_steps_per_second": 8.274, "step": 21000 }, { "epoch": 1.4827568032887686, "grad_norm": 0.6289058327674866, "learning_rate": 0.00010115965795947056, "loss": 0.5373, "step": 21100 }, { "epoch": 1.4897840867166776, "grad_norm": 0.5587871670722961, "learning_rate": 0.0001006911092889774, "loss": 0.5433, "step": 21200 }, { "epoch": 1.4968113701445864, "grad_norm": 0.6402022838592529, "learning_rate": 0.00010022256061848426, "loss": 0.5369, "step": 21300 }, { "epoch": 1.5038386535724952, "grad_norm": 0.7074068784713745, "learning_rate": 9.975401194799109e-05, "loss": 0.5422, "step": 21400 }, { "epoch": 1.510865937000404, "grad_norm": 0.6283564567565918, "learning_rate": 9.928546327749796e-05, "loss": 0.53, "step": 21500 }, { "epoch": 1.510865937000404, "eval_loss": 0.5776032209396362, "eval_runtime": 195.2716, "eval_samples_per_second": 64.776, "eval_steps_per_second": 8.102, "step": 21500 }, { "epoch": 1.5178932204283129, "grad_norm": 0.5513471961021423, "learning_rate": 9.881691460700481e-05, "loss": 0.5363, "step": 21600 }, { "epoch": 1.524920503856222, "grad_norm": 0.6834294199943542, "learning_rate": 9.834836593651166e-05, "loss": 0.5357, "step": 21700 }, { "epoch": 1.5319477872841305, "grad_norm": 0.5920292139053345, "learning_rate": 9.787981726601851e-05, "loss": 0.5425, "step": 21800 }, { "epoch": 1.5389750707120395, "grad_norm": 0.7391265630722046, "learning_rate": 9.741126859552536e-05, "loss": 0.5536, "step": 21900 }, { "epoch": 1.5460023541399484, "grad_norm": 0.59616619348526, "learning_rate": 9.694271992503222e-05, "loss": 0.5266, "step": 22000 }, { "epoch": 1.5460023541399484, "eval_loss": 0.5772854089736938, "eval_runtime": 191.6654, "eval_samples_per_second": 65.995, "eval_steps_per_second": 8.254, "step": 22000 }, { "epoch": 1.5530296375678572, "grad_norm": 0.7118886709213257, "learning_rate": 9.647417125453907e-05, "loss": 0.5402, "step": 22100 }, { "epoch": 1.5600569209957662, "grad_norm": 0.5848612189292908, "learning_rate": 9.600562258404592e-05, "loss": 0.5471, "step": 22200 }, { "epoch": 1.5670842044236748, "grad_norm": 0.6379753947257996, "learning_rate": 9.553707391355277e-05, "loss": 0.5319, "step": 22300 }, { "epoch": 1.5741114878515838, "grad_norm": 0.5326395630836487, "learning_rate": 9.506852524305962e-05, "loss": 0.5362, "step": 22400 }, { "epoch": 1.5811387712794926, "grad_norm": 0.44863948225975037, "learning_rate": 9.459997657256649e-05, "loss": 0.5339, "step": 22500 }, { "epoch": 1.5811387712794926, "eval_loss": 0.5767233371734619, "eval_runtime": 191.4103, "eval_samples_per_second": 66.083, "eval_steps_per_second": 8.265, "step": 22500 }, { "epoch": 1.5881660547074015, "grad_norm": 0.6882546544075012, "learning_rate": 9.413142790207334e-05, "loss": 0.5338, "step": 22600 }, { "epoch": 1.5951933381353105, "grad_norm": 0.6498861312866211, "learning_rate": 9.366287923158019e-05, "loss": 0.5449, "step": 22700 }, { "epoch": 1.602220621563219, "grad_norm": 0.5820225477218628, "learning_rate": 9.319433056108704e-05, "loss": 0.541, "step": 22800 }, { "epoch": 1.6092479049911281, "grad_norm": 0.6746723651885986, "learning_rate": 9.272578189059389e-05, "loss": 0.5437, "step": 22900 }, { "epoch": 1.616275188419037, "grad_norm": 0.6235536932945251, "learning_rate": 9.225723322010075e-05, "loss": 0.5332, "step": 23000 }, { "epoch": 1.616275188419037, "eval_loss": 0.575995683670044, "eval_runtime": 191.6609, "eval_samples_per_second": 65.997, "eval_steps_per_second": 8.254, "step": 23000 }, { "epoch": 1.6233024718469458, "grad_norm": 0.7106947898864746, "learning_rate": 9.17886845496076e-05, "loss": 0.5466, "step": 23100 }, { "epoch": 1.6303297552748546, "grad_norm": 0.5090802907943726, "learning_rate": 9.132013587911444e-05, "loss": 0.5244, "step": 23200 }, { "epoch": 1.6373570387027634, "grad_norm": 0.5423375368118286, "learning_rate": 9.08515872086213e-05, "loss": 0.5331, "step": 23300 }, { "epoch": 1.6443843221306724, "grad_norm": 0.40430954098701477, "learning_rate": 9.038303853812815e-05, "loss": 0.543, "step": 23400 }, { "epoch": 1.6514116055585812, "grad_norm": 0.5849967002868652, "learning_rate": 8.991448986763502e-05, "loss": 0.5313, "step": 23500 }, { "epoch": 1.6514116055585812, "eval_loss": 0.5759484171867371, "eval_runtime": 198.3535, "eval_samples_per_second": 63.77, "eval_steps_per_second": 7.976, "step": 23500 }, { "epoch": 1.65843888898649, "grad_norm": 0.642866849899292, "learning_rate": 8.944594119714185e-05, "loss": 0.5375, "step": 23600 }, { "epoch": 1.6654661724143989, "grad_norm": 0.4610786437988281, "learning_rate": 8.89773925266487e-05, "loss": 0.5262, "step": 23700 }, { "epoch": 1.6724934558423077, "grad_norm": 0.5731543302536011, "learning_rate": 8.850884385615557e-05, "loss": 0.5316, "step": 23800 }, { "epoch": 1.6795207392702167, "grad_norm": 0.6324673295021057, "learning_rate": 8.804029518566242e-05, "loss": 0.5391, "step": 23900 }, { "epoch": 1.6865480226981253, "grad_norm": 0.7439347505569458, "learning_rate": 8.757174651516927e-05, "loss": 0.5328, "step": 24000 }, { "epoch": 1.6865480226981253, "eval_loss": 0.5753322839736938, "eval_runtime": 191.1076, "eval_samples_per_second": 66.188, "eval_steps_per_second": 8.278, "step": 24000 }, { "epoch": 1.6935753061260344, "grad_norm": 0.5916799306869507, "learning_rate": 8.710319784467612e-05, "loss": 0.5357, "step": 24100 }, { "epoch": 1.7006025895539432, "grad_norm": 0.641130268573761, "learning_rate": 8.663464917418297e-05, "loss": 0.5413, "step": 24200 }, { "epoch": 1.707629872981852, "grad_norm": 0.5414934754371643, "learning_rate": 8.616610050368983e-05, "loss": 0.5372, "step": 24300 }, { "epoch": 1.714657156409761, "grad_norm": 0.5349559783935547, "learning_rate": 8.569755183319668e-05, "loss": 0.5248, "step": 24400 }, { "epoch": 1.7216844398376696, "grad_norm": 0.657092273235321, "learning_rate": 8.522900316270352e-05, "loss": 0.5323, "step": 24500 }, { "epoch": 1.7216844398376696, "eval_loss": 0.5741798281669617, "eval_runtime": 191.815, "eval_samples_per_second": 65.944, "eval_steps_per_second": 8.248, "step": 24500 }, { "epoch": 1.7287117232655786, "grad_norm": 0.5300177335739136, "learning_rate": 8.476045449221038e-05, "loss": 0.5396, "step": 24600 }, { "epoch": 1.7357390066934875, "grad_norm": 0.5802739262580872, "learning_rate": 8.429190582171723e-05, "loss": 0.5343, "step": 24700 }, { "epoch": 1.7427662901213963, "grad_norm": 0.5847595930099487, "learning_rate": 8.38233571512241e-05, "loss": 0.5337, "step": 24800 }, { "epoch": 1.7497935735493053, "grad_norm": 0.5918022394180298, "learning_rate": 8.335480848073095e-05, "loss": 0.5332, "step": 24900 }, { "epoch": 1.756820856977214, "grad_norm": 0.6495380997657776, "learning_rate": 8.288625981023778e-05, "loss": 0.5337, "step": 25000 }, { "epoch": 1.756820856977214, "eval_loss": 0.5734357833862305, "eval_runtime": 191.5001, "eval_samples_per_second": 66.052, "eval_steps_per_second": 8.261, "step": 25000 }, { "epoch": 1.763848140405123, "grad_norm": 0.773017406463623, "learning_rate": 8.241771113974465e-05, "loss": 0.5195, "step": 25100 }, { "epoch": 1.7708754238330318, "grad_norm": 0.6224300861358643, "learning_rate": 8.19491624692515e-05, "loss": 0.5409, "step": 25200 }, { "epoch": 1.7779027072609406, "grad_norm": 0.561305582523346, "learning_rate": 8.148061379875836e-05, "loss": 0.5411, "step": 25300 }, { "epoch": 1.7849299906888496, "grad_norm": 0.5424569249153137, "learning_rate": 8.10120651282652e-05, "loss": 0.5385, "step": 25400 }, { "epoch": 1.7919572741167582, "grad_norm": 0.6570726037025452, "learning_rate": 8.054351645777205e-05, "loss": 0.5304, "step": 25500 }, { "epoch": 1.7919572741167582, "eval_loss": 0.5725218057632446, "eval_runtime": 191.5153, "eval_samples_per_second": 66.047, "eval_steps_per_second": 8.26, "step": 25500 }, { "epoch": 1.7989845575446672, "grad_norm": 0.619318962097168, "learning_rate": 8.007496778727891e-05, "loss": 0.5404, "step": 25600 }, { "epoch": 1.806011840972576, "grad_norm": 0.666038990020752, "learning_rate": 7.960641911678576e-05, "loss": 0.5367, "step": 25700 }, { "epoch": 1.8130391244004849, "grad_norm": 0.4837888777256012, "learning_rate": 7.913787044629261e-05, "loss": 0.5275, "step": 25800 }, { "epoch": 1.8200664078283937, "grad_norm": 0.5173748135566711, "learning_rate": 7.866932177579946e-05, "loss": 0.5159, "step": 25900 }, { "epoch": 1.8270936912563025, "grad_norm": 0.5783088207244873, "learning_rate": 7.820077310530631e-05, "loss": 0.526, "step": 26000 }, { "epoch": 1.8270936912563025, "eval_loss": 0.571327805519104, "eval_runtime": 191.7206, "eval_samples_per_second": 65.976, "eval_steps_per_second": 8.252, "step": 26000 }, { "epoch": 1.8341209746842115, "grad_norm": 0.4488949477672577, "learning_rate": 7.773222443481318e-05, "loss": 0.5264, "step": 26100 }, { "epoch": 1.8411482581121204, "grad_norm": 0.5412874817848206, "learning_rate": 7.726367576432003e-05, "loss": 0.5366, "step": 26200 }, { "epoch": 1.8481755415400292, "grad_norm": 0.5360791087150574, "learning_rate": 7.679512709382688e-05, "loss": 0.5277, "step": 26300 }, { "epoch": 1.855202824967938, "grad_norm": 0.5853073596954346, "learning_rate": 7.632657842333373e-05, "loss": 0.5221, "step": 26400 }, { "epoch": 1.8622301083958468, "grad_norm": 0.733930230140686, "learning_rate": 7.585802975284058e-05, "loss": 0.53, "step": 26500 }, { "epoch": 1.8622301083958468, "eval_loss": 0.5721610188484192, "eval_runtime": 191.5535, "eval_samples_per_second": 66.034, "eval_steps_per_second": 8.259, "step": 26500 }, { "epoch": 1.8692573918237558, "grad_norm": 0.5376147031784058, "learning_rate": 7.538948108234744e-05, "loss": 0.5203, "step": 26600 }, { "epoch": 1.8762846752516644, "grad_norm": 0.6613253951072693, "learning_rate": 7.492093241185428e-05, "loss": 0.5316, "step": 26700 }, { "epoch": 1.8833119586795735, "grad_norm": 0.6095169186592102, "learning_rate": 7.445238374136114e-05, "loss": 0.5396, "step": 26800 }, { "epoch": 1.8903392421074823, "grad_norm": 0.5879153609275818, "learning_rate": 7.398383507086799e-05, "loss": 0.5328, "step": 26900 }, { "epoch": 1.897366525535391, "grad_norm": 0.4561489522457123, "learning_rate": 7.351528640037484e-05, "loss": 0.5334, "step": 27000 }, { "epoch": 1.897366525535391, "eval_loss": 0.5708492398262024, "eval_runtime": 191.398, "eval_samples_per_second": 66.087, "eval_steps_per_second": 8.265, "step": 27000 }, { "epoch": 1.9043938089633001, "grad_norm": 1.0811113119125366, "learning_rate": 7.304673772988169e-05, "loss": 0.5299, "step": 27100 }, { "epoch": 1.9114210923912087, "grad_norm": 0.5830855965614319, "learning_rate": 7.257818905938854e-05, "loss": 0.5477, "step": 27200 }, { "epoch": 1.9184483758191178, "grad_norm": 0.6300702095031738, "learning_rate": 7.21096403888954e-05, "loss": 0.5481, "step": 27300 }, { "epoch": 1.9254756592470266, "grad_norm": 0.6268986463546753, "learning_rate": 7.164109171840226e-05, "loss": 0.5253, "step": 27400 }, { "epoch": 1.9325029426749354, "grad_norm": 0.6296914219856262, "learning_rate": 7.11725430479091e-05, "loss": 0.5233, "step": 27500 }, { "epoch": 1.9325029426749354, "eval_loss": 0.5700781941413879, "eval_runtime": 191.7014, "eval_samples_per_second": 65.983, "eval_steps_per_second": 8.252, "step": 27500 }, { "epoch": 1.9395302261028444, "grad_norm": 0.47624173760414124, "learning_rate": 7.070399437741596e-05, "loss": 0.54, "step": 27600 }, { "epoch": 1.946557509530753, "grad_norm": 0.6563848853111267, "learning_rate": 7.02354457069228e-05, "loss": 0.5273, "step": 27700 }, { "epoch": 1.953584792958662, "grad_norm": 0.6281931400299072, "learning_rate": 6.976689703642966e-05, "loss": 0.5265, "step": 27800 }, { "epoch": 1.9606120763865709, "grad_norm": 0.7378135323524475, "learning_rate": 6.929834836593652e-05, "loss": 0.5241, "step": 27900 }, { "epoch": 1.9676393598144797, "grad_norm": 0.5289490222930908, "learning_rate": 6.882979969544337e-05, "loss": 0.5327, "step": 28000 }, { "epoch": 1.9676393598144797, "eval_loss": 0.5690564513206482, "eval_runtime": 191.2308, "eval_samples_per_second": 66.145, "eval_steps_per_second": 8.273, "step": 28000 } ], "logging_steps": 100, "max_steps": 42690, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.0643079170298675e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }