{ "best_metric": 0.19889499247074127, "best_model_checkpoint": "multilingual-e5-small-aligned-transformed-readability/checkpoint-81288", "epoch": 3.0, "eval_steps": 500, "global_step": 81288, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018452908178328904, "grad_norm": 2.7571725845336914, "learning_rate": 4.969245153036119e-05, "loss": 0.3765, "step": 500 }, { "epoch": 0.03690581635665781, "grad_norm": 2.832648515701294, "learning_rate": 4.938490306072237e-05, "loss": 0.2708, "step": 1000 }, { "epoch": 0.05535872453498671, "grad_norm": 1.4104365110397339, "learning_rate": 4.907735459108356e-05, "loss": 0.2557, "step": 1500 }, { "epoch": 0.07381163271331562, "grad_norm": 1.8531866073608398, "learning_rate": 4.876980612144474e-05, "loss": 0.2635, "step": 2000 }, { "epoch": 0.09226454089164453, "grad_norm": 1.649173378944397, "learning_rate": 4.846225765180593e-05, "loss": 0.2558, "step": 2500 }, { "epoch": 0.11071744906997343, "grad_norm": 1.7052029371261597, "learning_rate": 4.815470918216711e-05, "loss": 0.2514, "step": 3000 }, { "epoch": 0.12917035724830234, "grad_norm": 3.926635980606079, "learning_rate": 4.78471607125283e-05, "loss": 0.252, "step": 3500 }, { "epoch": 0.14762326542663123, "grad_norm": 3.181887626647949, "learning_rate": 4.7539612242889484e-05, "loss": 0.2541, "step": 4000 }, { "epoch": 0.16607617360496013, "grad_norm": 4.0558180809021, "learning_rate": 4.723206377325067e-05, "loss": 0.2421, "step": 4500 }, { "epoch": 0.18452908178328906, "grad_norm": 1.432974934577942, "learning_rate": 4.692451530361185e-05, "loss": 0.2362, "step": 5000 }, { "epoch": 0.20298198996161795, "grad_norm": 3.173771858215332, "learning_rate": 4.661696683397304e-05, "loss": 0.2443, "step": 5500 }, { "epoch": 0.22143489813994685, "grad_norm": 2.175633668899536, "learning_rate": 4.6309418364334224e-05, "loss": 0.2329, "step": 6000 }, { "epoch": 0.23988780631827575, "grad_norm": 4.211012840270996, "learning_rate": 4.60018698946954e-05, "loss": 0.2303, "step": 6500 }, { "epoch": 0.2583407144966047, "grad_norm": 1.5053297281265259, "learning_rate": 4.5694321425056594e-05, "loss": 0.2272, "step": 7000 }, { "epoch": 0.27679362267493357, "grad_norm": 2.2658045291900635, "learning_rate": 4.538677295541778e-05, "loss": 0.2309, "step": 7500 }, { "epoch": 0.29524653085326247, "grad_norm": 3.0872204303741455, "learning_rate": 4.507922448577896e-05, "loss": 0.228, "step": 8000 }, { "epoch": 0.31369943903159137, "grad_norm": 1.5754343271255493, "learning_rate": 4.477167601614014e-05, "loss": 0.2344, "step": 8500 }, { "epoch": 0.33215234720992026, "grad_norm": 8.282055854797363, "learning_rate": 4.4464127546501335e-05, "loss": 0.2235, "step": 9000 }, { "epoch": 0.3506052553882492, "grad_norm": 2.818925619125366, "learning_rate": 4.415657907686251e-05, "loss": 0.225, "step": 9500 }, { "epoch": 0.3690581635665781, "grad_norm": 4.582856178283691, "learning_rate": 4.38490306072237e-05, "loss": 0.2195, "step": 10000 }, { "epoch": 0.387511071744907, "grad_norm": 4.176349639892578, "learning_rate": 4.354148213758489e-05, "loss": 0.2249, "step": 10500 }, { "epoch": 0.4059639799232359, "grad_norm": 1.69513738155365, "learning_rate": 4.323393366794607e-05, "loss": 0.2227, "step": 11000 }, { "epoch": 0.4244168881015648, "grad_norm": 2.0948939323425293, "learning_rate": 4.2926385198307254e-05, "loss": 0.2248, "step": 11500 }, { "epoch": 0.4428697962798937, "grad_norm": 2.4989616870880127, "learning_rate": 4.261883672866844e-05, "loss": 0.2194, "step": 12000 }, { "epoch": 0.4613227044582226, "grad_norm": 1.1772059202194214, "learning_rate": 4.2311288259029624e-05, "loss": 0.2232, "step": 12500 }, { "epoch": 0.4797756126365515, "grad_norm": 5.26480770111084, "learning_rate": 4.200373978939081e-05, "loss": 0.2199, "step": 13000 }, { "epoch": 0.49822852081488045, "grad_norm": 1.3563578128814697, "learning_rate": 4.1696191319751994e-05, "loss": 0.2264, "step": 13500 }, { "epoch": 0.5166814289932093, "grad_norm": 1.2438708543777466, "learning_rate": 4.138864285011318e-05, "loss": 0.2239, "step": 14000 }, { "epoch": 0.5351343371715382, "grad_norm": 2.229975700378418, "learning_rate": 4.1081094380474365e-05, "loss": 0.211, "step": 14500 }, { "epoch": 0.5535872453498671, "grad_norm": 1.4763661623001099, "learning_rate": 4.077354591083555e-05, "loss": 0.2176, "step": 15000 }, { "epoch": 0.572040153528196, "grad_norm": 2.88029408454895, "learning_rate": 4.0465997441196735e-05, "loss": 0.2229, "step": 15500 }, { "epoch": 0.5904930617065249, "grad_norm": 0.7661384344100952, "learning_rate": 4.015844897155792e-05, "loss": 0.2195, "step": 16000 }, { "epoch": 0.6089459698848538, "grad_norm": 2.0358428955078125, "learning_rate": 3.9850900501919105e-05, "loss": 0.2161, "step": 16500 }, { "epoch": 0.6273988780631827, "grad_norm": 1.9549895524978638, "learning_rate": 3.954335203228029e-05, "loss": 0.2193, "step": 17000 }, { "epoch": 0.6458517862415116, "grad_norm": 2.1742184162139893, "learning_rate": 3.9235803562641475e-05, "loss": 0.2171, "step": 17500 }, { "epoch": 0.6643046944198405, "grad_norm": 1.1012811660766602, "learning_rate": 3.892825509300266e-05, "loss": 0.2246, "step": 18000 }, { "epoch": 0.6827576025981694, "grad_norm": 2.7291996479034424, "learning_rate": 3.8620706623363846e-05, "loss": 0.2114, "step": 18500 }, { "epoch": 0.7012105107764984, "grad_norm": 1.3418771028518677, "learning_rate": 3.8313158153725024e-05, "loss": 0.2173, "step": 19000 }, { "epoch": 0.7196634189548273, "grad_norm": 2.7479825019836426, "learning_rate": 3.8005609684086216e-05, "loss": 0.2163, "step": 19500 }, { "epoch": 0.7381163271331562, "grad_norm": 1.7314202785491943, "learning_rate": 3.76980612144474e-05, "loss": 0.2142, "step": 20000 }, { "epoch": 0.7565692353114851, "grad_norm": 1.5135014057159424, "learning_rate": 3.739051274480858e-05, "loss": 0.2156, "step": 20500 }, { "epoch": 0.775022143489814, "grad_norm": 0.9992055296897888, "learning_rate": 3.708296427516977e-05, "loss": 0.2136, "step": 21000 }, { "epoch": 0.7934750516681429, "grad_norm": 1.2363203763961792, "learning_rate": 3.6775415805530957e-05, "loss": 0.2134, "step": 21500 }, { "epoch": 0.8119279598464718, "grad_norm": 1.8317536115646362, "learning_rate": 3.6467867335892135e-05, "loss": 0.217, "step": 22000 }, { "epoch": 0.8303808680248007, "grad_norm": 1.7996548414230347, "learning_rate": 3.616031886625332e-05, "loss": 0.213, "step": 22500 }, { "epoch": 0.8488337762031296, "grad_norm": 1.1373772621154785, "learning_rate": 3.585277039661451e-05, "loss": 0.2249, "step": 23000 }, { "epoch": 0.8672866843814585, "grad_norm": 1.2996028661727905, "learning_rate": 3.554522192697569e-05, "loss": 0.207, "step": 23500 }, { "epoch": 0.8857395925597874, "grad_norm": 1.505035638809204, "learning_rate": 3.5237673457336876e-05, "loss": 0.2119, "step": 24000 }, { "epoch": 0.9041925007381163, "grad_norm": 1.2497526407241821, "learning_rate": 3.493012498769807e-05, "loss": 0.2096, "step": 24500 }, { "epoch": 0.9226454089164452, "grad_norm": 2.1352574825286865, "learning_rate": 3.4622576518059246e-05, "loss": 0.2143, "step": 25000 }, { "epoch": 0.9410983170947741, "grad_norm": 1.664171576499939, "learning_rate": 3.431502804842043e-05, "loss": 0.2036, "step": 25500 }, { "epoch": 0.959551225273103, "grad_norm": 2.7897629737854004, "learning_rate": 3.400747957878162e-05, "loss": 0.2118, "step": 26000 }, { "epoch": 0.978004133451432, "grad_norm": 1.0113285779953003, "learning_rate": 3.36999311091428e-05, "loss": 0.2175, "step": 26500 }, { "epoch": 0.9964570416297609, "grad_norm": 2.9997363090515137, "learning_rate": 3.3392382639503986e-05, "loss": 0.2104, "step": 27000 }, { "epoch": 1.0, "eval_loss": 0.20612339675426483, "eval_mse": 0.20612340591663994, "eval_runtime": 57.193, "eval_samples_per_second": 1684.438, "eval_steps_per_second": 210.568, "step": 27096 }, { "epoch": 1.0149099498080898, "grad_norm": 1.4240479469299316, "learning_rate": 3.308483416986517e-05, "loss": 0.1821, "step": 27500 }, { "epoch": 1.0333628579864187, "grad_norm": 1.0634160041809082, "learning_rate": 3.277728570022636e-05, "loss": 0.1745, "step": 28000 }, { "epoch": 1.0518157661647476, "grad_norm": 1.9994093179702759, "learning_rate": 3.246973723058754e-05, "loss": 0.1712, "step": 28500 }, { "epoch": 1.0702686743430765, "grad_norm": 0.736122727394104, "learning_rate": 3.216218876094873e-05, "loss": 0.1738, "step": 29000 }, { "epoch": 1.0887215825214054, "grad_norm": 1.7938990592956543, "learning_rate": 3.185464029130991e-05, "loss": 0.1698, "step": 29500 }, { "epoch": 1.1071744906997343, "grad_norm": 1.9040451049804688, "learning_rate": 3.15470918216711e-05, "loss": 0.1734, "step": 30000 }, { "epoch": 1.1256273988780632, "grad_norm": 1.222025990486145, "learning_rate": 3.123954335203228e-05, "loss": 0.1715, "step": 30500 }, { "epoch": 1.144080307056392, "grad_norm": 1.4371784925460815, "learning_rate": 3.093199488239347e-05, "loss": 0.1688, "step": 31000 }, { "epoch": 1.162533215234721, "grad_norm": 5.807870864868164, "learning_rate": 3.062444641275465e-05, "loss": 0.179, "step": 31500 }, { "epoch": 1.1809861234130499, "grad_norm": 1.3887362480163574, "learning_rate": 3.0316897943115834e-05, "loss": 0.179, "step": 32000 }, { "epoch": 1.1994390315913788, "grad_norm": 2.2503085136413574, "learning_rate": 3.0009349473477023e-05, "loss": 0.1738, "step": 32500 }, { "epoch": 1.2178919397697077, "grad_norm": 2.3477783203125, "learning_rate": 2.9701801003838208e-05, "loss": 0.1722, "step": 33000 }, { "epoch": 1.2363448479480366, "grad_norm": 2.7416176795959473, "learning_rate": 2.939425253419939e-05, "loss": 0.1786, "step": 33500 }, { "epoch": 1.2547977561263655, "grad_norm": 0.7052303552627563, "learning_rate": 2.9086704064560578e-05, "loss": 0.1728, "step": 34000 }, { "epoch": 1.2732506643046944, "grad_norm": 2.529670000076294, "learning_rate": 2.877915559492176e-05, "loss": 0.1741, "step": 34500 }, { "epoch": 1.2917035724830233, "grad_norm": 1.9189903736114502, "learning_rate": 2.8471607125282945e-05, "loss": 0.1762, "step": 35000 }, { "epoch": 1.3101564806613522, "grad_norm": 2.1008570194244385, "learning_rate": 2.8164058655644134e-05, "loss": 0.1719, "step": 35500 }, { "epoch": 1.328609388839681, "grad_norm": 2.663116216659546, "learning_rate": 2.7856510186005312e-05, "loss": 0.1739, "step": 36000 }, { "epoch": 1.34706229701801, "grad_norm": 3.453697443008423, "learning_rate": 2.75489617163665e-05, "loss": 0.1691, "step": 36500 }, { "epoch": 1.3655152051963388, "grad_norm": 5.848513603210449, "learning_rate": 2.7241413246727686e-05, "loss": 0.1674, "step": 37000 }, { "epoch": 1.3839681133746677, "grad_norm": 1.1454991102218628, "learning_rate": 2.6933864777088867e-05, "loss": 0.1764, "step": 37500 }, { "epoch": 1.4024210215529966, "grad_norm": 0.9938109517097473, "learning_rate": 2.6626316307450056e-05, "loss": 0.1706, "step": 38000 }, { "epoch": 1.4208739297313255, "grad_norm": 2.252068042755127, "learning_rate": 2.631876783781124e-05, "loss": 0.1665, "step": 38500 }, { "epoch": 1.4393268379096544, "grad_norm": 1.9789129495620728, "learning_rate": 2.6011219368172423e-05, "loss": 0.1746, "step": 39000 }, { "epoch": 1.4577797460879833, "grad_norm": 1.5638952255249023, "learning_rate": 2.570367089853361e-05, "loss": 0.1699, "step": 39500 }, { "epoch": 1.4762326542663124, "grad_norm": 2.094984292984009, "learning_rate": 2.5396122428894797e-05, "loss": 0.1715, "step": 40000 }, { "epoch": 1.4946855624446413, "grad_norm": 2.625145435333252, "learning_rate": 2.508857395925598e-05, "loss": 0.1708, "step": 40500 }, { "epoch": 1.51313847062297, "grad_norm": 1.2873293161392212, "learning_rate": 2.4781025489617167e-05, "loss": 0.1721, "step": 41000 }, { "epoch": 1.531591378801299, "grad_norm": 2.8465254306793213, "learning_rate": 2.447347701997835e-05, "loss": 0.1761, "step": 41500 }, { "epoch": 1.550044286979628, "grad_norm": 1.4705593585968018, "learning_rate": 2.4165928550339534e-05, "loss": 0.1756, "step": 42000 }, { "epoch": 1.568497195157957, "grad_norm": 0.9254862666130066, "learning_rate": 2.3858380080700722e-05, "loss": 0.1759, "step": 42500 }, { "epoch": 1.5869501033362858, "grad_norm": 1.8685784339904785, "learning_rate": 2.3550831611061904e-05, "loss": 0.167, "step": 43000 }, { "epoch": 1.6054030115146147, "grad_norm": 1.4468207359313965, "learning_rate": 2.324328314142309e-05, "loss": 0.1776, "step": 43500 }, { "epoch": 1.6238559196929436, "grad_norm": 2.477450132369995, "learning_rate": 2.2935734671784274e-05, "loss": 0.174, "step": 44000 }, { "epoch": 1.6423088278712725, "grad_norm": 11.740235328674316, "learning_rate": 2.262818620214546e-05, "loss": 0.1652, "step": 44500 }, { "epoch": 1.6607617360496014, "grad_norm": 2.253143548965454, "learning_rate": 2.2320637732506645e-05, "loss": 0.1776, "step": 45000 }, { "epoch": 1.6792146442279303, "grad_norm": 4.1611151695251465, "learning_rate": 2.201308926286783e-05, "loss": 0.1643, "step": 45500 }, { "epoch": 1.6976675524062592, "grad_norm": 3.693655252456665, "learning_rate": 2.1705540793229015e-05, "loss": 0.1705, "step": 46000 }, { "epoch": 1.7161204605845881, "grad_norm": 3.8450114727020264, "learning_rate": 2.13979923235902e-05, "loss": 0.1715, "step": 46500 }, { "epoch": 1.734573368762917, "grad_norm": 3.296321392059326, "learning_rate": 2.1090443853951382e-05, "loss": 0.1642, "step": 47000 }, { "epoch": 1.753026276941246, "grad_norm": 2.0819671154022217, "learning_rate": 2.0782895384312567e-05, "loss": 0.1624, "step": 47500 }, { "epoch": 1.7714791851195748, "grad_norm": 0.8893182873725891, "learning_rate": 2.0475346914673755e-05, "loss": 0.1678, "step": 48000 }, { "epoch": 1.7899320932979037, "grad_norm": 2.971529960632324, "learning_rate": 2.0167798445034937e-05, "loss": 0.1664, "step": 48500 }, { "epoch": 1.8083850014762326, "grad_norm": 2.0590310096740723, "learning_rate": 1.9860249975396122e-05, "loss": 0.1779, "step": 49000 }, { "epoch": 1.8268379096545617, "grad_norm": 2.0498523712158203, "learning_rate": 1.955270150575731e-05, "loss": 0.1695, "step": 49500 }, { "epoch": 1.8452908178328906, "grad_norm": 1.6503143310546875, "learning_rate": 1.9245153036118493e-05, "loss": 0.1678, "step": 50000 }, { "epoch": 1.8637437260112195, "grad_norm": 1.0318537950515747, "learning_rate": 1.8937604566479678e-05, "loss": 0.1644, "step": 50500 }, { "epoch": 1.8821966341895484, "grad_norm": 1.936584711074829, "learning_rate": 1.8630056096840863e-05, "loss": 0.1697, "step": 51000 }, { "epoch": 1.9006495423678773, "grad_norm": 2.5828168392181396, "learning_rate": 1.8322507627202048e-05, "loss": 0.1696, "step": 51500 }, { "epoch": 1.9191024505462062, "grad_norm": 3.156874895095825, "learning_rate": 1.8014959157563233e-05, "loss": 0.1673, "step": 52000 }, { "epoch": 1.937555358724535, "grad_norm": 3.178074836730957, "learning_rate": 1.7707410687924418e-05, "loss": 0.176, "step": 52500 }, { "epoch": 1.956008266902864, "grad_norm": 1.48374342918396, "learning_rate": 1.7399862218285603e-05, "loss": 0.1677, "step": 53000 }, { "epoch": 1.974461175081193, "grad_norm": 3.43747878074646, "learning_rate": 1.709231374864679e-05, "loss": 0.1696, "step": 53500 }, { "epoch": 1.9929140832595218, "grad_norm": 1.6862876415252686, "learning_rate": 1.678476527900797e-05, "loss": 0.1718, "step": 54000 }, { "epoch": 2.0, "eval_loss": 0.20655478537082672, "eval_mse": 0.2065547745621827, "eval_runtime": 52.3234, "eval_samples_per_second": 1841.202, "eval_steps_per_second": 230.165, "step": 54192 }, { "epoch": 2.0113669914378507, "grad_norm": 0.8314543962478638, "learning_rate": 1.647721680936916e-05, "loss": 0.1449, "step": 54500 }, { "epoch": 2.0298198996161796, "grad_norm": 1.8953380584716797, "learning_rate": 1.6169668339730344e-05, "loss": 0.1357, "step": 55000 }, { "epoch": 2.0482728077945085, "grad_norm": 0.7893266081809998, "learning_rate": 1.5862119870091526e-05, "loss": 0.138, "step": 55500 }, { "epoch": 2.0667257159728374, "grad_norm": 1.337292194366455, "learning_rate": 1.555457140045271e-05, "loss": 0.1407, "step": 56000 }, { "epoch": 2.0851786241511663, "grad_norm": 1.6890192031860352, "learning_rate": 1.5247022930813898e-05, "loss": 0.1406, "step": 56500 }, { "epoch": 2.103631532329495, "grad_norm": 2.1817214488983154, "learning_rate": 1.4939474461175081e-05, "loss": 0.1332, "step": 57000 }, { "epoch": 2.122084440507824, "grad_norm": 1.477333664894104, "learning_rate": 1.4631925991536266e-05, "loss": 0.1415, "step": 57500 }, { "epoch": 2.140537348686153, "grad_norm": 3.889193534851074, "learning_rate": 1.4324377521897453e-05, "loss": 0.1399, "step": 58000 }, { "epoch": 2.158990256864482, "grad_norm": 11.35392951965332, "learning_rate": 1.4016829052258637e-05, "loss": 0.1345, "step": 58500 }, { "epoch": 2.1774431650428108, "grad_norm": 2.2750699520111084, "learning_rate": 1.3709280582619822e-05, "loss": 0.1347, "step": 59000 }, { "epoch": 2.1958960732211397, "grad_norm": 4.66851282119751, "learning_rate": 1.3401732112981005e-05, "loss": 0.1359, "step": 59500 }, { "epoch": 2.2143489813994686, "grad_norm": 1.2594196796417236, "learning_rate": 1.3094183643342192e-05, "loss": 0.135, "step": 60000 }, { "epoch": 2.2328018895777975, "grad_norm": 0.6602271199226379, "learning_rate": 1.2786635173703375e-05, "loss": 0.1381, "step": 60500 }, { "epoch": 2.2512547977561264, "grad_norm": 0.8580902814865112, "learning_rate": 1.2479086704064562e-05, "loss": 0.1308, "step": 61000 }, { "epoch": 2.2697077059344553, "grad_norm": 0.8672662377357483, "learning_rate": 1.2171538234425746e-05, "loss": 0.1395, "step": 61500 }, { "epoch": 2.288160614112784, "grad_norm": 1.646864891052246, "learning_rate": 1.186398976478693e-05, "loss": 0.1419, "step": 62000 }, { "epoch": 2.306613522291113, "grad_norm": 4.04207181930542, "learning_rate": 1.1556441295148116e-05, "loss": 0.1337, "step": 62500 }, { "epoch": 2.325066430469442, "grad_norm": 5.613555431365967, "learning_rate": 1.1248892825509301e-05, "loss": 0.1429, "step": 63000 }, { "epoch": 2.343519338647771, "grad_norm": 1.977729082107544, "learning_rate": 1.0941344355870485e-05, "loss": 0.1323, "step": 63500 }, { "epoch": 2.3619722468260997, "grad_norm": 1.2868248224258423, "learning_rate": 1.0633795886231671e-05, "loss": 0.1383, "step": 64000 }, { "epoch": 2.3804251550044286, "grad_norm": 1.098742961883545, "learning_rate": 1.0326247416592857e-05, "loss": 0.1387, "step": 64500 }, { "epoch": 2.3988780631827575, "grad_norm": 2.9264678955078125, "learning_rate": 1.001869894695404e-05, "loss": 0.1386, "step": 65000 }, { "epoch": 2.4173309713610864, "grad_norm": 3.179082155227661, "learning_rate": 9.711150477315225e-06, "loss": 0.1444, "step": 65500 }, { "epoch": 2.4357838795394153, "grad_norm": 1.5083171129226685, "learning_rate": 9.40360200767641e-06, "loss": 0.1351, "step": 66000 }, { "epoch": 2.4542367877177442, "grad_norm": 1.590307354927063, "learning_rate": 9.096053538037595e-06, "loss": 0.1379, "step": 66500 }, { "epoch": 2.472689695896073, "grad_norm": 1.490502953529358, "learning_rate": 8.78850506839878e-06, "loss": 0.1285, "step": 67000 }, { "epoch": 2.491142604074402, "grad_norm": 2.0561413764953613, "learning_rate": 8.480956598759966e-06, "loss": 0.1396, "step": 67500 }, { "epoch": 2.509595512252731, "grad_norm": 1.0588093996047974, "learning_rate": 8.17340812912115e-06, "loss": 0.1367, "step": 68000 }, { "epoch": 2.52804842043106, "grad_norm": 0.8184725046157837, "learning_rate": 7.865859659482334e-06, "loss": 0.1322, "step": 68500 }, { "epoch": 2.5465013286093887, "grad_norm": 1.3976045846939087, "learning_rate": 7.55831118984352e-06, "loss": 0.1332, "step": 69000 }, { "epoch": 2.5649542367877176, "grad_norm": 2.417647361755371, "learning_rate": 7.250762720204704e-06, "loss": 0.1342, "step": 69500 }, { "epoch": 2.5834071449660465, "grad_norm": 4.064483165740967, "learning_rate": 6.94321425056589e-06, "loss": 0.1355, "step": 70000 }, { "epoch": 2.6018600531443754, "grad_norm": 2.23105788230896, "learning_rate": 6.635665780927075e-06, "loss": 0.1315, "step": 70500 }, { "epoch": 2.6203129613227043, "grad_norm": 2.205604076385498, "learning_rate": 6.328117311288259e-06, "loss": 0.1379, "step": 71000 }, { "epoch": 2.638765869501033, "grad_norm": 2.5101168155670166, "learning_rate": 6.020568841649444e-06, "loss": 0.142, "step": 71500 }, { "epoch": 2.657218777679362, "grad_norm": 11.855621337890625, "learning_rate": 5.713020372010629e-06, "loss": 0.1359, "step": 72000 }, { "epoch": 2.675671685857691, "grad_norm": 1.7274291515350342, "learning_rate": 5.4054719023718145e-06, "loss": 0.1386, "step": 72500 }, { "epoch": 2.69412459403602, "grad_norm": 1.0947271585464478, "learning_rate": 5.097923432732999e-06, "loss": 0.1393, "step": 73000 }, { "epoch": 2.712577502214349, "grad_norm": 1.6208831071853638, "learning_rate": 4.790374963094184e-06, "loss": 0.1276, "step": 73500 }, { "epoch": 2.7310304103926777, "grad_norm": 1.5204744338989258, "learning_rate": 4.482826493455368e-06, "loss": 0.1297, "step": 74000 }, { "epoch": 2.7494833185710066, "grad_norm": 4.482317924499512, "learning_rate": 4.175278023816553e-06, "loss": 0.1303, "step": 74500 }, { "epoch": 2.7679362267493355, "grad_norm": 9.054340362548828, "learning_rate": 3.8677295541777385e-06, "loss": 0.1319, "step": 75000 }, { "epoch": 2.7863891349276644, "grad_norm": 1.8670865297317505, "learning_rate": 3.5601810845389237e-06, "loss": 0.1301, "step": 75500 }, { "epoch": 2.8048420431059933, "grad_norm": 1.451202154159546, "learning_rate": 3.2526326149001084e-06, "loss": 0.1309, "step": 76000 }, { "epoch": 2.823294951284322, "grad_norm": 3.281291961669922, "learning_rate": 2.945084145261293e-06, "loss": 0.1407, "step": 76500 }, { "epoch": 2.841747859462651, "grad_norm": 3.273066997528076, "learning_rate": 2.6375356756224782e-06, "loss": 0.1267, "step": 77000 }, { "epoch": 2.86020076764098, "grad_norm": 8.522459030151367, "learning_rate": 2.3299872059836634e-06, "loss": 0.1304, "step": 77500 }, { "epoch": 2.878653675819309, "grad_norm": 1.6981911659240723, "learning_rate": 2.022438736344848e-06, "loss": 0.1436, "step": 78000 }, { "epoch": 2.8971065839976378, "grad_norm": 2.415241003036499, "learning_rate": 1.7148902667060328e-06, "loss": 0.1297, "step": 78500 }, { "epoch": 2.9155594921759667, "grad_norm": 1.65168035030365, "learning_rate": 1.4073417970672177e-06, "loss": 0.138, "step": 79000 }, { "epoch": 2.934012400354296, "grad_norm": 1.9556164741516113, "learning_rate": 1.0997933274284029e-06, "loss": 0.1346, "step": 79500 }, { "epoch": 2.952465308532625, "grad_norm": 2.9853076934814453, "learning_rate": 7.922448577895876e-07, "loss": 0.1371, "step": 80000 }, { "epoch": 2.970918216710954, "grad_norm": 2.885925054550171, "learning_rate": 4.846963881507725e-07, "loss": 0.1342, "step": 80500 }, { "epoch": 2.9893711248892827, "grad_norm": 2.020306348800659, "learning_rate": 1.771479185119575e-07, "loss": 0.141, "step": 81000 }, { "epoch": 3.0, "eval_loss": 0.19889499247074127, "eval_mse": 0.19889500241300032, "eval_runtime": 55.3999, "eval_samples_per_second": 1738.955, "eval_steps_per_second": 217.383, "step": 81288 }, { "epoch": 3.0, "step": 81288, "total_flos": 4.283504864539085e+16, "train_loss": 0.17857247165732115, "train_runtime": 4471.1905, "train_samples_per_second": 581.752, "train_steps_per_second": 18.18 } ], "logging_steps": 500, "max_steps": 81288, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.283504864539085e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }