{
  "best_metric": 0.19889499247074127,
  "best_model_checkpoint": "multilingual-e5-small-aligned-transformed-readability/checkpoint-81288",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 81288,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018452908178328904,
      "grad_norm": 2.7571725845336914,
      "learning_rate": 4.969245153036119e-05,
      "loss": 0.3765,
      "step": 500
    },
    {
      "epoch": 0.03690581635665781,
      "grad_norm": 2.832648515701294,
      "learning_rate": 4.938490306072237e-05,
      "loss": 0.2708,
      "step": 1000
    },
    {
      "epoch": 0.05535872453498671,
      "grad_norm": 1.4104365110397339,
      "learning_rate": 4.907735459108356e-05,
      "loss": 0.2557,
      "step": 1500
    },
    {
      "epoch": 0.07381163271331562,
      "grad_norm": 1.8531866073608398,
      "learning_rate": 4.876980612144474e-05,
      "loss": 0.2635,
      "step": 2000
    },
    {
      "epoch": 0.09226454089164453,
      "grad_norm": 1.649173378944397,
      "learning_rate": 4.846225765180593e-05,
      "loss": 0.2558,
      "step": 2500
    },
    {
      "epoch": 0.11071744906997343,
      "grad_norm": 1.7052029371261597,
      "learning_rate": 4.815470918216711e-05,
      "loss": 0.2514,
      "step": 3000
    },
    {
      "epoch": 0.12917035724830234,
      "grad_norm": 3.926635980606079,
      "learning_rate": 4.78471607125283e-05,
      "loss": 0.252,
      "step": 3500
    },
    {
      "epoch": 0.14762326542663123,
      "grad_norm": 3.181887626647949,
      "learning_rate": 4.7539612242889484e-05,
      "loss": 0.2541,
      "step": 4000
    },
    {
      "epoch": 0.16607617360496013,
      "grad_norm": 4.0558180809021,
      "learning_rate": 4.723206377325067e-05,
      "loss": 0.2421,
      "step": 4500
    },
    {
      "epoch": 0.18452908178328906,
      "grad_norm": 1.432974934577942,
      "learning_rate": 4.692451530361185e-05,
      "loss": 0.2362,
      "step": 5000
    },
    {
      "epoch": 0.20298198996161795,
      "grad_norm": 3.173771858215332,
      "learning_rate": 4.661696683397304e-05,
      "loss": 0.2443,
      "step": 5500
    },
    {
      "epoch": 0.22143489813994685,
      "grad_norm": 2.175633668899536,
      "learning_rate": 4.6309418364334224e-05,
      "loss": 0.2329,
      "step": 6000
    },
    {
      "epoch": 0.23988780631827575,
      "grad_norm": 4.211012840270996,
      "learning_rate": 4.60018698946954e-05,
      "loss": 0.2303,
      "step": 6500
    },
    {
      "epoch": 0.2583407144966047,
      "grad_norm": 1.5053297281265259,
      "learning_rate": 4.5694321425056594e-05,
      "loss": 0.2272,
      "step": 7000
    },
    {
      "epoch": 0.27679362267493357,
      "grad_norm": 2.2658045291900635,
      "learning_rate": 4.538677295541778e-05,
      "loss": 0.2309,
      "step": 7500
    },
    {
      "epoch": 0.29524653085326247,
      "grad_norm": 3.0872204303741455,
      "learning_rate": 4.507922448577896e-05,
      "loss": 0.228,
      "step": 8000
    },
    {
      "epoch": 0.31369943903159137,
      "grad_norm": 1.5754343271255493,
      "learning_rate": 4.477167601614014e-05,
      "loss": 0.2344,
      "step": 8500
    },
    {
      "epoch": 0.33215234720992026,
      "grad_norm": 8.282055854797363,
      "learning_rate": 4.4464127546501335e-05,
      "loss": 0.2235,
      "step": 9000
    },
    {
      "epoch": 0.3506052553882492,
      "grad_norm": 2.818925619125366,
      "learning_rate": 4.415657907686251e-05,
      "loss": 0.225,
      "step": 9500
    },
    {
      "epoch": 0.3690581635665781,
      "grad_norm": 4.582856178283691,
      "learning_rate": 4.38490306072237e-05,
      "loss": 0.2195,
      "step": 10000
    },
    {
      "epoch": 0.387511071744907,
      "grad_norm": 4.176349639892578,
      "learning_rate": 4.354148213758489e-05,
      "loss": 0.2249,
      "step": 10500
    },
    {
      "epoch": 0.4059639799232359,
      "grad_norm": 1.69513738155365,
      "learning_rate": 4.323393366794607e-05,
      "loss": 0.2227,
      "step": 11000
    },
    {
      "epoch": 0.4244168881015648,
      "grad_norm": 2.0948939323425293,
      "learning_rate": 4.2926385198307254e-05,
      "loss": 0.2248,
      "step": 11500
    },
    {
      "epoch": 0.4428697962798937,
      "grad_norm": 2.4989616870880127,
      "learning_rate": 4.261883672866844e-05,
      "loss": 0.2194,
      "step": 12000
    },
    {
      "epoch": 0.4613227044582226,
      "grad_norm": 1.1772059202194214,
      "learning_rate": 4.2311288259029624e-05,
      "loss": 0.2232,
      "step": 12500
    },
    {
      "epoch": 0.4797756126365515,
      "grad_norm": 5.26480770111084,
      "learning_rate": 4.200373978939081e-05,
      "loss": 0.2199,
      "step": 13000
    },
    {
      "epoch": 0.49822852081488045,
      "grad_norm": 1.3563578128814697,
      "learning_rate": 4.1696191319751994e-05,
      "loss": 0.2264,
      "step": 13500
    },
    {
      "epoch": 0.5166814289932093,
      "grad_norm": 1.2438708543777466,
      "learning_rate": 4.138864285011318e-05,
      "loss": 0.2239,
      "step": 14000
    },
    {
      "epoch": 0.5351343371715382,
      "grad_norm": 2.229975700378418,
      "learning_rate": 4.1081094380474365e-05,
      "loss": 0.211,
      "step": 14500
    },
    {
      "epoch": 0.5535872453498671,
      "grad_norm": 1.4763661623001099,
      "learning_rate": 4.077354591083555e-05,
      "loss": 0.2176,
      "step": 15000
    },
    {
      "epoch": 0.572040153528196,
      "grad_norm": 2.88029408454895,
      "learning_rate": 4.0465997441196735e-05,
      "loss": 0.2229,
      "step": 15500
    },
    {
      "epoch": 0.5904930617065249,
      "grad_norm": 0.7661384344100952,
      "learning_rate": 4.015844897155792e-05,
      "loss": 0.2195,
      "step": 16000
    },
    {
      "epoch": 0.6089459698848538,
      "grad_norm": 2.0358428955078125,
      "learning_rate": 3.9850900501919105e-05,
      "loss": 0.2161,
      "step": 16500
    },
    {
      "epoch": 0.6273988780631827,
      "grad_norm": 1.9549895524978638,
      "learning_rate": 3.954335203228029e-05,
      "loss": 0.2193,
      "step": 17000
    },
    {
      "epoch": 0.6458517862415116,
      "grad_norm": 2.1742184162139893,
      "learning_rate": 3.9235803562641475e-05,
      "loss": 0.2171,
      "step": 17500
    },
    {
      "epoch": 0.6643046944198405,
      "grad_norm": 1.1012811660766602,
      "learning_rate": 3.892825509300266e-05,
      "loss": 0.2246,
      "step": 18000
    },
    {
      "epoch": 0.6827576025981694,
      "grad_norm": 2.7291996479034424,
      "learning_rate": 3.8620706623363846e-05,
      "loss": 0.2114,
      "step": 18500
    },
    {
      "epoch": 0.7012105107764984,
      "grad_norm": 1.3418771028518677,
      "learning_rate": 3.8313158153725024e-05,
      "loss": 0.2173,
      "step": 19000
    },
    {
      "epoch": 0.7196634189548273,
      "grad_norm": 2.7479825019836426,
      "learning_rate": 3.8005609684086216e-05,
      "loss": 0.2163,
      "step": 19500
    },
    {
      "epoch": 0.7381163271331562,
      "grad_norm": 1.7314202785491943,
      "learning_rate": 3.76980612144474e-05,
      "loss": 0.2142,
      "step": 20000
    },
    {
      "epoch": 0.7565692353114851,
      "grad_norm": 1.5135014057159424,
      "learning_rate": 3.739051274480858e-05,
      "loss": 0.2156,
      "step": 20500
    },
    {
      "epoch": 0.775022143489814,
      "grad_norm": 0.9992055296897888,
      "learning_rate": 3.708296427516977e-05,
      "loss": 0.2136,
      "step": 21000
    },
    {
      "epoch": 0.7934750516681429,
      "grad_norm": 1.2363203763961792,
      "learning_rate": 3.6775415805530957e-05,
      "loss": 0.2134,
      "step": 21500
    },
    {
      "epoch": 0.8119279598464718,
      "grad_norm": 1.8317536115646362,
      "learning_rate": 3.6467867335892135e-05,
      "loss": 0.217,
      "step": 22000
    },
    {
      "epoch": 0.8303808680248007,
      "grad_norm": 1.7996548414230347,
      "learning_rate": 3.616031886625332e-05,
      "loss": 0.213,
      "step": 22500
    },
    {
      "epoch": 0.8488337762031296,
      "grad_norm": 1.1373772621154785,
      "learning_rate": 3.585277039661451e-05,
      "loss": 0.2249,
      "step": 23000
    },
    {
      "epoch": 0.8672866843814585,
      "grad_norm": 1.2996028661727905,
      "learning_rate": 3.554522192697569e-05,
      "loss": 0.207,
      "step": 23500
    },
    {
      "epoch": 0.8857395925597874,
      "grad_norm": 1.505035638809204,
      "learning_rate": 3.5237673457336876e-05,
      "loss": 0.2119,
      "step": 24000
    },
    {
      "epoch": 0.9041925007381163,
      "grad_norm": 1.2497526407241821,
      "learning_rate": 3.493012498769807e-05,
      "loss": 0.2096,
      "step": 24500
    },
    {
      "epoch": 0.9226454089164452,
      "grad_norm": 2.1352574825286865,
      "learning_rate": 3.4622576518059246e-05,
      "loss": 0.2143,
      "step": 25000
    },
    {
      "epoch": 0.9410983170947741,
      "grad_norm": 1.664171576499939,
      "learning_rate": 3.431502804842043e-05,
      "loss": 0.2036,
      "step": 25500
    },
    {
      "epoch": 0.959551225273103,
      "grad_norm": 2.7897629737854004,
      "learning_rate": 3.400747957878162e-05,
      "loss": 0.2118,
      "step": 26000
    },
    {
      "epoch": 0.978004133451432,
      "grad_norm": 1.0113285779953003,
      "learning_rate": 3.36999311091428e-05,
      "loss": 0.2175,
      "step": 26500
    },
    {
      "epoch": 0.9964570416297609,
      "grad_norm": 2.9997363090515137,
      "learning_rate": 3.3392382639503986e-05,
      "loss": 0.2104,
      "step": 27000
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.20612339675426483,
      "eval_mse": 0.20612340591663994,
      "eval_runtime": 57.193,
      "eval_samples_per_second": 1684.438,
      "eval_steps_per_second": 210.568,
      "step": 27096
    },
    {
      "epoch": 1.0149099498080898,
      "grad_norm": 1.4240479469299316,
      "learning_rate": 3.308483416986517e-05,
      "loss": 0.1821,
      "step": 27500
    },
    {
      "epoch": 1.0333628579864187,
      "grad_norm": 1.0634160041809082,
      "learning_rate": 3.277728570022636e-05,
      "loss": 0.1745,
      "step": 28000
    },
    {
      "epoch": 1.0518157661647476,
      "grad_norm": 1.9994093179702759,
      "learning_rate": 3.246973723058754e-05,
      "loss": 0.1712,
      "step": 28500
    },
    {
      "epoch": 1.0702686743430765,
      "grad_norm": 0.736122727394104,
      "learning_rate": 3.216218876094873e-05,
      "loss": 0.1738,
      "step": 29000
    },
    {
      "epoch": 1.0887215825214054,
      "grad_norm": 1.7938990592956543,
      "learning_rate": 3.185464029130991e-05,
      "loss": 0.1698,
      "step": 29500
    },
    {
      "epoch": 1.1071744906997343,
      "grad_norm": 1.9040451049804688,
      "learning_rate": 3.15470918216711e-05,
      "loss": 0.1734,
      "step": 30000
    },
    {
      "epoch": 1.1256273988780632,
      "grad_norm": 1.222025990486145,
      "learning_rate": 3.123954335203228e-05,
      "loss": 0.1715,
      "step": 30500
    },
    {
      "epoch": 1.144080307056392,
      "grad_norm": 1.4371784925460815,
      "learning_rate": 3.093199488239347e-05,
      "loss": 0.1688,
      "step": 31000
    },
    {
      "epoch": 1.162533215234721,
      "grad_norm": 5.807870864868164,
      "learning_rate": 3.062444641275465e-05,
      "loss": 0.179,
      "step": 31500
    },
    {
      "epoch": 1.1809861234130499,
      "grad_norm": 1.3887362480163574,
      "learning_rate": 3.0316897943115834e-05,
      "loss": 0.179,
      "step": 32000
    },
    {
      "epoch": 1.1994390315913788,
      "grad_norm": 2.2503085136413574,
      "learning_rate": 3.0009349473477023e-05,
      "loss": 0.1738,
      "step": 32500
    },
    {
      "epoch": 1.2178919397697077,
      "grad_norm": 2.3477783203125,
      "learning_rate": 2.9701801003838208e-05,
      "loss": 0.1722,
      "step": 33000
    },
    {
      "epoch": 1.2363448479480366,
      "grad_norm": 2.7416176795959473,
      "learning_rate": 2.939425253419939e-05,
      "loss": 0.1786,
      "step": 33500
    },
    {
      "epoch": 1.2547977561263655,
      "grad_norm": 0.7052303552627563,
      "learning_rate": 2.9086704064560578e-05,
      "loss": 0.1728,
      "step": 34000
    },
    {
      "epoch": 1.2732506643046944,
      "grad_norm": 2.529670000076294,
      "learning_rate": 2.877915559492176e-05,
      "loss": 0.1741,
      "step": 34500
    },
    {
      "epoch": 1.2917035724830233,
      "grad_norm": 1.9189903736114502,
      "learning_rate": 2.8471607125282945e-05,
      "loss": 0.1762,
      "step": 35000
    },
    {
      "epoch": 1.3101564806613522,
      "grad_norm": 2.1008570194244385,
      "learning_rate": 2.8164058655644134e-05,
      "loss": 0.1719,
      "step": 35500
    },
    {
      "epoch": 1.328609388839681,
      "grad_norm": 2.663116216659546,
      "learning_rate": 2.7856510186005312e-05,
      "loss": 0.1739,
      "step": 36000
    },
    {
      "epoch": 1.34706229701801,
      "grad_norm": 3.453697443008423,
      "learning_rate": 2.75489617163665e-05,
      "loss": 0.1691,
      "step": 36500
    },
    {
      "epoch": 1.3655152051963388,
      "grad_norm": 5.848513603210449,
      "learning_rate": 2.7241413246727686e-05,
      "loss": 0.1674,
      "step": 37000
    },
    {
      "epoch": 1.3839681133746677,
      "grad_norm": 1.1454991102218628,
      "learning_rate": 2.6933864777088867e-05,
      "loss": 0.1764,
      "step": 37500
    },
    {
      "epoch": 1.4024210215529966,
      "grad_norm": 0.9938109517097473,
      "learning_rate": 2.6626316307450056e-05,
      "loss": 0.1706,
      "step": 38000
    },
    {
      "epoch": 1.4208739297313255,
      "grad_norm": 2.252068042755127,
      "learning_rate": 2.631876783781124e-05,
      "loss": 0.1665,
      "step": 38500
    },
    {
      "epoch": 1.4393268379096544,
      "grad_norm": 1.9789129495620728,
      "learning_rate": 2.6011219368172423e-05,
      "loss": 0.1746,
      "step": 39000
    },
    {
      "epoch": 1.4577797460879833,
      "grad_norm": 1.5638952255249023,
      "learning_rate": 2.570367089853361e-05,
      "loss": 0.1699,
      "step": 39500
    },
    {
      "epoch": 1.4762326542663124,
      "grad_norm": 2.094984292984009,
      "learning_rate": 2.5396122428894797e-05,
      "loss": 0.1715,
      "step": 40000
    },
    {
      "epoch": 1.4946855624446413,
      "grad_norm": 2.625145435333252,
      "learning_rate": 2.508857395925598e-05,
      "loss": 0.1708,
      "step": 40500
    },
    {
      "epoch": 1.51313847062297,
      "grad_norm": 1.2873293161392212,
      "learning_rate": 2.4781025489617167e-05,
      "loss": 0.1721,
      "step": 41000
    },
    {
      "epoch": 1.531591378801299,
      "grad_norm": 2.8465254306793213,
      "learning_rate": 2.447347701997835e-05,
      "loss": 0.1761,
      "step": 41500
    },
    {
      "epoch": 1.550044286979628,
      "grad_norm": 1.4705593585968018,
      "learning_rate": 2.4165928550339534e-05,
      "loss": 0.1756,
      "step": 42000
    },
    {
      "epoch": 1.568497195157957,
      "grad_norm": 0.9254862666130066,
      "learning_rate": 2.3858380080700722e-05,
      "loss": 0.1759,
      "step": 42500
    },
    {
      "epoch": 1.5869501033362858,
      "grad_norm": 1.8685784339904785,
      "learning_rate": 2.3550831611061904e-05,
      "loss": 0.167,
      "step": 43000
    },
    {
      "epoch": 1.6054030115146147,
      "grad_norm": 1.4468207359313965,
      "learning_rate": 2.324328314142309e-05,
      "loss": 0.1776,
      "step": 43500
    },
    {
      "epoch": 1.6238559196929436,
      "grad_norm": 2.477450132369995,
      "learning_rate": 2.2935734671784274e-05,
      "loss": 0.174,
      "step": 44000
    },
    {
      "epoch": 1.6423088278712725,
      "grad_norm": 11.740235328674316,
      "learning_rate": 2.262818620214546e-05,
      "loss": 0.1652,
      "step": 44500
    },
    {
      "epoch": 1.6607617360496014,
      "grad_norm": 2.253143548965454,
      "learning_rate": 2.2320637732506645e-05,
      "loss": 0.1776,
      "step": 45000
    },
    {
      "epoch": 1.6792146442279303,
      "grad_norm": 4.1611151695251465,
      "learning_rate": 2.201308926286783e-05,
      "loss": 0.1643,
      "step": 45500
    },
    {
      "epoch": 1.6976675524062592,
      "grad_norm": 3.693655252456665,
      "learning_rate": 2.1705540793229015e-05,
      "loss": 0.1705,
      "step": 46000
    },
    {
      "epoch": 1.7161204605845881,
      "grad_norm": 3.8450114727020264,
      "learning_rate": 2.13979923235902e-05,
      "loss": 0.1715,
      "step": 46500
    },
    {
      "epoch": 1.734573368762917,
      "grad_norm": 3.296321392059326,
      "learning_rate": 2.1090443853951382e-05,
      "loss": 0.1642,
      "step": 47000
    },
    {
      "epoch": 1.753026276941246,
      "grad_norm": 2.0819671154022217,
      "learning_rate": 2.0782895384312567e-05,
      "loss": 0.1624,
      "step": 47500
    },
    {
      "epoch": 1.7714791851195748,
      "grad_norm": 0.8893182873725891,
      "learning_rate": 2.0475346914673755e-05,
      "loss": 0.1678,
      "step": 48000
    },
    {
      "epoch": 1.7899320932979037,
      "grad_norm": 2.971529960632324,
      "learning_rate": 2.0167798445034937e-05,
      "loss": 0.1664,
      "step": 48500
    },
    {
      "epoch": 1.8083850014762326,
      "grad_norm": 2.0590310096740723,
      "learning_rate": 1.9860249975396122e-05,
      "loss": 0.1779,
      "step": 49000
    },
    {
      "epoch": 1.8268379096545617,
      "grad_norm": 2.0498523712158203,
      "learning_rate": 1.955270150575731e-05,
      "loss": 0.1695,
      "step": 49500
    },
    {
      "epoch": 1.8452908178328906,
      "grad_norm": 1.6503143310546875,
      "learning_rate": 1.9245153036118493e-05,
      "loss": 0.1678,
      "step": 50000
    },
    {
      "epoch": 1.8637437260112195,
      "grad_norm": 1.0318537950515747,
      "learning_rate": 1.8937604566479678e-05,
      "loss": 0.1644,
      "step": 50500
    },
    {
      "epoch": 1.8821966341895484,
      "grad_norm": 1.936584711074829,
      "learning_rate": 1.8630056096840863e-05,
      "loss": 0.1697,
      "step": 51000
    },
    {
      "epoch": 1.9006495423678773,
      "grad_norm": 2.5828168392181396,
      "learning_rate": 1.8322507627202048e-05,
      "loss": 0.1696,
      "step": 51500
    },
    {
      "epoch": 1.9191024505462062,
      "grad_norm": 3.156874895095825,
      "learning_rate": 1.8014959157563233e-05,
      "loss": 0.1673,
      "step": 52000
    },
    {
      "epoch": 1.937555358724535,
      "grad_norm": 3.178074836730957,
      "learning_rate": 1.7707410687924418e-05,
      "loss": 0.176,
      "step": 52500
    },
    {
      "epoch": 1.956008266902864,
      "grad_norm": 1.48374342918396,
      "learning_rate": 1.7399862218285603e-05,
      "loss": 0.1677,
      "step": 53000
    },
    {
      "epoch": 1.974461175081193,
      "grad_norm": 3.43747878074646,
      "learning_rate": 1.709231374864679e-05,
      "loss": 0.1696,
      "step": 53500
    },
    {
      "epoch": 1.9929140832595218,
      "grad_norm": 1.6862876415252686,
      "learning_rate": 1.678476527900797e-05,
      "loss": 0.1718,
      "step": 54000
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.20655478537082672,
      "eval_mse": 0.2065547745621827,
      "eval_runtime": 52.3234,
      "eval_samples_per_second": 1841.202,
      "eval_steps_per_second": 230.165,
      "step": 54192
    },
    {
      "epoch": 2.0113669914378507,
      "grad_norm": 0.8314543962478638,
      "learning_rate": 1.647721680936916e-05,
      "loss": 0.1449,
      "step": 54500
    },
    {
      "epoch": 2.0298198996161796,
      "grad_norm": 1.8953380584716797,
      "learning_rate": 1.6169668339730344e-05,
      "loss": 0.1357,
      "step": 55000
    },
    {
      "epoch": 2.0482728077945085,
      "grad_norm": 0.7893266081809998,
      "learning_rate": 1.5862119870091526e-05,
      "loss": 0.138,
      "step": 55500
    },
    {
      "epoch": 2.0667257159728374,
      "grad_norm": 1.337292194366455,
      "learning_rate": 1.555457140045271e-05,
      "loss": 0.1407,
      "step": 56000
    },
    {
      "epoch": 2.0851786241511663,
      "grad_norm": 1.6890192031860352,
      "learning_rate": 1.5247022930813898e-05,
      "loss": 0.1406,
      "step": 56500
    },
    {
      "epoch": 2.103631532329495,
      "grad_norm": 2.1817214488983154,
      "learning_rate": 1.4939474461175081e-05,
      "loss": 0.1332,
      "step": 57000
    },
    {
      "epoch": 2.122084440507824,
      "grad_norm": 1.477333664894104,
      "learning_rate": 1.4631925991536266e-05,
      "loss": 0.1415,
      "step": 57500
    },
    {
      "epoch": 2.140537348686153,
      "grad_norm": 3.889193534851074,
      "learning_rate": 1.4324377521897453e-05,
      "loss": 0.1399,
      "step": 58000
    },
    {
      "epoch": 2.158990256864482,
      "grad_norm": 11.35392951965332,
      "learning_rate": 1.4016829052258637e-05,
      "loss": 0.1345,
      "step": 58500
    },
    {
      "epoch": 2.1774431650428108,
      "grad_norm": 2.2750699520111084,
      "learning_rate": 1.3709280582619822e-05,
      "loss": 0.1347,
      "step": 59000
    },
    {
      "epoch": 2.1958960732211397,
      "grad_norm": 4.66851282119751,
      "learning_rate": 1.3401732112981005e-05,
      "loss": 0.1359,
      "step": 59500
    },
    {
      "epoch": 2.2143489813994686,
      "grad_norm": 1.2594196796417236,
      "learning_rate": 1.3094183643342192e-05,
      "loss": 0.135,
      "step": 60000
    },
    {
      "epoch": 2.2328018895777975,
      "grad_norm": 0.6602271199226379,
      "learning_rate": 1.2786635173703375e-05,
      "loss": 0.1381,
      "step": 60500
    },
    {
      "epoch": 2.2512547977561264,
      "grad_norm": 0.8580902814865112,
      "learning_rate": 1.2479086704064562e-05,
      "loss": 0.1308,
      "step": 61000
    },
    {
      "epoch": 2.2697077059344553,
      "grad_norm": 0.8672662377357483,
      "learning_rate": 1.2171538234425746e-05,
      "loss": 0.1395,
      "step": 61500
    },
    {
      "epoch": 2.288160614112784,
      "grad_norm": 1.646864891052246,
      "learning_rate": 1.186398976478693e-05,
      "loss": 0.1419,
      "step": 62000
    },
    {
      "epoch": 2.306613522291113,
      "grad_norm": 4.04207181930542,
      "learning_rate": 1.1556441295148116e-05,
      "loss": 0.1337,
      "step": 62500
    },
    {
      "epoch": 2.325066430469442,
      "grad_norm": 5.613555431365967,
      "learning_rate": 1.1248892825509301e-05,
      "loss": 0.1429,
      "step": 63000
    },
    {
      "epoch": 2.343519338647771,
      "grad_norm": 1.977729082107544,
      "learning_rate": 1.0941344355870485e-05,
      "loss": 0.1323,
      "step": 63500
    },
    {
      "epoch": 2.3619722468260997,
      "grad_norm": 1.2868248224258423,
      "learning_rate": 1.0633795886231671e-05,
      "loss": 0.1383,
      "step": 64000
    },
    {
      "epoch": 2.3804251550044286,
      "grad_norm": 1.098742961883545,
      "learning_rate": 1.0326247416592857e-05,
      "loss": 0.1387,
      "step": 64500
    },
    {
      "epoch": 2.3988780631827575,
      "grad_norm": 2.9264678955078125,
      "learning_rate": 1.001869894695404e-05,
      "loss": 0.1386,
      "step": 65000
    },
    {
      "epoch": 2.4173309713610864,
      "grad_norm": 3.179082155227661,
      "learning_rate": 9.711150477315225e-06,
      "loss": 0.1444,
      "step": 65500
    },
    {
      "epoch": 2.4357838795394153,
      "grad_norm": 1.5083171129226685,
      "learning_rate": 9.40360200767641e-06,
      "loss": 0.1351,
      "step": 66000
    },
    {
      "epoch": 2.4542367877177442,
      "grad_norm": 1.590307354927063,
      "learning_rate": 9.096053538037595e-06,
      "loss": 0.1379,
      "step": 66500
    },
    {
      "epoch": 2.472689695896073,
      "grad_norm": 1.490502953529358,
      "learning_rate": 8.78850506839878e-06,
      "loss": 0.1285,
      "step": 67000
    },
    {
      "epoch": 2.491142604074402,
      "grad_norm": 2.0561413764953613,
      "learning_rate": 8.480956598759966e-06,
      "loss": 0.1396,
      "step": 67500
    },
    {
      "epoch": 2.509595512252731,
      "grad_norm": 1.0588093996047974,
      "learning_rate": 8.17340812912115e-06,
      "loss": 0.1367,
      "step": 68000
    },
    {
      "epoch": 2.52804842043106,
      "grad_norm": 0.8184725046157837,
      "learning_rate": 7.865859659482334e-06,
      "loss": 0.1322,
      "step": 68500
    },
    {
      "epoch": 2.5465013286093887,
      "grad_norm": 1.3976045846939087,
      "learning_rate": 7.55831118984352e-06,
      "loss": 0.1332,
      "step": 69000
    },
    {
      "epoch": 2.5649542367877176,
      "grad_norm": 2.417647361755371,
      "learning_rate": 7.250762720204704e-06,
      "loss": 0.1342,
      "step": 69500
    },
    {
      "epoch": 2.5834071449660465,
      "grad_norm": 4.064483165740967,
      "learning_rate": 6.94321425056589e-06,
      "loss": 0.1355,
      "step": 70000
    },
    {
      "epoch": 2.6018600531443754,
      "grad_norm": 2.23105788230896,
      "learning_rate": 6.635665780927075e-06,
      "loss": 0.1315,
      "step": 70500
    },
    {
      "epoch": 2.6203129613227043,
      "grad_norm": 2.205604076385498,
      "learning_rate": 6.328117311288259e-06,
      "loss": 0.1379,
      "step": 71000
    },
    {
      "epoch": 2.638765869501033,
      "grad_norm": 2.5101168155670166,
      "learning_rate": 6.020568841649444e-06,
      "loss": 0.142,
      "step": 71500
    },
    {
      "epoch": 2.657218777679362,
      "grad_norm": 11.855621337890625,
      "learning_rate": 5.713020372010629e-06,
      "loss": 0.1359,
      "step": 72000
    },
    {
      "epoch": 2.675671685857691,
      "grad_norm": 1.7274291515350342,
      "learning_rate": 5.4054719023718145e-06,
      "loss": 0.1386,
      "step": 72500
    },
    {
      "epoch": 2.69412459403602,
      "grad_norm": 1.0947271585464478,
      "learning_rate": 5.097923432732999e-06,
      "loss": 0.1393,
      "step": 73000
    },
    {
      "epoch": 2.712577502214349,
      "grad_norm": 1.6208831071853638,
      "learning_rate": 4.790374963094184e-06,
      "loss": 0.1276,
      "step": 73500
    },
    {
      "epoch": 2.7310304103926777,
      "grad_norm": 1.5204744338989258,
      "learning_rate": 4.482826493455368e-06,
      "loss": 0.1297,
      "step": 74000
    },
    {
      "epoch": 2.7494833185710066,
      "grad_norm": 4.482317924499512,
      "learning_rate": 4.175278023816553e-06,
      "loss": 0.1303,
      "step": 74500
    },
    {
      "epoch": 2.7679362267493355,
      "grad_norm": 9.054340362548828,
      "learning_rate": 3.8677295541777385e-06,
      "loss": 0.1319,
      "step": 75000
    },
    {
      "epoch": 2.7863891349276644,
      "grad_norm": 1.8670865297317505,
      "learning_rate": 3.5601810845389237e-06,
      "loss": 0.1301,
      "step": 75500
    },
    {
      "epoch": 2.8048420431059933,
      "grad_norm": 1.451202154159546,
      "learning_rate": 3.2526326149001084e-06,
      "loss": 0.1309,
      "step": 76000
    },
    {
      "epoch": 2.823294951284322,
      "grad_norm": 3.281291961669922,
      "learning_rate": 2.945084145261293e-06,
      "loss": 0.1407,
      "step": 76500
    },
    {
      "epoch": 2.841747859462651,
      "grad_norm": 3.273066997528076,
      "learning_rate": 2.6375356756224782e-06,
      "loss": 0.1267,
      "step": 77000
    },
    {
      "epoch": 2.86020076764098,
      "grad_norm": 8.522459030151367,
      "learning_rate": 2.3299872059836634e-06,
      "loss": 0.1304,
      "step": 77500
    },
    {
      "epoch": 2.878653675819309,
      "grad_norm": 1.6981911659240723,
      "learning_rate": 2.022438736344848e-06,
      "loss": 0.1436,
      "step": 78000
    },
    {
      "epoch": 2.8971065839976378,
      "grad_norm": 2.415241003036499,
      "learning_rate": 1.7148902667060328e-06,
      "loss": 0.1297,
      "step": 78500
    },
    {
      "epoch": 2.9155594921759667,
      "grad_norm": 1.65168035030365,
      "learning_rate": 1.4073417970672177e-06,
      "loss": 0.138,
      "step": 79000
    },
    {
      "epoch": 2.934012400354296,
      "grad_norm": 1.9556164741516113,
      "learning_rate": 1.0997933274284029e-06,
      "loss": 0.1346,
      "step": 79500
    },
    {
      "epoch": 2.952465308532625,
      "grad_norm": 2.9853076934814453,
      "learning_rate": 7.922448577895876e-07,
      "loss": 0.1371,
      "step": 80000
    },
    {
      "epoch": 2.970918216710954,
      "grad_norm": 2.885925054550171,
      "learning_rate": 4.846963881507725e-07,
      "loss": 0.1342,
      "step": 80500
    },
    {
      "epoch": 2.9893711248892827,
      "grad_norm": 2.020306348800659,
      "learning_rate": 1.771479185119575e-07,
      "loss": 0.141,
      "step": 81000
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.19889499247074127,
      "eval_mse": 0.19889500241300032,
      "eval_runtime": 55.3999,
      "eval_samples_per_second": 1738.955,
      "eval_steps_per_second": 217.383,
      "step": 81288
    },
    {
      "epoch": 3.0,
      "step": 81288,
      "total_flos": 4.283504864539085e+16,
      "train_loss": 0.17857247165732115,
      "train_runtime": 4471.1905,
      "train_samples_per_second": 581.752,
      "train_steps_per_second": 18.18
    }
  ],
  "logging_steps": 500,
  "max_steps": 81288,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.283504864539085e+16,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}